From 02dab518db419a417d583690221fcdcbe7d5a1ab Mon Sep 17 00:00:00 2001 From: jiangmengyu Date: Tue, 24 Dec 2024 19:53:04 +0800 Subject: [PATCH 01/91] =?UTF-8?q?CogView3plus=E9=80=82=E9=85=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../built-in/foundation/cogview3/README.md | 314 ++++++ .../cogview3/cogview3plus/__init__.py | 18 + .../cogview3/cogview3plus/layers/__init__.py | 2 + .../cogview3plus/layers/embeddings.py | 484 +++++++++ .../cogview3plus/layers/normalization.py | 166 +++ .../cogview3/cogview3plus/models/__init__.py | 1 + .../cogview3/cogview3plus/models/attention.py | 86 ++ .../models/attention_processor.py | 82 ++ .../models/transformer_cogview3plus.py | 383 +++++++ .../cogview3plus/pipeline/__init__.py | 1 + .../pipeline/pipeline_cogview3plus.py | 675 ++++++++++++ .../cogview3plus/pipeline/pipeline_output.py | 21 + .../cogview3plus/schedulers/__init__.py | 2 + .../schedulers/scheduling_ddim_cogvideox.py | 452 ++++++++ .../schedulers/scheduling_dpm_cogvideox.py | 489 +++++++++ .../schedulers/scheduling_utils.py | 193 ++++ .../cogview3/cogview3plus/vae/__init__.py | 1 + .../cogview3plus/vae/autoencoder_kl.py | 571 ++++++++++ .../cogview3/cogview3plus/vae/vae.py | 995 ++++++++++++++++++ .../cogview3/inference_cogview3plus.py | 99 ++ .../foundation/cogview3/requirents.txt | 15 + 21 files changed, 5050 insertions(+) create mode 100644 MindIE/MindIE-Torch/built-in/foundation/cogview3/README.md create mode 100644 MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/__init__.py create mode 100644 MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/layers/__init__.py create mode 100644 MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/layers/embeddings.py create mode 100644 MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/layers/normalization.py create mode 100644 MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/__init__.py create mode 100644 MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/attention.py create mode 100644 MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/attention_processor.py create mode 100644 MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/transformer_cogview3plus.py create mode 100644 MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/__init__.py create mode 100644 MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py create mode 100644 MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_output.py create mode 100644 MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/schedulers/__init__.py create mode 100644 MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/schedulers/scheduling_ddim_cogvideox.py create mode 100644 MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/schedulers/scheduling_dpm_cogvideox.py create mode 100644 MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/schedulers/scheduling_utils.py create mode 100644 MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/vae/__init__.py create mode 100644 MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/vae/autoencoder_kl.py create mode 100644 MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/vae/vae.py create mode 100644 MindIE/MindIE-Torch/built-in/foundation/cogview3/inference_cogview3plus.py create mode 100644 
MindIE/MindIE-Torch/built-in/foundation/cogview3/requirents.txt diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/README.md b/MindIE/MindIE-Torch/built-in/foundation/cogview3/README.md new file mode 100644 index 0000000000..028c765d30 --- /dev/null +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/README.md @@ -0,0 +1,314 @@ +## 一、准备运行环境 + + **表 1** 版本配套表 + + | 配套 | 版本 | 环境准备指导 | + | ----- | ----- |-----| + | Python | 3.10.2 | - | + | torch | 2.1.0 | - | + +### 1.1 获取CANN&MindIE安装包&环境准备 +- [800I A2](https://www.hiascend.com/developer/download/community/result?module=pt+ie+cann&product=4&model=32) +- [Duo卡](https://www.hiascend.com/developer/download/community/result?module=pt+ie+cann&product=2&model=17) +- [环境准备指导](https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/80RC2alpha002/softwareinst/instg/instg_0001.html) + +### 1.2 CANN安装 +```shell +# 增加软件包可执行权限,{version}表示软件版本号,{arch}表示CPU架构,{soc}表示昇腾AI处理器的版本。 +chmod +x ./Ascend-cann-toolkit_{version}_linux-{arch}.run +chmod +x ./Ascend-cann-kernels-{soc}_{version}_linux.run +# 校验软件包安装文件的一致性和完整性 +./Ascend-cann-toolkit_{version}_linux-{arch}.run --check +./Ascend-cann-kernels-{soc}_{version}_linux.run --check +# 安装 +./Ascend-cann-toolkit_{version}_linux-{arch}.run --install +./Ascend-cann-kernels-{soc}_{version}_linux.run --install + +# 设置环境变量 +source /usr/local/Ascend/ascend-toolkit/set_env.sh +``` + +### 1.3 MindIE安装 +```shell +# 增加软件包可执行权限,{version}表示软件版本号,{arch}表示CPU架构。 +chmod +x ./Ascend-mindie_${version}_linux-${arch}.run +./Ascend-mindie_${version}_linux-${arch}.run --check + +# 方式一:默认路径安装 +./Ascend-mindie_${version}_linux-${arch}.run --install +# 设置环境变量 +cd /usr/local/Ascend/mindie && source set_env.sh + +# 方式二:指定路径安装 +./Ascend-mindie_${version}_linux-${arch}.run --install-path=${AieInstallPath} +# 设置环境变量 +cd ${AieInstallPath}/mindie && source set_env.sh +``` + +### 1.4 Torch_npu安装 +安装pytorch框架 版本2.1.0 +[安装包下载](https://download.pytorch.org/whl/cpu/torch/) + +使用pip安装 +```shell +# {version}表示软件版本号,{arch}表示CPU架构。 +pip install torch-${version}-cp310-cp310-linux_${arch}.whl +``` +下载 pytorch_v{pytorchversion}_py{pythonversion}.tar.gz +```shell +tar -xzvf pytorch_v{pytorchversion}_py{pythonversion}.tar.gz +# 解压后,会有whl包 +pip install torch_npu-{pytorchversion}.xxxx.{arch}.whl +``` +## 二、下载本仓库 + +### 2.1 下载到本地 +```shell + git clone https://gitee.com/ascend/ModelZoo-PyTorch.git +``` + +## 三、HunyuanDiT使用 + +### 3.1 权重及配置文件说明 +1. text_encoder权重链接: +```shell +https://huggingface.co/Tencent-Hunyuan/HunyuanDiT-v1.2-Diffusers/tree/main/text_encoder +``` +2. text_encoder_2权重链接: +```shell +https://huggingface.co/Tencent-Hunyuan/HunyuanDiT-v1.2-Diffusers/tree/main/text_encoder_2 +``` +3. tokenizer权重链接: +```shell +https://huggingface.co/Tencent-Hunyuan/HunyuanDiT-v1.2-Diffusers/tree/main/tokenizer +``` +4. tokenizer_2权重链接: +```shell +https://huggingface.co/Tencent-Hunyuan/HunyuanDiT-v1.2-Diffusers/tree/main/tokenizer_2 +``` +5. transformer权重链接: +```shell +https://huggingface.co/Tencent-Hunyuan/HunyuanDiT-v1.2/tree/main/t2i/model +``` +- 修改该权重的config.json +```shell +{ + "architectures": [ + "HunyuanDiT2DModel" + ], + "input_size": [ + null, + null + ], + "patch_size": 2, + "in_channels": 4, + "hidden_size": 1408, + "depth": 40, + "num_heads": 16, + "mlp_ratio": 4.3637, + "text_states_dim": 1024, + "text_states_dim_t5": 2048, + "text_len": 77, + "text_len_t5": 256 +} +``` +6. 
vae权重链接: +```shell +https://huggingface.co/Tencent-Hunyuan/HunyuanDiT-v1.2-Diffusers/tree/main/vae +``` +- 修改该权重的config.json +```shell +{ + "architectures": [ + "AutoencoderKL" + ], + "in_channels": 3, + "out_channels": 3, + "down_block_types": [ + "DownEncoderBlock2D", + "DownEncoderBlock2D", + "DownEncoderBlock2D", + "DownEncoderBlock2D" + ], + "up_block_types": [ + "UpDecoderBlock2D", + "UpDecoderBlock2D", + "UpDecoderBlock2D", + "UpDecoderBlock2D" + ], + "block_out_channels": [ + 128, + 256, + 512, + 512 + ], + "layers_per_block": 2, + "act_fn": "silu", + "latent_channels": 4, + "norm_num_groups": 32, + "sample_size": 512, + "scaling_factor": 0.13025, + "shift_factor": null, + "latents_mean": null, + "latents_std": null, + "force_upcast": false, + "use_quant_conv": true, + "use_post_quant_conv": true +} +``` +7. scheduler: +- 新增scheduler_config.json配置文件, 内容如下所示: +```shell +{ + "_class_name": "DDPMScheduler", + "_mindiesd_version": "1.0.0", + "steps_offset": 1, + "beta_start": 0.00085, + "beta_end": 0.02, + "num_train_timesteps": 1000 +} +``` +8. 新增model_index.json +将以上步骤下载的权重放在同一目录下, 并新增model_index.json文件, 该文件内容如下所示 +```shell +{ + "_class_name": "HunyuanDiTPipeline", + "_mindiesd_version": "1.0.RC3", + "scheduler": [ + "mindiesd", + "DDPMScheduler" + ], + "text_encoder": [ + "transformers", + "BertModel" + ], + "text_encoder_2": [ + "transformers", + "T5EncoderModel" + ], + "tokenizer": [ + "transformers", + "BertTokenizer" + ], + "tokenizer_2": [ + "transformers", + "T5Tokenizer" + ], + "transformer": [ + "mindiesd", + "HunyuanDiT2DModel" + ], + "vae": [ + "mindiesd", + "AutoencoderKL" + ] +} +``` +9. 各模型的配置文件、权重文件的层级样例如下所示。 +```commandline +|----hunyuandit +| |---- model_index.json +| |---- scheduler +| | |---- scheduler_config.json +| |---- text_encoder +| | |---- config.json +| | |---- 模型权重 +| |---- text_encoder_2 +| | |---- config.json +| | |---- 模型权重 +| |---- tokenizer +| | |---- config.json +| | |---- 模型权重 +| |---- tokenizer_2 +| | |---- config.json +| | |---- 模型权重 +| |---- transformer +| | |---- config.json +| | |---- 模型权重 +| |---- vae +| | |---- config.json +| | |---- 模型权重 +``` + +### 3.2 单卡单prompt功能测试 +设置权重路径 +```shell +path = 'ckpts/hydit' +``` +执行命令: +```shell +python inference_hydit.py \ + --path ${path} \ + --device_id 0 \ + --prompt "青花瓷风格,一只小狗" \ + --input_size (1024, 1024) \ + --seed 42 \ + --infer_steps 25 +``` +参数说明: +- path:权重路径,包含scheduler、text_encoder、text_encoder_2、tokenizer、 tokenizer_2、transformer、vae,七个模型的配置文件及权重。 +- device_id:推理设备ID。 +- prompt:用于图像生成的文字描述提示。 +- input_size:需要生成的图像尺寸。 +- seed:设置随机种子,默认值为42。 +- infer_steps:推理迭代步数。 + +### 3.3 单卡多prompts进行性能/精度测试 +设置权重路径 +```shell +path = 'ckpts/hydit' +``` +执行命令: +```shell +python inference_hydit.py \ + --path ${path} \ + --device_id 0 \ + --test_acc \ + --prompt_list "prompts/example_prompts.txt" \ + --input_size (1024, 1024) \ + --seed 42 \ + --infer_steps 25 +``` +参数说明: +- path:权重路径,包含scheduler、text_encoder、text_encoder_2、tokenizer、 tokenizer_2、transformer、vae,七个模型的配置文件及权重。 +- device_id:推理设备ID。 +- test_acc:使用 --test_acc 开启全量图像生成,用于性能/精度测试。单prompt功能测试时,不开启该参数。 +- prompt_list:用于图像生成的文字描述提示的列表文件路径。 +- input_size:需要生成的图像尺寸。 +- seed:设置随机种子,默认值为42。 +- infer_steps:推理迭代步数。 + +### 3.4 用LoRA进行测试 +设置权重路径 +```shell +path = 'ckpts/hydit' +``` +LoRA权重链接: +```shell +https://huggingface.co/Tencent-Hunyuan/HYDiT-LoRA/tree/main +``` +设置LoRA权重路径 +```shell +lora_path = 'ckpts/lora' +``` +执行命令: +```shell +python inference_hydit.py \ + --path ${path} \ + --device_id 0 \ + --prompt "青花瓷风格,一只小狗" \ + --input_size (1024, 1024) \ + 
--seed 42 \ + --infer_steps 25 + --use_lora \ + --lora_ckpt ${lora_path} +``` +参数说明: +- path:权重路径,包含scheduler、text_encoder、text_encoder_2、tokenizer、 tokenizer_2、transformer、vae,七个模型的配置文件及权重。 +- device_id:推理设备ID。 +- prompt:用于图像生成的文字描述提示。 +- input_size:需要生成的图像尺寸。 +- seed:设置随机种子,默认值为42。 +- infer_steps:推理迭代步数。 +- use_lora:使用 --use_lora 开启LoRA风格化切换。 +- lora_ckpt:LoRA权重路径。 \ No newline at end of file diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/__init__.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/__init__.py new file mode 100644 index 0000000000..acbd223eb6 --- /dev/null +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/__init__.py @@ -0,0 +1,18 @@ +#!/usr/bin/env python +# coding=utf-8 +# Copyright 2024 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from .pipeline import CogView3PlusPipeline diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/layers/__init__.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/layers/__init__.py new file mode 100644 index 0000000000..c3e7c569e2 --- /dev/null +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/layers/__init__.py @@ -0,0 +1,2 @@ +from .normalization import CogView3PlusAdaLayerNormZeroTextImage, AdaLayerNormContinuous +from .embeddings import CogView3CombinedTimestepSizeEmbeddings, CogView3PlusPatchEmbed \ No newline at end of file diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/layers/embeddings.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/layers/embeddings.py new file mode 100644 index 0000000000..445ad8245a --- /dev/null +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/layers/embeddings.py @@ -0,0 +1,484 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import math +from typing import Optional + +import numpy as np +import torch +from torch import nn + +from diffusers.utils import deprecate +from diffusers.models.activations import FP32SiLU, get_activation + + +def get_timestep_embedding( + timesteps: torch.Tensor, + embedding_dim: int, + flip_sin_to_cos: bool = False, + downscale_freq_shift: float = 1, + scale: float = 1, + max_period: int = 10000, +): + """ + This matches the implementation in Denoising Diffusion Probabilistic Models: Create sinusoidal timestep embeddings. + + Args + timesteps (torch.Tensor): + a 1-D Tensor of N indices, one per batch element. 
These may be fractional. + embedding_dim (int): + the dimension of the output. + flip_sin_to_cos (bool): + Whether the embedding order should be `cos, sin` (if True) or `sin, cos` (if False) + downscale_freq_shift (float): + Controls the delta between frequencies between dimensions + scale (float): + Scaling factor applied to the embeddings. + max_period (int): + Controls the maximum frequency of the embeddings + Returns + torch.Tensor: an [N x dim] Tensor of positional embeddings. + """ + assert len(timesteps.shape) == 1, "Timesteps should be a 1d-array" + + half_dim = embedding_dim // 2 + exponent = -math.log(max_period) * torch.arange( + start=0, end=half_dim, dtype=torch.float32, device=timesteps.device + ) + exponent = exponent / (half_dim - downscale_freq_shift) + + emb = torch.exp(exponent) + emb = timesteps[:, None].float() * emb[None, :] + + # scale embeddings + emb = scale * emb + + # concat sine and cosine embeddings + emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=-1) + + # flip sine and cosine embeddings + if flip_sin_to_cos: + emb = torch.cat([emb[:, half_dim:], emb[:, :half_dim]], dim=-1) + + # zero pad + if embedding_dim % 2 == 1: + emb = torch.nn.functional.pad(emb, (0, 1, 0, 0)) + return emb + + +def get_2d_sincos_pos_embed( + embed_dim, + grid_size, + cls_token=False, + extra_tokens=0, + interpolation_scale=1.0, + base_size=16, + device: Optional[torch.device] = None, + output_type: str = "np", +): + """ + Creates 2D sinusoidal positional embeddings. + + Args: + embed_dim (`int`): + The embedding dimension. + grid_size (`int`): + The size of the grid height and width. + cls_token (`bool`, defaults to `False`): + Whether or not to add a classification token. + extra_tokens (`int`, defaults to `0`): + The number of extra tokens to add. + interpolation_scale (`float`, defaults to `1.0`): + The scale of the interpolation. + + Returns: + pos_embed (`torch.Tensor`): + Shape is either `[grid_size * grid_size, embed_dim]` if not using cls_token, or `[1 + grid_size*grid_size, + embed_dim]` if using cls_token + """ + if output_type == "np": + deprecation_message = ( + "`get_2d_sincos_pos_embed` uses `torch` and supports `device`." + " `from_numpy` is no longer required." + " Pass `output_type='pt' to use the new version now." + ) + deprecate("output_type=='np'", "0.33.0", deprecation_message, standard_warn=False) + return get_2d_sincos_pos_embed_np( + embed_dim=embed_dim, + grid_size=grid_size, + cls_token=cls_token, + extra_tokens=extra_tokens, + interpolation_scale=interpolation_scale, + base_size=base_size, + ) + if isinstance(grid_size, int): + grid_size = (grid_size, grid_size) + + grid_h = ( + torch.arange(grid_size[0], device=device, dtype=torch.float32) + / (grid_size[0] / base_size) + / interpolation_scale + ) + grid_w = ( + torch.arange(grid_size[1], device=device, dtype=torch.float32) + / (grid_size[1] / base_size) + / interpolation_scale + ) + grid = torch.meshgrid(grid_w, grid_h, indexing="xy") # here w goes first + grid = torch.stack(grid, dim=0) + + grid = grid.reshape([2, 1, grid_size[1], grid_size[0]]) + pos_embed = get_2d_sincos_pos_embed_from_grid(embed_dim, grid, output_type=output_type) + if cls_token and extra_tokens > 0: + pos_embed = torch.concat([torch.zeros([extra_tokens, embed_dim]), pos_embed], dim=0) + return pos_embed + + +def get_2d_sincos_pos_embed_from_grid(embed_dim, grid, output_type="np"): + r""" + This function generates 2D sinusoidal positional embeddings from a grid. + + Args: + embed_dim (`int`): The embedding dimension. 
+ grid (`torch.Tensor`): Grid of positions with shape `(H * W,)`. + + Returns: + `torch.Tensor`: The 2D sinusoidal positional embeddings with shape `(H * W, embed_dim)` + """ + if output_type == "np": + deprecation_message = ( + "`get_2d_sincos_pos_embed_from_grid` uses `torch` and supports `device`." + " `from_numpy` is no longer required." + " Pass `output_type='pt' to use the new version now." + ) + deprecate("output_type=='np'", "0.33.0", deprecation_message, standard_warn=False) + return get_2d_sincos_pos_embed_from_grid_np( + embed_dim=embed_dim, + grid=grid, + ) + if embed_dim % 2 != 0: + raise ValueError("embed_dim must be divisible by 2") + + # use half of dimensions to encode grid_h + emb_h = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[0], output_type=output_type) # (H*W, D/2) + emb_w = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[1], output_type=output_type) # (H*W, D/2) + + emb = torch.concat([emb_h, emb_w], dim=1) # (H*W, D) + return emb + + +def get_1d_sincos_pos_embed_from_grid(embed_dim, pos, output_type="np"): + """ + This function generates 1D positional embeddings from a grid. + + Args: + embed_dim (`int`): The embedding dimension `D` + pos (`torch.Tensor`): 1D tensor of positions with shape `(M,)` + + Returns: + `torch.Tensor`: Sinusoidal positional embeddings of shape `(M, D)`. + """ + if output_type == "np": + deprecation_message = ( + "`get_1d_sincos_pos_embed_from_grid` uses `torch` and supports `device`." + " `from_numpy` is no longer required." + " Pass `output_type='pt' to use the new version now." + ) + deprecate("output_type=='np'", "0.33.0", deprecation_message, standard_warn=False) + return get_1d_sincos_pos_embed_from_grid_np(embed_dim=embed_dim, pos=pos) + if embed_dim % 2 != 0: + raise ValueError("embed_dim must be divisible by 2") + + omega = torch.arange(embed_dim // 2, device=pos.device, dtype=torch.float64) + omega /= embed_dim / 2.0 + omega = 1.0 / 10000**omega # (D/2,) + + pos = pos.reshape(-1) # (M,) + out = torch.outer(pos, omega) # (M, D/2), outer product + + emb_sin = torch.sin(out) # (M, D/2) + emb_cos = torch.cos(out) # (M, D/2) + + emb = torch.concat([emb_sin, emb_cos], dim=1) # (M, D) + return emb + + +def get_2d_sincos_pos_embed_np( + embed_dim, grid_size, cls_token=False, extra_tokens=0, interpolation_scale=1.0, base_size=16 +): + """ + Creates 2D sinusoidal positional embeddings. + + Args: + embed_dim (`int`): + The embedding dimension. + grid_size (`int`): + The size of the grid height and width. + cls_token (`bool`, defaults to `False`): + Whether or not to add a classification token. + extra_tokens (`int`, defaults to `0`): + The number of extra tokens to add. + interpolation_scale (`float`, defaults to `1.0`): + The scale of the interpolation. 
+ + Returns: + pos_embed (`np.ndarray`): + Shape is either `[grid_size * grid_size, embed_dim]` if not using cls_token, or `[1 + grid_size*grid_size, + embed_dim]` if using cls_token + """ + if isinstance(grid_size, int): + grid_size = (grid_size, grid_size) + + grid_h = np.arange(grid_size[0], dtype=np.float32) / (grid_size[0] / base_size) / interpolation_scale + grid_w = np.arange(grid_size[1], dtype=np.float32) / (grid_size[1] / base_size) / interpolation_scale + grid = np.meshgrid(grid_w, grid_h) # here w goes first + grid = np.stack(grid, axis=0) + + grid = grid.reshape([2, 1, grid_size[1], grid_size[0]]) + pos_embed = get_2d_sincos_pos_embed_from_grid_np(embed_dim, grid) + if cls_token and extra_tokens > 0: + pos_embed = np.concatenate([np.zeros([extra_tokens, embed_dim]), pos_embed], axis=0) + return pos_embed + + +def get_2d_sincos_pos_embed_from_grid_np(embed_dim, grid): + r""" + This function generates 2D sinusoidal positional embeddings from a grid. + + Args: + embed_dim (`int`): The embedding dimension. + grid (`np.ndarray`): Grid of positions with shape `(H * W,)`. + + Returns: + `np.ndarray`: The 2D sinusoidal positional embeddings with shape `(H * W, embed_dim)` + """ + if embed_dim % 2 != 0: + raise ValueError("embed_dim must be divisible by 2") + + # use half of dimensions to encode grid_h + emb_h = get_1d_sincos_pos_embed_from_grid_np(embed_dim // 2, grid[0]) # (H*W, D/2) + emb_w = get_1d_sincos_pos_embed_from_grid_np(embed_dim // 2, grid[1]) # (H*W, D/2) + + emb = np.concatenate([emb_h, emb_w], axis=1) # (H*W, D) + return emb + + +def get_1d_sincos_pos_embed_from_grid_np(embed_dim, pos): + """ + This function generates 1D positional embeddings from a grid. + + Args: + embed_dim (`int`): The embedding dimension `D` + pos (`numpy.ndarray`): 1D tensor of positions with shape `(M,)` + + Returns: + `numpy.ndarray`: Sinusoidal positional embeddings of shape `(M, D)`. 
+ """ + if embed_dim % 2 != 0: + raise ValueError("embed_dim must be divisible by 2") + + omega = np.arange(embed_dim // 2, dtype=np.float64) + omega /= embed_dim / 2.0 + omega = 1.0 / 10000**omega # (D/2,) + + pos = pos.reshape(-1) # (M,) + out = np.einsum("m,d->md", pos, omega) # (M, D/2), outer product + + emb_sin = np.sin(out) # (M, D/2) + emb_cos = np.cos(out) # (M, D/2) + + emb = np.concatenate([emb_sin, emb_cos], axis=1) # (M, D) + return emb + + +class Timesteps(nn.Module): + def __init__(self, num_channels: int, flip_sin_to_cos: bool, downscale_freq_shift: float, scale: int = 1): + super().__init__() + self.num_channels = num_channels + self.flip_sin_to_cos = flip_sin_to_cos + self.downscale_freq_shift = downscale_freq_shift + self.scale = scale + + def forward(self, timesteps): + t_emb = get_timestep_embedding( + timesteps, + self.num_channels, + flip_sin_to_cos=self.flip_sin_to_cos, + downscale_freq_shift=self.downscale_freq_shift, + scale=self.scale, + ) + return t_emb + + +class TimestepEmbedding(nn.Module): + def __init__( + self, + in_channels: int, + time_embed_dim: int, + act_fn: str = "silu", + out_dim: int = None, + post_act_fn: Optional[str] = None, + cond_proj_dim=None, + sample_proj_bias=True, + ): + super().__init__() + + self.linear_1 = nn.Linear(in_channels, time_embed_dim, sample_proj_bias) + + if cond_proj_dim is not None: + self.cond_proj = nn.Linear(cond_proj_dim, in_channels, bias=False) + else: + self.cond_proj = None + + self.act = get_activation(act_fn) + + if out_dim is not None: + time_embed_dim_out = out_dim + else: + time_embed_dim_out = time_embed_dim + self.linear_2 = nn.Linear(time_embed_dim, time_embed_dim_out, sample_proj_bias) + + if post_act_fn is None: + self.post_act = None + else: + self.post_act = get_activation(post_act_fn) + + def forward(self, sample, condition=None): + if condition is not None: + sample = sample + self.cond_proj(condition) + sample = self.linear_1(sample) + + if self.act is not None: + sample = self.act(sample) + + sample = self.linear_2(sample) + + if self.post_act is not None: + sample = self.post_act(sample) + return sample + + +class PixArtAlphaTextProjection(nn.Module): + """ + Projects caption embeddings. Also handles dropout for classifier-free guidance. 
+ """ + + def __init__(self, in_features, hidden_size, out_features=None, act_fn="gelu_tanh"): + super().__init__() + if out_features is None: + out_features = hidden_size + self.linear_1 = nn.Linear(in_features=in_features, out_features=hidden_size, bias=True) + if act_fn == "gelu_tanh": + self.act_1 = nn.GELU(approximate="tanh") + elif act_fn == "silu": + self.act_1 = nn.SiLU() + elif act_fn == "silu_fp32": + self.act_1 = FP32SiLU() + else: + raise ValueError(f"Unknown activation function: {act_fn}") + self.linear_2 = nn.Linear(in_features=hidden_size, out_features=out_features, bias=True) + + def forward(self, caption): + hidden_states = self.linear_1(caption) + hidden_states = self.act_1(hidden_states) + hidden_states = self.linear_2(hidden_states) + return hidden_states + + +class CogView3CombinedTimestepSizeEmbeddings(nn.Module): + def __init__(self, embedding_dim: int, condition_dim: int, pooled_projection_dim: int, timesteps_dim: int = 256): + super().__init__() + + self.time_proj = Timesteps(num_channels=timesteps_dim, flip_sin_to_cos=True, downscale_freq_shift=0) + self.condition_proj = Timesteps(num_channels=condition_dim, flip_sin_to_cos=True, downscale_freq_shift=0) + self.timestep_embedder = TimestepEmbedding(in_channels=timesteps_dim, time_embed_dim=embedding_dim) + self.condition_embedder = PixArtAlphaTextProjection(pooled_projection_dim, embedding_dim, act_fn="silu") + + def forward( + self, + timestep: torch.Tensor, + original_size: torch.Tensor, + target_size: torch.Tensor, + crop_coords: torch.Tensor, + hidden_dtype: torch.dtype, + ) -> torch.Tensor: + timesteps_proj = self.time_proj(timestep) + + original_size_proj = self.condition_proj(original_size.flatten()).view(original_size.size(0), -1) + crop_coords_proj = self.condition_proj(crop_coords.flatten()).view(crop_coords.size(0), -1) + target_size_proj = self.condition_proj(target_size.flatten()).view(target_size.size(0), -1) + + # (B, 3 * condition_dim) + condition_proj = torch.cat([original_size_proj, crop_coords_proj, target_size_proj], dim=1) + + timesteps_emb = self.timestep_embedder(timesteps_proj.to(dtype=hidden_dtype)) # (B, embedding_dim) + condition_emb = self.condition_embedder(condition_proj.to(dtype=hidden_dtype)) # (B, embedding_dim) + + conditioning = timesteps_emb + condition_emb + return conditioning + + +class CogView3PlusPatchEmbed(nn.Module): + def __init__( + self, + in_channels: int = 16, + hidden_size: int = 2560, + patch_size: int = 2, + text_hidden_size: int = 4096, + pos_embed_max_size: int = 128, + ): + super().__init__() + self.in_channels = in_channels + self.hidden_size = hidden_size + self.patch_size = patch_size + self.text_hidden_size = text_hidden_size + self.pos_embed_max_size = pos_embed_max_size + # Linear projection for image patches + self.proj = nn.Linear(in_channels * patch_size**2, hidden_size) + + # Linear projection for text embeddings + self.text_proj = nn.Linear(text_hidden_size, hidden_size) + + pos_embed = get_2d_sincos_pos_embed( + hidden_size, pos_embed_max_size, base_size=pos_embed_max_size, output_type="pt" + ) + pos_embed = pos_embed.reshape(pos_embed_max_size, pos_embed_max_size, hidden_size) + self.register_buffer("pos_embed", pos_embed.float(), persistent=False) + + def forward(self, hidden_states: torch.Tensor, encoder_hidden_states: torch.Tensor) -> torch.Tensor: + batch_size, channel, height, width = hidden_states.shape + + if height % self.patch_size != 0 or width % self.patch_size != 0: + raise ValueError("Height and width must be divisible by patch size") + 
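+        # Patchify: (B, C, H, W) -> (B, (H/p) * (W/p), C * p * p) with p = patch_size; the view/permute/view
+        # below splits the spatial dims into p x p patches and flattens each patch into a token.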
+ height = height // self.patch_size + width = width // self.patch_size + hidden_states = hidden_states.view(batch_size, channel, height, self.patch_size, width, self.patch_size) + hidden_states = hidden_states.permute(0, 2, 4, 1, 3, 5).contiguous() + hidden_states = hidden_states.view(batch_size, height * width, channel * self.patch_size * self.patch_size) + + # Project the patches + hidden_states = self.proj(hidden_states) + encoder_hidden_states = self.text_proj(encoder_hidden_states) + hidden_states = torch.cat([encoder_hidden_states, hidden_states], dim=1) + + # Calculate text_length + text_length = encoder_hidden_states.shape[1] + + image_pos_embed = self.pos_embed[:height, :width].reshape(height * width, -1) + text_pos_embed = torch.zeros( + (text_length, self.hidden_size), dtype=image_pos_embed.dtype, device=image_pos_embed.device + ) + pos_embed = torch.cat([text_pos_embed, image_pos_embed], dim=0)[None, ...] + + return (hidden_states + pos_embed).to(hidden_states.dtype) \ No newline at end of file diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/layers/normalization.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/layers/normalization.py new file mode 100644 index 0000000000..b2576d26f5 --- /dev/null +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/layers/normalization.py @@ -0,0 +1,166 @@ +# coding=utf-8 +# Copyright 2024 HuggingFace Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import numbers +from typing import Optional, Tuple + +import torch +import torch.nn as nn +import torch.nn.functional as F + +from diffusers.utils import is_torch_version + + +if is_torch_version(">=", "2.1.0"): + LayerNorm = nn.LayerNorm +else: + # Has optional bias parameter compared to torch layer norm + # TODO: replace with torch layernorm once min required torch version >= 2.1 + class LayerNorm(nn.Module): + def __init__(self, dim, eps: float = 1e-5, elementwise_affine: bool = True, bias: bool = True): + super().__init__() + + self.eps = eps + + if isinstance(dim, numbers.Integral): + dim = (dim,) + + self.dim = torch.Size(dim) + + if elementwise_affine: + self.weight = nn.Parameter(torch.ones(dim)) + self.bias = nn.Parameter(torch.zeros(dim)) if bias else None + else: + self.weight = None + self.bias = None + + def forward(self, input): + return F.layer_norm(input, self.dim, self.weight, self.bias, self.eps) + + +class RMSNorm(nn.Module): + def __init__(self, dim, eps: float, elementwise_affine: bool = True, bias: bool = False): + super().__init__() + + self.eps = eps + self.elementwise_affine = elementwise_affine + + if isinstance(dim, numbers.Integral): + dim = (dim,) + + self.dim = torch.Size(dim) + + self.weight = None + self.bias = None + + if elementwise_affine: + self.weight = nn.Parameter(torch.ones(dim)) + if bias: + self.bias = nn.Parameter(torch.zeros(dim)) + + def forward(self, hidden_states): + input_dtype = hidden_states.dtype + variance = hidden_states.to(torch.float32).pow(2).mean(-1, keepdim=True) + hidden_states = hidden_states * torch.rsqrt(variance + self.eps) + + if self.weight is not None: + # convert into half-precision if necessary + if self.weight.dtype in [torch.float16, torch.bfloat16]: + hidden_states = hidden_states.to(self.weight.dtype) + hidden_states = hidden_states * self.weight + if self.bias is not None: + hidden_states = hidden_states + self.bias + else: + hidden_states = hidden_states.to(input_dtype) + + return hidden_states + + +class CogView3PlusAdaLayerNormZeroTextImage(nn.Module): + r""" + Norm layer adaptive layer norm zero (adaLN-Zero). + + Parameters: + embedding_dim (`int`): The size of each embedding vector. + num_embeddings (`int`): The size of the embeddings dictionary. 
+    """
+
+    def __init__(self, embedding_dim: int, dim: int):
+        super().__init__()
+
+        self.silu = nn.SiLU()
+        self.linear = nn.Linear(embedding_dim, 12 * dim, bias=True)
+        self.norm_x = nn.LayerNorm(dim, elementwise_affine=False, eps=1e-5)
+        self.norm_c = nn.LayerNorm(dim, elementwise_affine=False, eps=1e-5)
+
+    def forward(
+        self,
+        x: torch.Tensor,
+        context: torch.Tensor,
+        emb: Optional[torch.Tensor] = None,
+    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
+        emb = self.linear(self.silu(emb))
+        (
+            shift_msa,
+            scale_msa,
+            gate_msa,
+            shift_mlp,
+            scale_mlp,
+            gate_mlp,
+            c_shift_msa,
+            c_scale_msa,
+            c_gate_msa,
+            c_shift_mlp,
+            c_scale_mlp,
+            c_gate_mlp,
+        ) = emb.chunk(12, dim=1)
+        normed_x = self.norm_x(x)
+        normed_context = self.norm_c(context)
+        x = normed_x * (1 + scale_msa[:, None]) + shift_msa[:, None]
+        context = normed_context * (1 + c_scale_msa[:, None]) + c_shift_msa[:, None]
+        return x, gate_msa, shift_mlp, scale_mlp, gate_mlp, context, c_gate_msa, c_shift_mlp, c_scale_mlp, c_gate_mlp
+
+
+class AdaLayerNormContinuous(nn.Module):
+    def __init__(
+        self,
+        embedding_dim: int,
+        conditioning_embedding_dim: int,
+        # NOTE: It is a bit weird that the norm layer can be configured to have scale and shift parameters
+        # because the output is immediately scaled and shifted by the projected conditioning embeddings.
+        # Note that AdaLayerNorm does not let the norm layer have scale and shift parameters.
+        # However, this is how it was implemented in the original code, and it's rather likely you should
+        # set `elementwise_affine` to False.
+        elementwise_affine=True,
+        eps=1e-5,
+        bias=True,
+        norm_type="layer_norm",
+    ):
+        super().__init__()
+        self.silu = nn.SiLU()
+        self.linear = nn.Linear(conditioning_embedding_dim, embedding_dim * 2, bias=bias)
+        if norm_type == "layer_norm":
+            self.norm = LayerNorm(embedding_dim, eps, elementwise_affine, bias)
+        elif norm_type == "rms_norm":
+            self.norm = RMSNorm(embedding_dim, eps, elementwise_affine)
+        else:
+            raise ValueError(f"unknown norm_type {norm_type}")
+
+    def forward(self, x: torch.Tensor, conditioning_embedding: torch.Tensor) -> torch.Tensor:
+        # convert back to the original dtype in case `conditioning_embedding` is upcasted to float32 (needed for hunyuanDiT)
+        emb = self.linear(self.silu(conditioning_embedding).to(x.dtype))
+        scale, shift = torch.chunk(emb, 2, dim=1)
+        x = self.norm(x) * (1 + scale)[:, None, :] + shift[:, None, :]
+        return x
\ No newline at end of file
diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/__init__.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/__init__.py
new file mode 100644
index 0000000000..06571c58d3
--- /dev/null
+++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/__init__.py
@@ -0,0 +1 @@
+from .transformer_cogview3plus import CogView3PlusTransformer2DModel
\ No newline at end of file
diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/attention.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/attention.py
new file mode 100644
index 0000000000..00aabc9fdd
--- /dev/null
+++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/attention.py
@@ -0,0 +1,86 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import Optional + +import torch +from torch import nn + +from diffusers.utils import deprecate, logging +from diffusers.models.activations import GEGLU, GELU, ApproximateGELU, LinearActivation, SwiGLU + + +logger = logging.get_logger(__name__) + + +class FeedForward(nn.Module): + r""" + A feed-forward layer. + + Parameters: + dim (`int`): The number of channels in the input. + dim_out (`int`, *optional*): The number of channels in the output. If not given, defaults to `dim`. + mult (`int`, *optional*, defaults to 4): The multiplier to use for the hidden dimension. + dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use. + activation_fn (`str`, *optional*, defaults to `"geglu"`): Activation function to be used in feed-forward. + final_dropout (`bool` *optional*, defaults to False): Apply a final dropout. + bias (`bool`, defaults to True): Whether to use a bias in the linear layer. + """ + + def __init__( + self, + dim: int, + dim_out: Optional[int] = None, + mult: int = 4, + dropout: float = 0.0, + activation_fn: str = "geglu", + final_dropout: bool = False, + inner_dim=None, + bias: bool = True, + ): + super().__init__() + if inner_dim is None: + inner_dim = int(dim * mult) + dim_out = dim_out if dim_out is not None else dim + + if activation_fn == "gelu": + act_fn = GELU(dim, inner_dim, bias=bias) + if activation_fn == "gelu-approximate": + act_fn = GELU(dim, inner_dim, approximate="tanh", bias=bias) + elif activation_fn == "geglu": + act_fn = GEGLU(dim, inner_dim, bias=bias) + elif activation_fn == "geglu-approximate": + act_fn = ApproximateGELU(dim, inner_dim, bias=bias) + elif activation_fn == "swiglu": + act_fn = SwiGLU(dim, inner_dim, bias=bias) + elif activation_fn == "linear-silu": + act_fn = LinearActivation(dim, inner_dim, bias=bias, activation="silu") + + self.net = nn.ModuleList([]) + # project in + self.net.append(act_fn) + # project dropout + self.net.append(nn.Dropout(dropout)) + # project out + self.net.append(nn.Linear(inner_dim, dim_out, bias=bias)) + # FF as used in Vision Transformer, MLP-Mixer, etc. have a final dropout + if final_dropout: + self.net.append(nn.Dropout(dropout)) + + def forward(self, hidden_states: torch.Tensor, *args, **kwargs) -> torch.Tensor: + if len(args) > 0 or kwargs.get("scale", None) is not None: + deprecation_message = "The `scale` argument is deprecated and will be ignored. Please remove it, as passing it will raise an error in the future. `scale` should directly be passed while calling the underlying pipeline component i.e., via `cross_attention_kwargs`." 
+            deprecate("scale", "1.0.0", deprecation_message)
+        for module in self.net:
+            hidden_states = module(hidden_states)
+        return hidden_states
\ No newline at end of file
diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/attention_processor.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/attention_processor.py
new file mode 100644
index 0000000000..6632f7f83f
--- /dev/null
+++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/attention_processor.py
@@ -0,0 +1,82 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Optional
+
+import torch
+import torch.nn.functional as F
+
+from diffusers.models.attention_processor import Attention
+
+
+class CogVideoXAttnProcessor2_0:
+    r"""
+    Processor for implementing scaled dot-product attention for the CogView3 model. Unlike its CogVideoX counterpart,
+    it applies no rotary embedding to the query and key vectors and does not include spatial normalization.
+    """
+
+    def __init__(self):
+        if not hasattr(F, "scaled_dot_product_attention"):
+            raise ImportError("CogVideoXAttnProcessor requires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0.")
+
+    def __call__(
+        self,
+        attn: Attention,
+        hidden_states: torch.Tensor,
+        encoder_hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        text_seq_length = encoder_hidden_states.size(1)
+
+        hidden_states = torch.cat([encoder_hidden_states, hidden_states], dim=1)
+
+        batch_size, sequence_length, _ = (
+            hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
+        )
+
+        if attention_mask is not None:
+            attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
+            attention_mask = attention_mask.view(batch_size, attn.heads, -1, attention_mask.shape[-1])
+
+        query = attn.to_q(hidden_states)
+        key = attn.to_k(hidden_states)
+        value = attn.to_v(hidden_states)
+
+        inner_dim = key.shape[-1]
+        head_dim = inner_dim // attn.heads
+
+        query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+        key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+        value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+
+        if attn.norm_q is not None:
+            query = attn.norm_q(query)
+        if attn.norm_k is not None:
+            key = attn.norm_k(key)
+
+        hidden_states = F.scaled_dot_product_attention(
+            query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False
+        )
+
+        hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
+
+        # linear proj
+        hidden_states = attn.to_out[0](hidden_states)
+        # dropout
+        hidden_states = attn.to_out[1](hidden_states)
+
+        encoder_hidden_states, hidden_states = hidden_states.split(
+            [text_seq_length, hidden_states.size(1) - text_seq_length], dim=1
+        )
+        return hidden_states, encoder_hidden_states
\ No newline at end of file
diff --git 
a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/transformer_cogview3plus.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/transformer_cogview3plus.py new file mode 100644 index 0000000000..78360f61e9 --- /dev/null +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/transformer_cogview3plus.py @@ -0,0 +1,383 @@ +# Copyright 2024 The CogView team, Tsinghua University & ZhipuAI and The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from typing import Any, Dict, Union + +import torch +import torch.nn as nn + +from diffusers.configuration_utils import ConfigMixin, register_to_config +from diffusers.models.attention_processor import Attention, AttentionProcessor +from diffusers.models.modeling_utils import ModelMixin +from diffusers.utils import is_torch_version, logging +from diffusers.models.modeling_outputs import Transformer2DModelOutput + +from .attention import FeedForward +from .attention_processor import CogVideoXAttnProcessor2_0 +from ..layers import CogView3PlusAdaLayerNormZeroTextImage, AdaLayerNormContinuous +from ..layers import CogView3CombinedTimestepSizeEmbeddings, CogView3PlusPatchEmbed + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + + +class CogView3PlusTransformerBlock(nn.Module): + r""" + Transformer block used in [CogView](https://github.com/THUDM/CogView3) model. + + Args: + dim (`int`): + The number of channels in the input and output. + num_attention_heads (`int`): + The number of heads to use for multi-head attention. + attention_head_dim (`int`): + The number of channels in each head. + time_embed_dim (`int`): + The number of channels in timestep embedding. 
+ """ + + def __init__( + self, + dim: int = 2560, + num_attention_heads: int = 64, + attention_head_dim: int = 40, + time_embed_dim: int = 512, + ): + super().__init__() + + self.norm1 = CogView3PlusAdaLayerNormZeroTextImage(embedding_dim=time_embed_dim, dim=dim) + + self.attn1 = Attention( + query_dim=dim, + heads=num_attention_heads, + dim_head=attention_head_dim, + out_dim=dim, + bias=True, + qk_norm="layer_norm", + elementwise_affine=False, + eps=1e-6, + processor=CogVideoXAttnProcessor2_0(), + ) + + self.norm2 = nn.LayerNorm(dim, elementwise_affine=False, eps=1e-5) + self.norm2_context = nn.LayerNorm(dim, elementwise_affine=False, eps=1e-5) + + self.ff = FeedForward(dim=dim, dim_out=dim, activation_fn="gelu-approximate") + + def forward( + self, + hidden_states: torch.Tensor, + encoder_hidden_states: torch.Tensor, + emb: torch.Tensor, + ) -> torch.Tensor: + text_seq_length = encoder_hidden_states.size(1) + + # norm & modulate + ( + norm_hidden_states, + gate_msa, + shift_mlp, + scale_mlp, + gate_mlp, + norm_encoder_hidden_states, + c_gate_msa, + c_shift_mlp, + c_scale_mlp, + c_gate_mlp, + ) = self.norm1(hidden_states, encoder_hidden_states, emb) + + # attention + attn_hidden_states, attn_encoder_hidden_states = self.attn1( + hidden_states=norm_hidden_states, encoder_hidden_states=norm_encoder_hidden_states + ) + + hidden_states = hidden_states + gate_msa.unsqueeze(1) * attn_hidden_states + encoder_hidden_states = encoder_hidden_states + c_gate_msa.unsqueeze(1) * attn_encoder_hidden_states + + # norm & modulate + norm_hidden_states = self.norm2(hidden_states) + norm_hidden_states = norm_hidden_states * (1 + scale_mlp[:, None]) + shift_mlp[:, None] + + norm_encoder_hidden_states = self.norm2_context(encoder_hidden_states) + norm_encoder_hidden_states = norm_encoder_hidden_states * (1 + c_scale_mlp[:, None]) + c_shift_mlp[:, None] + + # feed-forward + norm_hidden_states = torch.cat([norm_encoder_hidden_states, norm_hidden_states], dim=1) + ff_output = self.ff(norm_hidden_states) + + hidden_states = hidden_states + gate_mlp.unsqueeze(1) * ff_output[:, text_seq_length:] + encoder_hidden_states = encoder_hidden_states + c_gate_mlp.unsqueeze(1) * ff_output[:, :text_seq_length] + + if hidden_states.dtype == torch.float16: + hidden_states = hidden_states.clip(-65504, 65504) + if encoder_hidden_states.dtype == torch.float16: + encoder_hidden_states = encoder_hidden_states.clip(-65504, 65504) + return hidden_states, encoder_hidden_states + + +class CogView3PlusTransformer2DModel(ModelMixin, ConfigMixin): + r""" + The Transformer model introduced in [CogView3: Finer and Faster Text-to-Image Generation via Relay + Diffusion](https://huggingface.co/papers/2403.05121). + + Args: + patch_size (`int`, defaults to `2`): + The size of the patches to use in the patch embedding layer. + in_channels (`int`, defaults to `16`): + The number of channels in the input. + num_layers (`int`, defaults to `30`): + The number of layers of Transformer blocks to use. + attention_head_dim (`int`, defaults to `40`): + The number of channels in each head. + num_attention_heads (`int`, defaults to `64`): + The number of heads to use for multi-head attention. + out_channels (`int`, defaults to `16`): + The number of channels in the output. + text_embed_dim (`int`, defaults to `4096`): + Input dimension of text embeddings from the text encoder. + time_embed_dim (`int`, defaults to `512`): + Output dimension of timestep embeddings. 
+ condition_dim (`int`, defaults to `256`): + The embedding dimension of the input SDXL-style resolution conditions (original_size, target_size, + crop_coords). + pos_embed_max_size (`int`, defaults to `128`): + The maximum resolution of the positional embeddings, from which slices of shape `H x W` are taken and added + to input patched latents, where `H` and `W` are the latent height and width respectively. A value of 128 + means that the maximum supported height and width for image generation is `128 * vae_scale_factor * + patch_size => 128 * 8 * 2 => 2048`. + sample_size (`int`, defaults to `128`): + The base resolution of input latents. If height/width is not provided during generation, this value is used + to determine the resolution as `sample_size * vae_scale_factor => 128 * 8 => 1024` + """ + + _supports_gradient_checkpointing = True + + @register_to_config + def __init__( + self, + patch_size: int = 2, + in_channels: int = 16, + num_layers: int = 30, + attention_head_dim: int = 40, + num_attention_heads: int = 64, + out_channels: int = 16, + text_embed_dim: int = 4096, + time_embed_dim: int = 512, + condition_dim: int = 256, + pos_embed_max_size: int = 128, + sample_size: int = 128, + ): + super().__init__() + self.out_channels = out_channels + self.inner_dim = num_attention_heads * attention_head_dim + + # CogView3 uses 3 additional SDXL-like conditions - original_size, target_size, crop_coords + # Each of these are sincos embeddings of shape 2 * condition_dim + self.pooled_projection_dim = 3 * 2 * condition_dim + + self.patch_embed = CogView3PlusPatchEmbed( + in_channels=in_channels, + hidden_size=self.inner_dim, + patch_size=patch_size, + text_hidden_size=text_embed_dim, + pos_embed_max_size=pos_embed_max_size, + ) + + self.time_condition_embed = CogView3CombinedTimestepSizeEmbeddings( + embedding_dim=time_embed_dim, + condition_dim=condition_dim, + pooled_projection_dim=self.pooled_projection_dim, + timesteps_dim=self.inner_dim, + ) + + self.transformer_blocks = nn.ModuleList( + [ + CogView3PlusTransformerBlock( + dim=self.inner_dim, + num_attention_heads=num_attention_heads, + attention_head_dim=attention_head_dim, + time_embed_dim=time_embed_dim, + ) + for _ in range(num_layers) + ] + ) + + self.norm_out = AdaLayerNormContinuous( + embedding_dim=self.inner_dim, + conditioning_embedding_dim=time_embed_dim, + elementwise_affine=False, + eps=1e-6, + ) + self.proj_out = nn.Linear(self.inner_dim, patch_size * patch_size * self.out_channels, bias=True) + + self.gradient_checkpointing = False + + @property + # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.attn_processors + def attn_processors(self) -> Dict[str, AttentionProcessor]: + r""" + Returns: + `dict` of attention processors: A dictionary containing all attention processors used in the model with + indexed by its weight name. 
+ """ + # set recursively + processors = {} + + def fn_recursive_add_processors(name: str, module: torch.nn.Module, processors: Dict[str, AttentionProcessor]): + if hasattr(module, "get_processor"): + processors[f"{name}.processor"] = module.get_processor() + + for sub_name, child in module.named_children(): + fn_recursive_add_processors(f"{name}.{sub_name}", child, processors) + + return processors + + for name, module in self.named_children(): + fn_recursive_add_processors(name, module, processors) + + return processors + + # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.set_attn_processor + def set_attn_processor(self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]]): + r""" + Sets the attention processor to use to compute attention. + + Parameters: + processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`): + The instantiated processor class or a dictionary of processor classes that will be set as the processor + for **all** `Attention` layers. + + If `processor` is a dict, the key needs to define the path to the corresponding cross attention + processor. This is strongly recommended when setting trainable attention processors. + + """ + count = len(self.attn_processors.keys()) + + if isinstance(processor, dict) and len(processor) != count: + raise ValueError( + f"A dict of processors was passed, but the number of processors {len(processor)} does not match the" + f" number of attention layers: {count}. Please make sure to pass {count} processor classes." + ) + + def fn_recursive_attn_processor(name: str, module: torch.nn.Module, processor): + if hasattr(module, "set_processor"): + if not isinstance(processor, dict): + module.set_processor(processor) + else: + module.set_processor(processor.pop(f"{name}.processor")) + + for sub_name, child in module.named_children(): + fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor) + + for name, module in self.named_children(): + fn_recursive_attn_processor(name, module, processor) + + def _set_gradient_checkpointing(self, module, value=False): + if hasattr(module, "gradient_checkpointing"): + module.gradient_checkpointing = value + + def forward( + self, + hidden_states: torch.Tensor, + encoder_hidden_states: torch.Tensor, + timestep: torch.LongTensor, + original_size: torch.Tensor, + target_size: torch.Tensor, + crop_coords: torch.Tensor, + return_dict: bool = True, + ) -> Union[torch.Tensor, Transformer2DModelOutput]: + """ + The [`CogView3PlusTransformer2DModel`] forward method. + + Args: + hidden_states (`torch.Tensor`): + Input `hidden_states` of shape `(batch size, channel, height, width)`. + encoder_hidden_states (`torch.Tensor`): + Conditional embeddings (embeddings computed from the input conditions such as prompts) of shape + `(batch_size, sequence_len, text_embed_dim)` + timestep (`torch.LongTensor`): + Used to indicate denoising step. + original_size (`torch.Tensor`): + CogView3 uses SDXL-like micro-conditioning for original image size as explained in section 2.2 of + [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). + target_size (`torch.Tensor`): + CogView3 uses SDXL-like micro-conditioning for target image size as explained in section 2.2 of + [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). 
+ crop_coords (`torch.Tensor`): + CogView3 uses SDXL-like micro-conditioning for crop coordinates as explained in section 2.2 of + [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~models.transformer_2d.Transformer2DModelOutput`] instead of a plain + tuple. + + Returns: + `torch.Tensor` or [`~models.transformer_2d.Transformer2DModelOutput`]: + The denoised latents using provided inputs as conditioning. + """ + height, width = hidden_states.shape[-2:] + text_seq_length = encoder_hidden_states.shape[1] + + hidden_states = self.patch_embed( + hidden_states, encoder_hidden_states + ) # takes care of adding positional embeddings too. + emb = self.time_condition_embed(timestep, original_size, target_size, crop_coords, hidden_states.dtype) + + encoder_hidden_states = hidden_states[:, :text_seq_length] + hidden_states = hidden_states[:, text_seq_length:] + + for index_block, block in enumerate(self.transformer_blocks): + if torch.is_grad_enabled() and self.gradient_checkpointing: + + def create_custom_forward(module): + def custom_forward(*inputs): + return module(*inputs) + + return custom_forward + + ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {} + hidden_states, encoder_hidden_states = torch.utils.checkpoint.checkpoint( + create_custom_forward(block), + hidden_states, + encoder_hidden_states, + emb, + **ckpt_kwargs, + ) + else: + hidden_states, encoder_hidden_states = block( + hidden_states=hidden_states, + encoder_hidden_states=encoder_hidden_states, + emb=emb, + ) + + hidden_states = self.norm_out(hidden_states, emb) + hidden_states = self.proj_out(hidden_states) # (batch_size, height*width, patch_size*patch_size*out_channels) + + # unpatchify + patch_size = self.config.patch_size + height = height // patch_size + width = width // patch_size + + hidden_states = hidden_states.reshape( + shape=(hidden_states.shape[0], height, width, self.out_channels, patch_size, patch_size) + ) + hidden_states = torch.einsum("nhwcpq->nchpwq", hidden_states) + output = hidden_states.reshape( + shape=(hidden_states.shape[0], self.out_channels, height * patch_size, width * patch_size) + ) + + if not return_dict: + return (output,) + + return Transformer2DModelOutput(sample=output) \ No newline at end of file diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/__init__.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/__init__.py new file mode 100644 index 0000000000..aea730c2e3 --- /dev/null +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/__init__.py @@ -0,0 +1 @@ +from .pipeline_cogview3plus import CogView3PlusPipeline \ No newline at end of file diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py new file mode 100644 index 0000000000..a78f82a9b2 --- /dev/null +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py @@ -0,0 +1,675 @@ +# Copyright 2024 The CogVideoX team, Tsinghua University & ZhipuAI and The HuggingFace Team. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import inspect +from typing import Callable, Dict, List, Optional, Tuple, Union + +import torch +from transformers import T5EncoderModel, T5Tokenizer + +from diffusers.callbacks import MultiPipelineCallbacks, PipelineCallback +from diffusers.image_processor import VaeImageProcessor +from diffusers.pipelines.pipeline_utils import DiffusionPipeline +from diffusers.utils import logging, replace_example_docstring +from diffusers.utils.torch_utils import randn_tensor + +from ..vae import AutoencoderKL +from ..models import CogView3PlusTransformer2DModel +from ..schedulers import CogVideoXDDIMScheduler, CogVideoXDPMScheduler +from .pipeline_output import CogView3PipelineOutput + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + + +EXAMPLE_DOC_STRING = """ + Examples: + ```python + >>> import torch + >>> from diffusers import CogView3PlusPipeline + + >>> pipe = CogView3PlusPipeline.from_pretrained("THUDM/CogView3-Plus-3B", torch_dtype=torch.bfloat16) + >>> pipe.to("cuda") + + >>> prompt = "A photo of an astronaut riding a horse on mars" + >>> image = pipe(prompt).images[0] + >>> image.save("output.png") + ``` +""" + + +# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps +def retrieve_timesteps( + scheduler, + num_inference_steps: Optional[int] = None, + device: Optional[Union[str, torch.device]] = None, + timesteps: Optional[List[int]] = None, + sigmas: Optional[List[float]] = None, + **kwargs, +): + r""" + Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles + custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`. + + Args: + scheduler (`SchedulerMixin`): + The scheduler to get timesteps from. + num_inference_steps (`int`): + The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps` + must be `None`. + device (`str` or `torch.device`, *optional*): + The device to which the timesteps should be moved to. If `None`, the timesteps are not moved. + timesteps (`List[int]`, *optional*): + Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed, + `num_inference_steps` and `sigmas` must be `None`. + sigmas (`List[float]`, *optional*): + Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed, + `num_inference_steps` and `timesteps` must be `None`. + + Returns: + `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the + second element is the number of inference steps. + """ + if timesteps is not None and sigmas is not None: + raise ValueError("Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values") + if timesteps is not None: + accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys()) + if not accepts_timesteps: + raise ValueError( + f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom" + f" timestep schedules. 
Please check whether you are using the correct scheduler." + ) + scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs) + timesteps = scheduler.timesteps + num_inference_steps = len(timesteps) + elif sigmas is not None: + accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys()) + if not accept_sigmas: + raise ValueError( + f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom" + f" sigmas schedules. Please check whether you are using the correct scheduler." + ) + scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs) + timesteps = scheduler.timesteps + num_inference_steps = len(timesteps) + else: + scheduler.set_timesteps(num_inference_steps, device=device, **kwargs) + timesteps = scheduler.timesteps + return timesteps, num_inference_steps + + +class CogView3PlusPipeline(DiffusionPipeline): + r""" + Pipeline for text-to-image generation using CogView3Plus. + + This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the + library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) + + Args: + vae ([`AutoencoderKL`]): + Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations. + text_encoder ([`T5EncoderModel`]): + Frozen text-encoder. CogView3Plus uses + [T5](https://huggingface.co/docs/transformers/model_doc/t5#transformers.T5EncoderModel); specifically the + [t5-v1_1-xxl](https://huggingface.co/PixArt-alpha/PixArt-alpha/tree/main/t5-v1_1-xxl) variant. + tokenizer (`T5Tokenizer`): + Tokenizer of class + [T5Tokenizer](https://huggingface.co/docs/transformers/model_doc/t5#transformers.T5Tokenizer). + transformer ([`CogView3PlusTransformer2DModel`]): + A text conditioned `CogView3PlusTransformer2DModel` to denoise the encoded image latents. + scheduler ([`SchedulerMixin`]): + A scheduler to be used in combination with `transformer` to denoise the encoded image latents. 
+ """ + + _optional_components = [] + model_cpu_offload_seq = "text_encoder->transformer->vae" + + _callback_tensor_inputs = [ + "latents", + "prompt_embeds", + "negative_prompt_embeds", + ] + + def __init__( + self, + tokenizer: T5Tokenizer, + text_encoder: T5EncoderModel, + vae: AutoencoderKL, + transformer: CogView3PlusTransformer2DModel, + scheduler: Union[CogVideoXDDIMScheduler, CogVideoXDPMScheduler], + ): + super().__init__() + + self.register_modules( + tokenizer=tokenizer, text_encoder=text_encoder, vae=vae, transformer=transformer, scheduler=scheduler + ) + self.vae_scale_factor = ( + 2 ** (len(self.vae.config.block_out_channels) - 1) if hasattr(self, "vae") and self.vae is not None else 8 + ) + + self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor) + + # Copied from diffusers.pipelines.cogvideo.pipeline_cogvideox.CogVideoXPipeline._get_t5_prompt_embeds with num_videos_per_prompt->num_images_per_prompt + def _get_t5_prompt_embeds( + self, + prompt: Union[str, List[str]] = None, + num_images_per_prompt: int = 1, + max_sequence_length: int = 226, + device: Optional[torch.device] = None, + dtype: Optional[torch.dtype] = None, + ): + device = device or self._execution_device + dtype = dtype or self.text_encoder.dtype + + prompt = [prompt] if isinstance(prompt, str) else prompt + batch_size = len(prompt) + + text_inputs = self.tokenizer( + prompt, + padding="max_length", + max_length=max_sequence_length, + truncation=True, + add_special_tokens=True, + return_tensors="pt", + ) + text_input_ids = text_inputs.input_ids + untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids + + if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(text_input_ids, untruncated_ids): + removed_text = self.tokenizer.batch_decode(untruncated_ids[:, max_sequence_length - 1 : -1]) + logger.warning( + "The following part of your input was truncated because `max_sequence_length` is set to " + f" {max_sequence_length} tokens: {removed_text}" + ) + + prompt_embeds = self.text_encoder(text_input_ids.to(device))[0] + prompt_embeds = prompt_embeds.to(dtype=dtype, device=device) + + # duplicate text embeddings for each generation per prompt, using mps friendly method + _, seq_len, _ = prompt_embeds.shape + prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1) + prompt_embeds = prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1) + + return prompt_embeds + + def encode_prompt( + self, + prompt: Union[str, List[str]], + negative_prompt: Optional[Union[str, List[str]]] = None, + do_classifier_free_guidance: bool = True, + num_images_per_prompt: int = 1, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, + max_sequence_length: int = 224, + device: Optional[torch.device] = None, + dtype: Optional[torch.dtype] = None, + ): + r""" + Encodes the prompt into text encoder hidden states. + + Args: + prompt (`str` or `List[str]`, *optional*): + prompt to be encoded + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation. If not defined, one has to pass + `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is + less than `1`). + do_classifier_free_guidance (`bool`, *optional*, defaults to `True`): + Whether to use classifier free guidance or not. + num_images_per_prompt (`int`, *optional*, defaults to 1): + Number of images that should be generated per prompt. 
torch device to place the resulting embeddings on + prompt_embeds (`torch.Tensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not + provided, text embeddings will be generated from `prompt` input argument. + negative_prompt_embeds (`torch.Tensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input + argument. + max_sequence_length (`int`, defaults to `224`): + Maximum sequence length in encoded prompt. Can be set to other values but may lead to poorer results. + device: (`torch.device`, *optional*): + torch device + dtype: (`torch.dtype`, *optional*): + torch dtype + """ + device = device or self._execution_device + + prompt = [prompt] if isinstance(prompt, str) else prompt + if prompt is not None: + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + if prompt_embeds is None: + prompt_embeds = self._get_t5_prompt_embeds( + prompt=prompt, + num_images_per_prompt=num_images_per_prompt, + max_sequence_length=max_sequence_length, + device=device, + dtype=dtype, + ) + + if do_classifier_free_guidance and negative_prompt is None: + negative_prompt_embeds = prompt_embeds.new_zeros(prompt_embeds.shape) + + if do_classifier_free_guidance and negative_prompt_embeds is None: + negative_prompt = batch_size * [negative_prompt] if isinstance(negative_prompt, str) else negative_prompt + + if prompt is not None and type(prompt) is not type(negative_prompt): + raise TypeError( + f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" + f" {type(prompt)}." + ) + elif batch_size != len(negative_prompt): + raise ValueError( + f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" + f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" + " the batch size of `prompt`." + ) + + negative_prompt_embeds = self._get_t5_prompt_embeds( + prompt=negative_prompt, + num_images_per_prompt=num_images_per_prompt, + max_sequence_length=max_sequence_length, + device=device, + dtype=dtype, + ) + + return prompt_embeds, negative_prompt_embeds + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents + def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None): + shape = ( + batch_size, + num_channels_latents, + int(height) // self.vae_scale_factor, + int(width) // self.vae_scale_factor, + ) + if isinstance(generator, list) and len(generator) != batch_size: + raise ValueError( + f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" + f" size of {batch_size}. Make sure the batch size matches the length of the generators." 
+ ) + + if latents is None: + latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype) + else: + latents = latents.to(device) + + # scale the initial noise by the standard deviation required by the scheduler + latents = latents * self.scheduler.init_noise_sigma + return latents + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs + def prepare_extra_step_kwargs(self, generator, eta): + # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature + # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. + # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 + # and should be between [0, 1] + + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) + extra_step_kwargs = {} + if accepts_eta: + extra_step_kwargs["eta"] = eta + + # check if the scheduler accepts generator + accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) + if accepts_generator: + extra_step_kwargs["generator"] = generator + return extra_step_kwargs + + # Copied from diffusers.pipelines.latte.pipeline_latte.LattePipeline.check_inputs + def check_inputs( + self, + prompt, + height, + width, + negative_prompt, + callback_on_step_end_tensor_inputs, + prompt_embeds=None, + negative_prompt_embeds=None, + ): + if height % 8 != 0 or width % 8 != 0: + raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") + + if callback_on_step_end_tensor_inputs is not None and not all( + k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs + ): + raise ValueError( + f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}" + ) + if prompt is not None and prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" + " only forward one of the two." + ) + elif prompt is None and prompt_embeds is None: + raise ValueError( + "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." + ) + elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") + + if prompt is not None and negative_prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `prompt`: {prompt} and `negative_prompt_embeds`:" + f" {negative_prompt_embeds}. Please make sure to only forward one of the two." + ) + + if negative_prompt is not None and negative_prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:" + f" {negative_prompt_embeds}. Please make sure to only forward one of the two." + ) + + if prompt_embeds is not None and negative_prompt_embeds is not None: + if prompt_embeds.shape != negative_prompt_embeds.shape: + raise ValueError( + "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" + f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" + f" {negative_prompt_embeds.shape}." 
+ ) + + @property + def guidance_scale(self): + return self._guidance_scale + + # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) + # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` + # corresponds to doing no classifier free guidance. + @property + def do_classifier_free_guidance(self): + return self._guidance_scale > 1 + + @property + def num_timesteps(self): + return self._num_timesteps + + @property + def interrupt(self): + return self._interrupt + + @torch.no_grad() + @replace_example_docstring(EXAMPLE_DOC_STRING) + def __call__( + self, + prompt: Optional[Union[str, List[str]]] = None, + negative_prompt: Optional[Union[str, List[str]]] = None, + height: Optional[int] = None, + width: Optional[int] = None, + num_inference_steps: int = 50, + timesteps: Optional[List[int]] = None, + guidance_scale: float = 5.0, + num_images_per_prompt: int = 1, + eta: float = 0.0, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + latents: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + original_size: Optional[Tuple[int, int]] = None, + crops_coords_top_left: Tuple[int, int] = (0, 0), + output_type: str = "pil", + return_dict: bool = True, + callback_on_step_end: Optional[ + Union[Callable[[int, int, Dict], None], PipelineCallback, MultiPipelineCallbacks] + ] = None, + callback_on_step_end_tensor_inputs: List[str] = ["latents"], + max_sequence_length: int = 224, + ) -> Union[CogView3PipelineOutput, Tuple]: + """ + Function invoked when calling the pipeline for generation. + + Args: + prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`. + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation. If not defined, one has to pass + `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is + less than `1`). + height (`int`, *optional*, defaults to self.transformer.config.sample_size * self.vae_scale_factor): + The height in pixels of the generated image. If not provided, it is set to 1024. + width (`int`, *optional*, defaults to self.transformer.config.sample_size * self.vae_scale_factor): + The width in pixels of the generated image. If not provided it is set to 1024. + num_inference_steps (`int`, *optional*, defaults to `50`): + The number of denoising steps. More denoising steps usually lead to a higher quality image at the + expense of slower inference. + timesteps (`List[int]`, *optional*): + Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument + in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is + passed will be used. Must be in descending order. + guidance_scale (`float`, *optional*, defaults to `5.0`): + Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). + `guidance_scale` is defined as `w` of equation 2. of [Imagen + Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > + 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, + usually at the expense of lower image quality. 
+            num_images_per_prompt (`int`, *optional*, defaults to `1`):
+                The number of images to generate per prompt.
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
+                to make generation deterministic.
+            latents (`torch.FloatTensor`, *optional*):
+                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor will be generated by sampling using the supplied random `generator`.
+            prompt_embeds (`torch.FloatTensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`torch.FloatTensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+            original_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
+                If `original_size` is not the same as `target_size` the image will appear to be down- or upsampled.
+                `original_size` defaults to `(height, width)` if not specified. Part of SDXL's micro-conditioning as
+                explained in section 2.2 of
+                [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
+            crops_coords_top_left (`Tuple[int]`, *optional*, defaults to (0, 0)):
+                `crops_coords_top_left` can be used to generate an image that appears to be "cropped" from the position
+                `crops_coords_top_left` downwards. Favorable, well-centered images are usually achieved by setting
+                `crops_coords_top_left` to (0, 0). Part of SDXL's micro-conditioning as explained in section 2.2 of
+                [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generated image. Choose between
+                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.cogview3.pipeline_cogview3plus.CogView3PipelineOutput`] instead
+                of a plain tuple.
+            callback_on_step_end (`Callable`, *optional*):
+                A function that is called at the end of each denoising step during inference. The function is called
+                with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
+                callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by
+                `callback_on_step_end_tensor_inputs`.
+            callback_on_step_end_tensor_inputs (`List`, *optional*):
+                The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
+                will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
+                `._callback_tensor_inputs` attribute of your pipeline class.
+            max_sequence_length (`int`, defaults to `224`):
+                Maximum sequence length in encoded prompt. 
Can be set to other values but may lead to poorer results. + + Examples: + + Returns: + [`~pipelines.cogview3.pipeline_cogview3plus.CogView3PipelineOutput`] or `tuple`: + [`~pipelines.cogview3.pipeline_cogview3plus.CogView3PipelineOutput`] if `return_dict` is True, otherwise a + `tuple`. When returning a tuple, the first element is a list with the generated images. + """ + + if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)): + callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs + + height = height or self.transformer.config.sample_size * self.vae_scale_factor + width = width or self.transformer.config.sample_size * self.vae_scale_factor + + original_size = original_size or (height, width) + target_size = (height, width) + + # 1. Check inputs. Raise error if not correct + self.check_inputs( + prompt, + height, + width, + negative_prompt, + callback_on_step_end_tensor_inputs, + prompt_embeds, + negative_prompt_embeds, + ) + self._guidance_scale = guidance_scale + self._interrupt = False + + # 2. Default call parameters + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + device = self._execution_device + + # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) + # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` + # corresponds to doing no classifier free guidance. + do_classifier_free_guidance = guidance_scale > 1.0 + + # 3. Encode input prompt + prompt_embeds, negative_prompt_embeds = self.encode_prompt( + prompt, + negative_prompt, + self.do_classifier_free_guidance, + num_images_per_prompt=num_images_per_prompt, + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_prompt_embeds, + max_sequence_length=max_sequence_length, + device=device, + ) + if self.do_classifier_free_guidance: + prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds], dim=0) + + # 4. Prepare timesteps + timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, num_inference_steps, device, timesteps) + self._num_timesteps = len(timesteps) + + # 5. Prepare latents. + latent_channels = self.transformer.config.in_channels + latents = self.prepare_latents( + batch_size * num_images_per_prompt, + latent_channels, + height, + width, + prompt_embeds.dtype, + device, + generator, + latents, + ) + + # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline + extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) + + # 7. Prepare additional timestep conditions + original_size = torch.tensor([original_size], dtype=prompt_embeds.dtype) + target_size = torch.tensor([target_size], dtype=prompt_embeds.dtype) + crops_coords_top_left = torch.tensor([crops_coords_top_left], dtype=prompt_embeds.dtype) + + if self.do_classifier_free_guidance: + original_size = torch.cat([original_size, original_size]) + target_size = torch.cat([target_size, target_size]) + crops_coords_top_left = torch.cat([crops_coords_top_left, crops_coords_top_left]) + + original_size = original_size.to(device).repeat(batch_size * num_images_per_prompt, 1) + target_size = target_size.to(device).repeat(batch_size * num_images_per_prompt, 1) + crops_coords_top_left = crops_coords_top_left.to(device).repeat(batch_size * num_images_per_prompt, 1) + + # 8. 
Denoising loop + num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0) + + with self.progress_bar(total=num_inference_steps) as progress_bar: + # for DPM-solver++ + old_pred_original_sample = None + for i, t in enumerate(timesteps): + if self.interrupt: + continue + + latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) + + # broadcast to batch dimension in a way that's compatible with ONNX/Core ML + timestep = t.expand(latent_model_input.shape[0]) + + # predict noise model_output + noise_pred = self.transformer( + hidden_states=latent_model_input, + encoder_hidden_states=prompt_embeds, + timestep=timestep, + original_size=original_size, + target_size=target_size, + crop_coords=crops_coords_top_left, + return_dict=False, + )[0] + noise_pred = noise_pred.float() + + # perform guidance + if self.do_classifier_free_guidance: + noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) + noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_text - noise_pred_uncond) + + # compute the previous noisy sample x_t -> x_t-1 + if not isinstance(self.scheduler, CogVideoXDPMScheduler): + latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0] + else: + latents, old_pred_original_sample = self.scheduler.step( + noise_pred, + old_pred_original_sample, + t, + timesteps[i - 1] if i > 0 else None, + latents, + **extra_step_kwargs, + return_dict=False, + ) + latents = latents.to(prompt_embeds.dtype) + + # call the callback, if provided + if callback_on_step_end is not None: + callback_kwargs = {} + for k in callback_on_step_end_tensor_inputs: + callback_kwargs[k] = locals()[k] + callback_outputs = callback_on_step_end(self, i, t, callback_kwargs) + + latents = callback_outputs.pop("latents", latents) + prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds) + negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds) + + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): + progress_bar.update() + + if not output_type == "latent": + image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False, generator=generator)[ + 0 + ] + else: + image = latents + + image = self.image_processor.postprocess(image, output_type=output_type) + + # Offload all models + self.maybe_free_model_hooks() + + if not return_dict: + return (image,) + + return CogView3PipelineOutput(images=image) \ No newline at end of file diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_output.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_output.py new file mode 100644 index 0000000000..11f8976f0e --- /dev/null +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_output.py @@ -0,0 +1,21 @@ +from dataclasses import dataclass +from typing import List, Union + +import numpy as np +import PIL.Image + +from diffusers.utils import BaseOutput + + +@dataclass +class CogView3PipelineOutput(BaseOutput): + """ + Output class for CogView3 pipelines. + + Args: + images (`List[PIL.Image.Image]` or `np.ndarray`) + List of denoised PIL images of length `batch_size` or numpy array of shape `(batch_size, height, width, + num_channels)`. PIL images or numpy array present the denoised images of the diffusion pipeline. 
+ """ + + images: Union[List[PIL.Image.Image], np.ndarray] \ No newline at end of file diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/schedulers/__init__.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/schedulers/__init__.py new file mode 100644 index 0000000000..76b000d4bb --- /dev/null +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/schedulers/__init__.py @@ -0,0 +1,2 @@ +from .scheduling_ddim_cogvideox import CogVideoXDDIMScheduler +from .scheduling_dpm_cogvideox import CogVideoXDPMScheduler \ No newline at end of file diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/schedulers/scheduling_ddim_cogvideox.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/schedulers/scheduling_ddim_cogvideox.py new file mode 100644 index 0000000000..27c31923fe --- /dev/null +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/schedulers/scheduling_ddim_cogvideox.py @@ -0,0 +1,452 @@ +# Copyright 2024 The CogVideoX team, Tsinghua University & ZhipuAI and The HuggingFace Team. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# DISCLAIMER: This code is strongly influenced by https://github.com/pesser/pytorch_diffusion +# and https://github.com/hojonathanho/diffusion + +import math +from dataclasses import dataclass +from typing import List, Optional, Tuple, Union + +import numpy as np +import torch + +from diffusers.configuration_utils import ConfigMixin, register_to_config +from diffusers.utils import BaseOutput +from .scheduling_utils import KarrasDiffusionSchedulers, SchedulerMixin + + +@dataclass +# Copied from diffusers.schedulers.scheduling_ddpm.DDPMSchedulerOutput with DDPM->DDIM +class DDIMSchedulerOutput(BaseOutput): + """ + Output class for the scheduler's `step` function output. + + Args: + prev_sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)` for images): + Computed sample `(x_{t-1})` of previous timestep. `prev_sample` should be used as next model input in the + denoising loop. + pred_original_sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)` for images): + The predicted denoised sample `(x_{0})` based on the model output from the current timestep. + `pred_original_sample` can be used to preview progress or for guidance. + """ + + prev_sample: torch.Tensor + pred_original_sample: Optional[torch.Tensor] = None + + +# Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar +def betas_for_alpha_bar( + num_diffusion_timesteps, + max_beta=0.999, + alpha_transform_type="cosine", +): + """ + Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of + (1-beta) over time from t = [0,1]. + + Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up + to that part of the diffusion process. + + + Args: + num_diffusion_timesteps (`int`): the number of betas to produce. 
+ max_beta (`float`): the maximum beta to use; use values lower than 1 to + prevent singularities. + alpha_transform_type (`str`, *optional*, default to `cosine`): the type of noise schedule for alpha_bar. + Choose from `cosine` or `exp` + + Returns: + betas (`np.ndarray`): the betas used by the scheduler to step the model outputs + """ + if alpha_transform_type == "cosine": + + def alpha_bar_fn(t): + return math.cos((t + 0.008) / 1.008 * math.pi / 2) ** 2 + + elif alpha_transform_type == "exp": + + def alpha_bar_fn(t): + return math.exp(t * -12.0) + + else: + raise ValueError(f"Unsupported alpha_transform_type: {alpha_transform_type}") + + betas = [] + for i in range(num_diffusion_timesteps): + t1 = i / num_diffusion_timesteps + t2 = (i + 1) / num_diffusion_timesteps + betas.append(min(1 - alpha_bar_fn(t2) / alpha_bar_fn(t1), max_beta)) + return torch.tensor(betas, dtype=torch.float32) + + +def rescale_zero_terminal_snr(alphas_cumprod): + """ + Rescales betas to have zero terminal SNR Based on https://arxiv.org/pdf/2305.08891.pdf (Algorithm 1) + + + Args: + betas (`torch.Tensor`): + the betas that the scheduler is being initialized with. + + Returns: + `torch.Tensor`: rescaled betas with zero terminal SNR + """ + + alphas_bar_sqrt = alphas_cumprod.sqrt() + + # Store old values. + alphas_bar_sqrt_0 = alphas_bar_sqrt[0].clone() + alphas_bar_sqrt_T = alphas_bar_sqrt[-1].clone() + + # Shift so the last timestep is zero. + alphas_bar_sqrt -= alphas_bar_sqrt_T + + # Scale so the first timestep is back to the old value. + alphas_bar_sqrt *= alphas_bar_sqrt_0 / (alphas_bar_sqrt_0 - alphas_bar_sqrt_T) + + # Convert alphas_bar_sqrt to betas + alphas_bar = alphas_bar_sqrt**2 # Revert sqrt + + return alphas_bar + + +class CogVideoXDDIMScheduler(SchedulerMixin, ConfigMixin): + """ + `DDIMScheduler` extends the denoising procedure introduced in denoising diffusion probabilistic models (DDPMs) with + non-Markovian guidance. + + This model inherits from [`SchedulerMixin`] and [`ConfigMixin`]. Check the superclass documentation for the generic + methods the library implements for all schedulers such as loading and saving. + + Args: + num_train_timesteps (`int`, defaults to 1000): + The number of diffusion steps to train the model. + beta_start (`float`, defaults to 0.0001): + The starting `beta` value of inference. + beta_end (`float`, defaults to 0.02): + The final `beta` value. + beta_schedule (`str`, defaults to `"linear"`): + The beta schedule, a mapping from a beta range to a sequence of betas for stepping the model. Choose from + `linear`, `scaled_linear`, or `squaredcos_cap_v2`. + trained_betas (`np.ndarray`, *optional*): + Pass an array of betas directly to the constructor to bypass `beta_start` and `beta_end`. + clip_sample (`bool`, defaults to `True`): + Clip the predicted sample for numerical stability. + clip_sample_range (`float`, defaults to 1.0): + The maximum magnitude for sample clipping. Valid only when `clip_sample=True`. + set_alpha_to_one (`bool`, defaults to `True`): + Each diffusion step uses the alphas product value at that step and at the previous one. For the final step + there is no previous alpha. When this option is `True` the previous alpha product is fixed to `1`, + otherwise it uses the alpha value at step 0. + steps_offset (`int`, defaults to 0): + An offset added to the inference steps, as required by some model families. 
+ prediction_type (`str`, defaults to `epsilon`, *optional*): + Prediction type of the scheduler function; can be `epsilon` (predicts the noise of the diffusion process), + `sample` (directly predicts the noisy sample`) or `v_prediction` (see section 2.4 of [Imagen + Video](https://imagen.research.google/video/paper.pdf) paper). + thresholding (`bool`, defaults to `False`): + Whether to use the "dynamic thresholding" method. This is unsuitable for latent-space diffusion models such + as Stable Diffusion. + dynamic_thresholding_ratio (`float`, defaults to 0.995): + The ratio for the dynamic thresholding method. Valid only when `thresholding=True`. + sample_max_value (`float`, defaults to 1.0): + The threshold value for dynamic thresholding. Valid only when `thresholding=True`. + timestep_spacing (`str`, defaults to `"leading"`): + The way the timesteps should be scaled. Refer to Table 2 of the [Common Diffusion Noise Schedules and + Sample Steps are Flawed](https://huggingface.co/papers/2305.08891) for more information. + rescale_betas_zero_snr (`bool`, defaults to `False`): + Whether to rescale the betas to have zero terminal SNR. This enables the model to generate very bright and + dark samples instead of limiting it to samples with medium brightness. Loosely related to + [`--offset_noise`](https://github.com/huggingface/diffusers/blob/74fd735eb073eb1d774b1ab4154a0876eb82f055/examples/dreambooth/train_dreambooth.py#L506). + """ + + _compatibles = [e.name for e in KarrasDiffusionSchedulers] + order = 1 + + @register_to_config + def __init__( + self, + num_train_timesteps: int = 1000, + beta_start: float = 0.00085, + beta_end: float = 0.0120, + beta_schedule: str = "scaled_linear", + trained_betas: Optional[Union[np.ndarray, List[float]]] = None, + clip_sample: bool = True, + set_alpha_to_one: bool = True, + steps_offset: int = 0, + prediction_type: str = "epsilon", + clip_sample_range: float = 1.0, + sample_max_value: float = 1.0, + timestep_spacing: str = "leading", + rescale_betas_zero_snr: bool = False, + snr_shift_scale: float = 3.0, + ): + if trained_betas is not None: + self.betas = torch.tensor(trained_betas, dtype=torch.float32) + elif beta_schedule == "linear": + self.betas = torch.linspace(beta_start, beta_end, num_train_timesteps, dtype=torch.float32) + elif beta_schedule == "scaled_linear": + # this schedule is very specific to the latent diffusion model. + self.betas = torch.linspace(beta_start**0.5, beta_end**0.5, num_train_timesteps, dtype=torch.float64) ** 2 + elif beta_schedule == "squaredcos_cap_v2": + # Glide cosine schedule + self.betas = betas_for_alpha_bar(num_train_timesteps) + else: + raise NotImplementedError(f"{beta_schedule} is not implemented for {self.__class__}") + + self.alphas = 1.0 - self.betas + self.alphas_cumprod = torch.cumprod(self.alphas, dim=0) + + # Modify: SNR shift following SD3 + self.alphas_cumprod = self.alphas_cumprod / (snr_shift_scale + (1 - snr_shift_scale) * self.alphas_cumprod) + + # Rescale for zero SNR + if rescale_betas_zero_snr: + self.alphas_cumprod = rescale_zero_terminal_snr(self.alphas_cumprod) + + # At every step in ddim, we are looking into the previous alphas_cumprod + # For the final step, there is no previous alphas_cumprod because we are already at 0 + # `set_alpha_to_one` decides whether we set this parameter simply to one or + # whether we use the final alpha of the "non-previous" one. 
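+        # Illustration (derived from the formulas in `step()` below, not part of the computation):
+        # with `set_alpha_to_one=True`, the last denoising step uses alpha_prod_t_prev = 1.0, so
+        #     a_t = ((1 - 1.0) / (1 - alpha_prod_t)) ** 0.5 = 0
+        #     b_t = 1.0 ** 0.5 - alpha_prod_t ** 0.5 * 0 = 1
+        # and the final update reduces to prev_sample = pred_original_sample.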
+ self.final_alpha_cumprod = torch.tensor(1.0) if set_alpha_to_one else self.alphas_cumprod[0] + + # standard deviation of the initial noise distribution + self.init_noise_sigma = 1.0 + + # setable values + self.num_inference_steps = None + self.timesteps = torch.from_numpy(np.arange(0, num_train_timesteps)[::-1].copy().astype(np.int64)) + + def _get_variance(self, timestep, prev_timestep): + alpha_prod_t = self.alphas_cumprod[timestep] + alpha_prod_t_prev = self.alphas_cumprod[prev_timestep] if prev_timestep >= 0 else self.final_alpha_cumprod + beta_prod_t = 1 - alpha_prod_t + beta_prod_t_prev = 1 - alpha_prod_t_prev + + variance = (beta_prod_t_prev / beta_prod_t) * (1 - alpha_prod_t / alpha_prod_t_prev) + + return variance + + def scale_model_input(self, sample: torch.Tensor, timestep: Optional[int] = None) -> torch.Tensor: + """ + Ensures interchangeability with schedulers that need to scale the denoising model input depending on the + current timestep. + + Args: + sample (`torch.Tensor`): + The input sample. + timestep (`int`, *optional*): + The current timestep in the diffusion chain. + + Returns: + `torch.Tensor`: + A scaled input sample. + """ + return sample + + def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.device] = None): + """ + Sets the discrete timesteps used for the diffusion chain (to be run before inference). + + Args: + num_inference_steps (`int`): + The number of diffusion steps used when generating samples with a pre-trained model. + """ + + if num_inference_steps > self.config.num_train_timesteps: + raise ValueError( + f"`num_inference_steps`: {num_inference_steps} cannot be larger than `self.config.train_timesteps`:" + f" {self.config.num_train_timesteps} as the unet model trained with this scheduler can only handle" + f" maximal {self.config.num_train_timesteps} timesteps." + ) + + self.num_inference_steps = num_inference_steps + + # "linspace", "leading", "trailing" corresponds to annotation of Table 2. of https://arxiv.org/abs/2305.08891 + if self.config.timestep_spacing == "linspace": + timesteps = ( + np.linspace(0, self.config.num_train_timesteps - 1, num_inference_steps) + .round()[::-1] + .copy() + .astype(np.int64) + ) + elif self.config.timestep_spacing == "leading": + step_ratio = self.config.num_train_timesteps // self.num_inference_steps + # creates integer timesteps by multiplying by ratio + # casting to int to avoid issues when num_inference_step is power of 3 + timesteps = (np.arange(0, num_inference_steps) * step_ratio).round()[::-1].copy().astype(np.int64) + timesteps += self.config.steps_offset + elif self.config.timestep_spacing == "trailing": + step_ratio = self.config.num_train_timesteps / self.num_inference_steps + # creates integer timesteps by multiplying by ratio + # casting to int to avoid issues when num_inference_step is power of 3 + timesteps = np.round(np.arange(self.config.num_train_timesteps, 0, -step_ratio)).astype(np.int64) + timesteps -= 1 + else: + raise ValueError( + f"{self.config.timestep_spacing} is not supported. Please make sure to choose one of 'leading' or 'trailing'." + ) + + self.timesteps = torch.from_numpy(timesteps).to(device) + + def step( + self, + model_output: torch.Tensor, + timestep: int, + sample: torch.Tensor, + eta: float = 0.0, + use_clipped_model_output: bool = False, + generator=None, + variance_noise: Optional[torch.Tensor] = None, + return_dict: bool = True, + ) -> Union[DDIMSchedulerOutput, Tuple]: + """ + Predict the sample from the previous timestep by reversing the SDE. 
This function propagates the diffusion + process from the learned model outputs (most often the predicted noise). + + Args: + model_output (`torch.Tensor`): + The direct output from learned diffusion model. + timestep (`float`): + The current discrete timestep in the diffusion chain. + sample (`torch.Tensor`): + A current instance of a sample created by the diffusion process. + eta (`float`): + The weight of noise for added noise in diffusion step. + use_clipped_model_output (`bool`, defaults to `False`): + If `True`, computes "corrected" `model_output` from the clipped predicted original sample. Necessary + because predicted original sample is clipped to [-1, 1] when `self.config.clip_sample` is `True`. If no + clipping has happened, "corrected" `model_output` would coincide with the one provided as input and + `use_clipped_model_output` has no effect. + generator (`torch.Generator`, *optional*): + A random number generator. + variance_noise (`torch.Tensor`): + Alternative to generating noise with `generator` by directly providing the noise for the variance + itself. Useful for methods such as [`CycleDiffusion`]. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~schedulers.scheduling_ddim.DDIMSchedulerOutput`] or `tuple`. + + Returns: + [`~schedulers.scheduling_ddim.DDIMSchedulerOutput`] or `tuple`: + If return_dict is `True`, [`~schedulers.scheduling_ddim.DDIMSchedulerOutput`] is returned, otherwise a + tuple is returned where the first element is the sample tensor. + + """ + if self.num_inference_steps is None: + raise ValueError( + "Number of inference steps is 'None', you need to run 'set_timesteps' after creating the scheduler" + ) + + # See formulas (12) and (16) of DDIM paper https://arxiv.org/pdf/2010.02502.pdf + # Ideally, read DDIM paper in-detail understanding + + # Notation ( -> + # - pred_noise_t -> e_theta(x_t, t) + # - pred_original_sample -> f_theta(x_t, t) or x_0 + # - std_dev_t -> sigma_t + # - eta -> η + # - pred_sample_direction -> "direction pointing to x_t" + # - pred_prev_sample -> "x_t-1" + + # 1. get previous step value (=t-1) + prev_timestep = timestep - self.config.num_train_timesteps // self.num_inference_steps + + # 2. compute alphas, betas + alpha_prod_t = self.alphas_cumprod[timestep] + alpha_prod_t_prev = self.alphas_cumprod[prev_timestep] if prev_timestep >= 0 else self.final_alpha_cumprod + + beta_prod_t = 1 - alpha_prod_t + + # 3. 
compute predicted original sample from predicted noise also called + # "predicted x_0" of formula (12) from https://arxiv.org/pdf/2010.02502.pdf + # To make style tests pass, commented out `pred_epsilon` as it is an unused variable + if self.config.prediction_type == "epsilon": + pred_original_sample = (sample - beta_prod_t ** (0.5) * model_output) / alpha_prod_t ** (0.5) + # pred_epsilon = model_output + elif self.config.prediction_type == "sample": + pred_original_sample = model_output + # pred_epsilon = (sample - alpha_prod_t ** (0.5) * pred_original_sample) / beta_prod_t ** (0.5) + elif self.config.prediction_type == "v_prediction": + pred_original_sample = (alpha_prod_t**0.5) * sample - (beta_prod_t**0.5) * model_output + # pred_epsilon = (alpha_prod_t**0.5) * model_output + (beta_prod_t**0.5) * sample + else: + raise ValueError( + f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, `sample`, or" + " `v_prediction`" + ) + + a_t = ((1 - alpha_prod_t_prev) / (1 - alpha_prod_t)) ** 0.5 + b_t = alpha_prod_t_prev**0.5 - alpha_prod_t**0.5 * a_t + + prev_sample = a_t * sample + b_t * pred_original_sample + + if not return_dict: + return ( + prev_sample, + pred_original_sample, + ) + + return DDIMSchedulerOutput(prev_sample=prev_sample, pred_original_sample=pred_original_sample) + + # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler.add_noise + def add_noise( + self, + original_samples: torch.Tensor, + noise: torch.Tensor, + timesteps: torch.IntTensor, + ) -> torch.Tensor: + # Make sure alphas_cumprod and timestep have same device and dtype as original_samples + # Move the self.alphas_cumprod to device to avoid redundant CPU to GPU data movement + # for the subsequent add_noise calls + self.alphas_cumprod = self.alphas_cumprod.to(device=original_samples.device) + alphas_cumprod = self.alphas_cumprod.to(dtype=original_samples.dtype) + timesteps = timesteps.to(original_samples.device) + + sqrt_alpha_prod = alphas_cumprod[timesteps] ** 0.5 + sqrt_alpha_prod = sqrt_alpha_prod.flatten() + while len(sqrt_alpha_prod.shape) < len(original_samples.shape): + sqrt_alpha_prod = sqrt_alpha_prod.unsqueeze(-1) + + sqrt_one_minus_alpha_prod = (1 - alphas_cumprod[timesteps]) ** 0.5 + sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.flatten() + while len(sqrt_one_minus_alpha_prod.shape) < len(original_samples.shape): + sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.unsqueeze(-1) + + noisy_samples = sqrt_alpha_prod * original_samples + sqrt_one_minus_alpha_prod * noise + return noisy_samples + + # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler.get_velocity + def get_velocity(self, sample: torch.Tensor, noise: torch.Tensor, timesteps: torch.IntTensor) -> torch.Tensor: + # Make sure alphas_cumprod and timestep have same device and dtype as sample + self.alphas_cumprod = self.alphas_cumprod.to(device=sample.device) + alphas_cumprod = self.alphas_cumprod.to(dtype=sample.dtype) + timesteps = timesteps.to(sample.device) + + sqrt_alpha_prod = alphas_cumprod[timesteps] ** 0.5 + sqrt_alpha_prod = sqrt_alpha_prod.flatten() + while len(sqrt_alpha_prod.shape) < len(sample.shape): + sqrt_alpha_prod = sqrt_alpha_prod.unsqueeze(-1) + + sqrt_one_minus_alpha_prod = (1 - alphas_cumprod[timesteps]) ** 0.5 + sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.flatten() + while len(sqrt_one_minus_alpha_prod.shape) < len(sample.shape): + sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.unsqueeze(-1) + + velocity = sqrt_alpha_prod * noise - 
sqrt_one_minus_alpha_prod * sample + return velocity + + def __len__(self): + return self.config.num_train_timesteps \ No newline at end of file diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/schedulers/scheduling_dpm_cogvideox.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/schedulers/scheduling_dpm_cogvideox.py new file mode 100644 index 0000000000..4269fff66a --- /dev/null +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/schedulers/scheduling_dpm_cogvideox.py @@ -0,0 +1,489 @@ +# Copyright 2024 The CogVideoX team, Tsinghua University & ZhipuAI and The HuggingFace Team. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# DISCLAIMER: This code is strongly influenced by https://github.com/pesser/pytorch_diffusion +# and https://github.com/hojonathanho/diffusion + +import math +from dataclasses import dataclass +from typing import List, Optional, Tuple, Union + +import numpy as np +import torch + +from diffusers.configuration_utils import ConfigMixin, register_to_config +from diffusers.utils import BaseOutput +from diffusers.utils.torch_utils import randn_tensor +from .scheduling_utils import KarrasDiffusionSchedulers, SchedulerMixin + + +@dataclass +# Copied from diffusers.schedulers.scheduling_ddpm.DDPMSchedulerOutput with DDPM->DDIM +class DDIMSchedulerOutput(BaseOutput): + """ + Output class for the scheduler's `step` function output. + + Args: + prev_sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)` for images): + Computed sample `(x_{t-1})` of previous timestep. `prev_sample` should be used as next model input in the + denoising loop. + pred_original_sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)` for images): + The predicted denoised sample `(x_{0})` based on the model output from the current timestep. + `pred_original_sample` can be used to preview progress or for guidance. + """ + + prev_sample: torch.Tensor + pred_original_sample: Optional[torch.Tensor] = None + + +# Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar +def betas_for_alpha_bar( + num_diffusion_timesteps, + max_beta=0.999, + alpha_transform_type="cosine", +): + """ + Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of + (1-beta) over time from t = [0,1]. + + Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up + to that part of the diffusion process. + + + Args: + num_diffusion_timesteps (`int`): the number of betas to produce. + max_beta (`float`): the maximum beta to use; use values lower than 1 to + prevent singularities. + alpha_transform_type (`str`, *optional*, default to `cosine`): the type of noise schedule for alpha_bar. 
+ Choose from `cosine` or `exp` + + Returns: + betas (`np.ndarray`): the betas used by the scheduler to step the model outputs + """ + if alpha_transform_type == "cosine": + + def alpha_bar_fn(t): + return math.cos((t + 0.008) / 1.008 * math.pi / 2) ** 2 + + elif alpha_transform_type == "exp": + + def alpha_bar_fn(t): + return math.exp(t * -12.0) + + else: + raise ValueError(f"Unsupported alpha_transform_type: {alpha_transform_type}") + + betas = [] + for i in range(num_diffusion_timesteps): + t1 = i / num_diffusion_timesteps + t2 = (i + 1) / num_diffusion_timesteps + betas.append(min(1 - alpha_bar_fn(t2) / alpha_bar_fn(t1), max_beta)) + return torch.tensor(betas, dtype=torch.float32) + + +def rescale_zero_terminal_snr(alphas_cumprod): + """ + Rescales betas to have zero terminal SNR Based on https://arxiv.org/pdf/2305.08891.pdf (Algorithm 1) + + + Args: + betas (`torch.Tensor`): + the betas that the scheduler is being initialized with. + + Returns: + `torch.Tensor`: rescaled betas with zero terminal SNR + """ + + alphas_bar_sqrt = alphas_cumprod.sqrt() + + # Store old values. + alphas_bar_sqrt_0 = alphas_bar_sqrt[0].clone() + alphas_bar_sqrt_T = alphas_bar_sqrt[-1].clone() + + # Shift so the last timestep is zero. + alphas_bar_sqrt -= alphas_bar_sqrt_T + + # Scale so the first timestep is back to the old value. + alphas_bar_sqrt *= alphas_bar_sqrt_0 / (alphas_bar_sqrt_0 - alphas_bar_sqrt_T) + + # Convert alphas_bar_sqrt to betas + alphas_bar = alphas_bar_sqrt**2 # Revert sqrt + + return alphas_bar + + +class CogVideoXDPMScheduler(SchedulerMixin, ConfigMixin): + """ + `DDIMScheduler` extends the denoising procedure introduced in denoising diffusion probabilistic models (DDPMs) with + non-Markovian guidance. + + This model inherits from [`SchedulerMixin`] and [`ConfigMixin`]. Check the superclass documentation for the generic + methods the library implements for all schedulers such as loading and saving. + + Args: + num_train_timesteps (`int`, defaults to 1000): + The number of diffusion steps to train the model. + beta_start (`float`, defaults to 0.0001): + The starting `beta` value of inference. + beta_end (`float`, defaults to 0.02): + The final `beta` value. + beta_schedule (`str`, defaults to `"linear"`): + The beta schedule, a mapping from a beta range to a sequence of betas for stepping the model. Choose from + `linear`, `scaled_linear`, or `squaredcos_cap_v2`. + trained_betas (`np.ndarray`, *optional*): + Pass an array of betas directly to the constructor to bypass `beta_start` and `beta_end`. + clip_sample (`bool`, defaults to `True`): + Clip the predicted sample for numerical stability. + clip_sample_range (`float`, defaults to 1.0): + The maximum magnitude for sample clipping. Valid only when `clip_sample=True`. + set_alpha_to_one (`bool`, defaults to `True`): + Each diffusion step uses the alphas product value at that step and at the previous one. For the final step + there is no previous alpha. When this option is `True` the previous alpha product is fixed to `1`, + otherwise it uses the alpha value at step 0. + steps_offset (`int`, defaults to 0): + An offset added to the inference steps, as required by some model families. + prediction_type (`str`, defaults to `epsilon`, *optional*): + Prediction type of the scheduler function; can be `epsilon` (predicts the noise of the diffusion process), + `sample` (directly predicts the noisy sample`) or `v_prediction` (see section 2.4 of [Imagen + Video](https://imagen.research.google/video/paper.pdf) paper). 
+ thresholding (`bool`, defaults to `False`): + Whether to use the "dynamic thresholding" method. This is unsuitable for latent-space diffusion models such + as Stable Diffusion. + dynamic_thresholding_ratio (`float`, defaults to 0.995): + The ratio for the dynamic thresholding method. Valid only when `thresholding=True`. + sample_max_value (`float`, defaults to 1.0): + The threshold value for dynamic thresholding. Valid only when `thresholding=True`. + timestep_spacing (`str`, defaults to `"leading"`): + The way the timesteps should be scaled. Refer to Table 2 of the [Common Diffusion Noise Schedules and + Sample Steps are Flawed](https://huggingface.co/papers/2305.08891) for more information. + rescale_betas_zero_snr (`bool`, defaults to `False`): + Whether to rescale the betas to have zero terminal SNR. This enables the model to generate very bright and + dark samples instead of limiting it to samples with medium brightness. Loosely related to + [`--offset_noise`](https://github.com/huggingface/diffusers/blob/74fd735eb073eb1d774b1ab4154a0876eb82f055/examples/dreambooth/train_dreambooth.py#L506). + """ + + _compatibles = [e.name for e in KarrasDiffusionSchedulers] + order = 1 + + @register_to_config + def __init__( + self, + num_train_timesteps: int = 1000, + beta_start: float = 0.00085, + beta_end: float = 0.0120, + beta_schedule: str = "scaled_linear", + trained_betas: Optional[Union[np.ndarray, List[float]]] = None, + clip_sample: bool = True, + set_alpha_to_one: bool = True, + steps_offset: int = 0, + prediction_type: str = "epsilon", + clip_sample_range: float = 1.0, + sample_max_value: float = 1.0, + timestep_spacing: str = "leading", + rescale_betas_zero_snr: bool = False, + snr_shift_scale: float = 3.0, + ): + if trained_betas is not None: + self.betas = torch.tensor(trained_betas, dtype=torch.float32) + elif beta_schedule == "linear": + self.betas = torch.linspace(beta_start, beta_end, num_train_timesteps, dtype=torch.float32) + elif beta_schedule == "scaled_linear": + # this schedule is very specific to the latent diffusion model. + self.betas = torch.linspace(beta_start**0.5, beta_end**0.5, num_train_timesteps, dtype=torch.float64) ** 2 + elif beta_schedule == "squaredcos_cap_v2": + # Glide cosine schedule + self.betas = betas_for_alpha_bar(num_train_timesteps) + else: + raise NotImplementedError(f"{beta_schedule} is not implemented for {self.__class__}") + + self.alphas = 1.0 - self.betas + self.alphas_cumprod = torch.cumprod(self.alphas, dim=0) + + # Modify: SNR shift following SD3 + self.alphas_cumprod = self.alphas_cumprod / (snr_shift_scale + (1 - snr_shift_scale) * self.alphas_cumprod) + + # Rescale for zero SNR + if rescale_betas_zero_snr: + self.alphas_cumprod = rescale_zero_terminal_snr(self.alphas_cumprod) + + # At every step in ddim, we are looking into the previous alphas_cumprod + # For the final step, there is no previous alphas_cumprod because we are already at 0 + # `set_alpha_to_one` decides whether we set this parameter simply to one or + # whether we use the final alpha of the "non-previous" one. 
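+        # Note: `final_alpha_cumprod` is only consulted when `prev_timestep < 0` (see the
+        # conditional in `_get_variance` below), i.e. typically only on the final denoising step,
+        # where `alphas_cumprod` has no previous entry left to index.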
+ self.final_alpha_cumprod = torch.tensor(1.0) if set_alpha_to_one else self.alphas_cumprod[0] + + # standard deviation of the initial noise distribution + self.init_noise_sigma = 1.0 + + # setable values + self.num_inference_steps = None + self.timesteps = torch.from_numpy(np.arange(0, num_train_timesteps)[::-1].copy().astype(np.int64)) + + def _get_variance(self, timestep, prev_timestep): + alpha_prod_t = self.alphas_cumprod[timestep] + alpha_prod_t_prev = self.alphas_cumprod[prev_timestep] if prev_timestep >= 0 else self.final_alpha_cumprod + beta_prod_t = 1 - alpha_prod_t + beta_prod_t_prev = 1 - alpha_prod_t_prev + + variance = (beta_prod_t_prev / beta_prod_t) * (1 - alpha_prod_t / alpha_prod_t_prev) + + return variance + + def scale_model_input(self, sample: torch.Tensor, timestep: Optional[int] = None) -> torch.Tensor: + """ + Ensures interchangeability with schedulers that need to scale the denoising model input depending on the + current timestep. + + Args: + sample (`torch.Tensor`): + The input sample. + timestep (`int`, *optional*): + The current timestep in the diffusion chain. + + Returns: + `torch.Tensor`: + A scaled input sample. + """ + return sample + + def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.device] = None): + """ + Sets the discrete timesteps used for the diffusion chain (to be run before inference). + + Args: + num_inference_steps (`int`): + The number of diffusion steps used when generating samples with a pre-trained model. + """ + + if num_inference_steps > self.config.num_train_timesteps: + raise ValueError( + f"`num_inference_steps`: {num_inference_steps} cannot be larger than `self.config.train_timesteps`:" + f" {self.config.num_train_timesteps} as the unet model trained with this scheduler can only handle" + f" maximal {self.config.num_train_timesteps} timesteps." + ) + + self.num_inference_steps = num_inference_steps + + # "linspace", "leading", "trailing" corresponds to annotation of Table 2. of https://arxiv.org/abs/2305.08891 + if self.config.timestep_spacing == "linspace": + timesteps = ( + np.linspace(0, self.config.num_train_timesteps - 1, num_inference_steps) + .round()[::-1] + .copy() + .astype(np.int64) + ) + elif self.config.timestep_spacing == "leading": + step_ratio = self.config.num_train_timesteps // self.num_inference_steps + # creates integer timesteps by multiplying by ratio + # casting to int to avoid issues when num_inference_step is power of 3 + timesteps = (np.arange(0, num_inference_steps) * step_ratio).round()[::-1].copy().astype(np.int64) + timesteps += self.config.steps_offset + elif self.config.timestep_spacing == "trailing": + step_ratio = self.config.num_train_timesteps / self.num_inference_steps + # creates integer timesteps by multiplying by ratio + # casting to int to avoid issues when num_inference_step is power of 3 + timesteps = np.round(np.arange(self.config.num_train_timesteps, 0, -step_ratio)).astype(np.int64) + timesteps -= 1 + else: + raise ValueError( + f"{self.config.timestep_spacing} is not supported. Please make sure to choose one of 'leading' or 'trailing'." 
+            )
+
+        self.timesteps = torch.from_numpy(timesteps).to(device)
+
+    def get_variables(self, alpha_prod_t, alpha_prod_t_prev, alpha_prod_t_back=None):
+        # `lamb` is the half log-SNR, log(sqrt(alpha_bar / (1 - alpha_bar))); `h` is the step size in
+        # lambda-space and `r` the ratio of the previous step size to the current one (second-order term).
+        lamb = ((alpha_prod_t / (1 - alpha_prod_t)) ** 0.5).log()
+        lamb_next = ((alpha_prod_t_prev / (1 - alpha_prod_t_prev)) ** 0.5).log()
+        h = lamb_next - lamb
+
+        if alpha_prod_t_back is not None:
+            lamb_previous = ((alpha_prod_t_back / (1 - alpha_prod_t_back)) ** 0.5).log()
+            h_last = lamb - lamb_previous
+            r = h_last / h
+            return h, r, lamb, lamb_next
+        else:
+            return h, None, lamb, lamb_next
+
+    def get_mult(self, h, r, alpha_prod_t, alpha_prod_t_prev, alpha_prod_t_back):
+        mult1 = ((1 - alpha_prod_t_prev) / (1 - alpha_prod_t)) ** 0.5 * (-h).exp()
+        mult2 = (-2 * h).expm1() * alpha_prod_t_prev**0.5
+
+        if alpha_prod_t_back is not None:
+            mult3 = 1 + 1 / (2 * r)
+            mult4 = 1 / (2 * r)
+            return mult1, mult2, mult3, mult4
+        else:
+            return mult1, mult2
+
+    def step(
+        self,
+        model_output: torch.Tensor,
+        old_pred_original_sample: torch.Tensor,
+        timestep: int,
+        timestep_back: int,
+        sample: torch.Tensor,
+        eta: float = 0.0,
+        use_clipped_model_output: bool = False,
+        generator=None,
+        variance_noise: Optional[torch.Tensor] = None,
+        return_dict: bool = False,
+    ) -> Union[DDIMSchedulerOutput, Tuple]:
+        """
+        Predict the sample from the previous timestep by reversing the SDE. This function propagates the diffusion
+        process from the learned model outputs (most often the predicted noise).
+
+        Args:
+            model_output (`torch.Tensor`):
+                The direct output from the learned diffusion model.
+            old_pred_original_sample (`torch.Tensor`):
+                The predicted original sample (`x_0`) from the preceding sampler iteration, or `None` on the first
+                step. When provided, it enables the second-order (multistep) update.
+            timestep (`int`):
+                The current discrete timestep in the diffusion chain.
+            timestep_back (`int`):
+                The timestep of the preceding sampler iteration, or `None` on the first step.
+            sample (`torch.Tensor`):
+                A current instance of a sample created by the diffusion process.
+            eta (`float`):
+                The weight of noise for added noise in diffusion step.
+            use_clipped_model_output (`bool`, defaults to `False`):
+                If `True`, computes "corrected" `model_output` from the clipped predicted original sample. Necessary
+                because predicted original sample is clipped to [-1, 1] when `self.config.clip_sample` is `True`. If no
+                clipping has happened, "corrected" `model_output` would coincide with the one provided as input and
+                `use_clipped_model_output` has no effect.
+            generator (`torch.Generator`, *optional*):
+                A random number generator.
+            variance_noise (`torch.Tensor`):
+                Alternative to generating noise with `generator` by directly providing the noise for the variance
+                itself. Useful for methods such as [`CycleDiffusion`].
+            return_dict (`bool`, *optional*, defaults to `False`):
+                Whether or not to return a [`~schedulers.scheduling_ddim.DDIMSchedulerOutput`] or `tuple`.
+
+        Returns:
+            [`~schedulers.scheduling_ddim.DDIMSchedulerOutput`] or `tuple`:
+                If return_dict is `True`, [`~schedulers.scheduling_ddim.DDIMSchedulerOutput`] is returned, otherwise a
+                tuple `(prev_sample, pred_original_sample)` is returned.
+        """
+        if self.num_inference_steps is None:
+            raise ValueError(
+                "Number of inference steps is 'None', you need to run 'set_timesteps' after creating the scheduler"
+            )
+
+        # See formulas (12) and (16) of DDIM paper https://arxiv.org/pdf/2010.02502.pdf
+        # Ideally, read the DDIM paper for an in-depth understanding
+
+        # Notation (<variable name> -> <name in paper>)
+        # - pred_noise_t -> e_theta(x_t, t)
+        # - pred_original_sample -> f_theta(x_t, t) or x_0
+        # - std_dev_t -> sigma_t
+        # - eta -> η
+        # - pred_sample_direction -> "direction pointing to x_t"
+        # - pred_prev_sample -> "x_t-1"
+
+        # 1.
get previous step value (=t-1) + prev_timestep = timestep - self.config.num_train_timesteps // self.num_inference_steps + + # 2. compute alphas, betas + alpha_prod_t = self.alphas_cumprod[timestep] + alpha_prod_t_prev = self.alphas_cumprod[prev_timestep] if prev_timestep >= 0 else self.final_alpha_cumprod + alpha_prod_t_back = self.alphas_cumprod[timestep_back] if timestep_back is not None else None + + beta_prod_t = 1 - alpha_prod_t + + # 3. compute predicted original sample from predicted noise also called + # "predicted x_0" of formula (12) from https://arxiv.org/pdf/2010.02502.pdf + # To make style tests pass, commented out `pred_epsilon` as it is an unused variable + if self.config.prediction_type == "epsilon": + pred_original_sample = (sample - beta_prod_t ** (0.5) * model_output) / alpha_prod_t ** (0.5) + # pred_epsilon = model_output + elif self.config.prediction_type == "sample": + pred_original_sample = model_output + # pred_epsilon = (sample - alpha_prod_t ** (0.5) * pred_original_sample) / beta_prod_t ** (0.5) + elif self.config.prediction_type == "v_prediction": + pred_original_sample = (alpha_prod_t**0.5) * sample - (beta_prod_t**0.5) * model_output + # pred_epsilon = (alpha_prod_t**0.5) * model_output + (beta_prod_t**0.5) * sample + else: + raise ValueError( + f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, `sample`, or" + " `v_prediction`" + ) + + h, r, lamb, lamb_next = self.get_variables(alpha_prod_t, alpha_prod_t_prev, alpha_prod_t_back) + mult = list(self.get_mult(h, r, alpha_prod_t, alpha_prod_t_prev, alpha_prod_t_back)) + mult_noise = (1 - alpha_prod_t_prev) ** 0.5 * (1 - (-2 * h).exp()) ** 0.5 + + noise = randn_tensor(sample.shape, generator=generator, device=sample.device, dtype=sample.dtype) + prev_sample = mult[0] * sample - mult[1] * pred_original_sample + mult_noise * noise + + if old_pred_original_sample is None or prev_timestep < 0: + # Save a network evaluation if all noise levels are 0 or on the first step + return prev_sample, pred_original_sample + else: + denoised_d = mult[2] * pred_original_sample - mult[3] * old_pred_original_sample + noise = randn_tensor(sample.shape, generator=generator, device=sample.device, dtype=sample.dtype) + x_advanced = mult[0] * sample - mult[1] * denoised_d + mult_noise * noise + + prev_sample = x_advanced + + if not return_dict: + return (prev_sample, pred_original_sample) + + return DDIMSchedulerOutput(prev_sample=prev_sample, pred_original_sample=pred_original_sample) + + # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler.add_noise + def add_noise( + self, + original_samples: torch.Tensor, + noise: torch.Tensor, + timesteps: torch.IntTensor, + ) -> torch.Tensor: + # Make sure alphas_cumprod and timestep have same device and dtype as original_samples + # Move the self.alphas_cumprod to device to avoid redundant CPU to GPU data movement + # for the subsequent add_noise calls + self.alphas_cumprod = self.alphas_cumprod.to(device=original_samples.device) + alphas_cumprod = self.alphas_cumprod.to(dtype=original_samples.dtype) + timesteps = timesteps.to(original_samples.device) + + sqrt_alpha_prod = alphas_cumprod[timesteps] ** 0.5 + sqrt_alpha_prod = sqrt_alpha_prod.flatten() + while len(sqrt_alpha_prod.shape) < len(original_samples.shape): + sqrt_alpha_prod = sqrt_alpha_prod.unsqueeze(-1) + + sqrt_one_minus_alpha_prod = (1 - alphas_cumprod[timesteps]) ** 0.5 + sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.flatten() + while len(sqrt_one_minus_alpha_prod.shape) < 
len(original_samples.shape): + sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.unsqueeze(-1) + + noisy_samples = sqrt_alpha_prod * original_samples + sqrt_one_minus_alpha_prod * noise + return noisy_samples + + # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler.get_velocity + def get_velocity(self, sample: torch.Tensor, noise: torch.Tensor, timesteps: torch.IntTensor) -> torch.Tensor: + # Make sure alphas_cumprod and timestep have same device and dtype as sample + self.alphas_cumprod = self.alphas_cumprod.to(device=sample.device) + alphas_cumprod = self.alphas_cumprod.to(dtype=sample.dtype) + timesteps = timesteps.to(sample.device) + + sqrt_alpha_prod = alphas_cumprod[timesteps] ** 0.5 + sqrt_alpha_prod = sqrt_alpha_prod.flatten() + while len(sqrt_alpha_prod.shape) < len(sample.shape): + sqrt_alpha_prod = sqrt_alpha_prod.unsqueeze(-1) + + sqrt_one_minus_alpha_prod = (1 - alphas_cumprod[timesteps]) ** 0.5 + sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.flatten() + while len(sqrt_one_minus_alpha_prod.shape) < len(sample.shape): + sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.unsqueeze(-1) + + velocity = sqrt_alpha_prod * noise - sqrt_one_minus_alpha_prod * sample + return velocity + + def __len__(self): + return self.config.num_train_timesteps \ No newline at end of file diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/schedulers/scheduling_utils.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/schedulers/scheduling_utils.py new file mode 100644 index 0000000000..b6418f89dd --- /dev/null +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/schedulers/scheduling_utils.py @@ -0,0 +1,193 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import importlib +import os +from dataclasses import dataclass +from enum import Enum +from typing import Optional, Union + +import torch +from huggingface_hub.utils import validate_hf_hub_args + +from diffusers.utils import BaseOutput, PushToHubMixin + + +SCHEDULER_CONFIG_NAME = "scheduler_config.json" + + +# NOTE: We make this type an enum because it simplifies usage in docs and prevents +# circular imports when used for `_compatibles` within the schedulers module. +# When it's used as a type in pipelines, it really is a Union because the actual +# scheduler instance is passed in. 
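+# In this package the enum is only consumed through its member names: the CogVideoX
+# schedulers set `_compatibles = [e.name for e in KarrasDiffusionSchedulers]`, and
+# `SchedulerMixin._get_compatibles` below resolves those names with `getattr`, silently
+# skipping any class that cannot be found in the top-level package.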
+class KarrasDiffusionSchedulers(Enum): + DDIMScheduler = 1 + DDPMScheduler = 2 + PNDMScheduler = 3 + LMSDiscreteScheduler = 4 + EulerDiscreteScheduler = 5 + HeunDiscreteScheduler = 6 + EulerAncestralDiscreteScheduler = 7 + DPMSolverMultistepScheduler = 8 + DPMSolverSinglestepScheduler = 9 + KDPM2DiscreteScheduler = 10 + KDPM2AncestralDiscreteScheduler = 11 + DEISMultistepScheduler = 12 + UniPCMultistepScheduler = 13 + DPMSolverSDEScheduler = 14 + EDMEulerScheduler = 15 + + +AysSchedules = { + "StableDiffusionTimesteps": [999, 850, 736, 645, 545, 455, 343, 233, 124, 24], + "StableDiffusionSigmas": [14.615, 6.475, 3.861, 2.697, 1.886, 1.396, 0.963, 0.652, 0.399, 0.152, 0.0], + "StableDiffusionXLTimesteps": [999, 845, 730, 587, 443, 310, 193, 116, 53, 13], + "StableDiffusionXLSigmas": [14.615, 6.315, 3.771, 2.181, 1.342, 0.862, 0.555, 0.380, 0.234, 0.113, 0.0], + "StableDiffusionVideoSigmas": [700.00, 54.5, 15.886, 7.977, 4.248, 1.789, 0.981, 0.403, 0.173, 0.034, 0.0], +} + + +@dataclass +class SchedulerOutput(BaseOutput): + """ + Base class for the output of a scheduler's `step` function. + + Args: + prev_sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)` for images): + Computed sample `(x_{t-1})` of previous timestep. `prev_sample` should be used as next model input in the + denoising loop. + """ + + prev_sample: torch.Tensor + + +class SchedulerMixin(PushToHubMixin): + """ + Base class for all schedulers. + + [`SchedulerMixin`] contains common functions shared by all schedulers such as general loading and saving + functionalities. + + [`ConfigMixin`] takes care of storing the configuration attributes (like `num_train_timesteps`) that are passed to + the scheduler's `__init__` function, and the attributes can be accessed by `scheduler.config.num_train_timesteps`. + + Class attributes: + - **_compatibles** (`List[str]`) -- A list of scheduler classes that are compatible with the parent scheduler + class. Use [`~ConfigMixin.from_config`] to load a different compatible scheduler class (should be overridden + by parent class). + """ + + config_name = SCHEDULER_CONFIG_NAME + _compatibles = [] + has_compatibles = True + + @classmethod + @validate_hf_hub_args + def from_pretrained( + cls, + pretrained_model_name_or_path: Optional[Union[str, os.PathLike]] = None, + subfolder: Optional[str] = None, + return_unused_kwargs=False, + **kwargs, + ): + r""" + Instantiate a scheduler from a pre-defined JSON configuration file in a local directory or Hub repository. + + Parameters: + pretrained_model_name_or_path (`str` or `os.PathLike`, *optional*): + Can be either: + + - A string, the *model id* (for example `google/ddpm-celebahq-256`) of a pretrained model hosted on + the Hub. + - A path to a *directory* (for example `./my_model_directory`) containing the scheduler + configuration saved with [`~SchedulerMixin.save_pretrained`]. + subfolder (`str`, *optional*): + The subfolder location of a model file within a larger model repository on the Hub or locally. + return_unused_kwargs (`bool`, *optional*, defaults to `False`): + Whether kwargs that are not consumed by the Python class should be returned or not. + cache_dir (`Union[str, os.PathLike]`, *optional*): + Path to a directory where a downloaded pretrained model configuration is cached if the standard cache + is not used. + force_download (`bool`, *optional*, defaults to `False`): + Whether or not to force the (re-)download of the model weights and configuration files, overriding the + cached versions if they exist. 
+ + proxies (`Dict[str, str]`, *optional*): + A dictionary of proxy servers to use by protocol or endpoint, for example, `{'http': 'foo.bar:3128', + 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request. + output_loading_info(`bool`, *optional*, defaults to `False`): + Whether or not to also return a dictionary containing missing keys, unexpected keys and error messages. + local_files_only(`bool`, *optional*, defaults to `False`): + Whether to only load local model weights and configuration files or not. If set to `True`, the model + won't be downloaded from the Hub. + token (`str` or *bool*, *optional*): + The token to use as HTTP bearer authorization for remote files. If `True`, the token generated from + `diffusers-cli login` (stored in `~/.huggingface`) is used. + revision (`str`, *optional*, defaults to `"main"`): + The specific model version to use. It can be a branch name, a tag name, a commit id, or any identifier + allowed by Git. + + + + To use private or [gated models](https://huggingface.co/docs/hub/models-gated#gated-models), log-in with + `huggingface-cli login`. You can also activate the special + ["offline-mode"](https://huggingface.co/diffusers/installation.html#offline-mode) to use this method in a + firewalled environment. + + + + """ + config, kwargs, commit_hash = cls.load_config( + pretrained_model_name_or_path=pretrained_model_name_or_path, + subfolder=subfolder, + return_unused_kwargs=True, + return_commit_hash=True, + **kwargs, + ) + return cls.from_config(config, return_unused_kwargs=return_unused_kwargs, **kwargs) + + def save_pretrained(self, save_directory: Union[str, os.PathLike], push_to_hub: bool = False, **kwargs): + """ + Save a scheduler configuration object to a directory so that it can be reloaded using the + [`~SchedulerMixin.from_pretrained`] class method. + + Args: + save_directory (`str` or `os.PathLike`): + Directory where the configuration JSON file will be saved (will be created if it does not exist). + push_to_hub (`bool`, *optional*, defaults to `False`): + Whether or not to push your model to the Hugging Face Hub after saving it. You can specify the + repository you want to push to with `repo_id` (will default to the name of `save_directory` in your + namespace). + kwargs (`Dict[str, Any]`, *optional*): + Additional keyword arguments passed along to the [`~utils.PushToHubMixin.push_to_hub`] method. 
+        """
+        self.save_config(save_directory=save_directory, push_to_hub=push_to_hub, **kwargs)
+
+    @property
+    def compatibles(self):
+        """
+        Returns all schedulers that are compatible with this scheduler.
+
+        Returns:
+            `List[SchedulerMixin]`: List of compatible schedulers
+        """
+        return self._get_compatibles()
+
+    @classmethod
+    def _get_compatibles(cls):
+        compatible_classes_str = list(set([cls.__name__] + cls._compatibles))
+        diffusers_library = importlib.import_module(__name__.split(".")[0])
+        compatible_classes = [
+            getattr(diffusers_library, c) for c in compatible_classes_str if hasattr(diffusers_library, c)
+        ]
+        return compatible_classes
\ No newline at end of file
diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/vae/__init__.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/vae/__init__.py
new file mode 100644
index 0000000000..a91642a899
--- /dev/null
+++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/vae/__init__.py
@@ -0,0 +1 @@
+from .autoencoder_kl import AutoencoderKL
\ No newline at end of file
diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/vae/autoencoder_kl.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/vae/autoencoder_kl.py
new file mode 100644
index 0000000000..99ba70c8cd
--- /dev/null
+++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/vae/autoencoder_kl.py
@@ -0,0 +1,571 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import Dict, Optional, Tuple, Union
+
+import torch
+import torch.nn as nn
+
+from diffusers.configuration_utils import ConfigMixin, register_to_config
+from diffusers.loaders import PeftAdapterMixin
+from diffusers.loaders.single_file_model import FromOriginalModelMixin
+from diffusers.utils import deprecate
+from diffusers.utils.accelerate_utils import apply_forward_hook
+from diffusers.models.attention_processor import (
+    ADDED_KV_ATTENTION_PROCESSORS,
+    CROSS_ATTENTION_PROCESSORS,
+    Attention,
+    AttentionProcessor,
+    AttnAddedKVProcessor,
+    AttnProcessor,
+    FusedAttnProcessor2_0,
+)
+from diffusers.models.modeling_outputs import AutoencoderKLOutput
+from diffusers.models.modeling_utils import ModelMixin
+from .vae import Decoder, DecoderOutput, DiagonalGaussianDistribution, Encoder
+
+
+class AutoencoderKL(ModelMixin, ConfigMixin, FromOriginalModelMixin, PeftAdapterMixin):
+    r"""
+    A VAE model with KL loss for encoding images into latents and decoding latent representations into images.
+
+    This model inherits from [`ModelMixin`]. Check the superclass documentation for its generic methods implemented
+    for all models (such as downloading or saving).
+
+    Parameters:
+        in_channels (int, *optional*, defaults to 3): Number of channels in the input image.
+        out_channels (int, *optional*, defaults to 3): Number of channels in the output.
+        down_block_types (`Tuple[str]`, *optional*, defaults to `("DownEncoderBlock2D",)`):
+            Tuple of downsample block types.
+ up_block_types (`Tuple[str]`, *optional*, defaults to `("UpDecoderBlock2D",)`): + Tuple of upsample block types. + block_out_channels (`Tuple[int]`, *optional*, defaults to `(64,)`): + Tuple of block output channels. + act_fn (`str`, *optional*, defaults to `"silu"`): The activation function to use. + latent_channels (`int`, *optional*, defaults to 4): Number of channels in the latent space. + sample_size (`int`, *optional*, defaults to `32`): Sample input size. + scaling_factor (`float`, *optional*, defaults to 0.18215): + The component-wise standard deviation of the trained latent space computed using the first batch of the + training set. This is used to scale the latent space to have unit variance when training the diffusion + model. The latents are scaled with the formula `z = z * scaling_factor` before being passed to the + diffusion model. When decoding, the latents are scaled back to the original scale with the formula: `z = 1 + / scaling_factor * z`. For more details, refer to sections 4.3.2 and D.1 of the [High-Resolution Image + Synthesis with Latent Diffusion Models](https://arxiv.org/abs/2112.10752) paper. + force_upcast (`bool`, *optional*, default to `True`): + If enabled it will force the VAE to run in float32 for high image resolution pipelines, such as SD-XL. VAE + can be fine-tuned / trained to a lower range without loosing too much precision in which case + `force_upcast` can be set to `False` - see: https://huggingface.co/madebyollin/sdxl-vae-fp16-fix + mid_block_add_attention (`bool`, *optional*, default to `True`): + If enabled, the mid_block of the Encoder and Decoder will have attention blocks. If set to false, the + mid_block will only have resnet blocks + """ + + _supports_gradient_checkpointing = True + _no_split_modules = ["BasicTransformerBlock", "ResnetBlock2D"] + + @register_to_config + def __init__( + self, + in_channels: int = 3, + out_channels: int = 3, + down_block_types: Tuple[str] = ("DownEncoderBlock2D",), + up_block_types: Tuple[str] = ("UpDecoderBlock2D",), + block_out_channels: Tuple[int] = (64,), + layers_per_block: int = 1, + act_fn: str = "silu", + latent_channels: int = 4, + norm_num_groups: int = 32, + sample_size: int = 32, + scaling_factor: float = 0.18215, + shift_factor: Optional[float] = None, + latents_mean: Optional[Tuple[float]] = None, + latents_std: Optional[Tuple[float]] = None, + force_upcast: float = True, + use_quant_conv: bool = True, + use_post_quant_conv: bool = True, + mid_block_add_attention: bool = True, + ): + super().__init__() + + # pass init params to Encoder + self.encoder = Encoder( + in_channels=in_channels, + out_channels=latent_channels, + down_block_types=down_block_types, + block_out_channels=block_out_channels, + layers_per_block=layers_per_block, + act_fn=act_fn, + norm_num_groups=norm_num_groups, + double_z=True, + mid_block_add_attention=mid_block_add_attention, + ) + + # pass init params to Decoder + self.decoder = Decoder( + in_channels=latent_channels, + out_channels=out_channels, + up_block_types=up_block_types, + block_out_channels=block_out_channels, + layers_per_block=layers_per_block, + norm_num_groups=norm_num_groups, + act_fn=act_fn, + mid_block_add_attention=mid_block_add_attention, + ) + + self.quant_conv = nn.Conv2d(2 * latent_channels, 2 * latent_channels, 1) if use_quant_conv else None + self.post_quant_conv = nn.Conv2d(latent_channels, latent_channels, 1) if use_post_quant_conv else None + + self.use_slicing = False + self.use_tiling = False + + # only relevant if vae tiling is enabled + 
self.tile_sample_min_size = self.config.sample_size + sample_size = ( + self.config.sample_size[0] + if isinstance(self.config.sample_size, (list, tuple)) + else self.config.sample_size + ) + self.tile_latent_min_size = int(sample_size / (2 ** (len(self.config.block_out_channels) - 1))) + self.tile_overlap_factor = 0.25 + + def _set_gradient_checkpointing(self, module, value=False): + if isinstance(module, (Encoder, Decoder)): + module.gradient_checkpointing = value + + def enable_tiling(self, use_tiling: bool = True): + r""" + Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to + compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow + processing larger images. + """ + self.use_tiling = use_tiling + + def disable_tiling(self): + r""" + Disable tiled VAE decoding. If `enable_tiling` was previously enabled, this method will go back to computing + decoding in one step. + """ + self.enable_tiling(False) + + def enable_slicing(self): + r""" + Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to + compute decoding in several steps. This is useful to save some memory and allow larger batch sizes. + """ + self.use_slicing = True + + def disable_slicing(self): + r""" + Disable sliced VAE decoding. If `enable_slicing` was previously enabled, this method will go back to computing + decoding in one step. + """ + self.use_slicing = False + + @property + # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.attn_processors + def attn_processors(self) -> Dict[str, AttentionProcessor]: + r""" + Returns: + `dict` of attention processors: A dictionary containing all attention processors used in the model with + indexed by its weight name. + """ + # set recursively + processors = {} + + def fn_recursive_add_processors(name: str, module: torch.nn.Module, processors: Dict[str, AttentionProcessor]): + if hasattr(module, "get_processor"): + processors[f"{name}.processor"] = module.get_processor() + + for sub_name, child in module.named_children(): + fn_recursive_add_processors(f"{name}.{sub_name}", child, processors) + + return processors + + for name, module in self.named_children(): + fn_recursive_add_processors(name, module, processors) + + return processors + + # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.set_attn_processor + def set_attn_processor(self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]]): + r""" + Sets the attention processor to use to compute attention. + + Parameters: + processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`): + The instantiated processor class or a dictionary of processor classes that will be set as the processor + for **all** `Attention` layers. + + If `processor` is a dict, the key needs to define the path to the corresponding cross attention + processor. This is strongly recommended when setting trainable attention processors. + + """ + count = len(self.attn_processors.keys()) + + if isinstance(processor, dict) and len(processor) != count: + raise ValueError( + f"A dict of processors was passed, but the number of processors {len(processor)} does not match the" + f" number of attention layers: {count}. Please make sure to pass {count} processor classes." 
+ ) + + def fn_recursive_attn_processor(name: str, module: torch.nn.Module, processor): + if hasattr(module, "set_processor"): + if not isinstance(processor, dict): + module.set_processor(processor) + else: + module.set_processor(processor.pop(f"{name}.processor")) + + for sub_name, child in module.named_children(): + fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor) + + for name, module in self.named_children(): + fn_recursive_attn_processor(name, module, processor) + + # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.set_default_attn_processor + def set_default_attn_processor(self): + """ + Disables custom attention processors and sets the default attention implementation. + """ + if all(proc.__class__ in ADDED_KV_ATTENTION_PROCESSORS for proc in self.attn_processors.values()): + processor = AttnAddedKVProcessor() + elif all(proc.__class__ in CROSS_ATTENTION_PROCESSORS for proc in self.attn_processors.values()): + processor = AttnProcessor() + else: + raise ValueError( + f"Cannot call `set_default_attn_processor` when attention processors are of type {next(iter(self.attn_processors.values()))}" + ) + + self.set_attn_processor(processor) + + def _encode(self, x: torch.Tensor) -> torch.Tensor: + batch_size, num_channels, height, width = x.shape + + if self.use_tiling and (width > self.tile_sample_min_size or height > self.tile_sample_min_size): + return self._tiled_encode(x) + + enc = self.encoder(x) + if self.quant_conv is not None: + enc = self.quant_conv(enc) + + return enc + + @apply_forward_hook + def encode( + self, x: torch.Tensor, return_dict: bool = True + ) -> Union[AutoencoderKLOutput, Tuple[DiagonalGaussianDistribution]]: + """ + Encode a batch of images into latents. + + Args: + x (`torch.Tensor`): Input batch of images. + return_dict (`bool`, *optional*, defaults to `True`): + Whether to return a [`~models.autoencoder_kl.AutoencoderKLOutput`] instead of a plain tuple. + + Returns: + The latent representations of the encoded images. If `return_dict` is True, a + [`~models.autoencoder_kl.AutoencoderKLOutput`] is returned, otherwise a plain `tuple` is returned. + """ + if self.use_slicing and x.shape[0] > 1: + encoded_slices = [self._encode(x_slice) for x_slice in x.split(1)] + h = torch.cat(encoded_slices) + else: + h = self._encode(x) + + posterior = DiagonalGaussianDistribution(h) + + if not return_dict: + return (posterior,) + + return AutoencoderKLOutput(latent_dist=posterior) + + def _decode(self, z: torch.Tensor, return_dict: bool = True) -> Union[DecoderOutput, torch.Tensor]: + if self.use_tiling and (z.shape[-1] > self.tile_latent_min_size or z.shape[-2] > self.tile_latent_min_size): + return self.tiled_decode(z, return_dict=return_dict) + + if self.post_quant_conv is not None: + z = self.post_quant_conv(z) + + dec = self.decoder(z) + + if not return_dict: + return (dec,) + + return DecoderOutput(sample=dec) + + @apply_forward_hook + def decode( + self, z: torch.FloatTensor, return_dict: bool = True, generator=None + ) -> Union[DecoderOutput, torch.FloatTensor]: + """ + Decode a batch of images. + + Args: + z (`torch.Tensor`): Input batch of latent vectors. + return_dict (`bool`, *optional*, defaults to `True`): + Whether to return a [`~models.vae.DecoderOutput`] instead of a plain tuple. + + Returns: + [`~models.vae.DecoderOutput`] or `tuple`: + If return_dict is True, a [`~models.vae.DecoderOutput`] is returned, otherwise a plain `tuple` is + returned. 
+ + """ + if self.use_slicing and z.shape[0] > 1: + decoded_slices = [self._decode(z_slice).sample for z_slice in z.split(1)] + decoded = torch.cat(decoded_slices) + else: + decoded = self._decode(z).sample + + if not return_dict: + return (decoded,) + + return DecoderOutput(sample=decoded) + + def blend_v(self, a: torch.Tensor, b: torch.Tensor, blend_extent: int) -> torch.Tensor: + blend_extent = min(a.shape[2], b.shape[2], blend_extent) + for y in range(blend_extent): + b[:, :, y, :] = a[:, :, -blend_extent + y, :] * (1 - y / blend_extent) + b[:, :, y, :] * (y / blend_extent) + return b + + def blend_h(self, a: torch.Tensor, b: torch.Tensor, blend_extent: int) -> torch.Tensor: + blend_extent = min(a.shape[3], b.shape[3], blend_extent) + for x in range(blend_extent): + b[:, :, :, x] = a[:, :, :, -blend_extent + x] * (1 - x / blend_extent) + b[:, :, :, x] * (x / blend_extent) + return b + + def _tiled_encode(self, x: torch.Tensor) -> torch.Tensor: + r"""Encode a batch of images using a tiled encoder. + + When this option is enabled, the VAE will split the input tensor into tiles to compute encoding in several + steps. This is useful to keep memory use constant regardless of image size. The end result of tiled encoding is + different from non-tiled encoding because each tile uses a different encoder. To avoid tiling artifacts, the + tiles overlap and are blended together to form a smooth output. You may still see tile-sized changes in the + output, but they should be much less noticeable. + + Args: + x (`torch.Tensor`): Input batch of images. + + Returns: + `torch.Tensor`: + The latent representation of the encoded videos. + """ + + overlap_size = int(self.tile_sample_min_size * (1 - self.tile_overlap_factor)) + blend_extent = int(self.tile_latent_min_size * self.tile_overlap_factor) + row_limit = self.tile_latent_min_size - blend_extent + + # Split the image into 512x512 tiles and encode them separately. + rows = [] + for i in range(0, x.shape[2], overlap_size): + row = [] + for j in range(0, x.shape[3], overlap_size): + tile = x[:, :, i : i + self.tile_sample_min_size, j : j + self.tile_sample_min_size] + tile = self.encoder(tile) + if self.config.use_quant_conv: + tile = self.quant_conv(tile) + row.append(tile) + rows.append(row) + result_rows = [] + for i, row in enumerate(rows): + result_row = [] + for j, tile in enumerate(row): + # blend the above tile and the left tile + # to the current tile and add the current tile to the result row + if i > 0: + tile = self.blend_v(rows[i - 1][j], tile, blend_extent) + if j > 0: + tile = self.blend_h(row[j - 1], tile, blend_extent) + result_row.append(tile[:, :, :row_limit, :row_limit]) + result_rows.append(torch.cat(result_row, dim=3)) + + enc = torch.cat(result_rows, dim=2) + return enc + + def tiled_encode(self, x: torch.Tensor, return_dict: bool = True) -> AutoencoderKLOutput: + r"""Encode a batch of images using a tiled encoder. + + When this option is enabled, the VAE will split the input tensor into tiles to compute encoding in several + steps. This is useful to keep memory use constant regardless of image size. The end result of tiled encoding is + different from non-tiled encoding because each tile uses a different encoder. To avoid tiling artifacts, the + tiles overlap and are blended together to form a smooth output. You may still see tile-sized changes in the + output, but they should be much less noticeable. + + Args: + x (`torch.Tensor`): Input batch of images. 
+ return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~models.autoencoder_kl.AutoencoderKLOutput`] instead of a plain tuple. + + Returns: + [`~models.autoencoder_kl.AutoencoderKLOutput`] or `tuple`: + If return_dict is True, a [`~models.autoencoder_kl.AutoencoderKLOutput`] is returned, otherwise a plain + `tuple` is returned. + """ + deprecation_message = ( + "The tiled_encode implementation supporting the `return_dict` parameter is deprecated. In the future, the " + "implementation of this method will be replaced with that of `_tiled_encode` and you will no longer be able " + "to pass `return_dict`. You will also have to create a `DiagonalGaussianDistribution()` from the returned value." + ) + deprecate("tiled_encode", "1.0.0", deprecation_message, standard_warn=False) + + overlap_size = int(self.tile_sample_min_size * (1 - self.tile_overlap_factor)) + blend_extent = int(self.tile_latent_min_size * self.tile_overlap_factor) + row_limit = self.tile_latent_min_size - blend_extent + + # Split the image into 512x512 tiles and encode them separately. + rows = [] + for i in range(0, x.shape[2], overlap_size): + row = [] + for j in range(0, x.shape[3], overlap_size): + tile = x[:, :, i : i + self.tile_sample_min_size, j : j + self.tile_sample_min_size] + tile = self.encoder(tile) + if self.config.use_quant_conv: + tile = self.quant_conv(tile) + row.append(tile) + rows.append(row) + result_rows = [] + for i, row in enumerate(rows): + result_row = [] + for j, tile in enumerate(row): + # blend the above tile and the left tile + # to the current tile and add the current tile to the result row + if i > 0: + tile = self.blend_v(rows[i - 1][j], tile, blend_extent) + if j > 0: + tile = self.blend_h(row[j - 1], tile, blend_extent) + result_row.append(tile[:, :, :row_limit, :row_limit]) + result_rows.append(torch.cat(result_row, dim=3)) + + moments = torch.cat(result_rows, dim=2) + posterior = DiagonalGaussianDistribution(moments) + + if not return_dict: + return (posterior,) + + return AutoencoderKLOutput(latent_dist=posterior) + + def tiled_decode(self, z: torch.Tensor, return_dict: bool = True) -> Union[DecoderOutput, torch.Tensor]: + r""" + Decode a batch of images using a tiled decoder. + + Args: + z (`torch.Tensor`): Input batch of latent vectors. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~models.vae.DecoderOutput`] instead of a plain tuple. + + Returns: + [`~models.vae.DecoderOutput`] or `tuple`: + If return_dict is True, a [`~models.vae.DecoderOutput`] is returned, otherwise a plain `tuple` is + returned. + """ + overlap_size = int(self.tile_latent_min_size * (1 - self.tile_overlap_factor)) + blend_extent = int(self.tile_sample_min_size * self.tile_overlap_factor) + row_limit = self.tile_sample_min_size - blend_extent + + # Split z into overlapping 64x64 tiles and decode them separately. + # The tiles have an overlap to avoid seams between tiles. 
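+        # Each decoded tile is then cross-faded with its top and left neighbours via `blend_v` /
+        # `blend_h` over `blend_extent` output pixels and cropped to `row_limit` before the rows
+        # are concatenated, so tile borders do not show up as hard seams.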
+ rows = [] + for i in range(0, z.shape[2], overlap_size): + row = [] + for j in range(0, z.shape[3], overlap_size): + tile = z[:, :, i : i + self.tile_latent_min_size, j : j + self.tile_latent_min_size] + if self.config.use_post_quant_conv: + tile = self.post_quant_conv(tile) + decoded = self.decoder(tile) + row.append(decoded) + rows.append(row) + result_rows = [] + for i, row in enumerate(rows): + result_row = [] + for j, tile in enumerate(row): + # blend the above tile and the left tile + # to the current tile and add the current tile to the result row + if i > 0: + tile = self.blend_v(rows[i - 1][j], tile, blend_extent) + if j > 0: + tile = self.blend_h(row[j - 1], tile, blend_extent) + result_row.append(tile[:, :, :row_limit, :row_limit]) + result_rows.append(torch.cat(result_row, dim=3)) + + dec = torch.cat(result_rows, dim=2) + if not return_dict: + return (dec,) + + return DecoderOutput(sample=dec) + + def forward( + self, + sample: torch.Tensor, + sample_posterior: bool = False, + return_dict: bool = True, + generator: Optional[torch.Generator] = None, + ) -> Union[DecoderOutput, torch.Tensor]: + r""" + Args: + sample (`torch.Tensor`): Input sample. + sample_posterior (`bool`, *optional*, defaults to `False`): + Whether to sample from the posterior. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`DecoderOutput`] instead of a plain tuple. + """ + x = sample + posterior = self.encode(x).latent_dist + if sample_posterior: + z = posterior.sample(generator=generator) + else: + z = posterior.mode() + dec = self.decode(z).sample + + if not return_dict: + return (dec,) + + return DecoderOutput(sample=dec) + + # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.fuse_qkv_projections + def fuse_qkv_projections(self): + """ + Enables fused QKV projections. For self-attention modules, all projection matrices (i.e., query, key, value) + are fused. For cross-attention modules, key and value projection matrices are fused. + + + + This API is 🧪 experimental. + + + """ + self.original_attn_processors = None + + for _, attn_processor in self.attn_processors.items(): + if "Added" in str(attn_processor.__class__.__name__): + raise ValueError("`fuse_qkv_projections()` is not supported for models having added KV projections.") + + self.original_attn_processors = self.attn_processors + + for module in self.modules(): + if isinstance(module, Attention): + module.fuse_projections(fuse=True) + + self.set_attn_processor(FusedAttnProcessor2_0()) + + # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.unfuse_qkv_projections + def unfuse_qkv_projections(self): + """Disables the fused QKV projection if enabled. + + + + This API is 🧪 experimental. + + + + """ + if self.original_attn_processors is not None: + self.set_attn_processor(self.original_attn_processors) \ No newline at end of file diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/vae/vae.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/vae/vae.py new file mode 100644 index 0000000000..006ed75f1f --- /dev/null +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/vae/vae.py @@ -0,0 +1,995 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from dataclasses import dataclass +from typing import Optional, Tuple + +import numpy as np +import torch +import torch.nn as nn + +from diffusers.utils import BaseOutput, is_torch_version +from diffusers.utils.torch_utils import randn_tensor +from diffusers.models.activations import get_activation +from diffusers.models.attention_processor import SpatialNorm +from diffusers.models.unets.unet_2d_blocks import ( + AutoencoderTinyBlock, + UNetMidBlock2D, + get_down_block, + get_up_block, +) + + +@dataclass +class EncoderOutput(BaseOutput): + r""" + Output of encoding method. + + Args: + latent (`torch.Tensor` of shape `(batch_size, num_channels, latent_height, latent_width)`): + The encoded latent. + """ + + latent: torch.Tensor + + +@dataclass +class DecoderOutput(BaseOutput): + r""" + Output of decoding method. + + Args: + sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)`): + The decoded output sample from the last layer of the model. + """ + + sample: torch.Tensor + commit_loss: Optional[torch.FloatTensor] = None + + +class Encoder(nn.Module): + r""" + The `Encoder` layer of a variational autoencoder that encodes its input into a latent representation. + + Args: + in_channels (`int`, *optional*, defaults to 3): + The number of input channels. + out_channels (`int`, *optional*, defaults to 3): + The number of output channels. + down_block_types (`Tuple[str, ...]`, *optional*, defaults to `("DownEncoderBlock2D",)`): + The types of down blocks to use. See `~diffusers.models.unet_2d_blocks.get_down_block` for available + options. + block_out_channels (`Tuple[int, ...]`, *optional*, defaults to `(64,)`): + The number of output channels for each block. + layers_per_block (`int`, *optional*, defaults to 2): + The number of layers per block. + norm_num_groups (`int`, *optional*, defaults to 32): + The number of groups for normalization. + act_fn (`str`, *optional*, defaults to `"silu"`): + The activation function to use. See `~diffusers.models.activations.get_activation` for available options. + double_z (`bool`, *optional*, defaults to `True`): + Whether to double the number of output channels for the last block. + """ + + def __init__( + self, + in_channels: int = 3, + out_channels: int = 3, + down_block_types: Tuple[str, ...] = ("DownEncoderBlock2D",), + block_out_channels: Tuple[int, ...] 
= (64,), + layers_per_block: int = 2, + norm_num_groups: int = 32, + act_fn: str = "silu", + double_z: bool = True, + mid_block_add_attention=True, + ): + super().__init__() + self.layers_per_block = layers_per_block + + self.conv_in = nn.Conv2d( + in_channels, + block_out_channels[0], + kernel_size=3, + stride=1, + padding=1, + ) + + self.down_blocks = nn.ModuleList([]) + + # down + output_channel = block_out_channels[0] + for i, down_block_type in enumerate(down_block_types): + input_channel = output_channel + output_channel = block_out_channels[i] + is_final_block = i == len(block_out_channels) - 1 + + down_block = get_down_block( + down_block_type, + num_layers=self.layers_per_block, + in_channels=input_channel, + out_channels=output_channel, + add_downsample=not is_final_block, + resnet_eps=1e-6, + downsample_padding=0, + resnet_act_fn=act_fn, + resnet_groups=norm_num_groups, + attention_head_dim=output_channel, + temb_channels=None, + ) + self.down_blocks.append(down_block) + + # mid + self.mid_block = UNetMidBlock2D( + in_channels=block_out_channels[-1], + resnet_eps=1e-6, + resnet_act_fn=act_fn, + output_scale_factor=1, + resnet_time_scale_shift="default", + attention_head_dim=block_out_channels[-1], + resnet_groups=norm_num_groups, + temb_channels=None, + add_attention=mid_block_add_attention, + ) + + # out + self.conv_norm_out = nn.GroupNorm(num_channels=block_out_channels[-1], num_groups=norm_num_groups, eps=1e-6) + self.conv_act = nn.SiLU() + + conv_out_channels = 2 * out_channels if double_z else out_channels + self.conv_out = nn.Conv2d(block_out_channels[-1], conv_out_channels, 3, padding=1) + + self.gradient_checkpointing = False + + def forward(self, sample: torch.Tensor) -> torch.Tensor: + r"""The forward method of the `Encoder` class.""" + + sample = self.conv_in(sample) + + if torch.is_grad_enabled() and self.gradient_checkpointing: + + def create_custom_forward(module): + def custom_forward(*inputs): + return module(*inputs) + + return custom_forward + + # down + if is_torch_version(">=", "1.11.0"): + for down_block in self.down_blocks: + sample = torch.utils.checkpoint.checkpoint( + create_custom_forward(down_block), sample, use_reentrant=False + ) + # middle + sample = torch.utils.checkpoint.checkpoint( + create_custom_forward(self.mid_block), sample, use_reentrant=False + ) + else: + for down_block in self.down_blocks: + sample = torch.utils.checkpoint.checkpoint(create_custom_forward(down_block), sample) + # middle + sample = torch.utils.checkpoint.checkpoint(create_custom_forward(self.mid_block), sample) + + else: + # down + for down_block in self.down_blocks: + sample = down_block(sample) + + # middle + sample = self.mid_block(sample) + + # post-process + sample = self.conv_norm_out(sample) + sample = self.conv_act(sample) + sample = self.conv_out(sample) + + return sample + + +class Decoder(nn.Module): + r""" + The `Decoder` layer of a variational autoencoder that decodes its latent representation into an output sample. + + Args: + in_channels (`int`, *optional*, defaults to 3): + The number of input channels. + out_channels (`int`, *optional*, defaults to 3): + The number of output channels. + up_block_types (`Tuple[str, ...]`, *optional*, defaults to `("UpDecoderBlock2D",)`): + The types of up blocks to use. See `~diffusers.models.unet_2d_blocks.get_up_block` for available options. + block_out_channels (`Tuple[int, ...]`, *optional*, defaults to `(64,)`): + The number of output channels for each block. 
+ layers_per_block (`int`, *optional*, defaults to 2): + The number of layers per block. + norm_num_groups (`int`, *optional*, defaults to 32): + The number of groups for normalization. + act_fn (`str`, *optional*, defaults to `"silu"`): + The activation function to use. See `~diffusers.models.activations.get_activation` for available options. + norm_type (`str`, *optional*, defaults to `"group"`): + The normalization type to use. Can be either `"group"` or `"spatial"`. + """ + + def __init__( + self, + in_channels: int = 3, + out_channels: int = 3, + up_block_types: Tuple[str, ...] = ("UpDecoderBlock2D",), + block_out_channels: Tuple[int, ...] = (64,), + layers_per_block: int = 2, + norm_num_groups: int = 32, + act_fn: str = "silu", + norm_type: str = "group", # group, spatial + mid_block_add_attention=True, + ): + super().__init__() + self.layers_per_block = layers_per_block + + self.conv_in = nn.Conv2d( + in_channels, + block_out_channels[-1], + kernel_size=3, + stride=1, + padding=1, + ) + + self.up_blocks = nn.ModuleList([]) + + temb_channels = in_channels if norm_type == "spatial" else None + + # mid + self.mid_block = UNetMidBlock2D( + in_channels=block_out_channels[-1], + resnet_eps=1e-6, + resnet_act_fn=act_fn, + output_scale_factor=1, + resnet_time_scale_shift="default" if norm_type == "group" else norm_type, + attention_head_dim=block_out_channels[-1], + resnet_groups=norm_num_groups, + temb_channels=temb_channels, + add_attention=mid_block_add_attention, + ) + + # up + reversed_block_out_channels = list(reversed(block_out_channels)) + output_channel = reversed_block_out_channels[0] + for i, up_block_type in enumerate(up_block_types): + prev_output_channel = output_channel + output_channel = reversed_block_out_channels[i] + + is_final_block = i == len(block_out_channels) - 1 + + up_block = get_up_block( + up_block_type, + num_layers=self.layers_per_block + 1, + in_channels=prev_output_channel, + out_channels=output_channel, + prev_output_channel=None, + add_upsample=not is_final_block, + resnet_eps=1e-6, + resnet_act_fn=act_fn, + resnet_groups=norm_num_groups, + attention_head_dim=output_channel, + temb_channels=temb_channels, + resnet_time_scale_shift=norm_type, + ) + self.up_blocks.append(up_block) + prev_output_channel = output_channel + + # out + if norm_type == "spatial": + self.conv_norm_out = SpatialNorm(block_out_channels[0], temb_channels) + else: + self.conv_norm_out = nn.GroupNorm(num_channels=block_out_channels[0], num_groups=norm_num_groups, eps=1e-6) + self.conv_act = nn.SiLU() + self.conv_out = nn.Conv2d(block_out_channels[0], out_channels, 3, padding=1) + + self.gradient_checkpointing = False + + def forward( + self, + sample: torch.Tensor, + latent_embeds: Optional[torch.Tensor] = None, + ) -> torch.Tensor: + r"""The forward method of the `Decoder` class.""" + + sample = self.conv_in(sample) + + upscale_dtype = next(iter(self.up_blocks.parameters())).dtype + if torch.is_grad_enabled() and self.gradient_checkpointing: + + def create_custom_forward(module): + def custom_forward(*inputs): + return module(*inputs) + + return custom_forward + + if is_torch_version(">=", "1.11.0"): + # middle + sample = torch.utils.checkpoint.checkpoint( + create_custom_forward(self.mid_block), + sample, + latent_embeds, + use_reentrant=False, + ) + sample = sample.to(upscale_dtype) + + # up + for up_block in self.up_blocks: + sample = torch.utils.checkpoint.checkpoint( + create_custom_forward(up_block), + sample, + latent_embeds, + use_reentrant=False, + ) + else: + # middle + sample 
= torch.utils.checkpoint.checkpoint( + create_custom_forward(self.mid_block), sample, latent_embeds + ) + sample = sample.to(upscale_dtype) + + # up + for up_block in self.up_blocks: + sample = torch.utils.checkpoint.checkpoint(create_custom_forward(up_block), sample, latent_embeds) + else: + # middle + sample = self.mid_block(sample, latent_embeds) + sample = sample.to(upscale_dtype) + + # up + for up_block in self.up_blocks: + sample = up_block(sample, latent_embeds) + + # post-process + if latent_embeds is None: + sample = self.conv_norm_out(sample) + else: + sample = self.conv_norm_out(sample, latent_embeds) + sample = self.conv_act(sample) + sample = self.conv_out(sample) + + return sample + + +class UpSample(nn.Module): + r""" + The `UpSample` layer of a variational autoencoder that upsamples its input. + + Args: + in_channels (`int`, *optional*, defaults to 3): + The number of input channels. + out_channels (`int`, *optional*, defaults to 3): + The number of output channels. + """ + + def __init__( + self, + in_channels: int, + out_channels: int, + ) -> None: + super().__init__() + self.in_channels = in_channels + self.out_channels = out_channels + self.deconv = nn.ConvTranspose2d(in_channels, out_channels, kernel_size=4, stride=2, padding=1) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + r"""The forward method of the `UpSample` class.""" + x = torch.relu(x) + x = self.deconv(x) + return x + + +class MaskConditionEncoder(nn.Module): + """ + used in AsymmetricAutoencoderKL + """ + + def __init__( + self, + in_ch: int, + out_ch: int = 192, + res_ch: int = 768, + stride: int = 16, + ) -> None: + super().__init__() + + channels = [] + while stride > 1: + stride = stride // 2 + in_ch_ = out_ch * 2 + if out_ch > res_ch: + out_ch = res_ch + if stride == 1: + in_ch_ = res_ch + channels.append((in_ch_, out_ch)) + out_ch *= 2 + + out_channels = [] + for _in_ch, _out_ch in channels: + out_channels.append(_out_ch) + out_channels.append(channels[-1][0]) + + layers = [] + in_ch_ = in_ch + for l in range(len(out_channels)): + out_ch_ = out_channels[l] + if l == 0 or l == 1: + layers.append(nn.Conv2d(in_ch_, out_ch_, kernel_size=3, stride=1, padding=1)) + else: + layers.append(nn.Conv2d(in_ch_, out_ch_, kernel_size=4, stride=2, padding=1)) + in_ch_ = out_ch_ + + self.layers = nn.Sequential(*layers) + + def forward(self, x: torch.Tensor, mask=None) -> torch.Tensor: + r"""The forward method of the `MaskConditionEncoder` class.""" + out = {} + for l in range(len(self.layers)): + layer = self.layers[l] + x = layer(x) + out[str(tuple(x.shape))] = x + x = torch.relu(x) + return out + + +class MaskConditionDecoder(nn.Module): + r"""The `MaskConditionDecoder` should be used in combination with [`AsymmetricAutoencoderKL`] to enhance the model's + decoder with a conditioner on the mask and masked image. + + Args: + in_channels (`int`, *optional*, defaults to 3): + The number of input channels. + out_channels (`int`, *optional*, defaults to 3): + The number of output channels. + up_block_types (`Tuple[str, ...]`, *optional*, defaults to `("UpDecoderBlock2D",)`): + The types of up blocks to use. See `~diffusers.models.unet_2d_blocks.get_up_block` for available options. + block_out_channels (`Tuple[int, ...]`, *optional*, defaults to `(64,)`): + The number of output channels for each block. + layers_per_block (`int`, *optional*, defaults to 2): + The number of layers per block. + norm_num_groups (`int`, *optional*, defaults to 32): + The number of groups for normalization. 
+ act_fn (`str`, *optional*, defaults to `"silu"`): + The activation function to use. See `~diffusers.models.activations.get_activation` for available options. + norm_type (`str`, *optional*, defaults to `"group"`): + The normalization type to use. Can be either `"group"` or `"spatial"`. + """ + + def __init__( + self, + in_channels: int = 3, + out_channels: int = 3, + up_block_types: Tuple[str, ...] = ("UpDecoderBlock2D",), + block_out_channels: Tuple[int, ...] = (64,), + layers_per_block: int = 2, + norm_num_groups: int = 32, + act_fn: str = "silu", + norm_type: str = "group", # group, spatial + ): + super().__init__() + self.layers_per_block = layers_per_block + + self.conv_in = nn.Conv2d( + in_channels, + block_out_channels[-1], + kernel_size=3, + stride=1, + padding=1, + ) + + self.up_blocks = nn.ModuleList([]) + + temb_channels = in_channels if norm_type == "spatial" else None + + # mid + self.mid_block = UNetMidBlock2D( + in_channels=block_out_channels[-1], + resnet_eps=1e-6, + resnet_act_fn=act_fn, + output_scale_factor=1, + resnet_time_scale_shift="default" if norm_type == "group" else norm_type, + attention_head_dim=block_out_channels[-1], + resnet_groups=norm_num_groups, + temb_channels=temb_channels, + ) + + # up + reversed_block_out_channels = list(reversed(block_out_channels)) + output_channel = reversed_block_out_channels[0] + for i, up_block_type in enumerate(up_block_types): + prev_output_channel = output_channel + output_channel = reversed_block_out_channels[i] + + is_final_block = i == len(block_out_channels) - 1 + + up_block = get_up_block( + up_block_type, + num_layers=self.layers_per_block + 1, + in_channels=prev_output_channel, + out_channels=output_channel, + prev_output_channel=None, + add_upsample=not is_final_block, + resnet_eps=1e-6, + resnet_act_fn=act_fn, + resnet_groups=norm_num_groups, + attention_head_dim=output_channel, + temb_channels=temb_channels, + resnet_time_scale_shift=norm_type, + ) + self.up_blocks.append(up_block) + prev_output_channel = output_channel + + # condition encoder + self.condition_encoder = MaskConditionEncoder( + in_ch=out_channels, + out_ch=block_out_channels[0], + res_ch=block_out_channels[-1], + ) + + # out + if norm_type == "spatial": + self.conv_norm_out = SpatialNorm(block_out_channels[0], temb_channels) + else: + self.conv_norm_out = nn.GroupNorm(num_channels=block_out_channels[0], num_groups=norm_num_groups, eps=1e-6) + self.conv_act = nn.SiLU() + self.conv_out = nn.Conv2d(block_out_channels[0], out_channels, 3, padding=1) + + self.gradient_checkpointing = False + + def forward( + self, + z: torch.Tensor, + image: Optional[torch.Tensor] = None, + mask: Optional[torch.Tensor] = None, + latent_embeds: Optional[torch.Tensor] = None, + ) -> torch.Tensor: + r"""The forward method of the `MaskConditionDecoder` class.""" + sample = z + sample = self.conv_in(sample) + + upscale_dtype = next(iter(self.up_blocks.parameters())).dtype + if torch.is_grad_enabled() and self.gradient_checkpointing: + + def create_custom_forward(module): + def custom_forward(*inputs): + return module(*inputs) + + return custom_forward + + if is_torch_version(">=", "1.11.0"): + # middle + sample = torch.utils.checkpoint.checkpoint( + create_custom_forward(self.mid_block), + sample, + latent_embeds, + use_reentrant=False, + ) + sample = sample.to(upscale_dtype) + + # condition encoder + if image is not None and mask is not None: + masked_image = (1 - mask) * image + im_x = torch.utils.checkpoint.checkpoint( + create_custom_forward(self.condition_encoder), + 
masked_image, + mask, + use_reentrant=False, + ) + + # up + for up_block in self.up_blocks: + if image is not None and mask is not None: + sample_ = im_x[str(tuple(sample.shape))] + mask_ = nn.functional.interpolate(mask, size=sample.shape[-2:], mode="nearest") + sample = sample * mask_ + sample_ * (1 - mask_) + sample = torch.utils.checkpoint.checkpoint( + create_custom_forward(up_block), + sample, + latent_embeds, + use_reentrant=False, + ) + if image is not None and mask is not None: + sample = sample * mask + im_x[str(tuple(sample.shape))] * (1 - mask) + else: + # middle + sample = torch.utils.checkpoint.checkpoint( + create_custom_forward(self.mid_block), sample, latent_embeds + ) + sample = sample.to(upscale_dtype) + + # condition encoder + if image is not None and mask is not None: + masked_image = (1 - mask) * image + im_x = torch.utils.checkpoint.checkpoint( + create_custom_forward(self.condition_encoder), + masked_image, + mask, + ) + + # up + for up_block in self.up_blocks: + if image is not None and mask is not None: + sample_ = im_x[str(tuple(sample.shape))] + mask_ = nn.functional.interpolate(mask, size=sample.shape[-2:], mode="nearest") + sample = sample * mask_ + sample_ * (1 - mask_) + sample = torch.utils.checkpoint.checkpoint(create_custom_forward(up_block), sample, latent_embeds) + if image is not None and mask is not None: + sample = sample * mask + im_x[str(tuple(sample.shape))] * (1 - mask) + else: + # middle + sample = self.mid_block(sample, latent_embeds) + sample = sample.to(upscale_dtype) + + # condition encoder + if image is not None and mask is not None: + masked_image = (1 - mask) * image + im_x = self.condition_encoder(masked_image, mask) + + # up + for up_block in self.up_blocks: + if image is not None and mask is not None: + sample_ = im_x[str(tuple(sample.shape))] + mask_ = nn.functional.interpolate(mask, size=sample.shape[-2:], mode="nearest") + sample = sample * mask_ + sample_ * (1 - mask_) + sample = up_block(sample, latent_embeds) + if image is not None and mask is not None: + sample = sample * mask + im_x[str(tuple(sample.shape))] * (1 - mask) + + # post-process + if latent_embeds is None: + sample = self.conv_norm_out(sample) + else: + sample = self.conv_norm_out(sample, latent_embeds) + sample = self.conv_act(sample) + sample = self.conv_out(sample) + + return sample + + +class VectorQuantizer(nn.Module): + """ + Improved version over VectorQuantizer, can be used as a drop-in replacement. Mostly avoids costly matrix + multiplications and allows for post-hoc remapping of indices. + """ + + # NOTE: due to a bug the beta term was applied to the wrong term. for + # backwards compatibility we use the buggy version by default, but you can + # specify legacy=False to fix it. 
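+    # To make the NOTE above concrete, the commitment loss computed in forward()
+    # is, with sg[.] denoting stop-gradient (.detach()):
+    #   legacy=True  (default): loss = mean((sg[z_q] - z)^2) + beta * mean((z_q - sg[z])^2)
+    #   legacy=False:           loss = beta * mean((sg[z_q] - z)^2) + mean((z_q - sg[z])^2)
+    # i.e. the flag only controls which of the two terms is scaled by beta.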
+ def __init__( + self, + n_e: int, + vq_embed_dim: int, + beta: float, + remap=None, + unknown_index: str = "random", + sane_index_shape: bool = False, + legacy: bool = True, + ): + super().__init__() + self.n_e = n_e + self.vq_embed_dim = vq_embed_dim + self.beta = beta + self.legacy = legacy + + self.embedding = nn.Embedding(self.n_e, self.vq_embed_dim) + self.embedding.weight.data.uniform_(-1.0 / self.n_e, 1.0 / self.n_e) + + self.remap = remap + if self.remap is not None: + self.register_buffer("used", torch.tensor(np.load(self.remap))) + self.used: torch.Tensor + self.re_embed = self.used.shape[0] + self.unknown_index = unknown_index # "random" or "extra" or integer + if self.unknown_index == "extra": + self.unknown_index = self.re_embed + self.re_embed = self.re_embed + 1 + print( + f"Remapping {self.n_e} indices to {self.re_embed} indices. " + f"Using {self.unknown_index} for unknown indices." + ) + else: + self.re_embed = n_e + + self.sane_index_shape = sane_index_shape + + def remap_to_used(self, inds: torch.LongTensor) -> torch.LongTensor: + ishape = inds.shape + assert len(ishape) > 1 + inds = inds.reshape(ishape[0], -1) + used = self.used.to(inds) + match = (inds[:, :, None] == used[None, None, ...]).long() + new = match.argmax(-1) + unknown = match.sum(2) < 1 + if self.unknown_index == "random": + new[unknown] = torch.randint(0, self.re_embed, size=new[unknown].shape).to(device=new.device) + else: + new[unknown] = self.unknown_index + return new.reshape(ishape) + + def unmap_to_all(self, inds: torch.LongTensor) -> torch.LongTensor: + ishape = inds.shape + assert len(ishape) > 1 + inds = inds.reshape(ishape[0], -1) + used = self.used.to(inds) + if self.re_embed > self.used.shape[0]: # extra token + inds[inds >= self.used.shape[0]] = 0 # simply set to zero + back = torch.gather(used[None, :][inds.shape[0] * [0], :], 1, inds) + return back.reshape(ishape) + + def forward(self, z: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, Tuple]: + # reshape z -> (batch, height, width, channel) and flatten + z = z.permute(0, 2, 3, 1).contiguous() + z_flattened = z.view(-1, self.vq_embed_dim) + + # distances from z to embeddings e_j (z - e)^2 = z^2 + e^2 - 2 e * z + min_encoding_indices = torch.argmin(torch.cdist(z_flattened, self.embedding.weight), dim=1) + + z_q = self.embedding(min_encoding_indices).view(z.shape) + perplexity = None + min_encodings = None + + # compute loss for embedding + if not self.legacy: + loss = self.beta * torch.mean((z_q.detach() - z) ** 2) + torch.mean((z_q - z.detach()) ** 2) + else: + loss = torch.mean((z_q.detach() - z) ** 2) + self.beta * torch.mean((z_q - z.detach()) ** 2) + + # preserve gradients + z_q: torch.Tensor = z + (z_q - z).detach() + + # reshape back to match original input shape + z_q = z_q.permute(0, 3, 1, 2).contiguous() + + if self.remap is not None: + min_encoding_indices = min_encoding_indices.reshape(z.shape[0], -1) # add batch axis + min_encoding_indices = self.remap_to_used(min_encoding_indices) + min_encoding_indices = min_encoding_indices.reshape(-1, 1) # flatten + + if self.sane_index_shape: + min_encoding_indices = min_encoding_indices.reshape(z_q.shape[0], z_q.shape[2], z_q.shape[3]) + + return z_q, loss, (perplexity, min_encodings, min_encoding_indices) + + def get_codebook_entry(self, indices: torch.LongTensor, shape: Tuple[int, ...]) -> torch.Tensor: + # shape specifying (batch, height, width, channel) + if self.remap is not None: + indices = indices.reshape(shape[0], -1) # add batch axis + indices = self.unmap_to_all(indices) + 
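+            # unmap_to_all() translates indices over the remapped ("used") codebook
+            # back into indices of the full embedding table before the lookup below.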
indices = indices.reshape(-1) # flatten again + + # get quantized latent vectors + z_q: torch.Tensor = self.embedding(indices) + + if shape is not None: + z_q = z_q.view(shape) + # reshape back to match original input shape + z_q = z_q.permute(0, 3, 1, 2).contiguous() + + return z_q + + +class DiagonalGaussianDistribution(object): + def __init__(self, parameters: torch.Tensor, deterministic: bool = False): + self.parameters = parameters + self.mean, self.logvar = torch.chunk(parameters, 2, dim=1) + self.logvar = torch.clamp(self.logvar, -30.0, 20.0) + self.deterministic = deterministic + self.std = torch.exp(0.5 * self.logvar) + self.var = torch.exp(self.logvar) + if self.deterministic: + self.var = self.std = torch.zeros_like( + self.mean, device=self.parameters.device, dtype=self.parameters.dtype + ) + + def sample(self, generator: Optional[torch.Generator] = None) -> torch.Tensor: + # make sure sample is on the same device as the parameters and has same dtype + sample = randn_tensor( + self.mean.shape, + generator=generator, + device=self.parameters.device, + dtype=self.parameters.dtype, + ) + x = self.mean + self.std * sample + return x + + def kl(self, other: "DiagonalGaussianDistribution" = None) -> torch.Tensor: + if self.deterministic: + return torch.Tensor([0.0]) + else: + if other is None: + return 0.5 * torch.sum( + torch.pow(self.mean, 2) + self.var - 1.0 - self.logvar, + dim=[1, 2, 3], + ) + else: + return 0.5 * torch.sum( + torch.pow(self.mean - other.mean, 2) / other.var + + self.var / other.var + - 1.0 + - self.logvar + + other.logvar, + dim=[1, 2, 3], + ) + + def nll(self, sample: torch.Tensor, dims: Tuple[int, ...] = [1, 2, 3]) -> torch.Tensor: + if self.deterministic: + return torch.Tensor([0.0]) + logtwopi = np.log(2.0 * np.pi) + return 0.5 * torch.sum( + logtwopi + self.logvar + torch.pow(sample - self.mean, 2) / self.var, + dim=dims, + ) + + def mode(self) -> torch.Tensor: + return self.mean + + +class EncoderTiny(nn.Module): + r""" + The `EncoderTiny` layer is a simpler version of the `Encoder` layer. + + Args: + in_channels (`int`): + The number of input channels. + out_channels (`int`): + The number of output channels. + num_blocks (`Tuple[int, ...]`): + Each value of the tuple represents a Conv2d layer followed by `value` number of `AutoencoderTinyBlock`'s to + use. + block_out_channels (`Tuple[int, ...]`): + The number of output channels for each block. + act_fn (`str`): + The activation function to use. See `~diffusers.models.activations.get_activation` for available options. 
+ """ + + def __init__( + self, + in_channels: int, + out_channels: int, + num_blocks: Tuple[int, ...], + block_out_channels: Tuple[int, ...], + act_fn: str, + ): + super().__init__() + + layers = [] + for i, num_block in enumerate(num_blocks): + num_channels = block_out_channels[i] + + if i == 0: + layers.append(nn.Conv2d(in_channels, num_channels, kernel_size=3, padding=1)) + else: + layers.append( + nn.Conv2d( + num_channels, + num_channels, + kernel_size=3, + padding=1, + stride=2, + bias=False, + ) + ) + + for _ in range(num_block): + layers.append(AutoencoderTinyBlock(num_channels, num_channels, act_fn)) + + layers.append(nn.Conv2d(block_out_channels[-1], out_channels, kernel_size=3, padding=1)) + + self.layers = nn.Sequential(*layers) + self.gradient_checkpointing = False + + def forward(self, x: torch.Tensor) -> torch.Tensor: + r"""The forward method of the `EncoderTiny` class.""" + if torch.is_grad_enabled() and self.gradient_checkpointing: + + def create_custom_forward(module): + def custom_forward(*inputs): + return module(*inputs) + + return custom_forward + + if is_torch_version(">=", "1.11.0"): + x = torch.utils.checkpoint.checkpoint(create_custom_forward(self.layers), x, use_reentrant=False) + else: + x = torch.utils.checkpoint.checkpoint(create_custom_forward(self.layers), x) + + else: + # scale image from [-1, 1] to [0, 1] to match TAESD convention + x = self.layers(x.add(1).div(2)) + + return x + + +class DecoderTiny(nn.Module): + r""" + The `DecoderTiny` layer is a simpler version of the `Decoder` layer. + + Args: + in_channels (`int`): + The number of input channels. + out_channels (`int`): + The number of output channels. + num_blocks (`Tuple[int, ...]`): + Each value of the tuple represents a Conv2d layer followed by `value` number of `AutoencoderTinyBlock`'s to + use. + block_out_channels (`Tuple[int, ...]`): + The number of output channels for each block. + upsampling_scaling_factor (`int`): + The scaling factor to use for upsampling. + act_fn (`str`): + The activation function to use. See `~diffusers.models.activations.get_activation` for available options. + """ + + def __init__( + self, + in_channels: int, + out_channels: int, + num_blocks: Tuple[int, ...], + block_out_channels: Tuple[int, ...], + upsampling_scaling_factor: int, + act_fn: str, + upsample_fn: str, + ): + super().__init__() + + layers = [ + nn.Conv2d(in_channels, block_out_channels[0], kernel_size=3, padding=1), + get_activation(act_fn), + ] + + for i, num_block in enumerate(num_blocks): + is_final_block = i == (len(num_blocks) - 1) + num_channels = block_out_channels[i] + + for _ in range(num_block): + layers.append(AutoencoderTinyBlock(num_channels, num_channels, act_fn)) + + if not is_final_block: + layers.append(nn.Upsample(scale_factor=upsampling_scaling_factor, mode=upsample_fn)) + + conv_out_channel = num_channels if not is_final_block else out_channels + layers.append( + nn.Conv2d( + num_channels, + conv_out_channel, + kernel_size=3, + padding=1, + bias=is_final_block, + ) + ) + + self.layers = nn.Sequential(*layers) + self.gradient_checkpointing = False + + def forward(self, x: torch.Tensor) -> torch.Tensor: + r"""The forward method of the `DecoderTiny` class.""" + # Clamp. 
+ x = torch.tanh(x / 3) * 3 + + if torch.is_grad_enabled() and self.gradient_checkpointing: + + def create_custom_forward(module): + def custom_forward(*inputs): + return module(*inputs) + + return custom_forward + + if is_torch_version(">=", "1.11.0"): + x = torch.utils.checkpoint.checkpoint(create_custom_forward(self.layers), x, use_reentrant=False) + else: + x = torch.utils.checkpoint.checkpoint(create_custom_forward(self.layers), x) + + else: + x = self.layers(x) + + # scale image from [0, 1] to [-1, 1] to match diffusers convention + return x.mul(2).sub(1) \ No newline at end of file diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/inference_cogview3plus.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/inference_cogview3plus.py new file mode 100644 index 0000000000..33d0c9aeed --- /dev/null +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/inference_cogview3plus.py @@ -0,0 +1,99 @@ +#!/usr/bin/env python +# coding=utf-8 +# Copyright 2024 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse +import logging +import torch + +from cogview3plus import CogView3PlusPipeline + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + + +def parse_arguments(): + parser = argparse.ArgumentParser(description="Generate an image using the CogView3-Plus-3B model.") + + # Define arguments for prompt, model path, etc. + parser.add_argument( + "--prompt", + type=list, + default=[ + "A vibrant cherry red sports car sits proudly under the gleaming sun, its polished exterior smooth and flawless, casting a mirror-like reflection. The car features a low, aerodynamic body, angular headlights that gaze forward like predatory eyes, and a set of black, high-gloss racing rims that contrast starkly with the red. A subtle hint of chrome embellishes the grille and exhaust, while the tinted windows suggest a luxurious and private interior. The scene conveys a sense of speed and elegance, the car appearing as if it's about to burst into a sprint along a coastal road, with the ocean's azure waves crashing in the background." + ], + help="The text description for generating the image." + ) + parser.add_argument( + "--model_path", type=str, default="/data/CogView3B", help="Path to the pre-trained model." + ) + parser.add_argument( + "--guidance_scale", type=float, default=7.0, help="The guidance scale for classifier-free guidance." + ) + parser.add_argument( + "--num_images_per_prompt", type=int, default=1, help="Number of images to generate per prompt." 
+ ) + parser.add_argument("--num_inference_steps", type=int, default=50, help="Number of denoising steps for inference.") + parser.add_argument("--width", type=int, default=1024, help="Width of the generated image.") + parser.add_argument("--height", type=int, default=1024, help="Height of the generated image.") + parser.add_argument("--output_path", type=str, default="cogview3.png", help="Path to save the generated image.") + parser.add_argument("--dtype", type=str, default="bf16", help="bf16 or fp16") + parser.add_argument("--device_id", type=int, default=2, help="NPU device id") + + return parser.parse_args() + + +def generate_image( + prompt, model_path, guidance_scale, num_images_per_prompt, num_inference_steps, width, height, output_path, dtype +): + # Load the pre-trained model with the specified precision + pipe = CogView3PlusPipeline.from_pretrained(model_path, torch_dtype=dtype) + + # Generate the image based on the prompt + image = pipe( + prompt=prompt, + guidance_scale=guidance_scale, + num_images_per_prompt=num_images_per_prompt, + num_inference_steps=num_inference_steps, + width=width, + height=height, + ).images[0] + + # Save the generated image to the local file system + image.save(output_path) + + print(f"Image saved to {output_path}") + + +def infer(args): + torch.npu.set_device(args.device_id) + dtype = torch.bfloat16 if args.dtype == "bf16" else torch.float16 + generate_image( + prompt=args.prompt[0], + model_path=args.model_path, + guidance_scale=args.guidance_scale, + num_images_per_prompt=args.num_images_per_prompt, + num_inference_steps=args.num_inference_steps, + width=args.width, + height=args.height, + output_path=args.output_path, + dtype=dtype, + ) + + +if __name__ == "__main__": + inference_args = parse_arguments() + infer(inference_args) + diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/requirents.txt b/MindIE/MindIE-Torch/built-in/foundation/cogview3/requirents.txt new file mode 100644 index 0000000000..ac2fa2a7f6 --- /dev/null +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/requirents.txt @@ -0,0 +1,15 @@ +accelerate==0.29.3 +deepspeed==0.15.4 +einops==0.7.0 +gradio==3.50.2 +huggingface-hub==0.24.7 +Jinja2==3.1.4 +numpy==1.26.4 +peft==0.10.0 +safetensors==0.4.5 +timm==0.9.5 +tokenizers==0.15.2 +torch==2.1.0 +torchvision==0.14.1 +tqdm==4.66.5 +transformers==4.39.3 \ No newline at end of file -- Gitee From 4d72fcf83352eaa7498e533e02c3dd8ccb01fc11 Mon Sep 17 00:00:00 2001 From: jiangmengyu Date: Tue, 24 Dec 2024 20:04:47 +0800 Subject: [PATCH 02/91] =?UTF-8?q?CogView3plus=E9=80=82=E9=85=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../built-in/foundation/cogview3/cogview3plus/vae/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/vae/__init__.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/vae/__init__.py index a91642a899..58bbb8f14e 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/vae/__init__.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/vae/__init__.py @@ -1 +1 @@ -from autoencoder_kl import AutoencoderKL \ No newline at end of file +from .autoencoder_kl import AutoencoderKL \ No newline at end of file -- Gitee From 69b98374c21a0a4f395fd044fbc93dc5a1049eff Mon Sep 17 00:00:00 2001 From: jiangmengyu Date: Tue, 24 Dec 2024 20:05:28 +0800 Subject: [PATCH 03/91] =?UTF-8?q?CogView3plus=E9=80=82=E9=85=8D?= MIME-Version: 1.0 
Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../foundation/cogview3/cogview3plus/models/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/__init__.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/__init__.py index 06571c58d3..68d6997c34 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/__init__.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/__init__.py @@ -1 +1 @@ -from transformer_cogview3plus import CogView3PlusTransformer2DModel \ No newline at end of file +from .transformer_cogview3plus import CogView3PlusTransformer2DModel \ No newline at end of file -- Gitee From 2a0a4ea777fc9562a51b98cce4243d017a80b4d6 Mon Sep 17 00:00:00 2001 From: jiangmengyu Date: Tue, 24 Dec 2024 20:11:01 +0800 Subject: [PATCH 04/91] =?UTF-8?q?CogView3plus=E9=80=82=E9=85=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../cogview3plus/models/activations.py | 178 ++++++++++++++++++ .../cogview3/cogview3plus/models/attention.py | 2 +- 2 files changed, 179 insertions(+), 1 deletion(-) create mode 100644 MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/activations.py diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/activations.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/activations.py new file mode 100644 index 0000000000..cb1c29919e --- /dev/null +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/activations.py @@ -0,0 +1,178 @@ +# coding=utf-8 +# Copyright 2024 HuggingFace Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import torch +import torch.nn.functional as F +from torch import nn + +from diffusers.utils import deprecate +from diffusers.utils.import_utils import is_torch_npu_available, is_torch_version + + +if is_torch_npu_available(): + import torch_npu + +ACTIVATION_FUNCTIONS = { + "swish": nn.SiLU(), + "silu": nn.SiLU(), + "mish": nn.Mish(), + "gelu": nn.GELU(), + "relu": nn.ReLU(), +} + + +def get_activation(act_fn: str) -> nn.Module: + """Helper function to get activation function from string. + + Args: + act_fn (str): Name of activation function. + + Returns: + nn.Module: Activation function. + """ + + act_fn = act_fn.lower() + if act_fn in ACTIVATION_FUNCTIONS: + return ACTIVATION_FUNCTIONS[act_fn] + else: + raise ValueError(f"Unsupported activation function: {act_fn}") + + +class FP32SiLU(nn.Module): + r""" + SiLU activation function with input upcasted to torch.float32. + """ + + def __init__(self): + super().__init__() + + def forward(self, inputs: torch.Tensor) -> torch.Tensor: + return F.silu(inputs.float(), inplace=False).to(inputs.dtype) + + +class GELU(nn.Module): + r""" + GELU activation function with tanh approximation support with `approximate="tanh"`. 
+ + Parameters: + dim_in (`int`): The number of channels in the input. + dim_out (`int`): The number of channels in the output. + approximate (`str`, *optional*, defaults to `"none"`): If `"tanh"`, use tanh approximation. + bias (`bool`, defaults to True): Whether to use a bias in the linear layer. + """ + + def __init__(self, dim_in: int, dim_out: int, approximate: str = "none", bias: bool = True): + super().__init__() + self.proj = nn.Linear(dim_in, dim_out, bias=bias) + self.approximate = approximate + + def gelu(self, gate: torch.Tensor) -> torch.Tensor: + if gate.device.type == "mps" and is_torch_version("<", "2.0.0"): + # fp16 gelu not supported on mps before torch 2.0 + return F.gelu(gate.to(dtype=torch.float32), approximate=self.approximate).to(dtype=gate.dtype) + return F.gelu(gate, approximate=self.approximate) + + def forward(self, hidden_states): + hidden_states = self.proj(hidden_states) + hidden_states = self.gelu(hidden_states) + return hidden_states + + +class GEGLU(nn.Module): + r""" + A [variant](https://arxiv.org/abs/2002.05202) of the gated linear unit activation function. + + Parameters: + dim_in (`int`): The number of channels in the input. + dim_out (`int`): The number of channels in the output. + bias (`bool`, defaults to True): Whether to use a bias in the linear layer. + """ + + def __init__(self, dim_in: int, dim_out: int, bias: bool = True): + super().__init__() + self.proj = nn.Linear(dim_in, dim_out * 2, bias=bias) + + def gelu(self, gate: torch.Tensor) -> torch.Tensor: + if gate.device.type == "mps" and is_torch_version("<", "2.0.0"): + # fp16 gelu not supported on mps before torch 2.0 + return F.gelu(gate.to(dtype=torch.float32)).to(dtype=gate.dtype) + return F.gelu(gate) + + def forward(self, hidden_states, *args, **kwargs): + if len(args) > 0 or kwargs.get("scale", None) is not None: + deprecation_message = "The `scale` argument is deprecated and will be ignored. Please remove it, as passing it will raise an error in the future. `scale` should directly be passed while calling the underlying pipeline component i.e., via `cross_attention_kwargs`." + deprecate("scale", "1.0.0", deprecation_message) + hidden_states = self.proj(hidden_states) + if is_torch_npu_available(): + # using torch_npu.npu_geglu can run faster and save memory on NPU. + return torch_npu.npu_geglu(hidden_states, dim=-1, approximate=1)[0] + else: + hidden_states, gate = hidden_states.chunk(2, dim=-1) + return hidden_states * self.gelu(gate) + + +class SwiGLU(nn.Module): + r""" + A [variant](https://arxiv.org/abs/2002.05202) of the gated linear unit activation function. It's similar to `GEGLU` + but uses SiLU / Swish instead of GeLU. + + Parameters: + dim_in (`int`): The number of channels in the input. + dim_out (`int`): The number of channels in the output. + bias (`bool`, defaults to True): Whether to use a bias in the linear layer. + """ + + def __init__(self, dim_in: int, dim_out: int, bias: bool = True): + super().__init__() + + self.proj = nn.Linear(dim_in, dim_out * 2, bias=bias) + self.activation = nn.SiLU() + + def forward(self, hidden_states): + hidden_states = self.proj(hidden_states) + hidden_states, gate = hidden_states.chunk(2, dim=-1) + return hidden_states * self.activation(gate) + + +class ApproximateGELU(nn.Module): + r""" + The approximate form of the Gaussian Error Linear Unit (GELU). For more details, see section 2 of this + [paper](https://arxiv.org/abs/1606.08415). + + Parameters: + dim_in (`int`): The number of channels in the input. 
+ dim_out (`int`): The number of channels in the output. + bias (`bool`, defaults to True): Whether to use a bias in the linear layer. + """ + + def __init__(self, dim_in: int, dim_out: int, bias: bool = True): + super().__init__() + self.proj = nn.Linear(dim_in, dim_out, bias=bias) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + x = self.proj(x) + return x * torch.sigmoid(1.702 * x) + + +class LinearActivation(nn.Module): + def __init__(self, dim_in: int, dim_out: int, bias: bool = True, activation: str = "silu"): + super().__init__() + + self.proj = nn.Linear(dim_in, dim_out, bias=bias) + self.activation = get_activation(activation) + + def forward(self, hidden_states): + hidden_states = self.proj(hidden_states) + return self.activation(hidden_states) \ No newline at end of file diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/attention.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/attention.py index 00aabc9fdd..ac85e70e05 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/attention.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/attention.py @@ -17,7 +17,7 @@ import torch from torch import nn from diffusers.utils import deprecate, logging -from diffusers.models.activations import GEGLU, GELU, ApproximateGELU, LinearActivation, SwiGLU +from .activations import GEGLU, GELU, ApproximateGELU, LinearActivation, SwiGLU logger = logging.get_logger(__name__) -- Gitee From cea407effe135e8a64852975c73319b1a34dbc59 Mon Sep 17 00:00:00 2001 From: jiangmengyu Date: Tue, 24 Dec 2024 20:11:54 +0800 Subject: [PATCH 05/91] =?UTF-8?q?CogView3plus=E9=80=82=E9=85=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../cogview3/cogview3plus/models/attention_processor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/attention_processor.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/attention_processor.py index 6632f7f83f..796efa0318 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/attention_processor.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/attention_processor.py @@ -35,7 +35,7 @@ class CogVideoXAttnProcessor2_0: attn: Attention, hidden_states: torch.Tensor, encoder_hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None,s + attention_mask: Optional[torch.Tensor] = None, ) -> torch.Tensor: text_seq_length = encoder_hidden_states.size(1) -- Gitee From 3d8b3bc40bd596dd4524bb75bb1edb5cba0be03d Mon Sep 17 00:00:00 2001 From: jiangmengyu Date: Tue, 24 Dec 2024 20:20:06 +0800 Subject: [PATCH 06/91] =?UTF-8?q?CogView3plus=E9=80=82=E9=85=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../cogview3/cogview3plus/models/transformer_cogview3plus.py | 1 + 1 file changed, 1 insertion(+) diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/transformer_cogview3plus.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/transformer_cogview3plus.py index 78360f61e9..782fa5b4db 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/transformer_cogview3plus.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/transformer_cogview3plus.py @@ -180,6 +180,7 @@ class 
CogView3PlusTransformer2DModel(ModelMixin, ConfigMixin): sample_size: int = 128, ): super().__init__() + print("====================================================") self.out_channels = out_channels self.inner_dim = num_attention_heads * attention_head_dim -- Gitee From 97c427ae393947263635cc62fe938c1a5e15e6b4 Mon Sep 17 00:00:00 2001 From: jiangmengyu Date: Wed, 25 Dec 2024 14:07:30 +0800 Subject: [PATCH 07/91] =?UTF-8?q?CogView3plus=E9=80=82=E9=85=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../cogview3/cogview3plus/__init__.py | 5 ++- .../cogview3/cogview3plus/models/__init__.py | 3 +- .../cogview3plus/models/modeling_utils.py | 1 + .../models/transformer_cogview3plus.py | 3 +- .../cogview3plus/pipeline/__init__.py | 2 +- .../cogview3plus/schedulers/__init__.py | 3 +- .../cogview3plus/vae/autoencoder_kl.py | 3 +- .../cogview3/inference_cogview3plus.py | 35 +++++++++++++------ 8 files changed, 37 insertions(+), 18 deletions(-) create mode 100644 MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/modeling_utils.py diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/__init__.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/__init__.py index acbd223eb6..11a5548362 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/__init__.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/__init__.py @@ -15,4 +15,7 @@ # limitations under the License. -from .pipeline import CogView3PlusPipeline +from .pipeline import CogView3PlusPipeline, DiffusionPipeline +from .vae import AutoencoderKL +from .schedulers import CogVideoXDDIMScheduler, CogVideoXDPMScheduler, SchedulerMixin +from .models import CogView3PlusTransformer2DModel, ModelMixin \ No newline at end of file diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/__init__.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/__init__.py index 68d6997c34..b3c595bfcc 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/__init__.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/__init__.py @@ -1 +1,2 @@ -from .transformer_cogview3plus import CogView3PlusTransformer2DModel \ No newline at end of file +from .transformer_cogview3plus import CogView3PlusTransformer2DModel +from .modeling_utils import ModelMixin \ No newline at end of file diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/modeling_utils.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/modeling_utils.py new file mode 100644 index 0000000000..1b4243486f --- /dev/null +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/modeling_utils.py @@ -0,0 +1 @@ +from diffusers import ModelMixin \ No newline at end of file diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/transformer_cogview3plus.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/transformer_cogview3plus.py index 782fa5b4db..9a343d4c86 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/transformer_cogview3plus.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/transformer_cogview3plus.py @@ -20,10 +20,10 @@ import torch.nn as nn from diffusers.configuration_utils import ConfigMixin, register_to_config from diffusers.models.attention_processor import Attention, AttentionProcessor -from 
diffusers.models.modeling_utils import ModelMixin from diffusers.utils import is_torch_version, logging from diffusers.models.modeling_outputs import Transformer2DModelOutput +from .modeling_utils import ModelMixin from .attention import FeedForward from .attention_processor import CogVideoXAttnProcessor2_0 from ..layers import CogView3PlusAdaLayerNormZeroTextImage, AdaLayerNormContinuous @@ -180,7 +180,6 @@ class CogView3PlusTransformer2DModel(ModelMixin, ConfigMixin): sample_size: int = 128, ): super().__init__() - print("====================================================") self.out_channels = out_channels self.inner_dim = num_attention_heads * attention_head_dim diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/__init__.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/__init__.py index aea730c2e3..626e0d588b 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/__init__.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/__init__.py @@ -1 +1 @@ -from .pipeline_cogview3plus import CogView3PlusPipeline \ No newline at end of file +from .pipeline_cogview3plus import CogView3PlusPipeline, DiffusionPipeline \ No newline at end of file diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/schedulers/__init__.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/schedulers/__init__.py index 76b000d4bb..32d0c223e7 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/schedulers/__init__.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/schedulers/__init__.py @@ -1,2 +1,3 @@ from .scheduling_ddim_cogvideox import CogVideoXDDIMScheduler -from .scheduling_dpm_cogvideox import CogVideoXDPMScheduler \ No newline at end of file +from .scheduling_dpm_cogvideox import CogVideoXDPMScheduler +from .scheduling_utils import SchedulerMixin \ No newline at end of file diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/vae/autoencoder_kl.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/vae/autoencoder_kl.py index 99ba70c8cd..fcba50ccae 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/vae/autoencoder_kl.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/vae/autoencoder_kl.py @@ -31,7 +31,8 @@ from diffusers.models.attention_processor import ( FusedAttnProcessor2_0, ) from diffusers.models.modeling_outputs import AutoencoderKLOutput -from diffusers.models.modeling_utils import ModelMixin + +from ..models import ModelMixin from .vae import Decoder, DecoderOutput, DiagonalGaussianDistribution, Encoder diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/inference_cogview3plus.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/inference_cogview3plus.py index 33d0c9aeed..34a2158f6b 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/inference_cogview3plus.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/inference_cogview3plus.py @@ -17,6 +17,7 @@ import argparse import logging import torch +import time from cogview3plus import CogView3PlusPipeline @@ -59,17 +60,29 @@ def generate_image( prompt, model_path, guidance_scale, num_images_per_prompt, num_inference_steps, width, height, output_path, dtype ): # Load the pre-trained model with the specified precision - pipe = CogView3PlusPipeline.from_pretrained(model_path, torch_dtype=dtype) - - # Generate the image based on the prompt - image = pipe( - 
prompt=prompt, - guidance_scale=guidance_scale, - num_images_per_prompt=num_images_per_prompt, - num_inference_steps=num_inference_steps, - width=width, - height=height, - ).images[0] + pipe = CogView3PlusPipeline.from_pretrained(model_path, torch_dtype=dtype).to("npu") + + use_time = 0 + loops = 5 + for i in range(loops): + start_time = time.time() + # Generate the image based on the prompt + image = pipe( + prompt=prompt, + guidance_scale=guidance_scale, + num_images_per_prompt=num_images_per_prompt, + num_inference_steps=num_inference_steps, + width=width, + height=height, + ).images[0] + + if i >= 2: + use_time += time.time() - start_time + logger.info("current_time is %.3f )", time.time() - start_time) + + torch.npu.empty_cache() + + logger.info("use_time is %.3f)", use_time / 3) # Save the generated image to the local file system image.save(output_path) -- Gitee From c9b51415e826ff8c71d45f71e6e60c29d464243a Mon Sep 17 00:00:00 2001 From: jiangmengyu Date: Wed, 25 Dec 2024 14:23:32 +0800 Subject: [PATCH 08/91] =?UTF-8?q?CogView3plus=E9=80=82=E9=85=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../cogview3plus/models/attention_processor.py | 17 +++++++++++++++-- .../cogview3/inference_cogview3plus.py | 2 +- 2 files changed, 16 insertions(+), 3 deletions(-) diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/attention_processor.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/attention_processor.py index 796efa0318..1b9ce8616c 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/attention_processor.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/attention_processor.py @@ -16,6 +16,7 @@ from typing import Optional import torch import torch.nn.functional as F +import torch_npu from diffusers.models.attention_processor import Attention @@ -65,9 +66,21 @@ class CogVideoXAttnProcessor2_0: if attn.norm_k is not None: key = attn.norm_k(key) - hidden_states = F.scaled_dot_product_attention( - query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False + _, N, _, D = query.shape + query = F.pad(query, (0, 64 - D)) + key = F.pad(key, (0, 64 - D)) + value = F.pad(value, (0, 64 - D)) + hidden_states = torch_npu.npu_prompt_flash_attention( + query, + key, + value, + input_layout='BNSD', + scale_value=D**-0.5, + pre_tokens=65535, + next_tokens=65535, + num_heads=N ) + hidden_states = hidden_states[:, :, :, :D] hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim) diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/inference_cogview3plus.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/inference_cogview3plus.py index 34a2158f6b..74dd914294 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/inference_cogview3plus.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/inference_cogview3plus.py @@ -51,7 +51,7 @@ def parse_arguments(): parser.add_argument("--height", type=int, default=1024, help="Height of the generated image.") parser.add_argument("--output_path", type=str, default="cogview3.png", help="Path to save the generated image.") parser.add_argument("--dtype", type=str, default="bf16", help="bf16 or fp16") - parser.add_argument("--device_id", type=int, default=2, help="NPU device id") + parser.add_argument("--device_id", type=int, default=1, help="NPU device id") return parser.parse_args() -- Gitee From 62a721a2f5d0485f19726ddce37cd825225f6d86 Mon 
Sep 17 00:00:00 2001 From: jiangmengyu Date: Wed, 25 Dec 2024 17:06:13 +0800 Subject: [PATCH 09/91] =?UTF-8?q?CogView3plus=E9=80=82=E9=85=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../cogview3plus/models/model_load_utils.py | 42 + .../cogview3plus/models/modeling_utils.py | 1423 ++++++++++++++++- 2 files changed, 1464 insertions(+), 1 deletion(-) create mode 100644 MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/model_load_utils.py diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/model_load_utils.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/model_load_utils.py new file mode 100644 index 0000000000..f6d3b20570 --- /dev/null +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/model_load_utils.py @@ -0,0 +1,42 @@ +#!/usr/bin/env python +# coding=utf-8 +# Copyright(C) 2024. Huawei Technologies Co.,Ltd. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License + +import os +import torch +import safetensors.torch + + +SAFETENSORS_EXTENSION = "safetensors" +EMA_STATE_DICT = "ema_state_dict" +STATE_DICT = "state_dict" +CPU = "cpu" + + +def load_state_dict_sd(model_path): + name = os.path.basename(model_path).split('.')[-1] # get weights name + if name.endswith("ckpt"): + weight = torch.load(model_path, map_location=CPU) + if (EMA_STATE_DICT in weight): + weight = weight[EMA_STATE_DICT] + weight = {key.replace("module.", ""): value for key, value in weight.items()} + elif STATE_DICT in weight: + weight = weight[STATE_DICT] + return weight + elif name == SAFETENSORS_EXTENSION: # diffuser model use same name + return safetensors.torch.load_file(model_path, device=CPU) # first load on cpu + else: + # to support hf shard model weights + return torch.load(model_path, map_location=CPU) # first load on cpu \ No newline at end of file diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/modeling_utils.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/modeling_utils.py index 1b4243486f..a0740b8c67 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/modeling_utils.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/modeling_utils.py @@ -1 +1,1422 @@ -from diffusers import ModelMixin \ No newline at end of file +# coding=utf-8 +# Copyright 2024 The HuggingFace Inc. team. +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import copy +import inspect +import itertools +import json +import os +import re +from collections import OrderedDict +from functools import partial, wraps +from pathlib import Path +from typing import Any, Callable, List, Optional, Tuple, Union + +import safetensors +import torch +from huggingface_hub import create_repo, split_torch_state_dict_into_shards +from huggingface_hub.utils import validate_hf_hub_args +from torch import Tensor, nn + +from diffusers import __version__ +from diffusers.quantizers import DiffusersAutoQuantizer, DiffusersQuantizer +from diffusers.quantizers.quantization_config import QuantizationMethod +from diffusers.utils import ( + CONFIG_NAME, + FLAX_WEIGHTS_NAME, + SAFE_WEIGHTS_INDEX_NAME, + SAFETENSORS_WEIGHTS_NAME, + WEIGHTS_INDEX_NAME, + WEIGHTS_NAME, + _add_variant, + _get_checkpoint_shard_files, + _get_model_file, + deprecate, + is_accelerate_available, + is_bitsandbytes_available, + is_bitsandbytes_version, + is_torch_version, + logging, +) +from diffusers.utils.hub_utils import ( + PushToHubMixin, + load_or_create_model_card, + populate_model_card, +) +from diffusers.models.model_loading_utils import ( + _determine_device_map, + _fetch_index_file, + _fetch_index_file_legacy, + _load_state_dict_into_model, + _merge_sharded_checkpoints, + load_model_dict_into_meta, + load_state_dict, +) + +from .model_load_utils import load_state_dict_sd + + +logger = logging.get_logger(__name__) + +_REGEX_SHARD = re.compile(r"(.*?)-\d{5}-of-\d{5}") + + +if is_torch_version(">=", "1.9.0"): + _LOW_CPU_MEM_USAGE_DEFAULT = True +else: + _LOW_CPU_MEM_USAGE_DEFAULT = False + + +if is_accelerate_available(): + import accelerate + + +def get_parameter_device(parameter: torch.nn.Module) -> torch.device: + try: + parameters_and_buffers = itertools.chain(parameter.parameters(), parameter.buffers()) + return next(parameters_and_buffers).device + except StopIteration: + # For torch.nn.DataParallel compatibility in PyTorch 1.5 + + def find_tensor_attributes(module: torch.nn.Module) -> List[Tuple[str, Tensor]]: + tuples = [(k, v) for k, v in module.__dict__.items() if torch.is_tensor(v)] + return tuples + + gen = parameter._named_members(get_members_fn=find_tensor_attributes) + first_tuple = next(gen) + return first_tuple[1].device + + +def get_parameter_dtype(parameter: torch.nn.Module) -> torch.dtype: + """ + Returns the first found floating dtype in parameters if there is one, otherwise returns the last dtype it found. 
+ """ + last_dtype = None + for param in parameter.parameters(): + last_dtype = param.dtype + if param.is_floating_point(): + return param.dtype + + for buffer in parameter.buffers(): + last_dtype = buffer.dtype + if buffer.is_floating_point(): + return buffer.dtype + + if last_dtype is not None: + # if no floating dtype was found return whatever the first dtype is + return last_dtype + + # For nn.DataParallel compatibility in PyTorch > 1.5 + def find_tensor_attributes(module: nn.Module) -> List[Tuple[str, Tensor]]: + tuples = [(k, v) for k, v in module.__dict__.items() if torch.is_tensor(v)] + return tuples + + gen = parameter._named_members(get_members_fn=find_tensor_attributes) + last_tuple = None + for tuple in gen: + last_tuple = tuple + if tuple[1].is_floating_point(): + return tuple[1].dtype + + if last_tuple is not None: + # fallback to the last dtype + return last_tuple[1].dtype + + +class ModelMixin(torch.nn.Module, PushToHubMixin): + r""" + Base class for all models. + + [`ModelMixin`] takes care of storing the model configuration and provides methods for loading, downloading and + saving models. + + - **config_name** ([`str`]) -- Filename to save a model to when calling [`~models.ModelMixin.save_pretrained`]. + """ + + config_name = CONFIG_NAME + _automatically_saved_args = ["_diffusers_version", "_class_name", "_name_or_path"] + _supports_gradient_checkpointing = False + _keys_to_ignore_on_load_unexpected = None + _no_split_modules = None + _keep_in_fp32_modules = None + + def __init__(self): + super().__init__() + + def __getattr__(self, name: str) -> Any: + """The only reason we overwrite `getattr` here is to gracefully deprecate accessing + config attributes directly. See https://github.com/huggingface/diffusers/pull/3129 We need to overwrite + __getattr__ here in addition so that we don't trigger `torch.nn.Module`'s __getattr__': + https://pytorch.org/docs/stable/_modules/torch/nn/modules/module.html#Module + """ + + is_in_config = "_internal_dict" in self.__dict__ and hasattr(self.__dict__["_internal_dict"], name) + is_attribute = name in self.__dict__ + + if is_in_config and not is_attribute: + deprecation_message = f"Accessing config attribute `{name}` directly via '{type(self).__name__}' object attribute is deprecated. Please access '{name}' over '{type(self).__name__}'s config object instead, e.g. 'unet.config.{name}'." + deprecate("direct config name access", "1.0.0", deprecation_message, standard_warn=False, stacklevel=3) + return self._internal_dict[name] + + # call PyTorch's https://pytorch.org/docs/stable/_modules/torch/nn/modules/module.html#Module + return super().__getattr__(name) + + @property + def is_gradient_checkpointing(self) -> bool: + """ + Whether gradient checkpointing is activated for this model or not. + """ + return any(hasattr(m, "gradient_checkpointing") and m.gradient_checkpointing for m in self.modules()) + + def enable_gradient_checkpointing(self) -> None: + """ + Activates gradient checkpointing for the current model (may be referred to as *activation checkpointing* or + *checkpoint activations* in other frameworks). + """ + if not self._supports_gradient_checkpointing: + raise ValueError(f"{self.__class__.__name__} does not support gradient checkpointing.") + self.apply(partial(self._set_gradient_checkpointing, value=True)) + + def disable_gradient_checkpointing(self) -> None: + """ + Deactivates gradient checkpointing for the current model (may be referred to as *activation checkpointing* or + *checkpoint activations* in other frameworks). 
+ """ + if self._supports_gradient_checkpointing: + self.apply(partial(self._set_gradient_checkpointing, value=False)) + + def set_use_npu_flash_attention(self, valid: bool) -> None: + r""" + Set the switch for the npu flash attention. + """ + + def fn_recursive_set_npu_flash_attention(module: torch.nn.Module): + if hasattr(module, "set_use_npu_flash_attention"): + module.set_use_npu_flash_attention(valid) + + for child in module.children(): + fn_recursive_set_npu_flash_attention(child) + + for module in self.children(): + if isinstance(module, torch.nn.Module): + fn_recursive_set_npu_flash_attention(module) + + def enable_npu_flash_attention(self) -> None: + r""" + Enable npu flash attention from torch_npu + + """ + self.set_use_npu_flash_attention(True) + + def disable_npu_flash_attention(self) -> None: + r""" + disable npu flash attention from torch_npu + + """ + self.set_use_npu_flash_attention(False) + + def set_use_xla_flash_attention( + self, use_xla_flash_attention: bool, partition_spec: Optional[Callable] = None + ) -> None: + # Recursively walk through all the children. + # Any children which exposes the set_use_xla_flash_attention method + # gets the message + def fn_recursive_set_flash_attention(module: torch.nn.Module): + if hasattr(module, "set_use_xla_flash_attention"): + module.set_use_xla_flash_attention(use_xla_flash_attention, partition_spec) + + for child in module.children(): + fn_recursive_set_flash_attention(child) + + for module in self.children(): + if isinstance(module, torch.nn.Module): + fn_recursive_set_flash_attention(module) + + def enable_xla_flash_attention(self, partition_spec: Optional[Callable] = None): + r""" + Enable the flash attention pallals kernel for torch_xla. + """ + self.set_use_xla_flash_attention(True, partition_spec) + + def disable_xla_flash_attention(self): + r""" + Disable the flash attention pallals kernel for torch_xla. + """ + self.set_use_xla_flash_attention(False) + + def set_use_memory_efficient_attention_xformers( + self, valid: bool, attention_op: Optional[Callable] = None + ) -> None: + # Recursively walk through all the children. + # Any children which exposes the set_use_memory_efficient_attention_xformers method + # gets the message + def fn_recursive_set_mem_eff(module: torch.nn.Module): + if hasattr(module, "set_use_memory_efficient_attention_xformers"): + module.set_use_memory_efficient_attention_xformers(valid, attention_op) + + for child in module.children(): + fn_recursive_set_mem_eff(child) + + for module in self.children(): + if isinstance(module, torch.nn.Module): + fn_recursive_set_mem_eff(module) + + def enable_xformers_memory_efficient_attention(self, attention_op: Optional[Callable] = None) -> None: + r""" + Enable memory efficient attention from [xFormers](https://facebookresearch.github.io/xformers/). + + When this option is enabled, you should observe lower GPU memory usage and a potential speed up during + inference. Speed up during training is not guaranteed. + + + + ⚠️ When memory efficient attention and sliced attention are both enabled, memory efficient attention takes + precedent. + + + + Parameters: + attention_op (`Callable`, *optional*): + Override the default `None` operator for use as `op` argument to the + [`memory_efficient_attention()`](https://facebookresearch.github.io/xformers/components/ops.html#xformers.ops.memory_efficient_attention) + function of xFormers. 
+ + Examples: + + ```py + >>> import torch + >>> from diffusers import UNet2DConditionModel + >>> from xformers.ops import MemoryEfficientAttentionFlashAttentionOp + + >>> model = UNet2DConditionModel.from_pretrained( + ... "stabilityai/stable-diffusion-2-1", subfolder="unet", torch_dtype=torch.float16 + ... ) + >>> model = model.to("cuda") + >>> model.enable_xformers_memory_efficient_attention(attention_op=MemoryEfficientAttentionFlashAttentionOp) + ``` + """ + self.set_use_memory_efficient_attention_xformers(True, attention_op) + + def disable_xformers_memory_efficient_attention(self) -> None: + r""" + Disable memory efficient attention from [xFormers](https://facebookresearch.github.io/xformers/). + """ + self.set_use_memory_efficient_attention_xformers(False) + + def save_pretrained( + self, + save_directory: Union[str, os.PathLike], + is_main_process: bool = True, + save_function: Optional[Callable] = None, + safe_serialization: bool = True, + variant: Optional[str] = None, + max_shard_size: Union[int, str] = "10GB", + push_to_hub: bool = False, + **kwargs, + ): + """ + Save a model and its configuration file to a directory so that it can be reloaded using the + [`~models.ModelMixin.from_pretrained`] class method. + + Arguments: + save_directory (`str` or `os.PathLike`): + Directory to save a model and its configuration file to. Will be created if it doesn't exist. + is_main_process (`bool`, *optional*, defaults to `True`): + Whether the process calling this is the main process or not. Useful during distributed training and you + need to call this function on all processes. In this case, set `is_main_process=True` only on the main + process to avoid race conditions. + save_function (`Callable`): + The function to use to save the state dictionary. Useful during distributed training when you need to + replace `torch.save` with another method. Can be configured with the environment variable + `DIFFUSERS_SAVE_MODE`. + safe_serialization (`bool`, *optional*, defaults to `True`): + Whether to save the model using `safetensors` or the traditional PyTorch way with `pickle`. + variant (`str`, *optional*): + If specified, weights are saved in the format `pytorch_model..bin`. + max_shard_size (`int` or `str`, defaults to `"10GB"`): + The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size + lower than this size. If expressed as a string, needs to be digits followed by a unit (like `"5GB"`). + If expressed as an integer, the unit is bytes. Note that this limit will be decreased after a certain + period of time (starting from Oct 2024) to allow users to upgrade to the latest version of `diffusers`. + This is to establish a common default size for this argument across different libraries in the Hugging + Face ecosystem (`transformers`, and `accelerate`, for example). + push_to_hub (`bool`, *optional*, defaults to `False`): + Whether or not to push your model to the Hugging Face Hub after saving it. You can specify the + repository you want to push to with `repo_id` (will default to the name of `save_directory` in your + namespace). + kwargs (`Dict[str, Any]`, *optional*): + Additional keyword arguments passed along to the [`~utils.PushToHubMixin.push_to_hub`] method. 
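+
+        Example (illustrative; the target directory and shard size below are placeholders):
+
+        ```py
+        >>> # assuming `model` is any loaded ModelMixin subclass, e.g. the CogView3Plus transformer
+        >>> model.save_pretrained("./cogview3plus-transformer", max_shard_size="5GB")
+        ```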
+ """ + if os.path.isfile(save_directory): + logger.error(f"Provided path ({save_directory}) should be a directory, not a file") + return + + hf_quantizer = getattr(self, "hf_quantizer", None) + if hf_quantizer is not None: + quantization_serializable = ( + hf_quantizer is not None + and isinstance(hf_quantizer, DiffusersQuantizer) + and hf_quantizer.is_serializable + ) + if not quantization_serializable: + raise ValueError( + f"The model is quantized with {hf_quantizer.quantization_config.quant_method} and is not serializable - check out the warnings from" + " the logger on the traceback to understand the reason why the quantized model is not serializable." + ) + + weights_name = SAFETENSORS_WEIGHTS_NAME if safe_serialization else WEIGHTS_NAME + weights_name = _add_variant(weights_name, variant) + weights_name_pattern = weights_name.replace(".bin", "{suffix}.bin").replace( + ".safetensors", "{suffix}.safetensors" + ) + + os.makedirs(save_directory, exist_ok=True) + + if push_to_hub: + commit_message = kwargs.pop("commit_message", None) + private = kwargs.pop("private", None) + create_pr = kwargs.pop("create_pr", False) + token = kwargs.pop("token", None) + repo_id = kwargs.pop("repo_id", save_directory.split(os.path.sep)[-1]) + repo_id = create_repo(repo_id, exist_ok=True, private=private, token=token).repo_id + + # Only save the model itself if we are using distributed training + model_to_save = self + + # Attach architecture to the config + # Save the config + if is_main_process: + model_to_save.save_config(save_directory) + + # Save the model + state_dict = model_to_save.state_dict() + + # Save the model + state_dict_split = split_torch_state_dict_into_shards( + state_dict, max_shard_size=max_shard_size, filename_pattern=weights_name_pattern + ) + + # Clean the folder from a previous save + if is_main_process: + for filename in os.listdir(save_directory): + if filename in state_dict_split.filename_to_tensors.keys(): + continue + full_filename = os.path.join(save_directory, filename) + if not os.path.isfile(full_filename): + continue + weights_without_ext = weights_name_pattern.replace(".bin", "").replace(".safetensors", "") + weights_without_ext = weights_without_ext.replace("{suffix}", "") + filename_without_ext = filename.replace(".bin", "").replace(".safetensors", "") + # make sure that file to be deleted matches format of sharded file, e.g. pytorch_model-00001-of-00005 + if ( + filename.startswith(weights_without_ext) + and _REGEX_SHARD.fullmatch(filename_without_ext) is not None + ): + os.remove(full_filename) + + for filename, tensors in state_dict_split.filename_to_tensors.items(): + shard = {tensor: state_dict[tensor] for tensor in tensors} + filepath = os.path.join(save_directory, filename) + if safe_serialization: + # At some point we will need to deal better with save_function (used for TPU and other distributed + # joyfulness), but for now this enough. 
+ safetensors.torch.save_file(shard, filepath, metadata={"format": "pt"}) + else: + torch.save(shard, filepath) + + if state_dict_split.is_sharded: + index = { + "metadata": state_dict_split.metadata, + "weight_map": state_dict_split.tensor_to_filename, + } + save_index_file = SAFE_WEIGHTS_INDEX_NAME if safe_serialization else WEIGHTS_INDEX_NAME + save_index_file = os.path.join(save_directory, _add_variant(save_index_file, variant)) + # Save the index as well + with open(save_index_file, "w", encoding="utf-8") as f: + content = json.dumps(index, indent=2, sort_keys=True) + "\n" + f.write(content) + logger.info( + f"The model is bigger than the maximum size per checkpoint ({max_shard_size}) and is going to be " + f"split in {len(state_dict_split.filename_to_tensors)} checkpoint shards. You can find where each parameters has been saved in the " + f"index located at {save_index_file}." + ) + else: + path_to_weights = os.path.join(save_directory, weights_name) + logger.info(f"Model weights saved in {path_to_weights}") + + if push_to_hub: + # Create a new empty model card and eventually tag it + model_card = load_or_create_model_card(repo_id, token=token) + model_card = populate_model_card(model_card) + model_card.save(Path(save_directory, "README.md").as_posix()) + + self._upload_folder( + save_directory, + repo_id, + token=token, + commit_message=commit_message, + create_pr=create_pr, + ) + + def dequantize(self): + """ + Potentially dequantize the model in case it has been quantized by a quantization method that support + dequantization. + """ + hf_quantizer = getattr(self, "hf_quantizer", None) + + if hf_quantizer is None: + raise ValueError("You need to first quantize your model in order to dequantize it") + + return hf_quantizer.dequantize(self) + + @classmethod + @validate_hf_hub_args + def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.PathLike]], **kwargs): + r""" + Instantiate a pretrained PyTorch model from a pretrained model configuration. + + The model is set in evaluation mode - `model.eval()` - by default, and dropout modules are deactivated. To + train the model, set it back in training mode with `model.train()`. + + Parameters: + pretrained_model_name_or_path (`str` or `os.PathLike`, *optional*): + Can be either: + + - A string, the *model id* (for example `google/ddpm-celebahq-256`) of a pretrained model hosted on + the Hub. + - A path to a *directory* (for example `./my_model_directory`) containing the model weights saved + with [`~ModelMixin.save_pretrained`]. + + cache_dir (`Union[str, os.PathLike]`, *optional*): + Path to a directory where a downloaded pretrained model configuration is cached if the standard cache + is not used. + torch_dtype (`str` or `torch.dtype`, *optional*): + Override the default `torch.dtype` and load the model with another dtype. If `"auto"` is passed, the + dtype is automatically derived from the model's weights. + force_download (`bool`, *optional*, defaults to `False`): + Whether or not to force the (re-)download of the model weights and configuration files, overriding the + cached versions if they exist. + proxies (`Dict[str, str]`, *optional*): + A dictionary of proxy servers to use by protocol or endpoint, for example, `{'http': 'foo.bar:3128', + 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request. + output_loading_info (`bool`, *optional*, defaults to `False`): + Whether or not to also return a dictionary containing missing keys, unexpected keys and error messages. 
+ local_files_only(`bool`, *optional*, defaults to `False`): + Whether to only load local model weights and configuration files or not. If set to `True`, the model + won't be downloaded from the Hub. + token (`str` or *bool*, *optional*): + The token to use as HTTP bearer authorization for remote files. If `True`, the token generated from + `diffusers-cli login` (stored in `~/.huggingface`) is used. + revision (`str`, *optional*, defaults to `"main"`): + The specific model version to use. It can be a branch name, a tag name, a commit id, or any identifier + allowed by Git. + from_flax (`bool`, *optional*, defaults to `False`): + Load the model weights from a Flax checkpoint save file. + subfolder (`str`, *optional*, defaults to `""`): + The subfolder location of a model file within a larger model repository on the Hub or locally. + mirror (`str`, *optional*): + Mirror source to resolve accessibility issues if you're downloading a model in China. We do not + guarantee the timeliness or safety of the source, and you should refer to the mirror site for more + information. + device_map (`str` or `Dict[str, Union[int, str, torch.device]]`, *optional*): + A map that specifies where each submodule should go. It doesn't need to be defined for each + parameter/buffer name; once a given module name is inside, every submodule of it will be sent to the + same device. Defaults to `None`, meaning that the model will be loaded on CPU. + + Set `device_map="auto"` to have 🤗 Accelerate automatically compute the most optimized `device_map`. For + more information about each option see [designing a device + map](https://hf.co/docs/accelerate/main/en/usage_guides/big_modeling#designing-a-device-map). + max_memory (`Dict`, *optional*): + A dictionary device identifier for the maximum memory. Will default to the maximum memory available for + each GPU and the available CPU RAM if unset. + offload_folder (`str` or `os.PathLike`, *optional*): + The path to offload weights if `device_map` contains the value `"disk"`. + offload_state_dict (`bool`, *optional*): + If `True`, temporarily offloads the CPU state dict to the hard drive to avoid running out of CPU RAM if + the weight of the CPU state dict + the biggest shard of the checkpoint does not fit. Defaults to `True` + when there is some disk offload. + low_cpu_mem_usage (`bool`, *optional*, defaults to `True` if torch version >= 1.9.0 else `False`): + Speed up model loading only loading the pretrained weights and not initializing the weights. This also + tries to not use more than 1x model size in CPU memory (including peak memory) while loading the model. + Only supported for PyTorch >= 1.9.0. If you are using an older version of PyTorch, setting this + argument to `True` will raise an error. + variant (`str`, *optional*): + Load weights from a specified `variant` filename such as `"fp16"` or `"ema"`. This is ignored when + loading `from_flax`. + use_safetensors (`bool`, *optional*, defaults to `None`): + If set to `None`, the `safetensors` weights are downloaded if they're available **and** if the + `safetensors` library is installed. If set to `True`, the model is forcibly loaded from `safetensors` + weights. If set to `False`, `safetensors` weights are not loaded. + + + + To use private or [gated models](https://huggingface.co/docs/hub/models-gated#gated-models), log-in with + `huggingface-cli login`. 
You can also activate the special + ["offline-mode"](https://huggingface.co/diffusers/installation.html#offline-mode) to use this method in a + firewalled environment. + + + + Example: + + ```py + from diffusers import UNet2DConditionModel + + unet = UNet2DConditionModel.from_pretrained("runwayml/stable-diffusion-v1-5", subfolder="unet") + ``` + + If you get the error message below, you need to finetune the weights for your downstream task: + + ```bash + Some weights of UNet2DConditionModel were not initialized from the model checkpoint at runwayml/stable-diffusion-v1-5 and are newly initialized because the shapes did not match: + - conv_in.weight: found shape torch.Size([320, 4, 3, 3]) in the checkpoint and torch.Size([320, 9, 3, 3]) in the model instantiated + You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference. + ``` + """ + cache_dir = kwargs.pop("cache_dir", None) + ignore_mismatched_sizes = kwargs.pop("ignore_mismatched_sizes", False) + force_download = kwargs.pop("force_download", False) + from_flax = kwargs.pop("from_flax", False) + proxies = kwargs.pop("proxies", None) + output_loading_info = kwargs.pop("output_loading_info", False) + local_files_only = kwargs.pop("local_files_only", None) + token = kwargs.pop("token", None) + revision = kwargs.pop("revision", None) + torch_dtype = kwargs.pop("torch_dtype", None) + subfolder = kwargs.pop("subfolder", None) + device_map = kwargs.pop("device_map", None) + max_memory = kwargs.pop("max_memory", None) + offload_folder = kwargs.pop("offload_folder", None) + offload_state_dict = kwargs.pop("offload_state_dict", False) + low_cpu_mem_usage = kwargs.pop("low_cpu_mem_usage", _LOW_CPU_MEM_USAGE_DEFAULT) + variant = kwargs.pop("variant", None) + use_safetensors = kwargs.pop("use_safetensors", None) + quantization_config = kwargs.pop("quantization_config", None) + + allow_pickle = False + if use_safetensors is None: + use_safetensors = True + allow_pickle = True + + if low_cpu_mem_usage and not is_accelerate_available(): + low_cpu_mem_usage = False + logger.warning( + "Cannot initialize model with low cpu memory usage because `accelerate` was not found in the" + " environment. Defaulting to `low_cpu_mem_usage=False`. It is strongly recommended to install" + " `accelerate` for faster and less memory-intense model loading. You can do so with: \n```\npip" + " install accelerate\n```\n." + ) + + if device_map is not None and not is_accelerate_available(): + raise NotImplementedError( + "Loading and dispatching requires `accelerate`. Please make sure to install accelerate or set" + " `device_map=None`. You can install accelerate with `pip install accelerate`." + ) + + # Check if we can handle device_map and dispatching the weights + if device_map is not None and not is_torch_version(">=", "1.9.0"): + raise NotImplementedError( + "Loading and dispatching requires torch >= 1.9.0. Please either update your PyTorch version or set" + " `device_map=None`." + ) + + if low_cpu_mem_usage is True and not is_torch_version(">=", "1.9.0"): + raise NotImplementedError( + "Low memory initialization requires torch >= 1.9.0. Please either update your PyTorch version or set" + " `low_cpu_mem_usage=False`." + ) + + if low_cpu_mem_usage is False and device_map is not None: + raise ValueError( + f"You cannot set `low_cpu_mem_usage` to `False` while using device_map={device_map} for loading and" + " dispatching. Please make sure to set `low_cpu_mem_usage=True`." 
+ ) + + # change device_map into a map if we passed an int, a str or a torch.device + if isinstance(device_map, torch.device): + device_map = {"": device_map} + elif isinstance(device_map, str) and device_map not in ["auto", "balanced", "balanced_low_0", "sequential"]: + try: + device_map = {"": torch.device(device_map)} + except RuntimeError: + raise ValueError( + "When passing device_map as a string, the value needs to be a device name (e.g. cpu, cuda:0) or " + f"'auto', 'balanced', 'balanced_low_0', 'sequential' but found {device_map}." + ) + elif isinstance(device_map, int): + if device_map < 0: + raise ValueError( + "You can't pass device_map as a negative int. If you want to put the model on the cpu, pass device_map = 'cpu' " + ) + else: + device_map = {"": device_map} + + if device_map is not None: + if low_cpu_mem_usage is None: + low_cpu_mem_usage = True + elif not low_cpu_mem_usage: + raise ValueError("Passing along a `device_map` requires `low_cpu_mem_usage=True`") + + if low_cpu_mem_usage: + if device_map is not None and not is_torch_version(">=", "1.10"): + # The max memory utils require PyTorch >= 1.10 to have torch.cuda.mem_get_info. + raise ValueError("`low_cpu_mem_usage` and `device_map` require PyTorch >= 1.10.") + + # Load config if we don't provide a configuration + config_path = pretrained_model_name_or_path + + user_agent = { + "diffusers": __version__, + "file_type": "model", + "framework": "pytorch", + } + + # load config + config, unused_kwargs, commit_hash = cls.load_config( + config_path, + cache_dir=cache_dir, + return_unused_kwargs=True, + return_commit_hash=True, + force_download=force_download, + proxies=proxies, + local_files_only=local_files_only, + token=token, + revision=revision, + subfolder=subfolder, + user_agent=user_agent, + **kwargs, + ) + # no in-place modification of the original config. + config = copy.deepcopy(config) + + # determine initial quantization config. + ####################################### + pre_quantized = "quantization_config" in config and config["quantization_config"] is not None + if pre_quantized or quantization_config is not None: + if pre_quantized: + config["quantization_config"] = DiffusersAutoQuantizer.merge_quantization_configs( + config["quantization_config"], quantization_config + ) + else: + config["quantization_config"] = quantization_config + hf_quantizer = DiffusersAutoQuantizer.from_config( + config["quantization_config"], pre_quantized=pre_quantized + ) + else: + hf_quantizer = None + + if hf_quantizer is not None: + is_bnb_quantization_method = hf_quantizer.quantization_config.quant_method.value == "bitsandbytes" + if is_bnb_quantization_method and device_map is not None: + raise NotImplementedError( + "Currently, `device_map` is automatically inferred for quantized bitsandbytes models. Support for providing `device_map` as an input will be added in the future." + ) + + hf_quantizer.validate_environment(torch_dtype=torch_dtype, from_flax=from_flax, device_map=device_map) + torch_dtype = hf_quantizer.update_torch_dtype(torch_dtype) + + # In order to ensure popular quantization methods are supported. 
Can be disable with `disable_telemetry` + user_agent["quant"] = hf_quantizer.quantization_config.quant_method.value + + # Force-set to `True` for more mem efficiency + if low_cpu_mem_usage is None: + low_cpu_mem_usage = True + logger.info("Set `low_cpu_mem_usage` to True as `hf_quantizer` is not None.") + elif not low_cpu_mem_usage: + raise ValueError("`low_cpu_mem_usage` cannot be False or None when using quantization.") + + # Check if `_keep_in_fp32_modules` is not None + use_keep_in_fp32_modules = (cls._keep_in_fp32_modules is not None) and ( + (torch_dtype == torch.float16) or hasattr(hf_quantizer, "use_keep_in_fp32_modules") + ) + if use_keep_in_fp32_modules: + keep_in_fp32_modules = cls._keep_in_fp32_modules + if not isinstance(keep_in_fp32_modules, list): + keep_in_fp32_modules = [keep_in_fp32_modules] + + if low_cpu_mem_usage is None: + low_cpu_mem_usage = True + logger.info("Set `low_cpu_mem_usage` to True as `_keep_in_fp32_modules` is not None.") + elif not low_cpu_mem_usage: + raise ValueError("`low_cpu_mem_usage` cannot be False when `keep_in_fp32_modules` is True.") + else: + keep_in_fp32_modules = [] + ####################################### + + # Determine if we're loading from a directory of sharded checkpoints. + is_sharded = False + index_file = None + is_local = os.path.isdir(pretrained_model_name_or_path) + index_file_kwargs = { + "is_local": is_local, + "pretrained_model_name_or_path": pretrained_model_name_or_path, + "subfolder": subfolder or "", + "use_safetensors": use_safetensors, + "cache_dir": cache_dir, + "variant": variant, + "force_download": force_download, + "proxies": proxies, + "local_files_only": local_files_only, + "token": token, + "revision": revision, + "user_agent": user_agent, + "commit_hash": commit_hash, + } + index_file = _fetch_index_file(**index_file_kwargs) + # In case the index file was not found we still have to consider the legacy format. + # this becomes applicable when the variant is not None. 
+ if variant is not None and (index_file is None or not os.path.exists(index_file)): + index_file = _fetch_index_file_legacy(**index_file_kwargs) + if index_file is not None and index_file.is_file(): + is_sharded = True + + if is_sharded and from_flax: + raise ValueError("Loading of sharded checkpoints is not supported when `from_flax=True`.") + + # load model + model_file = None + if from_flax: + model_file = _get_model_file( + pretrained_model_name_or_path, + weights_name=FLAX_WEIGHTS_NAME, + cache_dir=cache_dir, + force_download=force_download, + proxies=proxies, + local_files_only=local_files_only, + token=token, + revision=revision, + subfolder=subfolder, + user_agent=user_agent, + commit_hash=commit_hash, + ) + model = cls.from_config(config, **unused_kwargs) + + # Convert the weights + from .modeling_pytorch_flax_utils import load_flax_checkpoint_in_pytorch_model + + model = load_flax_checkpoint_in_pytorch_model(model, model_file) + else: + if is_sharded: + sharded_ckpt_cached_folder, sharded_metadata = _get_checkpoint_shard_files( + pretrained_model_name_or_path, + index_file, + cache_dir=cache_dir, + proxies=proxies, + local_files_only=local_files_only, + token=token, + user_agent=user_agent, + revision=revision, + subfolder=subfolder or "", + ) + if hf_quantizer is not None and is_bnb_quantization_method: + model_file = _merge_sharded_checkpoints(sharded_ckpt_cached_folder, sharded_metadata) + logger.info("Merged sharded checkpoints as `hf_quantizer` is not None.") + is_sharded = False + + elif use_safetensors and not is_sharded: + try: + model_file = _get_model_file( + pretrained_model_name_or_path, + weights_name=_add_variant(SAFETENSORS_WEIGHTS_NAME, variant), + cache_dir=cache_dir, + force_download=force_download, + proxies=proxies, + local_files_only=local_files_only, + token=token, + revision=revision, + subfolder=subfolder, + user_agent=user_agent, + commit_hash=commit_hash, + ) + + except IOError as e: + logger.error(f"An error occurred while trying to fetch {pretrained_model_name_or_path}: {e}") + if not allow_pickle: + raise + logger.warning( + "Defaulting to unsafe serialization. Pass `allow_pickle=False` to raise an error instead." + ) + + if model_file is None and not is_sharded: + model_file = _get_model_file( + pretrained_model_name_or_path, + weights_name=_add_variant(WEIGHTS_NAME, variant), + cache_dir=cache_dir, + force_download=force_download, + proxies=proxies, + local_files_only=local_files_only, + token=token, + revision=revision, + subfolder=subfolder, + user_agent=user_agent, + commit_hash=commit_hash, + ) + + if low_cpu_mem_usage: + # Instantiate model with empty weights + with accelerate.init_empty_weights(): + model = cls.from_config(config, **unused_kwargs) + + if hf_quantizer is not None: + hf_quantizer.preprocess_model( + model=model, device_map=device_map, keep_in_fp32_modules=keep_in_fp32_modules + ) + + # if device_map is None, load the state dict and move the params from meta device to the cpu + if device_map is None and not is_sharded: + # `torch.cuda.current_device()` is fine here when `hf_quantizer` is not None. + # It would error out during the `validate_environment()` call above in the absence of cuda. 
+ if hf_quantizer is None: + param_device = "cpu" + # TODO (sayakpaul, SunMarc): remove this after model loading refactor + else: + param_device = torch.device(torch.cuda.current_device()) + state_dict = load_state_dict(model_file, variant=variant) + model._convert_deprecated_attention_blocks(state_dict) + + # move the params from meta device to cpu + missing_keys = set(model.state_dict().keys()) - set(state_dict.keys()) + if hf_quantizer is not None: + missing_keys = hf_quantizer.update_missing_keys(model, missing_keys, prefix="") + if len(missing_keys) > 0: + raise ValueError( + f"Cannot load {cls} from {pretrained_model_name_or_path} because the following keys are" + f" missing: \n {', '.join(missing_keys)}. \n Please make sure to pass" + " `low_cpu_mem_usage=False` and `device_map=None` if you want to randomly initialize" + " those weights or else make sure your checkpoint file is correct." + ) + + unexpected_keys = load_model_dict_into_meta( + model, + state_dict, + device=param_device, + dtype=torch_dtype, + model_name_or_path=pretrained_model_name_or_path, + hf_quantizer=hf_quantizer, + keep_in_fp32_modules=keep_in_fp32_modules, + ) + + if cls._keys_to_ignore_on_load_unexpected is not None: + for pat in cls._keys_to_ignore_on_load_unexpected: + unexpected_keys = [k for k in unexpected_keys if re.search(pat, k) is None] + + if len(unexpected_keys) > 0: + logger.warning( + f"Some weights of the model checkpoint were not used when initializing {cls.__name__}: \n {[', '.join(unexpected_keys)]}" + ) + + else: + weights_path = index_file + with open(index_file) as f: + index = json.loads(f.read()) + if "weight_map" in index: + index = index["weight_map"] + weights_path = sorted(list(set(index.values()))) + weights_path = [os.path.join(pretrained_model_name_or_path, f) for f in weights_path] + + model = cls._load_model(model, weights_path, is_sharded) + + loading_info = { + "missing_keys": [], + "unexpected_keys": [], + "mismatched_keys": [], + "error_msgs": [], + } + else: + model = cls.from_config(config, **unused_kwargs) + + state_dict = load_state_dict(model_file, variant=variant) + model._convert_deprecated_attention_blocks(state_dict) + + model, missing_keys, unexpected_keys, mismatched_keys, error_msgs = cls._load_pretrained_model( + model, + state_dict, + model_file, + pretrained_model_name_or_path, + ignore_mismatched_sizes=ignore_mismatched_sizes, + ) + + loading_info = { + "missing_keys": missing_keys, + "unexpected_keys": unexpected_keys, + "mismatched_keys": mismatched_keys, + "error_msgs": error_msgs, + } + + if hf_quantizer is not None: + hf_quantizer.postprocess_model(model) + model.hf_quantizer = hf_quantizer + + if torch_dtype is not None and not isinstance(torch_dtype, torch.dtype): + raise ValueError( + f"{torch_dtype} needs to be of type `torch.dtype`, e.g. `torch.float16`, but is {type(torch_dtype)}." + ) + # When using `use_keep_in_fp32_modules` if we do a global `to()` here, then we will + # completely lose the effectivity of `use_keep_in_fp32_modules`. + elif torch_dtype is not None and hf_quantizer is None and not use_keep_in_fp32_modules: + model = model.to(torch_dtype) + + if hf_quantizer is not None: + # We also make sure to purge `_pre_quantization_dtype` when we serialize + # the model config because `_pre_quantization_dtype` is `torch.dtype`, not JSON serializable. 
+ model.register_to_config(_name_or_path=pretrained_model_name_or_path, _pre_quantization_dtype=torch_dtype) + else: + model.register_to_config(_name_or_path=pretrained_model_name_or_path) + + # Set model in evaluation mode to deactivate DropOut modules by default + model.eval() + if output_loading_info: + return model, loading_info + + return model + + @classmethod + def _load_model(cls, model, weights_path, is_sharded): + if not is_sharded: + state_dict = load_state_dict(weights_path) + model.load_weights(state_dict) + else: + need_key = set(model.state_dict().keys()) + state_dict = {} + cache = {} + for weight_file in weights_path: + state_dict = load_state_dict(weight_file) + state_dict.update(cache) + loadkey_cache = model.load_weights(state_dict, is_sharded) + if loadkey_cache : + if isinstance(loadkey_cache, tuple): + loaded_keys, cache = loadkey_cache + else: + loaded_keys = loadkey_cache + need_key = need_key.symmetric_difference(set(loaded_keys)) + + if len(need_key) > 0: + raise ValueError(f"The weight miss key: {need_key}") + return model + + def load_weights(self, state_dict, shard=False): + with torch.no_grad(): + if not shard: + self.load_state_dict(state_dict) + return {} + else: + self.load_state_dict(state_dict, strict=False, assign=True) + return state_dict.keys() + + # Adapted from `transformers`. + @wraps(torch.nn.Module.cuda) + def cuda(self, *args, **kwargs): + # Checks if the model has been loaded in 4-bit or 8-bit with BNB + if getattr(self, "quantization_method", None) == QuantizationMethod.BITS_AND_BYTES: + if getattr(self, "is_loaded_in_8bit", False): + raise ValueError( + "Calling `cuda()` is not supported for `8-bit` quantized models. " + " Please use the model as it is, since the model has already been set to the correct devices." + ) + elif is_bitsandbytes_version("<", "0.43.2"): + raise ValueError( + "Calling `cuda()` is not supported for `4-bit` quantized models with the installed version of bitsandbytes. " + f"The current device is `{self.device}`. If you intended to move the model, please install bitsandbytes >= 0.43.2." + ) + return super().cuda(*args, **kwargs) + + # Adapted from `transformers`. + @wraps(torch.nn.Module.to) + def to(self, *args, **kwargs): + dtype_present_in_args = "dtype" in kwargs + + if not dtype_present_in_args: + for arg in args: + if isinstance(arg, torch.dtype): + dtype_present_in_args = True + break + + if getattr(self, "is_quantized", False): + if dtype_present_in_args: + raise ValueError( + "Casting a quantized model to a new `dtype` is unsupported. To set the dtype of unquantized layers, please " + "use the `torch_dtype` argument when loading the model using `from_pretrained` or `from_single_file`" + ) + + if getattr(self, "quantization_method", None) == QuantizationMethod.BITS_AND_BYTES: + if getattr(self, "is_loaded_in_8bit", False): + raise ValueError( + "`.to` is not supported for `8-bit` bitsandbytes models. Please use the model as it is, since the" + " model has already been set to the correct devices and casted to the correct `dtype`." + ) + elif is_bitsandbytes_version("<", "0.43.2"): + raise ValueError( + "Calling `to()` is not supported for `4-bit` quantized models with the installed version of bitsandbytes. " + f"The current device is `{self.device}`. If you intended to move the model, please install bitsandbytes >= 0.43.2." + ) + return super().to(*args, **kwargs) + + # Taken from `transformers`. 
+ def half(self, *args): + # Checks if the model is quantized + if getattr(self, "is_quantized", False): + raise ValueError( + "`.half()` is not supported for quantized model. Please use the model as it is, since the" + " model has already been cast to the correct `dtype`." + ) + else: + return super().half(*args) + + # Taken from `transformers`. + def float(self, *args): + # Checks if the model is quantized + if getattr(self, "is_quantized", False): + raise ValueError( + "`.float()` is not supported for quantized model. Please use the model as it is, since the" + " model has already been cast to the correct `dtype`." + ) + else: + return super().float(*args) + + @classmethod + def _load_pretrained_model( + cls, + model, + state_dict: OrderedDict, + resolved_archive_file, + pretrained_model_name_or_path: Union[str, os.PathLike], + ignore_mismatched_sizes: bool = False, + ): + # Retrieve missing & unexpected_keys + model_state_dict = model.state_dict() + loaded_keys = list(state_dict.keys()) + + expected_keys = list(model_state_dict.keys()) + + original_loaded_keys = loaded_keys + + missing_keys = list(set(expected_keys) - set(loaded_keys)) + unexpected_keys = list(set(loaded_keys) - set(expected_keys)) + + # Make sure we are able to load base models as well as derived models (with heads) + model_to_load = model + + def _find_mismatched_keys( + state_dict, + model_state_dict, + loaded_keys, + ignore_mismatched_sizes, + ): + mismatched_keys = [] + if ignore_mismatched_sizes: + for checkpoint_key in loaded_keys: + model_key = checkpoint_key + + if ( + model_key in model_state_dict + and state_dict[checkpoint_key].shape != model_state_dict[model_key].shape + ): + mismatched_keys.append( + (checkpoint_key, state_dict[checkpoint_key].shape, model_state_dict[model_key].shape) + ) + del state_dict[checkpoint_key] + return mismatched_keys + + if state_dict is not None: + # Whole checkpoint + mismatched_keys = _find_mismatched_keys( + state_dict, + model_state_dict, + original_loaded_keys, + ignore_mismatched_sizes, + ) + error_msgs = _load_state_dict_into_model(model_to_load, state_dict) + + if len(error_msgs) > 0: + error_msg = "\n\t".join(error_msgs) + if "size mismatch" in error_msg: + error_msg += ( + "\n\tYou may consider adding `ignore_mismatched_sizes=True` in the model `from_pretrained` method." + ) + raise RuntimeError(f"Error(s) in loading state_dict for {model.__class__.__name__}:\n\t{error_msg}") + + if len(unexpected_keys) > 0: + logger.warning( + f"Some weights of the model checkpoint at {pretrained_model_name_or_path} were not used when" + f" initializing {model.__class__.__name__}: {unexpected_keys}\n- This IS expected if you are" + f" initializing {model.__class__.__name__} from the checkpoint of a model trained on another task" + " or with another architecture (e.g. initializing a BertForSequenceClassification model from a" + " BertForPreTraining model).\n- This IS NOT expected if you are initializing" + f" {model.__class__.__name__} from the checkpoint of a model that you expect to be exactly" + " identical (initializing a BertForSequenceClassification model from a" + " BertForSequenceClassification model)." 
+ ) + else: + logger.info(f"All model checkpoint weights were used when initializing {model.__class__.__name__}.\n") + if len(missing_keys) > 0: + logger.warning( + f"Some weights of {model.__class__.__name__} were not initialized from the model checkpoint at" + f" {pretrained_model_name_or_path} and are newly initialized: {missing_keys}\nYou should probably" + " TRAIN this model on a down-stream task to be able to use it for predictions and inference." + ) + elif len(mismatched_keys) == 0: + logger.info( + f"All the weights of {model.__class__.__name__} were initialized from the model checkpoint at" + f" {pretrained_model_name_or_path}.\nIf your task is similar to the task the model of the" + f" checkpoint was trained on, you can already use {model.__class__.__name__} for predictions" + " without further training." + ) + if len(mismatched_keys) > 0: + mismatched_warning = "\n".join( + [ + f"- {key}: found shape {shape1} in the checkpoint and {shape2} in the model instantiated" + for key, shape1, shape2 in mismatched_keys + ] + ) + logger.warning( + f"Some weights of {model.__class__.__name__} were not initialized from the model checkpoint at" + f" {pretrained_model_name_or_path} and are newly initialized because the shapes did not" + f" match:\n{mismatched_warning}\nYou should probably TRAIN this model on a down-stream task to be" + " able to use it for predictions and inference." + ) + + return model, missing_keys, unexpected_keys, mismatched_keys, error_msgs + + @classmethod + def _get_signature_keys(cls, obj): + parameters = inspect.signature(obj.__init__).parameters + required_parameters = {k: v for k, v in parameters.items() if v.default == inspect._empty} + optional_parameters = set({k for k, v in parameters.items() if v.default != inspect._empty}) + expected_modules = set(required_parameters.keys()) - {"self"} + + return expected_modules, optional_parameters + + # Adapted from `transformers` modeling_utils.py + def _get_no_split_modules(self, device_map: str): + """ + Get the modules of the model that should not be spit when using device_map. We iterate through the modules to + get the underlying `_no_split_modules`. + + Args: + device_map (`str`): + The device map value. Options are ["auto", "balanced", "balanced_low_0", "sequential"] + + Returns: + `List[str]`: List of modules that should not be split + """ + _no_split_modules = set() + modules_to_check = [self] + while len(modules_to_check) > 0: + module = modules_to_check.pop(-1) + # if the module does not appear in _no_split_modules, we also check the children + if module.__class__.__name__ not in _no_split_modules: + if isinstance(module, ModelMixin): + if module._no_split_modules is None: + raise ValueError( + f"{module.__class__.__name__} does not support `device_map='{device_map}'`. To implement support, the model " + "class needs to implement the `_no_split_modules` attribute." + ) + else: + _no_split_modules = _no_split_modules | set(module._no_split_modules) + modules_to_check += list(module.children()) + return list(_no_split_modules) + + @property + def device(self) -> torch.device: + """ + `torch.device`: The device on which the module is (assuming that all the module parameters are on the same + device). + """ + return get_parameter_device(self) + + @property + def dtype(self) -> torch.dtype: + """ + `torch.dtype`: The dtype of the module (assuming that all the module parameters have the same dtype). 
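+
+        Example (a minimal illustration; the checkpoint id is only a placeholder):
+
+        ```py
+        unet = UNet2DConditionModel.from_pretrained("runwayml/stable-diffusion-v1-5", subfolder="unet")
+        unet.to(torch.float16).dtype  # torch.float16
+        ```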
+ """ + return get_parameter_dtype(self) + + def num_parameters(self, only_trainable: bool = False, exclude_embeddings: bool = False) -> int: + """ + Get number of (trainable or non-embedding) parameters in the module. + + Args: + only_trainable (`bool`, *optional*, defaults to `False`): + Whether or not to return only the number of trainable parameters. + exclude_embeddings (`bool`, *optional*, defaults to `False`): + Whether or not to return only the number of non-embedding parameters. + + Returns: + `int`: The number of parameters. + + Example: + + ```py + from diffusers import UNet2DConditionModel + + model_id = "runwayml/stable-diffusion-v1-5" + unet = UNet2DConditionModel.from_pretrained(model_id, subfolder="unet") + unet.num_parameters(only_trainable=True) + 859520964 + ``` + """ + is_loaded_in_4bit = getattr(self, "is_loaded_in_4bit", False) + + if is_loaded_in_4bit: + if is_bitsandbytes_available(): + import bitsandbytes as bnb + else: + raise ValueError( + "bitsandbytes is not installed but it seems that the model has been loaded in 4bit precision, something went wrong" + " make sure to install bitsandbytes with `pip install bitsandbytes`. You also need a GPU. " + ) + + if exclude_embeddings: + embedding_param_names = [ + f"{name}.weight" for name, module_type in self.named_modules() if isinstance(module_type, nn.Embedding) + ] + total_parameters = [ + parameter for name, parameter in self.named_parameters() if name not in embedding_param_names + ] + else: + total_parameters = list(self.parameters()) + + total_numel = [] + + for param in total_parameters: + if param.requires_grad or not only_trainable: + # For 4bit models, we need to multiply the number of parameters by 2 as half of the parameters are + # used for the 4bit quantization (uint8 tensors are stored) + if is_loaded_in_4bit and isinstance(param, bnb.nn.Params4bit): + if hasattr(param, "element_size"): + num_bytes = param.element_size() + elif hasattr(param, "quant_storage"): + num_bytes = param.quant_storage.itemsize + else: + num_bytes = 1 + total_numel.append(param.numel() * 2 * num_bytes) + else: + total_numel.append(param.numel()) + + return sum(total_numel) + + def get_memory_footprint(self, return_buffers=True): + r""" + Get the memory footprint of a model. This will return the memory footprint of the current model in bytes. + Useful to benchmark the memory footprint of the current model and design some tests. Solution inspired from the + PyTorch discussions: https://discuss.pytorch.org/t/gpu-memory-that-model-uses/56822/2 + + Arguments: + return_buffers (`bool`, *optional*, defaults to `True`): + Whether to return the size of the buffer tensors in the computation of the memory footprint. Buffers + are tensors that do not require gradients and not registered as parameters. E.g. mean and std in batch + norm layers. 
Please see: https://discuss.pytorch.org/t/what-pytorch-means-by-buffers/120266/2 + """ + mem = sum([param.nelement() * param.element_size() for param in self.parameters()]) + if return_buffers: + mem_bufs = sum([buf.nelement() * buf.element_size() for buf in self.buffers()]) + mem = mem + mem_bufs + return mem + + def _convert_deprecated_attention_blocks(self, state_dict: OrderedDict) -> None: + deprecated_attention_block_paths = [] + + def recursive_find_attn_block(name, module): + if hasattr(module, "_from_deprecated_attn_block") and module._from_deprecated_attn_block: + deprecated_attention_block_paths.append(name) + + for sub_name, sub_module in module.named_children(): + sub_name = sub_name if name == "" else f"{name}.{sub_name}" + recursive_find_attn_block(sub_name, sub_module) + + recursive_find_attn_block("", self) + + # NOTE: we have to check if the deprecated parameters are in the state dict + # because it is possible we are loading from a state dict that was already + # converted + + for path in deprecated_attention_block_paths: + # group_norm path stays the same + + # query -> to_q + if f"{path}.query.weight" in state_dict: + state_dict[f"{path}.to_q.weight"] = state_dict.pop(f"{path}.query.weight") + if f"{path}.query.bias" in state_dict: + state_dict[f"{path}.to_q.bias"] = state_dict.pop(f"{path}.query.bias") + + # key -> to_k + if f"{path}.key.weight" in state_dict: + state_dict[f"{path}.to_k.weight"] = state_dict.pop(f"{path}.key.weight") + if f"{path}.key.bias" in state_dict: + state_dict[f"{path}.to_k.bias"] = state_dict.pop(f"{path}.key.bias") + + # value -> to_v + if f"{path}.value.weight" in state_dict: + state_dict[f"{path}.to_v.weight"] = state_dict.pop(f"{path}.value.weight") + if f"{path}.value.bias" in state_dict: + state_dict[f"{path}.to_v.bias"] = state_dict.pop(f"{path}.value.bias") + + # proj_attn -> to_out.0 + if f"{path}.proj_attn.weight" in state_dict: + state_dict[f"{path}.to_out.0.weight"] = state_dict.pop(f"{path}.proj_attn.weight") + if f"{path}.proj_attn.bias" in state_dict: + state_dict[f"{path}.to_out.0.bias"] = state_dict.pop(f"{path}.proj_attn.bias") + + def _temp_convert_self_to_deprecated_attention_blocks(self) -> None: + deprecated_attention_block_modules = [] + + def recursive_find_attn_block(module): + if hasattr(module, "_from_deprecated_attn_block") and module._from_deprecated_attn_block: + deprecated_attention_block_modules.append(module) + + for sub_module in module.children(): + recursive_find_attn_block(sub_module) + + recursive_find_attn_block(self) + + for module in deprecated_attention_block_modules: + module.query = module.to_q + module.key = module.to_k + module.value = module.to_v + module.proj_attn = module.to_out[0] + + # We don't _have_ to delete the old attributes, but it's helpful to ensure + # that _all_ the weights are loaded into the new attributes and we're not + # making an incorrect assumption that this model should be converted when + # it really shouldn't be. 
+ del module.to_q + del module.to_k + del module.to_v + del module.to_out + + def _undo_temp_convert_self_to_deprecated_attention_blocks(self) -> None: + deprecated_attention_block_modules = [] + + def recursive_find_attn_block(module) -> None: + if hasattr(module, "_from_deprecated_attn_block") and module._from_deprecated_attn_block: + deprecated_attention_block_modules.append(module) + + for sub_module in module.children(): + recursive_find_attn_block(sub_module) + + recursive_find_attn_block(self) + + for module in deprecated_attention_block_modules: + module.to_q = module.query + module.to_k = module.key + module.to_v = module.value + module.to_out = nn.ModuleList([module.proj_attn, nn.Dropout(module.dropout)]) + + del module.query + del module.key + del module.value + del module.proj_attn -- Gitee From 50bdb564ce183ce96debfb36fe5b700e07da068b Mon Sep 17 00:00:00 2001 From: jiangmengyu Date: Wed, 25 Dec 2024 17:13:23 +0800 Subject: [PATCH 10/91] =?UTF-8?q?CogView3plus=E9=80=82=E9=85=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../cogview3/cogview3plus/models/attention_processor.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/attention_processor.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/attention_processor.py index 1b9ce8616c..1feaa2be1f 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/attention_processor.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/attention_processor.py @@ -67,9 +67,10 @@ class CogVideoXAttnProcessor2_0: key = attn.norm_k(key) _, N, _, D = query.shape - query = F.pad(query, (0, 64 - D)) - key = F.pad(key, (0, 64 - D)) - value = F.pad(value, (0, 64 - D)) + dim = 64 + query = F.pad(query, (0, dim - D)) + key = F.pad(key, (0, dim - D)) + value = F.pad(value, (0, dim - D)) hidden_states = torch_npu.npu_prompt_flash_attention( query, key, -- Gitee From 1d06781e8aa66b106da82124b8848789f816b1df Mon Sep 17 00:00:00 2001 From: jiangmengyu Date: Thu, 26 Dec 2024 09:28:29 +0800 Subject: [PATCH 11/91] =?UTF-8?q?CogView3plus=E9=80=82=E9=85=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../cogview3plus/layers/normalization.py | 24 ++ .../models/attention_processor.py | 376 +++++++++++++++++- .../models/transformer_cogview3plus.py | 5 +- 3 files changed, 399 insertions(+), 6 deletions(-) diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/layers/normalization.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/layers/normalization.py index b2576d26f5..3dd2bba76c 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/layers/normalization.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/layers/normalization.py @@ -133,6 +133,30 @@ class CogView3PlusAdaLayerNormZeroTextImage(nn.Module): return x, gate_msa, shift_mlp, scale_mlp, gate_mlp, context, c_gate_msa, c_shift_mlp, c_scale_mlp, c_gate_mlp +class FP32LayerNorm(nn.LayerNorm): + def forward(self, inputs: torch.Tensor) -> torch.Tensor: + origin_dtype = inputs.dtype + return F.layer_norm( + inputs.float(), + self.normalized_shape, + self.weight.float() if self.weight is not None else None, + self.bias.float() if self.bias is not None else None, + self.eps, + ).to(origin_dtype) + + +class LpNorm(nn.Module): + def __init__(self, p: int = 2, dim: int = -1, eps: 
float = 1e-12): + super().__init__() + + self.p = p + self.dim = dim + self.eps = eps + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + return F.normalize(hidden_states, p=self.p, dim=self.dim, eps=self.eps) + + class AdaLayerNormContinuous(nn.Module): def __init__( self, diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/attention_processor.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/attention_processor.py index 1feaa2be1f..c6879c9898 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/attention_processor.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/attention_processor.py @@ -11,14 +11,384 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - +import inspect from typing import Optional import torch import torch.nn.functional as F -import torch_npu +from torch import nn + +from diffusers.utils import logging +from diffusers.utils.torch_utils import maybe_allow_in_graph + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + + +@maybe_allow_in_graph +class Attention(nn.Module): + r""" + A cross attention layer. + + Parameters: + query_dim (`int`): + The number of channels in the query. + cross_attention_dim (`int`, *optional*): + The number of channels in the encoder_hidden_states. If not given, defaults to `query_dim`. + heads (`int`, *optional*, defaults to 8): + The number of heads to use for multi-head attention. + kv_heads (`int`, *optional*, defaults to `None`): + The number of key and value heads to use for multi-head attention. Defaults to `heads`. If + `kv_heads=heads`, the model will use Multi Head Attention (MHA), if `kv_heads=1` the model will use Multi + Query Attention (MQA) otherwise GQA is used. + dim_head (`int`, *optional*, defaults to 64): + The number of channels in each head. + dropout (`float`, *optional*, defaults to 0.0): + The dropout probability to use. + bias (`bool`, *optional*, defaults to False): + Set to `True` for the query, key, and value linear layers to contain a bias parameter. + upcast_attention (`bool`, *optional*, defaults to False): + Set to `True` to upcast the attention computation to `float32`. + upcast_softmax (`bool`, *optional*, defaults to False): + Set to `True` to upcast the softmax computation to `float32`. + cross_attention_norm (`str`, *optional*, defaults to `None`): + The type of normalization to use for the cross attention. Can be `None`, `layer_norm`, or `group_norm`. + cross_attention_norm_num_groups (`int`, *optional*, defaults to 32): + The number of groups to use for the group norm in the cross attention. + added_kv_proj_dim (`int`, *optional*, defaults to `None`): + The number of channels to use for the added key and value projections. If `None`, no projection is used. + norm_num_groups (`int`, *optional*, defaults to `None`): + The number of groups to use for the group norm in the attention. + spatial_norm_dim (`int`, *optional*, defaults to `None`): + The number of channels to use for the spatial normalization. + out_bias (`bool`, *optional*, defaults to `True`): + Set to `True` to use a bias in the output linear layer. + scale_qk (`bool`, *optional*, defaults to `True`): + Set to `True` to scale the query and key by `1 / sqrt(dim_head)`. 
+ only_cross_attention (`bool`, *optional*, defaults to `False`): + Set to `True` to only use cross attention and not added_kv_proj_dim. Can only be set to `True` if + `added_kv_proj_dim` is not `None`. + eps (`float`, *optional*, defaults to 1e-5): + An additional value added to the denominator in group normalization that is used for numerical stability. + rescale_output_factor (`float`, *optional*, defaults to 1.0): + A factor to rescale the output by dividing it with this value. + residual_connection (`bool`, *optional*, defaults to `False`): + Set to `True` to add the residual connection to the output. + _from_deprecated_attn_block (`bool`, *optional*, defaults to `False`): + Set to `True` if the attention block is loaded from a deprecated state dict. + processor (`AttnProcessor`, *optional*, defaults to `None`): + The attention processor to use. If `None`, defaults to `AttnProcessor2_0` if `torch 2.x` is used and + `AttnProcessor` otherwise. + """ + + def __init__( + self, + query_dim: int, + cross_attention_dim: Optional[int] = None, + heads: int = 8, + kv_heads: Optional[int] = None, + dim_head: int = 64, + dropout: float = 0.0, + bias: bool = False, + upcast_attention: bool = False, + upcast_softmax: bool = False, + cross_attention_norm: Optional[str] = None, + cross_attention_norm_num_groups: int = 32, + qk_norm: Optional[str] = None, + added_kv_proj_dim: Optional[int] = None, + added_proj_bias: Optional[bool] = True, + norm_num_groups: Optional[int] = None, + spatial_norm_dim: Optional[int] = None, + out_bias: bool = True, + scale_qk: bool = True, + only_cross_attention: bool = False, + eps: float = 1e-5, + rescale_output_factor: float = 1.0, + residual_connection: bool = False, + _from_deprecated_attn_block: bool = False, + processor: Optional["AttnProcessor"] = None, + out_dim: int = None, + out_context_dim: int = None, + context_pre_only=None, + pre_only=False, + elementwise_affine: bool = True, + is_causal: bool = False, + ): + super().__init__() + + # To prevent circular import. 
+ from ..layers.normalization import FP32LayerNorm, LpNorm, RMSNorm + + self.inner_dim = out_dim if out_dim is not None else dim_head * heads + self.inner_kv_dim = self.inner_dim if kv_heads is None else dim_head * kv_heads + self.query_dim = query_dim + self.use_bias = bias + self.is_cross_attention = cross_attention_dim is not None + self.cross_attention_dim = cross_attention_dim if cross_attention_dim is not None else query_dim + self.upcast_attention = upcast_attention + self.upcast_softmax = upcast_softmax + self.rescale_output_factor = rescale_output_factor + self.residual_connection = residual_connection + self.dropout = dropout + self.fused_projections = False + self.out_dim = out_dim if out_dim is not None else query_dim + self.out_context_dim = out_context_dim if out_context_dim is not None else query_dim + self.context_pre_only = context_pre_only + self.pre_only = pre_only + self.is_causal = is_causal + + # we make use of this private variable to know whether this class is loaded + # with an deprecated state dict so that we can convert it on the fly + self._from_deprecated_attn_block = _from_deprecated_attn_block + + self.scale_qk = scale_qk + self.scale = dim_head**-0.5 if self.scale_qk else 1.0 + + self.heads = out_dim // dim_head if out_dim is not None else heads + # for slice_size > 0 the attention score computation + # is split across the batch axis to save memory + # You can set slice_size with `set_attention_slice` + self.sliceable_head_dim = heads + + self.added_kv_proj_dim = added_kv_proj_dim + self.only_cross_attention = only_cross_attention + + if self.added_kv_proj_dim is None and self.only_cross_attention: + raise ValueError( + "`only_cross_attention` can only be set to True if `added_kv_proj_dim` is not None. Make sure to set either `only_cross_attention=False` or define `added_kv_proj_dim`." + ) + + if norm_num_groups is not None: + self.group_norm = nn.GroupNorm(num_channels=query_dim, num_groups=norm_num_groups, eps=eps, affine=True) + else: + self.group_norm = None + + self.spatial_norm = None + + if qk_norm is None: + self.norm_q = None + self.norm_k = None + elif qk_norm == "layer_norm": + self.norm_q = nn.LayerNorm(dim_head, eps=eps, elementwise_affine=elementwise_affine) + self.norm_k = nn.LayerNorm(dim_head, eps=eps, elementwise_affine=elementwise_affine) + elif qk_norm == "fp32_layer_norm": + self.norm_q = FP32LayerNorm(dim_head, elementwise_affine=False, bias=False, eps=eps) + self.norm_k = FP32LayerNorm(dim_head, elementwise_affine=False, bias=False, eps=eps) + elif qk_norm == "layer_norm_across_heads": + # Lumina applies qk norm across all heads + self.norm_q = nn.LayerNorm(dim_head * heads, eps=eps) + self.norm_k = nn.LayerNorm(dim_head * kv_heads, eps=eps) + elif qk_norm == "rms_norm": + self.norm_q = RMSNorm(dim_head, eps=eps) + self.norm_k = RMSNorm(dim_head, eps=eps) + elif qk_norm == "rms_norm_across_heads": + # LTX applies qk norm across all heads + self.norm_q = RMSNorm(dim_head * heads, eps=eps) + self.norm_k = RMSNorm(dim_head * kv_heads, eps=eps) + elif qk_norm == "l2": + self.norm_q = LpNorm(p=2, dim=-1, eps=eps) + self.norm_k = LpNorm(p=2, dim=-1, eps=eps) + else: + raise ValueError(f"unknown qk_norm: {qk_norm}. 
Should be None,'layer_norm','fp32_layer_norm','rms_norm'") + + if cross_attention_norm is None: + self.norm_cross = None + elif cross_attention_norm == "layer_norm": + self.norm_cross = nn.LayerNorm(self.cross_attention_dim) + elif cross_attention_norm == "group_norm": + if self.added_kv_proj_dim is not None: + # The given `encoder_hidden_states` are initially of shape + # (batch_size, seq_len, added_kv_proj_dim) before being projected + # to (batch_size, seq_len, cross_attention_dim). The norm is applied + # before the projection, so we need to use `added_kv_proj_dim` as + # the number of channels for the group norm. + norm_cross_num_channels = added_kv_proj_dim + else: + norm_cross_num_channels = self.cross_attention_dim + + self.norm_cross = nn.GroupNorm( + num_channels=norm_cross_num_channels, num_groups=cross_attention_norm_num_groups, eps=1e-5, affine=True + ) + else: + raise ValueError( + f"unknown cross_attention_norm: {cross_attention_norm}. Should be None, 'layer_norm' or 'group_norm'" + ) + + self.to_q = nn.Linear(query_dim, self.inner_dim, bias=bias) + + if not self.only_cross_attention: + # only relevant for the `AddedKVProcessor` classes + self.to_k = nn.Linear(self.cross_attention_dim, self.inner_kv_dim, bias=bias) + self.to_v = nn.Linear(self.cross_attention_dim, self.inner_kv_dim, bias=bias) + else: + self.to_k = None + self.to_v = None + + self.added_proj_bias = added_proj_bias + if self.added_kv_proj_dim is not None: + self.add_k_proj = nn.Linear(added_kv_proj_dim, self.inner_kv_dim, bias=added_proj_bias) + self.add_v_proj = nn.Linear(added_kv_proj_dim, self.inner_kv_dim, bias=added_proj_bias) + if self.context_pre_only is not None: + self.add_q_proj = nn.Linear(added_kv_proj_dim, self.inner_dim, bias=added_proj_bias) + else: + self.add_q_proj = None + self.add_k_proj = None + self.add_v_proj = None + + if not self.pre_only: + self.to_out = nn.ModuleList([]) + self.to_out.append(nn.Linear(self.inner_dim, self.out_dim, bias=out_bias)) + self.to_out.append(nn.Dropout(dropout)) + else: + self.to_out = None + + if self.context_pre_only is not None and not self.context_pre_only: + self.to_add_out = nn.Linear(self.inner_dim, self.out_context_dim, bias=out_bias) + else: + self.to_add_out = None + + if qk_norm is not None and added_kv_proj_dim is not None: + if qk_norm == "fp32_layer_norm": + self.norm_added_q = FP32LayerNorm(dim_head, elementwise_affine=False, bias=False, eps=eps) + self.norm_added_k = FP32LayerNorm(dim_head, elementwise_affine=False, bias=False, eps=eps) + elif qk_norm == "rms_norm": + self.norm_added_q = RMSNorm(dim_head, eps=eps) + self.norm_added_k = RMSNorm(dim_head, eps=eps) + else: + raise ValueError( + f"unknown qk_norm: {qk_norm}. Should be one of `None,'layer_norm','fp32_layer_norm','rms_norm'`" + ) + else: + self.norm_added_q = None + self.norm_added_k = None + + self.set_processor(processor) + + def set_processor(self, processor: "AttnProcessor") -> None: + r""" + Set the attention processor to use. + + Args: + processor (`AttnProcessor`): + The attention processor to use. 
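+
+        Example (a minimal sketch; the dimensions are illustrative, and `CogVideoXAttnProcessor2_0` is the
+        NPU attention processor defined alongside this class):
+
+        ```py
+        attn = Attention(query_dim=2560, heads=64, dim_head=40, qk_norm="layer_norm")
+        attn.set_processor(CogVideoXAttnProcessor2_0())
+        ```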
+ """ + # if current processor is in `self._modules` and if passed `processor` is not, we need to + # pop `processor` from `self._modules` + if ( + hasattr(self, "processor") + and isinstance(self.processor, torch.nn.Module) + and not isinstance(processor, torch.nn.Module) + ): + logger.info(f"You are removing possibly trained weights of {self.processor} with {processor}") + self._modules.pop("processor") + + self.processor = processor + + def get_processor(self, return_deprecated_lora: bool = False) -> "AttentionProcessor": + r""" + Get the attention processor in use. + + Args: + return_deprecated_lora (`bool`, *optional*, defaults to `False`): + Set to `True` to return the deprecated LoRA attention processor. + + Returns: + "AttentionProcessor": The attention processor in use. + """ + if not return_deprecated_lora: + return self.processor + + def forward( + self, + hidden_states: torch.Tensor, + encoder_hidden_states: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + **cross_attention_kwargs, + ) -> torch.Tensor: + r""" + The forward method of the `Attention` class. + + Args: + hidden_states (`torch.Tensor`): + The hidden states of the query. + encoder_hidden_states (`torch.Tensor`, *optional*): + The hidden states of the encoder. + attention_mask (`torch.Tensor`, *optional*): + The attention mask to use. If `None`, no mask is applied. + **cross_attention_kwargs: + Additional keyword arguments to pass along to the cross attention. + + Returns: + `torch.Tensor`: The output of the attention layer. + """ + # The `Attention` class can call different attention processors / attention functions + # here we simply pass along all tensors to the selected processor class + # For standard processors that are defined here, `**cross_attention_kwargs` is empty + + attn_parameters = set(inspect.signature(self.processor.__call__).parameters.keys()) + quiet_attn_parameters = {"ip_adapter_masks", "ip_hidden_states"} + unused_kwargs = [ + k for k, _ in cross_attention_kwargs.items() if k not in attn_parameters and k not in quiet_attn_parameters + ] + if len(unused_kwargs) > 0: + logger.warning( + f"cross_attention_kwargs {unused_kwargs} are not expected by {self.processor.__class__.__name__} and will be ignored." + ) + cross_attention_kwargs = {k: w for k, w in cross_attention_kwargs.items() if k in attn_parameters} + + return self.processor( + self, + hidden_states, + encoder_hidden_states=encoder_hidden_states, + attention_mask=attention_mask, + **cross_attention_kwargs, + ) + + def prepare_attention_mask( + self, attention_mask: torch.Tensor, target_length: int, batch_size: int, out_dim: int = 3 + ) -> torch.Tensor: + r""" + Prepare the attention mask for the attention computation. + + Args: + attention_mask (`torch.Tensor`): + The attention mask to prepare. + target_length (`int`): + The target length of the attention mask. This is the length of the attention mask after padding. + batch_size (`int`): + The batch size, which is used to repeat the attention mask. + out_dim (`int`, *optional*, defaults to `3`): + The output dimension of the attention mask. Can be either `3` or `4`. + + Returns: + `torch.Tensor`: The prepared attention mask. + """ + head_size = self.heads + if attention_mask is None: + return attention_mask + + current_length: int = attention_mask.shape[-1] + if current_length != target_length: + if attention_mask.device.type == "mps": + # HACK: MPS: Does not support padding by greater than dimension of input tensor. 
+ # Instead, we can manually construct the padding tensor. + padding_shape = (attention_mask.shape[0], attention_mask.shape[1], target_length) + padding = torch.zeros(padding_shape, dtype=attention_mask.dtype, device=attention_mask.device) + attention_mask = torch.cat([attention_mask, padding], dim=2) + else: + # TODO: for pipelines such as stable-diffusion, padding cross-attn mask: + # we want to instead pad by (0, remaining_length), where remaining_length is: + # remaining_length: int = target_length - current_length + # TODO: re-enable tests/models/test_models_unet_2d_condition.py#test_model_xattn_padding + attention_mask = F.pad(attention_mask, (0, target_length), value=0.0) + + if out_dim == 3: + if attention_mask.shape[0] < batch_size * head_size: + attention_mask = attention_mask.repeat_interleave(head_size, dim=0) + elif out_dim == 4: + attention_mask = attention_mask.unsqueeze(1) + attention_mask = attention_mask.repeat_interleave(head_size, dim=1) -from diffusers.models.attention_processor import Attention + return attention_mask class CogVideoXAttnProcessor2_0: diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/transformer_cogview3plus.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/transformer_cogview3plus.py index 9a343d4c86..ec773e4af9 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/transformer_cogview3plus.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/transformer_cogview3plus.py @@ -19,13 +19,13 @@ import torch import torch.nn as nn from diffusers.configuration_utils import ConfigMixin, register_to_config -from diffusers.models.attention_processor import Attention, AttentionProcessor +from diffusers.models.attention_processor import AttentionProcessor from diffusers.utils import is_torch_version, logging from diffusers.models.modeling_outputs import Transformer2DModelOutput from .modeling_utils import ModelMixin from .attention import FeedForward -from .attention_processor import CogVideoXAttnProcessor2_0 +from .attention_processor import CogVideoXAttnProcessor2_0, Attention from ..layers import CogView3PlusAdaLayerNormZeroTextImage, AdaLayerNormContinuous from ..layers import CogView3CombinedTimestepSizeEmbeddings, CogView3PlusPatchEmbed @@ -177,7 +177,6 @@ class CogView3PlusTransformer2DModel(ModelMixin, ConfigMixin): time_embed_dim: int = 512, condition_dim: int = 256, pos_embed_max_size: int = 128, - sample_size: int = 128, ): super().__init__() self.out_channels = out_channels -- Gitee From 7c95104fe3b385f0035097eb3450c3fe837f80b6 Mon Sep 17 00:00:00 2001 From: jiangmengyu Date: Thu, 26 Dec 2024 09:31:00 +0800 Subject: [PATCH 12/91] =?UTF-8?q?CogView3plus=E9=80=82=E9=85=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../foundation/cogview3/cogview3plus/vae/autoencoder_kl.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/vae/autoencoder_kl.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/vae/autoencoder_kl.py index fcba50ccae..cea74eb29f 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/vae/autoencoder_kl.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/vae/autoencoder_kl.py @@ -87,11 +87,6 @@ class AutoencoderKL(ModelMixin, ConfigMixin, FromOriginalModelMixin, PeftAdapter latent_channels: int = 4, norm_num_groups: int = 32, sample_size: int = 32, - scaling_factor: 
float = 0.18215, - shift_factor: Optional[float] = None, - latents_mean: Optional[Tuple[float]] = None, - latents_std: Optional[Tuple[float]] = None, - force_upcast: float = True, use_quant_conv: bool = True, use_post_quant_conv: bool = True, mid_block_add_attention: bool = True, -- Gitee From f4e1a62fc4ed070086bf0303bd82653f1f05792a Mon Sep 17 00:00:00 2001 From: jiangmengyu Date: Thu, 26 Dec 2024 09:32:17 +0800 Subject: [PATCH 13/91] =?UTF-8?q?CogView3plus=E9=80=82=E9=85=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../cogview3/cogview3plus/models/attention_processor.py | 1 + 1 file changed, 1 insertion(+) diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/attention_processor.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/attention_processor.py index c6879c9898..b7cdf97abb 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/attention_processor.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/attention_processor.py @@ -17,6 +17,7 @@ from typing import Optional import torch import torch.nn.functional as F from torch import nn +import torch_npu from diffusers.utils import logging from diffusers.utils.torch_utils import maybe_allow_in_graph -- Gitee From c756489d04d84d263e0792ce2e777ff499df944e Mon Sep 17 00:00:00 2001 From: jiangmengyu Date: Thu, 26 Dec 2024 09:58:49 +0800 Subject: [PATCH 14/91] =?UTF-8?q?CogView3plus=E9=80=82=E9=85=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../cogview3/cogview3plus/layers/__init__.py | 3 +- .../cogview3/cogview3plus/layers/linear.py | 95 +++++++++++++++++++ .../models/attention_processor.py | 19 +--- .../cogview3plus/models/modeling_utils.py | 3 + 4 files changed, 105 insertions(+), 15 deletions(-) create mode 100644 MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/layers/linear.py diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/layers/__init__.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/layers/__init__.py index c3e7c569e2..09760b9fd0 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/layers/__init__.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/layers/__init__.py @@ -1,2 +1,3 @@ from .normalization import CogView3PlusAdaLayerNormZeroTextImage, AdaLayerNormContinuous -from .embeddings import CogView3CombinedTimestepSizeEmbeddings, CogView3PlusPatchEmbed \ No newline at end of file +from .embeddings import CogView3CombinedTimestepSizeEmbeddings, CogView3PlusPatchEmbed +from .linear import QKVLinear \ No newline at end of file diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/layers/linear.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/layers/linear.py new file mode 100644 index 0000000000..805c2d2b34 --- /dev/null +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/layers/linear.py @@ -0,0 +1,95 @@ +#!/usr/bin/env python +# coding=utf-8 +# Copyright 2024 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import torch +import torch.nn as nn +import torch_npu + + +class QKVLinear(nn.Module): + def __init__(self, attention_dim, hidden_size, qkv_bias=True, cross_attention_dim=None, device=None, dtype=None): + super(QKVLinear, self).__init__() + self.attention_dim = attention_dim + self.hidden_size = hidden_size + self.cross_attention_dim = cross_attention_dim + self.qkv_bias = qkv_bias + + factory_kwargs = {"device": device, "dtype": dtype} + + if not cross_attention_dim: + self.weight = nn.Parameter(torch.empty([self.attention_dim, 3 * self.hidden_size], **factory_kwargs)) + if self.qkv_bias: + self.bias = nn.Parameter(torch.empty([3 * self.hidden_size], **factory_kwargs)) + else: + self.q_weight = nn.Parameter(torch.empty([self.attention_dim, self.hidden_size], **factory_kwargs)) + self.kv_weight = nn.Parameter(torch.empty([self.attention_dim, 2 * self.hidden_size], **factory_kwargs)) + + if self.qkv_bias: + self.q_bias = nn.Parameter(torch.empty([self.hidden_size], **factory_kwargs)) + self.kv_bias = nn.Parameter(torch.empty([2 * self.hidden_size], **factory_kwargs)) + + + def forward(self, hidden_states, encoder_hidden_states=None): + + if self.cross_attention_dim is None: + if not self.qkv_bias: + qkv = torch.matmul(hidden_states, self.weight) + else: + qkv = torch.addmm( + self.bias, + hidden_states.view(hidden_states.size(0) * hidden_states.size(1), hidden_states.size(2)), + self.weight, + beta=1, + alpha=1 + ) + + batch, seqlen, _ = hidden_states.shape + qkv_shape = (batch, seqlen, 3, -1) + qkv = qkv.view(qkv_shape) + q, k, v = qkv.unbind(2) + + else: + if not self.qkv_bias: + q = torch.matmul(hidden_states, self.q_weight) + kv = torch.matmul(encoder_hidden_states, self.kv_weight) + else: + q = torch.addmm( + self.q_bias, + hidden_states.view(hidden_states.size(0) * hidden_states.size(1), hidden_states.size(2)), + self.q_weight, + beta=1, + alpha=1 + ) + kv = torch.addmm( + self.kv_bias, + encoder_hidden_states.view( + encoder_hidden_states.size(0) * encoder_hidden_states.size(1), + encoder_hidden_states.size(2)), + self.kv_weight, + beta=1, + alpha=1 + ) + + batch, seqlen, _ = encoder_hidden_states.shape + kv_shape = (batch, seqlen, 2, -1) + + kv = kv.view(kv_shape) + k, v = kv.unbind(2) + + q = q.view(hidden_states.shape) + + return q, k, v \ No newline at end of file diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/attention_processor.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/attention_processor.py index b7cdf97abb..77863c2275 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/attention_processor.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/attention_processor.py @@ -22,6 +22,8 @@ import torch_npu from diffusers.utils import logging from diffusers.utils.torch_utils import maybe_allow_in_graph +from ..layers import QKVLinear + logger = logging.get_logger(__name__) # pylint: disable=invalid-name @@ -98,7 +100,6 @@ class Attention(nn.Module): added_kv_proj_dim: Optional[int] = None, added_proj_bias: Optional[bool] = True, norm_num_groups: 
Optional[int] = None, - spatial_norm_dim: Optional[int] = None, out_bias: bool = True, scale_qk: bool = True, only_cross_attention: bool = False, @@ -213,16 +214,8 @@ class Attention(nn.Module): raise ValueError( f"unknown cross_attention_norm: {cross_attention_norm}. Should be None, 'layer_norm' or 'group_norm'" ) - - self.to_q = nn.Linear(query_dim, self.inner_dim, bias=bias) - - if not self.only_cross_attention: - # only relevant for the `AddedKVProcessor` classes - self.to_k = nn.Linear(self.cross_attention_dim, self.inner_kv_dim, bias=bias) - self.to_v = nn.Linear(self.cross_attention_dim, self.inner_kv_dim, bias=bias) - else: - self.to_k = None - self.to_v = None + + self.to_qkv = QKVLinear(self.inner_dim, query_dim, qkv_bias=bias) self.added_proj_bias = added_proj_bias if self.added_kv_proj_dim is not None: @@ -421,9 +414,7 @@ class CogVideoXAttnProcessor2_0: attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size) attention_mask = attention_mask.view(batch_size, attn.heads, -1, attention_mask.shape[-1]) - query = attn.to_q(hidden_states) - key = attn.to_k(hidden_states) - value = attn.to_v(hidden_states) + query, key, value = attn.to_qkv(hidden_states) inner_dim = key.shape[-1] head_dim = inner_dim // attn.heads diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/modeling_utils.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/modeling_utils.py index a0740b8c67..56f94f0c0d 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/modeling_utils.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/modeling_utils.py @@ -920,6 +920,9 @@ class ModelMixin(torch.nn.Module, PushToHubMixin): ) else: + print(model.state_dict()) + exit() + weights_path = index_file with open(index_file) as f: index = json.loads(f.read()) -- Gitee From 7e0d57a85c40a970034be3b1243e2af0a38fe544 Mon Sep 17 00:00:00 2001 From: jiangmengyu Date: Thu, 26 Dec 2024 11:03:23 +0800 Subject: [PATCH 15/91] =?UTF-8?q?CogView3plus=E9=80=82=E9=85=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../cogview3plus/models/modeling_utils.py | 3 - .../models/transformer_cogview3plus.py | 59 ++++++++++++++++++- 2 files changed, 58 insertions(+), 4 deletions(-) diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/modeling_utils.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/modeling_utils.py index 56f94f0c0d..a0740b8c67 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/modeling_utils.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/modeling_utils.py @@ -920,9 +920,6 @@ class ModelMixin(torch.nn.Module, PushToHubMixin): ) else: - print(model.state_dict()) - exit() - weights_path = index_file with open(index_file) as f: index = json.loads(f.read()) diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/transformer_cogview3plus.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/transformer_cogview3plus.py index ec773e4af9..0846744ef7 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/transformer_cogview3plus.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/transformer_cogview3plus.py @@ -181,6 +181,7 @@ class CogView3PlusTransformer2DModel(ModelMixin, ConfigMixin): super().__init__() self.out_channels = out_channels 
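
The `QKVLinear` layer added above replaces the separate `to_q`/`to_k`/`to_v` projections with a single fused projection for the self-attention path. A minimal stand-alone sketch of the same idea in plain `nn.Linear` terms (dimensions are illustrative; this is not the patched class itself):

```python
import torch
import torch.nn as nn

class FusedQKV(nn.Module):
    """Toy fused QKV projection: one matmul, then split the result into q, k, v."""

    def __init__(self, dim: int, bias: bool = True):
        super().__init__()
        self.proj = nn.Linear(dim, 3 * dim, bias=bias)

    def forward(self, x: torch.Tensor):
        # x: (batch, seq_len, dim) -> qkv: (batch, seq_len, 3 * dim)
        qkv = self.proj(x)
        q, k, v = qkv.chunk(3, dim=-1)  # three (batch, seq_len, dim) tensors
        return q, k, v

x = torch.randn(2, 16, 64)
q, k, v = FusedQKV(64)(x)
print(q.shape, k.shape, v.shape)  # torch.Size([2, 16, 64]) each
```

One matmul over a `(dim, 3*dim)` weight generally keeps the accelerator busier than three smaller projections, which is the motivation for the fused layout.
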
self.inner_dim = num_attention_heads * attention_head_dim + self.num_layers = num_layers # CogView3 uses 3 additional SDXL-like conditions - original_size, target_size, crop_coords # Each of these are sincos embeddings of shape 2 * condition_dim @@ -223,6 +224,13 @@ class CogView3PlusTransformer2DModel(ModelMixin, ConfigMixin): self.gradient_checkpointing = False + self.q_weight_cache = None + self.q_bias_cache = None + self.k_weight_cache = None + self.k_bias_cache = None + self.v_weight_cache = None + self.v_bias_cache = None + @property # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.attn_processors def attn_processors(self) -> Dict[str, AttentionProcessor]: @@ -379,4 +387,53 @@ class CogView3PlusTransformer2DModel(ModelMixin, ConfigMixin): if not return_dict: return (output,) - return Transformer2DModelOutput(sample=output) \ No newline at end of file + return Transformer2DModelOutput(sample=output) + + def load_weights(self, state_dict, shard=False): + with torch.no_grad(): + if not shard: + self.load_state_dict(state_dict) + return {} + else: + weights = state_dict + + for i in range(self.num_layers): + if i != 26: + q_weight = weights.pop(f"transformer_blocks.{i}.attn1.to_q.weight", None) + q_bias = weights.pop(f"transformer_blocks.{i}.attn1.to_q.bias", None) + k_weight = weights.pop(f"transformer_blocks.{i}.attn1.to_k.weight", None) + k_bias = weights.pop(f"transformer_blocks.{i}.attn1.to_k.bias", None) + v_weight = weights.pop(f"transformer_blocks.{i}.attn1.to_v.weight", None) + v_bias = weights.pop(f"transformer_blocks.{i}.attn1.to_v.bias", None) + + # query, key, value的weight和bias权重存在同一个文件中,不会分开存储。 + if q_weight and k_weight and v_weight: + qkv_weight = torch.cat([q_weight, k_weight, v_weight], dim=0).transpose(0, 1).contiguous() + qkv_bias = torch.cat([q_bias, k_bias, v_bias], dim=0).contiguous() + weights[f"transformer_blocks.{i}.attn1.to_qkv.weight"] = qkv_weight + weights[f"transformer_blocks.{i}.attn1.to_qkv.bias"] = qkv_bias + else: + if self.q_weight_cache is None: + self.q_weight_cache = weights.pop(f"transformer_blocks.{i}.attn1.to_q.weight", None) + if self.q_bias_cache is None: + self.q_bias_cache = weights.pop(f"transformer_blocks.{i}.attn1.to_q.bias", None) + if self.k_weight_cache is None: + self.k_weight_cache = weights.pop(f"transformer_blocks.{i}.attn1.to_k.weight", None) + if self.k_bias_cache is None: + self.k_bias_cache = weights.pop(f"transformer_blocks.{i}.attn1.to_k.bias", None) + if self.v_weight_cache is None: + self.v_weight_cache = weights.pop(f"transformer_blocks.{i}.attn1.to_v.weight", None) + if self.v_bias_cache is None: + self.v_bias_cache = weights.pop(f"transformer_blocks.{i}.attn1.to_v.bias", None) + + if self.q_weight_cache and self.k_weight_cache and self.v_weight_cache: + qkv_weight = torch.cat( + [self.q_weight_cache, self.k_weight_cache, self.v_weight_cache], + dim=0 + ).transpose(0, 1).contiguous() + qkv_bias = torch.cat([self.q_bias_cache, self.k_bias_cache, self.v_bias_cache], dim=0).contiguous() + weights[f"transformer_blocks.{i}.attn1.to_qkv.weight"] = qkv_weight + weights[f"transformer_blocks.{i}.attn1.to_qkv.bias"] = qkv_bias + + self.load_state_dict(weights, strict=False, assign=True) + return state_dict.keys() \ No newline at end of file -- Gitee From 7765d34253ca81c1424c4a1059e381838e78b837 Mon Sep 17 00:00:00 2001 From: jiangmengyu Date: Thu, 26 Dec 2024 11:12:31 +0800 Subject: [PATCH 16/91] =?UTF-8?q?CogView3plus=E9=80=82=E9=85=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 
Content-Transfer-Encoding: 8bit --- .../cogview3/cogview3plus/models/transformer_cogview3plus.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/transformer_cogview3plus.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/transformer_cogview3plus.py index 0846744ef7..8c2086d644 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/transformer_cogview3plus.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/transformer_cogview3plus.py @@ -407,7 +407,7 @@ class CogView3PlusTransformer2DModel(ModelMixin, ConfigMixin): v_bias = weights.pop(f"transformer_blocks.{i}.attn1.to_v.bias", None) # query, key, value的weight和bias权重存在同一个文件中,不会分开存储。 - if q_weight and k_weight and v_weight: + if q_weight is not None and k_weight is not None and v_weight is not None: qkv_weight = torch.cat([q_weight, k_weight, v_weight], dim=0).transpose(0, 1).contiguous() qkv_bias = torch.cat([q_bias, k_bias, v_bias], dim=0).contiguous() weights[f"transformer_blocks.{i}.attn1.to_qkv.weight"] = qkv_weight @@ -426,7 +426,8 @@ class CogView3PlusTransformer2DModel(ModelMixin, ConfigMixin): if self.v_bias_cache is None: self.v_bias_cache = weights.pop(f"transformer_blocks.{i}.attn1.to_v.bias", None) - if self.q_weight_cache and self.k_weight_cache and self.v_weight_cache: + qk_weight_cache = self.q_weight_cache is not None and self.k_weight_cache is not None + if qk_weight_cache and self.v_weight_cache is not None: qkv_weight = torch.cat( [self.q_weight_cache, self.k_weight_cache, self.v_weight_cache], dim=0 -- Gitee From f00c0fbe2012c8577e90feced506e76f9face611 Mon Sep 17 00:00:00 2001 From: jiangmengyu Date: Thu, 26 Dec 2024 11:29:58 +0800 Subject: [PATCH 17/91] =?UTF-8?q?CogView3plus=E9=80=82=E9=85=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../cogview3plus/models/transformer_cogview3plus.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/transformer_cogview3plus.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/transformer_cogview3plus.py index 8c2086d644..fe16cafe50 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/transformer_cogview3plus.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/transformer_cogview3plus.py @@ -433,8 +433,8 @@ class CogView3PlusTransformer2DModel(ModelMixin, ConfigMixin): dim=0 ).transpose(0, 1).contiguous() qkv_bias = torch.cat([self.q_bias_cache, self.k_bias_cache, self.v_bias_cache], dim=0).contiguous() - weights[f"transformer_blocks.{i}.attn1.to_qkv.weight"] = qkv_weight - weights[f"transformer_blocks.{i}.attn1.to_qkv.bias"] = qkv_bias + weights[f"transformer_blocks.26.attn1.to_qkv.weight"] = qkv_weight + weights[f"transformer_blocks.26.attn1.to_qkv.bias"] = qkv_bias self.load_state_dict(weights, strict=False, assign=True) - return state_dict.keys() \ No newline at end of file + return weights.keys() \ No newline at end of file -- Gitee From 7258f2dcbf35958be84e39f0fefb067c248474cb Mon Sep 17 00:00:00 2001 From: jiangmengyu Date: Thu, 26 Dec 2024 14:16:10 +0800 Subject: [PATCH 18/91] =?UTF-8?q?CogView3plus=E9=80=82=E9=85=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../cogview3plus/models/activations.py | 8 -- .../models/attention_processor.py | 83 
++++--------- .../models/transformer_cogview3plus.py | 83 ------------- .../pipeline/pipeline_cogview3plus.py | 111 ------------------ .../schedulers/scheduling_ddim_cogvideox.py | 95 --------------- .../schedulers/scheduling_dpm_cogvideox.py | 2 - .../schedulers/scheduling_utils.py | 60 ---------- .../cogview3plus/vae/autoencoder_kl.py | 53 +-------- 8 files changed, 21 insertions(+), 474 deletions(-) diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/activations.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/activations.py index cb1c29919e..48fe8ed17d 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/activations.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/activations.py @@ -92,8 +92,6 @@ class GELU(nn.Module): class GEGLU(nn.Module): r""" - A [variant](https://arxiv.org/abs/2002.05202) of the gated linear unit activation function. - Parameters: dim_in (`int`): The number of channels in the input. dim_out (`int`): The number of channels in the output. @@ -125,9 +123,6 @@ class GEGLU(nn.Module): class SwiGLU(nn.Module): r""" - A [variant](https://arxiv.org/abs/2002.05202) of the gated linear unit activation function. It's similar to `GEGLU` - but uses SiLU / Swish instead of GeLU. - Parameters: dim_in (`int`): The number of channels in the input. dim_out (`int`): The number of channels in the output. @@ -148,9 +143,6 @@ class SwiGLU(nn.Module): class ApproximateGELU(nn.Module): r""" - The approximate form of the Gaussian Error Linear Unit (GELU). For more details, see section 2 of this - [paper](https://arxiv.org/abs/1606.08415). - Parameters: dim_in (`int`): The number of channels in the input. dim_out (`int`): The number of channels in the output. diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/attention_processor.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/attention_processor.py index 77863c2275..de7a2a130f 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/attention_processor.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/attention_processor.py @@ -22,67 +22,11 @@ import torch_npu from diffusers.utils import logging from diffusers.utils.torch_utils import maybe_allow_in_graph -from ..layers import QKVLinear - logger = logging.get_logger(__name__) # pylint: disable=invalid-name @maybe_allow_in_graph class Attention(nn.Module): - r""" - A cross attention layer. - - Parameters: - query_dim (`int`): - The number of channels in the query. - cross_attention_dim (`int`, *optional*): - The number of channels in the encoder_hidden_states. If not given, defaults to `query_dim`. - heads (`int`, *optional*, defaults to 8): - The number of heads to use for multi-head attention. - kv_heads (`int`, *optional*, defaults to `None`): - The number of key and value heads to use for multi-head attention. Defaults to `heads`. If - `kv_heads=heads`, the model will use Multi Head Attention (MHA), if `kv_heads=1` the model will use Multi - Query Attention (MQA) otherwise GQA is used. - dim_head (`int`, *optional*, defaults to 64): - The number of channels in each head. - dropout (`float`, *optional*, defaults to 0.0): - The dropout probability to use. - bias (`bool`, *optional*, defaults to False): - Set to `True` for the query, key, and value linear layers to contain a bias parameter. 
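
The activation docstrings trimmed above describe GEGLU and SwiGLU, the gated feed-forward variants from the GLU-variants line of work. For reference, a minimal sketch of the GEGLU computation (dimensions illustrative, independent of the patched module):

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

class TinyGEGLU(nn.Module):
    """Gated GELU: project to 2 * dim_out, gate one half with GELU of the other."""

    def __init__(self, dim_in: int, dim_out: int):
        super().__init__()
        self.proj = nn.Linear(dim_in, 2 * dim_out)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        hidden, gate = self.proj(x).chunk(2, dim=-1)
        return hidden * F.gelu(gate)

y = TinyGEGLU(64, 128)(torch.randn(2, 10, 64))
print(y.shape)  # torch.Size([2, 10, 128])
```

SwiGLU has the same shape with `F.silu` in place of `F.gelu`.
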
- upcast_attention (`bool`, *optional*, defaults to False): - Set to `True` to upcast the attention computation to `float32`. - upcast_softmax (`bool`, *optional*, defaults to False): - Set to `True` to upcast the softmax computation to `float32`. - cross_attention_norm (`str`, *optional*, defaults to `None`): - The type of normalization to use for the cross attention. Can be `None`, `layer_norm`, or `group_norm`. - cross_attention_norm_num_groups (`int`, *optional*, defaults to 32): - The number of groups to use for the group norm in the cross attention. - added_kv_proj_dim (`int`, *optional*, defaults to `None`): - The number of channels to use for the added key and value projections. If `None`, no projection is used. - norm_num_groups (`int`, *optional*, defaults to `None`): - The number of groups to use for the group norm in the attention. - spatial_norm_dim (`int`, *optional*, defaults to `None`): - The number of channels to use for the spatial normalization. - out_bias (`bool`, *optional*, defaults to `True`): - Set to `True` to use a bias in the output linear layer. - scale_qk (`bool`, *optional*, defaults to `True`): - Set to `True` to scale the query and key by `1 / sqrt(dim_head)`. - only_cross_attention (`bool`, *optional*, defaults to `False`): - Set to `True` to only use cross attention and not added_kv_proj_dim. Can only be set to `True` if - `added_kv_proj_dim` is not `None`. - eps (`float`, *optional*, defaults to 1e-5): - An additional value added to the denominator in group normalization that is used for numerical stability. - rescale_output_factor (`float`, *optional*, defaults to 1.0): - A factor to rescale the output by dividing it with this value. - residual_connection (`bool`, *optional*, defaults to `False`): - Set to `True` to add the residual connection to the output. - _from_deprecated_attn_block (`bool`, *optional*, defaults to `False`): - Set to `True` if the attention block is loaded from a deprecated state dict. - processor (`AttnProcessor`, *optional*, defaults to `None`): - The attention processor to use. If `None`, defaults to `AttnProcessor2_0` if `torch 2.x` is used and - `AttnProcessor` otherwise. - """ - def __init__( self, query_dim: int, @@ -215,7 +159,15 @@ class Attention(nn.Module): f"unknown cross_attention_norm: {cross_attention_norm}. 
Should be None, 'layer_norm' or 'group_norm'" ) - self.to_qkv = QKVLinear(self.inner_dim, query_dim, qkv_bias=bias) + self.to_q = nn.Linear(query_dim, self.inner_dim, bias=bias) + + if not self.only_cross_attention: + # only relevant for the `AddedKVProcessor` classes + self.to_k = nn.Linear(self.cross_attention_dim, self.inner_kv_dim, bias=bias) + self.to_v = nn.Linear(self.cross_attention_dim, self.inner_kv_dim, bias=bias) + else: + self.to_k = None + self.to_v = None self.added_proj_bias = added_proj_bias if self.added_kv_proj_dim is not None: @@ -414,7 +366,9 @@ class CogVideoXAttnProcessor2_0: attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size) attention_mask = attention_mask.view(batch_size, attn.heads, -1, attention_mask.shape[-1]) - query, key, value = attn.to_qkv(hidden_states) + query = attn.to_q(hidden_states) + key = attn.to_k(hidden_states) + value = attn.to_v(hidden_states) inner_dim = key.shape[-1] head_dim = inner_dim // attn.heads @@ -428,11 +382,14 @@ class CogVideoXAttnProcessor2_0: if attn.norm_k is not None: key = attn.norm_k(key) - _, N, _, D = query.shape - dim = 64 - query = F.pad(query, (0, dim - D)) - key = F.pad(key, (0, dim - D)) - value = F.pad(value, (0, dim - D)) + B, N, S, D = query.shape + dim = 48 + pad_shape = [B, N, S, D] + pad_shape[-1] = dim - pad_shape[-1] + pad = torch.zeros(pad_shape, dtype=query.dtype, device=query.device) + query = torch.cat([query, pad], dim=-1) + key = torch.cat([key, pad], dim=-1) + value = torch.cat([value, pad], dim=-1) hidden_states = torch_npu.npu_prompt_flash_attention( query, key, diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/transformer_cogview3plus.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/transformer_cogview3plus.py index fe16cafe50..f13d71880b 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/transformer_cogview3plus.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/transformer_cogview3plus.py @@ -35,8 +35,6 @@ logger = logging.get_logger(__name__) # pylint: disable=invalid-name class CogView3PlusTransformerBlock(nn.Module): r""" - Transformer block used in [CogView](https://github.com/THUDM/CogView3) model. - Args: dim (`int`): The number of channels in the input and output. @@ -129,9 +127,6 @@ class CogView3PlusTransformerBlock(nn.Module): class CogView3PlusTransformer2DModel(ModelMixin, ConfigMixin): r""" - The Transformer model introduced in [CogView3: Finer and Faster Text-to-Image Generation via Relay - Diffusion](https://huggingface.co/papers/2403.05121). - Args: patch_size (`int`, defaults to `2`): The size of the patches to use in the patch embedding layer. @@ -305,34 +300,6 @@ class CogView3PlusTransformer2DModel(ModelMixin, ConfigMixin): crop_coords: torch.Tensor, return_dict: bool = True, ) -> Union[torch.Tensor, Transformer2DModelOutput]: - """ - The [`CogView3PlusTransformer2DModel`] forward method. - - Args: - hidden_states (`torch.Tensor`): - Input `hidden_states` of shape `(batch size, channel, height, width)`. - encoder_hidden_states (`torch.Tensor`): - Conditional embeddings (embeddings computed from the input conditions such as prompts) of shape - `(batch_size, sequence_len, text_embed_dim)` - timestep (`torch.LongTensor`): - Used to indicate denoising step. 
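
The hunk above replaces `F.pad` with an explicit zero tensor plus `torch.cat`, padding the per-head dimension up to 48 before the fused attention call. A framework-agnostic sketch of that padding step (plain PyTorch, no NPU ops; the target size of 48 is simply the value used in the patch):

```python
import torch

def pad_head_dim(t: torch.Tensor, target: int) -> torch.Tensor:
    """Zero-pad the last (head) dimension of a (B, N, S, D) tensor up to `target`."""
    d = t.shape[-1]
    if d >= target:
        return t
    pad = torch.zeros(*t.shape[:-1], target - d, dtype=t.dtype, device=t.device)
    return torch.cat([t, pad], dim=-1)

q = torch.randn(1, 8, 128, 40)
print(pad_head_dim(q, 48).shape)  # torch.Size([1, 8, 128, 48])
```
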
- original_size (`torch.Tensor`): - CogView3 uses SDXL-like micro-conditioning for original image size as explained in section 2.2 of - [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). - target_size (`torch.Tensor`): - CogView3 uses SDXL-like micro-conditioning for target image size as explained in section 2.2 of - [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). - crop_coords (`torch.Tensor`): - CogView3 uses SDXL-like micro-conditioning for crop coordinates as explained in section 2.2 of - [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). - return_dict (`bool`, *optional*, defaults to `True`): - Whether or not to return a [`~models.transformer_2d.Transformer2DModelOutput`] instead of a plain - tuple. - - Returns: - `torch.Tensor` or [`~models.transformer_2d.Transformer2DModelOutput`]: - The denoised latents using provided inputs as conditioning. - """ height, width = hidden_states.shape[-2:] text_seq_length = encoder_hidden_states.shape[1] @@ -388,53 +355,3 @@ class CogView3PlusTransformer2DModel(ModelMixin, ConfigMixin): return (output,) return Transformer2DModelOutput(sample=output) - - def load_weights(self, state_dict, shard=False): - with torch.no_grad(): - if not shard: - self.load_state_dict(state_dict) - return {} - else: - weights = state_dict - - for i in range(self.num_layers): - if i != 26: - q_weight = weights.pop(f"transformer_blocks.{i}.attn1.to_q.weight", None) - q_bias = weights.pop(f"transformer_blocks.{i}.attn1.to_q.bias", None) - k_weight = weights.pop(f"transformer_blocks.{i}.attn1.to_k.weight", None) - k_bias = weights.pop(f"transformer_blocks.{i}.attn1.to_k.bias", None) - v_weight = weights.pop(f"transformer_blocks.{i}.attn1.to_v.weight", None) - v_bias = weights.pop(f"transformer_blocks.{i}.attn1.to_v.bias", None) - - # query, key, value的weight和bias权重存在同一个文件中,不会分开存储。 - if q_weight is not None and k_weight is not None and v_weight is not None: - qkv_weight = torch.cat([q_weight, k_weight, v_weight], dim=0).transpose(0, 1).contiguous() - qkv_bias = torch.cat([q_bias, k_bias, v_bias], dim=0).contiguous() - weights[f"transformer_blocks.{i}.attn1.to_qkv.weight"] = qkv_weight - weights[f"transformer_blocks.{i}.attn1.to_qkv.bias"] = qkv_bias - else: - if self.q_weight_cache is None: - self.q_weight_cache = weights.pop(f"transformer_blocks.{i}.attn1.to_q.weight", None) - if self.q_bias_cache is None: - self.q_bias_cache = weights.pop(f"transformer_blocks.{i}.attn1.to_q.bias", None) - if self.k_weight_cache is None: - self.k_weight_cache = weights.pop(f"transformer_blocks.{i}.attn1.to_k.weight", None) - if self.k_bias_cache is None: - self.k_bias_cache = weights.pop(f"transformer_blocks.{i}.attn1.to_k.bias", None) - if self.v_weight_cache is None: - self.v_weight_cache = weights.pop(f"transformer_blocks.{i}.attn1.to_v.weight", None) - if self.v_bias_cache is None: - self.v_bias_cache = weights.pop(f"transformer_blocks.{i}.attn1.to_v.bias", None) - - qk_weight_cache = self.q_weight_cache is not None and self.k_weight_cache is not None - if qk_weight_cache and self.v_weight_cache is not None: - qkv_weight = torch.cat( - [self.q_weight_cache, self.k_weight_cache, self.v_weight_cache], - dim=0 - ).transpose(0, 1).contiguous() - qkv_bias = torch.cat([self.q_bias_cache, self.k_bias_cache, self.v_bias_cache], dim=0).contiguous() - weights[f"transformer_blocks.26.attn1.to_qkv.weight"] = qkv_weight - weights[f"transformer_blocks.26.attn1.to_qkv.bias"] = qkv_bias - - 
self.load_state_dict(weights, strict=False, assign=True) - return weights.keys() \ No newline at end of file diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py index a78f82a9b2..91559134c6 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py @@ -110,28 +110,6 @@ def retrieve_timesteps( class CogView3PlusPipeline(DiffusionPipeline): - r""" - Pipeline for text-to-image generation using CogView3Plus. - - This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the - library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) - - Args: - vae ([`AutoencoderKL`]): - Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations. - text_encoder ([`T5EncoderModel`]): - Frozen text-encoder. CogView3Plus uses - [T5](https://huggingface.co/docs/transformers/model_doc/t5#transformers.T5EncoderModel); specifically the - [t5-v1_1-xxl](https://huggingface.co/PixArt-alpha/PixArt-alpha/tree/main/t5-v1_1-xxl) variant. - tokenizer (`T5Tokenizer`): - Tokenizer of class - [T5Tokenizer](https://huggingface.co/docs/transformers/model_doc/t5#transformers.T5Tokenizer). - transformer ([`CogView3PlusTransformer2DModel`]): - A text conditioned `CogView3PlusTransformer2DModel` to denoise the encoded image latents. - scheduler ([`SchedulerMixin`]): - A scheduler to be used in combination with `transformer` to denoise the encoded image latents. - """ - _optional_components = [] model_cpu_offload_seq = "text_encoder->transformer->vae" @@ -313,10 +291,6 @@ class CogView3PlusPipeline(DiffusionPipeline): # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs def prepare_extra_step_kwargs(self, generator, eta): - # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature - # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. - # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 - # and should be between [0, 1] accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) extra_step_kwargs = {} @@ -385,9 +359,6 @@ class CogView3PlusPipeline(DiffusionPipeline): def guidance_scale(self): return self._guidance_scale - # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) - # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` - # corresponds to doing no classifier free guidance. @property def do_classifier_free_guidance(self): return self._guidance_scale > 1 @@ -427,88 +398,6 @@ class CogView3PlusPipeline(DiffusionPipeline): callback_on_step_end_tensor_inputs: List[str] = ["latents"], max_sequence_length: int = 224, ) -> Union[CogView3PipelineOutput, Tuple]: - """ - Function invoked when calling the pipeline for generation. - - Args: - prompt (`str` or `List[str]`, *optional*): - The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`. - negative_prompt (`str` or `List[str]`, *optional*): - The prompt or prompts not to guide the image generation. 
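
For context on the `load_weights` helper being removed above: its core step folds the separately stored `to_q`/`to_k`/`to_v` tensors into one `to_qkv` entry. A minimal sketch of that state-dict rewrite (the key prefix is hypothetical; the transpose mirrors the patch's `QKVLinear`, which computes `x @ weight` rather than using `nn.Linear`'s `(out, in)` layout):

```python
import torch

def fuse_qkv(sd: dict, prefix: str) -> None:
    """Fold separate to_q/to_k/to_v tensors into a single to_qkv entry, in place."""
    w = [sd.pop(f"{prefix}.to_{n}.weight") for n in "qkv"]  # each (out, in)
    b = [sd.pop(f"{prefix}.to_{n}.bias") for n in "qkv"]    # each (out,)
    sd[f"{prefix}.to_qkv.weight"] = torch.cat(w, dim=0).transpose(0, 1).contiguous()
    sd[f"{prefix}.to_qkv.bias"] = torch.cat(b, dim=0).contiguous()

sd = {}
for n in "qkv":
    sd[f"blocks.0.attn1.to_{n}.weight"] = torch.randn(64, 64)
    sd[f"blocks.0.attn1.to_{n}.bias"] = torch.randn(64)
fuse_qkv(sd, "blocks.0.attn1")
print(sd["blocks.0.attn1.to_qkv.weight"].shape)  # torch.Size([64, 192])
```
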
If not defined, one has to pass - `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is - less than `1`). - height (`int`, *optional*, defaults to self.transformer.config.sample_size * self.vae_scale_factor): - The height in pixels of the generated image. If not provided, it is set to 1024. - width (`int`, *optional*, defaults to self.transformer.config.sample_size * self.vae_scale_factor): - The width in pixels of the generated image. If not provided it is set to 1024. - num_inference_steps (`int`, *optional*, defaults to `50`): - The number of denoising steps. More denoising steps usually lead to a higher quality image at the - expense of slower inference. - timesteps (`List[int]`, *optional*): - Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument - in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is - passed will be used. Must be in descending order. - guidance_scale (`float`, *optional*, defaults to `5.0`): - Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). - `guidance_scale` is defined as `w` of equation 2. of [Imagen - Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > - 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, - usually at the expense of lower image quality. - num_images_per_prompt (`int`, *optional*, defaults to `1`): - The number of images to generate per prompt. - generator (`torch.Generator` or `List[torch.Generator]`, *optional*): - One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) - to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): - Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image - generation. Can be used to tweak the same generation with different prompts. If not provided, a latents - tensor will ge generated by sampling using the supplied random `generator`. - prompt_embeds (`torch.FloatTensor`, *optional*): - Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not - provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): - Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt - weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input - argument. - original_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)): - If `original_size` is not the same as `target_size` the image will appear to be down- or upsampled. - `original_size` defaults to `(height, width)` if not specified. Part of SDXL's micro-conditioning as - explained in section 2.2 of - [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). - crops_coords_top_left (`Tuple[int]`, *optional*, defaults to (0, 0)): - `crops_coords_top_left` can be used to generate an image that appears to be "cropped" from the position - `crops_coords_top_left` downwards. Favorable, well-centered images are usually achieved by setting - `crops_coords_top_left` to (0, 0). Part of SDXL's micro-conditioning as explained in section 2.2 of - [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). 
- output_type (`str`, *optional*, defaults to `"pil"`): - The output format of the generate image. Choose between - [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. - return_dict (`bool`, *optional*, defaults to `True`): - Whether or not to return a [`~pipelines.stable_diffusion_xl.StableDiffusionXLPipelineOutput`] instead - of a plain tuple. - attention_kwargs (`dict`, *optional*): - A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under - `self.processor` in - [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py). - callback_on_step_end (`Callable`, *optional*): - A function that calls at the end of each denoising steps during the inference. The function is called - with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int, - callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by - `callback_on_step_end_tensor_inputs`. - callback_on_step_end_tensor_inputs (`List`, *optional*): - The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list - will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the - `._callback_tensor_inputs` attribute of your pipeline class. - max_sequence_length (`int`, defaults to `224`): - Maximum sequence length in encoded prompt. Can be set to other values but may lead to poorer results. - - Examples: - - Returns: - [`~pipelines.cogview3.pipeline_cogview3plus.CogView3PipelineOutput`] or `tuple`: - [`~pipelines.cogview3.pipeline_cogview3plus.CogView3PipelineOutput`] if `return_dict` is True, otherwise a - `tuple`. When returning a tuple, the first element is a list with the generated images. - """ if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)): callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/schedulers/scheduling_ddim_cogvideox.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/schedulers/scheduling_ddim_cogvideox.py index 27c31923fe..b4e22a0615 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/schedulers/scheduling_ddim_cogvideox.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/schedulers/scheduling_ddim_cogvideox.py @@ -124,55 +124,6 @@ def rescale_zero_terminal_snr(alphas_cumprod): class CogVideoXDDIMScheduler(SchedulerMixin, ConfigMixin): - """ - `DDIMScheduler` extends the denoising procedure introduced in denoising diffusion probabilistic models (DDPMs) with - non-Markovian guidance. - - This model inherits from [`SchedulerMixin`] and [`ConfigMixin`]. Check the superclass documentation for the generic - methods the library implements for all schedulers such as loading and saving. - - Args: - num_train_timesteps (`int`, defaults to 1000): - The number of diffusion steps to train the model. - beta_start (`float`, defaults to 0.0001): - The starting `beta` value of inference. - beta_end (`float`, defaults to 0.02): - The final `beta` value. - beta_schedule (`str`, defaults to `"linear"`): - The beta schedule, a mapping from a beta range to a sequence of betas for stepping the model. Choose from - `linear`, `scaled_linear`, or `squaredcos_cap_v2`. 
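
The removed docstring above lists the supported beta schedules. As a worked example of the `scaled_linear` option (the start/end values below are common illustrative defaults, not necessarily the ones shipped with this scheduler):

```python
import torch

def scaled_linear_betas(num_train_timesteps=1000, beta_start=0.00085, beta_end=0.012):
    """'scaled_linear' schedule: linear in sqrt(beta), squared back afterwards."""
    return torch.linspace(beta_start ** 0.5, beta_end ** 0.5, num_train_timesteps) ** 2

betas = scaled_linear_betas()
alphas_cumprod = torch.cumprod(1.0 - betas, dim=0)
print(betas.shape, float(alphas_cumprod[-1]))  # torch.Size([1000]) and a small terminal value
```
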
- trained_betas (`np.ndarray`, *optional*): - Pass an array of betas directly to the constructor to bypass `beta_start` and `beta_end`. - clip_sample (`bool`, defaults to `True`): - Clip the predicted sample for numerical stability. - clip_sample_range (`float`, defaults to 1.0): - The maximum magnitude for sample clipping. Valid only when `clip_sample=True`. - set_alpha_to_one (`bool`, defaults to `True`): - Each diffusion step uses the alphas product value at that step and at the previous one. For the final step - there is no previous alpha. When this option is `True` the previous alpha product is fixed to `1`, - otherwise it uses the alpha value at step 0. - steps_offset (`int`, defaults to 0): - An offset added to the inference steps, as required by some model families. - prediction_type (`str`, defaults to `epsilon`, *optional*): - Prediction type of the scheduler function; can be `epsilon` (predicts the noise of the diffusion process), - `sample` (directly predicts the noisy sample`) or `v_prediction` (see section 2.4 of [Imagen - Video](https://imagen.research.google/video/paper.pdf) paper). - thresholding (`bool`, defaults to `False`): - Whether to use the "dynamic thresholding" method. This is unsuitable for latent-space diffusion models such - as Stable Diffusion. - dynamic_thresholding_ratio (`float`, defaults to 0.995): - The ratio for the dynamic thresholding method. Valid only when `thresholding=True`. - sample_max_value (`float`, defaults to 1.0): - The threshold value for dynamic thresholding. Valid only when `thresholding=True`. - timestep_spacing (`str`, defaults to `"leading"`): - The way the timesteps should be scaled. Refer to Table 2 of the [Common Diffusion Noise Schedules and - Sample Steps are Flawed](https://huggingface.co/papers/2305.08891) for more information. - rescale_betas_zero_snr (`bool`, defaults to `False`): - Whether to rescale the betas to have zero terminal SNR. This enables the model to generate very bright and - dark samples instead of limiting it to samples with medium brightness. Loosely related to - [`--offset_noise`](https://github.com/huggingface/diffusers/blob/74fd735eb073eb1d774b1ab4154a0876eb82f055/examples/dreambooth/train_dreambooth.py#L506). - """ - _compatibles = [e.name for e in KarrasDiffusionSchedulers] order = 1 @@ -313,55 +264,11 @@ class CogVideoXDDIMScheduler(SchedulerMixin, ConfigMixin): variance_noise: Optional[torch.Tensor] = None, return_dict: bool = True, ) -> Union[DDIMSchedulerOutput, Tuple]: - """ - Predict the sample from the previous timestep by reversing the SDE. This function propagates the diffusion - process from the learned model outputs (most often the predicted noise). - - Args: - model_output (`torch.Tensor`): - The direct output from learned diffusion model. - timestep (`float`): - The current discrete timestep in the diffusion chain. - sample (`torch.Tensor`): - A current instance of a sample created by the diffusion process. - eta (`float`): - The weight of noise for added noise in diffusion step. - use_clipped_model_output (`bool`, defaults to `False`): - If `True`, computes "corrected" `model_output` from the clipped predicted original sample. Necessary - because predicted original sample is clipped to [-1, 1] when `self.config.clip_sample` is `True`. If no - clipping has happened, "corrected" `model_output` would coincide with the one provided as input and - `use_clipped_model_output` has no effect. - generator (`torch.Generator`, *optional*): - A random number generator. 
- variance_noise (`torch.Tensor`): - Alternative to generating noise with `generator` by directly providing the noise for the variance - itself. Useful for methods such as [`CycleDiffusion`]. - return_dict (`bool`, *optional*, defaults to `True`): - Whether or not to return a [`~schedulers.scheduling_ddim.DDIMSchedulerOutput`] or `tuple`. - - Returns: - [`~schedulers.scheduling_ddim.DDIMSchedulerOutput`] or `tuple`: - If return_dict is `True`, [`~schedulers.scheduling_ddim.DDIMSchedulerOutput`] is returned, otherwise a - tuple is returned where the first element is the sample tensor. - - """ if self.num_inference_steps is None: raise ValueError( "Number of inference steps is 'None', you need to run 'set_timesteps' after creating the scheduler" ) - # See formulas (12) and (16) of DDIM paper https://arxiv.org/pdf/2010.02502.pdf - # Ideally, read DDIM paper in-detail understanding - - # Notation ( -> - # - pred_noise_t -> e_theta(x_t, t) - # - pred_original_sample -> f_theta(x_t, t) or x_0 - # - std_dev_t -> sigma_t - # - eta -> η - # - pred_sample_direction -> "direction pointing to x_t" - # - pred_prev_sample -> "x_t-1" - - # 1. get previous step value (=t-1) prev_timestep = timestep - self.config.num_train_timesteps // self.num_inference_steps # 2. compute alphas, betas @@ -371,8 +278,6 @@ class CogVideoXDDIMScheduler(SchedulerMixin, ConfigMixin): beta_prod_t = 1 - alpha_prod_t # 3. compute predicted original sample from predicted noise also called - # "predicted x_0" of formula (12) from https://arxiv.org/pdf/2010.02502.pdf - # To make style tests pass, commented out `pred_epsilon` as it is an unused variable if self.config.prediction_type == "epsilon": pred_original_sample = (sample - beta_prod_t ** (0.5) * model_output) / alpha_prod_t ** (0.5) # pred_epsilon = model_output diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/schedulers/scheduling_dpm_cogvideox.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/schedulers/scheduling_dpm_cogvideox.py index 4269fff66a..6d25dea524 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/schedulers/scheduling_dpm_cogvideox.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/schedulers/scheduling_dpm_cogvideox.py @@ -95,8 +95,6 @@ def betas_for_alpha_bar( def rescale_zero_terminal_snr(alphas_cumprod): """ - Rescales betas to have zero terminal SNR Based on https://arxiv.org/pdf/2305.08891.pdf (Algorithm 1) - Args: betas (`torch.Tensor`): diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/schedulers/scheduling_utils.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/schedulers/scheduling_utils.py index b6418f89dd..7a72eb3d06 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/schedulers/scheduling_utils.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/schedulers/scheduling_utils.py @@ -72,20 +72,6 @@ class SchedulerOutput(BaseOutput): class SchedulerMixin(PushToHubMixin): - """ - Base class for all schedulers. - - [`SchedulerMixin`] contains common functions shared by all schedulers such as general loading and saving - functionalities. - - [`ConfigMixin`] takes care of storing the configuration attributes (like `num_train_timesteps`) that are passed to - the scheduler's `__init__` function, and the attributes can be accessed by `scheduler.config.num_train_timesteps`. 
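
For orientation, the computation that survives the docstring trim in `step` above is the standard deterministic DDIM update for an epsilon-prediction model (eta = 0); the CogVideoX-style scheduler layers its own handling on top, so this sketch is only the baseline formula:

```python
import torch

def ddim_step(sample, eps, alpha_prod_t, alpha_prod_t_prev):
    """One deterministic DDIM update (eta = 0) for an epsilon-prediction model."""
    pred_x0 = (sample - (1 - alpha_prod_t) ** 0.5 * eps) / alpha_prod_t ** 0.5
    direction = (1 - alpha_prod_t_prev) ** 0.5 * eps
    return alpha_prod_t_prev ** 0.5 * pred_x0 + direction

x_t = torch.randn(1, 4, 128, 128)
eps = torch.randn_like(x_t)
print(ddim_step(x_t, eps, alpha_prod_t=0.5, alpha_prod_t_prev=0.7).shape)
```
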
- - Class attributes: - - **_compatibles** (`List[str]`) -- A list of scheduler classes that are compatible with the parent scheduler - class. Use [`~ConfigMixin.from_config`] to load a different compatible scheduler class (should be overridden - by parent class). - """ config_name = SCHEDULER_CONFIG_NAME _compatibles = [] @@ -100,53 +86,7 @@ class SchedulerMixin(PushToHubMixin): return_unused_kwargs=False, **kwargs, ): - r""" - Instantiate a scheduler from a pre-defined JSON configuration file in a local directory or Hub repository. - - Parameters: - pretrained_model_name_or_path (`str` or `os.PathLike`, *optional*): - Can be either: - - - A string, the *model id* (for example `google/ddpm-celebahq-256`) of a pretrained model hosted on - the Hub. - - A path to a *directory* (for example `./my_model_directory`) containing the scheduler - configuration saved with [`~SchedulerMixin.save_pretrained`]. - subfolder (`str`, *optional*): - The subfolder location of a model file within a larger model repository on the Hub or locally. - return_unused_kwargs (`bool`, *optional*, defaults to `False`): - Whether kwargs that are not consumed by the Python class should be returned or not. - cache_dir (`Union[str, os.PathLike]`, *optional*): - Path to a directory where a downloaded pretrained model configuration is cached if the standard cache - is not used. - force_download (`bool`, *optional*, defaults to `False`): - Whether or not to force the (re-)download of the model weights and configuration files, overriding the - cached versions if they exist. - - proxies (`Dict[str, str]`, *optional*): - A dictionary of proxy servers to use by protocol or endpoint, for example, `{'http': 'foo.bar:3128', - 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request. - output_loading_info(`bool`, *optional*, defaults to `False`): - Whether or not to also return a dictionary containing missing keys, unexpected keys and error messages. - local_files_only(`bool`, *optional*, defaults to `False`): - Whether to only load local model weights and configuration files or not. If set to `True`, the model - won't be downloaded from the Hub. - token (`str` or *bool*, *optional*): - The token to use as HTTP bearer authorization for remote files. If `True`, the token generated from - `diffusers-cli login` (stored in `~/.huggingface`) is used. - revision (`str`, *optional*, defaults to `"main"`): - The specific model version to use. It can be a branch name, a tag name, a commit id, or any identifier - allowed by Git. - - - - To use private or [gated models](https://huggingface.co/docs/hub/models-gated#gated-models), log-in with - `huggingface-cli login`. You can also activate the special - ["offline-mode"](https://huggingface.co/diffusers/installation.html#offline-mode) to use this method in a - firewalled environment. 
- - - """ config, kwargs, commit_hash = cls.load_config( pretrained_model_name_or_path=pretrained_model_name_or_path, subfolder=subfolder, diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/vae/autoencoder_kl.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/vae/autoencoder_kl.py index cea74eb29f..bbe9bddf3e 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/vae/autoencoder_kl.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/vae/autoencoder_kl.py @@ -37,40 +37,7 @@ from .vae import Decoder, DecoderOutput, DiagonalGaussianDistribution, Encoder class AutoencoderKL(ModelMixin, ConfigMixin, FromOriginalModelMixin, PeftAdapterMixin): - r""" - A VAE model with KL loss for encoding images into latents and decoding latent representations into images. - - This model inherits from [`ModelMixin`]. Check the superclass documentation for it's generic methods implemented - for all models (such as downloading or saving). - - Parameters: - in_channels (int, *optional*, defaults to 3): Number of channels in the input image. - out_channels (int, *optional*, defaults to 3): Number of channels in the output. - down_block_types (`Tuple[str]`, *optional*, defaults to `("DownEncoderBlock2D",)`): - Tuple of downsample block types. - up_block_types (`Tuple[str]`, *optional*, defaults to `("UpDecoderBlock2D",)`): - Tuple of upsample block types. - block_out_channels (`Tuple[int]`, *optional*, defaults to `(64,)`): - Tuple of block output channels. - act_fn (`str`, *optional*, defaults to `"silu"`): The activation function to use. - latent_channels (`int`, *optional*, defaults to 4): Number of channels in the latent space. - sample_size (`int`, *optional*, defaults to `32`): Sample input size. - scaling_factor (`float`, *optional*, defaults to 0.18215): - The component-wise standard deviation of the trained latent space computed using the first batch of the - training set. This is used to scale the latent space to have unit variance when training the diffusion - model. The latents are scaled with the formula `z = z * scaling_factor` before being passed to the - diffusion model. When decoding, the latents are scaled back to the original scale with the formula: `z = 1 - / scaling_factor * z`. For more details, refer to sections 4.3.2 and D.1 of the [High-Resolution Image - Synthesis with Latent Diffusion Models](https://arxiv.org/abs/2112.10752) paper. - force_upcast (`bool`, *optional*, default to `True`): - If enabled it will force the VAE to run in float32 for high image resolution pipelines, such as SD-XL. VAE - can be fine-tuned / trained to a lower range without loosing too much precision in which case - `force_upcast` can be set to `False` - see: https://huggingface.co/madebyollin/sdxl-vae-fp16-fix - mid_block_add_attention (`bool`, *optional*, default to `True`): - If enabled, the mid_block of the Encoder and Decoder will have attention blocks. If set to false, the - mid_block will only have resnet blocks - """ - + _supports_gradient_checkpointing = True _no_split_modules = ["BasicTransformerBlock", "ResnetBlock2D"] @@ -388,24 +355,6 @@ class AutoencoderKL(ModelMixin, ConfigMixin, FromOriginalModelMixin, PeftAdapter return enc def tiled_encode(self, x: torch.Tensor, return_dict: bool = True) -> AutoencoderKLOutput: - r"""Encode a batch of images using a tiled encoder. - - When this option is enabled, the VAE will split the input tensor into tiles to compute encoding in several - steps. 
This is useful to keep memory use constant regardless of image size. The end result of tiled encoding is - different from non-tiled encoding because each tile uses a different encoder. To avoid tiling artifacts, the - tiles overlap and are blended together to form a smooth output. You may still see tile-sized changes in the - output, but they should be much less noticeable. - - Args: - x (`torch.Tensor`): Input batch of images. - return_dict (`bool`, *optional*, defaults to `True`): - Whether or not to return a [`~models.autoencoder_kl.AutoencoderKLOutput`] instead of a plain tuple. - - Returns: - [`~models.autoencoder_kl.AutoencoderKLOutput`] or `tuple`: - If return_dict is True, a [`~models.autoencoder_kl.AutoencoderKLOutput`] is returned, otherwise a plain - `tuple` is returned. - """ deprecation_message = ( "The tiled_encode implementation supporting the `return_dict` parameter is deprecated. In the future, the " "implementation of this method will be replaced with that of `_tiled_encode` and you will no longer be able " -- Gitee From d20c7e8e8de31ae015c0304b92c4a45ac7a51295 Mon Sep 17 00:00:00 2001 From: jiangmengyu Date: Thu, 26 Dec 2024 14:29:28 +0800 Subject: [PATCH 19/91] =?UTF-8?q?CogView3plus=E9=80=82=E9=85=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../pipeline/pipeline_cogview3plus.py | 83 ++++++++++++++++++- 1 file changed, 82 insertions(+), 1 deletion(-) diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py index 91559134c6..2e2ec1c8e8 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py @@ -291,7 +291,6 @@ class CogView3PlusPipeline(DiffusionPipeline): # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs def prepare_extra_step_kwargs(self, generator, eta): - accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) extra_step_kwargs = {} if accepts_eta: @@ -398,6 +397,88 @@ class CogView3PlusPipeline(DiffusionPipeline): callback_on_step_end_tensor_inputs: List[str] = ["latents"], max_sequence_length: int = 224, ) -> Union[CogView3PipelineOutput, Tuple]: + """ + Function invoked when calling the pipeline for generation. + + Args: + prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`. + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation. If not defined, one has to pass + `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is + less than `1`). + height (`int`, *optional*, defaults to self.transformer.config.sample_size * self.vae_scale_factor): + The height in pixels of the generated image. If not provided, it is set to 1024. + width (`int`, *optional*, defaults to self.transformer.config.sample_size * self.vae_scale_factor): + The width in pixels of the generated image. If not provided it is set to 1024. + num_inference_steps (`int`, *optional*, defaults to `50`): + The number of denoising steps. More denoising steps usually lead to a higher quality image at the + expense of slower inference. 
+ timesteps (`List[int]`, *optional*): + Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument + in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is + passed will be used. Must be in descending order. + guidance_scale (`float`, *optional*, defaults to `5.0`): + Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). + `guidance_scale` is defined as `w` of equation 2. of [Imagen + Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > + 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, + usually at the expense of lower image quality. + num_images_per_prompt (`int`, *optional*, defaults to `1`): + The number of images to generate per prompt. + generator (`torch.Generator` or `List[torch.Generator]`, *optional*): + One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) + to make generation deterministic. + latents (`torch.FloatTensor`, *optional*): + Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image + generation. Can be used to tweak the same generation with different prompts. If not provided, a latents + tensor will ge generated by sampling using the supplied random `generator`. + prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not + provided, text embeddings will be generated from `prompt` input argument. + negative_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input + argument. + original_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)): + If `original_size` is not the same as `target_size` the image will appear to be down- or upsampled. + `original_size` defaults to `(height, width)` if not specified. Part of SDXL's micro-conditioning as + explained in section 2.2 of + [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). + crops_coords_top_left (`Tuple[int]`, *optional*, defaults to (0, 0)): + `crops_coords_top_left` can be used to generate an image that appears to be "cropped" from the position + `crops_coords_top_left` downwards. Favorable, well-centered images are usually achieved by setting + `crops_coords_top_left` to (0, 0). Part of SDXL's micro-conditioning as explained in section 2.2 of + [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). + output_type (`str`, *optional*, defaults to `"pil"`): + The output format of the generate image. Choose between + [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~pipelines.stable_diffusion_xl.StableDiffusionXLPipelineOutput`] instead + of a plain tuple. + attention_kwargs (`dict`, *optional*): + A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under + `self.processor` in + [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py). 
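As a reminder of what `guidance_scale` does numerically, the usual classifier-free guidance combination looks like the sketch below; the shapes are toy values and this is the standard formulation, not a quote of this pipeline's denoising loop:

```python
import torch

# Toy stand-ins for the unconditional / text-conditioned noise predictions that the
# transformer returns for the duplicated (negative prompt + prompt) batch.
noise_pred_uncond = torch.randn(1, 16, 128, 128)
noise_pred_text = torch.randn(1, 16, 128, 128)
guidance_scale = 5.0  # w in the Imagen formulation; w > 1 strengthens prompt adherence

noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
```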
+ callback_on_step_end (`Callable`, *optional*): + A function that calls at the end of each denoising steps during the inference. The function is called + with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int, + callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by + `callback_on_step_end_tensor_inputs`. + callback_on_step_end_tensor_inputs (`List`, *optional*): + The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list + will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the + `._callback_tensor_inputs` attribute of your pipeline class. + max_sequence_length (`int`, defaults to `224`): + Maximum sequence length in encoded prompt. Can be set to other values but may lead to poorer results. + + Examples: + + Returns: + [`~pipelines.cogview3.pipeline_cogview3plus.CogView3PipelineOutput`] or `tuple`: + [`~pipelines.cogview3.pipeline_cogview3plus.CogView3PipelineOutput`] if `return_dict` is True, otherwise a + `tuple`. When returning a tuple, the first element is a list with the generated images. + """ if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)): callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs -- Gitee From 10d485a9e886f23782b80984463b84a4ae0abcfa Mon Sep 17 00:00:00 2001 From: jiangmengyu Date: Thu, 26 Dec 2024 15:06:50 +0800 Subject: [PATCH 20/91] =?UTF-8?q?CogView3plus=E9=80=82=E9=85=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../cogview3/inference_cogview3plus.py | 2 +- .../foundation/cogview3/requirents.txt | 23 +++++++------------ 2 files changed, 9 insertions(+), 16 deletions(-) diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/inference_cogview3plus.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/inference_cogview3plus.py index 74dd914294..e2b39e04aa 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/inference_cogview3plus.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/inference_cogview3plus.py @@ -51,7 +51,7 @@ def parse_arguments(): parser.add_argument("--height", type=int, default=1024, help="Height of the generated image.") parser.add_argument("--output_path", type=str, default="cogview3.png", help="Path to save the generated image.") parser.add_argument("--dtype", type=str, default="bf16", help="bf16 or fp16") - parser.add_argument("--device_id", type=int, default=1, help="NPU device id") + parser.add_argument("--device_id", type=int, default=6, help="NPU device id") return parser.parse_args() diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/requirents.txt b/MindIE/MindIE-Torch/built-in/foundation/cogview3/requirents.txt index ac2fa2a7f6..1600434700 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/requirents.txt +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/requirents.txt @@ -1,15 +1,8 @@ -accelerate==0.29.3 -deepspeed==0.15.4 -einops==0.7.0 -gradio==3.50.2 -huggingface-hub==0.24.7 -Jinja2==3.1.4 -numpy==1.26.4 -peft==0.10.0 -safetensors==0.4.5 -timm==0.9.5 -tokenizers==0.15.2 -torch==2.1.0 -torchvision==0.14.1 -tqdm==4.66.5 -transformers==4.39.3 \ No newline at end of file +deepspeed==0.16.1 +transformers==4.47.1 +gradio==5.9.1 +accelerate==1.0.1 +diffusers==0.31.0 +sentencepiece==0.2.0 +torch==2.4.0 +openai==1.58.1 \ No newline at end of file -- Gitee From a5ef08b4dddd8654ff1c3ee2054684be32397506 Mon Sep 17 00:00:00 
2001 From: jiangmengyu Date: Thu, 26 Dec 2024 16:07:08 +0800 Subject: [PATCH 21/91] =?UTF-8?q?CogView3plus=E9=80=82=E9=85=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../built-in/foundation/cogview3/README.md | 170 +++++------------- .../cogview3/inference_cogview3plus.py | 10 +- 2 files changed, 50 insertions(+), 130 deletions(-) diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/README.md b/MindIE/MindIE-Torch/built-in/foundation/cogview3/README.md index 028c765d30..2ad3b64c91 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/README.md +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/README.md @@ -5,7 +5,7 @@ | 配套 | 版本 | 环境准备指导 | | ----- | ----- |-----| | Python | 3.10.2 | - | - | torch | 2.1.0 | - | + | torch | 2.4.0 | - | ### 1.1 获取CANN&MindIE安装包&环境准备 - [800I A2](https://www.hiascend.com/developer/download/community/result?module=pt+ie+cann&product=4&model=32) @@ -46,7 +46,7 @@ cd ${AieInstallPath}/mindie && source set_env.sh ``` ### 1.4 Torch_npu安装 -安装pytorch框架 版本2.1.0 +安装pytorch框架 版本2.4.0 [安装包下载](https://download.pytorch.org/whl/cpu/torch/) 使用pip安装 @@ -67,161 +67,73 @@ pip install torch_npu-{pytorchversion}.xxxx.{arch}.whl git clone https://gitee.com/ascend/ModelZoo-PyTorch.git ``` -## 三、HunyuanDiT使用 +## 三、CogView3使用 ### 3.1 权重及配置文件说明 -1. text_encoder权重链接: +1. CogView3权重路径: ```shell -https://huggingface.co/Tencent-Hunyuan/HunyuanDiT-v1.2-Diffusers/tree/main/text_encoder +https://huggingface.co/THUDM/CogView3-Plus-3B/tree/main ``` -2. text_encoder_2权重链接: -```shell -https://huggingface.co/Tencent-Hunyuan/HunyuanDiT-v1.2-Diffusers/tree/main/text_encoder_2 -``` -3. tokenizer权重链接: -```shell -https://huggingface.co/Tencent-Hunyuan/HunyuanDiT-v1.2-Diffusers/tree/main/tokenizer -``` -4. tokenizer_2权重链接: -```shell -https://huggingface.co/Tencent-Hunyuan/HunyuanDiT-v1.2-Diffusers/tree/main/tokenizer_2 -``` -5. transformer权重链接: -```shell -https://huggingface.co/Tencent-Hunyuan/HunyuanDiT-v1.2/tree/main/t2i/model -``` -- 修改该权重的config.json +- 修改该权重的model_index.json ```shell { - "architectures": [ - "HunyuanDiT2DModel" + "_class_name": "CogView3PlusPipeline", + "_diffusers_version": "0.31.0.dev0", + "scheduler": [ + "cogview3plus", + "CogVideoXDDIMScheduler" + ], + "text_encoder": [ + "transformers", + "T5EncoderModel" ], - "input_size": [ - null, - null + "tokenizer": [ + "transformers", + "T5Tokenizer" ], - "patch_size": 2, - "in_channels": 4, - "hidden_size": 1408, - "depth": 40, - "num_heads": 16, - "mlp_ratio": 4.3637, - "text_states_dim": 1024, - "text_states_dim_t5": 2048, - "text_len": 77, - "text_len_t5": 256 + "transformer": [ + "cogview3plus", + "CogView3PlusTransformer2DModel" + ], + "vae": [ + "cogview3plus", + "AutoencoderKL" + ] } ``` -6. vae权重链接: +2. scheduler权重链接: ```shell -https://huggingface.co/Tencent-Hunyuan/HunyuanDiT-v1.2-Diffusers/tree/main/vae +https://huggingface.co/THUDM/CogView3-Plus-3B/tree/main/scheduler ``` -- 修改该权重的config.json +3. 
text_encoder权重链接: ```shell -{ - "architectures": [ - "AutoencoderKL" - ], - "in_channels": 3, - "out_channels": 3, - "down_block_types": [ - "DownEncoderBlock2D", - "DownEncoderBlock2D", - "DownEncoderBlock2D", - "DownEncoderBlock2D" - ], - "up_block_types": [ - "UpDecoderBlock2D", - "UpDecoderBlock2D", - "UpDecoderBlock2D", - "UpDecoderBlock2D" - ], - "block_out_channels": [ - 128, - 256, - 512, - 512 - ], - "layers_per_block": 2, - "act_fn": "silu", - "latent_channels": 4, - "norm_num_groups": 32, - "sample_size": 512, - "scaling_factor": 0.13025, - "shift_factor": null, - "latents_mean": null, - "latents_std": null, - "force_upcast": false, - "use_quant_conv": true, - "use_post_quant_conv": true -} +https://huggingface.co/THUDM/CogView3-Plus-3B/tree/main/text_encoder ``` -7. scheduler: -- 新增scheduler_config.json配置文件, 内容如下所示: +4. tokenizer权重链接: ```shell -{ - "_class_name": "DDPMScheduler", - "_mindiesd_version": "1.0.0", - "steps_offset": 1, - "beta_start": 0.00085, - "beta_end": 0.02, - "num_train_timesteps": 1000 -} +https://huggingface.co/THUDM/CogView3-Plus-3B/tree/main/tokenizer ``` -8. 新增model_index.json -将以上步骤下载的权重放在同一目录下, 并新增model_index.json文件, 该文件内容如下所示 +5. transformer权重链接: ```shell -{ - "_class_name": "HunyuanDiTPipeline", - "_mindiesd_version": "1.0.RC3", - "scheduler": [ - "mindiesd", - "DDPMScheduler" - ], - "text_encoder": [ - "transformers", - "BertModel" - ], - "text_encoder_2": [ - "transformers", - "T5EncoderModel" - ], - "tokenizer": [ - "transformers", - "BertTokenizer" - ], - "tokenizer_2": [ - "transformers", - "T5Tokenizer" - ], - "transformer": [ - "mindiesd", - "HunyuanDiT2DModel" - ], - "vae": [ - "mindiesd", - "AutoencoderKL" - ] -} +https://huggingface.co/THUDM/CogView3-Plus-3B/tree/main/transformer +``` +6. vae权重链接: +```shell +https://huggingface.co/THUDM/CogView3-Plus-3B/tree/main/vae ``` -9. 各模型的配置文件、权重文件的层级样例如下所示。 +7. 各模型的配置文件、权重文件的层级样例如下所示。 ```commandline -|----hunyuandit +|----CogView3B +| |---- configuration.json | |---- model_index.json | |---- scheduler | | |---- scheduler_config.json | |---- text_encoder | | |---- config.json | | |---- 模型权重 -| |---- text_encoder_2 -| | |---- config.json -| | |---- 模型权重 | |---- tokenizer | | |---- config.json | | |---- 模型权重 -| |---- tokenizer_2 -| | |---- config.json -| | |---- 模型权重 | |---- transformer | | |---- config.json | | |---- 模型权重 diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/inference_cogview3plus.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/inference_cogview3plus.py index e2b39e04aa..ea9c77744e 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/inference_cogview3plus.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/inference_cogview3plus.py @@ -33,7 +33,15 @@ def parse_arguments(): "--prompt", type=list, default=[ - "A vibrant cherry red sports car sits proudly under the gleaming sun, its polished exterior smooth and flawless, casting a mirror-like reflection. The car features a low, aerodynamic body, angular headlights that gaze forward like predatory eyes, and a set of black, high-gloss racing rims that contrast starkly with the red. A subtle hint of chrome embellishes the grille and exhaust, while the tinted windows suggest a luxurious and private interior. The scene conveys a sense of speed and elegance, the car appearing as if it's about to burst into a sprint along a coastal road, with the ocean's azure waves crashing in the background." 
+ "A vibrant cherry red sports car sits proudly under the gleaming sun, \ + its polished exterior smooth and flawless, casting a mirror-like reflection. \ + The car features a low, aerodynamic body, angular headlights that gaze forward like predatory eyes, \ + and a set of black, high-gloss racing rims that contrast starkly with the red. \ + A subtle hint of chrome embellishes the grille and exhaust, \ + while the tinted windows suggest a luxurious and private interior. \ + he scene conveys a sense of speed and elegance, \ + the car appearing as if it's about to burst into a sprint along a coastal road, \ + with the ocean's azure waves crashing in the background." ], help="The text description for generating the image." ) -- Gitee From 6eab401dc3c783504098658202368ca1994cc385 Mon Sep 17 00:00:00 2001 From: jiangmengyu Date: Thu, 26 Dec 2024 16:31:08 +0800 Subject: [PATCH 22/91] =?UTF-8?q?CogView3plus=E9=80=82=E9=85=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../built-in/foundation/cogview3/README.md | 84 +++---------------- 1 file changed, 12 insertions(+), 72 deletions(-) diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/README.md b/MindIE/MindIE-Torch/built-in/foundation/cogview3/README.md index 2ad3b64c91..d5a9e274e6 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/README.md +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/README.md @@ -145,82 +145,22 @@ https://huggingface.co/THUDM/CogView3-Plus-3B/tree/main/vae ### 3.2 单卡单prompt功能测试 设置权重路径 ```shell -path = 'ckpts/hydit' +model_path = '/data/CogView3B' ``` 执行命令: ```shell -python inference_hydit.py \ - --path ${path} \ +python inference_cogview3plus.py \ + --model_path ${model_path} \ --device_id 0 \ - --prompt "青花瓷风格,一只小狗" \ - --input_size (1024, 1024) \ - --seed 42 \ - --infer_steps 25 + --width 1024 \ + --height 1024 \ + --num_inference_steps 50 \ + --dtype bf16 ``` 参数说明: -- path:权重路径,包含scheduler、text_encoder、text_encoder_2、tokenizer、 tokenizer_2、transformer、vae,七个模型的配置文件及权重。 +- model_path:权重路径,包含scheduler、text_encoder、tokenizer、transformer、vae,5个模型的配置文件及权重。 - device_id:推理设备ID。 -- prompt:用于图像生成的文字描述提示。 -- input_size:需要生成的图像尺寸。 -- seed:设置随机种子,默认值为42。 -- infer_steps:推理迭代步数。 - -### 3.3 单卡多prompts进行性能/精度测试 -设置权重路径 -```shell -path = 'ckpts/hydit' -``` -执行命令: -```shell -python inference_hydit.py \ - --path ${path} \ - --device_id 0 \ - --test_acc \ - --prompt_list "prompts/example_prompts.txt" \ - --input_size (1024, 1024) \ - --seed 42 \ - --infer_steps 25 -``` -参数说明: -- path:权重路径,包含scheduler、text_encoder、text_encoder_2、tokenizer、 tokenizer_2、transformer、vae,七个模型的配置文件及权重。 -- device_id:推理设备ID。 -- test_acc:使用 --test_acc 开启全量图像生成,用于性能/精度测试。单prompt功能测试时,不开启该参数。 -- prompt_list:用于图像生成的文字描述提示的列表文件路径。 -- input_size:需要生成的图像尺寸。 -- seed:设置随机种子,默认值为42。 -- infer_steps:推理迭代步数。 - -### 3.4 用LoRA进行测试 -设置权重路径 -```shell -path = 'ckpts/hydit' -``` -LoRA权重链接: -```shell -https://huggingface.co/Tencent-Hunyuan/HYDiT-LoRA/tree/main -``` -设置LoRA权重路径 -```shell -lora_path = 'ckpts/lora' -``` -执行命令: -```shell -python inference_hydit.py \ - --path ${path} \ - --device_id 0 \ - --prompt "青花瓷风格,一只小狗" \ - --input_size (1024, 1024) \ - --seed 42 \ - --infer_steps 25 - --use_lora \ - --lora_ckpt ${lora_path} -``` -参数说明: -- path:权重路径,包含scheduler、text_encoder、text_encoder_2、tokenizer、 tokenizer_2、transformer、vae,七个模型的配置文件及权重。 -- device_id:推理设备ID。 -- prompt:用于图像生成的文字描述提示。 -- input_size:需要生成的图像尺寸。 -- seed:设置随机种子,默认值为42。 -- infer_steps:推理迭代步数。 -- use_lora:使用 --use_lora 开启LoRA风格化切换。 -- 
lora_ckpt:LoRA权重路径。 \ No newline at end of file +- width:需要生成的图像的宽。 +- height: 需要生成的图像的高。 +- num_inference_steps:推理迭代步数。 +- dtype: 数据类型。目前只支持bf16。 -- Gitee From 00d969b6194fbf27a1fb472a7c2832e26280d82f Mon Sep 17 00:00:00 2001 From: jiangmengyu Date: Thu, 26 Dec 2024 16:33:00 +0800 Subject: [PATCH 23/91] =?UTF-8?q?CogView3plus=E9=80=82=E9=85=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- MindIE/MindIE-Torch/built-in/foundation/cogview3/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/README.md b/MindIE/MindIE-Torch/built-in/foundation/cogview3/README.md index d5a9e274e6..dfcf259a0b 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/README.md +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/README.md @@ -4,7 +4,7 @@ | 配套 | 版本 | 环境准备指导 | | ----- | ----- |-----| - | Python | 3.10.2 | - | + | Python | 3.10.12 | - | | torch | 2.4.0 | - | ### 1.1 获取CANN&MindIE安装包&环境准备 -- Gitee From 09c395f8857fc2288200b1d9d60dd64082370b0d Mon Sep 17 00:00:00 2001 From: jiangmengyu Date: Thu, 26 Dec 2024 17:22:46 +0800 Subject: [PATCH 24/91] =?UTF-8?q?cogview3=E9=80=82=E9=85=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../cogview3plus/layers/embeddings.py | 35 ++------------- .../cogview3plus/layers/normalization.py | 5 +-- .../cogview3/inference_cogview3plus.py | 44 +++++++------------ 3 files changed, 20 insertions(+), 64 deletions(-) diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/layers/embeddings.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/layers/embeddings.py index 445ad8245a..29896c0814 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/layers/embeddings.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/layers/embeddings.py @@ -17,7 +17,6 @@ from typing import Optional import numpy as np import torch from torch import nn - from diffusers.utils import deprecate from diffusers.models.activations import FP32SiLU, get_activation @@ -27,30 +26,8 @@ def get_timestep_embedding( embedding_dim: int, flip_sin_to_cos: bool = False, downscale_freq_shift: float = 1, - scale: float = 1, max_period: int = 10000, ): - """ - This matches the implementation in Denoising Diffusion Probabilistic Models: Create sinusoidal timestep embeddings. - - Args - timesteps (torch.Tensor): - a 1-D Tensor of N indices, one per batch element. These may be fractional. - embedding_dim (int): - the dimension of the output. - flip_sin_to_cos (bool): - Whether the embedding order should be `cos, sin` (if True) or `sin, cos` (if False) - downscale_freq_shift (float): - Controls the delta between frequencies between dimensions - scale (float): - Scaling factor applied to the embeddings. - max_period (int): - Controls the maximum frequency of the embeddings - Returns - torch.Tensor: an [N x dim] Tensor of positional embeddings. 
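The computation kept after this cleanup is the standard sinusoidal timestep embedding. A self-contained sketch of the same math, ignoring the `flip_sin_to_cos` and `downscale_freq_shift` options handled by the real function:

```python
import math
import torch

def sinusoidal_timestep_embedding(timesteps: torch.Tensor, dim: int, max_period: int = 10000) -> torch.Tensor:
    # Half of the channels carry sin, the other half cos, at geometrically spaced frequencies.
    half = dim // 2
    freqs = torch.exp(-math.log(max_period) * torch.arange(half, dtype=torch.float32) / half)
    args = timesteps[:, None].float() * freqs[None, :]
    return torch.cat([torch.sin(args), torch.cos(args)], dim=-1)  # (N, dim) for even dim

print(sinusoidal_timestep_embedding(torch.tensor([0, 10, 999]), dim=256).shape)  # torch.Size([3, 256])
```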
- """ - assert len(timesteps.shape) == 1, "Timesteps should be a 1d-array" - half_dim = embedding_dim // 2 exponent = -math.log(max_period) * torch.arange( start=0, end=half_dim, dtype=torch.float32, device=timesteps.device @@ -60,9 +37,6 @@ def get_timestep_embedding( emb = torch.exp(exponent) emb = timesteps[:, None].float() * emb[None, :] - # scale embeddings - emb = scale * emb - # concat sine and cosine embeddings emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=-1) @@ -83,7 +57,6 @@ def get_2d_sincos_pos_embed( extra_tokens=0, interpolation_scale=1.0, base_size=16, - device: Optional[torch.device] = None, output_type: str = "np", ): """ @@ -125,12 +98,12 @@ def get_2d_sincos_pos_embed( grid_size = (grid_size, grid_size) grid_h = ( - torch.arange(grid_size[0], device=device, dtype=torch.float32) + torch.arange(grid_size[0], dtype=torch.float32) / (grid_size[0] / base_size) / interpolation_scale ) grid_w = ( - torch.arange(grid_size[1], device=device, dtype=torch.float32) + torch.arange(grid_size[1], dtype=torch.float32) / (grid_size[1] / base_size) / interpolation_scale ) @@ -302,12 +275,11 @@ def get_1d_sincos_pos_embed_from_grid_np(embed_dim, pos): class Timesteps(nn.Module): - def __init__(self, num_channels: int, flip_sin_to_cos: bool, downscale_freq_shift: float, scale: int = 1): + def __init__(self, num_channels: int, flip_sin_to_cos: bool, downscale_freq_shift: float): super().__init__() self.num_channels = num_channels self.flip_sin_to_cos = flip_sin_to_cos self.downscale_freq_shift = downscale_freq_shift - self.scale = scale def forward(self, timesteps): t_emb = get_timestep_embedding( @@ -315,7 +287,6 @@ class Timesteps(nn.Module): self.num_channels, flip_sin_to_cos=self.flip_sin_to_cos, downscale_freq_shift=self.downscale_freq_shift, - scale=self.scale, ) return t_emb diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/layers/normalization.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/layers/normalization.py index 3dd2bba76c..64dbbe058a 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/layers/normalization.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/layers/normalization.py @@ -27,7 +27,6 @@ if is_torch_version(">=", "2.1.0"): LayerNorm = nn.LayerNorm else: # Has optional bias parameter compared to torch layer norm - # TODO: replace with torch layernorm once min required torch version >= 2.1 class LayerNorm(nn.Module): def __init__(self, dim, eps: float = 1e-5, elementwise_affine: bool = True, bias: bool = True): super().__init__() @@ -46,8 +45,8 @@ else: self.weight = None self.bias = None - def forward(self, input): - return F.layer_norm(input, self.dim, self.weight, self.bias, self.eps) + def forward(self, x): + return F.layer_norm(x, self.dim, self.weight, self.bias, self.eps) class RMSNorm(nn.Module): diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/inference_cogview3plus.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/inference_cogview3plus.py index ea9c77744e..5c12695d6c 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/inference_cogview3plus.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/inference_cogview3plus.py @@ -16,9 +16,10 @@ import argparse import logging -import torch import time +import torch + from cogview3plus import CogView3PlusPipeline logging.basicConfig(level=logging.INFO) @@ -64,11 +65,12 @@ def parse_arguments(): return parser.parse_args() -def generate_image( - prompt, model_path, guidance_scale, 
num_images_per_prompt, num_inference_steps, width, height, output_path, dtype -): +def infer(args): + torch.npu.set_device(args.device_id) + dtype = torch.bfloat16 if args.dtype == "bf16" else torch.float16 + # Load the pre-trained model with the specified precision - pipe = CogView3PlusPipeline.from_pretrained(model_path, torch_dtype=dtype).to("npu") + pipe = CogView3PlusPipeline.from_pretrained(args.model_path, torch_dtype=dtype).to("npu") use_time = 0 loops = 5 @@ -76,12 +78,12 @@ def generate_image( start_time = time.time() # Generate the image based on the prompt image = pipe( - prompt=prompt, - guidance_scale=guidance_scale, - num_images_per_prompt=num_images_per_prompt, - num_inference_steps=num_inference_steps, - width=width, - height=height, + prompt=args.prompt[0], + guidance_scale=args.guidance_scale, + num_images_per_prompt=args.num_images_per_prompt, + num_inference_steps=args.num_inference_steps, + width=args.width, + height=args.height, ).images[0] if i >= 2: @@ -93,25 +95,9 @@ def generate_image( logger.info("use_time is %.3f)", use_time / 3) # Save the generated image to the local file system - image.save(output_path) + image.save(args.output_path) - print(f"Image saved to {output_path}") - - -def infer(args): - torch.npu.set_device(args.device_id) - dtype = torch.bfloat16 if args.dtype == "bf16" else torch.float16 - generate_image( - prompt=args.prompt[0], - model_path=args.model_path, - guidance_scale=args.guidance_scale, - num_images_per_prompt=args.num_images_per_prompt, - num_inference_steps=args.num_inference_steps, - width=args.width, - height=args.height, - output_path=args.output_path, - dtype=dtype, - ) + print(f"Image saved to {args.output_path}") if __name__ == "__main__": -- Gitee From 64616aa9f302369d84d435ec5b059e23a24dfb55 Mon Sep 17 00:00:00 2001 From: jiangmengyu Date: Thu, 26 Dec 2024 17:27:03 +0800 Subject: [PATCH 25/91] =?UTF-8?q?cogview3=E9=80=82=E9=85=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../cogview3plus/layers/embeddings.py | 159 +----------------- 1 file changed, 6 insertions(+), 153 deletions(-) diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/layers/embeddings.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/layers/embeddings.py index 29896c0814..dc1c683c63 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/layers/embeddings.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/layers/embeddings.py @@ -53,47 +53,9 @@ def get_timestep_embedding( def get_2d_sincos_pos_embed( embed_dim, grid_size, - cls_token=False, - extra_tokens=0, interpolation_scale=1.0, base_size=16, - output_type: str = "np", ): - """ - Creates 2D sinusoidal positional embeddings. - - Args: - embed_dim (`int`): - The embedding dimension. - grid_size (`int`): - The size of the grid height and width. - cls_token (`bool`, defaults to `False`): - Whether or not to add a classification token. - extra_tokens (`int`, defaults to `0`): - The number of extra tokens to add. - interpolation_scale (`float`, defaults to `1.0`): - The scale of the interpolation. - - Returns: - pos_embed (`torch.Tensor`): - Shape is either `[grid_size * grid_size, embed_dim]` if not using cls_token, or `[1 + grid_size*grid_size, - embed_dim]` if using cls_token - """ - if output_type == "np": - deprecation_message = ( - "`get_2d_sincos_pos_embed` uses `torch` and supports `device`." - " `from_numpy` is no longer required." 
- " Pass `output_type='pt' to use the new version now." - ) - deprecate("output_type=='np'", "0.33.0", deprecation_message, standard_warn=False) - return get_2d_sincos_pos_embed_np( - embed_dim=embed_dim, - grid_size=grid_size, - cls_token=cls_token, - extra_tokens=extra_tokens, - interpolation_scale=interpolation_scale, - base_size=base_size, - ) if isinstance(grid_size, int): grid_size = (grid_size, grid_size) @@ -111,13 +73,11 @@ def get_2d_sincos_pos_embed( grid = torch.stack(grid, dim=0) grid = grid.reshape([2, 1, grid_size[1], grid_size[0]]) - pos_embed = get_2d_sincos_pos_embed_from_grid(embed_dim, grid, output_type=output_type) - if cls_token and extra_tokens > 0: - pos_embed = torch.concat([torch.zeros([extra_tokens, embed_dim]), pos_embed], dim=0) + pos_embed = get_2d_sincos_pos_embed_from_grid(embed_dim, grid) return pos_embed -def get_2d_sincos_pos_embed_from_grid(embed_dim, grid, output_type="np"): +def get_2d_sincos_pos_embed_from_grid(embed_dim, grid): r""" This function generates 2D sinusoidal positional embeddings from a grid. @@ -128,29 +88,18 @@ def get_2d_sincos_pos_embed_from_grid(embed_dim, grid, output_type="np"): Returns: `torch.Tensor`: The 2D sinusoidal positional embeddings with shape `(H * W, embed_dim)` """ - if output_type == "np": - deprecation_message = ( - "`get_2d_sincos_pos_embed_from_grid` uses `torch` and supports `device`." - " `from_numpy` is no longer required." - " Pass `output_type='pt' to use the new version now." - ) - deprecate("output_type=='np'", "0.33.0", deprecation_message, standard_warn=False) - return get_2d_sincos_pos_embed_from_grid_np( - embed_dim=embed_dim, - grid=grid, - ) if embed_dim % 2 != 0: raise ValueError("embed_dim must be divisible by 2") # use half of dimensions to encode grid_h - emb_h = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[0], output_type=output_type) # (H*W, D/2) - emb_w = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[1], output_type=output_type) # (H*W, D/2) + emb_h = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[0]) # (H*W, D/2) + emb_w = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[1]) # (H*W, D/2) emb = torch.concat([emb_h, emb_w], dim=1) # (H*W, D) return emb -def get_1d_sincos_pos_embed_from_grid(embed_dim, pos, output_type="np"): +def get_1d_sincos_pos_embed_from_grid(embed_dim, pos): """ This function generates 1D positional embeddings from a grid. @@ -161,14 +110,6 @@ def get_1d_sincos_pos_embed_from_grid(embed_dim, pos, output_type="np"): Returns: `torch.Tensor`: Sinusoidal positional embeddings of shape `(M, D)`. """ - if output_type == "np": - deprecation_message = ( - "`get_1d_sincos_pos_embed_from_grid` uses `torch` and supports `device`." - " `from_numpy` is no longer required." - " Pass `output_type='pt' to use the new version now." - ) - deprecate("output_type=='np'", "0.33.0", deprecation_message, standard_warn=False) - return get_1d_sincos_pos_embed_from_grid_np(embed_dim=embed_dim, pos=pos) if embed_dim % 2 != 0: raise ValueError("embed_dim must be divisible by 2") @@ -186,94 +127,6 @@ def get_1d_sincos_pos_embed_from_grid(embed_dim, pos, output_type="np"): return emb -def get_2d_sincos_pos_embed_np( - embed_dim, grid_size, cls_token=False, extra_tokens=0, interpolation_scale=1.0, base_size=16 -): - """ - Creates 2D sinusoidal positional embeddings. - - Args: - embed_dim (`int`): - The embedding dimension. - grid_size (`int`): - The size of the grid height and width. - cls_token (`bool`, defaults to `False`): - Whether or not to add a classification token. 
- extra_tokens (`int`, defaults to `0`): - The number of extra tokens to add. - interpolation_scale (`float`, defaults to `1.0`): - The scale of the interpolation. - - Returns: - pos_embed (`np.ndarray`): - Shape is either `[grid_size * grid_size, embed_dim]` if not using cls_token, or `[1 + grid_size*grid_size, - embed_dim]` if using cls_token - """ - if isinstance(grid_size, int): - grid_size = (grid_size, grid_size) - - grid_h = np.arange(grid_size[0], dtype=np.float32) / (grid_size[0] / base_size) / interpolation_scale - grid_w = np.arange(grid_size[1], dtype=np.float32) / (grid_size[1] / base_size) / interpolation_scale - grid = np.meshgrid(grid_w, grid_h) # here w goes first - grid = np.stack(grid, axis=0) - - grid = grid.reshape([2, 1, grid_size[1], grid_size[0]]) - pos_embed = get_2d_sincos_pos_embed_from_grid_np(embed_dim, grid) - if cls_token and extra_tokens > 0: - pos_embed = np.concatenate([np.zeros([extra_tokens, embed_dim]), pos_embed], axis=0) - return pos_embed - - -def get_2d_sincos_pos_embed_from_grid_np(embed_dim, grid): - r""" - This function generates 2D sinusoidal positional embeddings from a grid. - - Args: - embed_dim (`int`): The embedding dimension. - grid (`np.ndarray`): Grid of positions with shape `(H * W,)`. - - Returns: - `np.ndarray`: The 2D sinusoidal positional embeddings with shape `(H * W, embed_dim)` - """ - if embed_dim % 2 != 0: - raise ValueError("embed_dim must be divisible by 2") - - # use half of dimensions to encode grid_h - emb_h = get_1d_sincos_pos_embed_from_grid_np(embed_dim // 2, grid[0]) # (H*W, D/2) - emb_w = get_1d_sincos_pos_embed_from_grid_np(embed_dim // 2, grid[1]) # (H*W, D/2) - - emb = np.concatenate([emb_h, emb_w], axis=1) # (H*W, D) - return emb - - -def get_1d_sincos_pos_embed_from_grid_np(embed_dim, pos): - """ - This function generates 1D positional embeddings from a grid. - - Args: - embed_dim (`int`): The embedding dimension `D` - pos (`numpy.ndarray`): 1D tensor of positions with shape `(M,)` - - Returns: - `numpy.ndarray`: Sinusoidal positional embeddings of shape `(M, D)`. 
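For the torch code path that remains, the 2D embedding is just two 1D sinusoidal embeddings, one over row indices and one over column indices, concatenated per patch position. A compact stand-alone sketch under that reading, without this module's grid and interpolation handling:

```python
import torch

def sincos_1d(embed_dim: int, pos: torch.Tensor) -> torch.Tensor:
    # pos: (M,) -> (M, embed_dim); first half sin, second half cos.
    omega = 1.0 / 10000 ** (torch.arange(embed_dim // 2, dtype=torch.float64) / (embed_dim / 2.0))
    out = torch.outer(pos.reshape(-1).double(), omega)
    return torch.cat([torch.sin(out), torch.cos(out)], dim=1)

def sincos_2d(embed_dim: int, h: int, w: int) -> torch.Tensor:
    rows = torch.arange(h, dtype=torch.float64).repeat_interleave(w)  # row index of each patch
    cols = torch.arange(w, dtype=torch.float64).repeat(h)             # column index of each patch
    return torch.cat([sincos_1d(embed_dim // 2, rows), sincos_1d(embed_dim // 2, cols)], dim=1)

print(sincos_2d(64, h=4, w=4).shape)  # torch.Size([16, 64])
```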
- """ - if embed_dim % 2 != 0: - raise ValueError("embed_dim must be divisible by 2") - - omega = np.arange(embed_dim // 2, dtype=np.float64) - omega /= embed_dim / 2.0 - omega = 1.0 / 10000**omega # (D/2,) - - pos = pos.reshape(-1) # (M,) - out = np.einsum("m,d->md", pos, omega) # (M, D/2), outer product - - emb_sin = np.sin(out) # (M, D/2) - emb_cos = np.cos(out) # (M, D/2) - - emb = np.concatenate([emb_sin, emb_cos], axis=1) # (M, D) - return emb - - class Timesteps(nn.Module): def __init__(self, num_channels: int, flip_sin_to_cos: bool, downscale_freq_shift: float): super().__init__() @@ -421,7 +274,7 @@ class CogView3PlusPatchEmbed(nn.Module): self.text_proj = nn.Linear(text_hidden_size, hidden_size) pos_embed = get_2d_sincos_pos_embed( - hidden_size, pos_embed_max_size, base_size=pos_embed_max_size, output_type="pt" + hidden_size, pos_embed_max_size, base_size=pos_embed_max_size ) pos_embed = pos_embed.reshape(pos_embed_max_size, pos_embed_max_size, hidden_size) self.register_buffer("pos_embed", pos_embed.float(), persistent=False) -- Gitee From e9b1be42198f4964d6fe6d89f75d77b1d8fa8ec8 Mon Sep 17 00:00:00 2001 From: jiangmengyu Date: Thu, 26 Dec 2024 18:30:38 +0800 Subject: [PATCH 26/91] =?UTF-8?q?cogview3=E9=80=82=E9=85=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../cogview3plus/layers/embeddings.py | 1 - .../cogview3plus/layers/normalization.py | 20 ++++++++++++++-- .../cogview3plus/models/modeling_utils.py | 8 +++---- .../models/transformer_cogview3plus.py | 23 +++++++++---------- 4 files changed, 33 insertions(+), 19 deletions(-) diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/layers/embeddings.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/layers/embeddings.py index dc1c683c63..1763b3b910 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/layers/embeddings.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/layers/embeddings.py @@ -242,7 +242,6 @@ class CogView3CombinedTimestepSizeEmbeddings(nn.Module): crop_coords_proj = self.condition_proj(crop_coords.flatten()).view(crop_coords.size(0), -1) target_size_proj = self.condition_proj(target_size.flatten()).view(target_size.size(0), -1) - # (B, 3 * condition_dim) condition_proj = torch.cat([original_size_proj, crop_coords_proj, target_size_proj], dim=1) timesteps_emb = self.timestep_embedder(timesteps_proj.to(dtype=hidden_dtype)) # (B, embedding_dim) diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/layers/normalization.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/layers/normalization.py index 64dbbe058a..88fd20c378 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/layers/normalization.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/layers/normalization.py @@ -15,6 +15,7 @@ import numbers from typing import Optional, Tuple +from dataclasses import dataclass import torch import torch.nn as nn @@ -85,7 +86,20 @@ class RMSNorm(nn.Module): hidden_states = hidden_states.to(input_dtype) return hidden_states - + + +@dataclass +class ChunkParam: + gate_msa: torch.Tensor + shift_mlp: torch.Tensor + scale_mlp: torch.Tensor + gate_mlp: torch.Tensor + context: torch.Tensor + c_gate_msa: torch.Tensor + c_shift_mlp: torch.Tensor + c_scale_mlp: torch.Tensor + c_gate_mlp_again: torch.Tensor + class CogView3PlusAdaLayerNormZeroTextImage(nn.Module): r""" @@ -129,7 +143,9 @@ class 
CogView3PlusAdaLayerNormZeroTextImage(nn.Module): normed_context = self.norm_c(context) x = normed_x * (1 + scale_msa[:, None]) + shift_msa[:, None] context = normed_context * (1 + c_scale_msa[:, None]) + c_shift_msa[:, None] - return x, gate_msa, shift_mlp, scale_mlp, gate_mlp, context, c_gate_msa, c_shift_mlp, c_scale_mlp, c_gate_mlp + return x, ChunkParam( + gate_msa, shift_mlp, scale_mlp, gate_mlp, context, c_gate_msa, c_shift_mlp, c_scale_mlp, c_gate_mlp + ) class FP32LayerNorm(nn.LayerNorm): diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/modeling_utils.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/modeling_utils.py index a0740b8c67..4a2816f283 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/modeling_utils.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/modeling_utils.py @@ -126,10 +126,10 @@ def get_parameter_dtype(parameter: torch.nn.Module) -> torch.dtype: gen = parameter._named_members(get_members_fn=find_tensor_attributes) last_tuple = None - for tuple in gen: - last_tuple = tuple - if tuple[1].is_floating_point(): - return tuple[1].dtype + for current_tuple in gen: + last_tuple = current_tuple + if current_tuple [1].is_floating_point(): + return current_tuple [1].dtype if last_tuple is not None: # fallback to the last dtype diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/transformer_cogview3plus.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/transformer_cogview3plus.py index f13d71880b..d96ca3966a 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/transformer_cogview3plus.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/transformer_cogview3plus.py @@ -83,18 +83,17 @@ class CogView3PlusTransformerBlock(nn.Module): text_seq_length = encoder_hidden_states.size(1) # norm & modulate - ( - norm_hidden_states, - gate_msa, - shift_mlp, - scale_mlp, - gate_mlp, - norm_encoder_hidden_states, - c_gate_msa, - c_shift_mlp, - c_scale_mlp, - c_gate_mlp, - ) = self.norm1(hidden_states, encoder_hidden_states, emb) + norm_hidden_states, chunk_params = self.norm1(hidden_states, encoder_hidden_states, emb) + + gate_msa = chunk_params.gate_msa + shift_mlp = chunk_params.shift_mlp + scale_mlp = chunk_params.scale_mlp + gate_mlp = chunk_params.gate_mlp + norm_encoder_hidden_states = chunk_params.context + c_gate_msa = chunk_params.c_gate_msa + c_shift_mlp = chunk_params.c_shift_mlp + c_scale_mlp = chunk_params.c_scale_mlp + c_gate_mlp = chunk_params.c_gate_mlp # attention attn_hidden_states, attn_encoder_hidden_states = self.attn1( -- Gitee From 883c4f55a505d64022d54f11889027fb1814aebe Mon Sep 17 00:00:00 2001 From: jiangmengyu Date: Thu, 26 Dec 2024 18:57:25 +0800 Subject: [PATCH 27/91] =?UTF-8?q?cogview3=E9=80=82=E9=85=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../foundation/cogview3/cogview3plus/layers/normalization.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/layers/normalization.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/layers/normalization.py index 88fd20c378..2ead694b9f 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/layers/normalization.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/layers/normalization.py @@ -98,7 +98,7 @@ class 
ChunkParam: c_gate_msa: torch.Tensor c_shift_mlp: torch.Tensor c_scale_mlp: torch.Tensor - c_gate_mlp_again: torch.Tensor + c_gate_mlp: torch.Tensor class CogView3PlusAdaLayerNormZeroTextImage(nn.Module): -- Gitee From 33cea1c2ce471157b16c6e6fd4a8266fb251c26b Mon Sep 17 00:00:00 2001 From: jiangmengyu Date: Thu, 26 Dec 2024 19:10:03 +0800 Subject: [PATCH 28/91] =?UTF-8?q?cogview3=E9=80=82=E9=85=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../cogview3plus/models/attention_processor.py | 14 -------------- .../models/transformer_cogview3plus.py | 10 ++++------ .../cogview3plus/pipeline/pipeline_cogview3plus.py | 4 +--- 3 files changed, 5 insertions(+), 23 deletions(-) diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/attention_processor.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/attention_processor.py index de7a2a130f..aa2961cf27 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/attention_processor.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/attention_processor.py @@ -229,20 +229,6 @@ class Attention(nn.Module): self.processor = processor - def get_processor(self, return_deprecated_lora: bool = False) -> "AttentionProcessor": - r""" - Get the attention processor in use. - - Args: - return_deprecated_lora (`bool`, *optional*, defaults to `False`): - Set to `True` to return the deprecated LoRA attention processor. - - Returns: - "AttentionProcessor": The attention processor in use. - """ - if not return_deprecated_lora: - return self.processor - def forward( self, hidden_states: torch.Tensor, diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/transformer_cogview3plus.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/transformer_cogview3plus.py index d96ca3966a..6df2dc361e 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/transformer_cogview3plus.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/transformer_cogview3plus.py @@ -14,6 +14,7 @@ from typing import Any, Dict, Union +from dataclasses import dataclass import torch import torch.nn as nn @@ -291,14 +292,14 @@ class CogView3PlusTransformer2DModel(ModelMixin, ConfigMixin): def forward( self, - hidden_states: torch.Tensor, - encoder_hidden_states: torch.Tensor, + states, timestep: torch.LongTensor, original_size: torch.Tensor, target_size: torch.Tensor, crop_coords: torch.Tensor, - return_dict: bool = True, ) -> Union[torch.Tensor, Transformer2DModelOutput]: + hidden_states = states[0] + encoder_hidden_states = states[1] height, width = hidden_states.shape[-2:] text_seq_length = encoder_hidden_states.shape[1] @@ -350,7 +351,4 @@ class CogView3PlusTransformer2DModel(ModelMixin, ConfigMixin): shape=(hidden_states.shape[0], self.out_channels, height * patch_size, width * patch_size) ) - if not return_dict: - return (output,) - return Transformer2DModelOutput(sample=output) diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py index 2e2ec1c8e8..cd6ddff267 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py @@ -583,13 +583,11 @@ class 
CogView3PlusPipeline(DiffusionPipeline): # predict noise model_output noise_pred = self.transformer( - hidden_states=latent_model_input, - encoder_hidden_states=prompt_embeds, + states=(latent_model_input, prompt_embeds), timestep=timestep, original_size=original_size, target_size=target_size, crop_coords=crops_coords_top_left, - return_dict=False, )[0] noise_pred = noise_pred.float() -- Gitee From 2f1992c42688c315a5540e51ece1c75db6907b90 Mon Sep 17 00:00:00 2001 From: jiangmengyu Date: Thu, 26 Dec 2024 19:17:00 +0800 Subject: [PATCH 29/91] =?UTF-8?q?cogview3=E9=80=82=E9=85=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../models/attention_processor.py | 6 - .../cogview3plus/models/modeling_utils.py | 160 +----------------- .../pipeline/pipeline_cogview3plus.py | 28 --- 3 files changed, 2 insertions(+), 192 deletions(-) diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/attention_processor.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/attention_processor.py index aa2961cf27..d20baee1ec 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/attention_processor.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/attention_processor.py @@ -301,16 +301,10 @@ class Attention(nn.Module): current_length: int = attention_mask.shape[-1] if current_length != target_length: if attention_mask.device.type == "mps": - # HACK: MPS: Does not support padding by greater than dimension of input tensor. - # Instead, we can manually construct the padding tensor. padding_shape = (attention_mask.shape[0], attention_mask.shape[1], target_length) padding = torch.zeros(padding_shape, dtype=attention_mask.dtype, device=attention_mask.device) attention_mask = torch.cat([attention_mask, padding], dim=2) else: - # TODO: for pipelines such as stable-diffusion, padding cross-attn mask: - # we want to instead pad by (0, remaining_length), where remaining_length is: - # remaining_length: int = target_length - current_length - # TODO: re-enable tests/models/test_models_unet_2d_condition.py#test_model_xattn_padding attention_mask = F.pad(attention_mask, (0, target_length), value=0.0) if out_dim == 3: diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/modeling_utils.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/modeling_utils.py index 4a2816f283..7895c5e1ea 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/modeling_utils.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/modeling_utils.py @@ -316,161 +316,6 @@ class ModelMixin(torch.nn.Module, PushToHubMixin): """ self.set_use_memory_efficient_attention_xformers(False) - def save_pretrained( - self, - save_directory: Union[str, os.PathLike], - is_main_process: bool = True, - save_function: Optional[Callable] = None, - safe_serialization: bool = True, - variant: Optional[str] = None, - max_shard_size: Union[int, str] = "10GB", - push_to_hub: bool = False, - **kwargs, - ): - """ - Save a model and its configuration file to a directory so that it can be reloaded using the - [`~models.ModelMixin.from_pretrained`] class method. - - Arguments: - save_directory (`str` or `os.PathLike`): - Directory to save a model and its configuration file to. Will be created if it doesn't exist. - is_main_process (`bool`, *optional*, defaults to `True`): - Whether the process calling this is the main process or not. 
Useful during distributed training and you - need to call this function on all processes. In this case, set `is_main_process=True` only on the main - process to avoid race conditions. - save_function (`Callable`): - The function to use to save the state dictionary. Useful during distributed training when you need to - replace `torch.save` with another method. Can be configured with the environment variable - `DIFFUSERS_SAVE_MODE`. - safe_serialization (`bool`, *optional*, defaults to `True`): - Whether to save the model using `safetensors` or the traditional PyTorch way with `pickle`. - variant (`str`, *optional*): - If specified, weights are saved in the format `pytorch_model..bin`. - max_shard_size (`int` or `str`, defaults to `"10GB"`): - The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size - lower than this size. If expressed as a string, needs to be digits followed by a unit (like `"5GB"`). - If expressed as an integer, the unit is bytes. Note that this limit will be decreased after a certain - period of time (starting from Oct 2024) to allow users to upgrade to the latest version of `diffusers`. - This is to establish a common default size for this argument across different libraries in the Hugging - Face ecosystem (`transformers`, and `accelerate`, for example). - push_to_hub (`bool`, *optional*, defaults to `False`): - Whether or not to push your model to the Hugging Face Hub after saving it. You can specify the - repository you want to push to with `repo_id` (will default to the name of `save_directory` in your - namespace). - kwargs (`Dict[str, Any]`, *optional*): - Additional keyword arguments passed along to the [`~utils.PushToHubMixin.push_to_hub`] method. - """ - if os.path.isfile(save_directory): - logger.error(f"Provided path ({save_directory}) should be a directory, not a file") - return - - hf_quantizer = getattr(self, "hf_quantizer", None) - if hf_quantizer is not None: - quantization_serializable = ( - hf_quantizer is not None - and isinstance(hf_quantizer, DiffusersQuantizer) - and hf_quantizer.is_serializable - ) - if not quantization_serializable: - raise ValueError( - f"The model is quantized with {hf_quantizer.quantization_config.quant_method} and is not serializable - check out the warnings from" - " the logger on the traceback to understand the reason why the quantized model is not serializable." 
- ) - - weights_name = SAFETENSORS_WEIGHTS_NAME if safe_serialization else WEIGHTS_NAME - weights_name = _add_variant(weights_name, variant) - weights_name_pattern = weights_name.replace(".bin", "{suffix}.bin").replace( - ".safetensors", "{suffix}.safetensors" - ) - - os.makedirs(save_directory, exist_ok=True) - - if push_to_hub: - commit_message = kwargs.pop("commit_message", None) - private = kwargs.pop("private", None) - create_pr = kwargs.pop("create_pr", False) - token = kwargs.pop("token", None) - repo_id = kwargs.pop("repo_id", save_directory.split(os.path.sep)[-1]) - repo_id = create_repo(repo_id, exist_ok=True, private=private, token=token).repo_id - - # Only save the model itself if we are using distributed training - model_to_save = self - - # Attach architecture to the config - # Save the config - if is_main_process: - model_to_save.save_config(save_directory) - - # Save the model - state_dict = model_to_save.state_dict() - - # Save the model - state_dict_split = split_torch_state_dict_into_shards( - state_dict, max_shard_size=max_shard_size, filename_pattern=weights_name_pattern - ) - - # Clean the folder from a previous save - if is_main_process: - for filename in os.listdir(save_directory): - if filename in state_dict_split.filename_to_tensors.keys(): - continue - full_filename = os.path.join(save_directory, filename) - if not os.path.isfile(full_filename): - continue - weights_without_ext = weights_name_pattern.replace(".bin", "").replace(".safetensors", "") - weights_without_ext = weights_without_ext.replace("{suffix}", "") - filename_without_ext = filename.replace(".bin", "").replace(".safetensors", "") - # make sure that file to be deleted matches format of sharded file, e.g. pytorch_model-00001-of-00005 - if ( - filename.startswith(weights_without_ext) - and _REGEX_SHARD.fullmatch(filename_without_ext) is not None - ): - os.remove(full_filename) - - for filename, tensors in state_dict_split.filename_to_tensors.items(): - shard = {tensor: state_dict[tensor] for tensor in tensors} - filepath = os.path.join(save_directory, filename) - if safe_serialization: - # At some point we will need to deal better with save_function (used for TPU and other distributed - # joyfulness), but for now this enough. - safetensors.torch.save_file(shard, filepath, metadata={"format": "pt"}) - else: - torch.save(shard, filepath) - - if state_dict_split.is_sharded: - index = { - "metadata": state_dict_split.metadata, - "weight_map": state_dict_split.tensor_to_filename, - } - save_index_file = SAFE_WEIGHTS_INDEX_NAME if safe_serialization else WEIGHTS_INDEX_NAME - save_index_file = os.path.join(save_directory, _add_variant(save_index_file, variant)) - # Save the index as well - with open(save_index_file, "w", encoding="utf-8") as f: - content = json.dumps(index, indent=2, sort_keys=True) + "\n" - f.write(content) - logger.info( - f"The model is bigger than the maximum size per checkpoint ({max_shard_size}) and is going to be " - f"split in {len(state_dict_split.filename_to_tensors)} checkpoint shards. You can find where each parameters has been saved in the " - f"index located at {save_index_file}." 
- ) - else: - path_to_weights = os.path.join(save_directory, weights_name) - logger.info(f"Model weights saved in {path_to_weights}") - - if push_to_hub: - # Create a new empty model card and eventually tag it - model_card = load_or_create_model_card(repo_id, token=token) - model_card = populate_model_card(model_card) - model_card.save(Path(save_directory, "README.md").as_posix()) - - self._upload_folder( - save_directory, - repo_id, - token=token, - commit_message=commit_message, - create_pr=create_pr, - ) - def dequantize(self): """ Potentially dequantize the model in case it has been quantized by a quantization method that support @@ -652,11 +497,11 @@ class ModelMixin(torch.nn.Module, PushToHubMixin): elif isinstance(device_map, str) and device_map not in ["auto", "balanced", "balanced_low_0", "sequential"]: try: device_map = {"": torch.device(device_map)} - except RuntimeError: + except RuntimeError as e: raise ValueError( "When passing device_map as a string, the value needs to be a device name (e.g. cpu, cuda:0) or " f"'auto', 'balanced', 'balanced_low_0', 'sequential' but found {device_map}." - ) + ) from e elif isinstance(device_map, int): if device_map < 0: raise ValueError( @@ -882,7 +727,6 @@ class ModelMixin(torch.nn.Module, PushToHubMixin): # It would error out during the `validate_environment()` call above in the absence of cuda. if hf_quantizer is None: param_device = "cpu" - # TODO (sayakpaul, SunMarc): remove this after model loading refactor else: param_device = torch.device(torch.cuda.current_device()) state_dict = load_state_dict(model_file, variant=variant) diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py index cd6ddff267..a6d0fbd5f4 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py @@ -193,34 +193,6 @@ class CogView3PlusPipeline(DiffusionPipeline): device: Optional[torch.device] = None, dtype: Optional[torch.dtype] = None, ): - r""" - Encodes the prompt into text encoder hidden states. - - Args: - prompt (`str` or `List[str]`, *optional*): - prompt to be encoded - negative_prompt (`str` or `List[str]`, *optional*): - The prompt or prompts not to guide the image generation. If not defined, one has to pass - `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is - less than `1`). - do_classifier_free_guidance (`bool`, *optional*, defaults to `True`): - Whether to use classifier free guidance or not. - num_images_per_prompt (`int`, *optional*, defaults to 1): - Number of images that should be generated per prompt. torch device to place the resulting embeddings on - prompt_embeds (`torch.Tensor`, *optional*): - Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not - provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.Tensor`, *optional*): - Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt - weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input - argument. - max_sequence_length (`int`, defaults to `224`): - Maximum sequence length in encoded prompt. Can be set to other values but may lead to poorer results. 
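In terms of what `max_sequence_length` controls, prompt encoding with the T5 components listed in `model_index.json` conceptually looks like the sketch below; the weight path is a placeholder and this mirrors the common diffusers tokenize-then-encode pattern rather than quoting `encode_prompt` itself:

```python
import torch
from transformers import T5EncoderModel, T5Tokenizer

model_path = "/data/CogView3B"  # placeholder: same layout as described in the README
tokenizer = T5Tokenizer.from_pretrained(model_path, subfolder="tokenizer")
text_encoder = T5EncoderModel.from_pretrained(model_path, subfolder="text_encoder", torch_dtype=torch.bfloat16)

inputs = tokenizer(
    ["A vibrant cherry red sports car under the gleaming sun"],
    padding="max_length",
    max_length=224,  # the pipeline's max_sequence_length default
    truncation=True,
    return_tensors="pt",
)
prompt_embeds = text_encoder(inputs.input_ids)[0]  # (batch, 224, hidden_size)
```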
- device: (`torch.device`, *optional*): - torch device - dtype: (`torch.dtype`, *optional*): - torch dtype - """ device = device or self._execution_device prompt = [prompt] if isinstance(prompt, str) else prompt -- Gitee From bf7d28c49747d94610656eb29426c134e1b45b90 Mon Sep 17 00:00:00 2001 From: jiangmengyu Date: Thu, 26 Dec 2024 20:17:31 +0800 Subject: [PATCH 30/91] =?UTF-8?q?cogview3=E9=80=82=E9=85=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../cogview3/cogview3plus/models/modeling_utils.py | 6 +++--- .../cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py | 2 ++ 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/modeling_utils.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/modeling_utils.py index 7895c5e1ea..f83eff493f 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/modeling_utils.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/modeling_utils.py @@ -126,10 +126,10 @@ def get_parameter_dtype(parameter: torch.nn.Module) -> torch.dtype: gen = parameter._named_members(get_members_fn=find_tensor_attributes) last_tuple = None - for current_tuple in gen: + for current_tuple in gen: last_tuple = current_tuple - if current_tuple [1].is_floating_point(): - return current_tuple [1].dtype + if current_tuple[1].is_floating_point(): + return current_tuple[1].dtype if last_tuple is not None: # fallback to the last dtype diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py index a6d0fbd5f4..7368a528a3 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py @@ -451,6 +451,8 @@ class CogView3PlusPipeline(DiffusionPipeline): [`~pipelines.cogview3.pipeline_cogview3plus.CogView3PipelineOutput`] if `return_dict` is True, otherwise a `tuple`. When returning a tuple, the first element is a list with the generated images. 
""" + prompt = prompt if prompt is not None else [] + negative_prompt = negative_prompt if negative_prompt is not None else [] if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)): callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs -- Gitee From 344e429078093050d7b2f15d71e593eaead03f51 Mon Sep 17 00:00:00 2001 From: jiangmengyu Date: Thu, 26 Dec 2024 20:23:19 +0800 Subject: [PATCH 31/91] =?UTF-8?q?cogview3=E9=80=82=E9=85=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../schedulers/scheduling_ddim_cogvideox.py | 3 --- .../schedulers/scheduling_dpm_cogvideox.py | 3 --- .../foundation/cogview3/cogview3plus/vae/vae.py | 16 ++++++++-------- 3 files changed, 8 insertions(+), 14 deletions(-) diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/schedulers/scheduling_ddim_cogvideox.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/schedulers/scheduling_ddim_cogvideox.py index b4e22a0615..c2464cf51b 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/schedulers/scheduling_ddim_cogvideox.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/schedulers/scheduling_ddim_cogvideox.py @@ -280,13 +280,10 @@ class CogVideoXDDIMScheduler(SchedulerMixin, ConfigMixin): # 3. compute predicted original sample from predicted noise also called if self.config.prediction_type == "epsilon": pred_original_sample = (sample - beta_prod_t ** (0.5) * model_output) / alpha_prod_t ** (0.5) - # pred_epsilon = model_output elif self.config.prediction_type == "sample": pred_original_sample = model_output - # pred_epsilon = (sample - alpha_prod_t ** (0.5) * pred_original_sample) / beta_prod_t ** (0.5) elif self.config.prediction_type == "v_prediction": pred_original_sample = (alpha_prod_t**0.5) * sample - (beta_prod_t**0.5) * model_output - # pred_epsilon = (alpha_prod_t**0.5) * model_output + (beta_prod_t**0.5) * sample else: raise ValueError( f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, `sample`, or" diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/schedulers/scheduling_dpm_cogvideox.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/schedulers/scheduling_dpm_cogvideox.py index 6d25dea524..5486e7cb93 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/schedulers/scheduling_dpm_cogvideox.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/schedulers/scheduling_dpm_cogvideox.py @@ -401,13 +401,10 @@ class CogVideoXDPMScheduler(SchedulerMixin, ConfigMixin): # To make style tests pass, commented out `pred_epsilon` as it is an unused variable if self.config.prediction_type == "epsilon": pred_original_sample = (sample - beta_prod_t ** (0.5) * model_output) / alpha_prod_t ** (0.5) - # pred_epsilon = model_output elif self.config.prediction_type == "sample": pred_original_sample = model_output - # pred_epsilon = (sample - alpha_prod_t ** (0.5) * pred_original_sample) / beta_prod_t ** (0.5) elif self.config.prediction_type == "v_prediction": pred_original_sample = (alpha_prod_t**0.5) * sample - (beta_prod_t**0.5) * model_output - # pred_epsilon = (alpha_prod_t**0.5) * model_output + (beta_prod_t**0.5) * sample else: raise ValueError( f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, `sample`, or" diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/vae/vae.py 
b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/vae/vae.py index 006ed75f1f..2323be5a78 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/vae/vae.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/vae/vae.py @@ -420,9 +420,9 @@ class MaskConditionEncoder(nn.Module): layers = [] in_ch_ = in_ch - for l in range(len(out_channels)): - out_ch_ = out_channels[l] - if l == 0 or l == 1: + for i, _ in enumerate(out_channels): + out_ch_ = out_channels[i] + if i == 0 or i == 1: layers.append(nn.Conv2d(in_ch_, out_ch_, kernel_size=3, stride=1, padding=1)) else: layers.append(nn.Conv2d(in_ch_, out_ch_, kernel_size=4, stride=2, padding=1)) @@ -433,8 +433,8 @@ class MaskConditionEncoder(nn.Module): def forward(self, x: torch.Tensor, mask=None) -> torch.Tensor: r"""The forward method of the `MaskConditionEncoder` class.""" out = {} - for l in range(len(self.layers)): - layer = self.layers[l] + for i, _ in enumerate(self.layers): + layer = self.layers[i] x = layer(x) out[str(tuple(x.shape))] = x x = torch.relu(x) @@ -703,7 +703,6 @@ class VectorQuantizer(nn.Module): def remap_to_used(self, inds: torch.LongTensor) -> torch.LongTensor: ishape = inds.shape - assert len(ishape) > 1 inds = inds.reshape(ishape[0], -1) used = self.used.to(inds) match = (inds[:, :, None] == used[None, None, ...]).long() @@ -717,7 +716,6 @@ class VectorQuantizer(nn.Module): def unmap_to_all(self, inds: torch.LongTensor) -> torch.LongTensor: ishape = inds.shape - assert len(ishape) > 1 inds = inds.reshape(ishape[0], -1) used = self.used.to(inds) if self.re_embed > self.used.shape[0]: # extra token @@ -820,7 +818,9 @@ class DiagonalGaussianDistribution(object): dim=[1, 2, 3], ) - def nll(self, sample: torch.Tensor, dims: Tuple[int, ...] = [1, 2, 3]) -> torch.Tensor: + def nll(self, sample: torch.Tensor, dims: Tuple[int, ...] = None) -> torch.Tensor: + if dims is None: + dims = [1, 2, 3] if self.deterministic: return torch.Tensor([0.0]) logtwopi = np.log(2.0 * np.pi) -- Gitee From 97a6dd950ad99df7e73b89f58fe19ad78aafbc02 Mon Sep 17 00:00:00 2001 From: jiangmengyu Date: Thu, 26 Dec 2024 20:24:28 +0800 Subject: [PATCH 32/91] =?UTF-8?q?cogview3=E9=80=82=E9=85=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py index 7368a528a3..7e47fcd6aa 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py @@ -451,9 +451,6 @@ class CogView3PlusPipeline(DiffusionPipeline): [`~pipelines.cogview3.pipeline_cogview3plus.CogView3PipelineOutput`] if `return_dict` is True, otherwise a `tuple`. When returning a tuple, the first element is a list with the generated images. 
""" - prompt = prompt if prompt is not None else [] - negative_prompt = negative_prompt if negative_prompt is not None else [] - if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)): callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs -- Gitee From b9c42ba0ba1cdf53b1796b3ef0a153ffa3dfdda9 Mon Sep 17 00:00:00 2001 From: jiangmengyu Date: Fri, 27 Dec 2024 10:03:36 +0800 Subject: [PATCH 33/91] =?UTF-8?q?cogview3=E9=80=82=E9=85=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py index 7e47fcd6aa..cfaca19fad 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py @@ -210,8 +210,9 @@ class CogView3PlusPipeline(DiffusionPipeline): dtype=dtype, ) - if do_classifier_free_guidance and negative_prompt is None: - negative_prompt_embeds = prompt_embeds.new_zeros(prompt_embeds.shape) + negative_prompt_embeds = prompt_embeds.new_zeros(prompt_embeds.shape) + print(negative_prompt_embeds) + exit() if do_classifier_free_guidance and negative_prompt_embeds is None: negative_prompt = batch_size * [negative_prompt] if isinstance(negative_prompt, str) else negative_prompt -- Gitee From 3d95b863b0d6270943c6e05fb57f96c9a7f687bd Mon Sep 17 00:00:00 2001 From: jiangmengyu Date: Fri, 27 Dec 2024 10:05:32 +0800 Subject: [PATCH 34/91] =?UTF-8?q?cogview3=E9=80=82=E9=85=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py index cfaca19fad..3d65d2bbd2 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py @@ -211,10 +211,11 @@ class CogView3PlusPipeline(DiffusionPipeline): ) negative_prompt_embeds = prompt_embeds.new_zeros(prompt_embeds.shape) - print(negative_prompt_embeds) - exit() + if do_classifier_free_guidance and negative_prompt_embeds is None: + print(negative_prompt_embeds) + exit() negative_prompt = batch_size * [negative_prompt] if isinstance(negative_prompt, str) else negative_prompt if prompt is not None and type(prompt) is not type(negative_prompt): -- Gitee From 3b521c17fff9c2c4f90cac7741a88f13eeca7d8a Mon Sep 17 00:00:00 2001 From: jiangmengyu Date: Fri, 27 Dec 2024 10:34:50 +0800 Subject: [PATCH 35/91] =?UTF-8?q?cogview3=E9=80=82=E9=85=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../pipeline/pipeline_cogview3plus.py | 142 +----------------- 1 file changed, 7 insertions(+), 135 deletions(-) diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py 
b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py index 3d65d2bbd2..df6459f818 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py @@ -184,11 +184,7 @@ class CogView3PlusPipeline(DiffusionPipeline): def encode_prompt( self, prompt: Union[str, List[str]], - negative_prompt: Optional[Union[str, List[str]]] = None, - do_classifier_free_guidance: bool = True, num_images_per_prompt: int = 1, - prompt_embeds: Optional[torch.Tensor] = None, - negative_prompt_embeds: Optional[torch.Tensor] = None, max_sequence_length: int = 224, device: Optional[torch.device] = None, dtype: Optional[torch.dtype] = None, @@ -196,48 +192,15 @@ class CogView3PlusPipeline(DiffusionPipeline): device = device or self._execution_device prompt = [prompt] if isinstance(prompt, str) else prompt - if prompt is not None: - batch_size = len(prompt) - else: - batch_size = prompt_embeds.shape[0] - - if prompt_embeds is None: - prompt_embeds = self._get_t5_prompt_embeds( - prompt=prompt, - num_images_per_prompt=num_images_per_prompt, - max_sequence_length=max_sequence_length, - device=device, - dtype=dtype, - ) - + prompt_embeds = self._get_t5_prompt_embeds( + prompt=prompt, + num_images_per_prompt=num_images_per_prompt, + max_sequence_length=max_sequence_length, + device=device, + dtype=dtype, + ) negative_prompt_embeds = prompt_embeds.new_zeros(prompt_embeds.shape) - - if do_classifier_free_guidance and negative_prompt_embeds is None: - print(negative_prompt_embeds) - exit() - negative_prompt = batch_size * [negative_prompt] if isinstance(negative_prompt, str) else negative_prompt - - if prompt is not None and type(prompt) is not type(negative_prompt): - raise TypeError( - f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" - f" {type(prompt)}." - ) - elif batch_size != len(negative_prompt): - raise ValueError( - f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" - f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" - " the batch size of `prompt`." - ) - - negative_prompt_embeds = self._get_t5_prompt_embeds( - prompt=negative_prompt, - num_images_per_prompt=num_images_per_prompt, - max_sequence_length=max_sequence_length, - device=device, - dtype=dtype, - ) - return prompt_embeds, negative_prompt_embeds # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents @@ -371,88 +334,6 @@ class CogView3PlusPipeline(DiffusionPipeline): callback_on_step_end_tensor_inputs: List[str] = ["latents"], max_sequence_length: int = 224, ) -> Union[CogView3PipelineOutput, Tuple]: - """ - Function invoked when calling the pipeline for generation. - - Args: - prompt (`str` or `List[str]`, *optional*): - The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`. - negative_prompt (`str` or `List[str]`, *optional*): - The prompt or prompts not to guide the image generation. If not defined, one has to pass - `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is - less than `1`). - height (`int`, *optional*, defaults to self.transformer.config.sample_size * self.vae_scale_factor): - The height in pixels of the generated image. If not provided, it is set to 1024. 
- width (`int`, *optional*, defaults to self.transformer.config.sample_size * self.vae_scale_factor): - The width in pixels of the generated image. If not provided it is set to 1024. - num_inference_steps (`int`, *optional*, defaults to `50`): - The number of denoising steps. More denoising steps usually lead to a higher quality image at the - expense of slower inference. - timesteps (`List[int]`, *optional*): - Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument - in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is - passed will be used. Must be in descending order. - guidance_scale (`float`, *optional*, defaults to `5.0`): - Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). - `guidance_scale` is defined as `w` of equation 2. of [Imagen - Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > - 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, - usually at the expense of lower image quality. - num_images_per_prompt (`int`, *optional*, defaults to `1`): - The number of images to generate per prompt. - generator (`torch.Generator` or `List[torch.Generator]`, *optional*): - One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) - to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): - Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image - generation. Can be used to tweak the same generation with different prompts. If not provided, a latents - tensor will ge generated by sampling using the supplied random `generator`. - prompt_embeds (`torch.FloatTensor`, *optional*): - Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not - provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): - Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt - weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input - argument. - original_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)): - If `original_size` is not the same as `target_size` the image will appear to be down- or upsampled. - `original_size` defaults to `(height, width)` if not specified. Part of SDXL's micro-conditioning as - explained in section 2.2 of - [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). - crops_coords_top_left (`Tuple[int]`, *optional*, defaults to (0, 0)): - `crops_coords_top_left` can be used to generate an image that appears to be "cropped" from the position - `crops_coords_top_left` downwards. Favorable, well-centered images are usually achieved by setting - `crops_coords_top_left` to (0, 0). Part of SDXL's micro-conditioning as explained in section 2.2 of - [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). - output_type (`str`, *optional*, defaults to `"pil"`): - The output format of the generate image. Choose between - [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. - return_dict (`bool`, *optional*, defaults to `True`): - Whether or not to return a [`~pipelines.stable_diffusion_xl.StableDiffusionXLPipelineOutput`] instead - of a plain tuple. 
- attention_kwargs (`dict`, *optional*): - A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under - `self.processor` in - [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py). - callback_on_step_end (`Callable`, *optional*): - A function that calls at the end of each denoising steps during the inference. The function is called - with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int, - callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by - `callback_on_step_end_tensor_inputs`. - callback_on_step_end_tensor_inputs (`List`, *optional*): - The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list - will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the - `._callback_tensor_inputs` attribute of your pipeline class. - max_sequence_length (`int`, defaults to `224`): - Maximum sequence length in encoded prompt. Can be set to other values but may lead to poorer results. - - Examples: - - Returns: - [`~pipelines.cogview3.pipeline_cogview3plus.CogView3PipelineOutput`] or `tuple`: - [`~pipelines.cogview3.pipeline_cogview3plus.CogView3PipelineOutput`] if `return_dict` is True, otherwise a - `tuple`. When returning a tuple, the first element is a list with the generated images. - """ if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)): callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs @@ -485,19 +366,10 @@ class CogView3PlusPipeline(DiffusionPipeline): device = self._execution_device - # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) - # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` - # corresponds to doing no classifier free guidance. - do_classifier_free_guidance = guidance_scale > 1.0 - # 3. Encode input prompt prompt_embeds, negative_prompt_embeds = self.encode_prompt( prompt, - negative_prompt, - self.do_classifier_free_guidance, num_images_per_prompt=num_images_per_prompt, - prompt_embeds=prompt_embeds, - negative_prompt_embeds=negative_prompt_embeds, max_sequence_length=max_sequence_length, device=device, ) -- Gitee From 92b825af6493dadb4c8c8619aac2db45d4314ab4 Mon Sep 17 00:00:00 2001 From: jiangmengyu Date: Fri, 27 Dec 2024 10:35:55 +0800 Subject: [PATCH 36/91] =?UTF-8?q?cogview3=E9=80=82=E9=85=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../pipeline/pipeline_cogview3plus.py | 83 +++++++++++++++++++ 1 file changed, 83 insertions(+) diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py index df6459f818..2e19c13d08 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py @@ -334,6 +334,89 @@ class CogView3PlusPipeline(DiffusionPipeline): callback_on_step_end_tensor_inputs: List[str] = ["latents"], max_sequence_length: int = 224, ) -> Union[CogView3PipelineOutput, Tuple]: + """ + Function invoked when calling the pipeline for generation. 
+ + Args: + prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`. + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation. If not defined, one has to pass + `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is + less than `1`). + height (`int`, *optional*, defaults to self.transformer.config.sample_size * self.vae_scale_factor): + The height in pixels of the generated image. If not provided, it is set to 1024. + width (`int`, *optional*, defaults to self.transformer.config.sample_size * self.vae_scale_factor): + The width in pixels of the generated image. If not provided it is set to 1024. + num_inference_steps (`int`, *optional*, defaults to `50`): + The number of denoising steps. More denoising steps usually lead to a higher quality image at the + expense of slower inference. + timesteps (`List[int]`, *optional*): + Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument + in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is + passed will be used. Must be in descending order. + guidance_scale (`float`, *optional*, defaults to `5.0`): + Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). + `guidance_scale` is defined as `w` of equation 2. of [Imagen + Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > + 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, + usually at the expense of lower image quality. + num_images_per_prompt (`int`, *optional*, defaults to `1`): + The number of images to generate per prompt. + generator (`torch.Generator` or `List[torch.Generator]`, *optional*): + One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) + to make generation deterministic. + latents (`torch.FloatTensor`, *optional*): + Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image + generation. Can be used to tweak the same generation with different prompts. If not provided, a latents + tensor will ge generated by sampling using the supplied random `generator`. + prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not + provided, text embeddings will be generated from `prompt` input argument. + negative_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input + argument. + original_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)): + If `original_size` is not the same as `target_size` the image will appear to be down- or upsampled. + `original_size` defaults to `(height, width)` if not specified. Part of SDXL's micro-conditioning as + explained in section 2.2 of + [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). + crops_coords_top_left (`Tuple[int]`, *optional*, defaults to (0, 0)): + `crops_coords_top_left` can be used to generate an image that appears to be "cropped" from the position + `crops_coords_top_left` downwards. 
Favorable, well-centered images are usually achieved by setting + `crops_coords_top_left` to (0, 0). Part of SDXL's micro-conditioning as explained in section 2.2 of + [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). + output_type (`str`, *optional*, defaults to `"pil"`): + The output format of the generate image. Choose between + [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~pipelines.stable_diffusion_xl.StableDiffusionXLPipelineOutput`] instead + of a plain tuple. + attention_kwargs (`dict`, *optional*): + A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under + `self.processor` in + [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py). + callback_on_step_end (`Callable`, *optional*): + A function that calls at the end of each denoising steps during the inference. The function is called + with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int, + callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by + `callback_on_step_end_tensor_inputs`. + callback_on_step_end_tensor_inputs (`List`, *optional*): + The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list + will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the + `._callback_tensor_inputs` attribute of your pipeline class. + max_sequence_length (`int`, defaults to `224`): + Maximum sequence length in encoded prompt. Can be set to other values but may lead to poorer results. + + Examples: + + Returns: + [`~pipelines.cogview3.pipeline_cogview3plus.CogView3PipelineOutput`] or `tuple`: + [`~pipelines.cogview3.pipeline_cogview3plus.CogView3PipelineOutput`] if `return_dict` is True, otherwise a + `tuple`. When returning a tuple, the first element is a list with the generated images. + """ + if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)): callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs -- Gitee From 480d05e1c79432b230fc54dc930f8be0b6e6e341 Mon Sep 17 00:00:00 2001 From: jiangmengyu Date: Fri, 27 Dec 2024 10:40:11 +0800 Subject: [PATCH 37/91] =?UTF-8?q?cogview3=E9=80=82=E9=85=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../cogview3plus/pipeline/pipeline_cogview3plus.py | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py index 2e19c13d08..e55ad153a0 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py @@ -245,7 +245,6 @@ class CogView3PlusPipeline(DiffusionPipeline): prompt, height, width, - negative_prompt, callback_on_step_end_tensor_inputs, prompt_embeds=None, negative_prompt_embeds=None, @@ -277,12 +276,6 @@ class CogView3PlusPipeline(DiffusionPipeline): f" {negative_prompt_embeds}. Please make sure to only forward one of the two." 
) - if negative_prompt is not None and negative_prompt_embeds is not None: - raise ValueError( - f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:" - f" {negative_prompt_embeds}. Please make sure to only forward one of the two." - ) - if prompt_embeds is not None and negative_prompt_embeds is not None: if prompt_embeds.shape != negative_prompt_embeds.shape: raise ValueError( @@ -312,11 +305,9 @@ class CogView3PlusPipeline(DiffusionPipeline): def __call__( self, prompt: Optional[Union[str, List[str]]] = None, - negative_prompt: Optional[Union[str, List[str]]] = None, height: Optional[int] = None, width: Optional[int] = None, num_inference_steps: int = 50, - timesteps: Optional[List[int]] = None, guidance_scale: float = 5.0, num_images_per_prompt: int = 1, eta: float = 0.0, @@ -416,7 +407,7 @@ class CogView3PlusPipeline(DiffusionPipeline): [`~pipelines.cogview3.pipeline_cogview3plus.CogView3PipelineOutput`] if `return_dict` is True, otherwise a `tuple`. When returning a tuple, the first element is a list with the generated images. """ - + if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)): callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs @@ -431,7 +422,6 @@ class CogView3PlusPipeline(DiffusionPipeline): prompt, height, width, - negative_prompt, callback_on_step_end_tensor_inputs, prompt_embeds, negative_prompt_embeds, @@ -460,7 +450,7 @@ class CogView3PlusPipeline(DiffusionPipeline): prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds], dim=0) # 4. Prepare timesteps - timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, num_inference_steps, device, timesteps) + timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, num_inference_steps, device) self._num_timesteps = len(timesteps) # 5. Prepare latents. -- Gitee From 6852ece8a01d3c43e5516192a9a901492383daa8 Mon Sep 17 00:00:00 2001 From: jiangmengyu Date: Fri, 27 Dec 2024 10:46:09 +0800 Subject: [PATCH 38/91] =?UTF-8?q?cogview3=E9=80=82=E9=85=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py index e55ad153a0..8f9ec8693f 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py @@ -310,7 +310,6 @@ class CogView3PlusPipeline(DiffusionPipeline): num_inference_steps: int = 50, guidance_scale: float = 5.0, num_images_per_prompt: int = 1, - eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, latents: Optional[torch.FloatTensor] = None, prompt_embeds: Optional[torch.FloatTensor] = None, @@ -467,7 +466,7 @@ class CogView3PlusPipeline(DiffusionPipeline): ) # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline - extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) + extra_step_kwargs = self.prepare_extra_step_kwargs(generator, 0.0) # 7. 
Prepare additional timestep conditions original_size = torch.tensor([original_size], dtype=prompt_embeds.dtype) -- Gitee From 4a8329b36925afc4447d61131ea35129f5facf7b Mon Sep 17 00:00:00 2001 From: jiangmengyu Date: Fri, 27 Dec 2024 10:49:17 +0800 Subject: [PATCH 39/91] =?UTF-8?q?cogview3=E9=80=82=E9=85=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../cogview3plus/pipeline/pipeline_cogview3plus.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py index 8f9ec8693f..efdd50f1bf 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py @@ -310,8 +310,6 @@ class CogView3PlusPipeline(DiffusionPipeline): num_inference_steps: int = 50, guidance_scale: float = 5.0, num_images_per_prompt: int = 1, - generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, prompt_embeds: Optional[torch.FloatTensor] = None, negative_prompt_embeds: Optional[torch.FloatTensor] = None, original_size: Optional[Tuple[int, int]] = None, @@ -461,12 +459,11 @@ class CogView3PlusPipeline(DiffusionPipeline): width, prompt_embeds.dtype, device, - generator, - latents, + None, ) # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline - extra_step_kwargs = self.prepare_extra_step_kwargs(generator, 0.0) + extra_step_kwargs = self.prepare_extra_step_kwargs(None, 0.0) # 7. Prepare additional timestep conditions original_size = torch.tensor([original_size], dtype=prompt_embeds.dtype) @@ -543,7 +540,7 @@ class CogView3PlusPipeline(DiffusionPipeline): progress_bar.update() if not output_type == "latent": - image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False, generator=generator)[ + image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False, generator=None)[ 0 ] else: -- Gitee From 9dc08db22da818b28fb29dece76912ecf07d7ce9 Mon Sep 17 00:00:00 2001 From: jiangmengyu Date: Fri, 27 Dec 2024 10:55:08 +0800 Subject: [PATCH 40/91] =?UTF-8?q?cogview3=E9=80=82=E9=85=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../pipeline/pipeline_cogview3plus.py | 44 ++----------------- 1 file changed, 3 insertions(+), 41 deletions(-) diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py index efdd50f1bf..10aca046d7 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py @@ -245,45 +245,13 @@ class CogView3PlusPipeline(DiffusionPipeline): prompt, height, width, - callback_on_step_end_tensor_inputs, - prompt_embeds=None, - negative_prompt_embeds=None, ): if height % 8 != 0 or width % 8 != 0: raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") - if callback_on_step_end_tensor_inputs is not None and not all( - k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs - 
): - raise ValueError( - f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}" - ) - if prompt is not None and prompt_embeds is not None: - raise ValueError( - f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" - " only forward one of the two." - ) - elif prompt is None and prompt_embeds is None: - raise ValueError( - "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." - ) - elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): + if prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") - if prompt is not None and negative_prompt_embeds is not None: - raise ValueError( - f"Cannot forward both `prompt`: {prompt} and `negative_prompt_embeds`:" - f" {negative_prompt_embeds}. Please make sure to only forward one of the two." - ) - - if prompt_embeds is not None and negative_prompt_embeds is not None: - if prompt_embeds.shape != negative_prompt_embeds.shape: - raise ValueError( - "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" - f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" - f" {negative_prompt_embeds.shape}." - ) - @property def guidance_scale(self): return self._guidance_scale @@ -310,7 +278,6 @@ class CogView3PlusPipeline(DiffusionPipeline): num_inference_steps: int = 50, guidance_scale: float = 5.0, num_images_per_prompt: int = 1, - prompt_embeds: Optional[torch.FloatTensor] = None, negative_prompt_embeds: Optional[torch.FloatTensor] = None, original_size: Optional[Tuple[int, int]] = None, crops_coords_top_left: Tuple[int, int] = (0, 0), @@ -419,20 +386,15 @@ class CogView3PlusPipeline(DiffusionPipeline): prompt, height, width, - callback_on_step_end_tensor_inputs, - prompt_embeds, - negative_prompt_embeds, ) self._guidance_scale = guidance_scale self._interrupt = False # 2. 
Default call parameters - if prompt is not None and isinstance(prompt, str): + if isinstance(prompt, str): batch_size = 1 - elif prompt is not None and isinstance(prompt, list): - batch_size = len(prompt) else: - batch_size = prompt_embeds.shape[0] + batch_size = len(prompt) device = self._execution_device -- Gitee From 96f8a2516cd943c9291b9abd79a97fe5de105f68 Mon Sep 17 00:00:00 2001 From: jiangmengyu Date: Fri, 27 Dec 2024 11:00:41 +0800 Subject: [PATCH 41/91] =?UTF-8?q?cogview3=E9=80=82=E9=85=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../pipeline/pipeline_cogview3plus.py | 43 +++---------------- 1 file changed, 5 insertions(+), 38 deletions(-) diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py index 10aca046d7..33b01feae5 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py @@ -278,16 +278,6 @@ class CogView3PlusPipeline(DiffusionPipeline): num_inference_steps: int = 50, guidance_scale: float = 5.0, num_images_per_prompt: int = 1, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, - original_size: Optional[Tuple[int, int]] = None, - crops_coords_top_left: Tuple[int, int] = (0, 0), - output_type: str = "pil", - return_dict: bool = True, - callback_on_step_end: Optional[ - Union[Callable[[int, int, Dict], None], PipelineCallback, MultiPipelineCallbacks] - ] = None, - callback_on_step_end_tensor_inputs: List[str] = ["latents"], - max_sequence_length: int = 224, ) -> Union[CogView3PipelineOutput, Tuple]: """ Function invoked when calling the pipeline for generation. @@ -372,13 +362,10 @@ class CogView3PlusPipeline(DiffusionPipeline): `tuple`. When returning a tuple, the first element is a list with the generated images. """ - if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)): - callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs - height = height or self.transformer.config.sample_size * self.vae_scale_factor width = width or self.transformer.config.sample_size * self.vae_scale_factor - original_size = original_size or (height, width) + original_size = (height, width) target_size = (height, width) # 1. Check inputs. Raise error if not correct @@ -402,7 +389,7 @@ class CogView3PlusPipeline(DiffusionPipeline): prompt_embeds, negative_prompt_embeds = self.encode_prompt( prompt, num_images_per_prompt=num_images_per_prompt, - max_sequence_length=max_sequence_length, + max_sequence_length=224, device=device, ) if self.do_classifier_free_guidance: @@ -430,7 +417,7 @@ class CogView3PlusPipeline(DiffusionPipeline): # 7. 
Prepare additional timestep conditions original_size = torch.tensor([original_size], dtype=prompt_embeds.dtype) target_size = torch.tensor([target_size], dtype=prompt_embeds.dtype) - crops_coords_top_left = torch.tensor([crops_coords_top_left], dtype=prompt_embeds.dtype) + crops_coords_top_left = torch.tensor([(0, 0)], dtype=prompt_embeds.dtype) if self.do_classifier_free_guidance: original_size = torch.cat([original_size, original_size]) @@ -487,33 +474,13 @@ class CogView3PlusPipeline(DiffusionPipeline): ) latents = latents.to(prompt_embeds.dtype) - # call the callback, if provided - if callback_on_step_end is not None: - callback_kwargs = {} - for k in callback_on_step_end_tensor_inputs: - callback_kwargs[k] = locals()[k] - callback_outputs = callback_on_step_end(self, i, t, callback_kwargs) - - latents = callback_outputs.pop("latents", latents) - prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds) - negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds) - if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): progress_bar.update() - if not output_type == "latent": - image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False, generator=None)[ - 0 - ] - else: - image = latents - - image = self.image_processor.postprocess(image, output_type=output_type) + image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False, generator=None)[0] + image = self.image_processor.postprocess(image, output_type="pil") # Offload all models self.maybe_free_model_hooks() - if not return_dict: - return (image,) - return CogView3PipelineOutput(images=image) \ No newline at end of file -- Gitee From 6f80df407ba4cba57ae3f4bcbee49f3ee4fab148 Mon Sep 17 00:00:00 2001 From: jiangmengyu Date: Fri, 27 Dec 2024 11:04:15 +0800 Subject: [PATCH 42/91] =?UTF-8?q?cogview3=E9=80=82=E9=85=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../cogview3plus/pipeline/pipeline_cogview3plus.py | 12 +++++++----- .../foundation/cogview3/inference_cogview3plus.py | 3 +-- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py index 33b01feae5..5cac1be1f1 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py @@ -273,8 +273,7 @@ class CogView3PlusPipeline(DiffusionPipeline): def __call__( self, prompt: Optional[Union[str, List[str]]] = None, - height: Optional[int] = None, - width: Optional[int] = None, + image_size: Tuple[int, int] = None, num_inference_steps: int = 50, guidance_scale: float = 5.0, num_images_per_prompt: int = 1, @@ -361,9 +360,12 @@ class CogView3PlusPipeline(DiffusionPipeline): [`~pipelines.cogview3.pipeline_cogview3plus.CogView3PipelineOutput`] if `return_dict` is True, otherwise a `tuple`. When returning a tuple, the first element is a list with the generated images. 
""" - - height = height or self.transformer.config.sample_size * self.vae_scale_factor - width = width or self.transformer.config.sample_size * self.vae_scale_factor + if image_size is None: + height = self.transformer.config.sample_size * self.vae_scale_factor + width = self.transformer.config.sample_size * self.vae_scale_factor + else: + height = image_size[0] + width = image_size[1] original_size = (height, width) target_size = (height, width) diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/inference_cogview3plus.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/inference_cogview3plus.py index 5c12695d6c..d24d3f29c4 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/inference_cogview3plus.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/inference_cogview3plus.py @@ -82,8 +82,7 @@ def infer(args): guidance_scale=args.guidance_scale, num_images_per_prompt=args.num_images_per_prompt, num_inference_steps=args.num_inference_steps, - width=args.width, - height=args.height, + image_size=(args.height, args.width), ).images[0] if i >= 2: -- Gitee From ee21249c88b30bf7f7173f5a95b63ab11ef33a66 Mon Sep 17 00:00:00 2001 From: jiangmengyu Date: Fri, 27 Dec 2024 11:09:48 +0800 Subject: [PATCH 43/91] =?UTF-8?q?cogview3=E9=80=82=E9=85=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../pipeline/pipeline_cogview3plus.py | 19 +++++-------------- 1 file changed, 5 insertions(+), 14 deletions(-) diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py index 5cac1be1f1..d90c00653f 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py @@ -204,23 +204,16 @@ class CogView3PlusPipeline(DiffusionPipeline): return prompt_embeds, negative_prompt_embeds # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents - def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None): + def prepare_latents(self, batch_size, num_channels_latents, image_size, dtype, device): + height = image_size[0] + width = image_size[1] shape = ( batch_size, num_channels_latents, int(height) // self.vae_scale_factor, int(width) // self.vae_scale_factor, ) - if isinstance(generator, list) and len(generator) != batch_size: - raise ValueError( - f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" - f" size of {batch_size}. Make sure the batch size matches the length of the generators." - ) - - if latents is None: - latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype) - else: - latents = latents.to(device) + latents = randn_tensor(shape, device=device, dtype=dtype) # scale the initial noise by the standard deviation required by the scheduler latents = latents * self.scheduler.init_noise_sigma @@ -406,11 +399,9 @@ class CogView3PlusPipeline(DiffusionPipeline): latents = self.prepare_latents( batch_size * num_images_per_prompt, latent_channels, - height, - width, + (height, width), prompt_embeds.dtype, device, - None, ) # 6. Prepare extra step kwargs. 
TODO: Logic should ideally just be moved out of the pipeline -- Gitee From 006c8fd91a68f198e8537b5265507dd7d6fd4eb7 Mon Sep 17 00:00:00 2001 From: jiangmengyu Date: Fri, 27 Dec 2024 11:12:18 +0800 Subject: [PATCH 44/91] =?UTF-8?q?cogview3=E9=80=82=E9=85=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py | 1 - .../cogview3plus/schedulers/scheduling_ddim_cogvideox.py | 4 ---- 2 files changed, 5 deletions(-) diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py index d90c00653f..a2c8d0f37f 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py @@ -404,7 +404,6 @@ class CogView3PlusPipeline(DiffusionPipeline): device, ) - # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline extra_step_kwargs = self.prepare_extra_step_kwargs(None, 0.0) # 7. Prepare additional timestep conditions diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/schedulers/scheduling_ddim_cogvideox.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/schedulers/scheduling_ddim_cogvideox.py index c2464cf51b..dbe9d4b17f 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/schedulers/scheduling_ddim_cogvideox.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/schedulers/scheduling_ddim_cogvideox.py @@ -258,10 +258,6 @@ class CogVideoXDDIMScheduler(SchedulerMixin, ConfigMixin): model_output: torch.Tensor, timestep: int, sample: torch.Tensor, - eta: float = 0.0, - use_clipped_model_output: bool = False, - generator=None, - variance_noise: Optional[torch.Tensor] = None, return_dict: bool = True, ) -> Union[DDIMSchedulerOutput, Tuple]: if self.num_inference_steps is None: -- Gitee From df5d6fb3c589bb87dfbdab100a319b26322f20b6 Mon Sep 17 00:00:00 2001 From: jiangmengyu Date: Fri, 27 Dec 2024 11:18:11 +0800 Subject: [PATCH 45/91] =?UTF-8?q?cogview3=E9=80=82=E9=85=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- MindIE/MindIE-Torch/built-in/foundation/cogview3/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/README.md b/MindIE/MindIE-Torch/built-in/foundation/cogview3/README.md index dfcf259a0b..495359c497 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/README.md +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/README.md @@ -145,7 +145,7 @@ https://huggingface.co/THUDM/CogView3-Plus-3B/tree/main/vae ### 3.2 单卡单prompt功能测试 设置权重路径 ```shell -model_path = '/data/CogView3B' +model_path='/data/CogView3B' ``` 执行命令: ```shell -- Gitee From 98ec2a2264bda14d3f8113ea22286fdc80a4c532 Mon Sep 17 00:00:00 2001 From: jiangmengyu Date: Fri, 27 Dec 2024 11:22:22 +0800 Subject: [PATCH 46/91] =?UTF-8?q?cogview3=E9=80=82=E9=85=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../pipeline/pipeline_cogview3plus.py | 20 +- .../cogview3plus/schedulers/__init__.py | 1 - .../schedulers/scheduling_dpm_cogvideox.py | 484 ------------------ 3 files changed, 4 insertions(+), 501 deletions(-) delete mode 100644 
MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/schedulers/scheduling_dpm_cogvideox.py diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py index a2c8d0f37f..0031849dc9 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py @@ -14,12 +14,11 @@ # limitations under the License. import inspect -from typing import Callable, Dict, List, Optional, Tuple, Union +from typing import List, Optional, Tuple, Union import torch from transformers import T5EncoderModel, T5Tokenizer -from diffusers.callbacks import MultiPipelineCallbacks, PipelineCallback from diffusers.image_processor import VaeImageProcessor from diffusers.pipelines.pipeline_utils import DiffusionPipeline from diffusers.utils import logging, replace_example_docstring @@ -27,7 +26,7 @@ from diffusers.utils.torch_utils import randn_tensor from ..vae import AutoencoderKL from ..models import CogView3PlusTransformer2DModel -from ..schedulers import CogVideoXDDIMScheduler, CogVideoXDPMScheduler +from ..schedulers import CogVideoXDDIMScheduler from .pipeline_output import CogView3PipelineOutput logger = logging.get_logger(__name__) # pylint: disable=invalid-name @@ -125,7 +124,7 @@ class CogView3PlusPipeline(DiffusionPipeline): text_encoder: T5EncoderModel, vae: AutoencoderKL, transformer: CogView3PlusTransformer2DModel, - scheduler: Union[CogVideoXDDIMScheduler, CogVideoXDPMScheduler], + scheduler: CogVideoXDDIMScheduler, ): super().__init__() @@ -452,18 +451,7 @@ class CogView3PlusPipeline(DiffusionPipeline): noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_text - noise_pred_uncond) # compute the previous noisy sample x_t -> x_t-1 - if not isinstance(self.scheduler, CogVideoXDPMScheduler): - latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0] - else: - latents, old_pred_original_sample = self.scheduler.step( - noise_pred, - old_pred_original_sample, - t, - timesteps[i - 1] if i > 0 else None, - latents, - **extra_step_kwargs, - return_dict=False, - ) + latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0] latents = latents.to(prompt_embeds.dtype) if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/schedulers/__init__.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/schedulers/__init__.py index 32d0c223e7..7a8f559a28 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/schedulers/__init__.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/schedulers/__init__.py @@ -1,3 +1,2 @@ from .scheduling_ddim_cogvideox import CogVideoXDDIMScheduler -from .scheduling_dpm_cogvideox import CogVideoXDPMScheduler from .scheduling_utils import SchedulerMixin \ No newline at end of file diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/schedulers/scheduling_dpm_cogvideox.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/schedulers/scheduling_dpm_cogvideox.py deleted file mode 100644 index 5486e7cb93..0000000000 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/schedulers/scheduling_dpm_cogvideox.py 
+++ /dev/null @@ -1,484 +0,0 @@ -# Copyright 2024 The CogVideoX team, Tsinghua University & ZhipuAI and The HuggingFace Team. -# All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# DISCLAIMER: This code is strongly influenced by https://github.com/pesser/pytorch_diffusion -# and https://github.com/hojonathanho/diffusion - -import math -from dataclasses import dataclass -from typing import List, Optional, Tuple, Union - -import numpy as np -import torch - -from diffusers.configuration_utils import ConfigMixin, register_to_config -from diffusers.utils import BaseOutput -from diffusers.utils.torch_utils import randn_tensor -from .scheduling_utils import KarrasDiffusionSchedulers, SchedulerMixin - - -@dataclass -# Copied from diffusers.schedulers.scheduling_ddpm.DDPMSchedulerOutput with DDPM->DDIM -class DDIMSchedulerOutput(BaseOutput): - """ - Output class for the scheduler's `step` function output. - - Args: - prev_sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)` for images): - Computed sample `(x_{t-1})` of previous timestep. `prev_sample` should be used as next model input in the - denoising loop. - pred_original_sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)` for images): - The predicted denoised sample `(x_{0})` based on the model output from the current timestep. - `pred_original_sample` can be used to preview progress or for guidance. - """ - - prev_sample: torch.Tensor - pred_original_sample: Optional[torch.Tensor] = None - - -# Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar -def betas_for_alpha_bar( - num_diffusion_timesteps, - max_beta=0.999, - alpha_transform_type="cosine", -): - """ - Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of - (1-beta) over time from t = [0,1]. - - Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up - to that part of the diffusion process. - - - Args: - num_diffusion_timesteps (`int`): the number of betas to produce. - max_beta (`float`): the maximum beta to use; use values lower than 1 to - prevent singularities. - alpha_transform_type (`str`, *optional*, default to `cosine`): the type of noise schedule for alpha_bar. 
- Choose from `cosine` or `exp` - - Returns: - betas (`np.ndarray`): the betas used by the scheduler to step the model outputs - """ - if alpha_transform_type == "cosine": - - def alpha_bar_fn(t): - return math.cos((t + 0.008) / 1.008 * math.pi / 2) ** 2 - - elif alpha_transform_type == "exp": - - def alpha_bar_fn(t): - return math.exp(t * -12.0) - - else: - raise ValueError(f"Unsupported alpha_transform_type: {alpha_transform_type}") - - betas = [] - for i in range(num_diffusion_timesteps): - t1 = i / num_diffusion_timesteps - t2 = (i + 1) / num_diffusion_timesteps - betas.append(min(1 - alpha_bar_fn(t2) / alpha_bar_fn(t1), max_beta)) - return torch.tensor(betas, dtype=torch.float32) - - -def rescale_zero_terminal_snr(alphas_cumprod): - """ - - Args: - betas (`torch.Tensor`): - the betas that the scheduler is being initialized with. - - Returns: - `torch.Tensor`: rescaled betas with zero terminal SNR - """ - - alphas_bar_sqrt = alphas_cumprod.sqrt() - - # Store old values. - alphas_bar_sqrt_0 = alphas_bar_sqrt[0].clone() - alphas_bar_sqrt_T = alphas_bar_sqrt[-1].clone() - - # Shift so the last timestep is zero. - alphas_bar_sqrt -= alphas_bar_sqrt_T - - # Scale so the first timestep is back to the old value. - alphas_bar_sqrt *= alphas_bar_sqrt_0 / (alphas_bar_sqrt_0 - alphas_bar_sqrt_T) - - # Convert alphas_bar_sqrt to betas - alphas_bar = alphas_bar_sqrt**2 # Revert sqrt - - return alphas_bar - - -class CogVideoXDPMScheduler(SchedulerMixin, ConfigMixin): - """ - `DDIMScheduler` extends the denoising procedure introduced in denoising diffusion probabilistic models (DDPMs) with - non-Markovian guidance. - - This model inherits from [`SchedulerMixin`] and [`ConfigMixin`]. Check the superclass documentation for the generic - methods the library implements for all schedulers such as loading and saving. - - Args: - num_train_timesteps (`int`, defaults to 1000): - The number of diffusion steps to train the model. - beta_start (`float`, defaults to 0.0001): - The starting `beta` value of inference. - beta_end (`float`, defaults to 0.02): - The final `beta` value. - beta_schedule (`str`, defaults to `"linear"`): - The beta schedule, a mapping from a beta range to a sequence of betas for stepping the model. Choose from - `linear`, `scaled_linear`, or `squaredcos_cap_v2`. - trained_betas (`np.ndarray`, *optional*): - Pass an array of betas directly to the constructor to bypass `beta_start` and `beta_end`. - clip_sample (`bool`, defaults to `True`): - Clip the predicted sample for numerical stability. - clip_sample_range (`float`, defaults to 1.0): - The maximum magnitude for sample clipping. Valid only when `clip_sample=True`. - set_alpha_to_one (`bool`, defaults to `True`): - Each diffusion step uses the alphas product value at that step and at the previous one. For the final step - there is no previous alpha. When this option is `True` the previous alpha product is fixed to `1`, - otherwise it uses the alpha value at step 0. - steps_offset (`int`, defaults to 0): - An offset added to the inference steps, as required by some model families. - prediction_type (`str`, defaults to `epsilon`, *optional*): - Prediction type of the scheduler function; can be `epsilon` (predicts the noise of the diffusion process), - `sample` (directly predicts the noisy sample`) or `v_prediction` (see section 2.4 of [Imagen - Video](https://imagen.research.google/video/paper.pdf) paper). - thresholding (`bool`, defaults to `False`): - Whether to use the "dynamic thresholding" method. 
This is unsuitable for latent-space diffusion models such - as Stable Diffusion. - dynamic_thresholding_ratio (`float`, defaults to 0.995): - The ratio for the dynamic thresholding method. Valid only when `thresholding=True`. - sample_max_value (`float`, defaults to 1.0): - The threshold value for dynamic thresholding. Valid only when `thresholding=True`. - timestep_spacing (`str`, defaults to `"leading"`): - The way the timesteps should be scaled. Refer to Table 2 of the [Common Diffusion Noise Schedules and - Sample Steps are Flawed](https://huggingface.co/papers/2305.08891) for more information. - rescale_betas_zero_snr (`bool`, defaults to `False`): - Whether to rescale the betas to have zero terminal SNR. This enables the model to generate very bright and - dark samples instead of limiting it to samples with medium brightness. Loosely related to - [`--offset_noise`](https://github.com/huggingface/diffusers/blob/74fd735eb073eb1d774b1ab4154a0876eb82f055/examples/dreambooth/train_dreambooth.py#L506). - """ - - _compatibles = [e.name for e in KarrasDiffusionSchedulers] - order = 1 - - @register_to_config - def __init__( - self, - num_train_timesteps: int = 1000, - beta_start: float = 0.00085, - beta_end: float = 0.0120, - beta_schedule: str = "scaled_linear", - trained_betas: Optional[Union[np.ndarray, List[float]]] = None, - clip_sample: bool = True, - set_alpha_to_one: bool = True, - steps_offset: int = 0, - prediction_type: str = "epsilon", - clip_sample_range: float = 1.0, - sample_max_value: float = 1.0, - timestep_spacing: str = "leading", - rescale_betas_zero_snr: bool = False, - snr_shift_scale: float = 3.0, - ): - if trained_betas is not None: - self.betas = torch.tensor(trained_betas, dtype=torch.float32) - elif beta_schedule == "linear": - self.betas = torch.linspace(beta_start, beta_end, num_train_timesteps, dtype=torch.float32) - elif beta_schedule == "scaled_linear": - # this schedule is very specific to the latent diffusion model. - self.betas = torch.linspace(beta_start**0.5, beta_end**0.5, num_train_timesteps, dtype=torch.float64) ** 2 - elif beta_schedule == "squaredcos_cap_v2": - # Glide cosine schedule - self.betas = betas_for_alpha_bar(num_train_timesteps) - else: - raise NotImplementedError(f"{beta_schedule} is not implemented for {self.__class__}") - - self.alphas = 1.0 - self.betas - self.alphas_cumprod = torch.cumprod(self.alphas, dim=0) - - # Modify: SNR shift following SD3 - self.alphas_cumprod = self.alphas_cumprod / (snr_shift_scale + (1 - snr_shift_scale) * self.alphas_cumprod) - - # Rescale for zero SNR - if rescale_betas_zero_snr: - self.alphas_cumprod = rescale_zero_terminal_snr(self.alphas_cumprod) - - # At every step in ddim, we are looking into the previous alphas_cumprod - # For the final step, there is no previous alphas_cumprod because we are already at 0 - # `set_alpha_to_one` decides whether we set this parameter simply to one or - # whether we use the final alpha of the "non-previous" one. 
- self.final_alpha_cumprod = torch.tensor(1.0) if set_alpha_to_one else self.alphas_cumprod[0] - - # standard deviation of the initial noise distribution - self.init_noise_sigma = 1.0 - - # setable values - self.num_inference_steps = None - self.timesteps = torch.from_numpy(np.arange(0, num_train_timesteps)[::-1].copy().astype(np.int64)) - - def _get_variance(self, timestep, prev_timestep): - alpha_prod_t = self.alphas_cumprod[timestep] - alpha_prod_t_prev = self.alphas_cumprod[prev_timestep] if prev_timestep >= 0 else self.final_alpha_cumprod - beta_prod_t = 1 - alpha_prod_t - beta_prod_t_prev = 1 - alpha_prod_t_prev - - variance = (beta_prod_t_prev / beta_prod_t) * (1 - alpha_prod_t / alpha_prod_t_prev) - - return variance - - def scale_model_input(self, sample: torch.Tensor, timestep: Optional[int] = None) -> torch.Tensor: - """ - Ensures interchangeability with schedulers that need to scale the denoising model input depending on the - current timestep. - - Args: - sample (`torch.Tensor`): - The input sample. - timestep (`int`, *optional*): - The current timestep in the diffusion chain. - - Returns: - `torch.Tensor`: - A scaled input sample. - """ - return sample - - def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.device] = None): - """ - Sets the discrete timesteps used for the diffusion chain (to be run before inference). - - Args: - num_inference_steps (`int`): - The number of diffusion steps used when generating samples with a pre-trained model. - """ - - if num_inference_steps > self.config.num_train_timesteps: - raise ValueError( - f"`num_inference_steps`: {num_inference_steps} cannot be larger than `self.config.train_timesteps`:" - f" {self.config.num_train_timesteps} as the unet model trained with this scheduler can only handle" - f" maximal {self.config.num_train_timesteps} timesteps." - ) - - self.num_inference_steps = num_inference_steps - - # "linspace", "leading", "trailing" corresponds to annotation of Table 2. of https://arxiv.org/abs/2305.08891 - if self.config.timestep_spacing == "linspace": - timesteps = ( - np.linspace(0, self.config.num_train_timesteps - 1, num_inference_steps) - .round()[::-1] - .copy() - .astype(np.int64) - ) - elif self.config.timestep_spacing == "leading": - step_ratio = self.config.num_train_timesteps // self.num_inference_steps - # creates integer timesteps by multiplying by ratio - # casting to int to avoid issues when num_inference_step is power of 3 - timesteps = (np.arange(0, num_inference_steps) * step_ratio).round()[::-1].copy().astype(np.int64) - timesteps += self.config.steps_offset - elif self.config.timestep_spacing == "trailing": - step_ratio = self.config.num_train_timesteps / self.num_inference_steps - # creates integer timesteps by multiplying by ratio - # casting to int to avoid issues when num_inference_step is power of 3 - timesteps = np.round(np.arange(self.config.num_train_timesteps, 0, -step_ratio)).astype(np.int64) - timesteps -= 1 - else: - raise ValueError( - f"{self.config.timestep_spacing} is not supported. Please make sure to choose one of 'leading' or 'trailing'." 
- ) - - self.timesteps = torch.from_numpy(timesteps).to(device) - - def get_variables(self, alpha_prod_t, alpha_prod_t_prev, alpha_prod_t_back=None): - lamb = ((alpha_prod_t / (1 - alpha_prod_t)) ** 0.5).log() - lamb_next = ((alpha_prod_t_prev / (1 - alpha_prod_t_prev)) ** 0.5).log() - h = lamb_next - lamb - - if alpha_prod_t_back is not None: - lamb_previous = ((alpha_prod_t_back / (1 - alpha_prod_t_back)) ** 0.5).log() - h_last = lamb - lamb_previous - r = h_last / h - return h, r, lamb, lamb_next - else: - return h, None, lamb, lamb_next - - def get_mult(self, h, r, alpha_prod_t, alpha_prod_t_prev, alpha_prod_t_back): - mult1 = ((1 - alpha_prod_t_prev) / (1 - alpha_prod_t)) ** 0.5 * (-h).exp() - mult2 = (-2 * h).expm1() * alpha_prod_t_prev**0.5 - - if alpha_prod_t_back is not None: - mult3 = 1 + 1 / (2 * r) - mult4 = 1 / (2 * r) - return mult1, mult2, mult3, mult4 - else: - return mult1, mult2 - - def step( - self, - model_output: torch.Tensor, - old_pred_original_sample: torch.Tensor, - timestep: int, - timestep_back: int, - sample: torch.Tensor, - eta: float = 0.0, - use_clipped_model_output: bool = False, - generator=None, - variance_noise: Optional[torch.Tensor] = None, - return_dict: bool = False, - ) -> Union[DDIMSchedulerOutput, Tuple]: - """ - Predict the sample from the previous timestep by reversing the SDE. This function propagates the diffusion - process from the learned model outputs (most often the predicted noise). - - Args: - model_output (`torch.Tensor`): - The direct output from learned diffusion model. - timestep (`float`): - The current discrete timestep in the diffusion chain. - sample (`torch.Tensor`): - A current instance of a sample created by the diffusion process. - eta (`float`): - The weight of noise for added noise in diffusion step. - use_clipped_model_output (`bool`, defaults to `False`): - If `True`, computes "corrected" `model_output` from the clipped predicted original sample. Necessary - because predicted original sample is clipped to [-1, 1] when `self.config.clip_sample` is `True`. If no - clipping has happened, "corrected" `model_output` would coincide with the one provided as input and - `use_clipped_model_output` has no effect. - generator (`torch.Generator`, *optional*): - A random number generator. - variance_noise (`torch.Tensor`): - Alternative to generating noise with `generator` by directly providing the noise for the variance - itself. Useful for methods such as [`CycleDiffusion`]. - return_dict (`bool`, *optional*, defaults to `True`): - Whether or not to return a [`~schedulers.scheduling_ddim.DDIMSchedulerOutput`] or `tuple`. - - Returns: - [`~schedulers.scheduling_ddim.DDIMSchedulerOutput`] or `tuple`: - If return_dict is `True`, [`~schedulers.scheduling_ddim.DDIMSchedulerOutput`] is returned, otherwise a - tuple is returned where the first element is the sample tensor. - - """ - if self.num_inference_steps is None: - raise ValueError( - "Number of inference steps is 'None', you need to run 'set_timesteps' after creating the scheduler" - ) - - # See formulas (12) and (16) of DDIM paper https://arxiv.org/pdf/2010.02502.pdf - # Ideally, read DDIM paper in-detail understanding - - # Notation ( -> - # - pred_noise_t -> e_theta(x_t, t) - # - pred_original_sample -> f_theta(x_t, t) or x_0 - # - std_dev_t -> sigma_t - # - eta -> η - # - pred_sample_direction -> "direction pointing to x_t" - # - pred_prev_sample -> "x_t-1" - - # 1. 
get previous step value (=t-1) - prev_timestep = timestep - self.config.num_train_timesteps // self.num_inference_steps - - # 2. compute alphas, betas - alpha_prod_t = self.alphas_cumprod[timestep] - alpha_prod_t_prev = self.alphas_cumprod[prev_timestep] if prev_timestep >= 0 else self.final_alpha_cumprod - alpha_prod_t_back = self.alphas_cumprod[timestep_back] if timestep_back is not None else None - - beta_prod_t = 1 - alpha_prod_t - - # 3. compute predicted original sample from predicted noise also called - # "predicted x_0" of formula (12) from https://arxiv.org/pdf/2010.02502.pdf - # To make style tests pass, commented out `pred_epsilon` as it is an unused variable - if self.config.prediction_type == "epsilon": - pred_original_sample = (sample - beta_prod_t ** (0.5) * model_output) / alpha_prod_t ** (0.5) - elif self.config.prediction_type == "sample": - pred_original_sample = model_output - elif self.config.prediction_type == "v_prediction": - pred_original_sample = (alpha_prod_t**0.5) * sample - (beta_prod_t**0.5) * model_output - else: - raise ValueError( - f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, `sample`, or" - " `v_prediction`" - ) - - h, r, lamb, lamb_next = self.get_variables(alpha_prod_t, alpha_prod_t_prev, alpha_prod_t_back) - mult = list(self.get_mult(h, r, alpha_prod_t, alpha_prod_t_prev, alpha_prod_t_back)) - mult_noise = (1 - alpha_prod_t_prev) ** 0.5 * (1 - (-2 * h).exp()) ** 0.5 - - noise = randn_tensor(sample.shape, generator=generator, device=sample.device, dtype=sample.dtype) - prev_sample = mult[0] * sample - mult[1] * pred_original_sample + mult_noise * noise - - if old_pred_original_sample is None or prev_timestep < 0: - # Save a network evaluation if all noise levels are 0 or on the first step - return prev_sample, pred_original_sample - else: - denoised_d = mult[2] * pred_original_sample - mult[3] * old_pred_original_sample - noise = randn_tensor(sample.shape, generator=generator, device=sample.device, dtype=sample.dtype) - x_advanced = mult[0] * sample - mult[1] * denoised_d + mult_noise * noise - - prev_sample = x_advanced - - if not return_dict: - return (prev_sample, pred_original_sample) - - return DDIMSchedulerOutput(prev_sample=prev_sample, pred_original_sample=pred_original_sample) - - # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler.add_noise - def add_noise( - self, - original_samples: torch.Tensor, - noise: torch.Tensor, - timesteps: torch.IntTensor, - ) -> torch.Tensor: - # Make sure alphas_cumprod and timestep have same device and dtype as original_samples - # Move the self.alphas_cumprod to device to avoid redundant CPU to GPU data movement - # for the subsequent add_noise calls - self.alphas_cumprod = self.alphas_cumprod.to(device=original_samples.device) - alphas_cumprod = self.alphas_cumprod.to(dtype=original_samples.dtype) - timesteps = timesteps.to(original_samples.device) - - sqrt_alpha_prod = alphas_cumprod[timesteps] ** 0.5 - sqrt_alpha_prod = sqrt_alpha_prod.flatten() - while len(sqrt_alpha_prod.shape) < len(original_samples.shape): - sqrt_alpha_prod = sqrt_alpha_prod.unsqueeze(-1) - - sqrt_one_minus_alpha_prod = (1 - alphas_cumprod[timesteps]) ** 0.5 - sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.flatten() - while len(sqrt_one_minus_alpha_prod.shape) < len(original_samples.shape): - sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.unsqueeze(-1) - - noisy_samples = sqrt_alpha_prod * original_samples + sqrt_one_minus_alpha_prod * noise - return noisy_samples - - # 
Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler.get_velocity - def get_velocity(self, sample: torch.Tensor, noise: torch.Tensor, timesteps: torch.IntTensor) -> torch.Tensor: - # Make sure alphas_cumprod and timestep have same device and dtype as sample - self.alphas_cumprod = self.alphas_cumprod.to(device=sample.device) - alphas_cumprod = self.alphas_cumprod.to(dtype=sample.dtype) - timesteps = timesteps.to(sample.device) - - sqrt_alpha_prod = alphas_cumprod[timesteps] ** 0.5 - sqrt_alpha_prod = sqrt_alpha_prod.flatten() - while len(sqrt_alpha_prod.shape) < len(sample.shape): - sqrt_alpha_prod = sqrt_alpha_prod.unsqueeze(-1) - - sqrt_one_minus_alpha_prod = (1 - alphas_cumprod[timesteps]) ** 0.5 - sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.flatten() - while len(sqrt_one_minus_alpha_prod.shape) < len(sample.shape): - sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.unsqueeze(-1) - - velocity = sqrt_alpha_prod * noise - sqrt_one_minus_alpha_prod * sample - return velocity - - def __len__(self): - return self.config.num_train_timesteps \ No newline at end of file -- Gitee From d572adc2c2ce7a10638cb479bdccbc9a523d17cf Mon Sep 17 00:00:00 2001 From: jiangmengyu Date: Fri, 27 Dec 2024 11:23:19 +0800 Subject: [PATCH 47/91] =?UTF-8?q?cogview3=E9=80=82=E9=85=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../built-in/foundation/cogview3/cogview3plus/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/__init__.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/__init__.py index 11a5548362..304ed0a899 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/__init__.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/__init__.py @@ -17,5 +17,5 @@ from .pipeline import CogView3PlusPipeline, DiffusionPipeline from .vae import AutoencoderKL -from .schedulers import CogVideoXDDIMScheduler, CogVideoXDPMScheduler, SchedulerMixin +from .schedulers import CogVideoXDDIMScheduler, SchedulerMixin from .models import CogView3PlusTransformer2DModel, ModelMixin \ No newline at end of file -- Gitee From c9d573a3307b1b5e62280cdcbab922c03ddac0c2 Mon Sep 17 00:00:00 2001 From: jiangmengyu Date: Fri, 27 Dec 2024 11:59:17 +0800 Subject: [PATCH 48/91] =?UTF-8?q?cogview3=E9=80=82=E9=85=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- MindIE/MindIE-Torch/built-in/foundation/cogview3/README.md | 2 +- .../built-in/foundation/cogview3/cogview3plus/__init__.py | 1 - .../foundation/cogview3/cogview3plus/layers/embeddings.py | 5 ++--- .../foundation/cogview3/cogview3plus/layers/linear.py | 2 +- .../foundation/cogview3/cogview3plus/layers/normalization.py | 2 +- .../foundation/cogview3/cogview3plus/models/activations.py | 2 +- .../foundation/cogview3/cogview3plus/models/attention.py | 3 ++- .../cogview3/cogview3plus/models/attention_processor.py | 3 ++- .../cogview3/cogview3plus/models/model_load_utils.py | 2 +- .../cogview3/cogview3plus/models/modeling_utils.py | 2 +- .../cogview3/cogview3plus/models/transformer_cogview3plus.py | 4 +--- .../cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py | 2 +- .../cogview3plus/schedulers/scheduling_ddim_cogvideox.py | 5 +---- .../cogview3/cogview3plus/schedulers/scheduling_utils.py | 3 ++- .../foundation/cogview3/cogview3plus/vae/autoencoder_kl.py | 3 ++- .../built-in/foundation/cogview3/cogview3plus/vae/vae.py | 3 ++- 
16 files changed, 21 insertions(+), 23 deletions(-) diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/README.md b/MindIE/MindIE-Torch/built-in/foundation/cogview3/README.md index 495359c497..16592703d8 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/README.md +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/README.md @@ -78,7 +78,7 @@ https://huggingface.co/THUDM/CogView3-Plus-3B/tree/main ```shell { "_class_name": "CogView3PlusPipeline", - "_diffusers_version": "0.31.0.dev0", + "_diffusers_version": "0.31.0", "scheduler": [ "cogview3plus", "CogVideoXDDIMScheduler" diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/__init__.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/__init__.py index 304ed0a899..8cfcd60a5b 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/__init__.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/__init__.py @@ -14,7 +14,6 @@ # See the License for the specific language governing permissions and # limitations under the License. - from .pipeline import CogView3PlusPipeline, DiffusionPipeline from .vae import AutoencoderKL from .schedulers import CogVideoXDDIMScheduler, SchedulerMixin diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/layers/embeddings.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/layers/embeddings.py index 1763b3b910..72418f08b3 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/layers/embeddings.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/layers/embeddings.py @@ -4,20 +4,19 @@ # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # -# http://www.apache.org/licenses/LICENSE-2.0 +# https://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. + import math from typing import Optional -import numpy as np import torch from torch import nn -from diffusers.utils import deprecate from diffusers.models.activations import FP32SiLU, get_activation diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/layers/linear.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/layers/linear.py index 805c2d2b34..5f27384302 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/layers/linear.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/layers/linear.py @@ -6,7 +6,7 @@ # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at # -# http://www.apache.org/licenses/LICENSE-2.0 +# httpa://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/layers/normalization.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/layers/normalization.py index 2ead694b9f..e526184632 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/layers/normalization.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/layers/normalization.py @@ -5,7 +5,7 @@ # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # -# http://www.apache.org/licenses/LICENSE-2.0 +# https://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/activations.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/activations.py index 48fe8ed17d..fc68971806 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/activations.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/activations.py @@ -5,7 +5,7 @@ # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # -# http://www.apache.org/licenses/LICENSE-2.0 +# https://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/attention.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/attention.py index ac85e70e05..a7a559ff2f 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/attention.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/attention.py @@ -4,13 +4,14 @@ # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # -# http://www.apache.org/licenses/LICENSE-2.0 +# https://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. + from typing import Optional import torch diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/attention_processor.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/attention_processor.py index d20baee1ec..1f6b12f1aa 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/attention_processor.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/attention_processor.py @@ -4,13 +4,14 @@ # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at # -# http://www.apache.org/licenses/LICENSE-2.0 +# https://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. + import inspect from typing import Optional diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/model_load_utils.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/model_load_utils.py index f6d3b20570..3cffbd6432 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/model_load_utils.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/model_load_utils.py @@ -6,7 +6,7 @@ # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # -# http://www.apache.org/licenses/LICENSE-2.0 +# https://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/modeling_utils.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/modeling_utils.py index f83eff493f..e71d8577d5 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/modeling_utils.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/modeling_utils.py @@ -6,7 +6,7 @@ # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # -# http://www.apache.org/licenses/LICENSE-2.0 +# https://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/transformer_cogview3plus.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/transformer_cogview3plus.py index 6df2dc361e..cfeb27c109 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/transformer_cogview3plus.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/transformer_cogview3plus.py @@ -4,7 +4,7 @@ # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # -# http://www.apache.org/licenses/LICENSE-2.0 +# https://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, @@ -12,9 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
- from typing import Any, Dict, Union -from dataclasses import dataclass import torch import torch.nn as nn diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py index 0031849dc9..7e89d4c370 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py @@ -5,7 +5,7 @@ # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # -# http://www.apache.org/licenses/LICENSE-2.0 +# https://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/schedulers/scheduling_ddim_cogvideox.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/schedulers/scheduling_ddim_cogvideox.py index dbe9d4b17f..26ae48a2c8 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/schedulers/scheduling_ddim_cogvideox.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/schedulers/scheduling_ddim_cogvideox.py @@ -5,7 +5,7 @@ # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # -# http://www.apache.org/licenses/LICENSE-2.0 +# https://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, @@ -13,9 +13,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -# DISCLAIMER: This code is strongly influenced by https://github.com/pesser/pytorch_diffusion -# and https://github.com/hojonathanho/diffusion - import math from dataclasses import dataclass from typing import List, Optional, Tuple, Union diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/schedulers/scheduling_utils.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/schedulers/scheduling_utils.py index 7a72eb3d06..3f1fb5bc32 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/schedulers/scheduling_utils.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/schedulers/scheduling_utils.py @@ -4,13 +4,14 @@ # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # -# http://www.apache.org/licenses/LICENSE-2.0 +# https://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
+ import importlib import os from dataclasses import dataclass diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/vae/autoencoder_kl.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/vae/autoencoder_kl.py index bbe9bddf3e..3f524408c9 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/vae/autoencoder_kl.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/vae/autoencoder_kl.py @@ -4,13 +4,14 @@ # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # -# http://www.apache.org/licenses/LICENSE-2.0 +# https://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. + from typing import Dict, Optional, Tuple, Union import torch diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/vae/vae.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/vae/vae.py index 2323be5a78..c1abff7097 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/vae/vae.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/vae/vae.py @@ -4,13 +4,14 @@ # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # -# http://www.apache.org/licenses/LICENSE-2.0 +# https://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
+ from dataclasses import dataclass from typing import Optional, Tuple -- Gitee From 8e94c1164672e4f5e71c03bf76e01fe609c5fd2b Mon Sep 17 00:00:00 2001 From: jiangmengyu Date: Fri, 27 Dec 2024 12:07:11 +0800 Subject: [PATCH 49/91] =?UTF-8?q?cogview3=E9=80=82=E9=85=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../cogview3plus/layers/normalization.py | 30 +------------ .../cogview3plus/models/modeling_utils.py | 42 ++----------------- 2 files changed, 4 insertions(+), 68 deletions(-) diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/layers/normalization.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/layers/normalization.py index e526184632..1ec0a5b15c 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/layers/normalization.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/layers/normalization.py @@ -21,34 +21,6 @@ import torch import torch.nn as nn import torch.nn.functional as F -from diffusers.utils import is_torch_version - - -if is_torch_version(">=", "2.1.0"): - LayerNorm = nn.LayerNorm -else: - # Has optional bias parameter compared to torch layer norm - class LayerNorm(nn.Module): - def __init__(self, dim, eps: float = 1e-5, elementwise_affine: bool = True, bias: bool = True): - super().__init__() - - self.eps = eps - - if isinstance(dim, numbers.Integral): - dim = (dim,) - - self.dim = torch.Size(dim) - - if elementwise_affine: - self.weight = nn.Parameter(torch.ones(dim)) - self.bias = nn.Parameter(torch.zeros(dim)) if bias else None - else: - self.weight = None - self.bias = None - - def forward(self, x): - return F.layer_norm(x, self.dim, self.weight, self.bias, self.eps) - class RMSNorm(nn.Module): def __init__(self, dim, eps: float, elementwise_affine: bool = True, bias: bool = False): @@ -191,7 +163,7 @@ class AdaLayerNormContinuous(nn.Module): self.silu = nn.SiLU() self.linear = nn.Linear(conditioning_embedding_dim, embedding_dim * 2, bias=bias) if norm_type == "layer_norm": - self.norm = LayerNorm(embedding_dim, eps, elementwise_affine, bias) + self.norm = nn.LayerNorm(embedding_dim, eps, elementwise_affine, bias) elif norm_type == "rms_norm": self.norm = RMSNorm(embedding_dim, eps, elementwise_affine) else: diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/modeling_utils.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/modeling_utils.py index e71d8577d5..252c758863 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/modeling_utils.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/modeling_utils.py @@ -22,24 +22,19 @@ import os import re from collections import OrderedDict from functools import partial, wraps -from pathlib import Path from typing import Any, Callable, List, Optional, Tuple, Union -import safetensors import torch -from huggingface_hub import create_repo, split_torch_state_dict_into_shards from huggingface_hub.utils import validate_hf_hub_args from torch import Tensor, nn from diffusers import __version__ -from diffusers.quantizers import DiffusersAutoQuantizer, DiffusersQuantizer +from diffusers.quantizers import DiffusersAutoQuantizer from diffusers.quantizers.quantization_config import QuantizationMethod from diffusers.utils import ( CONFIG_NAME, FLAX_WEIGHTS_NAME, - SAFE_WEIGHTS_INDEX_NAME, SAFETENSORS_WEIGHTS_NAME, - WEIGHTS_INDEX_NAME, WEIGHTS_NAME, _add_variant, _get_checkpoint_shard_files, @@ -48,16 
+43,10 @@ from diffusers.utils import ( is_accelerate_available, is_bitsandbytes_available, is_bitsandbytes_version, - is_torch_version, logging, ) -from diffusers.utils.hub_utils import ( - PushToHubMixin, - load_or_create_model_card, - populate_model_card, -) +from diffusers.utils.hub_utils import PushToHubMixin from diffusers.models.model_loading_utils import ( - _determine_device_map, _fetch_index_file, _fetch_index_file_legacy, _load_state_dict_into_model, @@ -66,18 +55,11 @@ from diffusers.models.model_loading_utils import ( load_state_dict, ) -from .model_load_utils import load_state_dict_sd - logger = logging.get_logger(__name__) -_REGEX_SHARD = re.compile(r"(.*?)-\d{5}-of-\d{5}") - -if is_torch_version(">=", "1.9.0"): - _LOW_CPU_MEM_USAGE_DEFAULT = True -else: - _LOW_CPU_MEM_USAGE_DEFAULT = False +_LOW_CPU_MEM_USAGE_DEFAULT = True if is_accelerate_available(): @@ -472,19 +454,6 @@ class ModelMixin(torch.nn.Module, PushToHubMixin): " `device_map=None`. You can install accelerate with `pip install accelerate`." ) - # Check if we can handle device_map and dispatching the weights - if device_map is not None and not is_torch_version(">=", "1.9.0"): - raise NotImplementedError( - "Loading and dispatching requires torch >= 1.9.0. Please either update your PyTorch version or set" - " `device_map=None`." - ) - - if low_cpu_mem_usage is True and not is_torch_version(">=", "1.9.0"): - raise NotImplementedError( - "Low memory initialization requires torch >= 1.9.0. Please either update your PyTorch version or set" - " `low_cpu_mem_usage=False`." - ) - if low_cpu_mem_usage is False and device_map is not None: raise ValueError( f"You cannot set `low_cpu_mem_usage` to `False` while using device_map={device_map} for loading and" @@ -516,11 +485,6 @@ class ModelMixin(torch.nn.Module, PushToHubMixin): elif not low_cpu_mem_usage: raise ValueError("Passing along a `device_map` requires `low_cpu_mem_usage=True`") - if low_cpu_mem_usage: - if device_map is not None and not is_torch_version(">=", "1.10"): - # The max memory utils require PyTorch >= 1.10 to have torch.cuda.mem_get_info. 
- raise ValueError("`low_cpu_mem_usage` and `device_map` require PyTorch >= 1.10.") - # Load config if we don't provide a configuration config_path = pretrained_model_name_or_path -- Gitee From 78ad303e55cf5f605f83a77785d5fc10cf7caecd Mon Sep 17 00:00:00 2001 From: jiangmengyu Date: Fri, 27 Dec 2024 12:09:15 +0800 Subject: [PATCH 50/91] =?UTF-8?q?cogview3=E9=80=82=E9=85=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../pipeline/pipeline_cogview3plus.py | 100 ------------------ 1 file changed, 100 deletions(-) diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py index 7e89d4c370..82b1742f3b 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py @@ -21,7 +21,6 @@ from transformers import T5EncoderModel, T5Tokenizer from diffusers.image_processor import VaeImageProcessor from diffusers.pipelines.pipeline_utils import DiffusionPipeline -from diffusers.utils import logging, replace_example_docstring from diffusers.utils.torch_utils import randn_tensor from ..vae import AutoencoderKL @@ -32,22 +31,6 @@ from .pipeline_output import CogView3PipelineOutput logger = logging.get_logger(__name__) # pylint: disable=invalid-name -EXAMPLE_DOC_STRING = """ - Examples: - ```python - >>> import torch - >>> from diffusers import CogView3PlusPipeline - - >>> pipe = CogView3PlusPipeline.from_pretrained("THUDM/CogView3-Plus-3B", torch_dtype=torch.bfloat16) - >>> pipe.to("cuda") - - >>> prompt = "A photo of an astronaut riding a horse on mars" - >>> image = pipe(prompt).images[0] - >>> image.save("output.png") - ``` -""" - - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps def retrieve_timesteps( scheduler, @@ -261,7 +244,6 @@ class CogView3PlusPipeline(DiffusionPipeline): return self._interrupt @torch.no_grad() - @replace_example_docstring(EXAMPLE_DOC_STRING) def __call__( self, prompt: Optional[Union[str, List[str]]] = None, @@ -270,88 +252,6 @@ class CogView3PlusPipeline(DiffusionPipeline): guidance_scale: float = 5.0, num_images_per_prompt: int = 1, ) -> Union[CogView3PipelineOutput, Tuple]: - """ - Function invoked when calling the pipeline for generation. - - Args: - prompt (`str` or `List[str]`, *optional*): - The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`. - negative_prompt (`str` or `List[str]`, *optional*): - The prompt or prompts not to guide the image generation. If not defined, one has to pass - `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is - less than `1`). - height (`int`, *optional*, defaults to self.transformer.config.sample_size * self.vae_scale_factor): - The height in pixels of the generated image. If not provided, it is set to 1024. - width (`int`, *optional*, defaults to self.transformer.config.sample_size * self.vae_scale_factor): - The width in pixels of the generated image. If not provided it is set to 1024. - num_inference_steps (`int`, *optional*, defaults to `50`): - The number of denoising steps. More denoising steps usually lead to a higher quality image at the - expense of slower inference. 
- timesteps (`List[int]`, *optional*): - Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument - in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is - passed will be used. Must be in descending order. - guidance_scale (`float`, *optional*, defaults to `5.0`): - Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). - `guidance_scale` is defined as `w` of equation 2. of [Imagen - Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > - 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, - usually at the expense of lower image quality. - num_images_per_prompt (`int`, *optional*, defaults to `1`): - The number of images to generate per prompt. - generator (`torch.Generator` or `List[torch.Generator]`, *optional*): - One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) - to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): - Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image - generation. Can be used to tweak the same generation with different prompts. If not provided, a latents - tensor will ge generated by sampling using the supplied random `generator`. - prompt_embeds (`torch.FloatTensor`, *optional*): - Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not - provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): - Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt - weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input - argument. - original_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)): - If `original_size` is not the same as `target_size` the image will appear to be down- or upsampled. - `original_size` defaults to `(height, width)` if not specified. Part of SDXL's micro-conditioning as - explained in section 2.2 of - [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). - crops_coords_top_left (`Tuple[int]`, *optional*, defaults to (0, 0)): - `crops_coords_top_left` can be used to generate an image that appears to be "cropped" from the position - `crops_coords_top_left` downwards. Favorable, well-centered images are usually achieved by setting - `crops_coords_top_left` to (0, 0). Part of SDXL's micro-conditioning as explained in section 2.2 of - [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). - output_type (`str`, *optional*, defaults to `"pil"`): - The output format of the generate image. Choose between - [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. - return_dict (`bool`, *optional*, defaults to `True`): - Whether or not to return a [`~pipelines.stable_diffusion_xl.StableDiffusionXLPipelineOutput`] instead - of a plain tuple. - attention_kwargs (`dict`, *optional*): - A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under - `self.processor` in - [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py). 
- callback_on_step_end (`Callable`, *optional*): - A function that calls at the end of each denoising steps during the inference. The function is called - with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int, - callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by - `callback_on_step_end_tensor_inputs`. - callback_on_step_end_tensor_inputs (`List`, *optional*): - The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list - will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the - `._callback_tensor_inputs` attribute of your pipeline class. - max_sequence_length (`int`, defaults to `224`): - Maximum sequence length in encoded prompt. Can be set to other values but may lead to poorer results. - - Examples: - - Returns: - [`~pipelines.cogview3.pipeline_cogview3plus.CogView3PipelineOutput`] or `tuple`: - [`~pipelines.cogview3.pipeline_cogview3plus.CogView3PipelineOutput`] if `return_dict` is True, otherwise a - `tuple`. When returning a tuple, the first element is a list with the generated images. - """ if image_size is None: height = self.transformer.config.sample_size * self.vae_scale_factor width = self.transformer.config.sample_size * self.vae_scale_factor -- Gitee From 59ebdf2301a7a7437313f55c49caee548df369ef Mon Sep 17 00:00:00 2001 From: jiangmengyu Date: Fri, 27 Dec 2024 12:09:49 +0800 Subject: [PATCH 51/91] =?UTF-8?q?cogview3=E9=80=82=E9=85=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py | 1 + 1 file changed, 1 insertion(+) diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py index 82b1742f3b..7af8721fa3 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py @@ -21,6 +21,7 @@ from transformers import T5EncoderModel, T5Tokenizer from diffusers.image_processor import VaeImageProcessor from diffusers.pipelines.pipeline_utils import DiffusionPipeline +from diffusers.utils import logging from diffusers.utils.torch_utils import randn_tensor from ..vae import AutoencoderKL -- Gitee From e3778d4c1366d13e89a2b15e255bf4f33adeedf6 Mon Sep 17 00:00:00 2001 From: jiangmengyu Date: Fri, 27 Dec 2024 13:25:54 +0800 Subject: [PATCH 52/91] =?UTF-8?q?cogview3=E9=80=82=E9=85=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../cogview3/cogview3plus/layers/linear.py | 79 ++++--------------- .../models/attention_processor.py | 26 ++---- .../models/transformer_cogview3plus.py | 50 ++++++++++++ 3 files changed, 74 insertions(+), 81 deletions(-) diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/layers/linear.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/layers/linear.py index 5f27384302..bd9b9ba796 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/layers/linear.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/layers/linear.py @@ -17,79 +17,32 @@ import torch import torch.nn as nn -import torch_npu class QKVLinear(nn.Module): - def __init__(self, attention_dim, hidden_size, 
qkv_bias=True, cross_attention_dim=None, device=None, dtype=None): + def __init__(self, attention_dim, hidden_size, qkv_bias=True, device=None, dtype=None): super(QKVLinear, self).__init__() self.attention_dim = attention_dim self.hidden_size = hidden_size - self.cross_attention_dim = cross_attention_dim self.qkv_bias = qkv_bias factory_kwargs = {"device": device, "dtype": dtype} - if not cross_attention_dim: - self.weight = nn.Parameter(torch.empty([self.attention_dim, 3 * self.hidden_size], **factory_kwargs)) - if self.qkv_bias: - self.bias = nn.Parameter(torch.empty([3 * self.hidden_size], **factory_kwargs)) - else: - self.q_weight = nn.Parameter(torch.empty([self.attention_dim, self.hidden_size], **factory_kwargs)) - self.kv_weight = nn.Parameter(torch.empty([self.attention_dim, 2 * self.hidden_size], **factory_kwargs)) - - if self.qkv_bias: - self.q_bias = nn.Parameter(torch.empty([self.hidden_size], **factory_kwargs)) - self.kv_bias = nn.Parameter(torch.empty([2 * self.hidden_size], **factory_kwargs)) - - - def forward(self, hidden_states, encoder_hidden_states=None): - - if self.cross_attention_dim is None: - if not self.qkv_bias: - qkv = torch.matmul(hidden_states, self.weight) - else: - qkv = torch.addmm( - self.bias, - hidden_states.view(hidden_states.size(0) * hidden_states.size(1), hidden_states.size(2)), - self.weight, - beta=1, - alpha=1 - ) + self.weight = nn.Parameter(torch.empty([self.attention_dim, 3 * self.hidden_size], **factory_kwargs)) + if self.qkv_bias: + self.bias = nn.Parameter(torch.empty([3 * self.hidden_size], **factory_kwargs)) - batch, seqlen, _ = hidden_states.shape - qkv_shape = (batch, seqlen, 3, -1) - qkv = qkv.view(qkv_shape) - q, k, v = qkv.unbind(2) + def forward(self, hidden_states): + if not self.qkv_bias: + qkv = torch.matmul(hidden_states, self.weight) else: - if not self.qkv_bias: - q = torch.matmul(hidden_states, self.q_weight) - kv = torch.matmul(encoder_hidden_states, self.kv_weight) - else: - q = torch.addmm( - self.q_bias, - hidden_states.view(hidden_states.size(0) * hidden_states.size(1), hidden_states.size(2)), - self.q_weight, - beta=1, - alpha=1 - ) - kv = torch.addmm( - self.kv_bias, - encoder_hidden_states.view( - encoder_hidden_states.size(0) * encoder_hidden_states.size(1), - encoder_hidden_states.size(2)), - self.kv_weight, - beta=1, - alpha=1 - ) - - batch, seqlen, _ = encoder_hidden_states.shape - kv_shape = (batch, seqlen, 2, -1) - - kv = kv.view(kv_shape) - k, v = kv.unbind(2) - - q = q.view(hidden_states.shape) - - return q, k, v \ No newline at end of file + qkv = torch.addmm( + self.bias, + hidden_states.view(hidden_states.size(0) * hidden_states.size(1), hidden_states.size(2)), + self.weight, + beta=1, + alpha=1 + ) + + return qkv \ No newline at end of file diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/attention_processor.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/attention_processor.py index 1f6b12f1aa..eb7618b38f 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/attention_processor.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/attention_processor.py @@ -23,6 +23,8 @@ import torch_npu from diffusers.utils import logging from diffusers.utils.torch_utils import maybe_allow_in_graph +from ..layers import QKVLinear + logger = logging.get_logger(__name__) # pylint: disable=invalid-name @@ -160,15 +162,7 @@ class Attention(nn.Module): f"unknown cross_attention_norm: {cross_attention_norm}. 
Should be None, 'layer_norm' or 'group_norm'" ) - self.to_q = nn.Linear(query_dim, self.inner_dim, bias=bias) - - if not self.only_cross_attention: - # only relevant for the `AddedKVProcessor` classes - self.to_k = nn.Linear(self.cross_attention_dim, self.inner_kv_dim, bias=bias) - self.to_v = nn.Linear(self.cross_attention_dim, self.inner_kv_dim, bias=bias) - else: - self.to_k = None - self.to_v = None + self.to_qkv = QKVLinear(self.inner_dim, query_dim) self.added_proj_bias = added_proj_bias if self.added_kv_proj_dim is not None: @@ -347,16 +341,12 @@ class CogVideoXAttnProcessor2_0: attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size) attention_mask = attention_mask.view(batch_size, attn.heads, -1, attention_mask.shape[-1]) - query = attn.to_q(hidden_states) - key = attn.to_k(hidden_states) - value = attn.to_v(hidden_states) - - inner_dim = key.shape[-1] + B, S, _ = hidden_states.shape + qkv = self.to_qkv(hidden_states) + inner_dim = qkv.shape[-1] // 3 head_dim = inner_dim // attn.heads - - query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2) - key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2) - value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2) + qkv_shape = (B, S, 3, attn.heads, head_dim) + query, key, value = qkv.view(qkv_shape).permute(2, 0, 3, 1, 4).coutiguous().unbind(0) if attn.norm_q is not None: query = attn.norm_q(query) diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/transformer_cogview3plus.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/transformer_cogview3plus.py index cfeb27c109..29b6905a58 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/transformer_cogview3plus.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/transformer_cogview3plus.py @@ -350,3 +350,53 @@ class CogView3PlusTransformer2DModel(ModelMixin, ConfigMixin): ) return Transformer2DModelOutput(sample=output) + + def load_weights(self, state_dict, shard=False): + with torch.no_grad(): + if not shard: + self.load_state_dict(state_dict) + return {} + else: + weights = state_dict + + for i in range(self.num_layers): + if i != 26: + q_weight = weights.pop(f"transformer_blocks.{i}.attn1.to_q.weight", None) + q_bias = weights.pop(f"transformer_blocks.{i}.attn1.to_q.bias", None) + k_weight = weights.pop(f"transformer_blocks.{i}.attn1.to_k.weight", None) + k_bias = weights.pop(f"transformer_blocks.{i}.attn1.to_k.bias", None) + v_weight = weights.pop(f"transformer_blocks.{i}.attn1.to_v.weight", None) + v_bias = weights.pop(f"transformer_blocks.{i}.attn1.to_v.bias", None) + + # query, key, value的weight和bias权重存在同一个文件中,不会分开存储。 + if q_weight is not None and k_weight is not None and v_weight is not None: + qkv_weight = torch.cat([q_weight, k_weight, v_weight], dim=0).transpose(0, 1).contiguous() + qkv_bias = torch.cat([q_bias, k_bias, v_bias], dim=0).contiguous() + weights[f"transformer_blocks.{i}.attn1.to_qkv.weight"] = qkv_weight + weights[f"transformer_blocks.{i}.attn1.to_qkv.bias"] = qkv_bias + else: + if self.q_weight_cache is None: + self.q_weight_cache = weights.pop(f"transformer_blocks.{i}.attn1.to_q.weight", None) + if self.q_bias_cache is None: + self.q_bias_cache = weights.pop(f"transformer_blocks.{i}.attn1.to_q.bias", None) + if self.k_weight_cache is None: + self.k_weight_cache = weights.pop(f"transformer_blocks.{i}.attn1.to_k.weight", None) + if self.k_bias_cache is None: + self.k_bias_cache = 
weights.pop(f"transformer_blocks.{i}.attn1.to_k.bias", None) + if self.v_weight_cache is None: + self.v_weight_cache = weights.pop(f"transformer_blocks.{i}.attn1.to_v.weight", None) + if self.v_bias_cache is None: + self.v_bias_cache = weights.pop(f"transformer_blocks.{i}.attn1.to_v.bias", None) + + qk_weight_cache = self.q_weight_cache is not None and self.k_weight_cache is not None + if qk_weight_cache and self.v_weight_cache is not None: + qkv_weight = torch.cat( + [self.q_weight_cache, self.k_weight_cache, self.v_weight_cache], + dim=0 + ).transpose(0, 1).contiguous() + qkv_bias = torch.cat([self.q_bias_cache, self.k_bias_cache, self.v_bias_cache], dim=0).contiguous() + weights[f"transformer_blocks.26.attn1.to_qkv.weight"] = qkv_weight + weights[f"transformer_blocks.26.attn1.to_qkv.bias"] = qkv_bias + + self.load_state_dict(weights, strict=False, assign=True) + return weights.keys() -- Gitee From 128bb12df5d05212993b8a9de644d003beb0fd9a Mon Sep 17 00:00:00 2001 From: jiangmengyu Date: Fri, 27 Dec 2024 13:27:31 +0800 Subject: [PATCH 53/91] =?UTF-8?q?cogview3=E9=80=82=E9=85=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../cogview3/cogview3plus/models/attention_processor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/attention_processor.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/attention_processor.py index eb7618b38f..afc6596471 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/attention_processor.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/attention_processor.py @@ -342,7 +342,7 @@ class CogVideoXAttnProcessor2_0: attention_mask = attention_mask.view(batch_size, attn.heads, -1, attention_mask.shape[-1]) B, S, _ = hidden_states.shape - qkv = self.to_qkv(hidden_states) + qkv = attn.to_qkv(hidden_states) inner_dim = qkv.shape[-1] // 3 head_dim = inner_dim // attn.heads qkv_shape = (B, S, 3, attn.heads, head_dim) -- Gitee From 436984bc1a52a824c609606fe02f2382574533ad Mon Sep 17 00:00:00 2001 From: jiangmengyu Date: Fri, 27 Dec 2024 13:28:26 +0800 Subject: [PATCH 54/91] =?UTF-8?q?cogview3=E9=80=82=E9=85=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../cogview3/cogview3plus/models/attention_processor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/attention_processor.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/attention_processor.py index afc6596471..d2a7673ea5 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/attention_processor.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/attention_processor.py @@ -346,7 +346,7 @@ class CogVideoXAttnProcessor2_0: inner_dim = qkv.shape[-1] // 3 head_dim = inner_dim // attn.heads qkv_shape = (B, S, 3, attn.heads, head_dim) - query, key, value = qkv.view(qkv_shape).permute(2, 0, 3, 1, 4).coutiguous().unbind(0) + query, key, value = qkv.view(qkv_shape).permute(2, 0, 3, 1, 4).contiguous().unbind(0) if attn.norm_q is not None: query = attn.norm_q(query) -- Gitee From f99600849747c07f0f54289b2534a158c2b21ffb Mon Sep 17 00:00:00 2001 From: jiangmengyu Date: Sat, 28 Dec 2024 14:17:47 +0800 Subject: [PATCH 55/91] =?UTF-8?q?cogview3=E9=80=82=E9=85=8D?= MIME-Version: 1.0 Content-Type: text/plain; 
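[Note on the QKV fusion applied by the three patches above: the separate to_q/to_k/to_v projections are replaced by one [hidden, 3*hidden] weight built from the concatenated checkpoint tensors, and the fused output is reshaped and unbound into per-head Q/K/V. The standalone sketch below reproduces that idea with illustrative shapes and names only; it is not the repository's QKVLinear or load_weights code.]

```python
# Minimal sketch of fusing separate Q/K/V projections into one matmul
# (illustrative only; assumes hidden_size == num_heads * head_dim).
import torch
import torch.nn as nn

hidden_size, num_heads, head_dim = 64, 8, 8
batch, seq = 2, 16

# Original layout: three separate projections, as in the stock attention module.
to_q = nn.Linear(hidden_size, hidden_size)
to_k = nn.Linear(hidden_size, hidden_size)
to_v = nn.Linear(hidden_size, hidden_size)

# Fused layout: concatenate the checkpoint tensors and transpose, mirroring the
# torch.cat(...).transpose(0, 1).contiguous() step in load_weights above.
fused_weight = torch.cat([to_q.weight, to_k.weight, to_v.weight], dim=0).transpose(0, 1).contiguous()
fused_bias = torch.cat([to_q.bias, to_k.bias, to_v.bias], dim=0).contiguous()

x = torch.randn(batch, seq, hidden_size)

# One addmm instead of three linears, then split into per-head Q/K/V.
qkv = torch.addmm(fused_bias, x.view(batch * seq, hidden_size), fused_weight)
q, k, v = qkv.view(batch, seq, 3, num_heads, head_dim).permute(2, 0, 3, 1, 4).unbind(0)

# The fused path matches the separate projections.
q_ref = to_q(x).view(batch, seq, num_heads, head_dim).transpose(1, 2)
assert torch.allclose(q, q_ref, atol=1e-5)
```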
charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../models/transformer_cogview3plus.py | 99 ++++++++++++++----- .../pipeline/pipeline_cogview3plus.py | 1 + 2 files changed, 77 insertions(+), 23 deletions(-) diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/transformer_cogview3plus.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/transformer_cogview3plus.py index 29b6905a58..2bd2841899 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/transformer_cogview3plus.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/transformer_cogview3plus.py @@ -16,6 +16,7 @@ from typing import Any, Dict, Union import torch import torch.nn as nn +import numpy as np from diffusers.configuration_utils import ConfigMixin, register_to_config from diffusers.models.attention_processor import AttentionProcessor @@ -170,6 +171,11 @@ class CogView3PlusTransformer2DModel(ModelMixin, ConfigMixin): time_embed_dim: int = 512, condition_dim: int = 256, pos_embed_max_size: int = 128, + use_cache: bool = True, + cache_interval: int = 2, + cache_start: int = 3, + num_cache_layer: int = 13, + cache_start_steps: int = 5, ): super().__init__() self.out_channels = out_channels @@ -224,6 +230,15 @@ class CogView3PlusTransformer2DModel(ModelMixin, ConfigMixin): self.v_weight_cache = None self.v_bias_cache = None + self.use_cache = use_cache + self.cache_interval = cache_interval + self.cache_start = cache_start + self.num_cache_layer = num_cache_layer + self.cache_start_steps = cache_start_steps + + self.delta_cache = None + self.delta_encoder_cache = None + @property # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.attn_processors def attn_processors(self) -> Dict[str, AttentionProcessor]: @@ -295,6 +310,7 @@ class CogView3PlusTransformer2DModel(ModelMixin, ConfigMixin): original_size: torch.Tensor, target_size: torch.Tensor, crop_coords: torch.Tensor, + t_idx: int, ) -> Union[torch.Tensor, Transformer2DModelOutput]: hidden_states = states[0] encoder_hidden_states = states[1] @@ -309,29 +325,7 @@ class CogView3PlusTransformer2DModel(ModelMixin, ConfigMixin): encoder_hidden_states = hidden_states[:, :text_seq_length] hidden_states = hidden_states[:, text_seq_length:] - for index_block, block in enumerate(self.transformer_blocks): - if torch.is_grad_enabled() and self.gradient_checkpointing: - - def create_custom_forward(module): - def custom_forward(*inputs): - return module(*inputs) - - return custom_forward - - ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {} - hidden_states, encoder_hidden_states = torch.utils.checkpoint.checkpoint( - create_custom_forward(block), - hidden_states, - encoder_hidden_states, - emb, - **ckpt_kwargs, - ) - else: - hidden_states, encoder_hidden_states = block( - hidden_states=hidden_states, - encoder_hidden_states=encoder_hidden_states, - emb=emb, - ) + hidden_states, encoder_hidden_states = self._forward_blocks(hidden_states, encoder_hidden_states, emb, t_idx) hidden_states = self.norm_out(hidden_states, emb) hidden_states = self.proj_out(hidden_states) # (batch_size, height*width, patch_size*patch_size*out_channels) @@ -351,6 +345,65 @@ class CogView3PlusTransformer2DModel(ModelMixin, ConfigMixin): return Transformer2DModelOutput(sample=output) + # forward blocks in range [start_idx, end_idx), then return input and output + def _forward_blocks_range(self, hidden_states, encoder_hidden_states, emb, start_idx, 
end_idx, **kwargs): + for _, block in enumerate(self.transformer_blocks[start_idx: end_idx]): + hidden_states, encoder_hidden_states = block( + hidden_states=hidden_states, + encoder_hidden_states=encoder_hidden_states, + emb=emb, + ) + + return hidden_states, encoder_hidden_states + + def _forward_blocks(self, hidden_states, encoder_hidden_states, emb, t_idx): + num_blocks = len(self.spatial_blocks) + + if not self.use_cache or (t_idx < self.cache_start_steps): + hidden_states, encoder_hidden_states = self._forward_blocks_range( + hidden_states, + encoder_hidden_states, + emb, + 0, + num_blocks + ) + else: + # infer [0, cache_start) + hidden_states, encoder_hidden_states = self._forward_blocks_range( + hidden_states, + encoder_hidden_states, + emb, + 0, + self.cache_start + ) + # infer [cache_start, cache_end) + cache_end = np.minimum(self.cache_start + self.num_cache_layer, num_blocks) + hidden_states_before_cache = hidden_states.clone() + encoder_hidden_states_before_cache = encoder_hidden_states.clone() + if t_idx % self.cache_interval == (self.cache_start_steps % self.cache_interval): + hidden_states, encoder_hidden_states = self._forward_blocks_range( + hidden_states, + encoder_hidden_states, + emb, + self.cache_start, + cache_end + ) + self.delta_cache = hidden_states - hidden_states_before_cache + self.delta_encoder_cache = encoder_hidden_states - encoder_hidden_states_before_cache + else: + hidden_states = hidden_states_before_cache + self.delta_cache + encoder_hidden_states = encoder_hidden_states_before_cache + self.delta_encoder_cache + # infer [cache_end, num_blocks) + hidden_states, encoder_hidden_states = self._forward_blocks_range( + hidden_states, + encoder_hidden_states, + emb, + cache_end, + num_blocks + ) + + return hidden_states, encoder_hidden_states + def load_weights(self, state_dict, shard=False): with torch.no_grad(): if not shard: diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py index 7af8721fa3..1dda0a2108 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py @@ -343,6 +343,7 @@ class CogView3PlusPipeline(DiffusionPipeline): original_size=original_size, target_size=target_size, crop_coords=crops_coords_top_left, + t_idx=i, )[0] noise_pred = noise_pred.float() -- Gitee From e064ad386b40f6db2df0d839ea4cbf7e243486af Mon Sep 17 00:00:00 2001 From: jiangmengyu Date: Sat, 28 Dec 2024 14:22:46 +0800 Subject: [PATCH 56/91] =?UTF-8?q?cogview3=E9=80=82=E9=85=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../cogview3/cogview3plus/models/transformer_cogview3plus.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/transformer_cogview3plus.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/transformer_cogview3plus.py index 2bd2841899..b98e3e2526 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/transformer_cogview3plus.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/transformer_cogview3plus.py @@ -357,7 +357,7 @@ class CogView3PlusTransformer2DModel(ModelMixin, ConfigMixin): return hidden_states, encoder_hidden_states def _forward_blocks(self, 
hidden_states, encoder_hidden_states, emb, t_idx): - num_blocks = len(self.spatial_blocks) + num_blocks = len(self.transformer_blocks) if not self.use_cache or (t_idx < self.cache_start_steps): hidden_states, encoder_hidden_states = self._forward_blocks_range( -- Gitee From 58f83de387f68dabeb760439d1f986ffe5ce2954 Mon Sep 17 00:00:00 2001 From: jiangmengyu Date: Tue, 31 Dec 2024 10:59:17 +0800 Subject: [PATCH 57/91] =?UTF-8?q?cogview3=E9=80=82=E9=85=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../cogview3plus/models/activations.py | 8 +- .../cogview3plus/models/modeling_utils.py | 269 ------------------ 2 files changed, 1 insertion(+), 276 deletions(-) diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/activations.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/activations.py index fc68971806..4726fd7eb2 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/activations.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/activations.py @@ -18,7 +18,7 @@ import torch.nn.functional as F from torch import nn from diffusers.utils import deprecate -from diffusers.utils.import_utils import is_torch_npu_available, is_torch_version +from diffusers.utils.import_utils import is_torch_npu_available if is_torch_npu_available(): @@ -79,9 +79,6 @@ class GELU(nn.Module): self.approximate = approximate def gelu(self, gate: torch.Tensor) -> torch.Tensor: - if gate.device.type == "mps" and is_torch_version("<", "2.0.0"): - # fp16 gelu not supported on mps before torch 2.0 - return F.gelu(gate.to(dtype=torch.float32), approximate=self.approximate).to(dtype=gate.dtype) return F.gelu(gate, approximate=self.approximate) def forward(self, hidden_states): @@ -103,9 +100,6 @@ class GEGLU(nn.Module): self.proj = nn.Linear(dim_in, dim_out * 2, bias=bias) def gelu(self, gate: torch.Tensor) -> torch.Tensor: - if gate.device.type == "mps" and is_torch_version("<", "2.0.0"): - # fp16 gelu not supported on mps before torch 2.0 - return F.gelu(gate.to(dtype=torch.float32)).to(dtype=gate.dtype) return F.gelu(gate) def forward(self, hidden_states, *args, **kwargs): diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/modeling_utils.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/modeling_utils.py index 252c758863..35f4891b42 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/modeling_utils.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/modeling_utils.py @@ -119,15 +119,6 @@ def get_parameter_dtype(parameter: torch.nn.Module) -> torch.dtype: class ModelMixin(torch.nn.Module, PushToHubMixin): - r""" - Base class for all models. - - [`ModelMixin`] takes care of storing the model configuration and provides methods for loading, downloading and - saving models. - - - **config_name** ([`str`]) -- Filename to save a model to when calling [`~models.ModelMixin.save_pretrained`]. - """ - config_name = CONFIG_NAME _automatically_saved_args = ["_diffusers_version", "_class_name", "_name_or_path"] _supports_gradient_checkpointing = False @@ -139,11 +130,6 @@ class ModelMixin(torch.nn.Module, PushToHubMixin): super().__init__() def __getattr__(self, name: str) -> Any: - """The only reason we overwrite `getattr` here is to gracefully deprecate accessing - config attributes directly. 
See https://github.com/huggingface/diffusers/pull/3129 We need to overwrite - __getattr__ here in addition so that we don't trigger `torch.nn.Module`'s __getattr__': - https://pytorch.org/docs/stable/_modules/torch/nn/modules/module.html#Module - """ is_in_config = "_internal_dict" in self.__dict__ and hasattr(self.__dict__["_internal_dict"], name) is_attribute = name in self.__dict__ @@ -156,264 +142,9 @@ class ModelMixin(torch.nn.Module, PushToHubMixin): # call PyTorch's https://pytorch.org/docs/stable/_modules/torch/nn/modules/module.html#Module return super().__getattr__(name) - @property - def is_gradient_checkpointing(self) -> bool: - """ - Whether gradient checkpointing is activated for this model or not. - """ - return any(hasattr(m, "gradient_checkpointing") and m.gradient_checkpointing for m in self.modules()) - - def enable_gradient_checkpointing(self) -> None: - """ - Activates gradient checkpointing for the current model (may be referred to as *activation checkpointing* or - *checkpoint activations* in other frameworks). - """ - if not self._supports_gradient_checkpointing: - raise ValueError(f"{self.__class__.__name__} does not support gradient checkpointing.") - self.apply(partial(self._set_gradient_checkpointing, value=True)) - - def disable_gradient_checkpointing(self) -> None: - """ - Deactivates gradient checkpointing for the current model (may be referred to as *activation checkpointing* or - *checkpoint activations* in other frameworks). - """ - if self._supports_gradient_checkpointing: - self.apply(partial(self._set_gradient_checkpointing, value=False)) - - def set_use_npu_flash_attention(self, valid: bool) -> None: - r""" - Set the switch for the npu flash attention. - """ - - def fn_recursive_set_npu_flash_attention(module: torch.nn.Module): - if hasattr(module, "set_use_npu_flash_attention"): - module.set_use_npu_flash_attention(valid) - - for child in module.children(): - fn_recursive_set_npu_flash_attention(child) - - for module in self.children(): - if isinstance(module, torch.nn.Module): - fn_recursive_set_npu_flash_attention(module) - - def enable_npu_flash_attention(self) -> None: - r""" - Enable npu flash attention from torch_npu - - """ - self.set_use_npu_flash_attention(True) - - def disable_npu_flash_attention(self) -> None: - r""" - disable npu flash attention from torch_npu - - """ - self.set_use_npu_flash_attention(False) - - def set_use_xla_flash_attention( - self, use_xla_flash_attention: bool, partition_spec: Optional[Callable] = None - ) -> None: - # Recursively walk through all the children. - # Any children which exposes the set_use_xla_flash_attention method - # gets the message - def fn_recursive_set_flash_attention(module: torch.nn.Module): - if hasattr(module, "set_use_xla_flash_attention"): - module.set_use_xla_flash_attention(use_xla_flash_attention, partition_spec) - - for child in module.children(): - fn_recursive_set_flash_attention(child) - - for module in self.children(): - if isinstance(module, torch.nn.Module): - fn_recursive_set_flash_attention(module) - - def enable_xla_flash_attention(self, partition_spec: Optional[Callable] = None): - r""" - Enable the flash attention pallals kernel for torch_xla. - """ - self.set_use_xla_flash_attention(True, partition_spec) - - def disable_xla_flash_attention(self): - r""" - Disable the flash attention pallals kernel for torch_xla. 
- """ - self.set_use_xla_flash_attention(False) - - def set_use_memory_efficient_attention_xformers( - self, valid: bool, attention_op: Optional[Callable] = None - ) -> None: - # Recursively walk through all the children. - # Any children which exposes the set_use_memory_efficient_attention_xformers method - # gets the message - def fn_recursive_set_mem_eff(module: torch.nn.Module): - if hasattr(module, "set_use_memory_efficient_attention_xformers"): - module.set_use_memory_efficient_attention_xformers(valid, attention_op) - - for child in module.children(): - fn_recursive_set_mem_eff(child) - - for module in self.children(): - if isinstance(module, torch.nn.Module): - fn_recursive_set_mem_eff(module) - - def enable_xformers_memory_efficient_attention(self, attention_op: Optional[Callable] = None) -> None: - r""" - Enable memory efficient attention from [xFormers](https://facebookresearch.github.io/xformers/). - - When this option is enabled, you should observe lower GPU memory usage and a potential speed up during - inference. Speed up during training is not guaranteed. - - - - ⚠️ When memory efficient attention and sliced attention are both enabled, memory efficient attention takes - precedent. - - - - Parameters: - attention_op (`Callable`, *optional*): - Override the default `None` operator for use as `op` argument to the - [`memory_efficient_attention()`](https://facebookresearch.github.io/xformers/components/ops.html#xformers.ops.memory_efficient_attention) - function of xFormers. - - Examples: - - ```py - >>> import torch - >>> from diffusers import UNet2DConditionModel - >>> from xformers.ops import MemoryEfficientAttentionFlashAttentionOp - - >>> model = UNet2DConditionModel.from_pretrained( - ... "stabilityai/stable-diffusion-2-1", subfolder="unet", torch_dtype=torch.float16 - ... ) - >>> model = model.to("cuda") - >>> model.enable_xformers_memory_efficient_attention(attention_op=MemoryEfficientAttentionFlashAttentionOp) - ``` - """ - self.set_use_memory_efficient_attention_xformers(True, attention_op) - - def disable_xformers_memory_efficient_attention(self) -> None: - r""" - Disable memory efficient attention from [xFormers](https://facebookresearch.github.io/xformers/). - """ - self.set_use_memory_efficient_attention_xformers(False) - - def dequantize(self): - """ - Potentially dequantize the model in case it has been quantized by a quantization method that support - dequantization. - """ - hf_quantizer = getattr(self, "hf_quantizer", None) - - if hf_quantizer is None: - raise ValueError("You need to first quantize your model in order to dequantize it") - - return hf_quantizer.dequantize(self) - @classmethod @validate_hf_hub_args def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.PathLike]], **kwargs): - r""" - Instantiate a pretrained PyTorch model from a pretrained model configuration. - - The model is set in evaluation mode - `model.eval()` - by default, and dropout modules are deactivated. To - train the model, set it back in training mode with `model.train()`. - - Parameters: - pretrained_model_name_or_path (`str` or `os.PathLike`, *optional*): - Can be either: - - - A string, the *model id* (for example `google/ddpm-celebahq-256`) of a pretrained model hosted on - the Hub. - - A path to a *directory* (for example `./my_model_directory`) containing the model weights saved - with [`~ModelMixin.save_pretrained`]. 
- - cache_dir (`Union[str, os.PathLike]`, *optional*): - Path to a directory where a downloaded pretrained model configuration is cached if the standard cache - is not used. - torch_dtype (`str` or `torch.dtype`, *optional*): - Override the default `torch.dtype` and load the model with another dtype. If `"auto"` is passed, the - dtype is automatically derived from the model's weights. - force_download (`bool`, *optional*, defaults to `False`): - Whether or not to force the (re-)download of the model weights and configuration files, overriding the - cached versions if they exist. - proxies (`Dict[str, str]`, *optional*): - A dictionary of proxy servers to use by protocol or endpoint, for example, `{'http': 'foo.bar:3128', - 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request. - output_loading_info (`bool`, *optional*, defaults to `False`): - Whether or not to also return a dictionary containing missing keys, unexpected keys and error messages. - local_files_only(`bool`, *optional*, defaults to `False`): - Whether to only load local model weights and configuration files or not. If set to `True`, the model - won't be downloaded from the Hub. - token (`str` or *bool*, *optional*): - The token to use as HTTP bearer authorization for remote files. If `True`, the token generated from - `diffusers-cli login` (stored in `~/.huggingface`) is used. - revision (`str`, *optional*, defaults to `"main"`): - The specific model version to use. It can be a branch name, a tag name, a commit id, or any identifier - allowed by Git. - from_flax (`bool`, *optional*, defaults to `False`): - Load the model weights from a Flax checkpoint save file. - subfolder (`str`, *optional*, defaults to `""`): - The subfolder location of a model file within a larger model repository on the Hub or locally. - mirror (`str`, *optional*): - Mirror source to resolve accessibility issues if you're downloading a model in China. We do not - guarantee the timeliness or safety of the source, and you should refer to the mirror site for more - information. - device_map (`str` or `Dict[str, Union[int, str, torch.device]]`, *optional*): - A map that specifies where each submodule should go. It doesn't need to be defined for each - parameter/buffer name; once a given module name is inside, every submodule of it will be sent to the - same device. Defaults to `None`, meaning that the model will be loaded on CPU. - - Set `device_map="auto"` to have 🤗 Accelerate automatically compute the most optimized `device_map`. For - more information about each option see [designing a device - map](https://hf.co/docs/accelerate/main/en/usage_guides/big_modeling#designing-a-device-map). - max_memory (`Dict`, *optional*): - A dictionary device identifier for the maximum memory. Will default to the maximum memory available for - each GPU and the available CPU RAM if unset. - offload_folder (`str` or `os.PathLike`, *optional*): - The path to offload weights if `device_map` contains the value `"disk"`. - offload_state_dict (`bool`, *optional*): - If `True`, temporarily offloads the CPU state dict to the hard drive to avoid running out of CPU RAM if - the weight of the CPU state dict + the biggest shard of the checkpoint does not fit. Defaults to `True` - when there is some disk offload. - low_cpu_mem_usage (`bool`, *optional*, defaults to `True` if torch version >= 1.9.0 else `False`): - Speed up model loading only loading the pretrained weights and not initializing the weights. 
This also - tries to not use more than 1x model size in CPU memory (including peak memory) while loading the model. - Only supported for PyTorch >= 1.9.0. If you are using an older version of PyTorch, setting this - argument to `True` will raise an error. - variant (`str`, *optional*): - Load weights from a specified `variant` filename such as `"fp16"` or `"ema"`. This is ignored when - loading `from_flax`. - use_safetensors (`bool`, *optional*, defaults to `None`): - If set to `None`, the `safetensors` weights are downloaded if they're available **and** if the - `safetensors` library is installed. If set to `True`, the model is forcibly loaded from `safetensors` - weights. If set to `False`, `safetensors` weights are not loaded. - - - - To use private or [gated models](https://huggingface.co/docs/hub/models-gated#gated-models), log-in with - `huggingface-cli login`. You can also activate the special - ["offline-mode"](https://huggingface.co/diffusers/installation.html#offline-mode) to use this method in a - firewalled environment. - - - - Example: - - ```py - from diffusers import UNet2DConditionModel - - unet = UNet2DConditionModel.from_pretrained("runwayml/stable-diffusion-v1-5", subfolder="unet") - ``` - - If you get the error message below, you need to finetune the weights for your downstream task: - - ```bash - Some weights of UNet2DConditionModel were not initialized from the model checkpoint at runwayml/stable-diffusion-v1-5 and are newly initialized because the shapes did not match: - - conv_in.weight: found shape torch.Size([320, 4, 3, 3]) in the checkpoint and torch.Size([320, 9, 3, 3]) in the model instantiated - You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference. - ``` - """ cache_dir = kwargs.pop("cache_dir", None) ignore_mismatched_sizes = kwargs.pop("ignore_mismatched_sizes", False) force_download = kwargs.pop("force_download", False) -- Gitee From 29df760d8158b6e4a26999012f614577ff3f5e8d Mon Sep 17 00:00:00 2001 From: jiangmengyu Date: Tue, 31 Dec 2024 11:00:54 +0800 Subject: [PATCH 58/91] =?UTF-8?q?cogview3=E9=80=82=E9=85=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../built-in/foundation/cogview3/inference_cogview3plus.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/inference_cogview3plus.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/inference_cogview3plus.py index d24d3f29c4..9030a82b5e 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/inference_cogview3plus.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/inference_cogview3plus.py @@ -60,7 +60,7 @@ def parse_arguments(): parser.add_argument("--height", type=int, default=1024, help="Height of the generated image.") parser.add_argument("--output_path", type=str, default="cogview3.png", help="Path to save the generated image.") parser.add_argument("--dtype", type=str, default="bf16", help="bf16 or fp16") - parser.add_argument("--device_id", type=int, default=6, help="NPU device id") + parser.add_argument("--device_id", type=int, default=7, help="NPU device id") return parser.parse_args() -- Gitee From a796f721aa5cd003d0aa6fa6c49da096a7b6d880 Mon Sep 17 00:00:00 2001 From: jiangmengyu Date: Tue, 31 Dec 2024 11:11:22 +0800 Subject: [PATCH 59/91] =?UTF-8?q?cogview3=E9=80=82=E9=85=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- 
.../cogview3plus/models/modeling_utils.py | 192 +----------------- 1 file changed, 2 insertions(+), 190 deletions(-) diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/modeling_utils.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/modeling_utils.py index 35f4891b42..a09f50daf8 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/modeling_utils.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/modeling_utils.py @@ -15,14 +15,13 @@ # limitations under the License. import copy -import inspect import itertools import json import os import re from collections import OrderedDict -from functools import partial, wraps -from typing import Any, Callable, List, Optional, Tuple, Union +from functools import wraps +from typing import Any, List, Optional, Tuple, Union import torch from huggingface_hub.utils import validate_hf_hub_args @@ -41,7 +40,6 @@ from diffusers.utils import ( _get_model_file, deprecate, is_accelerate_available, - is_bitsandbytes_available, is_bitsandbytes_version, logging, ) @@ -157,9 +155,6 @@ class ModelMixin(torch.nn.Module, PushToHubMixin): torch_dtype = kwargs.pop("torch_dtype", None) subfolder = kwargs.pop("subfolder", None) device_map = kwargs.pop("device_map", None) - max_memory = kwargs.pop("max_memory", None) - offload_folder = kwargs.pop("offload_folder", None) - offload_state_dict = kwargs.pop("offload_state_dict", False) low_cpu_mem_usage = kwargs.pop("low_cpu_mem_usage", _LOW_CPU_MEM_USAGE_DEFAULT) variant = kwargs.pop("variant", None) use_safetensors = kwargs.pop("use_safetensors", None) @@ -631,7 +626,6 @@ class ModelMixin(torch.nn.Module, PushToHubMixin): cls, model, state_dict: OrderedDict, - resolved_archive_file, pretrained_model_name_or_path: Union[str, os.PathLike], ignore_mismatched_sizes: bool = False, ): @@ -730,142 +724,14 @@ class ModelMixin(torch.nn.Module, PushToHubMixin): return model, missing_keys, unexpected_keys, mismatched_keys, error_msgs - @classmethod - def _get_signature_keys(cls, obj): - parameters = inspect.signature(obj.__init__).parameters - required_parameters = {k: v for k, v in parameters.items() if v.default == inspect._empty} - optional_parameters = set({k for k, v in parameters.items() if v.default != inspect._empty}) - expected_modules = set(required_parameters.keys()) - {"self"} - - return expected_modules, optional_parameters - - # Adapted from `transformers` modeling_utils.py - def _get_no_split_modules(self, device_map: str): - """ - Get the modules of the model that should not be spit when using device_map. We iterate through the modules to - get the underlying `_no_split_modules`. - - Args: - device_map (`str`): - The device map value. Options are ["auto", "balanced", "balanced_low_0", "sequential"] - - Returns: - `List[str]`: List of modules that should not be split - """ - _no_split_modules = set() - modules_to_check = [self] - while len(modules_to_check) > 0: - module = modules_to_check.pop(-1) - # if the module does not appear in _no_split_modules, we also check the children - if module.__class__.__name__ not in _no_split_modules: - if isinstance(module, ModelMixin): - if module._no_split_modules is None: - raise ValueError( - f"{module.__class__.__name__} does not support `device_map='{device_map}'`. To implement support, the model " - "class needs to implement the `_no_split_modules` attribute." 
- ) - else: - _no_split_modules = _no_split_modules | set(module._no_split_modules) - modules_to_check += list(module.children()) - return list(_no_split_modules) - @property def device(self) -> torch.device: - """ - `torch.device`: The device on which the module is (assuming that all the module parameters are on the same - device). - """ return get_parameter_device(self) @property def dtype(self) -> torch.dtype: - """ - `torch.dtype`: The dtype of the module (assuming that all the module parameters have the same dtype). - """ return get_parameter_dtype(self) - def num_parameters(self, only_trainable: bool = False, exclude_embeddings: bool = False) -> int: - """ - Get number of (trainable or non-embedding) parameters in the module. - - Args: - only_trainable (`bool`, *optional*, defaults to `False`): - Whether or not to return only the number of trainable parameters. - exclude_embeddings (`bool`, *optional*, defaults to `False`): - Whether or not to return only the number of non-embedding parameters. - - Returns: - `int`: The number of parameters. - - Example: - - ```py - from diffusers import UNet2DConditionModel - - model_id = "runwayml/stable-diffusion-v1-5" - unet = UNet2DConditionModel.from_pretrained(model_id, subfolder="unet") - unet.num_parameters(only_trainable=True) - 859520964 - ``` - """ - is_loaded_in_4bit = getattr(self, "is_loaded_in_4bit", False) - - if is_loaded_in_4bit: - if is_bitsandbytes_available(): - import bitsandbytes as bnb - else: - raise ValueError( - "bitsandbytes is not installed but it seems that the model has been loaded in 4bit precision, something went wrong" - " make sure to install bitsandbytes with `pip install bitsandbytes`. You also need a GPU. " - ) - - if exclude_embeddings: - embedding_param_names = [ - f"{name}.weight" for name, module_type in self.named_modules() if isinstance(module_type, nn.Embedding) - ] - total_parameters = [ - parameter for name, parameter in self.named_parameters() if name not in embedding_param_names - ] - else: - total_parameters = list(self.parameters()) - - total_numel = [] - - for param in total_parameters: - if param.requires_grad or not only_trainable: - # For 4bit models, we need to multiply the number of parameters by 2 as half of the parameters are - # used for the 4bit quantization (uint8 tensors are stored) - if is_loaded_in_4bit and isinstance(param, bnb.nn.Params4bit): - if hasattr(param, "element_size"): - num_bytes = param.element_size() - elif hasattr(param, "quant_storage"): - num_bytes = param.quant_storage.itemsize - else: - num_bytes = 1 - total_numel.append(param.numel() * 2 * num_bytes) - else: - total_numel.append(param.numel()) - - return sum(total_numel) - - def get_memory_footprint(self, return_buffers=True): - r""" - Get the memory footprint of a model. This will return the memory footprint of the current model in bytes. - Useful to benchmark the memory footprint of the current model and design some tests. Solution inspired from the - PyTorch discussions: https://discuss.pytorch.org/t/gpu-memory-that-model-uses/56822/2 - - Arguments: - return_buffers (`bool`, *optional*, defaults to `True`): - Whether to return the size of the buffer tensors in the computation of the memory footprint. Buffers - are tensors that do not require gradients and not registered as parameters. E.g. mean and std in batch - norm layers. 
Please see: https://discuss.pytorch.org/t/what-pytorch-means-by-buffers/120266/2 - """ - mem = sum([param.nelement() * param.element_size() for param in self.parameters()]) - if return_buffers: - mem_bufs = sum([buf.nelement() * buf.element_size() for buf in self.buffers()]) - mem = mem + mem_bufs - return mem - def _convert_deprecated_attention_blocks(self, state_dict: OrderedDict) -> None: deprecated_attention_block_paths = [] @@ -879,10 +745,6 @@ class ModelMixin(torch.nn.Module, PushToHubMixin): recursive_find_attn_block("", self) - # NOTE: we have to check if the deprecated parameters are in the state dict - # because it is possible we are loading from a state dict that was already - # converted - for path in deprecated_attention_block_paths: # group_norm path stays the same @@ -909,53 +771,3 @@ class ModelMixin(torch.nn.Module, PushToHubMixin): state_dict[f"{path}.to_out.0.weight"] = state_dict.pop(f"{path}.proj_attn.weight") if f"{path}.proj_attn.bias" in state_dict: state_dict[f"{path}.to_out.0.bias"] = state_dict.pop(f"{path}.proj_attn.bias") - - def _temp_convert_self_to_deprecated_attention_blocks(self) -> None: - deprecated_attention_block_modules = [] - - def recursive_find_attn_block(module): - if hasattr(module, "_from_deprecated_attn_block") and module._from_deprecated_attn_block: - deprecated_attention_block_modules.append(module) - - for sub_module in module.children(): - recursive_find_attn_block(sub_module) - - recursive_find_attn_block(self) - - for module in deprecated_attention_block_modules: - module.query = module.to_q - module.key = module.to_k - module.value = module.to_v - module.proj_attn = module.to_out[0] - - # We don't _have_ to delete the old attributes, but it's helpful to ensure - # that _all_ the weights are loaded into the new attributes and we're not - # making an incorrect assumption that this model should be converted when - # it really shouldn't be. 
- del module.to_q - del module.to_k - del module.to_v - del module.to_out - - def _undo_temp_convert_self_to_deprecated_attention_blocks(self) -> None: - deprecated_attention_block_modules = [] - - def recursive_find_attn_block(module) -> None: - if hasattr(module, "_from_deprecated_attn_block") and module._from_deprecated_attn_block: - deprecated_attention_block_modules.append(module) - - for sub_module in module.children(): - recursive_find_attn_block(sub_module) - - recursive_find_attn_block(self) - - for module in deprecated_attention_block_modules: - module.to_q = module.query - module.to_k = module.key - module.to_v = module.value - module.to_out = nn.ModuleList([module.proj_attn, nn.Dropout(module.dropout)]) - - del module.query - del module.key - del module.value - del module.proj_attn -- Gitee From 11c5ead9215330f394ea1acf490f8dbe6b11bbb6 Mon Sep 17 00:00:00 2001 From: jiangmengyu Date: Tue, 31 Dec 2024 11:15:57 +0800 Subject: [PATCH 60/91] =?UTF-8?q?cogview3=E9=80=82=E9=85=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../models/transformer_cogview3plus.py | 57 +------------------ .../pipeline/pipeline_cogview3plus.py | 27 --------- .../schedulers/scheduling_ddim_cogvideox.py | 56 ------------------ .../schedulers/scheduling_utils.py | 6 +- 4 files changed, 2 insertions(+), 144 deletions(-) diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/transformer_cogview3plus.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/transformer_cogview3plus.py index b98e3e2526..6f4fa104b8 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/transformer_cogview3plus.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/transformer_cogview3plus.py @@ -20,7 +20,7 @@ import numpy as np from diffusers.configuration_utils import ConfigMixin, register_to_config from diffusers.models.attention_processor import AttentionProcessor -from diffusers.utils import is_torch_version, logging +from diffusers.utils import logging from diffusers.models.modeling_outputs import Transformer2DModelOutput from .modeling_utils import ModelMixin @@ -34,18 +34,6 @@ logger = logging.get_logger(__name__) # pylint: disable=invalid-name class CogView3PlusTransformerBlock(nn.Module): - r""" - Args: - dim (`int`): - The number of channels in the input and output. - num_attention_heads (`int`): - The number of heads to use for multi-head attention. - attention_head_dim (`int`): - The number of channels in each head. - time_embed_dim (`int`): - The number of channels in timestep embedding. - """ - def __init__( self, dim: int = 2560, @@ -125,37 +113,6 @@ class CogView3PlusTransformerBlock(nn.Module): class CogView3PlusTransformer2DModel(ModelMixin, ConfigMixin): - r""" - Args: - patch_size (`int`, defaults to `2`): - The size of the patches to use in the patch embedding layer. - in_channels (`int`, defaults to `16`): - The number of channels in the input. - num_layers (`int`, defaults to `30`): - The number of layers of Transformer blocks to use. - attention_head_dim (`int`, defaults to `40`): - The number of channels in each head. - num_attention_heads (`int`, defaults to `64`): - The number of heads to use for multi-head attention. - out_channels (`int`, defaults to `16`): - The number of channels in the output. - text_embed_dim (`int`, defaults to `4096`): - Input dimension of text embeddings from the text encoder. 
- time_embed_dim (`int`, defaults to `512`): - Output dimension of timestep embeddings. - condition_dim (`int`, defaults to `256`): - The embedding dimension of the input SDXL-style resolution conditions (original_size, target_size, - crop_coords). - pos_embed_max_size (`int`, defaults to `128`): - The maximum resolution of the positional embeddings, from which slices of shape `H x W` are taken and added - to input patched latents, where `H` and `W` are the latent height and width respectively. A value of 128 - means that the maximum supported height and width for image generation is `128 * vae_scale_factor * - patch_size => 128 * 8 * 2 => 2048`. - sample_size (`int`, defaults to `128`): - The base resolution of input latents. If height/width is not provided during generation, this value is used - to determine the resolution as `sample_size * vae_scale_factor => 128 * 8 => 1024` - """ - _supports_gradient_checkpointing = True @register_to_config @@ -266,18 +223,6 @@ class CogView3PlusTransformer2DModel(ModelMixin, ConfigMixin): # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.set_attn_processor def set_attn_processor(self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]]): - r""" - Sets the attention processor to use to compute attention. - - Parameters: - processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`): - The instantiated processor class or a dictionary of processor classes that will be set as the processor - for **all** `Attention` layers. - - If `processor` is a dict, the key needs to define the path to the corresponding cross attention - processor. This is strongly recommended when setting trainable attention processors. - - """ count = len(self.attn_processors.keys()) if isinstance(processor, dict) and len(processor) != count: diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py index 1dda0a2108..05f6ddf53a 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py @@ -41,29 +41,6 @@ def retrieve_timesteps( sigmas: Optional[List[float]] = None, **kwargs, ): - r""" - Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles - custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`. - - Args: - scheduler (`SchedulerMixin`): - The scheduler to get timesteps from. - num_inference_steps (`int`): - The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps` - must be `None`. - device (`str` or `torch.device`, *optional*): - The device to which the timesteps should be moved to. If `None`, the timesteps are not moved. - timesteps (`List[int]`, *optional*): - Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed, - `num_inference_steps` and `sigmas` must be `None`. - sigmas (`List[float]`, *optional*): - Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed, - `num_inference_steps` and `timesteps` must be `None`. - - Returns: - `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the - second element is the number of inference steps. 
- """ if timesteps is not None and sigmas is not None: raise ValueError("Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values") if timesteps is not None: @@ -121,7 +98,6 @@ class CogView3PlusPipeline(DiffusionPipeline): self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor) - # Copied from diffusers.pipelines.cogvideo.pipeline_cogvideox.CogVideoXPipeline._get_t5_prompt_embeds with num_videos_per_prompt->num_images_per_prompt def _get_t5_prompt_embeds( self, prompt: Union[str, List[str]] = None, @@ -186,7 +162,6 @@ class CogView3PlusPipeline(DiffusionPipeline): return prompt_embeds, negative_prompt_embeds - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents def prepare_latents(self, batch_size, num_channels_latents, image_size, dtype, device): height = image_size[0] width = image_size[1] @@ -202,7 +177,6 @@ class CogView3PlusPipeline(DiffusionPipeline): latents = latents * self.scheduler.init_noise_sigma return latents - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs def prepare_extra_step_kwargs(self, generator, eta): accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) extra_step_kwargs = {} @@ -215,7 +189,6 @@ class CogView3PlusPipeline(DiffusionPipeline): extra_step_kwargs["generator"] = generator return extra_step_kwargs - # Copied from diffusers.pipelines.latte.pipeline_latte.LattePipeline.check_inputs def check_inputs( self, prompt, diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/schedulers/scheduling_ddim_cogvideox.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/schedulers/scheduling_ddim_cogvideox.py index 26ae48a2c8..29ad7a2c81 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/schedulers/scheduling_ddim_cogvideox.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/schedulers/scheduling_ddim_cogvideox.py @@ -26,7 +26,6 @@ from .scheduling_utils import KarrasDiffusionSchedulers, SchedulerMixin @dataclass -# Copied from diffusers.schedulers.scheduling_ddpm.DDPMSchedulerOutput with DDPM->DDIM class DDIMSchedulerOutput(BaseOutput): """ Output class for the scheduler's `step` function output. @@ -43,31 +42,11 @@ class DDIMSchedulerOutput(BaseOutput): prev_sample: torch.Tensor pred_original_sample: Optional[torch.Tensor] = None - -# Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar def betas_for_alpha_bar( num_diffusion_timesteps, max_beta=0.999, alpha_transform_type="cosine", ): - """ - Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of - (1-beta) over time from t = [0,1]. - - Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up - to that part of the diffusion process. - - - Args: - num_diffusion_timesteps (`int`): the number of betas to produce. - max_beta (`float`): the maximum beta to use; use values lower than 1 to - prevent singularities. - alpha_transform_type (`str`, *optional*, default to `cosine`): the type of noise schedule for alpha_bar. 
- Choose from `cosine` or `exp` - - Returns: - betas (`np.ndarray`): the betas used by the scheduler to step the model outputs - """ if alpha_transform_type == "cosine": def alpha_bar_fn(t): @@ -90,18 +69,6 @@ def betas_for_alpha_bar( def rescale_zero_terminal_snr(alphas_cumprod): - """ - Rescales betas to have zero terminal SNR Based on https://arxiv.org/pdf/2305.08891.pdf (Algorithm 1) - - - Args: - betas (`torch.Tensor`): - the betas that the scheduler is being initialized with. - - Returns: - `torch.Tensor`: rescaled betas with zero terminal SNR - """ - alphas_bar_sqrt = alphas_cumprod.sqrt() # Store old values. @@ -132,13 +99,7 @@ class CogVideoXDDIMScheduler(SchedulerMixin, ConfigMixin): beta_end: float = 0.0120, beta_schedule: str = "scaled_linear", trained_betas: Optional[Union[np.ndarray, List[float]]] = None, - clip_sample: bool = True, set_alpha_to_one: bool = True, - steps_offset: int = 0, - prediction_type: str = "epsilon", - clip_sample_range: float = 1.0, - sample_max_value: float = 1.0, - timestep_spacing: str = "leading", rescale_betas_zero_snr: bool = False, snr_shift_scale: float = 3.0, ): @@ -188,23 +149,6 @@ class CogVideoXDDIMScheduler(SchedulerMixin, ConfigMixin): return variance - def scale_model_input(self, sample: torch.Tensor, timestep: Optional[int] = None) -> torch.Tensor: - """ - Ensures interchangeability with schedulers that need to scale the denoising model input depending on the - current timestep. - - Args: - sample (`torch.Tensor`): - The input sample. - timestep (`int`, *optional*): - The current timestep in the diffusion chain. - - Returns: - `torch.Tensor`: - A scaled input sample. - """ - return sample - def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.device] = None): """ Sets the discrete timesteps used for the diffusion chain (to be run before inference). diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/schedulers/scheduling_utils.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/schedulers/scheduling_utils.py index 3f1fb5bc32..ae88225358 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/schedulers/scheduling_utils.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/schedulers/scheduling_utils.py @@ -27,10 +27,6 @@ from diffusers.utils import BaseOutput, PushToHubMixin SCHEDULER_CONFIG_NAME = "scheduler_config.json" -# NOTE: We make this type an enum because it simplifies usage in docs and prevents -# circular imports when used for `_compatibles` within the schedulers module. -# When it's used as a type in pipelines, it really is a Union because the actual -# scheduler instance is passed in. 
class KarrasDiffusionSchedulers(Enum): DDIMScheduler = 1 DDPMScheduler = 2 @@ -88,7 +84,7 @@ class SchedulerMixin(PushToHubMixin): **kwargs, ): - config, kwargs, commit_hash = cls.load_config( + config, kwargs, _ = cls.load_config( pretrained_model_name_or_path=pretrained_model_name_or_path, subfolder=subfolder, return_unused_kwargs=True, -- Gitee From 062f09f5dcd2432d568a528ed1d7c7babd62c16e Mon Sep 17 00:00:00 2001 From: jiangmengyu Date: Tue, 31 Dec 2024 11:18:27 +0800 Subject: [PATCH 61/91] =?UTF-8?q?cogview3=E9=80=82=E9=85=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../schedulers/scheduling_ddim_cogvideox.py | 3 + .../cogview3/cogview3plus/vae/__init__.py | 2 +- .../cogview3plus/vae/autoencoder_kl.py | 517 --------- .../cogview3/cogview3plus/vae/vae.py | 996 ------------------ 4 files changed, 4 insertions(+), 1514 deletions(-) delete mode 100644 MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/vae/autoencoder_kl.py delete mode 100644 MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/vae/vae.py diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/schedulers/scheduling_ddim_cogvideox.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/schedulers/scheduling_ddim_cogvideox.py index 29ad7a2c81..f94de1c81d 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/schedulers/scheduling_ddim_cogvideox.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/schedulers/scheduling_ddim_cogvideox.py @@ -149,6 +149,9 @@ class CogVideoXDDIMScheduler(SchedulerMixin, ConfigMixin): return variance + def scale_model_input(self, sample: torch.Tensor, timestep: Optional[int] = None) -> torch.Tensor: + return sample + def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.device] = None): """ Sets the discrete timesteps used for the diffusion chain (to be run before inference). diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/vae/__init__.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/vae/__init__.py index 58bbb8f14e..261968dc69 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/vae/__init__.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/vae/__init__.py @@ -1 +1 @@ -from .autoencoder_kl import AutoencoderKL \ No newline at end of file +from diffusers import AutoencoderKL \ No newline at end of file diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/vae/autoencoder_kl.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/vae/autoencoder_kl.py deleted file mode 100644 index 3f524408c9..0000000000 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/vae/autoencoder_kl.py +++ /dev/null @@ -1,517 +0,0 @@ -# Copyright 2024 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# https://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from typing import Dict, Optional, Tuple, Union - -import torch -import torch.nn as nn - -from diffusers.configuration_utils import ConfigMixin, register_to_config -from diffusers.loaders import PeftAdapterMixin -from diffusers.loaders.single_file_model import FromOriginalModelMixin -from diffusers.utils import deprecate -from diffusers.utils.accelerate_utils import apply_forward_hook -from diffusers.models.attention_processor import ( - ADDED_KV_ATTENTION_PROCESSORS, - CROSS_ATTENTION_PROCESSORS, - Attention, - AttentionProcessor, - AttnAddedKVProcessor, - AttnProcessor, - FusedAttnProcessor2_0, -) -from diffusers.models.modeling_outputs import AutoencoderKLOutput - -from ..models import ModelMixin -from .vae import Decoder, DecoderOutput, DiagonalGaussianDistribution, Encoder - - -class AutoencoderKL(ModelMixin, ConfigMixin, FromOriginalModelMixin, PeftAdapterMixin): - - _supports_gradient_checkpointing = True - _no_split_modules = ["BasicTransformerBlock", "ResnetBlock2D"] - - @register_to_config - def __init__( - self, - in_channels: int = 3, - out_channels: int = 3, - down_block_types: Tuple[str] = ("DownEncoderBlock2D",), - up_block_types: Tuple[str] = ("UpDecoderBlock2D",), - block_out_channels: Tuple[int] = (64,), - layers_per_block: int = 1, - act_fn: str = "silu", - latent_channels: int = 4, - norm_num_groups: int = 32, - sample_size: int = 32, - use_quant_conv: bool = True, - use_post_quant_conv: bool = True, - mid_block_add_attention: bool = True, - ): - super().__init__() - - # pass init params to Encoder - self.encoder = Encoder( - in_channels=in_channels, - out_channels=latent_channels, - down_block_types=down_block_types, - block_out_channels=block_out_channels, - layers_per_block=layers_per_block, - act_fn=act_fn, - norm_num_groups=norm_num_groups, - double_z=True, - mid_block_add_attention=mid_block_add_attention, - ) - - # pass init params to Decoder - self.decoder = Decoder( - in_channels=latent_channels, - out_channels=out_channels, - up_block_types=up_block_types, - block_out_channels=block_out_channels, - layers_per_block=layers_per_block, - norm_num_groups=norm_num_groups, - act_fn=act_fn, - mid_block_add_attention=mid_block_add_attention, - ) - - self.quant_conv = nn.Conv2d(2 * latent_channels, 2 * latent_channels, 1) if use_quant_conv else None - self.post_quant_conv = nn.Conv2d(latent_channels, latent_channels, 1) if use_post_quant_conv else None - - self.use_slicing = False - self.use_tiling = False - - # only relevant if vae tiling is enabled - self.tile_sample_min_size = self.config.sample_size - sample_size = ( - self.config.sample_size[0] - if isinstance(self.config.sample_size, (list, tuple)) - else self.config.sample_size - ) - self.tile_latent_min_size = int(sample_size / (2 ** (len(self.config.block_out_channels) - 1))) - self.tile_overlap_factor = 0.25 - - def _set_gradient_checkpointing(self, module, value=False): - if isinstance(module, (Encoder, Decoder)): - module.gradient_checkpointing = value - - def enable_tiling(self, use_tiling: bool = True): - r""" - Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to - compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow - processing larger images. - """ - self.use_tiling = use_tiling - - def disable_tiling(self): - r""" - Disable tiled VAE decoding. If `enable_tiling` was previously enabled, this method will go back to computing - decoding in one step. 
- """ - self.enable_tiling(False) - - def enable_slicing(self): - r""" - Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to - compute decoding in several steps. This is useful to save some memory and allow larger batch sizes. - """ - self.use_slicing = True - - def disable_slicing(self): - r""" - Disable sliced VAE decoding. If `enable_slicing` was previously enabled, this method will go back to computing - decoding in one step. - """ - self.use_slicing = False - - @property - # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.attn_processors - def attn_processors(self) -> Dict[str, AttentionProcessor]: - r""" - Returns: - `dict` of attention processors: A dictionary containing all attention processors used in the model with - indexed by its weight name. - """ - # set recursively - processors = {} - - def fn_recursive_add_processors(name: str, module: torch.nn.Module, processors: Dict[str, AttentionProcessor]): - if hasattr(module, "get_processor"): - processors[f"{name}.processor"] = module.get_processor() - - for sub_name, child in module.named_children(): - fn_recursive_add_processors(f"{name}.{sub_name}", child, processors) - - return processors - - for name, module in self.named_children(): - fn_recursive_add_processors(name, module, processors) - - return processors - - # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.set_attn_processor - def set_attn_processor(self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]]): - r""" - Sets the attention processor to use to compute attention. - - Parameters: - processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`): - The instantiated processor class or a dictionary of processor classes that will be set as the processor - for **all** `Attention` layers. - - If `processor` is a dict, the key needs to define the path to the corresponding cross attention - processor. This is strongly recommended when setting trainable attention processors. - - """ - count = len(self.attn_processors.keys()) - - if isinstance(processor, dict) and len(processor) != count: - raise ValueError( - f"A dict of processors was passed, but the number of processors {len(processor)} does not match the" - f" number of attention layers: {count}. Please make sure to pass {count} processor classes." - ) - - def fn_recursive_attn_processor(name: str, module: torch.nn.Module, processor): - if hasattr(module, "set_processor"): - if not isinstance(processor, dict): - module.set_processor(processor) - else: - module.set_processor(processor.pop(f"{name}.processor")) - - for sub_name, child in module.named_children(): - fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor) - - for name, module in self.named_children(): - fn_recursive_attn_processor(name, module, processor) - - # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.set_default_attn_processor - def set_default_attn_processor(self): - """ - Disables custom attention processors and sets the default attention implementation. 
- """ - if all(proc.__class__ in ADDED_KV_ATTENTION_PROCESSORS for proc in self.attn_processors.values()): - processor = AttnAddedKVProcessor() - elif all(proc.__class__ in CROSS_ATTENTION_PROCESSORS for proc in self.attn_processors.values()): - processor = AttnProcessor() - else: - raise ValueError( - f"Cannot call `set_default_attn_processor` when attention processors are of type {next(iter(self.attn_processors.values()))}" - ) - - self.set_attn_processor(processor) - - def _encode(self, x: torch.Tensor) -> torch.Tensor: - batch_size, num_channels, height, width = x.shape - - if self.use_tiling and (width > self.tile_sample_min_size or height > self.tile_sample_min_size): - return self._tiled_encode(x) - - enc = self.encoder(x) - if self.quant_conv is not None: - enc = self.quant_conv(enc) - - return enc - - @apply_forward_hook - def encode( - self, x: torch.Tensor, return_dict: bool = True - ) -> Union[AutoencoderKLOutput, Tuple[DiagonalGaussianDistribution]]: - """ - Encode a batch of images into latents. - - Args: - x (`torch.Tensor`): Input batch of images. - return_dict (`bool`, *optional*, defaults to `True`): - Whether to return a [`~models.autoencoder_kl.AutoencoderKLOutput`] instead of a plain tuple. - - Returns: - The latent representations of the encoded images. If `return_dict` is True, a - [`~models.autoencoder_kl.AutoencoderKLOutput`] is returned, otherwise a plain `tuple` is returned. - """ - if self.use_slicing and x.shape[0] > 1: - encoded_slices = [self._encode(x_slice) for x_slice in x.split(1)] - h = torch.cat(encoded_slices) - else: - h = self._encode(x) - - posterior = DiagonalGaussianDistribution(h) - - if not return_dict: - return (posterior,) - - return AutoencoderKLOutput(latent_dist=posterior) - - def _decode(self, z: torch.Tensor, return_dict: bool = True) -> Union[DecoderOutput, torch.Tensor]: - if self.use_tiling and (z.shape[-1] > self.tile_latent_min_size or z.shape[-2] > self.tile_latent_min_size): - return self.tiled_decode(z, return_dict=return_dict) - - if self.post_quant_conv is not None: - z = self.post_quant_conv(z) - - dec = self.decoder(z) - - if not return_dict: - return (dec,) - - return DecoderOutput(sample=dec) - - @apply_forward_hook - def decode( - self, z: torch.FloatTensor, return_dict: bool = True, generator=None - ) -> Union[DecoderOutput, torch.FloatTensor]: - """ - Decode a batch of images. - - Args: - z (`torch.Tensor`): Input batch of latent vectors. - return_dict (`bool`, *optional*, defaults to `True`): - Whether to return a [`~models.vae.DecoderOutput`] instead of a plain tuple. - - Returns: - [`~models.vae.DecoderOutput`] or `tuple`: - If return_dict is True, a [`~models.vae.DecoderOutput`] is returned, otherwise a plain `tuple` is - returned. 
- - """ - if self.use_slicing and z.shape[0] > 1: - decoded_slices = [self._decode(z_slice).sample for z_slice in z.split(1)] - decoded = torch.cat(decoded_slices) - else: - decoded = self._decode(z).sample - - if not return_dict: - return (decoded,) - - return DecoderOutput(sample=decoded) - - def blend_v(self, a: torch.Tensor, b: torch.Tensor, blend_extent: int) -> torch.Tensor: - blend_extent = min(a.shape[2], b.shape[2], blend_extent) - for y in range(blend_extent): - b[:, :, y, :] = a[:, :, -blend_extent + y, :] * (1 - y / blend_extent) + b[:, :, y, :] * (y / blend_extent) - return b - - def blend_h(self, a: torch.Tensor, b: torch.Tensor, blend_extent: int) -> torch.Tensor: - blend_extent = min(a.shape[3], b.shape[3], blend_extent) - for x in range(blend_extent): - b[:, :, :, x] = a[:, :, :, -blend_extent + x] * (1 - x / blend_extent) + b[:, :, :, x] * (x / blend_extent) - return b - - def _tiled_encode(self, x: torch.Tensor) -> torch.Tensor: - r"""Encode a batch of images using a tiled encoder. - - When this option is enabled, the VAE will split the input tensor into tiles to compute encoding in several - steps. This is useful to keep memory use constant regardless of image size. The end result of tiled encoding is - different from non-tiled encoding because each tile uses a different encoder. To avoid tiling artifacts, the - tiles overlap and are blended together to form a smooth output. You may still see tile-sized changes in the - output, but they should be much less noticeable. - - Args: - x (`torch.Tensor`): Input batch of images. - - Returns: - `torch.Tensor`: - The latent representation of the encoded videos. - """ - - overlap_size = int(self.tile_sample_min_size * (1 - self.tile_overlap_factor)) - blend_extent = int(self.tile_latent_min_size * self.tile_overlap_factor) - row_limit = self.tile_latent_min_size - blend_extent - - # Split the image into 512x512 tiles and encode them separately. - rows = [] - for i in range(0, x.shape[2], overlap_size): - row = [] - for j in range(0, x.shape[3], overlap_size): - tile = x[:, :, i : i + self.tile_sample_min_size, j : j + self.tile_sample_min_size] - tile = self.encoder(tile) - if self.config.use_quant_conv: - tile = self.quant_conv(tile) - row.append(tile) - rows.append(row) - result_rows = [] - for i, row in enumerate(rows): - result_row = [] - for j, tile in enumerate(row): - # blend the above tile and the left tile - # to the current tile and add the current tile to the result row - if i > 0: - tile = self.blend_v(rows[i - 1][j], tile, blend_extent) - if j > 0: - tile = self.blend_h(row[j - 1], tile, blend_extent) - result_row.append(tile[:, :, :row_limit, :row_limit]) - result_rows.append(torch.cat(result_row, dim=3)) - - enc = torch.cat(result_rows, dim=2) - return enc - - def tiled_encode(self, x: torch.Tensor, return_dict: bool = True) -> AutoencoderKLOutput: - deprecation_message = ( - "The tiled_encode implementation supporting the `return_dict` parameter is deprecated. In the future, the " - "implementation of this method will be replaced with that of `_tiled_encode` and you will no longer be able " - "to pass `return_dict`. You will also have to create a `DiagonalGaussianDistribution()` from the returned value." 
- ) - deprecate("tiled_encode", "1.0.0", deprecation_message, standard_warn=False) - - overlap_size = int(self.tile_sample_min_size * (1 - self.tile_overlap_factor)) - blend_extent = int(self.tile_latent_min_size * self.tile_overlap_factor) - row_limit = self.tile_latent_min_size - blend_extent - - # Split the image into 512x512 tiles and encode them separately. - rows = [] - for i in range(0, x.shape[2], overlap_size): - row = [] - for j in range(0, x.shape[3], overlap_size): - tile = x[:, :, i : i + self.tile_sample_min_size, j : j + self.tile_sample_min_size] - tile = self.encoder(tile) - if self.config.use_quant_conv: - tile = self.quant_conv(tile) - row.append(tile) - rows.append(row) - result_rows = [] - for i, row in enumerate(rows): - result_row = [] - for j, tile in enumerate(row): - # blend the above tile and the left tile - # to the current tile and add the current tile to the result row - if i > 0: - tile = self.blend_v(rows[i - 1][j], tile, blend_extent) - if j > 0: - tile = self.blend_h(row[j - 1], tile, blend_extent) - result_row.append(tile[:, :, :row_limit, :row_limit]) - result_rows.append(torch.cat(result_row, dim=3)) - - moments = torch.cat(result_rows, dim=2) - posterior = DiagonalGaussianDistribution(moments) - - if not return_dict: - return (posterior,) - - return AutoencoderKLOutput(latent_dist=posterior) - - def tiled_decode(self, z: torch.Tensor, return_dict: bool = True) -> Union[DecoderOutput, torch.Tensor]: - r""" - Decode a batch of images using a tiled decoder. - - Args: - z (`torch.Tensor`): Input batch of latent vectors. - return_dict (`bool`, *optional*, defaults to `True`): - Whether or not to return a [`~models.vae.DecoderOutput`] instead of a plain tuple. - - Returns: - [`~models.vae.DecoderOutput`] or `tuple`: - If return_dict is True, a [`~models.vae.DecoderOutput`] is returned, otherwise a plain `tuple` is - returned. - """ - overlap_size = int(self.tile_latent_min_size * (1 - self.tile_overlap_factor)) - blend_extent = int(self.tile_sample_min_size * self.tile_overlap_factor) - row_limit = self.tile_sample_min_size - blend_extent - - # Split z into overlapping 64x64 tiles and decode them separately. - # The tiles have an overlap to avoid seams between tiles. - rows = [] - for i in range(0, z.shape[2], overlap_size): - row = [] - for j in range(0, z.shape[3], overlap_size): - tile = z[:, :, i : i + self.tile_latent_min_size, j : j + self.tile_latent_min_size] - if self.config.use_post_quant_conv: - tile = self.post_quant_conv(tile) - decoded = self.decoder(tile) - row.append(decoded) - rows.append(row) - result_rows = [] - for i, row in enumerate(rows): - result_row = [] - for j, tile in enumerate(row): - # blend the above tile and the left tile - # to the current tile and add the current tile to the result row - if i > 0: - tile = self.blend_v(rows[i - 1][j], tile, blend_extent) - if j > 0: - tile = self.blend_h(row[j - 1], tile, blend_extent) - result_row.append(tile[:, :, :row_limit, :row_limit]) - result_rows.append(torch.cat(result_row, dim=3)) - - dec = torch.cat(result_rows, dim=2) - if not return_dict: - return (dec,) - - return DecoderOutput(sample=dec) - - def forward( - self, - sample: torch.Tensor, - sample_posterior: bool = False, - return_dict: bool = True, - generator: Optional[torch.Generator] = None, - ) -> Union[DecoderOutput, torch.Tensor]: - r""" - Args: - sample (`torch.Tensor`): Input sample. - sample_posterior (`bool`, *optional*, defaults to `False`): - Whether to sample from the posterior. 
- return_dict (`bool`, *optional*, defaults to `True`): - Whether or not to return a [`DecoderOutput`] instead of a plain tuple. - """ - x = sample - posterior = self.encode(x).latent_dist - if sample_posterior: - z = posterior.sample(generator=generator) - else: - z = posterior.mode() - dec = self.decode(z).sample - - if not return_dict: - return (dec,) - - return DecoderOutput(sample=dec) - - # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.fuse_qkv_projections - def fuse_qkv_projections(self): - """ - Enables fused QKV projections. For self-attention modules, all projection matrices (i.e., query, key, value) - are fused. For cross-attention modules, key and value projection matrices are fused. - - - - This API is 🧪 experimental. - - - """ - self.original_attn_processors = None - - for _, attn_processor in self.attn_processors.items(): - if "Added" in str(attn_processor.__class__.__name__): - raise ValueError("`fuse_qkv_projections()` is not supported for models having added KV projections.") - - self.original_attn_processors = self.attn_processors - - for module in self.modules(): - if isinstance(module, Attention): - module.fuse_projections(fuse=True) - - self.set_attn_processor(FusedAttnProcessor2_0()) - - # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.unfuse_qkv_projections - def unfuse_qkv_projections(self): - """Disables the fused QKV projection if enabled. - - - - This API is 🧪 experimental. - - - - """ - if self.original_attn_processors is not None: - self.set_attn_processor(self.original_attn_processors) \ No newline at end of file diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/vae/vae.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/vae/vae.py deleted file mode 100644 index c1abff7097..0000000000 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/vae/vae.py +++ /dev/null @@ -1,996 +0,0 @@ -# Copyright 2024 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# https://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from dataclasses import dataclass -from typing import Optional, Tuple - -import numpy as np -import torch -import torch.nn as nn - -from diffusers.utils import BaseOutput, is_torch_version -from diffusers.utils.torch_utils import randn_tensor -from diffusers.models.activations import get_activation -from diffusers.models.attention_processor import SpatialNorm -from diffusers.models.unets.unet_2d_blocks import ( - AutoencoderTinyBlock, - UNetMidBlock2D, - get_down_block, - get_up_block, -) - - -@dataclass -class EncoderOutput(BaseOutput): - r""" - Output of encoding method. - - Args: - latent (`torch.Tensor` of shape `(batch_size, num_channels, latent_height, latent_width)`): - The encoded latent. - """ - - latent: torch.Tensor - - -@dataclass -class DecoderOutput(BaseOutput): - r""" - Output of decoding method. - - Args: - sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)`): - The decoded output sample from the last layer of the model. 
- """ - - sample: torch.Tensor - commit_loss: Optional[torch.FloatTensor] = None - - -class Encoder(nn.Module): - r""" - The `Encoder` layer of a variational autoencoder that encodes its input into a latent representation. - - Args: - in_channels (`int`, *optional*, defaults to 3): - The number of input channels. - out_channels (`int`, *optional*, defaults to 3): - The number of output channels. - down_block_types (`Tuple[str, ...]`, *optional*, defaults to `("DownEncoderBlock2D",)`): - The types of down blocks to use. See `~diffusers.models.unet_2d_blocks.get_down_block` for available - options. - block_out_channels (`Tuple[int, ...]`, *optional*, defaults to `(64,)`): - The number of output channels for each block. - layers_per_block (`int`, *optional*, defaults to 2): - The number of layers per block. - norm_num_groups (`int`, *optional*, defaults to 32): - The number of groups for normalization. - act_fn (`str`, *optional*, defaults to `"silu"`): - The activation function to use. See `~diffusers.models.activations.get_activation` for available options. - double_z (`bool`, *optional*, defaults to `True`): - Whether to double the number of output channels for the last block. - """ - - def __init__( - self, - in_channels: int = 3, - out_channels: int = 3, - down_block_types: Tuple[str, ...] = ("DownEncoderBlock2D",), - block_out_channels: Tuple[int, ...] = (64,), - layers_per_block: int = 2, - norm_num_groups: int = 32, - act_fn: str = "silu", - double_z: bool = True, - mid_block_add_attention=True, - ): - super().__init__() - self.layers_per_block = layers_per_block - - self.conv_in = nn.Conv2d( - in_channels, - block_out_channels[0], - kernel_size=3, - stride=1, - padding=1, - ) - - self.down_blocks = nn.ModuleList([]) - - # down - output_channel = block_out_channels[0] - for i, down_block_type in enumerate(down_block_types): - input_channel = output_channel - output_channel = block_out_channels[i] - is_final_block = i == len(block_out_channels) - 1 - - down_block = get_down_block( - down_block_type, - num_layers=self.layers_per_block, - in_channels=input_channel, - out_channels=output_channel, - add_downsample=not is_final_block, - resnet_eps=1e-6, - downsample_padding=0, - resnet_act_fn=act_fn, - resnet_groups=norm_num_groups, - attention_head_dim=output_channel, - temb_channels=None, - ) - self.down_blocks.append(down_block) - - # mid - self.mid_block = UNetMidBlock2D( - in_channels=block_out_channels[-1], - resnet_eps=1e-6, - resnet_act_fn=act_fn, - output_scale_factor=1, - resnet_time_scale_shift="default", - attention_head_dim=block_out_channels[-1], - resnet_groups=norm_num_groups, - temb_channels=None, - add_attention=mid_block_add_attention, - ) - - # out - self.conv_norm_out = nn.GroupNorm(num_channels=block_out_channels[-1], num_groups=norm_num_groups, eps=1e-6) - self.conv_act = nn.SiLU() - - conv_out_channels = 2 * out_channels if double_z else out_channels - self.conv_out = nn.Conv2d(block_out_channels[-1], conv_out_channels, 3, padding=1) - - self.gradient_checkpointing = False - - def forward(self, sample: torch.Tensor) -> torch.Tensor: - r"""The forward method of the `Encoder` class.""" - - sample = self.conv_in(sample) - - if torch.is_grad_enabled() and self.gradient_checkpointing: - - def create_custom_forward(module): - def custom_forward(*inputs): - return module(*inputs) - - return custom_forward - - # down - if is_torch_version(">=", "1.11.0"): - for down_block in self.down_blocks: - sample = torch.utils.checkpoint.checkpoint( - create_custom_forward(down_block), 
sample, use_reentrant=False - ) - # middle - sample = torch.utils.checkpoint.checkpoint( - create_custom_forward(self.mid_block), sample, use_reentrant=False - ) - else: - for down_block in self.down_blocks: - sample = torch.utils.checkpoint.checkpoint(create_custom_forward(down_block), sample) - # middle - sample = torch.utils.checkpoint.checkpoint(create_custom_forward(self.mid_block), sample) - - else: - # down - for down_block in self.down_blocks: - sample = down_block(sample) - - # middle - sample = self.mid_block(sample) - - # post-process - sample = self.conv_norm_out(sample) - sample = self.conv_act(sample) - sample = self.conv_out(sample) - - return sample - - -class Decoder(nn.Module): - r""" - The `Decoder` layer of a variational autoencoder that decodes its latent representation into an output sample. - - Args: - in_channels (`int`, *optional*, defaults to 3): - The number of input channels. - out_channels (`int`, *optional*, defaults to 3): - The number of output channels. - up_block_types (`Tuple[str, ...]`, *optional*, defaults to `("UpDecoderBlock2D",)`): - The types of up blocks to use. See `~diffusers.models.unet_2d_blocks.get_up_block` for available options. - block_out_channels (`Tuple[int, ...]`, *optional*, defaults to `(64,)`): - The number of output channels for each block. - layers_per_block (`int`, *optional*, defaults to 2): - The number of layers per block. - norm_num_groups (`int`, *optional*, defaults to 32): - The number of groups for normalization. - act_fn (`str`, *optional*, defaults to `"silu"`): - The activation function to use. See `~diffusers.models.activations.get_activation` for available options. - norm_type (`str`, *optional*, defaults to `"group"`): - The normalization type to use. Can be either `"group"` or `"spatial"`. - """ - - def __init__( - self, - in_channels: int = 3, - out_channels: int = 3, - up_block_types: Tuple[str, ...] = ("UpDecoderBlock2D",), - block_out_channels: Tuple[int, ...] 
= (64,), - layers_per_block: int = 2, - norm_num_groups: int = 32, - act_fn: str = "silu", - norm_type: str = "group", # group, spatial - mid_block_add_attention=True, - ): - super().__init__() - self.layers_per_block = layers_per_block - - self.conv_in = nn.Conv2d( - in_channels, - block_out_channels[-1], - kernel_size=3, - stride=1, - padding=1, - ) - - self.up_blocks = nn.ModuleList([]) - - temb_channels = in_channels if norm_type == "spatial" else None - - # mid - self.mid_block = UNetMidBlock2D( - in_channels=block_out_channels[-1], - resnet_eps=1e-6, - resnet_act_fn=act_fn, - output_scale_factor=1, - resnet_time_scale_shift="default" if norm_type == "group" else norm_type, - attention_head_dim=block_out_channels[-1], - resnet_groups=norm_num_groups, - temb_channels=temb_channels, - add_attention=mid_block_add_attention, - ) - - # up - reversed_block_out_channels = list(reversed(block_out_channels)) - output_channel = reversed_block_out_channels[0] - for i, up_block_type in enumerate(up_block_types): - prev_output_channel = output_channel - output_channel = reversed_block_out_channels[i] - - is_final_block = i == len(block_out_channels) - 1 - - up_block = get_up_block( - up_block_type, - num_layers=self.layers_per_block + 1, - in_channels=prev_output_channel, - out_channels=output_channel, - prev_output_channel=None, - add_upsample=not is_final_block, - resnet_eps=1e-6, - resnet_act_fn=act_fn, - resnet_groups=norm_num_groups, - attention_head_dim=output_channel, - temb_channels=temb_channels, - resnet_time_scale_shift=norm_type, - ) - self.up_blocks.append(up_block) - prev_output_channel = output_channel - - # out - if norm_type == "spatial": - self.conv_norm_out = SpatialNorm(block_out_channels[0], temb_channels) - else: - self.conv_norm_out = nn.GroupNorm(num_channels=block_out_channels[0], num_groups=norm_num_groups, eps=1e-6) - self.conv_act = nn.SiLU() - self.conv_out = nn.Conv2d(block_out_channels[0], out_channels, 3, padding=1) - - self.gradient_checkpointing = False - - def forward( - self, - sample: torch.Tensor, - latent_embeds: Optional[torch.Tensor] = None, - ) -> torch.Tensor: - r"""The forward method of the `Decoder` class.""" - - sample = self.conv_in(sample) - - upscale_dtype = next(iter(self.up_blocks.parameters())).dtype - if torch.is_grad_enabled() and self.gradient_checkpointing: - - def create_custom_forward(module): - def custom_forward(*inputs): - return module(*inputs) - - return custom_forward - - if is_torch_version(">=", "1.11.0"): - # middle - sample = torch.utils.checkpoint.checkpoint( - create_custom_forward(self.mid_block), - sample, - latent_embeds, - use_reentrant=False, - ) - sample = sample.to(upscale_dtype) - - # up - for up_block in self.up_blocks: - sample = torch.utils.checkpoint.checkpoint( - create_custom_forward(up_block), - sample, - latent_embeds, - use_reentrant=False, - ) - else: - # middle - sample = torch.utils.checkpoint.checkpoint( - create_custom_forward(self.mid_block), sample, latent_embeds - ) - sample = sample.to(upscale_dtype) - - # up - for up_block in self.up_blocks: - sample = torch.utils.checkpoint.checkpoint(create_custom_forward(up_block), sample, latent_embeds) - else: - # middle - sample = self.mid_block(sample, latent_embeds) - sample = sample.to(upscale_dtype) - - # up - for up_block in self.up_blocks: - sample = up_block(sample, latent_embeds) - - # post-process - if latent_embeds is None: - sample = self.conv_norm_out(sample) - else: - sample = self.conv_norm_out(sample, latent_embeds) - sample = self.conv_act(sample) 
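# Added descriptive note (commentary, not part of the original source): the
# `latent_embeds` branch just above exists because `conv_norm_out` is a
# SpatialNorm when norm_type == "spatial" and must be conditioned on
# `latent_embeds`; with the default "group" norm, `latent_embeds` is None and
# the plain GroupNorm call is used instead.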
- sample = self.conv_out(sample) - - return sample - - -class UpSample(nn.Module): - r""" - The `UpSample` layer of a variational autoencoder that upsamples its input. - - Args: - in_channels (`int`, *optional*, defaults to 3): - The number of input channels. - out_channels (`int`, *optional*, defaults to 3): - The number of output channels. - """ - - def __init__( - self, - in_channels: int, - out_channels: int, - ) -> None: - super().__init__() - self.in_channels = in_channels - self.out_channels = out_channels - self.deconv = nn.ConvTranspose2d(in_channels, out_channels, kernel_size=4, stride=2, padding=1) - - def forward(self, x: torch.Tensor) -> torch.Tensor: - r"""The forward method of the `UpSample` class.""" - x = torch.relu(x) - x = self.deconv(x) - return x - - -class MaskConditionEncoder(nn.Module): - """ - used in AsymmetricAutoencoderKL - """ - - def __init__( - self, - in_ch: int, - out_ch: int = 192, - res_ch: int = 768, - stride: int = 16, - ) -> None: - super().__init__() - - channels = [] - while stride > 1: - stride = stride // 2 - in_ch_ = out_ch * 2 - if out_ch > res_ch: - out_ch = res_ch - if stride == 1: - in_ch_ = res_ch - channels.append((in_ch_, out_ch)) - out_ch *= 2 - - out_channels = [] - for _in_ch, _out_ch in channels: - out_channels.append(_out_ch) - out_channels.append(channels[-1][0]) - - layers = [] - in_ch_ = in_ch - for i, _ in enumerate(out_channels): - out_ch_ = out_channels[i] - if i == 0 or i == 1: - layers.append(nn.Conv2d(in_ch_, out_ch_, kernel_size=3, stride=1, padding=1)) - else: - layers.append(nn.Conv2d(in_ch_, out_ch_, kernel_size=4, stride=2, padding=1)) - in_ch_ = out_ch_ - - self.layers = nn.Sequential(*layers) - - def forward(self, x: torch.Tensor, mask=None) -> torch.Tensor: - r"""The forward method of the `MaskConditionEncoder` class.""" - out = {} - for i, _ in enumerate(self.layers): - layer = self.layers[i] - x = layer(x) - out[str(tuple(x.shape))] = x - x = torch.relu(x) - return out - - -class MaskConditionDecoder(nn.Module): - r"""The `MaskConditionDecoder` should be used in combination with [`AsymmetricAutoencoderKL`] to enhance the model's - decoder with a conditioner on the mask and masked image. - - Args: - in_channels (`int`, *optional*, defaults to 3): - The number of input channels. - out_channels (`int`, *optional*, defaults to 3): - The number of output channels. - up_block_types (`Tuple[str, ...]`, *optional*, defaults to `("UpDecoderBlock2D",)`): - The types of up blocks to use. See `~diffusers.models.unet_2d_blocks.get_up_block` for available options. - block_out_channels (`Tuple[int, ...]`, *optional*, defaults to `(64,)`): - The number of output channels for each block. - layers_per_block (`int`, *optional*, defaults to 2): - The number of layers per block. - norm_num_groups (`int`, *optional*, defaults to 32): - The number of groups for normalization. - act_fn (`str`, *optional*, defaults to `"silu"`): - The activation function to use. See `~diffusers.models.activations.get_activation` for available options. - norm_type (`str`, *optional*, defaults to `"group"`): - The normalization type to use. Can be either `"group"` or `"spatial"`. - """ - - def __init__( - self, - in_channels: int = 3, - out_channels: int = 3, - up_block_types: Tuple[str, ...] = ("UpDecoderBlock2D",), - block_out_channels: Tuple[int, ...] 
= (64,), - layers_per_block: int = 2, - norm_num_groups: int = 32, - act_fn: str = "silu", - norm_type: str = "group", # group, spatial - ): - super().__init__() - self.layers_per_block = layers_per_block - - self.conv_in = nn.Conv2d( - in_channels, - block_out_channels[-1], - kernel_size=3, - stride=1, - padding=1, - ) - - self.up_blocks = nn.ModuleList([]) - - temb_channels = in_channels if norm_type == "spatial" else None - - # mid - self.mid_block = UNetMidBlock2D( - in_channels=block_out_channels[-1], - resnet_eps=1e-6, - resnet_act_fn=act_fn, - output_scale_factor=1, - resnet_time_scale_shift="default" if norm_type == "group" else norm_type, - attention_head_dim=block_out_channels[-1], - resnet_groups=norm_num_groups, - temb_channels=temb_channels, - ) - - # up - reversed_block_out_channels = list(reversed(block_out_channels)) - output_channel = reversed_block_out_channels[0] - for i, up_block_type in enumerate(up_block_types): - prev_output_channel = output_channel - output_channel = reversed_block_out_channels[i] - - is_final_block = i == len(block_out_channels) - 1 - - up_block = get_up_block( - up_block_type, - num_layers=self.layers_per_block + 1, - in_channels=prev_output_channel, - out_channels=output_channel, - prev_output_channel=None, - add_upsample=not is_final_block, - resnet_eps=1e-6, - resnet_act_fn=act_fn, - resnet_groups=norm_num_groups, - attention_head_dim=output_channel, - temb_channels=temb_channels, - resnet_time_scale_shift=norm_type, - ) - self.up_blocks.append(up_block) - prev_output_channel = output_channel - - # condition encoder - self.condition_encoder = MaskConditionEncoder( - in_ch=out_channels, - out_ch=block_out_channels[0], - res_ch=block_out_channels[-1], - ) - - # out - if norm_type == "spatial": - self.conv_norm_out = SpatialNorm(block_out_channels[0], temb_channels) - else: - self.conv_norm_out = nn.GroupNorm(num_channels=block_out_channels[0], num_groups=norm_num_groups, eps=1e-6) - self.conv_act = nn.SiLU() - self.conv_out = nn.Conv2d(block_out_channels[0], out_channels, 3, padding=1) - - self.gradient_checkpointing = False - - def forward( - self, - z: torch.Tensor, - image: Optional[torch.Tensor] = None, - mask: Optional[torch.Tensor] = None, - latent_embeds: Optional[torch.Tensor] = None, - ) -> torch.Tensor: - r"""The forward method of the `MaskConditionDecoder` class.""" - sample = z - sample = self.conv_in(sample) - - upscale_dtype = next(iter(self.up_blocks.parameters())).dtype - if torch.is_grad_enabled() and self.gradient_checkpointing: - - def create_custom_forward(module): - def custom_forward(*inputs): - return module(*inputs) - - return custom_forward - - if is_torch_version(">=", "1.11.0"): - # middle - sample = torch.utils.checkpoint.checkpoint( - create_custom_forward(self.mid_block), - sample, - latent_embeds, - use_reentrant=False, - ) - sample = sample.to(upscale_dtype) - - # condition encoder - if image is not None and mask is not None: - masked_image = (1 - mask) * image - im_x = torch.utils.checkpoint.checkpoint( - create_custom_forward(self.condition_encoder), - masked_image, - mask, - use_reentrant=False, - ) - - # up - for up_block in self.up_blocks: - if image is not None and mask is not None: - sample_ = im_x[str(tuple(sample.shape))] - mask_ = nn.functional.interpolate(mask, size=sample.shape[-2:], mode="nearest") - sample = sample * mask_ + sample_ * (1 - mask_) - sample = torch.utils.checkpoint.checkpoint( - create_custom_forward(up_block), - sample, - latent_embeds, - use_reentrant=False, - ) - if image is not None 
and mask is not None: - sample = sample * mask + im_x[str(tuple(sample.shape))] * (1 - mask) - else: - # middle - sample = torch.utils.checkpoint.checkpoint( - create_custom_forward(self.mid_block), sample, latent_embeds - ) - sample = sample.to(upscale_dtype) - - # condition encoder - if image is not None and mask is not None: - masked_image = (1 - mask) * image - im_x = torch.utils.checkpoint.checkpoint( - create_custom_forward(self.condition_encoder), - masked_image, - mask, - ) - - # up - for up_block in self.up_blocks: - if image is not None and mask is not None: - sample_ = im_x[str(tuple(sample.shape))] - mask_ = nn.functional.interpolate(mask, size=sample.shape[-2:], mode="nearest") - sample = sample * mask_ + sample_ * (1 - mask_) - sample = torch.utils.checkpoint.checkpoint(create_custom_forward(up_block), sample, latent_embeds) - if image is not None and mask is not None: - sample = sample * mask + im_x[str(tuple(sample.shape))] * (1 - mask) - else: - # middle - sample = self.mid_block(sample, latent_embeds) - sample = sample.to(upscale_dtype) - - # condition encoder - if image is not None and mask is not None: - masked_image = (1 - mask) * image - im_x = self.condition_encoder(masked_image, mask) - - # up - for up_block in self.up_blocks: - if image is not None and mask is not None: - sample_ = im_x[str(tuple(sample.shape))] - mask_ = nn.functional.interpolate(mask, size=sample.shape[-2:], mode="nearest") - sample = sample * mask_ + sample_ * (1 - mask_) - sample = up_block(sample, latent_embeds) - if image is not None and mask is not None: - sample = sample * mask + im_x[str(tuple(sample.shape))] * (1 - mask) - - # post-process - if latent_embeds is None: - sample = self.conv_norm_out(sample) - else: - sample = self.conv_norm_out(sample, latent_embeds) - sample = self.conv_act(sample) - sample = self.conv_out(sample) - - return sample - - -class VectorQuantizer(nn.Module): - """ - Improved version over VectorQuantizer, can be used as a drop-in replacement. Mostly avoids costly matrix - multiplications and allows for post-hoc remapping of indices. - """ - - # NOTE: due to a bug the beta term was applied to the wrong term. for - # backwards compatibility we use the buggy version by default, but you can - # specify legacy=False to fix it. - def __init__( - self, - n_e: int, - vq_embed_dim: int, - beta: float, - remap=None, - unknown_index: str = "random", - sane_index_shape: bool = False, - legacy: bool = True, - ): - super().__init__() - self.n_e = n_e - self.vq_embed_dim = vq_embed_dim - self.beta = beta - self.legacy = legacy - - self.embedding = nn.Embedding(self.n_e, self.vq_embed_dim) - self.embedding.weight.data.uniform_(-1.0 / self.n_e, 1.0 / self.n_e) - - self.remap = remap - if self.remap is not None: - self.register_buffer("used", torch.tensor(np.load(self.remap))) - self.used: torch.Tensor - self.re_embed = self.used.shape[0] - self.unknown_index = unknown_index # "random" or "extra" or integer - if self.unknown_index == "extra": - self.unknown_index = self.re_embed - self.re_embed = self.re_embed + 1 - print( - f"Remapping {self.n_e} indices to {self.re_embed} indices. " - f"Using {self.unknown_index} for unknown indices." 
- ) - else: - self.re_embed = n_e - - self.sane_index_shape = sane_index_shape - - def remap_to_used(self, inds: torch.LongTensor) -> torch.LongTensor: - ishape = inds.shape - inds = inds.reshape(ishape[0], -1) - used = self.used.to(inds) - match = (inds[:, :, None] == used[None, None, ...]).long() - new = match.argmax(-1) - unknown = match.sum(2) < 1 - if self.unknown_index == "random": - new[unknown] = torch.randint(0, self.re_embed, size=new[unknown].shape).to(device=new.device) - else: - new[unknown] = self.unknown_index - return new.reshape(ishape) - - def unmap_to_all(self, inds: torch.LongTensor) -> torch.LongTensor: - ishape = inds.shape - inds = inds.reshape(ishape[0], -1) - used = self.used.to(inds) - if self.re_embed > self.used.shape[0]: # extra token - inds[inds >= self.used.shape[0]] = 0 # simply set to zero - back = torch.gather(used[None, :][inds.shape[0] * [0], :], 1, inds) - return back.reshape(ishape) - - def forward(self, z: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, Tuple]: - # reshape z -> (batch, height, width, channel) and flatten - z = z.permute(0, 2, 3, 1).contiguous() - z_flattened = z.view(-1, self.vq_embed_dim) - - # distances from z to embeddings e_j (z - e)^2 = z^2 + e^2 - 2 e * z - min_encoding_indices = torch.argmin(torch.cdist(z_flattened, self.embedding.weight), dim=1) - - z_q = self.embedding(min_encoding_indices).view(z.shape) - perplexity = None - min_encodings = None - - # compute loss for embedding - if not self.legacy: - loss = self.beta * torch.mean((z_q.detach() - z) ** 2) + torch.mean((z_q - z.detach()) ** 2) - else: - loss = torch.mean((z_q.detach() - z) ** 2) + self.beta * torch.mean((z_q - z.detach()) ** 2) - - # preserve gradients - z_q: torch.Tensor = z + (z_q - z).detach() - - # reshape back to match original input shape - z_q = z_q.permute(0, 3, 1, 2).contiguous() - - if self.remap is not None: - min_encoding_indices = min_encoding_indices.reshape(z.shape[0], -1) # add batch axis - min_encoding_indices = self.remap_to_used(min_encoding_indices) - min_encoding_indices = min_encoding_indices.reshape(-1, 1) # flatten - - if self.sane_index_shape: - min_encoding_indices = min_encoding_indices.reshape(z_q.shape[0], z_q.shape[2], z_q.shape[3]) - - return z_q, loss, (perplexity, min_encodings, min_encoding_indices) - - def get_codebook_entry(self, indices: torch.LongTensor, shape: Tuple[int, ...]) -> torch.Tensor: - # shape specifying (batch, height, width, channel) - if self.remap is not None: - indices = indices.reshape(shape[0], -1) # add batch axis - indices = self.unmap_to_all(indices) - indices = indices.reshape(-1) # flatten again - - # get quantized latent vectors - z_q: torch.Tensor = self.embedding(indices) - - if shape is not None: - z_q = z_q.view(shape) - # reshape back to match original input shape - z_q = z_q.permute(0, 3, 1, 2).contiguous() - - return z_q - - -class DiagonalGaussianDistribution(object): - def __init__(self, parameters: torch.Tensor, deterministic: bool = False): - self.parameters = parameters - self.mean, self.logvar = torch.chunk(parameters, 2, dim=1) - self.logvar = torch.clamp(self.logvar, -30.0, 20.0) - self.deterministic = deterministic - self.std = torch.exp(0.5 * self.logvar) - self.var = torch.exp(self.logvar) - if self.deterministic: - self.var = self.std = torch.zeros_like( - self.mean, device=self.parameters.device, dtype=self.parameters.dtype - ) - - def sample(self, generator: Optional[torch.Generator] = None) -> torch.Tensor: - # make sure sample is on the same device as the parameters and 
has same dtype - sample = randn_tensor( - self.mean.shape, - generator=generator, - device=self.parameters.device, - dtype=self.parameters.dtype, - ) - x = self.mean + self.std * sample - return x - - def kl(self, other: "DiagonalGaussianDistribution" = None) -> torch.Tensor: - if self.deterministic: - return torch.Tensor([0.0]) - else: - if other is None: - return 0.5 * torch.sum( - torch.pow(self.mean, 2) + self.var - 1.0 - self.logvar, - dim=[1, 2, 3], - ) - else: - return 0.5 * torch.sum( - torch.pow(self.mean - other.mean, 2) / other.var - + self.var / other.var - - 1.0 - - self.logvar - + other.logvar, - dim=[1, 2, 3], - ) - - def nll(self, sample: torch.Tensor, dims: Tuple[int, ...] = None) -> torch.Tensor: - if dims is None: - dims = [1, 2, 3] - if self.deterministic: - return torch.Tensor([0.0]) - logtwopi = np.log(2.0 * np.pi) - return 0.5 * torch.sum( - logtwopi + self.logvar + torch.pow(sample - self.mean, 2) / self.var, - dim=dims, - ) - - def mode(self) -> torch.Tensor: - return self.mean - - -class EncoderTiny(nn.Module): - r""" - The `EncoderTiny` layer is a simpler version of the `Encoder` layer. - - Args: - in_channels (`int`): - The number of input channels. - out_channels (`int`): - The number of output channels. - num_blocks (`Tuple[int, ...]`): - Each value of the tuple represents a Conv2d layer followed by `value` number of `AutoencoderTinyBlock`'s to - use. - block_out_channels (`Tuple[int, ...]`): - The number of output channels for each block. - act_fn (`str`): - The activation function to use. See `~diffusers.models.activations.get_activation` for available options. - """ - - def __init__( - self, - in_channels: int, - out_channels: int, - num_blocks: Tuple[int, ...], - block_out_channels: Tuple[int, ...], - act_fn: str, - ): - super().__init__() - - layers = [] - for i, num_block in enumerate(num_blocks): - num_channels = block_out_channels[i] - - if i == 0: - layers.append(nn.Conv2d(in_channels, num_channels, kernel_size=3, padding=1)) - else: - layers.append( - nn.Conv2d( - num_channels, - num_channels, - kernel_size=3, - padding=1, - stride=2, - bias=False, - ) - ) - - for _ in range(num_block): - layers.append(AutoencoderTinyBlock(num_channels, num_channels, act_fn)) - - layers.append(nn.Conv2d(block_out_channels[-1], out_channels, kernel_size=3, padding=1)) - - self.layers = nn.Sequential(*layers) - self.gradient_checkpointing = False - - def forward(self, x: torch.Tensor) -> torch.Tensor: - r"""The forward method of the `EncoderTiny` class.""" - if torch.is_grad_enabled() and self.gradient_checkpointing: - - def create_custom_forward(module): - def custom_forward(*inputs): - return module(*inputs) - - return custom_forward - - if is_torch_version(">=", "1.11.0"): - x = torch.utils.checkpoint.checkpoint(create_custom_forward(self.layers), x, use_reentrant=False) - else: - x = torch.utils.checkpoint.checkpoint(create_custom_forward(self.layers), x) - - else: - # scale image from [-1, 1] to [0, 1] to match TAESD convention - x = self.layers(x.add(1).div(2)) - - return x - - -class DecoderTiny(nn.Module): - r""" - The `DecoderTiny` layer is a simpler version of the `Decoder` layer. - - Args: - in_channels (`int`): - The number of input channels. - out_channels (`int`): - The number of output channels. - num_blocks (`Tuple[int, ...]`): - Each value of the tuple represents a Conv2d layer followed by `value` number of `AutoencoderTinyBlock`'s to - use. - block_out_channels (`Tuple[int, ...]`): - The number of output channels for each block. 
- upsampling_scaling_factor (`int`): - The scaling factor to use for upsampling. - act_fn (`str`): - The activation function to use. See `~diffusers.models.activations.get_activation` for available options. - """ - - def __init__( - self, - in_channels: int, - out_channels: int, - num_blocks: Tuple[int, ...], - block_out_channels: Tuple[int, ...], - upsampling_scaling_factor: int, - act_fn: str, - upsample_fn: str, - ): - super().__init__() - - layers = [ - nn.Conv2d(in_channels, block_out_channels[0], kernel_size=3, padding=1), - get_activation(act_fn), - ] - - for i, num_block in enumerate(num_blocks): - is_final_block = i == (len(num_blocks) - 1) - num_channels = block_out_channels[i] - - for _ in range(num_block): - layers.append(AutoencoderTinyBlock(num_channels, num_channels, act_fn)) - - if not is_final_block: - layers.append(nn.Upsample(scale_factor=upsampling_scaling_factor, mode=upsample_fn)) - - conv_out_channel = num_channels if not is_final_block else out_channels - layers.append( - nn.Conv2d( - num_channels, - conv_out_channel, - kernel_size=3, - padding=1, - bias=is_final_block, - ) - ) - - self.layers = nn.Sequential(*layers) - self.gradient_checkpointing = False - - def forward(self, x: torch.Tensor) -> torch.Tensor: - r"""The forward method of the `DecoderTiny` class.""" - # Clamp. - x = torch.tanh(x / 3) * 3 - - if torch.is_grad_enabled() and self.gradient_checkpointing: - - def create_custom_forward(module): - def custom_forward(*inputs): - return module(*inputs) - - return custom_forward - - if is_torch_version(">=", "1.11.0"): - x = torch.utils.checkpoint.checkpoint(create_custom_forward(self.layers), x, use_reentrant=False) - else: - x = torch.utils.checkpoint.checkpoint(create_custom_forward(self.layers), x) - - else: - x = self.layers(x) - - # scale image from [0, 1] to [-1, 1] to match diffusers convention - return x.mul(2).sub(1) \ No newline at end of file -- Gitee From f840981328864580cc1e9fa79643529b7a2f71c1 Mon Sep 17 00:00:00 2001 From: jiangmengyu Date: Tue, 31 Dec 2024 11:26:37 +0800 Subject: [PATCH 62/91] =?UTF-8?q?cogview3=E9=80=82=E9=85=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../built-in/foundation/cogview3/cogview3plus/__init__.py | 2 +- .../built-in/foundation/cogview3/cogview3plus/vae/__init__.py | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/__init__.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/__init__.py index 8cfcd60a5b..14327efb5c 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/__init__.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/__init__.py @@ -15,6 +15,6 @@ # limitations under the License. 
from .pipeline import CogView3PlusPipeline, DiffusionPipeline -from .vae import AutoencoderKL +from .vae import AutoencoderKL, ModelMixin from .schedulers import CogVideoXDDIMScheduler, SchedulerMixin from .models import CogView3PlusTransformer2DModel, ModelMixin \ No newline at end of file diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/vae/__init__.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/vae/__init__.py index 261968dc69..e9a931d8eb 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/vae/__init__.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/vae/__init__.py @@ -1 +1,2 @@ -from diffusers import AutoencoderKL \ No newline at end of file +from diffusers import AutoencoderKL +from ..models.modeling_utils import ModelMixin \ No newline at end of file -- Gitee From 7155a471fff922073a2f6aa15b800225fdfa8c06 Mon Sep 17 00:00:00 2001 From: jiangmengyu Date: Tue, 31 Dec 2024 11:30:38 +0800 Subject: [PATCH 63/91] =?UTF-8?q?cogview3=E9=80=82=E9=85=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../built-in/foundation/cogview3/cogview3plus/__init__.py | 3 ++- .../built-in/foundation/cogview3/cogview3plus/vae/__init__.py | 2 -- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/__init__.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/__init__.py index 14327efb5c..dc22483005 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/__init__.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/__init__.py @@ -14,7 +14,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+from diffusers import AutoencoderKL + from .pipeline import CogView3PlusPipeline, DiffusionPipeline -from .vae import AutoencoderKL, ModelMixin from .schedulers import CogVideoXDDIMScheduler, SchedulerMixin from .models import CogView3PlusTransformer2DModel, ModelMixin \ No newline at end of file diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/vae/__init__.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/vae/__init__.py index e9a931d8eb..e69de29bb2 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/vae/__init__.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/vae/__init__.py @@ -1,2 +0,0 @@ -from diffusers import AutoencoderKL -from ..models.modeling_utils import ModelMixin \ No newline at end of file -- Gitee From b4ecaf571cefe8200e04e3c00e4f3dec6fc4af55 Mon Sep 17 00:00:00 2001 From: jiangmengyu Date: Tue, 31 Dec 2024 11:32:54 +0800 Subject: [PATCH 64/91] =?UTF-8?q?cogview3=E9=80=82=E9=85=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py index 05f6ddf53a..ed439ecb29 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py @@ -23,8 +23,8 @@ from diffusers.image_processor import VaeImageProcessor from diffusers.pipelines.pipeline_utils import DiffusionPipeline from diffusers.utils import logging from diffusers.utils.torch_utils import randn_tensor +from diffusers import AutoencoderKL -from ..vae import AutoencoderKL from ..models import CogView3PlusTransformer2DModel from ..schedulers import CogVideoXDDIMScheduler from .pipeline_output import CogView3PipelineOutput -- Gitee From 4ded09c54a56aaef51bd6badf8b13cad454856f7 Mon Sep 17 00:00:00 2001 From: jiangmengyu Date: Tue, 31 Dec 2024 11:34:18 +0800 Subject: [PATCH 65/91] =?UTF-8?q?cogview3=E9=80=82=E9=85=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../built-in/foundation/cogview3/cogview3plus/__init__.py | 1 - 1 file changed, 1 deletion(-) diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/__init__.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/__init__.py index dc22483005..de500743d8 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/__init__.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/__init__.py @@ -14,7 +14,6 @@ # See the License for the specific language governing permissions and # limitations under the License. 
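# Illustrative sketch (added for clarity, not part of any patch above): with the
# custom VAE modules removed, the pipeline relies on the stock diffusers
# AutoencoderKL. A minimal standalone round trip showing the slicing/tiling
# switches and encode/decode that the deleted vae code used to implement; the
# checkpoint path below is a placeholder assumption.
import torch
from diffusers import AutoencoderKL

vae = AutoencoderKL.from_pretrained("path/to/CogView3-Plus-3B/vae")  # assumed local path
vae.enable_slicing()  # process the batch one sample at a time to save memory
vae.enable_tiling()   # split large images into overlapping, blended tiles

images = torch.randn(1, 3, 1024, 1024)           # toy input in [-1, 1]
posterior = vae.encode(images).latent_dist       # DiagonalGaussianDistribution
latents = posterior.sample()
reconstruction = vae.decode(latents).sample      # tile seams are blended away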
-from diffusers import AutoencoderKL from .pipeline import CogView3PlusPipeline, DiffusionPipeline from .schedulers import CogVideoXDDIMScheduler, SchedulerMixin -- Gitee From 047be8e7198f861ef44f7dcc0cb220add719fc5d Mon Sep 17 00:00:00 2001 From: jiangmengyu Date: Tue, 31 Dec 2024 11:58:14 +0800 Subject: [PATCH 66/91] =?UTF-8?q?cogview3=E9=80=82=E9=85=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../built-in/foundation/cogview3/README.md | 2 +- .../models/transformer_cogview3plus.py | 5 +-- .../pipeline/pipeline_cogview3plus.py | 3 +- .../schedulers/scheduling_ddim_cogvideox.py | 1 + .../schedulers/scheduling_utils.py | 37 ------------------- 5 files changed, 5 insertions(+), 43 deletions(-) diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/README.md b/MindIE/MindIE-Torch/built-in/foundation/cogview3/README.md index 16592703d8..e161ab5c10 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/README.md +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/README.md @@ -96,7 +96,7 @@ https://huggingface.co/THUDM/CogView3-Plus-3B/tree/main "CogView3PlusTransformer2DModel" ], "vae": [ - "cogview3plus", + "diffusers", "AutoencoderKL" ] } diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/transformer_cogview3plus.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/transformer_cogview3plus.py index 6f4fa104b8..bd2482b587 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/transformer_cogview3plus.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/transformer_cogview3plus.py @@ -255,7 +255,6 @@ class CogView3PlusTransformer2DModel(ModelMixin, ConfigMixin): original_size: torch.Tensor, target_size: torch.Tensor, crop_coords: torch.Tensor, - t_idx: int, ) -> Union[torch.Tensor, Transformer2DModelOutput]: hidden_states = states[0] encoder_hidden_states = states[1] @@ -270,7 +269,7 @@ class CogView3PlusTransformer2DModel(ModelMixin, ConfigMixin): encoder_hidden_states = hidden_states[:, :text_seq_length] hidden_states = hidden_states[:, text_seq_length:] - hidden_states, encoder_hidden_states = self._forward_blocks(hidden_states, encoder_hidden_states, emb, t_idx) + hidden_states, encoder_hidden_states = self._forward_blocks(hidden_states, encoder_hidden_states, emb, states[2]) hidden_states = self.norm_out(hidden_states, emb) hidden_states = self.proj_out(hidden_states) # (batch_size, height*width, patch_size*patch_size*out_channels) @@ -387,7 +386,7 @@ class CogView3PlusTransformer2DModel(ModelMixin, ConfigMixin): self.v_bias_cache = weights.pop(f"transformer_blocks.{i}.attn1.to_v.bias", None) qk_weight_cache = self.q_weight_cache is not None and self.k_weight_cache is not None - if qk_weight_cache and self.v_weight_cache is not None: + if qk_weight_cache and self.v_weight_cache is not None: qkv_weight = torch.cat( [self.q_weight_cache, self.k_weight_cache, self.v_weight_cache], dim=0 diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py index ed439ecb29..82276b2cd9 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py @@ -311,12 +311,11 @@ class CogView3PlusPipeline(DiffusionPipeline): # predict noise 
model_output noise_pred = self.transformer( - states=(latent_model_input, prompt_embeds), + states=(latent_model_input, prompt_embeds, i), timestep=timestep, original_size=original_size, target_size=target_size, crop_coords=crops_coords_top_left, - t_idx=i, )[0] noise_pred = noise_pred.float() diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/schedulers/scheduling_ddim_cogvideox.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/schedulers/scheduling_ddim_cogvideox.py index f94de1c81d..9b9a4f051e 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/schedulers/scheduling_ddim_cogvideox.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/schedulers/scheduling_ddim_cogvideox.py @@ -42,6 +42,7 @@ class DDIMSchedulerOutput(BaseOutput): prev_sample: torch.Tensor pred_original_sample: Optional[torch.Tensor] = None + def betas_for_alpha_bar( num_diffusion_timesteps, max_beta=0.999, diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/schedulers/scheduling_utils.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/schedulers/scheduling_utils.py index ae88225358..cd59e45bdf 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/schedulers/scheduling_utils.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/schedulers/scheduling_utils.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -import importlib import os from dataclasses import dataclass from enum import Enum @@ -92,39 +91,3 @@ class SchedulerMixin(PushToHubMixin): **kwargs, ) return cls.from_config(config, return_unused_kwargs=return_unused_kwargs, **kwargs) - - def save_pretrained(self, save_directory: Union[str, os.PathLike], push_to_hub: bool = False, **kwargs): - """ - Save a scheduler configuration object to a directory so that it can be reloaded using the - [`~SchedulerMixin.from_pretrained`] class method. - - Args: - save_directory (`str` or `os.PathLike`): - Directory where the configuration JSON file will be saved (will be created if it does not exist). - push_to_hub (`bool`, *optional*, defaults to `False`): - Whether or not to push your model to the Hugging Face Hub after saving it. You can specify the - repository you want to push to with `repo_id` (will default to the name of `save_directory` in your - namespace). - kwargs (`Dict[str, Any]`, *optional*): - Additional keyword arguments passed along to the [`~utils.PushToHubMixin.push_to_hub`] method. 
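# Clarifying note on the transformer and pipeline hunks in this patch (added
# commentary, not patch content): the per-step index consumed by
# `_forward_blocks` now travels inside the `states` tuple instead of a separate
# `t_idx` keyword, so the layout is
#
#   states = (latent_model_input, prompt_embeds, i)
#            # states[0]: hidden_states
#            # states[1]: encoder_hidden_states
#            # states[2]: denoising-step index passed to _forward_blocks
#
# and CogView3PlusTransformer2DModel.forward drops its `t_idx` parameter.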
- """ - self.save_config(save_directory=save_directory, push_to_hub=push_to_hub, **kwargs) - - @property - def compatibles(self): - """ - Returns all schedulers that are compatible with this scheduler - - Returns: - `List[SchedulerMixin]`: List of compatible schedulers - """ - return self._get_compatibles() - - @classmethod - def _get_compatibles(cls): - compatible_classes_str = list(set([cls.__name__] + cls._compatibles)) - diffusers_library = importlib.import_module(__name__.split(".")[0]) - compatible_classes = [ - getattr(diffusers_library, c) for c in compatible_classes_str if hasattr(diffusers_library, c) - ] - return compatible_classes \ No newline at end of file -- Gitee From ce0e6f9a428ef490bb39ff10e3224f76fefedfc2 Mon Sep 17 00:00:00 2001 From: jiangmengyu Date: Tue, 31 Dec 2024 12:48:35 +0800 Subject: [PATCH 67/91] =?UTF-8?q?cogview3=E9=80=82=E9=85=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../schedulers/scheduling_utils.py | 20 +++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/schedulers/scheduling_utils.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/schedulers/scheduling_utils.py index cd59e45bdf..d854366c77 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/schedulers/scheduling_utils.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/schedulers/scheduling_utils.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +import importlib import os from dataclasses import dataclass from enum import Enum @@ -91,3 +92,22 @@ class SchedulerMixin(PushToHubMixin): **kwargs, ) return cls.from_config(config, return_unused_kwargs=return_unused_kwargs, **kwargs) + + @property + def compatibles(self): + """ + Returns all schedulers that are compatible with this scheduler + + Returns: + `List[SchedulerMixin]`: List of compatible schedulers + """ + return self._get_compatibles() + + @classmethod + def _get_compatibles(cls): + compatible_classes_str = list(set([cls.__name__] + cls._compatibles)) + diffusers_library = importlib.import_module(__name__.split(".")[0]) + compatible_classes = [ + getattr(diffusers_library, c) for c in compatible_classes_str if hasattr(diffusers_library, c) + ] + return compatible_classes \ No newline at end of file -- Gitee From 1caf467868b2d537166630ab3ee6e9d77e8805b0 Mon Sep 17 00:00:00 2001 From: jiangmengyu Date: Tue, 31 Dec 2024 14:13:25 +0800 Subject: [PATCH 68/91] =?UTF-8?q?cogview3=E9=80=82=E9=85=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../cogview3/cogview3plus/__init__.py | 17 ----------------- .../cogview3/cogview3plus/layers/embeddings.py | 4 +--- .../cogview3/cogview3plus/layers/linear.py | 2 +- 3 files changed, 2 insertions(+), 21 deletions(-) diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/__init__.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/__init__.py index de500743d8..1139593a36 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/__init__.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/__init__.py @@ -1,20 +1,3 @@ -#!/usr/bin/env python -# coding=utf-8 -# Copyright 2024 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# https://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - from .pipeline import CogView3PlusPipeline, DiffusionPipeline from .schedulers import CogVideoXDDIMScheduler, SchedulerMixin from .models import CogView3PlusTransformer2DModel, ModelMixin \ No newline at end of file diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/layers/embeddings.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/layers/embeddings.py index 72418f08b3..129384dffc 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/layers/embeddings.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/layers/embeddings.py @@ -17,7 +17,7 @@ from typing import Optional import torch from torch import nn -from diffusers.models.activations import FP32SiLU, get_activation +from diffusers.models.activations import get_activation def get_timestep_embedding( @@ -205,8 +205,6 @@ class PixArtAlphaTextProjection(nn.Module): self.act_1 = nn.GELU(approximate="tanh") elif act_fn == "silu": self.act_1 = nn.SiLU() - elif act_fn == "silu_fp32": - self.act_1 = FP32SiLU() else: raise ValueError(f"Unknown activation function: {act_fn}") self.linear_2 = nn.Linear(in_features=hidden_size, out_features=out_features, bias=True) diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/layers/linear.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/layers/linear.py index bd9b9ba796..57fe8d55dc 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/layers/linear.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/layers/linear.py @@ -6,7 +6,7 @@ # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at # -# httpa://www.apache.org/licenses/LICENSE-2.0 +# https://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, -- Gitee From 53f9426a5e6ed5f569d74cdaf23650819747127d Mon Sep 17 00:00:00 2001 From: jiangmengyu Date: Tue, 31 Dec 2024 14:25:46 +0800 Subject: [PATCH 69/91] =?UTF-8?q?cogview3=E9=80=82=E9=85=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../foundation/cogview3/cogview3plus/models/activations.py | 1 - .../foundation/cogview3/cogview3plus/models/model_load_utils.py | 2 +- .../foundation/cogview3/cogview3plus/models/modeling_utils.py | 1 - 3 files changed, 1 insertion(+), 3 deletions(-) diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/activations.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/activations.py index 4726fd7eb2..5bb3783ae4 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/activations.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/activations.py @@ -20,7 +20,6 @@ from torch import nn from diffusers.utils import deprecate from diffusers.utils.import_utils import is_torch_npu_available - if is_torch_npu_available(): import torch_npu diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/model_load_utils.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/model_load_utils.py index 3cffbd6432..34a4625283 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/model_load_utils.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/model_load_utils.py @@ -12,7 +12,7 @@ # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and -# limitations under the License +# limitations under the License. 
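# Added note (commentary only): the activations hunk above touches the guarded
# NPU import; that guard keeps the module importable on hosts without Ascend
# support, along the lines of
#
#   from diffusers.utils.import_utils import is_torch_npu_available
#   if is_torch_npu_available():
#       import torch_npu  # Ascend kernels only when the backend is present
#
# which mirrors the lines shown in the diff.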
import os import torch diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/modeling_utils.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/modeling_utils.py index a09f50daf8..da548d9771 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/modeling_utils.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/modeling_utils.py @@ -137,7 +137,6 @@ class ModelMixin(torch.nn.Module, PushToHubMixin): deprecate("direct config name access", "1.0.0", deprecation_message, standard_warn=False, stacklevel=3) return self._internal_dict[name] - # call PyTorch's https://pytorch.org/docs/stable/_modules/torch/nn/modules/module.html#Module return super().__getattr__(name) @classmethod -- Gitee From 8a4f05496d2641b284b8f2d28a1b26e486fc65a1 Mon Sep 17 00:00:00 2001 From: jiangmengyu Date: Tue, 31 Dec 2024 14:45:22 +0800 Subject: [PATCH 70/91] =?UTF-8?q?cogview3=E9=80=82=E9=85=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../models/attention_processor.py | 38 ------------------- .../cogview3plus/models/modeling_utils.py | 1 - .../models/transformer_cogview3plus.py | 2 - .../pipeline/pipeline_cogview3plus.py | 1 - .../schedulers/scheduling_ddim_cogvideox.py | 19 ---------- .../cogview3/inference_cogview3plus.py | 2 +- 6 files changed, 1 insertion(+), 62 deletions(-) diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/attention_processor.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/attention_processor.py index d2a7673ea5..c197a989b7 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/attention_processor.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/attention_processor.py @@ -212,8 +212,6 @@ class Attention(nn.Module): processor (`AttnProcessor`): The attention processor to use. """ - # if current processor is in `self._modules` and if passed `processor` is not, we need to - # pop `processor` from `self._modules` if ( hasattr(self, "processor") and isinstance(self.processor, torch.nn.Module) @@ -231,26 +229,6 @@ class Attention(nn.Module): attention_mask: Optional[torch.Tensor] = None, **cross_attention_kwargs, ) -> torch.Tensor: - r""" - The forward method of the `Attention` class. - - Args: - hidden_states (`torch.Tensor`): - The hidden states of the query. - encoder_hidden_states (`torch.Tensor`, *optional*): - The hidden states of the encoder. - attention_mask (`torch.Tensor`, *optional*): - The attention mask to use. If `None`, no mask is applied. - **cross_attention_kwargs: - Additional keyword arguments to pass along to the cross attention. - - Returns: - `torch.Tensor`: The output of the attention layer. - """ - # The `Attention` class can call different attention processors / attention functions - # here we simply pass along all tensors to the selected processor class - # For standard processors that are defined here, `**cross_attention_kwargs` is empty - attn_parameters = set(inspect.signature(self.processor.__call__).parameters.keys()) quiet_attn_parameters = {"ip_adapter_masks", "ip_hidden_states"} unused_kwargs = [ @@ -273,22 +251,6 @@ class Attention(nn.Module): def prepare_attention_mask( self, attention_mask: torch.Tensor, target_length: int, batch_size: int, out_dim: int = 3 ) -> torch.Tensor: - r""" - Prepare the attention mask for the attention computation. 
- - Args: - attention_mask (`torch.Tensor`): - The attention mask to prepare. - target_length (`int`): - The target length of the attention mask. This is the length of the attention mask after padding. - batch_size (`int`): - The batch size, which is used to repeat the attention mask. - out_dim (`int`, *optional*, defaults to `3`): - The output dimension of the attention mask. Can be either `3` or `4`. - - Returns: - `torch.Tensor`: The prepared attention mask. - """ head_size = self.heads if attention_mask is None: return attention_mask diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/modeling_utils.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/modeling_utils.py index da548d9771..aa8e33daaa 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/modeling_utils.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/modeling_utils.py @@ -185,7 +185,6 @@ class ModelMixin(torch.nn.Module, PushToHubMixin): " dispatching. Please make sure to set `low_cpu_mem_usage=True`." ) - # change device_map into a map if we passed an int, a str or a torch.device if isinstance(device_map, torch.device): device_map = {"": device_map} elif isinstance(device_map, str) and device_map not in ["auto", "balanced", "balanced_low_0", "sequential"]: diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/transformer_cogview3plus.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/transformer_cogview3plus.py index bd2482b587..f704e22589 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/transformer_cogview3plus.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/transformer_cogview3plus.py @@ -197,7 +197,6 @@ class CogView3PlusTransformer2DModel(ModelMixin, ConfigMixin): self.delta_encoder_cache = None @property - # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.attn_processors def attn_processors(self) -> Dict[str, AttentionProcessor]: r""" Returns: @@ -221,7 +220,6 @@ class CogView3PlusTransformer2DModel(ModelMixin, ConfigMixin): return processors - # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.set_attn_processor def set_attn_processor(self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]]): count = len(self.attn_processors.keys()) diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py index 82276b2cd9..fe2bd5cfcd 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py @@ -32,7 +32,6 @@ from .pipeline_output import CogView3PipelineOutput logger = logging.get_logger(__name__) # pylint: disable=invalid-name -# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps def retrieve_timesteps( scheduler, num_inference_steps: Optional[int] = None, diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/schedulers/scheduling_ddim_cogvideox.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/schedulers/scheduling_ddim_cogvideox.py index 9b9a4f051e..b3f6ce229b 100644 --- 
a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/schedulers/scheduling_ddim_cogvideox.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/schedulers/scheduling_ddim_cogvideox.py @@ -27,18 +27,6 @@ from .scheduling_utils import KarrasDiffusionSchedulers, SchedulerMixin @dataclass class DDIMSchedulerOutput(BaseOutput): - """ - Output class for the scheduler's `step` function output. - - Args: - prev_sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)` for images): - Computed sample `(x_{t-1})` of previous timestep. `prev_sample` should be used as next model input in the - denoising loop. - pred_original_sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)` for images): - The predicted denoised sample `(x_{0})` based on the model output from the current timestep. - `pred_original_sample` can be used to preview progress or for guidance. - """ - prev_sample: torch.Tensor pred_original_sample: Optional[torch.Tensor] = None @@ -127,10 +115,6 @@ class CogVideoXDDIMScheduler(SchedulerMixin, ConfigMixin): if rescale_betas_zero_snr: self.alphas_cumprod = rescale_zero_terminal_snr(self.alphas_cumprod) - # At every step in ddim, we are looking into the previous alphas_cumprod - # For the final step, there is no previous alphas_cumprod because we are already at 0 - # `set_alpha_to_one` decides whether we set this parameter simply to one or - # whether we use the final alpha of the "non-previous" one. self.final_alpha_cumprod = torch.tensor(1.0) if set_alpha_to_one else self.alphas_cumprod[0] # standard deviation of the initial noise distribution @@ -171,7 +155,6 @@ class CogVideoXDDIMScheduler(SchedulerMixin, ConfigMixin): self.num_inference_steps = num_inference_steps - # "linspace", "leading", "trailing" corresponds to annotation of Table 2. of https://arxiv.org/abs/2305.08891 if self.config.timestep_spacing == "linspace": timesteps = ( np.linspace(0, self.config.num_train_timesteps - 1, num_inference_steps) @@ -244,7 +227,6 @@ class CogVideoXDDIMScheduler(SchedulerMixin, ConfigMixin): return DDIMSchedulerOutput(prev_sample=prev_sample, pred_original_sample=pred_original_sample) - # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler.add_noise def add_noise( self, original_samples: torch.Tensor, @@ -271,7 +253,6 @@ class CogVideoXDDIMScheduler(SchedulerMixin, ConfigMixin): noisy_samples = sqrt_alpha_prod * original_samples + sqrt_one_minus_alpha_prod * noise return noisy_samples - # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler.get_velocity def get_velocity(self, sample: torch.Tensor, noise: torch.Tensor, timesteps: torch.IntTensor) -> torch.Tensor: # Make sure alphas_cumprod and timestep have same device and dtype as sample self.alphas_cumprod = self.alphas_cumprod.to(device=sample.device) diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/inference_cogview3plus.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/inference_cogview3plus.py index 9030a82b5e..c3bb1f2ebb 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/inference_cogview3plus.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/inference_cogview3plus.py @@ -6,7 +6,7 @@ # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at # -# http://www.apache.org/licenses/LICENSE-2.0 +# https://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, -- Gitee From 11ef4a0a8b29e21dfaeeda4c69f703df0f55c285 Mon Sep 17 00:00:00 2001 From: jiangmengyu Date: Thu, 2 Jan 2025 10:45:03 +0800 Subject: [PATCH 71/91] =?UTF-8?q?cogview=E9=80=82=E9=85=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../cogview3/inference_cogview3plus.py | 212 ++++++++++++++++-- .../cogview3/prompts/example_prompts.txt | 5 + 2 files changed, 203 insertions(+), 14 deletions(-) create mode 100644 MindIE/MindIE-Torch/built-in/foundation/cogview3/prompts/example_prompts.txt diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/inference_cogview3plus.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/inference_cogview3plus.py index c3bb1f2ebb..341a008e7c 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/inference_cogview3plus.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/inference_cogview3plus.py @@ -17,6 +17,9 @@ import argparse import logging import time +import os +import csv +import json import torch @@ -26,25 +29,138 @@ logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) +class PromptLoader: + def __init__( + self, + prompt_file: str, + prompt_file_type: str, + batch_size: int, + num_images_per_prompt: int = 1, + max_num_prompts: int = 0 + ): + self.prompts = [] + self.catagories = ['Not_specified'] + self.batch_size = batch_size + self.num_images_per_prompt = num_images_per_prompt + + if prompt_file_type == 'plain': + self.load_prompts_plain(prompt_file, max_num_prompts) + elif prompt_file_type == 'parti': + self.load_prompts_parti(prompt_file, max_num_prompts) + elif prompt_file_type == 'hpsv2': + self.load_prompts_hpsv2(max_num_prompts) + else: + print("This operation is not supported!") + + self.current_id = 0 + self.inner_id = 0 + + def __len__(self): + return len(self.prompts) * self.num_images_per_prompt + + def __iter__(self): + return self + + def __next__(self): + if self.current_id == len(self.prompts): + raise StopIteration + + ret = { + 'prompts': [], + 'catagories': [], + 'save_names': [], + 'n_prompts': self.batch_size, + } + for _ in range(self.batch_size): + if self.current_id == len(self.prompts): + ret['prompts'].append('') + ret['save_names'].append('') + ret['catagories'].append('') + ret['n_prompts'] -= 1 + + else: + prompt, catagory_id = self.prompts[self.current_id] + ret['prompts'].append(prompt) + ret['catagories'].append(self.catagories[catagory_id]) + ret['save_names'].append(f'{self.current_id}_{self.inner_id}') + + self.inner_id += 1 + if self.inner_id == self.num_images_per_prompt: + self.inner_id = 0 + self.current_id += 1 + + return ret + + def load_prompts_plain(self, file_path: str, max_num_prompts: int): + with os.fdopen(os.open(file_path, os.O_RDONLY), "r") as f: + for i, line in enumerate(f): + if max_num_prompts and i == max_num_prompts: + break + + prompt = line.strip() + self.prompts.append((prompt, 0)) + + def load_prompts_parti(self, file_path: str, max_num_prompts: int): + with os.fdopen(os.open(file_path, os.O_RDONLY), "r") as f: + # Skip the first line + next(f) + tsv_file = csv.reader(f, delimiter="\t") + for i, line in enumerate(tsv_file): + if max_num_prompts and i == max_num_prompts: + break + + prompt = line[0] + catagory = line[1] + if 
catagory not in self.catagories: + self.catagories.append(catagory) + + catagory_id = self.catagories.index(catagory) + self.prompts.append((prompt, catagory_id)) + + def load_prompts_hpsv2(self, max_num_prompts: int): + with open('hpsv2_benchmark_prompts.json', 'r') as file: + all_prompts = json.load(file) + count = 0 + for style, prompts in all_prompts.items(): + for prompt in prompts: + count += 1 + if max_num_prompts and count >= max_num_prompts: + break + + if style not in self.catagories: + self.catagories.append(style) + + catagory_id = self.catagories.index(style) + self.prompts.append((prompt, catagory_id)) + + def parse_arguments(): parser = argparse.ArgumentParser(description="Generate an image using the CogView3-Plus-3B model.") # Define arguments for prompt, model path, etc. parser.add_argument( - "--prompt", - type=list, - default=[ - "A vibrant cherry red sports car sits proudly under the gleaming sun, \ - its polished exterior smooth and flawless, casting a mirror-like reflection. \ - The car features a low, aerodynamic body, angular headlights that gaze forward like predatory eyes, \ - and a set of black, high-gloss racing rims that contrast starkly with the red. \ - A subtle hint of chrome embellishes the grille and exhaust, \ - while the tinted windows suggest a luxurious and private interior. \ - he scene conveys a sense of speed and elegance, \ - the car appearing as if it's about to burst into a sprint along a coastal road, \ - with the ocean's azure waves crashing in the background." - ], - help="The text description for generating the image." + "--prompt_file", + type=str, + default="./prompts/example_prompts.txt", + help="A text file of prompts for generating images.", + ) + parser.add_argument( + "--prompt_file_type", + choices=["plain", "parti", "hpsv2"], + default="plain", + help="Type of prompt file.", + ) + parser.add_argument( + "--save_dir", + type=str, + default="./results", + help="Path to save result images.", + ) + parser.add_argument( + "--info_file_save_path", + type=str, + default="./image_info.json", + help="Path to save image information file.", ) parser.add_argument( "--model_path", type=str, default="/data/CogView3B", help="Path to the pre-trained model." @@ -55,6 +171,18 @@ def parse_arguments(): parser.add_argument( "--num_images_per_prompt", type=int, default=1, help="Number of images to generate per prompt." ) + parser.add_argument( + "--max_num_prompts", + default=0, + type=int, + help="Limit the number of prompts (0: no limit).", + ) + parser.add_argument( + "--batch_size", + type=int, + default=1, + help="Batch size." 
+ ) parser.add_argument("--num_inference_steps", type=int, default=50, help="Number of denoising steps for inference.") parser.add_argument("--width", type=int, default=1024, help="Width of the generated image.") parser.add_argument("--height", type=int, default=1024, help="Height of the generated image.") @@ -72,6 +200,61 @@ def infer(args): # Load the pre-trained model with the specified precision pipe = CogView3PlusPipeline.from_pretrained(args.model_path, torch_dtype=dtype).to("npu") + use_time = 0 + prompt_loader = PromptLoader(args.prompt_file, + args.prompt_file_type, + args.batch_size, + args.num_images_per_prompt, + args.max_num_prompts) + + infer_num = 0 + image_info = [] + current_prompt = None + for i, input_info in enumerate(prompt_loader): + prompts = input_info['prompts'] + catagories = input_info['catagories'] + save_names = input_info['save_names'] + n_prompts = input_info['n_prompts'] + + print(f"[{infer_num + n_prompts}/{len(prompt_loader)}]: {prompts}") + infer_num += args.batch_size + + start_time = time.time() + images = pipe( + prompt=prompts, + guidance_scale=args.guidance_scale, + num_images_per_prompt=args.num_images_per_prompt, + num_inference_steps=args.num_inference_steps, + image_size=(args.height, args.width), + ).images + + if i > 2: # do not count the time spent inferring the first 0 to 2 images + use_time += time.time() - start_time + + for j in range(n_prompts): + image_save_path = os.path.join(args.save_dir, f"{save_names[j]}.png") + image = images[0][j] + image.save(image_save_path) + + if current_prompt != prompts[j]: + current_prompt = prompts[j] + image_info.append({'images': [], 'prompt': current_prompt, 'category': catagories[j]}) + + image_info[-1]['images'].append(image_save_path) + + infer_num = infer_num - 3 # do not count the time spent inferring the first 5 images + print(f"[info] infer number: {infer_num}; use time: {use_time:.3f}s\n" + f"average time: {use_time / infer_num:.3f}s\n") + + # Save image information to a json file + if os.path.exists(args.info_file_save_path): + os.remove(args.info_file_save_path) + + with os.fdopen(os.open(args.info_file_save_path, os.O_RDWR | os.O_CREAT, 0o640), "w") as f: + json.dump(image_info, f) + + + """ use_time = 0 loops = 5 for i in range(loops): @@ -97,6 +280,7 @@ def infer(args): image.save(args.output_path) print(f"Image saved to {args.output_path}") + """ if __name__ == "__main__": diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/prompts/example_prompts.txt b/MindIE/MindIE-Torch/built-in/foundation/cogview3/prompts/example_prompts.txt new file mode 100644 index 0000000000..7291dde080 --- /dev/null +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/prompts/example_prompts.txt @@ -0,0 +1,5 @@ +A vibrant cherry red sports car sits proudly under the gleaming sun, its polished exterior smooth and flawless, casting a mirror-like reflection. The car features a low, aerodynamic body, angular headlights that gaze forward like predatory eyes, and a set of black, high-gloss racing rims that contrast starkly with the red. A subtle hint of chrome embellishes the grille and exhaust, while the tinted windows suggest a luxurious and private interior. The scene conveys a sense of speed and elegance, the car appearing as if it's about to burst into a sprint along a coastal road, with the ocean's azure waves crashing in the background. +A vibrant cherry red sports car sits proudly under the gleaming sun, its polished exterior smooth and flawless, casting a mirror-like reflection. 
The car features a low, aerodynamic body, angular headlights that gaze forward like predatory eyes, and a set of black, high-gloss racing rims that contrast starkly with the red. A subtle hint of chrome embellishes the grille and exhaust, while the tinted windows suggest a luxurious and private interior. The scene conveys a sense of speed and elegance, the car appearing as if it's about to burst into a sprint along a coastal road, with the ocean's azure waves crashing in the background. +A vibrant cherry red sports car sits proudly under the gleaming sun, its polished exterior smooth and flawless, casting a mirror-like reflection. The car features a low, aerodynamic body, angular headlights that gaze forward like predatory eyes, and a set of black, high-gloss racing rims that contrast starkly with the red. A subtle hint of chrome embellishes the grille and exhaust, while the tinted windows suggest a luxurious and private interior. The scene conveys a sense of speed and elegance, the car appearing as if it's about to burst into a sprint along a coastal road, with the ocean's azure waves crashing in the background. +A vibrant cherry red sports car sits proudly under the gleaming sun, its polished exterior smooth and flawless, casting a mirror-like reflection. The car features a low, aerodynamic body, angular headlights that gaze forward like predatory eyes, and a set of black, high-gloss racing rims that contrast starkly with the red. A subtle hint of chrome embellishes the grille and exhaust, while the tinted windows suggest a luxurious and private interior. The scene conveys a sense of speed and elegance, the car appearing as if it's about to burst into a sprint along a coastal road, with the ocean's azure waves crashing in the background. +A vibrant cherry red sports car sits proudly under the gleaming sun, its polished exterior smooth and flawless, casting a mirror-like reflection. The car features a low, aerodynamic body, angular headlights that gaze forward like predatory eyes, and a set of black, high-gloss racing rims that contrast starkly with the red. A subtle hint of chrome embellishes the grille and exhaust, while the tinted windows suggest a luxurious and private interior. The scene conveys a sense of speed and elegance, the car appearing as if it's about to burst into a sprint along a coastal road, with the ocean's azure waves crashing in the background. 
\ No newline at end of file -- Gitee From f6d7ff137af7fee1121270cbbae494ab48d51b74 Mon Sep 17 00:00:00 2001 From: jiangmengyu Date: Thu, 2 Jan 2025 10:49:03 +0800 Subject: [PATCH 72/91] =?UTF-8?q?cogview=E9=80=82=E9=85=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../built-in/foundation/cogview3/inference_cogview3plus.py | 1 + 1 file changed, 1 insertion(+) diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/inference_cogview3plus.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/inference_cogview3plus.py index 341a008e7c..149be10eb1 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/inference_cogview3plus.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/inference_cogview3plus.py @@ -227,6 +227,7 @@ def infer(args): num_inference_steps=args.num_inference_steps, image_size=(args.height, args.width), ).images + print(images.shape) if i > 2: # do not count the time spent inferring the first 0 to 2 images use_time += time.time() - start_time -- Gitee From 485008cb516f7077a336e5c4859c387590e59a0b Mon Sep 17 00:00:00 2001 From: jiangmengyu Date: Thu, 2 Jan 2025 10:58:08 +0800 Subject: [PATCH 73/91] =?UTF-8?q?cogview=E9=80=82=E9=85=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py | 5 ++--- .../built-in/foundation/cogview3/inference_cogview3plus.py | 3 +-- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py index fe2bd5cfcd..2f14fdd7c3 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py @@ -27,7 +27,6 @@ from diffusers import AutoencoderKL from ..models import CogView3PlusTransformer2DModel from ..schedulers import CogVideoXDDIMScheduler -from .pipeline_output import CogView3PipelineOutput logger = logging.get_logger(__name__) # pylint: disable=invalid-name @@ -224,7 +223,7 @@ class CogView3PlusPipeline(DiffusionPipeline): num_inference_steps: int = 50, guidance_scale: float = 5.0, num_images_per_prompt: int = 1, - ) -> Union[CogView3PipelineOutput, Tuple]: + ) -> Tuple: if image_size is None: height = self.transformer.config.sample_size * self.vae_scale_factor width = self.transformer.config.sample_size * self.vae_scale_factor @@ -336,4 +335,4 @@ class CogView3PlusPipeline(DiffusionPipeline): # Offload all models self.maybe_free_model_hooks() - return CogView3PipelineOutput(images=image) \ No newline at end of file + return (image,) \ No newline at end of file diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/inference_cogview3plus.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/inference_cogview3plus.py index 149be10eb1..fa6cfd418b 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/inference_cogview3plus.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/inference_cogview3plus.py @@ -226,8 +226,7 @@ def infer(args): num_images_per_prompt=args.num_images_per_prompt, num_inference_steps=args.num_inference_steps, image_size=(args.height, args.width), - ).images - print(images.shape) + ) if i > 2: # do not count the time spent inferring the first 0 to 2 images use_time += time.time() - start_time -- Gitee From 
0858f272718055c29f79600ca00348a85862c919 Mon Sep 17 00:00:00 2001 From: jiangmengyu Date: Thu, 2 Jan 2025 11:16:46 +0800 Subject: [PATCH 74/91] =?UTF-8?q?cogview=E9=80=82=E9=85=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../cogview3/inference_cogview3plus.py | 33 ++----------------- 1 file changed, 2 insertions(+), 31 deletions(-) diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/inference_cogview3plus.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/inference_cogview3plus.py index fa6cfd418b..ae66dac6ca 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/inference_cogview3plus.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/inference_cogview3plus.py @@ -228,7 +228,7 @@ def infer(args): image_size=(args.height, args.width), ) - if i > 2: # do not count the time spent inferring the first 0 to 2 images + if i > 1: # do not count the time spent inferring the first 0 to 2 images use_time += time.time() - start_time for j in range(n_prompts): @@ -242,7 +242,7 @@ def infer(args): image_info[-1]['images'].append(image_save_path) - infer_num = infer_num - 3 # do not count the time spent inferring the first 5 images + infer_num = infer_num - 2 # do not count the time spent inferring the first 5 images print(f"[info] infer number: {infer_num}; use time: {use_time:.3f}s\n" f"average time: {use_time / infer_num:.3f}s\n") @@ -253,35 +253,6 @@ def infer(args): with os.fdopen(os.open(args.info_file_save_path, os.O_RDWR | os.O_CREAT, 0o640), "w") as f: json.dump(image_info, f) - - """ - use_time = 0 - loops = 5 - for i in range(loops): - start_time = time.time() - # Generate the image based on the prompt - image = pipe( - prompt=args.prompt[0], - guidance_scale=args.guidance_scale, - num_images_per_prompt=args.num_images_per_prompt, - num_inference_steps=args.num_inference_steps, - image_size=(args.height, args.width), - ).images[0] - - if i >= 2: - use_time += time.time() - start_time - logger.info("current_time is %.3f )", time.time() - start_time) - - torch.npu.empty_cache() - - logger.info("use_time is %.3f)", use_time / 3) - - # Save the generated image to the local file system - image.save(args.output_path) - - print(f"Image saved to {args.output_path}") - """ - if __name__ == "__main__": inference_args = parse_arguments() -- Gitee From 91ed1e670a06b0a029ca38eb8b9d8f2ac739b3e7 Mon Sep 17 00:00:00 2001 From: jiangmengyu Date: Thu, 2 Jan 2025 11:28:23 +0800 Subject: [PATCH 75/91] =?UTF-8?q?cogview=E9=80=82=E9=85=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../pipeline/pipeline_cogview3plus.py | 4 +++ .../cogview3/cogview3plus/utils/__init__.py | 1 + .../cogview3/cogview3plus/utils/utils.py | 35 +++++++++++++++++++ 3 files changed, 40 insertions(+) create mode 100644 MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/utils/__init__.py create mode 100644 MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/utils/utils.py diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py index 2f14fdd7c3..d242bbcd55 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py @@ -27,6 +27,7 @@ from diffusers import AutoencoderKL from ..models import 
CogView3PlusTransformer2DModel from ..schedulers import CogVideoXDDIMScheduler +from ..utils import set_random_seed logger = logging.get_logger(__name__) # pylint: disable=invalid-name @@ -224,6 +225,9 @@ class CogView3PlusPipeline(DiffusionPipeline): guidance_scale: float = 5.0, num_images_per_prompt: int = 1, ) -> Tuple: + + set_random_seed(seed=42) + if image_size is None: height = self.transformer.config.sample_size * self.vae_scale_factor width = self.transformer.config.sample_size * self.vae_scale_factor diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/utils/__init__.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/utils/__init__.py new file mode 100644 index 0000000000..f35da6dcea --- /dev/null +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/utils/__init__.py @@ -0,0 +1 @@ +from .utils import set_random_seed \ No newline at end of file diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/utils/utils.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/utils/utils.py new file mode 100644 index 0000000000..de985c1453 --- /dev/null +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/utils/utils.py @@ -0,0 +1,35 @@ +#!/usr/bin/env python +# coding=utf-8 +# Copyright 2024 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import importlib +import random +import torch +import numpy as np + + +def set_random_seed(seed): + """Set random seed. + + Args: + seed (int, optional): Seed to be used. 
+ + """ + + random.seed(seed) + np.random.seed(seed) + torch.manual_seed(seed) + return seed \ No newline at end of file -- Gitee From b42efe9c5c5c84e4f29fda667ea3141d95f472c0 Mon Sep 17 00:00:00 2001 From: jiangmengyu Date: Thu, 2 Jan 2025 14:24:05 +0800 Subject: [PATCH 76/91] =?UTF-8?q?cogview=E9=80=82=E9=85=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../built-in/foundation/cogview3/cogview3plus/__init__.py | 3 ++- .../cogview3plus/pipeline/pipeline_cogview3plus.py | 4 ---- .../built-in/foundation/cogview3/inference_cogview3plus.py | 7 +++++-- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/__init__.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/__init__.py index 1139593a36..e5bd9d5fa9 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/__init__.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/__init__.py @@ -1,3 +1,4 @@ from .pipeline import CogView3PlusPipeline, DiffusionPipeline from .schedulers import CogVideoXDDIMScheduler, SchedulerMixin -from .models import CogView3PlusTransformer2DModel, ModelMixin \ No newline at end of file +from .models import CogView3PlusTransformer2DModel, ModelMixin +from .utils import set_random_seed \ No newline at end of file diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py index d242bbcd55..2f14fdd7c3 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py @@ -27,7 +27,6 @@ from diffusers import AutoencoderKL from ..models import CogView3PlusTransformer2DModel from ..schedulers import CogVideoXDDIMScheduler -from ..utils import set_random_seed logger = logging.get_logger(__name__) # pylint: disable=invalid-name @@ -225,9 +224,6 @@ class CogView3PlusPipeline(DiffusionPipeline): guidance_scale: float = 5.0, num_images_per_prompt: int = 1, ) -> Tuple: - - set_random_seed(seed=42) - if image_size is None: height = self.transformer.config.sample_size * self.vae_scale_factor width = self.transformer.config.sample_size * self.vae_scale_factor diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/inference_cogview3plus.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/inference_cogview3plus.py index ae66dac6ca..b7a3bd4af1 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/inference_cogview3plus.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/inference_cogview3plus.py @@ -23,7 +23,7 @@ import json import torch -from cogview3plus import CogView3PlusPipeline +from cogview3plus import CogView3PlusPipeline, set_random_seed logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) @@ -186,8 +186,8 @@ def parse_arguments(): parser.add_argument("--num_inference_steps", type=int, default=50, help="Number of denoising steps for inference.") parser.add_argument("--width", type=int, default=1024, help="Width of the generated image.") parser.add_argument("--height", type=int, default=1024, help="Height of the generated image.") - parser.add_argument("--output_path", type=str, default="cogview3.png", help="Path to save the generated image.") parser.add_argument("--dtype", type=str, default="bf16", help="bf16 or fp16") + 
parser.add_argument("--seed", type=int, default=None, help="Random seed") parser.add_argument("--device_id", type=int, default=7, help="NPU device id") return parser.parse_args() @@ -197,6 +197,9 @@ def infer(args): torch.npu.set_device(args.device_id) dtype = torch.bfloat16 if args.dtype == "bf16" else torch.float16 + if args.seed is not None: + set_random_seed(args.seed) + # Load the pre-trained model with the specified precision pipe = CogView3PlusPipeline.from_pretrained(args.model_path, torch_dtype=dtype).to("npu") -- Gitee From ab635dd8049b722d3d16ea1056f8ce3ef3f7d6eb Mon Sep 17 00:00:00 2001 From: jiangmengyu Date: Thu, 2 Jan 2025 14:27:49 +0800 Subject: [PATCH 77/91] =?UTF-8?q?cogview=E9=80=82=E9=85=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../built-in/foundation/cogview3/inference_cogview3plus.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/inference_cogview3plus.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/inference_cogview3plus.py index b7a3bd4af1..0b1ebd4a5c 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/inference_cogview3plus.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/inference_cogview3plus.py @@ -188,7 +188,7 @@ def parse_arguments(): parser.add_argument("--height", type=int, default=1024, help="Height of the generated image.") parser.add_argument("--dtype", type=str, default="bf16", help="bf16 or fp16") parser.add_argument("--seed", type=int, default=None, help="Random seed") - parser.add_argument("--device_id", type=int, default=7, help="NPU device id") + parser.add_argument("--device_id", type=int, default=0, help="NPU device id") return parser.parse_args() -- Gitee From 15b52ffef0a891e4d273c506c66a0731396ed126 Mon Sep 17 00:00:00 2001 From: jiangmengyu Date: Thu, 2 Jan 2025 15:20:30 +0800 Subject: [PATCH 78/91] =?UTF-8?q?cogview=E9=80=82=E9=85=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../built-in/foundation/cogview3/README.md | 25 +++++++++++++++++++ .../cogview3/inference_cogview3plus.py | 6 ++--- 2 files changed, 28 insertions(+), 3 deletions(-) diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/README.md b/MindIE/MindIE-Torch/built-in/foundation/cogview3/README.md index e161ab5c10..b0b7760a3a 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/README.md +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/README.md @@ -164,3 +164,28 @@ python inference_cogview3plus.py \ - height: 需要生成的图像的高。 - num_inference_steps:推理迭代步数。 - dtype: 数据类型。目前只支持bf16。 + + +python3 inference_cogview3plus.py \ + --model_path \data\CogView3B \ + --prompt_file ./PartiPrompts.tsv \ + --prompt_file_type parti \ + --info_file_save_path ./image_info_PartiPrompts.json \ + --save_dir ./results_PartiPrompts \ + --num_images_per_prompt 4 \ + --height 1024 \ + --width 1024 \ + --batch_size 1 \ + --device_id 0 + +python3 inference_cogview3plus.py \ + --model_path \data\CogView3B \ + --prompt_file ./hpsv2_benchmark_prompts.json \ + --prompt_file_type hpsv2 \ + --info_file_save_path ./image_info_hpsv2.json \ + --save_dir ./results_hpsv2 \ + --num_images_per_prompt 1 \ + --height 1024 \ + --width 1024 \ + --batch_size 1 \ + --device_id 0 diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/inference_cogview3plus.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/inference_cogview3plus.py index 0b1ebd4a5c..0fca2f95df 100644 --- 
a/MindIE/MindIE-Torch/built-in/foundation/cogview3/inference_cogview3plus.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/inference_cogview3plus.py @@ -48,7 +48,7 @@ class PromptLoader: elif prompt_file_type == 'parti': self.load_prompts_parti(prompt_file, max_num_prompts) elif prompt_file_type == 'hpsv2': - self.load_prompts_hpsv2(max_num_prompts) + self.load_prompts_hpsv2(prompt_file, max_num_prompts) else: print("This operation is not supported!") @@ -117,8 +117,8 @@ class PromptLoader: catagory_id = self.catagories.index(catagory) self.prompts.append((prompt, catagory_id)) - def load_prompts_hpsv2(self, max_num_prompts: int): - with open('hpsv2_benchmark_prompts.json', 'r') as file: + def load_prompts_hpsv2(self, file_path: str, max_num_prompts: int): + with open(file_path, 'r') as file: all_prompts = json.load(file) count = 0 for style, prompts in all_prompts.items(): -- Gitee From 7e034859c7e16059b3190a9a712fc3b14f1ea274 Mon Sep 17 00:00:00 2001 From: jiangmengyu Date: Thu, 2 Jan 2025 15:24:05 +0800 Subject: [PATCH 79/91] =?UTF-8?q?cogview=E9=80=82=E9=85=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- MindIE/MindIE-Torch/built-in/foundation/cogview3/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/README.md b/MindIE/MindIE-Torch/built-in/foundation/cogview3/README.md index b0b7760a3a..f363b69186 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/README.md +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/README.md @@ -167,7 +167,7 @@ python inference_cogview3plus.py \ python3 inference_cogview3plus.py \ - --model_path \data\CogView3B \ + --model_path /data/CogView3B \ --prompt_file ./PartiPrompts.tsv \ --prompt_file_type parti \ --info_file_save_path ./image_info_PartiPrompts.json \ @@ -179,7 +179,7 @@ python3 inference_cogview3plus.py \ --device_id 0 python3 inference_cogview3plus.py \ - --model_path \data\CogView3B \ + --model_path /data/CogView3B \ --prompt_file ./hpsv2_benchmark_prompts.json \ --prompt_file_type hpsv2 \ --info_file_save_path ./image_info_hpsv2.json \ -- Gitee From e3b1cb444a57d520070884fe4b7068847cd77dde Mon Sep 17 00:00:00 2001 From: jiangmengyu Date: Thu, 2 Jan 2025 15:26:10 +0800 Subject: [PATCH 80/91] =?UTF-8?q?cogview=E9=80=82=E9=85=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- MindIE/MindIE-Torch/built-in/foundation/cogview3/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/README.md b/MindIE/MindIE-Torch/built-in/foundation/cogview3/README.md index f363b69186..534f9bc2ed 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/README.md +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/README.md @@ -168,7 +168,7 @@ python inference_cogview3plus.py \ python3 inference_cogview3plus.py \ --model_path /data/CogView3B \ - --prompt_file ./PartiPrompts.tsv \ + --prompt_file ./prompts/PartiPrompts.tsv \ --prompt_file_type parti \ --info_file_save_path ./image_info_PartiPrompts.json \ --save_dir ./results_PartiPrompts \ @@ -180,7 +180,7 @@ python3 inference_cogview3plus.py \ python3 inference_cogview3plus.py \ --model_path /data/CogView3B \ - --prompt_file ./hpsv2_benchmark_prompts.json \ + --prompt_file ./prompts/hpsv2_benchmark_prompts.json \ --prompt_file_type hpsv2 \ --info_file_save_path ./image_info_hpsv2.json \ --save_dir ./results_hpsv2 \ -- Gitee From 
ea4ccb9b6c7ba3efdfdcd7dd9d9b20ffa0126ce4 Mon Sep 17 00:00:00 2001 From: jiangmengyu Date: Thu, 2 Jan 2025 15:34:23 +0800 Subject: [PATCH 81/91] =?UTF-8?q?cogview=E9=80=82=E9=85=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- MindIE/MindIE-Torch/built-in/foundation/cogview3/README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/README.md b/MindIE/MindIE-Torch/built-in/foundation/cogview3/README.md index 534f9bc2ed..237e70f790 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/README.md +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/README.md @@ -176,6 +176,7 @@ python3 inference_cogview3plus.py \ --height 1024 \ --width 1024 \ --batch_size 1 \ + --seed 42 \ --device_id 0 python3 inference_cogview3plus.py \ @@ -188,4 +189,5 @@ python3 inference_cogview3plus.py \ --height 1024 \ --width 1024 \ --batch_size 1 \ + --seed 42 \ --device_id 0 -- Gitee From 6a162b05b1410ed560ef7cc3d9a8ba8719dd9623 Mon Sep 17 00:00:00 2001 From: jiangmengyu Date: Thu, 2 Jan 2025 15:58:20 +0800 Subject: [PATCH 82/91] =?UTF-8?q?cogview=E9=80=82=E9=85=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../built-in/foundation/cogview3/inference_cogview3plus.py | 1 - 1 file changed, 1 deletion(-) diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/inference_cogview3plus.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/inference_cogview3plus.py index 0fca2f95df..3ef016ed2a 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/inference_cogview3plus.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/inference_cogview3plus.py @@ -226,7 +226,6 @@ def infer(args): images = pipe( prompt=prompts, guidance_scale=args.guidance_scale, - num_images_per_prompt=args.num_images_per_prompt, num_inference_steps=args.num_inference_steps, image_size=(args.height, args.width), ) -- Gitee From a0a59728a09f642bf6d4035ea768fa5949f14201 Mon Sep 17 00:00:00 2001 From: jiangmengyu Date: Thu, 2 Jan 2025 16:50:12 +0800 Subject: [PATCH 83/91] =?UTF-8?q?cogview=E9=80=82=E9=85=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../built-in/foundation/cogview3/README.md | 138 ++++++++++++++++-- 1 file changed, 126 insertions(+), 12 deletions(-) diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/README.md b/MindIE/MindIE-Torch/built-in/foundation/cogview3/README.md index 237e70f790..f991ac774f 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/README.md +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/README.md @@ -70,15 +70,15 @@ pip install torch_npu-{pytorchversion}.xxxx.{arch}.whl ## 三、CogView3使用 ### 3.1 权重及配置文件说明 -1. CogView3权重路径: +1. 
CogView3权重主路径:
```shell
https://huggingface.co/THUDM/CogView3-Plus-3B/tree/main
```
-- 修改该权重的model_index.json
+- 修改主路径下的model_index.json文件
```shell
{
  "_class_name": "CogView3PlusPipeline",
-  "_diffusers_version": "0.31.0",
+  "_diffusers_version": "0.31.0.dev0",
  "scheduler": [
    "cogview3plus",
    "CogVideoXDDIMScheduler"
  ],
@@ -117,6 +117,30 @@ https://huggingface.co/THUDM/CogView3-Plus-3B/tree/main/tokenizer
```shell
https://huggingface.co/THUDM/CogView3-Plus-3B/tree/main/transformer
```
+- 修改该路径下的config.json文件
+```shell
+{
+    "_class_name": "CogView3PlusTransformer2DModel",
+    "_diffusers_version": "0.31.0.dev0",
+    "attention_head_dim": 40,
+    "condition_dim": 256,
+    "in_channels": 16,
+    "num_attention_heads": 64,
+    "num_layers": 30,
+    "out_channels": 16,
+    "patch_size": 2,
+    "pooled_projection_dim": 1536,
+    "pos_embed_max_size": 128,
+    "sample_size": 128,
+    "text_embed_dim": 4096,
+    "time_embed_dim": 512,
+    "use_cache": true,
+    "cache_interval": 2,
+    "cache_start": 3,
+    "num_cache_layer": 13,
+    "cache_start_steps": 5
+}
+```
6. vae权重链接:
```shell
https://huggingface.co/THUDM/CogView3-Plus-3B/tree/main/vae
```
@@ -142,32 +166,89 @@ https://huggingface.co/THUDM/CogView3-Plus-3B/tree/main/vae
| | |---- 模型权重
```
-### 3.2 单卡单prompt功能测试
-设置权重路径
+### 3.2 权重下载
+提前下载权重,放到数据集目录下(/data)。
+```shell
+# 需要使用 git-lfs (https://git-lfs.com)
+git lfs install
+
+# 下载CogView3权重
+git clone https://huggingface.co/THUDM/CogView3-Plus-3B
+```
+
+### 3.3 性能测试
+1. 进入主路径
+```shell
+cd cogview3
+```
+2. 设置权重路径
```shell
model_path='/data/CogView3B'
```
-执行命令:
+3. 创建输出图像路径
+```shell
+output_path='./results'
+mkdir ${output_path}
+```
+4. 推理:
```shell
python inference_cogview3plus.py \
--model_path ${model_path} \
-    --device_id 0 \
+    --prompt_file ./prompts/example_prompts.txt \
+    --save_dir ${output_path} \
--width 1024 \
--height 1024 \
--num_inference_steps 50 \
-    --dtype bf16
+    --dtype bf16 \
+    --device_id 0
```
参数说明:
- model_path:权重路径,包含scheduler、text_encoder、tokenizer、transformer、vae,5个模型的配置文件及权重。
-- device_id:推理设备ID。
+- prompt_file:提示词文件。
+- save_dir:生成图片的存放目录。
- width:需要生成的图像的宽。
- height: 需要生成的图像的高。
- num_inference_steps:推理迭代步数。
- dtype: 数据类型。目前只支持bf16。
+- device_id:推理设备ID。
+
+5. 可以通过修改权重文件中`/data/CogView3B/transformer/config.json`中的`use_cache`参数来控制dit cache算法的开关,`true`表示使用dit cache,`false`表示关闭dit cache。
+
+### 3.4 精度测试
+
+1. 由于生成的图片存在随机性,提供两种精度验证方法:
+    1. CLIP-score(文图匹配度量):评估图片和输入文本的相关性,分数的取值范围为[-1, 1],越高越好。使用Parti数据集进行验证。
+    2. HPSv2(图片美学度量):评估生成图片的人类偏好评分,分数的取值范围为[0, 1],越高越好。使用HPSv2数据集进行验证
+
+注意,由于要生成的图片数量较多,进行完整的精度验证需要耗费很长的时间。
+
+2. 下载Parti数据集和hpsv2数据集
+所有数据集放到congview3/prompts目录下
+```bash
+# 下载Parti数据集
+wget https://raw.githubusercontent.com/google-research/parti/main/PartiPrompts.tsv --no-check-certificate
+```
+hpsv2数据集下载链接:https://gitee.com/ascend/ModelZoo-PyTorch/blob/master/MindIE/MindIE-Torch/built-in/foundation/stable_diffusion_xl/hpsv2_benchmark_prompts.json
+
+3. 下载模型权重
+```bash
+# Clip Score和HPSv2均需要使用的权重
+GIT_LFS_SKIP_SMUDGE=1
+git clone https://huggingface.co/laion/CLIP-ViT-H-14-laion2B-s32B-b79K
+cd ./CLIP-ViT-H-14-laion2B-s32B-b79K
+
+# HPSv2权重
+wget https://huggingface.co/spaces/xswu/HPSv2/resolve/main/HPS_v2_compressed.pt --no-check-certificate
+```
+也可手动下载[权重](https://huggingface.co/laion/CLIP-ViT-H-14-laion2B-s32B-b79K/blob/main/open_clip_pytorch_model.bin)
+将权重放到`CLIP-ViT-H-14-laion2B-s32B-b79K`目录下,手动下载[HPSv2权重](https://huggingface.co/spaces/xswu/HPSv2/resolve/main/HPS_v2_compressed.pt)放到当前路径

+4. 
使用推理脚本读取Parti数据集,生成图片 +```bash +mkdir ./results_PartiPrompts python3 inference_cogview3plus.py \ - --model_path /data/CogView3B \ + --model_path ${model_path} \ --prompt_file ./prompts/PartiPrompts.tsv \ --prompt_file_type parti \ --info_file_save_path ./image_info_PartiPrompts.json \ @@ -177,10 +258,28 @@ python3 inference_cogview3plus.py \ --width 1024 \ --batch_size 1 \ --seed 42 \ - --device_id 0 + --device_id 0 +``` +参数说明: +- model_path:权重路径,包含scheduler、text_encoder、tokenizer、transformer、vae,5个模型的配置文件及权重。 +- prompt_file:提示词文件。 +- prompt_file_type: prompt文件类型,用于指定读取方式,可选plain,parti,hpsv2。 +- info_file_save_path:生成图片信息的json文件路径。 +- save_dir:生成图片的存放目录。 +- num_images_per_prompt: 每个prompt生成的图片数量。注意使用hpsv2时,设置num_images_per_prompt=1即可。 +- height: 需要生成的图像的高。 +- width:需要生成的图像的宽。 +- batch_size:模型batch size。 +- seed:随机种子。 +- device_id:推理设备ID。 + +执行完成后在`./results_PartiPrompts`目录下生成推理图片,在当前目录生成一个`image_info_PartiPrompts.json`文件,记录着图片和prompt的对应关系,并在终端显示推理时间。 +5. 使用推理脚本读取hpsv2数据集,生成图片 +```bash +mkdir ./results_hpsv2 python3 inference_cogview3plus.py \ - --model_path /data/CogView3B \ + --model_path ${model_path} \ --prompt_file ./prompts/hpsv2_benchmark_prompts.json \ --prompt_file_type hpsv2 \ --info_file_save_path ./image_info_hpsv2.json \ @@ -191,3 +290,18 @@ python3 inference_cogview3plus.py \ --batch_size 1 \ --seed 42 \ --device_id 0 +``` +参数说明: +- model_path:权重路径,包含scheduler、text_encoder、tokenizer、transformer、vae,5个模型的配置文件及权重。 +- prompt_file:提示词文件。 +- prompt_file_type: prompt文件类型,用于指定读取方式,可选plain,parti,hpsv2。 +- info_file_save_path:生成图片信息的json文件路径。 +- save_dir:生成图片的存放目录。 +- num_images_per_prompt: 每个prompt生成的图片数量。注意使用hpsv2时,设置num_images_per_prompt=1即可。 +- height: 需要生成的图像的高。 +- width:需要生成的图像的宽。 +- batch_size:模型batch size。 +- seed:随机种子。 +- device_id:推理设备ID。 + +执行完成后在`./results_hpsv2`目录下生成推理图片,在当前目录生成一个`image_info_hpsv2.json`文件,记录着图片和prompt的对应关系,并在终端显示推理时间。 -- Gitee From 55cfe04dbc475d0e29da9687a787634188fd8d46 Mon Sep 17 00:00:00 2001 From: jiangmengyu Date: Thu, 2 Jan 2025 16:54:28 +0800 Subject: [PATCH 84/91] =?UTF-8?q?cogview=E9=80=82=E9=85=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../built-in/foundation/cogview3/README.md | 150 +++++++++--------- 1 file changed, 75 insertions(+), 75 deletions(-) diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/README.md b/MindIE/MindIE-Torch/built-in/foundation/cogview3/README.md index f991ac774f..18e1c6a8ec 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/README.md +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/README.md @@ -217,91 +217,91 @@ python inference_cogview3plus.py \ ### 3.4 精度测试 1. 由于生成的图片存在随机性,提供两种精度验证方法: - 1. CLIP-score(文图匹配度量):评估图片和输入文本的相关性,分数的取值范围为[-1, 1],越高越好。使用Parti数据集进行验证。 - 2. HPSv2(图片美学度量):评估生成图片的人类偏好评分,分数的取值范围为[0, 1],越高越好。使用HPSv2数据集进行验证 + 1. CLIP-score(文图匹配度量):评估图片和输入文本的相关性,分数的取值范围为[-1, 1],越高越好。使用Parti数据集进行验证。 + 2. HPSv2(图片美学度量):评估生成图片的人类偏好评分,分数的取值范围为[0, 1],越高越好。使用HPSv2数据集进行验证 -注意,由于要生成的图片数量较多,进行完整的精度验证需要耗费很长的时间。 + 注意,由于要生成的图片数量较多,进行完整的精度验证需要耗费很长的时间。 2. 
下载Parti数据集和hpsv2数据集 -所有数据集放到congview3/prompts目录下 -```bash -# 下载Parti数据集 -wget https://raw.githubusercontent.com/google-research/parti/main/PartiPrompts.tsv --no-check-certificate -``` -hpsv2数据集下载链接:https://gitee.com/ascend/ModelZoo-PyTorch/blob/master/MindIE/MindIE-Torch/built-in/foundation/stable_diffusion_xl/hpsv2_benchmark_prompts.json + 所有数据集放到congview3/prompts目录下 + ```bash + # 下载Parti数据集 + wget https://raw.githubusercontent.com/google-research/parti/main/PartiPrompts.tsv --no-check-certificate + ``` + hpsv2数据集下载链接:https://gitee.com/ascend/ModelZoo-PyTorch/blob/master/MindIE/MindIE-Torch/built-in/foundation/stable_diffusion_xl/hpsv2_benchmark_prompts.json 3. 下载模型权重 -```bash -# Clip Score和HPSv2均需要使用的权重 -GIT_LFS_SKIP_SMUDGE=1 -git clone https://huggingface.co/laion/CLIP-ViT-H-14-laion2B-s32B-b79K -cd ./CLIP-ViT-H-14-laion2B-s32B-b79K + ```bash + # Clip Score和HPSv2均需要使用的权重 + GIT_LFS_SKIP_SMUDGE=1 + git clone https://huggingface.co/laion/CLIP-ViT-H-14-laion2B-s32B-b79K + cd ./CLIP-ViT-H-14-laion2B-s32B-b79K -# HPSv2权重 -wget https://huggingface.co/spaces/xswu/HPSv2/resolve/main/HPS_v2_compressed.pt --no-check-certificate -``` -也可手动下载[权重](https://huggingface.co/laion/CLIP-ViT-H-14-laion2B-s32B-b79K/blob/main/open_clip_pytorch_model.bin) -将权重放到`CLIP-ViT-H-14-laion2B-s32B-b79K`目录下,手动下载[HPSv2权重](https://huggingface.co/spaces/xswu/HPSv2/resolve/main/HPS_v2_compressed.pt)放到当前路径 + # HPSv2权重 + wget https://huggingface.co/spaces/xswu/HPSv2/resolve/main/HPS_v2_compressed.pt --no-check-certificate + ``` + 也可手动下载[权重](https://huggingface.co/laion/CLIP-ViT-H-14-laion2B-s32B-b79K/blob/main/open_clip_pytorch_model.bin) + 将权重放到`CLIP-ViT-H-14-laion2B-s32B-b79K`目录下,手动下载[HPSv2权重](https://huggingface.co/spaces/xswu/HPSv2/resolve/main/HPS_v2_compressed.pt)放到当前路径 4. 使用推理脚本读取Parti数据集,生成图片 -```bash -mkdir ./results_PartiPrompts -python3 inference_cogview3plus.py \ - --model_path ${model_path} \ - --prompt_file ./prompts/PartiPrompts.tsv \ - --prompt_file_type parti \ - --info_file_save_path ./image_info_PartiPrompts.json \ - --save_dir ./results_PartiPrompts \ - --num_images_per_prompt 4 \ - --height 1024 \ - --width 1024 \ - --batch_size 1 \ - --seed 42 \ - --device_id 0 -``` -参数说明: -- model_path:权重路径,包含scheduler、text_encoder、tokenizer、transformer、vae,5个模型的配置文件及权重。 -- prompt_file:提示词文件。 -- prompt_file_type: prompt文件类型,用于指定读取方式,可选plain,parti,hpsv2。 -- info_file_save_path:生成图片信息的json文件路径。 -- save_dir:生成图片的存放目录。 -- num_images_per_prompt: 每个prompt生成的图片数量。注意使用hpsv2时,设置num_images_per_prompt=1即可。 -- height: 需要生成的图像的高。 -- width:需要生成的图像的宽。 -- batch_size:模型batch size。 -- seed:随机种子。 -- device_id:推理设备ID。 + ```bash + mkdir ./results_PartiPrompts + python3 inference_cogview3plus.py \ + --model_path ${model_path} \ + --prompt_file ./prompts/PartiPrompts.tsv \ + --prompt_file_type parti \ + --info_file_save_path ./image_info_PartiPrompts.json \ + --save_dir ./results_PartiPrompts \ + --num_images_per_prompt 4 \ + --height 1024 \ + --width 1024 \ + --batch_size 1 \ + --seed 42 \ + --device_id 0 + ``` + 参数说明: + - model_path:权重路径,包含scheduler、text_encoder、tokenizer、transformer、vae,5个模型的配置文件及权重。 + - prompt_file:提示词文件。 + - prompt_file_type: prompt文件类型,用于指定读取方式,可选plain,parti,hpsv2。 + - info_file_save_path:生成图片信息的json文件路径。 + - save_dir:生成图片的存放目录。 + - num_images_per_prompt: 每个prompt生成的图片数量。注意使用hpsv2时,设置num_images_per_prompt=1即可。 + - height: 需要生成的图像的高。 + - width:需要生成的图像的宽。 + - batch_size:模型batch size。 + - seed:随机种子。 + - device_id:推理设备ID。 
-执行完成后在`./results_PartiPrompts`目录下生成推理图片,在当前目录生成一个`image_info_PartiPrompts.json`文件,记录着图片和prompt的对应关系,并在终端显示推理时间。 + 执行完成后在`./results_PartiPrompts`目录下生成推理图片,在当前目录生成一个`image_info_PartiPrompts.json`文件,记录着图片和prompt的对应关系,并在终端显示推理时间。 5. 使用推理脚本读取hpsv2数据集,生成图片 -```bash -mkdir ./results_hpsv2 -python3 inference_cogview3plus.py \ - --model_path ${model_path} \ - --prompt_file ./prompts/hpsv2_benchmark_prompts.json \ - --prompt_file_type hpsv2 \ - --info_file_save_path ./image_info_hpsv2.json \ - --save_dir ./results_hpsv2 \ - --num_images_per_prompt 1 \ - --height 1024 \ - --width 1024 \ - --batch_size 1 \ - --seed 42 \ - --device_id 0 -``` -参数说明: -- model_path:权重路径,包含scheduler、text_encoder、tokenizer、transformer、vae,5个模型的配置文件及权重。 -- prompt_file:提示词文件。 -- prompt_file_type: prompt文件类型,用于指定读取方式,可选plain,parti,hpsv2。 -- info_file_save_path:生成图片信息的json文件路径。 -- save_dir:生成图片的存放目录。 -- num_images_per_prompt: 每个prompt生成的图片数量。注意使用hpsv2时,设置num_images_per_prompt=1即可。 -- height: 需要生成的图像的高。 -- width:需要生成的图像的宽。 -- batch_size:模型batch size。 -- seed:随机种子。 -- device_id:推理设备ID。 + ```bash + mkdir ./results_hpsv2 + python3 inference_cogview3plus.py \ + --model_path ${model_path} \ + --prompt_file ./prompts/hpsv2_benchmark_prompts.json \ + --prompt_file_type hpsv2 \ + --info_file_save_path ./image_info_hpsv2.json \ + --save_dir ./results_hpsv2 \ + --num_images_per_prompt 1 \ + --height 1024 \ + --width 1024 \ + --batch_size 1 \ + --seed 42 \ + --device_id 0 + ``` + 参数说明: + - model_path:权重路径,包含scheduler、text_encoder、tokenizer、transformer、vae,5个模型的配置文件及权重。 + - prompt_file:提示词文件。 + - prompt_file_type: prompt文件类型,用于指定读取方式,可选plain,parti,hpsv2。 + - info_file_save_path:生成图片信息的json文件路径。 + - save_dir:生成图片的存放目录。 + - num_images_per_prompt: 每个prompt生成的图片数量。注意使用hpsv2时,设置num_images_per_prompt=1即可。 + - height: 需要生成的图像的高。 + - width:需要生成的图像的宽。 + - batch_size:模型batch size。 + - seed:随机种子。 + - device_id:推理设备ID。 -执行完成后在`./results_hpsv2`目录下生成推理图片,在当前目录生成一个`image_info_hpsv2.json`文件,记录着图片和prompt的对应关系,并在终端显示推理时间。 + 执行完成后在`./results_hpsv2`目录下生成推理图片,在当前目录生成一个`image_info_hpsv2.json`文件,记录着图片和prompt的对应关系,并在终端显示推理时间。 -- Gitee From 98596465aeb500135ab378f4a8ffebe52e61fa0b Mon Sep 17 00:00:00 2001 From: jiangmengyu Date: Thu, 2 Jan 2025 16:55:53 +0800 Subject: [PATCH 85/91] =?UTF-8?q?cogview=E9=80=82=E9=85=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- MindIE/MindIE-Torch/built-in/foundation/cogview3/README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/README.md b/MindIE/MindIE-Torch/built-in/foundation/cogview3/README.md index 18e1c6a8ec..a12dd3d180 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/README.md +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/README.md @@ -217,10 +217,10 @@ python inference_cogview3plus.py \ ### 3.4 精度测试 1. 由于生成的图片存在随机性,提供两种精度验证方法: - 1. CLIP-score(文图匹配度量):评估图片和输入文本的相关性,分数的取值范围为[-1, 1],越高越好。使用Parti数据集进行验证。 - 2. HPSv2(图片美学度量):评估生成图片的人类偏好评分,分数的取值范围为[0, 1],越高越好。使用HPSv2数据集进行验证 + 1. CLIP-score(文图匹配度量):评估图片和输入文本的相关性,分数的取值范围为[-1, 1],越高越好。使用Parti数据集进行验证。 + 2. HPSv2(图片美学度量):评估生成图片的人类偏好评分,分数的取值范围为[0, 1],越高越好。使用HPSv2数据集进行验证 - 注意,由于要生成的图片数量较多,进行完整的精度验证需要耗费很长的时间。 + 注意,由于要生成的图片数量较多,进行完整的精度验证需要耗费很长的时间。 2. 
下载Parti数据集和hpsv2数据集 所有数据集放到congview3/prompts目录下 -- Gitee From 2354124247badcb220e1c2b7e8bbf000d0c786ef Mon Sep 17 00:00:00 2001 From: jiangmengyu Date: Thu, 2 Jan 2025 16:56:29 +0800 Subject: [PATCH 86/91] =?UTF-8?q?cogview=E9=80=82=E9=85=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../built-in/foundation/cogview3/README.md | 144 +++++++++--------- 1 file changed, 72 insertions(+), 72 deletions(-) diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/README.md b/MindIE/MindIE-Torch/built-in/foundation/cogview3/README.md index a12dd3d180..426a0a4e53 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/README.md +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/README.md @@ -223,85 +223,85 @@ python inference_cogview3plus.py \ 注意,由于要生成的图片数量较多,进行完整的精度验证需要耗费很长的时间。 2. 下载Parti数据集和hpsv2数据集 - 所有数据集放到congview3/prompts目录下 - ```bash - # 下载Parti数据集 - wget https://raw.githubusercontent.com/google-research/parti/main/PartiPrompts.tsv --no-check-certificate - ``` - hpsv2数据集下载链接:https://gitee.com/ascend/ModelZoo-PyTorch/blob/master/MindIE/MindIE-Torch/built-in/foundation/stable_diffusion_xl/hpsv2_benchmark_prompts.json + 所有数据集放到congview3/prompts目录下 + ```bash + # 下载Parti数据集 + wget https://raw.githubusercontent.com/google-research/parti/main/PartiPrompts.tsv --no-check-certificate + ``` + hpsv2数据集下载链接:https://gitee.com/ascend/ModelZoo-PyTorch/blob/master/MindIE/MindIE-Torch/built-in/foundation/stable_diffusion_xl/hpsv2_benchmark_prompts.json 3. 下载模型权重 - ```bash - # Clip Score和HPSv2均需要使用的权重 - GIT_LFS_SKIP_SMUDGE=1 - git clone https://huggingface.co/laion/CLIP-ViT-H-14-laion2B-s32B-b79K - cd ./CLIP-ViT-H-14-laion2B-s32B-b79K + ```bash + # Clip Score和HPSv2均需要使用的权重 + GIT_LFS_SKIP_SMUDGE=1 + git clone https://huggingface.co/laion/CLIP-ViT-H-14-laion2B-s32B-b79K + cd ./CLIP-ViT-H-14-laion2B-s32B-b79K - # HPSv2权重 - wget https://huggingface.co/spaces/xswu/HPSv2/resolve/main/HPS_v2_compressed.pt --no-check-certificate - ``` - 也可手动下载[权重](https://huggingface.co/laion/CLIP-ViT-H-14-laion2B-s32B-b79K/blob/main/open_clip_pytorch_model.bin) - 将权重放到`CLIP-ViT-H-14-laion2B-s32B-b79K`目录下,手动下载[HPSv2权重](https://huggingface.co/spaces/xswu/HPSv2/resolve/main/HPS_v2_compressed.pt)放到当前路径 + # HPSv2权重 + wget https://huggingface.co/spaces/xswu/HPSv2/resolve/main/HPS_v2_compressed.pt --no-check-certificate + ``` + 也可手动下载[权重](https://huggingface.co/laion/CLIP-ViT-H-14-laion2B-s32B-b79K/blob/main/open_clip_pytorch_model.bin) + 将权重放到`CLIP-ViT-H-14-laion2B-s32B-b79K`目录下,手动下载[HPSv2权重](https://huggingface.co/spaces/xswu/HPSv2/resolve/main/HPS_v2_compressed.pt)放到当前路径 4. 
使用推理脚本读取Parti数据集,生成图片 - ```bash - mkdir ./results_PartiPrompts - python3 inference_cogview3plus.py \ - --model_path ${model_path} \ - --prompt_file ./prompts/PartiPrompts.tsv \ - --prompt_file_type parti \ - --info_file_save_path ./image_info_PartiPrompts.json \ - --save_dir ./results_PartiPrompts \ - --num_images_per_prompt 4 \ - --height 1024 \ - --width 1024 \ - --batch_size 1 \ - --seed 42 \ - --device_id 0 - ``` - 参数说明: - - model_path:权重路径,包含scheduler、text_encoder、tokenizer、transformer、vae,5个模型的配置文件及权重。 - - prompt_file:提示词文件。 - - prompt_file_type: prompt文件类型,用于指定读取方式,可选plain,parti,hpsv2。 - - info_file_save_path:生成图片信息的json文件路径。 - - save_dir:生成图片的存放目录。 - - num_images_per_prompt: 每个prompt生成的图片数量。注意使用hpsv2时,设置num_images_per_prompt=1即可。 - - height: 需要生成的图像的高。 - - width:需要生成的图像的宽。 - - batch_size:模型batch size。 - - seed:随机种子。 - - device_id:推理设备ID。 + ```bash + mkdir ./results_PartiPrompts + python3 inference_cogview3plus.py \ + --model_path ${model_path} \ + --prompt_file ./prompts/PartiPrompts.tsv \ + --prompt_file_type parti \ + --info_file_save_path ./image_info_PartiPrompts.json \ + --save_dir ./results_PartiPrompts \ + --num_images_per_prompt 4 \ + --height 1024 \ + --width 1024 \ + --batch_size 1 \ + --seed 42 \ + --device_id 0 + ``` + 参数说明: + - model_path:权重路径,包含scheduler、text_encoder、tokenizer、transformer、vae,5个模型的配置文件及权重。 + - prompt_file:提示词文件。 + - prompt_file_type: prompt文件类型,用于指定读取方式,可选plain,parti,hpsv2。 + - info_file_save_path:生成图片信息的json文件路径。 + - save_dir:生成图片的存放目录。 + - num_images_per_prompt: 每个prompt生成的图片数量。注意使用hpsv2时,设置num_images_per_prompt=1即可。 + - height: 需要生成的图像的高。 + - width:需要生成的图像的宽。 + - batch_size:模型batch size。 + - seed:随机种子。 + - device_id:推理设备ID。 - 执行完成后在`./results_PartiPrompts`目录下生成推理图片,在当前目录生成一个`image_info_PartiPrompts.json`文件,记录着图片和prompt的对应关系,并在终端显示推理时间。 + 执行完成后在`./results_PartiPrompts`目录下生成推理图片,在当前目录生成一个`image_info_PartiPrompts.json`文件,记录着图片和prompt的对应关系,并在终端显示推理时间。 5. 
使用推理脚本读取hpsv2数据集,生成图片 - ```bash - mkdir ./results_hpsv2 - python3 inference_cogview3plus.py \ - --model_path ${model_path} \ - --prompt_file ./prompts/hpsv2_benchmark_prompts.json \ - --prompt_file_type hpsv2 \ - --info_file_save_path ./image_info_hpsv2.json \ - --save_dir ./results_hpsv2 \ - --num_images_per_prompt 1 \ - --height 1024 \ - --width 1024 \ - --batch_size 1 \ - --seed 42 \ - --device_id 0 - ``` - 参数说明: - - model_path:权重路径,包含scheduler、text_encoder、tokenizer、transformer、vae,5个模型的配置文件及权重。 - - prompt_file:提示词文件。 - - prompt_file_type: prompt文件类型,用于指定读取方式,可选plain,parti,hpsv2。 - - info_file_save_path:生成图片信息的json文件路径。 - - save_dir:生成图片的存放目录。 - - num_images_per_prompt: 每个prompt生成的图片数量。注意使用hpsv2时,设置num_images_per_prompt=1即可。 - - height: 需要生成的图像的高。 - - width:需要生成的图像的宽。 - - batch_size:模型batch size。 - - seed:随机种子。 - - device_id:推理设备ID。 + ```bash + mkdir ./results_hpsv2 + python3 inference_cogview3plus.py \ + --model_path ${model_path} \ + --prompt_file ./prompts/hpsv2_benchmark_prompts.json \ + --prompt_file_type hpsv2 \ + --info_file_save_path ./image_info_hpsv2.json \ + --save_dir ./results_hpsv2 \ + --num_images_per_prompt 1 \ + --height 1024 \ + --width 1024 \ + --batch_size 1 \ + --seed 42 \ + --device_id 0 + ``` + 参数说明: + - model_path:权重路径,包含scheduler、text_encoder、tokenizer、transformer、vae,5个模型的配置文件及权重。 + - prompt_file:提示词文件。 + - prompt_file_type: prompt文件类型,用于指定读取方式,可选plain,parti,hpsv2。 + - info_file_save_path:生成图片信息的json文件路径。 + - save_dir:生成图片的存放目录。 + - num_images_per_prompt: 每个prompt生成的图片数量。注意使用hpsv2时,设置num_images_per_prompt=1即可。 + - height: 需要生成的图像的高。 + - width:需要生成的图像的宽。 + - batch_size:模型batch size。 + - seed:随机种子。 + - device_id:推理设备ID。 - 执行完成后在`./results_hpsv2`目录下生成推理图片,在当前目录生成一个`image_info_hpsv2.json`文件,记录着图片和prompt的对应关系,并在终端显示推理时间。 + 执行完成后在`./results_hpsv2`目录下生成推理图片,在当前目录生成一个`image_info_hpsv2.json`文件,记录着图片和prompt的对应关系,并在终端显示推理时间。 -- Gitee From cf9848a2abecc6952a0e7cbfdda6a9ac1f30149c Mon Sep 17 00:00:00 2001 From: jiangmengyu Date: Mon, 6 Jan 2025 10:42:48 +0800 Subject: [PATCH 87/91] =?UTF-8?q?cogview=E9=80=82=E9=85=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../foundation/cogview3/clip_score.py | 140 ++++++++++++++++++ .../models/transformer_cogview3plus.py | 6 +- .../foundation/cogview3/hpsv2_score.py | 123 +++++++++++++++ 3 files changed, 266 insertions(+), 3 deletions(-) create mode 100644 MindIE/MindIE-Torch/built-in/foundation/cogview3/clip_score.py create mode 100644 MindIE/MindIE-Torch/built-in/foundation/cogview3/hpsv2_score.py diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/clip_score.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/clip_score.py new file mode 100644 index 0000000000..e0987baac7 --- /dev/null +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/clip_score.py @@ -0,0 +1,140 @@ +# Copyright 2023 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import os +import json +import time +import argparse + +import open_clip +import numpy as np +from PIL import Image +import torch +import torch.nn.functional as F + + +def clip_score(model_clip, tokenizer, preprocess, prompt, image_files, device): + imgs = [] + texts = [] + for image_file in image_files: + img = preprocess(Image.open(image_file)).unsqueeze(0).to(device) + imgs.append(img) + text = tokenizer([prompt]).to(device) + texts.append(text) + + img = torch.cat(imgs) # [bs, 3, 224, 224] + text = torch.cat(texts) # [bs, 77] + + with torch.no_grad(): + text_ft = model_clip.encode_text(text).float() + img_ft = model_clip.encode_image(img).float() + score = F.cosine_similarity(img_ft, text_ft).squeeze() + + return score.cpu() + + +def main(): + args = parse_arguments() + + if args.device is None: + device = torch.device('cuda' if (torch.cuda.is_available()) else 'cpu') + else: + device = torch.device(args.device) + + t_b = time.time() + print(f"Load clip model...") + model_clip, _, preprocess = open_clip.create_model_and_transforms( + args.model_name, pretrained=args.model_weights_path, device=device) + model_clip.eval() + print(f">done. elapsed time: {(time.time() - t_b):.3f} s") + + tokenizer = open_clip.get_tokenizer(args.model_name) + + with os.fdopen(os.open(args.image_info, os.O_RDONLY), "r") as f: + image_info = json.load(f) + + t_b = time.time() + print(f"Calc clip score...") + all_scores = [] + cat_scores = {} + + for i, info in enumerate(image_info): + image_files = info['images'] + category = info['category'] + prompt = info['prompt'] + + print(f"[{i + 1}/{len(image_info)}] {prompt}") + + image_scores = clip_score(model_clip, + tokenizer, + preprocess, + prompt, + image_files, + device) + if len(image_files) > 1: + best_score = max(image_scores) + else: + best_score = image_scores + + print(f"image scores: {image_scores}") + print(f"best score: {best_score}") + + all_scores.append(best_score) + if category not in cat_scores: + cat_scores[category] = [] + cat_scores[category].append(best_score) + print(f">done. 
elapsed time: {(time.time() - t_b):.3f} s") + + average_score = np.average(all_scores) + print(f"====================================") + print(f"average score: {average_score:.3f}") + print(f"category average scores:") + cat_average_scores = {} + for category, scores in cat_scores.items(): + cat_average_scores[category] = np.average(scores) + print(f"[{category}], average score: {cat_average_scores[category]:.3f}") + + +def parse_arguments(): + parser = argparse.ArgumentParser() + parser.add_argument( + "--device", + type=str, + default="cpu", + choices=["cpu", "cuda"], + help="device for torch.", + ) + parser.add_argument( + "--image_info", + type=str, + default="./image_info.json", + help="Image_info.json file.", + ) + parser.add_argument( + "--model_name", + type=str, + default="ViT-H-14", + help="open clip model name", + ) + parser.add_argument( + "--model_weights_path", + type=str, + default="./CLIP-ViT-H-14-laion2B-s32B-b79K/open_clip_pytorch_model.bin", + help="open clip model weights", + ) + return parser.parse_args() + + +if __name__ == '__main__': + main() \ No newline at end of file diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/transformer_cogview3plus.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/transformer_cogview3plus.py index f704e22589..9515d865be 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/transformer_cogview3plus.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/transformer_cogview3plus.py @@ -130,9 +130,9 @@ class CogView3PlusTransformer2DModel(ModelMixin, ConfigMixin): pos_embed_max_size: int = 128, use_cache: bool = True, cache_interval: int = 2, - cache_start: int = 3, - num_cache_layer: int = 13, - cache_start_steps: int = 5, + cache_start: int = 1, + num_cache_layer: int = 11, + cache_start_steps: int = 10, ): super().__init__() self.out_channels = out_channels diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/hpsv2_score.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/hpsv2_score.py new file mode 100644 index 0000000000..a27ba20b5a --- /dev/null +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/hpsv2_score.py @@ -0,0 +1,123 @@ +# Copyright 2024 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import argparse +import os +from typing import Union +import json + +from clint.textui import progress +import hpsv2 +from hpsv2.utils import root_path, hps_version_map +from hpsv2.src.open_clip import create_model_and_transforms, get_tokenizer +import huggingface_hub +from PIL import Image +import requests +import torch + + +def initialize_model(pretrained_path, device): + model, _, preprocess_val = create_model_and_transforms( + "ViT-H-14", pretrained=pretrained_path, precision='amp', + device=device, + jit=False, + force_quick_gelu=False, + force_custom_text=False, + force_patch_dropout=False, + force_image_size=None, + pretrained_image=False, + image_mean=None, + image_std=None, + light_augmentation=True, + aug_cfg={}, + output_dict=True, + with_score_predictor=False, + with_region_predictor=False + ) + return model, preprocess_val + + +def parse_arguments(): + parser = argparse.ArgumentParser() + parser.add_argument( + "--image_info", + type=str, + default="./image_info.json", + help="Image_info.json file.", + ) + parser.add_argument( + "--HPSv2_checkpoint", + type=str, + default="./HPS_v2_compressed.pt", + help="HPS_v2 model weights", + ) + parser.add_argument( + "--clip_checkpoint", + type=str, + default="./CLIP-ViT-H-14-laion2B-s32B-b79K/open_clip_pytorch_model.bin", + help="open clip model weights", + ) + return parser.parse_args() + + +def main(): + args = parse_arguments() + + device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') + + model, preprocess_val = initialize_model(args.clip_checkpoint, device) + + checkpoint = torch.load(args.HPSv2_checkpoint, map_location=device) + model.load_state_dict(checkpoint['state_dict']) + tokenizer = get_tokenizer('ViT-H-14') + model = model.to(device) + model.eval() + + with os.fdopen(os.open(args.image_info, os.O_RDONLY), "r") as f: + image_info = json.load(f) + + result = [] + for i, info in enumerate(image_info): + image_file = info['images'][0] + prompt = info['prompt'] + + # Load your image and prompt + with torch.no_grad(): + # Process the image + if isinstance(image_file, str): + image = preprocess_val(Image.open(image_file)) + elif isinstance(image_file, Image.Image): + image = preprocess_val(image_file) + else: + raise TypeError('The type of parameter img_path is illegal.') + image = image.unsqueeze(0).to(device=device, non_blocking=True) + # Process the prompt + text = tokenizer([prompt]).to(device=device, non_blocking=True) + # Calculate the HPS + with torch.cuda.amp.autocast(): + outputs = model(image, text) + image_features = outputs["image_features"] + text_features = outputs["text_features"] + logits_per_image = image_features @ text_features.T + + hps_score = torch.diagonal(logits_per_image).cpu().numpy() + print(f"image {i} hps_score: ", hps_score[0]) + + result.append(hps_score[0]) + + print('avg HPSv2 score:', sum(result) / len(result)) + + +if __name__ == '__main__': + main() \ No newline at end of file -- Gitee From 3c3439187bdb07c46fc0c74413396dc24e747619 Mon Sep 17 00:00:00 2001 From: jiangmengyu Date: Mon, 6 Jan 2025 11:07:24 +0800 Subject: [PATCH 88/91] =?UTF-8?q?cogview=E9=80=82=E9=85=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../built-in/foundation/cogview3/clip_score.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/clip_score.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/clip_score.py index e0987baac7..b9bf3ce7bb 100644 --- 
a/MindIE/MindIE-Torch/built-in/foundation/cogview3/clip_score.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/clip_score.py @@ -22,6 +22,7 @@ import numpy as np from PIL import Image import torch import torch.nn.functional as F +import torch_npu def clip_score(model_clip, tokenizer, preprocess, prompt, image_files, device): @@ -47,6 +48,9 @@ def clip_score(model_clip, tokenizer, preprocess, prompt, image_files, device): def main(): args = parse_arguments() + if args.device == 'npu': + torch.npu.set_device(0) + if args.device is None: device = torch.device('cuda' if (torch.cuda.is_available()) else 'cpu') else: @@ -111,8 +115,8 @@ def parse_arguments(): parser.add_argument( "--device", type=str, - default="cpu", - choices=["cpu", "cuda"], + default="npu", + choices=["cpu", "cuda", "npu"], help="device for torch.", ) parser.add_argument( -- Gitee From 351fb0602edadc7837a80f3c8c8b39ed513a99f4 Mon Sep 17 00:00:00 2001 From: jiangmengyu Date: Mon, 6 Jan 2025 11:13:59 +0800 Subject: [PATCH 89/91] =?UTF-8?q?cogview=E9=80=82=E9=85=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../built-in/foundation/cogview3/clip_score.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/clip_score.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/clip_score.py index b9bf3ce7bb..e0987baac7 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/clip_score.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/clip_score.py @@ -22,7 +22,6 @@ import numpy as np from PIL import Image import torch import torch.nn.functional as F -import torch_npu def clip_score(model_clip, tokenizer, preprocess, prompt, image_files, device): @@ -48,9 +47,6 @@ def clip_score(model_clip, tokenizer, preprocess, prompt, image_files, device): def main(): args = parse_arguments() - if args.device == 'npu': - torch.npu.set_device(0) - if args.device is None: device = torch.device('cuda' if (torch.cuda.is_available()) else 'cpu') else: @@ -115,8 +111,8 @@ def parse_arguments(): parser.add_argument( "--device", type=str, - default="npu", - choices=["cpu", "cuda", "npu"], + default="cpu", + choices=["cpu", "cuda"], help="device for torch.", ) parser.add_argument( -- Gitee From d9290bc41b3bf0d595adddb848df830c44f27cba Mon Sep 17 00:00:00 2001 From: jiangmengyu Date: Mon, 6 Jan 2025 19:55:03 +0800 Subject: [PATCH 90/91] =?UTF-8?q?cogview=E9=80=82=E9=85=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../built-in/foundation/cogview3/README.md | 61 +++++++++++++------ .../cogview3/inference_cogview3plus.py | 4 +- .../foundation/cogview3/requirents.txt | 2 +- 3 files changed, 45 insertions(+), 22 deletions(-) diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/README.md b/MindIE/MindIE-Torch/built-in/foundation/cogview3/README.md index 426a0a4e53..a3c4f437c8 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/README.md +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/README.md @@ -5,7 +5,7 @@ | 配套 | 版本 | 环境准备指导 | | ----- | ----- |-----| | Python | 3.10.12 | - | - | torch | 2.4.0 | - | + | torch | 2.1.0 | - | ### 1.1 获取CANN&MindIE安装包&环境准备 - [800I A2](https://www.hiascend.com/developer/download/community/result?module=pt+ie+cann&product=4&model=32) @@ -136,9 +136,9 @@ https://huggingface.co/THUDM/CogView3-Plus-3B/tree/main/transformer "time_embed_dim": 512, "use_cache": True, "cache_interval": 2, - "cache_start": 3, - 
"num_cache_layer" 13, - "cache_start_steps" 5 + "cache_start": 1, + "num_cache_layer" 11, + "cache_start_steps" 10 } ``` 6. vae权重链接: @@ -181,21 +181,11 @@ git clone https://huggingface.co/THUDM/CogView3-Plus-3B ```shell cd cogview3 ``` -2. 设置权重路径 -```shell -model_path='/data/CogView3B' -``` -3. 创建输出图像路径 -```shell -output_path='./results' -mkdir ${output_path} -``` -4. 推理: +2. 推理: ```shell python inference_cogview3plus.py \ - --model_path ${model_path} \ + --model_path /data/CogView3B \ --prompt_file ./prompts/example_prompts.txt \ - --save_dir ${output_path} \ --width 1024 \ --height 1024 \ --num_inference_steps 50 \ @@ -205,14 +195,13 @@ python inference_cogview3plus.py \ 参数说明: - model_path:权重路径,包含scheduler、text_encoder、tokenizer、transformer、vae,5个模型的配置文件及权重。 - prompt_file:提示词文件。 -- save_dir:生成图片的存放目录。 - width:需要生成的图像的宽。 - height: 需要生成的图像的高。 - num_inference_steps:推理迭代步数。 - dtype: 数据类型。目前只支持bf16。 - device_id:推理设备ID。 -5. 可以通过修改权重文件中`/data/CongView3B/transforer/config.json`中的`use_cache`参数来控制dit cache算法的开关,`true`表示使用dit cache,`false`表示关闭dit cache。 +3. 可以通过修改权重文件中`/data/CongView3B/transforer/config.json`中的`use_cache`参数来控制dit cache算法的开关,`true`表示使用dit cache,`false`表示关闭dit cache。 ### 3.4 精度测试 @@ -248,7 +237,7 @@ python inference_cogview3plus.py \ ```bash mkdir ./results_PartiPrompts python3 inference_cogview3plus.py \ - --model_path ${model_path} \ + --model_path /data/CogView3B \ --prompt_file ./prompts/PartiPrompts.tsv \ --prompt_file_type parti \ --info_file_save_path ./image_info_PartiPrompts.json \ @@ -279,7 +268,7 @@ python inference_cogview3plus.py \ ```bash mkdir ./results_hpsv2 python3 inference_cogview3plus.py \ - --model_path ${model_path} \ + --model_path /data/CogView3B \ --prompt_file ./prompts/hpsv2_benchmark_prompts.json \ --prompt_file_type hpsv2 \ --info_file_save_path ./image_info_hpsv2.json \ @@ -305,3 +294,35 @@ python inference_cogview3plus.py \ - device_id:推理设备ID。 执行完成后在`./results_hpsv2`目录下生成推理图片,在当前目录生成一个`image_info_hpsv2.json`文件,记录着图片和prompt的对应关系,并在终端显示推理时间。 + +6. 计算精度指标 + 1. CLIP-score + ```bash + python3 clip_score.py \ + --device=gpu \ + --image_info="./image_info_PartiPrompts_cache.json" \ + --model_name="ViT-H-14" \ + --model_weights_path="./CLIP-ViT-H-14-laion2B-s32B-b79K/open_clip_pytorch_model.bin" + ``` + 参数说明: + - --device: 推理设备。 + - --image_info: 上一步生成的`image_info.json`文件。 + - --model_name: Clip模型名称。 + - --model_weights_path: Clip模型权重文件路径。 + + 执行完成后会在屏幕打印出精度计算结果。 + + 2. 
HPSv2
+    ```bash
+    python3 hpsv2_score.py \
+        --image_info="image_info_hpsv2_cache.json" \
+        --HPSv2_checkpoint="./HPS_v2_compressed.pt" \
+        --clip_checkpoint="./CLIP-ViT-H-14-laion2B-s32B-b79K/open_clip_pytorch_model.bin"
+    ```
+
+    参数说明:
+    - --image_info: 上一步生成的`image_info.json`文件。
+    - --HPSv2_checkpoint: HPSv2模型权重文件路径。
+    - --clip_checkpoint: Clip模型权重文件路径。
+
+    执行完成后会在屏幕打印出精度计算结果。
diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/inference_cogview3plus.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/inference_cogview3plus.py
index 3ef016ed2a..ae90899030 100644
--- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/inference_cogview3plus.py
+++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/inference_cogview3plus.py
@@ -197,6 +197,9 @@ def infer(args):
         torch.npu.set_device(args.device_id)
 
     dtype = torch.bfloat16 if args.dtype == "bf16" else torch.float16
+    if not os.path.exists(args.save_dir):
+        os.makedirs(args.save_dir)
+
     if args.seed is not None:
         set_random_seed(args.seed)
 
@@ -259,4 +262,3 @@ def infer(args):
 if __name__ == "__main__":
     inference_args = parse_arguments()
     infer(inference_args)
-
diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/requirents.txt b/MindIE/MindIE-Torch/built-in/foundation/cogview3/requirents.txt
index 1600434700..b3b2501d42 100644
--- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/requirents.txt
+++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/requirents.txt
@@ -4,5 +4,5 @@ gradio==5.9.1
 accelerate==1.0.1
 diffusers==0.31.0
 sentencepiece==0.2.0
-torch==2.4.0
+torch==2.1.0
 openai==1.58.1
\ No newline at end of file
-- 
Gitee


From 50a97fccc331e5ffe2decb51d69043a784c35485 Mon Sep 17 00:00:00 2001
From: jiangmengyu
Date: Tue, 7 Jan 2025 11:25:53 +0800
Subject: [PATCH 91/91] =?UTF-8?q?cogview=E9=80=82=E9=85=8D?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../built-in/foundation/cogview3/README.md | 16 +++++++++++-----
 1 file changed, 11 insertions(+), 5 deletions(-)

diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/README.md b/MindIE/MindIE-Torch/built-in/foundation/cogview3/README.md
index a3c4f437c8..a16bf5a0b9 100644
--- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/README.md
+++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/README.md
@@ -134,7 +134,7 @@ https://huggingface.co/THUDM/CogView3-Plus-3B/tree/main/transformer
     "sample_size": 128,
     "text_embed_dim": 4096,
     "time_embed_dim": 512,
-    "use_cache": True,
+    "use_cache": False,
     "cache_interval": 2,
     "cache_start": 1,
     "num_cache_layer" 11,
@@ -225,7 +225,6 @@ python inference_cogview3plus.py \
     # Clip Score和HPSv2均需要使用的权重
     GIT_LFS_SKIP_SMUDGE=1
     git clone https://huggingface.co/laion/CLIP-ViT-H-14-laion2B-s32B-b79K
-    cd ./CLIP-ViT-H-14-laion2B-s32B-b79K
 
     # HPSv2权重
     wget https://huggingface.co/spaces/xswu/HPSv2/resolve/main/HPS_v2_compressed.pt --no-check-certificate
@@ -299,12 +299,12 @@ python inference_cogview3plus.py \
     ```bash
     python3 clip_score.py \
       --device=gpu \
-      --image_info="./image_info_PartiPrompts_cache.json" \
+      --image_info="./image_info_PartiPrompts.json" \
       --model_name="ViT-H-14" \
      --model_weights_path="./CLIP-ViT-H-14-laion2B-s32B-b79K/open_clip_pytorch_model.bin"
     ```
     参数说明:
-    - --device: 推理设备。
+    - --device: 推理设备(CPU或者GPU)。
     - --image_info: 上一步生成的`image_info.json`文件。
     - --model_name: Clip模型名称。
     - --model_weights_path: Clip模型权重文件路径。
 
     执行完成后会在屏幕打印出精度计算结果。
 
     2. HPSv2
     ```bash
     python3 hpsv2_score.py \
-        --image_info="image_info_hpsv2_cache.json" \
+        --image_info="image_info_hpsv2.json" \
         --HPSv2_checkpoint="./HPS_v2_compressed.pt" \
         --clip_checkpoint="./CLIP-ViT-H-14-laion2B-s32B-b79K/open_clip_pytorch_model.bin"
     ```
 
     参数说明:
     - --image_info: 上一步生成的`image_info.json`文件。
     - --HPSv2_checkpoint: HPSv2模型权重文件路径。
     - --clip_checkpoint: Clip模型权重文件路径。
 
     执行完成后会在屏幕打印出精度计算结果。
+
+### CogView3plus
+
+| 硬件形态 | 迭代次数 | dit cache | 平均耗时 | CLIP_score | HPSv2_score |
+| :------: |:----:|:----:|:----:|:----:|:----:|
+| Atlas 800T A2 (64G) 单卡 | 50 | False | 27.588s | 0.367 | 0.2879729 |
+| Atlas 800T A2 (64G) 单卡 | 50 | True | 23.639s | 0.367 | 0.2878573 |
\ No newline at end of file
-- 
Gitee
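
上表中 dit cache 开/关两组数据分别对应 `transformer/config.json` 中 `use_cache` 开关的两种配置。下面是一个示意性的小脚本(非本仓库自带,函数名 `set_dit_cache` 为示例假设;权重根目录 `/data/CogView3B`、`transformer/config.json` 路径及 `use_cache` 字段均取自上文 README 的说明,实际字段取值请以本地配置文件为准),用于在两种配置之间切换:

```python
# 示意脚本:切换 CogView3plus transformer 配置中的 dit cache 开关
# 假设权重目录结构为 <model_path>/transformer/config.json(与上文 README 的说明一致)
import json
import os


def set_dit_cache(model_path, enable):
    """读取 transformer/config.json,设置 use_cache 后原样写回,返回配置文件路径。"""
    config_path = os.path.join(model_path, "transformer", "config.json")
    with open(config_path, "r", encoding="utf-8") as f:
        config = json.load(f)
    # 仅切换 use_cache 开关,cache_interval、cache_start 等其余字段保持不变
    config["use_cache"] = bool(enable)
    with open(config_path, "w", encoding="utf-8") as f:
        json.dump(config, f, indent=2, ensure_ascii=False)
    return config_path


if __name__ == "__main__":
    # 例如:先开启 dit cache,再运行 inference_cogview3plus.py 复测平均耗时
    print(set_dit_cache("/data/CogView3B", True))
```

切换后无需其他改动,按上文的推理命令重新执行,即可分别得到开启/关闭 dit cache 的两组结果。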