From 02dab518db419a417d583690221fcdcbe7d5a1ab Mon Sep 17 00:00:00 2001 From: jiangmengyu Date: Tue, 24 Dec 2024 19:53:04 +0800 Subject: [PATCH 01/91] =?UTF-8?q?CogView3plus=E9=80=82=E9=85=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../built-in/foundation/cogview3/README.md | 314 ++++++ .../cogview3/cogview3plus/__init__.py | 18 + .../cogview3/cogview3plus/layers/__init__.py | 2 + .../cogview3plus/layers/embeddings.py | 484 +++++++++ .../cogview3plus/layers/normalization.py | 166 +++ .../cogview3/cogview3plus/models/__init__.py | 1 + .../cogview3/cogview3plus/models/attention.py | 86 ++ .../models/attention_processor.py | 82 ++ .../models/transformer_cogview3plus.py | 383 +++++++ .../cogview3plus/pipeline/__init__.py | 1 + .../pipeline/pipeline_cogview3plus.py | 675 ++++++++++++ .../cogview3plus/pipeline/pipeline_output.py | 21 + .../cogview3plus/schedulers/__init__.py | 2 + .../schedulers/scheduling_ddim_cogvideox.py | 452 ++++++++ .../schedulers/scheduling_dpm_cogvideox.py | 489 +++++++++ .../schedulers/scheduling_utils.py | 193 ++++ .../cogview3/cogview3plus/vae/__init__.py | 1 + .../cogview3plus/vae/autoencoder_kl.py | 571 ++++++++++ .../cogview3/cogview3plus/vae/vae.py | 995 ++++++++++++++++++ .../cogview3/inference_cogview3plus.py | 99 ++ .../foundation/cogview3/requirents.txt | 15 + 21 files changed, 5050 insertions(+) create mode 100644 MindIE/MindIE-Torch/built-in/foundation/cogview3/README.md create mode 100644 MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/__init__.py create mode 100644 MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/layers/__init__.py create mode 100644 MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/layers/embeddings.py create mode 100644 MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/layers/normalization.py create mode 100644 MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/__init__.py create mode 100644 MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/attention.py create mode 100644 MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/attention_processor.py create mode 100644 MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/transformer_cogview3plus.py create mode 100644 MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/__init__.py create mode 100644 MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py create mode 100644 MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_output.py create mode 100644 MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/schedulers/__init__.py create mode 100644 MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/schedulers/scheduling_ddim_cogvideox.py create mode 100644 MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/schedulers/scheduling_dpm_cogvideox.py create mode 100644 MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/schedulers/scheduling_utils.py create mode 100644 MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/vae/__init__.py create mode 100644 MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/vae/autoencoder_kl.py create mode 100644 MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/vae/vae.py create mode 100644 MindIE/MindIE-Torch/built-in/foundation/cogview3/inference_cogview3plus.py create mode 100644 
MindIE/MindIE-Torch/built-in/foundation/cogview3/requirents.txt diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/README.md b/MindIE/MindIE-Torch/built-in/foundation/cogview3/README.md new file mode 100644 index 0000000000..028c765d30 --- /dev/null +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/README.md @@ -0,0 +1,314 @@ +## 一、准备运行环境 + + **表 1** 版本配套表 + + | 配套 | 版本 | 环境准备指导 | + | ----- | ----- |-----| + | Python | 3.10.2 | - | + | torch | 2.1.0 | - | + +### 1.1 获取CANN&MindIE安装包&环境准备 +- [800I A2](https://www.hiascend.com/developer/download/community/result?module=pt+ie+cann&product=4&model=32) +- [Duo卡](https://www.hiascend.com/developer/download/community/result?module=pt+ie+cann&product=2&model=17) +- [环境准备指导](https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/80RC2alpha002/softwareinst/instg/instg_0001.html) + +### 1.2 CANN安装 +```shell +# 增加软件包可执行权限,{version}表示软件版本号,{arch}表示CPU架构,{soc}表示昇腾AI处理器的版本。 +chmod +x ./Ascend-cann-toolkit_{version}_linux-{arch}.run +chmod +x ./Ascend-cann-kernels-{soc}_{version}_linux.run +# 校验软件包安装文件的一致性和完整性 +./Ascend-cann-toolkit_{version}_linux-{arch}.run --check +./Ascend-cann-kernels-{soc}_{version}_linux.run --check +# 安装 +./Ascend-cann-toolkit_{version}_linux-{arch}.run --install +./Ascend-cann-kernels-{soc}_{version}_linux.run --install + +# 设置环境变量 +source /usr/local/Ascend/ascend-toolkit/set_env.sh +``` + +### 1.3 MindIE安装 +```shell +# 增加软件包可执行权限,{version}表示软件版本号,{arch}表示CPU架构。 +chmod +x ./Ascend-mindie_${version}_linux-${arch}.run +./Ascend-mindie_${version}_linux-${arch}.run --check + +# 方式一:默认路径安装 +./Ascend-mindie_${version}_linux-${arch}.run --install +# 设置环境变量 +cd /usr/local/Ascend/mindie && source set_env.sh + +# 方式二:指定路径安装 +./Ascend-mindie_${version}_linux-${arch}.run --install-path=${AieInstallPath} +# 设置环境变量 +cd ${AieInstallPath}/mindie && source set_env.sh +``` + +### 1.4 Torch_npu安装 +安装pytorch框架 版本2.1.0 +[安装包下载](https://download.pytorch.org/whl/cpu/torch/) + +使用pip安装 +```shell +# {version}表示软件版本号,{arch}表示CPU架构。 +pip install torch-${version}-cp310-cp310-linux_${arch}.whl +``` +下载 pytorch_v{pytorchversion}_py{pythonversion}.tar.gz +```shell +tar -xzvf pytorch_v{pytorchversion}_py{pythonversion}.tar.gz +# 解压后,会有whl包 +pip install torch_npu-{pytorchversion}.xxxx.{arch}.whl +``` +## 二、下载本仓库 + +### 2.1 下载到本地 +```shell + git clone https://gitee.com/ascend/ModelZoo-PyTorch.git +``` + +## 三、HunyuanDiT使用 + +### 3.1 权重及配置文件说明 +1. text_encoder权重链接: +```shell +https://huggingface.co/Tencent-Hunyuan/HunyuanDiT-v1.2-Diffusers/tree/main/text_encoder +``` +2. text_encoder_2权重链接: +```shell +https://huggingface.co/Tencent-Hunyuan/HunyuanDiT-v1.2-Diffusers/tree/main/text_encoder_2 +``` +3. tokenizer权重链接: +```shell +https://huggingface.co/Tencent-Hunyuan/HunyuanDiT-v1.2-Diffusers/tree/main/tokenizer +``` +4. tokenizer_2权重链接: +```shell +https://huggingface.co/Tencent-Hunyuan/HunyuanDiT-v1.2-Diffusers/tree/main/tokenizer_2 +``` +5. transformer权重链接: +```shell +https://huggingface.co/Tencent-Hunyuan/HunyuanDiT-v1.2/tree/main/t2i/model +``` +- 修改该权重的config.json +```shell +{ + "architectures": [ + "HunyuanDiT2DModel" + ], + "input_size": [ + null, + null + ], + "patch_size": 2, + "in_channels": 4, + "hidden_size": 1408, + "depth": 40, + "num_heads": 16, + "mlp_ratio": 4.3637, + "text_states_dim": 1024, + "text_states_dim_t5": 2048, + "text_len": 77, + "text_len_t5": 256 +} +``` +6. 
vae权重链接: +```shell +https://huggingface.co/Tencent-Hunyuan/HunyuanDiT-v1.2-Diffusers/tree/main/vae +``` +- 修改该权重的config.json +```shell +{ + "architectures": [ + "AutoencoderKL" + ], + "in_channels": 3, + "out_channels": 3, + "down_block_types": [ + "DownEncoderBlock2D", + "DownEncoderBlock2D", + "DownEncoderBlock2D", + "DownEncoderBlock2D" + ], + "up_block_types": [ + "UpDecoderBlock2D", + "UpDecoderBlock2D", + "UpDecoderBlock2D", + "UpDecoderBlock2D" + ], + "block_out_channels": [ + 128, + 256, + 512, + 512 + ], + "layers_per_block": 2, + "act_fn": "silu", + "latent_channels": 4, + "norm_num_groups": 32, + "sample_size": 512, + "scaling_factor": 0.13025, + "shift_factor": null, + "latents_mean": null, + "latents_std": null, + "force_upcast": false, + "use_quant_conv": true, + "use_post_quant_conv": true +} +``` +7. scheduler: +- 新增scheduler_config.json配置文件, 内容如下所示: +```shell +{ + "_class_name": "DDPMScheduler", + "_mindiesd_version": "1.0.0", + "steps_offset": 1, + "beta_start": 0.00085, + "beta_end": 0.02, + "num_train_timesteps": 1000 +} +``` +8. 新增model_index.json +将以上步骤下载的权重放在同一目录下, 并新增model_index.json文件, 该文件内容如下所示 +```shell +{ + "_class_name": "HunyuanDiTPipeline", + "_mindiesd_version": "1.0.RC3", + "scheduler": [ + "mindiesd", + "DDPMScheduler" + ], + "text_encoder": [ + "transformers", + "BertModel" + ], + "text_encoder_2": [ + "transformers", + "T5EncoderModel" + ], + "tokenizer": [ + "transformers", + "BertTokenizer" + ], + "tokenizer_2": [ + "transformers", + "T5Tokenizer" + ], + "transformer": [ + "mindiesd", + "HunyuanDiT2DModel" + ], + "vae": [ + "mindiesd", + "AutoencoderKL" + ] +} +``` +9. 各模型的配置文件、权重文件的层级样例如下所示。 +```commandline +|----hunyuandit +| |---- model_index.json +| |---- scheduler +| | |---- scheduler_config.json +| |---- text_encoder +| | |---- config.json +| | |---- 模型权重 +| |---- text_encoder_2 +| | |---- config.json +| | |---- 模型权重 +| |---- tokenizer +| | |---- config.json +| | |---- 模型权重 +| |---- tokenizer_2 +| | |---- config.json +| | |---- 模型权重 +| |---- transformer +| | |---- config.json +| | |---- 模型权重 +| |---- vae +| | |---- config.json +| | |---- 模型权重 +``` + +### 3.2 单卡单prompt功能测试 +设置权重路径 +```shell +path = 'ckpts/hydit' +``` +执行命令: +```shell +python inference_hydit.py \ + --path ${path} \ + --device_id 0 \ + --prompt "青花瓷风格,一只小狗" \ + --input_size (1024, 1024) \ + --seed 42 \ + --infer_steps 25 +``` +参数说明: +- path:权重路径,包含scheduler、text_encoder、text_encoder_2、tokenizer、 tokenizer_2、transformer、vae,七个模型的配置文件及权重。 +- device_id:推理设备ID。 +- prompt:用于图像生成的文字描述提示。 +- input_size:需要生成的图像尺寸。 +- seed:设置随机种子,默认值为42。 +- infer_steps:推理迭代步数。 + +### 3.3 单卡多prompts进行性能/精度测试 +设置权重路径 +```shell +path = 'ckpts/hydit' +``` +执行命令: +```shell +python inference_hydit.py \ + --path ${path} \ + --device_id 0 \ + --test_acc \ + --prompt_list "prompts/example_prompts.txt" \ + --input_size (1024, 1024) \ + --seed 42 \ + --infer_steps 25 +``` +参数说明: +- path:权重路径,包含scheduler、text_encoder、text_encoder_2、tokenizer、 tokenizer_2、transformer、vae,七个模型的配置文件及权重。 +- device_id:推理设备ID。 +- test_acc:使用 --test_acc 开启全量图像生成,用于性能/精度测试。单prompt功能测试时,不开启该参数。 +- prompt_list:用于图像生成的文字描述提示的列表文件路径。 +- input_size:需要生成的图像尺寸。 +- seed:设置随机种子,默认值为42。 +- infer_steps:推理迭代步数。 + +### 3.4 用LoRA进行测试 +设置权重路径 +```shell +path = 'ckpts/hydit' +``` +LoRA权重链接: +```shell +https://huggingface.co/Tencent-Hunyuan/HYDiT-LoRA/tree/main +``` +设置LoRA权重路径 +```shell +lora_path = 'ckpts/lora' +``` +执行命令: +```shell +python inference_hydit.py \ + --path ${path} \ + --device_id 0 \ + --prompt "青花瓷风格,一只小狗" \ + --input_size (1024, 1024) \ + 
--seed 42 \ + --infer_steps 25 + --use_lora \ + --lora_ckpt ${lora_path} +``` +参数说明: +- path:权重路径,包含scheduler、text_encoder、text_encoder_2、tokenizer、 tokenizer_2、transformer、vae,七个模型的配置文件及权重。 +- device_id:推理设备ID。 +- prompt:用于图像生成的文字描述提示。 +- input_size:需要生成的图像尺寸。 +- seed:设置随机种子,默认值为42。 +- infer_steps:推理迭代步数。 +- use_lora:使用 --use_lora 开启LoRA风格化切换。 +- lora_ckpt:LoRA权重路径。 \ No newline at end of file diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/__init__.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/__init__.py new file mode 100644 index 0000000000..acbd223eb6 --- /dev/null +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/__init__.py @@ -0,0 +1,18 @@ +#!/usr/bin/env python +# coding=utf-8 +# Copyright 2024 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from .pipeline import CogView3PlusPipeline diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/layers/__init__.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/layers/__init__.py new file mode 100644 index 0000000000..c3e7c569e2 --- /dev/null +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/layers/__init__.py @@ -0,0 +1,2 @@ +from .normalization import CogView3PlusAdaLayerNormZeroTextImage, AdaLayerNormContinuous +from .embeddings import CogView3CombinedTimestepSizeEmbeddings, CogView3PlusPatchEmbed \ No newline at end of file diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/layers/embeddings.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/layers/embeddings.py new file mode 100644 index 0000000000..445ad8245a --- /dev/null +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/layers/embeddings.py @@ -0,0 +1,484 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import math +from typing import Optional + +import numpy as np +import torch +from torch import nn + +from diffusers.utils import deprecate +from diffusers.models.activations import FP32SiLU, get_activation + + +def get_timestep_embedding( + timesteps: torch.Tensor, + embedding_dim: int, + flip_sin_to_cos: bool = False, + downscale_freq_shift: float = 1, + scale: float = 1, + max_period: int = 10000, +): + """ + This matches the implementation in Denoising Diffusion Probabilistic Models: Create sinusoidal timestep embeddings. + + Args + timesteps (torch.Tensor): + a 1-D Tensor of N indices, one per batch element. 
These may be fractional. + embedding_dim (int): + the dimension of the output. + flip_sin_to_cos (bool): + Whether the embedding order should be `cos, sin` (if True) or `sin, cos` (if False) + downscale_freq_shift (float): + Controls the delta between frequencies between dimensions + scale (float): + Scaling factor applied to the embeddings. + max_period (int): + Controls the maximum frequency of the embeddings + Returns + torch.Tensor: an [N x dim] Tensor of positional embeddings. + """ + assert len(timesteps.shape) == 1, "Timesteps should be a 1d-array" + + half_dim = embedding_dim // 2 + exponent = -math.log(max_period) * torch.arange( + start=0, end=half_dim, dtype=torch.float32, device=timesteps.device + ) + exponent = exponent / (half_dim - downscale_freq_shift) + + emb = torch.exp(exponent) + emb = timesteps[:, None].float() * emb[None, :] + + # scale embeddings + emb = scale * emb + + # concat sine and cosine embeddings + emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=-1) + + # flip sine and cosine embeddings + if flip_sin_to_cos: + emb = torch.cat([emb[:, half_dim:], emb[:, :half_dim]], dim=-1) + + # zero pad + if embedding_dim % 2 == 1: + emb = torch.nn.functional.pad(emb, (0, 1, 0, 0)) + return emb + + +def get_2d_sincos_pos_embed( + embed_dim, + grid_size, + cls_token=False, + extra_tokens=0, + interpolation_scale=1.0, + base_size=16, + device: Optional[torch.device] = None, + output_type: str = "np", +): + """ + Creates 2D sinusoidal positional embeddings. + + Args: + embed_dim (`int`): + The embedding dimension. + grid_size (`int`): + The size of the grid height and width. + cls_token (`bool`, defaults to `False`): + Whether or not to add a classification token. + extra_tokens (`int`, defaults to `0`): + The number of extra tokens to add. + interpolation_scale (`float`, defaults to `1.0`): + The scale of the interpolation. + + Returns: + pos_embed (`torch.Tensor`): + Shape is either `[grid_size * grid_size, embed_dim]` if not using cls_token, or `[1 + grid_size*grid_size, + embed_dim]` if using cls_token + """ + if output_type == "np": + deprecation_message = ( + "`get_2d_sincos_pos_embed` uses `torch` and supports `device`." + " `from_numpy` is no longer required." + " Pass `output_type='pt' to use the new version now." + ) + deprecate("output_type=='np'", "0.33.0", deprecation_message, standard_warn=False) + return get_2d_sincos_pos_embed_np( + embed_dim=embed_dim, + grid_size=grid_size, + cls_token=cls_token, + extra_tokens=extra_tokens, + interpolation_scale=interpolation_scale, + base_size=base_size, + ) + if isinstance(grid_size, int): + grid_size = (grid_size, grid_size) + + grid_h = ( + torch.arange(grid_size[0], device=device, dtype=torch.float32) + / (grid_size[0] / base_size) + / interpolation_scale + ) + grid_w = ( + torch.arange(grid_size[1], device=device, dtype=torch.float32) + / (grid_size[1] / base_size) + / interpolation_scale + ) + grid = torch.meshgrid(grid_w, grid_h, indexing="xy") # here w goes first + grid = torch.stack(grid, dim=0) + + grid = grid.reshape([2, 1, grid_size[1], grid_size[0]]) + pos_embed = get_2d_sincos_pos_embed_from_grid(embed_dim, grid, output_type=output_type) + if cls_token and extra_tokens > 0: + pos_embed = torch.concat([torch.zeros([extra_tokens, embed_dim]), pos_embed], dim=0) + return pos_embed + + +def get_2d_sincos_pos_embed_from_grid(embed_dim, grid, output_type="np"): + r""" + This function generates 2D sinusoidal positional embeddings from a grid. + + Args: + embed_dim (`int`): The embedding dimension. 
+ grid (`torch.Tensor`): Grid of positions with shape `(H * W,)`. + + Returns: + `torch.Tensor`: The 2D sinusoidal positional embeddings with shape `(H * W, embed_dim)` + """ + if output_type == "np": + deprecation_message = ( + "`get_2d_sincos_pos_embed_from_grid` uses `torch` and supports `device`." + " `from_numpy` is no longer required." + " Pass `output_type='pt' to use the new version now." + ) + deprecate("output_type=='np'", "0.33.0", deprecation_message, standard_warn=False) + return get_2d_sincos_pos_embed_from_grid_np( + embed_dim=embed_dim, + grid=grid, + ) + if embed_dim % 2 != 0: + raise ValueError("embed_dim must be divisible by 2") + + # use half of dimensions to encode grid_h + emb_h = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[0], output_type=output_type) # (H*W, D/2) + emb_w = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[1], output_type=output_type) # (H*W, D/2) + + emb = torch.concat([emb_h, emb_w], dim=1) # (H*W, D) + return emb + + +def get_1d_sincos_pos_embed_from_grid(embed_dim, pos, output_type="np"): + """ + This function generates 1D positional embeddings from a grid. + + Args: + embed_dim (`int`): The embedding dimension `D` + pos (`torch.Tensor`): 1D tensor of positions with shape `(M,)` + + Returns: + `torch.Tensor`: Sinusoidal positional embeddings of shape `(M, D)`. + """ + if output_type == "np": + deprecation_message = ( + "`get_1d_sincos_pos_embed_from_grid` uses `torch` and supports `device`." + " `from_numpy` is no longer required." + " Pass `output_type='pt' to use the new version now." + ) + deprecate("output_type=='np'", "0.33.0", deprecation_message, standard_warn=False) + return get_1d_sincos_pos_embed_from_grid_np(embed_dim=embed_dim, pos=pos) + if embed_dim % 2 != 0: + raise ValueError("embed_dim must be divisible by 2") + + omega = torch.arange(embed_dim // 2, device=pos.device, dtype=torch.float64) + omega /= embed_dim / 2.0 + omega = 1.0 / 10000**omega # (D/2,) + + pos = pos.reshape(-1) # (M,) + out = torch.outer(pos, omega) # (M, D/2), outer product + + emb_sin = torch.sin(out) # (M, D/2) + emb_cos = torch.cos(out) # (M, D/2) + + emb = torch.concat([emb_sin, emb_cos], dim=1) # (M, D) + return emb + + +def get_2d_sincos_pos_embed_np( + embed_dim, grid_size, cls_token=False, extra_tokens=0, interpolation_scale=1.0, base_size=16 +): + """ + Creates 2D sinusoidal positional embeddings. + + Args: + embed_dim (`int`): + The embedding dimension. + grid_size (`int`): + The size of the grid height and width. + cls_token (`bool`, defaults to `False`): + Whether or not to add a classification token. + extra_tokens (`int`, defaults to `0`): + The number of extra tokens to add. + interpolation_scale (`float`, defaults to `1.0`): + The scale of the interpolation. 
+ + Returns: + pos_embed (`np.ndarray`): + Shape is either `[grid_size * grid_size, embed_dim]` if not using cls_token, or `[1 + grid_size*grid_size, + embed_dim]` if using cls_token + """ + if isinstance(grid_size, int): + grid_size = (grid_size, grid_size) + + grid_h = np.arange(grid_size[0], dtype=np.float32) / (grid_size[0] / base_size) / interpolation_scale + grid_w = np.arange(grid_size[1], dtype=np.float32) / (grid_size[1] / base_size) / interpolation_scale + grid = np.meshgrid(grid_w, grid_h) # here w goes first + grid = np.stack(grid, axis=0) + + grid = grid.reshape([2, 1, grid_size[1], grid_size[0]]) + pos_embed = get_2d_sincos_pos_embed_from_grid_np(embed_dim, grid) + if cls_token and extra_tokens > 0: + pos_embed = np.concatenate([np.zeros([extra_tokens, embed_dim]), pos_embed], axis=0) + return pos_embed + + +def get_2d_sincos_pos_embed_from_grid_np(embed_dim, grid): + r""" + This function generates 2D sinusoidal positional embeddings from a grid. + + Args: + embed_dim (`int`): The embedding dimension. + grid (`np.ndarray`): Grid of positions with shape `(H * W,)`. + + Returns: + `np.ndarray`: The 2D sinusoidal positional embeddings with shape `(H * W, embed_dim)` + """ + if embed_dim % 2 != 0: + raise ValueError("embed_dim must be divisible by 2") + + # use half of dimensions to encode grid_h + emb_h = get_1d_sincos_pos_embed_from_grid_np(embed_dim // 2, grid[0]) # (H*W, D/2) + emb_w = get_1d_sincos_pos_embed_from_grid_np(embed_dim // 2, grid[1]) # (H*W, D/2) + + emb = np.concatenate([emb_h, emb_w], axis=1) # (H*W, D) + return emb + + +def get_1d_sincos_pos_embed_from_grid_np(embed_dim, pos): + """ + This function generates 1D positional embeddings from a grid. + + Args: + embed_dim (`int`): The embedding dimension `D` + pos (`numpy.ndarray`): 1D tensor of positions with shape `(M,)` + + Returns: + `numpy.ndarray`: Sinusoidal positional embeddings of shape `(M, D)`. 
+ """ + if embed_dim % 2 != 0: + raise ValueError("embed_dim must be divisible by 2") + + omega = np.arange(embed_dim // 2, dtype=np.float64) + omega /= embed_dim / 2.0 + omega = 1.0 / 10000**omega # (D/2,) + + pos = pos.reshape(-1) # (M,) + out = np.einsum("m,d->md", pos, omega) # (M, D/2), outer product + + emb_sin = np.sin(out) # (M, D/2) + emb_cos = np.cos(out) # (M, D/2) + + emb = np.concatenate([emb_sin, emb_cos], axis=1) # (M, D) + return emb + + +class Timesteps(nn.Module): + def __init__(self, num_channels: int, flip_sin_to_cos: bool, downscale_freq_shift: float, scale: int = 1): + super().__init__() + self.num_channels = num_channels + self.flip_sin_to_cos = flip_sin_to_cos + self.downscale_freq_shift = downscale_freq_shift + self.scale = scale + + def forward(self, timesteps): + t_emb = get_timestep_embedding( + timesteps, + self.num_channels, + flip_sin_to_cos=self.flip_sin_to_cos, + downscale_freq_shift=self.downscale_freq_shift, + scale=self.scale, + ) + return t_emb + + +class TimestepEmbedding(nn.Module): + def __init__( + self, + in_channels: int, + time_embed_dim: int, + act_fn: str = "silu", + out_dim: int = None, + post_act_fn: Optional[str] = None, + cond_proj_dim=None, + sample_proj_bias=True, + ): + super().__init__() + + self.linear_1 = nn.Linear(in_channels, time_embed_dim, sample_proj_bias) + + if cond_proj_dim is not None: + self.cond_proj = nn.Linear(cond_proj_dim, in_channels, bias=False) + else: + self.cond_proj = None + + self.act = get_activation(act_fn) + + if out_dim is not None: + time_embed_dim_out = out_dim + else: + time_embed_dim_out = time_embed_dim + self.linear_2 = nn.Linear(time_embed_dim, time_embed_dim_out, sample_proj_bias) + + if post_act_fn is None: + self.post_act = None + else: + self.post_act = get_activation(post_act_fn) + + def forward(self, sample, condition=None): + if condition is not None: + sample = sample + self.cond_proj(condition) + sample = self.linear_1(sample) + + if self.act is not None: + sample = self.act(sample) + + sample = self.linear_2(sample) + + if self.post_act is not None: + sample = self.post_act(sample) + return sample + + +class PixArtAlphaTextProjection(nn.Module): + """ + Projects caption embeddings. Also handles dropout for classifier-free guidance. 
+ """ + + def __init__(self, in_features, hidden_size, out_features=None, act_fn="gelu_tanh"): + super().__init__() + if out_features is None: + out_features = hidden_size + self.linear_1 = nn.Linear(in_features=in_features, out_features=hidden_size, bias=True) + if act_fn == "gelu_tanh": + self.act_1 = nn.GELU(approximate="tanh") + elif act_fn == "silu": + self.act_1 = nn.SiLU() + elif act_fn == "silu_fp32": + self.act_1 = FP32SiLU() + else: + raise ValueError(f"Unknown activation function: {act_fn}") + self.linear_2 = nn.Linear(in_features=hidden_size, out_features=out_features, bias=True) + + def forward(self, caption): + hidden_states = self.linear_1(caption) + hidden_states = self.act_1(hidden_states) + hidden_states = self.linear_2(hidden_states) + return hidden_states + + +class CogView3CombinedTimestepSizeEmbeddings(nn.Module): + def __init__(self, embedding_dim: int, condition_dim: int, pooled_projection_dim: int, timesteps_dim: int = 256): + super().__init__() + + self.time_proj = Timesteps(num_channels=timesteps_dim, flip_sin_to_cos=True, downscale_freq_shift=0) + self.condition_proj = Timesteps(num_channels=condition_dim, flip_sin_to_cos=True, downscale_freq_shift=0) + self.timestep_embedder = TimestepEmbedding(in_channels=timesteps_dim, time_embed_dim=embedding_dim) + self.condition_embedder = PixArtAlphaTextProjection(pooled_projection_dim, embedding_dim, act_fn="silu") + + def forward( + self, + timestep: torch.Tensor, + original_size: torch.Tensor, + target_size: torch.Tensor, + crop_coords: torch.Tensor, + hidden_dtype: torch.dtype, + ) -> torch.Tensor: + timesteps_proj = self.time_proj(timestep) + + original_size_proj = self.condition_proj(original_size.flatten()).view(original_size.size(0), -1) + crop_coords_proj = self.condition_proj(crop_coords.flatten()).view(crop_coords.size(0), -1) + target_size_proj = self.condition_proj(target_size.flatten()).view(target_size.size(0), -1) + + # (B, 3 * condition_dim) + condition_proj = torch.cat([original_size_proj, crop_coords_proj, target_size_proj], dim=1) + + timesteps_emb = self.timestep_embedder(timesteps_proj.to(dtype=hidden_dtype)) # (B, embedding_dim) + condition_emb = self.condition_embedder(condition_proj.to(dtype=hidden_dtype)) # (B, embedding_dim) + + conditioning = timesteps_emb + condition_emb + return conditioning + + +class CogView3PlusPatchEmbed(nn.Module): + def __init__( + self, + in_channels: int = 16, + hidden_size: int = 2560, + patch_size: int = 2, + text_hidden_size: int = 4096, + pos_embed_max_size: int = 128, + ): + super().__init__() + self.in_channels = in_channels + self.hidden_size = hidden_size + self.patch_size = patch_size + self.text_hidden_size = text_hidden_size + self.pos_embed_max_size = pos_embed_max_size + # Linear projection for image patches + self.proj = nn.Linear(in_channels * patch_size**2, hidden_size) + + # Linear projection for text embeddings + self.text_proj = nn.Linear(text_hidden_size, hidden_size) + + pos_embed = get_2d_sincos_pos_embed( + hidden_size, pos_embed_max_size, base_size=pos_embed_max_size, output_type="pt" + ) + pos_embed = pos_embed.reshape(pos_embed_max_size, pos_embed_max_size, hidden_size) + self.register_buffer("pos_embed", pos_embed.float(), persistent=False) + + def forward(self, hidden_states: torch.Tensor, encoder_hidden_states: torch.Tensor) -> torch.Tensor: + batch_size, channel, height, width = hidden_states.shape + + if height % self.patch_size != 0 or width % self.patch_size != 0: + raise ValueError("Height and width must be divisible by patch size") + 
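+        # Patchify: (B, C, H, W) -> (B, (H/p) * (W/p), C * p * p) with p = patch_size; the view/permute/view
+        # below splits the spatial dims into p x p patches and flattens each patch into a token.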
+ height = height // self.patch_size + width = width // self.patch_size + hidden_states = hidden_states.view(batch_size, channel, height, self.patch_size, width, self.patch_size) + hidden_states = hidden_states.permute(0, 2, 4, 1, 3, 5).contiguous() + hidden_states = hidden_states.view(batch_size, height * width, channel * self.patch_size * self.patch_size) + + # Project the patches + hidden_states = self.proj(hidden_states) + encoder_hidden_states = self.text_proj(encoder_hidden_states) + hidden_states = torch.cat([encoder_hidden_states, hidden_states], dim=1) + + # Calculate text_length + text_length = encoder_hidden_states.shape[1] + + image_pos_embed = self.pos_embed[:height, :width].reshape(height * width, -1) + text_pos_embed = torch.zeros( + (text_length, self.hidden_size), dtype=image_pos_embed.dtype, device=image_pos_embed.device + ) + pos_embed = torch.cat([text_pos_embed, image_pos_embed], dim=0)[None, ...] + + return (hidden_states + pos_embed).to(hidden_states.dtype) \ No newline at end of file diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/layers/normalization.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/layers/normalization.py new file mode 100644 index 0000000000..b2576d26f5 --- /dev/null +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/layers/normalization.py @@ -0,0 +1,166 @@ +# coding=utf-8 +# Copyright 2024 HuggingFace Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import numbers +from typing import Optional, Tuple + +import torch +import torch.nn as nn +import torch.nn.functional as F + +from diffusers.utils import is_torch_version + + +if is_torch_version(">=", "2.1.0"): + LayerNorm = nn.LayerNorm +else: + # Has optional bias parameter compared to torch layer norm + # TODO: replace with torch layernorm once min required torch version >= 2.1 + class LayerNorm(nn.Module): + def __init__(self, dim, eps: float = 1e-5, elementwise_affine: bool = True, bias: bool = True): + super().__init__() + + self.eps = eps + + if isinstance(dim, numbers.Integral): + dim = (dim,) + + self.dim = torch.Size(dim) + + if elementwise_affine: + self.weight = nn.Parameter(torch.ones(dim)) + self.bias = nn.Parameter(torch.zeros(dim)) if bias else None + else: + self.weight = None + self.bias = None + + def forward(self, input): + return F.layer_norm(input, self.dim, self.weight, self.bias, self.eps) + + +class RMSNorm(nn.Module): + def __init__(self, dim, eps: float, elementwise_affine: bool = True, bias: bool = False): + super().__init__() + + self.eps = eps + self.elementwise_affine = elementwise_affine + + if isinstance(dim, numbers.Integral): + dim = (dim,) + + self.dim = torch.Size(dim) + + self.weight = None + self.bias = None + + if elementwise_affine: + self.weight = nn.Parameter(torch.ones(dim)) + if bias: + self.bias = nn.Parameter(torch.zeros(dim)) + + def forward(self, hidden_states): + input_dtype = hidden_states.dtype + variance = hidden_states.to(torch.float32).pow(2).mean(-1, keepdim=True) + hidden_states = hidden_states * torch.rsqrt(variance + self.eps) + + if self.weight is not None: + # convert into half-precision if necessary + if self.weight.dtype in [torch.float16, torch.bfloat16]: + hidden_states = hidden_states.to(self.weight.dtype) + hidden_states = hidden_states * self.weight + if self.bias is not None: + hidden_states = hidden_states + self.bias + else: + hidden_states = hidden_states.to(input_dtype) + + return hidden_states + + +class CogView3PlusAdaLayerNormZeroTextImage(nn.Module): + r""" + Norm layer adaptive layer norm zero (adaLN-Zero). + + Parameters: + embedding_dim (`int`): The size of each embedding vector. + num_embeddings (`int`): The size of the embeddings dictionary. 
+    """
+
+    def __init__(self, embedding_dim: int, dim: int):
+        super().__init__()
+
+        self.silu = nn.SiLU()
+        self.linear = nn.Linear(embedding_dim, 12 * dim, bias=True)
+        self.norm_x = nn.LayerNorm(dim, elementwise_affine=False, eps=1e-5)
+        self.norm_c = nn.LayerNorm(dim, elementwise_affine=False, eps=1e-5)
+
+    def forward(
+        self,
+        x: torch.Tensor,
+        context: torch.Tensor,
+        emb: Optional[torch.Tensor] = None,
+    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
+        emb = self.linear(self.silu(emb))
+        (
+            shift_msa,
+            scale_msa,
+            gate_msa,
+            shift_mlp,
+            scale_mlp,
+            gate_mlp,
+            c_shift_msa,
+            c_scale_msa,
+            c_gate_msa,
+            c_shift_mlp,
+            c_scale_mlp,
+            c_gate_mlp,
+        ) = emb.chunk(12, dim=1)
+        normed_x = self.norm_x(x)
+        normed_context = self.norm_c(context)
+        x = normed_x * (1 + scale_msa[:, None]) + shift_msa[:, None]
+        context = normed_context * (1 + c_scale_msa[:, None]) + c_shift_msa[:, None]
+        return x, gate_msa, shift_mlp, scale_mlp, gate_mlp, context, c_gate_msa, c_shift_mlp, c_scale_mlp, c_gate_mlp
+
+
+class AdaLayerNormContinuous(nn.Module):
+    def __init__(
+        self,
+        embedding_dim: int,
+        conditioning_embedding_dim: int,
+        # NOTE: It is a bit weird that the norm layer can be configured to have scale and shift parameters
+        # because the output is immediately scaled and shifted by the projected conditioning embeddings.
+        # Note that AdaLayerNorm does not let the norm layer have scale and shift parameters.
+        # However, this is how it was implemented in the original code, and it's rather likely you should
+        # set `elementwise_affine` to False.
+        elementwise_affine=True,
+        eps=1e-5,
+        bias=True,
+        norm_type="layer_norm",
+    ):
+        super().__init__()
+        self.silu = nn.SiLU()
+        self.linear = nn.Linear(conditioning_embedding_dim, embedding_dim * 2, bias=bias)
+        if norm_type == "layer_norm":
+            self.norm = LayerNorm(embedding_dim, eps, elementwise_affine, bias)
+        elif norm_type == "rms_norm":
+            self.norm = RMSNorm(embedding_dim, eps, elementwise_affine)
+        else:
+            raise ValueError(f"unknown norm_type {norm_type}")
+
+    def forward(self, x: torch.Tensor, conditioning_embedding: torch.Tensor) -> torch.Tensor:
+        # convert back to the original dtype in case `conditioning_embedding` is upcasted to float32 (needed for hunyuanDiT)
+        emb = self.linear(self.silu(conditioning_embedding).to(x.dtype))
+        scale, shift = torch.chunk(emb, 2, dim=1)
+        x = self.norm(x) * (1 + scale)[:, None, :] + shift[:, None, :]
+        return x
\ No newline at end of file
diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/__init__.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/__init__.py
new file mode 100644
index 0000000000..06571c58d3
--- /dev/null
+++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/__init__.py
@@ -0,0 +1 @@
+from .transformer_cogview3plus import CogView3PlusTransformer2DModel
\ No newline at end of file
diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/attention.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/attention.py
new file mode 100644
index 0000000000..00aabc9fdd
--- /dev/null
+++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/attention.py
@@ -0,0 +1,86 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import Optional + +import torch +from torch import nn + +from diffusers.utils import deprecate, logging +from diffusers.models.activations import GEGLU, GELU, ApproximateGELU, LinearActivation, SwiGLU + + +logger = logging.get_logger(__name__) + + +class FeedForward(nn.Module): + r""" + A feed-forward layer. + + Parameters: + dim (`int`): The number of channels in the input. + dim_out (`int`, *optional*): The number of channels in the output. If not given, defaults to `dim`. + mult (`int`, *optional*, defaults to 4): The multiplier to use for the hidden dimension. + dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use. + activation_fn (`str`, *optional*, defaults to `"geglu"`): Activation function to be used in feed-forward. + final_dropout (`bool` *optional*, defaults to False): Apply a final dropout. + bias (`bool`, defaults to True): Whether to use a bias in the linear layer. + """ + + def __init__( + self, + dim: int, + dim_out: Optional[int] = None, + mult: int = 4, + dropout: float = 0.0, + activation_fn: str = "geglu", + final_dropout: bool = False, + inner_dim=None, + bias: bool = True, + ): + super().__init__() + if inner_dim is None: + inner_dim = int(dim * mult) + dim_out = dim_out if dim_out is not None else dim + + if activation_fn == "gelu": + act_fn = GELU(dim, inner_dim, bias=bias) + if activation_fn == "gelu-approximate": + act_fn = GELU(dim, inner_dim, approximate="tanh", bias=bias) + elif activation_fn == "geglu": + act_fn = GEGLU(dim, inner_dim, bias=bias) + elif activation_fn == "geglu-approximate": + act_fn = ApproximateGELU(dim, inner_dim, bias=bias) + elif activation_fn == "swiglu": + act_fn = SwiGLU(dim, inner_dim, bias=bias) + elif activation_fn == "linear-silu": + act_fn = LinearActivation(dim, inner_dim, bias=bias, activation="silu") + + self.net = nn.ModuleList([]) + # project in + self.net.append(act_fn) + # project dropout + self.net.append(nn.Dropout(dropout)) + # project out + self.net.append(nn.Linear(inner_dim, dim_out, bias=bias)) + # FF as used in Vision Transformer, MLP-Mixer, etc. have a final dropout + if final_dropout: + self.net.append(nn.Dropout(dropout)) + + def forward(self, hidden_states: torch.Tensor, *args, **kwargs) -> torch.Tensor: + if len(args) > 0 or kwargs.get("scale", None) is not None: + deprecation_message = "The `scale` argument is deprecated and will be ignored. Please remove it, as passing it will raise an error in the future. `scale` should directly be passed while calling the underlying pipeline component i.e., via `cross_attention_kwargs`." 
+            deprecate("scale", "1.0.0", deprecation_message)
+        for module in self.net:
+            hidden_states = module(hidden_states)
+        return hidden_states
\ No newline at end of file
diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/attention_processor.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/attention_processor.py
new file mode 100644
index 0000000000..6632f7f83f
--- /dev/null
+++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/attention_processor.py
@@ -0,0 +1,82 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Optional
+
+import torch
+import torch.nn.functional as F
+
+from diffusers.models.attention_processor import Attention
+
+
+class CogVideoXAttnProcessor2_0:
+    r"""
+    Processor for implementing scaled dot-product attention for the CogView3 model. Unlike its CogVideoX counterpart,
+    it applies no rotary embedding to the query and key vectors and does not include spatial normalization.
+    """
+
+    def __init__(self):
+        if not hasattr(F, "scaled_dot_product_attention"):
+            raise ImportError("CogVideoXAttnProcessor requires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0.")
+
+    def __call__(
+        self,
+        attn: Attention,
+        hidden_states: torch.Tensor,
+        encoder_hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        text_seq_length = encoder_hidden_states.size(1)
+
+        hidden_states = torch.cat([encoder_hidden_states, hidden_states], dim=1)
+
+        batch_size, sequence_length, _ = (
+            hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
+        )
+
+        if attention_mask is not None:
+            attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
+            attention_mask = attention_mask.view(batch_size, attn.heads, -1, attention_mask.shape[-1])
+
+        query = attn.to_q(hidden_states)
+        key = attn.to_k(hidden_states)
+        value = attn.to_v(hidden_states)
+
+        inner_dim = key.shape[-1]
+        head_dim = inner_dim // attn.heads
+
+        query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+        key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+        value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
+
+        if attn.norm_q is not None:
+            query = attn.norm_q(query)
+        if attn.norm_k is not None:
+            key = attn.norm_k(key)
+
+        hidden_states = F.scaled_dot_product_attention(
+            query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False
+        )
+
+        hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
+
+        # linear proj
+        hidden_states = attn.to_out[0](hidden_states)
+        # dropout
+        hidden_states = attn.to_out[1](hidden_states)
+
+        encoder_hidden_states, hidden_states = hidden_states.split(
+            [text_seq_length, hidden_states.size(1) - text_seq_length], dim=1
+        )
+        return hidden_states, encoder_hidden_states
\ No newline at end of file
diff --git 
a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/transformer_cogview3plus.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/transformer_cogview3plus.py new file mode 100644 index 0000000000..78360f61e9 --- /dev/null +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/transformer_cogview3plus.py @@ -0,0 +1,383 @@ +# Copyright 2024 The CogView team, Tsinghua University & ZhipuAI and The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from typing import Any, Dict, Union + +import torch +import torch.nn as nn + +from diffusers.configuration_utils import ConfigMixin, register_to_config +from diffusers.models.attention_processor import Attention, AttentionProcessor +from diffusers.models.modeling_utils import ModelMixin +from diffusers.utils import is_torch_version, logging +from diffusers.models.modeling_outputs import Transformer2DModelOutput + +from .attention import FeedForward +from .attention_processor import CogVideoXAttnProcessor2_0 +from ..layers import CogView3PlusAdaLayerNormZeroTextImage, AdaLayerNormContinuous +from ..layers import CogView3CombinedTimestepSizeEmbeddings, CogView3PlusPatchEmbed + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + + +class CogView3PlusTransformerBlock(nn.Module): + r""" + Transformer block used in [CogView](https://github.com/THUDM/CogView3) model. + + Args: + dim (`int`): + The number of channels in the input and output. + num_attention_heads (`int`): + The number of heads to use for multi-head attention. + attention_head_dim (`int`): + The number of channels in each head. + time_embed_dim (`int`): + The number of channels in timestep embedding. 
+ """ + + def __init__( + self, + dim: int = 2560, + num_attention_heads: int = 64, + attention_head_dim: int = 40, + time_embed_dim: int = 512, + ): + super().__init__() + + self.norm1 = CogView3PlusAdaLayerNormZeroTextImage(embedding_dim=time_embed_dim, dim=dim) + + self.attn1 = Attention( + query_dim=dim, + heads=num_attention_heads, + dim_head=attention_head_dim, + out_dim=dim, + bias=True, + qk_norm="layer_norm", + elementwise_affine=False, + eps=1e-6, + processor=CogVideoXAttnProcessor2_0(), + ) + + self.norm2 = nn.LayerNorm(dim, elementwise_affine=False, eps=1e-5) + self.norm2_context = nn.LayerNorm(dim, elementwise_affine=False, eps=1e-5) + + self.ff = FeedForward(dim=dim, dim_out=dim, activation_fn="gelu-approximate") + + def forward( + self, + hidden_states: torch.Tensor, + encoder_hidden_states: torch.Tensor, + emb: torch.Tensor, + ) -> torch.Tensor: + text_seq_length = encoder_hidden_states.size(1) + + # norm & modulate + ( + norm_hidden_states, + gate_msa, + shift_mlp, + scale_mlp, + gate_mlp, + norm_encoder_hidden_states, + c_gate_msa, + c_shift_mlp, + c_scale_mlp, + c_gate_mlp, + ) = self.norm1(hidden_states, encoder_hidden_states, emb) + + # attention + attn_hidden_states, attn_encoder_hidden_states = self.attn1( + hidden_states=norm_hidden_states, encoder_hidden_states=norm_encoder_hidden_states + ) + + hidden_states = hidden_states + gate_msa.unsqueeze(1) * attn_hidden_states + encoder_hidden_states = encoder_hidden_states + c_gate_msa.unsqueeze(1) * attn_encoder_hidden_states + + # norm & modulate + norm_hidden_states = self.norm2(hidden_states) + norm_hidden_states = norm_hidden_states * (1 + scale_mlp[:, None]) + shift_mlp[:, None] + + norm_encoder_hidden_states = self.norm2_context(encoder_hidden_states) + norm_encoder_hidden_states = norm_encoder_hidden_states * (1 + c_scale_mlp[:, None]) + c_shift_mlp[:, None] + + # feed-forward + norm_hidden_states = torch.cat([norm_encoder_hidden_states, norm_hidden_states], dim=1) + ff_output = self.ff(norm_hidden_states) + + hidden_states = hidden_states + gate_mlp.unsqueeze(1) * ff_output[:, text_seq_length:] + encoder_hidden_states = encoder_hidden_states + c_gate_mlp.unsqueeze(1) * ff_output[:, :text_seq_length] + + if hidden_states.dtype == torch.float16: + hidden_states = hidden_states.clip(-65504, 65504) + if encoder_hidden_states.dtype == torch.float16: + encoder_hidden_states = encoder_hidden_states.clip(-65504, 65504) + return hidden_states, encoder_hidden_states + + +class CogView3PlusTransformer2DModel(ModelMixin, ConfigMixin): + r""" + The Transformer model introduced in [CogView3: Finer and Faster Text-to-Image Generation via Relay + Diffusion](https://huggingface.co/papers/2403.05121). + + Args: + patch_size (`int`, defaults to `2`): + The size of the patches to use in the patch embedding layer. + in_channels (`int`, defaults to `16`): + The number of channels in the input. + num_layers (`int`, defaults to `30`): + The number of layers of Transformer blocks to use. + attention_head_dim (`int`, defaults to `40`): + The number of channels in each head. + num_attention_heads (`int`, defaults to `64`): + The number of heads to use for multi-head attention. + out_channels (`int`, defaults to `16`): + The number of channels in the output. + text_embed_dim (`int`, defaults to `4096`): + Input dimension of text embeddings from the text encoder. + time_embed_dim (`int`, defaults to `512`): + Output dimension of timestep embeddings. 
+ condition_dim (`int`, defaults to `256`): + The embedding dimension of the input SDXL-style resolution conditions (original_size, target_size, + crop_coords). + pos_embed_max_size (`int`, defaults to `128`): + The maximum resolution of the positional embeddings, from which slices of shape `H x W` are taken and added + to input patched latents, where `H` and `W` are the latent height and width respectively. A value of 128 + means that the maximum supported height and width for image generation is `128 * vae_scale_factor * + patch_size => 128 * 8 * 2 => 2048`. + sample_size (`int`, defaults to `128`): + The base resolution of input latents. If height/width is not provided during generation, this value is used + to determine the resolution as `sample_size * vae_scale_factor => 128 * 8 => 1024` + """ + + _supports_gradient_checkpointing = True + + @register_to_config + def __init__( + self, + patch_size: int = 2, + in_channels: int = 16, + num_layers: int = 30, + attention_head_dim: int = 40, + num_attention_heads: int = 64, + out_channels: int = 16, + text_embed_dim: int = 4096, + time_embed_dim: int = 512, + condition_dim: int = 256, + pos_embed_max_size: int = 128, + sample_size: int = 128, + ): + super().__init__() + self.out_channels = out_channels + self.inner_dim = num_attention_heads * attention_head_dim + + # CogView3 uses 3 additional SDXL-like conditions - original_size, target_size, crop_coords + # Each of these are sincos embeddings of shape 2 * condition_dim + self.pooled_projection_dim = 3 * 2 * condition_dim + + self.patch_embed = CogView3PlusPatchEmbed( + in_channels=in_channels, + hidden_size=self.inner_dim, + patch_size=patch_size, + text_hidden_size=text_embed_dim, + pos_embed_max_size=pos_embed_max_size, + ) + + self.time_condition_embed = CogView3CombinedTimestepSizeEmbeddings( + embedding_dim=time_embed_dim, + condition_dim=condition_dim, + pooled_projection_dim=self.pooled_projection_dim, + timesteps_dim=self.inner_dim, + ) + + self.transformer_blocks = nn.ModuleList( + [ + CogView3PlusTransformerBlock( + dim=self.inner_dim, + num_attention_heads=num_attention_heads, + attention_head_dim=attention_head_dim, + time_embed_dim=time_embed_dim, + ) + for _ in range(num_layers) + ] + ) + + self.norm_out = AdaLayerNormContinuous( + embedding_dim=self.inner_dim, + conditioning_embedding_dim=time_embed_dim, + elementwise_affine=False, + eps=1e-6, + ) + self.proj_out = nn.Linear(self.inner_dim, patch_size * patch_size * self.out_channels, bias=True) + + self.gradient_checkpointing = False + + @property + # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.attn_processors + def attn_processors(self) -> Dict[str, AttentionProcessor]: + r""" + Returns: + `dict` of attention processors: A dictionary containing all attention processors used in the model with + indexed by its weight name. 
+ """ + # set recursively + processors = {} + + def fn_recursive_add_processors(name: str, module: torch.nn.Module, processors: Dict[str, AttentionProcessor]): + if hasattr(module, "get_processor"): + processors[f"{name}.processor"] = module.get_processor() + + for sub_name, child in module.named_children(): + fn_recursive_add_processors(f"{name}.{sub_name}", child, processors) + + return processors + + for name, module in self.named_children(): + fn_recursive_add_processors(name, module, processors) + + return processors + + # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.set_attn_processor + def set_attn_processor(self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]]): + r""" + Sets the attention processor to use to compute attention. + + Parameters: + processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`): + The instantiated processor class or a dictionary of processor classes that will be set as the processor + for **all** `Attention` layers. + + If `processor` is a dict, the key needs to define the path to the corresponding cross attention + processor. This is strongly recommended when setting trainable attention processors. + + """ + count = len(self.attn_processors.keys()) + + if isinstance(processor, dict) and len(processor) != count: + raise ValueError( + f"A dict of processors was passed, but the number of processors {len(processor)} does not match the" + f" number of attention layers: {count}. Please make sure to pass {count} processor classes." + ) + + def fn_recursive_attn_processor(name: str, module: torch.nn.Module, processor): + if hasattr(module, "set_processor"): + if not isinstance(processor, dict): + module.set_processor(processor) + else: + module.set_processor(processor.pop(f"{name}.processor")) + + for sub_name, child in module.named_children(): + fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor) + + for name, module in self.named_children(): + fn_recursive_attn_processor(name, module, processor) + + def _set_gradient_checkpointing(self, module, value=False): + if hasattr(module, "gradient_checkpointing"): + module.gradient_checkpointing = value + + def forward( + self, + hidden_states: torch.Tensor, + encoder_hidden_states: torch.Tensor, + timestep: torch.LongTensor, + original_size: torch.Tensor, + target_size: torch.Tensor, + crop_coords: torch.Tensor, + return_dict: bool = True, + ) -> Union[torch.Tensor, Transformer2DModelOutput]: + """ + The [`CogView3PlusTransformer2DModel`] forward method. + + Args: + hidden_states (`torch.Tensor`): + Input `hidden_states` of shape `(batch size, channel, height, width)`. + encoder_hidden_states (`torch.Tensor`): + Conditional embeddings (embeddings computed from the input conditions such as prompts) of shape + `(batch_size, sequence_len, text_embed_dim)` + timestep (`torch.LongTensor`): + Used to indicate denoising step. + original_size (`torch.Tensor`): + CogView3 uses SDXL-like micro-conditioning for original image size as explained in section 2.2 of + [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). + target_size (`torch.Tensor`): + CogView3 uses SDXL-like micro-conditioning for target image size as explained in section 2.2 of + [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). 
+ crop_coords (`torch.Tensor`): + CogView3 uses SDXL-like micro-conditioning for crop coordinates as explained in section 2.2 of + [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~models.transformer_2d.Transformer2DModelOutput`] instead of a plain + tuple. + + Returns: + `torch.Tensor` or [`~models.transformer_2d.Transformer2DModelOutput`]: + The denoised latents using provided inputs as conditioning. + """ + height, width = hidden_states.shape[-2:] + text_seq_length = encoder_hidden_states.shape[1] + + hidden_states = self.patch_embed( + hidden_states, encoder_hidden_states + ) # takes care of adding positional embeddings too. + emb = self.time_condition_embed(timestep, original_size, target_size, crop_coords, hidden_states.dtype) + + encoder_hidden_states = hidden_states[:, :text_seq_length] + hidden_states = hidden_states[:, text_seq_length:] + + for index_block, block in enumerate(self.transformer_blocks): + if torch.is_grad_enabled() and self.gradient_checkpointing: + + def create_custom_forward(module): + def custom_forward(*inputs): + return module(*inputs) + + return custom_forward + + ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {} + hidden_states, encoder_hidden_states = torch.utils.checkpoint.checkpoint( + create_custom_forward(block), + hidden_states, + encoder_hidden_states, + emb, + **ckpt_kwargs, + ) + else: + hidden_states, encoder_hidden_states = block( + hidden_states=hidden_states, + encoder_hidden_states=encoder_hidden_states, + emb=emb, + ) + + hidden_states = self.norm_out(hidden_states, emb) + hidden_states = self.proj_out(hidden_states) # (batch_size, height*width, patch_size*patch_size*out_channels) + + # unpatchify + patch_size = self.config.patch_size + height = height // patch_size + width = width // patch_size + + hidden_states = hidden_states.reshape( + shape=(hidden_states.shape[0], height, width, self.out_channels, patch_size, patch_size) + ) + hidden_states = torch.einsum("nhwcpq->nchpwq", hidden_states) + output = hidden_states.reshape( + shape=(hidden_states.shape[0], self.out_channels, height * patch_size, width * patch_size) + ) + + if not return_dict: + return (output,) + + return Transformer2DModelOutput(sample=output) \ No newline at end of file diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/__init__.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/__init__.py new file mode 100644 index 0000000000..aea730c2e3 --- /dev/null +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/__init__.py @@ -0,0 +1 @@ +from .pipeline_cogview3plus import CogView3PlusPipeline \ No newline at end of file diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py new file mode 100644 index 0000000000..a78f82a9b2 --- /dev/null +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py @@ -0,0 +1,675 @@ +# Copyright 2024 The CogVideoX team, Tsinghua University & ZhipuAI and The HuggingFace Team. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import inspect +from typing import Callable, Dict, List, Optional, Tuple, Union + +import torch +from transformers import T5EncoderModel, T5Tokenizer + +from diffusers.callbacks import MultiPipelineCallbacks, PipelineCallback +from diffusers.image_processor import VaeImageProcessor +from diffusers.pipelines.pipeline_utils import DiffusionPipeline +from diffusers.utils import logging, replace_example_docstring +from diffusers.utils.torch_utils import randn_tensor + +from ..vae import AutoencoderKL +from ..models import CogView3PlusTransformer2DModel +from ..schedulers import CogVideoXDDIMScheduler, CogVideoXDPMScheduler +from .pipeline_output import CogView3PipelineOutput + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + + +EXAMPLE_DOC_STRING = """ + Examples: + ```python + >>> import torch + >>> from diffusers import CogView3PlusPipeline + + >>> pipe = CogView3PlusPipeline.from_pretrained("THUDM/CogView3-Plus-3B", torch_dtype=torch.bfloat16) + >>> pipe.to("cuda") + + >>> prompt = "A photo of an astronaut riding a horse on mars" + >>> image = pipe(prompt).images[0] + >>> image.save("output.png") + ``` +""" + + +# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps +def retrieve_timesteps( + scheduler, + num_inference_steps: Optional[int] = None, + device: Optional[Union[str, torch.device]] = None, + timesteps: Optional[List[int]] = None, + sigmas: Optional[List[float]] = None, + **kwargs, +): + r""" + Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles + custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`. + + Args: + scheduler (`SchedulerMixin`): + The scheduler to get timesteps from. + num_inference_steps (`int`): + The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps` + must be `None`. + device (`str` or `torch.device`, *optional*): + The device to which the timesteps should be moved to. If `None`, the timesteps are not moved. + timesteps (`List[int]`, *optional*): + Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed, + `num_inference_steps` and `sigmas` must be `None`. + sigmas (`List[float]`, *optional*): + Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed, + `num_inference_steps` and `timesteps` must be `None`. + + Returns: + `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the + second element is the number of inference steps. + """ + if timesteps is not None and sigmas is not None: + raise ValueError("Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values") + if timesteps is not None: + accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys()) + if not accepts_timesteps: + raise ValueError( + f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom" + f" timestep schedules. 
Please check whether you are using the correct scheduler." + ) + scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs) + timesteps = scheduler.timesteps + num_inference_steps = len(timesteps) + elif sigmas is not None: + accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys()) + if not accept_sigmas: + raise ValueError( + f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom" + f" sigmas schedules. Please check whether you are using the correct scheduler." + ) + scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs) + timesteps = scheduler.timesteps + num_inference_steps = len(timesteps) + else: + scheduler.set_timesteps(num_inference_steps, device=device, **kwargs) + timesteps = scheduler.timesteps + return timesteps, num_inference_steps + + +class CogView3PlusPipeline(DiffusionPipeline): + r""" + Pipeline for text-to-image generation using CogView3Plus. + + This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the + library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) + + Args: + vae ([`AutoencoderKL`]): + Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations. + text_encoder ([`T5EncoderModel`]): + Frozen text-encoder. CogView3Plus uses + [T5](https://huggingface.co/docs/transformers/model_doc/t5#transformers.T5EncoderModel); specifically the + [t5-v1_1-xxl](https://huggingface.co/PixArt-alpha/PixArt-alpha/tree/main/t5-v1_1-xxl) variant. + tokenizer (`T5Tokenizer`): + Tokenizer of class + [T5Tokenizer](https://huggingface.co/docs/transformers/model_doc/t5#transformers.T5Tokenizer). + transformer ([`CogView3PlusTransformer2DModel`]): + A text conditioned `CogView3PlusTransformer2DModel` to denoise the encoded image latents. + scheduler ([`SchedulerMixin`]): + A scheduler to be used in combination with `transformer` to denoise the encoded image latents. 
+ """ + + _optional_components = [] + model_cpu_offload_seq = "text_encoder->transformer->vae" + + _callback_tensor_inputs = [ + "latents", + "prompt_embeds", + "negative_prompt_embeds", + ] + + def __init__( + self, + tokenizer: T5Tokenizer, + text_encoder: T5EncoderModel, + vae: AutoencoderKL, + transformer: CogView3PlusTransformer2DModel, + scheduler: Union[CogVideoXDDIMScheduler, CogVideoXDPMScheduler], + ): + super().__init__() + + self.register_modules( + tokenizer=tokenizer, text_encoder=text_encoder, vae=vae, transformer=transformer, scheduler=scheduler + ) + self.vae_scale_factor = ( + 2 ** (len(self.vae.config.block_out_channels) - 1) if hasattr(self, "vae") and self.vae is not None else 8 + ) + + self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor) + + # Copied from diffusers.pipelines.cogvideo.pipeline_cogvideox.CogVideoXPipeline._get_t5_prompt_embeds with num_videos_per_prompt->num_images_per_prompt + def _get_t5_prompt_embeds( + self, + prompt: Union[str, List[str]] = None, + num_images_per_prompt: int = 1, + max_sequence_length: int = 226, + device: Optional[torch.device] = None, + dtype: Optional[torch.dtype] = None, + ): + device = device or self._execution_device + dtype = dtype or self.text_encoder.dtype + + prompt = [prompt] if isinstance(prompt, str) else prompt + batch_size = len(prompt) + + text_inputs = self.tokenizer( + prompt, + padding="max_length", + max_length=max_sequence_length, + truncation=True, + add_special_tokens=True, + return_tensors="pt", + ) + text_input_ids = text_inputs.input_ids + untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids + + if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(text_input_ids, untruncated_ids): + removed_text = self.tokenizer.batch_decode(untruncated_ids[:, max_sequence_length - 1 : -1]) + logger.warning( + "The following part of your input was truncated because `max_sequence_length` is set to " + f" {max_sequence_length} tokens: {removed_text}" + ) + + prompt_embeds = self.text_encoder(text_input_ids.to(device))[0] + prompt_embeds = prompt_embeds.to(dtype=dtype, device=device) + + # duplicate text embeddings for each generation per prompt, using mps friendly method + _, seq_len, _ = prompt_embeds.shape + prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1) + prompt_embeds = prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1) + + return prompt_embeds + + def encode_prompt( + self, + prompt: Union[str, List[str]], + negative_prompt: Optional[Union[str, List[str]]] = None, + do_classifier_free_guidance: bool = True, + num_images_per_prompt: int = 1, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, + max_sequence_length: int = 224, + device: Optional[torch.device] = None, + dtype: Optional[torch.dtype] = None, + ): + r""" + Encodes the prompt into text encoder hidden states. + + Args: + prompt (`str` or `List[str]`, *optional*): + prompt to be encoded + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation. If not defined, one has to pass + `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is + less than `1`). + do_classifier_free_guidance (`bool`, *optional*, defaults to `True`): + Whether to use classifier free guidance or not. + num_images_per_prompt (`int`, *optional*, defaults to 1): + Number of images that should be generated per prompt. 
torch device to place the resulting embeddings on + prompt_embeds (`torch.Tensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not + provided, text embeddings will be generated from `prompt` input argument. + negative_prompt_embeds (`torch.Tensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input + argument. + max_sequence_length (`int`, defaults to `224`): + Maximum sequence length in encoded prompt. Can be set to other values but may lead to poorer results. + device: (`torch.device`, *optional*): + torch device + dtype: (`torch.dtype`, *optional*): + torch dtype + """ + device = device or self._execution_device + + prompt = [prompt] if isinstance(prompt, str) else prompt + if prompt is not None: + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + if prompt_embeds is None: + prompt_embeds = self._get_t5_prompt_embeds( + prompt=prompt, + num_images_per_prompt=num_images_per_prompt, + max_sequence_length=max_sequence_length, + device=device, + dtype=dtype, + ) + + if do_classifier_free_guidance and negative_prompt is None: + negative_prompt_embeds = prompt_embeds.new_zeros(prompt_embeds.shape) + + if do_classifier_free_guidance and negative_prompt_embeds is None: + negative_prompt = batch_size * [negative_prompt] if isinstance(negative_prompt, str) else negative_prompt + + if prompt is not None and type(prompt) is not type(negative_prompt): + raise TypeError( + f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" + f" {type(prompt)}." + ) + elif batch_size != len(negative_prompt): + raise ValueError( + f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" + f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" + " the batch size of `prompt`." + ) + + negative_prompt_embeds = self._get_t5_prompt_embeds( + prompt=negative_prompt, + num_images_per_prompt=num_images_per_prompt, + max_sequence_length=max_sequence_length, + device=device, + dtype=dtype, + ) + + return prompt_embeds, negative_prompt_embeds + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents + def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None): + shape = ( + batch_size, + num_channels_latents, + int(height) // self.vae_scale_factor, + int(width) // self.vae_scale_factor, + ) + if isinstance(generator, list) and len(generator) != batch_size: + raise ValueError( + f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" + f" size of {batch_size}. Make sure the batch size matches the length of the generators." 
+ ) + + if latents is None: + latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype) + else: + latents = latents.to(device) + + # scale the initial noise by the standard deviation required by the scheduler + latents = latents * self.scheduler.init_noise_sigma + return latents + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs + def prepare_extra_step_kwargs(self, generator, eta): + # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature + # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. + # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 + # and should be between [0, 1] + + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) + extra_step_kwargs = {} + if accepts_eta: + extra_step_kwargs["eta"] = eta + + # check if the scheduler accepts generator + accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) + if accepts_generator: + extra_step_kwargs["generator"] = generator + return extra_step_kwargs + + # Copied from diffusers.pipelines.latte.pipeline_latte.LattePipeline.check_inputs + def check_inputs( + self, + prompt, + height, + width, + negative_prompt, + callback_on_step_end_tensor_inputs, + prompt_embeds=None, + negative_prompt_embeds=None, + ): + if height % 8 != 0 or width % 8 != 0: + raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") + + if callback_on_step_end_tensor_inputs is not None and not all( + k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs + ): + raise ValueError( + f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}" + ) + if prompt is not None and prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" + " only forward one of the two." + ) + elif prompt is None and prompt_embeds is None: + raise ValueError( + "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." + ) + elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") + + if prompt is not None and negative_prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `prompt`: {prompt} and `negative_prompt_embeds`:" + f" {negative_prompt_embeds}. Please make sure to only forward one of the two." + ) + + if negative_prompt is not None and negative_prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:" + f" {negative_prompt_embeds}. Please make sure to only forward one of the two." + ) + + if prompt_embeds is not None and negative_prompt_embeds is not None: + if prompt_embeds.shape != negative_prompt_embeds.shape: + raise ValueError( + "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" + f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" + f" {negative_prompt_embeds.shape}." 
+ ) + + @property + def guidance_scale(self): + return self._guidance_scale + + # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) + # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` + # corresponds to doing no classifier free guidance. + @property + def do_classifier_free_guidance(self): + return self._guidance_scale > 1 + + @property + def num_timesteps(self): + return self._num_timesteps + + @property + def interrupt(self): + return self._interrupt + + @torch.no_grad() + @replace_example_docstring(EXAMPLE_DOC_STRING) + def __call__( + self, + prompt: Optional[Union[str, List[str]]] = None, + negative_prompt: Optional[Union[str, List[str]]] = None, + height: Optional[int] = None, + width: Optional[int] = None, + num_inference_steps: int = 50, + timesteps: Optional[List[int]] = None, + guidance_scale: float = 5.0, + num_images_per_prompt: int = 1, + eta: float = 0.0, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + latents: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + original_size: Optional[Tuple[int, int]] = None, + crops_coords_top_left: Tuple[int, int] = (0, 0), + output_type: str = "pil", + return_dict: bool = True, + callback_on_step_end: Optional[ + Union[Callable[[int, int, Dict], None], PipelineCallback, MultiPipelineCallbacks] + ] = None, + callback_on_step_end_tensor_inputs: List[str] = ["latents"], + max_sequence_length: int = 224, + ) -> Union[CogView3PipelineOutput, Tuple]: + """ + Function invoked when calling the pipeline for generation. + + Args: + prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`. + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation. If not defined, one has to pass + `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is + less than `1`). + height (`int`, *optional*, defaults to self.transformer.config.sample_size * self.vae_scale_factor): + The height in pixels of the generated image. If not provided, it is set to 1024. + width (`int`, *optional*, defaults to self.transformer.config.sample_size * self.vae_scale_factor): + The width in pixels of the generated image. If not provided it is set to 1024. + num_inference_steps (`int`, *optional*, defaults to `50`): + The number of denoising steps. More denoising steps usually lead to a higher quality image at the + expense of slower inference. + timesteps (`List[int]`, *optional*): + Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument + in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is + passed will be used. Must be in descending order. + guidance_scale (`float`, *optional*, defaults to `5.0`): + Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). + `guidance_scale` is defined as `w` of equation 2. of [Imagen + Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > + 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, + usually at the expense of lower image quality. 
+            num_images_per_prompt (`int`, *optional*, defaults to `1`):
+                The number of images to generate per prompt.
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
+                to make generation deterministic.
+            latents (`torch.FloatTensor`, *optional*):
+                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor will be generated by sampling using the supplied random `generator`.
+            prompt_embeds (`torch.FloatTensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
+                provided, text embeddings will be generated from `prompt` input argument.
+            negative_prompt_embeds (`torch.FloatTensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
+                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
+                argument.
+            original_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
+                If `original_size` is not the same as `target_size` the image will appear to be down- or upsampled.
+                `original_size` defaults to `(height, width)` if not specified. Part of SDXL's micro-conditioning as
+                explained in section 2.2 of
+                [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
+            crops_coords_top_left (`Tuple[int]`, *optional*, defaults to (0, 0)):
+                `crops_coords_top_left` can be used to generate an image that appears to be "cropped" from the position
+                `crops_coords_top_left` downwards. Favorable, well-centered images are usually achieved by setting
+                `crops_coords_top_left` to (0, 0). Part of SDXL's micro-conditioning as explained in section 2.2 of
+                [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generated image. Choose between
+                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.cogview3.pipeline_cogview3plus.CogView3PipelineOutput`] instead
+                of a plain tuple.
+            callback_on_step_end (`Callable`, *optional*):
+                A function that is called at the end of each denoising step during inference. The function is called
+                with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
+                callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by
+                `callback_on_step_end_tensor_inputs`.
+            callback_on_step_end_tensor_inputs (`List`, *optional*):
+                The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
+                will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
+                `._callback_tensor_inputs` attribute of your pipeline class.
+            max_sequence_length (`int`, defaults to `224`):
+                Maximum sequence length in encoded prompt. 
Can be set to other values but may lead to poorer results. + + Examples: + + Returns: + [`~pipelines.cogview3.pipeline_cogview3plus.CogView3PipelineOutput`] or `tuple`: + [`~pipelines.cogview3.pipeline_cogview3plus.CogView3PipelineOutput`] if `return_dict` is True, otherwise a + `tuple`. When returning a tuple, the first element is a list with the generated images. + """ + + if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)): + callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs + + height = height or self.transformer.config.sample_size * self.vae_scale_factor + width = width or self.transformer.config.sample_size * self.vae_scale_factor + + original_size = original_size or (height, width) + target_size = (height, width) + + # 1. Check inputs. Raise error if not correct + self.check_inputs( + prompt, + height, + width, + negative_prompt, + callback_on_step_end_tensor_inputs, + prompt_embeds, + negative_prompt_embeds, + ) + self._guidance_scale = guidance_scale + self._interrupt = False + + # 2. Default call parameters + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + device = self._execution_device + + # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) + # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` + # corresponds to doing no classifier free guidance. + do_classifier_free_guidance = guidance_scale > 1.0 + + # 3. Encode input prompt + prompt_embeds, negative_prompt_embeds = self.encode_prompt( + prompt, + negative_prompt, + self.do_classifier_free_guidance, + num_images_per_prompt=num_images_per_prompt, + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_prompt_embeds, + max_sequence_length=max_sequence_length, + device=device, + ) + if self.do_classifier_free_guidance: + prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds], dim=0) + + # 4. Prepare timesteps + timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, num_inference_steps, device, timesteps) + self._num_timesteps = len(timesteps) + + # 5. Prepare latents. + latent_channels = self.transformer.config.in_channels + latents = self.prepare_latents( + batch_size * num_images_per_prompt, + latent_channels, + height, + width, + prompt_embeds.dtype, + device, + generator, + latents, + ) + + # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline + extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) + + # 7. Prepare additional timestep conditions + original_size = torch.tensor([original_size], dtype=prompt_embeds.dtype) + target_size = torch.tensor([target_size], dtype=prompt_embeds.dtype) + crops_coords_top_left = torch.tensor([crops_coords_top_left], dtype=prompt_embeds.dtype) + + if self.do_classifier_free_guidance: + original_size = torch.cat([original_size, original_size]) + target_size = torch.cat([target_size, target_size]) + crops_coords_top_left = torch.cat([crops_coords_top_left, crops_coords_top_left]) + + original_size = original_size.to(device).repeat(batch_size * num_images_per_prompt, 1) + target_size = target_size.to(device).repeat(batch_size * num_images_per_prompt, 1) + crops_coords_top_left = crops_coords_top_left.to(device).repeat(batch_size * num_images_per_prompt, 1) + + # 8. 
Denoising loop + num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0) + + with self.progress_bar(total=num_inference_steps) as progress_bar: + # for DPM-solver++ + old_pred_original_sample = None + for i, t in enumerate(timesteps): + if self.interrupt: + continue + + latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) + + # broadcast to batch dimension in a way that's compatible with ONNX/Core ML + timestep = t.expand(latent_model_input.shape[0]) + + # predict noise model_output + noise_pred = self.transformer( + hidden_states=latent_model_input, + encoder_hidden_states=prompt_embeds, + timestep=timestep, + original_size=original_size, + target_size=target_size, + crop_coords=crops_coords_top_left, + return_dict=False, + )[0] + noise_pred = noise_pred.float() + + # perform guidance + if self.do_classifier_free_guidance: + noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) + noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_text - noise_pred_uncond) + + # compute the previous noisy sample x_t -> x_t-1 + if not isinstance(self.scheduler, CogVideoXDPMScheduler): + latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0] + else: + latents, old_pred_original_sample = self.scheduler.step( + noise_pred, + old_pred_original_sample, + t, + timesteps[i - 1] if i > 0 else None, + latents, + **extra_step_kwargs, + return_dict=False, + ) + latents = latents.to(prompt_embeds.dtype) + + # call the callback, if provided + if callback_on_step_end is not None: + callback_kwargs = {} + for k in callback_on_step_end_tensor_inputs: + callback_kwargs[k] = locals()[k] + callback_outputs = callback_on_step_end(self, i, t, callback_kwargs) + + latents = callback_outputs.pop("latents", latents) + prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds) + negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds) + + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): + progress_bar.update() + + if not output_type == "latent": + image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False, generator=generator)[ + 0 + ] + else: + image = latents + + image = self.image_processor.postprocess(image, output_type=output_type) + + # Offload all models + self.maybe_free_model_hooks() + + if not return_dict: + return (image,) + + return CogView3PipelineOutput(images=image) \ No newline at end of file diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_output.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_output.py new file mode 100644 index 0000000000..11f8976f0e --- /dev/null +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_output.py @@ -0,0 +1,21 @@ +from dataclasses import dataclass +from typing import List, Union + +import numpy as np +import PIL.Image + +from diffusers.utils import BaseOutput + + +@dataclass +class CogView3PipelineOutput(BaseOutput): + """ + Output class for CogView3 pipelines. + + Args: + images (`List[PIL.Image.Image]` or `np.ndarray`) + List of denoised PIL images of length `batch_size` or numpy array of shape `(batch_size, height, width, + num_channels)`. PIL images or numpy array present the denoised images of the diffusion pipeline. 
+ """ + + images: Union[List[PIL.Image.Image], np.ndarray] \ No newline at end of file diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/schedulers/__init__.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/schedulers/__init__.py new file mode 100644 index 0000000000..76b000d4bb --- /dev/null +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/schedulers/__init__.py @@ -0,0 +1,2 @@ +from .scheduling_ddim_cogvideox import CogVideoXDDIMScheduler +from .scheduling_dpm_cogvideox import CogVideoXDPMScheduler \ No newline at end of file diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/schedulers/scheduling_ddim_cogvideox.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/schedulers/scheduling_ddim_cogvideox.py new file mode 100644 index 0000000000..27c31923fe --- /dev/null +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/schedulers/scheduling_ddim_cogvideox.py @@ -0,0 +1,452 @@ +# Copyright 2024 The CogVideoX team, Tsinghua University & ZhipuAI and The HuggingFace Team. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# DISCLAIMER: This code is strongly influenced by https://github.com/pesser/pytorch_diffusion +# and https://github.com/hojonathanho/diffusion + +import math +from dataclasses import dataclass +from typing import List, Optional, Tuple, Union + +import numpy as np +import torch + +from diffusers.configuration_utils import ConfigMixin, register_to_config +from diffusers.utils import BaseOutput +from .scheduling_utils import KarrasDiffusionSchedulers, SchedulerMixin + + +@dataclass +# Copied from diffusers.schedulers.scheduling_ddpm.DDPMSchedulerOutput with DDPM->DDIM +class DDIMSchedulerOutput(BaseOutput): + """ + Output class for the scheduler's `step` function output. + + Args: + prev_sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)` for images): + Computed sample `(x_{t-1})` of previous timestep. `prev_sample` should be used as next model input in the + denoising loop. + pred_original_sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)` for images): + The predicted denoised sample `(x_{0})` based on the model output from the current timestep. + `pred_original_sample` can be used to preview progress or for guidance. + """ + + prev_sample: torch.Tensor + pred_original_sample: Optional[torch.Tensor] = None + + +# Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar +def betas_for_alpha_bar( + num_diffusion_timesteps, + max_beta=0.999, + alpha_transform_type="cosine", +): + """ + Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of + (1-beta) over time from t = [0,1]. + + Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up + to that part of the diffusion process. + + + Args: + num_diffusion_timesteps (`int`): the number of betas to produce. 
+ max_beta (`float`): the maximum beta to use; use values lower than 1 to + prevent singularities. + alpha_transform_type (`str`, *optional*, default to `cosine`): the type of noise schedule for alpha_bar. + Choose from `cosine` or `exp` + + Returns: + betas (`np.ndarray`): the betas used by the scheduler to step the model outputs + """ + if alpha_transform_type == "cosine": + + def alpha_bar_fn(t): + return math.cos((t + 0.008) / 1.008 * math.pi / 2) ** 2 + + elif alpha_transform_type == "exp": + + def alpha_bar_fn(t): + return math.exp(t * -12.0) + + else: + raise ValueError(f"Unsupported alpha_transform_type: {alpha_transform_type}") + + betas = [] + for i in range(num_diffusion_timesteps): + t1 = i / num_diffusion_timesteps + t2 = (i + 1) / num_diffusion_timesteps + betas.append(min(1 - alpha_bar_fn(t2) / alpha_bar_fn(t1), max_beta)) + return torch.tensor(betas, dtype=torch.float32) + + +def rescale_zero_terminal_snr(alphas_cumprod): + """ + Rescales betas to have zero terminal SNR Based on https://arxiv.org/pdf/2305.08891.pdf (Algorithm 1) + + + Args: + betas (`torch.Tensor`): + the betas that the scheduler is being initialized with. + + Returns: + `torch.Tensor`: rescaled betas with zero terminal SNR + """ + + alphas_bar_sqrt = alphas_cumprod.sqrt() + + # Store old values. + alphas_bar_sqrt_0 = alphas_bar_sqrt[0].clone() + alphas_bar_sqrt_T = alphas_bar_sqrt[-1].clone() + + # Shift so the last timestep is zero. + alphas_bar_sqrt -= alphas_bar_sqrt_T + + # Scale so the first timestep is back to the old value. + alphas_bar_sqrt *= alphas_bar_sqrt_0 / (alphas_bar_sqrt_0 - alphas_bar_sqrt_T) + + # Convert alphas_bar_sqrt to betas + alphas_bar = alphas_bar_sqrt**2 # Revert sqrt + + return alphas_bar + + +class CogVideoXDDIMScheduler(SchedulerMixin, ConfigMixin): + """ + `DDIMScheduler` extends the denoising procedure introduced in denoising diffusion probabilistic models (DDPMs) with + non-Markovian guidance. + + This model inherits from [`SchedulerMixin`] and [`ConfigMixin`]. Check the superclass documentation for the generic + methods the library implements for all schedulers such as loading and saving. + + Args: + num_train_timesteps (`int`, defaults to 1000): + The number of diffusion steps to train the model. + beta_start (`float`, defaults to 0.0001): + The starting `beta` value of inference. + beta_end (`float`, defaults to 0.02): + The final `beta` value. + beta_schedule (`str`, defaults to `"linear"`): + The beta schedule, a mapping from a beta range to a sequence of betas for stepping the model. Choose from + `linear`, `scaled_linear`, or `squaredcos_cap_v2`. + trained_betas (`np.ndarray`, *optional*): + Pass an array of betas directly to the constructor to bypass `beta_start` and `beta_end`. + clip_sample (`bool`, defaults to `True`): + Clip the predicted sample for numerical stability. + clip_sample_range (`float`, defaults to 1.0): + The maximum magnitude for sample clipping. Valid only when `clip_sample=True`. + set_alpha_to_one (`bool`, defaults to `True`): + Each diffusion step uses the alphas product value at that step and at the previous one. For the final step + there is no previous alpha. When this option is `True` the previous alpha product is fixed to `1`, + otherwise it uses the alpha value at step 0. + steps_offset (`int`, defaults to 0): + An offset added to the inference steps, as required by some model families. 
+ prediction_type (`str`, defaults to `epsilon`, *optional*): + Prediction type of the scheduler function; can be `epsilon` (predicts the noise of the diffusion process), + `sample` (directly predicts the noisy sample`) or `v_prediction` (see section 2.4 of [Imagen + Video](https://imagen.research.google/video/paper.pdf) paper). + thresholding (`bool`, defaults to `False`): + Whether to use the "dynamic thresholding" method. This is unsuitable for latent-space diffusion models such + as Stable Diffusion. + dynamic_thresholding_ratio (`float`, defaults to 0.995): + The ratio for the dynamic thresholding method. Valid only when `thresholding=True`. + sample_max_value (`float`, defaults to 1.0): + The threshold value for dynamic thresholding. Valid only when `thresholding=True`. + timestep_spacing (`str`, defaults to `"leading"`): + The way the timesteps should be scaled. Refer to Table 2 of the [Common Diffusion Noise Schedules and + Sample Steps are Flawed](https://huggingface.co/papers/2305.08891) for more information. + rescale_betas_zero_snr (`bool`, defaults to `False`): + Whether to rescale the betas to have zero terminal SNR. This enables the model to generate very bright and + dark samples instead of limiting it to samples with medium brightness. Loosely related to + [`--offset_noise`](https://github.com/huggingface/diffusers/blob/74fd735eb073eb1d774b1ab4154a0876eb82f055/examples/dreambooth/train_dreambooth.py#L506). + """ + + _compatibles = [e.name for e in KarrasDiffusionSchedulers] + order = 1 + + @register_to_config + def __init__( + self, + num_train_timesteps: int = 1000, + beta_start: float = 0.00085, + beta_end: float = 0.0120, + beta_schedule: str = "scaled_linear", + trained_betas: Optional[Union[np.ndarray, List[float]]] = None, + clip_sample: bool = True, + set_alpha_to_one: bool = True, + steps_offset: int = 0, + prediction_type: str = "epsilon", + clip_sample_range: float = 1.0, + sample_max_value: float = 1.0, + timestep_spacing: str = "leading", + rescale_betas_zero_snr: bool = False, + snr_shift_scale: float = 3.0, + ): + if trained_betas is not None: + self.betas = torch.tensor(trained_betas, dtype=torch.float32) + elif beta_schedule == "linear": + self.betas = torch.linspace(beta_start, beta_end, num_train_timesteps, dtype=torch.float32) + elif beta_schedule == "scaled_linear": + # this schedule is very specific to the latent diffusion model. + self.betas = torch.linspace(beta_start**0.5, beta_end**0.5, num_train_timesteps, dtype=torch.float64) ** 2 + elif beta_schedule == "squaredcos_cap_v2": + # Glide cosine schedule + self.betas = betas_for_alpha_bar(num_train_timesteps) + else: + raise NotImplementedError(f"{beta_schedule} is not implemented for {self.__class__}") + + self.alphas = 1.0 - self.betas + self.alphas_cumprod = torch.cumprod(self.alphas, dim=0) + + # Modify: SNR shift following SD3 + self.alphas_cumprod = self.alphas_cumprod / (snr_shift_scale + (1 - snr_shift_scale) * self.alphas_cumprod) + + # Rescale for zero SNR + if rescale_betas_zero_snr: + self.alphas_cumprod = rescale_zero_terminal_snr(self.alphas_cumprod) + + # At every step in ddim, we are looking into the previous alphas_cumprod + # For the final step, there is no previous alphas_cumprod because we are already at 0 + # `set_alpha_to_one` decides whether we set this parameter simply to one or + # whether we use the final alpha of the "non-previous" one. 
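+        # Illustration (derived from the formulas in `step()` below, not part of the computation):
+        # with `set_alpha_to_one=True`, the last denoising step uses alpha_prod_t_prev = 1.0, so
+        #     a_t = ((1 - 1.0) / (1 - alpha_prod_t)) ** 0.5 = 0
+        #     b_t = 1.0 ** 0.5 - alpha_prod_t ** 0.5 * 0 = 1
+        # and the final update reduces to prev_sample = pred_original_sample.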
+ self.final_alpha_cumprod = torch.tensor(1.0) if set_alpha_to_one else self.alphas_cumprod[0] + + # standard deviation of the initial noise distribution + self.init_noise_sigma = 1.0 + + # setable values + self.num_inference_steps = None + self.timesteps = torch.from_numpy(np.arange(0, num_train_timesteps)[::-1].copy().astype(np.int64)) + + def _get_variance(self, timestep, prev_timestep): + alpha_prod_t = self.alphas_cumprod[timestep] + alpha_prod_t_prev = self.alphas_cumprod[prev_timestep] if prev_timestep >= 0 else self.final_alpha_cumprod + beta_prod_t = 1 - alpha_prod_t + beta_prod_t_prev = 1 - alpha_prod_t_prev + + variance = (beta_prod_t_prev / beta_prod_t) * (1 - alpha_prod_t / alpha_prod_t_prev) + + return variance + + def scale_model_input(self, sample: torch.Tensor, timestep: Optional[int] = None) -> torch.Tensor: + """ + Ensures interchangeability with schedulers that need to scale the denoising model input depending on the + current timestep. + + Args: + sample (`torch.Tensor`): + The input sample. + timestep (`int`, *optional*): + The current timestep in the diffusion chain. + + Returns: + `torch.Tensor`: + A scaled input sample. + """ + return sample + + def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.device] = None): + """ + Sets the discrete timesteps used for the diffusion chain (to be run before inference). + + Args: + num_inference_steps (`int`): + The number of diffusion steps used when generating samples with a pre-trained model. + """ + + if num_inference_steps > self.config.num_train_timesteps: + raise ValueError( + f"`num_inference_steps`: {num_inference_steps} cannot be larger than `self.config.train_timesteps`:" + f" {self.config.num_train_timesteps} as the unet model trained with this scheduler can only handle" + f" maximal {self.config.num_train_timesteps} timesteps." + ) + + self.num_inference_steps = num_inference_steps + + # "linspace", "leading", "trailing" corresponds to annotation of Table 2. of https://arxiv.org/abs/2305.08891 + if self.config.timestep_spacing == "linspace": + timesteps = ( + np.linspace(0, self.config.num_train_timesteps - 1, num_inference_steps) + .round()[::-1] + .copy() + .astype(np.int64) + ) + elif self.config.timestep_spacing == "leading": + step_ratio = self.config.num_train_timesteps // self.num_inference_steps + # creates integer timesteps by multiplying by ratio + # casting to int to avoid issues when num_inference_step is power of 3 + timesteps = (np.arange(0, num_inference_steps) * step_ratio).round()[::-1].copy().astype(np.int64) + timesteps += self.config.steps_offset + elif self.config.timestep_spacing == "trailing": + step_ratio = self.config.num_train_timesteps / self.num_inference_steps + # creates integer timesteps by multiplying by ratio + # casting to int to avoid issues when num_inference_step is power of 3 + timesteps = np.round(np.arange(self.config.num_train_timesteps, 0, -step_ratio)).astype(np.int64) + timesteps -= 1 + else: + raise ValueError( + f"{self.config.timestep_spacing} is not supported. Please make sure to choose one of 'leading' or 'trailing'." + ) + + self.timesteps = torch.from_numpy(timesteps).to(device) + + def step( + self, + model_output: torch.Tensor, + timestep: int, + sample: torch.Tensor, + eta: float = 0.0, + use_clipped_model_output: bool = False, + generator=None, + variance_noise: Optional[torch.Tensor] = None, + return_dict: bool = True, + ) -> Union[DDIMSchedulerOutput, Tuple]: + """ + Predict the sample from the previous timestep by reversing the SDE. 
This function propagates the diffusion + process from the learned model outputs (most often the predicted noise). + + Args: + model_output (`torch.Tensor`): + The direct output from learned diffusion model. + timestep (`float`): + The current discrete timestep in the diffusion chain. + sample (`torch.Tensor`): + A current instance of a sample created by the diffusion process. + eta (`float`): + The weight of noise for added noise in diffusion step. + use_clipped_model_output (`bool`, defaults to `False`): + If `True`, computes "corrected" `model_output` from the clipped predicted original sample. Necessary + because predicted original sample is clipped to [-1, 1] when `self.config.clip_sample` is `True`. If no + clipping has happened, "corrected" `model_output` would coincide with the one provided as input and + `use_clipped_model_output` has no effect. + generator (`torch.Generator`, *optional*): + A random number generator. + variance_noise (`torch.Tensor`): + Alternative to generating noise with `generator` by directly providing the noise for the variance + itself. Useful for methods such as [`CycleDiffusion`]. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~schedulers.scheduling_ddim.DDIMSchedulerOutput`] or `tuple`. + + Returns: + [`~schedulers.scheduling_ddim.DDIMSchedulerOutput`] or `tuple`: + If return_dict is `True`, [`~schedulers.scheduling_ddim.DDIMSchedulerOutput`] is returned, otherwise a + tuple is returned where the first element is the sample tensor. + + """ + if self.num_inference_steps is None: + raise ValueError( + "Number of inference steps is 'None', you need to run 'set_timesteps' after creating the scheduler" + ) + + # See formulas (12) and (16) of DDIM paper https://arxiv.org/pdf/2010.02502.pdf + # Ideally, read DDIM paper in-detail understanding + + # Notation ( -> + # - pred_noise_t -> e_theta(x_t, t) + # - pred_original_sample -> f_theta(x_t, t) or x_0 + # - std_dev_t -> sigma_t + # - eta -> η + # - pred_sample_direction -> "direction pointing to x_t" + # - pred_prev_sample -> "x_t-1" + + # 1. get previous step value (=t-1) + prev_timestep = timestep - self.config.num_train_timesteps // self.num_inference_steps + + # 2. compute alphas, betas + alpha_prod_t = self.alphas_cumprod[timestep] + alpha_prod_t_prev = self.alphas_cumprod[prev_timestep] if prev_timestep >= 0 else self.final_alpha_cumprod + + beta_prod_t = 1 - alpha_prod_t + + # 3. 
compute predicted original sample from predicted noise also called + # "predicted x_0" of formula (12) from https://arxiv.org/pdf/2010.02502.pdf + # To make style tests pass, commented out `pred_epsilon` as it is an unused variable + if self.config.prediction_type == "epsilon": + pred_original_sample = (sample - beta_prod_t ** (0.5) * model_output) / alpha_prod_t ** (0.5) + # pred_epsilon = model_output + elif self.config.prediction_type == "sample": + pred_original_sample = model_output + # pred_epsilon = (sample - alpha_prod_t ** (0.5) * pred_original_sample) / beta_prod_t ** (0.5) + elif self.config.prediction_type == "v_prediction": + pred_original_sample = (alpha_prod_t**0.5) * sample - (beta_prod_t**0.5) * model_output + # pred_epsilon = (alpha_prod_t**0.5) * model_output + (beta_prod_t**0.5) * sample + else: + raise ValueError( + f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, `sample`, or" + " `v_prediction`" + ) + + a_t = ((1 - alpha_prod_t_prev) / (1 - alpha_prod_t)) ** 0.5 + b_t = alpha_prod_t_prev**0.5 - alpha_prod_t**0.5 * a_t + + prev_sample = a_t * sample + b_t * pred_original_sample + + if not return_dict: + return ( + prev_sample, + pred_original_sample, + ) + + return DDIMSchedulerOutput(prev_sample=prev_sample, pred_original_sample=pred_original_sample) + + # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler.add_noise + def add_noise( + self, + original_samples: torch.Tensor, + noise: torch.Tensor, + timesteps: torch.IntTensor, + ) -> torch.Tensor: + # Make sure alphas_cumprod and timestep have same device and dtype as original_samples + # Move the self.alphas_cumprod to device to avoid redundant CPU to GPU data movement + # for the subsequent add_noise calls + self.alphas_cumprod = self.alphas_cumprod.to(device=original_samples.device) + alphas_cumprod = self.alphas_cumprod.to(dtype=original_samples.dtype) + timesteps = timesteps.to(original_samples.device) + + sqrt_alpha_prod = alphas_cumprod[timesteps] ** 0.5 + sqrt_alpha_prod = sqrt_alpha_prod.flatten() + while len(sqrt_alpha_prod.shape) < len(original_samples.shape): + sqrt_alpha_prod = sqrt_alpha_prod.unsqueeze(-1) + + sqrt_one_minus_alpha_prod = (1 - alphas_cumprod[timesteps]) ** 0.5 + sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.flatten() + while len(sqrt_one_minus_alpha_prod.shape) < len(original_samples.shape): + sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.unsqueeze(-1) + + noisy_samples = sqrt_alpha_prod * original_samples + sqrt_one_minus_alpha_prod * noise + return noisy_samples + + # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler.get_velocity + def get_velocity(self, sample: torch.Tensor, noise: torch.Tensor, timesteps: torch.IntTensor) -> torch.Tensor: + # Make sure alphas_cumprod and timestep have same device and dtype as sample + self.alphas_cumprod = self.alphas_cumprod.to(device=sample.device) + alphas_cumprod = self.alphas_cumprod.to(dtype=sample.dtype) + timesteps = timesteps.to(sample.device) + + sqrt_alpha_prod = alphas_cumprod[timesteps] ** 0.5 + sqrt_alpha_prod = sqrt_alpha_prod.flatten() + while len(sqrt_alpha_prod.shape) < len(sample.shape): + sqrt_alpha_prod = sqrt_alpha_prod.unsqueeze(-1) + + sqrt_one_minus_alpha_prod = (1 - alphas_cumprod[timesteps]) ** 0.5 + sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.flatten() + while len(sqrt_one_minus_alpha_prod.shape) < len(sample.shape): + sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.unsqueeze(-1) + + velocity = sqrt_alpha_prod * noise - 
sqrt_one_minus_alpha_prod * sample + return velocity + + def __len__(self): + return self.config.num_train_timesteps \ No newline at end of file diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/schedulers/scheduling_dpm_cogvideox.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/schedulers/scheduling_dpm_cogvideox.py new file mode 100644 index 0000000000..4269fff66a --- /dev/null +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/schedulers/scheduling_dpm_cogvideox.py @@ -0,0 +1,489 @@ +# Copyright 2024 The CogVideoX team, Tsinghua University & ZhipuAI and The HuggingFace Team. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# DISCLAIMER: This code is strongly influenced by https://github.com/pesser/pytorch_diffusion +# and https://github.com/hojonathanho/diffusion + +import math +from dataclasses import dataclass +from typing import List, Optional, Tuple, Union + +import numpy as np +import torch + +from diffusers.configuration_utils import ConfigMixin, register_to_config +from diffusers.utils import BaseOutput +from diffusers.utils.torch_utils import randn_tensor +from .scheduling_utils import KarrasDiffusionSchedulers, SchedulerMixin + + +@dataclass +# Copied from diffusers.schedulers.scheduling_ddpm.DDPMSchedulerOutput with DDPM->DDIM +class DDIMSchedulerOutput(BaseOutput): + """ + Output class for the scheduler's `step` function output. + + Args: + prev_sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)` for images): + Computed sample `(x_{t-1})` of previous timestep. `prev_sample` should be used as next model input in the + denoising loop. + pred_original_sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)` for images): + The predicted denoised sample `(x_{0})` based on the model output from the current timestep. + `pred_original_sample` can be used to preview progress or for guidance. + """ + + prev_sample: torch.Tensor + pred_original_sample: Optional[torch.Tensor] = None + + +# Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar +def betas_for_alpha_bar( + num_diffusion_timesteps, + max_beta=0.999, + alpha_transform_type="cosine", +): + """ + Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of + (1-beta) over time from t = [0,1]. + + Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up + to that part of the diffusion process. + + + Args: + num_diffusion_timesteps (`int`): the number of betas to produce. + max_beta (`float`): the maximum beta to use; use values lower than 1 to + prevent singularities. + alpha_transform_type (`str`, *optional*, default to `cosine`): the type of noise schedule for alpha_bar. 
+ Choose from `cosine` or `exp` + + Returns: + betas (`np.ndarray`): the betas used by the scheduler to step the model outputs + """ + if alpha_transform_type == "cosine": + + def alpha_bar_fn(t): + return math.cos((t + 0.008) / 1.008 * math.pi / 2) ** 2 + + elif alpha_transform_type == "exp": + + def alpha_bar_fn(t): + return math.exp(t * -12.0) + + else: + raise ValueError(f"Unsupported alpha_transform_type: {alpha_transform_type}") + + betas = [] + for i in range(num_diffusion_timesteps): + t1 = i / num_diffusion_timesteps + t2 = (i + 1) / num_diffusion_timesteps + betas.append(min(1 - alpha_bar_fn(t2) / alpha_bar_fn(t1), max_beta)) + return torch.tensor(betas, dtype=torch.float32) + + +def rescale_zero_terminal_snr(alphas_cumprod): + """ + Rescales betas to have zero terminal SNR Based on https://arxiv.org/pdf/2305.08891.pdf (Algorithm 1) + + + Args: + betas (`torch.Tensor`): + the betas that the scheduler is being initialized with. + + Returns: + `torch.Tensor`: rescaled betas with zero terminal SNR + """ + + alphas_bar_sqrt = alphas_cumprod.sqrt() + + # Store old values. + alphas_bar_sqrt_0 = alphas_bar_sqrt[0].clone() + alphas_bar_sqrt_T = alphas_bar_sqrt[-1].clone() + + # Shift so the last timestep is zero. + alphas_bar_sqrt -= alphas_bar_sqrt_T + + # Scale so the first timestep is back to the old value. + alphas_bar_sqrt *= alphas_bar_sqrt_0 / (alphas_bar_sqrt_0 - alphas_bar_sqrt_T) + + # Convert alphas_bar_sqrt to betas + alphas_bar = alphas_bar_sqrt**2 # Revert sqrt + + return alphas_bar + + +class CogVideoXDPMScheduler(SchedulerMixin, ConfigMixin): + """ + `DDIMScheduler` extends the denoising procedure introduced in denoising diffusion probabilistic models (DDPMs) with + non-Markovian guidance. + + This model inherits from [`SchedulerMixin`] and [`ConfigMixin`]. Check the superclass documentation for the generic + methods the library implements for all schedulers such as loading and saving. + + Args: + num_train_timesteps (`int`, defaults to 1000): + The number of diffusion steps to train the model. + beta_start (`float`, defaults to 0.0001): + The starting `beta` value of inference. + beta_end (`float`, defaults to 0.02): + The final `beta` value. + beta_schedule (`str`, defaults to `"linear"`): + The beta schedule, a mapping from a beta range to a sequence of betas for stepping the model. Choose from + `linear`, `scaled_linear`, or `squaredcos_cap_v2`. + trained_betas (`np.ndarray`, *optional*): + Pass an array of betas directly to the constructor to bypass `beta_start` and `beta_end`. + clip_sample (`bool`, defaults to `True`): + Clip the predicted sample for numerical stability. + clip_sample_range (`float`, defaults to 1.0): + The maximum magnitude for sample clipping. Valid only when `clip_sample=True`. + set_alpha_to_one (`bool`, defaults to `True`): + Each diffusion step uses the alphas product value at that step and at the previous one. For the final step + there is no previous alpha. When this option is `True` the previous alpha product is fixed to `1`, + otherwise it uses the alpha value at step 0. + steps_offset (`int`, defaults to 0): + An offset added to the inference steps, as required by some model families. + prediction_type (`str`, defaults to `epsilon`, *optional*): + Prediction type of the scheduler function; can be `epsilon` (predicts the noise of the diffusion process), + `sample` (directly predicts the noisy sample`) or `v_prediction` (see section 2.4 of [Imagen + Video](https://imagen.research.google/video/paper.pdf) paper). 
+ thresholding (`bool`, defaults to `False`): + Whether to use the "dynamic thresholding" method. This is unsuitable for latent-space diffusion models such + as Stable Diffusion. + dynamic_thresholding_ratio (`float`, defaults to 0.995): + The ratio for the dynamic thresholding method. Valid only when `thresholding=True`. + sample_max_value (`float`, defaults to 1.0): + The threshold value for dynamic thresholding. Valid only when `thresholding=True`. + timestep_spacing (`str`, defaults to `"leading"`): + The way the timesteps should be scaled. Refer to Table 2 of the [Common Diffusion Noise Schedules and + Sample Steps are Flawed](https://huggingface.co/papers/2305.08891) for more information. + rescale_betas_zero_snr (`bool`, defaults to `False`): + Whether to rescale the betas to have zero terminal SNR. This enables the model to generate very bright and + dark samples instead of limiting it to samples with medium brightness. Loosely related to + [`--offset_noise`](https://github.com/huggingface/diffusers/blob/74fd735eb073eb1d774b1ab4154a0876eb82f055/examples/dreambooth/train_dreambooth.py#L506). + """ + + _compatibles = [e.name for e in KarrasDiffusionSchedulers] + order = 1 + + @register_to_config + def __init__( + self, + num_train_timesteps: int = 1000, + beta_start: float = 0.00085, + beta_end: float = 0.0120, + beta_schedule: str = "scaled_linear", + trained_betas: Optional[Union[np.ndarray, List[float]]] = None, + clip_sample: bool = True, + set_alpha_to_one: bool = True, + steps_offset: int = 0, + prediction_type: str = "epsilon", + clip_sample_range: float = 1.0, + sample_max_value: float = 1.0, + timestep_spacing: str = "leading", + rescale_betas_zero_snr: bool = False, + snr_shift_scale: float = 3.0, + ): + if trained_betas is not None: + self.betas = torch.tensor(trained_betas, dtype=torch.float32) + elif beta_schedule == "linear": + self.betas = torch.linspace(beta_start, beta_end, num_train_timesteps, dtype=torch.float32) + elif beta_schedule == "scaled_linear": + # this schedule is very specific to the latent diffusion model. + self.betas = torch.linspace(beta_start**0.5, beta_end**0.5, num_train_timesteps, dtype=torch.float64) ** 2 + elif beta_schedule == "squaredcos_cap_v2": + # Glide cosine schedule + self.betas = betas_for_alpha_bar(num_train_timesteps) + else: + raise NotImplementedError(f"{beta_schedule} is not implemented for {self.__class__}") + + self.alphas = 1.0 - self.betas + self.alphas_cumprod = torch.cumprod(self.alphas, dim=0) + + # Modify: SNR shift following SD3 + self.alphas_cumprod = self.alphas_cumprod / (snr_shift_scale + (1 - snr_shift_scale) * self.alphas_cumprod) + + # Rescale for zero SNR + if rescale_betas_zero_snr: + self.alphas_cumprod = rescale_zero_terminal_snr(self.alphas_cumprod) + + # At every step in ddim, we are looking into the previous alphas_cumprod + # For the final step, there is no previous alphas_cumprod because we are already at 0 + # `set_alpha_to_one` decides whether we set this parameter simply to one or + # whether we use the final alpha of the "non-previous" one. 
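+        # Note: `final_alpha_cumprod` is only consulted when `prev_timestep < 0` (see the
+        # conditional in `_get_variance` below), i.e. typically only on the final denoising step,
+        # where `alphas_cumprod` has no previous entry left to index.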
+ self.final_alpha_cumprod = torch.tensor(1.0) if set_alpha_to_one else self.alphas_cumprod[0] + + # standard deviation of the initial noise distribution + self.init_noise_sigma = 1.0 + + # setable values + self.num_inference_steps = None + self.timesteps = torch.from_numpy(np.arange(0, num_train_timesteps)[::-1].copy().astype(np.int64)) + + def _get_variance(self, timestep, prev_timestep): + alpha_prod_t = self.alphas_cumprod[timestep] + alpha_prod_t_prev = self.alphas_cumprod[prev_timestep] if prev_timestep >= 0 else self.final_alpha_cumprod + beta_prod_t = 1 - alpha_prod_t + beta_prod_t_prev = 1 - alpha_prod_t_prev + + variance = (beta_prod_t_prev / beta_prod_t) * (1 - alpha_prod_t / alpha_prod_t_prev) + + return variance + + def scale_model_input(self, sample: torch.Tensor, timestep: Optional[int] = None) -> torch.Tensor: + """ + Ensures interchangeability with schedulers that need to scale the denoising model input depending on the + current timestep. + + Args: + sample (`torch.Tensor`): + The input sample. + timestep (`int`, *optional*): + The current timestep in the diffusion chain. + + Returns: + `torch.Tensor`: + A scaled input sample. + """ + return sample + + def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.device] = None): + """ + Sets the discrete timesteps used for the diffusion chain (to be run before inference). + + Args: + num_inference_steps (`int`): + The number of diffusion steps used when generating samples with a pre-trained model. + """ + + if num_inference_steps > self.config.num_train_timesteps: + raise ValueError( + f"`num_inference_steps`: {num_inference_steps} cannot be larger than `self.config.train_timesteps`:" + f" {self.config.num_train_timesteps} as the unet model trained with this scheduler can only handle" + f" maximal {self.config.num_train_timesteps} timesteps." + ) + + self.num_inference_steps = num_inference_steps + + # "linspace", "leading", "trailing" corresponds to annotation of Table 2. of https://arxiv.org/abs/2305.08891 + if self.config.timestep_spacing == "linspace": + timesteps = ( + np.linspace(0, self.config.num_train_timesteps - 1, num_inference_steps) + .round()[::-1] + .copy() + .astype(np.int64) + ) + elif self.config.timestep_spacing == "leading": + step_ratio = self.config.num_train_timesteps // self.num_inference_steps + # creates integer timesteps by multiplying by ratio + # casting to int to avoid issues when num_inference_step is power of 3 + timesteps = (np.arange(0, num_inference_steps) * step_ratio).round()[::-1].copy().astype(np.int64) + timesteps += self.config.steps_offset + elif self.config.timestep_spacing == "trailing": + step_ratio = self.config.num_train_timesteps / self.num_inference_steps + # creates integer timesteps by multiplying by ratio + # casting to int to avoid issues when num_inference_step is power of 3 + timesteps = np.round(np.arange(self.config.num_train_timesteps, 0, -step_ratio)).astype(np.int64) + timesteps -= 1 + else: + raise ValueError( + f"{self.config.timestep_spacing} is not supported. Please make sure to choose one of 'leading' or 'trailing'." 
+            )
+
+        self.timesteps = torch.from_numpy(timesteps).to(device)
+
+    def get_variables(self, alpha_prod_t, alpha_prod_t_prev, alpha_prod_t_back=None):
+        # `lamb` is the half log-SNR, log(sqrt(alpha_bar / (1 - alpha_bar))); `h` is the step size in
+        # lambda-space and `r` the ratio of the previous step size to the current one (second-order term).
+        lamb = ((alpha_prod_t / (1 - alpha_prod_t)) ** 0.5).log()
+        lamb_next = ((alpha_prod_t_prev / (1 - alpha_prod_t_prev)) ** 0.5).log()
+        h = lamb_next - lamb
+
+        if alpha_prod_t_back is not None:
+            lamb_previous = ((alpha_prod_t_back / (1 - alpha_prod_t_back)) ** 0.5).log()
+            h_last = lamb - lamb_previous
+            r = h_last / h
+            return h, r, lamb, lamb_next
+        else:
+            return h, None, lamb, lamb_next
+
+    def get_mult(self, h, r, alpha_prod_t, alpha_prod_t_prev, alpha_prod_t_back):
+        mult1 = ((1 - alpha_prod_t_prev) / (1 - alpha_prod_t)) ** 0.5 * (-h).exp()
+        mult2 = (-2 * h).expm1() * alpha_prod_t_prev**0.5
+
+        if alpha_prod_t_back is not None:
+            mult3 = 1 + 1 / (2 * r)
+            mult4 = 1 / (2 * r)
+            return mult1, mult2, mult3, mult4
+        else:
+            return mult1, mult2
+
+    def step(
+        self,
+        model_output: torch.Tensor,
+        old_pred_original_sample: torch.Tensor,
+        timestep: int,
+        timestep_back: int,
+        sample: torch.Tensor,
+        eta: float = 0.0,
+        use_clipped_model_output: bool = False,
+        generator=None,
+        variance_noise: Optional[torch.Tensor] = None,
+        return_dict: bool = False,
+    ) -> Union[DDIMSchedulerOutput, Tuple]:
+        """
+        Predict the sample from the previous timestep by reversing the SDE. This function propagates the diffusion
+        process from the learned model outputs (most often the predicted noise).
+
+        Args:
+            model_output (`torch.Tensor`):
+                The direct output from the learned diffusion model.
+            old_pred_original_sample (`torch.Tensor`):
+                The predicted original sample (`x_0`) from the preceding sampler iteration, or `None` on the first
+                step. When provided, it enables the second-order (multistep) update.
+            timestep (`int`):
+                The current discrete timestep in the diffusion chain.
+            timestep_back (`int`):
+                The timestep of the preceding sampler iteration, or `None` on the first step.
+            sample (`torch.Tensor`):
+                A current instance of a sample created by the diffusion process.
+            eta (`float`):
+                The weight of noise for added noise in diffusion step.
+            use_clipped_model_output (`bool`, defaults to `False`):
+                If `True`, computes "corrected" `model_output` from the clipped predicted original sample. Necessary
+                because predicted original sample is clipped to [-1, 1] when `self.config.clip_sample` is `True`. If no
+                clipping has happened, "corrected" `model_output` would coincide with the one provided as input and
+                `use_clipped_model_output` has no effect.
+            generator (`torch.Generator`, *optional*):
+                A random number generator.
+            variance_noise (`torch.Tensor`):
+                Alternative to generating noise with `generator` by directly providing the noise for the variance
+                itself. Useful for methods such as [`CycleDiffusion`].
+            return_dict (`bool`, *optional*, defaults to `False`):
+                Whether or not to return a [`~schedulers.scheduling_ddim.DDIMSchedulerOutput`] or `tuple`.
+
+        Returns:
+            [`~schedulers.scheduling_ddim.DDIMSchedulerOutput`] or `tuple`:
+                If return_dict is `True`, [`~schedulers.scheduling_ddim.DDIMSchedulerOutput`] is returned, otherwise a
+                tuple `(prev_sample, pred_original_sample)` is returned.
+        """
+        if self.num_inference_steps is None:
+            raise ValueError(
+                "Number of inference steps is 'None', you need to run 'set_timesteps' after creating the scheduler"
+            )
+
+        # See formulas (12) and (16) of DDIM paper https://arxiv.org/pdf/2010.02502.pdf
+        # Ideally, read the DDIM paper for an in-depth understanding
+
+        # Notation (<variable name> -> <name in paper>)
+        # - pred_noise_t -> e_theta(x_t, t)
+        # - pred_original_sample -> f_theta(x_t, t) or x_0
+        # - std_dev_t -> sigma_t
+        # - eta -> η
+        # - pred_sample_direction -> "direction pointing to x_t"
+        # - pred_prev_sample -> "x_t-1"
+
+        # 1.
get previous step value (=t-1) + prev_timestep = timestep - self.config.num_train_timesteps // self.num_inference_steps + + # 2. compute alphas, betas + alpha_prod_t = self.alphas_cumprod[timestep] + alpha_prod_t_prev = self.alphas_cumprod[prev_timestep] if prev_timestep >= 0 else self.final_alpha_cumprod + alpha_prod_t_back = self.alphas_cumprod[timestep_back] if timestep_back is not None else None + + beta_prod_t = 1 - alpha_prod_t + + # 3. compute predicted original sample from predicted noise also called + # "predicted x_0" of formula (12) from https://arxiv.org/pdf/2010.02502.pdf + # To make style tests pass, commented out `pred_epsilon` as it is an unused variable + if self.config.prediction_type == "epsilon": + pred_original_sample = (sample - beta_prod_t ** (0.5) * model_output) / alpha_prod_t ** (0.5) + # pred_epsilon = model_output + elif self.config.prediction_type == "sample": + pred_original_sample = model_output + # pred_epsilon = (sample - alpha_prod_t ** (0.5) * pred_original_sample) / beta_prod_t ** (0.5) + elif self.config.prediction_type == "v_prediction": + pred_original_sample = (alpha_prod_t**0.5) * sample - (beta_prod_t**0.5) * model_output + # pred_epsilon = (alpha_prod_t**0.5) * model_output + (beta_prod_t**0.5) * sample + else: + raise ValueError( + f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, `sample`, or" + " `v_prediction`" + ) + + h, r, lamb, lamb_next = self.get_variables(alpha_prod_t, alpha_prod_t_prev, alpha_prod_t_back) + mult = list(self.get_mult(h, r, alpha_prod_t, alpha_prod_t_prev, alpha_prod_t_back)) + mult_noise = (1 - alpha_prod_t_prev) ** 0.5 * (1 - (-2 * h).exp()) ** 0.5 + + noise = randn_tensor(sample.shape, generator=generator, device=sample.device, dtype=sample.dtype) + prev_sample = mult[0] * sample - mult[1] * pred_original_sample + mult_noise * noise + + if old_pred_original_sample is None or prev_timestep < 0: + # Save a network evaluation if all noise levels are 0 or on the first step + return prev_sample, pred_original_sample + else: + denoised_d = mult[2] * pred_original_sample - mult[3] * old_pred_original_sample + noise = randn_tensor(sample.shape, generator=generator, device=sample.device, dtype=sample.dtype) + x_advanced = mult[0] * sample - mult[1] * denoised_d + mult_noise * noise + + prev_sample = x_advanced + + if not return_dict: + return (prev_sample, pred_original_sample) + + return DDIMSchedulerOutput(prev_sample=prev_sample, pred_original_sample=pred_original_sample) + + # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler.add_noise + def add_noise( + self, + original_samples: torch.Tensor, + noise: torch.Tensor, + timesteps: torch.IntTensor, + ) -> torch.Tensor: + # Make sure alphas_cumprod and timestep have same device and dtype as original_samples + # Move the self.alphas_cumprod to device to avoid redundant CPU to GPU data movement + # for the subsequent add_noise calls + self.alphas_cumprod = self.alphas_cumprod.to(device=original_samples.device) + alphas_cumprod = self.alphas_cumprod.to(dtype=original_samples.dtype) + timesteps = timesteps.to(original_samples.device) + + sqrt_alpha_prod = alphas_cumprod[timesteps] ** 0.5 + sqrt_alpha_prod = sqrt_alpha_prod.flatten() + while len(sqrt_alpha_prod.shape) < len(original_samples.shape): + sqrt_alpha_prod = sqrt_alpha_prod.unsqueeze(-1) + + sqrt_one_minus_alpha_prod = (1 - alphas_cumprod[timesteps]) ** 0.5 + sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.flatten() + while len(sqrt_one_minus_alpha_prod.shape) < 
len(original_samples.shape): + sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.unsqueeze(-1) + + noisy_samples = sqrt_alpha_prod * original_samples + sqrt_one_minus_alpha_prod * noise + return noisy_samples + + # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler.get_velocity + def get_velocity(self, sample: torch.Tensor, noise: torch.Tensor, timesteps: torch.IntTensor) -> torch.Tensor: + # Make sure alphas_cumprod and timestep have same device and dtype as sample + self.alphas_cumprod = self.alphas_cumprod.to(device=sample.device) + alphas_cumprod = self.alphas_cumprod.to(dtype=sample.dtype) + timesteps = timesteps.to(sample.device) + + sqrt_alpha_prod = alphas_cumprod[timesteps] ** 0.5 + sqrt_alpha_prod = sqrt_alpha_prod.flatten() + while len(sqrt_alpha_prod.shape) < len(sample.shape): + sqrt_alpha_prod = sqrt_alpha_prod.unsqueeze(-1) + + sqrt_one_minus_alpha_prod = (1 - alphas_cumprod[timesteps]) ** 0.5 + sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.flatten() + while len(sqrt_one_minus_alpha_prod.shape) < len(sample.shape): + sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.unsqueeze(-1) + + velocity = sqrt_alpha_prod * noise - sqrt_one_minus_alpha_prod * sample + return velocity + + def __len__(self): + return self.config.num_train_timesteps \ No newline at end of file diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/schedulers/scheduling_utils.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/schedulers/scheduling_utils.py new file mode 100644 index 0000000000..b6418f89dd --- /dev/null +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/schedulers/scheduling_utils.py @@ -0,0 +1,193 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import importlib +import os +from dataclasses import dataclass +from enum import Enum +from typing import Optional, Union + +import torch +from huggingface_hub.utils import validate_hf_hub_args + +from diffusers.utils import BaseOutput, PushToHubMixin + + +SCHEDULER_CONFIG_NAME = "scheduler_config.json" + + +# NOTE: We make this type an enum because it simplifies usage in docs and prevents +# circular imports when used for `_compatibles` within the schedulers module. +# When it's used as a type in pipelines, it really is a Union because the actual +# scheduler instance is passed in. 
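+# In this package the enum is only consumed through its member names: the CogVideoX
+# schedulers set `_compatibles = [e.name for e in KarrasDiffusionSchedulers]`, and
+# `SchedulerMixin._get_compatibles` below resolves those names with `getattr`, silently
+# skipping any class that cannot be found in the top-level package.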
+class KarrasDiffusionSchedulers(Enum): + DDIMScheduler = 1 + DDPMScheduler = 2 + PNDMScheduler = 3 + LMSDiscreteScheduler = 4 + EulerDiscreteScheduler = 5 + HeunDiscreteScheduler = 6 + EulerAncestralDiscreteScheduler = 7 + DPMSolverMultistepScheduler = 8 + DPMSolverSinglestepScheduler = 9 + KDPM2DiscreteScheduler = 10 + KDPM2AncestralDiscreteScheduler = 11 + DEISMultistepScheduler = 12 + UniPCMultistepScheduler = 13 + DPMSolverSDEScheduler = 14 + EDMEulerScheduler = 15 + + +AysSchedules = { + "StableDiffusionTimesteps": [999, 850, 736, 645, 545, 455, 343, 233, 124, 24], + "StableDiffusionSigmas": [14.615, 6.475, 3.861, 2.697, 1.886, 1.396, 0.963, 0.652, 0.399, 0.152, 0.0], + "StableDiffusionXLTimesteps": [999, 845, 730, 587, 443, 310, 193, 116, 53, 13], + "StableDiffusionXLSigmas": [14.615, 6.315, 3.771, 2.181, 1.342, 0.862, 0.555, 0.380, 0.234, 0.113, 0.0], + "StableDiffusionVideoSigmas": [700.00, 54.5, 15.886, 7.977, 4.248, 1.789, 0.981, 0.403, 0.173, 0.034, 0.0], +} + + +@dataclass +class SchedulerOutput(BaseOutput): + """ + Base class for the output of a scheduler's `step` function. + + Args: + prev_sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)` for images): + Computed sample `(x_{t-1})` of previous timestep. `prev_sample` should be used as next model input in the + denoising loop. + """ + + prev_sample: torch.Tensor + + +class SchedulerMixin(PushToHubMixin): + """ + Base class for all schedulers. + + [`SchedulerMixin`] contains common functions shared by all schedulers such as general loading and saving + functionalities. + + [`ConfigMixin`] takes care of storing the configuration attributes (like `num_train_timesteps`) that are passed to + the scheduler's `__init__` function, and the attributes can be accessed by `scheduler.config.num_train_timesteps`. + + Class attributes: + - **_compatibles** (`List[str]`) -- A list of scheduler classes that are compatible with the parent scheduler + class. Use [`~ConfigMixin.from_config`] to load a different compatible scheduler class (should be overridden + by parent class). + """ + + config_name = SCHEDULER_CONFIG_NAME + _compatibles = [] + has_compatibles = True + + @classmethod + @validate_hf_hub_args + def from_pretrained( + cls, + pretrained_model_name_or_path: Optional[Union[str, os.PathLike]] = None, + subfolder: Optional[str] = None, + return_unused_kwargs=False, + **kwargs, + ): + r""" + Instantiate a scheduler from a pre-defined JSON configuration file in a local directory or Hub repository. + + Parameters: + pretrained_model_name_or_path (`str` or `os.PathLike`, *optional*): + Can be either: + + - A string, the *model id* (for example `google/ddpm-celebahq-256`) of a pretrained model hosted on + the Hub. + - A path to a *directory* (for example `./my_model_directory`) containing the scheduler + configuration saved with [`~SchedulerMixin.save_pretrained`]. + subfolder (`str`, *optional*): + The subfolder location of a model file within a larger model repository on the Hub or locally. + return_unused_kwargs (`bool`, *optional*, defaults to `False`): + Whether kwargs that are not consumed by the Python class should be returned or not. + cache_dir (`Union[str, os.PathLike]`, *optional*): + Path to a directory where a downloaded pretrained model configuration is cached if the standard cache + is not used. + force_download (`bool`, *optional*, defaults to `False`): + Whether or not to force the (re-)download of the model weights and configuration files, overriding the + cached versions if they exist. 
+ + proxies (`Dict[str, str]`, *optional*): + A dictionary of proxy servers to use by protocol or endpoint, for example, `{'http': 'foo.bar:3128', + 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request. + output_loading_info(`bool`, *optional*, defaults to `False`): + Whether or not to also return a dictionary containing missing keys, unexpected keys and error messages. + local_files_only(`bool`, *optional*, defaults to `False`): + Whether to only load local model weights and configuration files or not. If set to `True`, the model + won't be downloaded from the Hub. + token (`str` or *bool*, *optional*): + The token to use as HTTP bearer authorization for remote files. If `True`, the token generated from + `diffusers-cli login` (stored in `~/.huggingface`) is used. + revision (`str`, *optional*, defaults to `"main"`): + The specific model version to use. It can be a branch name, a tag name, a commit id, or any identifier + allowed by Git. + + + + To use private or [gated models](https://huggingface.co/docs/hub/models-gated#gated-models), log-in with + `huggingface-cli login`. You can also activate the special + ["offline-mode"](https://huggingface.co/diffusers/installation.html#offline-mode) to use this method in a + firewalled environment. + + + + """ + config, kwargs, commit_hash = cls.load_config( + pretrained_model_name_or_path=pretrained_model_name_or_path, + subfolder=subfolder, + return_unused_kwargs=True, + return_commit_hash=True, + **kwargs, + ) + return cls.from_config(config, return_unused_kwargs=return_unused_kwargs, **kwargs) + + def save_pretrained(self, save_directory: Union[str, os.PathLike], push_to_hub: bool = False, **kwargs): + """ + Save a scheduler configuration object to a directory so that it can be reloaded using the + [`~SchedulerMixin.from_pretrained`] class method. + + Args: + save_directory (`str` or `os.PathLike`): + Directory where the configuration JSON file will be saved (will be created if it does not exist). + push_to_hub (`bool`, *optional*, defaults to `False`): + Whether or not to push your model to the Hugging Face Hub after saving it. You can specify the + repository you want to push to with `repo_id` (will default to the name of `save_directory` in your + namespace). + kwargs (`Dict[str, Any]`, *optional*): + Additional keyword arguments passed along to the [`~utils.PushToHubMixin.push_to_hub`] method. 
+        """
+        self.save_config(save_directory=save_directory, push_to_hub=push_to_hub, **kwargs)
+
+    @property
+    def compatibles(self):
+        """
+        Returns all schedulers that are compatible with this scheduler.
+
+        Returns:
+            `List[SchedulerMixin]`: List of compatible schedulers
+        """
+        return self._get_compatibles()
+
+    @classmethod
+    def _get_compatibles(cls):
+        compatible_classes_str = list(set([cls.__name__] + cls._compatibles))
+        diffusers_library = importlib.import_module(__name__.split(".")[0])
+        compatible_classes = [
+            getattr(diffusers_library, c) for c in compatible_classes_str if hasattr(diffusers_library, c)
+        ]
+        return compatible_classes
\ No newline at end of file
diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/vae/__init__.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/vae/__init__.py
new file mode 100644
index 0000000000..a91642a899
--- /dev/null
+++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/vae/__init__.py
@@ -0,0 +1 @@
+from .autoencoder_kl import AutoencoderKL
\ No newline at end of file
diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/vae/autoencoder_kl.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/vae/autoencoder_kl.py
new file mode 100644
index 0000000000..99ba70c8cd
--- /dev/null
+++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/vae/autoencoder_kl.py
@@ -0,0 +1,571 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import Dict, Optional, Tuple, Union
+
+import torch
+import torch.nn as nn
+
+from diffusers.configuration_utils import ConfigMixin, register_to_config
+from diffusers.loaders import PeftAdapterMixin
+from diffusers.loaders.single_file_model import FromOriginalModelMixin
+from diffusers.utils import deprecate
+from diffusers.utils.accelerate_utils import apply_forward_hook
+from diffusers.models.attention_processor import (
+    ADDED_KV_ATTENTION_PROCESSORS,
+    CROSS_ATTENTION_PROCESSORS,
+    Attention,
+    AttentionProcessor,
+    AttnAddedKVProcessor,
+    AttnProcessor,
+    FusedAttnProcessor2_0,
+)
+from diffusers.models.modeling_outputs import AutoencoderKLOutput
+from diffusers.models.modeling_utils import ModelMixin
+from .vae import Decoder, DecoderOutput, DiagonalGaussianDistribution, Encoder
+
+
+class AutoencoderKL(ModelMixin, ConfigMixin, FromOriginalModelMixin, PeftAdapterMixin):
+    r"""
+    A VAE model with KL loss for encoding images into latents and decoding latent representations into images.
+
+    This model inherits from [`ModelMixin`]. Check the superclass documentation for its generic methods implemented
+    for all models (such as downloading or saving).
+
+    Parameters:
+        in_channels (int, *optional*, defaults to 3): Number of channels in the input image.
+        out_channels (int, *optional*, defaults to 3): Number of channels in the output.
+        down_block_types (`Tuple[str]`, *optional*, defaults to `("DownEncoderBlock2D",)`):
+            Tuple of downsample block types.
+ up_block_types (`Tuple[str]`, *optional*, defaults to `("UpDecoderBlock2D",)`): + Tuple of upsample block types. + block_out_channels (`Tuple[int]`, *optional*, defaults to `(64,)`): + Tuple of block output channels. + act_fn (`str`, *optional*, defaults to `"silu"`): The activation function to use. + latent_channels (`int`, *optional*, defaults to 4): Number of channels in the latent space. + sample_size (`int`, *optional*, defaults to `32`): Sample input size. + scaling_factor (`float`, *optional*, defaults to 0.18215): + The component-wise standard deviation of the trained latent space computed using the first batch of the + training set. This is used to scale the latent space to have unit variance when training the diffusion + model. The latents are scaled with the formula `z = z * scaling_factor` before being passed to the + diffusion model. When decoding, the latents are scaled back to the original scale with the formula: `z = 1 + / scaling_factor * z`. For more details, refer to sections 4.3.2 and D.1 of the [High-Resolution Image + Synthesis with Latent Diffusion Models](https://arxiv.org/abs/2112.10752) paper. + force_upcast (`bool`, *optional*, default to `True`): + If enabled it will force the VAE to run in float32 for high image resolution pipelines, such as SD-XL. VAE + can be fine-tuned / trained to a lower range without loosing too much precision in which case + `force_upcast` can be set to `False` - see: https://huggingface.co/madebyollin/sdxl-vae-fp16-fix + mid_block_add_attention (`bool`, *optional*, default to `True`): + If enabled, the mid_block of the Encoder and Decoder will have attention blocks. If set to false, the + mid_block will only have resnet blocks + """ + + _supports_gradient_checkpointing = True + _no_split_modules = ["BasicTransformerBlock", "ResnetBlock2D"] + + @register_to_config + def __init__( + self, + in_channels: int = 3, + out_channels: int = 3, + down_block_types: Tuple[str] = ("DownEncoderBlock2D",), + up_block_types: Tuple[str] = ("UpDecoderBlock2D",), + block_out_channels: Tuple[int] = (64,), + layers_per_block: int = 1, + act_fn: str = "silu", + latent_channels: int = 4, + norm_num_groups: int = 32, + sample_size: int = 32, + scaling_factor: float = 0.18215, + shift_factor: Optional[float] = None, + latents_mean: Optional[Tuple[float]] = None, + latents_std: Optional[Tuple[float]] = None, + force_upcast: float = True, + use_quant_conv: bool = True, + use_post_quant_conv: bool = True, + mid_block_add_attention: bool = True, + ): + super().__init__() + + # pass init params to Encoder + self.encoder = Encoder( + in_channels=in_channels, + out_channels=latent_channels, + down_block_types=down_block_types, + block_out_channels=block_out_channels, + layers_per_block=layers_per_block, + act_fn=act_fn, + norm_num_groups=norm_num_groups, + double_z=True, + mid_block_add_attention=mid_block_add_attention, + ) + + # pass init params to Decoder + self.decoder = Decoder( + in_channels=latent_channels, + out_channels=out_channels, + up_block_types=up_block_types, + block_out_channels=block_out_channels, + layers_per_block=layers_per_block, + norm_num_groups=norm_num_groups, + act_fn=act_fn, + mid_block_add_attention=mid_block_add_attention, + ) + + self.quant_conv = nn.Conv2d(2 * latent_channels, 2 * latent_channels, 1) if use_quant_conv else None + self.post_quant_conv = nn.Conv2d(latent_channels, latent_channels, 1) if use_post_quant_conv else None + + self.use_slicing = False + self.use_tiling = False + + # only relevant if vae tiling is enabled + 
self.tile_sample_min_size = self.config.sample_size + sample_size = ( + self.config.sample_size[0] + if isinstance(self.config.sample_size, (list, tuple)) + else self.config.sample_size + ) + self.tile_latent_min_size = int(sample_size / (2 ** (len(self.config.block_out_channels) - 1))) + self.tile_overlap_factor = 0.25 + + def _set_gradient_checkpointing(self, module, value=False): + if isinstance(module, (Encoder, Decoder)): + module.gradient_checkpointing = value + + def enable_tiling(self, use_tiling: bool = True): + r""" + Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to + compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow + processing larger images. + """ + self.use_tiling = use_tiling + + def disable_tiling(self): + r""" + Disable tiled VAE decoding. If `enable_tiling` was previously enabled, this method will go back to computing + decoding in one step. + """ + self.enable_tiling(False) + + def enable_slicing(self): + r""" + Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to + compute decoding in several steps. This is useful to save some memory and allow larger batch sizes. + """ + self.use_slicing = True + + def disable_slicing(self): + r""" + Disable sliced VAE decoding. If `enable_slicing` was previously enabled, this method will go back to computing + decoding in one step. + """ + self.use_slicing = False + + @property + # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.attn_processors + def attn_processors(self) -> Dict[str, AttentionProcessor]: + r""" + Returns: + `dict` of attention processors: A dictionary containing all attention processors used in the model with + indexed by its weight name. + """ + # set recursively + processors = {} + + def fn_recursive_add_processors(name: str, module: torch.nn.Module, processors: Dict[str, AttentionProcessor]): + if hasattr(module, "get_processor"): + processors[f"{name}.processor"] = module.get_processor() + + for sub_name, child in module.named_children(): + fn_recursive_add_processors(f"{name}.{sub_name}", child, processors) + + return processors + + for name, module in self.named_children(): + fn_recursive_add_processors(name, module, processors) + + return processors + + # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.set_attn_processor + def set_attn_processor(self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]]): + r""" + Sets the attention processor to use to compute attention. + + Parameters: + processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`): + The instantiated processor class or a dictionary of processor classes that will be set as the processor + for **all** `Attention` layers. + + If `processor` is a dict, the key needs to define the path to the corresponding cross attention + processor. This is strongly recommended when setting trainable attention processors. + + """ + count = len(self.attn_processors.keys()) + + if isinstance(processor, dict) and len(processor) != count: + raise ValueError( + f"A dict of processors was passed, but the number of processors {len(processor)} does not match the" + f" number of attention layers: {count}. Please make sure to pass {count} processor classes." 
+ ) + + def fn_recursive_attn_processor(name: str, module: torch.nn.Module, processor): + if hasattr(module, "set_processor"): + if not isinstance(processor, dict): + module.set_processor(processor) + else: + module.set_processor(processor.pop(f"{name}.processor")) + + for sub_name, child in module.named_children(): + fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor) + + for name, module in self.named_children(): + fn_recursive_attn_processor(name, module, processor) + + # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.set_default_attn_processor + def set_default_attn_processor(self): + """ + Disables custom attention processors and sets the default attention implementation. + """ + if all(proc.__class__ in ADDED_KV_ATTENTION_PROCESSORS for proc in self.attn_processors.values()): + processor = AttnAddedKVProcessor() + elif all(proc.__class__ in CROSS_ATTENTION_PROCESSORS for proc in self.attn_processors.values()): + processor = AttnProcessor() + else: + raise ValueError( + f"Cannot call `set_default_attn_processor` when attention processors are of type {next(iter(self.attn_processors.values()))}" + ) + + self.set_attn_processor(processor) + + def _encode(self, x: torch.Tensor) -> torch.Tensor: + batch_size, num_channels, height, width = x.shape + + if self.use_tiling and (width > self.tile_sample_min_size or height > self.tile_sample_min_size): + return self._tiled_encode(x) + + enc = self.encoder(x) + if self.quant_conv is not None: + enc = self.quant_conv(enc) + + return enc + + @apply_forward_hook + def encode( + self, x: torch.Tensor, return_dict: bool = True + ) -> Union[AutoencoderKLOutput, Tuple[DiagonalGaussianDistribution]]: + """ + Encode a batch of images into latents. + + Args: + x (`torch.Tensor`): Input batch of images. + return_dict (`bool`, *optional*, defaults to `True`): + Whether to return a [`~models.autoencoder_kl.AutoencoderKLOutput`] instead of a plain tuple. + + Returns: + The latent representations of the encoded images. If `return_dict` is True, a + [`~models.autoencoder_kl.AutoencoderKLOutput`] is returned, otherwise a plain `tuple` is returned. + """ + if self.use_slicing and x.shape[0] > 1: + encoded_slices = [self._encode(x_slice) for x_slice in x.split(1)] + h = torch.cat(encoded_slices) + else: + h = self._encode(x) + + posterior = DiagonalGaussianDistribution(h) + + if not return_dict: + return (posterior,) + + return AutoencoderKLOutput(latent_dist=posterior) + + def _decode(self, z: torch.Tensor, return_dict: bool = True) -> Union[DecoderOutput, torch.Tensor]: + if self.use_tiling and (z.shape[-1] > self.tile_latent_min_size or z.shape[-2] > self.tile_latent_min_size): + return self.tiled_decode(z, return_dict=return_dict) + + if self.post_quant_conv is not None: + z = self.post_quant_conv(z) + + dec = self.decoder(z) + + if not return_dict: + return (dec,) + + return DecoderOutput(sample=dec) + + @apply_forward_hook + def decode( + self, z: torch.FloatTensor, return_dict: bool = True, generator=None + ) -> Union[DecoderOutput, torch.FloatTensor]: + """ + Decode a batch of images. + + Args: + z (`torch.Tensor`): Input batch of latent vectors. + return_dict (`bool`, *optional*, defaults to `True`): + Whether to return a [`~models.vae.DecoderOutput`] instead of a plain tuple. + + Returns: + [`~models.vae.DecoderOutput`] or `tuple`: + If return_dict is True, a [`~models.vae.DecoderOutput`] is returned, otherwise a plain `tuple` is + returned. 
+ + """ + if self.use_slicing and z.shape[0] > 1: + decoded_slices = [self._decode(z_slice).sample for z_slice in z.split(1)] + decoded = torch.cat(decoded_slices) + else: + decoded = self._decode(z).sample + + if not return_dict: + return (decoded,) + + return DecoderOutput(sample=decoded) + + def blend_v(self, a: torch.Tensor, b: torch.Tensor, blend_extent: int) -> torch.Tensor: + blend_extent = min(a.shape[2], b.shape[2], blend_extent) + for y in range(blend_extent): + b[:, :, y, :] = a[:, :, -blend_extent + y, :] * (1 - y / blend_extent) + b[:, :, y, :] * (y / blend_extent) + return b + + def blend_h(self, a: torch.Tensor, b: torch.Tensor, blend_extent: int) -> torch.Tensor: + blend_extent = min(a.shape[3], b.shape[3], blend_extent) + for x in range(blend_extent): + b[:, :, :, x] = a[:, :, :, -blend_extent + x] * (1 - x / blend_extent) + b[:, :, :, x] * (x / blend_extent) + return b + + def _tiled_encode(self, x: torch.Tensor) -> torch.Tensor: + r"""Encode a batch of images using a tiled encoder. + + When this option is enabled, the VAE will split the input tensor into tiles to compute encoding in several + steps. This is useful to keep memory use constant regardless of image size. The end result of tiled encoding is + different from non-tiled encoding because each tile uses a different encoder. To avoid tiling artifacts, the + tiles overlap and are blended together to form a smooth output. You may still see tile-sized changes in the + output, but they should be much less noticeable. + + Args: + x (`torch.Tensor`): Input batch of images. + + Returns: + `torch.Tensor`: + The latent representation of the encoded videos. + """ + + overlap_size = int(self.tile_sample_min_size * (1 - self.tile_overlap_factor)) + blend_extent = int(self.tile_latent_min_size * self.tile_overlap_factor) + row_limit = self.tile_latent_min_size - blend_extent + + # Split the image into 512x512 tiles and encode them separately. + rows = [] + for i in range(0, x.shape[2], overlap_size): + row = [] + for j in range(0, x.shape[3], overlap_size): + tile = x[:, :, i : i + self.tile_sample_min_size, j : j + self.tile_sample_min_size] + tile = self.encoder(tile) + if self.config.use_quant_conv: + tile = self.quant_conv(tile) + row.append(tile) + rows.append(row) + result_rows = [] + for i, row in enumerate(rows): + result_row = [] + for j, tile in enumerate(row): + # blend the above tile and the left tile + # to the current tile and add the current tile to the result row + if i > 0: + tile = self.blend_v(rows[i - 1][j], tile, blend_extent) + if j > 0: + tile = self.blend_h(row[j - 1], tile, blend_extent) + result_row.append(tile[:, :, :row_limit, :row_limit]) + result_rows.append(torch.cat(result_row, dim=3)) + + enc = torch.cat(result_rows, dim=2) + return enc + + def tiled_encode(self, x: torch.Tensor, return_dict: bool = True) -> AutoencoderKLOutput: + r"""Encode a batch of images using a tiled encoder. + + When this option is enabled, the VAE will split the input tensor into tiles to compute encoding in several + steps. This is useful to keep memory use constant regardless of image size. The end result of tiled encoding is + different from non-tiled encoding because each tile uses a different encoder. To avoid tiling artifacts, the + tiles overlap and are blended together to form a smooth output. You may still see tile-sized changes in the + output, but they should be much less noticeable. + + Args: + x (`torch.Tensor`): Input batch of images. 
+ return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~models.autoencoder_kl.AutoencoderKLOutput`] instead of a plain tuple. + + Returns: + [`~models.autoencoder_kl.AutoencoderKLOutput`] or `tuple`: + If return_dict is True, a [`~models.autoencoder_kl.AutoencoderKLOutput`] is returned, otherwise a plain + `tuple` is returned. + """ + deprecation_message = ( + "The tiled_encode implementation supporting the `return_dict` parameter is deprecated. In the future, the " + "implementation of this method will be replaced with that of `_tiled_encode` and you will no longer be able " + "to pass `return_dict`. You will also have to create a `DiagonalGaussianDistribution()` from the returned value." + ) + deprecate("tiled_encode", "1.0.0", deprecation_message, standard_warn=False) + + overlap_size = int(self.tile_sample_min_size * (1 - self.tile_overlap_factor)) + blend_extent = int(self.tile_latent_min_size * self.tile_overlap_factor) + row_limit = self.tile_latent_min_size - blend_extent + + # Split the image into 512x512 tiles and encode them separately. + rows = [] + for i in range(0, x.shape[2], overlap_size): + row = [] + for j in range(0, x.shape[3], overlap_size): + tile = x[:, :, i : i + self.tile_sample_min_size, j : j + self.tile_sample_min_size] + tile = self.encoder(tile) + if self.config.use_quant_conv: + tile = self.quant_conv(tile) + row.append(tile) + rows.append(row) + result_rows = [] + for i, row in enumerate(rows): + result_row = [] + for j, tile in enumerate(row): + # blend the above tile and the left tile + # to the current tile and add the current tile to the result row + if i > 0: + tile = self.blend_v(rows[i - 1][j], tile, blend_extent) + if j > 0: + tile = self.blend_h(row[j - 1], tile, blend_extent) + result_row.append(tile[:, :, :row_limit, :row_limit]) + result_rows.append(torch.cat(result_row, dim=3)) + + moments = torch.cat(result_rows, dim=2) + posterior = DiagonalGaussianDistribution(moments) + + if not return_dict: + return (posterior,) + + return AutoencoderKLOutput(latent_dist=posterior) + + def tiled_decode(self, z: torch.Tensor, return_dict: bool = True) -> Union[DecoderOutput, torch.Tensor]: + r""" + Decode a batch of images using a tiled decoder. + + Args: + z (`torch.Tensor`): Input batch of latent vectors. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~models.vae.DecoderOutput`] instead of a plain tuple. + + Returns: + [`~models.vae.DecoderOutput`] or `tuple`: + If return_dict is True, a [`~models.vae.DecoderOutput`] is returned, otherwise a plain `tuple` is + returned. + """ + overlap_size = int(self.tile_latent_min_size * (1 - self.tile_overlap_factor)) + blend_extent = int(self.tile_sample_min_size * self.tile_overlap_factor) + row_limit = self.tile_sample_min_size - blend_extent + + # Split z into overlapping 64x64 tiles and decode them separately. + # The tiles have an overlap to avoid seams between tiles. 
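+        # Each decoded tile is then cross-faded with its top and left neighbours via `blend_v` /
+        # `blend_h` over `blend_extent` output pixels and cropped to `row_limit` before the rows
+        # are concatenated, so tile borders do not show up as hard seams.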
+ rows = [] + for i in range(0, z.shape[2], overlap_size): + row = [] + for j in range(0, z.shape[3], overlap_size): + tile = z[:, :, i : i + self.tile_latent_min_size, j : j + self.tile_latent_min_size] + if self.config.use_post_quant_conv: + tile = self.post_quant_conv(tile) + decoded = self.decoder(tile) + row.append(decoded) + rows.append(row) + result_rows = [] + for i, row in enumerate(rows): + result_row = [] + for j, tile in enumerate(row): + # blend the above tile and the left tile + # to the current tile and add the current tile to the result row + if i > 0: + tile = self.blend_v(rows[i - 1][j], tile, blend_extent) + if j > 0: + tile = self.blend_h(row[j - 1], tile, blend_extent) + result_row.append(tile[:, :, :row_limit, :row_limit]) + result_rows.append(torch.cat(result_row, dim=3)) + + dec = torch.cat(result_rows, dim=2) + if not return_dict: + return (dec,) + + return DecoderOutput(sample=dec) + + def forward( + self, + sample: torch.Tensor, + sample_posterior: bool = False, + return_dict: bool = True, + generator: Optional[torch.Generator] = None, + ) -> Union[DecoderOutput, torch.Tensor]: + r""" + Args: + sample (`torch.Tensor`): Input sample. + sample_posterior (`bool`, *optional*, defaults to `False`): + Whether to sample from the posterior. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`DecoderOutput`] instead of a plain tuple. + """ + x = sample + posterior = self.encode(x).latent_dist + if sample_posterior: + z = posterior.sample(generator=generator) + else: + z = posterior.mode() + dec = self.decode(z).sample + + if not return_dict: + return (dec,) + + return DecoderOutput(sample=dec) + + # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.fuse_qkv_projections + def fuse_qkv_projections(self): + """ + Enables fused QKV projections. For self-attention modules, all projection matrices (i.e., query, key, value) + are fused. For cross-attention modules, key and value projection matrices are fused. + + + + This API is 🧪 experimental. + + + """ + self.original_attn_processors = None + + for _, attn_processor in self.attn_processors.items(): + if "Added" in str(attn_processor.__class__.__name__): + raise ValueError("`fuse_qkv_projections()` is not supported for models having added KV projections.") + + self.original_attn_processors = self.attn_processors + + for module in self.modules(): + if isinstance(module, Attention): + module.fuse_projections(fuse=True) + + self.set_attn_processor(FusedAttnProcessor2_0()) + + # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.unfuse_qkv_projections + def unfuse_qkv_projections(self): + """Disables the fused QKV projection if enabled. + + + + This API is 🧪 experimental. + + + + """ + if self.original_attn_processors is not None: + self.set_attn_processor(self.original_attn_processors) \ No newline at end of file diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/vae/vae.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/vae/vae.py new file mode 100644 index 0000000000..006ed75f1f --- /dev/null +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/vae/vae.py @@ -0,0 +1,995 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from dataclasses import dataclass +from typing import Optional, Tuple + +import numpy as np +import torch +import torch.nn as nn + +from diffusers.utils import BaseOutput, is_torch_version +from diffusers.utils.torch_utils import randn_tensor +from diffusers.models.activations import get_activation +from diffusers.models.attention_processor import SpatialNorm +from diffusers.models.unets.unet_2d_blocks import ( + AutoencoderTinyBlock, + UNetMidBlock2D, + get_down_block, + get_up_block, +) + + +@dataclass +class EncoderOutput(BaseOutput): + r""" + Output of encoding method. + + Args: + latent (`torch.Tensor` of shape `(batch_size, num_channels, latent_height, latent_width)`): + The encoded latent. + """ + + latent: torch.Tensor + + +@dataclass +class DecoderOutput(BaseOutput): + r""" + Output of decoding method. + + Args: + sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)`): + The decoded output sample from the last layer of the model. + """ + + sample: torch.Tensor + commit_loss: Optional[torch.FloatTensor] = None + + +class Encoder(nn.Module): + r""" + The `Encoder` layer of a variational autoencoder that encodes its input into a latent representation. + + Args: + in_channels (`int`, *optional*, defaults to 3): + The number of input channels. + out_channels (`int`, *optional*, defaults to 3): + The number of output channels. + down_block_types (`Tuple[str, ...]`, *optional*, defaults to `("DownEncoderBlock2D",)`): + The types of down blocks to use. See `~diffusers.models.unet_2d_blocks.get_down_block` for available + options. + block_out_channels (`Tuple[int, ...]`, *optional*, defaults to `(64,)`): + The number of output channels for each block. + layers_per_block (`int`, *optional*, defaults to 2): + The number of layers per block. + norm_num_groups (`int`, *optional*, defaults to 32): + The number of groups for normalization. + act_fn (`str`, *optional*, defaults to `"silu"`): + The activation function to use. See `~diffusers.models.activations.get_activation` for available options. + double_z (`bool`, *optional*, defaults to `True`): + Whether to double the number of output channels for the last block. + """ + + def __init__( + self, + in_channels: int = 3, + out_channels: int = 3, + down_block_types: Tuple[str, ...] = ("DownEncoderBlock2D",), + block_out_channels: Tuple[int, ...] 
= (64,), + layers_per_block: int = 2, + norm_num_groups: int = 32, + act_fn: str = "silu", + double_z: bool = True, + mid_block_add_attention=True, + ): + super().__init__() + self.layers_per_block = layers_per_block + + self.conv_in = nn.Conv2d( + in_channels, + block_out_channels[0], + kernel_size=3, + stride=1, + padding=1, + ) + + self.down_blocks = nn.ModuleList([]) + + # down + output_channel = block_out_channels[0] + for i, down_block_type in enumerate(down_block_types): + input_channel = output_channel + output_channel = block_out_channels[i] + is_final_block = i == len(block_out_channels) - 1 + + down_block = get_down_block( + down_block_type, + num_layers=self.layers_per_block, + in_channels=input_channel, + out_channels=output_channel, + add_downsample=not is_final_block, + resnet_eps=1e-6, + downsample_padding=0, + resnet_act_fn=act_fn, + resnet_groups=norm_num_groups, + attention_head_dim=output_channel, + temb_channels=None, + ) + self.down_blocks.append(down_block) + + # mid + self.mid_block = UNetMidBlock2D( + in_channels=block_out_channels[-1], + resnet_eps=1e-6, + resnet_act_fn=act_fn, + output_scale_factor=1, + resnet_time_scale_shift="default", + attention_head_dim=block_out_channels[-1], + resnet_groups=norm_num_groups, + temb_channels=None, + add_attention=mid_block_add_attention, + ) + + # out + self.conv_norm_out = nn.GroupNorm(num_channels=block_out_channels[-1], num_groups=norm_num_groups, eps=1e-6) + self.conv_act = nn.SiLU() + + conv_out_channels = 2 * out_channels if double_z else out_channels + self.conv_out = nn.Conv2d(block_out_channels[-1], conv_out_channels, 3, padding=1) + + self.gradient_checkpointing = False + + def forward(self, sample: torch.Tensor) -> torch.Tensor: + r"""The forward method of the `Encoder` class.""" + + sample = self.conv_in(sample) + + if torch.is_grad_enabled() and self.gradient_checkpointing: + + def create_custom_forward(module): + def custom_forward(*inputs): + return module(*inputs) + + return custom_forward + + # down + if is_torch_version(">=", "1.11.0"): + for down_block in self.down_blocks: + sample = torch.utils.checkpoint.checkpoint( + create_custom_forward(down_block), sample, use_reentrant=False + ) + # middle + sample = torch.utils.checkpoint.checkpoint( + create_custom_forward(self.mid_block), sample, use_reentrant=False + ) + else: + for down_block in self.down_blocks: + sample = torch.utils.checkpoint.checkpoint(create_custom_forward(down_block), sample) + # middle + sample = torch.utils.checkpoint.checkpoint(create_custom_forward(self.mid_block), sample) + + else: + # down + for down_block in self.down_blocks: + sample = down_block(sample) + + # middle + sample = self.mid_block(sample) + + # post-process + sample = self.conv_norm_out(sample) + sample = self.conv_act(sample) + sample = self.conv_out(sample) + + return sample + + +class Decoder(nn.Module): + r""" + The `Decoder` layer of a variational autoencoder that decodes its latent representation into an output sample. + + Args: + in_channels (`int`, *optional*, defaults to 3): + The number of input channels. + out_channels (`int`, *optional*, defaults to 3): + The number of output channels. + up_block_types (`Tuple[str, ...]`, *optional*, defaults to `("UpDecoderBlock2D",)`): + The types of up blocks to use. See `~diffusers.models.unet_2d_blocks.get_up_block` for available options. + block_out_channels (`Tuple[int, ...]`, *optional*, defaults to `(64,)`): + The number of output channels for each block. 
+ layers_per_block (`int`, *optional*, defaults to 2): + The number of layers per block. + norm_num_groups (`int`, *optional*, defaults to 32): + The number of groups for normalization. + act_fn (`str`, *optional*, defaults to `"silu"`): + The activation function to use. See `~diffusers.models.activations.get_activation` for available options. + norm_type (`str`, *optional*, defaults to `"group"`): + The normalization type to use. Can be either `"group"` or `"spatial"`. + """ + + def __init__( + self, + in_channels: int = 3, + out_channels: int = 3, + up_block_types: Tuple[str, ...] = ("UpDecoderBlock2D",), + block_out_channels: Tuple[int, ...] = (64,), + layers_per_block: int = 2, + norm_num_groups: int = 32, + act_fn: str = "silu", + norm_type: str = "group", # group, spatial + mid_block_add_attention=True, + ): + super().__init__() + self.layers_per_block = layers_per_block + + self.conv_in = nn.Conv2d( + in_channels, + block_out_channels[-1], + kernel_size=3, + stride=1, + padding=1, + ) + + self.up_blocks = nn.ModuleList([]) + + temb_channels = in_channels if norm_type == "spatial" else None + + # mid + self.mid_block = UNetMidBlock2D( + in_channels=block_out_channels[-1], + resnet_eps=1e-6, + resnet_act_fn=act_fn, + output_scale_factor=1, + resnet_time_scale_shift="default" if norm_type == "group" else norm_type, + attention_head_dim=block_out_channels[-1], + resnet_groups=norm_num_groups, + temb_channels=temb_channels, + add_attention=mid_block_add_attention, + ) + + # up + reversed_block_out_channels = list(reversed(block_out_channels)) + output_channel = reversed_block_out_channels[0] + for i, up_block_type in enumerate(up_block_types): + prev_output_channel = output_channel + output_channel = reversed_block_out_channels[i] + + is_final_block = i == len(block_out_channels) - 1 + + up_block = get_up_block( + up_block_type, + num_layers=self.layers_per_block + 1, + in_channels=prev_output_channel, + out_channels=output_channel, + prev_output_channel=None, + add_upsample=not is_final_block, + resnet_eps=1e-6, + resnet_act_fn=act_fn, + resnet_groups=norm_num_groups, + attention_head_dim=output_channel, + temb_channels=temb_channels, + resnet_time_scale_shift=norm_type, + ) + self.up_blocks.append(up_block) + prev_output_channel = output_channel + + # out + if norm_type == "spatial": + self.conv_norm_out = SpatialNorm(block_out_channels[0], temb_channels) + else: + self.conv_norm_out = nn.GroupNorm(num_channels=block_out_channels[0], num_groups=norm_num_groups, eps=1e-6) + self.conv_act = nn.SiLU() + self.conv_out = nn.Conv2d(block_out_channels[0], out_channels, 3, padding=1) + + self.gradient_checkpointing = False + + def forward( + self, + sample: torch.Tensor, + latent_embeds: Optional[torch.Tensor] = None, + ) -> torch.Tensor: + r"""The forward method of the `Decoder` class.""" + + sample = self.conv_in(sample) + + upscale_dtype = next(iter(self.up_blocks.parameters())).dtype + if torch.is_grad_enabled() and self.gradient_checkpointing: + + def create_custom_forward(module): + def custom_forward(*inputs): + return module(*inputs) + + return custom_forward + + if is_torch_version(">=", "1.11.0"): + # middle + sample = torch.utils.checkpoint.checkpoint( + create_custom_forward(self.mid_block), + sample, + latent_embeds, + use_reentrant=False, + ) + sample = sample.to(upscale_dtype) + + # up + for up_block in self.up_blocks: + sample = torch.utils.checkpoint.checkpoint( + create_custom_forward(up_block), + sample, + latent_embeds, + use_reentrant=False, + ) + else: + # middle + sample 
= torch.utils.checkpoint.checkpoint( + create_custom_forward(self.mid_block), sample, latent_embeds + ) + sample = sample.to(upscale_dtype) + + # up + for up_block in self.up_blocks: + sample = torch.utils.checkpoint.checkpoint(create_custom_forward(up_block), sample, latent_embeds) + else: + # middle + sample = self.mid_block(sample, latent_embeds) + sample = sample.to(upscale_dtype) + + # up + for up_block in self.up_blocks: + sample = up_block(sample, latent_embeds) + + # post-process + if latent_embeds is None: + sample = self.conv_norm_out(sample) + else: + sample = self.conv_norm_out(sample, latent_embeds) + sample = self.conv_act(sample) + sample = self.conv_out(sample) + + return sample + + +class UpSample(nn.Module): + r""" + The `UpSample` layer of a variational autoencoder that upsamples its input. + + Args: + in_channels (`int`, *optional*, defaults to 3): + The number of input channels. + out_channels (`int`, *optional*, defaults to 3): + The number of output channels. + """ + + def __init__( + self, + in_channels: int, + out_channels: int, + ) -> None: + super().__init__() + self.in_channels = in_channels + self.out_channels = out_channels + self.deconv = nn.ConvTranspose2d(in_channels, out_channels, kernel_size=4, stride=2, padding=1) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + r"""The forward method of the `UpSample` class.""" + x = torch.relu(x) + x = self.deconv(x) + return x + + +class MaskConditionEncoder(nn.Module): + """ + used in AsymmetricAutoencoderKL + """ + + def __init__( + self, + in_ch: int, + out_ch: int = 192, + res_ch: int = 768, + stride: int = 16, + ) -> None: + super().__init__() + + channels = [] + while stride > 1: + stride = stride // 2 + in_ch_ = out_ch * 2 + if out_ch > res_ch: + out_ch = res_ch + if stride == 1: + in_ch_ = res_ch + channels.append((in_ch_, out_ch)) + out_ch *= 2 + + out_channels = [] + for _in_ch, _out_ch in channels: + out_channels.append(_out_ch) + out_channels.append(channels[-1][0]) + + layers = [] + in_ch_ = in_ch + for l in range(len(out_channels)): + out_ch_ = out_channels[l] + if l == 0 or l == 1: + layers.append(nn.Conv2d(in_ch_, out_ch_, kernel_size=3, stride=1, padding=1)) + else: + layers.append(nn.Conv2d(in_ch_, out_ch_, kernel_size=4, stride=2, padding=1)) + in_ch_ = out_ch_ + + self.layers = nn.Sequential(*layers) + + def forward(self, x: torch.Tensor, mask=None) -> torch.Tensor: + r"""The forward method of the `MaskConditionEncoder` class.""" + out = {} + for l in range(len(self.layers)): + layer = self.layers[l] + x = layer(x) + out[str(tuple(x.shape))] = x + x = torch.relu(x) + return out + + +class MaskConditionDecoder(nn.Module): + r"""The `MaskConditionDecoder` should be used in combination with [`AsymmetricAutoencoderKL`] to enhance the model's + decoder with a conditioner on the mask and masked image. + + Args: + in_channels (`int`, *optional*, defaults to 3): + The number of input channels. + out_channels (`int`, *optional*, defaults to 3): + The number of output channels. + up_block_types (`Tuple[str, ...]`, *optional*, defaults to `("UpDecoderBlock2D",)`): + The types of up blocks to use. See `~diffusers.models.unet_2d_blocks.get_up_block` for available options. + block_out_channels (`Tuple[int, ...]`, *optional*, defaults to `(64,)`): + The number of output channels for each block. + layers_per_block (`int`, *optional*, defaults to 2): + The number of layers per block. + norm_num_groups (`int`, *optional*, defaults to 32): + The number of groups for normalization. 
+ act_fn (`str`, *optional*, defaults to `"silu"`): + The activation function to use. See `~diffusers.models.activations.get_activation` for available options. + norm_type (`str`, *optional*, defaults to `"group"`): + The normalization type to use. Can be either `"group"` or `"spatial"`. + """ + + def __init__( + self, + in_channels: int = 3, + out_channels: int = 3, + up_block_types: Tuple[str, ...] = ("UpDecoderBlock2D",), + block_out_channels: Tuple[int, ...] = (64,), + layers_per_block: int = 2, + norm_num_groups: int = 32, + act_fn: str = "silu", + norm_type: str = "group", # group, spatial + ): + super().__init__() + self.layers_per_block = layers_per_block + + self.conv_in = nn.Conv2d( + in_channels, + block_out_channels[-1], + kernel_size=3, + stride=1, + padding=1, + ) + + self.up_blocks = nn.ModuleList([]) + + temb_channels = in_channels if norm_type == "spatial" else None + + # mid + self.mid_block = UNetMidBlock2D( + in_channels=block_out_channels[-1], + resnet_eps=1e-6, + resnet_act_fn=act_fn, + output_scale_factor=1, + resnet_time_scale_shift="default" if norm_type == "group" else norm_type, + attention_head_dim=block_out_channels[-1], + resnet_groups=norm_num_groups, + temb_channels=temb_channels, + ) + + # up + reversed_block_out_channels = list(reversed(block_out_channels)) + output_channel = reversed_block_out_channels[0] + for i, up_block_type in enumerate(up_block_types): + prev_output_channel = output_channel + output_channel = reversed_block_out_channels[i] + + is_final_block = i == len(block_out_channels) - 1 + + up_block = get_up_block( + up_block_type, + num_layers=self.layers_per_block + 1, + in_channels=prev_output_channel, + out_channels=output_channel, + prev_output_channel=None, + add_upsample=not is_final_block, + resnet_eps=1e-6, + resnet_act_fn=act_fn, + resnet_groups=norm_num_groups, + attention_head_dim=output_channel, + temb_channels=temb_channels, + resnet_time_scale_shift=norm_type, + ) + self.up_blocks.append(up_block) + prev_output_channel = output_channel + + # condition encoder + self.condition_encoder = MaskConditionEncoder( + in_ch=out_channels, + out_ch=block_out_channels[0], + res_ch=block_out_channels[-1], + ) + + # out + if norm_type == "spatial": + self.conv_norm_out = SpatialNorm(block_out_channels[0], temb_channels) + else: + self.conv_norm_out = nn.GroupNorm(num_channels=block_out_channels[0], num_groups=norm_num_groups, eps=1e-6) + self.conv_act = nn.SiLU() + self.conv_out = nn.Conv2d(block_out_channels[0], out_channels, 3, padding=1) + + self.gradient_checkpointing = False + + def forward( + self, + z: torch.Tensor, + image: Optional[torch.Tensor] = None, + mask: Optional[torch.Tensor] = None, + latent_embeds: Optional[torch.Tensor] = None, + ) -> torch.Tensor: + r"""The forward method of the `MaskConditionDecoder` class.""" + sample = z + sample = self.conv_in(sample) + + upscale_dtype = next(iter(self.up_blocks.parameters())).dtype + if torch.is_grad_enabled() and self.gradient_checkpointing: + + def create_custom_forward(module): + def custom_forward(*inputs): + return module(*inputs) + + return custom_forward + + if is_torch_version(">=", "1.11.0"): + # middle + sample = torch.utils.checkpoint.checkpoint( + create_custom_forward(self.mid_block), + sample, + latent_embeds, + use_reentrant=False, + ) + sample = sample.to(upscale_dtype) + + # condition encoder + if image is not None and mask is not None: + masked_image = (1 - mask) * image + im_x = torch.utils.checkpoint.checkpoint( + create_custom_forward(self.condition_encoder), + 
masked_image, + mask, + use_reentrant=False, + ) + + # up + for up_block in self.up_blocks: + if image is not None and mask is not None: + sample_ = im_x[str(tuple(sample.shape))] + mask_ = nn.functional.interpolate(mask, size=sample.shape[-2:], mode="nearest") + sample = sample * mask_ + sample_ * (1 - mask_) + sample = torch.utils.checkpoint.checkpoint( + create_custom_forward(up_block), + sample, + latent_embeds, + use_reentrant=False, + ) + if image is not None and mask is not None: + sample = sample * mask + im_x[str(tuple(sample.shape))] * (1 - mask) + else: + # middle + sample = torch.utils.checkpoint.checkpoint( + create_custom_forward(self.mid_block), sample, latent_embeds + ) + sample = sample.to(upscale_dtype) + + # condition encoder + if image is not None and mask is not None: + masked_image = (1 - mask) * image + im_x = torch.utils.checkpoint.checkpoint( + create_custom_forward(self.condition_encoder), + masked_image, + mask, + ) + + # up + for up_block in self.up_blocks: + if image is not None and mask is not None: + sample_ = im_x[str(tuple(sample.shape))] + mask_ = nn.functional.interpolate(mask, size=sample.shape[-2:], mode="nearest") + sample = sample * mask_ + sample_ * (1 - mask_) + sample = torch.utils.checkpoint.checkpoint(create_custom_forward(up_block), sample, latent_embeds) + if image is not None and mask is not None: + sample = sample * mask + im_x[str(tuple(sample.shape))] * (1 - mask) + else: + # middle + sample = self.mid_block(sample, latent_embeds) + sample = sample.to(upscale_dtype) + + # condition encoder + if image is not None and mask is not None: + masked_image = (1 - mask) * image + im_x = self.condition_encoder(masked_image, mask) + + # up + for up_block in self.up_blocks: + if image is not None and mask is not None: + sample_ = im_x[str(tuple(sample.shape))] + mask_ = nn.functional.interpolate(mask, size=sample.shape[-2:], mode="nearest") + sample = sample * mask_ + sample_ * (1 - mask_) + sample = up_block(sample, latent_embeds) + if image is not None and mask is not None: + sample = sample * mask + im_x[str(tuple(sample.shape))] * (1 - mask) + + # post-process + if latent_embeds is None: + sample = self.conv_norm_out(sample) + else: + sample = self.conv_norm_out(sample, latent_embeds) + sample = self.conv_act(sample) + sample = self.conv_out(sample) + + return sample + + +class VectorQuantizer(nn.Module): + """ + Improved version over VectorQuantizer, can be used as a drop-in replacement. Mostly avoids costly matrix + multiplications and allows for post-hoc remapping of indices. + """ + + # NOTE: due to a bug the beta term was applied to the wrong term. for + # backwards compatibility we use the buggy version by default, but you can + # specify legacy=False to fix it. 
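+    # To make the NOTE above concrete, the commitment loss computed in forward()
+    # is, with sg[.] denoting stop-gradient (.detach()):
+    #   legacy=True  (default): loss = mean((sg[z_q] - z)^2) + beta * mean((z_q - sg[z])^2)
+    #   legacy=False:           loss = beta * mean((sg[z_q] - z)^2) + mean((z_q - sg[z])^2)
+    # i.e. the flag only controls which of the two terms is scaled by beta.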
+ def __init__( + self, + n_e: int, + vq_embed_dim: int, + beta: float, + remap=None, + unknown_index: str = "random", + sane_index_shape: bool = False, + legacy: bool = True, + ): + super().__init__() + self.n_e = n_e + self.vq_embed_dim = vq_embed_dim + self.beta = beta + self.legacy = legacy + + self.embedding = nn.Embedding(self.n_e, self.vq_embed_dim) + self.embedding.weight.data.uniform_(-1.0 / self.n_e, 1.0 / self.n_e) + + self.remap = remap + if self.remap is not None: + self.register_buffer("used", torch.tensor(np.load(self.remap))) + self.used: torch.Tensor + self.re_embed = self.used.shape[0] + self.unknown_index = unknown_index # "random" or "extra" or integer + if self.unknown_index == "extra": + self.unknown_index = self.re_embed + self.re_embed = self.re_embed + 1 + print( + f"Remapping {self.n_e} indices to {self.re_embed} indices. " + f"Using {self.unknown_index} for unknown indices." + ) + else: + self.re_embed = n_e + + self.sane_index_shape = sane_index_shape + + def remap_to_used(self, inds: torch.LongTensor) -> torch.LongTensor: + ishape = inds.shape + assert len(ishape) > 1 + inds = inds.reshape(ishape[0], -1) + used = self.used.to(inds) + match = (inds[:, :, None] == used[None, None, ...]).long() + new = match.argmax(-1) + unknown = match.sum(2) < 1 + if self.unknown_index == "random": + new[unknown] = torch.randint(0, self.re_embed, size=new[unknown].shape).to(device=new.device) + else: + new[unknown] = self.unknown_index + return new.reshape(ishape) + + def unmap_to_all(self, inds: torch.LongTensor) -> torch.LongTensor: + ishape = inds.shape + assert len(ishape) > 1 + inds = inds.reshape(ishape[0], -1) + used = self.used.to(inds) + if self.re_embed > self.used.shape[0]: # extra token + inds[inds >= self.used.shape[0]] = 0 # simply set to zero + back = torch.gather(used[None, :][inds.shape[0] * [0], :], 1, inds) + return back.reshape(ishape) + + def forward(self, z: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, Tuple]: + # reshape z -> (batch, height, width, channel) and flatten + z = z.permute(0, 2, 3, 1).contiguous() + z_flattened = z.view(-1, self.vq_embed_dim) + + # distances from z to embeddings e_j (z - e)^2 = z^2 + e^2 - 2 e * z + min_encoding_indices = torch.argmin(torch.cdist(z_flattened, self.embedding.weight), dim=1) + + z_q = self.embedding(min_encoding_indices).view(z.shape) + perplexity = None + min_encodings = None + + # compute loss for embedding + if not self.legacy: + loss = self.beta * torch.mean((z_q.detach() - z) ** 2) + torch.mean((z_q - z.detach()) ** 2) + else: + loss = torch.mean((z_q.detach() - z) ** 2) + self.beta * torch.mean((z_q - z.detach()) ** 2) + + # preserve gradients + z_q: torch.Tensor = z + (z_q - z).detach() + + # reshape back to match original input shape + z_q = z_q.permute(0, 3, 1, 2).contiguous() + + if self.remap is not None: + min_encoding_indices = min_encoding_indices.reshape(z.shape[0], -1) # add batch axis + min_encoding_indices = self.remap_to_used(min_encoding_indices) + min_encoding_indices = min_encoding_indices.reshape(-1, 1) # flatten + + if self.sane_index_shape: + min_encoding_indices = min_encoding_indices.reshape(z_q.shape[0], z_q.shape[2], z_q.shape[3]) + + return z_q, loss, (perplexity, min_encodings, min_encoding_indices) + + def get_codebook_entry(self, indices: torch.LongTensor, shape: Tuple[int, ...]) -> torch.Tensor: + # shape specifying (batch, height, width, channel) + if self.remap is not None: + indices = indices.reshape(shape[0], -1) # add batch axis + indices = self.unmap_to_all(indices) + 
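+            # unmap_to_all() translates indices over the remapped ("used") codebook
+            # back into indices of the full embedding table before the lookup below.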
indices = indices.reshape(-1) # flatten again + + # get quantized latent vectors + z_q: torch.Tensor = self.embedding(indices) + + if shape is not None: + z_q = z_q.view(shape) + # reshape back to match original input shape + z_q = z_q.permute(0, 3, 1, 2).contiguous() + + return z_q + + +class DiagonalGaussianDistribution(object): + def __init__(self, parameters: torch.Tensor, deterministic: bool = False): + self.parameters = parameters + self.mean, self.logvar = torch.chunk(parameters, 2, dim=1) + self.logvar = torch.clamp(self.logvar, -30.0, 20.0) + self.deterministic = deterministic + self.std = torch.exp(0.5 * self.logvar) + self.var = torch.exp(self.logvar) + if self.deterministic: + self.var = self.std = torch.zeros_like( + self.mean, device=self.parameters.device, dtype=self.parameters.dtype + ) + + def sample(self, generator: Optional[torch.Generator] = None) -> torch.Tensor: + # make sure sample is on the same device as the parameters and has same dtype + sample = randn_tensor( + self.mean.shape, + generator=generator, + device=self.parameters.device, + dtype=self.parameters.dtype, + ) + x = self.mean + self.std * sample + return x + + def kl(self, other: "DiagonalGaussianDistribution" = None) -> torch.Tensor: + if self.deterministic: + return torch.Tensor([0.0]) + else: + if other is None: + return 0.5 * torch.sum( + torch.pow(self.mean, 2) + self.var - 1.0 - self.logvar, + dim=[1, 2, 3], + ) + else: + return 0.5 * torch.sum( + torch.pow(self.mean - other.mean, 2) / other.var + + self.var / other.var + - 1.0 + - self.logvar + + other.logvar, + dim=[1, 2, 3], + ) + + def nll(self, sample: torch.Tensor, dims: Tuple[int, ...] = [1, 2, 3]) -> torch.Tensor: + if self.deterministic: + return torch.Tensor([0.0]) + logtwopi = np.log(2.0 * np.pi) + return 0.5 * torch.sum( + logtwopi + self.logvar + torch.pow(sample - self.mean, 2) / self.var, + dim=dims, + ) + + def mode(self) -> torch.Tensor: + return self.mean + + +class EncoderTiny(nn.Module): + r""" + The `EncoderTiny` layer is a simpler version of the `Encoder` layer. + + Args: + in_channels (`int`): + The number of input channels. + out_channels (`int`): + The number of output channels. + num_blocks (`Tuple[int, ...]`): + Each value of the tuple represents a Conv2d layer followed by `value` number of `AutoencoderTinyBlock`'s to + use. + block_out_channels (`Tuple[int, ...]`): + The number of output channels for each block. + act_fn (`str`): + The activation function to use. See `~diffusers.models.activations.get_activation` for available options. 
+ """ + + def __init__( + self, + in_channels: int, + out_channels: int, + num_blocks: Tuple[int, ...], + block_out_channels: Tuple[int, ...], + act_fn: str, + ): + super().__init__() + + layers = [] + for i, num_block in enumerate(num_blocks): + num_channels = block_out_channels[i] + + if i == 0: + layers.append(nn.Conv2d(in_channels, num_channels, kernel_size=3, padding=1)) + else: + layers.append( + nn.Conv2d( + num_channels, + num_channels, + kernel_size=3, + padding=1, + stride=2, + bias=False, + ) + ) + + for _ in range(num_block): + layers.append(AutoencoderTinyBlock(num_channels, num_channels, act_fn)) + + layers.append(nn.Conv2d(block_out_channels[-1], out_channels, kernel_size=3, padding=1)) + + self.layers = nn.Sequential(*layers) + self.gradient_checkpointing = False + + def forward(self, x: torch.Tensor) -> torch.Tensor: + r"""The forward method of the `EncoderTiny` class.""" + if torch.is_grad_enabled() and self.gradient_checkpointing: + + def create_custom_forward(module): + def custom_forward(*inputs): + return module(*inputs) + + return custom_forward + + if is_torch_version(">=", "1.11.0"): + x = torch.utils.checkpoint.checkpoint(create_custom_forward(self.layers), x, use_reentrant=False) + else: + x = torch.utils.checkpoint.checkpoint(create_custom_forward(self.layers), x) + + else: + # scale image from [-1, 1] to [0, 1] to match TAESD convention + x = self.layers(x.add(1).div(2)) + + return x + + +class DecoderTiny(nn.Module): + r""" + The `DecoderTiny` layer is a simpler version of the `Decoder` layer. + + Args: + in_channels (`int`): + The number of input channels. + out_channels (`int`): + The number of output channels. + num_blocks (`Tuple[int, ...]`): + Each value of the tuple represents a Conv2d layer followed by `value` number of `AutoencoderTinyBlock`'s to + use. + block_out_channels (`Tuple[int, ...]`): + The number of output channels for each block. + upsampling_scaling_factor (`int`): + The scaling factor to use for upsampling. + act_fn (`str`): + The activation function to use. See `~diffusers.models.activations.get_activation` for available options. + """ + + def __init__( + self, + in_channels: int, + out_channels: int, + num_blocks: Tuple[int, ...], + block_out_channels: Tuple[int, ...], + upsampling_scaling_factor: int, + act_fn: str, + upsample_fn: str, + ): + super().__init__() + + layers = [ + nn.Conv2d(in_channels, block_out_channels[0], kernel_size=3, padding=1), + get_activation(act_fn), + ] + + for i, num_block in enumerate(num_blocks): + is_final_block = i == (len(num_blocks) - 1) + num_channels = block_out_channels[i] + + for _ in range(num_block): + layers.append(AutoencoderTinyBlock(num_channels, num_channels, act_fn)) + + if not is_final_block: + layers.append(nn.Upsample(scale_factor=upsampling_scaling_factor, mode=upsample_fn)) + + conv_out_channel = num_channels if not is_final_block else out_channels + layers.append( + nn.Conv2d( + num_channels, + conv_out_channel, + kernel_size=3, + padding=1, + bias=is_final_block, + ) + ) + + self.layers = nn.Sequential(*layers) + self.gradient_checkpointing = False + + def forward(self, x: torch.Tensor) -> torch.Tensor: + r"""The forward method of the `DecoderTiny` class.""" + # Clamp. 
+ x = torch.tanh(x / 3) * 3 + + if torch.is_grad_enabled() and self.gradient_checkpointing: + + def create_custom_forward(module): + def custom_forward(*inputs): + return module(*inputs) + + return custom_forward + + if is_torch_version(">=", "1.11.0"): + x = torch.utils.checkpoint.checkpoint(create_custom_forward(self.layers), x, use_reentrant=False) + else: + x = torch.utils.checkpoint.checkpoint(create_custom_forward(self.layers), x) + + else: + x = self.layers(x) + + # scale image from [0, 1] to [-1, 1] to match diffusers convention + return x.mul(2).sub(1) \ No newline at end of file diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/inference_cogview3plus.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/inference_cogview3plus.py new file mode 100644 index 0000000000..33d0c9aeed --- /dev/null +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/inference_cogview3plus.py @@ -0,0 +1,99 @@ +#!/usr/bin/env python +# coding=utf-8 +# Copyright 2024 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse +import logging +import torch + +from cogview3plus import CogView3PlusPipeline + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + + +def parse_arguments(): + parser = argparse.ArgumentParser(description="Generate an image using the CogView3-Plus-3B model.") + + # Define arguments for prompt, model path, etc. + parser.add_argument( + "--prompt", + type=list, + default=[ + "A vibrant cherry red sports car sits proudly under the gleaming sun, its polished exterior smooth and flawless, casting a mirror-like reflection. The car features a low, aerodynamic body, angular headlights that gaze forward like predatory eyes, and a set of black, high-gloss racing rims that contrast starkly with the red. A subtle hint of chrome embellishes the grille and exhaust, while the tinted windows suggest a luxurious and private interior. The scene conveys a sense of speed and elegance, the car appearing as if it's about to burst into a sprint along a coastal road, with the ocean's azure waves crashing in the background." + ], + help="The text description for generating the image." + ) + parser.add_argument( + "--model_path", type=str, default="/data/CogView3B", help="Path to the pre-trained model." + ) + parser.add_argument( + "--guidance_scale", type=float, default=7.0, help="The guidance scale for classifier-free guidance." + ) + parser.add_argument( + "--num_images_per_prompt", type=int, default=1, help="Number of images to generate per prompt." 
+ ) + parser.add_argument("--num_inference_steps", type=int, default=50, help="Number of denoising steps for inference.") + parser.add_argument("--width", type=int, default=1024, help="Width of the generated image.") + parser.add_argument("--height", type=int, default=1024, help="Height of the generated image.") + parser.add_argument("--output_path", type=str, default="cogview3.png", help="Path to save the generated image.") + parser.add_argument("--dtype", type=str, default="bf16", help="bf16 or fp16") + parser.add_argument("--device_id", type=int, default=2, help="NPU device id") + + return parser.parse_args() + + +def generate_image( + prompt, model_path, guidance_scale, num_images_per_prompt, num_inference_steps, width, height, output_path, dtype +): + # Load the pre-trained model with the specified precision + pipe = CogView3PlusPipeline.from_pretrained(model_path, torch_dtype=dtype) + + # Generate the image based on the prompt + image = pipe( + prompt=prompt, + guidance_scale=guidance_scale, + num_images_per_prompt=num_images_per_prompt, + num_inference_steps=num_inference_steps, + width=width, + height=height, + ).images[0] + + # Save the generated image to the local file system + image.save(output_path) + + print(f"Image saved to {output_path}") + + +def infer(args): + torch.npu.set_device(args.device_id) + dtype = torch.bfloat16 if args.dtype == "bf16" else torch.float16 + generate_image( + prompt=args.prompt[0], + model_path=args.model_path, + guidance_scale=args.guidance_scale, + num_images_per_prompt=args.num_images_per_prompt, + num_inference_steps=args.num_inference_steps, + width=args.width, + height=args.height, + output_path=args.output_path, + dtype=dtype, + ) + + +if __name__ == "__main__": + inference_args = parse_arguments() + infer(inference_args) + diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/requirents.txt b/MindIE/MindIE-Torch/built-in/foundation/cogview3/requirents.txt new file mode 100644 index 0000000000..ac2fa2a7f6 --- /dev/null +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/requirents.txt @@ -0,0 +1,15 @@ +accelerate==0.29.3 +deepspeed==0.15.4 +einops==0.7.0 +gradio==3.50.2 +huggingface-hub==0.24.7 +Jinja2==3.1.4 +numpy==1.26.4 +peft==0.10.0 +safetensors==0.4.5 +timm==0.9.5 +tokenizers==0.15.2 +torch==2.1.0 +torchvision==0.14.1 +tqdm==4.66.5 +transformers==4.39.3 \ No newline at end of file -- Gitee From 4d72fcf83352eaa7498e533e02c3dd8ccb01fc11 Mon Sep 17 00:00:00 2001 From: jiangmengyu Date: Tue, 24 Dec 2024 20:04:47 +0800 Subject: [PATCH 02/91] =?UTF-8?q?CogView3plus=E9=80=82=E9=85=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../built-in/foundation/cogview3/cogview3plus/vae/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/vae/__init__.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/vae/__init__.py index a91642a899..58bbb8f14e 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/vae/__init__.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/vae/__init__.py @@ -1 +1 @@ -from autoencoder_kl import AutoencoderKL \ No newline at end of file +from .autoencoder_kl import AutoencoderKL \ No newline at end of file -- Gitee From 69b98374c21a0a4f395fd044fbc93dc5a1049eff Mon Sep 17 00:00:00 2001 From: jiangmengyu Date: Tue, 24 Dec 2024 20:05:28 +0800 Subject: [PATCH 03/91] =?UTF-8?q?CogView3plus=E9=80=82=E9=85=8D?= MIME-Version: 1.0 
Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../foundation/cogview3/cogview3plus/models/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/__init__.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/__init__.py index 06571c58d3..68d6997c34 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/__init__.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/__init__.py @@ -1 +1 @@ -from transformer_cogview3plus import CogView3PlusTransformer2DModel \ No newline at end of file +from .transformer_cogview3plus import CogView3PlusTransformer2DModel \ No newline at end of file -- Gitee From 2a0a4ea777fc9562a51b98cce4243d017a80b4d6 Mon Sep 17 00:00:00 2001 From: jiangmengyu Date: Tue, 24 Dec 2024 20:11:01 +0800 Subject: [PATCH 04/91] =?UTF-8?q?CogView3plus=E9=80=82=E9=85=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../cogview3plus/models/activations.py | 178 ++++++++++++++++++ .../cogview3/cogview3plus/models/attention.py | 2 +- 2 files changed, 179 insertions(+), 1 deletion(-) create mode 100644 MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/activations.py diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/activations.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/activations.py new file mode 100644 index 0000000000..cb1c29919e --- /dev/null +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/activations.py @@ -0,0 +1,178 @@ +# coding=utf-8 +# Copyright 2024 HuggingFace Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import torch +import torch.nn.functional as F +from torch import nn + +from diffusers.utils import deprecate +from diffusers.utils.import_utils import is_torch_npu_available, is_torch_version + + +if is_torch_npu_available(): + import torch_npu + +ACTIVATION_FUNCTIONS = { + "swish": nn.SiLU(), + "silu": nn.SiLU(), + "mish": nn.Mish(), + "gelu": nn.GELU(), + "relu": nn.ReLU(), +} + + +def get_activation(act_fn: str) -> nn.Module: + """Helper function to get activation function from string. + + Args: + act_fn (str): Name of activation function. + + Returns: + nn.Module: Activation function. + """ + + act_fn = act_fn.lower() + if act_fn in ACTIVATION_FUNCTIONS: + return ACTIVATION_FUNCTIONS[act_fn] + else: + raise ValueError(f"Unsupported activation function: {act_fn}") + + +class FP32SiLU(nn.Module): + r""" + SiLU activation function with input upcasted to torch.float32. + """ + + def __init__(self): + super().__init__() + + def forward(self, inputs: torch.Tensor) -> torch.Tensor: + return F.silu(inputs.float(), inplace=False).to(inputs.dtype) + + +class GELU(nn.Module): + r""" + GELU activation function with tanh approximation support with `approximate="tanh"`. 
+ + Parameters: + dim_in (`int`): The number of channels in the input. + dim_out (`int`): The number of channels in the output. + approximate (`str`, *optional*, defaults to `"none"`): If `"tanh"`, use tanh approximation. + bias (`bool`, defaults to True): Whether to use a bias in the linear layer. + """ + + def __init__(self, dim_in: int, dim_out: int, approximate: str = "none", bias: bool = True): + super().__init__() + self.proj = nn.Linear(dim_in, dim_out, bias=bias) + self.approximate = approximate + + def gelu(self, gate: torch.Tensor) -> torch.Tensor: + if gate.device.type == "mps" and is_torch_version("<", "2.0.0"): + # fp16 gelu not supported on mps before torch 2.0 + return F.gelu(gate.to(dtype=torch.float32), approximate=self.approximate).to(dtype=gate.dtype) + return F.gelu(gate, approximate=self.approximate) + + def forward(self, hidden_states): + hidden_states = self.proj(hidden_states) + hidden_states = self.gelu(hidden_states) + return hidden_states + + +class GEGLU(nn.Module): + r""" + A [variant](https://arxiv.org/abs/2002.05202) of the gated linear unit activation function. + + Parameters: + dim_in (`int`): The number of channels in the input. + dim_out (`int`): The number of channels in the output. + bias (`bool`, defaults to True): Whether to use a bias in the linear layer. + """ + + def __init__(self, dim_in: int, dim_out: int, bias: bool = True): + super().__init__() + self.proj = nn.Linear(dim_in, dim_out * 2, bias=bias) + + def gelu(self, gate: torch.Tensor) -> torch.Tensor: + if gate.device.type == "mps" and is_torch_version("<", "2.0.0"): + # fp16 gelu not supported on mps before torch 2.0 + return F.gelu(gate.to(dtype=torch.float32)).to(dtype=gate.dtype) + return F.gelu(gate) + + def forward(self, hidden_states, *args, **kwargs): + if len(args) > 0 or kwargs.get("scale", None) is not None: + deprecation_message = "The `scale` argument is deprecated and will be ignored. Please remove it, as passing it will raise an error in the future. `scale` should directly be passed while calling the underlying pipeline component i.e., via `cross_attention_kwargs`." + deprecate("scale", "1.0.0", deprecation_message) + hidden_states = self.proj(hidden_states) + if is_torch_npu_available(): + # using torch_npu.npu_geglu can run faster and save memory on NPU. + return torch_npu.npu_geglu(hidden_states, dim=-1, approximate=1)[0] + else: + hidden_states, gate = hidden_states.chunk(2, dim=-1) + return hidden_states * self.gelu(gate) + + +class SwiGLU(nn.Module): + r""" + A [variant](https://arxiv.org/abs/2002.05202) of the gated linear unit activation function. It's similar to `GEGLU` + but uses SiLU / Swish instead of GeLU. + + Parameters: + dim_in (`int`): The number of channels in the input. + dim_out (`int`): The number of channels in the output. + bias (`bool`, defaults to True): Whether to use a bias in the linear layer. + """ + + def __init__(self, dim_in: int, dim_out: int, bias: bool = True): + super().__init__() + + self.proj = nn.Linear(dim_in, dim_out * 2, bias=bias) + self.activation = nn.SiLU() + + def forward(self, hidden_states): + hidden_states = self.proj(hidden_states) + hidden_states, gate = hidden_states.chunk(2, dim=-1) + return hidden_states * self.activation(gate) + + +class ApproximateGELU(nn.Module): + r""" + The approximate form of the Gaussian Error Linear Unit (GELU). For more details, see section 2 of this + [paper](https://arxiv.org/abs/1606.08415). + + Parameters: + dim_in (`int`): The number of channels in the input. 
+ dim_out (`int`): The number of channels in the output. + bias (`bool`, defaults to True): Whether to use a bias in the linear layer. + """ + + def __init__(self, dim_in: int, dim_out: int, bias: bool = True): + super().__init__() + self.proj = nn.Linear(dim_in, dim_out, bias=bias) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + x = self.proj(x) + return x * torch.sigmoid(1.702 * x) + + +class LinearActivation(nn.Module): + def __init__(self, dim_in: int, dim_out: int, bias: bool = True, activation: str = "silu"): + super().__init__() + + self.proj = nn.Linear(dim_in, dim_out, bias=bias) + self.activation = get_activation(activation) + + def forward(self, hidden_states): + hidden_states = self.proj(hidden_states) + return self.activation(hidden_states) \ No newline at end of file diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/attention.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/attention.py index 00aabc9fdd..ac85e70e05 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/attention.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/attention.py @@ -17,7 +17,7 @@ import torch from torch import nn from diffusers.utils import deprecate, logging -from diffusers.models.activations import GEGLU, GELU, ApproximateGELU, LinearActivation, SwiGLU +from .activations import GEGLU, GELU, ApproximateGELU, LinearActivation, SwiGLU logger = logging.get_logger(__name__) -- Gitee From cea407effe135e8a64852975c73319b1a34dbc59 Mon Sep 17 00:00:00 2001 From: jiangmengyu Date: Tue, 24 Dec 2024 20:11:54 +0800 Subject: [PATCH 05/91] =?UTF-8?q?CogView3plus=E9=80=82=E9=85=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../cogview3/cogview3plus/models/attention_processor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/attention_processor.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/attention_processor.py index 6632f7f83f..796efa0318 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/attention_processor.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/attention_processor.py @@ -35,7 +35,7 @@ class CogVideoXAttnProcessor2_0: attn: Attention, hidden_states: torch.Tensor, encoder_hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None,s + attention_mask: Optional[torch.Tensor] = None, ) -> torch.Tensor: text_seq_length = encoder_hidden_states.size(1) -- Gitee From 3d8b3bc40bd596dd4524bb75bb1edb5cba0be03d Mon Sep 17 00:00:00 2001 From: jiangmengyu Date: Tue, 24 Dec 2024 20:20:06 +0800 Subject: [PATCH 06/91] =?UTF-8?q?CogView3plus=E9=80=82=E9=85=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../cogview3/cogview3plus/models/transformer_cogview3plus.py | 1 + 1 file changed, 1 insertion(+) diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/transformer_cogview3plus.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/transformer_cogview3plus.py index 78360f61e9..782fa5b4db 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/transformer_cogview3plus.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/transformer_cogview3plus.py @@ -180,6 +180,7 @@ class 
CogView3PlusTransformer2DModel(ModelMixin, ConfigMixin): sample_size: int = 128, ): super().__init__() + print("====================================================") self.out_channels = out_channels self.inner_dim = num_attention_heads * attention_head_dim -- Gitee From 97c427ae393947263635cc62fe938c1a5e15e6b4 Mon Sep 17 00:00:00 2001 From: jiangmengyu Date: Wed, 25 Dec 2024 14:07:30 +0800 Subject: [PATCH 07/91] =?UTF-8?q?CogView3plus=E9=80=82=E9=85=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../cogview3/cogview3plus/__init__.py | 5 ++- .../cogview3/cogview3plus/models/__init__.py | 3 +- .../cogview3plus/models/modeling_utils.py | 1 + .../models/transformer_cogview3plus.py | 3 +- .../cogview3plus/pipeline/__init__.py | 2 +- .../cogview3plus/schedulers/__init__.py | 3 +- .../cogview3plus/vae/autoencoder_kl.py | 3 +- .../cogview3/inference_cogview3plus.py | 35 +++++++++++++------ 8 files changed, 37 insertions(+), 18 deletions(-) create mode 100644 MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/modeling_utils.py diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/__init__.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/__init__.py index acbd223eb6..11a5548362 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/__init__.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/__init__.py @@ -15,4 +15,7 @@ # limitations under the License. -from .pipeline import CogView3PlusPipeline +from .pipeline import CogView3PlusPipeline, DiffusionPipeline +from .vae import AutoencoderKL +from .schedulers import CogVideoXDDIMScheduler, CogVideoXDPMScheduler, SchedulerMixin +from .models import CogView3PlusTransformer2DModel, ModelMixin \ No newline at end of file diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/__init__.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/__init__.py index 68d6997c34..b3c595bfcc 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/__init__.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/__init__.py @@ -1 +1,2 @@ -from .transformer_cogview3plus import CogView3PlusTransformer2DModel \ No newline at end of file +from .transformer_cogview3plus import CogView3PlusTransformer2DModel +from .modeling_utils import ModelMixin \ No newline at end of file diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/modeling_utils.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/modeling_utils.py new file mode 100644 index 0000000000..1b4243486f --- /dev/null +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/modeling_utils.py @@ -0,0 +1 @@ +from diffusers import ModelMixin \ No newline at end of file diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/transformer_cogview3plus.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/transformer_cogview3plus.py index 782fa5b4db..9a343d4c86 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/transformer_cogview3plus.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/transformer_cogview3plus.py @@ -20,10 +20,10 @@ import torch.nn as nn from diffusers.configuration_utils import ConfigMixin, register_to_config from diffusers.models.attention_processor import Attention, AttentionProcessor -from 
diffusers.models.modeling_utils import ModelMixin from diffusers.utils import is_torch_version, logging from diffusers.models.modeling_outputs import Transformer2DModelOutput +from .modeling_utils import ModelMixin from .attention import FeedForward from .attention_processor import CogVideoXAttnProcessor2_0 from ..layers import CogView3PlusAdaLayerNormZeroTextImage, AdaLayerNormContinuous @@ -180,7 +180,6 @@ class CogView3PlusTransformer2DModel(ModelMixin, ConfigMixin): sample_size: int = 128, ): super().__init__() - print("====================================================") self.out_channels = out_channels self.inner_dim = num_attention_heads * attention_head_dim diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/__init__.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/__init__.py index aea730c2e3..626e0d588b 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/__init__.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/__init__.py @@ -1 +1 @@ -from .pipeline_cogview3plus import CogView3PlusPipeline \ No newline at end of file +from .pipeline_cogview3plus import CogView3PlusPipeline, DiffusionPipeline \ No newline at end of file diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/schedulers/__init__.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/schedulers/__init__.py index 76b000d4bb..32d0c223e7 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/schedulers/__init__.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/schedulers/__init__.py @@ -1,2 +1,3 @@ from .scheduling_ddim_cogvideox import CogVideoXDDIMScheduler -from .scheduling_dpm_cogvideox import CogVideoXDPMScheduler \ No newline at end of file +from .scheduling_dpm_cogvideox import CogVideoXDPMScheduler +from .scheduling_utils import SchedulerMixin \ No newline at end of file diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/vae/autoencoder_kl.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/vae/autoencoder_kl.py index 99ba70c8cd..fcba50ccae 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/vae/autoencoder_kl.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/vae/autoencoder_kl.py @@ -31,7 +31,8 @@ from diffusers.models.attention_processor import ( FusedAttnProcessor2_0, ) from diffusers.models.modeling_outputs import AutoencoderKLOutput -from diffusers.models.modeling_utils import ModelMixin + +from ..models import ModelMixin from .vae import Decoder, DecoderOutput, DiagonalGaussianDistribution, Encoder diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/inference_cogview3plus.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/inference_cogview3plus.py index 33d0c9aeed..34a2158f6b 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/inference_cogview3plus.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/inference_cogview3plus.py @@ -17,6 +17,7 @@ import argparse import logging import torch +import time from cogview3plus import CogView3PlusPipeline @@ -59,17 +60,29 @@ def generate_image( prompt, model_path, guidance_scale, num_images_per_prompt, num_inference_steps, width, height, output_path, dtype ): # Load the pre-trained model with the specified precision - pipe = CogView3PlusPipeline.from_pretrained(model_path, torch_dtype=dtype) - - # Generate the image based on the prompt - image = pipe( - 
prompt=prompt, - guidance_scale=guidance_scale, - num_images_per_prompt=num_images_per_prompt, - num_inference_steps=num_inference_steps, - width=width, - height=height, - ).images[0] + pipe = CogView3PlusPipeline.from_pretrained(model_path, torch_dtype=dtype).to("npu") + + use_time = 0 + loops = 5 + for i in range(loops): + start_time = time.time() + # Generate the image based on the prompt + image = pipe( + prompt=prompt, + guidance_scale=guidance_scale, + num_images_per_prompt=num_images_per_prompt, + num_inference_steps=num_inference_steps, + width=width, + height=height, + ).images[0] + + if i >= 2: + use_time += time.time() - start_time + logger.info("current_time is %.3f )", time.time() - start_time) + + torch.npu.empty_cache() + + logger.info("use_time is %.3f)", use_time / 3) # Save the generated image to the local file system image.save(output_path) -- Gitee From c9b51415e826ff8c71d45f71e6e60c29d464243a Mon Sep 17 00:00:00 2001 From: jiangmengyu Date: Wed, 25 Dec 2024 14:23:32 +0800 Subject: [PATCH 08/91] =?UTF-8?q?CogView3plus=E9=80=82=E9=85=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../cogview3plus/models/attention_processor.py | 17 +++++++++++++++-- .../cogview3/inference_cogview3plus.py | 2 +- 2 files changed, 16 insertions(+), 3 deletions(-) diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/attention_processor.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/attention_processor.py index 796efa0318..1b9ce8616c 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/attention_processor.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/attention_processor.py @@ -16,6 +16,7 @@ from typing import Optional import torch import torch.nn.functional as F +import torch_npu from diffusers.models.attention_processor import Attention @@ -65,9 +66,21 @@ class CogVideoXAttnProcessor2_0: if attn.norm_k is not None: key = attn.norm_k(key) - hidden_states = F.scaled_dot_product_attention( - query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False + _, N, _, D = query.shape + query = F.pad(query, (0, 64 - D)) + key = F.pad(key, (0, 64 - D)) + value = F.pad(value, (0, 64 - D)) + hidden_states = torch_npu.npu_prompt_flash_attention( + query, + key, + value, + input_layout='BNSD', + scale_value=D**-0.5, + pre_tokens=65535, + next_tokens=65535, + num_heads=N ) + hidden_states = hidden_states[:, :, :, :D] hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim) diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/inference_cogview3plus.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/inference_cogview3plus.py index 34a2158f6b..74dd914294 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/inference_cogview3plus.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/inference_cogview3plus.py @@ -51,7 +51,7 @@ def parse_arguments(): parser.add_argument("--height", type=int, default=1024, help="Height of the generated image.") parser.add_argument("--output_path", type=str, default="cogview3.png", help="Path to save the generated image.") parser.add_argument("--dtype", type=str, default="bf16", help="bf16 or fp16") - parser.add_argument("--device_id", type=int, default=2, help="NPU device id") + parser.add_argument("--device_id", type=int, default=1, help="NPU device id") return parser.parse_args() -- Gitee From 62a721a2f5d0485f19726ddce37cd825225f6d86 Mon 
Sep 17 00:00:00 2001 From: jiangmengyu Date: Wed, 25 Dec 2024 17:06:13 +0800 Subject: [PATCH 09/91] =?UTF-8?q?CogView3plus=E9=80=82=E9=85=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../cogview3plus/models/model_load_utils.py | 42 + .../cogview3plus/models/modeling_utils.py | 1423 ++++++++++++++++- 2 files changed, 1464 insertions(+), 1 deletion(-) create mode 100644 MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/model_load_utils.py diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/model_load_utils.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/model_load_utils.py new file mode 100644 index 0000000000..f6d3b20570 --- /dev/null +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/model_load_utils.py @@ -0,0 +1,42 @@ +#!/usr/bin/env python +# coding=utf-8 +# Copyright(C) 2024. Huawei Technologies Co.,Ltd. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License + +import os +import torch +import safetensors.torch + + +SAFETENSORS_EXTENSION = "safetensors" +EMA_STATE_DICT = "ema_state_dict" +STATE_DICT = "state_dict" +CPU = "cpu" + + +def load_state_dict_sd(model_path): + name = os.path.basename(model_path).split('.')[-1] # get weights name + if name.endswith("ckpt"): + weight = torch.load(model_path, map_location=CPU) + if (EMA_STATE_DICT in weight): + weight = weight[EMA_STATE_DICT] + weight = {key.replace("module.", ""): value for key, value in weight.items()} + elif STATE_DICT in weight: + weight = weight[STATE_DICT] + return weight + elif name == SAFETENSORS_EXTENSION: # diffuser model use same name + return safetensors.torch.load_file(model_path, device=CPU) # first load on cpu + else: + # to support hf shard model weights + return torch.load(model_path, map_location=CPU) # first load on cpu \ No newline at end of file diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/modeling_utils.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/modeling_utils.py index 1b4243486f..a0740b8c67 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/modeling_utils.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/modeling_utils.py @@ -1 +1,1422 @@ -from diffusers import ModelMixin \ No newline at end of file +# coding=utf-8 +# Copyright 2024 The HuggingFace Inc. team. +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import copy +import inspect +import itertools +import json +import os +import re +from collections import OrderedDict +from functools import partial, wraps +from pathlib import Path +from typing import Any, Callable, List, Optional, Tuple, Union + +import safetensors +import torch +from huggingface_hub import create_repo, split_torch_state_dict_into_shards +from huggingface_hub.utils import validate_hf_hub_args +from torch import Tensor, nn + +from diffusers import __version__ +from diffusers.quantizers import DiffusersAutoQuantizer, DiffusersQuantizer +from diffusers.quantizers.quantization_config import QuantizationMethod +from diffusers.utils import ( + CONFIG_NAME, + FLAX_WEIGHTS_NAME, + SAFE_WEIGHTS_INDEX_NAME, + SAFETENSORS_WEIGHTS_NAME, + WEIGHTS_INDEX_NAME, + WEIGHTS_NAME, + _add_variant, + _get_checkpoint_shard_files, + _get_model_file, + deprecate, + is_accelerate_available, + is_bitsandbytes_available, + is_bitsandbytes_version, + is_torch_version, + logging, +) +from diffusers.utils.hub_utils import ( + PushToHubMixin, + load_or_create_model_card, + populate_model_card, +) +from diffusers.models.model_loading_utils import ( + _determine_device_map, + _fetch_index_file, + _fetch_index_file_legacy, + _load_state_dict_into_model, + _merge_sharded_checkpoints, + load_model_dict_into_meta, + load_state_dict, +) + +from .model_load_utils import load_state_dict_sd + + +logger = logging.get_logger(__name__) + +_REGEX_SHARD = re.compile(r"(.*?)-\d{5}-of-\d{5}") + + +if is_torch_version(">=", "1.9.0"): + _LOW_CPU_MEM_USAGE_DEFAULT = True +else: + _LOW_CPU_MEM_USAGE_DEFAULT = False + + +if is_accelerate_available(): + import accelerate + + +def get_parameter_device(parameter: torch.nn.Module) -> torch.device: + try: + parameters_and_buffers = itertools.chain(parameter.parameters(), parameter.buffers()) + return next(parameters_and_buffers).device + except StopIteration: + # For torch.nn.DataParallel compatibility in PyTorch 1.5 + + def find_tensor_attributes(module: torch.nn.Module) -> List[Tuple[str, Tensor]]: + tuples = [(k, v) for k, v in module.__dict__.items() if torch.is_tensor(v)] + return tuples + + gen = parameter._named_members(get_members_fn=find_tensor_attributes) + first_tuple = next(gen) + return first_tuple[1].device + + +def get_parameter_dtype(parameter: torch.nn.Module) -> torch.dtype: + """ + Returns the first found floating dtype in parameters if there is one, otherwise returns the last dtype it found. 
+ """ + last_dtype = None + for param in parameter.parameters(): + last_dtype = param.dtype + if param.is_floating_point(): + return param.dtype + + for buffer in parameter.buffers(): + last_dtype = buffer.dtype + if buffer.is_floating_point(): + return buffer.dtype + + if last_dtype is not None: + # if no floating dtype was found return whatever the first dtype is + return last_dtype + + # For nn.DataParallel compatibility in PyTorch > 1.5 + def find_tensor_attributes(module: nn.Module) -> List[Tuple[str, Tensor]]: + tuples = [(k, v) for k, v in module.__dict__.items() if torch.is_tensor(v)] + return tuples + + gen = parameter._named_members(get_members_fn=find_tensor_attributes) + last_tuple = None + for tuple in gen: + last_tuple = tuple + if tuple[1].is_floating_point(): + return tuple[1].dtype + + if last_tuple is not None: + # fallback to the last dtype + return last_tuple[1].dtype + + +class ModelMixin(torch.nn.Module, PushToHubMixin): + r""" + Base class for all models. + + [`ModelMixin`] takes care of storing the model configuration and provides methods for loading, downloading and + saving models. + + - **config_name** ([`str`]) -- Filename to save a model to when calling [`~models.ModelMixin.save_pretrained`]. + """ + + config_name = CONFIG_NAME + _automatically_saved_args = ["_diffusers_version", "_class_name", "_name_or_path"] + _supports_gradient_checkpointing = False + _keys_to_ignore_on_load_unexpected = None + _no_split_modules = None + _keep_in_fp32_modules = None + + def __init__(self): + super().__init__() + + def __getattr__(self, name: str) -> Any: + """The only reason we overwrite `getattr` here is to gracefully deprecate accessing + config attributes directly. See https://github.com/huggingface/diffusers/pull/3129 We need to overwrite + __getattr__ here in addition so that we don't trigger `torch.nn.Module`'s __getattr__': + https://pytorch.org/docs/stable/_modules/torch/nn/modules/module.html#Module + """ + + is_in_config = "_internal_dict" in self.__dict__ and hasattr(self.__dict__["_internal_dict"], name) + is_attribute = name in self.__dict__ + + if is_in_config and not is_attribute: + deprecation_message = f"Accessing config attribute `{name}` directly via '{type(self).__name__}' object attribute is deprecated. Please access '{name}' over '{type(self).__name__}'s config object instead, e.g. 'unet.config.{name}'." + deprecate("direct config name access", "1.0.0", deprecation_message, standard_warn=False, stacklevel=3) + return self._internal_dict[name] + + # call PyTorch's https://pytorch.org/docs/stable/_modules/torch/nn/modules/module.html#Module + return super().__getattr__(name) + + @property + def is_gradient_checkpointing(self) -> bool: + """ + Whether gradient checkpointing is activated for this model or not. + """ + return any(hasattr(m, "gradient_checkpointing") and m.gradient_checkpointing for m in self.modules()) + + def enable_gradient_checkpointing(self) -> None: + """ + Activates gradient checkpointing for the current model (may be referred to as *activation checkpointing* or + *checkpoint activations* in other frameworks). + """ + if not self._supports_gradient_checkpointing: + raise ValueError(f"{self.__class__.__name__} does not support gradient checkpointing.") + self.apply(partial(self._set_gradient_checkpointing, value=True)) + + def disable_gradient_checkpointing(self) -> None: + """ + Deactivates gradient checkpointing for the current model (may be referred to as *activation checkpointing* or + *checkpoint activations* in other frameworks). 
+ """ + if self._supports_gradient_checkpointing: + self.apply(partial(self._set_gradient_checkpointing, value=False)) + + def set_use_npu_flash_attention(self, valid: bool) -> None: + r""" + Set the switch for the npu flash attention. + """ + + def fn_recursive_set_npu_flash_attention(module: torch.nn.Module): + if hasattr(module, "set_use_npu_flash_attention"): + module.set_use_npu_flash_attention(valid) + + for child in module.children(): + fn_recursive_set_npu_flash_attention(child) + + for module in self.children(): + if isinstance(module, torch.nn.Module): + fn_recursive_set_npu_flash_attention(module) + + def enable_npu_flash_attention(self) -> None: + r""" + Enable npu flash attention from torch_npu + + """ + self.set_use_npu_flash_attention(True) + + def disable_npu_flash_attention(self) -> None: + r""" + disable npu flash attention from torch_npu + + """ + self.set_use_npu_flash_attention(False) + + def set_use_xla_flash_attention( + self, use_xla_flash_attention: bool, partition_spec: Optional[Callable] = None + ) -> None: + # Recursively walk through all the children. + # Any children which exposes the set_use_xla_flash_attention method + # gets the message + def fn_recursive_set_flash_attention(module: torch.nn.Module): + if hasattr(module, "set_use_xla_flash_attention"): + module.set_use_xla_flash_attention(use_xla_flash_attention, partition_spec) + + for child in module.children(): + fn_recursive_set_flash_attention(child) + + for module in self.children(): + if isinstance(module, torch.nn.Module): + fn_recursive_set_flash_attention(module) + + def enable_xla_flash_attention(self, partition_spec: Optional[Callable] = None): + r""" + Enable the flash attention pallals kernel for torch_xla. + """ + self.set_use_xla_flash_attention(True, partition_spec) + + def disable_xla_flash_attention(self): + r""" + Disable the flash attention pallals kernel for torch_xla. + """ + self.set_use_xla_flash_attention(False) + + def set_use_memory_efficient_attention_xformers( + self, valid: bool, attention_op: Optional[Callable] = None + ) -> None: + # Recursively walk through all the children. + # Any children which exposes the set_use_memory_efficient_attention_xformers method + # gets the message + def fn_recursive_set_mem_eff(module: torch.nn.Module): + if hasattr(module, "set_use_memory_efficient_attention_xformers"): + module.set_use_memory_efficient_attention_xformers(valid, attention_op) + + for child in module.children(): + fn_recursive_set_mem_eff(child) + + for module in self.children(): + if isinstance(module, torch.nn.Module): + fn_recursive_set_mem_eff(module) + + def enable_xformers_memory_efficient_attention(self, attention_op: Optional[Callable] = None) -> None: + r""" + Enable memory efficient attention from [xFormers](https://facebookresearch.github.io/xformers/). + + When this option is enabled, you should observe lower GPU memory usage and a potential speed up during + inference. Speed up during training is not guaranteed. + + + + ⚠️ When memory efficient attention and sliced attention are both enabled, memory efficient attention takes + precedent. + + + + Parameters: + attention_op (`Callable`, *optional*): + Override the default `None` operator for use as `op` argument to the + [`memory_efficient_attention()`](https://facebookresearch.github.io/xformers/components/ops.html#xformers.ops.memory_efficient_attention) + function of xFormers. 
+ + Examples: + + ```py + >>> import torch + >>> from diffusers import UNet2DConditionModel + >>> from xformers.ops import MemoryEfficientAttentionFlashAttentionOp + + >>> model = UNet2DConditionModel.from_pretrained( + ... "stabilityai/stable-diffusion-2-1", subfolder="unet", torch_dtype=torch.float16 + ... ) + >>> model = model.to("cuda") + >>> model.enable_xformers_memory_efficient_attention(attention_op=MemoryEfficientAttentionFlashAttentionOp) + ``` + """ + self.set_use_memory_efficient_attention_xformers(True, attention_op) + + def disable_xformers_memory_efficient_attention(self) -> None: + r""" + Disable memory efficient attention from [xFormers](https://facebookresearch.github.io/xformers/). + """ + self.set_use_memory_efficient_attention_xformers(False) + + def save_pretrained( + self, + save_directory: Union[str, os.PathLike], + is_main_process: bool = True, + save_function: Optional[Callable] = None, + safe_serialization: bool = True, + variant: Optional[str] = None, + max_shard_size: Union[int, str] = "10GB", + push_to_hub: bool = False, + **kwargs, + ): + """ + Save a model and its configuration file to a directory so that it can be reloaded using the + [`~models.ModelMixin.from_pretrained`] class method. + + Arguments: + save_directory (`str` or `os.PathLike`): + Directory to save a model and its configuration file to. Will be created if it doesn't exist. + is_main_process (`bool`, *optional*, defaults to `True`): + Whether the process calling this is the main process or not. Useful during distributed training and you + need to call this function on all processes. In this case, set `is_main_process=True` only on the main + process to avoid race conditions. + save_function (`Callable`): + The function to use to save the state dictionary. Useful during distributed training when you need to + replace `torch.save` with another method. Can be configured with the environment variable + `DIFFUSERS_SAVE_MODE`. + safe_serialization (`bool`, *optional*, defaults to `True`): + Whether to save the model using `safetensors` or the traditional PyTorch way with `pickle`. + variant (`str`, *optional*): + If specified, weights are saved in the format `pytorch_model..bin`. + max_shard_size (`int` or `str`, defaults to `"10GB"`): + The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size + lower than this size. If expressed as a string, needs to be digits followed by a unit (like `"5GB"`). + If expressed as an integer, the unit is bytes. Note that this limit will be decreased after a certain + period of time (starting from Oct 2024) to allow users to upgrade to the latest version of `diffusers`. + This is to establish a common default size for this argument across different libraries in the Hugging + Face ecosystem (`transformers`, and `accelerate`, for example). + push_to_hub (`bool`, *optional*, defaults to `False`): + Whether or not to push your model to the Hugging Face Hub after saving it. You can specify the + repository you want to push to with `repo_id` (will default to the name of `save_directory` in your + namespace). + kwargs (`Dict[str, Any]`, *optional*): + Additional keyword arguments passed along to the [`~utils.PushToHubMixin.push_to_hub`] method. 
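+
+        Example (illustrative; the target directory and shard size below are placeholders):
+
+        ```py
+        >>> # assuming `model` is any loaded ModelMixin subclass, e.g. the CogView3Plus transformer
+        >>> model.save_pretrained("./cogview3plus-transformer", max_shard_size="5GB")
+        ```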
+ """ + if os.path.isfile(save_directory): + logger.error(f"Provided path ({save_directory}) should be a directory, not a file") + return + + hf_quantizer = getattr(self, "hf_quantizer", None) + if hf_quantizer is not None: + quantization_serializable = ( + hf_quantizer is not None + and isinstance(hf_quantizer, DiffusersQuantizer) + and hf_quantizer.is_serializable + ) + if not quantization_serializable: + raise ValueError( + f"The model is quantized with {hf_quantizer.quantization_config.quant_method} and is not serializable - check out the warnings from" + " the logger on the traceback to understand the reason why the quantized model is not serializable." + ) + + weights_name = SAFETENSORS_WEIGHTS_NAME if safe_serialization else WEIGHTS_NAME + weights_name = _add_variant(weights_name, variant) + weights_name_pattern = weights_name.replace(".bin", "{suffix}.bin").replace( + ".safetensors", "{suffix}.safetensors" + ) + + os.makedirs(save_directory, exist_ok=True) + + if push_to_hub: + commit_message = kwargs.pop("commit_message", None) + private = kwargs.pop("private", None) + create_pr = kwargs.pop("create_pr", False) + token = kwargs.pop("token", None) + repo_id = kwargs.pop("repo_id", save_directory.split(os.path.sep)[-1]) + repo_id = create_repo(repo_id, exist_ok=True, private=private, token=token).repo_id + + # Only save the model itself if we are using distributed training + model_to_save = self + + # Attach architecture to the config + # Save the config + if is_main_process: + model_to_save.save_config(save_directory) + + # Save the model + state_dict = model_to_save.state_dict() + + # Save the model + state_dict_split = split_torch_state_dict_into_shards( + state_dict, max_shard_size=max_shard_size, filename_pattern=weights_name_pattern + ) + + # Clean the folder from a previous save + if is_main_process: + for filename in os.listdir(save_directory): + if filename in state_dict_split.filename_to_tensors.keys(): + continue + full_filename = os.path.join(save_directory, filename) + if not os.path.isfile(full_filename): + continue + weights_without_ext = weights_name_pattern.replace(".bin", "").replace(".safetensors", "") + weights_without_ext = weights_without_ext.replace("{suffix}", "") + filename_without_ext = filename.replace(".bin", "").replace(".safetensors", "") + # make sure that file to be deleted matches format of sharded file, e.g. pytorch_model-00001-of-00005 + if ( + filename.startswith(weights_without_ext) + and _REGEX_SHARD.fullmatch(filename_without_ext) is not None + ): + os.remove(full_filename) + + for filename, tensors in state_dict_split.filename_to_tensors.items(): + shard = {tensor: state_dict[tensor] for tensor in tensors} + filepath = os.path.join(save_directory, filename) + if safe_serialization: + # At some point we will need to deal better with save_function (used for TPU and other distributed + # joyfulness), but for now this enough. 
+ safetensors.torch.save_file(shard, filepath, metadata={"format": "pt"}) + else: + torch.save(shard, filepath) + + if state_dict_split.is_sharded: + index = { + "metadata": state_dict_split.metadata, + "weight_map": state_dict_split.tensor_to_filename, + } + save_index_file = SAFE_WEIGHTS_INDEX_NAME if safe_serialization else WEIGHTS_INDEX_NAME + save_index_file = os.path.join(save_directory, _add_variant(save_index_file, variant)) + # Save the index as well + with open(save_index_file, "w", encoding="utf-8") as f: + content = json.dumps(index, indent=2, sort_keys=True) + "\n" + f.write(content) + logger.info( + f"The model is bigger than the maximum size per checkpoint ({max_shard_size}) and is going to be " + f"split in {len(state_dict_split.filename_to_tensors)} checkpoint shards. You can find where each parameters has been saved in the " + f"index located at {save_index_file}." + ) + else: + path_to_weights = os.path.join(save_directory, weights_name) + logger.info(f"Model weights saved in {path_to_weights}") + + if push_to_hub: + # Create a new empty model card and eventually tag it + model_card = load_or_create_model_card(repo_id, token=token) + model_card = populate_model_card(model_card) + model_card.save(Path(save_directory, "README.md").as_posix()) + + self._upload_folder( + save_directory, + repo_id, + token=token, + commit_message=commit_message, + create_pr=create_pr, + ) + + def dequantize(self): + """ + Potentially dequantize the model in case it has been quantized by a quantization method that support + dequantization. + """ + hf_quantizer = getattr(self, "hf_quantizer", None) + + if hf_quantizer is None: + raise ValueError("You need to first quantize your model in order to dequantize it") + + return hf_quantizer.dequantize(self) + + @classmethod + @validate_hf_hub_args + def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.PathLike]], **kwargs): + r""" + Instantiate a pretrained PyTorch model from a pretrained model configuration. + + The model is set in evaluation mode - `model.eval()` - by default, and dropout modules are deactivated. To + train the model, set it back in training mode with `model.train()`. + + Parameters: + pretrained_model_name_or_path (`str` or `os.PathLike`, *optional*): + Can be either: + + - A string, the *model id* (for example `google/ddpm-celebahq-256`) of a pretrained model hosted on + the Hub. + - A path to a *directory* (for example `./my_model_directory`) containing the model weights saved + with [`~ModelMixin.save_pretrained`]. + + cache_dir (`Union[str, os.PathLike]`, *optional*): + Path to a directory where a downloaded pretrained model configuration is cached if the standard cache + is not used. + torch_dtype (`str` or `torch.dtype`, *optional*): + Override the default `torch.dtype` and load the model with another dtype. If `"auto"` is passed, the + dtype is automatically derived from the model's weights. + force_download (`bool`, *optional*, defaults to `False`): + Whether or not to force the (re-)download of the model weights and configuration files, overriding the + cached versions if they exist. + proxies (`Dict[str, str]`, *optional*): + A dictionary of proxy servers to use by protocol or endpoint, for example, `{'http': 'foo.bar:3128', + 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request. + output_loading_info (`bool`, *optional*, defaults to `False`): + Whether or not to also return a dictionary containing missing keys, unexpected keys and error messages. 
+ local_files_only(`bool`, *optional*, defaults to `False`): + Whether to only load local model weights and configuration files or not. If set to `True`, the model + won't be downloaded from the Hub. + token (`str` or *bool*, *optional*): + The token to use as HTTP bearer authorization for remote files. If `True`, the token generated from + `diffusers-cli login` (stored in `~/.huggingface`) is used. + revision (`str`, *optional*, defaults to `"main"`): + The specific model version to use. It can be a branch name, a tag name, a commit id, or any identifier + allowed by Git. + from_flax (`bool`, *optional*, defaults to `False`): + Load the model weights from a Flax checkpoint save file. + subfolder (`str`, *optional*, defaults to `""`): + The subfolder location of a model file within a larger model repository on the Hub or locally. + mirror (`str`, *optional*): + Mirror source to resolve accessibility issues if you're downloading a model in China. We do not + guarantee the timeliness or safety of the source, and you should refer to the mirror site for more + information. + device_map (`str` or `Dict[str, Union[int, str, torch.device]]`, *optional*): + A map that specifies where each submodule should go. It doesn't need to be defined for each + parameter/buffer name; once a given module name is inside, every submodule of it will be sent to the + same device. Defaults to `None`, meaning that the model will be loaded on CPU. + + Set `device_map="auto"` to have 🤗 Accelerate automatically compute the most optimized `device_map`. For + more information about each option see [designing a device + map](https://hf.co/docs/accelerate/main/en/usage_guides/big_modeling#designing-a-device-map). + max_memory (`Dict`, *optional*): + A dictionary device identifier for the maximum memory. Will default to the maximum memory available for + each GPU and the available CPU RAM if unset. + offload_folder (`str` or `os.PathLike`, *optional*): + The path to offload weights if `device_map` contains the value `"disk"`. + offload_state_dict (`bool`, *optional*): + If `True`, temporarily offloads the CPU state dict to the hard drive to avoid running out of CPU RAM if + the weight of the CPU state dict + the biggest shard of the checkpoint does not fit. Defaults to `True` + when there is some disk offload. + low_cpu_mem_usage (`bool`, *optional*, defaults to `True` if torch version >= 1.9.0 else `False`): + Speed up model loading only loading the pretrained weights and not initializing the weights. This also + tries to not use more than 1x model size in CPU memory (including peak memory) while loading the model. + Only supported for PyTorch >= 1.9.0. If you are using an older version of PyTorch, setting this + argument to `True` will raise an error. + variant (`str`, *optional*): + Load weights from a specified `variant` filename such as `"fp16"` or `"ema"`. This is ignored when + loading `from_flax`. + use_safetensors (`bool`, *optional*, defaults to `None`): + If set to `None`, the `safetensors` weights are downloaded if they're available **and** if the + `safetensors` library is installed. If set to `True`, the model is forcibly loaded from `safetensors` + weights. If set to `False`, `safetensors` weights are not loaded. + + + + To use private or [gated models](https://huggingface.co/docs/hub/models-gated#gated-models), log-in with + `huggingface-cli login`. 
You can also activate the special + ["offline-mode"](https://huggingface.co/diffusers/installation.html#offline-mode) to use this method in a + firewalled environment. + + + + Example: + + ```py + from diffusers import UNet2DConditionModel + + unet = UNet2DConditionModel.from_pretrained("runwayml/stable-diffusion-v1-5", subfolder="unet") + ``` + + If you get the error message below, you need to finetune the weights for your downstream task: + + ```bash + Some weights of UNet2DConditionModel were not initialized from the model checkpoint at runwayml/stable-diffusion-v1-5 and are newly initialized because the shapes did not match: + - conv_in.weight: found shape torch.Size([320, 4, 3, 3]) in the checkpoint and torch.Size([320, 9, 3, 3]) in the model instantiated + You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference. + ``` + """ + cache_dir = kwargs.pop("cache_dir", None) + ignore_mismatched_sizes = kwargs.pop("ignore_mismatched_sizes", False) + force_download = kwargs.pop("force_download", False) + from_flax = kwargs.pop("from_flax", False) + proxies = kwargs.pop("proxies", None) + output_loading_info = kwargs.pop("output_loading_info", False) + local_files_only = kwargs.pop("local_files_only", None) + token = kwargs.pop("token", None) + revision = kwargs.pop("revision", None) + torch_dtype = kwargs.pop("torch_dtype", None) + subfolder = kwargs.pop("subfolder", None) + device_map = kwargs.pop("device_map", None) + max_memory = kwargs.pop("max_memory", None) + offload_folder = kwargs.pop("offload_folder", None) + offload_state_dict = kwargs.pop("offload_state_dict", False) + low_cpu_mem_usage = kwargs.pop("low_cpu_mem_usage", _LOW_CPU_MEM_USAGE_DEFAULT) + variant = kwargs.pop("variant", None) + use_safetensors = kwargs.pop("use_safetensors", None) + quantization_config = kwargs.pop("quantization_config", None) + + allow_pickle = False + if use_safetensors is None: + use_safetensors = True + allow_pickle = True + + if low_cpu_mem_usage and not is_accelerate_available(): + low_cpu_mem_usage = False + logger.warning( + "Cannot initialize model with low cpu memory usage because `accelerate` was not found in the" + " environment. Defaulting to `low_cpu_mem_usage=False`. It is strongly recommended to install" + " `accelerate` for faster and less memory-intense model loading. You can do so with: \n```\npip" + " install accelerate\n```\n." + ) + + if device_map is not None and not is_accelerate_available(): + raise NotImplementedError( + "Loading and dispatching requires `accelerate`. Please make sure to install accelerate or set" + " `device_map=None`. You can install accelerate with `pip install accelerate`." + ) + + # Check if we can handle device_map and dispatching the weights + if device_map is not None and not is_torch_version(">=", "1.9.0"): + raise NotImplementedError( + "Loading and dispatching requires torch >= 1.9.0. Please either update your PyTorch version or set" + " `device_map=None`." + ) + + if low_cpu_mem_usage is True and not is_torch_version(">=", "1.9.0"): + raise NotImplementedError( + "Low memory initialization requires torch >= 1.9.0. Please either update your PyTorch version or set" + " `low_cpu_mem_usage=False`." + ) + + if low_cpu_mem_usage is False and device_map is not None: + raise ValueError( + f"You cannot set `low_cpu_mem_usage` to `False` while using device_map={device_map} for loading and" + " dispatching. Please make sure to set `low_cpu_mem_usage=True`." 
+ ) + + # change device_map into a map if we passed an int, a str or a torch.device + if isinstance(device_map, torch.device): + device_map = {"": device_map} + elif isinstance(device_map, str) and device_map not in ["auto", "balanced", "balanced_low_0", "sequential"]: + try: + device_map = {"": torch.device(device_map)} + except RuntimeError: + raise ValueError( + "When passing device_map as a string, the value needs to be a device name (e.g. cpu, cuda:0) or " + f"'auto', 'balanced', 'balanced_low_0', 'sequential' but found {device_map}." + ) + elif isinstance(device_map, int): + if device_map < 0: + raise ValueError( + "You can't pass device_map as a negative int. If you want to put the model on the cpu, pass device_map = 'cpu' " + ) + else: + device_map = {"": device_map} + + if device_map is not None: + if low_cpu_mem_usage is None: + low_cpu_mem_usage = True + elif not low_cpu_mem_usage: + raise ValueError("Passing along a `device_map` requires `low_cpu_mem_usage=True`") + + if low_cpu_mem_usage: + if device_map is not None and not is_torch_version(">=", "1.10"): + # The max memory utils require PyTorch >= 1.10 to have torch.cuda.mem_get_info. + raise ValueError("`low_cpu_mem_usage` and `device_map` require PyTorch >= 1.10.") + + # Load config if we don't provide a configuration + config_path = pretrained_model_name_or_path + + user_agent = { + "diffusers": __version__, + "file_type": "model", + "framework": "pytorch", + } + + # load config + config, unused_kwargs, commit_hash = cls.load_config( + config_path, + cache_dir=cache_dir, + return_unused_kwargs=True, + return_commit_hash=True, + force_download=force_download, + proxies=proxies, + local_files_only=local_files_only, + token=token, + revision=revision, + subfolder=subfolder, + user_agent=user_agent, + **kwargs, + ) + # no in-place modification of the original config. + config = copy.deepcopy(config) + + # determine initial quantization config. + ####################################### + pre_quantized = "quantization_config" in config and config["quantization_config"] is not None + if pre_quantized or quantization_config is not None: + if pre_quantized: + config["quantization_config"] = DiffusersAutoQuantizer.merge_quantization_configs( + config["quantization_config"], quantization_config + ) + else: + config["quantization_config"] = quantization_config + hf_quantizer = DiffusersAutoQuantizer.from_config( + config["quantization_config"], pre_quantized=pre_quantized + ) + else: + hf_quantizer = None + + if hf_quantizer is not None: + is_bnb_quantization_method = hf_quantizer.quantization_config.quant_method.value == "bitsandbytes" + if is_bnb_quantization_method and device_map is not None: + raise NotImplementedError( + "Currently, `device_map` is automatically inferred for quantized bitsandbytes models. Support for providing `device_map` as an input will be added in the future." + ) + + hf_quantizer.validate_environment(torch_dtype=torch_dtype, from_flax=from_flax, device_map=device_map) + torch_dtype = hf_quantizer.update_torch_dtype(torch_dtype) + + # In order to ensure popular quantization methods are supported. 
Can be disable with `disable_telemetry` + user_agent["quant"] = hf_quantizer.quantization_config.quant_method.value + + # Force-set to `True` for more mem efficiency + if low_cpu_mem_usage is None: + low_cpu_mem_usage = True + logger.info("Set `low_cpu_mem_usage` to True as `hf_quantizer` is not None.") + elif not low_cpu_mem_usage: + raise ValueError("`low_cpu_mem_usage` cannot be False or None when using quantization.") + + # Check if `_keep_in_fp32_modules` is not None + use_keep_in_fp32_modules = (cls._keep_in_fp32_modules is not None) and ( + (torch_dtype == torch.float16) or hasattr(hf_quantizer, "use_keep_in_fp32_modules") + ) + if use_keep_in_fp32_modules: + keep_in_fp32_modules = cls._keep_in_fp32_modules + if not isinstance(keep_in_fp32_modules, list): + keep_in_fp32_modules = [keep_in_fp32_modules] + + if low_cpu_mem_usage is None: + low_cpu_mem_usage = True + logger.info("Set `low_cpu_mem_usage` to True as `_keep_in_fp32_modules` is not None.") + elif not low_cpu_mem_usage: + raise ValueError("`low_cpu_mem_usage` cannot be False when `keep_in_fp32_modules` is True.") + else: + keep_in_fp32_modules = [] + ####################################### + + # Determine if we're loading from a directory of sharded checkpoints. + is_sharded = False + index_file = None + is_local = os.path.isdir(pretrained_model_name_or_path) + index_file_kwargs = { + "is_local": is_local, + "pretrained_model_name_or_path": pretrained_model_name_or_path, + "subfolder": subfolder or "", + "use_safetensors": use_safetensors, + "cache_dir": cache_dir, + "variant": variant, + "force_download": force_download, + "proxies": proxies, + "local_files_only": local_files_only, + "token": token, + "revision": revision, + "user_agent": user_agent, + "commit_hash": commit_hash, + } + index_file = _fetch_index_file(**index_file_kwargs) + # In case the index file was not found we still have to consider the legacy format. + # this becomes applicable when the variant is not None. 
+ if variant is not None and (index_file is None or not os.path.exists(index_file)): + index_file = _fetch_index_file_legacy(**index_file_kwargs) + if index_file is not None and index_file.is_file(): + is_sharded = True + + if is_sharded and from_flax: + raise ValueError("Loading of sharded checkpoints is not supported when `from_flax=True`.") + + # load model + model_file = None + if from_flax: + model_file = _get_model_file( + pretrained_model_name_or_path, + weights_name=FLAX_WEIGHTS_NAME, + cache_dir=cache_dir, + force_download=force_download, + proxies=proxies, + local_files_only=local_files_only, + token=token, + revision=revision, + subfolder=subfolder, + user_agent=user_agent, + commit_hash=commit_hash, + ) + model = cls.from_config(config, **unused_kwargs) + + # Convert the weights + from .modeling_pytorch_flax_utils import load_flax_checkpoint_in_pytorch_model + + model = load_flax_checkpoint_in_pytorch_model(model, model_file) + else: + if is_sharded: + sharded_ckpt_cached_folder, sharded_metadata = _get_checkpoint_shard_files( + pretrained_model_name_or_path, + index_file, + cache_dir=cache_dir, + proxies=proxies, + local_files_only=local_files_only, + token=token, + user_agent=user_agent, + revision=revision, + subfolder=subfolder or "", + ) + if hf_quantizer is not None and is_bnb_quantization_method: + model_file = _merge_sharded_checkpoints(sharded_ckpt_cached_folder, sharded_metadata) + logger.info("Merged sharded checkpoints as `hf_quantizer` is not None.") + is_sharded = False + + elif use_safetensors and not is_sharded: + try: + model_file = _get_model_file( + pretrained_model_name_or_path, + weights_name=_add_variant(SAFETENSORS_WEIGHTS_NAME, variant), + cache_dir=cache_dir, + force_download=force_download, + proxies=proxies, + local_files_only=local_files_only, + token=token, + revision=revision, + subfolder=subfolder, + user_agent=user_agent, + commit_hash=commit_hash, + ) + + except IOError as e: + logger.error(f"An error occurred while trying to fetch {pretrained_model_name_or_path}: {e}") + if not allow_pickle: + raise + logger.warning( + "Defaulting to unsafe serialization. Pass `allow_pickle=False` to raise an error instead." + ) + + if model_file is None and not is_sharded: + model_file = _get_model_file( + pretrained_model_name_or_path, + weights_name=_add_variant(WEIGHTS_NAME, variant), + cache_dir=cache_dir, + force_download=force_download, + proxies=proxies, + local_files_only=local_files_only, + token=token, + revision=revision, + subfolder=subfolder, + user_agent=user_agent, + commit_hash=commit_hash, + ) + + if low_cpu_mem_usage: + # Instantiate model with empty weights + with accelerate.init_empty_weights(): + model = cls.from_config(config, **unused_kwargs) + + if hf_quantizer is not None: + hf_quantizer.preprocess_model( + model=model, device_map=device_map, keep_in_fp32_modules=keep_in_fp32_modules + ) + + # if device_map is None, load the state dict and move the params from meta device to the cpu + if device_map is None and not is_sharded: + # `torch.cuda.current_device()` is fine here when `hf_quantizer` is not None. + # It would error out during the `validate_environment()` call above in the absence of cuda. 
+ if hf_quantizer is None: + param_device = "cpu" + # TODO (sayakpaul, SunMarc): remove this after model loading refactor + else: + param_device = torch.device(torch.cuda.current_device()) + state_dict = load_state_dict(model_file, variant=variant) + model._convert_deprecated_attention_blocks(state_dict) + + # move the params from meta device to cpu + missing_keys = set(model.state_dict().keys()) - set(state_dict.keys()) + if hf_quantizer is not None: + missing_keys = hf_quantizer.update_missing_keys(model, missing_keys, prefix="") + if len(missing_keys) > 0: + raise ValueError( + f"Cannot load {cls} from {pretrained_model_name_or_path} because the following keys are" + f" missing: \n {', '.join(missing_keys)}. \n Please make sure to pass" + " `low_cpu_mem_usage=False` and `device_map=None` if you want to randomly initialize" + " those weights or else make sure your checkpoint file is correct." + ) + + unexpected_keys = load_model_dict_into_meta( + model, + state_dict, + device=param_device, + dtype=torch_dtype, + model_name_or_path=pretrained_model_name_or_path, + hf_quantizer=hf_quantizer, + keep_in_fp32_modules=keep_in_fp32_modules, + ) + + if cls._keys_to_ignore_on_load_unexpected is not None: + for pat in cls._keys_to_ignore_on_load_unexpected: + unexpected_keys = [k for k in unexpected_keys if re.search(pat, k) is None] + + if len(unexpected_keys) > 0: + logger.warning( + f"Some weights of the model checkpoint were not used when initializing {cls.__name__}: \n {[', '.join(unexpected_keys)]}" + ) + + else: + weights_path = index_file + with open(index_file) as f: + index = json.loads(f.read()) + if "weight_map" in index: + index = index["weight_map"] + weights_path = sorted(list(set(index.values()))) + weights_path = [os.path.join(pretrained_model_name_or_path, f) for f in weights_path] + + model = cls._load_model(model, weights_path, is_sharded) + + loading_info = { + "missing_keys": [], + "unexpected_keys": [], + "mismatched_keys": [], + "error_msgs": [], + } + else: + model = cls.from_config(config, **unused_kwargs) + + state_dict = load_state_dict(model_file, variant=variant) + model._convert_deprecated_attention_blocks(state_dict) + + model, missing_keys, unexpected_keys, mismatched_keys, error_msgs = cls._load_pretrained_model( + model, + state_dict, + model_file, + pretrained_model_name_or_path, + ignore_mismatched_sizes=ignore_mismatched_sizes, + ) + + loading_info = { + "missing_keys": missing_keys, + "unexpected_keys": unexpected_keys, + "mismatched_keys": mismatched_keys, + "error_msgs": error_msgs, + } + + if hf_quantizer is not None: + hf_quantizer.postprocess_model(model) + model.hf_quantizer = hf_quantizer + + if torch_dtype is not None and not isinstance(torch_dtype, torch.dtype): + raise ValueError( + f"{torch_dtype} needs to be of type `torch.dtype`, e.g. `torch.float16`, but is {type(torch_dtype)}." + ) + # When using `use_keep_in_fp32_modules` if we do a global `to()` here, then we will + # completely lose the effectivity of `use_keep_in_fp32_modules`. + elif torch_dtype is not None and hf_quantizer is None and not use_keep_in_fp32_modules: + model = model.to(torch_dtype) + + if hf_quantizer is not None: + # We also make sure to purge `_pre_quantization_dtype` when we serialize + # the model config because `_pre_quantization_dtype` is `torch.dtype`, not JSON serializable. 
+ model.register_to_config(_name_or_path=pretrained_model_name_or_path, _pre_quantization_dtype=torch_dtype) + else: + model.register_to_config(_name_or_path=pretrained_model_name_or_path) + + # Set model in evaluation mode to deactivate DropOut modules by default + model.eval() + if output_loading_info: + return model, loading_info + + return model + + @classmethod + def _load_model(cls, model, weights_path, is_sharded): + if not is_sharded: + state_dict = load_state_dict(weights_path) + model.load_weights(state_dict) + else: + need_key = set(model.state_dict().keys()) + state_dict = {} + cache = {} + for weight_file in weights_path: + state_dict = load_state_dict(weight_file) + state_dict.update(cache) + loadkey_cache = model.load_weights(state_dict, is_sharded) + if loadkey_cache : + if isinstance(loadkey_cache, tuple): + loaded_keys, cache = loadkey_cache + else: + loaded_keys = loadkey_cache + need_key = need_key.symmetric_difference(set(loaded_keys)) + + if len(need_key) > 0: + raise ValueError(f"The weight miss key: {need_key}") + return model + + def load_weights(self, state_dict, shard=False): + with torch.no_grad(): + if not shard: + self.load_state_dict(state_dict) + return {} + else: + self.load_state_dict(state_dict, strict=False, assign=True) + return state_dict.keys() + + # Adapted from `transformers`. + @wraps(torch.nn.Module.cuda) + def cuda(self, *args, **kwargs): + # Checks if the model has been loaded in 4-bit or 8-bit with BNB + if getattr(self, "quantization_method", None) == QuantizationMethod.BITS_AND_BYTES: + if getattr(self, "is_loaded_in_8bit", False): + raise ValueError( + "Calling `cuda()` is not supported for `8-bit` quantized models. " + " Please use the model as it is, since the model has already been set to the correct devices." + ) + elif is_bitsandbytes_version("<", "0.43.2"): + raise ValueError( + "Calling `cuda()` is not supported for `4-bit` quantized models with the installed version of bitsandbytes. " + f"The current device is `{self.device}`. If you intended to move the model, please install bitsandbytes >= 0.43.2." + ) + return super().cuda(*args, **kwargs) + + # Adapted from `transformers`. + @wraps(torch.nn.Module.to) + def to(self, *args, **kwargs): + dtype_present_in_args = "dtype" in kwargs + + if not dtype_present_in_args: + for arg in args: + if isinstance(arg, torch.dtype): + dtype_present_in_args = True + break + + if getattr(self, "is_quantized", False): + if dtype_present_in_args: + raise ValueError( + "Casting a quantized model to a new `dtype` is unsupported. To set the dtype of unquantized layers, please " + "use the `torch_dtype` argument when loading the model using `from_pretrained` or `from_single_file`" + ) + + if getattr(self, "quantization_method", None) == QuantizationMethod.BITS_AND_BYTES: + if getattr(self, "is_loaded_in_8bit", False): + raise ValueError( + "`.to` is not supported for `8-bit` bitsandbytes models. Please use the model as it is, since the" + " model has already been set to the correct devices and casted to the correct `dtype`." + ) + elif is_bitsandbytes_version("<", "0.43.2"): + raise ValueError( + "Calling `to()` is not supported for `4-bit` quantized models with the installed version of bitsandbytes. " + f"The current device is `{self.device}`. If you intended to move the model, please install bitsandbytes >= 0.43.2." + ) + return super().to(*args, **kwargs) + + # Taken from `transformers`. 
+ def half(self, *args): + # Checks if the model is quantized + if getattr(self, "is_quantized", False): + raise ValueError( + "`.half()` is not supported for quantized model. Please use the model as it is, since the" + " model has already been cast to the correct `dtype`." + ) + else: + return super().half(*args) + + # Taken from `transformers`. + def float(self, *args): + # Checks if the model is quantized + if getattr(self, "is_quantized", False): + raise ValueError( + "`.float()` is not supported for quantized model. Please use the model as it is, since the" + " model has already been cast to the correct `dtype`." + ) + else: + return super().float(*args) + + @classmethod + def _load_pretrained_model( + cls, + model, + state_dict: OrderedDict, + resolved_archive_file, + pretrained_model_name_or_path: Union[str, os.PathLike], + ignore_mismatched_sizes: bool = False, + ): + # Retrieve missing & unexpected_keys + model_state_dict = model.state_dict() + loaded_keys = list(state_dict.keys()) + + expected_keys = list(model_state_dict.keys()) + + original_loaded_keys = loaded_keys + + missing_keys = list(set(expected_keys) - set(loaded_keys)) + unexpected_keys = list(set(loaded_keys) - set(expected_keys)) + + # Make sure we are able to load base models as well as derived models (with heads) + model_to_load = model + + def _find_mismatched_keys( + state_dict, + model_state_dict, + loaded_keys, + ignore_mismatched_sizes, + ): + mismatched_keys = [] + if ignore_mismatched_sizes: + for checkpoint_key in loaded_keys: + model_key = checkpoint_key + + if ( + model_key in model_state_dict + and state_dict[checkpoint_key].shape != model_state_dict[model_key].shape + ): + mismatched_keys.append( + (checkpoint_key, state_dict[checkpoint_key].shape, model_state_dict[model_key].shape) + ) + del state_dict[checkpoint_key] + return mismatched_keys + + if state_dict is not None: + # Whole checkpoint + mismatched_keys = _find_mismatched_keys( + state_dict, + model_state_dict, + original_loaded_keys, + ignore_mismatched_sizes, + ) + error_msgs = _load_state_dict_into_model(model_to_load, state_dict) + + if len(error_msgs) > 0: + error_msg = "\n\t".join(error_msgs) + if "size mismatch" in error_msg: + error_msg += ( + "\n\tYou may consider adding `ignore_mismatched_sizes=True` in the model `from_pretrained` method." + ) + raise RuntimeError(f"Error(s) in loading state_dict for {model.__class__.__name__}:\n\t{error_msg}") + + if len(unexpected_keys) > 0: + logger.warning( + f"Some weights of the model checkpoint at {pretrained_model_name_or_path} were not used when" + f" initializing {model.__class__.__name__}: {unexpected_keys}\n- This IS expected if you are" + f" initializing {model.__class__.__name__} from the checkpoint of a model trained on another task" + " or with another architecture (e.g. initializing a BertForSequenceClassification model from a" + " BertForPreTraining model).\n- This IS NOT expected if you are initializing" + f" {model.__class__.__name__} from the checkpoint of a model that you expect to be exactly" + " identical (initializing a BertForSequenceClassification model from a" + " BertForSequenceClassification model)." 
+ ) + else: + logger.info(f"All model checkpoint weights were used when initializing {model.__class__.__name__}.\n") + if len(missing_keys) > 0: + logger.warning( + f"Some weights of {model.__class__.__name__} were not initialized from the model checkpoint at" + f" {pretrained_model_name_or_path} and are newly initialized: {missing_keys}\nYou should probably" + " TRAIN this model on a down-stream task to be able to use it for predictions and inference." + ) + elif len(mismatched_keys) == 0: + logger.info( + f"All the weights of {model.__class__.__name__} were initialized from the model checkpoint at" + f" {pretrained_model_name_or_path}.\nIf your task is similar to the task the model of the" + f" checkpoint was trained on, you can already use {model.__class__.__name__} for predictions" + " without further training." + ) + if len(mismatched_keys) > 0: + mismatched_warning = "\n".join( + [ + f"- {key}: found shape {shape1} in the checkpoint and {shape2} in the model instantiated" + for key, shape1, shape2 in mismatched_keys + ] + ) + logger.warning( + f"Some weights of {model.__class__.__name__} were not initialized from the model checkpoint at" + f" {pretrained_model_name_or_path} and are newly initialized because the shapes did not" + f" match:\n{mismatched_warning}\nYou should probably TRAIN this model on a down-stream task to be" + " able to use it for predictions and inference." + ) + + return model, missing_keys, unexpected_keys, mismatched_keys, error_msgs + + @classmethod + def _get_signature_keys(cls, obj): + parameters = inspect.signature(obj.__init__).parameters + required_parameters = {k: v for k, v in parameters.items() if v.default == inspect._empty} + optional_parameters = set({k for k, v in parameters.items() if v.default != inspect._empty}) + expected_modules = set(required_parameters.keys()) - {"self"} + + return expected_modules, optional_parameters + + # Adapted from `transformers` modeling_utils.py + def _get_no_split_modules(self, device_map: str): + """ + Get the modules of the model that should not be spit when using device_map. We iterate through the modules to + get the underlying `_no_split_modules`. + + Args: + device_map (`str`): + The device map value. Options are ["auto", "balanced", "balanced_low_0", "sequential"] + + Returns: + `List[str]`: List of modules that should not be split + """ + _no_split_modules = set() + modules_to_check = [self] + while len(modules_to_check) > 0: + module = modules_to_check.pop(-1) + # if the module does not appear in _no_split_modules, we also check the children + if module.__class__.__name__ not in _no_split_modules: + if isinstance(module, ModelMixin): + if module._no_split_modules is None: + raise ValueError( + f"{module.__class__.__name__} does not support `device_map='{device_map}'`. To implement support, the model " + "class needs to implement the `_no_split_modules` attribute." + ) + else: + _no_split_modules = _no_split_modules | set(module._no_split_modules) + modules_to_check += list(module.children()) + return list(_no_split_modules) + + @property + def device(self) -> torch.device: + """ + `torch.device`: The device on which the module is (assuming that all the module parameters are on the same + device). + """ + return get_parameter_device(self) + + @property + def dtype(self) -> torch.dtype: + """ + `torch.dtype`: The dtype of the module (assuming that all the module parameters have the same dtype). 
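+
+        Example (a minimal illustration; the checkpoint id is only a placeholder):
+
+        ```py
+        unet = UNet2DConditionModel.from_pretrained("runwayml/stable-diffusion-v1-5", subfolder="unet")
+        unet.to(torch.float16).dtype  # torch.float16
+        ```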
+ """ + return get_parameter_dtype(self) + + def num_parameters(self, only_trainable: bool = False, exclude_embeddings: bool = False) -> int: + """ + Get number of (trainable or non-embedding) parameters in the module. + + Args: + only_trainable (`bool`, *optional*, defaults to `False`): + Whether or not to return only the number of trainable parameters. + exclude_embeddings (`bool`, *optional*, defaults to `False`): + Whether or not to return only the number of non-embedding parameters. + + Returns: + `int`: The number of parameters. + + Example: + + ```py + from diffusers import UNet2DConditionModel + + model_id = "runwayml/stable-diffusion-v1-5" + unet = UNet2DConditionModel.from_pretrained(model_id, subfolder="unet") + unet.num_parameters(only_trainable=True) + 859520964 + ``` + """ + is_loaded_in_4bit = getattr(self, "is_loaded_in_4bit", False) + + if is_loaded_in_4bit: + if is_bitsandbytes_available(): + import bitsandbytes as bnb + else: + raise ValueError( + "bitsandbytes is not installed but it seems that the model has been loaded in 4bit precision, something went wrong" + " make sure to install bitsandbytes with `pip install bitsandbytes`. You also need a GPU. " + ) + + if exclude_embeddings: + embedding_param_names = [ + f"{name}.weight" for name, module_type in self.named_modules() if isinstance(module_type, nn.Embedding) + ] + total_parameters = [ + parameter for name, parameter in self.named_parameters() if name not in embedding_param_names + ] + else: + total_parameters = list(self.parameters()) + + total_numel = [] + + for param in total_parameters: + if param.requires_grad or not only_trainable: + # For 4bit models, we need to multiply the number of parameters by 2 as half of the parameters are + # used for the 4bit quantization (uint8 tensors are stored) + if is_loaded_in_4bit and isinstance(param, bnb.nn.Params4bit): + if hasattr(param, "element_size"): + num_bytes = param.element_size() + elif hasattr(param, "quant_storage"): + num_bytes = param.quant_storage.itemsize + else: + num_bytes = 1 + total_numel.append(param.numel() * 2 * num_bytes) + else: + total_numel.append(param.numel()) + + return sum(total_numel) + + def get_memory_footprint(self, return_buffers=True): + r""" + Get the memory footprint of a model. This will return the memory footprint of the current model in bytes. + Useful to benchmark the memory footprint of the current model and design some tests. Solution inspired from the + PyTorch discussions: https://discuss.pytorch.org/t/gpu-memory-that-model-uses/56822/2 + + Arguments: + return_buffers (`bool`, *optional*, defaults to `True`): + Whether to return the size of the buffer tensors in the computation of the memory footprint. Buffers + are tensors that do not require gradients and not registered as parameters. E.g. mean and std in batch + norm layers. 
Please see: https://discuss.pytorch.org/t/what-pytorch-means-by-buffers/120266/2 + """ + mem = sum([param.nelement() * param.element_size() for param in self.parameters()]) + if return_buffers: + mem_bufs = sum([buf.nelement() * buf.element_size() for buf in self.buffers()]) + mem = mem + mem_bufs + return mem + + def _convert_deprecated_attention_blocks(self, state_dict: OrderedDict) -> None: + deprecated_attention_block_paths = [] + + def recursive_find_attn_block(name, module): + if hasattr(module, "_from_deprecated_attn_block") and module._from_deprecated_attn_block: + deprecated_attention_block_paths.append(name) + + for sub_name, sub_module in module.named_children(): + sub_name = sub_name if name == "" else f"{name}.{sub_name}" + recursive_find_attn_block(sub_name, sub_module) + + recursive_find_attn_block("", self) + + # NOTE: we have to check if the deprecated parameters are in the state dict + # because it is possible we are loading from a state dict that was already + # converted + + for path in deprecated_attention_block_paths: + # group_norm path stays the same + + # query -> to_q + if f"{path}.query.weight" in state_dict: + state_dict[f"{path}.to_q.weight"] = state_dict.pop(f"{path}.query.weight") + if f"{path}.query.bias" in state_dict: + state_dict[f"{path}.to_q.bias"] = state_dict.pop(f"{path}.query.bias") + + # key -> to_k + if f"{path}.key.weight" in state_dict: + state_dict[f"{path}.to_k.weight"] = state_dict.pop(f"{path}.key.weight") + if f"{path}.key.bias" in state_dict: + state_dict[f"{path}.to_k.bias"] = state_dict.pop(f"{path}.key.bias") + + # value -> to_v + if f"{path}.value.weight" in state_dict: + state_dict[f"{path}.to_v.weight"] = state_dict.pop(f"{path}.value.weight") + if f"{path}.value.bias" in state_dict: + state_dict[f"{path}.to_v.bias"] = state_dict.pop(f"{path}.value.bias") + + # proj_attn -> to_out.0 + if f"{path}.proj_attn.weight" in state_dict: + state_dict[f"{path}.to_out.0.weight"] = state_dict.pop(f"{path}.proj_attn.weight") + if f"{path}.proj_attn.bias" in state_dict: + state_dict[f"{path}.to_out.0.bias"] = state_dict.pop(f"{path}.proj_attn.bias") + + def _temp_convert_self_to_deprecated_attention_blocks(self) -> None: + deprecated_attention_block_modules = [] + + def recursive_find_attn_block(module): + if hasattr(module, "_from_deprecated_attn_block") and module._from_deprecated_attn_block: + deprecated_attention_block_modules.append(module) + + for sub_module in module.children(): + recursive_find_attn_block(sub_module) + + recursive_find_attn_block(self) + + for module in deprecated_attention_block_modules: + module.query = module.to_q + module.key = module.to_k + module.value = module.to_v + module.proj_attn = module.to_out[0] + + # We don't _have_ to delete the old attributes, but it's helpful to ensure + # that _all_ the weights are loaded into the new attributes and we're not + # making an incorrect assumption that this model should be converted when + # it really shouldn't be. 
+ del module.to_q + del module.to_k + del module.to_v + del module.to_out + + def _undo_temp_convert_self_to_deprecated_attention_blocks(self) -> None: + deprecated_attention_block_modules = [] + + def recursive_find_attn_block(module) -> None: + if hasattr(module, "_from_deprecated_attn_block") and module._from_deprecated_attn_block: + deprecated_attention_block_modules.append(module) + + for sub_module in module.children(): + recursive_find_attn_block(sub_module) + + recursive_find_attn_block(self) + + for module in deprecated_attention_block_modules: + module.to_q = module.query + module.to_k = module.key + module.to_v = module.value + module.to_out = nn.ModuleList([module.proj_attn, nn.Dropout(module.dropout)]) + + del module.query + del module.key + del module.value + del module.proj_attn -- Gitee From 50bdb564ce183ce96debfb36fe5b700e07da068b Mon Sep 17 00:00:00 2001 From: jiangmengyu Date: Wed, 25 Dec 2024 17:13:23 +0800 Subject: [PATCH 10/91] =?UTF-8?q?CogView3plus=E9=80=82=E9=85=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../cogview3/cogview3plus/models/attention_processor.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/attention_processor.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/attention_processor.py index 1b9ce8616c..1feaa2be1f 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/attention_processor.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/attention_processor.py @@ -67,9 +67,10 @@ class CogVideoXAttnProcessor2_0: key = attn.norm_k(key) _, N, _, D = query.shape - query = F.pad(query, (0, 64 - D)) - key = F.pad(key, (0, 64 - D)) - value = F.pad(value, (0, 64 - D)) + dim = 64 + query = F.pad(query, (0, dim - D)) + key = F.pad(key, (0, dim - D)) + value = F.pad(value, (0, dim - D)) hidden_states = torch_npu.npu_prompt_flash_attention( query, key, -- Gitee From 1d06781e8aa66b106da82124b8848789f816b1df Mon Sep 17 00:00:00 2001 From: jiangmengyu Date: Thu, 26 Dec 2024 09:28:29 +0800 Subject: [PATCH 11/91] =?UTF-8?q?CogView3plus=E9=80=82=E9=85=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../cogview3plus/layers/normalization.py | 24 ++ .../models/attention_processor.py | 376 +++++++++++++++++- .../models/transformer_cogview3plus.py | 5 +- 3 files changed, 399 insertions(+), 6 deletions(-) diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/layers/normalization.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/layers/normalization.py index b2576d26f5..3dd2bba76c 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/layers/normalization.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/layers/normalization.py @@ -133,6 +133,30 @@ class CogView3PlusAdaLayerNormZeroTextImage(nn.Module): return x, gate_msa, shift_mlp, scale_mlp, gate_mlp, context, c_gate_msa, c_shift_mlp, c_scale_mlp, c_gate_mlp +class FP32LayerNorm(nn.LayerNorm): + def forward(self, inputs: torch.Tensor) -> torch.Tensor: + origin_dtype = inputs.dtype + return F.layer_norm( + inputs.float(), + self.normalized_shape, + self.weight.float() if self.weight is not None else None, + self.bias.float() if self.bias is not None else None, + self.eps, + ).to(origin_dtype) + + +class LpNorm(nn.Module): + def __init__(self, p: int = 2, dim: int = -1, eps: 
float = 1e-12): + super().__init__() + + self.p = p + self.dim = dim + self.eps = eps + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + return F.normalize(hidden_states, p=self.p, dim=self.dim, eps=self.eps) + + class AdaLayerNormContinuous(nn.Module): def __init__( self, diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/attention_processor.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/attention_processor.py index 1feaa2be1f..c6879c9898 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/attention_processor.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/attention_processor.py @@ -11,14 +11,384 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - +import inspect from typing import Optional import torch import torch.nn.functional as F -import torch_npu +from torch import nn + +from diffusers.utils import logging +from diffusers.utils.torch_utils import maybe_allow_in_graph + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + + +@maybe_allow_in_graph +class Attention(nn.Module): + r""" + A cross attention layer. + + Parameters: + query_dim (`int`): + The number of channels in the query. + cross_attention_dim (`int`, *optional*): + The number of channels in the encoder_hidden_states. If not given, defaults to `query_dim`. + heads (`int`, *optional*, defaults to 8): + The number of heads to use for multi-head attention. + kv_heads (`int`, *optional*, defaults to `None`): + The number of key and value heads to use for multi-head attention. Defaults to `heads`. If + `kv_heads=heads`, the model will use Multi Head Attention (MHA), if `kv_heads=1` the model will use Multi + Query Attention (MQA) otherwise GQA is used. + dim_head (`int`, *optional*, defaults to 64): + The number of channels in each head. + dropout (`float`, *optional*, defaults to 0.0): + The dropout probability to use. + bias (`bool`, *optional*, defaults to False): + Set to `True` for the query, key, and value linear layers to contain a bias parameter. + upcast_attention (`bool`, *optional*, defaults to False): + Set to `True` to upcast the attention computation to `float32`. + upcast_softmax (`bool`, *optional*, defaults to False): + Set to `True` to upcast the softmax computation to `float32`. + cross_attention_norm (`str`, *optional*, defaults to `None`): + The type of normalization to use for the cross attention. Can be `None`, `layer_norm`, or `group_norm`. + cross_attention_norm_num_groups (`int`, *optional*, defaults to 32): + The number of groups to use for the group norm in the cross attention. + added_kv_proj_dim (`int`, *optional*, defaults to `None`): + The number of channels to use for the added key and value projections. If `None`, no projection is used. + norm_num_groups (`int`, *optional*, defaults to `None`): + The number of groups to use for the group norm in the attention. + spatial_norm_dim (`int`, *optional*, defaults to `None`): + The number of channels to use for the spatial normalization. + out_bias (`bool`, *optional*, defaults to `True`): + Set to `True` to use a bias in the output linear layer. + scale_qk (`bool`, *optional*, defaults to `True`): + Set to `True` to scale the query and key by `1 / sqrt(dim_head)`. 
+ only_cross_attention (`bool`, *optional*, defaults to `False`): + Set to `True` to only use cross attention and not added_kv_proj_dim. Can only be set to `True` if + `added_kv_proj_dim` is not `None`. + eps (`float`, *optional*, defaults to 1e-5): + An additional value added to the denominator in group normalization that is used for numerical stability. + rescale_output_factor (`float`, *optional*, defaults to 1.0): + A factor to rescale the output by dividing it with this value. + residual_connection (`bool`, *optional*, defaults to `False`): + Set to `True` to add the residual connection to the output. + _from_deprecated_attn_block (`bool`, *optional*, defaults to `False`): + Set to `True` if the attention block is loaded from a deprecated state dict. + processor (`AttnProcessor`, *optional*, defaults to `None`): + The attention processor to use. If `None`, defaults to `AttnProcessor2_0` if `torch 2.x` is used and + `AttnProcessor` otherwise. + """ + + def __init__( + self, + query_dim: int, + cross_attention_dim: Optional[int] = None, + heads: int = 8, + kv_heads: Optional[int] = None, + dim_head: int = 64, + dropout: float = 0.0, + bias: bool = False, + upcast_attention: bool = False, + upcast_softmax: bool = False, + cross_attention_norm: Optional[str] = None, + cross_attention_norm_num_groups: int = 32, + qk_norm: Optional[str] = None, + added_kv_proj_dim: Optional[int] = None, + added_proj_bias: Optional[bool] = True, + norm_num_groups: Optional[int] = None, + spatial_norm_dim: Optional[int] = None, + out_bias: bool = True, + scale_qk: bool = True, + only_cross_attention: bool = False, + eps: float = 1e-5, + rescale_output_factor: float = 1.0, + residual_connection: bool = False, + _from_deprecated_attn_block: bool = False, + processor: Optional["AttnProcessor"] = None, + out_dim: int = None, + out_context_dim: int = None, + context_pre_only=None, + pre_only=False, + elementwise_affine: bool = True, + is_causal: bool = False, + ): + super().__init__() + + # To prevent circular import. 
+ from ..layers.normalization import FP32LayerNorm, LpNorm, RMSNorm + + self.inner_dim = out_dim if out_dim is not None else dim_head * heads + self.inner_kv_dim = self.inner_dim if kv_heads is None else dim_head * kv_heads + self.query_dim = query_dim + self.use_bias = bias + self.is_cross_attention = cross_attention_dim is not None + self.cross_attention_dim = cross_attention_dim if cross_attention_dim is not None else query_dim + self.upcast_attention = upcast_attention + self.upcast_softmax = upcast_softmax + self.rescale_output_factor = rescale_output_factor + self.residual_connection = residual_connection + self.dropout = dropout + self.fused_projections = False + self.out_dim = out_dim if out_dim is not None else query_dim + self.out_context_dim = out_context_dim if out_context_dim is not None else query_dim + self.context_pre_only = context_pre_only + self.pre_only = pre_only + self.is_causal = is_causal + + # we make use of this private variable to know whether this class is loaded + # with an deprecated state dict so that we can convert it on the fly + self._from_deprecated_attn_block = _from_deprecated_attn_block + + self.scale_qk = scale_qk + self.scale = dim_head**-0.5 if self.scale_qk else 1.0 + + self.heads = out_dim // dim_head if out_dim is not None else heads + # for slice_size > 0 the attention score computation + # is split across the batch axis to save memory + # You can set slice_size with `set_attention_slice` + self.sliceable_head_dim = heads + + self.added_kv_proj_dim = added_kv_proj_dim + self.only_cross_attention = only_cross_attention + + if self.added_kv_proj_dim is None and self.only_cross_attention: + raise ValueError( + "`only_cross_attention` can only be set to True if `added_kv_proj_dim` is not None. Make sure to set either `only_cross_attention=False` or define `added_kv_proj_dim`." + ) + + if norm_num_groups is not None: + self.group_norm = nn.GroupNorm(num_channels=query_dim, num_groups=norm_num_groups, eps=eps, affine=True) + else: + self.group_norm = None + + self.spatial_norm = None + + if qk_norm is None: + self.norm_q = None + self.norm_k = None + elif qk_norm == "layer_norm": + self.norm_q = nn.LayerNorm(dim_head, eps=eps, elementwise_affine=elementwise_affine) + self.norm_k = nn.LayerNorm(dim_head, eps=eps, elementwise_affine=elementwise_affine) + elif qk_norm == "fp32_layer_norm": + self.norm_q = FP32LayerNorm(dim_head, elementwise_affine=False, bias=False, eps=eps) + self.norm_k = FP32LayerNorm(dim_head, elementwise_affine=False, bias=False, eps=eps) + elif qk_norm == "layer_norm_across_heads": + # Lumina applies qk norm across all heads + self.norm_q = nn.LayerNorm(dim_head * heads, eps=eps) + self.norm_k = nn.LayerNorm(dim_head * kv_heads, eps=eps) + elif qk_norm == "rms_norm": + self.norm_q = RMSNorm(dim_head, eps=eps) + self.norm_k = RMSNorm(dim_head, eps=eps) + elif qk_norm == "rms_norm_across_heads": + # LTX applies qk norm across all heads + self.norm_q = RMSNorm(dim_head * heads, eps=eps) + self.norm_k = RMSNorm(dim_head * kv_heads, eps=eps) + elif qk_norm == "l2": + self.norm_q = LpNorm(p=2, dim=-1, eps=eps) + self.norm_k = LpNorm(p=2, dim=-1, eps=eps) + else: + raise ValueError(f"unknown qk_norm: {qk_norm}. 
Should be None,'layer_norm','fp32_layer_norm','rms_norm'") + + if cross_attention_norm is None: + self.norm_cross = None + elif cross_attention_norm == "layer_norm": + self.norm_cross = nn.LayerNorm(self.cross_attention_dim) + elif cross_attention_norm == "group_norm": + if self.added_kv_proj_dim is not None: + # The given `encoder_hidden_states` are initially of shape + # (batch_size, seq_len, added_kv_proj_dim) before being projected + # to (batch_size, seq_len, cross_attention_dim). The norm is applied + # before the projection, so we need to use `added_kv_proj_dim` as + # the number of channels for the group norm. + norm_cross_num_channels = added_kv_proj_dim + else: + norm_cross_num_channels = self.cross_attention_dim + + self.norm_cross = nn.GroupNorm( + num_channels=norm_cross_num_channels, num_groups=cross_attention_norm_num_groups, eps=1e-5, affine=True + ) + else: + raise ValueError( + f"unknown cross_attention_norm: {cross_attention_norm}. Should be None, 'layer_norm' or 'group_norm'" + ) + + self.to_q = nn.Linear(query_dim, self.inner_dim, bias=bias) + + if not self.only_cross_attention: + # only relevant for the `AddedKVProcessor` classes + self.to_k = nn.Linear(self.cross_attention_dim, self.inner_kv_dim, bias=bias) + self.to_v = nn.Linear(self.cross_attention_dim, self.inner_kv_dim, bias=bias) + else: + self.to_k = None + self.to_v = None + + self.added_proj_bias = added_proj_bias + if self.added_kv_proj_dim is not None: + self.add_k_proj = nn.Linear(added_kv_proj_dim, self.inner_kv_dim, bias=added_proj_bias) + self.add_v_proj = nn.Linear(added_kv_proj_dim, self.inner_kv_dim, bias=added_proj_bias) + if self.context_pre_only is not None: + self.add_q_proj = nn.Linear(added_kv_proj_dim, self.inner_dim, bias=added_proj_bias) + else: + self.add_q_proj = None + self.add_k_proj = None + self.add_v_proj = None + + if not self.pre_only: + self.to_out = nn.ModuleList([]) + self.to_out.append(nn.Linear(self.inner_dim, self.out_dim, bias=out_bias)) + self.to_out.append(nn.Dropout(dropout)) + else: + self.to_out = None + + if self.context_pre_only is not None and not self.context_pre_only: + self.to_add_out = nn.Linear(self.inner_dim, self.out_context_dim, bias=out_bias) + else: + self.to_add_out = None + + if qk_norm is not None and added_kv_proj_dim is not None: + if qk_norm == "fp32_layer_norm": + self.norm_added_q = FP32LayerNorm(dim_head, elementwise_affine=False, bias=False, eps=eps) + self.norm_added_k = FP32LayerNorm(dim_head, elementwise_affine=False, bias=False, eps=eps) + elif qk_norm == "rms_norm": + self.norm_added_q = RMSNorm(dim_head, eps=eps) + self.norm_added_k = RMSNorm(dim_head, eps=eps) + else: + raise ValueError( + f"unknown qk_norm: {qk_norm}. Should be one of `None,'layer_norm','fp32_layer_norm','rms_norm'`" + ) + else: + self.norm_added_q = None + self.norm_added_k = None + + self.set_processor(processor) + + def set_processor(self, processor: "AttnProcessor") -> None: + r""" + Set the attention processor to use. + + Args: + processor (`AttnProcessor`): + The attention processor to use. 
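+
+        Example (a minimal sketch; the dimensions are illustrative, and `CogVideoXAttnProcessor2_0` is the
+        NPU attention processor defined alongside this class):
+
+        ```py
+        attn = Attention(query_dim=2560, heads=64, dim_head=40, qk_norm="layer_norm")
+        attn.set_processor(CogVideoXAttnProcessor2_0())
+        ```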
+ """ + # if current processor is in `self._modules` and if passed `processor` is not, we need to + # pop `processor` from `self._modules` + if ( + hasattr(self, "processor") + and isinstance(self.processor, torch.nn.Module) + and not isinstance(processor, torch.nn.Module) + ): + logger.info(f"You are removing possibly trained weights of {self.processor} with {processor}") + self._modules.pop("processor") + + self.processor = processor + + def get_processor(self, return_deprecated_lora: bool = False) -> "AttentionProcessor": + r""" + Get the attention processor in use. + + Args: + return_deprecated_lora (`bool`, *optional*, defaults to `False`): + Set to `True` to return the deprecated LoRA attention processor. + + Returns: + "AttentionProcessor": The attention processor in use. + """ + if not return_deprecated_lora: + return self.processor + + def forward( + self, + hidden_states: torch.Tensor, + encoder_hidden_states: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + **cross_attention_kwargs, + ) -> torch.Tensor: + r""" + The forward method of the `Attention` class. + + Args: + hidden_states (`torch.Tensor`): + The hidden states of the query. + encoder_hidden_states (`torch.Tensor`, *optional*): + The hidden states of the encoder. + attention_mask (`torch.Tensor`, *optional*): + The attention mask to use. If `None`, no mask is applied. + **cross_attention_kwargs: + Additional keyword arguments to pass along to the cross attention. + + Returns: + `torch.Tensor`: The output of the attention layer. + """ + # The `Attention` class can call different attention processors / attention functions + # here we simply pass along all tensors to the selected processor class + # For standard processors that are defined here, `**cross_attention_kwargs` is empty + + attn_parameters = set(inspect.signature(self.processor.__call__).parameters.keys()) + quiet_attn_parameters = {"ip_adapter_masks", "ip_hidden_states"} + unused_kwargs = [ + k for k, _ in cross_attention_kwargs.items() if k not in attn_parameters and k not in quiet_attn_parameters + ] + if len(unused_kwargs) > 0: + logger.warning( + f"cross_attention_kwargs {unused_kwargs} are not expected by {self.processor.__class__.__name__} and will be ignored." + ) + cross_attention_kwargs = {k: w for k, w in cross_attention_kwargs.items() if k in attn_parameters} + + return self.processor( + self, + hidden_states, + encoder_hidden_states=encoder_hidden_states, + attention_mask=attention_mask, + **cross_attention_kwargs, + ) + + def prepare_attention_mask( + self, attention_mask: torch.Tensor, target_length: int, batch_size: int, out_dim: int = 3 + ) -> torch.Tensor: + r""" + Prepare the attention mask for the attention computation. + + Args: + attention_mask (`torch.Tensor`): + The attention mask to prepare. + target_length (`int`): + The target length of the attention mask. This is the length of the attention mask after padding. + batch_size (`int`): + The batch size, which is used to repeat the attention mask. + out_dim (`int`, *optional*, defaults to `3`): + The output dimension of the attention mask. Can be either `3` or `4`. + + Returns: + `torch.Tensor`: The prepared attention mask. + """ + head_size = self.heads + if attention_mask is None: + return attention_mask + + current_length: int = attention_mask.shape[-1] + if current_length != target_length: + if attention_mask.device.type == "mps": + # HACK: MPS: Does not support padding by greater than dimension of input tensor. 
+ # Instead, we can manually construct the padding tensor. + padding_shape = (attention_mask.shape[0], attention_mask.shape[1], target_length) + padding = torch.zeros(padding_shape, dtype=attention_mask.dtype, device=attention_mask.device) + attention_mask = torch.cat([attention_mask, padding], dim=2) + else: + # TODO: for pipelines such as stable-diffusion, padding cross-attn mask: + # we want to instead pad by (0, remaining_length), where remaining_length is: + # remaining_length: int = target_length - current_length + # TODO: re-enable tests/models/test_models_unet_2d_condition.py#test_model_xattn_padding + attention_mask = F.pad(attention_mask, (0, target_length), value=0.0) + + if out_dim == 3: + if attention_mask.shape[0] < batch_size * head_size: + attention_mask = attention_mask.repeat_interleave(head_size, dim=0) + elif out_dim == 4: + attention_mask = attention_mask.unsqueeze(1) + attention_mask = attention_mask.repeat_interleave(head_size, dim=1) -from diffusers.models.attention_processor import Attention + return attention_mask class CogVideoXAttnProcessor2_0: diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/transformer_cogview3plus.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/transformer_cogview3plus.py index 9a343d4c86..ec773e4af9 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/transformer_cogview3plus.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/transformer_cogview3plus.py @@ -19,13 +19,13 @@ import torch import torch.nn as nn from diffusers.configuration_utils import ConfigMixin, register_to_config -from diffusers.models.attention_processor import Attention, AttentionProcessor +from diffusers.models.attention_processor import AttentionProcessor from diffusers.utils import is_torch_version, logging from diffusers.models.modeling_outputs import Transformer2DModelOutput from .modeling_utils import ModelMixin from .attention import FeedForward -from .attention_processor import CogVideoXAttnProcessor2_0 +from .attention_processor import CogVideoXAttnProcessor2_0, Attention from ..layers import CogView3PlusAdaLayerNormZeroTextImage, AdaLayerNormContinuous from ..layers import CogView3CombinedTimestepSizeEmbeddings, CogView3PlusPatchEmbed @@ -177,7 +177,6 @@ class CogView3PlusTransformer2DModel(ModelMixin, ConfigMixin): time_embed_dim: int = 512, condition_dim: int = 256, pos_embed_max_size: int = 128, - sample_size: int = 128, ): super().__init__() self.out_channels = out_channels -- Gitee From 7c95104fe3b385f0035097eb3450c3fe837f80b6 Mon Sep 17 00:00:00 2001 From: jiangmengyu Date: Thu, 26 Dec 2024 09:31:00 +0800 Subject: [PATCH 12/91] =?UTF-8?q?CogView3plus=E9=80=82=E9=85=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../foundation/cogview3/cogview3plus/vae/autoencoder_kl.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/vae/autoencoder_kl.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/vae/autoencoder_kl.py index fcba50ccae..cea74eb29f 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/vae/autoencoder_kl.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/vae/autoencoder_kl.py @@ -87,11 +87,6 @@ class AutoencoderKL(ModelMixin, ConfigMixin, FromOriginalModelMixin, PeftAdapter latent_channels: int = 4, norm_num_groups: int = 32, sample_size: int = 32, - scaling_factor: 
float = 0.18215, - shift_factor: Optional[float] = None, - latents_mean: Optional[Tuple[float]] = None, - latents_std: Optional[Tuple[float]] = None, - force_upcast: float = True, use_quant_conv: bool = True, use_post_quant_conv: bool = True, mid_block_add_attention: bool = True, -- Gitee From f4e1a62fc4ed070086bf0303bd82653f1f05792a Mon Sep 17 00:00:00 2001 From: jiangmengyu Date: Thu, 26 Dec 2024 09:32:17 +0800 Subject: [PATCH 13/91] =?UTF-8?q?CogView3plus=E9=80=82=E9=85=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../cogview3/cogview3plus/models/attention_processor.py | 1 + 1 file changed, 1 insertion(+) diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/attention_processor.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/attention_processor.py index c6879c9898..b7cdf97abb 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/attention_processor.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/attention_processor.py @@ -17,6 +17,7 @@ from typing import Optional import torch import torch.nn.functional as F from torch import nn +import torch_npu from diffusers.utils import logging from diffusers.utils.torch_utils import maybe_allow_in_graph -- Gitee From c756489d04d84d263e0792ce2e777ff499df944e Mon Sep 17 00:00:00 2001 From: jiangmengyu Date: Thu, 26 Dec 2024 09:58:49 +0800 Subject: [PATCH 14/91] =?UTF-8?q?CogView3plus=E9=80=82=E9=85=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../cogview3/cogview3plus/layers/__init__.py | 3 +- .../cogview3/cogview3plus/layers/linear.py | 95 +++++++++++++++++++ .../models/attention_processor.py | 19 +--- .../cogview3plus/models/modeling_utils.py | 3 + 4 files changed, 105 insertions(+), 15 deletions(-) create mode 100644 MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/layers/linear.py diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/layers/__init__.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/layers/__init__.py index c3e7c569e2..09760b9fd0 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/layers/__init__.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/layers/__init__.py @@ -1,2 +1,3 @@ from .normalization import CogView3PlusAdaLayerNormZeroTextImage, AdaLayerNormContinuous -from .embeddings import CogView3CombinedTimestepSizeEmbeddings, CogView3PlusPatchEmbed \ No newline at end of file +from .embeddings import CogView3CombinedTimestepSizeEmbeddings, CogView3PlusPatchEmbed +from .linear import QKVLinear \ No newline at end of file diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/layers/linear.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/layers/linear.py new file mode 100644 index 0000000000..805c2d2b34 --- /dev/null +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/layers/linear.py @@ -0,0 +1,95 @@ +#!/usr/bin/env python +# coding=utf-8 +# Copyright 2024 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import torch +import torch.nn as nn +import torch_npu + + +class QKVLinear(nn.Module): + def __init__(self, attention_dim, hidden_size, qkv_bias=True, cross_attention_dim=None, device=None, dtype=None): + super(QKVLinear, self).__init__() + self.attention_dim = attention_dim + self.hidden_size = hidden_size + self.cross_attention_dim = cross_attention_dim + self.qkv_bias = qkv_bias + + factory_kwargs = {"device": device, "dtype": dtype} + + if not cross_attention_dim: + self.weight = nn.Parameter(torch.empty([self.attention_dim, 3 * self.hidden_size], **factory_kwargs)) + if self.qkv_bias: + self.bias = nn.Parameter(torch.empty([3 * self.hidden_size], **factory_kwargs)) + else: + self.q_weight = nn.Parameter(torch.empty([self.attention_dim, self.hidden_size], **factory_kwargs)) + self.kv_weight = nn.Parameter(torch.empty([self.attention_dim, 2 * self.hidden_size], **factory_kwargs)) + + if self.qkv_bias: + self.q_bias = nn.Parameter(torch.empty([self.hidden_size], **factory_kwargs)) + self.kv_bias = nn.Parameter(torch.empty([2 * self.hidden_size], **factory_kwargs)) + + + def forward(self, hidden_states, encoder_hidden_states=None): + + if self.cross_attention_dim is None: + if not self.qkv_bias: + qkv = torch.matmul(hidden_states, self.weight) + else: + qkv = torch.addmm( + self.bias, + hidden_states.view(hidden_states.size(0) * hidden_states.size(1), hidden_states.size(2)), + self.weight, + beta=1, + alpha=1 + ) + + batch, seqlen, _ = hidden_states.shape + qkv_shape = (batch, seqlen, 3, -1) + qkv = qkv.view(qkv_shape) + q, k, v = qkv.unbind(2) + + else: + if not self.qkv_bias: + q = torch.matmul(hidden_states, self.q_weight) + kv = torch.matmul(encoder_hidden_states, self.kv_weight) + else: + q = torch.addmm( + self.q_bias, + hidden_states.view(hidden_states.size(0) * hidden_states.size(1), hidden_states.size(2)), + self.q_weight, + beta=1, + alpha=1 + ) + kv = torch.addmm( + self.kv_bias, + encoder_hidden_states.view( + encoder_hidden_states.size(0) * encoder_hidden_states.size(1), + encoder_hidden_states.size(2)), + self.kv_weight, + beta=1, + alpha=1 + ) + + batch, seqlen, _ = encoder_hidden_states.shape + kv_shape = (batch, seqlen, 2, -1) + + kv = kv.view(kv_shape) + k, v = kv.unbind(2) + + q = q.view(hidden_states.shape) + + return q, k, v \ No newline at end of file diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/attention_processor.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/attention_processor.py index b7cdf97abb..77863c2275 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/attention_processor.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/attention_processor.py @@ -22,6 +22,8 @@ import torch_npu from diffusers.utils import logging from diffusers.utils.torch_utils import maybe_allow_in_graph +from ..layers import QKVLinear + logger = logging.get_logger(__name__) # pylint: disable=invalid-name @@ -98,7 +100,6 @@ class Attention(nn.Module): added_kv_proj_dim: Optional[int] = None, added_proj_bias: Optional[bool] = True, norm_num_groups: 
Optional[int] = None, - spatial_norm_dim: Optional[int] = None, out_bias: bool = True, scale_qk: bool = True, only_cross_attention: bool = False, @@ -213,16 +214,8 @@ class Attention(nn.Module): raise ValueError( f"unknown cross_attention_norm: {cross_attention_norm}. Should be None, 'layer_norm' or 'group_norm'" ) - - self.to_q = nn.Linear(query_dim, self.inner_dim, bias=bias) - - if not self.only_cross_attention: - # only relevant for the `AddedKVProcessor` classes - self.to_k = nn.Linear(self.cross_attention_dim, self.inner_kv_dim, bias=bias) - self.to_v = nn.Linear(self.cross_attention_dim, self.inner_kv_dim, bias=bias) - else: - self.to_k = None - self.to_v = None + + self.to_qkv = QKVLinear(self.inner_dim, query_dim, qkv_bias=bias) self.added_proj_bias = added_proj_bias if self.added_kv_proj_dim is not None: @@ -421,9 +414,7 @@ class CogVideoXAttnProcessor2_0: attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size) attention_mask = attention_mask.view(batch_size, attn.heads, -1, attention_mask.shape[-1]) - query = attn.to_q(hidden_states) - key = attn.to_k(hidden_states) - value = attn.to_v(hidden_states) + query, key, value = attn.to_qkv(hidden_states) inner_dim = key.shape[-1] head_dim = inner_dim // attn.heads diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/modeling_utils.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/modeling_utils.py index a0740b8c67..56f94f0c0d 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/modeling_utils.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/modeling_utils.py @@ -920,6 +920,9 @@ class ModelMixin(torch.nn.Module, PushToHubMixin): ) else: + print(model.state_dict()) + exit() + weights_path = index_file with open(index_file) as f: index = json.loads(f.read()) -- Gitee From 7e0d57a85c40a970034be3b1243e2af0a38fe544 Mon Sep 17 00:00:00 2001 From: jiangmengyu Date: Thu, 26 Dec 2024 11:03:23 +0800 Subject: [PATCH 15/91] =?UTF-8?q?CogView3plus=E9=80=82=E9=85=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../cogview3plus/models/modeling_utils.py | 3 - .../models/transformer_cogview3plus.py | 59 ++++++++++++++++++- 2 files changed, 58 insertions(+), 4 deletions(-) diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/modeling_utils.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/modeling_utils.py index 56f94f0c0d..a0740b8c67 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/modeling_utils.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/modeling_utils.py @@ -920,9 +920,6 @@ class ModelMixin(torch.nn.Module, PushToHubMixin): ) else: - print(model.state_dict()) - exit() - weights_path = index_file with open(index_file) as f: index = json.loads(f.read()) diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/transformer_cogview3plus.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/transformer_cogview3plus.py index ec773e4af9..0846744ef7 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/transformer_cogview3plus.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/transformer_cogview3plus.py @@ -181,6 +181,7 @@ class CogView3PlusTransformer2DModel(ModelMixin, ConfigMixin): super().__init__() self.out_channels = out_channels 
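
The `QKVLinear` layer added above replaces the separate `to_q`/`to_k`/`to_v` projections with a single fused projection for the self-attention path. A minimal stand-alone sketch of the same idea in plain `nn.Linear` terms (dimensions are illustrative; this is not the patched class itself):

```python
import torch
import torch.nn as nn

class FusedQKV(nn.Module):
    """Toy fused QKV projection: one matmul, then split the result into q, k, v."""

    def __init__(self, dim: int, bias: bool = True):
        super().__init__()
        self.proj = nn.Linear(dim, 3 * dim, bias=bias)

    def forward(self, x: torch.Tensor):
        # x: (batch, seq_len, dim) -> qkv: (batch, seq_len, 3 * dim)
        qkv = self.proj(x)
        q, k, v = qkv.chunk(3, dim=-1)  # three (batch, seq_len, dim) tensors
        return q, k, v

x = torch.randn(2, 16, 64)
q, k, v = FusedQKV(64)(x)
print(q.shape, k.shape, v.shape)  # torch.Size([2, 16, 64]) each
```

One matmul over a `(dim, 3*dim)` weight generally keeps the accelerator busier than three smaller projections, which is the motivation for the fused layout.
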
self.inner_dim = num_attention_heads * attention_head_dim + self.num_layers = num_layers # CogView3 uses 3 additional SDXL-like conditions - original_size, target_size, crop_coords # Each of these are sincos embeddings of shape 2 * condition_dim @@ -223,6 +224,13 @@ class CogView3PlusTransformer2DModel(ModelMixin, ConfigMixin): self.gradient_checkpointing = False + self.q_weight_cache = None + self.q_bias_cache = None + self.k_weight_cache = None + self.k_bias_cache = None + self.v_weight_cache = None + self.v_bias_cache = None + @property # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.attn_processors def attn_processors(self) -> Dict[str, AttentionProcessor]: @@ -379,4 +387,53 @@ class CogView3PlusTransformer2DModel(ModelMixin, ConfigMixin): if not return_dict: return (output,) - return Transformer2DModelOutput(sample=output) \ No newline at end of file + return Transformer2DModelOutput(sample=output) + + def load_weights(self, state_dict, shard=False): + with torch.no_grad(): + if not shard: + self.load_state_dict(state_dict) + return {} + else: + weights = state_dict + + for i in range(self.num_layers): + if i != 26: + q_weight = weights.pop(f"transformer_blocks.{i}.attn1.to_q.weight", None) + q_bias = weights.pop(f"transformer_blocks.{i}.attn1.to_q.bias", None) + k_weight = weights.pop(f"transformer_blocks.{i}.attn1.to_k.weight", None) + k_bias = weights.pop(f"transformer_blocks.{i}.attn1.to_k.bias", None) + v_weight = weights.pop(f"transformer_blocks.{i}.attn1.to_v.weight", None) + v_bias = weights.pop(f"transformer_blocks.{i}.attn1.to_v.bias", None) + + # query, key, value的weight和bias权重存在同一个文件中,不会分开存储。 + if q_weight and k_weight and v_weight: + qkv_weight = torch.cat([q_weight, k_weight, v_weight], dim=0).transpose(0, 1).contiguous() + qkv_bias = torch.cat([q_bias, k_bias, v_bias], dim=0).contiguous() + weights[f"transformer_blocks.{i}.attn1.to_qkv.weight"] = qkv_weight + weights[f"transformer_blocks.{i}.attn1.to_qkv.bias"] = qkv_bias + else: + if self.q_weight_cache is None: + self.q_weight_cache = weights.pop(f"transformer_blocks.{i}.attn1.to_q.weight", None) + if self.q_bias_cache is None: + self.q_bias_cache = weights.pop(f"transformer_blocks.{i}.attn1.to_q.bias", None) + if self.k_weight_cache is None: + self.k_weight_cache = weights.pop(f"transformer_blocks.{i}.attn1.to_k.weight", None) + if self.k_bias_cache is None: + self.k_bias_cache = weights.pop(f"transformer_blocks.{i}.attn1.to_k.bias", None) + if self.v_weight_cache is None: + self.v_weight_cache = weights.pop(f"transformer_blocks.{i}.attn1.to_v.weight", None) + if self.v_bias_cache is None: + self.v_bias_cache = weights.pop(f"transformer_blocks.{i}.attn1.to_v.bias", None) + + if self.q_weight_cache and self.k_weight_cache and self.v_weight_cache: + qkv_weight = torch.cat( + [self.q_weight_cache, self.k_weight_cache, self.v_weight_cache], + dim=0 + ).transpose(0, 1).contiguous() + qkv_bias = torch.cat([self.q_bias_cache, self.k_bias_cache, self.v_bias_cache], dim=0).contiguous() + weights[f"transformer_blocks.{i}.attn1.to_qkv.weight"] = qkv_weight + weights[f"transformer_blocks.{i}.attn1.to_qkv.bias"] = qkv_bias + + self.load_state_dict(weights, strict=False, assign=True) + return state_dict.keys() \ No newline at end of file -- Gitee From 7765d34253ca81c1424c4a1059e381838e78b837 Mon Sep 17 00:00:00 2001 From: jiangmengyu Date: Thu, 26 Dec 2024 11:12:31 +0800 Subject: [PATCH 16/91] =?UTF-8?q?CogView3plus=E9=80=82=E9=85=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 
Content-Transfer-Encoding: 8bit --- .../cogview3/cogview3plus/models/transformer_cogview3plus.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/transformer_cogview3plus.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/transformer_cogview3plus.py index 0846744ef7..8c2086d644 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/transformer_cogview3plus.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/transformer_cogview3plus.py @@ -407,7 +407,7 @@ class CogView3PlusTransformer2DModel(ModelMixin, ConfigMixin): v_bias = weights.pop(f"transformer_blocks.{i}.attn1.to_v.bias", None) # query, key, value的weight和bias权重存在同一个文件中,不会分开存储。 - if q_weight and k_weight and v_weight: + if q_weight is not None and k_weight is not None and v_weight is not None: qkv_weight = torch.cat([q_weight, k_weight, v_weight], dim=0).transpose(0, 1).contiguous() qkv_bias = torch.cat([q_bias, k_bias, v_bias], dim=0).contiguous() weights[f"transformer_blocks.{i}.attn1.to_qkv.weight"] = qkv_weight @@ -426,7 +426,8 @@ class CogView3PlusTransformer2DModel(ModelMixin, ConfigMixin): if self.v_bias_cache is None: self.v_bias_cache = weights.pop(f"transformer_blocks.{i}.attn1.to_v.bias", None) - if self.q_weight_cache and self.k_weight_cache and self.v_weight_cache: + qk_weight_cache = self.q_weight_cache is not None and self.k_weight_cache is not None + if qk_weight_cache and self.v_weight_cache is not None: qkv_weight = torch.cat( [self.q_weight_cache, self.k_weight_cache, self.v_weight_cache], dim=0 -- Gitee From f00c0fbe2012c8577e90feced506e76f9face611 Mon Sep 17 00:00:00 2001 From: jiangmengyu Date: Thu, 26 Dec 2024 11:29:58 +0800 Subject: [PATCH 17/91] =?UTF-8?q?CogView3plus=E9=80=82=E9=85=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../cogview3plus/models/transformer_cogview3plus.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/transformer_cogview3plus.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/transformer_cogview3plus.py index 8c2086d644..fe16cafe50 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/transformer_cogview3plus.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/transformer_cogview3plus.py @@ -433,8 +433,8 @@ class CogView3PlusTransformer2DModel(ModelMixin, ConfigMixin): dim=0 ).transpose(0, 1).contiguous() qkv_bias = torch.cat([self.q_bias_cache, self.k_bias_cache, self.v_bias_cache], dim=0).contiguous() - weights[f"transformer_blocks.{i}.attn1.to_qkv.weight"] = qkv_weight - weights[f"transformer_blocks.{i}.attn1.to_qkv.bias"] = qkv_bias + weights[f"transformer_blocks.26.attn1.to_qkv.weight"] = qkv_weight + weights[f"transformer_blocks.26.attn1.to_qkv.bias"] = qkv_bias self.load_state_dict(weights, strict=False, assign=True) - return state_dict.keys() \ No newline at end of file + return weights.keys() \ No newline at end of file -- Gitee From 7258f2dcbf35958be84e39f0fefb067c248474cb Mon Sep 17 00:00:00 2001 From: jiangmengyu Date: Thu, 26 Dec 2024 14:16:10 +0800 Subject: [PATCH 18/91] =?UTF-8?q?CogView3plus=E9=80=82=E9=85=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../cogview3plus/models/activations.py | 8 -- .../models/attention_processor.py | 83 
++++--------- .../models/transformer_cogview3plus.py | 83 ------------- .../pipeline/pipeline_cogview3plus.py | 111 ------------------ .../schedulers/scheduling_ddim_cogvideox.py | 95 --------------- .../schedulers/scheduling_dpm_cogvideox.py | 2 - .../schedulers/scheduling_utils.py | 60 ---------- .../cogview3plus/vae/autoencoder_kl.py | 53 +-------- 8 files changed, 21 insertions(+), 474 deletions(-) diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/activations.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/activations.py index cb1c29919e..48fe8ed17d 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/activations.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/activations.py @@ -92,8 +92,6 @@ class GELU(nn.Module): class GEGLU(nn.Module): r""" - A [variant](https://arxiv.org/abs/2002.05202) of the gated linear unit activation function. - Parameters: dim_in (`int`): The number of channels in the input. dim_out (`int`): The number of channels in the output. @@ -125,9 +123,6 @@ class GEGLU(nn.Module): class SwiGLU(nn.Module): r""" - A [variant](https://arxiv.org/abs/2002.05202) of the gated linear unit activation function. It's similar to `GEGLU` - but uses SiLU / Swish instead of GeLU. - Parameters: dim_in (`int`): The number of channels in the input. dim_out (`int`): The number of channels in the output. @@ -148,9 +143,6 @@ class SwiGLU(nn.Module): class ApproximateGELU(nn.Module): r""" - The approximate form of the Gaussian Error Linear Unit (GELU). For more details, see section 2 of this - [paper](https://arxiv.org/abs/1606.08415). - Parameters: dim_in (`int`): The number of channels in the input. dim_out (`int`): The number of channels in the output. diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/attention_processor.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/attention_processor.py index 77863c2275..de7a2a130f 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/attention_processor.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/attention_processor.py @@ -22,67 +22,11 @@ import torch_npu from diffusers.utils import logging from diffusers.utils.torch_utils import maybe_allow_in_graph -from ..layers import QKVLinear - logger = logging.get_logger(__name__) # pylint: disable=invalid-name @maybe_allow_in_graph class Attention(nn.Module): - r""" - A cross attention layer. - - Parameters: - query_dim (`int`): - The number of channels in the query. - cross_attention_dim (`int`, *optional*): - The number of channels in the encoder_hidden_states. If not given, defaults to `query_dim`. - heads (`int`, *optional*, defaults to 8): - The number of heads to use for multi-head attention. - kv_heads (`int`, *optional*, defaults to `None`): - The number of key and value heads to use for multi-head attention. Defaults to `heads`. If - `kv_heads=heads`, the model will use Multi Head Attention (MHA), if `kv_heads=1` the model will use Multi - Query Attention (MQA) otherwise GQA is used. - dim_head (`int`, *optional*, defaults to 64): - The number of channels in each head. - dropout (`float`, *optional*, defaults to 0.0): - The dropout probability to use. - bias (`bool`, *optional*, defaults to False): - Set to `True` for the query, key, and value linear layers to contain a bias parameter. 
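
The activation docstrings trimmed above describe GEGLU and SwiGLU, the gated feed-forward variants from the GLU-variants line of work. For reference, a minimal sketch of the GEGLU computation (dimensions illustrative, independent of the patched module):

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

class TinyGEGLU(nn.Module):
    """Gated GELU: project to 2 * dim_out, gate one half with GELU of the other."""

    def __init__(self, dim_in: int, dim_out: int):
        super().__init__()
        self.proj = nn.Linear(dim_in, 2 * dim_out)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        hidden, gate = self.proj(x).chunk(2, dim=-1)
        return hidden * F.gelu(gate)

y = TinyGEGLU(64, 128)(torch.randn(2, 10, 64))
print(y.shape)  # torch.Size([2, 10, 128])
```

SwiGLU has the same shape with `F.silu` in place of `F.gelu`.
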
- upcast_attention (`bool`, *optional*, defaults to False): - Set to `True` to upcast the attention computation to `float32`. - upcast_softmax (`bool`, *optional*, defaults to False): - Set to `True` to upcast the softmax computation to `float32`. - cross_attention_norm (`str`, *optional*, defaults to `None`): - The type of normalization to use for the cross attention. Can be `None`, `layer_norm`, or `group_norm`. - cross_attention_norm_num_groups (`int`, *optional*, defaults to 32): - The number of groups to use for the group norm in the cross attention. - added_kv_proj_dim (`int`, *optional*, defaults to `None`): - The number of channels to use for the added key and value projections. If `None`, no projection is used. - norm_num_groups (`int`, *optional*, defaults to `None`): - The number of groups to use for the group norm in the attention. - spatial_norm_dim (`int`, *optional*, defaults to `None`): - The number of channels to use for the spatial normalization. - out_bias (`bool`, *optional*, defaults to `True`): - Set to `True` to use a bias in the output linear layer. - scale_qk (`bool`, *optional*, defaults to `True`): - Set to `True` to scale the query and key by `1 / sqrt(dim_head)`. - only_cross_attention (`bool`, *optional*, defaults to `False`): - Set to `True` to only use cross attention and not added_kv_proj_dim. Can only be set to `True` if - `added_kv_proj_dim` is not `None`. - eps (`float`, *optional*, defaults to 1e-5): - An additional value added to the denominator in group normalization that is used for numerical stability. - rescale_output_factor (`float`, *optional*, defaults to 1.0): - A factor to rescale the output by dividing it with this value. - residual_connection (`bool`, *optional*, defaults to `False`): - Set to `True` to add the residual connection to the output. - _from_deprecated_attn_block (`bool`, *optional*, defaults to `False`): - Set to `True` if the attention block is loaded from a deprecated state dict. - processor (`AttnProcessor`, *optional*, defaults to `None`): - The attention processor to use. If `None`, defaults to `AttnProcessor2_0` if `torch 2.x` is used and - `AttnProcessor` otherwise. - """ - def __init__( self, query_dim: int, @@ -215,7 +159,15 @@ class Attention(nn.Module): f"unknown cross_attention_norm: {cross_attention_norm}. 
Should be None, 'layer_norm' or 'group_norm'" ) - self.to_qkv = QKVLinear(self.inner_dim, query_dim, qkv_bias=bias) + self.to_q = nn.Linear(query_dim, self.inner_dim, bias=bias) + + if not self.only_cross_attention: + # only relevant for the `AddedKVProcessor` classes + self.to_k = nn.Linear(self.cross_attention_dim, self.inner_kv_dim, bias=bias) + self.to_v = nn.Linear(self.cross_attention_dim, self.inner_kv_dim, bias=bias) + else: + self.to_k = None + self.to_v = None self.added_proj_bias = added_proj_bias if self.added_kv_proj_dim is not None: @@ -414,7 +366,9 @@ class CogVideoXAttnProcessor2_0: attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size) attention_mask = attention_mask.view(batch_size, attn.heads, -1, attention_mask.shape[-1]) - query, key, value = attn.to_qkv(hidden_states) + query = attn.to_q(hidden_states) + key = attn.to_k(hidden_states) + value = attn.to_v(hidden_states) inner_dim = key.shape[-1] head_dim = inner_dim // attn.heads @@ -428,11 +382,14 @@ class CogVideoXAttnProcessor2_0: if attn.norm_k is not None: key = attn.norm_k(key) - _, N, _, D = query.shape - dim = 64 - query = F.pad(query, (0, dim - D)) - key = F.pad(key, (0, dim - D)) - value = F.pad(value, (0, dim - D)) + B, N, S, D = query.shape + dim = 48 + pad_shape = [B, N, S, D] + pad_shape[-1] = dim - pad_shape[-1] + pad = torch.zeros(pad_shape, dtype=query.dtype, device=query.device) + query = torch.cat([query, pad], dim=-1) + key = torch.cat([key, pad], dim=-1) + value = torch.cat([value, pad], dim=-1) hidden_states = torch_npu.npu_prompt_flash_attention( query, key, diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/transformer_cogview3plus.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/transformer_cogview3plus.py index fe16cafe50..f13d71880b 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/transformer_cogview3plus.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/transformer_cogview3plus.py @@ -35,8 +35,6 @@ logger = logging.get_logger(__name__) # pylint: disable=invalid-name class CogView3PlusTransformerBlock(nn.Module): r""" - Transformer block used in [CogView](https://github.com/THUDM/CogView3) model. - Args: dim (`int`): The number of channels in the input and output. @@ -129,9 +127,6 @@ class CogView3PlusTransformerBlock(nn.Module): class CogView3PlusTransformer2DModel(ModelMixin, ConfigMixin): r""" - The Transformer model introduced in [CogView3: Finer and Faster Text-to-Image Generation via Relay - Diffusion](https://huggingface.co/papers/2403.05121). - Args: patch_size (`int`, defaults to `2`): The size of the patches to use in the patch embedding layer. @@ -305,34 +300,6 @@ class CogView3PlusTransformer2DModel(ModelMixin, ConfigMixin): crop_coords: torch.Tensor, return_dict: bool = True, ) -> Union[torch.Tensor, Transformer2DModelOutput]: - """ - The [`CogView3PlusTransformer2DModel`] forward method. - - Args: - hidden_states (`torch.Tensor`): - Input `hidden_states` of shape `(batch size, channel, height, width)`. - encoder_hidden_states (`torch.Tensor`): - Conditional embeddings (embeddings computed from the input conditions such as prompts) of shape - `(batch_size, sequence_len, text_embed_dim)` - timestep (`torch.LongTensor`): - Used to indicate denoising step. 
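
The hunk above replaces `F.pad` with an explicit zero tensor plus `torch.cat`, padding the per-head dimension up to 48 before the fused attention call. A framework-agnostic sketch of that padding step (plain PyTorch, no NPU ops; the target size of 48 is simply the value used in the patch):

```python
import torch

def pad_head_dim(t: torch.Tensor, target: int) -> torch.Tensor:
    """Zero-pad the last (head) dimension of a (B, N, S, D) tensor up to `target`."""
    d = t.shape[-1]
    if d >= target:
        return t
    pad = torch.zeros(*t.shape[:-1], target - d, dtype=t.dtype, device=t.device)
    return torch.cat([t, pad], dim=-1)

q = torch.randn(1, 8, 128, 40)
print(pad_head_dim(q, 48).shape)  # torch.Size([1, 8, 128, 48])
```
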
- original_size (`torch.Tensor`): - CogView3 uses SDXL-like micro-conditioning for original image size as explained in section 2.2 of - [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). - target_size (`torch.Tensor`): - CogView3 uses SDXL-like micro-conditioning for target image size as explained in section 2.2 of - [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). - crop_coords (`torch.Tensor`): - CogView3 uses SDXL-like micro-conditioning for crop coordinates as explained in section 2.2 of - [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). - return_dict (`bool`, *optional*, defaults to `True`): - Whether or not to return a [`~models.transformer_2d.Transformer2DModelOutput`] instead of a plain - tuple. - - Returns: - `torch.Tensor` or [`~models.transformer_2d.Transformer2DModelOutput`]: - The denoised latents using provided inputs as conditioning. - """ height, width = hidden_states.shape[-2:] text_seq_length = encoder_hidden_states.shape[1] @@ -388,53 +355,3 @@ class CogView3PlusTransformer2DModel(ModelMixin, ConfigMixin): return (output,) return Transformer2DModelOutput(sample=output) - - def load_weights(self, state_dict, shard=False): - with torch.no_grad(): - if not shard: - self.load_state_dict(state_dict) - return {} - else: - weights = state_dict - - for i in range(self.num_layers): - if i != 26: - q_weight = weights.pop(f"transformer_blocks.{i}.attn1.to_q.weight", None) - q_bias = weights.pop(f"transformer_blocks.{i}.attn1.to_q.bias", None) - k_weight = weights.pop(f"transformer_blocks.{i}.attn1.to_k.weight", None) - k_bias = weights.pop(f"transformer_blocks.{i}.attn1.to_k.bias", None) - v_weight = weights.pop(f"transformer_blocks.{i}.attn1.to_v.weight", None) - v_bias = weights.pop(f"transformer_blocks.{i}.attn1.to_v.bias", None) - - # query, key, value的weight和bias权重存在同一个文件中,不会分开存储。 - if q_weight is not None and k_weight is not None and v_weight is not None: - qkv_weight = torch.cat([q_weight, k_weight, v_weight], dim=0).transpose(0, 1).contiguous() - qkv_bias = torch.cat([q_bias, k_bias, v_bias], dim=0).contiguous() - weights[f"transformer_blocks.{i}.attn1.to_qkv.weight"] = qkv_weight - weights[f"transformer_blocks.{i}.attn1.to_qkv.bias"] = qkv_bias - else: - if self.q_weight_cache is None: - self.q_weight_cache = weights.pop(f"transformer_blocks.{i}.attn1.to_q.weight", None) - if self.q_bias_cache is None: - self.q_bias_cache = weights.pop(f"transformer_blocks.{i}.attn1.to_q.bias", None) - if self.k_weight_cache is None: - self.k_weight_cache = weights.pop(f"transformer_blocks.{i}.attn1.to_k.weight", None) - if self.k_bias_cache is None: - self.k_bias_cache = weights.pop(f"transformer_blocks.{i}.attn1.to_k.bias", None) - if self.v_weight_cache is None: - self.v_weight_cache = weights.pop(f"transformer_blocks.{i}.attn1.to_v.weight", None) - if self.v_bias_cache is None: - self.v_bias_cache = weights.pop(f"transformer_blocks.{i}.attn1.to_v.bias", None) - - qk_weight_cache = self.q_weight_cache is not None and self.k_weight_cache is not None - if qk_weight_cache and self.v_weight_cache is not None: - qkv_weight = torch.cat( - [self.q_weight_cache, self.k_weight_cache, self.v_weight_cache], - dim=0 - ).transpose(0, 1).contiguous() - qkv_bias = torch.cat([self.q_bias_cache, self.k_bias_cache, self.v_bias_cache], dim=0).contiguous() - weights[f"transformer_blocks.26.attn1.to_qkv.weight"] = qkv_weight - weights[f"transformer_blocks.26.attn1.to_qkv.bias"] = qkv_bias - - 
self.load_state_dict(weights, strict=False, assign=True) - return weights.keys() \ No newline at end of file diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py index a78f82a9b2..91559134c6 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py @@ -110,28 +110,6 @@ def retrieve_timesteps( class CogView3PlusPipeline(DiffusionPipeline): - r""" - Pipeline for text-to-image generation using CogView3Plus. - - This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the - library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) - - Args: - vae ([`AutoencoderKL`]): - Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations. - text_encoder ([`T5EncoderModel`]): - Frozen text-encoder. CogView3Plus uses - [T5](https://huggingface.co/docs/transformers/model_doc/t5#transformers.T5EncoderModel); specifically the - [t5-v1_1-xxl](https://huggingface.co/PixArt-alpha/PixArt-alpha/tree/main/t5-v1_1-xxl) variant. - tokenizer (`T5Tokenizer`): - Tokenizer of class - [T5Tokenizer](https://huggingface.co/docs/transformers/model_doc/t5#transformers.T5Tokenizer). - transformer ([`CogView3PlusTransformer2DModel`]): - A text conditioned `CogView3PlusTransformer2DModel` to denoise the encoded image latents. - scheduler ([`SchedulerMixin`]): - A scheduler to be used in combination with `transformer` to denoise the encoded image latents. - """ - _optional_components = [] model_cpu_offload_seq = "text_encoder->transformer->vae" @@ -313,10 +291,6 @@ class CogView3PlusPipeline(DiffusionPipeline): # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs def prepare_extra_step_kwargs(self, generator, eta): - # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature - # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. - # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 - # and should be between [0, 1] accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) extra_step_kwargs = {} @@ -385,9 +359,6 @@ class CogView3PlusPipeline(DiffusionPipeline): def guidance_scale(self): return self._guidance_scale - # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) - # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` - # corresponds to doing no classifier free guidance. @property def do_classifier_free_guidance(self): return self._guidance_scale > 1 @@ -427,88 +398,6 @@ class CogView3PlusPipeline(DiffusionPipeline): callback_on_step_end_tensor_inputs: List[str] = ["latents"], max_sequence_length: int = 224, ) -> Union[CogView3PipelineOutput, Tuple]: - """ - Function invoked when calling the pipeline for generation. - - Args: - prompt (`str` or `List[str]`, *optional*): - The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`. - negative_prompt (`str` or `List[str]`, *optional*): - The prompt or prompts not to guide the image generation. 
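
For context on the `load_weights` helper being removed above: its core step folds the separately stored `to_q`/`to_k`/`to_v` tensors into one `to_qkv` entry. A minimal sketch of that state-dict rewrite (the key prefix is hypothetical; the transpose mirrors the patch's `QKVLinear`, which computes `x @ weight` rather than using `nn.Linear`'s `(out, in)` layout):

```python
import torch

def fuse_qkv(sd: dict, prefix: str) -> None:
    """Fold separate to_q/to_k/to_v tensors into a single to_qkv entry, in place."""
    w = [sd.pop(f"{prefix}.to_{n}.weight") for n in "qkv"]  # each (out, in)
    b = [sd.pop(f"{prefix}.to_{n}.bias") for n in "qkv"]    # each (out,)
    sd[f"{prefix}.to_qkv.weight"] = torch.cat(w, dim=0).transpose(0, 1).contiguous()
    sd[f"{prefix}.to_qkv.bias"] = torch.cat(b, dim=0).contiguous()

sd = {}
for n in "qkv":
    sd[f"blocks.0.attn1.to_{n}.weight"] = torch.randn(64, 64)
    sd[f"blocks.0.attn1.to_{n}.bias"] = torch.randn(64)
fuse_qkv(sd, "blocks.0.attn1")
print(sd["blocks.0.attn1.to_qkv.weight"].shape)  # torch.Size([64, 192])
```
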
If not defined, one has to pass - `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is - less than `1`). - height (`int`, *optional*, defaults to self.transformer.config.sample_size * self.vae_scale_factor): - The height in pixels of the generated image. If not provided, it is set to 1024. - width (`int`, *optional*, defaults to self.transformer.config.sample_size * self.vae_scale_factor): - The width in pixels of the generated image. If not provided it is set to 1024. - num_inference_steps (`int`, *optional*, defaults to `50`): - The number of denoising steps. More denoising steps usually lead to a higher quality image at the - expense of slower inference. - timesteps (`List[int]`, *optional*): - Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument - in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is - passed will be used. Must be in descending order. - guidance_scale (`float`, *optional*, defaults to `5.0`): - Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). - `guidance_scale` is defined as `w` of equation 2. of [Imagen - Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > - 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, - usually at the expense of lower image quality. - num_images_per_prompt (`int`, *optional*, defaults to `1`): - The number of images to generate per prompt. - generator (`torch.Generator` or `List[torch.Generator]`, *optional*): - One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) - to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): - Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image - generation. Can be used to tweak the same generation with different prompts. If not provided, a latents - tensor will ge generated by sampling using the supplied random `generator`. - prompt_embeds (`torch.FloatTensor`, *optional*): - Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not - provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): - Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt - weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input - argument. - original_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)): - If `original_size` is not the same as `target_size` the image will appear to be down- or upsampled. - `original_size` defaults to `(height, width)` if not specified. Part of SDXL's micro-conditioning as - explained in section 2.2 of - [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). - crops_coords_top_left (`Tuple[int]`, *optional*, defaults to (0, 0)): - `crops_coords_top_left` can be used to generate an image that appears to be "cropped" from the position - `crops_coords_top_left` downwards. Favorable, well-centered images are usually achieved by setting - `crops_coords_top_left` to (0, 0). Part of SDXL's micro-conditioning as explained in section 2.2 of - [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). 
- output_type (`str`, *optional*, defaults to `"pil"`): - The output format of the generate image. Choose between - [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. - return_dict (`bool`, *optional*, defaults to `True`): - Whether or not to return a [`~pipelines.stable_diffusion_xl.StableDiffusionXLPipelineOutput`] instead - of a plain tuple. - attention_kwargs (`dict`, *optional*): - A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under - `self.processor` in - [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py). - callback_on_step_end (`Callable`, *optional*): - A function that calls at the end of each denoising steps during the inference. The function is called - with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int, - callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by - `callback_on_step_end_tensor_inputs`. - callback_on_step_end_tensor_inputs (`List`, *optional*): - The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list - will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the - `._callback_tensor_inputs` attribute of your pipeline class. - max_sequence_length (`int`, defaults to `224`): - Maximum sequence length in encoded prompt. Can be set to other values but may lead to poorer results. - - Examples: - - Returns: - [`~pipelines.cogview3.pipeline_cogview3plus.CogView3PipelineOutput`] or `tuple`: - [`~pipelines.cogview3.pipeline_cogview3plus.CogView3PipelineOutput`] if `return_dict` is True, otherwise a - `tuple`. When returning a tuple, the first element is a list with the generated images. - """ if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)): callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/schedulers/scheduling_ddim_cogvideox.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/schedulers/scheduling_ddim_cogvideox.py index 27c31923fe..b4e22a0615 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/schedulers/scheduling_ddim_cogvideox.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/schedulers/scheduling_ddim_cogvideox.py @@ -124,55 +124,6 @@ def rescale_zero_terminal_snr(alphas_cumprod): class CogVideoXDDIMScheduler(SchedulerMixin, ConfigMixin): - """ - `DDIMScheduler` extends the denoising procedure introduced in denoising diffusion probabilistic models (DDPMs) with - non-Markovian guidance. - - This model inherits from [`SchedulerMixin`] and [`ConfigMixin`]. Check the superclass documentation for the generic - methods the library implements for all schedulers such as loading and saving. - - Args: - num_train_timesteps (`int`, defaults to 1000): - The number of diffusion steps to train the model. - beta_start (`float`, defaults to 0.0001): - The starting `beta` value of inference. - beta_end (`float`, defaults to 0.02): - The final `beta` value. - beta_schedule (`str`, defaults to `"linear"`): - The beta schedule, a mapping from a beta range to a sequence of betas for stepping the model. Choose from - `linear`, `scaled_linear`, or `squaredcos_cap_v2`. 
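
The removed docstring above lists the supported beta schedules. As a worked example of the `scaled_linear` option (the start/end values below are common illustrative defaults, not necessarily the ones shipped with this scheduler):

```python
import torch

def scaled_linear_betas(num_train_timesteps=1000, beta_start=0.00085, beta_end=0.012):
    """'scaled_linear' schedule: linear in sqrt(beta), squared back afterwards."""
    return torch.linspace(beta_start ** 0.5, beta_end ** 0.5, num_train_timesteps) ** 2

betas = scaled_linear_betas()
alphas_cumprod = torch.cumprod(1.0 - betas, dim=0)
print(betas.shape, float(alphas_cumprod[-1]))  # torch.Size([1000]) and a small terminal value
```
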
- trained_betas (`np.ndarray`, *optional*): - Pass an array of betas directly to the constructor to bypass `beta_start` and `beta_end`. - clip_sample (`bool`, defaults to `True`): - Clip the predicted sample for numerical stability. - clip_sample_range (`float`, defaults to 1.0): - The maximum magnitude for sample clipping. Valid only when `clip_sample=True`. - set_alpha_to_one (`bool`, defaults to `True`): - Each diffusion step uses the alphas product value at that step and at the previous one. For the final step - there is no previous alpha. When this option is `True` the previous alpha product is fixed to `1`, - otherwise it uses the alpha value at step 0. - steps_offset (`int`, defaults to 0): - An offset added to the inference steps, as required by some model families. - prediction_type (`str`, defaults to `epsilon`, *optional*): - Prediction type of the scheduler function; can be `epsilon` (predicts the noise of the diffusion process), - `sample` (directly predicts the noisy sample`) or `v_prediction` (see section 2.4 of [Imagen - Video](https://imagen.research.google/video/paper.pdf) paper). - thresholding (`bool`, defaults to `False`): - Whether to use the "dynamic thresholding" method. This is unsuitable for latent-space diffusion models such - as Stable Diffusion. - dynamic_thresholding_ratio (`float`, defaults to 0.995): - The ratio for the dynamic thresholding method. Valid only when `thresholding=True`. - sample_max_value (`float`, defaults to 1.0): - The threshold value for dynamic thresholding. Valid only when `thresholding=True`. - timestep_spacing (`str`, defaults to `"leading"`): - The way the timesteps should be scaled. Refer to Table 2 of the [Common Diffusion Noise Schedules and - Sample Steps are Flawed](https://huggingface.co/papers/2305.08891) for more information. - rescale_betas_zero_snr (`bool`, defaults to `False`): - Whether to rescale the betas to have zero terminal SNR. This enables the model to generate very bright and - dark samples instead of limiting it to samples with medium brightness. Loosely related to - [`--offset_noise`](https://github.com/huggingface/diffusers/blob/74fd735eb073eb1d774b1ab4154a0876eb82f055/examples/dreambooth/train_dreambooth.py#L506). - """ - _compatibles = [e.name for e in KarrasDiffusionSchedulers] order = 1 @@ -313,55 +264,11 @@ class CogVideoXDDIMScheduler(SchedulerMixin, ConfigMixin): variance_noise: Optional[torch.Tensor] = None, return_dict: bool = True, ) -> Union[DDIMSchedulerOutput, Tuple]: - """ - Predict the sample from the previous timestep by reversing the SDE. This function propagates the diffusion - process from the learned model outputs (most often the predicted noise). - - Args: - model_output (`torch.Tensor`): - The direct output from learned diffusion model. - timestep (`float`): - The current discrete timestep in the diffusion chain. - sample (`torch.Tensor`): - A current instance of a sample created by the diffusion process. - eta (`float`): - The weight of noise for added noise in diffusion step. - use_clipped_model_output (`bool`, defaults to `False`): - If `True`, computes "corrected" `model_output` from the clipped predicted original sample. Necessary - because predicted original sample is clipped to [-1, 1] when `self.config.clip_sample` is `True`. If no - clipping has happened, "corrected" `model_output` would coincide with the one provided as input and - `use_clipped_model_output` has no effect. - generator (`torch.Generator`, *optional*): - A random number generator. 
- variance_noise (`torch.Tensor`): - Alternative to generating noise with `generator` by directly providing the noise for the variance - itself. Useful for methods such as [`CycleDiffusion`]. - return_dict (`bool`, *optional*, defaults to `True`): - Whether or not to return a [`~schedulers.scheduling_ddim.DDIMSchedulerOutput`] or `tuple`. - - Returns: - [`~schedulers.scheduling_ddim.DDIMSchedulerOutput`] or `tuple`: - If return_dict is `True`, [`~schedulers.scheduling_ddim.DDIMSchedulerOutput`] is returned, otherwise a - tuple is returned where the first element is the sample tensor. - - """ if self.num_inference_steps is None: raise ValueError( "Number of inference steps is 'None', you need to run 'set_timesteps' after creating the scheduler" ) - # See formulas (12) and (16) of DDIM paper https://arxiv.org/pdf/2010.02502.pdf - # Ideally, read DDIM paper in-detail understanding - - # Notation ( -> - # - pred_noise_t -> e_theta(x_t, t) - # - pred_original_sample -> f_theta(x_t, t) or x_0 - # - std_dev_t -> sigma_t - # - eta -> η - # - pred_sample_direction -> "direction pointing to x_t" - # - pred_prev_sample -> "x_t-1" - - # 1. get previous step value (=t-1) prev_timestep = timestep - self.config.num_train_timesteps // self.num_inference_steps # 2. compute alphas, betas @@ -371,8 +278,6 @@ class CogVideoXDDIMScheduler(SchedulerMixin, ConfigMixin): beta_prod_t = 1 - alpha_prod_t # 3. compute predicted original sample from predicted noise also called - # "predicted x_0" of formula (12) from https://arxiv.org/pdf/2010.02502.pdf - # To make style tests pass, commented out `pred_epsilon` as it is an unused variable if self.config.prediction_type == "epsilon": pred_original_sample = (sample - beta_prod_t ** (0.5) * model_output) / alpha_prod_t ** (0.5) # pred_epsilon = model_output diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/schedulers/scheduling_dpm_cogvideox.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/schedulers/scheduling_dpm_cogvideox.py index 4269fff66a..6d25dea524 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/schedulers/scheduling_dpm_cogvideox.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/schedulers/scheduling_dpm_cogvideox.py @@ -95,8 +95,6 @@ def betas_for_alpha_bar( def rescale_zero_terminal_snr(alphas_cumprod): """ - Rescales betas to have zero terminal SNR Based on https://arxiv.org/pdf/2305.08891.pdf (Algorithm 1) - Args: betas (`torch.Tensor`): diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/schedulers/scheduling_utils.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/schedulers/scheduling_utils.py index b6418f89dd..7a72eb3d06 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/schedulers/scheduling_utils.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/schedulers/scheduling_utils.py @@ -72,20 +72,6 @@ class SchedulerOutput(BaseOutput): class SchedulerMixin(PushToHubMixin): - """ - Base class for all schedulers. - - [`SchedulerMixin`] contains common functions shared by all schedulers such as general loading and saving - functionalities. - - [`ConfigMixin`] takes care of storing the configuration attributes (like `num_train_timesteps`) that are passed to - the scheduler's `__init__` function, and the attributes can be accessed by `scheduler.config.num_train_timesteps`. 
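
For orientation, the computation that survives the docstring trim in `step` above is the standard deterministic DDIM update for an epsilon-prediction model (eta = 0); the CogVideoX-style scheduler layers its own handling on top, so this sketch is only the baseline formula:

```python
import torch

def ddim_step(sample, eps, alpha_prod_t, alpha_prod_t_prev):
    """One deterministic DDIM update (eta = 0) for an epsilon-prediction model."""
    pred_x0 = (sample - (1 - alpha_prod_t) ** 0.5 * eps) / alpha_prod_t ** 0.5
    direction = (1 - alpha_prod_t_prev) ** 0.5 * eps
    return alpha_prod_t_prev ** 0.5 * pred_x0 + direction

x_t = torch.randn(1, 4, 128, 128)
eps = torch.randn_like(x_t)
print(ddim_step(x_t, eps, alpha_prod_t=0.5, alpha_prod_t_prev=0.7).shape)
```
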
- - Class attributes: - - **_compatibles** (`List[str]`) -- A list of scheduler classes that are compatible with the parent scheduler - class. Use [`~ConfigMixin.from_config`] to load a different compatible scheduler class (should be overridden - by parent class). - """ config_name = SCHEDULER_CONFIG_NAME _compatibles = [] @@ -100,53 +86,7 @@ class SchedulerMixin(PushToHubMixin): return_unused_kwargs=False, **kwargs, ): - r""" - Instantiate a scheduler from a pre-defined JSON configuration file in a local directory or Hub repository. - - Parameters: - pretrained_model_name_or_path (`str` or `os.PathLike`, *optional*): - Can be either: - - - A string, the *model id* (for example `google/ddpm-celebahq-256`) of a pretrained model hosted on - the Hub. - - A path to a *directory* (for example `./my_model_directory`) containing the scheduler - configuration saved with [`~SchedulerMixin.save_pretrained`]. - subfolder (`str`, *optional*): - The subfolder location of a model file within a larger model repository on the Hub or locally. - return_unused_kwargs (`bool`, *optional*, defaults to `False`): - Whether kwargs that are not consumed by the Python class should be returned or not. - cache_dir (`Union[str, os.PathLike]`, *optional*): - Path to a directory where a downloaded pretrained model configuration is cached if the standard cache - is not used. - force_download (`bool`, *optional*, defaults to `False`): - Whether or not to force the (re-)download of the model weights and configuration files, overriding the - cached versions if they exist. - - proxies (`Dict[str, str]`, *optional*): - A dictionary of proxy servers to use by protocol or endpoint, for example, `{'http': 'foo.bar:3128', - 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request. - output_loading_info(`bool`, *optional*, defaults to `False`): - Whether or not to also return a dictionary containing missing keys, unexpected keys and error messages. - local_files_only(`bool`, *optional*, defaults to `False`): - Whether to only load local model weights and configuration files or not. If set to `True`, the model - won't be downloaded from the Hub. - token (`str` or *bool*, *optional*): - The token to use as HTTP bearer authorization for remote files. If `True`, the token generated from - `diffusers-cli login` (stored in `~/.huggingface`) is used. - revision (`str`, *optional*, defaults to `"main"`): - The specific model version to use. It can be a branch name, a tag name, a commit id, or any identifier - allowed by Git. - - - - To use private or [gated models](https://huggingface.co/docs/hub/models-gated#gated-models), log-in with - `huggingface-cli login`. You can also activate the special - ["offline-mode"](https://huggingface.co/diffusers/installation.html#offline-mode) to use this method in a - firewalled environment. 
- - - """ config, kwargs, commit_hash = cls.load_config( pretrained_model_name_or_path=pretrained_model_name_or_path, subfolder=subfolder, diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/vae/autoencoder_kl.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/vae/autoencoder_kl.py index cea74eb29f..bbe9bddf3e 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/vae/autoencoder_kl.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/vae/autoencoder_kl.py @@ -37,40 +37,7 @@ from .vae import Decoder, DecoderOutput, DiagonalGaussianDistribution, Encoder class AutoencoderKL(ModelMixin, ConfigMixin, FromOriginalModelMixin, PeftAdapterMixin): - r""" - A VAE model with KL loss for encoding images into latents and decoding latent representations into images. - - This model inherits from [`ModelMixin`]. Check the superclass documentation for it's generic methods implemented - for all models (such as downloading or saving). - - Parameters: - in_channels (int, *optional*, defaults to 3): Number of channels in the input image. - out_channels (int, *optional*, defaults to 3): Number of channels in the output. - down_block_types (`Tuple[str]`, *optional*, defaults to `("DownEncoderBlock2D",)`): - Tuple of downsample block types. - up_block_types (`Tuple[str]`, *optional*, defaults to `("UpDecoderBlock2D",)`): - Tuple of upsample block types. - block_out_channels (`Tuple[int]`, *optional*, defaults to `(64,)`): - Tuple of block output channels. - act_fn (`str`, *optional*, defaults to `"silu"`): The activation function to use. - latent_channels (`int`, *optional*, defaults to 4): Number of channels in the latent space. - sample_size (`int`, *optional*, defaults to `32`): Sample input size. - scaling_factor (`float`, *optional*, defaults to 0.18215): - The component-wise standard deviation of the trained latent space computed using the first batch of the - training set. This is used to scale the latent space to have unit variance when training the diffusion - model. The latents are scaled with the formula `z = z * scaling_factor` before being passed to the - diffusion model. When decoding, the latents are scaled back to the original scale with the formula: `z = 1 - / scaling_factor * z`. For more details, refer to sections 4.3.2 and D.1 of the [High-Resolution Image - Synthesis with Latent Diffusion Models](https://arxiv.org/abs/2112.10752) paper. - force_upcast (`bool`, *optional*, default to `True`): - If enabled it will force the VAE to run in float32 for high image resolution pipelines, such as SD-XL. VAE - can be fine-tuned / trained to a lower range without loosing too much precision in which case - `force_upcast` can be set to `False` - see: https://huggingface.co/madebyollin/sdxl-vae-fp16-fix - mid_block_add_attention (`bool`, *optional*, default to `True`): - If enabled, the mid_block of the Encoder and Decoder will have attention blocks. If set to false, the - mid_block will only have resnet blocks - """ - + _supports_gradient_checkpointing = True _no_split_modules = ["BasicTransformerBlock", "ResnetBlock2D"] @@ -388,24 +355,6 @@ class AutoencoderKL(ModelMixin, ConfigMixin, FromOriginalModelMixin, PeftAdapter return enc def tiled_encode(self, x: torch.Tensor, return_dict: bool = True) -> AutoencoderKLOutput: - r"""Encode a batch of images using a tiled encoder. - - When this option is enabled, the VAE will split the input tensor into tiles to compute encoding in several - steps. 
This is useful to keep memory use constant regardless of image size. The end result of tiled encoding is - different from non-tiled encoding because each tile uses a different encoder. To avoid tiling artifacts, the - tiles overlap and are blended together to form a smooth output. You may still see tile-sized changes in the - output, but they should be much less noticeable. - - Args: - x (`torch.Tensor`): Input batch of images. - return_dict (`bool`, *optional*, defaults to `True`): - Whether or not to return a [`~models.autoencoder_kl.AutoencoderKLOutput`] instead of a plain tuple. - - Returns: - [`~models.autoencoder_kl.AutoencoderKLOutput`] or `tuple`: - If return_dict is True, a [`~models.autoencoder_kl.AutoencoderKLOutput`] is returned, otherwise a plain - `tuple` is returned. - """ deprecation_message = ( "The tiled_encode implementation supporting the `return_dict` parameter is deprecated. In the future, the " "implementation of this method will be replaced with that of `_tiled_encode` and you will no longer be able " -- Gitee From d20c7e8e8de31ae015c0304b92c4a45ac7a51295 Mon Sep 17 00:00:00 2001 From: jiangmengyu Date: Thu, 26 Dec 2024 14:29:28 +0800 Subject: [PATCH 19/91] =?UTF-8?q?CogView3plus=E9=80=82=E9=85=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../pipeline/pipeline_cogview3plus.py | 83 ++++++++++++++++++- 1 file changed, 82 insertions(+), 1 deletion(-) diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py index 91559134c6..2e2ec1c8e8 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py @@ -291,7 +291,6 @@ class CogView3PlusPipeline(DiffusionPipeline): # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs def prepare_extra_step_kwargs(self, generator, eta): - accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) extra_step_kwargs = {} if accepts_eta: @@ -398,6 +397,88 @@ class CogView3PlusPipeline(DiffusionPipeline): callback_on_step_end_tensor_inputs: List[str] = ["latents"], max_sequence_length: int = 224, ) -> Union[CogView3PipelineOutput, Tuple]: + """ + Function invoked when calling the pipeline for generation. + + Args: + prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`. + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation. If not defined, one has to pass + `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is + less than `1`). + height (`int`, *optional*, defaults to self.transformer.config.sample_size * self.vae_scale_factor): + The height in pixels of the generated image. If not provided, it is set to 1024. + width (`int`, *optional*, defaults to self.transformer.config.sample_size * self.vae_scale_factor): + The width in pixels of the generated image. If not provided it is set to 1024. + num_inference_steps (`int`, *optional*, defaults to `50`): + The number of denoising steps. More denoising steps usually lead to a higher quality image at the + expense of slower inference. 
+ timesteps (`List[int]`, *optional*): + Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument + in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is + passed will be used. Must be in descending order. + guidance_scale (`float`, *optional*, defaults to `5.0`): + Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). + `guidance_scale` is defined as `w` of equation 2. of [Imagen + Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > + 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, + usually at the expense of lower image quality. + num_images_per_prompt (`int`, *optional*, defaults to `1`): + The number of images to generate per prompt. + generator (`torch.Generator` or `List[torch.Generator]`, *optional*): + One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) + to make generation deterministic. + latents (`torch.FloatTensor`, *optional*): + Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image + generation. Can be used to tweak the same generation with different prompts. If not provided, a latents + tensor will ge generated by sampling using the supplied random `generator`. + prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not + provided, text embeddings will be generated from `prompt` input argument. + negative_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input + argument. + original_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)): + If `original_size` is not the same as `target_size` the image will appear to be down- or upsampled. + `original_size` defaults to `(height, width)` if not specified. Part of SDXL's micro-conditioning as + explained in section 2.2 of + [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). + crops_coords_top_left (`Tuple[int]`, *optional*, defaults to (0, 0)): + `crops_coords_top_left` can be used to generate an image that appears to be "cropped" from the position + `crops_coords_top_left` downwards. Favorable, well-centered images are usually achieved by setting + `crops_coords_top_left` to (0, 0). Part of SDXL's micro-conditioning as explained in section 2.2 of + [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). + output_type (`str`, *optional*, defaults to `"pil"`): + The output format of the generate image. Choose between + [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~pipelines.stable_diffusion_xl.StableDiffusionXLPipelineOutput`] instead + of a plain tuple. + attention_kwargs (`dict`, *optional*): + A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under + `self.processor` in + [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py). 
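As a reminder of what `guidance_scale` does numerically, the usual classifier-free guidance combination looks like the sketch below; the shapes are toy values and this is the standard formulation, not a quote of this pipeline's denoising loop:

```python
import torch

# Toy stand-ins for the unconditional / text-conditioned noise predictions that the
# transformer returns for the duplicated (negative prompt + prompt) batch.
noise_pred_uncond = torch.randn(1, 16, 128, 128)
noise_pred_text = torch.randn(1, 16, 128, 128)
guidance_scale = 5.0  # w in the Imagen formulation; w > 1 strengthens prompt adherence

noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
```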
+ callback_on_step_end (`Callable`, *optional*): + A function that calls at the end of each denoising steps during the inference. The function is called + with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int, + callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by + `callback_on_step_end_tensor_inputs`. + callback_on_step_end_tensor_inputs (`List`, *optional*): + The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list + will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the + `._callback_tensor_inputs` attribute of your pipeline class. + max_sequence_length (`int`, defaults to `224`): + Maximum sequence length in encoded prompt. Can be set to other values but may lead to poorer results. + + Examples: + + Returns: + [`~pipelines.cogview3.pipeline_cogview3plus.CogView3PipelineOutput`] or `tuple`: + [`~pipelines.cogview3.pipeline_cogview3plus.CogView3PipelineOutput`] if `return_dict` is True, otherwise a + `tuple`. When returning a tuple, the first element is a list with the generated images. + """ if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)): callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs -- Gitee From 10d485a9e886f23782b80984463b84a4ae0abcfa Mon Sep 17 00:00:00 2001 From: jiangmengyu Date: Thu, 26 Dec 2024 15:06:50 +0800 Subject: [PATCH 20/91] =?UTF-8?q?CogView3plus=E9=80=82=E9=85=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../cogview3/inference_cogview3plus.py | 2 +- .../foundation/cogview3/requirents.txt | 23 +++++++------------ 2 files changed, 9 insertions(+), 16 deletions(-) diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/inference_cogview3plus.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/inference_cogview3plus.py index 74dd914294..e2b39e04aa 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/inference_cogview3plus.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/inference_cogview3plus.py @@ -51,7 +51,7 @@ def parse_arguments(): parser.add_argument("--height", type=int, default=1024, help="Height of the generated image.") parser.add_argument("--output_path", type=str, default="cogview3.png", help="Path to save the generated image.") parser.add_argument("--dtype", type=str, default="bf16", help="bf16 or fp16") - parser.add_argument("--device_id", type=int, default=1, help="NPU device id") + parser.add_argument("--device_id", type=int, default=6, help="NPU device id") return parser.parse_args() diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/requirents.txt b/MindIE/MindIE-Torch/built-in/foundation/cogview3/requirents.txt index ac2fa2a7f6..1600434700 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/requirents.txt +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/requirents.txt @@ -1,15 +1,8 @@ -accelerate==0.29.3 -deepspeed==0.15.4 -einops==0.7.0 -gradio==3.50.2 -huggingface-hub==0.24.7 -Jinja2==3.1.4 -numpy==1.26.4 -peft==0.10.0 -safetensors==0.4.5 -timm==0.9.5 -tokenizers==0.15.2 -torch==2.1.0 -torchvision==0.14.1 -tqdm==4.66.5 -transformers==4.39.3 \ No newline at end of file +deepspeed==0.16.1 +transformers==4.47.1 +gradio==5.9.1 +accelerate==1.0.1 +diffusers==0.31.0 +sentencepiece==0.2.0 +torch==2.4.0 +openai==1.58.1 \ No newline at end of file -- Gitee From a5ef08b4dddd8654ff1c3ee2054684be32397506 Mon Sep 17 00:00:00 
2001 From: jiangmengyu Date: Thu, 26 Dec 2024 16:07:08 +0800 Subject: [PATCH 21/91] =?UTF-8?q?CogView3plus=E9=80=82=E9=85=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../built-in/foundation/cogview3/README.md | 170 +++++------------- .../cogview3/inference_cogview3plus.py | 10 +- 2 files changed, 50 insertions(+), 130 deletions(-) diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/README.md b/MindIE/MindIE-Torch/built-in/foundation/cogview3/README.md index 028c765d30..2ad3b64c91 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/README.md +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/README.md @@ -5,7 +5,7 @@ | 配套 | 版本 | 环境准备指导 | | ----- | ----- |-----| | Python | 3.10.2 | - | - | torch | 2.1.0 | - | + | torch | 2.4.0 | - | ### 1.1 获取CANN&MindIE安装包&环境准备 - [800I A2](https://www.hiascend.com/developer/download/community/result?module=pt+ie+cann&product=4&model=32) @@ -46,7 +46,7 @@ cd ${AieInstallPath}/mindie && source set_env.sh ``` ### 1.4 Torch_npu安装 -安装pytorch框架 版本2.1.0 +安装pytorch框架 版本2.4.0 [安装包下载](https://download.pytorch.org/whl/cpu/torch/) 使用pip安装 @@ -67,161 +67,73 @@ pip install torch_npu-{pytorchversion}.xxxx.{arch}.whl git clone https://gitee.com/ascend/ModelZoo-PyTorch.git ``` -## 三、HunyuanDiT使用 +## 三、CogView3使用 ### 3.1 权重及配置文件说明 -1. text_encoder权重链接: +1. CogView3权重路径: ```shell -https://huggingface.co/Tencent-Hunyuan/HunyuanDiT-v1.2-Diffusers/tree/main/text_encoder +https://huggingface.co/THUDM/CogView3-Plus-3B/tree/main ``` -2. text_encoder_2权重链接: -```shell -https://huggingface.co/Tencent-Hunyuan/HunyuanDiT-v1.2-Diffusers/tree/main/text_encoder_2 -``` -3. tokenizer权重链接: -```shell -https://huggingface.co/Tencent-Hunyuan/HunyuanDiT-v1.2-Diffusers/tree/main/tokenizer -``` -4. tokenizer_2权重链接: -```shell -https://huggingface.co/Tencent-Hunyuan/HunyuanDiT-v1.2-Diffusers/tree/main/tokenizer_2 -``` -5. transformer权重链接: -```shell -https://huggingface.co/Tencent-Hunyuan/HunyuanDiT-v1.2/tree/main/t2i/model -``` -- 修改该权重的config.json +- 修改该权重的model_index.json ```shell { - "architectures": [ - "HunyuanDiT2DModel" + "_class_name": "CogView3PlusPipeline", + "_diffusers_version": "0.31.0.dev0", + "scheduler": [ + "cogview3plus", + "CogVideoXDDIMScheduler" + ], + "text_encoder": [ + "transformers", + "T5EncoderModel" ], - "input_size": [ - null, - null + "tokenizer": [ + "transformers", + "T5Tokenizer" ], - "patch_size": 2, - "in_channels": 4, - "hidden_size": 1408, - "depth": 40, - "num_heads": 16, - "mlp_ratio": 4.3637, - "text_states_dim": 1024, - "text_states_dim_t5": 2048, - "text_len": 77, - "text_len_t5": 256 + "transformer": [ + "cogview3plus", + "CogView3PlusTransformer2DModel" + ], + "vae": [ + "cogview3plus", + "AutoencoderKL" + ] } ``` -6. vae权重链接: +2. scheduler权重链接: ```shell -https://huggingface.co/Tencent-Hunyuan/HunyuanDiT-v1.2-Diffusers/tree/main/vae +https://huggingface.co/THUDM/CogView3-Plus-3B/tree/main/scheduler ``` -- 修改该权重的config.json +3. 
text_encoder权重链接: ```shell -{ - "architectures": [ - "AutoencoderKL" - ], - "in_channels": 3, - "out_channels": 3, - "down_block_types": [ - "DownEncoderBlock2D", - "DownEncoderBlock2D", - "DownEncoderBlock2D", - "DownEncoderBlock2D" - ], - "up_block_types": [ - "UpDecoderBlock2D", - "UpDecoderBlock2D", - "UpDecoderBlock2D", - "UpDecoderBlock2D" - ], - "block_out_channels": [ - 128, - 256, - 512, - 512 - ], - "layers_per_block": 2, - "act_fn": "silu", - "latent_channels": 4, - "norm_num_groups": 32, - "sample_size": 512, - "scaling_factor": 0.13025, - "shift_factor": null, - "latents_mean": null, - "latents_std": null, - "force_upcast": false, - "use_quant_conv": true, - "use_post_quant_conv": true -} +https://huggingface.co/THUDM/CogView3-Plus-3B/tree/main/text_encoder ``` -7. scheduler: -- 新增scheduler_config.json配置文件, 内容如下所示: +4. tokenizer权重链接: ```shell -{ - "_class_name": "DDPMScheduler", - "_mindiesd_version": "1.0.0", - "steps_offset": 1, - "beta_start": 0.00085, - "beta_end": 0.02, - "num_train_timesteps": 1000 -} +https://huggingface.co/THUDM/CogView3-Plus-3B/tree/main/tokenizer ``` -8. 新增model_index.json -将以上步骤下载的权重放在同一目录下, 并新增model_index.json文件, 该文件内容如下所示 +5. transformer权重链接: ```shell -{ - "_class_name": "HunyuanDiTPipeline", - "_mindiesd_version": "1.0.RC3", - "scheduler": [ - "mindiesd", - "DDPMScheduler" - ], - "text_encoder": [ - "transformers", - "BertModel" - ], - "text_encoder_2": [ - "transformers", - "T5EncoderModel" - ], - "tokenizer": [ - "transformers", - "BertTokenizer" - ], - "tokenizer_2": [ - "transformers", - "T5Tokenizer" - ], - "transformer": [ - "mindiesd", - "HunyuanDiT2DModel" - ], - "vae": [ - "mindiesd", - "AutoencoderKL" - ] -} +https://huggingface.co/THUDM/CogView3-Plus-3B/tree/main/transformer +``` +6. vae权重链接: +```shell +https://huggingface.co/THUDM/CogView3-Plus-3B/tree/main/vae ``` -9. 各模型的配置文件、权重文件的层级样例如下所示。 +7. 各模型的配置文件、权重文件的层级样例如下所示。 ```commandline -|----hunyuandit +|----CogView3B +| |---- configuration.json | |---- model_index.json | |---- scheduler | | |---- scheduler_config.json | |---- text_encoder | | |---- config.json | | |---- 模型权重 -| |---- text_encoder_2 -| | |---- config.json -| | |---- 模型权重 | |---- tokenizer | | |---- config.json | | |---- 模型权重 -| |---- tokenizer_2 -| | |---- config.json -| | |---- 模型权重 | |---- transformer | | |---- config.json | | |---- 模型权重 diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/inference_cogview3plus.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/inference_cogview3plus.py index e2b39e04aa..ea9c77744e 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/inference_cogview3plus.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/inference_cogview3plus.py @@ -33,7 +33,15 @@ def parse_arguments(): "--prompt", type=list, default=[ - "A vibrant cherry red sports car sits proudly under the gleaming sun, its polished exterior smooth and flawless, casting a mirror-like reflection. The car features a low, aerodynamic body, angular headlights that gaze forward like predatory eyes, and a set of black, high-gloss racing rims that contrast starkly with the red. A subtle hint of chrome embellishes the grille and exhaust, while the tinted windows suggest a luxurious and private interior. The scene conveys a sense of speed and elegance, the car appearing as if it's about to burst into a sprint along a coastal road, with the ocean's azure waves crashing in the background." 
+ "A vibrant cherry red sports car sits proudly under the gleaming sun, \ + its polished exterior smooth and flawless, casting a mirror-like reflection. \ + The car features a low, aerodynamic body, angular headlights that gaze forward like predatory eyes, \ + and a set of black, high-gloss racing rims that contrast starkly with the red. \ + A subtle hint of chrome embellishes the grille and exhaust, \ + while the tinted windows suggest a luxurious and private interior. \ + he scene conveys a sense of speed and elegance, \ + the car appearing as if it's about to burst into a sprint along a coastal road, \ + with the ocean's azure waves crashing in the background." ], help="The text description for generating the image." ) -- Gitee From 6eab401dc3c783504098658202368ca1994cc385 Mon Sep 17 00:00:00 2001 From: jiangmengyu Date: Thu, 26 Dec 2024 16:31:08 +0800 Subject: [PATCH 22/91] =?UTF-8?q?CogView3plus=E9=80=82=E9=85=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../built-in/foundation/cogview3/README.md | 84 +++---------------- 1 file changed, 12 insertions(+), 72 deletions(-) diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/README.md b/MindIE/MindIE-Torch/built-in/foundation/cogview3/README.md index 2ad3b64c91..d5a9e274e6 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/README.md +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/README.md @@ -145,82 +145,22 @@ https://huggingface.co/THUDM/CogView3-Plus-3B/tree/main/vae ### 3.2 单卡单prompt功能测试 设置权重路径 ```shell -path = 'ckpts/hydit' +model_path = '/data/CogView3B' ``` 执行命令: ```shell -python inference_hydit.py \ - --path ${path} \ +python inference_cogview3plus.py \ + --model_path ${model_path} \ --device_id 0 \ - --prompt "青花瓷风格,一只小狗" \ - --input_size (1024, 1024) \ - --seed 42 \ - --infer_steps 25 + --width 1024 \ + --height 1024 \ + --num_inference_steps 50 \ + --dtype bf16 ``` 参数说明: -- path:权重路径,包含scheduler、text_encoder、text_encoder_2、tokenizer、 tokenizer_2、transformer、vae,七个模型的配置文件及权重。 +- model_path:权重路径,包含scheduler、text_encoder、tokenizer、transformer、vae,5个模型的配置文件及权重。 - device_id:推理设备ID。 -- prompt:用于图像生成的文字描述提示。 -- input_size:需要生成的图像尺寸。 -- seed:设置随机种子,默认值为42。 -- infer_steps:推理迭代步数。 - -### 3.3 单卡多prompts进行性能/精度测试 -设置权重路径 -```shell -path = 'ckpts/hydit' -``` -执行命令: -```shell -python inference_hydit.py \ - --path ${path} \ - --device_id 0 \ - --test_acc \ - --prompt_list "prompts/example_prompts.txt" \ - --input_size (1024, 1024) \ - --seed 42 \ - --infer_steps 25 -``` -参数说明: -- path:权重路径,包含scheduler、text_encoder、text_encoder_2、tokenizer、 tokenizer_2、transformer、vae,七个模型的配置文件及权重。 -- device_id:推理设备ID。 -- test_acc:使用 --test_acc 开启全量图像生成,用于性能/精度测试。单prompt功能测试时,不开启该参数。 -- prompt_list:用于图像生成的文字描述提示的列表文件路径。 -- input_size:需要生成的图像尺寸。 -- seed:设置随机种子,默认值为42。 -- infer_steps:推理迭代步数。 - -### 3.4 用LoRA进行测试 -设置权重路径 -```shell -path = 'ckpts/hydit' -``` -LoRA权重链接: -```shell -https://huggingface.co/Tencent-Hunyuan/HYDiT-LoRA/tree/main -``` -设置LoRA权重路径 -```shell -lora_path = 'ckpts/lora' -``` -执行命令: -```shell -python inference_hydit.py \ - --path ${path} \ - --device_id 0 \ - --prompt "青花瓷风格,一只小狗" \ - --input_size (1024, 1024) \ - --seed 42 \ - --infer_steps 25 - --use_lora \ - --lora_ckpt ${lora_path} -``` -参数说明: -- path:权重路径,包含scheduler、text_encoder、text_encoder_2、tokenizer、 tokenizer_2、transformer、vae,七个模型的配置文件及权重。 -- device_id:推理设备ID。 -- prompt:用于图像生成的文字描述提示。 -- input_size:需要生成的图像尺寸。 -- seed:设置随机种子,默认值为42。 -- infer_steps:推理迭代步数。 -- use_lora:使用 --use_lora 开启LoRA风格化切换。 -- 
lora_ckpt:LoRA权重路径。 \ No newline at end of file +- width:需要生成的图像的宽。 +- height: 需要生成的图像的高。 +- num_inference_steps:推理迭代步数。 +- dtype: 数据类型。目前只支持bf16。 -- Gitee From 00d969b6194fbf27a1fb472a7c2832e26280d82f Mon Sep 17 00:00:00 2001 From: jiangmengyu Date: Thu, 26 Dec 2024 16:33:00 +0800 Subject: [PATCH 23/91] =?UTF-8?q?CogView3plus=E9=80=82=E9=85=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- MindIE/MindIE-Torch/built-in/foundation/cogview3/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/README.md b/MindIE/MindIE-Torch/built-in/foundation/cogview3/README.md index d5a9e274e6..dfcf259a0b 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/README.md +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/README.md @@ -4,7 +4,7 @@ | 配套 | 版本 | 环境准备指导 | | ----- | ----- |-----| - | Python | 3.10.2 | - | + | Python | 3.10.12 | - | | torch | 2.4.0 | - | ### 1.1 获取CANN&MindIE安装包&环境准备 -- Gitee From 09c395f8857fc2288200b1d9d60dd64082370b0d Mon Sep 17 00:00:00 2001 From: jiangmengyu Date: Thu, 26 Dec 2024 17:22:46 +0800 Subject: [PATCH 24/91] =?UTF-8?q?cogview3=E9=80=82=E9=85=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../cogview3plus/layers/embeddings.py | 35 ++------------- .../cogview3plus/layers/normalization.py | 5 +-- .../cogview3/inference_cogview3plus.py | 44 +++++++------------ 3 files changed, 20 insertions(+), 64 deletions(-) diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/layers/embeddings.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/layers/embeddings.py index 445ad8245a..29896c0814 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/layers/embeddings.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/layers/embeddings.py @@ -17,7 +17,6 @@ from typing import Optional import numpy as np import torch from torch import nn - from diffusers.utils import deprecate from diffusers.models.activations import FP32SiLU, get_activation @@ -27,30 +26,8 @@ def get_timestep_embedding( embedding_dim: int, flip_sin_to_cos: bool = False, downscale_freq_shift: float = 1, - scale: float = 1, max_period: int = 10000, ): - """ - This matches the implementation in Denoising Diffusion Probabilistic Models: Create sinusoidal timestep embeddings. - - Args - timesteps (torch.Tensor): - a 1-D Tensor of N indices, one per batch element. These may be fractional. - embedding_dim (int): - the dimension of the output. - flip_sin_to_cos (bool): - Whether the embedding order should be `cos, sin` (if True) or `sin, cos` (if False) - downscale_freq_shift (float): - Controls the delta between frequencies between dimensions - scale (float): - Scaling factor applied to the embeddings. - max_period (int): - Controls the maximum frequency of the embeddings - Returns - torch.Tensor: an [N x dim] Tensor of positional embeddings. 
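The computation kept after this cleanup is the standard sinusoidal timestep embedding. A self-contained sketch of the same math, ignoring the `flip_sin_to_cos` and `downscale_freq_shift` options handled by the real function:

```python
import math
import torch

def sinusoidal_timestep_embedding(timesteps: torch.Tensor, dim: int, max_period: int = 10000) -> torch.Tensor:
    # Half of the channels carry sin, the other half cos, at geometrically spaced frequencies.
    half = dim // 2
    freqs = torch.exp(-math.log(max_period) * torch.arange(half, dtype=torch.float32) / half)
    args = timesteps[:, None].float() * freqs[None, :]
    return torch.cat([torch.sin(args), torch.cos(args)], dim=-1)  # (N, dim) for even dim

print(sinusoidal_timestep_embedding(torch.tensor([0, 10, 999]), dim=256).shape)  # torch.Size([3, 256])
```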
- """ - assert len(timesteps.shape) == 1, "Timesteps should be a 1d-array" - half_dim = embedding_dim // 2 exponent = -math.log(max_period) * torch.arange( start=0, end=half_dim, dtype=torch.float32, device=timesteps.device @@ -60,9 +37,6 @@ def get_timestep_embedding( emb = torch.exp(exponent) emb = timesteps[:, None].float() * emb[None, :] - # scale embeddings - emb = scale * emb - # concat sine and cosine embeddings emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=-1) @@ -83,7 +57,6 @@ def get_2d_sincos_pos_embed( extra_tokens=0, interpolation_scale=1.0, base_size=16, - device: Optional[torch.device] = None, output_type: str = "np", ): """ @@ -125,12 +98,12 @@ def get_2d_sincos_pos_embed( grid_size = (grid_size, grid_size) grid_h = ( - torch.arange(grid_size[0], device=device, dtype=torch.float32) + torch.arange(grid_size[0], dtype=torch.float32) / (grid_size[0] / base_size) / interpolation_scale ) grid_w = ( - torch.arange(grid_size[1], device=device, dtype=torch.float32) + torch.arange(grid_size[1], dtype=torch.float32) / (grid_size[1] / base_size) / interpolation_scale ) @@ -302,12 +275,11 @@ def get_1d_sincos_pos_embed_from_grid_np(embed_dim, pos): class Timesteps(nn.Module): - def __init__(self, num_channels: int, flip_sin_to_cos: bool, downscale_freq_shift: float, scale: int = 1): + def __init__(self, num_channels: int, flip_sin_to_cos: bool, downscale_freq_shift: float): super().__init__() self.num_channels = num_channels self.flip_sin_to_cos = flip_sin_to_cos self.downscale_freq_shift = downscale_freq_shift - self.scale = scale def forward(self, timesteps): t_emb = get_timestep_embedding( @@ -315,7 +287,6 @@ class Timesteps(nn.Module): self.num_channels, flip_sin_to_cos=self.flip_sin_to_cos, downscale_freq_shift=self.downscale_freq_shift, - scale=self.scale, ) return t_emb diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/layers/normalization.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/layers/normalization.py index 3dd2bba76c..64dbbe058a 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/layers/normalization.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/layers/normalization.py @@ -27,7 +27,6 @@ if is_torch_version(">=", "2.1.0"): LayerNorm = nn.LayerNorm else: # Has optional bias parameter compared to torch layer norm - # TODO: replace with torch layernorm once min required torch version >= 2.1 class LayerNorm(nn.Module): def __init__(self, dim, eps: float = 1e-5, elementwise_affine: bool = True, bias: bool = True): super().__init__() @@ -46,8 +45,8 @@ else: self.weight = None self.bias = None - def forward(self, input): - return F.layer_norm(input, self.dim, self.weight, self.bias, self.eps) + def forward(self, x): + return F.layer_norm(x, self.dim, self.weight, self.bias, self.eps) class RMSNorm(nn.Module): diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/inference_cogview3plus.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/inference_cogview3plus.py index ea9c77744e..5c12695d6c 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/inference_cogview3plus.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/inference_cogview3plus.py @@ -16,9 +16,10 @@ import argparse import logging -import torch import time +import torch + from cogview3plus import CogView3PlusPipeline logging.basicConfig(level=logging.INFO) @@ -64,11 +65,12 @@ def parse_arguments(): return parser.parse_args() -def generate_image( - prompt, model_path, guidance_scale, 
num_images_per_prompt, num_inference_steps, width, height, output_path, dtype -): +def infer(args): + torch.npu.set_device(args.device_id) + dtype = torch.bfloat16 if args.dtype == "bf16" else torch.float16 + # Load the pre-trained model with the specified precision - pipe = CogView3PlusPipeline.from_pretrained(model_path, torch_dtype=dtype).to("npu") + pipe = CogView3PlusPipeline.from_pretrained(args.model_path, torch_dtype=dtype).to("npu") use_time = 0 loops = 5 @@ -76,12 +78,12 @@ def generate_image( start_time = time.time() # Generate the image based on the prompt image = pipe( - prompt=prompt, - guidance_scale=guidance_scale, - num_images_per_prompt=num_images_per_prompt, - num_inference_steps=num_inference_steps, - width=width, - height=height, + prompt=args.prompt[0], + guidance_scale=args.guidance_scale, + num_images_per_prompt=args.num_images_per_prompt, + num_inference_steps=args.num_inference_steps, + width=args.width, + height=args.height, ).images[0] if i >= 2: @@ -93,25 +95,9 @@ def generate_image( logger.info("use_time is %.3f)", use_time / 3) # Save the generated image to the local file system - image.save(output_path) + image.save(args.output_path) - print(f"Image saved to {output_path}") - - -def infer(args): - torch.npu.set_device(args.device_id) - dtype = torch.bfloat16 if args.dtype == "bf16" else torch.float16 - generate_image( - prompt=args.prompt[0], - model_path=args.model_path, - guidance_scale=args.guidance_scale, - num_images_per_prompt=args.num_images_per_prompt, - num_inference_steps=args.num_inference_steps, - width=args.width, - height=args.height, - output_path=args.output_path, - dtype=dtype, - ) + print(f"Image saved to {args.output_path}") if __name__ == "__main__": -- Gitee From 64616aa9f302369d84d435ec5b059e23a24dfb55 Mon Sep 17 00:00:00 2001 From: jiangmengyu Date: Thu, 26 Dec 2024 17:27:03 +0800 Subject: [PATCH 25/91] =?UTF-8?q?cogview3=E9=80=82=E9=85=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../cogview3plus/layers/embeddings.py | 159 +----------------- 1 file changed, 6 insertions(+), 153 deletions(-) diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/layers/embeddings.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/layers/embeddings.py index 29896c0814..dc1c683c63 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/layers/embeddings.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/layers/embeddings.py @@ -53,47 +53,9 @@ def get_timestep_embedding( def get_2d_sincos_pos_embed( embed_dim, grid_size, - cls_token=False, - extra_tokens=0, interpolation_scale=1.0, base_size=16, - output_type: str = "np", ): - """ - Creates 2D sinusoidal positional embeddings. - - Args: - embed_dim (`int`): - The embedding dimension. - grid_size (`int`): - The size of the grid height and width. - cls_token (`bool`, defaults to `False`): - Whether or not to add a classification token. - extra_tokens (`int`, defaults to `0`): - The number of extra tokens to add. - interpolation_scale (`float`, defaults to `1.0`): - The scale of the interpolation. - - Returns: - pos_embed (`torch.Tensor`): - Shape is either `[grid_size * grid_size, embed_dim]` if not using cls_token, or `[1 + grid_size*grid_size, - embed_dim]` if using cls_token - """ - if output_type == "np": - deprecation_message = ( - "`get_2d_sincos_pos_embed` uses `torch` and supports `device`." - " `from_numpy` is no longer required." 
- " Pass `output_type='pt' to use the new version now." - ) - deprecate("output_type=='np'", "0.33.0", deprecation_message, standard_warn=False) - return get_2d_sincos_pos_embed_np( - embed_dim=embed_dim, - grid_size=grid_size, - cls_token=cls_token, - extra_tokens=extra_tokens, - interpolation_scale=interpolation_scale, - base_size=base_size, - ) if isinstance(grid_size, int): grid_size = (grid_size, grid_size) @@ -111,13 +73,11 @@ def get_2d_sincos_pos_embed( grid = torch.stack(grid, dim=0) grid = grid.reshape([2, 1, grid_size[1], grid_size[0]]) - pos_embed = get_2d_sincos_pos_embed_from_grid(embed_dim, grid, output_type=output_type) - if cls_token and extra_tokens > 0: - pos_embed = torch.concat([torch.zeros([extra_tokens, embed_dim]), pos_embed], dim=0) + pos_embed = get_2d_sincos_pos_embed_from_grid(embed_dim, grid) return pos_embed -def get_2d_sincos_pos_embed_from_grid(embed_dim, grid, output_type="np"): +def get_2d_sincos_pos_embed_from_grid(embed_dim, grid): r""" This function generates 2D sinusoidal positional embeddings from a grid. @@ -128,29 +88,18 @@ def get_2d_sincos_pos_embed_from_grid(embed_dim, grid, output_type="np"): Returns: `torch.Tensor`: The 2D sinusoidal positional embeddings with shape `(H * W, embed_dim)` """ - if output_type == "np": - deprecation_message = ( - "`get_2d_sincos_pos_embed_from_grid` uses `torch` and supports `device`." - " `from_numpy` is no longer required." - " Pass `output_type='pt' to use the new version now." - ) - deprecate("output_type=='np'", "0.33.0", deprecation_message, standard_warn=False) - return get_2d_sincos_pos_embed_from_grid_np( - embed_dim=embed_dim, - grid=grid, - ) if embed_dim % 2 != 0: raise ValueError("embed_dim must be divisible by 2") # use half of dimensions to encode grid_h - emb_h = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[0], output_type=output_type) # (H*W, D/2) - emb_w = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[1], output_type=output_type) # (H*W, D/2) + emb_h = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[0]) # (H*W, D/2) + emb_w = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[1]) # (H*W, D/2) emb = torch.concat([emb_h, emb_w], dim=1) # (H*W, D) return emb -def get_1d_sincos_pos_embed_from_grid(embed_dim, pos, output_type="np"): +def get_1d_sincos_pos_embed_from_grid(embed_dim, pos): """ This function generates 1D positional embeddings from a grid. @@ -161,14 +110,6 @@ def get_1d_sincos_pos_embed_from_grid(embed_dim, pos, output_type="np"): Returns: `torch.Tensor`: Sinusoidal positional embeddings of shape `(M, D)`. """ - if output_type == "np": - deprecation_message = ( - "`get_1d_sincos_pos_embed_from_grid` uses `torch` and supports `device`." - " `from_numpy` is no longer required." - " Pass `output_type='pt' to use the new version now." - ) - deprecate("output_type=='np'", "0.33.0", deprecation_message, standard_warn=False) - return get_1d_sincos_pos_embed_from_grid_np(embed_dim=embed_dim, pos=pos) if embed_dim % 2 != 0: raise ValueError("embed_dim must be divisible by 2") @@ -186,94 +127,6 @@ def get_1d_sincos_pos_embed_from_grid(embed_dim, pos, output_type="np"): return emb -def get_2d_sincos_pos_embed_np( - embed_dim, grid_size, cls_token=False, extra_tokens=0, interpolation_scale=1.0, base_size=16 -): - """ - Creates 2D sinusoidal positional embeddings. - - Args: - embed_dim (`int`): - The embedding dimension. - grid_size (`int`): - The size of the grid height and width. - cls_token (`bool`, defaults to `False`): - Whether or not to add a classification token. 
- extra_tokens (`int`, defaults to `0`): - The number of extra tokens to add. - interpolation_scale (`float`, defaults to `1.0`): - The scale of the interpolation. - - Returns: - pos_embed (`np.ndarray`): - Shape is either `[grid_size * grid_size, embed_dim]` if not using cls_token, or `[1 + grid_size*grid_size, - embed_dim]` if using cls_token - """ - if isinstance(grid_size, int): - grid_size = (grid_size, grid_size) - - grid_h = np.arange(grid_size[0], dtype=np.float32) / (grid_size[0] / base_size) / interpolation_scale - grid_w = np.arange(grid_size[1], dtype=np.float32) / (grid_size[1] / base_size) / interpolation_scale - grid = np.meshgrid(grid_w, grid_h) # here w goes first - grid = np.stack(grid, axis=0) - - grid = grid.reshape([2, 1, grid_size[1], grid_size[0]]) - pos_embed = get_2d_sincos_pos_embed_from_grid_np(embed_dim, grid) - if cls_token and extra_tokens > 0: - pos_embed = np.concatenate([np.zeros([extra_tokens, embed_dim]), pos_embed], axis=0) - return pos_embed - - -def get_2d_sincos_pos_embed_from_grid_np(embed_dim, grid): - r""" - This function generates 2D sinusoidal positional embeddings from a grid. - - Args: - embed_dim (`int`): The embedding dimension. - grid (`np.ndarray`): Grid of positions with shape `(H * W,)`. - - Returns: - `np.ndarray`: The 2D sinusoidal positional embeddings with shape `(H * W, embed_dim)` - """ - if embed_dim % 2 != 0: - raise ValueError("embed_dim must be divisible by 2") - - # use half of dimensions to encode grid_h - emb_h = get_1d_sincos_pos_embed_from_grid_np(embed_dim // 2, grid[0]) # (H*W, D/2) - emb_w = get_1d_sincos_pos_embed_from_grid_np(embed_dim // 2, grid[1]) # (H*W, D/2) - - emb = np.concatenate([emb_h, emb_w], axis=1) # (H*W, D) - return emb - - -def get_1d_sincos_pos_embed_from_grid_np(embed_dim, pos): - """ - This function generates 1D positional embeddings from a grid. - - Args: - embed_dim (`int`): The embedding dimension `D` - pos (`numpy.ndarray`): 1D tensor of positions with shape `(M,)` - - Returns: - `numpy.ndarray`: Sinusoidal positional embeddings of shape `(M, D)`. 
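For the torch code path that remains, the 2D embedding is just two 1D sinusoidal embeddings, one over row indices and one over column indices, concatenated per patch position. A compact stand-alone sketch under that reading, without this module's grid and interpolation handling:

```python
import torch

def sincos_1d(embed_dim: int, pos: torch.Tensor) -> torch.Tensor:
    # pos: (M,) -> (M, embed_dim); first half sin, second half cos.
    omega = 1.0 / 10000 ** (torch.arange(embed_dim // 2, dtype=torch.float64) / (embed_dim / 2.0))
    out = torch.outer(pos.reshape(-1).double(), omega)
    return torch.cat([torch.sin(out), torch.cos(out)], dim=1)

def sincos_2d(embed_dim: int, h: int, w: int) -> torch.Tensor:
    rows = torch.arange(h, dtype=torch.float64).repeat_interleave(w)  # row index of each patch
    cols = torch.arange(w, dtype=torch.float64).repeat(h)             # column index of each patch
    return torch.cat([sincos_1d(embed_dim // 2, rows), sincos_1d(embed_dim // 2, cols)], dim=1)

print(sincos_2d(64, h=4, w=4).shape)  # torch.Size([16, 64])
```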
- """ - if embed_dim % 2 != 0: - raise ValueError("embed_dim must be divisible by 2") - - omega = np.arange(embed_dim // 2, dtype=np.float64) - omega /= embed_dim / 2.0 - omega = 1.0 / 10000**omega # (D/2,) - - pos = pos.reshape(-1) # (M,) - out = np.einsum("m,d->md", pos, omega) # (M, D/2), outer product - - emb_sin = np.sin(out) # (M, D/2) - emb_cos = np.cos(out) # (M, D/2) - - emb = np.concatenate([emb_sin, emb_cos], axis=1) # (M, D) - return emb - - class Timesteps(nn.Module): def __init__(self, num_channels: int, flip_sin_to_cos: bool, downscale_freq_shift: float): super().__init__() @@ -421,7 +274,7 @@ class CogView3PlusPatchEmbed(nn.Module): self.text_proj = nn.Linear(text_hidden_size, hidden_size) pos_embed = get_2d_sincos_pos_embed( - hidden_size, pos_embed_max_size, base_size=pos_embed_max_size, output_type="pt" + hidden_size, pos_embed_max_size, base_size=pos_embed_max_size ) pos_embed = pos_embed.reshape(pos_embed_max_size, pos_embed_max_size, hidden_size) self.register_buffer("pos_embed", pos_embed.float(), persistent=False) -- Gitee From e9b1be42198f4964d6fe6d89f75d77b1d8fa8ec8 Mon Sep 17 00:00:00 2001 From: jiangmengyu Date: Thu, 26 Dec 2024 18:30:38 +0800 Subject: [PATCH 26/91] =?UTF-8?q?cogview3=E9=80=82=E9=85=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../cogview3plus/layers/embeddings.py | 1 - .../cogview3plus/layers/normalization.py | 20 ++++++++++++++-- .../cogview3plus/models/modeling_utils.py | 8 +++---- .../models/transformer_cogview3plus.py | 23 +++++++++---------- 4 files changed, 33 insertions(+), 19 deletions(-) diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/layers/embeddings.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/layers/embeddings.py index dc1c683c63..1763b3b910 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/layers/embeddings.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/layers/embeddings.py @@ -242,7 +242,6 @@ class CogView3CombinedTimestepSizeEmbeddings(nn.Module): crop_coords_proj = self.condition_proj(crop_coords.flatten()).view(crop_coords.size(0), -1) target_size_proj = self.condition_proj(target_size.flatten()).view(target_size.size(0), -1) - # (B, 3 * condition_dim) condition_proj = torch.cat([original_size_proj, crop_coords_proj, target_size_proj], dim=1) timesteps_emb = self.timestep_embedder(timesteps_proj.to(dtype=hidden_dtype)) # (B, embedding_dim) diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/layers/normalization.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/layers/normalization.py index 64dbbe058a..88fd20c378 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/layers/normalization.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/layers/normalization.py @@ -15,6 +15,7 @@ import numbers from typing import Optional, Tuple +from dataclasses import dataclass import torch import torch.nn as nn @@ -85,7 +86,20 @@ class RMSNorm(nn.Module): hidden_states = hidden_states.to(input_dtype) return hidden_states - + + +@dataclass +class ChunkParam: + gate_msa: torch.Tensor + shift_mlp: torch.Tensor + scale_mlp: torch.Tensor + gate_mlp: torch.Tensor + context: torch.Tensor + c_gate_msa: torch.Tensor + c_shift_mlp: torch.Tensor + c_scale_mlp: torch.Tensor + c_gate_mlp_again: torch.Tensor + class CogView3PlusAdaLayerNormZeroTextImage(nn.Module): r""" @@ -129,7 +143,9 @@ class 
CogView3PlusAdaLayerNormZeroTextImage(nn.Module): normed_context = self.norm_c(context) x = normed_x * (1 + scale_msa[:, None]) + shift_msa[:, None] context = normed_context * (1 + c_scale_msa[:, None]) + c_shift_msa[:, None] - return x, gate_msa, shift_mlp, scale_mlp, gate_mlp, context, c_gate_msa, c_shift_mlp, c_scale_mlp, c_gate_mlp + return x, ChunkParam( + gate_msa, shift_mlp, scale_mlp, gate_mlp, context, c_gate_msa, c_shift_mlp, c_scale_mlp, c_gate_mlp + ) class FP32LayerNorm(nn.LayerNorm): diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/modeling_utils.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/modeling_utils.py index a0740b8c67..4a2816f283 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/modeling_utils.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/modeling_utils.py @@ -126,10 +126,10 @@ def get_parameter_dtype(parameter: torch.nn.Module) -> torch.dtype: gen = parameter._named_members(get_members_fn=find_tensor_attributes) last_tuple = None - for tuple in gen: - last_tuple = tuple - if tuple[1].is_floating_point(): - return tuple[1].dtype + for current_tuple in gen: + last_tuple = current_tuple + if current_tuple [1].is_floating_point(): + return current_tuple [1].dtype if last_tuple is not None: # fallback to the last dtype diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/transformer_cogview3plus.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/transformer_cogview3plus.py index f13d71880b..d96ca3966a 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/transformer_cogview3plus.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/transformer_cogview3plus.py @@ -83,18 +83,17 @@ class CogView3PlusTransformerBlock(nn.Module): text_seq_length = encoder_hidden_states.size(1) # norm & modulate - ( - norm_hidden_states, - gate_msa, - shift_mlp, - scale_mlp, - gate_mlp, - norm_encoder_hidden_states, - c_gate_msa, - c_shift_mlp, - c_scale_mlp, - c_gate_mlp, - ) = self.norm1(hidden_states, encoder_hidden_states, emb) + norm_hidden_states, chunk_params = self.norm1(hidden_states, encoder_hidden_states, emb) + + gate_msa = chunk_params.gate_msa + shift_mlp = chunk_params.shift_mlp + scale_mlp = chunk_params.scale_mlp + gate_mlp = chunk_params.gate_mlp + norm_encoder_hidden_states = chunk_params.context + c_gate_msa = chunk_params.c_gate_msa + c_shift_mlp = chunk_params.c_shift_mlp + c_scale_mlp = chunk_params.c_scale_mlp + c_gate_mlp = chunk_params.c_gate_mlp # attention attn_hidden_states, attn_encoder_hidden_states = self.attn1( -- Gitee From 883c4f55a505d64022d54f11889027fb1814aebe Mon Sep 17 00:00:00 2001 From: jiangmengyu Date: Thu, 26 Dec 2024 18:57:25 +0800 Subject: [PATCH 27/91] =?UTF-8?q?cogview3=E9=80=82=E9=85=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../foundation/cogview3/cogview3plus/layers/normalization.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/layers/normalization.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/layers/normalization.py index 88fd20c378..2ead694b9f 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/layers/normalization.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/layers/normalization.py @@ -98,7 +98,7 @@ class 
ChunkParam: c_gate_msa: torch.Tensor c_shift_mlp: torch.Tensor c_scale_mlp: torch.Tensor - c_gate_mlp_again: torch.Tensor + c_gate_mlp: torch.Tensor class CogView3PlusAdaLayerNormZeroTextImage(nn.Module): -- Gitee From 33cea1c2ce471157b16c6e6fd4a8266fb251c26b Mon Sep 17 00:00:00 2001 From: jiangmengyu Date: Thu, 26 Dec 2024 19:10:03 +0800 Subject: [PATCH 28/91] =?UTF-8?q?cogview3=E9=80=82=E9=85=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../cogview3plus/models/attention_processor.py | 14 -------------- .../models/transformer_cogview3plus.py | 10 ++++------ .../cogview3plus/pipeline/pipeline_cogview3plus.py | 4 +--- 3 files changed, 5 insertions(+), 23 deletions(-) diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/attention_processor.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/attention_processor.py index de7a2a130f..aa2961cf27 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/attention_processor.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/attention_processor.py @@ -229,20 +229,6 @@ class Attention(nn.Module): self.processor = processor - def get_processor(self, return_deprecated_lora: bool = False) -> "AttentionProcessor": - r""" - Get the attention processor in use. - - Args: - return_deprecated_lora (`bool`, *optional*, defaults to `False`): - Set to `True` to return the deprecated LoRA attention processor. - - Returns: - "AttentionProcessor": The attention processor in use. - """ - if not return_deprecated_lora: - return self.processor - def forward( self, hidden_states: torch.Tensor, diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/transformer_cogview3plus.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/transformer_cogview3plus.py index d96ca3966a..6df2dc361e 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/transformer_cogview3plus.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/transformer_cogview3plus.py @@ -14,6 +14,7 @@ from typing import Any, Dict, Union +from dataclasses import dataclass import torch import torch.nn as nn @@ -291,14 +292,14 @@ class CogView3PlusTransformer2DModel(ModelMixin, ConfigMixin): def forward( self, - hidden_states: torch.Tensor, - encoder_hidden_states: torch.Tensor, + states, timestep: torch.LongTensor, original_size: torch.Tensor, target_size: torch.Tensor, crop_coords: torch.Tensor, - return_dict: bool = True, ) -> Union[torch.Tensor, Transformer2DModelOutput]: + hidden_states = states[0] + encoder_hidden_states = states[1] height, width = hidden_states.shape[-2:] text_seq_length = encoder_hidden_states.shape[1] @@ -350,7 +351,4 @@ class CogView3PlusTransformer2DModel(ModelMixin, ConfigMixin): shape=(hidden_states.shape[0], self.out_channels, height * patch_size, width * patch_size) ) - if not return_dict: - return (output,) - return Transformer2DModelOutput(sample=output) diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py index 2e2ec1c8e8..cd6ddff267 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py @@ -583,13 +583,11 @@ class 
CogView3PlusPipeline(DiffusionPipeline): # predict noise model_output noise_pred = self.transformer( - hidden_states=latent_model_input, - encoder_hidden_states=prompt_embeds, + states=(latent_model_input, prompt_embeds), timestep=timestep, original_size=original_size, target_size=target_size, crop_coords=crops_coords_top_left, - return_dict=False, )[0] noise_pred = noise_pred.float() -- Gitee From 2f1992c42688c315a5540e51ece1c75db6907b90 Mon Sep 17 00:00:00 2001 From: jiangmengyu Date: Thu, 26 Dec 2024 19:17:00 +0800 Subject: [PATCH 29/91] =?UTF-8?q?cogview3=E9=80=82=E9=85=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../models/attention_processor.py | 6 - .../cogview3plus/models/modeling_utils.py | 160 +----------------- .../pipeline/pipeline_cogview3plus.py | 28 --- 3 files changed, 2 insertions(+), 192 deletions(-) diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/attention_processor.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/attention_processor.py index aa2961cf27..d20baee1ec 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/attention_processor.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/attention_processor.py @@ -301,16 +301,10 @@ class Attention(nn.Module): current_length: int = attention_mask.shape[-1] if current_length != target_length: if attention_mask.device.type == "mps": - # HACK: MPS: Does not support padding by greater than dimension of input tensor. - # Instead, we can manually construct the padding tensor. padding_shape = (attention_mask.shape[0], attention_mask.shape[1], target_length) padding = torch.zeros(padding_shape, dtype=attention_mask.dtype, device=attention_mask.device) attention_mask = torch.cat([attention_mask, padding], dim=2) else: - # TODO: for pipelines such as stable-diffusion, padding cross-attn mask: - # we want to instead pad by (0, remaining_length), where remaining_length is: - # remaining_length: int = target_length - current_length - # TODO: re-enable tests/models/test_models_unet_2d_condition.py#test_model_xattn_padding attention_mask = F.pad(attention_mask, (0, target_length), value=0.0) if out_dim == 3: diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/modeling_utils.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/modeling_utils.py index 4a2816f283..7895c5e1ea 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/modeling_utils.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/modeling_utils.py @@ -316,161 +316,6 @@ class ModelMixin(torch.nn.Module, PushToHubMixin): """ self.set_use_memory_efficient_attention_xformers(False) - def save_pretrained( - self, - save_directory: Union[str, os.PathLike], - is_main_process: bool = True, - save_function: Optional[Callable] = None, - safe_serialization: bool = True, - variant: Optional[str] = None, - max_shard_size: Union[int, str] = "10GB", - push_to_hub: bool = False, - **kwargs, - ): - """ - Save a model and its configuration file to a directory so that it can be reloaded using the - [`~models.ModelMixin.from_pretrained`] class method. - - Arguments: - save_directory (`str` or `os.PathLike`): - Directory to save a model and its configuration file to. Will be created if it doesn't exist. - is_main_process (`bool`, *optional*, defaults to `True`): - Whether the process calling this is the main process or not. 
Useful during distributed training and you - need to call this function on all processes. In this case, set `is_main_process=True` only on the main - process to avoid race conditions. - save_function (`Callable`): - The function to use to save the state dictionary. Useful during distributed training when you need to - replace `torch.save` with another method. Can be configured with the environment variable - `DIFFUSERS_SAVE_MODE`. - safe_serialization (`bool`, *optional*, defaults to `True`): - Whether to save the model using `safetensors` or the traditional PyTorch way with `pickle`. - variant (`str`, *optional*): - If specified, weights are saved in the format `pytorch_model..bin`. - max_shard_size (`int` or `str`, defaults to `"10GB"`): - The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size - lower than this size. If expressed as a string, needs to be digits followed by a unit (like `"5GB"`). - If expressed as an integer, the unit is bytes. Note that this limit will be decreased after a certain - period of time (starting from Oct 2024) to allow users to upgrade to the latest version of `diffusers`. - This is to establish a common default size for this argument across different libraries in the Hugging - Face ecosystem (`transformers`, and `accelerate`, for example). - push_to_hub (`bool`, *optional*, defaults to `False`): - Whether or not to push your model to the Hugging Face Hub after saving it. You can specify the - repository you want to push to with `repo_id` (will default to the name of `save_directory` in your - namespace). - kwargs (`Dict[str, Any]`, *optional*): - Additional keyword arguments passed along to the [`~utils.PushToHubMixin.push_to_hub`] method. - """ - if os.path.isfile(save_directory): - logger.error(f"Provided path ({save_directory}) should be a directory, not a file") - return - - hf_quantizer = getattr(self, "hf_quantizer", None) - if hf_quantizer is not None: - quantization_serializable = ( - hf_quantizer is not None - and isinstance(hf_quantizer, DiffusersQuantizer) - and hf_quantizer.is_serializable - ) - if not quantization_serializable: - raise ValueError( - f"The model is quantized with {hf_quantizer.quantization_config.quant_method} and is not serializable - check out the warnings from" - " the logger on the traceback to understand the reason why the quantized model is not serializable." 
- ) - - weights_name = SAFETENSORS_WEIGHTS_NAME if safe_serialization else WEIGHTS_NAME - weights_name = _add_variant(weights_name, variant) - weights_name_pattern = weights_name.replace(".bin", "{suffix}.bin").replace( - ".safetensors", "{suffix}.safetensors" - ) - - os.makedirs(save_directory, exist_ok=True) - - if push_to_hub: - commit_message = kwargs.pop("commit_message", None) - private = kwargs.pop("private", None) - create_pr = kwargs.pop("create_pr", False) - token = kwargs.pop("token", None) - repo_id = kwargs.pop("repo_id", save_directory.split(os.path.sep)[-1]) - repo_id = create_repo(repo_id, exist_ok=True, private=private, token=token).repo_id - - # Only save the model itself if we are using distributed training - model_to_save = self - - # Attach architecture to the config - # Save the config - if is_main_process: - model_to_save.save_config(save_directory) - - # Save the model - state_dict = model_to_save.state_dict() - - # Save the model - state_dict_split = split_torch_state_dict_into_shards( - state_dict, max_shard_size=max_shard_size, filename_pattern=weights_name_pattern - ) - - # Clean the folder from a previous save - if is_main_process: - for filename in os.listdir(save_directory): - if filename in state_dict_split.filename_to_tensors.keys(): - continue - full_filename = os.path.join(save_directory, filename) - if not os.path.isfile(full_filename): - continue - weights_without_ext = weights_name_pattern.replace(".bin", "").replace(".safetensors", "") - weights_without_ext = weights_without_ext.replace("{suffix}", "") - filename_without_ext = filename.replace(".bin", "").replace(".safetensors", "") - # make sure that file to be deleted matches format of sharded file, e.g. pytorch_model-00001-of-00005 - if ( - filename.startswith(weights_without_ext) - and _REGEX_SHARD.fullmatch(filename_without_ext) is not None - ): - os.remove(full_filename) - - for filename, tensors in state_dict_split.filename_to_tensors.items(): - shard = {tensor: state_dict[tensor] for tensor in tensors} - filepath = os.path.join(save_directory, filename) - if safe_serialization: - # At some point we will need to deal better with save_function (used for TPU and other distributed - # joyfulness), but for now this enough. - safetensors.torch.save_file(shard, filepath, metadata={"format": "pt"}) - else: - torch.save(shard, filepath) - - if state_dict_split.is_sharded: - index = { - "metadata": state_dict_split.metadata, - "weight_map": state_dict_split.tensor_to_filename, - } - save_index_file = SAFE_WEIGHTS_INDEX_NAME if safe_serialization else WEIGHTS_INDEX_NAME - save_index_file = os.path.join(save_directory, _add_variant(save_index_file, variant)) - # Save the index as well - with open(save_index_file, "w", encoding="utf-8") as f: - content = json.dumps(index, indent=2, sort_keys=True) + "\n" - f.write(content) - logger.info( - f"The model is bigger than the maximum size per checkpoint ({max_shard_size}) and is going to be " - f"split in {len(state_dict_split.filename_to_tensors)} checkpoint shards. You can find where each parameters has been saved in the " - f"index located at {save_index_file}." 
- ) - else: - path_to_weights = os.path.join(save_directory, weights_name) - logger.info(f"Model weights saved in {path_to_weights}") - - if push_to_hub: - # Create a new empty model card and eventually tag it - model_card = load_or_create_model_card(repo_id, token=token) - model_card = populate_model_card(model_card) - model_card.save(Path(save_directory, "README.md").as_posix()) - - self._upload_folder( - save_directory, - repo_id, - token=token, - commit_message=commit_message, - create_pr=create_pr, - ) - def dequantize(self): """ Potentially dequantize the model in case it has been quantized by a quantization method that support @@ -652,11 +497,11 @@ class ModelMixin(torch.nn.Module, PushToHubMixin): elif isinstance(device_map, str) and device_map not in ["auto", "balanced", "balanced_low_0", "sequential"]: try: device_map = {"": torch.device(device_map)} - except RuntimeError: + except RuntimeError as e: raise ValueError( "When passing device_map as a string, the value needs to be a device name (e.g. cpu, cuda:0) or " f"'auto', 'balanced', 'balanced_low_0', 'sequential' but found {device_map}." - ) + ) from e elif isinstance(device_map, int): if device_map < 0: raise ValueError( @@ -882,7 +727,6 @@ class ModelMixin(torch.nn.Module, PushToHubMixin): # It would error out during the `validate_environment()` call above in the absence of cuda. if hf_quantizer is None: param_device = "cpu" - # TODO (sayakpaul, SunMarc): remove this after model loading refactor else: param_device = torch.device(torch.cuda.current_device()) state_dict = load_state_dict(model_file, variant=variant) diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py index cd6ddff267..a6d0fbd5f4 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py @@ -193,34 +193,6 @@ class CogView3PlusPipeline(DiffusionPipeline): device: Optional[torch.device] = None, dtype: Optional[torch.dtype] = None, ): - r""" - Encodes the prompt into text encoder hidden states. - - Args: - prompt (`str` or `List[str]`, *optional*): - prompt to be encoded - negative_prompt (`str` or `List[str]`, *optional*): - The prompt or prompts not to guide the image generation. If not defined, one has to pass - `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is - less than `1`). - do_classifier_free_guidance (`bool`, *optional*, defaults to `True`): - Whether to use classifier free guidance or not. - num_images_per_prompt (`int`, *optional*, defaults to 1): - Number of images that should be generated per prompt. torch device to place the resulting embeddings on - prompt_embeds (`torch.Tensor`, *optional*): - Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not - provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.Tensor`, *optional*): - Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt - weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input - argument. - max_sequence_length (`int`, defaults to `224`): - Maximum sequence length in encoded prompt. Can be set to other values but may lead to poorer results. 
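In terms of what `max_sequence_length` controls, prompt encoding with the T5 components listed in `model_index.json` conceptually looks like the sketch below; the weight path is a placeholder and this mirrors the common diffusers tokenize-then-encode pattern rather than quoting `encode_prompt` itself:

```python
import torch
from transformers import T5EncoderModel, T5Tokenizer

model_path = "/data/CogView3B"  # placeholder: same layout as described in the README
tokenizer = T5Tokenizer.from_pretrained(model_path, subfolder="tokenizer")
text_encoder = T5EncoderModel.from_pretrained(model_path, subfolder="text_encoder", torch_dtype=torch.bfloat16)

inputs = tokenizer(
    ["A vibrant cherry red sports car under the gleaming sun"],
    padding="max_length",
    max_length=224,  # the pipeline's max_sequence_length default
    truncation=True,
    return_tensors="pt",
)
prompt_embeds = text_encoder(inputs.input_ids)[0]  # (batch, 224, hidden_size)
```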
- device: (`torch.device`, *optional*): - torch device - dtype: (`torch.dtype`, *optional*): - torch dtype - """ device = device or self._execution_device prompt = [prompt] if isinstance(prompt, str) else prompt -- Gitee From bf7d28c49747d94610656eb29426c134e1b45b90 Mon Sep 17 00:00:00 2001 From: jiangmengyu Date: Thu, 26 Dec 2024 20:17:31 +0800 Subject: [PATCH 30/91] =?UTF-8?q?cogview3=E9=80=82=E9=85=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../cogview3/cogview3plus/models/modeling_utils.py | 6 +++--- .../cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py | 2 ++ 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/modeling_utils.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/modeling_utils.py index 7895c5e1ea..f83eff493f 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/modeling_utils.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/modeling_utils.py @@ -126,10 +126,10 @@ def get_parameter_dtype(parameter: torch.nn.Module) -> torch.dtype: gen = parameter._named_members(get_members_fn=find_tensor_attributes) last_tuple = None - for current_tuple in gen: + for current_tuple in gen: last_tuple = current_tuple - if current_tuple [1].is_floating_point(): - return current_tuple [1].dtype + if current_tuple[1].is_floating_point(): + return current_tuple[1].dtype if last_tuple is not None: # fallback to the last dtype diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py index a6d0fbd5f4..7368a528a3 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py @@ -451,6 +451,8 @@ class CogView3PlusPipeline(DiffusionPipeline): [`~pipelines.cogview3.pipeline_cogview3plus.CogView3PipelineOutput`] if `return_dict` is True, otherwise a `tuple`. When returning a tuple, the first element is a list with the generated images. 
""" + prompt = prompt if prompt is not None else [] + negative_prompt = negative_prompt if negative_prompt is not None else [] if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)): callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs -- Gitee From 344e429078093050d7b2f15d71e593eaead03f51 Mon Sep 17 00:00:00 2001 From: jiangmengyu Date: Thu, 26 Dec 2024 20:23:19 +0800 Subject: [PATCH 31/91] =?UTF-8?q?cogview3=E9=80=82=E9=85=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../schedulers/scheduling_ddim_cogvideox.py | 3 --- .../schedulers/scheduling_dpm_cogvideox.py | 3 --- .../foundation/cogview3/cogview3plus/vae/vae.py | 16 ++++++++-------- 3 files changed, 8 insertions(+), 14 deletions(-) diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/schedulers/scheduling_ddim_cogvideox.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/schedulers/scheduling_ddim_cogvideox.py index b4e22a0615..c2464cf51b 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/schedulers/scheduling_ddim_cogvideox.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/schedulers/scheduling_ddim_cogvideox.py @@ -280,13 +280,10 @@ class CogVideoXDDIMScheduler(SchedulerMixin, ConfigMixin): # 3. compute predicted original sample from predicted noise also called if self.config.prediction_type == "epsilon": pred_original_sample = (sample - beta_prod_t ** (0.5) * model_output) / alpha_prod_t ** (0.5) - # pred_epsilon = model_output elif self.config.prediction_type == "sample": pred_original_sample = model_output - # pred_epsilon = (sample - alpha_prod_t ** (0.5) * pred_original_sample) / beta_prod_t ** (0.5) elif self.config.prediction_type == "v_prediction": pred_original_sample = (alpha_prod_t**0.5) * sample - (beta_prod_t**0.5) * model_output - # pred_epsilon = (alpha_prod_t**0.5) * model_output + (beta_prod_t**0.5) * sample else: raise ValueError( f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, `sample`, or" diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/schedulers/scheduling_dpm_cogvideox.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/schedulers/scheduling_dpm_cogvideox.py index 6d25dea524..5486e7cb93 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/schedulers/scheduling_dpm_cogvideox.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/schedulers/scheduling_dpm_cogvideox.py @@ -401,13 +401,10 @@ class CogVideoXDPMScheduler(SchedulerMixin, ConfigMixin): # To make style tests pass, commented out `pred_epsilon` as it is an unused variable if self.config.prediction_type == "epsilon": pred_original_sample = (sample - beta_prod_t ** (0.5) * model_output) / alpha_prod_t ** (0.5) - # pred_epsilon = model_output elif self.config.prediction_type == "sample": pred_original_sample = model_output - # pred_epsilon = (sample - alpha_prod_t ** (0.5) * pred_original_sample) / beta_prod_t ** (0.5) elif self.config.prediction_type == "v_prediction": pred_original_sample = (alpha_prod_t**0.5) * sample - (beta_prod_t**0.5) * model_output - # pred_epsilon = (alpha_prod_t**0.5) * model_output + (beta_prod_t**0.5) * sample else: raise ValueError( f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, `sample`, or" diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/vae/vae.py 
b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/vae/vae.py index 006ed75f1f..2323be5a78 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/vae/vae.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/vae/vae.py @@ -420,9 +420,9 @@ class MaskConditionEncoder(nn.Module): layers = [] in_ch_ = in_ch - for l in range(len(out_channels)): - out_ch_ = out_channels[l] - if l == 0 or l == 1: + for i, _ in enumerate(out_channels): + out_ch_ = out_channels[i] + if i == 0 or i == 1: layers.append(nn.Conv2d(in_ch_, out_ch_, kernel_size=3, stride=1, padding=1)) else: layers.append(nn.Conv2d(in_ch_, out_ch_, kernel_size=4, stride=2, padding=1)) @@ -433,8 +433,8 @@ class MaskConditionEncoder(nn.Module): def forward(self, x: torch.Tensor, mask=None) -> torch.Tensor: r"""The forward method of the `MaskConditionEncoder` class.""" out = {} - for l in range(len(self.layers)): - layer = self.layers[l] + for i, _ in enumerate(self.layers): + layer = self.layers[i] x = layer(x) out[str(tuple(x.shape))] = x x = torch.relu(x) @@ -703,7 +703,6 @@ class VectorQuantizer(nn.Module): def remap_to_used(self, inds: torch.LongTensor) -> torch.LongTensor: ishape = inds.shape - assert len(ishape) > 1 inds = inds.reshape(ishape[0], -1) used = self.used.to(inds) match = (inds[:, :, None] == used[None, None, ...]).long() @@ -717,7 +716,6 @@ class VectorQuantizer(nn.Module): def unmap_to_all(self, inds: torch.LongTensor) -> torch.LongTensor: ishape = inds.shape - assert len(ishape) > 1 inds = inds.reshape(ishape[0], -1) used = self.used.to(inds) if self.re_embed > self.used.shape[0]: # extra token @@ -820,7 +818,9 @@ class DiagonalGaussianDistribution(object): dim=[1, 2, 3], ) - def nll(self, sample: torch.Tensor, dims: Tuple[int, ...] = [1, 2, 3]) -> torch.Tensor: + def nll(self, sample: torch.Tensor, dims: Tuple[int, ...] = None) -> torch.Tensor: + if dims is None: + dims = [1, 2, 3] if self.deterministic: return torch.Tensor([0.0]) logtwopi = np.log(2.0 * np.pi) -- Gitee From 97a6dd950ad99df7e73b89f58fe19ad78aafbc02 Mon Sep 17 00:00:00 2001 From: jiangmengyu Date: Thu, 26 Dec 2024 20:24:28 +0800 Subject: [PATCH 32/91] =?UTF-8?q?cogview3=E9=80=82=E9=85=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py index 7368a528a3..7e47fcd6aa 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py @@ -451,9 +451,6 @@ class CogView3PlusPipeline(DiffusionPipeline): [`~pipelines.cogview3.pipeline_cogview3plus.CogView3PipelineOutput`] if `return_dict` is True, otherwise a `tuple`. When returning a tuple, the first element is a list with the generated images. 
""" - prompt = prompt if prompt is not None else [] - negative_prompt = negative_prompt if negative_prompt is not None else [] - if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)): callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs -- Gitee From b9c42ba0ba1cdf53b1796b3ef0a153ffa3dfdda9 Mon Sep 17 00:00:00 2001 From: jiangmengyu Date: Fri, 27 Dec 2024 10:03:36 +0800 Subject: [PATCH 33/91] =?UTF-8?q?cogview3=E9=80=82=E9=85=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py index 7e47fcd6aa..cfaca19fad 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py @@ -210,8 +210,9 @@ class CogView3PlusPipeline(DiffusionPipeline): dtype=dtype, ) - if do_classifier_free_guidance and negative_prompt is None: - negative_prompt_embeds = prompt_embeds.new_zeros(prompt_embeds.shape) + negative_prompt_embeds = prompt_embeds.new_zeros(prompt_embeds.shape) + print(negative_prompt_embeds) + exit() if do_classifier_free_guidance and negative_prompt_embeds is None: negative_prompt = batch_size * [negative_prompt] if isinstance(negative_prompt, str) else negative_prompt -- Gitee From 3d95b863b0d6270943c6e05fb57f96c9a7f687bd Mon Sep 17 00:00:00 2001 From: jiangmengyu Date: Fri, 27 Dec 2024 10:05:32 +0800 Subject: [PATCH 34/91] =?UTF-8?q?cogview3=E9=80=82=E9=85=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py index cfaca19fad..3d65d2bbd2 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py @@ -211,10 +211,11 @@ class CogView3PlusPipeline(DiffusionPipeline): ) negative_prompt_embeds = prompt_embeds.new_zeros(prompt_embeds.shape) - print(negative_prompt_embeds) - exit() + if do_classifier_free_guidance and negative_prompt_embeds is None: + print(negative_prompt_embeds) + exit() negative_prompt = batch_size * [negative_prompt] if isinstance(negative_prompt, str) else negative_prompt if prompt is not None and type(prompt) is not type(negative_prompt): -- Gitee From 3b521c17fff9c2c4f90cac7741a88f13eeca7d8a Mon Sep 17 00:00:00 2001 From: jiangmengyu Date: Fri, 27 Dec 2024 10:34:50 +0800 Subject: [PATCH 35/91] =?UTF-8?q?cogview3=E9=80=82=E9=85=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../pipeline/pipeline_cogview3plus.py | 142 +----------------- 1 file changed, 7 insertions(+), 135 deletions(-) diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py 
b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py index 3d65d2bbd2..df6459f818 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py @@ -184,11 +184,7 @@ class CogView3PlusPipeline(DiffusionPipeline): def encode_prompt( self, prompt: Union[str, List[str]], - negative_prompt: Optional[Union[str, List[str]]] = None, - do_classifier_free_guidance: bool = True, num_images_per_prompt: int = 1, - prompt_embeds: Optional[torch.Tensor] = None, - negative_prompt_embeds: Optional[torch.Tensor] = None, max_sequence_length: int = 224, device: Optional[torch.device] = None, dtype: Optional[torch.dtype] = None, @@ -196,48 +192,15 @@ class CogView3PlusPipeline(DiffusionPipeline): device = device or self._execution_device prompt = [prompt] if isinstance(prompt, str) else prompt - if prompt is not None: - batch_size = len(prompt) - else: - batch_size = prompt_embeds.shape[0] - - if prompt_embeds is None: - prompt_embeds = self._get_t5_prompt_embeds( - prompt=prompt, - num_images_per_prompt=num_images_per_prompt, - max_sequence_length=max_sequence_length, - device=device, - dtype=dtype, - ) - + prompt_embeds = self._get_t5_prompt_embeds( + prompt=prompt, + num_images_per_prompt=num_images_per_prompt, + max_sequence_length=max_sequence_length, + device=device, + dtype=dtype, + ) negative_prompt_embeds = prompt_embeds.new_zeros(prompt_embeds.shape) - - if do_classifier_free_guidance and negative_prompt_embeds is None: - print(negative_prompt_embeds) - exit() - negative_prompt = batch_size * [negative_prompt] if isinstance(negative_prompt, str) else negative_prompt - - if prompt is not None and type(prompt) is not type(negative_prompt): - raise TypeError( - f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" - f" {type(prompt)}." - ) - elif batch_size != len(negative_prompt): - raise ValueError( - f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" - f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" - " the batch size of `prompt`." - ) - - negative_prompt_embeds = self._get_t5_prompt_embeds( - prompt=negative_prompt, - num_images_per_prompt=num_images_per_prompt, - max_sequence_length=max_sequence_length, - device=device, - dtype=dtype, - ) - return prompt_embeds, negative_prompt_embeds # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents @@ -371,88 +334,6 @@ class CogView3PlusPipeline(DiffusionPipeline): callback_on_step_end_tensor_inputs: List[str] = ["latents"], max_sequence_length: int = 224, ) -> Union[CogView3PipelineOutput, Tuple]: - """ - Function invoked when calling the pipeline for generation. - - Args: - prompt (`str` or `List[str]`, *optional*): - The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`. - negative_prompt (`str` or `List[str]`, *optional*): - The prompt or prompts not to guide the image generation. If not defined, one has to pass - `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is - less than `1`). - height (`int`, *optional*, defaults to self.transformer.config.sample_size * self.vae_scale_factor): - The height in pixels of the generated image. If not provided, it is set to 1024. 
- width (`int`, *optional*, defaults to self.transformer.config.sample_size * self.vae_scale_factor): - The width in pixels of the generated image. If not provided it is set to 1024. - num_inference_steps (`int`, *optional*, defaults to `50`): - The number of denoising steps. More denoising steps usually lead to a higher quality image at the - expense of slower inference. - timesteps (`List[int]`, *optional*): - Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument - in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is - passed will be used. Must be in descending order. - guidance_scale (`float`, *optional*, defaults to `5.0`): - Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). - `guidance_scale` is defined as `w` of equation 2. of [Imagen - Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > - 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, - usually at the expense of lower image quality. - num_images_per_prompt (`int`, *optional*, defaults to `1`): - The number of images to generate per prompt. - generator (`torch.Generator` or `List[torch.Generator]`, *optional*): - One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) - to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): - Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image - generation. Can be used to tweak the same generation with different prompts. If not provided, a latents - tensor will ge generated by sampling using the supplied random `generator`. - prompt_embeds (`torch.FloatTensor`, *optional*): - Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not - provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): - Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt - weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input - argument. - original_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)): - If `original_size` is not the same as `target_size` the image will appear to be down- or upsampled. - `original_size` defaults to `(height, width)` if not specified. Part of SDXL's micro-conditioning as - explained in section 2.2 of - [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). - crops_coords_top_left (`Tuple[int]`, *optional*, defaults to (0, 0)): - `crops_coords_top_left` can be used to generate an image that appears to be "cropped" from the position - `crops_coords_top_left` downwards. Favorable, well-centered images are usually achieved by setting - `crops_coords_top_left` to (0, 0). Part of SDXL's micro-conditioning as explained in section 2.2 of - [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). - output_type (`str`, *optional*, defaults to `"pil"`): - The output format of the generate image. Choose between - [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. - return_dict (`bool`, *optional*, defaults to `True`): - Whether or not to return a [`~pipelines.stable_diffusion_xl.StableDiffusionXLPipelineOutput`] instead - of a plain tuple. 
- attention_kwargs (`dict`, *optional*): - A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under - `self.processor` in - [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py). - callback_on_step_end (`Callable`, *optional*): - A function that calls at the end of each denoising steps during the inference. The function is called - with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int, - callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by - `callback_on_step_end_tensor_inputs`. - callback_on_step_end_tensor_inputs (`List`, *optional*): - The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list - will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the - `._callback_tensor_inputs` attribute of your pipeline class. - max_sequence_length (`int`, defaults to `224`): - Maximum sequence length in encoded prompt. Can be set to other values but may lead to poorer results. - - Examples: - - Returns: - [`~pipelines.cogview3.pipeline_cogview3plus.CogView3PipelineOutput`] or `tuple`: - [`~pipelines.cogview3.pipeline_cogview3plus.CogView3PipelineOutput`] if `return_dict` is True, otherwise a - `tuple`. When returning a tuple, the first element is a list with the generated images. - """ if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)): callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs @@ -485,19 +366,10 @@ class CogView3PlusPipeline(DiffusionPipeline): device = self._execution_device - # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) - # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` - # corresponds to doing no classifier free guidance. - do_classifier_free_guidance = guidance_scale > 1.0 - # 3. Encode input prompt prompt_embeds, negative_prompt_embeds = self.encode_prompt( prompt, - negative_prompt, - self.do_classifier_free_guidance, num_images_per_prompt=num_images_per_prompt, - prompt_embeds=prompt_embeds, - negative_prompt_embeds=negative_prompt_embeds, max_sequence_length=max_sequence_length, device=device, ) -- Gitee From 92b825af6493dadb4c8c8619aac2db45d4314ab4 Mon Sep 17 00:00:00 2001 From: jiangmengyu Date: Fri, 27 Dec 2024 10:35:55 +0800 Subject: [PATCH 36/91] =?UTF-8?q?cogview3=E9=80=82=E9=85=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../pipeline/pipeline_cogview3plus.py | 83 +++++++++++++++++++ 1 file changed, 83 insertions(+) diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py index df6459f818..2e19c13d08 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py @@ -334,6 +334,89 @@ class CogView3PlusPipeline(DiffusionPipeline): callback_on_step_end_tensor_inputs: List[str] = ["latents"], max_sequence_length: int = 224, ) -> Union[CogView3PipelineOutput, Tuple]: + """ + Function invoked when calling the pipeline for generation. 
+ + Args: + prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`. + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation. If not defined, one has to pass + `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is + less than `1`). + height (`int`, *optional*, defaults to self.transformer.config.sample_size * self.vae_scale_factor): + The height in pixels of the generated image. If not provided, it is set to 1024. + width (`int`, *optional*, defaults to self.transformer.config.sample_size * self.vae_scale_factor): + The width in pixels of the generated image. If not provided it is set to 1024. + num_inference_steps (`int`, *optional*, defaults to `50`): + The number of denoising steps. More denoising steps usually lead to a higher quality image at the + expense of slower inference. + timesteps (`List[int]`, *optional*): + Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument + in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is + passed will be used. Must be in descending order. + guidance_scale (`float`, *optional*, defaults to `5.0`): + Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). + `guidance_scale` is defined as `w` of equation 2. of [Imagen + Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > + 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, + usually at the expense of lower image quality. + num_images_per_prompt (`int`, *optional*, defaults to `1`): + The number of images to generate per prompt. + generator (`torch.Generator` or `List[torch.Generator]`, *optional*): + One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) + to make generation deterministic. + latents (`torch.FloatTensor`, *optional*): + Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image + generation. Can be used to tweak the same generation with different prompts. If not provided, a latents + tensor will ge generated by sampling using the supplied random `generator`. + prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not + provided, text embeddings will be generated from `prompt` input argument. + negative_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input + argument. + original_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)): + If `original_size` is not the same as `target_size` the image will appear to be down- or upsampled. + `original_size` defaults to `(height, width)` if not specified. Part of SDXL's micro-conditioning as + explained in section 2.2 of + [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). + crops_coords_top_left (`Tuple[int]`, *optional*, defaults to (0, 0)): + `crops_coords_top_left` can be used to generate an image that appears to be "cropped" from the position + `crops_coords_top_left` downwards. 
Favorable, well-centered images are usually achieved by setting + `crops_coords_top_left` to (0, 0). Part of SDXL's micro-conditioning as explained in section 2.2 of + [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). + output_type (`str`, *optional*, defaults to `"pil"`): + The output format of the generate image. Choose between + [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~pipelines.stable_diffusion_xl.StableDiffusionXLPipelineOutput`] instead + of a plain tuple. + attention_kwargs (`dict`, *optional*): + A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under + `self.processor` in + [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py). + callback_on_step_end (`Callable`, *optional*): + A function that calls at the end of each denoising steps during the inference. The function is called + with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int, + callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by + `callback_on_step_end_tensor_inputs`. + callback_on_step_end_tensor_inputs (`List`, *optional*): + The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list + will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the + `._callback_tensor_inputs` attribute of your pipeline class. + max_sequence_length (`int`, defaults to `224`): + Maximum sequence length in encoded prompt. Can be set to other values but may lead to poorer results. + + Examples: + + Returns: + [`~pipelines.cogview3.pipeline_cogview3plus.CogView3PipelineOutput`] or `tuple`: + [`~pipelines.cogview3.pipeline_cogview3plus.CogView3PipelineOutput`] if `return_dict` is True, otherwise a + `tuple`. When returning a tuple, the first element is a list with the generated images. + """ + if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)): callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs -- Gitee From 480d05e1c79432b230fc54dc930f8be0b6e6e341 Mon Sep 17 00:00:00 2001 From: jiangmengyu Date: Fri, 27 Dec 2024 10:40:11 +0800 Subject: [PATCH 37/91] =?UTF-8?q?cogview3=E9=80=82=E9=85=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../cogview3plus/pipeline/pipeline_cogview3plus.py | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py index 2e19c13d08..e55ad153a0 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py @@ -245,7 +245,6 @@ class CogView3PlusPipeline(DiffusionPipeline): prompt, height, width, - negative_prompt, callback_on_step_end_tensor_inputs, prompt_embeds=None, negative_prompt_embeds=None, @@ -277,12 +276,6 @@ class CogView3PlusPipeline(DiffusionPipeline): f" {negative_prompt_embeds}. Please make sure to only forward one of the two." 
) - if negative_prompt is not None and negative_prompt_embeds is not None: - raise ValueError( - f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:" - f" {negative_prompt_embeds}. Please make sure to only forward one of the two." - ) - if prompt_embeds is not None and negative_prompt_embeds is not None: if prompt_embeds.shape != negative_prompt_embeds.shape: raise ValueError( @@ -312,11 +305,9 @@ class CogView3PlusPipeline(DiffusionPipeline): def __call__( self, prompt: Optional[Union[str, List[str]]] = None, - negative_prompt: Optional[Union[str, List[str]]] = None, height: Optional[int] = None, width: Optional[int] = None, num_inference_steps: int = 50, - timesteps: Optional[List[int]] = None, guidance_scale: float = 5.0, num_images_per_prompt: int = 1, eta: float = 0.0, @@ -416,7 +407,7 @@ class CogView3PlusPipeline(DiffusionPipeline): [`~pipelines.cogview3.pipeline_cogview3plus.CogView3PipelineOutput`] if `return_dict` is True, otherwise a `tuple`. When returning a tuple, the first element is a list with the generated images. """ - + if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)): callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs @@ -431,7 +422,6 @@ class CogView3PlusPipeline(DiffusionPipeline): prompt, height, width, - negative_prompt, callback_on_step_end_tensor_inputs, prompt_embeds, negative_prompt_embeds, @@ -460,7 +450,7 @@ class CogView3PlusPipeline(DiffusionPipeline): prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds], dim=0) # 4. Prepare timesteps - timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, num_inference_steps, device, timesteps) + timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, num_inference_steps, device) self._num_timesteps = len(timesteps) # 5. Prepare latents. -- Gitee From 6852ece8a01d3c43e5516192a9a901492383daa8 Mon Sep 17 00:00:00 2001 From: jiangmengyu Date: Fri, 27 Dec 2024 10:46:09 +0800 Subject: [PATCH 38/91] =?UTF-8?q?cogview3=E9=80=82=E9=85=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py index e55ad153a0..8f9ec8693f 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py @@ -310,7 +310,6 @@ class CogView3PlusPipeline(DiffusionPipeline): num_inference_steps: int = 50, guidance_scale: float = 5.0, num_images_per_prompt: int = 1, - eta: float = 0.0, generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, latents: Optional[torch.FloatTensor] = None, prompt_embeds: Optional[torch.FloatTensor] = None, @@ -467,7 +466,7 @@ class CogView3PlusPipeline(DiffusionPipeline): ) # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline - extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) + extra_step_kwargs = self.prepare_extra_step_kwargs(generator, 0.0) # 7. 
Prepare additional timestep conditions original_size = torch.tensor([original_size], dtype=prompt_embeds.dtype) -- Gitee From 4a8329b36925afc4447d61131ea35129f5facf7b Mon Sep 17 00:00:00 2001 From: jiangmengyu Date: Fri, 27 Dec 2024 10:49:17 +0800 Subject: [PATCH 39/91] =?UTF-8?q?cogview3=E9=80=82=E9=85=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../cogview3plus/pipeline/pipeline_cogview3plus.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py index 8f9ec8693f..efdd50f1bf 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py @@ -310,8 +310,6 @@ class CogView3PlusPipeline(DiffusionPipeline): num_inference_steps: int = 50, guidance_scale: float = 5.0, num_images_per_prompt: int = 1, - generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, - latents: Optional[torch.FloatTensor] = None, prompt_embeds: Optional[torch.FloatTensor] = None, negative_prompt_embeds: Optional[torch.FloatTensor] = None, original_size: Optional[Tuple[int, int]] = None, @@ -461,12 +459,11 @@ class CogView3PlusPipeline(DiffusionPipeline): width, prompt_embeds.dtype, device, - generator, - latents, + None, ) # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline - extra_step_kwargs = self.prepare_extra_step_kwargs(generator, 0.0) + extra_step_kwargs = self.prepare_extra_step_kwargs(None, 0.0) # 7. Prepare additional timestep conditions original_size = torch.tensor([original_size], dtype=prompt_embeds.dtype) @@ -543,7 +540,7 @@ class CogView3PlusPipeline(DiffusionPipeline): progress_bar.update() if not output_type == "latent": - image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False, generator=generator)[ + image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False, generator=None)[ 0 ] else: -- Gitee From 9dc08db22da818b28fb29dece76912ecf07d7ce9 Mon Sep 17 00:00:00 2001 From: jiangmengyu Date: Fri, 27 Dec 2024 10:55:08 +0800 Subject: [PATCH 40/91] =?UTF-8?q?cogview3=E9=80=82=E9=85=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../pipeline/pipeline_cogview3plus.py | 44 ++----------------- 1 file changed, 3 insertions(+), 41 deletions(-) diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py index efdd50f1bf..10aca046d7 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py @@ -245,45 +245,13 @@ class CogView3PlusPipeline(DiffusionPipeline): prompt, height, width, - callback_on_step_end_tensor_inputs, - prompt_embeds=None, - negative_prompt_embeds=None, ): if height % 8 != 0 or width % 8 != 0: raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") - if callback_on_step_end_tensor_inputs is not None and not all( - k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs - 
): - raise ValueError( - f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}" - ) - if prompt is not None and prompt_embeds is not None: - raise ValueError( - f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" - " only forward one of the two." - ) - elif prompt is None and prompt_embeds is None: - raise ValueError( - "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." - ) - elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): + if prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") - if prompt is not None and negative_prompt_embeds is not None: - raise ValueError( - f"Cannot forward both `prompt`: {prompt} and `negative_prompt_embeds`:" - f" {negative_prompt_embeds}. Please make sure to only forward one of the two." - ) - - if prompt_embeds is not None and negative_prompt_embeds is not None: - if prompt_embeds.shape != negative_prompt_embeds.shape: - raise ValueError( - "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" - f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" - f" {negative_prompt_embeds.shape}." - ) - @property def guidance_scale(self): return self._guidance_scale @@ -310,7 +278,6 @@ class CogView3PlusPipeline(DiffusionPipeline): num_inference_steps: int = 50, guidance_scale: float = 5.0, num_images_per_prompt: int = 1, - prompt_embeds: Optional[torch.FloatTensor] = None, negative_prompt_embeds: Optional[torch.FloatTensor] = None, original_size: Optional[Tuple[int, int]] = None, crops_coords_top_left: Tuple[int, int] = (0, 0), @@ -419,20 +386,15 @@ class CogView3PlusPipeline(DiffusionPipeline): prompt, height, width, - callback_on_step_end_tensor_inputs, - prompt_embeds, - negative_prompt_embeds, ) self._guidance_scale = guidance_scale self._interrupt = False # 2. 
Default call parameters - if prompt is not None and isinstance(prompt, str): + if isinstance(prompt, str): batch_size = 1 - elif prompt is not None and isinstance(prompt, list): - batch_size = len(prompt) else: - batch_size = prompt_embeds.shape[0] + batch_size = len(prompt) device = self._execution_device -- Gitee From 96f8a2516cd943c9291b9abd79a97fe5de105f68 Mon Sep 17 00:00:00 2001 From: jiangmengyu Date: Fri, 27 Dec 2024 11:00:41 +0800 Subject: [PATCH 41/91] =?UTF-8?q?cogview3=E9=80=82=E9=85=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../pipeline/pipeline_cogview3plus.py | 43 +++---------------- 1 file changed, 5 insertions(+), 38 deletions(-) diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py index 10aca046d7..33b01feae5 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py @@ -278,16 +278,6 @@ class CogView3PlusPipeline(DiffusionPipeline): num_inference_steps: int = 50, guidance_scale: float = 5.0, num_images_per_prompt: int = 1, - negative_prompt_embeds: Optional[torch.FloatTensor] = None, - original_size: Optional[Tuple[int, int]] = None, - crops_coords_top_left: Tuple[int, int] = (0, 0), - output_type: str = "pil", - return_dict: bool = True, - callback_on_step_end: Optional[ - Union[Callable[[int, int, Dict], None], PipelineCallback, MultiPipelineCallbacks] - ] = None, - callback_on_step_end_tensor_inputs: List[str] = ["latents"], - max_sequence_length: int = 224, ) -> Union[CogView3PipelineOutput, Tuple]: """ Function invoked when calling the pipeline for generation. @@ -372,13 +362,10 @@ class CogView3PlusPipeline(DiffusionPipeline): `tuple`. When returning a tuple, the first element is a list with the generated images. """ - if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)): - callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs - height = height or self.transformer.config.sample_size * self.vae_scale_factor width = width or self.transformer.config.sample_size * self.vae_scale_factor - original_size = original_size or (height, width) + original_size = (height, width) target_size = (height, width) # 1. Check inputs. Raise error if not correct @@ -402,7 +389,7 @@ class CogView3PlusPipeline(DiffusionPipeline): prompt_embeds, negative_prompt_embeds = self.encode_prompt( prompt, num_images_per_prompt=num_images_per_prompt, - max_sequence_length=max_sequence_length, + max_sequence_length=224, device=device, ) if self.do_classifier_free_guidance: @@ -430,7 +417,7 @@ class CogView3PlusPipeline(DiffusionPipeline): # 7. 
Prepare additional timestep conditions original_size = torch.tensor([original_size], dtype=prompt_embeds.dtype) target_size = torch.tensor([target_size], dtype=prompt_embeds.dtype) - crops_coords_top_left = torch.tensor([crops_coords_top_left], dtype=prompt_embeds.dtype) + crops_coords_top_left = torch.tensor([(0, 0)], dtype=prompt_embeds.dtype) if self.do_classifier_free_guidance: original_size = torch.cat([original_size, original_size]) @@ -487,33 +474,13 @@ class CogView3PlusPipeline(DiffusionPipeline): ) latents = latents.to(prompt_embeds.dtype) - # call the callback, if provided - if callback_on_step_end is not None: - callback_kwargs = {} - for k in callback_on_step_end_tensor_inputs: - callback_kwargs[k] = locals()[k] - callback_outputs = callback_on_step_end(self, i, t, callback_kwargs) - - latents = callback_outputs.pop("latents", latents) - prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds) - negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds) - if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): progress_bar.update() - if not output_type == "latent": - image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False, generator=None)[ - 0 - ] - else: - image = latents - - image = self.image_processor.postprocess(image, output_type=output_type) + image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False, generator=None)[0] + image = self.image_processor.postprocess(image, output_type="pil") # Offload all models self.maybe_free_model_hooks() - if not return_dict: - return (image,) - return CogView3PipelineOutput(images=image) \ No newline at end of file -- Gitee From 6f80df407ba4cba57ae3f4bcbee49f3ee4fab148 Mon Sep 17 00:00:00 2001 From: jiangmengyu Date: Fri, 27 Dec 2024 11:04:15 +0800 Subject: [PATCH 42/91] =?UTF-8?q?cogview3=E9=80=82=E9=85=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../cogview3plus/pipeline/pipeline_cogview3plus.py | 12 +++++++----- .../foundation/cogview3/inference_cogview3plus.py | 3 +-- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py index 33b01feae5..5cac1be1f1 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py @@ -273,8 +273,7 @@ class CogView3PlusPipeline(DiffusionPipeline): def __call__( self, prompt: Optional[Union[str, List[str]]] = None, - height: Optional[int] = None, - width: Optional[int] = None, + image_size: Tuple[int, int] = None, num_inference_steps: int = 50, guidance_scale: float = 5.0, num_images_per_prompt: int = 1, @@ -361,9 +360,12 @@ class CogView3PlusPipeline(DiffusionPipeline): [`~pipelines.cogview3.pipeline_cogview3plus.CogView3PipelineOutput`] if `return_dict` is True, otherwise a `tuple`. When returning a tuple, the first element is a list with the generated images. 
""" - - height = height or self.transformer.config.sample_size * self.vae_scale_factor - width = width or self.transformer.config.sample_size * self.vae_scale_factor + if image_size is None: + height = self.transformer.config.sample_size * self.vae_scale_factor + width = self.transformer.config.sample_size * self.vae_scale_factor + else: + height = image_size[0] + width = image_size[1] original_size = (height, width) target_size = (height, width) diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/inference_cogview3plus.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/inference_cogview3plus.py index 5c12695d6c..d24d3f29c4 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/inference_cogview3plus.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/inference_cogview3plus.py @@ -82,8 +82,7 @@ def infer(args): guidance_scale=args.guidance_scale, num_images_per_prompt=args.num_images_per_prompt, num_inference_steps=args.num_inference_steps, - width=args.width, - height=args.height, + image_size=(args.height, args.width), ).images[0] if i >= 2: -- Gitee From ee21249c88b30bf7f7173f5a95b63ab11ef33a66 Mon Sep 17 00:00:00 2001 From: jiangmengyu Date: Fri, 27 Dec 2024 11:09:48 +0800 Subject: [PATCH 43/91] =?UTF-8?q?cogview3=E9=80=82=E9=85=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../pipeline/pipeline_cogview3plus.py | 19 +++++-------------- 1 file changed, 5 insertions(+), 14 deletions(-) diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py index 5cac1be1f1..d90c00653f 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py @@ -204,23 +204,16 @@ class CogView3PlusPipeline(DiffusionPipeline): return prompt_embeds, negative_prompt_embeds # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents - def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None): + def prepare_latents(self, batch_size, num_channels_latents, image_size, dtype, device): + height = image_size[0] + width = image_size[1] shape = ( batch_size, num_channels_latents, int(height) // self.vae_scale_factor, int(width) // self.vae_scale_factor, ) - if isinstance(generator, list) and len(generator) != batch_size: - raise ValueError( - f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" - f" size of {batch_size}. Make sure the batch size matches the length of the generators." - ) - - if latents is None: - latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype) - else: - latents = latents.to(device) + latents = randn_tensor(shape, device=device, dtype=dtype) # scale the initial noise by the standard deviation required by the scheduler latents = latents * self.scheduler.init_noise_sigma @@ -406,11 +399,9 @@ class CogView3PlusPipeline(DiffusionPipeline): latents = self.prepare_latents( batch_size * num_images_per_prompt, latent_channels, - height, - width, + (height, width), prompt_embeds.dtype, device, - None, ) # 6. Prepare extra step kwargs. 
TODO: Logic should ideally just be moved out of the pipeline -- Gitee From 006c8fd91a68f198e8537b5265507dd7d6fd4eb7 Mon Sep 17 00:00:00 2001 From: jiangmengyu Date: Fri, 27 Dec 2024 11:12:18 +0800 Subject: [PATCH 44/91] =?UTF-8?q?cogview3=E9=80=82=E9=85=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py | 1 - .../cogview3plus/schedulers/scheduling_ddim_cogvideox.py | 4 ---- 2 files changed, 5 deletions(-) diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py index d90c00653f..a2c8d0f37f 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py @@ -404,7 +404,6 @@ class CogView3PlusPipeline(DiffusionPipeline): device, ) - # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline extra_step_kwargs = self.prepare_extra_step_kwargs(None, 0.0) # 7. Prepare additional timestep conditions diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/schedulers/scheduling_ddim_cogvideox.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/schedulers/scheduling_ddim_cogvideox.py index c2464cf51b..dbe9d4b17f 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/schedulers/scheduling_ddim_cogvideox.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/schedulers/scheduling_ddim_cogvideox.py @@ -258,10 +258,6 @@ class CogVideoXDDIMScheduler(SchedulerMixin, ConfigMixin): model_output: torch.Tensor, timestep: int, sample: torch.Tensor, - eta: float = 0.0, - use_clipped_model_output: bool = False, - generator=None, - variance_noise: Optional[torch.Tensor] = None, return_dict: bool = True, ) -> Union[DDIMSchedulerOutput, Tuple]: if self.num_inference_steps is None: -- Gitee From df5d6fb3c589bb87dfbdab100a319b26322f20b6 Mon Sep 17 00:00:00 2001 From: jiangmengyu Date: Fri, 27 Dec 2024 11:18:11 +0800 Subject: [PATCH 45/91] =?UTF-8?q?cogview3=E9=80=82=E9=85=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- MindIE/MindIE-Torch/built-in/foundation/cogview3/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/README.md b/MindIE/MindIE-Torch/built-in/foundation/cogview3/README.md index dfcf259a0b..495359c497 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/README.md +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/README.md @@ -145,7 +145,7 @@ https://huggingface.co/THUDM/CogView3-Plus-3B/tree/main/vae ### 3.2 单卡单prompt功能测试 设置权重路径 ```shell -model_path = '/data/CogView3B' +model_path='/data/CogView3B' ``` 执行命令: ```shell -- Gitee From 98ec2a2264bda14d3f8113ea22286fdc80a4c532 Mon Sep 17 00:00:00 2001 From: jiangmengyu Date: Fri, 27 Dec 2024 11:22:22 +0800 Subject: [PATCH 46/91] =?UTF-8?q?cogview3=E9=80=82=E9=85=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../pipeline/pipeline_cogview3plus.py | 20 +- .../cogview3plus/schedulers/__init__.py | 1 - .../schedulers/scheduling_dpm_cogvideox.py | 484 ------------------ 3 files changed, 4 insertions(+), 501 deletions(-) delete mode 100644 
MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/schedulers/scheduling_dpm_cogvideox.py diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py index a2c8d0f37f..0031849dc9 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py @@ -14,12 +14,11 @@ # limitations under the License. import inspect -from typing import Callable, Dict, List, Optional, Tuple, Union +from typing import List, Optional, Tuple, Union import torch from transformers import T5EncoderModel, T5Tokenizer -from diffusers.callbacks import MultiPipelineCallbacks, PipelineCallback from diffusers.image_processor import VaeImageProcessor from diffusers.pipelines.pipeline_utils import DiffusionPipeline from diffusers.utils import logging, replace_example_docstring @@ -27,7 +26,7 @@ from diffusers.utils.torch_utils import randn_tensor from ..vae import AutoencoderKL from ..models import CogView3PlusTransformer2DModel -from ..schedulers import CogVideoXDDIMScheduler, CogVideoXDPMScheduler +from ..schedulers import CogVideoXDDIMScheduler from .pipeline_output import CogView3PipelineOutput logger = logging.get_logger(__name__) # pylint: disable=invalid-name @@ -125,7 +124,7 @@ class CogView3PlusPipeline(DiffusionPipeline): text_encoder: T5EncoderModel, vae: AutoencoderKL, transformer: CogView3PlusTransformer2DModel, - scheduler: Union[CogVideoXDDIMScheduler, CogVideoXDPMScheduler], + scheduler: CogVideoXDDIMScheduler, ): super().__init__() @@ -452,18 +451,7 @@ class CogView3PlusPipeline(DiffusionPipeline): noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_text - noise_pred_uncond) # compute the previous noisy sample x_t -> x_t-1 - if not isinstance(self.scheduler, CogVideoXDPMScheduler): - latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0] - else: - latents, old_pred_original_sample = self.scheduler.step( - noise_pred, - old_pred_original_sample, - t, - timesteps[i - 1] if i > 0 else None, - latents, - **extra_step_kwargs, - return_dict=False, - ) + latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0] latents = latents.to(prompt_embeds.dtype) if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/schedulers/__init__.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/schedulers/__init__.py index 32d0c223e7..7a8f559a28 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/schedulers/__init__.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/schedulers/__init__.py @@ -1,3 +1,2 @@ from .scheduling_ddim_cogvideox import CogVideoXDDIMScheduler -from .scheduling_dpm_cogvideox import CogVideoXDPMScheduler from .scheduling_utils import SchedulerMixin \ No newline at end of file diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/schedulers/scheduling_dpm_cogvideox.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/schedulers/scheduling_dpm_cogvideox.py deleted file mode 100644 index 5486e7cb93..0000000000 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/schedulers/scheduling_dpm_cogvideox.py 
+++ /dev/null @@ -1,484 +0,0 @@ -# Copyright 2024 The CogVideoX team, Tsinghua University & ZhipuAI and The HuggingFace Team. -# All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# DISCLAIMER: This code is strongly influenced by https://github.com/pesser/pytorch_diffusion -# and https://github.com/hojonathanho/diffusion - -import math -from dataclasses import dataclass -from typing import List, Optional, Tuple, Union - -import numpy as np -import torch - -from diffusers.configuration_utils import ConfigMixin, register_to_config -from diffusers.utils import BaseOutput -from diffusers.utils.torch_utils import randn_tensor -from .scheduling_utils import KarrasDiffusionSchedulers, SchedulerMixin - - -@dataclass -# Copied from diffusers.schedulers.scheduling_ddpm.DDPMSchedulerOutput with DDPM->DDIM -class DDIMSchedulerOutput(BaseOutput): - """ - Output class for the scheduler's `step` function output. - - Args: - prev_sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)` for images): - Computed sample `(x_{t-1})` of previous timestep. `prev_sample` should be used as next model input in the - denoising loop. - pred_original_sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)` for images): - The predicted denoised sample `(x_{0})` based on the model output from the current timestep. - `pred_original_sample` can be used to preview progress or for guidance. - """ - - prev_sample: torch.Tensor - pred_original_sample: Optional[torch.Tensor] = None - - -# Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar -def betas_for_alpha_bar( - num_diffusion_timesteps, - max_beta=0.999, - alpha_transform_type="cosine", -): - """ - Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of - (1-beta) over time from t = [0,1]. - - Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up - to that part of the diffusion process. - - - Args: - num_diffusion_timesteps (`int`): the number of betas to produce. - max_beta (`float`): the maximum beta to use; use values lower than 1 to - prevent singularities. - alpha_transform_type (`str`, *optional*, default to `cosine`): the type of noise schedule for alpha_bar. 
- Choose from `cosine` or `exp` - - Returns: - betas (`np.ndarray`): the betas used by the scheduler to step the model outputs - """ - if alpha_transform_type == "cosine": - - def alpha_bar_fn(t): - return math.cos((t + 0.008) / 1.008 * math.pi / 2) ** 2 - - elif alpha_transform_type == "exp": - - def alpha_bar_fn(t): - return math.exp(t * -12.0) - - else: - raise ValueError(f"Unsupported alpha_transform_type: {alpha_transform_type}") - - betas = [] - for i in range(num_diffusion_timesteps): - t1 = i / num_diffusion_timesteps - t2 = (i + 1) / num_diffusion_timesteps - betas.append(min(1 - alpha_bar_fn(t2) / alpha_bar_fn(t1), max_beta)) - return torch.tensor(betas, dtype=torch.float32) - - -def rescale_zero_terminal_snr(alphas_cumprod): - """ - - Args: - betas (`torch.Tensor`): - the betas that the scheduler is being initialized with. - - Returns: - `torch.Tensor`: rescaled betas with zero terminal SNR - """ - - alphas_bar_sqrt = alphas_cumprod.sqrt() - - # Store old values. - alphas_bar_sqrt_0 = alphas_bar_sqrt[0].clone() - alphas_bar_sqrt_T = alphas_bar_sqrt[-1].clone() - - # Shift so the last timestep is zero. - alphas_bar_sqrt -= alphas_bar_sqrt_T - - # Scale so the first timestep is back to the old value. - alphas_bar_sqrt *= alphas_bar_sqrt_0 / (alphas_bar_sqrt_0 - alphas_bar_sqrt_T) - - # Convert alphas_bar_sqrt to betas - alphas_bar = alphas_bar_sqrt**2 # Revert sqrt - - return alphas_bar - - -class CogVideoXDPMScheduler(SchedulerMixin, ConfigMixin): - """ - `DDIMScheduler` extends the denoising procedure introduced in denoising diffusion probabilistic models (DDPMs) with - non-Markovian guidance. - - This model inherits from [`SchedulerMixin`] and [`ConfigMixin`]. Check the superclass documentation for the generic - methods the library implements for all schedulers such as loading and saving. - - Args: - num_train_timesteps (`int`, defaults to 1000): - The number of diffusion steps to train the model. - beta_start (`float`, defaults to 0.0001): - The starting `beta` value of inference. - beta_end (`float`, defaults to 0.02): - The final `beta` value. - beta_schedule (`str`, defaults to `"linear"`): - The beta schedule, a mapping from a beta range to a sequence of betas for stepping the model. Choose from - `linear`, `scaled_linear`, or `squaredcos_cap_v2`. - trained_betas (`np.ndarray`, *optional*): - Pass an array of betas directly to the constructor to bypass `beta_start` and `beta_end`. - clip_sample (`bool`, defaults to `True`): - Clip the predicted sample for numerical stability. - clip_sample_range (`float`, defaults to 1.0): - The maximum magnitude for sample clipping. Valid only when `clip_sample=True`. - set_alpha_to_one (`bool`, defaults to `True`): - Each diffusion step uses the alphas product value at that step and at the previous one. For the final step - there is no previous alpha. When this option is `True` the previous alpha product is fixed to `1`, - otherwise it uses the alpha value at step 0. - steps_offset (`int`, defaults to 0): - An offset added to the inference steps, as required by some model families. - prediction_type (`str`, defaults to `epsilon`, *optional*): - Prediction type of the scheduler function; can be `epsilon` (predicts the noise of the diffusion process), - `sample` (directly predicts the noisy sample`) or `v_prediction` (see section 2.4 of [Imagen - Video](https://imagen.research.google/video/paper.pdf) paper). - thresholding (`bool`, defaults to `False`): - Whether to use the "dynamic thresholding" method. 
This is unsuitable for latent-space diffusion models such - as Stable Diffusion. - dynamic_thresholding_ratio (`float`, defaults to 0.995): - The ratio for the dynamic thresholding method. Valid only when `thresholding=True`. - sample_max_value (`float`, defaults to 1.0): - The threshold value for dynamic thresholding. Valid only when `thresholding=True`. - timestep_spacing (`str`, defaults to `"leading"`): - The way the timesteps should be scaled. Refer to Table 2 of the [Common Diffusion Noise Schedules and - Sample Steps are Flawed](https://huggingface.co/papers/2305.08891) for more information. - rescale_betas_zero_snr (`bool`, defaults to `False`): - Whether to rescale the betas to have zero terminal SNR. This enables the model to generate very bright and - dark samples instead of limiting it to samples with medium brightness. Loosely related to - [`--offset_noise`](https://github.com/huggingface/diffusers/blob/74fd735eb073eb1d774b1ab4154a0876eb82f055/examples/dreambooth/train_dreambooth.py#L506). - """ - - _compatibles = [e.name for e in KarrasDiffusionSchedulers] - order = 1 - - @register_to_config - def __init__( - self, - num_train_timesteps: int = 1000, - beta_start: float = 0.00085, - beta_end: float = 0.0120, - beta_schedule: str = "scaled_linear", - trained_betas: Optional[Union[np.ndarray, List[float]]] = None, - clip_sample: bool = True, - set_alpha_to_one: bool = True, - steps_offset: int = 0, - prediction_type: str = "epsilon", - clip_sample_range: float = 1.0, - sample_max_value: float = 1.0, - timestep_spacing: str = "leading", - rescale_betas_zero_snr: bool = False, - snr_shift_scale: float = 3.0, - ): - if trained_betas is not None: - self.betas = torch.tensor(trained_betas, dtype=torch.float32) - elif beta_schedule == "linear": - self.betas = torch.linspace(beta_start, beta_end, num_train_timesteps, dtype=torch.float32) - elif beta_schedule == "scaled_linear": - # this schedule is very specific to the latent diffusion model. - self.betas = torch.linspace(beta_start**0.5, beta_end**0.5, num_train_timesteps, dtype=torch.float64) ** 2 - elif beta_schedule == "squaredcos_cap_v2": - # Glide cosine schedule - self.betas = betas_for_alpha_bar(num_train_timesteps) - else: - raise NotImplementedError(f"{beta_schedule} is not implemented for {self.__class__}") - - self.alphas = 1.0 - self.betas - self.alphas_cumprod = torch.cumprod(self.alphas, dim=0) - - # Modify: SNR shift following SD3 - self.alphas_cumprod = self.alphas_cumprod / (snr_shift_scale + (1 - snr_shift_scale) * self.alphas_cumprod) - - # Rescale for zero SNR - if rescale_betas_zero_snr: - self.alphas_cumprod = rescale_zero_terminal_snr(self.alphas_cumprod) - - # At every step in ddim, we are looking into the previous alphas_cumprod - # For the final step, there is no previous alphas_cumprod because we are already at 0 - # `set_alpha_to_one` decides whether we set this parameter simply to one or - # whether we use the final alpha of the "non-previous" one. 
- self.final_alpha_cumprod = torch.tensor(1.0) if set_alpha_to_one else self.alphas_cumprod[0] - - # standard deviation of the initial noise distribution - self.init_noise_sigma = 1.0 - - # setable values - self.num_inference_steps = None - self.timesteps = torch.from_numpy(np.arange(0, num_train_timesteps)[::-1].copy().astype(np.int64)) - - def _get_variance(self, timestep, prev_timestep): - alpha_prod_t = self.alphas_cumprod[timestep] - alpha_prod_t_prev = self.alphas_cumprod[prev_timestep] if prev_timestep >= 0 else self.final_alpha_cumprod - beta_prod_t = 1 - alpha_prod_t - beta_prod_t_prev = 1 - alpha_prod_t_prev - - variance = (beta_prod_t_prev / beta_prod_t) * (1 - alpha_prod_t / alpha_prod_t_prev) - - return variance - - def scale_model_input(self, sample: torch.Tensor, timestep: Optional[int] = None) -> torch.Tensor: - """ - Ensures interchangeability with schedulers that need to scale the denoising model input depending on the - current timestep. - - Args: - sample (`torch.Tensor`): - The input sample. - timestep (`int`, *optional*): - The current timestep in the diffusion chain. - - Returns: - `torch.Tensor`: - A scaled input sample. - """ - return sample - - def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.device] = None): - """ - Sets the discrete timesteps used for the diffusion chain (to be run before inference). - - Args: - num_inference_steps (`int`): - The number of diffusion steps used when generating samples with a pre-trained model. - """ - - if num_inference_steps > self.config.num_train_timesteps: - raise ValueError( - f"`num_inference_steps`: {num_inference_steps} cannot be larger than `self.config.train_timesteps`:" - f" {self.config.num_train_timesteps} as the unet model trained with this scheduler can only handle" - f" maximal {self.config.num_train_timesteps} timesteps." - ) - - self.num_inference_steps = num_inference_steps - - # "linspace", "leading", "trailing" corresponds to annotation of Table 2. of https://arxiv.org/abs/2305.08891 - if self.config.timestep_spacing == "linspace": - timesteps = ( - np.linspace(0, self.config.num_train_timesteps - 1, num_inference_steps) - .round()[::-1] - .copy() - .astype(np.int64) - ) - elif self.config.timestep_spacing == "leading": - step_ratio = self.config.num_train_timesteps // self.num_inference_steps - # creates integer timesteps by multiplying by ratio - # casting to int to avoid issues when num_inference_step is power of 3 - timesteps = (np.arange(0, num_inference_steps) * step_ratio).round()[::-1].copy().astype(np.int64) - timesteps += self.config.steps_offset - elif self.config.timestep_spacing == "trailing": - step_ratio = self.config.num_train_timesteps / self.num_inference_steps - # creates integer timesteps by multiplying by ratio - # casting to int to avoid issues when num_inference_step is power of 3 - timesteps = np.round(np.arange(self.config.num_train_timesteps, 0, -step_ratio)).astype(np.int64) - timesteps -= 1 - else: - raise ValueError( - f"{self.config.timestep_spacing} is not supported. Please make sure to choose one of 'leading' or 'trailing'." 
- ) - - self.timesteps = torch.from_numpy(timesteps).to(device) - - def get_variables(self, alpha_prod_t, alpha_prod_t_prev, alpha_prod_t_back=None): - lamb = ((alpha_prod_t / (1 - alpha_prod_t)) ** 0.5).log() - lamb_next = ((alpha_prod_t_prev / (1 - alpha_prod_t_prev)) ** 0.5).log() - h = lamb_next - lamb - - if alpha_prod_t_back is not None: - lamb_previous = ((alpha_prod_t_back / (1 - alpha_prod_t_back)) ** 0.5).log() - h_last = lamb - lamb_previous - r = h_last / h - return h, r, lamb, lamb_next - else: - return h, None, lamb, lamb_next - - def get_mult(self, h, r, alpha_prod_t, alpha_prod_t_prev, alpha_prod_t_back): - mult1 = ((1 - alpha_prod_t_prev) / (1 - alpha_prod_t)) ** 0.5 * (-h).exp() - mult2 = (-2 * h).expm1() * alpha_prod_t_prev**0.5 - - if alpha_prod_t_back is not None: - mult3 = 1 + 1 / (2 * r) - mult4 = 1 / (2 * r) - return mult1, mult2, mult3, mult4 - else: - return mult1, mult2 - - def step( - self, - model_output: torch.Tensor, - old_pred_original_sample: torch.Tensor, - timestep: int, - timestep_back: int, - sample: torch.Tensor, - eta: float = 0.0, - use_clipped_model_output: bool = False, - generator=None, - variance_noise: Optional[torch.Tensor] = None, - return_dict: bool = False, - ) -> Union[DDIMSchedulerOutput, Tuple]: - """ - Predict the sample from the previous timestep by reversing the SDE. This function propagates the diffusion - process from the learned model outputs (most often the predicted noise). - - Args: - model_output (`torch.Tensor`): - The direct output from learned diffusion model. - timestep (`float`): - The current discrete timestep in the diffusion chain. - sample (`torch.Tensor`): - A current instance of a sample created by the diffusion process. - eta (`float`): - The weight of noise for added noise in diffusion step. - use_clipped_model_output (`bool`, defaults to `False`): - If `True`, computes "corrected" `model_output` from the clipped predicted original sample. Necessary - because predicted original sample is clipped to [-1, 1] when `self.config.clip_sample` is `True`. If no - clipping has happened, "corrected" `model_output` would coincide with the one provided as input and - `use_clipped_model_output` has no effect. - generator (`torch.Generator`, *optional*): - A random number generator. - variance_noise (`torch.Tensor`): - Alternative to generating noise with `generator` by directly providing the noise for the variance - itself. Useful for methods such as [`CycleDiffusion`]. - return_dict (`bool`, *optional*, defaults to `True`): - Whether or not to return a [`~schedulers.scheduling_ddim.DDIMSchedulerOutput`] or `tuple`. - - Returns: - [`~schedulers.scheduling_ddim.DDIMSchedulerOutput`] or `tuple`: - If return_dict is `True`, [`~schedulers.scheduling_ddim.DDIMSchedulerOutput`] is returned, otherwise a - tuple is returned where the first element is the sample tensor. - - """ - if self.num_inference_steps is None: - raise ValueError( - "Number of inference steps is 'None', you need to run 'set_timesteps' after creating the scheduler" - ) - - # See formulas (12) and (16) of DDIM paper https://arxiv.org/pdf/2010.02502.pdf - # Ideally, read DDIM paper in-detail understanding - - # Notation ( -> - # - pred_noise_t -> e_theta(x_t, t) - # - pred_original_sample -> f_theta(x_t, t) or x_0 - # - std_dev_t -> sigma_t - # - eta -> η - # - pred_sample_direction -> "direction pointing to x_t" - # - pred_prev_sample -> "x_t-1" - - # 1. 
get previous step value (=t-1) - prev_timestep = timestep - self.config.num_train_timesteps // self.num_inference_steps - - # 2. compute alphas, betas - alpha_prod_t = self.alphas_cumprod[timestep] - alpha_prod_t_prev = self.alphas_cumprod[prev_timestep] if prev_timestep >= 0 else self.final_alpha_cumprod - alpha_prod_t_back = self.alphas_cumprod[timestep_back] if timestep_back is not None else None - - beta_prod_t = 1 - alpha_prod_t - - # 3. compute predicted original sample from predicted noise also called - # "predicted x_0" of formula (12) from https://arxiv.org/pdf/2010.02502.pdf - # To make style tests pass, commented out `pred_epsilon` as it is an unused variable - if self.config.prediction_type == "epsilon": - pred_original_sample = (sample - beta_prod_t ** (0.5) * model_output) / alpha_prod_t ** (0.5) - elif self.config.prediction_type == "sample": - pred_original_sample = model_output - elif self.config.prediction_type == "v_prediction": - pred_original_sample = (alpha_prod_t**0.5) * sample - (beta_prod_t**0.5) * model_output - else: - raise ValueError( - f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, `sample`, or" - " `v_prediction`" - ) - - h, r, lamb, lamb_next = self.get_variables(alpha_prod_t, alpha_prod_t_prev, alpha_prod_t_back) - mult = list(self.get_mult(h, r, alpha_prod_t, alpha_prod_t_prev, alpha_prod_t_back)) - mult_noise = (1 - alpha_prod_t_prev) ** 0.5 * (1 - (-2 * h).exp()) ** 0.5 - - noise = randn_tensor(sample.shape, generator=generator, device=sample.device, dtype=sample.dtype) - prev_sample = mult[0] * sample - mult[1] * pred_original_sample + mult_noise * noise - - if old_pred_original_sample is None or prev_timestep < 0: - # Save a network evaluation if all noise levels are 0 or on the first step - return prev_sample, pred_original_sample - else: - denoised_d = mult[2] * pred_original_sample - mult[3] * old_pred_original_sample - noise = randn_tensor(sample.shape, generator=generator, device=sample.device, dtype=sample.dtype) - x_advanced = mult[0] * sample - mult[1] * denoised_d + mult_noise * noise - - prev_sample = x_advanced - - if not return_dict: - return (prev_sample, pred_original_sample) - - return DDIMSchedulerOutput(prev_sample=prev_sample, pred_original_sample=pred_original_sample) - - # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler.add_noise - def add_noise( - self, - original_samples: torch.Tensor, - noise: torch.Tensor, - timesteps: torch.IntTensor, - ) -> torch.Tensor: - # Make sure alphas_cumprod and timestep have same device and dtype as original_samples - # Move the self.alphas_cumprod to device to avoid redundant CPU to GPU data movement - # for the subsequent add_noise calls - self.alphas_cumprod = self.alphas_cumprod.to(device=original_samples.device) - alphas_cumprod = self.alphas_cumprod.to(dtype=original_samples.dtype) - timesteps = timesteps.to(original_samples.device) - - sqrt_alpha_prod = alphas_cumprod[timesteps] ** 0.5 - sqrt_alpha_prod = sqrt_alpha_prod.flatten() - while len(sqrt_alpha_prod.shape) < len(original_samples.shape): - sqrt_alpha_prod = sqrt_alpha_prod.unsqueeze(-1) - - sqrt_one_minus_alpha_prod = (1 - alphas_cumprod[timesteps]) ** 0.5 - sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.flatten() - while len(sqrt_one_minus_alpha_prod.shape) < len(original_samples.shape): - sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.unsqueeze(-1) - - noisy_samples = sqrt_alpha_prod * original_samples + sqrt_one_minus_alpha_prod * noise - return noisy_samples - - # 
Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler.get_velocity - def get_velocity(self, sample: torch.Tensor, noise: torch.Tensor, timesteps: torch.IntTensor) -> torch.Tensor: - # Make sure alphas_cumprod and timestep have same device and dtype as sample - self.alphas_cumprod = self.alphas_cumprod.to(device=sample.device) - alphas_cumprod = self.alphas_cumprod.to(dtype=sample.dtype) - timesteps = timesteps.to(sample.device) - - sqrt_alpha_prod = alphas_cumprod[timesteps] ** 0.5 - sqrt_alpha_prod = sqrt_alpha_prod.flatten() - while len(sqrt_alpha_prod.shape) < len(sample.shape): - sqrt_alpha_prod = sqrt_alpha_prod.unsqueeze(-1) - - sqrt_one_minus_alpha_prod = (1 - alphas_cumprod[timesteps]) ** 0.5 - sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.flatten() - while len(sqrt_one_minus_alpha_prod.shape) < len(sample.shape): - sqrt_one_minus_alpha_prod = sqrt_one_minus_alpha_prod.unsqueeze(-1) - - velocity = sqrt_alpha_prod * noise - sqrt_one_minus_alpha_prod * sample - return velocity - - def __len__(self): - return self.config.num_train_timesteps \ No newline at end of file -- Gitee From d572adc2c2ce7a10638cb479bdccbc9a523d17cf Mon Sep 17 00:00:00 2001 From: jiangmengyu Date: Fri, 27 Dec 2024 11:23:19 +0800 Subject: [PATCH 47/91] =?UTF-8?q?cogview3=E9=80=82=E9=85=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../built-in/foundation/cogview3/cogview3plus/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/__init__.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/__init__.py index 11a5548362..304ed0a899 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/__init__.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/__init__.py @@ -17,5 +17,5 @@ from .pipeline import CogView3PlusPipeline, DiffusionPipeline from .vae import AutoencoderKL -from .schedulers import CogVideoXDDIMScheduler, CogVideoXDPMScheduler, SchedulerMixin +from .schedulers import CogVideoXDDIMScheduler, SchedulerMixin from .models import CogView3PlusTransformer2DModel, ModelMixin \ No newline at end of file -- Gitee From c9d573a3307b1b5e62280cdcbab922c03ddac0c2 Mon Sep 17 00:00:00 2001 From: jiangmengyu Date: Fri, 27 Dec 2024 11:59:17 +0800 Subject: [PATCH 48/91] =?UTF-8?q?cogview3=E9=80=82=E9=85=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- MindIE/MindIE-Torch/built-in/foundation/cogview3/README.md | 2 +- .../built-in/foundation/cogview3/cogview3plus/__init__.py | 1 - .../foundation/cogview3/cogview3plus/layers/embeddings.py | 5 ++--- .../foundation/cogview3/cogview3plus/layers/linear.py | 2 +- .../foundation/cogview3/cogview3plus/layers/normalization.py | 2 +- .../foundation/cogview3/cogview3plus/models/activations.py | 2 +- .../foundation/cogview3/cogview3plus/models/attention.py | 3 ++- .../cogview3/cogview3plus/models/attention_processor.py | 3 ++- .../cogview3/cogview3plus/models/model_load_utils.py | 2 +- .../cogview3/cogview3plus/models/modeling_utils.py | 2 +- .../cogview3/cogview3plus/models/transformer_cogview3plus.py | 4 +--- .../cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py | 2 +- .../cogview3plus/schedulers/scheduling_ddim_cogvideox.py | 5 +---- .../cogview3/cogview3plus/schedulers/scheduling_utils.py | 3 ++- .../foundation/cogview3/cogview3plus/vae/autoencoder_kl.py | 3 ++- .../built-in/foundation/cogview3/cogview3plus/vae/vae.py | 3 ++- 
16 files changed, 21 insertions(+), 23 deletions(-) diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/README.md b/MindIE/MindIE-Torch/built-in/foundation/cogview3/README.md index 495359c497..16592703d8 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/README.md +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/README.md @@ -78,7 +78,7 @@ https://huggingface.co/THUDM/CogView3-Plus-3B/tree/main ```shell { "_class_name": "CogView3PlusPipeline", - "_diffusers_version": "0.31.0.dev0", + "_diffusers_version": "0.31.0", "scheduler": [ "cogview3plus", "CogVideoXDDIMScheduler" diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/__init__.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/__init__.py index 304ed0a899..8cfcd60a5b 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/__init__.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/__init__.py @@ -14,7 +14,6 @@ # See the License for the specific language governing permissions and # limitations under the License. - from .pipeline import CogView3PlusPipeline, DiffusionPipeline from .vae import AutoencoderKL from .schedulers import CogVideoXDDIMScheduler, SchedulerMixin diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/layers/embeddings.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/layers/embeddings.py index 1763b3b910..72418f08b3 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/layers/embeddings.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/layers/embeddings.py @@ -4,20 +4,19 @@ # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # -# http://www.apache.org/licenses/LICENSE-2.0 +# https://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. + import math from typing import Optional -import numpy as np import torch from torch import nn -from diffusers.utils import deprecate from diffusers.models.activations import FP32SiLU, get_activation diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/layers/linear.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/layers/linear.py index 805c2d2b34..5f27384302 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/layers/linear.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/layers/linear.py @@ -6,7 +6,7 @@ # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at # -# http://www.apache.org/licenses/LICENSE-2.0 +# httpa://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/layers/normalization.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/layers/normalization.py index 2ead694b9f..e526184632 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/layers/normalization.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/layers/normalization.py @@ -5,7 +5,7 @@ # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # -# http://www.apache.org/licenses/LICENSE-2.0 +# https://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/activations.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/activations.py index 48fe8ed17d..fc68971806 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/activations.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/activations.py @@ -5,7 +5,7 @@ # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # -# http://www.apache.org/licenses/LICENSE-2.0 +# https://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/attention.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/attention.py index ac85e70e05..a7a559ff2f 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/attention.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/attention.py @@ -4,13 +4,14 @@ # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # -# http://www.apache.org/licenses/LICENSE-2.0 +# https://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. + from typing import Optional import torch diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/attention_processor.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/attention_processor.py index d20baee1ec..1f6b12f1aa 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/attention_processor.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/attention_processor.py @@ -4,13 +4,14 @@ # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at # -# http://www.apache.org/licenses/LICENSE-2.0 +# https://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. + import inspect from typing import Optional diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/model_load_utils.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/model_load_utils.py index f6d3b20570..3cffbd6432 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/model_load_utils.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/model_load_utils.py @@ -6,7 +6,7 @@ # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # -# http://www.apache.org/licenses/LICENSE-2.0 +# https://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/modeling_utils.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/modeling_utils.py index f83eff493f..e71d8577d5 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/modeling_utils.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/modeling_utils.py @@ -6,7 +6,7 @@ # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # -# http://www.apache.org/licenses/LICENSE-2.0 +# https://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/transformer_cogview3plus.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/transformer_cogview3plus.py index 6df2dc361e..cfeb27c109 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/transformer_cogview3plus.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/transformer_cogview3plus.py @@ -4,7 +4,7 @@ # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # -# http://www.apache.org/licenses/LICENSE-2.0 +# https://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, @@ -12,9 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
- from typing import Any, Dict, Union -from dataclasses import dataclass import torch import torch.nn as nn diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py index 0031849dc9..7e89d4c370 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py @@ -5,7 +5,7 @@ # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # -# http://www.apache.org/licenses/LICENSE-2.0 +# https://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/schedulers/scheduling_ddim_cogvideox.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/schedulers/scheduling_ddim_cogvideox.py index dbe9d4b17f..26ae48a2c8 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/schedulers/scheduling_ddim_cogvideox.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/schedulers/scheduling_ddim_cogvideox.py @@ -5,7 +5,7 @@ # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # -# http://www.apache.org/licenses/LICENSE-2.0 +# https://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, @@ -13,9 +13,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -# DISCLAIMER: This code is strongly influenced by https://github.com/pesser/pytorch_diffusion -# and https://github.com/hojonathanho/diffusion - import math from dataclasses import dataclass from typing import List, Optional, Tuple, Union diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/schedulers/scheduling_utils.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/schedulers/scheduling_utils.py index 7a72eb3d06..3f1fb5bc32 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/schedulers/scheduling_utils.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/schedulers/scheduling_utils.py @@ -4,13 +4,14 @@ # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # -# http://www.apache.org/licenses/LICENSE-2.0 +# https://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
+ import importlib import os from dataclasses import dataclass diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/vae/autoencoder_kl.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/vae/autoencoder_kl.py index bbe9bddf3e..3f524408c9 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/vae/autoencoder_kl.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/vae/autoencoder_kl.py @@ -4,13 +4,14 @@ # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # -# http://www.apache.org/licenses/LICENSE-2.0 +# https://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. + from typing import Dict, Optional, Tuple, Union import torch diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/vae/vae.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/vae/vae.py index 2323be5a78..c1abff7097 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/vae/vae.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/vae/vae.py @@ -4,13 +4,14 @@ # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # -# http://www.apache.org/licenses/LICENSE-2.0 +# https://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
+ from dataclasses import dataclass from typing import Optional, Tuple -- Gitee From 8e94c1164672e4f5e71c03bf76e01fe609c5fd2b Mon Sep 17 00:00:00 2001 From: jiangmengyu Date: Fri, 27 Dec 2024 12:07:11 +0800 Subject: [PATCH 49/91] =?UTF-8?q?cogview3=E9=80=82=E9=85=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../cogview3plus/layers/normalization.py | 30 +------------ .../cogview3plus/models/modeling_utils.py | 42 ++----------------- 2 files changed, 4 insertions(+), 68 deletions(-) diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/layers/normalization.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/layers/normalization.py index e526184632..1ec0a5b15c 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/layers/normalization.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/layers/normalization.py @@ -21,34 +21,6 @@ import torch import torch.nn as nn import torch.nn.functional as F -from diffusers.utils import is_torch_version - - -if is_torch_version(">=", "2.1.0"): - LayerNorm = nn.LayerNorm -else: - # Has optional bias parameter compared to torch layer norm - class LayerNorm(nn.Module): - def __init__(self, dim, eps: float = 1e-5, elementwise_affine: bool = True, bias: bool = True): - super().__init__() - - self.eps = eps - - if isinstance(dim, numbers.Integral): - dim = (dim,) - - self.dim = torch.Size(dim) - - if elementwise_affine: - self.weight = nn.Parameter(torch.ones(dim)) - self.bias = nn.Parameter(torch.zeros(dim)) if bias else None - else: - self.weight = None - self.bias = None - - def forward(self, x): - return F.layer_norm(x, self.dim, self.weight, self.bias, self.eps) - class RMSNorm(nn.Module): def __init__(self, dim, eps: float, elementwise_affine: bool = True, bias: bool = False): @@ -191,7 +163,7 @@ class AdaLayerNormContinuous(nn.Module): self.silu = nn.SiLU() self.linear = nn.Linear(conditioning_embedding_dim, embedding_dim * 2, bias=bias) if norm_type == "layer_norm": - self.norm = LayerNorm(embedding_dim, eps, elementwise_affine, bias) + self.norm = nn.LayerNorm(embedding_dim, eps, elementwise_affine, bias) elif norm_type == "rms_norm": self.norm = RMSNorm(embedding_dim, eps, elementwise_affine) else: diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/modeling_utils.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/modeling_utils.py index e71d8577d5..252c758863 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/modeling_utils.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/modeling_utils.py @@ -22,24 +22,19 @@ import os import re from collections import OrderedDict from functools import partial, wraps -from pathlib import Path from typing import Any, Callable, List, Optional, Tuple, Union -import safetensors import torch -from huggingface_hub import create_repo, split_torch_state_dict_into_shards from huggingface_hub.utils import validate_hf_hub_args from torch import Tensor, nn from diffusers import __version__ -from diffusers.quantizers import DiffusersAutoQuantizer, DiffusersQuantizer +from diffusers.quantizers import DiffusersAutoQuantizer from diffusers.quantizers.quantization_config import QuantizationMethod from diffusers.utils import ( CONFIG_NAME, FLAX_WEIGHTS_NAME, - SAFE_WEIGHTS_INDEX_NAME, SAFETENSORS_WEIGHTS_NAME, - WEIGHTS_INDEX_NAME, WEIGHTS_NAME, _add_variant, _get_checkpoint_shard_files, @@ -48,16 
+43,10 @@ from diffusers.utils import ( is_accelerate_available, is_bitsandbytes_available, is_bitsandbytes_version, - is_torch_version, logging, ) -from diffusers.utils.hub_utils import ( - PushToHubMixin, - load_or_create_model_card, - populate_model_card, -) +from diffusers.utils.hub_utils import PushToHubMixin from diffusers.models.model_loading_utils import ( - _determine_device_map, _fetch_index_file, _fetch_index_file_legacy, _load_state_dict_into_model, @@ -66,18 +55,11 @@ from diffusers.models.model_loading_utils import ( load_state_dict, ) -from .model_load_utils import load_state_dict_sd - logger = logging.get_logger(__name__) -_REGEX_SHARD = re.compile(r"(.*?)-\d{5}-of-\d{5}") - -if is_torch_version(">=", "1.9.0"): - _LOW_CPU_MEM_USAGE_DEFAULT = True -else: - _LOW_CPU_MEM_USAGE_DEFAULT = False +_LOW_CPU_MEM_USAGE_DEFAULT = True if is_accelerate_available(): @@ -472,19 +454,6 @@ class ModelMixin(torch.nn.Module, PushToHubMixin): " `device_map=None`. You can install accelerate with `pip install accelerate`." ) - # Check if we can handle device_map and dispatching the weights - if device_map is not None and not is_torch_version(">=", "1.9.0"): - raise NotImplementedError( - "Loading and dispatching requires torch >= 1.9.0. Please either update your PyTorch version or set" - " `device_map=None`." - ) - - if low_cpu_mem_usage is True and not is_torch_version(">=", "1.9.0"): - raise NotImplementedError( - "Low memory initialization requires torch >= 1.9.0. Please either update your PyTorch version or set" - " `low_cpu_mem_usage=False`." - ) - if low_cpu_mem_usage is False and device_map is not None: raise ValueError( f"You cannot set `low_cpu_mem_usage` to `False` while using device_map={device_map} for loading and" @@ -516,11 +485,6 @@ class ModelMixin(torch.nn.Module, PushToHubMixin): elif not low_cpu_mem_usage: raise ValueError("Passing along a `device_map` requires `low_cpu_mem_usage=True`") - if low_cpu_mem_usage: - if device_map is not None and not is_torch_version(">=", "1.10"): - # The max memory utils require PyTorch >= 1.10 to have torch.cuda.mem_get_info. 
- raise ValueError("`low_cpu_mem_usage` and `device_map` require PyTorch >= 1.10.") - # Load config if we don't provide a configuration config_path = pretrained_model_name_or_path -- Gitee From 78ad303e55cf5f605f83a77785d5fc10cf7caecd Mon Sep 17 00:00:00 2001 From: jiangmengyu Date: Fri, 27 Dec 2024 12:09:15 +0800 Subject: [PATCH 50/91] =?UTF-8?q?cogview3=E9=80=82=E9=85=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../pipeline/pipeline_cogview3plus.py | 100 ------------------ 1 file changed, 100 deletions(-) diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py index 7e89d4c370..82b1742f3b 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py @@ -21,7 +21,6 @@ from transformers import T5EncoderModel, T5Tokenizer from diffusers.image_processor import VaeImageProcessor from diffusers.pipelines.pipeline_utils import DiffusionPipeline -from diffusers.utils import logging, replace_example_docstring from diffusers.utils.torch_utils import randn_tensor from ..vae import AutoencoderKL @@ -32,22 +31,6 @@ from .pipeline_output import CogView3PipelineOutput logger = logging.get_logger(__name__) # pylint: disable=invalid-name -EXAMPLE_DOC_STRING = """ - Examples: - ```python - >>> import torch - >>> from diffusers import CogView3PlusPipeline - - >>> pipe = CogView3PlusPipeline.from_pretrained("THUDM/CogView3-Plus-3B", torch_dtype=torch.bfloat16) - >>> pipe.to("cuda") - - >>> prompt = "A photo of an astronaut riding a horse on mars" - >>> image = pipe(prompt).images[0] - >>> image.save("output.png") - ``` -""" - - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps def retrieve_timesteps( scheduler, @@ -261,7 +244,6 @@ class CogView3PlusPipeline(DiffusionPipeline): return self._interrupt @torch.no_grad() - @replace_example_docstring(EXAMPLE_DOC_STRING) def __call__( self, prompt: Optional[Union[str, List[str]]] = None, @@ -270,88 +252,6 @@ class CogView3PlusPipeline(DiffusionPipeline): guidance_scale: float = 5.0, num_images_per_prompt: int = 1, ) -> Union[CogView3PipelineOutput, Tuple]: - """ - Function invoked when calling the pipeline for generation. - - Args: - prompt (`str` or `List[str]`, *optional*): - The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`. - negative_prompt (`str` or `List[str]`, *optional*): - The prompt or prompts not to guide the image generation. If not defined, one has to pass - `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is - less than `1`). - height (`int`, *optional*, defaults to self.transformer.config.sample_size * self.vae_scale_factor): - The height in pixels of the generated image. If not provided, it is set to 1024. - width (`int`, *optional*, defaults to self.transformer.config.sample_size * self.vae_scale_factor): - The width in pixels of the generated image. If not provided it is set to 1024. - num_inference_steps (`int`, *optional*, defaults to `50`): - The number of denoising steps. More denoising steps usually lead to a higher quality image at the - expense of slower inference. 
- timesteps (`List[int]`, *optional*): - Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument - in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is - passed will be used. Must be in descending order. - guidance_scale (`float`, *optional*, defaults to `5.0`): - Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). - `guidance_scale` is defined as `w` of equation 2. of [Imagen - Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > - 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, - usually at the expense of lower image quality. - num_images_per_prompt (`int`, *optional*, defaults to `1`): - The number of images to generate per prompt. - generator (`torch.Generator` or `List[torch.Generator]`, *optional*): - One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html) - to make generation deterministic. - latents (`torch.FloatTensor`, *optional*): - Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image - generation. Can be used to tweak the same generation with different prompts. If not provided, a latents - tensor will ge generated by sampling using the supplied random `generator`. - prompt_embeds (`torch.FloatTensor`, *optional*): - Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not - provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`torch.FloatTensor`, *optional*): - Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt - weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input - argument. - original_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)): - If `original_size` is not the same as `target_size` the image will appear to be down- or upsampled. - `original_size` defaults to `(height, width)` if not specified. Part of SDXL's micro-conditioning as - explained in section 2.2 of - [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). - crops_coords_top_left (`Tuple[int]`, *optional*, defaults to (0, 0)): - `crops_coords_top_left` can be used to generate an image that appears to be "cropped" from the position - `crops_coords_top_left` downwards. Favorable, well-centered images are usually achieved by setting - `crops_coords_top_left` to (0, 0). Part of SDXL's micro-conditioning as explained in section 2.2 of - [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). - output_type (`str`, *optional*, defaults to `"pil"`): - The output format of the generate image. Choose between - [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. - return_dict (`bool`, *optional*, defaults to `True`): - Whether or not to return a [`~pipelines.stable_diffusion_xl.StableDiffusionXLPipelineOutput`] instead - of a plain tuple. - attention_kwargs (`dict`, *optional*): - A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under - `self.processor` in - [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py). 
- callback_on_step_end (`Callable`, *optional*): - A function that calls at the end of each denoising steps during the inference. The function is called - with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int, - callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by - `callback_on_step_end_tensor_inputs`. - callback_on_step_end_tensor_inputs (`List`, *optional*): - The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list - will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the - `._callback_tensor_inputs` attribute of your pipeline class. - max_sequence_length (`int`, defaults to `224`): - Maximum sequence length in encoded prompt. Can be set to other values but may lead to poorer results. - - Examples: - - Returns: - [`~pipelines.cogview3.pipeline_cogview3plus.CogView3PipelineOutput`] or `tuple`: - [`~pipelines.cogview3.pipeline_cogview3plus.CogView3PipelineOutput`] if `return_dict` is True, otherwise a - `tuple`. When returning a tuple, the first element is a list with the generated images. - """ if image_size is None: height = self.transformer.config.sample_size * self.vae_scale_factor width = self.transformer.config.sample_size * self.vae_scale_factor -- Gitee From 59ebdf2301a7a7437313f55c49caee548df369ef Mon Sep 17 00:00:00 2001 From: jiangmengyu Date: Fri, 27 Dec 2024 12:09:49 +0800 Subject: [PATCH 51/91] =?UTF-8?q?cogview3=E9=80=82=E9=85=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py | 1 + 1 file changed, 1 insertion(+) diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py index 82b1742f3b..7af8721fa3 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py @@ -21,6 +21,7 @@ from transformers import T5EncoderModel, T5Tokenizer from diffusers.image_processor import VaeImageProcessor from diffusers.pipelines.pipeline_utils import DiffusionPipeline +from diffusers.utils import logging from diffusers.utils.torch_utils import randn_tensor from ..vae import AutoencoderKL -- Gitee From e3778d4c1366d13e89a2b15e255bf4f33adeedf6 Mon Sep 17 00:00:00 2001 From: jiangmengyu Date: Fri, 27 Dec 2024 13:25:54 +0800 Subject: [PATCH 52/91] =?UTF-8?q?cogview3=E9=80=82=E9=85=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../cogview3/cogview3plus/layers/linear.py | 79 ++++--------------- .../models/attention_processor.py | 26 ++---- .../models/transformer_cogview3plus.py | 50 ++++++++++++ 3 files changed, 74 insertions(+), 81 deletions(-) diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/layers/linear.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/layers/linear.py index 5f27384302..bd9b9ba796 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/layers/linear.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/layers/linear.py @@ -17,79 +17,32 @@ import torch import torch.nn as nn -import torch_npu class QKVLinear(nn.Module): - def __init__(self, attention_dim, hidden_size, 
qkv_bias=True, cross_attention_dim=None, device=None, dtype=None): + def __init__(self, attention_dim, hidden_size, qkv_bias=True, device=None, dtype=None): super(QKVLinear, self).__init__() self.attention_dim = attention_dim self.hidden_size = hidden_size - self.cross_attention_dim = cross_attention_dim self.qkv_bias = qkv_bias factory_kwargs = {"device": device, "dtype": dtype} - if not cross_attention_dim: - self.weight = nn.Parameter(torch.empty([self.attention_dim, 3 * self.hidden_size], **factory_kwargs)) - if self.qkv_bias: - self.bias = nn.Parameter(torch.empty([3 * self.hidden_size], **factory_kwargs)) - else: - self.q_weight = nn.Parameter(torch.empty([self.attention_dim, self.hidden_size], **factory_kwargs)) - self.kv_weight = nn.Parameter(torch.empty([self.attention_dim, 2 * self.hidden_size], **factory_kwargs)) - - if self.qkv_bias: - self.q_bias = nn.Parameter(torch.empty([self.hidden_size], **factory_kwargs)) - self.kv_bias = nn.Parameter(torch.empty([2 * self.hidden_size], **factory_kwargs)) - - - def forward(self, hidden_states, encoder_hidden_states=None): - - if self.cross_attention_dim is None: - if not self.qkv_bias: - qkv = torch.matmul(hidden_states, self.weight) - else: - qkv = torch.addmm( - self.bias, - hidden_states.view(hidden_states.size(0) * hidden_states.size(1), hidden_states.size(2)), - self.weight, - beta=1, - alpha=1 - ) + self.weight = nn.Parameter(torch.empty([self.attention_dim, 3 * self.hidden_size], **factory_kwargs)) + if self.qkv_bias: + self.bias = nn.Parameter(torch.empty([3 * self.hidden_size], **factory_kwargs)) - batch, seqlen, _ = hidden_states.shape - qkv_shape = (batch, seqlen, 3, -1) - qkv = qkv.view(qkv_shape) - q, k, v = qkv.unbind(2) + def forward(self, hidden_states): + if not self.qkv_bias: + qkv = torch.matmul(hidden_states, self.weight) else: - if not self.qkv_bias: - q = torch.matmul(hidden_states, self.q_weight) - kv = torch.matmul(encoder_hidden_states, self.kv_weight) - else: - q = torch.addmm( - self.q_bias, - hidden_states.view(hidden_states.size(0) * hidden_states.size(1), hidden_states.size(2)), - self.q_weight, - beta=1, - alpha=1 - ) - kv = torch.addmm( - self.kv_bias, - encoder_hidden_states.view( - encoder_hidden_states.size(0) * encoder_hidden_states.size(1), - encoder_hidden_states.size(2)), - self.kv_weight, - beta=1, - alpha=1 - ) - - batch, seqlen, _ = encoder_hidden_states.shape - kv_shape = (batch, seqlen, 2, -1) - - kv = kv.view(kv_shape) - k, v = kv.unbind(2) - - q = q.view(hidden_states.shape) - - return q, k, v \ No newline at end of file + qkv = torch.addmm( + self.bias, + hidden_states.view(hidden_states.size(0) * hidden_states.size(1), hidden_states.size(2)), + self.weight, + beta=1, + alpha=1 + ) + + return qkv \ No newline at end of file diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/attention_processor.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/attention_processor.py index 1f6b12f1aa..eb7618b38f 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/attention_processor.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/attention_processor.py @@ -23,6 +23,8 @@ import torch_npu from diffusers.utils import logging from diffusers.utils.torch_utils import maybe_allow_in_graph +from ..layers import QKVLinear + logger = logging.get_logger(__name__) # pylint: disable=invalid-name @@ -160,15 +162,7 @@ class Attention(nn.Module): f"unknown cross_attention_norm: {cross_attention_norm}. 
Should be None, 'layer_norm' or 'group_norm'" ) - self.to_q = nn.Linear(query_dim, self.inner_dim, bias=bias) - - if not self.only_cross_attention: - # only relevant for the `AddedKVProcessor` classes - self.to_k = nn.Linear(self.cross_attention_dim, self.inner_kv_dim, bias=bias) - self.to_v = nn.Linear(self.cross_attention_dim, self.inner_kv_dim, bias=bias) - else: - self.to_k = None - self.to_v = None + self.to_qkv = QKVLinear(self.inner_dim, query_dim) self.added_proj_bias = added_proj_bias if self.added_kv_proj_dim is not None: @@ -347,16 +341,12 @@ class CogVideoXAttnProcessor2_0: attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size) attention_mask = attention_mask.view(batch_size, attn.heads, -1, attention_mask.shape[-1]) - query = attn.to_q(hidden_states) - key = attn.to_k(hidden_states) - value = attn.to_v(hidden_states) - - inner_dim = key.shape[-1] + B, S, _ = hidden_states.shape + qkv = self.to_qkv(hidden_states) + inner_dim = qkv.shape[-1] // 3 head_dim = inner_dim // attn.heads - - query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2) - key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2) - value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2) + qkv_shape = (B, S, 3, attn.heads, head_dim) + query, key, value = qkv.view(qkv_shape).permute(2, 0, 3, 1, 4).coutiguous().unbind(0) if attn.norm_q is not None: query = attn.norm_q(query) diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/transformer_cogview3plus.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/transformer_cogview3plus.py index cfeb27c109..29b6905a58 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/transformer_cogview3plus.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/transformer_cogview3plus.py @@ -350,3 +350,53 @@ class CogView3PlusTransformer2DModel(ModelMixin, ConfigMixin): ) return Transformer2DModelOutput(sample=output) + + def load_weights(self, state_dict, shard=False): + with torch.no_grad(): + if not shard: + self.load_state_dict(state_dict) + return {} + else: + weights = state_dict + + for i in range(self.num_layers): + if i != 26: + q_weight = weights.pop(f"transformer_blocks.{i}.attn1.to_q.weight", None) + q_bias = weights.pop(f"transformer_blocks.{i}.attn1.to_q.bias", None) + k_weight = weights.pop(f"transformer_blocks.{i}.attn1.to_k.weight", None) + k_bias = weights.pop(f"transformer_blocks.{i}.attn1.to_k.bias", None) + v_weight = weights.pop(f"transformer_blocks.{i}.attn1.to_v.weight", None) + v_bias = weights.pop(f"transformer_blocks.{i}.attn1.to_v.bias", None) + + # query, key, value的weight和bias权重存在同一个文件中,不会分开存储。 + if q_weight is not None and k_weight is not None and v_weight is not None: + qkv_weight = torch.cat([q_weight, k_weight, v_weight], dim=0).transpose(0, 1).contiguous() + qkv_bias = torch.cat([q_bias, k_bias, v_bias], dim=0).contiguous() + weights[f"transformer_blocks.{i}.attn1.to_qkv.weight"] = qkv_weight + weights[f"transformer_blocks.{i}.attn1.to_qkv.bias"] = qkv_bias + else: + if self.q_weight_cache is None: + self.q_weight_cache = weights.pop(f"transformer_blocks.{i}.attn1.to_q.weight", None) + if self.q_bias_cache is None: + self.q_bias_cache = weights.pop(f"transformer_blocks.{i}.attn1.to_q.bias", None) + if self.k_weight_cache is None: + self.k_weight_cache = weights.pop(f"transformer_blocks.{i}.attn1.to_k.weight", None) + if self.k_bias_cache is None: + self.k_bias_cache = 
weights.pop(f"transformer_blocks.{i}.attn1.to_k.bias", None) + if self.v_weight_cache is None: + self.v_weight_cache = weights.pop(f"transformer_blocks.{i}.attn1.to_v.weight", None) + if self.v_bias_cache is None: + self.v_bias_cache = weights.pop(f"transformer_blocks.{i}.attn1.to_v.bias", None) + + qk_weight_cache = self.q_weight_cache is not None and self.k_weight_cache is not None + if qk_weight_cache and self.v_weight_cache is not None: + qkv_weight = torch.cat( + [self.q_weight_cache, self.k_weight_cache, self.v_weight_cache], + dim=0 + ).transpose(0, 1).contiguous() + qkv_bias = torch.cat([self.q_bias_cache, self.k_bias_cache, self.v_bias_cache], dim=0).contiguous() + weights[f"transformer_blocks.26.attn1.to_qkv.weight"] = qkv_weight + weights[f"transformer_blocks.26.attn1.to_qkv.bias"] = qkv_bias + + self.load_state_dict(weights, strict=False, assign=True) + return weights.keys() -- Gitee From 128bb12df5d05212993b8a9de644d003beb0fd9a Mon Sep 17 00:00:00 2001 From: jiangmengyu Date: Fri, 27 Dec 2024 13:27:31 +0800 Subject: [PATCH 53/91] =?UTF-8?q?cogview3=E9=80=82=E9=85=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../cogview3/cogview3plus/models/attention_processor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/attention_processor.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/attention_processor.py index eb7618b38f..afc6596471 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/attention_processor.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/attention_processor.py @@ -342,7 +342,7 @@ class CogVideoXAttnProcessor2_0: attention_mask = attention_mask.view(batch_size, attn.heads, -1, attention_mask.shape[-1]) B, S, _ = hidden_states.shape - qkv = self.to_qkv(hidden_states) + qkv = attn.to_qkv(hidden_states) inner_dim = qkv.shape[-1] // 3 head_dim = inner_dim // attn.heads qkv_shape = (B, S, 3, attn.heads, head_dim) -- Gitee From 436984bc1a52a824c609606fe02f2382574533ad Mon Sep 17 00:00:00 2001 From: jiangmengyu Date: Fri, 27 Dec 2024 13:28:26 +0800 Subject: [PATCH 54/91] =?UTF-8?q?cogview3=E9=80=82=E9=85=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../cogview3/cogview3plus/models/attention_processor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/attention_processor.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/attention_processor.py index afc6596471..d2a7673ea5 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/attention_processor.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/attention_processor.py @@ -346,7 +346,7 @@ class CogVideoXAttnProcessor2_0: inner_dim = qkv.shape[-1] // 3 head_dim = inner_dim // attn.heads qkv_shape = (B, S, 3, attn.heads, head_dim) - query, key, value = qkv.view(qkv_shape).permute(2, 0, 3, 1, 4).coutiguous().unbind(0) + query, key, value = qkv.view(qkv_shape).permute(2, 0, 3, 1, 4).contiguous().unbind(0) if attn.norm_q is not None: query = attn.norm_q(query) -- Gitee From f99600849747c07f0f54289b2534a158c2b21ffb Mon Sep 17 00:00:00 2001 From: jiangmengyu Date: Sat, 28 Dec 2024 14:17:47 +0800 Subject: [PATCH 55/91] =?UTF-8?q?cogview3=E9=80=82=E9=85=8D?= MIME-Version: 1.0 Content-Type: text/plain; 
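[Note on the QKV fusion applied by the three patches above: the separate to_q/to_k/to_v projections are replaced by one [hidden, 3*hidden] weight built from the concatenated checkpoint tensors, and the fused output is reshaped and unbound into per-head Q/K/V. The standalone sketch below reproduces that idea with illustrative shapes and names only; it is not the repository's QKVLinear or load_weights code.]

```python
# Minimal sketch of fusing separate Q/K/V projections into one matmul
# (illustrative only; assumes hidden_size == num_heads * head_dim).
import torch
import torch.nn as nn

hidden_size, num_heads, head_dim = 64, 8, 8
batch, seq = 2, 16

# Original layout: three separate projections, as in the stock attention module.
to_q = nn.Linear(hidden_size, hidden_size)
to_k = nn.Linear(hidden_size, hidden_size)
to_v = nn.Linear(hidden_size, hidden_size)

# Fused layout: concatenate the checkpoint tensors and transpose, mirroring the
# torch.cat(...).transpose(0, 1).contiguous() step in load_weights above.
fused_weight = torch.cat([to_q.weight, to_k.weight, to_v.weight], dim=0).transpose(0, 1).contiguous()
fused_bias = torch.cat([to_q.bias, to_k.bias, to_v.bias], dim=0).contiguous()

x = torch.randn(batch, seq, hidden_size)

# One addmm instead of three linears, then split into per-head Q/K/V.
qkv = torch.addmm(fused_bias, x.view(batch * seq, hidden_size), fused_weight)
q, k, v = qkv.view(batch, seq, 3, num_heads, head_dim).permute(2, 0, 3, 1, 4).unbind(0)

# The fused path matches the separate projections.
q_ref = to_q(x).view(batch, seq, num_heads, head_dim).transpose(1, 2)
assert torch.allclose(q, q_ref, atol=1e-5)
```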
charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../models/transformer_cogview3plus.py | 99 ++++++++++++++----- .../pipeline/pipeline_cogview3plus.py | 1 + 2 files changed, 77 insertions(+), 23 deletions(-) diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/transformer_cogview3plus.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/transformer_cogview3plus.py index 29b6905a58..2bd2841899 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/transformer_cogview3plus.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/transformer_cogview3plus.py @@ -16,6 +16,7 @@ from typing import Any, Dict, Union import torch import torch.nn as nn +import numpy as np from diffusers.configuration_utils import ConfigMixin, register_to_config from diffusers.models.attention_processor import AttentionProcessor @@ -170,6 +171,11 @@ class CogView3PlusTransformer2DModel(ModelMixin, ConfigMixin): time_embed_dim: int = 512, condition_dim: int = 256, pos_embed_max_size: int = 128, + use_cache: bool = True, + cache_interval: int = 2, + cache_start: int = 3, + num_cache_layer: int = 13, + cache_start_steps: int = 5, ): super().__init__() self.out_channels = out_channels @@ -224,6 +230,15 @@ class CogView3PlusTransformer2DModel(ModelMixin, ConfigMixin): self.v_weight_cache = None self.v_bias_cache = None + self.use_cache = use_cache + self.cache_interval = cache_interval + self.cache_start = cache_start + self.num_cache_layer = num_cache_layer + self.cache_start_steps = cache_start_steps + + self.delta_cache = None + self.delta_encoder_cache = None + @property # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.attn_processors def attn_processors(self) -> Dict[str, AttentionProcessor]: @@ -295,6 +310,7 @@ class CogView3PlusTransformer2DModel(ModelMixin, ConfigMixin): original_size: torch.Tensor, target_size: torch.Tensor, crop_coords: torch.Tensor, + t_idx: int, ) -> Union[torch.Tensor, Transformer2DModelOutput]: hidden_states = states[0] encoder_hidden_states = states[1] @@ -309,29 +325,7 @@ class CogView3PlusTransformer2DModel(ModelMixin, ConfigMixin): encoder_hidden_states = hidden_states[:, :text_seq_length] hidden_states = hidden_states[:, text_seq_length:] - for index_block, block in enumerate(self.transformer_blocks): - if torch.is_grad_enabled() and self.gradient_checkpointing: - - def create_custom_forward(module): - def custom_forward(*inputs): - return module(*inputs) - - return custom_forward - - ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {} - hidden_states, encoder_hidden_states = torch.utils.checkpoint.checkpoint( - create_custom_forward(block), - hidden_states, - encoder_hidden_states, - emb, - **ckpt_kwargs, - ) - else: - hidden_states, encoder_hidden_states = block( - hidden_states=hidden_states, - encoder_hidden_states=encoder_hidden_states, - emb=emb, - ) + hidden_states, encoder_hidden_states = self._forward_blocks(hidden_states, encoder_hidden_states, emb, t_idx) hidden_states = self.norm_out(hidden_states, emb) hidden_states = self.proj_out(hidden_states) # (batch_size, height*width, patch_size*patch_size*out_channels) @@ -351,6 +345,65 @@ class CogView3PlusTransformer2DModel(ModelMixin, ConfigMixin): return Transformer2DModelOutput(sample=output) + # forward blocks in range [start_idx, end_idx), then return input and output + def _forward_blocks_range(self, hidden_states, encoder_hidden_states, emb, start_idx, 
end_idx, **kwargs): + for _, block in enumerate(self.transformer_blocks[start_idx: end_idx]): + hidden_states, encoder_hidden_states = block( + hidden_states=hidden_states, + encoder_hidden_states=encoder_hidden_states, + emb=emb, + ) + + return hidden_states, encoder_hidden_states + + def _forward_blocks(self, hidden_states, encoder_hidden_states, emb, t_idx): + num_blocks = len(self.spatial_blocks) + + if not self.use_cache or (t_idx < self.cache_start_steps): + hidden_states, encoder_hidden_states = self._forward_blocks_range( + hidden_states, + encoder_hidden_states, + emb, + 0, + num_blocks + ) + else: + # infer [0, cache_start) + hidden_states, encoder_hidden_states = self._forward_blocks_range( + hidden_states, + encoder_hidden_states, + emb, + 0, + self.cache_start + ) + # infer [cache_start, cache_end) + cache_end = np.minimum(self.cache_start + self.num_cache_layer, num_blocks) + hidden_states_before_cache = hidden_states.clone() + encoder_hidden_states_before_cache = encoder_hidden_states.clone() + if t_idx % self.cache_interval == (self.cache_start_steps % self.cache_interval): + hidden_states, encoder_hidden_states = self._forward_blocks_range( + hidden_states, + encoder_hidden_states, + emb, + self.cache_start, + cache_end + ) + self.delta_cache = hidden_states - hidden_states_before_cache + self.delta_encoder_cache = encoder_hidden_states - encoder_hidden_states_before_cache + else: + hidden_states = hidden_states_before_cache + self.delta_cache + encoder_hidden_states = encoder_hidden_states_before_cache + self.delta_encoder_cache + # infer [cache_end, num_blocks) + hidden_states, encoder_hidden_states = self._forward_blocks_range( + hidden_states, + encoder_hidden_states, + emb, + cache_end, + num_blocks + ) + + return hidden_states, encoder_hidden_states + def load_weights(self, state_dict, shard=False): with torch.no_grad(): if not shard: diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py index 7af8721fa3..1dda0a2108 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py @@ -343,6 +343,7 @@ class CogView3PlusPipeline(DiffusionPipeline): original_size=original_size, target_size=target_size, crop_coords=crops_coords_top_left, + t_idx=i, )[0] noise_pred = noise_pred.float() -- Gitee From e064ad386b40f6db2df0d839ea4cbf7e243486af Mon Sep 17 00:00:00 2001 From: jiangmengyu Date: Sat, 28 Dec 2024 14:22:46 +0800 Subject: [PATCH 56/91] =?UTF-8?q?cogview3=E9=80=82=E9=85=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../cogview3/cogview3plus/models/transformer_cogview3plus.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/transformer_cogview3plus.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/transformer_cogview3plus.py index 2bd2841899..b98e3e2526 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/transformer_cogview3plus.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/transformer_cogview3plus.py @@ -357,7 +357,7 @@ class CogView3PlusTransformer2DModel(ModelMixin, ConfigMixin): return hidden_states, encoder_hidden_states def _forward_blocks(self, 
hidden_states, encoder_hidden_states, emb, t_idx): - num_blocks = len(self.spatial_blocks) + num_blocks = len(self.transformer_blocks) if not self.use_cache or (t_idx < self.cache_start_steps): hidden_states, encoder_hidden_states = self._forward_blocks_range( -- Gitee From 58f83de387f68dabeb760439d1f986ffe5ce2954 Mon Sep 17 00:00:00 2001 From: jiangmengyu Date: Tue, 31 Dec 2024 10:59:17 +0800 Subject: [PATCH 57/91] =?UTF-8?q?cogview3=E9=80=82=E9=85=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../cogview3plus/models/activations.py | 8 +- .../cogview3plus/models/modeling_utils.py | 269 ------------------ 2 files changed, 1 insertion(+), 276 deletions(-) diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/activations.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/activations.py index fc68971806..4726fd7eb2 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/activations.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/activations.py @@ -18,7 +18,7 @@ import torch.nn.functional as F from torch import nn from diffusers.utils import deprecate -from diffusers.utils.import_utils import is_torch_npu_available, is_torch_version +from diffusers.utils.import_utils import is_torch_npu_available if is_torch_npu_available(): @@ -79,9 +79,6 @@ class GELU(nn.Module): self.approximate = approximate def gelu(self, gate: torch.Tensor) -> torch.Tensor: - if gate.device.type == "mps" and is_torch_version("<", "2.0.0"): - # fp16 gelu not supported on mps before torch 2.0 - return F.gelu(gate.to(dtype=torch.float32), approximate=self.approximate).to(dtype=gate.dtype) return F.gelu(gate, approximate=self.approximate) def forward(self, hidden_states): @@ -103,9 +100,6 @@ class GEGLU(nn.Module): self.proj = nn.Linear(dim_in, dim_out * 2, bias=bias) def gelu(self, gate: torch.Tensor) -> torch.Tensor: - if gate.device.type == "mps" and is_torch_version("<", "2.0.0"): - # fp16 gelu not supported on mps before torch 2.0 - return F.gelu(gate.to(dtype=torch.float32)).to(dtype=gate.dtype) return F.gelu(gate) def forward(self, hidden_states, *args, **kwargs): diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/modeling_utils.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/modeling_utils.py index 252c758863..35f4891b42 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/modeling_utils.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/modeling_utils.py @@ -119,15 +119,6 @@ def get_parameter_dtype(parameter: torch.nn.Module) -> torch.dtype: class ModelMixin(torch.nn.Module, PushToHubMixin): - r""" - Base class for all models. - - [`ModelMixin`] takes care of storing the model configuration and provides methods for loading, downloading and - saving models. - - - **config_name** ([`str`]) -- Filename to save a model to when calling [`~models.ModelMixin.save_pretrained`]. - """ - config_name = CONFIG_NAME _automatically_saved_args = ["_diffusers_version", "_class_name", "_name_or_path"] _supports_gradient_checkpointing = False @@ -139,11 +130,6 @@ class ModelMixin(torch.nn.Module, PushToHubMixin): super().__init__() def __getattr__(self, name: str) -> Any: - """The only reason we overwrite `getattr` here is to gracefully deprecate accessing - config attributes directly. 
See https://github.com/huggingface/diffusers/pull/3129 We need to overwrite - __getattr__ here in addition so that we don't trigger `torch.nn.Module`'s __getattr__': - https://pytorch.org/docs/stable/_modules/torch/nn/modules/module.html#Module - """ is_in_config = "_internal_dict" in self.__dict__ and hasattr(self.__dict__["_internal_dict"], name) is_attribute = name in self.__dict__ @@ -156,264 +142,9 @@ class ModelMixin(torch.nn.Module, PushToHubMixin): # call PyTorch's https://pytorch.org/docs/stable/_modules/torch/nn/modules/module.html#Module return super().__getattr__(name) - @property - def is_gradient_checkpointing(self) -> bool: - """ - Whether gradient checkpointing is activated for this model or not. - """ - return any(hasattr(m, "gradient_checkpointing") and m.gradient_checkpointing for m in self.modules()) - - def enable_gradient_checkpointing(self) -> None: - """ - Activates gradient checkpointing for the current model (may be referred to as *activation checkpointing* or - *checkpoint activations* in other frameworks). - """ - if not self._supports_gradient_checkpointing: - raise ValueError(f"{self.__class__.__name__} does not support gradient checkpointing.") - self.apply(partial(self._set_gradient_checkpointing, value=True)) - - def disable_gradient_checkpointing(self) -> None: - """ - Deactivates gradient checkpointing for the current model (may be referred to as *activation checkpointing* or - *checkpoint activations* in other frameworks). - """ - if self._supports_gradient_checkpointing: - self.apply(partial(self._set_gradient_checkpointing, value=False)) - - def set_use_npu_flash_attention(self, valid: bool) -> None: - r""" - Set the switch for the npu flash attention. - """ - - def fn_recursive_set_npu_flash_attention(module: torch.nn.Module): - if hasattr(module, "set_use_npu_flash_attention"): - module.set_use_npu_flash_attention(valid) - - for child in module.children(): - fn_recursive_set_npu_flash_attention(child) - - for module in self.children(): - if isinstance(module, torch.nn.Module): - fn_recursive_set_npu_flash_attention(module) - - def enable_npu_flash_attention(self) -> None: - r""" - Enable npu flash attention from torch_npu - - """ - self.set_use_npu_flash_attention(True) - - def disable_npu_flash_attention(self) -> None: - r""" - disable npu flash attention from torch_npu - - """ - self.set_use_npu_flash_attention(False) - - def set_use_xla_flash_attention( - self, use_xla_flash_attention: bool, partition_spec: Optional[Callable] = None - ) -> None: - # Recursively walk through all the children. - # Any children which exposes the set_use_xla_flash_attention method - # gets the message - def fn_recursive_set_flash_attention(module: torch.nn.Module): - if hasattr(module, "set_use_xla_flash_attention"): - module.set_use_xla_flash_attention(use_xla_flash_attention, partition_spec) - - for child in module.children(): - fn_recursive_set_flash_attention(child) - - for module in self.children(): - if isinstance(module, torch.nn.Module): - fn_recursive_set_flash_attention(module) - - def enable_xla_flash_attention(self, partition_spec: Optional[Callable] = None): - r""" - Enable the flash attention pallals kernel for torch_xla. - """ - self.set_use_xla_flash_attention(True, partition_spec) - - def disable_xla_flash_attention(self): - r""" - Disable the flash attention pallals kernel for torch_xla. 
- """ - self.set_use_xla_flash_attention(False) - - def set_use_memory_efficient_attention_xformers( - self, valid: bool, attention_op: Optional[Callable] = None - ) -> None: - # Recursively walk through all the children. - # Any children which exposes the set_use_memory_efficient_attention_xformers method - # gets the message - def fn_recursive_set_mem_eff(module: torch.nn.Module): - if hasattr(module, "set_use_memory_efficient_attention_xformers"): - module.set_use_memory_efficient_attention_xformers(valid, attention_op) - - for child in module.children(): - fn_recursive_set_mem_eff(child) - - for module in self.children(): - if isinstance(module, torch.nn.Module): - fn_recursive_set_mem_eff(module) - - def enable_xformers_memory_efficient_attention(self, attention_op: Optional[Callable] = None) -> None: - r""" - Enable memory efficient attention from [xFormers](https://facebookresearch.github.io/xformers/). - - When this option is enabled, you should observe lower GPU memory usage and a potential speed up during - inference. Speed up during training is not guaranteed. - - - - ⚠️ When memory efficient attention and sliced attention are both enabled, memory efficient attention takes - precedent. - - - - Parameters: - attention_op (`Callable`, *optional*): - Override the default `None` operator for use as `op` argument to the - [`memory_efficient_attention()`](https://facebookresearch.github.io/xformers/components/ops.html#xformers.ops.memory_efficient_attention) - function of xFormers. - - Examples: - - ```py - >>> import torch - >>> from diffusers import UNet2DConditionModel - >>> from xformers.ops import MemoryEfficientAttentionFlashAttentionOp - - >>> model = UNet2DConditionModel.from_pretrained( - ... "stabilityai/stable-diffusion-2-1", subfolder="unet", torch_dtype=torch.float16 - ... ) - >>> model = model.to("cuda") - >>> model.enable_xformers_memory_efficient_attention(attention_op=MemoryEfficientAttentionFlashAttentionOp) - ``` - """ - self.set_use_memory_efficient_attention_xformers(True, attention_op) - - def disable_xformers_memory_efficient_attention(self) -> None: - r""" - Disable memory efficient attention from [xFormers](https://facebookresearch.github.io/xformers/). - """ - self.set_use_memory_efficient_attention_xformers(False) - - def dequantize(self): - """ - Potentially dequantize the model in case it has been quantized by a quantization method that support - dequantization. - """ - hf_quantizer = getattr(self, "hf_quantizer", None) - - if hf_quantizer is None: - raise ValueError("You need to first quantize your model in order to dequantize it") - - return hf_quantizer.dequantize(self) - @classmethod @validate_hf_hub_args def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.PathLike]], **kwargs): - r""" - Instantiate a pretrained PyTorch model from a pretrained model configuration. - - The model is set in evaluation mode - `model.eval()` - by default, and dropout modules are deactivated. To - train the model, set it back in training mode with `model.train()`. - - Parameters: - pretrained_model_name_or_path (`str` or `os.PathLike`, *optional*): - Can be either: - - - A string, the *model id* (for example `google/ddpm-celebahq-256`) of a pretrained model hosted on - the Hub. - - A path to a *directory* (for example `./my_model_directory`) containing the model weights saved - with [`~ModelMixin.save_pretrained`]. 
- - cache_dir (`Union[str, os.PathLike]`, *optional*): - Path to a directory where a downloaded pretrained model configuration is cached if the standard cache - is not used. - torch_dtype (`str` or `torch.dtype`, *optional*): - Override the default `torch.dtype` and load the model with another dtype. If `"auto"` is passed, the - dtype is automatically derived from the model's weights. - force_download (`bool`, *optional*, defaults to `False`): - Whether or not to force the (re-)download of the model weights and configuration files, overriding the - cached versions if they exist. - proxies (`Dict[str, str]`, *optional*): - A dictionary of proxy servers to use by protocol or endpoint, for example, `{'http': 'foo.bar:3128', - 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request. - output_loading_info (`bool`, *optional*, defaults to `False`): - Whether or not to also return a dictionary containing missing keys, unexpected keys and error messages. - local_files_only(`bool`, *optional*, defaults to `False`): - Whether to only load local model weights and configuration files or not. If set to `True`, the model - won't be downloaded from the Hub. - token (`str` or *bool*, *optional*): - The token to use as HTTP bearer authorization for remote files. If `True`, the token generated from - `diffusers-cli login` (stored in `~/.huggingface`) is used. - revision (`str`, *optional*, defaults to `"main"`): - The specific model version to use. It can be a branch name, a tag name, a commit id, or any identifier - allowed by Git. - from_flax (`bool`, *optional*, defaults to `False`): - Load the model weights from a Flax checkpoint save file. - subfolder (`str`, *optional*, defaults to `""`): - The subfolder location of a model file within a larger model repository on the Hub or locally. - mirror (`str`, *optional*): - Mirror source to resolve accessibility issues if you're downloading a model in China. We do not - guarantee the timeliness or safety of the source, and you should refer to the mirror site for more - information. - device_map (`str` or `Dict[str, Union[int, str, torch.device]]`, *optional*): - A map that specifies where each submodule should go. It doesn't need to be defined for each - parameter/buffer name; once a given module name is inside, every submodule of it will be sent to the - same device. Defaults to `None`, meaning that the model will be loaded on CPU. - - Set `device_map="auto"` to have 🤗 Accelerate automatically compute the most optimized `device_map`. For - more information about each option see [designing a device - map](https://hf.co/docs/accelerate/main/en/usage_guides/big_modeling#designing-a-device-map). - max_memory (`Dict`, *optional*): - A dictionary device identifier for the maximum memory. Will default to the maximum memory available for - each GPU and the available CPU RAM if unset. - offload_folder (`str` or `os.PathLike`, *optional*): - The path to offload weights if `device_map` contains the value `"disk"`. - offload_state_dict (`bool`, *optional*): - If `True`, temporarily offloads the CPU state dict to the hard drive to avoid running out of CPU RAM if - the weight of the CPU state dict + the biggest shard of the checkpoint does not fit. Defaults to `True` - when there is some disk offload. - low_cpu_mem_usage (`bool`, *optional*, defaults to `True` if torch version >= 1.9.0 else `False`): - Speed up model loading only loading the pretrained weights and not initializing the weights. 
This also - tries to not use more than 1x model size in CPU memory (including peak memory) while loading the model. - Only supported for PyTorch >= 1.9.0. If you are using an older version of PyTorch, setting this - argument to `True` will raise an error. - variant (`str`, *optional*): - Load weights from a specified `variant` filename such as `"fp16"` or `"ema"`. This is ignored when - loading `from_flax`. - use_safetensors (`bool`, *optional*, defaults to `None`): - If set to `None`, the `safetensors` weights are downloaded if they're available **and** if the - `safetensors` library is installed. If set to `True`, the model is forcibly loaded from `safetensors` - weights. If set to `False`, `safetensors` weights are not loaded. - - - - To use private or [gated models](https://huggingface.co/docs/hub/models-gated#gated-models), log-in with - `huggingface-cli login`. You can also activate the special - ["offline-mode"](https://huggingface.co/diffusers/installation.html#offline-mode) to use this method in a - firewalled environment. - - - - Example: - - ```py - from diffusers import UNet2DConditionModel - - unet = UNet2DConditionModel.from_pretrained("runwayml/stable-diffusion-v1-5", subfolder="unet") - ``` - - If you get the error message below, you need to finetune the weights for your downstream task: - - ```bash - Some weights of UNet2DConditionModel were not initialized from the model checkpoint at runwayml/stable-diffusion-v1-5 and are newly initialized because the shapes did not match: - - conv_in.weight: found shape torch.Size([320, 4, 3, 3]) in the checkpoint and torch.Size([320, 9, 3, 3]) in the model instantiated - You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference. - ``` - """ cache_dir = kwargs.pop("cache_dir", None) ignore_mismatched_sizes = kwargs.pop("ignore_mismatched_sizes", False) force_download = kwargs.pop("force_download", False) -- Gitee From 29df760d8158b6e4a26999012f614577ff3f5e8d Mon Sep 17 00:00:00 2001 From: jiangmengyu Date: Tue, 31 Dec 2024 11:00:54 +0800 Subject: [PATCH 58/91] =?UTF-8?q?cogview3=E9=80=82=E9=85=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../built-in/foundation/cogview3/inference_cogview3plus.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/inference_cogview3plus.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/inference_cogview3plus.py index d24d3f29c4..9030a82b5e 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/inference_cogview3plus.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/inference_cogview3plus.py @@ -60,7 +60,7 @@ def parse_arguments(): parser.add_argument("--height", type=int, default=1024, help="Height of the generated image.") parser.add_argument("--output_path", type=str, default="cogview3.png", help="Path to save the generated image.") parser.add_argument("--dtype", type=str, default="bf16", help="bf16 or fp16") - parser.add_argument("--device_id", type=int, default=6, help="NPU device id") + parser.add_argument("--device_id", type=int, default=7, help="NPU device id") return parser.parse_args() -- Gitee From a796f721aa5cd003d0aa6fa6c49da096a7b6d880 Mon Sep 17 00:00:00 2001 From: jiangmengyu Date: Tue, 31 Dec 2024 11:11:22 +0800 Subject: [PATCH 59/91] =?UTF-8?q?cogview3=E9=80=82=E9=85=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- 
.../cogview3plus/models/modeling_utils.py | 192 +----------------- 1 file changed, 2 insertions(+), 190 deletions(-) diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/modeling_utils.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/modeling_utils.py index 35f4891b42..a09f50daf8 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/modeling_utils.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/modeling_utils.py @@ -15,14 +15,13 @@ # limitations under the License. import copy -import inspect import itertools import json import os import re from collections import OrderedDict -from functools import partial, wraps -from typing import Any, Callable, List, Optional, Tuple, Union +from functools import wraps +from typing import Any, List, Optional, Tuple, Union import torch from huggingface_hub.utils import validate_hf_hub_args @@ -41,7 +40,6 @@ from diffusers.utils import ( _get_model_file, deprecate, is_accelerate_available, - is_bitsandbytes_available, is_bitsandbytes_version, logging, ) @@ -157,9 +155,6 @@ class ModelMixin(torch.nn.Module, PushToHubMixin): torch_dtype = kwargs.pop("torch_dtype", None) subfolder = kwargs.pop("subfolder", None) device_map = kwargs.pop("device_map", None) - max_memory = kwargs.pop("max_memory", None) - offload_folder = kwargs.pop("offload_folder", None) - offload_state_dict = kwargs.pop("offload_state_dict", False) low_cpu_mem_usage = kwargs.pop("low_cpu_mem_usage", _LOW_CPU_MEM_USAGE_DEFAULT) variant = kwargs.pop("variant", None) use_safetensors = kwargs.pop("use_safetensors", None) @@ -631,7 +626,6 @@ class ModelMixin(torch.nn.Module, PushToHubMixin): cls, model, state_dict: OrderedDict, - resolved_archive_file, pretrained_model_name_or_path: Union[str, os.PathLike], ignore_mismatched_sizes: bool = False, ): @@ -730,142 +724,14 @@ class ModelMixin(torch.nn.Module, PushToHubMixin): return model, missing_keys, unexpected_keys, mismatched_keys, error_msgs - @classmethod - def _get_signature_keys(cls, obj): - parameters = inspect.signature(obj.__init__).parameters - required_parameters = {k: v for k, v in parameters.items() if v.default == inspect._empty} - optional_parameters = set({k for k, v in parameters.items() if v.default != inspect._empty}) - expected_modules = set(required_parameters.keys()) - {"self"} - - return expected_modules, optional_parameters - - # Adapted from `transformers` modeling_utils.py - def _get_no_split_modules(self, device_map: str): - """ - Get the modules of the model that should not be spit when using device_map. We iterate through the modules to - get the underlying `_no_split_modules`. - - Args: - device_map (`str`): - The device map value. Options are ["auto", "balanced", "balanced_low_0", "sequential"] - - Returns: - `List[str]`: List of modules that should not be split - """ - _no_split_modules = set() - modules_to_check = [self] - while len(modules_to_check) > 0: - module = modules_to_check.pop(-1) - # if the module does not appear in _no_split_modules, we also check the children - if module.__class__.__name__ not in _no_split_modules: - if isinstance(module, ModelMixin): - if module._no_split_modules is None: - raise ValueError( - f"{module.__class__.__name__} does not support `device_map='{device_map}'`. To implement support, the model " - "class needs to implement the `_no_split_modules` attribute." 
- ) - else: - _no_split_modules = _no_split_modules | set(module._no_split_modules) - modules_to_check += list(module.children()) - return list(_no_split_modules) - @property def device(self) -> torch.device: - """ - `torch.device`: The device on which the module is (assuming that all the module parameters are on the same - device). - """ return get_parameter_device(self) @property def dtype(self) -> torch.dtype: - """ - `torch.dtype`: The dtype of the module (assuming that all the module parameters have the same dtype). - """ return get_parameter_dtype(self) - def num_parameters(self, only_trainable: bool = False, exclude_embeddings: bool = False) -> int: - """ - Get number of (trainable or non-embedding) parameters in the module. - - Args: - only_trainable (`bool`, *optional*, defaults to `False`): - Whether or not to return only the number of trainable parameters. - exclude_embeddings (`bool`, *optional*, defaults to `False`): - Whether or not to return only the number of non-embedding parameters. - - Returns: - `int`: The number of parameters. - - Example: - - ```py - from diffusers import UNet2DConditionModel - - model_id = "runwayml/stable-diffusion-v1-5" - unet = UNet2DConditionModel.from_pretrained(model_id, subfolder="unet") - unet.num_parameters(only_trainable=True) - 859520964 - ``` - """ - is_loaded_in_4bit = getattr(self, "is_loaded_in_4bit", False) - - if is_loaded_in_4bit: - if is_bitsandbytes_available(): - import bitsandbytes as bnb - else: - raise ValueError( - "bitsandbytes is not installed but it seems that the model has been loaded in 4bit precision, something went wrong" - " make sure to install bitsandbytes with `pip install bitsandbytes`. You also need a GPU. " - ) - - if exclude_embeddings: - embedding_param_names = [ - f"{name}.weight" for name, module_type in self.named_modules() if isinstance(module_type, nn.Embedding) - ] - total_parameters = [ - parameter for name, parameter in self.named_parameters() if name not in embedding_param_names - ] - else: - total_parameters = list(self.parameters()) - - total_numel = [] - - for param in total_parameters: - if param.requires_grad or not only_trainable: - # For 4bit models, we need to multiply the number of parameters by 2 as half of the parameters are - # used for the 4bit quantization (uint8 tensors are stored) - if is_loaded_in_4bit and isinstance(param, bnb.nn.Params4bit): - if hasattr(param, "element_size"): - num_bytes = param.element_size() - elif hasattr(param, "quant_storage"): - num_bytes = param.quant_storage.itemsize - else: - num_bytes = 1 - total_numel.append(param.numel() * 2 * num_bytes) - else: - total_numel.append(param.numel()) - - return sum(total_numel) - - def get_memory_footprint(self, return_buffers=True): - r""" - Get the memory footprint of a model. This will return the memory footprint of the current model in bytes. - Useful to benchmark the memory footprint of the current model and design some tests. Solution inspired from the - PyTorch discussions: https://discuss.pytorch.org/t/gpu-memory-that-model-uses/56822/2 - - Arguments: - return_buffers (`bool`, *optional*, defaults to `True`): - Whether to return the size of the buffer tensors in the computation of the memory footprint. Buffers - are tensors that do not require gradients and not registered as parameters. E.g. mean and std in batch - norm layers. 
Please see: https://discuss.pytorch.org/t/what-pytorch-means-by-buffers/120266/2 - """ - mem = sum([param.nelement() * param.element_size() for param in self.parameters()]) - if return_buffers: - mem_bufs = sum([buf.nelement() * buf.element_size() for buf in self.buffers()]) - mem = mem + mem_bufs - return mem - def _convert_deprecated_attention_blocks(self, state_dict: OrderedDict) -> None: deprecated_attention_block_paths = [] @@ -879,10 +745,6 @@ class ModelMixin(torch.nn.Module, PushToHubMixin): recursive_find_attn_block("", self) - # NOTE: we have to check if the deprecated parameters are in the state dict - # because it is possible we are loading from a state dict that was already - # converted - for path in deprecated_attention_block_paths: # group_norm path stays the same @@ -909,53 +771,3 @@ class ModelMixin(torch.nn.Module, PushToHubMixin): state_dict[f"{path}.to_out.0.weight"] = state_dict.pop(f"{path}.proj_attn.weight") if f"{path}.proj_attn.bias" in state_dict: state_dict[f"{path}.to_out.0.bias"] = state_dict.pop(f"{path}.proj_attn.bias") - - def _temp_convert_self_to_deprecated_attention_blocks(self) -> None: - deprecated_attention_block_modules = [] - - def recursive_find_attn_block(module): - if hasattr(module, "_from_deprecated_attn_block") and module._from_deprecated_attn_block: - deprecated_attention_block_modules.append(module) - - for sub_module in module.children(): - recursive_find_attn_block(sub_module) - - recursive_find_attn_block(self) - - for module in deprecated_attention_block_modules: - module.query = module.to_q - module.key = module.to_k - module.value = module.to_v - module.proj_attn = module.to_out[0] - - # We don't _have_ to delete the old attributes, but it's helpful to ensure - # that _all_ the weights are loaded into the new attributes and we're not - # making an incorrect assumption that this model should be converted when - # it really shouldn't be. 
- del module.to_q - del module.to_k - del module.to_v - del module.to_out - - def _undo_temp_convert_self_to_deprecated_attention_blocks(self) -> None: - deprecated_attention_block_modules = [] - - def recursive_find_attn_block(module) -> None: - if hasattr(module, "_from_deprecated_attn_block") and module._from_deprecated_attn_block: - deprecated_attention_block_modules.append(module) - - for sub_module in module.children(): - recursive_find_attn_block(sub_module) - - recursive_find_attn_block(self) - - for module in deprecated_attention_block_modules: - module.to_q = module.query - module.to_k = module.key - module.to_v = module.value - module.to_out = nn.ModuleList([module.proj_attn, nn.Dropout(module.dropout)]) - - del module.query - del module.key - del module.value - del module.proj_attn -- Gitee From 11c5ead9215330f394ea1acf490f8dbe6b11bbb6 Mon Sep 17 00:00:00 2001 From: jiangmengyu Date: Tue, 31 Dec 2024 11:15:57 +0800 Subject: [PATCH 60/91] =?UTF-8?q?cogview3=E9=80=82=E9=85=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../models/transformer_cogview3plus.py | 57 +------------------ .../pipeline/pipeline_cogview3plus.py | 27 --------- .../schedulers/scheduling_ddim_cogvideox.py | 56 ------------------ .../schedulers/scheduling_utils.py | 6 +- 4 files changed, 2 insertions(+), 144 deletions(-) diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/transformer_cogview3plus.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/transformer_cogview3plus.py index b98e3e2526..6f4fa104b8 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/transformer_cogview3plus.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/transformer_cogview3plus.py @@ -20,7 +20,7 @@ import numpy as np from diffusers.configuration_utils import ConfigMixin, register_to_config from diffusers.models.attention_processor import AttentionProcessor -from diffusers.utils import is_torch_version, logging +from diffusers.utils import logging from diffusers.models.modeling_outputs import Transformer2DModelOutput from .modeling_utils import ModelMixin @@ -34,18 +34,6 @@ logger = logging.get_logger(__name__) # pylint: disable=invalid-name class CogView3PlusTransformerBlock(nn.Module): - r""" - Args: - dim (`int`): - The number of channels in the input and output. - num_attention_heads (`int`): - The number of heads to use for multi-head attention. - attention_head_dim (`int`): - The number of channels in each head. - time_embed_dim (`int`): - The number of channels in timestep embedding. - """ - def __init__( self, dim: int = 2560, @@ -125,37 +113,6 @@ class CogView3PlusTransformerBlock(nn.Module): class CogView3PlusTransformer2DModel(ModelMixin, ConfigMixin): - r""" - Args: - patch_size (`int`, defaults to `2`): - The size of the patches to use in the patch embedding layer. - in_channels (`int`, defaults to `16`): - The number of channels in the input. - num_layers (`int`, defaults to `30`): - The number of layers of Transformer blocks to use. - attention_head_dim (`int`, defaults to `40`): - The number of channels in each head. - num_attention_heads (`int`, defaults to `64`): - The number of heads to use for multi-head attention. - out_channels (`int`, defaults to `16`): - The number of channels in the output. - text_embed_dim (`int`, defaults to `4096`): - Input dimension of text embeddings from the text encoder. 
- time_embed_dim (`int`, defaults to `512`): - Output dimension of timestep embeddings. - condition_dim (`int`, defaults to `256`): - The embedding dimension of the input SDXL-style resolution conditions (original_size, target_size, - crop_coords). - pos_embed_max_size (`int`, defaults to `128`): - The maximum resolution of the positional embeddings, from which slices of shape `H x W` are taken and added - to input patched latents, where `H` and `W` are the latent height and width respectively. A value of 128 - means that the maximum supported height and width for image generation is `128 * vae_scale_factor * - patch_size => 128 * 8 * 2 => 2048`. - sample_size (`int`, defaults to `128`): - The base resolution of input latents. If height/width is not provided during generation, this value is used - to determine the resolution as `sample_size * vae_scale_factor => 128 * 8 => 1024` - """ - _supports_gradient_checkpointing = True @register_to_config @@ -266,18 +223,6 @@ class CogView3PlusTransformer2DModel(ModelMixin, ConfigMixin): # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.set_attn_processor def set_attn_processor(self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]]): - r""" - Sets the attention processor to use to compute attention. - - Parameters: - processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`): - The instantiated processor class or a dictionary of processor classes that will be set as the processor - for **all** `Attention` layers. - - If `processor` is a dict, the key needs to define the path to the corresponding cross attention - processor. This is strongly recommended when setting trainable attention processors. - - """ count = len(self.attn_processors.keys()) if isinstance(processor, dict) and len(processor) != count: diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py index 1dda0a2108..05f6ddf53a 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py @@ -41,29 +41,6 @@ def retrieve_timesteps( sigmas: Optional[List[float]] = None, **kwargs, ): - r""" - Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles - custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`. - - Args: - scheduler (`SchedulerMixin`): - The scheduler to get timesteps from. - num_inference_steps (`int`): - The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps` - must be `None`. - device (`str` or `torch.device`, *optional*): - The device to which the timesteps should be moved to. If `None`, the timesteps are not moved. - timesteps (`List[int]`, *optional*): - Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed, - `num_inference_steps` and `sigmas` must be `None`. - sigmas (`List[float]`, *optional*): - Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed, - `num_inference_steps` and `timesteps` must be `None`. - - Returns: - `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the - second element is the number of inference steps. 
- """ if timesteps is not None and sigmas is not None: raise ValueError("Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values") if timesteps is not None: @@ -121,7 +98,6 @@ class CogView3PlusPipeline(DiffusionPipeline): self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor) - # Copied from diffusers.pipelines.cogvideo.pipeline_cogvideox.CogVideoXPipeline._get_t5_prompt_embeds with num_videos_per_prompt->num_images_per_prompt def _get_t5_prompt_embeds( self, prompt: Union[str, List[str]] = None, @@ -186,7 +162,6 @@ class CogView3PlusPipeline(DiffusionPipeline): return prompt_embeds, negative_prompt_embeds - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents def prepare_latents(self, batch_size, num_channels_latents, image_size, dtype, device): height = image_size[0] width = image_size[1] @@ -202,7 +177,6 @@ class CogView3PlusPipeline(DiffusionPipeline): latents = latents * self.scheduler.init_noise_sigma return latents - # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs def prepare_extra_step_kwargs(self, generator, eta): accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) extra_step_kwargs = {} @@ -215,7 +189,6 @@ class CogView3PlusPipeline(DiffusionPipeline): extra_step_kwargs["generator"] = generator return extra_step_kwargs - # Copied from diffusers.pipelines.latte.pipeline_latte.LattePipeline.check_inputs def check_inputs( self, prompt, diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/schedulers/scheduling_ddim_cogvideox.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/schedulers/scheduling_ddim_cogvideox.py index 26ae48a2c8..29ad7a2c81 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/schedulers/scheduling_ddim_cogvideox.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/schedulers/scheduling_ddim_cogvideox.py @@ -26,7 +26,6 @@ from .scheduling_utils import KarrasDiffusionSchedulers, SchedulerMixin @dataclass -# Copied from diffusers.schedulers.scheduling_ddpm.DDPMSchedulerOutput with DDPM->DDIM class DDIMSchedulerOutput(BaseOutput): """ Output class for the scheduler's `step` function output. @@ -43,31 +42,11 @@ class DDIMSchedulerOutput(BaseOutput): prev_sample: torch.Tensor pred_original_sample: Optional[torch.Tensor] = None - -# Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar def betas_for_alpha_bar( num_diffusion_timesteps, max_beta=0.999, alpha_transform_type="cosine", ): - """ - Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of - (1-beta) over time from t = [0,1]. - - Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up - to that part of the diffusion process. - - - Args: - num_diffusion_timesteps (`int`): the number of betas to produce. - max_beta (`float`): the maximum beta to use; use values lower than 1 to - prevent singularities. - alpha_transform_type (`str`, *optional*, default to `cosine`): the type of noise schedule for alpha_bar. 
- Choose from `cosine` or `exp` - - Returns: - betas (`np.ndarray`): the betas used by the scheduler to step the model outputs - """ if alpha_transform_type == "cosine": def alpha_bar_fn(t): @@ -90,18 +69,6 @@ def betas_for_alpha_bar( def rescale_zero_terminal_snr(alphas_cumprod): - """ - Rescales betas to have zero terminal SNR Based on https://arxiv.org/pdf/2305.08891.pdf (Algorithm 1) - - - Args: - betas (`torch.Tensor`): - the betas that the scheduler is being initialized with. - - Returns: - `torch.Tensor`: rescaled betas with zero terminal SNR - """ - alphas_bar_sqrt = alphas_cumprod.sqrt() # Store old values. @@ -132,13 +99,7 @@ class CogVideoXDDIMScheduler(SchedulerMixin, ConfigMixin): beta_end: float = 0.0120, beta_schedule: str = "scaled_linear", trained_betas: Optional[Union[np.ndarray, List[float]]] = None, - clip_sample: bool = True, set_alpha_to_one: bool = True, - steps_offset: int = 0, - prediction_type: str = "epsilon", - clip_sample_range: float = 1.0, - sample_max_value: float = 1.0, - timestep_spacing: str = "leading", rescale_betas_zero_snr: bool = False, snr_shift_scale: float = 3.0, ): @@ -188,23 +149,6 @@ class CogVideoXDDIMScheduler(SchedulerMixin, ConfigMixin): return variance - def scale_model_input(self, sample: torch.Tensor, timestep: Optional[int] = None) -> torch.Tensor: - """ - Ensures interchangeability with schedulers that need to scale the denoising model input depending on the - current timestep. - - Args: - sample (`torch.Tensor`): - The input sample. - timestep (`int`, *optional*): - The current timestep in the diffusion chain. - - Returns: - `torch.Tensor`: - A scaled input sample. - """ - return sample - def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.device] = None): """ Sets the discrete timesteps used for the diffusion chain (to be run before inference). diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/schedulers/scheduling_utils.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/schedulers/scheduling_utils.py index 3f1fb5bc32..ae88225358 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/schedulers/scheduling_utils.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/schedulers/scheduling_utils.py @@ -27,10 +27,6 @@ from diffusers.utils import BaseOutput, PushToHubMixin SCHEDULER_CONFIG_NAME = "scheduler_config.json" -# NOTE: We make this type an enum because it simplifies usage in docs and prevents -# circular imports when used for `_compatibles` within the schedulers module. -# When it's used as a type in pipelines, it really is a Union because the actual -# scheduler instance is passed in. 
class KarrasDiffusionSchedulers(Enum): DDIMScheduler = 1 DDPMScheduler = 2 @@ -88,7 +84,7 @@ class SchedulerMixin(PushToHubMixin): **kwargs, ): - config, kwargs, commit_hash = cls.load_config( + config, kwargs, _ = cls.load_config( pretrained_model_name_or_path=pretrained_model_name_or_path, subfolder=subfolder, return_unused_kwargs=True, -- Gitee From 062f09f5dcd2432d568a528ed1d7c7babd62c16e Mon Sep 17 00:00:00 2001 From: jiangmengyu Date: Tue, 31 Dec 2024 11:18:27 +0800 Subject: [PATCH 61/91] =?UTF-8?q?cogview3=E9=80=82=E9=85=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../schedulers/scheduling_ddim_cogvideox.py | 3 + .../cogview3/cogview3plus/vae/__init__.py | 2 +- .../cogview3plus/vae/autoencoder_kl.py | 517 --------- .../cogview3/cogview3plus/vae/vae.py | 996 ------------------ 4 files changed, 4 insertions(+), 1514 deletions(-) delete mode 100644 MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/vae/autoencoder_kl.py delete mode 100644 MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/vae/vae.py diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/schedulers/scheduling_ddim_cogvideox.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/schedulers/scheduling_ddim_cogvideox.py index 29ad7a2c81..f94de1c81d 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/schedulers/scheduling_ddim_cogvideox.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/schedulers/scheduling_ddim_cogvideox.py @@ -149,6 +149,9 @@ class CogVideoXDDIMScheduler(SchedulerMixin, ConfigMixin): return variance + def scale_model_input(self, sample: torch.Tensor, timestep: Optional[int] = None) -> torch.Tensor: + return sample + def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.device] = None): """ Sets the discrete timesteps used for the diffusion chain (to be run before inference). diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/vae/__init__.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/vae/__init__.py index 58bbb8f14e..261968dc69 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/vae/__init__.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/vae/__init__.py @@ -1 +1 @@ -from .autoencoder_kl import AutoencoderKL \ No newline at end of file +from diffusers import AutoencoderKL \ No newline at end of file diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/vae/autoencoder_kl.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/vae/autoencoder_kl.py deleted file mode 100644 index 3f524408c9..0000000000 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/vae/autoencoder_kl.py +++ /dev/null @@ -1,517 +0,0 @@ -# Copyright 2024 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# https://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from typing import Dict, Optional, Tuple, Union - -import torch -import torch.nn as nn - -from diffusers.configuration_utils import ConfigMixin, register_to_config -from diffusers.loaders import PeftAdapterMixin -from diffusers.loaders.single_file_model import FromOriginalModelMixin -from diffusers.utils import deprecate -from diffusers.utils.accelerate_utils import apply_forward_hook -from diffusers.models.attention_processor import ( - ADDED_KV_ATTENTION_PROCESSORS, - CROSS_ATTENTION_PROCESSORS, - Attention, - AttentionProcessor, - AttnAddedKVProcessor, - AttnProcessor, - FusedAttnProcessor2_0, -) -from diffusers.models.modeling_outputs import AutoencoderKLOutput - -from ..models import ModelMixin -from .vae import Decoder, DecoderOutput, DiagonalGaussianDistribution, Encoder - - -class AutoencoderKL(ModelMixin, ConfigMixin, FromOriginalModelMixin, PeftAdapterMixin): - - _supports_gradient_checkpointing = True - _no_split_modules = ["BasicTransformerBlock", "ResnetBlock2D"] - - @register_to_config - def __init__( - self, - in_channels: int = 3, - out_channels: int = 3, - down_block_types: Tuple[str] = ("DownEncoderBlock2D",), - up_block_types: Tuple[str] = ("UpDecoderBlock2D",), - block_out_channels: Tuple[int] = (64,), - layers_per_block: int = 1, - act_fn: str = "silu", - latent_channels: int = 4, - norm_num_groups: int = 32, - sample_size: int = 32, - use_quant_conv: bool = True, - use_post_quant_conv: bool = True, - mid_block_add_attention: bool = True, - ): - super().__init__() - - # pass init params to Encoder - self.encoder = Encoder( - in_channels=in_channels, - out_channels=latent_channels, - down_block_types=down_block_types, - block_out_channels=block_out_channels, - layers_per_block=layers_per_block, - act_fn=act_fn, - norm_num_groups=norm_num_groups, - double_z=True, - mid_block_add_attention=mid_block_add_attention, - ) - - # pass init params to Decoder - self.decoder = Decoder( - in_channels=latent_channels, - out_channels=out_channels, - up_block_types=up_block_types, - block_out_channels=block_out_channels, - layers_per_block=layers_per_block, - norm_num_groups=norm_num_groups, - act_fn=act_fn, - mid_block_add_attention=mid_block_add_attention, - ) - - self.quant_conv = nn.Conv2d(2 * latent_channels, 2 * latent_channels, 1) if use_quant_conv else None - self.post_quant_conv = nn.Conv2d(latent_channels, latent_channels, 1) if use_post_quant_conv else None - - self.use_slicing = False - self.use_tiling = False - - # only relevant if vae tiling is enabled - self.tile_sample_min_size = self.config.sample_size - sample_size = ( - self.config.sample_size[0] - if isinstance(self.config.sample_size, (list, tuple)) - else self.config.sample_size - ) - self.tile_latent_min_size = int(sample_size / (2 ** (len(self.config.block_out_channels) - 1))) - self.tile_overlap_factor = 0.25 - - def _set_gradient_checkpointing(self, module, value=False): - if isinstance(module, (Encoder, Decoder)): - module.gradient_checkpointing = value - - def enable_tiling(self, use_tiling: bool = True): - r""" - Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to - compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow - processing larger images. - """ - self.use_tiling = use_tiling - - def disable_tiling(self): - r""" - Disable tiled VAE decoding. If `enable_tiling` was previously enabled, this method will go back to computing - decoding in one step. 
- """ - self.enable_tiling(False) - - def enable_slicing(self): - r""" - Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to - compute decoding in several steps. This is useful to save some memory and allow larger batch sizes. - """ - self.use_slicing = True - - def disable_slicing(self): - r""" - Disable sliced VAE decoding. If `enable_slicing` was previously enabled, this method will go back to computing - decoding in one step. - """ - self.use_slicing = False - - @property - # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.attn_processors - def attn_processors(self) -> Dict[str, AttentionProcessor]: - r""" - Returns: - `dict` of attention processors: A dictionary containing all attention processors used in the model with - indexed by its weight name. - """ - # set recursively - processors = {} - - def fn_recursive_add_processors(name: str, module: torch.nn.Module, processors: Dict[str, AttentionProcessor]): - if hasattr(module, "get_processor"): - processors[f"{name}.processor"] = module.get_processor() - - for sub_name, child in module.named_children(): - fn_recursive_add_processors(f"{name}.{sub_name}", child, processors) - - return processors - - for name, module in self.named_children(): - fn_recursive_add_processors(name, module, processors) - - return processors - - # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.set_attn_processor - def set_attn_processor(self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]]): - r""" - Sets the attention processor to use to compute attention. - - Parameters: - processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`): - The instantiated processor class or a dictionary of processor classes that will be set as the processor - for **all** `Attention` layers. - - If `processor` is a dict, the key needs to define the path to the corresponding cross attention - processor. This is strongly recommended when setting trainable attention processors. - - """ - count = len(self.attn_processors.keys()) - - if isinstance(processor, dict) and len(processor) != count: - raise ValueError( - f"A dict of processors was passed, but the number of processors {len(processor)} does not match the" - f" number of attention layers: {count}. Please make sure to pass {count} processor classes." - ) - - def fn_recursive_attn_processor(name: str, module: torch.nn.Module, processor): - if hasattr(module, "set_processor"): - if not isinstance(processor, dict): - module.set_processor(processor) - else: - module.set_processor(processor.pop(f"{name}.processor")) - - for sub_name, child in module.named_children(): - fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor) - - for name, module in self.named_children(): - fn_recursive_attn_processor(name, module, processor) - - # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.set_default_attn_processor - def set_default_attn_processor(self): - """ - Disables custom attention processors and sets the default attention implementation. 
- """ - if all(proc.__class__ in ADDED_KV_ATTENTION_PROCESSORS for proc in self.attn_processors.values()): - processor = AttnAddedKVProcessor() - elif all(proc.__class__ in CROSS_ATTENTION_PROCESSORS for proc in self.attn_processors.values()): - processor = AttnProcessor() - else: - raise ValueError( - f"Cannot call `set_default_attn_processor` when attention processors are of type {next(iter(self.attn_processors.values()))}" - ) - - self.set_attn_processor(processor) - - def _encode(self, x: torch.Tensor) -> torch.Tensor: - batch_size, num_channels, height, width = x.shape - - if self.use_tiling and (width > self.tile_sample_min_size or height > self.tile_sample_min_size): - return self._tiled_encode(x) - - enc = self.encoder(x) - if self.quant_conv is not None: - enc = self.quant_conv(enc) - - return enc - - @apply_forward_hook - def encode( - self, x: torch.Tensor, return_dict: bool = True - ) -> Union[AutoencoderKLOutput, Tuple[DiagonalGaussianDistribution]]: - """ - Encode a batch of images into latents. - - Args: - x (`torch.Tensor`): Input batch of images. - return_dict (`bool`, *optional*, defaults to `True`): - Whether to return a [`~models.autoencoder_kl.AutoencoderKLOutput`] instead of a plain tuple. - - Returns: - The latent representations of the encoded images. If `return_dict` is True, a - [`~models.autoencoder_kl.AutoencoderKLOutput`] is returned, otherwise a plain `tuple` is returned. - """ - if self.use_slicing and x.shape[0] > 1: - encoded_slices = [self._encode(x_slice) for x_slice in x.split(1)] - h = torch.cat(encoded_slices) - else: - h = self._encode(x) - - posterior = DiagonalGaussianDistribution(h) - - if not return_dict: - return (posterior,) - - return AutoencoderKLOutput(latent_dist=posterior) - - def _decode(self, z: torch.Tensor, return_dict: bool = True) -> Union[DecoderOutput, torch.Tensor]: - if self.use_tiling and (z.shape[-1] > self.tile_latent_min_size or z.shape[-2] > self.tile_latent_min_size): - return self.tiled_decode(z, return_dict=return_dict) - - if self.post_quant_conv is not None: - z = self.post_quant_conv(z) - - dec = self.decoder(z) - - if not return_dict: - return (dec,) - - return DecoderOutput(sample=dec) - - @apply_forward_hook - def decode( - self, z: torch.FloatTensor, return_dict: bool = True, generator=None - ) -> Union[DecoderOutput, torch.FloatTensor]: - """ - Decode a batch of images. - - Args: - z (`torch.Tensor`): Input batch of latent vectors. - return_dict (`bool`, *optional*, defaults to `True`): - Whether to return a [`~models.vae.DecoderOutput`] instead of a plain tuple. - - Returns: - [`~models.vae.DecoderOutput`] or `tuple`: - If return_dict is True, a [`~models.vae.DecoderOutput`] is returned, otherwise a plain `tuple` is - returned. 
- - """ - if self.use_slicing and z.shape[0] > 1: - decoded_slices = [self._decode(z_slice).sample for z_slice in z.split(1)] - decoded = torch.cat(decoded_slices) - else: - decoded = self._decode(z).sample - - if not return_dict: - return (decoded,) - - return DecoderOutput(sample=decoded) - - def blend_v(self, a: torch.Tensor, b: torch.Tensor, blend_extent: int) -> torch.Tensor: - blend_extent = min(a.shape[2], b.shape[2], blend_extent) - for y in range(blend_extent): - b[:, :, y, :] = a[:, :, -blend_extent + y, :] * (1 - y / blend_extent) + b[:, :, y, :] * (y / blend_extent) - return b - - def blend_h(self, a: torch.Tensor, b: torch.Tensor, blend_extent: int) -> torch.Tensor: - blend_extent = min(a.shape[3], b.shape[3], blend_extent) - for x in range(blend_extent): - b[:, :, :, x] = a[:, :, :, -blend_extent + x] * (1 - x / blend_extent) + b[:, :, :, x] * (x / blend_extent) - return b - - def _tiled_encode(self, x: torch.Tensor) -> torch.Tensor: - r"""Encode a batch of images using a tiled encoder. - - When this option is enabled, the VAE will split the input tensor into tiles to compute encoding in several - steps. This is useful to keep memory use constant regardless of image size. The end result of tiled encoding is - different from non-tiled encoding because each tile uses a different encoder. To avoid tiling artifacts, the - tiles overlap and are blended together to form a smooth output. You may still see tile-sized changes in the - output, but they should be much less noticeable. - - Args: - x (`torch.Tensor`): Input batch of images. - - Returns: - `torch.Tensor`: - The latent representation of the encoded videos. - """ - - overlap_size = int(self.tile_sample_min_size * (1 - self.tile_overlap_factor)) - blend_extent = int(self.tile_latent_min_size * self.tile_overlap_factor) - row_limit = self.tile_latent_min_size - blend_extent - - # Split the image into 512x512 tiles and encode them separately. - rows = [] - for i in range(0, x.shape[2], overlap_size): - row = [] - for j in range(0, x.shape[3], overlap_size): - tile = x[:, :, i : i + self.tile_sample_min_size, j : j + self.tile_sample_min_size] - tile = self.encoder(tile) - if self.config.use_quant_conv: - tile = self.quant_conv(tile) - row.append(tile) - rows.append(row) - result_rows = [] - for i, row in enumerate(rows): - result_row = [] - for j, tile in enumerate(row): - # blend the above tile and the left tile - # to the current tile and add the current tile to the result row - if i > 0: - tile = self.blend_v(rows[i - 1][j], tile, blend_extent) - if j > 0: - tile = self.blend_h(row[j - 1], tile, blend_extent) - result_row.append(tile[:, :, :row_limit, :row_limit]) - result_rows.append(torch.cat(result_row, dim=3)) - - enc = torch.cat(result_rows, dim=2) - return enc - - def tiled_encode(self, x: torch.Tensor, return_dict: bool = True) -> AutoencoderKLOutput: - deprecation_message = ( - "The tiled_encode implementation supporting the `return_dict` parameter is deprecated. In the future, the " - "implementation of this method will be replaced with that of `_tiled_encode` and you will no longer be able " - "to pass `return_dict`. You will also have to create a `DiagonalGaussianDistribution()` from the returned value." 
- ) - deprecate("tiled_encode", "1.0.0", deprecation_message, standard_warn=False) - - overlap_size = int(self.tile_sample_min_size * (1 - self.tile_overlap_factor)) - blend_extent = int(self.tile_latent_min_size * self.tile_overlap_factor) - row_limit = self.tile_latent_min_size - blend_extent - - # Split the image into 512x512 tiles and encode them separately. - rows = [] - for i in range(0, x.shape[2], overlap_size): - row = [] - for j in range(0, x.shape[3], overlap_size): - tile = x[:, :, i : i + self.tile_sample_min_size, j : j + self.tile_sample_min_size] - tile = self.encoder(tile) - if self.config.use_quant_conv: - tile = self.quant_conv(tile) - row.append(tile) - rows.append(row) - result_rows = [] - for i, row in enumerate(rows): - result_row = [] - for j, tile in enumerate(row): - # blend the above tile and the left tile - # to the current tile and add the current tile to the result row - if i > 0: - tile = self.blend_v(rows[i - 1][j], tile, blend_extent) - if j > 0: - tile = self.blend_h(row[j - 1], tile, blend_extent) - result_row.append(tile[:, :, :row_limit, :row_limit]) - result_rows.append(torch.cat(result_row, dim=3)) - - moments = torch.cat(result_rows, dim=2) - posterior = DiagonalGaussianDistribution(moments) - - if not return_dict: - return (posterior,) - - return AutoencoderKLOutput(latent_dist=posterior) - - def tiled_decode(self, z: torch.Tensor, return_dict: bool = True) -> Union[DecoderOutput, torch.Tensor]: - r""" - Decode a batch of images using a tiled decoder. - - Args: - z (`torch.Tensor`): Input batch of latent vectors. - return_dict (`bool`, *optional*, defaults to `True`): - Whether or not to return a [`~models.vae.DecoderOutput`] instead of a plain tuple. - - Returns: - [`~models.vae.DecoderOutput`] or `tuple`: - If return_dict is True, a [`~models.vae.DecoderOutput`] is returned, otherwise a plain `tuple` is - returned. - """ - overlap_size = int(self.tile_latent_min_size * (1 - self.tile_overlap_factor)) - blend_extent = int(self.tile_sample_min_size * self.tile_overlap_factor) - row_limit = self.tile_sample_min_size - blend_extent - - # Split z into overlapping 64x64 tiles and decode them separately. - # The tiles have an overlap to avoid seams between tiles. - rows = [] - for i in range(0, z.shape[2], overlap_size): - row = [] - for j in range(0, z.shape[3], overlap_size): - tile = z[:, :, i : i + self.tile_latent_min_size, j : j + self.tile_latent_min_size] - if self.config.use_post_quant_conv: - tile = self.post_quant_conv(tile) - decoded = self.decoder(tile) - row.append(decoded) - rows.append(row) - result_rows = [] - for i, row in enumerate(rows): - result_row = [] - for j, tile in enumerate(row): - # blend the above tile and the left tile - # to the current tile and add the current tile to the result row - if i > 0: - tile = self.blend_v(rows[i - 1][j], tile, blend_extent) - if j > 0: - tile = self.blend_h(row[j - 1], tile, blend_extent) - result_row.append(tile[:, :, :row_limit, :row_limit]) - result_rows.append(torch.cat(result_row, dim=3)) - - dec = torch.cat(result_rows, dim=2) - if not return_dict: - return (dec,) - - return DecoderOutput(sample=dec) - - def forward( - self, - sample: torch.Tensor, - sample_posterior: bool = False, - return_dict: bool = True, - generator: Optional[torch.Generator] = None, - ) -> Union[DecoderOutput, torch.Tensor]: - r""" - Args: - sample (`torch.Tensor`): Input sample. - sample_posterior (`bool`, *optional*, defaults to `False`): - Whether to sample from the posterior. 
- return_dict (`bool`, *optional*, defaults to `True`): - Whether or not to return a [`DecoderOutput`] instead of a plain tuple. - """ - x = sample - posterior = self.encode(x).latent_dist - if sample_posterior: - z = posterior.sample(generator=generator) - else: - z = posterior.mode() - dec = self.decode(z).sample - - if not return_dict: - return (dec,) - - return DecoderOutput(sample=dec) - - # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.fuse_qkv_projections - def fuse_qkv_projections(self): - """ - Enables fused QKV projections. For self-attention modules, all projection matrices (i.e., query, key, value) - are fused. For cross-attention modules, key and value projection matrices are fused. - - - - This API is 🧪 experimental. - - - """ - self.original_attn_processors = None - - for _, attn_processor in self.attn_processors.items(): - if "Added" in str(attn_processor.__class__.__name__): - raise ValueError("`fuse_qkv_projections()` is not supported for models having added KV projections.") - - self.original_attn_processors = self.attn_processors - - for module in self.modules(): - if isinstance(module, Attention): - module.fuse_projections(fuse=True) - - self.set_attn_processor(FusedAttnProcessor2_0()) - - # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.unfuse_qkv_projections - def unfuse_qkv_projections(self): - """Disables the fused QKV projection if enabled. - - - - This API is 🧪 experimental. - - - - """ - if self.original_attn_processors is not None: - self.set_attn_processor(self.original_attn_processors) \ No newline at end of file diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/vae/vae.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/vae/vae.py deleted file mode 100644 index c1abff7097..0000000000 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/vae/vae.py +++ /dev/null @@ -1,996 +0,0 @@ -# Copyright 2024 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# https://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from dataclasses import dataclass -from typing import Optional, Tuple - -import numpy as np -import torch -import torch.nn as nn - -from diffusers.utils import BaseOutput, is_torch_version -from diffusers.utils.torch_utils import randn_tensor -from diffusers.models.activations import get_activation -from diffusers.models.attention_processor import SpatialNorm -from diffusers.models.unets.unet_2d_blocks import ( - AutoencoderTinyBlock, - UNetMidBlock2D, - get_down_block, - get_up_block, -) - - -@dataclass -class EncoderOutput(BaseOutput): - r""" - Output of encoding method. - - Args: - latent (`torch.Tensor` of shape `(batch_size, num_channels, latent_height, latent_width)`): - The encoded latent. - """ - - latent: torch.Tensor - - -@dataclass -class DecoderOutput(BaseOutput): - r""" - Output of decoding method. - - Args: - sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)`): - The decoded output sample from the last layer of the model. 
- """ - - sample: torch.Tensor - commit_loss: Optional[torch.FloatTensor] = None - - -class Encoder(nn.Module): - r""" - The `Encoder` layer of a variational autoencoder that encodes its input into a latent representation. - - Args: - in_channels (`int`, *optional*, defaults to 3): - The number of input channels. - out_channels (`int`, *optional*, defaults to 3): - The number of output channels. - down_block_types (`Tuple[str, ...]`, *optional*, defaults to `("DownEncoderBlock2D",)`): - The types of down blocks to use. See `~diffusers.models.unet_2d_blocks.get_down_block` for available - options. - block_out_channels (`Tuple[int, ...]`, *optional*, defaults to `(64,)`): - The number of output channels for each block. - layers_per_block (`int`, *optional*, defaults to 2): - The number of layers per block. - norm_num_groups (`int`, *optional*, defaults to 32): - The number of groups for normalization. - act_fn (`str`, *optional*, defaults to `"silu"`): - The activation function to use. See `~diffusers.models.activations.get_activation` for available options. - double_z (`bool`, *optional*, defaults to `True`): - Whether to double the number of output channels for the last block. - """ - - def __init__( - self, - in_channels: int = 3, - out_channels: int = 3, - down_block_types: Tuple[str, ...] = ("DownEncoderBlock2D",), - block_out_channels: Tuple[int, ...] = (64,), - layers_per_block: int = 2, - norm_num_groups: int = 32, - act_fn: str = "silu", - double_z: bool = True, - mid_block_add_attention=True, - ): - super().__init__() - self.layers_per_block = layers_per_block - - self.conv_in = nn.Conv2d( - in_channels, - block_out_channels[0], - kernel_size=3, - stride=1, - padding=1, - ) - - self.down_blocks = nn.ModuleList([]) - - # down - output_channel = block_out_channels[0] - for i, down_block_type in enumerate(down_block_types): - input_channel = output_channel - output_channel = block_out_channels[i] - is_final_block = i == len(block_out_channels) - 1 - - down_block = get_down_block( - down_block_type, - num_layers=self.layers_per_block, - in_channels=input_channel, - out_channels=output_channel, - add_downsample=not is_final_block, - resnet_eps=1e-6, - downsample_padding=0, - resnet_act_fn=act_fn, - resnet_groups=norm_num_groups, - attention_head_dim=output_channel, - temb_channels=None, - ) - self.down_blocks.append(down_block) - - # mid - self.mid_block = UNetMidBlock2D( - in_channels=block_out_channels[-1], - resnet_eps=1e-6, - resnet_act_fn=act_fn, - output_scale_factor=1, - resnet_time_scale_shift="default", - attention_head_dim=block_out_channels[-1], - resnet_groups=norm_num_groups, - temb_channels=None, - add_attention=mid_block_add_attention, - ) - - # out - self.conv_norm_out = nn.GroupNorm(num_channels=block_out_channels[-1], num_groups=norm_num_groups, eps=1e-6) - self.conv_act = nn.SiLU() - - conv_out_channels = 2 * out_channels if double_z else out_channels - self.conv_out = nn.Conv2d(block_out_channels[-1], conv_out_channels, 3, padding=1) - - self.gradient_checkpointing = False - - def forward(self, sample: torch.Tensor) -> torch.Tensor: - r"""The forward method of the `Encoder` class.""" - - sample = self.conv_in(sample) - - if torch.is_grad_enabled() and self.gradient_checkpointing: - - def create_custom_forward(module): - def custom_forward(*inputs): - return module(*inputs) - - return custom_forward - - # down - if is_torch_version(">=", "1.11.0"): - for down_block in self.down_blocks: - sample = torch.utils.checkpoint.checkpoint( - create_custom_forward(down_block), 
sample, use_reentrant=False - ) - # middle - sample = torch.utils.checkpoint.checkpoint( - create_custom_forward(self.mid_block), sample, use_reentrant=False - ) - else: - for down_block in self.down_blocks: - sample = torch.utils.checkpoint.checkpoint(create_custom_forward(down_block), sample) - # middle - sample = torch.utils.checkpoint.checkpoint(create_custom_forward(self.mid_block), sample) - - else: - # down - for down_block in self.down_blocks: - sample = down_block(sample) - - # middle - sample = self.mid_block(sample) - - # post-process - sample = self.conv_norm_out(sample) - sample = self.conv_act(sample) - sample = self.conv_out(sample) - - return sample - - -class Decoder(nn.Module): - r""" - The `Decoder` layer of a variational autoencoder that decodes its latent representation into an output sample. - - Args: - in_channels (`int`, *optional*, defaults to 3): - The number of input channels. - out_channels (`int`, *optional*, defaults to 3): - The number of output channels. - up_block_types (`Tuple[str, ...]`, *optional*, defaults to `("UpDecoderBlock2D",)`): - The types of up blocks to use. See `~diffusers.models.unet_2d_blocks.get_up_block` for available options. - block_out_channels (`Tuple[int, ...]`, *optional*, defaults to `(64,)`): - The number of output channels for each block. - layers_per_block (`int`, *optional*, defaults to 2): - The number of layers per block. - norm_num_groups (`int`, *optional*, defaults to 32): - The number of groups for normalization. - act_fn (`str`, *optional*, defaults to `"silu"`): - The activation function to use. See `~diffusers.models.activations.get_activation` for available options. - norm_type (`str`, *optional*, defaults to `"group"`): - The normalization type to use. Can be either `"group"` or `"spatial"`. - """ - - def __init__( - self, - in_channels: int = 3, - out_channels: int = 3, - up_block_types: Tuple[str, ...] = ("UpDecoderBlock2D",), - block_out_channels: Tuple[int, ...] 
= (64,), - layers_per_block: int = 2, - norm_num_groups: int = 32, - act_fn: str = "silu", - norm_type: str = "group", # group, spatial - mid_block_add_attention=True, - ): - super().__init__() - self.layers_per_block = layers_per_block - - self.conv_in = nn.Conv2d( - in_channels, - block_out_channels[-1], - kernel_size=3, - stride=1, - padding=1, - ) - - self.up_blocks = nn.ModuleList([]) - - temb_channels = in_channels if norm_type == "spatial" else None - - # mid - self.mid_block = UNetMidBlock2D( - in_channels=block_out_channels[-1], - resnet_eps=1e-6, - resnet_act_fn=act_fn, - output_scale_factor=1, - resnet_time_scale_shift="default" if norm_type == "group" else norm_type, - attention_head_dim=block_out_channels[-1], - resnet_groups=norm_num_groups, - temb_channels=temb_channels, - add_attention=mid_block_add_attention, - ) - - # up - reversed_block_out_channels = list(reversed(block_out_channels)) - output_channel = reversed_block_out_channels[0] - for i, up_block_type in enumerate(up_block_types): - prev_output_channel = output_channel - output_channel = reversed_block_out_channels[i] - - is_final_block = i == len(block_out_channels) - 1 - - up_block = get_up_block( - up_block_type, - num_layers=self.layers_per_block + 1, - in_channels=prev_output_channel, - out_channels=output_channel, - prev_output_channel=None, - add_upsample=not is_final_block, - resnet_eps=1e-6, - resnet_act_fn=act_fn, - resnet_groups=norm_num_groups, - attention_head_dim=output_channel, - temb_channels=temb_channels, - resnet_time_scale_shift=norm_type, - ) - self.up_blocks.append(up_block) - prev_output_channel = output_channel - - # out - if norm_type == "spatial": - self.conv_norm_out = SpatialNorm(block_out_channels[0], temb_channels) - else: - self.conv_norm_out = nn.GroupNorm(num_channels=block_out_channels[0], num_groups=norm_num_groups, eps=1e-6) - self.conv_act = nn.SiLU() - self.conv_out = nn.Conv2d(block_out_channels[0], out_channels, 3, padding=1) - - self.gradient_checkpointing = False - - def forward( - self, - sample: torch.Tensor, - latent_embeds: Optional[torch.Tensor] = None, - ) -> torch.Tensor: - r"""The forward method of the `Decoder` class.""" - - sample = self.conv_in(sample) - - upscale_dtype = next(iter(self.up_blocks.parameters())).dtype - if torch.is_grad_enabled() and self.gradient_checkpointing: - - def create_custom_forward(module): - def custom_forward(*inputs): - return module(*inputs) - - return custom_forward - - if is_torch_version(">=", "1.11.0"): - # middle - sample = torch.utils.checkpoint.checkpoint( - create_custom_forward(self.mid_block), - sample, - latent_embeds, - use_reentrant=False, - ) - sample = sample.to(upscale_dtype) - - # up - for up_block in self.up_blocks: - sample = torch.utils.checkpoint.checkpoint( - create_custom_forward(up_block), - sample, - latent_embeds, - use_reentrant=False, - ) - else: - # middle - sample = torch.utils.checkpoint.checkpoint( - create_custom_forward(self.mid_block), sample, latent_embeds - ) - sample = sample.to(upscale_dtype) - - # up - for up_block in self.up_blocks: - sample = torch.utils.checkpoint.checkpoint(create_custom_forward(up_block), sample, latent_embeds) - else: - # middle - sample = self.mid_block(sample, latent_embeds) - sample = sample.to(upscale_dtype) - - # up - for up_block in self.up_blocks: - sample = up_block(sample, latent_embeds) - - # post-process - if latent_embeds is None: - sample = self.conv_norm_out(sample) - else: - sample = self.conv_norm_out(sample, latent_embeds) - sample = self.conv_act(sample) 
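# Added descriptive note (commentary, not part of the original source): the
# `latent_embeds` branch just above exists because `conv_norm_out` is a
# SpatialNorm when norm_type == "spatial" and must be conditioned on
# `latent_embeds`; with the default "group" norm, `latent_embeds` is None and
# the plain GroupNorm call is used instead.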
- sample = self.conv_out(sample) - - return sample - - -class UpSample(nn.Module): - r""" - The `UpSample` layer of a variational autoencoder that upsamples its input. - - Args: - in_channels (`int`, *optional*, defaults to 3): - The number of input channels. - out_channels (`int`, *optional*, defaults to 3): - The number of output channels. - """ - - def __init__( - self, - in_channels: int, - out_channels: int, - ) -> None: - super().__init__() - self.in_channels = in_channels - self.out_channels = out_channels - self.deconv = nn.ConvTranspose2d(in_channels, out_channels, kernel_size=4, stride=2, padding=1) - - def forward(self, x: torch.Tensor) -> torch.Tensor: - r"""The forward method of the `UpSample` class.""" - x = torch.relu(x) - x = self.deconv(x) - return x - - -class MaskConditionEncoder(nn.Module): - """ - used in AsymmetricAutoencoderKL - """ - - def __init__( - self, - in_ch: int, - out_ch: int = 192, - res_ch: int = 768, - stride: int = 16, - ) -> None: - super().__init__() - - channels = [] - while stride > 1: - stride = stride // 2 - in_ch_ = out_ch * 2 - if out_ch > res_ch: - out_ch = res_ch - if stride == 1: - in_ch_ = res_ch - channels.append((in_ch_, out_ch)) - out_ch *= 2 - - out_channels = [] - for _in_ch, _out_ch in channels: - out_channels.append(_out_ch) - out_channels.append(channels[-1][0]) - - layers = [] - in_ch_ = in_ch - for i, _ in enumerate(out_channels): - out_ch_ = out_channels[i] - if i == 0 or i == 1: - layers.append(nn.Conv2d(in_ch_, out_ch_, kernel_size=3, stride=1, padding=1)) - else: - layers.append(nn.Conv2d(in_ch_, out_ch_, kernel_size=4, stride=2, padding=1)) - in_ch_ = out_ch_ - - self.layers = nn.Sequential(*layers) - - def forward(self, x: torch.Tensor, mask=None) -> torch.Tensor: - r"""The forward method of the `MaskConditionEncoder` class.""" - out = {} - for i, _ in enumerate(self.layers): - layer = self.layers[i] - x = layer(x) - out[str(tuple(x.shape))] = x - x = torch.relu(x) - return out - - -class MaskConditionDecoder(nn.Module): - r"""The `MaskConditionDecoder` should be used in combination with [`AsymmetricAutoencoderKL`] to enhance the model's - decoder with a conditioner on the mask and masked image. - - Args: - in_channels (`int`, *optional*, defaults to 3): - The number of input channels. - out_channels (`int`, *optional*, defaults to 3): - The number of output channels. - up_block_types (`Tuple[str, ...]`, *optional*, defaults to `("UpDecoderBlock2D",)`): - The types of up blocks to use. See `~diffusers.models.unet_2d_blocks.get_up_block` for available options. - block_out_channels (`Tuple[int, ...]`, *optional*, defaults to `(64,)`): - The number of output channels for each block. - layers_per_block (`int`, *optional*, defaults to 2): - The number of layers per block. - norm_num_groups (`int`, *optional*, defaults to 32): - The number of groups for normalization. - act_fn (`str`, *optional*, defaults to `"silu"`): - The activation function to use. See `~diffusers.models.activations.get_activation` for available options. - norm_type (`str`, *optional*, defaults to `"group"`): - The normalization type to use. Can be either `"group"` or `"spatial"`. - """ - - def __init__( - self, - in_channels: int = 3, - out_channels: int = 3, - up_block_types: Tuple[str, ...] = ("UpDecoderBlock2D",), - block_out_channels: Tuple[int, ...] 
= (64,), - layers_per_block: int = 2, - norm_num_groups: int = 32, - act_fn: str = "silu", - norm_type: str = "group", # group, spatial - ): - super().__init__() - self.layers_per_block = layers_per_block - - self.conv_in = nn.Conv2d( - in_channels, - block_out_channels[-1], - kernel_size=3, - stride=1, - padding=1, - ) - - self.up_blocks = nn.ModuleList([]) - - temb_channels = in_channels if norm_type == "spatial" else None - - # mid - self.mid_block = UNetMidBlock2D( - in_channels=block_out_channels[-1], - resnet_eps=1e-6, - resnet_act_fn=act_fn, - output_scale_factor=1, - resnet_time_scale_shift="default" if norm_type == "group" else norm_type, - attention_head_dim=block_out_channels[-1], - resnet_groups=norm_num_groups, - temb_channels=temb_channels, - ) - - # up - reversed_block_out_channels = list(reversed(block_out_channels)) - output_channel = reversed_block_out_channels[0] - for i, up_block_type in enumerate(up_block_types): - prev_output_channel = output_channel - output_channel = reversed_block_out_channels[i] - - is_final_block = i == len(block_out_channels) - 1 - - up_block = get_up_block( - up_block_type, - num_layers=self.layers_per_block + 1, - in_channels=prev_output_channel, - out_channels=output_channel, - prev_output_channel=None, - add_upsample=not is_final_block, - resnet_eps=1e-6, - resnet_act_fn=act_fn, - resnet_groups=norm_num_groups, - attention_head_dim=output_channel, - temb_channels=temb_channels, - resnet_time_scale_shift=norm_type, - ) - self.up_blocks.append(up_block) - prev_output_channel = output_channel - - # condition encoder - self.condition_encoder = MaskConditionEncoder( - in_ch=out_channels, - out_ch=block_out_channels[0], - res_ch=block_out_channels[-1], - ) - - # out - if norm_type == "spatial": - self.conv_norm_out = SpatialNorm(block_out_channels[0], temb_channels) - else: - self.conv_norm_out = nn.GroupNorm(num_channels=block_out_channels[0], num_groups=norm_num_groups, eps=1e-6) - self.conv_act = nn.SiLU() - self.conv_out = nn.Conv2d(block_out_channels[0], out_channels, 3, padding=1) - - self.gradient_checkpointing = False - - def forward( - self, - z: torch.Tensor, - image: Optional[torch.Tensor] = None, - mask: Optional[torch.Tensor] = None, - latent_embeds: Optional[torch.Tensor] = None, - ) -> torch.Tensor: - r"""The forward method of the `MaskConditionDecoder` class.""" - sample = z - sample = self.conv_in(sample) - - upscale_dtype = next(iter(self.up_blocks.parameters())).dtype - if torch.is_grad_enabled() and self.gradient_checkpointing: - - def create_custom_forward(module): - def custom_forward(*inputs): - return module(*inputs) - - return custom_forward - - if is_torch_version(">=", "1.11.0"): - # middle - sample = torch.utils.checkpoint.checkpoint( - create_custom_forward(self.mid_block), - sample, - latent_embeds, - use_reentrant=False, - ) - sample = sample.to(upscale_dtype) - - # condition encoder - if image is not None and mask is not None: - masked_image = (1 - mask) * image - im_x = torch.utils.checkpoint.checkpoint( - create_custom_forward(self.condition_encoder), - masked_image, - mask, - use_reentrant=False, - ) - - # up - for up_block in self.up_blocks: - if image is not None and mask is not None: - sample_ = im_x[str(tuple(sample.shape))] - mask_ = nn.functional.interpolate(mask, size=sample.shape[-2:], mode="nearest") - sample = sample * mask_ + sample_ * (1 - mask_) - sample = torch.utils.checkpoint.checkpoint( - create_custom_forward(up_block), - sample, - latent_embeds, - use_reentrant=False, - ) - if image is not None 
and mask is not None: - sample = sample * mask + im_x[str(tuple(sample.shape))] * (1 - mask) - else: - # middle - sample = torch.utils.checkpoint.checkpoint( - create_custom_forward(self.mid_block), sample, latent_embeds - ) - sample = sample.to(upscale_dtype) - - # condition encoder - if image is not None and mask is not None: - masked_image = (1 - mask) * image - im_x = torch.utils.checkpoint.checkpoint( - create_custom_forward(self.condition_encoder), - masked_image, - mask, - ) - - # up - for up_block in self.up_blocks: - if image is not None and mask is not None: - sample_ = im_x[str(tuple(sample.shape))] - mask_ = nn.functional.interpolate(mask, size=sample.shape[-2:], mode="nearest") - sample = sample * mask_ + sample_ * (1 - mask_) - sample = torch.utils.checkpoint.checkpoint(create_custom_forward(up_block), sample, latent_embeds) - if image is not None and mask is not None: - sample = sample * mask + im_x[str(tuple(sample.shape))] * (1 - mask) - else: - # middle - sample = self.mid_block(sample, latent_embeds) - sample = sample.to(upscale_dtype) - - # condition encoder - if image is not None and mask is not None: - masked_image = (1 - mask) * image - im_x = self.condition_encoder(masked_image, mask) - - # up - for up_block in self.up_blocks: - if image is not None and mask is not None: - sample_ = im_x[str(tuple(sample.shape))] - mask_ = nn.functional.interpolate(mask, size=sample.shape[-2:], mode="nearest") - sample = sample * mask_ + sample_ * (1 - mask_) - sample = up_block(sample, latent_embeds) - if image is not None and mask is not None: - sample = sample * mask + im_x[str(tuple(sample.shape))] * (1 - mask) - - # post-process - if latent_embeds is None: - sample = self.conv_norm_out(sample) - else: - sample = self.conv_norm_out(sample, latent_embeds) - sample = self.conv_act(sample) - sample = self.conv_out(sample) - - return sample - - -class VectorQuantizer(nn.Module): - """ - Improved version over VectorQuantizer, can be used as a drop-in replacement. Mostly avoids costly matrix - multiplications and allows for post-hoc remapping of indices. - """ - - # NOTE: due to a bug the beta term was applied to the wrong term. for - # backwards compatibility we use the buggy version by default, but you can - # specify legacy=False to fix it. - def __init__( - self, - n_e: int, - vq_embed_dim: int, - beta: float, - remap=None, - unknown_index: str = "random", - sane_index_shape: bool = False, - legacy: bool = True, - ): - super().__init__() - self.n_e = n_e - self.vq_embed_dim = vq_embed_dim - self.beta = beta - self.legacy = legacy - - self.embedding = nn.Embedding(self.n_e, self.vq_embed_dim) - self.embedding.weight.data.uniform_(-1.0 / self.n_e, 1.0 / self.n_e) - - self.remap = remap - if self.remap is not None: - self.register_buffer("used", torch.tensor(np.load(self.remap))) - self.used: torch.Tensor - self.re_embed = self.used.shape[0] - self.unknown_index = unknown_index # "random" or "extra" or integer - if self.unknown_index == "extra": - self.unknown_index = self.re_embed - self.re_embed = self.re_embed + 1 - print( - f"Remapping {self.n_e} indices to {self.re_embed} indices. " - f"Using {self.unknown_index} for unknown indices." 
- ) - else: - self.re_embed = n_e - - self.sane_index_shape = sane_index_shape - - def remap_to_used(self, inds: torch.LongTensor) -> torch.LongTensor: - ishape = inds.shape - inds = inds.reshape(ishape[0], -1) - used = self.used.to(inds) - match = (inds[:, :, None] == used[None, None, ...]).long() - new = match.argmax(-1) - unknown = match.sum(2) < 1 - if self.unknown_index == "random": - new[unknown] = torch.randint(0, self.re_embed, size=new[unknown].shape).to(device=new.device) - else: - new[unknown] = self.unknown_index - return new.reshape(ishape) - - def unmap_to_all(self, inds: torch.LongTensor) -> torch.LongTensor: - ishape = inds.shape - inds = inds.reshape(ishape[0], -1) - used = self.used.to(inds) - if self.re_embed > self.used.shape[0]: # extra token - inds[inds >= self.used.shape[0]] = 0 # simply set to zero - back = torch.gather(used[None, :][inds.shape[0] * [0], :], 1, inds) - return back.reshape(ishape) - - def forward(self, z: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, Tuple]: - # reshape z -> (batch, height, width, channel) and flatten - z = z.permute(0, 2, 3, 1).contiguous() - z_flattened = z.view(-1, self.vq_embed_dim) - - # distances from z to embeddings e_j (z - e)^2 = z^2 + e^2 - 2 e * z - min_encoding_indices = torch.argmin(torch.cdist(z_flattened, self.embedding.weight), dim=1) - - z_q = self.embedding(min_encoding_indices).view(z.shape) - perplexity = None - min_encodings = None - - # compute loss for embedding - if not self.legacy: - loss = self.beta * torch.mean((z_q.detach() - z) ** 2) + torch.mean((z_q - z.detach()) ** 2) - else: - loss = torch.mean((z_q.detach() - z) ** 2) + self.beta * torch.mean((z_q - z.detach()) ** 2) - - # preserve gradients - z_q: torch.Tensor = z + (z_q - z).detach() - - # reshape back to match original input shape - z_q = z_q.permute(0, 3, 1, 2).contiguous() - - if self.remap is not None: - min_encoding_indices = min_encoding_indices.reshape(z.shape[0], -1) # add batch axis - min_encoding_indices = self.remap_to_used(min_encoding_indices) - min_encoding_indices = min_encoding_indices.reshape(-1, 1) # flatten - - if self.sane_index_shape: - min_encoding_indices = min_encoding_indices.reshape(z_q.shape[0], z_q.shape[2], z_q.shape[3]) - - return z_q, loss, (perplexity, min_encodings, min_encoding_indices) - - def get_codebook_entry(self, indices: torch.LongTensor, shape: Tuple[int, ...]) -> torch.Tensor: - # shape specifying (batch, height, width, channel) - if self.remap is not None: - indices = indices.reshape(shape[0], -1) # add batch axis - indices = self.unmap_to_all(indices) - indices = indices.reshape(-1) # flatten again - - # get quantized latent vectors - z_q: torch.Tensor = self.embedding(indices) - - if shape is not None: - z_q = z_q.view(shape) - # reshape back to match original input shape - z_q = z_q.permute(0, 3, 1, 2).contiguous() - - return z_q - - -class DiagonalGaussianDistribution(object): - def __init__(self, parameters: torch.Tensor, deterministic: bool = False): - self.parameters = parameters - self.mean, self.logvar = torch.chunk(parameters, 2, dim=1) - self.logvar = torch.clamp(self.logvar, -30.0, 20.0) - self.deterministic = deterministic - self.std = torch.exp(0.5 * self.logvar) - self.var = torch.exp(self.logvar) - if self.deterministic: - self.var = self.std = torch.zeros_like( - self.mean, device=self.parameters.device, dtype=self.parameters.dtype - ) - - def sample(self, generator: Optional[torch.Generator] = None) -> torch.Tensor: - # make sure sample is on the same device as the parameters and 
has same dtype - sample = randn_tensor( - self.mean.shape, - generator=generator, - device=self.parameters.device, - dtype=self.parameters.dtype, - ) - x = self.mean + self.std * sample - return x - - def kl(self, other: "DiagonalGaussianDistribution" = None) -> torch.Tensor: - if self.deterministic: - return torch.Tensor([0.0]) - else: - if other is None: - return 0.5 * torch.sum( - torch.pow(self.mean, 2) + self.var - 1.0 - self.logvar, - dim=[1, 2, 3], - ) - else: - return 0.5 * torch.sum( - torch.pow(self.mean - other.mean, 2) / other.var - + self.var / other.var - - 1.0 - - self.logvar - + other.logvar, - dim=[1, 2, 3], - ) - - def nll(self, sample: torch.Tensor, dims: Tuple[int, ...] = None) -> torch.Tensor: - if dims is None: - dims = [1, 2, 3] - if self.deterministic: - return torch.Tensor([0.0]) - logtwopi = np.log(2.0 * np.pi) - return 0.5 * torch.sum( - logtwopi + self.logvar + torch.pow(sample - self.mean, 2) / self.var, - dim=dims, - ) - - def mode(self) -> torch.Tensor: - return self.mean - - -class EncoderTiny(nn.Module): - r""" - The `EncoderTiny` layer is a simpler version of the `Encoder` layer. - - Args: - in_channels (`int`): - The number of input channels. - out_channels (`int`): - The number of output channels. - num_blocks (`Tuple[int, ...]`): - Each value of the tuple represents a Conv2d layer followed by `value` number of `AutoencoderTinyBlock`'s to - use. - block_out_channels (`Tuple[int, ...]`): - The number of output channels for each block. - act_fn (`str`): - The activation function to use. See `~diffusers.models.activations.get_activation` for available options. - """ - - def __init__( - self, - in_channels: int, - out_channels: int, - num_blocks: Tuple[int, ...], - block_out_channels: Tuple[int, ...], - act_fn: str, - ): - super().__init__() - - layers = [] - for i, num_block in enumerate(num_blocks): - num_channels = block_out_channels[i] - - if i == 0: - layers.append(nn.Conv2d(in_channels, num_channels, kernel_size=3, padding=1)) - else: - layers.append( - nn.Conv2d( - num_channels, - num_channels, - kernel_size=3, - padding=1, - stride=2, - bias=False, - ) - ) - - for _ in range(num_block): - layers.append(AutoencoderTinyBlock(num_channels, num_channels, act_fn)) - - layers.append(nn.Conv2d(block_out_channels[-1], out_channels, kernel_size=3, padding=1)) - - self.layers = nn.Sequential(*layers) - self.gradient_checkpointing = False - - def forward(self, x: torch.Tensor) -> torch.Tensor: - r"""The forward method of the `EncoderTiny` class.""" - if torch.is_grad_enabled() and self.gradient_checkpointing: - - def create_custom_forward(module): - def custom_forward(*inputs): - return module(*inputs) - - return custom_forward - - if is_torch_version(">=", "1.11.0"): - x = torch.utils.checkpoint.checkpoint(create_custom_forward(self.layers), x, use_reentrant=False) - else: - x = torch.utils.checkpoint.checkpoint(create_custom_forward(self.layers), x) - - else: - # scale image from [-1, 1] to [0, 1] to match TAESD convention - x = self.layers(x.add(1).div(2)) - - return x - - -class DecoderTiny(nn.Module): - r""" - The `DecoderTiny` layer is a simpler version of the `Decoder` layer. - - Args: - in_channels (`int`): - The number of input channels. - out_channels (`int`): - The number of output channels. - num_blocks (`Tuple[int, ...]`): - Each value of the tuple represents a Conv2d layer followed by `value` number of `AutoencoderTinyBlock`'s to - use. - block_out_channels (`Tuple[int, ...]`): - The number of output channels for each block. 
- upsampling_scaling_factor (`int`): - The scaling factor to use for upsampling. - act_fn (`str`): - The activation function to use. See `~diffusers.models.activations.get_activation` for available options. - """ - - def __init__( - self, - in_channels: int, - out_channels: int, - num_blocks: Tuple[int, ...], - block_out_channels: Tuple[int, ...], - upsampling_scaling_factor: int, - act_fn: str, - upsample_fn: str, - ): - super().__init__() - - layers = [ - nn.Conv2d(in_channels, block_out_channels[0], kernel_size=3, padding=1), - get_activation(act_fn), - ] - - for i, num_block in enumerate(num_blocks): - is_final_block = i == (len(num_blocks) - 1) - num_channels = block_out_channels[i] - - for _ in range(num_block): - layers.append(AutoencoderTinyBlock(num_channels, num_channels, act_fn)) - - if not is_final_block: - layers.append(nn.Upsample(scale_factor=upsampling_scaling_factor, mode=upsample_fn)) - - conv_out_channel = num_channels if not is_final_block else out_channels - layers.append( - nn.Conv2d( - num_channels, - conv_out_channel, - kernel_size=3, - padding=1, - bias=is_final_block, - ) - ) - - self.layers = nn.Sequential(*layers) - self.gradient_checkpointing = False - - def forward(self, x: torch.Tensor) -> torch.Tensor: - r"""The forward method of the `DecoderTiny` class.""" - # Clamp. - x = torch.tanh(x / 3) * 3 - - if torch.is_grad_enabled() and self.gradient_checkpointing: - - def create_custom_forward(module): - def custom_forward(*inputs): - return module(*inputs) - - return custom_forward - - if is_torch_version(">=", "1.11.0"): - x = torch.utils.checkpoint.checkpoint(create_custom_forward(self.layers), x, use_reentrant=False) - else: - x = torch.utils.checkpoint.checkpoint(create_custom_forward(self.layers), x) - - else: - x = self.layers(x) - - # scale image from [0, 1] to [-1, 1] to match diffusers convention - return x.mul(2).sub(1) \ No newline at end of file -- Gitee From f840981328864580cc1e9fa79643529b7a2f71c1 Mon Sep 17 00:00:00 2001 From: jiangmengyu Date: Tue, 31 Dec 2024 11:26:37 +0800 Subject: [PATCH 62/91] =?UTF-8?q?cogview3=E9=80=82=E9=85=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../built-in/foundation/cogview3/cogview3plus/__init__.py | 2 +- .../built-in/foundation/cogview3/cogview3plus/vae/__init__.py | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/__init__.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/__init__.py index 8cfcd60a5b..14327efb5c 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/__init__.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/__init__.py @@ -15,6 +15,6 @@ # limitations under the License. 
from .pipeline import CogView3PlusPipeline, DiffusionPipeline -from .vae import AutoencoderKL +from .vae import AutoencoderKL, ModelMixin from .schedulers import CogVideoXDDIMScheduler, SchedulerMixin from .models import CogView3PlusTransformer2DModel, ModelMixin \ No newline at end of file diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/vae/__init__.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/vae/__init__.py index 261968dc69..e9a931d8eb 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/vae/__init__.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/vae/__init__.py @@ -1 +1,2 @@ -from diffusers import AutoencoderKL \ No newline at end of file +from diffusers import AutoencoderKL +from ..models.modeling_utils import ModelMixin \ No newline at end of file -- Gitee From 7155a471fff922073a2f6aa15b800225fdfa8c06 Mon Sep 17 00:00:00 2001 From: jiangmengyu Date: Tue, 31 Dec 2024 11:30:38 +0800 Subject: [PATCH 63/91] =?UTF-8?q?cogview3=E9=80=82=E9=85=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../built-in/foundation/cogview3/cogview3plus/__init__.py | 3 ++- .../built-in/foundation/cogview3/cogview3plus/vae/__init__.py | 2 -- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/__init__.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/__init__.py index 14327efb5c..dc22483005 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/__init__.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/__init__.py @@ -14,7 +14,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+from diffusers import AutoencoderKL + from .pipeline import CogView3PlusPipeline, DiffusionPipeline -from .vae import AutoencoderKL, ModelMixin from .schedulers import CogVideoXDDIMScheduler, SchedulerMixin from .models import CogView3PlusTransformer2DModel, ModelMixin \ No newline at end of file diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/vae/__init__.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/vae/__init__.py index e9a931d8eb..e69de29bb2 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/vae/__init__.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/vae/__init__.py @@ -1,2 +0,0 @@ -from diffusers import AutoencoderKL -from ..models.modeling_utils import ModelMixin \ No newline at end of file -- Gitee From b4ecaf571cefe8200e04e3c00e4f3dec6fc4af55 Mon Sep 17 00:00:00 2001 From: jiangmengyu Date: Tue, 31 Dec 2024 11:32:54 +0800 Subject: [PATCH 64/91] =?UTF-8?q?cogview3=E9=80=82=E9=85=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py index 05f6ddf53a..ed439ecb29 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py @@ -23,8 +23,8 @@ from diffusers.image_processor import VaeImageProcessor from diffusers.pipelines.pipeline_utils import DiffusionPipeline from diffusers.utils import logging from diffusers.utils.torch_utils import randn_tensor +from diffusers import AutoencoderKL -from ..vae import AutoencoderKL from ..models import CogView3PlusTransformer2DModel from ..schedulers import CogVideoXDDIMScheduler from .pipeline_output import CogView3PipelineOutput -- Gitee From 4ded09c54a56aaef51bd6badf8b13cad454856f7 Mon Sep 17 00:00:00 2001 From: jiangmengyu Date: Tue, 31 Dec 2024 11:34:18 +0800 Subject: [PATCH 65/91] =?UTF-8?q?cogview3=E9=80=82=E9=85=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../built-in/foundation/cogview3/cogview3plus/__init__.py | 1 - 1 file changed, 1 deletion(-) diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/__init__.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/__init__.py index dc22483005..de500743d8 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/__init__.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/__init__.py @@ -14,7 +14,6 @@ # See the License for the specific language governing permissions and # limitations under the License. 
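# Illustrative sketch (added for clarity, not part of any patch above): with the
# custom VAE modules removed, the pipeline relies on the stock diffusers
# AutoencoderKL. A minimal standalone round trip showing the slicing/tiling
# switches and encode/decode that the deleted vae code used to implement; the
# checkpoint path below is a placeholder assumption.
import torch
from diffusers import AutoencoderKL

vae = AutoencoderKL.from_pretrained("path/to/CogView3-Plus-3B/vae")  # assumed local path
vae.enable_slicing()  # process the batch one sample at a time to save memory
vae.enable_tiling()   # split large images into overlapping, blended tiles

images = torch.randn(1, 3, 1024, 1024)           # toy input in [-1, 1]
posterior = vae.encode(images).latent_dist       # DiagonalGaussianDistribution
latents = posterior.sample()
reconstruction = vae.decode(latents).sample      # tile seams are blended away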
-from diffusers import AutoencoderKL from .pipeline import CogView3PlusPipeline, DiffusionPipeline from .schedulers import CogVideoXDDIMScheduler, SchedulerMixin -- Gitee From 047be8e7198f861ef44f7dcc0cb220add719fc5d Mon Sep 17 00:00:00 2001 From: jiangmengyu Date: Tue, 31 Dec 2024 11:58:14 +0800 Subject: [PATCH 66/91] =?UTF-8?q?cogview3=E9=80=82=E9=85=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../built-in/foundation/cogview3/README.md | 2 +- .../models/transformer_cogview3plus.py | 5 +-- .../pipeline/pipeline_cogview3plus.py | 3 +- .../schedulers/scheduling_ddim_cogvideox.py | 1 + .../schedulers/scheduling_utils.py | 37 ------------------- 5 files changed, 5 insertions(+), 43 deletions(-) diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/README.md b/MindIE/MindIE-Torch/built-in/foundation/cogview3/README.md index 16592703d8..e161ab5c10 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/README.md +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/README.md @@ -96,7 +96,7 @@ https://huggingface.co/THUDM/CogView3-Plus-3B/tree/main "CogView3PlusTransformer2DModel" ], "vae": [ - "cogview3plus", + "diffusers", "AutoencoderKL" ] } diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/transformer_cogview3plus.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/transformer_cogview3plus.py index 6f4fa104b8..bd2482b587 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/transformer_cogview3plus.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/transformer_cogview3plus.py @@ -255,7 +255,6 @@ class CogView3PlusTransformer2DModel(ModelMixin, ConfigMixin): original_size: torch.Tensor, target_size: torch.Tensor, crop_coords: torch.Tensor, - t_idx: int, ) -> Union[torch.Tensor, Transformer2DModelOutput]: hidden_states = states[0] encoder_hidden_states = states[1] @@ -270,7 +269,7 @@ class CogView3PlusTransformer2DModel(ModelMixin, ConfigMixin): encoder_hidden_states = hidden_states[:, :text_seq_length] hidden_states = hidden_states[:, text_seq_length:] - hidden_states, encoder_hidden_states = self._forward_blocks(hidden_states, encoder_hidden_states, emb, t_idx) + hidden_states, encoder_hidden_states = self._forward_blocks(hidden_states, encoder_hidden_states, emb, states[2]) hidden_states = self.norm_out(hidden_states, emb) hidden_states = self.proj_out(hidden_states) # (batch_size, height*width, patch_size*patch_size*out_channels) @@ -387,7 +386,7 @@ class CogView3PlusTransformer2DModel(ModelMixin, ConfigMixin): self.v_bias_cache = weights.pop(f"transformer_blocks.{i}.attn1.to_v.bias", None) qk_weight_cache = self.q_weight_cache is not None and self.k_weight_cache is not None - if qk_weight_cache and self.v_weight_cache is not None: + if qk_weight_cache and self.v_weight_cache is not None: qkv_weight = torch.cat( [self.q_weight_cache, self.k_weight_cache, self.v_weight_cache], dim=0 diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py index ed439ecb29..82276b2cd9 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py @@ -311,12 +311,11 @@ class CogView3PlusPipeline(DiffusionPipeline): # predict noise 
model_output noise_pred = self.transformer( - states=(latent_model_input, prompt_embeds), + states=(latent_model_input, prompt_embeds, i), timestep=timestep, original_size=original_size, target_size=target_size, crop_coords=crops_coords_top_left, - t_idx=i, )[0] noise_pred = noise_pred.float() diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/schedulers/scheduling_ddim_cogvideox.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/schedulers/scheduling_ddim_cogvideox.py index f94de1c81d..9b9a4f051e 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/schedulers/scheduling_ddim_cogvideox.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/schedulers/scheduling_ddim_cogvideox.py @@ -42,6 +42,7 @@ class DDIMSchedulerOutput(BaseOutput): prev_sample: torch.Tensor pred_original_sample: Optional[torch.Tensor] = None + def betas_for_alpha_bar( num_diffusion_timesteps, max_beta=0.999, diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/schedulers/scheduling_utils.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/schedulers/scheduling_utils.py index ae88225358..cd59e45bdf 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/schedulers/scheduling_utils.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/schedulers/scheduling_utils.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -import importlib import os from dataclasses import dataclass from enum import Enum @@ -92,39 +91,3 @@ class SchedulerMixin(PushToHubMixin): **kwargs, ) return cls.from_config(config, return_unused_kwargs=return_unused_kwargs, **kwargs) - - def save_pretrained(self, save_directory: Union[str, os.PathLike], push_to_hub: bool = False, **kwargs): - """ - Save a scheduler configuration object to a directory so that it can be reloaded using the - [`~SchedulerMixin.from_pretrained`] class method. - - Args: - save_directory (`str` or `os.PathLike`): - Directory where the configuration JSON file will be saved (will be created if it does not exist). - push_to_hub (`bool`, *optional*, defaults to `False`): - Whether or not to push your model to the Hugging Face Hub after saving it. You can specify the - repository you want to push to with `repo_id` (will default to the name of `save_directory` in your - namespace). - kwargs (`Dict[str, Any]`, *optional*): - Additional keyword arguments passed along to the [`~utils.PushToHubMixin.push_to_hub`] method. 
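# Clarifying note on the transformer and pipeline hunks in this patch (added
# commentary, not patch content): the per-step index consumed by
# `_forward_blocks` now travels inside the `states` tuple instead of a separate
# `t_idx` keyword, so the layout is
#
#   states = (latent_model_input, prompt_embeds, i)
#            # states[0]: hidden_states
#            # states[1]: encoder_hidden_states
#            # states[2]: denoising-step index passed to _forward_blocks
#
# and CogView3PlusTransformer2DModel.forward drops its `t_idx` parameter.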
- """ - self.save_config(save_directory=save_directory, push_to_hub=push_to_hub, **kwargs) - - @property - def compatibles(self): - """ - Returns all schedulers that are compatible with this scheduler - - Returns: - `List[SchedulerMixin]`: List of compatible schedulers - """ - return self._get_compatibles() - - @classmethod - def _get_compatibles(cls): - compatible_classes_str = list(set([cls.__name__] + cls._compatibles)) - diffusers_library = importlib.import_module(__name__.split(".")[0]) - compatible_classes = [ - getattr(diffusers_library, c) for c in compatible_classes_str if hasattr(diffusers_library, c) - ] - return compatible_classes \ No newline at end of file -- Gitee From ce0e6f9a428ef490bb39ff10e3224f76fefedfc2 Mon Sep 17 00:00:00 2001 From: jiangmengyu Date: Tue, 31 Dec 2024 12:48:35 +0800 Subject: [PATCH 67/91] =?UTF-8?q?cogview3=E9=80=82=E9=85=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../schedulers/scheduling_utils.py | 20 +++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/schedulers/scheduling_utils.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/schedulers/scheduling_utils.py index cd59e45bdf..d854366c77 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/schedulers/scheduling_utils.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/schedulers/scheduling_utils.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +import importlib import os from dataclasses import dataclass from enum import Enum @@ -91,3 +92,22 @@ class SchedulerMixin(PushToHubMixin): **kwargs, ) return cls.from_config(config, return_unused_kwargs=return_unused_kwargs, **kwargs) + + @property + def compatibles(self): + """ + Returns all schedulers that are compatible with this scheduler + + Returns: + `List[SchedulerMixin]`: List of compatible schedulers + """ + return self._get_compatibles() + + @classmethod + def _get_compatibles(cls): + compatible_classes_str = list(set([cls.__name__] + cls._compatibles)) + diffusers_library = importlib.import_module(__name__.split(".")[0]) + compatible_classes = [ + getattr(diffusers_library, c) for c in compatible_classes_str if hasattr(diffusers_library, c) + ] + return compatible_classes \ No newline at end of file -- Gitee From 1caf467868b2d537166630ab3ee6e9d77e8805b0 Mon Sep 17 00:00:00 2001 From: jiangmengyu Date: Tue, 31 Dec 2024 14:13:25 +0800 Subject: [PATCH 68/91] =?UTF-8?q?cogview3=E9=80=82=E9=85=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../cogview3/cogview3plus/__init__.py | 17 ----------------- .../cogview3/cogview3plus/layers/embeddings.py | 4 +--- .../cogview3/cogview3plus/layers/linear.py | 2 +- 3 files changed, 2 insertions(+), 21 deletions(-) diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/__init__.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/__init__.py index de500743d8..1139593a36 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/__init__.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/__init__.py @@ -1,20 +1,3 @@ -#!/usr/bin/env python -# coding=utf-8 -# Copyright 2024 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# https://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - from .pipeline import CogView3PlusPipeline, DiffusionPipeline from .schedulers import CogVideoXDDIMScheduler, SchedulerMixin from .models import CogView3PlusTransformer2DModel, ModelMixin \ No newline at end of file diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/layers/embeddings.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/layers/embeddings.py index 72418f08b3..129384dffc 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/layers/embeddings.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/layers/embeddings.py @@ -17,7 +17,7 @@ from typing import Optional import torch from torch import nn -from diffusers.models.activations import FP32SiLU, get_activation +from diffusers.models.activations import get_activation def get_timestep_embedding( @@ -205,8 +205,6 @@ class PixArtAlphaTextProjection(nn.Module): self.act_1 = nn.GELU(approximate="tanh") elif act_fn == "silu": self.act_1 = nn.SiLU() - elif act_fn == "silu_fp32": - self.act_1 = FP32SiLU() else: raise ValueError(f"Unknown activation function: {act_fn}") self.linear_2 = nn.Linear(in_features=hidden_size, out_features=out_features, bias=True) diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/layers/linear.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/layers/linear.py index bd9b9ba796..57fe8d55dc 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/layers/linear.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/layers/linear.py @@ -6,7 +6,7 @@ # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at # -# httpa://www.apache.org/licenses/LICENSE-2.0 +# https://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, -- Gitee From 53f9426a5e6ed5f569d74cdaf23650819747127d Mon Sep 17 00:00:00 2001 From: jiangmengyu Date: Tue, 31 Dec 2024 14:25:46 +0800 Subject: [PATCH 69/91] =?UTF-8?q?cogview3=E9=80=82=E9=85=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../foundation/cogview3/cogview3plus/models/activations.py | 1 - .../foundation/cogview3/cogview3plus/models/model_load_utils.py | 2 +- .../foundation/cogview3/cogview3plus/models/modeling_utils.py | 1 - 3 files changed, 1 insertion(+), 3 deletions(-) diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/activations.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/activations.py index 4726fd7eb2..5bb3783ae4 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/activations.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/activations.py @@ -20,7 +20,6 @@ from torch import nn from diffusers.utils import deprecate from diffusers.utils.import_utils import is_torch_npu_available - if is_torch_npu_available(): import torch_npu diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/model_load_utils.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/model_load_utils.py index 3cffbd6432..34a4625283 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/model_load_utils.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/model_load_utils.py @@ -12,7 +12,7 @@ # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and -# limitations under the License +# limitations under the License. 
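# Added note (commentary only): the activations hunk above touches the guarded
# NPU import; that guard keeps the module importable on hosts without Ascend
# support, along the lines of
#
#   from diffusers.utils.import_utils import is_torch_npu_available
#   if is_torch_npu_available():
#       import torch_npu  # Ascend kernels only when the backend is present
#
# which mirrors the lines shown in the diff.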
import os import torch diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/modeling_utils.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/modeling_utils.py index a09f50daf8..da548d9771 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/modeling_utils.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/modeling_utils.py @@ -137,7 +137,6 @@ class ModelMixin(torch.nn.Module, PushToHubMixin): deprecate("direct config name access", "1.0.0", deprecation_message, standard_warn=False, stacklevel=3) return self._internal_dict[name] - # call PyTorch's https://pytorch.org/docs/stable/_modules/torch/nn/modules/module.html#Module return super().__getattr__(name) @classmethod -- Gitee From 8a4f05496d2641b284b8f2d28a1b26e486fc65a1 Mon Sep 17 00:00:00 2001 From: jiangmengyu Date: Tue, 31 Dec 2024 14:45:22 +0800 Subject: [PATCH 70/91] =?UTF-8?q?cogview3=E9=80=82=E9=85=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../models/attention_processor.py | 38 ------------------- .../cogview3plus/models/modeling_utils.py | 1 - .../models/transformer_cogview3plus.py | 2 - .../pipeline/pipeline_cogview3plus.py | 1 - .../schedulers/scheduling_ddim_cogvideox.py | 19 ---------- .../cogview3/inference_cogview3plus.py | 2 +- 6 files changed, 1 insertion(+), 62 deletions(-) diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/attention_processor.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/attention_processor.py index d2a7673ea5..c197a989b7 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/attention_processor.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/attention_processor.py @@ -212,8 +212,6 @@ class Attention(nn.Module): processor (`AttnProcessor`): The attention processor to use. """ - # if current processor is in `self._modules` and if passed `processor` is not, we need to - # pop `processor` from `self._modules` if ( hasattr(self, "processor") and isinstance(self.processor, torch.nn.Module) @@ -231,26 +229,6 @@ class Attention(nn.Module): attention_mask: Optional[torch.Tensor] = None, **cross_attention_kwargs, ) -> torch.Tensor: - r""" - The forward method of the `Attention` class. - - Args: - hidden_states (`torch.Tensor`): - The hidden states of the query. - encoder_hidden_states (`torch.Tensor`, *optional*): - The hidden states of the encoder. - attention_mask (`torch.Tensor`, *optional*): - The attention mask to use. If `None`, no mask is applied. - **cross_attention_kwargs: - Additional keyword arguments to pass along to the cross attention. - - Returns: - `torch.Tensor`: The output of the attention layer. - """ - # The `Attention` class can call different attention processors / attention functions - # here we simply pass along all tensors to the selected processor class - # For standard processors that are defined here, `**cross_attention_kwargs` is empty - attn_parameters = set(inspect.signature(self.processor.__call__).parameters.keys()) quiet_attn_parameters = {"ip_adapter_masks", "ip_hidden_states"} unused_kwargs = [ @@ -273,22 +251,6 @@ class Attention(nn.Module): def prepare_attention_mask( self, attention_mask: torch.Tensor, target_length: int, batch_size: int, out_dim: int = 3 ) -> torch.Tensor: - r""" - Prepare the attention mask for the attention computation. 
- - Args: - attention_mask (`torch.Tensor`): - The attention mask to prepare. - target_length (`int`): - The target length of the attention mask. This is the length of the attention mask after padding. - batch_size (`int`): - The batch size, which is used to repeat the attention mask. - out_dim (`int`, *optional*, defaults to `3`): - The output dimension of the attention mask. Can be either `3` or `4`. - - Returns: - `torch.Tensor`: The prepared attention mask. - """ head_size = self.heads if attention_mask is None: return attention_mask diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/modeling_utils.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/modeling_utils.py index da548d9771..aa8e33daaa 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/modeling_utils.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/modeling_utils.py @@ -185,7 +185,6 @@ class ModelMixin(torch.nn.Module, PushToHubMixin): " dispatching. Please make sure to set `low_cpu_mem_usage=True`." ) - # change device_map into a map if we passed an int, a str or a torch.device if isinstance(device_map, torch.device): device_map = {"": device_map} elif isinstance(device_map, str) and device_map not in ["auto", "balanced", "balanced_low_0", "sequential"]: diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/transformer_cogview3plus.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/transformer_cogview3plus.py index bd2482b587..f704e22589 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/transformer_cogview3plus.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/transformer_cogview3plus.py @@ -197,7 +197,6 @@ class CogView3PlusTransformer2DModel(ModelMixin, ConfigMixin): self.delta_encoder_cache = None @property - # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.attn_processors def attn_processors(self) -> Dict[str, AttentionProcessor]: r""" Returns: @@ -221,7 +220,6 @@ class CogView3PlusTransformer2DModel(ModelMixin, ConfigMixin): return processors - # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.set_attn_processor def set_attn_processor(self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]]): count = len(self.attn_processors.keys()) diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py index 82276b2cd9..fe2bd5cfcd 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py @@ -32,7 +32,6 @@ from .pipeline_output import CogView3PipelineOutput logger = logging.get_logger(__name__) # pylint: disable=invalid-name -# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps def retrieve_timesteps( scheduler, num_inference_steps: Optional[int] = None, diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/schedulers/scheduling_ddim_cogvideox.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/schedulers/scheduling_ddim_cogvideox.py index 9b9a4f051e..b3f6ce229b 100644 --- 
a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/schedulers/scheduling_ddim_cogvideox.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/schedulers/scheduling_ddim_cogvideox.py @@ -27,18 +27,6 @@ from .scheduling_utils import KarrasDiffusionSchedulers, SchedulerMixin @dataclass class DDIMSchedulerOutput(BaseOutput): - """ - Output class for the scheduler's `step` function output. - - Args: - prev_sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)` for images): - Computed sample `(x_{t-1})` of previous timestep. `prev_sample` should be used as next model input in the - denoising loop. - pred_original_sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)` for images): - The predicted denoised sample `(x_{0})` based on the model output from the current timestep. - `pred_original_sample` can be used to preview progress or for guidance. - """ - prev_sample: torch.Tensor pred_original_sample: Optional[torch.Tensor] = None @@ -127,10 +115,6 @@ class CogVideoXDDIMScheduler(SchedulerMixin, ConfigMixin): if rescale_betas_zero_snr: self.alphas_cumprod = rescale_zero_terminal_snr(self.alphas_cumprod) - # At every step in ddim, we are looking into the previous alphas_cumprod - # For the final step, there is no previous alphas_cumprod because we are already at 0 - # `set_alpha_to_one` decides whether we set this parameter simply to one or - # whether we use the final alpha of the "non-previous" one. self.final_alpha_cumprod = torch.tensor(1.0) if set_alpha_to_one else self.alphas_cumprod[0] # standard deviation of the initial noise distribution @@ -171,7 +155,6 @@ class CogVideoXDDIMScheduler(SchedulerMixin, ConfigMixin): self.num_inference_steps = num_inference_steps - # "linspace", "leading", "trailing" corresponds to annotation of Table 2. of https://arxiv.org/abs/2305.08891 if self.config.timestep_spacing == "linspace": timesteps = ( np.linspace(0, self.config.num_train_timesteps - 1, num_inference_steps) @@ -244,7 +227,6 @@ class CogVideoXDDIMScheduler(SchedulerMixin, ConfigMixin): return DDIMSchedulerOutput(prev_sample=prev_sample, pred_original_sample=pred_original_sample) - # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler.add_noise def add_noise( self, original_samples: torch.Tensor, @@ -271,7 +253,6 @@ class CogVideoXDDIMScheduler(SchedulerMixin, ConfigMixin): noisy_samples = sqrt_alpha_prod * original_samples + sqrt_one_minus_alpha_prod * noise return noisy_samples - # Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler.get_velocity def get_velocity(self, sample: torch.Tensor, noise: torch.Tensor, timesteps: torch.IntTensor) -> torch.Tensor: # Make sure alphas_cumprod and timestep have same device and dtype as sample self.alphas_cumprod = self.alphas_cumprod.to(device=sample.device) diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/inference_cogview3plus.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/inference_cogview3plus.py index 9030a82b5e..c3bb1f2ebb 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/inference_cogview3plus.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/inference_cogview3plus.py @@ -6,7 +6,7 @@ # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at # -# http://www.apache.org/licenses/LICENSE-2.0 +# https://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, -- Gitee From 11ef4a0a8b29e21dfaeeda4c69f703df0f55c285 Mon Sep 17 00:00:00 2001 From: jiangmengyu Date: Thu, 2 Jan 2025 10:45:03 +0800 Subject: [PATCH 71/91] =?UTF-8?q?cogview=E9=80=82=E9=85=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../cogview3/inference_cogview3plus.py | 212 ++++++++++++++++-- .../cogview3/prompts/example_prompts.txt | 5 + 2 files changed, 203 insertions(+), 14 deletions(-) create mode 100644 MindIE/MindIE-Torch/built-in/foundation/cogview3/prompts/example_prompts.txt diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/inference_cogview3plus.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/inference_cogview3plus.py index c3bb1f2ebb..341a008e7c 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/inference_cogview3plus.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/inference_cogview3plus.py @@ -17,6 +17,9 @@ import argparse import logging import time +import os +import csv +import json import torch @@ -26,25 +29,138 @@ logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) +class PromptLoader: + def __init__( + self, + prompt_file: str, + prompt_file_type: str, + batch_size: int, + num_images_per_prompt: int = 1, + max_num_prompts: int = 0 + ): + self.prompts = [] + self.catagories = ['Not_specified'] + self.batch_size = batch_size + self.num_images_per_prompt = num_images_per_prompt + + if prompt_file_type == 'plain': + self.load_prompts_plain(prompt_file, max_num_prompts) + elif prompt_file_type == 'parti': + self.load_prompts_parti(prompt_file, max_num_prompts) + elif prompt_file_type == 'hpsv2': + self.load_prompts_hpsv2(max_num_prompts) + else: + print("This operation is not supported!") + + self.current_id = 0 + self.inner_id = 0 + + def __len__(self): + return len(self.prompts) * self.num_images_per_prompt + + def __iter__(self): + return self + + def __next__(self): + if self.current_id == len(self.prompts): + raise StopIteration + + ret = { + 'prompts': [], + 'catagories': [], + 'save_names': [], + 'n_prompts': self.batch_size, + } + for _ in range(self.batch_size): + if self.current_id == len(self.prompts): + ret['prompts'].append('') + ret['save_names'].append('') + ret['catagories'].append('') + ret['n_prompts'] -= 1 + + else: + prompt, catagory_id = self.prompts[self.current_id] + ret['prompts'].append(prompt) + ret['catagories'].append(self.catagories[catagory_id]) + ret['save_names'].append(f'{self.current_id}_{self.inner_id}') + + self.inner_id += 1 + if self.inner_id == self.num_images_per_prompt: + self.inner_id = 0 + self.current_id += 1 + + return ret + + def load_prompts_plain(self, file_path: str, max_num_prompts: int): + with os.fdopen(os.open(file_path, os.O_RDONLY), "r") as f: + for i, line in enumerate(f): + if max_num_prompts and i == max_num_prompts: + break + + prompt = line.strip() + self.prompts.append((prompt, 0)) + + def load_prompts_parti(self, file_path: str, max_num_prompts: int): + with os.fdopen(os.open(file_path, os.O_RDONLY), "r") as f: + # Skip the first line + next(f) + tsv_file = csv.reader(f, delimiter="\t") + for i, line in enumerate(tsv_file): + if max_num_prompts and i == max_num_prompts: + break + + prompt = line[0] + catagory = line[1] + if 
catagory not in self.catagories: + self.catagories.append(catagory) + + catagory_id = self.catagories.index(catagory) + self.prompts.append((prompt, catagory_id)) + + def load_prompts_hpsv2(self, max_num_prompts: int): + with open('hpsv2_benchmark_prompts.json', 'r') as file: + all_prompts = json.load(file) + count = 0 + for style, prompts in all_prompts.items(): + for prompt in prompts: + count += 1 + if max_num_prompts and count >= max_num_prompts: + break + + if style not in self.catagories: + self.catagories.append(style) + + catagory_id = self.catagories.index(style) + self.prompts.append((prompt, catagory_id)) + + def parse_arguments(): parser = argparse.ArgumentParser(description="Generate an image using the CogView3-Plus-3B model.") # Define arguments for prompt, model path, etc. parser.add_argument( - "--prompt", - type=list, - default=[ - "A vibrant cherry red sports car sits proudly under the gleaming sun, \ - its polished exterior smooth and flawless, casting a mirror-like reflection. \ - The car features a low, aerodynamic body, angular headlights that gaze forward like predatory eyes, \ - and a set of black, high-gloss racing rims that contrast starkly with the red. \ - A subtle hint of chrome embellishes the grille and exhaust, \ - while the tinted windows suggest a luxurious and private interior. \ - he scene conveys a sense of speed and elegance, \ - the car appearing as if it's about to burst into a sprint along a coastal road, \ - with the ocean's azure waves crashing in the background." - ], - help="The text description for generating the image." + "--prompt_file", + type=str, + default="./prompts/example_prompts.txt", + help="A text file of prompts for generating images.", + ) + parser.add_argument( + "--prompt_file_type", + choices=["plain", "parti", "hpsv2"], + default="plain", + help="Type of prompt file.", + ) + parser.add_argument( + "--save_dir", + type=str, + default="./results", + help="Path to save result images.", + ) + parser.add_argument( + "--info_file_save_path", + type=str, + default="./image_info.json", + help="Path to save image information file.", ) parser.add_argument( "--model_path", type=str, default="/data/CogView3B", help="Path to the pre-trained model." @@ -55,6 +171,18 @@ def parse_arguments(): parser.add_argument( "--num_images_per_prompt", type=int, default=1, help="Number of images to generate per prompt." ) + parser.add_argument( + "--max_num_prompts", + default=0, + type=int, + help="Limit the number of prompts (0: no limit).", + ) + parser.add_argument( + "--batch_size", + type=int, + default=1, + help="Batch size." 
+ ) parser.add_argument("--num_inference_steps", type=int, default=50, help="Number of denoising steps for inference.") parser.add_argument("--width", type=int, default=1024, help="Width of the generated image.") parser.add_argument("--height", type=int, default=1024, help="Height of the generated image.") @@ -72,6 +200,61 @@ def infer(args): # Load the pre-trained model with the specified precision pipe = CogView3PlusPipeline.from_pretrained(args.model_path, torch_dtype=dtype).to("npu") + use_time = 0 + prompt_loader = PromptLoader(args.prompt_file, + args.prompt_file_type, + args.batch_size, + args.num_images_per_prompt, + args.max_num_prompts) + + infer_num = 0 + image_info = [] + current_prompt = None + for i, input_info in enumerate(prompt_loader): + prompts = input_info['prompts'] + catagories = input_info['catagories'] + save_names = input_info['save_names'] + n_prompts = input_info['n_prompts'] + + print(f"[{infer_num + n_prompts}/{len(prompt_loader)}]: {prompts}") + infer_num += args.batch_size + + start_time = time.time() + images = pipe( + prompt=prompts, + guidance_scale=args.guidance_scale, + num_images_per_prompt=args.num_images_per_prompt, + num_inference_steps=args.num_inference_steps, + image_size=(args.height, args.width), + ).images + + if i > 2: # do not count the time spent inferring the first 0 to 2 images + use_time += time.time() - start_time + + for j in range(n_prompts): + image_save_path = os.path.join(args.save_dir, f"{save_names[j]}.png") + image = images[0][j] + image.save(image_save_path) + + if current_prompt != prompts[j]: + current_prompt = prompts[j] + image_info.append({'images': [], 'prompt': current_prompt, 'category': catagories[j]}) + + image_info[-1]['images'].append(image_save_path) + + infer_num = infer_num - 3 # do not count the time spent inferring the first 5 images + print(f"[info] infer number: {infer_num}; use time: {use_time:.3f}s\n" + f"average time: {use_time / infer_num:.3f}s\n") + + # Save image information to a json file + if os.path.exists(args.info_file_save_path): + os.remove(args.info_file_save_path) + + with os.fdopen(os.open(args.info_file_save_path, os.O_RDWR | os.O_CREAT, 0o640), "w") as f: + json.dump(image_info, f) + + + """ use_time = 0 loops = 5 for i in range(loops): @@ -97,6 +280,7 @@ def infer(args): image.save(args.output_path) print(f"Image saved to {args.output_path}") + """ if __name__ == "__main__": diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/prompts/example_prompts.txt b/MindIE/MindIE-Torch/built-in/foundation/cogview3/prompts/example_prompts.txt new file mode 100644 index 0000000000..7291dde080 --- /dev/null +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/prompts/example_prompts.txt @@ -0,0 +1,5 @@ +A vibrant cherry red sports car sits proudly under the gleaming sun, its polished exterior smooth and flawless, casting a mirror-like reflection. The car features a low, aerodynamic body, angular headlights that gaze forward like predatory eyes, and a set of black, high-gloss racing rims that contrast starkly with the red. A subtle hint of chrome embellishes the grille and exhaust, while the tinted windows suggest a luxurious and private interior. The scene conveys a sense of speed and elegance, the car appearing as if it's about to burst into a sprint along a coastal road, with the ocean's azure waves crashing in the background. +A vibrant cherry red sports car sits proudly under the gleaming sun, its polished exterior smooth and flawless, casting a mirror-like reflection. 
The car features a low, aerodynamic body, angular headlights that gaze forward like predatory eyes, and a set of black, high-gloss racing rims that contrast starkly with the red. A subtle hint of chrome embellishes the grille and exhaust, while the tinted windows suggest a luxurious and private interior. The scene conveys a sense of speed and elegance, the car appearing as if it's about to burst into a sprint along a coastal road, with the ocean's azure waves crashing in the background. +A vibrant cherry red sports car sits proudly under the gleaming sun, its polished exterior smooth and flawless, casting a mirror-like reflection. The car features a low, aerodynamic body, angular headlights that gaze forward like predatory eyes, and a set of black, high-gloss racing rims that contrast starkly with the red. A subtle hint of chrome embellishes the grille and exhaust, while the tinted windows suggest a luxurious and private interior. The scene conveys a sense of speed and elegance, the car appearing as if it's about to burst into a sprint along a coastal road, with the ocean's azure waves crashing in the background. +A vibrant cherry red sports car sits proudly under the gleaming sun, its polished exterior smooth and flawless, casting a mirror-like reflection. The car features a low, aerodynamic body, angular headlights that gaze forward like predatory eyes, and a set of black, high-gloss racing rims that contrast starkly with the red. A subtle hint of chrome embellishes the grille and exhaust, while the tinted windows suggest a luxurious and private interior. The scene conveys a sense of speed and elegance, the car appearing as if it's about to burst into a sprint along a coastal road, with the ocean's azure waves crashing in the background. +A vibrant cherry red sports car sits proudly under the gleaming sun, its polished exterior smooth and flawless, casting a mirror-like reflection. The car features a low, aerodynamic body, angular headlights that gaze forward like predatory eyes, and a set of black, high-gloss racing rims that contrast starkly with the red. A subtle hint of chrome embellishes the grille and exhaust, while the tinted windows suggest a luxurious and private interior. The scene conveys a sense of speed and elegance, the car appearing as if it's about to burst into a sprint along a coastal road, with the ocean's azure waves crashing in the background. 
\ No newline at end of file -- Gitee From f6d7ff137af7fee1121270cbbae494ab48d51b74 Mon Sep 17 00:00:00 2001 From: jiangmengyu Date: Thu, 2 Jan 2025 10:49:03 +0800 Subject: [PATCH 72/91] =?UTF-8?q?cogview=E9=80=82=E9=85=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../built-in/foundation/cogview3/inference_cogview3plus.py | 1 + 1 file changed, 1 insertion(+) diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/inference_cogview3plus.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/inference_cogview3plus.py index 341a008e7c..149be10eb1 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/inference_cogview3plus.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/inference_cogview3plus.py @@ -227,6 +227,7 @@ def infer(args): num_inference_steps=args.num_inference_steps, image_size=(args.height, args.width), ).images + print(images.shape) if i > 2: # do not count the time spent inferring the first 0 to 2 images use_time += time.time() - start_time -- Gitee From 485008cb516f7077a336e5c4859c387590e59a0b Mon Sep 17 00:00:00 2001 From: jiangmengyu Date: Thu, 2 Jan 2025 10:58:08 +0800 Subject: [PATCH 73/91] =?UTF-8?q?cogview=E9=80=82=E9=85=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py | 5 ++--- .../built-in/foundation/cogview3/inference_cogview3plus.py | 3 +-- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py index fe2bd5cfcd..2f14fdd7c3 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py @@ -27,7 +27,6 @@ from diffusers import AutoencoderKL from ..models import CogView3PlusTransformer2DModel from ..schedulers import CogVideoXDDIMScheduler -from .pipeline_output import CogView3PipelineOutput logger = logging.get_logger(__name__) # pylint: disable=invalid-name @@ -224,7 +223,7 @@ class CogView3PlusPipeline(DiffusionPipeline): num_inference_steps: int = 50, guidance_scale: float = 5.0, num_images_per_prompt: int = 1, - ) -> Union[CogView3PipelineOutput, Tuple]: + ) -> Tuple: if image_size is None: height = self.transformer.config.sample_size * self.vae_scale_factor width = self.transformer.config.sample_size * self.vae_scale_factor @@ -336,4 +335,4 @@ class CogView3PlusPipeline(DiffusionPipeline): # Offload all models self.maybe_free_model_hooks() - return CogView3PipelineOutput(images=image) \ No newline at end of file + return (image,) \ No newline at end of file diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/inference_cogview3plus.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/inference_cogview3plus.py index 149be10eb1..fa6cfd418b 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/inference_cogview3plus.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/inference_cogview3plus.py @@ -226,8 +226,7 @@ def infer(args): num_images_per_prompt=args.num_images_per_prompt, num_inference_steps=args.num_inference_steps, image_size=(args.height, args.width), - ).images - print(images.shape) + ) if i > 2: # do not count the time spent inferring the first 0 to 2 images use_time += time.time() - start_time -- Gitee From 
0858f272718055c29f79600ca00348a85862c919 Mon Sep 17 00:00:00 2001 From: jiangmengyu Date: Thu, 2 Jan 2025 11:16:46 +0800 Subject: [PATCH 74/91] =?UTF-8?q?cogview=E9=80=82=E9=85=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../cogview3/inference_cogview3plus.py | 33 ++----------------- 1 file changed, 2 insertions(+), 31 deletions(-) diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/inference_cogview3plus.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/inference_cogview3plus.py index fa6cfd418b..ae66dac6ca 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/inference_cogview3plus.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/inference_cogview3plus.py @@ -228,7 +228,7 @@ def infer(args): image_size=(args.height, args.width), ) - if i > 2: # do not count the time spent inferring the first 0 to 2 images + if i > 1: # do not count the time spent inferring the first 0 to 2 images use_time += time.time() - start_time for j in range(n_prompts): @@ -242,7 +242,7 @@ def infer(args): image_info[-1]['images'].append(image_save_path) - infer_num = infer_num - 3 # do not count the time spent inferring the first 5 images + infer_num = infer_num - 2 # do not count the time spent inferring the first 5 images print(f"[info] infer number: {infer_num}; use time: {use_time:.3f}s\n" f"average time: {use_time / infer_num:.3f}s\n") @@ -253,35 +253,6 @@ def infer(args): with os.fdopen(os.open(args.info_file_save_path, os.O_RDWR | os.O_CREAT, 0o640), "w") as f: json.dump(image_info, f) - - """ - use_time = 0 - loops = 5 - for i in range(loops): - start_time = time.time() - # Generate the image based on the prompt - image = pipe( - prompt=args.prompt[0], - guidance_scale=args.guidance_scale, - num_images_per_prompt=args.num_images_per_prompt, - num_inference_steps=args.num_inference_steps, - image_size=(args.height, args.width), - ).images[0] - - if i >= 2: - use_time += time.time() - start_time - logger.info("current_time is %.3f )", time.time() - start_time) - - torch.npu.empty_cache() - - logger.info("use_time is %.3f)", use_time / 3) - - # Save the generated image to the local file system - image.save(args.output_path) - - print(f"Image saved to {args.output_path}") - """ - if __name__ == "__main__": inference_args = parse_arguments() -- Gitee From 91ed1e670a06b0a029ca38eb8b9d8f2ac739b3e7 Mon Sep 17 00:00:00 2001 From: jiangmengyu Date: Thu, 2 Jan 2025 11:28:23 +0800 Subject: [PATCH 75/91] =?UTF-8?q?cogview=E9=80=82=E9=85=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../pipeline/pipeline_cogview3plus.py | 4 +++ .../cogview3/cogview3plus/utils/__init__.py | 1 + .../cogview3/cogview3plus/utils/utils.py | 35 +++++++++++++++++++ 3 files changed, 40 insertions(+) create mode 100644 MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/utils/__init__.py create mode 100644 MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/utils/utils.py diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py index 2f14fdd7c3..d242bbcd55 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py @@ -27,6 +27,7 @@ from diffusers import AutoencoderKL from ..models import 
CogView3PlusTransformer2DModel from ..schedulers import CogVideoXDDIMScheduler +from ..utils import set_random_seed logger = logging.get_logger(__name__) # pylint: disable=invalid-name @@ -224,6 +225,9 @@ class CogView3PlusPipeline(DiffusionPipeline): guidance_scale: float = 5.0, num_images_per_prompt: int = 1, ) -> Tuple: + + set_random_seed(seed=42) + if image_size is None: height = self.transformer.config.sample_size * self.vae_scale_factor width = self.transformer.config.sample_size * self.vae_scale_factor diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/utils/__init__.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/utils/__init__.py new file mode 100644 index 0000000000..f35da6dcea --- /dev/null +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/utils/__init__.py @@ -0,0 +1 @@ +from .utils import set_random_seed \ No newline at end of file diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/utils/utils.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/utils/utils.py new file mode 100644 index 0000000000..de985c1453 --- /dev/null +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/utils/utils.py @@ -0,0 +1,35 @@ +#!/usr/bin/env python +# coding=utf-8 +# Copyright 2024 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import importlib +import random +import torch +import numpy as np + + +def set_random_seed(seed): + """Set random seed. + + Args: + seed (int, optional): Seed to be used. 
+ + """ + + random.seed(seed) + np.random.seed(seed) + torch.manual_seed(seed) + return seed \ No newline at end of file -- Gitee From b42efe9c5c5c84e4f29fda667ea3141d95f472c0 Mon Sep 17 00:00:00 2001 From: jiangmengyu Date: Thu, 2 Jan 2025 14:24:05 +0800 Subject: [PATCH 76/91] =?UTF-8?q?cogview=E9=80=82=E9=85=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../built-in/foundation/cogview3/cogview3plus/__init__.py | 3 ++- .../cogview3plus/pipeline/pipeline_cogview3plus.py | 4 ---- .../built-in/foundation/cogview3/inference_cogview3plus.py | 7 +++++-- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/__init__.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/__init__.py index 1139593a36..e5bd9d5fa9 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/__init__.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/__init__.py @@ -1,3 +1,4 @@ from .pipeline import CogView3PlusPipeline, DiffusionPipeline from .schedulers import CogVideoXDDIMScheduler, SchedulerMixin -from .models import CogView3PlusTransformer2DModel, ModelMixin \ No newline at end of file +from .models import CogView3PlusTransformer2DModel, ModelMixin +from .utils import set_random_seed \ No newline at end of file diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py index d242bbcd55..2f14fdd7c3 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/pipeline/pipeline_cogview3plus.py @@ -27,7 +27,6 @@ from diffusers import AutoencoderKL from ..models import CogView3PlusTransformer2DModel from ..schedulers import CogVideoXDDIMScheduler -from ..utils import set_random_seed logger = logging.get_logger(__name__) # pylint: disable=invalid-name @@ -225,9 +224,6 @@ class CogView3PlusPipeline(DiffusionPipeline): guidance_scale: float = 5.0, num_images_per_prompt: int = 1, ) -> Tuple: - - set_random_seed(seed=42) - if image_size is None: height = self.transformer.config.sample_size * self.vae_scale_factor width = self.transformer.config.sample_size * self.vae_scale_factor diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/inference_cogview3plus.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/inference_cogview3plus.py index ae66dac6ca..b7a3bd4af1 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/inference_cogview3plus.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/inference_cogview3plus.py @@ -23,7 +23,7 @@ import json import torch -from cogview3plus import CogView3PlusPipeline +from cogview3plus import CogView3PlusPipeline, set_random_seed logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) @@ -186,8 +186,8 @@ def parse_arguments(): parser.add_argument("--num_inference_steps", type=int, default=50, help="Number of denoising steps for inference.") parser.add_argument("--width", type=int, default=1024, help="Width of the generated image.") parser.add_argument("--height", type=int, default=1024, help="Height of the generated image.") - parser.add_argument("--output_path", type=str, default="cogview3.png", help="Path to save the generated image.") parser.add_argument("--dtype", type=str, default="bf16", help="bf16 or fp16") + 
parser.add_argument("--seed", type=int, default=None, help="Random seed") parser.add_argument("--device_id", type=int, default=7, help="NPU device id") return parser.parse_args() @@ -197,6 +197,9 @@ def infer(args): torch.npu.set_device(args.device_id) dtype = torch.bfloat16 if args.dtype == "bf16" else torch.float16 + if args.seed is not None: + set_random_seed(args.seed) + # Load the pre-trained model with the specified precision pipe = CogView3PlusPipeline.from_pretrained(args.model_path, torch_dtype=dtype).to("npu") -- Gitee From ab635dd8049b722d3d16ea1056f8ce3ef3f7d6eb Mon Sep 17 00:00:00 2001 From: jiangmengyu Date: Thu, 2 Jan 2025 14:27:49 +0800 Subject: [PATCH 77/91] =?UTF-8?q?cogview=E9=80=82=E9=85=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../built-in/foundation/cogview3/inference_cogview3plus.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/inference_cogview3plus.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/inference_cogview3plus.py index b7a3bd4af1..0b1ebd4a5c 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/inference_cogview3plus.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/inference_cogview3plus.py @@ -188,7 +188,7 @@ def parse_arguments(): parser.add_argument("--height", type=int, default=1024, help="Height of the generated image.") parser.add_argument("--dtype", type=str, default="bf16", help="bf16 or fp16") parser.add_argument("--seed", type=int, default=None, help="Random seed") - parser.add_argument("--device_id", type=int, default=7, help="NPU device id") + parser.add_argument("--device_id", type=int, default=0, help="NPU device id") return parser.parse_args() -- Gitee From 15b52ffef0a891e4d273c506c66a0731396ed126 Mon Sep 17 00:00:00 2001 From: jiangmengyu Date: Thu, 2 Jan 2025 15:20:30 +0800 Subject: [PATCH 78/91] =?UTF-8?q?cogview=E9=80=82=E9=85=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../built-in/foundation/cogview3/README.md | 25 +++++++++++++++++++ .../cogview3/inference_cogview3plus.py | 6 ++--- 2 files changed, 28 insertions(+), 3 deletions(-) diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/README.md b/MindIE/MindIE-Torch/built-in/foundation/cogview3/README.md index e161ab5c10..b0b7760a3a 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/README.md +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/README.md @@ -164,3 +164,28 @@ python inference_cogview3plus.py \ - height: 需要生成的图像的高。 - num_inference_steps:推理迭代步数。 - dtype: 数据类型。目前只支持bf16。 + + +python3 inference_cogview3plus.py \ + --model_path \data\CogView3B \ + --prompt_file ./PartiPrompts.tsv \ + --prompt_file_type parti \ + --info_file_save_path ./image_info_PartiPrompts.json \ + --save_dir ./results_PartiPrompts \ + --num_images_per_prompt 4 \ + --height 1024 \ + --width 1024 \ + --batch_size 1 \ + --device_id 0 + +python3 inference_cogview3plus.py \ + --model_path \data\CogView3B \ + --prompt_file ./hpsv2_benchmark_prompts.json \ + --prompt_file_type hpsv2 \ + --info_file_save_path ./image_info_hpsv2.json \ + --save_dir ./results_hpsv2 \ + --num_images_per_prompt 1 \ + --height 1024 \ + --width 1024 \ + --batch_size 1 \ + --device_id 0 diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/inference_cogview3plus.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/inference_cogview3plus.py index 0b1ebd4a5c..0fca2f95df 100644 --- 
a/MindIE/MindIE-Torch/built-in/foundation/cogview3/inference_cogview3plus.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/inference_cogview3plus.py @@ -48,7 +48,7 @@ class PromptLoader: elif prompt_file_type == 'parti': self.load_prompts_parti(prompt_file, max_num_prompts) elif prompt_file_type == 'hpsv2': - self.load_prompts_hpsv2(max_num_prompts) + self.load_prompts_hpsv2(prompt_file, max_num_prompts) else: print("This operation is not supported!") @@ -117,8 +117,8 @@ class PromptLoader: catagory_id = self.catagories.index(catagory) self.prompts.append((prompt, catagory_id)) - def load_prompts_hpsv2(self, max_num_prompts: int): - with open('hpsv2_benchmark_prompts.json', 'r') as file: + def load_prompts_hpsv2(self, file_path: str, max_num_prompts: int): + with open(file_path, 'r') as file: all_prompts = json.load(file) count = 0 for style, prompts in all_prompts.items(): -- Gitee From 7e034859c7e16059b3190a9a712fc3b14f1ea274 Mon Sep 17 00:00:00 2001 From: jiangmengyu Date: Thu, 2 Jan 2025 15:24:05 +0800 Subject: [PATCH 79/91] =?UTF-8?q?cogview=E9=80=82=E9=85=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- MindIE/MindIE-Torch/built-in/foundation/cogview3/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/README.md b/MindIE/MindIE-Torch/built-in/foundation/cogview3/README.md index b0b7760a3a..f363b69186 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/README.md +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/README.md @@ -167,7 +167,7 @@ python inference_cogview3plus.py \ python3 inference_cogview3plus.py \ - --model_path \data\CogView3B \ + --model_path /data/CogView3B \ --prompt_file ./PartiPrompts.tsv \ --prompt_file_type parti \ --info_file_save_path ./image_info_PartiPrompts.json \ @@ -179,7 +179,7 @@ python3 inference_cogview3plus.py \ --device_id 0 python3 inference_cogview3plus.py \ - --model_path \data\CogView3B \ + --model_path /data/CogView3B \ --prompt_file ./hpsv2_benchmark_prompts.json \ --prompt_file_type hpsv2 \ --info_file_save_path ./image_info_hpsv2.json \ -- Gitee From e3b1cb444a57d520070884fe4b7068847cd77dde Mon Sep 17 00:00:00 2001 From: jiangmengyu Date: Thu, 2 Jan 2025 15:26:10 +0800 Subject: [PATCH 80/91] =?UTF-8?q?cogview=E9=80=82=E9=85=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- MindIE/MindIE-Torch/built-in/foundation/cogview3/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/README.md b/MindIE/MindIE-Torch/built-in/foundation/cogview3/README.md index f363b69186..534f9bc2ed 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/README.md +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/README.md @@ -168,7 +168,7 @@ python inference_cogview3plus.py \ python3 inference_cogview3plus.py \ --model_path /data/CogView3B \ - --prompt_file ./PartiPrompts.tsv \ + --prompt_file ./prompts/PartiPrompts.tsv \ --prompt_file_type parti \ --info_file_save_path ./image_info_PartiPrompts.json \ --save_dir ./results_PartiPrompts \ @@ -180,7 +180,7 @@ python3 inference_cogview3plus.py \ python3 inference_cogview3plus.py \ --model_path /data/CogView3B \ - --prompt_file ./hpsv2_benchmark_prompts.json \ + --prompt_file ./prompts/hpsv2_benchmark_prompts.json \ --prompt_file_type hpsv2 \ --info_file_save_path ./image_info_hpsv2.json \ --save_dir ./results_hpsv2 \ -- Gitee From 
ea4ccb9b6c7ba3efdfdcd7dd9d9b20ffa0126ce4 Mon Sep 17 00:00:00 2001 From: jiangmengyu Date: Thu, 2 Jan 2025 15:34:23 +0800 Subject: [PATCH 81/91] =?UTF-8?q?cogview=E9=80=82=E9=85=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- MindIE/MindIE-Torch/built-in/foundation/cogview3/README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/README.md b/MindIE/MindIE-Torch/built-in/foundation/cogview3/README.md index 534f9bc2ed..237e70f790 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/README.md +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/README.md @@ -176,6 +176,7 @@ python3 inference_cogview3plus.py \ --height 1024 \ --width 1024 \ --batch_size 1 \ + --seed 42 \ --device_id 0 python3 inference_cogview3plus.py \ @@ -188,4 +189,5 @@ python3 inference_cogview3plus.py \ --height 1024 \ --width 1024 \ --batch_size 1 \ + --seed 42 \ --device_id 0 -- Gitee From 6a162b05b1410ed560ef7cc3d9a8ba8719dd9623 Mon Sep 17 00:00:00 2001 From: jiangmengyu Date: Thu, 2 Jan 2025 15:58:20 +0800 Subject: [PATCH 82/91] =?UTF-8?q?cogview=E9=80=82=E9=85=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../built-in/foundation/cogview3/inference_cogview3plus.py | 1 - 1 file changed, 1 deletion(-) diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/inference_cogview3plus.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/inference_cogview3plus.py index 0fca2f95df..3ef016ed2a 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/inference_cogview3plus.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/inference_cogview3plus.py @@ -226,7 +226,6 @@ def infer(args): images = pipe( prompt=prompts, guidance_scale=args.guidance_scale, - num_images_per_prompt=args.num_images_per_prompt, num_inference_steps=args.num_inference_steps, image_size=(args.height, args.width), ) -- Gitee From a0a59728a09f642bf6d4035ea768fa5949f14201 Mon Sep 17 00:00:00 2001 From: jiangmengyu Date: Thu, 2 Jan 2025 16:50:12 +0800 Subject: [PATCH 83/91] =?UTF-8?q?cogview=E9=80=82=E9=85=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../built-in/foundation/cogview3/README.md | 138 ++++++++++++++++-- 1 file changed, 126 insertions(+), 12 deletions(-) diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/README.md b/MindIE/MindIE-Torch/built-in/foundation/cogview3/README.md index 237e70f790..f991ac774f 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/README.md +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/README.md @@ -70,15 +70,15 @@ pip install torch_npu-{pytorchversion}.xxxx.{arch}.whl ## 三、CogView3使用 ### 3.1 权重及配置文件说明 -1. CogView3权重路径: +1. 
CogView3权重主路径:
```shell
https://huggingface.co/THUDM/CogView3-Plus-3B/tree/main
```
-- 修改该权重的model_index.json
+- 修改主路径下的model_index.json文件
```shell
{
  "_class_name": "CogView3PlusPipeline",
-  "_diffusers_version": "0.31.0",
+  "_diffusers_version": "0.31.0.dev0",
  "scheduler": [
    "cogview3plus",
    "CogVideoXDDIMScheduler"
  ],
@@ -117,6 +117,30 @@ https://huggingface.co/THUDM/CogView3-Plus-3B/tree/main/tokenizer
```shell
https://huggingface.co/THUDM/CogView3-Plus-3B/tree/main/transformer
```
+- 修改该路径下的config.json文件
+```shell
+{
+    "_class_name": "CogView3PlusTransformer2DModel",
+    "_diffusers_version": "0.31.0.dev0",
+    "attention_head_dim": 40,
+    "condition_dim": 256,
+    "in_channels": 16,
+    "num_attention_heads": 64,
+    "num_layers": 30,
+    "out_channels": 16,
+    "patch_size": 2,
+    "pooled_projection_dim": 1536,
+    "pos_embed_max_size": 128,
+    "sample_size": 128,
+    "text_embed_dim": 4096,
+    "time_embed_dim": 512,
+    "use_cache": true,
+    "cache_interval": 2,
+    "cache_start": 3,
+    "num_cache_layer": 13,
+    "cache_start_steps": 5
+}
+```
6. vae权重链接:
```shell
https://huggingface.co/THUDM/CogView3-Plus-3B/tree/main/vae
```
@@ -142,32 +166,89 @@ https://huggingface.co/THUDM/CogView3-Plus-3B/tree/main/vae
| | |---- 模型权重
```
-### 3.2 单卡单prompt功能测试
-设置权重路径
+### 3.2 权重下载
+提前下载权重,放到数据集目录下(/data)。
+```shell
+# 需要使用 git-lfs (https://git-lfs.com)
+git lfs install
+
+# 下载CogView3权重
+git clone https://huggingface.co/THUDM/CogView3-Plus-3B
+```
+
+### 3.3 性能测试
+1. 进入主路径
+```shell
+cd cogview3
+```
+2. 设置权重路径
```shell
model_path='/data/CogView3B'
```
-执行命令:
+3. 创建输出图像路径
+```shell
+output_path='./results'
+mkdir ${output_path}
+```
+4. 推理:
```shell
python inference_cogview3plus.py \
--model_path ${model_path} \
-    --device_id 0 \
+    --prompt_file ./prompts/example_prompts.txt \
+    --save_dir ${output_path} \
--width 1024 \
--height 1024 \
--num_inference_steps 50 \
-    --dtype bf16
+    --dtype bf16 \
+    --device_id 0
```
参数说明:
- model_path:权重路径,包含scheduler、text_encoder、tokenizer、transformer、vae,5个模型的配置文件及权重。
-- device_id:推理设备ID。
+- prompt_file:提示词文件。
+- save_dir:生成图片的存放目录。
- width:需要生成的图像的宽。
- height: 需要生成的图像的高。
- num_inference_steps:推理迭代步数。
- dtype: 数据类型。目前只支持bf16。
+- device_id:推理设备ID。
+
+5. 可以通过修改权重文件中`/data/CogView3B/transformer/config.json`中的`use_cache`参数来控制dit cache算法的开关,`true`表示使用dit cache,`false`表示关闭dit cache。
+
+### 3.4 精度测试
+
+1. 由于生成的图片存在随机性,提供两种精度验证方法:
+    1. CLIP-score(文图匹配度量):评估图片和输入文本的相关性,分数的取值范围为[-1, 1],越高越好。使用Parti数据集进行验证。
+    2. HPSv2(图片美学度量):评估生成图片的人类偏好评分,分数的取值范围为[0, 1],越高越好。使用HPSv2数据集进行验证
+
+注意,由于要生成的图片数量较多,进行完整的精度验证需要耗费很长的时间。
+
+2. 下载Parti数据集和hpsv2数据集
+所有数据集放到congview3/prompts目录下
+```bash
+# 下载Parti数据集
+wget https://raw.githubusercontent.com/google-research/parti/main/PartiPrompts.tsv --no-check-certificate
+```
+hpsv2数据集下载链接:https://gitee.com/ascend/ModelZoo-PyTorch/blob/master/MindIE/MindIE-Torch/built-in/foundation/stable_diffusion_xl/hpsv2_benchmark_prompts.json
+
+3. 下载模型权重
+```bash
+# Clip Score和HPSv2均需要使用的权重
+GIT_LFS_SKIP_SMUDGE=1
+git clone https://huggingface.co/laion/CLIP-ViT-H-14-laion2B-s32B-b79K
+cd ./CLIP-ViT-H-14-laion2B-s32B-b79K
+
+# HPSv2权重
+wget https://huggingface.co/spaces/xswu/HPSv2/resolve/main/HPS_v2_compressed.pt --no-check-certificate
+```
+也可手动下载[权重](https://huggingface.co/laion/CLIP-ViT-H-14-laion2B-s32B-b79K/blob/main/open_clip_pytorch_model.bin)
+将权重放到`CLIP-ViT-H-14-laion2B-s32B-b79K`目录下,手动下载[HPSv2权重](https://huggingface.co/spaces/xswu/HPSv2/resolve/main/HPS_v2_compressed.pt)放到当前路径

+4. 
使用推理脚本读取Parti数据集,生成图片 +```bash +mkdir ./results_PartiPrompts python3 inference_cogview3plus.py \ - --model_path /data/CogView3B \ + --model_path ${model_path} \ --prompt_file ./prompts/PartiPrompts.tsv \ --prompt_file_type parti \ --info_file_save_path ./image_info_PartiPrompts.json \ @@ -177,10 +258,28 @@ python3 inference_cogview3plus.py \ --width 1024 \ --batch_size 1 \ --seed 42 \ - --device_id 0 + --device_id 0 +``` +参数说明: +- model_path:权重路径,包含scheduler、text_encoder、tokenizer、transformer、vae,5个模型的配置文件及权重。 +- prompt_file:提示词文件。 +- prompt_file_type: prompt文件类型,用于指定读取方式,可选plain,parti,hpsv2。 +- info_file_save_path:生成图片信息的json文件路径。 +- save_dir:生成图片的存放目录。 +- num_images_per_prompt: 每个prompt生成的图片数量。注意使用hpsv2时,设置num_images_per_prompt=1即可。 +- height: 需要生成的图像的高。 +- width:需要生成的图像的宽。 +- batch_size:模型batch size。 +- seed:随机种子。 +- device_id:推理设备ID。 + +执行完成后在`./results_PartiPrompts`目录下生成推理图片,在当前目录生成一个`image_info_PartiPrompts.json`文件,记录着图片和prompt的对应关系,并在终端显示推理时间。 +5. 使用推理脚本读取hpsv2数据集,生成图片 +```bash +mkdir ./results_hpsv2 python3 inference_cogview3plus.py \ - --model_path /data/CogView3B \ + --model_path ${model_path} \ --prompt_file ./prompts/hpsv2_benchmark_prompts.json \ --prompt_file_type hpsv2 \ --info_file_save_path ./image_info_hpsv2.json \ @@ -191,3 +290,18 @@ python3 inference_cogview3plus.py \ --batch_size 1 \ --seed 42 \ --device_id 0 +``` +参数说明: +- model_path:权重路径,包含scheduler、text_encoder、tokenizer、transformer、vae,5个模型的配置文件及权重。 +- prompt_file:提示词文件。 +- prompt_file_type: prompt文件类型,用于指定读取方式,可选plain,parti,hpsv2。 +- info_file_save_path:生成图片信息的json文件路径。 +- save_dir:生成图片的存放目录。 +- num_images_per_prompt: 每个prompt生成的图片数量。注意使用hpsv2时,设置num_images_per_prompt=1即可。 +- height: 需要生成的图像的高。 +- width:需要生成的图像的宽。 +- batch_size:模型batch size。 +- seed:随机种子。 +- device_id:推理设备ID。 + +执行完成后在`./results_hpsv2`目录下生成推理图片,在当前目录生成一个`image_info_hpsv2.json`文件,记录着图片和prompt的对应关系,并在终端显示推理时间。 -- Gitee From 55cfe04dbc475d0e29da9687a787634188fd8d46 Mon Sep 17 00:00:00 2001 From: jiangmengyu Date: Thu, 2 Jan 2025 16:54:28 +0800 Subject: [PATCH 84/91] =?UTF-8?q?cogview=E9=80=82=E9=85=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../built-in/foundation/cogview3/README.md | 150 +++++++++--------- 1 file changed, 75 insertions(+), 75 deletions(-) diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/README.md b/MindIE/MindIE-Torch/built-in/foundation/cogview3/README.md index f991ac774f..18e1c6a8ec 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/README.md +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/README.md @@ -217,91 +217,91 @@ python inference_cogview3plus.py \ ### 3.4 精度测试 1. 由于生成的图片存在随机性,提供两种精度验证方法: - 1. CLIP-score(文图匹配度量):评估图片和输入文本的相关性,分数的取值范围为[-1, 1],越高越好。使用Parti数据集进行验证。 - 2. HPSv2(图片美学度量):评估生成图片的人类偏好评分,分数的取值范围为[0, 1],越高越好。使用HPSv2数据集进行验证 + 1. CLIP-score(文图匹配度量):评估图片和输入文本的相关性,分数的取值范围为[-1, 1],越高越好。使用Parti数据集进行验证。 + 2. HPSv2(图片美学度量):评估生成图片的人类偏好评分,分数的取值范围为[0, 1],越高越好。使用HPSv2数据集进行验证 -注意,由于要生成的图片数量较多,进行完整的精度验证需要耗费很长的时间。 + 注意,由于要生成的图片数量较多,进行完整的精度验证需要耗费很长的时间。 2. 
下载Parti数据集和hpsv2数据集 -所有数据集放到congview3/prompts目录下 -```bash -# 下载Parti数据集 -wget https://raw.githubusercontent.com/google-research/parti/main/PartiPrompts.tsv --no-check-certificate -``` -hpsv2数据集下载链接:https://gitee.com/ascend/ModelZoo-PyTorch/blob/master/MindIE/MindIE-Torch/built-in/foundation/stable_diffusion_xl/hpsv2_benchmark_prompts.json + 所有数据集放到congview3/prompts目录下 + ```bash + # 下载Parti数据集 + wget https://raw.githubusercontent.com/google-research/parti/main/PartiPrompts.tsv --no-check-certificate + ``` + hpsv2数据集下载链接:https://gitee.com/ascend/ModelZoo-PyTorch/blob/master/MindIE/MindIE-Torch/built-in/foundation/stable_diffusion_xl/hpsv2_benchmark_prompts.json 3. 下载模型权重 -```bash -# Clip Score和HPSv2均需要使用的权重 -GIT_LFS_SKIP_SMUDGE=1 -git clone https://huggingface.co/laion/CLIP-ViT-H-14-laion2B-s32B-b79K -cd ./CLIP-ViT-H-14-laion2B-s32B-b79K + ```bash + # Clip Score和HPSv2均需要使用的权重 + GIT_LFS_SKIP_SMUDGE=1 + git clone https://huggingface.co/laion/CLIP-ViT-H-14-laion2B-s32B-b79K + cd ./CLIP-ViT-H-14-laion2B-s32B-b79K -# HPSv2权重 -wget https://huggingface.co/spaces/xswu/HPSv2/resolve/main/HPS_v2_compressed.pt --no-check-certificate -``` -也可手动下载[权重](https://huggingface.co/laion/CLIP-ViT-H-14-laion2B-s32B-b79K/blob/main/open_clip_pytorch_model.bin) -将权重放到`CLIP-ViT-H-14-laion2B-s32B-b79K`目录下,手动下载[HPSv2权重](https://huggingface.co/spaces/xswu/HPSv2/resolve/main/HPS_v2_compressed.pt)放到当前路径 + # HPSv2权重 + wget https://huggingface.co/spaces/xswu/HPSv2/resolve/main/HPS_v2_compressed.pt --no-check-certificate + ``` + 也可手动下载[权重](https://huggingface.co/laion/CLIP-ViT-H-14-laion2B-s32B-b79K/blob/main/open_clip_pytorch_model.bin) + 将权重放到`CLIP-ViT-H-14-laion2B-s32B-b79K`目录下,手动下载[HPSv2权重](https://huggingface.co/spaces/xswu/HPSv2/resolve/main/HPS_v2_compressed.pt)放到当前路径 4. 使用推理脚本读取Parti数据集,生成图片 -```bash -mkdir ./results_PartiPrompts -python3 inference_cogview3plus.py \ - --model_path ${model_path} \ - --prompt_file ./prompts/PartiPrompts.tsv \ - --prompt_file_type parti \ - --info_file_save_path ./image_info_PartiPrompts.json \ - --save_dir ./results_PartiPrompts \ - --num_images_per_prompt 4 \ - --height 1024 \ - --width 1024 \ - --batch_size 1 \ - --seed 42 \ - --device_id 0 -``` -参数说明: -- model_path:权重路径,包含scheduler、text_encoder、tokenizer、transformer、vae,5个模型的配置文件及权重。 -- prompt_file:提示词文件。 -- prompt_file_type: prompt文件类型,用于指定读取方式,可选plain,parti,hpsv2。 -- info_file_save_path:生成图片信息的json文件路径。 -- save_dir:生成图片的存放目录。 -- num_images_per_prompt: 每个prompt生成的图片数量。注意使用hpsv2时,设置num_images_per_prompt=1即可。 -- height: 需要生成的图像的高。 -- width:需要生成的图像的宽。 -- batch_size:模型batch size。 -- seed:随机种子。 -- device_id:推理设备ID。 + ```bash + mkdir ./results_PartiPrompts + python3 inference_cogview3plus.py \ + --model_path ${model_path} \ + --prompt_file ./prompts/PartiPrompts.tsv \ + --prompt_file_type parti \ + --info_file_save_path ./image_info_PartiPrompts.json \ + --save_dir ./results_PartiPrompts \ + --num_images_per_prompt 4 \ + --height 1024 \ + --width 1024 \ + --batch_size 1 \ + --seed 42 \ + --device_id 0 + ``` + 参数说明: + - model_path:权重路径,包含scheduler、text_encoder、tokenizer、transformer、vae,5个模型的配置文件及权重。 + - prompt_file:提示词文件。 + - prompt_file_type: prompt文件类型,用于指定读取方式,可选plain,parti,hpsv2。 + - info_file_save_path:生成图片信息的json文件路径。 + - save_dir:生成图片的存放目录。 + - num_images_per_prompt: 每个prompt生成的图片数量。注意使用hpsv2时,设置num_images_per_prompt=1即可。 + - height: 需要生成的图像的高。 + - width:需要生成的图像的宽。 + - batch_size:模型batch size。 + - seed:随机种子。 + - device_id:推理设备ID。 
-执行完成后在`./results_PartiPrompts`目录下生成推理图片,在当前目录生成一个`image_info_PartiPrompts.json`文件,记录着图片和prompt的对应关系,并在终端显示推理时间。 + 执行完成后在`./results_PartiPrompts`目录下生成推理图片,在当前目录生成一个`image_info_PartiPrompts.json`文件,记录着图片和prompt的对应关系,并在终端显示推理时间。 5. 使用推理脚本读取hpsv2数据集,生成图片 -```bash -mkdir ./results_hpsv2 -python3 inference_cogview3plus.py \ - --model_path ${model_path} \ - --prompt_file ./prompts/hpsv2_benchmark_prompts.json \ - --prompt_file_type hpsv2 \ - --info_file_save_path ./image_info_hpsv2.json \ - --save_dir ./results_hpsv2 \ - --num_images_per_prompt 1 \ - --height 1024 \ - --width 1024 \ - --batch_size 1 \ - --seed 42 \ - --device_id 0 -``` -参数说明: -- model_path:权重路径,包含scheduler、text_encoder、tokenizer、transformer、vae,5个模型的配置文件及权重。 -- prompt_file:提示词文件。 -- prompt_file_type: prompt文件类型,用于指定读取方式,可选plain,parti,hpsv2。 -- info_file_save_path:生成图片信息的json文件路径。 -- save_dir:生成图片的存放目录。 -- num_images_per_prompt: 每个prompt生成的图片数量。注意使用hpsv2时,设置num_images_per_prompt=1即可。 -- height: 需要生成的图像的高。 -- width:需要生成的图像的宽。 -- batch_size:模型batch size。 -- seed:随机种子。 -- device_id:推理设备ID。 + ```bash + mkdir ./results_hpsv2 + python3 inference_cogview3plus.py \ + --model_path ${model_path} \ + --prompt_file ./prompts/hpsv2_benchmark_prompts.json \ + --prompt_file_type hpsv2 \ + --info_file_save_path ./image_info_hpsv2.json \ + --save_dir ./results_hpsv2 \ + --num_images_per_prompt 1 \ + --height 1024 \ + --width 1024 \ + --batch_size 1 \ + --seed 42 \ + --device_id 0 + ``` + 参数说明: + - model_path:权重路径,包含scheduler、text_encoder、tokenizer、transformer、vae,5个模型的配置文件及权重。 + - prompt_file:提示词文件。 + - prompt_file_type: prompt文件类型,用于指定读取方式,可选plain,parti,hpsv2。 + - info_file_save_path:生成图片信息的json文件路径。 + - save_dir:生成图片的存放目录。 + - num_images_per_prompt: 每个prompt生成的图片数量。注意使用hpsv2时,设置num_images_per_prompt=1即可。 + - height: 需要生成的图像的高。 + - width:需要生成的图像的宽。 + - batch_size:模型batch size。 + - seed:随机种子。 + - device_id:推理设备ID。 -执行完成后在`./results_hpsv2`目录下生成推理图片,在当前目录生成一个`image_info_hpsv2.json`文件,记录着图片和prompt的对应关系,并在终端显示推理时间。 + 执行完成后在`./results_hpsv2`目录下生成推理图片,在当前目录生成一个`image_info_hpsv2.json`文件,记录着图片和prompt的对应关系,并在终端显示推理时间。 -- Gitee From 98596465aeb500135ab378f4a8ffebe52e61fa0b Mon Sep 17 00:00:00 2001 From: jiangmengyu Date: Thu, 2 Jan 2025 16:55:53 +0800 Subject: [PATCH 85/91] =?UTF-8?q?cogview=E9=80=82=E9=85=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- MindIE/MindIE-Torch/built-in/foundation/cogview3/README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/README.md b/MindIE/MindIE-Torch/built-in/foundation/cogview3/README.md index 18e1c6a8ec..a12dd3d180 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/README.md +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/README.md @@ -217,10 +217,10 @@ python inference_cogview3plus.py \ ### 3.4 精度测试 1. 由于生成的图片存在随机性,提供两种精度验证方法: - 1. CLIP-score(文图匹配度量):评估图片和输入文本的相关性,分数的取值范围为[-1, 1],越高越好。使用Parti数据集进行验证。 - 2. HPSv2(图片美学度量):评估生成图片的人类偏好评分,分数的取值范围为[0, 1],越高越好。使用HPSv2数据集进行验证 + 1. CLIP-score(文图匹配度量):评估图片和输入文本的相关性,分数的取值范围为[-1, 1],越高越好。使用Parti数据集进行验证。 + 2. HPSv2(图片美学度量):评估生成图片的人类偏好评分,分数的取值范围为[0, 1],越高越好。使用HPSv2数据集进行验证 - 注意,由于要生成的图片数量较多,进行完整的精度验证需要耗费很长的时间。 + 注意,由于要生成的图片数量较多,进行完整的精度验证需要耗费很长的时间。 2. 
下载Parti数据集和hpsv2数据集 所有数据集放到congview3/prompts目录下 -- Gitee From 2354124247badcb220e1c2b7e8bbf000d0c786ef Mon Sep 17 00:00:00 2001 From: jiangmengyu Date: Thu, 2 Jan 2025 16:56:29 +0800 Subject: [PATCH 86/91] =?UTF-8?q?cogview=E9=80=82=E9=85=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../built-in/foundation/cogview3/README.md | 144 +++++++++--------- 1 file changed, 72 insertions(+), 72 deletions(-) diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/README.md b/MindIE/MindIE-Torch/built-in/foundation/cogview3/README.md index a12dd3d180..426a0a4e53 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/README.md +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/README.md @@ -223,85 +223,85 @@ python inference_cogview3plus.py \ 注意,由于要生成的图片数量较多,进行完整的精度验证需要耗费很长的时间。 2. 下载Parti数据集和hpsv2数据集 - 所有数据集放到congview3/prompts目录下 - ```bash - # 下载Parti数据集 - wget https://raw.githubusercontent.com/google-research/parti/main/PartiPrompts.tsv --no-check-certificate - ``` - hpsv2数据集下载链接:https://gitee.com/ascend/ModelZoo-PyTorch/blob/master/MindIE/MindIE-Torch/built-in/foundation/stable_diffusion_xl/hpsv2_benchmark_prompts.json + 所有数据集放到congview3/prompts目录下 + ```bash + # 下载Parti数据集 + wget https://raw.githubusercontent.com/google-research/parti/main/PartiPrompts.tsv --no-check-certificate + ``` + hpsv2数据集下载链接:https://gitee.com/ascend/ModelZoo-PyTorch/blob/master/MindIE/MindIE-Torch/built-in/foundation/stable_diffusion_xl/hpsv2_benchmark_prompts.json 3. 下载模型权重 - ```bash - # Clip Score和HPSv2均需要使用的权重 - GIT_LFS_SKIP_SMUDGE=1 - git clone https://huggingface.co/laion/CLIP-ViT-H-14-laion2B-s32B-b79K - cd ./CLIP-ViT-H-14-laion2B-s32B-b79K + ```bash + # Clip Score和HPSv2均需要使用的权重 + GIT_LFS_SKIP_SMUDGE=1 + git clone https://huggingface.co/laion/CLIP-ViT-H-14-laion2B-s32B-b79K + cd ./CLIP-ViT-H-14-laion2B-s32B-b79K - # HPSv2权重 - wget https://huggingface.co/spaces/xswu/HPSv2/resolve/main/HPS_v2_compressed.pt --no-check-certificate - ``` - 也可手动下载[权重](https://huggingface.co/laion/CLIP-ViT-H-14-laion2B-s32B-b79K/blob/main/open_clip_pytorch_model.bin) - 将权重放到`CLIP-ViT-H-14-laion2B-s32B-b79K`目录下,手动下载[HPSv2权重](https://huggingface.co/spaces/xswu/HPSv2/resolve/main/HPS_v2_compressed.pt)放到当前路径 + # HPSv2权重 + wget https://huggingface.co/spaces/xswu/HPSv2/resolve/main/HPS_v2_compressed.pt --no-check-certificate + ``` + 也可手动下载[权重](https://huggingface.co/laion/CLIP-ViT-H-14-laion2B-s32B-b79K/blob/main/open_clip_pytorch_model.bin) + 将权重放到`CLIP-ViT-H-14-laion2B-s32B-b79K`目录下,手动下载[HPSv2权重](https://huggingface.co/spaces/xswu/HPSv2/resolve/main/HPS_v2_compressed.pt)放到当前路径 4. 
使用推理脚本读取Parti数据集,生成图片 - ```bash - mkdir ./results_PartiPrompts - python3 inference_cogview3plus.py \ - --model_path ${model_path} \ - --prompt_file ./prompts/PartiPrompts.tsv \ - --prompt_file_type parti \ - --info_file_save_path ./image_info_PartiPrompts.json \ - --save_dir ./results_PartiPrompts \ - --num_images_per_prompt 4 \ - --height 1024 \ - --width 1024 \ - --batch_size 1 \ - --seed 42 \ - --device_id 0 - ``` - 参数说明: - - model_path:权重路径,包含scheduler、text_encoder、tokenizer、transformer、vae,5个模型的配置文件及权重。 - - prompt_file:提示词文件。 - - prompt_file_type: prompt文件类型,用于指定读取方式,可选plain,parti,hpsv2。 - - info_file_save_path:生成图片信息的json文件路径。 - - save_dir:生成图片的存放目录。 - - num_images_per_prompt: 每个prompt生成的图片数量。注意使用hpsv2时,设置num_images_per_prompt=1即可。 - - height: 需要生成的图像的高。 - - width:需要生成的图像的宽。 - - batch_size:模型batch size。 - - seed:随机种子。 - - device_id:推理设备ID。 + ```bash + mkdir ./results_PartiPrompts + python3 inference_cogview3plus.py \ + --model_path ${model_path} \ + --prompt_file ./prompts/PartiPrompts.tsv \ + --prompt_file_type parti \ + --info_file_save_path ./image_info_PartiPrompts.json \ + --save_dir ./results_PartiPrompts \ + --num_images_per_prompt 4 \ + --height 1024 \ + --width 1024 \ + --batch_size 1 \ + --seed 42 \ + --device_id 0 + ``` + 参数说明: + - model_path:权重路径,包含scheduler、text_encoder、tokenizer、transformer、vae,5个模型的配置文件及权重。 + - prompt_file:提示词文件。 + - prompt_file_type: prompt文件类型,用于指定读取方式,可选plain,parti,hpsv2。 + - info_file_save_path:生成图片信息的json文件路径。 + - save_dir:生成图片的存放目录。 + - num_images_per_prompt: 每个prompt生成的图片数量。注意使用hpsv2时,设置num_images_per_prompt=1即可。 + - height: 需要生成的图像的高。 + - width:需要生成的图像的宽。 + - batch_size:模型batch size。 + - seed:随机种子。 + - device_id:推理设备ID。 - 执行完成后在`./results_PartiPrompts`目录下生成推理图片,在当前目录生成一个`image_info_PartiPrompts.json`文件,记录着图片和prompt的对应关系,并在终端显示推理时间。 + 执行完成后在`./results_PartiPrompts`目录下生成推理图片,在当前目录生成一个`image_info_PartiPrompts.json`文件,记录着图片和prompt的对应关系,并在终端显示推理时间。 5. 
使用推理脚本读取hpsv2数据集,生成图片 - ```bash - mkdir ./results_hpsv2 - python3 inference_cogview3plus.py \ - --model_path ${model_path} \ - --prompt_file ./prompts/hpsv2_benchmark_prompts.json \ - --prompt_file_type hpsv2 \ - --info_file_save_path ./image_info_hpsv2.json \ - --save_dir ./results_hpsv2 \ - --num_images_per_prompt 1 \ - --height 1024 \ - --width 1024 \ - --batch_size 1 \ - --seed 42 \ - --device_id 0 - ``` - 参数说明: - - model_path:权重路径,包含scheduler、text_encoder、tokenizer、transformer、vae,5个模型的配置文件及权重。 - - prompt_file:提示词文件。 - - prompt_file_type: prompt文件类型,用于指定读取方式,可选plain,parti,hpsv2。 - - info_file_save_path:生成图片信息的json文件路径。 - - save_dir:生成图片的存放目录。 - - num_images_per_prompt: 每个prompt生成的图片数量。注意使用hpsv2时,设置num_images_per_prompt=1即可。 - - height: 需要生成的图像的高。 - - width:需要生成的图像的宽。 - - batch_size:模型batch size。 - - seed:随机种子。 - - device_id:推理设备ID。 + ```bash + mkdir ./results_hpsv2 + python3 inference_cogview3plus.py \ + --model_path ${model_path} \ + --prompt_file ./prompts/hpsv2_benchmark_prompts.json \ + --prompt_file_type hpsv2 \ + --info_file_save_path ./image_info_hpsv2.json \ + --save_dir ./results_hpsv2 \ + --num_images_per_prompt 1 \ + --height 1024 \ + --width 1024 \ + --batch_size 1 \ + --seed 42 \ + --device_id 0 + ``` + 参数说明: + - model_path:权重路径,包含scheduler、text_encoder、tokenizer、transformer、vae,5个模型的配置文件及权重。 + - prompt_file:提示词文件。 + - prompt_file_type: prompt文件类型,用于指定读取方式,可选plain,parti,hpsv2。 + - info_file_save_path:生成图片信息的json文件路径。 + - save_dir:生成图片的存放目录。 + - num_images_per_prompt: 每个prompt生成的图片数量。注意使用hpsv2时,设置num_images_per_prompt=1即可。 + - height: 需要生成的图像的高。 + - width:需要生成的图像的宽。 + - batch_size:模型batch size。 + - seed:随机种子。 + - device_id:推理设备ID。 - 执行完成后在`./results_hpsv2`目录下生成推理图片,在当前目录生成一个`image_info_hpsv2.json`文件,记录着图片和prompt的对应关系,并在终端显示推理时间。 + 执行完成后在`./results_hpsv2`目录下生成推理图片,在当前目录生成一个`image_info_hpsv2.json`文件,记录着图片和prompt的对应关系,并在终端显示推理时间。 -- Gitee From cf9848a2abecc6952a0e7cbfdda6a9ac1f30149c Mon Sep 17 00:00:00 2001 From: jiangmengyu Date: Mon, 6 Jan 2025 10:42:48 +0800 Subject: [PATCH 87/91] =?UTF-8?q?cogview=E9=80=82=E9=85=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../foundation/cogview3/clip_score.py | 140 ++++++++++++++++++ .../models/transformer_cogview3plus.py | 6 +- .../foundation/cogview3/hpsv2_score.py | 123 +++++++++++++++ 3 files changed, 266 insertions(+), 3 deletions(-) create mode 100644 MindIE/MindIE-Torch/built-in/foundation/cogview3/clip_score.py create mode 100644 MindIE/MindIE-Torch/built-in/foundation/cogview3/hpsv2_score.py diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/clip_score.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/clip_score.py new file mode 100644 index 0000000000..e0987baac7 --- /dev/null +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/clip_score.py @@ -0,0 +1,140 @@ +# Copyright 2023 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import os +import json +import time +import argparse + +import open_clip +import numpy as np +from PIL import Image +import torch +import torch.nn.functional as F + + +def clip_score(model_clip, tokenizer, preprocess, prompt, image_files, device): + imgs = [] + texts = [] + for image_file in image_files: + img = preprocess(Image.open(image_file)).unsqueeze(0).to(device) + imgs.append(img) + text = tokenizer([prompt]).to(device) + texts.append(text) + + img = torch.cat(imgs) # [bs, 3, 224, 224] + text = torch.cat(texts) # [bs, 77] + + with torch.no_grad(): + text_ft = model_clip.encode_text(text).float() + img_ft = model_clip.encode_image(img).float() + score = F.cosine_similarity(img_ft, text_ft).squeeze() + + return score.cpu() + + +def main(): + args = parse_arguments() + + if args.device is None: + device = torch.device('cuda' if (torch.cuda.is_available()) else 'cpu') + else: + device = torch.device(args.device) + + t_b = time.time() + print(f"Load clip model...") + model_clip, _, preprocess = open_clip.create_model_and_transforms( + args.model_name, pretrained=args.model_weights_path, device=device) + model_clip.eval() + print(f">done. elapsed time: {(time.time() - t_b):.3f} s") + + tokenizer = open_clip.get_tokenizer(args.model_name) + + with os.fdopen(os.open(args.image_info, os.O_RDONLY), "r") as f: + image_info = json.load(f) + + t_b = time.time() + print(f"Calc clip score...") + all_scores = [] + cat_scores = {} + + for i, info in enumerate(image_info): + image_files = info['images'] + category = info['category'] + prompt = info['prompt'] + + print(f"[{i + 1}/{len(image_info)}] {prompt}") + + image_scores = clip_score(model_clip, + tokenizer, + preprocess, + prompt, + image_files, + device) + if len(image_files) > 1: + best_score = max(image_scores) + else: + best_score = image_scores + + print(f"image scores: {image_scores}") + print(f"best score: {best_score}") + + all_scores.append(best_score) + if category not in cat_scores: + cat_scores[category] = [] + cat_scores[category].append(best_score) + print(f">done. 
elapsed time: {(time.time() - t_b):.3f} s") + + average_score = np.average(all_scores) + print(f"====================================") + print(f"average score: {average_score:.3f}") + print(f"category average scores:") + cat_average_scores = {} + for category, scores in cat_scores.items(): + cat_average_scores[category] = np.average(scores) + print(f"[{category}], average score: {cat_average_scores[category]:.3f}") + + +def parse_arguments(): + parser = argparse.ArgumentParser() + parser.add_argument( + "--device", + type=str, + default="cpu", + choices=["cpu", "cuda"], + help="device for torch.", + ) + parser.add_argument( + "--image_info", + type=str, + default="./image_info.json", + help="Image_info.json file.", + ) + parser.add_argument( + "--model_name", + type=str, + default="ViT-H-14", + help="open clip model name", + ) + parser.add_argument( + "--model_weights_path", + type=str, + default="./CLIP-ViT-H-14-laion2B-s32B-b79K/open_clip_pytorch_model.bin", + help="open clip model weights", + ) + return parser.parse_args() + + +if __name__ == '__main__': + main() \ No newline at end of file diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/transformer_cogview3plus.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/transformer_cogview3plus.py index f704e22589..9515d865be 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/transformer_cogview3plus.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/cogview3plus/models/transformer_cogview3plus.py @@ -130,9 +130,9 @@ class CogView3PlusTransformer2DModel(ModelMixin, ConfigMixin): pos_embed_max_size: int = 128, use_cache: bool = True, cache_interval: int = 2, - cache_start: int = 3, - num_cache_layer: int = 13, - cache_start_steps: int = 5, + cache_start: int = 1, + num_cache_layer: int = 11, + cache_start_steps: int = 10, ): super().__init__() self.out_channels = out_channels diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/hpsv2_score.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/hpsv2_score.py new file mode 100644 index 0000000000..a27ba20b5a --- /dev/null +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/hpsv2_score.py @@ -0,0 +1,123 @@ +# Copyright 2024 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import argparse +import os +from typing import Union +import json + +from clint.textui import progress +import hpsv2 +from hpsv2.utils import root_path, hps_version_map +from hpsv2.src.open_clip import create_model_and_transforms, get_tokenizer +import huggingface_hub +from PIL import Image +import requests +import torch + + +def initialize_model(pretrained_path, device): + model, _, preprocess_val = create_model_and_transforms( + "ViT-H-14", pretrained=pretrained_path, precision='amp', + device=device, + jit=False, + force_quick_gelu=False, + force_custom_text=False, + force_patch_dropout=False, + force_image_size=None, + pretrained_image=False, + image_mean=None, + image_std=None, + light_augmentation=True, + aug_cfg={}, + output_dict=True, + with_score_predictor=False, + with_region_predictor=False + ) + return model, preprocess_val + + +def parse_arguments(): + parser = argparse.ArgumentParser() + parser.add_argument( + "--image_info", + type=str, + default="./image_info.json", + help="Image_info.json file.", + ) + parser.add_argument( + "--HPSv2_checkpoint", + type=str, + default="./HPS_v2_compressed.pt", + help="HPS_v2 model weights", + ) + parser.add_argument( + "--clip_checkpoint", + type=str, + default="./CLIP-ViT-H-14-laion2B-s32B-b79K/open_clip_pytorch_model.bin", + help="open clip model weights", + ) + return parser.parse_args() + + +def main(): + args = parse_arguments() + + device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') + + model, preprocess_val = initialize_model(args.clip_checkpoint, device) + + checkpoint = torch.load(args.HPSv2_checkpoint, map_location=device) + model.load_state_dict(checkpoint['state_dict']) + tokenizer = get_tokenizer('ViT-H-14') + model = model.to(device) + model.eval() + + with os.fdopen(os.open(args.image_info, os.O_RDONLY), "r") as f: + image_info = json.load(f) + + result = [] + for i, info in enumerate(image_info): + image_file = info['images'][0] + prompt = info['prompt'] + + # Load your image and prompt + with torch.no_grad(): + # Process the image + if isinstance(image_file, str): + image = preprocess_val(Image.open(image_file)) + elif isinstance(image_file, Image.Image): + image = preprocess_val(image_file) + else: + raise TypeError('The type of parameter img_path is illegal.') + image = image.unsqueeze(0).to(device=device, non_blocking=True) + # Process the prompt + text = tokenizer([prompt]).to(device=device, non_blocking=True) + # Calculate the HPS + with torch.cuda.amp.autocast(): + outputs = model(image, text) + image_features = outputs["image_features"] + text_features = outputs["text_features"] + logits_per_image = image_features @ text_features.T + + hps_score = torch.diagonal(logits_per_image).cpu().numpy() + print(f"image {i} hps_score: ", hps_score[0]) + + result.append(hps_score[0]) + + print('avg HPSv2 score:', sum(result) / len(result)) + + +if __name__ == '__main__': + main() \ No newline at end of file -- Gitee From 3c3439187bdb07c46fc0c74413396dc24e747619 Mon Sep 17 00:00:00 2001 From: jiangmengyu Date: Mon, 6 Jan 2025 11:07:24 +0800 Subject: [PATCH 88/91] =?UTF-8?q?cogview=E9=80=82=E9=85=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../built-in/foundation/cogview3/clip_score.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/clip_score.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/clip_score.py index e0987baac7..b9bf3ce7bb 100644 --- 
a/MindIE/MindIE-Torch/built-in/foundation/cogview3/clip_score.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/clip_score.py @@ -22,6 +22,7 @@ import numpy as np from PIL import Image import torch import torch.nn.functional as F +import torch_npu def clip_score(model_clip, tokenizer, preprocess, prompt, image_files, device): @@ -47,6 +48,9 @@ def clip_score(model_clip, tokenizer, preprocess, prompt, image_files, device): def main(): args = parse_arguments() + if args.device == 'npu': + torch.npu.set_device(0) + if args.device is None: device = torch.device('cuda' if (torch.cuda.is_available()) else 'cpu') else: @@ -111,8 +115,8 @@ def parse_arguments(): parser.add_argument( "--device", type=str, - default="cpu", - choices=["cpu", "cuda"], + default="npu", + choices=["cpu", "cuda", "npu"], help="device for torch.", ) parser.add_argument( -- Gitee From 351fb0602edadc7837a80f3c8c8b39ed513a99f4 Mon Sep 17 00:00:00 2001 From: jiangmengyu Date: Mon, 6 Jan 2025 11:13:59 +0800 Subject: [PATCH 89/91] =?UTF-8?q?cogview=E9=80=82=E9=85=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../built-in/foundation/cogview3/clip_score.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/clip_score.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/clip_score.py index b9bf3ce7bb..e0987baac7 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/clip_score.py +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/clip_score.py @@ -22,7 +22,6 @@ import numpy as np from PIL import Image import torch import torch.nn.functional as F -import torch_npu def clip_score(model_clip, tokenizer, preprocess, prompt, image_files, device): @@ -48,9 +47,6 @@ def clip_score(model_clip, tokenizer, preprocess, prompt, image_files, device): def main(): args = parse_arguments() - if args.device == 'npu': - torch.npu.set_device(0) - if args.device is None: device = torch.device('cuda' if (torch.cuda.is_available()) else 'cpu') else: @@ -115,8 +111,8 @@ def parse_arguments(): parser.add_argument( "--device", type=str, - default="npu", - choices=["cpu", "cuda", "npu"], + default="cpu", + choices=["cpu", "cuda"], help="device for torch.", ) parser.add_argument( -- Gitee From d9290bc41b3bf0d595adddb848df830c44f27cba Mon Sep 17 00:00:00 2001 From: jiangmengyu Date: Mon, 6 Jan 2025 19:55:03 +0800 Subject: [PATCH 90/91] =?UTF-8?q?cogview=E9=80=82=E9=85=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../built-in/foundation/cogview3/README.md | 61 +++++++++++++------ .../cogview3/inference_cogview3plus.py | 4 +- .../foundation/cogview3/requirents.txt | 2 +- 3 files changed, 45 insertions(+), 22 deletions(-) diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/README.md b/MindIE/MindIE-Torch/built-in/foundation/cogview3/README.md index 426a0a4e53..a3c4f437c8 100644 --- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/README.md +++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/README.md @@ -5,7 +5,7 @@ | 配套 | 版本 | 环境准备指导 | | ----- | ----- |-----| | Python | 3.10.12 | - | - | torch | 2.4.0 | - | + | torch | 2.1.0 | - | ### 1.1 获取CANN&MindIE安装包&环境准备 - [800I A2](https://www.hiascend.com/developer/download/community/result?module=pt+ie+cann&product=4&model=32) @@ -136,9 +136,9 @@ https://huggingface.co/THUDM/CogView3-Plus-3B/tree/main/transformer "time_embed_dim": 512, "use_cache": True, "cache_interval": 2, - "cache_start": 3, - 
"num_cache_layer" 13, - "cache_start_steps" 5 + "cache_start": 1, + "num_cache_layer" 11, + "cache_start_steps" 10 } ``` 6. vae权重链接: @@ -181,21 +181,11 @@ git clone https://huggingface.co/THUDM/CogView3-Plus-3B ```shell cd cogview3 ``` -2. 设置权重路径 -```shell -model_path='/data/CogView3B' -``` -3. 创建输出图像路径 -```shell -output_path='./results' -mkdir ${output_path} -``` -4. 推理: +2. 推理: ```shell python inference_cogview3plus.py \ - --model_path ${model_path} \ + --model_path /data/CogView3B \ --prompt_file ./prompts/example_prompts.txt \ - --save_dir ${output_path} \ --width 1024 \ --height 1024 \ --num_inference_steps 50 \ @@ -205,14 +195,13 @@ python inference_cogview3plus.py \ 参数说明: - model_path:权重路径,包含scheduler、text_encoder、tokenizer、transformer、vae,5个模型的配置文件及权重。 - prompt_file:提示词文件。 -- save_dir:生成图片的存放目录。 - width:需要生成的图像的宽。 - height: 需要生成的图像的高。 - num_inference_steps:推理迭代步数。 - dtype: 数据类型。目前只支持bf16。 - device_id:推理设备ID。 -5. 可以通过修改权重文件中`/data/CongView3B/transforer/config.json`中的`use_cache`参数来控制dit cache算法的开关,`true`表示使用dit cache,`false`表示关闭dit cache。 +3. 可以通过修改权重文件中`/data/CongView3B/transforer/config.json`中的`use_cache`参数来控制dit cache算法的开关,`true`表示使用dit cache,`false`表示关闭dit cache。 ### 3.4 精度测试 @@ -248,7 +237,7 @@ python inference_cogview3plus.py \ ```bash mkdir ./results_PartiPrompts python3 inference_cogview3plus.py \ - --model_path ${model_path} \ + --model_path /data/CogView3B \ --prompt_file ./prompts/PartiPrompts.tsv \ --prompt_file_type parti \ --info_file_save_path ./image_info_PartiPrompts.json \ @@ -279,7 +268,7 @@ python inference_cogview3plus.py \ ```bash mkdir ./results_hpsv2 python3 inference_cogview3plus.py \ - --model_path ${model_path} \ + --model_path /data/CogView3B \ --prompt_file ./prompts/hpsv2_benchmark_prompts.json \ --prompt_file_type hpsv2 \ --info_file_save_path ./image_info_hpsv2.json \ @@ -305,3 +294,35 @@ python inference_cogview3plus.py \ - device_id:推理设备ID。 执行完成后在`./results_hpsv2`目录下生成推理图片,在当前目录生成一个`image_info_hpsv2.json`文件,记录着图片和prompt的对应关系,并在终端显示推理时间。 + +6. 计算精度指标 + 1. CLIP-score + ```bash + python3 clip_score.py \ + --device=gpu \ + --image_info="./image_info_PartiPrompts_cache.json" \ + --model_name="ViT-H-14" \ + --model_weights_path="./CLIP-ViT-H-14-laion2B-s32B-b79K/open_clip_pytorch_model.bin" + ``` + 参数说明: + - --device: 推理设备。 + - --image_info: 上一步生成的`image_info.json`文件。 + - --model_name: Clip模型名称。 + - --model_weights_path: Clip模型权重文件路径。 + + 执行完成后会在屏幕打印出精度计算结果。 + + 2. 
HPSv2
+    ```bash
+    python3 hpsv2_score.py \
+        --image_info="image_info_hpsv2_cache.json" \
+        --HPSv2_checkpoint="./HPS_v2_compressed.pt" \
+        --clip_checkpoint="./CLIP-ViT-H-14-laion2B-s32B-b79K/open_clip_pytorch_model.bin"
+    ```
+
+    参数说明:
+    - --image_info: 上一步生成的`image_info.json`文件。
+    - --HPSv2_checkpoint: HPSv2模型权重文件路径。
+    - --clip_checkpoint: Clip模型权重文件路径。
+
+    执行完成后会在屏幕打印出精度计算结果。
diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/inference_cogview3plus.py b/MindIE/MindIE-Torch/built-in/foundation/cogview3/inference_cogview3plus.py
index 3ef016ed2a..ae90899030 100644
--- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/inference_cogview3plus.py
+++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/inference_cogview3plus.py
@@ -197,6 +197,9 @@ def infer(args):
         torch.npu.set_device(args.device_id)
 
     dtype = torch.bfloat16 if args.dtype == "bf16" else torch.float16
+    if not os.path.exists(args.save_dir):
+        os.makedirs(args.save_dir)
+
     if args.seed is not None:
         set_random_seed(args.seed)
 
@@ -259,4 +262,3 @@ def infer(args):
 if __name__ == "__main__":
     inference_args = parse_arguments()
     infer(inference_args)
-
diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/requirents.txt b/MindIE/MindIE-Torch/built-in/foundation/cogview3/requirents.txt
index 1600434700..b3b2501d42 100644
--- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/requirents.txt
+++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/requirents.txt
@@ -4,5 +4,5 @@ gradio==5.9.1
 accelerate==1.0.1
 diffusers==0.31.0
 sentencepiece==0.2.0
-torch==2.4.0
+torch==2.1.0
 openai==1.58.1
\ No newline at end of file
-- 
Gitee


From 50a97fccc331e5ffe2decb51d69043a784c35485 Mon Sep 17 00:00:00 2001
From: jiangmengyu
Date: Tue, 7 Jan 2025 11:25:53 +0800
Subject: [PATCH 91/91] =?UTF-8?q?cogview=E9=80=82=E9=85=8D?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../built-in/foundation/cogview3/README.md | 16 +++++++++++-----
 1 file changed, 11 insertions(+), 5 deletions(-)

diff --git a/MindIE/MindIE-Torch/built-in/foundation/cogview3/README.md b/MindIE/MindIE-Torch/built-in/foundation/cogview3/README.md
index a3c4f437c8..a16bf5a0b9 100644
--- a/MindIE/MindIE-Torch/built-in/foundation/cogview3/README.md
+++ b/MindIE/MindIE-Torch/built-in/foundation/cogview3/README.md
@@ -134,7 +134,7 @@ https://huggingface.co/THUDM/CogView3-Plus-3B/tree/main/transformer
     "sample_size": 128,
     "text_embed_dim": 4096,
     "time_embed_dim": 512,
-    "use_cache": True,
+    "use_cache": False,
     "cache_interval": 2,
     "cache_start": 1,
     "num_cache_layer" 11,
@@ -225,7 +225,6 @@ python inference_cogview3plus.py \
     # Clip Score和HPSv2均需要使用的权重
     GIT_LFS_SKIP_SMUDGE=1
     git clone https://huggingface.co/laion/CLIP-ViT-H-14-laion2B-s32B-b79K
-    cd ./CLIP-ViT-H-14-laion2B-s32B-b79K
 
     # HPSv2权重
     wget https://huggingface.co/spaces/xswu/HPSv2/resolve/main/HPS_v2_compressed.pt --no-check-certificate
@@ -299,12 +299,12 @@ python inference_cogview3plus.py \
     ```bash
     python3 clip_score.py \
       --device=gpu \
-      --image_info="./image_info_PartiPrompts_cache.json" \
+      --image_info="./image_info_PartiPrompts.json" \
       --model_name="ViT-H-14" \
      --model_weights_path="./CLIP-ViT-H-14-laion2B-s32B-b79K/open_clip_pytorch_model.bin"
     ```
     参数说明:
-    - --device: 推理设备。
+    - --device: 推理设备(CPU或者GPU)。
     - --image_info: 上一步生成的`image_info.json`文件。
     - --model_name: Clip模型名称。
     - --model_weights_path: Clip模型权重文件路径。
 
     执行完成后会在屏幕打印出精度计算结果。
 
     2. HPSv2
     ```bash
     python3 hpsv2_score.py \
-        --image_info="image_info_hpsv2_cache.json" \
+        --image_info="image_info_hpsv2.json" \
         --HPSv2_checkpoint="./HPS_v2_compressed.pt" \
         --clip_checkpoint="./CLIP-ViT-H-14-laion2B-s32B-b79K/open_clip_pytorch_model.bin"
     ```
 
     参数说明:
     - --image_info: 上一步生成的`image_info.json`文件。
     - --HPSv2_checkpoint: HPSv2模型权重文件路径。
     - --clip_checkpoint: Clip模型权重文件路径。
 
     执行完成后会在屏幕打印出精度计算结果。
+
+### CogView3plus
+
+| 硬件形态 | 迭代次数 | dit cache | 平均耗时 | CLIP_score | HPSv2_score |
+| :------: |:----:|:----:|:----:|:----:|:----:|
+| Atlas 800T A2 (64G) 单卡 | 50 | False | 27.588s | 0.367 | 0.2879729 |
+| Atlas 800T A2 (64G) 单卡 | 50 | True | 23.639s | 0.367 | 0.2878573 |
\ No newline at end of file
-- 
Gitee
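
上表中 dit cache 开/关两组数据分别对应 `transformer/config.json` 中 `use_cache` 开关的两种配置。下面是一个示意性的小脚本(非本仓库自带,函数名 `set_dit_cache` 为示例假设;权重根目录 `/data/CogView3B`、`transformer/config.json` 路径及 `use_cache` 字段均取自上文 README 的说明,实际字段取值请以本地配置文件为准),用于在两种配置之间切换:

```python
# 示意脚本:切换 CogView3plus transformer 配置中的 dit cache 开关
# 假设权重目录结构为 <model_path>/transformer/config.json(与上文 README 的说明一致)
import json
import os


def set_dit_cache(model_path, enable):
    """读取 transformer/config.json,设置 use_cache 后原样写回,返回配置文件路径。"""
    config_path = os.path.join(model_path, "transformer", "config.json")
    with open(config_path, "r", encoding="utf-8") as f:
        config = json.load(f)
    # 仅切换 use_cache 开关,cache_interval、cache_start 等其余字段保持不变
    config["use_cache"] = bool(enable)
    with open(config_path, "w", encoding="utf-8") as f:
        json.dump(config, f, indent=2, ensure_ascii=False)
    return config_path


if __name__ == "__main__":
    # 例如:先开启 dit cache,再运行 inference_cogview3plus.py 复测平均耗时
    print(set_dit_cache("/data/CogView3B", True))
```

切换后无需其他改动,按上文的推理命令重新执行,即可分别得到开启/关闭 dit cache 的两组结果。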