From 3122293588b91a6b1bec3f812761d9360525070f Mon Sep 17 00:00:00 2001 From: lin-mingzhe-hw Date: Thu, 6 Jun 2024 20:57:35 +0800 Subject: [PATCH 1/3] [built-in][PyTorch][OpenSoraPlan] OpenSoraPlan v1.1 transfer to npu --- .../models/diffusion/latte/modules.py | 2 + .../models/ae/videobase/modules/conv.py | 16 +- .../models/ae/videobase/modules/ops.py | 4 +- .../ae/videobase/modules/updownsample.py | 23 ++- .../diffusion/gaussian_diffusion_t2v.py | 4 +- .../models/diffusion/latte/modules.py | 104 ++++++---- .../opensora/models/text_encoder/__init__.py | 17 +- .../opensora/train/train_t2v.py | 29 ++- .../opensora/utils/npu_utils.py | 186 ++++++++++++++++++ .../public_address_statement.md | 146 ++++++++++++++ .../train_videoae_65x512x512_16.sh | 33 ++++ .../scripts/train_data/video_data.txt | 4 +- 12 files changed, 510 insertions(+), 58 deletions(-) create mode 100644 PyTorch/built-in/mlm/OpenSoraPlan1.1/opensora/utils/npu_utils.py create mode 100644 PyTorch/built-in/mlm/OpenSoraPlan1.1/public_address_statement.md create mode 100644 PyTorch/built-in/mlm/OpenSoraPlan1.1/scripts/text_condition/train_videoae_65x512x512_16.sh diff --git a/PyTorch/built-in/mlm/OpenSoraPlan1.0/opensora/models/diffusion/latte/modules.py b/PyTorch/built-in/mlm/OpenSoraPlan1.0/opensora/models/diffusion/latte/modules.py index b81d358e36..df21156612 100644 --- a/PyTorch/built-in/mlm/OpenSoraPlan1.0/opensora/models/diffusion/latte/modules.py +++ b/PyTorch/built-in/mlm/OpenSoraPlan1.0/opensora/models/diffusion/latte/modules.py @@ -1,3 +1,4 @@ +# Copyright 2024 Huawei Technologies Co., Ltd from importlib import import_module import math @@ -900,6 +901,7 @@ class AttnProcessor2_0: input_layout="BSH", scale=scale, pse=None, + atten_mask=attention_mask.bool().expand(-1, -1, query.shape[1], -1) if attention_mask is not None else None, pre_tockens=2147483647, next_tockens=2147483647, keep_prob=1.0, diff --git a/PyTorch/built-in/mlm/OpenSoraPlan1.1/opensora/models/ae/videobase/modules/conv.py b/PyTorch/built-in/mlm/OpenSoraPlan1.1/opensora/models/ae/videobase/modules/conv.py index 5a4c8ae279..afc8c229cf 100644 --- a/PyTorch/built-in/mlm/OpenSoraPlan1.1/opensora/models/ae/videobase/modules/conv.py +++ b/PyTorch/built-in/mlm/OpenSoraPlan1.1/opensora/models/ae/videobase/modules/conv.py @@ -1,3 +1,4 @@ +# Copyright 2024 Huawei Technologies Co., Ltd import torch.nn as nn from typing import Union, Tuple import torch.nn.functional as F @@ -7,6 +8,10 @@ from .ops import cast_tuple from einops import rearrange from .ops import video_to_image +from opensora.utils.npu_utils import is_npu_available +if is_npu_available(): + import torch_npu + class Conv2d(nn.Conv2d): def __init__( self, @@ -55,7 +60,10 @@ class CausalConv3d(nn.Module): padding = list(cast_tuple(padding, 3)) padding[0] = 0 stride = cast_tuple(stride, 3) - self.conv = nn.Conv3d(chan_in, chan_out, self.kernel_size, stride=stride, padding=padding) + if is_npu_available(): + self.conv = nn.Conv3d(chan_in, chan_out, self.kernel_size, stride=stride, padding=padding, dtype=torch.bfloat16) + else: + self.conv = nn.Conv3d(chan_in, chan_out, self.kernel_size, stride=stride, padding=padding) self._init_weights(init_method) def _init_weights(self, init_method): @@ -95,4 +103,8 @@ class CausalConv3d(nn.Module): (1, 1, self.time_kernel_size - 1, 1, 1) ) # b c t h w x = torch.concatenate((first_frame_pad, x), dim=2) # 3 + 16 - return self.conv(x) \ No newline at end of file + if is_npu_available(): + res = self.conv.to(torch.bfloat16)(x.to(torch.bfloat16)).to(x.dtype) + return 
torch_npu.npu_format_cast(res, 2) + else: + return self.conv(x) \ No newline at end of file diff --git a/PyTorch/built-in/mlm/OpenSoraPlan1.1/opensora/models/ae/videobase/modules/ops.py b/PyTorch/built-in/mlm/OpenSoraPlan1.1/opensora/models/ae/videobase/modules/ops.py index fdd262ad71..999cec0348 100644 --- a/PyTorch/built-in/mlm/OpenSoraPlan1.1/opensora/models/ae/videobase/modules/ops.py +++ b/PyTorch/built-in/mlm/OpenSoraPlan1.1/opensora/models/ae/videobase/modules/ops.py @@ -1,3 +1,4 @@ +# Copyright 2024 Huawei Technologies Co., Ltd import torch from einops import rearrange @@ -12,7 +13,8 @@ def video_to_image(func): return wrapper def nonlinearity(x): - return x * torch.sigmoid(x) + # return x * torch.sigmoid(x) + return torch.nn.functional.silu(x) def cast_tuple(t, length=1): return t if isinstance(t, tuple) else ((t,) * length) diff --git a/PyTorch/built-in/mlm/OpenSoraPlan1.1/opensora/models/ae/videobase/modules/updownsample.py b/PyTorch/built-in/mlm/OpenSoraPlan1.1/opensora/models/ae/videobase/modules/updownsample.py index 9e3d489aee..0824ef1487 100644 --- a/PyTorch/built-in/mlm/OpenSoraPlan1.1/opensora/models/ae/videobase/modules/updownsample.py +++ b/PyTorch/built-in/mlm/OpenSoraPlan1.1/opensora/models/ae/videobase/modules/updownsample.py @@ -1,3 +1,4 @@ +# Copyright 2024 Huawei Technologies Co., Ltd from typing import Union, Tuple import torch import torch.nn as nn @@ -10,6 +11,10 @@ from .conv import CausalConv3d from einops import rearrange from .block import Block +from opensora.utils.npu_utils import is_npu_available +if is_npu_available(): + import torch_npu + class Upsample(Block): def __init__(self, in_channels, out_channels): super().__init__() @@ -115,14 +120,16 @@ class TimeDownsample2x(Block): ): super().__init__() self.kernel_size = kernel_size - self.conv = nn.AvgPool3d((kernel_size,1,1), stride=(2,1,1)) - + self.conv = nn.AvgPool1d(kernel_size, stride=2) + def forward(self, x): first_frame_pad = x[:, :, :1, :, :].repeat( (1, 1, self.kernel_size - 1, 1, 1) ) x = torch.concatenate((first_frame_pad, x), dim=2) - return self.conv(x) + n, c, d, h, w = x.shape + return self.conv(x.permute(0, 1, 3, 4, 2).reshape(n, c * h * w, d)).reshape(n, c, h, w, -1).permute(0, 1, 4, 2, + 3) class TimeUpsample2x(Block): def __init__( @@ -148,7 +155,7 @@ class TimeDownsampleRes2x(nn.Module): ): super().__init__() self.kernel_size = cast_tuple(kernel_size, 3) - self.avg_pool = nn.AvgPool3d((kernel_size,1,1), stride=(2,1,1)) + self.avg_pool = nn.AvgPool1d(kernel_size, stride=2) self.conv = nn.Conv3d( in_channels, out_channels, self.kernel_size, stride=(2,1,1), padding=(0,1,1) ) @@ -160,7 +167,13 @@ class TimeDownsampleRes2x(nn.Module): (1, 1, self.kernel_size[0] - 1, 1, 1) ) x = torch.concatenate((first_frame_pad, x), dim=2) - return alpha * self.avg_pool(x) + (1 - alpha) * self.conv(x) + n, c, d, h, w = x.shape + if is_npu_available(): + pool_res = self.avg_pool(x.float().permute(0, 1, 3, 4, 2).reshape(n, c*h*w, d)).reshape(n, c, h, w, -1).permute(0, 1, 4, 2, 3) + return alpha * pool_res.to(x.dtype) + (1 - alpha) * self.conv(x) + else: + pool_res = self.avg_pool(x.permute(0, 1, 3, 4, 2).reshape(n, c*h*w, d)).reshape(n, c, h, w, -1).permute(0, 1, 4, 2, 3) + return alpha * pool_res + (1 - alpha) * self.conv(x) class TimeUpsampleRes2x(nn.Module): def __init__( diff --git a/PyTorch/built-in/mlm/OpenSoraPlan1.1/opensora/models/diffusion/diffusion/gaussian_diffusion_t2v.py b/PyTorch/built-in/mlm/OpenSoraPlan1.1/opensora/models/diffusion/diffusion/gaussian_diffusion_t2v.py index 
2dfe4d99d2..cb0859de31 100644 --- a/PyTorch/built-in/mlm/OpenSoraPlan1.1/opensora/models/diffusion/diffusion/gaussian_diffusion_t2v.py +++ b/PyTorch/built-in/mlm/OpenSoraPlan1.1/opensora/models/diffusion/diffusion/gaussian_diffusion_t2v.py @@ -1,3 +1,4 @@ +# Copyright 2024 Huawei Technologies Co., Ltd # Modified from OpenAI's diffusion repos # GLIDE: https://github.com/openai/glide-text2im/blob/main/glide_text2im/gaussian_diffusion.py # ADM: https://github.com/openai/guided-diffusion/blob/main/guided_diffusion @@ -898,7 +899,8 @@ def _extract_into_tensor(arr, timesteps, broadcast_shape): dimension equal to the length of timesteps. :return: a tensor of shape [batch_size, 1, ...] where the shape has K dims. """ - res = th.from_numpy(arr).to(device=timesteps.device)[timesteps].float() + # res = th.from_numpy(arr).to(device=timesteps.device)[timesteps].float() + res = th.tensor(arr, dtype=th.float32, device=timesteps.device)[timesteps] while len(res.shape) < len(broadcast_shape): res = res[..., None] return res + th.zeros(broadcast_shape, device=timesteps.device) diff --git a/PyTorch/built-in/mlm/OpenSoraPlan1.1/opensora/models/diffusion/latte/modules.py b/PyTorch/built-in/mlm/OpenSoraPlan1.1/opensora/models/diffusion/latte/modules.py index 40ff32e275..1e848470c0 100644 --- a/PyTorch/built-in/mlm/OpenSoraPlan1.1/opensora/models/diffusion/latte/modules.py +++ b/PyTorch/built-in/mlm/OpenSoraPlan1.1/opensora/models/diffusion/latte/modules.py @@ -1,4 +1,6 @@ +# Copyright 2024 Huawei Technologies Co., Ltd from importlib import import_module +import math import numpy as np from typing import Any, Dict, Optional, Tuple, Callable @@ -28,6 +30,9 @@ if is_xformers_available(): else: xformers = None +from opensora.utils.npu_utils import is_npu_available +if is_npu_available(): + import torch_npu class CombinedTimestepSizeEmbeddings(nn.Module): """ @@ -936,7 +941,7 @@ class AttnProcessor2_0: hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2) args = () if USE_PEFT_BACKEND else (scale,) - query = attn.to_q(hidden_states, *args) + query = attn.to_q(hidden_states, *args).contiguous() if encoder_hidden_states is None: encoder_hidden_states = hidden_states @@ -945,53 +950,70 @@ class AttnProcessor2_0: - key = attn.to_k(encoder_hidden_states, *args) - value = attn.to_v(encoder_hidden_states, *args) + key = attn.to_k(encoder_hidden_states, *args).contiguous() + value = attn.to_v(encoder_hidden_states, *args).contiguous() inner_dim = key.shape[-1] head_dim = inner_dim // attn.heads - query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2) - - key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2) - value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2) - - if self.use_rope: - # require the shape of (batch_size x nheads x ntokens x dim) - if position_q.ndim == 3: - query = self.rope2d(query, position_q) - elif position_q.ndim == 2: - query = self.rope1d(query, position_q) - else: - raise NotImplementedError - if position_k.ndim == 3: - key = self.rope2d(key, position_k) - elif position_k.ndim == 2: - key = self.rope1d(key, position_k) - else: - raise NotImplementedError - - # the output of sdp = (batch, num_heads, seq_len, head_dim) - # TODO: add support for attn.scale when we move to Torch 2.1 - if self.attention_mode == 'flash': - assert attention_mask is None or torch.all(attention_mask.bool()), 'flash-attn do not support attention_mask' - with torch.backends.cuda.sdp_kernel(enable_math=False, enable_flash=True, 
enable_mem_efficient=False): - hidden_states = F.scaled_dot_product_attention( - query, key, value, dropout_p=0.0, is_causal=False - ) - elif self.attention_mode == 'xformers': - with torch.backends.cuda.sdp_kernel(enable_math=False, enable_flash=False, enable_mem_efficient=True): + if is_npu_available() and not self.use_rope and query.dtype in (torch.float16, torch.bfloat16): + scale = 1 / math.sqrt(head_dim) + hidden_states = torch_npu.npu_fusion_attention( + query, + key, + value, + head_num=attn.heads, + input_layout="BSH", + scale=scale, + pse=None, + atten_mask=attention_mask.bool().expand(-1, -1, query.shape[1], -1) if attention_mask is not None else None, + pre_tockens=2147483647, + next_tockens=2147483647, + keep_prob=1, + inner_precise=0 + )[0].contiguous() + else: + query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2) + + key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2) + value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2) + + if self.use_rope: + # require the shape of (batch_size x nheads x ntokens x dim) + if position_q.ndim == 3: + query = self.rope2d(query, position_q) + elif position_q.ndim == 2: + query = self.rope1d(query, position_q) + else: + raise NotImplementedError + if position_k.ndim == 3: + key = self.rope2d(key, position_k) + elif position_k.ndim == 2: + key = self.rope1d(key, position_k) + else: + raise NotImplementedError + + # the output of sdp = (batch, num_heads, seq_len, head_dim) + # TODO: add support for attn.scale when we move to Torch 2.1 + if self.attention_mode == 'flash': + assert attention_mask is None or torch.all(attention_mask.bool()), 'flash-attn do not support attention_mask' + with torch.backends.cuda.sdp_kernel(enable_math=False, enable_flash=True, enable_mem_efficient=False): + hidden_states = F.scaled_dot_product_attention( + query, key, value, dropout_p=0.0, is_causal=False + ) + elif self.attention_mode == 'xformers': + with torch.backends.cuda.sdp_kernel(enable_math=False, enable_flash=False, enable_mem_efficient=True): + hidden_states = F.scaled_dot_product_attention( + query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False + ) + elif self.attention_mode == 'math': hidden_states = F.scaled_dot_product_attention( query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False ) - elif self.attention_mode == 'math': - hidden_states = F.scaled_dot_product_attention( - query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False - ) - else: - raise NotImplementedError(f'Found attention_mode: {self.attention_mode}') - hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim) - hidden_states = hidden_states.to(query.dtype) + else: + raise NotImplementedError(f'Found attention_mode: {self.attention_mode}') + hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim) + hidden_states = hidden_states.to(query.dtype) # linear proj hidden_states = attn.to_out[0](hidden_states, *args) diff --git a/PyTorch/built-in/mlm/OpenSoraPlan1.1/opensora/models/text_encoder/__init__.py b/PyTorch/built-in/mlm/OpenSoraPlan1.1/opensora/models/text_encoder/__init__.py index 28cf771bb4..e6ac8d1ed5 100644 --- a/PyTorch/built-in/mlm/OpenSoraPlan1.1/opensora/models/text_encoder/__init__.py +++ b/PyTorch/built-in/mlm/OpenSoraPlan1.1/opensora/models/text_encoder/__init__.py @@ -1,15 +1,30 @@ +# Copyright 2024 Huawei Technologies Co., Ltd import torch from torch import nn from transformers import 
T5EncoderModel, CLIPModel, CLIPProcessor +from transformers.models.t5.modeling_t5 import T5LayerNorm +from transformers.activations import NewGELUActivation -from opensora.utils.utils import get_precision +from opensora.utils.utils import get_precision +from opensora.utils.npu_utils import is_npu_available, NpuRMSNorm, replace_module class T5Wrapper(nn.Module): def __init__(self, args, **kwargs): super(T5Wrapper, self).__init__() self.model_name = args.text_encoder_name self.text_enc = T5EncoderModel.from_pretrained(self.model_name, cache_dir=args.cache_dir, **kwargs).eval() + if is_npu_available(): + # Monekey Patch NpuRMSNorm, GELU + for name, module in self.text_enc.named_modules(): + if isinstance(module, T5LayerNorm): + hidden_size = module.weight.shape[0] + eps = module.variance_epsilon + npu_rms_norm = NpuRMSNorm(hidden_size, eps) + npu_rms_norm.load_state_dict(module.state_dict()) + replace_module(self.text_enc, name, npu_rms_norm) + if isinstance(module, NewGELUActivation): + replace_module(self.text_enc, name, nn.GELU(approximate='tanh')) def forward(self, input_ids, attention_mask): text_encoder_embs = self.text_enc(input_ids=input_ids, attention_mask=attention_mask)['last_hidden_state'] diff --git a/PyTorch/built-in/mlm/OpenSoraPlan1.1/opensora/train/train_t2v.py b/PyTorch/built-in/mlm/OpenSoraPlan1.1/opensora/train/train_t2v.py index 22df3a2bb3..cd62120206 100644 --- a/PyTorch/built-in/mlm/OpenSoraPlan1.1/opensora/train/train_t2v.py +++ b/PyTorch/built-in/mlm/OpenSoraPlan1.1/opensora/train/train_t2v.py @@ -1,3 +1,4 @@ +# Copyright 2024 Huawei Technologies Co., Ltd # Copyright (c) Meta Platforms, Inc. and affiliates. # All rights reserved. @@ -12,6 +13,7 @@ import logging import math import os import shutil +import time from pathlib import Path from typing import Optional import gc @@ -23,6 +25,14 @@ from torch.utils.data import DataLoader from copy import deepcopy import accelerate import torch + +from opensora.utils.npu_utils import is_npu_available +if is_npu_available(): + import torch_npu + from torch_npu.contrib import transfer_to_npu + torch.npu.config.allow_internal_format = False + from opensora.utils.npu_utils import AdamW + from torch.nn import functional as F import transformers from accelerate import Accelerator @@ -47,6 +57,7 @@ from opensora.models.diffusion.latte.modeling_latte import LatteT2V from opensora.models.text_encoder import get_text_enc, get_text_warpper from opensora.utils.dataset_utils import Collate from opensora.models.ae import ae_stride_config, ae_channel_config +from opensora.models.ae.imagebase import vae from opensora.models.diffusion import Diffusion_models from opensora.sample.pipeline_videogen import VideoGenPipeline from opensora.utils.utils import print_grad_norm @@ -310,7 +321,10 @@ def main(args): optimizer_class = bnb.optim.AdamW8bit else: - optimizer_class = torch.optim.AdamW + if is_npu_available(): + optimizer_class = AdamW + else: + optimizer_class = torch.optim.AdamW # Optimizer creation params_to_optimize = model.parameters() @@ -414,7 +428,9 @@ def main(args): for epoch in range(first_epoch, args.num_train_epochs): train_loss = 0.0 + dataloader_start_time = time.time() for step, (x, attn_mask, input_ids, cond_mask) in enumerate(train_dataloader): + step_start_time = time.time() with accelerator.accumulate(model): # Sample noise that we'll add to the latents if not args.multi_scale: @@ -429,11 +445,11 @@ def main(args): with torch.no_grad(): # use for loop to avoid OOM, because T5 is too huge... 
- B, _, _ = input_ids.shape # B T+num_images L b 1+4, L - cond = torch.stack([text_enc(input_ids[i], cond_mask[i]) for i in range(B)]) # B 1+num_images L D + B, N, L = input_ids.shape # B T+num_images L b 1+4, L + cond = text_enc(input_ids.reshape(-1, L), cond_mask.reshape(-1, L)).reshape(B, N, L, -1) # B 1+num_images L D # Map input images to latent space + normalize latents - if args.use_image_num == 0: + if args.use_image_num == 0 or (args.ae in vae and args.use_img_from_vid): x = ae.encode(x) # B C T H W else: videos, images = x[:, :, :-args.use_image_num], x[:, :, -args.use_image_num:] @@ -506,6 +522,11 @@ def main(args): lr_scheduler.step() optimizer.zero_grad() + step_end_time = time.time() + if accelerator.is_main_process: + logger.info(f"steps: {global_step}, dataloader time: {step_start_time - dataloader_start_time}, " + f"train time: {step_end_time - step_start_time}, train loss: {train_loss}") + # Checks if the accelerator has performed an optimization step behind the scenes if accelerator.sync_gradients: progress_bar.update(1) diff --git a/PyTorch/built-in/mlm/OpenSoraPlan1.1/opensora/utils/npu_utils.py b/PyTorch/built-in/mlm/OpenSoraPlan1.1/opensora/utils/npu_utils.py new file mode 100644 index 0000000000..18e3ec60a2 --- /dev/null +++ b/PyTorch/built-in/mlm/OpenSoraPlan1.1/opensora/utils/npu_utils.py @@ -0,0 +1,186 @@ +# Copyright 2024 Huawei Technologies Co., Ltd +from typing import List, Optional +import importlib +from functools import lru_cache + +import torch +import torch.nn as nn +from torch import Tensor +from torch.optim.optimizer import Optimizer + + +@lru_cache +def is_npu_available(): + "Checks if `torch_npu` is installed and potentially if a NPU is in the environment" + if importlib.util.find_spec("torch") is None or importlib.util.find_spec("torch_npu") is None: + return False + + import torch_npu + + try: + # Will raise a RuntimeError if no NPU is found + _ = torch.npu.device_count() + return torch.npu.is_available() + except RuntimeError: + return False + + +if is_npu_available(): + import torch_npu + + +def adamw(params: List[Tensor], + grads: List[Tensor], + exp_avgs: List[Tensor], + exp_avg_sqs: List[Tensor], + max_exp_avg_sqs: List[Tensor], + step: int, + *, + amsgrad: bool, + beta1: float, + beta2: float, + lr: float, + weight_decay: float, + eps: float, + maximize: bool): + r"""Functional API that performs AdamW algorithm computation. + See :class:`~torch.optim.AdamW` for details. 
+ """ + for i, param in enumerate(params): + grad = grads[i] + exp_avg = exp_avgs[i] + exp_avg_sq = exp_avg_sqs[i] + + # Perform stepweight decay + ## param.mul_(1 - lr * weight_decay) + bias_correction1 = beta1 ** (step - 1) + bias_correction2 = beta2 ** (step - 1) + + param.data, exp_avg, exp_avg_sq = torch_npu.npu_apply_adam_w( + bias_correction1, + bias_correction2, + lr, + weight_decay, + beta1, + beta2, + eps, + grad, + None, + amsgrad, + maximize, + out=(param.data, exp_avg, exp_avg_sq) + ) + + +class AdamW(Optimizer): + def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, + weight_decay=1e-2, amsgrad=False, *, maximize: bool = False): + if not 0.0 <= lr: + raise ValueError("Invalid learning rate: {}".format(lr)) + if not 0.0 <= eps: + raise ValueError("Invalid epsilon value: {}".format(eps)) + if not 0.0 <= betas[0] < 1.0: + raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0])) + if not 0.0 <= betas[1] < 1.0: + raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1])) + if not 0.0 <= weight_decay: + raise ValueError("Invalid weight_decay value: {}".format(weight_decay)) + defaults = dict(lr=lr, betas=betas, eps=eps, + weight_decay=weight_decay, amsgrad=amsgrad, maximize=maximize) + super(AdamW, self).__init__(params, defaults) + + def __setstate__(self, state): + super(AdamW, self).__setstate__(state) + for group in self.param_groups: + group.setdefault('amsgrad', False) + group.setdefault('maximize', False) + + @torch.no_grad() + def step(self, closure=None): + loss = None + if closure is not None: + with torch.enable_grad(): + loss = closure() + + for group in self.param_groups: + params_with_grad = [] + grads = [] + exp_avgs = [] + exp_avg_sqs = [] + state_sums = [] + max_exp_avg_sqs = [] + state_steps = [] + amsgrad = group['amsgrad'] + beta1, beta2 = group['betas'] + + if 'step' in group: + group['step'] += 1 + else: + group['step'] = 1 + + for p in group['params']: + if p.grad is None: + continue + params_with_grad.append(p) + if p.grad.is_sparse: + raise RuntimeError('AdamW does not support sparse gradients') + grads.append(p.grad) + + state = self.state[p] + + # State initialization + if len(state) == 0: + # Exponential moving average of gradient values + state['exp_avg'] = torch.zeros_like(p, memory_format=torch.preserve_format) + # Exponential moving average of squared gradient values + state['exp_avg_sq'] = torch.zeros_like(p, memory_format=torch.preserve_format) + if amsgrad: + # Maintains max of all exp. moving avg. of sq. grad. 
values + state['max_exp_avg_sq'] = torch.zeros_like(p, memory_format=torch.preserve_format) + + exp_avgs.append(state['exp_avg']) + exp_avg_sqs.append(state['exp_avg_sq']) + + if amsgrad: + max_exp_avg_sqs.append(state['max_exp_avg_sq']) + + adamw(params_with_grad, + grads, + exp_avgs, + exp_avg_sqs, + max_exp_avg_sqs, + group['step'], + amsgrad=amsgrad, + beta1=beta1, + beta2=beta2, + lr=group['lr'], + weight_decay=group['weight_decay'], + eps=group['eps'], + maximize=group['maximize']) + + return loss + + +class NpuRMSNorm(torch.nn.Module): + def __init__(self, hidden_size, eps=1e-6): + """ + Initialize NPU RMSNorm normalization layer + """ + super().__init__() + self.weight = nn.Parameter(torch.ones(hidden_size)) + self.eps = eps + + def forward(self, x): + return torch_npu.npu_rms_norm(x.to(self.weight.dtype), self.weight, epsilon=self.eps)[0] + + +def replace_module(model, submodule_key, module): + """Replace all the submodule of the model with module that contains submodule_key""" + tokens = submodule_key.split('.') + sub_tokens = tokens[:-1] + cur_mod = model + for s in sub_tokens: + cur_mod = getattr(cur_mod, s) + setattr(cur_mod, tokens[-1], module) + + diff --git a/PyTorch/built-in/mlm/OpenSoraPlan1.1/public_address_statement.md b/PyTorch/built-in/mlm/OpenSoraPlan1.1/public_address_statement.md new file mode 100644 index 0000000000..5461fa98ac --- /dev/null +++ b/PyTorch/built-in/mlm/OpenSoraPlan1.1/public_address_statement.md @@ -0,0 +1,146 @@ +| 类型 | 开源代码地址 | 文件名 | 公网IP地址/公网URL地址/域名/邮箱地址 | 用途说明 | +|--------|----------------------------------------------------------------------------------------------------------------------|-----------------------------------------------------------------------|----------------------------------------------------------------------------------------------------------------------------------|--------------------------------------------------------------| +| 开源代码引入 | https://github.com/PKU-YuanGroup/Open-Sora-Plan/opensora/dataset/transform.py | /opensora/dataset/transform.py | https://github.com/openai/guided-diffusion/blob/8fb3ad9197f16bbc40620447b2742e13458d2831/guided_diffusion/image_datasets.py | code from openai/guided-diffusion | +| 开源代码引入 | https://github.com/PKU-YuanGroup/Open-Sora-Plan/opensora/eval/eval_clip_score.py | /opensora/eval/eval_clip_score.py | https://github.com/openai/CLIP | code from openai/CLIP. | +| 开源代码引入 | https://github.com/PKU-YuanGroup/Open-Sora-Plan/opensora/eval/eval_clip_score.py | /opensora/eval/eval_clip_score.py | https://github.com/mseitzer/pytorch-fid | code from mseitzer/pytorch-fid | +| 开源代码引入 | https://github.com/PKU-YuanGroup/Open-Sora-Plan/opensora/eval/eval_clip_score.py | /opensora/eval/eval_clip_score.py | https://github.com/openai/CLIP | code from openai/CLIP. | +| 开源代码引入 | https://github.com/PKU-YuanGroup/Open-Sora-Plan/opensora/eval/eval_clip_score.py | /opensora/eval/eval_clip_score.py | http://www.apache.org/licenses/LICENSE-2.0 | license | +| 开源代码引入 | https://github.com/PKU-YuanGroup/Open-Sora-Plan/opensora/eval/eval_common_metric.py | /opensora/eval/eval_common_metric.py | https://github.com/openai/CLIP | code from openai/CLIP. 
| +| 开源代码引入 | https://github.com/PKU-YuanGroup/Open-Sora-Plan/opensora/eval/eval_common_metric.py | /opensora/eval/eval_common_metric.py | https://github.com/mseitzer/pytorch-fid | code from mseitzer/pytorch-fid | +| 开源代码引入 | https://github.com/PKU-YuanGroup/Open-Sora-Plan/opensora/eval/eval_common_metric.py | /opensora/eval/eval_common_metric.py | https://github.com/openai/CLIP | code from openai/CLIP. | +| 开源代码引入 | https://github.com/PKU-YuanGroup/Open-Sora-Plan/opensora/eval/eval_common_metric.py | /opensora/eval/eval_common_metric.py | http://www.apache.org/licenses/LICENSE-2.0 | license | +| 开源代码引入 | https://github.com/PKU-YuanGroup/Open-Sora-Plan/opensora/eval/flolpips/pwcnet.py | /opensora/eval/flolpips/pwcnet.py | http://content.sniklaus.com/github/pytorch-pwc/network-default.pytorch | download weights | +| 开源代码引入 | https://github.com/PKU-YuanGroup/Open-Sora-Plan/opensora/eval/fvd/styleganv/fvd.py | /opensora/eval/fvd/styleganv/fvd.py | https://github.com/universome/fvd-comparison | code from universome/fvd-comparison | +| 开源代码引入 | https://github.com/PKU-YuanGroup/Open-Sora-Plan/opensora/eval/fvd/styleganv/fvd.py | /opensora/eval/fvd/styleganv/fvd.py | https://www.dropbox.com/s/ge9e5ujwgetktms/i3d_torchscript.pt | download weights | +| 开源代码引入 | https://github.com/PKU-YuanGroup/Open-Sora-Plan/opensora/eval/fvd/styleganv/fvd.py | /opensora/eval/fvd/styleganv/fvd.py | https://github.com/cvpr2022-stylegan-v/stylegan-v/blob/main/src/metrics/frechet_video_distance.py | code from cvpr2022-stylegan-v/stylegan-v | +| 开源代码引入 | https://github.com/PKU-YuanGroup/Open-Sora-Plan/opensora/eval/fvd/videogpt/fvd.py | /opensora/eval/fvd/videogpt/fvd.py | https://onedrive.live.com/download?cid=78EEF3EB6AE7DBCB&resid=78EEF3EB6AE7DBCB%21199&authkey=AApKdFHPXzWLNyI | download weights | +| 开源代码引入 | https://github.com/PKU-YuanGroup/Open-Sora-Plan/opensora/eval/fvd/videogpt/fvd.py | /opensora/eval/fvd/videogpt/fvd.py | https://github.com/tensorflow/gan/blob/de4b8da3853058ea380a6152bd3bd454013bf619/tensorflow_gan/python/eval/classifier_metrics.py | code from tensorflow/gan | +| 开源代码引入 | https://github.com/PKU-YuanGroup/Open-Sora-Plan/opensora/eval/fvd/videogpt/fvd.py | /opensora/eval/fvd/videogpt/fvd.py | https://github.com/tensorflow/gan/blob/de4b8da3853058ea380a6152bd3bd454013bf619/tensorflow_gan/python/eval/classifier_metrics.py | code from tensorflow/gan | +| 开源代码引入 | https://github.com/PKU-YuanGroup/Open-Sora-Plan/opensora/eval/fvd/videogpt/fvd.py | /opensora/eval/fvd/videogpt/fvd.py | https://discuss.pytorch.org/t/covariance-and-gradient-support/16217/2 | code from ModarTensai | +| 开源代码引入 | https://github.com/PKU-YuanGroup/Open-Sora-Plan/opensora/eval/fvd/videogpt/pytorch_i3d.py | /opensora/eval/fvd/videogpt/pytorch_i3d.py | https://github.com/piergiaj/pytorch-i3d | code from piergiaj/pytorch-i3d | +| 开源代码引入 | https://github.com/PKU-YuanGroup/Open-Sora-Plan/opensora/eval/fvd/videogpt/pytorch_i3d.py | /opensora/eval/fvd/videogpt/pytorch_i3d.py | https://arxiv.org/pdf/1705.07750v1.pdf | paper | +| 开源代码引入 | https://github.com/PKU-YuanGroup/Open-Sora-Plan/opensora/eval/fvd/videogpt/pytorch_i3d.py | /opensora/eval/fvd/videogpt/pytorch_i3d.py | http://arxiv.org/pdf/1409.4842v1.pdf | paper | +| 开源代码引入 | https://github.com/PKU-YuanGroup/Open-Sora-Plan/opensora/models/ae/imagebase/vqvae/quantize.py | /opensora/models/ae/imagebase/vqvae/quantize.py | https://github.com/MishaLaskin/vqvae/blob/d761a999e2267766400dc646d82d3ac3657771d4/models/quantizer.py | code from MishaLaskin/vqvae | +| 开源代码引入 | 
https://github.com/PKU-YuanGroup/Open-Sora-Plan/opensora/models/ae/imagebase/vqvae/quantize.py | /opensora/models/ae/imagebase/vqvae/quantize.py | https://github.com/karpathy/deep-vector-quantization/blob/main/model.py | code from karpathy/deep-vector-quantization | +| 开源代码引入 | https://github.com/PKU-YuanGroup/Open-Sora-Plan/opensora/models/ae/imagebase/vqvae/quantize.py | /opensora/models/ae/imagebase/vqvae/quantize.py | https://arxiv.org/abs/1611.01144 | paper | +| 开源代码引入 | https://github.com/PKU-YuanGroup/Open-Sora-Plan/opensora/models/ae/videobase/causal_vqvae/modeling_causalvqvae.py | /opensora/models/ae/videobase/causal_vqvae/modeling_causalvqvae.py | https://github.com/wilson1yan/VideoGPT | code from wilson1yan/VideoGPT | +| 开源代码引入 | https://github.com/PKU-YuanGroup/Open-Sora-Plan/opensora/models/ae/videobase/causal_vqvae/modeling_causalvqvae.py | /opensora/models/ae/videobase/causal_vqvae/modeling_causalvqvae.py | https://github.com/wilson1yan/VideoGPT | code from wilson1yan/VideoGPT | +| 开源代码引入 | https://github.com/PKU-YuanGroup/Open-Sora-Plan/opensora/models/ae/videobase/causal_vqvae/modeling_causalvqvae.py | /opensora/models/ae/videobase/causal_vqvae/modeling_causalvqvae.py | https://github.com/wilson1yan/VideoGPT | code from wilson1yan/VideoGPT | +| 开源代码引入 | https://github.com/PKU-YuanGroup/Open-Sora-Plan/opensora/models/ae/videobase/causal_vqvae/modeling_causalvqvae.py | /opensora/models/ae/videobase/causal_vqvae/modeling_causalvqvae.py | https://github.com/wilson1yan/VideoGPT | code from wilson1yan/VideoGPT | +| 开源代码引入 | https://github.com/PKU-YuanGroup/Open-Sora-Plan/opensora/models/ae/videobase/causal_vqvae/modeling_causalvqvae.py | /opensora/models/ae/videobase/causal_vqvae/modeling_causalvqvae.py | https://github.com/wilson1yan/VideoGPT | code from wilson1yan/VideoGPT | +| 开源代码引入 | https://github.com/PKU-YuanGroup/Open-Sora-Plan/opensora/models/ae/videobase/causal_vqvae/modeling_causalvqvae.py | /opensora/models/ae/videobase/causal_vqvae/modeling_causalvqvae.py | https://github.com/wilson1yan/VideoGPT | code from wilson1yan/VideoGPT | +| 开源代码引入 | https://github.com/PKU-YuanGroup/Open-Sora-Plan/opensora/models/ae/videobase/causal_vqvae/modeling_causalvqvae.py | /opensora/models/ae/videobase/causal_vqvae/modeling_causalvqvae.py | https://github.com/wilson1yan/VideoGPT | code from wilson1yan/VideoGPT | +| 开源代码引入 | https://github.com/PKU-YuanGroup/Open-Sora-Plan/opensora/models/ae/videobase/causal_vqvae/modeling_causalvqvae.py | /opensora/models/ae/videobase/causal_vqvae/modeling_causalvqvae.py | https://github.com/wilson1yan/VideoGPT | code from wilson1yan/VideoGPT | +| 开源代码引入 | https://github.com/PKU-YuanGroup/Open-Sora-Plan/opensora/models/ae/videobase/causal_vqvae/modeling_causalvqvae.py | /opensora/models/ae/videobase/causal_vqvae/modeling_causalvqvae.py | https://github.com/wilson1yan/VideoGPT | code from wilson1yan/VideoGPT | +| 开源代码引入 | https://github.com/PKU-YuanGroup/Open-Sora-Plan/opensora/models/ae/videobase/causal_vqvae/modeling_causalvqvae.py | /opensora/models/ae/videobase/causal_vqvae/modeling_causalvqvae.py | https://github.com/wilson1yan/VideoGPT | code from wilson1yan/VideoGPT | +| 开源代码引入 | https://github.com/PKU-YuanGroup/Open-Sora-Plan/opensora/models/ae/videobase/causal_vqvae/modeling_causalvqvae.py | /opensora/models/ae/videobase/causal_vqvae/modeling_causalvqvae.py | https://github.com/wilson1yan/VideoGPT | code from wilson1yan/VideoGPT | +| 开源代码引入 | 
https://github.com/PKU-YuanGroup/Open-Sora-Plan/opensora/models/ae/videobase/causal_vqvae/modeling_causalvqvae.py | /opensora/models/ae/videobase/causal_vqvae/modeling_causalvqvae.py | https://github.com/wilson1yan/VideoGPT | code from wilson1yan/VideoGPT | +| 开源代码引入 | https://github.com/PKU-YuanGroup/Open-Sora-Plan/opensora/models/ae/videobase/causal_vqvae/modeling_causalvqvae.py | /opensora/models/ae/videobase/causal_vqvae/modeling_causalvqvae.py | https://arxiv.org/abs/1904.10509 | paper | +| 开源代码引入 | https://github.com/PKU-YuanGroup/Open-Sora-Plan/opensora/models/ae/videobase/causal_vqvae/modeling_causalvqvae.py | /opensora/models/ae/videobase/causal_vqvae/modeling_causalvqvae.py | https://github.com/wilson1yan/VideoGPT | code from wilson1yan/VideoGPT | +| 开源代码引入 | https://github.com/PKU-YuanGroup/Open-Sora-Plan/opensora/models/ae/videobase/losses/discriminator.py | /opensora/models/ae/videobase/losses/discriminator.py | https://github.com/junyanz/pytorch-CycleGAN-and-pix2pix/blob/master/models/networks.py | code from junyanz/pytorch-CycleGAN-and-pix2pix | +| 开源代码引入 | https://github.com/PKU-YuanGroup/Open-Sora-Plan/opensora/models/ae/videobase/losses/lpips.py | /opensora/models/ae/videobase/losses/lpips.py | https://github.com/richzhang/PerceptualSimilarity/tree/master/models | code from richzhang/PerceptualSimilarity | +| 开源代码引入 | https://github.com/PKU-YuanGroup/Open-Sora-Plan/opensora/models/ae/videobase/losses/perceptual_loss.py | /opensora/models/ae/videobase/losses/perceptual_loss.py | https://github.com/karpathy/deep-vector-quantization/blob/main/model.py | code from karpathy/deep-vector-quantization | +| 开源代码引入 | https://github.com/PKU-YuanGroup/Open-Sora-Plan/opensora/models/ae/videobase/modules/attention.py | /opensora/models/ae/videobase/modules/attention.py | https://github.com/PKU-YuanGroup/Open-Sora-Plan/pull/172 | code from PKU-YuanGroup/Open-Sora-Plan | +| 开源代码引入 | https://github.com/PKU-YuanGroup/Open-Sora-Plan/opensora/models/ae/videobase/vqvae/modeling_vqvae.py | /opensora/models/ae/videobase/vqvae/modeling_vqvae.py | https://github.com/wilson1yan/VideoGPT | code from wilson1yan/VideoGPT | +| 开源代码引入 | https://github.com/PKU-YuanGroup/Open-Sora-Plan/opensora/models/ae/videobase/vqvae/modeling_vqvae.py | /opensora/models/ae/videobase/vqvae/modeling_vqvae.py | https://github.com/wilson1yan/VideoGPT | code from wilson1yan/VideoGPT | +| 开源代码引入 | https://github.com/PKU-YuanGroup/Open-Sora-Plan/opensora/models/ae/videobase/vqvae/modeling_vqvae.py | /opensora/models/ae/videobase/vqvae/modeling_vqvae.py | https://github.com/wilson1yan/VideoGPT | code from wilson1yan/VideoGPT | +| 开源代码引入 | https://github.com/PKU-YuanGroup/Open-Sora-Plan/opensora/models/ae/videobase/vqvae/modeling_vqvae.py | /opensora/models/ae/videobase/vqvae/modeling_vqvae.py | https://github.com/wilson1yan/VideoGPT | code from wilson1yan/VideoGPT | +| 开源代码引入 | https://github.com/PKU-YuanGroup/Open-Sora-Plan/opensora/models/ae/videobase/vqvae/modeling_vqvae.py | /opensora/models/ae/videobase/vqvae/modeling_vqvae.py | https://github.com/wilson1yan/VideoGPT | code from wilson1yan/VideoGPT | +| 开源代码引入 | https://github.com/PKU-YuanGroup/Open-Sora-Plan/opensora/models/ae/videobase/vqvae/modeling_vqvae.py | /opensora/models/ae/videobase/vqvae/modeling_vqvae.py | https://github.com/wilson1yan/VideoGPT | code from wilson1yan/VideoGPT | +| 开源代码引入 | https://github.com/PKU-YuanGroup/Open-Sora-Plan/opensora/models/ae/videobase/vqvae/modeling_vqvae.py | /opensora/models/ae/videobase/vqvae/modeling_vqvae.py | 
https://github.com/wilson1yan/VideoGPT | code from wilson1yan/VideoGPT | +| 开源代码引入 | https://github.com/PKU-YuanGroup/Open-Sora-Plan/opensora/models/ae/videobase/vqvae/modeling_vqvae.py | /opensora/models/ae/videobase/vqvae/modeling_vqvae.py | https://github.com/wilson1yan/VideoGPT | code from wilson1yan/VideoGPT | +| 开源代码引入 | https://github.com/PKU-YuanGroup/Open-Sora-Plan/opensora/models/ae/videobase/vqvae/modeling_vqvae.py | /opensora/models/ae/videobase/vqvae/modeling_vqvae.py | https://github.com/wilson1yan/VideoGPT | code from wilson1yan/VideoGPT | +| 开源代码引入 | https://github.com/PKU-YuanGroup/Open-Sora-Plan/opensora/models/ae/videobase/vqvae/modeling_vqvae.py | /opensora/models/ae/videobase/vqvae/modeling_vqvae.py | https://github.com/wilson1yan/VideoGPT | code from wilson1yan/VideoGPT | +| 开源代码引入 | https://github.com/PKU-YuanGroup/Open-Sora-Plan/opensora/models/ae/videobase/vqvae/modeling_vqvae.py | /opensora/models/ae/videobase/vqvae/modeling_vqvae.py | https://github.com/wilson1yan/VideoGPT | code from wilson1yan/VideoGPT | +| 开源代码引入 | https://github.com/PKU-YuanGroup/Open-Sora-Plan/opensora/models/ae/videobase/vqvae/modeling_vqvae.py | /opensora/models/ae/videobase/vqvae/modeling_vqvae.py | https://github.com/wilson1yan/VideoGPT | code from wilson1yan/VideoGPT | +| 开源代码引入 | https://github.com/PKU-YuanGroup/Open-Sora-Plan/opensora/models/ae/videobase/vqvae/modeling_vqvae.py | /opensora/models/ae/videobase/vqvae/modeling_vqvae.py | https://github.com/wilson1yan/VideoGPT | code from wilson1yan/VideoGPT | +| 开源代码引入 | https://github.com/PKU-YuanGroup/Open-Sora-Plan/opensora/models/ae/videobase/vqvae/modeling_vqvae.py | /opensora/models/ae/videobase/vqvae/modeling_vqvae.py | https://github.com/wilson1yan/VideoGPT | code from wilson1yan/VideoGPT | +| 开源代码引入 | https://github.com/PKU-YuanGroup/Open-Sora-Plan/opensora/models/ae/videobase/vqvae/modeling_vqvae.py | /opensora/models/ae/videobase/vqvae/modeling_vqvae.py | https://arxiv.org/abs/1904.10509 | paper | +| 开源代码引入 | https://github.com/PKU-YuanGroup/Open-Sora-Plan/opensora/models/ae/videobase/vqvae/modeling_vqvae.py | /opensora/models/ae/videobase/vqvae/modeling_vqvae.py | https://github.com/wilson1yan/VideoGPT | code from wilson1yan/VideoGPT | +| 开源代码引入 | https://github.com/PKU-YuanGroup/Open-Sora-Plan/opensora/models/ae/videobase/vqvae/modeling_vqvae.py | /opensora/models/ae/videobase/vqvae/modeling_vqvae.py | https://github.com/wilson1yan/VideoGPT | code from wilson1yan/VideoGPT | +| 开源代码引入 | https://github.com/PKU-YuanGroup/Open-Sora-Plan/opensora/models/captioner/caption_refiner/demo_for_refiner.py | /opensora/models/captioner/caption_refiner/demo_for_refiner.py | https://one-api.bltcy.top/v1 | openai api | +| 开源代码引入 | https://github.com/PKU-YuanGroup/Open-Sora-Plan/opensora/models/diffusion/diffusion/diffusion_utils.py | /opensora/models/diffusion/diffusion/diffusion_utils.py | https://github.com/openai/glide-text2im/blob/main/glide_text2im/gaussian_diffusion.py | code from openai/glide-text2im | +| 开源代码引入 | https://github.com/PKU-YuanGroup/Open-Sora-Plan/opensora/models/diffusion/diffusion/diffusion_utils.py | /opensora/models/diffusion/diffusion/diffusion_utils.py | https://github.com/openai/guided-diffusion/blob/main/guided_diffusion | code from openai/guided-diffusion | +| 开源代码引入 | https://github.com/PKU-YuanGroup/Open-Sora-Plan/opensora/models/diffusion/diffusion/diffusion_utils.py | /opensora/models/diffusion/diffusion/diffusion_utils.py | 
https://github.com/openai/improved-diffusion/blob/main/improved_diffusion/gaussian_diffusion.py | code from openai/improved-diffusion | +| 开源代码引入 | https://github.com/PKU-YuanGroup/Open-Sora-Plan/opensora/models/diffusion/diffusion/gaussian_diffusion.py | /opensora/models/diffusion/diffusion/gaussian_diffusion.py | https://github.com/openai/glide-text2im/blob/main/glide_text2im/gaussian_diffusion.py | code from openai/glide-text2im | +| 开源代码引入 | https://github.com/PKU-YuanGroup/Open-Sora-Plan/opensora/models/diffusion/diffusion/gaussian_diffusion.py | /opensora/models/diffusion/diffusion/gaussian_diffusion.py | https://github.com/openai/guided-diffusion/blob/main/guided_diffusion | code from openai/guided-diffusion | +| 开源代码引入 | https://github.com/PKU-YuanGroup/Open-Sora-Plan/opensora/models/diffusion/diffusion/gaussian_diffusion.py | /opensora/models/diffusion/diffusion/gaussian_diffusion.py | https://github.com/openai/improved-diffusion/blob/main/improved_diffusion/gaussian_diffusion.py | code from openai/improved-diffusion | +| 开源代码引入 | https://github.com/PKU-YuanGroup/Open-Sora-Plan/opensora/models/diffusion/diffusion/gaussian_diffusion.py | /opensora/models/diffusion/diffusion/gaussian_diffusion.py | https://github.com/hojonathanho/diffusion/blob/1e0dceb3b3495bbe19116a5e1b3596cd0706c543/diffusion_tf/diffusion_utils_2.py | code from hojonathanho/diffusion | +| 开源代码引入 | https://github.com/PKU-YuanGroup/Open-Sora-Plan/opensora/models/diffusion/diffusion/gaussian_diffusion_t2v.py | /opensora/models/diffusion/diffusion/gaussian_diffusion_t2v.py | https://github.com/openai/glide-text2im/blob/main/glide_text2im/gaussian_diffusion.py | code from openai/glide-text2im | +| 开源代码引入 | https://github.com/PKU-YuanGroup/Open-Sora-Plan/opensora/models/diffusion/diffusion/gaussian_diffusion_t2v.py | /opensora/models/diffusion/diffusion/gaussian_diffusion_t2v.py | https://github.com/openai/guided-diffusion/blob/main/guided_diffusion | code from openai/guided-diffusion | +| 开源代码引入 | https://github.com/PKU-YuanGroup/Open-Sora-Plan/opensora/models/diffusion/diffusion/gaussian_diffusion_t2v.py | /opensora/models/diffusion/diffusion/gaussian_diffusion_t2v.py | https://github.com/openai/improved-diffusion/blob/main/improved_diffusion/gaussian_diffusion.py | code from openai/improved-diffusion | +| 开源代码引入 | https://github.com/PKU-YuanGroup/Open-Sora-Plan/opensora/models/diffusion/diffusion/gaussian_diffusion_t2v.py | /opensora/models/diffusion/diffusion/gaussian_diffusion_t2v.py | https://github.com/hojonathanho/diffusion/blob/1e0dceb3b3495bbe19116a5e1b3596cd0706c543/diffusion_tf/diffusion_utils_2.py | code from hojonathanho/diffusion | +| 开源代码引入 | https://github.com/PKU-YuanGroup/Open-Sora-Plan/opensora/models/diffusion/diffusion/respace.py | /opensora/models/diffusion/diffusion/respace.py | https://github.com/openai/glide-text2im/blob/main/glide_text2im/gaussian_diffusion.py | code from openai/glide-text2im | +| 开源代码引入 | https://github.com/PKU-YuanGroup/Open-Sora-Plan/opensora/models/diffusion/diffusion/respace.py | /opensora/models/diffusion/diffusion/respace.py | https://github.com/openai/guided-diffusion/blob/main/guided_diffusion | code from openai/guided-diffusion | +| 开源代码引入 | https://github.com/PKU-YuanGroup/Open-Sora-Plan/opensora/models/diffusion/diffusion/respace.py | /opensora/models/diffusion/diffusion/respace.py | https://github.com/openai/improved-diffusion/blob/main/improved_diffusion/gaussian_diffusion.py | code from openai/improved-diffusion | +| 开源代码引入 | 
https://github.com/PKU-YuanGroup/Open-Sora-Plan/opensora/models/diffusion/diffusion/timestep_sampler.py | /opensora/models/diffusion/diffusion/timestep_sampler.py | https://github.com/openai/glide-text2im/blob/main/glide_text2im/gaussian_diffusion.py | code from openai/glide-text2im | +| 开源代码引入 | https://github.com/PKU-YuanGroup/Open-Sora-Plan/opensora/models/diffusion/diffusion/timestep_sampler.py | /opensora/models/diffusion/diffusion/timestep_sampler.py | https://github.com/openai/guided-diffusion/blob/main/guided_diffusion | code from openai/guided-diffusion | +| 开源代码引入 | https://github.com/PKU-YuanGroup/Open-Sora-Plan/opensora/models/diffusion/diffusion/timestep_sampler.py | /opensora/models/diffusion/diffusion/timestep_sampler.py | https://github.com/openai/improved-diffusion/blob/main/improved_diffusion/gaussian_diffusion.py | code from openai/improved-diffusion | +| 开源代码引入 | https://github.com/PKU-YuanGroup/Open-Sora-Plan/opensora/models/diffusion/diffusion/__init__.py | /opensora/models/diffusion/diffusion/__init__.py | https://github.com/openai/glide-text2im/blob/main/glide_text2im/gaussian_diffusion.py | code from openai/glide-text2im | +| 开源代码引入 | https://github.com/PKU-YuanGroup/Open-Sora-Plan/opensora/models/diffusion/diffusion/__init__.py | /opensora/models/diffusion/diffusion/__init__.py | https://github.com/openai/guided-diffusion/blob/main/guided_diffusion | code from openai/guided-diffusion | +| 开源代码引入 | https://github.com/PKU-YuanGroup/Open-Sora-Plan/opensora/models/diffusion/diffusion/__init__.py | /opensora/models/diffusion/diffusion/__init__.py | https://github.com/openai/improved-diffusion/blob/main/improved_diffusion/gaussian_diffusion.py | code from openai/improved-diffusion | +| 开源代码引入 | https://github.com/PKU-YuanGroup/Open-Sora-Plan/opensora/models/diffusion/latte/modeling_latte.py | /opensora/models/diffusion/latte/modeling_latte.py | https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py | code from huggingface/diffusers | +| 开源代码引入 | https://github.com/PKU-YuanGroup/Open-Sora-Plan/opensora/models/diffusion/latte/modules.py | /opensora/models/diffusion/latte/modules.py | https://github.com/PixArt-alpha/PixArt-alpha/blob/0f55e922376d8b797edd44d25d0e7464b260dcab/diffusion/model/nets/PixArtMS.py | code from PixArt-alpha/PixArt-alpha | +| 开源代码引入 | https://github.com/PKU-YuanGroup/Open-Sora-Plan/opensora/models/diffusion/latte/modules.py | /opensora/models/diffusion/latte/modules.py | https://github.com/PixArt-alpha/PixArt-alpha/blob/master/diffusion/model/nets/PixArt_blocks.py | code from PixArt-alpha/PixArt-alpha | +| 开源代码引入 | https://github.com/PKU-YuanGroup/Open-Sora-Plan/opensora/models/diffusion/latte/modules.py | /opensora/models/diffusion/latte/modules.py | https://github.com/PixArt-alpha/PixArt-alpha/blob/0f55e922376d8b797edd44d25d0e7464b260dcab/diffusion/model/nets/PixArtMS.py | code from PixArt-alpha/PixArt-alpha | +| 开源代码引入 | https://github.com/PKU-YuanGroup/Open-Sora-Plan/opensora/models/diffusion/latte/modules.py | /opensora/models/diffusion/latte/modules.py | https://github.com/PixArt-alpha/PixArt-alpha/blob/0f55e922376d8b797edd44d25d0e7464b260dcab/diffusion/model/nets/PixArtMS.py | code from PixArt-alpha/PixArt-alpha | +| 开源代码引入 | https://github.com/PKU-YuanGroup/Open-Sora-Plan/opensora/models/diffusion/latte/modules.py | /opensora/models/diffusion/latte/modules.py | https://github.com/facebookresearch/xformers | code from facebookresearch/xformers | +| 开源代码引入 | 
https://github.com/PKU-YuanGroup/Open-Sora-Plan/opensora/models/diffusion/latte/modules.py | /opensora/models/diffusion/latte/modules.py | https://github.com/PixArt-alpha/PixArt-alpha/blob/0f55e922376d8b797edd44d25d0e7464b260dcab/diffusion/model/nets/PixArtMS.py | code from PixArt-alpha/PixArt-alpha | +| 开源代码引入 | https://github.com/PKU-YuanGroup/Open-Sora-Plan/opensora/models/diffusion/latte/modules.py | /opensora/models/diffusion/latte/modules.py | https://github.com/PixArt-alpha/PixArt-alpha/blob/0f55e922376d8b797edd44d25d0e7464b260dcab/diffusion/model/nets/PixArtMS.py | code from PixArt-alpha/PixArt-alpha | +| 开源代码引入 | https://github.com/PKU-YuanGroup/Open-Sora-Plan/opensora/models/diffusion/latte/modules.py | /opensora/models/diffusion/latte/modules.py | https://arxiv.org/abs/2310.00426 | paper | +| 开源代码引入 | https://github.com/PKU-YuanGroup/Open-Sora-Plan/opensora/models/diffusion/utils/pos_embed.py | /opensora/models/diffusion/utils/pos_embed.py | https://github.com/naver/croco | code from naver/croco | +| 开源代码引入 | https://github.com/PKU-YuanGroup/Open-Sora-Plan/opensora/models/diffusion/utils/pos_embed.py | /opensora/models/diffusion/utils/pos_embed.py | https://github.com/huggingface/diffusers | code from huggingface/diffusers | +| 开源代码引入 | https://github.com/PKU-YuanGroup/Open-Sora-Plan/opensora/models/diffusion/utils/pos_embed.py | /opensora/models/diffusion/utils/pos_embed.py | https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/modeling_llama.py | code from huggingface/transformers | +| 开源代码引入 | https://github.com/PKU-YuanGroup/Open-Sora-Plan/opensora/models/diffusion/utils/pos_embed.py | /opensora/models/diffusion/utils/pos_embed.py | https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/modeling_llama.py | code from huggingface/transformers | +| 开源代码引入 | https://github.com/PKU-YuanGroup/Open-Sora-Plan/opensora/models/frame_interpolation/interpolation.py | /opensora/models/frame_interpolation/interpolation.py | https://github.com/MCG-NKU/AMT/blob/main/demos/demo_2x.py | code from MCG-NKU/AMT | +| 开源代码引入 | https://github.com/PKU-YuanGroup/Open-Sora-Plan/opensora/models/frame_interpolation/utils/flow_utils.py | /opensora/models/frame_interpolation/utils/flow_utils.py | http://vision.middlebury.edu/flow/flowEval-iccv07.pdf | paper A Database and Evaluation Methodology for Optical Flow | +| 开源代码引入 | https://github.com/PKU-YuanGroup/Open-Sora-Plan/opensora/models/super_resolution/basicsr/archs/arch_util.py | /opensora/models/super_resolution/basicsr/archs/arch_util.py | https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/layers/weight_init.py | code from rwightman/pytorch-image-models | +| 开源代码引入 | https://github.com/PKU-YuanGroup/Open-Sora-Plan/opensora/models/super_resolution/basicsr/archs/arch_util.py | /opensora/models/super_resolution/basicsr/archs/arch_util.py | https://people.sc.fsu.edu/ | paper | +| 开源代码引入 | https://github.com/PKU-YuanGroup/Open-Sora-Plan/opensora/models/super_resolution/basicsr/archs/arch_util.py | /opensora/models/super_resolution/basicsr/archs/arch_util.py | https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/layers/weight_init.py | code from rwightman/pytorch-image-models | +| 开源代码引入 | https://github.com/PKU-YuanGroup/Open-Sora-Plan/opensora/models/super_resolution/basicsr/archs/rgt_arch.py | /opensora/models/super_resolution/basicsr/archs/rgt_arch.py | https://github.com/cheerss/CrossFormer/blob/main/models/crossformer.py | code from 
cheerss/CrossFormer | +| 开源代码引入 | https://github.com/PKU-YuanGroup/Open-Sora-Plan/opensora/models/super_resolution/basicsr/archs/rgt_arch.py | /opensora/models/super_resolution/basicsr/archs/rgt_arch.py | https://github.com/zhengchen1999/CAT/blob/main/basicsr/archs/cat_arch.py | code from zhengchen1999/CAT | +| 开源代码引入 | https://github.com/PKU-YuanGroup/Open-Sora-Plan/opensora/models/super_resolution/basicsr/archs/rgt_arch.py | /opensora/models/super_resolution/basicsr/archs/rgt_arch.py | https://github.com/microsoft/Swin-Transformer/blob/main/models/swin_transformer.py | code from microsoft/Swin-Transformer | +| 开源代码引入 | https://github.com/PKU-YuanGroup/Open-Sora-Plan/opensora/models/super_resolution/basicsr/data/data_util.py | /opensora/models/super_resolution/basicsr/data/data_util.py | https://lmdb.readthedocs.io/en/release/ | introduce imdb | +| 开源代码引入 | https://github.com/PKU-YuanGroup/Open-Sora-Plan/opensora/models/super_resolution/basicsr/data/prefetch_dataloader.py | /opensora/models/super_resolution/basicsr/data/prefetch_dataloader.py | https://stackoverflow.com/questions/7323664/python-generator-pre-fetch | code from Winston Ewert | +| 开源代码引入 | https://github.com/PKU-YuanGroup/Open-Sora-Plan/opensora/models/super_resolution/basicsr/data/prefetch_dataloader.py | /opensora/models/super_resolution/basicsr/data/prefetch_dataloader.py | https://github.com/IgorSusmelj/pytorch-styleguide/issues/5 | code from IgorSusmelj/pytorch-styleguide | +| 开源代码引入 | https://github.com/PKU-YuanGroup/Open-Sora-Plan/opensora/models/super_resolution/basicsr/data/prefetch_dataloader.py | /opensora/models/super_resolution/basicsr/data/prefetch_dataloader.py | https://github.com/NVIDIA/apex/issues/304 | code from NVIDIA/apex | +| 开源代码引入 | https://github.com/PKU-YuanGroup/Open-Sora-Plan/opensora/models/super_resolution/basicsr/metrics/psnr_ssim.py | /opensora/models/super_resolution/basicsr/metrics/psnr_ssim.py | https://en.wikipedia.org/wiki/Peak_signal-to-noise_ratio | wiki for Peak_signal-to-noise_ratio | +| 开源代码引入 | https://github.com/PKU-YuanGroup/Open-Sora-Plan/opensora/models/super_resolution/basicsr/metrics/psnr_ssim.py | /opensora/models/super_resolution/basicsr/metrics/psnr_ssim.py | https://ece.uwaterloo.ca/~z70wang/research/ssim/ | introduce ssim | +| 开源代码引入 | https://github.com/PKU-YuanGroup/Open-Sora-Plan/opensora/models/super_resolution/basicsr/utils/dist_util.py | /opensora/models/super_resolution/basicsr/utils/dist_util.py | https://github.com/open-mmlab/mmcv/blob/master/mmcv/runner/dist_utils.py | code from open-mmlab/mmcv | +| 开源代码引入 | https://github.com/PKU-YuanGroup/Open-Sora-Plan/opensora/models/super_resolution/basicsr/utils/file_client.py | /opensora/models/super_resolution/basicsr/utils/file_client.py | https://github.com/open-mmlab/mmcv/blob/master/mmcv/fileio/file_client.py | code from open-mmlab/mmcv | +| 开源代码引入 | https://github.com/PKU-YuanGroup/Open-Sora-Plan/opensora/models/super_resolution/basicsr/utils/matlab_functions.py | /opensora/models/super_resolution/basicsr/utils/matlab_functions.py | https://en.wikipedia.org/wiki/YCbCr | wiki for YCbCr | +| 开源代码引入 | https://github.com/PKU-YuanGroup/Open-Sora-Plan/opensora/models/super_resolution/basicsr/utils/matlab_functions.py | /opensora/models/super_resolution/basicsr/utils/matlab_functions.py | https://en.wikipedia.org/wiki/YCbCr | wiki for YCbCr | +| 开源代码引入 | https://github.com/PKU-YuanGroup/Open-Sora-Plan/opensora/models/super_resolution/basicsr/utils/matlab_functions.py | 
/opensora/models/super_resolution/basicsr/utils/matlab_functions.py | https://en.wikipedia.org/wiki/YCbCr | wiki for YCbCr | +| 开源代码引入 | https://github.com/PKU-YuanGroup/Open-Sora-Plan/opensora/models/super_resolution/basicsr/utils/matlab_functions.py | /opensora/models/super_resolution/basicsr/utils/matlab_functions.py | https://en.wikipedia.org/wiki/YCbCr | wiki for YCbCr | +| 开源代码引入 | https://github.com/PKU-YuanGroup/Open-Sora-Plan/opensora/models/super_resolution/basicsr/utils/matlab_functions.py | /opensora/models/super_resolution/basicsr/utils/matlab_functions.py | https://en.wikipedia.org/wiki/YCbCr | wiki for YCbCr | +| 开源代码引入 | https://github.com/PKU-YuanGroup/Open-Sora-Plan/opensora/models/super_resolution/basicsr/utils/matlab_functions.py | /opensora/models/super_resolution/basicsr/utils/matlab_functions.py | https://en.wikipedia.org/wiki/YCbCr | wiki for YCbCr | +| 开源代码引入 | https://github.com/PKU-YuanGroup/Open-Sora-Plan/opensora/models/super_resolution/basicsr/utils/matlab_functions.py | /opensora/models/super_resolution/basicsr/utils/matlab_functions.py | https://en.wikipedia.org/wiki/YCbCr | wiki for YCbCr | +| 开源代码引入 | https://github.com/PKU-YuanGroup/Open-Sora-Plan/opensora/models/super_resolution/basicsr/utils/matlab_functions.py | /opensora/models/super_resolution/basicsr/utils/matlab_functions.py | https://en.wikipedia.org/wiki/YCbCr | wiki for YCbCr | +| 开源代码引入 | https://github.com/PKU-YuanGroup/Open-Sora-Plan/opensora/models/super_resolution/basicsr/utils/registry.py | /opensora/models/super_resolution/basicsr/utils/registry.py | https://github.com/facebookresearch/fvcore/blob/master/fvcore/common/registry.py | code from facebookresearch/fvcore | +| 开源代码引入 | https://github.com/PKU-YuanGroup/Open-Sora-Plan/opensora/sample/pipeline_videogen.py | /opensora/sample/pipeline_videogen.py | http://www.apache.org/licenses/LICENSE-2.0 | license | +| 开源代码引入 | https://github.com/PKU-YuanGroup/Open-Sora-Plan/opensora/sample/pipeline_videogen.py | /opensora/sample/pipeline_videogen.py | https://huggingface.co/docs/transformers/model_doc/t5 | introduce t5 | +| 开源代码引入 | https://github.com/PKU-YuanGroup/Open-Sora-Plan/opensora/sample/pipeline_videogen.py | /opensora/sample/pipeline_videogen.py | https://huggingface.co/PixArt-alpha/PixArt-alpha/tree/main/t5-v1_1-xxl | download weights | +| 开源代码引入 | https://github.com/PKU-YuanGroup/Open-Sora-Plan/opensora/sample/pipeline_videogen.py | /opensora/sample/pipeline_videogen.py | https://huggingface.co/docs/transformers/model_doc/t5 | introduce t5 | +| 开源代码引入 | https://github.com/PKU-YuanGroup/Open-Sora-Plan/opensora/sample/pipeline_videogen.py | /opensora/sample/pipeline_videogen.py | https://github.com/PixArt-alpha/PixArt-alpha/blob/master/diffusion/model/utils.py | code from PixArt-alpha/PixArt-alpha | +| 开源代码引入 | https://github.com/PKU-YuanGroup/Open-Sora-Plan/opensora/sample/pipeline_videogen.py | /opensora/sample/pipeline_videogen.py | https://arxiv.org/abs/2010.02502 | paper | +| 开源代码引入 | https://github.com/PKU-YuanGroup/Open-Sora-Plan/opensora/sample/pipeline_videogen.py | /opensora/sample/pipeline_videogen.py | https://arxiv.org/abs/2207.12598 | paper | +| 开源代码引入 | https://github.com/PKU-YuanGroup/Open-Sora-Plan/opensora/sample/pipeline_videogen.py | /opensora/sample/pipeline_videogen.py | https://arxiv.org/pdf/2205.11487.pdf | paper | +| 开源代码引入 | https://github.com/PKU-YuanGroup/Open-Sora-Plan/opensora/sample/pipeline_videogen.py | /opensora/sample/pipeline_videogen.py | https://arxiv.org/abs/2010.02502 | paper | +| 开源代码引入 | 
https://github.com/PKU-YuanGroup/Open-Sora-Plan/opensora/sample/pipeline_videogen.py | /opensora/sample/pipeline_videogen.py | https://pytorch.org/docs/stable/generated/torch.Generator.html | introduce torch.Generator | +| 开源代码引入 | https://github.com/PKU-YuanGroup/Open-Sora-Plan/opensora/sample/pipeline_videogen.py | /opensora/sample/pipeline_videogen.py | https://pillow.readthedocs.io/en/stable/ | introduce pillow | +| 开源代码引入 | https://github.com/PKU-YuanGroup/Open-Sora-Plan/opensora/sample/pipeline_videogen.py | /opensora/sample/pipeline_videogen.py | https://arxiv.org/pdf/2205.11487.pdf | paper | +| 开源代码引入 | https://github.com/PKU-YuanGroup/Open-Sora-Plan/opensora/sample/transport_sample.py | /opensora/sample/transport_sample.py | https://github.com/rtqichen/torchdiffeq | code from rtqichen/torchdiffeq | +| 开源代码引入 | https://github.com/PKU-YuanGroup/Open-Sora-Plan/opensora/serve/gradio_utils.py | /opensora/serve/gradio_utils.py | https://www.pnglog.com/AOuPMh.png | web server image | +| 开源代码引入 | https://github.com/PKU-YuanGroup/Open-Sora-Plan/opensora/serve/gradio_utils.py | /opensora/serve/gradio_utils.py | https://github.com/PKU-YuanGroup/Open-Sora-Plan | code from PKU-YuanGroup/Open-Sora-Plan | +| 开源代码引入 | https://github.com/PKU-YuanGroup/Open-Sora-Plan/opensora/serve/gradio_utils.py | /opensora/serve/gradio_utils.py | https://github.com/PKU-YuanGroup/Open-Sora-Plan | code from PKU-YuanGroup/Open-Sora-Plan | +| 开源代码引入 | https://github.com/PKU-YuanGroup/Open-Sora-Plan/opensora/serve/gradio_utils.py | /opensora/serve/gradio_utils.py | https://github.com/PKU-YuanGroup/Open-Sora-Plan | code from PKU-YuanGroup/Open-Sora-Plan | +| 开源代码引入 | https://github.com/PKU-YuanGroup/Open-Sora-Plan/opensora/serve/gradio_utils.py | /opensora/serve/gradio_utils.py | https://img.shields.io/badge/Github-Code-blue | web server image | +| 开源代码引入 | https://github.com/PKU-YuanGroup/Open-Sora-Plan/opensora/serve/gradio_utils.py | /opensora/serve/gradio_utils.py | https://github.com/PKU-YuanGroup/Open-Sora-Plan/stargazers | code from PKU-YuanGroup/Open-Sora-Plan | +| 开源代码引入 | https://github.com/PKU-YuanGroup/Open-Sora-Plan/opensora/serve/gradio_utils.py | /opensora/serve/gradio_utils.py | https://img.shields.io/github/stars/PKU-YuanGroup/Open-Sora-Plan.svg?style=social | web server image | +| 开源代码引入 | https://github.com/PKU-YuanGroup/Open-Sora-Plan/opensora/train/train_t2v.py | /opensora/train/train_t2v.py | https://pytorch.org/docs/stable/notes/cuda.html | introduce TF32 on Ampere GPUs | +| 开源代码引入 | https://github.com/PKU-YuanGroup/Open-Sora-Plan/opensora/train/train_t2v.py | /opensora/train/train_t2v.py | https://arxiv.org/abs/2303.09556 | paper | +| 开源代码引入 | https://github.com/PKU-YuanGroup/Open-Sora-Plan/opensora/train/train_t2v.py | /opensora/train/train_t2v.py | https://pytorch.org/docs/stable/notes/cuda.html | introduce cuda | +| 开源代码引入 | https://github.com/PKU-YuanGroup/Open-Sora-Plan/opensora/train/train_t2v.py | /opensora/train/train_t2v.py | https://www.tensorflow.org/tensorboard | introduce TF32 on Ampere GPUs | +| 开源代码引入 | https://github.com/PKU-YuanGroup/Open-Sora-Plan/opensora/utils/dataset_utils.py | /opensora/utils/dataset_utils.py | https://github.com/dmlc/decord | code from dmlc/decord | +| 开源代码引入 | https://github.com/PKU-YuanGroup/Open-Sora-Plan/opensora/utils/taming_download.py | /opensora/utils/taming_download.py | https://github.com/CompVis/taming-transformers.git | code from CompVis/taming-transformers.git | +| 开源代码引入 | 
https://github.com/PKU-YuanGroup/Open-Sora-Plan/opensora/utils/taming_download.py | /opensora/utils/taming_download.py | https://heibox.uni-heidelberg.de/f/607503859c864bc1b30b/?dl=1 | download weights | diff --git a/PyTorch/built-in/mlm/OpenSoraPlan1.1/scripts/text_condition/train_videoae_65x512x512_16.sh b/PyTorch/built-in/mlm/OpenSoraPlan1.1/scripts/text_condition/train_videoae_65x512x512_16.sh new file mode 100644 index 0000000000..30e2505f49 --- /dev/null +++ b/PyTorch/built-in/mlm/OpenSoraPlan1.1/scripts/text_condition/train_videoae_65x512x512_16.sh @@ -0,0 +1,33 @@ +accelerate launch \ + --config_file scripts/accelerate_configs/deepspeed_zero2_config.yaml \ + opensora/train/train_t2v.py \ + --model LatteT2V-XL/122 \ + --text_encoder_name DeepFloyd/t5-v1_1-xxl \ + --cache_dir "./cache_dir" \ + --dataset t2v \ + --ae CausalVAEModel_4x8x8 \ + --ae_path "LanguageBind/Open-Sora-Plan-v1.1.0/vae" \ + --video_data "scripts/train_data/video_data.txt" \ + --use_img_from_vid \ + --sample_rate 1 \ + --num_frames 65 \ + --max_image_size 512 \ + --gradient_checkpointing \ + --attention_mode math \ + --train_batch_size=2 \ + --dataloader_num_workers 4 \ + --gradient_accumulation_steps=1 \ + --max_train_steps=1000000 \ + --learning_rate=2e-05 \ + --lr_scheduler="constant" \ + --lr_warmup_steps=0 \ + --mixed_precision="bf16" \ + --report_to="tensorboard" \ + --checkpointing_steps=500 \ + --output_dir="65x512x512_10node_bs2_lr2e-5_4img" \ + --allow_tf32 \ + --use_deepspeed \ + --model_max_length 300 \ + --use_image_num 16 \ + --enable_tiling \ + --pretrained LanguageBind/Open-Sora-Plan-v1.1.0/65x512x512/diffusion_pytorch_model.safetensors diff --git a/PyTorch/built-in/mlm/OpenSoraPlan1.1/scripts/train_data/video_data.txt b/PyTorch/built-in/mlm/OpenSoraPlan1.1/scripts/train_data/video_data.txt index 7f9c55548c..0b3ba5d66d 100644 --- a/PyTorch/built-in/mlm/OpenSoraPlan1.1/scripts/train_data/video_data.txt +++ b/PyTorch/built-in/mlm/OpenSoraPlan1.1/scripts/train_data/video_data.txt @@ -1,3 +1 @@ -/dxyl_data02/datasets/pixabay_v2,/dxyl_data02/anno_jsons/video_pixabay_65f_601513.json -/dxyl_data02/datasets/pexels,/dxyl_data02/anno_jsons/video_pexel_65f_3832666.json -/dxyl_data02/datasets/mixkit,/dxyl_data02/anno_jsons/video_mixkit_65f_54735.json \ No newline at end of file +dataset/mixkit2,dataset/mixkit2/video_mixkit_65f_54735.json \ No newline at end of file -- Gitee From 1db0abc7cf4f36e200183726dc9d9720e7c18429 Mon Sep 17 00:00:00 2001 From: lin-mingzhe-hw Date: Sat, 22 Jun 2024 15:28:31 +0800 Subject: [PATCH 2/3] [built-in][PyTorch][OpenSoraPlan] readme backup --- .../mlm/OpenSoraPlan1.1/README_ORG.md | 410 ++++++++++++++++++ 1 file changed, 410 insertions(+) create mode 100644 PyTorch/built-in/mlm/OpenSoraPlan1.1/README_ORG.md diff --git a/PyTorch/built-in/mlm/OpenSoraPlan1.1/README_ORG.md b/PyTorch/built-in/mlm/OpenSoraPlan1.1/README_ORG.md new file mode 100644 index 0000000000..e363bfff40 --- /dev/null +++ b/PyTorch/built-in/mlm/OpenSoraPlan1.1/README_ORG.md @@ -0,0 +1,410 @@ +# Open-Sora Plan + + + +[![slack badge](https://img.shields.io/badge/Discord-join-blueviolet?logo=discord&)](https://discord.gg/YtsBNg7n) +[![WeChat badge](https://img.shields.io/badge/微信-加入-green?logo=wechat&)](https://github.com/PKU-YuanGroup/Open-Sora-Plan/issues/53#issuecomment-1987226516) +[![hf_space](https://img.shields.io/badge/🤗-Open%20In%20Spaces-blue.svg)](https://huggingface.co/spaces/LanguageBind/Open-Sora-Plan-v1.1.0) 
+[![Twitter](https://img.shields.io/badge/-Twitter@LinBin46984-black?logo=twitter&logoColor=1D9BF0)](https://x.com/LinBin46984/status/1795018003345510687)
+[![hf_space](https://img.shields.io/badge/🤗-Open%20In%20Spaces-blue.svg)](https://huggingface.co/spaces/LanguageBind/Open-Sora-Plan-v1.1.0) +[![License](https://img.shields.io/badge/License-MIT-yellow)](https://github.com/PKU-YuanGroup/Open-Sora-Plan/blob/main/LICENSE) +[![GitHub repo contributors](https://img.shields.io/github/contributors-anon/PKU-YuanGroup/Open-Sora-Plan?style=flat&label=Contributors)](https://github.com/PKU-YuanGroup/Open-Sora-Plan/graphs/contributors) +[![GitHub Commit](https://img.shields.io/github/commit-activity/m/PKU-YuanGroup/Open-Sora-Plan?label=Commit)](https://github.com/PKU-YuanGroup/Open-Sora-Plan/commits/main/) +[![Pr](https://img.shields.io/github/issues-pr-closed-raw/PKU-YuanGroup/Open-Sora-Plan.svg?label=Merged+PRs&color=green)](https://github.com/PKU-YuanGroup/Open-Sora-Plan/pulls) +[![GitHub issues](https://img.shields.io/github/issues/PKU-YuanGroup/Open-Sora-Plan?color=critical&label=Issues)](https://github.com/PKU-YuanGroup/Video-LLaVA/issues?q=is%3Aopen+is%3Aissue) +[![GitHub closed issues](https://img.shields.io/github/issues-closed/PKU-YuanGroup/Open-Sora-Plan?color=success&label=Issues)](https://github.com/PKU-YuanGroup/Video-LLaVA/issues?q=is%3Aissue+is%3Aclosed)
+[![GitHub repo stars](https://img.shields.io/github/stars/PKU-YuanGroup/Open-Sora-Plan?style=flat&logo=github&logoColor=whitesmoke&label=Stars)](https://github.com/PKU-YuanGroup/Open-Sora-Plan/stargazers)  +[![GitHub repo forks](https://img.shields.io/github/forks/PKU-YuanGroup/Open-Sora-Plan?style=flat&logo=github&logoColor=whitesmoke&label=Forks)](https://github.com/PKU-YuanGroup/Open-Sora-Plan/network)  +[![GitHub repo watchers](https://img.shields.io/github/watchers/PKU-YuanGroup/Open-Sora-Plan?style=flat&logo=github&logoColor=whitesmoke&label=Watchers)](https://github.com/PKU-YuanGroup/Open-Sora-Plan/watchers)  +[![GitHub repo size](https://img.shields.io/github/repo-size/PKU-YuanGroup/Open-Sora-Plan?style=flat&logo=github&logoColor=whitesmoke&label=Repo%20Size)](https://github.com/PKU-YuanGroup/Open-Sora-Plan/archive/refs/heads/main.zip) + +
+v1.0.0 badge +[![Twitter](https://img.shields.io/badge/-Twitter@LinBin46984-black?logo=twitter&logoColor=1D9BF0)](https://x.com/LinBin46984/status/1763476690385424554?s=20)
+[![hf_space](https://img.shields.io/badge/🤗-Open%20In%20Spaces-blue.svg)](https://huggingface.co/spaces/LanguageBind/Open-Sora-Plan-v1.0.0) +[![hf_space](https://img.shields.io/badge/🤗-Open%20In%20Spaces-blue.svg)](https://huggingface.co/spaces/fffiloni/Open-Sora-Plan-v1-0-0) +[![Replicate demo and cloud API](https://replicate.com/camenduru/open-sora-plan-512x512/badge)](https://replicate.com/camenduru/open-sora-plan-512x512) +[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/camenduru/Open-Sora-Plan-jupyter/blob/main/Open_Sora_Plan_jupyter.ipynb)
+
+ +We are thrilled to present **Open-Sora-Plan v1.1.0**, which significantly enhances video generation quality and text control capabilities. See our [report](docs/Report-v1.1.0.md). We show compressed .gif on GitHub, which loses some quality. + +Thanks to **HUAWEI Ascend Team** for supporting us. In the second stage, we used Huawei Ascend computing power for training. This stage's training and inference were fully supported by Huawei. Models trained on Huawei Ascend can also be loaded into GPUs and generate videos of the same quality. + +目前已经支持使用国产AI芯片(华为昇腾,期待更多国产算力芯片)进行完整的训练和推理。在项目第二阶段,所有训练和推理任务完全由华为昇腾芯片支持。此外,基于华为昇腾的512卡集群训练出的模型,也可以无缝地在GPU上运行,并保持相同的视频质量。详细信息请参考我们的[hw branch](https://github.com/PKU-YuanGroup/Open-Sora-Plan/tree/hw). + + +### 221×512×512 Text-to-Video Generation + + + +| 221×512×512 (9.2s) | 221×512×512 (9.2s) | 221×512×512 (9.2s) | +| --- | --- | --- | +| | | | +| 3D animation of a small, round, fluffy creature with big, expressive eyes explores ... | A single drop of liquid metal falls from a floating orb, landing on a mirror-like ... | The video presents an abstract composition centered around a hexagonal shape adorned ... | +| | | | +| A drone camera circles around a beautiful historic church built on a rocky outcropping ... | Aerial view of Santorini during the blue hour, showcasing the stunning architecture ... | An aerial shot of a lighthouse standing tall on a rocky cliff, its beacon cutting ... | +| | | | +| A snowy forest landscape with a dirt road running through it. The road is flanked by ... | Drone shot along the Hawaii jungle coastline, sunny day. Kayaks in the water. |The camera rotates around a large stack of vintage televisions all showing different ... | + + +### 65×512×512 Text-to-Video Generation + +| 65×512×512 (2.7s) | 65×512×512 (2.7s) | 65×512×512 (2.7s) | +| --- | --- | --- | +| | | | +| In an ornate, historical hall, a massive tidal wave peaks and begins to crash. Two ... | A Shiba Inu dog wearing a beret and black turtleneck. | A painting of a boat on water comes to life, with waves crashing and the boat becoming ... | +|| | | +| A person clad in a space suit with a helmet and equipped with a chest light and arm ... | 3D animation of a small, round, fluffy creature with big, expressive eyes explores a ... | In a studio, there is a painting depicting a ship sailing through the rough sea. | +| | | | +| A robot dog trots down a deserted alley at night, its metallic paws clinking softly ... | A lone surfer rides a massive wave, skillfully maneuvering through the surf. The water ... | A solitary cheetah sprints across the savannah, its powerful muscles propelling it ... | + +### 65×512×512 Video Editing + +| generated 65×512×512 (2.7s) | edited 65×512×512 (2.7s) | +| --- | --- | +| | | +| | | +| | | + +### 512×512 Text-to-Image Generation + + + + + + +## 📰 News + +**[2024.05.27]** 🚀🚀🚀 We are launching Open-Sora Plan v1.1.0, which significantly improves video quality and length, and is fully open source! Please check out our latest [report](docs/Report-v1.1.0.md). + +**[2024.04.09]** 🚀 Excited to share our latest exploration on metamorphic time-lapse video generation: [MagicTime](https://github.com/PKU-YuanGroup/MagicTime), which learns real-world physics knowledge from time-lapse videos. Here is the dataset for train (updating): [Open-Sora-Dataset](https://github.com/PKU-YuanGroup/Open-Sora-Dataset). 
+ +**[2024.04.07]** 🔥🔥🔥 Today, we are thrilled to present Open-Sora-Plan v1.0.0, which significantly enhances video generation quality and text control capabilities. See our [report](docs/Report-v1.0.0.md). Thanks to HUAWEI NPU for supporting us. + +**[2024.03.27]** 🚀🚀🚀 We release the report of [VideoCausalVAE](docs/CausalVideoVAE.md), which supports both images and videos. We present our reconstructed video in this demonstration as follows. The text-to-video model is on the way. + +
+View more + +**[2024.03.10]** 🚀🚀🚀 This repo supports training a latent size of 225×90×90 (t×h×w), which means we are able to **train 1 minute of 1080P video with 30FPS** (2× interpolated frames and 2× super resolution) under class-condition. + +**[2024.03.08]** We support the training code of text condition with 16 frames of 512x512. The code is mainly borrowed from [Latte](https://github.com/Vchitect/Latte). + +**[2024.03.07]** We support training with 128 frames (when sample rate = 3, which is about 13 seconds) of 256x256, or 64 frames (which is about 6 seconds) of 512x512. + +**[2024.03.05]** See our latest [todo](https://github.com/PKU-YuanGroup/Open-Sora-Plan?tab=readme-ov-file#todo), pull requests are welcome. + +**[2024.03.04]** We re-organize and modulize our code to make it easy to [contribute](https://github.com/PKU-YuanGroup/Open-Sora-Plan?tab=readme-ov-file#how-to-contribute-to-the-open-sora-plan-community) to the project, to contribute please see the [Repo structure](https://github.com/PKU-YuanGroup/Open-Sora-Plan?tab=readme-ov-file#repo-structure). + +**[2024.03.03]** We open some [discussions](https://github.com/PKU-YuanGroup/Open-Sora-Plan/discussions) to clarify several issues. + +**[2024.03.01]** Training code is available now! Learn more on our [project page](https://pku-yuangroup.github.io/Open-Sora-Plan/). Please feel free to watch 👀 this repository for the latest updates. + +
+ +## 💪 Goal +This project aims to create a simple and scalable repo, to reproduce [Sora](https://openai.com/sora) (OpenAI, but we prefer to call it "ClosedAI" ). We wish the open-source community can contribute to this project. Pull requests are welcome!!! + +本项目希望通过开源社区的力量复现Sora,由北大-兔展AIGC联合实验室共同发起,当前版本离目标差距仍然较大,仍需持续完善和快速迭代,欢迎Pull request!!! + +Project stages: +- Primary +1. Setup the codebase and train an un-conditional model on a landscape dataset. +2. Train models that boost resolution and duration. + +- Extensions +3. Conduct text2video experiments on landscape dataset. +4. Train the 1080p model on video2text dataset. +5. Control model with more conditions. + + +
+ + +
+ + +
+✊ Todo + +#### Setup the codebase and train an unconditional model on landscape dataset +- [x] Fix typos & Update readme. 🤝 Thanks to [@mio2333](https://github.com/mio2333), [@CreamyLong](https://github.com/CreamyLong), [@chg0901](https://github.com/chg0901), [@Nyx-177](https://github.com/Nyx-177), [@HowardLi1984](https://github.com/HowardLi1984), [@sennnnn](https://github.com/sennnnn), [@Jason-fan20](https://github.com/Jason-fan20) +- [x] Setup environment. 🤝 Thanks to [@nameless1117](https://github.com/nameless1117) +- [ ] Add docker file. ⌛ [WIP] 🤝 Thanks to [@Mon-ius](https://github.com/Mon-ius), [@SimonLeeGit](https://github.com/SimonLeeGit) +- [ ] Enable type hints for functions. 🤝 Thanks to [@RuslanPeresy](https://github.com/RuslanPeresy), 🙏 **[Need your contribution]** +- [x] Resume from checkpoint. +- [x] Add Video-VQVAE model, which is borrowed from [VideoGPT](https://github.com/wilson1yan/VideoGPT). +- [x] Support variable aspect ratios, resolutions, durations training on [DiT](https://github.com/facebookresearch/DiT). +- [x] Support Dynamic mask input inspired by [FiT](https://github.com/whlzy/FiT). +- [x] Add class-conditioning on embeddings. +- [x] Incorporating [Latte](https://github.com/Vchitect/Latte) as main codebase. +- [x] Add VAE model, which is borrowed from [Stable Diffusion](https://github.com/CompVis/latent-diffusion). +- [x] Joint dynamic mask input with VAE. +- [ ] Add VQVAE from [VQGAN](https://github.com/CompVis/taming-transformers). 🙏 **[Need your contribution]** +- [ ] Make the codebase ready for the cluster training. Add SLURM scripts. 🙏 **[Need your contribution]** +- [x] Refactor VideoGPT. 🤝 Thanks to [@qqingzheng](https://github.com/qqingzheng), [@luo3300612](https://github.com/luo3300612), [@sennnnn](https://github.com/sennnnn) +- [x] Add sampling script. +- [ ] Add DDP sampling script. ⌛ [WIP] +- [x] Use accelerate on multi-node. 🤝 Thanks to [@sysuyy](https://github.com/sysuyy) +- [x] Incorporate [SiT](https://github.com/willisma/SiT). 🤝 Thanks to [@khan-yin](https://github.com/khan-yin) +- [x] Add evaluation scripts (FVD, CLIP score). 🤝 Thanks to [@rain305f](https://github.com/rain305f) + +#### Train models that boost resolution and duration +- [x] Add [PI](https://arxiv.org/abs/2306.15595) to support out-of-domain size. 🤝 Thanks to [@jpthu17](https://github.com/jpthu17) +- [x] Add 2D RoPE to improve generalization ability as [FiT](https://github.com/whlzy/FiT). 🤝 Thanks to [@jpthu17](https://github.com/jpthu17) +- [x] Compress KV according to [PixArt-sigma](https://pixart-alpha.github.io/PixArt-sigma-project). +- [x] Support deepspeed for videogpt training. 🤝 Thanks to [@sennnnn](https://github.com/sennnnn) +- [x] Train a **low dimension** Video-AE, whether it is VAE or VQVAE. +- [x] Extract offline feature. +- [x] Train with offline feature. +- [x] Add frame interpolation model. 🤝 Thanks to [@yunyangge](https://github.com/yunyangge) +- [x] Add super resolution model. 🤝 Thanks to [@Linzy19](https://github.com/Linzy19) +- [x] Add accelerate to automatically manage training. +- [x] Joint training with images. +- [ ] Implement [MaskDiT](https://github.com/Anima-Lab/MaskDiT) technique for fast training. 🙏 **[Need your contribution]** +- [ ] Incorporate [NaViT](https://arxiv.org/abs/2307.06304). 🙏 **[Need your contribution]** +- [ ] Add [FreeNoise](https://github.com/arthur-qiu/FreeNoise-LaVie) support for training-free longer video generation. 🙏 **[Need your contribution]** + +#### Conduct text2video experiments on landscape dataset. 
+- [x] Load pretrained weights from [Latte](https://github.com/Vchitect/Latte). +- [ ] Implement [PeRFlow](https://github.com/magic-research/piecewise-rectified-flow) for improving the sampling process. 🙏 **[Need your contribution]** +- [x] Finish data loading, pre-processing utils. +- [x] Add T5 support. +- [x] Add CLIP support. 🤝 Thanks to [@Ytimed2020](https://github.com/Ytimed2020) +- [x] Add text2image training script. +- [ ] Add prompt captioner. + - [ ] Collect training data. + - [ ] Need video-text pairs with caption. 🙏 **[Need your contribution]** + - [ ] Extract multi-frame descriptions by large image-language models. 🤝 Thanks to [@HowardLi1984](https://github.com/HowardLi1984) + - [ ] Extract video description by large video-language models. 🙏 **[Need your contribution]** + - [ ] Integrate captions to get a dense caption by using a large language model, such as GPT-4. 🤝 Thanks to [@HowardLi1984](https://github.com/HowardLi1984) + - [ ] Train a captioner to refine captions. 🚀 **[Require more computation]** + +#### Train the 1080p model on video2text dataset +- [ ] Looking for a suitable dataset, welcome to discuss and recommend. 🙏 **[Need your contribution]** +- [ ] Add synthetic video created by game engines or 3D representations. 🙏 **[Need your contribution]** +- [x] Finish data loading, and pre-processing utils. +- [x] Support memory friendly training. + - [x] Add flash-attention2 from pytorch. + - [x] Add xformers. 🤝 Thanks to [@jialin-zhao](https://github.com/jialin-zhao) + - [x] Support mixed precision training. + - [x] Add gradient checkpoint. + - [x] Support for ReBased and Ring attention. 🤝 Thanks to [@kabachuha](https://github.com/kabachuha) + - [x] Train using the deepspeed engine. 🤝 Thanks to [@sennnnn](https://github.com/sennnnn) +- [ ] Train with a text condition. Here we could conduct different experiments: 🚀 **[Require more computation]** + - [x] Train with T5 conditioning. + - [ ] Train with CLIP conditioning. + - [ ] Train with CLIP + T5 conditioning (probably costly during training and experiments). +- [ ] Support Chinese. ⌛ [WIP] + +#### Control model with more condition +- [ ] Incorporating [ControlNet](https://github.com/lllyasviel/ControlNet). ⌛ [WIP] 🙏 **[Need your contribution]** +- [ ] Incorporating [ReVideo](https://github.com/MC-E/ReVideo). ⌛ [WIP] + +
+ +## 📂 Repo structure (WIP) +``` +├── README.md +├── docs +│ ├── Data.md -> Datasets description. +│ ├── Contribution_Guidelines.md -> Contribution guidelines description. +├── scripts -> All scripts. +├── opensora +│   ├── dataset +│   ├── models +│   │   ├── ae -> Compress videos to latents +│   │   │   ├── imagebase +│   │   │   │   ├── vae +│   │   │   │   └── vqvae +│   │   │   └── videobase +│   │   │   ├── vae +│   │   │   └── vqvae +│   │   ├── captioner +│   │   ├── diffusion -> Denoise latents +│   │   │   ├── diffusion +│   │   │   ├── dit +│   │   │   ├── latte +│   │   │   └── unet +│   │   ├── frame_interpolation +│   │   ├── super_resolution +│   │   └── text_encoder +│   ├── sample +│   ├── train -> Training code +│   └── utils +``` + +## 🛠️ Requirements and Installation + +1. Clone this repository and navigate to Open-Sora-Plan folder +``` +git clone https://github.com/PKU-YuanGroup/Open-Sora-Plan +cd Open-Sora-Plan +``` +2. Install required packages +``` +conda create -n opensora python=3.8 -y +conda activate opensora +pip install -e . +``` +3. Install additional packages for training cases +``` +pip install -e ".[train]" +pip install flash-attn --no-build-isolation +``` +4. Install optional requirements such as static type checking: +``` +pip install -e '.[dev]' +``` + +## 🗝️ Usage + + +### 🤗 Demo + +#### Gradio Web UI + +Highly recommend trying out our web demo by the following command. We also provide [online demo](https://huggingface.co/spaces/LanguageBind/Open-Sora-Plan-v1.1.0) [![hf_space](https://img.shields.io/badge/🤗-Open%20In%20Spaces-blue.svg)](https://huggingface.co/spaces/LanguageBind/Open-Sora-Plan-v1.1.0). + +
+v1.0.0 + +Highly recommend trying out our web demo by the following command. We also provide [online demo](https://huggingface.co/spaces/LanguageBind/Open-Sora-Plan-v1.0.0) [![hf_space](https://img.shields.io/badge/🤗-Open%20In%20Spaces-blue.svg)](https://huggingface.co/spaces/LanguageBind/Open-Sora-Plan-v1.0.0) and [![hf_space](https://img.shields.io/badge/🤗-Open%20In%20Spaces-blue.svg)](https://huggingface.co/spaces/fffiloni/Open-Sora-Plan-v1-0-0) in Huggingface Spaces. + +🤝 Enjoying the [![Replicate demo and cloud API](https://replicate.com/camenduru/open-sora-plan-512x512/badge)](https://replicate.com/camenduru/open-sora-plan-512x512) and [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/camenduru/Open-Sora-Plan-jupyter/blob/main/Open_Sora_Plan_jupyter.ipynb), created by [@camenduru](https://github.com/camenduru), who generously supports our research! + +
+ +```bash +python -m opensora.serve.gradio_web_server +``` + +#### CLI Inference + +```bash +sh scripts/text_condition/sample_video.sh +``` + +### Datasets +Refer to [Data.md](docs/Data.md) + +### Evaluation +Refer to the document [EVAL.md](docs/EVAL.md). + +### CausalVideoVAE + +#### Reconstructing + +Example: + +```Python +python examples/rec_imvi_vae.py --video_path test_video.mp4 --rec_path output_video.mp4 --fps 24 --resolution 512 --crop_size 512 --num_frames 128 --sample_rate 1 --ae CausalVAEModel_4x8x8 --model_path pretrained_488_release --enable_tiling --enable_time_chunk +``` + +Parameter explanation: + +- `--enable_tiling`: This parameter is a flag to enable a tiling conv. + +#### Training and Eval + +Please refer to the document [CausalVideoVAE](docs/Train_And_Eval_CausalVideoVAE.md). + +### VideoGPT VQVAE + +Please refer to the document [VQVAE](docs/VQVAE.md). + +### Video Diffusion Transformer + +#### Training +``` +sh scripts/text_condition/train_videoae_65x512x512.sh +``` +``` +sh scripts/text_condition/train_videoae_221x512x512.sh +``` +``` +sh scripts/text_condition/train_videoae_513x512x512.sh +``` + + + +## 💡 How to Contribute to the Open-Sora Plan Community +We greatly appreciate your contributions to the Open-Sora Plan open-source community and helping us make it even better than it is now! + +For more details, please refer to the [Contribution Guidelines](docs/Contribution_Guidelines.md) + + + + +## 👍 Acknowledgement +* [Latte](https://github.com/Vchitect/Latte): The **main codebase** we built upon and it is an wonderful video generated model. +* [PixArt-alpha](https://github.com/PixArt-alpha/PixArt-alpha): Fast Training of Diffusion Transformer for Photorealistic Text-to-Image Synthesis. +* [ShareGPT4Video](https://github.com/InternLM/InternLM-XComposer/tree/main/projects/ShareGPT4Video): Improving Video Understanding and Generation with Better Captions. +* [VideoGPT](https://github.com/wilson1yan/VideoGPT): Video Generation using VQ-VAE and Transformers. +* [DiT](https://github.com/facebookresearch/DiT): Scalable Diffusion Models with Transformers. +* [FiT](https://github.com/whlzy/FiT): Flexible Vision Transformer for Diffusion Model. +* [Positional Interpolation](https://arxiv.org/abs/2306.15595): Extending Context Window of Large Language Models via Positional Interpolation. + + +## 🔒 License +* See [LICENSE](LICENSE) for details. 
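+
+As a small, optional supplement (not part of the upstream repository), the sketch below batches the `rec_imvi_vae.py` reconstruction example from the CausalVideoVAE section above over a folder of clips. The `test_videos/` and `rec_outputs/` directory names are placeholders; every flag is copied verbatim from that single-video example.
+
+```bash
+# Loop the single-video reconstruction example over all .mp4 clips in a folder.
+# Adjust the input/output folders and --model_path to your local checkout.
+mkdir -p rec_outputs
+for f in test_videos/*.mp4; do
+  python examples/rec_imvi_vae.py \
+    --video_path "$f" \
+    --rec_path "rec_outputs/$(basename "$f")" \
+    --fps 24 --resolution 512 --crop_size 512 --num_frames 128 --sample_rate 1 \
+    --ae CausalVAEModel_4x8x8 --model_path pretrained_488_release \
+    --enable_tiling --enable_time_chunk
+done
+```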
+ + + + +## ✏️ Citing + +### BibTeX + +```bibtex +@software{pku_yuan_lab_and_tuzhan_ai_etc_2024_10948109, + author = {PKU-Yuan Lab and Tuzhan AI etc.}, + title = {Open-Sora-Plan}, + month = apr, + year = 2024, + publisher = {GitHub}, + doi = {10.5281/zenodo.10948109}, + url = {https://doi.org/10.5281/zenodo.10948109} +} +``` +### Latest DOI + +[![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.10948109.svg)](https://zenodo.org/records/10948109) + +## 🤝 Community contributors + + + + -- Gitee From 4fffa56671435be9c877a219bfd6800896a248d7 Mon Sep 17 00:00:00 2001 From: lin-mingzhe-hw Date: Sat, 22 Jun 2024 15:29:56 +0800 Subject: [PATCH 3/3] [built-in][PyTorch][OpenSoraPlan] edit readme --- .../built-in/mlm/OpenSoraPlan1.1/README.md | 532 ++++++------------ 1 file changed, 171 insertions(+), 361 deletions(-) diff --git a/PyTorch/built-in/mlm/OpenSoraPlan1.1/README.md b/PyTorch/built-in/mlm/OpenSoraPlan1.1/README.md index e363bfff40..702ee16012 100644 --- a/PyTorch/built-in/mlm/OpenSoraPlan1.1/README.md +++ b/PyTorch/built-in/mlm/OpenSoraPlan1.1/README.md @@ -1,410 +1,220 @@ -# Open-Sora Plan - +# OpenSoraPlan1.1 for PyTorch +# 目录 -[![slack badge](https://img.shields.io/badge/Discord-join-blueviolet?logo=discord&)](https://discord.gg/YtsBNg7n) -[![WeChat badge](https://img.shields.io/badge/微信-加入-green?logo=wechat&)](https://github.com/PKU-YuanGroup/Open-Sora-Plan/issues/53#issuecomment-1987226516) -[![hf_space](https://img.shields.io/badge/🤗-Open%20In%20Spaces-blue.svg)](https://huggingface.co/spaces/LanguageBind/Open-Sora-Plan-v1.1.0) -[![Twitter](https://img.shields.io/badge/-Twitter@LinBin46984-black?logo=twitter&logoColor=1D9BF0)](https://x.com/LinBin46984/status/1795018003345510687)
-[![hf_space](https://img.shields.io/badge/🤗-Open%20In%20Spaces-blue.svg)](https://huggingface.co/spaces/LanguageBind/Open-Sora-Plan-v1.1.0) -[![License](https://img.shields.io/badge/License-MIT-yellow)](https://github.com/PKU-YuanGroup/Open-Sora-Plan/blob/main/LICENSE) -[![GitHub repo contributors](https://img.shields.io/github/contributors-anon/PKU-YuanGroup/Open-Sora-Plan?style=flat&label=Contributors)](https://github.com/PKU-YuanGroup/Open-Sora-Plan/graphs/contributors) -[![GitHub Commit](https://img.shields.io/github/commit-activity/m/PKU-YuanGroup/Open-Sora-Plan?label=Commit)](https://github.com/PKU-YuanGroup/Open-Sora-Plan/commits/main/) -[![Pr](https://img.shields.io/github/issues-pr-closed-raw/PKU-YuanGroup/Open-Sora-Plan.svg?label=Merged+PRs&color=green)](https://github.com/PKU-YuanGroup/Open-Sora-Plan/pulls) -[![GitHub issues](https://img.shields.io/github/issues/PKU-YuanGroup/Open-Sora-Plan?color=critical&label=Issues)](https://github.com/PKU-YuanGroup/Video-LLaVA/issues?q=is%3Aopen+is%3Aissue) -[![GitHub closed issues](https://img.shields.io/github/issues-closed/PKU-YuanGroup/Open-Sora-Plan?color=success&label=Issues)](https://github.com/PKU-YuanGroup/Video-LLaVA/issues?q=is%3Aissue+is%3Aclosed)
-[![GitHub repo stars](https://img.shields.io/github/stars/PKU-YuanGroup/Open-Sora-Plan?style=flat&logo=github&logoColor=whitesmoke&label=Stars)](https://github.com/PKU-YuanGroup/Open-Sora-Plan/stargazers)  -[![GitHub repo forks](https://img.shields.io/github/forks/PKU-YuanGroup/Open-Sora-Plan?style=flat&logo=github&logoColor=whitesmoke&label=Forks)](https://github.com/PKU-YuanGroup/Open-Sora-Plan/network)  -[![GitHub repo watchers](https://img.shields.io/github/watchers/PKU-YuanGroup/Open-Sora-Plan?style=flat&logo=github&logoColor=whitesmoke&label=Watchers)](https://github.com/PKU-YuanGroup/Open-Sora-Plan/watchers)  -[![GitHub repo size](https://img.shields.io/github/repo-size/PKU-YuanGroup/Open-Sora-Plan?style=flat&logo=github&logoColor=whitesmoke&label=Repo%20Size)](https://github.com/PKU-YuanGroup/Open-Sora-Plan/archive/refs/heads/main.zip) +- [简介](#简介) + - [模型介绍](#模型介绍) + - [支持任务列表](#支持任务列表) + - [代码实现](#代码实现) -
-v1.0.0 badge -[![Twitter](https://img.shields.io/badge/-Twitter@LinBin46984-black?logo=twitter&logoColor=1D9BF0)](https://x.com/LinBin46984/status/1763476690385424554?s=20)
-[![hf_space](https://img.shields.io/badge/🤗-Open%20In%20Spaces-blue.svg)](https://huggingface.co/spaces/LanguageBind/Open-Sora-Plan-v1.0.0) -[![hf_space](https://img.shields.io/badge/🤗-Open%20In%20Spaces-blue.svg)](https://huggingface.co/spaces/fffiloni/Open-Sora-Plan-v1-0-0) -[![Replicate demo and cloud API](https://replicate.com/camenduru/open-sora-plan-512x512/badge)](https://replicate.com/camenduru/open-sora-plan-512x512) -[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/camenduru/Open-Sora-Plan-jupyter/blob/main/Open_Sora_Plan_jupyter.ipynb)
-
+- [准备训练环境](#准备训练环境) + - [安装模型环境](#安装模型环境) + - [安装昇腾环境](#安装昇腾环境) +- [LatteT2V](#LatteT2V) + - [训练数据集准备](#训练数据集准备) + - [准备预训练模型](#准备预训练模型) + - [快速开始](#快速开始) + - [训练任务](#训练任务) + - [性能展示](#性能展示) + - [在线推理任务](#在线推理任务) +- [公网地址说明](#公网地址说明) +- [变更说明](#变更说明) +- [FAQ](#FAQ) -We are thrilled to present **Open-Sora-Plan v1.1.0**, which significantly enhances video generation quality and text control capabilities. See our [report](docs/Report-v1.1.0.md). We show compressed .gif on GitHub, which loses some quality. +# 简介 -Thanks to **HUAWEI Ascend Team** for supporting us. In the second stage, we used Huawei Ascend computing power for training. This stage's training and inference were fully supported by Huawei. Models trained on Huawei Ascend can also be loaded into GPUs and generate videos of the same quality. +## 模型介绍 -目前已经支持使用国产AI芯片(华为昇腾,期待更多国产算力芯片)进行完整的训练和推理。在项目第二阶段,所有训练和推理任务完全由华为昇腾芯片支持。此外,基于华为昇腾的512卡集群训练出的模型,也可以无缝地在GPU上运行,并保持相同的视频质量。详细信息请参考我们的[hw branch](https://github.com/PKU-YuanGroup/Open-Sora-Plan/tree/hw). +Open-Sora-Plan是由北大技术团队推出的项目,旨在通过开源框架复现 OpenAI +Sora。作为基础开源框架,它支持视频生成模型的训练,包括无条件视频生成、类别视频生成和文本到视频生成。 +本仓库主要将Open-Sora-Plan多个任务迁移到了昇腾NPU上,并进行极致性能优化。 +## 支持任务列表 -### 221×512×512 Text-to-Video Generation +本仓已经支持以下模型任务类型 +| 模型 | 任务列表 | 是否支持 | +|:--------:|:----:|:----:| +| LatteT2V | 训练 | ✔ | +## 代码实现 -| 221×512×512 (9.2s) | 221×512×512 (9.2s) | 221×512×512 (9.2s) | -| --- | --- | --- | -| | | | -| 3D animation of a small, round, fluffy creature with big, expressive eyes explores ... | A single drop of liquid metal falls from a floating orb, landing on a mirror-like ... | The video presents an abstract composition centered around a hexagonal shape adorned ... | -| | | | -| A drone camera circles around a beautiful historic church built on a rocky outcropping ... | Aerial view of Santorini during the blue hour, showcasing the stunning architecture ... | An aerial shot of a lighthouse standing tall on a rocky cliff, its beacon cutting ... | -| | | | -| A snowy forest landscape with a dirt road running through it. The road is flanked by ... | Drone shot along the Hawaii jungle coastline, sunny day. Kayaks in the water. |The camera rotates around a large stack of vintage televisions all showing different ... | +- 参考实现: + ``` + url=https://github.com/PKU-YuanGroup/Open-Sora-Plan + commit_id=2a8b2328a5fcc0108fb5444b010f7e1ae0b4cb7b + ``` -### 65×512×512 Text-to-Video Generation +- 适配昇腾 AI 处理器的实现: -| 65×512×512 (2.7s) | 65×512×512 (2.7s) | 65×512×512 (2.7s) | -| --- | --- | --- | -| | | | -| In an ornate, historical hall, a massive tidal wave peaks and begins to crash. Two ... | A Shiba Inu dog wearing a beret and black turtleneck. | A painting of a boat on water comes to life, with waves crashing and the boat becoming ... | -|| | | -| A person clad in a space suit with a helmet and equipped with a chest light and arm ... | 3D animation of a small, round, fluffy creature with big, expressive eyes explores a ... | In a studio, there is a painting depicting a ship sailing through the rough sea. | -| | | | -| A robot dog trots down a deserted alley at night, its metallic paws clinking softly ... | A lone surfer rides a massive wave, skillfully maneuvering through the surf. The water ... | A solitary cheetah sprints across the savannah, its powerful muscles propelling it ... 
| + ``` + url=https://gitee.com/ascend/ModelZoo-PyTorch.git + code_path=PyTorch/built-in/mlm/ + ``` -### 65×512×512 Video Editing +# 准备训练环境 -| generated 65×512×512 (2.7s) | edited 65×512×512 (2.7s) | -| --- | --- | -| | | -| | | -| | | +## 安装模型环境 -### 512×512 Text-to-Image Generation +**表 1** 三方库版本支持表 - +| 三方库 | 支持版本 | + |:------------:|:------:| +| PyTorch | 2.1.0 | +| diffusers | 0.27.2 | +| accelerate | 0.28.0 | +| deepspeed | 0.12.6 | +| transformers | 4.39.1 | +| decord | 0.6.0 | +在模型根目录下执行以下命令,安装模型对应PyTorch版本需要的依赖。 + ```shell + pip install -e . # 安装本地OpenSoraPlan代码仓 + ``` +注: 模型依赖decord需编译安装,根据原仓安装https://github.com/dmlc/decord -## 📰 News - -**[2024.05.27]** 🚀🚀🚀 We are launching Open-Sora Plan v1.1.0, which significantly improves video quality and length, and is fully open source! Please check out our latest [report](docs/Report-v1.1.0.md). - -**[2024.04.09]** 🚀 Excited to share our latest exploration on metamorphic time-lapse video generation: [MagicTime](https://github.com/PKU-YuanGroup/MagicTime), which learns real-world physics knowledge from time-lapse videos. Here is the dataset for train (updating): [Open-Sora-Dataset](https://github.com/PKU-YuanGroup/Open-Sora-Dataset). - -**[2024.04.07]** 🔥🔥🔥 Today, we are thrilled to present Open-Sora-Plan v1.0.0, which significantly enhances video generation quality and text control capabilities. See our [report](docs/Report-v1.0.0.md). Thanks to HUAWEI NPU for supporting us. - -**[2024.03.27]** 🚀🚀🚀 We release the report of [VideoCausalVAE](docs/CausalVideoVAE.md), which supports both images and videos. We present our reconstructed video in this demonstration as follows. The text-to-video model is on the way. - -
-View more - -**[2024.03.10]** 🚀🚀🚀 This repo supports training a latent size of 225×90×90 (t×h×w), which means we are able to **train 1 minute of 1080P video with 30FPS** (2× interpolated frames and 2× super resolution) under class-condition. - -**[2024.03.08]** We support the training code of text condition with 16 frames of 512x512. The code is mainly borrowed from [Latte](https://github.com/Vchitect/Latte). - -**[2024.03.07]** We support training with 128 frames (when sample rate = 3, which is about 13 seconds) of 256x256, or 64 frames (which is about 6 seconds) of 512x512. - -**[2024.03.05]** See our latest [todo](https://github.com/PKU-YuanGroup/Open-Sora-Plan?tab=readme-ov-file#todo), pull requests are welcome. - -**[2024.03.04]** We re-organize and modulize our code to make it easy to [contribute](https://github.com/PKU-YuanGroup/Open-Sora-Plan?tab=readme-ov-file#how-to-contribute-to-the-open-sora-plan-community) to the project, to contribute please see the [Repo structure](https://github.com/PKU-YuanGroup/Open-Sora-Plan?tab=readme-ov-file#repo-structure). - -**[2024.03.03]** We open some [discussions](https://github.com/PKU-YuanGroup/Open-Sora-Plan/discussions) to clarify several issues. - -**[2024.03.01]** Training code is available now! Learn more on our [project page](https://pku-yuangroup.github.io/Open-Sora-Plan/). Please feel free to watch 👀 this repository for the latest updates. - -
- -## 💪 Goal -This project aims to create a simple and scalable repo, to reproduce [Sora](https://openai.com/sora) (OpenAI, but we prefer to call it "ClosedAI" ). We wish the open-source community can contribute to this project. Pull requests are welcome!!! - -本项目希望通过开源社区的力量复现Sora,由北大-兔展AIGC联合实验室共同发起,当前版本离目标差距仍然较大,仍需持续完善和快速迭代,欢迎Pull request!!! - -Project stages: -- Primary -1. Setup the codebase and train an un-conditional model on a landscape dataset. -2. Train models that boost resolution and duration. - -- Extensions -3. Conduct text2video experiments on landscape dataset. -4. Train the 1080p model on video2text dataset. -5. Control model with more conditions. - - -
- - -
- - -
-✊ Todo - -#### Setup the codebase and train an unconditional model on landscape dataset -- [x] Fix typos & Update readme. 🤝 Thanks to [@mio2333](https://github.com/mio2333), [@CreamyLong](https://github.com/CreamyLong), [@chg0901](https://github.com/chg0901), [@Nyx-177](https://github.com/Nyx-177), [@HowardLi1984](https://github.com/HowardLi1984), [@sennnnn](https://github.com/sennnnn), [@Jason-fan20](https://github.com/Jason-fan20) -- [x] Setup environment. 🤝 Thanks to [@nameless1117](https://github.com/nameless1117) -- [ ] Add docker file. ⌛ [WIP] 🤝 Thanks to [@Mon-ius](https://github.com/Mon-ius), [@SimonLeeGit](https://github.com/SimonLeeGit) -- [ ] Enable type hints for functions. 🤝 Thanks to [@RuslanPeresy](https://github.com/RuslanPeresy), 🙏 **[Need your contribution]** -- [x] Resume from checkpoint. -- [x] Add Video-VQVAE model, which is borrowed from [VideoGPT](https://github.com/wilson1yan/VideoGPT). -- [x] Support variable aspect ratios, resolutions, durations training on [DiT](https://github.com/facebookresearch/DiT). -- [x] Support Dynamic mask input inspired by [FiT](https://github.com/whlzy/FiT). -- [x] Add class-conditioning on embeddings. -- [x] Incorporating [Latte](https://github.com/Vchitect/Latte) as main codebase. -- [x] Add VAE model, which is borrowed from [Stable Diffusion](https://github.com/CompVis/latent-diffusion). -- [x] Joint dynamic mask input with VAE. -- [ ] Add VQVAE from [VQGAN](https://github.com/CompVis/taming-transformers). 🙏 **[Need your contribution]** -- [ ] Make the codebase ready for the cluster training. Add SLURM scripts. 🙏 **[Need your contribution]** -- [x] Refactor VideoGPT. 🤝 Thanks to [@qqingzheng](https://github.com/qqingzheng), [@luo3300612](https://github.com/luo3300612), [@sennnnn](https://github.com/sennnnn) -- [x] Add sampling script. -- [ ] Add DDP sampling script. ⌛ [WIP] -- [x] Use accelerate on multi-node. 🤝 Thanks to [@sysuyy](https://github.com/sysuyy) -- [x] Incorporate [SiT](https://github.com/willisma/SiT). 🤝 Thanks to [@khan-yin](https://github.com/khan-yin) -- [x] Add evaluation scripts (FVD, CLIP score). 🤝 Thanks to [@rain305f](https://github.com/rain305f) - -#### Train models that boost resolution and duration -- [x] Add [PI](https://arxiv.org/abs/2306.15595) to support out-of-domain size. 🤝 Thanks to [@jpthu17](https://github.com/jpthu17) -- [x] Add 2D RoPE to improve generalization ability as [FiT](https://github.com/whlzy/FiT). 🤝 Thanks to [@jpthu17](https://github.com/jpthu17) -- [x] Compress KV according to [PixArt-sigma](https://pixart-alpha.github.io/PixArt-sigma-project). -- [x] Support deepspeed for videogpt training. 🤝 Thanks to [@sennnnn](https://github.com/sennnnn) -- [x] Train a **low dimension** Video-AE, whether it is VAE or VQVAE. -- [x] Extract offline feature. -- [x] Train with offline feature. -- [x] Add frame interpolation model. 🤝 Thanks to [@yunyangge](https://github.com/yunyangge) -- [x] Add super resolution model. 🤝 Thanks to [@Linzy19](https://github.com/Linzy19) -- [x] Add accelerate to automatically manage training. -- [x] Joint training with images. -- [ ] Implement [MaskDiT](https://github.com/Anima-Lab/MaskDiT) technique for fast training. 🙏 **[Need your contribution]** -- [ ] Incorporate [NaViT](https://arxiv.org/abs/2307.06304). 🙏 **[Need your contribution]** -- [ ] Add [FreeNoise](https://github.com/arthur-qiu/FreeNoise-LaVie) support for training-free longer video generation. 🙏 **[Need your contribution]** - -#### Conduct text2video experiments on landscape dataset. 
-- [x] Load pretrained weights from [Latte](https://github.com/Vchitect/Latte). -- [ ] Implement [PeRFlow](https://github.com/magic-research/piecewise-rectified-flow) for improving the sampling process. 🙏 **[Need your contribution]** -- [x] Finish data loading, pre-processing utils. -- [x] Add T5 support. -- [x] Add CLIP support. 🤝 Thanks to [@Ytimed2020](https://github.com/Ytimed2020) -- [x] Add text2image training script. -- [ ] Add prompt captioner. - - [ ] Collect training data. - - [ ] Need video-text pairs with caption. 🙏 **[Need your contribution]** - - [ ] Extract multi-frame descriptions by large image-language models. 🤝 Thanks to [@HowardLi1984](https://github.com/HowardLi1984) - - [ ] Extract video description by large video-language models. 🙏 **[Need your contribution]** - - [ ] Integrate captions to get a dense caption by using a large language model, such as GPT-4. 🤝 Thanks to [@HowardLi1984](https://github.com/HowardLi1984) - - [ ] Train a captioner to refine captions. 🚀 **[Require more computation]** - -#### Train the 1080p model on video2text dataset -- [ ] Looking for a suitable dataset, welcome to discuss and recommend. 🙏 **[Need your contribution]** -- [ ] Add synthetic video created by game engines or 3D representations. 🙏 **[Need your contribution]** -- [x] Finish data loading, and pre-processing utils. -- [x] Support memory friendly training. - - [x] Add flash-attention2 from pytorch. - - [x] Add xformers. 🤝 Thanks to [@jialin-zhao](https://github.com/jialin-zhao) - - [x] Support mixed precision training. - - [x] Add gradient checkpoint. - - [x] Support for ReBased and Ring attention. 🤝 Thanks to [@kabachuha](https://github.com/kabachuha) - - [x] Train using the deepspeed engine. 🤝 Thanks to [@sennnnn](https://github.com/sennnnn) -- [ ] Train with a text condition. Here we could conduct different experiments: 🚀 **[Require more computation]** - - [x] Train with T5 conditioning. - - [ ] Train with CLIP conditioning. - - [ ] Train with CLIP + T5 conditioning (probably costly during training and experiments). -- [ ] Support Chinese. ⌛ [WIP] - -#### Control model with more condition -- [ ] Incorporating [ControlNet](https://github.com/lllyasviel/ControlNet). ⌛ [WIP] 🙏 **[Need your contribution]** -- [ ] Incorporating [ReVideo](https://github.com/MC-E/ReVideo). ⌛ [WIP] - -
- -## 📂 Repo structure (WIP) -``` -├── README.md -├── docs -│ ├── Data.md -> Datasets description. -│ ├── Contribution_Guidelines.md -> Contribution guidelines description. -├── scripts -> All scripts. -├── opensora -│   ├── dataset -│   ├── models -│   │   ├── ae -> Compress videos to latents -│   │   │   ├── imagebase -│   │   │   │   ├── vae -│   │   │   │   └── vqvae -│   │   │   └── videobase -│   │   │   ├── vae -│   │   │   └── vqvae -│   │   ├── captioner -│   │   ├── diffusion -> Denoise latents -│   │   │   ├── diffusion -│   │   │   ├── dit -│   │   │   ├── latte -│   │   │   └── unet -│   │   ├── frame_interpolation -│   │   ├── super_resolution -│   │   └── text_encoder -│   ├── sample -│   ├── train -> Training code -│   └── utils -``` - -## 🛠️ Requirements and Installation - -1. Clone this repository and navigate to Open-Sora-Plan folder -``` -git clone https://github.com/PKU-YuanGroup/Open-Sora-Plan -cd Open-Sora-Plan -``` -2. Install required packages -``` -conda create -n opensora python=3.8 -y -conda activate opensora -pip install -e . -``` -3. Install additional packages for training cases -``` -pip install -e ".[train]" -pip install flash-attn --no-build-isolation -``` -4. Install optional requirements such as static type checking: -``` -pip install -e '.[dev]' -``` - -## 🗝️ Usage - - -### 🤗 Demo - -#### Gradio Web UI - -Highly recommend trying out our web demo by the following command. We also provide [online demo](https://huggingface.co/spaces/LanguageBind/Open-Sora-Plan-v1.1.0) [![hf_space](https://img.shields.io/badge/🤗-Open%20In%20Spaces-blue.svg)](https://huggingface.co/spaces/LanguageBind/Open-Sora-Plan-v1.1.0). - -
-v1.0.0 - -Highly recommend trying out our web demo by the following command. We also provide [online demo](https://huggingface.co/spaces/LanguageBind/Open-Sora-Plan-v1.0.0) [![hf_space](https://img.shields.io/badge/🤗-Open%20In%20Spaces-blue.svg)](https://huggingface.co/spaces/LanguageBind/Open-Sora-Plan-v1.0.0) and [![hf_space](https://img.shields.io/badge/🤗-Open%20In%20Spaces-blue.svg)](https://huggingface.co/spaces/fffiloni/Open-Sora-Plan-v1-0-0) in Huggingface Spaces. - -🤝 Enjoying the [![Replicate demo and cloud API](https://replicate.com/camenduru/open-sora-plan-512x512/badge)](https://replicate.com/camenduru/open-sora-plan-512x512) and [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/camenduru/Open-Sora-Plan-jupyter/blob/main/Open_Sora_Plan_jupyter.ipynb), created by [@camenduru](https://github.com/camenduru), who generously supports our research! - -
- -```bash -python -m opensora.serve.gradio_web_server -``` - -#### CLI Inference - -```bash -sh scripts/text_condition/sample_video.sh -``` - -### Datasets -Refer to [Data.md](docs/Data.md) - -### Evaluation -Refer to the document [EVAL.md](docs/EVAL.md). - -### CausalVideoVAE - -#### Reconstructing - -Example: - -```Python -python examples/rec_imvi_vae.py --video_path test_video.mp4 --rec_path output_video.mp4 --fps 24 --resolution 512 --crop_size 512 --num_frames 128 --sample_rate 1 --ae CausalVAEModel_4x8x8 --model_path pretrained_488_release --enable_tiling --enable_time_chunk -``` +## 安装昇腾环境 -Parameter explanation: +请参考昇腾社区中《[Pytorch框架训练环境准备](https://www.hiascend.com/document/detail/zh/ModelZoo/pytorchframework/ptes) +》文档搭建昇腾环境,本仓已支持表2中软件版本。 -- `--enable_tiling`: This parameter is a flag to enable a tiling conv. +**表 2** 昇腾软件版本支持表 -#### Training and Eval +| 软件类型 | 支持版本 | + |:------------------:|:----:| +| FrameworkPTAdapter | 在研版本 | +| CANN | 在研版本 | +| 昇腾NPU固件 | 在研版本 | +| 昇腾NPU驱动 | 在研版本 | + +# LatteT2V + +## 训练数据集准备 + +用户需自行获取并解压mixkit2数据集,以及对应帧数的标注json,放置到`OpenSoraPlan1.1/dataset`目录下。 +数据和标注可以从huggingface的LanguageBind/Open-Sora-Plan-v1.1.0数据集文件的all_mixkit和anno_jsons中获取。 + +数据结构如下: + + ``` + OpenSoraPlan1.1 + ├── dataset + ├── mixkit2 + ├── Airplane + ├── Baby + ├── ... + └── video_mixkit_65f_54735.json + ``` + +## 准备预训练模型 -Please refer to the document [CausalVideoVAE](docs/Train_And_Eval_CausalVideoVAE.md). +1. 联网情况下,预训练模型会自动下载。 -### VideoGPT VQVAE +2. 无网络时,用户可访问huggingface官网自行下载,文件namespace如下: -Please refer to the document [VQVAE](docs/VQVAE.md). + ``` + DeepFloyd/t5-v1_1-xxl # t5模型 + LanguageBind/Open-Sora-Plan-v1.1.0 # 预训练权重(含3D VAE模型和LatteT2V模型) + ``` -### Video Diffusion Transformer +3. 将下载好的预训练模型放在本工程目录下,组织结构如下: + ``` + OpenSoraPlan1.1 + ├── DeepFloyd + │ ├── t5-v1_1-xxl + │ │ ├── config.json + │ │ ├── pytorch_model-00001-of-00002.bin + │ │ ├── ... + │ LanguageBind + │ ├── Open-Sora-Plan-v1.1.0 + │ │ ├── 221x512x512 + │ │ ├── 65x512x512 + │ │ └── vae + ``` -#### Training -``` -sh scripts/text_condition/train_videoae_65x512x512.sh -``` -``` -sh scripts/text_condition/train_videoae_221x512x512.sh -``` -``` -sh scripts/text_condition/train_videoae_513x512x512.sh -``` +## 快速开始 - +2. 运行预训练脚本。 -## 💡 How to Contribute to the Open-Sora Plan Community -We greatly appreciate your contributions to the Open-Sora Plan open-source community and helping us make it even better than it is now! 
+ 该模型支持单机8卡训练。 -For more details, please refer to the [Contribution Guidelines](docs/Contribution_Guidelines.md) + - 单机8卡训练 + ```shell + bash scripts/text_condition/train_videoae_65x512x512.sh # 8卡训练,混精bf16 + ``` + - 模型训练python训练脚本参数说明如下。 + ```shell + --config_file scripts/accelerate_configs/deepspeed_zero2_config.yaml \ // deepspeed配置文件 + opensora/train/train_t2v.py \ // 训练启动脚本 + --model LatteT2V-XL/122 \ // 训练模型 + --text_encoder_name DeepFloyd/t5-v1_1-xxl \ // 文本编码器 + --cache_dir "./cache_dir" \ // 下载缓存目录 + --dataset t2v \ // 数据集类型 + --ae CausalVAEModel_4x8x8 \ // 图片/视频预训练模型 + --ae_path "LanguageBind/Open-Sora-Plan-v1.1.0/vae" \ // vae预训练文件路径 + --video_data "scripts/train_data/video_data.txt" \ // 视频数据路径文件 + --use_img_from_vid \ // 训练图片来自视频 + --sample_rate 1 \ // 采样率 + --num_frames 65 \ // 训练帧数 + --max_image_size 512 \ // 图像/视频最大尺寸 + --gradient_checkpointing \ // 是否重计算 + --attention_mode math \ // attention的类型 + --train_batch_size=2 \ // 训练的批大小 + --dataloader_num_workers 4 \ // 数据处理线程数 + --gradient_accumulation_steps=1 \ // 梯度累计步数 + --max_train_steps=1000000 \ // 最大训练步数 + --learning_rate=2e-05 \ // 学习率 + --lr_scheduler="constant" \ // 学习率调度策略 + --lr_warmup_steps=0 \ // 学习率预热步数 + --mixed_precision="bf16" \ // 混精训练的数据类型 + --report_to="tensorboard" \ // 记录方式 + --checkpointing_steps=500 \ // 检查点步数 + --output_dir="65x512x512_10node_bs2_lr2e-5_16img" \ // 输出的路径 + --allow_tf32 \ // 使用tf32训练 + --use_deepspeed \ // 使用deepspeed训练 + --model_max_length 300 \ // 文本最大长度 + --use_image_num 16 \ // 训练使用图片的数量 + --enable_tiling \ // 启用平铺 + --pretrained LanguageBind/Open-Sora-Plan-v1.1.0/65x512x512/diffusion_pytorch_model.safetensors // 预训练模型 + ``` + +#### 性能展示 +##### 性能 + +| 芯片 | 卡数 | 单步迭代时间(s/step) | batch_size | AMP_Type | Torch_Version | +|:--------:|:--:|:--------------:|:----------:|:--------:|:-------------:| +| GPU | 8p | 9.19 | 2 | bf16 | 2.1 | +| Atlas A2 | 8p | 9.66 | 2 | bf16 | 2.1 | -## 👍 Acknowledgement -* [Latte](https://github.com/Vchitect/Latte): The **main codebase** we built upon and it is an wonderful video generated model. -* [PixArt-alpha](https://github.com/PixArt-alpha/PixArt-alpha): Fast Training of Diffusion Transformer for Photorealistic Text-to-Image Synthesis. -* [ShareGPT4Video](https://github.com/InternLM/InternLM-XComposer/tree/main/projects/ShareGPT4Video): Improving Video Understanding and Generation with Better Captions. -* [VideoGPT](https://github.com/wilson1yan/VideoGPT): Video Generation using VQ-VAE and Transformers. -* [DiT](https://github.com/facebookresearch/DiT): Scalable Diffusion Models with Transformers. -* [FiT](https://github.com/whlzy/FiT): Flexible Vision Transformer for Diffusion Model. -* [Positional Interpolation](https://arxiv.org/abs/2306.15595): Extending Context Window of Large Language Models via Positional Interpolation. +# 公网地址说明 +代码涉及公网地址参考 public_address_statement.md -## 🔒 License -* See [LICENSE](LICENSE) for details. +# 变更说明 - +2024.06.20: LatteT2V bf16训练任务首次发布 + +# FAQ -## ✏️ Citing -### BibTeX - -```bibtex -@software{pku_yuan_lab_and_tuzhan_ai_etc_2024_10948109, - author = {PKU-Yuan Lab and Tuzhan AI etc.}, - title = {Open-Sora-Plan}, - month = apr, - year = 2024, - publisher = {GitHub}, - doi = {10.5281/zenodo.10948109}, - url = {https://doi.org/10.5281/zenodo.10948109} -} -``` -### Latest DOI - -[![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.10948109.svg)](https://zenodo.org/records/10948109) - -## 🤝 Community contributors - - - - -- Gitee