diff --git a/PyTorch/built-in/mlm/PLLaVA/DATA.md b/PyTorch/built-in/mlm/PLLaVA/DATA.md deleted file mode 100644 index cf7783763fa362e2c8e57fe78fbf354c3261485d..0000000000000000000000000000000000000000 --- a/PyTorch/built-in/mlm/PLLaVA/DATA.md +++ /dev/null @@ -1,124 +0,0 @@ -# Data -## Instruction Training Data - - - -For training, we leveraged the video instruction tuning data from [Videochat2](https://github.com/OpenGVLab/Ask-Anything/tree/main/video_chat2). - -#### 1. Download json annotation files from huggingface. -[![Dataset meta](https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-VideoChat2%20IT-blue)](https://huggingface.co/datasets/OpenGVLab/VideoChat2-IT) - - - -#### 2. Download the raw videos from the following links. -The video directories can be found in tasks/train/instruction_data.py. You can also change them to your own saved paths. - -- [VideoChat](https://github.com/OpenGVLab/InternVideo/tree/main/Data/instruction_data): Based on [InternVid](https://github.com/OpenGVLab/InternVideo/tree/main/Data/InternVid), download the processed version directly [here](https://pjlab-gvm-data.oss-cn-shanghai.aliyuncs.com/videochat2/data/videochat2_conversation_videos.zip) -- [VideoChatGPT](https://github.com/mbzuai-oryx/Video-ChatGPT/tree/main/data) -- [Kinetics-710](https://github.com/OpenGVLab/UniFormerV2/blob/main/DATASET.md), download Kinetics 400/600/700 [here](https://openxlab.org.cn/datasets?keywords=kinetics). -- [SthSthV2](https://developer.qualcomm.com/software/ai-datasets/something-something): Option candidates were generated from [UMT](https://github.com/OpenGVLab/unmasked_teacher) top-20 predictions. -- [NExTQA](https://github.com/doc-doc/NExT-QA) -- [CLEVRER](https://clevrer.csail.mit.edu/) -- [WebVid](https://maxbain.com/webvid-dataset/) -- [YouCook2](https://youcook2.eecs.umich.edu/), download the processed version [here](https://pjlab-gvm-data.oss-cn-shanghai.aliyuncs.com/videochat2/data/youcook_split_videos.zip). -- [TextVR](https://github.com/callsys/textvr) -- [TGIF](https://github.com/YunseokJANG/tgif-qa) -- [EgoQA](https://ego4d-data.org/), download the processed version [here](https://pjlab-gvm-data.oss-cn-shanghai.aliyuncs.com/videochat2/data/egoqa_split_videos.zip). - -#### 3. We also provide our processed json annotation files here. - -[![Dataset meta](https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-magic%5Fjsons-blue)](https://huggingface.co/datasets/cathyxl/magic_jsons) - - - - -## Evaluation Data & Others -Follow this section to obtain the evaluation open resources. - -### VCGBench - -We refer to the VideoChatGPT video question answering evaluation as VCGBench in this repo. We followed the original [repo](https://github.com/mbzuai-oryx/Video-ChatGPT/tree/main) to prepare the evaluation data. - -### MVBench -We follow the original [Videochat2 repo](https://github.com/OpenGVLab/Ask-Anything/tree/main/video_chat2) in setting up the MVBench Evaluation. You can also find helpful resources at their [huggingface repo](https://huggingface.co/datasets/OpenGVLab/MVBench) - - -### Videoqabench -We refer to all other video question answering benchmarks as videoqabench in this repo. They are mainly prepared folloing the original repos. Each listed: -1. [MSVD](https://www.cs.utexas.edu/users/ml/clamp/videoDescription/) & [MSRVTT](https://github.com/xudejing/video-question-answering) - -3. [Activity Net](https://github.com/MILVLG/activitynet-qa/tree/master) -4. 
[TGIF](https://github.com/raingo/TGIF-Release/tree/master) - -Also other fantastic repo intergrating these benchmarks are helpful in the process of setting up the evaluation data: -- [VideoChatGPT](https://github.com/mbzuai-oryx/Video-ChatGPT/tree/main) -- [VideoLlava](https://github.com/PKU-YuanGroup/Video-LLaVA/tree/main/videollava) -- [IG-VLM](https://github.com/imagegridworth/IG-VLM/tree/main) - - - -### Recaptioning -#### Inter4k - -This is a dataset with 1000 samples of high resolution videos. We prepare the data folloing the instructions from their [official website](https://alexandrosstergiou.github.io/datasets/Inter4K/index.html) - -#### Extending Reacptioning -The recaptioning part is designed to be extendable. - -inference script [tasks/eval/recaption/pllava_recaption.py](tasks/eval/recaption/pllava_recaption.py) would use a dataset class [RecaptionDataset](tasks/eval/recaption/__init__.py#L197). The detailed information is kept in the data_list_info attribute as: -``` -data_list_info = OrderedDict({ - # "Panda70M": OrderedDict( - # json_relpath="Panda70M/annotations.json", - # prefix="DATAS/Recaption/Panda70M/videos", - # data_type="video", - # bound=False, - # key_rename_map={ - # # 'caption': 'hint', - # }, - # name_key='video_name', - # postfix=('mp4', 'mkv', 'webm'), - # recaption_type=RecaptionSample, - # ), # don't has start & end - "Inter4K": OrderedDict( - json_relpath="Inter4K/annotations.json", - prefix="DATAS/Recaption/Inter4K/60fps/UHD", - data_type="video", - bound=False, - key_rename_map={ - # 'caption': 'hint', - }, - name_key='video_name', - postfix=('mp4', 'mkv', 'webm'), - recaption_type=CaptionSample, - ), # don't has start & end - }) -``` -It contains the path to a annotation json file where there is a list and each item of the list is a sample waiting for captioning. For example, the Inter4K/annotations.json is like: -```json -[ - { - "video_name": "973" - }, - ... -] -``` -and the directory DATAS/Recaption/Inter4K/60fps/UHD would look like: -``` -$ ls DATAS/Recaption/Inter4K/60fps/UHD -1.mp4 134.mp4 170.mp4 .... -``` - -Naively, only the video is needed when captioning directly, therefore the annotation file only needs to contain the names of each video under the "prefix" directory. - -Extending a dataset for captioning would consist of the folloing steps: -1. have all the videos downloaded -2. construct a annotation.json file with sepecific format. -3. configure the recaption dataset [here](tasks/eval/recaption/__init__.py#L197), where you would need to determine: - - json_relpath: the annotation relative path - - prefix: root directory for videos - - postfix: a list containing all the file extensions for these videos - -The other options are experimental, so stick with the default setting as in Inter4k. The recommended length of video is around 5-20 seconds. - -p.s. "bound" is to make sure the video pass to the model doesn't have scene transition or so. This part wasn't tested, so set the bound to false and make sure the original videos files are single clip of a video. But always feel free to discover and contribute to PLLaVA! \ No newline at end of file diff --git a/PyTorch/built-in/mlm/PLLaVA/README.md b/PyTorch/built-in/mlm/PLLaVA/README.md index b07bc8e8c546c10c547edffc0bcdc6bd9da6f13d..a856ec3fc38e9b84fdcec70cdd3bb048559f4356 100644 --- a/PyTorch/built-in/mlm/PLLaVA/README.md +++ b/PyTorch/built-in/mlm/PLLaVA/README.md @@ -1,372 +1,148 @@ -
-
-
-PLLaVA: Parameter-free LLaVA Extension from Images to Videos for Video Dense Captioning
-
+# PLLaVA for PyTorch +# 目录 + +- [简介](#简介) + - [模型介绍](#模型介绍) + - [支持任务列表](#支持任务列表) + - [代码实现](#代码实现) +- [准备训练环境](#准备训练环境) + - [安装模型环境](#安装模型环境) + - [安装昇腾环境](#安装昇腾环境) + - [准备数据集](#准备数据集) + - [获取预训练模型](#获取预训练模型) +- [快速开始](#快速开始) + - [模型训练](#模型训练) + - [结果展示](#结果展示) + - [模型推理](#模型推理) +- [公网地址说明](#公网地址说明) +- [变更说明](#变更说明) +- [FAQ](#faq) + -[Lin Xu](https://scholar.google.com/citations?user=_Gu69coAAAAJ), [Yilin Zhao](https://ermu2001.github.io/me.io/), [Daquan Zhou](https://scholar.google.com/citations?user=DdCAbWwAAAAJ), [Zhijie Lin](https://scholar.google.com/citations?user=xXMj6_EAAAAJ), [See-Kiong Ng](https://scholar.google.com/citations?user=_wsommYAAAAJ), [Jiashi Feng](https://scholar.google.com.sg/citations?user=Q8iay0gAAAAJ&hl=en) -
+# 简介 - +## 模型介绍 -**Project Page: [PLLaVA](https://pllava.github.io/)** +PLLaVA是一种新颖的端到端训练的大型多模态模型,它结合了视觉编码器和Vicuna,用于通用的视觉和语言理解,实现了令人印象深刻的聊天能力,在科学问答(Science QA)上达到了新的高度。 -[![arXiv](https://img.shields.io/badge/arXiv-2404.16994-b31b1b.svg)](https://arxiv.org/abs/2404.16994) -[![YouTube Video](https://img.shields.io/badge/YouTube-Video-red)](https://www.youtube.com/watch?v=nAEje8tu18U) -[![Model on HF](https://huggingface.co/datasets/huggingface/badges/resolve/main/model-on-hf-sm-dark.svg)](https://huggingface.co/ermu2001/pllava-34b) +## 支持任务列表 +本仓已经支持以下模型任务类型: -[![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/pllava-parameter-free-llava-extension-from-1/zeroshot-video-question-answer-on-activitynet)](https://paperswithcode.com/sota/zeroshot-video-question-answer-on-activitynet?p=pllava-parameter-free-llava-extension-from-1) -[![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/pllava-parameter-free-llava-extension-from-1/zeroshot-video-question-answer-on-msrvtt-qa)](https://paperswithcode.com/sota/zeroshot-video-question-answer-on-msrvtt-qa?p=pllava-parameter-free-llava-extension-from-1) -[![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/pllava-parameter-free-llava-extension-from-1/zeroshot-video-question-answer-on-msvd-qa)](https://paperswithcode.com/sota/zeroshot-video-question-answer-on-msvd-qa?p=pllava-parameter-free-llava-extension-from-1) -[![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/pllava-parameter-free-llava-extension-from-1/video-question-answering-on-mvbench)](https://paperswithcode.com/sota/video-question-answering-on-mvbench?p=pllava-parameter-free-llava-extension-from-1) -[![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/pllava-parameter-free-llava-extension-from-1/zeroshot-video-question-answer-on-tgif-qa)](https://paperswithcode.com/sota/zeroshot-video-question-answer-on-tgif-qa?p=pllava-parameter-free-llava-extension-from-1) -[![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/pllava-parameter-free-llava-extension-from-1/video-based-generative-performance-4)](https://paperswithcode.com/sota/video-based-generative-performance-4?p=pllava-parameter-free-llava-extension-from-1) -[![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/pllava-parameter-free-llava-extension-from-1/video-based-generative-performance-3)](https://paperswithcode.com/sota/video-based-generative-performance-3?p=pllava-parameter-free-llava-extension-from-1) -[![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/pllava-parameter-free-llava-extension-from-1/video-based-generative-performance)](https://paperswithcode.com/sota/video-based-generative-performance?p=pllava-parameter-free-llava-extension-from-1) -[![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/pllava-parameter-free-llava-extension-from-1/video-based-generative-performance-2)](https://paperswithcode.com/sota/video-based-generative-performance-2?p=pllava-parameter-free-llava-extension-from-1) -[![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/pllava-parameter-free-llava-extension-from-1/video-based-generative-performance-1)](https://paperswithcode.com/sota/video-based-generative-performance-1?p=pllava-parameter-free-llava-extension-from-1) 
-[![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/pllava-parameter-free-llava-extension-from-1/video-based-generative-performance-5)](https://paperswithcode.com/sota/video-based-generative-performance-5?p=pllava-parameter-free-llava-extension-from-1) +| 模型 | 任务列表 | 是否支持 | +|:------------:|:----:|:-----:| +| LLaVA 1.6 7B | 训练 | ✔ | +| LLaVA 1.6 7B | 推理 | ✔ | +## 代码实现 +- 参考实现: + ``` + url=https://github.com/magic-research/PLLaVA + commit_id=6f49fd2 + ``` +- 适配昇腾AI处理器的实现: + ```shell + url=https://gitee.com/ascend/ModelZoo-PyTorch.git + code_path=PyTorch/built-in/mlm/PLLaVA + ``` +# 准备训练环境 +## 安装模型环境 +- 下载代码: + ```shell + git clone https://gitee.com/ascend/ModelZoo-PyTorch.git + cd PyTorch/built-in/mlm/PLLaVA + ``` -![]() -
- - - -
+- 创建Python环境并且安装Python三方包:
+  ```shell
+  conda create -n pllava python=3.10 -y
+  conda activate pllava
+  pip install --upgrade pip # enable PEP 660 support
+  pip3 install torch==2.1.0+cpu --index-url https://download.pytorch.org/whl/cpu # For X86
+  pip3 install torch==2.1.0 # For Aarch64
+  pip install -r requirements.txt
+  ```
+
+## 安装昇腾环境
-
-
-
+ 请参考昇腾社区中《[Pytorch框架训练环境准备](https://www.hiascend.com/document/detail/zh/ModelZoo/pytorchframework/ptes)》文档搭建昇腾环境,本仓已支持表4中软件版本。 + + + **表 4** 昇腾软件版本支持表 + | 软件类型 | 支持版本 | + | :--------: |:--------:| + | FrameworkPTAdapter | 在研版本 | + | CANN | 在研版本 | + | 昇腾NPU固件 | 在研版本 | + | 昇腾NPU驱动 | 在研版本 | +## 准备数据集 +- json文件下载路径参考: (https://huggingface.co/datasets/OpenGVLab/VideoChat2-IT)。 +- 视频文件下载参考:(https://github.com/magic-research/PLLaVA/blob/main/README.md 中的数据准备章节)。 +- 数据集结构如下所示: + ``` + dataset/VideoChat2-IT/video/reasoning/clever_qa + ├── train.json + + dataset/video_all + ├── xxx.mp4 +- 在训练脚本中(train_pllava_single_npu.sh(单卡)、 train_pllava_multi_npu.sh(单机多卡)、train_pllava_npu_multi_node.sh(多机多卡))通过指定train_corpus的value,在 tasks/train/instruction_data.py中获取具体的json路径和视频路径。 -## Overview +## 获取预训练模型 -Welcome to PLLAVA! +- 联网情况下,预训练模型会自动下载。 -The primary purpose of this repository is to support research and the development of prototype models. It is designed to facilitate ease of experimentation and enable a clear overview of results. Please note that this repo is currently undergoing development and reconstruction. +- 无网络时,用户可访问huggingface官网自行下载,文件namespace如下: + 参考 https://github.com/magic-research/PLLaVA/blob/main/README.md 中的模型下载准备章节。 + 在训练脚本中,需要指定模型存储的绝对路径。 + +# 快速开始 -It's important to mention that we have not optimized the response speed of the application or the frontend logic. Our goal is to maintain simplicity, clarity, and ease of development, making it accessible for both researchers and students. If you have suggestions or want to enhance the application's performance, please feel free to contact us or contribute to the project. +## 模型训练 +1. 训练脚本位置位于scripts目录,提供了train_pllava_single_npu.sh(单卡)、 train_pllava_multi_npu.sh(单机多卡)、train_pllava_npu_multi_node.sh(多机多卡)三个脚本。 需要根据真实值配置cann的set_env.sh路径、数据集路径、权重的路径。 -We've briefly introduce our work in section [PLLAVA](#%EF%B8%8F-pllava). For more details, feel free to read our paper. Check out section [Usage](#hammer-usage) to start using this repo. If you felt our works interesting, please star us, your support is all we want. If you find our work helpful, feel free to [cite](#page_facing_up-citation) us directly. +2. 运行训练脚本,下面以单机单卡示例: -## :fire: Updates + ```shell + bash scripts/train_pllava_single_npu.sh + ``` + 训练完成后,权重文件保存在参数`--output_dir`路径下。 +## 结果展示 -- **2024/4/24**: Release: - - We are releasing our code/models/datasets. +**表 2** 训练结果展示: -## 🏖️ PLLAVA -
- - - -
+| 芯片 | 卡数 | second per step | batch_size | AMP_Type | Torch_Version | +|:-------------:|:---:|:---------------:|:----------:|:---:|:---:| +| 竞品A | 8p | 0.84s | 1 | bf16 | 2.1 | +| Atlas 800T A2 | 8p | 0.85s | 1 | bf16 | 2.1 | -### Abstract +## 模型推理 +训练脚本位置位于scripts目录下,提供了eval_single.sh脚本,其中的cann的set_env.sh路径、视频文件路径、模型文件路径、权重文件路径等,按照实际填写。 -Vision-language pre-training (VLP) has significantly elevated performance across a range of vision-language applications. Yet, the pre-training process for video-related tasks demands an exceptionally high degree of computational and data resources. This paper investigates a straightforward, highly efficient, and resource-light approach to adapting an existing image-language pre-training model for video data. Our preliminary experiments reveal that directly fine-tuning pre-trained image-language models with multiple frames on video datasets leads to performance saturation or even a drop in caption-related tasks. Besides, it is also vulnerable to prompts and tends to provide short descriptions. We conducted a deep analysis and observed that the performance saturation and the vulnerability might be related to the dominant patches that exist in some single video patches. We then propose a simple pooling strategy to smooth the feature distribution along the temporal dimension and thus reduce the dominant impacts from some extreme tokens. The new model is termed Pooling LLaVA, or PLLaVA in short. With the proposed pooling strategy, we achieve new state-of-the-art performance on all evaluated datasets. Notably, on the recent popular Video ChatGPT benchmark, PLLaVA achieves a score of 3.48 out of 5 on average of five evaluated dimensions, which is the new state-of-the-art score on the leaderboard and is 0.31 higher than the previous SOTA results from GPT4V (IG-VLM). On the latest multi-choice benchmark MVBench, PLLaVA achieves 58.1% accuracy on average across 20 sub-tasks, which is the new state-of-the-art result and is 14.5% higher than GPT4V (IG-VLM). + ``` + bash scripts/eval_single.sh + ``` +脚本执行中,会让用户输入问题,再根据问题返回答案。 -
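+下面给出一个极简的示意代码（非仓库的完整实现，仅作说明）：推理相关代码（如 tasks/eval/videoqabench/pllava_eval_single.py）通过导入 torch_npu 与 transfer_to_npu，把原有的 CUDA 调用自动迁移到 NPU。示例中的张量形状仅为举例。
+
+```python
+# 示意：验证 NPU 可用，并确认 "cuda" 调用被 transfer_to_npu 重定向到 NPU
+import torch
+import torch_npu                               # 安装昇腾环境后提供
+from torch_npu.contrib import transfer_to_npu  # 导入后，CUDA 相关调用会被迁移到 NPU
+
+print(torch.npu.is_available())     # True 表示 NPU 驱动与固件正常
+x = torch.randn(2, 3).to("cuda")    # 实际会被放到 NPU 上
+print(x.device)                     # 预期输出形如 npu:0
+```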
+
+# 公网地址说明
+代码涉及公网地址参考 [public_address_statement.md](public_address_statement.md)
-
-### SEARCHING FOR OPTIMAL POOLING STRATEGY
-There are two dimensions for the pooling strategy: the spatial dimension and the temporal dimension. We empirically found that reducing the spatial dimension with a larger temporal dimension could lead to better model performance, compared to reducing the temporal dimension directly.
-
+
+# 变更说明
+2024.08.09: 首次发布。
-
-### STATE-OF-THE-ART PERFORMANCE
-We compare the performance of PLLAVA with recent popular methods over both question-answer and captioning datasets. The results are shown below.
-
- -## :hammer: Usage - -This section provides guidance on how to run, train, and evaluate our models. - -### Install -First, you will need to set up the environment and download some pre-trained weights. - -This repo is built up using [transformers](https://github.com/huggingface/transformers) for model construction along with [accelerate](https://github.com/huggingface/accelerate) for distributed training. Follow the instructions to install the needed environment. - -1. Above all, the following environment set up is for python 3.10. If you choose to use conda for environment setup, we recommend creating the virtual environment with: -```bash -conda create -n pllava python=3.10 -``` - -1. Firstly, install [pytorch](https://pytorch.org/) from the official website. The code runs on torch 2.2.1, cu118 or cu122. Select the version that suits your drive version. - -``` -torch 2.2.1+cu118 -torchaudio 2.2.1+cu118 -torchvision 0.17.1+cu118 -``` - -If your driver version is higher than cu121, you could probably try installing with the following scripts: -```bash -pip install -r requirements.txt -``` - -Otherwise, you would need to install a torch for your server first, then install the other packages: -```bash -pip install -r requirements.torch.txt # decide your own requirements, (this is for cu11), or install torch directly following the official website. -pip install -r requirements.no_torch.txt # install the following -``` - -1. Prepare the model. -We prefer to have huggingface models explicitly downloaded to a MODELS directory. However, if you are familiar with huggingface-hub usage, feel free to organize the model yourself. -``` -python python_scripts/hf.py -``` - -Here are some detailed information of the obtained models: - - -| Model | Link | Initialized From | -| ------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------- | --------------------------------------------------------------------------------------------------------------- | -| pllava-7b | [![Model on HF](https://huggingface.co/datasets/huggingface/badges/resolve/main/model-on-hf-sm-dark.svg)](https://huggingface.co/ermu2001/pllava-7b) | [llava-hf/llava-v1.6-vicuna-7b-hf · Hugging Face](https://huggingface.co/llava-hf/llava-v1.6-vicuna-7b-hf) | -| pllava-13b | [![Model on HF](https://huggingface.co/datasets/huggingface/badges/resolve/main/model-on-hf-sm-dark.svg)](https://huggingface.co/ermu2001/pllava-13b) | [llava-hf/llava-v1.6-vicuna-13b-hf · Hugging Face](https://huggingface.co/llava-hf/llava-v1.6-vicuna-13b-hf) | -| pllava-34b | [![Model on HF](https://huggingface.co/datasets/huggingface/badges/resolve/main/model-on-hf-sm-dark.svg)](https://huggingface.co/ermu2001/pllava-34b) | [llava-hf/llava-v1.6-34b-hf · Hugging Face](https://huggingface.co/llava-hf/llava-v1.6-34b-hf) | - -The model directory should look like this, where you would only need the corresponding model's weights and directory. 
- -``` -$ tree MODELS -MODELS -|-- pllava-13b -| |-- added_tokens.json -| |-- config.json -| |-- generation_config.json -| |-- model-00001-of-00006.safetensors -| |-- model-00002-of-00006.safetensors -| |-- model-00003-of-00006.safetensors -| |-- model-00004-of-00006.safetensors -| |-- model-00005-of-00006.safetensors -| |-- model-00006-of-00006.safetensors -| |-- model.safetensors.index.json -| |-- preprocessor_config.json -| |-- processor_config.json -| |-- special_tokens_map.json -| |-- tokenizer.json -| |-- tokenizer.model -| `-- tokenizer_config.json -|-- pllava-34b -| |-- added_tokens.json -| |-- config.json -| |-- generation_config.json -| |-- model-00001-of-00015.safetensors -| |-- model-00002-of-00015.safetensors -| |-- model-00003-of-00015.safetensors -| |-- model-00004-of-00015.safetensors -| |-- model-00005-of-00015.safetensors -| |-- model-00006-of-00015.safetensors -| |-- model-00007-of-00015.safetensors -| |-- model-00008-of-00015.safetensors -| |-- model-00009-of-00015.safetensors -| |-- model-00010-of-00015.safetensors -| |-- model-00011-of-00015.safetensors -| |-- model-00012-of-00015.safetensors -| |-- model-00013-of-00015.safetensors -| |-- model-00014-of-00015.safetensors -| |-- model-00015-of-00015.safetensors -| |-- model.safetensors-deprecated -| |-- model.safetensors.index.json -| |-- preprocessor_config.json -| |-- processor_config.json -| |-- special_tokens_map.json -| |-- tokenizer.json -| |-- tokenizer.model -| `-- tokenizer_config.json -|-- pllava-7b - |-- added_tokens.json - |-- config.json - |-- generation_config.json - |-- model-00001-of-00003.safetensors - |-- model-00002-of-00003.safetensors - |-- model-00003-of-00003.safetensors - |-- model.safetensors.index.json - |-- preprocessor_config.json - |-- processor_config.json - |-- special_tokens_map.json - |-- tokenizer.json - |-- tokenizer.model - `-- tokenizer_config.json -``` - -With the above steps, you should be able to proceed on with the following usages. - -### Run Application - -To run our models, make sure you have downloaded a model pretrained weights from the huggingface spaces. Then, run the following scripts with the corresponding path input. Since we are only training with lora and the projector, the model to be run are determined with: - -- **model_dir**: model directory, one with config.json as compatible with transformers. This refers to the base model's directory, for example "llava-hf/llava-v1.6-vicuna-7b-hf"/"ermu2001/pllava-7b"/"MODELS/pllava-7b". (default to: MODELS/plave-7b) -- **weights_dir**: your weights directory. could be the same as model_dir, but if you have a weights directory for the lora weights, you should set this weights_dir to that directory to load the lora weights. This directory should be local. Also, it would need to contain a config.json file within. (default to: ${model_dir}). - -```bash -model_dir="model directory" -weights_dir="weights directory" -bash scripts/demo.sh ${model_dir} ${weights_dir} -``` - -Now check out the application demo and try play with PLLAVA! - -### Train - -Follow the following steps to reproduce our results or train your own variant: - -#### 1. Data Preparation - -To train our model from a starting Image-aligned Vision LLM, you would need to download the data first. Our data set up is mainly based on the original Videochat2's training data. Check out [Instruction Data](./DATA.md) to prepare the instruction training data. 
Ideally, setting up a root data directory and alter the code [here](./tasks/train/instruction_data.py#L6) would accomodate the data for training most smoothly. - -#### 2. Start Training - -Now you're only a few step away from starting the training. Follow the instructions: - -##### Setup Accelerator - -Customize a accelerate training config. For example, a simple config using multiple gpus with no distribution strategy (only torch DDP) would look like: - -```yaml -compute_environment: LOCAL_MACHINE -debug: false -distributed_type: MULTI_GPU -downcast_bf16: 'no' -gpu_ids: all -machine_rank: 0 -main_training_function: main -mixed_precision: bf16 -num_machines: 1 -num_processes: 8 -rdzv_backend: static -same_network: true -tpu_env: [] -tpu_use_cluster: false -tpu_use_sudo: false -use_cpu: false -``` - -Check out out the [Accelerate](https://huggingface.co/docs/accelerate/index) documents for more details. - -##### Overwatch the training configuration - -Next, you should go over a basic training configuration of the training process in [here](tasks/train/config_pllava_nframe.py). Then passing this file as the first arg to the [training script](tasks/train/train_pllava_nframe_accel.py) would utilize every arguments in the file. You can customize some of the hyper parameters for your own training process by passing them in the format of "key" "value" pair in the following arguments. A example training scripts could be find [here](scripts/train_pllava.sh). - -We recommand customize a [configuration](tasks/train/config_pllava_nframe.py) to set up a customized training! - -With the above steps, you would be able to start the training process. The output would be well organized in the output directory, each a qualified model directory to pass in to demo as weights_dir, since we are only saveing the lora weights and projector weights to avoide redundancy. - -### Evaluation - -This section mainly introduce how to reproduce the evaluation or evaluate your own model. - -#### Set up Evaluation Data - -Make sure you set up the "DATAS" directory as in [DATA.md](DATA.md), then you would be able to run the inference with fortune! The evaluation data directory of DATAS would look like: - -``` -DATAS/: -DATAS/VideoQA: -DATAS/VideoQA/TGIF_QA: - test_a.json - test_q.json -DATAS/VideoQA/TGIF_QA/videos: - tumblr_m4387mGrlc1r6m5e8o1_250.gif - ... -DATAS/VideoQA/TGIF_QA/videos_mp4: - tumblr_m4387mGrlc1r6m5e8o1_250.mp4 - ... -DATAS/VideoQA/TGIF_QA/video_gif: - tumblr_m4387mGrlc1r6m5e8o1_250.gif - ... -DATAS/VideoQA/MSVD_Zero_Shot_QA: - test_a.json - test_q.json -DATAS/VideoQA/MSVD_Zero_Shot_QA/videos: - -4wsuPCjDBc_5_15.avi -DATAS/VideoQA/MSVD_Zero_Shot_QA/msvd_qa: -DATAS/VideoQA/ActivityNet: - test_a.json - test_q.json -DATAS/VideoQA/ActivityNet/all_test: - v_--tFD65KaK4.mp4 - ... -DATAS/VideoQA/MSRVTT_Zero_Shot_QA: - test_a.json - test_q.json -DATAS/VideoQA/MSRVTT_Zero_Shot_QA/videos: -DATAS/VideoQA/MSRVTT_Zero_Shot_QA/videos/all: - video0.mp4 - ... - -DATAS/MVBench: - ... - -DATAS/Recaption/Inter4K: - annotations.json -DATAS/Recaption/Inter4K/60fps: -DATAS/Recaption/Inter4K/60fps/UHD: - 1.mp4 - ... - -``` - -#### Start Evaluate - -Once you have construted the evaluation data, you can start the evaluation as in [here](scripts/eval.sh). This script is for evaluating 7B/13B models. As pllava-34b model uses a slightly different prompting, it is evaluated with this [script](scripts/eval_yiprompt.sh). 
- -``` -bash scripts/eval.sh -``` - -Same as running the demo, you would need to determine the model_dir and weights_dir to evaluate the model. Feel free to comment out some commands and produce partial evaluation. - -#### Overwatch the Results - -The evaluation results would be shown to you with our results gallery demo: - -```bash -bash scripts/gallery.sh -``` - -Feel free to use the compare version to compare differnt models' results or use the single gallery version to check out one model's results. They are basically the same. Check out this [script](scripts/gallery.sh) for more details - -#### For Captioning and Recaptioning -Follow instructions at [DATA.md](DATA.md#extending-reacptioning) and you can extend the recaptioning data with a few steps. - -Feel free to point out high quality dataset of videos, we would proceed on doing captioning on those datasets. - - -# :page_facing_up: Citation - -If you find this project useful in your research, please consider cite: - -```BibTeX -@misc{xu2024pllava, - title={PLLaVA : Parameter-free LLaVA Extension from Images to Videos for Video Dense Captioning}, - author={Lin Xu and Yilin Zhao and Daquan Zhou and Zhijie Lin and See Kiong Ng and Jiashi Feng}, - year={2024}, - eprint={2404.16994}, - archivePrefix={arXiv}, - primaryClass={cs.CV} -} -``` - -# :dizzy: Acknowledgement - -This code base is mainly built upon [Videochat2](https://github.com/OpenGVLab/Ask-Anything/tree/main/video_chat2). SALUTE. - -We would also like to recognize and commend the following open source projects, thank you for your great contribution to the open source community: - -- [LLaVA](https://github.com/haotian-liu/LLaVA): Fantastic Open Source Image LLM Model. -- [VideoChatGPT](https://github.com/mbzuai-oryx/Video-ChatGPT/tree/main): Great Evaluation Benchmarking Framework. -- [VideoLlava](https://github.com/PKU-YuanGroup/Video-LLaVA/tree/main/videollava):Video LLM repo with helpful resources. +# FAQ +无 diff --git a/PyTorch/built-in/mlm/PLLaVA/README_en.md b/PyTorch/built-in/mlm/PLLaVA/README_en.md new file mode 100644 index 0000000000000000000000000000000000000000..b07bc8e8c546c10c547edffc0bcdc6bd9da6f13d --- /dev/null +++ b/PyTorch/built-in/mlm/PLLaVA/README_en.md @@ -0,0 +1,372 @@ +
+
+
+PLLaVA: Parameter-free LLaVA Extension from Images to Videos for Video Dense Captioning
+
+ +[Lin Xu](https://scholar.google.com/citations?user=_Gu69coAAAAJ), [Yilin Zhao](https://ermu2001.github.io/me.io/), [Daquan Zhou](https://scholar.google.com/citations?user=DdCAbWwAAAAJ), [Zhijie Lin](https://scholar.google.com/citations?user=xXMj6_EAAAAJ), [See-Kiong Ng](https://scholar.google.com/citations?user=_wsommYAAAAJ), [Jiashi Feng](https://scholar.google.com.sg/citations?user=Q8iay0gAAAAJ&hl=en) + +
+ + + +**Project Page: [PLLaVA](https://pllava.github.io/)** + +[![arXiv](https://img.shields.io/badge/arXiv-2404.16994-b31b1b.svg)](https://arxiv.org/abs/2404.16994) +[![YouTube Video](https://img.shields.io/badge/YouTube-Video-red)](https://www.youtube.com/watch?v=nAEje8tu18U) +[![Model on HF](https://huggingface.co/datasets/huggingface/badges/resolve/main/model-on-hf-sm-dark.svg)](https://huggingface.co/ermu2001/pllava-34b) + +[![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/pllava-parameter-free-llava-extension-from-1/zeroshot-video-question-answer-on-activitynet)](https://paperswithcode.com/sota/zeroshot-video-question-answer-on-activitynet?p=pllava-parameter-free-llava-extension-from-1) +[![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/pllava-parameter-free-llava-extension-from-1/zeroshot-video-question-answer-on-msrvtt-qa)](https://paperswithcode.com/sota/zeroshot-video-question-answer-on-msrvtt-qa?p=pllava-parameter-free-llava-extension-from-1) +[![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/pllava-parameter-free-llava-extension-from-1/zeroshot-video-question-answer-on-msvd-qa)](https://paperswithcode.com/sota/zeroshot-video-question-answer-on-msvd-qa?p=pllava-parameter-free-llava-extension-from-1) +[![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/pllava-parameter-free-llava-extension-from-1/video-question-answering-on-mvbench)](https://paperswithcode.com/sota/video-question-answering-on-mvbench?p=pllava-parameter-free-llava-extension-from-1) +[![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/pllava-parameter-free-llava-extension-from-1/zeroshot-video-question-answer-on-tgif-qa)](https://paperswithcode.com/sota/zeroshot-video-question-answer-on-tgif-qa?p=pllava-parameter-free-llava-extension-from-1) +[![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/pllava-parameter-free-llava-extension-from-1/video-based-generative-performance-4)](https://paperswithcode.com/sota/video-based-generative-performance-4?p=pllava-parameter-free-llava-extension-from-1) +[![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/pllava-parameter-free-llava-extension-from-1/video-based-generative-performance-3)](https://paperswithcode.com/sota/video-based-generative-performance-3?p=pllava-parameter-free-llava-extension-from-1) +[![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/pllava-parameter-free-llava-extension-from-1/video-based-generative-performance)](https://paperswithcode.com/sota/video-based-generative-performance?p=pllava-parameter-free-llava-extension-from-1) +[![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/pllava-parameter-free-llava-extension-from-1/video-based-generative-performance-2)](https://paperswithcode.com/sota/video-based-generative-performance-2?p=pllava-parameter-free-llava-extension-from-1) +[![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/pllava-parameter-free-llava-extension-from-1/video-based-generative-performance-1)](https://paperswithcode.com/sota/video-based-generative-performance-1?p=pllava-parameter-free-llava-extension-from-1) 
+[![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/pllava-parameter-free-llava-extension-from-1/video-based-generative-performance-5)](https://paperswithcode.com/sota/video-based-generative-performance-5?p=pllava-parameter-free-llava-extension-from-1)
+
+
+ + + +
+ +
+
+ + + + + + +## Overview + +Welcome to PLLAVA! + +The primary purpose of this repository is to support research and the development of prototype models. It is designed to facilitate ease of experimentation and enable a clear overview of results. Please note that this repo is currently undergoing development and reconstruction. + +It's important to mention that we have not optimized the response speed of the application or the frontend logic. Our goal is to maintain simplicity, clarity, and ease of development, making it accessible for both researchers and students. If you have suggestions or want to enhance the application's performance, please feel free to contact us or contribute to the project. + + +We've briefly introduce our work in section [PLLAVA](#%EF%B8%8F-pllava). For more details, feel free to read our paper. Check out section [Usage](#hammer-usage) to start using this repo. If you felt our works interesting, please star us, your support is all we want. If you find our work helpful, feel free to [cite](#page_facing_up-citation) us directly. + +## :fire: Updates + +- **2024/4/24**: Release: + - We are releasing our code/models/datasets. + +## 🏖️ PLLAVA +
+ + + +
+ + +### Abstract + +Vision-language pre-training (VLP) has significantly elevated performance across a range of vision-language applications. Yet, the pre-training process for video-related tasks demands an exceptionally high degree of computational and data resources. This paper investigates a straightforward, highly efficient, and resource-light approach to adapting an existing image-language pre-training model for video data. Our preliminary experiments reveal that directly fine-tuning pre-trained image-language models with multiple frames on video datasets leads to performance saturation or even a drop in caption-related tasks. Besides, it is also vulnerable to prompts and tends to provide short descriptions. We conducted a deep analysis and observed that the performance saturation and the vulnerability might be related to the dominant patches that exist in some single video patches. We then propose a simple pooling strategy to smooth the feature distribution along the temporal dimension and thus reduce the dominant impacts from some extreme tokens. The new model is termed Pooling LLaVA, or PLLaVA in short. With the proposed pooling strategy, we achieve new state-of-the-art performance on all evaluated datasets. Notably, on the recent popular Video ChatGPT benchmark, PLLaVA achieves a score of 3.48 out of 5 on average of five evaluated dimensions, which is the new state-of-the-art score on the leaderboard and is 0.31 higher than the previous SOTA results from GPT4V (IG-VLM). On the latest multi-choice benchmark MVBench, PLLaVA achieves 58.1% accuracy on average across 20 sub-tasks, which is the new state-of-the-art result and is 14.5% higher than GPT4V (IG-VLM). + +
+
+
+### SEARCHING FOR OPTIMAL POOLING STRATEGY
+There are two dimensions for the pooling strategy: the spatial dimension and the temporal dimension. We empirically found that reducing the spatial dimension with a larger temporal dimension could lead to better model performance, compared to reducing the temporal dimension directly.
+
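+Below is a minimal, self-contained sketch of this pooling idea (not the repository's exact implementation). It assumes 16 frames of patch features with a hypothetical hidden size of 1024 and a 24x24 patch grid, and applies adaptive average pooling over the (temporal, height, width) grid, mirroring the `model.pooling_method avg` and `model.pooling_shape (16,12,12)` arguments used in the training scripts.
+
+```python
+import torch
+import torch.nn as nn
+
+# Assumed, illustrative shapes: batch 1, 16 frames, 24x24 patches per frame, hidden size 1024.
+num_frames, height, width, dim = 16, 24, 24, 1024
+frame_features = torch.randn(1, num_frames, height * width, dim)  # (B, T, H*W, C)
+
+# Rearrange to (B, C, T, H, W) so pooling acts over the temporal and spatial grid.
+x = frame_features.view(1, num_frames, height, width, dim).permute(0, 4, 1, 2, 3)
+
+# Keep all 16 temporal slots but shrink each frame to 12x12, i.e. pooling_shape=(16, 12, 12).
+pooled = nn.AdaptiveAvgPool3d((16, 12, 12))(x)
+
+# Flatten back into a token sequence for the language model: (B, 16*12*12, C).
+pooled_tokens = pooled.permute(0, 2, 3, 4, 1).flatten(1, 3)
+print(pooled_tokens.shape)  # torch.Size([1, 2304, 1024])
+```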
+
+
+### STATE-OF-THE-ART PERFORMANCE
+We compare the performance of PLLAVA with recent popular methods over both question-answer and captioning datasets. The results are shown below.
+
+ +## :hammer: Usage + +This section provides guidance on how to run, train, and evaluate our models. + +### Install +First, you will need to set up the environment and download some pre-trained weights. + +This repo is built up using [transformers](https://github.com/huggingface/transformers) for model construction along with [accelerate](https://github.com/huggingface/accelerate) for distributed training. Follow the instructions to install the needed environment. + +1. Above all, the following environment set up is for python 3.10. If you choose to use conda for environment setup, we recommend creating the virtual environment with: +```bash +conda create -n pllava python=3.10 +``` + +1. Firstly, install [pytorch](https://pytorch.org/) from the official website. The code runs on torch 2.2.1, cu118 or cu122. Select the version that suits your drive version. + +``` +torch 2.2.1+cu118 +torchaudio 2.2.1+cu118 +torchvision 0.17.1+cu118 +``` + +If your driver version is higher than cu121, you could probably try installing with the following scripts: +```bash +pip install -r requirements.txt +``` + +Otherwise, you would need to install a torch for your server first, then install the other packages: +```bash +pip install -r requirements.torch.txt # decide your own requirements, (this is for cu11), or install torch directly following the official website. +pip install -r requirements.no_torch.txt # install the following +``` + +1. Prepare the model. +We prefer to have huggingface models explicitly downloaded to a MODELS directory. However, if you are familiar with huggingface-hub usage, feel free to organize the model yourself. +``` +python python_scripts/hf.py +``` + +Here are some detailed information of the obtained models: + + +| Model | Link | Initialized From | +| ------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------- | --------------------------------------------------------------------------------------------------------------- | +| pllava-7b | [![Model on HF](https://huggingface.co/datasets/huggingface/badges/resolve/main/model-on-hf-sm-dark.svg)](https://huggingface.co/ermu2001/pllava-7b) | [llava-hf/llava-v1.6-vicuna-7b-hf · Hugging Face](https://huggingface.co/llava-hf/llava-v1.6-vicuna-7b-hf) | +| pllava-13b | [![Model on HF](https://huggingface.co/datasets/huggingface/badges/resolve/main/model-on-hf-sm-dark.svg)](https://huggingface.co/ermu2001/pllava-13b) | [llava-hf/llava-v1.6-vicuna-13b-hf · Hugging Face](https://huggingface.co/llava-hf/llava-v1.6-vicuna-13b-hf) | +| pllava-34b | [![Model on HF](https://huggingface.co/datasets/huggingface/badges/resolve/main/model-on-hf-sm-dark.svg)](https://huggingface.co/ermu2001/pllava-34b) | [llava-hf/llava-v1.6-34b-hf · Hugging Face](https://huggingface.co/llava-hf/llava-v1.6-34b-hf) | + +The model directory should look like this, where you would only need the corresponding model's weights and directory. 
+ +``` +$ tree MODELS +MODELS +|-- pllava-13b +| |-- added_tokens.json +| |-- config.json +| |-- generation_config.json +| |-- model-00001-of-00006.safetensors +| |-- model-00002-of-00006.safetensors +| |-- model-00003-of-00006.safetensors +| |-- model-00004-of-00006.safetensors +| |-- model-00005-of-00006.safetensors +| |-- model-00006-of-00006.safetensors +| |-- model.safetensors.index.json +| |-- preprocessor_config.json +| |-- processor_config.json +| |-- special_tokens_map.json +| |-- tokenizer.json +| |-- tokenizer.model +| `-- tokenizer_config.json +|-- pllava-34b +| |-- added_tokens.json +| |-- config.json +| |-- generation_config.json +| |-- model-00001-of-00015.safetensors +| |-- model-00002-of-00015.safetensors +| |-- model-00003-of-00015.safetensors +| |-- model-00004-of-00015.safetensors +| |-- model-00005-of-00015.safetensors +| |-- model-00006-of-00015.safetensors +| |-- model-00007-of-00015.safetensors +| |-- model-00008-of-00015.safetensors +| |-- model-00009-of-00015.safetensors +| |-- model-00010-of-00015.safetensors +| |-- model-00011-of-00015.safetensors +| |-- model-00012-of-00015.safetensors +| |-- model-00013-of-00015.safetensors +| |-- model-00014-of-00015.safetensors +| |-- model-00015-of-00015.safetensors +| |-- model.safetensors-deprecated +| |-- model.safetensors.index.json +| |-- preprocessor_config.json +| |-- processor_config.json +| |-- special_tokens_map.json +| |-- tokenizer.json +| |-- tokenizer.model +| `-- tokenizer_config.json +|-- pllava-7b + |-- added_tokens.json + |-- config.json + |-- generation_config.json + |-- model-00001-of-00003.safetensors + |-- model-00002-of-00003.safetensors + |-- model-00003-of-00003.safetensors + |-- model.safetensors.index.json + |-- preprocessor_config.json + |-- processor_config.json + |-- special_tokens_map.json + |-- tokenizer.json + |-- tokenizer.model + `-- tokenizer_config.json +``` + +With the above steps, you should be able to proceed on with the following usages. + +### Run Application + +To run our models, make sure you have downloaded a model pretrained weights from the huggingface spaces. Then, run the following scripts with the corresponding path input. Since we are only training with lora and the projector, the model to be run are determined with: + +- **model_dir**: model directory, one with config.json as compatible with transformers. This refers to the base model's directory, for example "llava-hf/llava-v1.6-vicuna-7b-hf"/"ermu2001/pllava-7b"/"MODELS/pllava-7b". (default to: MODELS/plave-7b) +- **weights_dir**: your weights directory. could be the same as model_dir, but if you have a weights directory for the lora weights, you should set this weights_dir to that directory to load the lora weights. This directory should be local. Also, it would need to contain a config.json file within. (default to: ${model_dir}). + +```bash +model_dir="model directory" +weights_dir="weights directory" +bash scripts/demo.sh ${model_dir} ${weights_dir} +``` + +Now check out the application demo and try play with PLLAVA! + +### Train + +Follow the following steps to reproduce our results or train your own variant: + +#### 1. Data Preparation + +To train our model from a starting Image-aligned Vision LLM, you would need to download the data first. Our data set up is mainly based on the original Videochat2's training data. Check out [Instruction Data](./DATA.md) to prepare the instruction training data. 
Ideally, setting up a root data directory and alter the code [here](./tasks/train/instruction_data.py#L6) would accomodate the data for training most smoothly. + +#### 2. Start Training + +Now you're only a few step away from starting the training. Follow the instructions: + +##### Setup Accelerator + +Customize a accelerate training config. For example, a simple config using multiple gpus with no distribution strategy (only torch DDP) would look like: + +```yaml +compute_environment: LOCAL_MACHINE +debug: false +distributed_type: MULTI_GPU +downcast_bf16: 'no' +gpu_ids: all +machine_rank: 0 +main_training_function: main +mixed_precision: bf16 +num_machines: 1 +num_processes: 8 +rdzv_backend: static +same_network: true +tpu_env: [] +tpu_use_cluster: false +tpu_use_sudo: false +use_cpu: false +``` + +Check out out the [Accelerate](https://huggingface.co/docs/accelerate/index) documents for more details. + +##### Overwatch the training configuration + +Next, you should go over a basic training configuration of the training process in [here](tasks/train/config_pllava_nframe.py). Then passing this file as the first arg to the [training script](tasks/train/train_pllava_nframe_accel.py) would utilize every arguments in the file. You can customize some of the hyper parameters for your own training process by passing them in the format of "key" "value" pair in the following arguments. A example training scripts could be find [here](scripts/train_pllava.sh). + +We recommand customize a [configuration](tasks/train/config_pllava_nframe.py) to set up a customized training! + +With the above steps, you would be able to start the training process. The output would be well organized in the output directory, each a qualified model directory to pass in to demo as weights_dir, since we are only saveing the lora weights and projector weights to avoide redundancy. + +### Evaluation + +This section mainly introduce how to reproduce the evaluation or evaluate your own model. + +#### Set up Evaluation Data + +Make sure you set up the "DATAS" directory as in [DATA.md](DATA.md), then you would be able to run the inference with fortune! The evaluation data directory of DATAS would look like: + +``` +DATAS/: +DATAS/VideoQA: +DATAS/VideoQA/TGIF_QA: + test_a.json + test_q.json +DATAS/VideoQA/TGIF_QA/videos: + tumblr_m4387mGrlc1r6m5e8o1_250.gif + ... +DATAS/VideoQA/TGIF_QA/videos_mp4: + tumblr_m4387mGrlc1r6m5e8o1_250.mp4 + ... +DATAS/VideoQA/TGIF_QA/video_gif: + tumblr_m4387mGrlc1r6m5e8o1_250.gif + ... +DATAS/VideoQA/MSVD_Zero_Shot_QA: + test_a.json + test_q.json +DATAS/VideoQA/MSVD_Zero_Shot_QA/videos: + -4wsuPCjDBc_5_15.avi +DATAS/VideoQA/MSVD_Zero_Shot_QA/msvd_qa: +DATAS/VideoQA/ActivityNet: + test_a.json + test_q.json +DATAS/VideoQA/ActivityNet/all_test: + v_--tFD65KaK4.mp4 + ... +DATAS/VideoQA/MSRVTT_Zero_Shot_QA: + test_a.json + test_q.json +DATAS/VideoQA/MSRVTT_Zero_Shot_QA/videos: +DATAS/VideoQA/MSRVTT_Zero_Shot_QA/videos/all: + video0.mp4 + ... + +DATAS/MVBench: + ... + +DATAS/Recaption/Inter4K: + annotations.json +DATAS/Recaption/Inter4K/60fps: +DATAS/Recaption/Inter4K/60fps/UHD: + 1.mp4 + ... + +``` + +#### Start Evaluate + +Once you have construted the evaluation data, you can start the evaluation as in [here](scripts/eval.sh). This script is for evaluating 7B/13B models. As pllava-34b model uses a slightly different prompting, it is evaluated with this [script](scripts/eval_yiprompt.sh). 
+ +``` +bash scripts/eval.sh +``` + +Same as running the demo, you would need to determine the model_dir and weights_dir to evaluate the model. Feel free to comment out some commands and produce partial evaluation. + +#### Overwatch the Results + +The evaluation results would be shown to you with our results gallery demo: + +```bash +bash scripts/gallery.sh +``` + +Feel free to use the compare version to compare differnt models' results or use the single gallery version to check out one model's results. They are basically the same. Check out this [script](scripts/gallery.sh) for more details + +#### For Captioning and Recaptioning +Follow instructions at [DATA.md](DATA.md#extending-reacptioning) and you can extend the recaptioning data with a few steps. + +Feel free to point out high quality dataset of videos, we would proceed on doing captioning on those datasets. + + +# :page_facing_up: Citation + +If you find this project useful in your research, please consider cite: + +```BibTeX +@misc{xu2024pllava, + title={PLLaVA : Parameter-free LLaVA Extension from Images to Videos for Video Dense Captioning}, + author={Lin Xu and Yilin Zhao and Daquan Zhou and Zhijie Lin and See Kiong Ng and Jiashi Feng}, + year={2024}, + eprint={2404.16994}, + archivePrefix={arXiv}, + primaryClass={cs.CV} +} +``` + +# :dizzy: Acknowledgement + +This code base is mainly built upon [Videochat2](https://github.com/OpenGVLab/Ask-Anything/tree/main/video_chat2). SALUTE. + +We would also like to recognize and commend the following open source projects, thank you for your great contribution to the open source community: + +- [LLaVA](https://github.com/haotian-liu/LLaVA): Fantastic Open Source Image LLM Model. +- [VideoChatGPT](https://github.com/mbzuai-oryx/Video-ChatGPT/tree/main): Great Evaluation Benchmarking Framework. +- [VideoLlava](https://github.com/PKU-YuanGroup/Video-LLaVA/tree/main/videollava):Video LLM repo with helpful resources. diff --git a/PyTorch/built-in/mlm/PLLaVA/models/pllava/modeling_pllava.py b/PyTorch/built-in/mlm/PLLaVA/models/pllava/modeling_pllava.py index 60120aa7b9c1c45e14831723aced671dd22950f1..6e961db71e40923a96c8f68c0ac5b2a8ae6a4bed 100644 --- a/PyTorch/built-in/mlm/PLLaVA/models/pllava/modeling_pllava.py +++ b/PyTorch/built-in/mlm/PLLaVA/models/pllava/modeling_pllava.py @@ -1,20 +1,7 @@ -# coding=utf-8 -# Copyright 2023 the HuggingFace Inc. team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
+# Copyright 2024 Huawei Technologies Co., Ltd """ PyTorch Llava model.""" from dataclasses import dataclass -from typing import List, Optional, Tuple, Union +from typing import List, Optional, Tuple, Union, Dict import math import torch @@ -291,7 +278,7 @@ class PllavaForConditionalGeneration(PllavaPreTrainedModel): self.vision_tower = AutoModel.from_config(config.vision_config) self.multi_modal_projector = PllavaMultiModalProjector(config) self.vocab_size = config.vocab_size - self.language_model = AutoModelForCausalLM.from_config(config.text_config, torch_dtype=config.torch_dtype, attn_implementation="flash_attention_2") + self.language_model = AutoModelForCausalLM.from_config(config.text_config, torch_dtype=config.torch_dtype) self.pad_token_id = self.config.pad_token_id if self.config.pad_token_id is not None else self.config.text_config.pad_token_id assert self.pad_token_id is not None, 'provide the model with pad_token_id, this would be used to arranging new embedings' self.post_init() @@ -389,8 +376,9 @@ class PllavaForConditionalGeneration(PllavaPreTrainedModel): f"The input provided to the model are wrong. The number of image tokens is {torch.sum(special_image_token_mask)} while" f" the number of image given to the model is {num_images}. This prevents correct indexing and breaks batch generation." ) - - final_embedding[image_to_overwrite] = image_features.contiguous().reshape(-1, embed_dim).to(target_device) + final_embedding = final_embedding.to(dtype=torch.bfloat16) + final_embedding[image_to_overwrite] = image_features.contiguous().reshape(-1, embed_dim).to(target_device, dtype=torch.bfloat16) + final_embedding = final_embedding.to(dtype=torch.float16) final_attention_mask |= image_to_overwrite position_ids = (final_attention_mask.cumsum(-1) - 1).masked_fill_((final_attention_mask == 0), 1) @@ -399,6 +387,10 @@ class PllavaForConditionalGeneration(PllavaPreTrainedModel): return final_embedding, final_attention_mask, final_labels, position_ids + @property + def dummy_inputs(self) -> Dict[str, torch.Tensor]: + return super().dummy_inputs + @add_start_docstrings_to_model_forward(PLLAVA_INPUTS_DOCSTRING) @replace_return_docstrings(output_type=PllavaCausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC) def forward( diff --git a/PyTorch/built-in/mlm/PLLaVA/public_address_statement.md b/PyTorch/built-in/mlm/PLLaVA/public_address_statement.md new file mode 100644 index 0000000000000000000000000000000000000000..659095be2760ed204ef5a3d3231758880d499de6 --- /dev/null +++ b/PyTorch/built-in/mlm/PLLaVA/public_address_statement.md @@ -0,0 +1,10 @@ +| 类型 | 开源代码地址 | 文件名 | 公网IP地址/公网URL地址/域名/邮箱地址 | 用途说明 | +| ------- |-------------------------------------------------------------------------------------------------------------------------------------------------|-------------------------------------------------------------------------------------|------------------------|-----------------------------| +| 开源代码引入 | https://github.com/magic-research/PLLaVA/tree/main/utils/logger.py | pllava/utils/logger.py | https://github.com/facebookresearch/mmf/blob/master/mmf/utils/logger.py | 代码实现参考连接 | +| 开源代码引入 | https://github.com/magic-research/PLLaVA/tree/main/dataset/video_utils.py | pllava/dataset/video_utils.py | https://github.com/m-bain/frozen-in-time/blob/22a91d78405ec6032fdf521ae1ff5573358e632f/base/base_dataset.py | 代码实现参考连接 | +| 开源代码引入 | https://github.com/magic-research/PLLaVA/tree/main/dataset/video_utils.py | pllava/dataset/video_utils.py | 
https://github.com/facebookresearch/pytorchvideo/blob/main/pytorchvideo/data/utils.py#L54-L64 | 代码实现参考连接 | +| 开源代码引入 | https://github.com/magic-research/PLLaVA/tree/main/models/pllava/configuration_pllava.py | pllava/models/pllava/configuration_pllava.py | https://huggingface.co/llava-hf/llava-v1.5-7b/resolve/main/config.json | 数据集下载 | +| 开源代码引入 | https://github.com/magic-research/PLLaVA/tree/main/models/pllava/configuration_pllava.py | pllava/models/pllava/configuration_pllava.py | https://huggingface.co/llava-hf/llava-9b | 数据集下载 | +| 开源代码引入 | https://github.com/magic-research/PLLaVA/tree/main/models/pllava/modeling_pllava.py | pllava/models/pllava/modeling_pllava.py | https://pytorch.org/docs/stable/nn.html#torch.nn.Module | 代码实现参考连接 | +| 开源代码引入 | https://github.com/magic-research/PLLaVA/tree/main/models/pllava/modeling_pllava.py | pllava/models/pllava/modeling_pllava.py | https://arxiv.org/abs/1910.13461 | 代码实现参考连接 | +| 开源代码引入 | https://github.com/magic-research/PLLaVA/tree/main/utils/scheduler.py | pllava/utils/scheduler.py | https://github.com/huggingface/transformers/blob/v4.15.0/src/transformers/optimization.py | 代码实现参考连接 | \ No newline at end of file diff --git a/PyTorch/built-in/mlm/PLLaVA/requirements.txt b/PyTorch/built-in/mlm/PLLaVA/requirements.txt index cb2801ff1a8e2ba143cdf8acdcc2404aa90d8cef..6feac297633bfd29eb3a6b0970e9aaad456a2603 100644 --- a/PyTorch/built-in/mlm/PLLaVA/requirements.txt +++ b/PyTorch/built-in/mlm/PLLaVA/requirements.txt @@ -49,8 +49,6 @@ ffmpy==0.3.2 fiftyone==0.23.6 fiftyone-brain==0.16.1 fiftyone_db==1.1.2 -filelock==3.9.0 -https://github.com/Dao-AILab/flash-attention/releases/download/v2.5.6/flash_attn-2.5.6+cu118torch2.2cxx11abiFALSE-cp310-cp310-linux_x86_64.whl fonttools==4.49.0 fsspec==2024.2.0 ftfy==6.1.3 @@ -101,7 +99,6 @@ markdown-it-py==3.0.0 MarkupSafe==2.1.3 matplotlib==3.8.3 mdurl==0.1.2 -mmcv-full==1.7.2 model-index==0.1.11 mongoengine==0.24.2 motor==3.3.2 @@ -111,19 +108,6 @@ multivolumefile==0.2.3 networkx==3.2.1 ninja==1.11.1.1 numpy==1.23.5 -nvidia-cublas-cu11==11.11.3.6 -nvidia-cuda-cupti-cu11==11.8.87 -nvidia-cuda-nvrtc-cu11==11.8.89 -nvidia-cuda-runtime-cu11==11.8.89 -nvidia-cudnn-cu11==8.7.0.84 -nvidia-cufft-cu11==10.9.0.58 -nvidia-curand-cu11==10.3.0.86 -nvidia-cusolver-cu11==11.4.1.48 -nvidia-cusparse-cu11==11.7.5.86 -nvidia-ml-py==12.535.133 -nvidia-ml-py3==7.352.0 -nvidia-nccl-cu11==2.19.3 -nvidia-nvtx-cu11==11.8.86 oauthlib==3.2.2 omegaconf==2.3.0 openai==1.14.0 @@ -224,14 +208,12 @@ tokenizers==0.15.2 tomli==2.0.1 tomlkit==0.12.0 toolz==0.12.1 -torch==2.2.1 -torchaudio==2.2.1 -torchvision==0.17.1 +torchaudio==2.1.0 +torchvision==0.16.0 tqdm==4.65.2 transaction==4.0 -transformers==4.37.1 +transformers==4.42.4 translationstring==1.4 -triton==2.2.0 typer==0.9.0 typing_extensions==4.8.0 tzdata==2024.1 diff --git a/PyTorch/built-in/mlm/PLLaVA/scripts/accel_config_multigpu.yaml b/PyTorch/built-in/mlm/PLLaVA/scripts/accel_config_multigpu.yaml index dbe0dc7b6ade744eca906c95a06c018f21cac09f..930322faed5dea7fd3a974e1f125e1a13cfdaf83 100644 --- a/PyTorch/built-in/mlm/PLLaVA/scripts/accel_config_multigpu.yaml +++ b/PyTorch/built-in/mlm/PLLaVA/scripts/accel_config_multigpu.yaml @@ -2,12 +2,12 @@ compute_environment: LOCAL_MACHINE debug: false distributed_type: MULTI_GPU downcast_bf16: 'no' -gpu_ids: 2,3,4,5 +gpu_ids: 0,1,2,3,4,5,6,7 machine_rank: 0 main_training_function: main mixed_precision: bf16 num_machines: 1 -num_processes: 4 +num_processes: 8 rdzv_backend: static same_network: true tpu_env: [] diff --git 
a/PyTorch/built-in/mlm/PLLaVA/scripts/eval_single.sh b/PyTorch/built-in/mlm/PLLaVA/scripts/eval_single.sh new file mode 100644 index 0000000000000000000000000000000000000000..4aeaab90ea59f3625aaecdd21f2bd4ecf3b00bae --- /dev/null +++ b/PyTorch/built-in/mlm/PLLaVA/scripts/eval_single.sh @@ -0,0 +1,24 @@ +# export CUDA_VISIBLE_DEVICES=2,6,7 +source /path_to_cann/set_env.sh +export OPENAI_API_KEY=... +num_frames=16 +test_ratio=1 + +# 13b, uses offload thus saving the full model +model_dir=/path_to_model +weight_dir=/path_to_train_result +SAVE_DIR=test_results/test_pllava_13b +lora_alpha=4 +video_path=/path_to_PLLaVA/example/cooking.mp4 + +conv_mode=eval_videoqabench +python -m tasks.eval.videoqabench.pllava_eval_videoqabench \ + --pretrained_model_name_or_path ${model_dir} \ + --save_path ${SAVE_DIR}/videoqabench \ + --num_frames ${num_frames} \ + --use_lora \ + --lora_alpha ${lora_alpha} \ + --weight_dir ${weight_dir} \ + --test_ratio ${test_ratio} \ + --example_path ${video_path} \ + --eval_mode 1 diff --git a/PyTorch/built-in/mlm/PLLaVA/scripts/train_pllava_multi_npu.sh b/PyTorch/built-in/mlm/PLLaVA/scripts/train_pllava_multi_npu.sh new file mode 100644 index 0000000000000000000000000000000000000000..46f39e59a077658c414c2380f8eb7e290e6a9e80 --- /dev/null +++ b/PyTorch/built-in/mlm/PLLaVA/scripts/train_pllava_multi_npu.sh @@ -0,0 +1,35 @@ +source /path_to_cann/set_env.sh +echo "PYTHONPATH: ${PYTHONPATH}" +which_python=$(which python) +echo "which python: ${which_python}" +export PYTHONPATH=${PYTHONPATH}:${which_python} +export PYTHONPATH=${PYTHONPATH}:. +echo "PYTHONPATH: ${PYTHONPATH}" + +OUTPUT_DIR=./pllava_video_outputs/test_train_7b_reconstruct + +pooling_shape=(16,12,12) + +repo_id=/path_to_model/llava-hf/llava-v1.6-vicuna-7b-hf +accelerate launch --main_process_port 6876 --config_file scripts/accel_config_multigpu.yaml tasks/train/train_pllava_nframe_accel.py \ + tasks/train/config_pllava_nframe.py \ + output_dir ${OUTPUT_DIR} \ + train_corpus videochat2_instruction_debug \ + save_steps 10000 \ + num_workers 8 \ + num_frames 16 \ + model.pooling_method avg \ + model.use_lora True \ + model.repo_id $repo_id \ + model.pooling_shape $pooling_shape \ + optimizer.lr 2e-5 \ + scheduler.epochs 1 \ + scheduler.warmup_ratio 0.2 \ + scheduler.min_lr_multi 0.25 \ + scheduler.is_videochat2_custom True \ + preprocess.mm_alone False \ + preprocess.random_shuffle False \ + preprocess.add_second_msg False + + + diff --git a/PyTorch/built-in/mlm/PLLaVA/scripts/train_pllava_npu_multi_node.sh b/PyTorch/built-in/mlm/PLLaVA/scripts/train_pllava_npu_multi_node.sh new file mode 100644 index 0000000000000000000000000000000000000000..56d644787c7729001f481c9dbf345924fb74a12a --- /dev/null +++ b/PyTorch/built-in/mlm/PLLaVA/scripts/train_pllava_npu_multi_node.sh @@ -0,0 +1,35 @@ +source /path_to_cann/set_env.sh +echo "PYTHONPATH: ${PYTHONPATH}" +which_python=$(which python) +echo "which python: ${which_python}" +export PYTHONPATH=${PYTHONPATH}:${which_python} +export PYTHONPATH=${PYTHONPATH}:. 
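+# NOTE (assumption): scripts/accel_config_multinode.yaml is expected to set num_machines,
+# machine_rank and main_process_ip/main_process_port for each participating node; this
+# wrapper only prepares PYTHONPATH and launches accelerate with that config.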
+echo "PYTHONPATH: ${PYTHONPATH}" + +OUTPUT_DIR=./pllava_video_outputs/test_train_7b_reconstruct + +pooling_shape=(16,12,12) + +repo_id=/path_to_model/llava-hf/llava-v1.6-vicuna-7b-hf +accelerate launch --main_process_port 6876 --config_file scripts/accel_config_multinode.yaml tasks/train/train_pllava_nframe_accel.py \ + tasks/train/config_pllava_nframe.py \ + output_dir ${OUTPUT_DIR} \ + train_corpus videochat2_instruction_debug \ + save_steps 10000 \ + num_workers 8 \ + num_frames 16 \ + model.pooling_method avg \ + model.use_lora True \ + model.repo_id $repo_id \ + model.pooling_shape $pooling_shape \ + optimizer.lr 2e-5 \ + scheduler.epochs 1 \ + scheduler.warmup_ratio 0.2 \ + scheduler.min_lr_multi 0.25 \ + scheduler.is_videochat2_custom True \ + preprocess.mm_alone False \ + preprocess.random_shuffle False \ + preprocess.add_second_msg False + + + diff --git a/PyTorch/built-in/mlm/PLLaVA/scripts/train_pllava_single_npu.sh b/PyTorch/built-in/mlm/PLLaVA/scripts/train_pllava_single_npu.sh new file mode 100644 index 0000000000000000000000000000000000000000..47f442afecc46671a545b39690180ef3b0408a7e --- /dev/null +++ b/PyTorch/built-in/mlm/PLLaVA/scripts/train_pllava_single_npu.sh @@ -0,0 +1,35 @@ +source /path_to_cann/set_env.sh +echo "PYTHONPATH: ${PYTHONPATH}" +which_python=$(which python) +echo "which python: ${which_python}" +export PYTHONPATH=${PYTHONPATH}:${which_python} +export PYTHONPATH=${PYTHONPATH}:. +echo "PYTHONPATH: ${PYTHONPATH}" + +OUTPUT_DIR=./pllava_video_outputs/test_train_7b_reconstruct + +pooling_shape=(16,12,12) + +repo_id=/path_to_model/llava-hf/llava-v1.6-vicuna-7b-hf +accelerate launch --main_process_port 6876 --config_file scripts/accel_config_singlegpu.yaml tasks/train/train_pllava_nframe_accel.py \ + tasks/train/config_pllava_nframe.py \ + output_dir ${OUTPUT_DIR} \ + train_corpus videochat2_instruction_debug \ + save_steps 10000 \ + num_workers 8 \ + num_frames 16 \ + model.pooling_method avg \ + model.use_lora True \ + model.repo_id $repo_id \ + model.pooling_shape $pooling_shape \ + optimizer.lr 2e-5 \ + scheduler.epochs 1 \ + scheduler.warmup_ratio 0.2 \ + scheduler.min_lr_multi 0.25 \ + scheduler.is_videochat2_custom True \ + preprocess.mm_alone False \ + preprocess.random_shuffle False \ + preprocess.add_second_msg False + + + diff --git a/PyTorch/built-in/mlm/PLLaVA/tasks/eval/videoqabench/pllava_eval_single.py b/PyTorch/built-in/mlm/PLLaVA/tasks/eval/videoqabench/pllava_eval_single.py new file mode 100644 index 0000000000000000000000000000000000000000..59872424304392be192e476cb0ebe26c4c6a3ad6 --- /dev/null +++ b/PyTorch/built-in/mlm/PLLaVA/tasks/eval/videoqabench/pllava_eval_single.py @@ -0,0 +1,192 @@ +# Copyright 2024 Huawei Technologies Co., Ltd +import torch_npu +from torch_npu.contrib import transfer_to_npu +import functools +import itertools +import logging +from tqdm import tqdm +from PIL import Image +from multiprocessing import Pool +from argparse import ArgumentParser +import multiprocessing as mp + +import numpy as np +import torch + +import torchvision + +import transformers +from decord import VideoReader, cpu + +from tasks.eval.model_utils import load_pllava, pllava_answer +from tasks.eval.eval_utils import conv_templates + +logging.basicConfig() +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) + + +IMAGE_TOKEN='' +from tasks.eval.videoqabench import ( + VideoQABenchDataset, + load_results, + save_results, +) +RESOLUTION = 672 # +VIDEOQA_DATASETS=["MSVD_QA","MSRVTT_QA", "ActivityNet","TGIF_QA"] +def parse_args(): + 
parser = ArgumentParser() + parser.add_argument( + "--pretrained_model_name_or_path", + type=str, + required=True, + default='llava-hf/llava-1.5-7b-hf' + ) + parser.add_argument( + "--save_path", + type=str, + required=True, + default='"./test_results/test_llava_mvbench"' + ) + parser.add_argument( + "--num_frames", + type=int, + required=True, + default=4, + ) + parser.add_argument( + "--use_lora", + action='store_true' + ) + parser.add_argument( + "--lora_alpha", + type=int, + required=False, + default=32, + ) + parser.add_argument( + "--max_new_tokens", + type=int, + required=False, + default=100, + ) + parser.add_argument( + "--weight_dir", + type=str, + required=False, + default=None, + ) + parser.add_argument( + "--eval_model", + type=str, + required=False, + default="gpt-3.5-turbo-0125", + ) + parser.add_argument( + '--test_ratio', + type=float, + required=False, + default=1 + ) + parser.add_argument( + "--conv_mode", + type=str, + required=False, + default='eval_videoqabench', + ) + parser.add_argument( + "--test_datasets", + type=str, + required=False, + default='MSVD_QA', + ) + parser.add_argument( + "--example_path", + type=str, + required=True, + default='/path_to_video_file', + ) + parser.add_argument( + "--eval_mode", + type=str, + required=True, + default=1, + ) + args = parser.parse_args() + return args + +def load_model_and_dataset(rank, world_size, pretrained_model_name_or_path, num_frames, use_lora, lora_alpha, weight_dir, test_ratio): + # remind that, once the model goes larger (30B+) may cause the memory to be heavily used up. Even Tearing Nodes. + model, processor = load_pllava(pretrained_model_name_or_path, num_frames=num_frames, use_lora=use_lora, lora_alpha=lora_alpha, weight_dir=weight_dir) + logger.info('done loading llava') + # position embedding + model = model.to(torch.device(rank)) + model = model.eval() + return model, processor + + +def single_test(model, processor, vid_path, num_frames=4, conv_mode="plain", eval_mode=1): + def get_index(num_frames, num_segments): + seg_size = float(num_frames - 1) / num_segments + start = int(seg_size / 2) + offsets = np.array([ + start + int(np.round(seg_size * idx)) for idx in range(num_segments) + ]) + return offsets + + def load_video(video_path, num_segments=8, return_msg=False, num_frames=4, resolution=336): + transforms = torchvision.transforms.Resize(size=resolution) + vr = VideoReader(video_path, ctx=cpu(0), num_threads=1) + num_frames = len(vr) + frame_indices = get_index(num_frames, num_segments) + images_group = list() + for frame_index in frame_indices: + img = Image.fromarray(vr[frame_index].asnumpy()) + images_group.append(transforms(img)) + if return_msg: + fps = float(vr.get_avg_fps()) + sec = ", ".join([str(round(f / fps, 1)) for f in frame_indices]) + # " " should be added in the start and end + msg = f"The video contains {len(frame_indices)} frames sampled at {sec} seconds." 
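+            # NB: this timestamp message is informational only; single_test() below does not
+            # append it to the prompt.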
+ return images_group, msg + else: + return images_group + + if num_frames != 0: + vid, msg = load_video(vid_path, num_segments=num_frames, return_msg=True, resolution=RESOLUTION) + else: + vid, msg = None, 'num_frames is 0, not inputing image' + img_list = vid + conv = conv_templates[conv_mode].copy() + if eval_mode == 1: + query_question = input("question input:") + conv.user_query(query_question, is_mm=True) + else: + conv.user_query("Describe the video in details.", is_mm=True) + llm_response, conv = pllava_answer(conv=conv, model=model, processor=processor, do_sample=False, img_list=img_list, max_new_tokens=256, print_res=True) + +def main(): + multiprocess=True + mp.set_start_method('spawn',force=True) + args = parse_args() + save_path = args.save_path + eval_model = args.eval_model + logger.info(f'trying loading results from {save_path}') + result_list = load_results(save_path) + vid_path = args.example_path + n_gpus = torch.cuda.device_count() + world_size = n_gpus + model, processor = load_model_and_dataset(0, + world_size, + pretrained_model_name_or_path=args.pretrained_model_name_or_path, + num_frames=args.num_frames, + use_lora=args.use_lora, + lora_alpha=args.lora_alpha, + weight_dir=args.weight_dir, + test_ratio=args.test_ratio, + ) + single_test(model, processor, vid_path, num_frames=args.num_frames, conv_mode=args.conv_mode) + logger.info('single test done...') + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/PyTorch/built-in/mlm/PLLaVA/tasks/train/config_pllava_nframe.py b/PyTorch/built-in/mlm/PLLaVA/tasks/train/config_pllava_nframe.py index b80ac33155504b002ca182cb3fef4f556ef655a3..5629d0d3aa254a503907796cb2e251249da63b72 100644 --- a/PyTorch/built-in/mlm/PLLaVA/tasks/train/config_pllava_nframe.py +++ b/PyTorch/built-in/mlm/PLLaVA/tasks/train/config_pllava_nframe.py @@ -1,3 +1,4 @@ +# Copyright 2024 Huawei Technologies Co., Ltd from tasks.train.instruction_data import * # ========================= data ========================== @@ -126,7 +127,7 @@ mode = "it" output_dir = None # output dir resume = False # if True, load optimizer and scheduler states as well debug = False -log_freq = 5 +log_freq = 1 metric_window_size=10 # window size for metric seed = 42 report_to='tensorboard' diff --git a/PyTorch/built-in/mlm/PLLaVA/tasks/train/llama_npu_monkey_patch.py b/PyTorch/built-in/mlm/PLLaVA/tasks/train/llama_npu_monkey_patch.py new file mode 100644 index 0000000000000000000000000000000000000000..0420fe0a695349e658f119aa330449a59250cb55 --- /dev/null +++ b/PyTorch/built-in/mlm/PLLaVA/tasks/train/llama_npu_monkey_patch.py @@ -0,0 +1,256 @@ +# Copyright 2024 Huawei Technologies Co., Ltd +import warnings +import math +from typing import Optional, Tuple + +import torch +import transformers.models.llama.modeling_llama +import torch.nn.functional as F +from transformers.cache_utils import Cache, DynamicCache +from transformers.models.llama.modeling_llama import LlamaRotaryEmbedding +from torch import nn, Tensor +from einops import rearrange +import torch_npu + + +class FlashLlamaRotaryEmbedding(nn.Module): + """LlamaRotaryEmbedding extended with linear scaling. 
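+    The cos/sin caches are registered with shape [1, seq_len, 1, head_dim] so that, in the
+    fused path, they broadcast directly over BSND query/key tensors handed to
+    torch_npu.npu_rotary_mul (see apply_fused_rotary_pos_emb below).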
Credits to the Reddit user /u/kaiokendev""" + + def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None): + super().__init__() + self.dim = dim + self.max_position_embeddings = max_position_embeddings + self.base = base + inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim)) + self.register_buffer("inv_freq", inv_freq, persistent=False) + self._set_cos_sin_cache( + seq_len=max_position_embeddings, device=self.inv_freq.device, dtype=torch.get_default_dtype() + ) + + def _set_cos_sin_cache(self, seq_len, device, dtype): + self.max_seq_len_cached = seq_len + t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype) + + freqs = torch.einsum("i,j->ij", t, self.inv_freq) + + # Different from paper, but it uses a different permutation in order to obtain the same calculation + emb = torch.cat((freqs, freqs), dim=-1) + self.register_buffer("cos_cached", emb.cos()[None, :, None, :].to(dtype), persistent=False) + self.register_buffer("sin_cached", emb.sin()[None, :, None, :].to(dtype), persistent=False) + + def forward(self, x, seq_len=None): + # x: [bs, num_attention_heads, seq_len, head_size] + if seq_len > self.max_seq_len_cached: + self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype) + return ( + self.cos_cached[:, :seq_len, :, ...].to(dtype=x.dtype), + self.sin_cached[:, :seq_len, :, ...].to(dtype=x.dtype), + ) + +def rotate_half(x): + """Rotates half the hidden dims of the input.""" + x1 = x[..., : x.shape[-1] // 2] + x2 = x[..., x.shape[-1] // 2:] + return torch.cat((-x2, x1), dim=-1) + +def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1): + """Applies Rotary Position Embedding to the query and key tensors. + + Args: + q (`torch.Tensor`): The query tensor. + k (`torch.Tensor`): The key tensor. + cos (`torch.Tensor`): The cosine part of the rotary embedding. + sin (`torch.Tensor`): The sine part of the rotary embedding. + position_ids (`torch.Tensor`): + The position indices of the tokens corresponding to the query and key tensors. For example, this can be + used to pass offsetted position ids when working with a KV-cache. + unsqueeze_dim (`int`, *optional*, defaults to 1): + The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and + sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note + that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and + k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes + cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have + the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2. + Returns: + `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding. 
+ """ + cos = cos[position_ids].unsqueeze(unsqueeze_dim) + sin = sin[position_ids].unsqueeze(unsqueeze_dim) + q_embed = (q * cos) + (rotate_half(q) * sin) + k_embed = (k * cos) + (rotate_half(k) * sin) + return q_embed, k_embed + +def apply_fused_rotary_pos_emb(q, k, cos, sin, position_ids): + q_embed = torch_npu.npu_rotary_mul(q, cos, sin) + k_embed = torch_npu.npu_rotary_mul(k, cos, sin) + return q_embed, k_embed + +def _init_rope(self): + self.rotary_emb = FlashLlamaRotaryEmbedding(self.head_dim, max_position_embeddings=self.max_position_embeddings) + +def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor: + """ + This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch, + num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim) + """ + batch, num_key_value_heads, slen, head_dim = hidden_states.shape + if n_rep == 1: + return hidden_states + hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim) + return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim) + +def attention_forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Cache] = None, + output_attentions: bool = False, + use_cache: bool = False, + **kwargs, +) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + if "padding_mask" in kwargs: + warnings.warn( + "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`" + ) + + bsz, q_len, _ = hidden_states.size() + + if self.config.pretraining_tp > 1: + key_value_slicing = (self.num_key_value_heads * self.head_dim) // self.config.pretraining_tp + query_slices = self.q_proj.weight.split( + (self.num_heads * self.head_dim) // self.config.pretraining_tp, dim=0 + ) + key_slices = self.k_proj.weight.split(key_value_slicing, dim=0) + value_slices = self.v_proj.weight.split(key_value_slicing, dim=0) + + query_states = [F.linear(hidden_states, query_slices[i]) for i in range(self.config.pretraining_tp)] + query_states = torch.cat(query_states, dim=-1) + + key_states = [F.linear(hidden_states, key_slices[i]) for i in range(self.config.pretraining_tp)] + key_states = torch.cat(key_states, dim=-1) + + value_states = [F.linear(hidden_states, value_slices[i]) for i in range(self.config.pretraining_tp)] + value_states = torch.cat(value_states, dim=-1) + + else: + query_states = self.q_proj(hidden_states) + key_states = self.k_proj(hidden_states) + value_states = self.v_proj(hidden_states) + # use torch_npu flash attention + if not use_cache and query_states.dtype in (torch.float16, torch.bfloat16): + + query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim) # BSND + key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim) # BSND + value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim) # BSND + + kv_seq_len = key_states.shape[1] + + cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) + query_states, key_states = apply_fused_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) + + q, k, v = [rearrange(x, 'b s h d -> b s (h d)').contiguous() for x in + (query_states, key_states, value_states)] # BSH + scale = 1 / math.sqrt(self.head_dim) + + attention_mask_shape = attention_mask.shape + if attention_mask_shape[0] == 1: + 
attention_mask = attention_mask.view((attention_mask_shape[-2], attention_mask_shape[-1])) + if not isinstance(attention_mask.type(), torch.BoolTensor): + attention_mask = attention_mask.bool() + + attn_output = torch_npu.npu_fusion_attention( + q, k, v, self.num_heads, + pse=None, + padding_mask=None, + atten_mask=attention_mask, + scale=scale, + keep_prob=1, + input_layout="BSH", + pre_tockens=65536, + next_tockens=0, + inner_precise=0)[0] + + else: + query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) + key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + + kv_seq_len = key_states.shape[-2] + if past_key_value is not None: + if self.layer_idx is None: + raise ValueError( + f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} " + "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class " + "with a layer index." + ) + kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx) + cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids) + + if past_key_value is not None: + cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models + key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) + + key_states = repeat_kv(key_states, self.num_key_value_groups) + value_states = repeat_kv(value_states, self.num_key_value_groups) + + attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim) + + if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len): + raise ValueError( + f"Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, but is" + f" {attn_weights.size()}" + ) + + if attention_mask is not None: + if attention_mask.size() != (bsz, 1, q_len, kv_seq_len): + raise ValueError( + f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}" + ) + attn_weights = attn_weights + attention_mask + + # upcast attention to fp32 + attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype) + attn_weights = nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training) + attn_output = torch.matmul(attn_weights, value_states) + + if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim): + raise ValueError( + f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is" + f" {attn_output.size()}" + ) + + attn_output = attn_output.transpose(1, 2).contiguous() + + attn_output = attn_output.reshape(bsz, q_len, self.hidden_size) + + if self.config.pretraining_tp > 1: + attn_output = attn_output.split(self.hidden_size // self.config.pretraining_tp, dim=2) + o_proj_slices = self.o_proj.weight.split(self.hidden_size // self.config.pretraining_tp, dim=1) + attn_output = sum([F.linear(attn_output[i], o_proj_slices[i]) for i in range(self.config.pretraining_tp)]) + else: + attn_output = self.o_proj(attn_output) + + if not output_attentions: + attn_weights = None + + return attn_output, attn_weights, past_key_value + +def adaptive_avg_pool3d(input: Tensor, output_size) -> Tensor: + input_dtype = input.dtype + input_shape = input.shape + input = 
input.to(dtype=torch.float32) + pool = nn.AvgPool2d(kernel_size=(2, 2), stride=(2, 2), padding=0, ceil_mode=False) + output = pool( + input.reshape(input_shape[0] * input_shape[1], input_shape[2], input_shape[3], input_shape[4])).reshape( + input_shape[0], input_shape[1], output_size[0], output_size[1], output_size[2]) + output = output.to(dtype=input_dtype) + return output + +def replace_with_torch_npu_flash_attention(): + transformers.models.llama.modeling_llama.LlamaAttention.forward = attention_forward + transformers.models.llama.modeling_llama.LlamaAttention._init_rope = _init_rope + +def replace_with_adaptive_avg_pool3d(): + torch.nn.functional.adaptive_avg_pool3d = adaptive_avg_pool3d \ No newline at end of file diff --git a/PyTorch/built-in/mlm/PLLaVA/tasks/train/train_pllava_nframe_accel.py b/PyTorch/built-in/mlm/PLLaVA/tasks/train/train_pllava_nframe_accel.py index 9f02309d20ac3629f6f5382b697404d1ffcba96e..c07d389e18ef5b15e1ffcba7871a0daf0885d646 100644 --- a/PyTorch/built-in/mlm/PLLaVA/tasks/train/train_pllava_nframe_accel.py +++ b/PyTorch/built-in/mlm/PLLaVA/tasks/train/train_pllava_nframe_accel.py @@ -1,3 +1,6 @@ +# Copyright 2024 Huawei Technologies Co., Ltd +import torch_npu +from torch_npu.contrib import transfer_to_npu import datetime import gc import time @@ -28,6 +31,8 @@ from utils.basic_utils import (MetricLogger, SmoothedValue, setup_seed) from utils.config_utils import setup_main from transformers.utils import TensorType +from tasks.train.llama_npu_monkey_patch import replace_with_torch_npu_flash_attention, replace_with_adaptive_avg_pool3d + from tasks.shared_utils import create_optimizer, create_scheduler import copy from transformers import ( @@ -361,6 +366,7 @@ def main(config): start_epoch = 0 num_batches = sum(len(loader) for loader in train_loaders) global_step = start_epoch * num_batches # the steps before divided by accumulation + resume_cur_epoch_step = 0 if osp.exists(config.output_dir): subfolders = os.listdir(config.output_dir) sample_saving = False @@ -374,8 +380,6 @@ def main(config): ckpt_paths = [subfolder for subfolder in subfolders if re.match("ckpt_[^\d]+", subfolder) is not None] ckpt_iters = [int(s.split(re.match("ckpt_[^\d]+", s).group())[-1]) for s in ckpt_paths] - - resume_cur_epoch_step=0 if len(ckpt_iters) > 0: resume_iter = max(ckpt_iters) ckpt_path = osp.join(config.output_dir, ckpt_paths[ckpt_iters.index(resume_iter)]) @@ -540,6 +544,8 @@ def main(config): if __name__ == "__main__": + replace_with_torch_npu_flash_attention() + replace_with_adaptive_avg_pool3d() cfg = setup_main() print(cfg) main(cfg) diff --git a/PyTorch/built-in/mlm/PLLaVA/utils/basic_utils.py b/PyTorch/built-in/mlm/PLLaVA/utils/basic_utils.py index fb453d35c852741bf1ad6dfe27e604d9fef6557e..d2ca1ca21193decacead3e71f9d7bd45a77f45c4 100644 --- a/PyTorch/built-in/mlm/PLLaVA/utils/basic_utils.py +++ b/PyTorch/built-in/mlm/PLLaVA/utils/basic_utils.py @@ -1,3 +1,4 @@ +# Copyright 2024 Huawei Technologies Co., Ltd import numpy as np import io import os @@ -42,13 +43,13 @@ class SmoothedValue(object): """ if not is_dist_avail_and_initialized(): return - t = torch.tensor([self.count, self.total], + tensor_ret = torch.tensor([self.count, self.total], dtype=torch.float64, device='cuda') dist.barrier() - dist.all_reduce(t) - t = t.tolist() - self.count = int(t[0]) - self.total = t[1] + dist.all_reduce(tensor_ret.float()) + tensor_ret = tensor_ret.tolist() + self.count = int(tensor_ret[0]) + self.total = tensor_ret[1] @property def median(self):
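For reference, a minimal sketch (not part of the patch, plain PyTorch only) of the constraint baked into the adaptive_avg_pool3d replacement in tasks/train/llama_npu_monkey_patch.py: because it hardcodes a 2x2 average pool, it only reproduces torch.nn.functional.adaptive_avg_pool3d when the requested output halves the spatial dimensions, which matches the pooling_shape (16,12,12) used in the training scripts above under the assumption of 24x24 per-frame feature grids. The patched logic is replicated inline here so the check runs without a torch_npu install.

```python
import torch
import torch.nn.functional as F
from torch import nn


def patched_pool3d(x: torch.Tensor, output_size) -> torch.Tensor:
    # Inline replica of the patched adaptive_avg_pool3d: fixed 2x2 window over (H, W),
    # with the temporal dimension T treated as channels and left untouched.
    b, c, t, h, w = x.shape
    pool = nn.AvgPool2d(kernel_size=2, stride=2)
    out = pool(x.reshape(b * c, t, h, w))
    return out.reshape(b, c, *output_size)


x = torch.randn(2, 8, 16, 24, 24)                 # (batch, channels, T, H, W)
ref = F.adaptive_avg_pool3d(x, (16, 12, 12))      # stock operator
out = patched_pool3d(x, (16, 12, 12))             # hardcoded 2x2 pooling
print(torch.allclose(ref, out, atol=1e-6))        # True only for the halved-spatial case
```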