From 62a46808b27e81556b1d9e0b67863e2d5dc8c664 Mon Sep 17 00:00:00 2001
From: "xinchi.tian"
Date: Thu, 9 Jan 2025 09:49:19 +0800
Subject: [PATCH] Add LLava

---
 .../LLava/vllm/README.md                      |  49 ++++
 .../vllm/offline_inference_vision_language.py | 193 +++++++++++++++
 .../LLava/vllm/utils.py                       | 225 ++++++++++++++++++
 .../vllm_public_assets/cherry_blossom.jpg     | Bin 0 -> 351679 bytes
 4 files changed, 467 insertions(+)
 create mode 100644 models/vision-language-understanding/LLava/vllm/README.md
 create mode 100644 models/vision-language-understanding/LLava/vllm/offline_inference_vision_language.py
 create mode 100644 models/vision-language-understanding/LLava/vllm/utils.py
 create mode 100644 models/vision-language-understanding/LLava/vllm/vllm_public_assets/cherry_blossom.jpg

diff --git a/models/vision-language-understanding/LLava/vllm/README.md b/models/vision-language-understanding/LLava/vllm/README.md
new file mode 100644
index 00000000..1b805f41
--- /dev/null
+++ b/models/vision-language-understanding/LLava/vllm/README.md
@@ -0,0 +1,49 @@
+# LLaVA
+
+## Description
+
+LLaVA is an open-source chatbot trained by fine-tuning LLaMA/Vicuna on GPT-generated multimodal instruction-following data. It is an auto-regressive language model based on the transformer architecture. The LLaVA-NeXT model was proposed in "LLaVA-NeXT: Improved reasoning, OCR, and world knowledge" by Haotian Liu, Chunyuan Li, Yuheng Li, Bo Li, Yuanhan Zhang, Sheng Shen, and Yong Jae Lee. LLaVA-NeXT (also called LLaVA-1.6) improves upon LLaVA-1.5 by increasing the input image resolution and training on an improved visual instruction tuning dataset, strengthening OCR and common-sense reasoning.
+
+## Setup
+
+### Install
+
+To run the model smoothly, you need to get the SDK from the [Resource Center](https://support.iluvatar.com/#/ProductLine?id=2) of the Iluvatar CoreX official website.
+
+```bash
+# Install libGL
+## CentOS
+yum install -y mesa-libGL
+## Ubuntu
+apt install -y libgl1-mesa-dev
+pip3 install transformers
+```
+
+### Download
+
+- llava-v1.6-vicuna-7b-hf
+
+```bash
+# Download the model from the website and make sure the model path is "data/llava"
+mkdir data
+```
+
+## Inference
+
+```bash
+export PT_SDPA_ENABLE_HEAD_DIM_PADDING=1
+export PATH=/usr/local/corex/bin:${PATH}
+export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda/lib64
+```
+
+### Inference with LLaVA-1.6
+
+```bash
+export VLLM_ASSETS_CACHE=../vllm/
+export CUDA_VISIBLE_DEVICES=0,1,2,3
+python3 offline_inference_vision_language.py --model /path/to/model --max-tokens 256 -tp 4 --trust-remote-code --temperature 0.0 --model-type llava-next --max-model-len 4096
+```
\ No newline at end of file

diff --git a/models/vision-language-understanding/LLava/vllm/offline_inference_vision_language.py b/models/vision-language-understanding/LLava/vllm/offline_inference_vision_language.py
new file mode 100644
index 00000000..72bf073c
--- /dev/null
+++ b/models/vision-language-understanding/LLava/vllm/offline_inference_vision_language.py
@@ -0,0 +1,193 @@
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+
+"""
+This example shows how to use vLLM for running offline inference
+with the correct prompt format on vision language models.
+
+For most models, the prompt format should follow the corresponding examples
+on the HuggingFace model repository.
+"""
+import argparse
+import dataclasses
+import inspect
+import os
+import sys
+from pathlib import Path
+
+# Allow importing modules from the enclosing repository directories.
+sys.path.append(str(Path(__file__).resolve().parent.parent.parent))
+
+from vllm import LLM, EngineArgs, SamplingParams
+from vllm.assets.image import ImageAsset
+from vllm.assets.video import VideoAsset
+
+from utils import sampling_add_cli_args
+
+
+# LLaVA-1.5
+def run_llava(question, engine_params, modality):
+    assert modality == "image"
+    # The <image> placeholder marks where the image embeddings are inserted.
+    prompt = f"USER: <image>\n{question}\nASSISTANT:"
+
+    llm = LLM(**engine_params)
+    stop_token_ids = None
+    return llm, prompt, stop_token_ids
+
+
+# LLaVA-1.6/LLaVA-NeXT
+def run_llava_next(question, engine_params, modality):
+    assert modality == "image"
+    prompt = f"USER: <image>\n{question}\nASSISTANT:"
+    llm = LLM(**engine_params)
+    stop_token_ids = None
+    return llm, prompt, stop_token_ids
+
+
+# LLaVA-NeXT-Video
+# Currently only supports video input
+def run_llava_next_video(question, engine_params, modality):
+    assert modality == "video"
+    prompt = f"USER:
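
For reference, the sketch below shows one way these runner helpers can be wired into a generation call. It is an illustrative assumption rather than part of the patch: the engine arguments, the question text, and the use of the bundled cherry_blossom asset are placeholders, and the multimodal input uses vLLM's standard dict-based prompt format.

```python
# Minimal usage sketch (assumed wiring, not part of the patch): build the
# engine through one of the run_* helpers, then generate against a sample image.
from vllm import SamplingParams
from vllm.assets.image import ImageAsset

# Assumed engine arguments; point "model" at your local checkpoint.
engine_params = {
    "model": "data/llava",  # e.g. llava-v1.6-vicuna-7b-hf
    "max_model_len": 4096,
    "trust_remote_code": True,
}

question = "What is the content of this image?"
llm, prompt, stop_token_ids = run_llava_next(question, engine_params, "image")

sampling_params = SamplingParams(
    temperature=0.0,
    max_tokens=256,
    stop_token_ids=stop_token_ids,
)

# The patch bundles cherry_blossom.jpg under vllm_public_assets/, which
# ImageAsset can resolve when VLLM_ASSETS_CACHE points there (see README).
image = ImageAsset("cherry_blossom").pil_image.convert("RGB")

outputs = llm.generate(
    {"prompt": prompt, "multi_modal_data": {"image": image}},
    sampling_params=sampling_params,
)
for output in outputs:
    print(output.outputs[0].text)
```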