From 7c65ec24fb875cc3c4f9dfc29507bd66af993e6b Mon Sep 17 00:00:00 2001 From: "hongliang.yuan" Date: Wed, 2 Jul 2025 11:18:20 +0800 Subject: [PATCH] update vllm idefics3 minicpm_v llava_next_video_7b --- .../idefics3/vllm/README.md | 9 +- .../idefics3/vllm/ci/prepare.sh | 1 + .../vllm/offline_inference_vision_language.py | 276 +++++++++++----- .../sample_demo_1.mp4 | Bin .../minicpm_v/vllm/README.md | 8 +- .../vllm/offline_inference_vision_language.py | 309 +++++++++++++----- tests/model_info.json | 2 +- tests/run_vllm.py | 11 +- 8 files changed, 430 insertions(+), 186 deletions(-) rename models/multimodal/vision_language_model/llava_next_video_7b/vllm/{video-eample-data => video-example-data}/sample_demo_1.mp4 (100%) mode change 100755 => 100644 diff --git a/models/multimodal/vision_language_model/idefics3/vllm/README.md b/models/multimodal/vision_language_model/idefics3/vllm/README.md index 5117a327..78d4117c 100644 --- a/models/multimodal/vision_language_model/idefics3/vllm/README.md +++ b/models/multimodal/vision_language_model/idefics3/vllm/README.md @@ -22,8 +22,8 @@ significantly enhancing capabilities around OCR, document understanding and visu ```bash cp -r ../../vllm_public_assets/ ./ -# Download model from the website and make sure the model's path is "data/Aria" -mkdir data +# Download model from the website and make sure the model's path is "idefics3" +mkdir HuggingFaceM4 ``` ### Install Dependencies @@ -36,13 +36,14 @@ In order to run the model smoothly, you need to get the sdk from [resource cente yum install -y mesa-libGL ## Ubuntu apt install -y libgl1-mesa-glx + +pip install transformers==4.50.3 ``` ## Model Inference ```bash -export VLLM_ASSETS_CACHE=../vllm/ -python3 offline_inference_vision_language.py --model data/Idefics3-8B-Llama3 -tp 4 --max-tokens 256 --trust-remote-code --temperature 0.0 --disable-mm-preprocessor-cache +python3 offline_inference_vision_language.py --model-type idefics3 ``` ## Model Results diff --git a/models/multimodal/vision_language_model/idefics3/vllm/ci/prepare.sh b/models/multimodal/vision_language_model/idefics3/vllm/ci/prepare.sh index 7232aa29..26f7a3ff 100644 --- a/models/multimodal/vision_language_model/idefics3/vllm/ci/prepare.sh +++ b/models/multimodal/vision_language_model/idefics3/vllm/ci/prepare.sh @@ -25,3 +25,4 @@ else fi cp -r ../../vllm_public_assets/ ./ +pip install transformers==4.50.3 \ No newline at end of file diff --git a/models/multimodal/vision_language_model/idefics3/vllm/offline_inference_vision_language.py b/models/multimodal/vision_language_model/idefics3/vllm/offline_inference_vision_language.py index 958131c6..c2593603 100644 --- a/models/multimodal/vision_language_model/idefics3/vllm/offline_inference_vision_language.py +++ b/models/multimodal/vision_language_model/idefics3/vllm/offline_inference_vision_language.py @@ -1,55 +1,67 @@ -#!/bin/bash -# Copyright (c) 2025, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- +# SPDX-License-Identifier: Apache-2.0 """ -This example shows how to use vLLM for running offline inference -with the correct prompt format on vision language models. +This example shows how to use vLLM for running offline inference with +the correct prompt format on vision language models for text generation. For most models, the prompt format should follow corresponding examples on HuggingFace model repository. """ -import sys -from pathlib import Path import os -sys.path.append(str(Path(__file__).resolve().parent.parent.parent)) -import argparse -import dataclasses -import inspect -from vllm.assets.image import ImageAsset -from vllm.assets.video import VideoAsset +import random +from dataclasses import asdict +from typing import NamedTuple, Optional +from huggingface_hub import snapshot_download from transformers import AutoTokenizer from vllm import LLM, EngineArgs, SamplingParams -from utils import sampling_add_cli_args +from vllm.assets.image import ImageAsset +from vllm.assets.video import VideoAsset +from vllm.lora.request import LoRARequest +from vllm.utils import FlexibleArgumentParser + +class ModelRequestData(NamedTuple): + engine_args: EngineArgs + prompts: list[str] + stop_token_ids: Optional[list[int]] = None + lora_requests: Optional[list[LoRARequest]] = None + + +# NOTE: The default `max_num_seqs` and `max_model_len` may result in OOM on +# lower-end GPUs. +# Unless specified, these settings have been tested to work on a single L4. # Idefics3-8B-Llama3 -def run_idefics3(question: str, engine_params, modality: str): +def run_idefics3(questions: list[str], modality: str) -> ModelRequestData: assert modality == "image" - model_name = "HuggingFaceM4/Idefics3-8B-Llama3" + model_name = "./idefics3" - llm = LLM(**engine_params) - prompt = ( - f"<|begin_of_text|>User:{question}\nAssistant:" + engine_args = EngineArgs( + model=model_name, + max_model_len=8192, + max_num_seqs=2, + enforce_eager=True, + # if you are running out of memory, you can reduce the "longest_edge". + # see: https://huggingface.co/HuggingFaceM4/Idefics3-8B-Llama3#model-optimizations + mm_processor_kwargs={ + "size": { + "longest_edge": 3 * 364 + }, + }, + disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache, ) - stop_token_ids = None - return llm, prompt, stop_token_ids - + prompts = [( + f"<|begin_of_text|>User:{question}\nAssistant:" + ) for question in questions] + return ModelRequestData( + engine_args=engine_args, + prompts=prompts, + ) +model_example_map = { + "idefics3": run_idefics3, +} def get_multi_modal_input(args): """ @@ -60,92 +72,188 @@ def get_multi_modal_input(args): """ if args.modality == "image": # Input image and question - image = ImageAsset("cherry_blossom").pil_image.convert("RGB") - img_question = "What is the content of this image?" + image = ImageAsset("cherry_blossom") \ + .pil_image.convert("RGB") + img_questions = [ + "What is the content of this image?", + "Describe the content of this image in detail.", + "What's in the image?", + "Where is this image taken?", + ] return { "data": image, - "question": img_question, + "questions": img_questions, } if args.modality == "video": # Input video and question video = VideoAsset(name="sample_demo_1.mp4", num_frames=args.num_frames).np_ndarrays - vid_question = "Why is this video funny?" + vid_questions = ["Why is this video funny?"] return { "data": video, - "question": vid_question, + "questions": vid_questions, } msg = f"Modality {args.modality} is not supported." 
raise ValueError(msg) - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument('--num-prompts', - type=int, - default=1, - help='Number of prompts to run.') - parser.add_argument('--modality', - type=str, - default="image", - help='Modality of the input.') - parser.add_argument('--num-frames', - type=int, - default=16, - help='Number of frames to extract from the video.') - parser = EngineArgs.add_cli_args(parser) - parser = sampling_add_cli_args(parser) - args = parser.parse_args() - engine_args = [attr.name for attr in dataclasses.fields(EngineArgs)] - sampling_args = [ - param.name - for param in list( - inspect.signature(SamplingParams).parameters.values() - ) - ] - engine_params = {attr: getattr(args, attr) for attr in engine_args} - sampling_params = { - attr: getattr(args, attr) for attr in sampling_args if args.__contains__(attr) - } - +def apply_image_repeat(image_repeat_prob, num_prompts, data, + prompts: list[str], modality): + """Repeats images with provided probability of "image_repeat_prob". + Used to simulate hit/miss for the MM preprocessor cache. + """ + assert (image_repeat_prob <= 1.0 and image_repeat_prob >= 0) + no_yes = [0, 1] + probs = [1.0 - image_repeat_prob, image_repeat_prob] + + inputs = [] + cur_image = data + for i in range(num_prompts): + if image_repeat_prob is not None: + res = random.choices(no_yes, probs)[0] + if res == 0: + # No repeat => Modify one pixel + cur_image = cur_image.copy() + new_val = (i // 256 // 256, i // 256, i % 256) + cur_image.putpixel((0, 0), new_val) + + inputs.append({ + "prompt": prompts[i % len(prompts)], + "multi_modal_data": { + modality: cur_image + } + }) + + return inputs + + +def main(args): + model = args.model_type + if model not in model_example_map: + raise ValueError(f"Model type {model} is not supported.") + modality = args.modality mm_input = get_multi_modal_input(args) data = mm_input["data"] - question = mm_input["question"] + questions = mm_input["questions"] + + req_data = model_example_map[model](questions, modality) - llm, prompt, stop_token_ids = run_idefics3(question,engine_params,args.modality) - sampling_params['stop_token_ids'] = stop_token_ids + engine_args = asdict(req_data.engine_args) | {"seed": args.seed} + llm = LLM(**engine_args) + + # To maintain code compatibility in this script, we add LoRA here. + # You can also add LoRA using: + # llm.generate(prompts, lora_request=lora_request,...) + if req_data.lora_requests: + for lora_request in req_data.lora_requests: + llm.llm_engine.add_lora(lora_request=lora_request) + + # Don't want to check the flag multiple times, so just hijack `prompts`. + prompts = req_data.prompts if args.use_different_prompt_per_request else [ + req_data.prompts[0] + ] # We set temperature to 0.2 so that outputs can be different # even when all prompts are identical when running batch inference. 
- sampling_params = SamplingParams(**sampling_params) + sampling_params = SamplingParams(temperature=0.2, + max_tokens=64, + stop_token_ids=req_data.stop_token_ids) assert args.num_prompts > 0 if args.num_prompts == 1: # Single inference inputs = { - "prompt": prompt, + "prompt": prompts[0], "multi_modal_data": { modality: data }, } - else: # Batch inference - inputs = [{ - "prompt": prompt, - "multi_modal_data": { - modality: data - }, - } for _ in range(args.num_prompts)] + if args.image_repeat_prob is not None: + # Repeat images with specified probability of "image_repeat_prob" + inputs = apply_image_repeat(args.image_repeat_prob, + args.num_prompts, data, prompts, + modality) + else: + # Use the same image for all prompts + inputs = [{ + "prompt": prompts[i % len(prompts)], + "multi_modal_data": { + modality: data + }, + } for i in range(args.num_prompts)] + + if args.time_generate: + import time + start_time = time.time() + outputs = llm.generate(inputs, sampling_params=sampling_params) + elapsed_time = time.time() - start_time + print("-- generate time = {}".format(elapsed_time)) - outputs = llm.generate(inputs, sampling_params=sampling_params) + else: + outputs = llm.generate(inputs, sampling_params=sampling_params) for o in outputs: generated_text = o.outputs[0].text - print(generated_text) \ No newline at end of file + print(generated_text) + + +if __name__ == "__main__": + parser = FlexibleArgumentParser( + description='Demo on using vLLM for offline inference with ' + 'vision language models for text generation') + parser.add_argument('--model-type', + '-m', + type=str, + default="llava", + choices=model_example_map.keys(), + help='Huggingface "model_type".') + parser.add_argument('--num-prompts', + type=int, + default=4, + help='Number of prompts to run.') + parser.add_argument('--modality', + type=str, + default="image", + choices=['image', 'video'], + help='Modality of the input.') + parser.add_argument('--num-frames', + type=int, + default=16, + help='Number of frames to extract from the video.') + parser.add_argument("--seed", + type=int, + default=None, + help="Set the seed when initializing `vllm.LLM`.") + + parser.add_argument( + '--image-repeat-prob', + type=float, + default=None, + help='Simulates the hit-ratio for multi-modal preprocessor cache' + ' (if enabled)') + + parser.add_argument( + '--disable-mm-preprocessor-cache', + action='store_true', + help='If True, disables caching of multi-modal preprocessor/mapper.') + + parser.add_argument( + '--time-generate', + action='store_true', + help='If True, then print the total generate() call time') + + parser.add_argument( + '--use-different-prompt-per-request', + action='store_true', + help='If True, then use different prompt (with the same multi-modal ' + 'data) for each request.') + + args = parser.parse_args() + main(args) \ No newline at end of file diff --git a/models/multimodal/vision_language_model/llava_next_video_7b/vllm/video-eample-data/sample_demo_1.mp4 b/models/multimodal/vision_language_model/llava_next_video_7b/vllm/video-example-data/sample_demo_1.mp4 old mode 100755 new mode 100644 similarity index 100% rename from models/multimodal/vision_language_model/llava_next_video_7b/vllm/video-eample-data/sample_demo_1.mp4 rename to models/multimodal/vision_language_model/llava_next_video_7b/vllm/video-example-data/sample_demo_1.mp4 diff --git a/models/multimodal/vision_language_model/minicpm_v/vllm/README.md b/models/multimodal/vision_language_model/minicpm_v/vllm/README.md index a404f6ec..ea1c8d74 100644 --- 
a/models/multimodal/vision_language_model/minicpm_v/vllm/README.md +++ b/models/multimodal/vision_language_model/minicpm_v/vllm/README.md @@ -16,13 +16,12 @@ techniques, making it suitable for deployment in resource-constrained environmen ### Prepare Resources -- Model: +- Model: ```bash cp -r ../../vllm_public_assets/ ./ -# Download model from the website and make sure the model's path is "data/Aria" -mkdir data +# Download model from the website and make sure the model's path is "./minicpm_v" ``` ### Install Dependencies @@ -42,8 +41,7 @@ pip install timm==0.9.10 ## Model Inference ```bash -export VLLM_ASSETS_CACHE=../vllm/ -PT_SDPA_ENABLE_HEAD_DIM_PADDING=1 python3 offline_inference_vision_language.py --model data/MiniCPM-V-2 --max-tokens 256 -tp 2 --trust-remote-code --temperature 0.0 +python3 offline_inference_vision_language.py --model-type minicpmv ``` ## Model Results diff --git a/models/multimodal/vision_language_model/minicpm_v/vllm/offline_inference_vision_language.py b/models/multimodal/vision_language_model/minicpm_v/vllm/offline_inference_vision_language.py index 2fc88f46..f6df6f98 100644 --- a/models/multimodal/vision_language_model/minicpm_v/vllm/offline_inference_vision_language.py +++ b/models/multimodal/vision_language_model/minicpm_v/vllm/offline_inference_vision_language.py @@ -1,42 +1,42 @@ -#!/bin/bash -# Copyright (c) 2025, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - +# SPDX-License-Identifier: Apache-2.0 """ -This example shows how to use vLLM for running offline inference -with the correct prompt format on vision language models. +This example shows how to use vLLM for running offline inference with +the correct prompt format on vision language models for text generation. For most models, the prompt format should follow corresponding examples on HuggingFace model repository. """ -import sys -from pathlib import Path import os -sys.path.append(str(Path(__file__).resolve().parent.parent.parent)) -import argparse -import dataclasses -import inspect +import random +from dataclasses import asdict +from typing import NamedTuple, Optional + +from huggingface_hub import snapshot_download from transformers import AutoTokenizer + +from vllm import LLM, EngineArgs, SamplingParams from vllm.assets.image import ImageAsset from vllm.assets.video import VideoAsset -from vllm import LLM, EngineArgs, SamplingParams -from utils import sampling_add_cli_args +from vllm.lora.request import LoRARequest +from vllm.utils import FlexibleArgumentParser + + +class ModelRequestData(NamedTuple): + engine_args: EngineArgs + prompts: list[str] + stop_token_ids: Optional[list[int]] = None + lora_requests: Optional[list[LoRARequest]] = None + + +# NOTE: The default `max_num_seqs` and `max_model_len` may result in OOM on +# lower-end GPUs. +# Unless specified, these settings have been tested to work on a single L4. 
# MiniCPM-V -def run_minicpmv(question, engine_params, model,modality): - assert modality == "image" +def run_minicpmv_base(questions: list[str], modality: str, model_name): + assert modality in ["image", "video"] + # If you want to use `MiniCPM-o-2_6` with audio inputs, check `audio_language.py` # noqa + # 2.0 # The official repo doesn't work yet, so we need to use a fork for now # For more details, please see: See: https://github.com/vllm-project/vllm/pull/4087#issuecomment-2250397630 # noqa @@ -45,10 +45,25 @@ def run_minicpmv(question, engine_params, model,modality): # 2.5 # model_name = "openbmb/MiniCPM-Llama3-V-2_5" - #2.6 - tokenizer = AutoTokenizer.from_pretrained(model, + # 2.6 + # model_name = "openbmb/MiniCPM-V-2_6" + # o2.6 + + # modality supports + # 2.0: image + # 2.5: image + # 2.6: image, video + # o2.6: image, video, audio + # model_name = "openbmb/MiniCPM-o-2_6" + tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) - llm = LLM(**engine_params) + engine_args = EngineArgs( + model=model_name, + max_model_len=4096, + max_num_seqs=2, + trust_remote_code=True, + disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache, + ) # NOTE The stop_token_ids are different for various versions of MiniCPM-V # 2.0 # stop_token_ids = [tokenizer.eos_id] @@ -56,18 +71,38 @@ def run_minicpmv(question, engine_params, model,modality): # 2.5 # stop_token_ids = [tokenizer.eos_id, tokenizer.eot_id] - # 2.6 + # 2.6 / o2.6 stop_tokens = ['<|im_end|>', '<|endoftext|>'] stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens] - messages = [{ - 'role': 'user', - 'content': f'(./)\n{question}' - }] - prompt = tokenizer.apply_chat_template(messages, - tokenize=False, - add_generation_prompt=True) - return llm, prompt, stop_token_ids + modality_placeholder = { + "image": "(./)", + "video": "()", + } + + prompts = [ + tokenizer.apply_chat_template( + [{ + 'role': 'user', + 'content': f"{modality_placeholder[modality]}\n{question}" + }], + tokenize=False, + add_generation_prompt=True) for question in questions + ] + + return ModelRequestData( + engine_args=engine_args, + prompts=prompts, + stop_token_ids=stop_token_ids, + ) + +def run_minicpmv(questions: list[str], modality: str) -> ModelRequestData: + return run_minicpmv_base(questions, modality, "./minicpm_v") + + +model_example_map = { + "minicpmv": run_minicpmv, +} def get_multi_modal_input(args): @@ -79,92 +114,188 @@ def get_multi_modal_input(args): """ if args.modality == "image": # Input image and question - image = ImageAsset("cherry_blossom").pil_image.convert("RGB") - img_question = "What is the content of this image?" + image = ImageAsset("cherry_blossom") \ + .pil_image.convert("RGB") + img_questions = [ + "What is the content of this image?", + "Describe the content of this image in detail.", + "What's in the image?", + "Where is this image taken?", + ] return { "data": image, - "question": img_question, + "questions": img_questions, } if args.modality == "video": # Input video and question video = VideoAsset(name="sample_demo_1.mp4", num_frames=args.num_frames).np_ndarrays - vid_question = "Why is this video funny?" + vid_questions = ["Why is this video funny?"] return { "data": video, - "question": vid_question, + "questions": vid_questions, } msg = f"Modality {args.modality} is not supported." 
raise ValueError(msg) - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument('--num-prompts', - type=int, - default=1, - help='Number of prompts to run.') - parser.add_argument('--modality', - type=str, - default="image", - help='Modality of the input.') - parser.add_argument('--num-frames', - type=int, - default=16, - help='Number of frames to extract from the video.') - parser = EngineArgs.add_cli_args(parser) - parser = sampling_add_cli_args(parser) - args = parser.parse_args() - engine_args = [attr.name for attr in dataclasses.fields(EngineArgs)] - sampling_args = [ - param.name - for param in list( - inspect.signature(SamplingParams).parameters.values() - ) - ] - engine_params = {attr: getattr(args, attr) for attr in engine_args} - sampling_params = { - attr: getattr(args, attr) for attr in sampling_args if args.__contains__(attr) - } - +def apply_image_repeat(image_repeat_prob, num_prompts, data, + prompts: list[str], modality): + """Repeats images with provided probability of "image_repeat_prob". + Used to simulate hit/miss for the MM preprocessor cache. + """ + assert (image_repeat_prob <= 1.0 and image_repeat_prob >= 0) + no_yes = [0, 1] + probs = [1.0 - image_repeat_prob, image_repeat_prob] + + inputs = [] + cur_image = data + for i in range(num_prompts): + if image_repeat_prob is not None: + res = random.choices(no_yes, probs)[0] + if res == 0: + # No repeat => Modify one pixel + cur_image = cur_image.copy() + new_val = (i // 256 // 256, i // 256, i % 256) + cur_image.putpixel((0, 0), new_val) + + inputs.append({ + "prompt": prompts[i % len(prompts)], + "multi_modal_data": { + modality: cur_image + } + }) + + return inputs + + +def main(args): + model = args.model_type + if model not in model_example_map: + raise ValueError(f"Model type {model} is not supported.") + modality = args.modality mm_input = get_multi_modal_input(args) data = mm_input["data"] - question = mm_input["question"] + questions = mm_input["questions"] - llm, prompt, stop_token_ids = run_minicpmv(question,engine_params, args.model, args.modality) - sampling_params['stop_token_ids'] = stop_token_ids + req_data = model_example_map[model](questions, modality) + + engine_args = asdict(req_data.engine_args) | {"seed": args.seed} + llm = LLM(**engine_args) + + # To maintain code compatibility in this script, we add LoRA here. + # You can also add LoRA using: + # llm.generate(prompts, lora_request=lora_request,...) + if req_data.lora_requests: + for lora_request in req_data.lora_requests: + llm.llm_engine.add_lora(lora_request=lora_request) + + # Don't want to check the flag multiple times, so just hijack `prompts`. + prompts = req_data.prompts if args.use_different_prompt_per_request else [ + req_data.prompts[0] + ] # We set temperature to 0.2 so that outputs can be different # even when all prompts are identical when running batch inference. 
- sampling_params = SamplingParams(**sampling_params) + sampling_params = SamplingParams(temperature=0.2, + max_tokens=64, + stop_token_ids=req_data.stop_token_ids) assert args.num_prompts > 0 if args.num_prompts == 1: # Single inference inputs = { - "prompt": prompt, + "prompt": prompts[0], "multi_modal_data": { modality: data }, } - else: # Batch inference - inputs = [{ - "prompt": prompt, - "multi_modal_data": { - modality: data - }, - } for _ in range(args.num_prompts)] + if args.image_repeat_prob is not None: + # Repeat images with specified probability of "image_repeat_prob" + inputs = apply_image_repeat(args.image_repeat_prob, + args.num_prompts, data, prompts, + modality) + else: + # Use the same image for all prompts + inputs = [{ + "prompt": prompts[i % len(prompts)], + "multi_modal_data": { + modality: data + }, + } for i in range(args.num_prompts)] - outputs = llm.generate(inputs, sampling_params=sampling_params) + if args.time_generate: + import time + start_time = time.time() + outputs = llm.generate(inputs, sampling_params=sampling_params) + elapsed_time = time.time() - start_time + print("-- generate time = {}".format(elapsed_time)) + + else: + outputs = llm.generate(inputs, sampling_params=sampling_params) for o in outputs: generated_text = o.outputs[0].text - print(generated_text) \ No newline at end of file + print(generated_text) + + +if __name__ == "__main__": + parser = FlexibleArgumentParser( + description='Demo on using vLLM for offline inference with ' + 'vision language models for text generation') + parser.add_argument('--model-type', + '-m', + type=str, + default="llava", + choices=model_example_map.keys(), + help='Huggingface "model_type".') + parser.add_argument('--num-prompts', + type=int, + default=4, + help='Number of prompts to run.') + parser.add_argument('--modality', + type=str, + default="image", + choices=['image', 'video'], + help='Modality of the input.') + parser.add_argument('--num-frames', + type=int, + default=16, + help='Number of frames to extract from the video.') + parser.add_argument("--seed", + type=int, + default=None, + help="Set the seed when initializing `vllm.LLM`.") + + parser.add_argument( + '--image-repeat-prob', + type=float, + default=None, + help='Simulates the hit-ratio for multi-modal preprocessor cache' + ' (if enabled)') + + parser.add_argument( + '--disable-mm-preprocessor-cache', + action='store_true', + help='If True, disables caching of multi-modal preprocessor/mapper.') + + parser.add_argument( + '--time-generate', + action='store_true', + help='If True, then print the total generate() call time') + + parser.add_argument( + '--use-different-prompt-per-request', + action='store_true', + help='If True, then use different prompt (with the same multi-modal ' + 'data) for each request.') + + args = parser.parse_args() + main(args) \ No newline at end of file diff --git a/tests/model_info.json b/tests/model_info.json index 3ba5dadd..466a81d7 100644 --- a/tests/model_info.json +++ b/tests/model_info.json @@ -7098,7 +7098,7 @@ "github_branch": "", "github_path": "", "datasets": "", - "download_url": "https://huggingface.co/openbmb/MiniCPM-V-2", + "download_url": "https://huggingface.co/openbmb/MiniCPM-V-2_6", "need_third_part": false, "precisions": [ "fp16" diff --git a/tests/run_vllm.py b/tests/run_vllm.py index c6100a40..be795462 100644 --- a/tests/run_vllm.py +++ b/tests/run_vllm.py @@ -229,7 +229,13 @@ def run_nlp_testcase(model): export VLLM_ASSETS_CACHE=../vllm/ python3 offline_inference_vision_language.py --model 
./{model_name} --max-tokens 256 -tp 2 --trust-remote-code --temperature 0.0
         """
-    elif model_name == "h2vol" or model_name == "idefics3":
+    elif model_name == "idefics3":
+        script = f"""
+        set -x
+        cd ../{model['model_path']}
+        python3 offline_inference_vision_language.py --model-type idefics3
+        """
+    elif model_name == "h2vol":
         script = f"""
         set -x
         cd ../{model['model_path']}
@@ -240,8 +246,7 @@ def run_nlp_testcase(model):
         script = f"""
         set -x
         cd ../{model['model_path']}
-        export VLLM_ASSETS_CACHE=../vllm/
-        PT_SDPA_ENABLE_HEAD_DIM_PADDING=1 python3 offline_inference_vision_language.py --model ./{model_name} --max-tokens 256 -tp 2 --trust-remote-code --temperature 0.0
+        python3 offline_inference_vision_language.py --model-type minicpmv
         """
     elif model_name == "llama-3.2":
         script = f"""
-- 
Gitee
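
For reference, a minimal end-to-end run of the two updated entry points, assuming the weights are downloaded into the local directories the patched READMEs and scripts expect (`./idefics3` for `HuggingFaceM4/Idefics3-8B-Llama3`, `./minicpm_v` for `openbmb/MiniCPM-V-2_6`). The `huggingface-cli download` step is only illustrative; any method that places the weights at those paths works, and the remaining commands and flags come from the patch itself:

```bash
# Idefics3: run from models/multimodal/vision_language_model/idefics3/vllm
cp -r ../../vllm_public_assets/ ./          # demo image assets used by the prompts
pip install transformers==4.50.3            # version pinned by the patched prepare.sh
huggingface-cli download HuggingFaceM4/Idefics3-8B-Llama3 --local-dir ./idefics3   # illustrative download step
python3 offline_inference_vision_language.py --model-type idefics3 --modality image --num-prompts 4

# MiniCPM-V: run from models/multimodal/vision_language_model/minicpm_v/vllm
cp -r ../../vllm_public_assets/ ./
pip install timm==0.9.10
huggingface-cli download openbmb/MiniCPM-V-2_6 --local-dir ./minicpm_v             # illustrative download step
python3 offline_inference_vision_language.py --model-type minicpmv --modality image --num-prompts 4
```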