From 7c65ec24fb875cc3c4f9dfc29507bd66af993e6b Mon Sep 17 00:00:00 2001 From: "hongliang.yuan" Date: Wed, 2 Jul 2025 11:18:20 +0800 Subject: [PATCH] update vllm idefics3 minicpm_v llava_next_video_7b --- .../idefics3/vllm/README.md | 9 +- .../idefics3/vllm/ci/prepare.sh | 1 + .../vllm/offline_inference_vision_language.py | 276 +++++++++++----- .../sample_demo_1.mp4 | Bin .../minicpm_v/vllm/README.md | 8 +- .../vllm/offline_inference_vision_language.py | 309 +++++++++++++----- tests/model_info.json | 2 +- tests/run_vllm.py | 11 +- 8 files changed, 430 insertions(+), 186 deletions(-) rename models/multimodal/vision_language_model/llava_next_video_7b/vllm/{video-eample-data => video-example-data}/sample_demo_1.mp4 (100%) mode change 100755 => 100644 diff --git a/models/multimodal/vision_language_model/idefics3/vllm/README.md b/models/multimodal/vision_language_model/idefics3/vllm/README.md index 5117a327..78d4117c 100644 --- a/models/multimodal/vision_language_model/idefics3/vllm/README.md +++ b/models/multimodal/vision_language_model/idefics3/vllm/README.md @@ -22,8 +22,8 @@ significantly enhancing capabilities around OCR, document understanding and visu ```bash cp -r ../../vllm_public_assets/ ./ -# Download model from the website and make sure the model's path is "data/Aria" -mkdir data +# Download model from the website and make sure the model's path is "idefics3" +mkdir HuggingFaceM4 ``` ### Install Dependencies @@ -36,13 +36,14 @@ In order to run the model smoothly, you need to get the sdk from [resource cente yum install -y mesa-libGL ## Ubuntu apt install -y libgl1-mesa-glx + +pip install transformers==4.50.3 ``` ## Model Inference ```bash -export VLLM_ASSETS_CACHE=../vllm/ -python3 offline_inference_vision_language.py --model data/Idefics3-8B-Llama3 -tp 4 --max-tokens 256 --trust-remote-code --temperature 0.0 --disable-mm-preprocessor-cache +python3 offline_inference_vision_language.py --model-type idefics3 ``` ## Model Results diff --git a/models/multimodal/vision_language_model/idefics3/vllm/ci/prepare.sh b/models/multimodal/vision_language_model/idefics3/vllm/ci/prepare.sh index 7232aa29..26f7a3ff 100644 --- a/models/multimodal/vision_language_model/idefics3/vllm/ci/prepare.sh +++ b/models/multimodal/vision_language_model/idefics3/vllm/ci/prepare.sh @@ -25,3 +25,4 @@ else fi cp -r ../../vllm_public_assets/ ./ +pip install transformers==4.50.3 \ No newline at end of file diff --git a/models/multimodal/vision_language_model/idefics3/vllm/offline_inference_vision_language.py b/models/multimodal/vision_language_model/idefics3/vllm/offline_inference_vision_language.py index 958131c6..c2593603 100644 --- a/models/multimodal/vision_language_model/idefics3/vllm/offline_inference_vision_language.py +++ b/models/multimodal/vision_language_model/idefics3/vllm/offline_inference_vision_language.py @@ -1,55 +1,67 @@ -#!/bin/bash -# Copyright (c) 2025, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- +# SPDX-License-Identifier: Apache-2.0 """ -This example shows how to use vLLM for running offline inference -with the correct prompt format on vision language models. +This example shows how to use vLLM for running offline inference with +the correct prompt format on vision language models for text generation. For most models, the prompt format should follow corresponding examples on HuggingFace model repository. """ -import sys -from pathlib import Path import os -sys.path.append(str(Path(__file__).resolve().parent.parent.parent)) -import argparse -import dataclasses -import inspect -from vllm.assets.image import ImageAsset -from vllm.assets.video import VideoAsset +import random +from dataclasses import asdict +from typing import NamedTuple, Optional +from huggingface_hub import snapshot_download from transformers import AutoTokenizer from vllm import LLM, EngineArgs, SamplingParams -from utils import sampling_add_cli_args +from vllm.assets.image import ImageAsset +from vllm.assets.video import VideoAsset +from vllm.lora.request import LoRARequest +from vllm.utils import FlexibleArgumentParser + +class ModelRequestData(NamedTuple): + engine_args: EngineArgs + prompts: list[str] + stop_token_ids: Optional[list[int]] = None + lora_requests: Optional[list[LoRARequest]] = None + + +# NOTE: The default `max_num_seqs` and `max_model_len` may result in OOM on +# lower-end GPUs. +# Unless specified, these settings have been tested to work on a single L4. # Idefics3-8B-Llama3 -def run_idefics3(question: str, engine_params, modality: str): +def run_idefics3(questions: list[str], modality: str) -> ModelRequestData: assert modality == "image" - model_name = "HuggingFaceM4/Idefics3-8B-Llama3" + model_name = "./idefics3" - llm = LLM(**engine_params) - prompt = ( - f"<|begin_of_text|>User:{question}\nAssistant:" + engine_args = EngineArgs( + model=model_name, + max_model_len=8192, + max_num_seqs=2, + enforce_eager=True, + # if you are running out of memory, you can reduce the "longest_edge". + # see: https://huggingface.co/HuggingFaceM4/Idefics3-8B-Llama3#model-optimizations + mm_processor_kwargs={ + "size": { + "longest_edge": 3 * 364 + }, + }, + disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache, ) - stop_token_ids = None - return llm, prompt, stop_token_ids - + prompts = [( + f"<|begin_of_text|>User:{question}\nAssistant:" + ) for question in questions] + return ModelRequestData( + engine_args=engine_args, + prompts=prompts, + ) +model_example_map = { + "idefics3": run_idefics3, +} def get_multi_modal_input(args): """ @@ -60,92 +72,188 @@ def get_multi_modal_input(args): """ if args.modality == "image": # Input image and question - image = ImageAsset("cherry_blossom").pil_image.convert("RGB") - img_question = "What is the content of this image?" + image = ImageAsset("cherry_blossom") \ + .pil_image.convert("RGB") + img_questions = [ + "What is the content of this image?", + "Describe the content of this image in detail.", + "What's in the image?", + "Where is this image taken?", + ] return { "data": image, - "question": img_question, + "questions": img_questions, } if args.modality == "video": # Input video and question video = VideoAsset(name="sample_demo_1.mp4", num_frames=args.num_frames).np_ndarrays - vid_question = "Why is this video funny?" + vid_questions = ["Why is this video funny?"] return { "data": video, - "question": vid_question, + "questions": vid_questions, } msg = f"Modality {args.modality} is not supported." 
raise ValueError(msg) - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument('--num-prompts', - type=int, - default=1, - help='Number of prompts to run.') - parser.add_argument('--modality', - type=str, - default="image", - help='Modality of the input.') - parser.add_argument('--num-frames', - type=int, - default=16, - help='Number of frames to extract from the video.') - parser = EngineArgs.add_cli_args(parser) - parser = sampling_add_cli_args(parser) - args = parser.parse_args() - engine_args = [attr.name for attr in dataclasses.fields(EngineArgs)] - sampling_args = [ - param.name - for param in list( - inspect.signature(SamplingParams).parameters.values() - ) - ] - engine_params = {attr: getattr(args, attr) for attr in engine_args} - sampling_params = { - attr: getattr(args, attr) for attr in sampling_args if args.__contains__(attr) - } - +def apply_image_repeat(image_repeat_prob, num_prompts, data, + prompts: list[str], modality): + """Repeats images with provided probability of "image_repeat_prob". + Used to simulate hit/miss for the MM preprocessor cache. + """ + assert (image_repeat_prob <= 1.0 and image_repeat_prob >= 0) + no_yes = [0, 1] + probs = [1.0 - image_repeat_prob, image_repeat_prob] + + inputs = [] + cur_image = data + for i in range(num_prompts): + if image_repeat_prob is not None: + res = random.choices(no_yes, probs)[0] + if res == 0: + # No repeat => Modify one pixel + cur_image = cur_image.copy() + new_val = (i // 256 // 256, i // 256, i % 256) + cur_image.putpixel((0, 0), new_val) + + inputs.append({ + "prompt": prompts[i % len(prompts)], + "multi_modal_data": { + modality: cur_image + } + }) + + return inputs + + +def main(args): + model = args.model_type + if model not in model_example_map: + raise ValueError(f"Model type {model} is not supported.") + modality = args.modality mm_input = get_multi_modal_input(args) data = mm_input["data"] - question = mm_input["question"] + questions = mm_input["questions"] + + req_data = model_example_map[model](questions, modality) - llm, prompt, stop_token_ids = run_idefics3(question,engine_params,args.modality) - sampling_params['stop_token_ids'] = stop_token_ids + engine_args = asdict(req_data.engine_args) | {"seed": args.seed} + llm = LLM(**engine_args) + + # To maintain code compatibility in this script, we add LoRA here. + # You can also add LoRA using: + # llm.generate(prompts, lora_request=lora_request,...) + if req_data.lora_requests: + for lora_request in req_data.lora_requests: + llm.llm_engine.add_lora(lora_request=lora_request) + + # Don't want to check the flag multiple times, so just hijack `prompts`. + prompts = req_data.prompts if args.use_different_prompt_per_request else [ + req_data.prompts[0] + ] # We set temperature to 0.2 so that outputs can be different # even when all prompts are identical when running batch inference. 
- sampling_params = SamplingParams(**sampling_params) + sampling_params = SamplingParams(temperature=0.2, + max_tokens=64, + stop_token_ids=req_data.stop_token_ids) assert args.num_prompts > 0 if args.num_prompts == 1: # Single inference inputs = { - "prompt": prompt, + "prompt": prompts[0], "multi_modal_data": { modality: data }, } - else: # Batch inference - inputs = [{ - "prompt": prompt, - "multi_modal_data": { - modality: data - }, - } for _ in range(args.num_prompts)] + if args.image_repeat_prob is not None: + # Repeat images with specified probability of "image_repeat_prob" + inputs = apply_image_repeat(args.image_repeat_prob, + args.num_prompts, data, prompts, + modality) + else: + # Use the same image for all prompts + inputs = [{ + "prompt": prompts[i % len(prompts)], + "multi_modal_data": { + modality: data + }, + } for i in range(args.num_prompts)] + + if args.time_generate: + import time + start_time = time.time() + outputs = llm.generate(inputs, sampling_params=sampling_params) + elapsed_time = time.time() - start_time + print("-- generate time = {}".format(elapsed_time)) - outputs = llm.generate(inputs, sampling_params=sampling_params) + else: + outputs = llm.generate(inputs, sampling_params=sampling_params) for o in outputs: generated_text = o.outputs[0].text - print(generated_text) \ No newline at end of file + print(generated_text) + + +if __name__ == "__main__": + parser = FlexibleArgumentParser( + description='Demo on using vLLM for offline inference with ' + 'vision language models for text generation') + parser.add_argument('--model-type', + '-m', + type=str, + default="llava", + choices=model_example_map.keys(), + help='Huggingface "model_type".') + parser.add_argument('--num-prompts', + type=int, + default=4, + help='Number of prompts to run.') + parser.add_argument('--modality', + type=str, + default="image", + choices=['image', 'video'], + help='Modality of the input.') + parser.add_argument('--num-frames', + type=int, + default=16, + help='Number of frames to extract from the video.') + parser.add_argument("--seed", + type=int, + default=None, + help="Set the seed when initializing `vllm.LLM`.") + + parser.add_argument( + '--image-repeat-prob', + type=float, + default=None, + help='Simulates the hit-ratio for multi-modal preprocessor cache' + ' (if enabled)') + + parser.add_argument( + '--disable-mm-preprocessor-cache', + action='store_true', + help='If True, disables caching of multi-modal preprocessor/mapper.') + + parser.add_argument( + '--time-generate', + action='store_true', + help='If True, then print the total generate() call time') + + parser.add_argument( + '--use-different-prompt-per-request', + action='store_true', + help='If True, then use different prompt (with the same multi-modal ' + 'data) for each request.') + + args = parser.parse_args() + main(args) \ No newline at end of file diff --git a/models/multimodal/vision_language_model/llava_next_video_7b/vllm/video-eample-data/sample_demo_1.mp4 b/models/multimodal/vision_language_model/llava_next_video_7b/vllm/video-example-data/sample_demo_1.mp4 old mode 100755 new mode 100644 similarity index 100% rename from models/multimodal/vision_language_model/llava_next_video_7b/vllm/video-eample-data/sample_demo_1.mp4 rename to models/multimodal/vision_language_model/llava_next_video_7b/vllm/video-example-data/sample_demo_1.mp4 diff --git a/models/multimodal/vision_language_model/minicpm_v/vllm/README.md b/models/multimodal/vision_language_model/minicpm_v/vllm/README.md index a404f6ec..ea1c8d74 100644 --- 
a/models/multimodal/vision_language_model/minicpm_v/vllm/README.md +++ b/models/multimodal/vision_language_model/minicpm_v/vllm/README.md @@ -16,13 +16,12 @@ techniques, making it suitable for deployment in resource-constrained environmen ### Prepare Resources -- Model: +- Model: ```bash cp -r ../../vllm_public_assets/ ./ -# Download model from the website and make sure the model's path is "data/Aria" -mkdir data +# Download model from the website and make sure the model's path is "./minicpm_v" ``` ### Install Dependencies @@ -42,8 +41,7 @@ pip install timm==0.9.10 ## Model Inference ```bash -export VLLM_ASSETS_CACHE=../vllm/ -PT_SDPA_ENABLE_HEAD_DIM_PADDING=1 python3 offline_inference_vision_language.py --model data/MiniCPM-V-2 --max-tokens 256 -tp 2 --trust-remote-code --temperature 0.0 +python3 offline_inference_vision_language.py --model-type minicpmv ``` ## Model Results diff --git a/models/multimodal/vision_language_model/minicpm_v/vllm/offline_inference_vision_language.py b/models/multimodal/vision_language_model/minicpm_v/vllm/offline_inference_vision_language.py index 2fc88f46..f6df6f98 100644 --- a/models/multimodal/vision_language_model/minicpm_v/vllm/offline_inference_vision_language.py +++ b/models/multimodal/vision_language_model/minicpm_v/vllm/offline_inference_vision_language.py @@ -1,42 +1,42 @@ -#!/bin/bash -# Copyright (c) 2025, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - +# SPDX-License-Identifier: Apache-2.0 """ -This example shows how to use vLLM for running offline inference -with the correct prompt format on vision language models. +This example shows how to use vLLM for running offline inference with +the correct prompt format on vision language models for text generation. For most models, the prompt format should follow corresponding examples on HuggingFace model repository. """ -import sys -from pathlib import Path import os -sys.path.append(str(Path(__file__).resolve().parent.parent.parent)) -import argparse -import dataclasses -import inspect +import random +from dataclasses import asdict +from typing import NamedTuple, Optional + +from huggingface_hub import snapshot_download from transformers import AutoTokenizer + +from vllm import LLM, EngineArgs, SamplingParams from vllm.assets.image import ImageAsset from vllm.assets.video import VideoAsset -from vllm import LLM, EngineArgs, SamplingParams -from utils import sampling_add_cli_args +from vllm.lora.request import LoRARequest +from vllm.utils import FlexibleArgumentParser + + +class ModelRequestData(NamedTuple): + engine_args: EngineArgs + prompts: list[str] + stop_token_ids: Optional[list[int]] = None + lora_requests: Optional[list[LoRARequest]] = None + + +# NOTE: The default `max_num_seqs` and `max_model_len` may result in OOM on +# lower-end GPUs. +# Unless specified, these settings have been tested to work on a single L4. 
# MiniCPM-V -def run_minicpmv(question, engine_params, model,modality): - assert modality == "image" +def run_minicpmv_base(questions: list[str], modality: str, model_name): + assert modality in ["image", "video"] + # If you want to use `MiniCPM-o-2_6` with audio inputs, check `audio_language.py` # noqa + # 2.0 # The official repo doesn't work yet, so we need to use a fork for now # For more details, please see: See: https://github.com/vllm-project/vllm/pull/4087#issuecomment-2250397630 # noqa @@ -45,10 +45,25 @@ def run_minicpmv(question, engine_params, model,modality): # 2.5 # model_name = "openbmb/MiniCPM-Llama3-V-2_5" - #2.6 - tokenizer = AutoTokenizer.from_pretrained(model, + # 2.6 + # model_name = "openbmb/MiniCPM-V-2_6" + # o2.6 + + # modality supports + # 2.0: image + # 2.5: image + # 2.6: image, video + # o2.6: image, video, audio + # model_name = "openbmb/MiniCPM-o-2_6" + tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) - llm = LLM(**engine_params) + engine_args = EngineArgs( + model=model_name, + max_model_len=4096, + max_num_seqs=2, + trust_remote_code=True, + disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache, + ) # NOTE The stop_token_ids are different for various versions of MiniCPM-V # 2.0 # stop_token_ids = [tokenizer.eos_id] @@ -56,18 +71,38 @@ def run_minicpmv(question, engine_params, model,modality): # 2.5 # stop_token_ids = [tokenizer.eos_id, tokenizer.eot_id] - # 2.6 + # 2.6 / o2.6 stop_tokens = ['<|im_end|>', '<|endoftext|>'] stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens] - messages = [{ - 'role': 'user', - 'content': f'(./)\n{question}' - }] - prompt = tokenizer.apply_chat_template(messages, - tokenize=False, - add_generation_prompt=True) - return llm, prompt, stop_token_ids + modality_placeholder = { + "image": "(./)", + "video": "()", + } + + prompts = [ + tokenizer.apply_chat_template( + [{ + 'role': 'user', + 'content': f"{modality_placeholder[modality]}\n{question}" + }], + tokenize=False, + add_generation_prompt=True) for question in questions + ] + + return ModelRequestData( + engine_args=engine_args, + prompts=prompts, + stop_token_ids=stop_token_ids, + ) + +def run_minicpmv(questions: list[str], modality: str) -> ModelRequestData: + return run_minicpmv_base(questions, modality, "./minicpm_v") + + +model_example_map = { + "minicpmv": run_minicpmv, +} def get_multi_modal_input(args): @@ -79,92 +114,188 @@ def get_multi_modal_input(args): """ if args.modality == "image": # Input image and question - image = ImageAsset("cherry_blossom").pil_image.convert("RGB") - img_question = "What is the content of this image?" + image = ImageAsset("cherry_blossom") \ + .pil_image.convert("RGB") + img_questions = [ + "What is the content of this image?", + "Describe the content of this image in detail.", + "What's in the image?", + "Where is this image taken?", + ] return { "data": image, - "question": img_question, + "questions": img_questions, } if args.modality == "video": # Input video and question video = VideoAsset(name="sample_demo_1.mp4", num_frames=args.num_frames).np_ndarrays - vid_question = "Why is this video funny?" + vid_questions = ["Why is this video funny?"] return { "data": video, - "question": vid_question, + "questions": vid_questions, } msg = f"Modality {args.modality} is not supported." 
raise ValueError(msg) - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument('--num-prompts', - type=int, - default=1, - help='Number of prompts to run.') - parser.add_argument('--modality', - type=str, - default="image", - help='Modality of the input.') - parser.add_argument('--num-frames', - type=int, - default=16, - help='Number of frames to extract from the video.') - parser = EngineArgs.add_cli_args(parser) - parser = sampling_add_cli_args(parser) - args = parser.parse_args() - engine_args = [attr.name for attr in dataclasses.fields(EngineArgs)] - sampling_args = [ - param.name - for param in list( - inspect.signature(SamplingParams).parameters.values() - ) - ] - engine_params = {attr: getattr(args, attr) for attr in engine_args} - sampling_params = { - attr: getattr(args, attr) for attr in sampling_args if args.__contains__(attr) - } - +def apply_image_repeat(image_repeat_prob, num_prompts, data, + prompts: list[str], modality): + """Repeats images with provided probability of "image_repeat_prob". + Used to simulate hit/miss for the MM preprocessor cache. + """ + assert (image_repeat_prob <= 1.0 and image_repeat_prob >= 0) + no_yes = [0, 1] + probs = [1.0 - image_repeat_prob, image_repeat_prob] + + inputs = [] + cur_image = data + for i in range(num_prompts): + if image_repeat_prob is not None: + res = random.choices(no_yes, probs)[0] + if res == 0: + # No repeat => Modify one pixel + cur_image = cur_image.copy() + new_val = (i // 256 // 256, i // 256, i % 256) + cur_image.putpixel((0, 0), new_val) + + inputs.append({ + "prompt": prompts[i % len(prompts)], + "multi_modal_data": { + modality: cur_image + } + }) + + return inputs + + +def main(args): + model = args.model_type + if model not in model_example_map: + raise ValueError(f"Model type {model} is not supported.") + modality = args.modality mm_input = get_multi_modal_input(args) data = mm_input["data"] - question = mm_input["question"] + questions = mm_input["questions"] - llm, prompt, stop_token_ids = run_minicpmv(question,engine_params, args.model, args.modality) - sampling_params['stop_token_ids'] = stop_token_ids + req_data = model_example_map[model](questions, modality) + + engine_args = asdict(req_data.engine_args) | {"seed": args.seed} + llm = LLM(**engine_args) + + # To maintain code compatibility in this script, we add LoRA here. + # You can also add LoRA using: + # llm.generate(prompts, lora_request=lora_request,...) + if req_data.lora_requests: + for lora_request in req_data.lora_requests: + llm.llm_engine.add_lora(lora_request=lora_request) + + # Don't want to check the flag multiple times, so just hijack `prompts`. + prompts = req_data.prompts if args.use_different_prompt_per_request else [ + req_data.prompts[0] + ] # We set temperature to 0.2 so that outputs can be different # even when all prompts are identical when running batch inference. 
- sampling_params = SamplingParams(**sampling_params) + sampling_params = SamplingParams(temperature=0.2, + max_tokens=64, + stop_token_ids=req_data.stop_token_ids) assert args.num_prompts > 0 if args.num_prompts == 1: # Single inference inputs = { - "prompt": prompt, + "prompt": prompts[0], "multi_modal_data": { modality: data }, } - else: # Batch inference - inputs = [{ - "prompt": prompt, - "multi_modal_data": { - modality: data - }, - } for _ in range(args.num_prompts)] + if args.image_repeat_prob is not None: + # Repeat images with specified probability of "image_repeat_prob" + inputs = apply_image_repeat(args.image_repeat_prob, + args.num_prompts, data, prompts, + modality) + else: + # Use the same image for all prompts + inputs = [{ + "prompt": prompts[i % len(prompts)], + "multi_modal_data": { + modality: data + }, + } for i in range(args.num_prompts)] - outputs = llm.generate(inputs, sampling_params=sampling_params) + if args.time_generate: + import time + start_time = time.time() + outputs = llm.generate(inputs, sampling_params=sampling_params) + elapsed_time = time.time() - start_time + print("-- generate time = {}".format(elapsed_time)) + + else: + outputs = llm.generate(inputs, sampling_params=sampling_params) for o in outputs: generated_text = o.outputs[0].text - print(generated_text) \ No newline at end of file + print(generated_text) + + +if __name__ == "__main__": + parser = FlexibleArgumentParser( + description='Demo on using vLLM for offline inference with ' + 'vision language models for text generation') + parser.add_argument('--model-type', + '-m', + type=str, + default="llava", + choices=model_example_map.keys(), + help='Huggingface "model_type".') + parser.add_argument('--num-prompts', + type=int, + default=4, + help='Number of prompts to run.') + parser.add_argument('--modality', + type=str, + default="image", + choices=['image', 'video'], + help='Modality of the input.') + parser.add_argument('--num-frames', + type=int, + default=16, + help='Number of frames to extract from the video.') + parser.add_argument("--seed", + type=int, + default=None, + help="Set the seed when initializing `vllm.LLM`.") + + parser.add_argument( + '--image-repeat-prob', + type=float, + default=None, + help='Simulates the hit-ratio for multi-modal preprocessor cache' + ' (if enabled)') + + parser.add_argument( + '--disable-mm-preprocessor-cache', + action='store_true', + help='If True, disables caching of multi-modal preprocessor/mapper.') + + parser.add_argument( + '--time-generate', + action='store_true', + help='If True, then print the total generate() call time') + + parser.add_argument( + '--use-different-prompt-per-request', + action='store_true', + help='If True, then use different prompt (with the same multi-modal ' + 'data) for each request.') + + args = parser.parse_args() + main(args) \ No newline at end of file diff --git a/tests/model_info.json b/tests/model_info.json index 3ba5dadd..466a81d7 100644 --- a/tests/model_info.json +++ b/tests/model_info.json @@ -7098,7 +7098,7 @@ "github_branch": "", "github_path": "", "datasets": "", - "download_url": "https://huggingface.co/openbmb/MiniCPM-V-2", + "download_url": "https://huggingface.co/openbmb/MiniCPM-V-2_6", "need_third_part": false, "precisions": [ "fp16" diff --git a/tests/run_vllm.py b/tests/run_vllm.py index c6100a40..be795462 100644 --- a/tests/run_vllm.py +++ b/tests/run_vllm.py @@ -229,7 +229,13 @@ def run_nlp_testcase(model): export VLLM_ASSETS_CACHE=../vllm/ python3 offline_inference_vision_language.py --model 
./{model_name} --max-tokens 256 -tp 2 --trust-remote-code --temperature 0.0
         """
-    elif model_name == "h2vol" or model_name == "idefics3":
+    elif model_name == "idefics3":
+        script = f"""
+        set -x
+        cd ../{model['model_path']}
+        python3 offline_inference_vision_language.py --model-type idefics3
+        """
+    elif model_name == "h2vol":
         script = f"""
         set -x
         cd ../{model['model_path']}
@@ -240,8 +246,7 @@ def run_nlp_testcase(model):
         script = f"""
         set -x
         cd ../{model['model_path']}
-        export VLLM_ASSETS_CACHE=../vllm/
-        PT_SDPA_ENABLE_HEAD_DIM_PADDING=1 python3 offline_inference_vision_language.py --model ./{model_name} --max-tokens 256 -tp 2 --trust-remote-code --temperature 0.0
+        python3 offline_inference_vision_language.py --model-type minicpmv
         """
     elif model_name == "llama-3.2":
         script = f"""
-- 
Gitee
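
For reference, a minimal end-to-end run of the two updated entry points, assuming the weights are downloaded into the local directories the patched READMEs and scripts expect (`./idefics3` for `HuggingFaceM4/Idefics3-8B-Llama3`, `./minicpm_v` for `openbmb/MiniCPM-V-2_6`). The `huggingface-cli download` step is only illustrative; any method that places the weights at those paths works, and the remaining commands and flags come from the patch itself:

```bash
# Idefics3: run from models/multimodal/vision_language_model/idefics3/vllm
cp -r ../../vllm_public_assets/ ./          # demo image assets used by the prompts
pip install transformers==4.50.3            # version pinned by the patched prepare.sh
huggingface-cli download HuggingFaceM4/Idefics3-8B-Llama3 --local-dir ./idefics3   # illustrative download step
python3 offline_inference_vision_language.py --model-type idefics3 --modality image --num-prompts 4

# MiniCPM-V: run from models/multimodal/vision_language_model/minicpm_v/vllm
cp -r ../../vllm_public_assets/ ./
pip install timm==0.9.10
huggingface-cli download openbmb/MiniCPM-V-2_6 --local-dir ./minicpm_v             # illustrative download step
python3 offline_inference_vision_language.py --model-type minicpmv --modality image --num-prompts 4
```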