diff --git a/models/multimodal/vision_language_model/aria/vllm/README.md b/models/multimodal/vision_language_model/aria/vllm/README.md new file mode 100644 index 0000000000000000000000000000000000000000..ddcf3f42ac81ac93755912ad263fba885a786b74 --- /dev/null +++ b/models/multimodal/vision_language_model/aria/vllm/README.md @@ -0,0 +1,48 @@ +# Aria + +## Model Description + +Aria is a multimodal native MoE model. It features: +- State-of-the-art performance on various multimodal and language tasks, superior in video and document understanding; +- Long multimodal context window of 64K tokens; +- 3.9B activated parameters per token, enabling fast inference speed and low fine-tuning cost. + +## Supported Environments + +| GPU | [IXUCA SDK](https://gitee.com/deep-spark/deepspark#%E5%A4%A9%E6%95%B0%E6%99%BA%E7%AE%97%E8%BD%AF%E4%BB%B6%E6%A0%88-ixuca) | vLLM | Release | +|--------|-----------|---------|---------| +| | | >=0.6.6 | | + +## Model Preparation + +### Prepare Resources + +- Model: + +```bash +cp -r ../../vllm_public_assets/ ./ + +# Download model from the website and make sure the model's path is "data/Aria" +mkdir data +``` + +### Install Dependencies + +In order to run the model smoothly, you need to get the sdk from [resource center](https://support.iluvatar.com/#/ProductLine?id=2) of Iluvatar CoreX official website. + +```bash +# Install libGL +## CentOS +yum install -y mesa-libGL +## Ubuntu +apt install -y libgl1-mesa-glx +``` + +## Model Inference + +```bash +export VLLM_ASSETS_CACHE=../vllm/ +python3 offline_inference_vision_language.py --model data/Aria --max-tokens 256 -tp 4 --trust-remote-code --temperature 0.0 --dtype bfloat16 --tokenizer-mode slow +``` + +## Model Results \ No newline at end of file diff --git a/models/multimodal/vision_language_model/aria/vllm/ci/prepare.sh b/models/multimodal/vision_language_model/aria/vllm/ci/prepare.sh new file mode 100644 index 0000000000000000000000000000000000000000..7232aa2996f379a961cf931968a1319fb70ac091 --- /dev/null +++ b/models/multimodal/vision_language_model/aria/vllm/ci/prepare.sh @@ -0,0 +1,27 @@ +#!/bin/bash +# Copyright (c) 2025, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +set -x +ID=$(grep -oP '(?<=^ID=).+' /etc/os-release | tr -d '"') +if [[ ${ID} == "ubuntu" ]]; then + apt install -y libgl1-mesa-glx +elif [[ ${ID} == "centos" ]]; then + yum install -y mesa-libGL +else + echo "Not Support Os" +fi + +cp -r ../../vllm_public_assets/ ./ diff --git a/models/multimodal/vision_language_model/aria/vllm/offline_inference_vision_language.py b/models/multimodal/vision_language_model/aria/vllm/offline_inference_vision_language.py new file mode 100644 index 0000000000000000000000000000000000000000..7b795615fab8f06a1ade352bef56ae280c5d03bd --- /dev/null +++ b/models/multimodal/vision_language_model/aria/vllm/offline_inference_vision_language.py @@ -0,0 +1,147 @@ +#!/bin/bash +# Copyright (c) 2025, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. 
+# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +This example shows how to use vLLM for running offline inference +with the correct prompt format on vision language models. + +For most models, the prompt format should follow corresponding examples +on HuggingFace model repository. +""" +import sys +from pathlib import Path +import os +sys.path.append(str(Path(__file__).resolve().parent.parent.parent)) +import argparse +import dataclasses +import inspect +from vllm.assets.image import ImageAsset +from vllm.assets.video import VideoAsset + +from vllm import LLM, EngineArgs, SamplingParams +from utils import sampling_add_cli_args + +# Aria +def run_aria(question: str, engine_params, modality: str): + assert modality == "image" + + llm = LLM(**engine_params) + + prompt = (f"<|im_start|>user\n<|img|>\n{question}" + "<|im_end|>\n<|im_start|>assistant\n") + + stop_token_ids = [93532, 93653, 944, 93421, 1019, 93653, 93519] + return llm, prompt, stop_token_ids + + +def get_multi_modal_input(args): + """ + return { + "data": image or video, + "question": question, + } + """ + if args.modality == "image": + # Input image and question + image = ImageAsset("cherry_blossom").pil_image.convert("RGB") + img_question = "What is the content of this image?" + + return { + "data": image, + "question": img_question, + } + + if args.modality == "video": + # Input video and question + video = VideoAsset(name="sample_demo_1.mp4", + num_frames=args.num_frames).np_ndarrays + vid_question = "Why is this video funny?" + + return { + "data": video, + "question": vid_question, + } + + msg = f"Modality {args.modality} is not supported." + raise ValueError(msg) + + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument('--num-prompts', + type=int, + default=1, + help='Number of prompts to run.') + parser.add_argument('--modality', + type=str, + default="image", + help='Modality of the input.') + parser.add_argument('--num-frames', + type=int, + default=16, + help='Number of frames to extract from the video.') + parser = EngineArgs.add_cli_args(parser) + parser = sampling_add_cli_args(parser) + args = parser.parse_args() + engine_args = [attr.name for attr in dataclasses.fields(EngineArgs)] + sampling_args = [ + param.name + for param in list( + inspect.signature(SamplingParams).parameters.values() + ) + ] + engine_params = {attr: getattr(args, attr) for attr in engine_args} + sampling_params = { + attr: getattr(args, attr) for attr in sampling_args if args.__contains__(attr) + } + + modality = args.modality + mm_input = get_multi_modal_input(args) + data = mm_input["data"] + question = mm_input["question"] + + llm, prompt, stop_token_ids = run_aria(question,engine_params, args.modality) + sampling_params['stop_token_ids'] = stop_token_ids + + # We set temperature to 0.2 so that outputs can be different + # even when all prompts are identical when running batch inference. 
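+    # Note: the temperature here is whatever --temperature supplies on the
+    # command line (the README invocation uses 0.0, i.e. greedy decoding);
+    # the collected CLI options plus Aria's stop_token_ids are packed into
+    # a single SamplingParams object below.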
+ sampling_params = SamplingParams(**sampling_params) + + assert args.num_prompts > 0 + if args.num_prompts == 1: + # Single inference + inputs = { + "prompt": prompt, + "multi_modal_data": { + modality: data + }, + } + + else: + # Batch inference + inputs = [{ + "prompt": prompt, + "multi_modal_data": { + modality: data + }, + } for _ in range(args.num_prompts)] + + outputs = llm.generate(inputs, sampling_params=sampling_params) + + for o in outputs: + generated_text = o.outputs[0].text + print(generated_text) \ No newline at end of file diff --git a/models/multimodal/vision_language_model/h2vol/vllm/README.md b/models/multimodal/vision_language_model/h2vol/vllm/README.md new file mode 100644 index 0000000000000000000000000000000000000000..759871c46373731f5d856a856db57e7b7ab7abf7 --- /dev/null +++ b/models/multimodal/vision_language_model/h2vol/vllm/README.md @@ -0,0 +1,45 @@ +# H2ovl + +## Model Description + +The H2OVL-Mississippi-800M is a compact yet powerful vision-language model from H2O.ai, featuring 0.8 billion parameters. Despite its small size, it delivers state-of-the-art performance in text recognition, excelling in the Text Recognition segment of OCRBench and outperforming much larger models in this domain. Built upon the robust architecture of our H2O-Danube language models, the Mississippi-800M extends their capabilities by seamlessly integrating vision and language tasks. + +## Supported Environments + +| GPU | [IXUCA SDK](https://gitee.com/deep-spark/deepspark#%E5%A4%A9%E6%95%B0%E6%99%BA%E7%AE%97%E8%BD%AF%E4%BB%B6%E6%A0%88-ixuca) | vLLM | Release | +|--------|-----------|---------|---------| +| | | >=0.6.4 | | + +## Model Preparation + +### Prepare Resources + +- Model: + +```bash +cp -r ../../vllm_public_assets/ ./ + +# Download model from the website and make sure the model's path is "data/Aria" +mkdir data +``` + +### Install Dependencies + +In order to run the model smoothly, you need to get the sdk from [resource center](https://support.iluvatar.com/#/ProductLine?id=2) of Iluvatar CoreX official website. + +```bash +# Install libGL +## CentOS +yum install -y mesa-libGL +## Ubuntu +apt install -y libgl1-mesa-glx +``` + +## Model Inference + +```bash +export VLLM_ASSETS_CACHE=../vllm/ +python3 offline_inference_vision_language.py --model data/h2ovl-mississippi-800m -tp 1 --max-tokens 256 --trust-remote-code --temperature 0.0 --disable-mm-preprocessor-cache +``` + +## Model Results \ No newline at end of file diff --git a/models/multimodal/vision_language_model/h2vol/vllm/ci/prepare.sh b/models/multimodal/vision_language_model/h2vol/vllm/ci/prepare.sh new file mode 100644 index 0000000000000000000000000000000000000000..7232aa2996f379a961cf931968a1319fb70ac091 --- /dev/null +++ b/models/multimodal/vision_language_model/h2vol/vllm/ci/prepare.sh @@ -0,0 +1,27 @@ +#!/bin/bash +# Copyright (c) 2025, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
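+
+# This CI helper installs the system OpenGL library (libGL) that image-handling
+# dependencies such as OpenCV typically require, then copies the shared
+# vllm_public_assets/ images next to the inference script.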
+ +set -x +ID=$(grep -oP '(?<=^ID=).+' /etc/os-release | tr -d '"') +if [[ ${ID} == "ubuntu" ]]; then + apt install -y libgl1-mesa-glx +elif [[ ${ID} == "centos" ]]; then + yum install -y mesa-libGL +else + echo "Not Support Os" +fi + +cp -r ../../vllm_public_assets/ ./ diff --git a/models/multimodal/vision_language_model/h2vol/vllm/offline_inference_vision_language.py b/models/multimodal/vision_language_model/h2vol/vllm/offline_inference_vision_language.py new file mode 100644 index 0000000000000000000000000000000000000000..0acace7f241d527ec132a4f059a45d8f70d48abb --- /dev/null +++ b/models/multimodal/vision_language_model/h2vol/vllm/offline_inference_vision_language.py @@ -0,0 +1,155 @@ +#!/bin/bash +# Copyright (c) 2025, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +This example shows how to use vLLM for running offline inference +with the correct prompt format on vision language models. + +For most models, the prompt format should follow corresponding examples +on HuggingFace model repository. +""" +import sys +from pathlib import Path +import os +sys.path.append(str(Path(__file__).resolve().parent.parent.parent)) +import argparse +import dataclasses +import inspect +from vllm.assets.image import ImageAsset +from vllm.assets.video import VideoAsset + +from transformers import AutoTokenizer + +from vllm import LLM, EngineArgs, SamplingParams +from utils import sampling_add_cli_args + +# H2OVL-Mississippi +def run_h2ovl(question: str, engine_params, model_name,modality: str): + assert modality == "image" + llm = LLM(**engine_params) + + tokenizer = AutoTokenizer.from_pretrained(model_name, + trust_remote_code=True) + messages = [{'role': 'user', 'content': f"\n{question}"}] + prompt = tokenizer.apply_chat_template(messages, + tokenize=False, + add_generation_prompt=True) + + # Stop tokens for H2OVL-Mississippi + # https://huggingface.co/h2oai/h2ovl-mississippi-2b + stop_token_ids = [tokenizer.eos_token_id] + return llm, prompt, stop_token_ids + + + +def get_multi_modal_input(args): + """ + return { + "data": image or video, + "question": question, + } + """ + if args.modality == "image": + # Input image and question + image = ImageAsset("cherry_blossom").pil_image.convert("RGB") + img_question = "What is the content of this image?" + + return { + "data": image, + "question": img_question, + } + + if args.modality == "video": + # Input video and question + video = VideoAsset(name="sample_demo_1.mp4", + num_frames=args.num_frames).np_ndarrays + vid_question = "Why is this video funny?" + + return { + "data": video, + "question": vid_question, + } + + msg = f"Modality {args.modality} is not supported." 
+ raise ValueError(msg) + + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument('--num-prompts', + type=int, + default=1, + help='Number of prompts to run.') + parser.add_argument('--modality', + type=str, + default="image", + help='Modality of the input.') + parser.add_argument('--num-frames', + type=int, + default=16, + help='Number of frames to extract from the video.') + parser = EngineArgs.add_cli_args(parser) + parser = sampling_add_cli_args(parser) + args = parser.parse_args() + engine_args = [attr.name for attr in dataclasses.fields(EngineArgs)] + sampling_args = [ + param.name + for param in list( + inspect.signature(SamplingParams).parameters.values() + ) + ] + engine_params = {attr: getattr(args, attr) for attr in engine_args} + sampling_params = { + attr: getattr(args, attr) for attr in sampling_args if args.__contains__(attr) + } + + modality = args.modality + mm_input = get_multi_modal_input(args) + data = mm_input["data"] + question = mm_input["question"] + + llm, prompt, stop_token_ids = run_h2ovl(question,engine_params,args.model,args.modality) + sampling_params['stop_token_ids'] = stop_token_ids + + # We set temperature to 0.2 so that outputs can be different + # even when all prompts are identical when running batch inference. + sampling_params = SamplingParams(**sampling_params) + + assert args.num_prompts > 0 + if args.num_prompts == 1: + # Single inference + inputs = { + "prompt": prompt, + "multi_modal_data": { + modality: data + }, + } + + else: + # Batch inference + inputs = [{ + "prompt": prompt, + "multi_modal_data": { + modality: data + }, + } for _ in range(args.num_prompts)] + + outputs = llm.generate(inputs, sampling_params=sampling_params) + + for o in outputs: + generated_text = o.outputs[0].text + print(generated_text) \ No newline at end of file diff --git a/models/multimodal/vision_language_model/idefics3/vllm/README.md b/models/multimodal/vision_language_model/idefics3/vllm/README.md new file mode 100644 index 0000000000000000000000000000000000000000..340665fc85937ca2fdbaa72757ca0005b25a43df --- /dev/null +++ b/models/multimodal/vision_language_model/idefics3/vllm/README.md @@ -0,0 +1,45 @@ +# Idefics3 + +## Model Description + +Idefics3 is an open multimodal model that accepts arbitrary sequences of image and text inputs and produces text outputs. The model can answer questions about images, describe visual content, create stories grounded on multiple images, or simply behave as a pure language model without visual inputs. It improves upon Idefics1 and Idefics2, significantly enhancing capabilities around OCR, document understanding and visual reasoning. + +## Supported Environments + +| GPU | [IXUCA SDK](https://gitee.com/deep-spark/deepspark#%E5%A4%A9%E6%95%B0%E6%99%BA%E7%AE%97%E8%BD%AF%E4%BB%B6%E6%A0%88-ixuca) | vLLM | Release | +|--------|-----------|---------|---------| +| | | >=0.6.4 | | + +## Model Preparation + +### Prepare Resources + +- Model: + +```bash +cp -r ../../vllm_public_assets/ ./ + +# Download model from the website and make sure the model's path is "data/Aria" +mkdir data +``` + +### Install Dependencies + +In order to run the model smoothly, you need to get the sdk from [resource center](https://support.iluvatar.com/#/ProductLine?id=2) of Iluvatar CoreX official website. 
+ +```bash +# Install libGL +## CentOS +yum install -y mesa-libGL +## Ubuntu +apt install -y libgl1-mesa-glx +``` + +## Model Inference + +```bash +export VLLM_ASSETS_CACHE=../vllm/ +python3 offline_inference_vision_language.py --model data/Idefics3-8B-Llama3 -tp 4 --max-tokens 256 --trust-remote-code --temperature 0.0 --disable-mm-preprocessor-cache +``` + +## Model Results \ No newline at end of file diff --git a/models/multimodal/vision_language_model/idefics3/vllm/ci/prepare.sh b/models/multimodal/vision_language_model/idefics3/vllm/ci/prepare.sh new file mode 100644 index 0000000000000000000000000000000000000000..7232aa2996f379a961cf931968a1319fb70ac091 --- /dev/null +++ b/models/multimodal/vision_language_model/idefics3/vllm/ci/prepare.sh @@ -0,0 +1,27 @@ +#!/bin/bash +# Copyright (c) 2025, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +set -x +ID=$(grep -oP '(?<=^ID=).+' /etc/os-release | tr -d '"') +if [[ ${ID} == "ubuntu" ]]; then + apt install -y libgl1-mesa-glx +elif [[ ${ID} == "centos" ]]; then + yum install -y mesa-libGL +else + echo "Not Support Os" +fi + +cp -r ../../vllm_public_assets/ ./ diff --git a/models/multimodal/vision_language_model/idefics3/vllm/offline_inference_vision_language.py b/models/multimodal/vision_language_model/idefics3/vllm/offline_inference_vision_language.py new file mode 100644 index 0000000000000000000000000000000000000000..958131c646eb2e3f741257b4883ba0fcc04d8840 --- /dev/null +++ b/models/multimodal/vision_language_model/idefics3/vllm/offline_inference_vision_language.py @@ -0,0 +1,151 @@ +#!/bin/bash +# Copyright (c) 2025, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +This example shows how to use vLLM for running offline inference +with the correct prompt format on vision language models. + +For most models, the prompt format should follow corresponding examples +on HuggingFace model repository. 
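+
+In this Idefics3 variant the image is passed through vLLM's multi_modal_data
+input alongside a plain-text prompt built in run_idefics3.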
+""" +import sys +from pathlib import Path +import os +sys.path.append(str(Path(__file__).resolve().parent.parent.parent)) +import argparse +import dataclasses +import inspect +from vllm.assets.image import ImageAsset +from vllm.assets.video import VideoAsset + +from transformers import AutoTokenizer + +from vllm import LLM, EngineArgs, SamplingParams +from utils import sampling_add_cli_args + +# Idefics3-8B-Llama3 +def run_idefics3(question: str, engine_params, modality: str): + assert modality == "image" + model_name = "HuggingFaceM4/Idefics3-8B-Llama3" + + llm = LLM(**engine_params) + prompt = ( + f"<|begin_of_text|>User:{question}\nAssistant:" + ) + stop_token_ids = None + return llm, prompt, stop_token_ids + + + + +def get_multi_modal_input(args): + """ + return { + "data": image or video, + "question": question, + } + """ + if args.modality == "image": + # Input image and question + image = ImageAsset("cherry_blossom").pil_image.convert("RGB") + img_question = "What is the content of this image?" + + return { + "data": image, + "question": img_question, + } + + if args.modality == "video": + # Input video and question + video = VideoAsset(name="sample_demo_1.mp4", + num_frames=args.num_frames).np_ndarrays + vid_question = "Why is this video funny?" + + return { + "data": video, + "question": vid_question, + } + + msg = f"Modality {args.modality} is not supported." + raise ValueError(msg) + + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument('--num-prompts', + type=int, + default=1, + help='Number of prompts to run.') + parser.add_argument('--modality', + type=str, + default="image", + help='Modality of the input.') + parser.add_argument('--num-frames', + type=int, + default=16, + help='Number of frames to extract from the video.') + parser = EngineArgs.add_cli_args(parser) + parser = sampling_add_cli_args(parser) + args = parser.parse_args() + engine_args = [attr.name for attr in dataclasses.fields(EngineArgs)] + sampling_args = [ + param.name + for param in list( + inspect.signature(SamplingParams).parameters.values() + ) + ] + engine_params = {attr: getattr(args, attr) for attr in engine_args} + sampling_params = { + attr: getattr(args, attr) for attr in sampling_args if args.__contains__(attr) + } + + modality = args.modality + mm_input = get_multi_modal_input(args) + data = mm_input["data"] + question = mm_input["question"] + + llm, prompt, stop_token_ids = run_idefics3(question,engine_params,args.modality) + sampling_params['stop_token_ids'] = stop_token_ids + + # We set temperature to 0.2 so that outputs can be different + # even when all prompts are identical when running batch inference. 
+ sampling_params = SamplingParams(**sampling_params) + + assert args.num_prompts > 0 + if args.num_prompts == 1: + # Single inference + inputs = { + "prompt": prompt, + "multi_modal_data": { + modality: data + }, + } + + else: + # Batch inference + inputs = [{ + "prompt": prompt, + "multi_modal_data": { + modality: data + }, + } for _ in range(args.num_prompts)] + + outputs = llm.generate(inputs, sampling_params=sampling_params) + + for o in outputs: + generated_text = o.outputs[0].text + print(generated_text) \ No newline at end of file diff --git a/models/multimodal/vision_language_model/minicpm_v/vllm/README.md b/models/multimodal/vision_language_model/minicpm_v/vllm/README.md new file mode 100644 index 0000000000000000000000000000000000000000..092f54d0b0e3e21e7921071940105ce84c338661 --- /dev/null +++ b/models/multimodal/vision_language_model/minicpm_v/vllm/README.md @@ -0,0 +1,47 @@ +# MiniCPM-V-2 + +## Model Description + +MiniCPM V2 is a compact and efficient language model designed for various natural language processing (NLP) tasks. Building on its predecessor, MiniCPM-V-1, this model integrates advancements in architecture and optimization techniques, making it suitable for deployment in resource-constrained environments.s + +## Supported Environments + +| GPU | [IXUCA SDK](https://gitee.com/deep-spark/deepspark#%E5%A4%A9%E6%95%B0%E6%99%BA%E7%AE%97%E8%BD%AF%E4%BB%B6%E6%A0%88-ixuca) | Release | +|--------|-----------|---------| +| MR-V100 | 4.2.0 | 25.06 | + +## Model Preparation + +### Prepare Resources + +- Model: + +```bash +cp -r ../../vllm_public_assets/ ./ + +# Download model from the website and make sure the model's path is "data/Aria" +mkdir data +``` + +### Install Dependencies + +In order to run the model smoothly, you need to get the sdk from [resource center](https://support.iluvatar.com/#/ProductLine?id=2) of Iluvatar CoreX official website. + +```bash +# Install libGL +## CentOS +yum install -y mesa-libGL +## Ubuntu +apt install -y libgl1-mesa-glx + +pip install timm==0.9.10 +``` + +## Model Inference + +```bash +export VLLM_ASSETS_CACHE=../vllm/ +PT_SDPA_ENABLE_HEAD_DIM_PADDING=1 python3 offline_inference_vision_language.py --model data/MiniCPM-V-2 --max-tokens 256 -tp 2 --trust-remote-code --temperature 0.0 +``` + +## Model Results \ No newline at end of file diff --git a/models/multimodal/vision_language_model/minicpm_v/vllm/ci/prepare.sh b/models/multimodal/vision_language_model/minicpm_v/vllm/ci/prepare.sh new file mode 100644 index 0000000000000000000000000000000000000000..0527fa6e7217d9777faa685be5428772d7011bfd --- /dev/null +++ b/models/multimodal/vision_language_model/minicpm_v/vllm/ci/prepare.sh @@ -0,0 +1,28 @@ +#!/bin/bash +# Copyright (c) 2025, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
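+
+# Besides the OpenGL runtime, MiniCPM-V's remote code expects the timm
+# package, which is pinned to 0.9.10 at the end of this script.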
+ +set -x +ID=$(grep -oP '(?<=^ID=).+' /etc/os-release | tr -d '"') +if [[ ${ID} == "ubuntu" ]]; then + apt install -y libgl1-mesa-glx +elif [[ ${ID} == "centos" ]]; then + yum install -y mesa-libGL +else + echo "Not Support Os" +fi + +cp -r ../../vllm_public_assets/ ./ +pip install timm==0.9.10 diff --git a/models/multimodal/vision_language_model/minicpm_v/vllm/offline_inference_vision_language.py b/models/multimodal/vision_language_model/minicpm_v/vllm/offline_inference_vision_language.py new file mode 100644 index 0000000000000000000000000000000000000000..2fc88f4695bf32cc400dbb51ea5dae4c3fb8b11a --- /dev/null +++ b/models/multimodal/vision_language_model/minicpm_v/vllm/offline_inference_vision_language.py @@ -0,0 +1,170 @@ +#!/bin/bash +# Copyright (c) 2025, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +This example shows how to use vLLM for running offline inference +with the correct prompt format on vision language models. + +For most models, the prompt format should follow corresponding examples +on HuggingFace model repository. +""" +import sys +from pathlib import Path +import os +sys.path.append(str(Path(__file__).resolve().parent.parent.parent)) +import argparse +import dataclasses +import inspect +from transformers import AutoTokenizer +from vllm.assets.image import ImageAsset +from vllm.assets.video import VideoAsset +from vllm import LLM, EngineArgs, SamplingParams +from utils import sampling_add_cli_args + +# MiniCPM-V +def run_minicpmv(question, engine_params, model,modality): + assert modality == "image" + # 2.0 + # The official repo doesn't work yet, so we need to use a fork for now + # For more details, please see: See: https://github.com/vllm-project/vllm/pull/4087#issuecomment-2250397630 # noqa + # model_name = "HwwwH/MiniCPM-V-2" + + # 2.5 + # model_name = "openbmb/MiniCPM-Llama3-V-2_5" + + #2.6 + tokenizer = AutoTokenizer.from_pretrained(model, + trust_remote_code=True) + llm = LLM(**engine_params) + # NOTE The stop_token_ids are different for various versions of MiniCPM-V + # 2.0 + # stop_token_ids = [tokenizer.eos_id] + + # 2.5 + # stop_token_ids = [tokenizer.eos_id, tokenizer.eot_id] + + # 2.6 + stop_tokens = ['<|im_end|>', '<|endoftext|>'] + stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens] + + messages = [{ + 'role': 'user', + 'content': f'(./)\n{question}' + }] + prompt = tokenizer.apply_chat_template(messages, + tokenize=False, + add_generation_prompt=True) + return llm, prompt, stop_token_ids + + +def get_multi_modal_input(args): + """ + return { + "data": image or video, + "question": question, + } + """ + if args.modality == "image": + # Input image and question + image = ImageAsset("cherry_blossom").pil_image.convert("RGB") + img_question = "What is the content of this image?" 
+ + return { + "data": image, + "question": img_question, + } + + if args.modality == "video": + # Input video and question + video = VideoAsset(name="sample_demo_1.mp4", + num_frames=args.num_frames).np_ndarrays + vid_question = "Why is this video funny?" + + return { + "data": video, + "question": vid_question, + } + + msg = f"Modality {args.modality} is not supported." + raise ValueError(msg) + + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument('--num-prompts', + type=int, + default=1, + help='Number of prompts to run.') + parser.add_argument('--modality', + type=str, + default="image", + help='Modality of the input.') + parser.add_argument('--num-frames', + type=int, + default=16, + help='Number of frames to extract from the video.') + parser = EngineArgs.add_cli_args(parser) + parser = sampling_add_cli_args(parser) + args = parser.parse_args() + engine_args = [attr.name for attr in dataclasses.fields(EngineArgs)] + sampling_args = [ + param.name + for param in list( + inspect.signature(SamplingParams).parameters.values() + ) + ] + engine_params = {attr: getattr(args, attr) for attr in engine_args} + sampling_params = { + attr: getattr(args, attr) for attr in sampling_args if args.__contains__(attr) + } + + modality = args.modality + mm_input = get_multi_modal_input(args) + data = mm_input["data"] + question = mm_input["question"] + + llm, prompt, stop_token_ids = run_minicpmv(question,engine_params, args.model, args.modality) + sampling_params['stop_token_ids'] = stop_token_ids + + # We set temperature to 0.2 so that outputs can be different + # even when all prompts are identical when running batch inference. + sampling_params = SamplingParams(**sampling_params) + + assert args.num_prompts > 0 + if args.num_prompts == 1: + # Single inference + inputs = { + "prompt": prompt, + "multi_modal_data": { + modality: data + }, + } + + else: + # Batch inference + inputs = [{ + "prompt": prompt, + "multi_modal_data": { + modality: data + }, + } for _ in range(args.num_prompts)] + + outputs = llm.generate(inputs, sampling_params=sampling_params) + + for o in outputs: + generated_text = o.outputs[0].text + print(generated_text) \ No newline at end of file diff --git a/models/multimodal/vision_language_model/mllama/vllm/README.md b/models/multimodal/vision_language_model/mllama/vllm/README.md new file mode 100644 index 0000000000000000000000000000000000000000..186af2bbb027def8cf143faa7b72e2f0a37d48f3 --- /dev/null +++ b/models/multimodal/vision_language_model/mllama/vllm/README.md @@ -0,0 +1,46 @@ +# Mllama + +## Model Description + +The Llama 3.2 collection of multilingual large language models (LLMs) is a collection of pretrained and instruction-tuned generative models in 1B and 3B sizes (text in/text out). The Llama 3.2 instruction-tuned text only models are optimized for multilingual dialogue use cases, including agentic retrieval and summarization tasks. They outperform many of the available open source and closed chat models on common industry benchmarks. 
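+
+The vision-capable members of this collection (the 11B and 90B Llama 3.2
+Vision models, served in vLLM as the Mllama architecture) additionally accept
+image inputs alongside text, which is what this example exercises.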
+ +## Supported Environments + +| GPU | [IXUCA SDK](https://gitee.com/deep-spark/deepspark#%E5%A4%A9%E6%95%B0%E6%99%BA%E7%AE%97%E8%BD%AF%E4%BB%B6%E6%A0%88-ixuca) | Release | +|--------|-----------|---------| +| MR-V100 | 4.2.0 | 25.06 | + +## Model Preparation + +### Prepare Resources + +- Model: + +```bash +cp -r ../../vllm_public_assets/ ./ + +# Download model from the website and make sure the model's path is "data/Aria" +mkdir data +``` + +### Install Dependencies + +In order to run the model smoothly, you need to get the sdk from [resource center](https://support.iluvatar.com/#/ProductLine?id=2) of Iluvatar CoreX official website. + +```bash +# Install libGL +## CentOS +yum install -y mesa-libGL +## Ubuntu +apt install -y libgl1-mesa-glx +``` + +## Model Inference + +```bash +export VLLM_ASSETS_CACHE=../vllm/ +export VLLM_FORCE_NCCL_COMM=1 +python3 offline_inference_vision_language.py --model data/LLamaV3.2 --max-tokens 256 -tp 2 --trust-remote-code --temperature 0.0 --max-model-len 8192 --max-num-seqs 16 +``` + +## Model Results \ No newline at end of file diff --git a/models/multimodal/vision_language_model/mllama/vllm/ci/prepare.sh b/models/multimodal/vision_language_model/mllama/vllm/ci/prepare.sh new file mode 100644 index 0000000000000000000000000000000000000000..7232aa2996f379a961cf931968a1319fb70ac091 --- /dev/null +++ b/models/multimodal/vision_language_model/mllama/vllm/ci/prepare.sh @@ -0,0 +1,27 @@ +#!/bin/bash +# Copyright (c) 2025, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +set -x +ID=$(grep -oP '(?<=^ID=).+' /etc/os-release | tr -d '"') +if [[ ${ID} == "ubuntu" ]]; then + apt install -y libgl1-mesa-glx +elif [[ ${ID} == "centos" ]]; then + yum install -y mesa-libGL +else + echo "Not Support Os" +fi + +cp -r ../../vllm_public_assets/ ./ diff --git a/models/multimodal/vision_language_model/mllama/vllm/offline_inference_vision_language.py b/models/multimodal/vision_language_model/mllama/vllm/offline_inference_vision_language.py new file mode 100644 index 0000000000000000000000000000000000000000..055a975fa72c68f370b6d7692f212a1ebb5c231c --- /dev/null +++ b/models/multimodal/vision_language_model/mllama/vllm/offline_inference_vision_language.py @@ -0,0 +1,137 @@ +#!/bin/bash +# Copyright (c) 2025, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +""" +This example shows how to use vLLM for running offline inference +with the correct prompt format on vision language models. + +For most models, the prompt format should follow corresponding examples +on HuggingFace model repository. +""" +import sys +from pathlib import Path +import os +sys.path.append(str(Path(__file__).resolve().parent.parent.parent)) +import argparse +import dataclasses +import inspect +from vllm.assets.image import ImageAsset +from vllm.assets.video import VideoAsset + + +from vllm import LLM, EngineArgs, SamplingParams +import sys +from pathlib import Path +import os +from utils import sampling_add_cli_args + +# LLama 3.2 +def run_mllama(question,engine_params,modality): + assert modality == "image" + + # Note: The default setting of max_num_seqs (256) and + # max_model_len (131072) for this model may cause OOM. + # You may lower either to run this example on lower-end GPUs. + + # The configuration below has been confirmed to launch on a single L40 GPU. + llm = LLM(**engine_params) + + prompt = f"<|image|><|begin_of_text|>{question}" + stop_token_ids = None + return llm, prompt, stop_token_ids + + +def get_multi_modal_input(args): + """ + return { + "data": image, + "question": question, + } + """ + if args.modality == "image": + # Input image and question + image = ImageAsset("cherry_blossom").pil_image.convert("RGB") + img_question = "What is the content of this image?" + + return { + "data": image, + "question": img_question, + } + + msg = f"Modality {args.modality} is not supported." + raise ValueError(msg) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument('--num-prompts', + type=int, + default=1, + help='Number of prompts to run.') + parser.add_argument('--modality', + type=str, + default="image", + help='Modality of the input.') + parser = EngineArgs.add_cli_args(parser) + parser = sampling_add_cli_args(parser) + args = parser.parse_args() + engine_args = [attr.name for attr in dataclasses.fields(EngineArgs)] + sampling_args = [ + param.name + for param in list( + inspect.signature(SamplingParams).parameters.values() + ) + ] + engine_params = {attr: getattr(args, attr) for attr in engine_args} + sampling_params = { + attr: getattr(args, attr) for attr in sampling_args if args.__contains__(attr) + } + + modality = args.modality + mm_input = get_multi_modal_input(args) + data = mm_input["data"] + question = mm_input["question"] + + llm, prompt, stop_token_ids = run_mllama(question,engine_params,args.modality) + sampling_params['stop_token_ids'] = stop_token_ids + + # We set temperature to 0.2 so that outputs can be different + # even when all prompts are identical when running batch inference. 
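+    # When --num-prompts is greater than 1, the same prompt/image pair is
+    # simply replicated below to form a batch.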
+ sampling_params = SamplingParams(**sampling_params) + + assert args.num_prompts > 0 + if args.num_prompts == 1: + # Single inference + inputs = { + "prompt": prompt, + "multi_modal_data": { + modality: data + }, + } + + else: + # Batch inference + inputs = [{ + "prompt": prompt, + "multi_modal_data": { + modality: data + }, + } for _ in range(args.num_prompts)] + + outputs = llm.generate(inputs, sampling_params=sampling_params) + for o in outputs: + generated_text = o.outputs[0].text + print(generated_text) \ No newline at end of file diff --git a/models/multimodal/vision_language_model/pixtral/vllm/README.md b/models/multimodal/vision_language_model/pixtral/vllm/README.md new file mode 100644 index 0000000000000000000000000000000000000000..14e4ec0ead2975857d9693895f8805c40c070960 --- /dev/null +++ b/models/multimodal/vision_language_model/pixtral/vllm/README.md @@ -0,0 +1,45 @@ +# Pixtral + +## Model Description + +Pixtral is trained to understand both natural images and documents, achieving 52.5% on the MMMU reasoning benchmark, surpassing a number of larger models. The model shows strong abilities in tasks such as chart and figure understanding, document question answering, multimodal reasoning and instruction following. Pixtral is able to ingest images at their natural resolution and aspect ratio, giving the user flexibility on the number of tokens used to process an image. Pixtral is also able to process any number of images in its long context window of 128K tokens. Unlike previous open-source models, Pixtral does not compromise on text benchmark performance to excel in multimodal tasks. + +## Supported Environments + +| GPU | [IXUCA SDK](https://gitee.com/deep-spark/deepspark#%E5%A4%A9%E6%95%B0%E6%99%BA%E7%AE%97%E8%BD%AF%E4%BB%B6%E6%A0%88-ixuca) | Release | +|--------|-----------|---------| +| MR-V100 | 4.2.0 | 25.06 | + +## Model Preparation + +### Prepare Resources + +- Model: + +```bash +cp -r ../../vllm_public_assets/ ./ + +# Download model from the website and make sure the model's path is "data/Aria" +mkdir data +``` + +### Install Dependencies + +In order to run the model smoothly, you need to get the sdk from [resource center](https://support.iluvatar.com/#/ProductLine?id=2) of Iluvatar CoreX official website. + +```bash +# Install libGL +## CentOS +yum install -y mesa-libGL +## Ubuntu +apt install -y libgl1-mesa-glx +``` + +## Model Inference + +```bash +export VLLM_ASSETS_CACHE=../vllm/ +python3 offline_inference_vision_language.py --model data/Pixtral-12B-2409 --max-tokens 256 -tp 4 --trust-remote-code --temperature 0.0 --tokenizer-mode 'mistral' +``` + +## Model Results \ No newline at end of file diff --git a/models/multimodal/vision_language_model/pixtral/vllm/ci/prepare.sh b/models/multimodal/vision_language_model/pixtral/vllm/ci/prepare.sh new file mode 100644 index 0000000000000000000000000000000000000000..7232aa2996f379a961cf931968a1319fb70ac091 --- /dev/null +++ b/models/multimodal/vision_language_model/pixtral/vllm/ci/prepare.sh @@ -0,0 +1,27 @@ +#!/bin/bash +# Copyright (c) 2025, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. 
You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +set -x +ID=$(grep -oP '(?<=^ID=).+' /etc/os-release | tr -d '"') +if [[ ${ID} == "ubuntu" ]]; then + apt install -y libgl1-mesa-glx +elif [[ ${ID} == "centos" ]]; then + yum install -y mesa-libGL +else + echo "Not Support Os" +fi + +cp -r ../../vllm_public_assets/ ./ diff --git a/models/multimodal/vision_language_model/pixtral/vllm/offline_inference_vision_language.py b/models/multimodal/vision_language_model/pixtral/vllm/offline_inference_vision_language.py new file mode 100644 index 0000000000000000000000000000000000000000..f46f1c92dd18e90971dcb522d25ad09cce89d51a --- /dev/null +++ b/models/multimodal/vision_language_model/pixtral/vllm/offline_inference_vision_language.py @@ -0,0 +1,103 @@ +#!/bin/bash +# Copyright (c) 2025, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +This example shows how to use vLLM for running offline inference +with the correct prompt format on vision language models. + +For most models, the prompt format should follow corresponding examples +on HuggingFace model repository. +""" +import sys +from pathlib import Path +import io +sys.path.append(str(Path(__file__).resolve().parent.parent.parent)) +import argparse +import dataclasses +import inspect +from PIL import Image +import base64 +from vllm import LLM, EngineArgs, SamplingParams + +from utils import sampling_add_cli_args + +# Pixtral +def run_pixtral(question,engine_params): + + prompt = prompt = f"{question}" + # Note: The default setting of max_num_seqs (256) and + # max_model_len (128k) for this model may cause OOM. + # You may lower either to run this example on lower-end GPUs. + + # In this example, we override max_num_seqs to 5 while + # keeping the original context length of 128k. + llm = LLM(**engine_params) + stop_token_ids = None + return llm, prompt, stop_token_ids + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser = EngineArgs.add_cli_args(parser) + parser = sampling_add_cli_args(parser) + args = parser.parse_args() + engine_args = [attr.name for attr in dataclasses.fields(EngineArgs)] + sampling_args = [ + param.name + for param in list( + inspect.signature(SamplingParams).parameters.values() + ) + ] + engine_params = {attr: getattr(args, attr) for attr in engine_args} + sampling_params = { + attr: getattr(args, attr) for attr in sampling_args if args.__contains__(attr) + } + + prompt = "Describe this image in one sentence." 
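+    # Unlike the other examples in this directory, Pixtral is driven through
+    # llm.chat(): the image is embedded in the chat message as a base64 data
+    # URL rather than being passed via multi_modal_data.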
+ + llm, prompt, stop_token_ids = run_pixtral(prompt,engine_params) + sampling_params['stop_token_ids'] = stop_token_ids + + # We set temperature to 0.2 so that outputs can be different + # even when all prompts are identical when running batch inference. + sampling_params = SamplingParams(**sampling_params) + + image: Image = Image.open("./vllm_public_assets/cherry_blossom.jpg") + image = image.convert("RGB") + image_data = io.BytesIO() + image.save(image_data, format='JPEG') + image_base64 = image_base64 = base64.b64encode(image_data.getvalue()).decode("utf-8") + + messages = [ + # {"role": "system", "content": SYSTEM_PROMPT}, + { + "role": "user", + "content": [ + { + "type": "text", + "text": prompt + }, + {"type": "image_url", "image_url": { + "url": f"data:image/jpeg;base64,{image_base64}" + }, + } + ], + }, + ] + + outputs = llm.chat(messages, sampling_params=sampling_params) + + print(outputs[0].outputs[0].text) \ No newline at end of file diff --git a/models/multimodal/vision_language_model/utils.py b/models/multimodal/vision_language_model/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..edc7691b64cfff5d3eb56b3f65737f164a1c1696 --- /dev/null +++ b/models/multimodal/vision_language_model/utils.py @@ -0,0 +1,226 @@ +#!/bin/bash +# Copyright (c) 2025, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse +import codecs +import logging + +""" +The following arguments can not be add in args... +early_stopping: Union[bool, str] = False, +early_stopping: Controls the stopping condition for beam search. It + accepts the following values: `True`, where the generation stops as + soon as there are `best_of` complete candidates; `False`, where an + heuristic is applied and the generation stops when is it very + unlikely to find better candidates; `"never"`, where the beam search + procedure only stops when there cannot be better candidates + (canonical beam search algorithm). +stop: Optional[Union[str, List[str]]] = None, +stop_token_ids: Optional[List[int]] = None, +logits_processors: Optional[List[LogitsProcessor]] = None, +logits_processors: List of functions that modify logits based on + previously generated tokens, and optionally prompt tokens as + a first argument. +truncate_prompt_tokens: Optional[Annotated[int, Field(ge=1)]] = None, +truncate_prompt_tokens: If set to an integer k, will use only the last k + tokens from the prompt (i.e., left truncation). Defaults to None + (i.e., no truncation). + """ + + +def sampling_add_cli_args(args: argparse.ArgumentParser) -> argparse.ArgumentParser: + args.add_argument( + "--n", + type=int, + default=1, + help="Number of output sequences to return for the given prompt.", + ) + args.add_argument( + "--best-of", + type=int, + default=None, + help="Number of output sequences that are generated from the prompt. " + "From these `best_of` sequences, the top `n` sequences are returned. " + "`best_of` must be greater than or equal to `n`. 
This is treated as " + "the beam width when `use_beam_search` is True. By default, `best_of`" + "is set to `n`.", + ) + args.add_argument( + "--presence-penalty", + type=float, + default=0.0, + help="Float that penalizes new tokens based on whether they " + "appear in the generated text so far. Values > 0 encourage the model " + "to use new tokens, while values < 0 encourage the model to repeat " + "tokens.", + ) + args.add_argument( + "--frequency-penalty", + type=float, + default=0.0, + help="Float that penalizes new tokens based on their " + " frequency in the generated text so far. Values > 0 encourage the " + " model to use new tokens, while values < 0 encourage the model to " + "repeat tokens.", + ) + args.add_argument( + "--repetition-penalty", + type=float, + default=1.0, + help="Float that penalizes new tokens based on whether " + "they appear in the prompt and the generated text so far. Values > 1 " + "encourage the model to use new tokens, while values < 1 encourage " + "the model to repeat tokens.", + ) + args.add_argument( + "--temperature", + type=float, + default=1.0, + help="Float that controls the randomness of the sampling. Lower " + "values make the model more deterministic, while higher values make " + "the model more random. Zero means greedy sampling.", + ) + args.add_argument( + "--top-p", + type=float, + default=1.0, + help="Float that controls the cumulative probability of the top tokens " + "to consider. Must be in (0, 1]. Set to 1 to consider all tokens.", + ) + args.add_argument( + "--top-k", + type=int, + default=-1, + help="Integer that controls the number of top tokens to consider. Set " + "to -1 to consider all tokens.", + ) + args.add_argument( + "--min-p", + type=float, + default=0.0, + help="Float that represents the minimum probability for a token to be " + "considered, relative to the probability of the most likely token. " + "Must be in [0, 1]. Set to 0 to disable this.", + ) + args.add_argument( + "--use-beam-search", + default=False, + action="store_true", + help="Whether to use beam search instead of sampling.", + ) + args.add_argument( + "--length-penalty", + type=float, + default=1.0, + help="Float that penalizes sequences based on their length. Used in beam search.", + ) + args.add_argument( + "--stop", + type=str, + default=None, + help="List of strings that stop the generation when they are generated. " + "The returned output will not contain the stop strings.", + ) + args.add_argument( + "--stop-token-ids", + type=int, + default=None, + help="List of tokens that stop the generation when they are " + "generated. The returned output will contain the stop tokens unless " + "the stop tokens are special tokens.", + ) + args.add_argument( + "--include-stop-str-in-output", + default=False, + action="store_true", + help="Whether to include the stop strings in output text. Defaults to False.", + ) + args.add_argument( + "--ignore-eos", + default=False, + action="store_true", + help="Whether to ignore the EOS token and continue generating tokens after the EOS token is generated.", + ) + args.add_argument( + "--max-tokens", + type=int, + default=16, + help="Maximum number of tokens to generate per output sequence.", + ) + args.add_argument( + "--min-tokens", + type=int, + default=0, + help="Minimum number of tokens to generate per output sequence " + "before EOS or stop_token_ids can be generated", + ) + args.add_argument( + "--logprobs", + type=int, + default=None, + help="NNumber of log probabilities to return per output token. 
" + "Note that the implementation follows the OpenAI API: The return " + "result includes the log probabilities on the `logprobs` most likely " + "tokens, as well the chosen tokens. The API will always return the " + "log probability of the sampled token, so there may be up to " + "`logprobs+1` elements in the response.", + ) + args.add_argument( + "--prompt-logprobs", + type=int, + default=None, + help="Number of log probabilities to return per prompt token.", + ) + args.add_argument( + "--detokenize", + type=bool, + default=True, + help="Whether to detokenize the output. Defaults to True.", + ) + args.add_argument( + "--skip-special-tokens", + default=True, + action="store_false", + help="Whether to skip special tokens in the output.", + ) + args.add_argument( + "--spaces-between-special-tokens", + default=True, + action="store_false", + help="Whether to add spaces between special tokens in the output. Defaults to True.", + ) + return args + + +def load_chat_template(tokenizer, chat_template): + if chat_template is not None: + try: + with open(chat_template, "r") as f: + tokenizer.chat_template = f.read() + except OSError: + # If opening a file fails, set chat template to be args to + # ensure we decode so our escape are interpreted correctly + tokenizer.chat_template = codecs.decode(chat_template, "unicode_escape") + + logging.info(f"Using supplied chat template:\n{tokenizer.chat_template}") + elif tokenizer.chat_template is not None: + logging.info( + f"Using default chat template:\n{tokenizer.chat_template}. This May lead to unsatisfactory results. You can provide a template.jinja file for vllm." + ) + else: + logging.warning( + "No chat template provided. Chat API will not work. This May lead to unsatisfactory results. You can provide a template.jinja file for vllm." 
+ ) diff --git a/models/multimodal/vision_language_model/vllm_public_assets/cherry_blossom.jpg b/models/multimodal/vision_language_model/vllm_public_assets/cherry_blossom.jpg new file mode 100644 index 0000000000000000000000000000000000000000..63173db0da7687d7841fe4d85239d8e277d81259 Binary files /dev/null and b/models/multimodal/vision_language_model/vllm_public_assets/cherry_blossom.jpg differ diff --git a/tests/run_vllm.py b/tests/run_vllm.py index f927877525261d79727c81fcf6119ccf9503c3db..96151bd4995bb829e127ba4158f09772f1422b55 100644 --- a/tests/run_vllm.py +++ b/tests/run_vllm.py @@ -93,7 +93,6 @@ def run_nlp_testcase(model): } d_url = model["download_url"] checkpoint_n = d_url.split("/")[-1] - dataset_n = model["datasets"].split("/")[-1] prepare_script = f""" set -x cd ../{model['model_path']} @@ -164,12 +163,6 @@ def run_nlp_testcase(model): cd ../{model['model_path']} python3 offline_inference.py --model ./qwen1.5-7b --max-tokens 256 -tp 1 --temperature 0.0 --max-model-len 3096 """ - elif model_name == "qwen1.5-7b": - script = f""" - set -x - cd ../{model['model_path']} - python3 offline_inference.py --model ./qwen1.5-7b --max-tokens 256 -tp 1 --temperature 0.0 --max-model-len 3096 - """ elif model_name == "qwen1.5-14b": script = f""" set -x @@ -226,6 +219,42 @@ def run_nlp_testcase(model): cd ../{model['model_path']} python3 offline_inference.py --model ./{model_name} --max-tokens 256 -tp 2 --temperature 0.0 --max-model-len 3096 """ + elif model_name == "aria": + script = f""" + set -x + cd ../{model['model_path']} + export VLLM_ASSETS_CACHE=../vllm/ + python3 offline_inference_vision_language.py --model ./{model_name} --max-tokens 256 -tp 4 --trust-remote-code --temperature 0.0 --dtype bfloat16 --tokenizer-mode slow + """ + elif model_name == "h2vol" or model_name == "idefics3": + script = f""" + set -x + cd ../{model['model_path']} + export VLLM_ASSETS_CACHE=../vllm/ + python3 offline_inference_vision_language.py --model ./{model_name} --max-tokens 256 -tp 4 --trust-remote-code --temperature 0.0 --disable-mm-preprocessor-cache + """ + elif model_name == "minicpm_v": + script = f""" + set -x + cd ../{model['model_path']} + export VLLM_ASSETS_CACHE=../vllm/ + PT_SDPA_ENABLE_HEAD_DIM_PADDING=1 python3 offline_inference_vision_language.py --model ./{model_name} --max-tokens 256 -tp 2 --trust-remote-code --temperature 0.0 + """ + elif model_name == "mllama": + script = f""" + set -x + cd ../{model['model_path']} + export VLLM_ASSETS_CACHE=../vllm/ + export VLLM_FORCE_NCCL_COMM=1 + python3 offline_inference_vision_language.py --model ./{model_name} --max-tokens 256 -tp 2 --trust-remote-code --temperature 0.0 --max-model-len 8192 --max-num-seqs 16 + """ + elif model_name == "pixtral": + script = f""" + set -x + cd ../{model['model_path']} + export VLLM_ASSETS_CACHE=../vllm/ + python3 offline_inference_vision_language.py --model ./{model_name} --max-tokens 256 -tp 4 --trust-remote-code --temperature 0.0 --tokenizer-mode 'mistral' + """ r, t = run_script(script) sout = r.stdout