diff --git a/models/vision-language-understanding/chameleon-7b/vllm/README.md b/models/vision-language-understanding/chameleon-7b/vllm/README.md
new file mode 100755
index 0000000000000000000000000000000000000000..bebd7c799c66db9bc8ae7fe093ea945d34bbdfca
--- /dev/null
+++ b/models/vision-language-understanding/chameleon-7b/vllm/README.md
@@ -0,0 +1,35 @@
+# Chameleon
+
+## Description
+
+Chameleon is a family of early-fusion, token-based mixed-modal models from Meta AI. Images and text are mapped into a single discrete token space and processed by one transformer, which lets the model reason over arbitrarily interleaved image and text inputs. Chameleon-7B is the 7-billion-parameter variant; in this example it is served with vLLM for image understanding (an image plus a text prompt in, text out).
+
+## Setup
+
+### Install
+
+To run the model smoothly, you need to get the SDK from the [Resource Center](https://support.iluvatar.com/#/ProductLine?id=2) of the Iluvatar CoreX official website.
+
+```bash
+# Install libGL
+## CentOS
+yum install -y mesa-libGL
+## Ubuntu
+apt install -y libgl1-mesa-dev
+```
+
+### Download
+
+- Model:
+
+```bash
+# Download the model from the website and make sure the model's path is "data/chameleon-7b"
+mkdir data
+```
+
+## Inference
+
+```bash
+export VLLM_ASSETS_CACHE=../vllm/
+python3 offline_inference_vision_language.py --model ./data/chameleon-7b --max-tokens 256 -tp 2 --trust-remote-code --temperature 0.0
+```
\ No newline at end of file
diff --git a/models/vision-language-understanding/chameleon-7b/vllm/offline_inference_vision_language.py b/models/vision-language-understanding/chameleon-7b/vllm/offline_inference_vision_language.py
new file mode 100755
index 0000000000000000000000000000000000000000..8df835b38b44e5ef4e98825f79c7eca33bcf2ae7
--- /dev/null
+++ b/models/vision-language-understanding/chameleon-7b/vllm/offline_inference_vision_language.py
@@ -0,0 +1,143 @@
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+"""
+This example shows how to use vLLM for running offline inference
+with the correct prompt format on vision language models.
+
+For most models, the prompt format should follow corresponding examples
+on the HuggingFace model repository.
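+
+For the model in this directory, a typical invocation (mirroring the README,
+with the checkpoint downloaded to ./data/chameleon-7b) is:
+
+    python3 offline_inference_vision_language.py --model ./data/chameleon-7b \
+        --max-tokens 256 -tp 2 --trust-remote-code --temperature 0.0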
+""" +import sys +from pathlib import Path +import os +sys.path.append(str(Path(__file__).resolve().parent.parent.parent)) +import argparse +import dataclasses +import inspect +from vllm.assets.image import ImageAsset +from vllm.assets.video import VideoAsset +from vllm import LLM, EngineArgs, SamplingParams + +from utils import sampling_add_cli_args + +# Chameleon +def run_chameleon(question,engine_params,modality): + + assert modality == "image" + + prompt = f"{question}" + llm = LLM(**engine_params) + stop_token_ids = None + return llm, prompt, stop_token_ids + + +def get_multi_modal_input(args): + """ + return { + "data": image or video, + "question": question, + } + """ + if args.modality == "image": + # Input image and question + image = ImageAsset("cherry_blossom").pil_image.convert("RGB") + img_question = "What is the content of this image?" + + return { + "data": image, + "question": img_question, + } + + if args.modality == "video": + # Input video and question + video = VideoAsset(name="sample_demo_1.mp4", + num_frames=args.num_frames).np_ndarrays + vid_question = "Why is this video funny?" + + return { + "data": video, + "question": vid_question, + } + + msg = f"Modality {args.modality} is not supported." + raise ValueError(msg) + + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument('--num-prompts', + type=int, + default=1, + help='Number of prompts to run.') + parser.add_argument('--modality', + type=str, + default="image", + help='Modality of the input.') + parser.add_argument('--num-frames', + type=int, + default=16, + help='Number of frames to extract from the video.') + parser = EngineArgs.add_cli_args(parser) + parser = sampling_add_cli_args(parser) + args = parser.parse_args() + engine_args = [attr.name for attr in dataclasses.fields(EngineArgs)] + sampling_args = [ + param.name + for param in list( + inspect.signature(SamplingParams).parameters.values() + ) + ] + engine_params = {attr: getattr(args, attr) for attr in engine_args} + sampling_params = { + attr: getattr(args, attr) for attr in sampling_args if args.__contains__(attr) + } + + modality = args.modality + mm_input = get_multi_modal_input(args) + data = mm_input["data"] + question = mm_input["question"] + + llm, prompt, stop_token_ids = run_chameleon(question,engine_params,modality) + sampling_params['stop_token_ids'] = stop_token_ids + + # We set temperature to 0.2 so that outputs can be different + # even when all prompts are identical when running batch inference. + sampling_params = SamplingParams(**sampling_params) + + assert args.num_prompts > 0 + if args.num_prompts == 1: + # Single inference + inputs = { + "prompt": prompt, + "multi_modal_data": { + modality: data + }, + } + + else: + # Batch inference + inputs = [{ + "prompt": prompt, + "multi_modal_data": { + modality: data + }, + } for _ in range(args.num_prompts)] + + outputs = llm.generate(inputs, sampling_params=sampling_params) + + for o in outputs: + generated_text = o.outputs[0].text + print(generated_text) \ No newline at end of file diff --git a/models/vision-language-understanding/chameleon-7b/vllm/utils.py b/models/vision-language-understanding/chameleon-7b/vllm/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..48445ed97d08a8388a90d20e026609b5c1e88a99 --- /dev/null +++ b/models/vision-language-understanding/chameleon-7b/vllm/utils.py @@ -0,0 +1,224 @@ +# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. +import argparse +import codecs +import logging + +""" +The following arguments can not be add in args... +early_stopping: Union[bool, str] = False, +early_stopping: Controls the stopping condition for beam search. It + accepts the following values: `True`, where the generation stops as + soon as there are `best_of` complete candidates; `False`, where an + heuristic is applied and the generation stops when is it very + unlikely to find better candidates; `"never"`, where the beam search + procedure only stops when there cannot be better candidates + (canonical beam search algorithm). +stop: Optional[Union[str, List[str]]] = None, +stop_token_ids: Optional[List[int]] = None, +logits_processors: Optional[List[LogitsProcessor]] = None, +logits_processors: List of functions that modify logits based on + previously generated tokens, and optionally prompt tokens as + a first argument. +truncate_prompt_tokens: Optional[Annotated[int, Field(ge=1)]] = None, +truncate_prompt_tokens: If set to an integer k, will use only the last k + tokens from the prompt (i.e., left truncation). Defaults to None + (i.e., no truncation). + """ + + +def sampling_add_cli_args(args: argparse.ArgumentParser) -> argparse.ArgumentParser: + args.add_argument( + "--n", + type=int, + default=1, + help="Number of output sequences to return for the given prompt.", + ) + args.add_argument( + "--best-of", + type=int, + default=None, + help="Number of output sequences that are generated from the prompt. " + "From these `best_of` sequences, the top `n` sequences are returned. " + "`best_of` must be greater than or equal to `n`. This is treated as " + "the beam width when `use_beam_search` is True. By default, `best_of`" + "is set to `n`.", + ) + args.add_argument( + "--presence-penalty", + type=float, + default=0.0, + help="Float that penalizes new tokens based on whether they " + "appear in the generated text so far. Values > 0 encourage the model " + "to use new tokens, while values < 0 encourage the model to repeat " + "tokens.", + ) + args.add_argument( + "--frequency-penalty", + type=float, + default=0.0, + help="Float that penalizes new tokens based on their " + " frequency in the generated text so far. Values > 0 encourage the " + " model to use new tokens, while values < 0 encourage the model to " + "repeat tokens.", + ) + args.add_argument( + "--repetition-penalty", + type=float, + default=1.0, + help="Float that penalizes new tokens based on whether " + "they appear in the prompt and the generated text so far. Values > 1 " + "encourage the model to use new tokens, while values < 1 encourage " + "the model to repeat tokens.", + ) + args.add_argument( + "--temperature", + type=float, + default=1.0, + help="Float that controls the randomness of the sampling. Lower " + "values make the model more deterministic, while higher values make " + "the model more random. 
Zero means greedy sampling.", + ) + args.add_argument( + "--top-p", + type=float, + default=1.0, + help="Float that controls the cumulative probability of the top tokens " + "to consider. Must be in (0, 1]. Set to 1 to consider all tokens.", + ) + args.add_argument( + "--top-k", + type=int, + default=-1, + help="Integer that controls the number of top tokens to consider. Set " + "to -1 to consider all tokens.", + ) + args.add_argument( + "--min-p", + type=float, + default=0.0, + help="Float that represents the minimum probability for a token to be " + "considered, relative to the probability of the most likely token. " + "Must be in [0, 1]. Set to 0 to disable this.", + ) + args.add_argument( + "--use-beam-search", + default=False, + action="store_true", + help="Whether to use beam search instead of sampling.", + ) + args.add_argument( + "--length-penalty", + type=float, + default=1.0, + help="Float that penalizes sequences based on their length. Used in beam search.", + ) + args.add_argument( + "--stop", + type=str, + default=None, + help="List of strings that stop the generation when they are generated. " + "The returned output will not contain the stop strings.", + ) + args.add_argument( + "--stop-token-ids", + type=int, + default=None, + help="List of tokens that stop the generation when they are " + "generated. The returned output will contain the stop tokens unless " + "the stop tokens are special tokens.", + ) + args.add_argument( + "--include-stop-str-in-output", + default=False, + action="store_true", + help="Whether to include the stop strings in output text. Defaults to False.", + ) + args.add_argument( + "--ignore-eos", + default=False, + action="store_true", + help="Whether to ignore the EOS token and continue generating tokens after the EOS token is generated.", + ) + args.add_argument( + "--max-tokens", + type=int, + default=16, + help="Maximum number of tokens to generate per output sequence.", + ) + args.add_argument( + "--min-tokens", + type=int, + default=0, + help="Minimum number of tokens to generate per output sequence " + "before EOS or stop_token_ids can be generated", + ) + args.add_argument( + "--logprobs", + type=int, + default=None, + help="NNumber of log probabilities to return per output token. " + "Note that the implementation follows the OpenAI API: The return " + "result includes the log probabilities on the `logprobs` most likely " + "tokens, as well the chosen tokens. The API will always return the " + "log probability of the sampled token, so there may be up to " + "`logprobs+1` elements in the response.", + ) + args.add_argument( + "--prompt-logprobs", + type=int, + default=None, + help="Number of log probabilities to return per prompt token.", + ) + args.add_argument( + "--detokenize", + type=bool, + default=True, + help="Whether to detokenize the output. Defaults to True.", + ) + args.add_argument( + "--skip-special-tokens", + default=True, + action="store_false", + help="Whether to skip special tokens in the output.", + ) + args.add_argument( + "--spaces-between-special-tokens", + default=True, + action="store_false", + help="Whether to add spaces between special tokens in the output. 
Defaults to True.",
+    )
+    return args
+
+
+def load_chat_template(tokenizer, chat_template):
+    if chat_template is not None:
+        try:
+            with open(chat_template, "r") as f:
+                tokenizer.chat_template = f.read()
+        except OSError:
+            # If opening the file fails, treat chat_template as the template
+            # string itself and decode it so escape sequences are interpreted
+            # correctly.
+            tokenizer.chat_template = codecs.decode(chat_template, "unicode_escape")
+
+        logging.info(f"Using supplied chat template:\n{tokenizer.chat_template}")
+    elif tokenizer.chat_template is not None:
+        logging.info(
+            f"Using default chat template:\n{tokenizer.chat_template}. This may lead to unsatisfactory results. You can provide a template.jinja file for vLLM."
+        )
+    else:
+        logging.warning(
+            "No chat template provided. Chat API will not work. This may lead to unsatisfactory results. You can provide a template.jinja file for vLLM."
+        )
\ No newline at end of file
diff --git a/models/vision-language-understanding/chameleon-7b/vllm/vllm_public_assets/cherry_blossom.jpg b/models/vision-language-understanding/chameleon-7b/vllm/vllm_public_assets/cherry_blossom.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..63173db0da7687d7841fe4d85239d8e277d81259
Binary files /dev/null and b/models/vision-language-understanding/chameleon-7b/vllm/vllm_public_assets/cherry_blossom.jpg differ
diff --git a/models/vision-language-understanding/fuyu-8b/vllm/README.md b/models/vision-language-understanding/fuyu-8b/vllm/README.md
new file mode 100755
index 0000000000000000000000000000000000000000..7bc5d2cc04268a63d6dd1bebb4e040f55f80be4d
--- /dev/null
+++ b/models/vision-language-understanding/fuyu-8b/vllm/README.md
@@ -0,0 +1,37 @@
+# FuyuForCausalLM
+
+## Description
+
+Fuyu-8B is a multi-modal text and image transformer trained by Adept AI.
+
+Architecturally, Fuyu is a vanilla decoder-only transformer: there is no image encoder. Image patches are instead linearly projected into the first layer of the transformer, bypassing the embedding lookup. We simply treat the transformer decoder like an image transformer (albeit with no pooling and causal attention).
+
+## Setup
+
+### Install
+
+To run the model smoothly, you need to get the SDK from the [Resource Center](https://support.iluvatar.com/#/ProductLine?id=2) of the Iluvatar CoreX official website.
+
+```bash
+# Install libGL
+## CentOS
+yum install -y mesa-libGL
+## Ubuntu
+apt install -y libgl1-mesa-dev
+```
+
+### Download
+
+- Model:
+
+```bash
+# Download the model from the website and make sure the model's path is "data/fuyu-8b"
+mkdir data
+```
+
+## Inference
+
+```bash
+export VLLM_ASSETS_CACHE=../vllm/
+python3 offline_inference_vision_language.py --model ./data/fuyu-8b --max-tokens 256 -tp 2 --trust-remote-code --temperature 0.0
+```
\ No newline at end of file
diff --git a/models/vision-language-understanding/fuyu-8b/vllm/offline_inference_vision_language.py b/models/vision-language-understanding/fuyu-8b/vllm/offline_inference_vision_language.py
new file mode 100755
index 0000000000000000000000000000000000000000..21d6c6a87cf1713b96b3106b180154f2e66bf52e
--- /dev/null
+++ b/models/vision-language-understanding/fuyu-8b/vllm/offline_inference_vision_language.py
@@ -0,0 +1,141 @@
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License.
You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. +""" +This example shows how to use vLLM for running offline inference +with the correct prompt format on vision language models. + +For most models, the prompt format should follow corresponding examples +on HuggingFace model repository. +""" +import sys +from pathlib import Path +import os +sys.path.append(str(Path(__file__).resolve().parent.parent.parent)) +import argparse +import dataclasses +import inspect +from vllm.assets.image import ImageAsset +from vllm.assets.video import VideoAsset + +from vllm import LLM, EngineArgs, SamplingParams +from utils import sampling_add_cli_args + +# Fuyu +def run_fuyu(question,engine_params,modality): + assert modality == "image" + prompt = f"{question}\n" + llm = LLM(**engine_params) + stop_token_ids = None + return llm, prompt, stop_token_ids + + +def get_multi_modal_input(args): + """ + return { + "data": image or video, + "question": question, + } + """ + if args.modality == "image": + # Input image and question + image = ImageAsset("cherry_blossom").pil_image.convert("RGB") + img_question = "What is the content of this image?" + + return { + "data": image, + "question": img_question, + } + + if args.modality == "video": + # Input video and question + video = VideoAsset(name="sample_demo_1.mp4", + num_frames=args.num_frames).np_ndarrays + vid_question = "Why is this video funny?" + + return { + "data": video, + "question": vid_question, + } + + msg = f"Modality {args.modality} is not supported." + raise ValueError(msg) + + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument('--num-prompts', + type=int, + default=1, + help='Number of prompts to run.') + parser.add_argument('--modality', + type=str, + default="image", + help='Modality of the input.') + parser.add_argument('--num-frames', + type=int, + default=16, + help='Number of frames to extract from the video.') + parser = EngineArgs.add_cli_args(parser) + parser = sampling_add_cli_args(parser) + args = parser.parse_args() + engine_args = [attr.name for attr in dataclasses.fields(EngineArgs)] + sampling_args = [ + param.name + for param in list( + inspect.signature(SamplingParams).parameters.values() + ) + ] + engine_params = {attr: getattr(args, attr) for attr in engine_args} + sampling_params = { + attr: getattr(args, attr) for attr in sampling_args if args.__contains__(attr) + } + + modality = args.modality + mm_input = get_multi_modal_input(args) + data = mm_input["data"] + question = mm_input["question"] + + llm, prompt, stop_token_ids = run_fuyu(question,engine_params, args.modality) + sampling_params['stop_token_ids'] = stop_token_ids + + # We set temperature to 0.2 so that outputs can be different + # even when all prompts are identical when running batch inference. 
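+    # In this script the sampling values come from the CLI flags gathered
+    # above (e.g. --temperature, --top-p, --max-tokens), and stop_token_ids
+    # is None for Fuyu, so generation ends at EOS or at the --max-tokens limit.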
+ sampling_params = SamplingParams(**sampling_params) + + assert args.num_prompts > 0 + if args.num_prompts == 1: + # Single inference + inputs = { + "prompt": prompt, + "multi_modal_data": { + modality: data + }, + } + + else: + # Batch inference + inputs = [{ + "prompt": prompt, + "multi_modal_data": { + modality: data + }, + } for _ in range(args.num_prompts)] + + outputs = llm.generate(inputs, sampling_params=sampling_params) + + for o in outputs: + generated_text = o.outputs[0].text + print(generated_text) \ No newline at end of file diff --git a/models/vision-language-understanding/fuyu-8b/vllm/utils.py b/models/vision-language-understanding/fuyu-8b/vllm/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..48445ed97d08a8388a90d20e026609b5c1e88a99 --- /dev/null +++ b/models/vision-language-understanding/fuyu-8b/vllm/utils.py @@ -0,0 +1,224 @@ +# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. +import argparse +import codecs +import logging + +""" +The following arguments can not be add in args... +early_stopping: Union[bool, str] = False, +early_stopping: Controls the stopping condition for beam search. It + accepts the following values: `True`, where the generation stops as + soon as there are `best_of` complete candidates; `False`, where an + heuristic is applied and the generation stops when is it very + unlikely to find better candidates; `"never"`, where the beam search + procedure only stops when there cannot be better candidates + (canonical beam search algorithm). +stop: Optional[Union[str, List[str]]] = None, +stop_token_ids: Optional[List[int]] = None, +logits_processors: Optional[List[LogitsProcessor]] = None, +logits_processors: List of functions that modify logits based on + previously generated tokens, and optionally prompt tokens as + a first argument. +truncate_prompt_tokens: Optional[Annotated[int, Field(ge=1)]] = None, +truncate_prompt_tokens: If set to an integer k, will use only the last k + tokens from the prompt (i.e., left truncation). Defaults to None + (i.e., no truncation). + """ + + +def sampling_add_cli_args(args: argparse.ArgumentParser) -> argparse.ArgumentParser: + args.add_argument( + "--n", + type=int, + default=1, + help="Number of output sequences to return for the given prompt.", + ) + args.add_argument( + "--best-of", + type=int, + default=None, + help="Number of output sequences that are generated from the prompt. " + "From these `best_of` sequences, the top `n` sequences are returned. " + "`best_of` must be greater than or equal to `n`. This is treated as " + "the beam width when `use_beam_search` is True. By default, `best_of`" + "is set to `n`.", + ) + args.add_argument( + "--presence-penalty", + type=float, + default=0.0, + help="Float that penalizes new tokens based on whether they " + "appear in the generated text so far. 
Values > 0 encourage the model " + "to use new tokens, while values < 0 encourage the model to repeat " + "tokens.", + ) + args.add_argument( + "--frequency-penalty", + type=float, + default=0.0, + help="Float that penalizes new tokens based on their " + " frequency in the generated text so far. Values > 0 encourage the " + " model to use new tokens, while values < 0 encourage the model to " + "repeat tokens.", + ) + args.add_argument( + "--repetition-penalty", + type=float, + default=1.0, + help="Float that penalizes new tokens based on whether " + "they appear in the prompt and the generated text so far. Values > 1 " + "encourage the model to use new tokens, while values < 1 encourage " + "the model to repeat tokens.", + ) + args.add_argument( + "--temperature", + type=float, + default=1.0, + help="Float that controls the randomness of the sampling. Lower " + "values make the model more deterministic, while higher values make " + "the model more random. Zero means greedy sampling.", + ) + args.add_argument( + "--top-p", + type=float, + default=1.0, + help="Float that controls the cumulative probability of the top tokens " + "to consider. Must be in (0, 1]. Set to 1 to consider all tokens.", + ) + args.add_argument( + "--top-k", + type=int, + default=-1, + help="Integer that controls the number of top tokens to consider. Set " + "to -1 to consider all tokens.", + ) + args.add_argument( + "--min-p", + type=float, + default=0.0, + help="Float that represents the minimum probability for a token to be " + "considered, relative to the probability of the most likely token. " + "Must be in [0, 1]. Set to 0 to disable this.", + ) + args.add_argument( + "--use-beam-search", + default=False, + action="store_true", + help="Whether to use beam search instead of sampling.", + ) + args.add_argument( + "--length-penalty", + type=float, + default=1.0, + help="Float that penalizes sequences based on their length. Used in beam search.", + ) + args.add_argument( + "--stop", + type=str, + default=None, + help="List of strings that stop the generation when they are generated. " + "The returned output will not contain the stop strings.", + ) + args.add_argument( + "--stop-token-ids", + type=int, + default=None, + help="List of tokens that stop the generation when they are " + "generated. The returned output will contain the stop tokens unless " + "the stop tokens are special tokens.", + ) + args.add_argument( + "--include-stop-str-in-output", + default=False, + action="store_true", + help="Whether to include the stop strings in output text. Defaults to False.", + ) + args.add_argument( + "--ignore-eos", + default=False, + action="store_true", + help="Whether to ignore the EOS token and continue generating tokens after the EOS token is generated.", + ) + args.add_argument( + "--max-tokens", + type=int, + default=16, + help="Maximum number of tokens to generate per output sequence.", + ) + args.add_argument( + "--min-tokens", + type=int, + default=0, + help="Minimum number of tokens to generate per output sequence " + "before EOS or stop_token_ids can be generated", + ) + args.add_argument( + "--logprobs", + type=int, + default=None, + help="NNumber of log probabilities to return per output token. " + "Note that the implementation follows the OpenAI API: The return " + "result includes the log probabilities on the `logprobs` most likely " + "tokens, as well the chosen tokens. 
The API will always return the "
+        "log probability of the sampled token, so there may be up to "
+        "`logprobs+1` elements in the response.",
+    )
+    args.add_argument(
+        "--prompt-logprobs",
+        type=int,
+        default=None,
+        help="Number of log probabilities to return per prompt token.",
+    )
+    args.add_argument(
+        "--detokenize",
+        type=bool,
+        default=True,
+        help="Whether to detokenize the output. Defaults to True.",
+    )
+    args.add_argument(
+        "--skip-special-tokens",
+        default=True,
+        action="store_false",
+        help="Whether to skip special tokens in the output.",
+    )
+    args.add_argument(
+        "--spaces-between-special-tokens",
+        default=True,
+        action="store_false",
+        help="Whether to add spaces between special tokens in the output. Defaults to True.",
+    )
+    return args
+
+
+def load_chat_template(tokenizer, chat_template):
+    if chat_template is not None:
+        try:
+            with open(chat_template, "r") as f:
+                tokenizer.chat_template = f.read()
+        except OSError:
+            # If opening the file fails, treat chat_template as the template
+            # string itself and decode it so escape sequences are interpreted
+            # correctly.
+            tokenizer.chat_template = codecs.decode(chat_template, "unicode_escape")
+
+        logging.info(f"Using supplied chat template:\n{tokenizer.chat_template}")
+    elif tokenizer.chat_template is not None:
+        logging.info(
+            f"Using default chat template:\n{tokenizer.chat_template}. This may lead to unsatisfactory results. You can provide a template.jinja file for vLLM."
+        )
+    else:
+        logging.warning(
+            "No chat template provided. Chat API will not work. This may lead to unsatisfactory results. You can provide a template.jinja file for vLLM."
+        )
\ No newline at end of file
diff --git a/models/vision-language-understanding/fuyu-8b/vllm/vllm_public_assets/cherry_blossom.jpg b/models/vision-language-understanding/fuyu-8b/vllm/vllm_public_assets/cherry_blossom.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..63173db0da7687d7841fe4d85239d8e277d81259
Binary files /dev/null and b/models/vision-language-understanding/fuyu-8b/vllm/vllm_public_assets/cherry_blossom.jpg differ
diff --git a/models/vision-language-understanding/llava_next_video-7b/vllm/README.md b/models/vision-language-understanding/llava_next_video-7b/vllm/README.md
new file mode 100755
index 0000000000000000000000000000000000000000..a50af3a220158968f5de39e48aed3bf61362e011
--- /dev/null
+++ b/models/vision-language-understanding/llava_next_video-7b/vllm/README.md
@@ -0,0 +1,35 @@
+# LLaVA-Next-Video
+
+## Description
+
+LLaVA-Next-Video is an open-source chatbot trained by fine-tuning an LLM on multimodal instruction-following data. The model is built on top of LLaVA-NeXT by tuning on a mix of video and image data to achieve better video understanding capabilities. The videos were sampled uniformly to be 32 frames per clip. The model is a current SOTA among open-source models on the VideoMME benchmark. Base LLM: lmsys/vicuna-7b-v1.5
+
+## Setup
+
+### Install
+
+To run the model smoothly, you need to get the SDK from the [Resource Center](https://support.iluvatar.com/#/ProductLine?id=2) of the Iluvatar CoreX official website.
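+
+If the SDK ships its own vLLM build, you can optionally check that it is importable before moving on (a quick sanity check, not a required step):
+
+```bash
+python3 -c "import vllm; print(vllm.__version__)"
+```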
+
+```bash
+# Install libGL
+## CentOS
+yum install -y mesa-libGL
+## Ubuntu
+apt install -y libgl1-mesa-dev
+```
+
+### Download
+
+- Model:
+
+```bash
+# Download the model from the website and make sure the model's path is "data/LLaVA-NeXT-Video-7B-hf"
+mkdir data
+```
+
+## Inference
+
+```bash
+export VLLM_ASSETS_CACHE=../vllm/
+python3 offline_inference_vision_language.py --model ./data/LLaVA-NeXT-Video-7B-hf --max-tokens 256 -tp 4 --trust-remote-code --temperature 0.0 --model-type llava-next-video --modality video --dtype bfloat16
+```
\ No newline at end of file
diff --git a/models/vision-language-understanding/llava_next_video-7b/vllm/offline_inference_vision_language.py b/models/vision-language-understanding/llava_next_video-7b/vllm/offline_inference_vision_language.py
new file mode 100755
index 0000000000000000000000000000000000000000..510a67e3a48cac72c8ac5adff06c0b3019fc5247
--- /dev/null
+++ b/models/vision-language-understanding/llava_next_video-7b/vllm/offline_inference_vision_language.py
@@ -0,0 +1,192 @@
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+"""
+This example shows how to use vLLM for running offline inference
+with the correct prompt format on vision language models.
+
+For most models, the prompt format should follow corresponding examples
+on the HuggingFace model repository.
+"""
+import sys
+from pathlib import Path
+import os
+sys.path.append(str(Path(__file__).resolve().parent.parent.parent))
+import argparse
+import dataclasses
+import inspect
+from vllm.assets.image import ImageAsset
+from vllm.assets.video import VideoAsset
+
+from vllm import LLM, EngineArgs, SamplingParams
+from utils import sampling_add_cli_args
+
+# LLaVA-1.5
+def run_llava(question, engine_params, modality):
+    assert modality == "image"
+    # The <image> placeholder marks where vLLM inserts the image tokens.
+    prompt = f"USER: <image>\n{question}\nASSISTANT:"
+    llm = LLM(**engine_params)
+    stop_token_ids = None
+    return llm, prompt, stop_token_ids
+
+
+# LLaVA-1.6/LLaVA-NeXT
+def run_llava_next(question, engine_params, modality):
+    assert modality == "image"
+    prompt = f"USER: <image>\n{question}\nASSISTANT:"
+    llm = LLM(**engine_params)
+    stop_token_ids = None
+    return llm, prompt, stop_token_ids
+
+
+# LLaVA-NeXT-Video
+# Currently only supports video input
+def run_llava_next_video(question, engine_params, modality):
+    assert modality == "video"
+    prompt = f"USER: