From 7cef9b4d338632eb1e1fdf253acb985dcb1035da Mon Sep 17 00:00:00 2001
From: "hongliang.yuan"
Date: Mon, 11 Aug 2025 09:32:55 +0800
Subject: [PATCH] add MiniCPM-o-2_6

---
 .../minicpm_o/vllm/README.md                       |  36 ++++
 .../minicpm_o/vllm/ci/prepare.sh                   |  19 ++
 .../vllm/offline_inference_vision_language.py      | 173 ++++++++++++++++++
 tests/model_info.json                              |  33 ++++
 tests/run_vllm.py                                  |   6 +
 5 files changed, 267 insertions(+)
 create mode 100644 models/multimodal/vision_language_model/minicpm_o/vllm/README.md
 create mode 100644 models/multimodal/vision_language_model/minicpm_o/vllm/ci/prepare.sh
 create mode 100644 models/multimodal/vision_language_model/minicpm_o/vllm/offline_inference_vision_language.py

diff --git a/models/multimodal/vision_language_model/minicpm_o/vllm/README.md b/models/multimodal/vision_language_model/minicpm_o/vllm/README.md
new file mode 100644
index 00000000..2fa3a6bf
--- /dev/null
+++ b/models/multimodal/vision_language_model/minicpm_o/vllm/README.md
@@ -0,0 +1,36 @@
+# MiniCPM-o 2.6 (vLLM)
+
+## Model Description
+
+MiniCPM-o 2.6 is the most capable model in the MiniCPM-o series. With a total of 8B parameters, this end-to-end model achieves performance comparable to GPT-4o-202405 in vision, speech, and multimodal live streaming, making it one of the most versatile and performant models in the open-source community. For the new voice mode, MiniCPM-o 2.6 supports bilingual real-time speech conversation with configurable voices, and also allows for fun capabilities such as emotion/speed/style control, end-to-end voice cloning, and role play. It also advances the visual capabilities of MiniCPM-V 2.6, such as strong OCR, trustworthy behavior, multilingual support, and video understanding. Thanks to its superior token density, MiniCPM-o 2.6 can, for the first time, support multimodal live streaming on end-side devices such as the iPad.
+
+## Supported Environments
+
+| GPU | [IXUCA SDK](https://gitee.com/deep-spark/deepspark#%E5%A4%A9%E6%95%B0%E6%99%BA%E7%AE%97%E8%BD%AF%E4%BB%B6%E6%A0%88-ixuca) | Release |
+| :----: | :----: | :----: |
+| MR-V100 | 4.3.0 | 25.09 |
+
+## Model Preparation
+
+### Prepare Resources
+
+- Model: <https://huggingface.co/openbmb/MiniCPM-o-2_6>
+
+```bash
+cp -r ../../vllm_public_assets/ ./
+```
+
+### Install Dependencies
+
+Contact the Iluvatar administrator to get the missing packages:
+- transformers-4.45.2+corex.4.3.0-py3-none-any.whl
+
+## Model Inference
+
+```bash
+export VLLM_ASSETS_CACHE=../vllm/
+python3 offline_inference_vision_language.py --model ./MiniCPM-o-2_6/ --max-model-len 4096 --max-num-seqs 2 --trust-remote-code --temperature 0.0 --disable-mm-preprocessor-cache
+python3 offline_inference_vision_language.py --model ./MiniCPM-o-2_6/ --max-model-len 4096 --max-num-seqs 2 --trust-remote-code --temperature 0.0 --disable-mm-preprocessor-cache --modality video
+```
+
+## Model Results
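Editorial note on the README above (not part of the patch): the two inference commands assume the MiniCPM-o-2_6 checkpoint and the copied `vllm_public_assets` directory sit next to the README and that the Iluvatar `transformers` wheel is installed. A small pre-flight check along these lines can catch a missing piece early; the paths simply mirror the README and are illustrative only.

```python
# Illustrative pre-flight check for the README's inference commands (not part of the patch).
import os
from importlib.metadata import version

# Paths mirror the README: the HF checkpoint and copied vLLM assets live next to it.
assert os.path.isdir("./MiniCPM-o-2_6"), "download openbmb/MiniCPM-o-2_6 into ./MiniCPM-o-2_6 first"
assert os.path.isdir("./vllm_public_assets"), "run `cp -r ../../vllm_public_assets/ ./` first"
print("transformers version:", version("transformers"))  # expect the 4.45.2+corex Iluvatar build
```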
diff --git a/models/multimodal/vision_language_model/minicpm_o/vllm/ci/prepare.sh b/models/multimodal/vision_language_model/minicpm_o/vllm/ci/prepare.sh
new file mode 100644
index 00000000..072ab438
--- /dev/null
+++ b/models/multimodal/vision_language_model/minicpm_o/vllm/ci/prepare.sh
@@ -0,0 +1,19 @@
+#!/bin/bash
+# Copyright (c) 2025, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set -x
+cp -r ../../vllm_public_assets/ ./
+pip install /mnt/deepspark/install/transformers-4.45.2+corex.4.3.0-py3-none-any.whl
diff --git a/models/multimodal/vision_language_model/minicpm_o/vllm/offline_inference_vision_language.py b/models/multimodal/vision_language_model/minicpm_o/vllm/offline_inference_vision_language.py
new file mode 100644
index 00000000..fd70829e
--- /dev/null
+++ b/models/multimodal/vision_language_model/minicpm_o/vllm/offline_inference_vision_language.py
@@ -0,0 +1,173 @@
+"""
+This example shows how to use vLLM for running offline inference
+with the correct prompt format on vision language models.
+
+For most models, the prompt format should follow corresponding examples
+on the HuggingFace model repository.
+"""
+import sys
+from pathlib import Path
+import os
+sys.path.append(str(Path(__file__).resolve().parent.parent.parent))
+import argparse
+import dataclasses
+import inspect
+from vllm.assets.image import ImageAsset
+from vllm.assets.video import VideoAsset
+from vllm import LLM, EngineArgs, SamplingParams
+from utils import sampling_add_cli_args
+from transformers import AutoTokenizer
+
+def run_minicpmv_base(question: str, engine_params, model, modality: str):
+    assert modality in ["image", "video"]
+    # If you want to use `MiniCPM-o-2_6` with audio inputs, check `audio_language.py`  # noqa
+
+    # 2.0
+    # The official repo doesn't work yet, so we need to use a fork for now.
+    # For more details, please see: https://github.com/vllm-project/vllm/pull/4087#issuecomment-2250397630  # noqa
+    # model_name = "HwwwH/MiniCPM-V-2"
+
+    # 2.5
+    # model_name = "openbmb/MiniCPM-Llama3-V-2_5"
+
+    # 2.6
+    # model_name = "openbmb/MiniCPM-V-2_6"
+    # o2.6
+
+    # Supported modalities:
+    # 2.0: image
+    # 2.5: image
+    # 2.6: image, video
+    # o2.6: image, video, audio
+    # model_name = "openbmb/MiniCPM-o-2_6"
+    tokenizer = AutoTokenizer.from_pretrained(model,
+                                              trust_remote_code=True)
+    llm = LLM(**engine_params)
+    # NOTE: the stop_token_ids differ across MiniCPM-V versions.
+    # 2.0
+    # stop_token_ids = [tokenizer.eos_id]
+
+    # 2.5
+    # stop_token_ids = [tokenizer.eos_id, tokenizer.eot_id]
+
+    # 2.6 / o2.6
+    stop_tokens = ['<|im_end|>', '<|endoftext|>']
+    stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens]
+
+    modality_placeholder = {
+        "image": "(<image>./</image>)",
+        "video": "(<video>./</video>)",
+    }
+
+    messages = [{
+        'role': 'user',
+        'content': f'{modality_placeholder[modality]}\n{question}'
+    }]
+    prompt = tokenizer.apply_chat_template(messages,
+                                           tokenize=False,
+                                           add_generation_prompt=True)
+    return llm, prompt, stop_token_ids
+
+
+def run_minicpmo(question: str, engine_params, model, modality: str):
+    return run_minicpmv_base(question, engine_params, model, modality)
+
+
+def get_multi_modal_input(args):
+    """
+    return {
+        "data": image or video,
+        "question": question,
+    }
+    """
+    if args.modality == "image":
+        # Input image and question
+        image = ImageAsset("cherry_blossom").pil_image.convert("RGB")
+        img_question = "What is the content of this image?"
+
+        return {
+            "data": image,
+            "question": img_question,
+        }
+
+    if args.modality == "video":
+        # Input video and question
+        video = VideoAsset(name="sample_demo_1.mp4",
+                           num_frames=args.num_frames).np_ndarrays
+        vid_question = "Why is this video funny?"
+
+        return {
+            "data": video,
+            "question": vid_question,
+        }
+
+    msg = f"Modality {args.modality} is not supported."
+    raise ValueError(msg)
+
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--num-prompts',
+                        type=int,
+                        default=1,
+                        help='Number of prompts to run.')
+    parser.add_argument('--modality',
+                        type=str,
+                        default="image",
+                        help='Modality of the input.')
+    parser.add_argument('--num-frames',
+                        type=int,
+                        default=16,
+                        help='Number of frames to extract from the video.')
+    parser = EngineArgs.add_cli_args(parser)
+    parser = sampling_add_cli_args(parser)
+    args = parser.parse_args()
+    engine_args = [attr.name for attr in dataclasses.fields(EngineArgs)]
+    sampling_args = [
+        param.name
+        for param in list(
+            inspect.signature(SamplingParams).parameters.values()
+        )
+    ]
+    engine_params = {attr: getattr(args, attr) for attr in engine_args}
+    sampling_params = {
+        attr: getattr(args, attr) for attr in sampling_args if attr in args
+    }
+
+    modality = args.modality
+    mm_input = get_multi_modal_input(args)
+    data = mm_input["data"]
+    question = mm_input["question"]
+
+    llm, prompt, stop_token_ids = run_minicpmo(question, engine_params, args.model, args.modality)
+    sampling_params['stop_token_ids'] = stop_token_ids
+
+    # Build SamplingParams from the CLI arguments; the stop_token_ids set above
+    # end generation at MiniCPM's <|im_end|> / <|endoftext|> tokens.
+    sampling_params = SamplingParams(**sampling_params)
+
+    assert args.num_prompts > 0
+    if args.num_prompts == 1:
+        # Single inference
+        inputs = {
+            "prompt": prompt,
+            "multi_modal_data": {
+                modality: data
+            },
+        }

+    else:
+        # Batch inference
+        inputs = [{
+            "prompt": prompt,
+            "multi_modal_data": {
+                modality: data
+            },
+        } for _ in range(args.num_prompts)]
+
+    outputs = llm.generate(inputs, sampling_params=sampling_params)
+
+    for o in outputs:
+        generated_text = o.outputs[0].text
+        print(generated_text)
\ No newline at end of file
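Editorial note on the script above (not part of the patch): it imports `sampling_add_cli_args` from a repo-local `utils` module that this diff does not include, then copies any attribute of `args` whose name matches an `inspect.signature(SamplingParams)` parameter. As a rough mental model only, and an assumption rather than the repository's actual helper, such a function simply mirrors a subset of `SamplingParams` constructor arguments onto argparse flags whose destinations match the parameter names:

```python
# Hypothetical stand-in for the repo's utils.sampling_add_cli_args (assumption, not the real code).
import argparse


def sampling_add_cli_args(parser: argparse.ArgumentParser) -> argparse.ArgumentParser:
    # The dest names (temperature, top_p, ...) must match SamplingParams parameter
    # names, because the caller copies them back with getattr(args, name).
    parser.add_argument('--temperature', type=float, default=1.0)
    parser.add_argument('--top-p', type=float, default=1.0)
    parser.add_argument('--top-k', type=int, default=-1)
    parser.add_argument('--max-tokens', type=int, default=64)
    return parser
```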
diff --git a/tests/model_info.json b/tests/model_info.json
index e4e18362..e3195a95 100644
--- a/tests/model_info.json
+++ b/tests/model_info.json
@@ -8032,6 +8032,39 @@
             "type": "inference",
             "hasDemo": false,
             "demoType": ""
+        },
+        {
+            "display_name": "MiniCPM-o 2.6",
+            "model_name": "minicpm_o",
+            "framework": "vllm",
+            "release_version": "25.09",
+            "release_sdk": "4.3.0",
+            "release_gpgpu": "MR-V100",
+            "latest_sdk": "",
+            "latest_gpgpu": "",
+            "category": "multimodal/vision_language_model",
+            "toolbox": "",
+            "mdims": "",
+            "dataset": "",
+            "license": "",
+            "model_path": "models/multimodal/vision_language_model/minicpm_o/vllm",
+            "readme_file": "models/multimodal/vision_language_model/minicpm_o/vllm/README.md",
+            "bitbucket_repo": "",
+            "bitbucket_branch": "",
+            "bitbucket_path": "",
+            "develop_owner": "",
+            "github_repo": "",
+            "github_branch": "",
+            "github_path": "",
+            "datasets": "",
+            "download_url": "https://huggingface.co/openbmb/MiniCPM-o-2_6",
+            "need_third_part": false,
+            "precisions": [
+                "fp16"
+            ],
+            "type": "inference",
+            "hasDemo": false,
+            "demoType": ""
         }
     ]
 }
\ No newline at end of file
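Editorial note on the `tests/model_info.json` entry above (not part of the patch): the test driver keys on fields such as `model_name`, `model_path`, and `readme_file`. A quick local consistency check could look like the sketch below; it assumes the entries form a JSON array either at the top level or under a single key, which is an assumption about the file layout rather than something this diff shows.

```python
# Illustrative consistency check for the new minicpm_o entry (not part of the patch).
import json
import os

with open("tests/model_info.json") as f:
    data = json.load(f)

# Assumption: the model entries are a JSON array at the top level or under one key.
models = data if isinstance(data, list) else next(v for v in data.values() if isinstance(v, list))
entry = next(m for m in models if m.get("model_name") == "minicpm_o")

assert os.path.isdir(entry["model_path"]), entry["model_path"]
assert os.path.isfile(entry["readme_file"]), entry["readme_file"]
print("ok:", entry["display_name"], entry["download_url"])
```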
diff --git a/tests/run_vllm.py b/tests/run_vllm.py
index b6c5f774..39b3c222 100644
--- a/tests/run_vllm.py
+++ b/tests/run_vllm.py
@@ -335,6 +335,12 @@ def run_nlp_testcase(model):
         cd ../{model['model_path']}
         python3 offline_inference.py --model-path /mnt/deepspark/data/checkpoints/{checkpoint_n} --tp 1
         """
+    elif model_name == "minicpm_o":
+        script = f"""
+        set -x
+        cd ../{model['model_path']}
+        python3 offline_inference_vision_language.py --model ./{model_name} --max-model-len 4096 --max-num-seqs 2 --trust-remote-code --temperature 0.0 --disable-mm-preprocessor-cache
+        """

     r, t = run_script(script)
     sout = r.stdout
--
Gitee
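Editorial note (not part of the patch): condensed to its essentials, the inference path this patch exercises reduces to the sketch below. The checkpoint path and generation settings mirror the README, and the rendered prompt ultimately depends on the checkpoint's own chat template, so treat this as an illustration rather than a replacement for the offline script.

```python
# Condensed view of what offline_inference_vision_language.py does for the image
# modality (illustrative; assumes a locally downloaded MiniCPM-o-2_6 checkpoint).
from transformers import AutoTokenizer
from vllm import LLM, SamplingParams
from vllm.assets.image import ImageAsset

model_path = "./MiniCPM-o-2_6/"
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
llm = LLM(model=model_path, trust_remote_code=True, max_model_len=4096, max_num_seqs=2)

# MiniCPM-o 2.6 stops on <|im_end|> / <|endoftext|>, and images are referenced
# through the (<image>./</image>) placeholder in the user turn.
stop_ids = [tokenizer.convert_tokens_to_ids(t) for t in ("<|im_end|>", "<|endoftext|>")]
messages = [{"role": "user",
             "content": "(<image>./</image>)\nWhat is the content of this image?"}]
prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

image = ImageAsset("cherry_blossom").pil_image.convert("RGB")
outputs = llm.generate(
    {"prompt": prompt, "multi_modal_data": {"image": image}},
    SamplingParams(temperature=0.0, stop_token_ids=stop_ids),
)
print(outputs[0].outputs[0].text)
```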