From 7cef9b4d338632eb1e1fdf253acb985dcb1035da Mon Sep 17 00:00:00 2001
From: "hongliang.yuan"
Date: Mon, 11 Aug 2025 09:32:55 +0800
Subject: [PATCH] add MiniCPM-o-2_6

---
 .../minicpm_o/vllm/README.md                       |  36 ++++
 .../minicpm_o/vllm/ci/prepare.sh                   |  19 ++
 .../vllm/offline_inference_vision_language.py      | 173 ++++++++++++++++++
 tests/model_info.json                              |  33 ++++
 tests/run_vllm.py                                  |   6 +
 5 files changed, 267 insertions(+)
 create mode 100644 models/multimodal/vision_language_model/minicpm_o/vllm/README.md
 create mode 100644 models/multimodal/vision_language_model/minicpm_o/vllm/ci/prepare.sh
 create mode 100644 models/multimodal/vision_language_model/minicpm_o/vllm/offline_inference_vision_language.py

diff --git a/models/multimodal/vision_language_model/minicpm_o/vllm/README.md b/models/multimodal/vision_language_model/minicpm_o/vllm/README.md
new file mode 100644
index 00000000..2fa3a6bf
--- /dev/null
+++ b/models/multimodal/vision_language_model/minicpm_o/vllm/README.md
@@ -0,0 +1,36 @@
+# MiniCPM-o 2.6 (vLLM)
+
+## Model Description
+
+MiniCPM-o 2.6 is the most capable model in the MiniCPM-o series. With a total of 8B parameters, this end-to-end model achieves performance comparable to GPT-4o-202405 in vision, speech, and multimodal live streaming, making it one of the most versatile and performant models in the open-source community. For the new voice mode, MiniCPM-o 2.6 supports bilingual real-time speech conversation with configurable voices, and also allows for fun capabilities such as emotion/speed/style control, end-to-end voice cloning, and role play. It also advances the visual capabilities of MiniCPM-V 2.6, such as strong OCR, trustworthy behavior, multilingual support, and video understanding. Thanks to its superior token density, MiniCPM-o 2.6 can, for the first time, support multimodal live streaming on end-side devices such as the iPad.
+
+## Supported Environments
+
+| GPU | [IXUCA SDK](https://gitee.com/deep-spark/deepspark#%E5%A4%A9%E6%95%B0%E6%99%BA%E7%AE%97%E8%BD%AF%E4%BB%B6%E6%A0%88-ixuca) | Release |
+| :----: | :----: | :----: |
+| MR-V100 | 4.3.0 | 25.09 |
+
+## Model Preparation
+
+### Prepare Resources
+
+- Model: <https://huggingface.co/openbmb/MiniCPM-o-2_6>
+
+```bash
+cp -r ../../vllm_public_assets/ ./
+```
+
+### Install Dependencies
+
+Contact the Iluvatar administrator to get the missing packages:
+- transformers-4.45.2+corex.4.3.0-py3-none-any.whl
+
+## Model Inference
+
+```bash
+export VLLM_ASSETS_CACHE=../vllm/
+python3 offline_inference_vision_language.py --model ./MiniCPM-o-2_6/ --max-model-len 4096 --max-num-seqs 2 --trust-remote-code --temperature 0.0 --disable-mm-preprocessor-cache
+python3 offline_inference_vision_language.py --model ./MiniCPM-o-2_6/ --max-model-len 4096 --max-num-seqs 2 --trust-remote-code --temperature 0.0 --disable-mm-preprocessor-cache --modality video
+```
+
+## Model Results
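Editorial note on the README above (not part of the patch): the two inference commands assume the MiniCPM-o-2_6 checkpoint and the copied `vllm_public_assets` directory sit next to the README and that the Iluvatar `transformers` wheel is installed. A small pre-flight check along these lines can catch a missing piece early; the paths simply mirror the README and are illustrative only.

```python
# Illustrative pre-flight check for the README's inference commands (not part of the patch).
import os
from importlib.metadata import version

# Paths mirror the README: the HF checkpoint and copied vLLM assets live next to it.
assert os.path.isdir("./MiniCPM-o-2_6"), "download openbmb/MiniCPM-o-2_6 into ./MiniCPM-o-2_6 first"
assert os.path.isdir("./vllm_public_assets"), "run `cp -r ../../vllm_public_assets/ ./` first"
print("transformers version:", version("transformers"))  # expect the 4.45.2+corex Iluvatar build
```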
diff --git a/models/multimodal/vision_language_model/minicpm_o/vllm/ci/prepare.sh b/models/multimodal/vision_language_model/minicpm_o/vllm/ci/prepare.sh
new file mode 100644
index 00000000..072ab438
--- /dev/null
+++ b/models/multimodal/vision_language_model/minicpm_o/vllm/ci/prepare.sh
@@ -0,0 +1,19 @@
+#!/bin/bash
+# Copyright (c) 2025, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set -x
+cp -r ../../vllm_public_assets/ ./
+pip install /mnt/deepspark/install/transformers-4.45.2+corex.4.3.0-py3-none-any.whl
diff --git a/models/multimodal/vision_language_model/minicpm_o/vllm/offline_inference_vision_language.py b/models/multimodal/vision_language_model/minicpm_o/vllm/offline_inference_vision_language.py
new file mode 100644
index 00000000..fd70829e
--- /dev/null
+++ b/models/multimodal/vision_language_model/minicpm_o/vllm/offline_inference_vision_language.py
@@ -0,0 +1,173 @@
+"""
+This example shows how to use vLLM for running offline inference
+with the correct prompt format on vision language models.
+
+For most models, the prompt format should follow corresponding examples
+on the HuggingFace model repository.
+"""
+import sys
+from pathlib import Path
+import os
+sys.path.append(str(Path(__file__).resolve().parent.parent.parent))
+import argparse
+import dataclasses
+import inspect
+from vllm.assets.image import ImageAsset
+from vllm.assets.video import VideoAsset
+from vllm import LLM, EngineArgs, SamplingParams
+from utils import sampling_add_cli_args
+from transformers import AutoTokenizer
+
+def run_minicpmv_base(question: str, engine_params, model, modality: str):
+    assert modality in ["image", "video"]
+    # If you want to use `MiniCPM-o-2_6` with audio inputs, check `audio_language.py`  # noqa
+
+    # 2.0
+    # The official repo doesn't work yet, so we need to use a fork for now.
+    # For more details, please see: https://github.com/vllm-project/vllm/pull/4087#issuecomment-2250397630  # noqa
+    # model_name = "HwwwH/MiniCPM-V-2"
+
+    # 2.5
+    # model_name = "openbmb/MiniCPM-Llama3-V-2_5"
+
+    # 2.6
+    # model_name = "openbmb/MiniCPM-V-2_6"
+    # o2.6
+
+    # Supported modalities:
+    # 2.0: image
+    # 2.5: image
+    # 2.6: image, video
+    # o2.6: image, video, audio
+    # model_name = "openbmb/MiniCPM-o-2_6"
+    tokenizer = AutoTokenizer.from_pretrained(model,
+                                              trust_remote_code=True)
+    llm = LLM(**engine_params)
+    # NOTE: the stop_token_ids differ across MiniCPM-V versions.
+    # 2.0
+    # stop_token_ids = [tokenizer.eos_id]
+
+    # 2.5
+    # stop_token_ids = [tokenizer.eos_id, tokenizer.eot_id]
+
+    # 2.6 / o2.6
+    stop_tokens = ['<|im_end|>', '<|endoftext|>']
+    stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens]
+
+    modality_placeholder = {
+        "image": "(<image>./</image>)",
+        "video": "(<video>./</video>)",
+    }
+
+    messages = [{
+        'role': 'user',
+        'content': f'{modality_placeholder[modality]}\n{question}'
+    }]
+    prompt = tokenizer.apply_chat_template(messages,
+                                           tokenize=False,
+                                           add_generation_prompt=True)
+    return llm, prompt, stop_token_ids
+
+
+def run_minicpmo(question: str, engine_params, model, modality: str):
+    return run_minicpmv_base(question, engine_params, model, modality)
+
+
+def get_multi_modal_input(args):
+    """
+    return {
+        "data": image or video,
+        "question": question,
+    }
+    """
+    if args.modality == "image":
+        # Input image and question
+        image = ImageAsset("cherry_blossom").pil_image.convert("RGB")
+        img_question = "What is the content of this image?"
+
+        return {
+            "data": image,
+            "question": img_question,
+        }
+
+    if args.modality == "video":
+        # Input video and question
+        video = VideoAsset(name="sample_demo_1.mp4",
+                           num_frames=args.num_frames).np_ndarrays
+        vid_question = "Why is this video funny?"
+
+        return {
+            "data": video,
+            "question": vid_question,
+        }
+
+    msg = f"Modality {args.modality} is not supported."
+    raise ValueError(msg)
+
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--num-prompts',
+                        type=int,
+                        default=1,
+                        help='Number of prompts to run.')
+    parser.add_argument('--modality',
+                        type=str,
+                        default="image",
+                        help='Modality of the input.')
+    parser.add_argument('--num-frames',
+                        type=int,
+                        default=16,
+                        help='Number of frames to extract from the video.')
+    parser = EngineArgs.add_cli_args(parser)
+    parser = sampling_add_cli_args(parser)
+    args = parser.parse_args()
+    engine_args = [attr.name for attr in dataclasses.fields(EngineArgs)]
+    sampling_args = [
+        param.name
+        for param in list(
+            inspect.signature(SamplingParams).parameters.values()
+        )
+    ]
+    engine_params = {attr: getattr(args, attr) for attr in engine_args}
+    sampling_params = {
+        attr: getattr(args, attr) for attr in sampling_args if attr in args
+    }
+
+    modality = args.modality
+    mm_input = get_multi_modal_input(args)
+    data = mm_input["data"]
+    question = mm_input["question"]
+
+    llm, prompt, stop_token_ids = run_minicpmo(question, engine_params, args.model, args.modality)
+    sampling_params['stop_token_ids'] = stop_token_ids
+
+    # Build SamplingParams from the CLI arguments; the stop_token_ids set above
+    # end generation at MiniCPM's <|im_end|> / <|endoftext|> tokens.
+    sampling_params = SamplingParams(**sampling_params)
+
+    assert args.num_prompts > 0
+    if args.num_prompts == 1:
+        # Single inference
+        inputs = {
+            "prompt": prompt,
+            "multi_modal_data": {
+                modality: data
+            },
+        }

+    else:
+        # Batch inference
+        inputs = [{
+            "prompt": prompt,
+            "multi_modal_data": {
+                modality: data
+            },
+        } for _ in range(args.num_prompts)]
+
+    outputs = llm.generate(inputs, sampling_params=sampling_params)
+
+    for o in outputs:
+        generated_text = o.outputs[0].text
+        print(generated_text)
\ No newline at end of file
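Editorial note on the script above (not part of the patch): it imports `sampling_add_cli_args` from a repo-local `utils` module that this diff does not include, then copies any attribute of `args` whose name matches an `inspect.signature(SamplingParams)` parameter. As a rough mental model only, and an assumption rather than the repository's actual helper, such a function simply mirrors a subset of `SamplingParams` constructor arguments onto argparse flags whose destinations match the parameter names:

```python
# Hypothetical stand-in for the repo's utils.sampling_add_cli_args (assumption, not the real code).
import argparse


def sampling_add_cli_args(parser: argparse.ArgumentParser) -> argparse.ArgumentParser:
    # The dest names (temperature, top_p, ...) must match SamplingParams parameter
    # names, because the caller copies them back with getattr(args, name).
    parser.add_argument('--temperature', type=float, default=1.0)
    parser.add_argument('--top-p', type=float, default=1.0)
    parser.add_argument('--top-k', type=int, default=-1)
    parser.add_argument('--max-tokens', type=int, default=64)
    return parser
```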
diff --git a/tests/model_info.json b/tests/model_info.json
index e4e18362..e3195a95 100644
--- a/tests/model_info.json
+++ b/tests/model_info.json
@@ -8032,6 +8032,39 @@
             "type": "inference",
             "hasDemo": false,
             "demoType": ""
+        },
+        {
+            "display_name": "MiniCPM-o 2.6",
+            "model_name": "minicpm_o",
+            "framework": "vllm",
+            "release_version": "25.09",
+            "release_sdk": "4.3.0",
+            "release_gpgpu": "MR-V100",
+            "latest_sdk": "",
+            "latest_gpgpu": "",
+            "category": "multimodal/vision_language_model",
+            "toolbox": "",
+            "mdims": "",
+            "dataset": "",
+            "license": "",
+            "model_path": "models/multimodal/vision_language_model/minicpm_o/vllm",
+            "readme_file": "models/multimodal/vision_language_model/minicpm_o/vllm/README.md",
+            "bitbucket_repo": "",
+            "bitbucket_branch": "",
+            "bitbucket_path": "",
+            "develop_owner": "",
+            "github_repo": "",
+            "github_branch": "",
+            "github_path": "",
+            "datasets": "",
+            "download_url": "https://huggingface.co/openbmb/MiniCPM-o-2_6",
+            "need_third_part": false,
+            "precisions": [
+                "fp16"
+            ],
+            "type": "inference",
+            "hasDemo": false,
+            "demoType": ""
         }
     ]
 }
\ No newline at end of file
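Editorial note on the `tests/model_info.json` entry above (not part of the patch): the test driver keys on fields such as `model_name`, `model_path`, and `readme_file`. A quick local consistency check could look like the sketch below; it assumes the entries form a JSON array either at the top level or under a single key, which is an assumption about the file layout rather than something this diff shows.

```python
# Illustrative consistency check for the new minicpm_o entry (not part of the patch).
import json
import os

with open("tests/model_info.json") as f:
    data = json.load(f)

# Assumption: the model entries are a JSON array at the top level or under one key.
models = data if isinstance(data, list) else next(v for v in data.values() if isinstance(v, list))
entry = next(m for m in models if m.get("model_name") == "minicpm_o")

assert os.path.isdir(entry["model_path"]), entry["model_path"]
assert os.path.isfile(entry["readme_file"]), entry["readme_file"]
print("ok:", entry["display_name"], entry["download_url"])
```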
diff --git a/tests/run_vllm.py b/tests/run_vllm.py
index b6c5f774..39b3c222 100644
--- a/tests/run_vllm.py
+++ b/tests/run_vllm.py
@@ -335,6 +335,12 @@ def run_nlp_testcase(model):
         cd ../{model['model_path']}
         python3 offline_inference.py --model-path /mnt/deepspark/data/checkpoints/{checkpoint_n} --tp 1
         """
+    elif model_name == "minicpm_o":
+        script = f"""
+        set -x
+        cd ../{model['model_path']}
+        python3 offline_inference_vision_language.py --model ./{model_name} --max-model-len 4096 --max-num-seqs 2 --trust-remote-code --temperature 0.0 --disable-mm-preprocessor-cache
+        """

     r, t = run_script(script)
     sout = r.stdout
--
Gitee
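Editorial note (not part of the patch): condensed to its essentials, the inference path this patch exercises reduces to the sketch below. The checkpoint path and generation settings mirror the README, and the rendered prompt ultimately depends on the checkpoint's own chat template, so treat this as an illustration rather than a replacement for the offline script.

```python
# Condensed view of what offline_inference_vision_language.py does for the image
# modality (illustrative; assumes a locally downloaded MiniCPM-o-2_6 checkpoint).
from transformers import AutoTokenizer
from vllm import LLM, SamplingParams
from vllm.assets.image import ImageAsset

model_path = "./MiniCPM-o-2_6/"
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
llm = LLM(model=model_path, trust_remote_code=True, max_model_len=4096, max_num_seqs=2)

# MiniCPM-o 2.6 stops on <|im_end|> / <|endoftext|>, and images are referenced
# through the (<image>./</image>) placeholder in the user turn.
stop_ids = [tokenizer.convert_tokens_to_ids(t) for t in ("<|im_end|>", "<|endoftext|>")]
messages = [{"role": "user",
             "content": "(<image>./</image>)\nWhat is the content of this image?"}]
prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

image = ImageAsset("cherry_blossom").pil_image.convert("RGB")
outputs = llm.generate(
    {"prompt": prompt, "multi_modal_data": {"image": image}},
    SamplingParams(temperature=0.0, stop_token_ids=stop_ids),
)
print(outputs[0].outputs[0].text)
```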