From b7441c45df828bcd6c9877f3aa1b180231cba468 Mon Sep 17 00:00:00 2001
From: "xinchi.tian"
Date: Wed, 9 Oct 2024 14:39:48 +0800
Subject: [PATCH] Add MiniCPM-V-2 in IxRT

link #IAVG7Q

Signed-off-by: xinchi.tian
---
 .../MiniCPM-V-2/vllm/README.md                | 46 ++++++++++
 .../MiniCPM-V-2/vllm/minicpmv-2.0-offline.py  | 91 +++++++++++++++++++
 2 files changed, 137 insertions(+)
 create mode 100644 models/vision-language-understanding/MiniCPM-V-2/vllm/README.md
 create mode 100644 models/vision-language-understanding/MiniCPM-V-2/vllm/minicpmv-2.0-offline.py

diff --git a/models/vision-language-understanding/MiniCPM-V-2/vllm/README.md b/models/vision-language-understanding/MiniCPM-V-2/vllm/README.md
new file mode 100644
index 00000000..2dc49881
--- /dev/null
+++ b/models/vision-language-understanding/MiniCPM-V-2/vllm/README.md
@@ -0,0 +1,46 @@
# MiniCPM-V-2

## Description

MiniCPM-V-2 is a compact and efficient vision-language model designed for image understanding and various natural language processing (NLP) tasks. Building on its predecessor, MiniCPM-V-1, it integrates advances in architecture and optimization techniques, making it suitable for deployment in resource-constrained environments.

## Setup

### Install

To run the model smoothly, you need to get the SDK from the [Resource Center](https://support.iluvatar.com/#/ProductLine?id=2) of the Iluvatar CoreX official website.

```bash
# Install libGL
## CentOS
yum install -y mesa-libGL
## Ubuntu
apt install -y libgl1-mesa-dev

pip3 install timm==0.9.10
pip3 install transformers
pip3 install --user --upgrade pillow -i https://pypi.tuna.tsinghua.edu.cn/simple
```

### Download

- Model: 

Note: because the official weights are missing some files required for vLLM execution, download the additional files from here: to ensure that the file directory matches the structure shown here: .

```bash
# Download the model from the website and make sure its path is "data/MiniCPM-V-2"
mkdir data
```
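If you prefer to script the download, here is a minimal sketch using `huggingface_hub`. Both the package and the `openbmb/MiniCPM-V-2` repo id are assumptions to verify against the links above, and the additional files mentioned in the note still need to be overlaid afterwards:

```python
# Editor's sketch, not part of the original patch.
# Assumes `pip3 install huggingface_hub` and that the weights are published
# under the repo id "openbmb/MiniCPM-V-2" (verify before use).
from huggingface_hub import snapshot_download

# Mirrors the manual step above: weights end up in "data/MiniCPM-V-2"
snapshot_download(repo_id="openbmb/MiniCPM-V-2", local_dir="data/MiniCPM-V-2")
```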
## Inference

Set up the runtime environment first:

```bash
export PT_SDPA_ENABLE_HEAD_DIM_PADDING=1
export PATH=/usr/local/corex/bin:${PATH}
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda/lib64
```

Then fetch a sample image and run the offline inference script:

```bash
wget https://img.zcool.cn/community/012e285a1ea496a8012171323c6bf1.jpg -O dog.jpg
python3 minicpmv-2.0-offline.py --model-path /path/to/model --image-path ./dog.jpg
```

diff --git a/models/vision-language-understanding/MiniCPM-V-2/vllm/minicpmv-2.0-offline.py b/models/vision-language-understanding/MiniCPM-V-2/vllm/minicpmv-2.0-offline.py
new file mode 100644
index 00000000..d6add4d8
--- /dev/null
+++ b/models/vision-language-understanding/MiniCPM-V-2/vllm/minicpmv-2.0-offline.py
@@ -0,0 +1,91 @@
# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
# All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.

import argparse

from PIL import Image
from transformers import AutoTokenizer
from vllm import LLM, SamplingParams


def main(args):
    # List of image file paths
    ## wget https://img.zcool.cn/community/012e285a1ea496a8012171323c6bf1.jpg@3000w_1l_0o_100sh.jpg -O dog.jpg
    IMAGES = [
        args.image_path,  # local image path
    ]

    # Model name or path (local path or Hugging Face model name)
    MODEL_NAME = args.model_path

    # Open the image and convert it to RGB
    image = Image.open(IMAGES[0]).convert("RGB")

    # Initialize the tokenizer
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)

    # Initialize the language model
    llm = LLM(model=MODEL_NAME,
              gpu_memory_utilization=0.95,  # reserve 95% of GPU memory
              trust_remote_code=True,
              max_model_len=1024,
              max_num_seqs=1,
              max_num_batched_tokens=1024)  # adjust to the available memory

    # Build the chat messages; "请描述这张图片" asks the model to describe the image
    messages = [{'role': 'user', 'content': '(<image>./</image>)\n' + '请描述这张图片'}]

    # Apply the chat template; return a prompt string rather than token IDs
    prompt = tokenizer.apply_chat_template(messages,
                                           tokenize=False,
                                           add_generation_prompt=True)

    # Set the stop token IDs
    # 2.0
    stop_token_ids = [tokenizer.eos_id]
    # 2.5
    # stop_token_ids = [tokenizer.eos_id, tokenizer.eot_id]
    # 2.6
    # stop_tokens = ['<|im_end|>', '<|endoftext|>']
    # stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens]

    # Configure the sampling parameters
    sampling_params = SamplingParams(
        stop_token_ids=stop_token_ids,
        # temperature=0.7,
        # top_p=0.8,
        # top_k=100,
        # seed=3472,
        max_tokens=128,
        # min_tokens=150,
        temperature=0,
        use_beam_search=False,
        # length_penalty=1.2,
        best_of=1)

    # Run generation and print the model output
    outputs = llm.generate({
        "prompt": prompt,
        "multi_modal_data": {
            "image": image
        }
    }, sampling_params=sampling_params)
    print(outputs[0].outputs[0].text)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--model-path", type=str, default=None, help="model path")
    parser.add_argument("--image-path", type=str, default=None, help="sample image path")
    args = parser.parse_args()

    main(args)
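

# --- Editor's sketch (hypothetical, not part of the original patch, and not
# invoked above). The IMAGES list hints at multiple inputs, and vLLM's
# llm.generate() also accepts a list of prompt dicts, so one LLM instance can
# describe several images in a single call; with max_num_seqs=1 they are
# simply processed one after another. Reuses the prompt and sampling_params
# built in main().
def describe_images(llm, prompt, sampling_params, image_paths):
    requests = [
        {
            "prompt": prompt,
            "multi_modal_data": {"image": Image.open(p).convert("RGB")},
        }
        for p in image_paths
    ]
    outputs = llm.generate(requests, sampling_params=sampling_params)
    return [out.outputs[0].text for out in outputs]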