From b7441c45df828bcd6c9877f3aa1b180231cba468 Mon Sep 17 00:00:00 2001
From: "xinchi.tian"
Date: Wed, 9 Oct 2024 14:39:48 +0800
Subject: [PATCH] Add MiniCPM-V-2 in IxRT

link #IAVG7Q

Signed-off-by: xinchi.tian
---
 .../MiniCPM-V-2/vllm/README.md                | 46 ++++++++++
 .../MiniCPM-V-2/vllm/minicpmv-2.0-offline.py  | 91 +++++++++++++++++++
 2 files changed, 137 insertions(+)
 create mode 100644 models/vision-language-understanding/MiniCPM-V-2/vllm/README.md
 create mode 100644 models/vision-language-understanding/MiniCPM-V-2/vllm/minicpmv-2.0-offline.py

diff --git a/models/vision-language-understanding/MiniCPM-V-2/vllm/README.md b/models/vision-language-understanding/MiniCPM-V-2/vllm/README.md
new file mode 100644
index 00000000..2dc49881
--- /dev/null
+++ b/models/vision-language-understanding/MiniCPM-V-2/vllm/README.md
@@ -0,0 +1,46 @@
# MiniCPM-V-2

## Description

MiniCPM-V-2 is a compact and efficient vision-language model designed for image understanding and various natural language processing (NLP) tasks. Building on its predecessor, MiniCPM-V-1, it integrates advances in architecture and optimization techniques, making it suitable for deployment in resource-constrained environments.

## Setup

### Install

To run the model smoothly, you need to get the SDK from the [Resource Center](https://support.iluvatar.com/#/ProductLine?id=2) of the Iluvatar CoreX official website.

```bash
# Install libGL
## CentOS
yum install -y mesa-libGL
## Ubuntu
apt install -y libgl1-mesa-dev

pip3 install timm==0.9.10
pip3 install transformers
pip3 install --user --upgrade pillow -i https://pypi.tuna.tsinghua.edu.cn/simple
```

### Download

- Model: 

Note: because the official weights are missing some files required for vLLM execution, download the additional files from here: to ensure that the file directory matches the structure shown here: .

```bash
# Download the model from the website and make sure its path is "data/MiniCPM-V-2"
mkdir data
```
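If you prefer to script the download, here is a minimal sketch using `huggingface_hub`. Both the package and the `openbmb/MiniCPM-V-2` repo id are assumptions to verify against the links above, and the additional files mentioned in the note still need to be overlaid afterwards:

```python
# Editor's sketch, not part of the original patch.
# Assumes `pip3 install huggingface_hub` and that the weights are published
# under the repo id "openbmb/MiniCPM-V-2" (verify before use).
from huggingface_hub import snapshot_download

# Mirrors the manual step above: weights end up in "data/MiniCPM-V-2"
snapshot_download(repo_id="openbmb/MiniCPM-V-2", local_dir="data/MiniCPM-V-2")
```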
## Inference

Set up the runtime environment first:

```bash
export PT_SDPA_ENABLE_HEAD_DIM_PADDING=1
export PATH=/usr/local/corex/bin:${PATH}
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda/lib64
```

Then fetch a sample image and run the offline inference script:

```bash
wget https://img.zcool.cn/community/012e285a1ea496a8012171323c6bf1.jpg -O dog.jpg
python3 minicpmv-2.0-offline.py --model-path /path/to/model --image-path ./dog.jpg
```

diff --git a/models/vision-language-understanding/MiniCPM-V-2/vllm/minicpmv-2.0-offline.py b/models/vision-language-understanding/MiniCPM-V-2/vllm/minicpmv-2.0-offline.py
new file mode 100644
index 00000000..d6add4d8
--- /dev/null
+++ b/models/vision-language-understanding/MiniCPM-V-2/vllm/minicpmv-2.0-offline.py
@@ -0,0 +1,91 @@
# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
# All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.

import argparse

from PIL import Image
from transformers import AutoTokenizer
from vllm import LLM, SamplingParams


def main(args):
    # List of image file paths
    ## wget https://img.zcool.cn/community/012e285a1ea496a8012171323c6bf1.jpg@3000w_1l_0o_100sh.jpg -O dog.jpg
    IMAGES = [
        args.image_path,  # local image path
    ]

    # Model name or path (local path or Hugging Face model name)
    MODEL_NAME = args.model_path

    # Open the image and convert it to RGB
    image = Image.open(IMAGES[0]).convert("RGB")

    # Initialize the tokenizer
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)

    # Initialize the language model
    llm = LLM(model=MODEL_NAME,
              gpu_memory_utilization=0.95,  # reserve 95% of GPU memory
              trust_remote_code=True,
              max_model_len=1024,
              max_num_seqs=1,
              max_num_batched_tokens=1024)  # adjust to the available memory

    # Build the chat messages; "请描述这张图片" asks the model to describe the image
    messages = [{'role': 'user', 'content': '(<image>./</image>)\n' + '请描述这张图片'}]

    # Apply the chat template; return a prompt string rather than token IDs
    prompt = tokenizer.apply_chat_template(messages,
                                           tokenize=False,
                                           add_generation_prompt=True)

    # Set the stop token IDs
    # 2.0
    stop_token_ids = [tokenizer.eos_id]
    # 2.5
    # stop_token_ids = [tokenizer.eos_id, tokenizer.eot_id]
    # 2.6
    # stop_tokens = ['<|im_end|>', '<|endoftext|>']
    # stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens]

    # Configure the sampling parameters
    sampling_params = SamplingParams(
        stop_token_ids=stop_token_ids,
        # temperature=0.7,
        # top_p=0.8,
        # top_k=100,
        # seed=3472,
        max_tokens=128,
        # min_tokens=150,
        temperature=0,
        use_beam_search=False,
        # length_penalty=1.2,
        best_of=1)

    # Run generation and print the model output
    outputs = llm.generate({
        "prompt": prompt,
        "multi_modal_data": {
            "image": image
        }
    }, sampling_params=sampling_params)
    print(outputs[0].outputs[0].text)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--model-path", type=str, default=None, help="model path")
    parser.add_argument("--image-path", type=str, default=None, help="sample image path")
    args = parser.parse_args()

    main(args)
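

# --- Editor's sketch (hypothetical, not part of the original patch, and not
# invoked above). The IMAGES list hints at multiple inputs, and vLLM's
# llm.generate() also accepts a list of prompt dicts, so one LLM instance can
# describe several images in a single call; with max_num_seqs=1 they are
# simply processed one after another. Reuses the prompt and sampling_params
# built in main().
def describe_images(llm, prompt, sampling_params, image_paths):
    requests = [
        {
            "prompt": prompt,
            "multi_modal_data": {"image": Image.open(p).convert("RGB")},
        }
        for p in image_paths
    ]
    outputs = llm.generate(requests, sampling_params=sampling_params)
    return [out.outputs[0].text for out in outputs]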