diff --git a/models/multimodal/vision_language_model/glm-4v/vllm/README.md b/models/multimodal/vision_language_model/glm-4v/vllm/README.md new file mode 100644 index 0000000000000000000000000000000000000000..b450696f2b4bf40f4961396d4042b6ce3bea6ee6 --- /dev/null +++ b/models/multimodal/vision_language_model/glm-4v/vllm/README.md @@ -0,0 +1,34 @@ +# GLM-4v (vLLM) + +## Model Description + +GLM-4V-9B is the open-source multimodal version of Zhipu AI's latest generation pre-trained model GLM-4 series. GLM-4V-9B demonstrates exceptional performance in various multimodal evaluations, including bilingual (Chinese and English) multi-turn conversations at a high resolution of 1120 * 1120, comprehensive Chinese-English capabilities, perception reasoning, text recognition, and chart understanding. It surpasses GPT-4-turbo-2024-04-09, Gemini 1.0 Pro, Qwen-VL-Max, and Claude 3 Opus in these aspects. + +## Supported Environments + +| GPU | [IXUCA SDK](https://gitee.com/deep-spark/deepspark#%E5%A4%A9%E6%95%B0%E6%99%BA%E7%AE%97%E8%BD%AF%E4%BB%B6%E6%A0%88-ixuca) | Release | +| :----: | :----: | :----: | +| MR-V100 | 4.3.0 | 25.09 | + +## Model Preparation + +### Prepare Resources + +- Model: + +```bash +cp -r ../../vllm_public_assets/ ./ +``` + +### Install Dependencies + +In order to run the model smoothly, you need to get the sdk from [resource center](https://support.iluvatar.com/#/ProductLine?id=2) of Iluvatar CoreX official website. + +## Model Inference + +```bash +export VLLM_ASSETS_CACHE=../vllm/ +python3 offline_inference_vision_language.py --model /path/to/glm-4v-9b --max-tokens 256 -tp 4 --trust-remote-code --temperature 0.0 --hf-overrides '{"architectures": ["GLM4VForCausalLM"]}' +``` + +## Model Results \ No newline at end of file diff --git a/models/multimodal/vision_language_model/glm-4v/vllm/ci/prepare.sh b/models/multimodal/vision_language_model/glm-4v/vllm/ci/prepare.sh new file mode 100644 index 0000000000000000000000000000000000000000..b54c8d39af69827a8993f582b3029fd463c58c0b --- /dev/null +++ b/models/multimodal/vision_language_model/glm-4v/vllm/ci/prepare.sh @@ -0,0 +1,19 @@ +#!/bin/bash +# Copyright (c) 2025, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +set -x + +cp -r ../../vllm_public_assets/ ./ diff --git a/models/multimodal/vision_language_model/glm-4v/vllm/offline_inference_vision_language.py b/models/multimodal/vision_language_model/glm-4v/vllm/offline_inference_vision_language.py new file mode 100644 index 0000000000000000000000000000000000000000..0fef7fddedb8a9033b8226df6c61c0e3f7e6bee0 --- /dev/null +++ b/models/multimodal/vision_language_model/glm-4v/vllm/offline_inference_vision_language.py @@ -0,0 +1,147 @@ +#!/bin/bash +# Copyright (c) 2025, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. 
You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +""" +This example shows how to use vLLM for running offline inference +with the correct prompt format on vision language models. + +For most models, the prompt format should follow corresponding examples +on HuggingFace model repository. +""" +import sys +from pathlib import Path +import os +sys.path.append(str(Path(__file__).resolve().parent.parent.parent)) +import argparse +import dataclasses +import inspect +from vllm.assets.image import ImageAsset +from vllm.assets.video import VideoAsset + +from vllm import LLM, EngineArgs, SamplingParams +from utils import sampling_add_cli_args + +# GLM +def run_glm(question,engine_params): + + prompt = f"{question}\n" + llm = LLM(**engine_params) + prompt = f"<|user|>\n<|begin_of_image|><|endoftext|><|end_of_image|>\ + {question}<|assistant|>" + + stop_token_ids = [151329, 151336, 151338] + return llm, prompt, stop_token_ids + + +def get_multi_modal_input(args): + """ + return { + "data": image or video, + "question": question, + } + """ + if args.modality == "image": + # Input image and question + image = ImageAsset("cherry_blossom").pil_image.convert("RGB") + img_question = "What is the content of this image?" + + return { + "data": image, + "question": img_question, + } + + if args.modality == "video": + # Input video and question + video = VideoAsset(name="sample_demo_1.mp4", + num_frames=args.num_frames).np_ndarrays + vid_question = "Why is this video funny?" + + return { + "data": video, + "question": vid_question, + } + + msg = f"Modality {args.modality} is not supported." + raise ValueError(msg) + + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument('--num-prompts', + type=int, + default=1, + help='Number of prompts to run.') + parser.add_argument('--modality', + type=str, + default="image", + help='Modality of the input.') + parser.add_argument('--num-frames', + type=int, + default=16, + help='Number of frames to extract from the video.') + parser = EngineArgs.add_cli_args(parser) + parser = sampling_add_cli_args(parser) + args = parser.parse_args() + engine_args = [attr.name for attr in dataclasses.fields(EngineArgs)] + sampling_args = [ + param.name + for param in list( + inspect.signature(SamplingParams).parameters.values() + ) + ] + engine_params = {attr: getattr(args, attr) for attr in engine_args} + sampling_params = { + attr: getattr(args, attr) for attr in sampling_args if args.__contains__(attr) + } + + modality = args.modality + mm_input = get_multi_modal_input(args) + data = mm_input["data"] + question = mm_input["question"] + + llm, prompt, stop_token_ids = run_glm(question,engine_params) + sampling_params['stop_token_ids'] = stop_token_ids + + # We set temperature to 0.2 so that outputs can be different + # even when all prompts are identical when running batch inference. 
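+    # The sampling settings above were collected from the CLI flags added by
+    # sampling_add_cli_args (e.g. --temperature, --max-tokens) and, together with
+    # the GLM-4V stop token ids, are converted into a vLLM SamplingParams object below.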
+    sampling_params = SamplingParams(**sampling_params)
+
+    assert args.num_prompts > 0
+    if args.num_prompts == 1:
+        # Single inference
+        inputs = {
+            "prompt": prompt,
+            "multi_modal_data": {
+                modality: data
+            },
+        }
+
+    else:
+        # Batch inference
+        inputs = [{
+            "prompt": prompt,
+            "multi_modal_data": {
+                modality: data
+            },
+        } for _ in range(args.num_prompts)]
+
+    outputs = llm.generate(inputs, sampling_params=sampling_params)
+
+    for o in outputs:
+        generated_text = o.outputs[0].text
+        print(generated_text)
\ No newline at end of file
diff --git a/models/multimodal/vision_language_model/llava_next_base/vllm/README.md b/models/multimodal/vision_language_model/llava_next_base/vllm/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..20dd1aec7146d3bfe0b1ddef23053ea3ee723e87
--- /dev/null
+++ b/models/multimodal/vision_language_model/llava_next_base/vllm/README.md
@@ -0,0 +1,37 @@
+# LLaVA-NeXT-based (vLLM)
+
+## Model Description
+
+E5-V is fine-tuned from lmms-lab/llama3-llava-next-8b.
+
+E5-V is a framework that adapts MLLMs to produce multimodal embeddings. It effectively bridges the modality gap between different types of inputs, demonstrating strong performance in multimodal embeddings even without fine-tuning. E5-V also supports a single-modality training approach, in which the model is trained exclusively on text pairs, and this yields better performance than multimodal training.
+
+More details can be found at https://github.com/kongds/E5-V.
+
+## Supported Environments
+
+| GPU | [IXUCA SDK](https://gitee.com/deep-spark/deepspark#%E5%A4%A9%E6%95%B0%E6%99%BA%E7%AE%97%E8%BD%AF%E4%BB%B6%E6%A0%88-ixuca) | Release |
+| :----: | :----: | :----: |
+| MR-V100 | 4.3.0 | 25.09 |
+
+## Model Preparation
+
+### Prepare Resources
+
+- Model:
+
+```bash
+cp -r ../../vllm_public_assets/ ./
+```
+
+### Install Dependencies
+
+In order to run the model smoothly, you need to get the sdk from [resource center](https://support.iluvatar.com/#/ProductLine?id=2) of Iluvatar CoreX official website.
+
+## Model Inference
+
+```bash
+python3 offline_inference_vision_language_embedding.py --model /path/to/e5-v/ --modality "image" --tensor_parallel_size 1 --task "embed" --trust_remote_code --max_model_len 4096
+```
+
+## Model Results
\ No newline at end of file
diff --git a/models/multimodal/vision_language_model/llava_next_base/vllm/ci/prepare.sh b/models/multimodal/vision_language_model/llava_next_base/vllm/ci/prepare.sh
new file mode 100644
index 0000000000000000000000000000000000000000..1ce243cbc5197ba4f8526707e50605e75b46e691
--- /dev/null
+++ b/models/multimodal/vision_language_model/llava_next_base/vllm/ci/prepare.sh
@@ -0,0 +1,19 @@
+#!/bin/bash
+# Copyright (c) 2025, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
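+
+# Copy the shared vLLM demo assets (sample images) into this model directory for CI.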
+ +set -x + +cp -r ../../vllm_public_assets/ ./ \ No newline at end of file diff --git a/models/multimodal/vision_language_model/llava_next_base/vllm/offline_inference_vision_language_embedding.py b/models/multimodal/vision_language_model/llava_next_base/vllm/offline_inference_vision_language_embedding.py new file mode 100644 index 0000000000000000000000000000000000000000..c8e6ae5e481bf83a3e1a3778b60cf11f99130c0f --- /dev/null +++ b/models/multimodal/vision_language_model/llava_next_base/vllm/offline_inference_vision_language_embedding.py @@ -0,0 +1,156 @@ +#!/bin/bash +# Copyright (c) 2025, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +""" +This example shows how to use vLLM for running offline inference with +the correct prompt format on vision language models for multimodal embedding. + +For most models, the prompt format should follow corresponding examples +on HuggingFace model repository. +""" +from argparse import Namespace +from typing import Literal, NamedTuple, Optional, TypedDict, Union, get_args +import io +import base64 +from PIL import Image +from vllm import LLM +from vllm.multimodal.utils import fetch_image +from vllm.utils import FlexibleArgumentParser +from vllm import LLM, EngineArgs +import dataclasses + +class TextQuery(TypedDict): + modality: Literal["text"] + text: str + + +class ImageQuery(TypedDict): + modality: Literal["image"] + image: Image.Image + + +class TextImageQuery(TypedDict): + modality: Literal["text+image"] + text: str + image: Image.Image + + +QueryModality = Literal["text", "image", "text+image"] +Query = Union[TextQuery, ImageQuery, TextImageQuery] + + +class ModelRequestData(NamedTuple): + llm: LLM + prompt: str + image: Optional[Image.Image] + + +def run_e5_v(query: Query, engine_params): + llama3_template = '<|start_header_id|>user<|end_header_id|>\n\n{}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n \n' # noqa: E501 + + if query["modality"] == "text": + text = query["text"] + prompt = llama3_template.format( + f"{text}\nSummary above sentence in one word: ") + image = None + elif query["modality"] == "image": + prompt = llama3_template.format( + "\nSummary above image in one word: ") + image = query["image"] + else: + modality = query['modality'] + raise ValueError(f"Unsupported query modality: '{modality}'") + + llm = LLM(**engine_params) + + return ModelRequestData( + llm=llm, + prompt=prompt, + image=image, + ) + + + +def get_query(modality: QueryModality): + if modality == "text": + return TextQuery(modality="text", text="A dog sitting in the grass") + + + if modality == "image": + image: Image = Image.open("vllm_public_assets/American_Eskimo_Dog.jpg") + image = image.convert("RGB") + image_data = io.BytesIO() + image.save(image_data, format='JPEG') + image_base64 = base64.b64encode(image_data.getvalue()).decode("utf-8") + return ImageQuery( + modality="image", + image= fetch_image(f"data:image/jpeg;base64,{image_base64}" + ), + ) + + if modality == 
"text+image": + image: Image = Image.open("vllm_public_assets/Felis_catus-cat_on_snow.jpg") + image = image.convert("RGB") + image_data = io.BytesIO() + image.save(image_data, format='JPEG') + image_base64 = base64.b64encode(image_data.getvalue()).decode("utf-8") + return TextImageQuery( + modality="text+image", + text="A cat standing in the snow.", + image= fetch_image(f"data:image/jpeg;base64,{image_base64}" + ), + ) + + msg = f"Modality {modality} is not supported." + raise ValueError(msg) + + +def run_encode(engine_params, modality: QueryModality): + query = get_query(modality) + req_data = run_e5_v(query, engine_params) + + mm_data = {} + if req_data.image is not None: + mm_data["image"] = req_data.image + + outputs = req_data.llm.embed({ + "prompt": req_data.prompt, + "multi_modal_data": mm_data, + }) + + for output in outputs: + print(output.outputs.embedding) + if output.outputs.embedding is not None: + print("Offline inference is successful!") + + + +if __name__ == "__main__": + parser = FlexibleArgumentParser( + description='Demo on using vLLM for offline inference with ' + 'vision language models for multimodal embedding') + parser.add_argument('--modality', + type=str, + default="image", + choices=get_args(QueryModality), + help='Modality of the input.') + parser = EngineArgs.add_cli_args(parser) + args = parser.parse_args() + engine_args = [attr.name for attr in dataclasses.fields(EngineArgs)] + engine_params = {attr: getattr(args, attr) for attr in engine_args} + + run_encode(engine_params, args.modality) diff --git a/models/multimodal/vision_language_model/qwen2_5_vl/vllm/README.md b/models/multimodal/vision_language_model/qwen2_5_vl/vllm/README.md new file mode 100644 index 0000000000000000000000000000000000000000..8545cad6614ad37b851e97193a832d0ddd3703b4 --- /dev/null +++ b/models/multimodal/vision_language_model/qwen2_5_vl/vllm/README.md @@ -0,0 +1,39 @@ +# Qwen2.5-VL (vLLM) + +## Model Description + +Qwen2.5-VL is not only proficient in recognizing common objects such as flowers, birds, fish, and insects, but it is highly capable of analyzing texts, charts, icons, graphics, and layouts within images. Qwen2.5-VL directly plays as a visual agent that can reason and dynamically direct tools, which is capable of computer use and phone use. Qwen2.5-VL can comprehend videos of over 1 hour, and this time it has a new ability of cpaturing event by pinpointing the relevant video segments. Qwen2.5-VL can accurately localize objects in an image by generating bounding boxes or points, and it can provide stable JSON outputs for coordinates and attributes. for data like scans of invoices, forms, tables, etc. Qwen2.5-VL supports structured outputs of their contents, benefiting usages in finance, commerce, etc. + +## Supported Environments + +| GPU | [IXUCA SDK](https://gitee.com/deep-spark/deepspark#%E5%A4%A9%E6%95%B0%E6%99%BA%E7%AE%97%E8%BD%AF%E4%BB%B6%E6%A0%88-ixuca) | Release | +| :----: | :----: | :----: | +| MR-V100 | 4.3.0 | 25.09 | + +## Model Preparation + +### Prepare Resources + +- Model: + +```bash +cp -r ../../vllm_public_assets/ ./ +``` + +### Install Dependencies + +In order to run the model smoothly, you need to get the sdk from [resource center](https://support.iluvatar.com/#/ProductLine?id=2) of Iluvatar CoreX official website. 
+ +```bash +pip install transformers==4.50.3 +``` + +## Model Inference + +```bash +export VLLM_ASSETS_CACHE=../vllm/ +export ENABLE_FLASH_ATTENTION_WITH_HEAD_DIM_PADDING=1 +python3 offline_inference_vision_language.py --model /path/to/Qwen2.5-VL-3B-Instruct/ -tp 4 --trust-remote-code --temperature 0.0 --max-token 256 +``` + +## Model Results diff --git a/models/multimodal/vision_language_model/qwen2_5_vl/vllm/ci/prepare.sh b/models/multimodal/vision_language_model/qwen2_5_vl/vllm/ci/prepare.sh new file mode 100644 index 0000000000000000000000000000000000000000..cc6608c240adf4526fc66d01d049232f64da883b --- /dev/null +++ b/models/multimodal/vision_language_model/qwen2_5_vl/vllm/ci/prepare.sh @@ -0,0 +1,20 @@ +#!/bin/bash +# Copyright (c) 2025, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +set -x + +cp -r ../../vllm_public_assets/ ./ +pip install transformers==4.50.3 diff --git a/models/multimodal/vision_language_model/qwen2_5_vl/vllm/offline_inference_vision_language.py b/models/multimodal/vision_language_model/qwen2_5_vl/vllm/offline_inference_vision_language.py new file mode 100644 index 0000000000000000000000000000000000000000..f26930089794c620e0c1a3af100a672bd213a1c2 --- /dev/null +++ b/models/multimodal/vision_language_model/qwen2_5_vl/vllm/offline_inference_vision_language.py @@ -0,0 +1,153 @@ +#!/bin/bash +# Copyright (c) 2025, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +""" +This example shows how to use vLLM for running offline inference +with the correct prompt format on vision language models. + +For most models, the prompt format should follow corresponding examples +on HuggingFace model repository. 
+""" +import sys +from pathlib import Path +import os +sys.path.append(str(Path(__file__).resolve().parent.parent.parent)) +import argparse +import dataclasses +import inspect +from vllm.assets.image import ImageAsset +from vllm.assets.video import VideoAsset +from vllm import LLM, EngineArgs, SamplingParams +from utils import sampling_add_cli_args + + +# Qwen2_5_VL +def run_qwen2_5_vl(question: str,engine_params, modality: str): + + llm = LLM(**engine_params) + + if modality == "image": + placeholder = "<|image_pad|>" + elif modality == "video": + placeholder = "<|video_pad|>" + + prompt = ("<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n" + f"<|im_start|>user\n<|vision_start|>{placeholder}<|vision_end|>" + f"{question}<|im_end|>\n" + "<|im_start|>assistant\n") + stop_token_ids = None + return llm, prompt, stop_token_ids + + +def get_multi_modal_input(args): + """ + return { + "data": image or video, + "question": question, + } + """ + if args.modality == "image": + # Input image and question + image = ImageAsset("cherry_blossom").pil_image.convert("RGB") + img_question = "What is the content of this image?" + + return { + "data": image, + "question": img_question, + } + + if args.modality == "video": + # Input video and question + video = VideoAsset(name="sample_demo_1.mp4", + num_frames=args.num_frames).np_ndarrays + vid_question = "Why is this video funny?" + + return { + "data": video, + "question": vid_question, + } + + msg = f"Modality {args.modality} is not supported." + raise ValueError(msg) + + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument('--num-prompts', + type=int, + default=1, + help='Number of prompts to run.') + parser.add_argument('--modality', + type=str, + default="image", + help='Modality of the input.') + parser.add_argument('--num-frames', + type=int, + default=16, + help='Number of frames to extract from the video.') + parser = EngineArgs.add_cli_args(parser) + parser = sampling_add_cli_args(parser) + args = parser.parse_args() + engine_args = [attr.name for attr in dataclasses.fields(EngineArgs)] + sampling_args = [ + param.name + for param in list( + inspect.signature(SamplingParams).parameters.values() + ) + ] + engine_params = {attr: getattr(args, attr) for attr in engine_args} + sampling_params = { + attr: getattr(args, attr) for attr in sampling_args if args.__contains__(attr) + } + + modality = args.modality + mm_input = get_multi_modal_input(args) + data = mm_input["data"] + question = mm_input["question"] + + llm, prompt, stop_token_ids = run_qwen2_5_vl(question,engine_params,args.modality) + sampling_params['stop_token_ids'] = stop_token_ids + + # We set temperature to 0.2 so that outputs can be different + # even when all prompts are identical when running batch inference. 
+ sampling_params = SamplingParams(**sampling_params) + + assert args.num_prompts > 0 + if args.num_prompts == 1: + # Single inference + inputs = { + "prompt": prompt, + "multi_modal_data": { + modality: data + }, + } + + else: + # Batch inference + inputs = [{ + "prompt": prompt, + "multi_modal_data": { + modality: data + }, + } for _ in range(args.num_prompts)] + + outputs = llm.generate(inputs, sampling_params=sampling_params) + + for o in outputs: + generated_text = o.outputs[0].text + print(generated_text) \ No newline at end of file diff --git a/models/multimodal/vision_language_model/qwen2_vl/vllm/README.md b/models/multimodal/vision_language_model/qwen2_vl/vllm/README.md new file mode 100644 index 0000000000000000000000000000000000000000..40c2f05b14efbb3bc799fadfc71ac2075b332e92 --- /dev/null +++ b/models/multimodal/vision_language_model/qwen2_vl/vllm/README.md @@ -0,0 +1,39 @@ +# Qwen2-VL (vLLM) + +## Model Description + +Qwen2-VL achieves state-of-the-art performance on visual understanding benchmarks, including MathVista, DocVQA, RealWorldQA, MTVQA, etc. And can understand videos over 20 minutes for high-quality video-based question answering, dialog, content creation, etc. With the abilities of complex reasoning and decision making, Qwen2-VL can be integrated with devices like mobile phones, robots, etc., for automatic operation based on visual environment and text instructions. + +## Supported Environments + +| GPU | [IXUCA SDK](https://gitee.com/deep-spark/deepspark#%E5%A4%A9%E6%95%B0%E6%99%BA%E7%AE%97%E8%BD%AF%E4%BB%B6%E6%A0%88-ixuca) | Release | +| :----: | :----: | :----: | +| MR-V100 | 4.3.0 | 25.09 | + +## Model Preparation + +### Prepare Resources + +- Model: + +```bash +cp -r ../../vllm_public_assets/ ./ +``` + +### Install Dependencies + +In order to run the model smoothly, you need to get the sdk from [resource center](https://support.iluvatar.com/#/ProductLine?id=2) of Iluvatar CoreX official website. + +```bash +pip install transformers==4.50.3 +``` + +## Model Inference + +```bash +export VLLM_ASSETS_CACHE=../vllm/ +export ENABLE_FLASH_ATTENTION_WITH_HEAD_DIM_PADDING=1 +python3 offline_inference_vision_language.py --model /path/to/Qwen2-VL-7B-Instruct --max-tokens 256 -tp 4 --trust-remote-code --temperature 0.0 --max-num-seqs 5 +``` + +## Model Results \ No newline at end of file diff --git a/models/multimodal/vision_language_model/qwen2_vl/vllm/ci/prepare.sh b/models/multimodal/vision_language_model/qwen2_vl/vllm/ci/prepare.sh new file mode 100644 index 0000000000000000000000000000000000000000..cc6608c240adf4526fc66d01d049232f64da883b --- /dev/null +++ b/models/multimodal/vision_language_model/qwen2_vl/vllm/ci/prepare.sh @@ -0,0 +1,20 @@ +#!/bin/bash +# Copyright (c) 2025, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
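+
+# Copy the shared vLLM demo assets and install the transformers version used by this example.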
+ +set -x + +cp -r ../../vllm_public_assets/ ./ +pip install transformers==4.50.3 diff --git a/models/multimodal/vision_language_model/qwen2_vl/vllm/offline_inference_vision_language.py b/models/multimodal/vision_language_model/qwen2_vl/vllm/offline_inference_vision_language.py new file mode 100644 index 0000000000000000000000000000000000000000..23b3d8d75c955e6e5a5f81e60d638fa94a884578 --- /dev/null +++ b/models/multimodal/vision_language_model/qwen2_vl/vllm/offline_inference_vision_language.py @@ -0,0 +1,148 @@ +#!/bin/bash +# Copyright (c) 2025, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +""" +This example shows how to use vLLM for running offline inference +with the correct prompt format on vision language models. + +For most models, the prompt format should follow corresponding examples +on HuggingFace model repository. +""" +import sys +from pathlib import Path +import os +sys.path.append(str(Path(__file__).resolve().parent.parent.parent)) +import argparse +import dataclasses +import inspect +from vllm.assets.image import ImageAsset +from vllm.assets.video import VideoAsset +from vllm import LLM, EngineArgs, SamplingParams +from utils import sampling_add_cli_args + +# Qwen2-VL +def run_qwen2_vl(question, engine_params): + llm = LLM(**engine_params) + + prompt = ("<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n" + "<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>" + f"{question}<|im_end|>\n" + "<|im_start|>assistant\n") + stop_token_ids = None + return llm, prompt, stop_token_ids + + + + +def get_multi_modal_input(args): + """ + return { + "data": image or video, + "question": question, + } + """ + if args.modality == "image": + # Input image and question + image = ImageAsset("cherry_blossom").pil_image.convert("RGB") + img_question = "What is the content of this image?" + + return { + "data": image, + "question": img_question, + } + + if args.modality == "video": + # Input video and question + video = VideoAsset(name="sample_demo_1.mp4", + num_frames=args.num_frames).np_ndarrays + vid_question = "Why is this video funny?" + + return { + "data": video, + "question": vid_question, + } + + msg = f"Modality {args.modality} is not supported." 
+ raise ValueError(msg) + + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument('--num-prompts', + type=int, + default=1, + help='Number of prompts to run.') + parser.add_argument('--modality', + type=str, + default="image", + help='Modality of the input.') + parser.add_argument('--num-frames', + type=int, + default=16, + help='Number of frames to extract from the video.') + parser = EngineArgs.add_cli_args(parser) + parser = sampling_add_cli_args(parser) + args = parser.parse_args() + engine_args = [attr.name for attr in dataclasses.fields(EngineArgs)] + sampling_args = [ + param.name + for param in list( + inspect.signature(SamplingParams).parameters.values() + ) + ] + engine_params = {attr: getattr(args, attr) for attr in engine_args} + sampling_params = { + attr: getattr(args, attr) for attr in sampling_args if args.__contains__(attr) + } + + modality = args.modality + mm_input = get_multi_modal_input(args) + data = mm_input["data"] + question = mm_input["question"] + + llm, prompt, stop_token_ids = run_qwen2_vl(question,engine_params) + sampling_params['stop_token_ids'] = stop_token_ids + + # We set temperature to 0.2 so that outputs can be different + # even when all prompts are identical when running batch inference. + sampling_params = SamplingParams(**sampling_params) + + assert args.num_prompts > 0 + if args.num_prompts == 1: + # Single inference + inputs = { + "prompt": prompt, + "multi_modal_data": { + modality: data + }, + } + + else: + # Batch inference + inputs = [{ + "prompt": prompt, + "multi_modal_data": { + modality: data + }, + } for _ in range(args.num_prompts)] + + outputs = llm.generate(inputs, sampling_params=sampling_params) + + for o in outputs: + generated_text = o.outputs[0].text + print(generated_text) \ No newline at end of file diff --git a/models/multimodal/vision_language_model/qwen_vl/vllm/README.md b/models/multimodal/vision_language_model/qwen_vl/vllm/README.md new file mode 100644 index 0000000000000000000000000000000000000000..2418517381aa147442fa8113e37978d511fcf195 --- /dev/null +++ b/models/multimodal/vision_language_model/qwen_vl/vllm/README.md @@ -0,0 +1,38 @@ +# Qwen-VL (vLLM) + +## Model Description + +Qwen-VL (Qwen Large Vision Language Model) is the visual multimodal version of the large model series, Qwen (abbr. Tongyi Qianwen), proposed by Alibaba Cloud. Qwen-VL accepts image, text, and bounding box as inputs, outputs text and bounding box. + +## Supported Environments + +| GPU | [IXUCA SDK](https://gitee.com/deep-spark/deepspark#%E5%A4%A9%E6%95%B0%E6%99%BA%E7%AE%97%E8%BD%AF%E4%BB%B6%E6%A0%88-ixuca) | Release | +| :----: | :----: | :----: | +| MR-V100 | 4.3.0 | 25.09 | + +## Model Preparation + +### Prepare Resources + +- Model: + +```bash +cp -r ../../vllm_public_assets/ ./ +``` + +### Install Dependencies + +In order to run the model smoothly, you need to get the sdk from [resource center](https://support.iluvatar.com/#/ProductLine?id=2) of Iluvatar CoreX official website. 
+ +```bash +pip install matplotlib +``` + +## Model Inference + +```bash +export VLLM_ASSETS_CACHE=../vllm/ +python3 offline_inference_vision_language.py --model /path/to/Qwen-VL-Chat -tp 1 --trust-remote-code --temperature 0.0 --hf-overrides '{"architectures": ["QwenVLForConditionalGeneration"]}' +``` + +## Model Results \ No newline at end of file diff --git a/models/multimodal/vision_language_model/qwen_vl/vllm/ci/prepare.sh b/models/multimodal/vision_language_model/qwen_vl/vllm/ci/prepare.sh new file mode 100644 index 0000000000000000000000000000000000000000..277bc43fa20703b8a6fea8a6effac2cd461b99d0 --- /dev/null +++ b/models/multimodal/vision_language_model/qwen_vl/vllm/ci/prepare.sh @@ -0,0 +1,20 @@ +#!/bin/bash +# Copyright (c) 2025, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +set -x + +cp -r ../../vllm_public_assets/ ./ +pip install matplotlib diff --git a/models/multimodal/vision_language_model/qwen_vl/vllm/offline_inference_vision_language.py b/models/multimodal/vision_language_model/qwen_vl/vllm/offline_inference_vision_language.py new file mode 100644 index 0000000000000000000000000000000000000000..5a8309960ec77ff67a96743d66183b09753e822a --- /dev/null +++ b/models/multimodal/vision_language_model/qwen_vl/vllm/offline_inference_vision_language.py @@ -0,0 +1,144 @@ +#!/bin/bash +# Copyright (c) 2025, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +""" +This example shows how to use vLLM for running offline inference +with the correct prompt format on vision language models. + +For most models, the prompt format should follow corresponding examples +on HuggingFace model repository. 
+""" +import sys +from pathlib import Path +import os +sys.path.append(str(Path(__file__).resolve().parent.parent.parent)) +import argparse +import dataclasses +import inspect +from vllm.assets.image import ImageAsset +from vllm.assets.video import VideoAsset +from vllm import LLM, EngineArgs, SamplingParams +from utils import sampling_add_cli_args + + +# Qwen +def run_qwen_vl(question,engine_params,modality): + assert modality == "image" + llm = LLM(**engine_params) + prompt = f"{question}Picture 1: \n" + stop_token_ids = None + return llm, prompt, stop_token_ids + + +def get_multi_modal_input(args): + """ + return { + "data": image or video, + "question": question, + } + """ + if args.modality == "image": + # Input image and question + image = ImageAsset("cherry_blossom").pil_image.convert("RGB") + img_question = "What is the content of this image?" + + return { + "data": image, + "question": img_question, + } + + if args.modality == "video": + # Input video and question + video = VideoAsset(name="sample_demo_1.mp4", + num_frames=args.num_frames).np_ndarrays + vid_question = "Why is this video funny?" + + return { + "data": video, + "question": vid_question, + } + + msg = f"Modality {args.modality} is not supported." + raise ValueError(msg) + + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument('--num-prompts', + type=int, + default=1, + help='Number of prompts to run.') + parser.add_argument('--modality', + type=str, + default="image", + help='Modality of the input.') + parser.add_argument('--num-frames', + type=int, + default=16, + help='Number of frames to extract from the video.') + parser = EngineArgs.add_cli_args(parser) + parser = sampling_add_cli_args(parser) + args = parser.parse_args() + engine_args = [attr.name for attr in dataclasses.fields(EngineArgs)] + sampling_args = [ + param.name + for param in list( + inspect.signature(SamplingParams).parameters.values() + ) + ] + engine_params = {attr: getattr(args, attr) for attr in engine_args} + sampling_params = { + attr: getattr(args, attr) for attr in sampling_args if args.__contains__(attr) + } + + modality = args.modality + mm_input = get_multi_modal_input(args) + data = mm_input["data"] + question = mm_input["question"] + + llm, prompt, stop_token_ids = run_qwen_vl(question,engine_params,args.modality) + sampling_params['stop_token_ids'] = stop_token_ids + + # We set temperature to 0.2 so that outputs can be different + # even when all prompts are identical when running batch inference. 
+ sampling_params = SamplingParams(**sampling_params) + + assert args.num_prompts > 0 + if args.num_prompts == 1: + # Single inference + inputs = { + "prompt": prompt, + "multi_modal_data": { + modality: data + }, + } + + else: + # Batch inference + inputs = [{ + "prompt": prompt, + "multi_modal_data": { + modality: data + }, + } for _ in range(args.num_prompts)] + + outputs = llm.generate(inputs, sampling_params=sampling_params) + + for o in outputs: + generated_text = o.outputs[0].text + print(generated_text) \ No newline at end of file diff --git a/models/multimodal/vision_language_model/vllm_public_assets/American_Eskimo_Dog.jpg b/models/multimodal/vision_language_model/vllm_public_assets/American_Eskimo_Dog.jpg new file mode 100644 index 0000000000000000000000000000000000000000..20cd46d783de5eaf92fdd0eaf2f4d8f31ad19b2f Binary files /dev/null and b/models/multimodal/vision_language_model/vllm_public_assets/American_Eskimo_Dog.jpg differ diff --git a/models/nlp/llm/internlm3/lmdeploy/README.md b/models/nlp/llm/internlm3/lmdeploy/README.md new file mode 100644 index 0000000000000000000000000000000000000000..75d21385a88eefb4dce9af9c2ffd7d01cca9d5fc --- /dev/null +++ b/models/nlp/llm/internlm3/lmdeploy/README.md @@ -0,0 +1,54 @@ +# InternLM3 (LMDeploy) + +## Model Description + +InternLM3 has open-sourced an 8-billion parameter instruction model, InternLM3-8B-Instruct, designed for general-purpose usage and advanced reasoning. This model has the following characteristics: + +- Enhanced performance at reduced cost: State-of-the-art performance on reasoning and knowledge-intensive tasks surpass models like Llama3.1-8B and Qwen2.5-7B. Remarkably, InternLM3 is trained on only 4 trillion high-quality tokens, saving more than 75% of the training cost compared to other LLMs of similar scale. +- Deep thinking capability: InternLM3 supports both the deep thinking mode for solving complicated reasoning tasks via the long chain-of-thought and the normal response mode for fluent user interactions. + +## Supported Environments + +| GPU | [IXUCA SDK](https://gitee.com/deep-spark/deepspark#%E5%A4%A9%E6%95%B0%E6%99%BA%E7%AE%97%E8%BD%AF%E4%BB%B6%E6%A0%88-ixuca) | Release | +| :----: | :----: | :----: | +| MR-V100 | 4.3.0 | 25.09 | + +## Model Preparation + +### Prepare Resources + +- Model: + +### Install Dependencies + +In order to run the model smoothly, you need to get the sdk from [resource center](https://support.iluvatar.com/#/ProductLine?id=2) of Iluvatar CoreX official website. + +```bash +pip install lmdeploy-0.7.2+corex.4.3.0-py3-none-any.whl +``` + +## Model Inference + +### Offline + +```bash +python3 offline_inference.py --model-path /path/to/internlm3-8b-instruct --tp 1 +``` + +### Server + +```bash +lmdeploy serve api_server /path/to/internlm3-8b-instruct --server-port 23333 + +curl http://{server_ip}:{server_port}/v1/models + + +curl http://{server_ip}:{server_port}/v1/chat/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "/home/data/nlp/internlm3-8b-instruct_awq", + "messages": [{"role": "user", "content": "Hello! How are you?"}] + }' +``` + +## Model Results diff --git a/models/nlp/llm/internlm3/lmdeploy/ci/prepare.sh b/models/nlp/llm/internlm3/lmdeploy/ci/prepare.sh new file mode 100644 index 0000000000000000000000000000000000000000..34aa71cee97e8a388ab721416acfdaf00cd4ec33 --- /dev/null +++ b/models/nlp/llm/internlm3/lmdeploy/ci/prepare.sh @@ -0,0 +1,18 @@ +#!/bin/bash +# Copyright (c) 2025, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +set -x +pip install /mnt/deepspark/data/install/lmdeploy-0.7.2+corex.4.3.0-py3-none-any.whl \ No newline at end of file diff --git a/models/nlp/llm/internlm3/lmdeploy/offline_inference.py b/models/nlp/llm/internlm3/lmdeploy/offline_inference.py new file mode 100644 index 0000000000000000000000000000000000000000..9bebd5f23a1fda5ebeb5d4d8f642ff36b233ac66 --- /dev/null +++ b/models/nlp/llm/internlm3/lmdeploy/offline_inference.py @@ -0,0 +1,51 @@ +# Copyright (c) 2025, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse +from lmdeploy import pipeline, GenerationConfig, PytorchEngineConfig + +def main(args): + model_path = args.model_path + max_new_tokens = args.max_tokens + + backend_config = PytorchEngineConfig(session_len=2048, tp=args.tp) + gen_config = GenerationConfig(top_p=0.8, + top_k=40, + temperature=0.8, + max_new_tokens=max_new_tokens) + + pipe = pipeline(model_path, + backend_config=backend_config) + prompts = [[{ + 'role': 'user', + 'content': '请介绍一下你自己' + }], [{ + 'role': 'user', + 'content': '请介绍一下上海' + }]] + response = pipe(prompts, gen_config=gen_config) + print(response) + if response is not None: + print("Offline inference is successful!") + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--max-tokens", type=int, default=1024) + parser.add_argument("--model-path", type=str, default=None) + parser.add_argument("--tp", type=int, default=1) + + args = parser.parse_args() + main(args) + diff --git a/models/speech/asr/utils.py b/models/speech/asr/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..edc7691b64cfff5d3eb56b3f65737f164a1c1696 --- /dev/null +++ b/models/speech/asr/utils.py @@ -0,0 +1,226 @@ +#!/bin/bash +# Copyright (c) 2025, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
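+
+# Shared command-line helpers for the vLLM speech/ASR examples: sampling_add_cli_args
+# mirrors vLLM's SamplingParams fields as argparse flags, and load_chat_template loads
+# an optional chat template into the tokenizer.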
+ +import argparse +import codecs +import logging + +""" +The following arguments can not be add in args... +early_stopping: Union[bool, str] = False, +early_stopping: Controls the stopping condition for beam search. It + accepts the following values: `True`, where the generation stops as + soon as there are `best_of` complete candidates; `False`, where an + heuristic is applied and the generation stops when is it very + unlikely to find better candidates; `"never"`, where the beam search + procedure only stops when there cannot be better candidates + (canonical beam search algorithm). +stop: Optional[Union[str, List[str]]] = None, +stop_token_ids: Optional[List[int]] = None, +logits_processors: Optional[List[LogitsProcessor]] = None, +logits_processors: List of functions that modify logits based on + previously generated tokens, and optionally prompt tokens as + a first argument. +truncate_prompt_tokens: Optional[Annotated[int, Field(ge=1)]] = None, +truncate_prompt_tokens: If set to an integer k, will use only the last k + tokens from the prompt (i.e., left truncation). Defaults to None + (i.e., no truncation). + """ + + +def sampling_add_cli_args(args: argparse.ArgumentParser) -> argparse.ArgumentParser: + args.add_argument( + "--n", + type=int, + default=1, + help="Number of output sequences to return for the given prompt.", + ) + args.add_argument( + "--best-of", + type=int, + default=None, + help="Number of output sequences that are generated from the prompt. " + "From these `best_of` sequences, the top `n` sequences are returned. " + "`best_of` must be greater than or equal to `n`. This is treated as " + "the beam width when `use_beam_search` is True. By default, `best_of`" + "is set to `n`.", + ) + args.add_argument( + "--presence-penalty", + type=float, + default=0.0, + help="Float that penalizes new tokens based on whether they " + "appear in the generated text so far. Values > 0 encourage the model " + "to use new tokens, while values < 0 encourage the model to repeat " + "tokens.", + ) + args.add_argument( + "--frequency-penalty", + type=float, + default=0.0, + help="Float that penalizes new tokens based on their " + " frequency in the generated text so far. Values > 0 encourage the " + " model to use new tokens, while values < 0 encourage the model to " + "repeat tokens.", + ) + args.add_argument( + "--repetition-penalty", + type=float, + default=1.0, + help="Float that penalizes new tokens based on whether " + "they appear in the prompt and the generated text so far. Values > 1 " + "encourage the model to use new tokens, while values < 1 encourage " + "the model to repeat tokens.", + ) + args.add_argument( + "--temperature", + type=float, + default=1.0, + help="Float that controls the randomness of the sampling. Lower " + "values make the model more deterministic, while higher values make " + "the model more random. Zero means greedy sampling.", + ) + args.add_argument( + "--top-p", + type=float, + default=1.0, + help="Float that controls the cumulative probability of the top tokens " + "to consider. Must be in (0, 1]. Set to 1 to consider all tokens.", + ) + args.add_argument( + "--top-k", + type=int, + default=-1, + help="Integer that controls the number of top tokens to consider. Set " + "to -1 to consider all tokens.", + ) + args.add_argument( + "--min-p", + type=float, + default=0.0, + help="Float that represents the minimum probability for a token to be " + "considered, relative to the probability of the most likely token. " + "Must be in [0, 1]. 
Set to 0 to disable this.", + ) + args.add_argument( + "--use-beam-search", + default=False, + action="store_true", + help="Whether to use beam search instead of sampling.", + ) + args.add_argument( + "--length-penalty", + type=float, + default=1.0, + help="Float that penalizes sequences based on their length. Used in beam search.", + ) + args.add_argument( + "--stop", + type=str, + default=None, + help="List of strings that stop the generation when they are generated. " + "The returned output will not contain the stop strings.", + ) + args.add_argument( + "--stop-token-ids", + type=int, + default=None, + help="List of tokens that stop the generation when they are " + "generated. The returned output will contain the stop tokens unless " + "the stop tokens are special tokens.", + ) + args.add_argument( + "--include-stop-str-in-output", + default=False, + action="store_true", + help="Whether to include the stop strings in output text. Defaults to False.", + ) + args.add_argument( + "--ignore-eos", + default=False, + action="store_true", + help="Whether to ignore the EOS token and continue generating tokens after the EOS token is generated.", + ) + args.add_argument( + "--max-tokens", + type=int, + default=16, + help="Maximum number of tokens to generate per output sequence.", + ) + args.add_argument( + "--min-tokens", + type=int, + default=0, + help="Minimum number of tokens to generate per output sequence " + "before EOS or stop_token_ids can be generated", + ) + args.add_argument( + "--logprobs", + type=int, + default=None, + help="NNumber of log probabilities to return per output token. " + "Note that the implementation follows the OpenAI API: The return " + "result includes the log probabilities on the `logprobs` most likely " + "tokens, as well the chosen tokens. The API will always return the " + "log probability of the sampled token, so there may be up to " + "`logprobs+1` elements in the response.", + ) + args.add_argument( + "--prompt-logprobs", + type=int, + default=None, + help="Number of log probabilities to return per prompt token.", + ) + args.add_argument( + "--detokenize", + type=bool, + default=True, + help="Whether to detokenize the output. Defaults to True.", + ) + args.add_argument( + "--skip-special-tokens", + default=True, + action="store_false", + help="Whether to skip special tokens in the output.", + ) + args.add_argument( + "--spaces-between-special-tokens", + default=True, + action="store_false", + help="Whether to add spaces between special tokens in the output. Defaults to True.", + ) + return args + + +def load_chat_template(tokenizer, chat_template): + if chat_template is not None: + try: + with open(chat_template, "r") as f: + tokenizer.chat_template = f.read() + except OSError: + # If opening a file fails, set chat template to be args to + # ensure we decode so our escape are interpreted correctly + tokenizer.chat_template = codecs.decode(chat_template, "unicode_escape") + + logging.info(f"Using supplied chat template:\n{tokenizer.chat_template}") + elif tokenizer.chat_template is not None: + logging.info( + f"Using default chat template:\n{tokenizer.chat_template}. This May lead to unsatisfactory results. You can provide a template.jinja file for vllm." + ) + else: + logging.warning( + "No chat template provided. Chat API will not work. This May lead to unsatisfactory results. You can provide a template.jinja file for vllm." 
+ ) diff --git a/models/speech/asr/whisper/vllm/README.md b/models/speech/asr/whisper/vllm/README.md new file mode 100644 index 0000000000000000000000000000000000000000..719d22932bf4bf2e0fab750b3d66b0ec3e904286 --- /dev/null +++ b/models/speech/asr/whisper/vllm/README.md @@ -0,0 +1,37 @@ +# Whisper (vLLM) + +## Model Description + +Whisper is a state-of-the-art model for automatic speech recognition (ASR) and speech translation, proposed in the paper Robust Speech Recognition via Large-Scale Weak Supervision by Alec Radford et al. from OpenAI. Trained on >5M hours of labeled data, Whisper demonstrates a strong ability to generalise to many datasets and domains in a zero-shot setting. + +Whisper large-v3-turbo is a finetuned version of a pruned Whisper large-v3. In other words, it's the exact same model, except that the number of decoding layers have reduced from 32 to 4. As a result, the model is way faster, at the expense of a minor quality degradation. You can find more details about it in this GitHub discussion. + +## Supported Environments + +| GPU | [IXUCA SDK](https://gitee.com/deep-spark/deepspark#%E5%A4%A9%E6%95%B0%E6%99%BA%E7%AE%97%E8%BD%AF%E4%BB%B6%E6%A0%88-ixuca) | Release | +| :----: | :----: | :----: | +| MR-V100 | 4.3.0 | 25.09 | + +## Model Preparation + +### Prepare Resources + +- Model: + +### Install Dependencies + +In order to run the model smoothly, you need to get the sdk from [resource center](https://support.iluvatar.com/#/ProductLine?id=2) of Iluvatar CoreX official website. + +```bash +pip3 install transformers==4.50.3 +pip3 install librosa +``` + +## Model Inference + +```bash +export VLLM_ASSETS_CACHE=../vllm/ +python3 offline_inference_audio_language.py --model /path/to/whisper-large-v3-turbo/ -tp 1 --temperature 0.0 --model-name openai/whisper-large-v3-turbo --max-tokens 200 +``` + +## Model Results diff --git a/models/speech/asr/whisper/vllm/ci/prepare.sh b/models/speech/asr/whisper/vllm/ci/prepare.sh new file mode 100644 index 0000000000000000000000000000000000000000..a7e2b7e81929af29a1557a9a6fe607c5ab7a5742 --- /dev/null +++ b/models/speech/asr/whisper/vllm/ci/prepare.sh @@ -0,0 +1,20 @@ +#!/bin/bash +# Copyright (c) 2025, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +set -x + +pip3 install transformers==4.50.3 +pip3 install librosa \ No newline at end of file diff --git a/models/speech/asr/whisper/vllm/offline_inference_audio_language.py b/models/speech/asr/whisper/vllm/offline_inference_audio_language.py new file mode 100644 index 0000000000000000000000000000000000000000..19efa152c0b821cc710dff421018eb05c273cecd --- /dev/null +++ b/models/speech/asr/whisper/vllm/offline_inference_audio_language.py @@ -0,0 +1,151 @@ +#!/bin/bash +# Copyright (c) 2025, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. 
You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# SPDX-License-Identifier: Apache-2.0 +"""Compare the outputs of HF and vLLM for Whisper models using greedy sampling. + +Run `pytest tests/models/encoder_decoder/audio/test_whisper.py`. +""" +from typing import Optional + +import argparse +import sys +from pathlib import Path +sys.path.append(str(Path(__file__).resolve().parent.parent.parent)) +import argparse +import dataclasses +import inspect +from vllm import LLM, EngineArgs, SamplingParams +from utils import sampling_add_cli_args +from vllm.assets.audio import AudioAsset + +PROMPTS = [ + { + "prompt": + "<|startoftranscript|><|en|><|transcribe|><|notimestamps|>", + "multi_modal_data": { + "audio": AudioAsset("mary_had_lamb").audio_and_sample_rate, + }, + }, + { # Test explicit encoder/decoder prompt + "encoder_prompt": { + "prompt": "", + "multi_modal_data": { + "audio": AudioAsset("winning_call").audio_and_sample_rate, + }, + }, + "decoder_prompt": + "<|startoftranscript|><|en|><|transcribe|><|notimestamps|>", + } +] + +EXPECTED = { + "openai/whisper-tiny": [ + " He has birth words I spoke in the original corner of that. And a" + " little piece of black coat poetry. Mary had a little sandwich," + " sweet, with white and snow. And everyone had it very went the last" + " would sure to go.", + " >> And the old one, fit John the way to Edgar Martinez. >> One more" + " to line down the field line for our base camp. Here comes joy. Here" + " is June and the third base. They're going to wave him in. The throw" + " to the plate will be late. The Mariners are going to play for the" + " American League Championship. I don't believe it. It just continues" + " by all five." + ], + "openai/whisper-small": [ + " The first words I spoke in the original pornograph. A little piece" + " of practical poetry. Mary had a little lamb, its fleece was quite a" + " slow, and everywhere that Mary went the lamb was sure to go.", + " And the old one pitch on the way to Edgar Martinez one month. Here" + " comes joy. Here is Junior to third base. They're gonna wave him" + " in. The throw to the plate will be late. The Mariners are going to" + " play for the American League Championship. I don't believe it. It" + " just continues. My, oh my." + ], + "openai/whisper-medium": [ + " The first words I spoke in the original phonograph, a little piece" + " of practical poetry. Mary had a little lamb, its fleece was quite as" + " slow, and everywhere that Mary went the lamb was sure to go.", + " And the 0-1 pitch on the way to Edgar Martinez swung on the line" + " down the left field line for Obeyshev. Here comes Joy. Here is" + " Jorgen at third base. They're going to wave him in. The throw to the" + " plate will be late. The Mariners are going to play for the American" + " League Championship. I don't believe it. It just continues. My, oh" + " my." + ], + "openai/whisper-large-v3": [ + " The first words I spoke in the original phonograph, a little piece" + " of practical poetry. Mary had a little lamb, its feet were quite as" + " slow, and everywhere that Mary went, the lamb was sure to go.", + " And the 0-1 pitch on the way to Edgar Martinez. 
Swung on the line." + " Now the left field line for a base hit. Here comes Joy. Here is" + " Junior to third base. They're going to wave him in. The throw to the" + " plate will be late. The Mariners are going to play for the American" + " League Championship. I don't believe it. It just continues. My, oh," + " my." + ], + "openai/whisper-large-v3-turbo": [ + " The first words I spoke in the original phonograph, a little piece" + " of practical poetry. Mary had a little lamb, its streets were quite" + " as slow, and everywhere that Mary went the lamb was sure to go.", + " And the 0-1 pitch on the way to Edgar Martinez. Swung on the line" + " down the left field line for a base hit. Here comes Joy. Here is" + " Junior to third base. They're going to wave him in. The throw to the" + " plate will be late. The Mariners are going to play for the American" + " League Championship. I don't believe it. It just continues. My, oh," + " my." + ] +} + + +def run_whisper(engine_params,sampling_param,model_name) -> None: + # import pdb + # pdb.set_trace() + prompt_list = PROMPTS * 10 + expected_list = EXPECTED[model_name] * 10 + + llm = LLM(**engine_params) + sampling_params = SamplingParams(**sampling_param) + + outputs = llm.generate(prompt_list, sampling_params) + + for output, expected in zip(outputs, expected_list): + print(output.outputs[0].text) + # assert output.outputs[0].text == expected + + +if __name__ == "__main__": + + parser = argparse.ArgumentParser() + parser.add_argument('--model-name', + type=str, + help='Model name') + parser = EngineArgs.add_cli_args(parser) + parser = sampling_add_cli_args(parser) + args = parser.parse_args() + engine_args = [attr.name for attr in dataclasses.fields(EngineArgs)] + sampling_args = [ + param.name + for param in list( + inspect.signature(SamplingParams).parameters.values() + ) + ] + engine_params = {attr: getattr(args, attr) for attr in engine_args} + sampling_params = { + attr: getattr(args, attr) for attr in sampling_args if args.__contains__(attr) + } + run_whisper(engine_params,sampling_params, args.model_name) + diff --git a/tests/model_info.json b/tests/model_info.json index 58adf01b0326d3a63a27d3bf3b1d31ebde2b15d3..2f21403ec4df87d9f392701200dee82bd7b796cc 100644 --- a/tests/model_info.json +++ b/tests/model_info.json @@ -193,7 +193,7 @@ "datasets": "https://www.image-net.org/download.php", "download_url": "https://huggingface.co/openai/clip-vit-base-patch32", "need_third_part": "", - "precisions": [ + "precisions": [ "fp16" ], "type": "inference", @@ -7172,6 +7172,237 @@ "type": "inference", "hasDemo": false, "demoType": "" + }, + { + "display_name": "GLM-4v", + "model_name": "glm-4v", + "framework": "vllm", + "release_version": "25.09", + "release_sdk": "4.3.0", + "release_gpgpu": "MR-V100", + "latest_sdk": "", + "latest_gpgpu": "", + "category": "multimodal/vision_language_model", + "toolbox": "", + "mdims": "", + "dataset": "", + "license": "", + "model_path": "models/multimodal/vision_language_model/glm-4v/vllm", + "readme_file": "models/multimodal/vision_language_model/glm-4v/vllm/README.md", + "bitbucket_repo": "", + "bitbucket_branch": "", + "bitbucket_path": "", + "develop_owner": "", + "github_repo": "", + "github_branch": "", + "github_path": "", + "datasets": "", + "download_url": "https://huggingface.co/THUDM/glm-4v-9b", + "need_third_part": false, + "precisions": [ + "fp16" + ], + "type": "inference", + "hasDemo": false, + "demoType": "" + }, + { + "display_name": "LLaVA Next Base", + "model_name": "llava_next_base", + "framework": 
"vllm", + "release_version": "25.09", + "release_sdk": "4.3.0", + "release_gpgpu": "MR-V100", + "latest_sdk": "", + "latest_gpgpu": "", + "category": "multimodal/vision_language_model", + "toolbox": "", + "mdims": "", + "dataset": "", + "license": "", + "model_path": "models/multimodal/vision_language_model/llava_next_base/vllm", + "readme_file": "models/multimodal/vision_language_model/llava_next_base/vllm/README.md", + "bitbucket_repo": "", + "bitbucket_branch": "", + "bitbucket_path": "", + "develop_owner": "", + "github_repo": "", + "github_branch": "", + "github_path": "", + "datasets": "", + "download_url": "https://huggingface.co/royokong/e5-v", + "need_third_part": false, + "precisions": [ + "fp16" + ], + "type": "inference", + "hasDemo": false, + "demoType": "" + }, + { + "display_name": "Qwen2.5-VL", + "model_name": "qwen2_5_vl", + "framework": "vllm", + "release_version": "25.09", + "release_sdk": "4.3.0", + "release_gpgpu": "MR-V100", + "latest_sdk": "", + "latest_gpgpu": "", + "category": "multimodal/vision_language_model", + "toolbox": "", + "mdims": "", + "dataset": "", + "license": "", + "model_path": "models/multimodal/vision_language_model/qwen2_5_vl/vllm", + "readme_file": "models/multimodal/vision_language_model/qwen2_5_vl/vllm/README.md", + "bitbucket_repo": "", + "bitbucket_branch": "", + "bitbucket_path": "", + "develop_owner": "", + "github_repo": "", + "github_branch": "", + "github_path": "", + "datasets": "", + "download_url": "https://huggingface.co/Qwen/Qwen2.5-VL-3B-Instruct", + "need_third_part": false, + "precisions": [ + "fp16" + ], + "type": "inference", + "hasDemo": false, + "demoType": "" + }, + { + "display_name": "Qwen2-VL", + "model_name": "qwen2_vl", + "framework": "vllm", + "release_version": "25.09", + "release_sdk": "4.3.0", + "release_gpgpu": "MR-V100", + "latest_sdk": "", + "latest_gpgpu": "", + "category": "multimodal/vision_language_model", + "toolbox": "", + "mdims": "", + "dataset": "", + "license": "", + "model_path": "models/multimodal/vision_language_model/qwen2_vl/vllm", + "readme_file": "models/multimodal/vision_language_model/qwen2_vl/vllm/README.md", + "bitbucket_repo": "", + "bitbucket_branch": "", + "bitbucket_path": "", + "develop_owner": "", + "github_repo": "", + "github_branch": "", + "github_path": "", + "datasets": "", + "download_url": "https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct", + "need_third_part": false, + "precisions": [ + "fp16" + ], + "type": "inference", + "hasDemo": false, + "demoType": "" + }, + { + "display_name": "Qwen-VL", + "model_name": "qwen_vl", + "framework": "vllm", + "release_version": "25.09", + "release_sdk": "4.3.0", + "release_gpgpu": "MR-V100", + "latest_sdk": "", + "latest_gpgpu": "", + "category": "multimodal/vision_language_model", + "toolbox": "", + "mdims": "", + "dataset": "", + "license": "", + "model_path": "models/multimodal/vision_language_model/qwen_vl/vllm", + "readme_file": "models/multimodal/vision_language_model/qwen_vl/vllm/README.md", + "bitbucket_repo": "", + "bitbucket_branch": "", + "bitbucket_path": "", + "develop_owner": "", + "github_repo": "", + "github_branch": "", + "github_path": "", + "datasets": "", + "download_url": "https://huggingface.co/Qwen/Qwen-VL-Chat", + "need_third_part": false, + "precisions": [ + "fp16" + ], + "type": "inference", + "hasDemo": false, + "demoType": "" + }, + { + "display_name": "InternLM3", + "model_name": "internlm3", + "framework": "lmdeploy", + "release_version": "25.09", + "release_sdk": "4.3.0", + "release_gpgpu": "MR-V100", + 
"latest_sdk": "", + "latest_gpgpu": "", + "category": "nlp/llm", + "toolbox": "", + "mdims": "", + "dataset": "", + "license": "", + "model_path": "models/nlp/llm/internlm3/lmdeploy", + "readme_file": "models/nlp/llm/internlm3/lmdeploy/README.md", + "bitbucket_repo": "", + "bitbucket_branch": "", + "bitbucket_path": "", + "develop_owner": "", + "github_repo": "", + "github_branch": "", + "github_path": "", + "datasets": "", + "download_url": "https://huggingface.co/internlm/internlm3-8b-instruct", + "need_third_part": false, + "precisions": [ + "fp16" + ], + "type": "inference", + "hasDemo": false, + "demoType": "" + }, + { + "display_name": "Whisper Large V3 Turbo", + "model_name": "whisper", + "framework": "vllm", + "release_version": "25.09", + "release_sdk": "4.3.0", + "release_gpgpu": "MR-V100", + "latest_sdk": "", + "latest_gpgpu": "", + "category": "speech/asr", + "toolbox": "", + "mdims": "", + "dataset": "", + "license": "", + "model_path": "models/speech/asr/whisper/vllm", + "readme_file": "models/speech/asr/whisper/vllm/README.md", + "bitbucket_repo": "", + "bitbucket_branch": "", + "bitbucket_path": "", + "develop_owner": "", + "github_repo": "", + "github_branch": "", + "github_path": "", + "datasets": "", + "download_url": "https://huggingface.co/openai/whisper-large-v3-turbo", + "need_third_part": false, + "precisions": [ + "fp16" + ], + "type": "inference", + "hasDemo": false, + "demoType": "" } ] } \ No newline at end of file diff --git a/tests/run_vllm.py b/tests/run_vllm.py index be795462d5917eb7c8f7a6dca34cdb70f75946d2..b6c5f774b6ba19b867834412c837302c838f30de 100644 --- a/tests/run_vllm.py +++ b/tests/run_vllm.py @@ -56,7 +56,7 @@ def main(): result = {} # NLP模型 - if model["category"] in ["nlp/llm", "multimodal/vision_language_model"]: + if model["category"] in ["nlp/llm", "multimodal/vision_language_model", "speech/asr"]: logging.info(f"Start running {model['model_name']} test case:\n{json.dumps(model, indent=4)}") d_url = model["download_url"] if d_url is not None: @@ -72,7 +72,7 @@ def get_model_config(mode_name): models = json.load(file) for model in models['models']: - if model["model_name"] == mode_name.lower() and model["framework"] == "vllm": + if model["model_name"] == mode_name.lower() and (model["framework"] == "vllm" or model["framework"] == "lmdeploy"): return model return @@ -284,13 +284,63 @@ def run_nlp_testcase(model): export VLLM_ASSETS_CACHE=../vllm/ python3 offline_inference_vision_language.py --model ./{model_name} --max-tokens 256 -tp 2 --temperature 0.0 --max-model-len 2048 """ + elif model_name == "whisper": + script = f""" + set -x + cd ../{model['model_path']} + export VLLM_ASSETS_CACHE=../vllm/ + python3 offline_inference_audio_language.py --model ./{model_name} -tp 1 --temperature 0.0 --model-name openai/whisper-large-v3-turbo --max-tokens 200 + """ + elif model_name == "qwen_vl": + script = f""" + set -x + cd ../{model['model_path']} + export VLLM_ASSETS_CACHE=../vllm/ + python3 offline_inference_vision_language.py --model ./{model_name} -tp 1 --trust-remote-code --temperature 0.0 --hf-overrides '{{"architectures": ["QwenVLForConditionalGeneration"]}}' + """ + elif model_name == "qwen2_vl": + script = f""" + set -x + cd ../{model['model_path']} + export VLLM_ASSETS_CACHE=../vllm/ + export ENABLE_FLASH_ATTENTION_WITH_HEAD_DIM_PADDING=1 + python3 offline_inference_vision_language.py --model ./{model_name} --max-tokens 256 -tp 4 --trust-remote-code --temperature 0.0 --max-num-seqs 5 + """ + elif model_name == "qwen2_5_vl": + script = f""" + 
set -x + cd ../{model['model_path']} + export VLLM_ASSETS_CACHE=../vllm/ + export ENABLE_FLASH_ATTENTION_WITH_HEAD_DIM_PADDING=1 + python3 offline_inference_vision_language.py --model ./{model_name} -tp 4 --trust-remote-code --temperature 0.0 --max-tokens 256 + """ + elif model_name == "llava_next_base": + script = f""" + set -x + cd ../{model['model_path']} + python3 offline_inference_vision_language_embedding.py --model ./{model_name} --modality "image" --tensor_parallel_size 1 --task "embed" --trust_remote_code --max_model_len 4096 + """ + elif model_name == "glm-4v": + script = f""" + set -x + cd ../{model['model_path']} + export VLLM_ASSETS_CACHE=../vllm/ + python3 offline_inference_vision_language.py --model ./{model_name} --max-tokens 256 -tp 4 --trust-remote-code --temperature 0.0 --hf-overrides '{{"architectures": ["GLM4VForCausalLM"]}}' + """ + elif model_name == "internlm3": + # The lmdeploy pipeline requires the model path to be a Hugging Face model id, + # such as "internlm/internlm-chat-7b", "Qwen/Qwen-7B-Chat", "baichuan-inc/Baichuan2-7B-Chat", and so on. + script = f""" + set -x + cd ../{model['model_path']} + python3 offline_inference.py --model-path /mnt/deepspark/data/checkpoints/{checkpoint_n} --tp 1 + """ r, t = run_script(script) sout = r.stdout pattern = r"tokens: (\d+), QPS: ([\d.]+)" matchs = re.search(pattern, sout) result["result"].setdefault(prec, {"status": "FAIL"}) - logging.debug(f"matchs:\n{matchs}") if matchs: result["result"][prec]["tokens"] = int(matchs.group(1)) result["result"][prec]["QPS"] = float(matchs.group(2)) @@ -303,6 +353,11 @@ def run_nlp_testcase(model): result["result"][prec]["QPS"] = float(matchs.group(2)) result["result"][prec]["status"] = "PASS" + if not matchs: + pattern = r"Offline inference is successful!" + matchs = re.search(pattern, sout) + if matchs: + result["result"][prec]["status"] = "PASS" result["result"][prec]["Cost time (s)"] = t return result
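For reference, the result-parsing change to tests/run_vllm.py above can be read as: a run passes if the captured log contains a `tokens: <n>, QPS: <x>` summary, and otherwise falls back to the `Offline inference is successful!` marker for scripts that do not report a QPS summary. The snippet below is a minimal, self-contained sketch of that logic under those assumptions; the helper name `parse_result` and the sample log strings are illustrative only and are not part of the repository.

```python
import re

def parse_result(sout: str) -> dict:
    # Minimal sketch of the PASS/FAIL decision used in tests/run_vllm.py.
    result = {"status": "FAIL"}
    matchs = re.search(r"tokens: (\d+), QPS: ([\d.]+)", sout)
    if matchs:
        # Primary path: the offline script printed a tokens/QPS summary line.
        result["tokens"] = int(matchs.group(1))
        result["QPS"] = float(matchs.group(2))
        result["status"] = "PASS"
    elif re.search(r"Offline inference is successful!", sout):
        # Fallback path: scripts that only print a success marker, without QPS.
        result["status"] = "PASS"
    return result

# Illustrative log fragments (not real output):
print(parse_result("tokens: 128, QPS: 42.5"))            # {'status': 'PASS', 'tokens': 128, 'QPS': 42.5}
print(parse_result("Offline inference is successful!"))  # {'status': 'PASS'}
```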