diff --git a/models/multimodal/vision_language_model/glm-4v/vllm/README.md b/models/multimodal/vision_language_model/glm-4v/vllm/README.md new file mode 100644 index 0000000000000000000000000000000000000000..b450696f2b4bf40f4961396d4042b6ce3bea6ee6 --- /dev/null +++ b/models/multimodal/vision_language_model/glm-4v/vllm/README.md @@ -0,0 +1,34 @@ +# GLM-4v (vLLM) + +## Model Description + +GLM-4V-9B is the open-source multimodal version of Zhipu AI's latest generation pre-trained model GLM-4 series. GLM-4V-9B demonstrates exceptional performance in various multimodal evaluations, including bilingual (Chinese and English) multi-turn conversations at a high resolution of 1120 * 1120, comprehensive Chinese-English capabilities, perception reasoning, text recognition, and chart understanding. It surpasses GPT-4-turbo-2024-04-09, Gemini 1.0 Pro, Qwen-VL-Max, and Claude 3 Opus in these aspects. + +## Supported Environments + +| GPU | [IXUCA SDK](https://gitee.com/deep-spark/deepspark#%E5%A4%A9%E6%95%B0%E6%99%BA%E7%AE%97%E8%BD%AF%E4%BB%B6%E6%A0%88-ixuca) | Release | +| :----: | :----: | :----: | +| MR-V100 | 4.3.0 | 25.09 | + +## Model Preparation + +### Prepare Resources + +- Model: + +```bash +cp -r ../../vllm_public_assets/ ./ +``` + +### Install Dependencies + +In order to run the model smoothly, you need to get the sdk from [resource center](https://support.iluvatar.com/#/ProductLine?id=2) of Iluvatar CoreX official website. + +## Model Inference + +```bash +export VLLM_ASSETS_CACHE=../vllm/ +python3 offline_inference_vision_language.py --model /path/to/glm-4v-9b --max-tokens 256 -tp 4 --trust-remote-code --temperature 0.0 --hf-overrides '{"architectures": ["GLM4VForCausalLM"]}' +``` + +## Model Results \ No newline at end of file diff --git a/models/multimodal/vision_language_model/glm-4v/vllm/ci/prepare.sh b/models/multimodal/vision_language_model/glm-4v/vllm/ci/prepare.sh new file mode 100644 index 0000000000000000000000000000000000000000..b54c8d39af69827a8993f582b3029fd463c58c0b --- /dev/null +++ b/models/multimodal/vision_language_model/glm-4v/vllm/ci/prepare.sh @@ -0,0 +1,19 @@ +#!/bin/bash +# Copyright (c) 2025, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +set -x + +cp -r ../../vllm_public_assets/ ./ diff --git a/models/multimodal/vision_language_model/glm-4v/vllm/offline_inference_vision_language.py b/models/multimodal/vision_language_model/glm-4v/vllm/offline_inference_vision_language.py new file mode 100644 index 0000000000000000000000000000000000000000..0fef7fddedb8a9033b8226df6c61c0e3f7e6bee0 --- /dev/null +++ b/models/multimodal/vision_language_model/glm-4v/vllm/offline_inference_vision_language.py @@ -0,0 +1,147 @@ +#!/bin/bash +# Copyright (c) 2025, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. 
You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +""" +This example shows how to use vLLM for running offline inference +with the correct prompt format on vision language models. + +For most models, the prompt format should follow corresponding examples +on HuggingFace model repository. +""" +import sys +from pathlib import Path +import os +sys.path.append(str(Path(__file__).resolve().parent.parent.parent)) +import argparse +import dataclasses +import inspect +from vllm.assets.image import ImageAsset +from vllm.assets.video import VideoAsset + +from vllm import LLM, EngineArgs, SamplingParams +from utils import sampling_add_cli_args + +# GLM +def run_glm(question,engine_params): + + prompt = f"{question}\n" + llm = LLM(**engine_params) + prompt = f"<|user|>\n<|begin_of_image|><|endoftext|><|end_of_image|>\ + {question}<|assistant|>" + + stop_token_ids = [151329, 151336, 151338] + return llm, prompt, stop_token_ids + + +def get_multi_modal_input(args): + """ + return { + "data": image or video, + "question": question, + } + """ + if args.modality == "image": + # Input image and question + image = ImageAsset("cherry_blossom").pil_image.convert("RGB") + img_question = "What is the content of this image?" + + return { + "data": image, + "question": img_question, + } + + if args.modality == "video": + # Input video and question + video = VideoAsset(name="sample_demo_1.mp4", + num_frames=args.num_frames).np_ndarrays + vid_question = "Why is this video funny?" + + return { + "data": video, + "question": vid_question, + } + + msg = f"Modality {args.modality} is not supported." + raise ValueError(msg) + + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument('--num-prompts', + type=int, + default=1, + help='Number of prompts to run.') + parser.add_argument('--modality', + type=str, + default="image", + help='Modality of the input.') + parser.add_argument('--num-frames', + type=int, + default=16, + help='Number of frames to extract from the video.') + parser = EngineArgs.add_cli_args(parser) + parser = sampling_add_cli_args(parser) + args = parser.parse_args() + engine_args = [attr.name for attr in dataclasses.fields(EngineArgs)] + sampling_args = [ + param.name + for param in list( + inspect.signature(SamplingParams).parameters.values() + ) + ] + engine_params = {attr: getattr(args, attr) for attr in engine_args} + sampling_params = { + attr: getattr(args, attr) for attr in sampling_args if args.__contains__(attr) + } + + modality = args.modality + mm_input = get_multi_modal_input(args) + data = mm_input["data"] + question = mm_input["question"] + + llm, prompt, stop_token_ids = run_glm(question,engine_params) + sampling_params['stop_token_ids'] = stop_token_ids + + # We set temperature to 0.2 so that outputs can be different + # even when all prompts are identical when running batch inference. 
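+    # Note: the sampling temperature actually used comes from the CLI arguments parsed above; the README example passes --temperature 0.0 for deterministic output.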
+ sampling_params = SamplingParams(**sampling_params) + + assert args.num_prompts > 0 + if args.num_prompts == 1: + # Single inference + inputs = { + "prompt": prompt, + "multi_modal_data": { + modality: data + }, + } + + else: + # Batch inference + inputs = [{ + "prompt": prompt, + "multi_modal_data": { + modality: data + }, + } for _ in range(args.num_prompts)] + + outputs = llm.generate(inputs, sampling_params=sampling_params) + + for o in outputs: + generated_text = o.outputs[0].text + print(generated_text) \ No newline at end of file diff --git a/models/multimodal/vision_language_model/llava_next_base/vllm/README.md b/models/multimodal/vision_language_model/llava_next_base/vllm/README.md new file mode 100644 index 0000000000000000000000000000000000000000..20dd1aec7146d3bfe0b1ddef23053ea3ee723e87 --- /dev/null +++ b/models/multimodal/vision_language_model/llava_next_base/vllm/README.md @@ -0,0 +1,37 @@ +# LLaVA-NeXT-based (vLLM) + +## Model Description + +E5-V is fine-tuned from lmms-lab/llama3-llava-next-8b. + +We propose a framework, called E5-V, to adapt MLLMs for achieving multimodal embeddings. E5-V effectively bridges the modality gap between different types of inputs, demonstrating strong performance in multimodal embeddings even without fine-tuning. We also propose a single modality training approach for E5-V, where the model is trained exclusively on text pairs, demonstrating better performance than multimodal training. + +More details can be found in the [E5-V repository](https://github.com/kongds/E5-V). + +## Supported Environments + +| GPU | [IXUCA SDK](https://gitee.com/deep-spark/deepspark#%E5%A4%A9%E6%95%B0%E6%99%BA%E7%AE%97%E8%BD%AF%E4%BB%B6%E6%A0%88-ixuca) | Release | +| :----: | :----: | :----: | +| MR-V100 | 4.3.0 | 25.09 | + +## Model Preparation + +### Prepare Resources + +- Model: + +```bash +cp -r ../../vllm_public_assets/ ./ +``` + +### Install Dependencies + +In order to run the model smoothly, you need to get the sdk from [resource center](https://support.iluvatar.com/#/ProductLine?id=2) of Iluvatar CoreX official website. + +## Model Inference + +```bash +python3 offline_inference_vision_language_embedding.py --model /path/to/e5-v/ --modality "image" --tensor_parallel_size 1 --task "embed" --trust_remote_code --max_model_len 4096 +``` + +## Model Results \ No newline at end of file diff --git a/models/multimodal/vision_language_model/llava_next_base/vllm/ci/prepare.sh b/models/multimodal/vision_language_model/llava_next_base/vllm/ci/prepare.sh new file mode 100644 index 0000000000000000000000000000000000000000..1ce243cbc5197ba4f8526707e50605e75b46e691 --- /dev/null +++ b/models/multimodal/vision_language_model/llava_next_base/vllm/ci/prepare.sh @@ -0,0 +1,19 @@ +#!/bin/bash +# Copyright (c) 2025, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
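+# Copy the shared vLLM demo assets (sample images) needed by the offline embedding example.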
+ +set -x + +cp -r ../../vllm_public_assets/ ./ \ No newline at end of file diff --git a/models/multimodal/vision_language_model/llava_next_base/vllm/offline_inference_vision_language_embedding.py b/models/multimodal/vision_language_model/llava_next_base/vllm/offline_inference_vision_language_embedding.py new file mode 100644 index 0000000000000000000000000000000000000000..aeb601e0af030718be8f6cb74a858b0df22f8e1d --- /dev/null +++ b/models/multimodal/vision_language_model/llava_next_base/vllm/offline_inference_vision_language_embedding.py @@ -0,0 +1,154 @@ +#!/bin/bash +# Copyright (c) 2025, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +""" +This example shows how to use vLLM for running offline inference with +the correct prompt format on vision language models for multimodal embedding. + +For most models, the prompt format should follow corresponding examples +on HuggingFace model repository. +""" +from argparse import Namespace +from typing import Literal, NamedTuple, Optional, TypedDict, Union, get_args +import io +import base64 +from PIL import Image +from vllm import LLM +from vllm.multimodal.utils import fetch_image +from vllm.utils import FlexibleArgumentParser +from vllm import LLM, EngineArgs +import dataclasses + +class TextQuery(TypedDict): + modality: Literal["text"] + text: str + + +class ImageQuery(TypedDict): + modality: Literal["image"] + image: Image.Image + + +class TextImageQuery(TypedDict): + modality: Literal["text+image"] + text: str + image: Image.Image + + +QueryModality = Literal["text", "image", "text+image"] +Query = Union[TextQuery, ImageQuery, TextImageQuery] + + +class ModelRequestData(NamedTuple): + llm: LLM + prompt: str + image: Optional[Image.Image] + + +def run_e5_v(query: Query, engine_params): + llama3_template = '<|start_header_id|>user<|end_header_id|>\n\n{}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n \n' # noqa: E501 + + if query["modality"] == "text": + text = query["text"] + prompt = llama3_template.format( + f"{text}\nSummary above sentence in one word: ") + image = None + elif query["modality"] == "image": + prompt = llama3_template.format( + "\nSummary above image in one word: ") + image = query["image"] + else: + modality = query['modality'] + raise ValueError(f"Unsupported query modality: '{modality}'") + + llm = LLM(**engine_params) + + return ModelRequestData( + llm=llm, + prompt=prompt, + image=image, + ) + + + +def get_query(modality: QueryModality): + if modality == "text": + return TextQuery(modality="text", text="A dog sitting in the grass") + + + if modality == "image": + image: Image = Image.open("vllm_public_assets/American_Eskimo_Dog.jpg") + image = image.convert("RGB") + image_data = io.BytesIO() + image.save(image_data, format='JPEG') + image_base64 = base64.b64encode(image_data.getvalue()).decode("utf-8") + return ImageQuery( + modality="image", + image= fetch_image(f"data:image/jpeg;base64,{image_base64}" + ), + ) + + if modality == 
"text+image": + image: Image = Image.open("vllm_public_assets/Felis_catus-cat_on_snow.jpg") + image = image.convert("RGB") + image_data = io.BytesIO() + image.save(image_data, format='JPEG') + image_base64 = base64.b64encode(image_data.getvalue()).decode("utf-8") + return TextImageQuery( + modality="text+image", + text="A cat standing in the snow.", + image= fetch_image(f"data:image/jpeg;base64,{image_base64}" + ), + ) + + msg = f"Modality {modality} is not supported." + raise ValueError(msg) + + +def run_encode(engine_params, modality: QueryModality): + query = get_query(modality) + req_data = run_e5_v(query, engine_params) + + mm_data = {} + if req_data.image is not None: + mm_data["image"] = req_data.image + + outputs = req_data.llm.embed({ + "prompt": req_data.prompt, + "multi_modal_data": mm_data, + }) + + for output in outputs: + print(output.outputs.embedding) + + + +if __name__ == "__main__": + parser = FlexibleArgumentParser( + description='Demo on using vLLM for offline inference with ' + 'vision language models for multimodal embedding') + parser.add_argument('--modality', + type=str, + default="image", + choices=get_args(QueryModality), + help='Modality of the input.') + parser = EngineArgs.add_cli_args(parser) + args = parser.parse_args() + engine_args = [attr.name for attr in dataclasses.fields(EngineArgs)] + engine_params = {attr: getattr(args, attr) for attr in engine_args} + + run_encode(engine_params, args.modality) diff --git a/models/multimodal/vision_language_model/qwen2_5_vl/vllm/README.md b/models/multimodal/vision_language_model/qwen2_5_vl/vllm/README.md new file mode 100644 index 0000000000000000000000000000000000000000..a1279a4294ea34dfcb1b14c5fb6e41c3f2f274fe --- /dev/null +++ b/models/multimodal/vision_language_model/qwen2_5_vl/vllm/README.md @@ -0,0 +1,39 @@ +# Qwen2.5_VL (vLLM) + +## Model Description + +Qwen2.5-VL is not only proficient in recognizing common objects such as flowers, birds, fish, and insects, but it is highly capable of analyzing texts, charts, icons, graphics, and layouts within images. Qwen2.5-VL directly plays as a visual agent that can reason and dynamically direct tools, which is capable of computer use and phone use. Qwen2.5-VL can comprehend videos of over 1 hour, and this time it has a new ability of cpaturing event by pinpointing the relevant video segments. Qwen2.5-VL can accurately localize objects in an image by generating bounding boxes or points, and it can provide stable JSON outputs for coordinates and attributes. for data like scans of invoices, forms, tables, etc. Qwen2.5-VL supports structured outputs of their contents, benefiting usages in finance, commerce, etc. + +## Supported Environments + +| GPU | [IXUCA SDK](https://gitee.com/deep-spark/deepspark#%E5%A4%A9%E6%95%B0%E6%99%BA%E7%AE%97%E8%BD%AF%E4%BB%B6%E6%A0%88-ixuca) | Release | +| :----: | :----: | :----: | +| MR-V100 | 4.3.0 | 25.09 | + +## Model Preparation + +### Prepare Resources + +- Model: + +```bash +cp -r ../../vllm_public_assets/ ./ +``` + +### Install Dependencies + +In order to run the model smoothly, you need to get the sdk from [resource center](https://support.iluvatar.com/#/ProductLine?id=2) of Iluvatar CoreX official website. 
+ +```bash +pip install transformers==4.50.3 +``` + +## Model Inference + +```bash +export VLLM_ASSETS_CACHE=../vllm/ +export ENABLE_FLASH_ATTENTION_WITH_HEAD_DIM_PADDING=1 +python3 offline_inference_vision_language.py --model /path/to/Qwen2.5-VL-3B-Instruct/ -tp 4 --trust-remote-code --temperature 0.0 --max-token 256 +``` + +## Model Results diff --git a/models/multimodal/vision_language_model/qwen2_5_vl/vllm/ci/prepare.sh b/models/multimodal/vision_language_model/qwen2_5_vl/vllm/ci/prepare.sh new file mode 100644 index 0000000000000000000000000000000000000000..cc6608c240adf4526fc66d01d049232f64da883b --- /dev/null +++ b/models/multimodal/vision_language_model/qwen2_5_vl/vllm/ci/prepare.sh @@ -0,0 +1,20 @@ +#!/bin/bash +# Copyright (c) 2025, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +set -x + +cp -r ../../vllm_public_assets/ ./ +pip install transformers==4.50.3 diff --git a/models/multimodal/vision_language_model/qwen2_5_vl/vllm/offline_inference_vision_language.py b/models/multimodal/vision_language_model/qwen2_5_vl/vllm/offline_inference_vision_language.py new file mode 100644 index 0000000000000000000000000000000000000000..f26930089794c620e0c1a3af100a672bd213a1c2 --- /dev/null +++ b/models/multimodal/vision_language_model/qwen2_5_vl/vllm/offline_inference_vision_language.py @@ -0,0 +1,153 @@ +#!/bin/bash +# Copyright (c) 2025, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +""" +This example shows how to use vLLM for running offline inference +with the correct prompt format on vision language models. + +For most models, the prompt format should follow corresponding examples +on HuggingFace model repository. 
+""" +import sys +from pathlib import Path +import os +sys.path.append(str(Path(__file__).resolve().parent.parent.parent)) +import argparse +import dataclasses +import inspect +from vllm.assets.image import ImageAsset +from vllm.assets.video import VideoAsset +from vllm import LLM, EngineArgs, SamplingParams +from utils import sampling_add_cli_args + + +# Qwen2_5_VL +def run_qwen2_5_vl(question: str,engine_params, modality: str): + + llm = LLM(**engine_params) + + if modality == "image": + placeholder = "<|image_pad|>" + elif modality == "video": + placeholder = "<|video_pad|>" + + prompt = ("<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n" + f"<|im_start|>user\n<|vision_start|>{placeholder}<|vision_end|>" + f"{question}<|im_end|>\n" + "<|im_start|>assistant\n") + stop_token_ids = None + return llm, prompt, stop_token_ids + + +def get_multi_modal_input(args): + """ + return { + "data": image or video, + "question": question, + } + """ + if args.modality == "image": + # Input image and question + image = ImageAsset("cherry_blossom").pil_image.convert("RGB") + img_question = "What is the content of this image?" + + return { + "data": image, + "question": img_question, + } + + if args.modality == "video": + # Input video and question + video = VideoAsset(name="sample_demo_1.mp4", + num_frames=args.num_frames).np_ndarrays + vid_question = "Why is this video funny?" + + return { + "data": video, + "question": vid_question, + } + + msg = f"Modality {args.modality} is not supported." + raise ValueError(msg) + + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument('--num-prompts', + type=int, + default=1, + help='Number of prompts to run.') + parser.add_argument('--modality', + type=str, + default="image", + help='Modality of the input.') + parser.add_argument('--num-frames', + type=int, + default=16, + help='Number of frames to extract from the video.') + parser = EngineArgs.add_cli_args(parser) + parser = sampling_add_cli_args(parser) + args = parser.parse_args() + engine_args = [attr.name for attr in dataclasses.fields(EngineArgs)] + sampling_args = [ + param.name + for param in list( + inspect.signature(SamplingParams).parameters.values() + ) + ] + engine_params = {attr: getattr(args, attr) for attr in engine_args} + sampling_params = { + attr: getattr(args, attr) for attr in sampling_args if args.__contains__(attr) + } + + modality = args.modality + mm_input = get_multi_modal_input(args) + data = mm_input["data"] + question = mm_input["question"] + + llm, prompt, stop_token_ids = run_qwen2_5_vl(question,engine_params,args.modality) + sampling_params['stop_token_ids'] = stop_token_ids + + # We set temperature to 0.2 so that outputs can be different + # even when all prompts are identical when running batch inference. 
+ sampling_params = SamplingParams(**sampling_params) + + assert args.num_prompts > 0 + if args.num_prompts == 1: + # Single inference + inputs = { + "prompt": prompt, + "multi_modal_data": { + modality: data + }, + } + + else: + # Batch inference + inputs = [{ + "prompt": prompt, + "multi_modal_data": { + modality: data + }, + } for _ in range(args.num_prompts)] + + outputs = llm.generate(inputs, sampling_params=sampling_params) + + for o in outputs: + generated_text = o.outputs[0].text + print(generated_text) \ No newline at end of file diff --git a/models/multimodal/vision_language_model/qwen2_vl/vllm/README.md b/models/multimodal/vision_language_model/qwen2_vl/vllm/README.md new file mode 100644 index 0000000000000000000000000000000000000000..0af8e6efd92b7c99fe4f524525dc245c7c2ab88f --- /dev/null +++ b/models/multimodal/vision_language_model/qwen2_vl/vllm/README.md @@ -0,0 +1,39 @@ +# Qwen2_VL (vLLM) + +## Model Description + +Qwen2-VL achieves state-of-the-art performance on visual understanding benchmarks, including MathVista, DocVQA, RealWorldQA, MTVQA, etc. And can understand videos over 20 minutes for high-quality video-based question answering, dialog, content creation, etc. With the abilities of complex reasoning and decision making, Qwen2-VL can be integrated with devices like mobile phones, robots, etc., for automatic operation based on visual environment and text instructions. + +## Supported Environments + +| GPU | [IXUCA SDK](https://gitee.com/deep-spark/deepspark#%E5%A4%A9%E6%95%B0%E6%99%BA%E7%AE%97%E8%BD%AF%E4%BB%B6%E6%A0%88-ixuca) | Release | +| :----: | :----: | :----: | +| MR-V100 | 4.3.0 | 25.09 | + +## Model Preparation + +### Prepare Resources + +- Model: + +```bash +cp -r ../../vllm_public_assets/ ./ +``` + +### Install Dependencies + +In order to run the model smoothly, you need to get the sdk from [resource center](https://support.iluvatar.com/#/ProductLine?id=2) of Iluvatar CoreX official website. + +```bash +pip install transformers==4.50.3 +``` + +## Model Inference + +```bash +export VLLM_ASSETS_CACHE=../vllm/ +export ENABLE_FLASH_ATTENTION_WITH_HEAD_DIM_PADDING=1 +python3 offline_inference_vision_language.py --model /path/to/Qwen2-VL-7B-Instruct --max-tokens 256 -tp 4 --trust-remote-code --temperature 0.0 --max-num-seqs 5 +``` + +## Model Results \ No newline at end of file diff --git a/models/multimodal/vision_language_model/qwen2_vl/vllm/ci/prepare.sh b/models/multimodal/vision_language_model/qwen2_vl/vllm/ci/prepare.sh new file mode 100644 index 0000000000000000000000000000000000000000..cc6608c240adf4526fc66d01d049232f64da883b --- /dev/null +++ b/models/multimodal/vision_language_model/qwen2_vl/vllm/ci/prepare.sh @@ -0,0 +1,20 @@ +#!/bin/bash +# Copyright (c) 2025, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
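+# Copy the shared vLLM demo assets and install the transformers version pinned for Qwen2-VL.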
+ +set -x + +cp -r ../../vllm_public_assets/ ./ +pip install transformers==4.50.3 diff --git a/models/multimodal/vision_language_model/qwen2_vl/vllm/offline_inference_vision_language.py b/models/multimodal/vision_language_model/qwen2_vl/vllm/offline_inference_vision_language.py new file mode 100644 index 0000000000000000000000000000000000000000..23b3d8d75c955e6e5a5f81e60d638fa94a884578 --- /dev/null +++ b/models/multimodal/vision_language_model/qwen2_vl/vllm/offline_inference_vision_language.py @@ -0,0 +1,148 @@ +#!/bin/bash +# Copyright (c) 2025, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +""" +This example shows how to use vLLM for running offline inference +with the correct prompt format on vision language models. + +For most models, the prompt format should follow corresponding examples +on HuggingFace model repository. +""" +import sys +from pathlib import Path +import os +sys.path.append(str(Path(__file__).resolve().parent.parent.parent)) +import argparse +import dataclasses +import inspect +from vllm.assets.image import ImageAsset +from vllm.assets.video import VideoAsset +from vllm import LLM, EngineArgs, SamplingParams +from utils import sampling_add_cli_args + +# Qwen2-VL +def run_qwen2_vl(question, engine_params): + llm = LLM(**engine_params) + + prompt = ("<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n" + "<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>" + f"{question}<|im_end|>\n" + "<|im_start|>assistant\n") + stop_token_ids = None + return llm, prompt, stop_token_ids + + + + +def get_multi_modal_input(args): + """ + return { + "data": image or video, + "question": question, + } + """ + if args.modality == "image": + # Input image and question + image = ImageAsset("cherry_blossom").pil_image.convert("RGB") + img_question = "What is the content of this image?" + + return { + "data": image, + "question": img_question, + } + + if args.modality == "video": + # Input video and question + video = VideoAsset(name="sample_demo_1.mp4", + num_frames=args.num_frames).np_ndarrays + vid_question = "Why is this video funny?" + + return { + "data": video, + "question": vid_question, + } + + msg = f"Modality {args.modality} is not supported." 
+ raise ValueError(msg) + + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument('--num-prompts', + type=int, + default=1, + help='Number of prompts to run.') + parser.add_argument('--modality', + type=str, + default="image", + help='Modality of the input.') + parser.add_argument('--num-frames', + type=int, + default=16, + help='Number of frames to extract from the video.') + parser = EngineArgs.add_cli_args(parser) + parser = sampling_add_cli_args(parser) + args = parser.parse_args() + engine_args = [attr.name for attr in dataclasses.fields(EngineArgs)] + sampling_args = [ + param.name + for param in list( + inspect.signature(SamplingParams).parameters.values() + ) + ] + engine_params = {attr: getattr(args, attr) for attr in engine_args} + sampling_params = { + attr: getattr(args, attr) for attr in sampling_args if args.__contains__(attr) + } + + modality = args.modality + mm_input = get_multi_modal_input(args) + data = mm_input["data"] + question = mm_input["question"] + + llm, prompt, stop_token_ids = run_qwen2_vl(question,engine_params) + sampling_params['stop_token_ids'] = stop_token_ids + + # We set temperature to 0.2 so that outputs can be different + # even when all prompts are identical when running batch inference. + sampling_params = SamplingParams(**sampling_params) + + assert args.num_prompts > 0 + if args.num_prompts == 1: + # Single inference + inputs = { + "prompt": prompt, + "multi_modal_data": { + modality: data + }, + } + + else: + # Batch inference + inputs = [{ + "prompt": prompt, + "multi_modal_data": { + modality: data + }, + } for _ in range(args.num_prompts)] + + outputs = llm.generate(inputs, sampling_params=sampling_params) + + for o in outputs: + generated_text = o.outputs[0].text + print(generated_text) \ No newline at end of file diff --git a/models/multimodal/vision_language_model/qwen_vl/vllm/README.md b/models/multimodal/vision_language_model/qwen_vl/vllm/README.md new file mode 100644 index 0000000000000000000000000000000000000000..eae369a953bb37446419580f1c8bc1fefd8a0613 --- /dev/null +++ b/models/multimodal/vision_language_model/qwen_vl/vllm/README.md @@ -0,0 +1,38 @@ +# Qwen_VL (vLLM) + +## Model Description + +Qwen-VL (Qwen Large Vision Language Model) is the visual multimodal version of the large model series, Qwen (abbr. Tongyi Qianwen), proposed by Alibaba Cloud. Qwen-VL accepts image, text, and bounding box as inputs, outputs text and bounding box. + +## Supported Environments + +| GPU | [IXUCA SDK](https://gitee.com/deep-spark/deepspark#%E5%A4%A9%E6%95%B0%E6%99%BA%E7%AE%97%E8%BD%AF%E4%BB%B6%E6%A0%88-ixuca) | Release | +| :----: | :----: | :----: | +| MR-V100 | 4.3.0 | 25.09 | + +## Model Preparation + +### Prepare Resources + +- Model: + +```bash +cp -r ../../vllm_public_assets/ ./ +``` + +### Install Dependencies + +In order to run the model smoothly, you need to get the sdk from [resource center](https://support.iluvatar.com/#/ProductLine?id=2) of Iluvatar CoreX official website. 
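+Also install matplotlib, which this Qwen-VL example setup requires: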
+ +```bash +pip install matplotlib +``` + +## Model Inference + +```bash +export VLLM_ASSETS_CACHE=../vllm/ +python3 offline_inference_vision_language.py --model /path/to/Qwen-VL-Chat -tp 1 --trust-remote-code --temperature 0.0 --hf-overrides '{"architectures": ["QwenVLForConditionalGeneration"]}' +``` + +## Model Results \ No newline at end of file diff --git a/models/multimodal/vision_language_model/qwen_vl/vllm/ci/prepare.sh b/models/multimodal/vision_language_model/qwen_vl/vllm/ci/prepare.sh new file mode 100644 index 0000000000000000000000000000000000000000..277bc43fa20703b8a6fea8a6effac2cd461b99d0 --- /dev/null +++ b/models/multimodal/vision_language_model/qwen_vl/vllm/ci/prepare.sh @@ -0,0 +1,20 @@ +#!/bin/bash +# Copyright (c) 2025, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +set -x + +cp -r ../../vllm_public_assets/ ./ +pip install matplotlib diff --git a/models/multimodal/vision_language_model/qwen_vl/vllm/offline_inference_vision_language.py b/models/multimodal/vision_language_model/qwen_vl/vllm/offline_inference_vision_language.py new file mode 100644 index 0000000000000000000000000000000000000000..5a8309960ec77ff67a96743d66183b09753e822a --- /dev/null +++ b/models/multimodal/vision_language_model/qwen_vl/vllm/offline_inference_vision_language.py @@ -0,0 +1,144 @@ +#!/bin/bash +# Copyright (c) 2025, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +""" +This example shows how to use vLLM for running offline inference +with the correct prompt format on vision language models. + +For most models, the prompt format should follow corresponding examples +on HuggingFace model repository. 
+""" +import sys +from pathlib import Path +import os +sys.path.append(str(Path(__file__).resolve().parent.parent.parent)) +import argparse +import dataclasses +import inspect +from vllm.assets.image import ImageAsset +from vllm.assets.video import VideoAsset +from vllm import LLM, EngineArgs, SamplingParams +from utils import sampling_add_cli_args + + +# Qwen +def run_qwen_vl(question,engine_params,modality): + assert modality == "image" + llm = LLM(**engine_params) + prompt = f"{question}Picture 1: \n" + stop_token_ids = None + return llm, prompt, stop_token_ids + + +def get_multi_modal_input(args): + """ + return { + "data": image or video, + "question": question, + } + """ + if args.modality == "image": + # Input image and question + image = ImageAsset("cherry_blossom").pil_image.convert("RGB") + img_question = "What is the content of this image?" + + return { + "data": image, + "question": img_question, + } + + if args.modality == "video": + # Input video and question + video = VideoAsset(name="sample_demo_1.mp4", + num_frames=args.num_frames).np_ndarrays + vid_question = "Why is this video funny?" + + return { + "data": video, + "question": vid_question, + } + + msg = f"Modality {args.modality} is not supported." + raise ValueError(msg) + + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument('--num-prompts', + type=int, + default=1, + help='Number of prompts to run.') + parser.add_argument('--modality', + type=str, + default="image", + help='Modality of the input.') + parser.add_argument('--num-frames', + type=int, + default=16, + help='Number of frames to extract from the video.') + parser = EngineArgs.add_cli_args(parser) + parser = sampling_add_cli_args(parser) + args = parser.parse_args() + engine_args = [attr.name for attr in dataclasses.fields(EngineArgs)] + sampling_args = [ + param.name + for param in list( + inspect.signature(SamplingParams).parameters.values() + ) + ] + engine_params = {attr: getattr(args, attr) for attr in engine_args} + sampling_params = { + attr: getattr(args, attr) for attr in sampling_args if args.__contains__(attr) + } + + modality = args.modality + mm_input = get_multi_modal_input(args) + data = mm_input["data"] + question = mm_input["question"] + + llm, prompt, stop_token_ids = run_qwen_vl(question,engine_params,args.modality) + sampling_params['stop_token_ids'] = stop_token_ids + + # We set temperature to 0.2 so that outputs can be different + # even when all prompts are identical when running batch inference. 
+ sampling_params = SamplingParams(**sampling_params) + + assert args.num_prompts > 0 + if args.num_prompts == 1: + # Single inference + inputs = { + "prompt": prompt, + "multi_modal_data": { + modality: data + }, + } + + else: + # Batch inference + inputs = [{ + "prompt": prompt, + "multi_modal_data": { + modality: data + }, + } for _ in range(args.num_prompts)] + + outputs = llm.generate(inputs, sampling_params=sampling_params) + + for o in outputs: + generated_text = o.outputs[0].text + print(generated_text) \ No newline at end of file diff --git a/models/multimodal/vision_language_model/vllm_public_assets/American_Eskimo_Dog.jpg b/models/multimodal/vision_language_model/vllm_public_assets/American_Eskimo_Dog.jpg new file mode 100644 index 0000000000000000000000000000000000000000..20cd46d783de5eaf92fdd0eaf2f4d8f31ad19b2f Binary files /dev/null and b/models/multimodal/vision_language_model/vllm_public_assets/American_Eskimo_Dog.jpg differ diff --git a/models/nlp/llm/internlm3/lmdeploy/README.md b/models/nlp/llm/internlm3/lmdeploy/README.md new file mode 100644 index 0000000000000000000000000000000000000000..833dfda0d781967fc3171f1831501a0708a5103c --- /dev/null +++ b/models/nlp/llm/internlm3/lmdeploy/README.md @@ -0,0 +1,56 @@ + + +# InternLM3 (LMDeploy) + +## Model Description + +InternLM3 has open-sourced an 8-billion parameter instruction model, InternLM3-8B-Instruct, designed for general-purpose usage and advanced reasoning. This model has the following characteristics: + +- Enhanced performance at reduced cost: State-of-the-art performance on reasoning and knowledge-intensive tasks surpass models like Llama3.1-8B and Qwen2.5-7B. Remarkably, InternLM3 is trained on only 4 trillion high-quality tokens, saving more than 75% of the training cost compared to other LLMs of similar scale. +- Deep thinking capability: InternLM3 supports both the deep thinking mode for solving complicated reasoning tasks via the long chain-of-thought and the normal response mode for fluent user interactions. + +## Supported Environments + +| GPU | [IXUCA SDK](https://gitee.com/deep-spark/deepspark#%E5%A4%A9%E6%95%B0%E6%99%BA%E7%AE%97%E8%BD%AF%E4%BB%B6%E6%A0%88-ixuca) | Release | +| :----: | :----: | :----: | +| MR-V100 | 4.3.0 | 25.09 | + +## Model Preparation + +### Prepare Resources + +- Model: + +### Install Dependencies + +In order to run the model smoothly, you need to get the sdk from [resource center](https://support.iluvatar.com/#/ProductLine?id=2) of Iluvatar CoreX official website. + +```bash +pip install lmdeploy-0.7.2+corex.4.3.0-py3-none-any.whl +``` + +## Model Inference + +### Offline + +```bash +python3 offline_inference.py --model-path /path/to/internlm3-8b-instruct --tp 1 +``` + +### Server + +```bash +lmdeploy serve api_server /path/to/internlm3-8b-instruct --server-port 23333 + +curl http://{server_ip}:{server_port}/v1/models + + +curl http://{server_ip}:{server_port}/v1/chat/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "/home/data/nlp/internlm3-8b-instruct_awq", + "messages": [{"role": "user", "content": "Hello! How are you?"}] + }' +``` + +## Model Results diff --git a/models/nlp/llm/internlm3/lmdeploy/ci/prepare.sh b/models/nlp/llm/internlm3/lmdeploy/ci/prepare.sh new file mode 100644 index 0000000000000000000000000000000000000000..34aa71cee97e8a388ab721416acfdaf00cd4ec33 --- /dev/null +++ b/models/nlp/llm/internlm3/lmdeploy/ci/prepare.sh @@ -0,0 +1,18 @@ +#!/bin/bash +# Copyright (c) 2025, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. 
+# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +set -x +pip install /mnt/deepspark/data/install/lmdeploy-0.7.2+corex.4.3.0-py3-none-any.whl \ No newline at end of file diff --git a/models/nlp/llm/internlm3/lmdeploy/offline_inference.py b/models/nlp/llm/internlm3/lmdeploy/offline_inference.py new file mode 100644 index 0000000000000000000000000000000000000000..0258c906615d0bfd0d2925a16d1d51062a76e801 --- /dev/null +++ b/models/nlp/llm/internlm3/lmdeploy/offline_inference.py @@ -0,0 +1,49 @@ +# Copyright (c) 2025, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse +from lmdeploy import pipeline, GenerationConfig, PytorchEngineConfig + +def main(args): + model_path = args.model_path + max_new_tokens = args.max_tokens + + backend_config = PytorchEngineConfig(session_len=2048,tp = args.tp) + gen_config = GenerationConfig(top_p=0.8, + top_k=40, + temperature=0.8, + max_new_tokens=max_new_tokens) + + pipe = pipeline(model_path, + backend_config=backend_config) + prompts = [[{ + 'role': 'user', + 'content': '请介绍一下你自己' + }], [{ + 'role': 'user', + 'content': '请介绍一下上海' + }]] + response = pipe(prompts, gen_config=gen_config) + print(response) + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--max-tokens", type=int, default=1024) + parser.add_argument("--model-path", type=str, default=None) + parser.add_argument("--tp", type=int, default=1) + + args = parser.parse_args() + main(args) + diff --git a/models/speech/asr/whisper/vllm/README.md b/models/speech/asr/whisper/vllm/README.md new file mode 100644 index 0000000000000000000000000000000000000000..19fba95c73850d33447afecedb023fe79ca0f475 --- /dev/null +++ b/models/speech/asr/whisper/vllm/README.md @@ -0,0 +1,36 @@ +# Whisper (vLLM) + +## Model Description + +Whisper is a state-of-the-art model for automatic speech recognition (ASR) and speech translation, proposed in the paper Robust Speech Recognition via Large-Scale Weak Supervision by Alec Radford et al. from OpenAI. Trained on >5M hours of labeled data, Whisper demonstrates a strong ability to generalise to many datasets and domains in a zero-shot setting. + +Whisper large-v3-turbo is a finetuned version of a pruned Whisper large-v3. In other words, it's the exact same model, except that the number of decoding layers have reduced from 32 to 4. As a result, the model is way faster, at the expense of a minor quality degradation. 
You can find more details about it in this GitHub discussion. + +## Supported Environments + +| GPU | [IXUCA SDK](https://gitee.com/deep-spark/deepspark#%E5%A4%A9%E6%95%B0%E6%99%BA%E7%AE%97%E8%BD%AF%E4%BB%B6%E6%A0%88-ixuca) | Release | +| :----: | :----: | :----: | +| MR-V100 | 4.3.0 | 25.09 | + +## Model Preparation + +### Prepare Resources + +- Model: + +### Install Dependencies + +In order to run the model smoothly, you need to get the sdk from [resource center](https://support.iluvatar.com/#/ProductLine?id=2) of Iluvatar CoreX official website. + +```bash +pip3 install transformers==4.50.3 +pip3 install librosa +``` + +## Model Inference + +```bash +export VLLM_ASSETS_CACHE=../vllm/ +python3 offline_inference_audio_language.py --model /path/to/whisper-large-v3-turbo/ -tp 1 --temperature 0.0 --model-name openai/whisper-large-v3-turbo --max-tokens 200 +``` + diff --git a/models/speech/asr/whisper/vllm/ci/prepare.sh b/models/speech/asr/whisper/vllm/ci/prepare.sh new file mode 100644 index 0000000000000000000000000000000000000000..a7e2b7e81929af29a1557a9a6fe607c5ab7a5742 --- /dev/null +++ b/models/speech/asr/whisper/vllm/ci/prepare.sh @@ -0,0 +1,20 @@ +#!/bin/bash +# Copyright (c) 2025, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +set -x + +pip3 install transformers==4.50.3 +pip3 install librosa \ No newline at end of file diff --git a/models/speech/asr/whisper/vllm/offline_inference_audio_language.py b/models/speech/asr/whisper/vllm/offline_inference_audio_language.py new file mode 100644 index 0000000000000000000000000000000000000000..19efa152c0b821cc710dff421018eb05c273cecd --- /dev/null +++ b/models/speech/asr/whisper/vllm/offline_inference_audio_language.py @@ -0,0 +1,151 @@ +#!/bin/bash +# Copyright (c) 2025, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# SPDX-License-Identifier: Apache-2.0 +"""Compare the outputs of HF and vLLM for Whisper models using greedy sampling. + +Run `pytest tests/models/encoder_decoder/audio/test_whisper.py`. 
+""" +from typing import Optional + +import argparse +import sys +from pathlib import Path +sys.path.append(str(Path(__file__).resolve().parent.parent.parent)) +import argparse +import dataclasses +import inspect +from vllm import LLM, EngineArgs, SamplingParams +from utils import sampling_add_cli_args +from vllm.assets.audio import AudioAsset + +PROMPTS = [ + { + "prompt": + "<|startoftranscript|><|en|><|transcribe|><|notimestamps|>", + "multi_modal_data": { + "audio": AudioAsset("mary_had_lamb").audio_and_sample_rate, + }, + }, + { # Test explicit encoder/decoder prompt + "encoder_prompt": { + "prompt": "", + "multi_modal_data": { + "audio": AudioAsset("winning_call").audio_and_sample_rate, + }, + }, + "decoder_prompt": + "<|startoftranscript|><|en|><|transcribe|><|notimestamps|>", + } +] + +EXPECTED = { + "openai/whisper-tiny": [ + " He has birth words I spoke in the original corner of that. And a" + " little piece of black coat poetry. Mary had a little sandwich," + " sweet, with white and snow. And everyone had it very went the last" + " would sure to go.", + " >> And the old one, fit John the way to Edgar Martinez. >> One more" + " to line down the field line for our base camp. Here comes joy. Here" + " is June and the third base. They're going to wave him in. The throw" + " to the plate will be late. The Mariners are going to play for the" + " American League Championship. I don't believe it. It just continues" + " by all five." + ], + "openai/whisper-small": [ + " The first words I spoke in the original pornograph. A little piece" + " of practical poetry. Mary had a little lamb, its fleece was quite a" + " slow, and everywhere that Mary went the lamb was sure to go.", + " And the old one pitch on the way to Edgar Martinez one month. Here" + " comes joy. Here is Junior to third base. They're gonna wave him" + " in. The throw to the plate will be late. The Mariners are going to" + " play for the American League Championship. I don't believe it. It" + " just continues. My, oh my." + ], + "openai/whisper-medium": [ + " The first words I spoke in the original phonograph, a little piece" + " of practical poetry. Mary had a little lamb, its fleece was quite as" + " slow, and everywhere that Mary went the lamb was sure to go.", + " And the 0-1 pitch on the way to Edgar Martinez swung on the line" + " down the left field line for Obeyshev. Here comes Joy. Here is" + " Jorgen at third base. They're going to wave him in. The throw to the" + " plate will be late. The Mariners are going to play for the American" + " League Championship. I don't believe it. It just continues. My, oh" + " my." + ], + "openai/whisper-large-v3": [ + " The first words I spoke in the original phonograph, a little piece" + " of practical poetry. Mary had a little lamb, its feet were quite as" + " slow, and everywhere that Mary went, the lamb was sure to go.", + " And the 0-1 pitch on the way to Edgar Martinez. Swung on the line." + " Now the left field line for a base hit. Here comes Joy. Here is" + " Junior to third base. They're going to wave him in. The throw to the" + " plate will be late. The Mariners are going to play for the American" + " League Championship. I don't believe it. It just continues. My, oh," + " my." + ], + "openai/whisper-large-v3-turbo": [ + " The first words I spoke in the original phonograph, a little piece" + " of practical poetry. 
Mary had a little lamb, its streets were quite" + " as slow, and everywhere that Mary went the lamb was sure to go.", + " And the 0-1 pitch on the way to Edgar Martinez. Swung on the line" + " down the left field line for a base hit. Here comes Joy. Here is" + " Junior to third base. They're going to wave him in. The throw to the" + " plate will be late. The Mariners are going to play for the American" + " League Championship. I don't believe it. It just continues. My, oh," + " my." + ] +} + + +def run_whisper(engine_params,sampling_param,model_name) -> None: + # import pdb + # pdb.set_trace() + prompt_list = PROMPTS * 10 + expected_list = EXPECTED[model_name] * 10 + + llm = LLM(**engine_params) + sampling_params = SamplingParams(**sampling_param) + + outputs = llm.generate(prompt_list, sampling_params) + + for output, expected in zip(outputs, expected_list): + print(output.outputs[0].text) + # assert output.outputs[0].text == expected + + +if __name__ == "__main__": + + parser = argparse.ArgumentParser() + parser.add_argument('--model-name', + type=str, + help='Model name') + parser = EngineArgs.add_cli_args(parser) + parser = sampling_add_cli_args(parser) + args = parser.parse_args() + engine_args = [attr.name for attr in dataclasses.fields(EngineArgs)] + sampling_args = [ + param.name + for param in list( + inspect.signature(SamplingParams).parameters.values() + ) + ] + engine_params = {attr: getattr(args, attr) for attr in engine_args} + sampling_params = { + attr: getattr(args, attr) for attr in sampling_args if args.__contains__(attr) + } + run_whisper(engine_params,sampling_params, args.model_name) + diff --git a/models/speech/asr/whisper/vllm/vllm_public_assets/mary_had_lamb.ogg b/models/speech/asr/whisper/vllm/vllm_public_assets/mary_had_lamb.ogg new file mode 100644 index 0000000000000000000000000000000000000000..f2e2db40c1533c414fd235d3beeb6830748253c0 Binary files /dev/null and b/models/speech/asr/whisper/vllm/vllm_public_assets/mary_had_lamb.ogg differ diff --git a/models/speech/asr/whisper/vllm/vllm_public_assets/winning_call.ogg b/models/speech/asr/whisper/vllm/vllm_public_assets/winning_call.ogg new file mode 100644 index 0000000000000000000000000000000000000000..7760ca1d86242754a4b741cfcdae4cf3daae7f14 Binary files /dev/null and b/models/speech/asr/whisper/vllm/vllm_public_assets/winning_call.ogg differ diff --git a/tests/run_vllm.py b/tests/run_vllm.py index be795462d5917eb7c8f7a6dca34cdb70f75946d2..a9bfe117927f43dfd69a5563fea773e07751f107 100644 --- a/tests/run_vllm.py +++ b/tests/run_vllm.py @@ -56,7 +56,7 @@ def main(): result = {} # NLP模型 - if model["category"] in ["nlp/llm", "multimodal/vision_language_model"]: + if model["category"] in ["nlp/llm", "multimodal/vision_language_model", "speech/asr"]: logging.info(f"Start running {model['model_name']} test case:\n{json.dumps(model, indent=4)}") d_url = model["download_url"] if d_url is not None: @@ -284,6 +284,49 @@ def run_nlp_testcase(model): export VLLM_ASSETS_CACHE=../vllm/ python3 offline_inference_vision_language.py --model ./{model_name} --max-tokens 256 -tp 2 --temperature 0.0 --max-model-len 2048 """ + elif model_name == "whisper": + script = f""" + set -x + cd ../{model['model_path']} + export VLLM_ASSETS_CACHE=../vllm/ + python3 offline_inference_audio_language.py --model ./{model_name} -tp 1 --temperature 0.0 --model-name openai/whisper-large-v3-turbo --max-tokens 200 + """ + elif model_name == "qwen_vl": + script = f""" + set -x + cd ../{model['model_path']} + export VLLM_ASSETS_CACHE=../vllm/ + python3 
offline_inference_vision_language.py --model ./{model_name} -tp 1 --trust-remote-code --temperature 0.0 --hf-overrides '{{"architectures": ["QwenVLForConditionalGeneration"]}}' + """ + elif model_name == "qwen2_vl": + script = f""" + set -x + cd ../{model['model_path']} + export VLLM_ASSETS_CACHE=../vllm/ + export ENABLE_FLASH_ATTENTION_WITH_HEAD_DIM_PADDING=1 + python3 offline_inference_vision_language.py --model ./{model_name} --max-tokens 256 -tp 4 --trust-remote-code --temperature 0.0 --max-num-seqs 5 + """ + elif model_name == "qwen2_5_vl": + script = f""" + set -x + cd ../{model['model_path']} + export VLLM_ASSETS_CACHE=../vllm/ + export ENABLE_FLASH_ATTENTION_WITH_HEAD_DIM_PADDING=1 + python3 offline_inference_vision_language.py --model ./{model_name} -tp 4 --trust-remote-code --temperature 0.0 --max-token 256 + """ + elif model_name == "llava_next_base": + script = f""" + set -x + cd ../{model['model_path']} + python3 offline_inference_vision_language_embedding.py --model ./{model_name} --modality "image" --tensor_parallel_size 1 --task "embed" --trust_remote_code --max_model_len 4096 + """ + elif model_name == "glm-4v": + script = f""" + set -x + cd ../{model['model_path']} + export VLLM_ASSETS_CACHE=../vllm/ + python3 offline_inference_vision_language.py --model ./{model_name} --max-tokens 256 -tp 4 --trust-remote-code --temperature 0.0 --hf-overrides '{{"architectures": ["GLM4VForCausalLM"]}}' + """ + r, t = run_script(script) + sout = r.stdout