diff --git a/models/multimodal/diffusion/stable-diffusion/README.md b/models/multimodal/diffusion/stable-diffusion/README.md new file mode 100644 index 0000000000000000000000000000000000000000..93bc5f3dfc67faf74099d11c2b1e305f96622440 --- /dev/null +++ b/models/multimodal/diffusion/stable-diffusion/README.md @@ -0,0 +1,41 @@ +# Stable Diffusion 1.5 + +## Model description + +Stable Diffusion is a latent text-to-image diffusion model capable of generating photo-realistic images given any text input. + +## Setup + +### Install + +```bash +# Install libGL +## CentOS +yum install -y mesa-libGL +## Ubuntu +apt install -y libgl1-mesa-glx + +pip3 install http://files.deepspark.org.cn:880/deepspark/add-ons/diffusers-0.31.0-py3-none-any.whl +pip3 install -r requirements.txt +``` + +### Download + +Download the runwayml/stable-diffusion-v1-5 from [huggingface page](https://huggingface.co/runwayml/stable-diffusion-v1-5). + +```bash +cd stable-diffusion +mkdir -p data/ +ln -s /path/to/stable-diffusion-v1-5 ./data/ +``` + +## Inference + +```bash +export ENABLE_IXFORMER_INFERENCE=1 +python3 demo.py +``` + +## Reference + +- [diffusers](https://github.com/huggingface/diffusers) diff --git a/models/multimodal/diffusion/stable-diffusion/ci/prepare.sh b/models/multimodal/diffusion/stable-diffusion/ci/prepare.sh new file mode 100644 index 0000000000000000000000000000000000000000..b9140aa3524dfab2af3249dee4e6ab89948b3607 --- /dev/null +++ b/models/multimodal/diffusion/stable-diffusion/ci/prepare.sh @@ -0,0 +1,28 @@ +#!/bin/bash +# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
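+# Install the libGL runtime needed by opencv-python and the pinned Python dependencies;
+# the package manager is chosen from the OS ID parsed out of /etc/os-release.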
+
+set -x
+ID=$(grep -oP '(?<=^ID=).+' /etc/os-release | tr -d '"')
+if [[ ${ID} == "ubuntu" ]]; then
+    apt install -y libgl1-mesa-glx
+elif [[ ${ID} == "centos" ]]; then
+    yum install -y mesa-libGL
+else
+    echo "Unsupported OS: ${ID}"
+fi
+
+pip3 install http://files.deepspark.org.cn:880/deepspark/add-ons/diffusers-0.31.0-py3-none-any.whl
+pip3 install -r requirements.txt
\ No newline at end of file
diff --git a/models/multimodal/diffusion/stable-diffusion/demo.py b/models/multimodal/diffusion/stable-diffusion/demo.py
new file mode 100644
index 0000000000000000000000000000000000000000..ce3cdf398e29b4663c938ed46ea19dd74400d43b
--- /dev/null
+++ b/models/multimodal/diffusion/stable-diffusion/demo.py
@@ -0,0 +1,33 @@
+import random
+import time
+import numpy as np
+import torch
+from torchvision.utils import save_image
+from diffusers import StableDiffusionPipeline
+
+
+def setup_seed(seed):
+    torch.manual_seed(seed)
+    torch.cuda.manual_seed_all(seed)
+    np.random.seed(seed)
+    random.seed(seed)
+    torch.backends.cudnn.deterministic = True
+
+# Set the random seed for reproducibility
+setup_seed(20)
+
+pipe = StableDiffusionPipeline.from_pretrained("data/stable-diffusion-v1-5", torch_dtype=torch.float16)
+pipe.safety_checker = None
+pipe = pipe.to("cuda")
+prompt = "A raccoon wearing formal clothes, wearing a tophat and holding a cane"
+print("Running a single 50-step generation (no separate warm-up run)...")
+wh = 1024
+batch_size = 1
+print(f'height={wh}, width={wh}, prompt={prompt}, batch_size={batch_size}')
+start_time = time.time()
+image = pipe(
+    prompt, output_type="pt", return_dict=True, height=wh, width=wh, num_images_per_prompt=batch_size, num_inference_steps=50, guidance_scale=7.0
+).images[0]
+use_time = time.time() - start_time
+print("time: {:.2f} seconds".format(use_time))
+save_image(image, "demo.png")
\ No newline at end of file
diff --git a/models/multimodal/diffusion/stable-diffusion/requirements.txt b/models/multimodal/diffusion/stable-diffusion/requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..2cbf12c60d5945cf14ebaec9a7348fab5400bade
--- /dev/null
+++ b/models/multimodal/diffusion/stable-diffusion/requirements.txt
@@ -0,0 +1,22 @@
+contourpy==1.2.1
+cycler==0.12.1
+et-xmlfile==1.1.0
+fonttools==4.53.0
+kiwisolver==1.4.5
+matplotlib==3.9.0
+numpy==1.26.4
+opencv-python==4.10.0.82
+openpyxl==3.1.4
+packaging==24.1
+pandas==2.2.2
+pillow==10.3.0
+pyparsing==3.1.2
+python-dateutil==2.9.0.post0
+pytz==2024.1
+scipy==1.13.1
+six==1.16.0
+tzdata==2024.1
+transformers==4.39.3
+accelerate==0.29.0
+peft==0.13.2
+safetensors
\ No newline at end of file
diff --git a/models/nlp/large_language_model/deepseek-r1-distill-llama-70b/vllm/README.md b/models/nlp/large_language_model/deepseek-r1-distill-llama-70b/vllm/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..26049a59f9beae20d1382ba6b597b4f8f8c6808f
--- /dev/null
+++ b/models/nlp/large_language_model/deepseek-r1-distill-llama-70b/vllm/README.md
@@ -0,0 +1,42 @@
+# DeepSeek-R1-Distill-Llama-70B
+
+## Description
+
+DeepSeek-R1-Distill models are fine-tuned based on open-source models, using samples generated by DeepSeek-R1. We slightly change their configs and tokenizers. We open-source distilled 1.5B, 7B, 8B, 14B, 32B, and 70B checkpoints based on Qwen2.5 and Llama3 series to the community.
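+
+Both an offline inference script and an OpenAI-compatible serving command are provided below. As an illustration, once the server from the "Online Serving" section is running, it can be queried with the `openai` Python client. This is a minimal sketch under a few assumptions: the `openai` package is installed, the server listens on the default port 8000, and the served model name defaults to the `--model` path (it can be overridden with `--served-model-name`).
+
+```python
+from openai import OpenAI
+
+# vLLM's OpenAI-compatible server does not check the API key unless one is configured.
+client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")
+resp = client.chat.completions.create(
+    model="data/DeepSeek-R1-Distill-Llama-70B",
+    messages=[{"role": "user", "content": "Briefly explain what model distillation is."}],
+    max_tokens=256,
+)
+print(resp.choices[0].message.content)
+```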
+
+## Setup
+
+### Install
+
+```bash
+# Install libGL
+## CentOS
+yum install -y mesa-libGL
+## Ubuntu
+apt install -y libgl1-mesa-glx
+```
+
+### Download
+
+Download the DeepSeek-R1-Distill-Llama-70B model weights, then link them into `data/`:
+
+```bash
+cd deepseek-r1-distill-llama-70b/vllm
+mkdir -p data/
+ln -s /path/to/DeepSeek-R1-Distill-Llama-70B ./data/
+```
+
+## Offline Inference
+
+```bash
+python3 offline_inference.py --model ./data/DeepSeek-R1-Distill-Llama-70B --max-tokens 256 -tp 8 --temperature 0.0 --max-model-len 3096
+```
+
+## Online Serving
+
+```bash
+vllm serve data/DeepSeek-R1-Distill-Llama-70B --tensor-parallel-size 8 --max-model-len 32768 --enforce-eager --trust-remote-code
+```
+
+## Reference
+
+[DeepSeek-R1](https://github.com/deepseek-ai/DeepSeek-R1)
diff --git a/models/nlp/large_language_model/deepseek-r1-distill-llama-70b/vllm/ci/prepare.sh b/models/nlp/large_language_model/deepseek-r1-distill-llama-70b/vllm/ci/prepare.sh
new file mode 100644
index 0000000000000000000000000000000000000000..75fb19458942e1d61a674c1d6dd9bbdb521bc00c
--- /dev/null
+++ b/models/nlp/large_language_model/deepseek-r1-distill-llama-70b/vllm/ci/prepare.sh
@@ -0,0 +1,25 @@
+#!/bin/bash
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+ +set -x +ID=$(grep -oP '(?<=^ID=).+' /etc/os-release | tr -d '"') +if [[ ${ID} == "ubuntu" ]]; then + apt install -y libgl1-mesa-glx +elif [[ ${ID} == "centos" ]]; then + yum install -y mesa-libGL +else + echo "Not Support Os" +fi diff --git a/models/nlp/large_language_model/deepseek-r1-distill-llama-70b/vllm/offline_inference.py b/models/nlp/large_language_model/deepseek-r1-distill-llama-70b/vllm/offline_inference.py new file mode 100644 index 0000000000000000000000000000000000000000..9b7d87fd797c78fcedba7cd4c9a9a0e7642c251f --- /dev/null +++ b/models/nlp/large_language_model/deepseek-r1-distill-llama-70b/vllm/offline_inference.py @@ -0,0 +1,115 @@ +import sys +from pathlib import Path +import os + +sys.path.append(str(Path(__file__).resolve().parent.parent.parent)) +import argparse +import dataclasses +import inspect +import logging +import time + +import torch +from utils import load_chat_template, sampling_add_cli_args +from vllm import LLM, EngineArgs, SamplingParams + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--chat_template", type=str, default=None) + parser.add_argument( + "--remove_chat_template", + default=False, + action="store_true", + help="pass this if you are not use a chat model", + ) + parser = EngineArgs.add_cli_args(parser) + parser = sampling_add_cli_args(parser) + args = parser.parse_args() + + engine_args = [attr.name for attr in dataclasses.fields(EngineArgs)] + sampling_args = [ + param.name + for param in list( + inspect.signature(SamplingParams).parameters.values() + ) + ] + engine_params = {attr: getattr(args, attr) for attr in engine_args} + sampling_params = { + attr: getattr(args, attr) for attr in sampling_args if args.__contains__(attr) + } + + model_name = os.path.dirname(args.model).rsplit("/")[-1] + + # Sample prompts. + prompts = ["哪些迹象可能表明一个人正在经历焦虑?", "描述一下如何制作芝士披萨。", "写一篇有关5G网络研发的综述文章。"] + + # Create a sampling params object. + sampling_params = SamplingParams(**sampling_params) + + # Create an LLM. + llm = LLM(**engine_params) + + # process chat template + if args.remove_chat_template: + if "chat" in model_name.lower(): + logging.warning( + f"The model name from model path is {model_name}, so we guess you are using the chat model and the additional processing is required for the input prompt. " + f"If the result is not quite correct, please ensure you do not pass --remove_chat_template in CLI." + ) + prompts_new = prompts + else: + # Build chat model promopt + logging.warning( + "If you are using a non chat model, please pass the --remove_chat_template in CLI." + ) + # Try use transformers's apply_chat_template, if chat_template is None, will use defalut template. + # For some old models, the default template may cause bad answers. we don't consider this situation, + # because the Transformers team is advancing the chat template. For more informatino about it, + # please refer to https://huggingface.co/docs/transformers/main/chat_templating + try: + load_chat_template(llm.get_tokenizer(), args.chat_template) + prompts_new = [] + for prompt in prompts: + messages = [{"role": "user", "content": prompt}] + text = llm.get_tokenizer().apply_chat_template( + messages, tokenize=False, add_generation_prompt=True + ) + prompts_new.append(text) + except: + logging.warning( + "use tokenizer apply_chat_template function failed, may because of low transformers version...(try use transformers>=4.34.0)" + ) + prompts_new = prompts + + # Generate texts from the prompts. 
The output is a list of RequestOutput objects
+    # that contain the prompt, generated text, and other information.
+    outputs = (
+        llm.generate(prompts_new, sampling_params, use_tqdm=False)
+        if isinstance(prompts_new[0], str)
+        else llm.generate(
+            sampling_params=sampling_params,
+            prompt_token_ids=prompts_new,
+            use_tqdm=False,
+        )
+    )
+    torch.cuda.synchronize()
+
+    start_time = time.perf_counter()
+    outputs = (
+        llm.generate(prompts_new, sampling_params)
+        if isinstance(prompts_new[0], str)
+        else llm.generate(sampling_params=sampling_params, prompt_token_ids=prompts_new)
+    )
+    torch.cuda.synchronize()
+    end_time = time.perf_counter()
+    duration_time = end_time - start_time
+
+    num_tokens = 0
+    # Print the outputs.
+    for i, output in enumerate(outputs):
+        prompt = prompts[i]  # show the original prompt; the actual prompt sent to the engine is output.prompt
+        generated_text = output.outputs[0].text
+
+        num_tokens += len(output.outputs[0].token_ids)
+        print(f"Prompt: {prompt}\nGenerated text: {generated_text} \n")
+    print(f"tokens: {num_tokens}, QPS: {num_tokens/duration_time}")
diff --git a/models/nlp/large_language_model/deepseek-r1-distill-llama-8b/vllm/README.md b/models/nlp/large_language_model/deepseek-r1-distill-llama-8b/vllm/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..903a7a160a1e77f7ff35a1f320b810477e7a5455
--- /dev/null
+++ b/models/nlp/large_language_model/deepseek-r1-distill-llama-8b/vllm/README.md
@@ -0,0 +1,48 @@
+# DeepSeek-R1-Distill-Llama-8B
+
+## Description
+
+DeepSeek-R1-Distill models are fine-tuned based on open-source models, using samples generated by DeepSeek-R1. We slightly change their configs and tokenizers. We open-source distilled 1.5B, 7B, 8B, 14B, 32B, and 70B checkpoints based on Qwen2.5 and Llama3 series to the community.
+
+## Setup
+
+### Install
+
+```bash
+# Install libGL
+## CentOS
+yum install -y mesa-libGL
+## Ubuntu
+apt install -y libgl1-mesa-glx
+```
+
+### Download
+
+Download the DeepSeek-R1-Distill-Llama-8B model weights, then link them into `data/`:
+
+```bash
+cd deepseek-r1-distill-llama-8b/vllm
+mkdir -p data/
+ln -s /path/to/DeepSeek-R1-Distill-Llama-8B ./data/
+```
+
+## Offline Inference
+
+```bash
+python3 offline_inference.py --model ./data/DeepSeek-R1-Distill-Llama-8B --max-tokens 256 -tp 1 --temperature 0.0 --max-model-len 3096
+```
+
+## Online Serving
+
+```bash
+vllm serve data/DeepSeek-R1-Distill-Llama-8B --tensor-parallel-size 2 --max-model-len 32768 --enforce-eager --trust-remote-code
+```
+
+## Results
+
+QPS here denotes generated tokens per second, i.e. the throughput metric printed by `offline_inference.py`.
+
+| Model | QPS |
+| ---------- | ----- |
+| DeepSeek-R1-Distill-Llama-8B | 105.33 |
+
+## Reference
+
+[DeepSeek-R1](https://github.com/deepseek-ai/DeepSeek-R1)
diff --git a/models/nlp/large_language_model/deepseek-r1-distill-llama-8b/vllm/ci/prepare.sh b/models/nlp/large_language_model/deepseek-r1-distill-llama-8b/vllm/ci/prepare.sh
new file mode 100644
index 0000000000000000000000000000000000000000..75fb19458942e1d61a674c1d6dd9bbdb521bc00c
--- /dev/null
+++ b/models/nlp/large_language_model/deepseek-r1-distill-llama-8b/vllm/ci/prepare.sh
@@ -0,0 +1,25 @@
+#!/bin/bash
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License.
You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +set -x +ID=$(grep -oP '(?<=^ID=).+' /etc/os-release | tr -d '"') +if [[ ${ID} == "ubuntu" ]]; then + apt install -y libgl1-mesa-glx +elif [[ ${ID} == "centos" ]]; then + yum install -y mesa-libGL +else + echo "Not Support Os" +fi diff --git a/models/nlp/large_language_model/deepseek-r1-distill-llama-8b/vllm/offline_inference.py b/models/nlp/large_language_model/deepseek-r1-distill-llama-8b/vllm/offline_inference.py new file mode 100644 index 0000000000000000000000000000000000000000..9b7d87fd797c78fcedba7cd4c9a9a0e7642c251f --- /dev/null +++ b/models/nlp/large_language_model/deepseek-r1-distill-llama-8b/vllm/offline_inference.py @@ -0,0 +1,115 @@ +import sys +from pathlib import Path +import os + +sys.path.append(str(Path(__file__).resolve().parent.parent.parent)) +import argparse +import dataclasses +import inspect +import logging +import time + +import torch +from utils import load_chat_template, sampling_add_cli_args +from vllm import LLM, EngineArgs, SamplingParams + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--chat_template", type=str, default=None) + parser.add_argument( + "--remove_chat_template", + default=False, + action="store_true", + help="pass this if you are not use a chat model", + ) + parser = EngineArgs.add_cli_args(parser) + parser = sampling_add_cli_args(parser) + args = parser.parse_args() + + engine_args = [attr.name for attr in dataclasses.fields(EngineArgs)] + sampling_args = [ + param.name + for param in list( + inspect.signature(SamplingParams).parameters.values() + ) + ] + engine_params = {attr: getattr(args, attr) for attr in engine_args} + sampling_params = { + attr: getattr(args, attr) for attr in sampling_args if args.__contains__(attr) + } + + model_name = os.path.dirname(args.model).rsplit("/")[-1] + + # Sample prompts. + prompts = ["哪些迹象可能表明一个人正在经历焦虑?", "描述一下如何制作芝士披萨。", "写一篇有关5G网络研发的综述文章。"] + + # Create a sampling params object. + sampling_params = SamplingParams(**sampling_params) + + # Create an LLM. + llm = LLM(**engine_params) + + # process chat template + if args.remove_chat_template: + if "chat" in model_name.lower(): + logging.warning( + f"The model name from model path is {model_name}, so we guess you are using the chat model and the additional processing is required for the input prompt. " + f"If the result is not quite correct, please ensure you do not pass --remove_chat_template in CLI." + ) + prompts_new = prompts + else: + # Build chat model promopt + logging.warning( + "If you are using a non chat model, please pass the --remove_chat_template in CLI." + ) + # Try use transformers's apply_chat_template, if chat_template is None, will use defalut template. + # For some old models, the default template may cause bad answers. we don't consider this situation, + # because the Transformers team is advancing the chat template. 
For more informatino about it, + # please refer to https://huggingface.co/docs/transformers/main/chat_templating + try: + load_chat_template(llm.get_tokenizer(), args.chat_template) + prompts_new = [] + for prompt in prompts: + messages = [{"role": "user", "content": prompt}] + text = llm.get_tokenizer().apply_chat_template( + messages, tokenize=False, add_generation_prompt=True + ) + prompts_new.append(text) + except: + logging.warning( + "use tokenizer apply_chat_template function failed, may because of low transformers version...(try use transformers>=4.34.0)" + ) + prompts_new = prompts + + # Generate texts from the prompts. The output is a list of RequestOutput objects + # that contain the prompt, generated text, and other information. + outputs = ( + llm.generate(prompts_new, sampling_params, use_tqdm=False) + if isinstance(prompts_new[0], str) + else llm.generate( + sampling_params=sampling_params, + prompt_token_ids=prompts_new, + use_tqdm=False, + ) + ) + torch.cuda.synchronize() + + start_time = time.perf_counter() + outputs = ( + llm.generate(prompts_new, sampling_params) + if isinstance(prompts_new[0], str) + else llm.generate(sampling_params=sampling_params, prompt_token_ids=prompts_new) + ) + torch.cuda.synchronize() + end_time = time.perf_counter() + duration_time = end_time - start_time + + num_tokens = 0 + # Print the outputs. + for i, output in enumerate(outputs): + prompt = prompts[i] # show the origin prompt. actully prompt is "output.prompt" + generated_text = output.outputs[0].text + + num_tokens += len(output.outputs[0].token_ids) + print(f"Prompt: {prompt}\nGenerated text: {generated_text} \n") + print(f"tokens: {num_tokens}, QPS: {num_tokens/duration_time}") diff --git a/models/nlp/large_language_model/deepseek-r1-distill-qwen-1.5b/vllm/README.md b/models/nlp/large_language_model/deepseek-r1-distill-qwen-1.5b/vllm/README.md new file mode 100644 index 0000000000000000000000000000000000000000..c52fc0f55f7692753cea62ebd1d171f80a739136 --- /dev/null +++ b/models/nlp/large_language_model/deepseek-r1-distill-qwen-1.5b/vllm/README.md @@ -0,0 +1,48 @@ +# DeepSeek-R1-Distill-Qwen-1.5B + +## Description + +DeepSeek-R1-Distill models are fine-tuned based on open-source models, using samples generated by DeepSeek-R1. We slightly change their configs and tokenizers. We open-source distilled 1.5B, 7B, 8B, 14B, 32B, and 70B checkpoints based on Qwen2.5 and Llama3 series to the community. 
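+
+`offline_inference.py` drives vLLM through its `LLM` and `SamplingParams` APIs behind a set of CLI flags. Stripped of the argument parsing and chat-template handling, the core calls look roughly like the sketch below, which assumes the weights have been linked under `./data/` as described in the Download section that follows.
+
+```python
+from vllm import LLM, SamplingParams
+
+# Mirrors the flags used in the offline command below: -tp 1, --max-model-len 3096,
+# --temperature 0.0 and --max-tokens 256.
+llm = LLM(model="./data/DeepSeek-R1-Distill-Qwen-1.5B", tensor_parallel_size=1, max_model_len=3096)
+sampling = SamplingParams(temperature=0.0, max_tokens=256)
+outputs = llm.generate(["What does knowledge distillation mean?"], sampling)
+print(outputs[0].outputs[0].text)
+```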
+
+## Setup
+
+### Install
+
+```bash
+# Install libGL
+## CentOS
+yum install -y mesa-libGL
+## Ubuntu
+apt install -y libgl1-mesa-glx
+```
+
+### Download
+
+Download the DeepSeek-R1-Distill-Qwen-1.5B model weights, then link them into `data/`:
+
+```bash
+cd deepseek-r1-distill-qwen-1.5b/vllm
+mkdir -p data/
+ln -s /path/to/DeepSeek-R1-Distill-Qwen-1.5B ./data/
+```
+
+## Offline Inference
+
+```bash
+python3 offline_inference.py --model ./data/DeepSeek-R1-Distill-Qwen-1.5B --max-tokens 256 -tp 1 --temperature 0.0 --max-model-len 3096
+```
+
+## Online Serving
+
+```bash
+vllm serve data/DeepSeek-R1-Distill-Qwen-1.5B --tensor-parallel-size 2 --max-model-len 32768 --enforce-eager --trust-remote-code
+```
+
+## Results
+
+QPS here denotes generated tokens per second, i.e. the throughput metric printed by `offline_inference.py`.
+
+| Model | QPS |
+| ---------- | ----- |
+| DeepSeek-R1-Distill-Qwen-1.5B | 259.42 |
+
+## Reference
+
+[DeepSeek-R1](https://github.com/deepseek-ai/DeepSeek-R1)
diff --git a/models/nlp/large_language_model/deepseek-r1-distill-qwen-1.5b/vllm/ci/prepare.sh b/models/nlp/large_language_model/deepseek-r1-distill-qwen-1.5b/vllm/ci/prepare.sh
new file mode 100644
index 0000000000000000000000000000000000000000..75fb19458942e1d61a674c1d6dd9bbdb521bc00c
--- /dev/null
+++ b/models/nlp/large_language_model/deepseek-r1-distill-qwen-1.5b/vllm/ci/prepare.sh
@@ -0,0 +1,25 @@
+#!/bin/bash
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+ +set -x +ID=$(grep -oP '(?<=^ID=).+' /etc/os-release | tr -d '"') +if [[ ${ID} == "ubuntu" ]]; then + apt install -y libgl1-mesa-glx +elif [[ ${ID} == "centos" ]]; then + yum install -y mesa-libGL +else + echo "Not Support Os" +fi diff --git a/models/nlp/large_language_model/deepseek-r1-distill-qwen-1.5b/vllm/offline_inference.py b/models/nlp/large_language_model/deepseek-r1-distill-qwen-1.5b/vllm/offline_inference.py new file mode 100644 index 0000000000000000000000000000000000000000..9b7d87fd797c78fcedba7cd4c9a9a0e7642c251f --- /dev/null +++ b/models/nlp/large_language_model/deepseek-r1-distill-qwen-1.5b/vllm/offline_inference.py @@ -0,0 +1,115 @@ +import sys +from pathlib import Path +import os + +sys.path.append(str(Path(__file__).resolve().parent.parent.parent)) +import argparse +import dataclasses +import inspect +import logging +import time + +import torch +from utils import load_chat_template, sampling_add_cli_args +from vllm import LLM, EngineArgs, SamplingParams + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--chat_template", type=str, default=None) + parser.add_argument( + "--remove_chat_template", + default=False, + action="store_true", + help="pass this if you are not use a chat model", + ) + parser = EngineArgs.add_cli_args(parser) + parser = sampling_add_cli_args(parser) + args = parser.parse_args() + + engine_args = [attr.name for attr in dataclasses.fields(EngineArgs)] + sampling_args = [ + param.name + for param in list( + inspect.signature(SamplingParams).parameters.values() + ) + ] + engine_params = {attr: getattr(args, attr) for attr in engine_args} + sampling_params = { + attr: getattr(args, attr) for attr in sampling_args if args.__contains__(attr) + } + + model_name = os.path.dirname(args.model).rsplit("/")[-1] + + # Sample prompts. + prompts = ["哪些迹象可能表明一个人正在经历焦虑?", "描述一下如何制作芝士披萨。", "写一篇有关5G网络研发的综述文章。"] + + # Create a sampling params object. + sampling_params = SamplingParams(**sampling_params) + + # Create an LLM. + llm = LLM(**engine_params) + + # process chat template + if args.remove_chat_template: + if "chat" in model_name.lower(): + logging.warning( + f"The model name from model path is {model_name}, so we guess you are using the chat model and the additional processing is required for the input prompt. " + f"If the result is not quite correct, please ensure you do not pass --remove_chat_template in CLI." + ) + prompts_new = prompts + else: + # Build chat model promopt + logging.warning( + "If you are using a non chat model, please pass the --remove_chat_template in CLI." + ) + # Try use transformers's apply_chat_template, if chat_template is None, will use defalut template. + # For some old models, the default template may cause bad answers. we don't consider this situation, + # because the Transformers team is advancing the chat template. For more informatino about it, + # please refer to https://huggingface.co/docs/transformers/main/chat_templating + try: + load_chat_template(llm.get_tokenizer(), args.chat_template) + prompts_new = [] + for prompt in prompts: + messages = [{"role": "user", "content": prompt}] + text = llm.get_tokenizer().apply_chat_template( + messages, tokenize=False, add_generation_prompt=True + ) + prompts_new.append(text) + except: + logging.warning( + "use tokenizer apply_chat_template function failed, may because of low transformers version...(try use transformers>=4.34.0)" + ) + prompts_new = prompts + + # Generate texts from the prompts. 
The output is a list of RequestOutput objects + # that contain the prompt, generated text, and other information. + outputs = ( + llm.generate(prompts_new, sampling_params, use_tqdm=False) + if isinstance(prompts_new[0], str) + else llm.generate( + sampling_params=sampling_params, + prompt_token_ids=prompts_new, + use_tqdm=False, + ) + ) + torch.cuda.synchronize() + + start_time = time.perf_counter() + outputs = ( + llm.generate(prompts_new, sampling_params) + if isinstance(prompts_new[0], str) + else llm.generate(sampling_params=sampling_params, prompt_token_ids=prompts_new) + ) + torch.cuda.synchronize() + end_time = time.perf_counter() + duration_time = end_time - start_time + + num_tokens = 0 + # Print the outputs. + for i, output in enumerate(outputs): + prompt = prompts[i] # show the origin prompt. actully prompt is "output.prompt" + generated_text = output.outputs[0].text + + num_tokens += len(output.outputs[0].token_ids) + print(f"Prompt: {prompt}\nGenerated text: {generated_text} \n") + print(f"tokens: {num_tokens}, QPS: {num_tokens/duration_time}") diff --git a/models/nlp/large_language_model/deepseek-r1-distill-qwen-14b/vllm/README.md b/models/nlp/large_language_model/deepseek-r1-distill-qwen-14b/vllm/README.md new file mode 100644 index 0000000000000000000000000000000000000000..7d20eeeec6f29e20af0a1aed438f5f6a26f92986 --- /dev/null +++ b/models/nlp/large_language_model/deepseek-r1-distill-qwen-14b/vllm/README.md @@ -0,0 +1,48 @@ +# DeepSeek-R1-Distill-Qwen-14B + +## Description + +DeepSeek-R1-Distill models are fine-tuned based on open-source models, using samples generated by DeepSeek-R1. We slightly change their configs and tokenizers. We open-source distilled 1.5B, 7B, 8B, 14B, 32B, and 70B checkpoints based on Qwen2.5 and Llama3 series to the community. + +## Setup + +### Install + +```bash +# Install libGL +## CentOS +yum install -y mesa-libGL +## Ubuntu +apt install -y libgl1-mesa-glx +``` + +### Download + +-Model: + +```bash +cd deepseek-r1-distill-qwen-14b/vllm +mkdir -p data/ +ln -s /path/to/DeepSeek-R1-Distill-Qwen-14B ./data/ +``` + +## Inference with offline + +```bash +python3 offline_inference.py --model ./data/DeepSeek-R1-Distill-Qwen-14B --max-tokens 256 -tp 2 --temperature 0.0 --max-model-len 3096 +``` +## Inference with serve + +```bash +vllm serve data/DeepSeek-R1-Distill-Qwen-14B --tensor-parallel-size 2 --max-model-len 32768 --enforce-eager --trust-remote-code +``` + +## Results + +| Model | QPS | +| ---------- | ----- | +| DeepSeek-R1-Distill-Qwen-14B | 88.01| + +## Reference + +[DeepSeek-R1](https://github.com/deepseek-ai/DeepSeek-R1) diff --git a/models/nlp/large_language_model/deepseek-r1-distill-qwen-14b/vllm/ci/prepare.sh b/models/nlp/large_language_model/deepseek-r1-distill-qwen-14b/vllm/ci/prepare.sh new file mode 100644 index 0000000000000000000000000000000000000000..75fb19458942e1d61a674c1d6dd9bbdb521bc00c --- /dev/null +++ b/models/nlp/large_language_model/deepseek-r1-distill-qwen-14b/vllm/ci/prepare.sh @@ -0,0 +1,25 @@ +#!/bin/bash +# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. 
You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +set -x +ID=$(grep -oP '(?<=^ID=).+' /etc/os-release | tr -d '"') +if [[ ${ID} == "ubuntu" ]]; then + apt install -y libgl1-mesa-glx +elif [[ ${ID} == "centos" ]]; then + yum install -y mesa-libGL +else + echo "Not Support Os" +fi diff --git a/models/nlp/large_language_model/deepseek-r1-distill-qwen-14b/vllm/offline_inference.py b/models/nlp/large_language_model/deepseek-r1-distill-qwen-14b/vllm/offline_inference.py new file mode 100644 index 0000000000000000000000000000000000000000..9b7d87fd797c78fcedba7cd4c9a9a0e7642c251f --- /dev/null +++ b/models/nlp/large_language_model/deepseek-r1-distill-qwen-14b/vllm/offline_inference.py @@ -0,0 +1,115 @@ +import sys +from pathlib import Path +import os + +sys.path.append(str(Path(__file__).resolve().parent.parent.parent)) +import argparse +import dataclasses +import inspect +import logging +import time + +import torch +from utils import load_chat_template, sampling_add_cli_args +from vllm import LLM, EngineArgs, SamplingParams + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--chat_template", type=str, default=None) + parser.add_argument( + "--remove_chat_template", + default=False, + action="store_true", + help="pass this if you are not use a chat model", + ) + parser = EngineArgs.add_cli_args(parser) + parser = sampling_add_cli_args(parser) + args = parser.parse_args() + + engine_args = [attr.name for attr in dataclasses.fields(EngineArgs)] + sampling_args = [ + param.name + for param in list( + inspect.signature(SamplingParams).parameters.values() + ) + ] + engine_params = {attr: getattr(args, attr) for attr in engine_args} + sampling_params = { + attr: getattr(args, attr) for attr in sampling_args if args.__contains__(attr) + } + + model_name = os.path.dirname(args.model).rsplit("/")[-1] + + # Sample prompts. + prompts = ["哪些迹象可能表明一个人正在经历焦虑?", "描述一下如何制作芝士披萨。", "写一篇有关5G网络研发的综述文章。"] + + # Create a sampling params object. + sampling_params = SamplingParams(**sampling_params) + + # Create an LLM. + llm = LLM(**engine_params) + + # process chat template + if args.remove_chat_template: + if "chat" in model_name.lower(): + logging.warning( + f"The model name from model path is {model_name}, so we guess you are using the chat model and the additional processing is required for the input prompt. " + f"If the result is not quite correct, please ensure you do not pass --remove_chat_template in CLI." + ) + prompts_new = prompts + else: + # Build chat model promopt + logging.warning( + "If you are using a non chat model, please pass the --remove_chat_template in CLI." + ) + # Try use transformers's apply_chat_template, if chat_template is None, will use defalut template. + # For some old models, the default template may cause bad answers. we don't consider this situation, + # because the Transformers team is advancing the chat template. 
For more informatino about it, + # please refer to https://huggingface.co/docs/transformers/main/chat_templating + try: + load_chat_template(llm.get_tokenizer(), args.chat_template) + prompts_new = [] + for prompt in prompts: + messages = [{"role": "user", "content": prompt}] + text = llm.get_tokenizer().apply_chat_template( + messages, tokenize=False, add_generation_prompt=True + ) + prompts_new.append(text) + except: + logging.warning( + "use tokenizer apply_chat_template function failed, may because of low transformers version...(try use transformers>=4.34.0)" + ) + prompts_new = prompts + + # Generate texts from the prompts. The output is a list of RequestOutput objects + # that contain the prompt, generated text, and other information. + outputs = ( + llm.generate(prompts_new, sampling_params, use_tqdm=False) + if isinstance(prompts_new[0], str) + else llm.generate( + sampling_params=sampling_params, + prompt_token_ids=prompts_new, + use_tqdm=False, + ) + ) + torch.cuda.synchronize() + + start_time = time.perf_counter() + outputs = ( + llm.generate(prompts_new, sampling_params) + if isinstance(prompts_new[0], str) + else llm.generate(sampling_params=sampling_params, prompt_token_ids=prompts_new) + ) + torch.cuda.synchronize() + end_time = time.perf_counter() + duration_time = end_time - start_time + + num_tokens = 0 + # Print the outputs. + for i, output in enumerate(outputs): + prompt = prompts[i] # show the origin prompt. actully prompt is "output.prompt" + generated_text = output.outputs[0].text + + num_tokens += len(output.outputs[0].token_ids) + print(f"Prompt: {prompt}\nGenerated text: {generated_text} \n") + print(f"tokens: {num_tokens}, QPS: {num_tokens/duration_time}") diff --git a/models/nlp/large_language_model/deepseek-r1-distill-qwen-32b/vllm/README.md b/models/nlp/large_language_model/deepseek-r1-distill-qwen-32b/vllm/README.md new file mode 100644 index 0000000000000000000000000000000000000000..e1c8ca81e97b877136c80e293816845e75c1dec0 --- /dev/null +++ b/models/nlp/large_language_model/deepseek-r1-distill-qwen-32b/vllm/README.md @@ -0,0 +1,48 @@ +# DeepSeek-R1-Distill-Qwen-32B + +## Description + +DeepSeek-R1-Distill models are fine-tuned based on open-source models, using samples generated by DeepSeek-R1. We slightly change their configs and tokenizers. We open-source distilled 1.5B, 7B, 8B, 14B, 32B, and 70B checkpoints based on Qwen2.5 and Llama3 series to the community. 
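+
+The QPS figure in the Results section below is produced by `offline_inference.py`, which counts the tokens generated for all prompts and divides by the wall-clock time of the timed `llm.generate()` call. A small helper equivalent to that computation (named here for illustration only) is:
+
+```python
+def tokens_per_second(outputs, duration_time: float) -> float:
+    """Generated tokens per second, matching the 'QPS' metric printed by offline_inference.py."""
+    num_tokens = sum(len(output.outputs[0].token_ids) for output in outputs)
+    return num_tokens / duration_time
+```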
+
+## Setup
+
+### Install
+
+```bash
+# Install libGL
+## CentOS
+yum install -y mesa-libGL
+## Ubuntu
+apt install -y libgl1-mesa-glx
+```
+
+### Download
+
+Download the DeepSeek-R1-Distill-Qwen-32B model weights, then link them into `data/`:
+
+```bash
+cd deepseek-r1-distill-qwen-32b/vllm
+mkdir -p data/
+ln -s /path/to/DeepSeek-R1-Distill-Qwen-32B ./data/
+```
+
+## Offline Inference
+
+```bash
+python3 offline_inference.py --model ./data/DeepSeek-R1-Distill-Qwen-32B --max-tokens 256 -tp 4 --temperature 0.0 --max-model-len 3096
+```
+
+## Online Serving
+
+```bash
+vllm serve data/DeepSeek-R1-Distill-Qwen-32B --tensor-parallel-size 4 --max-model-len 32768 --enforce-eager --trust-remote-code
+```
+
+## Results
+
+| Model | QPS |
+| ---------- | ----- |
+| DeepSeek-R1-Distill-Qwen-32B | 68.30 |
+
+## Reference
+
+[DeepSeek-R1](https://github.com/deepseek-ai/DeepSeek-R1)
diff --git a/models/nlp/large_language_model/deepseek-r1-distill-qwen-32b/vllm/ci/prepare.sh b/models/nlp/large_language_model/deepseek-r1-distill-qwen-32b/vllm/ci/prepare.sh
new file mode 100644
index 0000000000000000000000000000000000000000..75fb19458942e1d61a674c1d6dd9bbdb521bc00c
--- /dev/null
+++ b/models/nlp/large_language_model/deepseek-r1-distill-qwen-32b/vllm/ci/prepare.sh
@@ -0,0 +1,25 @@
+#!/bin/bash
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
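+# Install the libGL runtime, selecting apt or yum based on the OS ID parsed from /etc/os-release.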
+ +set -x +ID=$(grep -oP '(?<=^ID=).+' /etc/os-release | tr -d '"') +if [[ ${ID} == "ubuntu" ]]; then + apt install -y libgl1-mesa-glx +elif [[ ${ID} == "centos" ]]; then + yum install -y mesa-libGL +else + echo "Not Support Os" +fi diff --git a/models/nlp/large_language_model/deepseek-r1-distill-qwen-32b/vllm/offline_inference.py b/models/nlp/large_language_model/deepseek-r1-distill-qwen-32b/vllm/offline_inference.py new file mode 100644 index 0000000000000000000000000000000000000000..9b7d87fd797c78fcedba7cd4c9a9a0e7642c251f --- /dev/null +++ b/models/nlp/large_language_model/deepseek-r1-distill-qwen-32b/vllm/offline_inference.py @@ -0,0 +1,115 @@ +import sys +from pathlib import Path +import os + +sys.path.append(str(Path(__file__).resolve().parent.parent.parent)) +import argparse +import dataclasses +import inspect +import logging +import time + +import torch +from utils import load_chat_template, sampling_add_cli_args +from vllm import LLM, EngineArgs, SamplingParams + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--chat_template", type=str, default=None) + parser.add_argument( + "--remove_chat_template", + default=False, + action="store_true", + help="pass this if you are not use a chat model", + ) + parser = EngineArgs.add_cli_args(parser) + parser = sampling_add_cli_args(parser) + args = parser.parse_args() + + engine_args = [attr.name for attr in dataclasses.fields(EngineArgs)] + sampling_args = [ + param.name + for param in list( + inspect.signature(SamplingParams).parameters.values() + ) + ] + engine_params = {attr: getattr(args, attr) for attr in engine_args} + sampling_params = { + attr: getattr(args, attr) for attr in sampling_args if args.__contains__(attr) + } + + model_name = os.path.dirname(args.model).rsplit("/")[-1] + + # Sample prompts. + prompts = ["哪些迹象可能表明一个人正在经历焦虑?", "描述一下如何制作芝士披萨。", "写一篇有关5G网络研发的综述文章。"] + + # Create a sampling params object. + sampling_params = SamplingParams(**sampling_params) + + # Create an LLM. + llm = LLM(**engine_params) + + # process chat template + if args.remove_chat_template: + if "chat" in model_name.lower(): + logging.warning( + f"The model name from model path is {model_name}, so we guess you are using the chat model and the additional processing is required for the input prompt. " + f"If the result is not quite correct, please ensure you do not pass --remove_chat_template in CLI." + ) + prompts_new = prompts + else: + # Build chat model promopt + logging.warning( + "If you are using a non chat model, please pass the --remove_chat_template in CLI." + ) + # Try use transformers's apply_chat_template, if chat_template is None, will use defalut template. + # For some old models, the default template may cause bad answers. we don't consider this situation, + # because the Transformers team is advancing the chat template. For more informatino about it, + # please refer to https://huggingface.co/docs/transformers/main/chat_templating + try: + load_chat_template(llm.get_tokenizer(), args.chat_template) + prompts_new = [] + for prompt in prompts: + messages = [{"role": "user", "content": prompt}] + text = llm.get_tokenizer().apply_chat_template( + messages, tokenize=False, add_generation_prompt=True + ) + prompts_new.append(text) + except: + logging.warning( + "use tokenizer apply_chat_template function failed, may because of low transformers version...(try use transformers>=4.34.0)" + ) + prompts_new = prompts + + # Generate texts from the prompts. 
The output is a list of RequestOutput objects + # that contain the prompt, generated text, and other information. + outputs = ( + llm.generate(prompts_new, sampling_params, use_tqdm=False) + if isinstance(prompts_new[0], str) + else llm.generate( + sampling_params=sampling_params, + prompt_token_ids=prompts_new, + use_tqdm=False, + ) + ) + torch.cuda.synchronize() + + start_time = time.perf_counter() + outputs = ( + llm.generate(prompts_new, sampling_params) + if isinstance(prompts_new[0], str) + else llm.generate(sampling_params=sampling_params, prompt_token_ids=prompts_new) + ) + torch.cuda.synchronize() + end_time = time.perf_counter() + duration_time = end_time - start_time + + num_tokens = 0 + # Print the outputs. + for i, output in enumerate(outputs): + prompt = prompts[i] # show the origin prompt. actully prompt is "output.prompt" + generated_text = output.outputs[0].text + + num_tokens += len(output.outputs[0].token_ids) + print(f"Prompt: {prompt}\nGenerated text: {generated_text} \n") + print(f"tokens: {num_tokens}, QPS: {num_tokens/duration_time}") diff --git a/models/nlp/large_language_model/deepseek-r1-distill-qwen-7b/vllm/README.md b/models/nlp/large_language_model/deepseek-r1-distill-qwen-7b/vllm/README.md new file mode 100644 index 0000000000000000000000000000000000000000..8d72e0c77645cd2ade78fd30f4b065e183508870 --- /dev/null +++ b/models/nlp/large_language_model/deepseek-r1-distill-qwen-7b/vllm/README.md @@ -0,0 +1,48 @@ +# DeepSeek-R1-Distill-Qwen-7B + +## Description + +DeepSeek-R1-Distill models are fine-tuned based on open-source models, using samples generated by DeepSeek-R1. We slightly change their configs and tokenizers. We open-source distilled 1.5B, 7B, 8B, 14B, 32B, and 70B checkpoints based on Qwen2.5 and Llama3 series to the community. + +## Setup + +### Install + +```bash +# Install libGL +## CentOS +yum install -y mesa-libGL +## Ubuntu +apt install -y libgl1-mesa-glx +``` + +### Download + +-Model: + +```bash +cd deepseek-r1-distill-qwen-7b/vllm +mkdir -p data/ +ln -s /path/to/DeepSeek-R1-Distill-Qwen-7B ./data/ +``` + +## Inference with offline + +```bash +python3 offline_inference.py --model ./data/DeepSeek-R1-Distill-Qwen-7B --max-tokens 256 -tp 1 --temperature 0.0 --max-model-len 3096 +``` +## Inference with serve + +```bash +vllm serve data/DeepSeek-R1-Distill-Qwen-7B --tensor-parallel-size 2 --max-model-len 32768 --enforce-eager --trust-remote-code +``` + +## Results + +| Model | QPS | +| ---------- | ----- | +| DeepSeek-R1-Distill-Qwen-7B | 90.48| + +## Reference + +[DeepSeek-R1](https://github.com/deepseek-ai/DeepSeek-R1) diff --git a/models/nlp/large_language_model/deepseek-r1-distill-qwen-7b/vllm/ci/prepare.sh b/models/nlp/large_language_model/deepseek-r1-distill-qwen-7b/vllm/ci/prepare.sh new file mode 100644 index 0000000000000000000000000000000000000000..75fb19458942e1d61a674c1d6dd9bbdb521bc00c --- /dev/null +++ b/models/nlp/large_language_model/deepseek-r1-distill-qwen-7b/vllm/ci/prepare.sh @@ -0,0 +1,25 @@ +#!/bin/bash +# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. 
You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +set -x +ID=$(grep -oP '(?<=^ID=).+' /etc/os-release | tr -d '"') +if [[ ${ID} == "ubuntu" ]]; then + apt install -y libgl1-mesa-glx +elif [[ ${ID} == "centos" ]]; then + yum install -y mesa-libGL +else + echo "Not Support Os" +fi diff --git a/models/nlp/large_language_model/deepseek-r1-distill-qwen-7b/vllm/offline_inference.py b/models/nlp/large_language_model/deepseek-r1-distill-qwen-7b/vllm/offline_inference.py new file mode 100644 index 0000000000000000000000000000000000000000..9b7d87fd797c78fcedba7cd4c9a9a0e7642c251f --- /dev/null +++ b/models/nlp/large_language_model/deepseek-r1-distill-qwen-7b/vllm/offline_inference.py @@ -0,0 +1,115 @@ +import sys +from pathlib import Path +import os + +sys.path.append(str(Path(__file__).resolve().parent.parent.parent)) +import argparse +import dataclasses +import inspect +import logging +import time + +import torch +from utils import load_chat_template, sampling_add_cli_args +from vllm import LLM, EngineArgs, SamplingParams + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--chat_template", type=str, default=None) + parser.add_argument( + "--remove_chat_template", + default=False, + action="store_true", + help="pass this if you are not use a chat model", + ) + parser = EngineArgs.add_cli_args(parser) + parser = sampling_add_cli_args(parser) + args = parser.parse_args() + + engine_args = [attr.name for attr in dataclasses.fields(EngineArgs)] + sampling_args = [ + param.name + for param in list( + inspect.signature(SamplingParams).parameters.values() + ) + ] + engine_params = {attr: getattr(args, attr) for attr in engine_args} + sampling_params = { + attr: getattr(args, attr) for attr in sampling_args if args.__contains__(attr) + } + + model_name = os.path.dirname(args.model).rsplit("/")[-1] + + # Sample prompts. + prompts = ["哪些迹象可能表明一个人正在经历焦虑?", "描述一下如何制作芝士披萨。", "写一篇有关5G网络研发的综述文章。"] + + # Create a sampling params object. + sampling_params = SamplingParams(**sampling_params) + + # Create an LLM. + llm = LLM(**engine_params) + + # process chat template + if args.remove_chat_template: + if "chat" in model_name.lower(): + logging.warning( + f"The model name from model path is {model_name}, so we guess you are using the chat model and the additional processing is required for the input prompt. " + f"If the result is not quite correct, please ensure you do not pass --remove_chat_template in CLI." + ) + prompts_new = prompts + else: + # Build chat model promopt + logging.warning( + "If you are using a non chat model, please pass the --remove_chat_template in CLI." + ) + # Try use transformers's apply_chat_template, if chat_template is None, will use defalut template. + # For some old models, the default template may cause bad answers. we don't consider this situation, + # because the Transformers team is advancing the chat template. 
For more informatino about it, + # please refer to https://huggingface.co/docs/transformers/main/chat_templating + try: + load_chat_template(llm.get_tokenizer(), args.chat_template) + prompts_new = [] + for prompt in prompts: + messages = [{"role": "user", "content": prompt}] + text = llm.get_tokenizer().apply_chat_template( + messages, tokenize=False, add_generation_prompt=True + ) + prompts_new.append(text) + except: + logging.warning( + "use tokenizer apply_chat_template function failed, may because of low transformers version...(try use transformers>=4.34.0)" + ) + prompts_new = prompts + + # Generate texts from the prompts. The output is a list of RequestOutput objects + # that contain the prompt, generated text, and other information. + outputs = ( + llm.generate(prompts_new, sampling_params, use_tqdm=False) + if isinstance(prompts_new[0], str) + else llm.generate( + sampling_params=sampling_params, + prompt_token_ids=prompts_new, + use_tqdm=False, + ) + ) + torch.cuda.synchronize() + + start_time = time.perf_counter() + outputs = ( + llm.generate(prompts_new, sampling_params) + if isinstance(prompts_new[0], str) + else llm.generate(sampling_params=sampling_params, prompt_token_ids=prompts_new) + ) + torch.cuda.synchronize() + end_time = time.perf_counter() + duration_time = end_time - start_time + + num_tokens = 0 + # Print the outputs. + for i, output in enumerate(outputs): + prompt = prompts[i] # show the origin prompt. actully prompt is "output.prompt" + generated_text = output.outputs[0].text + + num_tokens += len(output.outputs[0].token_ids) + print(f"Prompt: {prompt}\nGenerated text: {generated_text} \n") + print(f"tokens: {num_tokens}, QPS: {num_tokens/duration_time}") diff --git a/tests/models_vllm.yaml b/tests/models_vllm.yaml index 548f16c81dbc297b22a666e4982c3a3027311128..5781866f49744c906808a20af7cafeaace5241ba 100644 --- a/tests/models_vllm.yaml +++ b/tests/models_vllm.yaml @@ -112,3 +112,51 @@ - fp16 relative_path: models/multimodal/vision-language-understanding/minicpm-v-2/vllm/ task_type: multimodal/vision-language-understanding +- datasets: https://localhost + download_url: https://localhost/DeepSeek-R1-Distill-Qwen-1.5B + name: deepseek-r1-distill-qwen-1.5b + need_third_part: false + precisions: + - fp16 + relative_path: models/nlp/large_language_model/deepseek-r1-distill-qwen-1.5b/vllm/ + task_type: nlp/large_language_model +- datasets: https://localhost + download_url: https://localhost/DeepSeek-R1-Distill-Qwen-7B + name: deepseek-r1-distill-qwen-7b + need_third_part: false + precisions: + - fp16 + relative_path: models/nlp/large_language_model/deepseek-r1-distill-qwen-7b/vllm/ + task_type: nlp/large_language_model +- datasets: https://localhost + download_url: https://localhost/DeepSeek-R1-Distill-Qwen-14B + name: deepseek-r1-distill-qwen-14b + need_third_part: false + precisions: + - fp16 + relative_path: models/nlp/large_language_model/deepseek-r1-distill-qwen-14b/vllm/ + task_type: nlp/large_language_model +- datasets: https://localhost + download_url: https://localhost/DeepSeek-R1-Distill-Qwen-32B + name: deepseek-r1-distill-qwen-32b + need_third_part: false + precisions: + - fp16 + relative_path: models/nlp/large_language_model/deepseek-r1-distill-qwen-32b/vllm/ + task_type: nlp/large_language_model +- datasets: https://localhost + download_url: https://localhost/DeepSeek-R1-Distill-Llama-8B + name: deepseek-r1-distill-llama-8b + need_third_part: false + precisions: + - fp16 + relative_path: models/nlp/large_language_model/deepseek-r1-distill-llama-8b/vllm/ 
+ task_type: nlp/large_language_model +- datasets: https://localhost + download_url: https://localhost/DeepSeek-R1-Distill-Llama-70B + name: deepseek-r1-distill-llama-70b + need_third_part: false + precisions: + - fp16 + relative_path: models/nlp/large_language_model/deepseek-r1-distill-llama-70b/vllm/ + task_type: nlp/large_language_model \ No newline at end of file diff --git a/tests/run_vllm.py b/tests/run_vllm.py index c3eb0217cad0414008e39a98346e91fe0e77dae1..518ad79ee977d8e79f77dba8a60c09db28be6688 100644 --- a/tests/run_vllm.py +++ b/tests/run_vllm.py @@ -220,6 +220,12 @@ def run_nlp_testcase(model): export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda/lib64 python3 minicpmv-2.0-offline.py --model-path ./minicpm-v-2 --image-path ./dog.jpg """ + elif model_name.startswith("deepseek-r1-distill-"): + script = f""" + set -x + cd ../{model['relative_path']} + python3 offline_inference.py --model ./{model_name} --max-tokens 256 -tp 2 --temperature 0.0 --max-model-len 3096 + """ r, t = run_script(script) sout = r.stdout @@ -235,11 +241,6 @@ def run_nlp_testcase(model): result["result"][prec]["Cost time (s)"] = t return result -def get_metric_result(str): - if str: - return json.loads(str.replace("'", "\""))["metricResult"] - return None - def run_script(script): start_time = time.perf_counter() result = subprocess.run(