diff --git a/models/multimodal/diffusion/stable-diffusion/README.md b/models/multimodal/diffusion/stable-diffusion/README.md new file mode 100644 index 0000000000000000000000000000000000000000..93bc5f3dfc67faf74099d11c2b1e305f96622440 --- /dev/null +++ b/models/multimodal/diffusion/stable-diffusion/README.md @@ -0,0 +1,41 @@ +# Stable Diffusion 1.5 + +## Model description + +Stable Diffusion is a latent text-to-image diffusion model capable of generating photo-realistic images given any text input. + +## Setup + +### Install + +```bash +# Install libGL +## CentOS +yum install -y mesa-libGL +## Ubuntu +apt install -y libgl1-mesa-glx + +pip3 install http://files.deepspark.org.cn:880/deepspark/add-ons/diffusers-0.31.0-py3-none-any.whl +pip3 install -r requirements.txt +``` + +### Download + +Download the runwayml/stable-diffusion-v1-5 from [huggingface page](https://huggingface.co/runwayml/stable-diffusion-v1-5). + +```bash +cd stable-diffusion +mkdir -p data/ +ln -s /path/to/stable-diffusion-v1-5 ./data/ +``` + +## Inference + +```bash +export ENABLE_IXFORMER_INFERENCE=1 +python3 demo.py +``` + +## Reference + +- [diffusers](https://github.com/huggingface/diffusers) diff --git a/models/multimodal/diffusion/stable-diffusion/ci/prepare.sh b/models/multimodal/diffusion/stable-diffusion/ci/prepare.sh new file mode 100644 index 0000000000000000000000000000000000000000..b9140aa3524dfab2af3249dee4e6ab89948b3607 --- /dev/null +++ b/models/multimodal/diffusion/stable-diffusion/ci/prepare.sh @@ -0,0 +1,28 @@ +#!/bin/bash +# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
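+# Install the libGL runtime needed by opencv-python and the pinned Python dependencies;
+# the package manager is chosen from the OS ID parsed out of /etc/os-release.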
+
+set -x
+ID=$(grep -oP '(?<=^ID=).+' /etc/os-release | tr -d '"')
+if [[ ${ID} == "ubuntu" ]]; then
+    apt install -y libgl1-mesa-glx
+elif [[ ${ID} == "centos" ]]; then
+    yum install -y mesa-libGL
+else
+    echo "Unsupported OS: ${ID}"
+fi
+
+pip3 install http://files.deepspark.org.cn:880/deepspark/add-ons/diffusers-0.31.0-py3-none-any.whl
+pip3 install -r requirements.txt
\ No newline at end of file
diff --git a/models/multimodal/diffusion/stable-diffusion/demo.py b/models/multimodal/diffusion/stable-diffusion/demo.py
new file mode 100644
index 0000000000000000000000000000000000000000..ce3cdf398e29b4663c938ed46ea19dd74400d43b
--- /dev/null
+++ b/models/multimodal/diffusion/stable-diffusion/demo.py
@@ -0,0 +1,33 @@
+import random
+import time
+import numpy as np
+import torch
+from torchvision.utils import save_image
+from diffusers import StableDiffusionPipeline
+
+
+def setup_seed(seed):
+    torch.manual_seed(seed)
+    torch.cuda.manual_seed_all(seed)
+    np.random.seed(seed)
+    random.seed(seed)
+    torch.backends.cudnn.deterministic = True
+
+# Set the random seed for reproducibility
+setup_seed(20)
+
+pipe = StableDiffusionPipeline.from_pretrained("data/stable-diffusion-v1-5", torch_dtype=torch.float16)
+pipe.safety_checker = None
+pipe = pipe.to("cuda")
+prompt = "A raccoon wearing formal clothes, wearing a tophat and holding a cane"
+print("Running a single 50-step generation (no separate warm-up run)...")
+wh = 1024
+batch_size = 1
+print(f'height={wh}, width={wh}, prompt={prompt}, batch_size={batch_size}')
+start_time = time.time()
+image = pipe(
+    prompt, output_type="pt", return_dict=True, height=wh, width=wh, num_images_per_prompt=batch_size, num_inference_steps=50, guidance_scale=7.0
+).images[0]
+use_time = time.time() - start_time
+print("time: {:.2f} seconds".format(use_time))
+save_image(image, "demo.png")
\ No newline at end of file
diff --git a/models/multimodal/diffusion/stable-diffusion/requirements.txt b/models/multimodal/diffusion/stable-diffusion/requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..2cbf12c60d5945cf14ebaec9a7348fab5400bade
--- /dev/null
+++ b/models/multimodal/diffusion/stable-diffusion/requirements.txt
@@ -0,0 +1,22 @@
+contourpy==1.2.1
+cycler==0.12.1
+et-xmlfile==1.1.0
+fonttools==4.53.0
+kiwisolver==1.4.5
+matplotlib==3.9.0
+numpy==1.26.4
+opencv-python==4.10.0.82
+openpyxl==3.1.4
+packaging==24.1
+pandas==2.2.2
+pillow==10.3.0
+pyparsing==3.1.2
+python-dateutil==2.9.0.post0
+pytz==2024.1
+scipy==1.13.1
+six==1.16.0
+tzdata==2024.1
+transformers==4.39.3
+accelerate==0.29.0
+peft==0.13.2
+safetensors
\ No newline at end of file
diff --git a/models/nlp/large_language_model/deepseek-r1-distill-llama-70b/vllm/README.md b/models/nlp/large_language_model/deepseek-r1-distill-llama-70b/vllm/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..26049a59f9beae20d1382ba6b597b4f8f8c6808f
--- /dev/null
+++ b/models/nlp/large_language_model/deepseek-r1-distill-llama-70b/vllm/README.md
@@ -0,0 +1,42 @@
+# DeepSeek-R1-Distill-Llama-70B
+
+## Description
+
+DeepSeek-R1-Distill models are fine-tuned based on open-source models, using samples generated by DeepSeek-R1. We slightly change their configs and tokenizers. We open-source distilled 1.5B, 7B, 8B, 14B, 32B, and 70B checkpoints based on Qwen2.5 and Llama3 series to the community.
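+
+Both an offline inference script and an OpenAI-compatible serving command are provided below. As an illustration, once the server from the "Online Serving" section is running, it can be queried with the `openai` Python client. This is a minimal sketch under a few assumptions: the `openai` package is installed, the server listens on the default port 8000, and the served model name defaults to the `--model` path (it can be overridden with `--served-model-name`).
+
+```python
+from openai import OpenAI
+
+# vLLM's OpenAI-compatible server does not check the API key unless one is configured.
+client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")
+resp = client.chat.completions.create(
+    model="data/DeepSeek-R1-Distill-Llama-70B",
+    messages=[{"role": "user", "content": "Briefly explain what model distillation is."}],
+    max_tokens=256,
+)
+print(resp.choices[0].message.content)
+```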
+
+## Setup
+
+### Install
+
+```bash
+# Install libGL
+## CentOS
+yum install -y mesa-libGL
+## Ubuntu
+apt install -y libgl1-mesa-glx
+```
+
+### Download
+
+Download the DeepSeek-R1-Distill-Llama-70B model weights, then link them into `data/`:
+
+```bash
+cd deepseek-r1-distill-llama-70b/vllm
+mkdir -p data/
+ln -s /path/to/DeepSeek-R1-Distill-Llama-70B ./data/
+```
+
+## Offline Inference
+
+```bash
+python3 offline_inference.py --model ./data/DeepSeek-R1-Distill-Llama-70B --max-tokens 256 -tp 8 --temperature 0.0 --max-model-len 3096
+```
+
+## Online Serving
+
+```bash
+vllm serve data/DeepSeek-R1-Distill-Llama-70B --tensor-parallel-size 8 --max-model-len 32768 --enforce-eager --trust-remote-code
+```
+
+## Reference
+
+[DeepSeek-R1](https://github.com/deepseek-ai/DeepSeek-R1)
diff --git a/models/nlp/large_language_model/deepseek-r1-distill-llama-70b/vllm/ci/prepare.sh b/models/nlp/large_language_model/deepseek-r1-distill-llama-70b/vllm/ci/prepare.sh
new file mode 100644
index 0000000000000000000000000000000000000000..75fb19458942e1d61a674c1d6dd9bbdb521bc00c
--- /dev/null
+++ b/models/nlp/large_language_model/deepseek-r1-distill-llama-70b/vllm/ci/prepare.sh
@@ -0,0 +1,25 @@
+#!/bin/bash
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+ +set -x +ID=$(grep -oP '(?<=^ID=).+' /etc/os-release | tr -d '"') +if [[ ${ID} == "ubuntu" ]]; then + apt install -y libgl1-mesa-glx +elif [[ ${ID} == "centos" ]]; then + yum install -y mesa-libGL +else + echo "Not Support Os" +fi diff --git a/models/nlp/large_language_model/deepseek-r1-distill-llama-70b/vllm/offline_inference.py b/models/nlp/large_language_model/deepseek-r1-distill-llama-70b/vllm/offline_inference.py new file mode 100644 index 0000000000000000000000000000000000000000..9b7d87fd797c78fcedba7cd4c9a9a0e7642c251f --- /dev/null +++ b/models/nlp/large_language_model/deepseek-r1-distill-llama-70b/vllm/offline_inference.py @@ -0,0 +1,115 @@ +import sys +from pathlib import Path +import os + +sys.path.append(str(Path(__file__).resolve().parent.parent.parent)) +import argparse +import dataclasses +import inspect +import logging +import time + +import torch +from utils import load_chat_template, sampling_add_cli_args +from vllm import LLM, EngineArgs, SamplingParams + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--chat_template", type=str, default=None) + parser.add_argument( + "--remove_chat_template", + default=False, + action="store_true", + help="pass this if you are not use a chat model", + ) + parser = EngineArgs.add_cli_args(parser) + parser = sampling_add_cli_args(parser) + args = parser.parse_args() + + engine_args = [attr.name for attr in dataclasses.fields(EngineArgs)] + sampling_args = [ + param.name + for param in list( + inspect.signature(SamplingParams).parameters.values() + ) + ] + engine_params = {attr: getattr(args, attr) for attr in engine_args} + sampling_params = { + attr: getattr(args, attr) for attr in sampling_args if args.__contains__(attr) + } + + model_name = os.path.dirname(args.model).rsplit("/")[-1] + + # Sample prompts. + prompts = ["哪些迹象可能表明一个人正在经历焦虑?", "描述一下如何制作芝士披萨。", "写一篇有关5G网络研发的综述文章。"] + + # Create a sampling params object. + sampling_params = SamplingParams(**sampling_params) + + # Create an LLM. + llm = LLM(**engine_params) + + # process chat template + if args.remove_chat_template: + if "chat" in model_name.lower(): + logging.warning( + f"The model name from model path is {model_name}, so we guess you are using the chat model and the additional processing is required for the input prompt. " + f"If the result is not quite correct, please ensure you do not pass --remove_chat_template in CLI." + ) + prompts_new = prompts + else: + # Build chat model promopt + logging.warning( + "If you are using a non chat model, please pass the --remove_chat_template in CLI." + ) + # Try use transformers's apply_chat_template, if chat_template is None, will use defalut template. + # For some old models, the default template may cause bad answers. we don't consider this situation, + # because the Transformers team is advancing the chat template. For more informatino about it, + # please refer to https://huggingface.co/docs/transformers/main/chat_templating + try: + load_chat_template(llm.get_tokenizer(), args.chat_template) + prompts_new = [] + for prompt in prompts: + messages = [{"role": "user", "content": prompt}] + text = llm.get_tokenizer().apply_chat_template( + messages, tokenize=False, add_generation_prompt=True + ) + prompts_new.append(text) + except: + logging.warning( + "use tokenizer apply_chat_template function failed, may because of low transformers version...(try use transformers>=4.34.0)" + ) + prompts_new = prompts + + # Generate texts from the prompts. 
The output is a list of RequestOutput objects
+    # that contain the prompt, generated text, and other information.
+    outputs = (
+        llm.generate(prompts_new, sampling_params, use_tqdm=False)
+        if isinstance(prompts_new[0], str)
+        else llm.generate(
+            sampling_params=sampling_params,
+            prompt_token_ids=prompts_new,
+            use_tqdm=False,
+        )
+    )
+    torch.cuda.synchronize()
+
+    start_time = time.perf_counter()
+    outputs = (
+        llm.generate(prompts_new, sampling_params)
+        if isinstance(prompts_new[0], str)
+        else llm.generate(sampling_params=sampling_params, prompt_token_ids=prompts_new)
+    )
+    torch.cuda.synchronize()
+    end_time = time.perf_counter()
+    duration_time = end_time - start_time
+
+    num_tokens = 0
+    # Print the outputs.
+    for i, output in enumerate(outputs):
+        prompt = prompts[i]  # show the original prompt; the actual prompt sent to the engine is output.prompt
+        generated_text = output.outputs[0].text
+
+        num_tokens += len(output.outputs[0].token_ids)
+        print(f"Prompt: {prompt}\nGenerated text: {generated_text} \n")
+    print(f"tokens: {num_tokens}, QPS: {num_tokens/duration_time}")
diff --git a/models/nlp/large_language_model/deepseek-r1-distill-llama-8b/vllm/README.md b/models/nlp/large_language_model/deepseek-r1-distill-llama-8b/vllm/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..903a7a160a1e77f7ff35a1f320b810477e7a5455
--- /dev/null
+++ b/models/nlp/large_language_model/deepseek-r1-distill-llama-8b/vllm/README.md
@@ -0,0 +1,48 @@
+# DeepSeek-R1-Distill-Llama-8B
+
+## Description
+
+DeepSeek-R1-Distill models are fine-tuned based on open-source models, using samples generated by DeepSeek-R1. We slightly change their configs and tokenizers. We open-source distilled 1.5B, 7B, 8B, 14B, 32B, and 70B checkpoints based on Qwen2.5 and Llama3 series to the community.
+
+## Setup
+
+### Install
+
+```bash
+# Install libGL
+## CentOS
+yum install -y mesa-libGL
+## Ubuntu
+apt install -y libgl1-mesa-glx
+```
+
+### Download
+
+Download the DeepSeek-R1-Distill-Llama-8B model weights, then link them into `data/`:
+
+```bash
+cd deepseek-r1-distill-llama-8b/vllm
+mkdir -p data/
+ln -s /path/to/DeepSeek-R1-Distill-Llama-8B ./data/
+```
+
+## Offline Inference
+
+```bash
+python3 offline_inference.py --model ./data/DeepSeek-R1-Distill-Llama-8B --max-tokens 256 -tp 1 --temperature 0.0 --max-model-len 3096
+```
+
+## Online Serving
+
+```bash
+vllm serve data/DeepSeek-R1-Distill-Llama-8B --tensor-parallel-size 2 --max-model-len 32768 --enforce-eager --trust-remote-code
+```
+
+## Results
+
+QPS here denotes generated tokens per second, i.e. the throughput metric printed by `offline_inference.py`.
+
+| Model | QPS |
+| ---------- | ----- |
+| DeepSeek-R1-Distill-Llama-8B | 105.33 |
+
+## Reference
+
+[DeepSeek-R1](https://github.com/deepseek-ai/DeepSeek-R1)
diff --git a/models/nlp/large_language_model/deepseek-r1-distill-llama-8b/vllm/ci/prepare.sh b/models/nlp/large_language_model/deepseek-r1-distill-llama-8b/vllm/ci/prepare.sh
new file mode 100644
index 0000000000000000000000000000000000000000..75fb19458942e1d61a674c1d6dd9bbdb521bc00c
--- /dev/null
+++ b/models/nlp/large_language_model/deepseek-r1-distill-llama-8b/vllm/ci/prepare.sh
@@ -0,0 +1,25 @@
+#!/bin/bash
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License.
You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +set -x +ID=$(grep -oP '(?<=^ID=).+' /etc/os-release | tr -d '"') +if [[ ${ID} == "ubuntu" ]]; then + apt install -y libgl1-mesa-glx +elif [[ ${ID} == "centos" ]]; then + yum install -y mesa-libGL +else + echo "Not Support Os" +fi diff --git a/models/nlp/large_language_model/deepseek-r1-distill-llama-8b/vllm/offline_inference.py b/models/nlp/large_language_model/deepseek-r1-distill-llama-8b/vllm/offline_inference.py new file mode 100644 index 0000000000000000000000000000000000000000..9b7d87fd797c78fcedba7cd4c9a9a0e7642c251f --- /dev/null +++ b/models/nlp/large_language_model/deepseek-r1-distill-llama-8b/vllm/offline_inference.py @@ -0,0 +1,115 @@ +import sys +from pathlib import Path +import os + +sys.path.append(str(Path(__file__).resolve().parent.parent.parent)) +import argparse +import dataclasses +import inspect +import logging +import time + +import torch +from utils import load_chat_template, sampling_add_cli_args +from vllm import LLM, EngineArgs, SamplingParams + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--chat_template", type=str, default=None) + parser.add_argument( + "--remove_chat_template", + default=False, + action="store_true", + help="pass this if you are not use a chat model", + ) + parser = EngineArgs.add_cli_args(parser) + parser = sampling_add_cli_args(parser) + args = parser.parse_args() + + engine_args = [attr.name for attr in dataclasses.fields(EngineArgs)] + sampling_args = [ + param.name + for param in list( + inspect.signature(SamplingParams).parameters.values() + ) + ] + engine_params = {attr: getattr(args, attr) for attr in engine_args} + sampling_params = { + attr: getattr(args, attr) for attr in sampling_args if args.__contains__(attr) + } + + model_name = os.path.dirname(args.model).rsplit("/")[-1] + + # Sample prompts. + prompts = ["哪些迹象可能表明一个人正在经历焦虑?", "描述一下如何制作芝士披萨。", "写一篇有关5G网络研发的综述文章。"] + + # Create a sampling params object. + sampling_params = SamplingParams(**sampling_params) + + # Create an LLM. + llm = LLM(**engine_params) + + # process chat template + if args.remove_chat_template: + if "chat" in model_name.lower(): + logging.warning( + f"The model name from model path is {model_name}, so we guess you are using the chat model and the additional processing is required for the input prompt. " + f"If the result is not quite correct, please ensure you do not pass --remove_chat_template in CLI." + ) + prompts_new = prompts + else: + # Build chat model promopt + logging.warning( + "If you are using a non chat model, please pass the --remove_chat_template in CLI." + ) + # Try use transformers's apply_chat_template, if chat_template is None, will use defalut template. + # For some old models, the default template may cause bad answers. we don't consider this situation, + # because the Transformers team is advancing the chat template. 
For more informatino about it, + # please refer to https://huggingface.co/docs/transformers/main/chat_templating + try: + load_chat_template(llm.get_tokenizer(), args.chat_template) + prompts_new = [] + for prompt in prompts: + messages = [{"role": "user", "content": prompt}] + text = llm.get_tokenizer().apply_chat_template( + messages, tokenize=False, add_generation_prompt=True + ) + prompts_new.append(text) + except: + logging.warning( + "use tokenizer apply_chat_template function failed, may because of low transformers version...(try use transformers>=4.34.0)" + ) + prompts_new = prompts + + # Generate texts from the prompts. The output is a list of RequestOutput objects + # that contain the prompt, generated text, and other information. + outputs = ( + llm.generate(prompts_new, sampling_params, use_tqdm=False) + if isinstance(prompts_new[0], str) + else llm.generate( + sampling_params=sampling_params, + prompt_token_ids=prompts_new, + use_tqdm=False, + ) + ) + torch.cuda.synchronize() + + start_time = time.perf_counter() + outputs = ( + llm.generate(prompts_new, sampling_params) + if isinstance(prompts_new[0], str) + else llm.generate(sampling_params=sampling_params, prompt_token_ids=prompts_new) + ) + torch.cuda.synchronize() + end_time = time.perf_counter() + duration_time = end_time - start_time + + num_tokens = 0 + # Print the outputs. + for i, output in enumerate(outputs): + prompt = prompts[i] # show the origin prompt. actully prompt is "output.prompt" + generated_text = output.outputs[0].text + + num_tokens += len(output.outputs[0].token_ids) + print(f"Prompt: {prompt}\nGenerated text: {generated_text} \n") + print(f"tokens: {num_tokens}, QPS: {num_tokens/duration_time}") diff --git a/models/nlp/large_language_model/deepseek-r1-distill-qwen-1.5b/vllm/README.md b/models/nlp/large_language_model/deepseek-r1-distill-qwen-1.5b/vllm/README.md new file mode 100644 index 0000000000000000000000000000000000000000..c52fc0f55f7692753cea62ebd1d171f80a739136 --- /dev/null +++ b/models/nlp/large_language_model/deepseek-r1-distill-qwen-1.5b/vllm/README.md @@ -0,0 +1,48 @@ +# DeepSeek-R1-Distill-Qwen-1.5B + +## Description + +DeepSeek-R1-Distill models are fine-tuned based on open-source models, using samples generated by DeepSeek-R1. We slightly change their configs and tokenizers. We open-source distilled 1.5B, 7B, 8B, 14B, 32B, and 70B checkpoints based on Qwen2.5 and Llama3 series to the community. 
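+
+`offline_inference.py` drives vLLM through its `LLM` and `SamplingParams` APIs behind a set of CLI flags. Stripped of the argument parsing and chat-template handling, the core calls look roughly like the sketch below, which assumes the weights have been linked under `./data/` as described in the Download section that follows.
+
+```python
+from vllm import LLM, SamplingParams
+
+# Mirrors the flags used in the offline command below: -tp 1, --max-model-len 3096,
+# --temperature 0.0 and --max-tokens 256.
+llm = LLM(model="./data/DeepSeek-R1-Distill-Qwen-1.5B", tensor_parallel_size=1, max_model_len=3096)
+sampling = SamplingParams(temperature=0.0, max_tokens=256)
+outputs = llm.generate(["What does knowledge distillation mean?"], sampling)
+print(outputs[0].outputs[0].text)
+```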
+
+## Setup
+
+### Install
+
+```bash
+# Install libGL
+## CentOS
+yum install -y mesa-libGL
+## Ubuntu
+apt install -y libgl1-mesa-glx
+```
+
+### Download
+
+Download the DeepSeek-R1-Distill-Qwen-1.5B model weights, then link them into `data/`:
+
+```bash
+cd deepseek-r1-distill-qwen-1.5b/vllm
+mkdir -p data/
+ln -s /path/to/DeepSeek-R1-Distill-Qwen-1.5B ./data/
+```
+
+## Offline Inference
+
+```bash
+python3 offline_inference.py --model ./data/DeepSeek-R1-Distill-Qwen-1.5B --max-tokens 256 -tp 1 --temperature 0.0 --max-model-len 3096
+```
+
+## Online Serving
+
+```bash
+vllm serve data/DeepSeek-R1-Distill-Qwen-1.5B --tensor-parallel-size 2 --max-model-len 32768 --enforce-eager --trust-remote-code
+```
+
+## Results
+
+QPS here denotes generated tokens per second, i.e. the throughput metric printed by `offline_inference.py`.
+
+| Model | QPS |
+| ---------- | ----- |
+| DeepSeek-R1-Distill-Qwen-1.5B | 259.42 |
+
+## Reference
+
+[DeepSeek-R1](https://github.com/deepseek-ai/DeepSeek-R1)
diff --git a/models/nlp/large_language_model/deepseek-r1-distill-qwen-1.5b/vllm/ci/prepare.sh b/models/nlp/large_language_model/deepseek-r1-distill-qwen-1.5b/vllm/ci/prepare.sh
new file mode 100644
index 0000000000000000000000000000000000000000..75fb19458942e1d61a674c1d6dd9bbdb521bc00c
--- /dev/null
+++ b/models/nlp/large_language_model/deepseek-r1-distill-qwen-1.5b/vllm/ci/prepare.sh
@@ -0,0 +1,25 @@
+#!/bin/bash
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+ +set -x +ID=$(grep -oP '(?<=^ID=).+' /etc/os-release | tr -d '"') +if [[ ${ID} == "ubuntu" ]]; then + apt install -y libgl1-mesa-glx +elif [[ ${ID} == "centos" ]]; then + yum install -y mesa-libGL +else + echo "Not Support Os" +fi diff --git a/models/nlp/large_language_model/deepseek-r1-distill-qwen-1.5b/vllm/offline_inference.py b/models/nlp/large_language_model/deepseek-r1-distill-qwen-1.5b/vllm/offline_inference.py new file mode 100644 index 0000000000000000000000000000000000000000..9b7d87fd797c78fcedba7cd4c9a9a0e7642c251f --- /dev/null +++ b/models/nlp/large_language_model/deepseek-r1-distill-qwen-1.5b/vllm/offline_inference.py @@ -0,0 +1,115 @@ +import sys +from pathlib import Path +import os + +sys.path.append(str(Path(__file__).resolve().parent.parent.parent)) +import argparse +import dataclasses +import inspect +import logging +import time + +import torch +from utils import load_chat_template, sampling_add_cli_args +from vllm import LLM, EngineArgs, SamplingParams + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--chat_template", type=str, default=None) + parser.add_argument( + "--remove_chat_template", + default=False, + action="store_true", + help="pass this if you are not use a chat model", + ) + parser = EngineArgs.add_cli_args(parser) + parser = sampling_add_cli_args(parser) + args = parser.parse_args() + + engine_args = [attr.name for attr in dataclasses.fields(EngineArgs)] + sampling_args = [ + param.name + for param in list( + inspect.signature(SamplingParams).parameters.values() + ) + ] + engine_params = {attr: getattr(args, attr) for attr in engine_args} + sampling_params = { + attr: getattr(args, attr) for attr in sampling_args if args.__contains__(attr) + } + + model_name = os.path.dirname(args.model).rsplit("/")[-1] + + # Sample prompts. + prompts = ["哪些迹象可能表明一个人正在经历焦虑?", "描述一下如何制作芝士披萨。", "写一篇有关5G网络研发的综述文章。"] + + # Create a sampling params object. + sampling_params = SamplingParams(**sampling_params) + + # Create an LLM. + llm = LLM(**engine_params) + + # process chat template + if args.remove_chat_template: + if "chat" in model_name.lower(): + logging.warning( + f"The model name from model path is {model_name}, so we guess you are using the chat model and the additional processing is required for the input prompt. " + f"If the result is not quite correct, please ensure you do not pass --remove_chat_template in CLI." + ) + prompts_new = prompts + else: + # Build chat model promopt + logging.warning( + "If you are using a non chat model, please pass the --remove_chat_template in CLI." + ) + # Try use transformers's apply_chat_template, if chat_template is None, will use defalut template. + # For some old models, the default template may cause bad answers. we don't consider this situation, + # because the Transformers team is advancing the chat template. For more informatino about it, + # please refer to https://huggingface.co/docs/transformers/main/chat_templating + try: + load_chat_template(llm.get_tokenizer(), args.chat_template) + prompts_new = [] + for prompt in prompts: + messages = [{"role": "user", "content": prompt}] + text = llm.get_tokenizer().apply_chat_template( + messages, tokenize=False, add_generation_prompt=True + ) + prompts_new.append(text) + except: + logging.warning( + "use tokenizer apply_chat_template function failed, may because of low transformers version...(try use transformers>=4.34.0)" + ) + prompts_new = prompts + + # Generate texts from the prompts. 
The output is a list of RequestOutput objects + # that contain the prompt, generated text, and other information. + outputs = ( + llm.generate(prompts_new, sampling_params, use_tqdm=False) + if isinstance(prompts_new[0], str) + else llm.generate( + sampling_params=sampling_params, + prompt_token_ids=prompts_new, + use_tqdm=False, + ) + ) + torch.cuda.synchronize() + + start_time = time.perf_counter() + outputs = ( + llm.generate(prompts_new, sampling_params) + if isinstance(prompts_new[0], str) + else llm.generate(sampling_params=sampling_params, prompt_token_ids=prompts_new) + ) + torch.cuda.synchronize() + end_time = time.perf_counter() + duration_time = end_time - start_time + + num_tokens = 0 + # Print the outputs. + for i, output in enumerate(outputs): + prompt = prompts[i] # show the origin prompt. actully prompt is "output.prompt" + generated_text = output.outputs[0].text + + num_tokens += len(output.outputs[0].token_ids) + print(f"Prompt: {prompt}\nGenerated text: {generated_text} \n") + print(f"tokens: {num_tokens}, QPS: {num_tokens/duration_time}") diff --git a/models/nlp/large_language_model/deepseek-r1-distill-qwen-14b/vllm/README.md b/models/nlp/large_language_model/deepseek-r1-distill-qwen-14b/vllm/README.md new file mode 100644 index 0000000000000000000000000000000000000000..7d20eeeec6f29e20af0a1aed438f5f6a26f92986 --- /dev/null +++ b/models/nlp/large_language_model/deepseek-r1-distill-qwen-14b/vllm/README.md @@ -0,0 +1,48 @@ +# DeepSeek-R1-Distill-Qwen-14B + +## Description + +DeepSeek-R1-Distill models are fine-tuned based on open-source models, using samples generated by DeepSeek-R1. We slightly change their configs and tokenizers. We open-source distilled 1.5B, 7B, 8B, 14B, 32B, and 70B checkpoints based on Qwen2.5 and Llama3 series to the community. + +## Setup + +### Install + +```bash +# Install libGL +## CentOS +yum install -y mesa-libGL +## Ubuntu +apt install -y libgl1-mesa-glx +``` + +### Download + +-Model: + +```bash +cd deepseek-r1-distill-qwen-14b/vllm +mkdir -p data/ +ln -s /path/to/DeepSeek-R1-Distill-Qwen-14B ./data/ +``` + +## Inference with offline + +```bash +python3 offline_inference.py --model ./data/DeepSeek-R1-Distill-Qwen-14B --max-tokens 256 -tp 2 --temperature 0.0 --max-model-len 3096 +``` +## Inference with serve + +```bash +vllm serve data/DeepSeek-R1-Distill-Qwen-14B --tensor-parallel-size 2 --max-model-len 32768 --enforce-eager --trust-remote-code +``` + +## Results + +| Model | QPS | +| ---------- | ----- | +| DeepSeek-R1-Distill-Qwen-14B | 88.01| + +## Reference + +[DeepSeek-R1](https://github.com/deepseek-ai/DeepSeek-R1) diff --git a/models/nlp/large_language_model/deepseek-r1-distill-qwen-14b/vllm/ci/prepare.sh b/models/nlp/large_language_model/deepseek-r1-distill-qwen-14b/vllm/ci/prepare.sh new file mode 100644 index 0000000000000000000000000000000000000000..75fb19458942e1d61a674c1d6dd9bbdb521bc00c --- /dev/null +++ b/models/nlp/large_language_model/deepseek-r1-distill-qwen-14b/vllm/ci/prepare.sh @@ -0,0 +1,25 @@ +#!/bin/bash +# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. 
You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +set -x +ID=$(grep -oP '(?<=^ID=).+' /etc/os-release | tr -d '"') +if [[ ${ID} == "ubuntu" ]]; then + apt install -y libgl1-mesa-glx +elif [[ ${ID} == "centos" ]]; then + yum install -y mesa-libGL +else + echo "Not Support Os" +fi diff --git a/models/nlp/large_language_model/deepseek-r1-distill-qwen-14b/vllm/offline_inference.py b/models/nlp/large_language_model/deepseek-r1-distill-qwen-14b/vllm/offline_inference.py new file mode 100644 index 0000000000000000000000000000000000000000..9b7d87fd797c78fcedba7cd4c9a9a0e7642c251f --- /dev/null +++ b/models/nlp/large_language_model/deepseek-r1-distill-qwen-14b/vllm/offline_inference.py @@ -0,0 +1,115 @@ +import sys +from pathlib import Path +import os + +sys.path.append(str(Path(__file__).resolve().parent.parent.parent)) +import argparse +import dataclasses +import inspect +import logging +import time + +import torch +from utils import load_chat_template, sampling_add_cli_args +from vllm import LLM, EngineArgs, SamplingParams + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--chat_template", type=str, default=None) + parser.add_argument( + "--remove_chat_template", + default=False, + action="store_true", + help="pass this if you are not use a chat model", + ) + parser = EngineArgs.add_cli_args(parser) + parser = sampling_add_cli_args(parser) + args = parser.parse_args() + + engine_args = [attr.name for attr in dataclasses.fields(EngineArgs)] + sampling_args = [ + param.name + for param in list( + inspect.signature(SamplingParams).parameters.values() + ) + ] + engine_params = {attr: getattr(args, attr) for attr in engine_args} + sampling_params = { + attr: getattr(args, attr) for attr in sampling_args if args.__contains__(attr) + } + + model_name = os.path.dirname(args.model).rsplit("/")[-1] + + # Sample prompts. + prompts = ["哪些迹象可能表明一个人正在经历焦虑?", "描述一下如何制作芝士披萨。", "写一篇有关5G网络研发的综述文章。"] + + # Create a sampling params object. + sampling_params = SamplingParams(**sampling_params) + + # Create an LLM. + llm = LLM(**engine_params) + + # process chat template + if args.remove_chat_template: + if "chat" in model_name.lower(): + logging.warning( + f"The model name from model path is {model_name}, so we guess you are using the chat model and the additional processing is required for the input prompt. " + f"If the result is not quite correct, please ensure you do not pass --remove_chat_template in CLI." + ) + prompts_new = prompts + else: + # Build chat model promopt + logging.warning( + "If you are using a non chat model, please pass the --remove_chat_template in CLI." + ) + # Try use transformers's apply_chat_template, if chat_template is None, will use defalut template. + # For some old models, the default template may cause bad answers. we don't consider this situation, + # because the Transformers team is advancing the chat template. 
For more informatino about it, + # please refer to https://huggingface.co/docs/transformers/main/chat_templating + try: + load_chat_template(llm.get_tokenizer(), args.chat_template) + prompts_new = [] + for prompt in prompts: + messages = [{"role": "user", "content": prompt}] + text = llm.get_tokenizer().apply_chat_template( + messages, tokenize=False, add_generation_prompt=True + ) + prompts_new.append(text) + except: + logging.warning( + "use tokenizer apply_chat_template function failed, may because of low transformers version...(try use transformers>=4.34.0)" + ) + prompts_new = prompts + + # Generate texts from the prompts. The output is a list of RequestOutput objects + # that contain the prompt, generated text, and other information. + outputs = ( + llm.generate(prompts_new, sampling_params, use_tqdm=False) + if isinstance(prompts_new[0], str) + else llm.generate( + sampling_params=sampling_params, + prompt_token_ids=prompts_new, + use_tqdm=False, + ) + ) + torch.cuda.synchronize() + + start_time = time.perf_counter() + outputs = ( + llm.generate(prompts_new, sampling_params) + if isinstance(prompts_new[0], str) + else llm.generate(sampling_params=sampling_params, prompt_token_ids=prompts_new) + ) + torch.cuda.synchronize() + end_time = time.perf_counter() + duration_time = end_time - start_time + + num_tokens = 0 + # Print the outputs. + for i, output in enumerate(outputs): + prompt = prompts[i] # show the origin prompt. actully prompt is "output.prompt" + generated_text = output.outputs[0].text + + num_tokens += len(output.outputs[0].token_ids) + print(f"Prompt: {prompt}\nGenerated text: {generated_text} \n") + print(f"tokens: {num_tokens}, QPS: {num_tokens/duration_time}") diff --git a/models/nlp/large_language_model/deepseek-r1-distill-qwen-32b/vllm/README.md b/models/nlp/large_language_model/deepseek-r1-distill-qwen-32b/vllm/README.md new file mode 100644 index 0000000000000000000000000000000000000000..e1c8ca81e97b877136c80e293816845e75c1dec0 --- /dev/null +++ b/models/nlp/large_language_model/deepseek-r1-distill-qwen-32b/vllm/README.md @@ -0,0 +1,48 @@ +# DeepSeek-R1-Distill-Qwen-32B + +## Description + +DeepSeek-R1-Distill models are fine-tuned based on open-source models, using samples generated by DeepSeek-R1. We slightly change their configs and tokenizers. We open-source distilled 1.5B, 7B, 8B, 14B, 32B, and 70B checkpoints based on Qwen2.5 and Llama3 series to the community. 
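+
+The QPS figure in the Results section below is produced by `offline_inference.py`, which counts the tokens generated for all prompts and divides by the wall-clock time of the timed `llm.generate()` call. A small helper equivalent to that computation (named here for illustration only) is:
+
+```python
+def tokens_per_second(outputs, duration_time: float) -> float:
+    """Generated tokens per second, matching the 'QPS' metric printed by offline_inference.py."""
+    num_tokens = sum(len(output.outputs[0].token_ids) for output in outputs)
+    return num_tokens / duration_time
+```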
+
+## Setup
+
+### Install
+
+```bash
+# Install libGL
+## CentOS
+yum install -y mesa-libGL
+## Ubuntu
+apt install -y libgl1-mesa-glx
+```
+
+### Download
+
+Download the DeepSeek-R1-Distill-Qwen-32B model weights, then link them into `data/`:
+
+```bash
+cd deepseek-r1-distill-qwen-32b/vllm
+mkdir -p data/
+ln -s /path/to/DeepSeek-R1-Distill-Qwen-32B ./data/
+```
+
+## Offline Inference
+
+```bash
+python3 offline_inference.py --model ./data/DeepSeek-R1-Distill-Qwen-32B --max-tokens 256 -tp 4 --temperature 0.0 --max-model-len 3096
+```
+
+## Online Serving
+
+```bash
+vllm serve data/DeepSeek-R1-Distill-Qwen-32B --tensor-parallel-size 4 --max-model-len 32768 --enforce-eager --trust-remote-code
+```
+
+## Results
+
+| Model | QPS |
+| ---------- | ----- |
+| DeepSeek-R1-Distill-Qwen-32B | 68.30 |
+
+## Reference
+
+[DeepSeek-R1](https://github.com/deepseek-ai/DeepSeek-R1)
diff --git a/models/nlp/large_language_model/deepseek-r1-distill-qwen-32b/vllm/ci/prepare.sh b/models/nlp/large_language_model/deepseek-r1-distill-qwen-32b/vllm/ci/prepare.sh
new file mode 100644
index 0000000000000000000000000000000000000000..75fb19458942e1d61a674c1d6dd9bbdb521bc00c
--- /dev/null
+++ b/models/nlp/large_language_model/deepseek-r1-distill-qwen-32b/vllm/ci/prepare.sh
@@ -0,0 +1,25 @@
+#!/bin/bash
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
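+# Install the libGL runtime, selecting apt or yum based on the OS ID parsed from /etc/os-release.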
+ +set -x +ID=$(grep -oP '(?<=^ID=).+' /etc/os-release | tr -d '"') +if [[ ${ID} == "ubuntu" ]]; then + apt install -y libgl1-mesa-glx +elif [[ ${ID} == "centos" ]]; then + yum install -y mesa-libGL +else + echo "Not Support Os" +fi diff --git a/models/nlp/large_language_model/deepseek-r1-distill-qwen-32b/vllm/offline_inference.py b/models/nlp/large_language_model/deepseek-r1-distill-qwen-32b/vllm/offline_inference.py new file mode 100644 index 0000000000000000000000000000000000000000..9b7d87fd797c78fcedba7cd4c9a9a0e7642c251f --- /dev/null +++ b/models/nlp/large_language_model/deepseek-r1-distill-qwen-32b/vllm/offline_inference.py @@ -0,0 +1,115 @@ +import sys +from pathlib import Path +import os + +sys.path.append(str(Path(__file__).resolve().parent.parent.parent)) +import argparse +import dataclasses +import inspect +import logging +import time + +import torch +from utils import load_chat_template, sampling_add_cli_args +from vllm import LLM, EngineArgs, SamplingParams + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--chat_template", type=str, default=None) + parser.add_argument( + "--remove_chat_template", + default=False, + action="store_true", + help="pass this if you are not use a chat model", + ) + parser = EngineArgs.add_cli_args(parser) + parser = sampling_add_cli_args(parser) + args = parser.parse_args() + + engine_args = [attr.name for attr in dataclasses.fields(EngineArgs)] + sampling_args = [ + param.name + for param in list( + inspect.signature(SamplingParams).parameters.values() + ) + ] + engine_params = {attr: getattr(args, attr) for attr in engine_args} + sampling_params = { + attr: getattr(args, attr) for attr in sampling_args if args.__contains__(attr) + } + + model_name = os.path.dirname(args.model).rsplit("/")[-1] + + # Sample prompts. + prompts = ["哪些迹象可能表明一个人正在经历焦虑?", "描述一下如何制作芝士披萨。", "写一篇有关5G网络研发的综述文章。"] + + # Create a sampling params object. + sampling_params = SamplingParams(**sampling_params) + + # Create an LLM. + llm = LLM(**engine_params) + + # process chat template + if args.remove_chat_template: + if "chat" in model_name.lower(): + logging.warning( + f"The model name from model path is {model_name}, so we guess you are using the chat model and the additional processing is required for the input prompt. " + f"If the result is not quite correct, please ensure you do not pass --remove_chat_template in CLI." + ) + prompts_new = prompts + else: + # Build chat model promopt + logging.warning( + "If you are using a non chat model, please pass the --remove_chat_template in CLI." + ) + # Try use transformers's apply_chat_template, if chat_template is None, will use defalut template. + # For some old models, the default template may cause bad answers. we don't consider this situation, + # because the Transformers team is advancing the chat template. For more informatino about it, + # please refer to https://huggingface.co/docs/transformers/main/chat_templating + try: + load_chat_template(llm.get_tokenizer(), args.chat_template) + prompts_new = [] + for prompt in prompts: + messages = [{"role": "user", "content": prompt}] + text = llm.get_tokenizer().apply_chat_template( + messages, tokenize=False, add_generation_prompt=True + ) + prompts_new.append(text) + except: + logging.warning( + "use tokenizer apply_chat_template function failed, may because of low transformers version...(try use transformers>=4.34.0)" + ) + prompts_new = prompts + + # Generate texts from the prompts. 
The output is a list of RequestOutput objects + # that contain the prompt, generated text, and other information. + outputs = ( + llm.generate(prompts_new, sampling_params, use_tqdm=False) + if isinstance(prompts_new[0], str) + else llm.generate( + sampling_params=sampling_params, + prompt_token_ids=prompts_new, + use_tqdm=False, + ) + ) + torch.cuda.synchronize() + + start_time = time.perf_counter() + outputs = ( + llm.generate(prompts_new, sampling_params) + if isinstance(prompts_new[0], str) + else llm.generate(sampling_params=sampling_params, prompt_token_ids=prompts_new) + ) + torch.cuda.synchronize() + end_time = time.perf_counter() + duration_time = end_time - start_time + + num_tokens = 0 + # Print the outputs. + for i, output in enumerate(outputs): + prompt = prompts[i] # show the origin prompt. actully prompt is "output.prompt" + generated_text = output.outputs[0].text + + num_tokens += len(output.outputs[0].token_ids) + print(f"Prompt: {prompt}\nGenerated text: {generated_text} \n") + print(f"tokens: {num_tokens}, QPS: {num_tokens/duration_time}") diff --git a/models/nlp/large_language_model/deepseek-r1-distill-qwen-7b/vllm/README.md b/models/nlp/large_language_model/deepseek-r1-distill-qwen-7b/vllm/README.md new file mode 100644 index 0000000000000000000000000000000000000000..8d72e0c77645cd2ade78fd30f4b065e183508870 --- /dev/null +++ b/models/nlp/large_language_model/deepseek-r1-distill-qwen-7b/vllm/README.md @@ -0,0 +1,48 @@ +# DeepSeek-R1-Distill-Qwen-7B + +## Description + +DeepSeek-R1-Distill models are fine-tuned based on open-source models, using samples generated by DeepSeek-R1. We slightly change their configs and tokenizers. We open-source distilled 1.5B, 7B, 8B, 14B, 32B, and 70B checkpoints based on Qwen2.5 and Llama3 series to the community. + +## Setup + +### Install + +```bash +# Install libGL +## CentOS +yum install -y mesa-libGL +## Ubuntu +apt install -y libgl1-mesa-glx +``` + +### Download + +-Model: + +```bash +cd deepseek-r1-distill-qwen-7b/vllm +mkdir -p data/ +ln -s /path/to/DeepSeek-R1-Distill-Qwen-7B ./data/ +``` + +## Inference with offline + +```bash +python3 offline_inference.py --model ./data/DeepSeek-R1-Distill-Qwen-7B --max-tokens 256 -tp 1 --temperature 0.0 --max-model-len 3096 +``` +## Inference with serve + +```bash +vllm serve data/DeepSeek-R1-Distill-Qwen-7B --tensor-parallel-size 2 --max-model-len 32768 --enforce-eager --trust-remote-code +``` + +## Results + +| Model | QPS | +| ---------- | ----- | +| DeepSeek-R1-Distill-Qwen-7B | 90.48| + +## Reference + +[DeepSeek-R1](https://github.com/deepseek-ai/DeepSeek-R1) diff --git a/models/nlp/large_language_model/deepseek-r1-distill-qwen-7b/vllm/ci/prepare.sh b/models/nlp/large_language_model/deepseek-r1-distill-qwen-7b/vllm/ci/prepare.sh new file mode 100644 index 0000000000000000000000000000000000000000..75fb19458942e1d61a674c1d6dd9bbdb521bc00c --- /dev/null +++ b/models/nlp/large_language_model/deepseek-r1-distill-qwen-7b/vllm/ci/prepare.sh @@ -0,0 +1,25 @@ +#!/bin/bash +# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. 
You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +set -x +ID=$(grep -oP '(?<=^ID=).+' /etc/os-release | tr -d '"') +if [[ ${ID} == "ubuntu" ]]; then + apt install -y libgl1-mesa-glx +elif [[ ${ID} == "centos" ]]; then + yum install -y mesa-libGL +else + echo "Not Support Os" +fi diff --git a/models/nlp/large_language_model/deepseek-r1-distill-qwen-7b/vllm/offline_inference.py b/models/nlp/large_language_model/deepseek-r1-distill-qwen-7b/vllm/offline_inference.py new file mode 100644 index 0000000000000000000000000000000000000000..9b7d87fd797c78fcedba7cd4c9a9a0e7642c251f --- /dev/null +++ b/models/nlp/large_language_model/deepseek-r1-distill-qwen-7b/vllm/offline_inference.py @@ -0,0 +1,115 @@ +import sys +from pathlib import Path +import os + +sys.path.append(str(Path(__file__).resolve().parent.parent.parent)) +import argparse +import dataclasses +import inspect +import logging +import time + +import torch +from utils import load_chat_template, sampling_add_cli_args +from vllm import LLM, EngineArgs, SamplingParams + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--chat_template", type=str, default=None) + parser.add_argument( + "--remove_chat_template", + default=False, + action="store_true", + help="pass this if you are not use a chat model", + ) + parser = EngineArgs.add_cli_args(parser) + parser = sampling_add_cli_args(parser) + args = parser.parse_args() + + engine_args = [attr.name for attr in dataclasses.fields(EngineArgs)] + sampling_args = [ + param.name + for param in list( + inspect.signature(SamplingParams).parameters.values() + ) + ] + engine_params = {attr: getattr(args, attr) for attr in engine_args} + sampling_params = { + attr: getattr(args, attr) for attr in sampling_args if args.__contains__(attr) + } + + model_name = os.path.dirname(args.model).rsplit("/")[-1] + + # Sample prompts. + prompts = ["哪些迹象可能表明一个人正在经历焦虑?", "描述一下如何制作芝士披萨。", "写一篇有关5G网络研发的综述文章。"] + + # Create a sampling params object. + sampling_params = SamplingParams(**sampling_params) + + # Create an LLM. + llm = LLM(**engine_params) + + # process chat template + if args.remove_chat_template: + if "chat" in model_name.lower(): + logging.warning( + f"The model name from model path is {model_name}, so we guess you are using the chat model and the additional processing is required for the input prompt. " + f"If the result is not quite correct, please ensure you do not pass --remove_chat_template in CLI." + ) + prompts_new = prompts + else: + # Build chat model promopt + logging.warning( + "If you are using a non chat model, please pass the --remove_chat_template in CLI." + ) + # Try use transformers's apply_chat_template, if chat_template is None, will use defalut template. + # For some old models, the default template may cause bad answers. we don't consider this situation, + # because the Transformers team is advancing the chat template. 
For more informatino about it, + # please refer to https://huggingface.co/docs/transformers/main/chat_templating + try: + load_chat_template(llm.get_tokenizer(), args.chat_template) + prompts_new = [] + for prompt in prompts: + messages = [{"role": "user", "content": prompt}] + text = llm.get_tokenizer().apply_chat_template( + messages, tokenize=False, add_generation_prompt=True + ) + prompts_new.append(text) + except: + logging.warning( + "use tokenizer apply_chat_template function failed, may because of low transformers version...(try use transformers>=4.34.0)" + ) + prompts_new = prompts + + # Generate texts from the prompts. The output is a list of RequestOutput objects + # that contain the prompt, generated text, and other information. + outputs = ( + llm.generate(prompts_new, sampling_params, use_tqdm=False) + if isinstance(prompts_new[0], str) + else llm.generate( + sampling_params=sampling_params, + prompt_token_ids=prompts_new, + use_tqdm=False, + ) + ) + torch.cuda.synchronize() + + start_time = time.perf_counter() + outputs = ( + llm.generate(prompts_new, sampling_params) + if isinstance(prompts_new[0], str) + else llm.generate(sampling_params=sampling_params, prompt_token_ids=prompts_new) + ) + torch.cuda.synchronize() + end_time = time.perf_counter() + duration_time = end_time - start_time + + num_tokens = 0 + # Print the outputs. + for i, output in enumerate(outputs): + prompt = prompts[i] # show the origin prompt. actully prompt is "output.prompt" + generated_text = output.outputs[0].text + + num_tokens += len(output.outputs[0].token_ids) + print(f"Prompt: {prompt}\nGenerated text: {generated_text} \n") + print(f"tokens: {num_tokens}, QPS: {num_tokens/duration_time}") diff --git a/tests/models_vllm.yaml b/tests/models_vllm.yaml index 548f16c81dbc297b22a666e4982c3a3027311128..5781866f49744c906808a20af7cafeaace5241ba 100644 --- a/tests/models_vllm.yaml +++ b/tests/models_vllm.yaml @@ -112,3 +112,51 @@ - fp16 relative_path: models/multimodal/vision-language-understanding/minicpm-v-2/vllm/ task_type: multimodal/vision-language-understanding +- datasets: https://localhost + download_url: https://localhost/DeepSeek-R1-Distill-Qwen-1.5B + name: deepseek-r1-distill-qwen-1.5b + need_third_part: false + precisions: + - fp16 + relative_path: models/nlp/large_language_model/deepseek-r1-distill-qwen-1.5b/vllm/ + task_type: nlp/large_language_model +- datasets: https://localhost + download_url: https://localhost/DeepSeek-R1-Distill-Qwen-7B + name: deepseek-r1-distill-qwen-7b + need_third_part: false + precisions: + - fp16 + relative_path: models/nlp/large_language_model/deepseek-r1-distill-qwen-7b/vllm/ + task_type: nlp/large_language_model +- datasets: https://localhost + download_url: https://localhost/DeepSeek-R1-Distill-Qwen-14B + name: deepseek-r1-distill-qwen-14b + need_third_part: false + precisions: + - fp16 + relative_path: models/nlp/large_language_model/deepseek-r1-distill-qwen-14b/vllm/ + task_type: nlp/large_language_model +- datasets: https://localhost + download_url: https://localhost/DeepSeek-R1-Distill-Qwen-32B + name: deepseek-r1-distill-qwen-32b + need_third_part: false + precisions: + - fp16 + relative_path: models/nlp/large_language_model/deepseek-r1-distill-qwen-32b/vllm/ + task_type: nlp/large_language_model +- datasets: https://localhost + download_url: https://localhost/DeepSeek-R1-Distill-Llama-8B + name: deepseek-r1-distill-llama-8b + need_third_part: false + precisions: + - fp16 + relative_path: models/nlp/large_language_model/deepseek-r1-distill-llama-8b/vllm/ 
+ task_type: nlp/large_language_model +- datasets: https://localhost + download_url: https://localhost/DeepSeek-R1-Distill-Llama-70B + name: deepseek-r1-distill-llama-70b + need_third_part: false + precisions: + - fp16 + relative_path: models/nlp/large_language_model/deepseek-r1-distill-llama-70b/vllm/ + task_type: nlp/large_language_model \ No newline at end of file diff --git a/tests/run_vllm.py b/tests/run_vllm.py index c3eb0217cad0414008e39a98346e91fe0e77dae1..518ad79ee977d8e79f77dba8a60c09db28be6688 100644 --- a/tests/run_vllm.py +++ b/tests/run_vllm.py @@ -220,6 +220,12 @@ def run_nlp_testcase(model): export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda/lib64 python3 minicpmv-2.0-offline.py --model-path ./minicpm-v-2 --image-path ./dog.jpg """ + elif model_name.startswith("deepseek-r1-distill-"): + script = f""" + set -x + cd ../{model['relative_path']} + python3 offline_inference.py --model ./{model_name} --max-tokens 256 -tp 2 --temperature 0.0 --max-model-len 3096 + """ r, t = run_script(script) sout = r.stdout @@ -235,11 +241,6 @@ def run_nlp_testcase(model): result["result"][prec]["Cost time (s)"] = t return result -def get_metric_result(str): - if str: - return json.loads(str.replace("'", "\""))["metricResult"] - return None - def run_script(script): start_time = time.perf_counter() result = subprocess.run(