From a15cb5c7528f886be0440a63a054a1f932a9847d Mon Sep 17 00:00:00 2001 From: "xiaomei.wang" Date: Tue, 22 Oct 2024 17:11:05 +0800 Subject: [PATCH 1/4] Add llama2-7b/qwen-7b/qwen1.5-32b/qwen2-72b/qwen2-7b vllm inference. --- .../llama2-7b/vllm/README.md | 40 ++++ .../vllm/flashinfer_backend/README.md | 20 ++ .../flashinfer_backend/offline_inference.py | 117 ++++++++++++ .../llama2-7b/vllm/offline_inference.py | 117 ++++++++++++ .../llama2-7b/vllm/template_llama.jinja | 13 ++ .../llama2-7b/vllm/utils.py | 173 ++++++++++++++++++ .../qwen-7b/vllm/README.md | 41 +++++ .../qwen-7b/vllm/offline_inference.py | 117 ++++++++++++ .../qwen-7b/vllm/utils.py | 173 ++++++++++++++++++ .../qwen1.5-32b/vllm/README.md | 39 ++++ .../qwen1.5-32b/vllm/offline_inference.py | 116 ++++++++++++ .../qwen1.5-32b/vllm/utils.py | 173 ++++++++++++++++++ .../qwen2-72b/vllm/README.md | 43 +++++ .../qwen2-72b/vllm/offline_inference.py | 116 ++++++++++++ .../qwen2-72b/vllm/utils.py | 173 ++++++++++++++++++ .../qwen2-7b/vllm/README.md | 43 +++++ .../qwen2-7b/vllm/offline_inference.py | 116 ++++++++++++ .../qwen2-7b/vllm/utils.py | 173 ++++++++++++++++++ 18 files changed, 1803 insertions(+) create mode 100755 models/nlp/large_language_model/llama2-7b/vllm/README.md create mode 100644 models/nlp/large_language_model/llama2-7b/vllm/flashinfer_backend/README.md create mode 100644 models/nlp/large_language_model/llama2-7b/vllm/flashinfer_backend/offline_inference.py create mode 100644 models/nlp/large_language_model/llama2-7b/vllm/offline_inference.py create mode 100644 models/nlp/large_language_model/llama2-7b/vllm/template_llama.jinja create mode 100644 models/nlp/large_language_model/llama2-7b/vllm/utils.py create mode 100644 models/nlp/large_language_model/qwen-7b/vllm/README.md create mode 100644 models/nlp/large_language_model/qwen-7b/vllm/offline_inference.py create mode 100644 models/nlp/large_language_model/qwen-7b/vllm/utils.py create mode 100755 models/nlp/large_language_model/qwen1.5-32b/vllm/README.md create mode 100644 models/nlp/large_language_model/qwen1.5-32b/vllm/offline_inference.py create mode 100644 models/nlp/large_language_model/qwen1.5-32b/vllm/utils.py create mode 100755 models/nlp/large_language_model/qwen2-72b/vllm/README.md create mode 100644 models/nlp/large_language_model/qwen2-72b/vllm/offline_inference.py create mode 100644 models/nlp/large_language_model/qwen2-72b/vllm/utils.py create mode 100755 models/nlp/large_language_model/qwen2-7b/vllm/README.md create mode 100644 models/nlp/large_language_model/qwen2-7b/vllm/offline_inference.py create mode 100644 models/nlp/large_language_model/qwen2-7b/vllm/utils.py diff --git a/models/nlp/large_language_model/llama2-7b/vllm/README.md b/models/nlp/large_language_model/llama2-7b/vllm/README.md new file mode 100755 index 00000000..32793683 --- /dev/null +++ b/models/nlp/large_language_model/llama2-7b/vllm/README.md @@ -0,0 +1,40 @@ +# LlaMa2 7B + +## Description + +we develop and release Llama 2, a collection of pretrained and fine-tuned large language models (LLMs) ranging in scale from 7 billion to 70 billion parameters. Our fine-tuned LLMs, called Llama 2-Chat, are optimized for dialogue use cases. Our models outperform open-source chat models on most benchmarks we tested, and based on our human evaluations for helpfulness and safety, may be a suitable substitute for closed-source models. 
We provide a detailed description of our approach to fine-tuning and safety improvements of Llama 2-Chat in order to enable the community to build on our work and contribute to the responsible development of LLMs. + +## Setup + +### Instal + +In order to run the model smoothly, you need to get the sdk from [resource center](https://support.iluvatar.com/#/ProductLine?id=2) of Iluvatar CoreX official website. + +```bash +# Install libGL +## CentOS +yum install -y mesa-libGL +## Ubuntu +apt install -y libgl1-mesa-dev + +# Contact the iluvatar manager to get adapted install packages of vllm, triton, and ixformer +pip3 install vllm +pip3 install triton +pip3 install ixformer +``` +### Download + +-Model: + +```bash +cd ${DeepSparkInference}/models/nlp/large_language_model/llama2-7b/vllm +mkdir -p data/llama2 +ln -s /path/to/llama2-7b ./data/llama2 +``` + +## Inference + +```bash +python3 offline_inference.py --model ./data/llama2/llama2-7b --max-tokens 256 -tp 1 --temperature 0.0 +python3 offline_inference.py --model ./data/llama2/llama2-7b --max-tokens 256 -tp 2 --temperature 0.0 +``` diff --git a/models/nlp/large_language_model/llama2-7b/vllm/flashinfer_backend/README.md b/models/nlp/large_language_model/llama2-7b/vllm/flashinfer_backend/README.md new file mode 100644 index 00000000..7f766da8 --- /dev/null +++ b/models/nlp/large_language_model/llama2-7b/vllm/flashinfer_backend/README.md @@ -0,0 +1,20 @@ +# attention 使用不同的backend + +通过设置环境变量可以使用不同的attention实现,目前支持两种。 +VLLM_ATTENTION_BACKEND=FLASHINFER / XFORMERS(默认使用) + +```shell +# offline +VLLM_ATTENTION_BACKEND=FLASHINFER python3 offline_inference.py \ + --model xxx/Meta-Llama-3-8B-Instruct/ \ + --max-tokens 256 \ + --temperature 0.0 + +# server +VLLM_ATTENTION_BACKEND=FLASHINFER python3 -m vllm.entrypoints.openai.api_server \ + --model xxx/Meta-Llama-3-8B-Instruct/ \ + --gpu-memory-utilization 0.9 \ + --max-num-seqs 1024 \ + --host 127.0.0.1 \ + --port 12345 +``` diff --git a/models/nlp/large_language_model/llama2-7b/vllm/flashinfer_backend/offline_inference.py b/models/nlp/large_language_model/llama2-7b/vllm/flashinfer_backend/offline_inference.py new file mode 100644 index 00000000..eb6da16d --- /dev/null +++ b/models/nlp/large_language_model/llama2-7b/vllm/flashinfer_backend/offline_inference.py @@ -0,0 +1,117 @@ +import sys +from pathlib import Path + +sys.path.append(str(Path(__file__).resolve().parent.parent.parent)) +import argparse +import dataclasses +import inspect +import logging +import time + +import torch +from utils import load_chat_template, sampling_add_cli_args +from vllm import LLM, EngineArgs, SamplingParams + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--chat_template", type=str, default=None) + parser.add_argument( + "--remove_chat_template", + default=False, + action="store_true", + help="pass this if you are not use a chat model", + ) + parser = EngineArgs.add_cli_args(parser) + parser = sampling_add_cli_args(parser) + args = parser.parse_args() + + engine_args = [attr.name for attr in dataclasses.fields(EngineArgs)] + sampling_args = [ + param.name + for param in list( + inspect.signature(SamplingParams.__init__).parameters.values() + )[1:] + ] + engine_params = {attr: getattr(args, attr) for attr in engine_args} + sampling_params = { + attr: getattr(args, attr) for attr in sampling_args if args.__contains__(attr) + } + + model_name = args.model.strip() + model_name = model_name if args.model[-1] != "/" else model_name[:-1] + model_name = model_name.rsplit("/")[-1] + + # 
Sample prompts. + prompts = [ + "Shanghai is one of the most prosperous cities in China, with a GDP of over $300 billion. Shanghai has the fastest growing economy in China and is the second busiest port in the world. In addition to being a hub for business, Shanghai is also a major tourist destination. It is known for its diverse culture and many historical sites.\nThe city of Shanghai is located on the coast of the Pacific Ocean in east-central China. It is bordered by Jiangsu Province to the north, Zhejiang Province to the south, and Jiangsu Province to the west.", + "What signs may indicate that a person is experiencing anxiety?", + "Describe how to make cheese pizza.", + "Write a review article on the development of 5G networks.", + ] + + # Create a sampling params object. + sampling_params = SamplingParams(**sampling_params) + + # Create an LLM. + llm = LLM(**engine_params) + + # process chat template + if args.remove_chat_template: + prompts_new = prompts + if "chat" in model_name.lower(): + logging.warning( + f"The model name from model path is {model_name}, so we guess you are using the chat model and the additional processing is required for the input prompt. " + f"If the result is not quite correct, please ensure you do not pass --remove_chat_template in CLI." + ) + else: + # Build chat model promopt + # logging.warning("If you are using a non chat model, please pass the --remove_chat_template in CLI.") + # Try use transformers's apply_chat_template, if chat_template is None, will use defalut template. + # For some old models, the default template may cause bad answers. we don't consider this situation, + # because the Transformers team is advancing the chat template. For more informatino about it, + # please refer to https://huggingface.co/docs/transformers/main/chat_templating + try: + load_chat_template(llm.get_tokenizer(), args.chat_template) + prompts_new = [] + for prompt in prompts: + messages = [{"role": "user", "content": prompt}] + text = llm.get_tokenizer().apply_chat_template( + messages, tokenize=False, add_generation_prompt=True + ) + prompts_new.append(text) + except: + logging.warning( + "use tokenizer apply_chat_template function failed, may because of low transformers version...(try use transformers>=4.34.0)" + ) + prompts_new = prompts + + outputs = ( + llm.generate(prompts_new, sampling_params, use_tqdm=False) + if isinstance(prompts_new[0], str) + else llm.generate( + sampling_params=sampling_params, + prompt_token_ids=prompts_new, + use_tqdm=False, + ) + ) + torch.cuda.synchronize() + + start_time = time.perf_counter() + outputs = ( + llm.generate(prompts_new, sampling_params) + if isinstance(prompts_new[0], str) + else llm.generate(sampling_params=sampling_params, prompt_token_ids=prompts_new) + ) + torch.cuda.synchronize() + end_time = time.perf_counter() + duration_time = end_time - start_time + + num_tokens = 0 + # Print the outputs. + for i, output in enumerate(outputs): + prompt = prompts[i] # show the origin prompt. 
actully prompt is "output.prompt" + generated_text = output.outputs[0].text + + num_tokens += len(output.outputs[0].token_ids) + print(f"Prompt: {prompt}\nGenerated text: {generated_text} \n") + print(f"tokens: {num_tokens}, QPS: {num_tokens/duration_time}") diff --git a/models/nlp/large_language_model/llama2-7b/vllm/offline_inference.py b/models/nlp/large_language_model/llama2-7b/vllm/offline_inference.py new file mode 100644 index 00000000..eb6da16d --- /dev/null +++ b/models/nlp/large_language_model/llama2-7b/vllm/offline_inference.py @@ -0,0 +1,117 @@ +import sys +from pathlib import Path + +sys.path.append(str(Path(__file__).resolve().parent.parent.parent)) +import argparse +import dataclasses +import inspect +import logging +import time + +import torch +from utils import load_chat_template, sampling_add_cli_args +from vllm import LLM, EngineArgs, SamplingParams + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--chat_template", type=str, default=None) + parser.add_argument( + "--remove_chat_template", + default=False, + action="store_true", + help="pass this if you are not use a chat model", + ) + parser = EngineArgs.add_cli_args(parser) + parser = sampling_add_cli_args(parser) + args = parser.parse_args() + + engine_args = [attr.name for attr in dataclasses.fields(EngineArgs)] + sampling_args = [ + param.name + for param in list( + inspect.signature(SamplingParams.__init__).parameters.values() + )[1:] + ] + engine_params = {attr: getattr(args, attr) for attr in engine_args} + sampling_params = { + attr: getattr(args, attr) for attr in sampling_args if args.__contains__(attr) + } + + model_name = args.model.strip() + model_name = model_name if args.model[-1] != "/" else model_name[:-1] + model_name = model_name.rsplit("/")[-1] + + # Sample prompts. + prompts = [ + "Shanghai is one of the most prosperous cities in China, with a GDP of over $300 billion. Shanghai has the fastest growing economy in China and is the second busiest port in the world. In addition to being a hub for business, Shanghai is also a major tourist destination. It is known for its diverse culture and many historical sites.\nThe city of Shanghai is located on the coast of the Pacific Ocean in east-central China. It is bordered by Jiangsu Province to the north, Zhejiang Province to the south, and Jiangsu Province to the west.", + "What signs may indicate that a person is experiencing anxiety?", + "Describe how to make cheese pizza.", + "Write a review article on the development of 5G networks.", + ] + + # Create a sampling params object. + sampling_params = SamplingParams(**sampling_params) + + # Create an LLM. + llm = LLM(**engine_params) + + # process chat template + if args.remove_chat_template: + prompts_new = prompts + if "chat" in model_name.lower(): + logging.warning( + f"The model name from model path is {model_name}, so we guess you are using the chat model and the additional processing is required for the input prompt. " + f"If the result is not quite correct, please ensure you do not pass --remove_chat_template in CLI." + ) + else: + # Build chat model promopt + # logging.warning("If you are using a non chat model, please pass the --remove_chat_template in CLI.") + # Try use transformers's apply_chat_template, if chat_template is None, will use defalut template. + # For some old models, the default template may cause bad answers. we don't consider this situation, + # because the Transformers team is advancing the chat template. 
For more informatino about it, + # please refer to https://huggingface.co/docs/transformers/main/chat_templating + try: + load_chat_template(llm.get_tokenizer(), args.chat_template) + prompts_new = [] + for prompt in prompts: + messages = [{"role": "user", "content": prompt}] + text = llm.get_tokenizer().apply_chat_template( + messages, tokenize=False, add_generation_prompt=True + ) + prompts_new.append(text) + except: + logging.warning( + "use tokenizer apply_chat_template function failed, may because of low transformers version...(try use transformers>=4.34.0)" + ) + prompts_new = prompts + + outputs = ( + llm.generate(prompts_new, sampling_params, use_tqdm=False) + if isinstance(prompts_new[0], str) + else llm.generate( + sampling_params=sampling_params, + prompt_token_ids=prompts_new, + use_tqdm=False, + ) + ) + torch.cuda.synchronize() + + start_time = time.perf_counter() + outputs = ( + llm.generate(prompts_new, sampling_params) + if isinstance(prompts_new[0], str) + else llm.generate(sampling_params=sampling_params, prompt_token_ids=prompts_new) + ) + torch.cuda.synchronize() + end_time = time.perf_counter() + duration_time = end_time - start_time + + num_tokens = 0 + # Print the outputs. + for i, output in enumerate(outputs): + prompt = prompts[i] # show the origin prompt. actully prompt is "output.prompt" + generated_text = output.outputs[0].text + + num_tokens += len(output.outputs[0].token_ids) + print(f"Prompt: {prompt}\nGenerated text: {generated_text} \n") + print(f"tokens: {num_tokens}, QPS: {num_tokens/duration_time}") diff --git a/models/nlp/large_language_model/llama2-7b/vllm/template_llama.jinja b/models/nlp/large_language_model/llama2-7b/vllm/template_llama.jinja new file mode 100644 index 00000000..245ceeff --- /dev/null +++ b/models/nlp/large_language_model/llama2-7b/vllm/template_llama.jinja @@ -0,0 +1,13 @@ +{{- '[INST] ' -}} + +{%- for message in messages -%} + {%- if loop.first and message['role'] == 'system' -%} + {{- '<>\n' + message['content'] + '\n<>\n' -}} + {%- elif message['role'] == 'user' and loop.index <= 2 -%} + {{- message['content'] + ' [/INST]' -}} + {%- elif message['role'] == 'user' -%} + {{- '[INST] ' + message['content'] + ' [/INST]' -}} + {%- elif message['role'] == 'assistant' -%} + {{- ' ' + message['content'] + ' ' -}} + {%- endif -%} +{%- endfor -%} diff --git a/models/nlp/large_language_model/llama2-7b/vllm/utils.py b/models/nlp/large_language_model/llama2-7b/vllm/utils.py new file mode 100644 index 00000000..c6def85d --- /dev/null +++ b/models/nlp/large_language_model/llama2-7b/vllm/utils.py @@ -0,0 +1,173 @@ +# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. 
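+#
+# Shared helpers for the offline_inference.py scripts in this patch:
+#   * sampling_add_cli_args(parser) registers the vllm.SamplingParams fields
+#     (n, best_of, temperature, top-p, top-k, penalties, max-tokens, ...) as
+#     CLI flags so the parsed values can be collected from argparse and passed
+#     straight to SamplingParams(**sampling_params) by the caller.
+#   * load_chat_template(tokenizer, chat_template) reads a Jinja template file
+#     (or an escape-encoded template string) into tokenizer.chat_template,
+#     falling back to the tokenizer's built-in template when none is supplied.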
+ +import codecs +import logging +import argparse + + +def sampling_add_cli_args(args: argparse.ArgumentParser) -> argparse.ArgumentParser: + args.add_argument( + '--n', + type=int, + default=1, + help="Number of output sequences to return for the given prompt.") + args.add_argument( + '--best-of', + type=int, + default=None, + help="Number of output sequences that are generated from the prompt. " + "From these `best_of` sequences, the top `n` sequences are returned. " + "`best_of` must be greater than or equal to `n`. This is treated as " + "the beam width when `use_beam_search` is True. By default, `best_of`" + "is set to `n`.") + args.add_argument( + '--presence-penalty', + type=float, + default=0.0, + help="Float that penalizes new tokens based on whether they " + "appear in the generated text so far. Values > 0 encourage the model " + "to use new tokens, while values < 0 encourage the model to repeat " + "tokens.") + args.add_argument( + '--frequency-penalty', + type=float, + default=0.0, + help="Float that penalizes new tokens based on their " + " frequency in the generated text so far. Values > 0 encourage the " + " model to use new tokens, while values < 0 encourage the model to " + "repeat tokens.") + args.add_argument( + '--repetition-penalty', + type=float, + default=1.0, + help="Float that penalizes new tokens based on whether " + "they appear in the prompt and the generated text so far. Values > 1 " + "encourage the model to use new tokens, while values < 1 encourage " + "the model to repeat tokens.") + args.add_argument( + '--temperature', + type=float, + default=1.0, + help="Float that controls the randomness of the sampling. Lower " + "values make the model more deterministic, while higher values make " + "the model more random. Zero means greedy sampling.") + args.add_argument( + '--top-p', + type=float, + default=1.0, + help="Float that controls the cumulative probability of the top tokens " + "to consider. Must be in (0, 1]. Set to 1 to consider all tokens.") + args.add_argument( + '--top-k', + type=int, + default=-1, + help="Integer that controls the number of top tokens to consider. Set " + "to -1 to consider all tokens.") + args.add_argument( + '--min-p', + type=float, + default=0.0, + help="Float that represents the minimum probability for a token to be " + "considered, relative to the probability of the most likely token. " + "Must be in [0, 1]. Set to 0 to disable this.") + args.add_argument( + '--use-beam-search', + default=False, + action="store_true", + help="Whether to use beam search instead of sampling.") + args.add_argument( + '--length-penalty', + type=float, + default=1.0, + help="Float that penalizes sequences based on their length. Used in beam search.") + args.add_argument( + '--stop', + type=str, + default=None, + help="List of strings that stop the generation when they are generated. " + "The returned output will not contain the stop strings.") + args.add_argument( + '--stop-token-ids', + type=int, + default=None, + help="List of tokens that stop the generation when they are " + "generated. The returned output will contain the stop tokens unless " + "the stop tokens are special tokens.") + args.add_argument( + '--include-stop-str-in-output', + default=False, + action="store_true", + help="Whether to include the stop strings in output text. 
Defaults to False.") + args.add_argument( + '--ignore-eos', + default=False, + action="store_true", + help="Whether to ignore the EOS token and continue generating tokens after the EOS token is generated.") + args.add_argument( + '--max-tokens', + type=int, + default=16, + help="Maximum number of tokens to generate per output sequence.") + args.add_argument( + '--logprobs', + type=int, + default=None, + help="NNumber of log probabilities to return per output token. " + "Note that the implementation follows the OpenAI API: The return " + "result includes the log probabilities on the `logprobs` most likely " + "tokens, as well the chosen tokens. The API will always return the " + "log probability of the sampled token, so there may be up to " + "`logprobs+1` elements in the response.") + args.add_argument( + '--prompt-logprobs', + type=int, + default=None, + help="Number of log probabilities to return per prompt token.") + args.add_argument( + '--skip-special-tokens', + default=True, + action="store_false", + help="Whether to skip special tokens in the output.") + args.add_argument( + '--spaces-between-special-tokens', + default=True, + action="store_false", + help="Whether to add spaces between special tokens in the output. Defaults to True.") + # early_stopping logits_processors seed + return args + + +def load_chat_template(tokenizer, chat_template): + if chat_template is not None: + try: + with open(chat_template, "r") as f: + tokenizer.chat_template = f.read() + except OSError: + # If opening a file fails, set chat template to be args to + # ensure we decode so our escape are interpreted correctly + tokenizer.chat_template = codecs.decode( + chat_template, "unicode_escape") + + logging.info( + f"Using supplied chat template:\n{tokenizer.chat_template}" + ) + elif tokenizer.chat_template is not None: + logging.info( + f"Using default chat template:\n{tokenizer.chat_template}. This May lead to unsatisfactory results. You can provide a template.jinja file for vllm." + ) + else: + logging.warning( + "No chat template provided. Chat API will not work. This May lead to unsatisfactory results. You can provide a template.jinja file for vllm.") diff --git a/models/nlp/large_language_model/qwen-7b/vllm/README.md b/models/nlp/large_language_model/qwen-7b/vllm/README.md new file mode 100644 index 00000000..0eeafd3c --- /dev/null +++ b/models/nlp/large_language_model/qwen-7b/vllm/README.md @@ -0,0 +1,41 @@ +# Qwen-7B + +## Description + +Large language models (LLMs) have revolutionized the field of artificial intelligence, enabling natural language processing tasks that were previously thought to be exclusive to humans. In this work, we introduce Qwen, the first installment of our large language model series. Qwen is a comprehensive language model series that encompasses distinct models with varying parameter counts. It includes Qwen, the base pretrained language models, and Qwen-Chat, the chat models finetuned with human alignment techniques. The base language models consistently demonstrate superior performance across a multitude of downstream tasks, and the chat models, particularly those trained using Reinforcement Learning from Human Feedback (RLHF), are highly competitive. The chat models possess advanced tool-use and planning capabilities for creating agent applications, showcasing impressive performance even when compared to bigger models on complex tasks like utilizing a code interpreter. 
Furthermore, we have developed coding-specialized models, Code-Qwen and Code-Qwen-Chat, as well as mathematics-focused models, Math-Qwen-Chat, which are built upon base language models. These models demonstrate significantly improved performance in comparison with open-source models, and slightly fall behind the proprietary models. + +## Setup + +### Install + +In order to run the model smoothly, you need to get the sdk from [resource center](https://support.iluvatar.com/#/ProductLine?id=2) of Iluvatar CoreX official website. + +```bash +# Install libGL +## CentOS +yum install -y mesa-libGL +## Ubuntu +apt install -y libgl1-mesa-glx + +# Contact the iluvatar manager to get adapted install packages of vllm, triton, and ixformer +pip3 install vllm +pip3 install triton +pip3 install ixformer +``` + +### Download + +-Model: -Model: + +```bash +cd ${DeepSparkInference}/models/nlp/large_language_model/qwen-7b/vllm +mkdir data/qwen +ln -s /path/to/Qwen-7B ./data/qwen +``` + +## Inference + +```bash +export CUDA_VISIBLE_DEVICES=0,1 +python3 offline_inference.py --model ./data/qwen/Qwen-7B-Chat --max-tokens 256 -tp 2 --trust-remote-code --temperature 0.0 +``` diff --git a/models/nlp/large_language_model/qwen-7b/vllm/offline_inference.py b/models/nlp/large_language_model/qwen-7b/vllm/offline_inference.py new file mode 100644 index 00000000..5235f032 --- /dev/null +++ b/models/nlp/large_language_model/qwen-7b/vllm/offline_inference.py @@ -0,0 +1,117 @@ +import sys +from pathlib import Path + +sys.path.append(str(Path(__file__).resolve().parent.parent.parent)) +import argparse +import dataclasses +import inspect +import logging +import time + +import torch +from utils import load_chat_template, sampling_add_cli_args +from vllm import LLM, EngineArgs, SamplingParams + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--chat_template", type=str, default=None) + parser.add_argument( + "--remove_chat_template", + default=False, + action="store_true", + help="pass this if you are not use a chat model", + ) + parser = EngineArgs.add_cli_args(parser) + parser = sampling_add_cli_args(parser) + args = parser.parse_args() + + engine_args = [attr.name for attr in dataclasses.fields(EngineArgs)] + sampling_args = [ + param.name + for param in list( + inspect.signature(SamplingParams.__init__).parameters.values() + )[1:] + ] + engine_params = {attr: getattr(args, attr) for attr in engine_args} + sampling_params = { + attr: getattr(args, attr) for attr in sampling_args if args.__contains__(attr) + } + + model_name = args.model.strip() + model_name = model_name if args.model[-1] != "/" else model_name[:-1] + model_name = model_name.rsplit("/")[-1] + + # Sample prompts. + prompts = ["哪些迹象可能表明一个人正在经历焦虑?", "描述一下如何制作芝士披萨。", "写一篇有关5G网络研发的综述文章。"] + + # Create a sampling params object. + sampling_params = SamplingParams(**sampling_params) + + # Create an LLM. + llm = LLM(**engine_params) + + # process chat template + if args.remove_chat_template: + if "chat" in model_name.lower(): + logging.warning( + f"The model name from model path is {model_name}, so we guess you are using the chat model and the additional processing is required for the input prompt. " + f"If the result is not quite correct, please ensure you do not pass --remove_chat_template in CLI." + ) + prompts_new = prompts + else: + # Build chat model promopt + logging.warning( + "If you are using a non chat model, please pass the --remove_chat_template in CLI." 
+ ) + # Try use transformers's apply_chat_template, if chat_template is None, will use defalut template. + # For some old models, the default template may cause bad answers. we don't consider this situation, + # because the Transformers team is advancing the chat template. For more informatino about it, + # please refer to https://huggingface.co/docs/transformers/main/chat_templating + try: + load_chat_template(llm.get_tokenizer(), args.chat_template) + prompts_new = [] + for prompt in prompts: + messages = [{"role": "user", "content": prompt}] + text = llm.get_tokenizer().apply_chat_template( + messages, tokenize=False, add_generation_prompt=True + ) + prompts_new.append(text) + except: + logging.warning( + "use tokenizer apply_chat_template function failed, may because of low transformers version...(try use transformers>=4.34.0)" + ) + + prompts_new = prompts + + # Generate texts from the prompts. The output is a list of RequestOutput objects + # that contain the prompt, generated text, and other information. + outputs = ( + llm.generate(prompts_new, sampling_params, use_tqdm=False) + if isinstance(prompts_new[0], str) + else llm.generate( + sampling_params=sampling_params, + prompt_token_ids=prompts_new, + use_tqdm=False, + ) + ) + torch.cuda.synchronize() + + start_time = time.perf_counter() + outputs = ( + llm.generate(prompts_new, sampling_params) + if isinstance(prompts_new[0], str) + else llm.generate(sampling_params=sampling_params, prompt_token_ids=prompts_new) + ) + torch.cuda.synchronize() + end_time = time.perf_counter() + duration_time = end_time - start_time + + num_tokens = 0 + # Print the outputs. + for i, output in enumerate(outputs): + prompt = prompts[i] # show the origin prompt. actully prompt is "output.prompt" + generated_text = output.outputs[0].text + + num_tokens += len(output.outputs[0].token_ids) + print(f"Prompt: {prompt}\nGenerated text: {generated_text} \n") + print(f"tokens: {num_tokens}, QPS: {num_tokens/duration_time}") diff --git a/models/nlp/large_language_model/qwen-7b/vllm/utils.py b/models/nlp/large_language_model/qwen-7b/vllm/utils.py new file mode 100644 index 00000000..c6def85d --- /dev/null +++ b/models/nlp/large_language_model/qwen-7b/vllm/utils.py @@ -0,0 +1,173 @@ +# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. + +import codecs +import logging +import argparse + + +def sampling_add_cli_args(args: argparse.ArgumentParser) -> argparse.ArgumentParser: + args.add_argument( + '--n', + type=int, + default=1, + help="Number of output sequences to return for the given prompt.") + args.add_argument( + '--best-of', + type=int, + default=None, + help="Number of output sequences that are generated from the prompt. " + "From these `best_of` sequences, the top `n` sequences are returned. " + "`best_of` must be greater than or equal to `n`. This is treated as " + "the beam width when `use_beam_search` is True. 
By default, `best_of`" + "is set to `n`.") + args.add_argument( + '--presence-penalty', + type=float, + default=0.0, + help="Float that penalizes new tokens based on whether they " + "appear in the generated text so far. Values > 0 encourage the model " + "to use new tokens, while values < 0 encourage the model to repeat " + "tokens.") + args.add_argument( + '--frequency-penalty', + type=float, + default=0.0, + help="Float that penalizes new tokens based on their " + " frequency in the generated text so far. Values > 0 encourage the " + " model to use new tokens, while values < 0 encourage the model to " + "repeat tokens.") + args.add_argument( + '--repetition-penalty', + type=float, + default=1.0, + help="Float that penalizes new tokens based on whether " + "they appear in the prompt and the generated text so far. Values > 1 " + "encourage the model to use new tokens, while values < 1 encourage " + "the model to repeat tokens.") + args.add_argument( + '--temperature', + type=float, + default=1.0, + help="Float that controls the randomness of the sampling. Lower " + "values make the model more deterministic, while higher values make " + "the model more random. Zero means greedy sampling.") + args.add_argument( + '--top-p', + type=float, + default=1.0, + help="Float that controls the cumulative probability of the top tokens " + "to consider. Must be in (0, 1]. Set to 1 to consider all tokens.") + args.add_argument( + '--top-k', + type=int, + default=-1, + help="Integer that controls the number of top tokens to consider. Set " + "to -1 to consider all tokens.") + args.add_argument( + '--min-p', + type=float, + default=0.0, + help="Float that represents the minimum probability for a token to be " + "considered, relative to the probability of the most likely token. " + "Must be in [0, 1]. Set to 0 to disable this.") + args.add_argument( + '--use-beam-search', + default=False, + action="store_true", + help="Whether to use beam search instead of sampling.") + args.add_argument( + '--length-penalty', + type=float, + default=1.0, + help="Float that penalizes sequences based on their length. Used in beam search.") + args.add_argument( + '--stop', + type=str, + default=None, + help="List of strings that stop the generation when they are generated. " + "The returned output will not contain the stop strings.") + args.add_argument( + '--stop-token-ids', + type=int, + default=None, + help="List of tokens that stop the generation when they are " + "generated. The returned output will contain the stop tokens unless " + "the stop tokens are special tokens.") + args.add_argument( + '--include-stop-str-in-output', + default=False, + action="store_true", + help="Whether to include the stop strings in output text. Defaults to False.") + args.add_argument( + '--ignore-eos', + default=False, + action="store_true", + help="Whether to ignore the EOS token and continue generating tokens after the EOS token is generated.") + args.add_argument( + '--max-tokens', + type=int, + default=16, + help="Maximum number of tokens to generate per output sequence.") + args.add_argument( + '--logprobs', + type=int, + default=None, + help="NNumber of log probabilities to return per output token. " + "Note that the implementation follows the OpenAI API: The return " + "result includes the log probabilities on the `logprobs` most likely " + "tokens, as well the chosen tokens. 
The API will always return the " + "log probability of the sampled token, so there may be up to " + "`logprobs+1` elements in the response.") + args.add_argument( + '--prompt-logprobs', + type=int, + default=None, + help="Number of log probabilities to return per prompt token.") + args.add_argument( + '--skip-special-tokens', + default=True, + action="store_false", + help="Whether to skip special tokens in the output.") + args.add_argument( + '--spaces-between-special-tokens', + default=True, + action="store_false", + help="Whether to add spaces between special tokens in the output. Defaults to True.") + # early_stopping logits_processors seed + return args + + +def load_chat_template(tokenizer, chat_template): + if chat_template is not None: + try: + with open(chat_template, "r") as f: + tokenizer.chat_template = f.read() + except OSError: + # If opening a file fails, set chat template to be args to + # ensure we decode so our escape are interpreted correctly + tokenizer.chat_template = codecs.decode( + chat_template, "unicode_escape") + + logging.info( + f"Using supplied chat template:\n{tokenizer.chat_template}" + ) + elif tokenizer.chat_template is not None: + logging.info( + f"Using default chat template:\n{tokenizer.chat_template}. This May lead to unsatisfactory results. You can provide a template.jinja file for vllm." + ) + else: + logging.warning( + "No chat template provided. Chat API will not work. This May lead to unsatisfactory results. You can provide a template.jinja file for vllm.") diff --git a/models/nlp/large_language_model/qwen1.5-32b/vllm/README.md b/models/nlp/large_language_model/qwen1.5-32b/vllm/README.md new file mode 100755 index 00000000..5c766a76 --- /dev/null +++ b/models/nlp/large_language_model/qwen1.5-32b/vllm/README.md @@ -0,0 +1,39 @@ +# Qwen1.5-32B-Chat + +## Description + +Qwen1.5 is a language model series including decoder language models of different model sizes. For each size, we release the base language model and the aligned chat model. It is based on the Transformer architecture with SwiGLU activation, attention QKV bias, group query attention, mixture of sliding window attention and full attention, etc. Additionally, we have an improved tokenizer adaptive to multiple natural languages and codes. 
+ +## Setup + +### Install + +```bash +# Install libGL +## CentOS +yum install -y mesa-libGL +## Ubuntu +apt install -y libgl1-mesa-glx + +# Contact the iluvatar manager to get adapted install packages of vllm, triton, and ixformer +pip3 install vllm +pip3 install triton +pip3 install ixformer +``` + +### Download + +-Model: + +```bash +cd ${DeepSparkInference}/models/nlp/large_language_model/qwen1.5-32b/vllm +mkdir data/qwen1.5 +ln -s /path/to/Qwen1.5-32B ./data/qwen1.5 +``` + +## Inference + +```bash +export CUDA_VISIBLE_DEVICES=0,1,2,3 +python3 offline_inference.py --model ./data/qwen1.5/Qwen1.5-32B-Chat --max-tokens 256 -tp 4 --temperature 0.0 +``` diff --git a/models/nlp/large_language_model/qwen1.5-32b/vllm/offline_inference.py b/models/nlp/large_language_model/qwen1.5-32b/vllm/offline_inference.py new file mode 100644 index 00000000..81b0635e --- /dev/null +++ b/models/nlp/large_language_model/qwen1.5-32b/vllm/offline_inference.py @@ -0,0 +1,116 @@ +import sys +from pathlib import Path + +sys.path.append(str(Path(__file__).resolve().parent.parent.parent)) +import argparse +import dataclasses +import inspect +import logging +import time + +import torch +from utils import load_chat_template, sampling_add_cli_args +from vllm import LLM, EngineArgs, SamplingParams + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--chat_template", type=str, default=None) + parser.add_argument( + "--remove_chat_template", + default=False, + action="store_true", + help="pass this if you are not use a chat model", + ) + parser = EngineArgs.add_cli_args(parser) + parser = sampling_add_cli_args(parser) + args = parser.parse_args() + + engine_args = [attr.name for attr in dataclasses.fields(EngineArgs)] + sampling_args = [ + param.name + for param in list( + inspect.signature(SamplingParams.__init__).parameters.values() + )[1:] + ] + engine_params = {attr: getattr(args, attr) for attr in engine_args} + sampling_params = { + attr: getattr(args, attr) for attr in sampling_args if args.__contains__(attr) + } + + model_name = args.model.strip() + model_name = model_name if args.model[-1] != "/" else model_name[:-1] + model_name = model_name.rsplit("/")[-1] + + # Sample prompts. + prompts = ["哪些迹象可能表明一个人正在经历焦虑?", "描述一下如何制作芝士披萨。", "写一篇有关5G网络研发的综述文章。"] + + # Create a sampling params object. + sampling_params = SamplingParams(**sampling_params) + + # Create an LLM. + llm = LLM(**engine_params) + + # process chat template + if args.remove_chat_template: + if "chat" in model_name.lower(): + logging.warning( + f"The model name from model path is {model_name}, so we guess you are using the chat model and the additional processing is required for the input prompt. " + f"If the result is not quite correct, please ensure you do not pass --remove_chat_template in CLI." + ) + prompts_new = prompts + else: + # Build chat model promopt + logging.warning( + "If you are using a non chat model, please pass the --remove_chat_template in CLI." + ) + # Try use transformers's apply_chat_template, if chat_template is None, will use defalut template. + # For some old models, the default template may cause bad answers. we don't consider this situation, + # because the Transformers team is advancing the chat template. 
For more informatino about it, + # please refer to https://huggingface.co/docs/transformers/main/chat_templating + try: + load_chat_template(llm.get_tokenizer(), args.chat_template) + prompts_new = [] + for prompt in prompts: + messages = [{"role": "user", "content": prompt}] + text = llm.get_tokenizer().apply_chat_template( + messages, tokenize=False, add_generation_prompt=True + ) + prompts_new.append(text) + except: + logging.warning( + "use tokenizer apply_chat_template function failed, may because of low transformers version...(try use transformers>=4.34.0)" + ) + prompts_new = prompts + + # Generate texts from the prompts. The output is a list of RequestOutput objects + # that contain the prompt, generated text, and other information. + outputs = ( + llm.generate(prompts_new, sampling_params, use_tqdm=False) + if isinstance(prompts_new[0], str) + else llm.generate( + sampling_params=sampling_params, + prompt_token_ids=prompts_new, + use_tqdm=False, + ) + ) + torch.cuda.synchronize() + + start_time = time.perf_counter() + outputs = ( + llm.generate(prompts_new, sampling_params) + if isinstance(prompts_new[0], str) + else llm.generate(sampling_params=sampling_params, prompt_token_ids=prompts_new) + ) + torch.cuda.synchronize() + end_time = time.perf_counter() + duration_time = end_time - start_time + + num_tokens = 0 + # Print the outputs. + for i, output in enumerate(outputs): + prompt = prompts[i] # show the origin prompt. actully prompt is "output.prompt" + generated_text = output.outputs[0].text + + num_tokens += len(output.outputs[0].token_ids) + print(f"Prompt: {prompt}\nGenerated text: {generated_text} \n") + print(f"tokens: {num_tokens}, QPS: {num_tokens/duration_time}") diff --git a/models/nlp/large_language_model/qwen1.5-32b/vllm/utils.py b/models/nlp/large_language_model/qwen1.5-32b/vllm/utils.py new file mode 100644 index 00000000..c6def85d --- /dev/null +++ b/models/nlp/large_language_model/qwen1.5-32b/vllm/utils.py @@ -0,0 +1,173 @@ +# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. + +import codecs +import logging +import argparse + + +def sampling_add_cli_args(args: argparse.ArgumentParser) -> argparse.ArgumentParser: + args.add_argument( + '--n', + type=int, + default=1, + help="Number of output sequences to return for the given prompt.") + args.add_argument( + '--best-of', + type=int, + default=None, + help="Number of output sequences that are generated from the prompt. " + "From these `best_of` sequences, the top `n` sequences are returned. " + "`best_of` must be greater than or equal to `n`. This is treated as " + "the beam width when `use_beam_search` is True. By default, `best_of`" + "is set to `n`.") + args.add_argument( + '--presence-penalty', + type=float, + default=0.0, + help="Float that penalizes new tokens based on whether they " + "appear in the generated text so far. 
Values > 0 encourage the model " + "to use new tokens, while values < 0 encourage the model to repeat " + "tokens.") + args.add_argument( + '--frequency-penalty', + type=float, + default=0.0, + help="Float that penalizes new tokens based on their " + " frequency in the generated text so far. Values > 0 encourage the " + " model to use new tokens, while values < 0 encourage the model to " + "repeat tokens.") + args.add_argument( + '--repetition-penalty', + type=float, + default=1.0, + help="Float that penalizes new tokens based on whether " + "they appear in the prompt and the generated text so far. Values > 1 " + "encourage the model to use new tokens, while values < 1 encourage " + "the model to repeat tokens.") + args.add_argument( + '--temperature', + type=float, + default=1.0, + help="Float that controls the randomness of the sampling. Lower " + "values make the model more deterministic, while higher values make " + "the model more random. Zero means greedy sampling.") + args.add_argument( + '--top-p', + type=float, + default=1.0, + help="Float that controls the cumulative probability of the top tokens " + "to consider. Must be in (0, 1]. Set to 1 to consider all tokens.") + args.add_argument( + '--top-k', + type=int, + default=-1, + help="Integer that controls the number of top tokens to consider. Set " + "to -1 to consider all tokens.") + args.add_argument( + '--min-p', + type=float, + default=0.0, + help="Float that represents the minimum probability for a token to be " + "considered, relative to the probability of the most likely token. " + "Must be in [0, 1]. Set to 0 to disable this.") + args.add_argument( + '--use-beam-search', + default=False, + action="store_true", + help="Whether to use beam search instead of sampling.") + args.add_argument( + '--length-penalty', + type=float, + default=1.0, + help="Float that penalizes sequences based on their length. Used in beam search.") + args.add_argument( + '--stop', + type=str, + default=None, + help="List of strings that stop the generation when they are generated. " + "The returned output will not contain the stop strings.") + args.add_argument( + '--stop-token-ids', + type=int, + default=None, + help="List of tokens that stop the generation when they are " + "generated. The returned output will contain the stop tokens unless " + "the stop tokens are special tokens.") + args.add_argument( + '--include-stop-str-in-output', + default=False, + action="store_true", + help="Whether to include the stop strings in output text. Defaults to False.") + args.add_argument( + '--ignore-eos', + default=False, + action="store_true", + help="Whether to ignore the EOS token and continue generating tokens after the EOS token is generated.") + args.add_argument( + '--max-tokens', + type=int, + default=16, + help="Maximum number of tokens to generate per output sequence.") + args.add_argument( + '--logprobs', + type=int, + default=None, + help="NNumber of log probabilities to return per output token. " + "Note that the implementation follows the OpenAI API: The return " + "result includes the log probabilities on the `logprobs` most likely " + "tokens, as well the chosen tokens. 
The API will always return the " + "log probability of the sampled token, so there may be up to " + "`logprobs+1` elements in the response.") + args.add_argument( + '--prompt-logprobs', + type=int, + default=None, + help="Number of log probabilities to return per prompt token.") + args.add_argument( + '--skip-special-tokens', + default=True, + action="store_false", + help="Whether to skip special tokens in the output.") + args.add_argument( + '--spaces-between-special-tokens', + default=True, + action="store_false", + help="Whether to add spaces between special tokens in the output. Defaults to True.") + # early_stopping logits_processors seed + return args + + +def load_chat_template(tokenizer, chat_template): + if chat_template is not None: + try: + with open(chat_template, "r") as f: + tokenizer.chat_template = f.read() + except OSError: + # If opening a file fails, set chat template to be args to + # ensure we decode so our escape are interpreted correctly + tokenizer.chat_template = codecs.decode( + chat_template, "unicode_escape") + + logging.info( + f"Using supplied chat template:\n{tokenizer.chat_template}" + ) + elif tokenizer.chat_template is not None: + logging.info( + f"Using default chat template:\n{tokenizer.chat_template}. This May lead to unsatisfactory results. You can provide a template.jinja file for vllm." + ) + else: + logging.warning( + "No chat template provided. Chat API will not work. This May lead to unsatisfactory results. You can provide a template.jinja file for vllm.") diff --git a/models/nlp/large_language_model/qwen2-72b/vllm/README.md b/models/nlp/large_language_model/qwen2-72b/vllm/README.md new file mode 100755 index 00000000..f6baf8e4 --- /dev/null +++ b/models/nlp/large_language_model/qwen2-72b/vllm/README.md @@ -0,0 +1,43 @@ +# Qwen2-72B-Chat + +## Description + +Qwen2 is the new series of Qwen large language models. For Qwen2, we release a number of base language models and instruction-tuned language models ranging from 0.5 to 72 billion parameters, including a Mixture-of-Experts model. This repo contains the instruction-tuned 72B Qwen2 model. + +Compared with the state-of-the-art opensource language models, including the previous released Qwen1.5, Qwen2 has generally surpassed most opensource models and demonstrated competitiveness against proprietary models across a series of benchmarks targeting for language understanding, language generation, multilingual capability, coding, mathematics, reasoning, etc. + +Qwen2-72B-Instruct supports a context length of up to 131,072 tokens, enabling the processing of extensive inputs. Please refer to this section for detailed instructions on how to deploy Qwen2 for handling long texts. 
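+
+With vLLM, the context length you can actually serve is bounded by `--max-model-len` and by the GPU memory left for the KV cache after the weights are loaded, which is presumably why the inference command further below caps it at 58,000 tokens across 8 GPUs. A rough sketch of requesting a longer window (illustrative values only; whether it fits depends on your hardware):
+
+```bash
+# Illustrative only: a larger --max-model-len needs proportionally more KV-cache memory.
+python3 offline_inference.py \
+    --model ./data/qwen2/Qwen2-72B \
+    --max-tokens 256 \
+    -tp 8 \
+    --temperature 0.0 \
+    --gpu-memory-utilization 0.98 \
+    --max-model-len 131072
+```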
+ +## Setup + +### Install + +```bash +# Install libGL +## CentOS +yum install -y mesa-libGL +## Ubuntu +apt install -y libgl1-mesa-glx + +# Contact the iluvatar manager to get adapted install packages of vllm, triton, and ixformer +pip3 install vllm +pip3 install triton +pip3 install ixformer +``` + +### Download + +-Model: + +```bash +cd ${DeepSparkInference}/models/nlp/large_language_model/qwen2-72b/vllm +mkdir -p data/qwen2 +ln -s /path/to/Qwen2-72B ./data/qwen2 +``` + +## Inference + +```bash +export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 +python3 offline_inference.py --model ./data/qwen2/Qwen2-72B --max-tokens 256 -tp 8 --temperature 0.0 --gpu-memory-utilization 0.98 --max-model-len 58000 +``` diff --git a/models/nlp/large_language_model/qwen2-72b/vllm/offline_inference.py b/models/nlp/large_language_model/qwen2-72b/vllm/offline_inference.py new file mode 100644 index 00000000..81b0635e --- /dev/null +++ b/models/nlp/large_language_model/qwen2-72b/vllm/offline_inference.py @@ -0,0 +1,116 @@ +import sys +from pathlib import Path + +sys.path.append(str(Path(__file__).resolve().parent.parent.parent)) +import argparse +import dataclasses +import inspect +import logging +import time + +import torch +from utils import load_chat_template, sampling_add_cli_args +from vllm import LLM, EngineArgs, SamplingParams + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--chat_template", type=str, default=None) + parser.add_argument( + "--remove_chat_template", + default=False, + action="store_true", + help="pass this if you are not use a chat model", + ) + parser = EngineArgs.add_cli_args(parser) + parser = sampling_add_cli_args(parser) + args = parser.parse_args() + + engine_args = [attr.name for attr in dataclasses.fields(EngineArgs)] + sampling_args = [ + param.name + for param in list( + inspect.signature(SamplingParams.__init__).parameters.values() + )[1:] + ] + engine_params = {attr: getattr(args, attr) for attr in engine_args} + sampling_params = { + attr: getattr(args, attr) for attr in sampling_args if args.__contains__(attr) + } + + model_name = args.model.strip() + model_name = model_name if args.model[-1] != "/" else model_name[:-1] + model_name = model_name.rsplit("/")[-1] + + # Sample prompts. + prompts = ["哪些迹象可能表明一个人正在经历焦虑?", "描述一下如何制作芝士披萨。", "写一篇有关5G网络研发的综述文章。"] + + # Create a sampling params object. + sampling_params = SamplingParams(**sampling_params) + + # Create an LLM. + llm = LLM(**engine_params) + + # process chat template + if args.remove_chat_template: + if "chat" in model_name.lower(): + logging.warning( + f"The model name from model path is {model_name}, so we guess you are using the chat model and the additional processing is required for the input prompt. " + f"If the result is not quite correct, please ensure you do not pass --remove_chat_template in CLI." + ) + prompts_new = prompts + else: + # Build chat model promopt + logging.warning( + "If you are using a non chat model, please pass the --remove_chat_template in CLI." + ) + # Try use transformers's apply_chat_template, if chat_template is None, will use defalut template. + # For some old models, the default template may cause bad answers. we don't consider this situation, + # because the Transformers team is advancing the chat template. 
For more informatino about it, + # please refer to https://huggingface.co/docs/transformers/main/chat_templating + try: + load_chat_template(llm.get_tokenizer(), args.chat_template) + prompts_new = [] + for prompt in prompts: + messages = [{"role": "user", "content": prompt}] + text = llm.get_tokenizer().apply_chat_template( + messages, tokenize=False, add_generation_prompt=True + ) + prompts_new.append(text) + except: + logging.warning( + "use tokenizer apply_chat_template function failed, may because of low transformers version...(try use transformers>=4.34.0)" + ) + prompts_new = prompts + + # Generate texts from the prompts. The output is a list of RequestOutput objects + # that contain the prompt, generated text, and other information. + outputs = ( + llm.generate(prompts_new, sampling_params, use_tqdm=False) + if isinstance(prompts_new[0], str) + else llm.generate( + sampling_params=sampling_params, + prompt_token_ids=prompts_new, + use_tqdm=False, + ) + ) + torch.cuda.synchronize() + + start_time = time.perf_counter() + outputs = ( + llm.generate(prompts_new, sampling_params) + if isinstance(prompts_new[0], str) + else llm.generate(sampling_params=sampling_params, prompt_token_ids=prompts_new) + ) + torch.cuda.synchronize() + end_time = time.perf_counter() + duration_time = end_time - start_time + + num_tokens = 0 + # Print the outputs. + for i, output in enumerate(outputs): + prompt = prompts[i] # show the origin prompt. actully prompt is "output.prompt" + generated_text = output.outputs[0].text + + num_tokens += len(output.outputs[0].token_ids) + print(f"Prompt: {prompt}\nGenerated text: {generated_text} \n") + print(f"tokens: {num_tokens}, QPS: {num_tokens/duration_time}") diff --git a/models/nlp/large_language_model/qwen2-72b/vllm/utils.py b/models/nlp/large_language_model/qwen2-72b/vllm/utils.py new file mode 100644 index 00000000..c6def85d --- /dev/null +++ b/models/nlp/large_language_model/qwen2-72b/vllm/utils.py @@ -0,0 +1,173 @@ +# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. + +import codecs +import logging +import argparse + + +def sampling_add_cli_args(args: argparse.ArgumentParser) -> argparse.ArgumentParser: + args.add_argument( + '--n', + type=int, + default=1, + help="Number of output sequences to return for the given prompt.") + args.add_argument( + '--best-of', + type=int, + default=None, + help="Number of output sequences that are generated from the prompt. " + "From these `best_of` sequences, the top `n` sequences are returned. " + "`best_of` must be greater than or equal to `n`. This is treated as " + "the beam width when `use_beam_search` is True. By default, `best_of`" + "is set to `n`.") + args.add_argument( + '--presence-penalty', + type=float, + default=0.0, + help="Float that penalizes new tokens based on whether they " + "appear in the generated text so far. 
Values > 0 encourage the model " + "to use new tokens, while values < 0 encourage the model to repeat " + "tokens.") + args.add_argument( + '--frequency-penalty', + type=float, + default=0.0, + help="Float that penalizes new tokens based on their " + " frequency in the generated text so far. Values > 0 encourage the " + " model to use new tokens, while values < 0 encourage the model to " + "repeat tokens.") + args.add_argument( + '--repetition-penalty', + type=float, + default=1.0, + help="Float that penalizes new tokens based on whether " + "they appear in the prompt and the generated text so far. Values > 1 " + "encourage the model to use new tokens, while values < 1 encourage " + "the model to repeat tokens.") + args.add_argument( + '--temperature', + type=float, + default=1.0, + help="Float that controls the randomness of the sampling. Lower " + "values make the model more deterministic, while higher values make " + "the model more random. Zero means greedy sampling.") + args.add_argument( + '--top-p', + type=float, + default=1.0, + help="Float that controls the cumulative probability of the top tokens " + "to consider. Must be in (0, 1]. Set to 1 to consider all tokens.") + args.add_argument( + '--top-k', + type=int, + default=-1, + help="Integer that controls the number of top tokens to consider. Set " + "to -1 to consider all tokens.") + args.add_argument( + '--min-p', + type=float, + default=0.0, + help="Float that represents the minimum probability for a token to be " + "considered, relative to the probability of the most likely token. " + "Must be in [0, 1]. Set to 0 to disable this.") + args.add_argument( + '--use-beam-search', + default=False, + action="store_true", + help="Whether to use beam search instead of sampling.") + args.add_argument( + '--length-penalty', + type=float, + default=1.0, + help="Float that penalizes sequences based on their length. Used in beam search.") + args.add_argument( + '--stop', + type=str, + default=None, + help="List of strings that stop the generation when they are generated. " + "The returned output will not contain the stop strings.") + args.add_argument( + '--stop-token-ids', + type=int, + default=None, + help="List of tokens that stop the generation when they are " + "generated. The returned output will contain the stop tokens unless " + "the stop tokens are special tokens.") + args.add_argument( + '--include-stop-str-in-output', + default=False, + action="store_true", + help="Whether to include the stop strings in output text. Defaults to False.") + args.add_argument( + '--ignore-eos', + default=False, + action="store_true", + help="Whether to ignore the EOS token and continue generating tokens after the EOS token is generated.") + args.add_argument( + '--max-tokens', + type=int, + default=16, + help="Maximum number of tokens to generate per output sequence.") + args.add_argument( + '--logprobs', + type=int, + default=None, + help="NNumber of log probabilities to return per output token. " + "Note that the implementation follows the OpenAI API: The return " + "result includes the log probabilities on the `logprobs` most likely " + "tokens, as well the chosen tokens. 
The API will always return the "
+        "log probability of the sampled token, so there may be up to "
+        "`logprobs+1` elements in the response.")
+    args.add_argument(
+        '--prompt-logprobs',
+        type=int,
+        default=None,
+        help="Number of log probabilities to return per prompt token.")
+    args.add_argument(
+        '--skip-special-tokens',
+        default=True,
+        action="store_false",
+        help="Whether to skip special tokens in the output.")
+    args.add_argument(
+        '--spaces-between-special-tokens',
+        default=True,
+        action="store_false",
+        help="Whether to add spaces between special tokens in the output. Defaults to True.")
+    # early_stopping logits_processors seed
+    return args
+
+
+def load_chat_template(tokenizer, chat_template):
+    if chat_template is not None:
+        try:
+            with open(chat_template, "r") as f:
+                tokenizer.chat_template = f.read()
+        except OSError:
+            # If opening the file fails, treat chat_template as an inline template
+            # string and decode it so escape sequences are interpreted correctly.
+            tokenizer.chat_template = codecs.decode(
+                chat_template, "unicode_escape")
+
+        logging.info(
+            f"Using supplied chat template:\n{tokenizer.chat_template}"
+        )
+    elif tokenizer.chat_template is not None:
+        logging.info(
+            f"Using default chat template:\n{tokenizer.chat_template}. This may lead to unsatisfactory results. You can provide a template.jinja file for vllm."
+        )
+    else:
+        logging.warning(
+            "No chat template provided. Chat API will not work. This may lead to unsatisfactory results. You can provide a template.jinja file for vllm.")
diff --git a/models/nlp/large_language_model/qwen2-7b/vllm/README.md b/models/nlp/large_language_model/qwen2-7b/vllm/README.md
new file mode 100755
index 00000000..ebf5c00c
--- /dev/null
+++ b/models/nlp/large_language_model/qwen2-7b/vllm/README.md
@@ -0,0 +1,43 @@
+# Qwen2-7B-Instruct
+
+## Description
+
+Qwen2 is the new series of Qwen large language models. For Qwen2, we release a number of base language models and instruction-tuned language models ranging from 0.5 to 72 billion parameters, including a Mixture-of-Experts model. This repo contains the instruction-tuned 7B Qwen2 model.
+
+Compared with state-of-the-art open-source language models, including the previously released Qwen1.5, Qwen2 generally surpasses most open-source models and demonstrates competitiveness against proprietary models across a series of benchmarks targeting language understanding, language generation, multilingual capability, coding, mathematics, reasoning, etc.
+
+Qwen2-7B-Instruct supports a context length of up to 131,072 tokens, enabling the processing of extensive inputs. 
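+
+Serving very long contexts can exhaust KV-cache memory, so it is common to cap the sequence length at launch time. As an illustrative sketch only (the model path follows the `./data/qwen2` layout used later in this README; the `--max-model-len`, `--gpu-memory-utilization`, host, and port values are placeholders to adjust for your hardware), an OpenAI-compatible server could be started like this:
+
+```bash
+python3 -m vllm.entrypoints.openai.api_server \
+    --model ./data/qwen2/Qwen2-7B-Instruct \
+    --gpu-memory-utilization 0.9 \
+    --max-model-len 32768 \
+    --host 127.0.0.1 \
+    --port 12345
+```
+
+Lowering `--max-model-len` trades maximum context length for KV-cache headroom and a larger number of concurrent sequences.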
+
+## Setup
+
+### Install
+
+```bash
+# Install libGL
+## CentOS
+yum install -y mesa-libGL
+## Ubuntu
+apt install -y libgl1-mesa-glx
+
+# Contact the iluvatar manager to get adapted install packages of vllm, triton, and ixformer
+pip3 install vllm
+pip3 install triton
+pip3 install ixformer
+```
+
+### Download
+
+- Model: https://modelscope.cn/models/Qwen/Qwen2-7B-Instruct
+
+```bash
+cd ${DeepSparkInference}/models/nlp/large_language_model/qwen2-7b/vllm
+mkdir -p data/qwen2
+ln -s /path/to/Qwen2-7B-Instruct ./data/qwen2
+```
+
+## Inference
+
+```bash
+export CUDA_VISIBLE_DEVICES=0
+python3 offline_inference.py --model ./data/qwen2/Qwen2-7B-Instruct --max-tokens 256 -tp 1 --temperature 0.0
+```
diff --git a/models/nlp/large_language_model/qwen2-7b/vllm/offline_inference.py b/models/nlp/large_language_model/qwen2-7b/vllm/offline_inference.py
new file mode 100644
index 00000000..81b0635e
--- /dev/null
+++ b/models/nlp/large_language_model/qwen2-7b/vllm/offline_inference.py
@@ -0,0 +1,116 @@
+import sys
+from pathlib import Path
+
+sys.path.append(str(Path(__file__).resolve().parent.parent.parent))
+import argparse
+import dataclasses
+import inspect
+import logging
+import time
+
+import torch
+from utils import load_chat_template, sampling_add_cli_args
+from vllm import LLM, EngineArgs, SamplingParams
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--chat_template", type=str, default=None)
+    parser.add_argument(
+        "--remove_chat_template",
+        default=False,
+        action="store_true",
+        help="pass this if you are not using a chat model",
+    )
+    parser = EngineArgs.add_cli_args(parser)
+    parser = sampling_add_cli_args(parser)
+    args = parser.parse_args()
+
+    engine_args = [attr.name for attr in dataclasses.fields(EngineArgs)]
+    sampling_args = [
+        param.name
+        for param in list(
+            inspect.signature(SamplingParams.__init__).parameters.values()
+        )[1:]
+    ]
+    engine_params = {attr: getattr(args, attr) for attr in engine_args}
+    sampling_params = {
+        attr: getattr(args, attr) for attr in sampling_args if args.__contains__(attr)
+    }
+
+    model_name = args.model.strip()
+    model_name = model_name if args.model[-1] != "/" else model_name[:-1]
+    model_name = model_name.rsplit("/")[-1]
+
+    # Sample prompts.
+    prompts = ["哪些迹象可能表明一个人正在经历焦虑?", "描述一下如何制作芝士披萨。", "写一篇有关5G网络研发的综述文章。"]
+
+    # Create a sampling params object.
+    sampling_params = SamplingParams(**sampling_params)
+
+    # Create an LLM.
+    llm = LLM(**engine_params)
+
+    # process chat template
+    if args.remove_chat_template:
+        if "chat" in model_name.lower():
+            logging.warning(
+                f"The model name from the model path is {model_name}, so we guess you are using a chat model and additional processing is required for the input prompt. "
+                f"If the result is not quite correct, please ensure you do not pass --remove_chat_template in the CLI."
+            )
+        prompts_new = prompts
+    else:
+        # Build chat model prompt
+        logging.warning(
+            "If you are using a non-chat model, please pass --remove_chat_template in the CLI."
+        )
+        # Try to use transformers' apply_chat_template; if chat_template is None, the default template will be used.
+        # For some old models, the default template may cause bad answers. We don't consider this situation,
+        # because the Transformers team is advancing the chat template. 
For more information about it,
+        # please refer to https://huggingface.co/docs/transformers/main/chat_templating
+        try:
+            load_chat_template(llm.get_tokenizer(), args.chat_template)
+            prompts_new = []
+            for prompt in prompts:
+                messages = [{"role": "user", "content": prompt}]
+                text = llm.get_tokenizer().apply_chat_template(
+                    messages, tokenize=False, add_generation_prompt=True
+                )
+                prompts_new.append(text)
+        except:
+            logging.warning(
+                "tokenizer.apply_chat_template failed, possibly because of a low transformers version (try transformers>=4.34.0)"
+            )
+            prompts_new = prompts
+
+    # Generate texts from the prompts. The output is a list of RequestOutput objects
+    # that contain the prompt, generated text, and other information.
+    # The first call below is an untimed warm-up; the timed run that follows measures throughput.
+    outputs = (
+        llm.generate(prompts_new, sampling_params, use_tqdm=False)
+        if isinstance(prompts_new[0], str)
+        else llm.generate(
+            sampling_params=sampling_params,
+            prompt_token_ids=prompts_new,
+            use_tqdm=False,
+        )
+    )
+    torch.cuda.synchronize()
+
+    start_time = time.perf_counter()
+    outputs = (
+        llm.generate(prompts_new, sampling_params)
+        if isinstance(prompts_new[0], str)
+        else llm.generate(sampling_params=sampling_params, prompt_token_ids=prompts_new)
+    )
+    torch.cuda.synchronize()
+    end_time = time.perf_counter()
+    duration_time = end_time - start_time
+
+    num_tokens = 0
+    # Print the outputs.
+    for i, output in enumerate(outputs):
+        prompt = prompts[i]  # show the original prompt (the prompt actually fed to the model is output.prompt)
+        generated_text = output.outputs[0].text
+
+        num_tokens += len(output.outputs[0].token_ids)
+        print(f"Prompt: {prompt}\nGenerated text: {generated_text} \n")
+    print(f"tokens: {num_tokens}, QPS: {num_tokens/duration_time}")
diff --git a/models/nlp/large_language_model/qwen2-7b/vllm/utils.py b/models/nlp/large_language_model/qwen2-7b/vllm/utils.py
new file mode 100644
index 00000000..c6def85d
--- /dev/null
+++ b/models/nlp/large_language_model/qwen2-7b/vllm/utils.py
@@ -0,0 +1,173 @@
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+
+import codecs
+import logging
+import argparse
+
+
+def sampling_add_cli_args(args: argparse.ArgumentParser) -> argparse.ArgumentParser:
+    args.add_argument(
+        '--n',
+        type=int,
+        default=1,
+        help="Number of output sequences to return for the given prompt.")
+    args.add_argument(
+        '--best-of',
+        type=int,
+        default=None,
+        help="Number of output sequences that are generated from the prompt. "
+        "From these `best_of` sequences, the top `n` sequences are returned. "
+        "`best_of` must be greater than or equal to `n`. This is treated as "
+        "the beam width when `use_beam_search` is True. By default, `best_of` "
+        "is set to `n`.")
+    args.add_argument(
+        '--presence-penalty',
+        type=float,
+        default=0.0,
+        help="Float that penalizes new tokens based on whether they "
+        "appear in the generated text so far. 
Values > 0 encourage the model "
+        "to use new tokens, while values < 0 encourage the model to repeat "
+        "tokens.")
+    args.add_argument(
+        '--frequency-penalty',
+        type=float,
+        default=0.0,
+        help="Float that penalizes new tokens based on their "
+        "frequency in the generated text so far. Values > 0 encourage the "
+        "model to use new tokens, while values < 0 encourage the model to "
+        "repeat tokens.")
+    args.add_argument(
+        '--repetition-penalty',
+        type=float,
+        default=1.0,
+        help="Float that penalizes new tokens based on whether "
+        "they appear in the prompt and the generated text so far. Values > 1 "
+        "encourage the model to use new tokens, while values < 1 encourage "
+        "the model to repeat tokens.")
+    args.add_argument(
+        '--temperature',
+        type=float,
+        default=1.0,
+        help="Float that controls the randomness of the sampling. Lower "
+        "values make the model more deterministic, while higher values make "
+        "the model more random. Zero means greedy sampling.")
+    args.add_argument(
+        '--top-p',
+        type=float,
+        default=1.0,
+        help="Float that controls the cumulative probability of the top tokens "
+        "to consider. Must be in (0, 1]. Set to 1 to consider all tokens.")
+    args.add_argument(
+        '--top-k',
+        type=int,
+        default=-1,
+        help="Integer that controls the number of top tokens to consider. Set "
+        "to -1 to consider all tokens.")
+    args.add_argument(
+        '--min-p',
+        type=float,
+        default=0.0,
+        help="Float that represents the minimum probability for a token to be "
+        "considered, relative to the probability of the most likely token. "
+        "Must be in [0, 1]. Set to 0 to disable this.")
+    args.add_argument(
+        '--use-beam-search',
+        default=False,
+        action="store_true",
+        help="Whether to use beam search instead of sampling.")
+    args.add_argument(
+        '--length-penalty',
+        type=float,
+        default=1.0,
+        help="Float that penalizes sequences based on their length. Used in beam search.")
+    args.add_argument(
+        '--stop',
+        type=str,
+        default=None,
+        help="List of strings that stop the generation when they are generated. "
+        "The returned output will not contain the stop strings.")
+    args.add_argument(
+        '--stop-token-ids',
+        type=int,
+        default=None,
+        help="List of tokens that stop the generation when they are "
+        "generated. The returned output will contain the stop tokens unless "
+        "the stop tokens are special tokens.")
+    args.add_argument(
+        '--include-stop-str-in-output',
+        default=False,
+        action="store_true",
+        help="Whether to include the stop strings in output text. Defaults to False.")
+    args.add_argument(
+        '--ignore-eos',
+        default=False,
+        action="store_true",
+        help="Whether to ignore the EOS token and continue generating tokens after the EOS token is generated.")
+    args.add_argument(
+        '--max-tokens',
+        type=int,
+        default=16,
+        help="Maximum number of tokens to generate per output sequence.")
+    args.add_argument(
+        '--logprobs',
+        type=int,
+        default=None,
+        help="Number of log probabilities to return per output token. "
+        "Note that the implementation follows the OpenAI API: The return "
+        "result includes the log probabilities on the `logprobs` most likely "
+        "tokens, as well as the chosen tokens. 
The API will always return the "
+        "log probability of the sampled token, so there may be up to "
+        "`logprobs+1` elements in the response.")
+    args.add_argument(
+        '--prompt-logprobs',
+        type=int,
+        default=None,
+        help="Number of log probabilities to return per prompt token.")
+    args.add_argument(
+        '--skip-special-tokens',
+        default=True,
+        action="store_false",
+        help="Whether to skip special tokens in the output.")
+    args.add_argument(
+        '--spaces-between-special-tokens',
+        default=True,
+        action="store_false",
+        help="Whether to add spaces between special tokens in the output. Defaults to True.")
+    # early_stopping logits_processors seed
+    return args
+
+
+def load_chat_template(tokenizer, chat_template):
+    if chat_template is not None:
+        try:
+            with open(chat_template, "r") as f:
+                tokenizer.chat_template = f.read()
+        except OSError:
+            # If opening the file fails, treat chat_template as an inline template
+            # string and decode it so escape sequences are interpreted correctly.
+            tokenizer.chat_template = codecs.decode(
+                chat_template, "unicode_escape")
+
+        logging.info(
+            f"Using supplied chat template:\n{tokenizer.chat_template}"
+        )
+    elif tokenizer.chat_template is not None:
+        logging.info(
+            f"Using default chat template:\n{tokenizer.chat_template}. This may lead to unsatisfactory results. You can provide a template.jinja file for vllm."
+        )
+    else:
+        logging.warning(
+            "No chat template provided. Chat API will not work. This may lead to unsatisfactory results. You can provide a template.jinja file for vllm.")
-- 
Gitee

From a799e3ad40a673959bced20c1b18297ab275f025 Mon Sep 17 00:00:00 2001
From: "xiaomei.wang" 
Date: Thu, 24 Oct 2024 14:13:55 +0800
Subject: [PATCH 2/4] Add license.

---
 .../vllm/flashinfer_backend/offline_inference.py | 15 +++++++++++++++
 .../llama2-7b/vllm/offline_inference.py          | 14 ++++++++++++++
 .../qwen-7b/vllm/offline_inference.py            | 15 +++++++++++++++
 .../qwen1.5-32b/vllm/offline_inference.py        | 15 +++++++++++++++
 .../qwen2-72b/vllm/offline_inference.py          | 15 +++++++++++++++
 .../qwen2-7b/vllm/offline_inference.py           | 15 +++++++++++++++
 6 files changed, 89 insertions(+)

diff --git a/models/nlp/large_language_model/llama2-7b/vllm/flashinfer_backend/offline_inference.py b/models/nlp/large_language_model/llama2-7b/vllm/flashinfer_backend/offline_inference.py
index eb6da16d..965e23aa 100644
--- a/models/nlp/large_language_model/llama2-7b/vllm/flashinfer_backend/offline_inference.py
+++ b/models/nlp/large_language_model/llama2-7b/vllm/flashinfer_backend/offline_inference.py
@@ -1,3 +1,18 @@
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License. 
+ import sys from pathlib import Path diff --git a/models/nlp/large_language_model/llama2-7b/vllm/offline_inference.py b/models/nlp/large_language_model/llama2-7b/vllm/offline_inference.py index eb6da16d..9c0b6d2f 100644 --- a/models/nlp/large_language_model/llama2-7b/vllm/offline_inference.py +++ b/models/nlp/large_language_model/llama2-7b/vllm/offline_inference.py @@ -1,3 +1,17 @@ +# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. import sys from pathlib import Path diff --git a/models/nlp/large_language_model/qwen-7b/vllm/offline_inference.py b/models/nlp/large_language_model/qwen-7b/vllm/offline_inference.py index 5235f032..3b9e9fd8 100644 --- a/models/nlp/large_language_model/qwen-7b/vllm/offline_inference.py +++ b/models/nlp/large_language_model/qwen-7b/vllm/offline_inference.py @@ -1,3 +1,18 @@ +# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. + import sys from pathlib import Path diff --git a/models/nlp/large_language_model/qwen1.5-32b/vllm/offline_inference.py b/models/nlp/large_language_model/qwen1.5-32b/vllm/offline_inference.py index 81b0635e..5e859291 100644 --- a/models/nlp/large_language_model/qwen1.5-32b/vllm/offline_inference.py +++ b/models/nlp/large_language_model/qwen1.5-32b/vllm/offline_inference.py @@ -1,3 +1,18 @@ +# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. 
+ import sys from pathlib import Path diff --git a/models/nlp/large_language_model/qwen2-72b/vllm/offline_inference.py b/models/nlp/large_language_model/qwen2-72b/vllm/offline_inference.py index 81b0635e..5e859291 100644 --- a/models/nlp/large_language_model/qwen2-72b/vllm/offline_inference.py +++ b/models/nlp/large_language_model/qwen2-72b/vllm/offline_inference.py @@ -1,3 +1,18 @@ +# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. + import sys from pathlib import Path diff --git a/models/nlp/large_language_model/qwen2-7b/vllm/offline_inference.py b/models/nlp/large_language_model/qwen2-7b/vllm/offline_inference.py index 81b0635e..5e859291 100644 --- a/models/nlp/large_language_model/qwen2-7b/vllm/offline_inference.py +++ b/models/nlp/large_language_model/qwen2-7b/vllm/offline_inference.py @@ -1,3 +1,18 @@ +# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. 
+ import sys from pathlib import Path -- Gitee From 1dde68342187789826f096b7d650ba587fb777e0 Mon Sep 17 00:00:00 2001 From: "xiaomei.wang" Date: Thu, 24 Oct 2024 16:54:33 +0800 Subject: [PATCH 3/4] Modify README.md --- models/nlp/large_language_model/qwen-7b/vllm/README.md | 2 +- models/nlp/large_language_model/qwen1.5-32b/vllm/README.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/models/nlp/large_language_model/qwen-7b/vllm/README.md b/models/nlp/large_language_model/qwen-7b/vllm/README.md index 0eeafd3c..9dcb18f2 100644 --- a/models/nlp/large_language_model/qwen-7b/vllm/README.md +++ b/models/nlp/large_language_model/qwen-7b/vllm/README.md @@ -29,7 +29,7 @@ pip3 install ixformer ```bash cd ${DeepSparkInference}/models/nlp/large_language_model/qwen-7b/vllm -mkdir data/qwen +mkdir -p data/qwen ln -s /path/to/Qwen-7B ./data/qwen ``` diff --git a/models/nlp/large_language_model/qwen1.5-32b/vllm/README.md b/models/nlp/large_language_model/qwen1.5-32b/vllm/README.md index 5c766a76..9e484934 100755 --- a/models/nlp/large_language_model/qwen1.5-32b/vllm/README.md +++ b/models/nlp/large_language_model/qwen1.5-32b/vllm/README.md @@ -27,7 +27,7 @@ pip3 install ixformer ```bash cd ${DeepSparkInference}/models/nlp/large_language_model/qwen1.5-32b/vllm -mkdir data/qwen1.5 +mkdir -p data/qwen1.5 ln -s /path/to/Qwen1.5-32B ./data/qwen1.5 ``` -- Gitee From cb7d360950422fe66a26f76d49404899a2054f75 Mon Sep 17 00:00:00 2001 From: may Date: Tue, 5 Nov 2024 06:34:33 +0000 Subject: [PATCH 4/4] update models/nlp/large_language_model/qwen2-72b/vllm/README.md. Signed-off-by: may --- models/nlp/large_language_model/qwen2-72b/vllm/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/models/nlp/large_language_model/qwen2-72b/vllm/README.md b/models/nlp/large_language_model/qwen2-72b/vllm/README.md index f6baf8e4..f925f812 100755 --- a/models/nlp/large_language_model/qwen2-72b/vllm/README.md +++ b/models/nlp/large_language_model/qwen2-72b/vllm/README.md @@ -1,4 +1,4 @@ -# Qwen2-72B-Chat +# Qwen2-72B-Instruct ## Description @@ -39,5 +39,5 @@ ln -s /path/to/Qwen2-72B ./data/qwen2 ```bash export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 -python3 offline_inference.py --model ./data/qwen2/Qwen2-72B --max-tokens 256 -tp 8 --temperature 0.0 --gpu-memory-utilization 0.98 --max-model-len 58000 +python3 offline_inference.py --model ./data/qwen2/Qwen2-72B --max-tokens 256 -tp 8 --temperature 0.0 --gpu-memory-utilization 0.98 --max-model-len 32768 ``` -- Gitee