From 0adbdfdd7b31124705eafe5285126a2b176a1dd4 Mon Sep 17 00:00:00 2001 From: "hongliang.yuan" Date: Fri, 7 Feb 2025 11:39:35 +0800 Subject: [PATCH] sync vllm --- .../baichuan2-7b/vllm/convert2int8.py | 57 ++- .../baichuan2-7b/vllm/offline_inference.py | 212 +++++----- .../baichuan2-7b/vllm/template_baichuan.jinja | 2 +- .../chatglm3-6b-32k/vllm/offline_inference.py | 24 +- .../chatglm3-6b-32k/vllm/server_inference.py | 17 +- .../chatglm3-6b-32k/vllm/utils.py | 371 ------------------ .../chatglm3-6b/vllm/offline_inference.py | 202 +++++----- .../chatglm3-6b/vllm/server_inference.py | 47 +-- .../chatglm3-6b/vllm/utils.py | 371 ------------------ .../flashinfer_backend/offline_inference.py | 22 +- .../llama2-7b/vllm/offline_inference.py | 25 +- .../llama2-7b/vllm/utils.py | 173 -------- .../llama3-70b/vllm/offline_inference.py | 253 +++++------- .../llama3-70b/vllm/utils.py | 172 -------- .../offline_inference.py | 161 ++++---- .../qwen-7b/vllm/offline_inference.py | 26 +- .../qwen-7b/vllm/utils.py | 173 -------- .../qwen1.5-14b/vllm/offline_inference.py | 189 ++++----- .../qwen1.5-14b/vllm/utils.py | 173 -------- .../qwen1.5-32b/vllm/offline_inference.py | 26 +- .../qwen1.5-32b/vllm/utils.py | 173 -------- .../qwen1.5-72b/vllm/offline_inference.py | 189 ++++----- .../qwen1.5-72b/vllm/utils.py | 173 -------- .../offline_inference.py | 68 ++-- .../qwen1.5-7b/vllm/offline_inference.py | 189 ++++----- .../qwen1.5-7b/vllm/utils.py | 173 -------- .../qwen2-72b/vllm/offline_inference.py | 26 +- .../qwen2-72b/vllm/utils.py | 173 -------- .../qwen2-7b/vllm/offline_inference.py | 26 +- .../qwen2-7b/vllm/utils.py | 173 -------- .../stablelm/vllm/offline_inference.py | 26 +- .../stablelm/vllm/utils.py | 173 -------- .../{baichuan2-7b/vllm => }/utils.py | 163 +++++--- tests/run_vllm.py | 11 +- 34 files changed, 886 insertions(+), 3546 deletions(-) delete mode 100644 models/nlp/large_language_model/chatglm3-6b-32k/vllm/utils.py delete mode 100644 models/nlp/large_language_model/chatglm3-6b/vllm/utils.py delete mode 100644 models/nlp/large_language_model/llama2-7b/vllm/utils.py delete mode 100644 models/nlp/large_language_model/llama3-70b/vllm/utils.py delete mode 100644 models/nlp/large_language_model/qwen-7b/vllm/utils.py delete mode 100644 models/nlp/large_language_model/qwen1.5-14b/vllm/utils.py delete mode 100644 models/nlp/large_language_model/qwen1.5-32b/vllm/utils.py delete mode 100644 models/nlp/large_language_model/qwen1.5-72b/vllm/utils.py delete mode 100644 models/nlp/large_language_model/qwen1.5-7b/vllm/utils.py delete mode 100644 models/nlp/large_language_model/qwen2-72b/vllm/utils.py delete mode 100644 models/nlp/large_language_model/qwen2-7b/vllm/utils.py delete mode 100644 models/nlp/large_language_model/stablelm/vllm/utils.py rename models/nlp/large_language_model/{baichuan2-7b/vllm => }/utils.py (61%) diff --git a/models/nlp/large_language_model/baichuan2-7b/vllm/convert2int8.py b/models/nlp/large_language_model/baichuan2-7b/vllm/convert2int8.py index a2444761..7068001e 100644 --- a/models/nlp/large_language_model/baichuan2-7b/vllm/convert2int8.py +++ b/models/nlp/large_language_model/baichuan2-7b/vllm/convert2int8.py @@ -1,57 +1,47 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. 
You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. - -import torch -import os -import sys -from collections import OrderedDict import argparse import glob -import shutil import json +import os +import shutil +import sys +from collections import OrderedDict + +import torch parser = argparse.ArgumentParser() -parser.add_argument("--model-path",type=str,default=None) +parser.add_argument("--model-path", type=str, default=None) args = parser.parse_args() def float2int8(load_path, save_path): - all_files = glob.glob(os.path.join(load_path,'*')) + all_files = glob.glob(os.path.join(load_path, "*")) os.makedirs(save_path) print(f"save int8 weight to: {save_path}") for raw_file in all_files: ext_name = os.path.splitext(raw_file)[-1] - if ext_name in ['.json', '.py', '.model']: + if ext_name in [".json", ".py", ".model"]: dst_file = os.path.split(raw_file)[-1] dst_file = os.path.join(save_path, dst_file) shutil.copy(raw_file, dst_file) print(f"copy file `{raw_file}` to `{dst_file}`") elif ext_name == ".bin": print(f"quantize `{raw_file}`") - params = torch.load(raw_file,map_location="cpu") + params = torch.load(raw_file, map_location="cpu") new_params = OrderedDict() - keys = ['proj','pack'] - for k,v in params.items(): + keys = ["proj", "pack"] + for k, v in params.items(): find_key = False for key in keys: if key in k: scale = torch.abs(v).max(dim=-1)[0] / 127.0 - int8_v = torch.clamp(v / scale.view(-1,1),min=-127,max=127).to(torch.int8).contiguous() - scale = scale.view(1,-1).contiguous() + int8_v = ( + torch.clamp(v / scale.view(-1, 1), min=-127, max=127) + .to(torch.int8) + .contiguous() + ) + scale = scale.view(1, -1).contiguous() new_params[k] = int8_v - new_params[k.replace("weight","scales")] = scale + new_params[k.replace("weight", "scales")] = scale find_key = True break if find_key: @@ -59,14 +49,15 @@ def float2int8(load_path, save_path): # save the other param new_params[k] = v file_name = os.path.basename(raw_file) - file_name_no_suffix = file_name.rsplit('.',1)[0] - new_file_name = file_name_no_suffix+"_int8.bin" - torch.save(new_params,os.path.join(save_path,new_file_name)) + file_name_no_suffix = file_name.rsplit(".", 1)[0] + new_file_name = file_name_no_suffix + "_int8.bin" + torch.save(new_params, os.path.join(save_path, new_file_name)) config_file = os.path.join(save_path, "w8a16_config.json") - with open(config_file, 'w') as f: + with open(config_file, "w") as f: f.write(json.dumps({})) + if __name__ == "__main__": model_path = args.model_path save_path = os.path.join(model_path, "int8") diff --git a/models/nlp/large_language_model/baichuan2-7b/vllm/offline_inference.py b/models/nlp/large_language_model/baichuan2-7b/vllm/offline_inference.py index ecb921a6..9b7d87fd 100644 --- a/models/nlp/large_language_model/baichuan2-7b/vllm/offline_inference.py +++ b/models/nlp/large_language_model/baichuan2-7b/vllm/offline_inference.py @@ -1,117 +1,115 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. 
You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. - import sys from pathlib import Path -sys.path.append(str(Path(__file__).resolve().parent.parent.parent)) -from utils import load_chat_template,sampling_add_cli_args +import os -import logging -import time +sys.path.append(str(Path(__file__).resolve().parent.parent.parent)) import argparse import dataclasses import inspect +import logging +import time import torch -from vllm import LLM, SamplingParams, EngineArgs - - -parser = argparse.ArgumentParser() -parser.add_argument("--chat_template",type=str,default=None) -parser.add_argument("--remove_chat_template",default=False,action="store_true",help="pass this if you are not use a chat model") -parser = EngineArgs.add_cli_args(parser) -parser = sampling_add_cli_args(parser) -args = parser.parse_args() - -engine_args = [attr.name for attr in dataclasses.fields(EngineArgs)] -sampling_args = [param.name for param in list(inspect.signature(SamplingParams).parameters.values())[1:]] -engine_params = {attr:getattr(args, attr) for attr in engine_args} -sampling_params = {attr:getattr(args, attr) for attr in sampling_args if args.__contains__(attr)} - -model_name = args.model.strip() -model_name = model_name if args.model[-1]!='/' else model_name[:-1] -model_name = model_name.rsplit('/')[-1] - - -# Sample prompts. -prompts = [ - "哪些迹象可能表明一个人正在经历焦虑?", - "描述一下如何制作芝士披萨。", - "写一篇有关5G网络研发的综述文章。" - ] - -# Create a sampling params object. -sampling_params = SamplingParams(**sampling_params) - -# Create an LLM. -llm = LLM(**engine_params) - -# process chat template -if args.remove_chat_template: - if 'chat' in model_name.lower(): - logging.warning(f"The model name from model path is {model_name}, so we guess you are using the chat model and the additional processing is required for the input prompt. " - f"If the result is not quite correct, please ensure you do not pass --remove_chat_template in CLI.") - prompts_new = prompts -else: - # Build chat model promopt - logging.warning("If you are using a non chat model, please pass the --remove_chat_template in CLI.") - # Try use transformers's apply_chat_template, if chat_template is None, will use defalut template. - # For some old models, the default template may cause bad answers. we don't consider this situation, - # because the Transformers team is advancing the chat template. 
For more informatino about it, - # please refer to https://huggingface.co/docs/transformers/main/chat_templating - try: - load_chat_template(llm.get_tokenizer(),args.chat_template) - prompts_new = [] - for prompt in prompts: - messages = [ - {"role": "user", "content": prompt} - ] - text = llm.get_tokenizer().apply_chat_template( - messages, - tokenize=False, - add_generation_prompt=True +from utils import load_chat_template, sampling_add_cli_args +from vllm import LLM, EngineArgs, SamplingParams + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--chat_template", type=str, default=None) + parser.add_argument( + "--remove_chat_template", + default=False, + action="store_true", + help="pass this if you are not use a chat model", + ) + parser = EngineArgs.add_cli_args(parser) + parser = sampling_add_cli_args(parser) + args = parser.parse_args() + + engine_args = [attr.name for attr in dataclasses.fields(EngineArgs)] + sampling_args = [ + param.name + for param in list( + inspect.signature(SamplingParams).parameters.values() + ) + ] + engine_params = {attr: getattr(args, attr) for attr in engine_args} + sampling_params = { + attr: getattr(args, attr) for attr in sampling_args if args.__contains__(attr) + } + + model_name = os.path.dirname(args.model).rsplit("/")[-1] + + # Sample prompts. + prompts = ["哪些迹象可能表明一个人正在经历焦虑?", "描述一下如何制作芝士披萨。", "写一篇有关5G网络研发的综述文章。"] + + # Create a sampling params object. + sampling_params = SamplingParams(**sampling_params) + + # Create an LLM. + llm = LLM(**engine_params) + + # process chat template + if args.remove_chat_template: + if "chat" in model_name.lower(): + logging.warning( + f"The model name from model path is {model_name}, so we guess you are using the chat model and the additional processing is required for the input prompt. " + f"If the result is not quite correct, please ensure you do not pass --remove_chat_template in CLI." ) - prompts_new.append(text) - except: - logging.warning("use tokenizer apply_chat_template function failed, may because of low transformers version...(try use transformers>=4.34.0)") prompts_new = prompts - -# Generate texts from the prompts. The output is a list of RequestOutput objects -# that contain the prompt, generated text, and other information. -outputs = llm.generate(prompts_new, sampling_params,use_tqdm=False) if isinstance(prompts_new[0],str) else llm.generate(sampling_params=sampling_params,prompt_token_ids=prompts_new,use_tqdm=False) -torch.cuda.synchronize() - -start_time = time.perf_counter() -outputs = llm.generate(prompts_new, sampling_params) if isinstance(prompts_new[0],str) else llm.generate(sampling_params=sampling_params,prompt_token_ids=prompts_new) -torch.cuda.synchronize() -end_time = time.perf_counter() -duration_time = end_time - start_time - -num_tokens = 0 -# Print the outputs. -for i, output in enumerate(outputs): - prompt = prompts[i] # show the origin prompt. 
actully prompt is "output.prompt" - generated_text = output.outputs[0].text - - num_tokens += len(output.outputs[0].token_ids) - print(f"Prompt: {prompt}\nGenerated text: {generated_text} \n") -print(f"tokens: {num_tokens}, QPS: {num_tokens/duration_time}") - -metricResult = {"metricResult": {}} -metricResult["metricResult"]["tokens"] = num_tokens -metricResult["metricResult"]["QPS"] = round(num_tokens/duration_time,3) -print(metricResult) - -# 0.3.2 tokens: 757, QPS: 97.97229589080902 \ No newline at end of file + else: + # Build chat model promopt + logging.warning( + "If you are using a non chat model, please pass the --remove_chat_template in CLI." + ) + # Try use transformers's apply_chat_template, if chat_template is None, will use defalut template. + # For some old models, the default template may cause bad answers. we don't consider this situation, + # because the Transformers team is advancing the chat template. For more informatino about it, + # please refer to https://huggingface.co/docs/transformers/main/chat_templating + try: + load_chat_template(llm.get_tokenizer(), args.chat_template) + prompts_new = [] + for prompt in prompts: + messages = [{"role": "user", "content": prompt}] + text = llm.get_tokenizer().apply_chat_template( + messages, tokenize=False, add_generation_prompt=True + ) + prompts_new.append(text) + except: + logging.warning( + "use tokenizer apply_chat_template function failed, may because of low transformers version...(try use transformers>=4.34.0)" + ) + prompts_new = prompts + + # Generate texts from the prompts. The output is a list of RequestOutput objects + # that contain the prompt, generated text, and other information. + outputs = ( + llm.generate(prompts_new, sampling_params, use_tqdm=False) + if isinstance(prompts_new[0], str) + else llm.generate( + sampling_params=sampling_params, + prompt_token_ids=prompts_new, + use_tqdm=False, + ) + ) + torch.cuda.synchronize() + + start_time = time.perf_counter() + outputs = ( + llm.generate(prompts_new, sampling_params) + if isinstance(prompts_new[0], str) + else llm.generate(sampling_params=sampling_params, prompt_token_ids=prompts_new) + ) + torch.cuda.synchronize() + end_time = time.perf_counter() + duration_time = end_time - start_time + + num_tokens = 0 + # Print the outputs. + for i, output in enumerate(outputs): + prompt = prompts[i] # show the origin prompt. 
actully prompt is "output.prompt" + generated_text = output.outputs[0].text + + num_tokens += len(output.outputs[0].token_ids) + print(f"Prompt: {prompt}\nGenerated text: {generated_text} \n") + print(f"tokens: {num_tokens}, QPS: {num_tokens/duration_time}") diff --git a/models/nlp/large_language_model/baichuan2-7b/vllm/template_baichuan.jinja b/models/nlp/large_language_model/baichuan2-7b/vllm/template_baichuan.jinja index 42a8d927..e3a786c5 100644 --- a/models/nlp/large_language_model/baichuan2-7b/vllm/template_baichuan.jinja +++ b/models/nlp/large_language_model/baichuan2-7b/vllm/template_baichuan.jinja @@ -10,4 +10,4 @@ {%- if add_generation_prompt and messages[-1]['role'] != 'assistant' -%} {{- '' -}} -{% endif %} \ No newline at end of file +{% endif %} diff --git a/models/nlp/large_language_model/chatglm3-6b-32k/vllm/offline_inference.py b/models/nlp/large_language_model/chatglm3-6b-32k/vllm/offline_inference.py index 21843e49..1c2c3999 100644 --- a/models/nlp/large_language_model/chatglm3-6b-32k/vllm/offline_inference.py +++ b/models/nlp/large_language_model/chatglm3-6b-32k/vllm/offline_inference.py @@ -1,20 +1,6 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. - import sys from pathlib import Path +import os sys.path.append(str(Path(__file__).resolve().parent.parent.parent)) import argparse @@ -45,16 +31,14 @@ if __name__ == "__main__": param.name for param in list( inspect.signature(SamplingParams).parameters.values() - )[1:] + ) ] engine_params = {attr: getattr(args, attr) for attr in engine_args} sampling_params = { attr: getattr(args, attr) for attr in sampling_args if args.__contains__(attr) } - model_name = args.model.strip() - model_name = model_name if args.model[-1] != "/" else model_name[:-1] - model_name = model_name.rsplit("/")[-1] + model_name = os.path.dirname(args.model).rsplit("/")[-1] # Sample prompts. prompts = ["哪些迹象可能表明一个人正在经历焦虑?", "描述一下如何制作芝士披萨。", "写一篇有关5G网络研发的综述文章。"] @@ -124,4 +108,4 @@ if __name__ == "__main__": num_tokens += len(output.outputs[0].token_ids) print(f"Prompt: {prompt}\nGenerated text: {generated_text} \n") - print(f"tokens: {num_tokens}, QPS: {num_tokens/duration_time}") \ No newline at end of file + print(f"tokens: {num_tokens}, QPS: {num_tokens/duration_time}") diff --git a/models/nlp/large_language_model/chatglm3-6b-32k/vllm/server_inference.py b/models/nlp/large_language_model/chatglm3-6b-32k/vllm/server_inference.py index 70517a33..0b81ec08 100644 --- a/models/nlp/large_language_model/chatglm3-6b-32k/vllm/server_inference.py +++ b/models/nlp/large_language_model/chatglm3-6b-32k/vllm/server_inference.py @@ -1,18 +1,3 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. 
You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. - import argparse import time @@ -78,4 +63,4 @@ if __name__ == "__main__": prompts = [tokenizer.build_chat_input(i).input_ids.tolist() for i in prompts] for prompt in prompts: - send_request(api_url, prompt, args.output_token, args.stream) \ No newline at end of file + send_request(api_url, prompt, args.output_token, args.stream) diff --git a/models/nlp/large_language_model/chatglm3-6b-32k/vllm/utils.py b/models/nlp/large_language_model/chatglm3-6b-32k/vllm/utils.py deleted file mode 100644 index 1fc072d8..00000000 --- a/models/nlp/large_language_model/chatglm3-6b-32k/vllm/utils.py +++ /dev/null @@ -1,371 +0,0 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. - -from copy import deepcopy -from typing import Tuple, List, Union - -import codecs -import logging -import argparse - -# 对于chat模型,或者模型需要特定的输入,需要对prompt进行额外的处理。 -# 如果您在使用中有额外的prompt处理方式需求或者错误反馈,可以联系王坚或者巩亚飞,我们会对modelzoo进行更新适配。 - -def sampling_add_cli_args(args: argparse.ArgumentParser) -> argparse.ArgumentParser: - args.add_argument( - '--n', - type=int, - default=1, - help="Number of output sequences to return for the given prompt.") - args.add_argument( - '--best-of', - type=int, - default=None, - help="Number of output sequences that are generated from the prompt. " - "From these `best_of` sequences, the top `n` sequences are returned. " - "`best_of` must be greater than or equal to `n`. This is treated as " - "the beam width when `use_beam_search` is True. By default, `best_of`" - "is set to `n`.") - args.add_argument( - '--presence-penalty', - type=float, - default=0.0, - help="Float that penalizes new tokens based on whether they " - "appear in the generated text so far. Values > 0 encourage the model " - "to use new tokens, while values < 0 encourage the model to repeat " - "tokens.") - args.add_argument( - '--frequency-penalty', - type=float, - default=0.0, - help="Float that penalizes new tokens based on their " - " frequency in the generated text so far. Values > 0 encourage the " - " model to use new tokens, while values < 0 encourage the model to " - "repeat tokens.") - args.add_argument( - '--repetition-penalty', - type=float, - default=1.0, - help="Float that penalizes new tokens based on whether " - "they appear in the prompt and the generated text so far. Values > 1 " - "encourage the model to use new tokens, while values < 1 encourage " - "the model to repeat tokens.") - args.add_argument( - '--temperature', - type=float, - default=1.0, - help="Float that controls the randomness of the sampling. 
Lower " - "values make the model more deterministic, while higher values make " - "the model more random. Zero means greedy sampling.") - args.add_argument( - '--top-p', - type=float, - default=1.0, - help="Float that controls the cumulative probability of the top tokens " - "to consider. Must be in (0, 1]. Set to 1 to consider all tokens.") - args.add_argument( - '--top-k', - type=int, - default=-1, - help="Integer that controls the number of top tokens to consider. Set " - "to -1 to consider all tokens.") - args.add_argument( - '--min-p', - type=float, - default=0.0, - help="Float that represents the minimum probability for a token to be " - "considered, relative to the probability of the most likely token. " - "Must be in [0, 1]. Set to 0 to disable this.") - args.add_argument( - '--use-beam-search', - default=False, - action="store_true", - help="Whether to use beam search instead of sampling.") - args.add_argument( - '--length-penalty', - type=float, - default=1.0, - help="Float that penalizes sequences based on their length. Used in beam search.") - args.add_argument( - '--stop', - type=str, - default=None, - help="List of strings that stop the generation when they are generated. " - "The returned output will not contain the stop strings.") - args.add_argument( - '--stop-token-ids', - type=int, - default=None, - help="List of tokens that stop the generation when they are " - "generated. The returned output will contain the stop tokens unless " - "the stop tokens are special tokens.") - args.add_argument( - '--include-stop-str-in-output', - default=False, - action="store_true", - help="Whether to include the stop strings in output text. Defaults to False.") - args.add_argument( - '--ignore-eos', - default=False, - action="store_true", - help="Whether to ignore the EOS token and continue generating tokens after the EOS token is generated.") - args.add_argument( - '--max-tokens', - type=int, - default=16, - help="Maximum number of tokens to generate per output sequence.") - args.add_argument( - '--logprobs', - type=int, - default=None, - help="NNumber of log probabilities to return per output token. " - "Note that the implementation follows the OpenAI API: The return " - "result includes the log probabilities on the `logprobs` most likely " - "tokens, as well the chosen tokens. The API will always return the " - "log probability of the sampled token, so there may be up to " - "`logprobs+1` elements in the response.") - args.add_argument( - '--prompt-logprobs', - type=int, - default=None, - help="Number of log probabilities to return per prompt token.") - args.add_argument( - '--skip-special-tokens', - default=True, - action="store_false", - help="Whether to skip special tokens in the output.") - args.add_argument( - '--spaces-between-special-tokens', - default=True, - action="store_false", - help="Whether to add spaces between special tokens in the output. 
Defaults to True.") - # early_stopping logits_processors seed - return args - - -def load_chat_template(tokenizer, chat_template): - if chat_template is not None: - try: - with open(chat_template, "r") as f: - tokenizer.chat_template = f.read() - except OSError: - # If opening a file fails, set chat template to be args to - # ensure we decode so our escape are interpreted correctly - tokenizer.chat_template = codecs.decode( - chat_template, "unicode_escape") - - logging.info( - f"Using supplied chat template:\n{tokenizer.chat_template}" - ) - elif tokenizer.chat_template is not None: - logging.info( - f"Using default chat template:\n{tokenizer.chat_template}" - ) - else: - logging.warning( - "No chat template provided. Chat API will not work.") - -def default_build_chat(tokenizer,prompt): - return prompt - -def chatglm2_build_chat(tokenizer,prompt): - return tokenizer.build_prompt(prompt) - -def chatglm3_build_chat(tokenizer,prompt): - return tokenizer.build_chat_input(prompt).input_ids[0].tolist() - -def llama2_build_chat(tokenizer,prompt): - return f"[INST]{prompt}[/INST]" - -# adapt from https://huggingface.co/baichuan-inc/Baichuan2-13B-Chat/blob/main/generation_utils.py -def baichuan2_build_chat(tokenizer, prompt, max_new_tokens=512): - def _parse_messages(messages, split_role="user"): - system, rounds = "", [] - round = [] - for i, message in enumerate(messages): - if message["role"] == "system": - assert i == 0 - system = message["content"] - continue - if message["role"] == split_role and round: - rounds.append(round) - round = [] - round.append(message) - if round: - rounds.append(round) - return system, rounds - - messages = [{"role": "user", "content": f"{prompt}"}] - max_new_tokens = max_new_tokens - max_input_tokens = 4096 - max_new_tokens - system, rounds = _parse_messages(messages, split_role="user") - system_tokens = tokenizer.encode(system) - max_history_tokens = max_input_tokens - len(system_tokens) - - history_tokens = [] - for round in rounds[::-1]: - round_tokens = [] - for message in round: - if message["role"] == "user": - round_tokens.append(195) - else: - round_tokens.append(196) - round_tokens.extend(tokenizer.encode(message["content"])) - if len(history_tokens) == 0 or len(history_tokens) + len(round_tokens) <= max_history_tokens: - history_tokens = round_tokens + history_tokens # concat left - if len(history_tokens) < max_history_tokens: - continue - break - - input_tokens = system_tokens + history_tokens - if messages[-1]["role"] != "assistant": - input_tokens.append(196) - input_tokens = input_tokens[-max_input_tokens:] # truncate left - return input_tokens - -def qwen_build_chat( - tokenizer, - query: str, - history: List[Tuple[str, str]] = None, - system: str = "", - max_window_size: int = 6144, - chat_format: str = "chatml", -): - if history is None: - history = [] - - if chat_format == "chatml": - im_start, im_end = "<|im_start|>", "<|im_end|>" - im_start_tokens = [tokenizer.im_start_id] - im_end_tokens = [tokenizer.im_end_id] - nl_tokens = tokenizer.encode("\n") - - def _tokenize_str(role, content): - return f"{role}\n{content}", tokenizer.encode( - role, allowed_special=set() - ) + nl_tokens + tokenizer.encode(content, allowed_special=set()) - - system_text, system_tokens_part = _tokenize_str("system", system) - system_tokens = im_start_tokens + system_tokens_part + im_end_tokens - - raw_text = "" - context_tokens = [] - - for turn_query, turn_response in reversed(history): - query_text, query_tokens_part = _tokenize_str("user", turn_query) - query_tokens 
= im_start_tokens + query_tokens_part + im_end_tokens - response_text, response_tokens_part = _tokenize_str( - "assistant", turn_response - ) - response_tokens = im_start_tokens + response_tokens_part + im_end_tokens - - next_context_tokens = nl_tokens + query_tokens + nl_tokens + response_tokens - prev_chat = ( - f"\n{im_start}{query_text}{im_end}\n{im_start}{response_text}{im_end}" - ) - - current_context_size = ( - len(system_tokens) + len(next_context_tokens) + len(context_tokens) - ) - if current_context_size < max_window_size: - context_tokens = next_context_tokens + context_tokens - raw_text = prev_chat + raw_text - else: - break - - context_tokens = system_tokens + context_tokens - raw_text = f"{im_start}{system_text}{im_end}" + raw_text - context_tokens += ( - nl_tokens - + im_start_tokens - + _tokenize_str("user", query)[1] - + im_end_tokens - + nl_tokens - + im_start_tokens - + tokenizer.encode("assistant") - + nl_tokens - ) - raw_text += f"\n{im_start}user\n{query}{im_end}\n{im_start}assistant\n" - - elif chat_format == "raw": - raw_text = query - context_tokens = tokenizer.encode(raw_text) - else: - raise NotImplementedError(f"Unknown chat format {chat_format!r}") - - return raw_text, context_tokens - -def codellama_build_chat(tokenizer,prompt): - return "[INST] Write code to solve the following coding problem that obeys the constraints and passes the example test cases. Please wrap your code answer using ```:{}[/INST]".format(prompt) - -def build_chat(tokenizer, prompt, model_name, **kwargs): - model_name = model_name.lower() - # return str or list[int] - if "chatglm2" in model_name: - prompt = chatglm2_build_chat(tokenizer,prompt) - elif "chatglm3" in model_name: - prompt = chatglm3_build_chat(tokenizer,prompt) - elif "llama2" in model_name and 'chat' in model_name: - prompt = llama2_build_chat(tokenizer,prompt) - elif "baichuan2" in model_name and 'chat' in model_name: - prompt = baichuan2_build_chat(tokenizer,prompt, kwargs['max_length']) - elif "qwen" in model_name and 'chat' in model_name: - prompt = qwen_build_chat(tokenizer,prompt) - elif "code" in model_name and 'llama' in model_name: - prompt = codellama_build_chat(tokenizer,prompt) - else: - prompt = default_build_chat(tokenizer,prompt) - return prompt - - -# for output -def default_post_process(output): - return output - -def glm2_post_process(output): - output = output.strip() - output = output.replace("[[训练时间]]", "2023年") - return output - -def glm3_post_process(output, history=[]): - content = "" - history = deepcopy(history) - for response in output.split("<|assistant|>"): - metadata, content = response.split("\n", maxsplit=1) - if not metadata.strip(): - content = content.strip() - history.append({"role": "assistant", "metadata": metadata, "content": content}) - content = content.replace("[[训练时间]]", "2023年") - else: - history.append({"role": "assistant", "metadata": metadata, "content": content}) - if history[0]["role"] == "system" and "tools" in history[0]: - content = "\n".join(content.split("\n")[1:-1]) - def tool_call(**kwargs): - return kwargs - parameters = eval(content) - content = {"name": metadata.strip(), "parameters": parameters} - else: - content = {"name": metadata.strip(), "content": content} - return content - -def post_process(response, model_name,**kwargs): - model_name = model_name.lower() - if "chatglm2" in model_name: - response = glm2_post_process(response) - elif "chatglm3" in model_name: - response = glm3_post_process(response) - else: - response = default_post_process(response) - return 
response \ No newline at end of file diff --git a/models/nlp/large_language_model/chatglm3-6b/vllm/offline_inference.py b/models/nlp/large_language_model/chatglm3-6b/vllm/offline_inference.py index ee573921..1c2c3999 100644 --- a/models/nlp/large_language_model/chatglm3-6b/vllm/offline_inference.py +++ b/models/nlp/large_language_model/chatglm3-6b/vllm/offline_inference.py @@ -1,107 +1,111 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. - import sys from pathlib import Path -sys.path.append(str(Path(__file__).resolve().parent.parent.parent)) -from utils import load_chat_template,sampling_add_cli_args +import os -import logging -import time +sys.path.append(str(Path(__file__).resolve().parent.parent.parent)) import argparse import dataclasses import inspect +import logging +import time import torch -from vllm import LLM, SamplingParams, EngineArgs - - -parser = argparse.ArgumentParser() -parser.add_argument("--chat_template",type=str,default=None) -parser.add_argument("--remove_chat_template",default=False,action="store_true",help="pass this if you are not use a chat model") -parser = EngineArgs.add_cli_args(parser) -parser = sampling_add_cli_args(parser) -args = parser.parse_args() - -engine_args = [attr.name for attr in dataclasses.fields(EngineArgs)] -sampling_args = [param.name for param in list(inspect.signature(SamplingParams).parameters.values())[1:]] -engine_params = {attr:getattr(args, attr) for attr in engine_args} -sampling_params = {attr:getattr(args, attr) for attr in sampling_args if args.__contains__(attr)} - -model_name = args.model.strip() -model_name = model_name if args.model[-1]!='/' else model_name[:-1] -model_name = model_name.rsplit('/')[-1] - - -# Sample prompts. -prompts = [ - "哪些迹象可能表明一个人正在经历焦虑?", - "描述一下如何制作芝士披萨。", - "写一篇有关5G网络研发的综述文章。" - ] - -# Create a sampling params object. -sampling_params = SamplingParams(**sampling_params) - -# Create an LLM. -llm = LLM(**engine_params) - -# process chat template -if args.remove_chat_template: - if 'chat' in model_name.lower(): - logging.warning(f"The model name from model path is {model_name}, so we guess you are using the chat model and the additional processing is required for the input prompt. " - f"If the result is not quite correct, please ensure you do not pass --remove_chat_template in CLI.") - prompts_new = prompts -else: - # Build chat model promopt - logging.warning("If you are using a non chat model, please pass the --remove_chat_template in CLI.") - logging.warning("For now, openai api chat interface(v1/chat/completions) need you provide a chat template to process prompt(str) for better results. " - "Otherwise, you have to use the default chat template, which may lead to bad answers. But, the process of building chat input is complex " - "for some models and the rule of process can not be written as a jinja file. Fortunately, the v1/completions interface support List[int] " - "params. 
This means you can process the prompt firstly, then send the List[int] to v1/completions and consider it as v1/chat/completions " - "to use when you use openai api.") - tokenizer = llm.get_tokenizer() - prompts_new = [] - for prompt in prompts: - input_idx = tokenizer.build_chat_input(prompt)['input_ids'][0].cpu().tolist() - prompts_new.append(input_idx) - -# Generate texts from the prompts. The output is a list of RequestOutput objects -# that contain the prompt, generated text, and other information. -outputs = llm.generate(prompts_new, sampling_params,use_tqdm=False) if isinstance(prompts_new[0],str) else llm.generate(sampling_params=sampling_params,prompt_token_ids=prompts_new,use_tqdm=False) -torch.cuda.synchronize() - -start_time = time.perf_counter() -outputs = llm.generate(prompts_new, sampling_params) if isinstance(prompts_new[0],str) else llm.generate(sampling_params=sampling_params,prompt_token_ids=prompts_new) -torch.cuda.synchronize() -end_time = time.perf_counter() -duration_time = end_time - start_time - -num_tokens = 0 -# Print the outputs. -for i, output in enumerate(outputs): - prompt = prompts[i] # show the origin prompt. actully prompt is "output.prompt" - generated_text = output.outputs[0].text - - num_tokens += len(output.outputs[0].token_ids) - print(f"Prompt: {prompt}\nGenerated text: {generated_text} \n") -print(f"tokens: {num_tokens}, QPS: {num_tokens/duration_time}") - -metricResult = {"metricResult": {}} -metricResult["metricResult"]["tokens"] = num_tokens -metricResult["metricResult"]["QPS"] = round(num_tokens/duration_time,3) -print(metricResult) - -# 0.3.2 tokens: 422, QPS: 70.02308283048338(tokens: 422, QPS: 93.67210003677407),32-k 模型 tokens: 477, QPS: 81.46537314533865(tokens: 477, QPS: 106.54247895449554) \ No newline at end of file +from utils import load_chat_template, sampling_add_cli_args +from vllm import LLM, EngineArgs, SamplingParams + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--chat_template", type=str, default=None) + parser.add_argument( + "--remove_chat_template", + default=False, + action="store_true", + help="pass this if you are not use a chat model", + ) + parser = EngineArgs.add_cli_args(parser) + parser = sampling_add_cli_args(parser) + args = parser.parse_args() + + engine_args = [attr.name for attr in dataclasses.fields(EngineArgs)] + sampling_args = [ + param.name + for param in list( + inspect.signature(SamplingParams).parameters.values() + ) + ] + engine_params = {attr: getattr(args, attr) for attr in engine_args} + sampling_params = { + attr: getattr(args, attr) for attr in sampling_args if args.__contains__(attr) + } + + model_name = os.path.dirname(args.model).rsplit("/")[-1] + + # Sample prompts. + prompts = ["哪些迹象可能表明一个人正在经历焦虑?", "描述一下如何制作芝士披萨。", "写一篇有关5G网络研发的综述文章。"] + + # Create a sampling params object. + sampling_params = SamplingParams(**sampling_params) + + # Create an LLM. + llm = LLM(**engine_params) + + # process chat template + if args.remove_chat_template: + if "chat" in model_name.lower(): + logging.warning( + f"The model name from model path is {model_name}, so we guess you are using the chat model and the additional processing is required for the input prompt. " + f"If the result is not quite correct, please ensure you do not pass --remove_chat_template in CLI." + ) + prompts_new = prompts + else: + # Build chat model promopt + logging.warning( + "If you are using a non chat model, please pass the --remove_chat_template in CLI." 
+ ) + logging.warning( + "For now, openai api chat interface(v1/chat/completions) need you provide a chat template to process prompt(str) for better results. " + "Otherwise, you have to use the default chat template, which may lead to bad answers. But, the process of building chat input is complex " + "for some models and the rule of process can not be written as a jinja file. Fortunately, the v1/completions interface support List[int] " + "params. This means you can process the prompt firstly, then send the List[int] to v1/completions and consider it as v1/chat/completions " + "to use when you use openai api." + ) + tokenizer = llm.get_tokenizer() + prompts_new = [] + for prompt in prompts: + input_idx = ( + tokenizer.build_chat_input(prompt)["input_ids"][0].cpu().tolist() + ) + prompts_new.append(input_idx) + + # Generate texts from the prompts. The output is a list of RequestOutput objects + # that contain the prompt, generated text, and other information. + outputs = ( + llm.generate(prompts_new, sampling_params, use_tqdm=False) + if isinstance(prompts_new[0], str) + else llm.generate( + sampling_params=sampling_params, + prompt_token_ids=prompts_new, + use_tqdm=False, + ) + ) + torch.cuda.synchronize() + + start_time = time.perf_counter() + outputs = ( + llm.generate(prompts_new, sampling_params) + if isinstance(prompts_new[0], str) + else llm.generate(sampling_params=sampling_params, prompt_token_ids=prompts_new) + ) + torch.cuda.synchronize() + end_time = time.perf_counter() + duration_time = end_time - start_time + + num_tokens = 0 + # Print the outputs. + for i, output in enumerate(outputs): + prompt = prompts[i] # show the origin prompt. actully prompt is "output.prompt" + generated_text = output.outputs[0].text + + num_tokens += len(output.outputs[0].token_ids) + print(f"Prompt: {prompt}\nGenerated text: {generated_text} \n") + print(f"tokens: {num_tokens}, QPS: {num_tokens/duration_time}") diff --git a/models/nlp/large_language_model/chatglm3-6b/vllm/server_inference.py b/models/nlp/large_language_model/chatglm3-6b/vllm/server_inference.py index e60b6f9c..0b81ec08 100644 --- a/models/nlp/large_language_model/chatglm3-6b/vllm/server_inference.py +++ b/models/nlp/large_language_model/chatglm3-6b/vllm/server_inference.py @@ -1,20 +1,6 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. 
- import argparse import time + from openai import OpenAI from transformers import AutoTokenizer @@ -26,24 +12,24 @@ def send_request( stream: bool, ) -> None: client = OpenAI( - # defaults to os.environ.get("OPENAI_API_KEY") + # defaults to os.environ.get("OPENAI_API_KEY") api_key="EMPTY", base_url=api_url, ) models = client.models.list() model = models.data[0].id - + completion = client.completions.create( model=model, # messages=[{"role": "user", "content": prompt},], prompt=prompt, n=1, stream=stream, - max_tokens=output_len, - temperature=0.0 + max_tokens=output_len, + temperature=0.0, ) - + if stream: for each_com in completion: print(each_com) @@ -54,7 +40,8 @@ def send_request( if __name__ == "__main__": parser = argparse.ArgumentParser( - description="Benchmark the online serving throughput.") + description="Benchmark the online serving throughput." + ) parser.add_argument("--host", type=str, default="localhost") parser.add_argument("--port", type=int, default=8000) parser.add_argument("--stream", action="store_true") @@ -63,17 +50,17 @@ if __name__ == "__main__": args = parser.parse_args() api_url = f"http://{args.host}:{args.port}/v1" - + prompts = [ - "你好", - "Which city is the capital of China?", - "1 + 1 = ?", - "中国的首都是哪里", - "请讲以下内容翻译为英文:\n你好,我来自中国。", - ] - + "你好", + "Which city is the capital of China?", + "1 + 1 = ?", + "中国的首都是哪里", + "请讲以下内容翻译为英文:\n你好,我来自中国。", + ] + tokenizer = AutoTokenizer.from_pretrained(args.model_path, trust_remote_code=True) prompts = [tokenizer.build_chat_input(i).input_ids.tolist() for i in prompts] for prompt in prompts: - send_request(api_url,prompt,args.output_token,args.stream) + send_request(api_url, prompt, args.output_token, args.stream) diff --git a/models/nlp/large_language_model/chatglm3-6b/vllm/utils.py b/models/nlp/large_language_model/chatglm3-6b/vllm/utils.py deleted file mode 100644 index 1fc072d8..00000000 --- a/models/nlp/large_language_model/chatglm3-6b/vllm/utils.py +++ /dev/null @@ -1,371 +0,0 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. - -from copy import deepcopy -from typing import Tuple, List, Union - -import codecs -import logging -import argparse - -# 对于chat模型,或者模型需要特定的输入,需要对prompt进行额外的处理。 -# 如果您在使用中有额外的prompt处理方式需求或者错误反馈,可以联系王坚或者巩亚飞,我们会对modelzoo进行更新适配。 - -def sampling_add_cli_args(args: argparse.ArgumentParser) -> argparse.ArgumentParser: - args.add_argument( - '--n', - type=int, - default=1, - help="Number of output sequences to return for the given prompt.") - args.add_argument( - '--best-of', - type=int, - default=None, - help="Number of output sequences that are generated from the prompt. " - "From these `best_of` sequences, the top `n` sequences are returned. " - "`best_of` must be greater than or equal to `n`. This is treated as " - "the beam width when `use_beam_search` is True. 
By default, `best_of`" - "is set to `n`.") - args.add_argument( - '--presence-penalty', - type=float, - default=0.0, - help="Float that penalizes new tokens based on whether they " - "appear in the generated text so far. Values > 0 encourage the model " - "to use new tokens, while values < 0 encourage the model to repeat " - "tokens.") - args.add_argument( - '--frequency-penalty', - type=float, - default=0.0, - help="Float that penalizes new tokens based on their " - " frequency in the generated text so far. Values > 0 encourage the " - " model to use new tokens, while values < 0 encourage the model to " - "repeat tokens.") - args.add_argument( - '--repetition-penalty', - type=float, - default=1.0, - help="Float that penalizes new tokens based on whether " - "they appear in the prompt and the generated text so far. Values > 1 " - "encourage the model to use new tokens, while values < 1 encourage " - "the model to repeat tokens.") - args.add_argument( - '--temperature', - type=float, - default=1.0, - help="Float that controls the randomness of the sampling. Lower " - "values make the model more deterministic, while higher values make " - "the model more random. Zero means greedy sampling.") - args.add_argument( - '--top-p', - type=float, - default=1.0, - help="Float that controls the cumulative probability of the top tokens " - "to consider. Must be in (0, 1]. Set to 1 to consider all tokens.") - args.add_argument( - '--top-k', - type=int, - default=-1, - help="Integer that controls the number of top tokens to consider. Set " - "to -1 to consider all tokens.") - args.add_argument( - '--min-p', - type=float, - default=0.0, - help="Float that represents the minimum probability for a token to be " - "considered, relative to the probability of the most likely token. " - "Must be in [0, 1]. Set to 0 to disable this.") - args.add_argument( - '--use-beam-search', - default=False, - action="store_true", - help="Whether to use beam search instead of sampling.") - args.add_argument( - '--length-penalty', - type=float, - default=1.0, - help="Float that penalizes sequences based on their length. Used in beam search.") - args.add_argument( - '--stop', - type=str, - default=None, - help="List of strings that stop the generation when they are generated. " - "The returned output will not contain the stop strings.") - args.add_argument( - '--stop-token-ids', - type=int, - default=None, - help="List of tokens that stop the generation when they are " - "generated. The returned output will contain the stop tokens unless " - "the stop tokens are special tokens.") - args.add_argument( - '--include-stop-str-in-output', - default=False, - action="store_true", - help="Whether to include the stop strings in output text. Defaults to False.") - args.add_argument( - '--ignore-eos', - default=False, - action="store_true", - help="Whether to ignore the EOS token and continue generating tokens after the EOS token is generated.") - args.add_argument( - '--max-tokens', - type=int, - default=16, - help="Maximum number of tokens to generate per output sequence.") - args.add_argument( - '--logprobs', - type=int, - default=None, - help="NNumber of log probabilities to return per output token. " - "Note that the implementation follows the OpenAI API: The return " - "result includes the log probabilities on the `logprobs` most likely " - "tokens, as well the chosen tokens. 
The API will always return the " - "log probability of the sampled token, so there may be up to " - "`logprobs+1` elements in the response.") - args.add_argument( - '--prompt-logprobs', - type=int, - default=None, - help="Number of log probabilities to return per prompt token.") - args.add_argument( - '--skip-special-tokens', - default=True, - action="store_false", - help="Whether to skip special tokens in the output.") - args.add_argument( - '--spaces-between-special-tokens', - default=True, - action="store_false", - help="Whether to add spaces between special tokens in the output. Defaults to True.") - # early_stopping logits_processors seed - return args - - -def load_chat_template(tokenizer, chat_template): - if chat_template is not None: - try: - with open(chat_template, "r") as f: - tokenizer.chat_template = f.read() - except OSError: - # If opening a file fails, set chat template to be args to - # ensure we decode so our escape are interpreted correctly - tokenizer.chat_template = codecs.decode( - chat_template, "unicode_escape") - - logging.info( - f"Using supplied chat template:\n{tokenizer.chat_template}" - ) - elif tokenizer.chat_template is not None: - logging.info( - f"Using default chat template:\n{tokenizer.chat_template}" - ) - else: - logging.warning( - "No chat template provided. Chat API will not work.") - -def default_build_chat(tokenizer,prompt): - return prompt - -def chatglm2_build_chat(tokenizer,prompt): - return tokenizer.build_prompt(prompt) - -def chatglm3_build_chat(tokenizer,prompt): - return tokenizer.build_chat_input(prompt).input_ids[0].tolist() - -def llama2_build_chat(tokenizer,prompt): - return f"[INST]{prompt}[/INST]" - -# adapt from https://huggingface.co/baichuan-inc/Baichuan2-13B-Chat/blob/main/generation_utils.py -def baichuan2_build_chat(tokenizer, prompt, max_new_tokens=512): - def _parse_messages(messages, split_role="user"): - system, rounds = "", [] - round = [] - for i, message in enumerate(messages): - if message["role"] == "system": - assert i == 0 - system = message["content"] - continue - if message["role"] == split_role and round: - rounds.append(round) - round = [] - round.append(message) - if round: - rounds.append(round) - return system, rounds - - messages = [{"role": "user", "content": f"{prompt}"}] - max_new_tokens = max_new_tokens - max_input_tokens = 4096 - max_new_tokens - system, rounds = _parse_messages(messages, split_role="user") - system_tokens = tokenizer.encode(system) - max_history_tokens = max_input_tokens - len(system_tokens) - - history_tokens = [] - for round in rounds[::-1]: - round_tokens = [] - for message in round: - if message["role"] == "user": - round_tokens.append(195) - else: - round_tokens.append(196) - round_tokens.extend(tokenizer.encode(message["content"])) - if len(history_tokens) == 0 or len(history_tokens) + len(round_tokens) <= max_history_tokens: - history_tokens = round_tokens + history_tokens # concat left - if len(history_tokens) < max_history_tokens: - continue - break - - input_tokens = system_tokens + history_tokens - if messages[-1]["role"] != "assistant": - input_tokens.append(196) - input_tokens = input_tokens[-max_input_tokens:] # truncate left - return input_tokens - -def qwen_build_chat( - tokenizer, - query: str, - history: List[Tuple[str, str]] = None, - system: str = "", - max_window_size: int = 6144, - chat_format: str = "chatml", -): - if history is None: - history = [] - - if chat_format == "chatml": - im_start, im_end = "<|im_start|>", "<|im_end|>" - im_start_tokens = 
[tokenizer.im_start_id] - im_end_tokens = [tokenizer.im_end_id] - nl_tokens = tokenizer.encode("\n") - - def _tokenize_str(role, content): - return f"{role}\n{content}", tokenizer.encode( - role, allowed_special=set() - ) + nl_tokens + tokenizer.encode(content, allowed_special=set()) - - system_text, system_tokens_part = _tokenize_str("system", system) - system_tokens = im_start_tokens + system_tokens_part + im_end_tokens - - raw_text = "" - context_tokens = [] - - for turn_query, turn_response in reversed(history): - query_text, query_tokens_part = _tokenize_str("user", turn_query) - query_tokens = im_start_tokens + query_tokens_part + im_end_tokens - response_text, response_tokens_part = _tokenize_str( - "assistant", turn_response - ) - response_tokens = im_start_tokens + response_tokens_part + im_end_tokens - - next_context_tokens = nl_tokens + query_tokens + nl_tokens + response_tokens - prev_chat = ( - f"\n{im_start}{query_text}{im_end}\n{im_start}{response_text}{im_end}" - ) - - current_context_size = ( - len(system_tokens) + len(next_context_tokens) + len(context_tokens) - ) - if current_context_size < max_window_size: - context_tokens = next_context_tokens + context_tokens - raw_text = prev_chat + raw_text - else: - break - - context_tokens = system_tokens + context_tokens - raw_text = f"{im_start}{system_text}{im_end}" + raw_text - context_tokens += ( - nl_tokens - + im_start_tokens - + _tokenize_str("user", query)[1] - + im_end_tokens - + nl_tokens - + im_start_tokens - + tokenizer.encode("assistant") - + nl_tokens - ) - raw_text += f"\n{im_start}user\n{query}{im_end}\n{im_start}assistant\n" - - elif chat_format == "raw": - raw_text = query - context_tokens = tokenizer.encode(raw_text) - else: - raise NotImplementedError(f"Unknown chat format {chat_format!r}") - - return raw_text, context_tokens - -def codellama_build_chat(tokenizer,prompt): - return "[INST] Write code to solve the following coding problem that obeys the constraints and passes the example test cases. 
Please wrap your code answer using ```:{}[/INST]".format(prompt) - -def build_chat(tokenizer, prompt, model_name, **kwargs): - model_name = model_name.lower() - # return str or list[int] - if "chatglm2" in model_name: - prompt = chatglm2_build_chat(tokenizer,prompt) - elif "chatglm3" in model_name: - prompt = chatglm3_build_chat(tokenizer,prompt) - elif "llama2" in model_name and 'chat' in model_name: - prompt = llama2_build_chat(tokenizer,prompt) - elif "baichuan2" in model_name and 'chat' in model_name: - prompt = baichuan2_build_chat(tokenizer,prompt, kwargs['max_length']) - elif "qwen" in model_name and 'chat' in model_name: - prompt = qwen_build_chat(tokenizer,prompt) - elif "code" in model_name and 'llama' in model_name: - prompt = codellama_build_chat(tokenizer,prompt) - else: - prompt = default_build_chat(tokenizer,prompt) - return prompt - - -# for output -def default_post_process(output): - return output - -def glm2_post_process(output): - output = output.strip() - output = output.replace("[[训练时间]]", "2023年") - return output - -def glm3_post_process(output, history=[]): - content = "" - history = deepcopy(history) - for response in output.split("<|assistant|>"): - metadata, content = response.split("\n", maxsplit=1) - if not metadata.strip(): - content = content.strip() - history.append({"role": "assistant", "metadata": metadata, "content": content}) - content = content.replace("[[训练时间]]", "2023年") - else: - history.append({"role": "assistant", "metadata": metadata, "content": content}) - if history[0]["role"] == "system" and "tools" in history[0]: - content = "\n".join(content.split("\n")[1:-1]) - def tool_call(**kwargs): - return kwargs - parameters = eval(content) - content = {"name": metadata.strip(), "parameters": parameters} - else: - content = {"name": metadata.strip(), "content": content} - return content - -def post_process(response, model_name,**kwargs): - model_name = model_name.lower() - if "chatglm2" in model_name: - response = glm2_post_process(response) - elif "chatglm3" in model_name: - response = glm3_post_process(response) - else: - response = default_post_process(response) - return response \ No newline at end of file diff --git a/models/nlp/large_language_model/llama2-7b/vllm/flashinfer_backend/offline_inference.py b/models/nlp/large_language_model/llama2-7b/vllm/flashinfer_backend/offline_inference.py index baa03ef8..0020834e 100644 --- a/models/nlp/large_language_model/llama2-7b/vllm/flashinfer_backend/offline_inference.py +++ b/models/nlp/large_language_model/llama2-7b/vllm/flashinfer_backend/offline_inference.py @@ -1,20 +1,6 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. 
- import sys from pathlib import Path +import os sys.path.append(str(Path(__file__).resolve().parent.parent.parent)) import argparse @@ -44,7 +30,7 @@ if __name__ == "__main__": sampling_args = [ param.name for param in list( - inspect.signature(SamplingParams).parameters.values() + inspect.signature(SamplingParams.__init__).parameters.values() )[1:] ] engine_params = {attr: getattr(args, attr) for attr in engine_args} @@ -52,9 +38,7 @@ if __name__ == "__main__": attr: getattr(args, attr) for attr in sampling_args if args.__contains__(attr) } - model_name = args.model.strip() - model_name = model_name if args.model[-1] != "/" else model_name[:-1] - model_name = model_name.rsplit("/")[-1] + model_name = os.path.dirname(args.model).rsplit("/")[-1] # Sample prompts. prompts = [ diff --git a/models/nlp/large_language_model/llama2-7b/vllm/offline_inference.py b/models/nlp/large_language_model/llama2-7b/vllm/offline_inference.py index 35f1e119..30a1b43d 100644 --- a/models/nlp/large_language_model/llama2-7b/vllm/offline_inference.py +++ b/models/nlp/large_language_model/llama2-7b/vllm/offline_inference.py @@ -1,19 +1,6 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. import sys from pathlib import Path +import os sys.path.append(str(Path(__file__).resolve().parent.parent.parent)) import argparse @@ -44,16 +31,14 @@ if __name__ == "__main__": param.name for param in list( inspect.signature(SamplingParams).parameters.values() - )[1:] + ) ] engine_params = {attr: getattr(args, attr) for attr in engine_args} sampling_params = { attr: getattr(args, attr) for attr in sampling_args if args.__contains__(attr) } - model_name = args.model.strip() - model_name = model_name if args.model[-1] != "/" else model_name[:-1] - model_name = model_name.rsplit("/")[-1] + model_name = os.path.dirname(args.model).rsplit("/")[-1] # Sample prompts. prompts = [ @@ -129,7 +114,3 @@ if __name__ == "__main__": num_tokens += len(output.outputs[0].token_ids) print(f"Prompt: {prompt}\nGenerated text: {generated_text} \n") print(f"tokens: {num_tokens}, QPS: {num_tokens/duration_time}") - metricResult = {"metricResult": {}} - metricResult["metricResult"]["tokens"] = num_tokens - metricResult["metricResult"]["QPS"] = round(num_tokens/duration_time,3) - print(metricResult) diff --git a/models/nlp/large_language_model/llama2-7b/vllm/utils.py b/models/nlp/large_language_model/llama2-7b/vllm/utils.py deleted file mode 100644 index c6def85d..00000000 --- a/models/nlp/large_language_model/llama2-7b/vllm/utils.py +++ /dev/null @@ -1,173 +0,0 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. 
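The new `model_name = os.path.dirname(args.model).rsplit("/")[-1]` line used throughout this patch is not equivalent to the old strip-the-trailing-slash logic: with a trailing slash it yields the model directory's own name, without one it yields the parent directory's name. A small illustration with hypothetical paths:

import os

# Hypothetical paths, for illustration only.
for model_arg in ("/data/nlp/llama2-7b-chat/", "/data/nlp/llama2-7b-chat"):
    print(model_arg, "->", os.path.dirname(model_arg).rsplit("/")[-1])
# /data/nlp/llama2-7b-chat/ -> llama2-7b-chat
# /data/nlp/llama2-7b-chat  -> nlp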
You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. - -import codecs -import logging -import argparse - - -def sampling_add_cli_args(args: argparse.ArgumentParser) -> argparse.ArgumentParser: - args.add_argument( - '--n', - type=int, - default=1, - help="Number of output sequences to return for the given prompt.") - args.add_argument( - '--best-of', - type=int, - default=None, - help="Number of output sequences that are generated from the prompt. " - "From these `best_of` sequences, the top `n` sequences are returned. " - "`best_of` must be greater than or equal to `n`. This is treated as " - "the beam width when `use_beam_search` is True. By default, `best_of`" - "is set to `n`.") - args.add_argument( - '--presence-penalty', - type=float, - default=0.0, - help="Float that penalizes new tokens based on whether they " - "appear in the generated text so far. Values > 0 encourage the model " - "to use new tokens, while values < 0 encourage the model to repeat " - "tokens.") - args.add_argument( - '--frequency-penalty', - type=float, - default=0.0, - help="Float that penalizes new tokens based on their " - " frequency in the generated text so far. Values > 0 encourage the " - " model to use new tokens, while values < 0 encourage the model to " - "repeat tokens.") - args.add_argument( - '--repetition-penalty', - type=float, - default=1.0, - help="Float that penalizes new tokens based on whether " - "they appear in the prompt and the generated text so far. Values > 1 " - "encourage the model to use new tokens, while values < 1 encourage " - "the model to repeat tokens.") - args.add_argument( - '--temperature', - type=float, - default=1.0, - help="Float that controls the randomness of the sampling. Lower " - "values make the model more deterministic, while higher values make " - "the model more random. Zero means greedy sampling.") - args.add_argument( - '--top-p', - type=float, - default=1.0, - help="Float that controls the cumulative probability of the top tokens " - "to consider. Must be in (0, 1]. Set to 1 to consider all tokens.") - args.add_argument( - '--top-k', - type=int, - default=-1, - help="Integer that controls the number of top tokens to consider. Set " - "to -1 to consider all tokens.") - args.add_argument( - '--min-p', - type=float, - default=0.0, - help="Float that represents the minimum probability for a token to be " - "considered, relative to the probability of the most likely token. " - "Must be in [0, 1]. Set to 0 to disable this.") - args.add_argument( - '--use-beam-search', - default=False, - action="store_true", - help="Whether to use beam search instead of sampling.") - args.add_argument( - '--length-penalty', - type=float, - default=1.0, - help="Float that penalizes sequences based on their length. Used in beam search.") - args.add_argument( - '--stop', - type=str, - default=None, - help="List of strings that stop the generation when they are generated. " - "The returned output will not contain the stop strings.") - args.add_argument( - '--stop-token-ids', - type=int, - default=None, - help="List of tokens that stop the generation when they are " - "generated. 
The returned output will contain the stop tokens unless " - "the stop tokens are special tokens.") - args.add_argument( - '--include-stop-str-in-output', - default=False, - action="store_true", - help="Whether to include the stop strings in output text. Defaults to False.") - args.add_argument( - '--ignore-eos', - default=False, - action="store_true", - help="Whether to ignore the EOS token and continue generating tokens after the EOS token is generated.") - args.add_argument( - '--max-tokens', - type=int, - default=16, - help="Maximum number of tokens to generate per output sequence.") - args.add_argument( - '--logprobs', - type=int, - default=None, - help="NNumber of log probabilities to return per output token. " - "Note that the implementation follows the OpenAI API: The return " - "result includes the log probabilities on the `logprobs` most likely " - "tokens, as well the chosen tokens. The API will always return the " - "log probability of the sampled token, so there may be up to " - "`logprobs+1` elements in the response.") - args.add_argument( - '--prompt-logprobs', - type=int, - default=None, - help="Number of log probabilities to return per prompt token.") - args.add_argument( - '--skip-special-tokens', - default=True, - action="store_false", - help="Whether to skip special tokens in the output.") - args.add_argument( - '--spaces-between-special-tokens', - default=True, - action="store_false", - help="Whether to add spaces between special tokens in the output. Defaults to True.") - # early_stopping logits_processors seed - return args - - -def load_chat_template(tokenizer, chat_template): - if chat_template is not None: - try: - with open(chat_template, "r") as f: - tokenizer.chat_template = f.read() - except OSError: - # If opening a file fails, set chat template to be args to - # ensure we decode so our escape are interpreted correctly - tokenizer.chat_template = codecs.decode( - chat_template, "unicode_escape") - - logging.info( - f"Using supplied chat template:\n{tokenizer.chat_template}" - ) - elif tokenizer.chat_template is not None: - logging.info( - f"Using default chat template:\n{tokenizer.chat_template}. This May lead to unsatisfactory results. You can provide a template.jinja file for vllm." - ) - else: - logging.warning( - "No chat template provided. Chat API will not work. This May lead to unsatisfactory results. You can provide a template.jinja file for vllm.") diff --git a/models/nlp/large_language_model/llama3-70b/vllm/offline_inference.py b/models/nlp/large_language_model/llama3-70b/vllm/offline_inference.py index 593837fe..30a1b43d 100644 --- a/models/nlp/large_language_model/llama3-70b/vllm/offline_inference.py +++ b/models/nlp/large_language_model/llama3-70b/vllm/offline_inference.py @@ -1,163 +1,116 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. 
import sys from pathlib import Path -sys.path.append(str(Path(__file__).resolve().parent.parent.parent)) -from utils import load_chat_template,sampling_add_cli_args +import os -import logging -import time +sys.path.append(str(Path(__file__).resolve().parent.parent.parent)) import argparse import dataclasses import inspect +import logging +import time import torch -from vllm import LLM, SamplingParams, EngineArgs - - -parser = argparse.ArgumentParser() -parser.add_argument("--qps_test", default=False, action='store_true', help="for test only!!") -parser.add_argument("--acc_test", default=False, action='store_true', help="for test only!!") -parser.add_argument("--qps_threshold", type=float, default=15., help="for test only!!") -parser.add_argument("--acc_threshold", type=float, default=0.95, help="for test only!!") -parser.add_argument("--chat_template",type=str,default=None) -parser.add_argument("--remove_chat_template",default=False,action="store_true",help="pass this if you are not use a chat model") -parser = EngineArgs.add_cli_args(parser) -parser = sampling_add_cli_args(parser) -args = parser.parse_args() - -engine_args = [attr.name for attr in dataclasses.fields(EngineArgs)] -sampling_args = [param.name for param in list(inspect.signature(SamplingParams).parameters.values())[1:]] -engine_params = {attr:getattr(args, attr) for attr in engine_args} -sampling_params = {attr:getattr(args, attr) for attr in sampling_args if args.__contains__(attr)} - -model_name = args.model.strip() -model_name = model_name if args.model[-1]!='/' else model_name[:-1] -model_name = model_name.rsplit('/')[-1] - - -# Sample prompts. -if not args.qps_test and not args.acc_test: +from utils import load_chat_template, sampling_add_cli_args +from vllm import LLM, EngineArgs, SamplingParams + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--chat_template", type=str, default=None) + parser.add_argument( + "--remove_chat_template", + default=False, + action="store_true", + help="pass this if you are not use a chat model", + ) + parser = EngineArgs.add_cli_args(parser) + parser = sampling_add_cli_args(parser) + args = parser.parse_args() + + engine_args = [attr.name for attr in dataclasses.fields(EngineArgs)] + sampling_args = [ + param.name + for param in list( + inspect.signature(SamplingParams).parameters.values() + ) + ] + engine_params = {attr: getattr(args, attr) for attr in engine_args} + sampling_params = { + attr: getattr(args, attr) for attr in sampling_args if args.__contains__(attr) + } + + model_name = os.path.dirname(args.model).rsplit("/")[-1] + + # Sample prompts. prompts = [ - "Shanghai is one of the most prosperous cities in China, with a GDP of over $300 billion. Shanghai has the fastest growing economy in China and is the second busiest port in the world. In addition to being a hub for business, Shanghai is also a major tourist destination. It is known for its diverse culture and many historical sites.\nThe city of Shanghai is located on the coast of the Pacific Ocean in east-central China. It is bordered by Jiangsu Province to the north, Zhejiang Province to the south, and Jiangsu Province to the west.", - "What signs may indicate that a person is experiencing anxiety?", - "Describe how to make cheese pizza.", - "Write a review article on the development of 5G networks." - ] -else: - prompts = ["Shanghai is one of the most prosperous cities in China, with a GDP of over $300 billion. 
Shanghai has the fastest growing economy in China and is the second busiest port in the world. In addition to being a hub for business, Shanghai is also a major tourist destination. It is known for its diverse culture and many historical sites.\nThe city of Shanghai is located on the coast of the Pacific Ocean in east-central China. It is bordered by Jiangsu Province to the north, Zhejiang Province to the south, and Jiangsu Province to the west.",] + "Shanghai is one of the most prosperous cities in China, with a GDP of over $300 billion. Shanghai has the fastest growing economy in China and is the second busiest port in the world. In addition to being a hub for business, Shanghai is also a major tourist destination. It is known for its diverse culture and many historical sites.\nThe city of Shanghai is located on the coast of the Pacific Ocean in east-central China. It is bordered by Jiangsu Province to the north, Zhejiang Province to the south, and Jiangsu Province to the west.", + "What signs may indicate that a person is experiencing anxiety?", + "Describe how to make cheese pizza.", + "Write a review article on the development of 5G networks.", + ] -# Create a sampling params object. -sampling_params = SamplingParams(**sampling_params) + # Create a sampling params object. + sampling_params = SamplingParams(**sampling_params) -# Create an LLM. -llm = LLM(**engine_params) + # Create an LLM. + llm = LLM(**engine_params) -# process chat template -if args.remove_chat_template: - if 'chat' in model_name.lower(): - logging.warning(f"The model name from model path is {model_name}, so we guess you are using the chat model and the additional processing is required for the input prompt. " - f"If the result is not quite correct, please ensure you do not pass --remove_chat_template in CLI.") - prompts_new = prompts -else: - # Build chat model promopt - logging.warning("If you are using a non chat model, please pass the --remove_chat_template in CLI.") - # Try use transformers's apply_chat_template, if chat_template is None, will use defalut template. - # For some old models, the default template may cause bad answers. we don't consider this situation, - # because the Transformers team is advancing the chat template. For more informatino about it, - # please refer to https://huggingface.co/docs/transformers/main/chat_templating - try: - load_chat_template(llm.get_tokenizer(),args.chat_template) - prompts_new = [] - for prompt in prompts: - messages = [ - {"role": "user", "content": prompt} - ] - text = llm.get_tokenizer().apply_chat_template( - messages, - tokenize=False, - add_generation_prompt=True - ) - prompts_new.append(text) - except: - logging.warning("use tokenizer apply_chat_template function failed, may because of low transformers version...(try use transformers>=4.34.0)") + # process chat template + if args.remove_chat_template: prompts_new = prompts - -# Generate texts from the prompts. The output is a list of RequestOutput objects -# that contain the prompt, generated text, and other information. 
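The chat-template branch above ultimately relies on `tokenizer.apply_chat_template`, which the scripts reach through `llm.get_tokenizer()`. A minimal sketch of the same call made directly on a Hugging Face tokenizer (the checkpoint path is hypothetical; transformers>=4.34 is assumed, as the comments above note):

from transformers import AutoTokenizer

# Hypothetical checkpoint path, for illustration only.
tokenizer = AutoTokenizer.from_pretrained("/path/to/chat-model", trust_remote_code=True)
messages = [{"role": "user", "content": "描述一下如何制作芝士披萨。"}]
prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
print(prompt)  # the templated string that is then passed to llm.generate(...)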
-outputs = llm.generate(prompts_new, sampling_params,use_tqdm=False) if isinstance(prompts_new[0],str) else llm.generate(sampling_params=sampling_params,prompt_token_ids=prompts_new,use_tqdm=False) -torch.cuda.synchronize() - -start_time = time.perf_counter() -outputs = llm.generate(prompts_new, sampling_params) if isinstance(prompts_new[0],str) else llm.generate(sampling_params=sampling_params,prompt_token_ids=prompts_new) -torch.cuda.synchronize() -end_time = time.perf_counter() -duration_time = end_time - start_time - -num_tokens = 0 -# Print the outputs. -for i, output in enumerate(outputs): - prompt = prompts[i] # show the origin prompt. actully prompt is "output.prompt" - generated_text = output.outputs[0].text - - num_tokens += len(output.outputs[0].token_ids) - # if not args.qps_test and not args.acc_test: - print(f"Prompt: {prompt}\nGenerated text: {generated_text} \n") -print(f"tokens: {num_tokens}, QPS: {num_tokens/duration_time}") - -# test use -if args.qps_test: - import re - qps_dicts = {'llama.*7.*' :{1:36, 2:42, 8:30}, - 'llama.*13.*':{1:20, 2:28, 8:30} - } - for k,v in qps_dicts.items(): - if re.search(k,model_name.lower()): - qps_dict = v - break - args.qps_threshold = qps_dict.get(args.tensor_parallel_size,4.5) - if num_tokens/duration_time < args.qps_threshold: - print('val qps: {}, target qps: {}, fail'.format(num_tokens/duration_time,args.qps_threshold)) - exit(1) - print('val qps: {}, target qps: {}, pass'.format(num_tokens/duration_time,args.qps_threshold)) -if args.acc_test: - from rouge import Rouge - import re - acc_dict = {r'llama.?7.*?':"", - r'llama.?13.*?':" Shanghai is located on the Yangtze River Delta, which is the largest river delta in the world.\nShanghai has a humid subtropical climate with four distinct seasons. The summers are hot and humid, with temperatures reaching 35 degrees Celsius. The winters are cool and dry, with temperatures reaching 10 degrees Celsius. The city receives an average of 1,200 millimeters of rain per year.\nShanghai is the most populous city in China, with a population of over 23 million. The city is home to 9 million permanent residents and 14 million migrant workers. Shanghai is also home to the largest number of expatriates in China.\nShanghai is a major tourist destination, with over 23 million visitors per year. The city is home to many historical sites, including the Bund, the Yu Garden, and the Jade Buddha Temple. Shanghai is also home to the Oriental Pearl TV Tower, which is the second tallest tower in the world.\nShanghai is a major center for business and finance in China. It is home to the Shanghai Stock Exchange,", - r'llama.?2.?7.*?':" The city has a population of over 24 million people and covers an area of 6,340 square kilometers.\nShanghai is a major port city and is home to many large companies. The city is also a major tourist destination and is known for its many historical sites.\nShanghai is a major port city and is home to many large companies. The city is also a major tourist destination and is known for its many historical sites.\nShanghai is a major port city and is home to many large companies. The city is also a major tourist destination and is known for its many historical sites. Shanghai is a major port city and is home to many large companies. The city is also a major tourist destination and is known for its many historical sites.\nShanghai is a major port city and is home to many large companies. The city is also a major tourist destination and is known for its many historical sites. 
Shanghai is a major port city and is home to many large companies. The city is also a major tourist destination and is known for its many historical sites. Shanghai is a major port city and is home to many large companies. The city is also a major tourist", - r'llama.?2.?13.*?':" The city has a population of over 24 million people and covers an area of 6,340 square kilometers (2,448 square miles).\nShanghai is a major financial center in China and is home to many multinational corporations. The city has a diverse economy that includes manufacturing, finance, real estate, and tourism. Shanghai is also a major transportation hub with two international airports and a large port.\nThe city of Shanghai is divided into 16 districts. The districts are:\n1. Huangpu District\n2. Xuhui District\n3. Changning District\n4. Jing’an District\n5. Putuo District\n6. Yangpu District\n7. Hongkou District\n8. Baoshan District\n9. Minhang District\n10. Jiading District\n11. Qingpu District\n12. Songjiang District\n13. Fengxian District\n14. Jinshan District\n15. Nanhui District\n16. Pudong New Area\nThe city of Shanghai is divided into 16 districts. The districts are: Huangpu District, Xu", - } - for k, v in acc_dict.items(): - if re.search(k,model_name.lower()): - reference = v - break - candidate = outputs[0].outputs[0].text - scorer = Rouge() - scores = scorer.get_scores(reference, candidate) - if scores[0]["rouge-1"]['f'] < args.acc_threshold: - print('val ROUGE-1 score f1: {}, target ROUGE-1 score f1: {}, fail'.format(scores[0]["rouge-1"]['f'],args.acc_threshold)) - exit(1) - print('val ROUGE-1 score f1: {}, target ROUGE-1 score f1: {}, pass'.format(scores[0]["rouge-1"]['f'],args.acc_threshold)) - metricResult = {"metricResult": {}} - metricResult["metricResult"]["val ROUGE-1 score f1"] = scores[0]["rouge-1"]['f'] - metricResult["metricResult"]["val ROUGE-1 score f1"] = args.acc_threshold - print(metricResult) - -# 2 7b vllm 0.1.6: batch 3, tokens: 773, QPS: 64.35866137433203; batch 1, tokens: 257, QPS: 25.396898421442113 -# 1\2 13b vllm 0.1.6: batch 3, tokens: 768, QPS: 41.538942353799506; batch 1, tokens: 257, QPS: 15.639606595029639 (2, 6.5829828847570795; 8, 5.137610167755676) - -# 0.3.2 13b tokens: 768, QPS: 99.1182273040533 13b-awq-2card(tokens: 768, QPS: 161.07526866069998) 70b-awq-2card(tokens: 768, QPS: 55.91434180918294) -# 0.3.2 smoothquant 7b tokens: 750, QPS: 82.11710297948171(tokens: 768, QPS: 82.49768795244577) \ No newline at end of file + if "chat" in model_name.lower(): + logging.warning( + f"The model name from model path is {model_name}, so we guess you are using the chat model and the additional processing is required for the input prompt. " + f"If the result is not quite correct, please ensure you do not pass --remove_chat_template in CLI." + ) + else: + # Build chat model promopt + # logging.warning("If you are using a non chat model, please pass the --remove_chat_template in CLI.") + # Try use transformers's apply_chat_template, if chat_template is None, will use defalut template. + # For some old models, the default template may cause bad answers. we don't consider this situation, + # because the Transformers team is advancing the chat template. 
For more informatino about it, + # please refer to https://huggingface.co/docs/transformers/main/chat_templating + try: + load_chat_template(llm.get_tokenizer(), args.chat_template) + prompts_new = [] + for prompt in prompts: + messages = [{"role": "user", "content": prompt}] + text = llm.get_tokenizer().apply_chat_template( + messages, tokenize=False, add_generation_prompt=True + ) + prompts_new.append(text) + except: + logging.warning( + "use tokenizer apply_chat_template function failed, may because of low transformers version...(try use transformers>=4.34.0)" + ) + prompts_new = prompts + + outputs = ( + llm.generate(prompts_new, sampling_params, use_tqdm=False) + if isinstance(prompts_new[0], str) + else llm.generate( + sampling_params=sampling_params, + prompt_token_ids=prompts_new, + use_tqdm=False, + ) + ) + torch.cuda.synchronize() + + start_time = time.perf_counter() + outputs = ( + llm.generate(prompts_new, sampling_params) + if isinstance(prompts_new[0], str) + else llm.generate(sampling_params=sampling_params, prompt_token_ids=prompts_new) + ) + torch.cuda.synchronize() + end_time = time.perf_counter() + duration_time = end_time - start_time + + num_tokens = 0 + # Print the outputs. + for i, output in enumerate(outputs): + prompt = prompts[i] # show the origin prompt. actully prompt is "output.prompt" + generated_text = output.outputs[0].text + + num_tokens += len(output.outputs[0].token_ids) + print(f"Prompt: {prompt}\nGenerated text: {generated_text} \n") + print(f"tokens: {num_tokens}, QPS: {num_tokens/duration_time}") diff --git a/models/nlp/large_language_model/llama3-70b/vllm/utils.py b/models/nlp/large_language_model/llama3-70b/vllm/utils.py deleted file mode 100644 index de218b2a..00000000 --- a/models/nlp/large_language_model/llama3-70b/vllm/utils.py +++ /dev/null @@ -1,172 +0,0 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. -import codecs -import logging -import argparse - - -def sampling_add_cli_args(args: argparse.ArgumentParser) -> argparse.ArgumentParser: - args.add_argument( - '--n', - type=int, - default=1, - help="Number of output sequences to return for the given prompt.") - args.add_argument( - '--best-of', - type=int, - default=None, - help="Number of output sequences that are generated from the prompt. " - "From these `best_of` sequences, the top `n` sequences are returned. " - "`best_of` must be greater than or equal to `n`. This is treated as " - "the beam width when `use_beam_search` is True. By default, `best_of`" - "is set to `n`.") - args.add_argument( - '--presence-penalty', - type=float, - default=0.0, - help="Float that penalizes new tokens based on whether they " - "appear in the generated text so far. 
Values > 0 encourage the model " - "to use new tokens, while values < 0 encourage the model to repeat " - "tokens.") - args.add_argument( - '--frequency-penalty', - type=float, - default=0.0, - help="Float that penalizes new tokens based on their " - " frequency in the generated text so far. Values > 0 encourage the " - " model to use new tokens, while values < 0 encourage the model to " - "repeat tokens.") - args.add_argument( - '--repetition-penalty', - type=float, - default=1.0, - help="Float that penalizes new tokens based on whether " - "they appear in the prompt and the generated text so far. Values > 1 " - "encourage the model to use new tokens, while values < 1 encourage " - "the model to repeat tokens.") - args.add_argument( - '--temperature', - type=float, - default=1.0, - help="Float that controls the randomness of the sampling. Lower " - "values make the model more deterministic, while higher values make " - "the model more random. Zero means greedy sampling.") - args.add_argument( - '--top-p', - type=float, - default=1.0, - help="Float that controls the cumulative probability of the top tokens " - "to consider. Must be in (0, 1]. Set to 1 to consider all tokens.") - args.add_argument( - '--top-k', - type=int, - default=-1, - help="Integer that controls the number of top tokens to consider. Set " - "to -1 to consider all tokens.") - args.add_argument( - '--min-p', - type=float, - default=0.0, - help="Float that represents the minimum probability for a token to be " - "considered, relative to the probability of the most likely token. " - "Must be in [0, 1]. Set to 0 to disable this.") - args.add_argument( - '--use-beam-search', - default=False, - action="store_true", - help="Whether to use beam search instead of sampling.") - args.add_argument( - '--length-penalty', - type=float, - default=1.0, - help="Float that penalizes sequences based on their length. Used in beam search.") - args.add_argument( - '--stop', - type=str, - default=None, - help="List of strings that stop the generation when they are generated. " - "The returned output will not contain the stop strings.") - args.add_argument( - '--stop-token-ids', - type=int, - default=None, - help="List of tokens that stop the generation when they are " - "generated. The returned output will contain the stop tokens unless " - "the stop tokens are special tokens.") - args.add_argument( - '--include-stop-str-in-output', - default=False, - action="store_true", - help="Whether to include the stop strings in output text. Defaults to False.") - args.add_argument( - '--ignore-eos', - default=False, - action="store_true", - help="Whether to ignore the EOS token and continue generating tokens after the EOS token is generated.") - args.add_argument( - '--max-tokens', - type=int, - default=16, - help="Maximum number of tokens to generate per output sequence.") - args.add_argument( - '--logprobs', - type=int, - default=None, - help="NNumber of log probabilities to return per output token. " - "Note that the implementation follows the OpenAI API: The return " - "result includes the log probabilities on the `logprobs` most likely " - "tokens, as well the chosen tokens. 
The API will always return the " - "log probability of the sampled token, so there may be up to " - "`logprobs+1` elements in the response.") - args.add_argument( - '--prompt-logprobs', - type=int, - default=None, - help="Number of log probabilities to return per prompt token.") - args.add_argument( - '--skip-special-tokens', - default=True, - action="store_false", - help="Whether to skip special tokens in the output.") - args.add_argument( - '--spaces-between-special-tokens', - default=True, - action="store_false", - help="Whether to add spaces between special tokens in the output. Defaults to True.") - # early_stopping logits_processors seed - return args - - -def load_chat_template(tokenizer, chat_template): - if chat_template is not None: - try: - with open(chat_template, "r") as f: - tokenizer.chat_template = f.read() - except OSError: - # If opening a file fails, set chat template to be args to - # ensure we decode so our escape are interpreted correctly - tokenizer.chat_template = codecs.decode( - chat_template, "unicode_escape") - - logging.info( - f"Using supplied chat template:\n{tokenizer.chat_template}" - ) - elif tokenizer.chat_template is not None: - logging.info( - f"Using default chat template:\n{tokenizer.chat_template}. This May lead to unsatisfactory results. You can provide a template.jinja file for vllm." - ) - else: - logging.warning( - "No chat template provided. Chat API will not work. This May lead to unsatisfactory results. You can provide a template.jinja file for vllm.") \ No newline at end of file diff --git a/models/nlp/large_language_model/qwen-7b/text-generation-inference/offline_inference.py b/models/nlp/large_language_model/qwen-7b/text-generation-inference/offline_inference.py index e3ebcc3a..d16450cd 100644 --- a/models/nlp/large_language_model/qwen-7b/text-generation-inference/offline_inference.py +++ b/models/nlp/large_language_model/qwen-7b/text-generation-inference/offline_inference.py @@ -1,35 +1,33 @@ -from text_generation_server.models.flash_qwen import ( - FlashQwen, - ) +import argparse +import time + import torch +from text_generation_server.models.flash_qwen2 import FlashQwen2 from text_generation_server.pb import generate_pb2 - -import time +from text_generation_server.utils.speculate import set_speculate from torch.cuda import profiler -import argparse + def parse_args(args=None): parser = argparse.ArgumentParser() - parser.add_argument('--generate_length', type=int, default=512) - parser.add_argument('--model2path', type=str, default="/home/data/nlp/qwen/qwen-7B") - parser.add_argument('--quantize', type=str, default=None, choices=['awq']) - parser.add_argument('--speculate', type=int, default=0) + parser.add_argument("--generate_length", type=int, default=512) + parser.add_argument( + "--model2path", type=str, default="/home/data/nlp/qwen2/Qwen1.5-0.5B" + ) + parser.add_argument("--quantize", type=str, default=None, choices=["awq"]) + parser.add_argument("--speculate", type=int, default=0) return parser.parse_args(args) + if __name__ == "__main__": args = parse_args() - isNewVersion = True - try: - from text_generation_server.utils.speculate import set_speculate - except ImportError: - isNewVersion = False - print("use n-gram speculate must update tgi version to 1.4.3+") - else: - set_speculate(args.speculate) + max_input_length = 2048 max_prefill_tokens = 2048 - model = FlashQwen(args.model2path, trust_remote_code=True) + + set_speculate(args.speculate) + model = FlashQwen2(args.model2path, trust_remote_code=True, quantize=args.quantize) 
first_line = "蒙古国的首都是乌兰巴托(Ulaanbaatar)\n冰岛的首都是雷克雅未克(Reykjavik)\n埃塞俄比亚的首都是" @@ -41,10 +39,12 @@ if __name__ == "__main__": typical_p=1.0, do_sample=False, ) - - default_pb_stop_parameters = generate_pb2.StoppingCriteriaParameters(stop_sequences=[], max_new_tokens=args.generate_length) - - warmup_requests = generate_pb2.Request( + + default_pb_stop_parameters = generate_pb2.StoppingCriteriaParameters( + stop_sequences=[], max_new_tokens=args.generate_length + ) + + warmup_requests = generate_pb2.Request( id=0, inputs="_test " * max_input_length, prefill_logprobs=True, @@ -64,13 +64,13 @@ if __name__ == "__main__": stop_sequences=[], ignore_eos_token=False, ), - top_n_tokens = 20 + top_n_tokens=20, ) warmup_requests_batch = generate_pb2.Batch(id=0, requests=[warmup_requests], size=1) - warmup_requests_batchs = model.batch_type.from_pb( + warmup_requests_batchs = model.batch_type.from_pb( warmup_requests_batch, model.tokenizer, model.dtype, torch.device("cuda") ) - + model.warmup(warmup_requests_batchs) pb_request = generate_pb2.Request( @@ -87,65 +87,84 @@ if __name__ == "__main__": ) next_batch_one = causal_lm_one_batch - last_generations = True + last_generations = True torch.cuda.synchronize() profiler.start() start_time = time.perf_counter() for _ in range(causal_lm_one_batch.stopping_criterias[0].max_new_tokens - 1): - data = model.generate_token(next_batch_one) - if isNewVersion: - generations_one, next_batch_one, _ = data - else: - generations_one, next_batch_one = data + generations_one, next_batch_one, _ = model.generate_token(next_batch_one) if next_batch_one is None: last_generations = False break if last_generations: - data = model.generate_token(next_batch_one) - generations_one = data[0] + generations_one, next_batch_one, _ = model.generate_token(next_batch_one) profiler.stop() torch.cuda.synchronize() end_time = time.perf_counter() duration_time = end_time - start_time print(f"generate length: {generations_one[0].generated_text.generated_tokens}") - print(f"one batch: {generations_one[0].generated_text.text}\nqps: {generations_one[0].generated_text.generated_tokens /duration_time}") - metricResult = {"metricResult": {}} - metricResult["metricResult"]["generate length"] = generations_one[0].generated_text.generated_tokens - metricResult["metricResult"]["one batch"] = generations_one[0].generated_text.text - metricResult["metricResult"]["qps"] = generations_one[0].generated_text.generated_tokens /duration_time - print(metricResult) + print( + f"one batch: {generations_one[0].generated_text.text}\nqps: {generations_one[0].generated_text.generated_tokens /duration_time}" + ) """ -qwen-7B -亚的斯亚贝巴(Addis Ababa) -尼日利亚的首都是阿布贾(Abuja) -巴基斯坦的首都是伊斯兰堡(Islamabad) -菲律宾的首都是马尼拉(Manila) -波兰的首都是华沙(Warsaw) -葡萄牙的首都是里斯本(Lisbon) +qwen1.5-0.5B +one batch: 亚历山大(Alexandria) 俄罗斯的首都是莫斯科(Moscow) -新加坡的首都是新加坡(Singapore) -南非的首都是比勒陀利亚(Pretoria) -西班牙的首都是马德里(Madrid) -斯里兰卡的首都是斯里贾亚瓦德纳普拉克特(Sri Jayawardenepura Kotte) -斯洛伐克的首都是布拉迪斯拉发(Bratislava) -斯洛文尼亚的首都是卢布尔雅那(Ljubljana) -南非的首都是比勒陀利亚(Pretoria) -瑞典的首都是斯德哥尔摩(Stockholm) -瑞士的首都是伯尔尼(Bern) -泰国的首都是曼谷(Bangkok) -土耳其的首都是安卡拉(Ankara) -乌克兰的首都是基辅(Kyiv) +土耳其的首都是伊斯坦布尔(Istanbul) +南非的首都是开普敦(Cape Town) +美国的首都是华盛顿(Washington) +澳大利亚的首都是堪培拉(Canberra) +印度的首都是新德里(New Delhi) +法国的首都是巴黎(Paris) 英国的首都是伦敦(London) -美国的首都是华盛顿特区(Washington, D.C.) 
-乌兹别克斯坦的首都是塔什干(Tashkent) -委内瑞拉的首都是加拉加斯(Caracas) -越南的首都是河内(Hanoi) -赞比亚的首都是卢萨卡(Lusaka) -津巴布韦的首都是哈拉雷(Harare) -以上是世界上一些国家的首都,当然还有很多其他国家的首都,这里只是列举了一些比较有代表性的。 2022年广东省公务员考试公告于11月26日发布,报考者可在 2021年11月29日9︰00至12月3日16︰00 的时间内报名。建议小伙伴们根据本人的专业、意愿和职业规划等选择报考职位,不要等到最后才匆忙报名,以免因时间不足等情况无法完成报名而造成遗憾。 - ——2022年广东省考报名有关解答—— - 报考者如何办理考试费减免手续? - 答:报考者如属城乡最低生活保障对象,可向报考职位所在考区考务机构申请减免考试费,申请对象需提交其家庭所在地的县(区、 -qps: 34.23966521171583 -""" \ No newline at end of file +加拿大首都是温哥华(Vancouver) +南非首都是开普敦(Cape Town) +美国首都是华盛顿(Washington) +澳大利亚首都是堪培拉(Canberra) +印度首都是新德里(New Delhi) +法国首都是巴黎(Paris) +英国首都是伦敦(London) +加拿大首都是温哥华(Vancouver) +南非首都是开普敦(Cape Town) +美国首都是华盛顿(Washington) +澳大利亚首都是堪培拉(Canberra) +印度首都是新德里(New Delhi) +法国首都是巴黎(Paris) +英国首都是伦敦(London) +加拿大首都是温哥华(Vancouver) +南非首都是开普敦(Cape Town) +美国首都是华盛顿(Washington) +澳大利亚首都是堪培拉(Canberra) +印度首都是新德里(New Delhi) +法国首都是巴黎(Paris) +英国首都是伦敦(London) +加拿大首都是温哥华(Vancouver) +南非首都是开普敦(Cape Town) +美国首都是华盛顿(Washington) +澳大利亚首都是堪培拉(Canberra) +印度首都是新德里(New Delhi) +法国首都是巴黎(Paris) +英国首都是伦敦(London) +加拿大首都是温哥华(Vancouver) +南非首都是开普敦(Cape Town) +美国首都是华盛顿(Washington) +澳大利亚首都是堪培拉(Canberra) +印度首都是新德里(New Delhi) +法国首都是巴黎(Paris) +英国首都是伦敦(London) +加拿大首都是温哥华(Vancouver) +南非首都是开普敦(Cape Town) +美国首都是华盛顿(Washington) +澳大利亚首都是堪培拉(Canberra) +印度首都是新德里(New Delhi) +法国首都是巴黎(Paris) +英国首都是伦敦(London) +加拿大首都是温哥华(Vancouver) +南非首都是开普敦(Cape Town) +美国首都是华盛顿(Washington) +澳大利亚首都是堪培拉(Canberra) +印度首都是新德里(New Delhi) +法国首都是巴黎( +qps: 128.489649542011 +""" diff --git a/models/nlp/large_language_model/qwen-7b/vllm/offline_inference.py b/models/nlp/large_language_model/qwen-7b/vllm/offline_inference.py index de90a63e..e9275e21 100644 --- a/models/nlp/large_language_model/qwen-7b/vllm/offline_inference.py +++ b/models/nlp/large_language_model/qwen-7b/vllm/offline_inference.py @@ -1,20 +1,6 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. - import sys from pathlib import Path +import os sys.path.append(str(Path(__file__).resolve().parent.parent.parent)) import argparse @@ -45,16 +31,14 @@ if __name__ == "__main__": param.name for param in list( inspect.signature(SamplingParams).parameters.values() - )[1:] + ) ] engine_params = {attr: getattr(args, attr) for attr in engine_args} sampling_params = { attr: getattr(args, attr) for attr in sampling_args if args.__contains__(attr) } - model_name = args.model.strip() - model_name = model_name if args.model[-1] != "/" else model_name[:-1] - model_name = model_name.rsplit("/")[-1] + model_name = os.path.dirname(args.model).rsplit("/")[-1] # Sample prompts. 
prompts = ["哪些迹象可能表明一个人正在经历焦虑?", "描述一下如何制作芝士披萨。", "写一篇有关5G网络研发的综述文章。"] @@ -130,7 +114,3 @@ if __name__ == "__main__": num_tokens += len(output.outputs[0].token_ids) print(f"Prompt: {prompt}\nGenerated text: {generated_text} \n") print(f"tokens: {num_tokens}, QPS: {num_tokens/duration_time}") - metricResult = {"metricResult": {}} - metricResult["metricResult"]["tokens"] = num_tokens - metricResult["metricResult"]["QPS"] = round(num_tokens/duration_time,3) - print(metricResult) diff --git a/models/nlp/large_language_model/qwen-7b/vllm/utils.py b/models/nlp/large_language_model/qwen-7b/vllm/utils.py deleted file mode 100644 index c6def85d..00000000 --- a/models/nlp/large_language_model/qwen-7b/vllm/utils.py +++ /dev/null @@ -1,173 +0,0 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. - -import codecs -import logging -import argparse - - -def sampling_add_cli_args(args: argparse.ArgumentParser) -> argparse.ArgumentParser: - args.add_argument( - '--n', - type=int, - default=1, - help="Number of output sequences to return for the given prompt.") - args.add_argument( - '--best-of', - type=int, - default=None, - help="Number of output sequences that are generated from the prompt. " - "From these `best_of` sequences, the top `n` sequences are returned. " - "`best_of` must be greater than or equal to `n`. This is treated as " - "the beam width when `use_beam_search` is True. By default, `best_of`" - "is set to `n`.") - args.add_argument( - '--presence-penalty', - type=float, - default=0.0, - help="Float that penalizes new tokens based on whether they " - "appear in the generated text so far. Values > 0 encourage the model " - "to use new tokens, while values < 0 encourage the model to repeat " - "tokens.") - args.add_argument( - '--frequency-penalty', - type=float, - default=0.0, - help="Float that penalizes new tokens based on their " - " frequency in the generated text so far. Values > 0 encourage the " - " model to use new tokens, while values < 0 encourage the model to " - "repeat tokens.") - args.add_argument( - '--repetition-penalty', - type=float, - default=1.0, - help="Float that penalizes new tokens based on whether " - "they appear in the prompt and the generated text so far. Values > 1 " - "encourage the model to use new tokens, while values < 1 encourage " - "the model to repeat tokens.") - args.add_argument( - '--temperature', - type=float, - default=1.0, - help="Float that controls the randomness of the sampling. Lower " - "values make the model more deterministic, while higher values make " - "the model more random. Zero means greedy sampling.") - args.add_argument( - '--top-p', - type=float, - default=1.0, - help="Float that controls the cumulative probability of the top tokens " - "to consider. Must be in (0, 1]. Set to 1 to consider all tokens.") - args.add_argument( - '--top-k', - type=int, - default=-1, - help="Integer that controls the number of top tokens to consider. 
Set " - "to -1 to consider all tokens.") - args.add_argument( - '--min-p', - type=float, - default=0.0, - help="Float that represents the minimum probability for a token to be " - "considered, relative to the probability of the most likely token. " - "Must be in [0, 1]. Set to 0 to disable this.") - args.add_argument( - '--use-beam-search', - default=False, - action="store_true", - help="Whether to use beam search instead of sampling.") - args.add_argument( - '--length-penalty', - type=float, - default=1.0, - help="Float that penalizes sequences based on their length. Used in beam search.") - args.add_argument( - '--stop', - type=str, - default=None, - help="List of strings that stop the generation when they are generated. " - "The returned output will not contain the stop strings.") - args.add_argument( - '--stop-token-ids', - type=int, - default=None, - help="List of tokens that stop the generation when they are " - "generated. The returned output will contain the stop tokens unless " - "the stop tokens are special tokens.") - args.add_argument( - '--include-stop-str-in-output', - default=False, - action="store_true", - help="Whether to include the stop strings in output text. Defaults to False.") - args.add_argument( - '--ignore-eos', - default=False, - action="store_true", - help="Whether to ignore the EOS token and continue generating tokens after the EOS token is generated.") - args.add_argument( - '--max-tokens', - type=int, - default=16, - help="Maximum number of tokens to generate per output sequence.") - args.add_argument( - '--logprobs', - type=int, - default=None, - help="NNumber of log probabilities to return per output token. " - "Note that the implementation follows the OpenAI API: The return " - "result includes the log probabilities on the `logprobs` most likely " - "tokens, as well the chosen tokens. The API will always return the " - "log probability of the sampled token, so there may be up to " - "`logprobs+1` elements in the response.") - args.add_argument( - '--prompt-logprobs', - type=int, - default=None, - help="Number of log probabilities to return per prompt token.") - args.add_argument( - '--skip-special-tokens', - default=True, - action="store_false", - help="Whether to skip special tokens in the output.") - args.add_argument( - '--spaces-between-special-tokens', - default=True, - action="store_false", - help="Whether to add spaces between special tokens in the output. Defaults to True.") - # early_stopping logits_processors seed - return args - - -def load_chat_template(tokenizer, chat_template): - if chat_template is not None: - try: - with open(chat_template, "r") as f: - tokenizer.chat_template = f.read() - except OSError: - # If opening a file fails, set chat template to be args to - # ensure we decode so our escape are interpreted correctly - tokenizer.chat_template = codecs.decode( - chat_template, "unicode_escape") - - logging.info( - f"Using supplied chat template:\n{tokenizer.chat_template}" - ) - elif tokenizer.chat_template is not None: - logging.info( - f"Using default chat template:\n{tokenizer.chat_template}. This May lead to unsatisfactory results. You can provide a template.jinja file for vllm." - ) - else: - logging.warning( - "No chat template provided. Chat API will not work. This May lead to unsatisfactory results. 
You can provide a template.jinja file for vllm.") diff --git a/models/nlp/large_language_model/qwen1.5-14b/vllm/offline_inference.py b/models/nlp/large_language_model/qwen1.5-14b/vllm/offline_inference.py index 75eb8181..9b7d87fd 100644 --- a/models/nlp/large_language_model/qwen1.5-14b/vllm/offline_inference.py +++ b/models/nlp/large_language_model/qwen1.5-14b/vllm/offline_inference.py @@ -1,114 +1,115 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. - import sys from pathlib import Path -sys.path.append(str(Path(__file__).resolve().parent.parent.parent)) -from utils import load_chat_template,sampling_add_cli_args +import os -import logging -import time +sys.path.append(str(Path(__file__).resolve().parent.parent.parent)) import argparse import dataclasses import inspect +import logging +import time import torch -from vllm import LLM, SamplingParams, EngineArgs - - -parser = argparse.ArgumentParser() -parser.add_argument("--chat_template",type=str,default=None) -parser.add_argument("--remove_chat_template",default=False,action="store_true",help="pass this if you are not use a chat model") -parser = EngineArgs.add_cli_args(parser) -parser = sampling_add_cli_args(parser) -args = parser.parse_args() +from utils import load_chat_template, sampling_add_cli_args +from vllm import LLM, EngineArgs, SamplingParams -engine_args = [attr.name for attr in dataclasses.fields(EngineArgs)] -sampling_args = [param.name for param in list(inspect.signature(SamplingParams).parameters.values())[1:]] -engine_params = {attr:getattr(args, attr) for attr in engine_args} -sampling_params = {attr:getattr(args, attr) for attr in sampling_args if args.__contains__(attr)} +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--chat_template", type=str, default=None) + parser.add_argument( + "--remove_chat_template", + default=False, + action="store_true", + help="pass this if you are not use a chat model", + ) + parser = EngineArgs.add_cli_args(parser) + parser = sampling_add_cli_args(parser) + args = parser.parse_args() -model_name = args.model.strip() -model_name = model_name if args.model[-1]!='/' else model_name[:-1] -model_name = model_name.rsplit('/')[-1] + engine_args = [attr.name for attr in dataclasses.fields(EngineArgs)] + sampling_args = [ + param.name + for param in list( + inspect.signature(SamplingParams).parameters.values() + ) + ] + engine_params = {attr: getattr(args, attr) for attr in engine_args} + sampling_params = { + attr: getattr(args, attr) for attr in sampling_args if args.__contains__(attr) + } + model_name = os.path.dirname(args.model).rsplit("/")[-1] -# Sample prompts. -prompts = [ - "哪些迹象可能表明一个人正在经历焦虑?", - "描述一下如何制作芝士披萨。", - "写一篇有关5G网络研发的综述文章。" - ] + # Sample prompts. + prompts = ["哪些迹象可能表明一个人正在经历焦虑?", "描述一下如何制作芝士披萨。", "写一篇有关5G网络研发的综述文章。"] -# Create a sampling params object. -sampling_params = SamplingParams(**sampling_params) + # Create a sampling params object. 
+ sampling_params = SamplingParams(**sampling_params) -# Create an LLM. -llm = LLM(**engine_params) + # Create an LLM. + llm = LLM(**engine_params) -# process chat template -if args.remove_chat_template: - if 'chat' in model_name.lower(): - logging.warning(f"The model name from model path is {model_name}, so we guess you are using the chat model and the additional processing is required for the input prompt. " - f"If the result is not quite correct, please ensure you do not pass --remove_chat_template in CLI.") - prompts_new = prompts -else: - # Build chat model promopt - logging.warning("If you are using a non chat model, please pass the --remove_chat_template in CLI.") - # Try use transformers's apply_chat_template, if chat_template is None, will use defalut template. - # For some old models, the default template may cause bad answers. we don't consider this situation, - # because the Transformers team is advancing the chat template. For more informatino about it, - # please refer to https://huggingface.co/docs/transformers/main/chat_templating - try: - load_chat_template(llm.get_tokenizer(),args.chat_template) - prompts_new = [] - for prompt in prompts: - messages = [ - {"role": "user", "content": prompt} - ] - text = llm.get_tokenizer().apply_chat_template( - messages, - tokenize=False, - add_generation_prompt=True + # process chat template + if args.remove_chat_template: + if "chat" in model_name.lower(): + logging.warning( + f"The model name from model path is {model_name}, so we guess you are using the chat model and the additional processing is required for the input prompt. " + f"If the result is not quite correct, please ensure you do not pass --remove_chat_template in CLI." ) - prompts_new.append(text) - except: - logging.warning("use tokenizer apply_chat_template function failed, may because of low transformers version...(try use transformers>=4.34.0)") prompts_new = prompts + else: + # Build chat model promopt + logging.warning( + "If you are using a non chat model, please pass the --remove_chat_template in CLI." + ) + # Try use transformers's apply_chat_template, if chat_template is None, will use defalut template. + # For some old models, the default template may cause bad answers. we don't consider this situation, + # because the Transformers team is advancing the chat template. For more informatino about it, + # please refer to https://huggingface.co/docs/transformers/main/chat_templating + try: + load_chat_template(llm.get_tokenizer(), args.chat_template) + prompts_new = [] + for prompt in prompts: + messages = [{"role": "user", "content": prompt}] + text = llm.get_tokenizer().apply_chat_template( + messages, tokenize=False, add_generation_prompt=True + ) + prompts_new.append(text) + except: + logging.warning( + "use tokenizer apply_chat_template function failed, may because of low transformers version...(try use transformers>=4.34.0)" + ) + prompts_new = prompts + + # Generate texts from the prompts. The output is a list of RequestOutput objects + # that contain the prompt, generated text, and other information. + outputs = ( + llm.generate(prompts_new, sampling_params, use_tqdm=False) + if isinstance(prompts_new[0], str) + else llm.generate( + sampling_params=sampling_params, + prompt_token_ids=prompts_new, + use_tqdm=False, + ) + ) + torch.cuda.synchronize() -# Generate texts from the prompts. The output is a list of RequestOutput objects -# that contain the prompt, generated text, and other information. 
-outputs = llm.generate(prompts_new, sampling_params,use_tqdm=False) if isinstance(prompts_new[0],str) else llm.generate(sampling_params=sampling_params,prompt_token_ids=prompts_new,use_tqdm=False) -torch.cuda.synchronize() + start_time = time.perf_counter() + outputs = ( + llm.generate(prompts_new, sampling_params) + if isinstance(prompts_new[0], str) + else llm.generate(sampling_params=sampling_params, prompt_token_ids=prompts_new) + ) + torch.cuda.synchronize() + end_time = time.perf_counter() + duration_time = end_time - start_time -start_time = time.perf_counter() -outputs = llm.generate(prompts_new, sampling_params) if isinstance(prompts_new[0],str) else llm.generate(sampling_params=sampling_params,prompt_token_ids=prompts_new) -torch.cuda.synchronize() -end_time = time.perf_counter() -duration_time = end_time - start_time + num_tokens = 0 + # Print the outputs. + for i, output in enumerate(outputs): + prompt = prompts[i] # show the origin prompt. actully prompt is "output.prompt" + generated_text = output.outputs[0].text -num_tokens = 0 -# Print the outputs. -for i, output in enumerate(outputs): - prompt = prompts[i] # show the origin prompt. actully prompt is "output.prompt" - generated_text = output.outputs[0].text - - num_tokens += len(output.outputs[0].token_ids) - print(f"Prompt: {prompt}\nGenerated text: {generated_text} \n") -print(f"tokens: {num_tokens}, QPS: {num_tokens/duration_time}") -metricResult = {"metricResult": {}} -metricResult["metricResult"]["tokens"] = num_tokens -metricResult["metricResult"]["QPS"] = round(num_tokens/duration_time,3) -print(metricResult) \ No newline at end of file + num_tokens += len(output.outputs[0].token_ids) + print(f"Prompt: {prompt}\nGenerated text: {generated_text} \n") + print(f"tokens: {num_tokens}, QPS: {num_tokens/duration_time}") diff --git a/models/nlp/large_language_model/qwen1.5-14b/vllm/utils.py b/models/nlp/large_language_model/qwen1.5-14b/vllm/utils.py deleted file mode 100644 index c6def85d..00000000 --- a/models/nlp/large_language_model/qwen1.5-14b/vllm/utils.py +++ /dev/null @@ -1,173 +0,0 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. - -import codecs -import logging -import argparse - - -def sampling_add_cli_args(args: argparse.ArgumentParser) -> argparse.ArgumentParser: - args.add_argument( - '--n', - type=int, - default=1, - help="Number of output sequences to return for the given prompt.") - args.add_argument( - '--best-of', - type=int, - default=None, - help="Number of output sequences that are generated from the prompt. " - "From these `best_of` sequences, the top `n` sequences are returned. " - "`best_of` must be greater than or equal to `n`. This is treated as " - "the beam width when `use_beam_search` is True. 
By default, `best_of`" - "is set to `n`.") - args.add_argument( - '--presence-penalty', - type=float, - default=0.0, - help="Float that penalizes new tokens based on whether they " - "appear in the generated text so far. Values > 0 encourage the model " - "to use new tokens, while values < 0 encourage the model to repeat " - "tokens.") - args.add_argument( - '--frequency-penalty', - type=float, - default=0.0, - help="Float that penalizes new tokens based on their " - " frequency in the generated text so far. Values > 0 encourage the " - " model to use new tokens, while values < 0 encourage the model to " - "repeat tokens.") - args.add_argument( - '--repetition-penalty', - type=float, - default=1.0, - help="Float that penalizes new tokens based on whether " - "they appear in the prompt and the generated text so far. Values > 1 " - "encourage the model to use new tokens, while values < 1 encourage " - "the model to repeat tokens.") - args.add_argument( - '--temperature', - type=float, - default=1.0, - help="Float that controls the randomness of the sampling. Lower " - "values make the model more deterministic, while higher values make " - "the model more random. Zero means greedy sampling.") - args.add_argument( - '--top-p', - type=float, - default=1.0, - help="Float that controls the cumulative probability of the top tokens " - "to consider. Must be in (0, 1]. Set to 1 to consider all tokens.") - args.add_argument( - '--top-k', - type=int, - default=-1, - help="Integer that controls the number of top tokens to consider. Set " - "to -1 to consider all tokens.") - args.add_argument( - '--min-p', - type=float, - default=0.0, - help="Float that represents the minimum probability for a token to be " - "considered, relative to the probability of the most likely token. " - "Must be in [0, 1]. Set to 0 to disable this.") - args.add_argument( - '--use-beam-search', - default=False, - action="store_true", - help="Whether to use beam search instead of sampling.") - args.add_argument( - '--length-penalty', - type=float, - default=1.0, - help="Float that penalizes sequences based on their length. Used in beam search.") - args.add_argument( - '--stop', - type=str, - default=None, - help="List of strings that stop the generation when they are generated. " - "The returned output will not contain the stop strings.") - args.add_argument( - '--stop-token-ids', - type=int, - default=None, - help="List of tokens that stop the generation when they are " - "generated. The returned output will contain the stop tokens unless " - "the stop tokens are special tokens.") - args.add_argument( - '--include-stop-str-in-output', - default=False, - action="store_true", - help="Whether to include the stop strings in output text. Defaults to False.") - args.add_argument( - '--ignore-eos', - default=False, - action="store_true", - help="Whether to ignore the EOS token and continue generating tokens after the EOS token is generated.") - args.add_argument( - '--max-tokens', - type=int, - default=16, - help="Maximum number of tokens to generate per output sequence.") - args.add_argument( - '--logprobs', - type=int, - default=None, - help="NNumber of log probabilities to return per output token. " - "Note that the implementation follows the OpenAI API: The return " - "result includes the log probabilities on the `logprobs` most likely " - "tokens, as well the chosen tokens. 
The API will always return the " - "log probability of the sampled token, so there may be up to " - "`logprobs+1` elements in the response.") - args.add_argument( - '--prompt-logprobs', - type=int, - default=None, - help="Number of log probabilities to return per prompt token.") - args.add_argument( - '--skip-special-tokens', - default=True, - action="store_false", - help="Whether to skip special tokens in the output.") - args.add_argument( - '--spaces-between-special-tokens', - default=True, - action="store_false", - help="Whether to add spaces between special tokens in the output. Defaults to True.") - # early_stopping logits_processors seed - return args - - -def load_chat_template(tokenizer, chat_template): - if chat_template is not None: - try: - with open(chat_template, "r") as f: - tokenizer.chat_template = f.read() - except OSError: - # If opening a file fails, set chat template to be args to - # ensure we decode so our escape are interpreted correctly - tokenizer.chat_template = codecs.decode( - chat_template, "unicode_escape") - - logging.info( - f"Using supplied chat template:\n{tokenizer.chat_template}" - ) - elif tokenizer.chat_template is not None: - logging.info( - f"Using default chat template:\n{tokenizer.chat_template}. This May lead to unsatisfactory results. You can provide a template.jinja file for vllm." - ) - else: - logging.warning( - "No chat template provided. Chat API will not work. This May lead to unsatisfactory results. You can provide a template.jinja file for vllm.") diff --git a/models/nlp/large_language_model/qwen1.5-32b/vllm/offline_inference.py b/models/nlp/large_language_model/qwen1.5-32b/vllm/offline_inference.py index fd97059a..9b7d87fd 100644 --- a/models/nlp/large_language_model/qwen1.5-32b/vllm/offline_inference.py +++ b/models/nlp/large_language_model/qwen1.5-32b/vllm/offline_inference.py @@ -1,20 +1,6 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. - import sys from pathlib import Path +import os sys.path.append(str(Path(__file__).resolve().parent.parent.parent)) import argparse @@ -45,16 +31,14 @@ if __name__ == "__main__": param.name for param in list( inspect.signature(SamplingParams).parameters.values() - )[1:] + ) ] engine_params = {attr: getattr(args, attr) for attr in engine_args} sampling_params = { attr: getattr(args, attr) for attr in sampling_args if args.__contains__(attr) } - model_name = args.model.strip() - model_name = model_name if args.model[-1] != "/" else model_name[:-1] - model_name = model_name.rsplit("/")[-1] + model_name = os.path.dirname(args.model).rsplit("/")[-1] # Sample prompts. 
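The refactor above splits the parsed CLI namespace into engine arguments and sampling arguments by introspection. A condensed sketch of that pattern, assuming the vLLM version targeted by this patch (where inspect.signature(SamplingParams) no longer reports a leading self parameter, hence the removal of [1:]):

import argparse
import dataclasses
import inspect

from vllm import EngineArgs, SamplingParams

parser = argparse.ArgumentParser()
parser = EngineArgs.add_cli_args(parser)  # engine flags such as --model, --dtype, ...
# sampling_add_cli_args(parser) from the shared utils.py would add --temperature, --top-p, ...
args = parser.parse_args()

engine_names = {f.name for f in dataclasses.fields(EngineArgs)}
sampling_names = set(inspect.signature(SamplingParams).parameters)

# Route each parsed flag to the object that accepts it.
engine_kwargs = {k: v for k, v in vars(args).items() if k in engine_names}
sampling_kwargs = {k: v for k, v in vars(args).items() if k in sampling_names}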
prompts = ["哪些迹象可能表明一个人正在经历焦虑?", "描述一下如何制作芝士披萨。", "写一篇有关5G网络研发的综述文章。"] @@ -129,7 +113,3 @@ if __name__ == "__main__": num_tokens += len(output.outputs[0].token_ids) print(f"Prompt: {prompt}\nGenerated text: {generated_text} \n") print(f"tokens: {num_tokens}, QPS: {num_tokens/duration_time}") - metricResult = {"metricResult": {}} - metricResult["metricResult"]["tokens"] = num_tokens - metricResult["metricResult"]["QPS"] = round(num_tokens/duration_time,3) - print(metricResult) \ No newline at end of file diff --git a/models/nlp/large_language_model/qwen1.5-32b/vllm/utils.py b/models/nlp/large_language_model/qwen1.5-32b/vllm/utils.py deleted file mode 100644 index c6def85d..00000000 --- a/models/nlp/large_language_model/qwen1.5-32b/vllm/utils.py +++ /dev/null @@ -1,173 +0,0 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. - -import codecs -import logging -import argparse - - -def sampling_add_cli_args(args: argparse.ArgumentParser) -> argparse.ArgumentParser: - args.add_argument( - '--n', - type=int, - default=1, - help="Number of output sequences to return for the given prompt.") - args.add_argument( - '--best-of', - type=int, - default=None, - help="Number of output sequences that are generated from the prompt. " - "From these `best_of` sequences, the top `n` sequences are returned. " - "`best_of` must be greater than or equal to `n`. This is treated as " - "the beam width when `use_beam_search` is True. By default, `best_of`" - "is set to `n`.") - args.add_argument( - '--presence-penalty', - type=float, - default=0.0, - help="Float that penalizes new tokens based on whether they " - "appear in the generated text so far. Values > 0 encourage the model " - "to use new tokens, while values < 0 encourage the model to repeat " - "tokens.") - args.add_argument( - '--frequency-penalty', - type=float, - default=0.0, - help="Float that penalizes new tokens based on their " - " frequency in the generated text so far. Values > 0 encourage the " - " model to use new tokens, while values < 0 encourage the model to " - "repeat tokens.") - args.add_argument( - '--repetition-penalty', - type=float, - default=1.0, - help="Float that penalizes new tokens based on whether " - "they appear in the prompt and the generated text so far. Values > 1 " - "encourage the model to use new tokens, while values < 1 encourage " - "the model to repeat tokens.") - args.add_argument( - '--temperature', - type=float, - default=1.0, - help="Float that controls the randomness of the sampling. Lower " - "values make the model more deterministic, while higher values make " - "the model more random. Zero means greedy sampling.") - args.add_argument( - '--top-p', - type=float, - default=1.0, - help="Float that controls the cumulative probability of the top tokens " - "to consider. Must be in (0, 1]. 
Set to 1 to consider all tokens.") - args.add_argument( - '--top-k', - type=int, - default=-1, - help="Integer that controls the number of top tokens to consider. Set " - "to -1 to consider all tokens.") - args.add_argument( - '--min-p', - type=float, - default=0.0, - help="Float that represents the minimum probability for a token to be " - "considered, relative to the probability of the most likely token. " - "Must be in [0, 1]. Set to 0 to disable this.") - args.add_argument( - '--use-beam-search', - default=False, - action="store_true", - help="Whether to use beam search instead of sampling.") - args.add_argument( - '--length-penalty', - type=float, - default=1.0, - help="Float that penalizes sequences based on their length. Used in beam search.") - args.add_argument( - '--stop', - type=str, - default=None, - help="List of strings that stop the generation when they are generated. " - "The returned output will not contain the stop strings.") - args.add_argument( - '--stop-token-ids', - type=int, - default=None, - help="List of tokens that stop the generation when they are " - "generated. The returned output will contain the stop tokens unless " - "the stop tokens are special tokens.") - args.add_argument( - '--include-stop-str-in-output', - default=False, - action="store_true", - help="Whether to include the stop strings in output text. Defaults to False.") - args.add_argument( - '--ignore-eos', - default=False, - action="store_true", - help="Whether to ignore the EOS token and continue generating tokens after the EOS token is generated.") - args.add_argument( - '--max-tokens', - type=int, - default=16, - help="Maximum number of tokens to generate per output sequence.") - args.add_argument( - '--logprobs', - type=int, - default=None, - help="NNumber of log probabilities to return per output token. " - "Note that the implementation follows the OpenAI API: The return " - "result includes the log probabilities on the `logprobs` most likely " - "tokens, as well the chosen tokens. The API will always return the " - "log probability of the sampled token, so there may be up to " - "`logprobs+1` elements in the response.") - args.add_argument( - '--prompt-logprobs', - type=int, - default=None, - help="Number of log probabilities to return per prompt token.") - args.add_argument( - '--skip-special-tokens', - default=True, - action="store_false", - help="Whether to skip special tokens in the output.") - args.add_argument( - '--spaces-between-special-tokens', - default=True, - action="store_false", - help="Whether to add spaces between special tokens in the output. Defaults to True.") - # early_stopping logits_processors seed - return args - - -def load_chat_template(tokenizer, chat_template): - if chat_template is not None: - try: - with open(chat_template, "r") as f: - tokenizer.chat_template = f.read() - except OSError: - # If opening a file fails, set chat template to be args to - # ensure we decode so our escape are interpreted correctly - tokenizer.chat_template = codecs.decode( - chat_template, "unicode_escape") - - logging.info( - f"Using supplied chat template:\n{tokenizer.chat_template}" - ) - elif tokenizer.chat_template is not None: - logging.info( - f"Using default chat template:\n{tokenizer.chat_template}. This May lead to unsatisfactory results. You can provide a template.jinja file for vllm." - ) - else: - logging.warning( - "No chat template provided. Chat API will not work. This May lead to unsatisfactory results. 
You can provide a template.jinja file for vllm.") diff --git a/models/nlp/large_language_model/qwen1.5-72b/vllm/offline_inference.py b/models/nlp/large_language_model/qwen1.5-72b/vllm/offline_inference.py index 75eb8181..9b7d87fd 100644 --- a/models/nlp/large_language_model/qwen1.5-72b/vllm/offline_inference.py +++ b/models/nlp/large_language_model/qwen1.5-72b/vllm/offline_inference.py @@ -1,114 +1,115 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. - import sys from pathlib import Path -sys.path.append(str(Path(__file__).resolve().parent.parent.parent)) -from utils import load_chat_template,sampling_add_cli_args +import os -import logging -import time +sys.path.append(str(Path(__file__).resolve().parent.parent.parent)) import argparse import dataclasses import inspect +import logging +import time import torch -from vllm import LLM, SamplingParams, EngineArgs - - -parser = argparse.ArgumentParser() -parser.add_argument("--chat_template",type=str,default=None) -parser.add_argument("--remove_chat_template",default=False,action="store_true",help="pass this if you are not use a chat model") -parser = EngineArgs.add_cli_args(parser) -parser = sampling_add_cli_args(parser) -args = parser.parse_args() +from utils import load_chat_template, sampling_add_cli_args +from vllm import LLM, EngineArgs, SamplingParams -engine_args = [attr.name for attr in dataclasses.fields(EngineArgs)] -sampling_args = [param.name for param in list(inspect.signature(SamplingParams).parameters.values())[1:]] -engine_params = {attr:getattr(args, attr) for attr in engine_args} -sampling_params = {attr:getattr(args, attr) for attr in sampling_args if args.__contains__(attr)} +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--chat_template", type=str, default=None) + parser.add_argument( + "--remove_chat_template", + default=False, + action="store_true", + help="pass this if you are not use a chat model", + ) + parser = EngineArgs.add_cli_args(parser) + parser = sampling_add_cli_args(parser) + args = parser.parse_args() -model_name = args.model.strip() -model_name = model_name if args.model[-1]!='/' else model_name[:-1] -model_name = model_name.rsplit('/')[-1] + engine_args = [attr.name for attr in dataclasses.fields(EngineArgs)] + sampling_args = [ + param.name + for param in list( + inspect.signature(SamplingParams).parameters.values() + ) + ] + engine_params = {attr: getattr(args, attr) for attr in engine_args} + sampling_params = { + attr: getattr(args, attr) for attr in sampling_args if args.__contains__(attr) + } + model_name = os.path.dirname(args.model).rsplit("/")[-1] -# Sample prompts. -prompts = [ - "哪些迹象可能表明一个人正在经历焦虑?", - "描述一下如何制作芝士披萨。", - "写一篇有关5G网络研发的综述文章。" - ] + # Sample prompts. + prompts = ["哪些迹象可能表明一个人正在经历焦虑?", "描述一下如何制作芝士披萨。", "写一篇有关5G网络研发的综述文章。"] -# Create a sampling params object. -sampling_params = SamplingParams(**sampling_params) + # Create a sampling params object. 
+ sampling_params = SamplingParams(**sampling_params) -# Create an LLM. -llm = LLM(**engine_params) + # Create an LLM. + llm = LLM(**engine_params) -# process chat template -if args.remove_chat_template: - if 'chat' in model_name.lower(): - logging.warning(f"The model name from model path is {model_name}, so we guess you are using the chat model and the additional processing is required for the input prompt. " - f"If the result is not quite correct, please ensure you do not pass --remove_chat_template in CLI.") - prompts_new = prompts -else: - # Build chat model promopt - logging.warning("If you are using a non chat model, please pass the --remove_chat_template in CLI.") - # Try use transformers's apply_chat_template, if chat_template is None, will use defalut template. - # For some old models, the default template may cause bad answers. we don't consider this situation, - # because the Transformers team is advancing the chat template. For more informatino about it, - # please refer to https://huggingface.co/docs/transformers/main/chat_templating - try: - load_chat_template(llm.get_tokenizer(),args.chat_template) - prompts_new = [] - for prompt in prompts: - messages = [ - {"role": "user", "content": prompt} - ] - text = llm.get_tokenizer().apply_chat_template( - messages, - tokenize=False, - add_generation_prompt=True + # process chat template + if args.remove_chat_template: + if "chat" in model_name.lower(): + logging.warning( + f"The model name from model path is {model_name}, so we guess you are using the chat model and the additional processing is required for the input prompt. " + f"If the result is not quite correct, please ensure you do not pass --remove_chat_template in CLI." ) - prompts_new.append(text) - except: - logging.warning("use tokenizer apply_chat_template function failed, may because of low transformers version...(try use transformers>=4.34.0)") prompts_new = prompts + else: + # Build chat model promopt + logging.warning( + "If you are using a non chat model, please pass the --remove_chat_template in CLI." + ) + # Try use transformers's apply_chat_template, if chat_template is None, will use defalut template. + # For some old models, the default template may cause bad answers. we don't consider this situation, + # because the Transformers team is advancing the chat template. For more informatino about it, + # please refer to https://huggingface.co/docs/transformers/main/chat_templating + try: + load_chat_template(llm.get_tokenizer(), args.chat_template) + prompts_new = [] + for prompt in prompts: + messages = [{"role": "user", "content": prompt}] + text = llm.get_tokenizer().apply_chat_template( + messages, tokenize=False, add_generation_prompt=True + ) + prompts_new.append(text) + except: + logging.warning( + "use tokenizer apply_chat_template function failed, may because of low transformers version...(try use transformers>=4.34.0)" + ) + prompts_new = prompts + + # Generate texts from the prompts. The output is a list of RequestOutput objects + # that contain the prompt, generated text, and other information. + outputs = ( + llm.generate(prompts_new, sampling_params, use_tqdm=False) + if isinstance(prompts_new[0], str) + else llm.generate( + sampling_params=sampling_params, + prompt_token_ids=prompts_new, + use_tqdm=False, + ) + ) + torch.cuda.synchronize() -# Generate texts from the prompts. The output is a list of RequestOutput objects -# that contain the prompt, generated text, and other information. 
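For a quick smoke test without the CLI plumbing, the sampling object built above from parsed flags can also be constructed directly; the values below are only an example of a near-greedy configuration:

from vllm import SamplingParams

# Example values only; the script above fills these from the command line instead.
sampling_params = SamplingParams(
    temperature=0.0,  # 0 means greedy decoding, per the --temperature help text
    top_p=1.0,
    top_k=-1,         # -1 considers all tokens
    max_tokens=256,
)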
-outputs = llm.generate(prompts_new, sampling_params,use_tqdm=False) if isinstance(prompts_new[0],str) else llm.generate(sampling_params=sampling_params,prompt_token_ids=prompts_new,use_tqdm=False) -torch.cuda.synchronize() + start_time = time.perf_counter() + outputs = ( + llm.generate(prompts_new, sampling_params) + if isinstance(prompts_new[0], str) + else llm.generate(sampling_params=sampling_params, prompt_token_ids=prompts_new) + ) + torch.cuda.synchronize() + end_time = time.perf_counter() + duration_time = end_time - start_time -start_time = time.perf_counter() -outputs = llm.generate(prompts_new, sampling_params) if isinstance(prompts_new[0],str) else llm.generate(sampling_params=sampling_params,prompt_token_ids=prompts_new) -torch.cuda.synchronize() -end_time = time.perf_counter() -duration_time = end_time - start_time + num_tokens = 0 + # Print the outputs. + for i, output in enumerate(outputs): + prompt = prompts[i] # show the origin prompt. actully prompt is "output.prompt" + generated_text = output.outputs[0].text -num_tokens = 0 -# Print the outputs. -for i, output in enumerate(outputs): - prompt = prompts[i] # show the origin prompt. actully prompt is "output.prompt" - generated_text = output.outputs[0].text - - num_tokens += len(output.outputs[0].token_ids) - print(f"Prompt: {prompt}\nGenerated text: {generated_text} \n") -print(f"tokens: {num_tokens}, QPS: {num_tokens/duration_time}") -metricResult = {"metricResult": {}} -metricResult["metricResult"]["tokens"] = num_tokens -metricResult["metricResult"]["QPS"] = round(num_tokens/duration_time,3) -print(metricResult) \ No newline at end of file + num_tokens += len(output.outputs[0].token_ids) + print(f"Prompt: {prompt}\nGenerated text: {generated_text} \n") + print(f"tokens: {num_tokens}, QPS: {num_tokens/duration_time}") diff --git a/models/nlp/large_language_model/qwen1.5-72b/vllm/utils.py b/models/nlp/large_language_model/qwen1.5-72b/vllm/utils.py deleted file mode 100644 index c6def85d..00000000 --- a/models/nlp/large_language_model/qwen1.5-72b/vllm/utils.py +++ /dev/null @@ -1,173 +0,0 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. - -import codecs -import logging -import argparse - - -def sampling_add_cli_args(args: argparse.ArgumentParser) -> argparse.ArgumentParser: - args.add_argument( - '--n', - type=int, - default=1, - help="Number of output sequences to return for the given prompt.") - args.add_argument( - '--best-of', - type=int, - default=None, - help="Number of output sequences that are generated from the prompt. " - "From these `best_of` sequences, the top `n` sequences are returned. " - "`best_of` must be greater than or equal to `n`. This is treated as " - "the beam width when `use_beam_search` is True. 
By default, `best_of`" - "is set to `n`.") - args.add_argument( - '--presence-penalty', - type=float, - default=0.0, - help="Float that penalizes new tokens based on whether they " - "appear in the generated text so far. Values > 0 encourage the model " - "to use new tokens, while values < 0 encourage the model to repeat " - "tokens.") - args.add_argument( - '--frequency-penalty', - type=float, - default=0.0, - help="Float that penalizes new tokens based on their " - " frequency in the generated text so far. Values > 0 encourage the " - " model to use new tokens, while values < 0 encourage the model to " - "repeat tokens.") - args.add_argument( - '--repetition-penalty', - type=float, - default=1.0, - help="Float that penalizes new tokens based on whether " - "they appear in the prompt and the generated text so far. Values > 1 " - "encourage the model to use new tokens, while values < 1 encourage " - "the model to repeat tokens.") - args.add_argument( - '--temperature', - type=float, - default=1.0, - help="Float that controls the randomness of the sampling. Lower " - "values make the model more deterministic, while higher values make " - "the model more random. Zero means greedy sampling.") - args.add_argument( - '--top-p', - type=float, - default=1.0, - help="Float that controls the cumulative probability of the top tokens " - "to consider. Must be in (0, 1]. Set to 1 to consider all tokens.") - args.add_argument( - '--top-k', - type=int, - default=-1, - help="Integer that controls the number of top tokens to consider. Set " - "to -1 to consider all tokens.") - args.add_argument( - '--min-p', - type=float, - default=0.0, - help="Float that represents the minimum probability for a token to be " - "considered, relative to the probability of the most likely token. " - "Must be in [0, 1]. Set to 0 to disable this.") - args.add_argument( - '--use-beam-search', - default=False, - action="store_true", - help="Whether to use beam search instead of sampling.") - args.add_argument( - '--length-penalty', - type=float, - default=1.0, - help="Float that penalizes sequences based on their length. Used in beam search.") - args.add_argument( - '--stop', - type=str, - default=None, - help="List of strings that stop the generation when they are generated. " - "The returned output will not contain the stop strings.") - args.add_argument( - '--stop-token-ids', - type=int, - default=None, - help="List of tokens that stop the generation when they are " - "generated. The returned output will contain the stop tokens unless " - "the stop tokens are special tokens.") - args.add_argument( - '--include-stop-str-in-output', - default=False, - action="store_true", - help="Whether to include the stop strings in output text. Defaults to False.") - args.add_argument( - '--ignore-eos', - default=False, - action="store_true", - help="Whether to ignore the EOS token and continue generating tokens after the EOS token is generated.") - args.add_argument( - '--max-tokens', - type=int, - default=16, - help="Maximum number of tokens to generate per output sequence.") - args.add_argument( - '--logprobs', - type=int, - default=None, - help="NNumber of log probabilities to return per output token. " - "Note that the implementation follows the OpenAI API: The return " - "result includes the log probabilities on the `logprobs` most likely " - "tokens, as well the chosen tokens. 
The API will always return the " - "log probability of the sampled token, so there may be up to " - "`logprobs+1` elements in the response.") - args.add_argument( - '--prompt-logprobs', - type=int, - default=None, - help="Number of log probabilities to return per prompt token.") - args.add_argument( - '--skip-special-tokens', - default=True, - action="store_false", - help="Whether to skip special tokens in the output.") - args.add_argument( - '--spaces-between-special-tokens', - default=True, - action="store_false", - help="Whether to add spaces between special tokens in the output. Defaults to True.") - # early_stopping logits_processors seed - return args - - -def load_chat_template(tokenizer, chat_template): - if chat_template is not None: - try: - with open(chat_template, "r") as f: - tokenizer.chat_template = f.read() - except OSError: - # If opening a file fails, set chat template to be args to - # ensure we decode so our escape are interpreted correctly - tokenizer.chat_template = codecs.decode( - chat_template, "unicode_escape") - - logging.info( - f"Using supplied chat template:\n{tokenizer.chat_template}" - ) - elif tokenizer.chat_template is not None: - logging.info( - f"Using default chat template:\n{tokenizer.chat_template}. This May lead to unsatisfactory results. You can provide a template.jinja file for vllm." - ) - else: - logging.warning( - "No chat template provided. Chat API will not work. This May lead to unsatisfactory results. You can provide a template.jinja file for vllm.") diff --git a/models/nlp/large_language_model/qwen1.5-7b/text-generation-inference/offline_inference.py b/models/nlp/large_language_model/qwen1.5-7b/text-generation-inference/offline_inference.py index 87f4df98..d16450cd 100644 --- a/models/nlp/large_language_model/qwen1.5-7b/text-generation-inference/offline_inference.py +++ b/models/nlp/large_language_model/qwen1.5-7b/text-generation-inference/offline_inference.py @@ -1,38 +1,25 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. 
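The load_chat_template helper deleted from the per-model utils.py above (and kept once in the shared utils.py) accepts either a path to a Jinja template file or an escaped template string, falling back to unicode_escape decoding when the path cannot be opened. A usage sketch with an illustrative template path:

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("/path/to/model")  # illustrative path

# Typical call: point at a template file shipped next to the script.
load_chat_template(tokenizer, "template_baichuan.jinja")

# Equivalent call with an inline, escaped template string instead of a file.
load_chat_template(tokenizer, "{{ messages[0]['content'] }}\\n")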
- -from text_generation_server.models.flash_qwen2 import ( - FlashQwen2, -) +import argparse +import time + import torch +from text_generation_server.models.flash_qwen2 import FlashQwen2 from text_generation_server.pb import generate_pb2 - -import time -from torch.cuda import profiler from text_generation_server.utils.speculate import set_speculate -import argparse +from torch.cuda import profiler + def parse_args(args=None): parser = argparse.ArgumentParser() - parser.add_argument('--generate_length', type=int, default=512) - parser.add_argument('--model2path', type=str, default="/home/data/nlp/qwen2/Qwen1.5-0.5B") - parser.add_argument('--quantize', type=str, default=None, choices=['awq']) - parser.add_argument('--speculate', type=int, default=0) + parser.add_argument("--generate_length", type=int, default=512) + parser.add_argument( + "--model2path", type=str, default="/home/data/nlp/qwen2/Qwen1.5-0.5B" + ) + parser.add_argument("--quantize", type=str, default=None, choices=["awq"]) + parser.add_argument("--speculate", type=int, default=0) return parser.parse_args(args) + if __name__ == "__main__": args = parse_args() @@ -52,10 +39,12 @@ if __name__ == "__main__": typical_p=1.0, do_sample=False, ) - - default_pb_stop_parameters = generate_pb2.StoppingCriteriaParameters(stop_sequences=[], max_new_tokens=args.generate_length) - - warmup_requests = generate_pb2.Request( + + default_pb_stop_parameters = generate_pb2.StoppingCriteriaParameters( + stop_sequences=[], max_new_tokens=args.generate_length + ) + + warmup_requests = generate_pb2.Request( id=0, inputs="_test " * max_input_length, prefill_logprobs=True, @@ -75,13 +64,13 @@ if __name__ == "__main__": stop_sequences=[], ignore_eos_token=False, ), - top_n_tokens = 20 + top_n_tokens=20, ) warmup_requests_batch = generate_pb2.Batch(id=0, requests=[warmup_requests], size=1) - warmup_requests_batchs = model.batch_type.from_pb( + warmup_requests_batchs = model.batch_type.from_pb( warmup_requests_batch, model.tokenizer, model.dtype, torch.device("cuda") ) - + model.warmup(warmup_requests_batchs) pb_request = generate_pb2.Request( @@ -98,7 +87,7 @@ if __name__ == "__main__": ) next_batch_one = causal_lm_one_batch - last_generations = True + last_generations = True torch.cuda.synchronize() profiler.start() start_time = time.perf_counter() @@ -114,12 +103,9 @@ if __name__ == "__main__": end_time = time.perf_counter() duration_time = end_time - start_time print(f"generate length: {generations_one[0].generated_text.generated_tokens}") - print(f"one batch: {generations_one[0].generated_text.text}\nqps: {generations_one[0].generated_text.generated_tokens /duration_time}") - metricResult = {"metricResult": {}} - metricResult["metricResult"]["generate length"] = generations_one[0].generated_text.generated_tokens - metricResult["metricResult"]["one batch"] = generations_one[0].generated_text.text - metricResult["metricResult"]["qps"] = generations_one[0].generated_text.generated_tokens /duration_time - print(metricResult) + print( + f"one batch: {generations_one[0].generated_text.text}\nqps: {generations_one[0].generated_text.generated_tokens /duration_time}" + ) """ qwen1.5-0.5B @@ -181,4 +167,4 @@ one batch: 亚历山大(Alexandria) 印度首都是新德里(New Delhi) 法国首都是巴黎( qps: 128.489649542011 -""" \ No newline at end of file +""" diff --git a/models/nlp/large_language_model/qwen1.5-7b/vllm/offline_inference.py b/models/nlp/large_language_model/qwen1.5-7b/vllm/offline_inference.py index 16aca56d..9b7d87fd 100644 --- 
a/models/nlp/large_language_model/qwen1.5-7b/vllm/offline_inference.py +++ b/models/nlp/large_language_model/qwen1.5-7b/vllm/offline_inference.py @@ -1,114 +1,115 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. - import sys from pathlib import Path -sys.path.append(str(Path(__file__).resolve().parent.parent.parent)) -from utils import load_chat_template,sampling_add_cli_args +import os -import logging -import time +sys.path.append(str(Path(__file__).resolve().parent.parent.parent)) import argparse import dataclasses import inspect +import logging +import time import torch -from vllm import LLM, SamplingParams, EngineArgs - - -parser = argparse.ArgumentParser() -parser.add_argument("--chat_template",type=str,default=None) -parser.add_argument("--remove_chat_template",default=False,action="store_true",help="pass this if you are not use a chat model") -parser = EngineArgs.add_cli_args(parser) -parser = sampling_add_cli_args(parser) -args = parser.parse_args() +from utils import load_chat_template, sampling_add_cli_args +from vllm import LLM, EngineArgs, SamplingParams -engine_args = [attr.name for attr in dataclasses.fields(EngineArgs)] -sampling_args = [param.name for param in list(inspect.signature(SamplingParams).parameters.values())[1:]] -engine_params = {attr:getattr(args, attr) for attr in engine_args} -sampling_params = {attr:getattr(args, attr) for attr in sampling_args if args.__contains__(attr)} +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--chat_template", type=str, default=None) + parser.add_argument( + "--remove_chat_template", + default=False, + action="store_true", + help="pass this if you are not use a chat model", + ) + parser = EngineArgs.add_cli_args(parser) + parser = sampling_add_cli_args(parser) + args = parser.parse_args() -model_name = args.model.strip() -model_name = model_name if args.model[-1]!='/' else model_name[:-1] -model_name = model_name.rsplit('/')[-1] + engine_args = [attr.name for attr in dataclasses.fields(EngineArgs)] + sampling_args = [ + param.name + for param in list( + inspect.signature(SamplingParams).parameters.values() + ) + ] + engine_params = {attr: getattr(args, attr) for attr in engine_args} + sampling_params = { + attr: getattr(args, attr) for attr in sampling_args if args.__contains__(attr) + } + model_name = os.path.dirname(args.model).rsplit("/")[-1] -# Sample prompts. -prompts = [ - "哪些迹象可能表明一个人正在经历焦虑?", - "描述一下如何制作芝士披萨。", - "写一篇有关5G网络研发的综述文章。" - ] + # Sample prompts. + prompts = ["哪些迹象可能表明一个人正在经历焦虑?", "描述一下如何制作芝士披萨。", "写一篇有关5G网络研发的综述文章。"] -# Create a sampling params object. -sampling_params = SamplingParams(**sampling_params) + # Create a sampling params object. + sampling_params = SamplingParams(**sampling_params) -# Create an LLM. -llm = LLM(**engine_params) + # Create an LLM. 
+ llm = LLM(**engine_params) -# process chat template -if args.remove_chat_template: - if 'chat' in model_name.lower(): - logging.warning(f"The model name from model path is {model_name}, so we guess you are using the chat model and the additional processing is required for the input prompt. " - f"If the result is not quite correct, please ensure you do not pass --remove_chat_template in CLI.") - prompts_new = prompts -else: - # Build chat model promopt - logging.warning("If you are using a non chat model, please pass the --remove_chat_template in CLI.") - # Try use transformers's apply_chat_template, if chat_template is None, will use defalut template. - # For some old models, the default template may cause bad answers. we don't consider this situation, - # because the Transformers team is advancing the chat template. For more informatino about it, - # please refer to https://huggingface.co/docs/transformers/main/chat_templating - try: - load_chat_template(llm.get_tokenizer(),args.chat_template) - prompts_new = [] - for prompt in prompts: - messages = [ - {"role": "user", "content": prompt} - ] - text = llm.get_tokenizer().apply_chat_template( - messages, - tokenize=False, - add_generation_prompt=True + # process chat template + if args.remove_chat_template: + if "chat" in model_name.lower(): + logging.warning( + f"The model name from model path is {model_name}, so we guess you are using the chat model and the additional processing is required for the input prompt. " + f"If the result is not quite correct, please ensure you do not pass --remove_chat_template in CLI." ) - prompts_new.append(text) - except: - logging.warning("use tokenizer apply_chat_template function failed, may because of low transformers version...(try use transformers>=4.34.0)") prompts_new = prompts + else: + # Build chat model promopt + logging.warning( + "If you are using a non chat model, please pass the --remove_chat_template in CLI." + ) + # Try use transformers's apply_chat_template, if chat_template is None, will use defalut template. + # For some old models, the default template may cause bad answers. we don't consider this situation, + # because the Transformers team is advancing the chat template. For more informatino about it, + # please refer to https://huggingface.co/docs/transformers/main/chat_templating + try: + load_chat_template(llm.get_tokenizer(), args.chat_template) + prompts_new = [] + for prompt in prompts: + messages = [{"role": "user", "content": prompt}] + text = llm.get_tokenizer().apply_chat_template( + messages, tokenize=False, add_generation_prompt=True + ) + prompts_new.append(text) + except: + logging.warning( + "use tokenizer apply_chat_template function failed, may because of low transformers version...(try use transformers>=4.34.0)" + ) + prompts_new = prompts + + # Generate texts from the prompts. The output is a list of RequestOutput objects + # that contain the prompt, generated text, and other information. + outputs = ( + llm.generate(prompts_new, sampling_params, use_tqdm=False) + if isinstance(prompts_new[0], str) + else llm.generate( + sampling_params=sampling_params, + prompt_token_ids=prompts_new, + use_tqdm=False, + ) + ) + torch.cuda.synchronize() -# Generate texts from the prompts. The output is a list of RequestOutput objects -# that contain the prompt, generated text, and other information. 
-outputs = llm.generate(prompts_new, sampling_params,use_tqdm=False) if isinstance(prompts_new[0],str) else llm.generate(sampling_params=sampling_params,prompt_token_ids=prompts_new,use_tqdm=False) -torch.cuda.synchronize() + start_time = time.perf_counter() + outputs = ( + llm.generate(prompts_new, sampling_params) + if isinstance(prompts_new[0], str) + else llm.generate(sampling_params=sampling_params, prompt_token_ids=prompts_new) + ) + torch.cuda.synchronize() + end_time = time.perf_counter() + duration_time = end_time - start_time -start_time = time.perf_counter() -outputs = llm.generate(prompts_new, sampling_params) if isinstance(prompts_new[0],str) else llm.generate(sampling_params=sampling_params,prompt_token_ids=prompts_new) -torch.cuda.synchronize() -end_time = time.perf_counter() -duration_time = end_time - start_time + num_tokens = 0 + # Print the outputs. + for i, output in enumerate(outputs): + prompt = prompts[i] # show the origin prompt. actully prompt is "output.prompt" + generated_text = output.outputs[0].text -num_tokens = 0 -# Print the outputs. -for i, output in enumerate(outputs): - prompt = prompts[i] # show the origin prompt. actully prompt is "output.prompt" - generated_text = output.outputs[0].text - - num_tokens += len(output.outputs[0].token_ids) - print(f"Prompt: {prompt}\nGenerated text: {generated_text} \n") -print(f"tokens: {num_tokens}, QPS: {num_tokens/duration_time}") -metricResult = {"metricResult": {}} -metricResult["metricResult"]["tokens"] = num_tokens -metricResult["metricResult"]["QPS"] = round(num_tokens/duration_time,3) -print(metricResult) + num_tokens += len(output.outputs[0].token_ids) + print(f"Prompt: {prompt}\nGenerated text: {generated_text} \n") + print(f"tokens: {num_tokens}, QPS: {num_tokens/duration_time}") diff --git a/models/nlp/large_language_model/qwen1.5-7b/vllm/utils.py b/models/nlp/large_language_model/qwen1.5-7b/vllm/utils.py deleted file mode 100644 index c6def85d..00000000 --- a/models/nlp/large_language_model/qwen1.5-7b/vllm/utils.py +++ /dev/null @@ -1,173 +0,0 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. - -import codecs -import logging -import argparse - - -def sampling_add_cli_args(args: argparse.ArgumentParser) -> argparse.ArgumentParser: - args.add_argument( - '--n', - type=int, - default=1, - help="Number of output sequences to return for the given prompt.") - args.add_argument( - '--best-of', - type=int, - default=None, - help="Number of output sequences that are generated from the prompt. " - "From these `best_of` sequences, the top `n` sequences are returned. " - "`best_of` must be greater than or equal to `n`. This is treated as " - "the beam width when `use_beam_search` is True. By default, `best_of`" - "is set to `n`.") - args.add_argument( - '--presence-penalty', - type=float, - default=0.0, - help="Float that penalizes new tokens based on whether they " - "appear in the generated text so far. 
Values > 0 encourage the model " - "to use new tokens, while values < 0 encourage the model to repeat " - "tokens.") - args.add_argument( - '--frequency-penalty', - type=float, - default=0.0, - help="Float that penalizes new tokens based on their " - " frequency in the generated text so far. Values > 0 encourage the " - " model to use new tokens, while values < 0 encourage the model to " - "repeat tokens.") - args.add_argument( - '--repetition-penalty', - type=float, - default=1.0, - help="Float that penalizes new tokens based on whether " - "they appear in the prompt and the generated text so far. Values > 1 " - "encourage the model to use new tokens, while values < 1 encourage " - "the model to repeat tokens.") - args.add_argument( - '--temperature', - type=float, - default=1.0, - help="Float that controls the randomness of the sampling. Lower " - "values make the model more deterministic, while higher values make " - "the model more random. Zero means greedy sampling.") - args.add_argument( - '--top-p', - type=float, - default=1.0, - help="Float that controls the cumulative probability of the top tokens " - "to consider. Must be in (0, 1]. Set to 1 to consider all tokens.") - args.add_argument( - '--top-k', - type=int, - default=-1, - help="Integer that controls the number of top tokens to consider. Set " - "to -1 to consider all tokens.") - args.add_argument( - '--min-p', - type=float, - default=0.0, - help="Float that represents the minimum probability for a token to be " - "considered, relative to the probability of the most likely token. " - "Must be in [0, 1]. Set to 0 to disable this.") - args.add_argument( - '--use-beam-search', - default=False, - action="store_true", - help="Whether to use beam search instead of sampling.") - args.add_argument( - '--length-penalty', - type=float, - default=1.0, - help="Float that penalizes sequences based on their length. Used in beam search.") - args.add_argument( - '--stop', - type=str, - default=None, - help="List of strings that stop the generation when they are generated. " - "The returned output will not contain the stop strings.") - args.add_argument( - '--stop-token-ids', - type=int, - default=None, - help="List of tokens that stop the generation when they are " - "generated. The returned output will contain the stop tokens unless " - "the stop tokens are special tokens.") - args.add_argument( - '--include-stop-str-in-output', - default=False, - action="store_true", - help="Whether to include the stop strings in output text. Defaults to False.") - args.add_argument( - '--ignore-eos', - default=False, - action="store_true", - help="Whether to ignore the EOS token and continue generating tokens after the EOS token is generated.") - args.add_argument( - '--max-tokens', - type=int, - default=16, - help="Maximum number of tokens to generate per output sequence.") - args.add_argument( - '--logprobs', - type=int, - default=None, - help="NNumber of log probabilities to return per output token. " - "Note that the implementation follows the OpenAI API: The return " - "result includes the log probabilities on the `logprobs` most likely " - "tokens, as well the chosen tokens. 
The API will always return the " - "log probability of the sampled token, so there may be up to " - "`logprobs+1` elements in the response.") - args.add_argument( - '--prompt-logprobs', - type=int, - default=None, - help="Number of log probabilities to return per prompt token.") - args.add_argument( - '--skip-special-tokens', - default=True, - action="store_false", - help="Whether to skip special tokens in the output.") - args.add_argument( - '--spaces-between-special-tokens', - default=True, - action="store_false", - help="Whether to add spaces between special tokens in the output. Defaults to True.") - # early_stopping logits_processors seed - return args - - -def load_chat_template(tokenizer, chat_template): - if chat_template is not None: - try: - with open(chat_template, "r") as f: - tokenizer.chat_template = f.read() - except OSError: - # If opening a file fails, set chat template to be args to - # ensure we decode so our escape are interpreted correctly - tokenizer.chat_template = codecs.decode( - chat_template, "unicode_escape") - - logging.info( - f"Using supplied chat template:\n{tokenizer.chat_template}" - ) - elif tokenizer.chat_template is not None: - logging.info( - f"Using default chat template:\n{tokenizer.chat_template}. This May lead to unsatisfactory results. You can provide a template.jinja file for vllm." - ) - else: - logging.warning( - "No chat template provided. Chat API will not work. This May lead to unsatisfactory results. You can provide a template.jinja file for vllm.") diff --git a/models/nlp/large_language_model/qwen2-72b/vllm/offline_inference.py b/models/nlp/large_language_model/qwen2-72b/vllm/offline_inference.py index fd97059a..9b7d87fd 100644 --- a/models/nlp/large_language_model/qwen2-72b/vllm/offline_inference.py +++ b/models/nlp/large_language_model/qwen2-72b/vllm/offline_inference.py @@ -1,20 +1,6 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. - import sys from pathlib import Path +import os sys.path.append(str(Path(__file__).resolve().parent.parent.parent)) import argparse @@ -45,16 +31,14 @@ if __name__ == "__main__": param.name for param in list( inspect.signature(SamplingParams).parameters.values() - )[1:] + ) ] engine_params = {attr: getattr(args, attr) for attr in engine_args} sampling_params = { attr: getattr(args, attr) for attr in sampling_args if args.__contains__(attr) } - model_name = args.model.strip() - model_name = model_name if args.model[-1] != "/" else model_name[:-1] - model_name = model_name.rsplit("/")[-1] + model_name = os.path.dirname(args.model).rsplit("/")[-1] # Sample prompts. 
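The throughput figure printed by these offline_inference.py scripts comes from a wall-clock measurement around the second (post-warmup) generate call. A condensed sketch of that pattern, assuming llm, prompts_new, and sampling_params are set up as above; note that the value labelled "QPS" is effectively generated tokens per second:

import time
import torch

torch.cuda.synchronize()  # make sure the warmup generate has finished
start = time.perf_counter()
outputs = llm.generate(prompts_new, sampling_params)
torch.cuda.synchronize()  # wait for all GPU work before stopping the clock
elapsed = time.perf_counter() - start

num_tokens = sum(len(o.outputs[0].token_ids) for o in outputs)
print(f"tokens: {num_tokens}, QPS: {num_tokens / elapsed}")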
prompts = ["哪些迹象可能表明一个人正在经历焦虑?", "描述一下如何制作芝士披萨。", "写一篇有关5G网络研发的综述文章。"] @@ -129,7 +113,3 @@ if __name__ == "__main__": num_tokens += len(output.outputs[0].token_ids) print(f"Prompt: {prompt}\nGenerated text: {generated_text} \n") print(f"tokens: {num_tokens}, QPS: {num_tokens/duration_time}") - metricResult = {"metricResult": {}} - metricResult["metricResult"]["tokens"] = num_tokens - metricResult["metricResult"]["QPS"] = round(num_tokens/duration_time,3) - print(metricResult) \ No newline at end of file diff --git a/models/nlp/large_language_model/qwen2-72b/vllm/utils.py b/models/nlp/large_language_model/qwen2-72b/vllm/utils.py deleted file mode 100644 index c6def85d..00000000 --- a/models/nlp/large_language_model/qwen2-72b/vllm/utils.py +++ /dev/null @@ -1,173 +0,0 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. - -import codecs -import logging -import argparse - - -def sampling_add_cli_args(args: argparse.ArgumentParser) -> argparse.ArgumentParser: - args.add_argument( - '--n', - type=int, - default=1, - help="Number of output sequences to return for the given prompt.") - args.add_argument( - '--best-of', - type=int, - default=None, - help="Number of output sequences that are generated from the prompt. " - "From these `best_of` sequences, the top `n` sequences are returned. " - "`best_of` must be greater than or equal to `n`. This is treated as " - "the beam width when `use_beam_search` is True. By default, `best_of`" - "is set to `n`.") - args.add_argument( - '--presence-penalty', - type=float, - default=0.0, - help="Float that penalizes new tokens based on whether they " - "appear in the generated text so far. Values > 0 encourage the model " - "to use new tokens, while values < 0 encourage the model to repeat " - "tokens.") - args.add_argument( - '--frequency-penalty', - type=float, - default=0.0, - help="Float that penalizes new tokens based on their " - " frequency in the generated text so far. Values > 0 encourage the " - " model to use new tokens, while values < 0 encourage the model to " - "repeat tokens.") - args.add_argument( - '--repetition-penalty', - type=float, - default=1.0, - help="Float that penalizes new tokens based on whether " - "they appear in the prompt and the generated text so far. Values > 1 " - "encourage the model to use new tokens, while values < 1 encourage " - "the model to repeat tokens.") - args.add_argument( - '--temperature', - type=float, - default=1.0, - help="Float that controls the randomness of the sampling. Lower " - "values make the model more deterministic, while higher values make " - "the model more random. Zero means greedy sampling.") - args.add_argument( - '--top-p', - type=float, - default=1.0, - help="Float that controls the cumulative probability of the top tokens " - "to consider. Must be in (0, 1]. 
Set to 1 to consider all tokens.") - args.add_argument( - '--top-k', - type=int, - default=-1, - help="Integer that controls the number of top tokens to consider. Set " - "to -1 to consider all tokens.") - args.add_argument( - '--min-p', - type=float, - default=0.0, - help="Float that represents the minimum probability for a token to be " - "considered, relative to the probability of the most likely token. " - "Must be in [0, 1]. Set to 0 to disable this.") - args.add_argument( - '--use-beam-search', - default=False, - action="store_true", - help="Whether to use beam search instead of sampling.") - args.add_argument( - '--length-penalty', - type=float, - default=1.0, - help="Float that penalizes sequences based on their length. Used in beam search.") - args.add_argument( - '--stop', - type=str, - default=None, - help="List of strings that stop the generation when they are generated. " - "The returned output will not contain the stop strings.") - args.add_argument( - '--stop-token-ids', - type=int, - default=None, - help="List of tokens that stop the generation when they are " - "generated. The returned output will contain the stop tokens unless " - "the stop tokens are special tokens.") - args.add_argument( - '--include-stop-str-in-output', - default=False, - action="store_true", - help="Whether to include the stop strings in output text. Defaults to False.") - args.add_argument( - '--ignore-eos', - default=False, - action="store_true", - help="Whether to ignore the EOS token and continue generating tokens after the EOS token is generated.") - args.add_argument( - '--max-tokens', - type=int, - default=16, - help="Maximum number of tokens to generate per output sequence.") - args.add_argument( - '--logprobs', - type=int, - default=None, - help="NNumber of log probabilities to return per output token. " - "Note that the implementation follows the OpenAI API: The return " - "result includes the log probabilities on the `logprobs` most likely " - "tokens, as well the chosen tokens. The API will always return the " - "log probability of the sampled token, so there may be up to " - "`logprobs+1` elements in the response.") - args.add_argument( - '--prompt-logprobs', - type=int, - default=None, - help="Number of log probabilities to return per prompt token.") - args.add_argument( - '--skip-special-tokens', - default=True, - action="store_false", - help="Whether to skip special tokens in the output.") - args.add_argument( - '--spaces-between-special-tokens', - default=True, - action="store_false", - help="Whether to add spaces between special tokens in the output. Defaults to True.") - # early_stopping logits_processors seed - return args - - -def load_chat_template(tokenizer, chat_template): - if chat_template is not None: - try: - with open(chat_template, "r") as f: - tokenizer.chat_template = f.read() - except OSError: - # If opening a file fails, set chat template to be args to - # ensure we decode so our escape are interpreted correctly - tokenizer.chat_template = codecs.decode( - chat_template, "unicode_escape") - - logging.info( - f"Using supplied chat template:\n{tokenizer.chat_template}" - ) - elif tokenizer.chat_template is not None: - logging.info( - f"Using default chat template:\n{tokenizer.chat_template}. This May lead to unsatisfactory results. You can provide a template.jinja file for vllm." - ) - else: - logging.warning( - "No chat template provided. Chat API will not work. This May lead to unsatisfactory results. 
You can provide a template.jinja file for vllm.") diff --git a/models/nlp/large_language_model/qwen2-7b/vllm/offline_inference.py b/models/nlp/large_language_model/qwen2-7b/vllm/offline_inference.py index fd97059a..9b7d87fd 100644 --- a/models/nlp/large_language_model/qwen2-7b/vllm/offline_inference.py +++ b/models/nlp/large_language_model/qwen2-7b/vllm/offline_inference.py @@ -1,20 +1,6 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. - import sys from pathlib import Path +import os sys.path.append(str(Path(__file__).resolve().parent.parent.parent)) import argparse @@ -45,16 +31,14 @@ if __name__ == "__main__": param.name for param in list( inspect.signature(SamplingParams).parameters.values() - )[1:] + ) ] engine_params = {attr: getattr(args, attr) for attr in engine_args} sampling_params = { attr: getattr(args, attr) for attr in sampling_args if args.__contains__(attr) } - model_name = args.model.strip() - model_name = model_name if args.model[-1] != "/" else model_name[:-1] - model_name = model_name.rsplit("/")[-1] + model_name = os.path.dirname(args.model).rsplit("/")[-1] # Sample prompts. prompts = ["哪些迹象可能表明一个人正在经历焦虑?", "描述一下如何制作芝士披萨。", "写一篇有关5G网络研发的综述文章。"] @@ -129,7 +113,3 @@ if __name__ == "__main__": num_tokens += len(output.outputs[0].token_ids) print(f"Prompt: {prompt}\nGenerated text: {generated_text} \n") print(f"tokens: {num_tokens}, QPS: {num_tokens/duration_time}") - metricResult = {"metricResult": {}} - metricResult["metricResult"]["tokens"] = num_tokens - metricResult["metricResult"]["QPS"] = round(num_tokens/duration_time,3) - print(metricResult) \ No newline at end of file diff --git a/models/nlp/large_language_model/qwen2-7b/vllm/utils.py b/models/nlp/large_language_model/qwen2-7b/vllm/utils.py deleted file mode 100644 index c6def85d..00000000 --- a/models/nlp/large_language_model/qwen2-7b/vllm/utils.py +++ /dev/null @@ -1,173 +0,0 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. - -import codecs -import logging -import argparse - - -def sampling_add_cli_args(args: argparse.ArgumentParser) -> argparse.ArgumentParser: - args.add_argument( - '--n', - type=int, - default=1, - help="Number of output sequences to return for the given prompt.") - args.add_argument( - '--best-of', - type=int, - default=None, - help="Number of output sequences that are generated from the prompt. 
" - "From these `best_of` sequences, the top `n` sequences are returned. " - "`best_of` must be greater than or equal to `n`. This is treated as " - "the beam width when `use_beam_search` is True. By default, `best_of`" - "is set to `n`.") - args.add_argument( - '--presence-penalty', - type=float, - default=0.0, - help="Float that penalizes new tokens based on whether they " - "appear in the generated text so far. Values > 0 encourage the model " - "to use new tokens, while values < 0 encourage the model to repeat " - "tokens.") - args.add_argument( - '--frequency-penalty', - type=float, - default=0.0, - help="Float that penalizes new tokens based on their " - " frequency in the generated text so far. Values > 0 encourage the " - " model to use new tokens, while values < 0 encourage the model to " - "repeat tokens.") - args.add_argument( - '--repetition-penalty', - type=float, - default=1.0, - help="Float that penalizes new tokens based on whether " - "they appear in the prompt and the generated text so far. Values > 1 " - "encourage the model to use new tokens, while values < 1 encourage " - "the model to repeat tokens.") - args.add_argument( - '--temperature', - type=float, - default=1.0, - help="Float that controls the randomness of the sampling. Lower " - "values make the model more deterministic, while higher values make " - "the model more random. Zero means greedy sampling.") - args.add_argument( - '--top-p', - type=float, - default=1.0, - help="Float that controls the cumulative probability of the top tokens " - "to consider. Must be in (0, 1]. Set to 1 to consider all tokens.") - args.add_argument( - '--top-k', - type=int, - default=-1, - help="Integer that controls the number of top tokens to consider. Set " - "to -1 to consider all tokens.") - args.add_argument( - '--min-p', - type=float, - default=0.0, - help="Float that represents the minimum probability for a token to be " - "considered, relative to the probability of the most likely token. " - "Must be in [0, 1]. Set to 0 to disable this.") - args.add_argument( - '--use-beam-search', - default=False, - action="store_true", - help="Whether to use beam search instead of sampling.") - args.add_argument( - '--length-penalty', - type=float, - default=1.0, - help="Float that penalizes sequences based on their length. Used in beam search.") - args.add_argument( - '--stop', - type=str, - default=None, - help="List of strings that stop the generation when they are generated. " - "The returned output will not contain the stop strings.") - args.add_argument( - '--stop-token-ids', - type=int, - default=None, - help="List of tokens that stop the generation when they are " - "generated. The returned output will contain the stop tokens unless " - "the stop tokens are special tokens.") - args.add_argument( - '--include-stop-str-in-output', - default=False, - action="store_true", - help="Whether to include the stop strings in output text. Defaults to False.") - args.add_argument( - '--ignore-eos', - default=False, - action="store_true", - help="Whether to ignore the EOS token and continue generating tokens after the EOS token is generated.") - args.add_argument( - '--max-tokens', - type=int, - default=16, - help="Maximum number of tokens to generate per output sequence.") - args.add_argument( - '--logprobs', - type=int, - default=None, - help="NNumber of log probabilities to return per output token. 
" - "Note that the implementation follows the OpenAI API: The return " - "result includes the log probabilities on the `logprobs` most likely " - "tokens, as well the chosen tokens. The API will always return the " - "log probability of the sampled token, so there may be up to " - "`logprobs+1` elements in the response.") - args.add_argument( - '--prompt-logprobs', - type=int, - default=None, - help="Number of log probabilities to return per prompt token.") - args.add_argument( - '--skip-special-tokens', - default=True, - action="store_false", - help="Whether to skip special tokens in the output.") - args.add_argument( - '--spaces-between-special-tokens', - default=True, - action="store_false", - help="Whether to add spaces between special tokens in the output. Defaults to True.") - # early_stopping logits_processors seed - return args - - -def load_chat_template(tokenizer, chat_template): - if chat_template is not None: - try: - with open(chat_template, "r") as f: - tokenizer.chat_template = f.read() - except OSError: - # If opening a file fails, set chat template to be args to - # ensure we decode so our escape are interpreted correctly - tokenizer.chat_template = codecs.decode( - chat_template, "unicode_escape") - - logging.info( - f"Using supplied chat template:\n{tokenizer.chat_template}" - ) - elif tokenizer.chat_template is not None: - logging.info( - f"Using default chat template:\n{tokenizer.chat_template}. This May lead to unsatisfactory results. You can provide a template.jinja file for vllm." - ) - else: - logging.warning( - "No chat template provided. Chat API will not work. This May lead to unsatisfactory results. You can provide a template.jinja file for vllm.") diff --git a/models/nlp/large_language_model/stablelm/vllm/offline_inference.py b/models/nlp/large_language_model/stablelm/vllm/offline_inference.py index b8d69671..946e379b 100644 --- a/models/nlp/large_language_model/stablelm/vllm/offline_inference.py +++ b/models/nlp/large_language_model/stablelm/vllm/offline_inference.py @@ -1,20 +1,6 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. - import sys from pathlib import Path +import os sys.path.append(str(Path(__file__).resolve().parent.parent.parent)) import argparse @@ -45,16 +31,14 @@ if __name__ == "__main__": param.name for param in list( inspect.signature(SamplingParams).parameters.values() - )[1:] + ) ] engine_params = {attr: getattr(args, attr) for attr in engine_args} sampling_params = { attr: getattr(args, attr) for attr in sampling_args if args.__contains__(attr) } - model_name = args.model.strip() - model_name = model_name if args.model[-1] != "/" else model_name[:-1] - model_name = model_name.rsplit("/")[-1] + model_name = os.path.dirname(args.model).rsplit("/")[-1] # Sample prompts. 
prompts = [ @@ -133,7 +117,3 @@ if __name__ == "__main__": num_tokens += len(output.outputs[0].token_ids) print(f"Prompt: {prompt}\nGenerated text: {generated_text} \n") print(f"tokens: {num_tokens}, QPS: {num_tokens/duration_time}") - metricResult = {"metricResult": {}} - metricResult["metricResult"]["tokens"] = num_tokens - metricResult["metricResult"]["QPS"] = round(num_tokens/duration_time,3) - print(metricResult) \ No newline at end of file diff --git a/models/nlp/large_language_model/stablelm/vllm/utils.py b/models/nlp/large_language_model/stablelm/vllm/utils.py deleted file mode 100644 index c6def85d..00000000 --- a/models/nlp/large_language_model/stablelm/vllm/utils.py +++ /dev/null @@ -1,173 +0,0 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. - -import codecs -import logging -import argparse - - -def sampling_add_cli_args(args: argparse.ArgumentParser) -> argparse.ArgumentParser: - args.add_argument( - '--n', - type=int, - default=1, - help="Number of output sequences to return for the given prompt.") - args.add_argument( - '--best-of', - type=int, - default=None, - help="Number of output sequences that are generated from the prompt. " - "From these `best_of` sequences, the top `n` sequences are returned. " - "`best_of` must be greater than or equal to `n`. This is treated as " - "the beam width when `use_beam_search` is True. By default, `best_of`" - "is set to `n`.") - args.add_argument( - '--presence-penalty', - type=float, - default=0.0, - help="Float that penalizes new tokens based on whether they " - "appear in the generated text so far. Values > 0 encourage the model " - "to use new tokens, while values < 0 encourage the model to repeat " - "tokens.") - args.add_argument( - '--frequency-penalty', - type=float, - default=0.0, - help="Float that penalizes new tokens based on their " - " frequency in the generated text so far. Values > 0 encourage the " - " model to use new tokens, while values < 0 encourage the model to " - "repeat tokens.") - args.add_argument( - '--repetition-penalty', - type=float, - default=1.0, - help="Float that penalizes new tokens based on whether " - "they appear in the prompt and the generated text so far. Values > 1 " - "encourage the model to use new tokens, while values < 1 encourage " - "the model to repeat tokens.") - args.add_argument( - '--temperature', - type=float, - default=1.0, - help="Float that controls the randomness of the sampling. Lower " - "values make the model more deterministic, while higher values make " - "the model more random. Zero means greedy sampling.") - args.add_argument( - '--top-p', - type=float, - default=1.0, - help="Float that controls the cumulative probability of the top tokens " - "to consider. Must be in (0, 1]. Set to 1 to consider all tokens.") - args.add_argument( - '--top-k', - type=int, - default=-1, - help="Integer that controls the number of top tokens to consider. 
Set " - "to -1 to consider all tokens.") - args.add_argument( - '--min-p', - type=float, - default=0.0, - help="Float that represents the minimum probability for a token to be " - "considered, relative to the probability of the most likely token. " - "Must be in [0, 1]. Set to 0 to disable this.") - args.add_argument( - '--use-beam-search', - default=False, - action="store_true", - help="Whether to use beam search instead of sampling.") - args.add_argument( - '--length-penalty', - type=float, - default=1.0, - help="Float that penalizes sequences based on their length. Used in beam search.") - args.add_argument( - '--stop', - type=str, - default=None, - help="List of strings that stop the generation when they are generated. " - "The returned output will not contain the stop strings.") - args.add_argument( - '--stop-token-ids', - type=int, - default=None, - help="List of tokens that stop the generation when they are " - "generated. The returned output will contain the stop tokens unless " - "the stop tokens are special tokens.") - args.add_argument( - '--include-stop-str-in-output', - default=False, - action="store_true", - help="Whether to include the stop strings in output text. Defaults to False.") - args.add_argument( - '--ignore-eos', - default=False, - action="store_true", - help="Whether to ignore the EOS token and continue generating tokens after the EOS token is generated.") - args.add_argument( - '--max-tokens', - type=int, - default=16, - help="Maximum number of tokens to generate per output sequence.") - args.add_argument( - '--logprobs', - type=int, - default=None, - help="NNumber of log probabilities to return per output token. " - "Note that the implementation follows the OpenAI API: The return " - "result includes the log probabilities on the `logprobs` most likely " - "tokens, as well the chosen tokens. The API will always return the " - "log probability of the sampled token, so there may be up to " - "`logprobs+1` elements in the response.") - args.add_argument( - '--prompt-logprobs', - type=int, - default=None, - help="Number of log probabilities to return per prompt token.") - args.add_argument( - '--skip-special-tokens', - default=True, - action="store_false", - help="Whether to skip special tokens in the output.") - args.add_argument( - '--spaces-between-special-tokens', - default=True, - action="store_false", - help="Whether to add spaces between special tokens in the output. Defaults to True.") - # early_stopping logits_processors seed - return args - - -def load_chat_template(tokenizer, chat_template): - if chat_template is not None: - try: - with open(chat_template, "r") as f: - tokenizer.chat_template = f.read() - except OSError: - # If opening a file fails, set chat template to be args to - # ensure we decode so our escape are interpreted correctly - tokenizer.chat_template = codecs.decode( - chat_template, "unicode_escape") - - logging.info( - f"Using supplied chat template:\n{tokenizer.chat_template}" - ) - elif tokenizer.chat_template is not None: - logging.info( - f"Using default chat template:\n{tokenizer.chat_template}. This May lead to unsatisfactory results. You can provide a template.jinja file for vllm." - ) - else: - logging.warning( - "No chat template provided. Chat API will not work. This May lead to unsatisfactory results. 
You can provide a template.jinja file for vllm.") diff --git a/models/nlp/large_language_model/baichuan2-7b/vllm/utils.py b/models/nlp/large_language_model/utils.py similarity index 61% rename from models/nlp/large_language_model/baichuan2-7b/vllm/utils.py rename to models/nlp/large_language_model/utils.py index c6def85d..1708c9f2 100644 --- a/models/nlp/large_language_model/baichuan2-7b/vllm/utils.py +++ b/models/nlp/large_language_model/utils.py @@ -1,128 +1,158 @@ -# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. -# All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may -# not use this file except in compliance with the License. You may obtain -# a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations -# under the License. - +import argparse import codecs import logging -import argparse + +""" +The following arguments can not be add in args... +early_stopping: Union[bool, str] = False, +early_stopping: Controls the stopping condition for beam search. It + accepts the following values: `True`, where the generation stops as + soon as there are `best_of` complete candidates; `False`, where an + heuristic is applied and the generation stops when is it very + unlikely to find better candidates; `"never"`, where the beam search + procedure only stops when there cannot be better candidates + (canonical beam search algorithm). +stop: Optional[Union[str, List[str]]] = None, +stop_token_ids: Optional[List[int]] = None, +logits_processors: Optional[List[LogitsProcessor]] = None, +logits_processors: List of functions that modify logits based on + previously generated tokens, and optionally prompt tokens as + a first argument. +truncate_prompt_tokens: Optional[Annotated[int, Field(ge=1)]] = None, +truncate_prompt_tokens: If set to an integer k, will use only the last k + tokens from the prompt (i.e., left truncation). Defaults to None + (i.e., no truncation). + """ def sampling_add_cli_args(args: argparse.ArgumentParser) -> argparse.ArgumentParser: args.add_argument( - '--n', + "--n", type=int, default=1, - help="Number of output sequences to return for the given prompt.") + help="Number of output sequences to return for the given prompt.", + ) args.add_argument( - '--best-of', + "--best-of", type=int, default=None, help="Number of output sequences that are generated from the prompt. " "From these `best_of` sequences, the top `n` sequences are returned. " "`best_of` must be greater than or equal to `n`. This is treated as " "the beam width when `use_beam_search` is True. By default, `best_of`" - "is set to `n`.") + "is set to `n`.", + ) args.add_argument( - '--presence-penalty', + "--presence-penalty", type=float, default=0.0, help="Float that penalizes new tokens based on whether they " "appear in the generated text so far. Values > 0 encourage the model " "to use new tokens, while values < 0 encourage the model to repeat " - "tokens.") + "tokens.", + ) args.add_argument( - '--frequency-penalty', + "--frequency-penalty", type=float, default=0.0, help="Float that penalizes new tokens based on their " " frequency in the generated text so far. 
Values > 0 encourage the " " model to use new tokens, while values < 0 encourage the model to " - "repeat tokens.") + "repeat tokens.", + ) args.add_argument( - '--repetition-penalty', + "--repetition-penalty", type=float, default=1.0, help="Float that penalizes new tokens based on whether " "they appear in the prompt and the generated text so far. Values > 1 " "encourage the model to use new tokens, while values < 1 encourage " - "the model to repeat tokens.") + "the model to repeat tokens.", + ) args.add_argument( - '--temperature', + "--temperature", type=float, default=1.0, help="Float that controls the randomness of the sampling. Lower " "values make the model more deterministic, while higher values make " - "the model more random. Zero means greedy sampling.") + "the model more random. Zero means greedy sampling.", + ) args.add_argument( - '--top-p', + "--top-p", type=float, default=1.0, help="Float that controls the cumulative probability of the top tokens " - "to consider. Must be in (0, 1]. Set to 1 to consider all tokens.") + "to consider. Must be in (0, 1]. Set to 1 to consider all tokens.", + ) args.add_argument( - '--top-k', + "--top-k", type=int, default=-1, help="Integer that controls the number of top tokens to consider. Set " - "to -1 to consider all tokens.") + "to -1 to consider all tokens.", + ) args.add_argument( - '--min-p', + "--min-p", type=float, default=0.0, help="Float that represents the minimum probability for a token to be " "considered, relative to the probability of the most likely token. " - "Must be in [0, 1]. Set to 0 to disable this.") + "Must be in [0, 1]. Set to 0 to disable this.", + ) args.add_argument( - '--use-beam-search', + "--use-beam-search", default=False, action="store_true", - help="Whether to use beam search instead of sampling.") + help="Whether to use beam search instead of sampling.", + ) args.add_argument( - '--length-penalty', + "--length-penalty", type=float, default=1.0, - help="Float that penalizes sequences based on their length. Used in beam search.") + help="Float that penalizes sequences based on their length. Used in beam search.", + ) args.add_argument( - '--stop', + "--stop", type=str, default=None, help="List of strings that stop the generation when they are generated. " - "The returned output will not contain the stop strings.") + "The returned output will not contain the stop strings.", + ) args.add_argument( - '--stop-token-ids', + "--stop-token-ids", type=int, default=None, help="List of tokens that stop the generation when they are " "generated. The returned output will contain the stop tokens unless " - "the stop tokens are special tokens.") + "the stop tokens are special tokens.", + ) args.add_argument( - '--include-stop-str-in-output', + "--include-stop-str-in-output", default=False, action="store_true", - help="Whether to include the stop strings in output text. Defaults to False.") + help="Whether to include the stop strings in output text. 
Defaults to False.", + ) args.add_argument( - '--ignore-eos', + "--ignore-eos", default=False, action="store_true", - help="Whether to ignore the EOS token and continue generating tokens after the EOS token is generated.") + help="Whether to ignore the EOS token and continue generating tokens after the EOS token is generated.", + ) args.add_argument( - '--max-tokens', + "--max-tokens", type=int, default=16, - help="Maximum number of tokens to generate per output sequence.") + help="Maximum number of tokens to generate per output sequence.", + ) + args.add_argument( + "--min-tokens", + type=int, + default=0, + help="Minimum number of tokens to generate per output sequence " + "before EOS or stop_token_ids can be generated", + ) args.add_argument( - '--logprobs', + "--logprobs", type=int, default=None, help="NNumber of log probabilities to return per output token. " @@ -130,23 +160,32 @@ def sampling_add_cli_args(args: argparse.ArgumentParser) -> argparse.ArgumentPar "result includes the log probabilities on the `logprobs` most likely " "tokens, as well the chosen tokens. The API will always return the " "log probability of the sampled token, so there may be up to " - "`logprobs+1` elements in the response.") + "`logprobs+1` elements in the response.", + ) args.add_argument( - '--prompt-logprobs', + "--prompt-logprobs", type=int, default=None, - help="Number of log probabilities to return per prompt token.") + help="Number of log probabilities to return per prompt token.", + ) args.add_argument( - '--skip-special-tokens', + "--detokenize", + type=bool, + default=True, + help="Whether to detokenize the output. Defaults to True.", + ) + args.add_argument( + "--skip-special-tokens", default=True, action="store_false", - help="Whether to skip special tokens in the output.") + help="Whether to skip special tokens in the output.", + ) args.add_argument( - '--spaces-between-special-tokens', + "--spaces-between-special-tokens", default=True, action="store_false", - help="Whether to add spaces between special tokens in the output. Defaults to True.") - # early_stopping logits_processors seed + help="Whether to add spaces between special tokens in the output. Defaults to True.", + ) return args @@ -158,16 +197,14 @@ def load_chat_template(tokenizer, chat_template): except OSError: # If opening a file fails, set chat template to be args to # ensure we decode so our escape are interpreted correctly - tokenizer.chat_template = codecs.decode( - chat_template, "unicode_escape") + tokenizer.chat_template = codecs.decode(chat_template, "unicode_escape") - logging.info( - f"Using supplied chat template:\n{tokenizer.chat_template}" - ) + logging.info(f"Using supplied chat template:\n{tokenizer.chat_template}") elif tokenizer.chat_template is not None: logging.info( f"Using default chat template:\n{tokenizer.chat_template}. This May lead to unsatisfactory results. You can provide a template.jinja file for vllm." ) else: logging.warning( - "No chat template provided. Chat API will not work. This May lead to unsatisfactory results. You can provide a template.jinja file for vllm.") + "No chat template provided. Chat API will not work. This May lead to unsatisfactory results. You can provide a template.jinja file for vllm." 
+ ) diff --git a/tests/run_vllm.py b/tests/run_vllm.py index 8bd3504d..51bd0d3e 100644 --- a/tests/run_vllm.py +++ b/tests/run_vllm.py @@ -223,14 +223,13 @@ def run_nlp_testcase(model): r, t = run_script(script) sout = r.stdout - - pattern = METRIC_PATTERN - matchs = re.findall(pattern, sout) + pattern = r"tokens: (\d+), QPS: ([\d.]+)" + matchs = re.search(pattern, sout) result["result"].setdefault(prec, {"status": "FAIL"}) logging.debug(f"matchs:\n{matchs}") - for m in matchs: - result["result"][prec].update(get_metric_result(m)) - if len(matchs) == 1: + if matchs: + result["result"][prec]["tokens"] = int(matchs.group(1)) + result["result"][prec]["QPS"] = float(matchs.group(2)) result["result"][prec]["status"] = "PASS" result["result"][prec]["Cost time (s)"] = t -- Gitee
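
For context on the metric-collection change in tests/run_vllm.py above: a minimal, self-contained sketch of how the new regex parsing behaves, assuming a captured stdout shaped like the "tokens: ..., QPS: ..." line the offline_inference.py scripts print. The sample text, variable names, and result dict below are illustrative only, not part of the patch; the pattern and the int/float conversions mirror the patched run_nlp_testcase().

    import re

    # Illustrative stdout; the real scripts end with a line printed as
    # f"tokens: {num_tokens}, QPS: {num_tokens/duration_time}".
    sample_stdout = (
        "Prompt: 描述一下如何制作芝士披萨。\n"
        "Generated text: ... \n"
        "tokens: 1536, QPS: 127.84215\n"
    )

    # Same pattern the updated run_nlp_testcase() uses: an integer group for
    # the token count and a float group for the throughput.
    pattern = r"tokens: (\d+), QPS: ([\d.]+)"
    match = re.search(pattern, sample_stdout)

    result = {"status": "FAIL"}
    if match:
        # Mirrors the test harness: store the parsed metrics, then mark PASS.
        result["tokens"] = int(match.group(1))
        result["QPS"] = float(match.group(2))
        result["status"] = "PASS"

    print(result)  # {'status': 'PASS', 'tokens': 1536, 'QPS': 127.84215}

Because re.search only needs a single occurrence of that line, the PASS condition no longer depends on the old len(matchs) == 1 check against METRIC_PATTERN, which is why the metricResult prints could be dropped from the per-model offline_inference.py scripts.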