From f778ca94d887526db4ffe130182edff71c7c6048 Mon Sep 17 00:00:00 2001 From: majorli Date: Tue, 18 Jun 2024 13:51:01 +0800 Subject: [PATCH] bugfix: sync code and update readme.md link #IA5YBX #IA5XI2 Signed-off-by: majorli --- .../baichuan2-7b/vllm/README.md | 18 +- .../baichuan2-7b/vllm/offline_inference.py | 21 +- .../baichuan2-7b/vllm/template_baichuan.jinja | 29 +-- .../baichuan2-7b/vllm/utils.py | 234 ++---------------- 4 files changed, 50 insertions(+), 252 deletions(-) diff --git a/models/nlp/large_language_model/baichuan2-7b/vllm/README.md b/models/nlp/large_language_model/baichuan2-7b/vllm/README.md index e9ff41bd..6ac3fa63 100755 --- a/models/nlp/large_language_model/baichuan2-7b/vllm/README.md +++ b/models/nlp/large_language_model/baichuan2-7b/vllm/README.md @@ -13,7 +13,7 @@ In order to run the model smoothly, we need the following dependency files: 1. ixrt-xxx.whl 2. ixformer-xxx.whl 3. vllm-xxx.whl -Please contact the staff to obtain the relevant installation packages. + Please contact the staff to obtain the relevant installation packages. ```bash # Install libGL @@ -22,7 +22,7 @@ yum install -y mesa-libGL ## Ubuntu apt install -y libgl1-mesa-dev -pip3 install transformers==4.33.2 +pip3 install transformers==4.37.1 pip3 install Path/To/ixrt-xxx.whl pip3 install Path/To/vllm-xxx.whl pip3 install Path/To/ixformer-xxx.whl @@ -30,7 +30,7 @@ pip3 install Path/To/ixformer-xxx.whl ### Download -Pretrained model: +Pretrained model: [https://huggingface.co/baichuan-inc/Baichuan2-7B-Base/tree/main](https://huggingface.co/baichuan-inc/Baichuan2-7B-Base/tree/main) ```bash mkdir /data/baichuan/ @@ -40,7 +40,7 @@ mv Baichuan2-7B-Base.tar/zip /data/baichuan/ ## Run model ```bash -python3 offline_inference.py --model /data/baichuan/Baichuan2-7B-Base/ --chat_template template_baichuan.jinja --trust-remote-code +python3 offline_inference.py --model /data/baichuan/Baichuan2-7B-Base/ --max-tokens 256 --trust-remote-code --chat_template template_baichuan.jinja --temperature 0.0 ``` ## Run Baichuan w8a16 quantization @@ -56,6 +56,12 @@ python3 convert2int8.py --model-path /data/baichuan/Baichuan2-7B-Base/ ### Run ```bash -python3 offline_inference.py --model /data/baichuan/Baichuan2-7B-Base/int8/ --chat_template template_baichuan.jinja --quantization w8a16 --trust-remote-code --max-num-seqs 1 --max-model-len 256 \ - --trust-remote-code --tensor-parallel-size 2 --temperature 0.0 +python3 offline_inference.py --model /data/baichuan/Baichuan2-7B-Base/int8/ --chat_template template_baichuan.jinja --quantization w8a16 --max-num-seqs 1 --max-model-len 256 --trust-remote-code --temperature 0.0 --max-tokens 256 ``` + +## Results + +| Model | Precision | tokens | QPS | +| ------------- | --------- | ------ | ------ | +| Baichuan-2-7B | FP16 | 768 | 109.27 | +| Baichuan-2-7B | w8a16 | 740 | 59.82 | diff --git a/models/nlp/large_language_model/baichuan2-7b/vllm/offline_inference.py b/models/nlp/large_language_model/baichuan2-7b/vllm/offline_inference.py index 9d01a791..40c0e2e1 100644 --- a/models/nlp/large_language_model/baichuan2-7b/vllm/offline_inference.py +++ b/models/nlp/large_language_model/baichuan2-7b/vllm/offline_inference.py @@ -16,7 +16,7 @@ import sys from pathlib import Path sys.path.append(str(Path(__file__).resolve().parent.parent.parent)) -from utils import build_chat,post_process,load_chat_template,sampling_add_cli_args +from utils import load_chat_template,sampling_add_cli_args import logging import time @@ -30,7 +30,7 @@ from vllm import LLM, SamplingParams, EngineArgs parser = 
argparse.ArgumentParser() parser.add_argument("--chat_template",type=str,default=None) -parser.add_argument("--remove_chat_template",default=True,action="store_false",help="pass this if you are not use a chat model") +parser.add_argument("--remove_chat_template",default=False,action="store_true",help="pass this if you are not using a chat model") parser = EngineArgs.add_cli_args(parser) parser = sampling_add_cli_args(parser) args = parser.parse_args() @@ -59,14 +59,14 @@ sampling_params = SamplingParams(**sampling_params) llm = LLM(**engine_params) # process chat template -if not args.remove_chat_template: - if 'chat' not in model_name.lower(): - logging.warning(f"We assume that you are using the chat model, so additional processing is required for the input prompt. " - f"If the result is not quite correct, please ensure that the model path includes the chat character. " - f"for now, the model_name from model path is {model_name}") +if args.remove_chat_template: + if 'chat' in model_name.lower(): + logging.warning(f"The model name from the model path is {model_name}, so you appear to be using a chat model, and additional processing is required for the input prompt. " + f"If the result is not quite correct, please ensure you do not pass --remove_chat_template on the CLI.") prompts_new = prompts else: # Build chat model promopt + logging.warning("If you are using a non-chat model, please pass --remove_chat_template on the CLI.") # Try use transformers's apply_chat_template, if chat_template is None, will use defalut template. # For some old models, the default template may cause bad answers. we don't consider this situation, # because the Transformers team is advancing the chat template. For more informatino about it, @@ -85,9 +85,8 @@ else: ) prompts_new.append(text) except: - logging.warning("use tokenizer apply_chat_template function failed, may because of low transformers version...(try use transformers>=4.37.0)") - # Fall back to simple build chat, this part should be controled by model developer, we just provide a simple use cases - prompts_new = [build_chat(llm.get_tokenizer(),prompt,model_name,max_length=args.max_generate_tokens) for prompt in prompts] + logging.warning("Calling tokenizer.apply_chat_template failed, possibly because of a low transformers version (try transformers>=4.34.0).") + prompts_new = prompts # Generate texts from the prompts. The output is a list of RequestOutput objects # that contain the prompt, generated text, and other information. @@ -104,7 +103,7 @@ num_tokens = 0 # Print the outputs. for i, output in enumerate(outputs): prompt = prompts[i] # show the origin prompt.
actully prompt is "output.prompt" - generated_text = post_process(output.outputs[0].text,model_name) + generated_text = output.outputs[0].text num_tokens += len(output.outputs[0].token_ids) print(f"Prompt: {prompt}\nGenerated text: {generated_text} \n") diff --git a/models/nlp/large_language_model/baichuan2-7b/vllm/template_baichuan.jinja b/models/nlp/large_language_model/baichuan2-7b/vllm/template_baichuan.jinja index a1812a6c..42a8d927 100644 --- a/models/nlp/large_language_model/baichuan2-7b/vllm/template_baichuan.jinja +++ b/models/nlp/large_language_model/baichuan2-7b/vllm/template_baichuan.jinja @@ -1,22 +1,13 @@ {{ (messages|selectattr('role', 'equalto', 'system')|list|last).content|trim if (messages|selectattr('role', 'equalto', 'system')|list) else '' }} -{% for message in messages %} -{% if message['role'] == 'user' %} - -{{ message['content']|trim -}} -{% if not loop.last %} - - -{% endif %} -{% elif message['role'] == 'assistant' %} - -{{ message['content']|trim -}} -{% if not loop.last %} - - -{% endif %} -{% endif %} -{% endfor %} -{% if add_generation_prompt and messages[-1]['role'] != 'assistant' %} - +{%- for message in messages -%} + {%- if message['role'] == 'user' -%} + {{- '' + message['content'] -}} + {%- elif message['role'] == 'assistant' -%} + {{- '' + message['content'] -}} + {%- endif -%} +{%- endfor -%} + +{%- if add_generation_prompt and messages[-1]['role'] != 'assistant' -%} + {{- '' -}} {% endif %} \ No newline at end of file diff --git a/models/nlp/large_language_model/baichuan2-7b/vllm/utils.py b/models/nlp/large_language_model/baichuan2-7b/vllm/utils.py index 1fc072d8..c6def85d 100644 --- a/models/nlp/large_language_model/baichuan2-7b/vllm/utils.py +++ b/models/nlp/large_language_model/baichuan2-7b/vllm/utils.py @@ -13,15 +13,10 @@ # License for the specific language governing permissions and limitations # under the License. -from copy import deepcopy -from typing import Tuple, List, Union - import codecs import logging import argparse -# 对于chat模型,或者模型需要特定的输入,需要对prompt进行额外的处理。 -# 如果您在使用中有额外的prompt处理方式需求或者错误反馈,可以联系王坚或者巩亚飞,我们会对modelzoo进行更新适配。 def sampling_add_cli_args(args: argparse.ArgumentParser) -> argparse.ArgumentParser: args.add_argument( @@ -156,216 +151,23 @@ def sampling_add_cli_args(args: argparse.ArgumentParser) -> argparse.ArgumentPar def load_chat_template(tokenizer, chat_template): - if chat_template is not None: - try: - with open(chat_template, "r") as f: - tokenizer.chat_template = f.read() - except OSError: - # If opening a file fails, set chat template to be args to - # ensure we decode so our escape are interpreted correctly - tokenizer.chat_template = codecs.decode( - chat_template, "unicode_escape") - - logging.info( - f"Using supplied chat template:\n{tokenizer.chat_template}" - ) - elif tokenizer.chat_template is not None: - logging.info( - f"Using default chat template:\n{tokenizer.chat_template}" - ) - else: - logging.warning( - "No chat template provided. 
Chat API will not work.") - -def default_build_chat(tokenizer,prompt): - return prompt - -def chatglm2_build_chat(tokenizer,prompt): - return tokenizer.build_prompt(prompt) - -def chatglm3_build_chat(tokenizer,prompt): - return tokenizer.build_chat_input(prompt).input_ids[0].tolist() - -def llama2_build_chat(tokenizer,prompt): - return f"[INST]{prompt}[/INST]" - -# adapt from https://huggingface.co/baichuan-inc/Baichuan2-13B-Chat/blob/main/generation_utils.py -def baichuan2_build_chat(tokenizer, prompt, max_new_tokens=512): - def _parse_messages(messages, split_role="user"): - system, rounds = "", [] - round = [] - for i, message in enumerate(messages): - if message["role"] == "system": - assert i == 0 - system = message["content"] - continue - if message["role"] == split_role and round: - rounds.append(round) - round = [] - round.append(message) - if round: - rounds.append(round) - return system, rounds - - messages = [{"role": "user", "content": f"{prompt}"}] - max_new_tokens = max_new_tokens - max_input_tokens = 4096 - max_new_tokens - system, rounds = _parse_messages(messages, split_role="user") - system_tokens = tokenizer.encode(system) - max_history_tokens = max_input_tokens - len(system_tokens) - - history_tokens = [] - for round in rounds[::-1]: - round_tokens = [] - for message in round: - if message["role"] == "user": - round_tokens.append(195) - else: - round_tokens.append(196) - round_tokens.extend(tokenizer.encode(message["content"])) - if len(history_tokens) == 0 or len(history_tokens) + len(round_tokens) <= max_history_tokens: - history_tokens = round_tokens + history_tokens # concat left - if len(history_tokens) < max_history_tokens: - continue - break - - input_tokens = system_tokens + history_tokens - if messages[-1]["role"] != "assistant": - input_tokens.append(196) - input_tokens = input_tokens[-max_input_tokens:] # truncate left - return input_tokens - -def qwen_build_chat( - tokenizer, - query: str, - history: List[Tuple[str, str]] = None, - system: str = "", - max_window_size: int = 6144, - chat_format: str = "chatml", -): - if history is None: - history = [] - - if chat_format == "chatml": - im_start, im_end = "<|im_start|>", "<|im_end|>" - im_start_tokens = [tokenizer.im_start_id] - im_end_tokens = [tokenizer.im_end_id] - nl_tokens = tokenizer.encode("\n") - - def _tokenize_str(role, content): - return f"{role}\n{content}", tokenizer.encode( - role, allowed_special=set() - ) + nl_tokens + tokenizer.encode(content, allowed_special=set()) - - system_text, system_tokens_part = _tokenize_str("system", system) - system_tokens = im_start_tokens + system_tokens_part + im_end_tokens - - raw_text = "" - context_tokens = [] - - for turn_query, turn_response in reversed(history): - query_text, query_tokens_part = _tokenize_str("user", turn_query) - query_tokens = im_start_tokens + query_tokens_part + im_end_tokens - response_text, response_tokens_part = _tokenize_str( - "assistant", turn_response - ) - response_tokens = im_start_tokens + response_tokens_part + im_end_tokens - - next_context_tokens = nl_tokens + query_tokens + nl_tokens + response_tokens - prev_chat = ( - f"\n{im_start}{query_text}{im_end}\n{im_start}{response_text}{im_end}" - ) - - current_context_size = ( - len(system_tokens) + len(next_context_tokens) + len(context_tokens) - ) - if current_context_size < max_window_size: - context_tokens = next_context_tokens + context_tokens - raw_text = prev_chat + raw_text - else: - break - - context_tokens = system_tokens + context_tokens - raw_text = 
f"{im_start}{system_text}{im_end}" + raw_text - context_tokens += ( - nl_tokens - + im_start_tokens - + _tokenize_str("user", query)[1] - + im_end_tokens - + nl_tokens - + im_start_tokens - + tokenizer.encode("assistant") - + nl_tokens + if chat_template is not None: + try: + with open(chat_template, "r") as f: + tokenizer.chat_template = f.read() + except OSError: + # If opening a file fails, set chat template to be args to + # ensure we decode so our escape are interpreted correctly + tokenizer.chat_template = codecs.decode( + chat_template, "unicode_escape") + + logging.info( + f"Using supplied chat template:\n{tokenizer.chat_template}" + ) + elif tokenizer.chat_template is not None: + logging.info( + f"Using default chat template:\n{tokenizer.chat_template}. This May lead to unsatisfactory results. You can provide a template.jinja file for vllm." ) - raw_text += f"\n{im_start}user\n{query}{im_end}\n{im_start}assistant\n" - - elif chat_format == "raw": - raw_text = query - context_tokens = tokenizer.encode(raw_text) - else: - raise NotImplementedError(f"Unknown chat format {chat_format!r}") - - return raw_text, context_tokens - -def codellama_build_chat(tokenizer,prompt): - return "[INST] Write code to solve the following coding problem that obeys the constraints and passes the example test cases. Please wrap your code answer using ```:{}[/INST]".format(prompt) - -def build_chat(tokenizer, prompt, model_name, **kwargs): - model_name = model_name.lower() - # return str or list[int] - if "chatglm2" in model_name: - prompt = chatglm2_build_chat(tokenizer,prompt) - elif "chatglm3" in model_name: - prompt = chatglm3_build_chat(tokenizer,prompt) - elif "llama2" in model_name and 'chat' in model_name: - prompt = llama2_build_chat(tokenizer,prompt) - elif "baichuan2" in model_name and 'chat' in model_name: - prompt = baichuan2_build_chat(tokenizer,prompt, kwargs['max_length']) - elif "qwen" in model_name and 'chat' in model_name: - prompt = qwen_build_chat(tokenizer,prompt) - elif "code" in model_name and 'llama' in model_name: - prompt = codellama_build_chat(tokenizer,prompt) - else: - prompt = default_build_chat(tokenizer,prompt) - return prompt - - -# for output -def default_post_process(output): - return output - -def glm2_post_process(output): - output = output.strip() - output = output.replace("[[训练时间]]", "2023年") - return output - -def glm3_post_process(output, history=[]): - content = "" - history = deepcopy(history) - for response in output.split("<|assistant|>"): - metadata, content = response.split("\n", maxsplit=1) - if not metadata.strip(): - content = content.strip() - history.append({"role": "assistant", "metadata": metadata, "content": content}) - content = content.replace("[[训练时间]]", "2023年") - else: - history.append({"role": "assistant", "metadata": metadata, "content": content}) - if history[0]["role"] == "system" and "tools" in history[0]: - content = "\n".join(content.split("\n")[1:-1]) - def tool_call(**kwargs): - return kwargs - parameters = eval(content) - content = {"name": metadata.strip(), "parameters": parameters} - else: - content = {"name": metadata.strip(), "content": content} - return content - -def post_process(response, model_name,**kwargs): - model_name = model_name.lower() - if "chatglm2" in model_name: - response = glm2_post_process(response) - elif "chatglm3" in model_name: - response = glm3_post_process(response) else: - response = default_post_process(response) - return response \ No newline at end of file + logging.warning( + "No chat template provided. 
Chat API will not work. This may lead to unsatisfactory results. You can provide a template.jinja file for vllm.") -- Gitee
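A minimal sketch of the chat-template path the updated offline_inference.py relies on: tokenizer.apply_chat_template with the supplied template_baichuan.jinja, which replaces the removed build_chat/post_process helpers. The model path and the example message below are illustrative assumptions taken from the README; transformers>=4.34 is assumed.

```python
# Sketch only: apply template_baichuan.jinja via transformers' apply_chat_template,
# mirroring what load_chat_template() in utils.py sets up for vLLM.
from transformers import AutoTokenizer

# Model path follows the README above; adjust to your environment (assumption).
tokenizer = AutoTokenizer.from_pretrained(
    "/data/baichuan/Baichuan2-7B-Base/", trust_remote_code=True
)

# Read the supplied Jinja template into the tokenizer, as load_chat_template() does.
with open("template_baichuan.jinja", "r") as f:
    tokenizer.chat_template = f.read()

# Build a single-turn chat prompt; the rendered string is what vLLM would generate from.
messages = [{"role": "user", "content": "Who are you?"}]
prompt = tokenizer.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)
print(prompt)
```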