From f778ca94d887526db4ffe130182edff71c7c6048 Mon Sep 17 00:00:00 2001 From: majorli Date: Tue, 18 Jun 2024 13:51:01 +0800 Subject: [PATCH] bugfix: sync code and update readme.md link #IA5YBX #IA5XI2 Signed-off-by: majorli --- .../baichuan2-7b/vllm/README.md | 18 +- .../baichuan2-7b/vllm/offline_inference.py | 21 +- .../baichuan2-7b/vllm/template_baichuan.jinja | 29 +-- .../baichuan2-7b/vllm/utils.py | 234 ++---------------- 4 files changed, 50 insertions(+), 252 deletions(-) diff --git a/models/nlp/large_language_model/baichuan2-7b/vllm/README.md b/models/nlp/large_language_model/baichuan2-7b/vllm/README.md index e9ff41bd..6ac3fa63 100755 --- a/models/nlp/large_language_model/baichuan2-7b/vllm/README.md +++ b/models/nlp/large_language_model/baichuan2-7b/vllm/README.md @@ -13,7 +13,7 @@ In order to run the model smoothly, we need the following dependency files: 1. ixrt-xxx.whl 2. ixformer-xxx.whl 3. vllm-xxx.whl -Please contact the staff to obtain the relevant installation packages. + Please contact the staff to obtain the relevant installation packages. ```bash # Install libGL @@ -22,7 +22,7 @@ yum install -y mesa-libGL ## Ubuntu apt install -y libgl1-mesa-dev -pip3 install transformers==4.33.2 +pip3 install transformers==4.37.1 pip3 install Path/To/ixrt-xxx.whl pip3 install Path/To/vllm-xxx.whl pip3 install Path/To/ixformer-xxx.whl @@ -30,7 +30,7 @@ pip3 install Path/To/ixformer-xxx.whl ### Download -Pretrained model: +Pretrained model: [https://huggingface.co/baichuan-inc/Baichuan2-7B-Base/tree/main](https://huggingface.co/baichuan-inc/Baichuan2-7B-Base/tree/main) ```bash mkdir /data/baichuan/ @@ -40,7 +40,7 @@ mv Baichuan2-7B-Base.tar/zip /data/baichuan/ ## Run model ```bash -python3 offline_inference.py --model /data/baichuan/Baichuan2-7B-Base/ --chat_template template_baichuan.jinja --trust-remote-code +python3 offline_inference.py --model /data/baichuan/Baichuan2-7B-Base/ --max-tokens 256 --trust-remote-code --chat_template template_baichuan.jinja --temperature 0.0 ``` ## Run Baichuan w8a16 quantization @@ -56,6 +56,12 @@ python3 convert2int8.py --model-path /data/baichuan/Baichuan2-7B-Base/ ### Run ```bash -python3 offline_inference.py --model /data/baichuan/Baichuan2-7B-Base/int8/ --chat_template template_baichuan.jinja --quantization w8a16 --trust-remote-code --max-num-seqs 1 --max-model-len 256 \ - --trust-remote-code --tensor-parallel-size 2 --temperature 0.0 +python3 offline_inference.py --model /data/baichuan/Baichuan2-7B-Base/int8/ --chat_template template_baichuan.jinja --quantization w8a16 --max-num-seqs 1 --max-model-len 256 --trust-remote-code --temperature 0.0 --max-tokens 256 ``` + +## Results + +| Model | Precision | tokens | QPS | +| ------------- | --------- | ------ | ------ | +| Baichuan-2-7B | FP16 | 768 | 109.27 | +| Baichuan-2-7B | w8a16 | 740 | 59.82 | diff --git a/models/nlp/large_language_model/baichuan2-7b/vllm/offline_inference.py b/models/nlp/large_language_model/baichuan2-7b/vllm/offline_inference.py index 9d01a791..40c0e2e1 100644 --- a/models/nlp/large_language_model/baichuan2-7b/vllm/offline_inference.py +++ b/models/nlp/large_language_model/baichuan2-7b/vllm/offline_inference.py @@ -16,7 +16,7 @@ import sys from pathlib import Path sys.path.append(str(Path(__file__).resolve().parent.parent.parent)) -from utils import build_chat,post_process,load_chat_template,sampling_add_cli_args +from utils import load_chat_template,sampling_add_cli_args import logging import time @@ -30,7 +30,7 @@ from vllm import LLM, SamplingParams, EngineArgs parser = 
argparse.ArgumentParser() parser.add_argument("--chat_template",type=str,default=None) -parser.add_argument("--remove_chat_template",default=True,action="store_false",help="pass this if you are not use a chat model") +parser.add_argument("--remove_chat_template",default=False,action="store_true",help="pass this if you are not using a chat model") parser = EngineArgs.add_cli_args(parser) parser = sampling_add_cli_args(parser) args = parser.parse_args() @@ -59,14 +59,14 @@ sampling_params = SamplingParams(**sampling_params) llm = LLM(**engine_params) # process chat template -if not args.remove_chat_template: - if 'chat' not in model_name.lower(): - logging.warning(f"We assume that you are using the chat model, so additional processing is required for the input prompt. " - f"If the result is not quite correct, please ensure that the model path includes the chat character. " - f"for now, the model_name from model path is {model_name}") +if args.remove_chat_template: + if 'chat' in model_name.lower(): + logging.warning(f"The model name from the model path is {model_name}, so you appear to be using a chat model, and additional processing is required for the input prompt. " + f"If the result is not quite correct, please ensure you do not pass --remove_chat_template on the CLI.") prompts_new = prompts else: # Build chat model promopt + logging.warning("If you are using a non-chat model, please pass --remove_chat_template on the CLI.") # Try use transformers's apply_chat_template, if chat_template is None, will use defalut template. # For some old models, the default template may cause bad answers. we don't consider this situation, # because the Transformers team is advancing the chat template. For more informatino about it, @@ -85,9 +85,8 @@ else: ) prompts_new.append(text) except: - logging.warning("use tokenizer apply_chat_template function failed, may because of low transformers version...(try use transformers>=4.37.0)") - # Fall back to simple build chat, this part should be controled by model developer, we just provide a simple use cases - prompts_new = [build_chat(llm.get_tokenizer(),prompt,model_name,max_length=args.max_generate_tokens) for prompt in prompts] + logging.warning("Calling tokenizer.apply_chat_template failed, possibly because of a low transformers version (try transformers>=4.34.0).") + prompts_new = prompts # Generate texts from the prompts. The output is a list of RequestOutput objects # that contain the prompt, generated text, and other information. @@ -104,7 +103,7 @@ num_tokens = 0 # Print the outputs. for i, output in enumerate(outputs): prompt = prompts[i] # show the origin prompt.
actully prompt is "output.prompt" - generated_text = post_process(output.outputs[0].text,model_name) + generated_text = output.outputs[0].text num_tokens += len(output.outputs[0].token_ids) print(f"Prompt: {prompt}\nGenerated text: {generated_text} \n") diff --git a/models/nlp/large_language_model/baichuan2-7b/vllm/template_baichuan.jinja b/models/nlp/large_language_model/baichuan2-7b/vllm/template_baichuan.jinja index a1812a6c..42a8d927 100644 --- a/models/nlp/large_language_model/baichuan2-7b/vllm/template_baichuan.jinja +++ b/models/nlp/large_language_model/baichuan2-7b/vllm/template_baichuan.jinja @@ -1,22 +1,13 @@ {{ (messages|selectattr('role', 'equalto', 'system')|list|last).content|trim if (messages|selectattr('role', 'equalto', 'system')|list) else '' }} -{% for message in messages %} -{% if message['role'] == 'user' %} - -{{ message['content']|trim -}} -{% if not loop.last %} - - -{% endif %} -{% elif message['role'] == 'assistant' %} - -{{ message['content']|trim -}} -{% if not loop.last %} - - -{% endif %} -{% endif %} -{% endfor %} -{% if add_generation_prompt and messages[-1]['role'] != 'assistant' %} - +{%- for message in messages -%} + {%- if message['role'] == 'user' -%} + {{- '' + message['content'] -}} + {%- elif message['role'] == 'assistant' -%} + {{- '' + message['content'] -}} + {%- endif -%} +{%- endfor -%} + +{%- if add_generation_prompt and messages[-1]['role'] != 'assistant' -%} + {{- '' -}} {% endif %} \ No newline at end of file diff --git a/models/nlp/large_language_model/baichuan2-7b/vllm/utils.py b/models/nlp/large_language_model/baichuan2-7b/vllm/utils.py index 1fc072d8..c6def85d 100644 --- a/models/nlp/large_language_model/baichuan2-7b/vllm/utils.py +++ b/models/nlp/large_language_model/baichuan2-7b/vllm/utils.py @@ -13,15 +13,10 @@ # License for the specific language governing permissions and limitations # under the License. -from copy import deepcopy -from typing import Tuple, List, Union - import codecs import logging import argparse -# 对于chat模型,或者模型需要特定的输入,需要对prompt进行额外的处理。 -# 如果您在使用中有额外的prompt处理方式需求或者错误反馈,可以联系王坚或者巩亚飞,我们会对modelzoo进行更新适配。 def sampling_add_cli_args(args: argparse.ArgumentParser) -> argparse.ArgumentParser: args.add_argument( @@ -156,216 +151,23 @@ def sampling_add_cli_args(args: argparse.ArgumentParser) -> argparse.ArgumentPar def load_chat_template(tokenizer, chat_template): - if chat_template is not None: - try: - with open(chat_template, "r") as f: - tokenizer.chat_template = f.read() - except OSError: - # If opening a file fails, set chat template to be args to - # ensure we decode so our escape are interpreted correctly - tokenizer.chat_template = codecs.decode( - chat_template, "unicode_escape") - - logging.info( - f"Using supplied chat template:\n{tokenizer.chat_template}" - ) - elif tokenizer.chat_template is not None: - logging.info( - f"Using default chat template:\n{tokenizer.chat_template}" - ) - else: - logging.warning( - "No chat template provided. 
Chat API will not work.") - -def default_build_chat(tokenizer,prompt): - return prompt - -def chatglm2_build_chat(tokenizer,prompt): - return tokenizer.build_prompt(prompt) - -def chatglm3_build_chat(tokenizer,prompt): - return tokenizer.build_chat_input(prompt).input_ids[0].tolist() - -def llama2_build_chat(tokenizer,prompt): - return f"[INST]{prompt}[/INST]" - -# adapt from https://huggingface.co/baichuan-inc/Baichuan2-13B-Chat/blob/main/generation_utils.py -def baichuan2_build_chat(tokenizer, prompt, max_new_tokens=512): - def _parse_messages(messages, split_role="user"): - system, rounds = "", [] - round = [] - for i, message in enumerate(messages): - if message["role"] == "system": - assert i == 0 - system = message["content"] - continue - if message["role"] == split_role and round: - rounds.append(round) - round = [] - round.append(message) - if round: - rounds.append(round) - return system, rounds - - messages = [{"role": "user", "content": f"{prompt}"}] - max_new_tokens = max_new_tokens - max_input_tokens = 4096 - max_new_tokens - system, rounds = _parse_messages(messages, split_role="user") - system_tokens = tokenizer.encode(system) - max_history_tokens = max_input_tokens - len(system_tokens) - - history_tokens = [] - for round in rounds[::-1]: - round_tokens = [] - for message in round: - if message["role"] == "user": - round_tokens.append(195) - else: - round_tokens.append(196) - round_tokens.extend(tokenizer.encode(message["content"])) - if len(history_tokens) == 0 or len(history_tokens) + len(round_tokens) <= max_history_tokens: - history_tokens = round_tokens + history_tokens # concat left - if len(history_tokens) < max_history_tokens: - continue - break - - input_tokens = system_tokens + history_tokens - if messages[-1]["role"] != "assistant": - input_tokens.append(196) - input_tokens = input_tokens[-max_input_tokens:] # truncate left - return input_tokens - -def qwen_build_chat( - tokenizer, - query: str, - history: List[Tuple[str, str]] = None, - system: str = "", - max_window_size: int = 6144, - chat_format: str = "chatml", -): - if history is None: - history = [] - - if chat_format == "chatml": - im_start, im_end = "<|im_start|>", "<|im_end|>" - im_start_tokens = [tokenizer.im_start_id] - im_end_tokens = [tokenizer.im_end_id] - nl_tokens = tokenizer.encode("\n") - - def _tokenize_str(role, content): - return f"{role}\n{content}", tokenizer.encode( - role, allowed_special=set() - ) + nl_tokens + tokenizer.encode(content, allowed_special=set()) - - system_text, system_tokens_part = _tokenize_str("system", system) - system_tokens = im_start_tokens + system_tokens_part + im_end_tokens - - raw_text = "" - context_tokens = [] - - for turn_query, turn_response in reversed(history): - query_text, query_tokens_part = _tokenize_str("user", turn_query) - query_tokens = im_start_tokens + query_tokens_part + im_end_tokens - response_text, response_tokens_part = _tokenize_str( - "assistant", turn_response - ) - response_tokens = im_start_tokens + response_tokens_part + im_end_tokens - - next_context_tokens = nl_tokens + query_tokens + nl_tokens + response_tokens - prev_chat = ( - f"\n{im_start}{query_text}{im_end}\n{im_start}{response_text}{im_end}" - ) - - current_context_size = ( - len(system_tokens) + len(next_context_tokens) + len(context_tokens) - ) - if current_context_size < max_window_size: - context_tokens = next_context_tokens + context_tokens - raw_text = prev_chat + raw_text - else: - break - - context_tokens = system_tokens + context_tokens - raw_text = 
f"{im_start}{system_text}{im_end}" + raw_text - context_tokens += ( - nl_tokens - + im_start_tokens - + _tokenize_str("user", query)[1] - + im_end_tokens - + nl_tokens - + im_start_tokens - + tokenizer.encode("assistant") - + nl_tokens + if chat_template is not None: + try: + with open(chat_template, "r") as f: + tokenizer.chat_template = f.read() + except OSError: + # If opening a file fails, set chat template to be args to + # ensure we decode so our escape are interpreted correctly + tokenizer.chat_template = codecs.decode( + chat_template, "unicode_escape") + + logging.info( + f"Using supplied chat template:\n{tokenizer.chat_template}" + ) + elif tokenizer.chat_template is not None: + logging.info( + f"Using default chat template:\n{tokenizer.chat_template}. This May lead to unsatisfactory results. You can provide a template.jinja file for vllm." ) - raw_text += f"\n{im_start}user\n{query}{im_end}\n{im_start}assistant\n" - - elif chat_format == "raw": - raw_text = query - context_tokens = tokenizer.encode(raw_text) - else: - raise NotImplementedError(f"Unknown chat format {chat_format!r}") - - return raw_text, context_tokens - -def codellama_build_chat(tokenizer,prompt): - return "[INST] Write code to solve the following coding problem that obeys the constraints and passes the example test cases. Please wrap your code answer using ```:{}[/INST]".format(prompt) - -def build_chat(tokenizer, prompt, model_name, **kwargs): - model_name = model_name.lower() - # return str or list[int] - if "chatglm2" in model_name: - prompt = chatglm2_build_chat(tokenizer,prompt) - elif "chatglm3" in model_name: - prompt = chatglm3_build_chat(tokenizer,prompt) - elif "llama2" in model_name and 'chat' in model_name: - prompt = llama2_build_chat(tokenizer,prompt) - elif "baichuan2" in model_name and 'chat' in model_name: - prompt = baichuan2_build_chat(tokenizer,prompt, kwargs['max_length']) - elif "qwen" in model_name and 'chat' in model_name: - prompt = qwen_build_chat(tokenizer,prompt) - elif "code" in model_name and 'llama' in model_name: - prompt = codellama_build_chat(tokenizer,prompt) - else: - prompt = default_build_chat(tokenizer,prompt) - return prompt - - -# for output -def default_post_process(output): - return output - -def glm2_post_process(output): - output = output.strip() - output = output.replace("[[训练时间]]", "2023年") - return output - -def glm3_post_process(output, history=[]): - content = "" - history = deepcopy(history) - for response in output.split("<|assistant|>"): - metadata, content = response.split("\n", maxsplit=1) - if not metadata.strip(): - content = content.strip() - history.append({"role": "assistant", "metadata": metadata, "content": content}) - content = content.replace("[[训练时间]]", "2023年") - else: - history.append({"role": "assistant", "metadata": metadata, "content": content}) - if history[0]["role"] == "system" and "tools" in history[0]: - content = "\n".join(content.split("\n")[1:-1]) - def tool_call(**kwargs): - return kwargs - parameters = eval(content) - content = {"name": metadata.strip(), "parameters": parameters} - else: - content = {"name": metadata.strip(), "content": content} - return content - -def post_process(response, model_name,**kwargs): - model_name = model_name.lower() - if "chatglm2" in model_name: - response = glm2_post_process(response) - elif "chatglm3" in model_name: - response = glm3_post_process(response) else: - response = default_post_process(response) - return response \ No newline at end of file + logging.warning( + "No chat template provided. 
Chat API will not work. This may lead to unsatisfactory results. You can provide a template.jinja file for vllm.") -- Gitee
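A minimal sketch of the chat-template path the updated offline_inference.py relies on: tokenizer.apply_chat_template with the supplied template_baichuan.jinja, which replaces the removed build_chat/post_process helpers. The model path and the example message below are illustrative assumptions taken from the README; transformers>=4.34 is assumed.

```python
# Sketch only: apply template_baichuan.jinja via transformers' apply_chat_template,
# mirroring what load_chat_template() in utils.py sets up for vLLM.
from transformers import AutoTokenizer

# Model path follows the README above; adjust to your environment (assumption).
tokenizer = AutoTokenizer.from_pretrained(
    "/data/baichuan/Baichuan2-7B-Base/", trust_remote_code=True
)

# Read the supplied Jinja template into the tokenizer, as load_chat_template() does.
with open("template_baichuan.jinja", "r") as f:
    tokenizer.chat_template = f.read()

# Build a single-turn chat prompt; the rendered string is what vLLM would generate from.
messages = [{"role": "user", "content": "Who are you?"}]
prompt = tokenizer.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)
print(prompt)
```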