diff --git a/models/nlp/large_language_model/chatglm/vllm/README.md b/models/nlp/large_language_model/chatglm/vllm/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..78c8d48c8b0866a9b835456828fb78217b4843fa
--- /dev/null
+++ b/models/nlp/large_language_model/chatglm/vllm/README.md
@@ -0,0 +1,51 @@
+# ChatGLM3-6B
+
+## Description
+ChatGLM3-6B is trained on large-scale natural language text data, enabling it to understand and generate text. It can be applied to various natural language processing tasks such as dialogue generation, text summarization, and language translation.
+
+## Setup
+
+### Install
+To run the model smoothly, the following dependency files are required:
+1. ixrt-xxx.whl
+2. ixformer-xxx.whl
+3. vllm-xxx.whl
+Please contact the staff to obtain the relevant installation packages.
+
+```bash
+yum install mesa-libGL
+pip3 install transformers==4.33.2
+pip3 install Path/To/ixrt-xxx.whl
+pip3 install Path/To/vllm-xxx.whl
+pip3 install Path/To/ixformer-xxx.whl
+```
+
+### Download
+Pretrained model:
+
+```bash
+mkdir -p /data/chatglm/
+mv chatglm3-6b.zip /data/chatglm/  # or the .tar archive; extract it to /data/chatglm/chatglm3-6b
+```
+
+
+## Run model
+
+```bash
+python3 offline_inference.py --model /data/chatglm/chatglm3-6b --trust-remote-code --temperature 0.0 --max-tokens 256
+```
+
+## Use the server
+
+### Start the server
+
+```bash
+python3 -m vllm.entrypoints.openai.api_server --model /data/chatglm/chatglm3-6b --gpu-memory-utilization 0.9 --max-num-batched-tokens 8193 \
+        --max-num-seqs 32 --disable-log-requests --host 127.0.0.1 --port 12345 --trust-remote-code
+```
+
+### Test using the OpenAI interface
+
+```bash
+python3 server_inference.py --host 127.0.0.1 --port 12345 --model_path /data/chatglm/chatglm3-6b
+```
\ No newline at end of file
diff --git a/models/nlp/large_language_model/chatglm/vllm/offline_inference.py b/models/nlp/large_language_model/chatglm/vllm/offline_inference.py
new file mode 100644
index 0000000000000000000000000000000000000000..0162d93c53ac839268b3c964e0e96ecaad63ac4e
--- /dev/null
+++ b/models/nlp/large_language_model/chatglm/vllm/offline_inference.py
@@ -0,0 +1,102 @@
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+
+import sys
+from pathlib import Path
+sys.path.append(str(Path(__file__).resolve().parent.parent.parent))
+from utils import load_chat_template, sampling_add_cli_args
+
+import logging
+import time
+import argparse
+import dataclasses
+import inspect
+
+import torch
+from vllm import LLM, SamplingParams, EngineArgs
+
+
+parser = argparse.ArgumentParser()
+parser.add_argument("--chat_template", type=str, default=None)
+parser.add_argument("--remove_chat_template", default=False, action="store_true", help="pass this if you are not using a chat model")
+parser = EngineArgs.add_cli_args(parser)
+parser = sampling_add_cli_args(parser)
+args = parser.parse_args()
+
+engine_args = [attr.name for attr in dataclasses.fields(EngineArgs)]
+sampling_args = [param.name for param in list(inspect.signature(SamplingParams.__init__).parameters.values())[1:]]
+engine_params = {attr: getattr(args, attr) for attr in engine_args}
+sampling_params = {attr: getattr(args, attr) for attr in sampling_args if args.__contains__(attr)}
+
+model_name = args.model.strip()
+model_name = model_name if args.model[-1] != '/' else model_name[:-1]
+model_name = model_name.rsplit('/')[-1]
+
+
+# Sample prompts.
+prompts = [
+    "哪些迹象可能表明一个人正在经历焦虑?",
+    "描述一下如何制作芝士披萨。",
+    "写一篇有关5G网络研发的综述文章。"
+    ]
+
+# Create a sampling params object.
+sampling_params = SamplingParams(**sampling_params)
+
+# Create an LLM.
+llm = LLM(**engine_params)
+
+# Process the chat template.
+if args.remove_chat_template:
+    if 'chat' in model_name.lower():
+        logging.warning(f"The model name parsed from the model path is {model_name}, so we assume you are using a chat model that needs extra prompt processing. "
+                        f"If the result is not quite correct, please make sure you did not pass --remove_chat_template on the CLI.")
+    prompts_new = prompts
+else:
+    # Build the chat model prompt.
+    logging.warning("If you are using a non-chat model, please pass --remove_chat_template on the CLI.")
+    logging.warning("For now, the OpenAI chat interface (v1/chat/completions) requires a chat template to process string prompts for good results. "
+                    "Otherwise the default chat template is used, which may lead to bad answers. However, building the chat input is complex for "
+                    "some models and the rule cannot always be written as a jinja file. Fortunately, the v1/completions interface accepts List[int] "
+                    "prompts, so you can tokenize the prompt yourself and send the token ids to v1/completions, treating it as a substitute for "
+                    "v1/chat/completions when you use the OpenAI API.")
+    tokenizer = llm.get_tokenizer()
+    prompts_new = []
+    for prompt in prompts:
+        input_idx = tokenizer.build_chat_input(prompt)['input_ids'][0].cpu().tolist()
+        prompts_new.append(input_idx)
+
+# Generate texts from the prompts. The output is a list of RequestOutput objects that
+# contain the prompt, generated text, and other information. The first call is an untimed warm-up pass.
+outputs = llm.generate(prompts_new, sampling_params, use_tqdm=False) if isinstance(prompts_new[0], str) else llm.generate(sampling_params=sampling_params, prompt_token_ids=prompts_new, use_tqdm=False)
+torch.cuda.synchronize()
+
+start_time = time.perf_counter()
+outputs = llm.generate(prompts_new, sampling_params) if isinstance(prompts_new[0], str) else llm.generate(sampling_params=sampling_params, prompt_token_ids=prompts_new)
+torch.cuda.synchronize()
+end_time = time.perf_counter()
+duration_time = end_time - start_time
+
+num_tokens = 0
+# Print the outputs.
+for i, output in enumerate(outputs):
+    prompt = prompts[i]  # show the original prompt text; the prompt actually used by the engine is output.prompt
+    generated_text = output.outputs[0].text
+
+    num_tokens += len(output.outputs[0].token_ids)
+    print(f"Prompt: {prompt}\nGenerated text: {generated_text}\n")
+print(f"tokens: {num_tokens}, QPS: {num_tokens/duration_time}")
+
+# 0.3.2 tokens: 422, QPS: 70.02308283048338 (tokens: 422, QPS: 93.67210003677407); 32k model: tokens: 477, QPS: 81.46537314533865 (tokens: 477, QPS: 106.54247895449554)
\ No newline at end of file
diff --git a/models/nlp/large_language_model/chatglm/vllm/server_inference.py b/models/nlp/large_language_model/chatglm/vllm/server_inference.py
new file mode 100644
index 0000000000000000000000000000000000000000..e60b6f9c25f71ce4d8be81e8d8eeb32712acc3f9
--- /dev/null
+++ b/models/nlp/large_language_model/chatglm/vllm/server_inference.py
@@ -0,0 +1,79 @@
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+
+import argparse
+import time
+from openai import OpenAI
+from transformers import AutoTokenizer
+
+
+def send_request(
+    api_url: str,
+    prompt: str,
+    output_len: int,
+    stream: bool,
+) -> None:
+    client = OpenAI(
+        # defaults to os.environ.get("OPENAI_API_KEY")
+        api_key="EMPTY",
+        base_url=api_url,
+    )
+
+    models = client.models.list()
+    model = models.data[0].id
+
+    completion = client.completions.create(
+        model=model,
+        # messages=[{"role": "user", "content": prompt},],
+        prompt=prompt,
+        n=1,
+        stream=stream,
+        max_tokens=output_len,
+        temperature=0.0
+    )
+
+    if stream:
+        for each_com in completion:
+            print(each_com)
+    else:
+        print("++++++++++++++++++")
+        print(completion)
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(
+        description="Send test requests to the OpenAI-compatible vLLM server.")
+    parser.add_argument("--host", type=str, default="localhost")
+    parser.add_argument("--port", type=int, default=8000)
+    parser.add_argument("--stream", action="store_true")
+    parser.add_argument("--output_token", type=int, default=1024)
+    parser.add_argument("--model_path", type=str)
+
+    args = parser.parse_args()
+    api_url = f"http://{args.host}:{args.port}/v1"
+
+    prompts = [
+        "你好",
+        "Which city is the capital of China?",
+        "1 + 1 = ?",
+        "中国的首都是哪里",
+        "请将以下内容翻译为英文:\n你好,我来自中国。",
+    ]
+
+    tokenizer = AutoTokenizer.from_pretrained(args.model_path, trust_remote_code=True)
+    prompts = [tokenizer.build_chat_input(i).input_ids.tolist() for i in prompts]
+
+    for prompt in prompts:
+        send_request(api_url, prompt, args.output_token, args.stream)
diff --git a/models/nlp/large_language_model/chatglm/vllm/utils.py b/models/nlp/large_language_model/chatglm/vllm/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..1fc072d8d51e35109a97c17b5476e7bf3aa1448b
--- /dev/null
+++ b/models/nlp/large_language_model/chatglm/vllm/utils.py
@@ -0,0 +1,371 @@
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+
+from copy import deepcopy
+from typing import Tuple, List, Union
+
+import codecs
+import logging
+import argparse
+
+# For chat models, or models that require specific input formats, the prompt needs additional processing.
+# If you need support for extra prompt-processing schemes or want to report an error, please contact 王坚 or 巩亚飞, and we will update and adapt the modelzoo.
+
+def sampling_add_cli_args(args: argparse.ArgumentParser) -> argparse.ArgumentParser:
+    args.add_argument(
+        '--n',
+        type=int,
+        default=1,
+        help="Number of output sequences to return for the given prompt.")
+    args.add_argument(
+        '--best-of',
+        type=int,
+        default=None,
+        help="Number of output sequences that are generated from the prompt. "
+        "From these `best_of` sequences, the top `n` sequences are returned. "
+        "`best_of` must be greater than or equal to `n`. This is treated as "
+        "the beam width when `use_beam_search` is True. By default, `best_of` "
+        "is set to `n`.")
+    args.add_argument(
+        '--presence-penalty',
+        type=float,
+        default=0.0,
+        help="Float that penalizes new tokens based on whether they "
+        "appear in the generated text so far. Values > 0 encourage the model "
+        "to use new tokens, while values < 0 encourage the model to repeat "
+        "tokens.")
+    args.add_argument(
+        '--frequency-penalty',
+        type=float,
+        default=0.0,
+        help="Float that penalizes new tokens based on their "
+        "frequency in the generated text so far. Values > 0 encourage the "
+        "model to use new tokens, while values < 0 encourage the model to "
+        "repeat tokens.")
+    args.add_argument(
+        '--repetition-penalty',
+        type=float,
+        default=1.0,
+        help="Float that penalizes new tokens based on whether "
+        "they appear in the prompt and the generated text so far. Values > 1 "
+        "encourage the model to use new tokens, while values < 1 encourage "
+        "the model to repeat tokens.")
+    args.add_argument(
+        '--temperature',
+        type=float,
+        default=1.0,
+        help="Float that controls the randomness of the sampling. Lower "
+        "values make the model more deterministic, while higher values make "
+        "the model more random. Zero means greedy sampling.")
+    args.add_argument(
+        '--top-p',
+        type=float,
+        default=1.0,
+        help="Float that controls the cumulative probability of the top tokens "
+        "to consider. Must be in (0, 1]. Set to 1 to consider all tokens.")
+    args.add_argument(
+        '--top-k',
+        type=int,
+        default=-1,
+        help="Integer that controls the number of top tokens to consider. Set "
+        "to -1 to consider all tokens.")
+    args.add_argument(
+        '--min-p',
+        type=float,
+        default=0.0,
+        help="Float that represents the minimum probability for a token to be "
+        "considered, relative to the probability of the most likely token. "
+        "Must be in [0, 1]. Set to 0 to disable this.")
+    args.add_argument(
+        '--use-beam-search',
+        default=False,
+        action="store_true",
+        help="Whether to use beam search instead of sampling.")
+    args.add_argument(
+        '--length-penalty',
+        type=float,
+        default=1.0,
+        help="Float that penalizes sequences based on their length. Used in beam search.")
+    args.add_argument(
+        '--stop',
+        type=str,
+        default=None,
+        help="List of strings that stop the generation when they are generated. "
+        "The returned output will not contain the stop strings.")
+    args.add_argument(
+        '--stop-token-ids',
+        type=int,
+        default=None,
+        help="List of tokens that stop the generation when they are "
+        "generated. The returned output will contain the stop tokens unless "
+        "the stop tokens are special tokens.")
+    args.add_argument(
+        '--include-stop-str-in-output',
+        default=False,
+        action="store_true",
+        help="Whether to include the stop strings in output text. Defaults to False.")
+    args.add_argument(
+        '--ignore-eos',
+        default=False,
+        action="store_true",
+        help="Whether to ignore the EOS token and continue generating tokens after the EOS token is generated.")
+    args.add_argument(
+        '--max-tokens',
+        type=int,
+        default=16,
+        help="Maximum number of tokens to generate per output sequence.")
+    args.add_argument(
+        '--logprobs',
+        type=int,
+        default=None,
+        help="Number of log probabilities to return per output token. "
+        "Note that the implementation follows the OpenAI API: The return "
+        "result includes the log probabilities on the `logprobs` most likely "
+        "tokens, as well as the chosen tokens. The API will always return the "
+        "log probability of the sampled token, so there may be up to "
+        "`logprobs+1` elements in the response.")
+    args.add_argument(
+        '--prompt-logprobs',
+        type=int,
+        default=None,
+        help="Number of log probabilities to return per prompt token.")
+    args.add_argument(
+        '--skip-special-tokens',
+        default=True,
+        action="store_false",
+        help="Whether to skip special tokens in the output.")
+    args.add_argument(
+        '--spaces-between-special-tokens',
+        default=True,
+        action="store_false",
+        help="Whether to add spaces between special tokens in the output. Defaults to True.")
+    # Not exposed as CLI args: early_stopping, logits_processors, seed.
+    return args
+
+
+def load_chat_template(tokenizer, chat_template):
+    if chat_template is not None:
+        try:
+            with open(chat_template, "r") as f:
+                tokenizer.chat_template = f.read()
+        except OSError:
+            # If opening the file fails, treat chat_template as a literal template
+            # string and decode it so escape sequences are interpreted correctly.
+            tokenizer.chat_template = codecs.decode(
+                chat_template, "unicode_escape")
+
+        logging.info(
+            f"Using supplied chat template:\n{tokenizer.chat_template}"
+        )
+    elif tokenizer.chat_template is not None:
+        logging.info(
+            f"Using default chat template:\n{tokenizer.chat_template}"
+        )
+    else:
+        logging.warning(
+            "No chat template provided. 
Chat API will not work.") + +def default_build_chat(tokenizer,prompt): + return prompt + +def chatglm2_build_chat(tokenizer,prompt): + return tokenizer.build_prompt(prompt) + +def chatglm3_build_chat(tokenizer,prompt): + return tokenizer.build_chat_input(prompt).input_ids[0].tolist() + +def llama2_build_chat(tokenizer,prompt): + return f"[INST]{prompt}[/INST]" + +# adapt from https://huggingface.co/baichuan-inc/Baichuan2-13B-Chat/blob/main/generation_utils.py +def baichuan2_build_chat(tokenizer, prompt, max_new_tokens=512): + def _parse_messages(messages, split_role="user"): + system, rounds = "", [] + round = [] + for i, message in enumerate(messages): + if message["role"] == "system": + assert i == 0 + system = message["content"] + continue + if message["role"] == split_role and round: + rounds.append(round) + round = [] + round.append(message) + if round: + rounds.append(round) + return system, rounds + + messages = [{"role": "user", "content": f"{prompt}"}] + max_new_tokens = max_new_tokens + max_input_tokens = 4096 - max_new_tokens + system, rounds = _parse_messages(messages, split_role="user") + system_tokens = tokenizer.encode(system) + max_history_tokens = max_input_tokens - len(system_tokens) + + history_tokens = [] + for round in rounds[::-1]: + round_tokens = [] + for message in round: + if message["role"] == "user": + round_tokens.append(195) + else: + round_tokens.append(196) + round_tokens.extend(tokenizer.encode(message["content"])) + if len(history_tokens) == 0 or len(history_tokens) + len(round_tokens) <= max_history_tokens: + history_tokens = round_tokens + history_tokens # concat left + if len(history_tokens) < max_history_tokens: + continue + break + + input_tokens = system_tokens + history_tokens + if messages[-1]["role"] != "assistant": + input_tokens.append(196) + input_tokens = input_tokens[-max_input_tokens:] # truncate left + return input_tokens + +def qwen_build_chat( + tokenizer, + query: str, + history: List[Tuple[str, str]] = None, + system: str = "", + max_window_size: int = 6144, + chat_format: str = "chatml", +): + if history is None: + history = [] + + if chat_format == "chatml": + im_start, im_end = "<|im_start|>", "<|im_end|>" + im_start_tokens = [tokenizer.im_start_id] + im_end_tokens = [tokenizer.im_end_id] + nl_tokens = tokenizer.encode("\n") + + def _tokenize_str(role, content): + return f"{role}\n{content}", tokenizer.encode( + role, allowed_special=set() + ) + nl_tokens + tokenizer.encode(content, allowed_special=set()) + + system_text, system_tokens_part = _tokenize_str("system", system) + system_tokens = im_start_tokens + system_tokens_part + im_end_tokens + + raw_text = "" + context_tokens = [] + + for turn_query, turn_response in reversed(history): + query_text, query_tokens_part = _tokenize_str("user", turn_query) + query_tokens = im_start_tokens + query_tokens_part + im_end_tokens + response_text, response_tokens_part = _tokenize_str( + "assistant", turn_response + ) + response_tokens = im_start_tokens + response_tokens_part + im_end_tokens + + next_context_tokens = nl_tokens + query_tokens + nl_tokens + response_tokens + prev_chat = ( + f"\n{im_start}{query_text}{im_end}\n{im_start}{response_text}{im_end}" + ) + + current_context_size = ( + len(system_tokens) + len(next_context_tokens) + len(context_tokens) + ) + if current_context_size < max_window_size: + context_tokens = next_context_tokens + context_tokens + raw_text = prev_chat + raw_text + else: + break + + context_tokens = system_tokens + context_tokens + raw_text = 
f"{im_start}{system_text}{im_end}" + raw_text + context_tokens += ( + nl_tokens + + im_start_tokens + + _tokenize_str("user", query)[1] + + im_end_tokens + + nl_tokens + + im_start_tokens + + tokenizer.encode("assistant") + + nl_tokens + ) + raw_text += f"\n{im_start}user\n{query}{im_end}\n{im_start}assistant\n" + + elif chat_format == "raw": + raw_text = query + context_tokens = tokenizer.encode(raw_text) + else: + raise NotImplementedError(f"Unknown chat format {chat_format!r}") + + return raw_text, context_tokens + +def codellama_build_chat(tokenizer,prompt): + return "[INST] Write code to solve the following coding problem that obeys the constraints and passes the example test cases. Please wrap your code answer using ```:{}[/INST]".format(prompt) + +def build_chat(tokenizer, prompt, model_name, **kwargs): + model_name = model_name.lower() + # return str or list[int] + if "chatglm2" in model_name: + prompt = chatglm2_build_chat(tokenizer,prompt) + elif "chatglm3" in model_name: + prompt = chatglm3_build_chat(tokenizer,prompt) + elif "llama2" in model_name and 'chat' in model_name: + prompt = llama2_build_chat(tokenizer,prompt) + elif "baichuan2" in model_name and 'chat' in model_name: + prompt = baichuan2_build_chat(tokenizer,prompt, kwargs['max_length']) + elif "qwen" in model_name and 'chat' in model_name: + prompt = qwen_build_chat(tokenizer,prompt) + elif "code" in model_name and 'llama' in model_name: + prompt = codellama_build_chat(tokenizer,prompt) + else: + prompt = default_build_chat(tokenizer,prompt) + return prompt + + +# for output +def default_post_process(output): + return output + +def glm2_post_process(output): + output = output.strip() + output = output.replace("[[训练时间]]", "2023年") + return output + +def glm3_post_process(output, history=[]): + content = "" + history = deepcopy(history) + for response in output.split("<|assistant|>"): + metadata, content = response.split("\n", maxsplit=1) + if not metadata.strip(): + content = content.strip() + history.append({"role": "assistant", "metadata": metadata, "content": content}) + content = content.replace("[[训练时间]]", "2023年") + else: + history.append({"role": "assistant", "metadata": metadata, "content": content}) + if history[0]["role"] == "system" and "tools" in history[0]: + content = "\n".join(content.split("\n")[1:-1]) + def tool_call(**kwargs): + return kwargs + parameters = eval(content) + content = {"name": metadata.strip(), "parameters": parameters} + else: + content = {"name": metadata.strip(), "content": content} + return content + +def post_process(response, model_name,**kwargs): + model_name = model_name.lower() + if "chatglm2" in model_name: + response = glm2_post_process(response) + elif "chatglm3" in model_name: + response = glm3_post_process(response) + else: + response = default_post_process(response) + return response \ No newline at end of file