diff --git a/models/nlp/large_language_model/chatglm3-6b-32k/vllm/README.md b/models/nlp/large_language_model/chatglm3-6b-32k/vllm/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..247ba8e387c66c7ad1ca94c1e1760886fbaf3801
--- /dev/null
+++ b/models/nlp/large_language_model/chatglm3-6b-32k/vllm/README.md
@@ -0,0 +1,57 @@
+# ChatGLM3-6B-32K
+
+## Description
+
+ChatGLM3-6B-32K builds on ChatGLM3-6B and further strengthens long-text understanding, handling contexts of up to 32K tokens. Specifically, the positional encoding was updated and more targeted long-text training methods were designed, using a 32K context length during the training phase. In practice, if your context length is mostly within 8K, we recommend ChatGLM3-6B; if you need to handle context lengths beyond 8K, we recommend ChatGLM3-6B-32K.
+
+## Setup
+
+### Install
+
+To run the model smoothly, you need to get the SDK from the [resource center](https://support.iluvatar.com/#/ProductLine?id=2) of the Iluvatar CoreX official website.
+
+```bash
+# Install libGL
+## CentOS
+yum install -y mesa-libGL
+## Ubuntu
+apt install -y libgl1-mesa-dev
+
+pip3 install transformers==4.37.1
+```
+
+### Download
+
+Pretrained model: 
+
+```bash
+mkdir -p /data/chatglm/
+mv chatglm3-6b-32k /data/chatglm/  # unpack the downloaded .zip/.tar archive first, then move the extracted folder here
+```
+
+## Run model
+
+```bash
+python3 offline_inference.py --model /data/chatglm/chatglm3-6b-32k --trust-remote-code --temperature 0.0 --max-tokens 256
+```
+
+## Use the server
+
+### Start the server
+
+```bash
+python3 -m vllm.entrypoints.openai.api_server --model /data/chatglm/chatglm3-6b-32k --gpu-memory-utilization 0.9 --max-num-batched-tokens 8193 \
+        --max-num-seqs 32 --disable-log-requests --host 127.0.0.1 --port 12345 --trust-remote-code
+```
+
+### Test using the OpenAI interface
+
+```bash
+python3 server_inference.py --host 127.0.0.1 --port 12345 --model_path /data/chatglm/chatglm3-6b-32k
+```
+
+## Results
+
+| Model           | Precision | Tokens | QPS (tokens/s) |
+| --------------- | --------- | ------ | -------------- |
+| ChatGLM3-6B-32K | FP16      | 745    | 110.85         |
diff --git a/models/nlp/large_language_model/chatglm3-6b-32k/vllm/offline_inference.py b/models/nlp/large_language_model/chatglm3-6b-32k/vllm/offline_inference.py
new file mode 100644
index 0000000000000000000000000000000000000000..bc731079f72988cd20c5a68b3ccb4e192769c8fb
--- /dev/null
+++ b/models/nlp/large_language_model/chatglm3-6b-32k/vllm/offline_inference.py
@@ -0,0 +1,127 @@
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
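+
+# Offline batched inference demo for ChatGLM3-6B-32K on vLLM.
+# Example invocation (taken from the README; adjust the model path to your setup):
+#   python3 offline_inference.py --model /data/chatglm/chatglm3-6b-32k \
+#       --trust-remote-code --temperature 0.0 --max-tokens 256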
+
+import sys
+from pathlib import Path
+
+sys.path.append(str(Path(__file__).resolve().parent.parent.parent))
+import argparse
+import dataclasses
+import inspect
+import logging
+import time
+
+import torch
+from utils import load_chat_template, sampling_add_cli_args
+from vllm import LLM, EngineArgs, SamplingParams
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--chat_template", type=str, default=None)
+    parser.add_argument(
+        "--remove_chat_template",
+        default=False,
+        action="store_true",
+        help="pass this if you are not using a chat model",
+    )
+    parser = EngineArgs.add_cli_args(parser)
+    parser = sampling_add_cli_args(parser)
+    args = parser.parse_args()
+
+    engine_args = [attr.name for attr in dataclasses.fields(EngineArgs)]
+    sampling_args = [
+        param.name
+        for param in list(
+            inspect.signature(SamplingParams.__init__).parameters.values()
+        )[1:]
+    ]
+    engine_params = {attr: getattr(args, attr) for attr in engine_args}
+    sampling_params = {
+        attr: getattr(args, attr) for attr in sampling_args if args.__contains__(attr)
+    }
+
+    model_name = args.model.strip()
+    model_name = model_name if args.model[-1] != "/" else model_name[:-1]
+    model_name = model_name.rsplit("/")[-1]
+
+    # Sample prompts.
+    prompts = ["哪些迹象可能表明一个人正在经历焦虑?", "描述一下如何制作芝士披萨。", "写一篇有关5G网络研发的综述文章。"]
+
+    # Create a sampling params object.
+    sampling_params = SamplingParams(**sampling_params)
+
+    # Create an LLM.
+    llm = LLM(**engine_params)
+
+    # process chat template
+    if args.remove_chat_template:
+        if "chat" in model_name.lower():
+            logging.warning(
+                f"The model name parsed from the model path is {model_name}, which looks like a chat model, "
+                "so the input prompt normally needs extra chat-template processing. "
+                "If the results look wrong, make sure you really intend to pass --remove_chat_template on the CLI."
+            )
+        prompts_new = prompts
+    else:
+        # Build the chat-model prompt
+        logging.warning(
+            "If you are using a non-chat model, please pass --remove_chat_template on the CLI."
+        )
+        logging.warning(
+            "For now, the OpenAI chat interface (v1/chat/completions) needs a chat template to process string prompts for good results. "
+            "Otherwise you have to fall back to the default chat template, which may lead to bad answers. However, building the chat input is complex "
+            "for some models and the processing rules cannot always be written as a Jinja file. Fortunately, the v1/completions interface accepts List[int] "
+            "prompts. This means you can process the prompt first, then send the List[int] to v1/completions and use it as if it were v1/chat/completions "
+            "when calling the OpenAI API."
+        )
+        tokenizer = llm.get_tokenizer()
+        prompts_new = []
+        for prompt in prompts:
+            input_idx = (
+                tokenizer.build_chat_input(prompt)["input_ids"][0].cpu().tolist()
+            )
+            prompts_new.append(input_idx)
+
+    # Generate texts from the prompts. The output is a list of RequestOutput objects
+    # that contain the prompt, generated text, and other information.
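+    # The first generate() call below is an untimed warm-up pass; the timed run
+    # between the perf_counter() readings is what the reported token/QPS numbers
+    # are based on.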
+    outputs = (
+        llm.generate(prompts_new, sampling_params, use_tqdm=False)
+        if isinstance(prompts_new[0], str)
+        else llm.generate(
+            sampling_params=sampling_params,
+            prompt_token_ids=prompts_new,
+            use_tqdm=False,
+        )
+    )
+    torch.cuda.synchronize()
+
+    start_time = time.perf_counter()
+    outputs = (
+        llm.generate(prompts_new, sampling_params)
+        if isinstance(prompts_new[0], str)
+        else llm.generate(sampling_params=sampling_params, prompt_token_ids=prompts_new)
+    )
+    torch.cuda.synchronize()
+    end_time = time.perf_counter()
+    duration_time = end_time - start_time
+
+    num_tokens = 0
+    # Print the outputs.
+    for i, output in enumerate(outputs):
+        prompt = prompts[i]  # show the original prompt; the prompt actually used is in "output.prompt"
+        generated_text = output.outputs[0].text
+
+        num_tokens += len(output.outputs[0].token_ids)
+        print(f"Prompt: {prompt}\nGenerated text: {generated_text} \n")
+    print(f"tokens: {num_tokens}, QPS: {num_tokens/duration_time}")
\ No newline at end of file
diff --git a/models/nlp/large_language_model/chatglm3-6b-32k/vllm/server_inference.py b/models/nlp/large_language_model/chatglm3-6b-32k/vllm/server_inference.py
new file mode 100644
index 0000000000000000000000000000000000000000..f6dcf8b88dd25d95c972cee251291c9e515fd9b8
--- /dev/null
+++ b/models/nlp/large_language_model/chatglm3-6b-32k/vllm/server_inference.py
@@ -0,0 +1,81 @@
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+
+import argparse
+import time
+
+from openai import OpenAI
+from transformers import AutoTokenizer
+
+
+def send_request(
+    api_url: str,
+    prompt: str,
+    output_len: int,
+    stream: bool,
+) -> None:
+    client = OpenAI(
+        # defaults to os.environ.get("OPENAI_API_KEY")
+        api_key="EMPTY",
+        base_url=api_url,
+    )
+
+    models = client.models.list()
+    model = models.data[0].id
+
+    completion = client.completions.create(
+        model=model,
+        # messages=[{"role": "user", "content": prompt},],
+        prompt=prompt,
+        n=1,
+        stream=stream,
+        max_tokens=output_len,
+        temperature=0.0,
+    )
+
+    if stream:
+        for each_com in completion:
+            print(each_com)
+    else:
+        print("++++++++++++++++++")
+        print(completion)
+
+
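+# Note: the prompts below are pre-tokenized with the ChatGLM3 tokenizer's
+# build_chat_input(), so chat formatting happens on the client side and the
+# request carries token IDs rather than raw strings (see the comment in
+# offline_inference.py about the v1/completions interface).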
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(
+        description="Benchmark the online serving throughput."
+    )
+    parser.add_argument("--host", type=str, default="localhost")
+    parser.add_argument("--port", type=int, default=8000)
+    parser.add_argument("--stream", action="store_true")
+    parser.add_argument("--output_token", type=int, default=1024)
+    parser.add_argument("--model_path", type=str)
+
+    args = parser.parse_args()
+    api_url = f"http://{args.host}:{args.port}/v1"
+
+    prompts = [
+        "你好",
+        "Which city is the capital of China?",
+        "1 + 1 = ?",
+        "中国的首都是哪里",
+        "请将以下内容翻译为英文:\n你好,我来自中国。",
+    ]
+
+    tokenizer = AutoTokenizer.from_pretrained(args.model_path, trust_remote_code=True)
+    prompts = [tokenizer.build_chat_input(i).input_ids.tolist() for i in prompts]
+
+    for prompt in prompts:
+        send_request(api_url, prompt, args.output_token, args.stream)
\ No newline at end of file
diff --git a/models/nlp/large_language_model/chatglm3-6b-32k/vllm/utils.py b/models/nlp/large_language_model/chatglm3-6b-32k/vllm/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..1fc072d8d51e35109a97c17b5476e7bf3aa1448b
--- /dev/null
+++ b/models/nlp/large_language_model/chatglm3-6b-32k/vllm/utils.py
@@ -0,0 +1,371 @@
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+
+from copy import deepcopy
+from typing import Tuple, List, Union
+
+import codecs
+import logging
+import argparse
+
+# For chat models, or models that require specific input formats, the prompt needs
+# additional processing. If you have extra prompt-processing requirements or error
+# feedback while using the modelzoo, please contact the maintainers and we will
+# update and adapt it.
+
+def sampling_add_cli_args(args: argparse.ArgumentParser) -> argparse.ArgumentParser:
+    args.add_argument(
+        '--n',
+        type=int,
+        default=1,
+        help="Number of output sequences to return for the given prompt.")
+    args.add_argument(
+        '--best-of',
+        type=int,
+        default=None,
+        help="Number of output sequences that are generated from the prompt. "
+        "From these `best_of` sequences, the top `n` sequences are returned. "
+        "`best_of` must be greater than or equal to `n`. This is treated as "
+        "the beam width when `use_beam_search` is True. By default, `best_of` "
+        "is set to `n`.")
+    args.add_argument(
+        '--presence-penalty',
+        type=float,
+        default=0.0,
+        help="Float that penalizes new tokens based on whether they "
+        "appear in the generated text so far. Values > 0 encourage the model "
+        "to use new tokens, while values < 0 encourage the model to repeat "
+        "tokens.")
+    args.add_argument(
+        '--frequency-penalty',
+        type=float,
+        default=0.0,
+        help="Float that penalizes new tokens based on their "
+        "frequency in the generated text so far. Values > 0 encourage the "
+        "model to use new tokens, while values < 0 encourage the model to "
+        "repeat tokens.")
+    args.add_argument(
+        '--repetition-penalty',
+        type=float,
+        default=1.0,
+        help="Float that penalizes new tokens based on whether "
+        "they appear in the prompt and the generated text so far. Values > 1 "
+        "encourage the model to use new tokens, while values < 1 encourage "
+        "the model to repeat tokens.")
+    args.add_argument(
+        '--temperature',
+        type=float,
+        default=1.0,
+        help="Float that controls the randomness of the sampling. Lower "
+        "values make the model more deterministic, while higher values make "
+        "the model more random. Zero means greedy sampling.")
+    args.add_argument(
+        '--top-p',
+        type=float,
+        default=1.0,
+        help="Float that controls the cumulative probability of the top tokens "
+        "to consider. Must be in (0, 1]. Set to 1 to consider all tokens.")
+    args.add_argument(
+        '--top-k',
+        type=int,
+        default=-1,
+        help="Integer that controls the number of top tokens to consider. Set "
+        "to -1 to consider all tokens.")
+    args.add_argument(
+        '--min-p',
+        type=float,
+        default=0.0,
+        help="Float that represents the minimum probability for a token to be "
+        "considered, relative to the probability of the most likely token. "
+        "Must be in [0, 1]. Set to 0 to disable this.")
+    args.add_argument(
+        '--use-beam-search',
+        default=False,
+        action="store_true",
+        help="Whether to use beam search instead of sampling.")
+    args.add_argument(
+        '--length-penalty',
+        type=float,
+        default=1.0,
+        help="Float that penalizes sequences based on their length. Used in beam search.")
+    args.add_argument(
+        '--stop',
+        type=str,
+        default=None,
+        help="List of strings that stop the generation when they are generated. "
+        "The returned output will not contain the stop strings.")
+    args.add_argument(
+        '--stop-token-ids',
+        type=int,
+        default=None,
+        help="List of tokens that stop the generation when they are "
+        "generated. The returned output will contain the stop tokens unless "
+        "the stop tokens are special tokens.")
+    args.add_argument(
+        '--include-stop-str-in-output',
+        default=False,
+        action="store_true",
+        help="Whether to include the stop strings in output text. Defaults to False.")
+    args.add_argument(
+        '--ignore-eos',
+        default=False,
+        action="store_true",
+        help="Whether to ignore the EOS token and continue generating tokens after the EOS token is generated.")
+    args.add_argument(
+        '--max-tokens',
+        type=int,
+        default=16,
+        help="Maximum number of tokens to generate per output sequence.")
+    args.add_argument(
+        '--logprobs',
+        type=int,
+        default=None,
+        help="Number of log probabilities to return per output token. "
+        "Note that the implementation follows the OpenAI API: The return "
+        "result includes the log probabilities on the `logprobs` most likely "
+        "tokens, as well as the chosen tokens. The API will always return the "
+        "log probability of the sampled token, so there may be up to "
+        "`logprobs+1` elements in the response.")
+    args.add_argument(
+        '--prompt-logprobs',
+        type=int,
+        default=None,
+        help="Number of log probabilities to return per prompt token.")
+    args.add_argument(
+        '--skip-special-tokens',
+        default=True,
+        action="store_false",
+        help="Whether to skip special tokens in the output.")
+    args.add_argument(
+        '--spaces-between-special-tokens',
+        default=True,
+        action="store_false",
+        help="Whether to add spaces between special tokens in the output. Defaults to True.")
+    # early_stopping logits_processors seed
+    return args
+
+
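+# Minimal usage sketch (mirroring offline_inference.py in this directory): the
+# sampling flags above are meant to be added to the same parser as vLLM's
+# engine flags, e.g.
+#
+#     parser = argparse.ArgumentParser()
+#     parser = EngineArgs.add_cli_args(parser)
+#     parser = sampling_add_cli_args(parser)
+#     args = parser.parse_args()
+#
+# after which EngineArgs/SamplingParams can be built from the parsed namespace.
+
+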
+def load_chat_template(tokenizer, chat_template):
+    if chat_template is not None:
+        try:
+            with open(chat_template, "r") as f:
+                tokenizer.chat_template = f.read()
+        except OSError:
+            # If opening the file fails, treat chat_template as the template string
+            # itself and decode it so escape sequences are interpreted correctly
+            tokenizer.chat_template = codecs.decode(
+                chat_template, "unicode_escape")
+
+        logging.info(
+            f"Using supplied chat template:\n{tokenizer.chat_template}"
+        )
+    elif tokenizer.chat_template is not None:
+        logging.info(
+            f"Using default chat template:\n{tokenizer.chat_template}"
+        )
+    else:
+        logging.warning(
+            "No chat template provided. Chat API will not work.")
+
+def default_build_chat(tokenizer, prompt):
+    return prompt
+
+def chatglm2_build_chat(tokenizer, prompt):
+    return tokenizer.build_prompt(prompt)
+
+def chatglm3_build_chat(tokenizer, prompt):
+    return tokenizer.build_chat_input(prompt).input_ids[0].tolist()
+
+def llama2_build_chat(tokenizer, prompt):
+    return f"[INST]{prompt}[/INST]"
+
+# adapted from https://huggingface.co/baichuan-inc/Baichuan2-13B-Chat/blob/main/generation_utils.py
+def baichuan2_build_chat(tokenizer, prompt, max_new_tokens=512):
+    def _parse_messages(messages, split_role="user"):
+        system, rounds = "", []
+        round = []
+        for i, message in enumerate(messages):
+            if message["role"] == "system":
+                assert i == 0
+                system = message["content"]
+                continue
+            if message["role"] == split_role and round:
+                rounds.append(round)
+                round = []
+            round.append(message)
+        if round:
+            rounds.append(round)
+        return system, rounds
+
+    messages = [{"role": "user", "content": f"{prompt}"}]
+    max_new_tokens = max_new_tokens
+    max_input_tokens = 4096 - max_new_tokens
+    system, rounds = _parse_messages(messages, split_role="user")
+    system_tokens = tokenizer.encode(system)
+    max_history_tokens = max_input_tokens - len(system_tokens)
+
+    history_tokens = []
+    for round in rounds[::-1]:
+        round_tokens = []
+        for message in round:
+            if message["role"] == "user":
+                round_tokens.append(195)
+            else:
+                round_tokens.append(196)
+            round_tokens.extend(tokenizer.encode(message["content"]))
+        if len(history_tokens) == 0 or len(history_tokens) + len(round_tokens) <= max_history_tokens:
+            history_tokens = round_tokens + history_tokens  # concat left
+            if len(history_tokens) < max_history_tokens:
+                continue
+        break
+
+    input_tokens = system_tokens + history_tokens
+    if messages[-1]["role"] != "assistant":
+        input_tokens.append(196)
+    input_tokens = input_tokens[-max_input_tokens:]  # truncate left
+    return input_tokens

+def qwen_build_chat(
+    tokenizer,
+    query: str,
+    history: List[Tuple[str, str]] = None,
+    system: str = "",
+    max_window_size: int = 6144,
+    chat_format: str = "chatml",
+):
+    if history is None:
+        history = []
+
+    if chat_format == "chatml":
+        im_start, im_end = "<|im_start|>", "<|im_end|>"
+        im_start_tokens = [tokenizer.im_start_id]
+        im_end_tokens = [tokenizer.im_end_id]
+        nl_tokens = tokenizer.encode("\n")
+
+        def _tokenize_str(role, content):
+            return f"{role}\n{content}", tokenizer.encode(
+                role, allowed_special=set()
+            ) + nl_tokens + tokenizer.encode(content, allowed_special=set())
+
+        system_text, system_tokens_part = _tokenize_str("system", system)
+        system_tokens = im_start_tokens + system_tokens_part + im_end_tokens
+
+        raw_text = ""
+        context_tokens = []
+
+        for turn_query, turn_response in reversed(history):
+            query_text, query_tokens_part = _tokenize_str("user", turn_query)
+            query_tokens = im_start_tokens + query_tokens_part + im_end_tokens
+            response_text, response_tokens_part = _tokenize_str(
+                "assistant", turn_response
+            )
+            response_tokens = im_start_tokens + response_tokens_part + im_end_tokens
+
+            next_context_tokens = nl_tokens + query_tokens + nl_tokens + response_tokens
+            prev_chat = (
+                f"\n{im_start}{query_text}{im_end}\n{im_start}{response_text}{im_end}"
+            )
+
+            current_context_size = (
+                len(system_tokens) + len(next_context_tokens) + len(context_tokens)
+            )
+            if current_context_size < max_window_size:
+                context_tokens = next_context_tokens + context_tokens
+                raw_text = prev_chat + raw_text
+            else:
+                break
+
+        context_tokens = system_tokens + context_tokens
+        raw_text = f"{im_start}{system_text}{im_end}" + raw_text
+        context_tokens += (
+            nl_tokens
+            + im_start_tokens
+            + _tokenize_str("user", query)[1]
+            + im_end_tokens
+            + nl_tokens
+            + im_start_tokens
+            + tokenizer.encode("assistant")
+            + nl_tokens
+        )
+        raw_text += f"\n{im_start}user\n{query}{im_end}\n{im_start}assistant\n"
+
+    elif chat_format == "raw":
+        raw_text = query
+        context_tokens = tokenizer.encode(raw_text)
+    else:
+        raise NotImplementedError(f"Unknown chat format {chat_format!r}")
+
+    return raw_text, context_tokens
+
+def codellama_build_chat(tokenizer, prompt):
+    return "[INST] Write code to solve the following coding problem that obeys the constraints and passes the example test cases. Please wrap your code answer using ```:{}[/INST]".format(prompt)
+
+def build_chat(tokenizer, prompt, model_name, **kwargs):
+    model_name = model_name.lower()
+    # return str or list[int]
+    if "chatglm2" in model_name:
+        prompt = chatglm2_build_chat(tokenizer, prompt)
+    elif "chatglm3" in model_name:
+        prompt = chatglm3_build_chat(tokenizer, prompt)
+    elif "llama2" in model_name and 'chat' in model_name:
+        prompt = llama2_build_chat(tokenizer, prompt)
+    elif "baichuan2" in model_name and 'chat' in model_name:
+        prompt = baichuan2_build_chat(tokenizer, prompt, kwargs['max_length'])
+    elif "qwen" in model_name and 'chat' in model_name:
+        prompt = qwen_build_chat(tokenizer, prompt)
+    elif "code" in model_name and 'llama' in model_name:
+        prompt = codellama_build_chat(tokenizer, prompt)
+    else:
+        prompt = default_build_chat(tokenizer, prompt)
+    return prompt
+
+
+# for output post-processing
+def default_post_process(output):
+    return output
+
+def glm2_post_process(output):
+    output = output.strip()
+    output = output.replace("[[训练时间]]", "2023年")
+    return output
+
+def glm3_post_process(output, history=[]):
+    content = ""
+    history = deepcopy(history)
+    for response in output.split("<|assistant|>"):
+        metadata, content = response.split("\n", maxsplit=1)
+        if not metadata.strip():
+            content = content.strip()
+            history.append({"role": "assistant", "metadata": metadata, "content": content})
+            content = content.replace("[[训练时间]]", "2023年")
+        else:
+            history.append({"role": "assistant", "metadata": metadata, "content": content})
+            if history[0]["role"] == "system" and "tools" in history[0]:
+                content = "\n".join(content.split("\n")[1:-1])
+                def tool_call(**kwargs):
+                    return kwargs
+                parameters = eval(content)
+                content = {"name": metadata.strip(), "parameters": parameters}
+            else:
+                content = {"name": metadata.strip(), "content": content}
+    return content
+
+def post_process(response, model_name, **kwargs):
+    model_name = model_name.lower()
+    if "chatglm2" in model_name:
+        response = glm2_post_process(response)
+    elif "chatglm3" in model_name:
+        response = glm3_post_process(response)
+    else:
+        response = default_post_process(response)
+    return response
\ No newline at end of file