diff --git a/models/nlp/large_language_model/qwen1.5-14b/vllm/README.md b/models/nlp/large_language_model/qwen1.5-14b/vllm/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..0d6d04ae0aec6aef451dc35fa5c25be574ff7765
--- /dev/null
+++ b/models/nlp/large_language_model/qwen1.5-14b/vllm/README.md
@@ -0,0 +1,44 @@
+# Qwen1.5-14B
+
+## Description
+
+Qwen1.5 is a language model series that includes decoder language models of different sizes. For each size, we release the base language model and the aligned chat model. It is based on the Transformer architecture with SwiGLU activation, attention QKV bias, group query attention, a mixture of sliding window attention and full attention, and more. Additionally, we have an improved tokenizer adapted to multiple natural languages and code. For the beta version, we have temporarily not included GQA (except for 32B) or the mixture of SWA and full attention.
+
+## Setup
+
+### Install
+
+```bash
+# Install libGL
+## CentOS
+yum install -y mesa-libGL
+## Ubuntu
+apt install -y libgl1-mesa-dev
+
+# Please contact the staff to obtain the relevant installation packages.
+pip3 install Path/To/bitsandbytes-xxx.whl
+pip3 install Path/To/flash_atten-xxx.whl
+pip3 install Path/To/ixformer-xxx.whl
+pip3 install Path/To/vllm-xxx.whl
+pip3 install Path/To/eetq-xxx.whl
+```
+
+### Download
+
+- Model:
+
+```bash
+mkdir data/qwen1.5
+```
+
+## Inference
+
+```bash
+python3 offline_inference.py --model /data/qwen1.5/$MODEL_ID --max-tokens 256 -tp 1 --temperature 0.0 --max-model-len 1024
+```
+
+## Results
+
+| Model       | QPS   |
+| ----------- | ----- |
+| Qwen1.5-14B | 57.15 |
diff --git a/models/nlp/large_language_model/qwen1.5-14b/vllm/offline_inference.py b/models/nlp/large_language_model/qwen1.5-14b/vllm/offline_inference.py
new file mode 100644
index 0000000000000000000000000000000000000000..7ee127a259eb78f91d71c07b4a129464e0cc6cd3
--- /dev/null
+++ b/models/nlp/large_language_model/qwen1.5-14b/vllm/offline_inference.py
@@ -0,0 +1,110 @@
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
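+
+# Offline vLLM inference example for Qwen1.5: parse engine and sampling options
+# from the CLI, optionally apply the tokenizer's chat template to the sample
+# prompts, then run one warmup generation followed by a timed generation and
+# report QPS (generated tokens per second).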
+
+import sys
+from pathlib import Path
+sys.path.append(str(Path(__file__).resolve().parent.parent.parent))
+from utils import load_chat_template, sampling_add_cli_args
+
+import logging
+import time
+import argparse
+import dataclasses
+import inspect
+
+import torch
+from vllm import LLM, SamplingParams, EngineArgs
+
+
+parser = argparse.ArgumentParser()
+parser.add_argument("--chat_template", type=str, default=None)
+parser.add_argument("--remove_chat_template", default=False, action="store_true", help="pass this if you are not using a chat model")
+parser = EngineArgs.add_cli_args(parser)
+parser = sampling_add_cli_args(parser)
+args = parser.parse_args()
+
+engine_args = [attr.name for attr in dataclasses.fields(EngineArgs)]
+sampling_args = [param.name for param in list(inspect.signature(SamplingParams.__init__).parameters.values())[1:]]
+engine_params = {attr: getattr(args, attr) for attr in engine_args}
+sampling_params = {attr: getattr(args, attr) for attr in sampling_args if attr in args}
+
+model_name = args.model.strip()
+model_name = model_name if args.model[-1] != '/' else model_name[:-1]
+model_name = model_name.rsplit('/')[-1]
+
+
+# Sample prompts.
+prompts = [
+    "哪些迹象可能表明一个人正在经历焦虑?",
+    "描述一下如何制作芝士披萨。",
+    "写一篇有关5G网络研发的综述文章。"
+]
+
+# Create a sampling params object.
+sampling_params = SamplingParams(**sampling_params)
+
+# Create an LLM.
+llm = LLM(**engine_params)
+
+# Process the chat template.
+if args.remove_chat_template:
+    if 'chat' in model_name.lower():
+        logging.warning(f"The model name inferred from the model path is {model_name}, so we assume you are using a chat model, whose input prompt requires additional processing. "
+            f"If the result is not quite correct, please ensure you do not pass --remove_chat_template on the CLI.")
+    prompts_new = prompts
+else:
+    # Build the chat-model prompt.
+    logging.warning("If you are using a non-chat model, please pass --remove_chat_template on the CLI.")
+    # Try to use transformers' apply_chat_template; if chat_template is None, the default template is used.
+    # For some old models, the default template may produce bad answers. We do not handle that case here,
+    # because the Transformers team is actively improving chat templates. For more information,
+    # please refer to https://huggingface.co/docs/transformers/main/chat_templating
+    try:
+        load_chat_template(llm.get_tokenizer(), args.chat_template)
+        prompts_new = []
+        for prompt in prompts:
+            messages = [
+                {"role": "user", "content": prompt}
+            ]
+            text = llm.get_tokenizer().apply_chat_template(
+                messages,
+                tokenize=False,
+                add_generation_prompt=True
+            )
+            prompts_new.append(text)
+    except Exception:
+        logging.warning("Calling the tokenizer's apply_chat_template failed, possibly because of an old transformers version (try transformers>=4.34.0).")
+        prompts_new = prompts
+
+# Generate texts from the prompts. The output is a list of RequestOutput objects
+# that contain the prompt, generated text, and other information.
+outputs = llm.generate(prompts_new, sampling_params, use_tqdm=False) if isinstance(prompts_new[0], str) else llm.generate(sampling_params=sampling_params, prompt_token_ids=prompts_new, use_tqdm=False)
+torch.cuda.synchronize()
+
+start_time = time.perf_counter()
+outputs = llm.generate(prompts_new, sampling_params) if isinstance(prompts_new[0], str) else llm.generate(sampling_params=sampling_params, prompt_token_ids=prompts_new)
+torch.cuda.synchronize()
+end_time = time.perf_counter()
+duration_time = end_time - start_time
+
+num_tokens = 0
+# Print the outputs.
+for i, output in enumerate(outputs): + prompt = prompts[i] # show the origin prompt. actully prompt is "output.prompt" + generated_text = output.outputs[0].text + + num_tokens += len(output.outputs[0].token_ids) + print(f"Prompt: {prompt}\nGenerated text: {generated_text} \n") +print(f"tokens: {num_tokens}, QPS: {num_tokens/duration_time}") diff --git a/models/nlp/large_language_model/qwen1.5-14b/vllm/utils.py b/models/nlp/large_language_model/qwen1.5-14b/vllm/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..c6def85dedc08ef9c3a489ce9dc5b1ff4a5e48b0 --- /dev/null +++ b/models/nlp/large_language_model/qwen1.5-14b/vllm/utils.py @@ -0,0 +1,173 @@ +# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. + +import codecs +import logging +import argparse + + +def sampling_add_cli_args(args: argparse.ArgumentParser) -> argparse.ArgumentParser: + args.add_argument( + '--n', + type=int, + default=1, + help="Number of output sequences to return for the given prompt.") + args.add_argument( + '--best-of', + type=int, + default=None, + help="Number of output sequences that are generated from the prompt. " + "From these `best_of` sequences, the top `n` sequences are returned. " + "`best_of` must be greater than or equal to `n`. This is treated as " + "the beam width when `use_beam_search` is True. By default, `best_of`" + "is set to `n`.") + args.add_argument( + '--presence-penalty', + type=float, + default=0.0, + help="Float that penalizes new tokens based on whether they " + "appear in the generated text so far. Values > 0 encourage the model " + "to use new tokens, while values < 0 encourage the model to repeat " + "tokens.") + args.add_argument( + '--frequency-penalty', + type=float, + default=0.0, + help="Float that penalizes new tokens based on their " + " frequency in the generated text so far. Values > 0 encourage the " + " model to use new tokens, while values < 0 encourage the model to " + "repeat tokens.") + args.add_argument( + '--repetition-penalty', + type=float, + default=1.0, + help="Float that penalizes new tokens based on whether " + "they appear in the prompt and the generated text so far. Values > 1 " + "encourage the model to use new tokens, while values < 1 encourage " + "the model to repeat tokens.") + args.add_argument( + '--temperature', + type=float, + default=1.0, + help="Float that controls the randomness of the sampling. Lower " + "values make the model more deterministic, while higher values make " + "the model more random. Zero means greedy sampling.") + args.add_argument( + '--top-p', + type=float, + default=1.0, + help="Float that controls the cumulative probability of the top tokens " + "to consider. Must be in (0, 1]. Set to 1 to consider all tokens.") + args.add_argument( + '--top-k', + type=int, + default=-1, + help="Integer that controls the number of top tokens to consider. 
Set " + "to -1 to consider all tokens.") + args.add_argument( + '--min-p', + type=float, + default=0.0, + help="Float that represents the minimum probability for a token to be " + "considered, relative to the probability of the most likely token. " + "Must be in [0, 1]. Set to 0 to disable this.") + args.add_argument( + '--use-beam-search', + default=False, + action="store_true", + help="Whether to use beam search instead of sampling.") + args.add_argument( + '--length-penalty', + type=float, + default=1.0, + help="Float that penalizes sequences based on their length. Used in beam search.") + args.add_argument( + '--stop', + type=str, + default=None, + help="List of strings that stop the generation when they are generated. " + "The returned output will not contain the stop strings.") + args.add_argument( + '--stop-token-ids', + type=int, + default=None, + help="List of tokens that stop the generation when they are " + "generated. The returned output will contain the stop tokens unless " + "the stop tokens are special tokens.") + args.add_argument( + '--include-stop-str-in-output', + default=False, + action="store_true", + help="Whether to include the stop strings in output text. Defaults to False.") + args.add_argument( + '--ignore-eos', + default=False, + action="store_true", + help="Whether to ignore the EOS token and continue generating tokens after the EOS token is generated.") + args.add_argument( + '--max-tokens', + type=int, + default=16, + help="Maximum number of tokens to generate per output sequence.") + args.add_argument( + '--logprobs', + type=int, + default=None, + help="NNumber of log probabilities to return per output token. " + "Note that the implementation follows the OpenAI API: The return " + "result includes the log probabilities on the `logprobs` most likely " + "tokens, as well the chosen tokens. The API will always return the " + "log probability of the sampled token, so there may be up to " + "`logprobs+1` elements in the response.") + args.add_argument( + '--prompt-logprobs', + type=int, + default=None, + help="Number of log probabilities to return per prompt token.") + args.add_argument( + '--skip-special-tokens', + default=True, + action="store_false", + help="Whether to skip special tokens in the output.") + args.add_argument( + '--spaces-between-special-tokens', + default=True, + action="store_false", + help="Whether to add spaces between special tokens in the output. Defaults to True.") + # early_stopping logits_processors seed + return args + + +def load_chat_template(tokenizer, chat_template): + if chat_template is not None: + try: + with open(chat_template, "r") as f: + tokenizer.chat_template = f.read() + except OSError: + # If opening a file fails, set chat template to be args to + # ensure we decode so our escape are interpreted correctly + tokenizer.chat_template = codecs.decode( + chat_template, "unicode_escape") + + logging.info( + f"Using supplied chat template:\n{tokenizer.chat_template}" + ) + elif tokenizer.chat_template is not None: + logging.info( + f"Using default chat template:\n{tokenizer.chat_template}. This May lead to unsatisfactory results. You can provide a template.jinja file for vllm." + ) + else: + logging.warning( + "No chat template provided. Chat API will not work. This May lead to unsatisfactory results. 
You can provide a template.jinja file for vllm.")
diff --git a/models/nlp/large_language_model/qwen1.5-72b/vllm/README.md b/models/nlp/large_language_model/qwen1.5-72b/vllm/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..9b86d87c0b77eff5cbb16a5f37de7377a61d7d42
--- /dev/null
+++ b/models/nlp/large_language_model/qwen1.5-72b/vllm/README.md
@@ -0,0 +1,45 @@
+# Qwen1.5-72B
+
+## Description
+
+Qwen1.5 is a language model series that includes decoder language models of different sizes. For each size, we release the base language model and the aligned chat model. It is based on the Transformer architecture with SwiGLU activation, attention QKV bias, group query attention, a mixture of sliding window attention and full attention, and more. Additionally, we have an improved tokenizer adapted to multiple natural languages and code. For the beta version, we have temporarily not included GQA (except for 32B) or the mixture of SWA and full attention.
+
+## Setup
+
+### Install
+
+```bash
+# Install libGL
+## CentOS
+yum install -y mesa-libGL
+## Ubuntu
+apt install -y libgl1-mesa-dev
+
+# Please contact the staff to obtain the relevant installation packages.
+pip3 install Path/To/bitsandbytes-xxx.whl
+pip3 install Path/To/flash_atten-xxx.whl
+pip3 install Path/To/ixformer-xxx.whl
+pip3 install Path/To/vllm-xxx.whl
+pip3 install Path/To/eetq-xxx.whl
+```
+
+### Download
+
+- Model:
+
+```bash
+mkdir data/qwen1.5
+```
+
+## Inference
+
+```bash
+export CUDA_VISIBLE_DEVICES=0,1
+python3 offline_inference.py --model /data/qwen1.5/$MODEL_ID --max-tokens 256 -tp 2 --temperature 0.0 --max-model-len 3096
+```
+
+## Results
+
+| Model       | QPS   |
+| ----------- | ----- |
+| Qwen1.5-72B | 41.24 |
diff --git a/models/nlp/large_language_model/qwen1.5-72b/vllm/offline_inference.py b/models/nlp/large_language_model/qwen1.5-72b/vllm/offline_inference.py
new file mode 100644
index 0000000000000000000000000000000000000000..7ee127a259eb78f91d71c07b4a129464e0cc6cd3
--- /dev/null
+++ b/models/nlp/large_language_model/qwen1.5-72b/vllm/offline_inference.py
@@ -0,0 +1,110 @@
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
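+
+# Offline vLLM inference example for Qwen1.5: parse engine and sampling options
+# from the CLI, optionally apply the tokenizer's chat template to the sample
+# prompts, then run one warmup generation followed by a timed generation and
+# report QPS (generated tokens per second).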
+
+import sys
+from pathlib import Path
+sys.path.append(str(Path(__file__).resolve().parent.parent.parent))
+from utils import load_chat_template, sampling_add_cli_args
+
+import logging
+import time
+import argparse
+import dataclasses
+import inspect
+
+import torch
+from vllm import LLM, SamplingParams, EngineArgs
+
+
+parser = argparse.ArgumentParser()
+parser.add_argument("--chat_template", type=str, default=None)
+parser.add_argument("--remove_chat_template", default=False, action="store_true", help="pass this if you are not using a chat model")
+parser = EngineArgs.add_cli_args(parser)
+parser = sampling_add_cli_args(parser)
+args = parser.parse_args()
+
+engine_args = [attr.name for attr in dataclasses.fields(EngineArgs)]
+sampling_args = [param.name for param in list(inspect.signature(SamplingParams.__init__).parameters.values())[1:]]
+engine_params = {attr: getattr(args, attr) for attr in engine_args}
+sampling_params = {attr: getattr(args, attr) for attr in sampling_args if attr in args}
+
+model_name = args.model.strip()
+model_name = model_name if args.model[-1] != '/' else model_name[:-1]
+model_name = model_name.rsplit('/')[-1]
+
+
+# Sample prompts.
+prompts = [
+    "哪些迹象可能表明一个人正在经历焦虑?",
+    "描述一下如何制作芝士披萨。",
+    "写一篇有关5G网络研发的综述文章。"
+]
+
+# Create a sampling params object.
+sampling_params = SamplingParams(**sampling_params)
+
+# Create an LLM.
+llm = LLM(**engine_params)
+
+# Process the chat template.
+if args.remove_chat_template:
+    if 'chat' in model_name.lower():
+        logging.warning(f"The model name inferred from the model path is {model_name}, so we assume you are using a chat model, whose input prompt requires additional processing. "
+            f"If the result is not quite correct, please ensure you do not pass --remove_chat_template on the CLI.")
+    prompts_new = prompts
+else:
+    # Build the chat-model prompt.
+    logging.warning("If you are using a non-chat model, please pass --remove_chat_template on the CLI.")
+    # Try to use transformers' apply_chat_template; if chat_template is None, the default template is used.
+    # For some old models, the default template may produce bad answers. We do not handle that case here,
+    # because the Transformers team is actively improving chat templates. For more information,
+    # please refer to https://huggingface.co/docs/transformers/main/chat_templating
+    try:
+        load_chat_template(llm.get_tokenizer(), args.chat_template)
+        prompts_new = []
+        for prompt in prompts:
+            messages = [
+                {"role": "user", "content": prompt}
+            ]
+            text = llm.get_tokenizer().apply_chat_template(
+                messages,
+                tokenize=False,
+                add_generation_prompt=True
+            )
+            prompts_new.append(text)
+    except Exception:
+        logging.warning("Calling the tokenizer's apply_chat_template failed, possibly because of an old transformers version (try transformers>=4.34.0).")
+        prompts_new = prompts
+
+# Generate texts from the prompts. The output is a list of RequestOutput objects
+# that contain the prompt, generated text, and other information.
+outputs = llm.generate(prompts_new, sampling_params, use_tqdm=False) if isinstance(prompts_new[0], str) else llm.generate(sampling_params=sampling_params, prompt_token_ids=prompts_new, use_tqdm=False)
+torch.cuda.synchronize()
+
+start_time = time.perf_counter()
+outputs = llm.generate(prompts_new, sampling_params) if isinstance(prompts_new[0], str) else llm.generate(sampling_params=sampling_params, prompt_token_ids=prompts_new)
+torch.cuda.synchronize()
+end_time = time.perf_counter()
+duration_time = end_time - start_time
+
+num_tokens = 0
+# Print the outputs.
+for i, output in enumerate(outputs): + prompt = prompts[i] # show the origin prompt. actully prompt is "output.prompt" + generated_text = output.outputs[0].text + + num_tokens += len(output.outputs[0].token_ids) + print(f"Prompt: {prompt}\nGenerated text: {generated_text} \n") +print(f"tokens: {num_tokens}, QPS: {num_tokens/duration_time}") diff --git a/models/nlp/large_language_model/qwen1.5-72b/vllm/utils.py b/models/nlp/large_language_model/qwen1.5-72b/vllm/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..c6def85dedc08ef9c3a489ce9dc5b1ff4a5e48b0 --- /dev/null +++ b/models/nlp/large_language_model/qwen1.5-72b/vllm/utils.py @@ -0,0 +1,173 @@ +# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. + +import codecs +import logging +import argparse + + +def sampling_add_cli_args(args: argparse.ArgumentParser) -> argparse.ArgumentParser: + args.add_argument( + '--n', + type=int, + default=1, + help="Number of output sequences to return for the given prompt.") + args.add_argument( + '--best-of', + type=int, + default=None, + help="Number of output sequences that are generated from the prompt. " + "From these `best_of` sequences, the top `n` sequences are returned. " + "`best_of` must be greater than or equal to `n`. This is treated as " + "the beam width when `use_beam_search` is True. By default, `best_of`" + "is set to `n`.") + args.add_argument( + '--presence-penalty', + type=float, + default=0.0, + help="Float that penalizes new tokens based on whether they " + "appear in the generated text so far. Values > 0 encourage the model " + "to use new tokens, while values < 0 encourage the model to repeat " + "tokens.") + args.add_argument( + '--frequency-penalty', + type=float, + default=0.0, + help="Float that penalizes new tokens based on their " + " frequency in the generated text so far. Values > 0 encourage the " + " model to use new tokens, while values < 0 encourage the model to " + "repeat tokens.") + args.add_argument( + '--repetition-penalty', + type=float, + default=1.0, + help="Float that penalizes new tokens based on whether " + "they appear in the prompt and the generated text so far. Values > 1 " + "encourage the model to use new tokens, while values < 1 encourage " + "the model to repeat tokens.") + args.add_argument( + '--temperature', + type=float, + default=1.0, + help="Float that controls the randomness of the sampling. Lower " + "values make the model more deterministic, while higher values make " + "the model more random. Zero means greedy sampling.") + args.add_argument( + '--top-p', + type=float, + default=1.0, + help="Float that controls the cumulative probability of the top tokens " + "to consider. Must be in (0, 1]. Set to 1 to consider all tokens.") + args.add_argument( + '--top-k', + type=int, + default=-1, + help="Integer that controls the number of top tokens to consider. 
Set " + "to -1 to consider all tokens.") + args.add_argument( + '--min-p', + type=float, + default=0.0, + help="Float that represents the minimum probability for a token to be " + "considered, relative to the probability of the most likely token. " + "Must be in [0, 1]. Set to 0 to disable this.") + args.add_argument( + '--use-beam-search', + default=False, + action="store_true", + help="Whether to use beam search instead of sampling.") + args.add_argument( + '--length-penalty', + type=float, + default=1.0, + help="Float that penalizes sequences based on their length. Used in beam search.") + args.add_argument( + '--stop', + type=str, + default=None, + help="List of strings that stop the generation when they are generated. " + "The returned output will not contain the stop strings.") + args.add_argument( + '--stop-token-ids', + type=int, + default=None, + help="List of tokens that stop the generation when they are " + "generated. The returned output will contain the stop tokens unless " + "the stop tokens are special tokens.") + args.add_argument( + '--include-stop-str-in-output', + default=False, + action="store_true", + help="Whether to include the stop strings in output text. Defaults to False.") + args.add_argument( + '--ignore-eos', + default=False, + action="store_true", + help="Whether to ignore the EOS token and continue generating tokens after the EOS token is generated.") + args.add_argument( + '--max-tokens', + type=int, + default=16, + help="Maximum number of tokens to generate per output sequence.") + args.add_argument( + '--logprobs', + type=int, + default=None, + help="NNumber of log probabilities to return per output token. " + "Note that the implementation follows the OpenAI API: The return " + "result includes the log probabilities on the `logprobs` most likely " + "tokens, as well the chosen tokens. The API will always return the " + "log probability of the sampled token, so there may be up to " + "`logprobs+1` elements in the response.") + args.add_argument( + '--prompt-logprobs', + type=int, + default=None, + help="Number of log probabilities to return per prompt token.") + args.add_argument( + '--skip-special-tokens', + default=True, + action="store_false", + help="Whether to skip special tokens in the output.") + args.add_argument( + '--spaces-between-special-tokens', + default=True, + action="store_false", + help="Whether to add spaces between special tokens in the output. Defaults to True.") + # early_stopping logits_processors seed + return args + + +def load_chat_template(tokenizer, chat_template): + if chat_template is not None: + try: + with open(chat_template, "r") as f: + tokenizer.chat_template = f.read() + except OSError: + # If opening a file fails, set chat template to be args to + # ensure we decode so our escape are interpreted correctly + tokenizer.chat_template = codecs.decode( + chat_template, "unicode_escape") + + logging.info( + f"Using supplied chat template:\n{tokenizer.chat_template}" + ) + elif tokenizer.chat_template is not None: + logging.info( + f"Using default chat template:\n{tokenizer.chat_template}. This May lead to unsatisfactory results. You can provide a template.jinja file for vllm." + ) + else: + logging.warning( + "No chat template provided. Chat API will not work. This May lead to unsatisfactory results. 
You can provide a template.jinja file for vllm.")
diff --git a/models/nlp/large_language_model/qwen1.5-7b/text-generation-inference/README.md b/models/nlp/large_language_model/qwen1.5-7b/text-generation-inference/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..eedad292edf589050c017bf72469c38c2182513c
--- /dev/null
+++ b/models/nlp/large_language_model/qwen1.5-7b/text-generation-inference/README.md
@@ -0,0 +1,60 @@
+# Qwen1.5-7B
+
+## Description
+
+Qwen1.5 is a language model series that includes decoder language models of different sizes. For each size, we release the base language model and the aligned chat model. It is based on the Transformer architecture with SwiGLU activation, attention QKV bias, group query attention, a mixture of sliding window attention and full attention, and more. Additionally, we have an improved tokenizer adapted to multiple natural languages and code. For the beta version, we have temporarily not included GQA (except for 32B) or the mixture of SWA and full attention.
+
+## Setup
+
+### Install
+
+```bash
+# Install libGL
+## CentOS
+yum install -y mesa-libGL
+## Ubuntu
+apt install -y libgl1-mesa-dev
+
+# Please contact the staff to obtain the relevant installation packages.
+pip3 install Path/To/bitsandbytes-xxx.whl
+pip3 install Path/To/flash_atten-xxx.whl
+pip3 install Path/To/ixformer-xxx.whl
+pip3 install Path/To/vllm-xxx.whl
+pip3 install Path/To/eetq-xxx.whl
+pip3 install Path/To/text-generation-xxx.whl
+pip3 install Path/To/text-generation-server-xxx.whl
+```
+
+### Download
+
+- Model:
+
+```bash
+mkdir data/qwen1.5
+```
+
+## Inference
+
+### Start webserver
+
+#### Single GPU
+
+```bash
+# Use one docker container to start the webserver
+export PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python
+CUDA_VISIBLE_DEVICES=0 USE_FLASH_ATTENTION=true text-generation-launcher --model-id $PROJECT_DIR/data/qwen1.5/$MODEL_ID --sharded false --dtype float16 --disable-custom-kernels --port 8001 --max-input-length 2048 --max-batch-prefill-tokens 2048 --max-total-tokens 4096 --max-batch-total-tokens 4096
+```
+
+#### Offline test
+
+```bash
+# Use another docker container to run the offline test
+export CUDA_VISIBLE_DEVICES=2
+python3 offline_inference.py --model2path /data/qwen1.5/$MODEL_ID
+```
+
+## Results
+
+| Model      | QPS   |
+| ---------- | ----- |
+| Qwen1.5-7B | 39.11 |
diff --git a/models/nlp/large_language_model/qwen1.5-7b/text-generation-inference/offline_inference.py b/models/nlp/large_language_model/qwen1.5-7b/text-generation-inference/offline_inference.py
new file mode 100644
index 0000000000000000000000000000000000000000..b927973a76953e189d4c4ebd4ee10bc392e0b4f0
--- /dev/null
+++ b/models/nlp/large_language_model/qwen1.5-7b/text-generation-inference/offline_inference.py
@@ -0,0 +1,179 @@
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
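+
+# Offline text-generation-inference benchmark for Qwen1.5: load the FlashQwen2
+# model directly, run a warmup batch, then decode a single prompt token by token
+# and report QPS (generated tokens per second).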
+ +from text_generation_server.models.flash_qwen2 import ( + FlashQwen2, +) +import torch +from text_generation_server.pb import generate_pb2 + +import time +from torch.cuda import profiler +from text_generation_server.utils.speculate import set_speculate +import argparse + +def parse_args(args=None): + parser = argparse.ArgumentParser() + parser.add_argument('--generate_length', type=int, default=512) + parser.add_argument('--model2path', type=str, default="/home/data/nlp/qwen2/Qwen1.5-0.5B") + parser.add_argument('--quantize', type=str, default=None, choices=['awq']) + parser.add_argument('--speculate', type=int, default=0) + + return parser.parse_args(args) + +if __name__ == "__main__": + args = parse_args() + + max_input_length = 2048 + max_prefill_tokens = 2048 + + set_speculate(args.speculate) + model = FlashQwen2(args.model2path, trust_remote_code=True, quantize=args.quantize) + + first_line = "蒙古国的首都是乌兰巴托(Ulaanbaatar)\n冰岛的首都是雷克雅未克(Reykjavik)\n埃塞俄比亚的首都是" + + default_pb_parameters = generate_pb2.NextTokenChooserParameters( + temperature=1.0, + repetition_penalty=1.0, + top_k=0, + top_p=1, + typical_p=1.0, + do_sample=False, + ) + + default_pb_stop_parameters = generate_pb2.StoppingCriteriaParameters(stop_sequences=[], max_new_tokens=args.generate_length) + + warmup_requests = generate_pb2.Request( + id=0, + inputs="_test " * max_input_length, + prefill_logprobs=True, + truncate=max_input_length, + parameters=generate_pb2.NextTokenChooserParameters( + temperature=0.9, + top_k=10, + top_p=0.9, + typical_p=0.9, + do_sample=False, + seed=0, + repetition_penalty=1.2, + watermark=True, + ), + stopping_parameters=generate_pb2.StoppingCriteriaParameters( + max_new_tokens=2, + stop_sequences=[], + ignore_eos_token=False, + ), + top_n_tokens = 20 + ) + warmup_requests_batch = generate_pb2.Batch(id=0, requests=[warmup_requests], size=1) + warmup_requests_batchs = model.batch_type.from_pb( + warmup_requests_batch, model.tokenizer, model.dtype, torch.device("cuda") + ) + + model.warmup(warmup_requests_batchs) + + pb_request = generate_pb2.Request( + id=1, + inputs=first_line, + prefill_logprobs=True, + truncate=1024, + parameters=default_pb_parameters, + stopping_parameters=default_pb_stop_parameters, + ) + pb_one_batch = generate_pb2.Batch(id=1, requests=[pb_request], size=1) + causal_lm_one_batch = model.batch_type.from_pb( + pb_one_batch, model.tokenizer, model.dtype, torch.device("cuda") + ) + + next_batch_one = causal_lm_one_batch + last_generations = True + torch.cuda.synchronize() + profiler.start() + start_time = time.perf_counter() + for _ in range(causal_lm_one_batch.stopping_criterias[0].max_new_tokens - 1): + generations_one, next_batch_one, _ = model.generate_token(next_batch_one) + if next_batch_one is None: + last_generations = False + break + if last_generations: + generations_one, next_batch_one, _ = model.generate_token(next_batch_one) + profiler.stop() + torch.cuda.synchronize() + end_time = time.perf_counter() + duration_time = end_time - start_time + print(f"generate length: {generations_one[0].generated_text.generated_tokens}") + print(f"one batch: {generations_one[0].generated_text.text}\nqps: {generations_one[0].generated_text.generated_tokens /duration_time}") + +""" +qwen1.5-0.5B +one batch: 亚历山大(Alexandria) +俄罗斯的首都是莫斯科(Moscow) +土耳其的首都是伊斯坦布尔(Istanbul) +南非的首都是开普敦(Cape Town) +美国的首都是华盛顿(Washington) +澳大利亚的首都是堪培拉(Canberra) +印度的首都是新德里(New Delhi) +法国的首都是巴黎(Paris) +英国的首都是伦敦(London) +加拿大首都是温哥华(Vancouver) +南非首都是开普敦(Cape Town) +美国首都是华盛顿(Washington) +澳大利亚首都是堪培拉(Canberra) 
+印度首都是新德里(New Delhi)
+法国首都是巴黎(Paris)
+英国首都是伦敦(London)
+加拿大首都是温哥华(Vancouver)
+南非首都是开普敦(Cape Town)
+美国首都是华盛顿(Washington)
+澳大利亚首都是堪培拉(Canberra)
+印度首都是新德里(New Delhi)
+法国首都是巴黎(Paris)
+英国首都是伦敦(London)
+加拿大首都是温哥华(Vancouver)
+南非首都是开普敦(Cape Town)
+美国首都是华盛顿(Washington)
+澳大利亚首都是堪培拉(Canberra)
+印度首都是新德里(New Delhi)
+法国首都是巴黎(Paris)
+英国首都是伦敦(London)
+加拿大首都是温哥华(Vancouver)
+南非首都是开普敦(Cape Town)
+美国首都是华盛顿(Washington)
+澳大利亚首都是堪培拉(Canberra)
+印度首都是新德里(New Delhi)
+法国首都是巴黎(Paris)
+英国首都是伦敦(London)
+加拿大首都是温哥华(Vancouver)
+南非首都是开普敦(Cape Town)
+美国首都是华盛顿(Washington)
+澳大利亚首都是堪培拉(Canberra)
+印度首都是新德里(New Delhi)
+法国首都是巴黎(Paris)
+英国首都是伦敦(London)
+加拿大首都是温哥华(Vancouver)
+南非首都是开普敦(Cape Town)
+美国首都是华盛顿(Washington)
+澳大利亚首都是堪培拉(Canberra)
+印度首都是新德里(New Delhi)
+法国首都是巴黎(
+qps: 128.489649542011
+"""
\ No newline at end of file
diff --git a/models/nlp/large_language_model/qwen1.5-7b/vllm/README.md b/models/nlp/large_language_model/qwen1.5-7b/vllm/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..b805671c2aa271d0d77619b51b6be4fa42c9129b
--- /dev/null
+++ b/models/nlp/large_language_model/qwen1.5-7b/vllm/README.md
@@ -0,0 +1,44 @@
+# Qwen1.5-7B
+
+## Description
+
+Qwen1.5 is a language model series that includes decoder language models of different sizes. For each size, we release the base language model and the aligned chat model. It is based on the Transformer architecture with SwiGLU activation, attention QKV bias, group query attention, a mixture of sliding window attention and full attention, and more. Additionally, we have an improved tokenizer adapted to multiple natural languages and code. For the beta version, we have temporarily not included GQA (except for 32B) or the mixture of SWA and full attention.
+
+## Setup
+
+### Install
+
+```bash
+# Install libGL
+## CentOS
+yum install -y mesa-libGL
+## Ubuntu
+apt install -y libgl1-mesa-dev
+
+# Please contact the staff to obtain the relevant installation packages.
+pip3 install Path/To/bitsandbytes-xxx.whl
+pip3 install Path/To/flash_atten-xxx.whl
+pip3 install Path/To/ixformer-xxx.whl
+pip3 install Path/To/vllm-xxx.whl
+pip3 install Path/To/eetq-xxx.whl
+```
+
+### Download
+
+- Model:
+
+```bash
+mkdir data/qwen1.5
+```
+
+## Inference
+
+```bash
+python3 offline_inference.py --model /data/qwen1.5/$MODEL_ID --max-tokens 256 -tp 1 --temperature 0.0
+```
+
+## Results
+
+| Model      | QPS    |
+| ---------- | ------ |
+| Qwen1.5-7B | 109.56 |
diff --git a/models/nlp/large_language_model/qwen1.5-7b/vllm/offline_inference.py b/models/nlp/large_language_model/qwen1.5-7b/vllm/offline_inference.py
new file mode 100644
index 0000000000000000000000000000000000000000..7ee127a259eb78f91d71c07b4a129464e0cc6cd3
--- /dev/null
+++ b/models/nlp/large_language_model/qwen1.5-7b/vllm/offline_inference.py
@@ -0,0 +1,110 @@
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+# License for the specific language governing permissions and limitations
+# under the License.
+
+import sys
+from pathlib import Path
+sys.path.append(str(Path(__file__).resolve().parent.parent.parent))
+from utils import load_chat_template, sampling_add_cli_args
+
+import logging
+import time
+import argparse
+import dataclasses
+import inspect
+
+import torch
+from vllm import LLM, SamplingParams, EngineArgs
+
+
+parser = argparse.ArgumentParser()
+parser.add_argument("--chat_template", type=str, default=None)
+parser.add_argument("--remove_chat_template", default=False, action="store_true", help="pass this if you are not using a chat model")
+parser = EngineArgs.add_cli_args(parser)
+parser = sampling_add_cli_args(parser)
+args = parser.parse_args()
+
+engine_args = [attr.name for attr in dataclasses.fields(EngineArgs)]
+sampling_args = [param.name for param in list(inspect.signature(SamplingParams.__init__).parameters.values())[1:]]
+engine_params = {attr: getattr(args, attr) for attr in engine_args}
+sampling_params = {attr: getattr(args, attr) for attr in sampling_args if attr in args}
+
+model_name = args.model.strip()
+model_name = model_name if args.model[-1] != '/' else model_name[:-1]
+model_name = model_name.rsplit('/')[-1]
+
+
+# Sample prompts.
+prompts = [
+    "哪些迹象可能表明一个人正在经历焦虑?",
+    "描述一下如何制作芝士披萨。",
+    "写一篇有关5G网络研发的综述文章。"
+]
+
+# Create a sampling params object.
+sampling_params = SamplingParams(**sampling_params)
+
+# Create an LLM.
+llm = LLM(**engine_params)
+
+# Process the chat template.
+if args.remove_chat_template:
+    if 'chat' in model_name.lower():
+        logging.warning(f"The model name inferred from the model path is {model_name}, so we assume you are using a chat model, whose input prompt requires additional processing. "
+            f"If the result is not quite correct, please ensure you do not pass --remove_chat_template on the CLI.")
+    prompts_new = prompts
+else:
+    # Build the chat-model prompt.
+    logging.warning("If you are using a non-chat model, please pass --remove_chat_template on the CLI.")
+    # Try to use transformers' apply_chat_template; if chat_template is None, the default template is used.
+    # For some old models, the default template may produce bad answers. We do not handle that case here,
+    # because the Transformers team is actively improving chat templates. For more information,
+    # please refer to https://huggingface.co/docs/transformers/main/chat_templating
+    try:
+        load_chat_template(llm.get_tokenizer(), args.chat_template)
+        prompts_new = []
+        for prompt in prompts:
+            messages = [
+                {"role": "user", "content": prompt}
+            ]
+            text = llm.get_tokenizer().apply_chat_template(
+                messages,
+                tokenize=False,
+                add_generation_prompt=True
+            )
+            prompts_new.append(text)
+    except Exception:
+        logging.warning("Calling the tokenizer's apply_chat_template failed, possibly because of an old transformers version (try transformers>=4.34.0).")
+        prompts_new = prompts
+
+# Generate texts from the prompts. The output is a list of RequestOutput objects
+# that contain the prompt, generated text, and other information.
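+# The first generate call below is an untimed warmup; the second call is timed
+# and used to report QPS (generated tokens per second over all prompts).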
+outputs = llm.generate(prompts_new, sampling_params,use_tqdm=False) if isinstance(prompts_new[0],str) else llm.generate(sampling_params=sampling_params,prompt_token_ids=prompts_new,use_tqdm=False) +torch.cuda.synchronize() + +start_time = time.perf_counter() +outputs = llm.generate(prompts_new, sampling_params) if isinstance(prompts_new[0],str) else llm.generate(sampling_params=sampling_params,prompt_token_ids=prompts_new) +torch.cuda.synchronize() +end_time = time.perf_counter() +duration_time = end_time - start_time + +num_tokens = 0 +# Print the outputs. +for i, output in enumerate(outputs): + prompt = prompts[i] # show the origin prompt. actully prompt is "output.prompt" + generated_text = output.outputs[0].text + + num_tokens += len(output.outputs[0].token_ids) + print(f"Prompt: {prompt}\nGenerated text: {generated_text} \n") +print(f"tokens: {num_tokens}, QPS: {num_tokens/duration_time}") diff --git a/models/nlp/large_language_model/qwen1.5-7b/vllm/utils.py b/models/nlp/large_language_model/qwen1.5-7b/vllm/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..c6def85dedc08ef9c3a489ce9dc5b1ff4a5e48b0 --- /dev/null +++ b/models/nlp/large_language_model/qwen1.5-7b/vllm/utils.py @@ -0,0 +1,173 @@ +# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. + +import codecs +import logging +import argparse + + +def sampling_add_cli_args(args: argparse.ArgumentParser) -> argparse.ArgumentParser: + args.add_argument( + '--n', + type=int, + default=1, + help="Number of output sequences to return for the given prompt.") + args.add_argument( + '--best-of', + type=int, + default=None, + help="Number of output sequences that are generated from the prompt. " + "From these `best_of` sequences, the top `n` sequences are returned. " + "`best_of` must be greater than or equal to `n`. This is treated as " + "the beam width when `use_beam_search` is True. By default, `best_of`" + "is set to `n`.") + args.add_argument( + '--presence-penalty', + type=float, + default=0.0, + help="Float that penalizes new tokens based on whether they " + "appear in the generated text so far. Values > 0 encourage the model " + "to use new tokens, while values < 0 encourage the model to repeat " + "tokens.") + args.add_argument( + '--frequency-penalty', + type=float, + default=0.0, + help="Float that penalizes new tokens based on their " + " frequency in the generated text so far. Values > 0 encourage the " + " model to use new tokens, while values < 0 encourage the model to " + "repeat tokens.") + args.add_argument( + '--repetition-penalty', + type=float, + default=1.0, + help="Float that penalizes new tokens based on whether " + "they appear in the prompt and the generated text so far. 
Values > 1 " + "encourage the model to use new tokens, while values < 1 encourage " + "the model to repeat tokens.") + args.add_argument( + '--temperature', + type=float, + default=1.0, + help="Float that controls the randomness of the sampling. Lower " + "values make the model more deterministic, while higher values make " + "the model more random. Zero means greedy sampling.") + args.add_argument( + '--top-p', + type=float, + default=1.0, + help="Float that controls the cumulative probability of the top tokens " + "to consider. Must be in (0, 1]. Set to 1 to consider all tokens.") + args.add_argument( + '--top-k', + type=int, + default=-1, + help="Integer that controls the number of top tokens to consider. Set " + "to -1 to consider all tokens.") + args.add_argument( + '--min-p', + type=float, + default=0.0, + help="Float that represents the minimum probability for a token to be " + "considered, relative to the probability of the most likely token. " + "Must be in [0, 1]. Set to 0 to disable this.") + args.add_argument( + '--use-beam-search', + default=False, + action="store_true", + help="Whether to use beam search instead of sampling.") + args.add_argument( + '--length-penalty', + type=float, + default=1.0, + help="Float that penalizes sequences based on their length. Used in beam search.") + args.add_argument( + '--stop', + type=str, + default=None, + help="List of strings that stop the generation when they are generated. " + "The returned output will not contain the stop strings.") + args.add_argument( + '--stop-token-ids', + type=int, + default=None, + help="List of tokens that stop the generation when they are " + "generated. The returned output will contain the stop tokens unless " + "the stop tokens are special tokens.") + args.add_argument( + '--include-stop-str-in-output', + default=False, + action="store_true", + help="Whether to include the stop strings in output text. Defaults to False.") + args.add_argument( + '--ignore-eos', + default=False, + action="store_true", + help="Whether to ignore the EOS token and continue generating tokens after the EOS token is generated.") + args.add_argument( + '--max-tokens', + type=int, + default=16, + help="Maximum number of tokens to generate per output sequence.") + args.add_argument( + '--logprobs', + type=int, + default=None, + help="NNumber of log probabilities to return per output token. " + "Note that the implementation follows the OpenAI API: The return " + "result includes the log probabilities on the `logprobs` most likely " + "tokens, as well the chosen tokens. The API will always return the " + "log probability of the sampled token, so there may be up to " + "`logprobs+1` elements in the response.") + args.add_argument( + '--prompt-logprobs', + type=int, + default=None, + help="Number of log probabilities to return per prompt token.") + args.add_argument( + '--skip-special-tokens', + default=True, + action="store_false", + help="Whether to skip special tokens in the output.") + args.add_argument( + '--spaces-between-special-tokens', + default=True, + action="store_false", + help="Whether to add spaces between special tokens in the output. 
Defaults to True.") + # early_stopping logits_processors seed + return args + + +def load_chat_template(tokenizer, chat_template): + if chat_template is not None: + try: + with open(chat_template, "r") as f: + tokenizer.chat_template = f.read() + except OSError: + # If opening a file fails, set chat template to be args to + # ensure we decode so our escape are interpreted correctly + tokenizer.chat_template = codecs.decode( + chat_template, "unicode_escape") + + logging.info( + f"Using supplied chat template:\n{tokenizer.chat_template}" + ) + elif tokenizer.chat_template is not None: + logging.info( + f"Using default chat template:\n{tokenizer.chat_template}. This May lead to unsatisfactory results. You can provide a template.jinja file for vllm." + ) + else: + logging.warning( + "No chat template provided. Chat API will not work. This May lead to unsatisfactory results. You can provide a template.jinja file for vllm.")