From e27be90de9cac17e4a9a417f119699aff6f5de1a Mon Sep 17 00:00:00 2001
From: "xinchi.tian"
Date: Wed, 9 Oct 2024 14:43:45 +0800
Subject: [PATCH] Add StableLM in IxRT

link #IAVG7Q

Signed-off-by: xinchi.tian
---
 .../stablelm/vllm/README.md                   |  40 ++++
 .../stablelm/vllm/offline_inference.py        | 135 ++++++++++++++
 .../stablelm/vllm/utils.py                    | 173 ++++++++++++++++++
 3 files changed, 348 insertions(+)
 create mode 100644 models/nlp/large_language_model/stablelm/vllm/README.md
 create mode 100644 models/nlp/large_language_model/stablelm/vllm/offline_inference.py
 create mode 100644 models/nlp/large_language_model/stablelm/vllm/utils.py

diff --git a/models/nlp/large_language_model/stablelm/vllm/README.md b/models/nlp/large_language_model/stablelm/vllm/README.md
new file mode 100644
index 00000000..cff35998
--- /dev/null
+++ b/models/nlp/large_language_model/stablelm/vllm/README.md
@@ -0,0 +1,40 @@
+# StableLM-2-1_6B
+
+## Description
+
+Stable LM 2 1.6B is a decoder-only language model with 1.6 billion parameters. It has been pre-trained on a diverse multilingual and code dataset, comprising 2 trillion tokens, for two epochs. This model is designed for various natural language processing tasks, including text generation and dialogue systems. Due to its extensive training on such a large and diverse dataset, Stable LM 2 1.6B can effectively capture the nuances of language, including grammar, semantics, and contextual relationships, which enhances the quality and accuracy of the generated text.
+
+## Setup
+
+### Install
+
+```bash
+# Install libGL
+## CentOS
+yum install -y mesa-libGL
+## Ubuntu
+apt install -y libgl1-mesa-glx
+pip3 install transformers
+```
+
+### Download
+
+- Model: <https://huggingface.co/stabilityai/stablelm-2-1_6b>
+
+```bash
+# Download model from the website and make sure the model's path is "data/stablelm/stablelm-2-1_6b"
+mkdir -p data/stablelm/stablelm-2-1_6b
+```
+
+## Inference
+
+```bash
+export CUDA_VISIBLE_DEVICES=0,1
+python3 offline_inference.py --model ./data/stablelm/stablelm-2-1_6b --max-tokens 256 -tp 1 --temperature 0.0
+```
+
+## Results
+
+| Model      | QPS   |
+| ---------- | ----- |
+| StableLM   | 254.3 |
diff --git a/models/nlp/large_language_model/stablelm/vllm/offline_inference.py b/models/nlp/large_language_model/stablelm/vllm/offline_inference.py
new file mode 100644
index 00000000..40678a62
--- /dev/null
+++ b/models/nlp/large_language_model/stablelm/vllm/offline_inference.py
@@ -0,0 +1,135 @@
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+#    Licensed under the Apache License, Version 2.0 (the "License"); you may
+#    not use this file except in compliance with the License. You may obtain
+#    a copy of the License at
+#
+#         http://www.apache.org/licenses/LICENSE-2.0
+#
+#    Unless required by applicable law or agreed to in writing, software
+#    distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+#    WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+#    License for the specific language governing permissions and limitations
+#    under the License.
+
+import sys
+from pathlib import Path
+
+sys.path.append(str(Path(__file__).resolve().parent.parent.parent))
+import argparse
+import dataclasses
+import inspect
+import logging
+import time
+
+import torch
+from utils import load_chat_template, sampling_add_cli_args
+from vllm import LLM, EngineArgs, SamplingParams
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--chat_template", type=str, default=None)
+    parser.add_argument(
+        "--remove_chat_template",
+        default=False,
+        action="store_true",
+        help="pass this if you are not use a chat model",
+    )
+    parser = EngineArgs.add_cli_args(parser)
+    parser = sampling_add_cli_args(parser)
+    args = parser.parse_args()
+
+    engine_args = [attr.name for attr in dataclasses.fields(EngineArgs)]
+    sampling_args = [
+        param.name
+        for param in list(
+            inspect.signature(SamplingParams.__init__).parameters.values()
+        )[1:]
+    ]
+    engine_params = {attr: getattr(args, attr) for attr in engine_args}
+    sampling_params = {
+        attr: getattr(args, attr) for attr in sampling_args if hasattr(args, attr)
+    }
+
+    # Last component of the model path (surrounding whitespace and trailing "/" removed).
+    model_name = args.model.strip().rstrip("/")
+    model_name = model_name.rsplit("/")[-1]
+
+    # Sample prompts.
+    prompts = [
+        "What signs may indicate that a person is experiencing anxiety?",
+        "Describe how to make cheese pizza.",
+        "Write a review article on the development of 5G networks.",
+    ]
+
+    # Create a sampling params object.
+    sampling_params = SamplingParams(**sampling_params)
+
+    # Create an LLM.
+    llm = LLM(**engine_params)
+
+    # Apply the chat template unless --remove_chat_template was passed.
+    if args.remove_chat_template:
+        if "chat" in model_name.lower():
+            logging.warning(
+                f"The model name from model path is {model_name}, so we guess you are using the chat model and the additional processing is required for the input prompt. "
+                f"If the result is not quite correct, please ensure you do not pass --remove_chat_template in CLI."
+            )
+        prompts_new = prompts
+    else:
+        # Build chat model prompts
+        logging.warning(
+            "If you are using a non chat model, please pass the --remove_chat_template in CLI."
+        )
+        # Try to use transformers' apply_chat_template; if chat_template is None, the default template is used.
+        # For some old models, the default template may cause bad answers. We don't handle this situation,
+        # because the Transformers team is advancing the chat template. For more information about it,
+        # please refer to https://huggingface.co/docs/transformers/main/chat_templating
+        try:
+            load_chat_template(llm.get_tokenizer(), args.chat_template)
+            prompts_new = []
+            for prompt in prompts:
+                messages = [{"role": "user", "content": prompt}]
+                text = llm.get_tokenizer().apply_chat_template(
+                    messages, tokenize=False, add_generation_prompt=True
+                )
+                prompts_new.append(text)
+        except Exception:
+            logging.warning(
+                "use tokenizer apply_chat_template function failed, may because of low transformers version...(try use transformers>=4.34.0)"
+            )
+            prompts_new = prompts
+
+    # Warm-up pass (not timed). llm.generate returns a list of RequestOutput objects
+    # that contain the prompt, generated text, and other information.
+    outputs = (
+        llm.generate(prompts_new, sampling_params, use_tqdm=False)
+        if isinstance(prompts_new[0], str)
+        else llm.generate(
+            sampling_params=sampling_params,
+            prompt_token_ids=prompts_new,
+            use_tqdm=False,
+        )
+    )
+    torch.cuda.synchronize()
+
+    start_time = time.perf_counter()
+    outputs = (
+        llm.generate(prompts_new, sampling_params)
+        if isinstance(prompts_new[0], str)
+        else llm.generate(sampling_params=sampling_params, prompt_token_ids=prompts_new)
+    )
+    torch.cuda.synchronize()
+    end_time = time.perf_counter()
+    duration_time = end_time - start_time
+
+    num_tokens = 0
+    # Print the outputs.
+    for i, output in enumerate(outputs):
+        prompt = prompts[i]  # show the original prompt; the actual prompt is "output.prompt"
+        generated_text = output.outputs[0].text
+
+        num_tokens += len(output.outputs[0].token_ids)
+        print(f"Prompt: {prompt}\nGenerated text: {generated_text} \n")
+    print(f"tokens: {num_tokens}, QPS: {num_tokens/duration_time}")
\ No newline at end of file
diff --git a/models/nlp/large_language_model/stablelm/vllm/utils.py b/models/nlp/large_language_model/stablelm/vllm/utils.py
new file mode 100644
index 00000000..c6def85d
--- /dev/null
+++ b/models/nlp/large_language_model/stablelm/vllm/utils.py
@@ -0,0 +1,173 @@
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+#    Licensed under the Apache License, Version 2.0 (the "License"); you may
+#    not use this file except in compliance with the License. You may obtain
+#    a copy of the License at
+#
+#         http://www.apache.org/licenses/LICENSE-2.0
+#
+#    Unless required by applicable law or agreed to in writing, software
+#    distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+#    WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+#    License for the specific language governing permissions and limitations
+#    under the License.
+
+import codecs
+import logging
+import argparse
+
+
+def sampling_add_cli_args(args: argparse.ArgumentParser) -> argparse.ArgumentParser:
+    """Add the sampling options (``--n``, ``--temperature``, ...) to ``args`` and return it."""
+    args.add_argument(
+        '--n',
+        type=int,
+        default=1,
+        help="Number of output sequences to return for the given prompt.")
+    args.add_argument(
+        '--best-of',
+        type=int,
+        default=None,
+        help="Number of output sequences that are generated from the prompt. "
+        "From these `best_of` sequences, the top `n` sequences are returned. "
+        "`best_of` must be greater than or equal to `n`. This is treated as "
+        "the beam width when `use_beam_search` is True. By default, `best_of` "
+        "is set to `n`.")
+    args.add_argument(
+        '--presence-penalty',
+        type=float,
+        default=0.0,
+        help="Float that penalizes new tokens based on whether they "
+        "appear in the generated text so far. Values > 0 encourage the model "
+        "to use new tokens, while values < 0 encourage the model to repeat "
+        "tokens.")
+    args.add_argument(
+        '--frequency-penalty',
+        type=float,
+        default=0.0,
+        help="Float that penalizes new tokens based on their "
+        " frequency in the generated text so far. Values > 0 encourage the "
+        " model to use new tokens, while values < 0 encourage the model to "
+        "repeat tokens.")
+    args.add_argument(
+        '--repetition-penalty',
+        type=float,
+        default=1.0,
+        help="Float that penalizes new tokens based on whether "
+        "they appear in the prompt and the generated text so far. Values > 1 "
+        "encourage the model to use new tokens, while values < 1 encourage "
+        "the model to repeat tokens.")
+    args.add_argument(
+        '--temperature',
+        type=float,
+        default=1.0,
+        help="Float that controls the randomness of the sampling. Lower "
+        "values make the model more deterministic, while higher values make "
+        "the model more random. Zero means greedy sampling.")
+    args.add_argument(
+        '--top-p',
+        type=float,
+        default=1.0,
+        help="Float that controls the cumulative probability of the top tokens "
+        "to consider. Must be in (0, 1]. Set to 1 to consider all tokens.")
+    args.add_argument(
+        '--top-k',
+        type=int,
+        default=-1,
+        help="Integer that controls the number of top tokens to consider. Set "
+        "to -1 to consider all tokens.")
+    args.add_argument(
+        '--min-p',
+        type=float,
+        default=0.0,
+        help="Float that represents the minimum probability for a token to be "
+        "considered, relative to the probability of the most likely token. "
+        "Must be in [0, 1]. Set to 0 to disable this.")
+    args.add_argument(
+        '--use-beam-search',
+        default=False,
+        action="store_true",
+        help="Whether to use beam search instead of sampling.")
+    args.add_argument(
+        '--length-penalty',
+        type=float,
+        default=1.0,
+        help="Float that penalizes sequences based on their length. Used in beam search.")
+    args.add_argument(
+        '--stop',
+        type=str,
+        nargs="+", default=None,
+        help="List of strings that stop the generation when they are generated. "
+        "The returned output will not contain the stop strings.")
+    args.add_argument(
+        '--stop-token-ids',
+        type=int,
+        nargs="+", default=None,
+        help="List of tokens that stop the generation when they are "
+        "generated. The returned output will contain the stop tokens unless "
+        "the stop tokens are special tokens.")
+    args.add_argument(
+        '--include-stop-str-in-output',
+        default=False,
+        action="store_true",
+        help="Whether to include the stop strings in output text. Defaults to False.")
+    args.add_argument(
+        '--ignore-eos',
+        default=False,
+        action="store_true",
+        help="Whether to ignore the EOS token and continue generating tokens after the EOS token is generated.")
+    args.add_argument(
+        '--max-tokens',
+        type=int,
+        default=16,
+        help="Maximum number of tokens to generate per output sequence.")
+    args.add_argument(
+        '--logprobs',
+        type=int,
+        default=None,
+        help="Number of log probabilities to return per output token. "
+        "Note that the implementation follows the OpenAI API: The return "
+        "result includes the log probabilities on the `logprobs` most likely "
+        "tokens, as well the chosen tokens. The API will always return the "
+        "log probability of the sampled token, so there may be up to "
+        "`logprobs+1` elements in the response.")
+    args.add_argument(
+        '--prompt-logprobs',
+        type=int,
+        default=None,
+        help="Number of log probabilities to return per prompt token.")
+    args.add_argument(
+        '--skip-special-tokens',
+        default=True,
+        action="store_false",
+        help="Whether to skip special tokens in the output.")
+    args.add_argument(
+        '--spaces-between-special-tokens',
+        default=True,
+        action="store_false",
+        help="Whether to add spaces between special tokens in the output. Defaults to True.")
+    # NOTE(review): early_stopping, logits_processors and seed are not added here - confirm intended.
+    return args
+
+
+def load_chat_template(tokenizer, chat_template):
+    """Set ``tokenizer.chat_template`` from ``chat_template`` (file path or literal text).
+
+    With ``chat_template=None`` the tokenizer is left unchanged; only the outcome is logged.
+    """
+    if chat_template is not None:
+        try:
+            with open(chat_template, "r", encoding="utf-8") as f:
+                tokenizer.chat_template = f.read()
+        except OSError:
+            # Not a readable file: treat the value as the template text and decode its escapes.
+            tokenizer.chat_template = codecs.decode(chat_template, "unicode_escape")
+
+        logging.info(f"Using supplied chat template:\n{tokenizer.chat_template}")
+    elif tokenizer.chat_template is not None:
+        logging.info(
+            f"Using default chat template:\n{tokenizer.chat_template}. This May lead to unsatisfactory results. You can provide a template.jinja file for vllm.")
+    else:
+        logging.warning(
+            "No chat template provided. Chat API will not work. This May lead to unsatisfactory results. You can provide a template.jinja file for vllm.")
-- 
Gitee