diff --git a/README.md b/README.md
index 79b1e4e27703ce90717f72f42bf6e90c55d144eb..a92b22a32f0390f7d8cb860bbadfa7247c1f24d9 100644
--- a/README.md
+++ b/README.md
@@ -1160,6 +1160,18 @@ DeepSparkInference将按季度进行版本更新,后续会逐步丰富模型
         <td>Supported</td>
         <td>-</td>
       </tr>
+      <tr>
+        <td>Llama2-13B</td>
+        <td>-</td>
+        <td>Supported</td>
+        <td>-</td>
+      </tr>
+      <tr>
+        <td>Llama2-70B</td>
+        <td>-</td>
+        <td>Supported</td>
+        <td>-</td>
+      </tr>
       <tr>
         <td>Llama3-70B</td>
         <td>Supported</td>
@@ -1176,7 +1188,7 @@ DeepSparkInference将按季度进行版本更新,后续会逐步丰富模型
         <td>Qwen-7B</td>
         <td>Supported</td>
         <td>-</td>
-        <td>Supported</td>
+        <td>-</td>
       </tr>
       <tr>
         <td>Qwen1.5-7B</td>
diff --git a/models/nlp/large_language_model/qwen-7b/text-generation-inference/README.md b/models/nlp/large_language_model/qwen-7b/text-generation-inference/README.md
deleted file mode 100644
index 729b9833fa6e0d7947f72dde56206988646bc299..0000000000000000000000000000000000000000
--- a/models/nlp/large_language_model/qwen-7b/text-generation-inference/README.md
+++ /dev/null
@@ -1,55 +0,0 @@
-# Qwen-7B
-
-## Description
-
-Large language models (LLMs) have revolutionized the field of artificial intelligence, enabling natural language processing tasks that were previously thought to be exclusive to humans. In this work, we introduce Qwen, the first installment of our large language model series. Qwen is a comprehensive language model series that encompasses distinct models with varying parameter counts. It includes Qwen, the base pretrained language models, and Qwen-Chat, the chat models finetuned with human alignment techniques. The base language models consistently demonstrate superior performance across a multitude of downstream tasks, and the chat models, particularly those trained using Reinforcement Learning from Human Feedback (RLHF), are highly competitive. The chat models possess advanced tool-use and planning capabilities for creating agent applications, showcasing impressive performance even when compared to bigger models on complex tasks like utilizing a code interpreter. Furthermore, we have developed coding-specialized models, Code-Qwen and Code-Qwen-Chat, as well as mathematics-focused models, Math-Qwen-Chat, which are built upon base language models. These models demonstrate significantly improved performance in comparison with open-source models, and slightly fall behind the proprietary models.
-
-## Setup
-
-### Install
-
-In order to run the model smoothly, you need to get the sdk from [resource center](https://support.iluvatar.com/#/ProductLine?id=2) of Iluvatar CoreX official website.
-
-```bash
-# Install libGL
-## CentOS
-yum install -y mesa-libGL
-## Ubuntu
-apt install -y libgl1-mesa-dev
-
-```
-
-### Download
-
--Model:
-
-```bash
-# Make sure the model's file name is qwen-7B
-mkdir data
-```
-
-## Inference
-
-### Start webserver
-
-#### Single GPU
-
-```bash
-# Use one docker container to start webserver
-export PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python
-ENABLE_INFER_PG=1 CUDA_VISIBLE_DEVICES=0 USE_FLASH_ATTENTION=true text-generation-launcher --model-id ./data/qwen-7B --sharded false --dtype float16 --disable-custom-kernels --port 8001 --max-input-length 13312 --max-batch-prefill-tokens 13312 --max-total-tokens 15360 --max-batch-total-tokens 15360
-```
-
-#### Offline test
-
-```bash
-# Use another docker container to run offline test
-export CUDA_VISIBLE_DEVICES=1
-python3 offline_inference.py --model2path ./data/qwen-7B
-```
-
-## Results
-
-| Model | QPS |
-| ------- | ----- |
-| Qwen-7B | 35.64 |
diff --git a/models/nlp/large_language_model/qwen-7b/text-generation-inference/ci/prepare.sh b/models/nlp/large_language_model/qwen-7b/text-generation-inference/ci/prepare.sh
deleted file mode 100644
index 4b2fdf8b5b521defcd963cd9e9fe92bd271dc2cf..0000000000000000000000000000000000000000
--- a/models/nlp/large_language_model/qwen-7b/text-generation-inference/ci/prepare.sh
+++ /dev/null
@@ -1,29 +0,0 @@
-#!/bin/bash
-# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
-# All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License"); you may
-# not use this file except in compliance with the License. You may obtain
-# a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-set -x
-ID=$(grep -oP '(?<=^ID=).+' /etc/os-release | tr -d '"')
-if [[ ${ID} == "ubuntu" ]]; then
-    apt install -y libgl1-mesa-glx
-elif [[ ${ID} == "centos" ]]; then
-    yum install -y mesa-libGL
-else
-    echo "Not Support Os"
-fi
-
-mkdir -p data
-
-ln -s /mnt/deepspark/data/checkpoints/qwen-7B data/qwen-7B
\ No newline at end of file
diff --git a/models/nlp/large_language_model/qwen-7b/text-generation-inference/offline_inference.py b/models/nlp/large_language_model/qwen-7b/text-generation-inference/offline_inference.py
deleted file mode 100644
index d16450cd9c7b3967546e9c4e96b6ff575f0d73d9..0000000000000000000000000000000000000000
--- a/models/nlp/large_language_model/qwen-7b/text-generation-inference/offline_inference.py
+++ /dev/null
@@ -1,170 +0,0 @@
-import argparse
-import time
-
-import torch
-from text_generation_server.models.flash_qwen2 import FlashQwen2
-from text_generation_server.pb import generate_pb2
-from text_generation_server.utils.speculate import set_speculate
-from torch.cuda import profiler
-
-
-def parse_args(args=None):
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--generate_length", type=int, default=512)
-    parser.add_argument(
-        "--model2path", type=str, default="/home/data/nlp/qwen2/Qwen1.5-0.5B"
-    )
-    parser.add_argument("--quantize", type=str, default=None, choices=["awq"])
-    parser.add_argument("--speculate", type=int, default=0)
-
-    return parser.parse_args(args)
-
-
-if __name__ == "__main__":
-    args = parse_args()
-
-    max_input_length = 2048
-    max_prefill_tokens = 2048
-
-    set_speculate(args.speculate)
-    model = FlashQwen2(args.model2path, trust_remote_code=True, quantize=args.quantize)
-
-    first_line = "蒙古国的首都是乌兰巴托(Ulaanbaatar)\n冰岛的首都是雷克雅未克(Reykjavik)\n埃塞俄比亚的首都是"
-
-    default_pb_parameters = generate_pb2.NextTokenChooserParameters(
-        temperature=1.0,
-        repetition_penalty=1.0,
-        top_k=0,
-        top_p=1,
-        typical_p=1.0,
-        do_sample=False,
-    )
-
-    default_pb_stop_parameters = generate_pb2.StoppingCriteriaParameters(
-        stop_sequences=[], max_new_tokens=args.generate_length
-    )
-
-    warmup_requests = generate_pb2.Request(
-        id=0,
-        inputs="_test " * max_input_length,
-        prefill_logprobs=True,
-        truncate=max_input_length,
-        parameters=generate_pb2.NextTokenChooserParameters(
-            temperature=0.9,
-            top_k=10,
-            top_p=0.9,
-            typical_p=0.9,
-            do_sample=False,
-            seed=0,
-            repetition_penalty=1.2,
-            watermark=True,
-        ),
-        stopping_parameters=generate_pb2.StoppingCriteriaParameters(
-            max_new_tokens=2,
-            stop_sequences=[],
-            ignore_eos_token=False,
-        ),
-        top_n_tokens=20,
-    )
-    warmup_requests_batch = generate_pb2.Batch(id=0, requests=[warmup_requests], size=1)
-    warmup_requests_batchs = model.batch_type.from_pb(
-        warmup_requests_batch, model.tokenizer, model.dtype, torch.device("cuda")
-    )
-
-    model.warmup(warmup_requests_batchs)
-
-    pb_request = generate_pb2.Request(
-        id=1,
-        inputs=first_line,
-        prefill_logprobs=True,
-        truncate=1024,
-        parameters=default_pb_parameters,
-        stopping_parameters=default_pb_stop_parameters,
-    )
-    pb_one_batch = generate_pb2.Batch(id=1, requests=[pb_request], size=1)
-    causal_lm_one_batch = model.batch_type.from_pb(
-        pb_one_batch, model.tokenizer, model.dtype, torch.device("cuda")
-    )
-
-    next_batch_one = causal_lm_one_batch
-    last_generations = True
-    torch.cuda.synchronize()
-    profiler.start()
-    start_time = time.perf_counter()
-    for _ in range(causal_lm_one_batch.stopping_criterias[0].max_new_tokens - 1):
-        generations_one, next_batch_one, _ = model.generate_token(next_batch_one)
-        if next_batch_one is None:
-            last_generations = False
-            break
-    if last_generations:
-        generations_one, next_batch_one, _ = model.generate_token(next_batch_one)
-    profiler.stop()
-    torch.cuda.synchronize()
-    end_time = time.perf_counter()
-    duration_time = end_time - start_time
-    print(f"generate length: {generations_one[0].generated_text.generated_tokens}")
-    print(
-        f"one batch: {generations_one[0].generated_text.text}\nqps: {generations_one[0].generated_text.generated_tokens /duration_time}"
-    )
-
-"""
-qwen1.5-0.5B
-one batch: 亚历山大(Alexandria)
-俄罗斯的首都是莫斯科(Moscow)
-土耳其的首都是伊斯坦布尔(Istanbul)
-南非的首都是开普敦(Cape Town)
-美国的首都是华盛顿(Washington)
-澳大利亚的首都是堪培拉(Canberra)
-印度的首都是新德里(New Delhi)
-法国的首都是巴黎(Paris)
-英国的首都是伦敦(London)
-加拿大首都是温哥华(Vancouver)
-南非首都是开普敦(Cape Town)
-美国首都是华盛顿(Washington)
-澳大利亚首都是堪培拉(Canberra)
-印度首都是新德里(New Delhi)
-法国首都是巴黎(Paris)
-英国首都是伦敦(London)
-加拿大首都是温哥华(Vancouver)
-南非首都是开普敦(Cape Town)
-美国首都是华盛顿(Washington)
-澳大利亚首都是堪培拉(Canberra)
-印度首都是新德里(New Delhi)
-法国首都是巴黎(Paris)
-英国首都是伦敦(London)
-加拿大首都是温哥华(Vancouver)
-南非首都是开普敦(Cape Town)
-美国首都是华盛顿(Washington)
-澳大利亚首都是堪培拉(Canberra)
-印度首都是新德里(New Delhi)
-法国首都是巴黎(Paris)
-英国首都是伦敦(London)
-加拿大首都是温哥华(Vancouver)
-南非首都是开普敦(Cape Town)
-美国首都是华盛顿(Washington)
-澳大利亚首都是堪培拉(Canberra)
-印度首都是新德里(New Delhi)
-法国首都是巴黎(Paris)
-英国首都是伦敦(London)
-加拿大首都是温哥华(Vancouver)
-南非首都是开普敦(Cape Town)
-美国首都是华盛顿(Washington)
-澳大利亚首都是堪培拉(Canberra)
-印度首都是新德里(New Delhi)
-法国首都是巴黎(Paris)
-英国首都是伦敦(London)
-加拿大首都是温哥华(Vancouver)
-南非首都是开普敦(Cape Town)
-美国首都是华盛顿(Washington)
-澳大利亚首都是堪培拉(Canberra)
-印度首都是新德里(New Delhi)
-法国首都是巴黎(Paris)
-英国首都是伦敦(London)
-加拿大首都是温哥华(Vancouver)
-南非首都是开普敦(Cape Town)
-美国首都是华盛顿(Washington)
-澳大利亚首都是堪培拉(Canberra)
-印度首都是新德里(New Delhi)
-法国首都是巴黎(
-qps: 128.489649542011
-"""
diff --git a/models/nlp/large_language_model/qwen1.5-72b/vllm/README.md b/models/nlp/large_language_model/qwen1.5-72b/vllm/README.md
index 4a44a48a4bcec1cc0a9f857d34a669f554cfb6ce..3b7c9298e78a320df17d2c1cda1fa7826c4cd73e 100644
--- a/models/nlp/large_language_model/qwen1.5-72b/vllm/README.md
+++ b/models/nlp/large_language_model/qwen1.5-72b/vllm/README.md
@@ -30,7 +30,7 @@ ln -s /path/to/Qwen1.5-72B ./data/qwen1.5
 
 ```bash
 export CUDA_VISIBLE_DEVICES=0,1
-python3 offline_inference.py --model ./data/qwen1.5/Qwen1.5-72B --max-tokens 256 -tp 2 --temperature 0.0 --max-model-len 3096
+python3 offline_inference.py --model ./data/qwen1.5/Qwen1.5-72B --max-tokens 256 -tp 8 --temperature 0.0 --max-model-len 3096
 ```
 
 ## Results
diff --git a/models/nlp/large_language_model/qwen1.5-7b/text-generation-inference/offline_inference.py b/models/nlp/large_language_model/qwen1.5-7b/text-generation-inference/offline_inference.py
index d16450cd9c7b3967546e9c4e96b6ff575f0d73d9..6f395536dab722ecb1c53830e05bd0598dee5dd4 100644
--- a/models/nlp/large_language_model/qwen1.5-7b/text-generation-inference/offline_inference.py
+++ b/models/nlp/large_language_model/qwen1.5-7b/text-generation-inference/offline_inference.py
@@ -1,35 +1,66 @@
 import argparse
+import os
 import time
 
+import text_generation_server.models as models
 import torch
-from text_generation_server.models.flash_qwen2 import FlashQwen2
+from text_generation_server.models.globals import set_adapter_to_index
 from text_generation_server.pb import generate_pb2
-from text_generation_server.utils.speculate import set_speculate
 from torch.cuda import profiler
 
 
 def parse_args(args=None):
     parser = argparse.ArgumentParser()
+    parser.add_argument("--inputs", type=str, default=None)
     parser.add_argument("--generate_length", type=int, default=512)
     parser.add_argument(
-        "--model2path", type=str, default="/home/data/nlp/qwen2/Qwen1.5-0.5B"
+        "--model2path", type=str, default="/home/data/nlp/llama2/llama2-7b"
     )
-    parser.add_argument("--quantize", type=str, default=None, choices=["awq"])
     parser.add_argument("--speculate", type=int, default=0)
+    parser.add_argument(
+        "--quantize", type=str, default=None, choices=["awq", "bitsandbytes", "gptq"]
+    )
 
     return parser.parse_args(args)
 
 
 if __name__ == "__main__":
     args = parse_args()
+    revision = None
+    max_input_length = 1024
+    max_prefill_tokens = 1024
+    model_id = args.model2path
+    test_model_name = model_id.split("/")
+    set_adapter_to_index({})
+    lora_adapter_ids = None
+    model = models.get_model(
+        model_id,
+        lora_adapter_ids,
+        revision,
+        False,
+        quantize=args.quantize,
+        speculate=args.speculate,
+        dtype=None,
+        trust_remote_code=True,
+        max_input_tokens=max_input_length,
+    )
 
-    max_input_length = 2048
-    max_prefill_tokens = 2048
-
-    set_speculate(args.speculate)
-    model = FlashQwen2(args.model2path, trust_remote_code=True, quantize=args.quantize)
-
-    first_line = "蒙古国的首都是乌兰巴托(Ulaanbaatar)\n冰岛的首都是雷克雅未克(Reykjavik)\n埃塞俄比亚的首都是"
+    if test_model_name[-1] == "":
+        print(f"test_model_name: {test_model_name[-2]}")
+    else:
+        print(f"test_model_name: {test_model_name[-1]}")
+    model_name = model_id.lower()
+    if args.inputs:
+        first_line = args.inputs
+    else:
+        if any(key in model_name for key in ["codellama", "flan-t5"]):
+            first_line = "Tell me about AI"
+        elif any(key in model_name for key in ["santacoder", "opt", "galactica"]):
+            first_line = "Shanghai is one of the most prosperous cities in China, with a GDP of over $300 billion. Shanghai has the fastest growing economy in China and is the second busiest port in the world. In addition to being a hub for business, Shanghai is also a major tourist destination. It is known for its diverse culture and many historical sites.\nThe city of Shanghai is located on the coast of the Pacific Ocean in east-central China. It is bordered by Jiangsu Province to the north, Zhejiang Province to the south, and Jiangsu Province to the west."
+        elif "mpt" in model_name:
+            first_line = "Here is a recipe for vegan banana bread:\n"
+        else:
+            first_line = "蒙古国的首都是乌兰巴托(Ulaanbaatar)\n冰岛的首都是雷克雅未克(Reykjavik)\n埃塞俄比亚的首都是"
 
     default_pb_parameters = generate_pb2.NextTokenChooserParameters(
         temperature=1.0,
@@ -47,7 +78,7 @@ if __name__ == "__main__":
     warmup_requests = generate_pb2.Request(
         id=0,
         inputs="_test " * max_input_length,
-        prefill_logprobs=True,
+        input_chunks=generate_pb2.Input(chunks=[generate_pb2.InputChunk(text="Test")]),
         truncate=max_input_length,
         parameters=generate_pb2.NextTokenChooserParameters(
             temperature=0.9,
@@ -62,9 +93,10 @@ if __name__ == "__main__":
         stopping_parameters=generate_pb2.StoppingCriteriaParameters(
             max_new_tokens=2,
             stop_sequences=[],
-            ignore_eos_token=False,
+            ignore_eos_token=True,
         ),
-        top_n_tokens=20,
+        prefill_logprobs=True,
+        top_n_tokens=512,
     )
     warmup_requests_batch = generate_pb2.Batch(id=0, requests=[warmup_requests], size=1)
     warmup_requests_batchs = model.batch_type.from_pb(
@@ -73,9 +105,18 @@ if __name__ == "__main__":
 
     model.warmup(warmup_requests_batchs)
 
+    prompt_length = model.tokenizer(
+        first_line, truncation=False, return_tensors="pt"
+    ).input_ids[0]
+
+    print(f"prompt length: {len(prompt_length)}")
+    print(f"input text: {first_line}")
     pb_request = generate_pb2.Request(
         id=1,
-        inputs=first_line,
+        inputs=first_line,  # first_line
+        input_chunks=generate_pb2.Input(
+            chunks=[generate_pb2.InputChunk(text=first_line)]
+        ),
         prefill_logprobs=True,
         truncate=1024,
         parameters=default_pb_parameters,
@@ -97,74 +138,12 @@ if __name__ == "__main__":
             last_generations = False
             break
     if last_generations:
-        generations_one, next_batch_one, _ = model.generate_token(next_batch_one)
+        data = model.generate_token(next_batch_one)
     profiler.stop()
     torch.cuda.synchronize()
     end_time = time.perf_counter()
     duration_time = end_time - start_time
+    generations_one = data[0]
     print(f"generate length: {generations_one[0].generated_text.generated_tokens}")
-    print(
-        f"one batch: {generations_one[0].generated_text.text}\nqps: {generations_one[0].generated_text.generated_tokens /duration_time}"
-    )
-
-"""
-qwen1.5-0.5B
-one batch: 亚历山大(Alexandria)
-俄罗斯的首都是莫斯科(Moscow)
-土耳其的首都是伊斯坦布尔(Istanbul)
-南非的首都是开普敦(Cape Town)
-美国的首都是华盛顿(Washington)
-澳大利亚的首都是堪培拉(Canberra)
-印度的首都是新德里(New Delhi)
-法国的首都是巴黎(Paris)
-英国的首都是伦敦(London)
-加拿大首都是温哥华(Vancouver)
-南非首都是开普敦(Cape Town)
-美国首都是华盛顿(Washington)
-澳大利亚首都是堪培拉(Canberra)
-印度首都是新德里(New Delhi)
-法国首都是巴黎(Paris)
-英国首都是伦敦(London)
-加拿大首都是温哥华(Vancouver)
-南非首都是开普敦(Cape Town)
-美国首都是华盛顿(Washington)
-澳大利亚首都是堪培拉(Canberra)
-印度首都是新德里(New Delhi)
-法国首都是巴黎(Paris)
-英国首都是伦敦(London)
-加拿大首都是温哥华(Vancouver)
-南非首都是开普敦(Cape Town)
-美国首都是华盛顿(Washington)
-澳大利亚首都是堪培拉(Canberra)
-印度首都是新德里(New Delhi)
-法国首都是巴黎(Paris)
-英国首都是伦敦(London)
-加拿大首都是温哥华(Vancouver)
-南非首都是开普敦(Cape Town)
-美国首都是华盛顿(Washington)
-澳大利亚首都是堪培拉(Canberra)
-印度首都是新德里(New Delhi)
-法国首都是巴黎(Paris)
-英国首都是伦敦(London)
-加拿大首都是温哥华(Vancouver)
-南非首都是开普敦(Cape Town)
-美国首都是华盛顿(Washington)
-澳大利亚首都是堪培拉(Canberra)
-印度首都是新德里(New Delhi)
-法国首都是巴黎(Paris)
-英国首都是伦敦(London)
-加拿大首都是温哥华(Vancouver)
-南非首都是开普敦(Cape Town)
-美国首都是华盛顿(Washington)
-澳大利亚首都是堪培拉(Canberra)
-印度首都是新德里(New Delhi)
-法国首都是巴黎(Paris)
-英国首都是伦敦(London)
-加拿大首都是温哥华(Vancouver)
-南非首都是开普敦(Cape Town)
-美国首都是华盛顿(Washington)
-澳大利亚首都是堪培拉(Canberra)
-印度首都是新德里(New Delhi)
-法国首都是巴黎(
-qps: 128.489649542011
-"""
+    print(f"one batch: {generations_one[0].generated_text.text}")
+    print(f"qps: {generations_one[0].generated_text.generated_tokens / duration_time}")
\ No newline at end of file
diff --git a/tests/models_trtllm.yaml b/tests/models_trtllm.yaml
index de21908e72524b29534ee542380ab49dfb448e30..7e5f776742c101b159621b7aa96eb7f82eb220cd 100644
--- a/tests/models_trtllm.yaml
+++ b/tests/models_trtllm.yaml
@@ -23,14 +23,6 @@
   - fp16
   relative_path: models/nlp/large_language_model/llama2-70b/trtllm
   task_type: nlp/large_language_model
-- datasets: https://localhost
-  download_url: https://localhost/qwen-7B
-  name: qwen-7b
-  need_third_part: false
-  precisions:
-  - fp16
-  relative_path: models/nlp/large_language_model/qwen-7b/text-generation-inference
-  task_type: nlp/large_language_model
 - datasets: https://localhost
   download_url: https://modelscope.cn/models/qwen/Qwen1.5-7B
   name: qwen1.5-7b
diff --git a/tests/run_vllm.py b/tests/run_vllm.py
index 51bd0d3e69873c42bcf7d4168f3d66d78929365f..c3eb0217cad0414008e39a98346e91fe0e77dae1 100644
--- a/tests/run_vllm.py
+++ b/tests/run_vllm.py
@@ -201,8 +201,8 @@ def run_nlp_testcase(model):
         script = f"""
         set -x
         cd ../{model['relative_path']}
-        export CUDA_VISIBLE_DEVICES=0,1,2,3
-        python3 offline_inference.py --model ./qwen2-72b --max-tokens 256 -tp 4 --temperature 0.0 --gpu-memory-utilization 0.98 --max-model-len 32768
+        export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
+        python3 offline_inference.py --model ./qwen2-72b --max-tokens 256 -tp 8 --temperature 0.0 --gpu-memory-utilization 0.98 --max-model-len 32768
         """
     elif model_name == "stablelm":
         script = f"""