From 2846a8c570d1daf2fcd2885a98af80f4ea0d4d18 Mon Sep 17 00:00:00 2001 From: "hongliang.yuan" Date: Tue, 21 Jan 2025 17:09:03 +0800 Subject: [PATCH 1/6] init vllm ci --- .../baichuan2-7b/vllm/ci/prepare.sh | 32 ++ .../baichuan2-7b/vllm/offline_inference.py | 5 + .../chatglm3-6b-32k/vllm/ci/prepare.sh | 27 ++ .../chatglm3-6b/vllm/ci/prepare.sh | 28 ++ .../chatglm3-6b/vllm/offline_inference.py | 5 + .../llama2-13b/trtllm/README.md | 3 - .../llama2-13b/trtllm/ci/prepare.sh | 36 ++ .../llama2-70b/trtllm/README.md | 4 - .../llama2-70b/trtllm/ci/prepare.sh | 36 ++ .../llama2-7b/trtllm/ci/prepare.sh | 36 ++ .../llama2-7b/vllm/ci/prepare.sh | 30 ++ .../llama2-7b/vllm/offline_inference.py | 4 + .../llama3-70b/vllm/ci/prepare.sh | 26 ++ .../llama3-70b/vllm/offline_inference.py | 4 + .../text-generation-inference/README.md | 4 - .../text-generation-inference/ci/prepare.sh | 29 ++ .../offline_inference.py | 5 + .../qwen-7b/vllm/ci/prepare.sh | 30 ++ .../qwen-7b/vllm/offline_inference.py | 4 + .../qwen1.5-14b/vllm/ci/prepare.sh | 25 ++ .../qwen1.5-14b/vllm/offline_inference.py | 4 + .../qwen1.5-32b/vllm/ci/prepare.sh | 30 ++ .../qwen1.5-32b/vllm/offline_inference.py | 4 + .../qwen1.5-72b/vllm/ci/prepare.sh | 25 ++ .../qwen1.5-72b/vllm/offline_inference.py | 4 + .../text-generation-inference/ci/prepare.sh | 29 ++ .../offline_inference.py | 5 + .../qwen1.5-7b/vllm/ci/prepare.sh | 25 ++ .../qwen1.5-7b/vllm/offline_inference.py | 4 + .../qwen2-72b/vllm/ci/prepare.sh | 30 ++ .../qwen2-72b/vllm/offline_inference.py | 4 + .../qwen2-7b/vllm/ci/prepare.sh | 30 ++ .../qwen2-7b/vllm/offline_inference.py | 4 + .../stablelm/vllm/ci/prepare.sh | 27 ++ .../stablelm/vllm/offline_inference.py | 6 +- tests/models_trtllm.yaml | 41 +++ tests/models_vllm.yaml | 106 ++++++ tests/run_trtllm.py | 189 +++++++++++ tests/run_vllm.py | 315 ++++++++++++++++++ 39 files changed, 1243 insertions(+), 12 deletions(-) create mode 100644 models/nlp/large_language_model/baichuan2-7b/vllm/ci/prepare.sh create mode 100644 models/nlp/large_language_model/chatglm3-6b-32k/vllm/ci/prepare.sh create mode 100644 models/nlp/large_language_model/chatglm3-6b/vllm/ci/prepare.sh create mode 100644 models/nlp/large_language_model/llama2-13b/trtllm/ci/prepare.sh create mode 100644 models/nlp/large_language_model/llama2-70b/trtllm/ci/prepare.sh create mode 100644 models/nlp/large_language_model/llama2-7b/trtllm/ci/prepare.sh create mode 100644 models/nlp/large_language_model/llama2-7b/vllm/ci/prepare.sh create mode 100644 models/nlp/large_language_model/llama3-70b/vllm/ci/prepare.sh create mode 100644 models/nlp/large_language_model/qwen-7b/text-generation-inference/ci/prepare.sh create mode 100644 models/nlp/large_language_model/qwen-7b/vllm/ci/prepare.sh create mode 100644 models/nlp/large_language_model/qwen1.5-14b/vllm/ci/prepare.sh create mode 100644 models/nlp/large_language_model/qwen1.5-32b/vllm/ci/prepare.sh create mode 100644 models/nlp/large_language_model/qwen1.5-72b/vllm/ci/prepare.sh create mode 100644 models/nlp/large_language_model/qwen1.5-7b/text-generation-inference/ci/prepare.sh create mode 100644 models/nlp/large_language_model/qwen1.5-7b/vllm/ci/prepare.sh create mode 100644 models/nlp/large_language_model/qwen2-72b/vllm/ci/prepare.sh create mode 100644 models/nlp/large_language_model/qwen2-7b/vllm/ci/prepare.sh create mode 100644 models/nlp/large_language_model/stablelm/vllm/ci/prepare.sh create mode 100644 tests/models_trtllm.yaml create mode 100644 tests/models_vllm.yaml create mode 100644 tests/run_trtllm.py create mode 100644 
tests/run_vllm.py diff --git a/models/nlp/large_language_model/baichuan2-7b/vllm/ci/prepare.sh b/models/nlp/large_language_model/baichuan2-7b/vllm/ci/prepare.sh new file mode 100644 index 00000000..5b9abbd3 --- /dev/null +++ b/models/nlp/large_language_model/baichuan2-7b/vllm/ci/prepare.sh @@ -0,0 +1,32 @@ +#!/bin/bash +# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +set -x +ID=$(grep -oP '(?<=^ID=).+' /etc/os-release | tr -d '"') +if [[ ${ID} == "ubuntu" ]]; then + apt install -y libgl1-mesa-glx +elif [[ ${ID} == "centos" ]]; then + yum install -y mesa-libGL +else + echo "Not Support Os" +fi + +pip3 install transformers==4.37.1 + +# has prepared in ci +# ln -s /mnt/deepspark/data/checkpoints/Baichuan2-7B-Base ./ + +python3 convert2int8.py --model-path ./Baichuan2-7B-Base/ \ No newline at end of file diff --git a/models/nlp/large_language_model/baichuan2-7b/vllm/offline_inference.py b/models/nlp/large_language_model/baichuan2-7b/vllm/offline_inference.py index 40c0e2e1..9e5738a1 100644 --- a/models/nlp/large_language_model/baichuan2-7b/vllm/offline_inference.py +++ b/models/nlp/large_language_model/baichuan2-7b/vllm/offline_inference.py @@ -109,4 +109,9 @@ for i, output in enumerate(outputs): print(f"Prompt: {prompt}\nGenerated text: {generated_text} \n") print(f"tokens: {num_tokens}, QPS: {num_tokens/duration_time}") +metricResult = {"metricResult": {}} +metricResult["metricResult"]["tokens"] = num_tokens +metricResult["metricResult"]["QPS"] = round(num_tokens/duration_time,3) +print(metricResult) + # 0.3.2 tokens: 757, QPS: 97.97229589080902 \ No newline at end of file diff --git a/models/nlp/large_language_model/chatglm3-6b-32k/vllm/ci/prepare.sh b/models/nlp/large_language_model/chatglm3-6b-32k/vllm/ci/prepare.sh new file mode 100644 index 00000000..ad683f6e --- /dev/null +++ b/models/nlp/large_language_model/chatglm3-6b-32k/vllm/ci/prepare.sh @@ -0,0 +1,27 @@ +#!/bin/bash +# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +set -x +ID=$(grep -oP '(?<=^ID=).+' /etc/os-release | tr -d '"') +if [[ ${ID} == "ubuntu" ]]; then + apt install -y libgl1-mesa-glx +elif [[ ${ID} == "centos" ]]; then + yum install -y mesa-libGL +else + echo "Not Support Os" +fi + +pip3 install transformers==4.37.1 diff --git a/models/nlp/large_language_model/chatglm3-6b/vllm/ci/prepare.sh b/models/nlp/large_language_model/chatglm3-6b/vllm/ci/prepare.sh new file mode 100644 index 00000000..4cfd5fd6 --- /dev/null +++ b/models/nlp/large_language_model/chatglm3-6b/vllm/ci/prepare.sh @@ -0,0 +1,28 @@ +#!/bin/bash +# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +set -x +ID=$(grep -oP '(?<=^ID=).+' /etc/os-release | tr -d '"') +if [[ ${ID} == "ubuntu" ]]; then + apt install -y libgl1-mesa-glx +elif [[ ${ID} == "centos" ]]; then + yum install -y mesa-libGL +else + echo "Not Support Os" +fi + +pip3 install vllm==0.5.0 +pip3 install transformers==4.37.1 diff --git a/models/nlp/large_language_model/chatglm3-6b/vllm/offline_inference.py b/models/nlp/large_language_model/chatglm3-6b/vllm/offline_inference.py index 0162d93c..eaa7fe20 100644 --- a/models/nlp/large_language_model/chatglm3-6b/vllm/offline_inference.py +++ b/models/nlp/large_language_model/chatglm3-6b/vllm/offline_inference.py @@ -99,4 +99,9 @@ for i, output in enumerate(outputs): print(f"Prompt: {prompt}\nGenerated text: {generated_text} \n") print(f"tokens: {num_tokens}, QPS: {num_tokens/duration_time}") +metricResult = {"metricResult": {}} +metricResult["metricResult"]["tokens"] = num_tokens +metricResult["metricResult"]["QPS"] = round(num_tokens/duration_time,3) +print(metricResult) + # 0.3.2 tokens: 422, QPS: 70.02308283048338(tokens: 422, QPS: 93.67210003677407),32-k 模型 tokens: 477, QPS: 81.46537314533865(tokens: 477, QPS: 106.54247895449554) \ No newline at end of file diff --git a/models/nlp/large_language_model/llama2-13b/trtllm/README.md b/models/nlp/large_language_model/llama2-13b/trtllm/README.md index 4658334d..b24c29ce 100755 --- a/models/nlp/large_language_model/llama2-13b/trtllm/README.md +++ b/models/nlp/large_language_model/llama2-13b/trtllm/README.md @@ -18,9 +18,6 @@ apt install -y libgl1-mesa-dev bash scripts/set_environment.sh . -# *star refer to a specified version -wget http://files.deepspark.org.cn:880/deepspark/add-ons/tensorrt_llm-*.whl -pip install tensorrt_llm-*.whl ``` ### Download diff --git a/models/nlp/large_language_model/llama2-13b/trtllm/ci/prepare.sh b/models/nlp/large_language_model/llama2-13b/trtllm/ci/prepare.sh new file mode 100644 index 00000000..7947d62b --- /dev/null +++ b/models/nlp/large_language_model/llama2-13b/trtllm/ci/prepare.sh @@ -0,0 +1,36 @@ +#!/bin/bash +# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. 
You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +set -x +ID=$(grep -oP '(?<=^ID=).+' /etc/os-release | tr -d '"') +if [[ ${ID} == "ubuntu" ]]; then + apt install -y libgl1-mesa-glx +elif [[ ${ID} == "centos" ]]; then + yum install -y mesa-libGL +else + echo "Not Support Os" +fi + +bash scripts/set_environment.sh . + +# Download model from the website and make sure the model's path is "data/llama2-13b-chat" +# Download dataset from the website and make sure the dataset's path is "data/datasets_cnn_dailymail" +mkdir -p data +ln -s /mnt/deepspark/data/checkpoints/llama2-13b-chat data/llama2-13b-chat +ln -s /mnt/deepspark/data/datasets/datasets_cnn_dailymail data/datasets_cnn_dailymail +# Please download rouge.py to this path if your server can't attach huggingface.co. +mkdir -p rouge/ +cp /mnt/deepspark/data/3rd_party/rouge.py rouge/ \ No newline at end of file diff --git a/models/nlp/large_language_model/llama2-70b/trtllm/README.md b/models/nlp/large_language_model/llama2-70b/trtllm/README.md index 671b51c7..621dbffe 100644 --- a/models/nlp/large_language_model/llama2-70b/trtllm/README.md +++ b/models/nlp/large_language_model/llama2-70b/trtllm/README.md @@ -16,10 +16,6 @@ yum install -y mesa-libGL apt install -y libgl1-mesa-dev bash scripts/set_environment.sh . - -# *star refer to a specified version -wget http://files.deepspark.org.cn:880/deepspark/add-ons/tensorrt_llm-*.whl -pip install tensorrt_llm-*.whl ``` ### Download diff --git a/models/nlp/large_language_model/llama2-70b/trtllm/ci/prepare.sh b/models/nlp/large_language_model/llama2-70b/trtllm/ci/prepare.sh new file mode 100644 index 00000000..4a8c1e4e --- /dev/null +++ b/models/nlp/large_language_model/llama2-70b/trtllm/ci/prepare.sh @@ -0,0 +1,36 @@ +#!/bin/bash +# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +set -x +ID=$(grep -oP '(?<=^ID=).+' /etc/os-release | tr -d '"') +if [[ ${ID} == "ubuntu" ]]; then + apt install -y libgl1-mesa-glx +elif [[ ${ID} == "centos" ]]; then + yum install -y mesa-libGL +else + echo "Not Support Os" +fi + +bash scripts/set_environment.sh . + +# Download model from the website and make sure the model's path is "data/llama2-70b-chat" +# Download dataset from the website and make sure the dataset's path is "data/datasets_cnn_dailymail" +mkdir -p data +ln -s /mnt/deepspark/data/checkpoints/llama2-70b-chat data/llama2-70b-chat +ln -s /mnt/deepspark/data/datasets/datasets_cnn_dailymail data/datasets_cnn_dailymail +# Please download rouge.py to this path if your server can't attach huggingface.co. 
+mkdir -p rouge/ +cp /mnt/deepspark/data/3rd_party/rouge.py rouge/ \ No newline at end of file diff --git a/models/nlp/large_language_model/llama2-7b/trtllm/ci/prepare.sh b/models/nlp/large_language_model/llama2-7b/trtllm/ci/prepare.sh new file mode 100644 index 00000000..2ac2384d --- /dev/null +++ b/models/nlp/large_language_model/llama2-7b/trtllm/ci/prepare.sh @@ -0,0 +1,36 @@ +#!/bin/bash +# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +set -x +ID=$(grep -oP '(?<=^ID=).+' /etc/os-release | tr -d '"') +if [[ ${ID} == "ubuntu" ]]; then + apt install -y libgl1-mesa-glx +elif [[ ${ID} == "centos" ]]; then + yum install -y mesa-libGL +else + echo "Not Support Os" +fi + +bash scripts/set_environment.sh . + +# Download model from the website and make sure the model's path is "data/llama2-7b-chat" +# Download dataset from the website and make sure the dataset's path is "data/datasets_cnn_dailymail" +mkdir -p data +ln -s /mnt/deepspark/data/checkpoints/llama2-7b-chat data/llama2-7b-chat +ln -s /mnt/deepspark/data/datasets/datasets_cnn_dailymail data/datasets_cnn_dailymail +# Please download rouge.py to this path if your server can't attach huggingface.co. +mkdir -p rouge/ +cp /mnt/deepspark/data/3rd_party/rouge.py rouge/ \ No newline at end of file diff --git a/models/nlp/large_language_model/llama2-7b/vllm/ci/prepare.sh b/models/nlp/large_language_model/llama2-7b/vllm/ci/prepare.sh new file mode 100644 index 00000000..6afe9667 --- /dev/null +++ b/models/nlp/large_language_model/llama2-7b/vllm/ci/prepare.sh @@ -0,0 +1,30 @@ +#!/bin/bash +# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +set -x +ID=$(grep -oP '(?<=^ID=).+' /etc/os-release | tr -d '"') +if [[ ${ID} == "ubuntu" ]]; then + apt install -y libgl1-mesa-glx +elif [[ ${ID} == "centos" ]]; then + yum install -y mesa-libGL +else + echo "Not Support Os" +fi + +# Contact the iluvatar manager to get adapted install packages of vllm, triton, and ixformer +pip3 install vllm +pip3 install triton +pip3 install ixformer diff --git a/models/nlp/large_language_model/llama2-7b/vllm/offline_inference.py b/models/nlp/large_language_model/llama2-7b/vllm/offline_inference.py index 9c0b6d2f..538d3541 100644 --- a/models/nlp/large_language_model/llama2-7b/vllm/offline_inference.py +++ b/models/nlp/large_language_model/llama2-7b/vllm/offline_inference.py @@ -129,3 +129,7 @@ if __name__ == "__main__": num_tokens += len(output.outputs[0].token_ids) print(f"Prompt: {prompt}\nGenerated text: {generated_text} \n") print(f"tokens: {num_tokens}, QPS: {num_tokens/duration_time}") + metricResult = {"metricResult": {}} + metricResult["metricResult"]["tokens"] = num_tokens + metricResult["metricResult"]["QPS"] = round(num_tokens/duration_time,3) + print(metricResult) diff --git a/models/nlp/large_language_model/llama3-70b/vllm/ci/prepare.sh b/models/nlp/large_language_model/llama3-70b/vllm/ci/prepare.sh new file mode 100644 index 00000000..000245a8 --- /dev/null +++ b/models/nlp/large_language_model/llama3-70b/vllm/ci/prepare.sh @@ -0,0 +1,26 @@ +#!/bin/bash +# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +set -x +ID=$(grep -oP '(?<=^ID=).+' /etc/os-release | tr -d '"') +if [[ ${ID} == "ubuntu" ]]; then + apt install -y libgl1-mesa-glx +elif [[ ${ID} == "centos" ]]; then + yum install -y mesa-libGL +else + echo "Not Support Os" +fi + diff --git a/models/nlp/large_language_model/llama3-70b/vllm/offline_inference.py b/models/nlp/large_language_model/llama3-70b/vllm/offline_inference.py index cdf635a7..6932fde6 100644 --- a/models/nlp/large_language_model/llama3-70b/vllm/offline_inference.py +++ b/models/nlp/large_language_model/llama3-70b/vllm/offline_inference.py @@ -151,6 +151,10 @@ if args.acc_test: print('val ROUGE-1 score f1: {}, target ROUGE-1 score f1: {}, fail'.format(scores[0]["rouge-1"]['f'],args.acc_threshold)) exit(1) print('val ROUGE-1 score f1: {}, target ROUGE-1 score f1: {}, pass'.format(scores[0]["rouge-1"]['f'],args.acc_threshold)) + metricResult = {"metricResult": {}} + metricResult["metricResult"]["val ROUGE-1 score f1"] = scores[0]["rouge-1"]['f'] + metricResult["metricResult"]["target ROUGE-1 score f1"] = args.acc_threshold + print(metricResult) # 2 7b vllm 0.1.6: batch 3, tokens: 773, QPS: 64.35866137433203; batch 1, tokens: 257, QPS: 25.396898421442113 # 1\2 13b vllm 0.1.6: batch 3, tokens: 768, QPS: 41.538942353799506; batch 1, tokens: 257, QPS: 15.639606595029639 (2, 6.5829828847570795; 8, 5.137610167755676) diff --git a/models/nlp/large_language_model/qwen-7b/text-generation-inference/README.md b/models/nlp/large_language_model/qwen-7b/text-generation-inference/README.md index 33b0aab2..729b9833 100644 --- a/models/nlp/large_language_model/qwen-7b/text-generation-inference/README.md +++ b/models/nlp/large_language_model/qwen-7b/text-generation-inference/README.md @@ -17,10 +17,6 @@ yum install -y mesa-libGL ## Ubuntu apt install -y libgl1-mesa-dev -# *star refer to a specified version -wget http://files.deepspark.org.cn:880/deepspark/add-ons/text-generation-*.whl -wget http://files.deepspark.org.cn:880/deepspark/add-ons/text-generation-server-*.whl -pip install tensorrt_llm-*.whl text-generation-server-*.whl ``` ### Download diff --git a/models/nlp/large_language_model/qwen-7b/text-generation-inference/ci/prepare.sh b/models/nlp/large_language_model/qwen-7b/text-generation-inference/ci/prepare.sh new file mode 100644 index 00000000..4b2fdf8b --- /dev/null +++ b/models/nlp/large_language_model/qwen-7b/text-generation-inference/ci/prepare.sh @@ -0,0 +1,29 @@ +#!/bin/bash +# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
+ +set -x +ID=$(grep -oP '(?<=^ID=).+' /etc/os-release | tr -d '"') +if [[ ${ID} == "ubuntu" ]]; then + apt install -y libgl1-mesa-glx +elif [[ ${ID} == "centos" ]]; then + yum install -y mesa-libGL +else + echo "Not Support Os" +fi + +mkdir -p data + +ln -s /mnt/deepspark/data/checkpoints/qwen-7B data/qwen-7B \ No newline at end of file diff --git a/models/nlp/large_language_model/qwen-7b/text-generation-inference/offline_inference.py b/models/nlp/large_language_model/qwen-7b/text-generation-inference/offline_inference.py index 57db6334..e3ebcc3a 100644 --- a/models/nlp/large_language_model/qwen-7b/text-generation-inference/offline_inference.py +++ b/models/nlp/large_language_model/qwen-7b/text-generation-inference/offline_inference.py @@ -109,6 +109,11 @@ if __name__ == "__main__": duration_time = end_time - start_time print(f"generate length: {generations_one[0].generated_text.generated_tokens}") print(f"one batch: {generations_one[0].generated_text.text}\nqps: {generations_one[0].generated_text.generated_tokens /duration_time}") + metricResult = {"metricResult": {}} + metricResult["metricResult"]["generate length"] = generations_one[0].generated_text.generated_tokens + metricResult["metricResult"]["one batch"] = generations_one[0].generated_text.text + metricResult["metricResult"]["qps"] = generations_one[0].generated_text.generated_tokens /duration_time + print(metricResult) """ qwen-7B diff --git a/models/nlp/large_language_model/qwen-7b/vllm/ci/prepare.sh b/models/nlp/large_language_model/qwen-7b/vllm/ci/prepare.sh new file mode 100644 index 00000000..cfd5031a --- /dev/null +++ b/models/nlp/large_language_model/qwen-7b/vllm/ci/prepare.sh @@ -0,0 +1,30 @@ +#!/bin/bash +# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +set -x +ID=$(grep -oP '(?<=^ID=).+' /etc/os-release | tr -d '"') +if [[ ${ID} == "ubuntu" ]]; then + apt install -y libgl1-mesa-glx +elif [[ ${ID} == "centos" ]]; then + yum install -y mesa-libGL +else + echo "Not Support Os" +fi + +# Contact the iluvatar manager to get adapted install packages of vllm, triton, and ixformer +pip3 install vllm +pip3 install triton +pip3 install ixformer \ No newline at end of file diff --git a/models/nlp/large_language_model/qwen-7b/vllm/offline_inference.py b/models/nlp/large_language_model/qwen-7b/vllm/offline_inference.py index 3b9e9fd8..5de14fb0 100644 --- a/models/nlp/large_language_model/qwen-7b/vllm/offline_inference.py +++ b/models/nlp/large_language_model/qwen-7b/vllm/offline_inference.py @@ -130,3 +130,7 @@ if __name__ == "__main__": num_tokens += len(output.outputs[0].token_ids) print(f"Prompt: {prompt}\nGenerated text: {generated_text} \n") print(f"tokens: {num_tokens}, QPS: {num_tokens/duration_time}") + metricResult = {"metricResult": {}} + metricResult["metricResult"]["tokens"] = num_tokens + metricResult["metricResult"]["QPS"] = round(num_tokens/duration_time,3) + print(metricResult) diff --git a/models/nlp/large_language_model/qwen1.5-14b/vllm/ci/prepare.sh b/models/nlp/large_language_model/qwen1.5-14b/vllm/ci/prepare.sh new file mode 100644 index 00000000..75fb1945 --- /dev/null +++ b/models/nlp/large_language_model/qwen1.5-14b/vllm/ci/prepare.sh @@ -0,0 +1,25 @@ +#!/bin/bash +# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +set -x +ID=$(grep -oP '(?<=^ID=).+' /etc/os-release | tr -d '"') +if [[ ${ID} == "ubuntu" ]]; then + apt install -y libgl1-mesa-glx +elif [[ ${ID} == "centos" ]]; then + yum install -y mesa-libGL +else + echo "Not Support Os" +fi diff --git a/models/nlp/large_language_model/qwen1.5-14b/vllm/offline_inference.py b/models/nlp/large_language_model/qwen1.5-14b/vllm/offline_inference.py index 7ee127a2..130f0885 100644 --- a/models/nlp/large_language_model/qwen1.5-14b/vllm/offline_inference.py +++ b/models/nlp/large_language_model/qwen1.5-14b/vllm/offline_inference.py @@ -108,3 +108,7 @@ for i, output in enumerate(outputs): num_tokens += len(output.outputs[0].token_ids) print(f"Prompt: {prompt}\nGenerated text: {generated_text} \n") print(f"tokens: {num_tokens}, QPS: {num_tokens/duration_time}") +metricResult = {"metricResult": {}} +metricResult["metricResult"]["tokens"] = num_tokens +metricResult["metricResult"]["QPS"] = round(num_tokens/duration_time,3) +print(metricResult) \ No newline at end of file diff --git a/models/nlp/large_language_model/qwen1.5-32b/vllm/ci/prepare.sh b/models/nlp/large_language_model/qwen1.5-32b/vllm/ci/prepare.sh new file mode 100644 index 00000000..cfd5031a --- /dev/null +++ b/models/nlp/large_language_model/qwen1.5-32b/vllm/ci/prepare.sh @@ -0,0 +1,30 @@ +#!/bin/bash +# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +set -x +ID=$(grep -oP '(?<=^ID=).+' /etc/os-release | tr -d '"') +if [[ ${ID} == "ubuntu" ]]; then + apt install -y libgl1-mesa-glx +elif [[ ${ID} == "centos" ]]; then + yum install -y mesa-libGL +else + echo "Not Support Os" +fi + +# Contact the iluvatar manager to get adapted install packages of vllm, triton, and ixformer +pip3 install vllm +pip3 install triton +pip3 install ixformer \ No newline at end of file diff --git a/models/nlp/large_language_model/qwen1.5-32b/vllm/offline_inference.py b/models/nlp/large_language_model/qwen1.5-32b/vllm/offline_inference.py index 5e859291..9799150f 100644 --- a/models/nlp/large_language_model/qwen1.5-32b/vllm/offline_inference.py +++ b/models/nlp/large_language_model/qwen1.5-32b/vllm/offline_inference.py @@ -129,3 +129,7 @@ if __name__ == "__main__": num_tokens += len(output.outputs[0].token_ids) print(f"Prompt: {prompt}\nGenerated text: {generated_text} \n") print(f"tokens: {num_tokens}, QPS: {num_tokens/duration_time}") + metricResult = {"metricResult": {}} + metricResult["metricResult"]["tokens"] = num_tokens + metricResult["metricResult"]["QPS"] = round(num_tokens/duration_time,3) + print(metricResult) \ No newline at end of file diff --git a/models/nlp/large_language_model/qwen1.5-72b/vllm/ci/prepare.sh b/models/nlp/large_language_model/qwen1.5-72b/vllm/ci/prepare.sh new file mode 100644 index 00000000..75fb1945 --- /dev/null +++ b/models/nlp/large_language_model/qwen1.5-72b/vllm/ci/prepare.sh @@ -0,0 +1,25 @@ +#!/bin/bash +# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +set -x +ID=$(grep -oP '(?<=^ID=).+' /etc/os-release | tr -d '"') +if [[ ${ID} == "ubuntu" ]]; then + apt install -y libgl1-mesa-glx +elif [[ ${ID} == "centos" ]]; then + yum install -y mesa-libGL +else + echo "Not Support Os" +fi diff --git a/models/nlp/large_language_model/qwen1.5-72b/vllm/offline_inference.py b/models/nlp/large_language_model/qwen1.5-72b/vllm/offline_inference.py index 7ee127a2..130f0885 100644 --- a/models/nlp/large_language_model/qwen1.5-72b/vllm/offline_inference.py +++ b/models/nlp/large_language_model/qwen1.5-72b/vllm/offline_inference.py @@ -108,3 +108,7 @@ for i, output in enumerate(outputs): num_tokens += len(output.outputs[0].token_ids) print(f"Prompt: {prompt}\nGenerated text: {generated_text} \n") print(f"tokens: {num_tokens}, QPS: {num_tokens/duration_time}") +metricResult = {"metricResult": {}} +metricResult["metricResult"]["tokens"] = num_tokens +metricResult["metricResult"]["QPS"] = round(num_tokens/duration_time,3) +print(metricResult) \ No newline at end of file diff --git a/models/nlp/large_language_model/qwen1.5-7b/text-generation-inference/ci/prepare.sh b/models/nlp/large_language_model/qwen1.5-7b/text-generation-inference/ci/prepare.sh new file mode 100644 index 00000000..c801677c --- /dev/null +++ b/models/nlp/large_language_model/qwen1.5-7b/text-generation-inference/ci/prepare.sh @@ -0,0 +1,29 @@ +#!/bin/bash +# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +set -x +ID=$(grep -oP '(?<=^ID=).+' /etc/os-release | tr -d '"') +if [[ ${ID} == "ubuntu" ]]; then + apt install -y libgl1-mesa-glx +elif [[ ${ID} == "centos" ]]; then + yum install -y mesa-libGL +else + echo "Not Support Os" +fi + +mkdir -p data + +ln -s /mnt/deepspark/data/checkpoints/Qwen1.5-7B data/Qwen1.5-7B \ No newline at end of file diff --git a/models/nlp/large_language_model/qwen1.5-7b/text-generation-inference/offline_inference.py b/models/nlp/large_language_model/qwen1.5-7b/text-generation-inference/offline_inference.py index b927973a..87f4df98 100644 --- a/models/nlp/large_language_model/qwen1.5-7b/text-generation-inference/offline_inference.py +++ b/models/nlp/large_language_model/qwen1.5-7b/text-generation-inference/offline_inference.py @@ -115,6 +115,11 @@ if __name__ == "__main__": duration_time = end_time - start_time print(f"generate length: {generations_one[0].generated_text.generated_tokens}") print(f"one batch: {generations_one[0].generated_text.text}\nqps: {generations_one[0].generated_text.generated_tokens /duration_time}") + metricResult = {"metricResult": {}} + metricResult["metricResult"]["generate length"] = generations_one[0].generated_text.generated_tokens + metricResult["metricResult"]["one batch"] = generations_one[0].generated_text.text + metricResult["metricResult"]["qps"] = generations_one[0].generated_text.generated_tokens /duration_time + print(metricResult) """ qwen1.5-0.5B diff --git a/models/nlp/large_language_model/qwen1.5-7b/vllm/ci/prepare.sh b/models/nlp/large_language_model/qwen1.5-7b/vllm/ci/prepare.sh new file mode 100644 index 00000000..75fb1945 --- /dev/null +++ b/models/nlp/large_language_model/qwen1.5-7b/vllm/ci/prepare.sh @@ -0,0 +1,25 @@ +#!/bin/bash +# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +set -x +ID=$(grep -oP '(?<=^ID=).+' /etc/os-release | tr -d '"') +if [[ ${ID} == "ubuntu" ]]; then + apt install -y libgl1-mesa-glx +elif [[ ${ID} == "centos" ]]; then + yum install -y mesa-libGL +else + echo "Not Support Os" +fi diff --git a/models/nlp/large_language_model/qwen1.5-7b/vllm/offline_inference.py b/models/nlp/large_language_model/qwen1.5-7b/vllm/offline_inference.py index 7ee127a2..bae01307 100644 --- a/models/nlp/large_language_model/qwen1.5-7b/vllm/offline_inference.py +++ b/models/nlp/large_language_model/qwen1.5-7b/vllm/offline_inference.py @@ -108,3 +108,7 @@ for i, output in enumerate(outputs): num_tokens += len(output.outputs[0].token_ids) print(f"Prompt: {prompt}\nGenerated text: {generated_text} \n") print(f"tokens: {num_tokens}, QPS: {num_tokens/duration_time}") +metricResult = {"metricResult": {}} +metricResult["metricResult"]["tokens"] = num_tokens +metricResult["metricResult"]["QPS"] = round(num_tokens/duration_time,3) +print(metricResult) diff --git a/models/nlp/large_language_model/qwen2-72b/vllm/ci/prepare.sh b/models/nlp/large_language_model/qwen2-72b/vllm/ci/prepare.sh new file mode 100644 index 00000000..cfd5031a --- /dev/null +++ b/models/nlp/large_language_model/qwen2-72b/vllm/ci/prepare.sh @@ -0,0 +1,30 @@ +#!/bin/bash +# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +set -x +ID=$(grep -oP '(?<=^ID=).+' /etc/os-release | tr -d '"') +if [[ ${ID} == "ubuntu" ]]; then + apt install -y libgl1-mesa-glx +elif [[ ${ID} == "centos" ]]; then + yum install -y mesa-libGL +else + echo "Not Support Os" +fi + +# Contact the iluvatar manager to get adapted install packages of vllm, triton, and ixformer +pip3 install vllm +pip3 install triton +pip3 install ixformer \ No newline at end of file diff --git a/models/nlp/large_language_model/qwen2-72b/vllm/offline_inference.py b/models/nlp/large_language_model/qwen2-72b/vllm/offline_inference.py index 5e859291..9799150f 100644 --- a/models/nlp/large_language_model/qwen2-72b/vllm/offline_inference.py +++ b/models/nlp/large_language_model/qwen2-72b/vllm/offline_inference.py @@ -129,3 +129,7 @@ if __name__ == "__main__": num_tokens += len(output.outputs[0].token_ids) print(f"Prompt: {prompt}\nGenerated text: {generated_text} \n") print(f"tokens: {num_tokens}, QPS: {num_tokens/duration_time}") + metricResult = {"metricResult": {}} + metricResult["metricResult"]["tokens"] = num_tokens + metricResult["metricResult"]["QPS"] = round(num_tokens/duration_time,3) + print(metricResult) \ No newline at end of file diff --git a/models/nlp/large_language_model/qwen2-7b/vllm/ci/prepare.sh b/models/nlp/large_language_model/qwen2-7b/vllm/ci/prepare.sh new file mode 100644 index 00000000..cfd5031a --- /dev/null +++ b/models/nlp/large_language_model/qwen2-7b/vllm/ci/prepare.sh @@ -0,0 +1,30 @@ +#!/bin/bash +# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +set -x +ID=$(grep -oP '(?<=^ID=).+' /etc/os-release | tr -d '"') +if [[ ${ID} == "ubuntu" ]]; then + apt install -y libgl1-mesa-glx +elif [[ ${ID} == "centos" ]]; then + yum install -y mesa-libGL +else + echo "Not Support Os" +fi + +# Contact the iluvatar manager to get adapted install packages of vllm, triton, and ixformer +pip3 install vllm +pip3 install triton +pip3 install ixformer \ No newline at end of file diff --git a/models/nlp/large_language_model/qwen2-7b/vllm/offline_inference.py b/models/nlp/large_language_model/qwen2-7b/vllm/offline_inference.py index 5e859291..9799150f 100644 --- a/models/nlp/large_language_model/qwen2-7b/vllm/offline_inference.py +++ b/models/nlp/large_language_model/qwen2-7b/vllm/offline_inference.py @@ -129,3 +129,7 @@ if __name__ == "__main__": num_tokens += len(output.outputs[0].token_ids) print(f"Prompt: {prompt}\nGenerated text: {generated_text} \n") print(f"tokens: {num_tokens}, QPS: {num_tokens/duration_time}") + metricResult = {"metricResult": {}} + metricResult["metricResult"]["tokens"] = num_tokens + metricResult["metricResult"]["QPS"] = round(num_tokens/duration_time,3) + print(metricResult) \ No newline at end of file diff --git a/models/nlp/large_language_model/stablelm/vllm/ci/prepare.sh b/models/nlp/large_language_model/stablelm/vllm/ci/prepare.sh new file mode 100644 index 00000000..6d814558 --- /dev/null +++ b/models/nlp/large_language_model/stablelm/vllm/ci/prepare.sh @@ -0,0 +1,27 @@ +#!/bin/bash +# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +set -x +ID=$(grep -oP '(?<=^ID=).+' /etc/os-release | tr -d '"') +if [[ ${ID} == "ubuntu" ]]; then + apt install -y libgl1-mesa-glx +elif [[ ${ID} == "centos" ]]; then + yum install -y mesa-libGL +else + echo "Not Support Os" +fi + +pip3 install transformers \ No newline at end of file diff --git a/models/nlp/large_language_model/stablelm/vllm/offline_inference.py b/models/nlp/large_language_model/stablelm/vllm/offline_inference.py index 40678a62..e9f2abfb 100644 --- a/models/nlp/large_language_model/stablelm/vllm/offline_inference.py +++ b/models/nlp/large_language_model/stablelm/vllm/offline_inference.py @@ -132,4 +132,8 @@ if __name__ == "__main__": num_tokens += len(output.outputs[0].token_ids) print(f"Prompt: {prompt}\nGenerated text: {generated_text} \n") - print(f"tokens: {num_tokens}, QPS: {num_tokens/duration_time}") \ No newline at end of file + print(f"tokens: {num_tokens}, QPS: {num_tokens/duration_time}") + metricResult = {"metricResult": {}} + metricResult["metricResult"]["tokens"] = num_tokens + metricResult["metricResult"]["QPS"] = round(num_tokens/duration_time,3) + print(metricResult) \ No newline at end of file diff --git a/tests/models_trtllm.yaml b/tests/models_trtllm.yaml new file mode 100644 index 00000000..de21908e --- /dev/null +++ b/tests/models_trtllm.yaml @@ -0,0 +1,41 @@ +--- +- datasets: https://localhost + download_url: https://huggingface.co/meta-llama/llama2-7b-chat + name: llama2-7b + need_third_part: false + precisions: + - fp16 + relative_path: models/nlp/large_language_model/llama2-7b/trtllm + task_type: nlp/large_language_model +- datasets: https://localhost + download_url: https://localhost/llama2-13b-chat + name: llama2-13b + need_third_part: false + precisions: + - fp16 + relative_path: models/nlp/large_language_model/llama2-13b/trtllm + task_type: nlp/large_language_model +- datasets: https://localhost + download_url: https://localhost/llama2-70b-chat + name: llama2-70b + need_third_part: false + precisions: + - fp16 + relative_path: models/nlp/large_language_model/llama2-70b/trtllm + task_type: nlp/large_language_model +- datasets: https://localhost + download_url: https://localhost/qwen-7B + name: qwen-7b + need_third_part: false + precisions: + - fp16 + relative_path: models/nlp/large_language_model/qwen-7b/text-generation-inference + task_type: nlp/large_language_model +- datasets: https://localhost + download_url: https://modelscope.cn/models/qwen/Qwen1.5-7B + name: qwen1.5-7b + need_third_part: false + precisions: + - fp16 + relative_path: models/nlp/large_language_model/qwen1.5-7b/text-generation-inference + task_type: nlp/large_language_model diff --git a/tests/models_vllm.yaml b/tests/models_vllm.yaml new file mode 100644 index 00000000..6d7177ea --- /dev/null +++ b/tests/models_vllm.yaml @@ -0,0 +1,106 @@ +--- +- datasets: https://localhost + download_url: https://huggingface.co/baichuan-inc/Baichuan2-7B-Base + name: baichuan2-7b + need_third_part: false + precisions: + - fp16 + - int8 + relative_path: models/nlp/large_language_model/baichuan2-7b/vllm + task_type: nlp/large_language_model +- datasets: https://localhost + download_url: https://huggingface.co/THUDM/chatglm3-6b + name: chatglm3-6b + need_third_part: false + precisions: + - fp16 + relative_path: models/nlp/large_language_model/chatglm3-6b/vllm + task_type: nlp/large_language_model +- datasets: https://localhost + download_url: https://www.modelscope.cn/models/ZhipuAI/chatglm3-6b-32k-无 + name: chatglm3-6b-32k + need_third_part: false + precisions: + - fp16 + relative_path: 
models/nlp/large_language_model/chatglm3-6b-32k/vllm + task_type: nlp/large_language_model +- datasets: https://localhost + download_url: https://localhost/llama2-7b + name: llama2-7b + need_third_part: false + precisions: + - fp16 + relative_path: models/nlp/large_language_model/llama2-7b/vllm + task_type: nlp/large_language_model +- datasets: https://localhost + download_url: https://localhost/Meta-Llama-3-70B-Instruct + name: llama3-70b + need_third_part: false + precisions: + - fp16 + relative_path: models/nlp/large_language_model/llama3-70b/vllm + task_type: nlp/large_language_model +- datasets: https://localhost + download_url: https://localhost/qwen-7B + name: qwen-7b + need_third_part: false + precisions: + - fp16 + relative_path: models/nlp/large_language_model/qwen-7b/vllm + task_type: nlp/large_language_model +- datasets: https://localhost + download_url: https://modelscope.cn/models/qwen/Qwen1.5-7B + name: qwen1.5-7b + need_third_part: false + precisions: + - fp16 + relative_path: models/nlp/large_language_model/qwen1.5-7b/vllm + task_type: nlp/large_language_model +- datasets: https://localhost + download_url: https://modelscope.cn/models/qwen/Qwen1.5-14B + name: qwen1.5-14b + need_third_part: false + precisions: + - fp16 + relative_path: models/nlp/large_language_model/qwen1.5-14b/vllm + task_type: nlp/large_language_model +- datasets: https://localhost + download_url: https://modelscope.cn/models/Qwen/Qwen1.5-32B-Chat + name: qwen1.5-32b + need_third_part: false + precisions: + - fp16 + relative_path: models/nlp/large_language_model/qwen1.5-32b/vllm + task_type: nlp/large_language_model +- datasets: https://localhost + download_url: https://modelscope.cn/models/qwen/Qwen1.5-72B + name: qwen1.5-72b + need_third_part: false + precisions: + - fp16 + relative_path: models/nlp/large_language_model/qwen1.5-72b/vllm + task_type: nlp/large_language_model +- datasets: https://localhost + download_url: https://modelscope.cn/models/Qwen/Qwen2-7B-Instruct + name: qwen2-7b + need_third_part: false + precisions: + - fp16 + relative_path: models/nlp/large_language_model/qwen2-7b/vllm + task_type: nlp/large_language_model +- datasets: https://localhost + download_url: https://localhost/Qwen2-72B + name: qwen2-72b + need_third_part: false + precisions: + - fp16 + relative_path: models/nlp/large_language_model/qwen2-72b/vllm + task_type: nlp/large_language_model +- datasets: https://localhost + download_url: https://huggingface.co/stabilityai/stablelm-2-1_6b + name: stablelm + need_third_part: false + precisions: + - fp16 + relative_path: models/nlp/large_language_model/stablelm/vllm + task_type: nlp/large_language_model diff --git a/tests/run_trtllm.py b/tests/run_trtllm.py new file mode 100644 index 00000000..9f8a494f --- /dev/null +++ b/tests/run_trtllm.py @@ -0,0 +1,189 @@ +# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
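+
+# run_trtllm.py drives one TensorRT-LLM / TGI CI case end to end: it looks up
+# the requested model in the YAML model list, runs the model's ci/prepare.sh,
+# then the per-model build/run scripts (or offline_inference.py for the TGI
+# cases), and collects the printed {'metricResult': ...} dicts from stdout to
+# fill the PASS/FAIL report.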
+ +import yaml +import subprocess +import json +import re +import time +import logging +import os +import sys +import argparse + +import utils + +# Configure logging +debug_level = logging.DEBUG if utils.is_debug() else logging.INFO +logging.basicConfig( + handlers=[logging.FileHandler("output.log"), logging.StreamHandler()], + level=debug_level, + format="%(asctime)s - %(levelname)s - %(message)s", +) + +METRIC_PATTERN = r"{'metricResult':.*}" + +def main(): + parser = argparse.ArgumentParser(description="") + parser.add_argument("--model", type=str, help="model name, e.g.: llama2-7b") + args = parser.parse_args() + + if args.model: + test_model = args.model + else: + test_model = os.environ.get("TEST_CASE") + logging.info(f"Test case to run: {test_model}") + if not test_model: + logging.error("test model case is empty") + sys.exit(-1) + + model = get_model_config(test_model) + if not model: + logging.error("model config is empty") + sys.exit(-1) + + result = {} + # NLP models + if model["task_type"] in ["nlp/large_language_model"]: + logging.info(f"Start running {model['name']} test case:\n{json.dumps(model, indent=4)}") + d_url = model["download_url"] + if d_url is not None: + result = run_nlp_testcase(model) + check_model_result(result) + logging.debug(f"The result of {model['name']} is\n{json.dumps(result, indent=4)}") + logging.info(f"End running {model['name']} test case.") + + logging.info(f"Full text result: {result}") + +def get_model_config(mode_name): + with open("models_igie.yaml", "r") as file: + models = yaml.safe_load(file) + + for model in models: + if model["name"] == mode_name.lower(): + return model + return + +def check_model_result(result): + status = "PASS" + for prec in ["fp16", "int8"]: + if prec in result["result"]: + if result["result"][prec]["status"] == "FAIL": + status = "FAIL" + break + result["status"] = status + +def run_nlp_testcase(model): + model_name = model["name"] + result = { + "name": model_name, + "result": {}, + } + d_url = model["download_url"] + checkpoint_n = d_url.split("/")[-1] + dataset_n = model["datasets"].split("/")[-1] + prepare_script = f""" + set -x + cd ../{model['relative_path']} + bash ci/prepare.sh + """ + + # add pip list info when in debug mode + if utils.is_debug(): + pip_list_script = "pip list | grep -E 'numpy|transformer|igie|mmcv|onnx'\n" + prepare_script = pip_list_script + prepare_script + pip_list_script + + run_script(prepare_script) + + for prec in model["precisions"]: + logging.info(f"Start running {model_name} {prec} test case") + script = f""" + set -x + cd ../{model['relative_path']} + """ + if model_name == "llama2-7b": + script = f""" + set -x + cd ../{model['relative_path']} + bash scripts/test_trtllm_llama2_7b_gpu1_build.sh + bash scripts/test_trtllm_llama2_7b_gpu1.sh + """ + elif model_name == "llama2-13b": + script = f""" + set -x + cd ../{model['relative_path']} + export CUDA_VISIBLE_DEVICES=0,1 + bash scripts/test_trtllm_llama2_13b_gpu2_build.sh + bash scripts/test_trtllm_llama2_13b_gpu2.sh + """ + elif model_name == "llama2-70b": + script = f""" + set -x + cd ../{model['relative_path']} + export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 + bash scripts/test_trtllm_llama2_70b_gpu8_build.sh + bash scripts/test_trtllm_llama2_70b_gpu8.sh + """ + elif model_name == "qwen-7b": + script = f""" + set -x + cd ../{model['relative_path']} + export CUDA_VISIBLE_DEVICES=1 + python3 offline_inference.py --model2path ./data/qwen-7B + """ + elif model_name == "qwen1.5-7b": + script = f""" + set -x + cd ../{model['relative_path']} + export
CUDA_VISIBLE_DEVICES=1 + python3 offline_inference.py --model2path ./data/Qwen1.5-7B + """ + + r, t = run_script(script) + sout = r.stdout + + pattern = METRIC_PATTERN + matchs = re.findall(pattern, sout) + result["result"].setdefault(prec, {"status": "FAIL"}) + logging.debug(f"matchs:\n{matchs}") + for m in matchs: + result["result"][prec].update(get_metric_result(m)) + if len(matchs) == 2: + result["result"][prec]["status"] = "PASS" + + result["result"][prec]["Cost time (s)"] = t + return result + +def get_metric_result(str): + if str: + return json.loads(str.replace("'", "\""))["metricResult"] + return None + +def run_script(script): + start_time = time.perf_counter() + result = subprocess.run( + script, shell=True, capture_output=True, text=True, executable="/bin/bash" + ) + end_time = time.perf_counter() + execution_time = end_time - start_time + logging.debug(f"Command executed:\n{script}") + logging.debug("Execution time: {:.4f} s".format(execution_time)) + logging.debug(f"stdout: {result.stdout}") + logging.debug(f"stderr: {result.stderr}") + logging.debug(f"Return code: {result.returncode}") + return result, execution_time + +if __name__ == "__main__": + main() diff --git a/tests/run_vllm.py b/tests/run_vllm.py new file mode 100644 index 00000000..6d212c95 --- /dev/null +++ b/tests/run_vllm.py @@ -0,0 +1,315 @@ +# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
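+
+# run_vllm.py drives one vLLM CI case end to end: it looks up the requested
+# model in the YAML model list, runs the model's ci/prepare.sh, launches
+# offline_inference.py with per-model arguments, and collects the printed
+# {'metricResult': ...} dicts from stdout to fill the PASS/FAIL report.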
+ +import yaml +import subprocess +import json +import re +import time +import logging +import os +import sys +import argparse + +import utils + +# Configure logging +debug_level = logging.DEBUG if utils.is_debug() else logging.INFO +logging.basicConfig( + handlers=[logging.FileHandler("output.log"), logging.StreamHandler()], + level=debug_level, + format="%(asctime)s - %(levelname)s - %(message)s", +) + +METRIC_PATTERN = r"{'metricResult':.*}" + +def main(): + parser = argparse.ArgumentParser(description="") + parser.add_argument("--model", type=str, help="model name, e.g.: llama2-7b") + args = parser.parse_args() + + if args.model: + test_model = args.model + else: + test_model = os.environ.get("TEST_CASE") + logging.info(f"Test case to run: {test_model}") + if not test_model: + logging.error("test model case is empty") + sys.exit(-1) + + model = get_model_config(test_model) + if not model: + logging.error("model config is empty") + sys.exit(-1) + + result = {} + # NLP models + if model["task_type"] in ["nlp/large_language_model"]: + logging.info(f"Start running {model['name']} test case:\n{json.dumps(model, indent=4)}") + d_url = model["download_url"] + if d_url is not None: + result = run_nlp_testcase(model) + check_model_result(result) + logging.debug(f"The result of {model['name']} is\n{json.dumps(result, indent=4)}") + logging.info(f"End running {model['name']} test case.") + + logging.info(f"Full text result: {result}") + +def get_model_config(mode_name): + with open("models_igie.yaml", "r") as file: + models = yaml.safe_load(file) + + for model in models: + if model["name"] == mode_name.lower(): + return model + return + +def check_model_result(result): + status = "PASS" + for prec in ["fp16", "int8"]: + if prec in result["result"]: + if result["result"][prec]["status"] == "FAIL": + status = "FAIL" + break + result["status"] = status + +def run_nlp_testcase(model): + model_name = model["name"] + result = { + "name": model_name, + "result": {}, + } + d_url = model["download_url"] + checkpoint_n = d_url.split("/")[-1] + dataset_n = model["datasets"].split("/")[-1] + prepare_script = f""" + set -x + cd ../{model['relative_path']} + ln -s /mnt/deepspark/data/checkpoints/{checkpoint_n} ./model_name + bash ci/prepare.sh + """ + + # add pip list info when in debug mode + if utils.is_debug(): + pip_list_script = "pip list | grep -E 'numpy|transformer|igie|mmcv|onnx'\n" + prepare_script = pip_list_script + prepare_script + pip_list_script + + run_script(prepare_script) + + for prec in model["precisions"]: + logging.info(f"Start running {model_name} {prec} test case") + script = f""" + set -x + cd ../{model['relative_path']} + """ + if model_name == "baichuan2-7b": + script = f""" + set -x + cd ../{model['relative_path']} + python3 offline_inference.py --model ./baichuan2-7b/ --max-tokens 256 --trust-remote-code --chat_template template_baichuan.jinja --temperature 0.0 + """ + if prec == "int8": + script = f""" + set -x + cd ../{model['relative_path']} + python3 offline_inference.py --model ./baichuan2-7b/int8/ --chat_template template_baichuan.jinja --quantization w8a16 --max-num-seqs 1 --max-model-len 256 --trust-remote-code --temperature 0.0 --max-tokens 256 + """ + elif model_name == "chatglm3-6b": + script = f""" + set -x + cd ../{model['relative_path']} + python3 offline_inference.py --model ./chatglm3-6b --trust-remote-code --temperature 0.0 --max-tokens 256 + """ + elif model_name == "chatglm3-6b-32k": + script = f""" + set -x + cd ../{model['relative_path']} + python3 offline_inference.py --model
./chatglm3-6b-32k --trust-remote-code --temperature 0.0 --max-tokens 256 + """ + elif model_name == "llama2-7b": + script = f""" + set -x + cd ../{model['relative_path']} + python3 offline_inference.py --model ./llama2-7b --max-tokens 256 -tp 1 --temperature 0.0 + """ + elif model_name == "llama3-70b": + script = f""" + set -x + cd ../{model['relative_path']} + export CUDA_VISIBLE_DEVICES=0,1,2,3 + python3 offline_inference.py --model ./llama3-70b --max-tokens 256 -tp 4 --temperature 0.0 + """ + elif model_name == "qwen-7b": + script = f""" + set -x + cd ../{model['relative_path']} + export CUDA_VISIBLE_DEVICES=0,1 + python3 offline_inference.py --model ./qwen-7b --max-tokens 256 -tp 2 --trust-remote-code --temperature 0.0 + """ + elif model_name == "qwen1.5-7b": + script = f""" + set -x + cd ../{model['relative_path']} + python3 offline_inference.py --model ./qwen1.5-7b --max-tokens 256 -tp 1 --temperature 0.0 --max-model-len 3096 + """ + elif model_name == "qwen1.5-7b": + script = f""" + set -x + cd ../{model['relative_path']} + python3 offline_inference.py --model ./qwen1.5-7b --max-tokens 256 -tp 1 --temperature 0.0 --max-model-len 3096 + """ + elif model_name == "qwen1.5-14b": + script = f""" + set -x + cd ../{model['relative_path']} + python3 offline_inference.py --model ./qwen1.5-14b --max-tokens 256 -tp 1 --temperature 0.0 --max-model-len 1024 + """ + elif model_name == "qwen1.5-32b": + script = f""" + set -x + cd ../{model['relative_path']} + export CUDA_VISIBLE_DEVICES=0,1,2,3 + python3 offline_inference.py --model ./qwen1.5-32b --max-tokens 256 -tp 4 --temperature 0.0 + """ + elif model_name == "qwen1.5-72b": + script = f""" + set -x + cd ../{model['relative_path']} + export CUDA_VISIBLE_DEVICES=0,1 + python3 offline_inference.py --model ./qwen1.5-72b --max-tokens 256 -tp 2 --temperature 0.0 --max-model-len 3096 + """ + elif model_name == "qwen2-7b": + script = f""" + set -x + cd ../{model['relative_path']} + export CUDA_VISIBLE_DEVICES=0 + python3 offline_inference.py --model ./qwen2-7b --max-tokens 256 -tp 1 --temperature 0.0 + """ + elif model_name == "qwen2-72b": + script = f""" + set -x + cd ../{model['relative_path']} + export CUDA_VISIBLE_DEVICES=0,1,2,3 + python3 offline_inference.py --model ./qwen2-72b --max-tokens 256 -tp 4 --temperature 0.0 --gpu-memory-utilization 0.98 --max-model-len 32768 + """ + elif model_name == "stablelm": + script = f""" + set -x + cd ../{model['relative_path']} + export CUDA_VISIBLE_DEVICES=0,1 + python3 offline_inference.py --model ./stablelm --max-tokens 256 -tp 1 --temperature 0.0 + """ + + r, t = run_script(script) + sout = r.stdout + + pattern = METRIC_PATTERN + matchs = re.findall(pattern, sout) + result["result"].setdefault(prec, {"status": "FAIL"}) + logging.debug(f"matchs:\n{matchs}") + for m in matchs: + result["result"][prec].update(get_metric_result(m)) + if len(matchs) == 2: + result["result"][prec]["status"] = "PASS" + + result["result"][prec]["Cost time (s)"] = t + return result + +def run_speech_testcase(model): + model_name = model["name"] + result = { + "name": model_name, + "result": {}, + } + d_url = model["download_url"] + checkpoint_n = d_url.split("/")[-1] + dataset_n = model["datasets"].split("/")[-1] + prepare_script = f""" + cd ../{model['relative_path']} + ln -s /mnt/deepspark/data/checkpoints/{checkpoint_n} ./ + ln -s /mnt/deepspark/data/datasets/{dataset_n} ./ + """ + + if model["need_third_part"] and model_name == "conformer": + prepare_script += "unzip /mnt/deepspark/data/3rd_party/kenlm.zip -d 
./ctc_decoder/swig/kenlm\n" + prepare_script += "unzip /mnt/deepspark/data/3rd_party/ThreadPool.zip -d ./ctc_decoder/swig/ThreadPool\n" + prepare_script += "tar -xzvf /mnt/deepspark/data/3rd_party/openfst-1.6.3.tar.gz -C ./ctc_decoder/swig/\n" + + prepare_script += """ + export PYTHONPATH=`pwd`/wenet:$PYTHONPATH + echo $PYTHONPATH + bash ci/prepare.sh + ls -l | grep onnx + """ + + # add pip list info when in debug mode + if utils.is_debug(): + pip_list_script = "pip list | grep -E 'numpy|transformer|igie|mmcv|onnx'\n" + prepare_script = pip_list_script + prepare_script + pip_list_script + + run_script(prepare_script) + + for prec in model["precisions"]: + logging.info(f"Start running {model_name} {prec} test case") + script = f""" + cd ../{model['relative_path']} + export PYTHONPATH=./wenet:$PYTHONPATH + echo $PYTHONPATH + bash scripts/infer_{model_name}_{prec}_accuracy.sh + bash scripts/infer_{model_name}_{prec}_performance.sh + """ + + r, t = run_script(script) + sout = r.stdout + pattern = r"\* ([\w\d ]+):\s*([\d.]+)[ ms%]*, ([\w\d ]+):\s*([\d.]+)[ ms%]*" + matchs = re.findall(pattern, sout) + for m in matchs: + result["result"].setdefault(prec, {"status": "FAIL"}) + try: + result["result"][prec] = result["result"][prec] | {m[0]: float(m[1]), m[2]: float(m[3])} + except ValueError: + print("The string cannot be converted to a float.") + result["result"][prec] = result["result"][prec] | {m[0]: m[1], m[2]: m[3]} + pattern = METRIC_PATTERN + matchs = re.findall(pattern, sout) + if matchs and len(matchs) == 1: + result["result"].setdefault(prec, {}) + result["result"][prec].update(get_metric_result(matchs[0])) + result["result"][prec]["status"] = "PASS" + result["result"][prec]["Cost time (s)"] = t + logging.debug(f"matchs:\n{matchs}") + return result + +def get_metric_result(str): + if str: + return json.loads(str.replace("'", "\""))["metricResult"] + return None + +def run_script(script): + start_time = time.perf_counter() + result = subprocess.run( + script, shell=True, capture_output=True, text=True, executable="/bin/bash" + ) + end_time = time.perf_counter() + execution_time = end_time - start_time + logging.debug(f"执行命令:\n{script}") + logging.debug("执行时间: {:.4f} 秒".format(execution_time)) + logging.debug(f"标准输出: {result.stdout}") + logging.debug(f"标准错误: {result.stderr}") + logging.debug(f"返回码: {result.returncode}") + return result, execution_time + +if __name__ == "__main__": + main() -- Gitee From 96488331904933684f3dfea52c05cf6a959e43ad Mon Sep 17 00:00:00 2001 From: "hongliang.yuan" Date: Wed, 22 Jan 2025 11:05:44 +0800 Subject: [PATCH 2/6] update vllm --- tests/run_trtllm.py | 2 +- tests/run_vllm.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/run_trtllm.py b/tests/run_trtllm.py index 9f8a494f..911ac6fd 100644 --- a/tests/run_trtllm.py +++ b/tests/run_trtllm.py @@ -68,7 +68,7 @@ def main(): logging.info(f"Full text result: {result}") def get_model_config(mode_name): - with open("models_igie.yaml", "r") as file: + with open("models_trtllm.yaml", "r") as file: models = yaml.safe_load(file) for model in models: diff --git a/tests/run_vllm.py b/tests/run_vllm.py index 6d212c95..96498c2a 100644 --- a/tests/run_vllm.py +++ b/tests/run_vllm.py @@ -68,7 +68,7 @@ def main(): logging.info(f"Full text result: {result}") def get_model_config(mode_name): - with open("models_igie.yaml", "r") as file: + with open("models_vllm.yaml", "r") as file: models = yaml.safe_load(file) for model in models: -- Gitee From d8dcde4db9c6124ee6ea226530ee4d51b023523e Mon Sep 17 
00:00:00 2001 From: "hongliang.yuan" Date: Wed, 22 Jan 2025 11:36:52 +0800 Subject: [PATCH 3/6] fix model name --- tests/run_vllm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/run_vllm.py b/tests/run_vllm.py index 96498c2a..bc15c268 100644 --- a/tests/run_vllm.py +++ b/tests/run_vllm.py @@ -97,7 +97,7 @@ def run_nlp_testcase(model): prepare_script = f""" set -x cd ../{model['relative_path']} - ln -s /mnt/deepspark/data/checkpoints/{checkpoint_n} ./model_name + ln -s /mnt/deepspark/data/checkpoints/{checkpoint_n} ./{model_name} bash ci/prepare.sh """ -- Gitee From ae6a10adca6259aff74da0c67c2d9ce3016185ca Mon Sep 17 00:00:00 2001 From: "hongliang.yuan" Date: Wed, 22 Jan 2025 14:29:43 +0800 Subject: [PATCH 4/6] update vllm --- .../baichuan2-7b/vllm/ci/prepare.sh | 2 +- .../chatglm3-6b-32k/vllm/offline_inference.py | 2 +- .../qwen1.5-14b/vllm/README.md | 2 +- tests/models_vllm.yaml | 2 +- tests/run_vllm.py | 69 +------------------ 5 files changed, 6 insertions(+), 71 deletions(-) diff --git a/models/nlp/large_language_model/baichuan2-7b/vllm/ci/prepare.sh b/models/nlp/large_language_model/baichuan2-7b/vllm/ci/prepare.sh index 5b9abbd3..54b66b4e 100644 --- a/models/nlp/large_language_model/baichuan2-7b/vllm/ci/prepare.sh +++ b/models/nlp/large_language_model/baichuan2-7b/vllm/ci/prepare.sh @@ -29,4 +29,4 @@ pip3 install transformers==4.37.1 # has prepared in ci # ln -s /mnt/deepspark/data/checkpoints/Baichuan2-7B-Base ./ -python3 convert2int8.py --model-path ./Baichuan2-7B-Base/ \ No newline at end of file +python3 convert2int8.py --model-path ./baichuan2-7b/ \ No newline at end of file diff --git a/models/nlp/large_language_model/chatglm3-6b-32k/vllm/offline_inference.py b/models/nlp/large_language_model/chatglm3-6b-32k/vllm/offline_inference.py index bc731079..7fc45b68 100644 --- a/models/nlp/large_language_model/chatglm3-6b-32k/vllm/offline_inference.py +++ b/models/nlp/large_language_model/chatglm3-6b-32k/vllm/offline_inference.py @@ -57,7 +57,7 @@ if __name__ == "__main__": model_name = model_name.rsplit("/")[-1] # Sample prompts. - prompts = ["Щܱһھ?", "һ֥ʿ", "дһƪй5Gз¡"] + prompts = ["哪些迹象可能表明一个人正在经历焦虑?", "描述一下如何制作芝士披萨。", "写一篇有关5G网络研发的综述文章。"] # Create a sampling params object. 
sampling_params = SamplingParams(**sampling_params) diff --git a/models/nlp/large_language_model/qwen1.5-14b/vllm/README.md b/models/nlp/large_language_model/qwen1.5-14b/vllm/README.md index b3c67597..905967c2 100644 --- a/models/nlp/large_language_model/qwen1.5-14b/vllm/README.md +++ b/models/nlp/large_language_model/qwen1.5-14b/vllm/README.md @@ -29,7 +29,7 @@ ln -s /path/to/Qwen1.5-14B ./data/qwen1.5 ## Inference ```bash -python3 offline_inference.py --model ./data/qwen1.5/Qwen1.5-14B --max-tokens 256 -tp 1 --temperature 0.0 --max-model-len 1024 +python3 offline_inference.py --model ./data/qwen1.5/Qwen1.5-14B --max-tokens 256 -tp 1 --temperature 0.0 --max-model-len 896 ``` ## Results diff --git a/tests/models_vllm.yaml b/tests/models_vllm.yaml index 6d7177ea..821733f6 100644 --- a/tests/models_vllm.yaml +++ b/tests/models_vllm.yaml @@ -17,7 +17,7 @@ relative_path: models/nlp/large_language_model/chatglm3-6b/vllm task_type: nlp/large_language_model - datasets: https://localhost - download_url: https://www.modelscope.cn/models/ZhipuAI/chatglm3-6b-32k-无 + download_url: https://www.modelscope.cn/models/ZhipuAI/chatglm3-6b-32k name: chatglm3-6b-32k need_third_part: false precisions: diff --git a/tests/run_vllm.py b/tests/run_vllm.py index bc15c268..488465ba 100644 --- a/tests/run_vllm.py +++ b/tests/run_vllm.py @@ -174,7 +174,7 @@ def run_nlp_testcase(model): script = f""" set -x cd ../{model['relative_path']} - python3 offline_inference.py --model ./qwen1.5-14b --max-tokens 256 -tp 1 --temperature 0.0 --max-model-len 1024 + python3 offline_inference.py --model ./qwen1.5-14b --max-tokens 256 -tp 1 --temperature 0.0 --max-model-len 896 """ elif model_name == "qwen1.5-32b": script = f""" @@ -221,77 +221,12 @@ def run_nlp_testcase(model): logging.debug(f"matchs:\n{matchs}") for m in matchs: result["result"][prec].update(get_metric_result(m)) - if len(matchs) == 2: + if len(matchs) == 1: result["result"][prec]["status"] = "PASS" result["result"][prec]["Cost time (s)"] = t return result -def run_speech_testcase(model): - model_name = model["name"] - result = { - "name": model_name, - "result": {}, - } - d_url = model["download_url"] - checkpoint_n = d_url.split("/")[-1] - dataset_n = model["datasets"].split("/")[-1] - prepare_script = f""" - cd ../{model['relative_path']} - ln -s /mnt/deepspark/data/checkpoints/{checkpoint_n} ./ - ln -s /mnt/deepspark/data/datasets/{dataset_n} ./ - """ - - if model["need_third_part"] and model_name == "conformer": - prepare_script += "unzip /mnt/deepspark/data/3rd_party/kenlm.zip -d ./ctc_decoder/swig/kenlm\n" - prepare_script += "unzip /mnt/deepspark/data/3rd_party/ThreadPool.zip -d ./ctc_decoder/swig/ThreadPool\n" - prepare_script += "tar -xzvf /mnt/deepspark/data/3rd_party/openfst-1.6.3.tar.gz -C ./ctc_decoder/swig/\n" - - prepare_script += """ - export PYTHONPATH=`pwd`/wenet:$PYTHONPATH - echo $PYTHONPATH - bash ci/prepare.sh - ls -l | grep onnx - """ - - # add pip list info when in debug mode - if utils.is_debug(): - pip_list_script = "pip list | grep -E 'numpy|transformer|igie|mmcv|onnx'\n" - prepare_script = pip_list_script + prepare_script + pip_list_script - - run_script(prepare_script) - - for prec in model["precisions"]: - logging.info(f"Start running {model_name} {prec} test case") - script = f""" - cd ../{model['relative_path']} - export PYTHONPATH=./wenet:$PYTHONPATH - echo $PYTHONPATH - bash scripts/infer_{model_name}_{prec}_accuracy.sh - bash scripts/infer_{model_name}_{prec}_performance.sh - """ - - r, t = run_script(script) - sout = r.stdout - 
pattern = r"\* ([\w\d ]+):\s*([\d.]+)[ ms%]*, ([\w\d ]+):\s*([\d.]+)[ ms%]*" - matchs = re.findall(pattern, sout) - for m in matchs: - result["result"].setdefault(prec, {"status": "FAIL"}) - try: - result["result"][prec] = result["result"][prec] | {m[0]: float(m[1]), m[2]: float(m[3])} - except ValueError: - print("The string cannot be converted to a float.") - result["result"][prec] = result["result"][prec] | {m[0]: m[1], m[2]: m[3]} - pattern = METRIC_PATTERN - matchs = re.findall(pattern, sout) - if matchs and len(matchs) == 1: - result["result"].setdefault(prec, {}) - result["result"][prec].update(get_metric_result(matchs[0])) - result["result"][prec]["status"] = "PASS" - result["result"][prec]["Cost time (s)"] = t - logging.debug(f"matchs:\n{matchs}") - return result - def get_metric_result(str): if str: return json.loads(str.replace("'", "\""))["metricResult"] -- Gitee From c25c728f8d5514c75d29adb0fd545f8298787b34 Mon Sep 17 00:00:00 2001 From: "hongliang.yuan" Date: Wed, 22 Jan 2025 15:52:57 +0800 Subject: [PATCH 5/6] update vllm --- README.md | 2 +- .../minicpm-v-2}/vllm/README.md | 0 .../minicpm-v-2/vllm/ci/prepare.sh | 31 +++++++++++++++++++ .../minicpm-v-2}/vllm/minicpmv-2.0-offline.py | 0 tests/models_vllm.yaml | 8 +++++ tests/run_vllm.py | 11 ++++++- 6 files changed, 50 insertions(+), 2 deletions(-) rename models/{vision-language-understanding/MiniCPM-V-2 => multimodal/vision-language-understanding/minicpm-v-2}/vllm/README.md (100%) create mode 100644 models/multimodal/vision-language-understanding/minicpm-v-2/vllm/ci/prepare.sh rename models/{vision-language-understanding/MiniCPM-V-2 => multimodal/vision-language-understanding/minicpm-v-2}/vllm/minicpmv-2.0-offline.py (100%) diff --git a/README.md b/README.md index b5132821..4ddd9511 100644 --- a/README.md +++ b/README.md @@ -1168,7 +1168,7 @@ DeepSparkInference将按季度进行版本更新,后续会逐步丰富模型 MiniCPM-V-2 - Supported + Supported - - diff --git a/models/vision-language-understanding/MiniCPM-V-2/vllm/README.md b/models/multimodal/vision-language-understanding/minicpm-v-2/vllm/README.md similarity index 100% rename from models/vision-language-understanding/MiniCPM-V-2/vllm/README.md rename to models/multimodal/vision-language-understanding/minicpm-v-2/vllm/README.md diff --git a/models/multimodal/vision-language-understanding/minicpm-v-2/vllm/ci/prepare.sh b/models/multimodal/vision-language-understanding/minicpm-v-2/vllm/ci/prepare.sh new file mode 100644 index 00000000..f1c0b9c8 --- /dev/null +++ b/models/multimodal/vision-language-understanding/minicpm-v-2/vllm/ci/prepare.sh @@ -0,0 +1,31 @@ +#!/bin/bash +# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +set -x +ID=$(grep -oP '(?<=^ID=).+' /etc/os-release | tr -d '"') +if [[ ${ID} == "ubuntu" ]]; then + apt install -y libgl1-mesa-glx +elif [[ ${ID} == "centos" ]]; then + yum install -y mesa-libGL +else + echo "Not Support Os" +fi + +pip3 install timm==0.9.10 +pip3 install transformers +pip3 install --user --upgrade pillow -i https://pypi.tuna.tsinghua.edu.cn/simple + +cp /mnt/deepspark/data/datasets/dog.jpg ./ \ No newline at end of file diff --git a/models/vision-language-understanding/MiniCPM-V-2/vllm/minicpmv-2.0-offline.py b/models/multimodal/vision-language-understanding/minicpm-v-2/vllm/minicpmv-2.0-offline.py similarity index 100% rename from models/vision-language-understanding/MiniCPM-V-2/vllm/minicpmv-2.0-offline.py rename to models/multimodal/vision-language-understanding/minicpm-v-2/vllm/minicpmv-2.0-offline.py diff --git a/tests/models_vllm.yaml b/tests/models_vllm.yaml index 821733f6..548f16c8 100644 --- a/tests/models_vllm.yaml +++ b/tests/models_vllm.yaml @@ -104,3 +104,11 @@ - fp16 relative_path: models/nlp/large_language_model/stablelm/vllm task_type: nlp/large_language_model +- datasets: https://localhost + download_url: https://localhost/MiniCPM-V-2 + name: minicpm-v-2 + need_third_part: false + precisions: + - fp16 + relative_path: models/multimodal/vision-language-understanding/minicpm-v-2/vllm/ + task_type: multimodal/vision-language-understanding diff --git a/tests/run_vllm.py b/tests/run_vllm.py index 488465ba..8bd3504d 100644 --- a/tests/run_vllm.py +++ b/tests/run_vllm.py @@ -56,7 +56,7 @@ def main(): result = {} # NLP模型 - if model["task_type"] in ["nlp/large_language_model"]: + if model["task_type"] in ["nlp/large_language_model", "multimodal/vision-language-understanding"]: logging.info(f"Start running {model['name']} test case:\n{json.dumps(model, indent=4)}") d_url = model["download_url"] if d_url is not None: @@ -211,6 +211,15 @@ def run_nlp_testcase(model): export CUDA_VISIBLE_DEVICES=0,1 python3 offline_inference.py --model ./stablelm --max-tokens 256 -tp 1 --temperature 0.0 """ + elif model_name == "minicpm-v-2": + script = f""" + set -x + cd ../{model['relative_path']} + export PT_SDPA_ENABLE_HEAD_DIM_PADDING=1 + export PATH=/usr/local/corex/bin:${PATH} + export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda/lib64 + python3 minicpmv-2.0-offline.py --model-path ./minicpm-v-2 --image-path ./dog.jpg + """ r, t = run_script(script) sout = r.stdout -- Gitee From 9981f977ae3960fd0d1f54a5dc90956ae932c9a8 Mon Sep 17 00:00:00 2001 From: "hongliang.yuan" Date: Wed, 22 Jan 2025 17:00:42 +0800 Subject: [PATCH 6/6] fix minicpm --- .../minicpm-v-2/vllm/minicpmv-2.0-offline.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/models/multimodal/vision-language-understanding/minicpm-v-2/vllm/minicpmv-2.0-offline.py b/models/multimodal/vision-language-understanding/minicpm-v-2/vllm/minicpmv-2.0-offline.py index d6add4d8..1da0fdd8 100644 --- a/models/multimodal/vision-language-understanding/minicpm-v-2/vllm/minicpmv-2.0-offline.py +++ b/models/multimodal/vision-language-understanding/minicpm-v-2/vllm/minicpmv-2.0-offline.py @@ -38,9 +38,9 @@ def main(args): llm = LLM(model=MODEL_NAME, gpu_memory_utilization=0.95, # 使用全部GPU内存 trust_remote_code=True, - max_model_len=1024, - max_num_seqs=1, - max_num_batched_tokens=1024,) # 根据内存状况可调整此值 + max_model_len=2048, + # max_num_seqs=1, + max_num_batched_tokens=2048,) # 根据内存状况可调整此值 # 构建对话消息 messages = [{'role': 'user', 'content': '(./)\n' + '请描述这张图片'}] @@ -64,10 +64,10 @@ def main(args): # top_p=0.8, # 
top_k=100, # seed=3472, - max_tokens=128, + max_tokens=1024, # min_tokens=150, temperature=0, - use_beam_search=False, + # use_beam_search=False, # length_penalty=1.2, best_of=1) -- Gitee
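For orientation, the engine and sampling settings this last patch lands on correspond to a vLLM setup along the following lines. This is a bare text-only sketch: the actual minicpmv-2.0-offline.py also builds the chat-template prompt and passes the image, and the model path here is a placeholder.

```python
from vllm import LLM, SamplingParams

# Engine limits from the patch: 2048-token context and batched-token budget,
# with most of the GPU memory handed to the engine.
llm = LLM(model="./minicpm-v-2",            # placeholder path
          gpu_memory_utilization=0.95,
          trust_remote_code=True,
          max_model_len=2048,
          max_num_batched_tokens=2048)

# Greedy decoding with up to 1024 new tokens and a single candidate per prompt.
sampling = SamplingParams(temperature=0, max_tokens=1024, best_of=1)

outputs = llm.generate(["Describe this image."], sampling)
print(outputs[0].outputs[0].text)
```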