From 2846a8c570d1daf2fcd2885a98af80f4ea0d4d18 Mon Sep 17 00:00:00 2001 From: "hongliang.yuan" Date: Tue, 21 Jan 2025 17:09:03 +0800 Subject: [PATCH 1/6] init vllm ci --- .../baichuan2-7b/vllm/ci/prepare.sh | 32 ++ .../baichuan2-7b/vllm/offline_inference.py | 5 + .../chatglm3-6b-32k/vllm/ci/prepare.sh | 27 ++ .../chatglm3-6b/vllm/ci/prepare.sh | 28 ++ .../chatglm3-6b/vllm/offline_inference.py | 5 + .../llama2-13b/trtllm/README.md | 3 - .../llama2-13b/trtllm/ci/prepare.sh | 36 ++ .../llama2-70b/trtllm/README.md | 4 - .../llama2-70b/trtllm/ci/prepare.sh | 36 ++ .../llama2-7b/trtllm/ci/prepare.sh | 36 ++ .../llama2-7b/vllm/ci/prepare.sh | 30 ++ .../llama2-7b/vllm/offline_inference.py | 4 + .../llama3-70b/vllm/ci/prepare.sh | 26 ++ .../llama3-70b/vllm/offline_inference.py | 4 + .../text-generation-inference/README.md | 4 - .../text-generation-inference/ci/prepare.sh | 29 ++ .../offline_inference.py | 5 + .../qwen-7b/vllm/ci/prepare.sh | 30 ++ .../qwen-7b/vllm/offline_inference.py | 4 + .../qwen1.5-14b/vllm/ci/prepare.sh | 25 ++ .../qwen1.5-14b/vllm/offline_inference.py | 4 + .../qwen1.5-32b/vllm/ci/prepare.sh | 30 ++ .../qwen1.5-32b/vllm/offline_inference.py | 4 + .../qwen1.5-72b/vllm/ci/prepare.sh | 25 ++ .../qwen1.5-72b/vllm/offline_inference.py | 4 + .../text-generation-inference/ci/prepare.sh | 29 ++ .../offline_inference.py | 5 + .../qwen1.5-7b/vllm/ci/prepare.sh | 25 ++ .../qwen1.5-7b/vllm/offline_inference.py | 4 + .../qwen2-72b/vllm/ci/prepare.sh | 30 ++ .../qwen2-72b/vllm/offline_inference.py | 4 + .../qwen2-7b/vllm/ci/prepare.sh | 30 ++ .../qwen2-7b/vllm/offline_inference.py | 4 + .../stablelm/vllm/ci/prepare.sh | 27 ++ .../stablelm/vllm/offline_inference.py | 6 +- tests/models_trtllm.yaml | 41 +++ tests/models_vllm.yaml | 106 ++++++ tests/run_trtllm.py | 189 +++++++++++ tests/run_vllm.py | 315 ++++++++++++++++++ 39 files changed, 1243 insertions(+), 12 deletions(-) create mode 100644 models/nlp/large_language_model/baichuan2-7b/vllm/ci/prepare.sh create mode 100644 models/nlp/large_language_model/chatglm3-6b-32k/vllm/ci/prepare.sh create mode 100644 models/nlp/large_language_model/chatglm3-6b/vllm/ci/prepare.sh create mode 100644 models/nlp/large_language_model/llama2-13b/trtllm/ci/prepare.sh create mode 100644 models/nlp/large_language_model/llama2-70b/trtllm/ci/prepare.sh create mode 100644 models/nlp/large_language_model/llama2-7b/trtllm/ci/prepare.sh create mode 100644 models/nlp/large_language_model/llama2-7b/vllm/ci/prepare.sh create mode 100644 models/nlp/large_language_model/llama3-70b/vllm/ci/prepare.sh create mode 100644 models/nlp/large_language_model/qwen-7b/text-generation-inference/ci/prepare.sh create mode 100644 models/nlp/large_language_model/qwen-7b/vllm/ci/prepare.sh create mode 100644 models/nlp/large_language_model/qwen1.5-14b/vllm/ci/prepare.sh create mode 100644 models/nlp/large_language_model/qwen1.5-32b/vllm/ci/prepare.sh create mode 100644 models/nlp/large_language_model/qwen1.5-72b/vllm/ci/prepare.sh create mode 100644 models/nlp/large_language_model/qwen1.5-7b/text-generation-inference/ci/prepare.sh create mode 100644 models/nlp/large_language_model/qwen1.5-7b/vllm/ci/prepare.sh create mode 100644 models/nlp/large_language_model/qwen2-72b/vllm/ci/prepare.sh create mode 100644 models/nlp/large_language_model/qwen2-7b/vllm/ci/prepare.sh create mode 100644 models/nlp/large_language_model/stablelm/vllm/ci/prepare.sh create mode 100644 tests/models_trtllm.yaml create mode 100644 tests/models_vllm.yaml create mode 100644 tests/run_trtllm.py create mode 100644 
tests/run_vllm.py diff --git a/models/nlp/large_language_model/baichuan2-7b/vllm/ci/prepare.sh b/models/nlp/large_language_model/baichuan2-7b/vllm/ci/prepare.sh new file mode 100644 index 00000000..5b9abbd3 --- /dev/null +++ b/models/nlp/large_language_model/baichuan2-7b/vllm/ci/prepare.sh @@ -0,0 +1,32 @@ +#!/bin/bash +# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +set -x +ID=$(grep -oP '(?<=^ID=).+' /etc/os-release | tr -d '"') +if [[ ${ID} == "ubuntu" ]]; then + apt install -y libgl1-mesa-glx +elif [[ ${ID} == "centos" ]]; then + yum install -y mesa-libGL +else + echo "Not Support Os" +fi + +pip3 install transformers==4.37.1 + +# has prepared in ci +# ln -s /mnt/deepspark/data/checkpoints/Baichuan2-7B-Base ./ + +python3 convert2int8.py --model-path ./Baichuan2-7B-Base/ \ No newline at end of file diff --git a/models/nlp/large_language_model/baichuan2-7b/vllm/offline_inference.py b/models/nlp/large_language_model/baichuan2-7b/vllm/offline_inference.py index 40c0e2e1..9e5738a1 100644 --- a/models/nlp/large_language_model/baichuan2-7b/vllm/offline_inference.py +++ b/models/nlp/large_language_model/baichuan2-7b/vllm/offline_inference.py @@ -109,4 +109,9 @@ for i, output in enumerate(outputs): print(f"Prompt: {prompt}\nGenerated text: {generated_text} \n") print(f"tokens: {num_tokens}, QPS: {num_tokens/duration_time}") +metricResult = {"metricResult": {}} +metricResult["metricResult"]["tokens"] = num_tokens +metricResult["metricResult"]["QPS"] = round(num_tokens/duration_time,3) +print(metricResult) + # 0.3.2 tokens: 757, QPS: 97.97229589080902 \ No newline at end of file diff --git a/models/nlp/large_language_model/chatglm3-6b-32k/vllm/ci/prepare.sh b/models/nlp/large_language_model/chatglm3-6b-32k/vllm/ci/prepare.sh new file mode 100644 index 00000000..ad683f6e --- /dev/null +++ b/models/nlp/large_language_model/chatglm3-6b-32k/vllm/ci/prepare.sh @@ -0,0 +1,27 @@ +#!/bin/bash +# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +set -x +ID=$(grep -oP '(?<=^ID=).+' /etc/os-release | tr -d '"') +if [[ ${ID} == "ubuntu" ]]; then + apt install -y libgl1-mesa-glx +elif [[ ${ID} == "centos" ]]; then + yum install -y mesa-libGL +else + echo "Not Support Os" +fi + +pip3 install transformers==4.37.1 diff --git a/models/nlp/large_language_model/chatglm3-6b/vllm/ci/prepare.sh b/models/nlp/large_language_model/chatglm3-6b/vllm/ci/prepare.sh new file mode 100644 index 00000000..4cfd5fd6 --- /dev/null +++ b/models/nlp/large_language_model/chatglm3-6b/vllm/ci/prepare.sh @@ -0,0 +1,28 @@ +#!/bin/bash +# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +set -x +ID=$(grep -oP '(?<=^ID=).+' /etc/os-release | tr -d '"') +if [[ ${ID} == "ubuntu" ]]; then + apt install -y libgl1-mesa-glx +elif [[ ${ID} == "centos" ]]; then + yum install -y mesa-libGL +else + echo "Not Support Os" +fi + +pip3 install vllm==0.5.0 +pip3 install transformers==4.37.1 diff --git a/models/nlp/large_language_model/chatglm3-6b/vllm/offline_inference.py b/models/nlp/large_language_model/chatglm3-6b/vllm/offline_inference.py index 0162d93c..eaa7fe20 100644 --- a/models/nlp/large_language_model/chatglm3-6b/vllm/offline_inference.py +++ b/models/nlp/large_language_model/chatglm3-6b/vllm/offline_inference.py @@ -99,4 +99,9 @@ for i, output in enumerate(outputs): print(f"Prompt: {prompt}\nGenerated text: {generated_text} \n") print(f"tokens: {num_tokens}, QPS: {num_tokens/duration_time}") +metricResult = {"metricResult": {}} +metricResult["metricResult"]["tokens"] = num_tokens +metricResult["metricResult"]["QPS"] = round(num_tokens/duration_time,3) +print(metricResult) + # 0.3.2 tokens: 422, QPS: 70.02308283048338(tokens: 422, QPS: 93.67210003677407),32-k 模型 tokens: 477, QPS: 81.46537314533865(tokens: 477, QPS: 106.54247895449554) \ No newline at end of file diff --git a/models/nlp/large_language_model/llama2-13b/trtllm/README.md b/models/nlp/large_language_model/llama2-13b/trtllm/README.md index 4658334d..b24c29ce 100755 --- a/models/nlp/large_language_model/llama2-13b/trtllm/README.md +++ b/models/nlp/large_language_model/llama2-13b/trtllm/README.md @@ -18,9 +18,6 @@ apt install -y libgl1-mesa-dev bash scripts/set_environment.sh . -# *star refer to a specified version -wget http://files.deepspark.org.cn:880/deepspark/add-ons/tensorrt_llm-*.whl -pip install tensorrt_llm-*.whl ``` ### Download diff --git a/models/nlp/large_language_model/llama2-13b/trtllm/ci/prepare.sh b/models/nlp/large_language_model/llama2-13b/trtllm/ci/prepare.sh new file mode 100644 index 00000000..7947d62b --- /dev/null +++ b/models/nlp/large_language_model/llama2-13b/trtllm/ci/prepare.sh @@ -0,0 +1,36 @@ +#!/bin/bash +# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. 
You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +set -x +ID=$(grep -oP '(?<=^ID=).+' /etc/os-release | tr -d '"') +if [[ ${ID} == "ubuntu" ]]; then + apt install -y libgl1-mesa-glx +elif [[ ${ID} == "centos" ]]; then + yum install -y mesa-libGL +else + echo "Not Support Os" +fi + +bash scripts/set_environment.sh . + +# Download model from the website and make sure the model's path is "data/llama2-13b-chat" +# Download dataset from the website and make sure the dataset's path is "data/datasets_cnn_dailymail" +mkdir -p data +ln -s /mnt/deepspark/data/checkpoints/llama2-13b-chat data/llama2-13b-chat +ln -s /mnt/deepspark/data/datasets/datasets_cnn_dailymail data/datasets_cnn_dailymail +# Please download rouge.py to this path if your server can't attach huggingface.co. +mkdir -p rouge/ +cp /mnt/deepspark/data/3rd_party/rouge.py rouge/ \ No newline at end of file diff --git a/models/nlp/large_language_model/llama2-70b/trtllm/README.md b/models/nlp/large_language_model/llama2-70b/trtllm/README.md index 671b51c7..621dbffe 100644 --- a/models/nlp/large_language_model/llama2-70b/trtllm/README.md +++ b/models/nlp/large_language_model/llama2-70b/trtllm/README.md @@ -16,10 +16,6 @@ yum install -y mesa-libGL apt install -y libgl1-mesa-dev bash scripts/set_environment.sh . - -# *star refer to a specified version -wget http://files.deepspark.org.cn:880/deepspark/add-ons/tensorrt_llm-*.whl -pip install tensorrt_llm-*.whl ``` ### Download diff --git a/models/nlp/large_language_model/llama2-70b/trtllm/ci/prepare.sh b/models/nlp/large_language_model/llama2-70b/trtllm/ci/prepare.sh new file mode 100644 index 00000000..4a8c1e4e --- /dev/null +++ b/models/nlp/large_language_model/llama2-70b/trtllm/ci/prepare.sh @@ -0,0 +1,36 @@ +#!/bin/bash +# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +set -x +ID=$(grep -oP '(?<=^ID=).+' /etc/os-release | tr -d '"') +if [[ ${ID} == "ubuntu" ]]; then + apt install -y libgl1-mesa-glx +elif [[ ${ID} == "centos" ]]; then + yum install -y mesa-libGL +else + echo "Not Support Os" +fi + +bash scripts/set_environment.sh . + +# Download model from the website and make sure the model's path is "data/llama2-70b-chat" +# Download dataset from the website and make sure the dataset's path is "data/datasets_cnn_dailymail" +mkdir -p data +ln -s /mnt/deepspark/data/checkpoints/llama2-70b-chat data/llama2-70b-chat +ln -s /mnt/deepspark/data/datasets/datasets_cnn_dailymail data/datasets_cnn_dailymail +# Please download rouge.py to this path if your server can't attach huggingface.co. 
+mkdir -p rouge/ +cp /mnt/deepspark/data/3rd_party/rouge.py rouge/ \ No newline at end of file diff --git a/models/nlp/large_language_model/llama2-7b/trtllm/ci/prepare.sh b/models/nlp/large_language_model/llama2-7b/trtllm/ci/prepare.sh new file mode 100644 index 00000000..2ac2384d --- /dev/null +++ b/models/nlp/large_language_model/llama2-7b/trtllm/ci/prepare.sh @@ -0,0 +1,36 @@ +#!/bin/bash +# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +set -x +ID=$(grep -oP '(?<=^ID=).+' /etc/os-release | tr -d '"') +if [[ ${ID} == "ubuntu" ]]; then + apt install -y libgl1-mesa-glx +elif [[ ${ID} == "centos" ]]; then + yum install -y mesa-libGL +else + echo "Not Support Os" +fi + +bash scripts/set_environment.sh . + +# Download model from the website and make sure the model's path is "data/llama2-7b-chat" +# Download dataset from the website and make sure the dataset's path is "data/datasets_cnn_dailymail" +mkdir -p data +ln -s /mnt/deepspark/data/checkpoints/llama2-7b-chat data/llama2-7b-chat +ln -s /mnt/deepspark/data/datasets/datasets_cnn_dailymail data/datasets_cnn_dailymail +# Please download rouge.py to this path if your server can't attach huggingface.co. +mkdir -p rouge/ +cp /mnt/deepspark/data/3rd_party/rouge.py rouge/ \ No newline at end of file diff --git a/models/nlp/large_language_model/llama2-7b/vllm/ci/prepare.sh b/models/nlp/large_language_model/llama2-7b/vllm/ci/prepare.sh new file mode 100644 index 00000000..6afe9667 --- /dev/null +++ b/models/nlp/large_language_model/llama2-7b/vllm/ci/prepare.sh @@ -0,0 +1,30 @@ +#!/bin/bash +# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +set -x +ID=$(grep -oP '(?<=^ID=).+' /etc/os-release | tr -d '"') +if [[ ${ID} == "ubuntu" ]]; then + apt install -y libgl1-mesa-glx +elif [[ ${ID} == "centos" ]]; then + yum install -y mesa-libGL +else + echo "Not Support Os" +fi + +# Contact the iluvatar manager to get adapted install packages of vllm, triton, and ixformer +pip3 install vllm +pip3 install triton +pip3 install ixformer diff --git a/models/nlp/large_language_model/llama2-7b/vllm/offline_inference.py b/models/nlp/large_language_model/llama2-7b/vllm/offline_inference.py index 9c0b6d2f..538d3541 100644 --- a/models/nlp/large_language_model/llama2-7b/vllm/offline_inference.py +++ b/models/nlp/large_language_model/llama2-7b/vllm/offline_inference.py @@ -129,3 +129,7 @@ if __name__ == "__main__": num_tokens += len(output.outputs[0].token_ids) print(f"Prompt: {prompt}\nGenerated text: {generated_text} \n") print(f"tokens: {num_tokens}, QPS: {num_tokens/duration_time}") + metricResult = {"metricResult": {}} + metricResult["metricResult"]["tokens"] = num_tokens + metricResult["metricResult"]["QPS"] = round(num_tokens/duration_time,3) + print(metricResult) diff --git a/models/nlp/large_language_model/llama3-70b/vllm/ci/prepare.sh b/models/nlp/large_language_model/llama3-70b/vllm/ci/prepare.sh new file mode 100644 index 00000000..000245a8 --- /dev/null +++ b/models/nlp/large_language_model/llama3-70b/vllm/ci/prepare.sh @@ -0,0 +1,26 @@ +#!/bin/bash +# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +set -x +ID=$(grep -oP '(?<=^ID=).+' /etc/os-release | tr -d '"') +if [[ ${ID} == "ubuntu" ]]; then + apt install -y libgl1-mesa-glx +elif [[ ${ID} == "centos" ]]; then + yum install -y mesa-libGL +else + echo "Not Support Os" +fi + diff --git a/models/nlp/large_language_model/llama3-70b/vllm/offline_inference.py b/models/nlp/large_language_model/llama3-70b/vllm/offline_inference.py index cdf635a7..6932fde6 100644 --- a/models/nlp/large_language_model/llama3-70b/vllm/offline_inference.py +++ b/models/nlp/large_language_model/llama3-70b/vllm/offline_inference.py @@ -151,6 +151,10 @@ if args.acc_test: print('val ROUGE-1 score f1: {}, target ROUGE-1 score f1: {}, fail'.format(scores[0]["rouge-1"]['f'],args.acc_threshold)) exit(1) print('val ROUGE-1 score f1: {}, target ROUGE-1 score f1: {}, pass'.format(scores[0]["rouge-1"]['f'],args.acc_threshold)) + metricResult = {"metricResult": {}} + metricResult["metricResult"]["val ROUGE-1 score f1"] = scores[0]["rouge-1"]['f'] + metricResult["metricResult"]["target ROUGE-1 score f1"] = args.acc_threshold + print(metricResult) # 2 7b vllm 0.1.6: batch 3, tokens: 773, QPS: 64.35866137433203; batch 1, tokens: 257, QPS: 25.396898421442113 # 1\2 13b vllm 0.1.6: batch 3, tokens: 768, QPS: 41.538942353799506; batch 1, tokens: 257, QPS: 15.639606595029639 (2, 6.5829828847570795; 8, 5.137610167755676) diff --git a/models/nlp/large_language_model/qwen-7b/text-generation-inference/README.md b/models/nlp/large_language_model/qwen-7b/text-generation-inference/README.md index 33b0aab2..729b9833 100644 --- a/models/nlp/large_language_model/qwen-7b/text-generation-inference/README.md +++ b/models/nlp/large_language_model/qwen-7b/text-generation-inference/README.md @@ -17,10 +17,6 @@ yum install -y mesa-libGL ## Ubuntu apt install -y libgl1-mesa-dev -# *star refer to a specified version -wget http://files.deepspark.org.cn:880/deepspark/add-ons/text-generation-*.whl -wget http://files.deepspark.org.cn:880/deepspark/add-ons/text-generation-server-*.whl -pip install tensorrt_llm-*.whl text-generation-server-*.whl ``` ### Download diff --git a/models/nlp/large_language_model/qwen-7b/text-generation-inference/ci/prepare.sh b/models/nlp/large_language_model/qwen-7b/text-generation-inference/ci/prepare.sh new file mode 100644 index 00000000..4b2fdf8b --- /dev/null +++ b/models/nlp/large_language_model/qwen-7b/text-generation-inference/ci/prepare.sh @@ -0,0 +1,29 @@ +#!/bin/bash +# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
+ +set -x +ID=$(grep -oP '(?<=^ID=).+' /etc/os-release | tr -d '"') +if [[ ${ID} == "ubuntu" ]]; then + apt install -y libgl1-mesa-glx +elif [[ ${ID} == "centos" ]]; then + yum install -y mesa-libGL +else + echo "Not Support Os" +fi + +mkdir -p data + +ln -s /mnt/deepspark/data/checkpoints/qwen-7B data/qwen-7B \ No newline at end of file diff --git a/models/nlp/large_language_model/qwen-7b/text-generation-inference/offline_inference.py b/models/nlp/large_language_model/qwen-7b/text-generation-inference/offline_inference.py index 57db6334..e3ebcc3a 100644 --- a/models/nlp/large_language_model/qwen-7b/text-generation-inference/offline_inference.py +++ b/models/nlp/large_language_model/qwen-7b/text-generation-inference/offline_inference.py @@ -109,6 +109,11 @@ if __name__ == "__main__": duration_time = end_time - start_time print(f"generate length: {generations_one[0].generated_text.generated_tokens}") print(f"one batch: {generations_one[0].generated_text.text}\nqps: {generations_one[0].generated_text.generated_tokens /duration_time}") + metricResult = {"metricResult": {}} + metricResult["metricResult"]["generate length"] = generations_one[0].generated_text.generated_tokens + metricResult["metricResult"]["one batch"] = generations_one[0].generated_text.text + metricResult["metricResult"]["qps"] = generations_one[0].generated_text.generated_tokens /duration_time + print(metricResult) """ qwen-7B diff --git a/models/nlp/large_language_model/qwen-7b/vllm/ci/prepare.sh b/models/nlp/large_language_model/qwen-7b/vllm/ci/prepare.sh new file mode 100644 index 00000000..cfd5031a --- /dev/null +++ b/models/nlp/large_language_model/qwen-7b/vllm/ci/prepare.sh @@ -0,0 +1,30 @@ +#!/bin/bash +# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +set -x +ID=$(grep -oP '(?<=^ID=).+' /etc/os-release | tr -d '"') +if [[ ${ID} == "ubuntu" ]]; then + apt install -y libgl1-mesa-glx +elif [[ ${ID} == "centos" ]]; then + yum install -y mesa-libGL +else + echo "Not Support Os" +fi + +# Contact the iluvatar manager to get adapted install packages of vllm, triton, and ixformer +pip3 install vllm +pip3 install triton +pip3 install ixformer \ No newline at end of file diff --git a/models/nlp/large_language_model/qwen-7b/vllm/offline_inference.py b/models/nlp/large_language_model/qwen-7b/vllm/offline_inference.py index 3b9e9fd8..5de14fb0 100644 --- a/models/nlp/large_language_model/qwen-7b/vllm/offline_inference.py +++ b/models/nlp/large_language_model/qwen-7b/vllm/offline_inference.py @@ -130,3 +130,7 @@ if __name__ == "__main__": num_tokens += len(output.outputs[0].token_ids) print(f"Prompt: {prompt}\nGenerated text: {generated_text} \n") print(f"tokens: {num_tokens}, QPS: {num_tokens/duration_time}") + metricResult = {"metricResult": {}} + metricResult["metricResult"]["tokens"] = num_tokens + metricResult["metricResult"]["QPS"] = round(num_tokens/duration_time,3) + print(metricResult) diff --git a/models/nlp/large_language_model/qwen1.5-14b/vllm/ci/prepare.sh b/models/nlp/large_language_model/qwen1.5-14b/vllm/ci/prepare.sh new file mode 100644 index 00000000..75fb1945 --- /dev/null +++ b/models/nlp/large_language_model/qwen1.5-14b/vllm/ci/prepare.sh @@ -0,0 +1,25 @@ +#!/bin/bash +# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +set -x +ID=$(grep -oP '(?<=^ID=).+' /etc/os-release | tr -d '"') +if [[ ${ID} == "ubuntu" ]]; then + apt install -y libgl1-mesa-glx +elif [[ ${ID} == "centos" ]]; then + yum install -y mesa-libGL +else + echo "Not Support Os" +fi diff --git a/models/nlp/large_language_model/qwen1.5-14b/vllm/offline_inference.py b/models/nlp/large_language_model/qwen1.5-14b/vllm/offline_inference.py index 7ee127a2..130f0885 100644 --- a/models/nlp/large_language_model/qwen1.5-14b/vllm/offline_inference.py +++ b/models/nlp/large_language_model/qwen1.5-14b/vllm/offline_inference.py @@ -108,3 +108,7 @@ for i, output in enumerate(outputs): num_tokens += len(output.outputs[0].token_ids) print(f"Prompt: {prompt}\nGenerated text: {generated_text} \n") print(f"tokens: {num_tokens}, QPS: {num_tokens/duration_time}") +metricResult = {"metricResult": {}} +metricResult["metricResult"]["tokens"] = num_tokens +metricResult["metricResult"]["QPS"] = round(num_tokens/duration_time,3) +print(metricResult) \ No newline at end of file diff --git a/models/nlp/large_language_model/qwen1.5-32b/vllm/ci/prepare.sh b/models/nlp/large_language_model/qwen1.5-32b/vllm/ci/prepare.sh new file mode 100644 index 00000000..cfd5031a --- /dev/null +++ b/models/nlp/large_language_model/qwen1.5-32b/vllm/ci/prepare.sh @@ -0,0 +1,30 @@ +#!/bin/bash +# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +set -x +ID=$(grep -oP '(?<=^ID=).+' /etc/os-release | tr -d '"') +if [[ ${ID} == "ubuntu" ]]; then + apt install -y libgl1-mesa-glx +elif [[ ${ID} == "centos" ]]; then + yum install -y mesa-libGL +else + echo "Not Support Os" +fi + +# Contact the iluvatar manager to get adapted install packages of vllm, triton, and ixformer +pip3 install vllm +pip3 install triton +pip3 install ixformer \ No newline at end of file diff --git a/models/nlp/large_language_model/qwen1.5-32b/vllm/offline_inference.py b/models/nlp/large_language_model/qwen1.5-32b/vllm/offline_inference.py index 5e859291..9799150f 100644 --- a/models/nlp/large_language_model/qwen1.5-32b/vllm/offline_inference.py +++ b/models/nlp/large_language_model/qwen1.5-32b/vllm/offline_inference.py @@ -129,3 +129,7 @@ if __name__ == "__main__": num_tokens += len(output.outputs[0].token_ids) print(f"Prompt: {prompt}\nGenerated text: {generated_text} \n") print(f"tokens: {num_tokens}, QPS: {num_tokens/duration_time}") + metricResult = {"metricResult": {}} + metricResult["metricResult"]["tokens"] = num_tokens + metricResult["metricResult"]["QPS"] = round(num_tokens/duration_time,3) + print(metricResult) \ No newline at end of file diff --git a/models/nlp/large_language_model/qwen1.5-72b/vllm/ci/prepare.sh b/models/nlp/large_language_model/qwen1.5-72b/vllm/ci/prepare.sh new file mode 100644 index 00000000..75fb1945 --- /dev/null +++ b/models/nlp/large_language_model/qwen1.5-72b/vllm/ci/prepare.sh @@ -0,0 +1,25 @@ +#!/bin/bash +# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +set -x +ID=$(grep -oP '(?<=^ID=).+' /etc/os-release | tr -d '"') +if [[ ${ID} == "ubuntu" ]]; then + apt install -y libgl1-mesa-glx +elif [[ ${ID} == "centos" ]]; then + yum install -y mesa-libGL +else + echo "Not Support Os" +fi diff --git a/models/nlp/large_language_model/qwen1.5-72b/vllm/offline_inference.py b/models/nlp/large_language_model/qwen1.5-72b/vllm/offline_inference.py index 7ee127a2..130f0885 100644 --- a/models/nlp/large_language_model/qwen1.5-72b/vllm/offline_inference.py +++ b/models/nlp/large_language_model/qwen1.5-72b/vllm/offline_inference.py @@ -108,3 +108,7 @@ for i, output in enumerate(outputs): num_tokens += len(output.outputs[0].token_ids) print(f"Prompt: {prompt}\nGenerated text: {generated_text} \n") print(f"tokens: {num_tokens}, QPS: {num_tokens/duration_time}") +metricResult = {"metricResult": {}} +metricResult["metricResult"]["tokens"] = num_tokens +metricResult["metricResult"]["QPS"] = round(num_tokens/duration_time,3) +print(metricResult) \ No newline at end of file diff --git a/models/nlp/large_language_model/qwen1.5-7b/text-generation-inference/ci/prepare.sh b/models/nlp/large_language_model/qwen1.5-7b/text-generation-inference/ci/prepare.sh new file mode 100644 index 00000000..c801677c --- /dev/null +++ b/models/nlp/large_language_model/qwen1.5-7b/text-generation-inference/ci/prepare.sh @@ -0,0 +1,29 @@ +#!/bin/bash +# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +set -x +ID=$(grep -oP '(?<=^ID=).+' /etc/os-release | tr -d '"') +if [[ ${ID} == "ubuntu" ]]; then + apt install -y libgl1-mesa-glx +elif [[ ${ID} == "centos" ]]; then + yum install -y mesa-libGL +else + echo "Not Support Os" +fi + +mkdir -p data + +ln -s /mnt/deepspark/data/checkpoints/Qwen1.5-7B data/Qwen1.5-7B \ No newline at end of file diff --git a/models/nlp/large_language_model/qwen1.5-7b/text-generation-inference/offline_inference.py b/models/nlp/large_language_model/qwen1.5-7b/text-generation-inference/offline_inference.py index b927973a..87f4df98 100644 --- a/models/nlp/large_language_model/qwen1.5-7b/text-generation-inference/offline_inference.py +++ b/models/nlp/large_language_model/qwen1.5-7b/text-generation-inference/offline_inference.py @@ -115,6 +115,11 @@ if __name__ == "__main__": duration_time = end_time - start_time print(f"generate length: {generations_one[0].generated_text.generated_tokens}") print(f"one batch: {generations_one[0].generated_text.text}\nqps: {generations_one[0].generated_text.generated_tokens /duration_time}") + metricResult = {"metricResult": {}} + metricResult["metricResult"]["generate length"] = generations_one[0].generated_text.generated_tokens + metricResult["metricResult"]["one batch"] = generations_one[0].generated_text.text + metricResult["metricResult"]["qps"] = generations_one[0].generated_text.generated_tokens /duration_time + print(metricResult) """ qwen1.5-0.5B diff --git a/models/nlp/large_language_model/qwen1.5-7b/vllm/ci/prepare.sh b/models/nlp/large_language_model/qwen1.5-7b/vllm/ci/prepare.sh new file mode 100644 index 00000000..75fb1945 --- /dev/null +++ b/models/nlp/large_language_model/qwen1.5-7b/vllm/ci/prepare.sh @@ -0,0 +1,25 @@ +#!/bin/bash +# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +set -x +ID=$(grep -oP '(?<=^ID=).+' /etc/os-release | tr -d '"') +if [[ ${ID} == "ubuntu" ]]; then + apt install -y libgl1-mesa-glx +elif [[ ${ID} == "centos" ]]; then + yum install -y mesa-libGL +else + echo "Not Support Os" +fi diff --git a/models/nlp/large_language_model/qwen1.5-7b/vllm/offline_inference.py b/models/nlp/large_language_model/qwen1.5-7b/vllm/offline_inference.py index 7ee127a2..bae01307 100644 --- a/models/nlp/large_language_model/qwen1.5-7b/vllm/offline_inference.py +++ b/models/nlp/large_language_model/qwen1.5-7b/vllm/offline_inference.py @@ -108,3 +108,7 @@ for i, output in enumerate(outputs): num_tokens += len(output.outputs[0].token_ids) print(f"Prompt: {prompt}\nGenerated text: {generated_text} \n") print(f"tokens: {num_tokens}, QPS: {num_tokens/duration_time}") +metricResult = {"metricResult": {}} +metricResult["metricResult"]["tokens"] = num_tokens +metricResult["metricResult"]["QPS"] = round(num_tokens/duration_time,3) +print(metricResult) diff --git a/models/nlp/large_language_model/qwen2-72b/vllm/ci/prepare.sh b/models/nlp/large_language_model/qwen2-72b/vllm/ci/prepare.sh new file mode 100644 index 00000000..cfd5031a --- /dev/null +++ b/models/nlp/large_language_model/qwen2-72b/vllm/ci/prepare.sh @@ -0,0 +1,30 @@ +#!/bin/bash +# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +set -x +ID=$(grep -oP '(?<=^ID=).+' /etc/os-release | tr -d '"') +if [[ ${ID} == "ubuntu" ]]; then + apt install -y libgl1-mesa-glx +elif [[ ${ID} == "centos" ]]; then + yum install -y mesa-libGL +else + echo "Not Support Os" +fi + +# Contact the iluvatar manager to get adapted install packages of vllm, triton, and ixformer +pip3 install vllm +pip3 install triton +pip3 install ixformer \ No newline at end of file diff --git a/models/nlp/large_language_model/qwen2-72b/vllm/offline_inference.py b/models/nlp/large_language_model/qwen2-72b/vllm/offline_inference.py index 5e859291..9799150f 100644 --- a/models/nlp/large_language_model/qwen2-72b/vllm/offline_inference.py +++ b/models/nlp/large_language_model/qwen2-72b/vllm/offline_inference.py @@ -129,3 +129,7 @@ if __name__ == "__main__": num_tokens += len(output.outputs[0].token_ids) print(f"Prompt: {prompt}\nGenerated text: {generated_text} \n") print(f"tokens: {num_tokens}, QPS: {num_tokens/duration_time}") + metricResult = {"metricResult": {}} + metricResult["metricResult"]["tokens"] = num_tokens + metricResult["metricResult"]["QPS"] = round(num_tokens/duration_time,3) + print(metricResult) \ No newline at end of file diff --git a/models/nlp/large_language_model/qwen2-7b/vllm/ci/prepare.sh b/models/nlp/large_language_model/qwen2-7b/vllm/ci/prepare.sh new file mode 100644 index 00000000..cfd5031a --- /dev/null +++ b/models/nlp/large_language_model/qwen2-7b/vllm/ci/prepare.sh @@ -0,0 +1,30 @@ +#!/bin/bash +# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +set -x +ID=$(grep -oP '(?<=^ID=).+' /etc/os-release | tr -d '"') +if [[ ${ID} == "ubuntu" ]]; then + apt install -y libgl1-mesa-glx +elif [[ ${ID} == "centos" ]]; then + yum install -y mesa-libGL +else + echo "Not Support Os" +fi + +# Contact the iluvatar manager to get adapted install packages of vllm, triton, and ixformer +pip3 install vllm +pip3 install triton +pip3 install ixformer \ No newline at end of file diff --git a/models/nlp/large_language_model/qwen2-7b/vllm/offline_inference.py b/models/nlp/large_language_model/qwen2-7b/vllm/offline_inference.py index 5e859291..9799150f 100644 --- a/models/nlp/large_language_model/qwen2-7b/vllm/offline_inference.py +++ b/models/nlp/large_language_model/qwen2-7b/vllm/offline_inference.py @@ -129,3 +129,7 @@ if __name__ == "__main__": num_tokens += len(output.outputs[0].token_ids) print(f"Prompt: {prompt}\nGenerated text: {generated_text} \n") print(f"tokens: {num_tokens}, QPS: {num_tokens/duration_time}") + metricResult = {"metricResult": {}} + metricResult["metricResult"]["tokens"] = num_tokens + metricResult["metricResult"]["QPS"] = round(num_tokens/duration_time,3) + print(metricResult) \ No newline at end of file diff --git a/models/nlp/large_language_model/stablelm/vllm/ci/prepare.sh b/models/nlp/large_language_model/stablelm/vllm/ci/prepare.sh new file mode 100644 index 00000000..6d814558 --- /dev/null +++ b/models/nlp/large_language_model/stablelm/vllm/ci/prepare.sh @@ -0,0 +1,27 @@ +#!/bin/bash +# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +set -x +ID=$(grep -oP '(?<=^ID=).+' /etc/os-release | tr -d '"') +if [[ ${ID} == "ubuntu" ]]; then + apt install -y libgl1-mesa-glx +elif [[ ${ID} == "centos" ]]; then + yum install -y mesa-libGL +else + echo "Not Support Os" +fi + +pip3 install transformers \ No newline at end of file diff --git a/models/nlp/large_language_model/stablelm/vllm/offline_inference.py b/models/nlp/large_language_model/stablelm/vllm/offline_inference.py index 40678a62..e9f2abfb 100644 --- a/models/nlp/large_language_model/stablelm/vllm/offline_inference.py +++ b/models/nlp/large_language_model/stablelm/vllm/offline_inference.py @@ -132,4 +132,8 @@ if __name__ == "__main__": num_tokens += len(output.outputs[0].token_ids) print(f"Prompt: {prompt}\nGenerated text: {generated_text} \n") - print(f"tokens: {num_tokens}, QPS: {num_tokens/duration_time}") \ No newline at end of file + print(f"tokens: {num_tokens}, QPS: {num_tokens/duration_time}") + metricResult = {"metricResult": {}} + metricResult["metricResult"]["tokens"] = num_tokens + metricResult["metricResult"]["QPS"] = round(num_tokens/duration_time,3) + print(metricResult) \ No newline at end of file diff --git a/tests/models_trtllm.yaml b/tests/models_trtllm.yaml new file mode 100644 index 00000000..de21908e --- /dev/null +++ b/tests/models_trtllm.yaml @@ -0,0 +1,41 @@ +--- +- datasets: https://localhost + download_url: https://huggingface.co/meta-llama/llama2-7b-chat + name: llama2-7b + need_third_part: false + precisions: + - fp16 + relative_path: models/nlp/large_language_model/llama2-7b/trtllm + task_type: nlp/large_language_model +- datasets: https://localhost + download_url: https://localhost/llama2-13b-chat + name: llama2-13b + need_third_part: false + precisions: + - fp16 + relative_path: models/nlp/large_language_model/llama2-13b/trtllm + task_type: nlp/large_language_model +- datasets: https://localhost + download_url: https://localhost/llama2-70b-chat + name: llama2-70b + need_third_part: false + precisions: + - fp16 + relative_path: models/nlp/large_language_model/llama2-70b/trtllm + task_type: nlp/large_language_model +- datasets: https://localhost + download_url: https://localhost/qwen-7B + name: qwen-7b + need_third_part: false + precisions: + - fp16 + relative_path: models/nlp/large_language_model/qwen-7b/text-generation-inference + task_type: nlp/large_language_model +- datasets: https://localhost + download_url: https://modelscope.cn/models/qwen/Qwen1.5-7B + name: qwen1.5-7b + need_third_part: false + precisions: + - fp16 + relative_path: models/nlp/large_language_model/qwen1.5-7b/text-generation-inference + task_type: nlp/large_language_model diff --git a/tests/models_vllm.yaml b/tests/models_vllm.yaml new file mode 100644 index 00000000..6d7177ea --- /dev/null +++ b/tests/models_vllm.yaml @@ -0,0 +1,106 @@ +--- +- datasets: https://localhost + download_url: https://huggingface.co/baichuan-inc/Baichuan2-7B-Base + name: baichuan2-7b + need_third_part: false + precisions: + - fp16 + - int8 + relative_path: models/nlp/large_language_model/baichuan2-7b/vllm + task_type: nlp/large_language_model +- datasets: https://localhost + download_url: https://huggingface.co/THUDM/chatglm3-6b + name: chatglm3-6b + need_third_part: false + precisions: + - fp16 + relative_path: models/nlp/large_language_model/chatglm3-6b/vllm + task_type: nlp/large_language_model +- datasets: https://localhost + download_url: https://www.modelscope.cn/models/ZhipuAI/chatglm3-6b-32k-无 + name: chatglm3-6b-32k + need_third_part: false + precisions: + - fp16 + relative_path: 
models/nlp/large_language_model/chatglm3-6b-32k/vllm + task_type: nlp/large_language_model +- datasets: https://localhost + download_url: https://localhost/llama2-7b + name: llama2-7b + need_third_part: false + precisions: + - fp16 + relative_path: models/nlp/large_language_model/llama2-7b/vllm + task_type: nlp/large_language_model +- datasets: https://localhost + download_url: https://localhost/Meta-Llama-3-70B-Instruct + name: llama3-70b + need_third_part: false + precisions: + - fp16 + relative_path: models/nlp/large_language_model/llama3-70b/vllm + task_type: nlp/large_language_model +- datasets: https://localhost + download_url: https://localhost/qwen-7B + name: qwen-7b + need_third_part: false + precisions: + - fp16 + relative_path: models/nlp/large_language_model/qwen-7b/vllm + task_type: nlp/large_language_model +- datasets: https://localhost + download_url: https://modelscope.cn/models/qwen/Qwen1.5-7B + name: qwen1.5-7b + need_third_part: false + precisions: + - fp16 + relative_path: models/nlp/large_language_model/qwen1.5-7b/vllm + task_type: nlp/large_language_model +- datasets: https://localhost + download_url: https://modelscope.cn/models/qwen/Qwen1.5-14B + name: qwen1.5-14b + need_third_part: false + precisions: + - fp16 + relative_path: models/nlp/large_language_model/qwen1.5-14b/vllm + task_type: nlp/large_language_model +- datasets: https://localhost + download_url: https://modelscope.cn/models/Qwen/Qwen1.5-32B-Chat + name: qwen1.5-32b + need_third_part: false + precisions: + - fp16 + relative_path: models/nlp/large_language_model/qwen1.5-32b/vllm + task_type: nlp/large_language_model +- datasets: https://localhost + download_url: https://modelscope.cn/models/qwen/Qwen1.5-72B + name: qwen1.5-72b + need_third_part: false + precisions: + - fp16 + relative_path: models/nlp/large_language_model/qwen1.5-72b/vllm + task_type: nlp/large_language_model +- datasets: https://localhost + download_url: https://modelscope.cn/models/Qwen/Qwen2-7B-Instruct + name: qwen2-7b + need_third_part: false + precisions: + - fp16 + relative_path: models/nlp/large_language_model/qwen2-7b/vllm + task_type: nlp/large_language_model +- datasets: https://localhost + download_url: https://localhost/Qwen2-72B + name: qwen2-72b + need_third_part: false + precisions: + - fp16 + relative_path: models/nlp/large_language_model/qwen2-72b/vllm + task_type: nlp/large_language_model +- datasets: https://localhost + download_url: https://huggingface.co/stabilityai/stablelm-2-1_6b + name: stablelm + need_third_part: false + precisions: + - fp16 + relative_path: models/nlp/large_language_model/stablelm/vllm + task_type: nlp/large_language_model diff --git a/tests/run_trtllm.py b/tests/run_trtllm.py new file mode 100644 index 00000000..9f8a494f --- /dev/null +++ b/tests/run_trtllm.py @@ -0,0 +1,189 @@ +# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
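+
+# run_trtllm.py drives one TensorRT-LLM / TGI CI case end to end: it looks up
+# the requested model in the YAML model list, runs the model's ci/prepare.sh,
+# then the per-model build/run scripts (or offline_inference.py for the TGI
+# cases), and collects the printed {'metricResult': ...} dicts from stdout to
+# fill the PASS/FAIL report.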
+ +import yaml +import subprocess +import json +import re +import time +import logging +import os +import sys +import argparse + +import utils + +# Configure logging +debug_level = logging.DEBUG if utils.is_debug() else logging.INFO +logging.basicConfig( + handlers=[logging.FileHandler("output.log"), logging.StreamHandler()], + level=debug_level, + format="%(asctime)s - %(levelname)s - %(message)s", +) + +METRIC_PATTERN = r"{'metricResult':.*}" + +def main(): + parser = argparse.ArgumentParser(description="") + parser.add_argument("--model", type=str, help="model name, e.g.: llama2-7b") + args = parser.parse_args() + + if args.model: + test_model = args.model + else: + test_model = os.environ.get("TEST_CASE") + logging.info(f"Test case to run: {test_model}") + if not test_model: + logging.error("test model case is empty") + sys.exit(-1) + + model = get_model_config(test_model) + if not model: + logging.error("model config is empty") + sys.exit(-1) + + result = {} + # NLP models + if model["task_type"] in ["nlp/large_language_model"]: + logging.info(f"Start running {model['name']} test case:\n{json.dumps(model, indent=4)}") + d_url = model["download_url"] + if d_url is not None: + result = run_nlp_testcase(model) + check_model_result(result) + logging.debug(f"The result of {model['name']} is\n{json.dumps(result, indent=4)}") + logging.info(f"End running {model['name']} test case.") + + logging.info(f"Full text result: {result}") + +def get_model_config(mode_name): + with open("models_igie.yaml", "r") as file: + models = yaml.safe_load(file) + + for model in models: + if model["name"] == mode_name.lower(): + return model + return + +def check_model_result(result): + status = "PASS" + for prec in ["fp16", "int8"]: + if prec in result["result"]: + if result["result"][prec]["status"] == "FAIL": + status = "FAIL" + break + result["status"] = status + +def run_nlp_testcase(model): + model_name = model["name"] + result = { + "name": model_name, + "result": {}, + } + d_url = model["download_url"] + checkpoint_n = d_url.split("/")[-1] + dataset_n = model["datasets"].split("/")[-1] + prepare_script = f""" + set -x + cd ../{model['relative_path']} + bash ci/prepare.sh + """ + + # add pip list info when in debug mode + if utils.is_debug(): + pip_list_script = "pip list | grep -E 'numpy|transformer|igie|mmcv|onnx'\n" + prepare_script = pip_list_script + prepare_script + pip_list_script + + run_script(prepare_script) + + for prec in model["precisions"]: + logging.info(f"Start running {model_name} {prec} test case") + script = f""" + set -x + cd ../{model['relative_path']} + """ + if model_name == "llama2-7b": + script = f""" + set -x + cd ../{model['relative_path']} + bash scripts/test_trtllm_llama2_7b_gpu1_build.sh + bash scripts/test_trtllm_llama2_7b_gpu1.sh + """ + elif model_name == "llama2-13b": + script = f""" + set -x + cd ../{model['relative_path']} + export CUDA_VISIBLE_DEVICES=0,1 + bash scripts/test_trtllm_llama2_13b_gpu2_build.sh + bash scripts/test_trtllm_llama2_13b_gpu2.sh + """ + elif model_name == "llama2-70b": + script = f""" + set -x + cd ../{model['relative_path']} + export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 + bash scripts/test_trtllm_llama2_70b_gpu8_build.sh + bash scripts/test_trtllm_llama2_70b_gpu8.sh + """ + elif model_name == "qwen-7b": + script = f""" + set -x + cd ../{model['relative_path']} + export CUDA_VISIBLE_DEVICES=1 + python3 offline_inference.py --model2path ./data/qwen-7B + """ + elif model_name == "qwen1.5-7b": + script = f""" + set -x + cd ../{model['relative_path']} + export
CUDA_VISIBLE_DEVICES=1 + python3 offline_inference.py --model2path ./data/Qwen1.5-7B + """ + + r, t = run_script(script) + sout = r.stdout + + pattern = METRIC_PATTERN + matchs = re.findall(pattern, sout) + result["result"].setdefault(prec, {"status": "FAIL"}) + logging.debug(f"matchs:\n{matchs}") + for m in matchs: + result["result"][prec].update(get_metric_result(m)) + if len(matchs) == 2: + result["result"][prec]["status"] = "PASS" + + result["result"][prec]["Cost time (s)"] = t + return result + +def get_metric_result(str): + if str: + return json.loads(str.replace("'", "\""))["metricResult"] + return None + +def run_script(script): + start_time = time.perf_counter() + result = subprocess.run( + script, shell=True, capture_output=True, text=True, executable="/bin/bash" + ) + end_time = time.perf_counter() + execution_time = end_time - start_time + logging.debug(f"Command executed:\n{script}") + logging.debug("Execution time: {:.4f} s".format(execution_time)) + logging.debug(f"stdout: {result.stdout}") + logging.debug(f"stderr: {result.stderr}") + logging.debug(f"Return code: {result.returncode}") + return result, execution_time + +if __name__ == "__main__": + main() diff --git a/tests/run_vllm.py b/tests/run_vllm.py new file mode 100644 index 00000000..6d212c95 --- /dev/null +++ b/tests/run_vllm.py @@ -0,0 +1,315 @@ +# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
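+
+# run_vllm.py drives one vLLM CI case end to end: it looks up the requested
+# model in the YAML model list, runs the model's ci/prepare.sh, launches
+# offline_inference.py with per-model arguments, and collects the printed
+# {'metricResult': ...} dicts from stdout to fill the PASS/FAIL report.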
+ +import yaml +import subprocess +import json +import re +import time +import logging +import os +import sys +import argparse + +import utils + +# Configure logging +debug_level = logging.DEBUG if utils.is_debug() else logging.INFO +logging.basicConfig( + handlers=[logging.FileHandler("output.log"), logging.StreamHandler()], + level=debug_level, + format="%(asctime)s - %(levelname)s - %(message)s", +) + +METRIC_PATTERN = r"{'metricResult':.*}" + +def main(): + parser = argparse.ArgumentParser(description="") + parser.add_argument("--model", type=str, help="model name, e.g.: llama2-7b") + args = parser.parse_args() + + if args.model: + test_model = args.model + else: + test_model = os.environ.get("TEST_CASE") + logging.info(f"Test case to run: {test_model}") + if not test_model: + logging.error("test model case is empty") + sys.exit(-1) + + model = get_model_config(test_model) + if not model: + logging.error("model config is empty") + sys.exit(-1) + + result = {} + # NLP models + if model["task_type"] in ["nlp/large_language_model"]: + logging.info(f"Start running {model['name']} test case:\n{json.dumps(model, indent=4)}") + d_url = model["download_url"] + if d_url is not None: + result = run_nlp_testcase(model) + check_model_result(result) + logging.debug(f"The result of {model['name']} is\n{json.dumps(result, indent=4)}") + logging.info(f"End running {model['name']} test case.") + + logging.info(f"Full text result: {result}") + +def get_model_config(mode_name): + with open("models_igie.yaml", "r") as file: + models = yaml.safe_load(file) + + for model in models: + if model["name"] == mode_name.lower(): + return model + return + +def check_model_result(result): + status = "PASS" + for prec in ["fp16", "int8"]: + if prec in result["result"]: + if result["result"][prec]["status"] == "FAIL": + status = "FAIL" + break + result["status"] = status + +def run_nlp_testcase(model): + model_name = model["name"] + result = { + "name": model_name, + "result": {}, + } + d_url = model["download_url"] + checkpoint_n = d_url.split("/")[-1] + dataset_n = model["datasets"].split("/")[-1] + prepare_script = f""" + set -x + cd ../{model['relative_path']} + ln -s /mnt/deepspark/data/checkpoints/{checkpoint_n} ./model_name + bash ci/prepare.sh + """ + + # add pip list info when in debug mode + if utils.is_debug(): + pip_list_script = "pip list | grep -E 'numpy|transformer|igie|mmcv|onnx'\n" + prepare_script = pip_list_script + prepare_script + pip_list_script + + run_script(prepare_script) + + for prec in model["precisions"]: + logging.info(f"Start running {model_name} {prec} test case") + script = f""" + set -x + cd ../{model['relative_path']} + """ + if model_name == "baichuan2-7b": + script = f""" + set -x + cd ../{model['relative_path']} + python3 offline_inference.py --model ./baichuan2-7b/ --max-tokens 256 --trust-remote-code --chat_template template_baichuan.jinja --temperature 0.0 + """ + if prec == "int8": + script = f""" + set -x + cd ../{model['relative_path']} + python3 offline_inference.py --model ./baichuan2-7b/int8/ --chat_template template_baichuan.jinja --quantization w8a16 --max-num-seqs 1 --max-model-len 256 --trust-remote-code --temperature 0.0 --max-tokens 256 + """ + elif model_name == "chatglm3-6b": + script = f""" + set -x + cd ../{model['relative_path']} + python3 offline_inference.py --model ./chatglm3-6b --trust-remote-code --temperature 0.0 --max-tokens 256 + """ + elif model_name == "chatglm3-6b-32k": + script = f""" + set -x + cd ../{model['relative_path']} + python3 offline_inference.py --model
./chatglm3-6b-32k --trust-remote-code --temperature 0.0 --max-tokens 256 + """ + elif model_name == "llama2-7b": + script = f""" + set -x + cd ../{model['relative_path']} + python3 offline_inference.py --model ./llama2-7b --max-tokens 256 -tp 1 --temperature 0.0 + """ + elif model_name == "llama3-70b": + script = f""" + set -x + cd ../{model['relative_path']} + export CUDA_VISIBLE_DEVICES=0,1,2,3 + python3 offline_inference.py --model ./llama3-70b --max-tokens 256 -tp 4 --temperature 0.0 + """ + elif model_name == "qwen-7b": + script = f""" + set -x + cd ../{model['relative_path']} + export CUDA_VISIBLE_DEVICES=0,1 + python3 offline_inference.py --model ./qwen-7b --max-tokens 256 -tp 2 --trust-remote-code --temperature 0.0 + """ + elif model_name == "qwen1.5-7b": + script = f""" + set -x + cd ../{model['relative_path']} + python3 offline_inference.py --model ./qwen1.5-7b --max-tokens 256 -tp 1 --temperature 0.0 --max-model-len 3096 + """ + elif model_name == "qwen1.5-7b": + script = f""" + set -x + cd ../{model['relative_path']} + python3 offline_inference.py --model ./qwen1.5-7b --max-tokens 256 -tp 1 --temperature 0.0 --max-model-len 3096 + """ + elif model_name == "qwen1.5-14b": + script = f""" + set -x + cd ../{model['relative_path']} + python3 offline_inference.py --model ./qwen1.5-14b --max-tokens 256 -tp 1 --temperature 0.0 --max-model-len 1024 + """ + elif model_name == "qwen1.5-32b": + script = f""" + set -x + cd ../{model['relative_path']} + export CUDA_VISIBLE_DEVICES=0,1,2,3 + python3 offline_inference.py --model ./qwen1.5-32b --max-tokens 256 -tp 4 --temperature 0.0 + """ + elif model_name == "qwen1.5-72b": + script = f""" + set -x + cd ../{model['relative_path']} + export CUDA_VISIBLE_DEVICES=0,1 + python3 offline_inference.py --model ./qwen1.5-72b --max-tokens 256 -tp 2 --temperature 0.0 --max-model-len 3096 + """ + elif model_name == "qwen2-7b": + script = f""" + set -x + cd ../{model['relative_path']} + export CUDA_VISIBLE_DEVICES=0 + python3 offline_inference.py --model ./qwen2-7b --max-tokens 256 -tp 1 --temperature 0.0 + """ + elif model_name == "qwen2-72b": + script = f""" + set -x + cd ../{model['relative_path']} + export CUDA_VISIBLE_DEVICES=0,1,2,3 + python3 offline_inference.py --model ./qwen2-72b --max-tokens 256 -tp 4 --temperature 0.0 --gpu-memory-utilization 0.98 --max-model-len 32768 + """ + elif model_name == "stablelm": + script = f""" + set -x + cd ../{model['relative_path']} + export CUDA_VISIBLE_DEVICES=0,1 + python3 offline_inference.py --model ./stablelm --max-tokens 256 -tp 1 --temperature 0.0 + """ + + r, t = run_script(script) + sout = r.stdout + + pattern = METRIC_PATTERN + matchs = re.findall(pattern, sout) + result["result"].setdefault(prec, {"status": "FAIL"}) + logging.debug(f"matchs:\n{matchs}") + for m in matchs: + result["result"][prec].update(get_metric_result(m)) + if len(matchs) == 2: + result["result"][prec]["status"] = "PASS" + + result["result"][prec]["Cost time (s)"] = t + return result + +def run_speech_testcase(model): + model_name = model["name"] + result = { + "name": model_name, + "result": {}, + } + d_url = model["download_url"] + checkpoint_n = d_url.split("/")[-1] + dataset_n = model["datasets"].split("/")[-1] + prepare_script = f""" + cd ../{model['relative_path']} + ln -s /mnt/deepspark/data/checkpoints/{checkpoint_n} ./ + ln -s /mnt/deepspark/data/datasets/{dataset_n} ./ + """ + + if model["need_third_part"] and model_name == "conformer": + prepare_script += "unzip /mnt/deepspark/data/3rd_party/kenlm.zip -d 
./ctc_decoder/swig/kenlm\n" + prepare_script += "unzip /mnt/deepspark/data/3rd_party/ThreadPool.zip -d ./ctc_decoder/swig/ThreadPool\n" + prepare_script += "tar -xzvf /mnt/deepspark/data/3rd_party/openfst-1.6.3.tar.gz -C ./ctc_decoder/swig/\n" + + prepare_script += """ + export PYTHONPATH=`pwd`/wenet:$PYTHONPATH + echo $PYTHONPATH + bash ci/prepare.sh + ls -l | grep onnx + """ + + # add pip list info when in debug mode + if utils.is_debug(): + pip_list_script = "pip list | grep -E 'numpy|transformer|igie|mmcv|onnx'\n" + prepare_script = pip_list_script + prepare_script + pip_list_script + + run_script(prepare_script) + + for prec in model["precisions"]: + logging.info(f"Start running {model_name} {prec} test case") + script = f""" + cd ../{model['relative_path']} + export PYTHONPATH=./wenet:$PYTHONPATH + echo $PYTHONPATH + bash scripts/infer_{model_name}_{prec}_accuracy.sh + bash scripts/infer_{model_name}_{prec}_performance.sh + """ + + r, t = run_script(script) + sout = r.stdout + pattern = r"\* ([\w\d ]+):\s*([\d.]+)[ ms%]*, ([\w\d ]+):\s*([\d.]+)[ ms%]*" + matchs = re.findall(pattern, sout) + for m in matchs: + result["result"].setdefault(prec, {"status": "FAIL"}) + try: + result["result"][prec] = result["result"][prec] | {m[0]: float(m[1]), m[2]: float(m[3])} + except ValueError: + print("The string cannot be converted to a float.") + result["result"][prec] = result["result"][prec] | {m[0]: m[1], m[2]: m[3]} + pattern = METRIC_PATTERN + matchs = re.findall(pattern, sout) + if matchs and len(matchs) == 1: + result["result"].setdefault(prec, {}) + result["result"][prec].update(get_metric_result(matchs[0])) + result["result"][prec]["status"] = "PASS" + result["result"][prec]["Cost time (s)"] = t + logging.debug(f"matchs:\n{matchs}") + return result + +def get_metric_result(str): + if str: + return json.loads(str.replace("'", "\""))["metricResult"] + return None + +def run_script(script): + start_time = time.perf_counter() + result = subprocess.run( + script, shell=True, capture_output=True, text=True, executable="/bin/bash" + ) + end_time = time.perf_counter() + execution_time = end_time - start_time + logging.debug(f"执行命令:\n{script}") + logging.debug("执行时间: {:.4f} 秒".format(execution_time)) + logging.debug(f"标准输出: {result.stdout}") + logging.debug(f"标准错误: {result.stderr}") + logging.debug(f"返回码: {result.returncode}") + return result, execution_time + +if __name__ == "__main__": + main() -- Gitee From 96488331904933684f3dfea52c05cf6a959e43ad Mon Sep 17 00:00:00 2001 From: "hongliang.yuan" Date: Wed, 22 Jan 2025 11:05:44 +0800 Subject: [PATCH 2/6] update vllm --- tests/run_trtllm.py | 2 +- tests/run_vllm.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/run_trtllm.py b/tests/run_trtllm.py index 9f8a494f..911ac6fd 100644 --- a/tests/run_trtllm.py +++ b/tests/run_trtllm.py @@ -68,7 +68,7 @@ def main(): logging.info(f"Full text result: {result}") def get_model_config(mode_name): - with open("models_igie.yaml", "r") as file: + with open("models_trtllm.yaml", "r") as file: models = yaml.safe_load(file) for model in models: diff --git a/tests/run_vllm.py b/tests/run_vllm.py index 6d212c95..96498c2a 100644 --- a/tests/run_vllm.py +++ b/tests/run_vllm.py @@ -68,7 +68,7 @@ def main(): logging.info(f"Full text result: {result}") def get_model_config(mode_name): - with open("models_igie.yaml", "r") as file: + with open("models_vllm.yaml", "r") as file: models = yaml.safe_load(file) for model in models: -- Gitee From d8dcde4db9c6124ee6ea226530ee4d51b023523e Mon Sep 17 
00:00:00 2001 From: "hongliang.yuan" Date: Wed, 22 Jan 2025 11:36:52 +0800 Subject: [PATCH 3/6] fix model name --- tests/run_vllm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/run_vllm.py b/tests/run_vllm.py index 96498c2a..bc15c268 100644 --- a/tests/run_vllm.py +++ b/tests/run_vllm.py @@ -97,7 +97,7 @@ def run_nlp_testcase(model): prepare_script = f""" set -x cd ../{model['relative_path']} - ln -s /mnt/deepspark/data/checkpoints/{checkpoint_n} ./model_name + ln -s /mnt/deepspark/data/checkpoints/{checkpoint_n} ./{model_name} bash ci/prepare.sh """ -- Gitee From ae6a10adca6259aff74da0c67c2d9ce3016185ca Mon Sep 17 00:00:00 2001 From: "hongliang.yuan" Date: Wed, 22 Jan 2025 14:29:43 +0800 Subject: [PATCH 4/6] update vllm --- .../baichuan2-7b/vllm/ci/prepare.sh | 2 +- .../chatglm3-6b-32k/vllm/offline_inference.py | 2 +- .../qwen1.5-14b/vllm/README.md | 2 +- tests/models_vllm.yaml | 2 +- tests/run_vllm.py | 69 +------------------ 5 files changed, 6 insertions(+), 71 deletions(-) diff --git a/models/nlp/large_language_model/baichuan2-7b/vllm/ci/prepare.sh b/models/nlp/large_language_model/baichuan2-7b/vllm/ci/prepare.sh index 5b9abbd3..54b66b4e 100644 --- a/models/nlp/large_language_model/baichuan2-7b/vllm/ci/prepare.sh +++ b/models/nlp/large_language_model/baichuan2-7b/vllm/ci/prepare.sh @@ -29,4 +29,4 @@ pip3 install transformers==4.37.1 # has prepared in ci # ln -s /mnt/deepspark/data/checkpoints/Baichuan2-7B-Base ./ -python3 convert2int8.py --model-path ./Baichuan2-7B-Base/ \ No newline at end of file +python3 convert2int8.py --model-path ./baichuan2-7b/ \ No newline at end of file diff --git a/models/nlp/large_language_model/chatglm3-6b-32k/vllm/offline_inference.py b/models/nlp/large_language_model/chatglm3-6b-32k/vllm/offline_inference.py index bc731079..7fc45b68 100644 --- a/models/nlp/large_language_model/chatglm3-6b-32k/vllm/offline_inference.py +++ b/models/nlp/large_language_model/chatglm3-6b-32k/vllm/offline_inference.py @@ -57,7 +57,7 @@ if __name__ == "__main__": model_name = model_name.rsplit("/")[-1] # Sample prompts. - prompts = ["Щܱһھ?", "һ֥ʿ", "дһƪй5Gз¡"] + prompts = ["哪些迹象可能表明一个人正在经历焦虑?", "描述一下如何制作芝士披萨。", "写一篇有关5G网络研发的综述文章。"] # Create a sampling params object. 
sampling_params = SamplingParams(**sampling_params) diff --git a/models/nlp/large_language_model/qwen1.5-14b/vllm/README.md b/models/nlp/large_language_model/qwen1.5-14b/vllm/README.md index b3c67597..905967c2 100644 --- a/models/nlp/large_language_model/qwen1.5-14b/vllm/README.md +++ b/models/nlp/large_language_model/qwen1.5-14b/vllm/README.md @@ -29,7 +29,7 @@ ln -s /path/to/Qwen1.5-14B ./data/qwen1.5 ## Inference ```bash -python3 offline_inference.py --model ./data/qwen1.5/Qwen1.5-14B --max-tokens 256 -tp 1 --temperature 0.0 --max-model-len 1024 +python3 offline_inference.py --model ./data/qwen1.5/Qwen1.5-14B --max-tokens 256 -tp 1 --temperature 0.0 --max-model-len 896 ``` ## Results diff --git a/tests/models_vllm.yaml b/tests/models_vllm.yaml index 6d7177ea..821733f6 100644 --- a/tests/models_vllm.yaml +++ b/tests/models_vllm.yaml @@ -17,7 +17,7 @@ relative_path: models/nlp/large_language_model/chatglm3-6b/vllm task_type: nlp/large_language_model - datasets: https://localhost - download_url: https://www.modelscope.cn/models/ZhipuAI/chatglm3-6b-32k-无 + download_url: https://www.modelscope.cn/models/ZhipuAI/chatglm3-6b-32k name: chatglm3-6b-32k need_third_part: false precisions: diff --git a/tests/run_vllm.py b/tests/run_vllm.py index bc15c268..488465ba 100644 --- a/tests/run_vllm.py +++ b/tests/run_vllm.py @@ -174,7 +174,7 @@ def run_nlp_testcase(model): script = f""" set -x cd ../{model['relative_path']} - python3 offline_inference.py --model ./qwen1.5-14b --max-tokens 256 -tp 1 --temperature 0.0 --max-model-len 1024 + python3 offline_inference.py --model ./qwen1.5-14b --max-tokens 256 -tp 1 --temperature 0.0 --max-model-len 896 """ elif model_name == "qwen1.5-32b": script = f""" @@ -221,77 +221,12 @@ def run_nlp_testcase(model): logging.debug(f"matchs:\n{matchs}") for m in matchs: result["result"][prec].update(get_metric_result(m)) - if len(matchs) == 2: + if len(matchs) == 1: result["result"][prec]["status"] = "PASS" result["result"][prec]["Cost time (s)"] = t return result -def run_speech_testcase(model): - model_name = model["name"] - result = { - "name": model_name, - "result": {}, - } - d_url = model["download_url"] - checkpoint_n = d_url.split("/")[-1] - dataset_n = model["datasets"].split("/")[-1] - prepare_script = f""" - cd ../{model['relative_path']} - ln -s /mnt/deepspark/data/checkpoints/{checkpoint_n} ./ - ln -s /mnt/deepspark/data/datasets/{dataset_n} ./ - """ - - if model["need_third_part"] and model_name == "conformer": - prepare_script += "unzip /mnt/deepspark/data/3rd_party/kenlm.zip -d ./ctc_decoder/swig/kenlm\n" - prepare_script += "unzip /mnt/deepspark/data/3rd_party/ThreadPool.zip -d ./ctc_decoder/swig/ThreadPool\n" - prepare_script += "tar -xzvf /mnt/deepspark/data/3rd_party/openfst-1.6.3.tar.gz -C ./ctc_decoder/swig/\n" - - prepare_script += """ - export PYTHONPATH=`pwd`/wenet:$PYTHONPATH - echo $PYTHONPATH - bash ci/prepare.sh - ls -l | grep onnx - """ - - # add pip list info when in debug mode - if utils.is_debug(): - pip_list_script = "pip list | grep -E 'numpy|transformer|igie|mmcv|onnx'\n" - prepare_script = pip_list_script + prepare_script + pip_list_script - - run_script(prepare_script) - - for prec in model["precisions"]: - logging.info(f"Start running {model_name} {prec} test case") - script = f""" - cd ../{model['relative_path']} - export PYTHONPATH=./wenet:$PYTHONPATH - echo $PYTHONPATH - bash scripts/infer_{model_name}_{prec}_accuracy.sh - bash scripts/infer_{model_name}_{prec}_performance.sh - """ - - r, t = run_script(script) - sout = r.stdout - 
pattern = r"\* ([\w\d ]+):\s*([\d.]+)[ ms%]*, ([\w\d ]+):\s*([\d.]+)[ ms%]*" - matchs = re.findall(pattern, sout) - for m in matchs: - result["result"].setdefault(prec, {"status": "FAIL"}) - try: - result["result"][prec] = result["result"][prec] | {m[0]: float(m[1]), m[2]: float(m[3])} - except ValueError: - print("The string cannot be converted to a float.") - result["result"][prec] = result["result"][prec] | {m[0]: m[1], m[2]: m[3]} - pattern = METRIC_PATTERN - matchs = re.findall(pattern, sout) - if matchs and len(matchs) == 1: - result["result"].setdefault(prec, {}) - result["result"][prec].update(get_metric_result(matchs[0])) - result["result"][prec]["status"] = "PASS" - result["result"][prec]["Cost time (s)"] = t - logging.debug(f"matchs:\n{matchs}") - return result - def get_metric_result(str): if str: return json.loads(str.replace("'", "\""))["metricResult"] -- Gitee From c25c728f8d5514c75d29adb0fd545f8298787b34 Mon Sep 17 00:00:00 2001 From: "hongliang.yuan" Date: Wed, 22 Jan 2025 15:52:57 +0800 Subject: [PATCH 5/6] update vllm --- README.md | 2 +- .../minicpm-v-2}/vllm/README.md | 0 .../minicpm-v-2/vllm/ci/prepare.sh | 31 +++++++++++++++++++ .../minicpm-v-2}/vllm/minicpmv-2.0-offline.py | 0 tests/models_vllm.yaml | 8 +++++ tests/run_vllm.py | 11 ++++++- 6 files changed, 50 insertions(+), 2 deletions(-) rename models/{vision-language-understanding/MiniCPM-V-2 => multimodal/vision-language-understanding/minicpm-v-2}/vllm/README.md (100%) create mode 100644 models/multimodal/vision-language-understanding/minicpm-v-2/vllm/ci/prepare.sh rename models/{vision-language-understanding/MiniCPM-V-2 => multimodal/vision-language-understanding/minicpm-v-2}/vllm/minicpmv-2.0-offline.py (100%) diff --git a/README.md b/README.md index b5132821..4ddd9511 100644 --- a/README.md +++ b/README.md @@ -1168,7 +1168,7 @@ DeepSparkInference将按季度进行版本更新,后续会逐步丰富模型 MiniCPM-V-2 - Supported + Supported - - diff --git a/models/vision-language-understanding/MiniCPM-V-2/vllm/README.md b/models/multimodal/vision-language-understanding/minicpm-v-2/vllm/README.md similarity index 100% rename from models/vision-language-understanding/MiniCPM-V-2/vllm/README.md rename to models/multimodal/vision-language-understanding/minicpm-v-2/vllm/README.md diff --git a/models/multimodal/vision-language-understanding/minicpm-v-2/vllm/ci/prepare.sh b/models/multimodal/vision-language-understanding/minicpm-v-2/vllm/ci/prepare.sh new file mode 100644 index 00000000..f1c0b9c8 --- /dev/null +++ b/models/multimodal/vision-language-understanding/minicpm-v-2/vllm/ci/prepare.sh @@ -0,0 +1,31 @@ +#!/bin/bash +# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd. +# All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +set -x +ID=$(grep -oP '(?<=^ID=).+' /etc/os-release | tr -d '"') +if [[ ${ID} == "ubuntu" ]]; then + apt install -y libgl1-mesa-glx +elif [[ ${ID} == "centos" ]]; then + yum install -y mesa-libGL +else + echo "Not Support Os" +fi + +pip3 install timm==0.9.10 +pip3 install transformers +pip3 install --user --upgrade pillow -i https://pypi.tuna.tsinghua.edu.cn/simple + +cp /mnt/deepspark/data/datasets/dog.jpg ./ \ No newline at end of file diff --git a/models/vision-language-understanding/MiniCPM-V-2/vllm/minicpmv-2.0-offline.py b/models/multimodal/vision-language-understanding/minicpm-v-2/vllm/minicpmv-2.0-offline.py similarity index 100% rename from models/vision-language-understanding/MiniCPM-V-2/vllm/minicpmv-2.0-offline.py rename to models/multimodal/vision-language-understanding/minicpm-v-2/vllm/minicpmv-2.0-offline.py diff --git a/tests/models_vllm.yaml b/tests/models_vllm.yaml index 821733f6..548f16c8 100644 --- a/tests/models_vllm.yaml +++ b/tests/models_vllm.yaml @@ -104,3 +104,11 @@ - fp16 relative_path: models/nlp/large_language_model/stablelm/vllm task_type: nlp/large_language_model +- datasets: https://localhost + download_url: https://localhost/MiniCPM-V-2 + name: minicpm-v-2 + need_third_part: false + precisions: + - fp16 + relative_path: models/multimodal/vision-language-understanding/minicpm-v-2/vllm/ + task_type: multimodal/vision-language-understanding diff --git a/tests/run_vllm.py b/tests/run_vllm.py index 488465ba..8bd3504d 100644 --- a/tests/run_vllm.py +++ b/tests/run_vllm.py @@ -56,7 +56,7 @@ def main(): result = {} # NLP模型 - if model["task_type"] in ["nlp/large_language_model"]: + if model["task_type"] in ["nlp/large_language_model", "multimodal/vision-language-understanding"]: logging.info(f"Start running {model['name']} test case:\n{json.dumps(model, indent=4)}") d_url = model["download_url"] if d_url is not None: @@ -211,6 +211,15 @@ def run_nlp_testcase(model): export CUDA_VISIBLE_DEVICES=0,1 python3 offline_inference.py --model ./stablelm --max-tokens 256 -tp 1 --temperature 0.0 """ + elif model_name == "minicpm-v-2": + script = f""" + set -x + cd ../{model['relative_path']} + export PT_SDPA_ENABLE_HEAD_DIM_PADDING=1 + export PATH=/usr/local/corex/bin:${PATH} + export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda/lib64 + python3 minicpmv-2.0-offline.py --model-path ./minicpm-v-2 --image-path ./dog.jpg + """ r, t = run_script(script) sout = r.stdout -- Gitee From 9981f977ae3960fd0d1f54a5dc90956ae932c9a8 Mon Sep 17 00:00:00 2001 From: "hongliang.yuan" Date: Wed, 22 Jan 2025 17:00:42 +0800 Subject: [PATCH 6/6] fix minicpm --- .../minicpm-v-2/vllm/minicpmv-2.0-offline.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/models/multimodal/vision-language-understanding/minicpm-v-2/vllm/minicpmv-2.0-offline.py b/models/multimodal/vision-language-understanding/minicpm-v-2/vllm/minicpmv-2.0-offline.py index d6add4d8..1da0fdd8 100644 --- a/models/multimodal/vision-language-understanding/minicpm-v-2/vllm/minicpmv-2.0-offline.py +++ b/models/multimodal/vision-language-understanding/minicpm-v-2/vllm/minicpmv-2.0-offline.py @@ -38,9 +38,9 @@ def main(args): llm = LLM(model=MODEL_NAME, gpu_memory_utilization=0.95, # 使用全部GPU内存 trust_remote_code=True, - max_model_len=1024, - max_num_seqs=1, - max_num_batched_tokens=1024,) # 根据内存状况可调整此值 + max_model_len=2048, + # max_num_seqs=1, + max_num_batched_tokens=2048,) # 根据内存状况可调整此值 # 构建对话消息 messages = [{'role': 'user', 'content': '(./)\n' + '请描述这张图片'}] @@ -64,10 +64,10 @@ def main(args): # top_p=0.8, # 
top_k=100, # seed=3472, - max_tokens=128, + max_tokens=1024, # min_tokens=150, temperature=0, - use_beam_search=False, + # use_beam_search=False, # length_penalty=1.2, best_of=1) -- Gitee
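For orientation, the engine and sampling settings this last patch lands on correspond to a vLLM setup along the following lines. This is a bare text-only sketch: the actual minicpmv-2.0-offline.py also builds the chat-template prompt and passes the image, and the model path here is a placeholder.

```python
from vllm import LLM, SamplingParams

# Engine limits from the patch: 2048-token context and batched-token budget,
# with most of the GPU memory handed to the engine.
llm = LLM(model="./minicpm-v-2",            # placeholder path
          gpu_memory_utilization=0.95,
          trust_remote_code=True,
          max_model_len=2048,
          max_num_batched_tokens=2048)

# Greedy decoding with up to 1024 new tokens and a single candidate per prompt.
sampling = SamplingParams(temperature=0, max_tokens=1024, best_of=1)

outputs = llm.generate(["Describe this image."], sampling)
print(outputs[0].outputs[0].text)
```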