diff --git a/README.md b/README.md
index b51328219444c0816cb6ce62b66e6da013202715..4ddd9511c5b586b08f9eccaa36ad7bec101d0fab 100644
--- a/README.md
+++ b/README.md
@@ -1168,7 +1168,7 @@ DeepSparkInference将按季度进行版本更新,后续会逐步丰富模型
         MiniCPM-V-2
-        Supported
+        Supported
         -
         -
diff --git a/models/vision-language-understanding/MiniCPM-V-2/vllm/README.md b/models/multimodal/vision-language-understanding/minicpm-v-2/vllm/README.md
similarity index 100%
rename from models/vision-language-understanding/MiniCPM-V-2/vllm/README.md
rename to models/multimodal/vision-language-understanding/minicpm-v-2/vllm/README.md
diff --git a/models/multimodal/vision-language-understanding/minicpm-v-2/vllm/ci/prepare.sh b/models/multimodal/vision-language-understanding/minicpm-v-2/vllm/ci/prepare.sh
new file mode 100644
index 0000000000000000000000000000000000000000..f1c0b9c835b299341cc45a866800e7b4a4bd5c89
--- /dev/null
+++ b/models/multimodal/vision-language-understanding/minicpm-v-2/vllm/ci/prepare.sh
@@ -0,0 +1,31 @@
+#!/bin/bash
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set -x
+ID=$(grep -oP '(?<=^ID=).+' /etc/os-release | tr -d '"')
+if [[ ${ID} == "ubuntu" ]]; then
+    apt install -y libgl1-mesa-glx
+elif [[ ${ID} == "centos" ]]; then
+    yum install -y mesa-libGL
+else
+    echo "Unsupported OS"
+fi
+
+pip3 install timm==0.9.10
+pip3 install transformers
+pip3 install --user --upgrade pillow -i https://pypi.tuna.tsinghua.edu.cn/simple
+
+cp /mnt/deepspark/data/datasets/dog.jpg ./
\ No newline at end of file
diff --git a/models/vision-language-understanding/MiniCPM-V-2/vllm/minicpmv-2.0-offline.py b/models/multimodal/vision-language-understanding/minicpm-v-2/vllm/minicpmv-2.0-offline.py
similarity index 93%
rename from models/vision-language-understanding/MiniCPM-V-2/vllm/minicpmv-2.0-offline.py
rename to models/multimodal/vision-language-understanding/minicpm-v-2/vllm/minicpmv-2.0-offline.py
index d6add4d8f00fcc8bb307767d149dad8009f182b0..1da0fdd8e5bb7c507c5f24d8f66f0d0e5d7d5967 100644
--- a/models/vision-language-understanding/MiniCPM-V-2/vllm/minicpmv-2.0-offline.py
+++ b/models/multimodal/vision-language-understanding/minicpm-v-2/vllm/minicpmv-2.0-offline.py
@@ -38,9 +38,9 @@ def main(args):
     llm = LLM(model=MODEL_NAME,
               gpu_memory_utilization=0.95, # 使用全部GPU内存
               trust_remote_code=True,
-              max_model_len=1024,
-              max_num_seqs=1,
-              max_num_batched_tokens=1024,) # 根据内存状况可调整此值
+              max_model_len=2048,
+              # max_num_seqs=1,
+              max_num_batched_tokens=2048,) # 根据内存状况可调整此值
 
     # 构建对话消息
     messages = [{'role': 'user', 'content': '(<image>./</image>)\n' + '请描述这张图片'}]
@@ -64,10 +64,10 @@ def main(args):
         # top_p=0.8,
         # top_k=100,
         # seed=3472,
-        max_tokens=128,
+        max_tokens=1024,
         # min_tokens=150,
         temperature=0,
-        use_beam_search=False,
+        # use_beam_search=False,
         # length_penalty=1.2,
         best_of=1)
 
diff --git a/models/nlp/large_language_model/baichuan2-7b/vllm/ci/prepare.sh b/models/nlp/large_language_model/baichuan2-7b/vllm/ci/prepare.sh
new file mode 100644
index 0000000000000000000000000000000000000000..54b66b4eba0c0d7beb5f93d3699e4377dae399c0
--- /dev/null
+++ b/models/nlp/large_language_model/baichuan2-7b/vllm/ci/prepare.sh
@@ -0,0 +1,32 @@
+#!/bin/bash
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set -x
+ID=$(grep -oP '(?<=^ID=).+' /etc/os-release | tr -d '"')
+if [[ ${ID} == "ubuntu" ]]; then
+    apt install -y libgl1-mesa-glx
+elif [[ ${ID} == "centos" ]]; then
+    yum install -y mesa-libGL
+else
+    echo "Unsupported OS"
+fi
+
+pip3 install transformers==4.37.1
+
+# already prepared in CI
+# ln -s /mnt/deepspark/data/checkpoints/Baichuan2-7B-Base ./
+
+python3 convert2int8.py --model-path ./baichuan2-7b/
\ No newline at end of file
diff --git a/models/nlp/large_language_model/baichuan2-7b/vllm/offline_inference.py b/models/nlp/large_language_model/baichuan2-7b/vllm/offline_inference.py
index 40c0e2e10deb2869ad38dcdd2663a6ed8d5baa23..9e5738a17c9b532a4c38b3365a55df54c748a51d 100644
--- a/models/nlp/large_language_model/baichuan2-7b/vllm/offline_inference.py
+++ b/models/nlp/large_language_model/baichuan2-7b/vllm/offline_inference.py
@@ -109,4 +109,9 @@ for i, output in enumerate(outputs):
     print(f"Prompt: {prompt}\nGenerated text: {generated_text} \n")
 
 print(f"tokens: {num_tokens}, QPS: {num_tokens/duration_time}")
+metricResult = {"metricResult": {}}
+metricResult["metricResult"]["tokens"] = num_tokens
+metricResult["metricResult"]["QPS"] = round(num_tokens/duration_time,3)
+print(metricResult)
+
 # 0.3.2 tokens: 757, QPS: 97.97229589080902
\ No newline at end of file
diff --git a/models/nlp/large_language_model/chatglm3-6b-32k/vllm/ci/prepare.sh b/models/nlp/large_language_model/chatglm3-6b-32k/vllm/ci/prepare.sh
new file mode 100644
index 0000000000000000000000000000000000000000..ad683f6e992593e59d91544c2e1e6b724a6245ec
--- /dev/null
+++ b/models/nlp/large_language_model/chatglm3-6b-32k/vllm/ci/prepare.sh
@@ -0,0 +1,31 @@
+#!/bin/bash
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
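+
+# CI preparation for the chatglm3-6b-32k vLLM case: install the distro's
+# OpenGL runtime (a common vision-dependency requirement), then pin
+# transformers to the release these ChatGLM3 cases are assumed to work with.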
+
+set -x
+ID=$(grep -oP '(?<=^ID=).+' /etc/os-release | tr -d '"')
+if [[ ${ID} == "ubuntu" ]]; then
+    apt install -y libgl1-mesa-glx
+elif [[ ${ID} == "centos" ]]; then
+    yum install -y mesa-libGL
+else
+    echo "Unsupported OS"
+fi
+
+pip3 install transformers==4.37.1
diff --git a/models/nlp/large_language_model/chatglm3-6b-32k/vllm/offline_inference.py b/models/nlp/large_language_model/chatglm3-6b-32k/vllm/offline_inference.py
index bc731079f72988cd20c5a68b3ccb4e192769c8fb..7fc45b685046e85411b17ea4edf7ff46b8bf52a6 100644
--- a/models/nlp/large_language_model/chatglm3-6b-32k/vllm/offline_inference.py
+++ b/models/nlp/large_language_model/chatglm3-6b-32k/vllm/offline_inference.py
@@ -57,7 +57,7 @@ if __name__ == "__main__":
     model_name = model_name.rsplit("/")[-1]
 
     # Sample prompts.
-    prompts = ["Щܱһھ?", "һ֥ʿ", "дһƪй5Gз¡"]
+    prompts = ["哪些迹象可能表明一个人正在经历焦虑?", "描述一下如何制作芝士披萨。", "写一篇有关5G网络研发的综述文章。"]
 
     # Create a sampling params object.
     sampling_params = SamplingParams(**sampling_params)
diff --git a/models/nlp/large_language_model/chatglm3-6b/vllm/ci/prepare.sh b/models/nlp/large_language_model/chatglm3-6b/vllm/ci/prepare.sh
new file mode 100644
index 0000000000000000000000000000000000000000..4cfd5fd6f1fd9dcc9fab83d9023b33e4752606a9
--- /dev/null
+++ b/models/nlp/large_language_model/chatglm3-6b/vllm/ci/prepare.sh
@@ -0,0 +1,28 @@
+#!/bin/bash
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set -x
+ID=$(grep -oP '(?<=^ID=).+' /etc/os-release | tr -d '"')
+if [[ ${ID} == "ubuntu" ]]; then
+    apt install -y libgl1-mesa-glx
+elif [[ ${ID} == "centos" ]]; then
+    yum install -y mesa-libGL
+else
+    echo "Unsupported OS"
+fi
+
+pip3 install vllm==0.5.0
+pip3 install transformers==4.37.1
diff --git a/models/nlp/large_language_model/chatglm3-6b/vllm/offline_inference.py b/models/nlp/large_language_model/chatglm3-6b/vllm/offline_inference.py
index 0162d93c53ac839268b3c964e0e96ecaad63ac4e..eaa7fe206459819731b00523c6d40224afafc545 100644
--- a/models/nlp/large_language_model/chatglm3-6b/vllm/offline_inference.py
+++ b/models/nlp/large_language_model/chatglm3-6b/vllm/offline_inference.py
@@ -99,4 +99,9 @@ for i, output in enumerate(outputs):
     print(f"Prompt: {prompt}\nGenerated text: {generated_text} \n")
 
 print(f"tokens: {num_tokens}, QPS: {num_tokens/duration_time}")
+metricResult = {"metricResult": {}}
+metricResult["metricResult"]["tokens"] = num_tokens
+metricResult["metricResult"]["QPS"] = round(num_tokens/duration_time,3)
+print(metricResult)
+
 # 0.3.2 tokens: 422, QPS: 70.02308283048338(tokens: 422, QPS: 93.67210003677407),32-k 模型 tokens: 477, QPS: 81.46537314533865(tokens: 477, QPS: 106.54247895449554)
\ No newline at end of file
diff --git a/models/nlp/large_language_model/llama2-13b/trtllm/README.md b/models/nlp/large_language_model/llama2-13b/trtllm/README.md
index 4658334d0f2c2c262ff613a3127f9832965e465f..b24c29ce921f2571833a50d01ad3035ec28e1ea5 100755
--- a/models/nlp/large_language_model/llama2-13b/trtllm/README.md
+++ b/models/nlp/large_language_model/llama2-13b/trtllm/README.md
@@ -18,9 +18,6 @@ apt install -y libgl1-mesa-dev
 
 bash scripts/set_environment.sh .
 
-# *star refer to a specified version
-wget http://files.deepspark.org.cn:880/deepspark/add-ons/tensorrt_llm-*.whl
-pip install tensorrt_llm-*.whl
 ```
 
 ### Download
diff --git a/models/nlp/large_language_model/llama2-13b/trtllm/ci/prepare.sh b/models/nlp/large_language_model/llama2-13b/trtllm/ci/prepare.sh
new file mode 100644
index 0000000000000000000000000000000000000000..7947d62bee569e9f109283e843b288fc68148f0e
--- /dev/null
+++ b/models/nlp/large_language_model/llama2-13b/trtllm/ci/prepare.sh
@@ -0,0 +1,36 @@
+#!/bin/bash
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set -x
+ID=$(grep -oP '(?<=^ID=).+' /etc/os-release | tr -d '"')
+if [[ ${ID} == "ubuntu" ]]; then
+    apt install -y libgl1-mesa-glx
+elif [[ ${ID} == "centos" ]]; then
+    yum install -y mesa-libGL
+else
+    echo "Unsupported OS"
+fi
+
+bash scripts/set_environment.sh .
+
+# Download model from the website and make sure the model's path is "data/llama2-13b-chat"
+# Download dataset from the website and make sure the dataset's path is "data/datasets_cnn_dailymail"
+mkdir -p data
+ln -s /mnt/deepspark/data/checkpoints/llama2-13b-chat data/llama2-13b-chat
+ln -s /mnt/deepspark/data/datasets/datasets_cnn_dailymail data/datasets_cnn_dailymail
+# Please download rouge.py to this path if your server can't reach huggingface.co.
+mkdir -p rouge/
+cp /mnt/deepspark/data/3rd_party/rouge.py rouge/
\ No newline at end of file
diff --git a/models/nlp/large_language_model/llama2-70b/trtllm/README.md b/models/nlp/large_language_model/llama2-70b/trtllm/README.md
index 671b51c79f01d8760df490d167e32814634d507d..621dbffeeaac3387030e326e6e24f918644c9ae1 100644
--- a/models/nlp/large_language_model/llama2-70b/trtllm/README.md
+++ b/models/nlp/large_language_model/llama2-70b/trtllm/README.md
@@ -16,10 +16,6 @@ yum install -y mesa-libGL
 apt install -y libgl1-mesa-dev
 
 bash scripts/set_environment.sh .
-
-# *star refer to a specified version
-wget http://files.deepspark.org.cn:880/deepspark/add-ons/tensorrt_llm-*.whl
-pip install tensorrt_llm-*.whl
 ```
 
 ### Download
diff --git a/models/nlp/large_language_model/llama2-70b/trtllm/ci/prepare.sh b/models/nlp/large_language_model/llama2-70b/trtllm/ci/prepare.sh
new file mode 100644
index 0000000000000000000000000000000000000000..4a8c1e4e321df3e2ca3228b3456029062731ddfd
--- /dev/null
+++ b/models/nlp/large_language_model/llama2-70b/trtllm/ci/prepare.sh
@@ -0,0 +1,36 @@
+#!/bin/bash
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set -x
+ID=$(grep -oP '(?<=^ID=).+' /etc/os-release | tr -d '"')
+if [[ ${ID} == "ubuntu" ]]; then
+    apt install -y libgl1-mesa-glx
+elif [[ ${ID} == "centos" ]]; then
+    yum install -y mesa-libGL
+else
+    echo "Unsupported OS"
+fi
+
+bash scripts/set_environment.sh .
+
+# Download model from the website and make sure the model's path is "data/llama2-70b-chat"
+# Download dataset from the website and make sure the dataset's path is "data/datasets_cnn_dailymail"
+mkdir -p data
+ln -s /mnt/deepspark/data/checkpoints/llama2-70b-chat data/llama2-70b-chat
+ln -s /mnt/deepspark/data/datasets/datasets_cnn_dailymail data/datasets_cnn_dailymail
+# Please download rouge.py to this path if your server can't reach huggingface.co.
+mkdir -p rouge/
+cp /mnt/deepspark/data/3rd_party/rouge.py rouge/
\ No newline at end of file
diff --git a/models/nlp/large_language_model/llama2-7b/trtllm/ci/prepare.sh b/models/nlp/large_language_model/llama2-7b/trtllm/ci/prepare.sh
new file mode 100644
index 0000000000000000000000000000000000000000..2ac2384d24f563195692e601c86010febfcac6ef
--- /dev/null
+++ b/models/nlp/large_language_model/llama2-7b/trtllm/ci/prepare.sh
@@ -0,0 +1,36 @@
+#!/bin/bash
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set -x
+ID=$(grep -oP '(?<=^ID=).+' /etc/os-release | tr -d '"')
+if [[ ${ID} == "ubuntu" ]]; then
+    apt install -y libgl1-mesa-glx
+elif [[ ${ID} == "centos" ]]; then
+    yum install -y mesa-libGL
+else
+    echo "Unsupported OS"
+fi
+
+bash scripts/set_environment.sh .
+
+# Download model from the website and make sure the model's path is "data/llama2-7b-chat"
+# Download dataset from the website and make sure the dataset's path is "data/datasets_cnn_dailymail"
+mkdir -p data
+ln -s /mnt/deepspark/data/checkpoints/llama2-7b-chat data/llama2-7b-chat
+ln -s /mnt/deepspark/data/datasets/datasets_cnn_dailymail data/datasets_cnn_dailymail
+# Please download rouge.py to this path if your server can't reach huggingface.co.
+mkdir -p rouge/
+cp /mnt/deepspark/data/3rd_party/rouge.py rouge/
\ No newline at end of file
diff --git a/models/nlp/large_language_model/llama2-7b/vllm/ci/prepare.sh b/models/nlp/large_language_model/llama2-7b/vllm/ci/prepare.sh
new file mode 100644
index 0000000000000000000000000000000000000000..6afe9667d3033ef4fe624e70662dc90bd6e563f7
--- /dev/null
+++ b/models/nlp/large_language_model/llama2-7b/vllm/ci/prepare.sh
@@ -0,0 +1,30 @@
+#!/bin/bash
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set -x
+ID=$(grep -oP '(?<=^ID=).+' /etc/os-release | tr -d '"')
+if [[ ${ID} == "ubuntu" ]]; then
+    apt install -y libgl1-mesa-glx
+elif [[ ${ID} == "centos" ]]; then
+    yum install -y mesa-libGL
+else
+    echo "Unsupported OS"
+fi
+
+# Contact the iluvatar manager to get adapted install packages of vllm, triton, and ixformer
+pip3 install vllm
+pip3 install triton
+pip3 install ixformer
diff --git a/models/nlp/large_language_model/llama2-7b/vllm/offline_inference.py b/models/nlp/large_language_model/llama2-7b/vllm/offline_inference.py
index 9c0b6d2fabcd1e9f64f59208336f12cd8d0def0c..538d35410b4a178602726608cbbae21550e2f90b 100644
--- a/models/nlp/large_language_model/llama2-7b/vllm/offline_inference.py
+++ b/models/nlp/large_language_model/llama2-7b/vllm/offline_inference.py
@@ -129,3 +129,7 @@ if __name__ == "__main__":
         num_tokens += len(output.outputs[0].token_ids)
         print(f"Prompt: {prompt}\nGenerated text: {generated_text} \n")
     print(f"tokens: {num_tokens}, QPS: {num_tokens/duration_time}")
+    metricResult = {"metricResult": {}}
+    metricResult["metricResult"]["tokens"] = num_tokens
+    metricResult["metricResult"]["QPS"] = round(num_tokens/duration_time,3)
+    print(metricResult)
diff --git a/models/nlp/large_language_model/llama3-70b/vllm/ci/prepare.sh b/models/nlp/large_language_model/llama3-70b/vllm/ci/prepare.sh
new file mode 100644
index 0000000000000000000000000000000000000000..000245a822b911a916c2693f3b2adfbff570520b
--- /dev/null
+++ b/models/nlp/large_language_model/llama3-70b/vllm/ci/prepare.sh
@@ -0,0 +1,30 @@
+#!/bin/bash
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
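+
+# CI preparation for the llama3-70b vLLM case. No Python packages are
+# installed here; the Iluvatar-adapted vllm/triton/ixformer wheels are
+# assumed to be preinstalled in the CI image (compare llama2-7b/vllm).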
+
+set -x
+ID=$(grep -oP '(?<=^ID=).+' /etc/os-release | tr -d '"')
+if [[ ${ID} == "ubuntu" ]]; then
+    apt install -y libgl1-mesa-glx
+elif [[ ${ID} == "centos" ]]; then
+    yum install -y mesa-libGL
+else
+    echo "Unsupported OS"
+fi
+
diff --git a/models/nlp/large_language_model/llama3-70b/vllm/offline_inference.py b/models/nlp/large_language_model/llama3-70b/vllm/offline_inference.py
index cdf635a71debcbb68945a745e8cd7b2151968c77..6932fde6ba557964767d36b123dca9c4f4bf05c7 100644
--- a/models/nlp/large_language_model/llama3-70b/vllm/offline_inference.py
+++ b/models/nlp/large_language_model/llama3-70b/vllm/offline_inference.py
@@ -151,6 +151,10 @@ if args.acc_test:
         print('val ROUGE-1 score f1: {}, target ROUGE-1 score f1: {}, fail'.format(scores[0]["rouge-1"]['f'],args.acc_threshold))
         exit(1)
     print('val ROUGE-1 score f1: {}, target ROUGE-1 score f1: {}, pass'.format(scores[0]["rouge-1"]['f'],args.acc_threshold))
+    metricResult = {"metricResult": {}}
+    metricResult["metricResult"]["val ROUGE-1 score f1"] = scores[0]["rouge-1"]['f']
+    metricResult["metricResult"]["target ROUGE-1 score f1"] = args.acc_threshold
+    print(metricResult)
 
 # 2 7b vllm 0.1.6: batch 3, tokens: 773, QPS: 64.35866137433203; batch 1, tokens: 257, QPS: 25.396898421442113
 # 1\2 13b vllm 0.1.6: batch 3, tokens: 768, QPS: 41.538942353799506; batch 1, tokens: 257, QPS: 15.639606595029639 (2, 6.5829828847570795; 8, 5.137610167755676)
diff --git a/models/nlp/large_language_model/qwen-7b/text-generation-inference/README.md b/models/nlp/large_language_model/qwen-7b/text-generation-inference/README.md
index 33b0aab2bf76ae501b61e91c0a6104194fb4654a..729b9833fa6e0d7947f72dde56206988646bc299 100644
--- a/models/nlp/large_language_model/qwen-7b/text-generation-inference/README.md
+++ b/models/nlp/large_language_model/qwen-7b/text-generation-inference/README.md
@@ -17,10 +17,6 @@ yum install -y mesa-libGL
 
 ## Ubuntu
 apt install -y libgl1-mesa-dev
-# *star refer to a specified version
-wget http://files.deepspark.org.cn:880/deepspark/add-ons/text-generation-*.whl
-wget http://files.deepspark.org.cn:880/deepspark/add-ons/text-generation-server-*.whl
-pip install tensorrt_llm-*.whl text-generation-server-*.whl
 ```
 
 ### Download
diff --git a/models/nlp/large_language_model/qwen-7b/text-generation-inference/ci/prepare.sh b/models/nlp/large_language_model/qwen-7b/text-generation-inference/ci/prepare.sh
new file mode 100644
index 0000000000000000000000000000000000000000..4b2fdf8b5b521defcd963cd9e9fe92bd271dc2cf
--- /dev/null
+++ b/models/nlp/large_language_model/qwen-7b/text-generation-inference/ci/prepare.sh
@@ -0,0 +1,29 @@
+#!/bin/bash
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set -x
+ID=$(grep -oP '(?<=^ID=).+' /etc/os-release | tr -d '"')
+if [[ ${ID} == "ubuntu" ]]; then
+    apt install -y libgl1-mesa-glx
+elif [[ ${ID} == "centos" ]]; then
+    yum install -y mesa-libGL
+else
+    echo "Unsupported OS"
+fi
+
+mkdir -p data
+
+ln -s /mnt/deepspark/data/checkpoints/qwen-7B data/qwen-7B
\ No newline at end of file
diff --git a/models/nlp/large_language_model/qwen-7b/text-generation-inference/offline_inference.py b/models/nlp/large_language_model/qwen-7b/text-generation-inference/offline_inference.py
index 57db633401e7849adac36f5f9e6ad166fdf38bbd..e3ebcc3a5e6b40d7801ebeda885710aec4dd6f08 100644
--- a/models/nlp/large_language_model/qwen-7b/text-generation-inference/offline_inference.py
+++ b/models/nlp/large_language_model/qwen-7b/text-generation-inference/offline_inference.py
@@ -109,6 +109,11 @@ if __name__ == "__main__":
     duration_time = end_time - start_time
     print(f"generate length: {generations_one[0].generated_text.generated_tokens}")
     print(f"one batch: {generations_one[0].generated_text.text}\nqps: {generations_one[0].generated_text.generated_tokens /duration_time}")
+    metricResult = {"metricResult": {}}
+    metricResult["metricResult"]["generate length"] = generations_one[0].generated_text.generated_tokens
+    metricResult["metricResult"]["one batch"] = generations_one[0].generated_text.text
+    metricResult["metricResult"]["qps"] = generations_one[0].generated_text.generated_tokens /duration_time
+    print(metricResult)
 
 """
 qwen-7B
diff --git a/models/nlp/large_language_model/qwen-7b/vllm/ci/prepare.sh b/models/nlp/large_language_model/qwen-7b/vllm/ci/prepare.sh
new file mode 100644
index 0000000000000000000000000000000000000000..cfd5031a7d3e6ac57abbc085dd41fc1063482bee
--- /dev/null
+++ b/models/nlp/large_language_model/qwen-7b/vllm/ci/prepare.sh
@@ -0,0 +1,30 @@
+#!/bin/bash
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set -x
+ID=$(grep -oP '(?<=^ID=).+' /etc/os-release | tr -d '"')
+if [[ ${ID} == "ubuntu" ]]; then
+    apt install -y libgl1-mesa-glx
+elif [[ ${ID} == "centos" ]]; then
+    yum install -y mesa-libGL
+else
+    echo "Unsupported OS"
+fi
+
+# Contact the iluvatar manager to get adapted install packages of vllm, triton, and ixformer
+pip3 install vllm
+pip3 install triton
+pip3 install ixformer
\ No newline at end of file
diff --git a/models/nlp/large_language_model/qwen-7b/vllm/offline_inference.py b/models/nlp/large_language_model/qwen-7b/vllm/offline_inference.py
index 3b9e9fd89ae1043b6055a01087d8d6e421281c5e..5de14fb0f6989e50ed03725946cb17543d30832d 100644
--- a/models/nlp/large_language_model/qwen-7b/vllm/offline_inference.py
+++ b/models/nlp/large_language_model/qwen-7b/vllm/offline_inference.py
@@ -130,3 +130,7 @@ if __name__ == "__main__":
         num_tokens += len(output.outputs[0].token_ids)
         print(f"Prompt: {prompt}\nGenerated text: {generated_text} \n")
     print(f"tokens: {num_tokens}, QPS: {num_tokens/duration_time}")
+    metricResult = {"metricResult": {}}
+    metricResult["metricResult"]["tokens"] = num_tokens
+    metricResult["metricResult"]["QPS"] = round(num_tokens/duration_time,3)
+    print(metricResult)
diff --git a/models/nlp/large_language_model/qwen1.5-14b/vllm/README.md b/models/nlp/large_language_model/qwen1.5-14b/vllm/README.md
index b3c67597fbb13b3d404969aa0f4c09208b131e8c..905967c2c7372eba6bfb3791fe6833400ba68ee6 100644
--- a/models/nlp/large_language_model/qwen1.5-14b/vllm/README.md
+++ b/models/nlp/large_language_model/qwen1.5-14b/vllm/README.md
@@ -29,7 +29,7 @@ ln -s /path/to/Qwen1.5-14B ./data/qwen1.5
 ## Inference
 
 ```bash
-python3 offline_inference.py --model ./data/qwen1.5/Qwen1.5-14B --max-tokens 256 -tp 1 --temperature 0.0 --max-model-len 1024
+python3 offline_inference.py --model ./data/qwen1.5/Qwen1.5-14B --max-tokens 256 -tp 1 --temperature 0.0 --max-model-len 896
 ```
 
 ## Results
diff --git a/models/nlp/large_language_model/qwen1.5-14b/vllm/ci/prepare.sh b/models/nlp/large_language_model/qwen1.5-14b/vllm/ci/prepare.sh
new file mode 100644
index 0000000000000000000000000000000000000000..75fb19458942e1d61a674c1d6dd9bbdb521bc00c
--- /dev/null
+++ b/models/nlp/large_language_model/qwen1.5-14b/vllm/ci/prepare.sh
@@ -0,0 +1,25 @@
+#!/bin/bash
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set -x
+ID=$(grep -oP '(?<=^ID=).+' /etc/os-release | tr -d '"')
+if [[ ${ID} == "ubuntu" ]]; then
+    apt install -y libgl1-mesa-glx
+elif [[ ${ID} == "centos" ]]; then
+    yum install -y mesa-libGL
+else
+    echo "Unsupported OS"
+fi
diff --git a/models/nlp/large_language_model/qwen1.5-14b/vllm/offline_inference.py b/models/nlp/large_language_model/qwen1.5-14b/vllm/offline_inference.py
index 7ee127a259eb78f91d71c07b4a129464e0cc6cd3..130f0885a4a85b657d71d6204bdf78a2cc9e871a 100644
--- a/models/nlp/large_language_model/qwen1.5-14b/vllm/offline_inference.py
+++ b/models/nlp/large_language_model/qwen1.5-14b/vllm/offline_inference.py
@@ -108,3 +108,7 @@ for i, output in enumerate(outputs):
     num_tokens += len(output.outputs[0].token_ids)
     print(f"Prompt: {prompt}\nGenerated text: {generated_text} \n")
 print(f"tokens: {num_tokens}, QPS: {num_tokens/duration_time}")
+metricResult = {"metricResult": {}}
+metricResult["metricResult"]["tokens"] = num_tokens
+metricResult["metricResult"]["QPS"] = round(num_tokens/duration_time,3)
+print(metricResult)
\ No newline at end of file
diff --git a/models/nlp/large_language_model/qwen1.5-32b/vllm/ci/prepare.sh b/models/nlp/large_language_model/qwen1.5-32b/vllm/ci/prepare.sh
new file mode 100644
index 0000000000000000000000000000000000000000..cfd5031a7d3e6ac57abbc085dd41fc1063482bee
--- /dev/null
+++ b/models/nlp/large_language_model/qwen1.5-32b/vllm/ci/prepare.sh
@@ -0,0 +1,30 @@
+#!/bin/bash
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set -x
+ID=$(grep -oP '(?<=^ID=).+' /etc/os-release | tr -d '"')
+if [[ ${ID} == "ubuntu" ]]; then
+    apt install -y libgl1-mesa-glx
+elif [[ ${ID} == "centos" ]]; then
+    yum install -y mesa-libGL
+else
+    echo "Unsupported OS"
+fi
+
+# Contact the iluvatar manager to get adapted install packages of vllm, triton, and ixformer
+pip3 install vllm
+pip3 install triton
+pip3 install ixformer
\ No newline at end of file
diff --git a/models/nlp/large_language_model/qwen1.5-32b/vllm/offline_inference.py b/models/nlp/large_language_model/qwen1.5-32b/vllm/offline_inference.py
index 5e85929151748155349ec1de2bd89b9789f48574..9799150f7c1221b3d8f58857d3463727c9d5400e 100644
--- a/models/nlp/large_language_model/qwen1.5-32b/vllm/offline_inference.py
+++ b/models/nlp/large_language_model/qwen1.5-32b/vllm/offline_inference.py
@@ -129,3 +129,7 @@ if __name__ == "__main__":
         num_tokens += len(output.outputs[0].token_ids)
         print(f"Prompt: {prompt}\nGenerated text: {generated_text} \n")
     print(f"tokens: {num_tokens}, QPS: {num_tokens/duration_time}")
+    metricResult = {"metricResult": {}}
+    metricResult["metricResult"]["tokens"] = num_tokens
+    metricResult["metricResult"]["QPS"] = round(num_tokens/duration_time,3)
+    print(metricResult)
\ No newline at end of file
diff --git a/models/nlp/large_language_model/qwen1.5-72b/vllm/ci/prepare.sh b/models/nlp/large_language_model/qwen1.5-72b/vllm/ci/prepare.sh
new file mode 100644
index 0000000000000000000000000000000000000000..75fb19458942e1d61a674c1d6dd9bbdb521bc00c
--- /dev/null
+++ b/models/nlp/large_language_model/qwen1.5-72b/vllm/ci/prepare.sh
@@ -0,0 +1,25 @@
+#!/bin/bash
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set -x
+ID=$(grep -oP '(?<=^ID=).+' /etc/os-release | tr -d '"')
+if [[ ${ID} == "ubuntu" ]]; then
+    apt install -y libgl1-mesa-glx
+elif [[ ${ID} == "centos" ]]; then
+    yum install -y mesa-libGL
+else
+    echo "Unsupported OS"
+fi
diff --git a/models/nlp/large_language_model/qwen1.5-72b/vllm/offline_inference.py b/models/nlp/large_language_model/qwen1.5-72b/vllm/offline_inference.py
index 7ee127a259eb78f91d71c07b4a129464e0cc6cd3..130f0885a4a85b657d71d6204bdf78a2cc9e871a 100644
--- a/models/nlp/large_language_model/qwen1.5-72b/vllm/offline_inference.py
+++ b/models/nlp/large_language_model/qwen1.5-72b/vllm/offline_inference.py
@@ -108,3 +108,7 @@ for i, output in enumerate(outputs):
     num_tokens += len(output.outputs[0].token_ids)
     print(f"Prompt: {prompt}\nGenerated text: {generated_text} \n")
 print(f"tokens: {num_tokens}, QPS: {num_tokens/duration_time}")
+metricResult = {"metricResult": {}}
+metricResult["metricResult"]["tokens"] = num_tokens
+metricResult["metricResult"]["QPS"] = round(num_tokens/duration_time,3)
+print(metricResult)
\ No newline at end of file
diff --git a/models/nlp/large_language_model/qwen1.5-7b/text-generation-inference/ci/prepare.sh b/models/nlp/large_language_model/qwen1.5-7b/text-generation-inference/ci/prepare.sh
new file mode 100644
index 0000000000000000000000000000000000000000..c801677cefaf6638b3a9695a7da4d84a3a66fcc1
--- /dev/null
+++ b/models/nlp/large_language_model/qwen1.5-7b/text-generation-inference/ci/prepare.sh
@@ -0,0 +1,29 @@
+#!/bin/bash
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set -x
+ID=$(grep -oP '(?<=^ID=).+' /etc/os-release | tr -d '"')
+if [[ ${ID} == "ubuntu" ]]; then
+    apt install -y libgl1-mesa-glx
+elif [[ ${ID} == "centos" ]]; then
+    yum install -y mesa-libGL
+else
+    echo "Unsupported OS"
+fi
+
+mkdir -p data
+
+ln -s /mnt/deepspark/data/checkpoints/Qwen1.5-7B data/Qwen1.5-7B
\ No newline at end of file
diff --git a/models/nlp/large_language_model/qwen1.5-7b/text-generation-inference/offline_inference.py b/models/nlp/large_language_model/qwen1.5-7b/text-generation-inference/offline_inference.py
index b927973a76953e189d4c4ebd4ee10bc392e0b4f0..87f4df9885a635af2e019fb76bbca2c5210f0cb4 100644
--- a/models/nlp/large_language_model/qwen1.5-7b/text-generation-inference/offline_inference.py
+++ b/models/nlp/large_language_model/qwen1.5-7b/text-generation-inference/offline_inference.py
@@ -115,6 +115,11 @@ if __name__ == "__main__":
     duration_time = end_time - start_time
     print(f"generate length: {generations_one[0].generated_text.generated_tokens}")
     print(f"one batch: {generations_one[0].generated_text.text}\nqps: {generations_one[0].generated_text.generated_tokens /duration_time}")
+    metricResult = {"metricResult": {}}
+    metricResult["metricResult"]["generate length"] = generations_one[0].generated_text.generated_tokens
+    metricResult["metricResult"]["one batch"] = generations_one[0].generated_text.text
+    metricResult["metricResult"]["qps"] = generations_one[0].generated_text.generated_tokens /duration_time
+    print(metricResult)
 
 """
 qwen1.5-0.5B
diff --git a/models/nlp/large_language_model/qwen1.5-7b/vllm/ci/prepare.sh b/models/nlp/large_language_model/qwen1.5-7b/vllm/ci/prepare.sh
new file mode 100644
index 0000000000000000000000000000000000000000..75fb19458942e1d61a674c1d6dd9bbdb521bc00c
--- /dev/null
+++ b/models/nlp/large_language_model/qwen1.5-7b/vllm/ci/prepare.sh
@@ -0,0 +1,25 @@
+#!/bin/bash
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set -x
+ID=$(grep -oP '(?<=^ID=).+' /etc/os-release | tr -d '"')
+if [[ ${ID} == "ubuntu" ]]; then
+    apt install -y libgl1-mesa-glx
+elif [[ ${ID} == "centos" ]]; then
+    yum install -y mesa-libGL
+else
+    echo "Unsupported OS"
+fi
diff --git a/models/nlp/large_language_model/qwen1.5-7b/vllm/offline_inference.py b/models/nlp/large_language_model/qwen1.5-7b/vllm/offline_inference.py
index 7ee127a259eb78f91d71c07b4a129464e0cc6cd3..bae01307762ee571e9c8bd5f77af10a177bbf28d 100644
--- a/models/nlp/large_language_model/qwen1.5-7b/vllm/offline_inference.py
+++ b/models/nlp/large_language_model/qwen1.5-7b/vllm/offline_inference.py
@@ -108,3 +108,7 @@ for i, output in enumerate(outputs):
     num_tokens += len(output.outputs[0].token_ids)
     print(f"Prompt: {prompt}\nGenerated text: {generated_text} \n")
 print(f"tokens: {num_tokens}, QPS: {num_tokens/duration_time}")
+metricResult = {"metricResult": {}}
+metricResult["metricResult"]["tokens"] = num_tokens
+metricResult["metricResult"]["QPS"] = round(num_tokens/duration_time,3)
+print(metricResult)
diff --git a/models/nlp/large_language_model/qwen2-72b/vllm/ci/prepare.sh b/models/nlp/large_language_model/qwen2-72b/vllm/ci/prepare.sh
new file mode 100644
index 0000000000000000000000000000000000000000..cfd5031a7d3e6ac57abbc085dd41fc1063482bee
--- /dev/null
+++ b/models/nlp/large_language_model/qwen2-72b/vllm/ci/prepare.sh
@@ -0,0 +1,30 @@
+#!/bin/bash
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set -x
+ID=$(grep -oP '(?<=^ID=).+' /etc/os-release | tr -d '"')
+if [[ ${ID} == "ubuntu" ]]; then
+    apt install -y libgl1-mesa-glx
+elif [[ ${ID} == "centos" ]]; then
+    yum install -y mesa-libGL
+else
+    echo "Unsupported OS"
+fi
+
+# Contact the iluvatar manager to get adapted install packages of vllm, triton, and ixformer
+pip3 install vllm
+pip3 install triton
+pip3 install ixformer
\ No newline at end of file
diff --git a/models/nlp/large_language_model/qwen2-72b/vllm/offline_inference.py b/models/nlp/large_language_model/qwen2-72b/vllm/offline_inference.py
index 5e85929151748155349ec1de2bd89b9789f48574..9799150f7c1221b3d8f58857d3463727c9d5400e 100644
--- a/models/nlp/large_language_model/qwen2-72b/vllm/offline_inference.py
+++ b/models/nlp/large_language_model/qwen2-72b/vllm/offline_inference.py
@@ -129,3 +129,7 @@ if __name__ == "__main__":
         num_tokens += len(output.outputs[0].token_ids)
         print(f"Prompt: {prompt}\nGenerated text: {generated_text} \n")
     print(f"tokens: {num_tokens}, QPS: {num_tokens/duration_time}")
+    metricResult = {"metricResult": {}}
+    metricResult["metricResult"]["tokens"] = num_tokens
+    metricResult["metricResult"]["QPS"] = round(num_tokens/duration_time,3)
+    print(metricResult)
\ No newline at end of file
diff --git a/models/nlp/large_language_model/qwen2-7b/vllm/ci/prepare.sh b/models/nlp/large_language_model/qwen2-7b/vllm/ci/prepare.sh
new file mode 100644
index 0000000000000000000000000000000000000000..cfd5031a7d3e6ac57abbc085dd41fc1063482bee
--- /dev/null
+++ b/models/nlp/large_language_model/qwen2-7b/vllm/ci/prepare.sh
@@ -0,0 +1,30 @@
+#!/bin/bash
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set -x
+ID=$(grep -oP '(?<=^ID=).+' /etc/os-release | tr -d '"')
+if [[ ${ID} == "ubuntu" ]]; then
+    apt install -y libgl1-mesa-glx
+elif [[ ${ID} == "centos" ]]; then
+    yum install -y mesa-libGL
+else
+    echo "Unsupported OS"
+fi
+
+# Contact the iluvatar manager to get adapted install packages of vllm, triton, and ixformer
+pip3 install vllm
+pip3 install triton
+pip3 install ixformer
\ No newline at end of file
diff --git a/models/nlp/large_language_model/qwen2-7b/vllm/offline_inference.py b/models/nlp/large_language_model/qwen2-7b/vllm/offline_inference.py
index 5e85929151748155349ec1de2bd89b9789f48574..9799150f7c1221b3d8f58857d3463727c9d5400e 100644
--- a/models/nlp/large_language_model/qwen2-7b/vllm/offline_inference.py
+++ b/models/nlp/large_language_model/qwen2-7b/vllm/offline_inference.py
@@ -129,3 +129,7 @@ if __name__ == "__main__":
         num_tokens += len(output.outputs[0].token_ids)
         print(f"Prompt: {prompt}\nGenerated text: {generated_text} \n")
     print(f"tokens: {num_tokens}, QPS: {num_tokens/duration_time}")
+    metricResult = {"metricResult": {}}
+    metricResult["metricResult"]["tokens"] = num_tokens
+    metricResult["metricResult"]["QPS"] = round(num_tokens/duration_time,3)
+    print(metricResult)
\ No newline at end of file
diff --git a/models/nlp/large_language_model/stablelm/vllm/ci/prepare.sh b/models/nlp/large_language_model/stablelm/vllm/ci/prepare.sh
new file mode 100644
index 0000000000000000000000000000000000000000..6d81455870a7e0da387248def8bb77bbae5e417f
--- /dev/null
+++ b/models/nlp/large_language_model/stablelm/vllm/ci/prepare.sh
@@ -0,0 +1,27 @@
+#!/bin/bash
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set -x
+ID=$(grep -oP '(?<=^ID=).+' /etc/os-release | tr -d '"')
+if [[ ${ID} == "ubuntu" ]]; then
+    apt install -y libgl1-mesa-glx
+elif [[ ${ID} == "centos" ]]; then
+    yum install -y mesa-libGL
+else
+    echo "Unsupported OS"
+fi
+
+pip3 install transformers
\ No newline at end of file
diff --git a/models/nlp/large_language_model/stablelm/vllm/offline_inference.py b/models/nlp/large_language_model/stablelm/vllm/offline_inference.py
index 40678a62ea18296ecdd53cbbcf7d8c3c25e0950d..e9f2abfb7002071d0ce520be433eb972fd0def4b 100644
--- a/models/nlp/large_language_model/stablelm/vllm/offline_inference.py
+++ b/models/nlp/large_language_model/stablelm/vllm/offline_inference.py
@@ -132,4 +132,8 @@ if __name__ == "__main__":
         num_tokens += len(output.outputs[0].token_ids)
         print(f"Prompt: {prompt}\nGenerated text: {generated_text} \n")
 
-    print(f"tokens: {num_tokens}, QPS: {num_tokens/duration_time}")
\ No newline at end of file
+    print(f"tokens: {num_tokens}, QPS: {num_tokens/duration_time}")
+    metricResult = {"metricResult": {}}
+    metricResult["metricResult"]["tokens"] = num_tokens
+    metricResult["metricResult"]["QPS"] = round(num_tokens/duration_time,3)
+    print(metricResult)
\ No newline at end of file
diff --git a/tests/models_trtllm.yaml b/tests/models_trtllm.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..de21908e72524b29534ee542380ab49dfb448e30
--- /dev/null
+++ b/tests/models_trtllm.yaml
@@ -0,0 +1,41 @@
+---
+- datasets: https://localhost
+  download_url: https://huggingface.co/meta-llama/llama2-7b-chat
+  name: llama2-7b
+  need_third_part: false
+  precisions:
+  - fp16
+  relative_path: models/nlp/large_language_model/llama2-7b/trtllm
+  task_type: nlp/large_language_model
+- datasets: https://localhost
+  download_url: https://localhost/llama2-13b-chat
+  name: llama2-13b
+  need_third_part: false
+  precisions:
+  - fp16
+  relative_path: models/nlp/large_language_model/llama2-13b/trtllm
+  task_type: nlp/large_language_model
+- datasets: https://localhost
+  download_url: https://localhost/llama2-70b-chat
+  name: llama2-70b
+  need_third_part: false
+  precisions:
+  - fp16
+  relative_path: models/nlp/large_language_model/llama2-70b/trtllm
+  task_type: nlp/large_language_model
+- datasets: https://localhost
+  download_url: https://localhost/qwen-7B
+  name: qwen-7b
+  need_third_part: false
+  precisions:
+  - fp16
+  relative_path: models/nlp/large_language_model/qwen-7b/text-generation-inference
+  task_type: nlp/large_language_model
+- datasets: https://localhost
+  download_url: https://modelscope.cn/models/qwen/Qwen1.5-7B
+  name: qwen1.5-7b
+  need_third_part: false
+  precisions:
+  - fp16
+  relative_path: models/nlp/large_language_model/qwen1.5-7b/text-generation-inference
+  task_type: nlp/large_language_model
diff --git a/tests/models_vllm.yaml b/tests/models_vllm.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..548f16c81dbc297b22a666e4982c3a3027311128
--- /dev/null
+++ b/tests/models_vllm.yaml
@@ -0,0 +1,114 @@
+---
+- datasets: https://localhost
+  download_url: https://huggingface.co/baichuan-inc/Baichuan2-7B-Base
+  name: baichuan2-7b
+  need_third_part: false
+  precisions:
+  - fp16
+  - int8
+  relative_path: models/nlp/large_language_model/baichuan2-7b/vllm
+  task_type: nlp/large_language_model
+- datasets: https://localhost
+  download_url: https://huggingface.co/THUDM/chatglm3-6b
+  name: chatglm3-6b
+  need_third_part: false
+  precisions:
+  - fp16
+  relative_path: models/nlp/large_language_model/chatglm3-6b/vllm
+  task_type: nlp/large_language_model
+- datasets: https://localhost
+  download_url: https://www.modelscope.cn/models/ZhipuAI/chatglm3-6b-32k
+  name: chatglm3-6b-32k
+  need_third_part: false
+  precisions:
+  - fp16
+  relative_path: models/nlp/large_language_model/chatglm3-6b-32k/vllm
+  task_type: nlp/large_language_model
+- datasets: https://localhost
+  download_url: https://localhost/llama2-7b
+  name: llama2-7b
+  need_third_part: false
+  precisions:
+  - fp16
+  relative_path: models/nlp/large_language_model/llama2-7b/vllm
+  task_type: nlp/large_language_model
+- datasets: https://localhost
+  download_url: https://localhost/Meta-Llama-3-70B-Instruct
+  name: llama3-70b
+  need_third_part: false
+  precisions:
+  - fp16
+  relative_path: models/nlp/large_language_model/llama3-70b/vllm
+  task_type: nlp/large_language_model
+- datasets: https://localhost
+  download_url: https://localhost/qwen-7B
+  name: qwen-7b
+  need_third_part: false
+  precisions:
+  - fp16
+  relative_path: models/nlp/large_language_model/qwen-7b/vllm
+  task_type: nlp/large_language_model
+- datasets: https://localhost
+  download_url: https://modelscope.cn/models/qwen/Qwen1.5-7B
+  name: qwen1.5-7b
+  need_third_part: false
+  precisions:
+  - fp16
+  relative_path: models/nlp/large_language_model/qwen1.5-7b/vllm
+  task_type: nlp/large_language_model
+- datasets: https://localhost
+  download_url: https://modelscope.cn/models/qwen/Qwen1.5-14B
+  name: qwen1.5-14b
+  need_third_part: false
+  precisions:
+  - fp16
+  relative_path: models/nlp/large_language_model/qwen1.5-14b/vllm
+  task_type: nlp/large_language_model
+- datasets: https://localhost
+  download_url: https://modelscope.cn/models/Qwen/Qwen1.5-32B-Chat
+  name: qwen1.5-32b
+  need_third_part: false
+  precisions:
+  - fp16
+  relative_path: models/nlp/large_language_model/qwen1.5-32b/vllm
+  task_type: nlp/large_language_model
+- datasets: https://localhost
+  download_url: https://modelscope.cn/models/qwen/Qwen1.5-72B
+  name: qwen1.5-72b
+  need_third_part: false
+  precisions:
+  - fp16
+  relative_path: models/nlp/large_language_model/qwen1.5-72b/vllm
+  task_type: nlp/large_language_model
+- datasets: https://localhost
+  download_url: https://modelscope.cn/models/Qwen/Qwen2-7B-Instruct
+  name: qwen2-7b
+  need_third_part: false
+  precisions:
+  - fp16
+  relative_path: models/nlp/large_language_model/qwen2-7b/vllm
+  task_type: nlp/large_language_model
+- datasets: https://localhost
+  download_url: https://localhost/Qwen2-72B
+  name: qwen2-72b
+  need_third_part: false
+  precisions:
+  - fp16
+  relative_path: models/nlp/large_language_model/qwen2-72b/vllm
+  task_type: nlp/large_language_model
+- datasets: https://localhost
+  download_url: https://huggingface.co/stabilityai/stablelm-2-1_6b
+  name: stablelm
+  need_third_part: false
+  precisions:
+  - fp16
+  relative_path: models/nlp/large_language_model/stablelm/vllm
+  task_type: nlp/large_language_model
+- datasets: https://localhost
+  download_url: https://localhost/MiniCPM-V-2
+  name: minicpm-v-2
+  need_third_part: false
+  precisions:
+  - fp16
+  relative_path: models/multimodal/vision-language-understanding/minicpm-v-2/vllm/
+  task_type: multimodal/vision-language-understanding
diff --git a/tests/run_trtllm.py b/tests/run_trtllm.py
new file mode 100644
index 0000000000000000000000000000000000000000..911ac6fd80bed287336bd092c521cf6f6478b396
--- /dev/null
+++ b/tests/run_trtllm.py
@@ -0,0 +1,189 @@
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import yaml
+import subprocess
+import json
+import re
+import time
+import logging
+import os
+import sys
+import argparse
+
+import utils
+
+# Configure logging
+debug_level = logging.DEBUG if utils.is_debug() else logging.INFO
+logging.basicConfig(
+    handlers=[logging.FileHandler("output.log"), logging.StreamHandler()],
+    level=debug_level,
+    format="%(asctime)s - %(levelname)s - %(message)s",
+)
+
+METRIC_PATTERN = r"{'metricResult':.*}"
+
+def main():
+    parser = argparse.ArgumentParser(description="")
+    parser.add_argument("--model", type=str, help="model name, e.g.: llama2-7b")
+    args = parser.parse_args()
+
+    if args.model:
+        test_model = args.model
+    else:
+        test_model = os.environ.get("TEST_CASE")
+    logging.info(f"Test case to run: {test_model}")
+    if not test_model:
+        logging.error("test model case is empty")
+        sys.exit(-1)
+
+    model = get_model_config(test_model)
+    if not model:
+        logging.error("model config is empty")
+        sys.exit(-1)
+
+    result = {}
+    # NLP models
+    if model["task_type"] in ["nlp/large_language_model"]:
+        logging.info(f"Start running {model['name']} test case:\n{json.dumps(model, indent=4)}")
+        d_url = model["download_url"]
+        if d_url is not None:
+            result = run_nlp_testcase(model)
+            check_model_result(result)
+            logging.debug(f"The result of {model['name']} is\n{json.dumps(result, indent=4)}")
+            logging.info(f"End running {model['name']} test case.")
+
+    logging.info(f"Full text result: {result}")
+
+def get_model_config(model_name):
+    with open("models_trtllm.yaml", "r") as file:
+        models = yaml.safe_load(file)
+
+    for model in models:
+        if model["name"] == model_name.lower():
+            return model
+    return
+
+def check_model_result(result):
+    status = "PASS"
+    for prec in ["fp16", "int8"]:
+        if prec in result["result"]:
+            if result["result"][prec]["status"] == "FAIL":
+                status = "FAIL"
+                break
+    result["status"] = status
+
+def run_nlp_testcase(model):
+    model_name = model["name"]
+    result = {
+        "name": model_name,
+        "result": {},
+    }
+    d_url = model["download_url"]
+    checkpoint_n = d_url.split("/")[-1]
+    dataset_n = model["datasets"].split("/")[-1]
+    prepare_script = f"""
+    set -x
+    cd ../{model['relative_path']}
+    bash ci/prepare.sh
+    """
+
+    # add pip list info when in debug mode
+    if utils.is_debug():
+        pip_list_script = "pip list | grep -E 'numpy|transformer|igie|mmcv|onnx'\n"
+        prepare_script = pip_list_script + prepare_script + pip_list_script
+
+    run_script(prepare_script)
+
+    for prec in model["precisions"]:
+        logging.info(f"Start running {model_name} {prec} test case")
+        script = f"""
+        set -x
+        cd ../{model['relative_path']}
+        """
+        if model_name == "llama2-7b":
+            script = f"""
+            set -x
+            cd ../{model['relative_path']}
+            bash scripts/test_trtllm_llama2_7b_gpu1_build.sh
+            bash scripts/test_trtllm_llama2_7b_gpu1.sh
+            """
+        elif model_name == "llama2-13b":
+            script = f"""
+            set -x
+            cd ../{model['relative_path']}
+            export CUDA_VISIBLE_DEVICES=0,1
+            bash scripts/test_trtllm_llama2_13b_gpu2_build.sh
+            bash scripts/test_trtllm_llama2_13b_gpu2.sh
+            """
+        elif model_name == "llama2-70b":
+            script = f"""
+            set -x
+            cd ../{model['relative_path']}
+            export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
+            bash scripts/test_trtllm_llama2_70b_gpu8_build.sh
+            bash scripts/test_trtllm_llama2_70b_gpu8.sh
+            """
+        elif model_name == "qwen-7b":
+            script = f"""
+            set -x
+            cd ../{model['relative_path']}
+            export CUDA_VISIBLE_DEVICES=1
+            python3 offline_inference.py --model2path ./data/qwen-7B
+            """
+        elif model_name == "qwen1.5-7b":
+            script = f"""
+            set -x
+            cd ../{model['relative_path']}
+            export CUDA_VISIBLE_DEVICES=1
+            python3 offline_inference.py --model2path ./data/Qwen1.5-7B
+            """
+
+        r, t = run_script(script)
+        sout = r.stdout
+
+        pattern = METRIC_PATTERN
+        matchs = re.findall(pattern, sout)
+        result["result"].setdefault(prec, {"status": "FAIL"})
+        logging.debug(f"matchs:\n{matchs}")
+        for m in matchs:
+            result["result"][prec].update(get_metric_result(m))
+        if len(matchs) == 2:
+            result["result"][prec]["status"] = "PASS"
+
+        result["result"][prec]["Cost time (s)"] = t
+    return result
+
+def get_metric_result(str):
+    # The inference scripts print a Python dict literal; swap quotes so it
+    # parses as JSON (fragile if the generated text itself contains quotes).
+    if str:
+        return json.loads(str.replace("'", "\""))["metricResult"]
+    return None
+
+def run_script(script):
+    start_time = time.perf_counter()
+    result = subprocess.run(
+        script, shell=True, capture_output=True, text=True, executable="/bin/bash"
+    )
+    end_time = time.perf_counter()
+    execution_time = end_time - start_time
+    logging.debug(f"Command executed:\n{script}")
+    logging.debug("Execution time: {:.4f} s".format(execution_time))
+    logging.debug(f"stdout: {result.stdout}")
+    logging.debug(f"stderr: {result.stderr}")
+    logging.debug(f"Return code: {result.returncode}")
+    return result, execution_time
+
+if __name__ == "__main__":
+    main()
diff --git a/tests/run_vllm.py b/tests/run_vllm.py
new file mode 100644
index 0000000000000000000000000000000000000000..8bd3504db634ec4360e502cb27aababd5edace63
--- /dev/null
+++ b/tests/run_vllm.py
@@ -0,0 +1,264 @@
+# Copyright (c) 2024, Shanghai Iluvatar CoreX Semiconductor Co., Ltd.
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may
+# not use this file except in compliance with the License. You may obtain
+# a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
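+
+# Flow: look up the model entry in models_vllm.yaml, symlink its checkpoint
+# and run the case's ci/prepare.sh, then execute one inference script per
+# precision and scrape the printed {'metricResult': ...} dict from stdout
+# (METRIC_PATTERN) to decide PASS/FAIL and collect metrics.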
+
+def main():
+    parser = argparse.ArgumentParser(description="Run a vLLM model test case.")
+    parser.add_argument("--model", type=str, help="model name, e.g. qwen2-7b")
+    args = parser.parse_args()
+
+    if args.model:
+        test_model = args.model
+    else:
+        test_model = os.environ.get("TEST_CASE")
+    logging.info(f"Test case to run: {test_model}")
+    if not test_model:
+        logging.error("No test case specified; pass --model or set TEST_CASE")
+        sys.exit(-1)
+
+    model = get_model_config(test_model)
+    if not model:
+        logging.error("model config is empty")
+        sys.exit(-1)
+
+    result = {}
+    # NLP and multimodal models
+    if model["task_type"] in ["nlp/large_language_model", "multimodal/vision-language-understanding"]:
+        logging.info(f"Start running {model['name']} test case:\n{json.dumps(model, indent=4)}")
+        d_url = model["download_url"]
+        if d_url is not None:
+            result = run_nlp_testcase(model)
+        check_model_result(result)
+        logging.debug(f"The result of {model['name']} is\n{json.dumps(result, indent=4)}")
+        logging.info(f"End running {model['name']} test case.")
+
+    logging.info(f"Full text result: {result}")
+
+def get_model_config(model_name):
+    with open("models_vllm.yaml", "r") as file:
+        models = yaml.safe_load(file)
+
+    for model in models:
+        if model["name"] == model_name.lower():
+            return model
+    return None
+
+def check_model_result(result):
+    status = "PASS"
+    for prec in ["fp16", "int8"]:
+        if prec in result.get("result", {}):
+            if result["result"][prec]["status"] == "FAIL":
+                status = "FAIL"
+                break
+    result["status"] = status
+
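+# Shape of the result dict assembled below, shown with illustrative values
+# (the per-precision metrics come from get_metric_result):
+#
+#   {
+#       "name": "qwen2-7b",
+#       "result": {
+#           "fp16": {"status": "PASS", "tokens": 757, "QPS": 97.972, "Cost time (s)": 42.1}
+#       },
+#       "status": "PASS",
+#   }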
+def run_nlp_testcase(model):
+    model_name = model["name"]
+    result = {
+        "name": model_name,
+        "result": {},
+    }
+    d_url = model["download_url"]
+    checkpoint_n = d_url.split("/")[-1]
+    dataset_n = model["datasets"].split("/")[-1]
+    prepare_script = f"""
+    set -x
+    cd ../{model['relative_path']}
+    ln -s /mnt/deepspark/data/checkpoints/{checkpoint_n} ./{model_name}
+    bash ci/prepare.sh
+    """
+
+    # add pip list info when in debug mode
+    if utils.is_debug():
+        pip_list_script = "pip list | grep -E 'numpy|transformer|igie|mmcv|onnx'\n"
+        prepare_script = pip_list_script + prepare_script + pip_list_script
+
+    run_script(prepare_script)
+
+    for prec in model["precisions"]:
+        logging.info(f"Start running {model_name} {prec} test case")
+        script = f"""
+        set -x
+        cd ../{model['relative_path']}
+        """
+        if model_name == "baichuan2-7b":
+            script = f"""
+            set -x
+            cd ../{model['relative_path']}
+            python3 offline_inference.py --model ./baichuan2-7b/ --max-tokens 256 --trust-remote-code --chat_template template_baichuan.jinja --temperature 0.0
+            """
+            if prec == "int8":
+                script = f"""
+                set -x
+                cd ../{model['relative_path']}
+                python3 offline_inference.py --model ./baichuan2-7b/int8/ --chat_template template_baichuan.jinja --quantization w8a16 --max-num-seqs 1 --max-model-len 256 --trust-remote-code --temperature 0.0 --max-tokens 256
+                """
+        elif model_name == "chatglm3-6b":
+            script = f"""
+            set -x
+            cd ../{model['relative_path']}
+            python3 offline_inference.py --model ./chatglm3-6b --trust-remote-code --temperature 0.0 --max-tokens 256
+            """
+        elif model_name == "chatglm3-6b-32k":
+            script = f"""
+            set -x
+            cd ../{model['relative_path']}
+            python3 offline_inference.py --model ./chatglm3-6b-32k --trust-remote-code --temperature 0.0 --max-tokens 256
+            """
+        elif model_name == "llama2-7b":
+            script = f"""
+            set -x
+            cd ../{model['relative_path']}
+            python3 offline_inference.py --model ./llama2-7b --max-tokens 256 -tp 1 --temperature 0.0
+            """
+        elif model_name == "llama3-70b":
+            script = f"""
+            set -x
+            cd ../{model['relative_path']}
+            export CUDA_VISIBLE_DEVICES=0,1,2,3
+            python3 offline_inference.py --model ./llama3-70b --max-tokens 256 -tp 4 --temperature 0.0
+            """
+        elif model_name == "qwen-7b":
+            script = f"""
+            set -x
+            cd ../{model['relative_path']}
+            export CUDA_VISIBLE_DEVICES=0,1
+            python3 offline_inference.py --model ./qwen-7b --max-tokens 256 -tp 2 --trust-remote-code --temperature 0.0
+            """
+        elif model_name == "qwen1.5-7b":
+            script = f"""
+            set -x
+            cd ../{model['relative_path']}
+            python3 offline_inference.py --model ./qwen1.5-7b --max-tokens 256 -tp 1 --temperature 0.0 --max-model-len 3096
+            """
+        elif model_name == "qwen1.5-14b":
+            script = f"""
+            set -x
+            cd ../{model['relative_path']}
+            python3 offline_inference.py --model ./qwen1.5-14b --max-tokens 256 -tp 1 --temperature 0.0 --max-model-len 896
+            """
+        elif model_name == "qwen1.5-32b":
+            script = f"""
+            set -x
+            cd ../{model['relative_path']}
+            export CUDA_VISIBLE_DEVICES=0,1,2,3
+            python3 offline_inference.py --model ./qwen1.5-32b --max-tokens 256 -tp 4 --temperature 0.0
+            """
+        elif model_name == "qwen1.5-72b":
+            script = f"""
+            set -x
+            cd ../{model['relative_path']}
+            export CUDA_VISIBLE_DEVICES=0,1
+            python3 offline_inference.py --model ./qwen1.5-72b --max-tokens 256 -tp 2 --temperature 0.0 --max-model-len 3096
+            """
+        elif model_name == "qwen2-7b":
+            script = f"""
+            set -x
+            cd ../{model['relative_path']}
+            export CUDA_VISIBLE_DEVICES=0
+            python3 offline_inference.py --model ./qwen2-7b --max-tokens 256 -tp 1 --temperature 0.0
+            """
+        elif model_name == "qwen2-72b":
+            script = f"""
+            set -x
+            cd ../{model['relative_path']}
+            export CUDA_VISIBLE_DEVICES=0,1,2,3
+            python3 offline_inference.py --model ./qwen2-72b --max-tokens 256 -tp 4 --temperature 0.0 --gpu-memory-utilization 0.98 --max-model-len 32768
+            """
+        elif model_name == "stablelm":
+            script = f"""
+            set -x
+            cd ../{model['relative_path']}
+            export CUDA_VISIBLE_DEVICES=0,1
+            python3 offline_inference.py --model ./stablelm --max-tokens 256 -tp 1 --temperature 0.0
+            """
+        elif model_name == "minicpm-v-2":
+            # Note: ${{PATH}} keeps the shell variable out of the f-string;
+            # an unescaped ${PATH} would raise a NameError at runtime.
+            script = f"""
+            set -x
+            cd ../{model['relative_path']}
+            export PT_SDPA_ENABLE_HEAD_DIM_PADDING=1
+            export PATH=/usr/local/corex/bin:${{PATH}}
+            export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda/lib64
+            python3 minicpmv-2.0-offline.py --model-path ./minicpm-v-2 --image-path ./dog.jpg
+            """
+
+        r, t = run_script(script)
+        sout = r.stdout
+
+        pattern = METRIC_PATTERN
+        matches = re.findall(pattern, sout)
+        result["result"].setdefault(prec, {"status": "FAIL"})
+        logging.debug(f"matches:\n{matches}")
+        for m in matches:
+            result["result"][prec].update(get_metric_result(m))
+        # A case passes only when exactly one metric line is found in stdout.
+        if len(matches) == 1:
+            result["result"][prec]["status"] = "PASS"
+
+        result["result"][prec]["Cost time (s)"] = t
+    return result
+
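+# get_metric_result parses a single matched metric line back into a dict,
+# e.g. (illustrative): "{'metricResult': {'tokens': 757, 'QPS': 97.972}}"
+# -> {'tokens': 757, 'QPS': 97.972}. The single-to-double quote swap is what
+# makes the printed Python dict valid JSON.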
+def get_metric_result(text):
+    if text:
+        return json.loads(text.replace("'", '"'))["metricResult"]
+    return None
+
+def run_script(script):
+    start_time = time.perf_counter()
+    result = subprocess.run(
+        script, shell=True, capture_output=True, text=True, executable="/bin/bash"
+    )
+    end_time = time.perf_counter()
+    execution_time = end_time - start_time
+    logging.debug(f"Command:\n{script}")
+    logging.debug("Execution time: {:.4f} s".format(execution_time))
+    logging.debug(f"stdout: {result.stdout}")
+    logging.debug(f"stderr: {result.stderr}")
+    logging.debug(f"Return code: {result.returncode}")
+    return result, execution_time
+
+if __name__ == "__main__":
+    main()
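+
+# Usage sketch (assuming the runner is invoked from tests/, so that
+# models_vllm.yaml resolves and the `cd ../{relative_path}` steps work;
+# both invocations are equivalent):
+#
+#   python3 run_vllm.py --model qwen2-7b
+#   TEST_CASE=qwen2-7b python3 run_vllm.py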