From 9d8600380d0d63e8dc5fd2b06867b14a944e82cd Mon Sep 17 00:00:00 2001
From: horcam
Date: Sat, 2 Aug 2025 17:45:45 +0800
Subject: [PATCH] update infer doc and update vllm-ms models

---
 .../models_list/models_list.md               |  2 +-
 .../models_list/models_list.md               |  2 +-
 .../ms_infer/ms_infer_model_serving_infer.md | 44 ++++++++++++++-----
 3 files changed, 36 insertions(+), 12 deletions(-)

diff --git a/docs/vllm_mindspore/docs/source_en/user_guide/supported_models/models_list/models_list.md b/docs/vllm_mindspore/docs/source_en/user_guide/supported_models/models_list/models_list.md
index f7d634918a..ba825bcae5 100644
--- a/docs/vllm_mindspore/docs/source_en/user_guide/supported_models/models_list/models_list.md
+++ b/docs/vllm_mindspore/docs/source_en/user_guide/supported_models/models_list/models_list.md
@@ -10,7 +10,7 @@
 | Qwen2.5 | Supported | [Qwen2.5-0.5B-Instruct](https://huggingface.co/Qwen/Qwen2.5-0.5B-Instruct), [Qwen2.5-1.5B-Instruct](https://huggingface.co/Qwen/Qwen2.5-1.5B-Instruct), [Qwen2.5-3B-Instruct](https://huggingface.co/Qwen/Qwen2.5-3B-Instruct), [Qwen2.5-7B-Instruct](https://huggingface.co/Qwen/Qwen2.5-7B-Instruct), [Qwen2.5-14B-Instruct](https://huggingface.co/Qwen/Qwen2.5-14B-Instruct), [Qwen2.5-32B-Instruct](https://huggingface.co/Qwen/Qwen2.5-32B-Instruct), [Qwen2.5-72B-Instruct](https://huggingface.co/Qwen/Qwen2.5-72B-Instruct) |
 | Qwen3-32B | Supported | [Qwen3-32B](https://modelers.cn/models/MindSpore-Lab/Qwen3-32B) |
 | Qwen3-235B-A22B | Supported | [Qwen3-235B-A22B](https://huggingface.co/Qwen/Qwen3-235B-A22B) |
-| Qwen3, Qwen3-MOE | Testing | [Qwen3-0.6B](https://huggingface.co/Qwen/Qwen3-0.6B), [Qwen3-1.7B](https://huggingface.co/Qwen/Qwen3-1.7B), [Qwen3-4B](https://huggingface.co/Qwen/Qwen3-4B), [Qwen3-8B](https://huggingface.co/Qwen/Qwen3-8B), [Qwen3-14B](https://modelers.cn/models/MindSpore-Lab/Qwen3-14B), [Qwen3-32B](https://modelers.cn/models/MindSpore-Lab/Qwen3-32B), [Qwen3-30B-A3](https://huggingface.co/Qwen/Qwen3-30B-A3B) |
+| Qwen3, Qwen3-MOE | Testing | [Qwen3-0.6B](https://huggingface.co/Qwen/Qwen3-0.6B), [Qwen3-1.7B](https://huggingface.co/Qwen/Qwen3-1.7B), [Qwen3-4B](https://huggingface.co/Qwen/Qwen3-4B), [Qwen3-8B](https://huggingface.co/Qwen/Qwen3-8B), [Qwen3-14B](https://modelers.cn/models/MindSpore-Lab/Qwen3-14B), [Qwen3-30B-A3B](https://huggingface.co/Qwen/Qwen3-30B-A3B) |
 | Qwen2.5-VL | Testing | [Qwen2.5-VL-3B-Instruct](https://huggingface.co/Qwen/Qwen2.5-VL-3B-Instruct), [Qwen2.5-VL-7B-Instruct](https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct), [Qwen2.5-VL-32B-Instruct](https://huggingface.co/Qwen/Qwen2.5-VL-32B-Instruct), [Qwen2.5-VL-72B-Instruct](https://huggingface.co/Qwen/Qwen2.5-VL-72B-Instruct) |
 | QwQ-32B | Testing | [QwQ-32B](https://huggingface.co/Qwen/QwQ-32B) |
 | Llama3.1 | Testing | [Llama-3.1-8B-Instruct](https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct), [Llama-3.1-70B-Instruct](https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct), [Llama-3.1-405B-Instruct](https://huggingface.co/meta-llama/Llama-3.1-405B-Instruct) |
diff --git a/docs/vllm_mindspore/docs/source_zh_cn/user_guide/supported_models/models_list/models_list.md b/docs/vllm_mindspore/docs/source_zh_cn/user_guide/supported_models/models_list/models_list.md
index 67767d8bad..2e504c0fec 100644
--- a/docs/vllm_mindspore/docs/source_zh_cn/user_guide/supported_models/models_list/models_list.md
+++ b/docs/vllm_mindspore/docs/source_zh_cn/user_guide/supported_models/models_list/models_list.md
@@ -10,7 +10,7 @@
 | Qwen2.5 | Supported | [Qwen2.5-0.5B-Instruct](https://huggingface.co/Qwen/Qwen2.5-0.5B-Instruct), [Qwen2.5-1.5B-Instruct](https://huggingface.co/Qwen/Qwen2.5-1.5B-Instruct), [Qwen2.5-3B-Instruct](https://huggingface.co/Qwen/Qwen2.5-3B-Instruct), [Qwen2.5-7B-Instruct](https://huggingface.co/Qwen/Qwen2.5-7B-Instruct), [Qwen2.5-14B-Instruct](https://huggingface.co/Qwen/Qwen2.5-14B-Instruct), [Qwen2.5-32B-Instruct](https://huggingface.co/Qwen/Qwen2.5-32B-Instruct), [Qwen2.5-72B-Instruct](https://huggingface.co/Qwen/Qwen2.5-72B-Instruct) |
 | Qwen3-32B | Supported | [Qwen3-32B](https://modelers.cn/models/MindSpore-Lab/Qwen3-32B) |
 | Qwen3-235B-A22B | Supported | [Qwen3-235B-A22B](https://huggingface.co/Qwen/Qwen3-235B-A22B) |
-| Qwen3, Qwen3-MOE | Testing | [Qwen3-0.6B](https://huggingface.co/Qwen/Qwen3-0.6B), [Qwen3-1.7B](https://huggingface.co/Qwen/Qwen3-1.7B), [Qwen3-4B](https://huggingface.co/Qwen/Qwen3-4B), [Qwen3-8B](https://huggingface.co/Qwen/Qwen3-8B), [Qwen3-14B](https://modelers.cn/models/MindSpore-Lab/Qwen3-14B), [Qwen3-32B](https://modelers.cn/models/MindSpore-Lab/Qwen3-32B), [Qwen3-30B-A3](https://huggingface.co/Qwen/Qwen3-30B-A3B) |
+| Qwen3, Qwen3-MOE | Testing | [Qwen3-0.6B](https://huggingface.co/Qwen/Qwen3-0.6B), [Qwen3-1.7B](https://huggingface.co/Qwen/Qwen3-1.7B), [Qwen3-4B](https://huggingface.co/Qwen/Qwen3-4B), [Qwen3-8B](https://huggingface.co/Qwen/Qwen3-8B), [Qwen3-14B](https://modelers.cn/models/MindSpore-Lab/Qwen3-14B), [Qwen3-30B-A3B](https://huggingface.co/Qwen/Qwen3-30B-A3B) |
 | Qwen2.5-VL | Testing | [Qwen2.5-VL-3B-Instruct](https://huggingface.co/Qwen/Qwen2.5-VL-3B-Instruct), [Qwen2.5-VL-7B-Instruct](https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct), [Qwen2.5-VL-32B-Instruct](https://huggingface.co/Qwen/Qwen2.5-VL-32B-Instruct), [Qwen2.5-VL-72B-Instruct](https://huggingface.co/Qwen/Qwen2.5-VL-72B-Instruct) |
 | QwQ-32B | Testing | [QwQ-32B](https://huggingface.co/Qwen/QwQ-32B) |
 | Llama3.1 | Testing | [Llama-3.1-8B-Instruct](https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct), [Llama-3.1-70B-Instruct](https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct), [Llama-3.1-405B-Instruct](https://huggingface.co/meta-llama/Llama-3.1-405B-Instruct) |
diff --git a/tutorials/source_zh_cn/model_infer/ms_infer/ms_infer_model_serving_infer.md b/tutorials/source_zh_cn/model_infer/ms_infer/ms_infer_model_serving_infer.md
index abc2346400..41e18f0b3f 100644
--- a/tutorials/source_zh_cn/model_infer/ms_infer/ms_infer_model_serving_infer.md
+++ b/tutorials/source_zh_cn/model_infer/ms_infer/ms_infer_model_serving_infer.md
@@ -41,13 +41,13 @@ print(generate_text)
 
 ## Inference Tutorial
 
-MindSpore inference, combined with the vLLM community solution, provides users with full-stack, end-to-end inference serving capabilities. Through the vllm-mindspore adaptation layer, the serving capabilities of the vLLM community are seamlessly integrated with the MindSpore framework. For details, see the [vLLM MindSpore documentation](https://gitee.com/mindspore/docs/blob/master/docs/vllm_mindspore/docs/source_zh_cn/index.rst).
+MindSpore inference, combined with the vLLM community solution, provides users with full-stack, end-to-end inference serving capabilities. Through the vLLM MindSpore adaptation layer, the serving capabilities of the vLLM community are seamlessly integrated with the MindSpore framework. For details, see the [vLLM MindSpore documentation](https://www.mindspore.cn/vllm_mindspore/docs/zh-CN/master/index.html).
 
-This chapter briefly introduces the basic usage of vLLM + MindSpore service-based inference.
+This chapter briefly introduces the basic usage of vLLM MindSpore service-based inference.
 
 ### Environment Preparation
 
-The vllm-mindspore adaptation layer provides an environment installation script. Users can run the following commands to create a vllm-mindspore runtime environment:
+The vLLM MindSpore adaptation layer provides an environment installation script. Users can run the following commands to create a vLLM MindSpore runtime environment:
 
 ```shell
 # download vllm-mindspore code
 git clone https://gitee.com/mindspore/vllm-mindspore.git
 cd vllm-mindspore
@@ -69,7 +69,7 @@ bash install_depend_pkgs.sh
 python setup.py install
 ```
 
-After the vllm-mindspore runtime environment is created, the following dependency packages still need to be installed:
+After the vLLM MindSpore runtime environment is created, the following dependency packages still need to be installed:
 
 - **mindspore**: the MindSpore development framework, the foundation for running models.
 
@@ -92,13 +92,15 @@ git lfs install
 git clone https://huggingface.co/Qwen/Qwen2-7B-Instruct
 ```
 
+If `git lfs install` fails during the model pull, refer to the vLLM MindSpore FAQ for a solution.
+
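+As an alternative to pulling the weights with `git`, the same snapshot can be fetched from Python. The following is a minimal sketch, not part of the original instructions; it assumes the `huggingface_hub` package is installed and that the chosen local directory is the one later passed as `MODEL_ID`:
+
+```python
+from huggingface_hub import snapshot_download
+
+# Download every file of the model repository into a local directory.
+snapshot_download(
+    repo_id="Qwen/Qwen2-7B-Instruct",
+    local_dir="/path/to/model/Qwen2-7B-Instruct",
+)
+```
+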
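+Before moving on to launch the service, a quick import check can confirm that the environment fits together. This is a sketch under the assumption that the packages above installed successfully; note that `vllm_mindspore` is imported before `vllm`, which is the order the adaptation layer expects:
+
+```python
+import vllm_mindspore  # imported first so the MindSpore adaptation takes effect (assumed import order)
+import vllm
+import mindspore
+
+# Print the versions actually picked up by the environment.
+print("vllm:", vllm.__version__)
+print("mindspore:", mindspore.__version__)
+```
+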
 ### Launching the Service
 
 Before launching the backend service, set the corresponding environment variables according to the actual environment.
 
 ```shell
 # set Ascend CANN tools envs
-/usr/local/Ascend/ascend-toolkit/set_env.sh
+source /usr/local/Ascend/ascend-toolkit/set_env.sh
 export ASCEND_CUSTOM_PATH=${ASCEND_HOME_PATH}/../
 export ASCEND_RT_VISIBLE_DEVICES=3
 export ASCEND_TOTAL_MEMORY_GB=32
@@ -114,12 +116,13 @@ export VLLM_MODEL_MEMORY_USE_GB=26
 export VLLM_MASTER_IP=127.0.0.1
 export VLLM_RPC_PORT=12390
 export VLLM_HTTP_PORT=8080
+unset vLLM_MODEL_BACKEND
 
 # model envs
-export MODEL_ID="/path/to/model"
+export MODEL_ID="/path/to/model/Qwen2-7B-Instruct"
 ```
 
-Run the following command to start the vllm-mindspore service backend:
+Run the following command to start the vLLM MindSpore service backend:
 
 ```shell
 vllm-mindspore serve --model=${MODEL_ID} --port=${VLLM_HTTP_PORT} --trust_remote_code --max-num-seqs=256 --max_model_len=32768 --max-num-batched-tokens=4096 --block_size=128 --gpu-memory-utilization=0.9 --tensor-parallel-size 1 --data-parallel-size 1 --data-parallel-size-local 1 --data-parallel-start-rank 0 --data-parallel-address ${VLLM_MASTER_IP} --data-parallel-rpc-port ${VLLM_RPC_PORT} &> vllm-mindspore.log &
 ```
@@ -132,11 +135,32 @@
 Users can run model inference by sending HTTP requests to the service, for example with the following command:
 
 ```shell
-curl http://${VLLM_MASTER_IP}:${VLLM_HTTP_PORT}/v1/completions -H "Content-Type: application.json" -d "{\"model\": \"${MODEL_ID}\", \"prompt\": \"I love Beijing, because\", \"max_tokens\": 128, \"temperature\": 1.0, \"top_p\": 1.0, \"top_k\": 1, \"repetition_penalty\": 1.0}"
+curl http://${VLLM_MASTER_IP}:${VLLM_HTTP_PORT}/v1/completions -H "Content-Type: application/json" -d "{\"model\": \"${MODEL_ID}\", \"prompt\": \"I love Beijing, because\", \"max_tokens\": 128, \"temperature\": 1.0, \"top_p\": 1.0, \"top_k\": 1, \"repetition_penalty\": 1.0}"
 ```
 
 After the service backend receives the inference request and finishes computing, it returns a result like the following:
 
-```shell
-To be added
+```json
+{
+    "id":"cmpl-1c30caf453154b5ab4a579b7b06cea19",
+    "object":"text_completion",
+    "created":1754103773,
+    "model":"/path/to/model/Qwen2-7B-Instruct",
+    "choices":[
+        {
+            "index":0,
+            "text":" it is a city with a long history and rich culture. I have been to many places of interest in Beijing, such as the Great Wall, the Forbidden City, the Summer Palace, and the Temple of Heaven. I also visited the National Museum of China, where I learned a lot about Chinese history and culture. The food in Beijing is also amazing, especially the Peking duck and the dumplings. I enjoyed trying different types of local cuisine and experiencing the unique flavors of Beijing. The people in Beijing are friendly and welcoming, and they are always willing to help tourists. I had a great time exploring the city and interacting with the locals",
+            "logprobs":null,
+            "finish_reason":"length",
+            "stop_reason":null,
+            "prompt_logprobs":null
+        }
+    ],
+    "usage":{
+        "prompt_tokens":5,
+        "total_tokens":133,
+        "completion_tokens":128,
+        "prompt_tokens_details":null
+    }
+}
 ```
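+Besides `curl`, any HTTP client can drive the same OpenAI-compatible completions endpoint. The sketch below uses the third-party `requests` package (an extra assumption, not a dependency listed above); the `model` field must match the `MODEL_ID` passed to `vllm-mindspore serve`:
+
+```python
+import requests
+
+# Same payload as the curl example; host and port follow the env vars set earlier.
+response = requests.post(
+    "http://127.0.0.1:8080/v1/completions",
+    json={
+        "model": "/path/to/model/Qwen2-7B-Instruct",
+        "prompt": "I love Beijing, because",
+        "max_tokens": 128,
+        "temperature": 1.0,
+        "top_p": 1.0,
+        "top_k": 1,
+        "repetition_penalty": 1.0,
+    },
+    timeout=600,
+)
+response.raise_for_status()
+print(response.json()["choices"][0]["text"])
+```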