From 9d8600380d0d63e8dc5fd2b06867b14a944e82cd Mon Sep 17 00:00:00 2001
From: horcam
Date: Sat, 2 Aug 2025 17:45:45 +0800
Subject: [PATCH] update infer doc and update vllm-ms models

---
 .../models_list/models_list.md               |  2 +-
 .../models_list/models_list.md               |  2 +-
 .../ms_infer/ms_infer_model_serving_infer.md | 44 ++++++++++++++-----
 3 files changed, 36 insertions(+), 12 deletions(-)

diff --git a/docs/vllm_mindspore/docs/source_en/user_guide/supported_models/models_list/models_list.md b/docs/vllm_mindspore/docs/source_en/user_guide/supported_models/models_list/models_list.md
index f7d634918a..ba825bcae5 100644
--- a/docs/vllm_mindspore/docs/source_en/user_guide/supported_models/models_list/models_list.md
+++ b/docs/vllm_mindspore/docs/source_en/user_guide/supported_models/models_list/models_list.md
@@ -10,7 +10,7 @@
 | Qwen2.5 | Supported | [Qwen2.5-0.5B-Instruct](https://huggingface.co/Qwen/Qwen2.5-0.5B-Instruct), [Qwen2.5-1.5B-Instruct](https://huggingface.co/Qwen/Qwen2.5-1.5B-Instruct), [Qwen2.5-3B-Instruct](https://huggingface.co/Qwen/Qwen2.5-3B-Instruct), [Qwen2.5-7B-Instruct](https://huggingface.co/Qwen/Qwen2.5-7B-Instruct), [Qwen2.5-14B-Instruct](https://huggingface.co/Qwen/Qwen2.5-14B-Instruct), [Qwen2.5-32B-Instruct](https://huggingface.co/Qwen/Qwen2.5-32B-Instruct), [Qwen2.5-72B-Instruct](https://huggingface.co/Qwen/Qwen2.5-72B-Instruct) |
 | Qwen3-32B | Supported | [Qwen3-32B](https://modelers.cn/models/MindSpore-Lab/Qwen3-32B) |
 | Qwen3-235B-A22B | Supported | [Qwen3-235B-A22B](https://huggingface.co/Qwen/Qwen3-235B-A22B) |
-| Qwen3, Qwen3-MOE | Testing | [Qwen3-0.6B](https://huggingface.co/Qwen/Qwen3-0.6B), [Qwen3-1.7B](https://huggingface.co/Qwen/Qwen3-1.7B), [Qwen3-4B](https://huggingface.co/Qwen/Qwen3-4B), [Qwen3-8B](https://huggingface.co/Qwen/Qwen3-8B), [Qwen3-14B](https://modelers.cn/models/MindSpore-Lab/Qwen3-14B), [Qwen3-32B](https://modelers.cn/models/MindSpore-Lab/Qwen3-32B), [Qwen3-30B-A3](https://huggingface.co/Qwen/Qwen3-30B-A3B) |
+| Qwen3, Qwen3-MOE | Testing | [Qwen3-0.6B](https://huggingface.co/Qwen/Qwen3-0.6B), [Qwen3-1.7B](https://huggingface.co/Qwen/Qwen3-1.7B), [Qwen3-4B](https://huggingface.co/Qwen/Qwen3-4B), [Qwen3-8B](https://huggingface.co/Qwen/Qwen3-8B), [Qwen3-14B](https://modelers.cn/models/MindSpore-Lab/Qwen3-14B), [Qwen3-30B-A3B](https://huggingface.co/Qwen/Qwen3-30B-A3B) |
 | Qwen2.5-VL | Testing | [Qwen2.5-VL-3B-Instruct](https://huggingface.co/Qwen/Qwen2.5-VL-3B-Instruct), [Qwen2.5-VL-7B-Instruct](https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct), [Qwen2.5-VL-32B-Instruct](https://huggingface.co/Qwen/Qwen2.5-VL-32B-Instruct), [Qwen2.5-VL-72B-Instruct](https://huggingface.co/Qwen/Qwen2.5-VL-72B-Instruct) |
 | QwQ-32B | Testing | [QwQ-32B](https://huggingface.co/Qwen/QwQ-32B) |
 | Llama3.1 | Testing | [Llama-3.1-8B-Instruct](https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct), [Llama-3.1-70B-Instruct](https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct), [Llama-3.1-405B-Instruct](https://huggingface.co/meta-llama/Llama-3.1-405B-Instruct) |
diff --git a/docs/vllm_mindspore/docs/source_zh_cn/user_guide/supported_models/models_list/models_list.md b/docs/vllm_mindspore/docs/source_zh_cn/user_guide/supported_models/models_list/models_list.md
index 67767d8bad..2e504c0fec 100644
--- a/docs/vllm_mindspore/docs/source_zh_cn/user_guide/supported_models/models_list/models_list.md
+++ b/docs/vllm_mindspore/docs/source_zh_cn/user_guide/supported_models/models_list/models_list.md
@@ -10,7 +10,7 @@
 | Qwen2.5 | Supported | [Qwen2.5-0.5B-Instruct](https://huggingface.co/Qwen/Qwen2.5-0.5B-Instruct), [Qwen2.5-1.5B-Instruct](https://huggingface.co/Qwen/Qwen2.5-1.5B-Instruct), [Qwen2.5-3B-Instruct](https://huggingface.co/Qwen/Qwen2.5-3B-Instruct), [Qwen2.5-7B-Instruct](https://huggingface.co/Qwen/Qwen2.5-7B-Instruct), [Qwen2.5-14B-Instruct](https://huggingface.co/Qwen/Qwen2.5-14B-Instruct), [Qwen2.5-32B-Instruct](https://huggingface.co/Qwen/Qwen2.5-32B-Instruct), [Qwen2.5-72B-Instruct](https://huggingface.co/Qwen/Qwen2.5-72B-Instruct) |
 | Qwen3-32B | Supported | [Qwen3-32B](https://modelers.cn/models/MindSpore-Lab/Qwen3-32B) |
 | Qwen3-235B-A22B | Supported | [Qwen3-235B-A22B](https://huggingface.co/Qwen/Qwen3-235B-A22B) |
-| Qwen3, Qwen3-MOE | Testing | [Qwen3-0.6B](https://huggingface.co/Qwen/Qwen3-0.6B), [Qwen3-1.7B](https://huggingface.co/Qwen/Qwen3-1.7B), [Qwen3-4B](https://huggingface.co/Qwen/Qwen3-4B), [Qwen3-8B](https://huggingface.co/Qwen/Qwen3-8B), [Qwen3-14B](https://modelers.cn/models/MindSpore-Lab/Qwen3-14B), [Qwen3-32B](https://modelers.cn/models/MindSpore-Lab/Qwen3-32B), [Qwen3-30B-A3](https://huggingface.co/Qwen/Qwen3-30B-A3B) |
+| Qwen3, Qwen3-MOE | Testing | [Qwen3-0.6B](https://huggingface.co/Qwen/Qwen3-0.6B), [Qwen3-1.7B](https://huggingface.co/Qwen/Qwen3-1.7B), [Qwen3-4B](https://huggingface.co/Qwen/Qwen3-4B), [Qwen3-8B](https://huggingface.co/Qwen/Qwen3-8B), [Qwen3-14B](https://modelers.cn/models/MindSpore-Lab/Qwen3-14B), [Qwen3-30B-A3B](https://huggingface.co/Qwen/Qwen3-30B-A3B) |
 | Qwen2.5-VL | Testing | [Qwen2.5-VL-3B-Instruct](https://huggingface.co/Qwen/Qwen2.5-VL-3B-Instruct), [Qwen2.5-VL-7B-Instruct](https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct), [Qwen2.5-VL-32B-Instruct](https://huggingface.co/Qwen/Qwen2.5-VL-32B-Instruct), [Qwen2.5-VL-72B-Instruct](https://huggingface.co/Qwen/Qwen2.5-VL-72B-Instruct) |
 | QwQ-32B | Testing | [QwQ-32B](https://huggingface.co/Qwen/QwQ-32B) |
 | Llama3.1 | Testing | [Llama-3.1-8B-Instruct](https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct), [Llama-3.1-70B-Instruct](https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct), [Llama-3.1-405B-Instruct](https://huggingface.co/meta-llama/Llama-3.1-405B-Instruct) |
diff --git a/tutorials/source_zh_cn/model_infer/ms_infer/ms_infer_model_serving_infer.md b/tutorials/source_zh_cn/model_infer/ms_infer/ms_infer_model_serving_infer.md
index abc2346400..41e18f0b3f 100644
--- a/tutorials/source_zh_cn/model_infer/ms_infer/ms_infer_model_serving_infer.md
+++ b/tutorials/source_zh_cn/model_infer/ms_infer/ms_infer_model_serving_infer.md
@@ -41,13 +41,13 @@ print(generate_text)
 
 ## Inference Tutorial
 
-MindSpore inference, combined with the vLLM community solution, provides users with full-stack, end-to-end inference serving capabilities. Through the vllm-mindspore adaptation layer, the serving capabilities of the vLLM community are seamlessly integrated with the MindSpore framework. For details, see the [vLLM MindSpore documentation](https://gitee.com/mindspore/docs/blob/master/docs/vllm_mindspore/docs/source_zh_cn/index.rst).
+MindSpore inference, combined with the vLLM community solution, provides users with full-stack, end-to-end inference serving capabilities. Through the vLLM MindSpore adaptation layer, the serving capabilities of the vLLM community are seamlessly integrated with the MindSpore framework. For details, see the [vLLM MindSpore documentation](https://www.mindspore.cn/vllm_mindspore/docs/zh-CN/master/index.html).
 
-This chapter briefly introduces the basic usage of vLLM + MindSpore service-based inference.
+This chapter briefly introduces the basic usage of vLLM MindSpore service-based inference.
 
 ### Environment Preparation
 
-The vllm-mindspore adaptation layer provides an environment installation script. Users can run the following commands to create a vllm-mindspore runtime environment:
+The vLLM MindSpore adaptation layer provides an environment installation script. Users can run the following commands to create a vLLM MindSpore runtime environment:
 
 ```shell
 # download vllm-mindspore code
 git clone https://gitee.com/mindspore/vllm-mindspore.git
 cd vllm-mindspore
@@ -69,7 +69,7 @@ bash install_depend_pkgs.sh
 python setup.py install
 ```
 
-After the vllm-mindspore runtime environment is created, the following dependency packages still need to be installed:
+After the vLLM MindSpore runtime environment is created, the following dependency packages still need to be installed:
 
 - **mindspore**: the MindSpore development framework, the foundation for running models.
 
@@ -92,13 +92,15 @@ git lfs install
 git clone https://huggingface.co/Qwen/Qwen2-7B-Instruct
 ```
 
+If `git lfs install` fails during the model pull, refer to the vLLM MindSpore FAQ for a solution.
+
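+As an alternative to pulling the weights with `git`, the same snapshot can be fetched from Python. The following is a minimal sketch, not part of the original instructions; it assumes the `huggingface_hub` package is installed and that the chosen local directory is the one later passed as `MODEL_ID`:
+
+```python
+from huggingface_hub import snapshot_download
+
+# Download every file of the model repository into a local directory.
+snapshot_download(
+    repo_id="Qwen/Qwen2-7B-Instruct",
+    local_dir="/path/to/model/Qwen2-7B-Instruct",
+)
+```
+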
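+Before moving on to launch the service, a quick import check can confirm that the environment fits together. This is a sketch under the assumption that the packages above installed successfully; note that `vllm_mindspore` is imported before `vllm`, which is the order the adaptation layer expects:
+
+```python
+import vllm_mindspore  # imported first so the MindSpore adaptation takes effect (assumed import order)
+import vllm
+import mindspore
+
+# Print the versions actually picked up by the environment.
+print("vllm:", vllm.__version__)
+print("mindspore:", mindspore.__version__)
+```
+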
 ### Launching the Service
 
 Before launching the backend service, set the corresponding environment variables according to the actual environment.
 
 ```shell
 # set Ascend CANN tools envs
-/usr/local/Ascend/ascend-toolkit/set_env.sh
+source /usr/local/Ascend/ascend-toolkit/set_env.sh
 export ASCEND_CUSTOM_PATH=${ASCEND_HOME_PATH}/../
 export ASCEND_RT_VISIBLE_DEVICES=3
 export ASCEND_TOTAL_MEMORY_GB=32
@@ -114,12 +116,13 @@ export VLLM_MODEL_MEMORY_USE_GB=26
 export VLLM_MASTER_IP=127.0.0.1
 export VLLM_RPC_PORT=12390
 export VLLM_HTTP_PORT=8080
+unset vLLM_MODEL_BACKEND
 
 # model envs
-export MODEL_ID="/path/to/model"
+export MODEL_ID="/path/to/model/Qwen2-7B-Instruct"
 ```
 
-Run the following command to start the vllm-mindspore service backend:
+Run the following command to start the vLLM MindSpore service backend:
 
 ```shell
 vllm-mindspore serve --model=${MODEL_ID} --port=${VLLM_HTTP_PORT} --trust_remote_code --max-num-seqs=256 --max_model_len=32768 --max-num-batched-tokens=4096 --block_size=128 --gpu-memory-utilization=0.9 --tensor-parallel-size 1 --data-parallel-size 1 --data-parallel-size-local 1 --data-parallel-start-rank 0 --data-parallel-address ${VLLM_MASTER_IP} --data-parallel-rpc-port ${VLLM_RPC_PORT} &> vllm-mindspore.log &
 ```
@@ -132,11 +135,32 @@
 Users can run model inference by sending HTTP requests to the service, for example with the following command:
 
 ```shell
-curl http://${VLLM_MASTER_IP}:${VLLM_HTTP_PORT}/v1/completions -H "Content-Type: application.json" -d "{\"model\": \"${MODEL_ID}\", \"prompt\": \"I love Beijing, because\", \"max_tokens\": 128, \"temperature\": 1.0, \"top_p\": 1.0, \"top_k\": 1, \"repetition_penalty\": 1.0}"
+curl http://${VLLM_MASTER_IP}:${VLLM_HTTP_PORT}/v1/completions -H "Content-Type: application/json" -d "{\"model\": \"${MODEL_ID}\", \"prompt\": \"I love Beijing, because\", \"max_tokens\": 128, \"temperature\": 1.0, \"top_p\": 1.0, \"top_k\": 1, \"repetition_penalty\": 1.0}"
 ```
 
 After the service backend receives the inference request and finishes computing, it returns a result like the following:
 
-```shell
-To be added
+```json
+{
+    "id":"cmpl-1c30caf453154b5ab4a579b7b06cea19",
+    "object":"text_completion",
+    "created":1754103773,
+    "model":"/path/to/model/Qwen2-7B-Instruct",
+    "choices":[
+        {
+            "index":0,
+            "text":" it is a city with a long history and rich culture. I have been to many places of interest in Beijing, such as the Great Wall, the Forbidden City, the Summer Palace, and the Temple of Heaven. I also visited the National Museum of China, where I learned a lot about Chinese history and culture. The food in Beijing is also amazing, especially the Peking duck and the dumplings. I enjoyed trying different types of local cuisine and experiencing the unique flavors of Beijing. The people in Beijing are friendly and welcoming, and they are always willing to help tourists. I had a great time exploring the city and interacting with the locals",
+            "logprobs":null,
+            "finish_reason":"length",
+            "stop_reason":null,
+            "prompt_logprobs":null
+        }
+    ],
+    "usage":{
+        "prompt_tokens":5,
+        "total_tokens":133,
+        "completion_tokens":128,
+        "prompt_tokens_details":null
+    }
+}
 ```
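+Besides `curl`, any HTTP client can drive the same OpenAI-compatible completions endpoint. The sketch below uses the third-party `requests` package (an extra assumption, not a dependency listed above); the `model` field must match the `MODEL_ID` passed to `vllm-mindspore serve`:
+
+```python
+import requests
+
+# Same payload as the curl example; host and port follow the env vars set earlier.
+response = requests.post(
+    "http://127.0.0.1:8080/v1/completions",
+    json={
+        "model": "/path/to/model/Qwen2-7B-Instruct",
+        "prompt": "I love Beijing, because",
+        "max_tokens": 128,
+        "temperature": 1.0,
+        "top_p": 1.0,
+        "top_k": 1,
+        "repetition_penalty": 1.0,
+    },
+    timeout=600,
+)
+response.raise_for_status()
+print(response.json()["choices"][0]["text"])
+```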