From c0d51af33c49cda58e0721313e675e540be0bac5 Mon Sep 17 00:00:00 2001
From: xuzhen
Date: Wed, 20 Aug 2025 11:09:18 +0800
Subject: [PATCH] Rename vLLM MindSpore to vLLM-MindSpore Plugin across docs

---
 .../qwen2.5_7b_singleNPU/qwen2.5_7b_singleNPU.md |  2 +-
 docs/vllm_mindspore/docs/source_zh_cn/conf.py    |  4 ++--
 .../qwen2.5_7b_singleNPU/qwen2.5_7b_singleNPU.md |  2 +-
 tools/generate_html/base_version.json            |  4 ++--
 tutorials/source_en/model_infer/introduction.md  |  2 +-
 .../ms_infer/ms_infer_model_serving_infer.md     | 16 ++++++++--------
 .../source_zh_cn/model_infer/introduction.md     |  2 +-
 .../ms_infer/ms_infer_model_serving_infer.md     | 14 +++++++-------
 8 files changed, 23 insertions(+), 23 deletions(-)

diff --git a/docs/vllm_mindspore/docs/source_en/getting_started/tutorials/qwen2.5_7b_singleNPU/qwen2.5_7b_singleNPU.md b/docs/vllm_mindspore/docs/source_en/getting_started/tutorials/qwen2.5_7b_singleNPU/qwen2.5_7b_singleNPU.md
index e4b2167fbc..ce7e679dcf 100644
--- a/docs/vllm_mindspore/docs/source_en/getting_started/tutorials/qwen2.5_7b_singleNPU/qwen2.5_7b_singleNPU.md
+++ b/docs/vllm_mindspore/docs/source_en/getting_started/tutorials/qwen2.5_7b_singleNPU/qwen2.5_7b_singleNPU.md
@@ -145,7 +145,7 @@ export ASCEND_RT_VISIBLE_DEVICES=$NPU_VISIBE_DEVICES
 
 ## Offline Inference
 
-Taking [Qwen2.5-7B](https://huggingface.co/Qwen/Qwen2.5-7B-Instruct) as an example, user can perform offline inference with the following Python code:
+After setting up the vLLM-MindSpore Plugin environment, users can use the following Python code to perform offline inference on the model:
 
 ```python
 import vllm_mindspore # Add this line on the top of script.
diff --git a/docs/vllm_mindspore/docs/source_zh_cn/conf.py b/docs/vllm_mindspore/docs/source_zh_cn/conf.py
index 2ccb114608..ad6bd3bdd0 100644
--- a/docs/vllm_mindspore/docs/source_zh_cn/conf.py
+++ b/docs/vllm_mindspore/docs/source_zh_cn/conf.py
@@ -23,9 +23,9 @@ from sphinx.ext import autodoc as sphinx_autodoc
 
 # -- Project information -----------------------------------------------------
 
-project = 'vLLM-MindSpore插件'
+project = 'vLLM-MindSpore Plugin'
 copyright = 'MindSpore'
-author = 'vLLM-MindSpore插件'
+author = 'vLLM-MindSpore Plugin'
 
 # The full version, including alpha/beta/rc tags
 release = 'master'
diff --git a/docs/vllm_mindspore/docs/source_zh_cn/getting_started/tutorials/qwen2.5_7b_singleNPU/qwen2.5_7b_singleNPU.md b/docs/vllm_mindspore/docs/source_zh_cn/getting_started/tutorials/qwen2.5_7b_singleNPU/qwen2.5_7b_singleNPU.md
index c7ec37a452..4302202913 100644
--- a/docs/vllm_mindspore/docs/source_zh_cn/getting_started/tutorials/qwen2.5_7b_singleNPU/qwen2.5_7b_singleNPU.md
+++ b/docs/vllm_mindspore/docs/source_zh_cn/getting_started/tutorials/qwen2.5_7b_singleNPU/qwen2.5_7b_singleNPU.md
@@ -146,7 +146,7 @@ export ASCEND_RT_VISIBLE_DEVICES=$NPU_VISIBE_DEVICES
 
 ## 离线推理
 
-vllm MindSprore环境搭建之后,用户可以使用如下Python代码,进行模型的离线推理:
+vLLM-MindSpore插件环境搭建之后,用户可以使用如下Python代码,进行模型的离线推理:
 
 ```python
 import vllm_mindspore # Add this line on the top of script.
diff --git a/tools/generate_html/base_version.json b/tools/generate_html/base_version.json
index 75e33126d1..874b652657 100644
--- a/tools/generate_html/base_version.json
+++ b/tools/generate_html/base_version.json
@@ -300,8 +300,8 @@
     {
         "version": "master",
         "label": {
-            "zh": "vLLM MindSpore",
-            "en": "vLLM MindSpore"
+            "zh": "vLLM-MindSpore Plugin",
+            "en": "vLLM-MindSpore Plugin"
         },
         "repo_name": "vllm_mindspore",
         "theme": "theme-docs"
diff --git a/tutorials/source_en/model_infer/introduction.md b/tutorials/source_en/model_infer/introduction.md
index f02a2a2cc7..42bb5ba9d6 100644
--- a/tutorials/source_en/model_infer/introduction.md
+++ b/tutorials/source_en/model_infer/introduction.md
@@ -62,7 +62,7 @@ The following figure shows the key technology stack of MindSpore inference.
 
 - **Inference with a framework**: In scenarios with abundant computing resources, only Python APIs are provided. You need to use Python scripts to build models and perform inference. Service-oriented components are not mandatory.
 
-    - **vLLM&vLLM-MindSpore**: The service-oriented capability of the inference solution with a framework is provided. The popular vLLM service-oriented inference capability in the open-source community is used to seamlessly connect the service-oriented capability of the community to the MindSpore inference ecosystem.
+    - **vLLM & vLLM-MindSpore Plugin**: The service-oriented capability of the inference solution with a framework is provided. The popular vLLM service-oriented inference capability in the open-source community is used to seamlessly connect the service-oriented capability of the community to the MindSpore inference ecosystem.
 
 - **Python API**: MindSpore provides Python APIs, including mint operator APIs (consistent with PyTorch semantics), nn APIs, and parallel APIs.
 
diff --git a/tutorials/source_en/model_infer/ms_infer/ms_infer_model_serving_infer.md b/tutorials/source_en/model_infer/ms_infer/ms_infer_model_serving_infer.md
index 7fef1b3ad2..50fe80c9da 100644
--- a/tutorials/source_en/model_infer/ms_infer/ms_infer_model_serving_infer.md
+++ b/tutorials/source_en/model_infer/ms_infer/ms_infer_model_serving_infer.md
@@ -41,13 +41,13 @@ As an efficient service-oriented model inference backend, it should provide the
 
 ## Inference Tutorial
 
-MindSpore inference works with the vLLM community solution to provide users with full-stack end-to-end inference service capabilities. The vLLM MindSpore adaptation layer implements seamless interconnection of the vLLM community service capabilities in the MindSpore framework. For details, see [vLLM MindSpore](https://www.mindspore.cn/vllm_mindspore/docs/en/master/index.html).
+MindSpore inference works with the vLLM community solution to provide users with full-stack end-to-end inference service capabilities. The vLLM-MindSpore Plugin adaptation layer implements seamless interconnection of the vLLM community service capabilities in the MindSpore framework. For details, see [vLLM-MindSpore Plugin](https://www.mindspore.cn/vllm_mindspore/docs/en/master/index.html).
 
-This section describes the basic usage of vLLM MindSpore service-oriented inference.
+This section describes the basic usage of vLLM-MindSpore Plugin service-oriented inference.
 
 ### Setting Up the Environment
 
-The vLLM MindSpore adaptation layer provides an environment installation script. You can run the following commands to create a vLLM MindSpore operating environment:
+The vLLM-MindSpore Plugin adaptation layer provides an environment installation script. You can run the following commands to create a vLLM-MindSpore Plugin operating environment:
 
 ```shell
 # download vllm-mindspore code
@@ -69,11 +69,11 @@ bash install_depend_pkgs.sh
 python setup.py install
 ```
 
-After the vLLM MindSpore operating environment is created, you need to install the following dependency packages:
+After the vLLM-MindSpore Plugin operating environment is created, you need to install the following dependency packages:
 
 - **mindspore**: MindSpore development framework, which is the basis for model running.
 
-- **vLLM**: vLLM service software.
+- **vllm**: vLLM service software.
 
 - **vllm-mindspore**: vLLM extension that adapts to the MindSpore framework. It is required for running MindSpore models.
 
@@ -85,14 +85,14 @@ After the vLLM MindSpore operating environment is created, you need to install t
 
 ### Preparing a Model
 
-The service-oriented vLLM MindSpore supports the direct running of the native Hugging Face model. Therefore, you can directly download the model from the Hugging Face official website. The following uses the Qwen2-7B-Instruct model as an example:
+The service-oriented vLLM-MindSpore Plugin supports the direct running of the native Hugging Face model. Therefore, you can directly download the model from the Hugging Face official website. The following uses the Qwen2-7B-Instruct model as an example:
 
 ```shell
 git lfs install
 git clone https://huggingface.co/Qwen/Qwen2-7B-Instruct
 ```
 
-If `git lfs install` fails during the pull process, refer to the vLLM MindSpore FAQ for a solution.
+If `git lfs install` fails during the pull process, refer to the vLLM-MindSpore Plugin [FAQ](https://www.mindspore.cn/vllm_mindspore/docs/en/master/faqs/faqs.html) for a solution.
 
 ### Starting a Service
 
@@ -122,7 +122,7 @@ unset vLLM_MODEL_BACKEND
 export MODEL_ID="/path/to/model/Qwen2-7B-Instruct"
 ```
 
-Run the following command to start the vLLM MindSpore service backend:
+Run the following command to start the vLLM-MindSpore Plugin service backend:
 
 ```shell
 vllm-mindspore serve --model=${MODEL_ID} --port=${VLLM_HTTP_PORT} --trust_remote_code --max-num-seqs=256 --max_model_len=32768 --max-num-batched-tokens=4096 --block_size=128 --gpu-memory-utilization=0.9 --tensor-parallel-size 1 --data-parallel-size 1 --data-parallel-size-local 1 --data-parallel-start-rank 0 --data-parallel-address ${VLLM_MASTER_IP} --data-parallel-rpc-port ${VLLM_RPC_PORT} &> vllm-mindspore.log &
diff --git a/tutorials/source_zh_cn/model_infer/introduction.md b/tutorials/source_zh_cn/model_infer/introduction.md
index 5d765395ac..e02b588388 100644
--- a/tutorials/source_zh_cn/model_infer/introduction.md
+++ b/tutorials/source_zh_cn/model_infer/introduction.md
@@ -62,7 +62,7 @@ MindSpore框架提供多种模型推理方式,以方便用户在面对不同
 
 - **带框架推理**:面向丰富计算资源场景,只提供Python API接口,用户需要通过Python脚本构建模型并推理,其中服务化组件不是必备的。
 
-    - **vLLM&vLLM-MindSpore**:提供带框架推理方案上的服务化能力,使用当前开源社区热门的vLLM推理服务化能力,实现社区的服务化能力无缝衔接到MindSpore推理生态。
+    - **vLLM & vLLM-MindSpore插件**:提供带框架推理方案上的服务化能力,使用当前开源社区热门的vLLM推理服务化能力,实现社区的服务化能力无缝衔接到MindSpore推理生态。
 
 - **Python API**:MindSpore框架提供Python API接口,其中包括mint算子接口(和PyTorch语义一致)、nn接口、parallel接口等。
 
diff --git a/tutorials/source_zh_cn/model_infer/ms_infer/ms_infer_model_serving_infer.md b/tutorials/source_zh_cn/model_infer/ms_infer/ms_infer_model_serving_infer.md
index 41e18f0b3f..7585a8f3fe 100644
--- a/tutorials/source_zh_cn/model_infer/ms_infer/ms_infer_model_serving_infer.md
+++ b/tutorials/source_zh_cn/model_infer/ms_infer/ms_infer_model_serving_infer.md
@@ -41,13 +41,13 @@ print(generate_text)
 
 ## 推理教程
 
-MindSpore推理结合vLLM社区方案,为用户提供了全栈端到端的推理服务化能力,通过vLLM MindSpore适配层,实现vLLM社区的服务化能力在MindSpore框架下的无缝对接,具体可以参考[vLLM MindSpore文档](https://www.mindspore.cn/vllm_mindspore/docs/zh-CN/master/index.html)。
+MindSpore推理结合vLLM社区方案,为用户提供了全栈端到端的推理服务化能力,通过vLLM-MindSpore插件适配层,实现vLLM社区的服务化能力在MindSpore框架下的无缝对接,具体可以参考[vLLM-MindSpore插件文档](https://www.mindspore.cn/vllm_mindspore/docs/zh-CN/master/index.html)。
 
-本章主要简单介绍vLLM MindSpore服务化推理的基础使用。
+本章主要简单介绍vLLM-MindSpore插件服务化推理的基础使用。
 
 ### 环境准备
 
-vLLM MindSpore适配层提供了环境安装脚本,用户可以执行如下命令创建一个vLLM MindSpore的运行环境:
+vLLM-MindSpore插件适配层提供了环境安装脚本,用户可以执行如下命令创建一个vLLM-MindSpore插件的运行环境:
 
 ```shell
 # download vllm-mindspore code
@@ -69,7 +69,7 @@ bash install_depend_pkgs.sh
 python setup.py install
 ```
 
-vLLM MindSpore的运行环境创建后,还需要安装以下依赖包:
+vLLM-MindSpore插件的运行环境创建后,还需要安装以下依赖包:
 
 - **mindspore**:MindSpore开发框架,模型运行基础。
 
@@ -85,14 +85,14 @@ vLLM MindSpore的运行环境创建后,还需要安装以下依赖包:
 
 ### 模型准备
 
-vllm-mindspore服务化支持原生Hugging Face的模型直接运行,因此直接从Hugging Face官网下载模型即可,此处我们仍然以Qwen2-7B-Instruct模型为例。
+vLLM-MindSpore插件服务化支持原生Hugging Face的模型直接运行,因此直接从Hugging Face官网下载模型即可,此处我们仍然以Qwen2-7B-Instruct模型为例。
 
 ```shell
 git lfs install
 git clone https://huggingface.co/Qwen/Qwen2-7B-Instruct
 ```
 
-若在拉取过程中,执行`git lfs install失败`,可以参考vLLM MindSpore FAQ 进行解决。
+若在拉取过程中,执行`git lfs install`失败,可以参考vLLM-MindSpore插件 [FAQ](https://www.mindspore.cn/vllm_mindspore/docs/zh-CN/master/faqs/faqs.html) 进行解决。
 
 ### 启动服务
 
@@ -122,7 +122,7 @@ unset vLLM_MODEL_BACKEND
 export MODEL_ID="/path/to/model/Qwen2-7B-Instruct"
 ```
 
-执行如下命令可以启动vLLM MindSpore的服务后端。
+执行如下命令可以启动vLLM-MindSpore插件的服务后端。
 
 ```shell
 vllm-mindspore serve --model=${MODEL_ID} --port=${VLLM_HTTP_PORT} --trust_remote_code --max-num-seqs=256 --max_model_len=32768 --max-num-batched-tokens=4096 --block_size=128 --gpu-memory-utilization=0.9 --tensor-parallel-size 1 --data-parallel-size 1 --data-parallel-size-local 1 --data-parallel-start-rank 0 --data-parallel-address ${VLLM_MASTER_IP} --data-parallel-rpc-port ${VLLM_RPC_PORT} &> vllm-mindspore.log &
-- 
Gitee