diff --git a/docs/vllm_mindspore/docs/source_en/conf.py b/docs/vllm_mindspore/docs/source_en/conf.py index 33bfa05fe2b56701007b997876529b4a7cfbedd6..e90650890c5c43e8fb5fee15ce66595133e28bdb 100644 --- a/docs/vllm_mindspore/docs/source_en/conf.py +++ b/docs/vllm_mindspore/docs/source_en/conf.py @@ -32,9 +32,9 @@ with open(_html_base.__file__, "r", encoding="utf-8") as f: # -- Project information ----------------------------------------------------- -project = 'vLLM MindSpore' +project = 'vLLM-MindSpore Plugin' copyright = 'MindSpore' -author = 'vLLM MindSpore' +author = 'vLLM-MindSpore Plugin' # The full version, including alpha/beta/rc tags release = 'master' @@ -182,7 +182,7 @@ with open(autodoc_source_path, "r+", encoding="utf8") as f: exec(get_param_func_str, sphinx_autodoc.__dict__) exec(code_str, sphinx_autodoc.__dict__) -# Copy source files of chinese python api from vLLM MindSpore repository. +# Copy source files of chinese python api from vLLM-MindSpore Plugin repository. from sphinx.util import logging logger = logging.getLogger(__name__) diff --git a/docs/vllm_mindspore/docs/source_en/developer_guide/contributing.md b/docs/vllm_mindspore/docs/source_en/developer_guide/contributing.md index e912018c23b66559859d6b3d544c0a74ecd06dca..f6e6140ef2500f0b8da1eace2c68cd2fa5ae437e 100644 --- a/docs/vllm_mindspore/docs/source_en/developer_guide/contributing.md +++ b/docs/vllm_mindspore/docs/source_en/developer_guide/contributing.md @@ -13,11 +13,11 @@ Before submitting code to the MindSpore community, you need to sign the Contribu ## Supporting New Models -To support a new model for vLLM MindSpore code repository, please note the following: +To support a new model for vLLM-MindSpore Plugin code repository, please note the following: - **Follow file format and location specifications.** Model code files should be placed under the `vllm_mindspore/model_executor` directory, organized in corresponding subfolders by model type. -- **Implement models using MindSpore interfaces with jit static graph support.** Model definitions in vLLM MindSpore must be implemented using MindSpore interfaces. Since MindSpore's static graph mode offers performance advantages, models should support execution via @jit static graphs. For reference, see the [Qwen2.5](https://gitee.com/mindspore/vllm-mindspore/blob/master/vllm_mindspore/model_executor/models/qwen2.py) implementation. -- **Register new models in vLLM MindSpore.** After implementing the model structure, register it in vLLM MindSpore by adding it to `_NATIVE_MODELS` in `vllm_mindspore/model_executor/models/registry.py`. +- **Implement models using MindSpore interfaces with jit static graph support.** Model definitions in vLLM-MindSpore Plugin must be implemented using MindSpore interfaces. Since MindSpore's static graph mode offers performance advantages, models should support execution via @jit static graphs. For reference, see the [Qwen2.5](https://gitee.com/mindspore/vllm-mindspore/blob/master/vllm_mindspore/model_executor/models/qwen2.py) implementation. +- **Register new models in vLLM-MindSpore Plugin.** After implementing the model structure, register it in vLLM-MindSpore Plugin by adding it to `_NATIVE_MODELS` in `vllm_mindspore/model_executor/models/registry.py`. - **Write unit tests.** New models must include corresponding unit tests. Refer to the [Qwen2.5 testcases](https://gitee.com/mindspore/vllm-mindspore/blob/master/tests/st/python/cases_parallel/vllm_qwen_7b.py) for examples. 
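+
+For the registration step above, the exact entry format is defined in `registry.py` itself; the following is an illustrative sketch only, assuming the same `architecture name -> (module, class)` mapping style used by vLLM's model registry:
+
+```python
+# Illustrative sketch only -- consult vllm_mindspore/model_executor/models/registry.py
+# for the authoritative structure of _NATIVE_MODELS.
+_NATIVE_MODELS = {
+    # architecture name from the model's config.json -> (module under models/, class name)
+    "Qwen2ForCausalLM": ("qwen2", "Qwen2ForCausalLM"),
+    # A new model adds one similar entry, e.g. (hypothetical):
+    # "MyModelForCausalLM": ("my_model", "MyModelForCausalLM"),
+}
+```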
## Contribution Process @@ -27,12 +27,12 @@ To support a new model for vLLM MindSpore code repository, please note the follo Follow these guidelines for community code review, maintenance, and development. - **Coding Standards:** Use vLLM community code checking tools: yapf, codespell, ruff, isort, and mypy. For more details, see the [Toolchain Usage Guide](https://gitee.com/mindspore/vllm-mindspore/blob/master/codecheck_toolkits/README.md). -- **Unit Testing Guidelines:** vLLM MindSpore uses the [pytest](http://www.pytest.org/en/latest/) framework. Test names should clearly reflect their purpose. +- **Unit Testing Guidelines:** vLLM-MindSpore Plugin uses the [pytest](http://www.pytest.org/en/latest/) framework. Test names should clearly reflect their purpose. - **Refactoring Guidelines:** Developers are encouraged to refactor code to eliminate [code smells](https://en.wikipedia.org/wiki/Code_smell). All code, including refactored code, must adhere to coding and testing standards. ### Fork-Pull Development Model -- **Fork the vLLM MindSpore Repository:** Before submitting code, fork the project to your own repository. Ensure consistency between the vLLM MindSpore repository and your fork during parallel development. +- **Fork the vLLM-MindSpore Plugin Repository:** Before submitting code, fork the project to your own repository. Ensure consistency between the vLLM-MindSpore Plugin repository and your fork during parallel development. - **Clone the Remote Repository:** users can use git to pull the source code: @@ -59,13 +59,13 @@ Follow these guidelines for community code review, maintenance, and development. git push origin {new_branch_name} ``` -- **Create a Pull Request to vLLM MindSpore:** Compare and create a PR between your branch and the vLLM MindSpore master branch. After submission, manually trigger CI checks with `/retest` in the comments. PRs should be merged into upstream master promptly to minimize merge risks. +- **Create a Pull Request to vLLM-MindSpore Plugin:** Compare and create a PR between your branch and the vLLM-MindSpore Plugin master branch. After submission, manually trigger CI checks with `/retest` in the comments. PRs should be merged into upstream master promptly to minimize merge risks. ### Reporting Issues To contribute by reporting issues, follow these guidelines: -- Specify your environment versions (vLLM MindSpore, MindSpore TransFormers, MindSpore, OS, Python, etc.). +- Specify your environment versions (vLLM-MindSpore Plugin, MindSpore TransFormers, MindSpore, OS, Python, etc.). - Indicate whether it's a bug report or feature request. - Label the issue type for visibility on the issue board. - Describe the problem and expected resolution. @@ -92,4 +92,4 @@ To contribute by reporting issues, follow these guidelines: - Keep your branch synchronized with master. - For bug-fix PRs, ensure all related issues are referenced. -Thank you for your interest in contributing to vLLM MindSpore. We welcome and value all forms of collaboration. +Thank you for your interest in contributing to vLLM-MindSpore Plugin. We welcome and value all forms of collaboration. 
diff --git a/docs/vllm_mindspore/docs/source_en/developer_guide/operations/custom_ops.md b/docs/vllm_mindspore/docs/source_en/developer_guide/operations/custom_ops.md
index 9f29cb0df6504c3d08e5fe8eab14b755face3f8a..e16285a9fe56513983c34fe69c787f40d1ab5228 100644
--- a/docs/vllm_mindspore/docs/source_en/developer_guide/operations/custom_ops.md
+++ b/docs/vllm_mindspore/docs/source_en/developer_guide/operations/custom_ops.md
@@ -4,9 +4,9 @@
When the built-in operators do not meet your requirements, you can use MindSpore's custom operator functionality to integrate your operators.
-This document would introduce how to integrate a new custom operator into the vLLM MindSpore project, with the **`advance_step_flashattn`** operator as an example. The focus here is on the integration process into vLLM MindSpore. For the details of custom operator development, please refer to the official MindSpore tutorial: [CustomOpBuilder-Based Custom Operators](https://www.mindspore.cn/tutorials/en/master/custom_program/operation/op_customopbuilder.html), and for AscendC operator development, see the official Ascend documentation: [Ascend C Operator Development](https://www.hiascend.com/document/detail/zh/canncommercial/81RC1/developmentguide/opdevg/Ascendcopdevg/atlas_ascendc_10_0001.html).
+This document introduces how to integrate a new custom operator into the vLLM-MindSpore Plugin project, with the **`advance_step_flashattn`** operator as an example. The focus here is on the integration process into vLLM-MindSpore Plugin. For details of custom operator development, please refer to the official MindSpore tutorial: [CustomOpBuilder-Based Custom Operators](https://www.mindspore.cn/tutorials/en/master/custom_program/operation/op_customopbuilder.html), and for AscendC operator development, see the official Ascend documentation: [Ascend C Operator Development](https://www.hiascend.com/document/detail/zh/canncommercial/81RC1/developmentguide/opdevg/Ascendcopdevg/atlas_ascendc_10_0001.html).
-**Note: Currently, custom operators in vLLM MindSpore are only supported in PyNative Mode.**
+**Note: Currently, custom operators in vLLM-MindSpore Plugin are only supported in PyNative Mode.**
## File Structure
@@ -109,11 +109,11 @@ VLLM_MS_EXTENSION_MODULE(m) {
In the above, the first parameter `"advance_step_flashattn"` in `m.def()` is the Python interface name for the operator.
-The `module.h` and `module.cpp` files create the Python module for the operator based on pybind11. Since only one `PYBIND11_MODULE` is allowed per dynamic library, and to allow users to complete operator integration in a single file, vLLM MindSpore provides a new registration macro `VLLM_MS_EXTENSION_MODULE`. When the custom operator dynamic library is loaded, all operator interfaces will be automatically registered into the same Python module.
+The `module.h` and `module.cpp` files create the Python module for the operator based on pybind11. Since only one `PYBIND11_MODULE` is allowed per dynamic library, and to allow users to complete operator integration in a single file, vLLM-MindSpore Plugin provides a new registration macro `VLLM_MS_EXTENSION_MODULE`. When the custom operator dynamic library is loaded, all operator interfaces will be automatically registered into the same Python module.
### Operator Interface
-The custom operator in vLLM MindSpore is compiled into `_C_ops.so`. For convenient calls, user can add a call interface in `vllm_mindspore/_custom_ops.py`. If extra adaptation is needed before or after the operator call, user can implement it in this interface.
+The custom operator in vLLM-MindSpore Plugin is compiled into `_C_ops.so`. For convenient calls, users can add a call interface in `vllm_mindspore/_custom_ops.py`. If extra adaptation is needed before or after the operator call, it can be implemented in this interface.
```python
def advance_step_flashattn(num_seqs: int, num_queries: int, block_size: int,
@@ -140,8 +140,8 @@ Here, importing `_C_ops` allows user to use the Python module for the custom ope
### Operator Compilation and Testing
-1. **Code Integration**: Merge the code into the vLLM MindSpore project.
-2. **Project Compilation**: Run `pip install .` in vllm-mindspore to build and install vLLM MindSpore.
+1. **Code Integration**: Merge the code into the vLLM-MindSpore Plugin project.
+2. **Project Compilation**: Run `pip install .` in vllm-mindspore to build and install vLLM-MindSpore Plugin.
3. **Operator Testing**: Call the operator interface via `_custom_ops`. Refer to testcase [test_custom_advstepflash.py](https://gitee.com/mindspore/vllm-mindspore/blob/master/tests/st/python/test_custom_advstepflash.py):
```python
@@ -152,18 +152,18 @@ custom_ops.advance_step_flashattn(...)
## Custom Operator Compilation Project
-Currently, MindSpore provides only a [CustomOpBuilder](https://www.mindspore.cn/docs/en/master/api_python/ops/mindspore.ops.CustomOpBuilder.html) interface for online compilation of custom operators, with default compilation and linking options built in. vLLM MindSpore integrates operators based on MindSpore’s custom operator feature and compiles them into a dynamic library for package release. The following introduces the build process:
+Currently, MindSpore provides only a [CustomOpBuilder](https://www.mindspore.cn/docs/en/master/api_python/ops/mindspore.ops.CustomOpBuilder.html) interface for online compilation of custom operators, with default compilation and linking options built in. vLLM-MindSpore Plugin integrates operators based on MindSpore’s custom operator feature and compiles them into a dynamic library for package release. The following introduces the build process:
### Extension Module
-In `setup.py`, vLLM MindSpore adds a `vllm_mindspore._C_ops` extension and the corresponding build module:
+In `setup.py`, vLLM-MindSpore Plugin adds a `vllm_mindspore._C_ops` extension and the corresponding build module:
```python
ext_modules = [Extension("vllm_mindspore._C_ops", sources=[])],
cmdclass = {"build_ext": CustomBuildExt},
```
-There is no need to specify `sources` here because vLLM MindSpore triggers the operator build via CMake, which automatically collects the source files.
+There is no need to specify `sources` here because vLLM-MindSpore Plugin triggers the operator build via CMake, which automatically collects the source files.
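+
+For orientation, a minimal `setup.py` sketch that wires these two pieces together might look as follows. This is an illustration only: the real `CustomBuildExt` in vLLM-MindSpore Plugin delegates the build to CMake (see the building process below), and the stub class here is just a placeholder.
+
+```python
+from setuptools import Extension, setup
+from setuptools.command.build_ext import build_ext
+
+
+class CustomBuildExt(build_ext):
+    """Placeholder stand-in: the real class invokes CMake to produce _C_ops.so."""
+
+    def build_extension(self, ext):
+        # The actual implementation collects operator sources via CMake and
+        # builds the shared library; nothing is compiled in this sketch.
+        pass
+
+
+setup(
+    name="vllm_mindspore",
+    # No sources are listed; the CMake-driven build gathers them automatically.
+    ext_modules=[Extension("vllm_mindspore._C_ops", sources=[])],
+    cmdclass={"build_ext": CustomBuildExt},
+)
+```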
### Building Process
diff --git a/docs/vllm_mindspore/docs/source_en/general/security.md b/docs/vllm_mindspore/docs/source_en/general/security.md
index 886bd1b8ad7eb3a2350a8d668904d4adeb2de3c7..d156fffcb72f28f5fa91aeff3fbe63ac634bbf32 100644
--- a/docs/vllm_mindspore/docs/source_en/general/security.md
+++ b/docs/vllm_mindspore/docs/source_en/general/security.md
@@ -2,11 +2,11 @@
[![View Source On Gitee](https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/master/resource/_static/logo_source_en.svg)](https://gitee.com/mindspore/docs/blob/master/docs/vllm_mindspore/docs/source_en/general/security.md)
-When enabling inference services using vLLM MindSpore on Ascend, there may be some security-related issues due to the need for certain network ports for necessary functions such as serviceification, node communication, and model execution.
+When enabling inference services using vLLM-MindSpore Plugin on Ascend, there may be some security-related issues, because certain network ports are needed for necessary functions such as service deployment, node communication, and model execution.
## Service Port Configuration
-When starting the inference service using vLLM MindSpore, relevant IP and port information is required, including:
+When starting the inference service using vLLM-MindSpore Plugin, relevant IP and port information is required, including:
1. `host`: Sets the IP address associated with the vLLM serve (default: `0.0.0.0`).
2. `port`: Sets the port for vLLM serve (default: `8000`).
@@ -36,7 +36,7 @@ For security, it should be deployed in a sufficiently secure isolated network en
### Executing Framework Distributed Communication
-It should be noted that vLLM MindSpore use MindSpore's distributed communication. For detailed security information about MindSpore, please refer to the [MindSpore](https://www.mindspore.cn/en).
+It should be noted that vLLM-MindSpore Plugin uses MindSpore's distributed communication. For detailed security information about MindSpore, please refer to the [MindSpore official website](https://www.mindspore.cn/en).
## Security Recommendations
diff --git a/docs/vllm_mindspore/docs/source_en/getting_started/installation/installation.md b/docs/vllm_mindspore/docs/source_en/getting_started/installation/installation.md
index 73851332d61d4d9f33665d31dfb44915d1071bb9..20315c3d3b31d0fb75f9929027442d40420dce3e 100644
--- a/docs/vllm_mindspore/docs/source_en/getting_started/installation/installation.md
+++ b/docs/vllm_mindspore/docs/source_en/getting_started/installation/installation.md
@@ -2,11 +2,10 @@
[![View Source On Gitee](https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/master/resource/_static/logo_source_en.svg)](https://gitee.com/mindspore/docs/blob/master/docs/vllm_mindspore/docs/source_en/getting_started/installation/installation.md)
-This document describes the steps to install the vLLM MindSpore environment. Three installation methods are provided:
+This document introduces the [Version Compatibility](#version-compatibility) of vLLM-MindSpore Plugin, the installation steps, and a [Quick Verification](#quick-verification) example to confirm that the installation is successful. Two installation methods are provided:
-- [Docker Installation](#docker-installation): Suitable for quick deployment scenarios.
-- [Pip Installation](#pip-installation): Suitable for scenarios requiring specific versions.
-- [Source Code Installation](#source-code-installation): Suitable for incremental development of vLLM MindSpore. +- [Docker Installation](#docker-installation): Suitable for quick deployment scenarios. +- [Source Code Installation](#source-code-installation): Suitable for incremental development of vLLM-MindSpore Plugin. ## Version Compatibility @@ -14,27 +13,23 @@ This document describes the steps to install the vLLM MindSpore environment. Thr - Python: 3.9 / 3.10 / 3.11 - Software version compatibility - | Software | Version | Corresponding Branch | - | -------- | ------- | -------------------- | - | [CANN](https://www.hiascend.com/developer/download/community/result?module=cann) | 8.1 | - | - | [MindSpore](https://www.mindspore.cn/install/) | 2.7 | master | - | [MSAdapter](https://git.openi.org.cn/OpenI/MSAdapter) | 0.2 | master | - | [MindSpore Transformers](https://gitee.com/mindspore/mindformers) | 1.6 | dev | - | [Golden Stick](https://gitee.com/mindspore/golden-stick) | 1.1.0 | r1.1.0 | - | [vLLM](https://github.com/vllm-project/vllm) | 0.9.1 | v0.9.1 | - | [vLLM MindSpore](https://gitee.com/mindspore/vllm-mindspore) | 0.3 | master | + | Software | Version And Links | + | ----- | ----- | + | CANN | [8.1.RC1](https://www.hiascend.com/document/detail/zh/canncommercial/81RC1/softwareinst/instg/instg_0000.html?Mode=PmIns&InstallType=local&OS=Debian&Software=cannToolKit) | + | MindSpore | [2.7.0](https://repo.mindspore.cn/mindspore/mindspore/version/202508/20250814/master_20250814091143_7548abc43af03319bfa528fc96d0ccd3917fcc9c_newest/unified/) | + | MSAdapter | [0.5.0](https://repo.mindspore.cn/mindspore/msadapter/version/202508/20250814/master_20250814010018_4615051c43eef898b6bbdc69768656493b5932f8_newest/any/) | + | MindSpore Transformers | [1.6.0](https://gitee.com/mindspore/mindformers) | + | Golden Stick | [1.2.0](https://repo.mindspore.cn/mindspore/golden-stick/version/202508/20250814/master_20250814010017_2713821db982330b3bcd6d84d85a3b337d555f27_newest/any/) | + | vLLM | [0.9.1](https://repo.mindspore.cn/mirrors/vllm/version/202507/20250715/v0.9.1/any/) | + | vLLM-MindSpore Plugin | [0.3.0](https://gitee.com/mindspore/vllm-mindspore/) | -## Environment Setup +## Docker Installation -This section introduces three installation methods: [Docker Installation](#docker-installation), [Pip Installation](#pip-installation), [Source Code Installation](#source-code-installation), and [Quick Verification](#quick-verification) example to check the installation. +We recommend using Docker for quick deployment of the vLLM-MindSpore Plugin environment. Below are the steps: -### Docker Installation +### Building the Image -We recommend using Docker for quick deployment of the vLLM MindSpore environment. Below are the steps: - -#### Building the Image - -User can execute the following commands to clone the vLLM MindSpore code repository and build the image: +User can execute the following commands to clone the vLLM-MindSpore Plugin code repository and build the image: ```bash git clone https://gitee.com/mindspore/vllm-mindspore.git @@ -54,7 +49,7 @@ Here, `e40bcbeae9fc` is the image ID, and `vllm_ms_20250726:latest` is the image docker images ``` -#### Creating a Container +### Creating a Container After [building the image](#building-the-image), set `DOCKER_NAME` and `IMAGE_NAME` as the container and image names, then execute the following command to create the container: @@ -96,7 +91,7 @@ The container ID will be returned if docker is created successfully. 
User can al
docker ps
```
-#### Entering the Container
+### Entering the Container
After [creating the container](#creating-a-container), user can start and enter the container, using the environment variable `DOCKER_NAME`:
@@ -104,58 +99,111 @@ After [creating the container](#creating-a-container), user can start and enter
docker exec -it $DOCKER_NAME bash
```
-### Source Code Installation
+## Source Code Installation
+
+### CANN Installation
+
+For CANN installation methods and environment configuration, please refer to [CANN Community Edition Installation Guide](https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/82RC1alpha002/softwareinst/instg/instg_0001.html?Mode=PmIns&OS=openEuler&Software=cannToolKit). If you encounter any issues during CANN installation, please consult the [Ascend FAQ](https://www.hiascend.com/document/detail/zh/AscendFAQ/ProduTech/CANNFAQ/cannfaq_000.html) for troubleshooting.
+
+The default installation path for CANN is `/usr/local/Ascend`. After completing CANN installation, configure the environment variables with the following commands:
+
+```bash
+LOCAL_ASCEND=/usr/local/Ascend # the root directory of run package
+source ${LOCAL_ASCEND}/ascend-toolkit/set_env.sh
+export ASCEND_CUSTOM_PATH=${LOCAL_ASCEND}/ascend-toolkit
+```
+
+### vLLM Prerequisites Installation
+
+For vLLM environment configuration and installation methods, please refer to the [vLLM Installation Guide](https://docs.vllm.ai/en/v0.9.1/getting_started/installation/cpu.html). During vLLM installation, `gcc/g++ >= 12.3.0` is required, and it can be installed by the following command:
+
+```bash
+yum install -y gcc gcc-c++
+```
+
+### vLLM-MindSpore Plugin Installation
+
+vLLM-MindSpore Plugin can be installed in the following two ways. **vLLM-MindSpore Plugin Quick Installation** is suitable for scenarios where users need quick deployment and usage. **vLLM-MindSpore Plugin Manual Installation** is suitable for scenarios where users require custom modifications to the components.
+
+- **vLLM-MindSpore Plugin Quick Installation**
+
+  To install vLLM-MindSpore Plugin, users need to pull the vLLM-MindSpore Plugin source code and then run the following commands to install the dependencies:
+
+  ```bash
+  git clone https://gitee.com/mindspore/vllm-mindspore.git
+  cd vllm-mindspore
+  bash install_depend_pkgs.sh
+  ```
+
+  Compile and install vLLM-MindSpore Plugin:
+
+  ```bash
+  pip install .
+  ```
+
+  After executing the above commands, a `mindformers` folder will be generated in the `vllm-mindspore/install_depend_pkgs` directory. Add this folder to the environment variables:
+
+  ```bash
+  export PYTHONPATH=$MF_PATH:$PYTHONPATH
+  ```
+
+- **vLLM-MindSpore Plugin Manual Installation**
+
+  If users need to modify the components or use other versions, the components need to be installed manually in a specific order. The version compatibility of vLLM-MindSpore Plugin can be found in [Version Compatibility](#version-compatibility), and vLLM-MindSpore Plugin requires the following installation sequence:
+
+  1. Install vLLM
-- **CANN Installation**
+     ```bash
+     pip install /path/to/vllm-*.whl
+     ```
-   For CANN installation methods, please refer to [CANN Community Edition Installation Guide](https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/82RC1alpha002/softwareinst/instg/instg_0001.html?Mode=PmIns&OS=openEuler&Software=cannToolKit). If you encounter any issues during CANN installation, please consult the [Ascend FAQ](https://www.hiascend.com/document/detail/zh/AscendFAQ/ProduTech/CANNFAQ/cannfaq_000.html) for troubleshooting.
+  2. Uninstall Torch-related components
-   The default installation path for CANN is `/usr/local/Ascend`. After completing CANN installation, configure the environment variables with the following commands:
+     ```bash
+     pip uninstall torch torch-npu torchvision torchaudio -y
+     ```
-   ```bash
-   LOCAL_ASCEND=/usr/local/Ascend # the root directory of run package
-   source ${LOCAL_ASCEND}/ascend-toolkit/set_env.sh
-   export ASCEND_CUSTOM_PATH=${LOCAL_ASCEND}/ascend-toolkit
-   ```
+  3. Install MindSpore
-- **vLLM Prerequisites Installation**
+     ```bash
+     pip install /path/to/mindspore-*.whl
+     ```
-   For vLLM environment configuration and installation methods, please refer to the [vLLM Installation Guide](https://docs.vllm.ai/en/v0.9.1/getting_started/installation/cpu.html). In vllM installation, `gcc/g++ >= 12.3.0` is required, and it could be installed by the following command:
+  4. Clone the MindSpore Transformers repository and add it to `PYTHONPATH`
-   ```bash
-   yum install -y gcc gcc-c++
-   ```
+     ```bash
+     git clone https://gitee.com/mindspore/mindformers.git
+     export PYTHONPATH=$MF_PATH:$PYTHONPATH
+     ```
-- **vLLM MindSpore Installation**
+  5. Install Golden Stick
-   To install vLLM MindSpore, user needs to pull the vLLM MindSpore source code and then runs the following command to install the dependencies:
+     ```bash
+     pip install /path/to/mindspore_gs-*.whl
+     ```
-   ```bash
-   git clone https://gitee.com/mindspore/vllm-mindspore.git
-   cd vllm-mindspore
-   bash install_depend_pkgs.sh
-   ```
+  6. Install MSAdapter
-   Compile and install vLLM MindSpore:
+     ```bash
+     pip install /path/to/msadapter-*.whl
+     ```
-   ```bash
-   pip install .
-   ```
+  7. Install vLLM-MindSpore Plugin
-   After executing the above commands, `mindformers` folder will be generated in the `vllm-mindspore/install_depend_pkgs` directory. Add this folder to the environment variables:
+     Users need to pull the vLLM-MindSpore Plugin source code and run the installation:
-   ```bash
-   export PYTHONPATH=$MF_PATH:$PYTHONPATH
-   ```
+     ```bash
+     git clone https://gitee.com/mindspore/vllm-mindspore.git
+     cd vllm-mindspore
+     pip install .
+     ```
-### Quick Verification
+## Quick Verification
User can verify the installation with a simple offline inference test. First, user need to configure the environment variables with the following command:
```bash
-export ASCEND_TOTAL_MEMORY_GB=64 # Please use `npu-smi info` to check the memory.
export vLLM_MODEL_BACKEND=MindFormers # use MindSpore Transformers as model backend.
-export vLLM_MODEL_MEMORY_USE_GB=32 # Memory reserved for model execution. Set according to the model's maximum usage, with the remaining environment used for kvcache allocation
export MINDFORMERS_MODEL_CONFIG=$YAML_PATH # Set the corresponding MindSpore Transformers model's YAML file.
```
diff --git a/docs/vllm_mindspore/docs/source_en/getting_started/quick_start/quick_start.md b/docs/vllm_mindspore/docs/source_en/getting_started/quick_start/quick_start.md
index c0eaf16c347299abc75285ffc1d57c50403d5fee..e425c661a47830d81822d3fe600801a359c3a189 100644
--- a/docs/vllm_mindspore/docs/source_en/getting_started/quick_start/quick_start.md
+++ b/docs/vllm_mindspore/docs/source_en/getting_started/quick_start/quick_start.md
@@ -2,15 +2,15 @@
[![View Source On Gitee](https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/master/resource/_static/logo_source_en.svg)](https://gitee.com/mindspore/docs/blob/master/docs/vllm_mindspore/docs/source_en/getting_started/quick_start/quick_start.md)
-This document provides a quick guide to deploy vLLM MindSpore by [docker](https://www.docker.com/), with the [Qwen2.5-7B](https://huggingface.co/Qwen/Qwen2.5-7B-Instruct) model as an example. User can quickly experience the serving and inference abilities of vLLM MindSpore by [offline inference](#offline-inference) and [online inference](#online-inference). For more information about installation, please refer to the [Installation Guide](../installation/installation.md).
+This document provides a quick guide to deploying vLLM-MindSpore Plugin by [docker](https://www.docker.com/), with the [Qwen2.5-7B](https://huggingface.co/Qwen/Qwen2.5-7B-Instruct) model as an example. Users can quickly experience the serving and inference abilities of vLLM-MindSpore Plugin through [offline inference](#offline-inference) and [online inference](#online-inference). For more information about installation, please refer to the [Installation Guide](../installation/installation.md).
## Docker Installation
-In this section, we recommend to use docker to deploy the vLLM MindSpore environment. The following sections are the steps for deployment:
+In this section, we recommend using docker to deploy the vLLM-MindSpore Plugin environment. The following sections describe the deployment steps:
### Building the Image
-User can execute the following commands to clone the vLLM MindSpore code repository and build the image:
+Users can execute the following commands to clone the vLLM-MindSpore Plugin code repository and build the image:
```bash
git clone https://gitee.com/mindspore/vllm-mindspore.git
@@ -131,18 +131,14 @@ git clone https://huggingface.co/Qwen/Qwen2.5-7B-Instruct
Before launching the model, user need to set the following environment variables:
```bash
-export ASCEND_TOTAL_MEMORY_GB=64 # Please use `npu-smi info` to check the memory.
export vLLM_MODEL_BACKEND=MindFormers # use MindSpore Transformers as model backend.
-export vLLM_MODEL_MEMORY_USE_GB=32 # Memory reserved for model execution. Set according to the model's maximum usage, with the remaining environment used for kvcache allocation.
export MINDFORMERS_MODEL_CONFIG=$YAML_PATH # Set the corresponding MindSpore Transformers model's YAML file.
```
Here is an explanation of these environment variables:
-- `ASCEND_TOTAL_MEMORY_GB`: The memory size of each card. User can check the memory by using `npu-smi info`, where the value corresponds to `HBM-Usage(MB)` in the query results.
-- `vLLM_MODEL_BACKEND`: The backend of the model to run. User could find supported models and backends for vLLM MindSpore in the [Model Support List](../../user_guide/supported_models/models_list/models_list.md).
-- `vLLM_MODEL_MEMORY_USE_GB`: The memory reserved for model loading. Adjust this value if insufficient memory error occurs during model loading.
-- `MINDFORMERS_MODEL_CONFIG`: The model configuration file.
+- `vLLM_MODEL_BACKEND`: The backend of the model to run. Users can find supported models and backends for vLLM-MindSpore Plugin in the [Model Support List](../../user_guide/supported_models/models_list/models_list.md).
+- `MINDFORMERS_MODEL_CONFIG`: The model configuration file. Users can find the corresponding YAML file in the [MindSpore Transformers repository](https://gitee.com/mindspore/mindformers/tree/master/research/qwen2_5). For Qwen2.5-7B, the YAML file is [predict_qwen2_5_7b_instruct.yaml](https://gitee.com/mindspore/mindformers/blob/master/research/qwen2_5/predict_qwen2_5_7b_instruct.yaml).
Additionally, users need to ensure that MindSpore Transformers is installed. Users can add it by running the following command:
@@ -192,7 +188,7 @@ Prompt: 'Llama is'. Generated text: ' a 100% natural, biodegradable, and compost
### Online Inference
-vLLM MindSpore supports online inference deployment with the OpenAI API protocol. The following section would introduce how to [starting the service](#starting-the-service) and [send requests](#sending-requests) to obtain inference results, using [Qwen2.5-7B](https://huggingface.co/Qwen/Qwen2.5-7B-Instruct) as an example.
+vLLM-MindSpore Plugin supports online inference deployment with the OpenAI API protocol. The following section introduces how to [start the service](#starting-the-service) and [send requests](#sending-requests) to obtain inference results, using [Qwen2.5-7B](https://huggingface.co/Qwen/Qwen2.5-7B-Instruct) as an example.
#### Starting the Service
@@ -202,7 +198,7 @@ Use the model `Qwen/Qwen2.5-7B-Instruct` and start the vLLM service with the fol
python3 -m vllm_mindspore.entrypoints vllm.entrypoints.openai.api_server --model "Qwen/Qwen2.5-7B-Instruct"
```
-If the service starts successfully, similar output will be obtained:
+Users can also specify a local model path via the `--model` argument. If the service starts successfully, similar output will be obtained:
```text
INFO: Started server process [6363]
INFO: Waiting for application startup.
INFO: Application startup complete.
@@ -224,6 +220,8 @@ Use the following command to send a request, where `prompt` is the model input:
curl http://localhost:8000/v1/completions -H "Content-Type: application/json" -d '{"model": "Qwen/Qwen2.5-7B-Instruct", "prompt": "I am", "max_tokens": 15, "temperature": 0}'
```
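+
+As an alternative to `curl`, the same request can be sent from Python through the OpenAI-compatible interface. The following is a minimal sketch; it assumes the `openai` Python package is installed, and the `api_key` value is only a placeholder (whether it is checked depends on how the service was started):
+
+```python
+from openai import OpenAI
+
+# Point the OpenAI-compatible client at the local vLLM service.
+client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")
+
+completion = client.completions.create(
+    model="Qwen/Qwen2.5-7B-Instruct",  # must match the --model used at startup
+    prompt="I am",
+    max_tokens=15,
+    temperature=0,
+)
+print(completion.choices[0].text)
+```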
+ If the request is processed successfully, the following inference result will be returned: ```text diff --git a/docs/vllm_mindspore/docs/source_en/getting_started/tutorials/deepseek_parallel/deepseek_r1_671b_w8a8_dp4_tp4_ep4.md b/docs/vllm_mindspore/docs/source_en/getting_started/tutorials/deepseek_parallel/deepseek_r1_671b_w8a8_dp4_tp4_ep4.md index 7a3a9fb4a83667a8d74de50b205d55a4a8a7a928..9080b67487908939da1d82b6a212341c274042b7 100644 --- a/docs/vllm_mindspore/docs/source_en/getting_started/tutorials/deepseek_parallel/deepseek_r1_671b_w8a8_dp4_tp4_ep4.md +++ b/docs/vllm_mindspore/docs/source_en/getting_started/tutorials/deepseek_parallel/deepseek_r1_671b_w8a8_dp4_tp4_ep4.md @@ -2,7 +2,7 @@ [![View Source On Gitee](https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/master/resource/_static/logo_source_en.svg)](https://gitee.com/mindspore/docs/blob/master/docs/vllm_mindspore/docs/source_en/getting_started/tutorials/deepseek_parallel/deepseek_r1_671b_w8a8_dp4_tp4_ep4.md) -vLLM MindSpore supports hybrid parallel inference with configurations of tensor parallelism (TP), data parallelism (DP), expert parallelism (EP), and their combinations. For the applicable scenarios of different parallel strategies, refer to the [vLLM official documentation](https://docs.vllm.ai/en/latest/configuration/optimization.html#parallelism-strategies). +vLLM-MindSpore Plugin supports hybrid parallel inference with configurations of tensor parallelism (TP), data parallelism (DP), expert parallelism (EP), and their combinations. For the applicable scenarios of different parallel strategies, refer to the [vLLM official documentation](https://docs.vllm.ai/en/latest/configuration/optimization.html#parallelism-strategies). This document uses the DeepSeek R1 671B W8A8 model as an example to introduce the inference workflows for [tensor parallelism (TP16)](#tp16-tensor-parallel-inference) and [hybrid parallelism](#hybrid-parallel-inference). The DeepSeek R1 671B W8A8 model requires multiple nodes to run inference. To ensure consistent execution configurations (including model configuration file paths, Python environments, etc.) across all nodes, it is recommended to use Docker containers to eliminate execution differences. @@ -10,11 +10,11 @@ Users can configure the environment by following the [Docker Installation](#dock ## Docker Installation -In this section, we recommend to use docker to deploy the vLLM MindSpore environment. The following sections are the steps for deployment: +In this section, we recommend to use docker to deploy the vLLM-MindSpore Plugin environment. The following sections are the steps for deployment: ### Building the Image -User can execute the following commands to clone the vLLM MindSpore code repository and build the image: +User can execute the following commands to clone the vLLM-MindSpore Plugin code repository and build the image: ```bash git clone https://gitee.com/mindspore/vllm-mindspore.git @@ -52,8 +52,8 @@ Execute the following Python script to download the MindSpore-compatible DeepSee ```python from openmind_hub import snapshot_download -snapshot_download(repo_id="MindSpore-Lab/DeepSeek-R1-W8A8", - local_dir="/path/to/save/deepseek_r1_w8a8", +snapshot_download(repo_id="MindSpore-Lab/DeepSeek-R1-0528-A8W8", + local_dir="/path/to/save/deepseek_r1_0528_a8w8", local_dir_use_symlinks=False) ``` @@ -78,7 +78,7 @@ If the tool is unavailable, install [git-lfs](https://git-lfs.com) first. 
Once confirmed, download the weights by executing the following command:
```shell
-git clone https://modelers.cn/MindSpore-Lab/DeepSeek-R1-W8A8.git
+git clone https://modelers.cn/models/MindSpore-Lab/DeepSeek-R1-0528-A8W8.git
```
## TP16 Tensor Parallel Inference
@@ -114,7 +114,7 @@ Environment variable descriptions:
- `HCCL_OP_EXPANSION_MODE`: Configure the communication algorithm expansion location to the AI Vector Core (AIV) computing unit on the device side.
- `MS_ALLOC_CONF`: Set the memory policy. Refer to the [MindSpore documentation](https://www.mindspore.cn/docs/en/master/api_python/env_var_list.html).
- `ASCEND_RT_VISIBLE_DEVICES`: Configure the available device IDs for each node. Use the `npu-smi info` command to check.
-- `vLLM_MODEL_BACKEND`: The backend of the model to run. Currently supported models and backends for vLLM MindSpore can be found in the [Model Support List](../../../user_guide/supported_models/models_list/models_list.md).
+- `vLLM_MODEL_BACKEND`: The backend of the model to run. Currently supported models and backends for vLLM-MindSpore Plugin can be found in the [Model Support List](../../../user_guide/supported_models/models_list/models_list.md).
- `MINDFORMERS_MODEL_CONFIG`: Model configuration file. Users can find the corresponding YAML file in the [MindSpore Transformers repository](https://gitee.com/mindspore/mindformers/tree/master/research/deepseek3/deepseek_r1_671b), such as [predict_deepseek_r1_671b_w8a8.yaml](https://gitee.com/mindspore/mindformers/blob/master/research/deepseek3/deepseek_r1_671b/predict_deepseek_r1_671b_w8a8.yaml).
The model parallel strategy is specified in the `parallel_config` of the configuration file. For example, the TP16 tensor parallel configuration is as follows:
@@ -222,7 +222,7 @@ Before managing a multi-node cluster, ensure that the hostnames of all nodes are
#### Starting the Service
-vLLM MindSpore can deploy online inference using the OpenAI API protocol. Below is the workflow for launching the service.
+vLLM-MindSpore Plugin can deploy online inference using the OpenAI API protocol. Below is the workflow for launching the service.
```bash
# Service launch parameter explanation
@@ -241,18 +241,20 @@ Execution example:
```bash
# Master node:
-vllm-mindspore serve --model="/path/to/save/deepseek_r1_w8a8" --trust-remote-code --max-num-seqs=256 --max_model_len=32768 --max-num-batched-tokens=4096 --block-size=128 --gpu-memory-utilization=0.9 --tensor-parallel-size 16 --distributed-executor-backend=ray
+vllm-mindspore serve --model="MindSpore-Lab/DeepSeek-R1-0528-A8W8" --trust-remote-code --max-num-seqs=256 --max_model_len=32768 --max-num-batched-tokens=4096 --block-size=128 --gpu-memory-utilization=0.9 --tensor-parallel-size 16 --distributed-executor-backend=ray
```
-In tensor parallel scenarios, the `--tensor-parallel-size` parameter overrides the `model_parallel` configuration in the model YAML file.
+In tensor parallel scenarios, the `--tensor-parallel-size` parameter overrides the `model_parallel` configuration in the model YAML file. Users can also specify a local model path via the `--model` argument.
#### Sending Requests
Use the following command to send requests, where `prompt` is the model input:
```bash
-curl http://localhost:8000/v1/completions -H "Content-Type: application/json" -d '{"model": "/path/to/save/deepseek_r1_w8a8", "prompt": "I am", "max_tokens": 20, "temperature": 0, "top_p": 1.0, "top_k": 1, "repetition_penalty": 1.0}'
-```
+curl http://localhost:8000/v1/completions -H "Content-Type: application/json" -d '{"model": "MindSpore-Lab/DeepSeek-R1-0528-A8W8", "prompt": "I am", "max_tokens": 20, "temperature": 0, "top_p": 1.0, "top_k": 1, "repetition_penalty": 1.0}'
+```
+
+Users need to ensure that the `"model"` field matches the `--model` argument used when starting the service, so that the request is correctly routed to the model.
## Hybrid Parallel Inference
@@ -283,7 +285,7 @@ Environment variable descriptions:
- `HCCL_OP_EXPANSION_MODE`: Configure the communication algorithm expansion location to the AI Vector Core (AIV) computing unit on the device side.
- `MS_ALLOC_CONF`: Set the memory policy. Refer to the [MindSpore documentation](https://www.mindspore.cn/docs/en/master/api_python/env_var_list.html).
- `ASCEND_RT_VISIBLE_DEVICES`: Configure the available device IDs for each node. Use the `npu-smi info` command to check.
-- `vLLM_MODEL_BACKEND`: The backend of the model to run. Currently supported models and backends for vLLM MindSpore can be found in the [Model Support List](../../../user_guide/supported_models/models_list/models_list.md).
+- `vLLM_MODEL_BACKEND`: The backend of the model to run. Currently supported models and backends for vLLM-MindSpore Plugin can be found in the [Model Support List](../../../user_guide/supported_models/models_list/models_list.md).
- `MINDFORMERS_MODEL_CONFIG`: Model configuration file. Users can find the corresponding YAML file in the [MindSpore Transformers repository](https://gitee.com/mindspore/mindformers/tree/master/research/deepseek3/deepseek_r1_671b), such as [predict_deepseek_r1_671b_w8a8_ep4t4.yaml](https://gitee.com/mindspore/mindformers/blob/master/research/deepseek3/deepseek_r1_671b/predict_deepseek_r1_671b_w8a8_ep4tp4.yaml).
The model parallel strategy is specified in the `parallel_config` of the configuration file. For example, the hybrid parallel configuration is as follows:
@@ -301,6 +303,8 @@ parallel_config:
### Online Inference
+#### Starting the Service
+
`vllm-mindspore` can deploy online inference using the OpenAI API protocol. Below is the workflow for launching the service:
```bash
@@ -321,22 +325,24 @@ vllm-mindspore serve
--data-parallel-address [Master node communication IP]
--data-parallel-rpc-port [Master node communication port]
--enable-expert-parallel # Enable expert parallelism
-```
-Execution example:
+```
+Users can also specify a local model path via the `--model` argument. The following is an execution example:
```bash
# Master node:
-vllm-mindspore serve --model="/path/to/save/deepseek_r1_w8a8" --trust-remote-code --max-num-seqs=256 --max-model-len=32768 --max-num-batched-tokens=4096 --block-size=128 --gpu-memory-utilization=0.9 --tensor-parallel-size 4 --data-parallel-size 4 --data-parallel-size-local 2 --data-parallel-start-rank 0 --data-parallel-address 192.10.10.10 --data-parallel-rpc-port 12370 --enable-expert-parallel
+vllm-mindspore serve --model="MindSpore-Lab/DeepSeek-R1-0528-A8W8" --trust-remote-code --max-num-seqs=256 --max-model-len=32768 --max-num-batched-tokens=4096 --block-size=128 --gpu-memory-utilization=0.9 --tensor-parallel-size 4 --data-parallel-size 4 --data-parallel-size-local 2 --data-parallel-start-rank 0 --data-parallel-address 192.10.10.10 --data-parallel-rpc-port 12370 --enable-expert-parallel
# Worker node:
-vllm-mindspore serve --headless --model="/path/to/save/deepseek_r1_w8a8" --trust-remote-code --max-num-seqs=256 --max-model-len=32768 --max-num-batched-tokens=4096 --block-size=128 --gpu-memory-utilization=0.9 --tensor-parallel-size 4 --data-parallel-size 4 --data-parallel-size-local 2 --data-parallel-start-rank 2 --data-parallel-address 192.10.10.10 --data-parallel-rpc-port 12370 --enable-expert-parallel
+vllm-mindspore serve --headless --model="MindSpore-Lab/DeepSeek-R1-0528-A8W8" --trust-remote-code --max-num-seqs=256 --max-model-len=32768 --max-num-batched-tokens=4096 --block-size=128 --gpu-memory-utilization=0.9 --tensor-parallel-size 4 --data-parallel-size 4 --data-parallel-size-local 2 --data-parallel-start-rank 2 --data-parallel-address 192.10.10.10 --data-parallel-rpc-port 12370 --enable-expert-parallel
-```
+```
-## Sending Requests
+#### Sending Requests
Use the following command to send requests, where `prompt` is the model input:
```bash
-curl http://localhost:8000/v1/completions -H "Content-Type: application/json" -d '{"model": "/path/to/save/deepseek_r1_w8a8", "prompt": "I am", "max_tokens": 20, "temperature": 0}'
+curl http://localhost:8000/v1/completions -H "Content-Type: application/json" -d '{"model": "MindSpore-Lab/DeepSeek-R1-0528-A8W8", "prompt": "I am", "max_tokens": 20, "temperature": 0}'
```
+
+Users need to ensure that the `"model"` field matches the `--model` argument used when starting the service, so that the request is correctly routed to the model.
diff --git a/docs/vllm_mindspore/docs/source_en/getting_started/tutorials/qwen2.5_32b_multiNPU/qwen2.5_32b_multiNPU.md b/docs/vllm_mindspore/docs/source_en/getting_started/tutorials/qwen2.5_32b_multiNPU/qwen2.5_32b_multiNPU.md
index 24d4d4a2cac790dc5e9e8f5d5145266b896d32f5..5976859caae0ac95456128fab31c35e8d38b832e 100644
--- a/docs/vllm_mindspore/docs/source_en/getting_started/tutorials/qwen2.5_32b_multiNPU/qwen2.5_32b_multiNPU.md
+++ b/docs/vllm_mindspore/docs/source_en/getting_started/tutorials/qwen2.5_32b_multiNPU/qwen2.5_32b_multiNPU.md
@@ -2,15 +2,15 @@
[![View Source On Gitee](https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/master/resource/_static/logo_source_en.svg)](https://gitee.com/mindspore/docs/blob/master/docs/vllm_mindspore/docs/source_en/getting_started/tutorials/qwen2.5_32b_multiNPU/qwen2.5_32b_multiNPU.md)
-This document introduces single-node multi-card inference process by vLLM MindSpore. Taking the [Qwen2.5-32B](https://huggingface.co/Qwen/Qwen2.5-32B-Instruct) model as an example, users can configure the environment through the [Docker Installation](#docker-installation) section or the [Installation Guide](../../installation/installation.md#installation-guide), and then [download the model weights](#downloading-model-weights). After [setting environment variables](#setting-environment-variables), users can perform [online inference](#online-inference) to experience single-node multi-card inference capabilities.
+This document introduces the single-node multi-card inference process with vLLM-MindSpore Plugin. Taking the [Qwen2.5-32B](https://huggingface.co/Qwen/Qwen2.5-32B-Instruct) model as an example, users can configure the environment through the [Docker Installation](#docker-installation) section or the [Installation Guide](../../installation/installation.md#installation-guide), and then [download the model weights](#downloading-model-weights). After [setting environment variables](#setting-environment-variables), users can perform [online inference](#online-inference) to experience single-node multi-card inference capabilities.
## Docker Installation
-In this section, we recommend using Docker for quick deployment of the vLLM MindSpore environment. Below are the steps for Docker deployment:
+In this section, we recommend using Docker for quick deployment of the vLLM-MindSpore Plugin environment. Below are the steps for Docker deployment:
### Building the Image
-User can execute the following commands to clone the vLLM MindSpore code repository and build the image:
+Users can execute the following commands to clone the vLLM-MindSpore Plugin code repository and build the image:
```bash
git clone https://gitee.com/mindspore/vllm-mindspore.git
@@ -127,18 +127,14 @@ For [Qwen2.5-32B](https://huggingface.co/Qwen/Qwen2.5-32B-Instruct), the followi
```bash
#set environment variables
-export ASCEND_TOTAL_MEMORY_GB=64 # Use `npu-smi info` to check the memory.
export vLLM_MODEL_BACKEND=MindFormers # Use MindSpore TransFormers as the model backend.
-export vLLM_MODEL_MEMORY_USE_GB=32 # Memory reserved for model execution. Adjust based on the model's maximum usage, with the remaining allocated for KV cache.
export MINDFORMERS_MODEL_CONFIG=$YAML_PATH # Set the corresponding MindSpore Transformers model YAML file.
```
Here is an explanation of these environment variables:
-- `ASCEND_TOTAL_MEMORY_GB`: The memory size of each compute card. Query using `npu-smi info`, corresponding to `HBM-Usage(MB)` in the results.
- `vLLM_MODEL_BACKEND`: The model backend. Currently supported models and backends are listed in the [Model Support List](../../../user_guide/supported_models/models_list/models_list.md).
-- `vLLM_MODEL_MEMORY_USE_GB`: Memory reserved for model loading. Adjust this if encountering insufficient memory.
-- `MINDFORMERS_MODEL_CONFIG`: Model configuration file. User can find the corresponding YAML file in the [MindSpore Transformers repository](https://gitee.com/mindspore/mindformers/tree/r1.5.0/research/qwen2_5). For Qwen2.5-32B, the YAML file is [predict_qwen2_5_32b_instruct.yaml](https://gitee.com/mindspore/mindformers/blob/r1.5.0/research/qwen2_5/predict_qwen2_5_32b_instruct.yaml).
+- `MINDFORMERS_MODEL_CONFIG`: Model configuration file. Users can find the corresponding YAML file in the [MindSpore Transformers repository](https://gitee.com/mindspore/mindformers/tree/master/research/qwen2_5). For Qwen2.5-32B, the YAML file is [predict_qwen2_5_32b_instruct.yaml](https://gitee.com/mindspore/mindformers/blob/master/research/qwen2_5/predict_qwen2_5_32b_instruct.yaml).
Users can check memory usage with `npu-smi info` and set the NPU cards for inference using the following example (assuming cards 4,5,6,7 are used):
@@ -148,7 +144,7 @@ export ASCEND_RT_VISIBLE_DEVICES=4,5,6,7
## Online Inference
-vLLM MindSpore supports online inference deployment with the OpenAI API protocol. The following section would introduce how to [starting the service](#starting-the-service) and [send requests](#sending-requests) to obtain inference results, using [Qwen2.5-32B](https://huggingface.co/Qwen/Qwen2.5-32B-Instruct) as an example.
+vLLM-MindSpore Plugin supports online inference deployment with the OpenAI API protocol. The following section introduces how to [start the service](#starting-the-service) and [send requests](#sending-requests) to obtain inference results, using [Qwen2.5-32B](https://huggingface.co/Qwen/Qwen2.5-32B-Instruct) as an example.
### Starting the Service
@@ -160,7 +156,7 @@ export MAX_MODEL_LEN=1024
python3 -m vllm_mindspore.entrypoints vllm.entrypoints.openai.api_server --model "Qwen/Qwen2.5-32B-Instruct" --trust_remote_code --tensor-parallel-size $TENSOR_PARALLEL_SIZE --max-model-len $MAX_MODEL_LEN
```
-Here, `TENSOR_PARALLEL_SIZE` specifies the number of NPU cards, and `MAX_MODEL_LEN` sets the maximum output token length.
+Here, `TENSOR_PARALLEL_SIZE` specifies the number of NPU cards, and `MAX_MODEL_LEN` sets the maximum model context length in tokens. Users can also specify a local model path via the `--model` argument.
If the service starts successfully, similar output will be obtained:
```text
@@ -181,9 +177,11 @@ Engine 000: Avg prompt throughput: 0.0 tokens/s, Avg generation throughput: 0.0
Use the following command to send a request, where `prompt` is the model input:
```bash
-curl http://localhost:8000/v1/completions -H "Content-Type: application/json" -d '{"model": "Qwen2.5-32B-Instruct", "prompt": "I am", "max_tokens": 20, "temperature": 0}'
+curl http://localhost:8000/v1/completions -H "Content-Type: application/json" -d '{"model": "Qwen/Qwen2.5-32B-Instruct", "prompt": "I am", "max_tokens": 20, "temperature": 0}'
```
+Users need to ensure that the `"model"` field matches the `--model` argument used when starting the service, so that the request is correctly routed to the model.
+
If processed successfully, the inference result will be:
```text
diff --git a/docs/vllm_mindspore/docs/source_en/getting_started/tutorials/qwen2.5_7b_singleNPU/qwen2.5_7b_singleNPU.md b/docs/vllm_mindspore/docs/source_en/getting_started/tutorials/qwen2.5_7b_singleNPU/qwen2.5_7b_singleNPU.md
index 2c360e28f8010720b10792648cd758d2fc54acab..e4b2167fbc68a09bd0ce7176793d83ad3b850b46 100644
--- a/docs/vllm_mindspore/docs/source_en/getting_started/tutorials/qwen2.5_7b_singleNPU/qwen2.5_7b_singleNPU.md
+++ b/docs/vllm_mindspore/docs/source_en/getting_started/tutorials/qwen2.5_7b_singleNPU/qwen2.5_7b_singleNPU.md
@@ -2,15 +2,15 @@
[![View Source On Gitee](https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/master/resource/_static/logo_source_en.svg)](https://gitee.com/mindspore/docs/blob/master/docs/vllm_mindspore/docs/source_en/getting_started/tutorials/qwen2.5_7b_singleNPU/qwen2.5_7b_singleNPU.md)
-This document introduces single NPU inference process by vLLM MindSpore. Taking the [Qwen2.5-7B](https://huggingface.co/Qwen/Qwen2.5-7B-Instruct) model as an example, user can configure the environment through the [Docker Installation](#docker-installation) or the [Installation Guide](../../installation/installation.md#installation-guide), and [downloading model weights](#downloading-model-weights). After [setting environment variables](#setting-environment-variables), user can perform [offline inference](#offline-inference) and [online inference](#online-inference) to experience single NPU inference abilities.
+This document introduces the single-NPU inference process with vLLM-MindSpore Plugin. Taking the [Qwen2.5-7B](https://huggingface.co/Qwen/Qwen2.5-7B-Instruct) model as an example, users can configure the environment through the [Docker Installation](#docker-installation) or the [Installation Guide](../../installation/installation.md#installation-guide), and [download the model weights](#downloading-model-weights). After [setting environment variables](#setting-environment-variables), users can perform [offline inference](#offline-inference) and [online inference](#online-inference) to experience single NPU inference abilities.
## Docker Installation
-In this section, we recommend using Docker for quick deployment of the vLLM MindSpore environment. Below are the steps for Docker deployment:
+In this section, we recommend using Docker for quick deployment of the vLLM-MindSpore Plugin environment. Below are the steps for Docker deployment:
### Building the Image
-User can execute the following commands to clone the vLLM MindSpore code repository and build the image:
+Users can execute the following commands to clone the vLLM-MindSpore Plugin code repository and build the image:
```bash
git clone https://gitee.com/mindspore/vllm-mindspore.git
@@ -127,17 +127,13 @@ For [Qwen2.5-7B](https://huggingface.co/Qwen/Qwen2.5-7B-Instruct), the following
```bash
#set environment variables
-export ASCEND_TOTAL_MEMORY_GB=64 # Please use `npu-smi info` to check the memory.
export vLLM_MODEL_BACKEND=MindFormers # use MindSpore TransFormers as model backend.
-export vLLM_MODEL_MEMORY_USE_GB=32 # Memory reserved for model execution. Set according to the model's maximum usage, with the remaining environment used for kvcache allocation
export MINDFORMERS_MODEL_CONFIG=$YAML_PATH # Set the corresponding MindSpore Transformers model's YAML file.
```
Here is an explanation of these variables:
-- `ASCEND_TOTAL_MEMORY_GB`: The memory size of each compute card. Query using `npu-smi info`, corresponding to `HBM-Usage(MB)` in the results.
- `vLLM_MODEL_BACKEND`: The model backend. Currently supported models and backends are listed in the [Model Support List](../../../user_guide/supported_models/models_list/models_list.md).
-- `vLLM_MODEL_MEMORY_USE_GB`: Memory reserved for model loading. Adjust this if encountering insufficient memory.
- `MINDFORMERS_MODEL_CONFIG`: Model configuration file. User can find the corresponding YAML file in the [MindSpore Transformers repository](https://gitee.com/mindspore/mindformers/tree/master/research/qwen2_5). For Qwen2.5-7B, the YAML file is [predict_qwen2_5_7b_instruct.yaml](https://gitee.com/mindspore/mindformers/blob/master/research/qwen2_5/predict_qwen2_5_7b_instruct.yaml).
User can check memory usage with `npu-smi info` and set the compute card for inference using:
@@ -186,7 +182,7 @@ Prompt: 'Llama is'. Generated text: ' a 100% natural, biodegradable, and compost
## Online Inference
-vLLM MindSpore supports online inference deployment with the OpenAI API protocol. The following section would introduce how to [starting the service](#starting-the-service) and [send requests](#sending-requests) to obtain inference results, using [Qwen2.5-7B](https://huggingface.co/Qwen/Qwen2.5-7B-Instruct) as an example.
+vLLM-MindSpore Plugin supports online inference deployment with the OpenAI API protocol. The following section introduces how to [start the service](#starting-the-service) and [send requests](#sending-requests) to obtain inference results, using [Qwen2.5-7B](https://huggingface.co/Qwen/Qwen2.5-7B-Instruct) as an example.
### Starting the Service
@@ -196,7 +192,7 @@ Use the model `Qwen/Qwen2.5-7B-Instruct` and start the vLLM service with the fol
python3 -m vllm_mindspore.entrypoints vllm.entrypoints.openai.api_server --model "Qwen/Qwen2.5-7B-Instruct"
```
-If the service starts successfully, similar output will be obtained:
+Users can also specify a local model path via the `--model` argument. If the service starts successfully, similar output will be obtained:
```text
INFO: Started server process [6363]
@@ -218,6 +214,8 @@ Use the following command to send a request, where `prompt` is the model input:
curl http://localhost:8000/v1/completions -H "Content-Type: application/json" -d '{"model": "Qwen/Qwen2.5-7B-Instruct", "prompt": "I am", "max_tokens": 15, "temperature": 0}'
```
+Users need to ensure that the `"model"` field matches the `--model` argument used when starting the service, so that the request is correctly routed to the model.
+
If the request is processed successfully, the following inference result will be returned:
```text
diff --git a/docs/vllm_mindspore/docs/source_en/index.rst b/docs/vllm_mindspore/docs/source_en/index.rst
index 3163af72f89ad7a5ef6f231b6240e1b7b6a63450..a373198a6d30f4b945db68d5735ae168a34c62d0 100644
--- a/docs/vllm_mindspore/docs/source_en/index.rst
+++ b/docs/vllm_mindspore/docs/source_en/index.rst
@@ -1,22 +1,22 @@
-vLLM MindSpore
+vLLM-MindSpore Plugin
=========================================
Overview
-----------------------------------------------------
-vLLM MindSpore (`vllm-mindspore`) is a plugin brewed by the `MindSpore community `_ , which aims to integrate MindSpore LLM inference capabilities into `vLLM `_ . With vLLM MindSpore, technical strengths of Mindspore and vLLM will be organically combined to provide a full-stack open-source, high-performance, easy-to-use LLM inference solution.
+vLLM-MindSpore Plugin (`vllm-mindspore`) is a plugin brewed by the `MindSpore community `_ , which aims to integrate MindSpore LLM inference capabilities into `vLLM `_ . With vLLM-MindSpore Plugin, the technical strengths of MindSpore and vLLM are organically combined to provide a full-stack open-source, high-performance, easy-to-use LLM inference solution.
vLLM, an opensource and community-driven project initiated by Sky Computing Lab, UC Berkeley, has been widely used in academic research and industry applications. On the basis of Continuous Batching scheduling mechanism and PagedAttention Key-Value cache management, vLLM provides a rich set of inference service features, including speculative inference, Prefix Caching, Multi-LoRA, etc. vLLM also supports a wide range of open-source large models, including Transformer-based models (e.g., LLaMa), Mixture-of-Expert models (e.g., DeepSeek), Embedding models (e.g., E5-Mistral), and multi-modal models (e.g., LLaVA).
Because vLLM chooses to use PyTorch to build large models and manage storage resources, it cannot deploy large models built upon MindSpore. -vLLM MindSpore plugin aims to integrate Mindspore large models into vLLM and to enable deploying MindSpore-based LLM inference services. It follows the following design principles: +vLLM-MindSpore Plugin aims to integrate Mindspore large models into vLLM and to enable deploying MindSpore-based LLM inference services. It follows the following design principles: - Interface compatibility: support the native APIs and service deployment interfaces of vLLM to avoid adding new configuration files or interfaces, reducing user learning costs and ensuring ease of use. - Minimal invasive modifications: minimize invasive modifications to the vLLM code to ensure system maintainability and evolvability. - Component decoupling: minimize and standardize the coupling between MindSpore large model components and vLLM service components to facilitate the integration of various MindSpore large model suites. -On the basis of the above design principles, vLLM MindSpore adopts the system architecture shown in the figure below, and implements the docking between vLLM and Mindspore in categories of components: +On the basis of the above design principles, vLLM-MindSpore Plugin adopts the system architecture shown in the figure below, and implements the docking between vLLM and Mindspore in categories of components: -- Service components: vLLM MindSpore maps PyTorch API calls in service components including LLMEngine and Scheduler to MindSpore capabilities, inheriting support for service functions like Continuous Batching and PagedAttention. -- Model components: vLLM MindSpore registers or replaces model components including models, network layers, and custom operators, and integrates MindSpore Transformers, MindSpore One, and other MindSpore large model suites, as well as custom large models, into vLLM. +- Service components: vLLM-MindSpore Plugin maps PyTorch API calls in service components including LLMEngine and Scheduler to MindSpore capabilities, inheriting support for service functions like Continuous Batching and PagedAttention. +- Model components: vLLM-MindSpore Plugin registers or replaces model components including models, network layers, and custom operators, and integrates MindSpore Transformers, MindSpore One, and other MindSpore large model suites, as well as custom large models, into vLLM. .. raw:: html @@ -28,7 +28,7 @@ On the basis of the above design principles, vLLM MindSpore adopts the system ar -vLLM MindSpore uses the plugin mechanism recommended by the vLLM community to realize capability registration. In the future, we expect to promote vLLM community to support integration of inference capabilities of third-party AI frameworks, including PaddlePaddle and JAX by following principles described in `[RPC] Multi-framework support for vllm `_ . +vLLM-MindSpore Plugin uses the plugin mechanism recommended by the vLLM community to realize capability registration. In the future, we expect to promote vLLM community to support integration of inference capabilities of third-party AI frameworks, including PaddlePaddle and JAX by following principles described in `[RPC] Multi-framework support for vllm `_ . 
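In practice, this registration is transparent to users: as the tutorials in this documentation show, vLLM is simply launched through the plugin's wrapper entry point (or after importing ``vllm_mindspore`` in a Python script), for example:

.. code-block:: bash

   # Start vLLM's OpenAI-compatible server with the MindSpore backend provided by the plugin
   python3 -m vllm_mindspore.entrypoints vllm.entrypoints.openai.api_server --model "Qwen/Qwen2.5-7B-Instruct"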
Code: @@ -58,7 +58,7 @@ Branch ----------------------------------------------------- The vllm-mindspore repository contains the main branch, development branch, and version branches: -- **main**: the main branch, compatible with Mindspore master branch and vLLM v0.8.3 version, is continuously monitored for quality through Ascend-MindSpore CI. +- **main**: the main branch, compatible with Mindspore master branch and vLLM v0.9.1 version, is continuously monitored for quality through Ascend-MindSpore CI. - **develop**: the development branch for adapting vLLM features, which is forked from the main branch when a new vLLM version is released. Once the adapted features is stable, it will be merged into the main branch. The current development branch is adapting vLLM v0.9.1 version. - **rX.Y.Z**: version branches used for archiving version release, which is forked from the main branch after the adaptation of a certain vLLM version is completed. @@ -72,7 +72,7 @@ The following are the version branches: - Notes * - master - Maintained - - Compatible with vLLM v0.8.3, and CI commitment for MindSpore master branch + - Compatible with vLLM v0.9.1, and CI commitment for MindSpore master branch * - develop - Maintained - Compatible with vLLM v0.9.1 @@ -82,10 +82,13 @@ The following are the version branches: * - r0.2 - Maintained - Compatible with vLLM v0.7.3, and CI commitment for MindSpore 2.6.0 + * - r0.3.0 + - Maintained + - Compatible with vLLM v0.8.3, and CI commitment for MindSpore 2.7.0 SIG ----------------------------------------------------- -- Welcome to join vLLM MindSpore SIG to participate in the co-construction of open-source projects and industrial cooperation: https://www.mindspore.cn/community/SIG +- Welcome to join vLLM-MindSpore Plugin SIG to participate in the co-construction of open-source projects and industrial cooperation: https://www.mindspore.cn/community/SIG - SIG meetings, every other Friday or Saturday evening, 20:00 - 21:00 (UTC+8, `Convert to your timezone `_ ) License diff --git a/docs/vllm_mindspore/docs/source_en/release_notes/release_notes.md b/docs/vllm_mindspore/docs/source_en/release_notes/release_notes.md index 4b22bc8bf9a260819deaeba8bf11defa4da00fd0..ea96f2fb1364f72b7acfd21d79118b7c9c22171c 100644 --- a/docs/vllm_mindspore/docs/source_en/release_notes/release_notes.md +++ b/docs/vllm_mindspore/docs/source_en/release_notes/release_notes.md @@ -2,9 +2,9 @@ [![View Source On Gitee](https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/master/resource/_static/logo_source_en.svg)](https://gitee.com/mindspore/docs/blob/master/docs/vllm_mindspore/docs/source_en/release_notes/release_notes.md) -## vLLM MindSpore 0.3.0 Release Notes +## vLLM-MindSpore Plugin 0.3.0 Release Notes -The following are the key new features and models supported in the vLLM MindSpore plugin version 0.3.0. +The following are the key new features and models supported in the vLLM-MindSpore Plugin version 0.3.0. 
### New Features diff --git a/docs/vllm_mindspore/docs/source_en/user_guide/environment_variables/environment_variables.md b/docs/vllm_mindspore/docs/source_en/user_guide/environment_variables/environment_variables.md index c1b71616260505fa7705916b0598ded2aa098674..054328a0ff81c21355adebd30230661fb5ef4127 100644 --- a/docs/vllm_mindspore/docs/source_en/user_guide/environment_variables/environment_variables.md +++ b/docs/vllm_mindspore/docs/source_en/user_guide/environment_variables/environment_variables.md @@ -4,13 +4,20 @@ | Environment Variable | Function | Type | Values | Description | |----------------------|----------|------|--------|-------------| -| `vLLM_MODEL_BACKEND` | Specifies the model backend. Not Required when using vLLM MindSpore native models, and required when using an external vLLM MindSpore models. | String | `MindFormers`: Model source is MindSpore Transformers. | vLLM MindSpore native model backend supports Qwen2.5 series. MindSpore Transformers model backend supports Qwen/DeepSeek/Llama series models, and the environment variable: `export PYTHONPATH=/path/to/mindformers/:$PYTHONPATH` needs to be set. | +| `vLLM_MODEL_BACKEND` | Specifies the model backend. Not Required when using vLLM-MindSpore Plugin native models, and required when using an external vLLM-MindSpore Plugin models. | String | `MindFormers`: Model source is MindSpore Transformers. | vLLM-MindSpore Plugin native model backend supports Qwen2.5 series. MindSpore Transformers model backend supports Qwen/DeepSeek/Llama series models, and the environment variable: `export PYTHONPATH=/path/to/mindformers/:$PYTHONPATH` needs to be set. | | `MINDFORMERS_MODEL_CONFIG` | Configuration file for MindSpore Transformers models. Required for Qwen2.5 series or DeepSeek series models. | String | Path to the model configuration file | **This environment variable will be removed in future versions.** Example: `export MINDFORMERS_MODEL_CONFIG=/path/to/research/deepseek3/deepseek_r1_671b/predict_deepseek_r1_671b_w8a8.yaml`. | | `GLOO_SOCKET_IFNAME` | Specifies the network interface name for inter-machine communication using gloo. | String | Interface name (e.g., `enp189s0f0`). | Used in multi-machine scenarios. The interface name can be found via `ifconfig` by matching the IP address. | | `TP_SOCKET_IFNAME` | Specifies the network interface name for inter-machine communication using TP. | String | Interface name (e.g., `enp189s0f0`). | Used in multi-machine scenarios. The interface name can be found via `ifconfig` by matching the IP address. | | `HCCL_SOCKET_IFNAME` | Specifies the network interface name for inter-machine communication using HCCL. | String | Interface name (e.g., `enp189s0f0`). | Used in multi-machine scenarios. The interface name can be found via `ifconfig` by matching the IP address. | | `ASCEND_RT_VISIBLE_DEVICES` | Specifies which devices are visible to the current process, supporting one or multiple Device IDs. | String | Device IDs as a comma-separated string (e.g., `"0,1,2,3,4,5,6,7"`). | Recommended for Ray usage scenarios. | | `HCCL_BUFFSIZE` | Controls the buffer size for data sharing between two NPUs. | int | Buffer size in MB (e.g., `2048`). | Usage reference: [HCCL_BUFFSIZE](https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/81RC1beta1/maintenref/envvar/envref_07_0080.html). Example: For DeepSeek hybrid parallelism (Data Parallel: 32, Expert Parallel: 32) with `max-num-batched-tokens=256`, set `export HCCL_BUFFSIZE=2048`. 
| -| MS_MEMPOOL_BLOCK_SIZE | Set the size of the memory pool block in PyNative mode for devices | String | String of positive number, and the unit is GB. | | -| vLLM_USE_NPU_ADV_STEP_FLASH_OP | Whether to use Ascend operation `adv_step_flash` | String | `on`: Use;`off`:Not use | If the variable is set to `off`, model will use the implement of small operations. | -| VLLM_TORCH_PROFILER_DIR | Enables profiling data collection and takes effect when a data save path is configured. | String | The path to save profiling data. | | +| `MS_MEMPOOL_BLOCK_SIZE` | Set the size of the memory pool block in PyNative mode for devices | String | String of positive number, and the unit is GB. | | +| `vLLM_USE_NPU_ADV_STEP_FLASH_OP` | Whether to use Ascend operation `adv_step_flash` | String | `on`: Use;`off`:Not use | If the variable is set to `off`, model will use the implement of small operations. | +| `VLLM_TORCH_PROFILER_DIR` | Enables profiling data collection and takes effect when a data save path is configured. | String | The path to save profiling data. | | + +More environment variable information can be referred in the following links: + +- [CANN Environment Variable List](https://www.hiascend.com/document/detail/en/CANNCommunityEdition/81RC1beta1/index/index.html) +- [MindSpore Environment Variable List](https://www.mindspore.cn/docs/en/master/api_python/env_var_list.html) +- [MindSpore Transformers Environment Variable List](https://www.mindspore.cn/mindformers/docs/en/master/index.html) +- [vLLM Environment Variable List](https://docs.vllm.ai/en/v0.8.4/serving/env_vars.html) diff --git a/docs/vllm_mindspore/docs/source_en/user_guide/supported_features/benchmark/benchmark.md b/docs/vllm_mindspore/docs/source_en/user_guide/supported_features/benchmark/benchmark.md index b704a49f442c313bdc3b4a38029672b11ab39734..e72a1fee5400497cfac63acb948619f3d4a76daa 100644 --- a/docs/vllm_mindspore/docs/source_en/user_guide/supported_features/benchmark/benchmark.md +++ b/docs/vllm_mindspore/docs/source_en/user_guide/supported_features/benchmark/benchmark.md @@ -2,16 +2,14 @@ [![View Source On Gitee](https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/master/resource/_static/logo_source_en.svg)](https://gitee.com/mindspore/docs/blob/master/docs/vllm_mindspore/docs/source_en/user_guide/supported_features/benchmark/benchmark.md) -The benchmark tool of vLLM MindSpore is inherited from vLLM. You can refer to the [vLLM BenchMark](https://github.com/vllm-project/vllm/blob/main/benchmarks/README.md) documentation for more details. This document introduces [Online Benchmark](#online-benchmark) and [Offline Benchmark](#offline-benchmark). Users can follow the steps to conduct performance tests. +The benchmark tool of vLLM-MindSpore Plugin is inherited from vLLM. You can refer to the [vLLM BenchMark](https://github.com/vllm-project/vllm/blob/main/benchmarks/README.md) documentation for more details. This document introduces [Online Benchmark](#online-benchmark) and [Offline Benchmark](#offline-benchmark). Users can follow the steps to conduct performance tests. ## Online Benchmark For single-card inference, we take [Qwen2.5-7B](https://huggingface.co/Qwen/Qwen2.5-7B-Instruct) as an example. You can prepare the environment by following the guide [Single-Card Inference (Qwen2.5-7B)](../../../getting_started/tutorials/qwen2.5_7b_singleNPU/qwen2.5_7b_singleNPU.md#online-inference), set the environment variables: ```bash -export ASCEND_TOTAL_MEMORY_GB=64 # Please use `npu-smi info` to check the memory. 
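# Note: $YAML_PATH is not set automatically; point it to the MindSpore Transformers YAML of the model under test.
# Illustrative value for Qwen2.5-7B (adjust to your local mindformers checkout):
# YAML_PATH=/path/to/mindformers/research/qwen2_5/predict_qwen2_5_7b_instruct.yaml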
export vLLM_MODEL_BACKEND=MindFormers # use MindSpore Transformers as model backend. -export vLLM_MODEL_MEMORY_USE_GB=32 # Memory reserved for model execution. Set according to the model's maximum usage, with the remaining environment used for kvcache allocation export MINDFORMERS_MODEL_CONFIG=$YAML_PATH # Set the corresponding MindSpore Transformers model's YAML file. ``` @@ -37,7 +35,7 @@ INFO: Waiting for application startup. INFO: Application startup complete. ``` -Clone the vLLM repository and import the vLLM MindSpore plugin to reuse the benchmark tools: +Clone the vLLM repository and import the vLLM-MindSpore Plugin to reuse the benchmark tools: ```bash export VLLM_BRANCH=v0.9.1 @@ -46,7 +44,7 @@ cd vllm sed -i '1i import vllm_mindspore' benchmarks/benchmark_serving.py ``` -Here, `VLLM_BRANCH` refers to the branch name of vLLM, which needs to be compatible with vLLM MindSpore. For compatibility details, please refer to [here](../../../getting_started/installation/installation.md#version-compatibility). +Here, `VLLM_BRANCH` refers to the branch name of vLLM, which needs to be compatible with vLLM-MindSpore Plugin. For compatibility details, please refer to [here](../../../getting_started/installation/installation.md#version-compatibility). Execute the test script: @@ -104,9 +102,7 @@ P99 ITL (ms): .... For offline performance benchmark, take [Qwen2.5-7B](https://huggingface.co/Qwen/Qwen2.5-7B-Instruct) as an example. Prepare the environment by following the guide [Single-Card Inference (Qwen2.5-7B)](../../../getting_started/tutorials/qwen2.5_7b_singleNPU/qwen2.5_7b_singleNPU.md#offline-inference). User need to set the environment variables: ```bash -export ASCEND_TOTAL_MEMORY_GB=64 # Please use `npu-smi info` to check the memory. export vLLM_MODEL_BACKEND=MindFormers # use MindSpore Transformers as model backend. -export vLLM_MODEL_MEMORY_USE_GB=32 # Memory reserved for model execution. Set according to the model's maximum usage, with the remaining environment used for kvcache allocation export MINDFORMERS_MODEL_CONFIG=$YAML_PATH # Set the corresponding MindSpore Transformers model's YAML file. ``` @@ -119,7 +115,7 @@ cd vllm sed -i '1i import vllm_mindspore' benchmarks/benchmark_throughput.py ``` -Here, `VLLM_BRANCH` refers to the branch name of vLLM, which needs to be compatible with vLLM MindSpore. For compatibility details, please refer to [here](../../../getting_started/installation/installation.md#version-compatibility). +Here, `VLLM_BRANCH` refers to the branch name of vLLM, which needs to be compatible with vLLM-MindSpore Plugin. For compatibility details, please refer to [here](../../../getting_started/installation/installation.md#version-compatibility). Run the test script with the following command. 
The script below will start the model automatically, and user does not need to start the model manually: diff --git a/docs/vllm_mindspore/docs/source_en/user_guide/supported_features/features_list/features_list.md b/docs/vllm_mindspore/docs/source_en/user_guide/supported_features/features_list/features_list.md index 67530546706d489f1c2723054b5f2e40b6949558..59680c244f3d017157a56d07be473ff6219a906a 100644 --- a/docs/vllm_mindspore/docs/source_en/user_guide/supported_features/features_list/features_list.md +++ b/docs/vllm_mindspore/docs/source_en/user_guide/supported_features/features_list/features_list.md @@ -2,9 +2,9 @@ [![View Source On Gitee](https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/master/resource/_static/logo_source_en.svg)](https://gitee.com/mindspore/docs/blob/master/docs/vllm_mindspore/docs/source_en/user_guide/supported_features/features_list/features_list.md) -The features supported by vLLM MindSpore are consistent with the community version of vLLM. For feature descriptions and usage, please refer to the [vLLM Official Documentation](https://docs.vllm.ai/en/latest/). +The features supported by vLLM-MindSpore Plugin are consistent with the community version of vLLM. For feature descriptions and usage, please refer to the [vLLM Official Documentation](https://docs.vllm.ai/en/latest/). -The following is the features supported in vLLM MindSpore. +The following is the features supported in vLLM-MindSpore Plugin. | **Features** | **vLLM V0** | **vLLM V1** | |-----------------------------------|--------------------|--------------------| @@ -39,5 +39,5 @@ The following is the features supported in vLLM MindSpore. ## Feature Description -- LoRA currently only supports the Qwen2.5 vLLM MindSpore native model, other models are in the process of adaptation; +- LoRA currently only supports the Qwen2.5 vLLM-MindSpore Plugin native model, other models are in the process of adaptation; - Tool Calling only supports DeepSeek V3 0324 W8A8 model. diff --git a/docs/vllm_mindspore/docs/source_en/user_guide/supported_features/profiling/profiling.md b/docs/vllm_mindspore/docs/source_en/user_guide/supported_features/profiling/profiling.md index b24b541e59a4a4184de1e3619164b7951fcddd11..3810ec6a8c2d98f8802439a171e7bba85991f681 100644 --- a/docs/vllm_mindspore/docs/source_en/user_guide/supported_features/profiling/profiling.md +++ b/docs/vllm_mindspore/docs/source_en/user_guide/supported_features/profiling/profiling.md @@ -2,7 +2,7 @@ [![View Source On Gitee](https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/master/resource/_static/logo_source_en.svg)](https://gitee.com/mindspore/docs/blob/master/docs/vllm_mindspore/docs/source_en/user_guide/supported_features/profiling/profiling.md) -vLLM MindSpore supports the `mindspore.Profiler` module to track the performance of workers in vLLM MindSpore. User can follow the [Collecting Profiling Data](#collecting-profiling-data) section to gather data and then analyze it according to [Analyzing Profiling Data](#analyzing-profiling-data). Additionally, user can inspect the model's IR graph through [Graph Data Dump](#graph-data-dump) to analyze and debug the model structure. +vLLM-MindSpore Plugin supports the `mindspore.Profiler` module to track the performance of workers in vLLM-MindSpore Plugin. User can follow the [Collecting Profiling Data](#collecting-profiling-data) section to gather data and then analyze it according to [Analyzing Profiling Data](#analyzing-profiling-data). 
Additionally, user can inspect the model's IR graph through [Graph Data Dump](#graph-data-dump) to analyze and debug the model structure. ## Collecting Profiling Data @@ -12,7 +12,7 @@ To enable profiling data collection, user need to set the `VLLM_TORCH_PROFILER_D export VLLM_TORCH_PROFILER_DIR=/path/to/save/vllm_profile ``` -After setting the variable, Run the following command to launch the vLLM MindSpore service. We take [Qwen2.5-32B](https://huggingface.co/Qwen/Qwen2.5-32B-Instruct) as an example: +After setting the variable, Run the following command to launch the vLLM-MindSpore Plugin service. We take [Qwen2.5-32B](https://huggingface.co/Qwen/Qwen2.5-32B-Instruct) as an example: ```bash export TENSOR_PARALLEL_SIZE=4 @@ -40,7 +40,7 @@ curl -X POST http://127.0.0.1:8000/start_profile curl http://localhost:8000/v1/completions \ -H "Content-Type: application/json" \ -d '{ - "model": "/home/DeepSeekV3", + "model": "Qwen/Qwen2.5-32B-Instruct", "prompt": "San Francisco is a", "max_tokens": 7, "temperature": 0 diff --git a/docs/vllm_mindspore/docs/source_en/user_guide/supported_features/quantization/quantization.md b/docs/vllm_mindspore/docs/source_en/user_guide/supported_features/quantization/quantization.md index 401768ca1af91c1442bee6a3b3d8960921fcea92..fb135bfe73a1767dca700fdddbf8c22fe86e6c9a 100644 --- a/docs/vllm_mindspore/docs/source_en/user_guide/supported_features/quantization/quantization.md +++ b/docs/vllm_mindspore/docs/source_en/user_guide/supported_features/quantization/quantization.md @@ -16,7 +16,7 @@ We employ [MindSpore Golden Stick's PTQ algorithm](https://gitee.com/mindspore/g ### Downloading Quantized Weights -We have uploaded the quantized DeepSeek-R1 to [ModelArts Community](https://modelers.cn): [MindSpore-Lab/DeepSeek-R1-W8A8](https://modelers.cn/models/MindSpore-Lab/DeepSeek-R1-W8A8). Refer to the [ModelArts Community documentation](https://modelers.cn/docs/en/openmind-hub-client/0.9/basic_tutorial/download.html) to download the weights locally. +We have uploaded the quantized DeepSeek-R1 to [ModelArts Community](https://modelers.cn): [MindSpore-Lab/DeepSeek-R1-0528-A8W8](https://modelers.cn/models/MindSpore-Lab/DeepSeek-R1-0528-A8W8). Refer to the [ModelArts Community documentation](https://modelers.cn/docs/en/openmind-hub-client/0.9/basic_tutorial/download.html) to download the weights locally. ## Quantized Model Inference @@ -24,12 +24,10 @@ After obtaining the DeepSeek-R1 W8A8 weights, ensure they are stored in the rela ### Offline Inference -Refer to the [Installation Guide](../../../getting_started/installation/installation.md) to set up the vLLM MindSpore environment. User need to set the following environment variables: +Refer to the [Installation Guide](../../../getting_started/installation/installation.md) to set up the vLLM-MindSpore Plugin environment. User need to set the following environment variables: ```bash -export ASCEND_TOTAL_MEMORY_GB=64 # Please use `npu-smi info` to check the memory. export vLLM_MODEL_BACKEND=MindFormers # use MindSpore Transformers as model backend. -export vLLM_MODEL_MEMORY_USE_GB=32 # Memory reserved for model execution. Set according to the model's maximum usage, with the remaining environment used for kvcache allocation export MINDFORMERS_MODEL_CONFIG=$YAML_PATH # Set the corresponding MindSpore Transformers model's YAML file. 
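# Example for the DeepSeek-R1 W8A8 weights (illustrative path; use your local MindSpore Transformers checkout):
# export MINDFORMERS_MODEL_CONFIG=/path/to/research/deepseek3/deepseek_r1_671b/predict_deepseek_r1_671b_w8a8.yaml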
``` diff --git a/docs/vllm_mindspore/docs/source_en/user_guide/supported_models/models_list/models_list.md b/docs/vllm_mindspore/docs/source_en/user_guide/supported_models/models_list/models_list.md index ba825bcae5a769b9217aae4ba8fb808e819b4f85..3d9b49bd6e3f3bd9a9f9bb1cd614cc201c8d1fa9 100644 --- a/docs/vllm_mindspore/docs/source_en/user_guide/supported_models/models_list/models_list.md +++ b/docs/vllm_mindspore/docs/source_en/user_guide/supported_models/models_list/models_list.md @@ -6,7 +6,7 @@ |-------| --------- | ---- | | DeepSeek-V3 | Supported | [DeepSeek-V3](https://modelers.cn/models/MindSpore-Lab/DeepSeek-V3) | | DeepSeek-R1 | Supported | [DeepSeek-R1](https://huggingface.co/deepseek-ai/DeepSeek-V3) | -| DeepSeek-R1 W8A8 | Supported | [Deepseek-R1-W8A8](https://modelers.cn/models/MindSpore-Lab/DeepSeek-r1-w8a8) | +| DeepSeek-R1 W8A8 | Supported | [Deepseek-R1-W8A8](https://modelers.cn/models/MindSpore-Lab/DeepSeek-R1-0528-A8W8) | | Qwen2.5 | Supported | [Qwen2.5-0.5B-Instruct](https://huggingface.co/Qwen/Qwen2.5-0.5B-Instruct), [Qwen2.5-1.5B-Instruct](https://huggingface.co/Qwen/Qwen2.5-1.5B-Instruct), [Qwen2.5-3B-Instruct](https://huggingface.co/Qwen/Qwen2.5-3B-Instruct), [Qwen2.5-7B-Instruct](https://huggingface.co/Qwen/Qwen2.5-7B-Instruct), [Qwen2.5-14B-Instruct](https://huggingface.co/Qwen/Qwen2.5-14B-Instruct), [Qwen2.5-32B-Instruct](https://huggingface.co/Qwen/Qwen2.5-32B-Instruct), [Qwen2.5-72B-Instruct](https://huggingface.co/Qwen/Qwen2.5-72B-Instruct) | | Qwen3-32B | Supported | [Qwen3-32B](https://modelers.cn/models/MindSpore-Lab/Qwen3-32B) | | Qwen3-235B-A22B | Supported | [Qwen3-235B-A22B](https://huggingface.co/Qwen/Qwen3-235B-A22B) | diff --git a/docs/vllm_mindspore/docs/source_zh_cn/conf.py b/docs/vllm_mindspore/docs/source_zh_cn/conf.py index 509c2877de7bba1581c022267fb9c94418f1cd54..2ccb11460862e62f11e226d72054cc834363273f 100644 --- a/docs/vllm_mindspore/docs/source_zh_cn/conf.py +++ b/docs/vllm_mindspore/docs/source_zh_cn/conf.py @@ -23,9 +23,9 @@ from sphinx.ext import autodoc as sphinx_autodoc # -- Project information ----------------------------------------------------- -project = 'vLLM MindSpore' +project = 'vLLM-MindSpore插件' copyright = 'MindSpore' -author = 'vLLM MindSpore' +author = 'vLLM-MindSpore插件' # The full version, including alpha/beta/rc tags release = 'master' diff --git a/docs/vllm_mindspore/docs/source_zh_cn/developer_guide/contributing.md b/docs/vllm_mindspore/docs/source_zh_cn/developer_guide/contributing.md index aef574b2599f9a330fbe76ee4451bff571e73ee5..2b490681046580c3b9ba2c163d90309ae9c343be 100644 --- a/docs/vllm_mindspore/docs/source_zh_cn/developer_guide/contributing.md +++ b/docs/vllm_mindspore/docs/source_zh_cn/developer_guide/contributing.md @@ -14,11 +14,11 @@ ## 增加新模型 -若希望将一个新模型合入vLLM MindSpore代码仓库,需要注意几点: +若希望将一个新模型合入vLLM-MindSpore插件代码仓库,需要注意几点: - **文件格式及位置要遵循规范。** 模型代码文件统一放置于`vllm_mindspore/model_executor`文件夹下,请根据不同模型将代码文件放置于对应的文件夹下。 -- **模型基于MindSpore接口实现,支持jit静态图方式执行。** vLLM MindSpore中的模型定义实现需基于MindSpore接口实现。由于MindSpore静态图模式执行性能有优势,因此模型需支持@jit静态图方式执行。详细可参考[Qwen2.5](https://gitee.com/mindspore/vllm-mindspore/blob/master/vllm_mindspore/model_executor/models/qwen2.py)模型定义实现。 -- **将新模型在vLLM MindSpore代码中进行注册。** 模型结构定义实现后,需要将该模型注册到vLLM MindSpore中,注册文件位于'vllm_mindspore/model_executor/models/registry.py'中,请将模型注册到`_NATIVE_MODELS`。 +- **模型基于MindSpore接口实现,支持jit静态图方式执行。** 
vLLM-MindSpore插件中的模型定义实现需基于MindSpore接口实现。由于MindSpore静态图模式执行性能有优势,因此模型需支持@jit静态图方式执行。详细可参考[Qwen2.5](https://gitee.com/mindspore/vllm-mindspore/blob/master/vllm_mindspore/model_executor/models/qwen2.py)模型定义实现。 +- **将新模型在vLLM-MindSpore插件代码中进行注册。** 模型结构定义实现后,需要将该模型注册到vLLM-MindSpore插件中,注册文件位于'vllm_mindspore/model_executor/models/registry.py'中,请将模型注册到`_NATIVE_MODELS`。 - **编写单元测试。** 新增的模型需同步提交单元测试用例,用例编写请参考[Qwen2.5模型用例](https://gitee.com/mindspore/vllm-mindspore/blob/master/tests/st/python/cases_parallel/vllm_qwen_7b.py)。 ## 贡献流程 @@ -29,13 +29,13 @@ - **编码指南:** 使用vLLM社区代码检查工具:yapf、codespell、ruff、isort和mypy。更多信息可参考[检查工具链使用说明](https://gitee.com/mindspore/vllm-mindspore/blob/master/codecheck_toolkits/README.md)。 -- **单元测试指南:** vLLM MindSpore使用Python单元测试框架[pytest](http://www.pytest.org/en/latest/)。注释名称需反映测试用例的设计意图。 +- **单元测试指南:** vLLM-MindSpore插件使用Python单元测试框架[pytest](http://www.pytest.org/en/latest/)。注释名称需反映测试用例的设计意图。 - **重构指南:** 我们鼓励开发人员重构我们的代码,以消除[代码坏味道](https://zh.wikipedia.org/wiki/%E4%BB%A3%E7%A0%81%E5%BC%82%E5%91%B3)。所有代码都要符合编码风格和测试风格,重构代码也不例外。 ### Fork-Pull开发模型 -- **Fork vLLM MindSpore代码仓:** 在提交代码至vLLM MindSpore项目之前,请确保已fork此项目到您自己的代码仓。vLLM MindSpore代码仓和您自己的代码仓之间可能会并行开发,请注意它们之间的一致性。 +- **Fork vLLM-MindSpore插件代码仓:** 在提交代码至vLLM-MindSpore插件项目之前,请确保已fork此项目到您自己的代码仓。vLLM-MindSpore插件代码仓和您自己的代码仓之间可能会并行开发,请注意它们之间的一致性。 - **克隆远程代码仓:** 如果您想将代码下载到本地计算机,最好使用git方法: @@ -63,7 +63,7 @@ git push origin {新分支名称} ``` -- **将请求拉取到vLLM MindSpore代码仓:** 在最后一步中,您需要在新分支和vLLM MindSpore主分支之间拉取比较请求然后创建PR。提交PR提交后,需要在评论中通过`/retest`手动触发门禁检查,进行构建测试。PR应该尽快合并到上游master分支中,以降低合并的风险。 +- **将请求拉取到vLLM-MindSpore插件代码仓:** 在最后一步中,您需要在新分支和vLLM-MindSpore插件主分支之间拉取比较请求然后创建PR。提交PR提交后,需要在评论中通过`/retest`手动触发门禁检查,进行构建测试。PR应该尽快合并到上游master分支中,以降低合并的风险。 ### 报告Issue @@ -71,7 +71,7 @@ 报告issue时,请参考以下格式: -- 说明您使用的环境版本(vLLM MindSpore、MindSpore TransFormers、MindSpore、OS、Python等); +- 说明您使用的环境版本(vLLM-MindSpore插件、MindSpore TransFormers、MindSpore、OS、Python等); - 说明是错误报告还是功能需求; - 说明issue类型,添加标签可以在issue板上突出显示该issue; - 问题是什么; @@ -99,4 +99,4 @@ - 确保您的分支与主分支始终一致。 - 用于修复错误的PR中,确保已关联所有相关问题。 -最后,感谢您对为vLLM MindSpore项目做出贡献的兴趣,我们欢迎并重视任何形式的贡献与合作。 +最后,感谢您对为vLLM-MindSpore插件项目做出贡献的兴趣,我们欢迎并重视任何形式的贡献与合作。 diff --git a/docs/vllm_mindspore/docs/source_zh_cn/developer_guide/operations/custom_ops.md b/docs/vllm_mindspore/docs/source_zh_cn/developer_guide/operations/custom_ops.md index 357fbb0ed193bef12d71adc67c82d760f555ab99..3e08c978c843c503c35371d656a8612fdeaa7bdf 100644 --- a/docs/vllm_mindspore/docs/source_zh_cn/developer_guide/operations/custom_ops.md +++ b/docs/vllm_mindspore/docs/source_zh_cn/developer_guide/operations/custom_ops.md @@ -4,15 +4,15 @@ 当内置算子不满足需求时,你可以利用MindSpore提供的自定义算子功能接入你的算子。 -本文档将以 **`advance_step_flashattn`** 算子为例,讲解如何在 vLLM MindSpore 项目中接入一个AscendC自定义算子。 +本文档将以 **`advance_step_flashattn`** 算子为例,讲解如何在vLLM-MindSpore插件项目中接入一个AscendC自定义算子。 -本文重点在于介绍把算子集成进vLLM MindSpore的流程,自定义算子的细节请参考 MindSpore 官方教程:[基于CustomOpBuilder的自定义算子](https://www.mindspore.cn/tutorials/zh-CN/master/custom_program/operation/op_customopbuilder.html)。AscendC算子的开发流程请参考昇腾官方文档:[Ascend C算子开发](https://www.hiascend.com/document/detail/zh/canncommercial/81RC1/developmentguide/opdevg/Ascendcopdevg/atlas_ascendc_10_0001.html)。 +本文重点在于介绍把算子集成进vLLM-MindSpore插件的流程,自定义算子的细节请参考 MindSpore 官方教程:[基于CustomOpBuilder的自定义算子](https://www.mindspore.cn/tutorials/zh-CN/master/custom_program/operation/op_customopbuilder.html)。AscendC算子的开发流程请参考昇腾官方文档:[Ascend C算子开发](https://www.hiascend.com/document/detail/zh/canncommercial/81RC1/developmentguide/opdevg/Ascendcopdevg/atlas_ascendc_10_0001.html)。 -**注:目前vLLM 
MindSpore的自定义算子仅支持动态图(PyNative Mode)场景。** +**注:目前vLLM-MindSpore插件的自定义算子仅支持动态图(PyNative Mode)场景。** ## 文件组织结构 -接入自定义算子需要在 vLLM MindSpore 项目的 `csrc` 目录下添加代码,目录结构如下: +接入自定义算子需要在vLLM-MindSpore插件项目的 `csrc` 目录下添加代码,目录结构如下: ```text vllm-mindspore/ @@ -111,11 +111,11 @@ VLLM_MS_EXTENSION_MODULE(m) { 上面`m.def()`接口的第一个参数`"advance_step_flashattn"`就是算子的Python接口名。 -`module.h` 和 `module.cpp` 文件的作用是基于pybind11创建算子的Python模块。因为一个动态库内只能有一个 `PYBIND11_MODULE` ,为了让用户可以在一个文件内完成算子接入工作,vLLM MindSpore提供了一个新的注册接口 `VLLM_MS_EXTENSION_MODULE` 宏。自定义算子动态库加载时,所有算子接口都会被自动注册到同一个Python模块中。 +`module.h` 和 `module.cpp` 文件的作用是基于pybind11创建算子的Python模块。因为一个动态库内只能有一个 `PYBIND11_MODULE` ,为了让用户可以在一个文件内完成算子接入工作,vLLM-MindSpore插件提供了一个新的注册接口 `VLLM_MS_EXTENSION_MODULE` 宏。自定义算子动态库加载时,所有算子接口都会被自动注册到同一个Python模块中。 ### 算子调用接口 -vLLM MindSpore的自定义算子被编译到了 `_C_ops.so` 里面,为了方便调用,可以在 `vllm_mindspore/_custom_ops.py` 添加一个调用接口。如果在算子调用前后需要做额外适配,也可以在这接口内实现。 +vLLM-MindSpore插件的自定义算子被编译到了 `_C_ops.so` 里面,为了方便调用,可以在 `vllm_mindspore/_custom_ops.py` 添加一个调用接口。如果在算子调用前后需要做额外适配,也可以在这接口内实现。 ```python def advance_step_flashattn(num_seqs: int, num_queries: int, block_size: int, @@ -142,8 +142,8 @@ def advance_step_flashattn(num_seqs: int, num_queries: int, block_size: int, ### 算子编译和测试 -1. **代码集成**:将代码集成至 vLLM MindSpore 项目。 -2. **编译项目**:在项目代码根目录下,执行 `pip install .` ,编译安装vLLM MindSpore。 +1. **代码集成**:将代码集成至vLLM-MindSpore插件项目。 +2. **编译项目**:在项目代码根目录下,执行 `pip install .` ,编译安装vLLM-MindSpore插件。 3. **测试算子接口**:通过 `_custom_ops` 调用算子接口,可以参考测试用例[test_custom_advstepflash.py](https://gitee.com/mindspore/vllm-mindspore/blob/master/tests/st/python/test_custom_advstepflash.py): ```python @@ -154,18 +154,18 @@ custom_ops.advance_step_flashattn(...) ## 自定义算子编译工程 -当前MindSpore仅提供了一个 [CustomOpBuilder接口](https://www.mindspore.cn/docs/zh-CN/master/api_python/ops/mindspore.ops.CustomOpBuilder.html) 用于在线编译自定义算子,接口内置了默认的编译和链接选项。vLLM MindSpore基于MindSpore的自定义算子功能接入算子,并编译成动态库随包发布。下面是编译流程介绍: +当前MindSpore仅提供了一个 [CustomOpBuilder接口](https://www.mindspore.cn/docs/zh-CN/master/api_python/ops/mindspore.ops.CustomOpBuilder.html) 用于在线编译自定义算子,接口内置了默认的编译和链接选项。vLLM-MindSpore插件基于MindSpore的自定义算子功能接入算子,并编译成动态库随包发布。下面是编译流程介绍: ### 算子扩展库模块 -在 `setup.py` 中,vLLM MindSpore添加了一个 `vllm_mindspore._C_ops` 扩展,并添加了相应的编译模块: +在 `setup.py` 中,vLLM-MindSpore插件添加了一个 `vllm_mindspore._C_ops` 扩展,并添加了相应的编译模块: ```python ext_modules = [Extension("vllm_mindspore._C_ops", sources=[])], cmdclass = {"build_ext": CustomBuildExt}, ``` -这里不需要指定 `sources` ,是因为vLLM MindSpore通过CMake触发算子编译,自动收集了源文件。 +这里不需要指定 `sources` ,是因为vLLM-MindSpore插件通过CMake触发算子编译,自动收集了源文件。 ### 算子编译流程 diff --git a/docs/vllm_mindspore/docs/source_zh_cn/general/security.md b/docs/vllm_mindspore/docs/source_zh_cn/general/security.md index 575a3b1bf5d399dc9b6311701021cee447643e9d..63118b81957de86e8604b59da6e301e78cdf39bc 100644 --- a/docs/vllm_mindspore/docs/source_zh_cn/general/security.md +++ b/docs/vllm_mindspore/docs/source_zh_cn/general/security.md @@ -2,24 +2,24 @@ [![查看源文件](https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/master/resource/_static/logo_source.svg)](https://gitee.com/mindspore/docs/blob/master/docs/vllm_mindspore/docs/source_zh_cn/general/security.md) -通过 vLLM MindSpore 在 Ascend 上使能推理服务时,由于服务化、节点通信、模型执行等必要功能需要使用一些网络端口,因此会存在安全相关的一些问题。 +通过vLLM-MindSpore插件在 Ascend 上使能推理服务时,由于服务化、节点通信、模型执行等必要功能需要使用一些网络端口,因此会存在安全相关的一些问题。 ## 服务化端口配置 -使用 vLLM MindSpore 启动推理服务时,需要相关 IP 与端口信息,包括: +使用vLLM-MindSpore插件启动推理服务时,需要相关 IP 与端口信息,包括: -1. `host`: 配置服务关联的 IP 地址,默认值为 `0.0.0.0`。 +1. `host`: 配置服务关联的IP地址,默认值为 `0.0.0.0`。 2. `port`: 配置服务关联的端口,默认值为 `8000`。 -3. 
`data-parallel-address`: 配置数据并行管理 IP,默认值为 `127.0.0.1`。 - > 仅在使能了多节点数据并行的 vLLM 中生效。 +3. `data-parallel-address`: 配置数据并行管理IP,默认值为 `127.0.0.1`。 + > 仅在使能了多节点数据并行的vLLM中生效。 4. `data-parallel-rpc-port`: 配置数据并行管理端口,默认值为 `29550`。 - > 仅在使能了多节点数据并行的 vLLM 中生效。 + > 仅在使能了多节点数据并行的vLLM中生效。 ## 节点间通信 -通过 vLLM MindSpore 进行多节点部署时,使用默认配置进行节点间的通信是不安全的,包括以下场景: +通过vLLM-MindSpore插件进行多节点部署时,使用默认配置进行节点间的通信是不安全的,包括以下场景: -1. MindSpore 分布式通信。 +1. MindSpore分布式通信。 2. 模型TP、DP并行下的通信。 为了保证其安全性,应该部署在有足够安全的隔离网络环境中。 @@ -27,28 +27,28 @@ ### vLLM 中关于节点间通信的可配置项 1. 环境变量 - * `VLLM_HOST_IP`: 可配置 vLLM 中进程间用于通信的 IP 地址。主要作用场景是传递给运行框架进行分布式通信组网。 - * `VLLM_DP_MASTER_IP`: 设置数据并行主节点 IP 地址(非服务化启动数据并行场景),默认值为 `127.0.0.1`。 + * `VLLM_HOST_IP`: 可配置vLLM中进程间用于通信的IP地址。主要作用场景是传递给运行框架进行分布式通信组网。 + * `VLLM_DP_MASTER_IP`: 设置数据并行主节点IP地址(非服务化启动数据并行场景),默认值为 `127.0.0.1`。 * `VLLM_DP_MASTER_PORT`: 设置数据并行主节点端口(非服务化启动数据并行场景),默认值为 `0`。 2. 数据并行配置项 - * `data_parallel_master_ip`: 设置数据并行时的主节点 IP 地址,默认值为 `127.0.0.1`。 + * `data_parallel_master_ip`: 设置数据并行时的主节点IP地址,默认值为 `127.0.0.1`。 * `data_parallel_master_port`: 设置数据并行时的主节点端口,默认为 `29500`。 ### 执行框架分布式通信 -需要注意的是,vLLM MindSpore 当前通过 MindSpore 进行分布式通信,与其相关的安全问题应该参考 [MindSpore官网](https://www.mindspore.cn/)。 +需要注意的是,vLLM-MindSpore插件当前通过MindSpore进行分布式通信,与其相关的安全问题应该参考 [MindSpore官网](https://www.mindspore.cn/)。 ## 安全建议 1. 网络隔离 - * 在隔离的专用网络上部署 vLLM 节点。 + * 在隔离的专用网络上部署vLLM节点。 * 通过网络分段防止未经授权的访问。 * 设置恰当的防火墙规则。如: - * 除了 vLLM API 服务监听的端口外,阻止所以其他连接。 + * 除了vLLM API服务监听的端口外,阻止所以其他连接。 * 确保用于内部的通信端口仅被信任的主机或网络访。 * 不向公共互联网或不受信任的网络暴露内部端口。 2. 推荐配置行为 - * 总是配置相关参数,避免使用默认值,如通过 `VLLM_HOST_IP` 设置指定的 IP 地址。 + * 总是配置相关参数,避免使用默认值,如通过 `VLLM_HOST_IP` 设置指定的IP地址。 * 设定防火墙规则,只允许必要的端口有访问权限。 3. 管理访问权限 * 在部署环境实施物理层和网络层的访问限制。 diff --git a/docs/vllm_mindspore/docs/source_zh_cn/getting_started/installation/installation.md b/docs/vllm_mindspore/docs/source_zh_cn/getting_started/installation/installation.md index 8121916024cadc82ada5d9bbcfe0b86451ada08b..a979696d105be3b9a9936d5030f3c7262123d807 100644 --- a/docs/vllm_mindspore/docs/source_zh_cn/getting_started/installation/installation.md +++ b/docs/vllm_mindspore/docs/source_zh_cn/getting_started/installation/installation.md @@ -2,10 +2,10 @@ [![查看源文件](https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/master/resource/_static/logo_source.svg)](https://gitee.com/mindspore/docs/blob/master/docs/vllm_mindspore/docs/source_zh_cn/getting_started/installation/installation.md) -本文档将介绍安装vLLM MindSpore环境的操作步骤。分为三种安装方式: +本文档将介绍vLLM-MindSpore插件的[版本配套](#版本配套),vLLM-MindSpore插件的安装步骤,与[快速验证](#快速验证)用例,用于验证安装是否成功。其中安装步骤分为两种安装方式: - [docker安装](#docker安装):适合用户快速使用的场景; -- [源码安装](#源码安装):适合用户有增量开发vLLM MindSpore的场景。 +- [源码安装](#源码安装):适合用户有增量开发vLLM-MindSpore插件的场景。 ## 版本配套 @@ -13,27 +13,23 @@ - Python:3.9 / 3.10 / 3.11 - 软件版本配套 - | 软件 | 版本 | 对应分支 | - | ----- | ----- | ----- | - |[CANN](https://www.hiascend.com/developer/download/community/result?module=cann) | 8.1 | - | - |[MindSpore](https://www.mindspore.cn/install/) | 2.7 | master | - |[MSAdapter](https://git.openi.org.cn/OpenI/MSAdapter)| 0.2 | master | - |[MindSpore Transformers](https://gitee.com/mindspore/mindformers)|1.6 | dev | - |[Golden Stick](https://gitee.com/mindspore/golden-stick)|1.1.0 | r1.1.0 | - |[vLLM](https://github.com/vllm-project/vllm) | 0.9.1 | v0.9.1 | - |[vLLM MindSpore](https://gitee.com/mindspore/vllm-mindspore) | 0.3 | master | + | 软件 | 配套版本与下载链接 | + | ----- | ----- | + | CANN | 
[8.1.RC1](https://www.hiascend.com/document/detail/zh/canncommercial/81RC1/softwareinst/instg/instg_0000.html?Mode=PmIns&InstallType=local&OS=Debian&Software=cannToolKit) | + | MindSpore | [2.7.0](https://repo.mindspore.cn/mindspore/mindspore/version/202508/20250814/master_20250814091143_7548abc43af03319bfa528fc96d0ccd3917fcc9c_newest/unified/) | + | MSAdapter| [0.5.0](https://repo.mindspore.cn/mindspore/msadapter/version/202508/20250814/master_20250814010018_4615051c43eef898b6bbdc69768656493b5932f8_newest/any/) | + | MindSpore Transformers | [1.6.0](https://gitee.com/mindspore/mindformers) | + | Golden Stick | [1.2.0](https://repo.mindspore.cn/mindspore/golden-stick/version/202508/20250814/master_20250814010017_2713821db982330b3bcd6d84d85a3b337d555f27_newest/any/) | + | vLLM | [0.9.1](https://repo.mindspore.cn/mirrors/vllm/version/202507/20250715/v0.9.1/any/) | + | vLLM-MindSpore插件 | [0.3.0](https://gitee.com/mindspore/vllm-mindspore/) | -## 配置环境 +## docker安装 -在本章节中,我们将介绍[docker安装](#docker安装)、[pip安装](#pip安装)、[源码安装](#源码安装)三种安装方式,以及[快速验证](#快速验证)用例,用于验证安装是否成功。 +在本章节中,我们推荐用docker创建的方式,以快速部署vLLM-MindSpore插件环境,以下是部署docker的步骤介绍: -### docker安装 +### 构建镜像 -在本章节中,我们推荐用docker创建的方式,以快速部署vLLM MindSpore环境,以下是部署docker的步骤介绍: - -#### 构建镜像 - -用户可执行以下命令,拉取vLLM MindSpore代码仓库,并构建镜像: +用户可执行以下命令,拉取vLLM-MindSpore插件代码仓库,并构建镜像: ```bash git clone https://gitee.com/mindspore/vllm-mindspore.git @@ -53,7 +49,7 @@ Successfully tagged vllm_ms_20250726:latest docker images ``` -#### 新建容器 +### 新建容器 用户在完成[构建镜像](#构建镜像)后,设置`DOCKER_NAME`与`IMAGE_NAME`以设置容器名与镜像名,并执行以下命令,以新建容器: @@ -95,7 +91,7 @@ docker run -itd --name=${DOCKER_NAME} --ipc=host --network=host --privileged=tru docker ps ``` -#### 进入容器 +### 进入容器 用户在完成[新建容器](#新建容器)后,使用已定义的环境变量`DOCKER_NAME`,启动并进入容器: @@ -103,31 +99,35 @@ docker ps docker exec -it $DOCKER_NAME bash ``` -### 源码安装 +## 源码安装 -- **CANN安装** +### CANN安装 - CANN安装方法与环境配套,请参考[CANN社区版软件安装](https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/82RC1alpha002/softwareinst/instg/instg_0001.html?Mode=PmIns&OS=openEuler&Software=cannToolKit),若用户在安装CANN过程中遇到问题,可参考[昇腾常见问题](https://www.hiascend.com/document/detail/zh/AscendFAQ/ProduTech/CANNFAQ/cannfaq_000.html)进行解决。 +CANN安装方法与环境配套,请参考[CANN社区版软件安装](https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/82RC1alpha002/softwareinst/instg/instg_0001.html?Mode=PmIns&OS=openEuler&Software=cannToolKit),若用户在安装CANN过程中遇到问题,可参考[昇腾常见问题](https://www.hiascend.com/document/detail/zh/AscendFAQ/ProduTech/CANNFAQ/cannfaq_000.html)进行解决。 - CANN默认安装路径为`/usr/local/Ascend`。用户在安装CANN完毕后,使用如下命令,为CANN配置环境变量: +CANN默认安装路径为`/usr/local/Ascend`。用户在安装CANN完毕后,使用如下命令,为CANN配置环境变量: - ```bash - LOCAL_ASCEND=/usr/local/Ascend # the root directory of run package - source ${LOCAL_ASCEND}/ascend-toolkit/set_env.sh - export ASCEND_CUSTOM_PATH=${LOCAL_ASCEND}/ascend-toolkit - ``` +```bash +LOCAL_ASCEND=/usr/local/Ascend # the root directory of run package +source ${LOCAL_ASCEND}/ascend-toolkit/set_env.sh +export ASCEND_CUSTOM_PATH=${LOCAL_ASCEND}/ascend-toolkit +``` -- **vLLM前置依赖安装** +### vLLM前置依赖安装 - vLLM的环境配置与安装方法,请参考[vLLM安装教程](https://docs.vllm.ai/en/v0.9.1/getting_started/installation/cpu.html)。其依赖`gcc/g++ >= 12.3.0`版本,可通过以下命令完成安装: +vLLM的环境配置与安装方法,请参考[vLLM安装教程](https://docs.vllm.ai/en/v0.9.1/getting_started/installation/cpu.html)。其依赖`gcc/g++ >= 12.3.0`版本,可通过以下命令完成安装: - ```bash - yum install -y gcc gcc-c++ - ``` +```bash +yum install -y gcc gcc-c++ +``` + +### vLLM-MindSpore插件安装 + +vLLM-MindSpore插件有以下两种安装方式。**vLLM-MindSpore插件快速安装**适用于用户快速使用与部署的场景。**vLLM-MindSpore插件手动安装**适用于用户对组件有自定义修改的场景。 -- 
**vLLM MindSpore安装** +- **vLLM-MindSpore插件快速安装** - 安装vLLM MindSpore,需要在拉取vLLM MindSpore源码后,执行以下命令,安装依赖包: + 采用快速安装脚本来安装vLLM-MindSpore插件,需要在拉取vLLM-MindSpore插件源码后,执行以下命令,安装依赖包: ```bash git clone https://gitee.com/mindspore/vllm-mindspore.git @@ -135,7 +135,7 @@ docker exec -it $DOCKER_NAME bash bash install_depend_pkgs.sh ``` - 编译安装vLLM MindSpore: + 编译安装vLLM-MindSpore插件: ```bash pip install . @@ -147,14 +147,63 @@ docker exec -it $DOCKER_NAME bash export PYTHONPATH=$MF_PATH:$PYTHONPATH ``` -### 快速验证 +- **vLLM-MindSpore插件手动安装** + + 若用户对组件有修改,或者需使用其他版本,则用户需要按照特定顺序,手动安装组件。vLLM-MindSpore插件软件配套下载地址可以参考[版本配套](#版本配套),且对组件的安装顺序要求如下: + + 1. 安装vLLM + + ```bash + pip install /path/to/vllm-*.whl + ``` + + 2. 卸载torch相关组件 + + ```bash + pip uninstall torch torch-npu torchvision torchaudio -y + ``` + + 3. 安装MindSpore + + ```bash + pip install /path/to/mindspore-*.whl + ``` + + 4. 引入MindSpore Transformers仓,加入到`PYTHONPATH`中 + + ```bash + git clone https://gitee.com/mindspore/mindformers.git + export PYTHONPATH=$MF_PATH:$PYTHONPATH + ``` + + 5. 安装Golden Stick + + ```bash + pip install /path/to/mindspore_gs-*.whl + ``` + + 6. 安装MSAdapter + + ```bash + pip install /path/to/msadapter-*.whl + ``` + + 7. 安装vLLM-MindSpore插件 + + 需要先拉取vLLM-MindSpore插件源码,再执行安装 + + ```bash + git clone https://gitee.com/mindspore/vllm-mindspore.git + cd vllm-mindspore + pip install . + ``` + +## 快速验证 用户可以创建一个简单的离线推理场景,验证安装是否成功。下面以[Qwen2.5-7B](https://huggingface.co/Qwen/Qwen2.5-7B-Instruct) 为例。首先用户需要执行以下命令,设置环境变量: ```bash -export ASCEND_TOTAL_MEMORY_GB=64 # Please use `npu-smi info` to check the memory. export vLLM_MODEL_BACKEND=MindFormers # use MindSpore Transformers as model backend. -export vLLM_MODEL_MEMORY_USE_GB=32 # Memory reserved for model execution. Set according to the model's maximum usage, with the remaining environment used for kvcache allocation export MINDFORMERS_MODEL_CONFIG=$YAML_PATH # Set the corresponding MindSpore Transformers model's YAML file. 
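# 示例(路径为示意,请按实际环境调整):若MindSpore Transformers仓已克隆至$MF_PATH,则Qwen2.5-7B对应的YAML为:
# export MINDFORMERS_MODEL_CONFIG=$MF_PATH/research/qwen2_5/predict_qwen2_5_7b_instruct.yaml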
``` diff --git a/docs/vllm_mindspore/docs/source_zh_cn/getting_started/quick_start/quick_start.md b/docs/vllm_mindspore/docs/source_zh_cn/getting_started/quick_start/quick_start.md index 2bf629908a752b2c386f752e33355339b9ec6069..f0d19926ba072f4b63448d34ccb48ffe7474b66d 100644 --- a/docs/vllm_mindspore/docs/source_zh_cn/getting_started/quick_start/quick_start.md +++ b/docs/vllm_mindspore/docs/source_zh_cn/getting_started/quick_start/quick_start.md @@ -2,15 +2,15 @@ [![查看源文件](https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/master/resource/_static/logo_source.svg)](https://gitee.com/mindspore/docs/blob/master/docs/vllm_mindspore/docs/source_zh_cn/getting_started/quick_start/quick_start.md) -本文档将为用户提供快速指引,以[Qwen2.5-7B](https://huggingface.co/Qwen/Qwen2.5-7B-Instruct)模型为例,使用[docker](https://www.docker.com/)的安装方式部署vLLM MindSpore,并以[离线推理](#离线推理)与[在线推理](#在线推理)两种方式,快速体验vLLM MindSpore的服务化与推理能力。如用户需要了解更多的安装方式,请参考[安装指南](../installation/installation.md)。 +本文档将为用户提供快速指引,以[Qwen2.5-7B](https://huggingface.co/Qwen/Qwen2.5-7B-Instruct)模型为例,使用[docker](https://www.docker.com/)的安装方式部署vLLM-MindSpore插件,并以[离线推理](#离线推理)与[在线推理](#在线推理)两种方式,快速体验vLLM-MindSpore插件的服务化与推理能力。如用户需要了解更多的安装方式,请参考[安装指南](../installation/installation.md)。 ## docker安装 -在本章节中,我们推荐用docker创建的方式,以快速部署vLLM MindSpore环境,以下是部署docker的步骤介绍: +在本章节中,我们推荐用docker创建的方式,以快速部署vLLM-MindSpore插件环境,以下是部署docker的步骤介绍: ### 构建镜像 -用户可执行以下命令,拉取vLLM MindSpore代码仓库,并构建镜像: +用户可执行以下命令,拉取vLLM-MindSpore插件代码仓库,并构建镜像: ```bash git clone https://gitee.com/mindspore/vllm-mindspore.git @@ -131,18 +131,14 @@ git clone https://huggingface.co/Qwen/Qwen2.5-7B-Instruct 用户在拉起模型前,需设置以下环境变量: ```bash -export ASCEND_TOTAL_MEMORY_GB=64 # Please use `npu-smi info` to check the memory. export vLLM_MODEL_BACKEND=MindFormers # use MindSpore Transformers as model backend. -export vLLM_MODEL_MEMORY_USE_GB=32 # Memory reserved for model execution. Set according to the model's maximum usage, with the remaining environment used for kvcache allocation export MINDFORMERS_MODEL_CONFIG=$YAML_PATH # Set the corresponding MindSpore Transformers model's YAML file. ``` 以下是对上述环境变量的解释: -- `ASCEND_TOTAL_MEMORY_GB`: 每一张计算卡的显存大小。用户可使用`npu-smi info`命令进行查询,该值对应查询结果中的`HBM-Usage(MB)`; -- `vLLM_MODEL_BACKEND`:所运行的模型后端。目前vLLM MindSpore所支持的模型与模型后端,可在[模型支持列表](../../user_guide/supported_models/models_list/models_list.md)中进行查询; -- `vLLM_MODEL_MEMORY_USE_GB`:模型加载时所用空间,根据用户所使用的模型进行设置。若用户在模型加载过程中遇到显存不足时,可适当增大该值并重试; -- `MINDFORMERS_MODEL_CONFIG`:模型配置文件。 +- `vLLM_MODEL_BACKEND`:所运行的模型后端。目前vLLM-MindSpore插件所支持的模型与模型后端,可在[模型支持列表](../../user_guide/supported_models/models_list/models_list.md)中进行查询; +- `MINDFORMERS_MODEL_CONFIG`:模型配置文件。用户可以在[MindSpore Transformers工程](https://gitee.com/mindspore/mindformers/tree/master/research/qwen2_5)中,找到对应模型的yaml文件。以Qwen2.5-7B为例,则其yaml文件为[predict_qwen2_5_7b_instruct.yaml](https://gitee.com/mindspore/mindformers/blob/master/research/qwen2_5/predict_qwen2_5_7b_instruct.yaml)。 另外,用户需要确保MindSpore Transformers已安装。用户可通过 @@ -192,7 +188,7 @@ Prompt: 'Llama is'. 
Generated text: ' a 100% natural, biodegradable, and compost ### 在线推理 -vLLM MindSpore可使用OpenAI的API协议,进行在线推理部署。以下是以[Qwen2.5-7B](https://huggingface.co/Qwen/Qwen2.5-7B-Instruct) 为例,介绍模型的[启动服务](#启动服务),并[发送请求](#发送请求),得到在线推理的推理结果。 +vLLM-MindSpore插件可使用OpenAI的API协议,进行在线推理部署。以下是以[Qwen2.5-7B](https://huggingface.co/Qwen/Qwen2.5-7B-Instruct) 为例,介绍模型的[启动服务](#启动服务),并[发送请求](#发送请求),得到在线推理的推理结果。 #### 启动服务 @@ -202,7 +198,7 @@ vLLM MindSpore可使用OpenAI的API协议,进行在线推理部署。以下是 python3 -m vllm_mindspore.entrypoints vllm.entrypoints.openai.api_server --model "Qwen/Qwen2.5-7B-Instruct" ``` -若服务成功拉起,则可以获得类似的执行结果: +用户可以通过`--model`参数,指定模型保存的本地路径。若服务成功拉起,则可以获得类似的执行结果: ```text INFO: Started server process [6363] @@ -224,7 +220,7 @@ Engine 000: Avg prompt throughput: 0.0 tokens/s, Avg gereration throughput: 0.0 curl http://localhost:8000/v1/completions -H "Content-Type: application/json" -d '{"model": "Qwen/Qwen2.5-7B-Instruct", "prompt": "I am", "max_tokens": 20, "temperature": 0}' ``` -若请求处理成功,将获得以下的推理结果: +其中,用户需确认`"model"`字段与启动服务中`--model`一致,请求才能成功匹配到模型。若请求处理成功,将获得以下推理结果: ```text { diff --git a/docs/vllm_mindspore/docs/source_zh_cn/getting_started/tutorials/deepseek_parallel/deepseek_r1_671b_w8a8_dp4_tp4_ep4.md b/docs/vllm_mindspore/docs/source_zh_cn/getting_started/tutorials/deepseek_parallel/deepseek_r1_671b_w8a8_dp4_tp4_ep4.md index 047e7b4aad2f9239abc18c111f8f576867c3a8be..87eb81a0a13c62b94bc89191982b163df7b596e1 100644 --- a/docs/vllm_mindspore/docs/source_zh_cn/getting_started/tutorials/deepseek_parallel/deepseek_r1_671b_w8a8_dp4_tp4_ep4.md +++ b/docs/vllm_mindspore/docs/source_zh_cn/getting_started/tutorials/deepseek_parallel/deepseek_r1_671b_w8a8_dp4_tp4_ep4.md @@ -2,7 +2,7 @@ [![查看源文件](https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/master/resource/_static/logo_source.svg)](https://gitee.com/mindspore/docs/blob/master/docs/vllm_mindspore/docs/source_zh_cn/getting_started/tutorials/deepseek_parallel/deepseek_r1_671b_w8a8_dp4_tp4_ep4.md) -vLLM MindSpore支持张量并行(TP)、数据并行(DP)、专家并行(EP)及其组合配置的混合并行推理,不同并行策略的适用场景可参考[vLLM官方文档](https://docs.vllm.ai/en/latest/configuration/optimization.html#parallelism-strategies)。 +vLLM-MindSpore插件支持张量并行(TP)、数据并行(DP)、专家并行(EP)及其组合配置的混合并行推理,不同并行策略的适用场景可参考[vLLM官方文档](https://docs.vllm.ai/en/latest/configuration/optimization.html#parallelism-strategies)。 本文档将以DeepSeek R1 671B W8A8为例介绍[张量并行](#tp16-张量并行推理)及[混合并行](#混合并行推理)推理流程。DeepSeek R1 671B W8A8模型需使用多个节点资源运行推理模型。为确保各个节点的执行配置(包括模型配置文件路径、Python环境等)一致,推荐通过 docker 镜像创建容器的方式避免执行差异。 @@ -10,11 +10,11 @@ vLLM MindSpore支持张量并行(TP)、数据并行(DP)、专家并行 ## docker安装 -在本章节中,我们推荐用docker创建的方式,以快速部署vLLM MindSpore环境。以下是部署docker的步骤介绍: +在本章节中,我们推荐用docker创建的方式,以快速部署vLLM-MindSpore插件环境。以下是部署docker的步骤介绍: ### 构建镜像 -用户可执行以下命令,拉取vLLM MindSpore代码仓库,并构建镜像: +用户可执行以下命令,拉取vLLM-MindSpore插件代码仓库,并构建镜像: ```bash git clone https://gitee.com/mindspore/vllm-mindspore.git @@ -94,8 +94,8 @@ docker exec -it $DOCKER_NAME bash ```python from openmind_hub import snapshot_download -snapshot_download(repo_id="MindSpore-Lab/DeepSeek-R1-W8A8", - local_dir="/path/to/save/deepseek_r1_w8a8", +snapshot_download(repo_id="MindSpore-Lab/DeepSeek-R1-0528-A8W8", + local_dir="/path/to/save/deepseek_r1_0528_a8w8", local_dir_use_symlinks=False) ``` @@ -120,7 +120,7 @@ Git LFS initialized. 
工具确认可用后,执行以下命令,下载权重: ```shell -git clone https://modelers.cn/MindSpore-Lab/DeepSeek-R1-W8A8.git +git clone https://modelers.cn/models/MindSpore-Lab/DeepSeek-R1-0528-A8W8.git ``` ## TP16 张量并行推理 @@ -156,7 +156,7 @@ export MINDFORMERS_MODEL_CONFIG=/path/to/research/deepseek3/deepseek_r1_671b/pre - `HCCL_OP_EXPANSION_MODE`: 配置通信算法的编排展开位置为Device侧的AI Vector Core计算单元。 - `MS_ALLOC_CONF`: 设置内存策略。可参考[MindSpore官网文档](https://www.mindspore.cn/docs/zh-CN/master/api_python/env_var_list.html)。 - `ASCEND_RT_VISIBLE_DEVICES`: 配置每个节点可用device id。用户可使用`npu-smi info`命令进行查询。 -- `vLLM_MODEL_BACKEND`:所运行的模型后端。目前vLLM MindSpore所支持的模型与模型后端,可在[模型支持列表](../../../user_guide/supported_models/models_list/models_list.md)中进行查询。 +- `vLLM_MODEL_BACKEND`:所运行的模型后端。目前vLLM-MindSpore插件所支持的模型与模型后端,可在[模型支持列表](../../../user_guide/supported_models/models_list/models_list.md)中进行查询。 - `MINDFORMERS_MODEL_CONFIG`:模型配置文件。用户可以在[MindSpore Transformers工程](https://gitee.com/mindspore/mindformers/tree/master/research/deepseek3/deepseek_r1_671b)中,找到对应模型的yaml文件[predict_deepseek_r1_671b_w8a8.yaml](https://gitee.com/mindspore/mindformers/blob/master/research/deepseek3/deepseek_r1_671b/predict_deepseek_r1_671b_w8a8.yaml) 。 模型并行策略通过配置文件中的`parallel_config`指定,例如TP16 张量并行配置如下所示: @@ -264,7 +264,7 @@ chmod -R 777 ./Ascend-pyACL_8.0.RC1_linux-aarch64.run #### 启动服务 -vLLM MindSpore可使用OpenAI的API协议,部署为在线推理。以下是在线推理的拉起流程。 +vLLM-MindSpore插件可使用OpenAI的API协议,部署为在线推理。以下是在线推理的拉起流程。 ```bash # 启动配置参数说明 @@ -284,19 +284,21 @@ vllm-mindspore serve ```bash # 主节点: -vllm-mindspore serve --model="/path/to/save/deepseek_r1_w8a8" --trust-remote-code --max-num-seqs=256 --max_model_len=32768 --max-num-batched-tokens=4096 --block-size=128 --gpu-memory-utilization=0.9 --tensor-parallel-size 16 --distributed-executor-backend=ray +vllm-mindspore serve --model="MindSpore-Lab/DeepSeek-R1-0528-A8W8" --trust-remote-code --max-num-seqs=256 --max_model_len=32768 --max-num-batched-tokens=4096 --block-size=128 --gpu-memory-utilization=0.9 --tensor-parallel-size 16 --distributed-executor-backend=ray ``` -张量并行场景下,`--tensor-parallel-size`参数会覆盖模型yaml文件中`parallel_config`的`model_parallel`配置。 +张量并行场景下,`--tensor-parallel-size`参数会覆盖模型yaml文件中`parallel_config`的`model_parallel`配置。用户可以通过`--model`参数,指定模型保存的本地路径。 #### 发起请求 使用如下命令发送请求。其中`prompt`字段为模型输入: ```bash -curl http://localhost:8000/v1/completions -H "Content-Type: application/json" -d '{"model": "/path/to/save/deepseek_r1_w8a8", "prompt": "I am", "max_tokens": 20, "temperature": 0, "top_p": 1.0, "top_k": 1, "repetition_penalty": 1.0}' +curl http://localhost:8000/v1/completions -H "Content-Type: application/json" -d '{"model": "MindSpore-Lab/DeepSeek-R1-0528-A8W8", "prompt": "I am", "max_tokens": 20, "temperature": 0, "top_p": 1.0, "top_k": 1, "repetition_penalty": 1.0}' ``` +用户需确认`"model"`字段与启动服务中`--model`一致,请求才能成功匹配到模型。 + ## 混合并行推理 vLLM 通过 Ray 对多个节点资源进行管理和运行。该样例对应以下并行策略场景: @@ -326,7 +328,7 @@ export MINDFORMERS_MODEL_CONFIG=/path/to/research/deepseek3/deepseek_r1_671b/pre - `HCCL_OP_EXPANSION_MODE`: 配置通信算法的编排展开位置为Device侧的AI Vector Core计算单元。 - `MS_ALLOC_CONF`: 设置内存策略。可参考[MindSpore官网文档](https://www.mindspore.cn/docs/zh-CN/r2.6.0/api_python/env_var_list.html)。 - `ASCEND_RT_VISIBLE_DEVICES`: 配置每个节点可用device id。用户可使用`npu-smi info`命令进行查询。 -- `vLLM_MODEL_BACKEND`:所运行的模型后端。目前vLLM MindSpore所支持的模型与模型后端,可在[模型支持列表](../../../user_guide/supported_models/models_list/models_list.md)中进行查询。 +- `vLLM_MODEL_BACKEND`:所运行的模型后端。目前vLLM-MindSpore插件所支持的模型与模型后端,可在[模型支持列表](../../../user_guide/supported_models/models_list/models_list.md)中进行查询。 - 
`MINDFORMERS_MODEL_CONFIG`:模型配置文件。用户可以在[MindSpore Transformers工程](https://gitee.com/mindspore/mindformers/tree/master/research/deepseek3/deepseek_r1_671b)中,找到对应模型的yaml文件[predict_deepseek_r1_671b_w8a8.yaml](https://gitee.com/mindspore/mindformers/blob/master/research/deepseek3/deepseek_r1_671b/predict_deepseek_r1_671b_w8a8_ep4tp4.yaml)。 模型并行策略通过配置文件中的`parallel_config`指定,例如混合并行配置如下所示: @@ -344,6 +346,8 @@ parallel_config: ### 在线推理 +#### 启动服务 + `vllm-mindspore`可使用OpenAI的API协议部署在线推理。以下是在线推理的拉起流程: ```bash @@ -366,20 +370,22 @@ vllm-mindspore serve --enable-expert-parallel # 使能专家并行 ``` -执行示例: +用户可以通过`--model`参数,指定模型保存的本地路径。以下为执行示例: ```bash # 主节点: -vllm-mindspore serve --model="/path/to/save/deepseek_r1_w8a8" --trust-remote-code --max-num-seqs=256 --max-model-len=32768 --max-num-batched-tokens=4096 --block-size=128 --gpu-memory-utilization=0.9 --tensor-parallel-size 4 --data-parallel-size 4 --data-parallel-size-local 2 --data-parallel-start-rank 0 --data-parallel-address 192.10.10.10 --data-parallel-rpc-port 12370 --enable-expert-parallel +vllm-mindspore serve --model="MindSpore-Lab/DeepSeek-R1-0528-A8W8" --trust-remote-code --max-num-seqs=256 --max-model-len=32768 --max-num-batched-tokens=4096 --block-size=128 --gpu-memory-utilization=0.9 --tensor-parallel-size 4 --data-parallel-size 4 --data-parallel-size-local 2 --data-parallel-start-rank 0 --data-parallel-address 192.10.10.10 --data-parallel-rpc-port 12370 --enable-expert-parallel # 从节点: -vllm-mindspore serve --headless --model="/path/to/save/deepseek_r1_w8a8" --trust-remote-code --max-num-seqs=256 --max-model-len=32768 --max-num-batched-tokens=4096 --block-size=128 --gpu-memory-utilization=0.9 --tensor-parallel-size 4 --data-parallel-size 4 --data-parallel-size-local 2 --data-parallel-start-rank 2 --data-parallel-address 192.10.10.10 --data-parallel-rpc-port 12370 --enable-expert-parallel +vllm-mindspore serve --headless --model="MindSpore-Lab/DeepSeek-R1-0528-A8W8" --trust-remote-code --max-num-seqs=256 --max-model-len=32768 --max-num-batched-tokens=4096 --block-size=128 --gpu-memory-utilization=0.9 --tensor-parallel-size 4 --data-parallel-size 4 --data-parallel-size-local 2 --data-parallel-start-rank 2 --data-parallel-address 192.10.10.10 --data-parallel-rpc-port 12370 --enable-expert-parallel ``` -## 发送请求 +#### 发送请求 使用如下命令发送请求。其中`prompt`字段为模型输入: ```bash -curl http://localhost:8000/v1/completions -H "Content-Type: application/json" -d '{"model": "/path/to/save/deepseek_r1_w8a8", "prompt": "I am, "max_tokens": 120, "temperature": 0}' +curl http://localhost:8000/v1/completions -H "Content-Type: application/json" -d '{"model": "MindSpore-Lab/DeepSeek-R1-0528-A8W8", "prompt": "I am, "max_tokens": 120, "temperature": 0}' ``` + +用户需确认`"model"`字段与启动服务中`--model`一致,请求才能成功匹配到模型。 diff --git a/docs/vllm_mindspore/docs/source_zh_cn/getting_started/tutorials/qwen2.5_32b_multiNPU/qwen2.5_32b_multiNPU.md b/docs/vllm_mindspore/docs/source_zh_cn/getting_started/tutorials/qwen2.5_32b_multiNPU/qwen2.5_32b_multiNPU.md index 811009f6471bea2dad284efdfad79184b3fed3a8..67ea031dd1957444f8acd798dc68d96c16cab041 100644 --- a/docs/vllm_mindspore/docs/source_zh_cn/getting_started/tutorials/qwen2.5_32b_multiNPU/qwen2.5_32b_multiNPU.md +++ b/docs/vllm_mindspore/docs/source_zh_cn/getting_started/tutorials/qwen2.5_32b_multiNPU/qwen2.5_32b_multiNPU.md @@ -2,15 +2,15 @@ 
[![查看源文件](https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/master/resource/_static/logo_source.svg)](https://gitee.com/mindspore/docs/blob/master/docs/vllm_mindspore/docs/source_zh_cn/getting_started/tutorials/qwen2.5_32b_multiNPU/qwen2.5_32b_multiNPU.md) -本文档将为用户介绍使用vLLM MindSpore进行单节点多卡的推理流程。以[Qwen2.5-32B](https://huggingface.co/Qwen/Qwen2.5-32B-Instruct)模型为例,用户通过以下[docker安装](#docker安装)章节,或[安装指南](../../installation/installation.md#安装指南)进行环境配置,并[下载模型权重](#下载模型权重)。在[设置环境变量](#设置环境变量)之后,可部署[在线推理](#在线推理),以体验单节点多卡的推理功能。 +本文档将为用户介绍使用vLLM-MindSpore插件进行单节点多卡的推理流程。以[Qwen2.5-32B](https://huggingface.co/Qwen/Qwen2.5-32B-Instruct)模型为例,用户通过以下[docker安装](#docker安装)章节,或[安装指南](../../installation/installation.md#安装指南)进行环境配置,并[下载模型权重](#下载模型权重)。在[设置环境变量](#设置环境变量)之后,可部署[在线推理](#在线推理),以体验单节点多卡的推理功能。 ## docker安装 -在本章节中,我们推荐用docker创建的方式,以快速部署vLLM MindSpore环境,以下是部署docker的步骤介绍: +在本章节中,我们推荐用docker创建的方式,以快速部署vLLM-MindSpore插件环境,以下是部署docker的步骤介绍: ### 构建镜像 -用户可执行以下命令,拉取vLLM MindSpore代码仓库,并构建镜像: +用户可执行以下命令,拉取vLLM-MindSpore插件代码仓库,并构建镜像: ```bash git clone https://gitee.com/mindspore/vllm-mindspore.git @@ -128,18 +128,14 @@ git clone https://huggingface.co/Qwen/Qwen2.5-32B-Instruct ```bash #set environment variables -export ASCEND_TOTAL_MEMORY_GB=64 # Please use `npu-smi info` to check the memory. export vLLM_MODEL_BACKEND=MindFormers # use MindSpore TransFormers as model backend. -export vLLM_MODEL_MEMORY_USE_GB=32 # Memory reserved for model execution. Set according to the model's maximum usage, with the remaining environment used for kvcache allocation export MINDFORMERS_MODEL_CONFIG=$YAML_PATH # Set the corresponding MindSpore Transformers model's YAML file. ``` 以下是对上述环境变量的解释: -- `ASCEND_TOTAL_MEMORY_GB`: 每一张计算卡的显存大小。用户可使用`npu-smi info`命令进行查询,该值对应查询结果中的`HBM-Usage(MB)`。 -- `vLLM_MODEL_BACKEND`:所运行的模型后端。目前vLLM MindSpore所支持的模型与模型后端,可在[模型支持列表](../../../user_guide/supported_models/models_list/models_list.md)中进行查询。 -- `vLLM_MODEL_MEMORY_USE_GB`:模型加载时所用空间,根据用户所使用的模型进行设置。若用户在模型加载过程中遇到显存不足时,可适当增大该值并重试。 -- `MINDFORMERS_MODEL_CONFIG`:模型配置文件。用户可以在[MindSpore Transformers工程](https://gitee.com/mindspore/mindformers/tree/r1.5.0/research/qwen2_5)中,找到对应模型的yaml文件。以Qwen2.5-32B为例,则其yaml文件为[predict_qwen2_5_32b_instruct.yaml](https://gitee.com/mindspore/mindformers/blob/r1.5.0/research/qwen2_5/predict_qwen2_5_32b_instruct.yaml) 。 +- `vLLM_MODEL_BACKEND`:所运行的模型后端。目前vLLM-MindSpore插件所支持的模型与模型后端,可在[模型支持列表](../../../user_guide/supported_models/models_list/models_list.md)中进行查询。 +- `MINDFORMERS_MODEL_CONFIG`:模型配置文件。用户可以在[MindSpore Transformers工程](https://gitee.com/mindspore/mindformers/tree/master/research/qwen2_5)中,找到对应模型的yaml文件。以Qwen2.5-32B为例,则其yaml文件为[predict_qwen2_5_32b_instruct.yaml](https://gitee.com/mindspore/mindformers/blob/master/research/qwen2_5/predict_qwen2_5_32b_instruct.yaml) 。 用户可通过`npu-smi info`查看显存占用情况,并可以使用如下环境变量,设置用于推理的计算卡。以下例子为假设用户使用4,5,6,7卡进行推理: @@ -149,7 +145,7 @@ export ASCEND_RT_VISIBLE_DEVICES=4,5,6,7 ## 在线推理 -vLLM MindSpore可使用OpenAI的API协议,部署为在线推理。以下是在线推理的拉起流程。以下是以[Qwen2.5-32B](https://huggingface.co/Qwen/Qwen2.5-32B-Instruct) 为例,介绍模型的[启动服务](#启动服务),并[发送请求](#发送请求),得到在线推理的推理结果。 +vLLM-MindSpore插件可使用OpenAI的API协议,部署为在线推理。以下是在线推理的拉起流程。以下是以[Qwen2.5-32B](https://huggingface.co/Qwen/Qwen2.5-32B-Instruct) 为例,介绍模型的[启动服务](#启动服务),并[发送请求](#发送请求),得到在线推理的推理结果。 ### 启动服务 @@ -163,7 +159,7 @@ python3 -m vllm_mindspore.entrypoints vllm.entrypoints.openai.api_server --model 其中,`TENSOR_PARALLEL_SIZE`为用户指定的卡数,`MAX_MODEL_LEN`为模型最大输出token数。 -若服务成功拉起,则可以获得类似的执行结果: +用户可以通过`--model`参数,指定模型保存的本地路径。若服务成功拉起,则可以获得类似的执行结果: ```text 
INFO: Started server process [6363] @@ -182,10 +178,10 @@ Engine 000: Avg prompt throughput: 0.0 tokens/s, Avg gereration throughput: 0.0 使用如下命令发送请求。其中`prompt`字段为模型输入: ```bash -curl http://localhost:8000/v1/completions -H "Content-Type: application/json" -d '{"model": "Qwen2.5-32B-Instruct", "prompt": "I am", "max_tokens": 20, "temperature": 0}' +curl http://localhost:8000/v1/completions -H "Content-Type: application/json" -d '{"model": "Qwen/Qwen2.5-32B-Instruct", "prompt": "I am", "max_tokens": 20, "temperature": 0}' ``` -若请求处理成功,将获得以下的推理结果: +其中,用户需确认`"model"`字段与启动服务中`--model`一致,请求才能成功匹配到模型。若请求处理成功,将获得以下推理结果: ```text { diff --git a/docs/vllm_mindspore/docs/source_zh_cn/getting_started/tutorials/qwen2.5_7b_singleNPU/qwen2.5_7b_singleNPU.md b/docs/vllm_mindspore/docs/source_zh_cn/getting_started/tutorials/qwen2.5_7b_singleNPU/qwen2.5_7b_singleNPU.md index ffc82071b2e9a98e315268756ad2a48ec9e12246..c7ec37a452d549787fa83d70f832e00bfa0d5be4 100644 --- a/docs/vllm_mindspore/docs/source_zh_cn/getting_started/tutorials/qwen2.5_7b_singleNPU/qwen2.5_7b_singleNPU.md +++ b/docs/vllm_mindspore/docs/source_zh_cn/getting_started/tutorials/qwen2.5_7b_singleNPU/qwen2.5_7b_singleNPU.md @@ -2,15 +2,15 @@ [![查看源文件](https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/master/resource/_static/logo_source.svg)](https://gitee.com/mindspore/docs/blob/master/docs/vllm_mindspore/docs/source_zh_cn/getting_started/tutorials/qwen2.5_7b_singleNPU/qwen2.5_7b_singleNPU.md) -本文档将为用户介绍使用vLLM MindSpore进行单卡推理流程。以[Qwen2.5-7B](https://huggingface.co/Qwen/Qwen2.5-7B-Instruct)模型为例,用户通过以下[docker安装](#docker安装)章节,或[安装指南](../../installation/installation.md#安装指南)进行环境配置,并[下载模型权重](#下载模型权重)。在[设置环境变量](#设置环境变量)之后,可进行[离线推理](#离线推理)与[在线推理](#在线推理),以体验单卡推理功能。 +本文档将为用户介绍使用vLLM-MindSpore插件进行单卡推理流程。以[Qwen2.5-7B](https://huggingface.co/Qwen/Qwen2.5-7B-Instruct)模型为例,用户通过以下[docker安装](#docker安装)章节,或[安装指南](../../installation/installation.md#安装指南)进行环境配置,并[下载模型权重](#下载模型权重)。在[设置环境变量](#设置环境变量)之后,可进行[离线推理](#离线推理)与[在线推理](#在线推理),以体验单卡推理功能。 ## docker安装 -在本章节中,我们推荐用docker创建的方式,以快速部署vLLM MindSpore环境。以下是部署docker的步骤介绍: +在本章节中,我们推荐用docker创建的方式,以快速部署vLLM-MindSpore插件环境。以下是部署docker的步骤介绍: ### 构建镜像 -用户可执行以下命令,拉取vLLM MindSpore代码仓库,并构建镜像: +用户可执行以下命令,拉取vLLM-MindSpore插件代码仓库,并构建镜像: ```bash git clone https://gitee.com/mindspore/vllm-mindspore.git @@ -128,18 +128,14 @@ git clone https://huggingface.co/Qwen/Qwen2.5-7B-Instruct ```bash #set environment variables -export ASCEND_TOTAL_MEMORY_GB=64 # Please use `npu-smi info` to check the memory. export vLLM_MODEL_BACKEND=MindFormers # use MindSpore TransFormers as model backend. -export vLLM_MODEL_MEMORY_USE_GB=32 # Memory reserved for model execution. Set according to the model's maximum usage, with the remaining environment used for kvcache allocation export MINDFORMERS_MODEL_CONFIG=$YAML_PATH # Set the corresponding MindSpore Transformers model's YAML file. 
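# Example (illustrative only): point YAML_PATH to the Qwen2.5-7B yaml from the MindSpore Transformers project,
# e.g. export YAML_PATH=/path/to/mindformers/research/qwen2_5/predict_qwen2_5_7b_instruct.yaml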
``` 以下是对上述环境变量的解释: -- `ASCEND_TOTAL_MEMORY_GB`: 每一张计算卡的显存大小。用户可使用`npu-smi info`命令进行查询,该值对应查询结果中的`HBM-Usage(MB)`; -- `vLLM_MODEL_BACKEND`:所运行的模型后端。目前vLLM MindSpore所支持的模型与模型后端,可在[模型支持列表](../../../user_guide/supported_models/models_list/models_list.md)中进行查询; -- `vLLM_MODEL_MEMORY_USE_GB`:模型加载时所用空间,根据用户所使用的模型进行设置。若用户在模型加载过程中遇到显存不足时,可适当增大该值并重试; -- `MINDFORMERS_MODEL_CONFIG`:模型配置文件。用户可以在[MindSpore Transformers工程](https://gitee.com/mindspore/mindformers/tree/r1.5.0/research/qwen2_5)中,找到对应模型的yaml文件。以Qwen2.5-7B为例,则其yaml文件为[predict_qwen2_5_7b_instruct.yaml](https://gitee.com/mindspore/mindformers/blob/r1.5.0/research/qwen2_5/predict_qwen2_5_7b_instruct.yaml) 。 +- `vLLM_MODEL_BACKEND`:所运行的模型后端。目前vLLM-MindSpore插件所支持的模型与模型后端,可在[模型支持列表](../../../user_guide/supported_models/models_list/models_list.md)中进行查询; +- `MINDFORMERS_MODEL_CONFIG`:模型配置文件。用户可以在[MindSpore Transformers工程](https://gitee.com/mindspore/mindformers/tree/master/research/qwen2_5)中,找到对应模型的yaml文件。以Qwen2.5-7B为例,则其yaml文件为[predict_qwen2_5_7b_instruct.yaml](https://gitee.com/mindspore/mindformers/blob/master/research/qwen2_5/predict_qwen2_5_7b_instruct.yaml) 。 用户可通过`npu-smi info`查看显存占用情况,并可以使用如下环境变量,设置用于推理的计算卡: @@ -188,7 +184,7 @@ Prompt: 'Llama is'. Generated text: ' a 100% natural, biodegradable, and compost ## 在线推理 -vLLM MindSpore可使用OpenAI的API协议,部署为在线推理。以下是以[Qwen2.5-7B](https://huggingface.co/Qwen/Qwen2.5-7B-Instruct) 为例,介绍模型的[启动服务](#启动服务),并[发送请求](#发送请求),得到在线推理的推理结果。 +vLLM-MindSpore插件可使用OpenAI的API协议,部署为在线推理。以下是以[Qwen2.5-7B](https://huggingface.co/Qwen/Qwen2.5-7B-Instruct) 为例,介绍模型的[启动服务](#启动服务),并[发送请求](#发送请求),得到在线推理的推理结果。 ### 启动服务 @@ -198,7 +194,7 @@ vLLM MindSpore可使用OpenAI的API协议,部署为在线推理。以下是以 python3 -m vllm_mindspore.entrypoints vllm.entrypoints.openai.api_server --model "Qwen/Qwen2.5-7B-Instruct" ``` -若服务成功拉起,则可以获得类似的执行结果: +用户可以通过`--model`参数,指定模型保存的本地路径。若服务成功拉起,则可以获得类似的执行结果: ```text INFO: Started server process [6363] @@ -220,7 +216,7 @@ Engine 000: Avg prompt throughput: 0.0 tokens/s, Avg gereration throughput: 0.0 curl http://localhost:8000/v1/completions -H "Content-Type: application/json" -d '{"model": "Qwen/Qwen2.5-7B-Instruct", "prompt": "I am", "max_tokens": 20, "temperature": 0}' ``` -若请求处理成功,将获得以下的推理结果: +其中,用户需确认`"model"`字段与启动服务中`--model`一致,请求才能成功匹配到模型。若请求处理成功,将获得以下推理结果: ```text { diff --git a/docs/vllm_mindspore/docs/source_zh_cn/index.rst b/docs/vllm_mindspore/docs/source_zh_cn/index.rst index f465f8c121915e72eb11ae20eb3b4da8d0745a2f..921322682e81d226bf2e3a76f0b8a8a4916ac841 100644 --- a/docs/vllm_mindspore/docs/source_zh_cn/index.rst +++ b/docs/vllm_mindspore/docs/source_zh_cn/index.rst @@ -1,19 +1,19 @@ -vLLM MindSpore 文档 +vLLM-MindSpore插件文档 ========================================= -vLLM MindSpore 简介 +vLLM-MindSpore插件简介 ----------------------------------------------------- -vLLM MindSpore插件(`vllm-mindspore`)是一个由 `MindSpore社区 `_ 孵化的vLLM后端插件。其将基于MindSpore构建的大模型推理能力接入 `vLLM `_ ,从而有机整合MindSpore和vLLM的技术优势,提供全栈开源、高性能、易用的大模型推理解决方案。 +vLLM-MindSpore插件(`vllm-mindspore`)是一个由 `MindSpore社区 `_ 孵化的vLLM后端插件。其将基于MindSpore构建的大模型推理能力接入 `vLLM `_ ,从而有机整合MindSpore和vLLM的技术优势,提供全栈开源、高性能、易用的大模型推理解决方案。 vLLM是由加州大学伯克利分校Sky Computing Lab创建的社区开源项目,已广泛用于学术研究和工业应用。vLLM以Continuous Batching调度机制和PagedAttention Key-Value缓存管理为基础,提供了丰富的推理服务功能,包括投机推理、Prefix Caching、Multi-LoRA等。同时,vLLM已支持种类丰富的开源大模型,包括Transformer类(如LLaMa)、混合专家类(如DeepSeek)、Embedding类(如E5-Mistral)、多模态类(如LLaVA)等。由于vLLM选用PyTorch构建大模型和管理计算存储资源,此前无法使用其部署基于MindSpore大模型的推理服务。 -vLLM MindSpore插件以将MindSpore大模型接入vLLM,并实现服务化部署为功能目标。其遵循以下设计原则: +vLLM-MindSpore插件以将MindSpore大模型接入vLLM,并实现服务化部署为功能目标。其遵循以下设计原则: - 
接口兼容:支持vLLM原生的API和服务部署接口,避免新增配置文件或接口,降低用户学习成本和确保易用性。 - 最小化侵入式修改:尽可能避免侵入式修改vLLM代码,以保障系统的可维护性和可演进性。 - 组件解耦:最小化和规范化MindSpore大模型组件和vLLM服务组件的耦合面,以利于多种MindSpore大模型套件接入。 -基于上述设计原则,vLLM MindSpore采用如下图所示的系统架构,分组件类别实现vLLM与MindSpore的对接: +基于上述设计原则,vLLM-MindSpore插件采用如下图所示的系统架构,分组件类别实现vLLM与MindSpore的对接: - 服务化组件:通过将LLM Engine、Scheduler等服务化组件中的PyTorch API调用映射至MindSpore能力调用,继承支持包括Continuous Batching、PagedAttention在内的服务化功能。 - 大模型组件:通过注册或替换模型、网络层、自定义算子等组件,将MindSpore Transformers、MindSpore One等MindSpore大模型套件和自定义大模型接入vLLM。 @@ -28,7 +28,7 @@ vLLM MindSpore插件以将MindSpore大模型接入vLLM,并实现服务化部 -vLLM MindSpore采用vLLM社区推荐的插件机制,实现能力注册。未来期望遵循 `RPC Multi-framework support for vllm `_ 所述原则。 +vLLM-MindSpore插件采用vLLM社区推荐的插件机制,实现能力注册。未来期望遵循 `RPC Multi-framework support for vllm `_ 所述原则。 代码仓地址: @@ -41,8 +41,8 @@ vLLM MindSpore采用vLLM社区推荐的插件机制,实现能力注册。未 * Python >= 3.9, < 3.12 * CANN >= 8.0.0.beta1 - * MindSpore (与vLLM MindSpore版本配套) - * vLLM (与vLLM MindSpore版本配套) + * MindSpore (与vLLM-MindSpore插件版本配套) + * vLLM (与vLLM-MindSpore插件版本配套) 快速体验 ----------------------------------------------------- @@ -56,9 +56,9 @@ vLLM MindSpore采用vLLM社区推荐的插件机制,实现能力注册。未 分支策略 ----------------------------------------------------- -vLLM MindSpore代码仓包含主干分支、开发分支、版本分支: +vLLM-MindSpore插件代码仓包含主干分支、开发分支、版本分支: -- **main**: 主干分支,与MindSpore master分支和vLLM v0.8.3版本配套,并通过昇腾+昇思CI持续进行质量看护; +- **main**: 主干分支,与MindSpore master分支和vLLM v0.9.1版本配套,并通过昇腾+昇思CI持续进行质量看护; - **develop**: 开发分支,在vLLM部分新版本发布时从主干分支拉出,用于开发适配vLLM的新功能特性。待特性适配稳定后合入主干分支。当前开发分支正在适配vLLM v0.9.1版本; - **rX.Y.Z**: 版本分支,在完成vLLM某版本适配后,从主干分支拉出,用于正式版本发布归档。 @@ -72,7 +72,7 @@ vLLM MindSpore代码仓包含主干分支、开发分支、版本分支: - 备注 * - master - Maintained - - 基于vLLM v0.8.3版本和MindSpore master分支CI看护 + - 基于vLLM v0.9.1版本和MindSpore master分支CI看护 * - develop - Maintained - 基于vLLM v0.9.1版本 @@ -82,6 +82,9 @@ vLLM MindSpore代码仓包含主干分支、开发分支、版本分支: * - r0.2 - Maintained - 基于vLLM v0.7.3版本和MindSpore 2.6.0版本CI看护 + * - r0.3.0 + - Maintained + - 基于vLLM v0.7.3版本和MindSpore 2.7.0版本CI看护 SIG组织 ----------------------------------------------------- diff --git a/docs/vllm_mindspore/docs/source_zh_cn/release_notes/release_notes.md b/docs/vllm_mindspore/docs/source_zh_cn/release_notes/release_notes.md index f8bd01f5a0644758a34bf307f9bbc4c4413fa21c..8b6c39dbc68b2a43b5c1c2674e1a3ce1a203764f 100644 --- a/docs/vllm_mindspore/docs/source_zh_cn/release_notes/release_notes.md +++ b/docs/vllm_mindspore/docs/source_zh_cn/release_notes/release_notes.md @@ -2,9 +2,9 @@ [![查看源文件](https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/master/resource/_static/logo_source.svg)](https://gitee.com/mindspore/docs/blob/master/docs/vllm_mindspore/docs/source_zh_cn/release_notes/release_notes.md) -## vLLM MindSpore 0.3.0 Release Notes +## vLLM-MindSpore插件 0.3.0 Release Notes -以下为vLLM MindSpore插件0.3.0版本支持的关键新功能和模型。 +以下为vLLM-MindSpore插件0.3.0版本支持的关键新功能和模型。 ### 新特性 diff --git a/docs/vllm_mindspore/docs/source_zh_cn/user_guide/environment_variables/environment_variables.md b/docs/vllm_mindspore/docs/source_zh_cn/user_guide/environment_variables/environment_variables.md index 7fd53b3ff3ee7ca8084c66a5942184e2a1fdff73..d3f594eb29758a95e60a7035a42234fa2582d42e 100644 --- a/docs/vllm_mindspore/docs/source_zh_cn/user_guide/environment_variables/environment_variables.md +++ b/docs/vllm_mindspore/docs/source_zh_cn/user_guide/environment_variables/environment_variables.md @@ -4,13 +4,20 @@ | 环境变量 | 功能 | 类型 | 取值 | 说明 | | ------ | ------- | ------ | ------ | ------ | -| vLLM_MODEL_BACKEND | 用于指定模型后端。使用vLLM MindSpore原生模型后端时无需指定;使用模型为vLLM MindSpore外部后端时则需要指定。 | String | `MindFormers`: 
模型后端为MindSpore Transformers。 | 原生模型后端当前支持Qwen2.5系列;MindSpore Transformers模型后端支持Qwen系列、DeepSeek、Llama系列模型,使用时需配置环境变量:`export PYTHONPATH=/path/to/mindformers/:$PYTHONPATH`。 | -| MINDFORMERS_MODEL_CONFIG | MindSpore Transformers模型的配置文件。使用Qwen2.5系列、DeepSeek系列模型时,需要配置文件路径。 | String | 模型配置文件路径。 | **该环境变量在后续版本会被移除。** 样例:`export MINDFORMERS_MODEL_CONFIG=/path/to/research/deepseek3/deepseek_r1_671b/predict_deepseek_r1_671b_w8a8.yaml`。 | -| GLOO_SOCKET_IFNAME | 用于多机之间使用gloo通信时的网口名称。 | String | 网口名称,例如enp189s0f0。 | 多机场景使用,可通过`ifconfig`查找ip对应网卡的网卡名。 | -| TP_SOCKET_IFNAME | 用于多机之间使用TP通信时的网口名称。 | String | 网口名称,例如enp189s0f0。 | 多机场景使用,可通过`ifconfig`查找ip对应网卡的网卡名。 | -| HCCL_SOCKET_IFNAME | 用于多机之间使用HCCL通信时的网口名称。 | String | 网口名称,例如enp189s0f0。 | 多机场景使用,可通过`ifconfig`查找ip对应网卡的网卡名。 | -| ASCEND_RT_VISIBLE_DEVICES | 指定哪些Device对当前进程可见,支持一次指定一个或多个Device ID。 | String | 为Device ID,逗号分割的字符串,例如"0,1,2,3,4,5,6,7"。 | ray使用场景建议使用。 | -| HCCL_BUFFSIZE | 此环境变量用于控制两个NPU之间共享数据的缓存区大小。 | int | 缓存区大小,大小为MB。例如:`2048`。 | 使用方法参考:[HCCL_BUFFSIZE](https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/81RC1beta1/maintenref/envvar/envref_07_0080.html)。例如DeepSeek 混合并行(数据并行数为32,专家并行数为32),且`max-num-batched-tokens`为256时,则`export HCCL_BUFFSIZE=2048`。 | -| MS_MEMPOOL_BLOCK_SIZE | 设置PyNative模式下设备内存池的块大小。 | String | 正整数string,单位为GB。 | | -| vLLM_USE_NPU_ADV_STEP_FLASH_OP | 是否使用昇腾`adv_step_flash`算子。 | String | `on`: 使用;`off`:不使用 | 取值为`off`时,将使用小算子实现替代`adv_step_flash`算子。 | -| VLLM_TORCH_PROFILER_DIR | 开启profiling采集数据,当配置了采集数据保存路径后生效 | String | Profiling数据保存路径。| | +| `vLLM_MODEL_BACKEND` | 用于指定模型后端。使用vLLM-MindSpore插件原生模型后端时无需指定;使用模型为vLLM-MindSpore插件外部后端时则需要指定。 | String | `MindFormers`: 模型后端为MindSpore Transformers。 | 原生模型后端当前支持Qwen2.5系列;MindSpore Transformers模型后端支持Qwen系列、DeepSeek、Llama系列模型,使用时需配置环境变量:`export PYTHONPATH=/path/to/mindformers/:$PYTHONPATH`。 | +| `MINDFORMERS_MODEL_CONFIG` | MindSpore Transformers模型的配置文件。使用Qwen2.5系列、DeepSeek系列模型时,需要配置文件路径。 | String | 模型配置文件路径。 | **该环境变量在后续版本会被移除。** 样例:`export MINDFORMERS_MODEL_CONFIG=/path/to/research/deepseek3/deepseek_r1_671b/predict_deepseek_r1_671b_w8a8.yaml`。 | +| `GLOO_SOCKET_IFNAME` | 用于多机之间使用gloo通信时的网口名称。 | String | 网口名称,例如enp189s0f0。 | 多机场景使用,可通过`ifconfig`查找ip对应网卡的网卡名。 | +| `TP_SOCKET_IFNAME` | 用于多机之间使用TP通信时的网口名称。 | String | 网口名称,例如enp189s0f0。 | 多机场景使用,可通过`ifconfig`查找ip对应网卡的网卡名。 | +| `HCCL_SOCKET_IFNAME` | 用于多机之间使用HCCL通信时的网口名称。 | String | 网口名称,例如enp189s0f0。 | 多机场景使用,可通过`ifconfig`查找ip对应网卡的网卡名。 | +| `ASCEND_RT_VISIBLE_DEVICES` | 指定哪些Device对当前进程可见,支持一次指定一个或多个Device ID。 | String | 为Device ID,逗号分割的字符串,例如"0,1,2,3,4,5,6,7"。 | ray使用场景建议使用。 | +| `HCCL_BUFFSIZE` | 此环境变量用于控制两个NPU之间共享数据的缓存区大小。 | Integer | 缓存区大小,大小为MB。例如:`2048`。 | 使用方法参考:[HCCL_BUFFSIZE](https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/81RC1beta1/maintenref/envvar/envref_07_0080.html)。例如DeepSeek 混合并行(数据并行数为32,专家并行数为32),且`max-num-batched-tokens`为256时,则`export HCCL_BUFFSIZE=2048`。 | +| `MS_MEMPOOL_BLOCK_SIZE` | 设置PyNative模式下设备内存池的块大小。 | String | 正整数string,单位为GB。 | | +| `vLLM_USE_NPU_ADV_STEP_FLASH_OP` | 是否使用昇腾`adv_step_flash`算子。 | String | `on`: 使用;`off`:不使用 | 取值为`off`时,将使用小算子实现替代`adv_step_flash`算子。 | +| `VLLM_TORCH_PROFILER_DIR` | 开启profiling采集数据,当配置了采集数据保存路径后生效 | String | Profiling数据保存路径。| | + +更多的环境变量信息,请查看: + +- [CANN 环境变量列表](https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/81RC1beta1/index/index.html) +- [MindSpore 环境变量列表](https://www.mindspore.cn/docs/zh-CN/master/api_python/env_var_list.html) +- [MindSpore Transformers 环境变量列表](https://www.mindspore.cn/mindformers/docs/zh-CN/master/index.html) +- 
[vLLM 环境变量列表](https://docs.vllm.ai/en/v0.8.4/serving/env_vars.html) diff --git a/docs/vllm_mindspore/docs/source_zh_cn/user_guide/supported_features/benchmark/benchmark.md b/docs/vllm_mindspore/docs/source_zh_cn/user_guide/supported_features/benchmark/benchmark.md index 15f104069996c5c0c4e32fbfdf909a2269f4074a..390f5e6291ca0b7048edd83bf165e1f08ba78e42 100644 --- a/docs/vllm_mindspore/docs/source_zh_cn/user_guide/supported_features/benchmark/benchmark.md +++ b/docs/vllm_mindspore/docs/source_zh_cn/user_guide/supported_features/benchmark/benchmark.md @@ -2,16 +2,14 @@ [![查看源文件](https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/master/resource/_static/logo_source.svg)](https://gitee.com/mindspore/docs/blob/master/docs/vllm_mindspore/docs/source_zh_cn/user_guide/supported_features/benchmark/benchmark.md) -vLLM MindSpore的性能测试能力,继承自vLLM所提供的性能测试能力,详情可参考[vLLM BenchMark](https://github.com/vllm-project/vllm/blob/main/benchmarks/README.md)文档。该文档将介绍[在线性能测试](#在线性能测试)与[离线性能测试](#离线性能测试),用户可以根据所介绍步骤进行性能测试。 +vLLM-MindSpore插件的性能测试能力,继承自vLLM所提供的性能测试能力,详情可参考[vLLM BenchMark](https://github.com/vllm-project/vllm/blob/main/benchmarks/README.md)文档。该文档将介绍[在线性能测试](#在线性能测试)与[离线性能测试](#离线性能测试),用户可以根据所介绍步骤进行性能测试。 ## 在线性能测试 若用户使用单卡推理,以[Qwen2.5-7B](https://huggingface.co/Qwen/Qwen2.5-7B-Instruct)为例,可按照文档[单卡推理(Qwen2.5-7B)](../../../getting_started/tutorials/qwen2.5_7b_singleNPU/qwen2.5_7b_singleNPU.md#在线推理)进行环境准备,设置以下环境变量: ```bash -export ASCEND_TOTAL_MEMORY_GB=64 # Please use `npu-smi info` to check the memory. export vLLM_MODEL_BACKEND=MindFormers # use MindSpore Transformers as model backend. -export vLLM_MODEL_MEMORY_USE_GB=32 # Memory reserved for model execution. Set according to the model's maximum usage, with the remaining environment used for kvcache allocation export MINDFORMERS_MODEL_CONFIG=$YAML_PATH # Set the corresponding MindSpore Transformers model's YAML file. ``` @@ -37,7 +35,7 @@ INFO: Waiting for application startup. INFO: Application startup complete. ``` -拉取vLLM代码仓,导入vLLM MindSpore插件,复用其中benchmark功能: +拉取vLLM代码仓,导入vLLM-MindSpore插件,复用其中benchmark功能: ```bash export VLLM_BRANCH=v0.9.1 @@ -46,7 +44,7 @@ cd vllm sed -i '1i import vllm_mindspore' benchmarks/benchmark_serving.py ``` -其中,`VLLM_BRANCH`为vLLM的分支名,其需要与vLLM MindSpore相配套。配套关系可以参考[这里](../../../getting_started/installation/installation.md#版本配套)。 +其中,`VLLM_BRANCH`为vLLM的分支名,其需要与vLLM-MindSpore插件相配套。配套关系可以参考[这里](../../../getting_started/installation/installation.md#版本配套)。 执行测试脚本: @@ -104,13 +102,11 @@ P99 ITL (ms): .... 用户使用离线性能测试时,以[Qwen2.5-7B](https://huggingface.co/Qwen/Qwen2.5-7B-Instruct)为例,可按照文档[单卡推理(Qwen2.5-7B)](../../../getting_started/tutorials/qwen2.5_7b_singleNPU/qwen2.5_7b_singleNPU.md#离线推理)进行环境准备,设置以下环境变量: ```bash -export ASCEND_TOTAL_MEMORY_GB=64 # Please use `npu-smi info` to check the memory. export vLLM_MODEL_BACKEND=MindFormers # use MindSpore Transformers as model backend. -export vLLM_MODEL_MEMORY_USE_GB=32 # Memory reserved for model execution. Set according to the model's maximum usage, with the remaining environment used for kvcache allocation export MINDFORMERS_MODEL_CONFIG=$YAML_PATH # Set the corresponding MindSpore Transformers model's YAML file. 
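# Example (illustrative only): for the Qwen2.5-7B benchmark, YAML_PATH can point to the corresponding
# MindSpore Transformers yaml, e.g. /path/to/mindformers/research/qwen2_5/predict_qwen2_5_7b_instruct.yaml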
``` -并拉取vLLM代码仓,导入vLLM MindSpore插件,复用其中benchmark功能: +并拉取vLLM代码仓,导入vLLM-MindSpore插件,复用其中benchmark功能: ```bash export VLLM_BRANCH=v0.9.1 @@ -119,7 +115,7 @@ cd vllm sed -i '1i import vllm_mindspore' benchmarks/benchmark_throughput.py ``` -其中,`VLLM_BRANCH`为vLLM的分支名,其需要与vLLM MindSpore相配套。配套关系可以参考[这里](../../../getting_started/installation/installation.md#版本配套)。 +其中,`VLLM_BRANCH`为vLLM的分支名,其需要与vLLM-MindSpore插件相配套。配套关系可以参考[这里](../../../getting_started/installation/installation.md#版本配套)。 用户可通过以下命令,运行测试脚本。该脚本将启动模型,并执行测试,用户不需要再拉起模型: diff --git a/docs/vllm_mindspore/docs/source_zh_cn/user_guide/supported_features/features_list/features_list.md b/docs/vllm_mindspore/docs/source_zh_cn/user_guide/supported_features/features_list/features_list.md index 910550afccb3eadebc898debd958050347363435..baa7edf953b50c459d43556e5bb21f833c394382 100644 --- a/docs/vllm_mindspore/docs/source_zh_cn/user_guide/supported_features/features_list/features_list.md +++ b/docs/vllm_mindspore/docs/source_zh_cn/user_guide/supported_features/features_list/features_list.md @@ -2,9 +2,9 @@ [![查看源文件](https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/master/resource/_static/logo_source.svg)](https://gitee.com/mindspore/docs/blob/master/docs/vllm_mindspore/docs/source_zh_cn/user_guide/supported_features/features_list/features_list.md) -vLLM MindSpore支持的特性功能与vLLM社区版本保持一致,特性描述和使用请参考[vLLM官方资料](https://docs.vllm.ai/en/latest/)。 +vLLM-MindSpore插件支持的特性功能与vLLM社区版本保持一致,特性描述和使用请参考[vLLM官方资料](https://docs.vllm.ai/en/latest/)。 -以下是vLLM MindSpore的功能支持状态: +以下是vLLM-MindSpore插件的功能支持状态: | **功能** | **vLLM V0** | **vLLM V1** | |-----------------------------------|--------------------|--------------------| @@ -39,5 +39,5 @@ vLLM MindSpore支持的特性功能与vLLM社区版本保持一致,特性描 ## 特性说明 -- LoRA目前仅支持Qwen2.5 vLLM MindSpore原生模型,其他模型正在适配中; +- LoRA目前仅支持Qwen2.5 vLLM-MindSpore插件原生模型,其他模型正在适配中; - Tool Calling目前已支持DeepSeek V3 0324 W8A8模型。 diff --git a/docs/vllm_mindspore/docs/source_zh_cn/user_guide/supported_features/profiling/profiling.md b/docs/vllm_mindspore/docs/source_zh_cn/user_guide/supported_features/profiling/profiling.md index 4dc4f2ccee29c5c3d3842ba62e5381c4e01834d4..2df27851163daa02598b14561bd068f8c100615f 100644 --- a/docs/vllm_mindspore/docs/source_zh_cn/user_guide/supported_features/profiling/profiling.md +++ b/docs/vllm_mindspore/docs/source_zh_cn/user_guide/supported_features/profiling/profiling.md @@ -2,7 +2,7 @@ [![查看源文件](https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/master/resource/_static/logo_source.svg)](https://gitee.com/mindspore/docs/blob/master/docs/vllm_mindspore/docs/source_zh_cn/user_guide/supported_features/profiling/profiling.md) -vLLM MindSpore支持使用`mindspore.Profiler`模块,跟踪vLLM MindSpore中worker的性能。用户可以根据[采集profiling数据](#采集profiling数据)章节,在完成数据采集后,根据[分析profiling数据](#分析profiling数据),进行数据分析。另一方面,用户可以根据[图数据dump](#图数据dump),查看模型的IR图,从而进行对模型结构的分析与调试。 +vLLM-MindSpore插件支持使用`mindspore.Profiler`模块,跟踪vLLM-MindSpore插件中worker的性能。用户可以根据[采集profiling数据](#采集profiling数据)章节,在完成数据采集后,根据[分析profiling数据](#分析profiling数据),进行数据分析。另一方面,用户可以根据[图数据dump](#图数据dump),查看模型的IR图,从而进行对模型结构的分析与调试。 ## 采集profiling数据 @@ -12,7 +12,7 @@ vLLM MindSpore支持使用`mindspore.Profiler`模块,跟踪vLLM MindSpore中wo export VLLM_TORCH_PROFILER_DIR=/path/to/save/vllm_profile ``` -设置完成后,以[Qwen2.5-32B](https://huggingface.co/Qwen/Qwen2.5-32B-Instruct) 为例,启动vLLM MindSpore服务: +设置完成后,以[Qwen2.5-32B](https://huggingface.co/Qwen/Qwen2.5-32B-Instruct) 为例,启动vLLM-MindSpore插件服务: ```bash export TENSOR_PARALLEL_SIZE=4 @@ -40,7 +40,7 @@ curl -X POST 
http://127.0.0.1:8000/start_profile curl http://localhost:8000/v1/completions \ -H "Content-Type: application/json" \ -d '{ - "model": "/home/DeepSeekV3", + "model": "Qwen/Qwen2.5-32B-Instruct", "prompt": "San Francisco is a", "max_tokens": 7, "temperature": 0 diff --git a/docs/vllm_mindspore/docs/source_zh_cn/user_guide/supported_features/quantization/quantization.md b/docs/vllm_mindspore/docs/source_zh_cn/user_guide/supported_features/quantization/quantization.md index 54ad35032dbdcb594ef4ae2b847beb5d41f701fe..0f11e66c83578fbef359737f054a10a1835da3b8 100644 --- a/docs/vllm_mindspore/docs/source_zh_cn/user_guide/supported_features/quantization/quantization.md +++ b/docs/vllm_mindspore/docs/source_zh_cn/user_guide/supported_features/quantization/quantization.md @@ -16,7 +16,7 @@ ### 直接下载量化权重 -我们已经将量化好的DeepSeek-R1上传到[魔乐社区](https://modelers.cn):[MindSpore-Lab/DeepSeek-R1-W8A8](https://modelers.cn/models/MindSpore-Lab/DeepSeek-R1-W8A8),可以参考[魔乐社区文档](https://modelers.cn/docs/zh/openmind-hub-client/0.9/basic_tutorial/download.html)将权重下载到本地。 +我们已经将量化好的DeepSeek-R1上传到[魔乐社区](https://modelers.cn):[MindSpore-Lab/DeepSeek-R1-0528-A8W8](https://modelers.cn/models/MindSpore-Lab/DeepSeek-R1-0528-A8W8),可以参考[魔乐社区文档](https://modelers.cn/docs/zh/openmind-hub-client/0.9/basic_tutorial/download.html)将权重下载到本地。 ## 量化模型推理 @@ -24,12 +24,10 @@ ### 离线推理 -用户可以参考[安装指南](../../../getting_started/installation/installation.md),进行vLLM MindSpore的环境搭建。用户需设置以下环境变量: +用户可以参考[安装指南](../../../getting_started/installation/installation.md),进行vLLM-MindSpore插件的环境搭建。用户需设置以下环境变量: ```bash -export ASCEND_TOTAL_MEMORY_GB=64 # Please use `npu-smi info` to check the memory. export vLLM_MODEL_BACKEND=MindFormers # use MindSpore Transformers as model backend. -export vLLM_MODEL_MEMORY_USE_GB=32 # Memory reserved for model execution. Set according to the model's maximum usage, with the remaining environment used for kvcache allocation export MINDFORMERS_MODEL_CONFIG=$YAML_PATH # Set the corresponding MindSpore Transformers model's YAML file. 
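# Example (illustrative only): for DeepSeek-R1 W8A8, YAML_PATH can point to the corresponding
# MindSpore Transformers yaml, e.g. /path/to/research/deepseek3/deepseek_r1_671b/predict_deepseek_r1_671b_w8a8.yaml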
```
diff --git a/docs/vllm_mindspore/docs/source_zh_cn/user_guide/supported_models/models_list/models_list.md b/docs/vllm_mindspore/docs/source_zh_cn/user_guide/supported_models/models_list/models_list.md
index 2e504c0fecd265842d02a1474e0f77fdfe151eac..c64725c9e1e448999d189ec22aeab0942f012299 100644
--- a/docs/vllm_mindspore/docs/source_zh_cn/user_guide/supported_models/models_list/models_list.md
+++ b/docs/vllm_mindspore/docs/source_zh_cn/user_guide/supported_models/models_list/models_list.md
@@ -6,7 +6,7 @@
|-------| --------- | ---- |
| DeepSeek-V3 | 已支持 | [DeepSeek-V3](https://modelers.cn/models/MindSpore-Lab/DeepSeek-V3) |
| DeepSeek-R1 | 已支持 | [DeepSeek-R1](https://huggingface.co/deepseek-ai/DeepSeek-R1) |
-| DeepSeek-R1 W8A8 | 已支持 | [Deepseek-R1-W8A8](https://modelers.cn/models/MindSpore-Lab/DeepSeek-r1-w8a8) |
+| DeepSeek-R1 W8A8 | 已支持 | [DeepSeek-R1-0528-A8W8](https://modelers.cn/models/MindSpore-Lab/DeepSeek-R1-0528-A8W8) |
| Qwen2.5 | 已支持 | [Qwen2.5-0.5B-Instruct](https://huggingface.co/Qwen/Qwen2.5-0.5B-Instruct)、[Qwen2.5-1.5B-Instruct](https://huggingface.co/Qwen/Qwen2.5-1.5B-Instruct)、[Qwen2.5-3B-Instruct](https://huggingface.co/Qwen/Qwen2.5-3B-Instruct)、[Qwen2.5-7B-Instruct](https://huggingface.co/Qwen/Qwen2.5-7B-Instruct)、[Qwen2.5-14B-Instruct](https://huggingface.co/Qwen/Qwen2.5-14B-Instruct)、[Qwen2.5-32B-Instruct](https://huggingface.co/Qwen/Qwen2.5-32B-Instruct)、[Qwen2.5-72B-Instruct](https://huggingface.co/Qwen/Qwen2.5-72B-Instruct) |
| Qwen3-32B | 已支持 | [Qwen3-32B](https://modelers.cn/models/MindSpore-Lab/Qwen3-32B) |
| Qwen3-235B-A22B | 已支持 | [Qwen3-235B-A22B](https://huggingface.co/Qwen/Qwen3-235B-A22B) |
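上表中模型的名称或其权重的本地路径,可直接作为`--model`参数传入以拉起在线服务。以下为示意命令(以Qwen2.5-7B为例,与前述单卡推理教程一致,具体参数请以实际环境为准):

```bash
python3 -m vllm_mindspore.entrypoints vllm.entrypoints.openai.api_server --model "Qwen/Qwen2.5-7B-Instruct"
```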