diff --git a/docs/vllm_mindspore/docs/requirements.txt b/docs/vllm_mindspore/docs/requirements.txt index fabeca22d66d2f21d9ccda77b5d7225ddfc3113b..9e952e9a95cffffd0351396a3066726a3f81857f 100644 --- a/docs/vllm_mindspore/docs/requirements.txt +++ b/docs/vllm_mindspore/docs/requirements.txt @@ -8,4 +8,4 @@ jieba descriptastorus == 2.6.0 sympy tqdm -sphinx-rtd-theme == 1.0.0 +sphinx-rtd-theme == 1.0.0 \ No newline at end of file diff --git a/docs/vllm_mindspore/docs/source_en/arch.png b/docs/vllm_mindspore/docs/source_en/arch.png new file mode 100644 index 0000000000000000000000000000000000000000..fc3b524ca3487ae92431c58157175b4ddcb42725 Binary files /dev/null and b/docs/vllm_mindspore/docs/source_en/arch.png differ diff --git a/docs/vllm_mindspore/docs/source_en/conf.py b/docs/vllm_mindspore/docs/source_en/conf.py new file mode 100644 index 0000000000000000000000000000000000000000..f6864f092fea0e8274bc20873bb76de694da5908 --- /dev/null +++ b/docs/vllm_mindspore/docs/source_en/conf.py @@ -0,0 +1,266 @@ +# Configuration file for the Sphinx documentation builder. +# +# This file only contains a selection of the most common options. For a full +# list see the documentation: +# https://www.sphinx-doc.org/en/master/usage/configuration.html + +# -- Path setup -------------------------------------------------------------- + +# If extensions (or modules to document with autodoc) are in another directory, +# add these directories to sys.path here. If the directory is relative to the +# documentation root, use os.path.abspath to make it absolute, like shown here. +# +import glob +import os +import shutil +import sys +import IPython +import re +import sphinx +sys.path.append(os.path.abspath('../_ext')) +import sphinx.ext.autosummary.generate as g +from sphinx.ext import autodoc as sphinx_autodoc + +# -- Project information ----------------------------------------------------- + +project = 'vLLM MindSpore' +copyright = 'MindSpore' +author = 'vLLM MindSpore' + +# The full version, including alpha/beta/rc tags +release = 'master' + +# -- General configuration --------------------------------------------------- + +# Add any Sphinx extension module names here, as strings. They can be +# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom +# ones. +myst_enable_extensions = ["dollarmath", "amsmath"] + + +myst_heading_anchors = 5 +extensions = [ + 'sphinx.ext.autodoc', + 'sphinx.ext.autosummary', + 'sphinx.ext.doctest', + 'sphinx.ext.intersphinx', + 'sphinx.ext.todo', + 'sphinx.ext.coverage', + 'sphinx.ext.napoleon', + 'sphinx.ext.viewcode', + 'myst_parser', + 'nbsphinx', + 'sphinx.ext.mathjax', + 'IPython.sphinxext.ipython_console_highlighting' +] + +source_suffix = { + '.rst': 'restructuredtext', + '.md': 'markdown', +} + +# Add any paths that contain templates here, relative to this directory. +templates_path = ['_templates'] + +# List of patterns, relative to source directory, that match files and +# directories to ignore when looking for source files. +# This pattern also affects html_static_path and html_extra_path. 
+mathjax_path = 'https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/mathjax/MathJax-3.2.2/es5/tex-mml-chtml.js' + +mathjax_options = { + 'async':'async' +} + +smartquotes_action = 'De' + +exclude_patterns = [] + +pygments_style = 'sphinx' + +autodoc_inherit_docstrings = False + +autosummary_generate = True + +autosummary_generate_overwrite = False + +# -- Options for HTML output ------------------------------------------------- + +# Reconstruction of sphinx auto generated document translation. +language = 'zh_CN' +locale_dirs = ['../../../../resource/locale/'] +gettext_compact = False + +# The theme to use for HTML and HTML Help pages. See the documentation for +# a list of builtin themes. +# +html_theme = 'sphinx_rtd_theme' + +html_search_language = 'zh' + +html_search_options = {'dict': '../../../resource/jieba.txt'} + +sys.path.append(os.path.abspath('../../../../resource/sphinx_ext')) +# import anchor_mod +import nbsphinx_mod + +# Example configuration for intersphinx: refer to the Python standard library. +intersphinx_mapping = { + 'python': ('https://docs.python.org/', '../../../../resource/python_objects.inv'), + 'numpy': ('https://docs.scipy.org/doc/numpy/', '../../../../resource/numpy_objects.inv'), +} + +from sphinx import directives +with open('../_ext/overwriteobjectiondirective.txt', 'r', encoding="utf8") as f: + exec(f.read(), directives.__dict__) + +from sphinx.ext import viewcode +with open('../_ext/overwriteviewcode.txt', 'r', encoding="utf8") as f: + exec(f.read(), viewcode.__dict__) + +with open('../_ext/customdocumenter.txt', 'r', encoding="utf8") as f: + exec(f.read(), sphinx_autodoc.__dict__) + +# Modify regex for sphinx.ext.autosummary.generate.find_autosummary_in_lines. +gfile_abs_path = os.path.abspath(g.__file__) +autosummary_re_line_old = r"autosummary_re = re.compile(r'^(\s*)\.\.\s+autosummary::\s*')" +autosummary_re_line_new = r"autosummary_re = re.compile(r'^(\s*)\.\.\s+(ms[a-z]*)?autosummary::\s*')" +with open(gfile_abs_path, "r+", encoding="utf8") as f: + data = f.read() + data = data.replace(autosummary_re_line_old, autosummary_re_line_new) + exec(data, g.__dict__) + +# Modify default signatures for autodoc. +autodoc_source_path = os.path.abspath(sphinx_autodoc.__file__) +autodoc_source_re = re.compile(r'stringify_signature\(.*?\)') +get_param_func_str = r"""\ +import re +import inspect as inspect_ + +def get_param_func(func): + try: + source_code = inspect_.getsource(func) + if func.__doc__: + source_code = source_code.replace(func.__doc__, '') + all_params_str = re.findall(r"def [\w_\d\-]+\(([\S\s]*?)(\):|\) ->.*?:)", source_code) + all_params = re.sub("(self|cls)(,|, )?", '', all_params_str[0][0].replace("\n", "").replace("'", "\"")) + return all_params + except: + return '' + +def get_obj(obj): + if isinstance(obj, type): + return obj.__init__ + + return obj +""" + +with open(autodoc_source_path, "r+", encoding="utf8") as f: + code_str = f.read() + code_str = autodoc_source_re.sub('"(" + get_param_func(get_obj(self.object)) + ")"', code_str, count=0) + exec(get_param_func_str, sphinx_autodoc.__dict__) + exec(code_str, sphinx_autodoc.__dict__) + +# Copy source files of chinese python api from mindscience repository. 
+from sphinx.util import logging +logger = logging.getLogger(__name__) + +# copy_path = 'docs/api_python/mindchemistry' +# src_dir = os.path.join(os.getenv("VLLM_PATH"), copy_path) + +copy_list = [] + +present_path = os.path.dirname(__file__) + +# for i in os.listdir(src_dir): +# if os.path.isfile(os.path.join(src_dir,i)): +# if os.path.exists('./'+i): +# os.remove('./'+i) +# shutil.copy(os.path.join(src_dir,i),'./'+i) +# copy_list.append(os.path.join(present_path,i)) +# else: +# if os.path.exists('./'+i): +# shutil.rmtree('./'+i) +# shutil.copytree(os.path.join(src_dir,i),'./'+i) +# copy_list.append(os.path.join(present_path,i)) + +# add view +import json + +with open('../../../../tools/generate_html/daily.json', 'r+', encoding='utf-8') as f: + version_inf = json.load(f) + +# if os.getenv("VLLM_PATH").split('/')[-1]: +# copy_repo = os.getenv("VLLM_PATH").split('/')[-1] +# else: +# copy_repo = os.getenv("VLLM_PATH").split('/')[-2] + +# import pdb +# pdb.set_trace() + +# branch = [version_inf[i]['branch'] for i in range(len(version_inf)) if version_inf[i]['name'] == copy_repo][0] +# docs_branch = [version_inf[i]['branch'] for i in range(len(version_inf)) if version_inf[i]['name'] == 'tutorials'][0] + +# re_view = f"\n.. image:: https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/{docs_branch}/" + \ +# f"resource/_static/logo_source_en.svg\n :target: https://gitee.com/mindspore/{copy_repo}/blob/{branch}/" + +# for cur, _, files in os.walk(present_path): +# for i in files: +# flag_copy = 0 +# if i.endswith('.rst'): +# for j in copy_list: +# if j in cur: +# flag_copy = 1 +# break +# if os.path.join(cur, i) in copy_list or flag_copy: +# try: +# with open(os.path.join(cur, i), 'r+', encoding='utf-8') as f: +# content = f.read() +# new_content = content +# if '.. include::' in content and '.. 
automodule::' in content: +# continue +# if 'autosummary::' not in content and "\n=====" in content: +# re_view_ = re_view + copy_path + cur.split(present_path)[-1] + '/' + i + \ +# '\n :alt: 查看源文件\n\n' +# new_content = re.sub('([=]{5,})\n', r'\1\n' + re_view_, content, 1) +# print("re_view_") +# print(re_view_) +# if new_content != content: +# f.seek(0) +# f.truncate() +# f.write(new_content) +# except Exception: +# print(f'打开{i}文件失败') + + +# import vllm_mindspore + +sys.path.append(os.path.abspath('../../../../resource/search')) +import search_code + +sys.path.append(os.path.abspath('../../../../resource/custom_directives')) +from custom_directives import IncludeCodeDirective +from myautosummary import MsPlatformAutoSummary, MsCnPlatformAutoSummary + +rst_files = set([i.replace('.rst', '') for i in glob.glob('./**/*.rst', recursive=True)]) + +def setup(app): + app.add_directive('msplatformautosummary', MsPlatformAutoSummary) + app.add_directive('mscnplatformautosummary', MsCnPlatformAutoSummary) + app.add_directive('includecode', IncludeCodeDirective) + app.add_config_value('rst_files', set(), False) + +src_release = "./release_notes/release_notes.md" +des_release = "./RELEASE.md" +with open(src_release, "r", encoding="utf-8") as f: + data = f.read() +if len(re.findall("\n## (.*?)\n",data)) > 1: + content = re.findall("(## [\s\S\n]*?)\n## ", data) +else: + content = re.findall("(## [\s\S\n]*)", data) + +with open(des_release, "w", encoding="utf-8") as p: + p.write("# Release Notes"+"\n\n") + p.write(content[0]) + +os.makedirs(os.path.join(present_path, "../build_en/html/"), exist_ok=True) +shutil.copy(os.path.join(present_path, "arch.png"), os.path.join(present_path, "../build_en/html/")) diff --git a/docs/vllm_mindspore/docs/source_en/developer_guide/contributing.md b/docs/vllm_mindspore/docs/source_en/developer_guide/contributing.md new file mode 100644 index 0000000000000000000000000000000000000000..bd4e479bd76671d41d7641cec13537a5b3187f4b --- /dev/null +++ b/docs/vllm_mindspore/docs/source_en/developer_guide/contributing.md @@ -0,0 +1,95 @@ +# Contribution Guidelines + +[![View Source](https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/master/resource/_static/logo_source_en.svg)](https://gitee.com/mindspore/docs/blob/master/docs/vllm_mindspore/docs/source_en/developer_guide/contributing.md) + +## Contributor License Agreement + +Before submitting code to the MindSpore community, you need to sign the Contributor License Agreement (CLA). Individual contributors should refer to the [ICLA Online Document](https://www.mindspore.cn/icla). + +## Quick Start + +- Fork the repository on [Gitee](https://gitee.com/mindspore/vllm-mindspore). +- Refer to [README.md](https://gitee.com/mindspore/vllm-mindspore/blob/master/README.md) and the installation page for project information and build instructions. + +## Supporting New Models + +To support a new model for vLLM MindSpore code repository, please note the following: + +- **Follow file format and location specifications.** Model code files should be placed under the `vllm_mindspore/model_executor` directory, organized in corresponding subfolders by model type. +- **Implement models using MindSpore interfaces with jit static graph support.** Model definitions in vLLM MindSpore must be implemented using MindSpore interfaces. Since MindSpore's static graph mode offers performance advantages, models should support execution via @jit static graphs. 
For reference, see the [Qwen2.5](https://gitee.com/mindspore/vllm-mindspore/blob/master/vllm_mindspore/model_executor/models/qwen2.py) implementation. +- **Register new models in vLLM MindSpore.** After implementing the model structure, register it in vLLM MindSpore by adding it to `_NATIVE_MODELS` in `vllm_mindspore/model_executor/models/registry.py`. +- **Write unit tests.** New models must include corresponding unit tests. Refer to the [Qwen2.5 testcases](https://gitee.com/mindspore/vllm-mindspore/blob/master/tests/st/python/test_vllm_qwen_7b.py) for examples. + +## Contribution Process + +### Code Style + +Follow these guidelines for community code review, maintenance, and development. + +- **Coding Standards:** Use vLLM community code checking tools: yapf, codespell, ruff, isort, and mypy. For more details, see the [Toolchain Usage Guide](https://gitee.com/mindspore/vllm-mindspore/blob/master/codecheck_toolkits/README.md). +- **Unit Testing Guidelines:** vLLM MindSpore uses the [pytest](http://www.pytest.org/en/latest/) framework. Test names should clearly reflect their purpose. +- **Refactoring Guidelines:** Developers are encouraged to refactor code to eliminate [code smells](https://en.wikipedia.org/wiki/Code_smell). All code, including refactored code, must adhere to coding and testing standards. + +### Fork-Pull Development Model + +- **Fork the vLLM MindSpore Repository:** Before submitting code, fork the project to your own repository. Ensure consistency between the vLLM MindSpore repository and your fork during parallel development. + +- **Clone the Remote Repository:** users can use git to pull the source code: + + ```shell + # On Gitee: + git clone https://gitee.com/{insert_your_forked_repo}/vllm-mindspore.git + git remote add upstream https://gitee.com/mindspore/vllm-mindspore.git + ``` + +- **Local Development:** To avoid branch inconsistencies, switch to a new branch: + + ```shell + git checkout -b {new_branch_name} origin/master + ``` + + For version branches or downstream development, fix upstream bugs before modifying code. +- **Push Changes to Remote Repository:** After updating the code, push changes: + + ```shell + git add . + git status # Check update status. + git commit -m "Your commit title" + git commit -s --amend # Add detailed commit description. + git push origin {new_branch_name} + ``` + +- **Create a Pull Request to vLLM MindSpore:** Compare and create a PR between your branch and the vLLM MindSpore master branch. After submission, manually trigger CI checks with `/retest` in the comments. PRs should be merged into upstream master promptly to minimize merge risks. + +### Reporting Issues + +To contribute by reporting issues, follow these guidelines: + +- Specify your environment versions (vLLM MindSpore, MindFormers, MindSpore, OS, Python, etc.). +- Indicate whether it's a bug report or feature request. +- Label the issue type for visibility on the issue board. +- Describe the problem and expected resolution. +- Provide detailed reproduction steps. +- Add special notes for reviewers. + +**Issue Notes:** + +- **Comment first when processing an issue,** inform others that you would start to fix this issue. +- **For long-unresolved issues**, verify the problem before attempting a fix. +- **If you resolve your own reported issue**, notify others before closing it. + +### Submitting PRs + +- For major new features, include a design proposal. +- After consensus via issue discussion and design review, develop in your fork and submit a PR. 
+- Each PR requires at least two LGTM labels from reviewers (excluding the PR author). +- After thorough discussion, the PR will be merged, abandoned, or rejected based on the outcome. + +**PR Notes:** + +- Avoid unrelated changes. +- Maintain clean commit history. +- Keep your branch synchronized with master. +- For bug-fix PRs, ensure all related issues are referenced. + +Thank you for your interest in contributing to vLLM MindSpore. We welcome and value all forms of collaboration. diff --git a/docs/vllm_mindspore/docs/source_en/faqs/faqs.md b/docs/vllm_mindspore/docs/source_en/faqs/faqs.md new file mode 100644 index 0000000000000000000000000000000000000000..2d27c2c672f8675fbb0af87e8b1da5adb3708eb2 --- /dev/null +++ b/docs/vllm_mindspore/docs/source_en/faqs/faqs.md @@ -0,0 +1,86 @@ +# Frequently Asked Questions + +[![View Source](https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/master/resource/_static/logo_source_en.svg)](https://gitee.com/mindspore/docs/blob/master/docs/vllm_mindspore/docs/source_en/faqs/faqs.md) + +## Model-related Issues + +### Git-LFS Installation + +1. Obtain the corresponding [git-lfs installation package](https://github.com/git-lfs/git-lfs/releases/tag/v3.0.1) from the following link. +2. Download and install: + + ```shell + mkdir git-lfs + cd git-lfs + wget https://github.com/git-lfs/git-lfs/releases/download/v3.0.1/git-lfs-linux-arm64-v3.0.1.tar.gz --no-check-certificate + tar zxvf git-lfs-linux-arm64-v3.0.1.tar.gz + bash install.sh + ``` + +3. Verify successful installation: + + ```shell + git lfs install + ``` + + If `Git LFS initialized.` is returned, the installation was successful. + +## Deployment-related Issues + +### Model Fails to Load During Offline/Online Inference + +- Key error message: + + ```text + raise ValueError(f"{config.load_checkpoint} is not a valid path to load checkpoint ") + ``` + +- Solution: + 1. Check if the model path exists and is valid; + 2. If the model path exists and the model files are in `safetensors` format, confirm whether the yaml file contains the `load_ckpt_format: "safetensors"` field: + 1. Print the path of the yaml file used by the model: + + ```bash + echo $MINDFORMERS_MODEL_CONFIG + ``` + + 2. Check the yaml file. If the `load_ckpt_format` field is missing, add it: + + ```text + load_ckpt_format: "safetensors" + ``` + +### `aclnnNonzeroV2` Related Error When Starting Online Service + +- Key error message: + + ```text + RuntimeError: Call aclnnNonzeroV2 failed, detail:E39999: Inner Error + ``` + + Check whether the CANN and MindSpore versions are correctly matched. + +### `resolve_transformers_fallback` Import Error When Running Qwen3 + +- Key error message: + + ```text + ImportError: cannot import name 'resolve_transformers_fallback' from 'vllm.model_executor.model_loader.utils' + ``` + + Try switching `vllm` to version `0.7.3`. 
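+
+  As a quick check before switching, the currently installed vLLM version can be printed from Python. This is a minimal sketch using only the standard library; `0.7.3` is the version suggested above:
+
+  ```python
+  # Print the installed vLLM version; if it differs from the suggested
+  # version, reinstall vLLM 0.7.3 before retrying.
+  from importlib.metadata import PackageNotFoundError, version
+
+  try:
+      print("vllm version:", version("vllm"))
+  except PackageNotFoundError:
+      print("vllm is not installed in this environment")
+  ```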
+ +### `torch` Not Found When Importing `vllm_mindspore` + +- Key error message: + + ```text + importlib.metadata.PackageNotFoundError: No package metadata was found for torch + ``` + + Execute the following commands to reinstall torch-related components: + + ```bash + pip uninstall torch + pip uninstall torchvision + ``` diff --git a/docs/vllm_mindspore/docs/source_en/getting_started/installation/installation.md b/docs/vllm_mindspore/docs/source_en/getting_started/installation/installation.md new file mode 100644 index 0000000000000000000000000000000000000000..f088336aa7a953454ca0bf95d4963a7d81a83cf8 --- /dev/null +++ b/docs/vllm_mindspore/docs/source_en/getting_started/installation/installation.md @@ -0,0 +1,194 @@ +# Installation Guide + +[![View Source](https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/master/resource/_static/logo_source_en.svg)](https://gitee.com/mindspore/docs/blob/master/docs/vllm_mindspore/docs/source_en/getting_started/installation/installation.md) + +This document describes the steps to install the vLLM MindSpore environment. Three installation methods are provided: + +- [Docker Installation](#docker-installation): Suitable for quick deployment scenarios. +- [Pip Installation](#pip-installation): Suitable for scenarios requiring specific versions. +- [Source Code Installation](#source-code-installation): Suitable for incremental development of vLLM MindSpore. + +## Version Compatibility + +- OS: Linux-aarch64 +- Python: 3.9 / 3.10 / 3.11 +- Software version compatibility + + | Software | Version | Corresponding Branch | + | -------- | ------- | -------------------- | + | [CANN](https://www.hiascend.com/developer/download/community/result?module=cann) | 8.1 | - | + | [MindSpore](https://www.mindspore.cn/install/) | 2.7 | master | + | [MSAdapter](https://git.openi.org.cn/OpenI/MSAdapter) | 0.2 | master | + | [MindSpore Transformers](https://gitee.com/mindspore/mindformers) | 1.6 | br_infer_deepseek_os | + | [Golden Stick](https://gitee.com/mindspore/golden-stick) | 1.1.0 | r1.1.0 | + | [vLLM](https://github.com/vllm-project/vllm) | 0.8.3 | v0.8.3 | + | [vLLM MindSpore](https://gitee.com/mindspore/vllm-mindspore) | 0.2 | master | + +## Environment Setup + +This section introduces three installation methods: [Docker Installation](#docker-installation), [Pip Installation](#pip-installation), [Source Code Installation](#source-code-installation), and [Quick Verification](#quick-verification) example to check the installation. + +### Docker Installation + +We recommend using Docker for quick deployment of the vLLM MindSpore environment. Below are the steps: + +#### Pulling the Image + +Execute the following command to pull the vLLM MindSpore Docker image: + +```bash +docker pull hub.oepkgs.net/oedeploy/openeuler/aarch64/mindspore:latest +``` + +During the pull process, user will see the progress of each layer. 
After successful completion, check the image by executing the following command: + +```bash +docker images +``` + +#### Creating a Container + +After [pulling the image](#pulling-the-image), set `DOCKER_NAME` and `IMAGE_NAME` as the container and image names, then execute the following command to create the container: + +```bash +export DOCKER_NAME=vllm-mindspore-container # your container name +export IMAGE_NAME=hub.oepkgs.net/oedeploy/openeuler/aarch64/mindspore:latest # your image name + +docker run -itd --name=${DOCKER_NAME} --ipc=host --network=host --privileged=true \ + --device=/dev/davinci0 \ + --device=/dev/davinci1 \ + --device=/dev/davinci2 \ + --device=/dev/davinci3 \ + --device=/dev/davinci4 \ + --device=/dev/davinci5 \ + --device=/dev/davinci6 \ + --device=/dev/davinci7 \ + --device=/dev/davinci_manager \ + --device=/dev/devmm_svm \ + --device=/dev/hisi_hdc \ + -v /usr/local/sbin/:/usr/local/sbin/ \ + -v /var/log/npu/slog/:/var/log/npu/slog \ + -v /var/log/npu/profiling/:/var/log/npu/profiling \ + -v /var/log/npu/dump/:/var/log/npu/dump \ + -v /var/log/npu/:/usr/slog \ + -v /etc/hccn.conf:/etc/hccn.conf \ + -v /usr/local/bin/npu-smi:/usr/local/bin/npu-smi \ + -v /usr/local/dcmi:/usr/local/dcmi \ + -v /usr/local/Ascend/driver:/usr/local/Ascend/driver \ + -v /etc/ascend_install.info:/etc/ascend_install.info \ + -v /etc/vnpu.cfg:/etc/vnpu.cfg \ + --shm-size="250g" \ + ${IMAGE_NAME} \ + bash +``` + +The container ID will be returned if docker is created successfully. User can also check the container by executing the following command: + +```bash +docker ps +``` + +#### Entering the Container + +After [creating the container](#creating-a-container), user can start and enter the container, using the environment variable `DOCKER_NAME`: + +```bash +docker exec -it $DOCKER_NAME bash +``` + +### Pip Installation + +Use pip to install vLLM MindSpore, by executing the following command: + +```bash +pip install vllm_mindspore +``` + +### Source Code Installation + +- **CANN Installation** + For CANN installation methods and environment configuration, please refer to [CANN Community Edition Installation Guide](https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/82RC1alpha002/softwareinst/instg/instg_0001.html?Mode=PmIns&OS=openEuler&Software=cannToolKit). If you encounter any issues during CANN installation, please consult the [Ascend FAQ](https://www.hiascend.com/document/detail/zh/AscendFAQ/ProduTech/CANNFAQ/cannfaq_000.html) for troubleshooting. + + The default installation path for CANN is `/usr/local/Ascend`. After completing CANN installation, configure the environment variables with the following commands: + + ```bash + LOCAL_ASCEND=/usr/local/Ascend # the root directory of run package + source ${LOCAL_ASCEND}/ascend-toolkit/set_env.sh + export ASCEND_CUSTOM_PATH=${LOCAL_ASCEND}/ascend-toolkit + ``` + +- **vLLM Prerequisites Installation** + For vLLM environment configuration and installation methods, please refer to the [vLLM Installation Guide](https://docs.vllm.ai/en/v0.8.3/getting_started/installation/cpu.html). 
In vllM installation, `gcc/g++ >= 12.3.0` is required, and it could be installed by the following command: + + ```bash + yum install -y gcc gcc-c++ + ``` + +- **vLLM MindSpore Installation** + To install vLLM MindSpore, user needs to pull the vLLM MindSpore source code and then runs the following command to install the dependencies: + + ```bash + git clone https://gitee.com/mindspore/vllm-mindspore.git + cd vllm-mindspore + bash install_depend_pkgs.sh + ``` + + Compile and install vLLM MindSpore: + + ```bash + pip install . + ``` + + After executing the above commands, `mindformers-dev` folder will be generated in the `vllm-mindspore/install_depend_pkgs` directory. Add this folder to the environment variables: + + ```bash + export MF_PATH=`pwd install_depend_pkgs/mindformers-dev` + export PYTHONPATH=$MF_PATH:$PYTHONPATH + ``` + + If MindSpore Transformers was compiled and installed from the `br_infer_deepseek_os` branch, `mindformers-os` folder will be generated in the `vllm-mindspore/install_depend_pkgs` directory. In this case, adjust the `MF_PATH` environment variable to: + + ```bash + export MF_PATH=`pwd install_depend_pkgs/mindformers-os` + export PYTHONPATH=$MF_PATH:$PYTHONPATH + ``` + +### Quick Verification + +To verify the installation, run a simple offline inference test with [Qwen2.5-7B](https://huggingface.co/Qwen/Qwen2.5-7B-Instruct): + +```python +import vllm_mindspore # Add this line on the top of script. +from vllm import LLM, SamplingParams + +# Sample prompts. +prompts = [ + "I am", + "Today is", + "Llama is" +] + +# Create a sampling params object. +sampling_params = SamplingParams(temperature=0.0, top_p=0.95) + +# Create a LLM +llm = LLM(model="Qwen2.5-7B-Instruct") +# Generate texts from the prompts. The output is a list of RequestOutput objects +# that contain the prompt, generated text, and other information. +outputs = llm.generate(prompts, sampling_params) +# Print the outputs. +for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}. Generated text: {generated_text!r}") +``` + +If successful, the output will resemble: + +```text +Prompt: 'I am'. Generated text: ' trying to create a virtual environment for my Python project, but I am encountering some' +Prompt: 'Today is'. Generated text: ' the 100th day of school. To celebrate, the teacher has' +Prompt: 'Llama is'. Generated text: ' a 100% natural, biodegradable, and compostable alternative' +``` + +Alternatively, refer to the [Quick Start](../quick_start/quick_start.md) guide for [online serving](../quick_start/quick_start.md#online-serving) verification. diff --git a/docs/vllm_mindspore/docs/source_en/getting_started/quick_start/quick_start.md b/docs/vllm_mindspore/docs/source_en/getting_started/quick_start/quick_start.md new file mode 100644 index 0000000000000000000000000000000000000000..edf49a6dbb417fcd469be38ab9a5c63815a83db2 --- /dev/null +++ b/docs/vllm_mindspore/docs/source_en/getting_started/quick_start/quick_start.md @@ -0,0 +1,235 @@ +# Quick Start + +[![View Source](https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/master/resource/_static/logo_source_en.svg)](https://gitee.com/mindspore/docs/blob/master/docs/vllm_mindspore/docs/source_en/getting_started/quick_start/quick_start.md) + +This document provides a quick guide to deploy vLLM MindSpore by [docker](https://www.docker.com/), with the [Qwen2.5-7B](https://huggingface.co/Qwen/Qwen2.5-7B-Instruct) model as an example. 
User can quickly experience the serving and inference abilities of vLLM MindSpore by [offline inference](#offline-inference) and [online serving](#online-serving). For more information about installation, please refer to the [Installation Guide](../installation/installation.md). + +## Docker Installation + +In this section, we recommend to use docker to deploy the vLLM MindSpore environment. The following sections are the steps for deployment: + +### Pulling the Image + +Pull the vLLM MindSpore docker image by executing the following command: + +```bash +docker pull hub.oepkgs.net/oedeploy/openeuler/aarch64/mindspore:latest +``` + +During the pull process, user will see the progress of each layer of the docker image. User can verify the image by executing the following command: + +```bash +docker images +``` + +### Creating a Container + +After [pulling the image](#pulling-the-image), set `DOCKER_NAME` and `IMAGE_NAME` as the container and image names, and create the container by running: + +```bash +export DOCKER_NAME=vllm-mindspore-container # your container name +export IMAGE_NAME=hub.oepkgs.net/oedeploy/openeuler/aarch64/mindspore:latest # your image name + +docker run -itd --name=${DOCKER_NAME} --ipc=host --network=host --privileged=true \ + --device=/dev/davinci0 \ + --device=/dev/davinci1 \ + --device=/dev/davinci2 \ + --device=/dev/davinci3 \ + --device=/dev/davinci4 \ + --device=/dev/davinci5 \ + --device=/dev/davinci6 \ + --device=/dev/davinci7 \ + --device=/dev/davinci_manager \ + --device=/dev/devmm_svm \ + --device=/dev/hisi_hdc \ + -v /usr/local/sbin/:/usr/local/sbin/ \ + -v /var/log/npu/slog/:/var/log/npu/slog \ + -v /var/log/npu/profiling/:/var/log/npu/profiling \ + -v /var/log/npu/dump/:/var/log/npu/dump \ + -v /var/log/npu/:/usr/slog \ + -v /etc/hccn.conf:/etc/hccn.conf \ + -v /usr/local/bin/npu-smi:/usr/local/bin/npu-smi \ + -v /usr/local/dcmi:/usr/local/dcmi \ + -v /usr/local/Ascend/driver:/usr/local/Ascend/driver \ + -v /etc/ascend_install.info:/etc/ascend_install.info \ + -v /etc/vnpu.cfg:/etc/vnpu.cfg \ + --shm-size="250g" \ + ${IMAGE_NAME} \ + bash +``` + +After successfully creating the container, the container ID will be returned. User can verify the creation by executing the following command: + +```bash +docker ps +``` + +### Entering the Container + +After [creating the container](#creating-a-container), use the environment variable `DOCKER_NAME` to start and enter the container by executing the following command: + +```bash +docker exec -it $DOCKER_NAME bash +``` + +## Using the Service + +After deploying the environment, user need to prepare the model files before running the model. Refer to the [Download Model](#downloading-model) section for guidance. After [setting environment variables](#setting-environment-variables), user can experience the model bt [offline inference](#offline-inference) or [online serving](#online-serving). + +### Downloading Model + +User can download the model using either the [Python Tool](#downloading-with-python-tool) or [git-lfs Tool](#downloading-with-git-lfs-tool). 
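+
+Whichever tool is used, the download should produce a local directory containing the model configuration, tokenizer files, and `safetensors` weights. The following is a minimal sanity-check sketch (the path is the example save path used below; adjust it to the actual download location):
+
+```python
+# Confirm that the key model files exist after the download completes.
+from pathlib import Path
+
+model_dir = Path("/path/to/save/Qwen2.5-7B-Instruct")
+required = ["config.json", "tokenizer.json"]
+missing = [name for name in required if not (model_dir / name).exists()]
+weights = list(model_dir.glob("*.safetensors"))
+print("Missing files:", missing or "none")
+print("Weight shards found:", len(weights))
+```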
+ +#### Downloading with Python Tool + +Execute the following Python script to download the [Qwen2.5-7B](https://huggingface.co/Qwen/Qwen2.5-7B-Instruct) weights and files from [Hugging Face](https://huggingface.co/): + +```python +from huggingface_hub import snapshot_download + +snapshot_download( + repo_id="Qwen/Qwen2.5-7B-Instruct", + local_dir="/path/to/save/Qwen2.5-7B-Instruct", + local_dir_use_symlinks=False +) +``` + +`local_dir` is the model save path specified by the user. Please ensure the disk space is sufficient. + +#### Downloading with git-lfs Tool + +Execute the following command to check if [git-lfs](https://git-lfs.com) is available: + +```bash +git lfs install +``` + +If available, the following output will be displayed: + +```text +Git LFS initialized. +``` + +If the tool is unavailable, please install [git-lfs](https://git-lfs.com) first. Refer to the [FAQ](../../faqs/faqs.md) section for guidance on [git-lfs installation](../../faqs/faqs.md#git-lfs-installation). + +Once confirmed, download the weights by executing the following command: + +```bash +git clone https://huggingface.co/Qwen/Qwen2.5-7B-Instruct +``` + +### Setting Environment Variables + +Before launching the model, user need to set the following environment variables: + +```bash +export ASCEND_TOTAL_MEMORY_GB=64 # Please use `npu-smi info` to check the memory. +export vLLM_MODEL_BACKEND=MindFormers # use MindSpore Transformers as model backend. +export vLLM_MODEL_MEMORY_USE_GB=32 # Memory reserved for model execution. Set according to the model's maximum usage, with the remaining environment used for kvcache allocation. +export MINDFORMERS_MODEL_CONFIG=$YAML_PATH # Set the corresponding MindSpore Transformers model's YAML file. +``` + +Here is an explanation of these environment variables: + +- `ASCEND_TOTAL_MEMORY_GB`: The memory size of each card. User can check the memory by using `npu-smi info`, where the value corresponds to `HBM-Usage(MB)` in the query results. +- `vLLM_MODEL_BACKEND`: The backend of the model to run. User could find supported models and backends for vLLM MindSpore in the [Model Support List](../../user_guide/supported_models/models_list/models_list.md). +- `vLLM_MODEL_MEMORY_USE_GB`: The memory reserved for model loading. Adjust this value if insufficient memory error occurs during model loading. +- `MINDFORMERS_MODEL_CONFIG`: The model configuration file. + +### Offline Inference + +Taking [Qwen2.5-7B](https://huggingface.co/Qwen/Qwen2.5-7B-Instruct) as an example, user can perform offline inference with the following Python script: + +```python +import vllm_mindspore # Add this line on the top of script. +from vllm import LLM, SamplingParams + +# Sample prompts. +prompts = [ + "I am", + "Today is", + "Llama is" +] + +# Create a sampling params object. +sampling_params = SamplingParams(temperature=0.0, top_p=0.95) + +# Create a LLM +llm = LLM(model="Qwen2.5-7B-Instruct") +# Generate texts from the prompts. The output is a list of RequestOutput objects +# that contain the prompt, generated text, and other information. +outputs = llm.generate(prompts, sampling_params) +# Print the outputs. +for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}. Generated text: {generated_text!r}") +``` + +If offline inference runs successfully, similar results will be obtained: + +```text +Prompt: 'I am'. Generated text: ' trying to create a virtual environment for my Python project, but I am encountering some' +Prompt: 'Today is'. 
Generated text: ' the 100th day of school. To celebrate, the teacher has'
+Prompt: 'Llama is'. Generated text: ' a 100% natural, biodegradable, and compostable alternative'
+```
+
+### Online Serving
+
+vLLM MindSpore supports online serving deployment with the OpenAI API protocol. The following sections introduce how to [start the service](#starting-the-service) and [send requests](#sending-requests) to obtain inference results, using [Qwen2.5-7B](https://huggingface.co/Qwen/Qwen2.5-7B-Instruct) as an example.
+
+#### Starting the Service
+
+Use the model `Qwen/Qwen2.5-7B-Instruct` and start the vLLM service with the following command:
+
+```bash
+python3 -m vllm_mindspore.entrypoints vllm.entrypoints.openai.api_server --model "Qwen/Qwen2.5-7B-Instruct"
+```
+
+If the service starts successfully, similar output will be obtained:
+
+```text
+INFO: Started server process [6363]
+INFO: Waiting for application startup.
+INFO: Application startup complete.
+```
+
+Additionally, performance metrics will be logged, such as:
+
+```text
+Engine 000: Avg prompt throughput: 0.0 tokens/s, Avg generation throughput: 0.0 tokens/s, Running: 0 reqs, Waiting: 0 reqs, GPU KV cache usage: 0.0%, Prefix cache hit rate: 0.0%
+```
+
+#### Sending Requests
+
+Use the following command to send a request, where `prompt` is the model input:
+
+```bash
+curl http://localhost:8000/v1/completions -H "Content-Type: application/json" -d '{"model": "Qwen/Qwen2.5-7B-Instruct", "prompt": "I am", "max_tokens": 15, "temperature": 0}'
+```
+
+If the request is processed successfully, the following inference result will be returned:
+
+```text
+{
+    "id":"cmpl-5e6e314861c24ba79fea151d86c1b9a6","object":"text_completion",
+    "created":1747398389,
+    "model":"Qwen2.5-7B-Instruct",
+    "choices":[
+        {
+            "index":0,
+            "text":"trying to create a virtual environment for my Python project, but I am encountering some",
+            "logprobs":null,
+            "finish_reason":"length",
+            "stop_reason":null,
+            "prompt_logprobs":null
+        }
+    ],
+    "usage":{
+        "prompt_tokens":2,
+        "total_tokens":17,
+        "completion_tokens":15,
+        "prompt_tokens_details":null
+    }
+}
+```
diff --git a/docs/vllm_mindspore/docs/source_en/getting_started/tutorials/qwen2.5_32b_multiNPU/qwen2.5_32b_multiNPU.md b/docs/vllm_mindspore/docs/source_en/getting_started/tutorials/qwen2.5_32b_multiNPU/qwen2.5_32b_multiNPU.md
new file mode 100644
index 0000000000000000000000000000000000000000..007bdab99ef7f181c9102280ec54feca98e1a3b3
--- /dev/null
+++ b/docs/vllm_mindspore/docs/source_en/getting_started/tutorials/qwen2.5_32b_multiNPU/qwen2.5_32b_multiNPU.md
@@ -0,0 +1,203 @@
+# NPU Single-Node Multi-Card Inference (Qwen2.5-32B)
+
+[![View Source](https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/master/resource/_static/logo_source_en.svg)](https://gitee.com/mindspore/docs/blob/master/docs/vllm_mindspore/docs/source_en/getting_started/tutorials/qwen2.5_32b_multiNPU/qwen2.5_32b_multiNPU.md)
+
+This document introduces the single-node multi-card inference process with vLLM MindSpore. Taking the [Qwen2.5-32B](https://huggingface.co/Qwen/Qwen2.5-32B-Instruct) model as an example, users can configure the environment through the [Docker Installation](#docker-installation) section or the [Installation Guide](../../installation/installation.md#installation-guide), and then [download the model weights](#downloading-model-weights). 
After [setting environment variables](#setting-environment-variables), users can perform [online inference](#online-inference) to experience single-node multi-card inference capabilities.
+
+## Docker Installation
+
+In this section, we recommend using Docker for quick deployment of the vLLM MindSpore environment. Below are the steps for Docker deployment:
+
+### Pulling the Image
+
+Pull the vLLM MindSpore Docker image by executing the following command:
+
+```bash
+docker pull hub.oepkgs.net/oedeploy/openeuler/aarch64/mindspore:latest
+```
+
+During the pull process, users will see the progress of each layer. After successful completion, users can also check the image by running:
+
+```bash
+docker images
+```
+
+### Creating a Container
+
+After [pulling the image](#pulling-the-image), set `DOCKER_NAME` and `IMAGE_NAME` as the container and image names, then create the container:
+
+```bash
+export DOCKER_NAME=vllm-mindspore-container # your container name
+export IMAGE_NAME=hub.oepkgs.net/oedeploy/openeuler/aarch64/mindspore:latest # your image name
+
+docker run -itd --name=${DOCKER_NAME} --ipc=host --network=host --privileged=true \
+        --device=/dev/davinci0 \
+        --device=/dev/davinci1 \
+        --device=/dev/davinci2 \
+        --device=/dev/davinci3 \
+        --device=/dev/davinci4 \
+        --device=/dev/davinci5 \
+        --device=/dev/davinci6 \
+        --device=/dev/davinci7 \
+        --device=/dev/davinci_manager \
+        --device=/dev/devmm_svm \
+        --device=/dev/hisi_hdc \
+        -v /usr/local/sbin/:/usr/local/sbin/ \
+        -v /var/log/npu/slog/:/var/log/npu/slog \
+        -v /var/log/npu/profiling/:/var/log/npu/profiling \
+        -v /var/log/npu/dump/:/var/log/npu/dump \
+        -v /var/log/npu/:/usr/slog \
+        -v /etc/hccn.conf:/etc/hccn.conf \
+        -v /usr/local/bin/npu-smi:/usr/local/bin/npu-smi \
+        -v /usr/local/dcmi:/usr/local/dcmi \
+        -v /usr/local/Ascend/driver:/usr/local/Ascend/driver \
+        -v /etc/ascend_install.info:/etc/ascend_install.info \
+        -v /etc/vnpu.cfg:/etc/vnpu.cfg \
+        --shm-size="250g" \
+        ${IMAGE_NAME} \
+        bash
+```
+
+After successful creation, the container ID will be returned. Verify the container by running:
+
+```bash
+docker ps
+```
+
+### Entering the Container
+
+After [creating the container](#creating-a-container), start and enter the container using the predefined `DOCKER_NAME`:
+
+```bash
+docker exec -it $DOCKER_NAME bash
+```
+
+## Downloading Model Weights
+
+Users can download the model using either the [Python Tool](#downloading-with-python-tool) or the [git-lfs Tool](#downloading-with-git-lfs-tool).
+
+### Downloading with Python Tool
+
+Execute the following Python script to download the [Qwen2.5-32B](https://huggingface.co/Qwen/Qwen2.5-32B-Instruct) weights and files from [Hugging Face](https://huggingface.co/):
+
+```python
+from huggingface_hub import snapshot_download
+
+snapshot_download(
+    repo_id="Qwen/Qwen2.5-32B-Instruct",
+    local_dir="/path/to/save/Qwen2.5-32B-Instruct",
+    local_dir_use_symlinks=False
+)
+```
+
+`local_dir` is the user-specified path to save the model. Ensure sufficient disk space is available.
+
+### Downloading with git-lfs Tool
+
+Run the following command to verify if [git-lfs](https://git-lfs.com) is available:
+
+```bash
+git lfs install
+```
+
+If available, the following output will be displayed:
+
+```text
+Git LFS initialized.
+```
+
+If unavailable, install [git-lfs](https://git-lfs.com) first. Refer to the [FAQ](../../../faqs/faqs.md) section for [git-lfs installation](../../../faqs/faqs.md#git-lfs-installation) guidance. 
+ +Once confirmed, execute the following command to download the weights: + +```bash +git clone https://huggingface.co/Qwen/Qwen2.5-32B-Instruct +``` + +## Setting Environment Variables + +For [Qwen2.5-32B](https://huggingface.co/Qwen/Qwen2.5-32B-Instruct), the following environment variables configure memory allocation, backend, and model-related YAML files: + +```bash +#set environment variables +export ASCEND_TOTAL_MEMORY_GB=64 # Use `npu-smi info` to check the memory. +export vLLM_MODEL_BACKEND=MindFormers # Use MindFormers as the model backend. +export vLLM_MODEL_MEMORY_USE_GB=32 # Memory reserved for model execution. Adjust based on the model's maximum usage, with the remaining allocated for KV cache. +export MINDFORMERS_MODEL_CONFIG=$YAML_PATH # Set the corresponding MindSpore Transformers model YAML file. +``` + +Here is an explanation of these environment variables: + +- `ASCEND_TOTAL_MEMORY_GB`: The memory size of each compute card. Query using `npu-smi info`, corresponding to `HBM-Usage(MB)` in the results. +- `vLLM_MODEL_BACKEND`: The model backend. Currently supported models and backends are listed in the [Model Support List](../../../user_guide/supported_models/models_list/models_list.md). +- `vLLM_MODEL_MEMORY_USE_GB`: Memory reserved for model loading. Adjust this if encountering insufficient memory. +- `MINDFORMERS_MODEL_CONFIG`: Model configuration file. User can find the corresponding YAML file in the [MindSpore Transformers repository](https://gitee.com/mindspore/mindformers/tree/r1.5.0/research/qwen2_5). For Qwen2.5-32B, the YAML file is [predict_qwen2_5_32b_instruct.yaml](https://gitee.com/mindspore/mindformers/blob/r1.5.0/research/qwen2_5/predict_qwen2_5_32b_instruct.yaml). + +Users can check memory usage with `npu-smi info` and set the NPU cards for inference using the following example (assuming cards 4,5,6,7 are used): + +```bash +export ASCEND_RT_VISIBLE_DEVICES=4,5,6,7 +``` + +## Online Inference + +vLLM MindSpore supports online serving deployment with the OpenAI API protocol. The following section would introduce how to [starting the service](#starting-the-service) and [send requests](#sending-requests) to obtain inference results, using [Qwen2.5-32B](https://huggingface.co/Qwen/Qwen2.5-32B-Instruct) as an example. + +### Starting the Service + +Use the model `Qwen/Qwen2.5-32B-Instruct` and start the vLLM service with the following command: + +```bash +export TENSOR_PARALLEL_SIZE=4 +export MAX_MODEL_LEN=1024 +python3 -m vllm_mindspore.entrypoints vllm.entrypoints.openai.api_server --model "Qwen/Qwen2.5-32B-Instruct" --trust_remote_code --tensor-parallel-size $TENSOR_PARALLEL_SIZE --max-model-len $MAX_MODEL_LEN +``` + +Here, `TENSOR_PARALLEL_SIZE` specifies the number of NPU cards, and `MAX_MODEL_LEN` sets the maximum output token length. + +If the service starts successfully, similar output will be obtained: + +```text +INFO: Started server process [6363] +INFO: Waiting for application startup. +INFO: Application startup complete. 
+``` + +Additionally, performance metrics will be logged, such as: + +```text +Engine 000: Avg prompt throughput: 0.0 tokens/s, Avg generation throughput: 0.0 tokens/s, Running: 0 reqs, Waiting: 0 reqs, GPU KV cache usage: 0.0%, Prefix cache hit rate: 0.0% +``` + +### Sending Requests + +Use the following command to send a request, where `prompt` is the model input: + +```bash +curl http://localhost:8000/v1/completions -H "Content-Type: application/json" -d '{"model": "Qwen2.5-32B-Instruct", "prompt": "I am", "max_tokens": 20, "temperature": 0}' +``` + +If processed successfully, the inference result will be: + +```text +{ + "id":"cmpl-11fe2898c77d4ff18c879f57ae7aa9ca","object":"text_completion", + "create":1748568696, + "model":"Qwen2.5-32B-Instruct", + "choices":[ + { + "index":0, + "text":"trying to create a virtual environment in Python using venv, but I am encountering some issues with setting", + "logprobs":null, + "finish_reason":"length", + "stop_reason":null, + "prompt_logprobs":null + } + ], + "usage":{ + "prompt_tokens":2, + "total_tokens":22, + "completion_tokens":20, + "prompt_tokens_details":null + } +} +``` diff --git a/docs/vllm_mindspore/docs/source_en/getting_started/tutorials/qwen2.5_7b_singleNPU/qwen2.5_7b_singleNPU.md b/docs/vllm_mindspore/docs/source_en/getting_started/tutorials/qwen2.5_7b_singleNPU/qwen2.5_7b_singleNPU.md new file mode 100644 index 0000000000000000000000000000000000000000..47ca1d4997b0b007261c65973c382345cd05b557 --- /dev/null +++ b/docs/vllm_mindspore/docs/source_en/getting_started/tutorials/qwen2.5_7b_singleNPU/qwen2.5_7b_singleNPU.md @@ -0,0 +1,237 @@ +# Single NPU Inference (Qwen2.5-7B) + +[![View Source](https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/master/resource/_static/logo_source_en.svg)](https://gitee.com/mindspore/docs/blob/master/docs/vllm_mindspore/docs/source_en/getting_started/tutorials/qwen2.5_7b_singleNPU/qwen2.5_7b_singleNPU.md) + +This document introduces single NPU inference process by vLLM MindSpore. Taking the [Qwen2.5-7B](https://huggingface.co/Qwen/Qwen2.5-7B-Instruct) model as an example, user can configure the environment through the [Docker Installation](#docker-installation) or the [Installation Guide](../../installation/installation.md#installation-guide), and [download model weights](#download-model-weights). After [setting environment variables](#setting-environment-variables), user can perform [offline inference](#offline-inference) and [online inference](#online-inference) to experience single NPU inference abilities. + +## Docker Installation + +In this section, we recommend using Docker for quick deployment of the vLLM MindSpore environment. Below are the steps for Docker deployment: + +### Pulling the Image + +Pull the vLLM MindSpore Docker image by executing the following command: + +```bash +docker pull hub.oepkgs.net/oedeploy/openeuler/aarch64/mindspore:latest +``` + +During the pull process, user will see the progress of each layer. 
After successful completion, use can also check the image by running: + +```bash +docker images +``` + +### Creating a Container + +After [pulling the image](#pulling-the-image), set `DOCKER_NAME` and `IMAGE_NAME` as the container and image names, then create the container: + +```bash +export DOCKER_NAME=vllm-mindspore-container # your container name +export IMAGE_NAME=hub.oepkgs.net/oedeploy/openeuler/aarch64/mindspore:latest # your image name + +docker run -itd --name=${DOCKER_NAME} --ipc=host --network=host --privileged=true \ + --device=/dev/davinci0 \ + --device=/dev/davinci1 \ + --device=/dev/davinci2 \ + --device=/dev/davinci3 \ + --device=/dev/davinci4 \ + --device=/dev/davinci5 \ + --device=/dev/davinci6 \ + --device=/dev/davinci7 \ + --device=/dev/davinci_manager \ + --device=/dev/devmm_svm \ + --device=/dev/hisi_hdc \ + -v /usr/local/sbin/:/usr/local/sbin/ \ + -v /var/log/npu/slog/:/var/log/npu/slog \ + -v /var/log/npu/profiling/:/var/log/npu/profiling \ + -v /var/log/npu/dump/:/var/log/npu/dump \ + -v /var/log/npu/:/usr/slog \ + -v /etc/hccn.conf:/etc/hccn.conf \ + -v /usr/local/bin/npu-smi:/usr/local/bin/npu-smi \ + -v /usr/local/dcmi:/usr/local/dcmi \ + -v /usr/local/Ascend/driver:/usr/local/Ascend/driver \ + -v /etc/ascend_install.info:/etc/ascend_install.info \ + -v /etc/vnpu.cfg:/etc/vnpu.cfg \ + --shm-size="250g" \ + ${IMAGE_NAME} \ + bash +``` + +After successful creation, the container ID will be returned. Verify the container by running: + +```bash +docker ps +``` + +### Entering the Container + +After [creating the container](#creating-a-container), start and enter the container using the predefined `DOCKER_NAME`: + +```bash +docker exec -it $DOCKER_NAME bash +``` + +## Downloading Model Weights + +User can download the model using either [Python Tool](#downloading-with-python-tool) or [git-lfs Tool](#downloading-with-git-lfs-tool). + +### Downloading with Python Tool + +Execute the following Python script to download the [Qwen2.5-7B](https://huggingface.co/Qwen/Qwen2.5-7B-Instruct) weights and files from [Hugging Face](https://huggingface.co/): + +```python +from huggingface_hub import snapshot_download +snapshot_download( + repo_id="Qwen/Qwen2.5-7B-Instruct", + local_dir="/path/to/save/Qwen2.5-7B-Instruct", + local_dir_use_symlinks=False +) +``` + +`local_dir` is the user-specified model save path. Ensure sufficient disk space is available. + +### Downloading with git-lfs Tool + +Run the following command to check if [git-lfs](https://git-lfs.com) is available: + +```bash +git lfs install +``` + +If available, the following output will be displayed: + +```text +Git LFS initialized. +``` + +If unavailable, install [git-lfs](https://git-lfs.com) first. Refer to the [FAQ](../../../faqs/faqs.md) section for [git-lfs installation](../../../faqs/faqs.md#git-lfs-installation) guidance. + +Once confirmed, download the weights by executing the following command: + +```bash +git clone https://huggingface.co/Qwen/Qwen2.5-7B-Instruct +``` + +## Setting Environment Variables + +For [Qwen2.5-7B](https://huggingface.co/Qwen/Qwen2.5-7B-Instruct), the following environment variables configure memory allocation, backend, and model-related YAML files: + +```bash +#set environment variables +export ASCEND_TOTAL_MEMORY_GB=64 # Please use `npu-smi info` to check the memory. +export vLLM_MODEL_BACKEND=MindFormers # use MindFormers as model backend. +export vLLM_MODEL_MEMORY_USE_GB=32 # Memory reserved for model execution. 
Set according to the model's maximum usage, with the remaining environment used for kvcache allocation +export MINDFORMERS_MODEL_CONFIG=$YAML_PATH # Set the corresponding MindSpore Transformers model's YAML file. +``` + +Here is an explanation of these variables: + +- `ASCEND_TOTAL_MEMORY_GB`: The memory size of each compute card. Query using `npu-smi info`, corresponding to `HBM-Usage(MB)` in the results. +- `vLLM_MODEL_BACKEND`: The model backend. Currently supported models and backends are listed in the [Model Support List](../../../user_guide/supported_models/models_list/models_list.md). +- `vLLM_MODEL_MEMORY_USE_GB`: Memory reserved for model loading. Adjust this if encountering insufficient memory. +- `MINDFORMERS_MODEL_CONFIG`: Model configuration file. User can find the corresponding YAML file in the [MindSpore Transformers repository](https://gitee.com/mindspore/mindformers/tree/dev/research/qwen2_5). For Qwen2.5-7B, the YAML file is [predict_qwen2_5_7b_instruct.yaml](https://gitee.com/mindspore/mindformers/blob/dev/research/qwen2_5/predict_qwen2_5_7b_instruct.yaml). + +User can check memory usage with `npu-smi info` and set the compute card for inference using: + +```bash +export NPU_VISIBE_DEVICES=0 +export ASCEND_RT_VISIBLE_DEVICES=$NPU_VISIBE_DEVICES +``` + +## Offline Inference + +Taking [Qwen2.5-7B](https://huggingface.co/Qwen/Qwen2.5-7B-Instruct) as an example, user can perform offline inference with the following Python code: + +```python +import vllm_mindspore # Add this line on the top of script. +from vllm import LLM, SamplingParams + +# Sample prompts. +prompts = [ + "I am", + "Today is", + "Llama is" +] + +# Create a sampling params object. +sampling_params = SamplingParams(temperature=0.0, top_p=0.95) + +# Create a LLM +llm = LLM(model="Qwen/Qwen2.5-7B-Instruct") +# Generate texts from the prompts. +outputs = llm.generate(prompts, sampling_params) +# Print the outputs. +for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}. Generated text: {generated_text!r}") +``` + +If offline inference runs successfully, similar results will be obtained: + +```text +Prompt: 'I am'. Generated text: ' trying to create a virtual environment for my Python project, but I am encountering some' +Prompt: 'Today is'. Generated text: ' the 100th day of school. To celebrate, the teacher has' +Prompt: 'Llama is'. Generated text: ' a 100% natural, biodegradable, and compostable alternative' +``` + +## Online Inference + +vLLM MindSpore supports online serving deployment with the OpenAI API protocol. The following section would introduce how to [starting the service](#starting-the-service) and [send requests](#sending-requests) to obtain inference results, using [Qwen2.5-7B](https://huggingface.co/Qwen/Qwen2.5-7B-Instruct) as an example. + +### Starting the Service + +Use the model `Qwen/Qwen2.5-7B-Instruct` and start the vLLM service with the following command: + +```bash +python3 -m vllm_mindspore.entrypoints vllm.entrypoints.openai.api_server --model "Qwen/Qwen2.5-7B-Instruct" +``` + +If the service starts successfully, similar output will be obtained: + +```text +INFO: Started server process [6363] +INFO: Waiting for application startup. +INFO: Application startup complete. 
+```
+
+Additionally, performance metrics will be logged, such as:
+
+```text
+Engine 000: Avg prompt throughput: 0.0 tokens/s, Avg generation throughput: 0.0 tokens/s, Running: 0 reqs, Waiting: 0 reqs, GPU KV cache usage: 0.0%, Prefix cache hit rate: 0.0%
+```
+
+### Sending Requests
+
+Use the following command to send a request, where `prompt` is the model input:
+
+```bash
+curl http://localhost:8000/v1/completions -H "Content-Type: application/json" -d '{"model": "Qwen/Qwen2.5-7B-Instruct", "prompt": "I am", "max_tokens": 15, "temperature": 0}'
+```
+
+If the request is processed successfully, the following inference result will be returned:
+
+```text
+{
+    "id":"cmpl-5e6e314861c24ba79fea151d86c1b9a6","object":"text_completion",
+    "created":1747398389,
+    "model":"Qwen2.5-7B-Instruct",
+    "choices":[
+        {
+            "index":0,
+            "text":"trying to create a virtual environment for my Python project, but I am encountering some",
+            "logprobs":null,
+            "finish_reason":"length",
+            "stop_reason":null,
+            "prompt_logprobs":null
+        }
+    ],
+    "usage":{
+        "prompt_tokens":2,
+        "total_tokens":17,
+        "completion_tokens":15,
+        "prompt_tokens_details":null
+    }
+}
+```
diff --git a/docs/vllm_mindspore/docs/source_en/index.rst b/docs/vllm_mindspore/docs/source_en/index.rst
new file mode 100644
index 0000000000000000000000000000000000000000..32ed9e953f82eacf2463e000ccfbe4204cb17a3a
--- /dev/null
+++ b/docs/vllm_mindspore/docs/source_en/index.rst
@@ -0,0 +1,143 @@
+vLLM MindSpore
+=========================================
+
+Overview
+-----------------------------------------------------
+vLLM MindSpore (`vllm-mindspore`) is a plugin brewed by the `MindSpore community <https://www.mindspore.cn>`_ , which aims to integrate MindSpore LLM inference capabilities into `vLLM <https://github.com/vllm-project/vllm>`_ . With vLLM MindSpore, the technical strengths of MindSpore and vLLM are combined to provide a full-stack, open-source, high-performance, and easy-to-use LLM inference solution.
+
+vLLM, an open-source, community-driven project initiated by the Sky Computing Lab at UC Berkeley, has been widely used in academic research and industry applications. Built on the Continuous Batching scheduling mechanism and PagedAttention key-value cache management, vLLM provides a rich set of inference service features, including speculative decoding, Prefix Caching, and Multi-LoRA. vLLM also supports a wide range of open-source large models, including Transformer-based models (e.g., LLaMA), Mixture-of-Experts models (e.g., DeepSeek), embedding models (e.g., E5-Mistral), and multi-modal models (e.g., LLaVA). However, because vLLM uses PyTorch to build models and manage storage resources, it cannot deploy large models built upon MindSpore.
+
+The vLLM MindSpore plugin aims to integrate MindSpore large models into vLLM and to enable the deployment of MindSpore-based LLM inference services. It adheres to the following design principles:
+
+- Interface compatibility: support the native APIs and service deployment interfaces of vLLM to avoid adding new configuration files or interfaces, reducing user learning costs and ensuring ease of use.
+- Minimal invasive modifications: minimize invasive modifications to the vLLM code to ensure system maintainability and evolvability.
+- Component decoupling: minimize and standardize the coupling between MindSpore large model components and vLLM service components to facilitate the integration of various MindSpore large model suites.
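+
+In practice, the interface-compatibility principle means that an existing vLLM script keeps using the native vLLM APIs and only needs to import the plugin first. A minimal sketch, mirroring the quick-start example in this documentation:
+
+.. code-block:: python
+
+   import vllm_mindspore  # register MindSpore capabilities before importing vLLM APIs
+   from vllm import LLM, SamplingParams
+
+   llm = LLM(model="Qwen/Qwen2.5-7B-Instruct")
+   outputs = llm.generate(["I am"], SamplingParams(temperature=0.0, top_p=0.95))
+   print(outputs[0].outputs[0].text)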
+ +On the basis of the above design principles, vLLM MindSpore adopts the system architecture shown in the figure below, and implements the integration of vLLM and MindSpore in the following categories of components: + +- Service components: vLLM MindSpore maps PyTorch API calls in service components including LLMEngine and Scheduler to MindSpore capabilities, inheriting support for service functions like Continuous Batching and PagedAttention. +- Model components: vLLM MindSpore registers or replaces model components including models, network layers, and custom operators, and integrates MindSpore Transformers, MindSpore One, and other MindSpore large model suites, as well as custom large models, into vLLM. + +.. The architecture diagram (arch.png) is embedded here as raw HTML in the rendered page.
+ +vLLM MindSpore uses the plugin mechanism recommended by the vLLM community to realize capability registration. In the future, we expect to promote vLLM community to support integration of inference capabilities of third-party AI frameworks, including PaddlePaddle and JAX by following principles described in `[RPC] Multi-framework support for vllm `_ . + +Code: + +Prerequisites +----------------------------------------------------- + +- Hardware:Atlas 800I A2 Inference series, or Atlas 800T A2 Training series, with necessary drivers installed and access to the Internet +- Operating System: openEuler or Ubuntu Linux +- Software: + + * Python >= 3.9, < 3.12 + * CANN >= 8.0.0.beta1 + * MindSpore (matched with the vllm-mindspore version) + * vLLM (matched with the vllm-mindspore version) + +Getting Started +----------------------------------------------------- +Please refer to `Quick Start <./getting_started/quick_start/quick_start.html>`_ and `Installation <./getting_started/installation/installation.html>`_ for more details. + +Contributing +----------------------------------------------------- +Please read `CONTRIBUTING <./developer_guide/contributing.html>`_ for details on setting up development environments, testing functions, and submitting PR. + +We welcome and value any form of contribution and cooperation. Please use `Issue `_ to inform us of any bugs you encounter, or to submit your feature requests, improvement suggestions, and technical solutions. + +Branch +----------------------------------------------------- +The vllm-mindspore repository contains the main branch, development branch, and version branches: + +- **main**: the main branch, compatible with Mindspore master branch and vLLM v0.7.3 version, is continuously monitored for quality through Ascend-MindSpore CI. +- **develop**: the development branch for adapting vLLM features, which is forked from the main branch when a new vLLM version is released. Once the adapted features is stable, it will be merged into the main branch. The current development branch is adapting vLLM v0.8.3 version. +- **rX.Y.Z**: version branches used for archiving version release, which is forked from the main branch after the adaptation of a certain vLLM version is completed. + +The following are the version branches: + +.. list-table:: + :header-rows: 1 + + * - Branch + - Status + - Notes + * - master + - Maintained + - Compatible with vLLM v0.7.3, and CI commitment for MindSpore master branch + * - develop + - Maintained + - Compatible with vLLM v0.8.3 + * - r0.1 + - Unmaintained + - Only doc fixed is allowed + * - r0.2 + - Maintained + - Compatible with vLLM v0.7.3, and CI commitment for MindSpore 2.6.0 + +SIG +----------------------------------------------------- +- Welcome to join vLLM MindSpore SIG to participate in the co-construction of open-source projects and industrial cooperation: https://www.mindspore.cn/community/SIG +- SIG meetings, every other Friday or Saturday evening, 20:00 - 21:00 (UTC+8, `Convert to your timezone `_ ) + +License +----------------------------------------------------- +Apache License 2.0, as found in the `LICENSE `_ file. + + +.. toctree:: + :glob: + :maxdepth: 2 + :caption: Quick Start + :hidden: + + getting_started/quick_start/quick_start + getting_started/installation/installation + getting_started/tutorials/qwen2.5_7b_singleNPU/qwen2.5_7b_singleNPU + getting_started/tutorials/qwen2.5_32b_multiNPU/qwen2.5_32b_multiNPU + +.. 
toctree:: + :glob: + :maxdepth: 1 + :caption: User Guide + :hidden: + + user_guide/supported_models/models_list/models_list + user_guide/supported_features/features_list/features_list + user_guide/supported_features/operations/npu_ops + user_guide/supported_features/quantization/quantization + user_guide/supported_features/profiling/profiling + user_guide/supported_features/benchmark/benchmark + user_guide/environment_variables/environment_variables + +.. toctree:: + :glob: + :maxdepth: 1 + :caption: Developer Guide + :hidden: + + developer_guide/contributing + +.. toctree:: + :glob: + :maxdepth: 1 + :caption: FAQ + :hidden: + + faqs/faqs + +.. toctree:: + :glob: + :maxdepth: 1 + :caption: RELEASE NOTES + :hidden: + + RELEASE \ No newline at end of file diff --git a/docs/vllm_mindspore/docs/source_en/release_notes/release_notes.md b/docs/vllm_mindspore/docs/source_en/release_notes/release_notes.md new file mode 100644 index 0000000000000000000000000000000000000000..e8dc13f686149d9ac4066029e039c40b1c38dba1 --- /dev/null +++ b/docs/vllm_mindspore/docs/source_en/release_notes/release_notes.md @@ -0,0 +1,23 @@ +# Release Notes + +[![View Source](https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/master/resource/_static/logo_source_en.svg)](https://gitee.com/mindspore/docs/blob/master/docs/vllm_mindspore/docs/source_en/release_notes/release_notes.md) + +## vLLM MindSpore 0.3.0 Release Notes + +The following are the key new features and models supported in the vLLM MindSpore plugin version 0.3.0. + +### New Features + +- vLLM 0.8.3 V1 Architecture Basic Features, including chunked prefill and automatic prefix caching; +- V0 Multi-step Scheduling; +- V0 Chunked Prefill; +- V0 Automatic Prefix Caching; +- V0 DeepSeek MTP (Multi-Token Prediction); +- GPTQ Quantization; +- SmoothQuant Quantization.
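Most of the features listed above are enabled through standard vLLM engine arguments rather than through vLLM MindSpore-specific switches. The following is a minimal sketch of turning on chunked prefill and automatic prefix caching for offline inference; the keyword arguments are assumed to be the usual vLLM v0.8.3 engine arguments and should be checked against the feature documentation.

```python
import vllm_mindspore  # the plugin must be imported before vllm
from vllm import LLM, SamplingParams

# Minimal sketch: enable_chunked_prefill and enable_prefix_caching are standard
# vLLM engine arguments (an assumption here, not vLLM MindSpore-specific flags).
llm = LLM(
    model="Qwen/Qwen2.5-7B-Instruct",
    enable_chunked_prefill=True,   # chunked prefill
    enable_prefix_caching=True,    # automatic prefix caching
)

outputs = llm.generate(["I am"], SamplingParams(temperature=0.0, max_tokens=16))
print(outputs[0].outputs[0].text)
```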
+ +### New Models + +- DeepSeek-V3/R1 +- Qwen2.5-0.5B/1.5/7B/14B/32B/72B +- Qwen3-0.6B/1.7B/4B/8B/14B/32B diff --git a/docs/vllm_mindspore/docs/source_en/user_guide/environment_variables/environment_variables.md b/docs/vllm_mindspore/docs/source_en/user_guide/environment_variables/environment_variables.md new file mode 100644 index 0000000000000000000000000000000000000000..99d210e577a876d81ee83849724ee445b9080e18 --- /dev/null +++ b/docs/vllm_mindspore/docs/source_en/user_guide/environment_variables/environment_variables.md @@ -0,0 +1,20 @@ +# Environment Variable List + +[![View Source](https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/master/resource/_static/logo_source_en.svg)](https://gitee.com/mindspore/docs/blob/master/docs/vllm_mindspore/docs/source_en/user_guide/environment_variables/environment_variables.md) + +| Environment Variable | Required for Basic Scenarios | Function | +|----------------------|-----------------------------|----------| +| `export vLLM_MODEL_BACKEND=MINDFORMER_MODELS` | Running MindSpore Transformers models | Distinguishes between MindSpore Transformers and vLLM MindSpore native models (default: native models) | +| `export PYTHONPATH=/xxx/mindformers-dev/:$PYTHONPATH` | Running models in MindSpore Transformers Research directory | MindSpore Transformers must be installed from source, as research directory code is not packaged into whl files | +| `export MINDFORMERS_MODEL_CONFIG=/xxx.yaml` | Running MindSpore Transformers models | Configuration file for MindSpore Transformers models | +| `export MS_JIT_MODULES="vllm_mindspore,research"` | Greater than v0.7.3 version | Specifies modules require JIT static compilation in static graph mode; corresponds to top-level module names in imports | +| `export GLOO_SOCKET_IFNAME=enp189s0f0` | Ray multi-machine | Used for inter-server communication in Ray multi-machine scenarios | +| `export TP_SOCKET_IFNAME=enp189s0f0` | Ray multi-machine | Required for RPC in Ray multi-machine scenarios | +| `export HCCL_OP_EXPANSION_MODE=AIV` | Multi-machine | Multi-machine optimization configuring communication algorithm orchestration for acceleration | +| `export HCCL_EXEC_TIMEOUT=7200` | Multi-machine | Multi-machine optimization controlling device synchronization timeout (seconds, default: 1836) | +| `export RUN_MODE="predict"` | Basic inference workflow (system default) | Configures network execution mode (predict mode enables optimizations) | +| `export DEVICE_NUM_PER_NODE=16` | Multi-machine checkpoint splitting | Required for automatic weight splitting functionality (default: 8 NPUs/server) | +| `export vLLM_USE_NPU_ADV_STEP_FLASH_OP="on"` | MSS (Multi-step scheduler) custom operators | Toggle for custom operators in MSS functionality | +| `export RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES=1` | Ray multi-machine | Enables Ray dependency in vLLM MindSpore | +| `export MS_JIT=0` | Quantization scenarios (post v0.7.3) | 0: Disables JIT compilation, executing network scripts in dynamic graph (PyNative) mode | +| `export FORCE_EAGER="true"` | Quantization scenarios (post v0.7.3) | | diff --git a/docs/vllm_mindspore/docs/source_en/user_guide/supported_features/benchmark/benchmark.md b/docs/vllm_mindspore/docs/source_en/user_guide/supported_features/benchmark/benchmark.md new file mode 100644 index 0000000000000000000000000000000000000000..d7eff67b57d152816b0215ad6f5dcd2194f11cb6 --- /dev/null +++ b/docs/vllm_mindspore/docs/source_en/user_guide/supported_features/benchmark/benchmark.md @@ -0,0 +1,118 @@ 
+# Benchmark + +[![View Source](https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/master/resource/_static/logo_source_en.svg)](https://gitee.com/mindspore/docs/blob/master/docs/vllm_mindspore/docs/source_en/user_guide/supported_features/benchmark/benchmark.md) + +The benchmark tool of vLLM MindSpore is inherited from vLLM. You can refer to the [vLLM BenchMark](https://github.com/vllm-project/vllm/blob/main/benchmarks/README.md) documentation for more details. This document introduces [Online Benchmark](#online-performance-testing) and [Offline Benchmark](#offline-performance-testing). Users can follow the steps to conduct performance tests. + +## Online Benchmark + +For single-GPU inference, we take [Qwen2.5-7B](https://huggingface.co/Qwen/Qwen2.5-7B-Instruct) as an example. You can prepare the environment by following the guide [NPU Single-GPU Inference (Qwen2.5-7B)](../../../getting_started/tutorials/qwen2.5_7b_singleNPU/qwen2.5_7b_singleNPU.md#online-inference), then start the online service with the following command: + +```bash +vllm-mindspore serve Qwen/Qwen2.5-7B-Instruct --device auto --disable-log-requests +``` + +For multi-GPU inference, we take [Qwen2.5-32B](https://huggingface.co/Qwen/Qwen2.5-32B-Instruct) as an example. You can prepare the environment by following the guide [NPU Single-Node Multi-GPU Inference (Qwen2.5-32B)](../../../getting_started/tutorials/qwen2.5_32b_multiNPU/qwen2.5_32b_multiNPU.md#online-inference), then start the online service with the following command: + +```bash +export TENSOR_PARALLEL_SIZE=4 +export MAX_MODEL_LEN=1024 +python3 -m vllm_mindspore.entrypoints vllm.entrypoints.openai.api_server --model "Qwen/Qwen2.5-32B-Instruct" --trust_remote_code --tensor-parallel-size $TENSOR_PARALLEL_SIZE --max-model-len $MAX_MODEL_LEN +``` + +If the service is successfully started, the following inference result will be returned: + +```text +INFO: Started server process [21349] +INFO: Waiting for application startup. +INFO: Application startup complete. +``` + +Clone the vLLM repository and import the vLLM MindSpore plugin to reuse the benchmark tools: + +```bash +git clone https://github.com/vllm-project/vllm.git +cd vllm +sed -i '1i import vllm_mindspore' benchmarks/benchmark_serving.py +``` + +Execute the test script: + +```bash +# download dataset +# wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json + +# single-card, take Qwen2.5-7B as example: +python3 benchmarks/benchmark_serving.py \ + --backend openai-chat \ + --endpoint /v1/chat/completions \ + --model Qwen/Qwen2.5-7B-Instruct \ + --dataset-name sharegpt \ + --dataset-path /ShareGPT_V3_unfiltered_cleaned_split.json \ + --num-prompts 10 + +# multi-card, take Qwen2.5-32B as example: +python3 benchmarks/benchmark_serving.py \ + --backend openai-chat \ + --endpoint /v1/chat/completions \ + --model Qwen/Qwen2.5-32B-Instruct \ + --dataset-name sharegpt \ + --dataset-path /ShareGPT_V3_unfiltered_cleaned_split.json \ + --num-prompts 10 +``` + +If the test runs successfully, the following results will be returned: + +```text +============ Serving Benchmark Result ============ +Successful requests: .... +Benchmark duration (s): .... +Total input tokens: .... +Total generated tokens: .... +Request throughput (req/s): .... +Output token throughput (tok/s): .... +Total Token throughput (tok/s): .... +---------------Time to First Token---------------- +Mean TTFT (ms): .... +Median TTFT (ms): .... 
+P99 TTFT (ms): .... +-----Time per Output Token (excl. 1st token)------ +Mean TPOT (ms): .... +Median TPOT (ms): .... +P99 TPOT (ms): .... +---------------Inter-token Latency---------------- +Mean ITL (ms): .... +Median ITL (ms): .... +P99 ITL (ms): .... +================================================== +``` + +## Offline Benchmark + +For offline performance benchmark, take [Qwen2.5-7B](https://huggingface.co/Qwen/Qwen2.5-7B-Instruct) as an example. Prepare the environment by following the guide [NPU Single-GPU Inference (Qwen2.5-7B)](../../../getting_started/tutorials/qwen2.5_7b_singleNPU/qwen2.5_7b_singleNPU.md#offline-inference). + +Clone the vLLM repository and import the vLLM-MindSpore plugin to reuse the benchmark tools: + +```bash +git clone https://github.com/vllm-project/vllm.git +cd vllm +sed -i '1i import vllm_mindspore' benchmarks/benchmark_throughput.py +``` + +Run the test script with the following command: + +```bash +python3 benchmarks/benchmark_throughput.py \ + --model Qwen/Qwen2.5-7B-Instruct \ + --dataset-name sonnet \ + --dataset-path benchmarks/sonnet.txt \ + --num-prompts 10 +``` + +If the test runs successfully, the following results will be returned: + +```text +Throughput: ... requests/s, ... total tokens/s, ... output tokens/s +Total num prompt tokens: ... +Total num output tokens: ... +``` diff --git a/docs/vllm_mindspore/docs/source_en/user_guide/supported_features/features_list/features_list.md b/docs/vllm_mindspore/docs/source_en/user_guide/supported_features/features_list/features_list.md new file mode 100644 index 0000000000000000000000000000000000000000..b75f8f133fb5425dd5a79af93a6e47eada49ca06 --- /dev/null +++ b/docs/vllm_mindspore/docs/source_en/user_guide/supported_features/features_list/features_list.md @@ -0,0 +1,36 @@ +# Supported Features List + +[![View Source](https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/master/resource/_static/logo_source_en.svg)](https://gitee.com/mindspore/docs/blob/master/docs/vllm_mindspore/docs/source_en/user_guide/supported_features/features_list/features_list.md) + +The features supported by vLLM MindSpore are consistent with the community version of vLLM. For feature descriptions and usage, please refer to the [vLLM Official Documentation](https://docs.vllm.ai/en/latest/). + +The following is the features supported in vLLM MindSpore. + +| **Features** | **vLLM V0** | **vLLM V1** | +|-----------------------------------|--------------------|--------------------| +| Chunked Prefill | √ | √ | +| Automatic Prefix Caching | √ | √ | +| Multi step scheduler | √ | × | +| DeepSeek MTP | √ | WIP | +| Async output | √ | √ | +| Quantization | √ | √ | +| LoRA | WIP | WIP | +| Tensor Parallel | √ | √ | +| Pipeline Parallel | WIP | WIP | +| Expert Parallel | × | √ | +| Data Parallel | × | √ | +| Prefill Decode Disaggregation | × | √ | +| Multi Modality | WIP | WIP | +| Prompt adapter | × | WIP | +| Speculative decoding | × | WIP | +| LogProbs | × | WIP | +| Prompt logProbs | × | WIP | +| Best of | × | × | +| Beam search | × | WIP | +| Guided Decoding | × | WIP | +| Pooling | × | × | +| Enc-dec | × | × | + +- √:Feature aligned with the community version of vLLM. +- ×:Currently unsupported; alternative solutions are recommended. +- WIP:Under development or planned for future implementation. 
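Because the table distinguishes vLLM V0 from V1 behavior, it can help to pin the engine generation explicitly when trying a feature. The sketch below relies on vLLM's standard `VLLM_USE_V1` switch and `tensor_parallel_size` engine argument; these names are assumptions taken from upstream vLLM rather than vLLM MindSpore-specific options.

```python
import os

# Pin the engine generation before vllm is imported; set "0" to fall back to
# the V0 engine for features marked as V0-only in the table above.
os.environ["VLLM_USE_V1"] = "1"

import vllm_mindspore  # the plugin must be imported before vllm
from vllm import LLM, SamplingParams

# Tensor parallelism is listed as supported for both V0 and V1.
llm = LLM(model="Qwen/Qwen2.5-32B-Instruct", tensor_parallel_size=4, max_model_len=1024)
outputs = llm.generate(["Today is"], SamplingParams(temperature=0.0, max_tokens=8))
print(outputs[0].outputs[0].text)
```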
diff --git a/docs/vllm_mindspore/docs/source_en/user_guide/supported_features/operations/npu_ops.md b/docs/vllm_mindspore/docs/source_en/user_guide/supported_features/operations/npu_ops.md new file mode 100644 index 0000000000000000000000000000000000000000..4e8392a75eae6a7bec60b1df429e7f3b77d4f1ae --- /dev/null +++ b/docs/vllm_mindspore/docs/source_en/user_guide/supported_features/operations/npu_ops.md @@ -0,0 +1,105 @@ +# Custom Operator Integration + +[![View Source](https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/master/resource/_static/logo_source_en.svg)](https://gitee.com/mindspore/docs/blob/master/docs/vllm_mindspore/docs/source_en/user_guide/supported_features/operations/npu_ops.md) + +This document introduces how to integrate a new custom operator into the vLLM MindSpore project, taking the **`adv_step_flash`** operator as an example. The following sections focus on the integration process; for an introduction to operator implementation itself, users can refer to the official MindSpore tutorial: [Dynamic Graph Custom Operator Integration](https://www.mindspore.cn/tutorials/en/master/custom_program/operation/op_customopbuilder.html). + +During development, additional features can be extended based on project requirements, and implementation details can be referenced from [MindSpore Custom Operator Implementation](https://www.mindspore.cn/tutorials/en/master/custom_program/operation/op_customopbuilder.html). + +## File Structure + +The directory `vllm_mindspore/ops` contains the declarations and implementations of operators: + +```text +vllm_mindspore/ops/ +├── ascendc/ +│ ├── adv_step_flash.h // AscendC AdvStepFlash operator declaration +│ ├── adv_step_flash.c // AscendC AdvStepFlash operator implementation +│ └── ... +├── module/ +│ ├── module.h // Common module registration header +│ ├── module.cpp // Common module registration implementation +│ ├── adv_step_flash.cpp // Integration layer code (Python interface registration) +│ └── ... +``` + +- **`ops/ascendc/`**: Contains AscendC custom operator implementation code. +- **`ops/module/`**: Contains operator integration layer code, including common module registration (`module.h`, `module.cpp`) and operator-specific integration (e.g., `adv_step_flash.cpp`). + +## Integration Process + +To integrate a custom operator, users need to create the [Operator Interface Declaration](#operator-interface-declaration), [Operator Implementation](#operator-implementation), and [Operator Integration](#operator-integration) under the `vllm_mindspore/ops` directory, and then perform [Operator Compilation and Testing](#operator-compilation-and-testing).
+ +### Operator Interface Declaration + +Create a header file (e.g., `my_custom_op.h`) in `ops/ascendc/` to declare the operator function and related interfaces: + +```cpp +#ifndef VLLM_MINDSPORE_OPS_ASCENDC_MY_CUSTOM_OP_H +#define VLLM_MINDSPORE_OPS_ASCENDC_MY_CUSTOM_OP_H + +extern void MyCustomOpKernelEntry(uint32_t blockDims, void *l2ctrl, void *aclStream, + uint8_t *input, uint8_t *output, int32_t param1, int32_t param2); + +#endif // VLLM_MINDSPORE_OPS_ASCENDC_MY_CUSTOM_OP_H +``` + +### Operator Implementation + +Create an implementation file (e.g., `my_custom_op.c`) in `ops/ascendc/` for the core logic: + +```cpp +#include "my_custom_op.h" +#include "kernel_operator.h" + +extern "C" __global__ __aicore__ void my_custom_op_impl(GM_ADDR input, GM_ADDR output, + int32_t param1, int32_t param2) { + // AscendC operator implementation goes here. +} + +#ifndef __CCE_KT_TEST__ +void MyCustomOpKernelEntry(uint32_t blockDims, void *l2ctrl, void *aclStream, + uint8_t *input, uint8_t *output, int32_t param1, int32_t param2) { + my_custom_op_impl<<<blockDims, l2ctrl, aclStream>>>(input, output, param1, param2); +} +#endif +``` + +### Operator Integration + +Create an integration file (e.g., `my_custom_op.cpp`) in `module/`. Users can refer to `adv_step_flash.cpp` for more details about the integration: + +```cpp +#include "ms_extension.h" +#include "ascendc/my_custom_op.h" +#include "module/module.h" + +void MyCustomOpPythonInterface(int32_t param1, int32_t param2, + BaseTensorPtr input, BaseTensorPtr output) { + ... +} + +MS_EXTENSION_MODULE(my_custom_op) { + m.def("my_custom_op", &MyCustomOpPythonInterface, "My custom operator", + pybind11::arg("param1"), pybind11::arg("param2"), + pybind11::arg("input"), pybind11::arg("output")); +} +``` + +### Operator Compilation and Testing + +1. **Code Integration**: Merge the code into the vLLM MindSpore project. +2. **Project Compilation**: Build and install the whl package containing the custom operator. +3. 
**Operator Testing**: Invoke the operator in Python: + + ```python + from vllm_mindspore import npu_ops + import numpy as np + import mindspore as ms + + input = ms.Tensor(np.array([1, 2, 3], dtype=np.int32)) + output = ms.Tensor(np.zeros_like(input)) + + npu_ops.my_custom_op(10, 20, input, output) + print("Output:", output) + ``` diff --git a/docs/vllm_mindspore/docs/source_en/user_guide/supported_features/profiling/op_detail.png b/docs/vllm_mindspore/docs/source_en/user_guide/supported_features/profiling/op_detail.png new file mode 100644 index 0000000000000000000000000000000000000000..cab7a5dcc9b4146d6375efba9a947c73c2f162b9 Binary files /dev/null and b/docs/vllm_mindspore/docs/source_en/user_guide/supported_features/profiling/op_detail.png differ diff --git a/docs/vllm_mindspore/docs/source_en/user_guide/supported_features/profiling/op_total.png b/docs/vllm_mindspore/docs/source_en/user_guide/supported_features/profiling/op_total.png new file mode 100644 index 0000000000000000000000000000000000000000..a8dbc1f6548dafd6f0a8a1b777499560a18b9095 Binary files /dev/null and b/docs/vllm_mindspore/docs/source_en/user_guide/supported_features/profiling/op_total.png differ diff --git a/docs/vllm_mindspore/docs/source_en/user_guide/supported_features/profiling/profiling.md b/docs/vllm_mindspore/docs/source_en/user_guide/supported_features/profiling/profiling.md new file mode 100644 index 0000000000000000000000000000000000000000..7213a8c04bfd0aef1c12da1da05e5f7b33be379e --- /dev/null +++ b/docs/vllm_mindspore/docs/source_en/user_guide/supported_features/profiling/profiling.md @@ -0,0 +1,89 @@ +# Profiling Methods + +[![View Source](https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/master/resource/_static/logo_source_en.svg)](https://gitee.com/mindspore/docs/blob/master/docs/vllm_mindspore/docs/source_en/user_guide/supported_features/profiling/profiling.md) + +vLLM MindSpore supports the `mindspore.Profiler` module to track the performance of workers in vLLM MindSpore. User can follow the [Collecting Profiling Data](#collecting-profiling-data) section to gather data and then analyze it according to [Analyzing Profiling Data](#analyzing-profiling-data). Additionally, user can inspect the model's IR graph through [Graph Data Dump](#graph-data-dump) to analyze and debug the model structure. + +## Collecting Profiling Data + +To enable profiling data collection, user need to set the `VLLM_TORCH_PROFILER_DIR` environment variable to the directory where the profiling results will be saved. For multi-machine inference, this variable must be set on each machine before inference: + +```bash +export VLLM_TORCH_PROFILER_DIR=/path/to/save/vllm_profile +``` + +After setting the variable, Run the following command to launch the vLLM MindSpore service. 
We take [Qwen2.5-32B](https://huggingface.co/Qwen/Qwen2.5-32B-Instruct) as an example: + +```bash +export TENSOR_PARALLEL_SIZE=4 +export MAX_MODEL_LEN=1024 +python3 -m vllm_mindspore.entrypoints vllm.entrypoints.openai.api_server --model "Qwen/Qwen2.5-32B-Instruct" --trust_remote_code --tensor-parallel-size $TENSOR_PARALLEL_SIZE --max-model-len $MAX_MODEL_LEN +``` + +If the service starts successfully, you will see output similar to the following, indicating that the `start_profile` and `stop_profile` requests are being monitored: + +```text +INFO 05-15 12:03:07 [launcher.py:31] Route: /start_profile, Methods: POST +INFO 05-15 12:03:07 [launcher.py:31] Route: /stop_profile, Methods: POST +INFO: Started server process [212135] +INFO: Waiting for application startup. +INFO: Application startup complete. +``` + +Once the service is running, users can send the following requests to perform a profiling collection: + +```shell +# Request to start profiling +curl -X POST http://127.0.0.1:8000/start_profile + +# Request for inference +curl http://localhost:8000/v1/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "Qwen/Qwen2.5-32B-Instruct", + "prompt": "San Francisco is a", + "max_tokens": 7, + "temperature": 0 + }' + +# Request to stop profiling +curl -X POST http://127.0.0.1:8000/stop_profile +``` + +When the log displays content similar to the following, it indicates that profiling data collection for one worker is complete: + +```text +Parsing: [####################] 3/3 Done +``` + +## Analyzing Profiling Data + +The directory specified by `VLLM_TORCH_PROFILER_DIR` contains the profiling results, with subdirectories named with the `ascend_ms` suffix. Each subdirectory stores the profiling results for one worker. The files in these subdirectories can be referenced for performance analysis, as described in [Ascend Performance Tuning](https://www.mindspore.cn/tutorials/en/master/debug/profiler.html). + +Users can select a subdirectory to analyze the performance of a single worker: + +- `op_statistic.csv`: Overall operator statistics. + + ![](op_total.png) + +- `kernel_details.csv`: Detailed execution data for each operator. + + ![](op_detail.png) + +- `trace_view.json`: System-wide execution data. This file can be uploaded to the [Perfetto UI](https://ui.perfetto.dev/) for visual inspection of system execution. Clicking on a process in the left sidebar displays trace event information for all threads under that process: + + ![](trace_total.png) + + The MindSpore rows show the operator dispatch during graph execution. + + ![](trace_1.png) + + The Ascend rows show the actual execution of Ascend operators, which can be correlated with the operators dispatched by the MindSpore process. + + ![](trace_2.png) + +## Graph Data Dump + +Refer to the [MindSpore Dump Documentation](https://www.mindspore.cn/tutorials/en/master/debug/dump.html). First, configure the JSON file, then set the `MINDSPORE_DUMP_CONFIG` environment variable to point to the absolute path of this configuration file. After inference completes, the graph data can be obtained. + +The dump results include the IR graph. Additionally, by configuring the `dump_mode` in the JSON file, users can choose to dump execution data for all operators or specific operators.
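For convenience, the dump configuration can be generated and wired up from Python before launching inference. The snippet below is an illustrative sketch only: the field names follow the schema described in the MindSpore Dump documentation linked above and should be verified against it, and the paths are hypothetical.

```python
import json
import os

# Illustrative dump configuration; see the MindSpore Dump documentation for the
# authoritative schema. dump_mode 0 dumps all operators, 1 dumps only "kernels".
dump_config = {
    "common_dump_settings": {
        "dump_mode": 0,
        "path": "/tmp/vllm_ms_dump",          # hypothetical absolute output path
        "net_name": "vllm_mindspore_net",
        "iteration": "all",
        "input_output": 0,                    # dump both inputs and outputs
        "kernels": [],                        # operator names used when dump_mode is 1
        "support_device": [0, 1, 2, 3, 4, 5, 6, 7],
    }
}

config_path = "/tmp/vllm_ms_dump_config.json"
with open(config_path, "w") as f:
    json.dump(dump_config, f, indent=2)

# MINDSPORE_DUMP_CONFIG must point to the absolute path of the JSON file and
# must be set in the environment of the inference process before it starts.
os.environ["MINDSPORE_DUMP_CONFIG"] = config_path
```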
diff --git a/docs/vllm_mindspore/docs/source_en/user_guide/supported_features/profiling/trace_1.png b/docs/vllm_mindspore/docs/source_en/user_guide/supported_features/profiling/trace_1.png new file mode 100644 index 0000000000000000000000000000000000000000..83f6d0123bfdbf36f388accf6f95c00bfce51248 Binary files /dev/null and b/docs/vllm_mindspore/docs/source_en/user_guide/supported_features/profiling/trace_1.png differ diff --git a/docs/vllm_mindspore/docs/source_en/user_guide/supported_features/profiling/trace_2.png b/docs/vllm_mindspore/docs/source_en/user_guide/supported_features/profiling/trace_2.png new file mode 100644 index 0000000000000000000000000000000000000000..69d3bbd4cf661c7f67bc93b20ab080cf77b8916b Binary files /dev/null and b/docs/vllm_mindspore/docs/source_en/user_guide/supported_features/profiling/trace_2.png differ diff --git a/docs/vllm_mindspore/docs/source_en/user_guide/supported_features/profiling/trace_total.png b/docs/vllm_mindspore/docs/source_en/user_guide/supported_features/profiling/trace_total.png new file mode 100644 index 0000000000000000000000000000000000000000..8e260cfd42ba6dee53f825282cfdd7dff8c3f9b9 Binary files /dev/null and b/docs/vllm_mindspore/docs/source_en/user_guide/supported_features/profiling/trace_total.png differ diff --git a/docs/vllm_mindspore/docs/source_en/user_guide/supported_features/quantization/quantization.md b/docs/vllm_mindspore/docs/source_en/user_guide/supported_features/quantization/quantization.md new file mode 100644 index 0000000000000000000000000000000000000000..9663e0f88261275306c9947c95d219d0ce35f360 --- /dev/null +++ b/docs/vllm_mindspore/docs/source_en/user_guide/supported_features/quantization/quantization.md @@ -0,0 +1,131 @@ +# Quantization Methods + +[![View Source](https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/master/resource/_static/logo_source_en.svg)](https://gitee.com/mindspore/docs/blob/master/docs/vllm_mindspore/docs/source_en/user_guide/supported_features/quantization/quantization.md) + +This document introduces model quantization and quantized inference methods. Quantization reduces inference resource with minor cost of precision, while improving inference performance to enable deployment on more devices. With the large scale of LLMs, post-training quantization has become the mainstream approach for model quantization. For details, refer to [Post-Training Quantization Introduction](https://gitee.com/mindspore/golden-stick/blob/master/mindspore_gs/ptq/README_CN.md). + +In this document, the [Creating Quantized Models](#creating-quantized-models) section introduces post-training quantization steps using [Qwen3-8B](https://huggingface.co/Qwen/Qwen3-8B) as an example. A the [Quantized Model Inference](#quantized-model-inference) section explains how to perform inference with quantized models. + +## Creating Quantized Models + +We use the [Qwen3-8B](https://huggingface.co/Qwen/Qwen3-8B) network as an example to introduce A8W8 quantization with the SmoothQuant algorithm. + +### Quantizing Networks with MindSpore Golden Stick + +We employ [MindSpore Golden Stick's PTQ algorithm](https://gitee.com/mindspore/golden-stick/blob/master/mindspore_gs/ptq/ptq/README_CN.md) for SmoothQuant quantization of Qwen3-8B. For detailed methods, refer to [Qwen3-SmoothQuant Quantization Example](todo). 
+ +#### Downloading Qwen3-8B Weights + +Users can download the weights using huggingface-cli: + +```bash +huggingface-cli download --resume-download Qwen/Qwen3-8B --local-dir Qwen3-8B-bf16 +``` + +Alternatively, use [other download methods](../../../getting_started/quick_start/quick_start.md#download-model). + +#### Loading the Network with MindSpore Transformers + +Load the network using [MindSpore Transformers](https://gitee.com/mindspore/mindformers) with the following script: + +```python +from mindformers import AutoModel +from mindformers import AutoTokenizer + +network = AutoModel.from_pretrained("Qwen3-8B-bf16") +tokenizer = AutoTokenizer.from_pretrained("Qwen3-8B-bf16") +``` + +#### Preparing the CEval Dataset + +Download the CEval dataset to the `ceval` directory with the following structure: + +```bash +ceval + ├── dev + ├── test + └── val +``` + +Create a dataset handle using MindSpore: + +```python +from mindspore.dataset import GeneratorDataset +ds = GeneratorDataset(source="ceval", column_names=["subjects", "input_ids", "labels"]) +``` + +#### Performing Post-Training Quantization with Golden Stick + +Use the following Python script for post-training quantization: + +```python +from collections import OrderedDict + +import mindspore as ms +from mindspore import dtype as msdtype +from mindspore_gs.ptq import PTQ +from mindspore_gs.common import BackendTarget +from mindspore_gs.ptq import PTQConfig, PTQMode, OutliersSuppressionType, QuantGranularity, PrecisionRecovery + +# Calibrate in quantization mode; PTQMode.DEPLOY is used when loading the quantized model for inference. +quant_mode = PTQMode.QUANTIZE + +cfg = PTQConfig(mode=quant_mode, backend=BackendTarget.ASCEND, weight_quant_dtype=msdtype.int8, + act_quant_dtype=msdtype.int8, outliers_suppression=OutliersSuppressionType.SMOOTH, + opname_blacklist=['lm_head']) +w2_config = PTQConfig(mode=quant_mode, backend=BackendTarget.ASCEND, weight_quant_dtype=msdtype.int8, + act_quant_dtype=msdtype.int8, + outliers_suppression=OutliersSuppressionType.NONE, + precision_recovery=PrecisionRecovery.NONE, + act_quant_granularity=QuantGranularity.PER_TOKEN, + weight_quant_granularity=QuantGranularity.PER_CHANNEL) +layer_policies = OrderedDict({r'.*\.w2.*': w2_config}) +ptq = PTQ(config=cfg, layer_policies=layer_policies) +from research.qwen3.qwen3_transformers import Qwen3ParallelTransformerLayer +ptq.decoder_layer_types.append(Qwen3ParallelTransformerLayer) +ptq.apply(network, ds) +ptq.convert(network) +ms.save_checkpoint(network.parameters_dict(), "Qwen3-8B-A8W8", format="safetensors", + choice_func=lambda x: "key_cache" not in x and "value_cache" not in x and "float_weight" not in x) +``` + +Before calibration, add the MindSpore Transformers root directory to the `PYTHONPATH` environment variable, and check that the Qwen3-related classes can be imported successfully. + +### Downloading Quantized Weights + +We have uploaded the quantized Qwen3-8B to the [Modelers community](https://modelers.cn): [MindSpore-Lab/Qwen3-8B-A8W8](https://modelers.cn/models/MindSpore-Lab/Qwen3-8B-A8W8). Refer to the [Modelers community documentation](https://modelers.cn/docs/zh/openmind-hub-client/0.9/basic_tutorial/download.html) to download the weights locally. + +## Quantized Model Inference + +After obtaining the Qwen3-8B SmoothQuant weights, ensure they are stored in the relative path `Qwen3-8B-A8W8`. + +### Offline Inference + +Refer to the [Installation Guide](../../../getting_started/installation/installation.md) to set up the vLLM MindSpore environment.
Once ready, use the following Python code for offline inference: + +```python +import vllm_mindspore # Add this line at the top of the script +from vllm import LLM, SamplingParams + +# Sample prompts +prompts = [ + "I am", + "Today is", + "Llama is" +] + +# Create sampling parameters +sampling_params = SamplingParams(temperature=0.0, top_p=0.95) + +# Initialize LLM +llm = LLM(model="Qwen3-8B-A8W8", quantization='SmoothQuant') +# Generate text +outputs = llm.generate(prompts, sampling_params) +# Print results +for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") +``` + +Successful execution will yield inference results like: + +```text +Prompt: 'I am', Generated text: ' trying to create a virtual environment for my Python project, but I am encountering some' +Prompt: 'Today is', Generated text: ' the 100th day of school. To celebrate, the teacher has' +Prompt: 'Llama is', Generated text: ' a 100% natural, biodegradable, and compostable alternative' +``` diff --git a/docs/vllm_mindspore/docs/source_en/user_guide/supported_models/models_list/models_list.md b/docs/vllm_mindspore/docs/source_en/user_guide/supported_models/models_list/models_list.md new file mode 100644 index 0000000000000000000000000000000000000000..097ed295ccc70f52b59f35c17cb6c823e718ddc1 --- /dev/null +++ b/docs/vllm_mindspore/docs/source_en/user_guide/supported_models/models_list/models_list.md @@ -0,0 +1,21 @@ +# Supported Model List + +[![View Source](https://mindspore-website.obs.cn-north-4.myhuaweicloud.com/website-images/master/resource/_static/logo_source_en.svg)](https://gitee.com/mindspore/docs/blob/master/docs/vllm_mindspore/docs/source_en/user_guide/supported_models/models_list/models_list.md) + +| Model | Supported | Download Link | Backend | +|-------| --------- | ------------- | ------- | +| Qwen2.5 | √ | [Qwen2.5-7B](https://modelers.cn/models/AI-Research/Qwen2.5-7B), [Qwen2.5-32B](https://modelers.cn/models/AI-Research/Qwen2.5-32B), etc. | MINDFORMER_MODELS | +| Qwen3 | √ | [Qwen3-8B](https://modelers.cn/models/MindSpore-Lab/Qwen3-8B), [Qwen3-32B](https://modelers.cn/models/MindSpore-Lab/Qwen3-32B), etc. | MINDFORMER_MODELS | +| DeepSeek V3 | √ | [DeepSeek-V3](https://modelers.cn/models/MindSpore-Lab/DeepSeek-V3), etc. | MINDFORMER_MODELS | +| DeepSeek R1 | √ | [DeepSeek-R1](https://modelers.cn/models/MindSpore-Lab/DeepSeek-R1), [Deepseek-R1-W8A8](https://modelers.cn/models/MindSpore-Lab/DeepSeek-r1-w8a8), etc. | MINDFORMER_MODELS | + +The "Backend" refers to the source of the model, which can be either from MindSpore Transformers or vLLM MindSpore native models. It is specified using the environment variable `vLLM_MODEL_BACKEND`: + +- If the model source is MindSpore Transformers, the value is `MINDFORMER_MODELS`; +- If the model source is vLLM MindSpore, the value is `NATIVE_MODELS`. + +By default, the backend is set to `NATIVE_MODELS`. 
To change the model backend, use the following command: + +```bash +export vLLM_MODEL_BACKEND=MINDFORMER_MODELS +``` diff --git a/docs/vllm_mindspore/docs/source_zh_cn/faqs/faqs.md b/docs/vllm_mindspore/docs/source_zh_cn/faqs/faqs.md index deab07166d628d82fc4cea03b58ce6fe3e846b0c..7815a11f1a09561bf0569f3d395713ae7f9af9de 100644 --- a/docs/vllm_mindspore/docs/source_zh_cn/faqs/faqs.md +++ b/docs/vllm_mindspore/docs/source_zh_cn/faqs/faqs.md @@ -58,6 +58,7 @@ RuntimeError: Call aclnnNonzeroV2 failed, detail:E39999: Inner Error ``` +- 解决思路: 请检查CANN与MindSpore的配套关系是否正确。 ### 执行Qwen3时,报vLLM相关的`resolve_transformers_fallback`导入错误 @@ -68,6 +69,7 @@ ImportError: cannot import name 'resolve_transformers_fallback' from 'vllm.model_executor.model_loader.utils' ``` +- 解决思路: 请尝试将`vllm`切换为`v0.7.3`版本。 ### `import vllm_mindspore`时找不到`torch` @@ -78,6 +80,7 @@ importlib.metadata.PackageNotFoundError: No package metadata was found for torch ``` +- 解决思路: 请执行以下命令,下载torch相关组件: ```bash diff --git a/docs/vllm_mindspore/docs/source_zh_cn/getting_started/installation/installation.md b/docs/vllm_mindspore/docs/source_zh_cn/getting_started/installation/installation.md index f76ce96408c7f10e38a039c4be06fb946c376946..90ee43d26449a680138694ad7268ca3bc32640b6 100644 --- a/docs/vllm_mindspore/docs/source_zh_cn/getting_started/installation/installation.md +++ b/docs/vllm_mindspore/docs/source_zh_cn/getting_started/installation/installation.md @@ -106,7 +106,7 @@ pip install vllm_mindspore ### 源码安装 -- 安装CANN +- **CANN安装** CANN安装方法与环境配套,请参考[CANN社区版软件安装](https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/82RC1alpha002/softwareinst/instg/instg_0001.html?Mode=PmIns&OS=openEuler&Software=cannToolKit),若用户在安装CANN过程中遇到问题,可参考[昇腾常见问题](https://www.hiascend.com/document/detail/zh/AscendFAQ/ProduTech/CANNFAQ/cannfaq_000.html)进行解决。 CANN默认安装路径为`/usr/local/Ascend`。用户在安装CANN完毕后,使用如下命令,为CANN配置环境变量: @@ -117,14 +117,14 @@ pip install vllm_mindspore export ASCEND_CUSTOM_PATH=${LOCAL_ASCEND}/ascend-toolkit ``` -- 安装vLLM的前置依赖 +- **vLLM前置依赖安装** vLLM的环境配置与安装方法,请参考[vLLM安装教程](https://docs.vllm.ai/en/v0.8.3/getting_started/installation/cpu.html)。其依赖`gcc/g++ >= 12.3.0`版本,可通过以下命令完成安装: ```bash yum install -y gcc gcc-c++ ``` -- 安装vLLM MindSpore +- **vLLM MindSpore安装** 安装vLLM MindSpore,需要在拉取vLLM MindSpore源码后,执行以下命令,安装依赖包: diff --git a/docs/vllm_mindspore/docs/source_zh_cn/getting_started/quick_start/quick_start.md b/docs/vllm_mindspore/docs/source_zh_cn/getting_started/quick_start/quick_start.md index 60941bdc89ad5a2b474b1120e6b12a7c9372cb9a..aca28df54978cbbce3a4afa8a545a3e077771e84 100644 --- a/docs/vllm_mindspore/docs/source_zh_cn/getting_started/quick_start/quick_start.md +++ b/docs/vllm_mindspore/docs/source_zh_cn/getting_started/quick_start/quick_start.md @@ -125,7 +125,7 @@ git clone https://huggingface.co/Qwen/Qwen2.5-7B-Instruct ```bash export ASCEND_TOTAL_MEMORY_GB=64 # Please use `npu-smi info` to check the memory. export vLLM_MODEL_BACKEND=MindFormers # use MindSpore Transformers as model backend. -export vLLM_MODEL_MEMORY_USE_GB=16 # Memory reserved for model execution. Set according to the model's maximum usage, with the remaining environment used for kvcache allocation +export vLLM_MODEL_MEMORY_USE_GB=32 # Memory reserved for model execution. Set according to the model's maximum usage, with the remaining environment used for kvcache allocation export MINDFORMERS_MODEL_CONFIG=$YAML_PATH # Set the corresponding MindSpore Transformers model's YAML file. 
``` diff --git a/docs/vllm_mindspore/docs/source_zh_cn/getting_started/tutorials/deepseek_multiNode/deepseek_r1_671b_w8a8_tp16_multi_node.md b/docs/vllm_mindspore/docs/source_zh_cn/getting_started/tutorials/deepseek_multiNode/deepseek_r1_671b_w8a8_tp16_multi_node.md index 22a74c910e2d043f8b4ca8214b3edd904ac35189..af563414a8a993bc6ba58f27887779c7999a34ca 100644 --- a/docs/vllm_mindspore/docs/source_zh_cn/getting_started/tutorials/deepseek_multiNode/deepseek_r1_671b_w8a8_tp16_multi_node.md +++ b/docs/vllm_mindspore/docs/source_zh_cn/getting_started/tutorials/deepseek_multiNode/deepseek_r1_671b_w8a8_tp16_multi_node.md @@ -86,7 +86,7 @@ git clone https://modelers.cn/MindSpore-Lab/DeepSeek-R1-W8A8.git 分别在主从节点配置如下环境变量: -> 注:环境变量必须设置在 Ray 创建集群前,且当环境有变更时,需要通过 `ray stop` 将主从节点集群停止,并重新创建集群,否则环境变量将不生效。 +> 环境变量必须设置在 Ray 创建集群前,且当环境有变更时,需要通过 `ray stop` 将主从节点集群停止,并重新创建集群,否则环境变量将不生效。 ```bash source /usr/local/Ascend/ascend-toolkit/set_env.sh diff --git a/docs/vllm_mindspore/docs/source_zh_cn/index.rst b/docs/vllm_mindspore/docs/source_zh_cn/index.rst index 0045c52e4b7cdc4da7de0b2796b24a7f85cd6574..46f7199b4adb2838f63e009ff141e262eadef67d 100644 --- a/docs/vllm_mindspore/docs/source_zh_cn/index.rst +++ b/docs/vllm_mindspore/docs/source_zh_cn/index.rst @@ -124,7 +124,7 @@ Apache 许可证 2.0,如 `LICENSE