diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 0000000000000000000000000000000000000000..d057201a76ae4ad81c3deff14ede37dbad233438 --- /dev/null +++ b/.gitmodules @@ -0,0 +1,4 @@ +[submodule "tests/mindformers"] + path = tests/mindformers + url = https://gitee.com/mindspore/mindformers.git + branch = br_infer_deepseek_os diff --git a/.jenkins/test/config/dependent_package.yaml b/.jenkins/test/config/dependent_package.yaml deleted file mode 100644 index 19bed914b485d3cc51b19874bfccc91737137a7f..0000000000000000000000000000000000000000 --- a/.jenkins/test/config/dependent_package.yaml +++ /dev/null @@ -1,8 +0,0 @@ -mindspore: - https://repo.mindspore.cn/mindspore/mindspore/version/202502/20250227/master_20250227211723_94ac228bae9cd6d0f00b4ce8d5857773799c4f26_newest/ - -mindformers: - https://repo.mindspore.cn/mindspore/mindformers/version/202502/20250228/dev_20250228220021_4e90ca405720ea2f4a0abdf501d01078f28d724c_newest/ - -msadapter: - https://repo.mindspore.cn/mindspore/msadapter/version/202503/20250301/master_20250301_newest/ diff --git a/.jenkins/test/config/dependent_packages.yaml b/.jenkins/test/config/dependent_packages.yaml new file mode 100644 index 0000000000000000000000000000000000000000..375119335c8ce534a82ade88e8970e3c85245b2c --- /dev/null +++ b/.jenkins/test/config/dependent_packages.yaml @@ -0,0 +1,11 @@ +mindspore: + 'https://repo.mindspore.cn/mindspore/mindspore/version/202504/20250417/br_infer_deepseek_os_20250417004508_38b6db6c3039b59153d52d5e353cd01fe774dc93_newest/' + +mindspore_gs: + 'https://repo.mindspore.cn/mindspore/golden-stick/version/202504/20250424/master_20250424010019_dc3222e266c572dce1070a112aa6e12155a45370_newest/' + +msadapter: + 'https://repo.mindspore.cn/mindspore/msadapter/version/202504/20250410/master_20250410120007_83e7214eb2b9598179135a4e98dce3b69ba27da2_newest/' + +vllm: + 'https://repo.mindspore.cn/mirrors/vllm/version/202503/20250321/v0.7.3_20250321112504_ed6e9075d31e32c8548b480a47d1ffb77da1f54c_newest/' diff --git a/Dockerfile b/Dockerfile index 059901e9c1c3a9cbbba35b3fd20cd35319a4dbb9..d174da7c2085a9c8173549d48cc92cce1cb813fb 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,5 +1,6 @@ FROM hub.oepkgs.net/openeuler/openeuler:22.03-lts-sp4 +####################### os ####################### RUN yum clean all && \ yum makecache && \ yum install -y \ @@ -16,74 +17,92 @@ RUN yum clean all && \ ####################### python ####################### WORKDIR /root -RUN wget --no-check-certificate https://mirrors.tuna.tsinghua.edu.cn/anaconda/miniconda/Miniconda3-py311_25.1.1-2-Linux-aarch64.sh && \ +RUN wget https://mirrors.tuna.tsinghua.edu.cn/anaconda/miniconda/Miniconda3-py311_25.1.1-2-Linux-aarch64.sh && \ bash /root/Miniconda3-py311_25.1.1-2-Linux-aarch64.sh -b && \ rm /root/Miniconda3-py311_25.1.1-2-Linux-aarch64.sh ENV PATH="/root/miniconda3/bin:$PATH" ENV PYTHONPATH="/root/miniconda3/lib/python3.11/site-packages" -RUN pip config set global.index-url 'https://mirrors.tools.huawei.com/pypi/simple/' && \ - pip config set global.trusted-host mirrors.tools.huawei.com +RUN pip config set global.index-url 'https://pypi.tuna.tsinghua.edu.cn/simple' && \ + pip config set global.trusted-host pypi.tuna.tsinghua.edu.cn ####################### CANN ####################### -COPY ascend_install.info /etc/ascend_install.info -RUN wget --no-check-certificate "https://ascend-repo.obs.cn-east-2.myhuaweicloud.com/CANN/CANN%208.0.0/Ascend-cann-toolkit_8.0.0_linux-aarch64.run" -o Ascend-cann-toolkit_8.0.0_linux-aarch64.run && \ - wget 
--no-check-certificate "https://ascend-repo.obs.cn-east-2.myhuaweicloud.com/CANN/CANN%208.0.0/Ascend-cann-kernels-910b_8.0.0_linux-aarch64.run" -o Ascend-cann-kernels-910b_8.0.0_linux-aarch64.run && \ - wget --no-check-certificate "https://ascend-repo.obs.cn-east-2.myhuaweicloud.com/CANN/CANN%208.0.0/Ascend-cann-nnrt_8.0.0_linux-aarch64.run" -o Ascend-cann-nnrt_8.0.0_linux-aarch64.run && \ +WORKDIR /root +RUN echo "UserName=HwHiAiUser" >> /etc/ascend_install.info && \ + echo "UserGroup=HwHiAiUser" >> /etc/ascend_install.info && \ + echo "Firmware_Install_Type=full" >> /etc/ascend_install.info && \ + echo "Firmware_Install_Path_Param=/usr/local/Ascend" >> /etc/ascend_install.info && \ + echo "Driver_Install_Type=full" >> /etc/ascend_install.info && \ + echo "Driver_Install_Path_Param=/usr/local/Ascend" >> /etc/ascend_install.info && \ + echo "Driver_Install_For_All=no" >> /etc/ascend_install.info && \ + echo "Driver_Install_Mode=normal" >> /etc/ascend_install.info && \ + echo "Driver_Install_Status=complete" >> /etc/ascend_install.info +RUN curl -s "https://ascend-repo.obs.cn-east-2.myhuaweicloud.com/CANN/CANN%208.0.0/Ascend-cann-toolkit_8.0.0_linux-aarch64.run" -o Ascend-cann-toolkit.run && \ + curl -s "https://ascend-repo.obs.cn-east-2.myhuaweicloud.com/CANN/CANN%208.0.0/Ascend-cann-kernels-910b_8.0.0_linux-aarch64.run" -o Ascend-cann-kernels-910b.run && \ + curl -s "https://ascend-repo.obs.cn-east-2.myhuaweicloud.com/CANN/CANN%208.0.0/Ascend-cann-nnrt_8.0.0_linux-aarch64.run" -o Ascend-cann-nnrt.run && \ chmod a+x *.run && \ - bash /root/Ascend-cann-toolkit_8.0.0_linux-aarch64.run --install -q && \ - bash /root/Ascend-cann-kernels-910b_8.0.0_linux-aarch64.run --install -q && \ - bash /Ascend-cann-nnrt_8.0.0_linux-aarch64.run --install -q && \ + bash /root/Ascend-cann-toolkit.run --install -q && \ + bash /root/Ascend-cann-kernels-910b.run --install -q && \ + bash /root/Ascend-cann-nnrt.run --install -q && \ rm /root/*.run - RUN echo "source /usr/local/Ascend/nnrt/set_env.sh" >> /root/.bashrc && \ echo "source /usr/local/Ascend/ascend-toolkit/set_env.sh" >> /root/.bashrc -####################### dev ####################### +####################### dev env ####################### RUN pip install --no-cache-dir \ - cmake \ + cmake>=3.26 \ decorator \ ray==2.42.1 \ + protobuf==3.20.0 \ + ml_dtypes \ wheel \ setuptools \ wrap \ - deprecated + deprecated \ + packaging \ + ninja \ + "setuptools-scm>=8" \ + numpy \ + build WORKDIR /workspace -ARG GITEE_USERNAME -ARG GITEE_PASSWORD -RUN git config --global credential.helper store && \ - echo "https://${GITEE_USERNAME}:${GITEE_PASSWORD}@gitee.com" > /root/.git-credentials - RUN git clone -b br_infer_deepseek_os https://gitee.com/mindspore/mindformers.git /workspace/mindformers && \ cd mindformers && \ + sed -i 's/-i https:\/\/pypi.tuna.tsinghua.edu.cn\/simple//g' build.sh && \ bash build.sh && \ PACKAGE_PATH=$(python3 -c "import site; print(site.getsitepackages()[0])") && \ cp -a research "$PACKAGE_PATH" && \ rm -rf /workspace/mindformers -RUN git clone -b deepseek https://gitee.com/mindspore/golden-stick.git /workspace/golden-stick && \ +RUN git clone https://gitee.com/mindspore/golden-stick.git /workspace/golden-stick && \ cd golden-stick && \ bash build.sh && \ pip install --no-cache-dir /workspace/golden-stick/output/*.whl && \ rm -rf /workspace/golden-stick -RUN git clone https://gitee.com/mindspore/msadapter.git /workspace/msadapter && \ - cd /workspace/msadapter && \ - bash scripts/build_and_reinstall.sh && \ - rm -rf /workspace/msadapter - 
-# vllm_ms ENV USE_TORCH="FALSE" ENV USE_TF="FALSE" RUN git clone -b v0.6.6.post1 https://gitee.com/mirrors/vllm.git /workspace/vllm && \ cd vllm && \ VLLM_TARGET_DEVICE=empty pip install --no-cache-dir . && \ rm -rf /workspace/vllm + +RUN git clone https://openi.pcl.ac.cn/OpenI/MSAdapter.git /workspace/msadapter && \ + cd /workspace/msadapter && \ + bash scripts/build_and_reinstall.sh && \ + rm -rf /workspace/msadapter + ADD . /workspace/vllm_mindspore RUN cd /workspace/vllm_mindspore && \ pip install --no-cache-dir -r requirements.txt && \ pip install . && \ rm -rf /workspace/vllm_mindspore +RUN wget -O mindspore-2.5.0-cp311-cp311-linux_aarch64.whl \ +https://repo.mindspore.cn/mindspore/mindspore/version/202503/20250303/br_infer_deepseek_os_20250303004707_705727d59236c8c197b25ad0e464c4908434d42f_newest/unified/aarch64/mindspore-2.5.0-cp311-cp311-linux_aarch64.whl && \ +pip install --no-cache-dir mindspore-2.5.0-cp311-cp311-linux_aarch64.whl && \ +rm -f mindspore-2.5.0-cp311-cp311-linux_aarch64.whl + +RUN pip uninstall torch torch-npu torchvision -y + CMD ["bash"] \ No newline at end of file diff --git a/OWNERS b/OWNERS new file mode 100644 index 0000000000000000000000000000000000000000..90ee21daca40e61153d7c13faf2220c0c339aa56 --- /dev/null +++ b/OWNERS @@ -0,0 +1,12 @@ +reviewers: +- wang_shaocong +- erpim +- zhang_xue_tong +- tan-wei-cheng + +approvers: +- tronzhang +- zichun_ye +- zlq2020 +- panshaowu +- zhaizhiqiang \ No newline at end of file diff --git a/README.md b/README.md index 284abf19eed28a42d1e35ace34256483853b7550..5ea56601b6088bb737fde97ca01261d5bce9f4ae 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -# vllm_mindspore +# vllm-mindspore ## Overview @@ -6,7 +6,7 @@ The `vllm-mindspore`is a integration for running vLLM on the MindSpore framework This is the recommended solution for supporting the MindSpore within the vLLM community. It provides deep integration with the MindSpore framework, offering efficient computation and optimization support for vLLM, enabling seamless operation on MindSpore. -By using the `vllm-mindspore`, popular open-source models, including Transformer-like, Mixture-of-Expert, Embedding, and Multi-modal LLMs, can run seamlessly for training and inference on the MindSpore framework. +By using the `vllm-mindspore`, popular open-source models, can run seamlessly for training and inference on the MindSpore framework. --- @@ -14,9 +14,9 @@ By using the `vllm-mindspore`, popular open-source models, including Transformer - Hardware: Atlas A2/A3 - Software: - - Python >= 3.9 - - CANN >= 8.0.0 - - MindSpore >=2.5.0 + - Python >= 3.9 + - CANN >= 8.0.0 + - MindSpore >=2.5.0 --- @@ -24,16 +24,22 @@ By using the `vllm-mindspore`, popular open-source models, including Transformer ### Installation -Installation from source code +#### Installation from source code + +Install from source code. [Wiki Installation.](https://gitee.com/mindspore/vllm-mindspore/wikis/Getting%20Started/Installation) + +#### Set up using Docker + +##### Pre-built images ```shell -# 1. Uninstall torch-related packages due to msadapter limitations -pip3 uninstall torch torch-npu torchvision +docker pull hub.oepkgs.net/oedeploy/openeuler/aarch64/mindspore:v1.0 +``` + +##### Build image from source -# 2.Install vllm_mindspore -git clone https://gitee.com/mindspore/vllm_mindspore.git -cd vllm_mindspore -pip install . +```shell +docker build --network=host . ``` ### Inference and Serving @@ -42,22 +48,28 @@ pip install . You can run vllm_mindspore in your own code on a list of prompts. 
+```bash +export ASCEND_TOTAL_MEMORY_GB=64 # Based on the ascend device. +``` + ```python + import vllm_mindspore # Add this line on the top of script. + from vllm import LLM, SamplingParams # Sample prompts. prompts = [ "I am", "Today is", - "Llama is" + "What is" ] # Create a sampling params object. sampling_params = SamplingParams(temperature=0.0, top_p=0.95) # Create an LLM. -llm = LLM(model="meta-llama/Llama-2-7b-hf") +llm = LLM(model="Qwen/Qwen2.5-32B-Instruct", tensor_parallel_size=8) # Generate texts from the prompts. The output is a list of RequestOutput objects # that contain the prompt, generated text, and other information. outputs = llm.generate(prompts, sampling_params) @@ -66,28 +78,29 @@ for output in outputs: prompt = output.prompt generated_text = output.outputs[0].text print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") + ``` #### Serving(OpenAI-Compatible) You can start the server via the vllm_mindspore command: -`python3 -m vllm_mindspore.entrypoints vllm.entrypoints.openai.api_server --model "meta-llama/Llama-2-7b-hf"` +`python3 -m vllm_mindspore.entrypoints vllm.entrypoints.openai.api_server --model "Qwen/Qwen2.5-32B-Instruct" --tensor_parallel_size=8` To call the server, you can use `curl` or any other HTTP client. ```shell + curl http://localhost:8000/v1/completions \ -H "Content-Type: application/json" \ -d '{ - "model": "meta-llama/Llama-2-7b-hf", - "prompt": "Llama is", + "model": "Qwen/Qwen2.5-32B-Instruct", + "prompt": "MindSpore is", "max_tokens": 120, "temperature": 0 }' -``` - +``` ## Contributing @@ -96,8 +109,6 @@ We welcome and value any contributions and collaborations: - Please feel free comments about your usage of vllm_mindspore. - Please let us know if you encounter a bug by filing an issue. - - ## License Apache License 2.0, as found in the [LICENSE](https://gitee.com/mindspore/vllm_mindspore/blob/master/LICENSE) file. diff --git a/build_image.sh b/build_image.sh deleted file mode 100644 index 801fdcf310cbbd9be78ae9f7eb4408cce2b6f820..0000000000000000000000000000000000000000 --- a/build_image.sh +++ /dev/null @@ -1,181 +0,0 @@ -#!/bin/bash - -validate_args() { - if [ $# -lt 2 ]; then - echo "Usage: $0 " - exit 1 - fi - MODEL=$1 - VERSION=$2 -} - -check_proxy() { - if [[ -z "$http_proxy" || -z "$https_proxy" ]]; then - echo "Error: http_proxy and https_proxy must be set." 
- exit 1 - fi -} - -init_variables() { - case $MODEL in - "300I") - DEVICE=310p - DEVICE_TAG=300I-Duo - ;; - "800I") - DEVICE=910b - DEVICE_TAG=800I-A2 - ;; - "A3") - DEVICE=A3 - DEVICE_TAG=800I-A3 - ;; - *) - echo "Unsupported architecture: $MODEL" - exit 1 - ;; - esac - - FILE_VERSION="${VERSION%.*}-${VERSION##*.}" - IMAGE_FILE_NAME="mindie:dev-${FILE_VERSION}-${DEVICE_TAG}-py311-ubuntu22.04-aarch64" - IMAGE_FILE="${IMAGE_FILE_NAME}.tar.gz" - IMAGE_URL="https://cmc-nkg-artifactory.cmc.tools.huawei.com/artifactory/cmc-nkg-inner/MindIE/ATB-Models/${VERSION}/MindIE-images/${IMAGE_FILE}" - IMAGE_MD5_URL="${IMAGE_URL}.md5" - DOCKER_TAG="ms_vllm_$(date +%Y%m%d)" -} - -print_summary() { - echo "Model: $MODEL" - echo "Version: $VERSION" - echo "Image url: $IMAGE_URL" -} - -update_msadapter() { - rm -rf vllm_mindspore/msadapter - git submodule update --init vllm_mindspore/msadapter || true - cd vllm_mindspore/msadapter || exit 1 - for patch in ../../patch/msadapter/*.patch; do - [ -e "$patch" ] || continue - git apply "$patch" - done - touch __init__.py - touch mindtorch/__init__.py - cd - >/dev/null -} - -function fetch_and_patch_vllm() { - local script_dir=$(cd "$(dirname $0)"; pwd) - local vllm_tag="v0.6.6.post1" - local vllm_source_dir="${script_dir}/vllm-${vllm_tag}" - local patch_dir="${script_dir}/patch/vllm" - - if [ -d "${vllm_source_dir}" ]; then - echo "The ${vllm_source_dir} already exists. Remove it if reinstallation is needed." - exit 1 - fi - - git clone https://github.com/vllm-project/vllm.git -b ${vllm_tag} --depth 1 ${vllm_source_dir} - cd ${vllm_source_dir} - - for patch in $(ls ${patch_dir}); do - sed -i 's/\r//g' ${patch_dir}/${patch} - git apply ${patch_dir}/${patch} - done - cd .. -} - -download_file() { - local url=$1 - local output=$2 - curl -k --noproxy 'cmc-nkg-artifactory.cmc.tools.huawei.com' "$url" -o "$output" - if [ $? -ne 0 ]; then - echo "Failed to download $output from $url" - exit 1 - fi -} - -verify_md5() { - local file=$1 - local md5_file=$2 - local downloaded_md5=$(awk '{print $1}' $md5_file) - local calculated_md5=$(md5sum $file | awk '{print $1}') - - if [ "$downloaded_md5" == "$calculated_md5" ]; then - echo "MD5 checksum for $file verified successfully." - return 0 - else - echo "MD5 checksum verification failed!" - echo "Expected: $downloaded_md5" - echo "Got: $calculated_md5" - return 1 - fi -} - -check_or_download() { - local file=$1 - local md5_file=$2 - local file_url=$3 - local md5_url=$4 - - if [ -f "$file" ] && [ -f "$md5_file" ]; then - verify_md5 "$file" "$md5_file" && return 0 - echo "Verification failed. Redownloading files..." - else - echo "Files not found. Downloading..." - fi - - download_file "$md5_url" "$md5_file" - download_file "$file_url" "$file" - verify_md5 "$file" "$md5_file" || { echo "Verification failed after re-downloading. Exiting."; exit 1; } -} - -load_docker_image() { - local file=$1 - docker load -i $file - if [ $? -eq 0 ]; then - echo "Docker image loaded successfully." - else - echo "Failed to load Docker image." - exit 1 - fi -} - -build_docker_image() { - local tag=$1 - docker build \ - --network=host \ - --build-arg http_proxy=$http_proxy \ - --build-arg https_proxy=$https_proxy \ - --build-arg no_proxy=127.0.0.1,*.huawei.com,localhost,local,.local,172.17.0.1,cmc-nkg-artifactory.cmc.tools.huawei.com,mirrors.tools.huawei.com \ - -f Dockerfile \ - -t $tag \ - --target ms_vllm \ - . - - if [ $? -eq 0 ]; then - echo "Docker image $tag built successfully." - else - echo "Failed to build Docker image." 
- exit 1 - fi -} - -main() { - validate_args "$@" - check_proxy - - init_variables - print_summary - - # update repo - update_msadapter - fetch_and_patch_vllm - - # docker build - check_or_download "mindie.tar.gz" "mindie.tar.gz.md5" "$IMAGE_URL" "$IMAGE_MD5_URL" - load_docker_image "mindie.tar.gz" - sed -i "1s|FROM .* AS base|FROM $IMAGE_FILE_NAME AS base|" Dockerfile - build_docker_image "$DOCKER_TAG" -} - -main "$@" \ No newline at end of file diff --git a/codecheck_toolkits/README.md b/codecheck_toolkits/README.md new file mode 100644 index 0000000000000000000000000000000000000000..03b601856bfeddf4c81a2880e56f8c785ea59db0 --- /dev/null +++ b/codecheck_toolkits/README.md @@ -0,0 +1,30 @@ +# vllm 社区 codecheck 检查工具链使用说明 + +## 使用步骤 +- 1. 确保修改已经```git commit```,并合并成一个commit id. +- 2. 运行命令:```bash vllm_codecheck.sh``` + +## 执行说明 +- 1、根据 ``requiremnts-lint.txt``安装工具链,请确保网络畅通。 +- 2、依次运行`yaph`, `codespell`, `ruff`, `isort`, `mypy` 工具。 + +## 工具说明 +- `yapf`: 自动formatting工具。 +- `codespell`: 拼写检查工具。 +- `ruff`: 代码format检查工具。 +- `isort`: 自动修复import工具。 +- `mypy`: 静态类型检查工具。 + +## 修复建议: +- `codespell`如需屏蔽拼写错误,修改`pyproject.toml`中的 + +```commandline +[tool.codespell] +ignore-words-list = "dout, te, indicies, subtile, ElementE, CANN" +``` + +- `ruff` 如需屏蔽检查,在代码行后增加注释 + +```commandline +# noqa: {error_code} +``` diff --git a/codecheck_toolkits/pyproject.toml b/codecheck_toolkits/pyproject.toml new file mode 100644 index 0000000000000000000000000000000000000000..9a3c52de098a77f51eaa18f62f206bc7fa054f16 --- /dev/null +++ b/codecheck_toolkits/pyproject.toml @@ -0,0 +1,110 @@ +[build-system] +# Should be mirrored in requirements-build.txt +requires = [ + "cmake>=3.26", + "ninja", + "packaging", + "setuptools>=61", + "setuptools-scm>=8.0", + "torch == 2.5.1", + "wheel", + "jinja2", +] +build-backend = "setuptools.build_meta" + +[tool.setuptools_scm] +# version_file = "vllm/_version.py" # currently handled by `setup.py:get_version()` + +[tool.ruff] +# Allow lines to be as long as 80. +line-length = 80 +exclude = [ + # External file, leaving license intact + "vllm_mindspore/__init__.py", + "tests/*" +] + +[tool.ruff.lint.per-file-ignores] +"vllm_mindspore/version.txt" = ["F401"] +"vllm_mindspore/_version.txt" = ["ALL"] + +[tool.ruff.lint] +select = [ + # pycodestyle + "E", + # Pyflakes + "F", + # pyupgrade + "UP", + # flake8-bugbear + "B", + # flake8-simplify + "SIM", + # isort + # "I", + "G", +] +ignore = [ + # star imports + "F405", "F403", + # lambda expression assignment + "E731", + # Loop control variable not used within loop body + "B007", + # f-string format + "UP032", + # long line + "E501" +] + +[tool.mypy] +ignore_missing_imports = true +check_untyped_defs = true +follow_imports = "silent" + +# After fixing type errors resulting from follow_imports: "skip" -> "silent", +# move the directory here and remove it from tools/mypy.sh +#files = [ +# "vllm/*.py", +# "vllm/adapter_commons", +# "vllm/assets", +# "vllm/entrypoints", +# "vllm/core", +# "vllm/inputs", +# "vllm/logging_utils", +# "vllm/multimodal", +# "vllm/platforms", +# "vllm/transformers_utils", +# "vllm/triton_utils", +# "vllm/usage", +#] +files= ["vllm_mindspore/*.py",] +# TODO(woosuk): Include the code from Megatron and HuggingFace. +exclude = [ + "vllm_mindspore/model_executor/parallel_utils/|vllm_mindspore/model_executor/models/", + # Ignore triton kernels in ops. 
+ 'vllm_mindspore/attention/ops/.*\.py$' +] + +[tool.codespell] +ignore-words-list = "dout, te, indicies, subtile, ElementE, CANN" +skip = "./tests/models/fixtures,./tests/prompts,./benchmarks/sonnet.txt,./tests/lora/data,./build" + +[tool.isort] +use_parentheses = true +skip_gitignore = true + +skip_glob = ["tests/*", "vllm_mindspore/ops/*"] +skip = ["vllm_mindspore/__init__.py"] + +[tool.pytest.ini_options] +markers = [ + "skip_global_cleanup", + "core_model: enable this model test in each PR instead of only nightly", + "cpu_model: enable this model test in CPU tests", + "quant_model: run this model test under Quantized category", + "split: run this test as part of a split", + "distributed: run this test only in distributed GPU tests", + "skip_v1: do not run this test with v1", + "optional: optional tests that are automatically skipped, include --optional to run them", +] \ No newline at end of file diff --git a/codecheck_toolkits/requirements-lint.txt b/codecheck_toolkits/requirements-lint.txt new file mode 100644 index 0000000000000000000000000000000000000000..711bb50a0e93656e2495d20745021b9eb2de0af4 --- /dev/null +++ b/codecheck_toolkits/requirements-lint.txt @@ -0,0 +1,15 @@ +# formatting +yapf==0.32.0 +toml==0.10.2 +tomli==2.0.2 +ruff==0.6.5 +codespell==2.3.0 +isort==5.13.2 +clang-format==18.1.5 +sphinx-lint==1.0.0 + +# type checking +mypy==1.11.1 +types-PyYAML +types-requests +types-setuptools diff --git a/codecheck_toolkits/vllm_codecheck.sh b/codecheck_toolkits/vllm_codecheck.sh new file mode 100644 index 0000000000000000000000000000000000000000..e67c73723425b6568b33a76c2d145c75ab4d0eb3 --- /dev/null +++ b/codecheck_toolkits/vllm_codecheck.sh @@ -0,0 +1,72 @@ +pip install -r requirements-lint.txt + +RET_FLAG=0 + +cd .. +# yapf formats code automatically + +MERGEBASE="$(git merge-base origin/master HEAD)" +if ! git diff --diff-filter=ACM --quiet --exit-code "$MERGEBASE" -- '*.py' '*.pyi' &> /dev/null; then + git diff --name-only --diff-filter=ACM "$MERGEBASE" -- '*.py' '*.pyi' | xargs -P 5 \ + yapf --in-place --recursive --parallel --exclude build/ +fi + +if [[ $? -ne 0 ]]; then + echo "yapf run failed." + RET_FLAG=1 +else + echo "yapf run success." +fi + +# codespell check +if ! git diff --diff-filter=ACM --quiet --exit-code "$MERGEBASE" -- '*.py' '*.pyi' &> /dev/null; then + git diff --name-only --diff-filter=ACM "$MERGEBASE" -- '*.py' '*.pyi' | xargs -P 5 \ + codespell --skip ./vllm_mindspore/ops/ascendc/* +fi +if [[ $? -ne 0 ]]; then + echo "codespell check failed." + RET_FLAG=1 +else + echo "codespell check success." +fi + +# ruff check +if ! git diff --diff-filter=ACM --quiet --exit-code "$MERGEBASE" -- '*.py' '*.pyi' &> /dev/null; then + git diff --name-only --diff-filter=ACM "$MERGEBASE" -- '*.py' '*.pyi' | xargs -P 5 \ + ruff check +fi +if [[ $? -ne 0 ]]; then + echo "ruff check failed." + RET_FLAG=1 +else + echo "ruff check success." +fi + +# isort fixed +if ! git diff --diff-filter=ACM --quiet --exit-code "$MERGEBASE" -- '*.py' '*.pyi' &> /dev/null; then + git diff --name-only --diff-filter=ACM "$MERGEBASE" -- '*.py' '*.pyi' | xargs -P 5 \ + isort +fi +if [[ $? -ne 0 ]]; then + echo "isort fixed failed." + RET_FLAG=1 +else + echo "isort fixed success." +fi + +# mypy check type + +PYTHON_VERSION=$(python -c 'import sys; print(f"{sys.version_info.major}.{sys.version_info.minor}")') + +if ! 
git diff --diff-filter=ACM --quiet --exit-code "$MERGEBASE" -- '*.py' '*.pyi' &> /dev/null; then + git diff --name-only --diff-filter=ACM "$MERGEBASE" -- '*.py' '*.pyi' | xargs -P 5 \ + mypy --follow-imports skip --python-version "${PYTHON_VERSION}" "$@" +fi +if [[ $? -ne 0 ]]; then + echo "mypy check failed." + RET_FLAG=1 +else + echo "mypy check success." +fi + +cd - || exit $RET_FLAG diff --git a/setup.py b/setup.py index e7189008b772f66ae480f40b4cbee8befaf1c28c..8e2154b3671e0e6956e5cb90feb3e6d19f3216a1 100644 --- a/setup.py +++ b/setup.py @@ -1,6 +1,7 @@ #!/usr/bin/env python3 # encoding: utf-8 # Copyright 2025 Huawei Technologies Co., Ltd +# Copyright 2024 The vLLM team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -20,9 +21,14 @@ import importlib.util import logging import os import sys +import shutil from typing import List from pathlib import Path from setuptools import find_packages, setup +from setuptools.command.build_ext import build_ext +from setuptools import Extension +import subprocess +import warnings def load_module_from_path(module_name, path): @@ -82,8 +88,102 @@ def get_requirements() -> List[str]: return requirements +def write_commit_id(): + ret_code = os.system("git rev-parse --abbrev-ref HEAD > ./vllm_mindspore/.commit_id " + "&& git log --abbrev-commit -1 >> ./vllm_mindspore/.commit_id") + if ret_code != 0: + sys.stdout.write("Warning: Can not get commit id information. Please make sure git is available.") + os.system("echo 'git is not available while building.' > ./vllm_mindspore/.commit_id") + + version = (Path("vllm_mindspore") / "version.txt").read_text() +def _get_ascend_home_path(): + return os.environ.get("ASCEND_HOME_PATH", "/usr/local/Ascend/ascend-toolkit/latest") + +def _get_ascend_env_path(check_exists=True): + env_script_path = os.path.join(_get_ascend_home_path(), "bin", "setenv.bash") + if check_exists and not os.path.exists(env_script_path): + warnings.warn(f"The file '{env_script_path}' is not found, " + "please make sure env variable 'ASCEND_HOME_PATH' is set correctly.") + return None + return env_script_path + +class CustomBuildExt(build_ext): + ROOT_DIR = os.path.abspath(os.path.dirname(__file__)) + + def build_extension(self, ext): + if ext.name == "vllm_mindspore.npu_ops": + self.build_npu_ops(ext) + else: + raise ValueError(f"Unknown extension name: {ext.name}") + + def build_npu_ops(self, ext): + # "vllm_mindspore.npu_ops" --> "npu_ops" + ext_name = ext.name.split('.')[-1] + so_name = ext_name + ".so" + print(f"Building {so_name} ...") + OPS_DIR = os.path.join(ROOT_DIR, "vllm_mindspore", "ops") + BUILD_OPS_DIR = os.path.join(ROOT_DIR, "build", "ops") + os.makedirs(BUILD_OPS_DIR, exist_ok=True) + + ascend_home_path = _get_ascend_home_path() + env_script_path = _get_ascend_env_path(False) + build_extension_dir = os.path.join(BUILD_OPS_DIR, "kernel_meta", ext_name) + # Combine all cmake commands into one string + cmake_cmd = ( + f"source {env_script_path} && " + f"cmake -S {OPS_DIR} -B {BUILD_OPS_DIR}" + f" -DCMAKE_BUILD_TYPE=Release" + f" -DCMAKE_INSTALL_PREFIX={os.path.join(BUILD_OPS_DIR, 'install')}" + f" -DBUILD_EXTENSION_DIR={build_extension_dir}" + f" -DMS_EXTENSION_NAME={ext_name}" + f" -DASCEND_CANN_PACKAGE_PATH={ascend_home_path} && " + f"cmake --build {BUILD_OPS_DIR} -j --verbose" + ) + + try: + # Run the combined cmake command + print(f"Running combined CMake commands:\n{cmake_cmd}") + result = subprocess.run(cmake_cmd, cwd=self.ROOT_DIR, 
text=True, shell=True, capture_output=True) + if result.returncode != 0: + print("CMake commands failed:") + print(result.stdout) # Print standard output + print(result.stderr) # Print error output + raise RuntimeError(f"Combined CMake commands failed with exit code {result.returncode}") + except subprocess.CalledProcessError as e: + raise RuntimeError(f"Failed to build {so_name}: {e}") + + # Copy the generated .so file to the target directory + src_so_path = os.path.join(build_extension_dir, so_name) + dst_so_path = self.get_ext_fullpath(ext.name) + os.makedirs(os.path.dirname(dst_so_path), exist_ok=True) + if os.path.exists(dst_so_path): + os.remove(dst_so_path) + shutil.copy(src_so_path, dst_so_path) + print(f"Copied {so_name} to {dst_so_path}") + + +write_commit_id() + +package_data = { + "": [ + "*.so", + "lib/*.so", + ".commit_id" + ] +} + +def _get_ext_modules(): + ext_modules = [] + # Currently, the CI environment does not support the compilation of custom operators. + # As a temporary solution, this is controlled via an environment variable. + # Once the CI environment adds support for custom operator compilation, + # this should be updated to enable compilation by default. + if os.getenv("vLLM_USE_NPU_ADV_STEP_FLASH_OP", "off") == "on" and _get_ascend_env_path() is not None: + ext_modules.append(Extension("vllm_mindspore.npu_ops", sources=[])) # sources are specified in CMakeLists.txt + return ext_modules + setup( name="vllm-mindspore", version=version, @@ -95,9 +195,9 @@ setup( ), long_description=read_readme(), long_description_content_type="text/markdown", - url="https://gitee.com/mindspore/vllm_mindspore", + url="https://gitee.com/mindspore/vllm-mindspore", project_urls={ - "Homepage": "https://gitee.com/mindspore/vllm_mindspore", + "Homepage": "https://gitee.com/mindspore/vllm-mindspore", "Documentation": "", }, classifiers=[ @@ -114,4 +214,8 @@ setup( packages=find_packages(), python_requires=">=3.9", install_requires=get_requirements(), + cmdclass={"build_ext": CustomBuildExt}, + ext_modules=_get_ext_modules(), + include_package_data=True, + package_data=package_data, ) diff --git a/tests/mindformers b/tests/mindformers new file mode 160000 index 0000000000000000000000000000000000000000..544c4009573051e0e254efab71d212bfc77fc7b2 --- /dev/null +++ b/tests/mindformers @@ -0,0 +1 @@ +Subproject commit 544c4009573051e0e254efab71d212bfc77fc7b2 diff --git a/tests/__init__.py b/tests/st/python/__init__.py similarity index 100% rename from tests/__init__.py rename to tests/st/python/__init__.py diff --git a/tests/st/python/config/predict_deepseek_r1_671b.yaml b/tests/st/python/config/predict_deepseek_r1_671b.yaml new file mode 100644 index 0000000000000000000000000000000000000000..112375eff5105d85e32c325b0b3f3d987591a1bd --- /dev/null +++ b/tests/st/python/config/predict_deepseek_r1_671b.yaml @@ -0,0 +1,121 @@ +seed: 0 +output_dir: './output' # path to save checkpoint/strategy +run_mode: 'predict' +use_parallel: True + +load_checkpoint: "/path/to/deepseekr1/model_ckpt" +load_ckpt_format: "safetensors" +auto_trans_ckpt: True # If true, auto transform load_checkpoint to load in distributed model + +# trainer config +trainer: + type: CausalLanguageModelingTrainer + model_name: 'DeepSeekR1' + +# default parallel of device num = 32 for Atlas 800T A2 +parallel_config: + model_parallel: 4 + pipeline_stage: 1 + expert_parallel: 1 + vocab_emb_dp: False + +# mindspore context init config +context: + mode: 0 # 0--Graph Mode; 1--Pynative Mode + max_device_memory: "61GB" + device_id: 0 + 
affinity_cpu_list: None + +kernel_launch_group: + thread_num: 4 + kernel_group_num: 16 + +# parallel context config +parallel: + parallel_mode: "STAND_ALONE" # use 'STAND_ALONE' mode for inference with parallelism in frontend + full_batch: False + strategy_ckpt_save_file: "./ckpt_strategy.ckpt" + +# model config +model: + model_config: + type: DeepseekV3Config + auto_register: deepseek3_config.DeepseekV3Config + batch_size: 1 # add for incre predict + seq_length: 4096 + hidden_size: 7168 + num_layers: 4 + num_heads: 128 + max_position_embeddings: 163840 + intermediate_size: 18432 + kv_lora_rank: 512 + q_lora_rank: 1536 + qk_rope_head_dim: 64 + v_head_dim: 128 + qk_nope_head_dim: 128 + vocab_size: 129280 + multiple_of: 256 + rms_norm_eps: 1.0e-6 + bos_token_id: 0 + eos_token_id: 1 + pad_token_id: 1 + ignore_token_id: -100 + compute_dtype: "bfloat16" + layernorm_compute_type: "bfloat16" + softmax_compute_type: "bfloat16" + rotary_dtype: "bfloat16" + router_dense_type: "bfloat16" + param_init_type: "bfloat16" + scaling_factor: + beta_fast: 32.0 + beta_slow: 1.0 + factor: 40.0 + mscale: 1.0 + mscale_all_dim: 1.0 + original_max_position_embeddings: 4096 + use_past: True + extend_method: "YARN" + use_flash_attention: True + block_size: 16 + num_blocks: 512 + offset: 0 + checkpoint_name_or_path: "" + repetition_penalty: 1 + max_decode_length: 1024 + top_k: 1 + top_p: 1 + theta: 10000.0 + do_sample: False + is_dynamic: True + qkv_concat: False + ffn_concat: True + auto_map: + AutoConfig: deepseek3_config.DeepseekV3Config + AutoModel: deepseek3.DeepseekV3ForCausalLM + arch: + type: DeepseekV3ForCausalLM + auto_register: deepseek3.DeepseekV3ForCausalLM + +moe_config: + expert_num: 256 + num_experts_chosen: 8 + routing_policy: "TopkRouterV2" + shared_expert_num: 1 + routed_scaling_factor: 2.5 + first_k_dense_replace: 3 + moe_intermediate_size: 2048 + topk_group: 4 + n_group: 8 + +processor: + return_tensors: ms + tokenizer: + unk_token: '' + bos_token: '<|begin▁of▁sentence|>' + eos_token: '<|end▁of▁sentence|>' + pad_token: '<|end▁of▁sentence|>' + type: LlamaTokenizerFast + vocab_file: '/path/to/deepseekr1/tokenizer.json' + tokenizer_file: '/path/to/deepseekr1/tokenizer.json' + chat_template: "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set ns = namespace(is_first=false, is_tool=false, is_output_first=true, system_prompt='', is_first_sp=true) %}{%- for message in messages %}{%- if message['role'] == 'system' %}{%- if ns.is_first_sp %}{% set ns.system_prompt = ns.system_prompt + message['content'] %}{% set ns.is_first_sp = false %}{%- else %}{% set ns.system_prompt = ns.system_prompt + '\n\n' + message['content'] %}{%- endif %}{%- endif %}{%- endfor %}{{bos_token}}{{ns.system_prompt}}{%- for message in messages %}{%- if message['role'] == 'user' %}{%- set ns.is_tool = false -%}{{'<|User|>' + message['content']}}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is none %}{%- set ns.is_tool = false -%}{%- for tool in message['tool_calls']%}{%- if not ns.is_first %}{{'<|Assistant|><|tool▁calls▁begin|><|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\n' + '```json' + '\n' + tool['function']['arguments'] + '\n' + '```' + '<|tool▁call▁end|>'}}{%- set ns.is_first = true -%}{%- else %}{{'\n' + '<|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\n' + '```json' + '\n' + tool['function']['arguments'] + '\n' + '```' + 
'<|tool▁call▁end|>'}}{{'<|tool▁calls▁end|><|end▁of▁sentence|>'}}{%- endif %}{%- endfor %}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is not none %}{%- if ns.is_tool %}{{'<|tool▁outputs▁end|>' + message['content'] + '<|end▁of▁sentence|>'}}{%- set ns.is_tool = false -%}{%- else %}{{'<|Assistant|>' + message['content'] + '<|end▁of▁sentence|>'}}{%- endif %}{%- endif %}{%- if message['role'] == 'tool' %}{%- set ns.is_tool = true -%}{%- if ns.is_output_first %}{{'<|tool▁outputs▁begin|><|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- set ns.is_output_first = false %}{%- else %}{{'\n<|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- endif %}{%- endif %}{%- endfor -%}{% if ns.is_tool %}{{'<|tool▁outputs▁end|>'}}{% endif %}{% if add_generation_prompt and not ns.is_tool %}{{'<|Assistant|>'}}{% endif %}" + type: LlamaProcessor diff --git a/tests/st/python/config/predict_deepseek_r1_671b_w8a8.yaml b/tests/st/python/config/predict_deepseek_r1_671b_w8a8.yaml new file mode 100644 index 0000000000000000000000000000000000000000..5a5e9d60aadc362af9e645cac551e9ae5cec1235 --- /dev/null +++ b/tests/st/python/config/predict_deepseek_r1_671b_w8a8.yaml @@ -0,0 +1,125 @@ +seed: 0 +output_dir: './output' # path to save checkpoint/strategy +run_mode: 'predict' +use_parallel: True + +load_checkpoint: "/path/to/deepseekr1/model_w8a8_ckpt" +load_ckpt_format: "safetensors" +auto_trans_ckpt: True # If true, auto transform load_checkpoint to load in distributed model + +# trainer config +trainer: + type: CausalLanguageModelingTrainer + model_name: 'DeepSeekR1-W8A8' + +# default parallel of device num = 16 for Atlas 800T A2 +parallel_config: + model_parallel: 16 + pipeline_stage: 1 + expert_parallel: 1 + vocab_emb_dp: False + +# mindspore context init config +context: + mode: 0 # 0--Graph Mode; 1--Pynative Mode + max_device_memory: "61GB" + device_id: 0 + affinity_cpu_list: None + +kernel_launch_group: + thread_num: 4 + kernel_group_num: 16 + +# parallel context config +parallel: + parallel_mode: "STAND_ALONE" # use 'STAND_ALONE' mode for inference with parallelism in frontend + full_batch: False + strategy_ckpt_save_file: "./ckpt_strategy.ckpt" + +# model config +model: + model_config: + type: DeepseekV3Config + auto_register: deepseek3_config.DeepseekV3Config + batch_size: 1 # add for incre predict + seq_length: 4096 + hidden_size: 7168 + num_layers: 4 + num_heads: 128 + max_position_embeddings: 163840 + intermediate_size: 18432 + kv_lora_rank: 512 + q_lora_rank: 1536 + qk_rope_head_dim: 64 + v_head_dim: 128 + qk_nope_head_dim: 128 + vocab_size: 129280 + multiple_of: 256 + rms_norm_eps: 1.0e-6 + bos_token_id: 0 + eos_token_id: 1 + pad_token_id: 1 + ignore_token_id: -100 + compute_dtype: "bfloat16" + layernorm_compute_type: "bfloat16" + softmax_compute_type: "bfloat16" + rotary_dtype: "bfloat16" + router_dense_type: "bfloat16" + param_init_type: "bfloat16" + scaling_factor: + beta_fast: 32.0 + beta_slow: 1.0 + factor: 40.0 + mscale: 1.0 + mscale_all_dim: 1.0 + original_max_position_embeddings: 4096 + use_past: True + extend_method: "YARN" + use_flash_attention: True + block_size: 16 + num_blocks: 512 + offset: 0 + checkpoint_name_or_path: "" + repetition_penalty: 1 + max_decode_length: 1024 + top_k: 1 + top_p: 1 + theta: 10000.0 + do_sample: False + is_dynamic: True + qkv_concat: False + ffn_concat: True + quantization_config: + quant_method: 'ptq' + weight_dtype: 'int8' + activation_dtype: 'int8' + auto_map: + AutoConfig: 
deepseek3_config.DeepseekV3Config + AutoModel: deepseek3.DeepseekV3ForCausalLM + arch: + type: DeepseekV3ForCausalLM + auto_register: deepseek3.DeepseekV3ForCausalLM + +moe_config: + expert_num: 256 + num_experts_chosen: 8 + routing_policy: "TopkRouterV2" + shared_expert_num: 1 + routed_scaling_factor: 2.5 + first_k_dense_replace: 3 + moe_intermediate_size: 2048 + topk_group: 4 + n_group: 8 + +processor: + return_tensors: ms + tokenizer: + unk_token: '' + bos_token: '<|begin▁of▁sentence|>' + eos_token: '<|end▁of▁sentence|>' + pad_token: '<|end▁of▁sentence|>' + type: LlamaTokenizerFast + vocab_file: '/path/to/deepseekr1/tokenizer.json' + tokenizer_file: '/path/to/deepseekr1/tokenizer.json' + chat_template: "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set ns = namespace(is_first=false, is_tool=false, is_output_first=true, system_prompt='', is_first_sp=true) %}{%- for message in messages %}{%- if message['role'] == 'system' %}{%- if ns.is_first_sp %}{% set ns.system_prompt = ns.system_prompt + message['content'] %}{% set ns.is_first_sp = false %}{%- else %}{% set ns.system_prompt = ns.system_prompt + '\n\n' + message['content'] %}{%- endif %}{%- endif %}{%- endfor %}{{bos_token}}{{ns.system_prompt}}{%- for message in messages %}{%- if message['role'] == 'user' %}{%- set ns.is_tool = false -%}{{'<|User|>' + message['content']}}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is none %}{%- set ns.is_tool = false -%}{%- for tool in message['tool_calls']%}{%- if not ns.is_first %}{{'<|Assistant|><|tool▁calls▁begin|><|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\n' + '```json' + '\n' + tool['function']['arguments'] + '\n' + '```' + '<|tool▁call▁end|>'}}{%- set ns.is_first = true -%}{%- else %}{{'\n' + '<|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\n' + '```json' + '\n' + tool['function']['arguments'] + '\n' + '```' + '<|tool▁call▁end|>'}}{{'<|tool▁calls▁end|><|end▁of▁sentence|>'}}{%- endif %}{%- endfor %}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is not none %}{%- if ns.is_tool %}{{'<|tool▁outputs▁end|>' + message['content'] + '<|end▁of▁sentence|>'}}{%- set ns.is_tool = false -%}{%- else %}{{'<|Assistant|>' + message['content'] + '<|end▁of▁sentence|>'}}{%- endif %}{%- endif %}{%- if message['role'] == 'tool' %}{%- set ns.is_tool = true -%}{%- if ns.is_output_first %}{{'<|tool▁outputs▁begin|><|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- set ns.is_output_first = false %}{%- else %}{{'\n<|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- endif %}{%- endif %}{%- endfor -%}{% if ns.is_tool %}{{'<|tool▁outputs▁end|>'}}{% endif %}{% if add_generation_prompt and not ns.is_tool %}{{'<|Assistant|>'}}{% endif %}" + type: LlamaProcessor diff --git a/tests/st/python/config/predict_deepseek_r1_671b_w8a8_smoothquant.yaml b/tests/st/python/config/predict_deepseek_r1_671b_w8a8_smoothquant.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f8984e0fde2c898cb54663180ede3495f3608e78 --- /dev/null +++ b/tests/st/python/config/predict_deepseek_r1_671b_w8a8_smoothquant.yaml @@ -0,0 +1,125 @@ +seed: 0 +output_dir: './output' # path to save checkpoint/strategy +run_mode: 'predict' +use_parallel: True + +load_checkpoint: "/path/to/deepseekr1/model_w8a8_smoothquant_ckpt" +load_ckpt_format: "safetensors" +auto_trans_ckpt: True # If true, auto transform load_checkpoint to load 
in distributed model + +# trainer config +trainer: + type: CausalLanguageModelingTrainer + model_name: 'DeepSeekR1-W8A8' + +# default parallel of device num = 16 for Atlas 800T A2 +parallel_config: + model_parallel: 16 + pipeline_stage: 1 + expert_parallel: 1 + vocab_emb_dp: False + +# mindspore context init config +context: + mode: 0 # 0--Graph Mode; 1--Pynative Mode + max_device_memory: "61GB" + device_id: 0 + affinity_cpu_list: None + +kernel_launch_group: + thread_num: 4 + kernel_group_num: 16 + +# parallel context config +parallel: + parallel_mode: "STAND_ALONE" # use 'STAND_ALONE' mode for inference with parallelism in frontend + full_batch: False + strategy_ckpt_save_file: "./ckpt_strategy.ckpt" + +# model config +model: + model_config: + type: DeepseekV3Config + auto_register: deepseek3_config.DeepseekV3Config + batch_size: 1 # add for incre predict + seq_length: 4096 + hidden_size: 7168 + num_layers: 4 + num_heads: 128 + max_position_embeddings: 163840 + intermediate_size: 18432 + kv_lora_rank: 512 + q_lora_rank: 1536 + qk_rope_head_dim: 64 + v_head_dim: 128 + qk_nope_head_dim: 128 + vocab_size: 129280 + multiple_of: 256 + rms_norm_eps: 1.0e-6 + bos_token_id: 0 + eos_token_id: 1 + pad_token_id: 1 + ignore_token_id: -100 + compute_dtype: "bfloat16" + layernorm_compute_type: "bfloat16" + softmax_compute_type: "bfloat16" + rotary_dtype: "bfloat16" + router_dense_type: "bfloat16" + param_init_type: "bfloat16" + scaling_factor: + beta_fast: 32.0 + beta_slow: 1.0 + factor: 40.0 + mscale: 1.0 + mscale_all_dim: 1.0 + original_max_position_embeddings: 4096 + use_past: True + extend_method: "YARN" + use_flash_attention: True + block_size: 16 + num_blocks: 512 + offset: 0 + checkpoint_name_or_path: "" + repetition_penalty: 1 + max_decode_length: 1024 + top_k: 1 + top_p: 1 + theta: 10000.0 + do_sample: False + is_dynamic: True + qkv_concat: True + ffn_concat: True + quantization_config: + quant_method: 'smoothquant' + weight_dtype: 'int8' + activation_dtype: 'int8' + auto_map: + AutoConfig: deepseek3_config.DeepseekV3Config + AutoModel: deepseek3.DeepseekV3ForCausalLM + arch: + type: DeepseekV3ForCausalLM + auto_register: deepseek3.DeepseekV3ForCausalLM + +moe_config: + expert_num: 256 + num_experts_chosen: 8 + routing_policy: "TopkRouterV2" + shared_expert_num: 1 + routed_scaling_factor: 2.5 + first_k_dense_replace: 3 + moe_intermediate_size: 2048 + topk_group: 4 + n_group: 8 + +processor: + return_tensors: ms + tokenizer: + unk_token: '' + bos_token: '<|begin▁of▁sentence|>' + eos_token: '<|end▁of▁sentence|>' + pad_token: '<|end▁of▁sentence|>' + type: LlamaTokenizerFast + vocab_file: '/path/to/deepseekr1/tokenizer.json' + tokenizer_file: '/path/to/deepseekr1/tokenizer.json' + chat_template: "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set ns = namespace(is_first=false, is_tool=false, is_output_first=true, system_prompt='', is_first_sp=true) %}{%- for message in messages %}{%- if message['role'] == 'system' %}{%- if ns.is_first_sp %}{% set ns.system_prompt = ns.system_prompt + message['content'] %}{% set ns.is_first_sp = false %}{%- else %}{% set ns.system_prompt = ns.system_prompt + '\n\n' + message['content'] %}{%- endif %}{%- endif %}{%- endfor %}{{bos_token}}{{ns.system_prompt}}{%- for message in messages %}{%- if message['role'] == 'user' %}{%- set ns.is_tool = false -%}{{'<|User|>' + message['content']}}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is none %}{%- set ns.is_tool = false -%}{%- for tool in 
message['tool_calls']%}{%- if not ns.is_first %}{{'<|Assistant|><|tool▁calls▁begin|><|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\n' + '```json' + '\n' + tool['function']['arguments'] + '\n' + '```' + '<|tool▁call▁end|>'}}{%- set ns.is_first = true -%}{%- else %}{{'\n' + '<|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\n' + '```json' + '\n' + tool['function']['arguments'] + '\n' + '```' + '<|tool▁call▁end|>'}}{{'<|tool▁calls▁end|><|end▁of▁sentence|>'}}{%- endif %}{%- endfor %}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is not none %}{%- if ns.is_tool %}{{'<|tool▁outputs▁end|>' + message['content'] + '<|end▁of▁sentence|>'}}{%- set ns.is_tool = false -%}{%- else %}{{'<|Assistant|>' + message['content'] + '<|end▁of▁sentence|>'}}{%- endif %}{%- endif %}{%- if message['role'] == 'tool' %}{%- set ns.is_tool = true -%}{%- if ns.is_output_first %}{{'<|tool▁outputs▁begin|><|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- set ns.is_output_first = false %}{%- else %}{{'\n<|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- endif %}{%- endif %}{%- endfor -%}{% if ns.is_tool %}{{'<|tool▁outputs▁end|>'}}{% endif %}{% if add_generation_prompt and not ns.is_tool %}{{'<|Assistant|>'}}{% endif %}" + type: LlamaProcessor diff --git a/tests/st/python/config/predict_qwen2_5_7b_instruct.yaml b/tests/st/python/config/predict_qwen2_5_7b_instruct.yaml new file mode 100644 index 0000000000000000000000000000000000000000..821e33f5d713b248adc4ff68ee942793b78e5315 --- /dev/null +++ b/tests/st/python/config/predict_qwen2_5_7b_instruct.yaml @@ -0,0 +1,126 @@ +seed: 0 +output_dir: './output' # path to save checkpoint/strategy +load_checkpoint: '' +src_strategy_path_or_dir: '' +auto_trans_ckpt: False # If true, auto transform load_checkpoint to load in distributed model +only_save_strategy: False +resume_training: False +use_parallel: False +run_mode: 'predict' + +# trainer config +trainer: + type: CausalLanguageModelingTrainer + model_name: 'qwen2_5_7b' + +# runner config +runner_config: + epochs: 5 + batch_size: 1 + sink_mode: True + sink_size: 2 +runner_wrapper: + type: MFTrainOneStepCell + scale_sense: + type: DynamicLossScaleUpdateCell + loss_scale_value: 65536 + scale_factor: 2 + scale_window: 1000 + use_clip_grad: True + +# default parallel of device num = 8 for Atlas 800T A2 +parallel_config: + data_parallel: 1 + model_parallel: 1 + pipeline_stage: 1 + micro_batch_num: 1 + vocab_emb_dp: False + gradient_aggregation_group: 4 +# when model parallel is greater than 1, we can set micro_batch_interleave_num=2, that may accelerate the train process. 
+micro_batch_interleave_num: 1 + +model: + model_config: + type: LlamaConfig + batch_size: 1 + seq_length: 32768 + hidden_size: 3584 + num_layers: 28 + num_heads: 28 + n_kv_heads: 4 + vocab_size: 152064 + intermediate_size: 18944 + max_position_embeddings: 32768 + qkv_has_bias: True + rms_norm_eps: 1.0e-6 + theta: 1000000.0 + emb_dropout_prob: 0.0 + eos_token_id: [151645,151643] + pad_token_id: 151643 + bos_token_id: 151643 + compute_dtype: "bfloat16" + layernorm_compute_type: "float32" + softmax_compute_type: "float32" + rotary_dtype: "bfloat16" + param_init_type: "bfloat16" + use_past: True + use_flash_attention: True + block_size: 32 + num_blocks: 1024 + use_past_shard: False + offset: 0 + checkpoint_name_or_path: "" + repetition_penalty: 1.05 + max_decode_length: 512 + top_k: 20 + top_p: 0.8 + temperature: 0.7 + do_sample: True + is_dynamic: True + qkv_concat: True + auto_map: + AutoTokenizer: [qwen2_5_tokenizer.Qwen2Tokenizer, null] + + arch: + type: LlamaForCausalLM + +processor: + return_tensors: ms + tokenizer: + model_max_length: 131072 + bos_token: null + eos_token: "<|im_end|>" + unk_token: null + pad_token: "<|endoftext|>" + vocab_file: "/path/to/vocab.json" + merges_file: "/path/to/merges.txt" + chat_template: "{%- if tools %}\n {{- '<|im_start|>system\\n' }}\n {%- if messages[0]['role'] == 'system' %}\n {{- messages[0]['content'] }}\n {%- else %}\n {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}\n {%- endif %}\n {{- \"\\n\\n# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within XML tags:\\n\" }}\n {%- for tool in tools %}\n {{- \"\\n\" }}\n {{- tool | tojson }}\n {%- endfor %}\n {{- \"\\n\\n\\nFor each function call, return a json object with function name and arguments within XML tags:\\n\\n{\\\"name\\\": , \\\"arguments\\\": }\\n<|im_end|>\\n\" }}\n{%- else %}\n {%- if messages[0]['role'] == 'system' %}\n {{- '<|im_start|>system\\n' + messages[0]['content'] + '<|im_end|>\\n' }}\n {%- else %}\n {{- '<|im_start|>system\\nYou are Qwen, created by Alibaba Cloud. 
You are a helpful assistant.<|im_end|>\\n' }}\n {%- endif %}\n{%- endif %}\n{%- for message in messages %}\n {%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) or (message.role == \"assistant\" and not message.tool_calls) %}\n {{- '<|im_start|>' + message.role + '\\n' + message.content + '<|im_end|>' + '\\n' }}\n {%- elif message.role == \"assistant\" %}\n {{- '<|im_start|>' + message.role }}\n {%- if message.content %}\n {{- '\\n' + message.content }}\n {%- endif %}\n {%- for tool_call in message.tool_calls %}\n {%- if tool_call.function is defined %}\n {%- set tool_call = tool_call.function %}\n {%- endif %}\n {{- '\\n\\n{\"name\": \"' }}\n {{- tool_call.name }}\n {{- '\", \"arguments\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- '}\\n' }}\n {%- endfor %}\n {{- '<|im_end|>\\n' }}\n {%- elif message.role == \"tool\" %}\n {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != \"tool\") %}\n {{- '<|im_start|>user' }}\n {%- endif %}\n {{- '\\n\\n' }}\n {{- message.content }}\n {{- '\\n' }}\n {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}\n {{- '<|im_end|>\\n' }}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|im_start|>assistant\\n' }}\n{%- endif %}\n" + type: Qwen2Tokenizer + type: Qwen2Processor + +# mindspore context init config +context: + mode: 0 #0--Graph Mode; 1--Pynative Mode + device_target: "Ascend" + ascend_config: + precision_mode: "must_keep_origin_dtype" + max_call_depth: 10000 + max_device_memory: "59GB" + save_graphs: False + save_graphs_path: "./graph" + device_id: 0 + +# parallel context config +parallel: + parallel_mode: 1 # 0-data parallel, 1-semi-auto parallel, 2-auto parallel, 3-hybrid parallel + gradients_mean: False + enable_alltoall: False + full_batch: True + search_mode: "sharding_propagation" + enable_parallel_optimizer: False + strategy_ckpt_config: + save_file: "./ckpt_strategy.ckpt" + only_trainable_params: False + parallel_optimizer_config: + gradient_accumulation_shard: False + parallel_optimizer_threshold: 64 diff --git a/tests/st/python/set_env.py b/tests/st/python/set_env.py new file mode 100644 index 0000000000000000000000000000000000000000..f39bd01999719536233caa4c0ded7dc318c85d6d --- /dev/null +++ b/tests/st/python/set_env.py @@ -0,0 +1,58 @@ +import os +import sys +from typing import Dict, Optional + +mindformers_path = "/home/jenkins/mindspore/testcases/testcases/tests/mindformers" + +if mindformers_path not in sys.path: + sys.path.insert(0, mindformers_path) + +current_pythonpath = os.environ.get("PYTHONPATH", "") +if current_pythonpath: + os.environ["PYTHONPATH"] = f"{mindformers_path}:{current_pythonpath}" +else: + os.environ["PYTHONPATH"] = mindformers_path + + +class EnvVarManager: + def __init__(self): + self._original_env: Dict[str, Optional[str]] = {} + self._managed_vars: Dict[str, str] = {} + + def set_env_var(self, var_name: str, value: str) -> None: + """设置环境变量并记录原始值(如果存在)""" + if var_name not in self._original_env: + # 保存原始值,即使它不存在(保存为None) + self._original_env[var_name] = os.environ.get(var_name) + + os.environ[var_name] = value + self._managed_vars[var_name] = value + + def unset_env_var(self, var_name: str) -> None: + """取消设置之前设置的环境变量,恢复原始值""" + if var_name not in self._original_env: + raise ValueError(f"Variable {var_name} was not set by this manager") + + original_value = self._original_env[var_name] + if original_value is not None: + os.environ[var_name] = original_value + else: + if var_name in os.environ: + del 
os.environ[var_name] + + del self._original_env[var_name] + del self._managed_vars[var_name] + + def unset_all(self) -> None: + """取消设置所有由该管理器设置的环境变量""" + for var_name in list(self._managed_vars.keys()): + self.unset_env_var(var_name) + + def get_managed_vars(self) -> Dict[str, str]: + """获取当前由该管理器管理的所有环境变量 """ + return self._managed_vars.copy() + + def setup_ai_environment(self, env_vars: Dict[str, str]) -> None: + """设置AI相关的环境变量,使用传入的参数""" + for var_name, value in env_vars.items(): + self.set_env_var(var_name, value) diff --git a/tests/st/python/test_sampler.py b/tests/st/python/test_sampler.py new file mode 100644 index 0000000000000000000000000000000000000000..9f8916ca4c870f1469e1a96d7ba363c50680d48d --- /dev/null +++ b/tests/st/python/test_sampler.py @@ -0,0 +1,777 @@ +#!/usr/bin/env python3 +# encoding: utf-8 +# Copyright 2025 Huawei Technologies Co., Ltd +# Copyright 2024 The vLLM team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +import vllm_mindspore +import itertools +import random +from dataclasses import dataclass +from typing import Dict, List, Optional, Tuple +from unittest.mock import Mock, patch +from mindspore import mint + +import pytest +import torch +from transformers import GenerationConfig, GenerationMixin + +import vllm.envs as envs + +from vllm_mindspore.model_executor.layers.sampler import Sampler +from vllm_mindspore.model_executor.sampling_metadata import SamplingMetadata +from vllm.model_executor.utils import set_random_seed +from vllm_mindspore.sequence import SamplingParams, SequenceData, SequenceGroupMetadata +from vllm.utils import Counter, is_pin_memory_available + +class MockLogitsSampler(Sampler): + + def __init__(self, fake_logits: torch.Tensor): + super().__init__() + self.fake_logits = fake_logits + + def forward(self, *args, **kwargs): + return super().forward(*args, **kwargs) + + +def _prepare_test( + batch_size: int +) -> Tuple[torch.Tensor, torch.Tensor, MockLogitsSampler]: + input_tensor = torch.rand((batch_size, 1024), dtype=torch.float16) + fake_logits = torch.full((batch_size, VOCAB_SIZE), + 1e-2, + dtype=input_tensor.dtype) + sampler = MockLogitsSampler(fake_logits) + return input_tensor, fake_logits, sampler + + +VOCAB_SIZE = 32000 +RANDOM_SEEDS = list(range(2)) +CUDA_DEVICES = ['cuda'] + + +def _do_sample( + batch_size: int, + input_tensor: torch.Tensor, + sampler: MockLogitsSampler, + sampling_params: SamplingParams, + device: str, +): + seq_group_metadata_list: List[SequenceGroupMetadata] = [] + seq_lens: List[int] = [] + for i in range(batch_size): + seq_group_metadata_list.append( + SequenceGroupMetadata( + request_id=f"test_{i}", + is_prompt=True, + seq_data={0: SequenceData.from_seqs([1, 2, 3])}, + sampling_params=sampling_params, + block_tables={0: [1]}, + )) + seq_lens.append(seq_group_metadata_list[-1].seq_data[0].get_len()) + + sampling_metadata = SamplingMetadata.prepare( + seq_group_metadata_list, + seq_lens, + query_lens=seq_lens, + device=device, + 
pin_memory=is_pin_memory_available()) + return sampler(logits=input_tensor, sampling_metadata=sampling_metadata) + +@pytest.mark.level0 +@pytest.mark.platform_arm_ascend910b_training +@pytest.mark.env_single +@pytest.mark.parametrize("seed", RANDOM_SEEDS) +@pytest.mark.parametrize("device", CUDA_DEVICES) +def test_sampler_all_greedy(seed: int, device: str): + set_random_seed(seed) + batch_size = random.randint(1, 256) + input_tensor, fake_logits, sampler = _prepare_test(batch_size) + + sampling_params = SamplingParams(temperature=0) + sampler_output = _do_sample(batch_size, fake_logits, sampler, + sampling_params, device) + expected = torch.argmax(fake_logits, dim=-1) + for i, sequence_output in enumerate(sampler_output): + for nth_output in sequence_output.samples: + assert nth_output.output_token == expected[i].item() + +@pytest.mark.level0 +@pytest.mark.platform_arm_ascend910b_training +@pytest.mark.env_single +@pytest.mark.parametrize("seed", RANDOM_SEEDS) +@pytest.mark.parametrize("device", CUDA_DEVICES) +def test_sampler_all_random(seed: int, device: str): + set_random_seed(seed) + batch_size = random.randint(1, 256) + _, fake_logits, sampler = _prepare_test(batch_size) + + for i in range(batch_size): + fake_logits[i, i] = 1e2 + + sampling_params = SamplingParams( + temperature=1.0, + n=random.randint(1, 10), + ) + sampler_output = _do_sample(batch_size, fake_logits, sampler, + sampling_params, device) + + for i, sequence_output in enumerate(sampler_output): + for nth_output in sequence_output.samples: + assert nth_output.output_token == i + +@pytest.mark.skip(reason="Not implemented yet") +@pytest.mark.parametrize("seed", RANDOM_SEEDS) +@pytest.mark.parametrize("device", CUDA_DEVICES) +def test_sampler_all_random_seed(seed: int, device: str): + set_random_seed(seed) + torch.set_default_device(device) + batch_size = random.randint(1, 256) + _, fake_logits, sampler = _prepare_test(batch_size) + + for i in range(batch_size): + fake_logits[i, i] = 1e2 + + sampling_params = SamplingParams( + temperature=1.0, + n=random.randint(1, 10), + seed=random.randint(0, 10000), + ) + sampler_output = _do_sample(batch_size, fake_logits, sampler, + sampling_params, device) + + for i, sequence_output in enumerate(sampler_output): + for nth_output in sequence_output.samples: + assert nth_output.output_token == i + +@pytest.mark.skip(reason="Not implemented yet") +@pytest.mark.parametrize("seed", RANDOM_SEEDS) +@pytest.mark.parametrize("device", CUDA_DEVICES) +def test_sampler_all_random_seed_deterministic(seed: int, device: str): + set_random_seed(seed) + torch.set_default_device(device) + batch_size = random.randint(1, 256) + _, fake_logits, sampler = _prepare_test(batch_size) + + sampling_params = SamplingParams( + temperature=1.0, + n=random.randint(1, 10), + seed=random.randint(0, 10000), + ) + first_sampler_output = _do_sample(batch_size, fake_logits, sampler, + sampling_params, device) + + second_sampler_output = _do_sample(batch_size, fake_logits, sampler, + sampling_params, device) + + assert first_sampler_output == second_sampler_output + +@pytest.mark.skip(reason="Not implemented yet") +@pytest.mark.parametrize("seed", RANDOM_SEEDS) +@pytest.mark.parametrize("device", CUDA_DEVICES) +def test_sampler_min_tokens_penalty(seed: int, device: str): + seq_id_counter = Counter(start=random.randint(0, 100)) + set_random_seed(seed) + torch.set_default_device(device) + + def create_sampling_params(min_tokens, + eos_token_id=0, + *, + stop_token_ids: Optional[List[int]] = None, + prompt_logprobs: 
Optional[int] = None): + sampling_params = SamplingParams( + min_tokens=min_tokens, + max_tokens=9999, # keep higher than max of min_tokens + stop_token_ids=stop_token_ids, + # requesting prompt_logprobs changes the structure of `logits` + prompt_logprobs=prompt_logprobs, + ) + sampling_params.all_stop_token_ids.add(eos_token_id) + return sampling_params + + def create_sequence_data(num_input=3, num_generated=0): + seq_data = SequenceData.from_seqs( + random.choices(range(0, VOCAB_SIZE), k=num_input)) + if num_generated > 0: + seq_data.output_token_ids = random.choices(range(0, VOCAB_SIZE), + k=num_generated) + return seq_data + + def generate_test_case(): + # generate multiple seq groups but limit total batch size + batch_size = random.randint(1, 128) + + expected_penalization = [] + sequence_metadata_list: List[SequenceGroupMetadata] = [] + # 20% chance to generate seq group metadata list with all prompts + is_prompt = random.random() < 0.2 + while batch_size > 0: + num_seqs = 1 if is_prompt else random.randint(1, batch_size) + + eos_token_id = random.randint(0, VOCAB_SIZE - 1) + min_tokens = random.randint(0, 50) + num_stop_tokens = random.randint(0, 8) + if num_stop_tokens > 0: + stop_token_ids = random.choices(range(0, VOCAB_SIZE - 1), + k=num_stop_tokens) + else: + stop_token_ids = None + + sampling_params = create_sampling_params( + min_tokens=min_tokens, + eos_token_id=eos_token_id, + stop_token_ids=stop_token_ids) + + seq_data: Dict[int, SequenceData] = {} + seq_group_penalization: List[bool] = [] + for _ in range(num_seqs): + num_input = random.randint(1, 100) + num_generated = 0 if is_prompt else random.randint(1, 100) + seq_data[next(seq_id_counter)] = create_sequence_data( + num_input=num_input, num_generated=num_generated) + seq_group_penalization.append(num_generated < min_tokens) + + expected_penalization.extend(seq_group_penalization) + sequence_metadata_list.append( + SequenceGroupMetadata( + request_id=f"test_{batch_size}", + is_prompt=is_prompt, + seq_data=seq_data, + sampling_params=sampling_params, + block_tables={}, + )) + batch_size -= num_seqs + + return { + "expected_penalization": expected_penalization, + "seq_group_metadata_list": sequence_metadata_list, + } + + # define some explicit test cases for edge case behavior + prompt_without_penalization = { + "expected_penalization": [False], + "seq_group_metadata_list": [ + SequenceGroupMetadata( + request_id="test_1", + is_prompt=True, + seq_data={ + next(seq_id_counter): create_sequence_data(), + }, + sampling_params=create_sampling_params(0), + block_tables={}, + ), + ] + } + + prompt_with_penalization = { + "expected_penalization": [True], + "seq_group_metadata_list": [ + SequenceGroupMetadata( + request_id="test_1", + is_prompt=True, + seq_data={ + next(seq_id_counter): create_sequence_data(), + }, + sampling_params=create_sampling_params(1), + block_tables={}, + ), + ] + } + + prompt_with_penalization_and_prompt_logprobs = { + "expected_penalization": [False, False, True], + "seq_group_metadata_list": [ + SequenceGroupMetadata( + request_id="test_1", + is_prompt=True, + seq_data={ + next(seq_id_counter): create_sequence_data(num_input=3), + }, + sampling_params=create_sampling_params(1, prompt_logprobs=3), + block_tables={}, + ), + ] + } + + stop_penalizing_after_min_tokens = { + "expected_penalization": [False], + "seq_group_metadata_list": [ + SequenceGroupMetadata( + request_id="test_1", + is_prompt=False, + seq_data={ + next(seq_id_counter): + create_sequence_data(num_generated=1), + }, + 
sampling_params=create_sampling_params(1), + block_tables={}, + ) + ] + } + + stop_token_ids = [42, 99, 42, 0] # intentional duplication + prompt_combination = { + "expected_penalization": [False, True, False], + "seq_group_metadata_list": [ + SequenceGroupMetadata( + request_id="test_2", + is_prompt=True, + seq_data={ + next(seq_id_counter): create_sequence_data(num_input=2), + }, + sampling_params=create_sampling_params(1, prompt_logprobs=3), + block_tables={}, + ), + SequenceGroupMetadata( + request_id="test_3", + is_prompt=True, + seq_data={ + next(seq_id_counter): create_sequence_data(), + }, + sampling_params=create_sampling_params( + 0, stop_token_ids=stop_token_ids), + block_tables={}, + ) + ] + } + + stop_token_ids = [1, 999, 37, 37] # intentional duplication + decode_combination = { + "expected_penalization": [True, False, False, True, False], + "seq_group_metadata_list": [ + SequenceGroupMetadata( + request_id="test_1", + is_prompt=False, + seq_data={ + next(seq_id_counter): + create_sequence_data(num_generated=1), + next(seq_id_counter): + create_sequence_data(num_generated=100), + }, + sampling_params=create_sampling_params( + 2, stop_token_ids=stop_token_ids), + block_tables={}, + ), + SequenceGroupMetadata( + request_id="test_2", + is_prompt=False, + seq_data={ + next(seq_id_counter): + create_sequence_data(num_generated=20), + next(seq_id_counter): + create_sequence_data(num_generated=1), + next(seq_id_counter): + create_sequence_data(num_generated=10), + }, + sampling_params=create_sampling_params( + 10, prompt_logprobs=5, stop_token_ids=stop_token_ids), + block_tables={}, + ), + ] + } + + if seed == 0: + test_cases = [ + prompt_without_penalization, + prompt_with_penalization, + prompt_with_penalization_and_prompt_logprobs, + stop_penalizing_after_min_tokens, + prompt_combination, + decode_combination, + ] + else: + test_cases = [generate_test_case()] + + def run_test_case(*, expected_penalization: List[bool], + seq_group_metadata_list: List[SequenceGroupMetadata]): + assert expected_penalization, \ + "Invalid test case, need expected_penalization" + assert seq_group_metadata_list, \ + "Invalid test case, need seq_group_metadata_list" + + batch_size = 0 + seq_lens: List[int] = [] + sampling_params_per_row: List[SamplingParams] = [] + for sgm in seq_group_metadata_list: + sampling_params = sgm.sampling_params + + num_rows = len(sgm.seq_data) + if sgm.is_prompt: + # a prompt seq_group has only one sequence + seq_data = next(iter(sgm.seq_data.values())) + prompt_len = seq_data.get_prompt_len() + seq_lens.append(prompt_len) + + assert sgm.sampling_params is not None + if sgm.sampling_params.prompt_logprobs: + # with prompt_logprobs each token in the prompt has a row in + # logits + num_rows = prompt_len + + batch_size += num_rows + sampling_params_per_row.extend( + itertools.repeat(sampling_params, num_rows)) + + assert len( + expected_penalization + ) == batch_size, \ + ("Invalid test case, expected_penalization does not match computed" + "batch size") + + _, fake_logits, sampler = _prepare_test(batch_size) + sampling_metadata = SamplingMetadata.prepare( + seq_group_metadata_list, + seq_lens=seq_lens if seq_lens else None, + query_lens=seq_lens if seq_lens else [1] * batch_size, + device=device, + pin_memory=is_pin_memory_available()) + # the logits tensor is modified in-place by the sampler + _ = sampler(logits=fake_logits, sampling_metadata=sampling_metadata) + + for logits_idx, (should_penalize, sampling_params) in enumerate( + zip(expected_penalization, 
sampling_params_per_row)): + + tokens_to_check = sampling_params.all_stop_token_ids + + if should_penalize: + for token_id in tokens_to_check: + assert fake_logits[logits_idx, token_id] == -float( + 'inf' + ), f"Expected token {token_id} for logits row {logits_idx}" + " to be penalized" + # no other tokens should be set to -inf + assert torch.count_nonzero( + fake_logits[logits_idx, :] == -float('inf')) == len( + tokens_to_check + ), f"Expected only {len(tokens_to_check)} to be penalized" + else: + # no tokens should be set to -inf + assert torch.count_nonzero( + fake_logits[logits_idx, :] == + -float('inf')) == 0, "No tokens should have been penalized" + + for test_case in test_cases: + run_test_case(**test_case) + +@pytest.mark.skip(reason="Not implemented yet") +@pytest.mark.parametrize("seed", RANDOM_SEEDS) +@pytest.mark.parametrize("device", CUDA_DEVICES) +def test_sampler_mixed(seed: int, device: str): + set_random_seed(seed) + torch.set_default_device(device) + batch_size = random.randint(1, 256) + input_tensor, fake_logits, sampler = _prepare_test(batch_size) + + seq_group_metadata_list: List[SequenceGroupMetadata] = [] + expected_tokens: List[Optional[List[int]]] = [] + seq_lens: List[int] = [] + for i in range(batch_size): + expected: Optional[List[int]] = None + sampling_type = random.randint(0, 2) + if sampling_type == 0: + sampling_params = SamplingParams(temperature=0) + expected = [int(torch.argmax(fake_logits[i], dim=-1).item())] + elif sampling_type in (1, 2): + n = random.randint(1, 10) + sampling_params = SamplingParams( + temperature=random.random() + 0.1, + top_p=min(random.random() + 0.1, 1), + top_k=random.randint(0, 10) or -1, + n=n, + presence_penalty=random.randint(0, 1), + ) + if sampling_type == 2: + sampling_params.seed = random.randint(0, 10000) + else: + for idx in range(n): + fake_logits[i, i + idx] = 1e2 + expected = list(range(i, i + n)) + + expected_tokens.append(expected) + seq_group_metadata_list.append( + SequenceGroupMetadata( + request_id=f"test_{i}", + is_prompt=True, + seq_data={0: SequenceData.from_seqs([1, 2, 3])}, + sampling_params=sampling_params, + block_tables={0: [1]}, + )) + seq_lens.append(seq_group_metadata_list[-1].seq_data[0].get_len()) + + generators: Dict[str, torch.Generator] = {} + + def test_sampling(): + sampling_metadata = SamplingMetadata.prepare( + seq_group_metadata_list, + seq_lens, + query_lens=seq_lens, + device=device, + pin_memory=is_pin_memory_available(), + generators=generators) + sampler_output = sampler(logits=fake_logits, + sampling_metadata=sampling_metadata) + + for i, (sequence_output, metadata) in enumerate( + zip(sampler_output, seq_group_metadata_list)): + assert metadata.sampling_params is not None + + if (metadata.sampling_params.seed is not None + and expected_tokens[i] is None): + # Record seeded random result to compare with results of + # second invocation + expected_tokens[i] = [ + nth_output.output_token + for nth_output in sequence_output.samples + ] + continue + + expected_tokens_item = expected_tokens[i] + assert expected_tokens_item is not None + + for n, nth_output in enumerate(sequence_output.samples): + assert metadata.sampling_params is not None + + if (metadata.sampling_params.temperature == 0 + or metadata.sampling_params.seed is not None): + # Ensure exact matches for greedy or random with seed + assert nth_output.output_token == expected_tokens_item[n] + else: + # For non-seeded random check that one of the high-logit + # tokens were chosen + assert nth_output.output_token in 
expected_tokens_item + + # Test batch + test_sampling() + + # Shuffle the batch and resample + target_index = list(range(batch_size)) + for list_to_shuffle in (target_index, seq_group_metadata_list, + expected_tokens, seq_lens): + random.Random(seed).shuffle(list_to_shuffle) + target_index = torch.tensor(target_index) + input_tensor.data = input_tensor.index_select(0, target_index) + fake_logits.data = fake_logits.index_select(0, target_index) + + # This time, results of seeded random samples will be compared with + # the corresponding sample in the pre-shuffled batch + test_sampling() + +@pytest.mark.skip(reason="Not implemented yet") +@pytest.mark.parametrize("seed", RANDOM_SEEDS) +@pytest.mark.parametrize("device", CUDA_DEVICES) +def test_sampler_top_k_top_p(seed: int, device: str): + set_random_seed(seed) + batch_size = random.randint(1, 256) + top_k = random.randint(100, 500) + top_p = random.random() * 0.1 + vocab_size = 32000 + input_tensor = torch.rand((batch_size, 1024), + device=device, + dtype=torch.float16) + fake_logits = torch.normal(0, + 5, + size=(batch_size, vocab_size), + device=input_tensor.device, + dtype=input_tensor.dtype) + sampler = MockLogitsSampler(fake_logits) + + generation_model = GenerationMixin() + generation_config = GenerationConfig(top_k=top_k, + top_p=top_p, + do_sample=True) + + @dataclass + class MockConfig: + is_encoder_decoder: bool = False + + generation_model.config = MockConfig() # needed by the following method + generation_model._prepare_special_tokens(generation_config, device=device) + processors = generation_model._get_logits_processor(generation_config, + None, + None, + None, [], + device=device) + assert len(processors) == 2 # top_p and top_k + + seq_group_metadata_list: List[SequenceGroupMetadata] = [] + seq_lens: List[int] = [] + for i in range(batch_size): + seq_group_metadata_list.append( + SequenceGroupMetadata( + request_id=f"test_{i}", + is_prompt=True, + seq_data={0: SequenceData.from_seqs([1, 2, 3])}, + sampling_params=SamplingParams( + temperature=1, + top_k=top_k, + top_p=top_p, + ), + block_tables={0: [1]}, + )) + seq_lens.append(seq_group_metadata_list[-1].seq_data[0].get_len()) + + sampling_metadata = SamplingMetadata.prepare( + seq_group_metadata_list, + seq_lens, + query_lens=seq_lens, + device=device, + pin_memory=is_pin_memory_available()) + + sample_probs = None + + def mock_sample(probs, *args, **kwargs): + nonlocal sample_probs + sample_probs = probs + return ([[prob.topk(1, dim=-1).indices.tolist(), [0]] + for prob in probs], None) + + # top-k and top-p is only calculated when flashinfer kernel is not available + with patch("vllm.model_executor.layers.sampler._sample", mock_sample), \ + patch("vllm.model_executor.layers.sampler." 
+ "flashinfer_top_k_top_p_sampling", None): + sampler(logits=fake_logits, sampling_metadata=sampling_metadata) + + assert sample_probs is not None + + hf_probs = processors(torch.zeros_like(fake_logits), fake_logits.clone()) + hf_probs = torch.softmax(hf_probs, dim=-1, dtype=torch.float) + torch.testing.assert_close(hf_probs, sample_probs, rtol=0.0, atol=1e-5) + assert torch.equal(hf_probs.eq(0), sample_probs.eq(0)) + +@pytest.mark.skip(reason="Not implemented yet") +@pytest.mark.parametrize("seed", RANDOM_SEEDS) +@pytest.mark.parametrize("device", CUDA_DEVICES) +def test_flashinfer_fallback(seed: int, device: str): + if not envs.VLLM_USE_FLASHINFER_SAMPLER: + pytest.skip("Flashinfer sampler is disabled") + + set_random_seed(seed) + torch.set_default_device(device) + batch_size = random.randint(1, 256) + _, fake_logits, sampler = _prepare_test(batch_size) + + def failing_flashinfer_sampling(*_args, **_kwargs): + return None, torch.zeros(batch_size, device=device, dtype=torch.int32) + + sampling_params = SamplingParams( + temperature=1.0, + n=random.randint(1, 10), + seed=random.randint(0, 10000), + ) + sampler_output = _do_sample(batch_size, fake_logits, sampler, + sampling_params, device) + + with patch( + "vllm.model_executor.layers.sampler." + "flashinfer_top_k_top_p_sampling", failing_flashinfer_sampling): + fallback_sampler_output = _do_sample(batch_size, fake_logits, sampler, + sampling_params, device) + + assert sampler_output == fallback_sampler_output + +@pytest.mark.level0 +@pytest.mark.platform_arm_ascend910b_training +@pytest.mark.env_single +@pytest.mark.parametrize("device", CUDA_DEVICES) +def test_sampler_repetition_penalty_mixed(device: str): + + vocab_size = 8 + + def test_sampling_params(sampling_params: List[SamplingParams]): + + seq_group_metadata_list: List[SequenceGroupMetadata] = [] + seq_lens: List[int] = [] + for i in range(2): + seq_group_metadata_list.append( + SequenceGroupMetadata( + request_id=f"test_{i}", + is_prompt=True, + seq_data={0: SequenceData.from_seqs([1, 2, 3])}, + sampling_params=sampling_params[i], + block_tables={0: [1]}, + )) + seq_lens.append(seq_group_metadata_list[-1].seq_data[0].get_len()) + + sampling_metadata = SamplingMetadata.prepare( + seq_group_metadata_list, + seq_lens, + query_lens=seq_lens, + device=device, + pin_memory=is_pin_memory_available()) + + fake_logits = torch.full((2, vocab_size), + 1e-2, + device=device, + dtype=torch.float16) + + fake_logits[:, 5] = 1.1e-2 + fake_logits[:, 1] = 1.2e-2 + + sampler = MockLogitsSampler(fake_logits) + + sampler_output = sampler(logits=fake_logits, + sampling_metadata=sampling_metadata) + + generated_tokens = [] + for output in sampler_output: + generated_tokens.append(output.samples[0].output_token) + + return generated_tokens + + # one configuration is greedy with repetition_penalty + sampling_params_rep = SamplingParams( + temperature=0.0, + repetition_penalty=2.0, + ) + + # other configuration is sampling w/o repetition_penalty + sampling_params_sample = SamplingParams( + temperature=1.0, + top_k=1, + seed=42, + ) + + tokens1 = test_sampling_params( + [sampling_params_rep, sampling_params_sample]) + + tokens2 = test_sampling_params( + [sampling_params_sample, sampling_params_rep]) + + assert tokens1[0] == tokens2[1] + assert tokens1[1] == tokens2[0] + +@pytest.mark.skip(reason="Not implemented yet") +@pytest.mark.parametrize("device", CUDA_DEVICES) +def test_sampler_include_gpu_probs_tensor(device: str): + set_random_seed(42) + torch.set_default_device(device) + batch_size = 
random.randint(1, 256) + _, fake_logits, sampler = _prepare_test(batch_size) + sampler.include_gpu_probs_tensor = True + sampler.should_modify_greedy_probs_inplace = False + + sampling_params = SamplingParams(temperature=0) + + mock_inplace = Mock() + with patch( + "vllm.model_executor.layers.sampler._modify_greedy_probs_inplace", + mock_inplace): + + sampler_output = _do_sample(batch_size, fake_logits, sampler, + sampling_params, device) + mock_inplace.assert_not_called() + + assert sampler_output.sampled_token_probs is not None + assert sampler_output.logprobs is not None + assert sampler_output.sampled_token_ids is not None diff --git a/tests/st/python/test_shm_broadcast.py b/tests/st/python/test_shm_broadcast.py new file mode 100644 index 0000000000000000000000000000000000000000..cfc328810fdf318feadb30cffa735e8be105892f --- /dev/null +++ b/tests/st/python/test_shm_broadcast.py @@ -0,0 +1,140 @@ +#!/usr/bin/env python3 +# encoding: utf-8 +# Copyright 2025 Huawei Technologies Co., Ltd +# Copyright 2024 The vLLM team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +"""test cpu communicator and share memory""" +import pytest +import multiprocessing +import random +import time +from typing import List + +import numpy as np +import torch.distributed as dist + +import vllm_mindspore + +from vllm.distributed.device_communicators.shm_broadcast import MessageQueue +from vllm.distributed.utils import StatelessProcessGroup +from vllm.utils import get_ip, get_open_port, update_environment_variables, get_distributed_init_method + + +def get_arrays(n: int, seed: int = 0) -> List[np.ndarray]: + np.random.seed(seed) + sizes = np.random.randint(1, 10_000, n) + # on average, each array will have 5k elements + # with int64, each array will have 40kb + return [np.random.randint(1, 100, i) for i in sizes] + + +def distributed_run(fn, world_size): + number_of_processes = world_size + processes = [] + + port = get_open_port() + distributed_init_method = get_distributed_init_method("127.0.0.1", port) + + for i in range(number_of_processes): + p = multiprocessing.Process(target=fn, args=(distributed_init_method, i, world_size)) + processes.append(p) + p.start() + + for p in processes: + p.join() + + for p in processes: + assert p.exitcode == 0 + + +def worker_fn_wrapper(fn): + # `multiprocessing.Process` cannot accept environment variables directly + # so we need to pass the environment variables as arguments + # and update the environment variables in the function + def wrapped_fn(distributed_init_method, rank, world_size): + dist.init_process_group( + backend="nccl", + init_method=distributed_init_method, + rank=rank, + world_size=world_size, + ) + fn() + + return wrapped_fn + + +@worker_fn_wrapper +def worker_fn(): + + rank = dist.get_rank() + if rank == 0: + port = get_open_port() + ip = get_ip() + dist.broadcast_object_list([ip, port], src=0) + else: + recv = [None, None] + dist.broadcast_object_list(recv, src=0) + ip, port = recv 
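
For orientation, the two helpers defined above (`distributed_run` and `worker_fn_wrapper`) compose as sketched below. The worker body is a made-up stand-in, not part of this test; it assumes both helpers from this file are in scope and that the wrapper's `torch.distributed` backend is available on the machine.

```python
import torch.distributed as dist

# Hypothetical worker, analogous to worker_fn in this file: the
# @worker_fn_wrapper decorator runs it in each spawned process after
# dist.init_process_group() has already been called.
@worker_fn_wrapper
def _rank_echo_fn():
    print(f"rank {dist.get_rank()} of {dist.get_world_size()} is ready")

if __name__ == "__main__":
    # Forks 4 processes, runs _rank_echo_fn in each, joins them, and asserts
    # that every worker exited cleanly, which is how test_shm_broadcast below
    # drives worker_fn.
    distributed_run(_rank_echo_fn, 4)
```
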
+ + stateless_pg = dist.new_group([0,1,2,3], backend="gloo") + + for pg in [dist.group.WORLD, stateless_pg]: + + writer_rank = 2 + broadcaster = MessageQueue.create_from_process_group( + pg, 40 * 1024, 2, writer_rank) + if rank == writer_rank: + seed = random.randint(0, 1000) + dist.broadcast_object_list([seed], writer_rank) + else: + recv = [None] + dist.broadcast_object_list(recv, writer_rank) + seed = recv[0] # type: ignore + + if pg == dist.group.WORLD: + dist.barrier() + else: + dist.barrier(group=pg) + + # in case we find a race condition + # print the seed so that we can reproduce the error + print(f"Rank {rank} got seed {seed}") + # test broadcasting with about 400MB of data + N = 10_000 + if rank == writer_rank: + arrs = get_arrays(N, seed) + for x in arrs: + broadcaster.broadcast_object(x) + time.sleep(random.random() / 1000) + else: + arrs = get_arrays(N, seed) + for x in arrs: + y = broadcaster.broadcast_object(None) + assert np.array_equal(x, y) + time.sleep(random.random() / 1000) + + if pg == dist.group.WORLD: + dist.barrier() + print("torch distributed passed the test!") + else: + dist.barrier(group=pg) + print("StatelessProcessGroup passed the test!") + + +@pytest.mark.level0 +@pytest.mark.platform_arm_ascend910b_training +@pytest.mark.env_single +def test_shm_broadcast(): + distributed_run(worker_fn, 4) diff --git a/tests/st/python/test_vllm_deepseek_bf16_part.py b/tests/st/python/test_vllm_deepseek_bf16_part.py new file mode 100644 index 0000000000000000000000000000000000000000..c19dd14a66e82fa30ea302723c12497d0b191652 --- /dev/null +++ b/tests/st/python/test_vllm_deepseek_bf16_part.py @@ -0,0 +1,76 @@ +# Copyright 2024 The vLLM team. +# Copyright 2024 Microsoft and the HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://wwww.apache.org/licenses/LICENSE-2.0 +# +# Unless required by application law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +"""test mf deepseek r1.""" +import pytest +import os +from . import set_env +env_manager = set_env.EnvVarManager() +# def env +env_vars = { + "MINDFORMERS_MODEL_CONFIG": "./config/predict_deepseek_r1_671b.yaml", + "ASCEND_CUSTOM_PATH": os.path.expandvars("$ASCEND_HOME_PATH/../"), + "vLLM_MODEL_BACKEND": "MindFormers", + "MS_ENABLE_LCCL": "off", + "HCCL_OP_EXPANSION_MODE": "AIV", + "ASCEND_RT_VISIBLE_DEVICES": "0,1,2,3,4,5,6,7", + "MS_ALLOC_CONF": "enable_vmm:True", + "LCCL_DETERMINISTIC": "1", + "HCCL_DETERMINISTIC": "true", + "ATB_MATMUL_SHUFFLE_K_ENABLE": "0", + "ATB_LLM_LCOC_ENABLE": "0" +} +# set env +env_manager.setup_ai_environment(env_vars) +import vllm_mindspore +from vllm import LLM, SamplingParams + +class TestDeepSeek: + """ + Test Deepseek. + """ + + @pytest.mark.level0 + @pytest.mark.platform_arm_ascend910b_training + @pytest.mark.env_single + def test_deepseek_r1_bf16(self): + """ + test case deepseek r1 bf16 + """ + + # Sample prompts. + prompts = [ + "You are a helpful assistant.<|User|>将文本分类为中性、负面或正面。 \n文本:我认为这次假期还可以。 \n情感:<|Assistant|>\n", + ] + + # Create a sampling params object. 
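
All of the model tests in this patch drive `EnvVarManager` from `tests/st/python/set_env.py` (shown earlier) in the same way: set the backend variables, import `vllm_mindspore`, run the test, then undo the changes. A condensed sketch of that lifecycle follows; the config path and variable values are placeholders, and the import layout is an assumption based on the test package used here.

```python
from tests.st.python import set_env  # EnvVarManager added by this patch

env_manager = set_env.EnvVarManager()

# Everything the backend reads at import time must be set first ...
env_manager.setup_ai_environment({
    "MINDFORMERS_MODEL_CONFIG": "./config/some_model.yaml",  # placeholder
    "ASCEND_RT_VISIBLE_DEVICES": "0,1",
})

# ... because vllm_mindspore is only imported afterwards.
import vllm_mindspore  # noqa: E402,F401

try:
    pass  # the actual test body goes here
finally:
    # Undo every variable this manager touched so later tests start clean.
    env_manager.unset_all()
```
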
+ sampling_params = SamplingParams(temperature=0.0, max_tokens=10, top_k=1) + + # Create an LLM. + llm = LLM(model="/home/workspace/mindspore_dataset/weight/DeepSeek-R1-bf16", + trust_remote_code=True, gpu_memory_utilization=0.9, tensor_parallel_size=8) + # Generate texts from the prompts. The output is a list of RequestOutput objects + # that contain the prompt, generated text, and other information. + outputs = llm.generate(prompts, sampling_params) + except_list=['ugs611ాలు sic辨hara的开璞 SquaresInsp'] + # Print the outputs. + for i, output in enumerate(outputs): + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") + assert generated_text == except_list[i] + + # unset env + env_manager.unset_all() diff --git a/tests/st/python/test_vllm_deepseek_part.py b/tests/st/python/test_vllm_deepseek_part.py new file mode 100644 index 0000000000000000000000000000000000000000..8cb7fe8429450b98b1dbdbff361e3b22fd7cda47 --- /dev/null +++ b/tests/st/python/test_vllm_deepseek_part.py @@ -0,0 +1,119 @@ +#!/usr/bin/env python3 +# encoding: utf-8 +# Copyright 2025 Huawei Technologies Co., Ltd +# Copyright 2024 The vLLM team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +"""test mf deepseek r1.""" +import pytest +import os +from . import set_env +env_manager = set_env.EnvVarManager() +# def env +env_vars = { + "MINDFORMERS_MODEL_CONFIG": "./config/predict_deepseek_r1_671b_w8a8.yaml", + "ASCEND_CUSTOM_PATH": os.path.expandvars("$ASCEND_HOME_PATH/../"), + "vLLM_MODEL_BACKEND": "MindFormers", + "MS_ENABLE_LCCL": "off", + "HCCL_OP_EXPANSION_MODE": "AIV", + "ASCEND_RT_VISIBLE_DEVICES": "0,1,2,3,4,5,6,7", + "MS_ALLOC_CONF": "enable_vmm:True", + "LCCL_DETERMINISTIC": "1", + "HCCL_DETERMINISTIC": "true", + "ATB_MATMUL_SHUFFLE_K_ENABLE": "0", + "ATB_LLM_LCOC_ENABLE": "0" +} +# set env +env_manager.setup_ai_environment(env_vars) +import vllm_mindspore +from vllm import LLM, SamplingParams + +class TestDeepSeek: + """ + Test Deepseek. + """ + + @pytest.mark.level0 + @pytest.mark.platform_arm_ascend910b_training + @pytest.mark.env_single + def test_deepseek_r1(self): + """ + test case deepseek r1 w8a8 + """ + + # Sample prompts. + prompts = [ + "You are a helpful assistant.<|User|>将文本分类为中性、负面或正面。 \n文本:我认为这次假期还可以。 \n情感:<|Assistant|>\n", + ] + + # Create a sampling params object. + sampling_params = SamplingParams(temperature=0.0, max_tokens=10, top_k=1) + + # Create an LLM. + llm = LLM(model="/home/workspace/mindspore_dataset/weight/DeepSeek-R1-W8A8", + trust_remote_code=True, gpu_memory_utilization=0.9, tensor_parallel_size=8) + # Generate texts from the prompts. The output is a list of RequestOutput objects + # that contain the prompt, generated text, and other information. + outputs = llm.generate(prompts, sampling_params) + except_list=['ugs611ాలు哒ాలు mahassisemaSTE的道德', 'ugs611ాలు哒ాలు mah战区rollerOVERlaid'] + # Print the outputs. 
+ for i, output in enumerate(outputs): + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") + assert generated_text in except_list + + # unset env + env_manager.unset_all() + + +class TestDeepSeekMTP: + """ + Test DeepseekMTP. + 大模型用量化(4层),mtp模型用浮点(1层,layer 61)。 + mtp的权重加载默认从配置的num_hidden_layer开始,为了支持减层推理场景mtp权重加载,ci服务器上修改了浮点的权重map文件的layer为4。 + """ + @pytest.mark.level0 + @pytest.mark.platform_arm_ascend910b_training + @pytest.mark.env_single + def test_deepseek_mtp(self): + """ + test case deepseek mtp with main model of r1-w8a8 + """ + + # Sample prompts. + prompts = [ + "You are a helpful assistant.<|User|>将文本分类为中性、负面或正面。 \n文本:我认为这次假期还可以。 \n情感:<|Assistant|>\n", + ] + + # Create a sampling params object. + sampling_params = SamplingParams(temperature=0.0, max_tokens=10, top_k=1) + + # Create an LLM. + llm = LLM(model="/home/workspace/mindspore_dataset/weight/DeepSeek-R1-MTP", + trust_remote_code=True, gpu_memory_utilization=0.8, tensor_parallel_size=8, + num_speculative_tokens=1) + # Generate texts from the prompts. The output is a list of RequestOutput objects + # that contain the prompt, generated text, and other information. + outputs = llm.generate(prompts, sampling_params) + except_list = ['ugs611ాలు哒ాలు mahassisemaSTE的道德', 'ugs611ాలు哒ాలు mah战区rollerOVERlaid'] + # Print the outputs. + for i, output in enumerate(outputs): + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") + assert generated_text in except_list + + # unset env + env_manager.unset_all() diff --git a/tests/st/python/test_vllm_deepseek_smoothquant.py b/tests/st/python/test_vllm_deepseek_smoothquant.py new file mode 100644 index 0000000000000000000000000000000000000000..7582e55b20fa988ab2edb170feb874551689382e --- /dev/null +++ b/tests/st/python/test_vllm_deepseek_smoothquant.py @@ -0,0 +1,77 @@ +#!/usr/bin/env python3 +# encoding: utf-8 +# Copyright 2025 Huawei Technologies Co., Ltd +# Copyright 2024 The vLLM team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +"""test mf deepseek r1 smoothquant.""" +import pytest +import os +from . import set_env +env_manager = set_env.EnvVarManager() +# def env +env_vars = { + "MINDFORMERS_MODEL_CONFIG": "./config/predict_deepseek_r1_671b_w8a8_smoothquant.yaml", + "ASCEND_CUSTOM_PATH": os.path.expandvars("$ASCEND_HOME_PATH/../"), + "vLLM_MODEL_BACKEND": "MindFormers", + "MS_ENABLE_LCCL": "off", + "HCCL_OP_EXPANSION_MODE": "AIV", + "ASCEND_RT_VISIBLE_DEVICES": "0,1,2,3,4,5,6,7", + "MS_ALLOC_CONF": "enable_vmm:True", + "LCCL_DETERMINISTIC": "1", + "HCCL_DETERMINISTIC": "true", + "ATB_MATMUL_SHUFFLE_K_ENABLE": "0", + "ATB_LLM_LCOC_ENABLE": "0" +} +# set env +env_manager.setup_ai_environment(env_vars) +import vllm_mindspore +from vllm import LLM, SamplingParams + +class TestDeepSeek: + """ + Test Deepseek. 
+ """ + + @pytest.mark.level0 + @pytest.mark.platform_arm_ascend910b_training + @pytest.mark.env_single + def test_deepseek_r1(self): + """ + test case deepseek r1 w8a8 + """ + + # Sample prompts. + prompts = [ + "介绍下北京故宫", + ] + + # Create a sampling params object. + sampling_params = SamplingParams(temperature=0.0, max_tokens=10, top_k=1) + + # Create an LLM. + llm = LLM(model="/home/workspace/mindspore_dataset/weight/DeepSeek-R1-W8A8-smoothquant", + trust_remote_code=True, gpu_memory_utilization=0.9, tensor_parallel_size=8) + # Generate texts from the prompts. The output is a list of RequestOutput objects + # that contain the prompt, generated text, and other information. + outputs = llm.generate(prompts, sampling_params) + # Print the outputs. + for i, output in enumerate(outputs): + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") + assert "博物院" in generated_text + + # unset env + env_manager.unset_all() diff --git a/tests/st/python/test_vllm_mf_qwen_7b.py b/tests/st/python/test_vllm_mf_qwen_7b.py new file mode 100644 index 0000000000000000000000000000000000000000..ddb545c78a846482cfba02035520767401e69004 --- /dev/null +++ b/tests/st/python/test_vllm_mf_qwen_7b.py @@ -0,0 +1,78 @@ +#!/usr/bin/env python3 +# encoding: utf-8 +# Copyright 2025 Huawei Technologies Co., Ltd +# Copyright 2024 The vLLM team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +"""test mf qwen.""" +import pytest +import os +from . import set_env +env_manager = set_env.EnvVarManager() +# def env +env_vars = { + "MINDFORMERS_MODEL_CONFIG": "./config/predict_qwen2_5_7b_instruct.yaml", + "ASCEND_CUSTOM_PATH": os.path.expandvars("$ASCEND_HOME_PATH/../"), + "vLLM_MODEL_BACKEND": "MindFormers", + "MS_ENABLE_LCCL": "off", + "HCCL_OP_EXPANSION_MODE": "AIV", + "ASCEND_RT_VISIBLE_DEVICES": "0,1", + "MS_ALLOC_CONF": "enable_vmm:True", + "LCCL_DETERMINISTIC": "1", + "HCCL_DETERMINISTIC": "true", + "ATB_MATMUL_SHUFFLE_K_ENABLE": "0", + "ATB_LLM_LCOC_ENABLE": "0" +} +# set env +env_manager.setup_ai_environment(env_vars) +import vllm_mindspore +from vllm import LLM, SamplingParams + + +class TestMfQwen: + """ + Test Qwen. + """ + @pytest.mark.level0 + @pytest.mark.platform_arm_ascend910b_training + @pytest.mark.env_single + def test_mf_qwen(self): + """ + test case qwen2.5 7B + """ + + # Sample prompts. + prompts = [ + "You are a helpful assistant.<|User|>将文本分类为中性、负面或正面。 \n文本:我认为这次假期还可以。 \n情感:<|Assistant|>\n", + ] + + # Create a sampling params object. + sampling_params = SamplingParams(temperature=0.0, max_tokens=10, top_k=1) + + # Create an LLM. + llm = LLM(model="/home/workspace/mindspore_dataset/weight/Qwen2.5-7B-Instruct", + gpu_memory_utilization=0.9, tensor_parallel_size=2) + # Generate texts from the prompts. The output is a list of RequestOutput objects + # that contain the prompt, generated text, and other information. 
+ outputs = llm.generate(prompts, sampling_params) + except_list=['中性<|Assistant|> 这句话'] + # Print the outputs. + for i, output in enumerate(outputs): + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") + assert generated_text == except_list[i] + + # unset env + env_manager.unset_all() diff --git a/tests/st/python/test_vllm_mf_qwen_7b_chunk_prefill.py b/tests/st/python/test_vllm_mf_qwen_7b_chunk_prefill.py new file mode 100644 index 0000000000000000000000000000000000000000..1523e46bb119ba665266d87b978d2c9b780ee4db --- /dev/null +++ b/tests/st/python/test_vllm_mf_qwen_7b_chunk_prefill.py @@ -0,0 +1,89 @@ +# Copyright 2024 The vLLM team. +# Copyright 2024 Microsoft and the HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://wwww.apache.org/licenses/LICENSE-2.0 +# +# Unless required by application law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +"""test mf qwen chunk prefill.""" +import pytest +import os +from . import set_env + +env_manager = set_env.EnvVarManager() +# def env +env_vars = { + "MINDFORMERS_MODEL_CONFIG": "./config/predict_qwen2_5_7b_instruct.yaml", + "ASCEND_CUSTOM_PATH": os.path.expandvars("$ASCEND_HOME_PATH/../"), + "vLLM_MODEL_BACKEND": "MindFormers", + "MS_ENABLE_LCCL": "off", + "HCCL_OP_EXPANSION_MODE": "AIV", + "ASCEND_RT_VISIBLE_DEVICES": "0,1", + "MS_ALLOC_CONF": "enable_vmm:True", + "LCCL_DETERMINISTIC": "1", + "HCCL_DETERMINISTIC": "true", + "ATB_MATMUL_SHUFFLE_K_ENABLE": "0", + "ATB_LLM_LCOC_ENABLE": "0" +} +# set env +env_manager.setup_ai_environment(env_vars) +import vllm_mindspore +from vllm import LLM, SamplingParams + + +class TestMfQwen_chunk_prefill: + """ + Test qwen. + """ + + @pytest.mark.level0 + @pytest.mark.platform_arm_ascend910b_training + @pytest.mark.env_single + def test_mf_qwen_7b_chunk_prefill(self): + """ + test case qwen_7b_chunk_prefill + """ + + # Sample prompts. + batch_datas = [{ + "prompt": "I love Beijing, because it is a city with a long history and profound cultural heritage. Walking through " + "its ancient hutongs, one can almost feel the whispers of the past. The Forbidden City, an architectural " + "marvel that once housed emperors, stands as a testament to the city's imperial past. Meanwhile, the Great " + "Wall, though not within the city limits, is easily accessible from Beijing and offers a glimpse into the " + "strategic genius and resilience of ancient China.", + "answer": " The city's blend of traditional and modern architecture, vibrant street life, and rich culinary scene " + "make it a truly unique and captivating destination. I am always eager to"}, + {"prompt": "I love Beijing, because", + "answer": " it is a city with a long history. Which of the following options correctly expresses this sentence?\nA. I love Beijing, because it is a city with a"}, + ] + + # Create a sampling params object. + sampling_params = SamplingParams(temperature=0.0, max_tokens=32, top_k=1) + + # Create an LLM. 
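
For context on the configuration in the `LLM(...)` call below: with `enable_chunked_prefill=True` and a 32-token `max_num_batched_tokens` budget, the long first prompt cannot be prefilled in one scheduler step. A rough back-of-the-envelope sketch; the prompt length is an assumption, not a measured tokenizer count.

```python
import math

prompt_tokens = 100   # assumed tokenized length of the long Beijing prompt
token_budget = 32     # max_num_batched_tokens configured below

# Lower bound on prefill steps: the prompt is consumed in chunks of at most
# 32 tokens (the real scheduler may also interleave decode tokens per step).
prefill_steps = math.ceil(prompt_tokens / token_budget)
print(prefill_steps)  # -> 4
```
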
+ llm = LLM(model="/home/workspace/mindspore_dataset/weight/Qwen2.5-7B-Instruct", + max_model_len=8192, max_num_seqs=16, max_num_batched_tokens=32, + block_size=32, gpu_memory_utilization=0.9, tensor_parallel_size=2, + enable_chunked_prefill=True) + # Generate texts from the prompts. The output is a list of RequestOutput objects + # that contain the prompt, generated text, and other information. + for batch_data in batch_datas: + prompt = batch_data["prompt"] + answer = batch_data["answer"] + outputs = llm.generate(prompt, sampling_params) + # Print the outputs. + for i, output in enumerate(outputs): + generated_text = output.outputs[0].text + print(f"Prompt: {output.prompt!r}, Generated text: {generated_text!r}") + assert generated_text == answer + + # unset env + env_manager.unset_all() diff --git a/tests/st/python/test_vllm_mf_qwen_7b_cp_pc_mss.py b/tests/st/python/test_vllm_mf_qwen_7b_cp_pc_mss.py new file mode 100644 index 0000000000000000000000000000000000000000..6292b22c6020777ed6c3ee752834b140e2ab13fc --- /dev/null +++ b/tests/st/python/test_vllm_mf_qwen_7b_cp_pc_mss.py @@ -0,0 +1,86 @@ +# Copyright 2024 The vLLM team. +# Copyright 2024 Microsoft and the HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://wwww.apache.org/licenses/LICENSE-2.0 +# +# Unless required by application law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +"""test mf qwen chunk prefill, prefix cache, mss.""" +import pytest +import os +from . import set_env +env_manager = set_env.EnvVarManager() +# def env +env_vars = { + "MINDFORMERS_MODEL_CONFIG": "./config/predict_qwen2_5_7b_instruct.yaml", + "ASCEND_CUSTOM_PATH": os.path.expandvars("$ASCEND_HOME_PATH/../"), + "vLLM_MODEL_BACKEND": "MindFormers", + "MS_ENABLE_LCCL": "off", + "HCCL_OP_EXPANSION_MODE": "AIV", + "ASCEND_RT_VISIBLE_DEVICES": "0,1", + "MS_ALLOC_CONF": "enable_vmm:True", + "LCCL_DETERMINISTIC": "1", + "HCCL_DETERMINISTIC": "true", + "ATB_MATMUL_SHUFFLE_K_ENABLE": "0", + "ATB_LLM_LCOC_ENABLE": "0" +} +# set env +env_manager.setup_ai_environment(env_vars) +import vllm_mindspore +from vllm import LLM, SamplingParams + +class TestMfQwen_cp_pc_mss: + """ + Test qwen. + """ + @pytest.mark.level0 + @pytest.mark.platform_arm_ascend910b_training + @pytest.mark.env_single + def test_mf_qwen_7b_cp_pc_mss(self): + """ + test case mf_qwen_7b_cp_pc_mss + """ + + # Sample prompts. + batch_datas = [{ + "prompt": "I love Beijing, because it is a city with a long history and profound cultural heritage. Walking through " + "its ancient hutongs, one can almost feel the whispers of the past. The Forbidden City, an architectural " + "marvel that once housed emperors, stands as a testament to the city's imperial past. Meanwhile, the Great " + "Wall, though not within the city limits, is easily accessible from Beijing and offers a glimpse into the " + "strategic genius and resilience of ancient China.", + "answer": ""}, + {"prompt": "I love Beijing, because", + "answer": " it is a city with a long history. Which of the following options correctly expresses this sentence?\nA. 
I love Beijing, because it is a city with a"}, + ] + + # Create a sampling params object. + sampling_params = SamplingParams(temperature=0.0, max_tokens=32, top_k=1) + + # Create an LLM. + llm = LLM(model="/home/workspace/mindspore_dataset/weight/Qwen2.5-7B-Instruct", + max_model_len=8192, max_num_seqs=16, max_num_batched_tokens=32, + block_size=32, gpu_memory_utilization=0.9, tensor_parallel_size=2, + enable_chunked_prefill=True, enable_prefix_caching=True, num_scheduler_steps=8) + # Generate texts from the prompts. The output is a list of RequestOutput objects + # that contain the prompt, generated text, and other information. + for _ in range(3): + for batch_data in batch_datas: + prompt = batch_data["prompt"] + answer = batch_data["answer"] + outputs = llm.generate(prompt, sampling_params) + # Print the outputs. + for i, output in enumerate(outputs): + generated_text = output.outputs[0].text + print(f"Prompt: {output.prompt!r}, Generated text: {generated_text!r}") + assert generated_text == answer + + # unset env + env_manager.unset_all() diff --git a/tests/st/python/test_vllm_mf_qwen_7b_mss.py b/tests/st/python/test_vllm_mf_qwen_7b_mss.py new file mode 100644 index 0000000000000000000000000000000000000000..b174804dd50d5dc5e3f090286b29b93164e1515e --- /dev/null +++ b/tests/st/python/test_vllm_mf_qwen_7b_mss.py @@ -0,0 +1,78 @@ +#!/usr/bin/env python3 +# encoding: utf-8 +# Copyright 2025 Huawei Technologies Co., Ltd +# Copyright 2024 The vLLM team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +"""test mf qwen mss.""" +import pytest +import os +from . import set_env +env_manager = set_env.EnvVarManager() +# def env +env_vars = { + "MINDFORMERS_MODEL_CONFIG": "./config/predict_qwen2_5_7b_instruct.yaml", + "ASCEND_CUSTOM_PATH": os.path.expandvars("$ASCEND_HOME_PATH/../"), + "vLLM_MODEL_BACKEND": "MindFormers", + "MS_ENABLE_LCCL": "off", + "HCCL_OP_EXPANSION_MODE": "AIV", + "ASCEND_RT_VISIBLE_DEVICES": "0,1", + "MS_ALLOC_CONF": "enable_vmm:True", + "LCCL_DETERMINISTIC": "1", + "HCCL_DETERMINISTIC": "true", + "ATB_MATMUL_SHUFFLE_K_ENABLE": "0", + "ATB_LLM_LCOC_ENABLE": "0" +} +# set env +env_manager.setup_ai_environment(env_vars) +import vllm_mindspore +from vllm import LLM, SamplingParams + +class TestMfQwen_mss: + """ + Test qwen. + """ + @pytest.mark.level0 + @pytest.mark.platform_arm_ascend910b_training + @pytest.mark.env_single + def test_mf_qwen_7b_mss(self): + """ + test case qwen_7b_mss + """ + + # Sample prompts. + prompts = [ + "I love Beijing, because", + ] + + # Create a sampling params object. + sampling_params = SamplingParams(temperature=0.0, max_tokens=10, top_k=1) + + # Create an LLM. + llm = LLM(model="/home/workspace/mindspore_dataset/weight/Qwen2.5-7B-Instruct", + max_model_len=8192, max_num_batched_tokens=8192, + block_size=32, gpu_memory_utilization=0.9, num_scheduler_steps=8, tensor_parallel_size=2) + # Generate texts from the prompts. 
The output is a list of RequestOutput objects + # that contain the prompt, generated text, and other information. + outputs = llm.generate(prompts, sampling_params) + except_list=[' it is a city with a long history. Which'] + # Print the outputs. + for i, output in enumerate(outputs): + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") + assert generated_text == except_list[i] + + # unset env + env_manager.unset_all() diff --git a/tests/st/python/test_vllm_mf_qwen_7b_prefix_caching.py b/tests/st/python/test_vllm_mf_qwen_7b_prefix_caching.py new file mode 100644 index 0000000000000000000000000000000000000000..89ba64c0e5032cc64cffaaa468b26823aeac185c --- /dev/null +++ b/tests/st/python/test_vllm_mf_qwen_7b_prefix_caching.py @@ -0,0 +1,83 @@ +#!/usr/bin/env python3 +# encoding: utf-8 +# Copyright 2025 Huawei Technologies Co., Ltd +# Copyright 2024 The vLLM team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +"""test mf qwen prefix caching.""" +import pytest +import os +from . import set_env +env_manager = set_env.EnvVarManager() +env_vars = { + "MINDFORMERS_MODEL_CONFIG": "./config/predict_qwen2_5_7b_instruct.yaml", + "ASCEND_CUSTOM_PATH": os.path.expandvars("$ASCEND_HOME_PATH/../"), + "vLLM_MODEL_BACKEND": "MindFormers", + "MS_ENABLE_LCCL": "off", + "HCCL_OP_EXPANSION_MODE": "AIV", + "ASCEND_RT_VISIBLE_DEVICES": "0,1", + "LCCL_DETERMINISTIC": "1", + "HCCL_DETERMINISTIC": "true", + "ATB_MATMUL_SHUFFLE_K_ENABLE": "0", + "ATB_LLM_LCOC_ENABLE": "0" +} +env_manager.setup_ai_environment(env_vars) +import vllm_mindspore +from vllm import LLM, SamplingParams + + +class TestMfQwen_prefix_caching: + """ + Test qwen7b enable prefix_caching + """ + @pytest.mark.level0 + @pytest.mark.platform_arm_ascend910b_training + @pytest.mark.env_single + def test_mf_qwen_7b_prefix_caching(self): + """ + test case qwen_7b_prefix_caching + """ + + # First prompts. + prompts = [ + "I love Beijing, because it is a city that has so much to offer. I have visited" + ] + #second prompts, the second prompt is a continuation of the first prompts, make sure prefix caching work. + second_prompts = [ + "I love Beijing, because it is a city that has so much to offer. I have visited many places" + ] + # Create a sampling params object. + sampling_params = SamplingParams(temperature=0.0, max_tokens=10, top_k=1) + + # Create an LLM. + llm = LLM(model="/home/workspace/mindspore_dataset/weight/Qwen2.5-7B-Instruct", + max_model_len=8192, block_size=16, enable_prefix_caching=True, + gpu_memory_utilization=0.9, tensor_parallel_size=2) + # Generate texts from the prompts. The output is a list of RequestOutput objects + # that contain the prompt, generated text, and other information. 
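
This prefix-caching test relies on the second prompt being a strict extension of the first, so the KV-cache blocks covering the shared prefix can be reused on the second request. A rough illustration of the block-level accounting; the prefix length is invented, while `block_size=16` matches the `LLM(...)` call above.

```python
block_size = 16             # matches the block_size passed to LLM(...) above
shared_prefix_tokens = 19   # assumed tokenized length of the common prefix

# Only whole blocks are served from the cache; the tail of the prefix that
# does not fill a complete block is recomputed together with the new tokens.
reused_blocks = shared_prefix_tokens // block_size     # -> 1
recomputed_tokens = shared_prefix_tokens % block_size  # -> 3
print(reused_blocks, recomputed_tokens)
```
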
+ outputs = llm.generate(prompts, sampling_params) + second_outputs = llm.generate(second_prompts, sampling_params) + except_list=[' many times and each time I have found something new'] + second_except_list=[' to visit, such as the Forbidden City, the'] + for i, (output, second_output) in enumerate(zip(outputs, second_outputs)): + generated_text = output.outputs[i].text + print(f"Output1 - Prompt: {prompts[i]!r}, Generated text: {generated_text!r}") + assert generated_text == except_list[i] + + second_generated_text = second_output.outputs[i].text + print(f"Output2 - Prompt: {second_prompts[i]!r}, Generated text: {second_generated_text!r}") + assert second_generated_text == second_except_list[i] + + env_manager.unset_all() diff --git a/tests/st/python/test_vllm_qwen_7b.py b/tests/st/python/test_vllm_qwen_7b.py new file mode 100644 index 0000000000000000000000000000000000000000..bce75d3e11bc24c73cecc45a90f84954d9b800e0 --- /dev/null +++ b/tests/st/python/test_vllm_qwen_7b.py @@ -0,0 +1,74 @@ +# Copyright 2024 The vLLM team. +# Copyright 2024 Microsoft and the HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://wwww.apache.org/licenses/LICENSE-2.0 +# +# Unless required by application law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +"""test vllm qwen.""" +import pytest +import os +from . import set_env +env_manager = set_env.EnvVarManager() +# def env +env_vars = { + "ASCEND_CUSTOM_PATH": os.path.expandvars("$ASCEND_HOME_PATH/../"), + "MS_ENABLE_LCCL": "off", + "HCCL_OP_EXPANSION_MODE": "AIV", + "ASCEND_RT_VISIBLE_DEVICES": "0,1", + "MS_ALLOC_CONF": "enable_vmm:True", + "LCCL_DETERMINISTIC": "1", + "HCCL_DETERMINISTIC": "true", + "ATB_MATMUL_SHUFFLE_K_ENABLE": "0", + "ATB_LLM_LCOC_ENABLE": "0" +} +# set env +env_manager.setup_ai_environment(env_vars) +import vllm_mindspore +from vllm import LLM, SamplingParams + + +class TestQwen: + """ + Test Qwen. + """ + @pytest.mark.level0 + @pytest.mark.platform_arm_ascend910b_training + @pytest.mark.env_single + def test_vllm_qwen(self): + """ + test case qwen2.5 7B + """ + + # Sample prompts. + prompts = [ + "You are a helpful assistant.<|User|>将文本分类为中性、负面或正面。 \n文本:我认为这次假期还可以。 \n情感:<|Assistant|>\n", + ] + + # Create a sampling params object. + sampling_params = SamplingParams(temperature=0.0, max_tokens=10, top_k=1) + + # Create an LLM. + llm = LLM(model="/home/workspace/mindspore_dataset/weight/Qwen2.5-7B-Instruct", + gpu_memory_utilization=0.9, tensor_parallel_size=2) + # Generate texts from the prompts. The output is a list of RequestOutput objects + # that contain the prompt, generated text, and other information. + outputs = llm.generate(prompts, sampling_params) + except_list=['中性<|Assistant|> 这句话'] + # Print the outputs. 
+ for i, output in enumerate(outputs): + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") + assert generated_text == except_list[i] + + # unset env + env_manager.unset_all() diff --git a/vllm_mindspore/__init__.py b/vllm_mindspore/__init__.py index b86b9127f4a41fa2ba087c2b99b91525448a5d88..47a9e4d58c3a922c2f16ff444ffb1a2619696627 100644 --- a/vllm_mindspore/__init__.py +++ b/vllm_mindspore/__init__.py @@ -1,6 +1,7 @@ #!/usr/bin/env python3 # encoding: utf-8 # Copyright 2025 Huawei Technologies Co., Ltd +# Copyright 2024 The vLLM team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -46,7 +47,6 @@ vllm.utils.current_platform = ascend_platform from vllm_mindspore.utils import ( direct_register_custom_op, - memory_profiling, make_tensor_with_pad, async_tensor_h2d, get_dtype_size, @@ -55,7 +55,6 @@ from vllm_mindspore.utils import ( ) vllm.utils.direct_register_custom_op = direct_register_custom_op -vllm.utils.memory_profiling = memory_profiling vllm.utils.make_tensor_with_pad = make_tensor_with_pad vllm.utils.async_tensor_h2d = async_tensor_h2d vllm.utils.get_dtype_size = get_dtype_size @@ -69,15 +68,19 @@ vllm.executor.cuda_device_count_stateless = ascend_device_count_stateless from vllm_mindspore.model_executor.models.registry import ( MindSporeModelRegistry, - _run_in_subprocess, + _SUBPROCESS_COMMAND, ) +vllm.config.ModelRegistry = MindSporeModelRegistry + import vllm.model_executor vllm.model_executor.models.ModelRegistry = MindSporeModelRegistry -vllm.config.ModelRegistry = MindSporeModelRegistry +vllm.model_executor.models.registry._SUBPROCESS_COMMAND = _SUBPROCESS_COMMAND from vllm_mindspore.model_executor.model_loader.utils import get_ms_model_architecture + +# To patching the get_model_architecture, should import it first. 
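
The comment above describes the import-then-rebind idiom used throughout this `__init__.py`: importing the original symbol forces its defining module to load, after which the module attribute can be rebound to the MindSpore implementation so later lookups through the module resolve to the replacement. A minimal, self-contained illustration using the standard library as a stand-in (`json` has nothing to do with vLLM; it is just a safe module to patch):

```python
import json

_original_dumps = json.dumps  # keep a handle to the original implementation

def patched_dumps(obj, **kwargs):
    # A drop-in replacement keeps the original call signature so existing
    # callers that look the function up via the module keep working.
    return _original_dumps(obj, **kwargs)

# Rebind the attribute on the already-imported module; every subsequent
# json.dumps(...) lookup through the module now hits the wrapper.
json.dumps = patched_dumps
assert json.dumps({"a": 1}) == '{"a": 1}'
```

Note that code which ran `from json import dumps` before the rebinding keeps a reference to the original function, which is why the ordering of imports and patches matters in this file.
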
from vllm.model_executor.model_loader import get_model_architecture vllm.model_executor.model_loader.get_model_architecture = get_ms_model_architecture @@ -87,7 +90,6 @@ vllm.model_executor.model_loader.utils.get_model_architecture = ( vllm.model_executor.model_loader.loader.get_model_architecture = ( get_ms_model_architecture ) -vllm.model_executor.models.registry._run_in_subprocess = _run_in_subprocess from vllm_mindspore.model_executor.sampling_metadata import ( SequenceGroupToSample, @@ -101,23 +103,15 @@ vllm.model_executor.sampling_metadata.SequenceGroupToSample = SequenceGroupToSam vllm.model_executor.sampling_metadata.SamplingMetadataCache = SamplingMetadataCache vllm.model_executor.sampling_metadata.SamplingMetadata = SamplingMetadata -from vllm_mindspore.attention.selector import get_ms_attn_backend - -import vllm.attention - -vllm.attention.get_attn_backend = get_ms_attn_backend - from vllm_mindspore.worker.cache_engine import ( ms_allocate_kv_cache, ms_swap_in, ms_swap_out, - cache_engine_init ) import vllm.worker.cache_engine vllm.worker.cache_engine.CacheEngine._allocate_kv_cache = ms_allocate_kv_cache -vllm.worker.cache_engine.CacheEngine.__init__ = cache_engine_init vllm.worker.cache_engine.CacheEngine.swap_in = ms_swap_in vllm.worker.cache_engine.CacheEngine.swap_out = ms_swap_out @@ -129,35 +123,39 @@ vllm.model_executor.model_loader.loader.safetensors_weights_iterator = ( safetensors_weights_iterator ) -from vllm_mindspore.worker.worker import ( - _warm_up_model, - determine_num_available_blocks, +from vllm_mindspore.worker.worker import _warm_up_model +from vllm_mindspore.worker.profile import ( + wrapper_worker_init, + wrapper_worker_init_device, ) from vllm.worker.worker import Worker Worker._warm_up_model = _warm_up_model -Worker.determine_num_available_blocks = determine_num_available_blocks +Worker.__init__ = wrapper_worker_init(Worker.__init__) +Worker.init_device = wrapper_worker_init_device(Worker.init_device) -from vllm_mindspore.worker.model_runner import _get_cuda_graph_pad_size, profile_run +from vllm_mindspore.worker.model_runner import ( + _get_cuda_graph_pad_size, + _dummy_run, + _get_supported_attention_backends, +) vllm.worker.model_runner.ModelInputForGPUBuilder._get_cuda_graph_pad_size = ( _get_cuda_graph_pad_size ) -vllm.worker.model_runner.GPUModelRunnerBase.profile_run = profile_run +vllm.worker.model_runner.GPUModelRunnerBase._dummy_run = _dummy_run -from vllm_mindspore.distributed.parallel_state import ( - all_reduce_for_GroupCoordinator, - init_model_parallel_group, -) +import vllm.worker.multi_step_model_runner -vllm.distributed.parallel_state.GroupCoordinator.all_reduce = ( - all_reduce_for_GroupCoordinator +vllm.worker.multi_step_model_runner._get_supported_attention_backends = ( + _get_supported_attention_backends ) -vllm.distributed.parallel_state.init_model_parallel_group = init_model_parallel_group from vllm_mindspore.executor.multiproc_worker_utils import ( get_mp_context as ms_get_mp_context, ) + +# To patching the get_mp_context, should import it first. 
from vllm.executor.multiproc_worker_utils import get_mp_context vllm.executor.multiproc_worker_utils.get_mp_context = ms_get_mp_context @@ -167,10 +165,11 @@ from vllm_mindspore.executor.ray_gpu_executor import ( initialize_ray_cluster, ) -from vllm.executor.ray_gpu_executor import RayGPUExecutor +from vllm.executor.ray_distributed_executor import RayDistributedExecutor -RayGPUExecutor._init_workers_ray = ms_init_workers_ray +RayDistributedExecutor._init_workers_ray = ms_init_workers_ray +vllm.executor.ray_distributed_executor.initialize_ray_cluster = initialize_ray_cluster vllm.executor.ray_utils.initialize_ray_cluster = initialize_ray_cluster import vllm.engine.llm_engine @@ -180,10 +179,44 @@ vllm.engine.llm_engine.initialize_ray_cluster = initialize_ray_cluster vllm.engine.async_llm_engine.initialize_ray_cluster = initialize_ray_cluster -from .config import get_head_size, _verify_quantization -vllm.config.ModelConfig.get_head_size = get_head_size +from .config import _verify_quantization, _verify_args, vllm_config_post_init + vllm.config.ModelConfig._verify_quantization = _verify_quantization +vllm.config.VllmConfig.__post_init__ = vllm_config_post_init +vllm.config.SchedulerConfig._verify_args = _verify_args + +from .utils import update_modules +from vllm_mindspore.attention.backends import ms_attn +update_modules("vllm.attention.backends.flash_attn", ms_attn) + +from vllm_mindspore.worker.spec_decode_worker import ( + spec_decode_worker_init, + _run_no_spec, + _verify_tokens, + _create_output, + _merge_outputs, +) +from vllm.spec_decode.spec_decode_worker import SpecDecodeWorker +SpecDecodeWorker.__init__ = spec_decode_worker_init +SpecDecodeWorker._verify_tokens = _verify_tokens +SpecDecodeWorker._run_no_spec = _run_no_spec + +from vllm.model_executor.layers.spec_decode_base_sampler import SpecDecodeBaseSampler +SpecDecodeBaseSampler._create_output = _create_output + +from vllm.spec_decode.top1_proposer import Top1Proposer +Top1Proposer._merge_outputs = _merge_outputs + +from vllm_mindspore.model_executor.layers.rejection_sampler import _smallest_positive_value, _multinomial +from vllm.model_executor.layers.rejection_sampler import RejectionSampler +RejectionSampler._smallest_positive_value = _smallest_positive_value +RejectionSampler._smallest_positive_value.__set_name__(RejectionSampler, '_smallest_positive_value') +vllm.model_executor.layers.rejection_sampler._multinomial = _multinomial from .utils import check_ready +from vllm_mindspore.engine.multiprocessing.engine import cleanup +import vllm.engine.multiprocessing.engine +vllm.engine.multiprocessing.engine.MQLLMEngine.cleanup = cleanup + check_ready() diff --git a/vllm_mindspore/attention/backends/ms_attn.py b/vllm_mindspore/attention/backends/ms_attn.py index f01c1517f9385f93ddb95880344e2dca192e8784..558882cdffe5caaf6aa621a9ade1678d9a99b04b 100644 --- a/vllm_mindspore/attention/backends/ms_attn.py +++ b/vllm_mindspore/attention/backends/ms_attn.py @@ -1,12 +1,13 @@ #!/usr/bin/env python3 # encoding: utf-8 # Copyright 2025 Huawei Technologies Co., Ltd +# Copyright 2024 The vLLM team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at # -# http://www.apache.org/licenses/LICENSE-2.0 +# http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, @@ -18,7 +19,11 @@ from collections import defaultdict from dataclasses import dataclass +from itertools import accumulate from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Type +import os + +import numpy as np import torch @@ -29,6 +34,7 @@ from vllm.attention.backends.abstract import ( AttentionMetadataBuilder, AttentionType, AttentionState, + AttentionLayer, ) if TYPE_CHECKING: @@ -52,6 +58,42 @@ from mindspore import mutable from mindspore._c_expression import swap_cache +def advance_step_op(sampled_token_ids, + model_input, + seq_lens_tensor, + num_queries, + block_size, + block_tables, + slot_mapping): + # update input_tokens + sampled_token_ids_list = sampled_token_ids[: + num_queries].squeeze( # type: ignore + -1) + model_input.input_tokens[: + num_queries] = sampled_token_ids_list # type: ignore + + # get seq_lens and input_positions + seq_lens = seq_lens_tensor[:num_queries] + next_seq_lens = seq_lens + 1 + next_input_pos = next_seq_lens - 1 + + # update seq_lens and input_positions + seq_lens_tensor[:num_queries] = next_seq_lens + model_input.input_positions[: + num_queries] = next_input_pos # type: ignore + + # 计算 block index 和 offset + block_idx = next_input_pos // block_size + block_offset = next_input_pos % block_size + + current_block_table = block_tables.gather( + 1, block_idx.unsqueeze(-1)).squeeze(-1) + slot_num = current_block_table * block_size + block_offset + + # update slot_mapping + slot_mapping[:num_queries] = slot_num + + @dataclass class MSAttentionMetadata(AttentionMetadata, PagedAttentionMetadata): """Metadata for TorchSDPABackend.""" @@ -63,10 +105,20 @@ class MSAttentionMetadata(AttentionMetadata, PagedAttentionMetadata): # For chunked prefill only max_query_len: Optional[int] = None + + max_prefill_seq_len: int = 0 + seq_start_loc: Optional[torch.Tensor] = None + _cached_prefill_metadata: Optional["MSAttentionMetadata"] = None + _cached_decode_metadata: Optional["MSAttentionMetadata"] = None + context_lens_tensor: Optional[torch.Tensor] = None + encoder_seq_start_loc: Optional[torch.Tensor] = None + max_decode_query_len: Optional[int] = None + max_kv_len: Optional[int] = None query_start_loc: Optional[torch.Tensor] = None kv_start_loc: Optional[torch.Tensor] = None prefill_block_tables: Optional[torch.Tensor] = None + query_lens: Optional[List[int]] = None # Begin encoder attn & enc/dec cross-attn fields... # Encoder sequence lengths representation @@ -84,20 +136,202 @@ class MSAttentionMetadata(AttentionMetadata, PagedAttentionMetadata): cross_slot_mapping: Optional[torch.Tensor] = None cross_block_tables: Optional[torch.Tensor] = None - # TODO(tronzhang): No need to use cuda_graph for mindspore. 
use_cuda_graph: bool = False + enable_kv_scales_calculation: bool + @property def prefill_metadata(self): - if self.num_prefill_tokens == 0: + if self.num_prefills == 0: return None - return self + + if self._cached_prefill_metadata is not None: + return self._cached_prefill_metadata + + assert ((self.seq_lens is not None) + or (self.encoder_seq_lens is not None)) + assert ((self.seq_lens_tensor is not None) + or (self.encoder_seq_lens_tensor is not None)) + + # Compute some attn_metadata fields which default to None + query_start_loc = (None if self.query_start_loc is None else + self.query_start_loc[:self.num_prefills + 1]) + slot_mapping = (None if self.slot_mapping is None else + self.slot_mapping[:self.num_prefill_tokens]) + seq_lens = (None if self.seq_lens is None else + self.seq_lens[:self.num_prefills]) + seq_lens_tensor = (None if self.seq_lens_tensor is None else + self.seq_lens_tensor[:self.num_prefills]) + seq_start_loc = (None if self.seq_start_loc is None else + self.seq_start_loc[:self.num_prefills + 1]) + context_lens_tensor = (None if self.context_lens_tensor is None else + self.context_lens_tensor[:self.num_prefills]) + block_tables = (None if self.block_tables is None else + self.block_tables[:self.num_prefills]) + + self._cached_prefill_metadata = MSAttentionMetadata( + num_prefills=self.num_prefills, + num_prefill_tokens=self.num_prefill_tokens, + num_decode_tokens=0, + slot_mapping=slot_mapping, + multi_modal_placeholder_index_maps=self. + multi_modal_placeholder_index_maps, + enable_kv_scales_calculation=False, + seq_lens=seq_lens, + seq_lens_tensor=seq_lens_tensor, + max_query_len=self.max_query_len, + max_prefill_seq_len=self.max_prefill_seq_len, + max_decode_query_len=0, + max_decode_seq_len=0, + query_start_loc=query_start_loc, + seq_start_loc=seq_start_loc, + context_lens_tensor=context_lens_tensor, + block_tables=block_tables, + use_cuda_graph=False, + # Begin encoder & cross attn fields below... + encoder_seq_lens=self.encoder_seq_lens, + encoder_seq_lens_tensor=self.encoder_seq_lens_tensor, + encoder_seq_start_loc=self.encoder_seq_start_loc, + max_encoder_seq_len=self.max_encoder_seq_len, + chunked_prefill=self.chunked_prefill, + cross_slot_mapping=self.cross_slot_mapping, + cross_block_tables=self.cross_block_tables) + return self._cached_prefill_metadata @property def decode_metadata(self): if self.num_decode_tokens == 0: return None - return self + + if self._cached_decode_metadata is not None: + return self._cached_decode_metadata + assert ((self.seq_lens_tensor is not None) + or (self.encoder_seq_lens_tensor is not None)) + + # Compute some attn_metadata fields which default to None + slot_mapping = (None if self.slot_mapping is None else + self.slot_mapping[self.num_prefill_tokens:]) + seq_lens_tensor = (None if self.seq_lens_tensor is None else + self.seq_lens_tensor[self.num_prefills:]) + block_tables = (None if self.block_tables is None else + self.block_tables[self.num_prefills:]) + + self._cached_decode_metadata = MSAttentionMetadata( + num_prefills=0, + num_prefill_tokens=0, + num_decode_tokens=self.num_decode_tokens, + slot_mapping=slot_mapping, + multi_modal_placeholder_index_maps=None, + enable_kv_scales_calculation=False, + seq_lens=None, + seq_lens_tensor=seq_lens_tensor, + max_decode_query_len=self.max_decode_query_len, + max_query_len=self.max_query_len, + max_prefill_seq_len=0, + max_decode_seq_len=self.max_decode_seq_len, + # Batch may be composed of prefill|decodes, adjust query start + # indices to refer to the start of decodes. E.g. 
+ # in tokens:[3 prefills|6 decodes], query_start_loc=[3,9] => [0,6]. + query_start_loc=(self.query_start_loc[self.num_prefills:] - + self.query_start_loc[self.num_prefills]) + if self.query_start_loc is not None else None, + seq_start_loc=self.seq_start_loc[self.num_prefills:] + if self.seq_start_loc is not None else None, + context_lens_tensor=None, + block_tables=block_tables, + use_cuda_graph=self.use_cuda_graph, + # Begin encoder & cross attn fields below... + encoder_seq_lens=self.encoder_seq_lens, + encoder_seq_lens_tensor=self.encoder_seq_lens_tensor, + encoder_seq_start_loc=self.encoder_seq_start_loc, + max_encoder_seq_len=self.max_encoder_seq_len, + chunked_prefill=self.chunked_prefill, + cross_slot_mapping=self.cross_slot_mapping, + cross_block_tables=self.cross_block_tables) + return self._cached_decode_metadata + + def advance_step(self, + model_input: "ModelInputForNPUWithSamplingMetadata", + sampled_token_ids: Optional[torch.Tensor], + block_size: int, + num_seqs: int, + num_queries: int, + turn_prefills_into_decodes: bool = False): + """ + Update metadata in-place to advance one decode step. + """ + # When using cudagraph, the num_seqs is padded to the next captured + # batch sized, but num_queries tracks the actual number of requests in + # the batch. For --enforce-eager mode, num_seqs == num_queries + if num_seqs != num_queries: + assert num_seqs > num_queries + + if turn_prefills_into_decodes: + # When Mutli-Step is enabled with Chunked-Prefill, prefills and + # decodes are scheduled together. In the first step, all the + # prefills turn into decodes. This update reflects that + # conversion. + assert self.num_decode_tokens + self.num_prefills == num_seqs + self.num_decode_tokens += self.num_prefills + self.num_prefills = 0 + self.num_prefill_tokens = 0 + self.max_prefill_seq_len = 0 + self.max_query_len = 1 + + self.slot_mapping = self.slot_mapping[:num_seqs] + else: + assert self.seq_lens is not None + assert self.max_decode_seq_len == max(self.seq_lens) + + assert self.num_prefills == 0 + assert self.num_prefill_tokens == 0 + assert self.num_decode_tokens == num_seqs + assert self.slot_mapping.shape == (num_seqs, ) + + assert self.seq_lens is not None + assert len(self.seq_lens) == num_seqs + assert self.seq_lens_tensor is not None + assert self.seq_lens_tensor.shape == (num_seqs, ) + assert self.max_query_len == 1 + assert self.max_prefill_seq_len == 0 + + assert self.query_start_loc is not None + assert self.query_start_loc.shape == (num_queries + 1, ) + assert self.seq_start_loc is not None + assert self.seq_start_loc.shape == (num_seqs + 1, ) + + assert self.context_lens_tensor is not None + assert self.context_lens_tensor.shape == (num_queries, ) + + assert self.block_tables is not None + assert self.block_tables.shape[0] == num_seqs + + # Update query lengths. 
Note that we update only queries and not seqs, + # since tensors may be padded due to captured cuda graph batch size + for i in range(num_queries): + self.seq_lens[i] += 1 + self.max_decode_seq_len = max(self.seq_lens) + + # default use python op + if os.getenv("vLLM_USE_NPU_ADV_STEP_FLASH_OP", "off") == "on": + from vllm_mindspore import npu_ops + npu_ops.adv_step_flash(num_seqs=num_seqs, + num_queries=num_queries, + block_size=block_size, + input_tokens=model_input.input_tokens, + sampled_token_ids=sampled_token_ids, + input_positions=model_input.input_positions, + seq_lens=self.seq_lens_tensor, + slot_mapping=self.slot_mapping, + block_tables=self.block_tables) + else: + advance_step_op(sampled_token_ids, + model_input, + self.seq_lens_tensor, + num_queries, + block_size, + self.block_tables, + self.slot_mapping) def get_seq_lens( self, @@ -159,23 +393,16 @@ class MSAttentionMetadata(AttentionMetadata, PagedAttentionMetadata): else: raise AttributeError(f"Invalid attention type {str(attn_type)}") - def keys(self): - return ["num_prefill_tokens", "num_decode_tokens", "slot_mapping", "batch_valid_length", "context_lens", "block_tables"] - - def __getitem__(self, key): - if key == "context_lens": - key = "seq_lens_tensor" - if key == "batch_valid_length": - return mutable(getattr(self, "seq_lens"), dynamic_len=True) - if key == "block_tables": - if getattr(self, key).ndim == 1: - return mutable(getattr(self, key).expand_dims(0)) - return mutable(getattr(self, key)) - return mutable(getattr(self, key)) class MsAttentionMetadataBuilder(AttentionMetadataBuilder[MSAttentionMetadata]): def __init__(self, input_builder: "ModelInputForGPUBuilder"): + self.input_builder = input_builder + self.runner = input_builder.runner + self.sliding_window = input_builder.sliding_window + self.block_size = input_builder.block_size + + def prepare(self): self.slot_mapping: List[int] = [] self.prefill_seq_lens: List[int] = [] self.context_lens: List[int] = [] @@ -189,10 +416,6 @@ class MsAttentionMetadataBuilder(AttentionMetadataBuilder[MSAttentionMetadata]): self.num_decode_tokens = 0 self.has_prefix_cache_hit = False - self.input_builder = input_builder - self.runner = input_builder.runner - self.sliding_window = input_builder.sliding_window - self.block_size = input_builder.block_size def _add_seq_group( self, @@ -305,11 +528,18 @@ class MsAttentionMetadataBuilder(AttentionMetadataBuilder[MSAttentionMetadata]): use_captured_graph = cuda_graph_pad_size != -1 max_query_len = max(query_lens) + decode_query_lens = query_lens[self.num_prefills:] + if len(decode_query_lens) > 0: + max_decode_query_len = max(decode_query_lens) + else: + max_decode_query_len = 1 + max_prefill_seq_len = max(self.prefill_seq_lens, default=0) max_decode_seq_len = max(self.curr_seq_lens, default=0) num_decode_tokens = self.num_decode_tokens + query_start_loc = list(accumulate(query_lens, initial=0)) + seq_start_loc = list(accumulate(seq_lens, initial=0)) if use_captured_graph: - # TODO(tronzhang): Maybe here only turn graph mode on , and go with then same condition branch logic? 
raise RuntimeError("Doesnot support captured graph now!") else: block_tables = make_tensor_with_pad( @@ -320,10 +550,15 @@ class MsAttentionMetadataBuilder(AttentionMetadataBuilder[MSAttentionMetadata]): ) assert max_query_len > 0, "query_lens: {}".format(query_lens) + context_lens_tensor = ms.Tensor(self.context_lens, dtype=ms.int32) seq_lens_tensor = ms.Tensor(seq_lens, dtype=ms.int32) + slot_mapping_tensor = ms.Tensor(self.slot_mapping, dtype=ms.int32) + query_start_loc_tensor = ms.Tensor(query_start_loc, dtype=ms.int32) + seq_start_loc_tensor = ms.Tensor(seq_start_loc, dtype=ms.int32) + return MSAttentionMetadata( - slot_mapping=ms.Tensor(self.slot_mapping, dtype=ms.int32), + slot_mapping=slot_mapping_tensor, block_tables=block_tables, seq_lens_tensor=seq_lens_tensor, seq_lens=seq_lens, @@ -333,6 +568,12 @@ class MsAttentionMetadataBuilder(AttentionMetadataBuilder[MSAttentionMetadata]): num_prefill_tokens=self.num_prefill_tokens, num_decode_tokens=num_decode_tokens, multi_modal_placeholder_index_maps=None, + enable_kv_scales_calculation=False, + query_lens=query_lens, + query_start_loc=query_start_loc_tensor, + seq_start_loc=seq_start_loc_tensor, + context_lens_tensor=context_lens_tensor, + max_query_len=max_query_len, ) @@ -341,7 +582,7 @@ class MsAttentionBackend(AttentionBackend): @staticmethod def get_name() -> str: - raise "MS_ATTN" + return "MS_ATTN" @staticmethod def get_impl_cls() -> Type["AttentionImpl"]: @@ -398,7 +639,6 @@ class MsAttentionBackend(AttentionBackend): kv_caches: List[MsKVCache], src_to_dists: torch.Tensor, ) -> None: - # TODO(tronzhang): this may be slow, a faster interface should be implemented by custom op! blocks_to_copy = src_to_dists.asnumpy().tolist() for kv_cache in kv_caches: npu_key_block, npu_value_block = kv_cache @@ -444,18 +684,18 @@ class MsAttentionImpl(AttentionImpl): kv_cache_dtype: str, blocksparse_params: Optional[Dict[str, Any]] = None, logits_soft_cap: Optional[float] = None, + attn_type: str = AttentionType.DECODER, ) -> None: pass def forward( self, + layer: AttentionLayer, query: torch.Tensor, key: torch.Tensor, value: torch.Tensor, kv_cache: torch.Tensor, attn_metadata: MSAttentionMetadata, - k_scale: float = 1.0, - v_scale: float = 1.0, attn_type: str = AttentionType.DECODER, output: Optional[torch.Tensor] = None, ) -> torch.Tensor: @@ -530,3 +770,5 @@ class MLABackend(AttentionBackend): @staticmethod def get_supported_head_sizes() -> List[int]: return [576] + +FlashAttentionMetadata = MSAttentionMetadata diff --git a/vllm_mindspore/attention/backends/utils.py b/vllm_mindspore/attention/backends/utils.py index 00970f8f5682c68e4da8cecdfcbb496a8e989a9f..88cf9e1e50128f9eb851cfe7a72a77422b82330f 100644 --- a/vllm_mindspore/attention/backends/utils.py +++ b/vllm_mindspore/attention/backends/utils.py @@ -1,6 +1,7 @@ #!/usr/bin/env python3 # encoding: utf-8 # Copyright 2025 Huawei Technologies Co., Ltd +# Copyright 2024 The vLLM team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/vllm_mindspore/attention/layer.py b/vllm_mindspore/attention/layer.py index 01eacca3a2a62986c639796db8b4cc7e417a2892..4634727b9811601b3879d9c1a62c2a32fa613424 100644 --- a/vllm_mindspore/attention/layer.py +++ b/vllm_mindspore/attention/layer.py @@ -1,6 +1,7 @@ #!/usr/bin/env python3 # encoding: utf-8 # Copyright 2025 Huawei Technologies Co., Ltd +# Copyright 2024 The vLLM team. 
# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -152,15 +153,15 @@ class Attention(nn.Cell): query: Tensor, key: Tensor, value: Tensor, - kv_cache: Tuple[Tensor, Tensor], - # attn_metadata: MSMetadata, - num_prefill_tokens: int, - num_decode_tokens: int, + key_cache: Tensor, + value_cache: Tensor, + is_prefill: bool, slot_mapping: Tensor, batch_valid_length: Tuple[int], - context_lens: Tensor, + q_seq_lens: Tensor, block_tables: Tensor, attn_mask: Tensor, + decode_mask: Tensor, ) -> Tensor: """Attention foward, support MHA and GQA. @@ -174,13 +175,13 @@ class Attention(nn.Cell): block_tables: shape = [block_size, num_block] """ output = query - key_cache, value_cache = kv_cache cache_out = self.reshape_and_cache(key, value, key_cache, value_cache, slot_mapping) query = ops.depend(query, cache_out) - if num_prefill_tokens > 0: + if is_prefill: output = self._run_prefill_forward(query, key, value, attn_mask, batch_valid_length, batch_valid_length) - if num_decode_tokens > 0: - output = self._run_decode_forward(query, key_cache, value_cache, block_tables, context_lens) + else: + output = self._run_decode_forward(query, key_cache, value_cache, block_tables, batch_valid_length, + decode_mask, q_seq_lens) return output def _run_prefill_forward( @@ -205,16 +206,18 @@ class Attention(nn.Cell): query = query.view(-1, self.hidden_size_per_partition) key = key.view(-1, self.kv_hidden_size_per_partition) value = value.view(-1, self.kv_hidden_size_per_partition) - _, _, _, output = self.flash_attention(query, - key, - value, - None, - None, - None, - attn_mask, - None, - actual_seq_qlen, - actual_seq_kvlen) + _, _, _, output = self.flash_attention( + query, + key, + value, + None, + None, + None, + attn_mask, + None, + actual_seq_qlen, + actual_seq_kvlen + ) output = output.view(1, -1, self.hidden_size_per_partition) return output @@ -224,7 +227,9 @@ class Attention(nn.Cell): key_cache: Tensor, value_cache: Tensor, block_tables: Tensor, - context_lens: Tensor, + batch_valid_length: Tensor, + decode_mask: Tensor, + q_seq_lens: Tensor, ) -> Tensor: """Decode with PagedAttention. @@ -235,5 +240,15 @@ class Attention(nn.Cell): block_tables: shape = [block_size, num_block] context_lens: shape = [batch_size, ] """ - output = self.paged_attention(query, key_cache, value_cache, block_tables, context_lens) + output = self.paged_attention( + query, + key_cache, + value_cache, + block_tables, + batch_valid_length, + None, + None, + decode_mask, + q_seq_lens + ) return output diff --git a/vllm_mindspore/attention/ops/paged_attn.py b/vllm_mindspore/attention/ops/paged_attn.py index 57f58db6f9430a9d02894641786166381f9aa237..df9394c785ee29160e58ab930b82dd2a865f49e2 100644 --- a/vllm_mindspore/attention/ops/paged_attn.py +++ b/vllm_mindspore/attention/ops/paged_attn.py @@ -1,6 +1,7 @@ #!/usr/bin/env python3 # encoding: utf-8 # Copyright 2025 Huawei Technologies Co., Ltd +# Copyright 2024 The vLLM team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -30,8 +31,6 @@ if HAS_TRITON: # Should be the same as PARTITION_SIZE in `paged_attention_v2_launcher`. _PARTITION_SIZE = 512 -# TODO(tronzhang): delete all not work codes. 
- @dataclass class PagedAttentionMetadata: diff --git a/vllm_mindspore/attention/selector.py b/vllm_mindspore/attention/selector.py index 1dd046661ba137f92ed1fd2e02ef79afb698411e..34654ffc858cc8a2659d68d4720598f41628c364 100644 --- a/vllm_mindspore/attention/selector.py +++ b/vllm_mindspore/attention/selector.py @@ -1,6 +1,7 @@ #!/usr/bin/env python3 # encoding: utf-8 # Copyright 2025 Huawei Technologies Co., Ltd +# Copyright 2024 The vLLM team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/vllm_mindspore/config.py b/vllm_mindspore/config.py index 18fed6fc54a7bee1dbebc9ea2311cff048bf5cec..e702278ef6c57f4ced180c1c61ac62e85de4e564 100644 --- a/vllm_mindspore/config.py +++ b/vllm_mindspore/config.py @@ -1,6 +1,7 @@ #!/usr/bin/env python3 # encoding: utf-8 # Copyright 2025 Huawei Technologies Co., Ltd +# Copyright 2024 The vLLM team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -14,31 +15,148 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================ +import torch -from vllm_mindspore.utils import is_mindformers_model_backend +import vllm.envs as envs +from vllm.config import VllmConfig, CompilationConfig, CompilationLevel, logger +from vllm.utils import random_uuid +from vllm.logger import init_logger -def get_head_size(self) -> int: - if hasattr(self.hf_text_config, "model_type") and ( - self.hf_text_config.model_type in ("deepseek_v2", "deepseek_v3") - ): +logger = init_logger(__name__) - if is_mindformers_model_backend(): - qk_rope_head_dim = getattr(self.hf_text_config, "qk_rope_head_dim", 0) - return self.hf_text_config.kv_lora_rank + qk_rope_head_dim - # FlashAttention supports only head_size 32, 64, 128, 256, - # we need to pad head_size 192 to 256 - return 256 +def _verify_quantization(self) -> None: + # Donnot verify now. + return - if self.is_attention_free: - return 0 - if hasattr(self.hf_text_config, "head_dim"): - return self.hf_text_config.head_dim - # FIXME(woosuk): This may not be true for all models. - return self.hf_text_config.hidden_size // self.hf_text_config.num_attention_heads +def vllm_config_post_init(self): + """Verify configs are valid & consistent with each other.""" + if self.model_config is not None: + self.model_config.verify_async_output_proc(self.parallel_config, + self.speculative_config, + self.device_config) + self.model_config.verify_with_parallel_config(self.parallel_config) -def _verify_quantization(self) -> None: - # Donnot verify now. 
- return \ No newline at end of file + if self.cache_config is not None: + self.cache_config.verify_with_parallel_config(self.parallel_config) + + if self.lora_config: + self.lora_config.verify_with_cache_config(self.cache_config) + self.lora_config.verify_with_model_config(self.model_config) + self.lora_config.verify_with_scheduler_config( + self.scheduler_config) + if self.prompt_adapter_config: + self.prompt_adapter_config.verify_with_model_config( + self.model_config) + + if self.quant_config is None and \ + self.model_config is not None and self.load_config is not None: + self.quant_config = VllmConfig._get_quantization_config( + self.model_config, self.load_config) + + from vllm.platforms import current_platform + if self.scheduler_config is not None and \ + self.model_config is not None and \ + self.scheduler_config.chunked_prefill_enabled and \ + self.model_config.dtype == torch.float32 and \ + current_platform.get_device_capability() == (7, 5): + logger.warning_once( + "Turing devices tensor cores do not support float32 matmul. " + "To workaround this limitation, vLLM will set 'ieee' input " + "precision for chunked prefill triton kernels.") + + if self.compilation_config is None: + self.compilation_config = CompilationConfig() + if envs.VLLM_USE_V1 and self.model_config is not None and \ + not self.model_config.enforce_eager: + # NOTE(woosuk): Currently, we use inductor because the piecewise + # CUDA graphs do not work properly with the custom CUDA kernels. + # FIXME(woosuk): Disable inductor to reduce the compilation time + # and avoid any potential issues with the inductor. + self.compilation_config.custom_ops = ["none"] + self.compilation_config.use_cudagraph = True + self.compilation_config.use_inductor = True + self.compilation_config.cudagraph_num_of_warmups = 1 + self.compilation_config.pass_config.enable_fusion = False + self.compilation_config.pass_config.enable_reshape = False + self.compilation_config.level = CompilationLevel.PIECEWISE + + self._set_cudagraph_sizes() + + if self.cache_config is not None and \ + self.cache_config.cpu_offload_gb > 0 and \ + self.compilation_config.level != CompilationLevel.NO_COMPILATION: + logger.warning( + "CPU offload is not supported with `torch.compile` yet." + " Disabling `torch.compile`.") + self.compilation_config.level = CompilationLevel.NO_COMPILATION + + if self.lora_config is not None and self.compilation_config.level !=\ + CompilationLevel.NO_COMPILATION: + logger.warning("LoRA is not supported with `torch.compile` yet. " + "Disabling `torch.compile`.") + self.compilation_config.level = CompilationLevel.NO_COMPILATION + + current_platform.check_and_update_config(self) + + if self.model_config and self.model_config.use_mla: + logger.info("For MindSpore, MLA supports chunked prefill and prefix cache, " + "so keep them enable.") + + if not self.instance_id: + self.instance_id = random_uuid()[:5] + + +def _verify_args(self) -> None: + if (self.max_num_batched_tokens < self.max_model_len + and not self.chunked_prefill_enabled): + logger.warning( + f"max_num_batched_tokens ({self.max_num_batched_tokens}) is " + f"smaller than max_model_len ({self.max_model_len}). " + "This effectively limits the maximum sequence length to " + "max_num_batched_tokens and makes vLLM reject longer " + "sequences. 
Please increase max_num_batched_tokens or " + "decrease max_model_len.") + + if self.max_num_batched_tokens < self.max_num_seqs: + raise ValueError( + f"max_num_batched_tokens ({self.max_num_batched_tokens}) must " + "be greater than or equal to max_num_seqs " + f"({self.max_num_seqs}).") + + if self.num_lookahead_slots < 0: + raise ValueError( + "num_lookahead_slots " + f"({self.num_lookahead_slots}) must be greater than or " + "equal to 0.") + + if self.num_scheduler_steps < 1: + raise ValueError( + "num_scheduler_steps " + f"({self.num_scheduler_steps}) must be greater than or " + "equal to 1.") + + if self.max_num_partial_prefills < 1: + raise ValueError( + f"max_num_partial_prefills ({self.max_num_partial_prefills}) " + "must be greater than or equal to 1.") + elif self.max_num_partial_prefills > 1: + if not self.chunked_prefill_enabled: + raise ValueError("Chunked prefill must be enabled to set " + "max_num_partial_prefills > 1.") + + if self.long_prefill_token_threshold > self.max_model_len: + raise ValueError( + "long_prefill_token_threshold " + f"({self.long_prefill_token_threshold}) cannot be greater " + f"than the max_model_len ({self.max_model_len}).") + + if (self.max_long_partial_prefills + < 1) or (self.max_long_partial_prefills + > self.max_num_partial_prefills): + raise ValueError( + f"max_long_partial_prefills ({self.max_long_partial_prefills}) " + "must be greater than or equal to 1 and less than or equal to " + f"max_num_partial_prefills ({self.max_num_partial_prefills}).") diff --git a/vllm_mindspore/distributed/communication_op.py b/vllm_mindspore/distributed/communication_op.py index 58c8c1e8e5f81306dc000cfea0819b6b3666fff6..00447432e546516bf4d8629c374ac36e491041e8 100644 --- a/vllm_mindspore/distributed/communication_op.py +++ b/vllm_mindspore/distributed/communication_op.py @@ -1,3 +1,20 @@ +#!/usr/bin/env python3 +# encoding: utf-8 +# Copyright 2025 Huawei Technologies Co., Ltd +# Copyright 2024 The vLLM team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ # 该文件实现底层通信接口, 要求动静统一, 最后才可以在网络中入图。 diff --git a/vllm_mindspore/distributed/parallel_state.py b/vllm_mindspore/distributed/parallel_state.py deleted file mode 100644 index b669f82f90bbd0ee825ca81c7c924b1cc5124b7c..0000000000000000000000000000000000000000 --- a/vllm_mindspore/distributed/parallel_state.py +++ /dev/null @@ -1,77 +0,0 @@ -#!/usr/bin/env python3 -# encoding: utf-8 -# Copyright 2025 Huawei Technologies Co., Ltd -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================ - -import pickle -from typing import List, Optional, Any - -import numpy as np -import torch -import torch.distributed - - -def init_model_parallel_group( - group_ranks: List[List[int]], - local_rank: int, - backend: str, - use_custom_allreduce: Optional[bool] = None, - use_message_queue_broadcaster: bool = False, - group_name: Optional[str] = None, -) -> "GroupCoordinator": - from vllm.distributed.parallel_state import ( - GroupCoordinator, - _ENABLE_CUSTOM_ALL_REDUCE, - ) - - if use_custom_allreduce is None: - use_custom_allreduce = _ENABLE_CUSTOM_ALL_REDUCE - - # TODO(tronzhang): mindspore doesnot support enough communicate cpu ops, set use_message_queue_broadcaster to False now. - return GroupCoordinator( - group_ranks=group_ranks, - local_rank=local_rank, - torch_distributed_backend=backend, - use_pynccl=False, - use_custom_allreduce=use_custom_allreduce, - use_tpu_communicator=True, - use_hpu_communicator=True, - use_xpu_communicator=True, - use_message_queue_broadcaster=False, - group_name=group_name, - ) - - -def all_reduce_for_GroupCoordinator(self, input_: torch.Tensor) -> torch.Tensor: - """ - User-facing all-reduce function before we actually call the - all-reduce operation. - - We need this because Dynamo does not support passing an arbitrary - object (`self` in this case) to a custom op. We need to pass the - group name as a string, and then look up the group coordinator from - the group name, dispatch the all-reduce operation to the group - coordinator. - - In addition, PyTorch custom ops do not support mutation or returning - a new tensor in the same op. So we always make the all-reduce operation - out-of-place. - """ - # Bypass the function if we are using only 1 GPU. - if self.world_size == 1: - return input_ - - torch.distributed.all_reduce(input_, group=self.device_group) - return input_ diff --git a/vllm_mindspore/engine/__init__.py b/vllm_mindspore/engine/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/vllm_mindspore/engine/multiprocessing/__init__.py b/vllm_mindspore/engine/multiprocessing/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/vllm_mindspore/engine/multiprocessing/engine.py b/vllm_mindspore/engine/multiprocessing/engine.py new file mode 100644 index 0000000000000000000000000000000000000000..c91658e38ae8067c022682d7b7fe960105345b12 --- /dev/null +++ b/vllm_mindspore/engine/multiprocessing/engine.py @@ -0,0 +1,4 @@ +def cleanup(self): + self.ctx.destroy(linger=0) + if model_executor := getattr(self.engine, "model_executor", None): + model_executor.shutdown() \ No newline at end of file diff --git a/vllm_mindspore/entrypoints.py b/vllm_mindspore/entrypoints.py index 208acb72441c8f8bdc5e5fd0b50f1d5cbac38494..aa91f07aeae5036c651fc3a1b5a2b205b6a68203 100644 --- a/vllm_mindspore/entrypoints.py +++ b/vllm_mindspore/entrypoints.py @@ -1,6 +1,7 @@ #!/usr/bin/env python3 # encoding: utf-8 # Copyright 2025 Huawei Technologies Co., Ltd +# Copyright 2024 The vLLM team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
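Note on the new vllm_mindspore/engine/multiprocessing/engine.py above: the `cleanup` function is bound onto vLLM's `MQLLMEngine` by plain attribute assignment in `vllm_mindspore/__init__.py` (see the `MQLLMEngine.cleanup = cleanup` hunk earlier in this patch). A minimal self-contained sketch of that patching style, using stand-in classes instead of the real zmq context and engine (the `_Fake*` names are assumptions for illustration only, not part of the patch):

    # Sketch only: the _Fake* classes stand in for the zmq context and the
    # wrapped LLM engine that MQLLMEngine normally holds.
    class _FakeCtx:
        def destroy(self, linger=0):
            print(f"zmq context destroyed (linger={linger})")

    class _FakeExecutor:
        @staticmethod
        def shutdown():
            print("model executor shut down")

    class _FakeEngine:
        model_executor = _FakeExecutor()

    class _FakeMQLLMEngine:
        def __init__(self):
            self.ctx = _FakeCtx()
            self.engine = _FakeEngine()

    def cleanup(self):
        self.ctx.destroy(linger=0)
        if model_executor := getattr(self.engine, "model_executor", None):
            model_executor.shutdown()

    # Same patching style as `MQLLMEngine.cleanup = cleanup` in __init__.py.
    _FakeMQLLMEngine.cleanup = cleanup
    _FakeMQLLMEngine().cleanup()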
diff --git a/vllm_mindspore/executor/multiproc_worker_utils.py b/vllm_mindspore/executor/multiproc_worker_utils.py index e2dc5bab62c11c54f29278058564278400bf366e..86986fa6f628661ee1c6ac2a13c20eb92213fd1a 100644 --- a/vllm_mindspore/executor/multiproc_worker_utils.py +++ b/vllm_mindspore/executor/multiproc_worker_utils.py @@ -1,6 +1,7 @@ #!/usr/bin/env python3 # encoding: utf-8 # Copyright 2025 Huawei Technologies Co., Ltd +# Copyright 2024 The vLLM team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -19,5 +20,4 @@ import multiprocessing def get_mp_context(): - # TODO(tronzhang): support spawn latter... return multiprocessing.get_context("fork") diff --git a/vllm_mindspore/executor/ray_gpu_executor.py b/vllm_mindspore/executor/ray_gpu_executor.py index 8b9cd11abba67a7fd4d5dcf391f580fbd32ba694..d9c2affd6364eeb11794dac75f9163f7c915d587 100644 --- a/vllm_mindspore/executor/ray_gpu_executor.py +++ b/vllm_mindspore/executor/ray_gpu_executor.py @@ -1,6 +1,7 @@ #!/usr/bin/env python3 # encoding: utf-8 # Copyright 2025 Huawei Technologies Co., Ltd +# Copyright 2024 The vLLM team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -14,6 +15,7 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================ +import os from typing import Dict, List, Optional from collections import defaultdict @@ -24,7 +26,7 @@ from vllm.logger import init_logger from vllm.config import ParallelConfig from vllm.platforms import current_platform from vllm.executor.ray_utils import RayWorkerWrapper, ray, available_resources_per_node -from vllm.executor.ray_gpu_executor import PlacementGroupSchedulingStrategy +from vllm.executor.ray_distributed_executor import PlacementGroupSchedulingStrategy logger = init_logger(__name__) @@ -37,220 +39,265 @@ class MsRayWorkerWrapper(RayWorkerWrapper): def ms_init_workers_ray(self, placement_group: "PlacementGroup", **ray_remote_kwargs): - if (self.parallel_config.tensor_parallel_size == 1 - and self.parallel_config.pipeline_parallel_size == 1): - # For single GPU case, we use a ray worker with constrained memory. - num_gpus = self.cache_config.gpu_memory_utilization - else: - # Otherwise, the ray workers are allocated with a full GPU. - num_gpus = 1 - - # The driver dummy worker does not actually use any resources. - # It holds the resource for the driver worker. - self.driver_dummy_worker: Optional[RayWorkerWrapper] = None - # The remaining workers are the actual ray actors. - self.workers: List[RayWorkerWrapper] = [] - - # Used in ray compiled DAG: indexed first by PP rank, - # and then TP rank. In other words, the inner list is - # the TP group of workers for a PP rank. - self.pp_tp_workers: List[List[RayWorkerWrapper]] = [] - - if self.parallel_config.ray_workers_use_nsight: - ray_remote_kwargs = self._configure_ray_workers_use_nsight( - ray_remote_kwargs) - - logger.info("use_ray_spmd_worker: %s", self.use_ray_spmd_worker) - - # Create the workers. 
- driver_ip = get_ip() - workers = [] - for bundle_id, bundle in enumerate(placement_group.bundle_specs): - if not bundle.get("NPU", 0): - continue - scheduling_strategy = PlacementGroupSchedulingStrategy( - placement_group=placement_group, - placement_group_capture_child_tasks=True, - placement_group_bundle_index=bundle_id, - ) - - worker = ray.remote( - num_cpus=0, - num_gpus=0, - resources={"NPU": 1}, - scheduling_strategy=scheduling_strategy, - **ray_remote_kwargs, - )(MsRayWorkerWrapper).remote(vllm_config=self.vllm_config) - workers.append(worker) - - worker_ip_refs = [ - worker.get_node_ip.remote() # type: ignore[attr-defined] - for worker in workers - ] - worker_ips = ray.get(worker_ip_refs) - - if not self.use_ray_spmd_worker: - for i in range(len(workers)): - worker = workers[i] - worker_ip = worker_ips[i] - if self.driver_dummy_worker is None and worker_ip == driver_ip: - # If the worker is on the same node as the driver, we use it - # as the resource holder for the driver process. - self.driver_dummy_worker = worker - self.driver_worker = MsRayWorkerWrapper( - vllm_config=self.vllm_config) - workers.pop(i) - worker_ips.pop(i) - self.workers = workers - break - else: - self.workers = workers - - logger.debug("workers: %s", self.workers) - logger.debug("driver_dummy_worker: %s", self.driver_dummy_worker) - if not self.use_ray_spmd_worker and self.driver_dummy_worker is None: - raise ValueError( - "Ray does not allocate any GPUs on the driver node. Consider " - "adjusting the Ray placement group or running the driver on a " - "NPU node.") - - ip_counts: Dict[str, int] = {} - for ip in worker_ips: - ip_counts[ip] = ip_counts.get(ip, 0) + 1 - - worker_to_ip = dict(zip(self.workers, worker_ips)) - - def sort_by_driver_then_worker_ip(worker): - """ - Sort the workers based on 3 properties: - 1. If the worker is on the same node as the driver (vllm engine), - it should be placed first. - 2. Then, if the worker is on a node with fewer workers, it should - be placed first. - 3. Finally, if the work is on a node with smaller IP address, it - should be placed first. - """ - ip = worker_to_ip[worker] - return (ip != driver_ip, ip_counts[ip], ip) - - # After sorting, the workers on the same node will be - # close to each other, and the workers on the driver - # node will be placed first. - self.workers = sorted(self.workers, key=sort_by_driver_then_worker_ip) - - # Get the set of GPU IDs used on each node. - worker_node_and_gpu_ids = [] - for worker in [self.driver_dummy_worker] + self.workers: - if worker is None: - # driver_dummy_worker can be None when using ray spmd worker. - continue - worker_node_and_gpu_ids.append( - ray.get(worker.get_node_and_gpu_ids.remote()) \ - ) # type: ignore - - node_workers = defaultdict(list) # node id -> list of worker ranks - node_gpus = defaultdict(list) # node id -> list of gpu ids - - for i, (node_id, gpu_ids) in enumerate(worker_node_and_gpu_ids): - node_workers[node_id].append(i) - # `gpu_ids` can be a list of strings or integers. - # convert them to integers for consistency. - # NOTE: gpu_ids can be larger than 9 (e.g. 16 GPUs), - # string sorting is not sufficient. 
- # see https://github.com/vllm-project/vllm/issues/5590 - gpu_ids = [int(x) for x in gpu_ids] - node_gpus[node_id].extend(gpu_ids) - for node_id, gpu_ids in node_gpus.items(): - node_gpus[node_id] = sorted(gpu_ids) - - all_ips = set(worker_ips + [driver_ip]) - n_ips = len(all_ips) - n_nodes = len(node_workers) - - if n_nodes != n_ips: - raise RuntimeError( - f"Every node should have a unique IP address. Got {n_nodes}" - f" nodes with node ids {list(node_workers.keys())} and " - f"{n_ips} unique IP addresses {all_ips}. Please check your" - " network configuration. If you set `VLLM_HOST_IP`" - " environment variable, make sure it is unique for" - " each node.") - - # Set environment variables for the driver and workers. - all_args_to_update_environment_variables = [({ - "CUDA_VISIBLE_DEVICES": - ",".join(map(str, node_gpus[node_id])), - "VLLM_TRACE_FUNCTION": - str(envs.VLLM_TRACE_FUNCTION), - **({ - "VLLM_ATTENTION_BACKEND": envs.VLLM_ATTENTION_BACKEND - } if envs.VLLM_ATTENTION_BACKEND is not None else {}) - }, ) for (node_id, _) in worker_node_and_gpu_ids] - - self._env_vars_for_all_workers = ( - all_args_to_update_environment_variables) - - self._run_workers("update_environment_variables", - all_args=self._get_env_vars_to_be_updated()) - - if len(node_gpus) == 1: - # in single node case, we don't need to get the IP address. - # the loopback address is sufficient - # NOTE: a node may have several IP addresses, one for each - # network interface. `get_ip()` might return any of them, - # while they might not work for communication inside the node - # if the network setup is complicated. Using the loopback address - # solves this issue, as it always works for communication inside - # the node. - driver_ip = "127.0.0.1" - distributed_init_method = get_distributed_init_method( - driver_ip, get_open_port()) - - # Initialize the actual workers inside worker wrapper. - init_worker_all_kwargs = [ - self._get_worker_kwargs( - local_rank=node_workers[node_id].index(rank), - rank=rank, - distributed_init_method=distributed_init_method, - ) for rank, (node_id, _) in enumerate(worker_node_and_gpu_ids) - ] - self._run_workers("init_worker", all_kwargs=init_worker_all_kwargs) - - self._run_workers("init_device") - self._run_workers("load_model", - max_concurrent_workers=self.parallel_config. - max_parallel_loading_workers) - - if self.use_ray_spmd_worker: - for pp_rank in range(self.parallel_config.pipeline_parallel_size): - self.pp_tp_workers.append([]) - for tp_rank in range( - self.parallel_config.tensor_parallel_size): - # PP=2, TP=4 - # pp_tp_workers = [[0, 1, 2, 3], [4, 5, 6, 7]] - rank = (pp_rank * self.parallel_config.tensor_parallel_size - ) + tp_rank - assert len(self.pp_tp_workers[pp_rank]) == tp_rank - assert pp_rank < len(self.pp_tp_workers) - self.pp_tp_workers[pp_rank].append(self.workers[rank]) - - # This is the list of workers that are rank 0 of each TP group EXCEPT - # global rank 0. These are the workers that will broadcast to the - # rest of the workers. - self.tp_driver_workers: List[RayWorkerWrapper] = [] - # This is the list of workers that are not drivers and not the first - # worker in a TP group. These are the workers that will be - # broadcasted to. - self.non_driver_workers: List[RayWorkerWrapper] = [] - - # Enforce rank order for correct rank to return final output. - for index, worker in enumerate(self.workers): - # The driver worker is rank 0 and not in self.workers. 
- rank = index + 1 - if rank % self.parallel_config.tensor_parallel_size == 0: - self.tp_driver_workers.append(worker) + from vllm.executor.ray_distributed_executor import RayWorkerMetaData + num_gpus = envs.VLLM_RAY_PER_WORKER_GPUS + + # The driver dummy worker does not actually use any resources. + # It holds the resource for the driver worker. + self.driver_dummy_worker: Optional[RayWorkerWrapper] = None + # The remaining workers are the actual ray actors. + self.workers: List[RayWorkerWrapper] = [] + + # Used in ray compiled DAG: indexed first by PP rank, + # and then TP rank. In other words, the inner list is + # the TP group of workers for a PP rank. + self.pp_tp_workers: List[List[RayWorkerWrapper]] = [] + + if self.parallel_config.ray_workers_use_nsight: + ray_remote_kwargs = self._configure_ray_workers_use_nsight( + ray_remote_kwargs) + + logger.info("use_ray_spmd_worker: %s", self.use_ray_spmd_worker) + + # Create the workers. + bundle_indices: List[int] + if envs.VLLM_RAY_BUNDLE_INDICES: + # Use the bundle indices specified by the user. + bundle_indices = list( + map(int, envs.VLLM_RAY_BUNDLE_INDICES.split(","))) + assert len(bundle_indices) == self.parallel_config.world_size, \ + ("VLLM_RAY_BUNDLE_INDICES must have the same size" + f" as the world size, but got {bundle_indices=} " + f"and {self.parallel_config.world_size=}") + assert len(set(bundle_indices)) == len(bundle_indices), \ + ("VLLM_RAY_BUNDLE_INDICES cannot have duplicate values," + f" but got {bundle_indices=}") else: - self.non_driver_workers.append(worker) + # use the first N bundles that have GPU resources. + bundle_indices = [] + for bundle_id, bundle in enumerate(placement_group.bundle_specs): + if bundle.get(current_platform.ray_device_key, 0): + bundle_indices.append(bundle_id) + bundle_indices = bundle_indices[:self.parallel_config.world_size] + + worker_metadata: List[RayWorkerMetaData] = [] + driver_ip = get_ip() + for rank, bundle_id in enumerate(bundle_indices): + scheduling_strategy = PlacementGroupSchedulingStrategy( + placement_group=placement_group, + placement_group_capture_child_tasks=True, + placement_group_bundle_index=bundle_id, + ) + + if current_platform.ray_device_key == "GPU": + # NV+AMD GPUs, and Intel XPUs + worker = ray.remote( + num_cpus=0, + num_gpus=num_gpus, + scheduling_strategy=scheduling_strategy, + **ray_remote_kwargs, + )(RayWorkerWrapper).remote(vllm_config=self.vllm_config, + rpc_rank=rank) + else: + worker = ray.remote( + num_cpus=0, + num_gpus=0, + resources={current_platform.ray_device_key: num_gpus}, + scheduling_strategy=scheduling_strategy, + **ray_remote_kwargs, + )(MsRayWorkerWrapper).remote(vllm_config=self.vllm_config, + rpc_rank=rank) + worker_metadata.append( + RayWorkerMetaData(worker=worker, created_rank=rank)) + + worker_ips = ray.get([ + each.worker.get_node_ip.remote() # type: ignore[attr-defined] + for each in worker_metadata + ]) + + for each, ip in zip(worker_metadata, worker_ips): + each.ip = ip + + if not self.use_ray_spmd_worker: + for i, each in enumerate(worker_metadata): + # find and remove the dummy worker from the list + worker = each.worker + worker_ip = each.ip + if self.driver_dummy_worker is None and worker_ip == driver_ip: + # If the worker is on the same node as the driver, we use it + # as the resource holder for the driver process. 
+ self.driver_dummy_worker = worker + self.driver_worker = MsRayWorkerWrapper( + vllm_config=self.vllm_config, rpc_rank=0) + worker_metadata.pop(i) + break + + logger.debug("workers: %s", worker_metadata) + logger.debug("driver_dummy_worker: %s", self.driver_dummy_worker) + if not self.use_ray_spmd_worker and self.driver_dummy_worker is None: + raise ValueError( + "Ray does not allocate any GPUs on the driver node. Consider " + "adjusting the Ray placement group or running the driver on a " + "GPU node.") + + ip_counts: Dict[str, int] = {} + for ip in worker_ips: + ip_counts[ip] = ip_counts.get(ip, 0) + 1 + + def sort_by_driver_then_worker_ip(item: RayWorkerMetaData): + """ + Sort the workers based on 3 properties: + 1. If the worker is on the same node as the driver (vllm engine), + it should be placed first. + 2. Then, if the worker is on a node with fewer workers, it should + be placed first. + 3. Finally, if the work is on a node with smaller IP address, it + should be placed first. + """ + ip = item.ip + return (0 if ip == driver_ip else 1, ip_counts[ip], ip) + + # After sorting, the workers on the same node will be + # close to each other, and the workers on the driver + # node will be placed first. + sorted_worker_metadata = sorted(worker_metadata, + key=sort_by_driver_then_worker_ip) + start_rank = 0 if self.use_ray_spmd_worker else 1 + for i, item in enumerate(sorted_worker_metadata): + item.adjusted_rank = i + start_rank + self.workers = [item.worker for item in sorted_worker_metadata] + rerank_mapping = { + item.created_rank: item.adjusted_rank + for item in sorted_worker_metadata + } + self._run_workers("adjust_rank", rerank_mapping) + + # Get the set of GPU IDs used on each node. + worker_node_and_gpu_ids = [] + for worker in [self.driver_dummy_worker] + self.workers: + if worker is None: + # driver_dummy_worker can be None when using ray spmd worker. + continue + worker_node_and_gpu_ids.append( + ray.get(worker.get_node_and_gpu_ids.remote()) \ + ) # type: ignore + + node_workers = defaultdict(list) # node id -> list of worker ranks + node_gpus = defaultdict(list) # node id -> list of gpu ids + + for i, (node_id, gpu_ids) in enumerate(worker_node_and_gpu_ids): + node_workers[node_id].append(i) + # `gpu_ids` can be a list of strings or integers. + # convert them to integers for consistency. + # NOTE: gpu_ids can be larger than 9 (e.g. 16 GPUs), + # string sorting is not sufficient. + # see https://github.com/vllm-project/vllm/issues/5590 + gpu_ids = [int(x) for x in gpu_ids] + node_gpus[node_id].extend(gpu_ids) + for node_id, gpu_ids in node_gpus.items(): + node_gpus[node_id] = sorted(gpu_ids) + + all_ips = set(worker_ips + [driver_ip]) + n_ips = len(all_ips) + n_nodes = len(node_workers) + + if n_nodes != n_ips: + raise RuntimeError( + f"Every node should have a unique IP address. Got {n_nodes}" + f" nodes with node ids {list(node_workers.keys())} and " + f"{n_ips} unique IP addresses {all_ips}. Please check your" + " network configuration. If you set `VLLM_HOST_IP`" + " environment variable, make sure it is unique for" + " each node.") + + # Set environment variables for the driver and workers. 
+ all_args_to_update_environment_variables = [{ + current_platform.device_control_env_var: + ",".join(map(str, node_gpus[node_id])), + } for (node_id, _) in worker_node_and_gpu_ids] + + for args in all_args_to_update_environment_variables: + # some carry-over env vars from the driver + # TODO: refactor platform-specific env vars + for name in [ + "VLLM_ATTENTION_BACKEND", + "TPU_CHIPS_PER_HOST_BOUNDS", + "TPU_HOST_BOUNDS", + "VLLM_USE_V1", + "VLLM_TRACE_FUNCTION", + ]: + if name in os.environ: + args[name] = os.environ[name] + + self._env_vars_for_all_workers = ( + all_args_to_update_environment_variables) + + self._run_workers("update_environment_variables", + self._get_env_vars_to_be_updated()) + + if len(node_gpus) == 1: + # in single node case, we don't need to get the IP address. + # the loopback address is sufficient + # NOTE: a node may have several IP addresses, one for each + # network interface. `get_ip()` might return any of them, + # while they might not work for communication inside the node + # if the network setup is complicated. Using the loopback address + # solves this issue, as it always works for communication inside + # the node. + driver_ip = "127.0.0.1" + distributed_init_method = get_distributed_init_method( + driver_ip, get_open_port()) + + # Initialize the actual workers inside worker wrapper. + all_kwargs = [] + for rank, (node_id, _) in enumerate(worker_node_and_gpu_ids): + local_rank = node_workers[node_id].index(rank) + kwargs = dict( + vllm_config=self.vllm_config, + local_rank=local_rank, + rank=rank, + distributed_init_method=distributed_init_method, + is_driver_worker=(not self.parallel_config) + or (rank % self.parallel_config.tensor_parallel_size == 0), + ) + all_kwargs.append(kwargs) + self._run_workers("init_worker", all_kwargs) + + self._run_workers("init_device") + self._run_workers("load_model", + max_concurrent_workers=self.parallel_config. + max_parallel_loading_workers) + + if self.use_ray_spmd_worker: + for pp_rank in range(self.parallel_config.pipeline_parallel_size): + self.pp_tp_workers.append([]) + for tp_rank in range( + self.parallel_config.tensor_parallel_size): + # PP=2, TP=4 + # pp_tp_workers = [[0, 1, 2, 3], [4, 5, 6, 7]] + rank = (pp_rank * self.parallel_config.tensor_parallel_size + ) + tp_rank + assert len(self.pp_tp_workers[pp_rank]) == tp_rank + assert pp_rank < len(self.pp_tp_workers) + self.pp_tp_workers[pp_rank].append(self.workers[rank]) + + # This is the list of workers that are rank 0 of each TP group EXCEPT + # global rank 0. These are the workers that will broadcast to the + # rest of the workers. + self.tp_driver_workers: List[RayWorkerWrapper] = [] + # This is the list of workers that are not drivers and not the first + # worker in a TP group. These are the workers that will be + # broadcasted to. + self.non_driver_workers: List[RayWorkerWrapper] = [] + + # Enforce rank order for correct rank to return final output. + for index, worker in enumerate(self.workers): + # The driver worker is rank 0 and not in self.workers. 
+ rank = index + 1 + if rank % self.parallel_config.tensor_parallel_size == 0: + self.tp_driver_workers.append(worker) + else: + self.non_driver_workers.append(worker) def initialize_ray_cluster( @@ -354,3 +401,4 @@ def initialize_ray_cluster( _verify_bundles(current_placement_group, parallel_config, device_str) # Set the placement group in the parallel config parallel_config.placement_group = current_placement_group + diff --git a/vllm_mindspore/model_executor/custom_op.py b/vllm_mindspore/model_executor/custom_op.py index 7b913ef8510bb294993e35c8d2345f24673484e7..a8c273f5f264a172a4d5daf60088bc734a11e60f 100644 --- a/vllm_mindspore/model_executor/custom_op.py +++ b/vllm_mindspore/model_executor/custom_op.py @@ -1,6 +1,7 @@ #!/usr/bin/env python3 # encoding: utf-8 # Copyright 2025 Huawei Technologies Co., Ltd +# Copyright 2024 The vLLM team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/vllm_mindspore/model_executor/layers/activation.py b/vllm_mindspore/model_executor/layers/activation.py index afc2b79399f6906f0ac2e64f9ecb56de97155f2c..a1d94ecae6d9db7ce364f7cef67f67bbabf0139a 100644 --- a/vllm_mindspore/model_executor/layers/activation.py +++ b/vllm_mindspore/model_executor/layers/activation.py @@ -1,6 +1,7 @@ #!/usr/bin/env python3 # encoding: utf-8 # Copyright 2025 Huawei Technologies Co., Ltd +# Copyright 2024 The vLLM team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/vllm_mindspore/model_executor/layers/layernorm.py b/vllm_mindspore/model_executor/layers/layernorm.py index dd497e08eafd32b624977d355fe95f80d46f2276..db156c0cc0dd0605fef0e48d0d79a1a489b9a4ea 100644 --- a/vllm_mindspore/model_executor/layers/layernorm.py +++ b/vllm_mindspore/model_executor/layers/layernorm.py @@ -1,6 +1,7 @@ #!/usr/bin/env python3 # encoding: utf-8 # Copyright 2025 Huawei Technologies Co., Ltd +# Copyright 2024 The vLLM team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/vllm_mindspore/model_executor/layers/linear.py b/vllm_mindspore/model_executor/layers/linear.py index 62142ac8c44316a3bef0f195b1b325ea9b5d77c0..45aa4c4399a47e4912aa4fda35b19cdfaaae38d5 100644 --- a/vllm_mindspore/model_executor/layers/linear.py +++ b/vllm_mindspore/model_executor/layers/linear.py @@ -1,6 +1,7 @@ #!/usr/bin/env python3 # encoding: utf-8 # Copyright 2025 Huawei Technologies Co., Ltd +# Copyright 2024 The vLLM team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
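Note on the rewritten `ms_init_workers_ray` above: worker placement now follows upstream vLLM's `RayDistributedExecutor` flow, where bundle indices are taken either from `VLLM_RAY_BUNDLE_INDICES` or from the first placement-group bundles that expose the platform's device resource key (e.g. "NPU"), and workers are then re-ranked so those on the driver node come first. A standalone sketch of just the bundle-selection step, with made-up bundle specs (the helper name and sample data are illustrative, not part of the patch):

    import os

    def select_bundle_indices(bundle_specs, world_size, device_key="NPU"):
        """Pick placement-group bundles the same way ms_init_workers_ray does."""
        env = os.environ.get("VLLM_RAY_BUNDLE_INDICES", "")
        if env:
            indices = [int(x) for x in env.split(",")]
            assert len(indices) == world_size, "must match the world size"
            assert len(set(indices)) == len(indices), "no duplicate indices"
            return indices
        # Otherwise use the first N bundles that actually carry a device resource.
        indices = [i for i, bundle in enumerate(bundle_specs) if bundle.get(device_key, 0)]
        return indices[:world_size]

    # Example: two NPU bundles plus a CPU-only bundle, tensor-parallel world size 2.
    print(select_bundle_indices([{"NPU": 1}, {"CPU": 1}, {"NPU": 1}], 2))  # -> [0, 2]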
@@ -55,6 +56,7 @@ WEIGHT_LOADER_V2_SUPPORTED = [ "IPEXAWQLinearMethod", "IPEXGPTQLinearMethod", "HQQMarlinMethod", + "QuarkLinearMethod" ] @@ -272,20 +274,19 @@ class ColumnParallelLinear(LinearBase): use_bitsandbytes_4bit = getattr(param, "use_bitsandbytes_4bit", False) - param_data = param.data # bitsandbytes loads the weights of the specific portion # no need to narrow here if output_dim is not None and not use_bitsandbytes_4bit: - shard_size = param_data.shape[output_dim] + shard_size = param.shape[output_dim] start_idx = tp_rank * shard_size - loaded_weight = loaded_weight.narrow(output_dim, start_idx, shard_size) + loaded_weight = loaded_weight.narrow(output_dim, start_idx, shard_size).contiguous() # Special case for loading scales off disk, which often do not # have a shape (such as in the case of AutoFP8). if len(loaded_weight.shape) == 0: loaded_weight = loaded_weight.reshape(1) - assert param_data.shape == loaded_weight.shape + assert param.shape == loaded_weight.shape # param_data.copy_(loaded_weight) # param.set_data(loaded_weight) # param[:, start_idx:start_idx + shard_size] = loaded_weight @@ -376,7 +377,7 @@ class MergedColumnParallelLinear(ColumnParallelLinear): # bitsandbytes loads the weights of the specific portion # no need to narrow here if not use_bitsandbytes_4bit: - loaded_weight = loaded_weight.narrow(output_dim, start_idx, shard_size) + loaded_weight = loaded_weight.narrow(output_dim, start_idx, shard_size).contiguous() assert param_data.shape == loaded_weight.shape # param_data.copy_(loaded_weight) # param_data.set_data(loaded_weight) @@ -459,7 +460,7 @@ class QKVParallelLinear(ColumnParallelLinear): start_idx = shard_id * shard_size if not use_bitsandbytes_4bit: - loaded_weight = loaded_weight.narrow(output_dim, start_idx, shard_size) + loaded_weight = loaded_weight.narrow(output_dim, start_idx, shard_size).contiguous() assert param_data.shape == loaded_weight.shape if param.name.endswith("weight"): self.weight[shard_offset: shard_offset + shard_size, :] = loaded_weight @@ -498,19 +499,21 @@ class RowParallelLinear(LinearBase): input_size, output_size, skip_bias_add, params_dtype, quant_config, prefix ) - self.input_is_parallel = input_is_parallel - self.reduce_results = reduce_results - # Divide the weight matrix along the last dimension. 
self.tp_rank = get_tensor_model_parallel_rank() self.tp_size = get_tensor_model_parallel_world_size() self.input_size_per_partition = divide(input_size, self.tp_size) + self.output_size_per_partition = output_size + self.output_partition_sizes = [output_size] + self.input_is_parallel = input_is_parallel + self.reduce_results = reduce_results + assert self.quant_method is not None self.quant_method.create_weights( layer=self, input_size_per_partition=self.input_size_per_partition, - output_partition_sizes=[self.output_size], + output_partition_sizes=self.output_partition_sizes, input_size=self.input_size, output_size=self.output_size, params_dtype=self.params_dtype, @@ -527,7 +530,7 @@ class RowParallelLinear(LinearBase): ) if bias: - self.bias = Parameter(mint.zeros(self.output_size), dtype=params_dtype) + self.bias = Parameter(mint.zeros(self.output_size, dtype=params_dtype)) set_weight_attrs( self.bias, { @@ -568,24 +571,26 @@ class RowParallelLinear(LinearBase): def weight_loader(self, param, loaded_weight): tp_rank = get_tensor_model_parallel_rank() - tp_size = get_tensor_model_parallel_world_size() input_dim = getattr(param, "input_dim", None) use_bitsandbytes_4bit = getattr(param, "use_bitsandbytes_4bit", False) + is_sharded_weight = getattr(param, "is_sharded_weight", False) + # bitsandbytes loads the weights of the specific portion + # no need to narrow + is_sharded_weight = is_sharded_weight or use_bitsandbytes_4bit - param_data = param.data # bitsandbytes loads the weights of the specific portion # no need to narrow here - if input_dim is not None and not use_bitsandbytes_4bit: - shard_size = param_data.shape[input_dim] + if input_dim is not None and not is_sharded_weight: + shard_size = param.shape[input_dim] start_idx = tp_rank * shard_size - loaded_weight = loaded_weight.narrow(input_dim, start_idx, shard_size) + loaded_weight = loaded_weight.narrow(input_dim, start_idx, shard_size).contiguous() # Special case for loading scales off disk, which often do not # have a shape (such as in the case of AutoFP8). if len(loaded_weight.shape) == 0: loaded_weight = loaded_weight.reshape(1) - assert param_data.shape == loaded_weight.shape + assert param.shape == loaded_weight.shape # param_data.copy_(loaded_weight) # self.weight[:, start_idx : start_idx + shard_size] = loaded_weight - param.set_data(loaded_weight) + param.set_data(loaded_weight.contiguous()) diff --git a/vllm_mindspore/model_executor/layers/logits_processor.py b/vllm_mindspore/model_executor/layers/logits_processor.py index 9399e518a38f57177857a865e48961881b786976..647b4ac837fd86ca078bdb01847fa7d88959d614 100644 --- a/vllm_mindspore/model_executor/layers/logits_processor.py +++ b/vllm_mindspore/model_executor/layers/logits_processor.py @@ -1,6 +1,7 @@ #!/usr/bin/env python3 # encoding: utf-8 # Copyright 2025 Huawei Technologies Co., Ltd +# Copyright 2024 The vLLM team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
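Note on the `RowParallelLinear.weight_loader` change above: each tensor-parallel rank now narrows the checkpoint weight along the input dimension to its own shard and makes the slice contiguous before `set_data`, instead of going through `param.data`. A minimal sketch of that slicing with plain torch tensors (the shapes and rank value are illustrative assumptions):

    import torch

    def shard_for_rank(loaded_weight: torch.Tensor, param_shape, input_dim: int,
                       tp_rank: int) -> torch.Tensor:
        """Slice the full weight the way weight_loader does for one TP rank."""
        shard_size = param_shape[input_dim]   # width of the local shard
        start_idx = tp_rank * shard_size
        return loaded_weight.narrow(input_dim, start_idx, shard_size).contiguous()

    full_weight = torch.arange(8 * 16, dtype=torch.float32).reshape(8, 16)
    # The local parameter holds input_size // tp_size columns, e.g. 16 // 4 = 4.
    local_shard = shard_for_rank(full_weight, param_shape=(8, 4), input_dim=1, tp_rank=1)
    print(local_shard.shape)  # torch.Size([8, 4]): columns 4..7 of the full weight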
@@ -16,12 +17,14 @@ # ============================================================================ """A layer that compute logits from hidden_stats.""" import inspect +from concurrent.futures import ThreadPoolExecutor from typing import Optional import mindspore.nn as nn -from mindspore import Tensor -from mindspore import mint +from mindspore import Tensor, ops, mint, nn +import vllm.envs as envs +from vllm.config import get_current_vllm_config from vllm.distributed import ( tensor_model_parallel_all_gather, tensor_model_parallel_gather, @@ -32,8 +35,11 @@ from vllm_mindspore.model_executor.layers.vocab_parallel_embedding import ( from vllm_mindspore.model_executor.sampling_metadata import SamplingMetadata from vllm.platforms import current_platform -# TODO(tronzhang): Use vllm's logits_processor.py latter... +_logits_processor_threadpool: Optional[ThreadPoolExecutor] = None +if envs.VLLM_LOGITS_PROCESSOR_THREADS is not None: + _logits_processor_threadpool = ThreadPoolExecutor( + envs.VLLM_LOGITS_PROCESSOR_THREADS) class LogitsProcessor(nn.Cell): """Process logits and apply logits processors from sampling metadata. @@ -66,7 +72,10 @@ class LogitsProcessor(nn.Cell): # Soft cap the logits. Used in Gemma 2. self.soft_cap = soft_cap # Whether to use gather or all-gather to gather the logits. - self.use_gather = not current_platform.is_tpu() + parallel_config = get_current_vllm_config().parallel_config + self.use_gather = not current_platform.is_tpu() \ + or envs.VLLM_USE_V1 \ + or parallel_config.distributed_executor_backend == "external_launcher" def construct( self, @@ -105,7 +114,7 @@ class LogitsProcessor(nn.Cell): embedding_bias: Optional[Tensor], ) -> Optional[Tensor]: # Get the logits for the next tokens. - logits = lm_head.linear_method.apply( + logits = lm_head.quant_method.apply( lm_head, hidden_states, bias=embedding_bias ) if self.use_gather: @@ -138,7 +147,7 @@ def _prune_hidden_states( # (warmup, profile_run) we might not have selected_token_indices, # so we skip pruning. 
if sampling_metadata.selected_token_indices is not None: - return hidden_states.index_select(0, sampling_metadata.selected_token_indices) + return ops.gather(hidden_states, sampling_metadata.selected_token_indices, 0) else: return hidden_states @@ -149,6 +158,7 @@ def _apply_logits_processors( ) -> Tensor: found_logits_processors = False logits_processed = 0 + logits_row_ids_and_logits_row_futures = [] for seq_group in sampling_metadata.seq_groups: seq_ids = seq_group.seq_ids sampling_params = seq_group.sampling_params @@ -161,22 +171,39 @@ def _apply_logits_processors( past_tokens_ids = seq_group.seq_data[seq_id].output_token_ids prompt_tokens_ids = seq_group.seq_data[seq_id].prompt_token_ids - for logits_processor in logits_processors: - parameters = inspect.signature(logits_processor).parameters - if len(parameters) == 3: - logits_row = logits_processor( - prompt_tokens_ids, past_tokens_ids, logits_row - ) - else: - logits_row = logits_processor(past_tokens_ids, logits_row) - - logits[logits_row_idx] = logits_row + if _logits_processor_threadpool is not None: + logits_row_ids_and_logits_row_futures.append( + (logits_row_idx, + _logits_processor_threadpool.submit( + _apply_logits_processors_single_seq, logits_row, + logits_processors, past_tokens_ids, + prompt_tokens_ids))) + else: + logits[logits_row_idx] = \ + _apply_logits_processors_single_seq( + logits_row, logits_processors, past_tokens_ids, + prompt_tokens_ids) logits_processed += len(seq_group.sample_indices) + len( seq_group.prompt_logprob_indices ) + + for logits_row_idx, future in logits_row_ids_and_logits_row_futures: + logits[logits_row_idx] = future.result() if found_logits_processors: # verifies that no rows in logits were missed unexpectedly assert logits_processed == logits.shape[0] return logits + +def _apply_logits_processors_single_seq(logits_row, logits_processors, + past_tokens_ids, + prompt_tokens_ids) -> Tensor: + for logits_processor in logits_processors: + parameters = inspect.signature(logits_processor).parameters + if len(parameters) == 3: + logits_row = logits_processor(prompt_tokens_ids, past_tokens_ids, + logits_row) + else: + logits_row = logits_processor(past_tokens_ids, logits_row) + return logits_row \ No newline at end of file diff --git a/vllm_mindspore/model_executor/layers/quantization/base_config.py b/vllm_mindspore/model_executor/layers/quantization/base_config.py index ea259ee65742a5089a36a0966520144b6a8973bc..afc957ce8f662fd50ef29f927dd96227d3671dfa 100644 --- a/vllm_mindspore/model_executor/layers/quantization/base_config.py +++ b/vllm_mindspore/model_executor/layers/quantization/base_config.py @@ -1,6 +1,7 @@ #!/usr/bin/env python3 # encoding: utf-8 # Copyright 2025 Huawei Technologies Co., Ltd +# Copyright 2024 The vLLM team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -20,7 +21,6 @@ from abc import ABC, abstractmethod from typing import Any, Dict, List, Optional, Type import mindspore as ms -# TODO(tronzhang): Use vllm's quantization base_config.py latter. 
class QuantizeMethodBase(ABC): """Base class for different quantized methods.""" @@ -57,6 +57,11 @@ class QuantizeMethodBase(ABC): class QuantizationConfig(ABC): """Base class for quantization configs.""" + def __init__(self): + super().__init__() + # mapping is updated by models as they initialize + self.packed_modules_mapping: Dict[str, List[str]] = dict() + @abstractmethod def get_name(self) -> str: """Name of the quantization method.""" diff --git a/vllm_mindspore/model_executor/layers/rejection_sampler.py b/vllm_mindspore/model_executor/layers/rejection_sampler.py new file mode 100644 index 0000000000000000000000000000000000000000..b6842cf5a7e051e57212e195eaecadfce9f7b7f2 --- /dev/null +++ b/vllm_mindspore/model_executor/layers/rejection_sampler.py @@ -0,0 +1,77 @@ +#!/usr/bin/env python3 +# encoding: utf-8 +# Copyright 2025 Huawei Technologies Co., Ltd +# Copyright 2024 The vLLM team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +# the data type of finfo.tiny is not float but ndarray in msadapter, +# which cannot be used as a tensor index + +from functools import cached_property +from typing import Dict + +import torch +import mindspore as ms + +from vllm.platforms import current_platform + +@cached_property +def _smallest_positive_value(self) -> float: + """Return the smallest positive value representable by the probs dtype. + This value is used when constructing a distribution from which to sample + recovered tokens in the first rejection case. + + See _get_recovered_probs for more details + + Note that this isn't actually the smallest positive value representable + by float32, but the smallest positive normal value. + See https://en.wikipedia.org/wiki/Subnormal_number for more information. + """ + # the value type of tiny is numpy in msadapter. + return float(torch.finfo(self.probs_dtype).tiny) + + +# msadapter does not support 'exponential_' +@torch.compile(dynamic=True, backend=current_platform.simple_compile_backend) +def _multinomial( + probs: torch.Tensor, + num_samples: int, + k: int, + seeded_seqs: Dict[int, torch.Generator], +) -> torch.Tensor: + # msadapter does not support tensor.exponential_ + def exponential_(x: torch.Tensor, lambda_, generator=None): + random_x = ms.mint.rand(x.shape, generator=generator) # draw uniform random numbers in (0, 1) + return -torch.log(random_x) / lambda_ # inverse transform sampling + + if num_samples > 1: + # This is equivalent to torch.repeat_interleave (which also + # forces a GPU<->CPU sync).
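Illustrative aside, not part of the patch (the _multinomial body continues below): the exponential_ helper above substitutes for the missing Tensor.exponential_ in msadapter via inverse transform sampling, drawing U ~ Uniform(0, 1) and returning -log(U)/lambda, which is Exponential(lambda)-distributed; _multinomial then divides the probabilities by such draws and takes an argmax (the exponential-race form of Gumbel-max sampling). A numpy-only sketch of the same idea, with illustrative names:

```python
import numpy as np

def exponential_like(shape, lam=1.0, rng=None):
    """Exponential(lam) samples via the inverse CDF applied to uniform noise."""
    rng = rng or np.random.default_rng(0)
    u = rng.random(shape)          # U ~ Uniform(0, 1)
    return -np.log(u) / lam        # inverse transform: Exp(lam)

probs = np.array([[0.1, 0.6, 0.3],
                  [0.5, 0.25, 0.25]])
q = exponential_like(probs.shape)
samples = np.argmax(probs / q, axis=1)   # one categorical draw per row
print(samples)
```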
+ probs = probs[:, None, :].expand(probs.shape[0], num_samples, + probs.shape[1]).contiguous().view( + -1, probs.shape[1]) + q = torch.empty_like(probs) + if not seeded_seqs: + q = exponential_(q, 1.0) + else: + start = 0 + for idx in range(len(q) // k): + end = start + k + generator = seeded_seqs.get(idx) + # Note: generator might be None for non seeded + q[start:end] = exponential_(q[start:end], 1.0, generator=generator) + start = end + + return probs.div_(q).argmax(dim=1).view(-1, num_samples) diff --git a/vllm_mindspore/model_executor/layers/rotary_embedding.py b/vllm_mindspore/model_executor/layers/rotary_embedding.py index 77827002f512d77570e3438f47a2a6549b56ac8b..7903702a96d3e29c5f6560c534ec427f629b9e68 100644 --- a/vllm_mindspore/model_executor/layers/rotary_embedding.py +++ b/vllm_mindspore/model_executor/layers/rotary_embedding.py @@ -1,6 +1,7 @@ #!/usr/bin/env python3 # encoding: utf-8 # Copyright 2025 Huawei Technologies Co., Ltd +# Copyright 2024 The vLLM team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -155,6 +156,7 @@ class InferRotaryEmbedding(CustomOp): self.freqs_cos = Tensor(freqs_cos, dtype=dtype) self.freqs_sin = Tensor(freqs_sin, dtype=dtype) self.rotary_embedding_op = ops.ApplyRotaryPosEmb(2) + self.gather = ops.Gather() def forward_native( self, @@ -162,14 +164,14 @@ class InferRotaryEmbedding(CustomOp): query: Tensor, key: Tensor, batch_valid_length: Tensor, - num_prefill_tokens: int, + is_prefill: bool, offsets: Optional[Tensor] = None, ) -> Tuple[Tensor, Tensor]: - if num_prefill_tokens > 0: + if is_prefill: return self.rotary_embedding_op(query, key, self.freqs_cos, self.freqs_sin, batch_valid_length) - freqs_cos = self.freqs_cos.index_select(0, positions) - freqs_sin = self.freqs_sin.index_select(0, positions) + freqs_cos = self.gather(self.freqs_cos, positions, 0) + freqs_sin = self.gather(self.freqs_sin, positions, 0) return self.rotary_embedding_op(query, key, freqs_cos, freqs_sin, batch_valid_length) diff --git a/vllm_mindspore/model_executor/layers/sampler.py b/vllm_mindspore/model_executor/layers/sampler.py index 8130aecca8de824316c51a4f36aea747a18c5a62..354fb021464af6510c765eb8bdf0397021a84e67 100644 --- a/vllm_mindspore/model_executor/layers/sampler.py +++ b/vllm_mindspore/model_executor/layers/sampler.py @@ -1,6 +1,7 @@ #!/usr/bin/env python3 # encoding: utf-8 # Copyright 2025 Huawei Technologies Co., Ltd +# Copyright 2024 The vLLM team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -19,52 +20,36 @@ import itertools import warnings import mindspore as ms -from mindspore.common.api import _pynative_executor +import numpy as np from dataclasses import dataclass from importlib.util import find_spec from math import inf from typing import Dict, Iterator, List, Optional, Tuple, Union -# TODO(tronzhang): for some ops, msadaptor cannnot support, latter use vllm's... 
- import msgspec import torch import torch.nn as nn import vllm.envs as envs -from vllm_mindspore.model_executor.layers.utils import apply_penalties from vllm.sampling_params import SamplingType -from vllm.sequence import ( - VLLM_INVALID_TOKEN_ID, - CompletionSequenceGroupOutput, - Logprob, - PromptLogprobs, - SampleLogprobs, - SequenceOutput, -) +from vllm.sequence import (VLLM_INVALID_TOKEN_ID, + CompletionSequenceGroupOutput, Logprob, + PromptLogprobs, SampleLogprobs, SequenceOutput) from vllm.spec_decode.metrics import SpecDecodeWorkerMetrics - +from vllm_mindspore.model_executor.layers.utils import apply_penalties from vllm_mindspore.model_executor.sampling_metadata import ( SamplingMetadata, SamplingTensors, SequenceGroupToSample, ) -class AsyncContext: - def __enter__(self): - _pynative_executor.sync() - _pynative_executor.set_async_for_graph(True) - - def __exit__(self, exc_type, exc_value, tb): - _pynative_executor.sync() - _pynative_executor.set_async_for_graph(False) - if envs.VLLM_USE_FLASHINFER_SAMPLER and find_spec("flashinfer"): raise RuntimeError("Donot support for mindspore now.") else: flashinfer_top_k_top_p_sampling = None + def get_sampler() -> torch.nn.Module: return Sampler() @@ -74,7 +59,8 @@ SampleResultType = List[Tuple[List[int], List[int]]] # Types of temporary data structures used for # computing sample_result -SampleMetadataType = Dict[SamplingType, Tuple[List[int], List[SequenceGroupToSample]]] +SampleMetadataType = Dict[SamplingType, Tuple[List[int], + List[SequenceGroupToSample]]] MultinomialSamplesType = Dict[SamplingType, torch.Tensor] SampleResultsDictType = Dict[int, Tuple[List[int], List[int]]] @@ -95,7 +81,6 @@ class SampleResultArgsType: sample_results_dict: SampleResultsDictType sampling_metadata: SamplingMetadata greedy_samples: Optional[torch.Tensor] - beam_search_logprobs: Optional[torch.Tensor] # Union of non-deferred (single-step scheduling) @@ -108,8 +93,9 @@ SampleReturnType = Tuple[MaybeDeferredSampleResultType, Optional[torch.Tensor]] class SamplerOutput( - msgspec.Struct, omit_defaults=True, array_like=True # type: ignore[call-arg] -): # type: ignore[call-arg] + msgspec.Struct, + omit_defaults=True, # type: ignore[call-arg] + array_like=True): # type: ignore[call-arg] """For each sequence group, we generate a list of SequenceOutput object, each of which contains one possible candidate for the next token. @@ -167,24 +153,21 @@ class SamplerOutput( return len(self.outputs) def __eq__(self, other: object): - return isinstance(other, self.__class__) and self.outputs == other.outputs + return isinstance(other, + self.__class__) and self.outputs == other.outputs def __repr__(self) -> str: - """Show the shape of a tensor instead of its values to reduce noise.""" - sampled_token_probs_repr = ( - "None" - if self.sampled_token_probs is None - else self.sampled_token_probs.shape - ) - sampled_token_ids_repr = ( - "None" if self.sampled_token_ids is None else self.sampled_token_ids.shape - ) + """Show the shape of a tensor instead of its values to reduce noise. 
+ """ + sampled_token_probs_repr = ("None" if self.sampled_token_probs is None + else self.sampled_token_probs.shape) + sampled_token_ids_repr = ("None" if self.sampled_token_ids is None else + self.sampled_token_ids.shape) return ( f"SamplerOutput(outputs={self.outputs}, " f"sampled_token_probs={sampled_token_probs_repr}, " f"sampled_token_ids={sampled_token_ids_repr}, " - f"spec_decode_worker_metrics={self.spec_decode_worker_metrics})" - ) + f"spec_decode_worker_metrics={self.spec_decode_worker_metrics})") class Sampler(nn.Module): @@ -234,18 +217,16 @@ class Sampler(nn.Module): self._sampling_tensors = None # Initialize new sampling tensors - (sampling_tensors, do_penalties, do_top_p_top_k, do_min_p) = ( - SamplingTensors.from_sampling_metadata( - sampling_metadata, vocab_size, logits.device, logits.dtype - ) - ) + (sampling_tensors, do_penalties, do_top_p_top_k, + do_min_p) = SamplingTensors.from_sampling_metadata( + sampling_metadata, vocab_size, logits.device, logits.dtype) self._sampling_tensors = sampling_tensors self._do_penalties = do_penalties self._do_top_p_top_k = do_top_p_top_k self._do_min_p = do_min_p - def run_forward( + def forward( self, logits: torch.Tensor, sampling_metadata: SamplingMetadata, @@ -291,14 +272,11 @@ class Sampler(nn.Module): # Apply presence and frequency penalties. if do_penalties: - logits = apply_penalties( - logits, - sampling_tensors.prompt_tokens, - sampling_tensors.output_tokens, - sampling_tensors.presence_penalties, - sampling_tensors.frequency_penalties, - sampling_tensors.repetition_penalties, - ) + logits = apply_penalties(logits, sampling_tensors.prompt_tokens, + sampling_tensors.output_tokens, + sampling_tensors.presence_penalties, + sampling_tensors.frequency_penalties, + sampling_tensors.repetition_penalties) # Use float32 to apply temperature scaling. # Use in-place division to avoid creating a new tensor. @@ -306,9 +284,8 @@ class Sampler(nn.Module): logits.div_(sampling_tensors.temperatures.unsqueeze(dim=1)) if do_top_p_top_k and flashinfer_top_k_top_p_sampling is None: - logits = _apply_top_k_top_p( - logits, sampling_tensors.top_ps, sampling_tensors.top_ks - ) + logits = _apply_top_k_top_p(logits, sampling_tensors.top_ps, + sampling_tensors.top_ks) if do_min_p: logits = _apply_min_p(logits, sampling_tensors.min_ps) @@ -317,7 +294,6 @@ class Sampler(nn.Module): # Compute the probabilities. probs = torch.softmax(logits, dim=-1, dtype=torch.float) # Compute the log probabilities. - logprobs = ms.ops.log_softmax(logits, axis=-1).to(torch.float) # Sample the next tokens. @@ -346,10 +322,10 @@ class Sampler(nn.Module): sample_logprobs = None if not sampling_metadata.skip_sampler_cpu_output: # Pythonize logprobs now (GPU -> CPU); do not defer. 
- assert not isinstance(maybe_deferred_sample_results, SampleResultArgsType) + assert not isinstance(maybe_deferred_sample_results, + SampleResultArgsType) prompt_logprobs, sample_logprobs = get_logprobs( - logprobs, sampling_metadata, maybe_deferred_sample_results - ) + logprobs, sampling_metadata, maybe_deferred_sample_results) return _build_sampler_output( maybe_deferred_sample_results, @@ -357,16 +333,14 @@ class Sampler(nn.Module): prompt_logprobs, sample_logprobs, on_device_tensors=on_device_tensors, - skip_sampler_cpu_output=sampling_metadata.skip_sampler_cpu_output, - ) + skip_sampler_cpu_output=sampling_metadata.skip_sampler_cpu_output) - def forward( + def __call__( self, logits: torch.Tensor, sampling_metadata: SamplingMetadata, ) -> Optional[SamplerOutput]: - with AsyncContext() as ctx: - return self.run_forward(logits, sampling_metadata) + return self.forward(logits, sampling_metadata) @property def _should_modify_greedy_probs_inplace(self) -> bool: @@ -388,7 +362,7 @@ def _apply_min_tokens_penalty( sampling_metadata: SamplingMetadata, ) -> torch.Tensor: """Apply min_tokens penalty which sets stop tokens to -inf if min_tokens - have not been generated yet + have not been generated yet """ # list of indices in logits that will be set to -inf logits_to_penalize: List[Tuple[int, int]] = [] @@ -398,7 +372,8 @@ def _apply_min_tokens_penalty( sampling_params = seq_group.sampling_params sample_indices = seq_group.sample_indices - logits_applied += len(sample_indices) + len(seq_group.prompt_logprob_indices) + logits_applied += len(sample_indices) + len( + seq_group.prompt_logprob_indices) if not seq_group.do_sample: continue @@ -417,8 +392,7 @@ def _apply_min_tokens_penalty( seqs_to_penalize = [start_idx + j for j in seqs_to_penalize] # itertools.product pairs each seq index with every token id logits_to_penalize.extend( - itertools.product(seqs_to_penalize, token_ids_to_penalize) - ) + itertools.product(seqs_to_penalize, token_ids_to_penalize)) if logits_to_penalize: # use zip and * to group indices along each dimension @@ -435,27 +409,27 @@ def _apply_top_k_top_p( p: torch.Tensor, k: torch.Tensor, ) -> torch.Tensor: - logits_sort, logits_idx = logits.sort(axis=-1, descending=False) + logits_sort, logits_idx = logits.sort(dim=-1, descending=False) # Apply top-k. top_k_mask = logits_sort.size(1) - k.to(torch.long) # Get all the top_k values. - top_k_mask = logits_sort.gather(top_k_mask, 0) + top_k_mask = logits_sort.gather(1, top_k_mask.unsqueeze(dim=1)) top_k_mask = logits_sort < top_k_mask - logits_sort.masked_fill(top_k_mask, -float("inf")) + logits_sort.masked_fill_(top_k_mask, -float("inf")) # Apply top-p. - probs_sort = logits_sort.softmax(axis=-1) + probs_sort = logits_sort.softmax(-1) probs_sum = probs_sort.cumsum(axis=-1) top_p_mask = probs_sum <= 1 - p.unsqueeze(dim=1) # at least one top_p_mask[:, -1] = False - logits_sort.masked_fill(top_p_mask, -float("inf")) + logits_sort.masked_fill_(top_p_mask, -float("inf")) # Re-sort the probabilities. 
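Illustrative aside, not part of the patch (the re-sort step of _apply_top_k_top_p continues below): the hunk above sorts each logits row in ascending order, gathers the k-th largest value as a threshold for top-k masking, then masks the low-probability prefix whose cumulative probability stays at or below 1 - p, and finally scatters the surviving values back to their original positions. A tiny self-contained example with made-up numbers:

```python
import torch

logits = torch.tensor([[2.0, 0.5, 1.0, -1.0]])
k = torch.tensor([2])            # keep the 2 largest logits
p = torch.tensor([0.9])          # then keep the top-p nucleus

logits_sort, logits_idx = logits.sort(dim=-1, descending=False)
# top-k: values below the k-th largest are masked out
threshold = logits_sort.gather(1, (logits_sort.size(1) - k).unsqueeze(1))
logits_sort.masked_fill_(logits_sort < threshold, -float("inf"))
# top-p: drop the low-probability prefix of the ascending row
probs_sort = logits_sort.softmax(-1)
top_p_mask = probs_sort.cumsum(dim=-1) <= 1 - p.unsqueeze(1)
top_p_mask[:, -1] = False        # always keep at least one token
logits_sort.masked_fill_(top_p_mask, -float("inf"))
# undo the sort
filtered = torch.empty_like(logits_sort).scatter_(-1, logits_idx, logits_sort)
print(filtered)                  # tensor([[2., -inf, 1., -inf]])
```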
- logits = torch.empty_like(logits_sort).scatter( - axis=-1, index=logits_idx, src=logits_sort - ) + logits = torch.empty_like(logits_sort).scatter_(dim=-1, + index=logits_idx, + src=logits_sort) return logits @@ -467,11 +441,11 @@ def _apply_min_p( Adapted from https://github.com/oobabooga/text-generation-webui/blob/3146124ec01f02c8fb1650a6517cf1b60b537aaf/modules/sampler_hijack.py#L16C17-L16C17 """ - probs = torch.softmax(logits, axis=-1) - top_probs, _ = probs.max(axis=-1, keepdims=True) - scaled_min_p = min_p.unsqueeze_(axis=1) * top_probs + probs = torch.softmax(logits, dim=-1) + top_probs, _ = probs.max(dim=-1, keepdim=True) + scaled_min_p = min_p.unsqueeze_(dim=1) * top_probs tokens_to_remove = probs < scaled_min_p - logits = logits.masked_fill(tokens_to_remove, -float("inf")) + logits = logits.masked_fill_(tokens_to_remove, -float("inf")) return logits @@ -502,7 +476,8 @@ def _greedy_sample( seq_ids = seq_group.seq_ids num_parent_seqs = len(seq_ids) - assert num_parent_seqs == 1, "Greedy sampling should have only one seq." + assert num_parent_seqs == 1, ( + "Greedy sampling should have only one seq.") parent_ids = list(range(num_parent_seqs)) next_token_ids = [samples_lst[sample_idx]] results.append((next_token_ids, parent_ids)) @@ -529,6 +504,7 @@ def _random_sample( # Find the maximum n value of the prompt phase requests. sample_idx = 0 results: SampleResultType = [] + random_samples = random_samples.asnumpy() for seq_group in selected_seq_groups: if not seq_group.do_sample: results.append(([], [])) @@ -541,13 +517,13 @@ def _random_sample( if is_prompt: # Prompt phase. parent_ids = [0] * sampling_params.n - next_token_ids = random_samples[sample_idx, : sampling_params.n].tolist() + next_token_ids = random_samples[ + sample_idx, :sampling_params.n].tolist() else: # Generation phase. parent_ids = list(range(num_parent_seqs)) - next_token_ids = random_samples[ - sample_idx : sample_idx + num_parent_seqs, 0 - ].tolist() + next_token_ids = random_samples[sample_idx:sample_idx + + num_parent_seqs, 0].tolist() results.append((next_token_ids, parent_ids)) sample_idx += num_parent_seqs return results @@ -588,25 +564,29 @@ def _beam_search_sample( seq_ids, sampling_params = seq_group.seq_ids, seq_group.sampling_params num_parent_seqs = len(seq_ids) beam_width = sampling_params.n - seq_group_logprobs = logprobs[sample_idx : sample_idx + num_parent_seqs] + seq_group_logprobs = logprobs[sample_idx:sample_idx + num_parent_seqs] if is_prompt: # Prompt phase. - assert num_parent_seqs == 1, "Prompt input should have only one seq." + assert num_parent_seqs == 1, ( + "Prompt input should have only one seq.") parent_ids = [0] * (2 * beam_width) - _, next_token_ids = torch.topk(seq_group_logprobs[0], 2 * beam_width) + _, next_token_ids = torch.topk(seq_group_logprobs[0], + 2 * beam_width) next_token_ids = next_token_ids.tolist() else: # Generation phase. 
cumulative_logprobs: List[float] = [ - seq_group.seq_data[seq_id].cumulative_logprob for seq_id in seq_ids + seq_group.seq_data[seq_id].cumulative_logprob + for seq_id in seq_ids ] cumulative_logprobs_tensor = torch.tensor( - cumulative_logprobs, dtype=torch.float, device=seq_group_logprobs.device - ) - seq_group_logprobs = ( - seq_group_logprobs + cumulative_logprobs_tensor.unsqueeze(1) - ) - _, topk_ids = torch.topk(seq_group_logprobs.flatten(), 2 * beam_width) + cumulative_logprobs, + dtype=torch.float, + device=seq_group_logprobs.device) + seq_group_logprobs = (seq_group_logprobs + + cumulative_logprobs_tensor.unsqueeze(dim=1)) + _, topk_ids = torch.topk(seq_group_logprobs.flatten(), + 2 * beam_width) topk_ids = topk_ids.tolist() vocab_size = seq_group_logprobs.size(-1) parent_ids = [i // vocab_size for i in topk_ids] @@ -617,16 +597,6 @@ def _beam_search_sample( return results -def exponential(x, lambd=1.0, *, generator=None): - if generator is not None: - raise ValueError("`generator` can not be supported.") - import numpy as np - import mindspore as ms - - output = np.random.exponential(scale=lambd, size=x.shape) - return ms.Tensor(output).astype(x.dtype) - - # torch.multinomial forces a GPU<->CPU sync. # Therefore, we use an optimized implementation instead. # Note that we always sample with replacement. @@ -641,37 +611,30 @@ def _multinomial( probs = probs.repeat_interleave(num_samples, dim=0) q = torch.empty_like(probs) if seq_groups is None: - q = exponential(q) - # q.exponential_() + q.exponential_() else: sample_idx = 0 for seq_group in seq_groups: seq_ids = seq_group.seq_ids stride = len(seq_ids) * num_samples assert seq_group.generator is not None - q[sample_idx : sample_idx + stride] = exponential( - q[sample_idx : sample_idx + stride] - ) - # q[sample_idx:sample_idx + - # stride].exponential_(generator=seq_group.generator) + q[sample_idx : sample_idx + + stride].exponential_(generator=seq_group.generator) sample_idx += stride - return probs.div(q).argmax(axis=1).view(-1, num_samples) + return probs.div_(q).argmax(dim=1).view(-1, num_samples) def _top_k_top_p_multinomial_with_flashinfer( - probs: torch.Tensor, - top_ks: torch.Tensor, - top_ps: torch.Tensor, - num_samples: int, - seq_groups: Optional[List[SequenceGroupToSample]], -): + probs: torch.Tensor, top_ks: torch.Tensor, top_ps: torch.Tensor, + num_samples: int, seq_groups: Optional[List[SequenceGroupToSample]]): max_top_k_round = 32 if num_samples > 1: probs = probs.repeat_interleave(num_samples, dim=0) top_ks = top_ks.repeat_interleave(num_samples) top_ps = top_ps.repeat_interleave(num_samples) batch_size = probs.shape[0] - uniform_samples = torch.empty((max_top_k_round, batch_size), device=probs.device) + uniform_samples = torch.empty((max_top_k_round, batch_size), + device=probs.device) if seq_groups is None: uniform_samples.uniform_() else: @@ -680,9 +643,8 @@ def _top_k_top_p_multinomial_with_flashinfer( seq_ids = seq_group.seq_ids stride = len(seq_ids) * num_samples assert seq_group.generator is not None - uniform_samples[:, sample_idx : sample_idx + stride].uniform_( - generator=seq_group.generator - ) + uniform_samples[:, sample_idx:sample_idx + + stride].uniform_(generator=seq_group.generator) sample_idx += stride batch_next_token_ids, success = flashinfer_top_k_top_p_sampling( probs, @@ -691,19 +653,18 @@ def _top_k_top_p_multinomial_with_flashinfer( top_ps, ) if not success.all(): - warnings.warn("FlashInfer rejection sampling failed, fallback.", stacklevel=1) + warnings.warn("FlashInfer rejection sampling 
failed, fallback.", + stacklevel=1) probs = flashinfer.sampling.top_k_renorm_prob(probs, top_ks) probs = flashinfer.sampling.top_p_renorm_prob(probs, top_ps) batch_next_token_ids = flashinfer.sampling.sampling_from_probs( - probs, uniform_samples[0] - ) + probs, uniform_samples[0]) return batch_next_token_ids.view(-1, num_samples) def get_pythonized_sample_results( - sample_result_args: SampleResultArgsType, -) -> SampleResultType: - """This function consumes GPU-side sampler results and computes + sample_result_args: SampleResultArgsType) -> SampleResultType: + '''This function consumes GPU-side sampler results and computes Pythonized CPU-side sampler results (GPU -> CPU sync.) Single-step scheduling: this function is invoked at sampling-time @@ -717,21 +678,19 @@ def get_pythonized_sample_results( Returns: Pythonized sampler results - """ + ''' ( sample_metadata, sampling_metadata, greedy_samples, multinomial_samples, - beam_search_logprobs, sample_results_dict, ) = ( sample_result_args.sample_metadata, sample_result_args.sampling_metadata, sample_result_args.greedy_samples, sample_result_args.multinomial_samples, - sample_result_args.beam_search_logprobs, sample_result_args.sample_results_dict, ) @@ -742,11 +701,8 @@ def get_pythonized_sample_results( if sampling_type == SamplingType.GREEDY: sample_results = _greedy_sample(seq_groups, greedy_samples) elif sampling_type in (SamplingType.RANDOM, SamplingType.RANDOM_SEED): - sample_results = _random_sample( - seq_groups, multinomial_samples[sampling_type] - ) - elif sampling_type == SamplingType.BEAM: - sample_results = _beam_search_sample(seq_groups, beam_search_logprobs) + sample_results = _random_sample(seq_groups, + multinomial_samples[sampling_type]) sample_results_dict.update(zip(seq_group_id, sample_results)) return [ @@ -763,7 +719,7 @@ def _sample_with_torch( include_gpu_probs_tensor: bool, modify_greedy_probs: bool, ) -> SampleReturnType: - """Torch-oriented _sample() implementation. + '''Torch-oriented _sample() implementation. Single-step scheduling: * Perform GPU-side sampling computation @@ -773,11 +729,11 @@ def _sample_with_torch( * Perform GPU-side sampling computation * Defer Pythonization & preserve GPU-side tensors required for Pythonization - """ + ''' - categorized_seq_group_ids: Dict[SamplingType, List[int]] = { - t: [] for t in SamplingType - } + categorized_seq_group_ids: Dict[SamplingType, + List[int]] = {t: [] + for t in SamplingType} categorized_sample_indices = sampling_metadata.categorized_sample_indices for i, seq_group in enumerate(sampling_metadata.seq_groups): sampling_params = seq_group.sampling_params @@ -788,16 +744,13 @@ def _sample_with_torch( sample_metadata: SampleMetadataType = {} multinomial_samples: MultinomialSamplesType = {} greedy_samples: Optional[torch.Tensor] = None - beam_search_logprobs: Optional[torch.Tensor] = None # Create output tensor for sampled token ids. 
if include_gpu_probs_tensor: - sampled_token_ids_tensor = torch.full( - (logprobs.shape[0], 1), - VLLM_INVALID_TOKEN_ID, - dtype=torch.long, - device=logprobs.device, - ) + sampled_token_ids_tensor = torch.full((logprobs.shape[0], 1), + VLLM_INVALID_TOKEN_ID, + dtype=torch.long, + device=logprobs.device) else: sampled_token_ids_tensor = None @@ -814,21 +767,21 @@ def _sample_with_torch( sample_metadata[sampling_type] = (seq_group_id, seq_groups) long_sample_indices = sample_indices.long() if sampling_type == SamplingType.GREEDY: - greedy_samples = torch.argmax(logprobs[long_sample_indices], dim=-1) + greedy_samples = torch.argmax(logprobs[long_sample_indices], + dim=-1) if sampled_token_ids_tensor is not None: # Store sampled tokens in output tensor. - sampled_token_ids_tensor[long_sample_indices] = ( - greedy_samples.unsqueeze(-1) - ) + sampled_token_ids_tensor[ + long_sample_indices] = greedy_samples.unsqueeze(-1) if modify_greedy_probs: # If required, modify the probabilities such that sampling from # the modified distribution would always sample the argmax # token id. - _modify_greedy_probs_inplace( - logprobs, probs, long_sample_indices, greedy_samples - ) + _modify_greedy_probs_inplace(logprobs, probs, + long_sample_indices, + greedy_samples) elif sampling_type in (SamplingType.RANDOM, SamplingType.RANDOM_SEED): max_n_in_batch = 1 @@ -836,35 +789,29 @@ def _sample_with_torch( if seq_group.is_prompt: sampling_params = seq_group.sampling_params max_n_in_batch = max(max_n_in_batch, sampling_params.n) - seq_groups_arg = ( - None if sampling_type == SamplingType.RANDOM else seq_groups - ) + seq_groups_arg = (None if sampling_type == SamplingType.RANDOM else + seq_groups) if flashinfer_top_k_top_p_sampling is not None: - multinomial_samples[sampling_type] = ( - _top_k_top_p_multinomial_with_flashinfer( + multinomial_samples[ + sampling_type] = _top_k_top_p_multinomial_with_flashinfer( probs[long_sample_indices], sampling_tensors.top_ks[long_sample_indices], sampling_tensors.top_ps[long_sample_indices], max_n_in_batch, seq_groups_arg, ) - ) else: multinomial_samples[sampling_type] = _multinomial( probs[long_sample_indices], max_n_in_batch, - seq_groups=seq_groups_arg, - ) + seq_groups=seq_groups_arg) if sampled_token_ids_tensor is not None: # Store sampled tokens in output tensor. - sampled_token_ids_tensor[long_sample_indices] = multinomial_samples[ - sampling_type - ].to(torch.long) + sampled_token_ids_tensor[long_sample_indices] = \ + multinomial_samples[sampling_type].to(torch.long) - elif sampling_type == SamplingType.BEAM: - beam_search_logprobs = logprobs[sample_indices] else: raise ValueError(f"Unsupported sampling type: {sampling_type}") @@ -875,18 +822,14 @@ def _sample_with_torch( sample_metadata=sample_metadata, multinomial_samples=multinomial_samples, greedy_samples=greedy_samples, - beam_search_logprobs=beam_search_logprobs, - sample_results_dict=sample_results_dict, - ) + sample_results_dict=sample_results_dict) if not sampling_metadata.skip_sampler_cpu_output: # GPU<->CPU sync happens here. # This also converts the sampler output to a Python object. 
# Return Pythonized sampler result & sampled token ids - return ( - get_pythonized_sample_results(maybe_deferred_args), - sampled_token_ids_tensor, - ) + return get_pythonized_sample_results( + maybe_deferred_args), sampled_token_ids_tensor else: # Defer sampler result Pythonization; return deferred # Pythonization args & sampled token ids @@ -940,8 +883,9 @@ def _get_ranks(x: torch.Tensor, indices: torch.Tensor) -> torch.Tensor: Each element in the returned tensor represents the rank of the chosen token in the input logprob tensor. """ - vals = x[torch.arange(0, len(x), device=x.device, dtype=indices.dtype), indices] - result = x > vals[:, None] + vals = x[torch.arange(0, len(x), device=x.device, dtype=indices.dtype), + indices] + result = (x > vals[:, None]) del vals return result.sum(1).add_(1) @@ -989,14 +933,15 @@ def get_logprobs( # Select indices to compute logprob from, ranks of token ids, and the top # k token ids from logprobs. - for seq_group, sample_result in zip(sampling_metadata.seq_groups, sample_results): + for (seq_group, sample_result) in zip(sampling_metadata.seq_groups, + sample_results): sampling_params = seq_group.sampling_params # Update indices and tokens for prompt logprobs. - if seq_group.is_prompt and sampling_params.prompt_logprobs is not None: - largest_num_logprobs = max( - largest_num_logprobs, sampling_params.prompt_logprobs - ) + if (seq_group.is_prompt + and sampling_params.prompt_logprobs is not None): + largest_num_logprobs = max(largest_num_logprobs, + sampling_params.prompt_logprobs) next_prompt_tokens = _get_next_prompt_tokens(seq_group) query_indices.extend(seq_group.prompt_logprob_indices) next_token_ids.extend(next_prompt_tokens) @@ -1010,14 +955,12 @@ def get_logprobs( # we can obtain it from `sample_result[1]`. query_idx = seq_group.sample_indices[0] query_indices.extend( - [query_idx + parent_id for parent_id in parent_seq_ids] - ) + [query_idx + parent_id for parent_id in parent_seq_ids]) next_token_ids.extend(token_ids) if sampling_params.logprobs is not None: - largest_num_logprobs = max( - largest_num_logprobs, sampling_params.logprobs - ) + largest_num_logprobs = max(largest_num_logprobs, + sampling_params.logprobs) assert len(next_token_ids) == len(query_indices) @@ -1033,16 +976,15 @@ def get_logprobs( # skip the whole logprob calculation. if largest_num_logprobs >= 0: query_indices_gpu = torch.tensor(query_indices, device=logprobs.device) - next_token_ids_gpu = torch.tensor(next_token_ids, device=logprobs.device) + next_token_ids_gpu = torch.tensor(next_token_ids, + device=logprobs.device) # (num_selected_query_tokens, num_logprobs). Note that query_indices can # contain duplicates if beam search is enabled. - selected_logprobs = logprobs[ - [ - query_indices_gpu, - next_token_ids_gpu, - ] - ] + selected_logprobs = logprobs[[ + query_indices_gpu, + next_token_ids_gpu, + ]] ranks = _get_ranks( logprobs[query_indices_gpu], next_token_ids_gpu, @@ -1053,14 +995,14 @@ def get_logprobs( if largest_num_logprobs > 0: # Logprobs of topk tokens for a batch of sequence groups. # (num_query_tokens_across_batch). 
- top_logprobs, top_token_ids = torch.topk( - logprobs, largest_num_logprobs, dim=-1 - ) - top_logprobs = top_logprobs.to("cpu") - top_token_ids = top_token_ids.to("cpu") + top_logprobs, top_token_ids = torch.topk(logprobs, + largest_num_logprobs, + dim=-1) + top_logprobs = top_logprobs.to('cpu') + top_token_ids = top_token_ids.to('cpu') - selected_logprobs = selected_logprobs.to("cpu") - ranks = ranks.to("cpu") + selected_logprobs = selected_logprobs.to('cpu') + ranks = ranks.to('cpu') # Find prompt/sample logprobs. prompt_logprobs_per_seq_group: List[Optional[PromptLogprobs]] = [] @@ -1068,32 +1010,18 @@ def get_logprobs( top_logprob_idx = 0 selected_logprobs_idx = 0 - for seq_group, sample_result in zip(sampling_metadata.seq_groups, sample_results): - (prompt_logprobs, top_logprob_idx, selected_logprobs_idx) = ( - _get_prompt_logprob_if_needed( - seq_group, - selected_logprobs, - ranks, - top_token_ids, - top_logprobs, - selected_logprobs_idx, - top_logprob_idx, - ) - ) + for seq_group, sample_result in zip(sampling_metadata.seq_groups, + sample_results): + (prompt_logprobs, top_logprob_idx, + selected_logprobs_idx) = _get_prompt_logprob_if_needed( + seq_group, selected_logprobs, ranks, top_token_ids, top_logprobs, + selected_logprobs_idx, top_logprob_idx) prompt_logprobs_per_seq_group.append(prompt_logprobs) - (sampled_logprobs, top_logprob_idx, selected_logprobs_idx) = ( - _get_sampled_logprob_if_needed( - seq_group, - sample_result, - selected_logprobs, - ranks, - top_token_ids, - top_logprobs, - selected_logprobs_idx, - top_logprob_idx, - ) - ) + (sampled_logprobs, top_logprob_idx, + selected_logprobs_idx) = _get_sampled_logprob_if_needed( + seq_group, sample_result, selected_logprobs, ranks, top_token_ids, + top_logprobs, selected_logprobs_idx, top_logprob_idx) sample_logprobs_per_seq_group.append(sampled_logprobs) return prompt_logprobs_per_seq_group, sample_logprobs_per_seq_group @@ -1121,11 +1049,10 @@ def _get_prompt_logprob_if_needed( # Pre-select indexes and create a list. It is faster than calling .item # repetitively. selected_logprob_items = selected_logprobs[ - selected_logprobs_idx : selected_logprobs_idx + len(next_prompt_tokens) - ].tolist() - rank_items = ranks[ - selected_logprobs_idx : selected_logprobs_idx + len(next_prompt_tokens) - ].tolist() + selected_logprobs_idx:selected_logprobs_idx + + len(next_prompt_tokens)].tolist() + rank_items = ranks[selected_logprobs_idx:selected_logprobs_idx + + len(next_prompt_tokens)].tolist() for idx, token_id in enumerate(next_prompt_tokens): # Calculate the prompt logprob of the real prompt tokens. @@ -1136,23 +1063,22 @@ def _get_prompt_logprob_if_needed( # Add top K prompt logprobs along with its rank. if num_logprobs > 0: - top_ids = top_token_ids[top_logprob_idx, :num_logprobs].tolist() - top_probs = top_logprobs[top_logprob_idx, :num_logprobs].tolist() + top_ids = top_token_ids[ + top_logprob_idx, :num_logprobs].tolist() + top_probs = top_logprobs[ + top_logprob_idx, :num_logprobs].tolist() # Top K is already sorted by rank, so we can use 1 ~ # num_logprobs + 1 for rank. 
top_ranks = range(1, num_logprobs + 1) - prompt_logprobs_dict.update( - { - top_id: (top_prob, rank) - for top_id, top_prob, rank in zip(top_ids, top_probs, top_ranks) - } - ) - prompt_logprobs.append( - { - token_id: Logprob(*logprob_and_rank) - for token_id, logprob_and_rank in prompt_logprobs_dict.items() - } - ) + prompt_logprobs_dict.update({ + top_id: (top_prob, rank) + for top_id, top_prob, rank in zip(top_ids, top_probs, + top_ranks) + }) + prompt_logprobs.append({ + token_id: Logprob(*logprob_and_rank) + for token_id, logprob_and_rank in prompt_logprobs_dict.items() + }) # + 1 to go to the next prompt token. top_logprob_idx += 1 @@ -1187,44 +1113,37 @@ def _get_sampled_logprob_if_needed( # Pre-select items from tensor. tolist() is faster than repetitive # `.item()` calls. selected_logprob_items = selected_logprobs[ - selected_logprobs_idx : selected_logprobs_idx + len(next_token_ids) - ].tolist() - rank_items = ranks[ - selected_logprobs_idx : selected_logprobs_idx + len(next_token_ids) - ].tolist() + selected_logprobs_idx:selected_logprobs_idx + + len(next_token_ids)].tolist() + rank_items = ranks[selected_logprobs_idx:selected_logprobs_idx + + len(next_token_ids)].tolist() for idx, (next_token_id, parent_id) in enumerate( - zip(next_token_ids, parent_seq_ids) - ): + zip(next_token_ids, parent_seq_ids)): # Get the logprob of a sampled token. sampled_logprobs_dict = { - next_token_id: (selected_logprob_items[idx], rank_items[idx]) + next_token_id: + (selected_logprob_items[idx], rank_items[idx]) } if num_logprobs is not None and num_logprobs > 0: # Get top K logprobs. - top_ids = top_token_ids[ - top_logprob_idx + parent_id, :num_logprobs - ].tolist() + top_ids = top_token_ids[top_logprob_idx + + parent_id, :num_logprobs].tolist() top_probs = top_logprobs[ - top_logprob_idx + parent_id, :num_logprobs - ].tolist() + top_logprob_idx + parent_id, :num_logprobs].tolist() # Top K is already sorted by rank, so we can use 1 ~ # num_logprobs + 1 for rank. top_ranks = range(1, num_logprobs + 1) - sampled_logprobs_dict.update( - { - top_id: (top_prob, rank) - for top_id, top_prob, rank in zip( - top_ids, top_probs, top_ranks - ) - } - ) + sampled_logprobs_dict.update({ + top_id: (top_prob, rank) + for top_id, top_prob, rank in zip( + top_ids, top_probs, top_ranks) + }) - sampled_logprobs.append( - { - token_id: Logprob(*logprob_and_rank) - for token_id, logprob_and_rank in sampled_logprobs_dict.items() - } - ) + sampled_logprobs.append({ + token_id: Logprob(*logprob_and_rank) + for token_id, logprob_and_rank in + sampled_logprobs_dict.items() + }) # NOTE: This part of code is not intuitive. `selected_logprobs` include # logprobs for the current step, which has len(next_token_ids) tokens @@ -1238,12 +1157,9 @@ def _get_sampled_logprob_if_needed( return sampled_logprobs, top_logprob_idx, selected_logprobs_idx -def _modify_greedy_probs_inplace( - logprobs: torch.Tensor, - probs: torch.Tensor, - sample_indices: torch.Tensor, - greedy_samples: torch.Tensor, -) -> None: +def _modify_greedy_probs_inplace(logprobs: torch.Tensor, probs: torch.Tensor, + sample_indices: torch.Tensor, + greedy_samples: torch.Tensor) -> None: """Modify the probability distributions of the greedily-sampled tokens such that each sampled token has a "probability" of 1.0. 
This is required by speculative decoding, which depends on the sampling method being encoded @@ -1296,7 +1212,8 @@ def _build_sampler_output( sampling_metadata: SamplingMetadata, prompt_logprobs: Optional[List[Optional[PromptLogprobs]]], sample_logprobs: Optional[List[SampleLogprobs]], - on_device_tensors: Optional[Tuple[torch.Tensor, torch.Tensor, torch.Tensor]], + on_device_tensors: Optional[Tuple[torch.Tensor, torch.Tensor, + torch.Tensor]], skip_sampler_cpu_output: bool = False, ) -> SamplerOutput: """Construct Python objects with the output of sampling. @@ -1315,46 +1232,40 @@ def _build_sampler_output( else: assert prompt_logprobs is not None assert sample_logprobs is not None - assert not isinstance(maybe_deferred_sample_results, SampleResultArgsType) + assert not isinstance(maybe_deferred_sample_results, + SampleResultArgsType) deferred_sample_results_args = None - for ( - seq_group, - sample_result, - group_prompt_logprobs, - group_sample_logprobs, - ) in zip( - sampling_metadata.seq_groups, - maybe_deferred_sample_results, - prompt_logprobs, - sample_logprobs, - ): + for (seq_group, sample_result, group_prompt_logprobs, + group_sample_logprobs) in zip(sampling_metadata.seq_groups, + maybe_deferred_sample_results, + prompt_logprobs, sample_logprobs): seq_ids = seq_group.seq_ids next_token_ids, parent_ids = sample_result seq_outputs: List[SequenceOutput] = [] for parent_id, next_token_id, logprobs in zip( - parent_ids, next_token_ids, group_sample_logprobs - ): + parent_ids, next_token_ids, group_sample_logprobs): seq_outputs.append( - SequenceOutput(seq_ids[parent_id], next_token_id, logprobs) - ) + SequenceOutput(seq_ids[parent_id], next_token_id, + logprobs)) sampler_output.append( - CompletionSequenceGroupOutput(seq_outputs, group_prompt_logprobs) - ) + CompletionSequenceGroupOutput(seq_outputs, + group_prompt_logprobs)) # If not specified, store None values in SamplerOutput. if on_device_tensors is not None: - (sampled_token_probs, logprobs_tensor, sampled_token_ids) = on_device_tensors + (sampled_token_probs, logprobs_tensor, + sampled_token_ids) = on_device_tensors else: - sampled_token_probs, logprobs_tensor, sampled_token_ids = (None, None, None) + sampled_token_probs, logprobs_tensor, sampled_token_ids = (None, None, + None) return SamplerOutput( outputs=sampler_output, sampled_token_probs=sampled_token_probs, sampled_token_ids=sampled_token_ids, logprobs=logprobs_tensor, - deferred_sample_results_args=deferred_sample_results_args, - ) + deferred_sample_results_args=deferred_sample_results_args) def _get_next_prompt_tokens(seq_group: SequenceGroupToSample) -> List[int]: @@ -1371,9 +1282,8 @@ def _get_next_prompt_tokens(seq_group: SequenceGroupToSample) -> List[int]: Returns: A list of next prompt tokens to compute logprob. """ - assert ( - seq_group.is_prompt - ), "Caller should ensure the sequence group is in a prefill stage." + assert seq_group.is_prompt, ( + "Caller should ensure the sequence group is in a prefill stage.") seq_ids = seq_group.seq_ids query_len = seq_group.query_len assert query_len is not None @@ -1384,6 +1294,9 @@ def _get_next_prompt_tokens(seq_group: SequenceGroupToSample) -> List[int]: prompt_tokens = seq_data.prompt_token_ids # +1 because we are looking for a next prompt token. 
next_token_index_start = computed_len + 1 - next_token_index_end = min(computed_len + query_len + 1, len(prompt_tokens)) - next_prompt_tokens = prompt_tokens[next_token_index_start:next_token_index_end] + next_token_index_end = min(computed_len + query_len + 1, + len(prompt_tokens)) + next_prompt_tokens = prompt_tokens[ + next_token_index_start:next_token_index_end] return next_prompt_tokens + diff --git a/vllm_mindspore/model_executor/layers/utils.py b/vllm_mindspore/model_executor/layers/utils.py index eedfaa12b0e84176c7e0b445291cadc82a6f2dfd..bbef8d9cb8a6f324981e3c28ef5220d6bbca9305 100644 --- a/vllm_mindspore/model_executor/layers/utils.py +++ b/vllm_mindspore/model_executor/layers/utils.py @@ -1,3 +1,20 @@ +#!/usr/bin/env python3 +# encoding: utf-8 +# Copyright 2025 Huawei Technologies Co., Ltd +# Copyright 2024 The vLLM team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ """Utility methods for model layers.""" from typing import Tuple import torch diff --git a/vllm_mindspore/model_executor/layers/vocab_parallel_embedding.py b/vllm_mindspore/model_executor/layers/vocab_parallel_embedding.py index f45064e194c34e7252cc070e680f7fe57e746560..81ebbe119d5f847844ddc8bb96350abc529ba95d 100644 --- a/vllm_mindspore/model_executor/layers/vocab_parallel_embedding.py +++ b/vllm_mindspore/model_executor/layers/vocab_parallel_embedding.py @@ -1,6 +1,7 @@ #!/usr/bin/env python3 # encoding: utf-8 # Copyright 2025 Huawei Technologies Co., Ltd +# Copyright 2024 The vLLM team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -35,8 +36,6 @@ from mindspore import jit DEFAULT_VOCAB_PADDING_SIZE = 64 -# TODO(tronzhang): Most same as vllm's one, check latter... class UnquantizedEmbeddingMethod(QuantizeMethodBase): """Unquantized method for embeddings.""" @@ -223,26 +222,26 @@ class VocabParallelEmbedding(nn.Cell): self.embedding_dim = embedding_dim - linear_method = None - if quant_config is not None: - linear_method = quant_config.get_quant_method(self, prefix=prefix) - if linear_method is None: - linear_method = UnquantizedEmbeddingMethod() + quant_method = None + if quant_config is not None: + quant_method = quant_config.get_quant_method(self, prefix=prefix) + if quant_method is None: + quant_method = UnquantizedEmbeddingMethod() # If we are making an embedding layer, then our quantization linear # method must implement the embedding operation. If we are another # layer type like ParallelLMHead, this is not important.
is_embedding_layer = type(self.__class__) is VocabParallelEmbedding - linear_method_implements_embedding = method_has_implemented_embedding( - type(linear_method) + quant_method_implements_embedding = method_has_implemented_embedding( + type(quant_method) ) - if is_embedding_layer and not linear_method_implements_embedding: + if is_embedding_layer and not quant_method_implements_embedding: raise NotImplementedError( - f"The class {type(linear_method).__name__} must implement " + f"The class {type(quant_method).__name__} must implement " "the 'embedding' method, see UnquantizedEmbeddingMethod." ) - self.linear_method: QuantizeMethodBase = linear_method + self.quant_method: QuantizeMethodBase = quant_method if params_dtype is None: params_dtype = mstype.float16 @@ -263,7 +262,7 @@ class VocabParallelEmbedding(nn.Cell): - self.shard_indices.added_vocab_start_index ) - self.linear_method.create_weights( + self.quant_method.create_weights( self, self.embedding_dim, [self.num_embeddings_per_partition], @@ -327,7 +326,7 @@ class VocabParallelEmbedding(nn.Cell): else: masked_input, input_mask = input_, None # Get the embeddings. - output_parallel = self.linear_method.embedding(self, masked_input) + output_parallel = self.quant_method.embedding(self, masked_input) # Mask the output embedding. if self.tp_size > 1: output_parallel = mint.mul(output_parallel, input_mask) @@ -358,7 +357,7 @@ class VocabParallelEmbedding(nn.Cell): f" but got {loaded_weight.shape[output_dim]} and {self.org_vocab_size}") # Copy the data. - loaded_weight = loaded_weight.narrow(output_dim, start_idx, shard_size) + loaded_weight = loaded_weight.narrow(output_dim, start_idx, shard_size).contiguous() param[: loaded_weight.shape[0]] = loaded_weight param[loaded_weight.shape[0]:] = 0 diff --git a/vllm_mindspore/model_executor/model_loader/utils.py b/vllm_mindspore/model_executor/model_loader/utils.py index c94e3150b96842bd9b598879d05fc338a0afee9b..66295a32cb0090105aa6ae7156322eabb5d71910 100644 --- a/vllm_mindspore/model_executor/model_loader/utils.py +++ b/vllm_mindspore/model_executor/model_loader/utils.py @@ -1,6 +1,7 @@ #!/usr/bin/env python3 # encoding: utf-8 # Copyright 2025 Huawei Technologies Co., Ltd +# Copyright 2024 The vLLM team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -19,14 +20,20 @@ from typing import Tuple, Type from torch import nn -from vllm.config import ModelConfig +from vllm.config import ModelConfig, ModelImpl +from vllm.model_executor.models import ModelRegistry from vllm_mindspore.model_executor.models.registry import MindSporeModelRegistry - +from vllm.model_executor.model_loader.utils import resolve_transformers_fallback def get_ms_model_architecture(model_config: ModelConfig) -> Tuple[Type[nn.Module], str]: architectures = getattr(model_config.hf_config, "architectures", []) + vllm_supported_archs = ModelRegistry.get_supported_archs() + is_vllm_supported = any(arch in vllm_supported_archs + for arch in architectures) + if not is_vllm_supported: + raise RuntimeError("vLLM-Mindspore does not support %s for now." 
% str(architectures)) model_cls, arch = MindSporeModelRegistry.resolve_model_cls(architectures) if model_config.task == "embed": raise RecursionError("MindSpore unsupport embed model task now!") diff --git a/vllm_mindspore/model_executor/model_loader/weight_utils.py b/vllm_mindspore/model_executor/model_loader/weight_utils.py index ead186957a59adf97789be023e2af5e5a19925bc..45fe4bdd5a4b256a39fe46c73bd49e8537c82a03 100644 --- a/vllm_mindspore/model_executor/model_loader/weight_utils.py +++ b/vllm_mindspore/model_executor/model_loader/weight_utils.py @@ -1,6 +1,7 @@ #!/usr/bin/env python3 # encoding: utf-8 # Copyright 2025 Huawei Technologies Co., Ltd +# Copyright 2024 The vLLM team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/vllm_mindspore/model_executor/models/interfaces.py b/vllm_mindspore/model_executor/models/interfaces.py index 0b1510d973dc911b5fe85d0d4980648d9e605c18..f9b27a0796200af811fc2ae610ff6515d792d346 100644 --- a/vllm_mindspore/model_executor/models/interfaces.py +++ b/vllm_mindspore/model_executor/models/interfaces.py @@ -1,3 +1,21 @@ +#!/usr/bin/env python3 +# encoding: utf-8 +# Copyright 2025 Huawei Technologies Co., Ltd +# Copyright 2024 The vLLM team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + from typing import (TYPE_CHECKING, ClassVar, Dict, List, Literal, Optional, Protocol, Type, Union, overload, runtime_checkable) diff --git a/vllm_mindspore/model_executor/models/llama.py b/vllm_mindspore/model_executor/models/llama.py index c20f54fe8571b1879846f393a438fb047f67cd52..3a18956b93e825851144f28ca4a261d46bcea9b7 100644 --- a/vllm_mindspore/model_executor/models/llama.py +++ b/vllm_mindspore/model_executor/models/llama.py @@ -1,6 +1,7 @@ #!/usr/bin/env python3 # encoding: utf-8 # Copyright 2025 Huawei Technologies Co., Ltd +# Copyright 2024 The vLLM team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -54,6 +55,7 @@ from vllm_mindspore.model_executor.models.model_base import MsModelBase from vllm.sequence import IntermediateTensors from vllm.attention import AttentionMetadata from vllm.model_executor.models.interfaces import SupportsPP +from vllm.model_executor.model_loader.weight_utils import maybe_remap_kv_scale_name from mindspore import Tensor, mint, jit, nn from mindspore import dtype as mstype @@ -115,6 +117,7 @@ class LlamaAttention(nn.Cell): max_position_embeddings: int = 8192, quant_config=None, bias: bool = False, + bias_o_proj: bool = False, cache_config=None, prefix: str = "", ) -> None: @@ -139,6 +142,9 @@ class LlamaAttention(nn.Cell): self.head_dim = getattr( config, "head_dim", self.hidden_size // self.total_num_heads ) + # Phi models introduced a partial_rotary_factor parameter in the config + partial_rotary_factor = getattr(config, "partial_rotary_factor", 1) + self.rotary_dim = int(partial_rotary_factor * self.head_dim) self.q_size = self.num_heads * self.head_dim self.kv_size = self.num_kv_heads * self.head_dim self.scaling = self.head_dim**-0.5 @@ -158,13 +164,14 @@ class LlamaAttention(nn.Cell): self.o_proj = RowParallelLinear( input_size=self.total_num_heads * self.head_dim, output_size=hidden_size, - bias=bias, + bias=bias_o_proj, quant_config=quant_config, prefix=f"{prefix}.o_proj", ) is_neox_style = True - if quant_config is not None and quant_config.get_name() == "gguf": + is_gguf = quant_config and quant_config.get_name() == "gguf" + if is_gguf and config.model_type == "llama": is_neox_style = False self.rotary_emb = get_rope( self.head_dim, @@ -176,13 +183,14 @@ class LlamaAttention(nn.Cell): ) if hasattr(config, "interleaved_sliding_window"): - if isinstance(config.interleaved_sliding_window, int): - sliding_window = config.interleaved_sliding_window - elif isinstance(config.interleaved_sliding_window, list): - sw_idx = layer_idx % len(config.interleaved_sliding_window) - sliding_window = config.interleaved_sliding_window[sw_idx] + interleaved_sliding_window = config.interleaved_sliding_window + if isinstance(interleaved_sliding_window, int): + sliding_window = interleaved_sliding_window + elif isinstance(interleaved_sliding_window, list): + sw_idx = layer_idx % len(interleaved_sliding_window) + sliding_window = interleaved_sliding_window[sw_idx] else: - raise ValueError(f"{type(sliding_window)} is not supported.") + raise ValueError(f"{type(interleaved_sliding_window)} is not supported.") else: sliding_window = None @@ -245,6 +253,11 @@ class LlamaDecoderLayer(nn.Cell): attention_bias = getattr(config, "attention_bias", False) or getattr( config, "bias", False ) + bias_o_proj = attention_bias + # support internlm/internlm3-8b with qkv_bias + if hasattr(config, 'qkv_bias'): + attention_bias = config.qkv_bias + self.self_attn = LlamaAttention( config=config, hidden_size=self.hidden_size, @@ -257,6 +270,7 @@ class LlamaDecoderLayer(nn.Cell): max_position_embeddings=max_position_embeddings, quant_config=quant_config, bias=attention_bias, + bias_o_proj=bias_o_proj, cache_config=cache_config, prefix=f"{prefix}.self_attn", ) @@ -328,11 +342,6 @@ class LlamaModel(nn.Cell): config = vllm_config self.config = config self.padding_idx = config.pad_token_id - # TODO: Support lora_config - # lora_config = config - # lora_vocab = (lora_config.lora_extra_vocab_size * - # (lora_config.max_loras or 1)) if lora_config else 0 - # self.vocab_size = config.vocab_size + lora_vocab self.vocab_size = config.vocab_size self.org_vocab_size = config.vocab_size # TODO: Support 
quant_config cache_config @@ -523,6 +532,7 @@ class LlamaForCausalLM(MsModelBase, SupportsPP): attn_metadata, intermediate_tensors=None, inputs_embeds=None, + **kwargs ): if attn_metadata.num_prefill_tokens > 0: input_ids = input_ids.expand_dims(0) @@ -556,4 +566,4 @@ class LlamaForCausalLM(MsModelBase, SupportsPP): sampling_metadata: SamplingMetadata, ) -> Optional[Tensor]: logits = self.logits_processor(self.lm_head, hidden_states, sampling_metadata) - return logits + return logits \ No newline at end of file diff --git a/vllm_mindspore/model_executor/models/mf_models/attention_mask.py b/vllm_mindspore/model_executor/models/mf_models/attention_mask.py new file mode 100644 index 0000000000000000000000000000000000000000..10fcd25ecb485217fb53f7eb5d7b4061a90beb7a --- /dev/null +++ b/vllm_mindspore/model_executor/models/mf_models/attention_mask.py @@ -0,0 +1,53 @@ +# Copyright 2025 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +""" +infer attention mask. +""" +import numpy as np + +import mindspore as ms +from mindspore import Tensor, JitConfig, Model + + +class LowerTriangularMask: + r""" + Provide Infer model attention mask. + Args: + mf_model_config (MF Config): The config of Infer model. + + """ + + def __init__(self, mf_model_config): + compute_dtype = mf_model_config.compute_dtype + seq_length = mf_model_config.seq_length + self.prefill_mask = Tensor(np.triu(np.ones(shape=(128, 128), dtype=np.float16), k=1), dtype=compute_dtype) + + self.decode_mask = Tensor(np.triu(np.ones(shape=(seq_length, seq_length), dtype=np.int8), k=1), + dtype=compute_dtype) + + self.hard_mask = Tensor([0], dtype=compute_dtype).reshape(1, 1) + + self.gather = ms.ops.Gather() + + def gen_attention_mask(self, is_prefill, position_ids, query_lens): + if is_prefill: + attention_mask = self.prefill_mask + else: + if max(query_lens) > 1: + attention_mask = self.gather(self.decode_mask, position_ids, 0) + else: + attention_mask = self.hard_mask + return attention_mask diff --git a/vllm_mindspore/model_executor/models/mf_models/deepseek_mtp.py b/vllm_mindspore/model_executor/models/mf_models/deepseek_mtp.py new file mode 100644 index 0000000000000000000000000000000000000000..fac2bf20fc1b33ce831880b2a12f5b84f604ed21 --- /dev/null +++ b/vllm_mindspore/model_executor/models/mf_models/deepseek_mtp.py @@ -0,0 +1,113 @@ +#!/usr/bin/env python3 +# encoding: utf-8 +# Copyright 2025 Huawei Technologies Co., Ltd +# Copyright 2024 The vLLM team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
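# Editor's sketch: the new attention_mask.py above keeps three precomputed masks and picks one
# per step. A minimal numpy-only rendition of that selection rule, with toy shapes and float16
# standing in for the model's compute_dtype (both are assumptions, not the MindFormers config).
import numpy as np

seq_length = 16
prefill_mask = np.triu(np.ones((128, 128), dtype=np.float16), k=1)            # strict upper triangle == masked
decode_mask = np.triu(np.ones((seq_length, seq_length), dtype=np.int8), k=1)
hard_mask = np.zeros((1, 1), dtype=np.float16)

def gen_attention_mask(is_prefill, position_ids, query_lens):
    # Prefill uses the fixed 128x128 causal mask; multi-token decode (e.g. MTP / speculative
    # steps) gathers per-position rows of the full mask; plain single-token decode needs no
    # real mask, so a 1x1 zero tensor is returned.
    if is_prefill:
        return prefill_mask
    if max(query_lens) > 1:
        return decode_mask[position_ids]    # numpy fancy indexing stands in for ms.ops.Gather(axis=0)
    return hard_mask

print(gen_attention_mask(False, np.array([3, 7]), query_lens=[2]).shape)       # (2, 16)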
+# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +from typing import Iterable, Set, Tuple + +from vllm.config import VllmConfig +from vllm.config import get_current_vllm_config +from vllm.forward_context import get_forward_context +from vllm.logger import init_logger + +from mindspore import Tensor, JitConfig, Model, mutable +from mindspore.nn.utils import no_init_parameters + +from research.deepseek3.deepseek3_config import ( + DeepseekV3Config as DeepseekV3Config_MF, +) +from research.deepseek3.deepseek3 import ( + DeepseekV3ForCausalLM as DeepseekV3ForCausalLM_MF, +) + +from vllm_mindspore.model_executor.layers.sampler import get_sampler +from vllm_mindspore.model_executor.models.model_base import Fake_MLA +from vllm_mindspore.model_executor.models.mf_models.mf_model_base import MfModelBase +from vllm_mindspore.model_executor.models.mf_models.deepseekv3_weight_processor import DeepseekV3WeightProcessor + +logger = init_logger(__name__) + +class DeepseekV3MTPForCausalLM(MfModelBase): + def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None: + super(DeepseekV3MTPForCausalLM, self).__init__( + vllm_config=vllm_config, prefix=prefix + ) + self.mf_kvcaches_init = False + + self.sampler = get_sampler() + self.set_modules({"model": self.network}) + + self.kv_caches = [Fake_MLA() for i in range(self.mf_model_config.num_layers)] + compilation_config = get_current_vllm_config().compilation_config + + if prefix in compilation_config.static_forward_context: + raise ValueError(f"Duplicate layer name: {prefix}") + for i in range(self.mf_model_config.num_nextn_predict_layers): + compilation_config.static_forward_context[str(i)] = self.kv_caches[i] + + self.set_flags = False + + + def _generate_model_config(self): + self.mf_config.load_checkpoint = self.get_model_path() + + self.mf_model_config = DeepseekV3Config_MF(**self.mf_config.model.model_config) + if self.mf_config.moe_config: + self.mf_model_config.moe_config = self.mf_config.moe_config + self.mf_model_config.return_hidden_states = True + setattr(self.mf_model_config, 'npu_mem_size', -1) + + self.mf_model_config.is_mtp_model = True + self.mf_model_config.num_nextn_predict_layers = self.model_config.hf_config.num_nextn_predict_layers + if self.mf_model_config.num_nextn_predict_layers != 1: + raise NotImplementedError("Only support 1 MTP-layer now.") + + self.mf_config.model.model_config = self.mf_model_config + + + def _create_network(self): + # Initital network + with no_init_parameters(): # Delay initialization + network = DeepseekV3ForCausalLM_MF(self.mf_model_config) + + return network, network.mtp_model.head + + + def get_kvcache(self): + key_cache = [] + forward_context = get_forward_context() + for i in range(self.mf_model_config.num_nextn_predict_layers): + k_cache = self.kv_caches[i].kv_cache[forward_context.virtual_engine][0] + key_cache.append(k_cache) + return mutable(key_cache), None + + + def update_model_inputs(self, model_inputs, **kwargs): + # ToDo: supports multi-mtpLayers with 'spec_step_idx' specifing the layer index. 
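# Editor's sketch (pure numpy, hypothetical sizes) of the reshape that update_model_inputs
# performs just below: the flat previous_hidden_states handed over by the main model is viewed
# as input_ids' shape plus a trailing hidden_size axis before it reaches the single MTP layer.
import numpy as np

input_ids = np.zeros((2, 8), dtype=np.int64)             # placeholder token ids
hidden_size = 16                                         # stands in for model_config.get_hidden_size()
previous_hidden_states = np.arange(2 * 8 * hidden_size, dtype=np.float32)

hidden_states_shape = list(input_ids.shape) + [hidden_size]
hidden_states = previous_hidden_states.reshape(hidden_states_shape)
print(hidden_states.shape)                               # (2, 8, 16)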
+ if kwargs.get("spec_step_idx", 0) != 0: + raise NotImplementedError("Only support 1 MTP-layer now.") + # model_inputs["index"] = ms.Tensor(kwargs.get("spec_step_idx", 0), ms.int32) + hidden_states_shape = list(model_inputs["input_ids"].shape) + hidden_states_shape.append(self.model_config.get_hidden_size()) + model_inputs["hidden_states"] = kwargs.get("previous_hidden_states").reshape(hidden_states_shape) + return model_inputs + + + def load_weights(self, weights: Iterable[Tuple[str, Tensor]]) -> Set[str]: + weight_processor = DeepseekV3WeightProcessor(self.mf_config, self.network, False) + weight_processor.load_safetensors_shard(self.mf_config.load_checkpoint, is_mtp_model=True) + self.network.set_dynamic_inputs() + return None diff --git a/vllm_mindspore/model_executor/models/mf_models/deepseek_v3.py b/vllm_mindspore/model_executor/models/mf_models/deepseek_v3.py index 026988af656efbec4a5e41384c573800db8bd434..e0ede9464918f923391b864de85dd743901acb57 100644 --- a/vllm_mindspore/model_executor/models/mf_models/deepseek_v3.py +++ b/vllm_mindspore/model_executor/models/mf_models/deepseek_v3.py @@ -1,6 +1,7 @@ #!/usr/bin/env python3 # encoding: utf-8 # Copyright 2025 Huawei Technologies Co., Ltd +# Copyright 2024 The vLLM team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -16,25 +17,28 @@ # ============================================================================ import os -from typing import Iterable, List, Optional, Set, Tuple, Union -from pathlib import Path +from typing import Iterable, Set, Tuple +from collections import OrderedDict import numpy as np -from vllm.attention import AttentionMetadata from vllm.config import VllmConfig -from vllm.model_executor.layers.sampler import SamplerOutput -from vllm.model_executor.sampling_metadata import SamplingMetadata -from vllm.sequence import IntermediateTensors -from vllm.distributed import get_tensor_model_parallel_world_size +from vllm.config import get_current_vllm_config +from vllm.forward_context import get_forward_context from vllm.logger import init_logger +import mindspore as ms +from mindspore import Tensor, JitConfig, Model, mutable +from mindspore.common import dtype as msdtype +from mindspore.nn.utils import no_init_parameters -from mindformers.tools.register.config import MindFormerConfig +from mindspore_gs.ptq import PTQ +from mindspore_gs.ptq import PTQMode, PTQConfig, OutliersSuppressionType, PrecisionRecovery, QuantGranularity, \ + GPTQQuantConfig +from mindspore_gs.common import BackendTarget -from mindformers.core.context import build_context -from mindformers.core.parallel_config import build_parallel_config from mindformers.trainer.utils import transform_and_load_checkpoint +from research.deepseek3.deepseek3_model_infer import DeepseekV3DecodeLayer from research.deepseek3.deepseek3_config import ( DeepseekV3Config as DeepseekV3Config_MF, ) @@ -43,190 +47,175 @@ from research.deepseek3.deepseek3 import ( ) from vllm_mindspore.model_executor.layers.sampler import get_sampler -from vllm_mindspore.model_executor.models.model_base import MsModelBase -from vllm_mindspore.utils import cal_block_num - -import mindspore as ms -from mindspore import Tensor, JitConfig, Model +from vllm_mindspore.model_executor.models.model_base import Fake_MLA +from vllm_mindspore.model_executor.models.mf_models.mf_model_base import MfModelBase +from vllm_mindspore.model_executor.models.mf_models.deepseekv3_weight_processor import 
DeepseekV3WeightProcessor logger = init_logger(__name__) -def _pad_to_max(x, max_len): - return x + [-1] * (max_len - len(x)) +def set_runtime_kernel_launch_group(): + kernel_launch_group = {'thread_num' : 2, 'kernel_group_num' : 8} + env_kernel_launch_group = os.getenv("EXPERIMENTAL_KERNEL_LAUNCH_GROUP", None) + if env_kernel_launch_group is not None: + pairs = env_kernel_launch_group.split(',') + for pair in pairs: + key, val = pair.split(':') + kernel_launch_group[key] = val + thread_num = int(kernel_launch_group.get('thread_num', 2)) + kernel_group_num = int(kernel_launch_group.get('kernel_group_num', 8)) + ms.runtime.set_kernel_launch_group(thread_num=thread_num, kernel_group_num=kernel_group_num) -def _pad_block_table(block_tables, seq_length, block_size): - # When prefill, the block_tables is a empty tensor. - if len(block_tables.shape) < 2: - fake_block_tables = ms.mint.empty( - 2, seq_length // block_size, dtype=ms.int32, device="Ascend" +class DeepseekV3ForCausalLM(MfModelBase): + def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None: + super(DeepseekV3ForCausalLM, self).__init__( + vllm_config=vllm_config, prefix=prefix ) - return fake_block_tables - - block_tables_list = block_tables.tolist() - padded_block_tables = [ - _pad_to_max(block_table, seq_length // block_size) - for block_table in block_tables_list - ] - - return Tensor(np.array(padded_block_tables).astype(np.int32)) + self.is_quant = bool(hasattr(self.mf_model_config, "quantization_config") and + self.mf_model_config.quantization_config) + self.mf_kvcaches_init = False -def _batch_seq(input_tokens, prefill): - if prefill: - return ms.ops.expand_dims(input_tokens, 0).to(ms.int32) + self.sampler = get_sampler() + self.set_modules({"model": self.network}) - return ms.mint.reshape(input_tokens, (-1, 1)).to(ms.int32) + self.kv_caches = [Fake_MLA() for i in range(self.mf_model_config.num_layers)] + compilation_config = get_current_vllm_config().compilation_config + if prefix in compilation_config.static_forward_context: + raise ValueError(f"Duplicate layer name: {prefix}") + for i in range(self.mf_model_config.num_layers): + compilation_config.static_forward_context[str(i)] = self.kv_caches[i] -class DeepseekV3ForCausalLM(MsModelBase): - def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None: - super(DeepseekV3ForCausalLM, self).__init__( - vllm_config=vllm_config, prefix=prefix - ) + self.set_flags = False + set_runtime_kernel_launch_group() - self.mf_config = MindFormerConfig(os.getenv("MINDFORMERS_MODEL_CONFIG")) - build_context(self.mf_config, is_set_ms_ctx=False, is_init_ms=False) - build_parallel_config(self.mf_config) - self.mf_config.model.model_config.parallel_config = ( - self.mf_config.parallel_config - ) - self.mf_config.model.model_config.parallel_config.model_parallel = ( - get_tensor_model_parallel_world_size() - ) - self.mf_config.model.model_config.parallel_config.pipeline_stage = 1 + def _generate_model_config(self): + self.mf_config.load_checkpoint = self.get_model_path() self.mf_model_config = DeepseekV3Config_MF(**self.mf_config.model.model_config) - self.mf_model_config.num_blocks = cal_block_num(self.cache_config, self.model_config, self.parallel_config) - self.mf_model_config.block_size = self.cache_config.block_size if self.mf_config.moe_config: self.mf_model_config.moe_config = self.mf_config.moe_config + self.mf_model_config.return_hidden_states = True + setattr(self.mf_model_config, 'npu_mem_size', -1) + def _create_network(self): # Initital network - 
self.network = DeepseekV3ForCausalLM_MF(self.mf_model_config) + with no_init_parameters(): # Delay initialization + network = DeepseekV3ForCausalLM_MF(self.mf_model_config) # quant - if hasattr(self.mf_model_config, "quantization_config") and self.mf_model_config.quantization_config: - from mindspore_gs.ptq import PTQ - from mindspore_gs.ptq import PTQMode, PTQConfig, OutliersSuppressionType, PrecisionRecovery, QuantGranularity - from mindspore_gs.common import BackendTarget - from mindspore.common import dtype as msdtype - from collections import OrderedDict - cfg = PTQConfig(mode=PTQMode.DEPLOY, - backend=BackendTarget.ASCEND, - weight_quant_dtype=msdtype.int8, + if hasattr(self.mf_model_config, "quantization_config") and hasattr(self.mf_model_config.quantization_config, "quant_method"): + ptq = self.create_ptq(self.mf_model_config.quantization_config.quant_method, PTQMode.DEPLOY) + if ptq is not None: + ptq.apply(network) + ptq.convert(network) + return network, network.lm_head + + def get_kvcache(self): + key_cache = [] + forward_context = get_forward_context() + for i in range(self.mf_model_config.num_layers): + k_cache = self.kv_caches[i].kv_cache[forward_context.virtual_engine][0] + key_cache.append(k_cache) + return mutable(key_cache), None + + def load_weights(self, weights: Iterable[Tuple[str, Tensor]]) -> Set[str]: + if self.mf_config.load_ckpt_format == "ckpt": + model = Model(self.network) + batch_size = self.mf_config.model.model_config.batch_size + seq_length = self.mf_config.model.model_config.seq_length + input_ids = np.ones(shape=tuple([batch_size, seq_length])) + infer_data = self.network.prepare_inputs_for_predict_layout(input_ids) + transform_and_load_checkpoint( + self.mf_config, model, self.network, infer_data, do_predict=True + ) + else: + weight_processor = DeepseekV3WeightProcessor(self.mf_config, self.network, self.is_quant) + weight_processor.load_safetensors_shard(self.mf_config.load_checkpoint) + self.network.set_dynamic_inputs() + dynamic_hidden_states = Tensor(shape=[None, None], dtype=self.mf_model_config.compute_dtype) + self.lm_head.set_inputs(dynamic_hidden_states) + return None + + def get_model_path(self): + model_name_or_path = self.model_config.model + if os.path.isdir(model_name_or_path): + return model_name_or_path + else: + raise ValueError("The 'model' in LLM should be the local path of the MindSpore checkpoint file.") + + def create_ptq(self, quant_type: str, quant_mode: PTQMode): + """create_ptq""" + if quant_type.lower() == 'ptq': + cfg = PTQConfig(mode=quant_mode, backend=BackendTarget.ASCEND, weight_quant_dtype=msdtype.int8, act_quant_dtype=msdtype.int8, outliers_suppression=OutliersSuppressionType.OUTLIER_SUPPRESSION_PLUS, - opname_blacklist=['lkv2kv', 'lm_head'], - precision_recovery=PrecisionRecovery.NONE, + opname_blacklist=['lkv2kv', 'lm_head'], precision_recovery=PrecisionRecovery.NONE, act_quant_granularity=QuantGranularity.PER_TENSOR, weight_quant_granularity=QuantGranularity.PER_CHANNEL) - wo_config = PTQConfig(mode=PTQMode.DEPLOY, - backend=BackendTarget.ASCEND, - weight_quant_dtype=msdtype.int8, - act_quant_dtype=msdtype.int8, - outliers_suppression=OutliersSuppressionType.NONE, - precision_recovery=PrecisionRecovery.NONE, - act_quant_granularity=QuantGranularity.PER_TENSOR, - weight_quant_granularity=QuantGranularity.PER_CHANNEL) - ffn_config = PTQConfig(mode=PTQMode.DEPLOY, - backend=BackendTarget.ASCEND, - weight_quant_dtype=msdtype.int8, + ffn_config = PTQConfig(mode=quant_mode, backend=BackendTarget.ASCEND, 
weight_quant_dtype=msdtype.int8, act_quant_dtype=msdtype.int8, outliers_suppression=OutliersSuppressionType.NONE, precision_recovery=PrecisionRecovery.NONE, act_quant_granularity=QuantGranularity.PER_TOKEN, weight_quant_granularity=QuantGranularity.PER_CHANNEL) - ptq = PTQ(config=cfg, - layer_policies=OrderedDict({r'.*\.wo.*':wo_config, r'.*\.feed_forward\..*':ffn_config})) - ptq.apply(self.network) - ptq.convert(self.network) - - self.network._jit_config_dict = JitConfig( - jit_level="O0", infer_boost="on" - ).jit_config_dict - self.mf_kvcaches_init = False - self.logits = None - - self.sampler = get_sampler() - self.set_modules({"model": self.network}) - - def update_mf_kvcaches(self, kv_caches): - if self.mf_kvcaches_init: - return - - for i in range(self.mf_model_config.num_layers): - k_cache = kv_caches[i][0] - mf_k_cache, _ = self.network.kvcache(i) - - mf_k_cache.set_device_address( - k_cache._data_ptr(), k_cache.shape, k_cache.dtype - ) - self.mf_kvcaches_init = True - - def forward( - self, - input_ids: Tensor, - positions: Tensor, - kv_caches: List[Tensor], - attn_metadata: AttentionMetadata, - intermediate_tensors: Optional[IntermediateTensors] = None, - inputs_embeds: Optional[Tensor] = None, - ) -> Union[Tensor, IntermediateTensors]: - self.update_mf_kvcaches(kv_caches) - - is_prefill = True if attn_metadata.prefill_metadata else False - - self.logits = None - - model_inputs = {} - model_inputs["input_ids"] = _batch_seq(input_ids, is_prefill) - model_inputs["batch_valid_length"] = ms.ops.expand_dims( - attn_metadata.seq_lens_tensor, 0 - ) - model_inputs["block_tables"] = _pad_block_table( - attn_metadata.block_tables, - self.mf_model_config.seq_length, - self.mf_model_config.block_size, - ) - model_inputs["slot_mapping"] = attn_metadata.slot_mapping - - if is_prefill: - self.network.phase = "prefill" - self.network.add_flags_custom(is_first_iteration=True) - self.logits = self.network(**model_inputs) - self.network.phase = "increment" - self.network.add_flags_custom(is_first_iteration=False) + layer_policies = OrderedDict({r'.*\.feed_forward\..*': ffn_config}) + elif quant_type.lower() == 'awq-a16w4': + cfg = PTQConfig(mode=quant_mode, backend=BackendTarget.ASCEND, weight_quant_dtype=msdtype.qint4x2, + act_quant_dtype=None, outliers_suppression=OutliersSuppressionType.AWQ, + opname_blacklist=['lm_head', 'lkv2kv'], weight_quant_granularity=QuantGranularity.PER_GROUP, + group_size=128) + layer_policies = OrderedDict() + elif quant_type.lower() == 'awq-a16w8': + cfg = PTQConfig(mode=quant_mode, backend=BackendTarget.ASCEND, weight_quant_dtype=msdtype.int8, + act_quant_dtype=None, outliers_suppression=OutliersSuppressionType.AWQ, + opname_blacklist=['lm_head', 'lkv2kv']) + elif quant_type.lower() == 'gptq-perchannel': + gptq_config = GPTQQuantConfig() + cfg = PTQConfig(mode=quant_mode, backend=BackendTarget.ASCEND, weight_quant_dtype=msdtype.qint4x2, + act_quant_dtype=None, precision_recovery=PrecisionRecovery.GPTQ, algo_args=gptq_config, + opname_blacklist=['lm_head', 'lkv2kv']) + layer_policies = OrderedDict() + elif quant_type.lower() == 'gptq-pergroup': + gptq_config = GPTQQuantConfig() + cfg = PTQConfig(mode=quant_mode, backend=BackendTarget.ASCEND, weight_quant_dtype=msdtype.qint4x2, + algo_args=gptq_config, act_quant_dtype=None, precision_recovery=PrecisionRecovery.GPTQ, + weight_quant_granularity=QuantGranularity.PER_GROUP, opname_blacklist=['lm_head', 'lkv2kv'], + group_size=128) + layer_policies = OrderedDict() + elif quant_type.lower() == 'smoothquant': + cfg = 
PTQConfig(mode=quant_mode, backend=BackendTarget.ASCEND, weight_quant_dtype=msdtype.int8, + act_quant_dtype=msdtype.int8, outliers_suppression=OutliersSuppressionType.SMOOTH, + opname_blacklist=['lm_head', 'lkv2kv']) + w2_config = PTQConfig(mode=quant_mode, backend=BackendTarget.ASCEND, weight_quant_dtype=msdtype.int8, + act_quant_dtype=msdtype.int8, + outliers_suppression=OutliersSuppressionType.NONE, + precision_recovery=PrecisionRecovery.NONE, + act_quant_granularity=QuantGranularity.PER_TOKEN, + weight_quant_granularity=QuantGranularity.PER_CHANNEL) + layer_policies = OrderedDict({r'.*\.w2.*': w2_config}) + elif quant_type.lower() == 'a16w8': + cfg = PTQConfig(mode=quant_mode, backend=BackendTarget.ASCEND, weight_quant_dtype=msdtype.int8, + opname_blacklist=['lm_head', 'lkv2kv']) + layer_policies = OrderedDict() + elif quant_type.lower() == 'a8dynw8': + cfg = PTQConfig(mode=quant_mode, backend=BackendTarget.ASCEND, weight_quant_dtype=msdtype.int8, + act_quant_dtype=msdtype.int8, act_quant_granularity=QuantGranularity.PER_TOKEN, + opname_blacklist=['lm_head', 'lkv2kv']) + layer_policies = OrderedDict() else: - self.logits = self.network(**model_inputs) - - return None - - def compute_logits( - self, - hidden_states: Tensor, - sampling_metadata: SamplingMetadata, - ) -> Optional[Tensor]: - return self.logits - - def sample( - self, - logits: Tensor, - sampling_metadata: SamplingMetadata, - ) -> Optional[SamplerOutput]: - next_tokens = self.sampler(logits, sampling_metadata) - return next_tokens - - def load_weights(self, weights: Iterable[Tuple[str, Tensor]]) -> Set[str]: - model = Model(self.network) - batch_size = self.mf_config.model.model_config.batch_size - seq_length = self.mf_config.model.model_config.seq_length - input_ids = np.ones(shape=tuple([batch_size, seq_length])) - infer_data = self.network.prepare_inputs_for_predict_layout(input_ids) - transform_and_load_checkpoint( - self.mf_config, model, self.network, infer_data, do_predict=True - ) - self.network.set_dynamic_inputs() - return None + logger.warning("Input unsupported quant type: %s.", quant_type) + return None + ptq = PTQ(config=cfg, layer_policies=layer_policies) + if 'awq' in quant_type.lower(): + # pylint: disable=protected-access + ptq._config.weight_symmetric = False + if 'smoothquant' in quant_type.lower(): + # pylint: disable=protected-access + ptq._config.aclnn_quant_list = ["routed_experts.ffn.w_gate_hidden", "routed_experts.ffn.w1", + "routed_experts.ffn.w3"] + ptq.decoder_layer_types.append(DeepseekV3DecodeLayer) + return ptq diff --git a/vllm_mindspore/model_executor/models/mf_models/deepseekv3_infer_save_ckpt.py b/vllm_mindspore/model_executor/models/mf_models/deepseekv3_infer_save_ckpt.py new file mode 100644 index 0000000000000000000000000000000000000000..4b781a8c1480d3b9d8f9c34d689fef9af9e66ccf --- /dev/null +++ b/vllm_mindspore/model_executor/models/mf_models/deepseekv3_infer_save_ckpt.py @@ -0,0 +1,104 @@ +# Copyright 2025 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
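# Editor's sketch: create_ptq in deepseek_v3.py above is essentially a dispatch table from a
# quantization-method string to a PTQConfig plus per-layer policies. This pure-Python stub only
# reproduces the dispatch (branch names are taken from the diff; no mindspore_gs import needed)
# to make the supported set explicit; unknown methods make create_ptq log a warning and return None.
SUPPORTED_QUANT_TYPES = {
    "ptq",              # W8A8, outlier suppression plus, per-token policy for feed_forward layers
    "awq-a16w4",        # 4-bit weights, AWQ, per-group quantization with group_size=128
    "awq-a16w8",        # 8-bit weights, AWQ
    "gptq-perchannel",  # 4-bit weights, GPTQ precision recovery
    "gptq-pergroup",    # 4-bit weights, GPTQ, per-group with group_size=128
    "smoothquant",      # W8A8, smooth outlier suppression, per-token policy for .w2 layers
    "a16w8",            # weight-only int8
    "a8dynw8",          # int8 weights with per-token dynamic int8 activations
}

def resolve_quant_type(quant_method: str):
    qt = quant_method.lower()
    return qt if qt in SUPPORTED_QUANT_TYPES else None

assert resolve_quant_type("SmoothQuant") == "smoothquant"
assert resolve_quant_type("fp8") is None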
+# ============================================================================ +"""Infer save ckpt by safetensor.""" +import argparse +import os +from collections import OrderedDict + +import mindspore as ms +from mindspore import dtype as msdtype +from mindspore.communication.management import get_rank +from mindformers.core.parallel_config import build_parallel_config +from mindformers.tools.logger import logger +from mindformers import MindFormerConfig +from mindformers import build_context +from research.deepseek3.deepseekv3_infer_parallelism import DeepseekInferParallelism + +from research.deepseek3.deepseek3_config import DeepseekV3Config +from research.deepseek3.deepseek3_model_infer import InferenceDeepseekV3ForCausalLM + +# for example +# bash scripts/msrun_launcher.sh "python ./infer_save_ckpt_from_safetensor.py +# --config /path/to/predict_deepseek_r1_671b.yaml +# --save_ckpt_path /path/to/save_ckpt_path +# --load_checkpoint /path/to/safetensor_path " 4 8555 "output/deepseek_msrun_log" "False" 7200 + +def create_ptq(): + '''create_ptq''' + from research.deepseek3.deepseek3_model_infer import DeepseekV3DecodeLayer + from mindspore_gs.ptq import PTQ + from mindspore_gs.common import BackendTarget + from mindspore_gs.ptq import PTQConfig, PTQMode, OutliersSuppressionType, PrecisionRecovery, QuantGranularity + cfg = PTQConfig(mode=PTQMode.DEPLOY, backend=BackendTarget.ASCEND, weight_quant_dtype=msdtype.int8, + act_quant_dtype=msdtype.int8, outliers_suppression=OutliersSuppressionType.OUTLIER_SUPPRESSION_PLUS, + opname_blacklist=['lkv2kv', 'lm_head'], precision_recovery=PrecisionRecovery.NONE, + act_quant_granularity=QuantGranularity.PER_TENSOR, + weight_quant_granularity=QuantGranularity.PER_CHANNEL) + ffn_config = PTQConfig(mode=PTQMode.DEPLOY, backend=BackendTarget.ASCEND, weight_quant_dtype=msdtype.int8, + act_quant_dtype=msdtype.int8, + outliers_suppression=OutliersSuppressionType.NONE, + precision_recovery=PrecisionRecovery.NONE, + act_quant_granularity=QuantGranularity.PER_TOKEN, + weight_quant_granularity=QuantGranularity.PER_CHANNEL) + ptq = PTQ(config=cfg, layer_policies=OrderedDict({r'.*\.feed_forward\..*': ffn_config})) + ptq.decoder_layers.append(DeepseekV3DecodeLayer) + return ptq + + +def main(config_path, load_checkpoint, save_ckpt_dir): + # set model config + config = MindFormerConfig(config_path) + config.load_checkpoint = load_checkpoint + + build_context(config) + build_parallel_config(config) + model_config = config.model.model_config + model_config.parallel_config = config.parallel_config + model_config.moe_config = config.moe_config + model_config = DeepseekV3Config(**model_config) + + # build model from config + network = InferenceDeepseekV3ForCausalLM(model_config) + + is_quant = hasattr(config.model.model_config, "quantization_config") + + if is_quant: + ptq = create_ptq() + ptq.apply(network) + ptq.convert(network) + ptq.summary(network) + # load checkpoint + if config.load_checkpoint: + logger.info("----------------Transform and load checkpoint----------------") + model_parallelism = DeepseekInferParallelism(config, network, is_quant) + model_parallelism.infer_convert_and_parallelism(config.load_checkpoint) + + rank_id = str(get_rank()) + os.makedirs(os.path.join(save_ckpt_dir, "rank_" + rank_id), exist_ok=True) + + save_ckpt_path = os.path.join(save_ckpt_dir, "rank_" + rank_id, "checkpoint_" + rank_id + ".ckpt") + ms.save_checkpoint(network.parameters_dict(), save_ckpt_path) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + 
parser.add_argument('--config_path', default='predict_llama2_7b.yaml', type=str, + help='model config file path.') + parser.add_argument('--load_checkpoint', type=str, + help='load model checkpoint path or directory.') + parser.add_argument('--save_ckpt_dir', type=str, + help='save ckpt path.') + args = parser.parse_args() + main(args.config_path, args.load_checkpoint, args.save_ckpt_dir) diff --git a/vllm_mindspore/model_executor/models/mf_models/deepseekv3_weight_processor.py b/vllm_mindspore/model_executor/models/mf_models/deepseekv3_weight_processor.py new file mode 100644 index 0000000000000000000000000000000000000000..642897ed44cd3289037f5fbaca444839b5cb2b4d --- /dev/null +++ b/vllm_mindspore/model_executor/models/mf_models/deepseekv3_weight_processor.py @@ -0,0 +1,1456 @@ +# Copyright 2025 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +""" +transform huggingface model to mindspore safetensor. +""" +import os +import json +import gc +import numpy as np +from tqdm import tqdm + +import mindspore as ms +from mindspore import dtype +from mindspore.communication.management import get_rank +from mindformers.experimental.parallel_core.pynative.parallel_state import get_tensor_model_parallel_rank +from vllm_mindspore.model_executor.models.mf_models.weight_processor import BaseWeightProcessor +from vllm_mindspore.utils import convert_np_to_ms_dtype + + +class DeepseekV3WeightProcessor(BaseWeightProcessor): + r""" + Provide DeepseekV3/R1 Model weight load and shards. + Args: + config (DeepseekV3/R1Config): The config of DeepseekV3/R1 model. + network (InferenceDeepseekV3ForCausalLM): The network of DeepseekV3/R1. 
+ + """ + + def __init__(self, config, network, is_quant): + super().__init__(config, network, is_quant) + self.num_layers = self.config.model.model_config.num_layers + + def quant_convert_weight_name(self, weight_name: str): + """replace quant net weight name""" + weight_name = weight_name.replace('embed_tokens.weight', 'tok_embeddings.embedding_weight') + + weight_name = weight_name.replace('.self_attn.q_a_proj.weight', '.attention.q2l_proj._layer.weight') + weight_name = weight_name.replace('.self_attn.q_a_proj.input_scale', '.attention.q2l_proj.quant_op.input_scale') + weight_name = weight_name.replace('.self_attn.q_a_proj.input_offset', '.attention.q2l_proj.quant_op.input_zp') + weight_name = weight_name.replace('.self_attn.q_a_proj.quant_bias', + '.attention.q2l_proj._layer.matmul.quant_bias') + weight_name = weight_name.replace('.self_attn.q_a_proj.deq_scale', + '.attention.q2l_proj._layer.matmul.dequant_scale') + + weight_name = weight_name.replace('.self_attn.q_a_layernorm.weight', '.attention.lq_norm.weight') + weight_name = weight_name.replace('.self_attn.kv_a_layernorm.weight', '.attention.lkv_norm.weight') + weight_name = weight_name.replace('.self_attn.kv_b_proj.', '.attention.lkv2kv.') + + weight_name = weight_name.replace('.self_attn.q_b_proj.weight', '.attention.l2q_proj._layer.weight') + weight_name = weight_name.replace('.self_attn.q_b_proj.input_scale', '.attention.l2q_proj.quant_op.input_scale') + weight_name = weight_name.replace('.self_attn.q_b_proj.input_offset', '.attention.l2q_proj.quant_op.input_zp') + weight_name = weight_name.replace('.self_attn.q_b_proj.quant_bias', + '.attention.l2q_proj._layer.matmul.quant_bias') + weight_name = weight_name.replace('.self_attn.q_b_proj.deq_scale', + '.attention.l2q_proj._layer.matmul.dequant_scale') + + weight_name = weight_name.replace('.self_attn.kv_a_proj_with_mqa.weight', '.attention.kv2l._layer.weight') + weight_name = weight_name.replace('.self_attn.kv_a_proj_with_mqa.input_scale', + '.attention.kv2l.quant_op.input_scale') + weight_name = weight_name.replace('.self_attn.kv_a_proj_with_mqa.input_offset', + '.attention.kv2l.quant_op.input_zp') + weight_name = weight_name.replace('.self_attn.kv_a_proj_with_mqa.quant_bias', + '.attention.kv2l._layer.matmul.quant_bias') + weight_name = weight_name.replace('.self_attn.kv_a_proj_with_mqa.deq_scale', + '.attention.kv2l._layer.matmul.dequant_scale') + + weight_name = weight_name.replace('.self_attn.o_proj.weight', '.attention.wo._layer.weight') + weight_name = weight_name.replace('.self_attn.o_proj.input_scale', '.attention.wo.quant_op.input_scale') + weight_name = weight_name.replace('.self_attn.o_proj.input_offset', '.attention.wo.quant_op.input_zp') + weight_name = weight_name.replace('.self_attn.o_proj.quant_bias', '.attention.wo._layer.matmul.quant_bias') + weight_name = weight_name.replace('.self_attn.o_proj.deq_scale', '.attention.wo._layer.matmul.dequant_scale') + + weight_name = weight_name.replace('.self_attn.q_a_layernorm.bias', '.attention.l2q_proj.quant_op.beta') + weight_name = weight_name.replace('.input_layernorm.bias', '.attention.q2l_proj.quant_op.beta') + + # mlp is pertoken quant + weight_name = weight_name.replace('.weight_scale', '.matmul.weight_scale') + weight_name = weight_name.replace('.weight_offset', '.matmul.weight_offset') + + weight_name = weight_name.replace('mlp.gate_proj.', 'feed_forward.w1._layer.') + weight_name = weight_name.replace('mlp.down_proj.', 'feed_forward.w2._layer.') + weight_name = weight_name.replace('mlp.up_proj.', 
'feed_forward.w3._layer.') + weight_name = weight_name.replace('mlp.experts.', 'feed_forward.routed_experts.ffn.') + weight_name = weight_name.replace('mlp.shared_experts.gate_proj.', 'feed_forward.shared_experts.w1._layer.') + weight_name = weight_name.replace('mlp.shared_experts.down_proj.', 'feed_forward.shared_experts.w2._layer.') + weight_name = weight_name.replace('mlp.shared_experts.up_proj.', 'feed_forward.shared_experts.w3._layer.') + weight_name = weight_name.replace('mlp.gate.weight', 'feed_forward.routed_experts.router.dense.weight') + weight_name = weight_name.replace('mlp.gate.e_score_correction_bias', + 'feed_forward.routed_experts.router.e_score_correction_bias') + weight_name = weight_name.replace('.input_layernorm.weight', '.attention_norm.weight') + weight_name = weight_name.replace('.post_attention_layernorm.', '.ffn_norm.') + weight_name = weight_name.replace('model.norm.weight', 'model.norm_out.weight') + return weight_name + + def infer_trans_rope_weight(self, weight, qk_rope_head_dim): + """process rope router weight""" + w1 = weight[..., -qk_rope_head_dim::2, :] + w2 = weight[..., -qk_rope_head_dim + 1::2, :] + weight[..., -qk_rope_head_dim:, :] = np.concatenate([w1, w2], axis=-2) + return weight + + def infer_quant_process_moe_routed_expert_ffn_weight(self, src_hf_dir, layer_id, hf_weight_map): + """process moe router expert weight""" + ffn_concat = self.config.model.model_config.ffn_concat + num_router_experts = self.config.moe_config.expert_num + + # router expert dense + router_dense_hf_name = f"model.layers.{layer_id}.mlp.gate.weight" + router_dense_ms_name = self.quant_convert_weight_name(router_dense_hf_name) + router_dense_ms_param, _ = self.get_safetensor_from_file(router_dense_hf_name, src_hf_dir, hf_weight_map) + self.parameter_dict[router_dense_ms_name] = ms.Parameter( + ms.from_numpy(router_dense_ms_param).astype(ms.bfloat16), + name=router_dense_ms_name, requires_grad=False) + + # e_score_correction_bias + e_score_correction_bias_hf_name = f"model.layers.{layer_id}.mlp.gate.e_score_correction_bias" + e_score_correction_bias_ms_name = self.quant_convert_weight_name(e_score_correction_bias_hf_name) + e_score_correction_bias_ms_param, _ = self.get_safetensor_from_file(e_score_correction_bias_hf_name, src_hf_dir, + hf_weight_map) + self.parameter_dict[e_score_correction_bias_ms_name] = ms.Parameter( + ms.from_numpy(e_score_correction_bias_ms_param).astype(ms.float32), + name=e_score_correction_bias_ms_name, requires_grad=False) + + w1_list = [] + w2_list = [] + w3_list = [] + + w1_scale_list = [] + w2_scale_list = [] + w3_scale_list = [] + + w1_ms_name = f"model.layers.{layer_id}.feed_forward.routed_experts.ffn.w1._layer.weight" + w2_ms_name = f"model.layers.{layer_id}.feed_forward.routed_experts.ffn.w2._layer.weight" + w3_ms_name = f"model.layers.{layer_id}.feed_forward.routed_experts.ffn.w3._layer.weight" + + w1_scale_ms_name = f"model.layers.{layer_id}.feed_forward.routed_experts.ffn.w1._layer.matmul.weight_scale" + w2_scale_ms_name = f"model.layers.{layer_id}.feed_forward.routed_experts.ffn.w2._layer.matmul.weight_scale" + w3_scale_ms_name = f"model.layers.{layer_id}.feed_forward.routed_experts.ffn.w3._layer.matmul.weight_scale" + + for index in range(0, num_router_experts): + w1_hf_name = f"model.layers.{layer_id}.mlp.experts.{index}.gate_proj.weight" + w1_ms_param, _ = self.get_safetensor_from_file(w1_hf_name, src_hf_dir, hf_weight_map, + is_split_param=True, split_axis=0) + + w2_hf_name = 
f"model.layers.{layer_id}.mlp.experts.{index}.down_proj.weight" + w2_ms_param, _ = self.get_safetensor_from_file(w2_hf_name, src_hf_dir, hf_weight_map, + is_split_param=True, split_axis=1) + + w3_hf_name = f"model.layers.{layer_id}.mlp.experts.{index}.up_proj.weight" + w3_ms_param, _ = self.get_safetensor_from_file(w3_hf_name, src_hf_dir, hf_weight_map, + is_split_param=True, split_axis=0) + + w1_list.append(w1_ms_param) + w2_list.append(w2_ms_param) + w3_list.append(w3_ms_param) + + w1_scale_hf_name = f"model.layers.{layer_id}.mlp.experts.{index}.gate_proj.weight_scale" + w1_scale_ms_param, _ = self.get_safetensor_from_file(w1_scale_hf_name, src_hf_dir, hf_weight_map, + is_split_param=True, split_axis=0) + + w2_scale_hf_name = f"model.layers.{layer_id}.mlp.experts.{index}.down_proj.weight_scale" + w2_scale_ms_param, _ = self.get_safetensor_from_file(w2_scale_hf_name, src_hf_dir, hf_weight_map) + + w3_scale_hf_name = f"model.layers.{layer_id}.mlp.experts.{index}.up_proj.weight_scale" + w3_scale_ms_param, _ = self.get_safetensor_from_file(w3_scale_hf_name, src_hf_dir, hf_weight_map, + is_split_param=True, split_axis=0) + + w1_scale_ms_param = w1_scale_ms_param.squeeze(axis=-1) + w2_scale_ms_param = w2_scale_ms_param.squeeze(axis=-1) + w3_scale_ms_param = w3_scale_ms_param.squeeze(axis=-1) + w1_scale_list.append(w1_scale_ms_param) + w2_scale_list.append(w2_scale_ms_param) + w3_scale_list.append(w3_scale_ms_param) + + w1_ms_stack_param = np.stack(w1_list, axis=0) + w2_ms_stack_param = np.stack(w2_list, axis=0) + w3_ms_stack_param = np.stack(w3_list, axis=0) + + w1_scale_ms_stack_param = np.stack(w1_scale_list, axis=0) + w2_scale_ms_stack_param = np.stack(w2_scale_list, axis=0) + w3_scale_ms_stack_param = np.stack(w3_scale_list, axis=0) + + if ffn_concat: + # w_gate_hidden + w_gate_hidden_name = f"model.layers.{layer_id}.feed_forward.routed_experts.ffn.w_gate_hidden._layer.weight" + w_gate_hidden_np = np.concatenate([w1_ms_stack_param, w3_ms_stack_param], axis=1) + w_gate_hidden_param = ms.from_numpy(w_gate_hidden_np).permute(0, 2, 1).astype(ms.int8) + self.parameter_dict[w_gate_hidden_name] = ms.Parameter(w_gate_hidden_param, name=w_gate_hidden_name, + requires_grad=False) + # w_scale_gate_hidden + w_scale_gate_hidden_name = \ + f"model.layers.{layer_id}.feed_forward.routed_experts.ffn.w_gate_hidden._layer.matmul.weight_scale" + + w_scale_gate_hidden_np = np.concatenate([w1_scale_ms_stack_param, w3_scale_ms_stack_param], axis=1) + w_scale_gate_hidden_param = ms.from_numpy(w_scale_gate_hidden_np).astype(ms.bfloat16) + self.parameter_dict[w_scale_gate_hidden_name] = ms.Parameter(w_scale_gate_hidden_param, + name=w_scale_gate_hidden_name, + requires_grad=False) + else: + # w1 w3 + self.parameter_dict[w1_ms_name] = ms.Parameter( + ms.from_numpy(w1_ms_stack_param).permute(0, 2, 1).astype(ms.int8), + name=w1_ms_name, + requires_grad=False) + self.parameter_dict[w3_ms_name] = ms.Parameter( + ms.from_numpy(w3_ms_stack_param).permute(0, 2, 1).astype(ms.int8), + name=w3_ms_name, + requires_grad=False) + + # w1_scale w3_scale + self.parameter_dict[w1_scale_ms_name] = ms.Parameter( + ms.from_numpy(w1_scale_ms_stack_param).astype(ms.bfloat16), + name=w1_ms_name, + requires_grad=False) + self.parameter_dict[w3_scale_ms_name] = ms.Parameter( + ms.from_numpy(w3_scale_ms_stack_param).astype(ms.bfloat16), + name=w3_ms_name, + requires_grad=False) + + self.parameter_dict[w2_ms_name] = ms.Parameter( + ms.from_numpy(w2_ms_stack_param).permute(0, 2, 1).astype(ms.int8), + name=w2_ms_name, + requires_grad=False) + + 
self.parameter_dict[w2_scale_ms_name] = ms.Parameter( + ms.from_numpy(w2_scale_ms_stack_param).astype(ms.bfloat16), + name=w2_scale_ms_name, + requires_grad=False) + + def infer_quant_process_moe_shared_expert_ffn_weight(self, src_hf_dir, layer_id, hf_weight_map): + """infer quant process moe shared expert ffn weight""" + + ffn_concat = self.config.model.model_config.ffn_concat + w1_hf_name = f"model.layers.{layer_id}.mlp.shared_experts.gate_proj.weight" + w1_ms_name = self.quant_convert_weight_name(w1_hf_name) + w1_ms_param, _ = self.get_safetensor_from_file(w1_hf_name, src_hf_dir, hf_weight_map, + is_split_param=True, split_axis=0) + + w1_scale_hf_name = f"model.layers.{layer_id}.mlp.shared_experts.gate_proj.weight_scale" + w1_scale_ms_name = self.quant_convert_weight_name(w1_scale_hf_name) + w1_scale_ms_param, _ = self.get_safetensor_from_file(w1_scale_hf_name, src_hf_dir, hf_weight_map, + is_split_param=True, split_axis=0) + + w2_hf_name = f"model.layers.{layer_id}.mlp.shared_experts.down_proj.weight" + w2_ms_name = self.quant_convert_weight_name(w2_hf_name) + w2_ms_param, _ = self.get_safetensor_from_file(w2_hf_name, src_hf_dir, hf_weight_map, + is_split_param=True, split_axis=1) + + w2_scale_hf_name = f"model.layers.{layer_id}.mlp.shared_experts.down_proj.weight_scale" + w2_scale_ms_name = self.quant_convert_weight_name(w2_scale_hf_name) + w2_scale_ms_param, _ = self.get_safetensor_from_file(w2_scale_hf_name, src_hf_dir, hf_weight_map) + + w3_hf_name = f"model.layers.{layer_id}.mlp.shared_experts.up_proj.weight" + w3_ms_name = self.quant_convert_weight_name(w3_hf_name) + w3_ms_param, _ = self.get_safetensor_from_file(w3_hf_name, src_hf_dir, hf_weight_map, + is_split_param=True, split_axis=0) + + w3_scale_hf_name = f"model.layers.{layer_id}.mlp.shared_experts.up_proj.weight_scale" + w3_scale_ms_name = self.quant_convert_weight_name(w3_scale_hf_name) + w3_scale_ms_param, _ = self.get_safetensor_from_file(w3_scale_hf_name, src_hf_dir, hf_weight_map, + is_split_param=True, split_axis=0) + + w1_scale_ms_param = w1_scale_ms_param.squeeze(axis=-1) + w2_scale_ms_param = w2_scale_ms_param.squeeze(axis=-1) + w3_scale_ms_param = w3_scale_ms_param.squeeze(axis=-1) + + if ffn_concat: + w_gate_hidden_name = f"model.layers.{layer_id}.feed_forward.shared_experts.w_gate_hidden._layer.weight" + w_gate_hidden_np = np.concatenate([w1_ms_param, w3_ms_param], axis=0) + w_gate_hidden_param = ms.from_numpy(w_gate_hidden_np).astype(ms.int8) + self.parameter_dict[w_gate_hidden_name] = ms.Parameter(w_gate_hidden_param, name=w_gate_hidden_name, + requires_grad=False) + + w_scale_gate_hidden_name = \ + f"model.layers.{layer_id}.feed_forward.shared_experts.w_gate_hidden._layer.matmul.weight_scale" + w_scale_gate_hidden_np = np.concatenate([w1_scale_ms_param, w3_scale_ms_param], axis=0) + w_scale_gate_hidden_param = ms.from_numpy(w_scale_gate_hidden_np).astype(ms.bfloat16) + self.parameter_dict[w_scale_gate_hidden_name] = ms.Parameter(w_scale_gate_hidden_param, + name=w_scale_gate_hidden_name, + requires_grad=False) + + else: + self.parameter_dict[w1_ms_name] = ms.Parameter(ms.from_numpy(w1_ms_param).astype(ms.int8), + name=w1_ms_name, + requires_grad=False) + self.parameter_dict[w3_ms_name] = ms.Parameter(ms.from_numpy(w3_ms_param).astype(ms.int8), + name=w3_ms_name, + requires_grad=False) + + self.parameter_dict[w1_scale_ms_name] = ms.Parameter( + ms.from_numpy(w1_scale_ms_param).astype(ms.bfloat16), + name=w1_ms_name, + requires_grad=False) + self.parameter_dict[w3_scale_ms_name] = ms.Parameter( + 
ms.from_numpy(w3_scale_ms_param).astype(ms.bfloat16), + name=w3_ms_name, + requires_grad=False) + + self.parameter_dict[w2_ms_name] = ms.Parameter(ms.from_numpy(w2_ms_param).astype(ms.int8), + name=w2_ms_name, + requires_grad=False) + + self.parameter_dict[w2_scale_ms_name] = ms.Parameter( + ms.from_numpy(w2_scale_ms_param).astype(ms.bfloat16), + name=w2_ms_name, + requires_grad=False) + + def infer_quant_process_dense_ffn_weight(self, src_hf_dir, layer_id, hf_weight_map): + """infer process dense ffn weight""" + + ffn_concat = self.config.model.model_config.ffn_concat + w1_hf_name = f"model.layers.{layer_id}.mlp.gate_proj.weight" + w1_ms_name = self.quant_convert_weight_name(w1_hf_name) + w1_ms_param, _ = self.get_safetensor_from_file(w1_hf_name, src_hf_dir, hf_weight_map, + is_split_param=True, + split_axis=0) + w1_scale_hf_name = f"model.layers.{layer_id}.mlp.gate_proj.weight_scale" + w1_scale_ms_name = self.quant_convert_weight_name(w1_scale_hf_name) + w1_scale_ms_param, _ = self.get_safetensor_from_file(w1_scale_hf_name, src_hf_dir, hf_weight_map, + is_split_param=True, + split_axis=0) + + w2_hf_name = f"model.layers.{layer_id}.mlp.down_proj.weight" + w2_ms_name = self.quant_convert_weight_name(w2_hf_name) + w2_ms_param, _ = self.get_safetensor_from_file(w2_hf_name, src_hf_dir, hf_weight_map, + is_split_param=True, + split_axis=1) + w2_scale_hf_name = f"model.layers.{layer_id}.mlp.down_proj.weight_scale" + w2_scale_ms_name = self.quant_convert_weight_name(w2_scale_hf_name) + # shape:[7168,1] + w2_scale_ms_param, _ = self.get_safetensor_from_file(w2_scale_hf_name, src_hf_dir, hf_weight_map) + + w3_hf_name = f"model.layers.{layer_id}.mlp.up_proj.weight" + w3_ms_name = self.quant_convert_weight_name(w3_hf_name) + w3_ms_param, _ = self.get_safetensor_from_file(w3_hf_name, src_hf_dir, hf_weight_map, + is_split_param=True, + split_axis=0) + w3_scale_hf_name = f"model.layers.{layer_id}.mlp.up_proj.weight_scale" + w3_scale_ms_name = self.quant_convert_weight_name(w3_scale_hf_name) + w3_scale_ms_param, _ = self.get_safetensor_from_file(w3_scale_hf_name, src_hf_dir, hf_weight_map, + is_split_param=True, + split_axis=0) + + w1_scale_ms_param = w1_scale_ms_param.squeeze(axis=-1) + w2_scale_ms_param = w2_scale_ms_param.squeeze(axis=-1) + w3_scale_ms_param = w3_scale_ms_param.squeeze(axis=-1) + + if ffn_concat: + w_gate_hidden_name = f"model.layers.{layer_id}.feed_forward.w_gate_hidden._layer.weight" + w_gate_hidden_np = np.concatenate([w1_ms_param, w3_ms_param], axis=0) + w_gate_hidden_param = ms.from_numpy(w_gate_hidden_np).astype(dtype=ms.int8) + self.parameter_dict[w_gate_hidden_name] = ms.Parameter(w_gate_hidden_param, name=w_gate_hidden_name, + requires_grad=False) + + w_scale_gate_hidden_name = f"model.layers.{layer_id}.feed_forward.w_gate_hidden._layer.matmul.weight_scale" + w_scale_gate_hidden_param = ms.from_numpy( + np.concatenate([w1_scale_ms_param, w3_scale_ms_param], axis=0)).astype(dtype=ms.bfloat16) + self.parameter_dict[w_scale_gate_hidden_name] = ms.Parameter(w_scale_gate_hidden_param, + name=w_scale_gate_hidden_name, + requires_grad=False) + + else: + self.parameter_dict[w1_ms_name] = ms.Parameter(ms.from_numpy(w1_ms_param).astype(ms.int8), + name=w1_ms_name, + requires_grad=False) + self.parameter_dict[w3_ms_name] = ms.Parameter(ms.from_numpy(w3_ms_param).astype(ms.int8), + name=w3_ms_name, + requires_grad=False) + + self.parameter_dict[w1_scale_ms_name] = ms.Parameter( + ms.from_numpy(w1_scale_ms_param).astype(ms.bfloat16), + name=w1_scale_ms_name, + requires_grad=False) + 
self.parameter_dict[w3_scale_ms_name] = ms.Parameter( + ms.from_numpy(w3_scale_ms_param).astype(ms.bfloat16), + name=w3_scale_ms_name, + requires_grad=False) + + self.parameter_dict[w2_ms_name] = ms.Parameter(ms.from_numpy(w2_ms_param).astype(ms.int8), + name=w2_ms_name, + requires_grad=False) + + self.parameter_dict[w2_scale_ms_name] = ms.Parameter( + ms.from_numpy(w2_scale_ms_param).astype(ms.bfloat16), + name=w2_ms_name, + requires_grad=False) + + def infer_convert_outer_weight(self, src_hf_dir, hf_weight_map): + """convert weight not in model""" + embed_tokens_hf_name = "model.embed_tokens.weight" + embed_tokens_ms_name = self.quant_convert_weight_name(embed_tokens_hf_name) + np_data, _ = self.get_safetensor_from_file(embed_tokens_hf_name, src_hf_dir, hf_weight_map) + self.parameter_dict[embed_tokens_ms_name] = ms.Parameter(ms.from_numpy(np_data).astype(ms.bfloat16), + name=embed_tokens_ms_name, + requires_grad=False) + + norm_hf_name = "model.norm.weight" + norm_ms_name = self.quant_convert_weight_name(norm_hf_name) + np_data, _ = self.get_safetensor_from_file(norm_hf_name, src_hf_dir, hf_weight_map) + self.parameter_dict[norm_ms_name] = ms.Parameter(ms.from_numpy(np_data).astype(ms.bfloat16), + name=norm_ms_name, + requires_grad=False) + + lm_head_hf_name = "lm_head.weight" + lm_head_ms_name = self.quant_convert_weight_name(lm_head_hf_name) + if not self.config.parallel_config.vocab_emb_dp: + np_data, _ = self.get_safetensor_from_file(lm_head_hf_name, src_hf_dir, hf_weight_map, + is_split_param=True, split_axis=0) + else: + np_data, _ = self.get_safetensor_from_file(lm_head_hf_name, src_hf_dir, hf_weight_map) + self.parameter_dict[lm_head_ms_name] = ms.Parameter(ms.from_numpy(np_data).astype(ms.bfloat16), + name=lm_head_ms_name, + requires_grad=False) + + def quant_special_attention_weight(self, layer_id, src_hf_dir, hf_weight_map, name, is_trans_rope_weigh=False, + is_split_param=False): + # q_a_proj->q2l_proj + # kv_a_proj_with_mqa->kv2l + # q_a_layernorm->lq_norm + # o_proj->wo + + # input_scale, input_zp no split + input_scale_hf_name = f"model.layers.{layer_id}.self_attn." + name + ".input_scale" + input_scale_ms_name = self.quant_convert_weight_name(input_scale_hf_name) + input_scale_ms_param, _ = self.get_safetensor_from_file(input_scale_hf_name, src_hf_dir, hf_weight_map) + self.parameter_dict[input_scale_ms_name] = ms.Parameter( + ms.from_numpy(input_scale_ms_param).astype(ms.bfloat16), + name=input_scale_ms_name, requires_grad=False) + + input_zp_hf_name = f"model.layers.{layer_id}.self_attn." + name + ".input_offset" + input_zp_ms_name = self.quant_convert_weight_name(input_zp_hf_name) + input_zp_ms_param, _ = self.get_safetensor_from_file(input_zp_hf_name, src_hf_dir, hf_weight_map) + self.parameter_dict[input_zp_ms_name] = ms.Parameter(ms.from_numpy(input_zp_ms_param).astype(ms.int8), + name=input_zp_ms_name, + requires_grad=False) + + if not is_trans_rope_weigh: + quant_bias_hf_name = f"model.layers.{layer_id}.self_attn." + name + ".quant_bias" + quant_bias_ms_name = self.quant_convert_weight_name(quant_bias_hf_name) + quant_bias_ms_param, _ = self.get_safetensor_from_file(quant_bias_hf_name, src_hf_dir, hf_weight_map) + if name == "o_proj" and get_tensor_model_parallel_rank() != 0: + quant_bias_ms_param.fill(0) + + dequant_scale_hf_name = f"model.layers.{layer_id}.self_attn." 
+ name + ".deq_scale" + dequant_scale_ms_name = self.quant_convert_weight_name(dequant_scale_hf_name) + dequant_scale_ms_param, _ = self.get_safetensor_from_file(dequant_scale_hf_name, src_hf_dir, hf_weight_map) + else: + kv_lora_rank = self.config.model.model_config.kv_lora_rank + qk_rope_head_dim = self.config.model.model_config.qk_rope_head_dim + qk_nope_head_dim = self.config.model.model_config.qk_nope_head_dim + + num_heads = self.config.model.model_config.num_heads + rope_dim = qk_rope_head_dim + qk_nope_head_dim + kv_head_dim = kv_lora_rank + qk_rope_head_dim + + quant_bias_hf_name = f"model.layers.{layer_id}.self_attn." + name + ".quant_bias" + quant_bias_ms_name = self.quant_convert_weight_name(quant_bias_hf_name) + quant_bias_ms_param, _ = self.get_safetensor_from_file(quant_bias_hf_name, src_hf_dir, hf_weight_map) + + dequant_scale_hf_name = f"model.layers.{layer_id}.self_attn." + name + ".deq_scale" + dequant_scale_ms_name = self.quant_convert_weight_name(dequant_scale_hf_name) + dequant_scale_ms_param, _ = self.get_safetensor_from_file(dequant_scale_hf_name, src_hf_dir, hf_weight_map) + + if name == "q_b_proj": + quant_bias_ms_param = quant_bias_ms_param.reshape(num_heads, rope_dim, -1) + quant_bias_ms_param = self.infer_trans_rope_weight(quant_bias_ms_param, qk_rope_head_dim) + quant_bias_ms_param = quant_bias_ms_param.reshape(num_heads * rope_dim, -1).reshape(-1) + + dequant_scale_ms_param = dequant_scale_ms_param.reshape(num_heads, rope_dim, -1) + dequant_scale_ms_param = self.infer_trans_rope_weight(dequant_scale_ms_param, qk_rope_head_dim) + dequant_scale_ms_param = dequant_scale_ms_param.reshape(num_heads * rope_dim, -1).reshape(-1) + + elif name == "kv_a_proj_with_mqa": + quant_bias_ms_param = quant_bias_ms_param.reshape(kv_head_dim, -1) + quant_bias_ms_param = self.infer_trans_rope_weight(quant_bias_ms_param, qk_rope_head_dim).reshape(-1) + + dequant_scale_ms_param = dequant_scale_ms_param.reshape(kv_head_dim, -1) + dequant_scale_ms_param = self.infer_trans_rope_weight(dequant_scale_ms_param, qk_rope_head_dim).reshape( + -1) + + if is_split_param: + quant_bias_ms_param = self.split_weight_by_rank(quant_bias_ms_param, split_axis=0) + dequant_scale_ms_param = self.split_weight_by_rank(dequant_scale_ms_param, split_axis=0) + + self.parameter_dict[quant_bias_ms_name] = ms.Parameter( + ms.from_numpy(quant_bias_ms_param).astype(ms.int32), + name=quant_bias_ms_name, requires_grad=False) + self.parameter_dict[dequant_scale_ms_name] = ms.Parameter( + ms.from_numpy(dequant_scale_ms_param).astype(ms.float32), + name=dequant_scale_ms_name, + requires_grad=False) + + def infer_quant_bias_weight(self, src_hf_dir, layer_id, hf_weight_map): + # quant_op.beta + q2l_proj_bias_hf_name = f"model.layers.{layer_id}.input_layernorm.bias" + q2l_proj_bias_ms_name = self.quant_convert_weight_name(q2l_proj_bias_hf_name) + q2l_proj_bias_ms_param, _ = self.get_safetensor_from_file(q2l_proj_bias_hf_name, src_hf_dir, hf_weight_map) + + kv2l_bias_ms_name = f"model.layers.{layer_id}.attention.kv2l.quant_op.beta" + kv2l_bias_ms_param = q2l_proj_bias_ms_param.copy() + + l2q_proj_bias_hf_name = f"model.layers.{layer_id}.self_attn.q_a_layernorm.bias" + l2q_proj_bias_ms_name = self.quant_convert_weight_name(l2q_proj_bias_hf_name) + l2q_proj_bias_ms_param, _ = self.get_safetensor_from_file(l2q_proj_bias_hf_name, src_hf_dir, hf_weight_map) + + self.parameter_dict[q2l_proj_bias_ms_name] = ms.Parameter( + ms.from_numpy(q2l_proj_bias_ms_param).astype(ms.bfloat16), + name=q2l_proj_bias_ms_name, + 
requires_grad=False) + self.parameter_dict[kv2l_bias_ms_name] = ms.Parameter( + ms.from_numpy(kv2l_bias_ms_param).astype(ms.bfloat16), + name=kv2l_bias_ms_name, + requires_grad=False) + self.parameter_dict[l2q_proj_bias_ms_name] = ms.Parameter( + ms.from_numpy(l2q_proj_bias_ms_param).astype(ms.bfloat16), + name=l2q_proj_bias_ms_name, + requires_grad=False) + + def infer_quant_process_attention_weight(self, src_hf_dir, layer_id, hf_weight_map): + """infer quant process attention weight""" + num_heads = self.config.model.model_config.num_heads + kv_lora_rank = self.config.model.model_config.kv_lora_rank + qk_rope_head_dim = self.config.model.model_config.qk_rope_head_dim + v_head_dim = self.config.model.model_config.v_head_dim + qk_nope_head_dim = self.config.model.model_config.qk_nope_head_dim + + rope_dim = qk_rope_head_dim + qk_nope_head_dim + kv_head_dim = kv_lora_rank + qk_rope_head_dim + + # q_a_proj->q2l_proj + q2l_proj_hf_name = f"model.layers.{layer_id}.self_attn.q_a_proj.weight" + q2l_proj_ms_name = self.quant_convert_weight_name(q2l_proj_hf_name) + q2l_proj_ms_param, _ = self.get_safetensor_from_file(q2l_proj_hf_name, src_hf_dir, hf_weight_map) + self.parameter_dict[q2l_proj_ms_name] = ms.Parameter( + ms.from_numpy(q2l_proj_ms_param).astype(ms.int8), + name=q2l_proj_ms_name, + requires_grad=False) + self.quant_special_attention_weight(layer_id, src_hf_dir, hf_weight_map, "q_a_proj") + + # kv_a_proj_with_mqa->kv2l + kv2l_hf_name = f"model.layers.{layer_id}.self_attn.kv_a_proj_with_mqa.weight" + kv2l_ms_name = self.quant_convert_weight_name(kv2l_hf_name) + kv2l_ms_param, _ = self.get_safetensor_from_file(kv2l_hf_name, src_hf_dir, hf_weight_map) + kv2l_ms_param = kv2l_ms_param.reshape(kv_head_dim, -1) + kv2l_ms_param = self.infer_trans_rope_weight(kv2l_ms_param, qk_rope_head_dim) + self.parameter_dict[kv2l_ms_name] = ms.Parameter(ms.from_numpy(kv2l_ms_param).astype(ms.int8), + name=kv2l_ms_name, + requires_grad=False) + self.quant_special_attention_weight(layer_id, src_hf_dir, hf_weight_map, "kv_a_proj_with_mqa", + is_trans_rope_weigh=True) + + # q_a_layernorm->lq_norm + lq_norm_hf_name = f"model.layers.{layer_id}.self_attn.q_a_layernorm.weight" + lq_norm_ms_name = self.quant_convert_weight_name(lq_norm_hf_name) + lq_norm_ms_param, _ = self.get_safetensor_from_file(lq_norm_hf_name, src_hf_dir, hf_weight_map) + self.parameter_dict[lq_norm_ms_name] = ms.Parameter(ms.from_numpy(lq_norm_ms_param).astype(ms.bfloat16), + name=lq_norm_ms_name, + requires_grad=False) + + # q_b_proj->l2q_proj + l2q_proj_hf_name = f"model.layers.{layer_id}.self_attn.q_b_proj.weight" + l2q_proj_ms_name = self.quant_convert_weight_name(l2q_proj_hf_name) + l2q_proj_ms_param, _ = self.get_safetensor_from_file(l2q_proj_hf_name, src_hf_dir, hf_weight_map) + l2q_proj_ms_param = l2q_proj_ms_param.reshape(num_heads, rope_dim, -1) + l2q_proj_ms_param = self.infer_trans_rope_weight(l2q_proj_ms_param, qk_rope_head_dim) + l2q_proj_ms_param = l2q_proj_ms_param.reshape(num_heads * rope_dim, -1) + l2q_proj_ms_param = self.split_weight_by_rank(l2q_proj_ms_param, split_axis=0) + self.parameter_dict[l2q_proj_ms_name] = ms.Parameter( + ms.from_numpy(l2q_proj_ms_param).astype(ms.int8), + name=l2q_proj_ms_name, + requires_grad=False) + self.quant_special_attention_weight(layer_id, src_hf_dir, hf_weight_map, "q_b_proj", is_trans_rope_weigh=True, + is_split_param=True) + + # kv_a_layernorm->lkv_norm + lkv_norm_hf_name = f"model.layers.{layer_id}.self_attn.kv_a_layernorm.weight" + lkv_norm_ms_name = 
self.quant_convert_weight_name(lkv_norm_hf_name) + lkv_norm_ms_param, _ = self.get_safetensor_from_file(lkv_norm_hf_name, src_hf_dir, hf_weight_map) + self.parameter_dict[lkv_norm_ms_name] = ms.Parameter( + ms.from_numpy(lkv_norm_ms_param).astype(ms.bfloat16), + name=lkv_norm_ms_name, + requires_grad=False) + + # kv_b_proj->lkv2kv + lkv2kv_hf_name = f"model.layers.{layer_id}.self_attn.kv_b_proj.weight" + lkv2kv_ms_name = self.quant_convert_weight_name(lkv2kv_hf_name) + lkv2kv_ms_param, _ = self.get_safetensor_from_file(lkv2kv_hf_name, src_hf_dir, hf_weight_map) + lkv2kv_head = qk_nope_head_dim + v_head_dim + lkv2kv_ms_param = lkv2kv_ms_param.reshape(num_heads, lkv2kv_head, -1) + value_k_nope, value_v = lkv2kv_ms_param[:, :qk_nope_head_dim, :], lkv2kv_ms_param[:, qk_nope_head_dim:, :] + + # value_k_nope + value_k_nope = value_k_nope.reshape(-1, value_k_nope.shape[-1]) + value_k_nope = self.split_weight_by_rank(value_k_nope, split_axis=0) + name_k_nope = lkv2kv_ms_name.replace(".attention.lkv2kv.", ".attention.lkv2kv_k_nope.") + self.parameter_dict[name_k_nope] = ms.Parameter(ms.from_numpy(value_k_nope).astype(ms.bfloat16), + name=name_k_nope, + requires_grad=False) + # value_v + value_v = value_v.reshape(-1, value_v.shape[-1]) + value_v = self.split_weight_by_rank(value_v, split_axis=0) + name_v = lkv2kv_ms_name.replace(".attention.lkv2kv.", ".attention.lkv2kv_v.") + self.parameter_dict[name_v] = ms.Parameter(ms.from_numpy(value_v).astype(ms.bfloat16), + name=name_v, + requires_grad=False) + + # o_proj->wo + wo_hf_name = f"model.layers.{layer_id}.self_attn.o_proj.weight" + wo_ms_name = self.quant_convert_weight_name(wo_hf_name) + wo_ms_param, _ = self.get_safetensor_from_file(wo_hf_name, src_hf_dir, hf_weight_map) + wo_ms_param = self.split_weight_by_rank(wo_ms_param, split_axis=1) + self.parameter_dict[wo_ms_name] = ms.Parameter(ms.from_numpy(wo_ms_param).astype(ms.int8), + name=wo_ms_name, + requires_grad=False) + self.quant_special_attention_weight(layer_id, src_hf_dir, hf_weight_map, "o_proj") + + def infer_quant_net_convert_layer_weight(self, src_hf_dir, layer_id, hf_weight_map): + """infer quant net convert layer weight""" + + if layer_id >= 3: + self.infer_quant_process_moe_routed_expert_ffn_weight(src_hf_dir, layer_id, hf_weight_map) + self.infer_quant_process_moe_shared_expert_ffn_weight(src_hf_dir, layer_id, hf_weight_map) + else: + self.infer_quant_process_dense_ffn_weight(src_hf_dir, layer_id, hf_weight_map) + + self.infer_quant_process_attention_weight(src_hf_dir, layer_id, hf_weight_map) + self.infer_quant_bias_weight(src_hf_dir, layer_id, hf_weight_map) + self.infer_process_norm_weight(src_hf_dir, layer_id, hf_weight_map) + + def convert_weight_name(self, weight_name: str): + """replace weight name""" + weight_name = weight_name.replace('embed_tokens.weight', 'tok_embeddings.embedding_weight') + weight_name = weight_name.replace('.self_attn.q_a_proj.', '.attention.q2l_proj.') + weight_name = weight_name.replace('.self_attn.q_a_layernorm.', '.attention.lq_norm.') + weight_name = weight_name.replace('.self_attn.q_b_proj.', '.attention.l2q_proj.') + weight_name = weight_name.replace('.self_attn.kv_a_proj_with_mqa.', '.attention.kv2l.') + weight_name = weight_name.replace('.self_attn.kv_a_layernorm.', '.attention.lkv_norm.') + weight_name = weight_name.replace('.self_attn.kv_b_proj.', '.attention.lkv2kv.') + weight_name = weight_name.replace('.self_attn.o_proj.', '.attention.wo.') + weight_name = weight_name.replace('mlp.gate_proj.', 'feed_forward.w1.') + weight_name = 
weight_name.replace('mlp.down_proj.', 'feed_forward.w2.') + weight_name = weight_name.replace('mlp.up_proj.', 'feed_forward.w3.') + weight_name = weight_name.replace('mlp.experts.', 'feed_forward.routed_experts.ffn.') + weight_name = weight_name.replace('mlp.shared_experts.gate_proj.', 'feed_forward.shared_experts.w1.') + weight_name = weight_name.replace('mlp.shared_experts.down_proj.', 'feed_forward.shared_experts.w2.') + weight_name = weight_name.replace('mlp.shared_experts.up_proj.', 'feed_forward.shared_experts.w3.') + weight_name = weight_name.replace('mlp.gate.weight', 'feed_forward.routed_experts.router.dense.weight') + weight_name = weight_name.replace('mlp.gate.e_score_correction_bias', + 'feed_forward.routed_experts.router.e_score_correction_bias') + weight_name = weight_name.replace('.input_layernorm.', '.attention_norm.') + weight_name = weight_name.replace('.post_attention_layernorm.', '.ffn_norm.') + weight_name = weight_name.replace('model.norm.weight', 'model.norm_out.weight') + + weight_name = self.convert_mtp_weight_name(weight_name) + return weight_name + + def convert_mtp_weight_name(self, weight_name: str): + layer = 0 if 'layers.' not in weight_name else int(weight_name[weight_name.find('layers.') : ].split('.')[1]) + if layer < self.num_layers: + return weight_name + mtp_prefix = f'mtp_model' + is_mtp_layer = 'tok_embeddings' not in weight_name and 'shared_head.' not in weight_name + mtp_prefix = mtp_prefix if not is_mtp_layer else f'{mtp_prefix}.layer' + is_decode_layer = "ffn" in weight_name or "attention" in weight_name or "feed_forward" in weight_name + mtp_prefix = mtp_prefix if not is_decode_layer else f'{mtp_prefix}.decode_layer' + + weight_name = weight_name.replace(f'model.layers.{layer}', mtp_prefix) + if "tok_embeddings" in weight_name: + weight_name = weight_name.replace(f'.weight', f'.embedding_weight') + if "shared_head." 
in weight_name: + weight_name = weight_name.replace(f'shared_head.', f'') + return weight_name + + def infer_process_moe_routed_expert_ffn_weight(self, src_hf_dir, layer_id, hf_weight_map): + """process moe router expert weight""" + ffn_concat = self.config.model.model_config.ffn_concat + num_router_experts = self.config.moe_config.expert_num + + # router expert dense + router_dense_hf_name = f"model.layers.{layer_id}.mlp.gate.weight" + router_dense_ms_name = self.convert_weight_name(router_dense_hf_name) + router_dense_ms_param, _ = self.get_safetensor_from_file(router_dense_hf_name, src_hf_dir, hf_weight_map) + self.parameter_dict[router_dense_ms_name] = ms.Parameter( + ms.from_numpy(router_dense_ms_param).astype(ms.bfloat16), + name=router_dense_ms_name, requires_grad=False) + + # e_score_correction_bias + e_score_correction_bias_hf_name = f"model.layers.{layer_id}.mlp.gate.e_score_correction_bias" + e_score_correction_bias_ms_name = self.convert_weight_name(e_score_correction_bias_hf_name) + e_score_correction_bias_ms_param, _ = self.get_safetensor_from_file(e_score_correction_bias_hf_name, src_hf_dir, + hf_weight_map) + self.parameter_dict[e_score_correction_bias_ms_name] = ms.Parameter( + ms.from_numpy(e_score_correction_bias_ms_param).astype(ms.float32), + name=e_score_correction_bias_ms_name, requires_grad=False) + + w1_list = [] + w2_list = [] + w3_list = [] + + w1_ms_name = f"model.layers.{layer_id}.feed_forward.routed_experts.ffn.w1.weight" + w1_ms_name = w1_ms_name if layer_id < self.num_layers else self.convert_mtp_weight_name(w1_ms_name) + w2_ms_name = f"model.layers.{layer_id}.feed_forward.routed_experts.ffn.w2.weight" + w2_ms_name = w2_ms_name if layer_id < self.num_layers else self.convert_mtp_weight_name(w2_ms_name) + w3_ms_name = f"model.layers.{layer_id}.feed_forward.routed_experts.ffn.w3.weight" + w3_ms_name = w3_ms_name if layer_id < self.num_layers else self.convert_mtp_weight_name(w3_ms_name) + + for index in range(0, num_router_experts): + w1_hf_name = f"model.layers.{layer_id}.mlp.experts.{index}.gate_proj.weight" + w1_ms_param, _ = self.get_safetensor_from_file(w1_hf_name, src_hf_dir, hf_weight_map, + is_split_param=True, split_axis=0) + + w2_hf_name = f"model.layers.{layer_id}.mlp.experts.{index}.down_proj.weight" + w2_ms_param, _ = self.get_safetensor_from_file(w2_hf_name, src_hf_dir, hf_weight_map, + is_split_param=True, split_axis=1) + + w3_hf_name = f"model.layers.{layer_id}.mlp.experts.{index}.up_proj.weight" + w3_ms_param, _ = self.get_safetensor_from_file(w3_hf_name, src_hf_dir, hf_weight_map, + is_split_param=True, split_axis=0) + + w1_list.append(w1_ms_param) + w2_list.append(w2_ms_param) + w3_list.append(w3_ms_param) + + w1_ms_stack_param = np.stack(w1_list, axis=0) + w2_ms_stack_param = np.stack(w2_list, axis=0) + w3_ms_stack_param = np.stack(w3_list, axis=0) + + if ffn_concat: + w_gate_hidden_name = f"model.layers.{layer_id}.feed_forward.routed_experts.ffn.w_gate_hidden.weight" + w_gate_hidden_name = w_gate_hidden_name if layer_id < self.num_layers else \ + self.convert_mtp_weight_name(w_gate_hidden_name) + w_gate_hidden_np = np.concatenate([w1_ms_stack_param, w3_ms_stack_param], axis=1) + w_gate_hidden_param = ms.from_numpy(w_gate_hidden_np).permute(0, 2, 1).astype(dtype=ms.bfloat16) + self.parameter_dict[w_gate_hidden_name] = ms.Parameter(w_gate_hidden_param, + name=w_gate_hidden_name, + requires_grad=False) + else: + w1_ms_stack_param = ms.from_numpy(w1_ms_stack_param).permute(0, 2, 1).astype(ms.bfloat16) + self.parameter_dict[w1_ms_name] = 
ms.Parameter(w1_ms_stack_param, + name=w1_ms_name, + requires_grad=False) + + w3_ms_stack_param = ms.from_numpy(w3_ms_stack_param).permute(0, 2, 1).astype(ms.bfloat16) + self.parameter_dict[w3_ms_name] = ms.Parameter(w3_ms_stack_param, + name=w3_ms_name, + requires_grad=False) + + w2_ms_stack_param = ms.from_numpy(w2_ms_stack_param).permute(0, 2, 1).astype(ms.bfloat16) + self.parameter_dict[w2_ms_name] = ms.Parameter(w2_ms_stack_param, + name=w2_ms_name, + requires_grad=False) + + def infer_process_moe_shared_expert_ffn_weight(self, src_hf_dir, layer_id, hf_weight_map): + """infer process moe shared expert ffn weight""" + ffn_concat = self.config.model.model_config.ffn_concat + w1_hf_name = f"model.layers.{layer_id}.mlp.shared_experts.gate_proj.weight" + w1_ms_name = self.convert_weight_name(w1_hf_name) + w1_ms_param, _ = self.get_safetensor_from_file(w1_hf_name, src_hf_dir, hf_weight_map, + is_split_param=True, split_axis=0) + + w2_hf_name = f"model.layers.{layer_id}.mlp.shared_experts.down_proj.weight" + w2_ms_name = self.convert_weight_name(w2_hf_name) + w2_ms_param, _ = self.get_safetensor_from_file(w2_hf_name, src_hf_dir, hf_weight_map, + is_split_param=True, split_axis=1) + + w3_hf_name = f"model.layers.{layer_id}.mlp.shared_experts.up_proj.weight" + w3_ms_name = self.convert_weight_name(w3_hf_name) + w3_ms_param, _ = self.get_safetensor_from_file(w3_hf_name, src_hf_dir, hf_weight_map, + is_split_param=True, split_axis=0) + + if ffn_concat: + w_gate_hidden_name = f"model.layers.{layer_id}.feed_forward.shared_experts.w_gate_hidden.weight" + w_gate_hidden_name = w_gate_hidden_name if layer_id < self.num_layers else \ + self.convert_mtp_weight_name(w_gate_hidden_name) + w_gate_hidden_np = np.concatenate([w1_ms_param, w3_ms_param], axis=0) + w_gate_hidden_param = ms.from_numpy(w_gate_hidden_np).astype(ms.bfloat16) + self.parameter_dict[w_gate_hidden_name] = ms.Parameter(w_gate_hidden_param, + name=w_gate_hidden_name, + requires_grad=False) + else: + self.parameter_dict[w1_ms_name] = ms.Parameter(ms.from_numpy(w1_ms_param).astype(ms.bfloat16), + name=w1_ms_name, + requires_grad=False) + self.parameter_dict[w3_ms_name] = ms.Parameter(ms.from_numpy(w3_ms_param).astype(ms.bfloat16), + name=w3_ms_name, + requires_grad=False) + self.parameter_dict[w2_ms_name] = ms.Parameter(ms.from_numpy(w2_ms_param).astype(ms.bfloat16), + name=w2_ms_name, + requires_grad=False) + + def infer_process_dense_ffn_weight(self, src_hf_dir, layer_id, hf_weight_map): + """infer process dense ffn weight""" + + ffn_concat = self.config.model.model_config.ffn_concat + + w1_hf_name = f"model.layers.{layer_id}.mlp.gate_proj.weight" + w1_ms_name = self.convert_weight_name(w1_hf_name) + w1_ms_param, _ = self.get_safetensor_from_file(w1_hf_name, src_hf_dir, hf_weight_map, is_split_param=True, + split_axis=0) + + w2_hf_name = f"model.layers.{layer_id}.mlp.down_proj.weight" + w2_ms_name = self.convert_weight_name(w2_hf_name) + w2_ms_param, _ = self.get_safetensor_from_file(w2_hf_name, src_hf_dir, hf_weight_map, is_split_param=True, + split_axis=1) + + w3_hf_name = f"model.layers.{layer_id}.mlp.up_proj.weight" + w3_ms_name = self.convert_weight_name(w3_hf_name) + w3_ms_param, _ = self.get_safetensor_from_file(w3_hf_name, src_hf_dir, hf_weight_map, is_split_param=True, + split_axis=0) + + if ffn_concat: + w_gate_hidden_name = f"model.layers.{layer_id}.feed_forward.w_gate_hidden.weight" + w_gate_hidden_np = np.concatenate([w1_ms_param, w3_ms_param], axis=0) + w_gate_hidden_param = 
ms.from_numpy(w_gate_hidden_np).astype(ms.bfloat16) + self.parameter_dict[w_gate_hidden_name] = ms.Parameter(w_gate_hidden_param, + name=w_gate_hidden_name, + requires_grad=False) + else: + self.parameter_dict[w1_ms_name] = ms.Parameter(ms.from_numpy(w1_ms_param).astype(ms.bfloat16), + name=w1_ms_name, + requires_grad=False) + self.parameter_dict[w3_ms_name] = ms.Parameter(ms.from_numpy(w3_ms_param).astype(ms.bfloat16), + name=w3_ms_name, + requires_grad=False) + + self.parameter_dict[w2_ms_name] = ms.Parameter(ms.from_numpy(w2_ms_param).astype(ms.bfloat16), + name=w2_ms_name, + requires_grad=False) + + def infer_process_attention_weight(self, src_hf_dir, layer_id, hf_weight_map): + """infer process attention weight""" + num_heads = self.config.model.model_config.num_heads + kv_lora_rank = self.config.model.model_config.kv_lora_rank + qk_rope_head_dim = self.config.model.model_config.qk_rope_head_dim + v_head_dim = self.config.model.model_config.v_head_dim + qk_nope_head_dim = self.config.model.model_config.qk_nope_head_dim + + rope_dim = qk_rope_head_dim + qk_nope_head_dim + kv_head_dim = kv_lora_rank + qk_rope_head_dim + + qkv_concat = self.config.model.model_config.qkv_concat + # q2l_proj + q2l_proj_hf_name = f"model.layers.{layer_id}.self_attn.q_a_proj.weight" + q2l_proj_ms_name = self.convert_weight_name(q2l_proj_hf_name) + q_a_proj_ms_param, _ = self.get_safetensor_from_file(q2l_proj_hf_name, src_hf_dir, hf_weight_map) + + # kv2l + kv2l_hf_name = f"model.layers.{layer_id}.self_attn.kv_a_proj_with_mqa.weight" + kv2l_ms_name = self.convert_weight_name(kv2l_hf_name) + kv2l_ms_param, _ = self.get_safetensor_from_file(kv2l_hf_name, src_hf_dir, hf_weight_map) + kv2l_ms_param = kv2l_ms_param.reshape(kv_head_dim, -1) + kv2l_ms_param = self.infer_trans_rope_weight(kv2l_ms_param, qk_rope_head_dim) + if qkv_concat: + wqkv2l_weight = np.concatenate((q_a_proj_ms_param, kv2l_ms_param), 0) + wqkv2l_weight_name = f"model.layers.{layer_id}.attention.qkv2l.weight" + self.parameter_dict[wqkv2l_weight_name] = ms.Parameter(ms.from_numpy(wqkv2l_weight).astype(ms.bfloat16), + name=wqkv2l_weight_name, + requires_grad=False) + else: + self.parameter_dict[q2l_proj_ms_name] = ms.Parameter(ms.from_numpy(q_a_proj_ms_param).astype(ms.bfloat16), + name=q2l_proj_ms_name, + requires_grad=False) + self.parameter_dict[kv2l_ms_name] = ms.Parameter(ms.from_numpy(kv2l_ms_param).astype(ms.bfloat16), + name=kv2l_ms_name, + requires_grad=False) + # lq_norm + lq_norm_hf_name = f"model.layers.{layer_id}.self_attn.q_a_layernorm.weight" + lq_norm_ms_name = self.convert_weight_name(lq_norm_hf_name) + lq_norm_ms_param, _ = self.get_safetensor_from_file(lq_norm_hf_name, src_hf_dir, hf_weight_map) + self.parameter_dict[lq_norm_ms_name] = ms.Parameter(ms.from_numpy(lq_norm_ms_param).astype(ms.bfloat16), + name=lq_norm_ms_name, + requires_grad=False) + + # l2q_proj + l2q_proj_hf_name = f"model.layers.{layer_id}.self_attn.q_b_proj.weight" + l2q_proj_ms_name = self.convert_weight_name(l2q_proj_hf_name) + l2q_proj_ms_param, _ = self.get_safetensor_from_file(l2q_proj_hf_name, src_hf_dir, hf_weight_map) + l2q_proj_ms_param = l2q_proj_ms_param.reshape(num_heads, rope_dim, -1) + l2q_proj_ms_param = self.infer_trans_rope_weight(l2q_proj_ms_param, qk_rope_head_dim) + l2q_proj_ms_param = l2q_proj_ms_param.reshape(num_heads * rope_dim, -1) + l2q_proj_ms_param = self.split_weight_by_rank(l2q_proj_ms_param, split_axis=0) + self.parameter_dict[l2q_proj_ms_name] = ms.Parameter( + ms.from_numpy(l2q_proj_ms_param).astype(ms.bfloat16), + 
name=l2q_proj_ms_name, + requires_grad=False) + + # lkv_norm + lkv_norm_hf_name = f"model.layers.{layer_id}.self_attn.kv_a_layernorm.weight" + lkv_norm_ms_name = self.convert_weight_name(lkv_norm_hf_name) + lkv_norm_ms_param, _ = self.get_safetensor_from_file(lkv_norm_hf_name, src_hf_dir, hf_weight_map) + self.parameter_dict[lkv_norm_ms_name] = ms.Parameter( + ms.from_numpy(lkv_norm_ms_param).astype(ms.bfloat16), + name=lkv_norm_ms_name, + requires_grad=False) + + # lkv2kv + lkv2kv_hf_name = f"model.layers.{layer_id}.self_attn.kv_b_proj.weight" + lkv2kv_ms_name = self.convert_weight_name(lkv2kv_hf_name) + lkv2kv_ms_param, _ = self.get_safetensor_from_file(lkv2kv_hf_name, src_hf_dir, hf_weight_map) + lkv2kv_head = qk_nope_head_dim + v_head_dim + lkv2kv_ms_param = lkv2kv_ms_param.reshape(num_heads, lkv2kv_head, -1) + value_k_nope, value_v = lkv2kv_ms_param[:, :qk_nope_head_dim, :], lkv2kv_ms_param[:, qk_nope_head_dim:, :] + + # value_k_nope + value_k_nope = value_k_nope.reshape(-1, value_k_nope.shape[-1]) + value_k_nope = self.split_weight_by_rank(value_k_nope, split_axis=0) + name_k_nope = lkv2kv_ms_name.replace(".attention.lkv2kv.", ".attention.lkv2kv_k_nope.") + self.parameter_dict[name_k_nope] = ms.Parameter(ms.from_numpy(value_k_nope).astype(ms.bfloat16), + name=name_k_nope, + requires_grad=False) + # value_v + value_v = value_v.reshape(-1, value_v.shape[-1]) + value_v = self.split_weight_by_rank(value_v, split_axis=0) + name_v = lkv2kv_ms_name.replace(".attention.lkv2kv.", ".attention.lkv2kv_v.") + self.parameter_dict[name_v] = ms.Parameter(ms.from_numpy(value_v).astype(ms.bfloat16), + name=name_v, + requires_grad=False) + + # wo + wo_hf_name = f"model.layers.{layer_id}.self_attn.o_proj.weight" + wo_ms_name = self.convert_weight_name(wo_hf_name) + wo_ms_param, _ = self.get_safetensor_from_file(wo_hf_name, src_hf_dir, hf_weight_map) + wo_ms_param = self.split_weight_by_rank(wo_ms_param, split_axis=1) + self.parameter_dict[wo_ms_name] = ms.Parameter(ms.from_numpy(wo_ms_param).astype(ms.bfloat16), + name=wo_ms_name, + requires_grad=False) + + def infer_process_norm_weight(self, src_hf_dir, layer_id, hf_weight_map): + """infer process attention weight""" + # attention_norm + attention_norm_hf_name = f"model.layers.{layer_id}.input_layernorm.weight" + attention_norm_ms_name = self.convert_weight_name(attention_norm_hf_name) + attention_norm_ms_param, _ = self.get_safetensor_from_file(attention_norm_hf_name, + src_hf_dir, + hf_weight_map) + self.parameter_dict[attention_norm_ms_name] = ms.Parameter( + ms.from_numpy(attention_norm_ms_param).astype(ms.bfloat16), + name=attention_norm_ms_name, + requires_grad=False) + + # ffn_norm + ffn_norm_hf_name = f"model.layers.{layer_id}.post_attention_layernorm.weight" + ffn_norm_ms_name = self.convert_weight_name(ffn_norm_hf_name) + ffn_norm_ms_param, _ = self.get_safetensor_from_file(ffn_norm_hf_name, src_hf_dir, hf_weight_map) + self.parameter_dict[ffn_norm_ms_name] = ms.Parameter( + ms.from_numpy(ffn_norm_ms_param).astype(ms.bfloat16), + name=ffn_norm_ms_name, + requires_grad=False) + + def infer_process_mtp_layer_weight(self, src_hf_dir, layer_id, hf_weight_map): + parameter_dict = {} + mtp_layer_names = ["embed_tokens.weight", "enorm.weight", "hnorm.weight", "eh_proj.weight", + "shared_head.norm.weight", "shared_head.head.weight"] + head_names = ["eh_proj.weight", "shared_head.head.weight"] + for prefix_name in mtp_layer_names: + hf_name = f"model.layers.{layer_id}.{prefix_name}" + ms_name = self.convert_weight_name(hf_name) + if prefix_name in 
head_names and not self.config.parallel_config.vocab_emb_dp: + ms_param, _ = self.get_safetensor_from_file(hf_name, src_hf_dir, hf_weight_map, + is_split_param=True, split_axis=0) + else: + ms_param, _ = self.get_safetensor_from_file(hf_name, src_hf_dir, hf_weight_map) + parameter_dict[ms_name] = ms.Parameter(ms.Tensor(ms_param, ms.bfloat16), + name=ms_name, + requires_grad=False) + + _, ckpt_not_load = ms.load_param_into_net(self.network, parameter_dict) + + def infer_convert_layer_weight(self, src_hf_dir, layer_id, hf_weight_map): + """infer convert layer weight""" + if layer_id >= 3: + self.infer_process_moe_routed_expert_ffn_weight(src_hf_dir, layer_id, hf_weight_map) + self.infer_process_moe_shared_expert_ffn_weight(src_hf_dir, layer_id, hf_weight_map) + else: + self.infer_process_dense_ffn_weight(src_hf_dir, layer_id, hf_weight_map) + + self.infer_process_attention_weight(src_hf_dir, layer_id, hf_weight_map) + self.infer_process_norm_weight(src_hf_dir, layer_id, hf_weight_map) + + # convert mtp shared weights. + if layer_id >= self.num_layers: + self.infer_process_mtp_layer_weight(src_hf_dir, layer_id, hf_weight_map) + + def smooth_quant_process_route_ffn_weight(self, src_hf_dir, layer_id, hf_weight_map, parameter_dict, layer_type): + """smooth_quant_process_route_ffn_weight""" + ffn_concat = self.config.model.model_config.ffn_concat + w1_weight_name = f"model.layers.{layer_id}.{layer_type}.w1._layer.weight" + w1_weight_param, _ = self.get_safetensor_from_file(w1_weight_name, src_hf_dir, hf_weight_map, + is_split_param=True, + split_axis=2) + + w1_bias_name = f"model.layers.{layer_id}.{layer_type}.w1._layer.matmul.quant_bias" + w1_bias_param, _ = self.get_safetensor_from_file(w1_bias_name, src_hf_dir, hf_weight_map, + is_split_param=True, + split_axis=1) + w1_scale_name = f"model.layers.{layer_id}.{layer_type}.w1._layer.matmul.dequant_scale" + w1_scale_param, _ = self.get_safetensor_from_file(w1_scale_name, src_hf_dir, hf_weight_map, + is_split_param=True, + split_axis=1) + + w1_quant_zp = f"model.layers.{layer_id}.{layer_type}.w1.quant_op.input_zp" + w1_quant_scale = f"model.layers.{layer_id}.{layer_type}.w1.quant_op.input_scale" + w1_quant_zp_param, _ = self.get_safetensor_from_file(w1_quant_zp, src_hf_dir, hf_weight_map) + w1_quant_scale_param, _ = self.get_safetensor_from_file(w1_quant_scale, src_hf_dir, hf_weight_map) + + w3_weight_name = f"model.layers.{layer_id}.{layer_type}.w3._layer.weight" + w3_weight_param, _ = self.get_safetensor_from_file(w3_weight_name, src_hf_dir, hf_weight_map, + is_split_param=True, + split_axis=2) + + w3_bias_name = f"model.layers.{layer_id}.{layer_type}.w3._layer.matmul.quant_bias" + w3_bias_param, _ = self.get_safetensor_from_file(w3_bias_name, src_hf_dir, hf_weight_map, + is_split_param=True, + split_axis=1) + + w3_scale_name = f"model.layers.{layer_id}.{layer_type}.w3._layer.matmul.dequant_scale" + w3_scale_param, _ = self.get_safetensor_from_file(w3_scale_name, src_hf_dir, hf_weight_map, + is_split_param=True, + split_axis=1) + + w3_quant_zp = f"model.layers.{layer_id}.{layer_type}.w3.quant_op.input_zp" + w3_quant_scale = f"model.layers.{layer_id}.{layer_type}.w3.quant_op.input_scale" + w3_quant_zp_param, _ = self.get_safetensor_from_file(w3_quant_zp, src_hf_dir, hf_weight_map) + w3_quant_scale_param, _ = self.get_safetensor_from_file(w3_quant_scale, src_hf_dir, hf_weight_map) + if ffn_concat: + concat_weight_name = f"model.layers.{layer_id}.{layer_type}.w_gate_hidden._layer.weight" + concat_weight_param = 
ms.Tensor(np.concatenate([w1_weight_param, w3_weight_param], axis=2), dtype=ms.int8) + parameter_dict[concat_weight_name] = ms.Parameter(concat_weight_param, name=concat_weight_name, + requires_grad=False) + + concat_bias_name = f"model.layers.{layer_id}.{layer_type}.w_gate_hidden._layer.matmul.quant_bias" + concat_bias_param = ms.Tensor(np.concatenate([w1_bias_param, w3_bias_param], axis=1), dtype=ms.int32) + parameter_dict[concat_bias_name] = ms.Parameter(concat_bias_param, name=concat_bias_name, + requires_grad=False) + + concat_scale_name = f"model.layers.{layer_id}.{layer_type}.w_gate_hidden._layer.matmul.dequant_scale" + concat_scale_param = ms.Tensor(np.concatenate([w1_scale_param, w3_scale_param], axis=1), dtype=ms.bfloat16) + parameter_dict[concat_scale_name] = ms.Parameter(concat_scale_param, name=concat_scale_name, + requires_grad=False) + + concat_quant_zp_name = f"model.layers.{layer_id}.{layer_type}.w_gate_hidden.quant_op.input_zp" + concat_quant_zp_param = ms.Tensor(w1_quant_zp_param, dtype=ms.bfloat16) + parameter_dict[concat_quant_zp_name] = ms.Parameter(concat_quant_zp_param, name=concat_quant_zp_name, + requires_grad=False) + + concat_quant_scale_name = f"model.layers.{layer_id}.{layer_type}.w_gate_hidden.quant_op.input_scale" + concat_quant_scale_param = ms.Tensor(w1_quant_scale_param, dtype=ms.bfloat16) + parameter_dict[concat_quant_scale_name] = ms.Parameter(concat_quant_scale_param, + name=concat_quant_scale_name, + requires_grad=False) + else: + # w1 w3 + parameter_dict[w1_weight_name] = ms.Parameter(ms.Tensor(w1_weight_param, ms.int8), name=w1_weight_name, + requires_grad=False) + parameter_dict[w3_weight_name] = ms.Parameter(ms.Tensor(w3_weight_param, ms.int8), name=w3_weight_name, + requires_grad=False) + + parameter_dict[w1_bias_name] = ms.Parameter(ms.Tensor(w1_bias_param, ms.int32), + name=w1_bias_name, requires_grad=False) + parameter_dict[w3_bias_name] = ms.Parameter(ms.Tensor(w3_bias_param, ms.int32), + name=w3_bias_name, requires_grad=False) + + parameter_dict[w1_scale_name] = ms.Parameter(ms.Tensor(w1_scale_param, ms.bfloat16), + name=w1_scale_name, requires_grad=False) + parameter_dict[w3_scale_name] = ms.Parameter(ms.Tensor(w3_scale_param, ms.bfloat16), + name=w3_scale_name, requires_grad=False) + + parameter_dict[w1_quant_zp] = ms.Parameter(ms.Tensor(w1_quant_zp_param, ms.bfloat16), + name=w1_quant_zp, requires_grad=False) + parameter_dict[w3_quant_zp] = ms.Parameter(ms.Tensor(w3_quant_zp_param, ms.bfloat16), + name=w3_quant_zp, requires_grad=False) + + parameter_dict[w1_quant_scale] = ms.Parameter(ms.Tensor(w1_quant_scale_param, ms.bfloat16), + name=w1_quant_scale, requires_grad=False) + parameter_dict[w3_quant_scale] = ms.Parameter(ms.Tensor(w3_quant_scale_param, ms.bfloat16), + name=w3_quant_scale, requires_grad=False) + + def smooth_quant_process_ffn_weight(self, src_hf_dir, layer_id, hf_weight_map, parameter_dict, layer_type): + """smooth_quant_process_ffn_weight""" + + ffn_concat = self.config.model.model_config.ffn_concat + w1_weight_name = f"model.layers.{layer_id}.{layer_type}.w1._layer.weight" + w1_weight_param, _ = self.get_safetensor_from_file(w1_weight_name, src_hf_dir, hf_weight_map, + is_split_param=True, + split_axis=0) + w1_bias_name = f"model.layers.{layer_id}.{layer_type}.w1._layer.matmul.quant_bias" + w1_bias_param, _ = self.get_safetensor_from_file(w1_bias_name, src_hf_dir, hf_weight_map, + is_split_param=True, + split_axis=0) + w1_scale_name = f"model.layers.{layer_id}.{layer_type}.w1._layer.matmul.dequant_scale" + w1_scale_param, 
_ = self.get_safetensor_from_file(w1_scale_name, src_hf_dir, hf_weight_map, + is_split_param=True, + split_axis=0) + + w1_quant_zp = f"model.layers.{layer_id}.{layer_type}.w1.quant_op.input_zp" + w1_quant_scale = f"model.layers.{layer_id}.{layer_type}.w1.quant_op.input_scale" + w1_quant_zp_param, _ = self.get_safetensor_from_file(w1_quant_zp, src_hf_dir, hf_weight_map) + w1_quant_scale_param, _ = self.get_safetensor_from_file(w1_quant_scale, src_hf_dir, hf_weight_map) + + w3_weight_name = f"model.layers.{layer_id}.{layer_type}.w3._layer.weight" + w3_weight_param, _ = self.get_safetensor_from_file(w3_weight_name, src_hf_dir, hf_weight_map, + is_split_param=True, + split_axis=0) + w3_bias_name = f"model.layers.{layer_id}.{layer_type}.w3._layer.matmul.quant_bias" + w3_bias_param, _ = self.get_safetensor_from_file(w3_bias_name, src_hf_dir, hf_weight_map, + is_split_param=True, + split_axis=0) + w3_scale_name = f"model.layers.{layer_id}.{layer_type}.w3._layer.matmul.dequant_scale" + w3_scale_param, _ = self.get_safetensor_from_file(w3_scale_name, src_hf_dir, hf_weight_map, + is_split_param=True, + split_axis=0) + + w3_quant_zp = f"model.layers.{layer_id}.{layer_type}.w3.quant_op.input_zp" + w3_quant_scale = f"model.layers.{layer_id}.{layer_type}.w3.quant_op.input_scale" + w3_quant_zp_param, _ = self.get_safetensor_from_file(w3_quant_zp, src_hf_dir, hf_weight_map) + w3_quant_scale_param, _ = self.get_safetensor_from_file(w3_quant_scale, src_hf_dir, hf_weight_map) + if ffn_concat: + concat_weight_name = f"model.layers.{layer_id}.{layer_type}.w_gate_hidden._layer.weight" + concat_weight_param = ms.Tensor(np.concatenate([w1_weight_param, w3_weight_param], axis=0), dtype=ms.int8) + parameter_dict[concat_weight_name] = ms.Parameter(concat_weight_param, name=concat_weight_name, + requires_grad=False) + + concat_bias_name = f"model.layers.{layer_id}.{layer_type}.w_gate_hidden._layer.matmul.quant_bias" + concat_bias_param = ms.Tensor(np.concatenate([w1_bias_param, w3_bias_param], axis=0), dtype=ms.int32) + parameter_dict[concat_bias_name] = ms.Parameter(concat_bias_param, name=concat_bias_name, + requires_grad=False) + + concat_scale_name = f"model.layers.{layer_id}.{layer_type}.w_gate_hidden._layer.matmul.dequant_scale" + concat_scale_param = ms.Tensor(np.concatenate([w1_scale_param, w3_scale_param], axis=0), dtype=ms.float32) + parameter_dict[concat_scale_name] = ms.Parameter(concat_scale_param, name=concat_scale_name, + requires_grad=False) + + concat_quant_zp_name = f"model.layers.{layer_id}.{layer_type}.w_gate_hidden.quant_op.input_zp" + concat_quant_zp_param = ms.Tensor(w1_quant_zp_param, dtype=ms.int8) + parameter_dict[concat_quant_zp_name] = ms.Parameter(concat_quant_zp_param, name=concat_quant_zp_name, + requires_grad=False) + + concat_quant_scale_name = f"model.layers.{layer_id}.{layer_type}.w_gate_hidden.quant_op.input_scale" + concat_quant_scale_param = ms.Tensor(w1_quant_scale_param, dtype=ms.bfloat16) + parameter_dict[concat_quant_scale_name] = ms.Parameter(concat_quant_scale_param, + name=concat_quant_scale_name, + requires_grad=False) + else: + # w1 w3 + parameter_dict[w1_weight_name] = ms.Parameter(ms.Tensor(w1_weight_param, ms.int8), name=w1_weight_name, + requires_grad=False) + parameter_dict[w3_weight_name] = ms.Parameter(ms.Tensor(w3_weight_param, ms.int8), name=w3_weight_name, + requires_grad=False) + + parameter_dict[w1_bias_name] = ms.Parameter(ms.Tensor(w1_bias_param, ms.int32), + name=w1_bias_name, requires_grad=False) + parameter_dict[w3_bias_name] = 
ms.Parameter(ms.Tensor(w3_bias_param, ms.int32), + name=w3_bias_name, requires_grad=False) + + parameter_dict[w1_scale_name] = ms.Parameter(ms.Tensor(w1_scale_param, ms.float32), + name=w1_scale_name, requires_grad=False) + parameter_dict[w3_scale_name] = ms.Parameter(ms.Tensor(w3_scale_param, ms.float32), + name=w3_scale_name, requires_grad=False) + + parameter_dict[w1_quant_zp] = ms.Parameter(ms.Tensor(w1_quant_zp_param, ms.int8), + name=w1_quant_zp, requires_grad=False) + parameter_dict[w3_quant_zp] = ms.Parameter(ms.Tensor(w3_quant_zp_param, ms.int8), + name=w3_quant_zp, requires_grad=False) + + parameter_dict[w1_quant_scale] = ms.Parameter(ms.Tensor(w1_quant_scale_param, ms.bfloat16), + name=w1_quant_scale, requires_grad=False) + parameter_dict[w3_quant_scale] = ms.Parameter(ms.Tensor(w3_quant_scale_param, ms.bfloat16), + name=w3_quant_scale, requires_grad=False) + + def smooth_quant_process_qkv_weight(self, src_hf_dir, layer_id, hf_weight_map, parameter_dict): + '''smooth_quant_process_qkv_weight''' + qkv_concat = self.config.model.model_config.qkv_concat + # q2l_proj + q2l_weight_name = f"model.layers.{layer_id}.attention.q2l_proj._layer.weight" + q2l_weight_param, _ = self.get_safetensor_from_file(q2l_weight_name, src_hf_dir, hf_weight_map) + q2l_bias_name = f"model.layers.{layer_id}.attention.q2l_proj._layer.matmul.quant_bias" + q2l_bias_param, _ = self.get_safetensor_from_file(q2l_bias_name, src_hf_dir, hf_weight_map) + q2l_scale_name = f"model.layers.{layer_id}.attention.q2l_proj._layer.matmul.dequant_scale" + q2l_scale_param, _ = self.get_safetensor_from_file(q2l_scale_name, src_hf_dir, hf_weight_map) + + q2l_quant_zp = f"model.layers.{layer_id}.attention.q2l_proj.quant_op.input_zp" + q2l_quant_scale = f"model.layers.{layer_id}.attention.q2l_proj.quant_op.input_scale" + q2l_quant_zp_param, _ = self.get_safetensor_from_file(q2l_quant_zp, src_hf_dir, hf_weight_map) + q2l_quant_scale_param, _ = self.get_safetensor_from_file(q2l_quant_scale, src_hf_dir, hf_weight_map) + + kv2l_weight_name = f"model.layers.{layer_id}.attention.kv2l._layer.weight" + kv2l_weight_param, _ = self.get_safetensor_from_file(kv2l_weight_name, src_hf_dir, hf_weight_map) + kv2l_bias_name = f"model.layers.{layer_id}.attention.kv2l._layer.matmul.quant_bias" + kv2l_bias_param, _ = self.get_safetensor_from_file(kv2l_bias_name, src_hf_dir, hf_weight_map) + kv2l_scale_name = f"model.layers.{layer_id}.attention.kv2l._layer.matmul.dequant_scale" + kv2l_scale_param, _ = self.get_safetensor_from_file(kv2l_scale_name, src_hf_dir, hf_weight_map) + + kv2l_quant_zp = f"model.layers.{layer_id}.attention.kv2l.quant_op.input_zp" + kv2l_quant_scale = f"model.layers.{layer_id}.attention.kv2l.quant_op.input_scale" + kv2l_quant_zp_param, _ = self.get_safetensor_from_file(kv2l_quant_zp, src_hf_dir, hf_weight_map) + kv2l_quant_scale_param, _ = self.get_safetensor_from_file(kv2l_quant_scale, src_hf_dir, hf_weight_map) + + if qkv_concat: + qkv2l_weight_name = f"model.layers.{layer_id}.attention.qkv2l._layer.weight" + qkv2l_bias_name = f"model.layers.{layer_id}.attention.qkv2l._layer.matmul.quant_bias" + qkv2l_scale_name = f"model.layers.{layer_id}.attention.qkv2l._layer.matmul.dequant_scale" + qkv2l_quant_zp_name = f"model.layers.{layer_id}.attention.qkv2l.quant_op.input_zp" + qkv2l_quant_scale_name = f"model.layers.{layer_id}.attention.qkv2l.quant_op.input_scale" + + qkv2l_weight = np.concatenate((q2l_weight_param, kv2l_weight_param), 0) + parameter_dict[qkv2l_weight_name] = ms.Parameter(ms.Tensor(qkv2l_weight, ms.int8), 
name=qkv2l_weight_name, + requires_grad=False) + qkv2l_bias = np.concatenate((q2l_bias_param, kv2l_bias_param), 0) + parameter_dict[qkv2l_bias_name] = ms.Parameter(ms.Tensor(qkv2l_bias, ms.int32), name=qkv2l_bias_name, + requires_grad=False) + qkv2l_scale = np.concatenate((q2l_scale_param, kv2l_scale_param), 0) + parameter_dict[qkv2l_scale_name] = ms.Parameter(ms.Tensor(qkv2l_scale, ms.float32), name=qkv2l_scale_name, + requires_grad=False) + parameter_dict[qkv2l_quant_zp_name] = ms.Parameter(ms.Tensor(q2l_quant_zp_param, ms.int8), + name=qkv2l_quant_zp_name, requires_grad=False) + parameter_dict[qkv2l_quant_scale_name] = ms.Parameter(ms.Tensor(q2l_quant_scale_param, ms.bfloat16), + name=qkv2l_quant_scale_name, requires_grad=False) + else: + parameter_dict[q2l_weight_name] = ms.Parameter(ms.Tensor(q2l_weight_param, ms.int8), name=q2l_weight_name, + requires_grad=False) + parameter_dict[kv2l_weight_name] = ms.Parameter(ms.Tensor(kv2l_weight_param, ms.int8), + name=kv2l_weight_name, requires_grad=False) + parameter_dict[q2l_bias_name] = ms.Parameter(ms.Tensor(q2l_bias_param, ms.int32), name=q2l_bias_name, + requires_grad=False) + parameter_dict[kv2l_bias_name] = ms.Parameter(ms.Tensor(kv2l_bias_param, ms.int32), name=kv2l_bias_name, + requires_grad=False) + parameter_dict[q2l_scale_name] = ms.Parameter(ms.Tensor(q2l_scale_param, ms.float32), name=q2l_scale_name, + requires_grad=False) + parameter_dict[kv2l_scale_name] = ms.Parameter(ms.Tensor(kv2l_scale_param, ms.float32), + name=kv2l_scale_name, requires_grad=False) + parameter_dict[q2l_quant_zp] = ms.Parameter(ms.Tensor(q2l_quant_zp_param, ms.int8), name=q2l_quant_zp, + requires_grad=False) + parameter_dict[kv2l_quant_zp] = ms.Parameter(ms.Tensor(kv2l_quant_zp_param, ms.int8), name=kv2l_quant_zp, + requires_grad=False) + parameter_dict[q2l_quant_scale] = ms.Parameter(ms.Tensor(q2l_quant_scale_param, ms.bfloat16), + name=q2l_quant_scale, requires_grad=False) + parameter_dict[kv2l_quant_scale] = ms.Parameter(ms.Tensor(kv2l_quant_scale_param, ms.bfloat16), + name=kv2l_quant_scale, requires_grad=False) + + def infer_smooth_quant_row_linear_split(self, param_name, src_hf_dir, hf_weight_map): + '''infer_smooth_quant_row_linear_split''' + if param_name.endswith(".weight"): + value, _ = self.get_safetensor_from_file(param_name, src_hf_dir, + hf_weight_map, is_split_param=True, + split_axis=1) + elif "quant_op" in param_name: + value, _ = self.get_safetensor_from_file(param_name, src_hf_dir, + hf_weight_map, is_split_param=True, + split_axis=0) + else: + value, _ = self.get_safetensor_from_file(param_name, src_hf_dir, + hf_weight_map) + if "wo._layer.matmul.quant_bias" in param_name and get_tensor_model_parallel_rank() != 0: + value.fill(0) + return value + + def infer_smooth_quant_get_value(self, param_name, src_hf_dir, hf_weight_map, no_need_split_layer): + '''infer_smooth_quant_get_value''' + + if any([name in param_name for name in no_need_split_layer]): + value, _ = self.get_safetensor_from_file(param_name, src_hf_dir, + hf_weight_map) + elif any([name in param_name for name in [".l2q_proj."]]): + if param_name.endswith(".weight") or "matmul" in param_name: + value, _ = self.get_safetensor_from_file(param_name, src_hf_dir, + hf_weight_map, is_split_param=True, + split_axis=0) + else: + value, _ = self.get_safetensor_from_file(param_name, src_hf_dir, + hf_weight_map) + elif any([name in param_name for name in [".feed_forward.w2.", ".wo.", "shared_experts.w2"]]): + value = self.infer_smooth_quant_row_linear_split(param_name, src_hf_dir, 
hf_weight_map) + elif ".routed_experts.ffn.w2" in param_name: + if param_name.endswith(".weight"): + value, _ = self.get_safetensor_from_file(param_name, src_hf_dir, hf_weight_map, + is_split_param=True, split_axis=1) + else: + value, _ = self.get_safetensor_from_file(param_name, src_hf_dir, + hf_weight_map) + elif any([name in param_name for name in ["lkv2kv_k_nope", "lkv2kv_v"]]): + value, _ = self.get_safetensor_from_file(param_name, src_hf_dir, hf_weight_map, + is_split_param=True, split_axis=0) + elif "lm_head" in param_name: + if not self.config.parallel_config.vocab_emb_dp: + value, _ = self.get_safetensor_from_file(param_name, src_hf_dir, hf_weight_map, + is_split_param=True, split_axis=0) + else: + value, _ = self.get_safetensor_from_file(param_name, src_hf_dir, hf_weight_map) + else: + raise ValueError(f"not found layer {param_name}, please check safetensors file.") + return value + + def infer_smooth_quant_net_ms_convert_layer_weight(self, src_hf_dir, num_layers, hf_weight_map): + '''infer_smooth_quant_net_ms_convert_layer_weight''' + parameter_dict = {} + + no_need_split_layer = ["tok_embeddings", "norm", "routed_experts.router.dense", + "routed_experts.router.e_score_correction_bias", + "topk_bias"] + for layer_id in tqdm(range(num_layers), desc="qkv/ffn params load"): + if layer_id >= 3: + self.smooth_quant_process_route_ffn_weight(src_hf_dir, layer_id, hf_weight_map, parameter_dict, + "feed_forward.routed_experts.ffn") + self.smooth_quant_process_ffn_weight(src_hf_dir, layer_id, hf_weight_map, parameter_dict, + "feed_forward.shared_experts") + + else: + self.smooth_quant_process_ffn_weight(src_hf_dir, layer_id, hf_weight_map, parameter_dict, + "feed_forward") + self.smooth_quant_process_qkv_weight(src_hf_dir, layer_id, hf_weight_map, parameter_dict) + + skip_layer = ["feed_forward.routed_experts.ffn.w1", "feed_forward.shared_experts.w1", "feed_forward.w1", + "feed_forward.routed_experts.ffn.w3", "feed_forward.shared_experts.w3", "feed_forward.w3", + "feed_forward.routed_experts.ffn.w_gate_hidden", "feed_forward.shared_experts.w_gate_hidden", + "feed_forward.w_gate_hidden", "attention.kv2l", "attention.q2l_proj", "attention.qkv2l"] + + for param_name, _ in tqdm(hf_weight_map.items(), desc="remaining params load"): + if "model.layers" in param_name and int(param_name.split('.')[2]) >= num_layers: + continue + + if any([name in param_name for name in skip_layer]): + continue + + value = self.infer_smooth_quant_get_value(param_name, src_hf_dir, hf_weight_map, no_need_split_layer) + dst_dtype = convert_np_to_ms_dtype(value) + + parameter_dict[param_name] = ms.Parameter(ms.Tensor(value, dtype=dst_dtype), + name=param_name, requires_grad=False) + + param_not_load, ckpt_not_load = ms.load_param_into_net(self.network, parameter_dict) + print(f"smoothquant param_not_load:{param_not_load}") + print(f"smoothquant ckpt_not_load:{ckpt_not_load}") + + def infer_gptq_quant_net_ms_convert_layer_weight(self, src_hf_dir, num_layers, hf_weight_map): + """infer_gptq_quant_net_ms_convert_layer_weight""" + parameter_dict = {} + + no_need_split_layer = ["tok_embeddings", "norm", "q2l_proj", + "kv2l", "routed_experts.router.dense", + "routed_experts.router.e_score_correction_bias", + "topk_bias"] + + for param_name, _ in tqdm(hf_weight_map.items(), desc="split safetensors"): + if "model.layers" in param_name and int(param_name.split('.')[2]) >= num_layers: + continue + + if any([name in param_name for name in no_need_split_layer]): + value, is_int4 = self.get_safetensor_from_file(param_name, 
src_hf_dir, + hf_weight_map) + elif any([name in param_name for name in [".l2q_proj.", ".feed_forward.w_gate_hidden.", + "shared_experts.w_gate_hidden"]]): + value, is_int4 = self.get_safetensor_from_file(param_name, src_hf_dir, + hf_weight_map, is_split_param=True, + split_axis=1) + elif any([name in param_name for name in [".feed_forward.w2.", ".wo.", + "shared_experts.w2"]]): + value, is_int4 = self.get_safetensor_from_file(param_name, src_hf_dir, + hf_weight_map, is_split_param=True, + split_axis=0) + elif ".routed_experts.ffn.w_gate_hidden." in param_name: + value, is_int4 = self.get_safetensor_from_file(param_name, src_hf_dir, hf_weight_map) + value_list = [] + for experts_id in range(value.shape[0]): + value_list.append(self.split_weight_by_rank(value[experts_id, :, :], split_axis=1)) + value = np.stack(value_list, axis=0) + elif ".routed_experts.ffn.w2" in param_name: + value, is_int4 = self.get_safetensor_from_file(param_name, src_hf_dir, hf_weight_map) + value_list = [] + for experts_id in range(value.shape[0]): + value_list.append(self.split_weight_by_rank(value[experts_id, :, :], split_axis=0)) + value = np.stack(value_list, axis=0) + elif any([name in param_name for name in ["lkv2kv_k_nope", "lkv2kv_v"]]): + value, is_int4 = self.get_safetensor_from_file(param_name, src_hf_dir, hf_weight_map, + is_split_param=True, split_axis=0) + elif "lm_head" in param_name: + if not self.config.parallel_config.vocab_emb_dp: + value, is_int4 = self.get_safetensor_from_file(param_name, src_hf_dir, hf_weight_map, + is_split_param=True, split_axis=0) + else: + value, is_int4 = self.get_safetensor_from_file(param_name, src_hf_dir, hf_weight_map) + else: + raise ValueError(f"not found layer {param_name}, please check safetensors file.") + + dst_dtype = convert_np_to_ms_dtype(value) + if is_int4: + parameter_dict[param_name] = ms.Parameter(ms.Tensor(value, dtype=dtype.qint4x2), + name=param_name, requires_grad=False) + else: + parameter_dict[param_name] = ms.Parameter(ms.Tensor(value, dtype=dst_dtype), + name=param_name, requires_grad=False) + _, _ = ms.load_param_into_net(self.network, parameter_dict) + + def load_safetensors_shard(self, src_hf_dir, is_mtp_model=False): + """deepseek load safetensors and shard """ + rank_id = get_rank() + param_json_path = "" + + for file in os.listdir(src_hf_dir): + if file.endswith('index.json'): + # mtp model do not support quantization, needs to load bf16 weight. 
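+                # Descriptive note on the selection below: the '*quant*' index file is picked only when the
+                # main network itself is quantized; a float model (or the MTP load) falls back to the bf16 index.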
+ if ('quant' in file and self.is_quant) or \ + ('quant' not in file and (not self.is_quant or is_mtp_model)): + param_json_path = os.path.join(src_hf_dir, file) + with open(param_json_path, "r") as fp: + hf_weight_map = json.load(fp)['weight_map'] + break + elif file.endswith('_name_map.json'): + param_json_path = os.path.join(src_hf_dir, file) + with open(param_json_path, "r") as fp: + hf_weight_map = json.load(fp) + if hf_weight_map.get('weight_map'): + hf_weight_map = hf_weight_map['weight_map'] + break + + if not param_json_path: + raise ValueError(f"Not found param_json_path in {src_hf_dir}") + + quantization_config = self.config.model.model_config.quantization_config + quant_method = quantization_config.quant_method if quantization_config else None + if not quant_method or (quant_method != "gptq-pergroup" and quant_method != "smoothquant") and \ + not is_mtp_model: + self.infer_convert_outer_weight(src_hf_dir, hf_weight_map) + + if quant_method and quant_method == "gptq-pergroup": + self.infer_gptq_quant_net_ms_convert_layer_weight(src_hf_dir, self.num_layers, hf_weight_map) + return + if quant_method and quant_method == "smoothquant": + self.infer_smooth_quant_net_ms_convert_layer_weight(src_hf_dir, self.num_layers, hf_weight_map) + return + + enable_tqdm = rank_id == 0 + mtp_layers = self.config.model.model_config.num_nextn_predict_layers + start_layer = 0 if not is_mtp_model else self.num_layers + end_layer = self.num_layers if not is_mtp_model else self.num_layers + mtp_layers + for layer_id in tqdm(range(start_layer, end_layer), desc="Weight loading", disable=not enable_tqdm): + if self.is_quant: + self.infer_quant_net_convert_layer_weight(src_hf_dir, layer_id, hf_weight_map) + else: + self.infer_convert_layer_weight(src_hf_dir, layer_id, hf_weight_map) + + ms.load_param_into_net(self.network, self.parameter_dict) + del self.parameter_dict + gc.collect() diff --git a/vllm_mindspore/model_executor/models/mf_models/mf_model_base.py b/vllm_mindspore/model_executor/models/mf_models/mf_model_base.py new file mode 100644 index 0000000000000000000000000000000000000000..893d91a51d9fd6a1268017cd468c1d1878c0b110 --- /dev/null +++ b/vllm_mindspore/model_executor/models/mf_models/mf_model_base.py @@ -0,0 +1,184 @@ +#!/usr/bin/env python3 +# encoding: utf-8 +# Copyright 2025 Huawei Technologies Co., Ltd +# Copyright 2024 The vLLM team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================ + +import os +from types import MethodType +from typing import Iterable, List, Optional, Set, Tuple, Union +from abc import abstractmethod +import numpy as np + +from vllm.attention import AttentionMetadata +from vllm.config import VllmConfig +from vllm.model_executor.layers.sampler import SamplerOutput +from vllm.model_executor.sampling_metadata import SamplingMetadata +from vllm.sequence import IntermediateTensors +from vllm.distributed import get_tensor_model_parallel_world_size +from vllm.logger import init_logger + +import torch +import mindspore as ms +from mindspore import Tensor, mutable +from mindspore.common.api import _pynative_executor + +from mindformers.tools.register.config import MindFormerConfig +from mindformers.core.context import build_mf_context +from mindformers.core.parallel_config import build_parallel_config + +from vllm_mindspore.model_executor.models.model_base import MsModelBase +from vllm_mindspore.model_executor.models.mf_models.attention_mask import LowerTriangularMask + +logger = init_logger(__name__) + + +def _pad_to_max(x, max_len): + return x + [-1] * (max_len - len(x)) + + +def _batch_seq(input_tokens, prefill): + if prefill: + return ms.ops.expand_dims(input_tokens, 0).to(ms.int32) + + return ms.mint.reshape(input_tokens, (-1, 1)).to(ms.int32) + + +class MfModelBase(MsModelBase): + def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None: + super(MfModelBase, self).__init__( + vllm_config=vllm_config, prefix=prefix + ) + + self.mf_config = MindFormerConfig(os.getenv("MINDFORMERS_MODEL_CONFIG")) + build_mf_context(self.mf_config) + build_parallel_config(self.mf_config) + self.mf_config.model.model_config.parallel_config = ( + self.mf_config.parallel_config + ) + self.mf_config.model.model_config.parallel_config.model_parallel = ( + get_tensor_model_parallel_world_size() + ) + self.mf_config.model.model_config.parallel_config.pipeline_stage = 1 + self._generate_model_config() + self.casual_mask = LowerTriangularMask(mf_model_config=self.mf_model_config) + self.network, self.lm_head = self._create_network() + affinity_config = self.mf_config.get('context', {}).get('affinity_cpu_list', {}) + if isinstance(affinity_config, dict): + ms.runtime.set_cpu_affinity(True, affinity_config) + + @abstractmethod + def _generate_model_config(self): + raise NotImplementedError("Function _generate_model_config should be Implemented!") + + @abstractmethod + def _create_network(self): + raise NotImplementedError("Function _create_network should be Implemented!") + + + def prepare_inputs(self, input_ids, positions, attn_metadata): + key_cache, value_cache = self.get_kvcache() + seq_lens = attn_metadata.seq_lens + max_query_len = attn_metadata.max_query_len + # When Mutli-Step is enabled with Chunked-Prefill, prefills and + # decodes are scheduled together. In the first step, all the + # prefills turn into decodes and max_query_len will be 1. 
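+        # At that point every scheduled request behaves as a decode, so query_lens is
+        # rebuilt as all ones instead of being taken from attn_metadata.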
+ if self.is_multi_step_chunked_prefill and max_query_len == 1: + query_lens = [1] * len(seq_lens) + else: + query_lens = attn_metadata.query_lens + + seq_lens_np = np.array(seq_lens, dtype=np.int32) + query_lens_np = np.array(query_lens, dtype=np.int32) + kv_cache_lens = seq_lens_np - query_lens_np + if attn_metadata.num_decode_tokens == 0 and kv_cache_lens.max() == 0: + is_prefill = True + else: + is_prefill = False + + q_seq_lens = ms.Tensor(query_lens_np, dtype=ms.int32) + position_ids = ms.Tensor(positions, dtype=ms.int32) + attention_mask = self.casual_mask.gen_attention_mask(is_prefill, position_ids, query_lens) + + model_inputs = {} + model_inputs["input_ids"] = _batch_seq(input_ids, is_prefill) + model_inputs["batch_valid_length"] = ms.Tensor.from_numpy(np.expand_dims(seq_lens_np, 0)) + model_inputs["block_tables"] = attn_metadata.block_tables + model_inputs["slot_mapping"] = attn_metadata.slot_mapping + model_inputs["position_ids"] = position_ids + model_inputs["q_seq_lens"] = q_seq_lens + model_inputs["attention_mask"] = attention_mask + model_inputs["key_cache"] = key_cache + model_inputs["value_cache"] = value_cache + + return model_inputs, is_prefill + + def update_model_inputs(self, model_inputs, **kwargs): + return model_inputs + + def forward( + self, + input_ids: Tensor, + positions: Tensor, + kv_caches: List[Tensor], + attn_metadata: AttentionMetadata, + intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[Tensor] = None, + **kwargs + ) -> Union[Tensor, IntermediateTensors]: + model_inputs, is_prefill = self.prepare_inputs(input_ids, positions, attn_metadata) + model_inputs = self.update_model_inputs(model_inputs, **kwargs) + + if is_prefill: + self.network.phase = "prefill" + if not self.set_flags: + self.network.add_flags_custom(is_first_iteration=True) + hidden_states = self.network(**model_inputs) + self.network.phase = "increment" + if not self.set_flags: + self.network.add_flags_custom(is_first_iteration=False) + self.set_flags = True + else: + hidden_states = self.network(**model_inputs) + + return hidden_states + + def compute_logits( + self, + hidden_states: Tensor, + sampling_metadata: SamplingMetadata, + ) -> Optional[Tensor]: + selected_token_indices = sampling_metadata.selected_token_indices + if selected_token_indices is not None and selected_token_indices.numel() <= 0: + logits = ms.mint.zeros((0, self.mf_model_config.vocab_size), + dtype=self.mf_model_config.compute_dtype) + else: + hidden_states = hidden_states.index_select(0, selected_token_indices) + logits = self.lm_head(hidden_states) + logits = logits.reshape(-1, logits.shape[-1]) + + return logits + + def sample( + self, + logits: Tensor, + sampling_metadata: SamplingMetadata, + ) -> Optional[SamplerOutput]: + next_tokens = self.sampler(logits, sampling_metadata) + _pynative_executor.sync() + return next_tokens + + def load_weights(self, weights: Iterable[Tuple[str, Tensor]]) -> Set[str]: + raise NotImplementedError("load_weight not implemented.") diff --git a/vllm_mindspore/model_executor/models/mf_models/qwen2.py b/vllm_mindspore/model_executor/models/mf_models/qwen2.py index afde4466a424e3c07d55ebfcd6f528ec88c9afb8..18a865c14b4d82ee3f50a02710526e79066a8507 100644 --- a/vllm_mindspore/model_executor/models/mf_models/qwen2.py +++ b/vllm_mindspore/model_executor/models/mf_models/qwen2.py @@ -1,6 +1,7 @@ #!/usr/bin/env python3 # encoding: utf-8 # Copyright 2025 Huawei Technologies Co., Ltd +# Copyright 2024 The vLLM team. 
# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -15,194 +16,70 @@ # limitations under the License. # ============================================================================ -import os -from typing import Iterable, List, Optional, Set, Tuple, Union +from typing import Iterable, Set, Tuple -import numpy as np - -from vllm.attention import AttentionMetadata from vllm.config import VllmConfig -from vllm.model_executor.layers.sampler import SamplerOutput -from vllm.model_executor.sampling_metadata import SamplingMetadata -from vllm.sequence import IntermediateTensors -from vllm.distributed import get_tensor_model_parallel_world_size +from vllm.config import get_current_vllm_config from vllm.logger import init_logger - -from mindformers.tools.register.config import MindFormerConfig - -from mindformers.core.context import build_context -from mindformers.core.parallel_config import build_parallel_config +from mindspore import Tensor, JitConfig +from mindspore.nn.utils import no_init_parameters from mindformers.models.llama import LlamaConfig as LlamaConfig_MF -from mindformers.trainer import BaseTrainer -from mindformers.tools.utils import set_output_path, set_strategy_save_path from research.qwen2_5.infer.qwen2_5 import ( ParallelQwenForCausalLM as ParallelQwenForCausalLM_MF, ) from vllm_mindspore.model_executor.layers.sampler import get_sampler -from vllm_mindspore.model_executor.models.model_base import MsModelBase -from vllm_mindspore.utils import cal_block_num - -import mindspore as ms -from mindspore import Tensor, JitConfig, Model -from mindformers.trainer.utils import transform_and_load_checkpoint +from vllm_mindspore.model_executor.models.model_base import Fake_Attention +from vllm_mindspore.model_executor.models.mf_models.mf_model_base import MfModelBase +from vllm_mindspore.model_executor.models.mf_models.qwen2_weight_processor import Qwen2WeightProcessor logger = init_logger(__name__) -def _pad_to_max(x, max_len): - return x + [-1] * (max_len - len(x)) - - -def _pad_block_table(block_tables, seq_length, block_size): - # When Prefill, the block_tables is a empty tensor. 
- if len(block_tables.shape) < 2: - fake_block_tables = ms.mint.empty( - 1, seq_length // block_size, dtype=ms.int32, device="Ascend" - ) - return fake_block_tables - - block_tables_list = block_tables.tolist() - padded_block_tables = [ - _pad_to_max(block_table, seq_length // block_size) - for block_table in block_tables_list - ] - - return Tensor(np.array(padded_block_tables).astype(np.int32)) - - -def _batch_seq(input_tokens, prefill): - if prefill: - return ms.ops.expand_dims(input_tokens, 0).to(ms.int32) - - return ms.mint.reshape(input_tokens, (-1, 1)).to(ms.int32) - - -class Qwen2ForCausalLM(MsModelBase): +class Qwen2ForCausalLM(MfModelBase): def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None: super(Qwen2ForCausalLM, self).__init__(vllm_config=vllm_config, prefix=prefix) - - self.mf_config = MindFormerConfig(os.getenv("MINDFORMERS_MODEL_CONFIG")) - build_context(self.mf_config, is_set_ms_ctx=False, is_init_ms=False) - build_parallel_config(self.mf_config) - self.mf_config.model.model_config.parallel_config = ( - self.mf_config.parallel_config - ) - self.mf_config.model.model_config.parallel_config.model_parallel = ( - get_tensor_model_parallel_world_size() - ) - self.mf_config.model.model_config.parallel_config.pipeline_stage = 1 - - self.mf_model_config = LlamaConfig_MF(**self.mf_config.model.model_config) - # Cannot get num_gpu_blocks from cache config now, calculate one first. - self.mf_model_config.num_blocks = cal_block_num( - self.cache_config, self.model_config, self.parallel_config - ) - self.mf_model_config.block_size = self.cache_config.block_size - if self.mf_config.moe_config: - self.mf_model_config.moe_config = self.mf_config.moe_config - - # Initial network - self.network = ParallelQwenForCausalLM_MF(self.mf_model_config) - self.network._jit_config_dict = JitConfig( - jit_level="O0", infer_boost="on" - ).jit_config_dict - - set_output_path(self.mf_config.output_dir) - set_strategy_save_path(self.mf_config.parallel) - # update safetensor path - ms_safetensors_path = BaseTrainer._get_load_path_after_hf_convert( - self.mf_config, self.network - ) - self.mf_config.load_checkpoint = ms_safetensors_path - self.mf_kvcaches_init = False - self.logits = None self.sampler = get_sampler() self.set_modules({"model": self.network}) - def update_mf_kvcaches(self, kv_caches): - if self.mf_kvcaches_init: - return + self.kv_caches = [Fake_Attention() for i in range(self.mf_model_config.num_layers)] + compilation_config = get_current_vllm_config().compilation_config + if prefix in compilation_config.static_forward_context: + raise ValueError(f"Duplicate layer name: {prefix}") for i in range(self.mf_model_config.num_layers): - k_cache, v_cache = kv_caches[i] - mf_k_cache, mf_v_cache = self.network.kvcache(i) - mf_k_cache.set_device_address( - k_cache._data_ptr(), k_cache.shape, k_cache.dtype - ) - mf_v_cache.set_device_address( - v_cache._data_ptr(), v_cache.shape, v_cache.dtype - ) - self.mf_kvcaches_init = True - - def forward( - self, - input_ids: Tensor, - positions: Tensor, - kv_caches: List[Tensor], - attn_metadata: AttentionMetadata, - intermediate_tensors: Optional[IntermediateTensors] = None, - inputs_embeds: Optional[Tensor] = None, - ) -> Union[Tensor, IntermediateTensors]: - self.update_mf_kvcaches(kv_caches) - - is_prefill = True if attn_metadata.prefill_metadata else False - - self.logits = None - - model_inputs = {} - model_inputs["input_ids"] = _batch_seq(input_ids, is_prefill) - model_inputs["batch_valid_length"] = ms.ops.expand_dims( - 
attn_metadata.seq_lens_tensor, 0 - ) - model_inputs["block_tables"] = _pad_block_table( - attn_metadata.block_tables, - self.mf_model_config.seq_length, - self.mf_model_config.block_size, - ) - model_inputs["slot_mapping"] = attn_metadata.slot_mapping - - if is_prefill: - self.network.phase = "prefill" - self.network.add_flags_custom(is_first_iteration=True) - self.logits = self.network(**model_inputs) - self.network.phase = "increment" - self.network.add_flags_custom(is_first_iteration=False) - else: - self.logits = self.network(**model_inputs) + compilation_config.static_forward_context[str(i)] = self.kv_caches[i] - return None + self.set_flags = False - def compute_logits( - self, - hidden_states: Tensor, - sampling_metadata: SamplingMetadata, - ) -> Optional[Tensor]: - return self.logits - - def sample( - self, - logits: Tensor, - sampling_metadata: SamplingMetadata, - ) -> Optional[SamplerOutput]: - next_tokens = self.sampler(logits, sampling_metadata) - return next_tokens + def _generate_model_config(self): + self.mf_config.load_checkpoint = self.get_model_path() + self.mf_model_config = LlamaConfig_MF(**self.mf_config.model.model_config) + if self.mf_config.moe_config: + self.mf_model_config.moe_config = self.mf_config.moe_config + self.mf_model_config.return_hidden_states = True + + # qwen qkv concat will be supported in a future version + self.mf_model_config.qkv_concat = False + setattr(self.mf_model_config, 'npu_mem_size', -1) + self.mf_config.model.model_config.qkv_concat = False + + def _create_network(self): + # Initial network + with no_init_parameters(): # Delay initialization + network = ParallelQwenForCausalLM_MF(self.mf_model_config) + return network, network.lm_head def load_weights(self, weights: Iterable[Tuple[str, Tensor]]) -> Set[str]: - model = Model(self.network) - batch_size = self.mf_config.model.model_config.batch_size - seq_length = self.mf_config.model.model_config.seq_length - input_ids = np.ones(shape=tuple([batch_size, seq_length])) - infer_data = self.network.prepare_inputs_for_predict_layout(input_ids) - transform_and_load_checkpoint( - self.mf_config, model, self.network, infer_data, do_predict=True - ) + weight_processor = Qwen2WeightProcessor(self.mf_config, self.network, False) + weight_processor.load_safetensors_shard(self.mf_config.load_checkpoint) self.network.set_dynamic_inputs() - + dynamic_hidden_states = Tensor(shape=[None, None], dtype=self.mf_model_config.compute_dtype) + self.lm_head.set_inputs(dynamic_hidden_states) return None diff --git a/vllm_mindspore/model_executor/models/mf_models/qwen2_weight_processor.py b/vllm_mindspore/model_executor/models/mf_models/qwen2_weight_processor.py new file mode 100644 index 0000000000000000000000000000000000000000..59423eca036f17d4c1739735dc2ffe74c7d96b21 --- /dev/null +++ b/vllm_mindspore/model_executor/models/mf_models/qwen2_weight_processor.py @@ -0,0 +1,267 @@ +# Copyright 2025 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
+# ============================================================================ + +""" +Transform HuggingFace model weights into MindSpore safetensors. +""" +import os +import json +import gc +import numpy as np +from tqdm import tqdm +from safetensors import safe_open +import mindspore as ms +from mindspore.communication.management import get_rank + +from vllm_mindspore.model_executor.models.mf_models.weight_processor import BaseWeightProcessor + + +class Qwen2WeightProcessor(BaseWeightProcessor): + r""" + Provides Qwen2 model weight loading and sharding. + Args: + config (Qwen2Config): The config of Qwen2 model. + network (InferenceQwen2ForCausalLM): The network of Qwen2. + + """ + + def __init__(self, config, network, is_quant): + super().__init__(config, network, is_quant) + + def infer_convert_outer_weight(self, src_hf_dir, hf_weight_map): + """convert weights that are not part of the decoder layers""" + embed_tokens_hf_name = "model.embed_tokens.weight" + embed_tokens_ms_name = self.convert_weight_name(embed_tokens_hf_name) + if self.config.parallel_config.vocab_emb_dp: + np_data, _ = self.get_safetensor_from_file(embed_tokens_hf_name, src_hf_dir, hf_weight_map) + else: + np_data, _ = self.get_safetensor_from_file(embed_tokens_hf_name, src_hf_dir, hf_weight_map, + is_split_param=True, split_axis=0) + self.parameter_dict[embed_tokens_ms_name] = ms.Parameter(ms.from_numpy(np_data).astype(ms.bfloat16), + name=embed_tokens_ms_name, + requires_grad=False) + + norm_hf_name = "model.norm.weight" + norm_ms_name = self.convert_weight_name(norm_hf_name) + np_data, _ = self.get_safetensor_from_file(norm_hf_name, src_hf_dir, hf_weight_map) + self.parameter_dict[norm_ms_name] = ms.Parameter(ms.from_numpy(np_data).astype(ms.bfloat16), + name=norm_ms_name, + requires_grad=False) + + lm_head_hf_name = "lm_head.weight" + lm_head_ms_name = self.convert_weight_name(lm_head_hf_name) + if not self.config.model.model_config.tie_word_embeddings: + if not self.config.parallel_config.vocab_emb_dp: + np_data, _ = self.get_safetensor_from_file(lm_head_hf_name, src_hf_dir, hf_weight_map, + is_split_param=True, split_axis=0) + else: + np_data, _ = self.get_safetensor_from_file(lm_head_hf_name, src_hf_dir, hf_weight_map) + self.parameter_dict[lm_head_ms_name] = ms.Parameter(ms.from_numpy(np_data).astype(ms.bfloat16), + name=lm_head_ms_name, + requires_grad=False) + + def convert_weight_name(self, weight_name: str): + """map HuggingFace weight names to the MindFormers naming scheme""" + weight_name = weight_name.replace('embed_tokens.weight', 'tok_embeddings.embedding_weight') + weight_name = weight_name.replace('self_attn.q_proj.', 'attention.wq.') + weight_name = weight_name.replace('self_attn.k_proj.', 'attention.wk.') + weight_name = weight_name.replace('self_attn.v_proj.', 'attention.wv.') + weight_name = weight_name.replace('self_attn.o_proj.', 'attention.wo.') + + weight_name = weight_name.replace('mlp.gate_proj.', 'feed_forward.w1.') + weight_name = weight_name.replace('mlp.down_proj.', 'feed_forward.w2.') + weight_name = weight_name.replace('mlp.up_proj.', 'feed_forward.w3.') + weight_name = weight_name.replace('.input_layernorm.', '.attention_norm.') + weight_name = weight_name.replace('.post_attention_layernorm.', '.ffn_norm.') + weight_name = weight_name.replace('model.norm.weight', 'model.norm_out.weight') + return weight_name + + def infer_process_dense_ffn_weight(self, src_hf_dir, layer_id, hf_weight_map): + """infer process dense ffn weight""" + + ffn_concat = self.config.model.model_config.qkv_concat + w1_hf_name = f"model.layers.{layer_id}.mlp.gate_proj.weight" + w1_ms_name =
self.convert_weight_name(w1_hf_name) + w1_ms_param, _ = self.get_safetensor_from_file(w1_hf_name, src_hf_dir, hf_weight_map, is_split_param=True, + split_axis=0) + + w2_hf_name = f"model.layers.{layer_id}.mlp.down_proj.weight" + w2_ms_name = self.convert_weight_name(w2_hf_name) + w2_ms_param, _ = self.get_safetensor_from_file(w2_hf_name, src_hf_dir, hf_weight_map, is_split_param=True, + split_axis=1) + + w3_hf_name = f"model.layers.{layer_id}.mlp.up_proj.weight" + w3_ms_name = self.convert_weight_name(w3_hf_name) + w3_ms_param, _ = self.get_safetensor_from_file(w3_hf_name, src_hf_dir, hf_weight_map, is_split_param=True, + split_axis=0) + + if ffn_concat: + w_gate_hidden_name = f"model.layers.{layer_id}.feed_forward.w_gate_hidden.weight" + w_gate_hidden_param = np.concatenate((w1_ms_param, w3_ms_param), axis=0) + self.parameter_dict[w_gate_hidden_name] = ms.Parameter(w_gate_hidden_param, name=w_gate_hidden_name, + requires_grad=False) + else: + self.parameter_dict[w1_ms_name] = ms.Parameter(ms.from_numpy(w1_ms_param).astype(ms.bfloat16), + name=w1_ms_name, + requires_grad=False) + self.parameter_dict[w3_ms_name] = ms.Parameter(ms.from_numpy(w3_ms_param).astype(ms.bfloat16), + name=w3_ms_name, + requires_grad=False) + + self.parameter_dict[w2_ms_name] = ms.Parameter(ms.from_numpy(w2_ms_param).astype(ms.bfloat16), + name=w2_ms_name, + requires_grad=False) + + def infer_process_attention_weight(self, src_hf_dir, layer_id, hf_weight_map): + """infer process attention weight""" + qkv_concat = self.config.model.model_config.qkv_concat + # wq + wq_hf_name = f"model.layers.{layer_id}.self_attn.q_proj.weight" + wq_ms_name = self.convert_weight_name(wq_hf_name) + wq_ms_param, _ = self.get_safetensor_from_file(wq_hf_name, src_hf_dir, hf_weight_map, is_split_param=True, + split_axis=0) + # wq bias + wq_bias_hf_name = f"model.layers.{layer_id}.self_attn.q_proj.bias" + wq_bias_ms_name = self.convert_weight_name(wq_bias_hf_name) + wq_bias_ms_param, _ = self.get_safetensor_from_file(wq_bias_hf_name, src_hf_dir, hf_weight_map, + is_split_param=True, + split_axis=0) + + # wk + wk_hf_name = f"model.layers.{layer_id}.self_attn.k_proj.weight" + wk_ms_name = self.convert_weight_name(wk_hf_name) + wk_ms_param, _ = self.get_safetensor_from_file(wk_hf_name, src_hf_dir, hf_weight_map, is_split_param=True, + split_axis=0) + # wk bias + wk_bias_hf_name = f"model.layers.{layer_id}.self_attn.k_proj.bias" + wk_bias_ms_name = self.convert_weight_name(wk_bias_hf_name) + wk_bias_ms_param, _ = self.get_safetensor_from_file(wk_bias_hf_name, src_hf_dir, hf_weight_map, + is_split_param=True, + split_axis=0) + + # wv + wv_hf_name = f"model.layers.{layer_id}.self_attn.v_proj.weight" + wv_ms_name = self.convert_weight_name(wv_hf_name) + wv_ms_param, _ = self.get_safetensor_from_file(wv_hf_name, src_hf_dir, hf_weight_map, is_split_param=True, + split_axis=0) + # wv bias + wv_bias_hf_name = f"model.layers.{layer_id}.self_attn.v_proj.bias" + wv_bias_ms_name = self.convert_weight_name(wv_bias_hf_name) + wv_bias_ms_param, _ = self.get_safetensor_from_file(wv_bias_hf_name, src_hf_dir, hf_weight_map, + is_split_param=True, + split_axis=0) + + if qkv_concat: + w_qkv_name = f"model.layers.{layer_id}.attention.w_qkv.weight" + w_qkv_param = np.concatenate((wq_ms_param, wk_ms_param, wv_ms_param), axis=0) + w_qkv_param = ms.from_numpy(w_qkv_param).astype(ms.bfloat16) + self.parameter_dict[w_qkv_name] = ms.Parameter(w_qkv_param, name=w_qkv_name, requires_grad=False) + + w_qkv_bias_name = f"model.layers.{layer_id}.attention.w_qkv.bias" + 
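+ # Fuse the q/k/v biases in the same (q, k, v) order as the fused weight above so the concatenated projection stays aligned.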
w_qkv_bias_param = np.concatenate((wq_bias_ms_param, wk_bias_ms_param, wv_bias_ms_param), axis=0) + w_qkv_bias_param = ms.from_numpy(w_qkv_bias_param).astype(ms.bfloat16) + self.parameter_dict[w_qkv_bias_name] = ms.Parameter(w_qkv_bias_param, name=w_qkv_bias_name, + requires_grad=False) + else: + self.parameter_dict[wq_ms_name] = ms.Parameter(ms.from_numpy(wq_ms_param).astype(ms.bfloat16), + name=wq_ms_name, + requires_grad=False) + self.parameter_dict[wk_ms_name] = ms.Parameter(ms.from_numpy(wk_ms_param).astype(ms.bfloat16), + name=wk_ms_name, + requires_grad=False) + self.parameter_dict[wv_ms_name] = ms.Parameter(ms.from_numpy(wv_ms_param).astype(ms.bfloat16), + name=wv_ms_name, + requires_grad=False) + + self.parameter_dict[wq_bias_ms_name] = ms.Parameter( + ms.from_numpy(wq_bias_ms_param).astype(ms.bfloat16), + name=wq_bias_ms_name, + requires_grad=False) + self.parameter_dict[wk_bias_ms_name] = ms.Parameter( + ms.from_numpy(wk_bias_ms_param).astype(ms.bfloat16), + name=wk_bias_ms_name, + requires_grad=False) + self.parameter_dict[wv_bias_ms_name] = ms.Parameter( + ms.from_numpy(wv_bias_ms_param).astype(ms.bfloat16), + name=wv_bias_ms_name, + requires_grad=False) + + # wo + wo_hf_name = f"model.layers.{layer_id}.self_attn.o_proj.weight" + wo_ms_name = self.convert_weight_name(wo_hf_name) + wo_ms_param, _ = self.get_safetensor_from_file(wo_hf_name, src_hf_dir, hf_weight_map, is_split_param=True, + split_axis=1) + self.parameter_dict[wo_ms_name] = ms.Parameter(ms.from_numpy(wo_ms_param).astype(ms.bfloat16), + name=wo_ms_name, + requires_grad=False) + + def infer_process_norm_weight(self, src_hf_dir, layer_id, hf_weight_map): + """infer process norm weight""" + # attention_norm + attention_norm_hf_name = f"model.layers.{layer_id}.input_layernorm.weight" + attention_norm_ms_name = self.convert_weight_name(attention_norm_hf_name) + attention_norm_ms_param, _ = self.get_safetensor_from_file(attention_norm_hf_name, + src_hf_dir, + hf_weight_map) + self.parameter_dict[attention_norm_ms_name] = ms.Parameter( + ms.from_numpy(attention_norm_ms_param).astype(ms.bfloat16), + name=attention_norm_ms_name, + requires_grad=False) + + # ffn_norm + ffn_norm_hf_name = f"model.layers.{layer_id}.post_attention_layernorm.weight" + ffn_norm_ms_name = self.convert_weight_name(ffn_norm_hf_name) + ffn_norm_ms_param, _ = self.get_safetensor_from_file(ffn_norm_hf_name, src_hf_dir, hf_weight_map) + self.parameter_dict[ffn_norm_ms_name] = ms.Parameter( + ms.from_numpy(ffn_norm_ms_param).astype(ms.bfloat16), + name=ffn_norm_ms_name, + requires_grad=False) + + def infer_convert_layer_weight(self, src_hf_dir, layer_id, hf_weight_map): + """infer convert layer weight""" + self.infer_process_attention_weight(src_hf_dir, layer_id, hf_weight_map) + self.infer_process_dense_ffn_weight(src_hf_dir, layer_id, hf_weight_map) + self.infer_process_norm_weight(src_hf_dir, layer_id, hf_weight_map) + + def load_safetensors_shard(self, src_hf_dir): + """load Qwen safetensors and shard the weights""" + rank_id = get_rank() + param_json_path = "" + for file in os.listdir(src_hf_dir): + if file.endswith('index.json'): + param_json_path = os.path.join(src_hf_dir, file) + break + + hf_weight_map = {} + if os.path.exists(param_json_path): + with open(param_json_path, "r") as fp: + hf_weight_map = json.load(fp)['weight_map'] + else: + # only one safetensor, create a hf_weight_map + safetensor_file = "model.safetensors" + with safe_open(f"{src_hf_dir}/{safetensor_file}", framework="np") as sf_file: + all_keys = sf_file.keys() + for key in
all_keys: + hf_weight_map[str(key).strip()] = safetensor_file + + self.infer_convert_outer_weight(src_hf_dir, hf_weight_map) + num_layers = self.config.model.model_config.num_layers + enable_tqdm = rank_id == 0 + for layer_id in tqdm(range(num_layers), desc="Weight loading", disable=not enable_tqdm): + self.infer_convert_layer_weight(src_hf_dir, layer_id, hf_weight_map) + + ms.load_param_into_net(self.network, self.parameter_dict) + del self.parameter_dict + gc.collect() diff --git a/vllm_mindspore/model_executor/models/mf_models/weight_processor.py b/vllm_mindspore/model_executor/models/mf_models/weight_processor.py new file mode 100644 index 0000000000000000000000000000000000000000..9b0aab3a177323bb584f268061a06f9213070494 --- /dev/null +++ b/vllm_mindspore/model_executor/models/mf_models/weight_processor.py @@ -0,0 +1,102 @@ +# Copyright 2025 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +""" +Transform HuggingFace safetensors. +""" + +import os +from safetensors import safe_open +from mindspore.communication.management import get_rank, get_group_size + + +class BaseWeightProcessor: + r""" + Provides model weight loading and sharding. + Args: + config (MF Config): The config of the inference model. + network (InferenceModelForCausalLM): The network of the inference model.
+ + """ + + def __init__(self, config, network, is_quant): + self.config = config + self.network = network + self.is_quant = is_quant + self.tp_group_size = get_group_size() + self.rank_id = get_rank() + self.parameter_dict = {} + self.file_handles = {} + + def get_file_handles(self, filename): + if filename not in self.file_handles: + fp = safe_open(filename, framework="np") + self.file_handles[filename] = fp + return self.file_handles[filename] + + def release_file_handles(self): + del self.file_handles + + def get_safetensor_from_file(self, hf_param_name, src_hf_dir, hf_weight_map, is_split_param=False, split_axis=0): + safetensor_file = hf_weight_map[hf_param_name] + filename = os.path.join(src_hf_dir, safetensor_file) + sf_file = self.get_file_handles(filename) + qint4 = False + if sf_file.metadata() is not None and hf_param_name in sf_file.metadata().keys(): + qint4 = True + if not is_split_param: + np_data = sf_file.get_tensor(hf_param_name) + return np_data, qint4 + + np_data = sf_file.get_slice(hf_param_name) + shape = np_data.get_shape() + if split_axis == 0: + split_size = shape[0] // self.tp_group_size + start = self.rank_id * split_size + stop = (self.rank_id + 1) * split_size + split_data = np_data[start:stop] + elif split_axis == 1: + split_size = shape[1] // self.tp_group_size + start = self.rank_id * split_size + stop = (self.rank_id + 1) * split_size + split_data = np_data[:, start:stop] + elif split_axis == 2: + split_size = shape[2] // self.tp_group_size + start = self.rank_id * split_size + stop = (self.rank_id + 1) * split_size + split_data = np_data[:, :, start:stop] + else: + raise ValueError("split_axis:{} is not supported.".format(split_axis)) + return split_data, qint4 + + def split_weight_by_rank(self, weight, split_axis=0): + shape = weight.shape + if split_axis == 0: + split_size = shape[0] // self.tp_group_size + start = self.rank_id * split_size + stop = (self.rank_id + 1) * split_size + split_data = weight[start:stop] + elif split_axis == 1: + split_size = shape[1] // self.tp_group_size + start = self.rank_id * split_size + stop = (self.rank_id + 1) * split_size + split_data = weight[:, start:stop] + else: + raise ValueError("split_axis:{} is not supported.".format(split_axis)) + return split_data + + def load_safetensors_shard(self, src_hf_dir): + """ load safetensors and shards """ + raise NotImplementedError("load_safetensors_shard method is not implemented.") diff --git a/vllm_mindspore/model_executor/models/model_base.py b/vllm_mindspore/model_executor/models/model_base.py index 86c12e252240a64689755ef8c9df90619fdf9046..b97d71526a408102248eb8e79321d86fa7a19261 100644 --- a/vllm_mindspore/model_executor/models/model_base.py +++ b/vllm_mindspore/model_executor/models/model_base.py @@ -1,6 +1,7 @@ #!/usr/bin/env python3 # encoding: utf-8 # Copyright 2025 Huawei Technologies Co., Ltd +# Copyright 2024 The vLLM team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -15,20 +16,53 @@ # limitations under the License. 
# ============================================================================ +import os from abc import abstractmethod from typing import Iterable, List, Optional, Set, Tuple, Union, Dict from vllm.attention import AttentionMetadata -from vllm.config import VllmConfig +from vllm.config import VllmConfig, get_current_vllm_config from vllm.model_executor.layers.sampler import SamplerOutput from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors +from vllm.attention.backends.abstract import AttentionType +from vllm.forward_context import get_forward_context + +import torch from mindspore import Tensor, nn, mutable from mindspore import dtype as mstype from vllm_mindspore.utils import STR_DTYPE_TO_MS_DTYPE +class Fake_Attention: + def __init__(self): + vllm_config = get_current_vllm_config() + block_size = vllm_config.cache_config.block_size + num_kv_heads = vllm_config.model_config.get_num_kv_heads( + vllm_config.parallel_config + ) + head_size = vllm_config.model_config.get_head_size() + num_block = 0 + self.kv_shape = [num_block, block_size, num_kv_heads, head_size] + self.kv_cache = [ + ( + torch.zeros(self.kv_shape, dtype=torch.bfloat16, device="Ascend"), + torch.zeros(self.kv_shape, dtype=torch.bfloat16, device="Ascend"), + ) + for _ in range(vllm_config.parallel_config.pipeline_parallel_size) + ] + self.attn_type = AttentionType.DECODER + + +class Fake_MLA(Fake_Attention): + def __init__(self): + super().__init__() + vllm_config = get_current_vllm_config() + self.kv_cache = [ + (torch.zeros(self.kv_shape, dtype=torch.bfloat16, device="Ascend"),) + for _ in range(vllm_config.parallel_config.pipeline_parallel_size) + ] class MsModelBase(): def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None: @@ -41,9 +75,31 @@ class MsModelBase(): self.lora_config = lora_config self.cache_config = vllm_config.cache_config self.parallel_config = vllm_config.parallel_config + self.load_config = vllm_config.load_config self.modules_dict = None + self.enable_chunked_prefill = vllm_config.scheduler_config.enable_chunked_prefill + self.enable_prefix_caching = vllm_config.cache_config.enable_prefix_caching + self.is_multi_step = vllm_config.scheduler_config.is_multi_step + self.is_multi_step_chunked_prefill = self.is_multi_step and self.enable_chunked_prefill + + def get_model_path(self): + model_name_or_path = self.model_config.model + if os.path.isdir(model_name_or_path): + return model_name_or_path + else: + from vllm.model_executor.model_loader.weight_utils import download_weights_from_hf + allow_patterns = ["*.safetensors"] + revision = self.model_config.revision + return download_weights_from_hf( + model_name_or_path, + self.load_config.download_dir, + allow_patterns, + revision, + ignore_patterns=self.load_config.ignore_patterns, + ) + def set_modules(self, model_dicts: Dict[str, nn.Cell]): self.modules_dict = model_dicts @@ -105,6 +161,8 @@ class MsModelBase(): attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors] = None, inputs_embeds: Optional[Tensor] = None, + previous_hidden_states: Optional[Tensor] = None, + spec_step_idx: int = 0, ) -> Union[Tensor, IntermediateTensors]: return self.forward( input_ids, @@ -113,6 +171,8 @@ class MsModelBase(): attn_metadata, intermediate_tensors, inputs_embeds, + previous_hidden_states=previous_hidden_states, + spec_step_idx=spec_step_idx ) def forward( @@ -123,10 +183,11 @@ class MsModelBase(): attn_metadata: AttentionMetadata, intermediate_tensors: 
Optional[IntermediateTensors] = None, inputs_embeds: Optional[Tensor] = None, + **kwargs ) -> Union[Tensor, IntermediateTensors]: raise NotImplementedError - def set_model_inputs(self): + def set_model_inputs(self, is_prefill): dyn_input_ids = Tensor(shape=[None, None], dtype=mstype.int64) dyn_position_ids = Tensor(shape=[None], dtype=mstype.int64) @@ -143,13 +204,11 @@ class MsModelBase(): dyn_key_cache = mutable(Tensor(shape=kv_cache_shape, dtype=kv_cache_dtype)) dyn_value_cache = mutable(Tensor(shape=kv_cache_shape, dtype=kv_cache_dtype)) - dyn_kv_cache = mutable((dyn_key_cache, dyn_value_cache)) - dyn_kv_caches = mutable([dyn_kv_cache for _ in range(num_layers)]) + dyn_key_caches = mutable([dyn_key_cache for _ in range(num_layers)]) + dyn_value_caches = mutable([dyn_value_cache for _ in range(num_layers)]) - dyn_num_prefill_tokens = mutable(1) - dyn_num_decode_tokens = mutable(0) - dyn_context_lens = Tensor(shape=[None, ], dtype=mstype.int32) - dyn_batch_valid_length = mutable([0, 0, 0], dynamic_len=True) + dyn_batch_valid_length = Tensor(shape=[None, ], dtype=mstype.int32) + dyn_q_seq_lens = Tensor(shape=[None, ], dtype=mstype.int32) dyn_slot_mapping = Tensor(shape=[None, ], dtype=mstype.int32) dyn_block_tables = Tensor(shape=[None, None], dtype=mstype.int32) dyn_intermediate_tensors = None @@ -158,17 +217,28 @@ class MsModelBase(): self.model.set_inputs( dyn_input_ids, dyn_position_ids, - dyn_kv_caches, - dyn_num_prefill_tokens, - dyn_num_decode_tokens, - dyn_context_lens, - dyn_batch_valid_length, + dyn_key_caches, + dyn_value_caches, + is_prefill, dyn_slot_mapping, + dyn_batch_valid_length, + dyn_q_seq_lens, dyn_block_tables, dyn_intermediate_tensors, dyn_inputs_embeds ) + def get_kvcache(self): + key_cache = [] + value_cache = [] + forward_context = get_forward_context() + for i in range(self.config.num_hidden_layers): + k_cache = self.kv_caches[i].kv_cache[forward_context.virtual_engine][0] + v_cache = self.kv_caches[i].kv_cache[forward_context.virtual_engine][1] + key_cache.append(k_cache) + value_cache.append(v_cache) + return mutable(key_cache), mutable(value_cache) + @abstractmethod def compute_logits( self, diff --git a/vllm_mindspore/model_executor/models/qwen2.py b/vllm_mindspore/model_executor/models/qwen2.py index 7a46c83a0500f16708978b163590575dec6e9350..32d9da8d91b1a3cf2e8a7f6f51d51e152793bb09 100644 --- a/vllm_mindspore/model_executor/models/qwen2.py +++ b/vllm_mindspore/model_executor/models/qwen2.py @@ -1,10 +1,31 @@ +#!/usr/bin/env python3 +# encoding: utf-8 +# Copyright 2025 Huawei Technologies Co., Ltd +# Copyright 2024 The vLLM team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================ +from vllm.config import get_current_vllm_config from typing import TYPE_CHECKING, Dict, List, Optional, Set, Tuple, Union, Iterable if TYPE_CHECKING: from transformers import Qwen2Config else: Qwen2Config = None -from mindspore import Parameter, Tensor, mint, nn, jit, mutable + +import numpy as np + +from mindspore import Parameter, Tensor, mint, nn, jit, ops from mindspore.common import dtype as mstype @@ -16,8 +37,6 @@ from vllm_mindspore.model_executor.layers.linear import ( MergedColumnParallelLinear, QKVParallelLinear, RowParallelLinear) from vllm_mindspore.model_executor.layers.logits_processor import \ LogitsProcessor -from vllm.model_executor.layers.quantization import \ - QuantizationConfig from vllm_mindspore.model_executor.layers.rotary_embedding import get_rope from vllm_mindspore.model_executor.layers.sampler import (SamplerOutput, get_sampler) @@ -29,10 +48,12 @@ from vllm_mindspore.model_executor.models.utils import ( PPMissingLayer, make_empty_intermediate_tensors_factory, make_layers, maybe_prefix) from vllm_mindspore.model_executor.sampling_metadata import SamplingMetadata -from vllm_mindspore.model_executor.models.model_base import MsModelBase +from vllm_mindspore.model_executor.models.model_base import MsModelBase, Fake_Attention from vllm.config import CacheConfig, VllmConfig +from vllm.model_executor.layers.quantization import \ + QuantizationConfig from vllm.sequence import IntermediateTensors from vllm.attention.backends.abstract import AttentionType from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size @@ -153,26 +174,26 @@ class Qwen2Attention(nn.Cell): attn_type=attn_type ) self.attn_mask = mint.triu(mint.ones(size=(128, 128), dtype=mstype.bfloat16), 1) + self.hard_mask = Tensor([0], dtype=mstype.bfloat16).reshape(1, 1) @jit def construct( self, positions: Tensor, hidden_states: Tensor, - kv_cache: Tuple[Tensor, Tensor], - # attn_metadata: AttentionMetadata, - num_prefill_tokens: int, - num_decode_tokens: int, + key_cache: Tensor, + value_cache: Tensor, + is_prefill: bool, slot_mapping: Tensor, batch_valid_length: Tuple[int], - context_lens: Tensor, + q_seq_lens: Tensor, block_tables: Tensor, ) -> Tensor: qkv, _ = self.qkv_proj(hidden_states) q, k, v = mint.split(qkv, (self.q_size, self.kv_size, self.kv_size), -1) - q, k = self.rotary_emb(positions, q, k, context_lens, num_prefill_tokens) - attn_output = self.attn(q, k, v, kv_cache, num_prefill_tokens, num_decode_tokens, - slot_mapping, batch_valid_length, context_lens, block_tables, self.attn_mask) + q, k = self.rotary_emb(positions, q, k, q_seq_lens, is_prefill) + attn_output = self.attn(q, k, v, key_cache, value_cache, is_prefill, slot_mapping, batch_valid_length, + q_seq_lens, block_tables, self.attn_mask, self.hard_mask) output, _ = self.o_proj(attn_output) return output @@ -232,13 +253,12 @@ class Qwen2DecoderLayer(nn.Cell): self, positions: Tensor, hidden_states: Tensor, - kv_cache: Tuple[Tensor, Tensor], - # attn_metadata: AttentionMetadata, - num_prefill_tokens: int, - num_decode_tokens: int, + key_cache: Tensor, + value_cache: Tensor, + is_prefill: bool, slot_mapping: Tensor, batch_valid_length: Tuple[int], - context_lens: Tensor, + q_seq_lens: Tensor, block_tables: Tensor, residual: Optional[Tensor], ) -> Tuple[Tensor, Tensor]: @@ -251,12 +271,12 @@ class Qwen2DecoderLayer(nn.Cell): hidden_states = self.self_attn( positions, hidden_states, - kv_cache, - num_prefill_tokens, - num_decode_tokens, + 
key_cache, + value_cache, + is_prefill, slot_mapping, batch_valid_length, - context_lens, + q_seq_lens, block_tables ) @@ -318,13 +338,12 @@ class Qwen2Model(nn.Cell): self, input_ids: Optional[Tensor], positions: Tensor, - kv_caches: List[Tuple[Tensor, Tensor]], - # attn_metadata: AttentionMetadata, - num_prefill_tokens: int, - num_decode_tokens: int, + key_caches: List[Tensor], + value_caches: List[Tensor], + is_prefill: bool, slot_mapping: Tensor, - batch_valid_length: Tuple[int], - context_lens: Tensor, + batch_valid_length: Tensor, + q_seq_lens: Tensor, block_tables: Tensor, intermediate_tensors: Optional[IntermediateTensors] = None, inputs_embeds: Optional[Tensor] = None, @@ -344,12 +363,12 @@ class Qwen2Model(nn.Cell): hidden_states, residual = layer( positions, hidden_states, - kv_caches[i - self.start_layer], - num_prefill_tokens, - num_decode_tokens, + key_caches[i - self.start_layer], + value_caches[i - self.start_layer], + is_prefill, slot_mapping, batch_valid_length, - context_lens, + q_seq_lens, block_tables, residual ) @@ -380,7 +399,17 @@ class Qwen2Model(nn.Cell): # Models trained using ColossalAI may include these tensors in # the checkpoint. Skip them. continue - + if (self.quant_config is not None and + (scale_name := self.quant_config.get_cache_scale(name))): + # Loading kv cache quantization scales + param = params_dict[scale_name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + loaded_weight = (loaded_weight if loaded_weight.dim() == 0 else + loaded_weight[0]) + weight_loader(param, loaded_weight) + loaded_params.add(scale_name) + continue for param_name, weight_name, shard_id in stacked_params_mapping: if weight_name not in name: continue @@ -456,7 +485,15 @@ class Qwen2ForCausalLM(MsModelBase): self.model.make_empty_intermediate_tensors) self.set_modules({"model": self.model, "lm_head": self.lm_head}) - self.set_model_inputs() + self.prefill = True + self.set_model_inputs(self.prefill) + self.kv_caches = [Fake_Attention() for i in range(config.num_hidden_layers)] + compilation_config = vllm_config.compilation_config + + if prefix in compilation_config.static_forward_context: + raise ValueError(f"Duplicate layer name: {prefix}") + for i in range(config.num_hidden_layers): + compilation_config.static_forward_context[str(i)] = self.kv_caches[i] def get_input_embeddings(self, input_ids: Tensor) -> Tensor: return self.model.get_input_embeddings(input_ids) @@ -468,22 +505,54 @@ class Qwen2ForCausalLM(MsModelBase): kv_caches: List[Tuple[Tensor, Tensor]], attn_metadata: AttentionMetadata, intermediate_tensors: IntermediateTensors = None, - inputs_embeds: Tensor = None + inputs_embeds: Tensor = None, + **kwargs ) -> Union[Tensor, IntermediateTensors]: - if attn_metadata.num_prefill_tokens > 0: - input_ids = input_ids.expand_dims(0) - if attn_metadata.num_decode_tokens > 0: - input_ids = input_ids.expand_dims(1) + key_cache, value_cache = self.get_kvcache() + seq_lens = attn_metadata.seq_lens + max_query_len = attn_metadata.max_query_len + # When Mutli-Step is enabled with Chunked-Prefill, prefills and + # decodes are scheduled together. In the first step, all the + # prefills turn into decodes and max_query_len will be 1. 
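+ # In that case use a query length of 1 for every sequence instead of attn_metadata.query_lens.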
+ if self.is_multi_step_chunked_prefill and max_query_len == 1: + query_lens = [1] * len(seq_lens) + else: + query_lens = attn_metadata.query_lens + + seq_lens_np = np.array(seq_lens, dtype=np.int32) + query_lens_np = np.array(query_lens, dtype=np.int32) + kv_cache_lens = seq_lens_np - query_lens_np + is_prefill = attn_metadata.num_decode_tokens == 0 and kv_cache_lens.max() == 0 + if is_prefill: + input_ids = ops.expand_dims(input_ids, 0) + if not self.prefill: + self.prefill = True + self.set_model_inputs(self.prefill) + else: + input_ids = ops.expand_dims(input_ids, 1) + if self.prefill: + self.prefill = False + self.set_model_inputs(self.prefill) + + slot_mapping = attn_metadata.slot_mapping + batch_valid_length = Tensor.from_numpy(np.array(attn_metadata.seq_lens, dtype=np.int32)) + q_seq_lens = Tensor.from_numpy(np.array(attn_metadata.query_lens, dtype=np.int32)) + block_tables = attn_metadata.block_tables model_output = self.model(input_ids, positions, - kv_caches, - **dict(attn_metadata), - intermediate_tensors=intermediate_tensors, - inputs_embeds=inputs_embeds) - if attn_metadata.num_prefill_tokens > 0: - model_output = model_output.squeeze(0) - if attn_metadata.num_decode_tokens > 0: - model_output = model_output.squeeze(1) + key_cache, + value_cache, + is_prefill, + slot_mapping, + batch_valid_length, + q_seq_lens, + block_tables, + intermediate_tensors, + inputs_embeds) + if is_prefill: + model_output = ops.squeeze(model_output, 0) + else: + model_output = ops.squeeze(model_output, 1) return model_output def load_weights(self, weights: Iterable[Tuple[str, Tensor]]) -> Set[str]: diff --git a/vllm_mindspore/model_executor/models/registry.py b/vllm_mindspore/model_executor/models/registry.py index ef38ee0b9946723215a89b633f745dddbdb3d7ee..1a9dbe9fe8811eeefa6be9309271cfeaaaa4be4c 100644 --- a/vllm_mindspore/model_executor/models/registry.py +++ b/vllm_mindspore/model_executor/models/registry.py @@ -1,6 +1,7 @@ #!/usr/bin/env python3 # encoding: utf-8 # Copyright 2025 Huawei Technologies Co., Ltd +# Copyright 2024 The vLLM team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -36,6 +37,7 @@ _MINDSPORE_MODELS = { _MINDFORMERS_MODELS = { "Qwen2ForCausalLM": ("qwen2", "Qwen2ForCausalLM"), "DeepseekV3ForCausalLM": ("deepseek_v3", "DeepseekV3ForCausalLM"), + "DeepSeekMTPModel": ("deepseek_mtp", "DeepseekV3MTPForCausalLM"), } MindSporeModelRegistry = _ModelRegistry( @@ -59,32 +61,9 @@ MindSporeModelRegistry = _ModelRegistry( _T = TypeVar("_T") -def _run_in_subprocess(fn: Callable[[], _T]) -> _T: - with tempfile.TemporaryDirectory() as tempdir: - output_filepath = os.path.join(tempdir, "registry_output.tmp") - - # `cloudpickle` allows pickling lambda functions directly - input_bytes = cloudpickle.dumps((fn, output_filepath)) - - # cannot use `sys.executable __file__` here because the script - # contains relative imports - returned = subprocess.run( - [sys.executable, "-m", "vllm_mindspore.model_executor.models.registry"], - input=input_bytes, - capture_output=True, - ) - - # check if the subprocess is successful - try: - returned.check_returncode() - except Exception as e: - # wrap raised exception to provide more information - raise RuntimeError( - f"Error raised in subprocess:\n" f"{returned.stderr.decode()}" - ) from e - - with open(output_filepath, "rb") as f: - return pickle.load(f) +_SUBPROCESS_COMMAND = [ + sys.executable, "-m", "vllm.model_executor.models.registry" +] def _run() -> None: diff --git a/vllm_mindspore/model_executor/models/utils.py b/vllm_mindspore/model_executor/models/utils.py index c84b6dc315b190b8d2ebc1d01b71c5a9b3f7684e..0a115c2cc712a3156aec84e7a4a3efe54a4358bd 100644 --- a/vllm_mindspore/model_executor/models/utils.py +++ b/vllm_mindspore/model_executor/models/utils.py @@ -1,6 +1,7 @@ #!/usr/bin/env python3 # encoding: utf-8 # Copyright 2025 Huawei Technologies Co., Ltd +# Copyright 2024 The vLLM team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/vllm_mindspore/model_executor/sampling_metadata.py b/vllm_mindspore/model_executor/sampling_metadata.py index e6b60f579ba00ac507a4f83d1ad6cbf89327f035..a016dd8e14450b2dea8ca2a138e41578d4a40d08 100644 --- a/vllm_mindspore/model_executor/sampling_metadata.py +++ b/vllm_mindspore/model_executor/sampling_metadata.py @@ -1,6 +1,7 @@ #!/usr/bin/env python3 # encoding: utf-8 # Copyright 2025 Huawei Technologies Co., Ltd +# Copyright 2024 The vLLM team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -34,8 +35,6 @@ _SAMPLING_EPS = 1e-5 from mindspore import Tensor import mindspore as ms -# TODO(tronzhang): use vllm's SequenceGroupToSample. (now for tensor create pin/device and tensor.to) - @dataclass class SequenceGroupToSample: @@ -601,7 +600,6 @@ class SamplingTensors: # Because the memory is pinned, we can do non-blocking # transfer to device. - # TODO(tronzhang): mindspore tensor donot support tensor.to(device=xxx, non_blocking=xxx), but tensor.move_to(to, blocking=xxx). return cls( temperatures=temperatures_t, top_ps=top_ps_t, diff --git a/vllm_mindspore/model_executor/utils.py b/vllm_mindspore/model_executor/utils.py index c6de292aafdd2e56052b24dfe0ba2eb940d06a65..eb421de0ba02911be0da18afb4b885efc566fc74 100644 --- a/vllm_mindspore/model_executor/utils.py +++ b/vllm_mindspore/model_executor/utils.py @@ -1,6 +1,7 @@ #!/usr/bin/env python3 # encoding: utf-8 # Copyright 2025 Huawei Technologies Co., Ltd +# Copyright 2024 The vLLM team. 
# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -18,8 +19,6 @@ from typing import Any, Dict, Optional from mindspore import Tensor -# TODO(tronzhang): Use vllm's latter... - def set_weight_attrs( weight: Tensor, diff --git a/vllm_mindspore/ops/CMakeLists.txt b/vllm_mindspore/ops/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..4c94b2c085b0be5ed4247e4c5829325531648ae9 --- /dev/null +++ b/vllm_mindspore/ops/CMakeLists.txt @@ -0,0 +1,40 @@ +cmake_minimum_required(VERSION 3.16) +project(Ops) + +set(MS_EXTENSION_NAME "" CACHE STRING "Extension Name") +set(BUILD_EXTENSION_DIR "" CACHE STRING "Extension directory") +if (MS_EXTENSION_NAME STREQUAL "") + message(FATAL_ERROR "MS_EXTENSION_NAME must be set. Use -DMS_EXTENSION_NAME=") +endif() +if (BUILD_EXTENSION_DIR STREQUAL "") + message(FATAL_ERROR "BUILD_EXTENSION_DIR must be set. Use -DBUILD_EXTENSION_DIR=") +endif() + +# Build ascendc kernels +add_subdirectory(ascendc) + +# Collect source files +file(GLOB SRC_FILES ${CMAKE_CURRENT_SOURCE_DIR}/module/*.cpp) + +# Generate a temporary python script file to build custom ops with MindSpore's CustomOpBuilder +set(PYTHON_SCRIPT_PATH "${CMAKE_BINARY_DIR}/build_custom_with_ms.py") +file(WRITE ${PYTHON_SCRIPT_PATH} " +import mindspore as ms +src_files = '${SRC_FILES}'.split(';') +ms.ops.CustomOpBuilder( + name='${MS_EXTENSION_NAME}', + sources=src_files, + backend='Ascend', + cflags='-I${CMAKE_CURRENT_SOURCE_DIR}', + ldflags='-L${ASCENDC_TARGET_DIR} -l${ASCENDC_TARGET_NAME}', + build_dir='${BUILD_EXTENSION_DIR}' +).build() +") + +find_package(Python3 COMPONENTS Interpreter REQUIRED) +add_custom_target( + BuildCustomOp ALL + COMMAND cd ${CMAKE_BINARY_DIR} && ${Python3_EXECUTABLE} ${PYTHON_SCRIPT_PATH} + DEPENDS ${ASCENDC_TARGET_NAME} + COMMENT "Building custom operator with MindSpore" +) diff --git a/vllm_mindspore/ops/ascendc/CMakeLists.txt b/vllm_mindspore/ops/ascendc/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..d6165987c9c0b00e6b31d7b50a053afcc796d9d8 --- /dev/null +++ b/vllm_mindspore/ops/ascendc/CMakeLists.txt @@ -0,0 +1,31 @@ +cmake_minimum_required(VERSION 3.16) +project(AscendC_Kernels) + +# Parameters passed from command line or default values +set(RUN_MODE "npu") +set(SOC_VERSION "Ascend910B1" CACHE STRING "system on chip type") +set(CMAKE_BUILD_TYPE "Debug" CACHE STRING "Build type Release/Debug") + +# Set ASCEND_CANN_PACKAGE_PATH based on the ASCEND_HOME_PATH environment variable +set(ASCEND_CANN_PACKAGE_PATH "$ENV{ASCEND_HOME_PATH}" CACHE STRING "ASCEND CANN package installation directory") + +# Verify that the required paths exist +if(EXISTS ${ASCEND_CANN_PACKAGE_PATH}/compiler/tikcpp/ascendc_kernel_cmake) + set(ASCENDC_CMAKE_DIR ${ASCEND_CANN_PACKAGE_PATH}/compiler/tikcpp/ascendc_kernel_cmake) +elseif(EXISTS ${ASCEND_CANN_PACKAGE_PATH}/tools/tikcpp/ascendc_kernel_cmake) + set(ASCENDC_CMAKE_DIR ${ASCEND_CANN_PACKAGE_PATH}/tools/tikcpp/ascendc_kernel_cmake) +else() + message(FATAL_ERROR "ascendc_kernel_cmake does not exist. 
Check whether the CANN package is installed in ${ASCEND_CANN_PACKAGE_PATH}") +endif() + +# Include Ascend CANN CMake file +include(${ASCENDC_CMAKE_DIR}/ascendc.cmake) + +# Collect source files +file(GLOB ASCENDC_KERNEL_FILES ${CMAKE_CURRENT_SOURCE_DIR}/*.c) + +# Create an object library +ascendc_library(ascendc_kernels_npu STATIC ${ASCENDC_KERNEL_FILES}) + +set(ASCENDC_TARGET_NAME ascendc_kernels_npu PARENT_SCOPE) +set(ASCENDC_TARGET_DIR "${CMAKE_BINARY_DIR}/lib" PARENT_SCOPE) diff --git a/vllm_mindspore/ops/ascendc/adv_step_flash.c b/vllm_mindspore/ops/ascendc/adv_step_flash.c new file mode 100644 index 0000000000000000000000000000000000000000..e89976311acfa9653923fb3f1d46f02153a9d351 --- /dev/null +++ b/vllm_mindspore/ops/ascendc/adv_step_flash.c @@ -0,0 +1,253 @@ +#include "kernel_operator.h" + +using namespace AscendC; + +template +struct integral_constant { + static constexpr Tp value = v; +}; +using true_type = integral_constant; +using false_type = integral_constant; +template +struct is_same : public false_type {}; +template +struct is_same : public true_type {}; + +template +__aicore__ inline void DataCopyCustom(const U &dstTensor, const R &srcTensor, const uint32_t count) { + DataCopyParams copyParams; + copyParams.blockLen = count * sizeof(T); + copyParams.blockCount = 1; + if constexpr (is_same>::value) { + DataCopyPadParams padParams; + DataCopyPad(dstTensor, srcTensor, copyParams, padParams); + } else { + DataCopyPad(dstTensor, srcTensor, copyParams); + } +} + +class KernelAdvStepFlash { + public: + __aicore__ inline KernelAdvStepFlash(TPipe *pipe) { Ppipe = pipe; } + + __aicore__ inline void Init(GM_ADDR sampledTokenIds, GM_ADDR blockTables, GM_ADDR seqLensInput, GM_ADDR inputTokens, + GM_ADDR inputPositions, GM_ADDR seqLensOut, GM_ADDR slotMapping, int32_t num_seqs, + int32_t block_size, int32_t block_tables_stride) { + ASSERT(GetBlockNum() != 0 && "Block dim can not be zero!"); + this->blockSize = block_size; + this->blockTablesStride = block_tables_stride; + this->tensorLength = num_seqs; + + this->blockSizeFp = static_cast(this->blockSize); + + // get start index for current core, core parallel + sampledTokenIdsGm.SetGlobalBuffer((__gm__ int32_t *)sampledTokenIds, tensorLength); + seqLensInputGm.SetGlobalBuffer((__gm__ int32_t *)seqLensInput, tensorLength); + blockTablesGm.SetGlobalBuffer((__gm__ int32_t *)blockTables); // inf size + + inputTokensGm.SetGlobalBuffer((__gm__ int32_t *)inputTokens, tensorLength); + inputPositionsGm.SetGlobalBuffer((__gm__ int32_t *)inputPositions, tensorLength); + seqLensOutGm.SetGlobalBuffer((__gm__ int32_t *)seqLensOut, tensorLength); + slotMappingGm.SetGlobalBuffer((__gm__ int32_t *)slotMapping, tensorLength); + + // pipe alloc memory to queue, the unit is Bytes + Ppipe->InitBuffer(sampledIdsQue, 1, tensorLength * sizeof(int32_t)); + Ppipe->InitBuffer(seqLenInQue, 1, tensorLength * sizeof(int32_t)); + + Ppipe->InitBuffer(inputTokensQue, 1, tensorLength * sizeof(int32_t)); + Ppipe->InitBuffer(seqLensOutQue, 1, tensorLength * sizeof(int32_t)); + Ppipe->InitBuffer(inputPositionsQue, 1, tensorLength * sizeof(int32_t)); + + Ppipe->InitBuffer(tableOffsetBuf, tensorLength * sizeof(int32_t)); + + Ppipe->InitBuffer(tmpDivBuf01, tensorLength * sizeof(int32_t)); + Ppipe->InitBuffer(tmpDivBuf02, tensorLength * sizeof(int32_t)); + + Ppipe->InitBuffer(outTableBuf, tensorLength * sizeof(int32_t)); + Ppipe->InitBuffer(blockTableBuf, 32); + } + + __aicore__ inline void Process() { + CopyIn(); + Compute(); + CopyOut(); + } + + private: + __aicore__ inline 
void CopyIn() { + LocalTensor sampledIdsLocal = sampledIdsQue.AllocTensor(); + LocalTensor seqLenInLocal = seqLenInQue.AllocTensor(); + + DataCopyCustom(sampledIdsLocal, sampledTokenIdsGm, tensorLength); + DataCopyCustom(seqLenInLocal, seqLensInputGm, tensorLength); + + sampledIdsQue.EnQue(sampledIdsLocal); + seqLenInQue.EnQue(seqLenInLocal); + } + + __aicore__ inline void Compute() { + LocalTensor tableOffset = tableOffsetBuf.Get(); + + LocalTensor sampledIdsLocal = sampledIdsQue.DeQue(); + LocalTensor seqLenInLocal = seqLenInQue.DeQue(); + + LocalTensor inputTokensLocal = inputTokensQue.AllocTensor(); + LocalTensor seqLensOutLocal = seqLensOutQue.AllocTensor(); + LocalTensor inputPositionsLocal = inputPositionsQue.AllocTensor(); + + Adds(inputTokensLocal, sampledIdsLocal, (int32_t)0, tensorLength); // inputTokensLocal <-- sampledIdsLocal + Adds(inputPositionsLocal, seqLenInLocal, (int32_t)0, tensorLength); // inputPositionsLocal <-- seqLenInLocal + Adds(seqLensOutLocal, seqLenInLocal, (int32_t)1, tensorLength); // seqLensOutLocal <-- seqLenInLocal + 1 + PipeBarrier(); + + // TODO add Function + ComputeTableOffset(tableOffset, inputPositionsLocal); + // GetTableValueByOffset(tableOffset, inputPositionsLocal); + + sampledIdsQue.FreeTensor(sampledIdsLocal); + seqLenInQue.FreeTensor(seqLenInLocal); + + inputTokensQue.EnQue(inputTokensLocal); + seqLensOutQue.EnQue(seqLensOutLocal); + inputPositionsQue.EnQue(inputPositionsLocal); + } + + __aicore__ inline void CopyOut() { + LocalTensor inputTokensLocal = inputTokensQue.DeQue(); + LocalTensor seqLensOutLocal = seqLensOutQue.DeQue(); + LocalTensor inputPositionsLocal = inputPositionsQue.DeQue(); + + DataCopyCustom(inputTokensGm, inputTokensLocal, tensorLength); + DataCopyCustom(inputPositionsGm, inputPositionsLocal, tensorLength); + DataCopyCustom(seqLensOutGm, seqLensOutLocal, tensorLength); + + inputTokensQue.FreeTensor(inputTokensLocal); + seqLensOutQue.FreeTensor(seqLensOutLocal); + inputPositionsQue.FreeTensor(inputPositionsLocal); + } + + __aicore__ inline void ComputeTableOffset(LocalTensor tableOffset, + LocalTensor inputPositionsLocal) { + LocalTensor tmpBuf01 = tmpDivBuf01.Get(); + LocalTensor tmpBuf02 = tmpDivBuf02.Get(); + + LocalTensor tmpBuf01Int = tmpBuf01.ReinterpretCast(); + LocalTensor tmpBuf02Int = tmpBuf02.ReinterpretCast(); + + LocalTensor outTableValue = outTableBuf.Get(); + LocalTensor blockTableLocal = blockTableBuf.Get(); + + // floor div + Cast(tmpBuf01, inputPositionsLocal, RoundMode::CAST_RINT, tensorLength); + Duplicate(tmpBuf02, blockSizeFp, tensorLength); + PipeBarrier(); + Div(tmpBuf01, tmpBuf01, tmpBuf02, tensorLength); // <-- inputPositionsLocal / blockSize + PipeBarrier(); + Cast(tmpBuf02Int, tmpBuf01, RoundMode::CAST_TRUNC, tensorLength); + + CreateVecIndex(tableOffset, (int32_t)0, tensorLength); // tableOffset <--- 0, 1, 2, 3, .... 
tensorLength -1 + PipeBarrier(); + + Muls(tableOffset, tableOffset, this->blockTablesStride, + tensorLength); // tableOffset <--- curt_offset * block_stride + PipeBarrier(); + Add(tableOffset, tableOffset, tmpBuf02Int, + tensorLength); // tableOffset <--- curt_offset * block_stride + inputPositionsLocal / blockSize + + PIPE_V_S(); + + for (int32_t idx = 0; idx < tensorLength; idx++) { + int32_t blockTableIdx = tableOffset.GetValue(idx); + + PIPE_S_MTE2(); + + DataCopyCustom(blockTableLocal, blockTablesGm[blockTableIdx], 1); + + PIPE_MTE2_S(); + + int32_t blockTableValue = blockTableLocal.GetValue(0); + int32_t block_offset = inputPositionsLocal.GetValue(idx) % this->blockSize; + blockTableValue = blockTableValue * this->blockSize + block_offset; + outTableValue.SetValue(idx, blockTableValue); + } + PIPE_S_MTE3(); + DataCopyCustom(slotMappingGm, outTableValue, tensorLength); + } + + __aicore__ inline void PIPE_S_MTE3() { + event_t event_S_MTE3 = static_cast(GetTPipePtr()->FetchEventID(HardEvent::S_MTE3)); + SetFlag(event_S_MTE3); + WaitFlag(event_S_MTE3); + } + + __aicore__ inline void PIPE_S_MTE2() { + event_t event_S_MTE2 = static_cast(GetTPipePtr()->FetchEventID(HardEvent::S_MTE2)); + SetFlag(event_S_MTE2); + WaitFlag(event_S_MTE2); + } + + __aicore__ inline void PIPE_MTE2_S() { + event_t event_MTE2_S = static_cast(GetTPipePtr()->FetchEventID(HardEvent::MTE2_S)); + SetFlag(event_MTE2_S); + WaitFlag(event_MTE2_S); + } + + __aicore__ inline void PIPE_V_S() { + event_t event_V_S = static_cast(GetTPipePtr()->FetchEventID(HardEvent::V_S)); + SetFlag(event_V_S); + WaitFlag(event_V_S); + } + + private: + TPipe *Ppipe = nullptr; + // create queues for input, in this case depth is equal to buffer num + TQue sampledIdsQue, seqLenInQue; + // create queues for output, in this case depth is equal to buffer num + TQue inputTokensQue, seqLensOutQue, inputPositionsQue; + + TBuf tableOffsetBuf; + TBuf tmpDivBuf01; + TBuf tmpDivBuf02; + TBuf outTableBuf; + TBuf blockTableBuf; + + // inputs + GlobalTensor sampledTokenIdsGm; + GlobalTensor seqLensInputGm; + GlobalTensor blockTablesGm; + // outs + GlobalTensor inputTokensGm; + GlobalTensor inputPositionsGm; + GlobalTensor seqLensOutGm; + GlobalTensor slotMappingGm; + + int32_t blockSize; + int32_t blockTablesStride; + int64_t tensorLength; // number of calculations rows on each core + + float blockSizeFp; +}; + +extern "C" __global__ __aicore__ void adv_step_flash_impl(GM_ADDR sampledTokenIds, GM_ADDR blockTables, + GM_ADDR seqLensInput, GM_ADDR inputTokens, + GM_ADDR inputPositions, GM_ADDR seqLensOut, + GM_ADDR slotMapping, int32_t num_seqs, int32_t block_size, + int32_t block_tables_stride) { + TPipe pipe; + + KernelAdvStepFlash op(&pipe); + op.Init(sampledTokenIds, blockTables, seqLensInput, inputTokens, inputPositions, seqLensOut, slotMapping, num_seqs, + block_size, block_tables_stride); + op.Process(); +} + +#ifndef __CCE_KT_TEST__ +void AdvStepFlashKernelEntry(uint32_t blockDims, void *l2ctrl, void *aclStream, uint8_t *sampledTokenIds, + uint8_t *blockTables, uint8_t *seqLensInput, uint8_t *inputTokens, uint8_t *inputPositions, + uint8_t *seqLensOut, uint8_t *slotMapping, int32_t num_seqs, int32_t block_size, + int32_t block_tables_stride) { + adv_step_flash_impl<<>>(sampledTokenIds, blockTables, seqLensInput, inputTokens, + inputPositions, seqLensOut, slotMapping, num_seqs, block_size, + block_tables_stride); +} +#endif diff --git a/vllm_mindspore/ops/ascendc/adv_step_flash.h b/vllm_mindspore/ops/ascendc/adv_step_flash.h new file mode 100644 index 
0000000000000000000000000000000000000000..926626b0b56ee58c6d35c9ca71920141baa660b4 --- /dev/null +++ b/vllm_mindspore/ops/ascendc/adv_step_flash.h @@ -0,0 +1,9 @@ +#ifndef VLLM_MINDSPORE_OPS_ASCENDC_ADV_STEP_FLASH_H +#define VLLM_MINDSPORE_OPS_ASCENDC_ADV_STEP_FLASH_H + +extern void AdvStepFlashKernelEntry(uint32_t blockDims, void *l2ctrl, void *aclStream, uint8_t *sampledTokenIds, + uint8_t *blockTables, uint8_t *seqLensInput, uint8_t *inputTokens, + uint8_t *inputPositions, uint8_t *seqLensOut, uint8_t *slotMapping, + int32_t num_seqs, int32_t block_size, int32_t block_tables_stride); + +#endif // VLLM_MINDSPORE_OPS_ASCENDC_ADV_STEP_FLASH_H diff --git a/vllm_mindspore/ops/module/adv_step_flash.cpp b/vllm_mindspore/ops/module/adv_step_flash.cpp new file mode 100644 index 0000000000000000000000000000000000000000..803abb0a4239065f43083f153d7eea9d6d96b736 --- /dev/null +++ b/vllm_mindspore/ops/module/adv_step_flash.cpp @@ -0,0 +1,100 @@ +#include +#include +#include + +#include "ms_extension.h" + +#include "ascendc/adv_step_flash.h" +#include "module/module.h" + +using BaseTensor = mindspore::tensor::BaseTensor; +using BaseTensorPtr = mindspore::tensor::BaseTensorPtr; +using PyBoostUtils = mindspore::kernel::pyboost::PyBoostUtils; + +uint8_t *GetDataPtr(const BaseTensorPtr &t) { + return static_cast(t->device_address()->GetMutablePtr()) + t->data().itemsize() * t->storage_offset(); +} + +struct DtypeCaster { + BaseTensorPtr CheckAndCast(const BaseTensorPtr &t, const std::string &name = "") { + mindspore::Int64ImmPtr dst_type = std::make_shared(mindspore::TypeId::kNumberTypeInt32); + if (t->data_type() != mindspore::TypeId::kNumberTypeInt32) { + if (!name.empty()) { + tensor_map_[name] = t; + } + return mindspore::kernel::pyboost::cast(t, dst_type); + } + return t; + } + BaseTensorPtr RecoveryTensorDtype(const BaseTensorPtr &t, const std::string &name) { + auto iter = tensor_map_.find(name); + if (iter == tensor_map_.end()) { + return t; + } + auto ori_tensor = iter->second; + auto ori_dtype = std::make_shared(ori_tensor->data_type()); + auto ret = mindspore::kernel::pyboost::cast(t, ori_dtype); + ori_tensor->AssignValue(*ret); + return ori_tensor; + } + std::map tensor_map_; +}; + +void AdvStepFlashAscendC(int32_t num_seqs, int32_t num_queries, int32_t block_size, + BaseTensorPtr &input_tokens, // output + BaseTensorPtr sampled_token_ids, // input + BaseTensorPtr &input_positions, // output + BaseTensorPtr &seq_lens, // input&output (inplace) + BaseTensorPtr &slot_mapping, // output + BaseTensorPtr block_tables // input +) { + // the AdvStepFlashKernelEntry only support int32 inputs. 
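+ // Cast any non-int32 tensors to int32 up front and restore their original dtypes once the kernel has been dispatched.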
+ DtypeCaster caster; + sampled_token_ids = caster.CheckAndCast(sampled_token_ids); + block_tables = caster.CheckAndCast(block_tables); + input_tokens = caster.CheckAndCast(input_tokens, "input_tokens"); + input_positions = caster.CheckAndCast(input_positions, "input_positions"); + slot_mapping = caster.CheckAndCast(slot_mapping, "slot_mapping"); + seq_lens = caster.CheckAndCast(seq_lens, "seq_lens"); + + auto stream_id = PyBoostUtils::cur_stream_id(); + auto device_context = mindspore::runtime::OpRunner::GetDeviceContext("Ascend"); + PyBoostUtils::PrepareOpInputs(device_context, stream_id, input_tokens, sampled_token_ids, input_positions, seq_lens, + slot_mapping, block_tables); + // PyBoostUtils::PrepareOpOutputs(device_context, stream_id, outputs); + PyBoostUtils::DispatchRun(std::make_shared([=]() { + PyBoostUtils::MallocOpInputs(device_context, input_tokens, sampled_token_ids, input_positions, seq_lens, + slot_mapping, block_tables); + // PyBoostUtils::MallocOpOutputs(device_context, outputs); + + uint8_t *sampledTokenIdsPtr = GetDataPtr(sampled_token_ids); + uint8_t *blockTablesPtr = GetDataPtr(block_tables); + uint8_t *seqLensPtr = GetDataPtr(seq_lens); + uint8_t *inputTokensPtr = GetDataPtr(input_tokens); + uint8_t *inputPositionsPtr = GetDataPtr(input_positions); + uint8_t *slotMappingPtr = GetDataPtr(slot_mapping); + auto aclStream = device_context->device_res_manager_->GetStream(stream_id); + auto stride = block_tables->stride(); + int32_t block_tables_stride = stride.empty() ? 1 : stride[0]; + + mindspore::runtime::OpExecutor::DispatchLaunchTask([=]() { + uint32_t blockDims = 1; + void *l2ctrl = nullptr; + AdvStepFlashKernelEntry(blockDims, l2ctrl, aclStream, sampledTokenIdsPtr, blockTablesPtr, seqLensPtr, + inputTokensPtr, inputPositionsPtr, seqLensPtr, slotMappingPtr, num_seqs, block_size, + block_tables_stride); + }); + })); + + input_tokens = caster.RecoveryTensorDtype(input_tokens, "input_tokens"); + input_positions = caster.RecoveryTensorDtype(input_positions, "input_positions"); + slot_mapping = caster.RecoveryTensorDtype(slot_mapping, "slot_mapping"); + seq_lens = caster.RecoveryTensorDtype(seq_lens, "seq_lens"); +} + +MS_EXTENSION_MODULE(adv_step_flash) { + m.def("adv_step_flash", &AdvStepFlashAscendC, "adv_step_flash_ascendc", pybind11::arg("num_seqs"), + pybind11::arg("num_queries"), pybind11::arg("block_size"), pybind11::arg("input_tokens"), + pybind11::arg("sampled_token_ids"), pybind11::arg("input_positions"), pybind11::arg("seq_lens"), + pybind11::arg("slot_mapping"), pybind11::arg("block_tables")); +} diff --git a/vllm_mindspore/ops/module/module.cpp b/vllm_mindspore/ops/module/module.cpp new file mode 100644 index 0000000000000000000000000000000000000000..45ae8c067e4112a15e2d7554d85235d85c5905cb --- /dev/null +++ b/vllm_mindspore/ops/module/module.cpp @@ -0,0 +1,6 @@ +#include "module/module.h" + +PYBIND11_MODULE(MS_EXTENSION_NAME, m) { + m.doc() = "A custom module for operators"; + ModuleRegistry::Instance().RegisterAll(m); +} diff --git a/vllm_mindspore/ops/module/module.h b/vllm_mindspore/ops/module/module.h new file mode 100644 index 0000000000000000000000000000000000000000..ef660e12d335b835af8b0a13b7334ac35b2f310f --- /dev/null +++ b/vllm_mindspore/ops/module/module.h @@ -0,0 +1,54 @@ +#ifndef VLLM_MINDSPORE_OPS_MODULE_MODULE_H +#define VLLM_MINDSPORE_OPS_MODULE_MODULE_H + +#include +#include +#include +#include + +// Define the type of module registration functions +using ModuleRegisterFunction = std::function; + +// Module registry class +class 
ModuleRegistry { + public: + // Get the singleton instance + static ModuleRegistry &Instance() { + static ModuleRegistry instance; + return instance; + } + + // Register a module function + void Register(const ModuleRegisterFunction &func) { functions_.push_back(func); } + + // Call all registered module functions + void RegisterAll(pybind11::module_ &m) { + for (const auto &func : functions_) { + func(m); + } + } + + private: + ModuleRegistry() = default; + ~ModuleRegistry() = default; + + // Disable copy and assignment + ModuleRegistry(const ModuleRegistry &) = delete; + ModuleRegistry &operator=(const ModuleRegistry &) = delete; + + // Store all registered functions + std::vector functions_; +}; + +// Define a macro to register module functions +#define MS_EXTENSION_MODULE(func) \ + static void func##_register(pybind11::module_ &); \ + namespace { \ + struct func##_registrar { \ + func##_registrar() { ModuleRegistry::Instance().Register(func##_register); } \ + }; \ + static func##_registrar registrar_instance; \ + } \ + static void func##_register(pybind11::module_ &m) + +#endif // VLLM_MINDSPORE_OPS_MODULE_MODULE_H diff --git a/vllm_mindspore/platforms/ascend.py b/vllm_mindspore/platforms/ascend.py index de56d2680faac68893d0b035e88dc34408ea6507..b96403d4959fe14be3fcf3dc2f43d2b9bcc564e1 100644 --- a/vllm_mindspore/platforms/ascend.py +++ b/vllm_mindspore/platforms/ascend.py @@ -1,6 +1,7 @@ #!/usr/bin/env python3 # encoding: utf-8 # Copyright 2025 Huawei Technologies Co., Ltd +# Copyright 2024 The vLLM team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -16,9 +17,11 @@ # ============================================================================ """Ascend platform.""" -from typing import TYPE_CHECKING, Optional +import os +from typing import (TYPE_CHECKING, Optional, Union, Tuple) import torch +import mindspore as ms from vllm.platforms.interface import DeviceCapability, Platform, PlatformEnum, _Backend from vllm.logger import init_logger @@ -32,23 +35,25 @@ logger = init_logger(__name__) class AscendPlatform(Platform): - _enum = PlatformEnum.CUDA - device_name: str = "cuda" - device_type: str = "cuda" - dispatch_key: str = "CUDA" + + _enum = PlatformEnum.OOT + device_name: str = "npu" + device_type: str = "cuda" # To use cuda worker, executor... 
+ simple_compile_backend: str = "npu" + ray_device_key: str = "NPU" + device_control_env_var: str = "ASCEND_RT_VISIBLE_DEVICES" @classmethod - def get_default_attn_backend(cls, selected_backend: _Backend): - """Get the default attention backend of a device.""" - return _Backend.FLASH_ATTN + def get_device_capability(cls, device_id: int = 0): + return True @classmethod - def get_device_capability( + def has_device_capability( cls, + capability: Union[Tuple[int, int], int], device_id: int = 0, - ) -> Optional[DeviceCapability]: - major, minor = torch.cuda.get_device_capability(device_id) - return DeviceCapability(major=major, minor=minor) + ) -> bool: + return True @classmethod def get_device_name(cls, device_id: int = 0) -> str: @@ -56,37 +61,10 @@ class AscendPlatform(Platform): return torch.cuda.get_device_name(device_id) @classmethod - def get_device_total_memory(cls, device_id: int = 0) -> int: - """Get the total memory of a device in bytes.""" - device_props = torch.cuda.get_device_properties(device_id) - return device_props.total_memory - - @classmethod - def is_async_output_supported(cls, enforce_eager: Optional[bool]) -> bool: - """ - Check if the current platform supports async output. - """ - if enforce_eager: - # from vllm.logger import init_logger - # logger = init_logger(__name__) - logger.warning( - "To see benefits of async output processing, enable CUDA " - "graph. Since, enforce-eager is enabled, async output " - "processor cannot be used" - ) - return False + def is_async_output_supported(cls, _) -> bool: + """Check if the current platform supports async output.""" return True - @classmethod - def inference_mode(cls): - """A device-specific wrapper of `torch.inference_mode`. - - This wrapper is recommended because some hardware backends such as TPU - do not support `torch.inference_mode`. In such a case, they will fall - back to `torch.no_grad` by overriding this method. - """ - return torch.inference_mode(mode=True) - @classmethod def check_and_update_config(cls, vllm_config: VllmConfig) -> None: """ @@ -102,40 +80,50 @@ class AscendPlatform(Platform): scheduler_config = vllm_config.scheduler_config if parallel_config.worker_cls == "auto": - import vllm.envs as envs - if scheduler_config.is_multi_step: - if envs.VLLM_USE_V1: - raise NotImplementedError - else: - parallel_config.worker_cls = ( - "vllm.worker.multi_step_worker.MultiStepWorker" - ) + parallel_config.worker_cls = "vllm.worker.multi_step_worker.MultiStepWorker" elif vllm_config.speculative_config: - if envs.VLLM_USE_V1: - raise NotImplementedError - else: - parallel_config.worker_cls = ( - "vllm.spec_decode.spec_decode_worker.create_spec_worker" - ) - parallel_config.sd_worker_cls = "vllm.worker.worker.Worker" + parallel_config.worker_cls = "vllm.spec_decode.spec_decode_worker.create_spec_worker" + parallel_config.sd_worker_cls = "vllm.worker.worker.Worker" else: - if envs.VLLM_USE_V1: - parallel_config.worker_cls = "vllm.v1.worker.gpu_worker.Worker" - else: - parallel_config.worker_cls = "vllm.worker.worker.Worker" + parallel_config.worker_cls = "vllm.worker.worker.Worker" cache_config = vllm_config.cache_config if cache_config and cache_config.block_size is None: cache_config.block_size = 16 + @classmethod - def verify_quantization(cls, quant: str) -> None: - """ - Verify whether the quantization is supported by the current platform. 
- """ - if cls.supported_quantization and quant not in cls.supported_quantization: - raise ValueError( - f"{quant} quantization is currently not supported in " - f"{cls.device_name}." - ) + def get_attn_backend_cls(cls, selected_backend, head_size, dtype, kv_cache_dtype, block_size, use_v1, use_mla): + """Get the attention backend class of a device.""" + if use_v1: + raise RuntimeError("vLLM-MindSpore do not support v1 egine now!") + if use_mla: + logger.info("Using MindSpore MLA backend.") + return "vllm_mindspore.attention.backends.ms_attn.MLABackend" + + if selected_backend == _Backend.FLASH_ATTN or selected_backend is None: + logger.info("Using MindSpore Attention backend.") + return "vllm_mindspore.attention.backends.ms_attn.MsAttentionBackend" + + raise ValueError( + "Invaild attention backend %s for vLLM-MindSpore with head_size: %s, dtype: %s, kv_cache_dtype: %s, block_size: %s." + % (str(selected_backend), str(head_size), str(dtype), str(kv_cache_dtype), str(block_size)) + ) + + @classmethod + def get_current_memory_usage(cls, device: Optional[torch.types.Device] = None) -> float: + """Return the memory usage in bytes.""" + torch.cuda.reset_peak_memory_stats() + return torch.cuda.max_memory_allocated(device) + + @classmethod + def get_device_communicator_cls(cls) -> str: + """Get device specific communicator class for distributed communication.""" + return "vllm.distributed.device_communicators.base_device_communicator.DeviceCommunicatorBase" + + @classmethod + def get_device_total_memory(cls, device_id: int = 0) -> int: + """Get the total memory of a device in bytes.""" + device_props = torch.cuda.get_device_properties(device_id) + return device_props.total_memory diff --git a/vllm_mindspore/scripts.py b/vllm_mindspore/scripts.py index 274f0cb21107a853225327dd73af3957277db733..530c1e624b7b46ce8c83fb6cafd631ef7f245e55 100644 --- a/vllm_mindspore/scripts.py +++ b/vllm_mindspore/scripts.py @@ -1,6 +1,7 @@ #!/usr/bin/env python3 # encoding: utf-8 # Copyright 2025 Huawei Technologies Co., Ltd +# Copyright 2024 The vLLM team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/vllm_mindspore/sequence.py b/vllm_mindspore/sequence.py index 82b93f54646ec6e9f04f9142dfd87b97ac98fb43..c1ca3c750b32cd656e35b7584ad74bbef34cfa3c 100644 --- a/vllm_mindspore/sequence.py +++ b/vllm_mindspore/sequence.py @@ -1,6 +1,7 @@ #!/usr/bin/env python3 # encoding: utf-8 # Copyright 2025 Huawei Technologies Co., Ltd +# Copyright 2024 The vLLM team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
diff --git a/vllm_mindspore/tests/test_sampler.py b/vllm_mindspore/tests/test_sampler.py deleted file mode 100644 index e0d91147ce80960300b78abe55802c1fe5327653..0000000000000000000000000000000000000000 --- a/vllm_mindspore/tests/test_sampler.py +++ /dev/null @@ -1,168 +0,0 @@ -import vllm_mindspore -import itertools -import random -from dataclasses import dataclass -from typing import Dict, List, Optional, Tuple -from unittest.mock import Mock, patch - -import pytest -import torch - -from vllm_mindspore.model_executor.layers.sampler import Sampler -from vllm_mindspore.model_executor.sampling_metadata import SamplingMetadata -# from vllm_mindspore.model_executor.utils import set_random_seed -from vllm_mindspore.sequence import SamplingParams, SequenceData, SequenceGroupMetadata - -VOCAB_SIZE = 32000 -RANDOM_SEEDS = list(range(128)) - -class MockLogitsSampler(Sampler): - - def __init__(self, fake_logits: torch.Tensor): - super().__init__() - self.fake_logits = fake_logits - - def forward(self, *args, **kwargs): - return super().forward(*args, **kwargs) - -def _prepare_test( - batch_size: int -) -> Tuple[torch.Tensor, torch.Tensor, MockLogitsSampler]: - input_tensor = torch.rand((batch_size, 1024), dtype=torch.float16) - fake_logits = torch.full((batch_size, VOCAB_SIZE), - 1e-2, - dtype=input_tensor.dtype) - sampler = MockLogitsSampler(fake_logits) - return input_tensor, fake_logits, sampler - -def _do_sample( - batch_size: int, - input_tensor: torch.Tensor, - sampler: MockLogitsSampler, - sampling_params: SamplingParams, - device: str, -): - seq_group_metadata_list: List[SequenceGroupMetadata] = [] - seq_lens: List[int] = [] - for i in range(batch_size): - seq_group_metadata_list.append( - SequenceGroupMetadata( - request_id=f"test_{i}", - is_prompt=True, - seq_data={0: SequenceData.from_seqs([1, 2, 3])}, - sampling_params=sampling_params, - block_tables={0: [1]}, - )) - seq_lens.append(seq_group_metadata_list[-1].seq_data[0].get_len()) - - sampling_metadata = SamplingMetadata.prepare( - seq_group_metadata_list, - seq_lens, - query_lens=seq_lens, - device=device, - pin_memory=False) - return sampler(logits=input_tensor, sampling_metadata=sampling_metadata) - -def test_sampler_all_greedy(): - # set_random_seed(seed) - device='cuda' - # torch.set_default_device(device) - batch_size = random.randint(1, 256) - input_tensor, fake_logits, sampler = _prepare_test(batch_size) - - sampling_params = SamplingParams(temperature=0) - sampler_output = _do_sample(batch_size, fake_logits, sampler, - sampling_params, device) - expected = mint.argmax(fake_logits, dim=-1) - for i, sequence_output in enumerate(sampler_output): - for nth_output in sequence_output.samples: - assert nth_output.output_token == expected[i].item() - - -def test_sampler_all_random(): - # set_random_seed(seed) - # torch.set_default_device(device) - device='cuda' - batch_size = random.randint(1, 256) - _, fake_logits, sampler = _prepare_test(batch_size) - - for i in range(batch_size): - fake_logits[i, i] = 1e2 - - sampling_params = SamplingParams( - temperature=1.0, - n=random.randint(1, 10), - ) - sampler_output = _do_sample(batch_size, fake_logits, sampler, - sampling_params, device) - - for i, sequence_output in enumerate(sampler_output): - for nth_output in sequence_output.samples: - assert nth_output.output_token == i - - - -def test_sampler_repetition_penalty_mixed(): - device='cuda' - vocab_size = 8 - - def test_sampling_params(sampling_params: List[SamplingParams]): - - seq_group_metadata_list: List[SequenceGroupMetadata] = [] 
- seq_lens: List[int] = [] - for i in range(2): - seq_group_metadata_list.append( - SequenceGroupMetadata( - request_id=f"test_{i}", - is_prompt=True, - seq_data={0: SequenceData.from_seqs([1, 2, 3])}, - sampling_params=sampling_params[i], - block_tables={0: [1]}, - )) - seq_lens.append(seq_group_metadata_list[-1].seq_data[0].get_len()) - - sampling_metadata = SamplingMetadata.prepare( - seq_group_metadata_list, - seq_lens, - query_lens=seq_lens, - device=device, - pin_memory=False) - - fake_logits = torch.full((2, vocab_size), - 1e-2, - dtype=torch.float16) - - fake_logits[:, 5] = 1.1e-2 - fake_logits[:, 1] = 1.2e-2 - - sampler = MockLogitsSampler(fake_logits) - print(f'fake_logits is: {fake_logits}', flush = True) - - sampler_output = sampler(logits=fake_logits, - sampling_metadata=sampling_metadata) - - generated_tokens = [] - for output in sampler_output: - generated_tokens.append(output.samples[0].output_token) - - return generated_tokens - - # one configuration is greedy with repetition_penalty - sampling_params_rep = SamplingParams( - temperature=0.0, - repetition_penalty=2.0, - ) - - # other configuration is sampling w/o repetition_penalty - sampling_params_sample = SamplingParams( - temperature=1.0, - top_k=1, - ) - - tokens1 = test_sampling_params( - [sampling_params_rep, sampling_params_sample]) - - tokens2 = test_sampling_params( - [sampling_params_sample, sampling_params_rep]) - - assert tokens1[0] == tokens2[1] \ No newline at end of file diff --git a/vllm_mindspore/utils.py b/vllm_mindspore/utils.py index 6e2d6f03a0a010cc93d2b60668f4845808b45ce7..717416bbc7d3c9495ca94a63eb93ec9196d46740 100644 --- a/vllm_mindspore/utils.py +++ b/vllm_mindspore/utils.py @@ -1,6 +1,7 @@ #!/usr/bin/env python3 # encoding: utf-8 # Copyright 2025 Huawei Technologies Co., Ltd +# Copyright 2024 The vLLM team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -19,6 +20,7 @@ import contextlib import gc import logging import os +import sys from typing import ( TYPE_CHECKING, Callable, @@ -28,6 +30,7 @@ from typing import ( Tuple, Union, ) +import numpy as np import torch @@ -41,6 +44,7 @@ from vllm.utils import T, TORCH_DTYPE_TO_NUMPY_DTYPE, make_ndarray_with_pad import mindspore as ms from mindspore.common.initializer import Zero from mindspore import dtype as mstype +from mindspore.common.api import _pynative_executor from .scripts import env_setup @@ -76,99 +80,6 @@ def direct_register_custom_op( ): ... -@contextlib.contextmanager -def memory_profiling( - baseline_memory_in_bytes: int, weights_memory_in_bytes: int -) -> Generator["MemoryProfilingResult", None, None]: - """Memory profiling context manager. - baseline_memory_in_bytes: memory used by all the components other than - the current vLLM instance. It contains: memory used by other processes, memory - used by another vLLM instance in the same process, etc. It is usually measured - before the current vLLM instance initialize the device. And we assume it is - constant during the profiling of the current vLLM instance. - weights_memory_in_bytes: memory used by PyTorch when loading the model weights. - Note that, before loading the model weights, we also initialize the device - and distributed environment, which may consume some memory. This part is not - included in the weights_memory_in_bytes because PyTorch does not control it. - - The memory in one GPU can be classified into 3 categories: - 1. memory used by anything other than the current vLLM instance. 
- 2. memory used by torch in the current vLLM instance. - 3. memory used in the current vLLM instance, but not by torch. - - A quantitive example: - - Before creating the current vLLM instance: - category 1: 1 GiB - category 2: 0 GiB - category 3: 0 GiB - - After creating the current vLLM instance and loading the model, - (i.e. before profiling): - category 1: 1 GiB - category 2: 2 GiB (model weights take 2 GiB) - category 3: 0.5 GiB (memory used by NCCL) - - During profiling (peak): - category 1: 1 GiB - category 2: 4 GiB (peak activation tensors take 2 GiB) - category 3: 1 GiB (memory used by NCCL + buffers for some attention backends) - - After profiling: - category 1: 1 GiB - category 2: 3 GiB (after garbage-collecting activation tensors) - category 3: 1 GiB (memory used by NCCL + buffers for some attention backends) - - In this case, non-kv cache takes 5 GiB in total, including: - a. 2 GiB used by the model weights (category 2) - b. 2 GiB reserved for the peak activation tensors (category 2) - c. 1 GiB used by non-torch components (category 3) - - The memory used for loading weights (a.) is directly given from the argument `weights_memory_in_bytes`. - - The increase of ``torch.cuda.memory_stats()["allocated_bytes.all.peak"]` after profiling gives (b.). - - (c.) is tricky. We measure the total memory used in this GPU (`torch.cuda.mem_get_info()[1] - torch.cuda.mem_get_info()[0]`), - subtract the baseline memory, the memory used by the model weights, and diff of `torch.cuda.memory_stats()["allocated_bytes.all.current"]`. - """ # noqa - torch.cuda.reset_peak_memory_stats() - - from vllm.utils import MemoryProfilingResult - - result = MemoryProfilingResult() - - result.baseline_memory_in_bytes = baseline_memory_in_bytes - # the part of memory used for holding the model weights - result.weights_memory_in_bytes = weights_memory_in_bytes - - result.before_profile.measure() - - yield result - - gc.collect() - torch.cuda.empty_cache() - - result.after_profile.measure() - - diff = result.after_profile - result.before_profile - result.torch_peak_increase_in_bytes = diff.torch_peak_in_bytes - - # For mindspore, the memory is allocated and free in memory pool, so cannot read the current used memory by `torch.cuda.mem_get_info`. - current_cuda_memory_bytes = result.after_profile.torch_memory_in_bytes - result.non_torch_increase_in_bytes = ( - current_cuda_memory_bytes - - baseline_memory_in_bytes - - weights_memory_in_bytes - - diff.torch_memory_in_bytes - ) # noqa - result.profile_time = diff.timestamp - result.non_kv_cache_memory_in_bytes = ( - result.non_torch_increase_in_bytes - + result.torch_peak_increase_in_bytes - + result.weights_memory_in_bytes - ) # noqa - - def _create_empty_tensor(ms_type): init_func = Zero() init_func.__enable_zero_dim__ = True @@ -178,6 +89,10 @@ def _create_empty_tensor(ms_type): return init_tensor +def _create_dummy_block_tables(dtype): + return ms.ops.zeros((1, 1), dtype=dtype) + + def make_tensor_with_pad( x: List[List[T]], pad: T, @@ -199,7 +114,7 @@ def make_tensor_with_pad( pin_memory = False if padded_x.size == 0: - tensor = _create_empty_tensor(dtype) + tensor = _create_dummy_block_tables(dtype) else: tensor = torch.from_numpy(padded_x) if pin_memory: @@ -299,14 +214,21 @@ def is_mindformers_model_backend(): def check_ready(): + import vllm.envs as envs from mindspore import set_context + if envs.VLLM_USE_V1: + raise NotImplementedError("vLLM-MindSpore does not support VLLM V1 now!") + # Common environment variables of predict. 
set_context(jit_config={"jit_level": "O0", "infer_boost": "on"}) + if os.getenv("MS_MEMPOOL_BLOCK_SIZE"): + set_context(mempool_block_size=f"{os.environ['MS_MEMPOOL_BLOCK_SIZE']}GB") + if is_mindformers_model_backend(): logger.info("Run with Mindformers backend!") - necessary_envs = ("vLLM_MODEL_MEMORY_USE_GB", "MINDFORMERS_MODEL_CONFIG") + necessary_envs = ("MINDFORMERS_MODEL_CONFIG", ) lost_envs = [env_item for env_item in necessary_envs if not os.getenv(env_item)] if lost_envs: @@ -317,28 +239,29 @@ def check_ready(): mindformers_default_env = { "MS_INTERNAL_DISABLE_CUSTOM_KERNEL_LIST": "FlashAttentionScore,PagedAttention", - "MS_ALLOC_CONF": "enable_vmm:False", } env_setup(mindformers_default_env) - - set_context(mode=0, device_target="Ascend", max_call_depth=10000) else: - env_setup({"MS_ALLOC_CONF": "enable_vmm:False", }) logger.info("Run with native model backend!") -def cal_block_num(cache_config, model_config, parallel_config): - from vllm.worker.cache_engine import CacheEngine - - torch.cuda.empty_cache() - torch.cuda.reset_peak_memory_stats() - _, total_gpu_memory = torch.cuda.mem_get_info() - memory_can_use = total_gpu_memory * cache_config.gpu_memory_utilization +def convert_np_to_ms_dtype(value): + """convert_np_to_ms_dtype""" + if value.dtype == np.int8: + value_dtype = ms.int8 + elif value.dtype == np.int32: + value_dtype = ms.int32 + elif value.dtype == np.int64: + value_dtype = ms.int64 + elif value.dtype == np.float64: + value_dtype = ms.float64 + elif value.dtype == np.float32: + value_dtype = ms.float32 + else: + value_dtype = ms.bfloat16 + return value_dtype - model_use_memory_b = int(os.getenv("vLLM_MODEL_MEMORY_USE_GB")) * 1024 * 1024 * 1024 - available_cache_memory = memory_can_use - model_use_memory_b - cache_block_size = CacheEngine.get_cache_block_size( - cache_config, model_config, parallel_config - ) - num_gpu_blocks = int(available_cache_memory // cache_block_size) - return num_gpu_blocks +# Replace the directly loaded module in vllm, such as 'from module import xxx' +def update_modules(name, module): + logger.info(f"replace module {name} by {module}") + sys.modules.update({name: module}) diff --git a/vllm_mindspore/worker/cache_engine.py b/vllm_mindspore/worker/cache_engine.py index 99fc7693c486fb51188092a3777e8c506e587b5a..dfd0ef10e64630933b81764626aab2c986fbf2d5 100644 --- a/vllm_mindspore/worker/cache_engine.py +++ b/vllm_mindspore/worker/cache_engine.py @@ -1,6 +1,7 @@ #!/usr/bin/env python3 # encoding: utf-8 # Copyright 2025 Huawei Technologies Co., Ltd +# Copyright 2024 The vLLM team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -18,18 +19,13 @@ from typing import List +import mindspore as ms +from mindspore import mutable from vllm.logger import init_logger +from vllm_mindspore.utils import MsKVCache, get_valid_dtype -logger = init_logger(__name__) - -from vllm_mindspore.utils import ( - MsKVCache, - get_valid_dtype, - is_mindformers_model_backend, -) -import mindspore as ms -from mindspore import mutable +logger = init_logger(__name__) def create_block(shape, dtype, name=None, device=None): @@ -52,7 +48,6 @@ def ms_allocate_kv_cache( self.dtype = get_valid_dtype(self.dtype) - # TODO(tronzhang): A shape with (2, ...) for a kv tensor cannot support in mindspore's tensor and block operation, so split it to two tensor. 
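
The allocation loop that follows keeps each layer's key cache and value cache as two separate tensors, since (per the TODO removed above) a single (2, ...) tensor is not supported by MindSpore's tensor and block operations. A standalone sketch of that layout, with ms.ops.zeros standing in for create_block and hypothetical shape parameters:

# Sketch only: per-layer KV cache held as a (key_cache, value_cache) pair.
import mindspore as ms
from mindspore import mutable

def allocate_kv_pairs(num_layers, num_blocks, block_size, num_kv_heads, head_size,
                      dtype=ms.float16):
    shape = (num_blocks, block_size, num_kv_heads, head_size)
    caches = []
    for _ in range(num_layers):
        key_cache = ms.ops.zeros(shape, dtype=dtype)    # stands in for create_block()
        value_cache = ms.ops.zeros(shape, dtype=dtype)
        caches.append(mutable((key_cache, value_cache)))
    return mutable(caches)
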
for _ in range(self.num_attention_layers): device_type = "CPU" if device == "cpu" else "Ascend" current_cache = [] @@ -77,65 +72,3 @@ def ms_swap_out(self, src_to_dst: ms.Tensor) -> None: self.attn_backend.swap_blocks( self.gpu_cache[i], self.cpu_cache[i], src_to_dst, True ) - - -def cache_engine_init( - self, - cache_config, - model_config, - parallel_config, - device_config, -) -> None: - - from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, LayerBlockType - from vllm.attention import get_attn_backend - - self.cache_config = cache_config - self.model_config = model_config - self.parallel_config = parallel_config - self.device_config = device_config - - self.head_size = model_config.get_head_size() - # Models like Jamba, have mixed typed layers, E.g Mamba - self.num_attention_layers = model_config.get_num_layers_by_block_type( - parallel_config, LayerBlockType.attention - ) - self.num_kv_heads = model_config.get_num_kv_heads(parallel_config) - - self.block_size = cache_config.block_size - self.num_gpu_blocks = cache_config.num_gpu_blocks - if self.num_gpu_blocks: - self.num_gpu_blocks //= parallel_config.pipeline_parallel_size - self.num_cpu_blocks = cache_config.num_cpu_blocks - if self.num_cpu_blocks: - self.num_cpu_blocks //= parallel_config.pipeline_parallel_size - - if cache_config.cache_dtype == "auto": - self.dtype = model_config.dtype - else: - self.dtype = STR_DTYPE_TO_TORCH_DTYPE[cache_config.cache_dtype] - - if ( - is_mindformers_model_backend() - and hasattr(model_config.hf_text_config, "model_type") - and (model_config.hf_text_config.model_type in ("deepseek_v3",)) - ): - is_mla = True - else: - is_mla = False - - # Get attention backend. - self.attn_backend = get_attn_backend( - self.head_size, - model_config.dtype, - cache_config.cache_dtype, - self.block_size, - model_config.is_attention_free, - use_mla=is_mla, - ) - - # Initialize the cache. - self.gpu_cache = self._allocate_kv_cache( - self.num_gpu_blocks, self.device_config.device_type - ) - self.cpu_cache = self._allocate_kv_cache(self.num_cpu_blocks, "cpu") diff --git a/vllm_mindspore/worker/model_runner.py b/vllm_mindspore/worker/model_runner.py index e8752d942da2e4c3ffa87f4889de1b2e82448cb3..561fd2021dd7c84764a04aaa1b3b06389f720b55 100644 --- a/vllm_mindspore/worker/model_runner.py +++ b/vllm_mindspore/worker/model_runner.py @@ -1,6 +1,7 @@ #!/usr/bin/env python3 # encoding: utf-8 # Copyright 2025 Huawei Technologies Co., Ltd +# Copyright 2024 The vLLM team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -25,8 +26,7 @@ from vllm.sampling_params import SamplingParams from vllm.sequence import SequenceGroupMetadata from vllm_mindspore.utils import STR_DTYPE_TO_TENSOR_DTYPE -from mindspore.common import dtype as mstype -from mindspore import mutable, Tensor +from mindspore import mutable logger = init_logger(__name__) @@ -40,118 +40,142 @@ def _get_cuda_graph_pad_size( return -1 -def profile_run(self) -> None: - # Enable top-k sampling to reflect the accurate memory usage. - sampling_params = SamplingParams(top_p=0.99, top_k=self.vocab_size - 1) - max_num_batched_tokens = self.scheduler_config.max_num_batched_tokens - max_num_seqs = self.scheduler_config.max_num_seqs - # This represents the maximum number of different requests - # that will have unique loras, an therefore the max amount of memory - # consumption create dummy lora request copies from the lora request - # passed in, which contains a lora from the lora warmup path. 
- dummy_lora_requests: List[LoRARequest] = [] - dummy_lora_requests_per_seq: List[LoRARequest] = [] - if self.lora_config: - assert self.lora_manager is not None - with self.lora_manager.dummy_lora_cache(): - for idx in range(self.lora_config.max_loras): - lora_id = idx + 1 - dummy_lora_request = LoRARequest( - lora_name=f"warmup_{lora_id}", - lora_int_id=lora_id, - lora_path="/not/a/real/path", - ) - self.lora_manager.add_dummy_lora(dummy_lora_request, - rank=LORA_WARMUP_RANK) - dummy_lora_requests.append(dummy_lora_request) - dummy_lora_requests_per_seq = [ - dummy_lora_requests[idx % len(dummy_lora_requests)] - for idx in range(max_num_seqs) - ] - - # Profile memory usage with max_num_sequences sequences and the total - # number of tokens equal to max_num_batched_tokens. - seqs: List[SequenceGroupMetadata] = [] - # Additional GPU memory may be needed for multi-modal encoding, which - # needs to be accounted for when calculating the GPU blocks for - # vLLM blocker manager. - # To exercise the worst scenario for GPU memory consumption, - # the number of seqs (batch_size) is chosen to maximize the number - # of images processed. - - max_mm_tokens = self.mm_registry.get_max_multimodal_tokens( - self.model_config) - if max_mm_tokens > 0: - max_num_seqs_orig = max_num_seqs - max_num_seqs = min(max_num_seqs, - max_num_batched_tokens // max_mm_tokens) - if max_num_seqs < 1: - expr = (f"min({max_num_seqs_orig}, " - f"{max_num_batched_tokens} // {max_mm_tokens})") - logger.warning( - "Computed max_num_seqs (%s) to be less than 1. " - "Setting it to the minimum value of 1.", expr) - max_num_seqs = 1 - - batch_size = 0 - for group_id in range(max_num_seqs): - seq_len = (max_num_batched_tokens // max_num_seqs + - (group_id < max_num_batched_tokens % max_num_seqs)) - batch_size += seq_len - - dummy_data = self.input_registry \ - .dummy_data_for_profiling(self.model_config, - seq_len, - self.mm_registry) - - seq = SequenceGroupMetadata( - request_id=str(group_id), - is_prompt=True, - seq_data={group_id: dummy_data.seq_data}, - sampling_params=sampling_params, - block_tables=None, - lora_request=dummy_lora_requests_per_seq[group_id] - if dummy_lora_requests_per_seq else None, - multi_modal_data=dummy_data.multi_modal_data, - multi_modal_placeholders=dummy_data.multi_modal_placeholders, - ) - seqs.append(seq) - - # Run the model with the dummy inputs. - num_layers = self.model_config.get_num_layers(self.parallel_config) - # use an empty tensor instead of `None`` to force Dynamo to pass - # it by reference, rather by specializing on the value ``None``. - # the `dtype` argument does not matter, and we use `float32` as - # a placeholder (it has wide hardware support). - # it is important to create tensors inside the loop, rather than - # multiplying the list, to avoid Dynamo from treating them as - # tensor aliasing. - - # TODO(tronzhang): MindSpore's tensor view is limit now, delete this whole funtion patching latter. 
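
Both the profile_run removed here and the _dummy_run that replaces it below hand the model zero-length placeholder KV caches, so the dummy forward pass allocates no real cache memory. A condensed sketch of that structure, assuming vllm-mindspore's environment where torch is backed by msadapter (plain PyTorch tensors would not pass through mindspore.mutable):

# Sketch: zero-length KV cache placeholders for the profiling/dummy pass.
import torch
from mindspore import mutable

def empty_kv_caches(num_layers, block_size, num_kv_heads, head_size,
                    dtype=torch.float16, device=None):
    kv_shape = [0, block_size, num_kv_heads, head_size]
    return mutable([
        mutable((
            mutable(torch.tensor([], dtype=dtype, device=device).reshape(kv_shape)),
            mutable(torch.tensor([], dtype=dtype, device=device).reshape(kv_shape)),
        ))
        for _ in range(num_layers)
    ])
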
- kv_cache_dtype = self.model_config.dtype if self.cache_config.cache_dtype == "auto" \ - else self.cache_config.cache_dtype - kv_cache_dtype = STR_DTYPE_TO_TENSOR_DTYPE[kv_cache_dtype] - block_size = self.cache_config.block_size - num_kv_heads = self.model_config.get_num_kv_heads(self.parallel_config) - head_size = self.model_config.get_head_size() - kv_shape = [0, block_size, num_kv_heads, head_size] - kv_caches = mutable([ - mutable(( - mutable(torch.tensor([], dtype=kv_cache_dtype, device=self.device).reshape(kv_shape)), - mutable(torch.tensor([], dtype=kv_cache_dtype, device=self.device).reshape(kv_shape)), - )) - for _ in range(num_layers) - ]) - finished_requests_ids = [seq.request_id for seq in seqs] - model_input = self.prepare_model_input( - seqs, finished_requests_ids=finished_requests_ids) - intermediate_tensors = None - if not get_pp_group().is_first_rank: - intermediate_tensors = self.model.make_empty_intermediate_tensors( - batch_size=batch_size, - dtype=self.model_config.dtype, - device=self.device) - - self.execute_model(model_input, kv_caches, intermediate_tensors) - torch.cuda.synchronize() - return +def _dummy_run(self, + max_num_batched_tokens: int, + max_num_seqs: int = 1) -> None: + with self.set_in_profile_run(): + # Enable top-k sampling to reflect the accurate memory usage. + sampling_params = \ + SamplingParams(top_p=0.99, top_k=self.vocab_size - 1) + + # This represents the maximum number of different requests + # that will have unique loras, an therefore the max amount of memory + # consumption create dummy lora request copies from the lora request + # passed in, which contains a lora from the lora warmup path. + dummy_lora_requests: List[LoRARequest] = [] + dummy_lora_requests_per_seq: List[LoRARequest] = [] + if self.lora_config: + assert self.lora_manager is not None + with self.lora_manager.dummy_lora_cache(): + for idx in range(self.lora_config.max_loras): + lora_id = idx + 1 + dummy_lora_request = LoRARequest( + lora_name=f"warmup_{lora_id}", + lora_int_id=lora_id, + lora_path="/not/a/real/path", + ) + self.lora_manager.add_dummy_lora(dummy_lora_request, + rank=LORA_WARMUP_RANK) + dummy_lora_requests.append(dummy_lora_request) + dummy_lora_requests_per_seq = [ + dummy_lora_requests[idx % len(dummy_lora_requests)] + for idx in range(max_num_seqs) + ] + + # Profile memory usage with max_num_sequences sequences and the + # total number of tokens equal to max_num_batched_tokens. + seqs: List[SequenceGroupMetadata] = [] + # Additional GPU memory may be needed for multi-modal encoding, + # which needs to be accounted for when calculating the GPU blocks + # for vLLM blocker manager. + # To exercise the worst scenario for GPU memory consumption, + # the number of seqs (batch_size) is chosen to maximize the number + # of images processed. + + max_mm_tokens = self.mm_registry.get_max_multimodal_tokens( + self.model_config) + if max_mm_tokens > 0: + max_num_seqs_orig = max_num_seqs + max_num_seqs = min(max_num_seqs, + max_num_batched_tokens // max_mm_tokens) + if max_num_seqs < 1: + expr = (f"min({max_num_seqs_orig}, " + f"{max_num_batched_tokens} // {max_mm_tokens})") + logger.warning( + "Computed max_num_seqs (%s) to be less than 1. 
" + "Setting it to the minimum value of 1.", expr) + max_num_seqs = 1 + + batch_size = 0 + for group_id in range(max_num_seqs): + seq_len = (max_num_batched_tokens // max_num_seqs + + (group_id < max_num_batched_tokens % max_num_seqs)) + batch_size += seq_len + + dummy_data = self.input_registry \ + .dummy_data_for_profiling(self.model_config, + seq_len, + self.mm_registry) + + seq = SequenceGroupMetadata( + request_id=str(group_id), + is_prompt=True, + seq_data={group_id: dummy_data.seq_data}, + sampling_params=sampling_params, + block_tables=None, + lora_request=dummy_lora_requests_per_seq[group_id] + if dummy_lora_requests_per_seq else None, + multi_modal_data=dummy_data.multi_modal_data, + multi_modal_placeholders=dummy_data. + multi_modal_placeholders, + ) + seqs.append(seq) + + # Run the model with the dummy inputs. + num_layers = self.model_config.get_num_layers(self.parallel_config) + # use an empty tensor instead of `None`` to force Dynamo to pass + # it by reference, rather by specializing on the value ``None``. + # the `dtype` argument does not matter, and we use `float32` as + # a placeholder (it has wide hardware support). + # it is important to create tensors inside the loop, rather than + # multiplying the list, to avoid Dynamo from treating them as + # tensor aliasing. + kv_cache_dtype = self.model_config.dtype if self.cache_config.cache_dtype == "auto" \ + else self.cache_config.cache_dtype + kv_cache_dtype = STR_DTYPE_TO_TENSOR_DTYPE[kv_cache_dtype] + block_size = self.cache_config.block_size + num_kv_heads = self.model_config.get_num_kv_heads(self.parallel_config) + head_size = self.model_config.get_head_size() + kv_shape = [0, block_size, num_kv_heads, head_size] + kv_caches = mutable([ + mutable(( + mutable(torch.tensor([], dtype=kv_cache_dtype, device=self.device).reshape(kv_shape)), + mutable(torch.tensor([], dtype=kv_cache_dtype, device=self.device).reshape(kv_shape)), + )) + for _ in range(num_layers) + ]) + finished_requests_ids = [seq.request_id for seq in seqs] + model_input = self.prepare_model_input( + seqs, finished_requests_ids=finished_requests_ids) + intermediate_tensors = None + if not get_pp_group().is_first_rank: + intermediate_tensors = \ + self.model.make_empty_intermediate_tensors( + batch_size=batch_size, + dtype=self.model_config.dtype, + device=self.device) + + # Disable KV Scale Calculation for dummy data during profile run + if model_input.attn_metadata is not None: + model_input.attn_metadata.enable_kv_scales_calculation = False + + self.execute_model(model_input, kv_caches, intermediate_tensors) + torch.cuda.synchronize() + if self.lora_config: + # Remove dummy loras. 
+ assert self.lora_manager is not None + self.remove_all_loras() + return + + +MULTI_STEP_ATTENTION_BACKENDS = [ + "MS_MLA", "MS_ATTN", "NO_ATTENTION" +] +MULTI_STEP_CHUNKED_PREFILL_ATTENTION_BACKENDS = ["MS_MLA", "MS_ATTN"] + +def _get_supported_attention_backends(chunked_prefill_enabled: bool) \ + -> List[str]: + if chunked_prefill_enabled: + return MULTI_STEP_CHUNKED_PREFILL_ATTENTION_BACKENDS + else: + return MULTI_STEP_ATTENTION_BACKENDS \ No newline at end of file diff --git a/vllm_mindspore/worker/profile.py b/vllm_mindspore/worker/profile.py new file mode 100644 index 0000000000000000000000000000000000000000..728362d20fd722645a0a084bce3f3615a00048eb --- /dev/null +++ b/vllm_mindspore/worker/profile.py @@ -0,0 +1,64 @@ +import os +import sys +import subprocess + +from mindspore import Profiler +from mindspore.profiler import ProfilerLevel, ProfilerActivity, AicoreMetrics +from mindspore.profiler.common.profiler_context import ProfilerContext + +PROFILE_ENV_NAME = "VLLM_TORCH_PROFILER_DIR" + +def shell_analyse(path): + subprocess.run( + [sys.executable, "-c", f'from mindspore import Profiler; Profiler.offline_analyse("{path}")'], + shell=False, check=True) + +class AdapterProfiler: + def __init__(self, path): + self.profiler = Profiler( + profiler_level=ProfilerLevel.Level1, + activities=[ProfilerActivity.CPU, ProfilerActivity.NPU], + output_path=path, + start_profile=False + ) + + def start(self): + self.profiler.start() + + def stop(self): + self.profiler.stop() + path = ProfilerContext().ascend_ms_dir + shell_analyse(path) + +def wrapper_worker_init(fun): + def new_fun(*arg, **kwarg): + # Profiler initialization during worker init triggers device setup, + # causing init_device to fail due to duplicate configuration. + # To fix this, temporarily unset VLLM_TORCH_PROFILER_DIR before worker init, + # restore it afterward, then initialize profiler properly after worker init_device completes + profile_output_path = os.getenv(PROFILE_ENV_NAME, "") + if profile_output_path: + del os.environ[PROFILE_ENV_NAME] + + fun(*arg, **kwarg) + + if profile_output_path: + os.environ[PROFILE_ENV_NAME] = profile_output_path + return new_fun + +def wrapper_worker_init_device(fun): + def new_fun(*arg, **kwarg): + fun(*arg, **kwarg) + + # The actual profiler initialization is performed after the worker.init_device() method, + # based on the VLLM_TORCH_PROFILER_DIR environment variable. + self = arg[0] + profile_output_path = os.getenv(PROFILE_ENV_NAME, "") + if profile_output_path: + print(f"Profiling enabled. Traces will be saved to: {profile_output_path}") + self.profiler = AdapterProfiler(profile_output_path) + else: + self.profiler = None + return new_fun + + diff --git a/vllm_mindspore/worker/spec_decode_worker.py b/vllm_mindspore/worker/spec_decode_worker.py new file mode 100644 index 0000000000000000000000000000000000000000..91a717cc999ffdca82749e287d9b3bc6992e845f --- /dev/null +++ b/vllm_mindspore/worker/spec_decode_worker.py @@ -0,0 +1,364 @@ +#!/usr/bin/env python3 +# encoding: utf-8 +# Copyright 2025 Huawei Technologies Co., Ltd +# Copyright 2024 The vLLM team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +# ToDo: remove when msadapter supports +from collections import defaultdict +from typing import Any, Dict, List, Optional, Set, Tuple + +import torch + +from vllm.worker.worker_base import WorkerBase +from vllm.spec_decode.proposer_worker_base import ProposerWorkerBase +from vllm.spec_decode.metrics import AsyncMetricsCollector +from vllm.spec_decode.interfaces import (SpeculativeProposals, + SpeculativeScorer, SpeculativeScores) +from vllm.sequence import (VLLM_INVALID_TOKEN_ID, + CompletionSequenceGroupOutput, ExecuteModelRequest, + HiddenStates, SequenceGroupMetadata, + get_all_seq_ids_and_request_ids) +from vllm.model_executor.layers.spec_decode_base_sampler import ( + SpecDecodeBaseSampler, SpecDecodeStochasticBaseSampler) + +from vllm.spec_decode.util import (Timer, create_logprobs_output, + create_sequence_group_output, + get_all_num_logprobs, + get_sampled_token_logprobs, nvtx_range, + split_batch_by_proposal_len) + +# MQAScore is only supported in FLASH_ATTN and eager mode. +def spec_decode_worker_init( + self, + proposer_worker: ProposerWorkerBase, + scorer_worker: WorkerBase, + spec_decode_sampler: SpecDecodeBaseSampler, + disable_mqa_scorer: bool = False, + disable_logprobs: bool = False, + disable_log_stats: bool = False, + metrics_collector: Optional[AsyncMetricsCollector] = None, + disable_by_batch_size: Optional[int] = None, + allow_zero_draft_token_step: Optional[bool] = True, + enable_lm_head_weight_load: Optional[bool] = False, + num_spec_prefill_steps: int = 1, +): + self.proposer_worker = proposer_worker + self.scorer_worker = scorer_worker + scorer_runner = getattr(self.scorer_worker, "model_runner", None) + self.generators = scorer_runner.get_generators( + ) if scorer_runner else None + self.disable_by_batch_size = disable_by_batch_size or float("inf") + self.spec_decode_sampler = spec_decode_sampler + self._allow_zero_draft_token_step = allow_zero_draft_token_step + self._enable_lm_head_weight_load = enable_lm_head_weight_load + self._metrics = AsyncMetricsCollector( + self.spec_decode_sampler + ) if metrics_collector is None else metrics_collector + # Tracks the sequence IDs that received a bonus token ID in + # their last forward pass. Needed only if KV cache is being + # used for token generation such as in the case of MultiStepWorker. + self._seq_with_bonus_token_in_last_step: Set[int] = set() + # Tracks the currently active request ids and the sequence IDs + # corresponding to them + self._request_id_seq_id_mapping: Dict[str, Set[int]] = defaultdict(set) + # Tracks if the proposer worker uses the KV cache or not. + + self.probs_dtype = self.spec_decode_sampler.probs_dtype + self.token_id_dtype = self.spec_decode_sampler.token_id_dtype + # Lazy initialization. + self.scorer: SpeculativeScorer + self.disable_mqa_scorer = False + + # Hidden states from target model to pass to proposer + # in the subsequent step. 
+ self.previous_hidden_states: Optional[HiddenStates] = None + self._disable_logprobs = disable_logprobs + self._disable_log_stats = disable_log_stats + self._num_spec_prefill_steps = num_spec_prefill_steps + +# msadapter does not support to slice tensor with empty index, +# rewrite this method to optimize the performance(almost 2ms) +@nvtx_range("spec_decode_worker._verify_tokens") +def _verify_tokens( + self, + seq_group_metadata_list: List[SequenceGroupMetadata], + proposal_scores: SpeculativeScores, + proposals: SpeculativeProposals, + max_proposal_len: int, +) -> Tuple[torch.Tensor, torch.Tensor]: + """Determine which speculative tokens are accepted using the + probabilities of each token according to the proposer and scorer models. + + Returns a tuple of Tensors, one for the accepted token ids and one for + the logprobs according to the scoring model. + """ + proposal_lens_list = proposals.proposal_lens.tolist() + + # vLLM currently only supports proposal lens equal to zero or the batch + # proposal len. This adds some complexity (splitting the batch into spec + # and non spec sequences) and should be removed in the future. It can be + # done by supporting per-sequence proposal lens. + (_, spec_indices), (_, non_spec_indices) = split_batch_by_proposal_len( + seq_group_metadata_list, proposal_lens_list) + original_indices = spec_indices + non_spec_indices + + proposal_verifier_probs = proposal_scores.probs + bonus_token_ids = proposal_scores.token_ids[:, -1:] + proposal_probs = proposals.proposal_probs + proposal_token_ids = proposals.proposal_token_ids + if non_spec_indices: + # Get probabilities of target model, including bonus tokens. + proposal_verifier_probs = proposal_verifier_probs[spec_indices] + # Get bonus tokens from target model. + bonus_token_ids = bonus_token_ids[spec_indices] + # Get probabilities according to proposal method. + proposal_probs = proposal_probs[spec_indices] + # Get proposed tokens. + proposal_token_ids = proposal_token_ids[spec_indices] + + # Sampler arguments + sampler_extra_kwargs: Dict[str, Any] = {} + if self.generators and isinstance(self.spec_decode_sampler, + SpecDecodeStochasticBaseSampler): + sampler_extra_kwargs["seeded_seqs"] = { + idx: self.generators[sgm.request_id] + for idx, sgm in enumerate(seq_group_metadata_list) + if sgm.sampling_params.seed is not None + } + + accepted_token_ids = self.spec_decode_sampler( + target_with_bonus_probs=proposal_verifier_probs, + bonus_token_ids=bonus_token_ids, + draft_probs=proposal_probs, + draft_token_ids=proposal_token_ids, + **sampler_extra_kwargs, + ) + if non_spec_indices: + # Get non-speculative sampled tokens from target model. + non_spec_token_ids = proposal_scores.token_ids[non_spec_indices].expand(-1, max_proposal_len + 1).clone() + + # Append output tokens from non-speculative sequences to + # the accepted token ids tensor. + non_spec_token_ids[:, 1:] = -1 + accepted_token_ids = torch.cat([accepted_token_ids, non_spec_token_ids]) + # Rearrange so that results are in the order of the original seq group + # metadata. 
+ accepted_token_ids[original_indices] = accepted_token_ids.clone() + + logprobs = proposal_scores.logprobs + # B x K+1 x D + hidden_states = proposal_scores.hidden_states + if hidden_states is not None: + # Only get terminal hidden states for next step + terminal_metadata = [ + sg for sg in seq_group_metadata_list if sg.do_sample + ] + + # Contract hidden states based on accepted tokens + hs_size = hidden_states.shape[-1] + accepted_index = accepted_token_ids + 1 # Convert -1 to 0 + accepted_index = accepted_index.count_nonzero(dim=1).add_(-1) # b + # Drop non-terminal prefill chunks hidden states. + if VLLM_INVALID_TOKEN_ID in accepted_index.tolist(): + hidden_states = hidden_states[accepted_index != VLLM_INVALID_TOKEN_ID] + accepted_index = accepted_index[accepted_index != VLLM_INVALID_TOKEN_ID] + assert len(accepted_index) == hidden_states.shape[0] == len( terminal_metadata) + index = accepted_index[:, None, None].expand(-1, 1, hs_size) # b x 1 x d + second_last_token_hidden_states = hidden_states[:, -2] # b x d + hidden_states = hidden_states.gather(1, index).squeeze(1) # b x d + # Store hidden states from target model for subsequent decode step + self.previous_hidden_states = HiddenStates( + hidden_states, terminal_metadata, + second_last_token_hidden_states) + return accepted_token_ids, logprobs + + +from vllm.model_executor.layers.sampler import SamplerOutput +from vllm.spec_decode.spec_decode_worker import prepare_prefill_hidden_states + +# the 'where' ops in msadapter does not support condition-only inputs, use nonzero +@nvtx_range("spec_decode_worker._run_no_spec") +def _run_no_spec(self, execute_model_req: ExecuteModelRequest, + skip_proposer: bool) -> List[SamplerOutput]: + """Run a single generation step without any speculation. The input is + sent to the proposer and scorer model so that the KV cache is consistent + between the two. When skip_proposer is True, the proposer model is + not called, meaning that the kv-cache in proposer for requests is not + updated, so they cannot enable spec decode in the rest decoding. + """ + + sampler_output = self.scorer_worker.execute_model(execute_model_req) + assert len(sampler_output) == 1 + sampler_output = sampler_output[0] + + # Store hidden states from target model execution, BxD. + hidden_states = sampler_output.hidden_states + if hidden_states is not None: + # Only decodes and prefill terminal chunks need a hidden state. + seq_group_meta_with_hidden = [ + sg for sg in execute_model_req.seq_group_metadata_list + if sg.do_sample + ] + if any(seq.is_prompt for seq in seq_group_meta_with_hidden): + # Drop hidden_states with no prediction (eg non-terminal chunks) + hidden_states = hidden_states[ + (sampler_output.sampled_token_ids - VLLM_INVALID_TOKEN_ID).nonzero(as_tuple=True)[0]] + if self.previous_hidden_states is None and len( + seq_group_meta_with_hidden): + self.previous_hidden_states = HiddenStates( + hidden_states, seq_group_meta_with_hidden) + elif self.previous_hidden_states and len( + seq_group_meta_with_hidden): + self.previous_hidden_states.update(hidden_states, + seq_group_meta_with_hidden) + + if not skip_proposer: + # We prepare the prefill hidden states here so that there no + # additional complexity in worker for spec_decode vs non_spec_decode + # flow and execute_model doesn't need additional modifications. 
+ execute_model_req.previous_hidden_states = \ + prepare_prefill_hidden_states( + sampler_output.prefill_hidden_states) + for i in range(self._num_spec_prefill_steps): + execute_model_req.spec_step_idx = i + self.proposer_worker.execute_model(execute_model_req) + + sampler_output_to_return = (self._serialize_sampler_output_no_logprobs( + execute_model_req=execute_model_req, sampler_output=sampler_output) + if self._disable_logprobs else + [sampler_output]) + + # Clear device tensors from sampler output. This reduces communication + # overhead when the engine runs in a different process than the workers. + sampler_output.sampled_token_probs = None + sampler_output.sampled_token_ids = None + sampler_output.logprobs = None + return sampler_output_to_return + + +# the output of 'tensor.max()' does not consistent with torch +def _create_output( + self, + accepted: torch.Tensor, # [batch_size, k] + substitute_token_ids: torch.Tensor, # [batch_size, k] + draft_token_ids: torch.Tensor, # [batch_size, k] + bonus_token_ids: torch.Tensor, # [batch_size] +) -> torch.Tensor: + """Format output. Returns a matrix of token ids. When + a token is rejected via sampling, all subsequent token ids are + set to -1 for the sequence. + + Args: + accepted: A boolean tensor indicating if the corresponding + draft token in draft_token_ids should be accepted or not. + substitute_token_ids: A tensor of token_ids that can be used + as substitutes for the draft token ids if the proposed token + is rejected. + draft_token_ids: A tensor of token ids speculated by the + draft model. + bonus_token_ids: Token ids to use as the bonus token if + all the draft tokens are accepted. + Returns: + A tensor containing the accepted token ids. The shape of the + tensor is [batch_size, k + num_bonus_tokens] + """ + # the return type of max is a tuple in msadapter + batch_size, k = substitute_token_ids.shape + assert self._num_bonus_tokens == 1 # ToDo: only support 1 mtp layer to optimize performance(almost 2ms) + + # Create an extended output tensor + output_with_bonus_tokens = -torch.ones( + (batch_size, k + self._num_bonus_tokens), + dtype=self.token_id_dtype, + device=accepted.device) + + # Fill in the first k columns of the output tensor using masks and data tensors. + output_with_bonus_tokens[:, :k] = draft_token_ids * accepted + substitute_token_ids * (~accepted) + + # Fill the last column. + # We check output directly as accepted may have True values inconsistentwith causal acceptance. + # Fill the recovered token ids. + output_with_bonus_tokens[:, -1:] = bonus_token_ids * accepted + (-1) * (~accepted) + + self.num_accepted_tokens += accepted.sum() + self.num_emitted_tokens += (output_with_bonus_tokens != -1).sum() + self.num_draft_tokens += batch_size * k + + return output_with_bonus_tokens + + +# msadapter does not support 'new_full', and the operator 'new_zero' only supports a list or a tuple as an input +from vllm.spec_decode.util import sampler_output_to_torch +def _merge_outputs( + self, + batch_size: int, + proposal_len: int, + maybe_sampler_output: Optional[List[SamplerOutput]], + proposal_lens: List[int], + nonzero_proposal_len_indices: List[int], + sampler_transposed: bool, +) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + """After speculations are produced, merge the speculation results with + the skipped sequences. + """ + if maybe_sampler_output is None: + # If no speculative tokens, the sampler output will be None. + # In this case we return empty proposals. 
+ proposal_tokens = torch.tensor(-1, + dtype=torch.long, + device=self._device).expand( + batch_size, proposal_len) + proposal_probs = torch.tensor(0, + dtype=torch.float32, + device=self._device).expand( + batch_size, proposal_len, + self._vocab_size) + proposal_lens_tensor = torch.tensor(0, + dtype=torch.long, + device=self._device).expand( + len(proposal_lens)) + return proposal_tokens, proposal_probs, proposal_lens_tensor + + sampler_output = maybe_sampler_output + proposal_tokens, proposal_probs, *_ = sampler_output_to_torch( + sampler_output, sampler_transposed) + + # Now, reformat the output GPU tensors such that each sequence has + # a proposal. the proposal can be empty, e.g. [-1, -1, -1] + + # entire_proposal_tokens = proposal_tokens.new_full( + # size=(batch_size, *proposal_tokens.shape[1:]), + # fill_value=-1, + # ) + entire_proposal_tokens = torch.full(size=(batch_size, *proposal_tokens.shape[1:]), fill_value=-1) + entire_proposal_tokens[nonzero_proposal_len_indices] = proposal_tokens + entire_proposal_probs = proposal_probs.new_zeros(( + batch_size, + *proposal_probs.shape[1:],) + ) + entire_proposal_probs[nonzero_proposal_len_indices] = proposal_probs + + proposal_tokens, proposal_probs = ( + entire_proposal_tokens, + entire_proposal_probs, + ) + + proposal_lens_tensor = torch.zeros(batch_size, + dtype=torch.long, + device=self._device) + proposal_lens_tensor[nonzero_proposal_len_indices] = proposal_len + + return proposal_tokens, proposal_probs, proposal_lens_tensor diff --git a/vllm_mindspore/worker/worker.py b/vllm_mindspore/worker/worker.py index d1e52a410a0ca439db5fafcf97c0481731e02523..8ce1bc91d511a43a83fd3c8b0e70d228b98b951b 100644 --- a/vllm_mindspore/worker/worker.py +++ b/vllm_mindspore/worker/worker.py @@ -1,6 +1,7 @@ #!/usr/bin/env python3 # encoding: utf-8 # Copyright 2025 Huawei Technologies Co., Ltd +# Copyright 2024 The vLLM team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -18,6 +19,7 @@ """Worker functions""" import gc import os +import math from typing import Tuple, Optional import torch @@ -32,113 +34,66 @@ from vllm.distributed import ( from vllm.logger import init_logger -from vllm_mindspore.utils import is_mindformers_model_backend +from vllm_mindspore.utils import get_valid_dtype +from vllm.model_executor import set_random_seed +from vllm.sequence import SequenceGroupMetadata +from vllm.sampling_params import SamplingParams logger = init_logger(__name__) -def _warm_up_model(self) -> None: - # Reset the seed to ensure that the random state is not affected by - # the model initialization and profiling. - from vllm.model_executor import set_random_seed - - # TODO(tronzhang): model compile here. - - set_random_seed(self.model_config.seed) - - -def determine_num_available_blocks(self) -> Tuple[int, int]: - """Profiles the peak memory usage of the model to determine how many - KV blocks may be allocated without OOMs. - - The engine will first conduct a profiling of the existing memory usage. - Then, it calculate the maximum possible number of GPU and CPU blocks - that can be allocated with the remaining free memory. 
+def _prepare_input_for_warmup(model_config, model_runner, cache_engine, is_prefill, is_mtp_model=False): + bs = 1 + seq_len = model_runner.scheduler_config.max_num_batched_tokens if is_prefill else 1 + dummy_data = model_runner.input_registry.dummy_data_for_profiling(model_config, seq_len, model_runner.mm_registry) + block_tables = [i for i in range(math.ceil(seq_len / cache_engine.block_size))] + seqs = [ + SequenceGroupMetadata( + request_id=str(idx), + is_prompt=is_prefill, + seq_data={idx: dummy_data.seq_data}, + sampling_params=SamplingParams(), + block_tables={idx: block_tables}, + lora_request=None, + multi_modal_data=None, + multi_modal_placeholders=None, + ) + for idx in range(bs) + ] - .. tip:: - You may limit the usage of GPU memory - by adjusting the `gpu_memory_utilization` parameter. - """ - from vllm.utils import GiB_bytes, memory_profiling + model_input = model_runner.prepare_model_input(seqs) + block_tables = model_input.attn_metadata.block_tables + if block_tables is not None and block_tables.numel() <= 0: + model_input.attn_metadata.block_tables = torch.zeros((1, 1), dtype=torch.int32) - # Profile the memory usage of the model and get the maximum number of - # cache blocks that can be allocated with the remaining free memory. - torch.cuda.empty_cache() - torch.cuda.reset_peak_memory_stats() + previous_hidden_states = None if not is_mtp_model else \ + torch.ones([bs, seq_len, model_config.get_hidden_size()], dtype=get_valid_dtype(model_config.dtype)) + return model_input, previous_hidden_states - _, total_gpu_memory = torch.cuda.mem_get_info() - if os.getenv("vLLM_MODEL_MEMORY_USE_GB"): - memory_use_for_model_run = int(os.environ["vLLM_MODEL_MEMORY_USE_GB"]) * 1024 * 1024 * 1024 - else: - # Execute a forward pass with dummy inputs to profile the memory usage - # of the model. - with memory_profiling( - baseline_memory_in_bytes=total_gpu_memory - self.init_gpu_memory, - weights_memory_in_bytes=self.model_runner.model_memory_usage, - ) as result: - self.model_runner.profile_run() - torch.cuda.synchronize() - - self._assert_memory_footprint_increased_during_profiling() - - memory_use_for_model_run = result.non_kv_cache_memory_in_bytes - - memory_for_current_instance = ( - total_gpu_memory * self.cache_config.gpu_memory_utilization - ) - available_kv_cache_memory = memory_for_current_instance - memory_use_for_model_run - - # Calculate the number of blocks that can be allocated with the - # profiled peak memory. - cache_block_size = self.get_cache_block_size_bytes() - if cache_block_size == 0: - num_gpu_blocks = 0 - num_cpu_blocks = 0 - else: - num_gpu_blocks = int(available_kv_cache_memory // cache_block_size) - num_cpu_blocks = int(self.cache_config.swap_space_bytes // cache_block_size) - num_gpu_blocks = max(num_gpu_blocks, 0) - num_cpu_blocks = max(num_cpu_blocks, 0) - - if os.getenv("vLLM_MODEL_MEMORY_USE_GB"): - msg = ( - f"The current vLLM instance can use " - "total_gpu_memory " - f"({(total_gpu_memory / GiB_bytes):.2f}GiB)" - " x gpu_memory_utilization " - f"({self.cache_config.gpu_memory_utilization:.2f})" - f" = {(memory_for_current_instance / GiB_bytes):.2f}GiB\n" - "set model use memory " - f"{(memory_use_for_model_run):.2f}GiB;" - " the rest of the memory reserved for KV Cache is " - f"{(available_kv_cache_memory / GiB_bytes):.2f}GiB." - ) +def _warm_up_model(self) -> None: + # cache_engine is a list with length equal to the size of pipeline-parallel, and only pp=1 is supported. 
+ kv_cache = self.cache_engine[0].gpu_cache + is_mtp_model = self.speculative_config is not None and self.model_config.hf_config.model_type == "deepseek_mtp" + if is_mtp_model: + # prefill mtp model + model_input, previous_hidden_states = _prepare_input_for_warmup(self.model_config, self.model_runner, + self.cache_engine[0], True, is_mtp_model) + self.model_runner.execute_model(model_input, kv_cache, None, previous_hidden_states=previous_hidden_states) + + # warmup for decode + if self.vllm_config.scheduler_config.is_multi_step: + model_input, _ = _prepare_input_for_warmup(self.model_config, self.model_runner._base_model_runner, + self.cache_engine[0], False) + self.model_runner._base_model_runner.execute_model(model_input, kv_cache, None) else: - msg = ( - f"Memory profiling takes {result.profile_time:.2f} seconds\n" - "the current vLLM instance can use " - "total_gpu_memory " - f"({(total_gpu_memory / GiB_bytes):.2f}GiB)" - " x gpu_memory_utilization " - f"({self.cache_config.gpu_memory_utilization:.2f})" - f" = {(memory_for_current_instance / GiB_bytes):.2f}GiB\n" - "model weights take " - f"{(result.weights_memory_in_bytes / GiB_bytes):.2f}GiB;" - " non_torch_memory takes " - f"{(result.non_torch_increase_in_bytes / GiB_bytes):.2f}GiB;" - " PyTorch activation peak memory takes " - f"{(result.torch_peak_increase_in_bytes / GiB_bytes):.2f}GiB;" - " the rest of the memory reserved for KV Cache is " - f"{(available_kv_cache_memory / GiB_bytes):.2f}GiB." - ) + model_input, previous_hidden_states = _prepare_input_for_warmup(self.model_config, self.model_runner, + self.cache_engine[0], False, is_mtp_model) + self.model_runner.execute_model(model_input, kv_cache, None, previous_hidden_states=previous_hidden_states) - logger.info(msg) + torch.cuda.synchronize() - # Final cleanup - if self.model_runner.lora_manager: - self.model_runner.remove_all_loras() - gc.collect() - - return num_gpu_blocks, num_cpu_blocks + # Reset the seed to ensure that the random state is not affected by + # the model initialization and profiling. + set_random_seed(self.model_config.seed)
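
To close, a toy-sized illustration (plain PyTorch, not the shipped sampler) of the masking arithmetic used by the patched _create_output above, with one draft token per sequence (k = 1) to match the single-MTP-layer assertion: accepted drafts keep their token ids, rejected drafts fall back to the recovery token, and the bonus token is emitted only when the draft was accepted.

# Toy example of the accept/reject masking in _create_output.
import torch

accepted = torch.tensor([[True], [False]])        # batch of 2 sequences, k = 1
draft_token_ids = torch.tensor([[7], [9]])        # tokens proposed by the draft model
substitute_token_ids = torch.tensor([[5], [5]])   # recovery tokens used on rejection
bonus_token_ids = torch.tensor([[3], [4]])        # target-model bonus tokens

out = -torch.ones((2, 2), dtype=torch.long)
out[:, :1] = draft_token_ids * accepted + substitute_token_ids * (~accepted)
out[:, -1:] = bonus_token_ids * accepted + (-1) * (~accepted)
print(out)  # row 0 accepted its draft -> [7, 3]; row 1 rejected it -> [5, -1]
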