From b3f77dab4dbc3331d0dba5fe99e442ece5a46e62 Mon Sep 17 00:00:00 2001 From: zlq2020 Date: Sun, 27 Apr 2025 14:57:13 +0800 Subject: [PATCH 01/55] update version 0.1->0.2.0 --- vllm_mindspore/version.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm_mindspore/version.txt b/vllm_mindspore/version.txt index ceab6e11..0ea3a944 100644 --- a/vllm_mindspore/version.txt +++ b/vllm_mindspore/version.txt @@ -1 +1 @@ -0.1 \ No newline at end of file +0.2.0 -- Gitee From beb8dece2912ee73eb79cc3eeec8ba6e909542da Mon Sep 17 00:00:00 2001 From: moran Date: Sun, 27 Apr 2025 14:32:21 +0800 Subject: [PATCH 02/55] fix codecheck --- codecheck_toolkits/vllm_codecheck.sh | 46 +++++++++---------- .../pyproject.toml => pyproject.toml | 32 ++++++------- 2 files changed, 36 insertions(+), 42 deletions(-) rename codecheck_toolkits/pyproject.toml => pyproject.toml (81%) diff --git a/codecheck_toolkits/vllm_codecheck.sh b/codecheck_toolkits/vllm_codecheck.sh index e67c7372..928c70bd 100644 --- a/codecheck_toolkits/vllm_codecheck.sh +++ b/codecheck_toolkits/vllm_codecheck.sh @@ -1,27 +1,26 @@ -pip install -r requirements-lint.txt +pip install -r codecheck_toolkits/requirements-lint.txt RET_FLAG=0 -cd .. -# yapf formats code automatically +# yapf check -MERGEBASE="$(git merge-base origin/master HEAD)" -if ! git diff --diff-filter=ACM --quiet --exit-code "$MERGEBASE" -- '*.py' '*.pyi' &> /dev/null; then - git diff --name-only --diff-filter=ACM "$MERGEBASE" -- '*.py' '*.pyi' | xargs -P 5 \ - yapf --in-place --recursive --parallel --exclude build/ +MERGEBASE="$(git merge-base origin/develop HEAD)" +if ! git diff --cached --diff-filter=ACM --quiet --exit-code "$MERGEBASE" -- '*.py' '*.pyi' &> /dev/null; then + git diff --cached --name-only --diff-filter=ACM "$MERGEBASE" -- '*.py' '*.pyi' | xargs -P 5 \ + yapf --diff --recursive --parallel --exclude tests/ fi if [[ $? -ne 0 ]]; then - echo "yapf run failed." + echo "yapf check failed." RET_FLAG=1 else - echo "yapf run success." + echo "yapf check success." fi # codespell check -if ! git diff --diff-filter=ACM --quiet --exit-code "$MERGEBASE" -- '*.py' '*.pyi' &> /dev/null; then - git diff --name-only --diff-filter=ACM "$MERGEBASE" -- '*.py' '*.pyi' | xargs -P 5 \ - codespell --skip ./vllm_mindspore/ops/ascendc/* +if ! git diff --cached --diff-filter=ACM --quiet --exit-code "$MERGEBASE" -- '*.py' '*.pyi' &> /dev/null; then + git diff --cached --name-only --diff-filter=ACM "$MERGEBASE" -- '*.py' '*.pyi' | xargs \ + codespell --skip "./vllm_mindspore/ops/ascendc/*" fi if [[ $? -ne 0 ]]; then echo "codespell check failed." @@ -31,8 +30,9 @@ else fi # ruff check -if ! git diff --diff-filter=ACM --quiet --exit-code "$MERGEBASE" -- '*.py' '*.pyi' &> /dev/null; then - git diff --name-only --diff-filter=ACM "$MERGEBASE" -- '*.py' '*.pyi' | xargs -P 5 \ +if ! git diff --cached --diff-filter=ACM --quiet --exit-code "$MERGEBASE" -- '*.py' '*.pyi' &> /dev/null; then + echo "ruff check is running..." + git diff --cached --name-only --diff-filter=ACM "$MERGEBASE" -- '*.py' '*.pyi' | xargs \ ruff check fi if [[ $? -ne 0 ]]; then @@ -42,24 +42,24 @@ else echo "ruff check success." fi -# isort fixed -if ! git diff --diff-filter=ACM --quiet --exit-code "$MERGEBASE" -- '*.py' '*.pyi' &> /dev/null; then - git diff --name-only --diff-filter=ACM "$MERGEBASE" -- '*.py' '*.pyi' | xargs -P 5 \ - isort +# isort check +if ! 
git diff --cached --diff-filter=ACM --quiet --exit-code "$MERGEBASE" -- '*.py' '*.pyi' &> /dev/null; then + git diff --cached --name-only --diff-filter=ACM "$MERGEBASE" -- '*.py' '*.pyi' | xargs \ + isort --check-only fi if [[ $? -ne 0 ]]; then - echo "isort fixed failed." + echo "isort check failed." RET_FLAG=1 else - echo "isort fixed success." + echo "isort check success." fi # mypy check type PYTHON_VERSION=$(python -c 'import sys; print(f"{sys.version_info.major}.{sys.version_info.minor}")') -if ! git diff --diff-filter=ACM --quiet --exit-code "$MERGEBASE" -- '*.py' '*.pyi' &> /dev/null; then - git diff --name-only --diff-filter=ACM "$MERGEBASE" -- '*.py' '*.pyi' | xargs -P 5 \ +if ! git diff --cached --diff-filter=ACM --quiet --exit-code "$MERGEBASE" -- '*.py' '*.pyi' &> /dev/null; then + git diff --cached --name-only --diff-filter=ACM "$MERGEBASE" -- '*.py' '*.pyi' | xargs \ mypy --follow-imports skip --python-version "${PYTHON_VERSION}" "$@" fi if [[ $? -ne 0 ]]; then @@ -69,4 +69,4 @@ else echo "mypy check success." fi -cd - || exit $RET_FLAG +exit $RET_FLAG diff --git a/codecheck_toolkits/pyproject.toml b/pyproject.toml similarity index 81% rename from codecheck_toolkits/pyproject.toml rename to pyproject.toml index 9a3c52de..65e258dc 100644 --- a/codecheck_toolkits/pyproject.toml +++ b/pyproject.toml @@ -18,15 +18,18 @@ build-backend = "setuptools.build_meta" [tool.ruff] # Allow lines to be as long as 80. line-length = 80 + exclude = [ # External file, leaving license intact "vllm_mindspore/__init__.py", - "tests/*" + "tests/*", + "setup.py" ] [tool.ruff.lint.per-file-ignores] "vllm_mindspore/version.txt" = ["F401"] "vllm_mindspore/_version.txt" = ["ALL"] +"setup.py" = ["ALL"] [tool.ruff.lint] select = [ @@ -64,28 +67,19 @@ follow_imports = "silent" # After fixing type errors resulting from follow_imports: "skip" -> "silent", # move the directory here and remove it from tools/mypy.sh -#files = [ -# "vllm/*.py", -# "vllm/adapter_commons", -# "vllm/assets", -# "vllm/entrypoints", -# "vllm/core", -# "vllm/inputs", -# "vllm/logging_utils", -# "vllm/multimodal", -# "vllm/platforms", -# "vllm/transformers_utils", -# "vllm/triton_utils", -# "vllm/usage", -#] -files= ["vllm_mindspore/*.py",] -# TODO(woosuk): Include the code from Megatron and HuggingFace. +files= ["vllm_mindspore/"] exclude = [ "vllm_mindspore/model_executor/parallel_utils/|vllm_mindspore/model_executor/models/", # Ignore triton kernels in ops. 
- 'vllm_mindspore/attention/ops/.*\.py$' + 'vllm_mindspore/attention/ops/.*\.py$', + 'setup.py', + 'tests/' ] +[[tool.mypy.overrides]] +module = "setup" +ignore_errors = true + [tool.codespell] ignore-words-list = "dout, te, indicies, subtile, ElementE, CANN" skip = "./tests/models/fixtures,./tests/prompts,./benchmarks/sonnet.txt,./tests/lora/data,./build" @@ -94,7 +88,7 @@ skip = "./tests/models/fixtures,./tests/prompts,./benchmarks/sonnet.txt,./tests/ use_parentheses = true skip_gitignore = true -skip_glob = ["tests/*", "vllm_mindspore/ops/*"] +skip_glob = ["tests/*", "vllm_mindspore/ops/*", "setup.py"] skip = ["vllm_mindspore/__init__.py"] [tool.pytest.ini_options] -- Gitee From aced8789e16b427ddbef81b83287889200e8fcfe Mon Sep 17 00:00:00 2001 From: zhanzhan1 Date: Mon, 28 Apr 2025 22:08:58 +0800 Subject: [PATCH 03/55] Bugfix for QWen --- .../model_executor/layers/vocab_parallel_embedding.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/vllm_mindspore/model_executor/layers/vocab_parallel_embedding.py b/vllm_mindspore/model_executor/layers/vocab_parallel_embedding.py index 81ebbe11..cec40d8b 100644 --- a/vllm_mindspore/model_executor/layers/vocab_parallel_embedding.py +++ b/vllm_mindspore/model_executor/layers/vocab_parallel_embedding.py @@ -87,12 +87,12 @@ def get_masked_input_and_mask( ) -> Tuple[Tensor, Tensor]: displaced_x = mint.sub(input_, org_vocab_start_index) down_truncated_x = mint.nn.functional.relu(displaced_x) - truncated_x = mint.minimum(down_truncated_x, org_vocab_end_index) + truncated_x = mint.minimum(down_truncated_x, (org_vocab_end_index - org_vocab_start_index - 1)) org_vocab_mask = mint.eq(displaced_x, truncated_x) displaced_x = mint.sub(input_, added_vocab_start_index) down_truncated_x = mint.nn.functional.relu(displaced_x) - truncated_x = mint.minimum(down_truncated_x, added_vocab_end_index) + truncated_x = mint.minimum(down_truncated_x, (added_vocab_end_index - added_vocab_start_index - 1)) added_vocab_mask = mint.eq(displaced_x, truncated_x) added_offset = added_vocab_start_index - ( org_vocab_end_index - org_vocab_start_index) - num_org_vocab_padding @@ -197,8 +197,8 @@ class VocabParallelEmbedding(nn.Cell): ): super().__init__() # Keep the input dimensions. 
- tp_rank = get_tensor_model_parallel_rank() # 获取tp并行的rank - self.tp_size = get_tensor_model_parallel_world_size() # 获取tp并行的world_size + self.tp_rank = get_tensor_model_parallel_rank() + self.tp_size = get_tensor_model_parallel_world_size() self.num_embeddings = num_embeddings self.padding_size = padding_size self.org_vocab_size = org_num_embeddings or num_embeddings @@ -216,7 +216,7 @@ class VocabParallelEmbedding(nn.Cell): self.org_vocab_size_padded, self.num_embeddings, self.org_vocab_size, - tp_rank, + self.tp_rank, self.tp_size, ) -- Gitee From 6ec2466a94a42da7353a60f92b38b6f12199793f Mon Sep 17 00:00:00 2001 From: yyyyrf Date: Mon, 28 Apr 2025 15:15:40 +0800 Subject: [PATCH 04/55] [feature] support high precision gptq algo slpit online --- .../model_executor/models/mf_models/deepseek_v3.py | 11 +++++++++-- .../models/mf_models/deepseekv3_weight_processor.py | 13 +++++++++---- 2 files changed, 18 insertions(+), 6 deletions(-) diff --git a/vllm_mindspore/model_executor/models/mf_models/deepseek_v3.py b/vllm_mindspore/model_executor/models/mf_models/deepseek_v3.py index e0ede946..e7cda00c 100644 --- a/vllm_mindspore/model_executor/models/mf_models/deepseek_v3.py +++ b/vllm_mindspore/model_executor/models/mf_models/deepseek_v3.py @@ -184,8 +184,11 @@ class DeepseekV3ForCausalLM(MfModelBase): cfg = PTQConfig(mode=quant_mode, backend=BackendTarget.ASCEND, weight_quant_dtype=msdtype.qint4x2, algo_args=gptq_config, act_quant_dtype=None, precision_recovery=PrecisionRecovery.GPTQ, weight_quant_granularity=QuantGranularity.PER_GROUP, opname_blacklist=['lm_head', 'lkv2kv'], - group_size=128) - layer_policies = OrderedDict() + group_size=64) + w2_config = PTQConfig(mode=quant_mode, backend=BackendTarget.ASCEND, weight_quant_dtype=msdtype.int8, + act_quant_dtype=msdtype.int8, outliers_suppression=OutliersSuppressionType.SMOOTH) + layer_policies = OrderedDict({r'.*\.feed_forward\.w2.*': w2_config, + r'.*\.shared_experts.w2.*': w2_config}) elif quant_type.lower() == 'smoothquant': cfg = PTQConfig(mode=quant_mode, backend=BackendTarget.ASCEND, weight_quant_dtype=msdtype.int8, act_quant_dtype=msdtype.int8, outliers_suppression=OutliersSuppressionType.SMOOTH, @@ -217,5 +220,9 @@ class DeepseekV3ForCausalLM(MfModelBase): # pylint: disable=protected-access ptq._config.aclnn_quant_list = ["routed_experts.ffn.w_gate_hidden", "routed_experts.ffn.w1", "routed_experts.ffn.w3"] + if 'gptq-pergroup' in quant_type.lower(): + # pylint: disable=protected-access + ptq.layer_policies[r'.*\.feed_forward\.w2.*'].aclnn_quant_list = ["w2"] + ptq.layer_policies[r'.*\.shared_experts.w2.*'].aclnn_quant_list = ["w2"] ptq.decoder_layer_types.append(DeepseekV3DecodeLayer) return ptq diff --git a/vllm_mindspore/model_executor/models/mf_models/deepseekv3_weight_processor.py b/vllm_mindspore/model_executor/models/mf_models/deepseekv3_weight_processor.py index 642897ed..de7d70d0 100644 --- a/vllm_mindspore/model_executor/models/mf_models/deepseekv3_weight_processor.py +++ b/vllm_mindspore/model_executor/models/mf_models/deepseekv3_weight_processor.py @@ -1263,7 +1263,9 @@ class DeepseekV3WeightProcessor(BaseWeightProcessor): else: value, _ = self.get_safetensor_from_file(param_name, src_hf_dir, hf_weight_map) - if "wo._layer.matmul.quant_bias" in param_name and get_tensor_model_parallel_rank() != 0: + quant_bias_set_zero = ["wo._layer.matmul.quant_bias", "w2._layer.matmul.quant_bias"] + if any([name in param_name for name in quant_bias_set_zero]) and \ + get_tensor_model_parallel_rank() != 0: value.fill(0) return value @@ 
-1365,11 +1367,13 @@ class DeepseekV3WeightProcessor(BaseWeightProcessor): value, is_int4 = self.get_safetensor_from_file(param_name, src_hf_dir, hf_weight_map, is_split_param=True, split_axis=1) - elif any([name in param_name for name in [".feed_forward.w2.", ".wo.", - "shared_experts.w2"]]): + elif any([name in param_name for name in [".wo."]]): value, is_int4 = self.get_safetensor_from_file(param_name, src_hf_dir, hf_weight_map, is_split_param=True, split_axis=0) + elif any([name in param_name for name in [".feed_forward.w2.","shared_experts.w2"]]): + value = self.infer_smooth_quant_row_linear_split(param_name, src_hf_dir, hf_weight_map) + is_int4 = False elif ".routed_experts.ffn.w_gate_hidden." in param_name: value, is_int4 = self.get_safetensor_from_file(param_name, src_hf_dir, hf_weight_map) value_list = [] @@ -1430,7 +1434,8 @@ class DeepseekV3WeightProcessor(BaseWeightProcessor): quantization_config = self.config.model.model_config.quantization_config quant_method = quantization_config.quant_method if quantization_config else None - if not quant_method or (quant_method != "gptq-pergroup" and quant_method != "smoothquant") and \ + support_quant_method = ["gptq-pergroup", "smoothquant"] + if not quant_method or (quant_method not in support_quant_method) and \ not is_mtp_model: self.infer_convert_outer_weight(src_hf_dir, hf_weight_map) -- Gitee From 7691430684c5d029af551cd676f09d7426a5ac73 Mon Sep 17 00:00:00 2001 From: dayschan Date: Sat, 10 May 2025 18:57:53 +0800 Subject: [PATCH 05/55] default building custom ops --- setup.py | 22 ++-- tests/st/python/test_custom_advstepflash.py | 117 +++++++++++++++++++ vllm_mindspore/attention/backends/ms_attn.py | 4 +- 3 files changed, 128 insertions(+), 15 deletions(-) create mode 100644 tests/st/python/test_custom_advstepflash.py diff --git a/setup.py b/setup.py index 8e2154b3..5296b18b 100644 --- a/setup.py +++ b/setup.py @@ -101,12 +101,11 @@ version = (Path("vllm_mindspore") / "version.txt").read_text() def _get_ascend_home_path(): return os.environ.get("ASCEND_HOME_PATH", "/usr/local/Ascend/ascend-toolkit/latest") -def _get_ascend_env_path(check_exists=True): - env_script_path = os.path.join(_get_ascend_home_path(), "bin", "setenv.bash") - if check_exists and not os.path.exists(env_script_path): - warnings.warn(f"The file '{env_script_path}' is not found, " - "please make sure env variable 'ASCEND_HOME_PATH' is set correctly.") - return None +def _get_ascend_env_path(): + env_script_path = os.path.realpath(os.path.join(_get_ascend_home_path(), "..", "set_env.sh")) + if not os.path.exists(env_script_path): + raise ValueError(f"The file '{env_script_path}' is not found, " + "please make sure environment variable 'ASCEND_HOME_PATH' is set correctly.") return env_script_path class CustomBuildExt(build_ext): @@ -128,7 +127,7 @@ class CustomBuildExt(build_ext): os.makedirs(BUILD_OPS_DIR, exist_ok=True) ascend_home_path = _get_ascend_home_path() - env_script_path = _get_ascend_env_path(False) + env_script_path = _get_ascend_env_path() build_extension_dir = os.path.join(BUILD_OPS_DIR, "kernel_meta", ext_name) # Combine all cmake commands into one string cmake_cmd = ( @@ -176,12 +175,9 @@ package_data = { def _get_ext_modules(): ext_modules = [] - # Currently, the CI environment does not support the compilation of custom operators. - # As a temporary solution, this is controlled via an environment variable. - # Once the CI environment adds support for custom operator compilation, - # this should be updated to enable compilation by default. 
-    if os.getenv("vLLM_USE_NPU_ADV_STEP_FLASH_OP", "off") == "on" and _get_ascend_env_path() is not None:
-        ext_modules.append(Extension("vllm_mindspore.npu_ops", sources=[]))  # sources are specified in CMakeLists.txt
+    if os.path.exists(_get_ascend_home_path()):
+        # sources are specified in CMakeLists.txt
+        ext_modules.append(Extension("vllm_mindspore.npu_ops", sources=[]))
     return ext_modules
 
 setup(
diff --git a/tests/st/python/test_custom_advstepflash.py b/tests/st/python/test_custom_advstepflash.py
new file mode 100644
index 00000000..4968ee46
--- /dev/null
+++ b/tests/st/python/test_custom_advstepflash.py
@@ -0,0 +1,117 @@
+# Copyright 2024 The vLLM team.
+# Copyright 2024 Microsoft and the HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+"""test case for custom op adv_step_flash"""
+
+import time
+import pytest
+from vllm_mindspore import npu_ops
+import numpy as np
+import mindspore as ms
+import torch
+
+def benchmark_advance_step_op(sampled_token_ids,
+                              input_tokens,
+                              input_positions,
+                              seq_lens_tensor,
+                              num_queries,
+                              block_size,
+                              block_tables,
+                              slot_mapping):
+    # update input_tokens
+    sampled_token_ids_list = sampled_token_ids[:num_queries].squeeze(-1)
+    input_tokens[:num_queries] = sampled_token_ids_list
+
+    # get seq_lens and input_positions
+    seq_lens = seq_lens_tensor[:num_queries]
+    next_seq_lens = seq_lens + 1
+    next_input_pos = next_seq_lens - 1
+
+    # update seq_lens and input_positions
+    seq_lens_tensor[:num_queries] = next_seq_lens
+    input_positions[:num_queries] = next_input_pos
+
+    block_idx = next_input_pos // block_size
+    block_offset = next_input_pos % block_size
+
+    current_block_table = block_tables.gather(1, block_idx.unsqueeze(-1)).squeeze(-1)
+    slot_num = current_block_table * block_size + block_offset
+
+    # update slot_mapping
+    slot_mapping[:num_queries] = slot_num
+
+def gendata(seed, num_seqs, block_size, block_num, make_tensor):
+    """generate inputs"""
+    np.random.seed(seed)
+    sampled_token_ids = np.random.randint(65536, size=(num_seqs,), dtype=np.int64)
+    input_tokens = np.random.randint(100, size=(num_seqs,), dtype=np.int64)  # out
+    input_positions = np.random.randint(100, size=(num_seqs,), dtype=np.int64)  # out
+    seq_lens_tensor = np.random.randint(block_size * block_num - 1, size=(num_seqs,), dtype=np.int64)  # inplace
+    block_tables = np.random.randint(1024, size=(num_seqs, block_num), dtype=np.int64)
+    slot_mapping = np.random.randint(100, size=(num_seqs,), dtype=np.int64)  # out
+    return (make_tensor(sampled_token_ids), \
+            make_tensor(input_tokens), \
+            make_tensor(input_positions), \
+            make_tensor(seq_lens_tensor), \
+            make_tensor(block_tables), \
+            make_tensor(slot_mapping))
+
+
+class TestCustomAdvStepFlash:
+    """
+    Test Custom op AdvStepFlash.
+ """ + @pytest.mark.level0 + @pytest.mark.platform_arm_ascend910b_training + @pytest.mark.env_single + def test_advstepflash(self): + """ + test case + """ + seed = int(time.time() * 1000) % 1000000009 + num_seqs = 256 + block_size = 32 + block_num = 4 + num_queries = num_seqs # no padding + print("test seed:", seed, flush=True) + sampled_token_ids1, input_tokens1, input_positions1, seq_lens_tensor1, block_tables1, slot_mapping1 = \ + gendata(seed, num_seqs, block_size, block_num, torch.Tensor) + benchmark_advance_step_op(sampled_token_ids1, + input_tokens1, + input_positions1, + seq_lens_tensor1, + num_queries, + block_size, + block_tables1, + slot_mapping1) + + sampled_token_ids2, input_tokens2, input_positions2, seq_lens_tensor2, block_tables2, slot_mapping2 = \ + gendata(seed, num_seqs, block_size, block_num, ms.Tensor) + npu_ops.adv_step_flash(num_seqs=num_seqs, + num_queries=num_queries, + block_size=block_size, + input_tokens=input_tokens2, + sampled_token_ids=sampled_token_ids2, + input_positions=input_positions2, + seq_lens=seq_lens_tensor2, + slot_mapping=slot_mapping2, + block_tables=block_tables2) + + assert np.allclose(sampled_token_ids1, sampled_token_ids2.asnumpy()) + assert np.allclose(input_tokens1, input_tokens2.asnumpy()) + assert np.allclose(input_positions1, input_positions2.asnumpy()) + assert np.allclose(seq_lens_tensor1, seq_lens_tensor2.asnumpy()) + assert np.allclose(block_tables1, block_tables2.asnumpy()) + assert np.allclose(slot_mapping1, slot_mapping2.asnumpy()) diff --git a/vllm_mindspore/attention/backends/ms_attn.py b/vllm_mindspore/attention/backends/ms_attn.py index 558882cd..d6123b0a 100644 --- a/vllm_mindspore/attention/backends/ms_attn.py +++ b/vllm_mindspore/attention/backends/ms_attn.py @@ -312,8 +312,8 @@ class MSAttentionMetadata(AttentionMetadata, PagedAttentionMetadata): self.seq_lens[i] += 1 self.max_decode_seq_len = max(self.seq_lens) - # default use python op - if os.getenv("vLLM_USE_NPU_ADV_STEP_FLASH_OP", "off") == "on": + # default use ascendc op + if os.getenv("vLLM_USE_NPU_ADV_STEP_FLASH_OP", "on") != "off": from vllm_mindspore import npu_ops npu_ops.adv_step_flash(num_seqs=num_seqs, num_queries=num_queries, -- Gitee From 6e0a970a7675f1cb494be6f2ef4544cf75ef5d0d Mon Sep 17 00:00:00 2001 From: lvhaoyu Date: Thu, 20 Mar 2025 19:47:55 +0800 Subject: [PATCH 06/55] update v1 --- vllm_mindspore/__init__.py | 80 ++- vllm_mindspore/compilation/__init__.py | 0 vllm_mindspore/compilation/inductor_pass.py | 73 ++ vllm_mindspore/config.py | 138 +++- vllm_mindspore/engine/arg_utils.py | 222 +++++++ .../model_executor/layers/sampler.py | 4 + .../model_executor/model_loader/utils.py | 2 +- .../models/mf_models/deepseek_v3.py | 10 +- .../models/mf_models/mf_model_base.py | 147 ++-- .../model_executor/models/mf_models/qwen2.py | 11 +- .../model_executor/models/model_base.py | 54 +- .../model_executor/models/registry.py | 9 +- vllm_mindspore/platforms/ascend.py | 37 +- vllm_mindspore/scripts.py | 3 +- vllm_mindspore/utils.py | 4 +- vllm_mindspore/v1/__init__.py | 0 vllm_mindspore/v1/attention/__init__.py | 0 .../v1/attention/backends/__init__.py | 0 .../v1/attention/backends/flash_attn.py | 237 +++++++ vllm_mindspore/v1/sample/__init__.py | 0 vllm_mindspore/v1/sample/ops/__init__.py | 0 vllm_mindspore/v1/sample/ops/penalties.py | 21 + .../v1/sample/ops/topk_topp_sampler.py | 98 +++ vllm_mindspore/v1/sample/rejection_sampler.py | 627 ++++++++++++++++++ vllm_mindspore/v1/sample/sampler.py | 10 + vllm_mindspore/v1/spec_decode/__init__.py | 0 
vllm_mindspore/v1/spec_decode/eagle.py | 258 +++++++ vllm_mindspore/v1/utils.py | 12 + vllm_mindspore/v1/worker/__init__.py | 0 vllm_mindspore/v1/worker/block_table.py | 93 +++ vllm_mindspore/v1/worker/gpu_input_batch.py | 85 +++ vllm_mindspore/v1/worker/gpu_model_runner.py | 420 ++++++++++++ 32 files changed, 2582 insertions(+), 73 deletions(-) create mode 100644 vllm_mindspore/compilation/__init__.py create mode 100644 vllm_mindspore/compilation/inductor_pass.py create mode 100644 vllm_mindspore/engine/arg_utils.py create mode 100644 vllm_mindspore/v1/__init__.py create mode 100644 vllm_mindspore/v1/attention/__init__.py create mode 100644 vllm_mindspore/v1/attention/backends/__init__.py create mode 100644 vllm_mindspore/v1/attention/backends/flash_attn.py create mode 100644 vllm_mindspore/v1/sample/__init__.py create mode 100644 vllm_mindspore/v1/sample/ops/__init__.py create mode 100644 vllm_mindspore/v1/sample/ops/penalties.py create mode 100644 vllm_mindspore/v1/sample/ops/topk_topp_sampler.py create mode 100644 vllm_mindspore/v1/sample/rejection_sampler.py create mode 100644 vllm_mindspore/v1/sample/sampler.py create mode 100644 vllm_mindspore/v1/spec_decode/__init__.py create mode 100644 vllm_mindspore/v1/spec_decode/eagle.py create mode 100644 vllm_mindspore/v1/utils.py create mode 100644 vllm_mindspore/v1/worker/__init__.py create mode 100644 vllm_mindspore/v1/worker/block_table.py create mode 100644 vllm_mindspore/v1/worker/gpu_input_batch.py create mode 100644 vllm_mindspore/v1/worker/gpu_model_runner.py diff --git a/vllm_mindspore/__init__.py b/vllm_mindspore/__init__.py index 47a9e4d5..a943c92f 100644 --- a/vllm_mindspore/__init__.py +++ b/vllm_mindspore/__init__.py @@ -29,6 +29,10 @@ from vllm_mindspore.scripts import env_setup env_setup() +# should be place on the top of the file. 
+from vllm_mindspore.compilation import inductor_pass as ms_inductor_pass +sys.modules.update({"vllm.compilation.inductor_pass": ms_inductor_pass}) + from vllm_mindspore.platforms.ascend import AscendPlatform ascend_platform = AscendPlatform() @@ -45,6 +49,13 @@ import vllm.utils vllm.utils.current_platform = ascend_platform +import vllm.attention.selector +vllm.attention.selector.current_platform = ascend_platform + +import vllm.engine.arg_utils +from vllm_mindspore.engine.arg_utils import _is_v1_supported_oracle +vllm.engine.arg_utils.EngineArgs._is_v1_supported_oracle = _is_v1_supported_oracle + from vllm_mindspore.utils import ( direct_register_custom_op, make_tensor_with_pad, @@ -71,6 +82,7 @@ from vllm_mindspore.model_executor.models.registry import ( _SUBPROCESS_COMMAND, ) + vllm.config.ModelRegistry = MindSporeModelRegistry import vllm.model_executor @@ -160,6 +172,11 @@ from vllm.executor.multiproc_worker_utils import get_mp_context vllm.executor.multiproc_worker_utils.get_mp_context = ms_get_mp_context +import vllm.v1.executor.multiproc_executor +vllm.v1.executor.multiproc_executor.get_mp_context = ms_get_mp_context +import vllm.v1.utils +vllm.v1.utils.get_mp_context = ms_get_mp_context + from vllm_mindspore.executor.ray_gpu_executor import ( ms_init_workers_ray, initialize_ray_cluster, @@ -179,11 +196,14 @@ vllm.engine.llm_engine.initialize_ray_cluster = initialize_ray_cluster vllm.engine.async_llm_engine.initialize_ray_cluster = initialize_ray_cluster -from .config import _verify_quantization, _verify_args, vllm_config_post_init +from .config import _verify_quantization, _verify_args, vllm_config_post_init, model_post_init, \ + _get_and_verify_dtype vllm.config.ModelConfig._verify_quantization = _verify_quantization vllm.config.VllmConfig.__post_init__ = vllm_config_post_init vllm.config.SchedulerConfig._verify_args = _verify_args +vllm.config.CompilationConfig.model_post_init = model_post_init +vllm.config._get_and_verify_dtype = _get_and_verify_dtype from .utils import update_modules from vllm_mindspore.attention.backends import ms_attn @@ -213,6 +233,64 @@ RejectionSampler._smallest_positive_value = _smallest_positive_value RejectionSampler._smallest_positive_value.__set_name__(RejectionSampler, '_smallest_positive_value') vllm.model_executor.layers.rejection_sampler._multinomial = _multinomial +from vllm_mindspore.v1.sample import rejection_sampler +update_modules("vllm.v1.sample.rejection_sampler", rejection_sampler) + +from vllm_mindspore.v1.spec_decode import eagle +update_modules("vllm.v1.spec_decode.eagle", eagle) + +from vllm_mindspore.v1.attention.backends import flash_attn +import vllm.v1.attention.backends +sys.modules['vllm.v1.attention.backends.flash_attn'] = flash_attn +import vllm.v1.attention.backends.flash_attn + +import vllm.v1.worker.gpu_model_runner + +from vllm_mindspore.v1.worker.gpu_model_runner import _prepare_inputs +vllm.v1.worker.gpu_model_runner.GPUModelRunner._prepare_inputs = _prepare_inputs + +from vllm_mindspore.v1.worker.gpu_model_runner import _update_states +vllm.v1.worker.gpu_model_runner.GPUModelRunner._update_states = _update_states + +from vllm_mindspore.v1.worker.gpu_model_runner import initialize_kv_cache +vllm.v1.worker.gpu_model_runner.GPUModelRunner.initialize_kv_cache = initialize_kv_cache + +import vllm.v1.worker.block_table +from vllm_mindspore.v1.worker.block_table import BlockTable +vllm.v1.worker.block_table.BlockTable = BlockTable +vllm.v1.worker.gpu_input_batch.BlockTable = BlockTable + +import vllm.v1.worker.gpu_input_batch 
+from vllm_mindspore.v1.worker.gpu_input_batch import _make_sampling_metadata, _make_prompt_token_ids_tensor +vllm.v1.worker.gpu_input_batch.InputBatch._make_sampling_metadata = _make_sampling_metadata +vllm.v1.worker.gpu_model_runner.InputBatch._make_sampling_metadata = _make_sampling_metadata +vllm.v1.worker.gpu_input_batch.InputBatch._make_prompt_token_ids_tensor = _make_prompt_token_ids_tensor +vllm.v1.worker.gpu_model_runner.InputBatch._make_prompt_token_ids_tensor = _make_prompt_token_ids_tensor + +from vllm.v1.worker.gpu_worker import Worker + +Worker.__init__ = wrapper_worker_init(Worker.__init__) +Worker.init_device = wrapper_worker_init_device(Worker.init_device) + + +import vllm.v1.utils +from vllm_mindspore.v1.utils import copy_slice +vllm.v1.utils.copy_slice = copy_slice +vllm.v1.worker.gpu_input_batch.copy_slice = copy_slice + +from vllm_mindspore.v1.sample.ops.penalties import _convert_to_tensors +import vllm.v1.sample.ops.penalties +vllm.v1.sample.ops.penalties._convert_to_tensors = _convert_to_tensors + +from vllm_mindspore.v1.sample.ops.topk_topp_sampler import apply_top_k_top_p, random_sample +import vllm.v1.sample.ops.topk_topp_sampler +vllm.v1.sample.ops.topk_topp_sampler.apply_top_k_top_p = apply_top_k_top_p +vllm.v1.sample.ops.topk_topp_sampler.random_sample = random_sample + +from vllm_mindspore.v1.sample.sampler import apply_temperature +import vllm.v1.sample.sampler +vllm.v1.sample.sampler.Sampler.apply_temperature = apply_temperature + from .utils import check_ready from vllm_mindspore.engine.multiprocessing.engine import cleanup diff --git a/vllm_mindspore/compilation/__init__.py b/vllm_mindspore/compilation/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/vllm_mindspore/compilation/inductor_pass.py b/vllm_mindspore/compilation/inductor_pass.py new file mode 100644 index 00000000..835a1e11 --- /dev/null +++ b/vllm_mindspore/compilation/inductor_pass.py @@ -0,0 +1,73 @@ +# SPDX-License-Identifier: Apache-2.0 + +import hashlib +import inspect +import json +import types +from typing import Any, Callable, Dict, Optional, Union + +import torch +from packaging.version import Version + + +class InductorPass(): + """ + A custom graph pass that uses a hash of its source as the UUID. + This is defined as a convenience and should work in most cases. + """ + + def uuid(self) -> Any: + """ + Provide a unique identifier for the pass, used in Inductor code cache. + This should depend on the pass implementation, so that changes to the + pass result in recompilation. + By default, the object source is hashed. + """ + return InductorPass.hash_source(self) + + @staticmethod + def hash_source(*srcs: Union[str, Any]): + """ + Utility method to hash the sources of functions or objects. + :param srcs: strings or objects to add to the hash. + Objects and functions have their source inspected. + :return: + """ + hasher = hashlib.sha256() + for src in srcs: + if isinstance(src, str): + src_str = src + elif isinstance(src, types.FunctionType): + src_str = inspect.getsource(src) + else: + src_str = inspect.getsource(src.__class__) + hasher.update(src_str.encode("utf-8")) + return hasher.hexdigest() + + @staticmethod + def hash_dict(dict_: Dict[Any, Any]): + """ + Utility method to hash a dictionary, can alternatively be used for uuid. + :return: A sha256 hash of the json rep of the dictionary. 
+        """
+        encoded = json.dumps(dict_, sort_keys=True).encode("utf-8")
+        return hashlib.sha256(encoded).hexdigest()
+
+
+class CallableInductorPass(InductorPass):
+    """
+    This class is a wrapper for a callable that automatically provides an
+    implementation of the UUID.
+    """
+
+    def __init__(self,
+                 callable,
+                 uuid: Optional[Any] = None):
+        self.callable = callable
+        self._uuid = self.hash_source(callable) if uuid is None else uuid
+
+    def __call__(self, graph):
+        self.callable(graph)
+
+    def uuid(self) -> Any:
+        return self._uuid
diff --git a/vllm_mindspore/config.py b/vllm_mindspore/config.py
index e702278e..b6366434 100644
--- a/vllm_mindspore/config.py
+++ b/vllm_mindspore/config.py
@@ -15,13 +15,22 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ============================================================================
+from collections import Counter
+from typing import Union
+import sys
+
 import torch
+from transformers import PretrainedConfig
+
+
 import vllm.envs as envs
-from vllm.config import VllmConfig, CompilationConfig, CompilationLevel, logger
+from vllm.config import VllmConfig, CompilationConfig, CompilationLevel, logger, _STR_DTYPE_TO_TORCH_DTYPE
 from vllm.utils import random_uuid
 from vllm.logger import init_logger
+from vllm.compilation.inductor_pass import CallableInductorPass, InductorPass
+from vllm.platforms import CpuArchEnum
 
 logger = init_logger(__name__)
 
@@ -80,8 +89,14 @@ def vllm_config_post_init(self):
         self.compilation_config.use_inductor = True
         self.compilation_config.cudagraph_num_of_warmups = 1
         self.compilation_config.pass_config.enable_fusion = False
-        self.compilation_config.pass_config.enable_reshape = False
-        self.compilation_config.level = CompilationLevel.PIECEWISE
+        self.compilation_config.pass_config.enable_noop = False
+        # When level is set to CompilationLevel.PIECEWISE, vllm will use cuda graph,
+        # which means the model inputs will be padded to a cuda graph acceptable size,
+        # but that is not the case for mindspore. So set it to CompilationLevel.DYNAMO_AS_IS here.
+        self.compilation_config.level = CompilationLevel.DYNAMO_AS_IS
+        # Set a small compile_sizes for warmup. '20' is not in 'cudagraph_capture_sizes'.
+        # So the warmup can be run.
+ self.compilation_config.compile_sizes = [20] self._set_cudagraph_sizes() @@ -160,3 +175,120 @@ def _verify_args(self) -> None: f"max_long_partial_prefills ({self.max_long_partial_prefills}) " "must be greater than or equal to 1 and less than or equal to " f"max_num_partial_prefills ({self.max_num_partial_prefills}).") + + +def model_post_init(self, __context) -> None: + + count_none = self.custom_ops.count("none") + count_all = self.custom_ops.count("all") + assert count_none + count_all <= 1, "Can only specify 'none' or 'all'" + + if self.splitting_ops is None: + self.splitting_ops = [] + + for k, v in self.inductor_passes.items(): + if not isinstance(v, str): + assert callable(v), ( + f"pass {k} should be callable or a qualified name") + self.inductor_compile_config[k] = v if isinstance( + v, InductorPass) else CallableInductorPass(v) + continue + + # resolve function from qualified name + names = v.split(".") + module = ".".join(names[:-1]) + func_name = names[-1] + func = __import__(module).__dict__[func_name] + self.inductor_compile_config[k] = func if isinstance( + func, InductorPass) else CallableInductorPass(func) + + self.enabled_custom_ops = Counter() + self.disabled_custom_ops = Counter() + self.traced_files = set() + self.static_forward_context = {} + self.compilation_time = 0.0 + + +def _get_and_verify_dtype( + config: PretrainedConfig, + dtype: Union[str, torch.dtype], +) -> torch.dtype: + # NOTE: getattr(config, "torch_dtype", torch.float32) is not correct + # because config.torch_dtype can be None. + config_dtype = getattr(config, "torch_dtype", None) + + # Fallbacks for multi-modal models if the root config + # does not define torch_dtype + if config_dtype is None and hasattr(config, "text_config"): + config_dtype = getattr(config.text_config, "torch_dtype", None) + if config_dtype is None and hasattr(config, "vision_config"): + config_dtype = getattr(config.vision_config, "torch_dtype", None) + + if config_dtype is None: + config_dtype = torch.float32 + + if isinstance(dtype, str): + dtype = dtype.lower() + if dtype == "auto": + if config_dtype == torch.float32: + # Following common practice, we use float16 for float32 models + torch_dtype = torch.float16 + else: + torch_dtype = config_dtype + + from vllm.platforms import current_platform + if (current_platform.is_cpu() + and current_platform.get_cpu_architecture() + == CpuArchEnum.POWERPC + and (config_dtype == torch.float16 + or config_dtype == torch.float32)): + logger.info( + "For POWERPC, we cast models to bfloat16 instead of " + "using float16 by default. Float16 is not currently " + "supported for POWERPC.") + torch_dtype = torch.bfloat16 + + # TODO: change this condition to check if the platform support bf16 + # instead of checking the OS. For instance M2 shall supports bf16 + # already. But we need to modify `cpu_extension.cmake` to activate + # the feature in the build. + if (current_platform.is_cpu() and sys.platform.startswith("darwin") + and current_platform.get_cpu_architecture() + == CpuArchEnum.ARM and config_dtype == torch.bfloat16): + logger.info("For macOS with Apple Silicon, currently bfloat16 " + "is not supported. Setting dtype to float16.") + torch_dtype = torch.float16 + + if current_platform.is_hpu() and config_dtype == torch.float16: + logger.info( + "For HPU, we cast models to bfloat16 instead of " + "using float16 by default. 
Please specify `dtype` if you " + "want to use float16.") + torch_dtype = torch.bfloat16 + else: + if dtype not in _STR_DTYPE_TO_TORCH_DTYPE: + raise ValueError(f"Unknown dtype: {dtype}") + torch_dtype = _STR_DTYPE_TO_TORCH_DTYPE[dtype] + elif isinstance(dtype, torch.dtype): + torch_dtype = dtype + else: + raise ValueError(f"Unknown dtype: {dtype}") + + # Verify the dtype. + if torch_dtype != config_dtype: + if torch_dtype == torch.float32: + # Upcasting to float32 is allowed. + logger.info("Upcasting %s to %s.", config_dtype, torch_dtype) + pass + elif config_dtype == torch.float32: + # Downcasting from float32 to float16 or bfloat16 is allowed. + logger.info("Downcasting %s to %s.", config_dtype, torch_dtype) + pass + else: + # Casting between float16 and bfloat16 is allowed with a warning. + logger.warning("Casting %s to %s.", config_dtype, torch_dtype) + + if torch_dtype in _STR_DTYPE_TO_TORCH_DTYPE: + torch_dtype = _STR_DTYPE_TO_TORCH_DTYPE[torch_dtype] + + return torch_dtype diff --git a/vllm_mindspore/engine/arg_utils.py b/vllm_mindspore/engine/arg_utils.py new file mode 100644 index 00000000..ed74ba9e --- /dev/null +++ b/vllm_mindspore/engine/arg_utils.py @@ -0,0 +1,222 @@ +import threading + +import torch + +import vllm.envs as envs +from vllm.engine.arg_utils import _raise_or_fallback, EngineArgs, _warn_or_fallback +from vllm.config import LoadFormat, ModelConfig + +def _is_v1_supported_oracle(self, model_config: ModelConfig) -> bool: + """Oracle for whether to use V0 or V1 Engine by default.""" + + ############################################################# + # Unsupported Feature Flags on V1. + + if (self.load_format == LoadFormat.TENSORIZER.value + or self.load_format == LoadFormat.SHARDED_STATE.value): + _raise_or_fallback( + feature_name=f"--load_format {self.load_format}", + recommend_to_remove=False) + return False + + if (self.logits_processor_pattern + != EngineArgs.logits_processor_pattern): + _raise_or_fallback(feature_name="--logits-processor-pattern", + recommend_to_remove=False) + return False + + if self.preemption_mode != EngineArgs.preemption_mode: + _raise_or_fallback(feature_name="--preemption-mode", + recommend_to_remove=True) + return False + + if (self.disable_async_output_proc + != EngineArgs.disable_async_output_proc): + _raise_or_fallback(feature_name="--disable-async-output-proc", + recommend_to_remove=True) + return False + + if self.scheduling_policy != EngineArgs.scheduling_policy: + _raise_or_fallback(feature_name="--scheduling-policy", + recommend_to_remove=False) + return False + + if self.num_scheduler_steps != EngineArgs.num_scheduler_steps: + _raise_or_fallback(feature_name="--num-scheduler-steps", + recommend_to_remove=True) + return False + + if self.scheduler_delay_factor != EngineArgs.scheduler_delay_factor: + _raise_or_fallback(feature_name="--scheduler-delay-factor", + recommend_to_remove=True) + return False + + if self.additional_config != EngineArgs.additional_config: + _raise_or_fallback(feature_name="--additional-config", + recommend_to_remove=False) + return False + + # Xgrammar and Guidance are supported. + SUPPORTED_GUIDED_DECODING = [ + "xgrammar", "xgrammar:disable-any-whitespace", "guidance", + "guidance:disable-any-whitespace", "auto" + ] + if self.guided_decoding_backend not in SUPPORTED_GUIDED_DECODING: + _raise_or_fallback(feature_name="--guided-decoding-backend", + recommend_to_remove=False) + return False + + # Need at least Ampere for now (FA support required). 
+ # Skip this check if we are running on a non-GPU platform, + # or if the device capability is not available + # (e.g. in a Ray actor without GPUs). + from vllm.platforms import current_platform + if (current_platform.is_cuda() + and current_platform.get_device_capability() + and current_platform.get_device_capability().major < 8): + _raise_or_fallback(feature_name="Compute Capability < 8.0", + recommend_to_remove=False) + return False + + # No Fp8 KV cache so far. + if self.kv_cache_dtype != "auto": + fp8_attention = self.kv_cache_dtype.startswith("fp8") + will_use_fa = ( + current_platform.is_cuda() + and not envs.is_set("VLLM_ATTENTION_BACKEND") + ) or envs.VLLM_ATTENTION_BACKEND == "FLASH_ATTN_VLLM_V1" + supported = False + if fp8_attention and will_use_fa: + from vllm.vllm_flash_attn.fa_utils import ( + flash_attn_supports_fp8) + supported = flash_attn_supports_fp8() + if not supported: + _raise_or_fallback(feature_name="--kv-cache-dtype", + recommend_to_remove=False) + return False + + # No Prompt Adapter so far. + if self.enable_prompt_adapter: + _raise_or_fallback(feature_name="--enable-prompt-adapter", + recommend_to_remove=False) + return False + + # Only Fp16 and Bf16 dtypes since we only support FA. + V1_SUPPORTED_DTYPES = [torch.bfloat16, torch.float16] + if model_config.dtype not in V1_SUPPORTED_DTYPES: + _raise_or_fallback(feature_name=f"--dtype {model_config.dtype}", + recommend_to_remove=False) + return False + + # Some quantization is not compatible with torch.compile. + V1_UNSUPPORTED_QUANT = ["gguf"] + if model_config.quantization in V1_UNSUPPORTED_QUANT: + _raise_or_fallback( + feature_name=f"--quantization {model_config.quantization}", + recommend_to_remove=False) + return False + + # No Embedding Models so far. + if model_config.task not in ["generate"]: + _raise_or_fallback(feature_name=f"--task {model_config.task}", + recommend_to_remove=False) + return False + + # No Mamba or Encoder-Decoder so far. + if not model_config.is_v1_compatible: + _raise_or_fallback(feature_name=model_config.architectures, + recommend_to_remove=False) + return False + + # No Concurrent Partial Prefills so far. + if (self.max_num_partial_prefills + != EngineArgs.max_num_partial_prefills + or self.max_long_partial_prefills + != EngineArgs.max_long_partial_prefills): + _raise_or_fallback(feature_name="Concurrent Partial Prefill", + recommend_to_remove=False) + return False + + # No OTLP observability so far. + if (self.otlp_traces_endpoint or self.collect_detailed_traces): + _raise_or_fallback(feature_name="--otlp-traces-endpoint", + recommend_to_remove=False) + return False + + # Only Ngram speculative decoding so far. + is_ngram_enabled = False + is_eagle_enabled = False + if self.speculative_config is not None: + # This is supported but experimental (handled below). + speculative_method = self.speculative_config.get("method") + if speculative_method: + if speculative_method in ("ngram", "[ngram]"): + is_ngram_enabled = True + elif speculative_method == "eagle": + is_eagle_enabled = True + else: + speculative_model = self.speculative_config.get("model") + if speculative_model in ("ngram", "[ngram]"): + is_ngram_enabled = True + if not (is_ngram_enabled or is_eagle_enabled): + # Other speculative decoding methods are not supported yet. + _raise_or_fallback(feature_name="Speculative Decoding", + recommend_to_remove=False) + return False + + # No Disaggregated Prefill so far. 
+ if self.kv_transfer_config != EngineArgs.kv_transfer_config: + _raise_or_fallback(feature_name="--kv-transfer-config", + recommend_to_remove=False) + return False + + # No FlashInfer or XFormers so far. + V1_BACKENDS = [ + "FLASH_ATTN_VLLM_V1", "FLASH_ATTN", "PALLAS", "PALLAS_VLLM_V1", + "TRITON_ATTN_VLLM_V1", "TRITON_MLA", "FLASHMLA" + ] + if (envs.is_set("VLLM_ATTENTION_BACKEND") + and envs.VLLM_ATTENTION_BACKEND not in V1_BACKENDS): + name = f"VLLM_ATTENTION_BACKEND={envs.VLLM_ATTENTION_BACKEND}" + _raise_or_fallback(feature_name=name, recommend_to_remove=True) + return False + + # Platforms must decide if they can support v1 for this model + if not current_platform.supports_v1(model_config=model_config): + _raise_or_fallback( + feature_name=f"device type={current_platform.device_type}", + recommend_to_remove=False) + return False + ############################################################# + # Experimental Features - allow users to opt in. + + # Signal Handlers requires running in main thread. + if (threading.current_thread() != threading.main_thread() + and _warn_or_fallback("Engine in background thread")): + return False + + # PP is supported on V1 with Ray distributed executor, + # but off for MP distributed executor for now. + if (self.pipeline_parallel_size > 1 + and self.distributed_executor_backend != "ray"): + name = "Pipeline Parallelism without Ray distributed executor" + _raise_or_fallback(feature_name=name, recommend_to_remove=False) + return False + + # ngram is supported on V1, but off by default for now. + if is_ngram_enabled and _warn_or_fallback("ngram"): + return False + + # Eagle is under development, so we don't support it yet. + if is_eagle_enabled and _warn_or_fallback("Eagle"): + return False + + # Non-CUDA is supported on V1, but off by default for now. 
+ # support vllm-mindspore defined AscendPlatform + not_cuda = not current_platform.is_cuda() and not current_platform.is_out_of_tree() + if not_cuda and _warn_or_fallback( # noqa: SIM103 + current_platform.device_name): + return False + ############################################################# + + return True diff --git a/vllm_mindspore/model_executor/layers/sampler.py b/vllm_mindspore/model_executor/layers/sampler.py index 354fb021..edfe6252 100644 --- a/vllm_mindspore/model_executor/layers/sampler.py +++ b/vllm_mindspore/model_executor/layers/sampler.py @@ -51,6 +51,10 @@ else: def get_sampler() -> torch.nn.Module: + if envs.VLLM_USE_V1: + # Lazy import: the v1 package isn't distributed + from vllm.v1.sample.sampler import Sampler as V1Sampler + return V1Sampler() return Sampler() diff --git a/vllm_mindspore/model_executor/model_loader/utils.py b/vllm_mindspore/model_executor/model_loader/utils.py index 66295a32..07a6cc32 100644 --- a/vllm_mindspore/model_executor/model_loader/utils.py +++ b/vllm_mindspore/model_executor/model_loader/utils.py @@ -24,7 +24,7 @@ from vllm.config import ModelConfig, ModelImpl from vllm.model_executor.models import ModelRegistry from vllm_mindspore.model_executor.models.registry import MindSporeModelRegistry -from vllm.model_executor.model_loader.utils import resolve_transformers_fallback +# from vllm.model_executor.model_loader.utils import resolve_transformers_fallback def get_ms_model_architecture(model_config: ModelConfig) -> Tuple[Type[nn.Module], str]: architectures = getattr(model_config.hf_config, "architectures", []) diff --git a/vllm_mindspore/model_executor/models/mf_models/deepseek_v3.py b/vllm_mindspore/model_executor/models/mf_models/deepseek_v3.py index e7cda00c..d0ac3c8c 100644 --- a/vllm_mindspore/model_executor/models/mf_models/deepseek_v3.py +++ b/vllm_mindspore/model_executor/models/mf_models/deepseek_v3.py @@ -26,6 +26,7 @@ from vllm.config import VllmConfig from vllm.config import get_current_vllm_config from vllm.forward_context import get_forward_context from vllm.logger import init_logger +import vllm.envs as envs import mindspore as ms from mindspore import Tensor, JitConfig, Model, mutable @@ -47,9 +48,8 @@ from research.deepseek3.deepseek3 import ( ) from vllm_mindspore.model_executor.layers.sampler import get_sampler -from vllm_mindspore.model_executor.models.model_base import Fake_MLA +from vllm_mindspore.model_executor.models.model_base import Fake_MLA, Fake_MLA_V1 from vllm_mindspore.model_executor.models.mf_models.mf_model_base import MfModelBase - from vllm_mindspore.model_executor.models.mf_models.deepseekv3_weight_processor import DeepseekV3WeightProcessor logger = init_logger(__name__) @@ -80,8 +80,10 @@ class DeepseekV3ForCausalLM(MfModelBase): self.sampler = get_sampler() self.set_modules({"model": self.network}) - - self.kv_caches = [Fake_MLA() for i in range(self.mf_model_config.num_layers)] + if envs.VLLM_USE_V1: + self.kv_caches = [Fake_MLA_V1() for i in range(self.mf_model_config.num_layers)] + else: + self.kv_caches = [Fake_MLA() for i in range(self.mf_model_config.num_layers)] compilation_config = get_current_vllm_config().compilation_config if prefix in compilation_config.static_forward_context: diff --git a/vllm_mindspore/model_executor/models/mf_models/mf_model_base.py b/vllm_mindspore/model_executor/models/mf_models/mf_model_base.py index 893d91a5..79974f6f 100644 --- a/vllm_mindspore/model_executor/models/mf_models/mf_model_base.py +++ b/vllm_mindspore/model_executor/models/mf_models/mf_model_base.py @@ 
-21,18 +21,19 @@ from types import MethodType from typing import Iterable, List, Optional, Set, Tuple, Union from abc import abstractmethod import numpy as np +import math -from vllm.attention import AttentionMetadata from vllm.config import VllmConfig from vllm.model_executor.layers.sampler import SamplerOutput from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors from vllm.distributed import get_tensor_model_parallel_world_size from vllm.logger import init_logger +from vllm.forward_context import get_forward_context +import vllm.envs as envs -import torch import mindspore as ms -from mindspore import Tensor, mutable +from mindspore import Tensor from mindspore.common.api import _pynative_executor from mindformers.tools.register.config import MindFormerConfig @@ -41,6 +42,7 @@ from mindformers.core.parallel_config import build_parallel_config from vllm_mindspore.model_executor.models.model_base import MsModelBase from vllm_mindspore.model_executor.models.mf_models.attention_mask import LowerTriangularMask +from vllm_mindspore.v1.attention.backends.flash_attn import FlashAttentionMetadata logger = init_logger(__name__) @@ -55,7 +57,6 @@ def _batch_seq(input_tokens, prefill): return ms.mint.reshape(input_tokens, (-1, 1)).to(ms.int32) - class MfModelBase(MsModelBase): def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None: super(MfModelBase, self).__init__( @@ -88,40 +89,96 @@ class MfModelBase(MsModelBase): raise NotImplementedError("Function _create_network should be Implemented!") + def _dummy_attention_metadata(self, input_ids: Tensor, positions: Tensor) -> FlashAttentionMetadata: + input_len = input_ids.shape[0] + max_seq_len = ms.Tensor(input_len, dtype=ms.int32) + seq_lengths = ms.Tensor([input_len], dtype=ms.int32) + q_seq_lens = ms.Tensor([input_len], dtype=ms.int32) + q_seq_lens_np = np.array([input_len], dtype=np.int32) + seq_lens_np = np.array([input_len], dtype=np.int32) + + block_tables = ms.Tensor([[0]], dtype=ms.int32) + slot_mapping = [-1 for _ in range(input_len)] + slot_mapping = ms.Tensor(slot_mapping, dtype=ms.int32) + return FlashAttentionMetadata( + max_seq_len=max_seq_len, + seq_lens=seq_lengths, + seq_lens_np=seq_lens_np, + block_tables=block_tables, + slot_mapping=slot_mapping, + q_seq_lens=q_seq_lens, + q_seq_lens_np=q_seq_lens_np, + context_lens=0, + # To enforce prefill and decode are both complied in warmup process. + # So set max_context_lens to 0 for prefill and 1 for decode. + max_context_lens=0 if not self.set_flags else 1, + query_start_loc = None + ) + def prepare_inputs(self, input_ids, positions, attn_metadata): key_cache, value_cache = self.get_kvcache() - seq_lens = attn_metadata.seq_lens - max_query_len = attn_metadata.max_query_len - # When Mutli-Step is enabled with Chunked-Prefill, prefills and - # decodes are scheduled together. In the first step, all the - # prefills turn into decodes and max_query_len will be 1. - if self.is_multi_step_chunked_prefill and max_query_len == 1: - query_lens = [1] * len(seq_lens) + if not envs.VLLM_USE_V1: + seq_lens = attn_metadata.seq_lens + max_query_len = attn_metadata.max_query_len + # When Mutli-Step is enabled with Chunked-Prefill, prefills and + # decodes are scheduled together. In the first step, all the + # prefills turn into decodes and max_query_len will be 1. 
+ if self.is_multi_step_chunked_prefill and max_query_len == 1: + query_lens = [1] * len(seq_lens) + else: + query_lens = attn_metadata.query_lens + + seq_lens = attn_metadata.seq_lens + max_query_len = attn_metadata.max_query_len + # When Mutli-Step is enabled with Chunked-Prefill, prefills and + # decodes are scheduled together. In the first step, all the + # prefills turn into decodes and max_query_len will be 1. + if self.is_multi_step_chunked_prefill and max_query_len == 1: + query_lens = [1] * len(seq_lens) + else: + query_lens = attn_metadata.query_lens + + seq_lens_np = np.array(seq_lens, dtype=np.int32) + query_lens_np = np.array(query_lens, dtype=np.int32) + kv_cache_lens = seq_lens_np - query_lens_np + if attn_metadata.num_decode_tokens == 0 and kv_cache_lens.max() == 0: + is_prefill = True + else: + is_prefill = False + + q_seq_lens = ms.Tensor(query_lens_np, dtype=ms.int32) + position_ids = ms.Tensor(positions, dtype=ms.int32) + attention_mask = self.casual_mask.gen_attention_mask(is_prefill, position_ids, query_lens) + + model_inputs = {} + model_inputs["input_ids"] = _batch_seq(input_ids, is_prefill) + model_inputs["batch_valid_length"] = ms.Tensor.from_numpy(np.expand_dims(seq_lens_np, 0)) + model_inputs["block_tables"] = attn_metadata.block_tables + model_inputs["slot_mapping"] = attn_metadata.slot_mapping + model_inputs["position_ids"] = position_ids + model_inputs["q_seq_lens"] = q_seq_lens + model_inputs["attention_mask"] = attention_mask + model_inputs["key_cache"] = key_cache + model_inputs["value_cache"] = value_cache else: - query_lens = attn_metadata.query_lens - - seq_lens_np = np.array(seq_lens, dtype=np.int32) - query_lens_np = np.array(query_lens, dtype=np.int32) - kv_cache_lens = seq_lens_np - query_lens_np - if attn_metadata.num_decode_tokens == 0 and kv_cache_lens.max() == 0: - is_prefill = True - else: - is_prefill = False - - q_seq_lens = ms.Tensor(query_lens_np, dtype=ms.int32) - position_ids = ms.Tensor(positions, dtype=ms.int32) - attention_mask = self.casual_mask.gen_attention_mask(is_prefill, position_ids, query_lens) - - model_inputs = {} - model_inputs["input_ids"] = _batch_seq(input_ids, is_prefill) - model_inputs["batch_valid_length"] = ms.Tensor.from_numpy(np.expand_dims(seq_lens_np, 0)) - model_inputs["block_tables"] = attn_metadata.block_tables - model_inputs["slot_mapping"] = attn_metadata.slot_mapping - model_inputs["position_ids"] = position_ids - model_inputs["q_seq_lens"] = q_seq_lens - model_inputs["attention_mask"] = attention_mask - model_inputs["key_cache"] = key_cache - model_inputs["value_cache"] = value_cache + if attn_metadata.max_context_lens == 0: + is_prefill = True + else: + is_prefill = False + q_seq_lens = attn_metadata.q_seq_lens + query_lens_np = attn_metadata.q_seq_lens_np + attention_mask = self.casual_mask.gen_attention_mask(is_prefill, positions, query_lens_np) + + model_inputs = {} + model_inputs["input_ids"] = _batch_seq(input_ids, is_prefill) + model_inputs["batch_valid_length"] = ms.Tensor(np.expand_dims(attn_metadata.seq_lens_np, 0)) + model_inputs["block_tables"] = attn_metadata.block_tables + model_inputs["slot_mapping"] = attn_metadata.slot_mapping + model_inputs["position_ids"] = positions.to(ms.int32) + model_inputs["q_seq_lens"] = q_seq_lens + model_inputs["attention_mask"] = attention_mask + model_inputs["key_cache"] = key_cache + model_inputs["value_cache"] = value_cache return model_inputs, is_prefill @@ -132,12 +189,13 @@ class MfModelBase(MsModelBase): self, input_ids: Tensor, positions: Tensor, - 
kv_caches: List[Tensor], - attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors] = None, inputs_embeds: Optional[Tensor] = None, **kwargs ) -> Union[Tensor, IntermediateTensors]: + attn_metadata = get_forward_context().attn_metadata + if attn_metadata is None: + attn_metadata = self._dummy_attention_metadata(input_ids, positions) model_inputs, is_prefill = self.prepare_inputs(input_ids, positions, attn_metadata) model_inputs = self.update_model_inputs(model_inputs, **kwargs) @@ -160,15 +218,18 @@ class MfModelBase(MsModelBase): hidden_states: Tensor, sampling_metadata: SamplingMetadata, ) -> Optional[Tensor]: - selected_token_indices = sampling_metadata.selected_token_indices - if selected_token_indices is not None and selected_token_indices.numel() <= 0: - logits = ms.mint.zeros((0, self.mf_model_config.vocab_size), - dtype=self.mf_model_config.compute_dtype) + if sampling_metadata is not None: + selected_token_indices = sampling_metadata.selected_token_indices + if selected_token_indices is not None and selected_token_indices.numel() <= 0: + logits = ms.mint.zeros((0, self.mf_model_config.vocab_size), + dtype=self.mf_model_config.compute_dtype) + else: + hidden_states = hidden_states.index_select(0, selected_token_indices) + logits = self.network.lm_head(hidden_states) + logits = logits.reshape(-1, logits.shape[-1]) else: - hidden_states = hidden_states.index_select(0, selected_token_indices) logits = self.lm_head(hidden_states) logits = logits.reshape(-1, logits.shape[-1]) - return logits def sample( diff --git a/vllm_mindspore/model_executor/models/mf_models/qwen2.py b/vllm_mindspore/model_executor/models/mf_models/qwen2.py index 18a865c1..ddd037cf 100644 --- a/vllm_mindspore/model_executor/models/mf_models/qwen2.py +++ b/vllm_mindspore/model_executor/models/mf_models/qwen2.py @@ -21,6 +21,8 @@ from typing import Iterable, Set, Tuple from vllm.config import VllmConfig from vllm.config import get_current_vllm_config from vllm.logger import init_logger +import vllm.envs as envs + from mindspore import Tensor, JitConfig from mindspore.nn.utils import no_init_parameters @@ -31,14 +33,14 @@ from research.qwen2_5.infer.qwen2_5 import ( ) from vllm_mindspore.model_executor.layers.sampler import get_sampler -from vllm_mindspore.model_executor.models.model_base import Fake_Attention +from vllm_mindspore.model_executor.models.model_base import Fake_Attention, Fake_Attention_V1 from vllm_mindspore.model_executor.models.mf_models.mf_model_base import MfModelBase + from vllm_mindspore.model_executor.models.mf_models.qwen2_weight_processor import Qwen2WeightProcessor logger = init_logger(__name__) - class Qwen2ForCausalLM(MfModelBase): def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None: super(Qwen2ForCausalLM, self).__init__(vllm_config=vllm_config, prefix=prefix) @@ -47,7 +49,10 @@ class Qwen2ForCausalLM(MfModelBase): self.sampler = get_sampler() self.set_modules({"model": self.network}) - self.kv_caches = [Fake_Attention() for i in range(self.mf_model_config.num_layers)] + if envs.VLLM_USE_V1: + self.kv_caches = [Fake_Attention_V1() for i in range(self.mf_model_config.num_layers)] + else: + self.kv_caches = [Fake_Attention() for i in range(self.mf_model_config.num_layers)] compilation_config = get_current_vllm_config().compilation_config if prefix in compilation_config.static_forward_context: diff --git a/vllm_mindspore/model_executor/models/model_base.py b/vllm_mindspore/model_executor/models/model_base.py index b97d7152..75e4648e 100644 --- 
a/vllm_mindspore/model_executor/models/model_base.py +++ b/vllm_mindspore/model_executor/models/model_base.py @@ -27,6 +27,7 @@ from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors from vllm.attention.backends.abstract import AttentionType from vllm.forward_context import get_forward_context +from vllm.attention.layer import Attention import torch @@ -64,6 +65,52 @@ class Fake_MLA(Fake_Attention): for _ in range(vllm_config.parallel_config.pipeline_parallel_size) ] + +class Fake_MLA(Fake_Attention): + def __init__(self): + super().__init__() + vllm_config = get_current_vllm_config() + self.kv_cache = [ + (torch.zeros(self.kv_shape, dtype=torch.bfloat16, device="Ascend"),) + for _ in range(vllm_config.parallel_config.pipeline_parallel_size) + ] + + +class Fake_Attention_V1(Attention): + def __init__(self): + vllm_config = get_current_vllm_config() + block_size = vllm_config.cache_config.block_size + num_kv_heads = vllm_config.model_config.get_num_kv_heads( + vllm_config.parallel_config + ) + head_size = vllm_config.model_config.get_head_size() + num_block = 0 + self.kv_shape = [num_block, block_size, num_kv_heads, head_size] + self.kv_cache = [ + ( + torch.zeros(self.kv_shape, dtype=torch.bfloat16, device="Ascend"), + torch.zeros(self.kv_shape, dtype=torch.bfloat16, device="Ascend"), + ) + for _ in range(vllm_config.parallel_config.pipeline_parallel_size) + ] + self.attn_type = AttentionType.DECODER + self.num_kv_heads = num_kv_heads + self.head_size = head_size + self.dtype = vllm_config.model_config.dtype + self.block_size = block_size + self.sliding_window = None + + +class Fake_MLA_V1(Fake_Attention_V1): + def __init__(self): + super().__init__() + vllm_config = get_current_vllm_config() + self.kv_cache = [ + (torch.zeros(self.kv_shape, dtype=torch.bfloat16, device="Ascend"),) + for _ in range(vllm_config.parallel_config.pipeline_parallel_size) + ] + + class MsModelBase(): def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None: super(MsModelBase, self).__init__() @@ -157,8 +204,6 @@ class MsModelBase(): self, input_ids: Tensor, positions: Tensor, - kv_caches: List[Tensor], - attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors] = None, inputs_embeds: Optional[Tensor] = None, previous_hidden_states: Optional[Tensor] = None, @@ -167,8 +212,6 @@ class MsModelBase(): return self.forward( input_ids, positions, - kv_caches, - attn_metadata, intermediate_tensors, inputs_embeds, previous_hidden_states=previous_hidden_states, @@ -198,7 +241,8 @@ class MsModelBase(): kv_cache_dtype = self.model_config.dtype if self.cache_config.cache_dtype == "auto" \ else self.cache_config.cache_dtype - kv_cache_dtype = STR_DTYPE_TO_MS_DTYPE[kv_cache_dtype] + if kv_cache_dtype in STR_DTYPE_TO_MS_DTYPE: + kv_cache_dtype = STR_DTYPE_TO_MS_DTYPE[kv_cache_dtype] num_layers = self.model_config.get_num_layers(self.parallel_config) diff --git a/vllm_mindspore/model_executor/models/registry.py b/vllm_mindspore/model_executor/models/registry.py index 1a9dbe9f..d58130ef 100644 --- a/vllm_mindspore/model_executor/models/registry.py +++ b/vllm_mindspore/model_executor/models/registry.py @@ -62,11 +62,18 @@ _T = TypeVar("_T") _SUBPROCESS_COMMAND = [ - sys.executable, "-m", "vllm.model_executor.models.registry" + sys.executable, "-m", "vllm_mindspore.model_executor.models.registry" ] def _run() -> None: + import vllm_mindspore.compilation.inductor_pass as ms_inductor_pass + 
sys.modules["vllm.compilation.inductor_pass"] = ms_inductor_pass + + # Setup plugins + from vllm.plugins import load_general_plugins + load_general_plugins() + fn, output_file = pickle.loads(sys.stdin.buffer.read()) result = fn() diff --git a/vllm_mindspore/platforms/ascend.py b/vllm_mindspore/platforms/ascend.py index b96403d4..dddce58d 100644 --- a/vllm_mindspore/platforms/ascend.py +++ b/vllm_mindspore/platforms/ascend.py @@ -25,10 +25,12 @@ import mindspore as ms from vllm.platforms.interface import DeviceCapability, Platform, PlatformEnum, _Backend from vllm.logger import init_logger +import vllm.envs as envs if TYPE_CHECKING: - from vllm.config import VllmConfig + from vllm.config import ModelConfig, VllmConfig else: + ModelConfig = None VllmConfig = None logger = init_logger(__name__) @@ -79,24 +81,35 @@ class AscendPlatform(Platform): parallel_config = vllm_config.parallel_config scheduler_config = vllm_config.scheduler_config - if parallel_config.worker_cls == "auto": - if scheduler_config.is_multi_step: - parallel_config.worker_cls = "vllm.worker.multi_step_worker.MultiStepWorker" - elif vllm_config.speculative_config: - parallel_config.worker_cls = "vllm.spec_decode.spec_decode_worker.create_spec_worker" - parallel_config.sd_worker_cls = "vllm.worker.worker.Worker" - else: - parallel_config.worker_cls = "vllm.worker.worker.Worker" + import vllm.envs as envs + if envs.VLLM_USE_V1: + parallel_config.worker_cls = \ + "vllm.v1.worker.gpu_worker.Worker" + else: + if parallel_config.worker_cls == "auto": + if scheduler_config.is_multi_step: + parallel_config.worker_cls = "vllm.worker.multi_step_worker.MultiStepWorker" + elif vllm_config.speculative_config: + parallel_config.worker_cls = "vllm.spec_decode.spec_decode_worker.create_spec_worker" + parallel_config.sd_worker_cls = "vllm.worker.worker.Worker" + else: + parallel_config.worker_cls = "vllm.worker.worker.Worker" cache_config = vllm_config.cache_config if cache_config and cache_config.block_size is None: cache_config.block_size = 16 + # if envs.VLLM_USE_V1: + # vllm_config.model_config.enforce_eager = True + @classmethod def get_attn_backend_cls(cls, selected_backend, head_size, dtype, kv_cache_dtype, block_size, use_v1, use_mla): """Get the attention backend class of a device.""" if use_v1: + if use_mla: + return "vllm_mindspore.v1.attention.backends.flash_attn.MLABackend" + return "vllm_mindspore.v1.attention.backends.flash_attn.FlashAttentionBackend" raise RuntimeError("vLLM-MindSpore do not support v1 egine now!") if use_mla: logger.info("Using MindSpore MLA backend.") @@ -120,6 +133,8 @@ class AscendPlatform(Platform): @classmethod def get_device_communicator_cls(cls) -> str: """Get device specific communicator class for distributed communication.""" + if envs.VLLM_USE_V1: + return "vllm.distributed.device_communicators.cuda_communicator.CudaCommunicator" return "vllm.distributed.device_communicators.base_device_communicator.DeviceCommunicatorBase" @classmethod @@ -127,3 +142,7 @@ class AscendPlatform(Platform): """Get the total memory of a device in bytes.""" device_props = torch.cuda.get_device_properties(device_id) return device_props.total_memory + + @classmethod + def supports_v1(cls, model_config: ModelConfig) -> bool: + return True \ No newline at end of file diff --git a/vllm_mindspore/scripts.py b/vllm_mindspore/scripts.py index 530c1e62..f35190fa 100644 --- a/vllm_mindspore/scripts.py +++ b/vllm_mindspore/scripts.py @@ -41,7 +41,8 @@ def env_setup(target_env_dict=None): "DEVICE_NUM_PER_NODE": "16", 
"HCCL_OP_EXPANSION_MODE": "AIV", "MS_JIT_MODULES": "vllm_mindspore,research", - "GLOG_v": "3" + "GLOG_v": "3", + "RAY_CGRAPH_get_timeout": "360" } for key, value in target_env_dict.items(): diff --git a/vllm_mindspore/utils.py b/vllm_mindspore/utils.py index 717416bb..d32b525e 100644 --- a/vllm_mindspore/utils.py +++ b/vllm_mindspore/utils.py @@ -217,8 +217,8 @@ def check_ready(): import vllm.envs as envs from mindspore import set_context - if envs.VLLM_USE_V1: - raise NotImplementedError("vLLM-MindSpore does not support VLLM V1 now!") + # if envs.VLLM_USE_V1: + # raise NotImplementedError("vLLM-MindSpore does not support VLLM V1 now!") # Common environment variables of predict. set_context(jit_config={"jit_level": "O0", "infer_boost": "on"}) diff --git a/vllm_mindspore/v1/__init__.py b/vllm_mindspore/v1/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/vllm_mindspore/v1/attention/__init__.py b/vllm_mindspore/v1/attention/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/vllm_mindspore/v1/attention/backends/__init__.py b/vllm_mindspore/v1/attention/backends/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/vllm_mindspore/v1/attention/backends/flash_attn.py b/vllm_mindspore/v1/attention/backends/flash_attn.py new file mode 100644 index 00000000..77f6d726 --- /dev/null +++ b/vllm_mindspore/v1/attention/backends/flash_attn.py @@ -0,0 +1,237 @@ +# SPDX-License-Identifier: Apache-2.0 +"""Attention layer with FlashAttention.""" +from dataclasses import dataclass +from typing import Any, Dict, List, Optional, Tuple, Type + +import numpy as np +import torch + +from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl, + AttentionMetadata, AttentionType) +from vllm.logger import init_logger + + +from vllm_mindspore.utils import MsKVCache + +import mindspore as ms +from mindspore import mutable +from mindspore._c_expression import swap_cache + + +logger = init_logger(__name__) + + +class FlashAttentionBackend(AttentionBackend): + + accept_output_buffer: bool = True + + @staticmethod + def get_supported_head_sizes() -> List[int]: + return [32, 64, 96, 128, 160, 192, 224, 256] + + @staticmethod + def get_name() -> str: + return "MS_ATTN" + + @staticmethod + def get_impl_cls() -> Type["AttentionImpl"]: + return MsAttentionImpl + + @staticmethod + def get_metadata_cls() -> Type["AttentionMetadata"]: + return FlashAttentionMetadata + + @staticmethod + def get_builder_cls() -> Type["AttentionMetadataBuilder"]: + return FlashAttentionMetadataBuilder + + @staticmethod + def get_kv_cache_shape( + num_blocks: int, + block_size: int, + num_kv_heads: int, + head_size: int, + ) -> Tuple[int, ...]: + if block_size % 16 != 0: + raise ValueError("Block size must be a multiple of 16.") + return (2, num_blocks, block_size, num_kv_heads, head_size) + + @staticmethod + def use_cascade_attention(*args, **kwargs) -> bool: + return False + + +class MLABackend(AttentionBackend): + @staticmethod + def get_name() -> str: + return "MS_MLA" + + @staticmethod + def get_impl_cls() -> Type["AttentionImpl"]: + return MsAttentionImpl + + @staticmethod + def get_metadata_cls() -> Type["AttentionMetadata"]: + return FlashAttentionMetadata + + @staticmethod + def get_builder_cls() -> Type["AttentionMetadataBuilder"]: + return FlashAttentionMetadataBuilder + + @staticmethod + def get_kv_cache_shape( + num_blocks: int, + block_size: int, + num_kv_heads: int, + head_size: int, + ) -> Tuple[int, ...]: + if block_size % 16 != 0: + raise ValueError("Block 
size must be a multiple of 16.") + return (1, num_blocks, block_size, 1, head_size) + + @staticmethod + def use_cascade_attention(*args, **kwargs) -> bool: + return False + + @staticmethod + def get_supported_head_sizes() -> List[int]: + return [576] + + +@dataclass +class FlashAttentionMetadata: + # NOTE(sang): Definition of context_len, query_len, and seq_len. + # |---------- N-1 iteration --------| + # |---------------- N iteration ---------------------| + # |- tokenA -|......................|-- newTokens ---| + # |---------- context_len ----------| + # |-------------------- seq_len ---------------------| + # |-- query_len ---| + + max_seq_len: int + seq_lens: torch.Tensor + seq_lens_np: np.ndarray + block_tables: torch.Tensor + slot_mapping: torch.Tensor + q_seq_lens: torch.Tensor + q_seq_lens_np: np.ndarray + context_lens: torch.Tensor + max_context_lens: int + query_start_loc: torch.Tensor + + def __getitem__(self, key): + if key == "batch_valid_length": + key = "seq_lens" + if key == "block_tables": + if getattr(self, key).ndim == 1: + return mutable(getattr(self, key).expand_dims(0)) + return mutable(getattr(self, key)) + return getattr(self, key) + + +class MsAttentionImpl(AttentionImpl): + """ + If the input tensors contain prompt tokens, the layout is as follows: + |<--------------- num_prefill_tokens ----------------->| + |<--prefill_0-->|<--prefill_1-->|...|<--prefill_N-1--->| + + Otherwise, the layout is as follows: + |<----------------- num_decode_tokens ------------------>| + |<--decode_0-->|..........|<--decode_M-1-->|<--padding-->| + + Generation tokens can contain padding when cuda-graph is used. + Currently, prompt tokens don't contain any padding. + + The prompts might have different lengths, while the generation tokens + always have length 1. + + If chunked prefill is enabled, prefill tokens and decode tokens can be + batched together in a flattened 1D query. + + |<----- num_prefill_tokens ---->|<------- num_decode_tokens --------->| + |<-prefill_0->|...|<-prefill_N-1->|<--decode_0-->|...|<--decode_M-1-->| + + Currently, cuda graph is disabled for chunked prefill, meaning there's no + padding between prefill and decode tokens. + """ + + def __init__( + self, + num_heads: int, + head_size: int, + scale: float, + num_kv_heads: int, + alibi_slopes: Optional[List[float]], + sliding_window: Optional[int], + kv_cache_dtype: str, + blocksparse_params: Optional[Dict[str, Any]] = None, + logits_soft_cap: Optional[float] = None, + attn_type: AttentionType = AttentionType.DECODER, + ) -> None: + pass + + def forward( + self, + layer: torch.nn.Module, + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + kv_cache: torch.Tensor, + attn_metadata: FlashAttentionMetadata, + output: Optional[torch.Tensor] = None, + ) -> torch.Tensor: + """Forward pass with FlashAttention. + + Args: + query: shape = [num_tokens, num_heads, head_size] + key: shape = [num_tokens, num_kv_heads, head_size] + value: shape = [num_tokens, num_kv_heads, head_size] + output: shape = [num_tokens, num_heads, head_size] + kv_cache = [2, num_blocks, block_size, num_kv_heads, head_size] + NOTE: kv_cache will be an empty tensor with shape [0] + for profiling run. + attn_metadata: Metadata for attention. + NOTE: It in-place updates the output tensor. 
+ """ + pass + + +class FlashAttentionMetadataBuilder: + def __init__(self, runner: "GPUModelRunner"): + self.runner = runner + + def reorder_batch(self, input_batch: "InputBatch", + scheduler_output: "SchedulerOutput") -> bool: + return False + + def build(self, num_reqs: int, num_actual_tokens: int, max_query_len: int, + common_prefix_len: int): + query_start_loc = ms.from_numpy(self.runner.query_start_loc_np[:num_reqs + 1]) + query_start_loc.move_to("Ascend", blocking=False) + max_context_lens = self.runner.input_batch.num_computed_tokens_cpu[:num_reqs].max() + slot_mapping = ms.from_numpy(self.runner.slot_mapping_np[:num_actual_tokens]) + slot_mapping.move_to("Ascend", blocking=False) + seq_lens_np = self.runner.seq_lens_np[:num_reqs] + max_seq_len = seq_lens_np.max() + seq_lens = ms.from_numpy(seq_lens_np) + seq_lens.move_to("Ascend", blocking=False) + context_lens = ms.from_numpy(self.runner.input_batch.num_computed_tokens_cpu[:num_reqs]) + context_lens.move_to("Ascend", blocking=False) + + q_seq_lens_np = np.diff(self.runner.query_start_loc_np[:num_reqs + 1]) + q_seq_lens = ms.from_numpy(q_seq_lens_np) + q_seq_lens.move_to("Ascend", blocking=False) + + attn_metadata = FlashAttentionMetadata( + seq_lens=seq_lens, + seq_lens_np=seq_lens_np, + block_tables=(self.runner.input_batch.block_table.get_device_tensor()[:num_reqs]), + slot_mapping=slot_mapping, + q_seq_lens=q_seq_lens, + q_seq_lens_np=q_seq_lens_np, + max_seq_len=max_seq_len, + context_lens=context_lens, + max_context_lens=max_context_lens, + query_start_loc = query_start_loc + ) + return attn_metadata diff --git a/vllm_mindspore/v1/sample/__init__.py b/vllm_mindspore/v1/sample/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/vllm_mindspore/v1/sample/ops/__init__.py b/vllm_mindspore/v1/sample/ops/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/vllm_mindspore/v1/sample/ops/penalties.py b/vllm_mindspore/v1/sample/ops/penalties.py new file mode 100644 index 00000000..a6efb974 --- /dev/null +++ b/vllm_mindspore/v1/sample/ops/penalties.py @@ -0,0 +1,21 @@ +from typing import List + +import torch +from vllm.utils import is_pin_memory_available, make_tensor_with_pad + + +def _convert_to_tensors(output_token_ids: List[List[int]], vocab_size: int, + device: torch.device) -> torch.Tensor: + """ + Convert the different list data structures to tensors. + """ + output_tokens_tensor = make_tensor_with_pad( + output_token_ids, + # Use the value of vocab_size as a pad since we don't have a + # token_id of this value. + pad=vocab_size, + device="cpu", + dtype=torch.int64, + pin_memory=is_pin_memory_available(), + ) + return output_tokens_tensor diff --git a/vllm_mindspore/v1/sample/ops/topk_topp_sampler.py b/vllm_mindspore/v1/sample/ops/topk_topp_sampler.py new file mode 100644 index 00000000..57f0a81c --- /dev/null +++ b/vllm_mindspore/v1/sample/ops/topk_topp_sampler.py @@ -0,0 +1,98 @@ +from typing import Optional +import torch +from vllm.v1.sample.ops.topk_topp_sampler import apply_top_k_only + +def random_sample( + probs: torch.Tensor, + generators: dict[int, torch.Generator], +) -> torch.Tensor: + """Randomly sample from the probabilities. + + We use this function instead of torch.multinomial because torch.multinomial + causes CPU-GPU synchronization. + """ + q = torch.empty_like(probs) + # NOTE(woosuk): To batch-process the requests without their own seeds, + # which is the common case, we first assume that every request does + # not have its own seed. 
Then, we overwrite the values for the requests + # that have their own seeds. + if len(generators) != probs.shape[0]: + q.exponential_() + if generators: + # TODO(woosuk): This can be slow because we handle each request + # one by one. Optimize this. + for i, generator in generators.items(): + q[i].exponential_(generator=generator) + # if use probs.div_(q) instead of probs = probs.div(q), it will cause + # a error when running. + probs = probs.div(q) + return probs.argmax(dim=-1).view(-1) + + +def apply_top_k_top_p( + logits: torch.Tensor, + k: Optional[torch.Tensor], + p: Optional[torch.Tensor], +) -> torch.Tensor: + """Apply top-k and top-p masks to the logits. + + If a top-p is used, this function will sort the logits tensor, + which can be slow for large batches. + + The logits tensor may be updated in-place. + """ + if p is None: + if k is None: + return logits + + # Avoid sorting vocab for top-k only case. + return apply_top_k_only(logits, k) + + logits_sort, logits_idx = logits.sort(dim=-1, descending=False) + + if k is not None: + # Apply top-k. + top_k_mask = logits_sort.size(1) - k.to(torch.long) # shape: B + # Get all the top_k values. + top_k_mask = logits_sort.gather(1, top_k_mask.unsqueeze(dim=1)) + top_k_mask = logits_sort < top_k_mask + logits_sort.masked_fill_(top_k_mask, -float("inf")) + + if p is not None: + # Apply top-p. + probs_sort = logits_sort.softmax(dim=-1) + probs_sum = torch.cumsum(probs_sort, dim=-1) + top_p_mask = probs_sum <= 1 - p.unsqueeze(dim=1) + # at least one + top_p_mask[:, -1] = False + logits_sort.masked_fill_(top_p_mask, -float("inf")) + + # Re-sort the probabilities. + logits = logits_sort.scatter(dim=-1, index=logits_idx, src=logits_sort) + return logits + + +def apply_top_k_only( + logits: torch.Tensor, + k: torch.Tensor, +) -> torch.Tensor: + """ + Apply top-k mask to the logits. + + This implementation doesn't involve sorting the entire vocab. + + The logits tensor may be updated in-place. + """ + no_top_k_mask = k == logits.shape[1] + # Set non-top-k rows to 1 so that we can gather. + k = k.masked_fill(no_top_k_mask, 1) + max_top_k = k.max() + # topk.values tensor has shape [batch_size, max_top_k]. + # Convert top k to 0-based index in range [0, max_top_k). + k_index = k.sub_(1).unsqueeze(1).expand(logits.shape[0], 1) + + top_k_mask = logits.topk(max_top_k, dim=1)[0].gather(1, k_index.long()) + # Handle non-topk rows. + top_k_mask.masked_fill_(no_top_k_mask.unsqueeze(1), -float("inf")) + logits.masked_fill_(logits < top_k_mask, -float("inf")) + return logits diff --git a/vllm_mindspore/v1/sample/rejection_sampler.py b/vllm_mindspore/v1/sample/rejection_sampler.py new file mode 100644 index 00000000..9fe3e7fb --- /dev/null +++ b/vllm_mindspore/v1/sample/rejection_sampler.py @@ -0,0 +1,627 @@ +# SPDX-License-Identifier: Apache-2.0 +from typing import Optional + +import torch +import torch.nn as nn + +from vllm.logger import init_logger +from vllm.v1.sample.metadata import SamplingMetadata +from vllm.v1.sample.ops.topk_topp_sampler import apply_top_k_top_p +from vllm.v1.spec_decode.metadata import SpecDecodeMetadata + +logger = init_logger(__name__) + +PLACEHOLDER_TOKEN_ID = -1 +GREEDY_TEMPERATURE = -1 +# Maximum number of speculative draft tokens allowed per request in a single +# step. This value is chosen to be large enough to handle typical use cases. +MAX_SPEC_LEN = 32 + + + + +class RejectionSampler(nn.Module): + """ + The implementation strictly follows the algorithm described in + https://arxiv.org/abs/2211.17192. 
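The exponential trick used by random_sample above is worth spelling out: argmax(probs / q) with q drawn from Exp(1) equals argmin(q / probs), q_i / probs_i is exponential with rate probs_i, and the minimum of independent exponentials lands on index i with probability probs_i / sum(probs), i.e. a categorical draw. A minimal, self-contained check of that equivalence (toy code, not part of the sampler in this patch):

import torch

def exponential_race_sample(probs: torch.Tensor) -> torch.Tensor:
    # Divide by Exp(1) noise and take the argmax; see the note above for why
    # this is an exact sample from Categorical(probs).
    q = torch.empty_like(probs).exponential_()
    return probs.div(q).argmax(dim=-1)

# Empirical check on a toy distribution (frequencies should be roughly 1:2:7).
probs = torch.tensor([[0.1, 0.2, 0.7]]).repeat(10000, 1)
counts = torch.bincount(exponential_race_sample(probs), minlength=3)
print(counts / counts.sum())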
+ However, we want to clarify the terminology used in the implementation: + accepted tokens: tokens that are accepted based on the relationship + between the "raw" draft and target probabilities. + recovered tokens: tokens that are sampled based on the adjusted probability + distribution, which is derived from both the draft and target + probabilities. + bonus tokens: + If all proposed tokens are accepted, the bonus token is added to the + end of the sequence. The bonus token is only sampled from the target + probabilities. We pass in the bonus tokens instead of sampling them + in the rejection sampler to allow for more flexibility in the + sampling process. For example, we can use top_p, top_k sampling for + bonus tokens, while spec decode does not support these sampling + strategies. + output tokens: + Tokens are finally generated with the rejection sampler. + output tokens = accepted tokens + recovered tokens + bonus tokens + """ + + def forward( + self, + metadata: SpecDecodeMetadata, + # [num_tokens, vocab_size] + draft_probs: Optional[torch.Tensor], + # [num_tokens, vocab_size] + target_logits: torch.Tensor, + # [batch_size, 1] + bonus_token_ids: torch.Tensor, + sampling_metadata: SamplingMetadata, + ) -> torch.Tensor: + ''' + Args: + metadata: + Metadata for spec decoding. + draft_probs (Optional[torch.Tensor]): + Probability distribution for the draft tokens. Shape is + [num_tokens, vocab_size]. Can be None if probabilities are + not provided, which is the case for ngram spec decode. + target_logits (torch.Tensor): + Target model's logits probability distribution. + Shape is [num_tokens, vocab_size]. Here, probabilities from + different requests are flattened into a single tensor because + this is the shape of the output logits. + NOTE: `target_logits` can be updated in place to save memory. + bonus_token_ids_tensor (torch.Tensor): + A tensor containing bonus tokens. Shape is [batch_size, 1]. + Bonus tokens are added to the end of the sequence if all + proposed tokens are accepted. We generate the bonus tokens + outside of the rejection sampler with the default sampling + strategy. It allows for more flexibility in the sampling + process such as top_p, top_k sampling. + sampling_metadata (SamplingMetadata): + Additional metadata needed for sampling, such as temperature, + top-k/top-p parameters, or other relevant information. + Returns: + output_token_ids (torch.Tensor): + A tensor containing the final output token IDs. + ''' + assert metadata.max_spec_len <= MAX_SPEC_LEN + # [num_tokens, vocab_size] + # NOTE(woosuk): `target_logits` can be updated in place inside the + # `compute_probs` function. + target_probs = compute_probs( + target_logits, + metadata.cu_num_draft_tokens, + sampling_metadata, + ) + + output_token_ids = rejection_sample( + metadata.draft_token_ids, + metadata.num_draft_tokens, + metadata.max_spec_len, + metadata.cu_num_draft_tokens, + draft_probs, + target_probs, + bonus_token_ids, + sampling_metadata, + ) + return output_token_ids + + @staticmethod + def parse_output( + output_token_ids: torch.Tensor, + vocab_size: int, + ) -> list[list[int]]: + """Parse the output of the rejection sampler. + + Args: + output_token_ids: The sampled token IDs in shape + [batch_size, max_spec_len + 1]. The rejected tokens are + replaced with `PLACEHOLDER_TOKEN_ID` by the rejection sampler + and will be filtered out in this function. + vocab_size: The size of the vocabulary. + + Returns: + A list of lists of token IDs. 
+ """ + output_token_ids_np = output_token_ids.cpu().numpy() + # Create mask for valid tokens. + valid_mask = ((output_token_ids_np != PLACEHOLDER_TOKEN_ID) & + (output_token_ids_np < vocab_size)) + outputs = [ + row[valid_mask[i]].tolist() + for i, row in enumerate(output_token_ids_np) + ] + return outputs + + +def rejection_sample( + # [num_tokens] + draft_token_ids: torch.Tensor, + # [batch_size] + num_draft_tokens: list[int], + max_spec_len: int, + # [batch_size] + cu_num_draft_tokens: torch.Tensor, + # [num_tokens, vocab_size] + draft_probs: Optional[torch.Tensor], + # [num_tokens, vocab_size] + target_probs: torch.Tensor, + # [batch_size, 1] + bonus_token_ids: torch.Tensor, + sampling_metadata: SamplingMetadata, +) -> torch.Tensor: + assert draft_token_ids.ndim == 1 + assert draft_probs is None or draft_probs.ndim == 2 + assert cu_num_draft_tokens.ndim == 1 + assert target_probs.ndim == 2 + + batch_size = len(num_draft_tokens) + num_tokens = draft_token_ids.shape[0] + vocab_size = target_probs.shape[-1] + device = target_probs.device + assert draft_token_ids.is_contiguous() + assert draft_probs is None or draft_probs.is_contiguous() + assert target_probs.is_contiguous() + assert bonus_token_ids.is_contiguous() + assert target_probs.shape == (num_tokens, vocab_size) + + # Create output buffer. + output_token_ids = torch.empty( + (batch_size, max_spec_len + 1), + dtype=torch.int32, # Consistent with SamplerOutput.sampled_token_ids. + device=device, + ) + output_token_ids.fill_(PLACEHOLDER_TOKEN_ID) + + if sampling_metadata.all_greedy: + is_greedy = None + else: + is_greedy = sampling_metadata.temperature == GREEDY_TEMPERATURE + if not sampling_metadata.all_random: + # Rejection sampling for greedy sampling requests. + target_argmax = target_probs.argmax(dim=-1) + rejection_greedy_sample_kernel[(batch_size, )]( + output_token_ids, + cu_num_draft_tokens, + draft_token_ids, + target_argmax, + bonus_token_ids, + is_greedy, + max_spec_len, + num_warps=1, + ) + if sampling_metadata.all_greedy: + return output_token_ids + + # Generate uniform probabilities for rejection sampling. + # [num_tokens] + uniform_probs = generate_uniform_probs( + num_tokens, + num_draft_tokens, + sampling_metadata.generators, + device, + ) + + # Sample recovered tokens for each position. + # [num_tokens] + recovered_token_ids = sample_recovered_tokens( + max_spec_len, + num_draft_tokens, + cu_num_draft_tokens, + draft_token_ids, + draft_probs, + target_probs, + sampling_metadata, + device, + ) + + # Rejection sampling for random sampling requests. + rejection_random_sample_kernel[(batch_size, )]( + output_token_ids, + cu_num_draft_tokens, + draft_token_ids, + draft_probs, + target_probs, + bonus_token_ids, + recovered_token_ids, + uniform_probs, + is_greedy, + max_spec_len, + vocab_size, + IS_NGRAM=draft_probs is None, + num_warps=1, + ) + return output_token_ids + + +def compute_probs( + logits: torch.Tensor, # [num_tokens, vocab_size] + cu_num_draft_tokens: torch.Tensor, # [batch_size] + sampling_metadata: SamplingMetadata, +) -> torch.Tensor: + """Compute probability distribution from logits based on sampling metadata. + + This function applies temperature scaling to the logits and converts + them to probabilities using softmax. For greedy decoding, it returns + the original logits. + + Args: + logits: Input logits tensor to be converted to probabilities. + cu_num_draft_tokens: Cumulative number of draft tokens. 
+ sampling_metadata: Metadata containing sampling parameters such as + temperature and whether greedy sampling is used. + + Returns: + torch.Tensor: Probability distribution (softmax of scaled logits) + if non-greedy sampling is used, otherwise returns the + original logits. + """ + assert logits.ndim == 2 + assert cu_num_draft_tokens.ndim == 1 + if sampling_metadata.all_greedy: + return logits + + num_tokens = logits.shape[0] + temperature = expand_batch_to_tokens( + sampling_metadata.temperature, + cu_num_draft_tokens, + num_tokens, + replace_from=GREEDY_TEMPERATURE, + replace_to=1, + ) + # NOTE(woosuk): Update `logits` in place to avoid allocating a new tensor. + logits.div_(temperature.unsqueeze(-1)) + + # Get expanded top_k and top_p tensors. + top_k = None + if sampling_metadata.top_k is not None: + top_k = expand_batch_to_tokens( + sampling_metadata.top_k, + cu_num_draft_tokens, + num_tokens, + ) + top_p = None + if sampling_metadata.top_p is not None: + top_p = expand_batch_to_tokens( + sampling_metadata.top_p, + cu_num_draft_tokens, + num_tokens, + ) + + # NOTE(woosuk): `apply_top_k_top_p` uses sorting to calculate the mask, + # which is slow for large vocab sizes. This may cause performance issues. + logits = apply_top_k_top_p(logits, top_k, top_p) + output_prob = logits.softmax(dim=-1, dtype=torch.float32) + return output_prob + + +def expand_batch_to_tokens( + x: torch.Tensor, # [batch_size] + cu_num_tokens: torch.Tensor, # [batch_size] + num_tokens: int, + replace_from: int = 0, + replace_to: int = 0, +) -> torch.Tensor: + """Expand [batch_size] tensor to [num_tokens] tensor based on the number of + tokens per batch in cu_num_tokens. + + For example, if x = [a, b, c] and cu_num_tokens = [2, 5, 6], then + num_tokens = 6, and expanded_x = [a, a, b, b, b, c]. + + Args: + x: [batch_size] tensor to expand. + cu_num_tokens: [batch_size] tensor containing the cumulative number of + tokens per batch. Each element represents the total number of + tokens up to and including that batch. + num_tokens: Total number of tokens. + replace_from: int = 0 + Value to be replaced if it is found in x. + replace_to: int = 0 + Value to replace with when replace_from is found. + Returns: + expanded_x: [num_tokens] tensor. + """ + batch_size = x.shape[0] + assert cu_num_tokens.shape[0] == batch_size + expanded_x = x.new_empty(num_tokens) + expand_kernel[(batch_size, )]( + expanded_x, + x, + cu_num_tokens, + replace_from, + replace_to, + MAX_NUM_TOKENS=MAX_SPEC_LEN, # To avoid recompilation. + num_warps=1, + ) + return expanded_x + + +def generate_uniform_probs( + num_tokens: int, + num_draft_tokens: list[int], + generators: dict[int, torch.Generator], + device: torch.device, +) -> torch.Tensor: + """ + Generates a batch of uniform random samples, with optional seeding + if available. + + This method creates a tensor of shape `(num_tokens, )` filled + with uniform random values in the range [0, 1). If `generators` is provided, + the requests with their own seeds will use the provided `torch.Generator` + for reproducibility. The samples for the other requests will be generated + without a seed. + + Args: + num_tokens : int + Total number of tokens. + num_draft_tokens : List[List[int]] + Number of draft tokens per request. + generators : Optional[Dict[int, torch.Generator]] + A dictionary mapping indices in the batch to + `torch.Generator` objects. + device : torch.device + The device on which to allocate the tensor. 
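A kernel-free NumPy counterpart of expand_batch_to_tokens above, using the [a, b, c] / cu_num_tokens = [2, 5, 6] example from its docstring (expand_batch_to_tokens_np is an illustrative name, not a helper from this patch):

import numpy as np

def expand_batch_to_tokens_np(x, cu_num_tokens, replace_from=0.0, replace_to=0.0):
    # Recover per-request counts from the cumulative sums: [2, 5, 6] -> [2, 3, 1].
    counts = np.diff(np.concatenate(([0], cu_num_tokens)))
    expanded = np.repeat(np.asarray(x), counts)
    # Mirror the replace_from/replace_to handling (e.g. greedy temperature -> 1).
    return np.where(expanded == replace_from, replace_to, expanded)

x = np.array([0.5, 0.9, -1.0])          # -1.0 marks a greedy request
cu = np.array([2, 5, 6])
print(expand_batch_to_tokens_np(x, cu, replace_from=-1.0, replace_to=1.0))
# -> [0.5 0.5 0.9 0.9 0.9 1. ]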
+ Returns: + uniform_rand : torch.Tensor + A tensor of shape `(num_tokens, )` containing uniform + random values in the range [0, 1). + """ + uniform_probs = torch.rand( + (num_tokens, ), + dtype=torch.float32, + device=device, + ) + start_idx = 0 + for req_idx, n in enumerate(num_draft_tokens): + # Do not generate random numbers for requests with no draft tokens. + # This can be important for reproducibility. + if n == 0: + continue + end_idx = start_idx + n + generator = generators.get(req_idx) + if generator is not None: + uniform_probs[start_idx:end_idx].uniform_(generator=generator) + start_idx = end_idx + return uniform_probs + + +def sample_recovered_tokens( + max_spec_len: int, + num_draft_tokens: list[int], + # [batch_size] + cu_num_draft_tokens: torch.Tensor, + # [num_tokens] + draft_token_ids: torch.Tensor, + # [num_tokens, vocab_size] + draft_probs: Optional[torch.Tensor], + # [num_tokens, vocab_size] + target_probs: torch.Tensor, + sampling_metadata: SamplingMetadata, + device: torch.device, +) -> torch.Tensor: + # NOTE(woosuk): Create only one distribution for each request. + batch_size = len(num_draft_tokens) + vocab_size = target_probs.shape[-1] + q = torch.empty( + (batch_size, vocab_size), + dtype=torch.float32, + device=device, + ) + q.exponential_() + for i, generator in sampling_metadata.generators.items(): + # Do not generate random numbers for requests with no draft tokens. + # This can be important for reproducibility. + if num_draft_tokens[i] > 0: + q[i].exponential_(generator=generator) + + recovered_token_ids = torch.empty_like(draft_token_ids) + sample_recovered_tokens_kernel[(batch_size, max_spec_len)]( + recovered_token_ids, + cu_num_draft_tokens, + draft_token_ids, + draft_probs, + target_probs, + q, + vocab_size, + triton.next_power_of_2(vocab_size), + IS_NGRAM=draft_probs is None, + ) + return recovered_token_ids + + +# NOTE(woosuk): Avoid specialization to prevent unnecessary recompilation. +def rejection_greedy_sample_kernel( + output_token_ids_ptr, # [batch_size, max_spec_len + 1] + cu_num_draft_tokens_ptr, # [batch_size] + draft_token_ids_ptr, # [num_tokens] + target_argmax_ptr, # [num_tokens] + bonus_token_ids_ptr, # [batch_size] + is_greedy_ptr, # [batch_size] or None + max_spec_len, +): + req_idx = tl.program_id(0) + # FIXME(woosuk): Because is_greedy_ptr is not None at profiling run, + # re-compilation may happen during runtime when is_greedy_ptr is None. + if is_greedy_ptr is None: + is_greedy = True + else: + is_greedy = tl.load(is_greedy_ptr + req_idx) + if not is_greedy: + # Early exit for non-greedy sampling requests. + return + + if req_idx == 0: + start_idx = 0 + else: + start_idx = tl.load(cu_num_draft_tokens_ptr + req_idx - 1) + end_idx = tl.load(cu_num_draft_tokens_ptr + req_idx) + num_draft_tokens = end_idx - start_idx + + rejected = False + for pos in range(num_draft_tokens): + if not rejected: + draft_token_id = tl.load(draft_token_ids_ptr + start_idx + pos) + target_argmax_id = tl.load(target_argmax_ptr + start_idx + pos) + tl.store(output_token_ids_ptr + req_idx * (max_spec_len + 1) + pos, + target_argmax_id) + if draft_token_id != target_argmax_id: + # Reject. + rejected = True + + if not rejected: + # If all tokens are accepted, append the bonus token. + bonus_token_id = tl.load(bonus_token_ids_ptr + req_idx) + tl.store( + output_token_ids_ptr + req_idx * (max_spec_len + 1) + + num_draft_tokens, bonus_token_id) + + +# NOTE(woosuk): Avoid specialization to prevent unnecessary recompilation. 
+def rejection_random_sample_kernel( + output_token_ids_ptr, # [batch_size, max_spec_len + 1] + cu_num_draft_tokens_ptr, # [batch_size] + draft_token_ids_ptr, # [num_tokens] + draft_probs_ptr, # [num_tokens, vocab_size] or None + target_probs_ptr, # [num_tokens, vocab_size] + bonus_token_ids_ptr, # [batch_size] + recovered_token_ids_ptr, # [num_tokens] + uniform_probs_ptr, # [num_tokens] + is_greedy_ptr, # [batch_size] + max_spec_len, + vocab_size, + IS_NGRAM, +): + req_idx = tl.program_id(0) + is_greedy = tl.load(is_greedy_ptr + req_idx) + if is_greedy: + # Early exit for greedy sampling requests. + return + + if req_idx == 0: + start_idx = 0 + else: + start_idx = tl.load(cu_num_draft_tokens_ptr + req_idx - 1) + end_idx = tl.load(cu_num_draft_tokens_ptr + req_idx) + num_draft_tokens = end_idx - start_idx + + rejected = False + for pos in range(num_draft_tokens): + if not rejected: + draft_token_id = tl.load(draft_token_ids_ptr + start_idx + pos) + if IS_NGRAM: + draft_prob = 1 + else: + draft_prob = tl.load(draft_probs_ptr + + (start_idx + pos) * vocab_size + + draft_token_id) + target_prob = tl.load(target_probs_ptr + + (start_idx + pos) * vocab_size + + draft_token_id) + uniform_prob = tl.load(uniform_probs_ptr + start_idx + pos) + # NOTE(woosuk): While the draft probability should never be 0, + # we check it to avoid NaNs. If it happens to be 0, we reject. + if draft_prob > 0 and target_prob / draft_prob >= uniform_prob: + # Accept. + token_id = draft_token_id + else: + # Reject. Use recovered token. + rejected = True + token_id = tl.load(recovered_token_ids_ptr + start_idx + pos) + tl.store(output_token_ids_ptr + req_idx * (max_spec_len + 1) + pos, + token_id) + + if not rejected: + # If all tokens are accepted, append the bonus token. + bonus_token_id = tl.load(bonus_token_ids_ptr + req_idx) + tl.store( + output_token_ids_ptr + req_idx * (max_spec_len + 1) + + num_draft_tokens, bonus_token_id) + + +# NOTE(woosuk): Avoid specialization to prevent unnecessary recompilation. +def expand_kernel( + output_ptr, # [num_tokens] + input_ptr, # [batch_size] + cu_num_tokens_ptr, # [batch_size] + replace_from, + replace_to, + MAX_NUM_TOKENS, +): + req_idx = tl.program_id(0) + if req_idx == 0: # noqa: SIM108 + start_idx = 0 + else: + start_idx = tl.load(cu_num_tokens_ptr + req_idx - 1) + end_idx = tl.load(cu_num_tokens_ptr + req_idx) + num_tokens = end_idx - start_idx + + src_val = tl.load(input_ptr + req_idx) + src_val = tl.where(src_val == replace_from, replace_to, src_val) + offset = tl.arange(0, MAX_NUM_TOKENS) + tl.store(output_ptr + start_idx + offset, + src_val, + mask=offset < num_tokens) + + +def sample_recovered_tokens_kernel( + output_token_ids_ptr, # [num_tokens] + cu_num_draft_tokens_ptr, # [batch_size] + draft_token_ids_ptr, # [num_tokens] + draft_probs_ptr, # [num_tokens, vocab_size] or None + target_probs_ptr, # [num_tokens, vocab_size] + q_ptr, # [batch_size, vocab_size] + vocab_size, + PADDED_VOCAB_SIZE, + IS_NGRAM, +): + req_idx = tl.program_id(0) + if req_idx == 0: + start_idx = 0 + else: + start_idx = tl.load(cu_num_draft_tokens_ptr + req_idx - 1) + end_idx = tl.load(cu_num_draft_tokens_ptr + req_idx) + num_draft_tokens = end_idx - start_idx + + # Early exit for out-of-range positions. 
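The accept/reject rule implemented by the random-sampling kernel above reduces, per request, to a short left-to-right scan: keep a draft token while target_prob / draft_prob >= u (and draft_prob > 0), emit the recovered token at the first rejection and stop, and append the bonus token only if nothing was rejected. A toy, host-side rendering of that rule (hypothetical helper with made-up probabilities, not the kernel itself):

def rejection_sample_one_request(draft_ids, draft_p, target_p, uniforms,
                                 recovered_ids, bonus_id):
    # Walk the draft tokens left to right; stop at the first rejection.
    out = []
    for pos, token in enumerate(draft_ids):
        accept = draft_p[pos] > 0 and target_p[pos] / draft_p[pos] >= uniforms[pos]
        if accept:
            out.append(token)
        else:
            out.append(recovered_ids[pos])  # recovered token replaces the draft
            return out
    out.append(bonus_id)                    # all drafts accepted -> bonus token
    return out

print(rejection_sample_one_request(
    draft_ids=[7, 3], draft_p=[0.5, 0.9], target_p=[0.6, 0.1],
    uniforms=[0.8, 0.5], recovered_ids=[11, 12], bonus_id=99))
# pos 0: 0.6/0.5 = 1.2 >= 0.8 -> accept 7; pos 1: 0.1/0.9 ~ 0.11 < 0.5 -> reject,
# emit recovered token 12 and stop, so the output is [7, 12].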
+ pos = tl.program_id(1) + if pos >= num_draft_tokens: + return + + vocab_offset = tl.arange(0, PADDED_VOCAB_SIZE) + if IS_NGRAM: + draft_token_id = tl.load(draft_token_ids_ptr + start_idx + pos) + orig_prob = tl.load(target_probs_ptr + (start_idx + pos) * vocab_size + + draft_token_id) + # Temporarily zero out the probability of the draft token. + # This is essentially the same as target_prob - draft_prob, except that + # n-gram does not have draft_prob. We regard it as 1. + tl.store( + target_probs_ptr + (start_idx + pos) * vocab_size + draft_token_id, + 0) + prob = tl.load(target_probs_ptr + (start_idx + pos) * vocab_size + + vocab_offset, + mask=vocab_offset < vocab_size, + other=0) + else: + draft_prob = tl.load(draft_probs_ptr + (start_idx + pos) * vocab_size + + vocab_offset, + mask=vocab_offset < vocab_size, + other=0) + target_prob = tl.load(target_probs_ptr + + (start_idx + pos) * vocab_size + vocab_offset, + mask=vocab_offset < vocab_size, + other=0) + prob = tl.maximum(target_prob - draft_prob, 0) + # NOTE(woosuk): We don't need `prob = prob / tl.sum(prob)` here because + # `tl.argmax` will select the maximum value. + + q = tl.load(q_ptr + req_idx * vocab_size + vocab_offset, + mask=vocab_offset < vocab_size, + other=float("-inf")) + recovered_id = tl.argmax(prob / q, axis=-1) + tl.store(output_token_ids_ptr + start_idx + pos, recovered_id) + + if IS_NGRAM: + # Restore the original probability. + tl.store( + target_probs_ptr + (start_idx + pos) * vocab_size + draft_token_id, + orig_prob) diff --git a/vllm_mindspore/v1/sample/sampler.py b/vllm_mindspore/v1/sample/sampler.py new file mode 100644 index 00000000..ed5dcb5b --- /dev/null +++ b/vllm_mindspore/v1/sample/sampler.py @@ -0,0 +1,10 @@ +import torch + +def apply_temperature( + self, + logits: torch.Tensor, + temp: torch.Tensor, +) -> torch.Tensor: + # logits.div_ will cause some error right now. + # So we use logits = logits.div instead of logits.div_. 
+ return logits.div(temp.unsqueeze(dim=1)) diff --git a/vllm_mindspore/v1/spec_decode/__init__.py b/vllm_mindspore/v1/spec_decode/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/vllm_mindspore/v1/spec_decode/eagle.py b/vllm_mindspore/v1/spec_decode/eagle.py new file mode 100644 index 00000000..7279bcaf --- /dev/null +++ b/vllm_mindspore/v1/spec_decode/eagle.py @@ -0,0 +1,258 @@ +# SPDX-License-Identifier: Apache-2.0 +import torch +import torch.nn as nn + +from vllm.config import VllmConfig +from vllm.forward_context import set_forward_context +from vllm_mindspore.v1.attention.backends.flash_attn import FlashAttentionMetadata +from vllm.v1.sample.metadata import SamplingMetadata + + +class EagleProposer: + + def __init__( + self, + vllm_config: VllmConfig, + device: torch.device, + ): + self.vllm_config = vllm_config + self.num_speculative_tokens = ( + vllm_config.speculative_config.num_speculative_tokens) + self.block_size = vllm_config.cache_config.block_size + self.arange = torch.arange(vllm_config.scheduler_config.max_num_seqs, + device=device) + + def propose( + self, + # [num_tokens] + target_token_ids: torch.Tensor, + # [num_tokens] + target_positions: torch.Tensor, + # [num_tokens, hidden_size] + target_hidden_states: torch.Tensor, + # [num_tokens] + target_slot_mapping: torch.Tensor, + # [batch_size] + next_token_ids: torch.Tensor, + # [batch_size + 1] starting with 0 + cu_num_tokens: torch.Tensor, + # [batch_size, max_num_blocks_per_req] + block_table: torch.Tensor, + sampling_metadata: SamplingMetadata, + ) -> tuple[torch.Tensor, torch.Tensor]: + num_tokens = target_token_ids.shape[0] + batch_size = next_token_ids.shape[0] + last_token_indices = cu_num_tokens[1:] - 1 + + input_ids = torch.empty_like(target_token_ids) + # Shift the input ids by one token. + # E.g., [a1, b1, b2, c1, c2, c3] -> [b1, b2, c1, c2, c3, c3] + input_ids[:-1] = target_token_ids[1:] + # Replace the last token with the next token. + # E.g., [b1, b2, c1, c2, c3, c3] -> [a2, b2, b3, c2, c3, c4] + input_ids[last_token_indices] = next_token_ids + + seq_lens = target_positions[last_token_indices] + 1 + # FIXME(woosuk): The below two ops cause synchronization. Optimize. + max_seq_len = seq_lens.max().item() + max_num_tokens = (cu_num_tokens[1:] - cu_num_tokens[:-1]).max().item() + attn_metadata = FlashAttentionMetadata( + num_actual_tokens=num_tokens, + max_query_len=max_num_tokens, + query_start_loc=cu_num_tokens, + max_seq_len=max_seq_len, + seq_lens=seq_lens, + block_table=block_table, + slot_mapping=target_slot_mapping, + # TODO(woosuk): Support cascade attention. + use_cascade=False, + common_prefix_len=0, + cu_prefix_query_lens=None, + prefix_kv_lens=None, + suffix_kv_lens=None, + ) + + with set_forward_context(attn_metadata, self.vllm_config): + hidden_states = self.model( + input_ids=input_ids, + hidden_states=target_hidden_states, + positions=target_positions, + ) + sample_hidden_states = hidden_states[last_token_indices] + logits = self.model.compute_logits(sample_hidden_states, None) + draft_token_ids, draft_probs = compute_probs_and_sample_next_token( + logits, sampling_metadata) + + # Early exit if there is only one draft token to be generated. + if self.num_speculative_tokens == 1: + # [batch_size, 1] and [batch_size, 1, vocab_size] + return draft_token_ids.view(-1, 1), draft_probs.unsqueeze(dim=1) + + # Generate the remaining draft tokens. 
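The shift-and-splice at the top of propose can be replayed on the example from its comments: with per-request prefix sums [0, 1, 3, 6], the last token of every request is replaced by the token sampled after it. A toy NumPy rendering (string labels stand in for token ids; this snippet is illustrative, not part of the patch):

import numpy as np

# Flattened token ids: request A = [a1], B = [b1, b2], C = [c1, c2, c3].
target_token_ids = np.array(["a1", "b1", "b2", "c1", "c2", "c3"])
cu_num_tokens = np.array([0, 1, 3, 6])          # prefix sums per request
next_token_ids = np.array(["a2", "b3", "c4"])   # token sampled after each request

input_ids = np.empty_like(target_token_ids)
input_ids[:-1] = target_token_ids[1:]           # shift left by one
input_ids[-1] = target_token_ids[-1]            # placeholder, overwritten below
last_token_indices = cu_num_tokens[1:] - 1      # [0, 2, 5]
input_ids[last_token_indices] = next_token_ids  # splice in the sampled tokens
print(input_ids)  # ['a2' 'b2' 'b3' 'c2' 'c3' 'c4']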
+ draft_token_ids_list = [draft_token_ids] + draft_probs_list = [draft_probs] + + positions = target_positions[last_token_indices] + hidden_states = sample_hidden_states + attn_metadata.num_actual_tokens = batch_size + attn_metadata.max_query_len = 1 + attn_metadata.query_start_loc = self.arange[:batch_size] + for _ in range(self.num_speculative_tokens - 1): + # Update the inputs. + input_ids = draft_token_ids_list[-1] + positions += 1 + attn_metadata.max_seq_len += 1 + attn_metadata.seq_lens += 1 + # Compute the slot mapping. + block_numbers = positions // self.block_size + block_ids = block_table.gather(dim=1, + index=block_numbers.view(-1, 1)) + block_ids = block_ids.view(-1) + attn_metadata.slot_mapping = (block_ids * self.block_size + + positions % self.block_size) + + # Run the model. + with set_forward_context(attn_metadata, self.vllm_config): + hidden_states = self.model( + input_ids=input_ids, + hidden_states=hidden_states, + positions=positions, + ) + logits = self.model.compute_logits(hidden_states, None) + draft_token_ids, probs = compute_probs_and_sample_next_token( + logits, sampling_metadata) + draft_token_ids_list.append(draft_token_ids) + draft_probs_list.append(probs) + + # [batch_size, num_speculative_tokens] + draft_token_ids = torch.stack(draft_token_ids_list, dim=1) + # [batch_size, num_speculative_tokens, vocab_size] + draft_probs = torch.stack(draft_probs_list, dim=1) + return draft_token_ids, draft_probs + + @staticmethod + def prepare_inputs( + # [batch_size + 1] + cu_target_query_lens: torch.Tensor, + # [batch_size] + num_rejected_tokens: torch.Tensor, + ) -> tuple[torch.Tensor, torch.Tensor]: + # cu_target_query_lens: [0, a, a + b, a + b + c] + # num_rejected_tokens: [n1, n2, n3] + # num_tokens_per_req: [a - n1, b - n2, c - n3] + # cu_num_tokens: [0, a - n1, a + b - n1 - n2, a + b + c - n1 - n2 - n3] + # token_indices: [0, 1, ..., a - n1 - 1, + # a, a + 1, ..., a + b - n2 - 1, + # a + b, a + b + 1, ..., a + b + c - n3 - 1] + + # [0, a, a + b, a + b + c] -> [a, b, c] + query_len_per_req = (cu_target_query_lens[1:] - + cu_target_query_lens[:-1]) + # [a, b, c] -> [a - n1, b - n2, c - n3] + num_tokens_per_req = query_len_per_req - num_rejected_tokens + + cu_num_tokens = torch.empty_like(cu_target_query_lens) + torch.cumsum(num_tokens_per_req, dim=0, out=cu_num_tokens[1:]) + cu_num_tokens[0] = 0 + + # FIXME(woosuk): Avoid synchronization. + num_tokens = cu_num_tokens[-1].item() + token_indices = torch.empty( + num_tokens, + dtype=torch.int32, + device=cu_num_tokens.device, + ) + + batch_size = num_rejected_tokens.shape[0] + BLOCK_SIZE = 1024 + prepare_input_kernel[(batch_size, )]( + token_indices, + cu_target_query_lens, + cu_num_tokens, + BLOCK_SIZE=BLOCK_SIZE, + ) + return cu_num_tokens, token_indices + + def load_model(self, target_model: nn.Module) -> None: + self.model = DummyEagleModel() + self.model.get_input_embeddings = target_model.get_input_embeddings + self.model.compute_logits = target_model.compute_logits + + +# FIXME(woosuk): This is a dummy model for testing. +# Remove this once we have a real model. +class DummyEagleModel(nn.Module): + + def __init__(self): + super().__init__() + + def forward( + self, + input_ids: torch.Tensor, + hidden_states: torch.Tensor, + positions: torch.Tensor, + ) -> torch.Tensor: + input_embeddings = self.get_input_embeddings(input_ids) + return hidden_states + input_embeddings # Dummy return. + + +# FIXME(woosuk): The logic here is duplicated with the main sampling code. 
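prepare_inputs above trims the rejected tail of every request and re-derives flat token indices on device. The same bookkeeping, done host-side in NumPy for one concrete case (prepare_inputs_np is an illustrative name; the real code uses a kernel precisely to avoid this host-side work):

import numpy as np

def prepare_inputs_np(cu_target_query_lens, num_rejected_tokens):
    # [0, a, a+b, a+b+c] -> [a, b, c] -> [a-n1, b-n2, c-n3]
    query_len_per_req = np.diff(cu_target_query_lens)
    num_tokens_per_req = query_len_per_req - num_rejected_tokens
    cu_num_tokens = np.concatenate(([0], np.cumsum(num_tokens_per_req)))
    # For each request keep the first (len - rejected) token positions.
    token_indices = np.concatenate([
        start + np.arange(keep)
        for start, keep in zip(cu_target_query_lens[:-1], num_tokens_per_req)
    ])
    return cu_num_tokens, token_indices

cu, idx = prepare_inputs_np(np.array([0, 3, 7, 10]), np.array([1, 2, 0]))
print(cu)   # [0 2 4 7]
print(idx)  # [0 1 3 4 7 8 9]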
+# We should refactor this to reuse the same sampling implementation. +def compute_probs_and_sample_next_token( + logits: torch.Tensor, + sampling_metadata: SamplingMetadata, +) -> tuple[torch.Tensor, torch.Tensor]: + if sampling_metadata.all_greedy: + # For greedy requests, draft_probs is not used in rejection sampling. + # Therefore, we can just return the logits. + probs = logits + next_token_ids = logits.argmax(dim=-1) + return next_token_ids, probs + + is_greedy = sampling_metadata.temperature == -1 + temperature = torch.where(is_greedy, 1.0, sampling_metadata.temperature) + logits.div_(temperature.view(-1, 1)) + probs = logits.softmax(dim=-1, dtype=torch.float32) + + # NOTE(woosuk): Currently, we ignore most of the sampling parameters in + # generating the draft tokens. We only use the temperature. While this + # could degrade the acceptance rate, it does not affect the distribution + # of the generated tokens after rejection sampling. + + # TODO(woosuk): Consider seeds. + q = torch.empty_like(probs) + q.exponential_() + next_token_ids = probs.div_(q).argmax(dim=-1).view(-1) + if not sampling_metadata.all_random: + greedy_token_ids = probs.argmax(dim=-1) + next_token_ids = torch.where( + is_greedy, + greedy_token_ids, + next_token_ids, + ) + return next_token_ids, probs + + +def prepare_input_kernel( + out_ptr, + cu_query_lens_ptr, + cu_num_tokens_ptr, + BLOCK_SIZE, +): + pid = tl.program_id(0) + + # [start_pos, end_pos) + start_pos = tl.load(cu_num_tokens_ptr + pid) + end_pos = tl.load(cu_num_tokens_ptr + pid + 1) + num_tokens = end_pos - start_pos + + index_start = tl.load(cu_query_lens_ptr + pid) + + num_blocks = tl.cdiv(num_tokens, BLOCK_SIZE) + for i in tl.range(num_blocks): + offset = i * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE) + tl.store( + out_ptr + start_pos + offset, + index_start + offset, + mask=offset < num_tokens, + ) diff --git a/vllm_mindspore/v1/utils.py b/vllm_mindspore/v1/utils.py new file mode 100644 index 00000000..6833c101 --- /dev/null +++ b/vllm_mindspore/v1/utils.py @@ -0,0 +1,12 @@ +import torch + +def copy_slice(from_tensor: torch.Tensor, to_tensor: torch.Tensor, + length: int) -> None: + """ + Copy the first length elements of a tensor into another tensor in a + non-blocking manner. + + Used to copy pinned CPU tensor data to pre-allocated GPU tensors. 
+ """ + to_tensor[:length] = from_tensor[:length] + return to_tensor diff --git a/vllm_mindspore/v1/worker/__init__.py b/vllm_mindspore/v1/worker/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/vllm_mindspore/v1/worker/block_table.py b/vllm_mindspore/v1/worker/block_table.py new file mode 100644 index 00000000..b865bae3 --- /dev/null +++ b/vllm_mindspore/v1/worker/block_table.py @@ -0,0 +1,93 @@ +# SPDX-License-Identifier: Apache-2.0 + +from typing import List + +import numpy as np +import torch + +from vllm.logger import init_logger + +logger = init_logger(__name__) + + +class BlockTable: + + def __init__( + self, + max_num_reqs: int, + max_num_blocks_per_req: int, + pin_memory: bool, + device: torch.device, + ): + self.max_num_reqs = max_num_reqs + self.max_num_blocks_per_req = max_num_blocks_per_req + self.pin_memory = pin_memory + self.device = device + + self.block_table = torch.zeros( + (max_num_reqs, max_num_blocks_per_req), + device=self.device, + dtype=torch.int32, + ) + self.block_table_cpu = torch.zeros( + (max_num_reqs, max_num_blocks_per_req), + device="cpu", + dtype=torch.int32, + pin_memory=pin_memory, + ) + self.block_table_np = self.block_table_cpu.numpy() + self.num_blocks_per_row = np.zeros(max_num_reqs, dtype=np.int32) + + def append_row( + self, + block_ids: List[int], + row_idx: int, + ) -> None: + if not block_ids: + return + num_blocks = len(block_ids) + start = self.num_blocks_per_row[row_idx] + self.num_blocks_per_row[row_idx] += num_blocks + self.block_table_np[row_idx, start:start + num_blocks] = block_ids + + def add_row(self, block_ids: List[int], row_idx: int) -> None: + self.num_blocks_per_row[row_idx] = 0 + self.append_row(block_ids, row_idx) + + def move_row(self, src: int, tgt: int) -> None: + num_blocks = self.num_blocks_per_row[src] + self.block_table_np[tgt, :num_blocks] = self.block_table_np[ + src, :num_blocks] + self.num_blocks_per_row[tgt] = num_blocks + + def swap_row(self, src: int, tgt: int) -> None: + num_blocks_src = self.num_blocks_per_row[src] + num_blocks_tgt = self.num_blocks_per_row[tgt] + self.num_blocks_per_row[src] = num_blocks_tgt + self.num_blocks_per_row[tgt] = num_blocks_src + + self.block_table_np[[src, tgt]] = self.block_table_np[[tgt, src]] + + def commit(self, num_reqs: int) -> None: + self.block_table_cpu[:num_reqs] = torch.from_numpy(self.block_table_np[:num_reqs]) + # self.block_table[:num_reqs] = self.block_table_cpu[:num_reqs] + self.block_table[:num_reqs] = torch.from_numpy(self.block_table_np[:num_reqs]) + + def clear(self) -> None: + self.block_table.fill_(0) + self.block_table_cpu.fill_(0) + self.block_table_np.fill(0) + + def get_device_tensor(self) -> torch.Tensor: + """Ruturns the device tensor of the block table.""" + return self.block_table + + def get_cpu_tensor(self) -> torch.Tensor: + """Returns the CPU tensor of the block table.""" + self.block_table_cpu.copy_(torch.from_numpy(self.block_table_np), + non_blocking=True) + return self.block_table_cpu + + def get_numpy_array(self) -> np.ndarray: + """Returns the numpy array of the block table.""" + return self.block_table_np diff --git a/vllm_mindspore/v1/worker/gpu_input_batch.py b/vllm_mindspore/v1/worker/gpu_input_batch.py new file mode 100644 index 00000000..6a2254f7 --- /dev/null +++ b/vllm_mindspore/v1/worker/gpu_input_batch.py @@ -0,0 +1,85 @@ +from typing import Dict, List, Optional, Set, Tuple, cast + +import numpy as np +import torch + +from vllm.lora.request import LoRARequest +from vllm.sampling_params import SamplingType +from 
vllm.v1.sample.metadata import SamplingMetadata +from vllm_mindspore.v1.utils import copy_slice +from vllm.v1.worker.block_table import BlockTable + +_SAMPLING_EPS = 1e-5 + + +def _make_sampling_metadata(self) -> SamplingMetadata: + num_reqs = self.num_reqs + if not self.all_greedy: + temperature = copy_slice(torch.from_numpy(self.temperature_cpu), self.temperature, num_reqs) + else: + temperature = None + if not self.no_top_p: + copy_slice(torch.from_numpy(self.top_p_cpu), self.top_p, num_reqs) + if not self.no_top_k: + copy_slice(torch.from_numpy(self.top_k_cpu), self.top_k, num_reqs) + if not self.no_min_p: + copy_slice(torch.from_numpy(self.min_p_cpu), self.min_p, num_reqs) + + if not self.no_penalties: + # Since syncing these tensors is expensive only copy them + # if necessary i.e. if there are requests which require + # penalties to be applied during sampling. + copy_slice(torch.from_numpy(self.frequency_penalties_cpu), + self.frequency_penalties, num_reqs) + copy_slice(torch.from_numpy(self.presence_penalties_cpu), + self.presence_penalties, num_reqs) + copy_slice(torch.from_numpy(self.repetition_penalties_cpu), + self.repetition_penalties, num_reqs) + + # The prompt tokens are used only for applying penalties during + # the sampling process. Hence copy these tensors only when + # there are requests which need penalties to be applied. + prompt_token_ids = self._make_prompt_token_ids_tensor() + else: + prompt_token_ids = None + + allowed_token_ids_mask: Optional[torch.Tensor] = None + if not self.no_allowed_token_ids: + assert self.allowed_token_ids_mask is not None + copy_slice(self.allowed_token_ids_mask_cpu_tensor, + self.allowed_token_ids_mask, num_reqs) + allowed_token_ids_mask = self.allowed_token_ids_mask[:num_reqs] + + return SamplingMetadata( + temperature=temperature, + all_greedy=self.all_greedy, + all_random=self.all_random, + top_p=None if self.no_top_p else self.top_p[:num_reqs], + top_k=None if self.no_top_k else self.top_k[:num_reqs], + min_p=None if self.no_min_p else self.min_p[:num_reqs], + generators=self.generators, + max_num_logprobs=self.max_num_logprobs, + prompt_token_ids=prompt_token_ids, + frequency_penalties=self.frequency_penalties[:num_reqs], + presence_penalties=self.presence_penalties[:num_reqs], + repetition_penalties=self.repetition_penalties[:num_reqs], + output_token_ids=cast(list[list[int]], self.req_output_token_ids), + min_tokens=self.min_tokens, + no_penalties=self.no_penalties, + logit_bias=self.logit_bias[:num_reqs], + allowed_token_ids_mask=allowed_token_ids_mask, + bad_words_token_ids=self.bad_words_token_ids, + ) + + +def _make_prompt_token_ids_tensor(self) -> torch.Tensor: + max_prompt_len = self.num_prompt_tokens[:self.num_reqs].max() + prompt_token_ids = np.empty((self.num_reqs, max_prompt_len), dtype=np.int64) + prompt_token_ids[:] = self.token_ids_cpu[:self. 
+ num_reqs, :max_prompt_len] + for i in range(self.num_reqs): + prompt_token_ids[i, self.num_prompt_tokens[i]:] = self.vocab_size + prompt_token_ids_cpu_tensor = torch.from_numpy(prompt_token_ids) + prompt_token_ids_cpu_tensor.move_to("Ascend", blocking=False) + return prompt_token_ids_cpu_tensor + diff --git a/vllm_mindspore/v1/worker/gpu_model_runner.py b/vllm_mindspore/v1/worker/gpu_model_runner.py new file mode 100644 index 00000000..988ee71c --- /dev/null +++ b/vllm_mindspore/v1/worker/gpu_model_runner.py @@ -0,0 +1,420 @@ + +from typing import Dict, Tuple, List +import gc +import numpy as np +import torch + +from mindspore import mutable +import mindspore as ms +from vllm_mindspore.v1.attention.backends.flash_attn import (FlashAttentionMetadata, + FlashAttentionBackend, + MLABackend) +from vllm_mindspore.utils import get_valid_dtype + +from vllm.v1.kv_cache_interface import FullAttentionSpec +from vllm.v1.utils import bind_kv_cache +from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargs +from vllm.logger import logger +from vllm.distributed.parallel_state import get_pp_group +from vllm.utils import cdiv +from vllm.logger import init_logger +from vllm.v1.worker.gpu_input_batch import CachedRequestState +from vllm.model_executor.layers.rotary_embedding import MRotaryEmbedding +from vllm.sampling_params import SamplingType + + +logger = init_logger(__name__) +def _prepare_inputs( + self, + scheduler_output: "SchedulerOutput", +) -> Tuple[FlashAttentionMetadata, torch.Tensor]: + total_num_scheduled_tokens = scheduler_output.total_num_scheduled_tokens + assert total_num_scheduled_tokens > 0 + num_reqs = self.input_batch.num_reqs + assert num_reqs > 0 + + modified_batch = self.attn_metadata_builder.reorder_batch( + self.input_batch, scheduler_output) + if modified_batch: + self.input_batch.refresh_sampling_metadata() + + # OPTIMIZATION: Start copying the block table first. + # This way, we can overlap the copy with the following CPU operations. + self.input_batch.block_table.commit(num_reqs) + + # Get the number of scheduled tokens for each request. + # TODO: The Python loop can be slow. Optimize. + num_scheduled_tokens = np.empty(num_reqs, dtype=np.int32) + max_num_scheduled_tokens = 0 + for i, req_id in enumerate(self.input_batch.req_ids): + num_tokens = scheduler_output.num_scheduled_tokens[req_id] + num_scheduled_tokens[i] = num_tokens + max_num_scheduled_tokens = max(max_num_scheduled_tokens, + num_tokens) + + # Get request indices. + # E.g., [2, 5, 3] -> [0, 0, 1, 1, 1, 1, 1, 2, 2, 2] + req_indices = np.repeat(self.arange_np[:num_reqs], + num_scheduled_tokens) + + # Get batched arange. + # E.g., [2, 5, 3] -> [0, 1, 0, 1, 2, 3, 4, 0, 1, 2] + # Equivalent to but faster than: + # np.concatenate([np.arange(n) for n in num_scheduled_tokens]) + # Step 1. [2, 5, 3] -> [2, 7, 10] + cu_num_tokens = np.cumsum(num_scheduled_tokens) + # Step 2. [2, 7, 10] -> [0, 0, 2, 2, 2, 2, 2, 7, 7, 7] + cumsums_offsets = np.repeat(cu_num_tokens - num_scheduled_tokens, + num_scheduled_tokens) + # Step 3. [0, 1, 0, 1, 2, 3, 4, 0, 1, 2] + arange = self.arange_np[:total_num_scheduled_tokens] - cumsums_offsets + + # Get positions. 
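The request-index and batched-arange construction above avoids a Python loop by combining np.repeat with cumulative sums. Replaying the [2, 5, 3] example from the comments as a standalone snippet:

import numpy as np

num_scheduled_tokens = np.array([2, 5, 3])
total = num_scheduled_tokens.sum()

# Request index of every scheduled token: [0 0 1 1 1 1 1 2 2 2]
req_indices = np.repeat(np.arange(len(num_scheduled_tokens)), num_scheduled_tokens)

# Batched arange without a loop: subtract each request's start offset from the
# global arange.  [2 5 3] -> cumsum [2 7 10] -> offsets [0 0 2 2 2 2 2 7 7 7]
cu_num_tokens = np.cumsum(num_scheduled_tokens)
offsets = np.repeat(cu_num_tokens - num_scheduled_tokens, num_scheduled_tokens)
arange = np.arange(total) - offsets

print(req_indices)  # [0 0 1 1 1 1 1 2 2 2]
print(arange)       # [0 1 0 1 2 3 4 0 1 2]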
+ positions_np = self.positions_np[:total_num_scheduled_tokens] + np.add(self.input_batch.num_computed_tokens_cpu[req_indices], + arange, + out=positions_np) + + if self.uses_mrope: + self._calc_mrope_positions(scheduler_output) + + if self.uses_mrope: + # Only relevant for models using M-RoPE (e.g, Qwen2-VL) + self.mrope_positions[:, :total_num_scheduled_tokens].copy_( + self.mrope_positions_cpu[:, :total_num_scheduled_tokens], + non_blocking=True) + else: + self.positions[:total_num_scheduled_tokens] = torch.from_numpy(positions_np) + + + # Get token indices. + # E.g., [0, 1, 0, 1, 2, 3, 4, 0, 1, 2] + # -> [0, 1, M, M + 1, M + 2, M + 3, M + 4, 2 * M, 2 * M + 1, 2 * M + 2] + # where M is the max_model_len. + token_indices = (positions_np + + req_indices * self.input_batch.token_ids_cpu.shape[1]) + + self.input_ids[:total_num_scheduled_tokens] = torch.from_numpy( + np.take(self.input_batch.token_ids_cpu.flatten(), + token_indices, + 0) + ) + + # Calculate the slot mapping. + # E.g., [0, 1, 0, 1, 2, 3, 4, 0, 1, 2] + # -> [0, 0, K, K, K + 1, K + 1, K + 2, 2 * K, 2 * K, 2 * K + 1] + # where K is the max_num_blocks_per_req and the block size is 2. + # NOTE(woosuk): We can't simply use `token_indices // block_size` here + # because M (max_model_len) is not necessarily divisible by block_size. + block_table_indices = (req_indices * self.max_num_blocks_per_req + + positions_np // self.block_size) + + + block_numbers = self.input_batch.block_table.block_table_np.flatten()[block_table_indices] + block_offsets = positions_np % self.block_size + np.add(block_numbers * self.block_size, + block_offsets, + out=self.slot_mapping_np[:total_num_scheduled_tokens]) + + # # Prepare the attention metadata. + self.query_start_loc_np[0] = 0 + self.query_start_loc_np[1:num_reqs + 1] = cu_num_tokens + + self.seq_lens_np[:num_reqs] = ( + self.input_batch.num_computed_tokens_cpu[:num_reqs] + + num_scheduled_tokens) + + common_prefix_len = 0 + if self.cascade_attn_enabled: + common_prefix_len = self._compute_cascade_attn_prefix_len( + num_scheduled_tokens, + scheduler_output.num_common_prefix_blocks, + ) + + attn_metadata = self.attn_metadata_builder.build( + num_reqs=num_reqs, + num_actual_tokens=total_num_scheduled_tokens, + max_query_len=max_num_scheduled_tokens, + common_prefix_len=common_prefix_len, + ) + + use_spec_decode = len( + scheduler_output.scheduled_spec_decode_tokens) > 0 + if not use_spec_decode: + # NOTE(woosuk): Due to chunked prefills, the batch may contain + # partial requests. While we should not sample any token + # from these partial requests, we do so for simplicity. + # We will ignore the sampled tokens from the partial requests. + # TODO: Support prompt logprobs. + logits_indices = attn_metadata.query_start_loc[1:] - 1 + spec_decode_metadata = None + else: + # Get the number of draft tokens for each request. + # Iterate over the dictionary rather than all requests since not all + # requests have draft tokens. 
+ num_draft_tokens = np.zeros(num_reqs, dtype=np.int32) + for req_id, draft_token_ids in ( + scheduler_output.scheduled_spec_decode_tokens.items()): + req_idx = self.input_batch.req_id_to_index[req_id] + num_draft_tokens[req_idx] = len(draft_token_ids) + + spec_decode_metadata = self._calc_spec_decode_metadata( + num_draft_tokens, cu_num_tokens) + logits_indices = spec_decode_metadata.logits_indices + + # Hot-Swap lora model + if self.lora_config: + self.set_active_loras(self.input_batch, num_scheduled_tokens) + + return attn_metadata, logits_indices, spec_decode_metadata + + +def create_block(shape, dtype, name=None, device=None): + from mindspore import mint + blocks = mint.empty(shape, dtype=dtype, device=device) + return blocks + +def initialize_kv_cache(self, kv_cache_config) -> None: + """ + Initialize KV cache based on `kv_cache_config`. + Args: + kv_cache_config: Configuration for the KV cache, including the KV + cache size of each layer + """ + if len(kv_cache_config.kv_cache_groups) > 1: + raise NotImplementedError( + "Hybrid models with more than one KV cache type are not " + "supported yet.") + + kv_caches: Dict[str, torch.Tensor] = {} + + for kv_cache_group in kv_cache_config.kv_cache_groups: + kv_cache_spec = kv_cache_group.kv_cache_spec + for layer_name in kv_cache_group.layer_names: + tensor_config = kv_cache_config.tensors[layer_name] + assert tensor_config.size % kv_cache_spec.page_size_bytes == 0 + num_blocks = tensor_config.size // kv_cache_spec.page_size_bytes + # `num_blocks` is the number of blocks the model runner can use. + # `kv_cache_config.num_blocks` is the number of blocks that + # KVCacheManager may allocate. + # Since different GPUs may have different number of layers and + # different memory capacities, `num_blocks` can be different on + # different GPUs, and `kv_cache_config.num_blocks` is set to + # the min of all `num_blocks`. Verify it here. + assert num_blocks >= kv_cache_config.num_blocks + if isinstance(kv_cache_spec, FullAttentionSpec): + kv_cache_shape = self.attn_backend.get_kv_cache_shape( + num_blocks, kv_cache_spec.block_size, kv_cache_spec.num_kv_heads, + kv_cache_spec.head_size) + dtype = kv_cache_spec.dtype + dtype = get_valid_dtype(dtype) + current_cache = [] + device_type = "CPU" if self.device.type == "cpu" else "Ascend" + for i in range(kv_cache_shape[0]): + cache_blocks = create_block( + kv_cache_shape[1:], dtype, device=device_type + ) + current_cache.append(mutable(cache_blocks)) + kv_caches[layer_name] = mutable(tuple(current_cache)) + else: + raise NotImplementedError + + bind_kv_cache( + kv_caches, + self.vllm_config.compilation_config.static_forward_context, + self.kv_caches) + + +def _update_states(self, scheduler_output: "SchedulerOutput") -> None: + """Update the cached states and the persistent batch with the scheduler + output. + + The updated states are used by the `_prepare_inputs` function to create + the input GPU tensors for the model. + + The SamplingMetadata is updated and copied to the GPU if there is a + new/resumed/paused/finished request in the batch. + """ + # Remove finished requests from the cached states. + for req_id in scheduler_output.finished_req_ids: + self.requests.pop(req_id, None) + self.encoder_cache.pop(req_id, None) + # Remove the finished requests from the persistent batch. + # NOTE(woosuk): There could be an edge case where finished_req_ids and + # scheduled_req_ids overlap. This happens when a request is aborted and + # then resubmitted with the same ID. 
In this case, we treat them as two + # distinct requests - clearing the cached states for the first request + # and handling the second as a new request. + removed_req_indices: List[int] = [] + for req_id in scheduler_output.finished_req_ids: + req_index = self.input_batch.remove_request(req_id) + if req_index is not None: + removed_req_indices.append(req_index) + + # Free the cached encoder outputs. + for req_id, input_id in scheduler_output.free_encoder_input_ids: + encoder_outputs = self.encoder_cache.get(req_id) + if encoder_outputs is not None: + encoder_outputs.pop(input_id, None) + if not encoder_outputs: + self.encoder_cache.pop(req_id, None) + + # Remove the unscheduled requests from the persistent batch. + # NOTE(woosuk): The unscheduled requests are either preempted requests + # or running requests that are not scheduled in this step. We remove + # them from the persistent batch but keep their cached states since + # they will be scheduled again sometime in the future. + scheduled_req_ids = scheduler_output.num_scheduled_tokens.keys() + cached_req_ids = self.input_batch.req_id_to_index.keys() + unscheduled_req_ids = cached_req_ids - scheduled_req_ids + # NOTE(woosuk): The persistent batch optimization assumes that + # consecutive batches contain mostly the same requests. If batches + # have low request overlap (e.g., alternating between two distinct + # sets of requests), this optimization becomes very inefficient. + for req_id in unscheduled_req_ids: + req_index = self.input_batch.remove_request(req_id) + assert req_index is not None + removed_req_indices.append(req_index) + + req_ids_to_add: List[str] = [] + # Add new requests to the cached states. + for new_req_data in scheduler_output.scheduled_new_reqs: + req_id = new_req_data.req_id + sampling_params = new_req_data.sampling_params + if sampling_params.sampling_type == SamplingType.RANDOM_SEED: + generator = torch.Generator(device=self.device) + generator.manual_seed(sampling_params.seed) + else: + generator = None + + self.requests[req_id] = CachedRequestState( + req_id=req_id, + prompt_token_ids=new_req_data.prompt_token_ids, + prompt=new_req_data.prompt, + mm_inputs=new_req_data.mm_inputs, + mm_positions=new_req_data.mm_positions, + sampling_params=sampling_params, + generator=generator, + block_ids=new_req_data.block_ids, + num_computed_tokens=new_req_data.num_computed_tokens, + output_token_ids=[], + lora_request=new_req_data.lora_request, + ) + + # Only relevant for models using M-RoPE (e.g, Qwen2-VL) + if self.uses_mrope: + image_grid_thw = [] + video_grid_thw = [] + second_per_grid_ts = [] + for mm_input in self.requests[req_id].mm_inputs: + if mm_input.get("image_grid_thw") is not None: + image_grid_thw.extend( + mm_input["image_grid_thw"].tolist()) + if mm_input.get("video_grid_thw") is not None: + video_grid_thw.extend( + mm_input["video_grid_thw"].tolist()) + if mm_input.get("second_per_grid_ts") is not None: + second_per_grid_ts.extend( + mm_input["second_per_grid_ts"]) + + hf_config = self.model_config.hf_config + + self.requests[req_id].mrope_positions, \ + self.requests[req_id].mrope_position_delta = \ + MRotaryEmbedding.get_input_positions_tensor( + self.requests[req_id].prompt_token_ids, + hf_config=hf_config, + image_grid_thw=image_grid_thw, + video_grid_thw=video_grid_thw, + second_per_grid_ts=second_per_grid_ts, + ) + + req_ids_to_add.append(req_id) + + # Update the states of the running/resumed requests. 
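+    # For each scheduled cached request: refresh num_computed_tokens, append the
+    # token(s) sampled in the previous step to output_token_ids, and extend or
+    # replace its block IDs. Requests that are no longer in the persistent batch
+    # are queued in req_ids_to_add and re-added further below.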
+ for req_data in scheduler_output.scheduled_cached_reqs: + req_id = req_data.req_id + req_state = self.requests[req_id] + + # Update the cached states. + num_computed_tokens = req_data.num_computed_tokens + req_state.num_computed_tokens = num_computed_tokens + # Add the sampled token(s) from the previous step (if any). + # This doesn't include "unverified" tokens like spec decode tokens. + num_new_tokens = (num_computed_tokens + + len(req_data.new_token_ids) - + req_state.num_tokens) + if num_new_tokens == 1: + # Avoid slicing list in most common case. + req_state.output_token_ids.append(req_data.new_token_ids[-1]) + elif num_new_tokens > 0: + req_state.output_token_ids.extend( + req_data.new_token_ids[-num_new_tokens:]) + # Update the block IDs. + if not req_data.resumed_from_preemption: + # Append the new blocks to the existing block IDs. + req_state.block_ids.extend(req_data.new_block_ids) + else: + # The request is resumed from preemption. + # Replace the existing block IDs with the new ones. + req_state.block_ids = req_data.new_block_ids + + req_index = self.input_batch.req_id_to_index.get(req_id) + if req_index is None: + # The request is not in the persistent batch. + # The request was either preempted and resumed later, or was not + # scheduled in the previous step and needs to be added again. + req_ids_to_add.append(req_id) + continue + + # Update the persistent batch. + self.input_batch.num_computed_tokens_cpu[req_index] = ( + num_computed_tokens) + start_index = (len(req_state.block_ids) - + len(req_data.new_block_ids)) + self.input_batch.block_table.append_row(req_data.new_block_ids, + req_index) + # Add new_token_ids to token_ids_cpu. + start_token_index = num_computed_tokens + end_token_index = num_computed_tokens + len(req_data.new_token_ids) + self.input_batch.token_ids_cpu[ + req_index, + start_token_index:end_token_index] = req_data.new_token_ids + + self.input_batch.num_tokens_no_spec[req_index] = end_token_index + # Add spec_token_ids to token_ids_cpu. + spec_token_ids = scheduler_output.scheduled_spec_decode_tokens.get( + req_id, ()) + if spec_token_ids: + start_index = end_token_index + end_token_index += len(spec_token_ids) + self.input_batch.token_ids_cpu[ + req_index, start_index:end_token_index] = spec_token_ids + # NOTE(woosuk): `num_tokens` here may include spec decode tokens. + self.input_batch.num_tokens[req_index] = end_token_index + + + # self.input_batch.token_ids_cpu_tensor.copy_(torch.from_numpy(self.input_batch.token_ids_cpu)) + # Check if the batch has changed. If not, we can skip copying the + # sampling metadata from CPU to GPU. + batch_changed = len(removed_req_indices) > 0 or len(req_ids_to_add) > 0 + + # Add the new or resumed requests to the persistent batch. + # The smaller empty indices are filled first. + removed_req_indices = sorted(removed_req_indices, reverse=True) + for req_id in req_ids_to_add: + req_state = self.requests[req_id] + if removed_req_indices: + # Fill the empty index. + req_index = removed_req_indices.pop() + else: + # Append to the end. + req_index = None + self.input_batch.add_request(req_state, req_index) + + # Condense the batched states if there are empty indices. 
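+    # Removals above leave holes in the persistent batch arrays; condensing
+    # compacts the remaining requests so their indices stay contiguous.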
+ if removed_req_indices: + self.input_batch.condense(removed_req_indices) + + if batch_changed: + self.input_batch.refresh_sampling_metadata() -- Gitee From ae926b82e7793a4ab0ef78ee995087194b91f4c2 Mon Sep 17 00:00:00 2001 From: lvhaoyu Date: Sun, 13 Apr 2025 10:31:39 +0800 Subject: [PATCH 07/55] add check before calling STR_DTYPE_TO_TENSOR_DTYPE[] --- vllm_mindspore/worker/model_runner.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm_mindspore/worker/model_runner.py b/vllm_mindspore/worker/model_runner.py index 561fd202..55bb26ec 100644 --- a/vllm_mindspore/worker/model_runner.py +++ b/vllm_mindspore/worker/model_runner.py @@ -132,7 +132,8 @@ def _dummy_run(self, # tensor aliasing. kv_cache_dtype = self.model_config.dtype if self.cache_config.cache_dtype == "auto" \ else self.cache_config.cache_dtype - kv_cache_dtype = STR_DTYPE_TO_TENSOR_DTYPE[kv_cache_dtype] + if kv_cache_dtype in STR_DTYPE_TO_TENSOR_DTYPE: + kv_cache_dtype = STR_DTYPE_TO_TENSOR_DTYPE[kv_cache_dtype] block_size = self.cache_config.block_size num_kv_heads = self.model_config.get_num_kv_heads(self.parallel_config) head_size = self.model_config.get_head_size() -- Gitee From 2814e08082d0634769b6528b4b2bd6c9b3801f4f Mon Sep 17 00:00:00 2001 From: candyhong <1102229410@qq.com> Date: Wed, 16 Apr 2025 10:10:22 +0000 Subject: [PATCH 08/55] Supporting mp for v1 --- vllm_mindspore/__init__.py | 4 ++ vllm_mindspore/distributed/shm_broadcast.py | 71 +++++++++++++++++++++ 2 files changed, 75 insertions(+) create mode 100644 vllm_mindspore/distributed/shm_broadcast.py diff --git a/vllm_mindspore/__init__.py b/vllm_mindspore/__init__.py index a943c92f..90243e23 100644 --- a/vllm_mindspore/__init__.py +++ b/vllm_mindspore/__init__.py @@ -291,6 +291,10 @@ from vllm_mindspore.v1.sample.sampler import apply_temperature import vllm.v1.sample.sampler vllm.v1.sample.sampler.Sampler.apply_temperature = apply_temperature +from vllm_mindspore.distributed.shm_broadcast import initialize_ShmRingBuffer +from vllm.distributed.device_communicators.shm_broadcast import ShmRingBuffer +ShmRingBuffer.__init__ = initialize_ShmRingBuffer + from .utils import check_ready from vllm_mindspore.engine.multiprocessing.engine import cleanup diff --git a/vllm_mindspore/distributed/shm_broadcast.py b/vllm_mindspore/distributed/shm_broadcast.py new file mode 100644 index 00000000..bf010812 --- /dev/null +++ b/vllm_mindspore/distributed/shm_broadcast.py @@ -0,0 +1,71 @@ +#!/usr/bin/env python3 +# encoding: utf-8 +# Copyright 2025 Huawei Technologies Co., Ltd +# Copyright 2024 The vLLM team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================ +import numpy as np +from typing import Optional +from multiprocessing import shared_memory +from unittest.mock import patch +from vllm.logger import init_logger + +logger = init_logger(__name__) + +def initialize_ShmRingBuffer(self, + n_reader: int, + max_chunk_bytes: int, + max_chunks: int, + name: Optional[str] = None): + logger.info("Entering mindspore shm_broadcast") + self.n_reader = n_reader + self.metadata_size = 1 + n_reader + self.max_chunk_bytes = max_chunk_bytes + self.max_chunks = max_chunks + self.total_bytes_of_buffer = (self.max_chunk_bytes + + self.metadata_size) * self.max_chunks + self.data_offset = 0 + self.metadata_offset = self.max_chunk_bytes * self.max_chunks + + if name is None: + # we are creating a buffer + self.is_creator = True + self.shared_memory = shared_memory.SharedMemory( + create=True, size=self.total_bytes_of_buffer) + # initialize the metadata section to 0 + with memoryview(self.shared_memory.buf[self.metadata_offset:] + ) as metadata_buffer: + np.frombuffer(metadata_buffer, dtype=np.uint8).fill(0) + else: + # we are opening an existing buffer + self.is_creator = False + # fix to https://stackoverflow.com/q/62748654/9191338 + # Python incorrectly tracks shared memory even if it is not + # created by the process. The following patch is a workaround. + with patch("multiprocessing.resource_tracker.register", + lambda *args, **kwargs: None): + try: + self.shared_memory = shared_memory.SharedMemory(name=name) + # See https://docs.python.org/3/library/multiprocessing.shared_memory.html # noqa + # Some platforms allocate memory based on page size, + # so the shared memory block size may be larger or equal + # to the requested size. The size parameter is ignored + # when attaching to an existing block. 
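+                # Hence only require the attached block to be at least as
+                # large as the ring buffer, not exactly equal.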
+ assert (self.shared_memory.size + >= self.total_bytes_of_buffer) + except FileNotFoundError: + # we might deserialize the object in a different node + # in this case, this object is not used, + # and we should suppress the error + pass -- Gitee From 14b5446e69ef41b2ba48621c2ea225a391e6be0b Mon Sep 17 00:00:00 2001 From: lvhaoyu Date: Sun, 13 Apr 2025 14:34:25 +0800 Subject: [PATCH 09/55] use cpu scoket to replace ProcessGroupGloo --- vllm_mindspore/__init__.py | 11 +- vllm_mindspore/config.py | 109 ++++++++++++++++++ vllm_mindspore/executor/ray_gpu_executor.py | 1 + .../models/mf_models/weight_processor.py | 5 +- vllm_mindspore/v1/engine/__init__.py | 0 vllm_mindspore/v1/engine/core.py | 6 + vllm_mindspore/v1/worker/gpu_worker.py | 41 +++++++ 7 files changed, 169 insertions(+), 4 deletions(-) create mode 100644 vllm_mindspore/v1/engine/__init__.py create mode 100644 vllm_mindspore/v1/engine/core.py create mode 100644 vllm_mindspore/v1/worker/gpu_worker.py diff --git a/vllm_mindspore/__init__.py b/vllm_mindspore/__init__.py index 90243e23..ce06a685 100644 --- a/vllm_mindspore/__init__.py +++ b/vllm_mindspore/__init__.py @@ -56,6 +56,10 @@ import vllm.engine.arg_utils from vllm_mindspore.engine.arg_utils import _is_v1_supported_oracle vllm.engine.arg_utils.EngineArgs._is_v1_supported_oracle = _is_v1_supported_oracle +import vllm.v1.engine.core +from vllm_mindspore.v1.engine.core import shutdown +vllm.v1.engine.core.DPEngineCoreProc.shutdown = shutdown + from vllm_mindspore.utils import ( direct_register_custom_op, make_tensor_with_pad, @@ -197,13 +201,15 @@ vllm.engine.async_llm_engine.initialize_ray_cluster = initialize_ray_cluster from .config import _verify_quantization, _verify_args, vllm_config_post_init, model_post_init, \ - _get_and_verify_dtype + _get_and_verify_dtype, stateless_init_dp_group, has_unfinished_dp vllm.config.ModelConfig._verify_quantization = _verify_quantization vllm.config.VllmConfig.__post_init__ = vllm_config_post_init vllm.config.SchedulerConfig._verify_args = _verify_args vllm.config.CompilationConfig.model_post_init = model_post_init vllm.config._get_and_verify_dtype = _get_and_verify_dtype +vllm.config.ParallelConfig.stateless_init_dp_group = stateless_init_dp_group +vllm.config.ParallelConfig.has_unfinished_dp = has_unfinished_dp from .utils import update_modules from vllm_mindspore.attention.backends import ms_attn @@ -268,9 +274,10 @@ vllm.v1.worker.gpu_input_batch.InputBatch._make_prompt_token_ids_tensor = _make_ vllm.v1.worker.gpu_model_runner.InputBatch._make_prompt_token_ids_tensor = _make_prompt_token_ids_tensor from vllm.v1.worker.gpu_worker import Worker +from vllm_mindspore.v1.worker.gpu_worker import init_device Worker.__init__ = wrapper_worker_init(Worker.__init__) -Worker.init_device = wrapper_worker_init_device(Worker.init_device) +Worker.init_device = wrapper_worker_init_device(init_device) import vllm.v1.utils diff --git a/vllm_mindspore/config.py b/vllm_mindspore/config.py index b6366434..21c2fded 100644 --- a/vllm_mindspore/config.py +++ b/vllm_mindspore/config.py @@ -18,6 +18,9 @@ from collections import Counter from typing import Union import sys +import socket +import pickle +import time import torch @@ -292,3 +295,109 @@ def _get_and_verify_dtype( torch_dtype = _STR_DTYPE_TO_TORCH_DTYPE[torch_dtype] return torch_dtype + + +class SocketProcessGroup: + def __init__(self, master_ip: str, master_port: int, rank: int, world_size: int): + self.master_ip = master_ip + self.master_port = master_port + self.rank = rank + self.world_size = 
world_size + self.sockets = [] + self.max_retries = 100 + self.retry_interval = 2 + + if self.rank == 0: + # Master node: create a server socket + self.server_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM) + self.server_socket.bind((self.master_ip, self.master_port)) + self.server_socket.listen(self.world_size - 1) + print(f"Master node listening on {self.master_ip}:{self.master_port}") + else: + # Worker node: connect to the master + self.client_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM) + retries = 0 + while retries < self.max_retries: + try: + self.client_socket.connect((self.master_ip, self.master_port)) + print(f"Worker {self.rank} connected to master at {self.master_ip}:{self.master_port}") + break + except ConnectionRefusedError: + retries += 1 + print(f"Worker {self.rank} failed to connect to master. Retrying in {self.retry_interval} seconds... ({retries}/{self.max_retries})") + time.sleep(self.retry_interval) + else: + raise ConnectionError(f"Worker {self.rank} could not connect to master at {self.master_ip}:{self.master_port} after {self.max_retries} retries.") + + def initialize_group(self): + if self.rank == 0: + # Master node: accept connections from workers + for _ in range(self.world_size - 1): + conn, addr = self.server_socket.accept() + print(f"Accepted connection from {addr}") + self.sockets.append(conn) + else: + # Worker node: no additional setup needed + pass + + def close(self): + if self.rank == 0: + # Master node: close all worker connections + for conn in self.sockets: + conn.close() + self.server_socket.close() + else: + # Worker node: close connection to master + self.client_socket.close() + + +def stateless_init_dp_group(self) -> SocketProcessGroup: + """ + Initialize a stateless data parallel process group using sockets. + """ + dp_group = SocketProcessGroup( + self.data_parallel_master_ip, + self.get_next_dp_init_port(), + self.data_parallel_rank, + self.data_parallel_size) + dp_group.initialize_group() + return dp_group + + +def has_unfinished_dp(dp_group: SocketProcessGroup, has_unfinished: bool) -> bool: + """ + Check if any process in the group has unfinished tasks. + """ + if dp_group.rank == 0: + # Master node: collect results from workers + results = [has_unfinished] + for conn in dp_group.sockets: + data = conn.recv(1024) + worker_result = pickle.loads(data) + results.append(worker_result) + + # Perform OR operation (any True means unfinished) + aggregated_result = any(results) + + # Broadcast the result back to workers + for conn in dp_group.sockets: + conn.send(pickle.dumps(aggregated_result)) + + return aggregated_result + else: + # Worker node: send result to master + dp_group.client_socket.send(pickle.dumps(has_unfinished)) + + # Receive aggregated result from master + data = dp_group.client_socket.recv(1024) + aggregated_result = pickle.loads(data) + return aggregated_result + +def stateless_destroy_socket_process_group(dp_group: "SocketProcessGroup") -> None: + """ + Destroy the socket-based data parallel process group. + This function closes all sockets and cleans up resources. 
+ """ + if dp_group: + dp_group.close() + print(f"Socket process group for rank {dp_group.rank} destroyed.") diff --git a/vllm_mindspore/executor/ray_gpu_executor.py b/vllm_mindspore/executor/ray_gpu_executor.py index d9c2affd..76a00322 100644 --- a/vllm_mindspore/executor/ray_gpu_executor.py +++ b/vllm_mindspore/executor/ray_gpu_executor.py @@ -225,6 +225,7 @@ def ms_init_workers_ray(self, placement_group: "PlacementGroup", "TPU_HOST_BOUNDS", "VLLM_USE_V1", "VLLM_TRACE_FUNCTION", + "ASCEND_RT_VISIBLE_DEVICES", ]: if name in os.environ: args[name] = os.environ[name] diff --git a/vllm_mindspore/model_executor/models/mf_models/weight_processor.py b/vllm_mindspore/model_executor/models/mf_models/weight_processor.py index 9b0aab3a..82a104f6 100644 --- a/vllm_mindspore/model_executor/models/mf_models/weight_processor.py +++ b/vllm_mindspore/model_executor/models/mf_models/weight_processor.py @@ -21,6 +21,7 @@ import os from safetensors import safe_open from mindspore.communication.management import get_rank, get_group_size +from vllm.distributed import get_tensor_model_parallel_world_size, get_tensor_model_parallel_rank class BaseWeightProcessor: r""" @@ -35,8 +36,8 @@ class BaseWeightProcessor: self.config = config self.network = network self.is_quant = is_quant - self.tp_group_size = get_group_size() - self.rank_id = get_rank() + self.tp_group_size = get_tensor_model_parallel_world_size() + self.rank_id = get_tensor_model_parallel_rank() self.parameter_dict = {} self.file_handles = {} diff --git a/vllm_mindspore/v1/engine/__init__.py b/vllm_mindspore/v1/engine/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/vllm_mindspore/v1/engine/core.py b/vllm_mindspore/v1/engine/core.py new file mode 100644 index 00000000..03c6ff77 --- /dev/null +++ b/vllm_mindspore/v1/engine/core.py @@ -0,0 +1,6 @@ +from vllm_mindspore.config import stateless_destroy_socket_process_group + +def shutdown(self): + super(self.__class__, self).shutdown() + if dp_group := getattr(self, "dp_group", None): + stateless_destroy_socket_process_group(dp_group) diff --git a/vllm_mindspore/v1/worker/gpu_worker.py b/vllm_mindspore/v1/worker/gpu_worker.py new file mode 100644 index 00000000..4cdeca4d --- /dev/null +++ b/vllm_mindspore/v1/worker/gpu_worker.py @@ -0,0 +1,41 @@ +# SPDX-License-Identifier: Apache-2.0 +"""A GPU worker class""" + +import gc +import torch +# import mindspore +from vllm.logger import init_logger + +logger = init_logger(__name__) + +def init_device(self): + from vllm.config import get_current_vllm_config + from vllm.model_executor import set_random_seed + from vllm.v1.worker.gpu_model_runner import GPUModelRunner + from vllm.v1.worker.gpu_worker import ( + _check_if_gpu_supports_dtype, init_worker_distributed_environment) + + config = get_current_vllm_config() + if config is not None and config.parallel_config.data_parallel_size > 1: + device_id = self.parallel_config.data_parallel_rank_local * self.parallel_config.world_size + self.local_rank + self.device = torch.device(f"cuda:{device_id}") + else: + self.device = torch.device(f"cuda:{self.local_rank}") + torch.cuda.set_device(self.device) + + _check_if_gpu_supports_dtype(self.model_config.dtype) + gc.collect() + torch.cuda.empty_cache() + self.init_gpu_memory = torch.cuda.mem_get_info()[0] + + # Initialize the distributed environment. + init_worker_distributed_environment(self.parallel_config, self.rank, + self.distributed_init_method, + self.local_rank) + + # Set random seed. 
+ set_random_seed(self.model_config.seed) + + # Construct the model runner + self.model_runner: GPUModelRunner = GPUModelRunner( + self.vllm_config, self.device) -- Gitee From a1ac04dd40ee8b6cab2ed858609a660c8ec9d4f7 Mon Sep 17 00:00:00 2001 From: wusimin Date: Tue, 22 Apr 2025 17:37:27 +0800 Subject: [PATCH 10/55] =?UTF-8?q?=E8=A7=A3=E5=86=B3DP=E5=90=8E=E9=87=87?= =?UTF-8?q?=E6=A0=B7=E5=A4=84=E7=90=86=E6=8A=A5logits=E5=92=8Ctemp=20shape?= =?UTF-8?q?=E4=B8=8D=E5=8C=B9=E9=85=8D=E9=97=AE=E9=A2=98?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- vllm_mindspore/v1/worker/gpu_input_batch.py | 1 + 1 file changed, 1 insertion(+) diff --git a/vllm_mindspore/v1/worker/gpu_input_batch.py b/vllm_mindspore/v1/worker/gpu_input_batch.py index 6a2254f7..52c13344 100644 --- a/vllm_mindspore/v1/worker/gpu_input_batch.py +++ b/vllm_mindspore/v1/worker/gpu_input_batch.py @@ -16,6 +16,7 @@ def _make_sampling_metadata(self) -> SamplingMetadata: num_reqs = self.num_reqs if not self.all_greedy: temperature = copy_slice(torch.from_numpy(self.temperature_cpu), self.temperature, num_reqs) + temperature = temperature[:num_reqs] else: temperature = None if not self.no_top_p: -- Gitee From 3bf6dc9181eb33436ae8d93acfebaa89c1f36640 Mon Sep 17 00:00:00 2001 From: can-gaa-hou Date: Tue, 22 Apr 2025 03:18:26 +0000 Subject: [PATCH 11/55] fixing mp out-of-memory bugs --- vllm_mindspore/__init__.py | 4 ++++ vllm_mindspore/v1/worker/gpu_worker.py | 13 ++++++++++++- 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/vllm_mindspore/__init__.py b/vllm_mindspore/__init__.py index ce06a685..032415f0 100644 --- a/vllm_mindspore/__init__.py +++ b/vllm_mindspore/__init__.py @@ -302,6 +302,10 @@ from vllm_mindspore.distributed.shm_broadcast import initialize_ShmRingBuffer from vllm.distributed.device_communicators.shm_broadcast import ShmRingBuffer ShmRingBuffer.__init__ = initialize_ShmRingBuffer +from vllm_mindspore.v1.worker.gpu_worker import compile_or_warm_up_model +from vllm.v1.worker.gpu_worker import Worker +Worker.compile_or_warm_up_model = compile_or_warm_up_model + from .utils import check_ready from vllm_mindspore.engine.multiprocessing.engine import cleanup diff --git a/vllm_mindspore/v1/worker/gpu_worker.py b/vllm_mindspore/v1/worker/gpu_worker.py index 4cdeca4d..0395c339 100644 --- a/vllm_mindspore/v1/worker/gpu_worker.py +++ b/vllm_mindspore/v1/worker/gpu_worker.py @@ -3,11 +3,13 @@ import gc import torch -# import mindspore from vllm.logger import init_logger +from vllm.distributed.parallel_state import get_pp_group + logger = init_logger(__name__) + def init_device(self): from vllm.config import get_current_vllm_config from vllm.model_executor import set_random_seed @@ -39,3 +41,12 @@ def init_device(self): # Construct the model runner self.model_runner: GPUModelRunner = GPUModelRunner( self.vllm_config, self.device) + + +def compile_or_warm_up_model(self) -> None: + # MindSpore does not support cuda graph. No need to warm up the model. + # Since prefill is done previously, we do decode here. + default_max_num_reqs = 1 # For MindSpore, we only do one more decode here. 
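+    # Only the last pipeline-parallel rank produces logits and runs the
+    # sampler, so the dummy sampler run is restricted to that rank.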
+ if get_pp_group().is_last_rank: + self.model_runner._dummy_sampler_run(self.model_runner._dummy_run( + num_tokens=default_max_num_reqs)) -- Gitee From 0c5acbd1b2c820d47f967a5e25557eeb5eee060d Mon Sep 17 00:00:00 2001 From: cs123abc Date: Fri, 25 Apr 2025 14:22:50 +0800 Subject: [PATCH 12/55] =?UTF-8?q?topk=20=E6=B5=8B=E8=AF=95=E9=97=AE?= =?UTF-8?q?=E9=A2=98=E4=BF=AE=E5=A4=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- vllm_mindspore/v1/sample/ops/topk_topp_sampler.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm_mindspore/v1/sample/ops/topk_topp_sampler.py b/vllm_mindspore/v1/sample/ops/topk_topp_sampler.py index 57f0a81c..8b0835c0 100644 --- a/vllm_mindspore/v1/sample/ops/topk_topp_sampler.py +++ b/vllm_mindspore/v1/sample/ops/topk_topp_sampler.py @@ -87,11 +87,12 @@ def apply_top_k_only( # Set non-top-k rows to 1 so that we can gather. k = k.masked_fill(no_top_k_mask, 1) max_top_k = k.max() + int_max_top_k = max_top_k.item() # topk.values tensor has shape [batch_size, max_top_k]. # Convert top k to 0-based index in range [0, max_top_k). k_index = k.sub_(1).unsqueeze(1).expand(logits.shape[0], 1) - top_k_mask = logits.topk(max_top_k, dim=1)[0].gather(1, k_index.long()) + top_k_mask = logits.topk(int_max_top_k, dim=1)[0].gather(1, k_index.long()) # Handle non-topk rows. top_k_mask.masked_fill_(no_top_k_mask.unsqueeze(1), -float("inf")) logits.masked_fill_(logits < top_k_mask, -float("inf")) -- Gitee From e9d96bacbe4925d549894972fb255bf9a49936c7 Mon Sep 17 00:00:00 2001 From: tronzhang Date: Wed, 23 Apr 2025 20:53:43 +0800 Subject: [PATCH 13/55] adapte multi-nodes dp serve --- setup.py | 5 + vllm_dp/dp_scale_out.patch | 1416 +++++++++++++++++ vllm_dp/install_dp_vllm.sh | 41 + vllm_mindspore/config.py | 18 +- .../models/mf_models/deepseek_v3.py | 71 +- .../mf_models/deepseekv3_weight_processor.py | 285 +++- .../models/mf_models/weight_processor.py | 79 +- vllm_mindspore/scripts.py | 2 +- 8 files changed, 1833 insertions(+), 84 deletions(-) create mode 100755 vllm_dp/dp_scale_out.patch create mode 100644 vllm_dp/install_dp_vllm.sh diff --git a/setup.py b/setup.py index 5296b18b..2f304bcc 100644 --- a/setup.py +++ b/setup.py @@ -214,4 +214,9 @@ setup( ext_modules=_get_ext_modules(), include_package_data=True, package_data=package_data, + entry_points={ + "console_scripts": [ + "vllm-mindspore=vllm_mindspore.scripts:main", + ], + }, ) diff --git a/vllm_dp/dp_scale_out.patch b/vllm_dp/dp_scale_out.patch new file mode 100755 index 00000000..fb6520d2 --- /dev/null +++ b/vllm_dp/dp_scale_out.patch @@ -0,0 +1,1416 @@ +diff --git a/vllm/config.py b/vllm/config.py +index bd52fc90b..24fc1154d 100644 +--- a/vllm/config.py ++++ b/vllm/config.py +@@ -1429,16 +1429,27 @@ class LoadConfig: + class ParallelConfig: + """Configuration for the distributed execution.""" + +- pipeline_parallel_size: int = 1 # Number of pipeline parallel groups. +- tensor_parallel_size: int = 1 # Number of tensor parallel groups. +- data_parallel_size: int = 1 # Number of data parallel groups. +- data_parallel_rank: int = 0 # Rank of the data parallel group. +- # Local rank of the data parallel group, defaults to global rank. ++ pipeline_parallel_size: int = 1 ++ """Number of pipeline parallel groups.""" ++ tensor_parallel_size: int = 1 ++ """Number of tensor parallel groups.""" ++ data_parallel_size: int = 1 ++ """Number of data parallel groups. 
MoE layers will be sharded according to ++ the product of the tensor parallel size and data parallel size.""" ++ data_parallel_size_local: int = 1 ++ """Number of local data parallel groups.""" ++ data_parallel_rank: int = 0 ++ """Rank of the data parallel group.""" + data_parallel_rank_local: Optional[int] = None + # IP of the data parallel master. + data_parallel_master_ip: str = "127.0.0.1" +- data_parallel_master_port: int = 29500 # Port of the data parallel master. +- enable_expert_parallel: bool = False # Use EP instead of TP for MoE layers. ++ """IP of the data parallel master.""" ++ data_parallel_rpc_port: int = 29550 ++ """Port for data parallel messaging.""" ++ data_parallel_master_port: int = 29500 ++ """Port of the data parallel master.""" ++ enable_expert_parallel: bool = False ++ """Use expert parallelism instead of tensor parallelism for MoE layers.""" + + # Maximum number of multiple batches + # when load model sequentially. To avoid RAM OOM when using tensor +@@ -1475,12 +1486,16 @@ class ParallelConfig: + + # world_size is TPxPP, it affects the number of workers we create. + world_size: int = field(init=False) +- # world_size_across_dp is TPxPPxDP, it is the size of the world +- # including data parallelism. +- world_size_across_dp: int = field(init=False) ++ """world_size is TPxPP, it affects the number of workers we create.""" + + rank: int = 0 + ++ @property ++ def world_size_across_dp(self) -> int: ++ """world_size_across_dp is TPxPPxDP, it is the size of the world ++ including data parallelism.""" ++ return self.world_size * self.data_parallel_size ++ + def get_next_dp_init_port(self) -> int: + """ + We might need to initialize process groups in multiple +@@ -1533,16 +1548,20 @@ class ParallelConfig: + factors: list[Any] = [] + factors.append(self.pipeline_parallel_size) + factors.append(self.tensor_parallel_size) ++ factors.append(self.data_parallel_size) + return hashlib.sha256(str(factors).encode()).hexdigest() + + def __post_init__(self) -> None: + self.world_size = self.pipeline_parallel_size * \ + self.tensor_parallel_size + +- if self.data_parallel_size > 1: ++ if self.data_parallel_size_local > self.data_parallel_size: ++ raise ValueError( ++ "data_parallel_size_local must be <= data_parallel_size") ++ ++ if self.data_parallel_size > 1 or self.data_parallel_size_local == 0: + # Data parallel was specified in the engine args. + self.data_parallel_master_port = get_open_port() +- # TODO multi-node + else: + # Otherwise fall back to env vars (e.g. for offline SPMD case). + self.data_parallel_size = envs.VLLM_DP_SIZE +@@ -1551,8 +1570,6 @@ class ParallelConfig: + self.data_parallel_master_ip = envs.VLLM_DP_MASTER_IP + self.data_parallel_master_port = envs.VLLM_DP_MASTER_PORT + +- self.world_size_across_dp = self.world_size * self.data_parallel_size +- + if self.distributed_executor_backend == "external_launcher": + import os + os.environ["VLLM_ENABLE_V1_MULTIPROCESSING"] = "0" +diff --git a/vllm/distributed/utils.py b/vllm/distributed/utils.py +index cae1a2551..2bdcdf4f1 100644 +--- a/vllm/distributed/utils.py ++++ b/vllm/distributed/utils.py +@@ -21,6 +21,7 @@ from torch.distributed.rendezvous import rendezvous + + import vllm.envs as envs + from vllm.logger import init_logger ++from vllm.utils import get_tcp_uri + + logger = init_logger(__name__) + +@@ -282,7 +283,7 @@ def stateless_init_torch_distributed_process_group( + always formed with process 1, 2, ..., 8, and the additional communication + channel is formed with process 9 and 10. 
+ """ +- init_method = f"tcp://{host}:{port}" ++ init_method = get_tcp_uri(host, port) + backend = Backend(backend) # it is basically string + timeout = _get_default_timeout(backend) + +@@ -301,6 +302,9 @@ def stateless_init_torch_distributed_process_group( + prefix_store, + group_rank, + group_size, ++ ProcessGroup.Options( ++ backend=backend ++ ) + ) + + if backend == "gloo": +@@ -325,7 +329,7 @@ def stateless_init_torch_distributed_process_group( + else: + raise RuntimeError(f"Unsupported torch distributed backend: {backend}") + +- pg._set_default_backend(backend_type) ++ #pg._set_default_backend(backend_type) + backend_class._set_sequence_number_for_group() + + pg._register_backend(device, backend_type, backend_class) +diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py +index 89c9b6747..d6173763d 100644 +--- a/vllm/engine/arg_utils.py ++++ b/vllm/engine/arg_utils.py +@@ -113,11 +113,14 @@ class EngineArgs: + distributed_executor_backend: Optional[Union[str, + Type[ExecutorBase]]] = None + # number of P/D disaggregation (or other disaggregation) workers +- pipeline_parallel_size: int = 1 +- tensor_parallel_size: int = 1 +- data_parallel_size: int = 1 +- enable_expert_parallel: bool = False +- max_parallel_loading_workers: Optional[int] = None ++ pipeline_parallel_size: int = ParallelConfig.pipeline_parallel_size ++ tensor_parallel_size: int = ParallelConfig.tensor_parallel_size ++ data_parallel_size: int = ParallelConfig.data_parallel_size ++ data_parallel_size_local: Optional[int] = None ++ data_parallel_address: Optional[str] = None ++ data_parallel_rpc_port: Optional[int] = None ++ enable_expert_parallel: bool = ParallelConfig.enable_expert_parallel ++ max_parallel_loading_workers: Optional[int] = ParallelConfig.max_parallel_loading_workers + block_size: Optional[int] = None + enable_prefix_caching: Optional[bool] = None + prefix_caching_hash_algo: str = "builtin" +@@ -434,6 +437,21 @@ class EngineArgs: + 'MoE layers will be sharded according to the ' + 'product of the tensor-parallel-size and ' + 'data-parallel-size.') ++ parser.add_argument('--data-parallel-size-local', ++ '-dpl', ++ type=int, ++ help='Number of data parallel replicas ' ++ 'to run on this node.') ++ parser.add_argument('--data-parallel-address', ++ '-dpa', ++ type=str, ++ help='Address of data parallel cluster ' ++ 'head-node.') ++ parser.add_argument('--data-parallel-rpc-port', ++ '-dpp', ++ type=int, ++ help='Port for data parallel RPC ' ++ 'communication.') + parser.add_argument( + '--enable-expert-parallel', + action='store_true', +@@ -1186,10 +1204,30 @@ class EngineArgs: + # but we should not do this here. + placement_group = ray.util.get_current_placement_group() + ++ # Local DP size defaults to global DP size if not set. ++ data_parallel_size_local = self.data_parallel_size if ( ++ self.data_parallel_size_local ++ is None) else self.data_parallel_size_local ++ ++ # DP address, used in multi-node case for torch distributed group ++ # and ZMQ sockets. ++ data_parallel_address = self.data_parallel_address if ( ++ self.data_parallel_address ++ is not None) else ParallelConfig.data_parallel_master_ip ++ ++ # This port is only used when there are remote data parallel engines, ++ # otherwise the local IPC transport is used. 
++ data_parallel_rpc_port = self.data_parallel_rpc_port if ( ++ self.data_parallel_rpc_port ++ is not None) else ParallelConfig.data_parallel_rpc_port ++ + parallel_config = ParallelConfig( + pipeline_parallel_size=self.pipeline_parallel_size, + tensor_parallel_size=self.tensor_parallel_size, + data_parallel_size=self.data_parallel_size, ++ data_parallel_size_local=data_parallel_size_local, ++ data_parallel_master_ip=data_parallel_address, ++ data_parallel_rpc_port=data_parallel_rpc_port, + enable_expert_parallel=self.enable_expert_parallel, + max_parallel_loading_workers=self.max_parallel_loading_workers, + disable_custom_all_reduce=self.disable_custom_all_reduce, +diff --git a/vllm/entrypoints/cli/serve.py b/vllm/entrypoints/cli/serve.py +index e89ac4e21..ffcc2bb10 100644 +--- a/vllm/entrypoints/cli/serve.py ++++ b/vllm/entrypoints/cli/serve.py +@@ -1,14 +1,24 @@ + # SPDX-License-Identifier: Apache-2.0 + + import argparse ++import signal + + import uvloop + ++import vllm.envs as envs ++from vllm import AsyncEngineArgs + from vllm.entrypoints.cli.types import CLISubcommand + from vllm.entrypoints.openai.api_server import run_server + from vllm.entrypoints.openai.cli_args import (make_arg_parser, + validate_parsed_serve_args) +-from vllm.utils import FlexibleArgumentParser ++from vllm.logger import init_logger ++from vllm.usage.usage_lib import UsageContext ++from vllm.utils import FlexibleArgumentParser, get_tcp_uri ++from vllm.v1.engine.core import EngineCoreProc ++from vllm.v1.engine.core_client import CoreEngineProcManager ++from vllm.v1.executor.abstract import Executor ++ ++logger = init_logger(__name__) + + + class ServeSubcommand(CLISubcommand): +@@ -24,7 +34,10 @@ class ServeSubcommand(CLISubcommand): + if hasattr(args, 'model_tag') and args.model_tag is not None: + args.model = args.model_tag + +- uvloop.run(run_server(args)) ++ if args.headless: ++ run_headless(args) ++ else: ++ uvloop.run(run_server(args)) + + def validate(self, args: argparse.Namespace) -> None: + validate_parsed_serve_args(args) +@@ -41,6 +54,18 @@ class ServeSubcommand(CLISubcommand): + nargs='?', + help="The model tag to serve " + "(optional if specified in config)") ++ serve_parser.add_argument( ++ "--headless", ++ action='store_true', ++ default=False, ++ help="Run in headless mode. See multi-node data parallel " ++ "documentation for more details.") ++ serve_parser.add_argument( ++ '--data-parallel-start-rank', ++ '-dpr', ++ type=int, ++ default=0, ++ help='Starting data parallel rank for secondary nodes.') + serve_parser.add_argument( + "--config", + type=str, +@@ -56,3 +81,55 @@ class ServeSubcommand(CLISubcommand): + + def cmd_init() -> list[CLISubcommand]: + return [ServeSubcommand()] ++ ++ ++def run_headless(args: argparse.Namespace): ++ ++ # Create the EngineConfig. ++ engine_args = AsyncEngineArgs.from_cli_args(args) ++ usage_context = UsageContext.OPENAI_API_SERVER ++ vllm_config = engine_args.create_engine_config(usage_context=usage_context) ++ ++ if not envs.VLLM_USE_V1: ++ raise RuntimeError("Headless mode is only supported for V1") ++ ++ parallel_config = vllm_config.parallel_config ++ local_engine_count = parallel_config.data_parallel_size_local ++ host = parallel_config.data_parallel_master_ip ++ port = engine_args.data_parallel_rpc_port # add to config too ++ input_address = get_tcp_uri(host, port) ++ ++ if local_engine_count <= 0: ++ raise RuntimeError("data_parallel_size_local must be > 0 in " ++ "headless mode") ++ ++ # Catch SIGTERM and SIGINT to allow graceful shutdown. 
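++    # The handler raises SystemExit so that the try/finally below can shut
++    # down the engine processes cleanly.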
++ def signal_handler(signum, frame): ++ logger.debug("Received %d signal.", signum) ++ raise SystemExit ++ ++ signal.signal(signal.SIGTERM, signal_handler) ++ signal.signal(signal.SIGINT, signal_handler) ++ ++ logger.info( ++ "Launching %d data parallel engine(s) in headless mode, " ++ "with head node address %s.", local_engine_count, input_address) ++ ++ # Create the engines. ++ engine_manager = CoreEngineProcManager( ++ target_fn=EngineCoreProc.run_engine_core, ++ local_engine_count=local_engine_count, ++ start_index=args.data_parallel_start_rank, ++ local_start_index=0, ++ vllm_config=vllm_config, ++ on_head_node=False, ++ input_address=input_address, ++ executor_class=Executor.get_class(vllm_config), ++ log_stats=not engine_args.disable_log_stats, ++ ) ++ ++ try: ++ engine_manager.join_first() ++ finally: ++ logger.info("Shutting down.") ++ engine_manager.close() +diff --git a/vllm/forward_context.py b/vllm/forward_context.py +index e195a03c5..e5b9fd5fa 100644 +--- a/vllm/forward_context.py ++++ b/vllm/forward_context.py +@@ -77,7 +77,8 @@ def set_forward_context(attn_metadata: Any, + attn_metadata.num_decode_tokens + else: + # for v1 attention backends +- batchsize = attn_metadata.num_input_tokens ++ # batchsize = attn_metadata.num_input_tokens ++ batchsize = len(attn_metadata.seq_lens) + else: + batchsize = num_tokens + num_tokens_across_dp = [0] * dp_size +diff --git a/vllm/utils.py b/vllm/utils.py +index 5f32f8cb6..d38d02586 100644 +--- a/vllm/utils.py ++++ b/vllm/utils.py +@@ -551,6 +551,10 @@ def is_valid_ipv6_address(address: str) -> bool: + + + def get_distributed_init_method(ip: str, port: int) -> str: ++ return get_tcp_uri(ip, port) ++ ++ ++def get_tcp_uri(ip: str, port: int) -> str: + # Brackets are not permitted in ipv4 addresses, + # see https://github.com/python/cpython/issues/103848 + return f"tcp://[{ip}]:{port}" if ":" in ip else f"tcp://{ip}:{port}" +@@ -2189,6 +2193,8 @@ def make_zmq_socket( + ctx: Union[zmq.asyncio.Context, zmq.Context], # type: ignore[name-defined] + path: str, + socket_type: Any, ++ bind: Optional[bool] = None, ++ identity: Optional[bytes] = None, + ) -> Union[zmq.Socket, zmq.asyncio.Socket]: # type: ignore[name-defined] + """Make a ZMQ socket with the proper bind/connect semantics.""" + +@@ -2207,16 +2213,24 @@ def make_zmq_socket( + else: + buf_size = -1 # Use system default buffer size + +- if socket_type == zmq.constants.PULL: +- socket.setsockopt(zmq.constants.RCVHWM, 0) +- socket.setsockopt(zmq.constants.RCVBUF, buf_size) ++ if bind is None: ++ bind = socket_type != zmq.PUSH ++ ++ if socket_type in (zmq.PULL, zmq.DEALER, zmq.ROUTER): ++ socket.setsockopt(zmq.RCVHWM, 0) ++ socket.setsockopt(zmq.RCVBUF, buf_size) ++ ++ if socket_type in (zmq.PUSH, zmq.DEALER, zmq.ROUTER): ++ socket.setsockopt(zmq.SNDHWM, 0) ++ socket.setsockopt(zmq.SNDBUF, buf_size) ++ ++ if identity is not None: ++ socket.setsockopt(zmq.IDENTITY, identity) ++ ++ if bind: + socket.bind(path) +- elif socket_type == zmq.constants.PUSH: +- socket.setsockopt(zmq.constants.SNDHWM, 0) +- socket.setsockopt(zmq.constants.SNDBUF, buf_size) +- socket.connect(path) + else: +- raise ValueError(f"Unknown Socket Type: {socket_type}") ++ socket.connect(path) + + return socket + +@@ -2225,14 +2239,19 @@ def make_zmq_socket( + def zmq_socket_ctx( + path: str, + socket_type: Any, ++ bind: Optional[bool] = None, + linger: int = 0, ++ identity: Optional[bytes] = None, + ) -> Iterator[zmq.Socket]: + """Context manager for a ZMQ socket""" + + ctx = zmq.Context() # type: ignore[attr-defined] + try: 
+- yield make_zmq_socket(ctx, path, socket_type) +- ++ yield make_zmq_socket(ctx, ++ path, ++ socket_type, ++ bind=bind, ++ identity=identity) + except KeyboardInterrupt: + logger.debug("Got Keyboard Interrupt.") + +diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py +index 39caca0c2..32902a8b1 100644 +--- a/vllm/v1/engine/core.py ++++ b/vllm/v1/engine/core.py +@@ -23,7 +23,7 @@ from vllm.lora.request import LoRARequest + from vllm.transformers_utils.config import ( + maybe_register_config_serialize_by_value) + from vllm.utils import (get_exception_traceback, resolve_obj_by_qualname, +- zmq_socket_ctx) ++ make_zmq_socket, resolve_obj_by_qualname, zmq_socket_ctx) + from vllm.v1.core.kv_cache_utils import (get_kv_cache_config, + unify_kv_cache_configs) + from vllm.v1.core.sched.interface import SchedulerInterface +@@ -43,6 +43,7 @@ from vllm.version import __version__ as VLLM_VERSION + logger = init_logger(__name__) + + POLLING_TIMEOUT_S = 2.5 ++HANDSHAKE_TIMEOUT_MINS = 5 + + _R = TypeVar('_R') # Return type for collective_rpc + +@@ -306,43 +307,111 @@ class EngineCore: + + class EngineCoreProc(EngineCore): + """ZMQ-wrapper for running EngineCore in background process.""" ++ ENGINE_CORE_DEAD = b'ENGINE_CORE_DEAD' + + def __init__( + self, +- input_path: str, +- output_path: str, + vllm_config: VllmConfig, ++ on_head_node: bool, ++ input_address: str, + executor_class: type[Executor], + log_stats: bool, + engine_index: int = 0, + ): +- super().__init__(vllm_config, executor_class, log_stats) +- +- # Background Threads and Queues for IO. These enable us to +- # overlap ZMQ socket IO with GPU since they release the GIL, +- # and to overlap some serialization/deserialization with the +- # model forward pass. +- # Threads handle Socket <-> Queues and core_busy_loop uses Queue. +- self.input_queue: queue.Queue[tuple[EngineCoreRequestType, +- Any]] = queue.Queue() +- self.output_queue: queue.Queue[EngineCoreOutputs] = queue.Queue() +- threading.Thread(target=self.process_input_socket, +- args=(input_path, ), +- daemon=True).start() +- threading.Thread(target=self.process_output_socket, +- args=(output_path, engine_index), +- daemon=True).start() +- +- self.global_unfinished_reqs = False +- +- self.step_fn = (self.step if self.batch_queue is None else +- self.step_with_batch_queue) ++ input_queue = queue.Queue[tuple[EngineCoreRequestType, Any]]() ++ ++# GZQ DP patch TODO: executor_fail_callback is useful sometimes for reliabiltiy issues, should add later ++ executor_fail_callback = lambda: input_queue.put_nowait( ++ (EngineCoreRequestType.EXECUTOR_FAILED, b'')) ++ ++ # Create input socket. ++ input_ctx = zmq.Context() ++ identity = engine_index.to_bytes(length=2, byteorder="little") ++ input_socket = make_zmq_socket(input_ctx, ++ input_address, ++ zmq.DEALER, ++ identity=identity, ++ bind=False) ++ try: ++ # Register engine with front-end. ++ output_address = self.startup_handshake( ++ input_socket, on_head_node, vllm_config.parallel_config) ++ ++ # Update config which may have changed from the handshake. ++ vllm_config.__post_init__() ++ ++ # Set up data parallel environment. ++ self._init_data_parallel(vllm_config) ++ ++ # Initialize engine core and model. ++ super().__init__(vllm_config, executor_class, log_stats) ++ ++ self.step_fn = (self.step if self.batch_queue is None else ++ self.step_with_batch_queue) ++ ++ self.global_unfinished_reqs = False ++ ++ # Send ready message. 
++ input_socket.send( ++ msgspec.msgpack.encode({ ++ "status": "READY", ++ "local": on_head_node ++ })) ++ ++ # Background Threads and Queues for IO. These enable us to ++ # overlap ZMQ socket IO with GPU since they release the GIL, ++ # and to overlap some serialization/deserialization with the ++ # model forward pass. ++ # Threads handle Socket <-> Queues and core_busy_loop uses Queue. ++ self.input_queue = input_queue ++ self.output_queue = queue.Queue[Union[EngineCoreOutputs, bytes]]() ++ threading.Thread(target=self.process_input_socket, ++ args=(input_socket, ), ++ daemon=True).start() ++ input_socket = None ++ self.output_thread = threading.Thread( ++ target=self.process_output_socket, ++ args=(output_address, engine_index), ++ daemon=True) ++ self.output_thread.start() ++ finally: ++ if input_socket is not None: ++ input_socket.close(linger=0) ++ ++ @staticmethod ++ def startup_handshake(input_socket: zmq.Socket, on_head_node: bool, ++ parallel_config: ParallelConfig) -> str: ++ ++ # Send registration message. ++ input_socket.send( ++ msgspec.msgpack.encode({ ++ "status": "HELLO", ++ "local": on_head_node, ++ })) ++ ++ # Receive initialization message. ++ logger.info("Waiting for init message from front-end.") ++ if not input_socket.poll(timeout=HANDSHAKE_TIMEOUT_MINS * 60 * 1000): ++ raise RuntimeError("Did not receive response from front-end " ++ f"process within {HANDSHAKE_TIMEOUT_MINS} " ++ f"minutes") ++ init_bytes = input_socket.recv() ++ init_message = msgspec.msgpack.decode(init_bytes) ++ logger.debug("Received init message: %s", init_message) ++ ++ output_socket_address = init_message["output_socket_address"] ++ #TBD(nick) maybe replace IP with configured head node address ++ ++ received_parallel_config = init_message["parallel_config"] ++ for key, value in received_parallel_config.items(): ++ setattr(parallel_config, key, value) ++ ++ return output_socket_address + + @staticmethod + def run_engine_core(*args, + dp_rank: int = 0, + local_dp_rank: int = 0, +- ready_pipe, + **kwargs): + """Launch EngineCore busy loop in background process.""" + +@@ -369,7 +438,7 @@ class EngineCoreProc(EngineCore): + try: + parallel_config: ParallelConfig = kwargs[ + "vllm_config"].parallel_config +- if parallel_config.data_parallel_size > 1: ++ if parallel_config.data_parallel_size > 1 or dp_rank > 0: + # Set data parallel rank for this engine process. + parallel_config.data_parallel_rank = dp_rank + parallel_config.data_parallel_rank_local = local_dp_rank +@@ -377,9 +446,6 @@ class EngineCoreProc(EngineCore): + else: + engine_core = EngineCoreProc(*args, **kwargs) + +- # Send Readiness signal to EngineClient. +- ready_pipe.send({"status": "READY"}) +- + engine_core.run_busy_loop() + + except SystemExit: +@@ -394,6 +460,9 @@ class EngineCoreProc(EngineCore): + if engine_core is not None: + engine_core.shutdown() + ++ def _init_data_parallel(self, vllm_config: VllmConfig): ++ pass ++ + def run_busy_loop(self): + """Core busy loop of the EngineCore.""" + +@@ -476,27 +545,37 @@ class EngineCoreProc(EngineCore): + and not isinstance(v, p.annotation) else v + for v, p in zip(args, arg_types)) + +- def process_input_socket(self, input_path: str): ++ def _send_engine_dead(self): ++ """Send EngineDead status to the EngineCoreClient.""" ++ ++ # Put ENGINE_CORE_DEAD in the queue. ++ self.output_queue.put_nowait(EngineCoreProc.ENGINE_CORE_DEAD) ++ ++ # Wait until msg sent by the daemon before shutdown. 
++ self.output_thread.join(timeout=5.0) ++ if self.output_thread.is_alive(): ++ logger.fatal("vLLM shutdown signal from EngineCore failed " ++ "to send. Please report this issue.") ++ ++ def process_input_socket(self, input_socket: zmq.Socket): + """Input socket IO thread.""" + + # Msgpack serialization decoding. + add_request_decoder = MsgpackDecoder(EngineCoreRequest) + generic_decoder = MsgpackDecoder() + +- with zmq_socket_ctx(input_path, zmq.constants.PULL) as socket: +- while True: +- # (RequestType, RequestData) +- type_frame, data_frame = socket.recv_multipart(copy=False) +- request_type = EngineCoreRequestType(bytes(type_frame.buffer)) ++ while True: ++ # (RequestType, RequestData) ++ type_frame, data_frames = input_socket.recv_multipart(copy=False) ++ request_type = EngineCoreRequestType(bytes(type_frame.buffer)) + +- # Deserialize the request data. +- decoder = add_request_decoder if ( +- request_type +- == EngineCoreRequestType.ADD) else generic_decoder +- request = decoder.decode(data_frame.buffer) ++ # Deserialize the request data. ++ decoder = add_request_decoder if ( ++ request_type == EngineCoreRequestType.ADD) else generic_decoder ++ request = decoder.decode(data_frames) + +- # Push to input queue for core busy loop. +- self.input_queue.put_nowait((request_type, request)) ++ # Push to input queue for core busy loop. ++ self.input_queue.put_nowait((request_type, request)) + + def process_output_socket(self, output_path: str, engine_index: int): + """Output socket IO thread.""" +@@ -523,9 +602,9 @@ class DPEngineCoreProc(EngineCoreProc): + + def __init__( + self, +- input_path: str, +- output_path: str, + vllm_config: VllmConfig, ++ on_head_node: bool, ++ input_address: str, + executor_class: type[Executor], + log_stats: bool, + ): +@@ -537,8 +616,20 @@ class DPEngineCoreProc(EngineCoreProc): + _add_prefix(sys.stdout, process_name, pid) + _add_prefix(sys.stderr, process_name, pid) + +- dp_size = vllm_config.parallel_config.data_parallel_size ++ # Counts forward-passes of the model so that we can synchronize ++ # finished with DP peers every N steps. ++ self.counter = 0 ++ ++ # Initialize the engine. ++ dp_rank = vllm_config.parallel_config.data_parallel_rank ++ super().__init__(vllm_config, on_head_node, input_address, ++ executor_class, log_stats, dp_rank) ++ ++ def _init_data_parallel(self, vllm_config: VllmConfig): ++ ++ # Configure GPUs and stateless process group for data parallel. + dp_rank = vllm_config.parallel_config.data_parallel_rank ++ dp_size = vllm_config.parallel_config.data_parallel_size + local_dp_rank = vllm_config.parallel_config.data_parallel_rank_local + + assert dp_size > 1 +@@ -547,22 +638,14 @@ class DPEngineCoreProc(EngineCoreProc): + from vllm.platforms import current_platform + if current_platform.is_cuda_alike(): + from vllm.platforms.cuda import device_id_to_physical_device_id +- tp_size = vllm_config.parallel_config.tensor_parallel_size ++ world_size = vllm_config.parallel_config.world_size + os.environ["CUDA_VISIBLE_DEVICES"] = ",".join( + str(device_id_to_physical_device_id(i)) +- for i in range(local_dp_rank * tp_size, (local_dp_rank + 1) * +- tp_size)) ++ for i in range(local_dp_rank * ++ world_size, (local_dp_rank + 1) * world_size)) + + self.dp_group = vllm_config.parallel_config.stateless_init_dp_group() + +- # Initialize the engine after setting up environment. 
+- super().__init__(input_path, output_path, vllm_config, executor_class, +- log_stats, dp_rank) +- +- # Counts forward-passes of the model so that we can synchronize +- # finished with DP peers every N steps. +- self.counter = 0 +- + def shutdown(self): + super().shutdown() + if dp_group := getattr(self, "dp_group", None): +diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py +index e948e59b8..e443f45db 100644 +--- a/vllm/v1/engine/core_client.py ++++ b/vllm/v1/engine/core_client.py +@@ -8,26 +8,29 @@ import threading + import uuid + import weakref + from abc import ABC, abstractmethod +-from collections.abc import Awaitable, Sequence ++from collections.abc import Awaitable + from concurrent.futures import Future +-from dataclasses import dataclass, field ++from dataclasses import dataclass ++from enum import Enum, auto + from threading import Thread + from typing import Any, Callable, Optional, TypeVar, Union + ++import msgspec + import zmq + import zmq.asyncio + +-from vllm.config import VllmConfig ++from vllm.config import ParallelConfig, VllmConfig + from vllm.logger import init_logger + from vllm.lora.request import LoRARequest +-from vllm.utils import (get_open_zmq_inproc_path, get_open_zmq_ipc_path, +- kill_process_tree, make_zmq_socket) ++ ++from vllm.utils import (get_open_port, get_open_zmq_inproc_path, ++ get_open_zmq_ipc_path, kill_process_tree, get_tcp_uri, make_zmq_socket) + from vllm.v1.engine import (EngineCoreOutputs, EngineCoreRequest, + EngineCoreRequestType, UtilityOutput) + from vllm.v1.engine.core import EngineCore, EngineCoreProc + from vllm.v1.executor.abstract import Executor +-from vllm.v1.serial_utils import MsgpackDecoder, MsgpackEncoder +-from vllm.v1.utils import BackgroundProcHandle ++from vllm.v1.serial_utils import MsgpackDecoder, MsgpackEncoder, bytestr ++from vllm.v1.utils import CoreEngineProcManager + + logger = init_logger(__name__) + +@@ -35,6 +38,8 @@ AnyFuture = Union[asyncio.Future[Any], Future[Any]] + + _R = TypeVar('_R') # Return type for collective_rpc + ++STARTUP_POLL_PERIOD_MS = 10000 ++ + + class EngineCoreClient(ABC): + """ +@@ -253,52 +258,21 @@ class InprocClient(EngineCoreClient): + return self.engine_core.collective_rpc(method, timeout, args, kwargs) + + +-class CoreEngine: +- """One per data parallel rank.""" ++class CoreEngineState(Enum): ++ NEW = auto() ++ CONNECTED = auto() ++ READY = auto() + +- def __init__( +- self, +- vllm_config: VllmConfig, +- executor_class: type[Executor], +- log_stats: bool, +- ctx: Union[zmq.Context, zmq.asyncio.Context], +- output_path: str, +- index: int = 0, +- local_dp_rank: int = 0, +- ): +- # Paths and sockets for IPC. +- input_path = get_open_zmq_ipc_path() +- self.input_socket = make_zmq_socket(ctx, input_path, +- zmq.constants.PUSH) +- try: +- # Start EngineCore in background process. +- self.proc_handle = BackgroundProcHandle( +- input_path=input_path, +- output_path=output_path, +- process_name=f"EngineCore_{index}", +- target_fn=EngineCoreProc.run_engine_core, +- process_kwargs={ +- "vllm_config": vllm_config, +- "dp_rank": index, +- "local_dp_rank": local_dp_rank, +- "executor_class": executor_class, +- "log_stats": log_stats, +- }) + +- self.num_reqs_in_flight = 0 +- finally: +- if not hasattr(self, "num_reqs_in_flight"): +- # Ensure socket is closed if process fails to start. 
+- self.close() ++class CoreEngine: ++ """One per data parallel rank.""" + +- def send_multipart(self, msg_parts: Sequence): +- return self.input_socket.send_multipart(msg_parts, copy=False) ++ def __init__(self, index: int = 0, local: bool = True): ++ self.local = local ++ self.identity = index.to_bytes(length=2, byteorder="little") + +- def close(self): +- if proc_handle := getattr(self, "proc_handle", None): +- proc_handle.shutdown() +- if socket := getattr(self, "input_socket", None): +- socket.close(linger=0) ++ self.state = CoreEngineState.NEW ++ self.num_reqs_in_flight = 0 + + + @dataclass +@@ -307,20 +281,23 @@ class BackgroundResources: + circular reference back to the client object.""" + + ctx: Union[zmq.Context] +- core_engines: list[CoreEngine] = field(default_factory=list) ++ local_engine_manager: Optional[CoreEngineProcManager] = None + output_socket: Optional[Union[zmq.Socket, zmq.asyncio.Socket]] = None ++ input_socket: Optional[Union[zmq.Socket, zmq.asyncio.Socket]] = None + shutdown_path: Optional[str] = None + + def __call__(self): + """Clean up background resources.""" + +- for core_engine in self.core_engines: +- core_engine.close() ++ if self.local_engine_manager is not None: ++ self.local_engine_manager.close() + + # ZMQ context termination can hang if the sockets + # aren't explicitly closed first. + if self.output_socket is not None: + self.output_socket.close(linger=0) ++ if self.input_socket is not None: ++ self.input_socket.close(linger=0) + if self.shutdown_path is not None: + # We must ensure that the sync output socket is + # closed cleanly in its own thread. +@@ -384,38 +361,169 @@ class MPClient(EngineCoreClient): + # exception is raised mid-construction. + self.resources = BackgroundResources(ctx=sync_ctx) + self._finalizer = weakref.finalize(self, self.resources) ++ success = False ++ try: ++ parallel_config = vllm_config.parallel_config ++ local_engine_count = parallel_config.data_parallel_size_local ++ start_index = parallel_config.data_parallel_rank ++ local_start_index = parallel_config.data_parallel_rank_local ++ ++ # SPMD mode is where there is an LLM instance per DP rank and ++ # one core engine per LLM, see ++ # examples/offline_inference/data_parallel.py. ++ spmd_mode = local_start_index is not None ++ if spmd_mode: ++ assert local_engine_count == 1 ++ self.core_engines = [ ++ CoreEngine(index=local_start_index, local=True) ++ ] ++ else: ++ assert start_index == 0 ++ local_start_index = 0 ++ self.core_engines = [ ++ CoreEngine(index=i, local=(i < local_engine_count)) ++ for i in range(parallel_config.data_parallel_size) ++ ] + +- # Paths and sockets for IPC. +- self.output_path = get_open_zmq_ipc_path() +- +- new_core_engine = lambda index, local_dp_rank=None: CoreEngine( +- vllm_config, executor_class, log_stats, self.ctx, self.output_path, +- index, local_dp_rank) +- +- # Start engine core process(es). +- self._init_core_engines(vllm_config, new_core_engine, +- self.resources.core_engines) +- +- # Wait for engine core process(es) to start. +- for engine in self.resources.core_engines: +- engine.proc_handle.wait_for_startup() +- +- self.utility_results: dict[int, AnyFuture] = {} ++ input_address, output_address = self._get_zmq_addresses( ++ parallel_config, spmd_mode) ++ ++ # Create input and output sockets. 
++ self.input_socket = self.resources.input_socket = make_zmq_socket( ++ self.ctx, input_address, zmq.ROUTER, bind=True) ++ ++ self.resources.output_socket = make_zmq_socket( ++ self.ctx, output_address, zmq.constants.PULL) ++ # Start local engines. ++ if local_engine_count: ++ # In server mode, start_index and local_start_index will ++ # both be 0. ++ self.resources.local_engine_manager = CoreEngineProcManager( ++ EngineCoreProc.run_engine_core, ++ vllm_config=vllm_config, ++ executor_class=executor_class, ++ log_stats=log_stats, ++ input_address=input_address, ++ on_head_node=True, ++ local_engine_count=local_engine_count, ++ start_index=start_index, ++ local_start_index=local_start_index) ++ ++ self.core_engine = self.core_engines[0] ++ ++ # Wait for engine core process(es) to start. ++ self._wait_for_engine_startup(output_address, parallel_config) ++ ++ self.utility_results: dict[int, AnyFuture] = {} ++ success = True ++ finally: ++ if not success: ++ self._finalizer() + +- def _init_core_engines( +- self, +- vllm_config: VllmConfig, +- new_core_engine: Callable[[int, Optional[int]], CoreEngine], +- core_engines: list[CoreEngine], +- ) -> None: +- +- # Default case - single core engine. +- dp_rank = vllm_config.parallel_config.data_parallel_rank +- local_dp_rank = vllm_config.parallel_config.data_parallel_rank_local +- core_engine = new_core_engine( +- dp_rank, local_dp_rank if local_dp_rank is not None else dp_rank) +- core_engines.append(core_engine) +- self.core_engine = core_engine ++ @staticmethod ++ def _get_zmq_addresses(parallel_config: ParallelConfig, ++ spmd_mode: bool) -> tuple[str, str]: ++ """Returns (input_address, output_address).""" ++ dp_size = parallel_config.data_parallel_size ++ local_engine_count = parallel_config.data_parallel_size_local ++ ++ if local_engine_count == dp_size or spmd_mode: ++ input_address = get_open_zmq_ipc_path() ++ output_address = get_open_zmq_ipc_path() ++ else: ++ host = parallel_config.data_parallel_master_ip ++ input_port = parallel_config.data_parallel_rpc_port ++ output_port = get_open_port() ++ input_address = get_tcp_uri(host, input_port) ++ output_address = get_tcp_uri(host, output_port) ++ ++ return input_address, output_address ++ ++ def _wait_for_engine_startup(self, output_address: str, ++ parallel_config: ParallelConfig): ++ # Get a sync handle to the socket which can be sync or async. ++ sync_input_socket = zmq.Socket.shadow(self.input_socket) ++ ++ # Wait for engine core process(es) to send ready messages. ++ local_count = parallel_config.data_parallel_size_local ++ remote_count = len(self.core_engines) - local_count ++ # [local, remote] counts ++ conn_pending, start_pending = [local_count, remote_count], [0, 0] ++ ++ poller = zmq.Poller() ++ poller.register(sync_input_socket, zmq.POLLIN) ++ proc_manager = self.resources.local_engine_manager ++ if proc_manager is not None: ++ for sentinel in proc_manager.sentinels(): ++ poller.register(sentinel, zmq.POLLIN) ++ while any(conn_pending) or any(start_pending): ++ events = poller.poll(STARTUP_POLL_PERIOD_MS) ++ if not events: ++ if any(conn_pending): ++ logger.debug( ++ "Waiting for %d local, %d remote core engine proc(s) " ++ "to connect.", *conn_pending) ++ if any(start_pending): ++ logger.debug( ++ "Waiting for %d local, %d remote core engine proc(s) " ++ "to start.", *start_pending) ++ continue ++ if len(events) > 1 or events[0][0] != sync_input_socket: ++ # One of the local core processes exited. 
++ finished = proc_manager.finished_procs( ++ ) if proc_manager else {} ++ raise RuntimeError("Engine core initialization failed. " ++ "See root cause above. " ++ f"Failed core proc(s): {finished}") ++ ++ # Receive HELLO and READY messages from the input socket. ++ eng_identity, ready_msg_bytes = sync_input_socket.recv_multipart() ++ eng_index = int.from_bytes(eng_identity, byteorder="little") ++ engine = next( ++ (e for e in self.core_engines if e.identity == eng_identity), ++ None) ++ if engine is None: ++ raise RuntimeError(f"Message from engine with unexpected data " ++ f"parallel rank: {eng_index}") ++ msg = msgspec.msgpack.decode(ready_msg_bytes) ++ status, local = msg["status"], msg["local"] ++ if local != engine.local: ++ raise RuntimeError(f"{status} message from " ++ f"{'local' if local else 'remote'} " ++ f"engine {eng_index}, expected it to be " ++ f"{'local' if engine.local else 'remote'}") ++ ++ if status == "HELLO" and engine.state == CoreEngineState.NEW: ++ ++ # Send init message with DP config info. ++ init_message = self.encoder.encode({ ++ "output_socket_address": output_address, ++ "parallel_config": { ++ "data_parallel_master_ip": ++ parallel_config.data_parallel_master_ip, ++ "data_parallel_master_port": ++ parallel_config.data_parallel_master_port, ++ "data_parallel_size": ++ parallel_config.data_parallel_size, ++ }, ++ }) ++ sync_input_socket.send_multipart((eng_identity, init_message), ++ copy=False) ++ conn_pending[0 if local else 1] -= 1 ++ start_pending[0 if local else 1] += 1 ++ engine.state = CoreEngineState.CONNECTED ++ elif status == "READY" and (engine.state ++ == CoreEngineState.CONNECTED): ++ start_pending[0 if local else 1] -= 1 ++ engine.state = CoreEngineState.READY ++ else: ++ raise RuntimeError(f"Unexpected {status} message for " ++ f"{'local' if local else 'remote'} engine " ++ f"{eng_index} in {engine.state} state.") ++ ++ logger.debug("%s from %s core engine process %s.", status, ++ "local" if local else "remote", eng_index) ++# >>>>>>> fbe7575cc... squashed commit of pr#15977 + + def shutdown(self): + self._finalizer() +@@ -448,7 +556,8 @@ class SyncMPClient(MPClient): + # Ensure that the outputs socket processing thread does not have + # a ref to the client which prevents gc. + ctx = self.ctx +- output_path = self.output_path ++ out_socket = self.resources.output_socket ++ assert out_socket is not None + decoder = self.decoder + utility_results = self.utility_results + outputs_queue = self.outputs_queue +@@ -458,7 +567,6 @@ class SyncMPClient(MPClient): + + def process_outputs_socket(): + shutdown_socket = ctx.socket(zmq.PAIR) +- out_socket = make_zmq_socket(ctx, output_path, zmq.constants.PULL) + try: + shutdown_socket.bind(shutdown_path) + poller = zmq.Poller() +@@ -490,13 +598,17 @@ class SyncMPClient(MPClient): + daemon=True) + self.output_queue_thread.start() + ++ # The thread takes on responsibility for closing the socket. 
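# Editor's illustrative sketch (not part of the patch): the HELLO/READY handshake that
# _wait_for_engine_startup() above drives, reduced to a single process over an inproc
# transport. The address "inproc://handshake-demo" and engine index 1 are assumptions
# made only for this demo; the real code runs the two halves in separate processes
# (EngineCoreProc vs. MPClient).
import msgspec
import zmq

ctx = zmq.Context()
frontend = ctx.socket(zmq.ROUTER)    # plays the MPClient input socket
frontend.bind("inproc://handshake-demo")

engine = ctx.socket(zmq.DEALER)      # plays the EngineCoreProc input socket
engine.setsockopt(zmq.IDENTITY, (1).to_bytes(2, "little"))
engine.connect("inproc://handshake-demo")

# Engine side: register with HELLO.
engine.send(msgspec.msgpack.encode({"status": "HELLO", "local": True}))

# Front-end side: read HELLO, reply with the init message (output socket address plus
# the DP fields that startup_handshake() copies into ParallelConfig).
identity, hello = frontend.recv_multipart()
assert msgspec.msgpack.decode(hello)["status"] == "HELLO"
frontend.send_multipart((identity, msgspec.msgpack.encode({
    "output_socket_address": "ipc:///tmp/demo-output",
    "parallel_config": {"data_parallel_size": 1},
})))

# Engine side: apply the config, then report READY once fully initialized.
init_message = msgspec.msgpack.decode(engine.recv())
assert "output_socket_address" in init_message
engine.send(msgspec.msgpack.encode({"status": "READY", "local": True}))

# Front-end side: the engine moves NEW -> CONNECTED -> READY.
identity, ready = frontend.recv_multipart()
assert msgspec.msgpack.decode(ready)["status"] == "READY"
ctx.destroy(linger=0)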
++ self.resources.output_socket = None ++ + def get_output(self) -> EngineCoreOutputs: + return self.outputs_queue.get() + + def _send_input(self, request_type: EngineCoreRequestType, request: Any): +- # (RequestType, SerializedRequest) +- msg = (request_type.value, self.encoder.encode(request)) +- self.core_engine.send_multipart(msg) ++ # (Identity, RequestType, SerializedRequest) ++ msg = (self.core_engine.identity, request_type.value, ++ self.encoder.encode(request)) ++ self.input_socket.send_multipart(msg, copy=False) + + def call_utility(self, method: str, *args) -> Any: + call_id = uuid.uuid1().int >> 64 +@@ -581,6 +693,7 @@ class AsyncMPClient(MPClient): + [AsyncMPClient, EngineCoreOutputs], Awaitable[None]]] = None + + def _ensure_output_queue_task(self): ++ resources = self.resources + if self.outputs_queue is not None: + return + +@@ -592,10 +705,8 @@ class AsyncMPClient(MPClient): + outputs_queue = self.outputs_queue + output_handler = self.outputs_handler + _self_ref = weakref.ref(self) if output_handler else None +- output_path = self.output_path +- output_socket = make_zmq_socket(self.ctx, output_path, +- zmq.constants.PULL) +- self.resources.output_socket = output_socket ++ output_socket = resources.output_socket ++ assert output_socket is not None + + async def process_outputs_socket(): + while True: +@@ -625,30 +736,34 @@ class AsyncMPClient(MPClient): + assert self.outputs_queue is not None + return await self.outputs_queue.get() + +- async def _send_input(self, request_type: EngineCoreRequestType, +- request: Any) -> None: +- await self.core_engine.send_multipart( +- (request_type.value, self.encoder.encode(request))) ++ def _send_input(self, ++ request_type: EngineCoreRequestType, ++ request: Any, ++ engine: Optional[CoreEngine] = None) -> Awaitable[None]: ++ if engine is None: ++ engine = self.core_engine + +- self._ensure_output_queue_task() ++ message = (request_type.value, self.encoder.encode(request)) ++ return self._send_input_message(message, engine) ++ ++ def _send_input_message(self, message: tuple[bytes, bytes], ++ engine: CoreEngine) -> Awaitable[None]: ++ message = (engine.identity, ) + message # type: ignore[assignment] ++ return self.input_socket.send_multipart(message, copy=False) + + async def call_utility_async(self, method: str, *args) -> Any: + return await self._call_utility_async(method, + *args, + engine=self.core_engine) + +- async def _call_utility_async( +- self, +- method: str, +- *args, +- engine: CoreEngine, +- ) -> Any: ++ async def _call_utility_async(self, method: str, *args, ++ engine: CoreEngine) -> Any: + call_id = uuid.uuid1().int >> 64 + future = asyncio.get_running_loop().create_future() + self.utility_results[call_id] = future + message = (EngineCoreRequestType.UTILITY.value, + self.encoder.encode((call_id, method, args))) +- await engine.send_multipart(message) ++ await self._send_input_message(message, engine) + self._ensure_output_queue_task() + return await future + +@@ -657,6 +772,7 @@ class AsyncMPClient(MPClient): + # tokenized. 
+ request.prompt = None + await self._send_input(EngineCoreRequestType.ADD, request) ++ self._ensure_output_queue_task() + + async def abort_requests_async(self, request_ids: list[str]) -> None: + if len(request_ids) > 0: +@@ -728,21 +844,6 @@ class DPAsyncMPClient(AsyncMPClient): + + self.outputs_handler = DPAsyncMPClient.process_engine_outputs # type: ignore[assignment] + +- def _init_core_engines( +- self, +- vllm_config: VllmConfig, +- new_core_engine: Callable[[int, Optional[int]], CoreEngine], +- core_engines: list[CoreEngine], +- ) -> None: +- +- # Launch a core engine for each data parallel rank. +- dp_size = vllm_config.parallel_config.data_parallel_size +- for i in range(dp_size): +- # Multi-node not yet supported so local_dp_rank == dp_rank. +- core_engines.append(new_core_engine(i, i)) +- +- self.core_engines = core_engines +- + async def call_utility_async(self, method: str, *args) -> Any: + # Only the result from the first engine is returned. + return (await asyncio.gather(*[ +@@ -761,15 +862,15 @@ class DPAsyncMPClient(AsyncMPClient): + self.reqs_in_flight[request.request_id] = chosen_engine + chosen_engine.num_reqs_in_flight += 1 + if self.num_engines_running >= len(self.core_engines): +- await chosen_engine.send_multipart(msg) ++ await self._send_input_message(msg, chosen_engine) + else: + # Send request to chosen engine and dp start loop + # control message to all other engines. + self.num_engines_running += len(self.core_engines) + await asyncio.gather(*[ +- engine.send_multipart(msg if engine is +- chosen_engine else self.start_dp_msg) +- for engine in self.core_engines ++ self._send_input_message( ++ msg if engine is chosen_engine else self.start_dp_msg, ++ engine) for engine in self.core_engines + ]) + + self._ensure_output_queue_task() +@@ -794,7 +895,7 @@ class DPAsyncMPClient(AsyncMPClient): + # sure to start the other engines: + self.num_engines_running = len(self.core_engines) + coros = [ +- engine.send_multipart(self.start_dp_msg) ++ self._send_input_message(self.start_dp_msg, engine) + for engine in self.core_engines + if not engine.num_reqs_in_flight + ] +@@ -820,5 +921,5 @@ class DPAsyncMPClient(AsyncMPClient): + + async def _abort_requests(self, request_ids: list[str], + engine: CoreEngine) -> None: +- await engine.send_multipart((EngineCoreRequestType.ABORT.value, +- self.encoder.encode(request_ids))) ++ await self._send_input(EngineCoreRequestType.ABORT, request_ids, ++ engine) +diff --git a/vllm/v1/serial_utils.py b/vllm/v1/serial_utils.py +index 146d7d747..7c1d48494 100644 +--- a/vllm/v1/serial_utils.py ++++ b/vllm/v1/serial_utils.py +@@ -2,9 +2,10 @@ + + import pickle + from types import FunctionType +-from typing import Any, Optional ++from typing import Any, Optional, Union + + import cloudpickle ++import zmq + import torch + from msgspec import msgpack + +@@ -12,6 +13,7 @@ CUSTOM_TYPE_TENSOR = 1 + CUSTOM_TYPE_PICKLE = 2 + CUSTOM_TYPE_CLOUDPICKLE = 3 + ++bytestr = Union[bytes, bytearray, memoryview, zmq.Frame] + + class MsgpackEncoder: + """Encoder with custom torch tensor serialization.""" +diff --git a/vllm/v1/utils.py b/vllm/v1/utils.py +index f42b3501a..fbc0ee340 100644 +--- a/vllm/v1/utils.py ++++ b/vllm/v1/utils.py +@@ -2,17 +2,21 @@ + + import multiprocessing + import os ++import time + import weakref + from collections import defaultdict + from collections.abc import Sequence ++from multiprocessing import Process, connection + from typing import (TYPE_CHECKING, Any, Callable, Generic, Optional, TypeVar, + Union, overload) + + import torch + 
++from vllm.config import VllmConfig + from vllm.logger import init_logger + from vllm.model_executor.models.utils import extract_layer_index + from vllm.utils import get_mp_context, kill_process_tree ++from vllm.v1.executor.abstract import Executor + + if TYPE_CHECKING: + from vllm.attention.layer import Attention +@@ -90,7 +94,7 @@ class ConstantList(Generic[T], Sequence): + return f"ConstantList({self._x})" + + +-class BackgroundProcHandle: ++class CoreEngineProcManager: + """ + Utility class to handle creation, readiness, and shutdown + of background processes used by the AsyncLLM and LLMEngine. +@@ -98,55 +102,91 @@ class BackgroundProcHandle: + + def __init__( + self, +- input_path: str, +- output_path: str, +- process_name: str, + target_fn: Callable, +- process_kwargs: dict[Any, Any], ++ local_engine_count: int, ++ start_index: int, ++ local_start_index: int, ++ vllm_config: VllmConfig, ++ on_head_node: bool, ++ input_address: str, ++ executor_class: type[Executor], ++ log_stats: bool, + ): + context = get_mp_context() +- self.reader, writer = context.Pipe(duplex=False) +- +- assert ("ready_pipe" not in process_kwargs +- and "input_path" not in process_kwargs +- and "output_path" not in process_kwargs) +- process_kwargs["ready_pipe"] = writer +- process_kwargs["input_path"] = input_path +- process_kwargs["output_path"] = output_path +- +- # Run busy loop in background process. +- self.proc = context.Process(target=target_fn, +- kwargs=process_kwargs, +- name=process_name) +- self._finalizer = weakref.finalize(self, shutdown, self.proc, +- input_path, output_path) +- self.proc.start() +- +- def wait_for_startup(self): +- # Wait for startup. +- if self.reader.recv()["status"] != "READY": +- raise RuntimeError(f"{self.proc.name} initialization failed. " +- "See root cause above.") +- +- def shutdown(self): ++ common_kwargs = { ++ "vllm_config": vllm_config, ++ "on_head_node": on_head_node, ++ "input_address": input_address, ++ "executor_class": executor_class, ++ "log_stats": log_stats, ++ } ++ ++ self.processes: list[Process] = [] ++ for index in range(local_engine_count): ++ local_index = local_start_index + index ++ global_index = start_index + index ++ # Start EngineCore in background process. ++ self.processes.append( ++ context.Process(target=target_fn, ++ name=f"EngineCore_{global_index}", ++ kwargs=common_kwargs | { ++ "dp_rank": global_index, ++ "local_dp_rank": local_index, ++ })) ++ ++ self._finalizer = weakref.finalize(self, shutdown, self.processes, ++ input_address) ++ try: ++ for proc in self.processes: ++ proc.start() ++ finally: ++ # Kill other procs if not all are running. ++ if self.finished_procs(): ++ self.close() ++ ++ def close(self): ++ """Shutdown all procs.""" + self._finalizer() + ++ def join_first(self): ++ """Wait for any process to exit.""" ++ connection.wait(proc.sentinel for proc in self.processes) ++ ++ def sentinels(self) -> list: ++ return [proc.sentinel for proc in self.processes] ++ ++ def finished_procs(self) -> dict[str, int]: ++ """Returns dict of proc name -> exit code for any finished procs.""" ++ return { ++ proc.name: proc.exitcode ++ for proc in self.processes if proc.exitcode is not None ++ } ++ + + # Note(rob): shutdown function cannot be a bound method, + # else the gc cannot collect the object. +-def shutdown(proc: multiprocessing.Process, input_path: str, output_path: str): ++def shutdown(procs: list[Process], input_address: str): + # Shutdown the process. 
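# Editor's illustrative sketch (not part of the patch): how the process sentinels
# exposed by CoreEngineProcManager.sentinels() behave. The worker function and the
# sleep durations are assumptions made only for this demo.
import time
from multiprocessing import connection, get_context

def _demo_worker(seconds: float) -> None:
    time.sleep(seconds)

if __name__ == "__main__":
    mp = get_context("spawn")
    procs = [mp.Process(target=_demo_worker, args=(s,)) for s in (0.1, 5.0)]
    for proc in procs:
        proc.start()

    # Like join_first(): block until at least one child process exits.
    connection.wait([proc.sentinel for proc in procs])

    # Like finished_procs(): name -> exit code for children that have ended.
    print({p.name: p.exitcode for p in procs if p.exitcode is not None})

    for proc in procs:
        proc.terminate()
        proc.join()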
+- if proc.is_alive(): +- proc.terminate() +- proc.join(5) ++ for proc in procs: ++ if proc.is_alive(): ++ proc.terminate() ++ ++ # Allow 5 seconds for remaining procs to terminate. ++ deadline = time.monotonic() + 5 ++ for proc in procs: ++ remaining = deadline - time.monotonic() ++ if remaining <= 0: ++ break ++ if proc.is_alive(): ++ proc.join(remaining) + ++ for proc in procs: + if proc.is_alive(): + kill_process_tree(proc.pid) + + # Remove zmq ipc socket files. +- ipc_sockets = [output_path, input_path] +- for ipc_socket in ipc_sockets: +- socket_file = ipc_socket.replace("ipc://", "") ++ if input_address.startswith("ipc://"): ++ socket_file = input_address[len("ipc://"):] + if os and os.path.exists(socket_file): + os.remove(socket_file) + diff --git a/vllm_dp/install_dp_vllm.sh b/vllm_dp/install_dp_vllm.sh new file mode 100644 index 00000000..ee02ea9f --- /dev/null +++ b/vllm_dp/install_dp_vllm.sh @@ -0,0 +1,41 @@ +#!/bin/bash +# Copyright 2025 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +# This bash is to apply njhill's Multi-node server solution +# (https://github.com/vllm-project/vllm/pull/15906, https://github.com/vllm-project/vllm/pull/15977) +# to vllm v0.8.3 and install it. + +script_dir=$(cd "$(dirname $0)"; pwd) +old_dir=$(pwd) + +vllm_tag="v0.8.3" + +vllm_source_dir="${script_dir}/vllm-${vllm_tag}" + +if [ -d "${vllm_source_dir}" ]; then + echo "The ${vllm_source_dir} already exists, install maybe done! If not, please remove and rename it first." + exit 1 +fi + +git clone https://github.com/vllm-project/vllm.git -b ${vllm_tag} --depth 1 ${vllm_source_dir} +cd ${vllm_source_dir} + +git apply "${script_dir}/dp_scale_out.patch" + +export VLLM_TARGET_DEVICE=empty +pip install . 
+ +cd ${old_dir} \ No newline at end of file diff --git a/vllm_mindspore/config.py b/vllm_mindspore/config.py index 21c2fded..8079ca35 100644 --- a/vllm_mindspore/config.py +++ b/vllm_mindspore/config.py @@ -19,6 +19,7 @@ from collections import Counter from typing import Union import sys import socket +import threading import pickle import time @@ -328,17 +329,21 @@ class SocketProcessGroup: time.sleep(self.retry_interval) else: raise ConnectionError(f"Worker {self.rank} could not connect to master at {self.master_ip}:{self.master_port} after {self.max_retries} retries.") + + def accept_connections(self): + for _ in range(self.world_size - 1): + conn, addr = self.server_socket.accept() + print(f"Accepted connection from {addr}") + self.sockets.append(conn) def initialize_group(self): if self.rank == 0: # Master node: accept connections from workers - for _ in range(self.world_size - 1): - conn, addr = self.server_socket.accept() - print(f"Accepted connection from {addr}") - self.sockets.append(conn) + self.conn_thread = threading.Thread(target=self.accept_connections, daemon=True) + self.conn_thread.start() else: # Worker node: no additional setup needed - pass + self.conn_thread = None def close(self): if self.rank == 0: @@ -370,6 +375,9 @@ def has_unfinished_dp(dp_group: SocketProcessGroup, has_unfinished: bool) -> boo """ if dp_group.rank == 0: # Master node: collect results from workers + assert dp_group.conn_thread is not None + # Wait for all dp engine connectioned. + dp_group.conn_thread.join() results = [has_unfinished] for conn in dp_group.sockets: data = conn.recv(1024) diff --git a/vllm_mindspore/model_executor/models/mf_models/deepseek_v3.py b/vllm_mindspore/model_executor/models/mf_models/deepseek_v3.py index d0ac3c8c..fa39e0fb 100644 --- a/vllm_mindspore/model_executor/models/mf_models/deepseek_v3.py +++ b/vllm_mindspore/model_executor/models/mf_models/deepseek_v3.py @@ -21,12 +21,14 @@ from typing import Iterable, Set, Tuple from collections import OrderedDict import numpy as np +import vllm.envs as envs +import mindspore as ms from vllm.config import VllmConfig from vllm.config import get_current_vllm_config +from vllm.distributed.parallel_state import get_dp_group, get_tensor_model_parallel_world_size from vllm.forward_context import get_forward_context from vllm.logger import init_logger -import vllm.envs as envs import mindspore as ms from mindspore import Tensor, JitConfig, Model, mutable @@ -55,17 +57,46 @@ from vllm_mindspore.model_executor.models.mf_models.deepseekv3_weight_processor logger = init_logger(__name__) -def set_runtime_kernel_launch_group(): - kernel_launch_group = {'thread_num' : 2, 'kernel_group_num' : 8} - env_kernel_launch_group = os.getenv("EXPERIMENTAL_KERNEL_LAUNCH_GROUP", None) - if env_kernel_launch_group is not None: - pairs = env_kernel_launch_group.split(',') - for pair in pairs: - key, val = pair.split(':') - kernel_launch_group[key] = val - thread_num = int(kernel_launch_group.get('thread_num', 2)) - kernel_group_num = int(kernel_launch_group.get('kernel_group_num', 8)) - ms.runtime.set_kernel_launch_group(thread_num=thread_num, kernel_group_num=kernel_group_num) +def _get_padding_index(q_seq_len): + dp_size = get_dp_group().world_size + tp_size = get_tensor_model_parallel_world_size() + if dp_size == 1 or tp_size == 1: + return None, None, None, None + + tokens_len_per_dp = q_seq_len.sum().reshape(-1) + tokens_len_per_dp = get_dp_group().all_gather(tokens_len_per_dp) + tokens_len_per_dp = tokens_len_per_dp.asnumpy() + padding_size = 
(tokens_len_per_dp.max() + tp_size - 1) // tp_size * tp_size + + dp_rank_id = get_dp_group().rank_in_group + attn_padding_idx = None + attn_unpadding_idx = None + ffn_padding_idx = None + ffn_unpadding_idx = None + last_arange_index = 0 + + for dp_rank, tokens_length in enumerate(tokens_len_per_dp): + arange_data = np.arange(0, int(tokens_length), dtype=np.int32) + if dp_rank == dp_rank_id: + ffn_unpadding_idx = arange_data + attn_padding_idx = np.pad( + arange_data, (0, padding_size - arange_data.shape[0]), mode='constant', constant_values=0) + + if dp_rank == 0: + attn_unpadding_idx = arange_data + last_arange_index = arange_data[-1] + ffn_padding_idx= np.pad(attn_unpadding_idx, (0, padding_size - attn_unpadding_idx.shape[0]), + mode='constant', constant_values=0) + else: + attn_offset_idx = arange_data + padding_size * dp_rank + attn_unpadding_idx = np.concatenate((attn_unpadding_idx, attn_offset_idx), axis=0) + ffn_offset_idx = arange_data + last_arange_index + 1 + last_arange_index = ffn_offset_idx[-1] + ffn_offset_idx_pad_zero = np.pad( + ffn_offset_idx, (0, padding_size - ffn_offset_idx.shape[0]), mode='constant', constant_values=0) + ffn_padding_idx = np.concatenate((ffn_padding_idx, ffn_offset_idx_pad_zero), axis=0) + return ms.from_numpy(attn_padding_idx), ms.from_numpy(attn_unpadding_idx), ms.from_numpy(ffn_padding_idx), \ + ms.from_numpy(ffn_unpadding_idx) class DeepseekV3ForCausalLM(MfModelBase): @@ -92,7 +123,6 @@ class DeepseekV3ForCausalLM(MfModelBase): compilation_config.static_forward_context[str(i)] = self.kv_caches[i] self.set_flags = False - set_runtime_kernel_launch_group() def _generate_model_config(self): self.mf_config.load_checkpoint = self.get_model_path() @@ -137,11 +167,26 @@ class DeepseekV3ForCausalLM(MfModelBase): else: weight_processor = DeepseekV3WeightProcessor(self.mf_config, self.network, self.is_quant) weight_processor.load_safetensors_shard(self.mf_config.load_checkpoint) + self.network.set_dynamic_inputs() dynamic_hidden_states = Tensor(shape=[None, None], dtype=self.mf_model_config.compute_dtype) self.lm_head.set_inputs(dynamic_hidden_states) + return None + def prepare_inputs(self, input_ids, positions, attn_metadata): + model_inputs, is_prefill = super().prepare_inputs( + input_ids, positions, attn_metadata) + + attn_padding_idx, attn_unpadding_idx, ffn_padding_idx, ffn_unpadding_idx = _get_padding_index( + model_inputs["q_seq_lens"]) + model_inputs["attn_padding_idx"] = attn_padding_idx + model_inputs["attn_unpadding_idx"] = attn_unpadding_idx + model_inputs["ffn_padding_idx"] = ffn_padding_idx + model_inputs["ffn_unpadding_idx"] = ffn_unpadding_idx + + return model_inputs, is_prefill + def get_model_path(self): model_name_or_path = self.model_config.model if os.path.isdir(model_name_or_path): diff --git a/vllm_mindspore/model_executor/models/mf_models/deepseekv3_weight_processor.py b/vllm_mindspore/model_executor/models/mf_models/deepseekv3_weight_processor.py index de7d70d0..82e831d0 100644 --- a/vllm_mindspore/model_executor/models/mf_models/deepseekv3_weight_processor.py +++ b/vllm_mindspore/model_executor/models/mf_models/deepseekv3_weight_processor.py @@ -27,7 +27,27 @@ from mindspore import dtype from mindspore.communication.management import get_rank from mindformers.experimental.parallel_core.pynative.parallel_state import get_tensor_model_parallel_rank from vllm_mindspore.model_executor.models.mf_models.weight_processor import BaseWeightProcessor -from vllm_mindspore.utils import convert_np_to_ms_dtype +from vllm.logger import 
init_logger + + +logger = init_logger + + +def convert_np_to_ms_dtype(value): + """convert_np_to_ms_dtype""" + if value.dtype == np.int8: + value_dtype = ms.int8 + elif value.dtype == np.int32: + value_dtype = ms.int32 + elif value.dtype == np.int64: + value_dtype = ms.int64 + elif value.dtype == np.float64: + value_dtype = ms.float64 + elif value.dtype == np.float32: + value_dtype = ms.float32 + else: + value_dtype = ms.bfloat16 + return value_dtype class DeepseekV3WeightProcessor(BaseWeightProcessor): @@ -42,6 +62,9 @@ class DeepseekV3WeightProcessor(BaseWeightProcessor): def __init__(self, config, network, is_quant): super().__init__(config, network, is_quant) self.num_layers = self.config.model.model_config.num_layers + self.expert_num = self.config.moe_config.expert_num + self.moe_tensor_parallel = self.config.moe_config.moe_tensor_parallel + self.moe_expert_parallel = self.config.moe_config.moe_expert_parallel def quant_convert_weight_name(self, weight_name: str): """replace quant net weight name""" @@ -112,28 +135,51 @@ class DeepseekV3WeightProcessor(BaseWeightProcessor): weight[..., -qk_rope_head_dim:, :] = np.concatenate([w1, w2], axis=-2) return weight - def infer_quant_process_moe_routed_expert_ffn_weight(self, src_hf_dir, layer_id, hf_weight_map): - """process moe router expert weight""" - ffn_concat = self.config.model.model_config.ffn_concat - num_router_experts = self.config.moe_config.expert_num + def infer_quant_process_moe_with_tp(self, src_hf_dir, hf_weight_map, num_router_experts, layer_id): + w1_list = [] + w2_list = [] + w3_list = [] - # router expert dense - router_dense_hf_name = f"model.layers.{layer_id}.mlp.gate.weight" - router_dense_ms_name = self.quant_convert_weight_name(router_dense_hf_name) - router_dense_ms_param, _ = self.get_safetensor_from_file(router_dense_hf_name, src_hf_dir, hf_weight_map) - self.parameter_dict[router_dense_ms_name] = ms.Parameter( - ms.from_numpy(router_dense_ms_param).astype(ms.bfloat16), - name=router_dense_ms_name, requires_grad=False) + w1_scale_list = [] + w2_scale_list = [] + w3_scale_list = [] - # e_score_correction_bias - e_score_correction_bias_hf_name = f"model.layers.{layer_id}.mlp.gate.e_score_correction_bias" - e_score_correction_bias_ms_name = self.quant_convert_weight_name(e_score_correction_bias_hf_name) - e_score_correction_bias_ms_param, _ = self.get_safetensor_from_file(e_score_correction_bias_hf_name, src_hf_dir, - hf_weight_map) - self.parameter_dict[e_score_correction_bias_ms_name] = ms.Parameter( - ms.from_numpy(e_score_correction_bias_ms_param).astype(ms.float32), - name=e_score_correction_bias_ms_name, requires_grad=False) + for index in range(0, num_router_experts): + w1_hf_name = f"model.layers.{layer_id}.mlp.experts.{index}.gate_proj.weight" + w2_hf_name = f"model.layers.{layer_id}.mlp.experts.{index}.down_proj.weight" + w3_hf_name = f"model.layers.{layer_id}.mlp.experts.{index}.up_proj.weight" + + w1_ms_param, _ = self.get_moe_safetensor_from_file(w1_hf_name, src_hf_dir, hf_weight_map, + is_split_param=True, split_axis=0) + w2_ms_param, _ = self.get_moe_safetensor_from_file(w2_hf_name, src_hf_dir, hf_weight_map, + is_split_param=True, split_axis=1) + w3_ms_param, _ = self.get_moe_safetensor_from_file(w3_hf_name, src_hf_dir, hf_weight_map, + is_split_param=True, split_axis=0) + + w1_list.append(w1_ms_param) + w2_list.append(w2_ms_param) + w3_list.append(w3_ms_param) + + w1_scale_hf_name = f"model.layers.{layer_id}.mlp.experts.{index}.gate_proj.weight_scale" + w2_scale_hf_name = 
f"model.layers.{layer_id}.mlp.experts.{index}.down_proj.weight_scale" + w3_scale_hf_name = f"model.layers.{layer_id}.mlp.experts.{index}.up_proj.weight_scale" + + w1_scale_ms_param, _ = self.get_moe_safetensor_from_file(w1_scale_hf_name, src_hf_dir, hf_weight_map, + is_split_param=True, split_axis=0) + w2_scale_ms_param, _ = self.get_moe_safetensor_from_file(w2_scale_hf_name, src_hf_dir, hf_weight_map) + w3_scale_ms_param, _ = self.get_moe_safetensor_from_file(w3_scale_hf_name, src_hf_dir, hf_weight_map, + is_split_param=True, split_axis=0) + + w1_scale_ms_param = w1_scale_ms_param.squeeze(axis=-1) + w2_scale_ms_param = w2_scale_ms_param.squeeze(axis=-1) + w3_scale_ms_param = w3_scale_ms_param.squeeze(axis=-1) + w1_scale_list.append(w1_scale_ms_param) + w2_scale_list.append(w2_scale_ms_param) + w3_scale_list.append(w3_scale_ms_param) + return w1_list, w2_list, w3_list, w1_scale_list, w2_scale_list, w3_scale_list + + def infer_quant_process_moe_with_ep(self, src_hf_dir, hf_weight_map, num_router_experts, layer_id): w1_list = [] w2_list = [] w3_list = [] @@ -142,41 +188,75 @@ class DeepseekV3WeightProcessor(BaseWeightProcessor): w2_scale_list = [] w3_scale_list = [] - w1_ms_name = f"model.layers.{layer_id}.feed_forward.routed_experts.ffn.w1._layer.weight" - w2_ms_name = f"model.layers.{layer_id}.feed_forward.routed_experts.ffn.w2._layer.weight" - w3_ms_name = f"model.layers.{layer_id}.feed_forward.routed_experts.ffn.w3._layer.weight" + ep_start = self.moe_ep_rank_id * self.ep_group_nums + ep_stop = (self.moe_ep_rank_id + 1) * self.ep_group_nums + for index in range(ep_start, ep_stop): + w1_hf_name = f"model.layers.{layer_id}.mlp.experts.{index}.gate_proj.weight" + w2_hf_name = f"model.layers.{layer_id}.mlp.experts.{index}.down_proj.weight" + w3_hf_name = f"model.layers.{layer_id}.mlp.experts.{index}.up_proj.weight" - w1_scale_ms_name = f"model.layers.{layer_id}.feed_forward.routed_experts.ffn.w1._layer.matmul.weight_scale" - w2_scale_ms_name = f"model.layers.{layer_id}.feed_forward.routed_experts.ffn.w2._layer.matmul.weight_scale" - w3_scale_ms_name = f"model.layers.{layer_id}.feed_forward.routed_experts.ffn.w3._layer.matmul.weight_scale" + w1_ms_param, _ = self.get_moe_safetensor_from_file(w1_hf_name, src_hf_dir, hf_weight_map) + w2_ms_param, _ = self.get_moe_safetensor_from_file(w2_hf_name, src_hf_dir, hf_weight_map) + w3_ms_param, _ = self.get_moe_safetensor_from_file(w3_hf_name, src_hf_dir, hf_weight_map) - for index in range(0, num_router_experts): - w1_hf_name = f"model.layers.{layer_id}.mlp.experts.{index}.gate_proj.weight" - w1_ms_param, _ = self.get_safetensor_from_file(w1_hf_name, src_hf_dir, hf_weight_map, - is_split_param=True, split_axis=0) + w1_list.append(w1_ms_param) + w2_list.append(w2_ms_param) + w3_list.append(w3_ms_param) - w2_hf_name = f"model.layers.{layer_id}.mlp.experts.{index}.down_proj.weight" - w2_ms_param, _ = self.get_safetensor_from_file(w2_hf_name, src_hf_dir, hf_weight_map, - is_split_param=True, split_axis=1) + w1_scale_hf_name = f"model.layers.{layer_id}.mlp.experts.{index}.gate_proj.weight_scale" + w2_scale_hf_name = f"model.layers.{layer_id}.mlp.experts.{index}.down_proj.weight_scale" + w3_scale_hf_name = f"model.layers.{layer_id}.mlp.experts.{index}.up_proj.weight_scale" + w1_scale_ms_param, _ = self.get_moe_safetensor_from_file(w1_scale_hf_name, src_hf_dir, hf_weight_map) + w2_scale_ms_param, _ = self.get_moe_safetensor_from_file(w2_scale_hf_name, src_hf_dir, hf_weight_map) + w3_scale_ms_param, _ = self.get_moe_safetensor_from_file(w3_scale_hf_name, 
src_hf_dir, hf_weight_map) + + w1_scale_ms_param = w1_scale_ms_param.squeeze(axis=-1) + w2_scale_ms_param = w2_scale_ms_param.squeeze(axis=-1) + w3_scale_ms_param = w3_scale_ms_param.squeeze(axis=-1) + w1_scale_list.append(w1_scale_ms_param) + w2_scale_list.append(w2_scale_ms_param) + w3_scale_list.append(w3_scale_ms_param) + + return w1_list, w2_list, w3_list, w1_scale_list, w2_scale_list, w3_scale_list + + def infer_quant_process_moe_with_ep_tp(self, src_hf_dir, hf_weight_map, num_router_experts, layer_id): + w1_list = [] + w2_list = [] + w3_list = [] + + w1_scale_list = [] + w2_scale_list = [] + w3_scale_list = [] + + ep_start = self.moe_ep_rank_id * self.ep_group_nums + ep_stop = (self.moe_ep_rank_id + 1) * self.ep_group_nums + + for index in range(ep_start, ep_stop): + w1_hf_name = f"model.layers.{layer_id}.mlp.experts.{index}.gate_proj.weight" + w2_hf_name = f"model.layers.{layer_id}.mlp.experts.{index}.down_proj.weight" w3_hf_name = f"model.layers.{layer_id}.mlp.experts.{index}.up_proj.weight" - w3_ms_param, _ = self.get_safetensor_from_file(w3_hf_name, src_hf_dir, hf_weight_map, - is_split_param=True, split_axis=0) + + w1_ms_param, _ = self.get_moe_safetensor_from_file(w1_hf_name, src_hf_dir, hf_weight_map, + is_split_param=True, split_axis=0) + w2_ms_param, _ = self.get_moe_safetensor_from_file(w2_hf_name, src_hf_dir, hf_weight_map, + is_split_param=True, split_axis=1) + w3_ms_param, _ = self.get_moe_safetensor_from_file(w3_hf_name, src_hf_dir, hf_weight_map, + is_split_param=True, split_axis=0) w1_list.append(w1_ms_param) w2_list.append(w2_ms_param) w3_list.append(w3_ms_param) w1_scale_hf_name = f"model.layers.{layer_id}.mlp.experts.{index}.gate_proj.weight_scale" - w1_scale_ms_param, _ = self.get_safetensor_from_file(w1_scale_hf_name, src_hf_dir, hf_weight_map, - is_split_param=True, split_axis=0) - w2_scale_hf_name = f"model.layers.{layer_id}.mlp.experts.{index}.down_proj.weight_scale" - w2_scale_ms_param, _ = self.get_safetensor_from_file(w2_scale_hf_name, src_hf_dir, hf_weight_map) - w3_scale_hf_name = f"model.layers.{layer_id}.mlp.experts.{index}.up_proj.weight_scale" - w3_scale_ms_param, _ = self.get_safetensor_from_file(w3_scale_hf_name, src_hf_dir, hf_weight_map, - is_split_param=True, split_axis=0) + + w1_scale_ms_param, _ = self.get_moe_safetensor_from_file(w1_scale_hf_name, src_hf_dir, hf_weight_map, + is_split_param=True, split_axis=0) + w2_scale_ms_param, _ = self.get_moe_safetensor_from_file(w2_scale_hf_name, src_hf_dir, hf_weight_map) + w3_scale_ms_param, _ = self.get_moe_safetensor_from_file(w3_scale_hf_name, src_hf_dir, hf_weight_map, + is_split_param=True, split_axis=0) w1_scale_ms_param = w1_scale_ms_param.squeeze(axis=-1) w2_scale_ms_param = w2_scale_ms_param.squeeze(axis=-1) @@ -185,6 +265,49 @@ class DeepseekV3WeightProcessor(BaseWeightProcessor): w2_scale_list.append(w2_scale_ms_param) w3_scale_list.append(w3_scale_ms_param) + return w1_list, w2_list, w3_list, w1_scale_list, w2_scale_list, w3_scale_list + + def infer_quant_process_moe(self, src_hf_dir, hf_weight_map, num_router_experts, layer_id): + if self.moe_expert_parallel > 1 and self.moe_tensor_parallel > 1: + return self.infer_quant_process_moe_with_ep_tp(src_hf_dir, hf_weight_map, num_router_experts, layer_id) + elif self.moe_tensor_parallel > 1: + return self.infer_quant_process_moe_with_tp(src_hf_dir, hf_weight_map, num_router_experts, layer_id) + else: + return self.infer_quant_process_moe_with_ep(src_hf_dir, hf_weight_map, num_router_experts, layer_id) + + def 
infer_quant_process_moe_routed_expert_ffn_weight(self, src_hf_dir, layer_id, hf_weight_map): + """process moe router expert weight""" + ffn_concat = self.config.model.model_config.ffn_concat + num_router_experts = self.config.moe_config.expert_num + + # router expert dense + router_dense_hf_name = f"model.layers.{layer_id}.mlp.gate.weight" + router_dense_ms_name = self.quant_convert_weight_name(router_dense_hf_name) + router_dense_ms_param, _ = self.get_safetensor_from_file(router_dense_hf_name, src_hf_dir, hf_weight_map) + self.parameter_dict[router_dense_ms_name] = ms.Parameter( + ms.from_numpy(router_dense_ms_param).astype(ms.bfloat16), + name=router_dense_ms_name, requires_grad=False) + + # e_score_correction_bias + e_score_correction_bias_hf_name = f"model.layers.{layer_id}.mlp.gate.e_score_correction_bias" + e_score_correction_bias_ms_name = self.quant_convert_weight_name(e_score_correction_bias_hf_name) + e_score_correction_bias_ms_param, _ = self.get_safetensor_from_file(e_score_correction_bias_hf_name, src_hf_dir, + hf_weight_map) + self.parameter_dict[e_score_correction_bias_ms_name] = ms.Parameter( + ms.from_numpy(e_score_correction_bias_ms_param).astype(ms.float32), + name=e_score_correction_bias_ms_name, requires_grad=False) + + w1_ms_name = f"model.layers.{layer_id}.feed_forward.routed_experts.ffn.w1._layer.weight" + w2_ms_name = f"model.layers.{layer_id}.feed_forward.routed_experts.ffn.w2._layer.weight" + w3_ms_name = f"model.layers.{layer_id}.feed_forward.routed_experts.ffn.w3._layer.weight" + + w1_scale_ms_name = f"model.layers.{layer_id}.feed_forward.routed_experts.ffn.w1._layer.matmul.weight_scale" + w2_scale_ms_name = f"model.layers.{layer_id}.feed_forward.routed_experts.ffn.w2._layer.matmul.weight_scale" + w3_scale_ms_name = f"model.layers.{layer_id}.feed_forward.routed_experts.ffn.w3._layer.matmul.weight_scale" + + w1_list, w2_list, w3_list, w1_scale_list, w2_scale_list, w3_scale_list = \ + self.infer_quant_process_moe(src_hf_dir, hf_weight_map, num_router_experts, layer_id) + w1_ms_stack_param = np.stack(w1_list, axis=0) w2_ms_stack_param = np.stack(w2_list, axis=0) w3_ms_stack_param = np.stack(w3_list, axis=0) @@ -668,7 +791,7 @@ class DeepseekV3WeightProcessor(BaseWeightProcessor): return weight_name def convert_mtp_weight_name(self, weight_name: str): - layer = 0 if 'layers.' not in weight_name else int(weight_name[weight_name.find('layers.') : ].split('.')[1]) + layer = 0 if 'layers.' 
not in weight_name else int(weight_name[weight_name.find('layers.'):].split('.')[1]) if layer < self.num_layers: return weight_name mtp_prefix = f'mtp_model' @@ -970,12 +1093,12 @@ class DeepseekV3WeightProcessor(BaseWeightProcessor): ms_name = self.convert_weight_name(hf_name) if prefix_name in head_names and not self.config.parallel_config.vocab_emb_dp: ms_param, _ = self.get_safetensor_from_file(hf_name, src_hf_dir, hf_weight_map, - is_split_param=True, split_axis=0) + is_split_param=True, split_axis=0) else: ms_param, _ = self.get_safetensor_from_file(hf_name, src_hf_dir, hf_weight_map) parameter_dict[ms_name] = ms.Parameter(ms.Tensor(ms_param, ms.bfloat16), - name=ms_name, - requires_grad=False) + name=ms_name, + requires_grad=False) _, ckpt_not_load = ms.load_param_into_net(self.network, parameter_dict) @@ -1333,8 +1456,67 @@ class DeepseekV3WeightProcessor(BaseWeightProcessor): if "model.layers" in param_name and int(param_name.split('.')[2]) >= num_layers: continue - if any([name in param_name for name in skip_layer]): - continue + if any([name in param_name for name in no_need_split_layer]): + value, _ = self.get_safetensor_from_file(param_name, src_hf_dir, + hf_weight_map) + elif any([name in param_name for name in [".l2q_proj.", ".feed_forward.w_gate_hidden.", + "shared_experts.w_gate_hidden"]]): + if param_name.endswith(".weight") or "matmul" in param_name: + value, _ = self.get_safetensor_from_file(param_name, src_hf_dir, + hf_weight_map, is_split_param=True, + split_axis=0) + else: + value, _ = self.get_safetensor_from_file(param_name, src_hf_dir, + hf_weight_map) + elif any([name in param_name for name in [".feed_forward.w2.", ".wo.", "shared_experts.w2"]]): + if param_name.endswith(".weight"): + value, _ = self.get_safetensor_from_file(param_name, src_hf_dir, + hf_weight_map, is_split_param=True, + split_axis=1) + elif "quant_op" in param_name: + value, _ = self.get_safetensor_from_file(param_name, src_hf_dir, + hf_weight_map, is_split_param=True, + split_axis=0) + else: + value, _ = self.get_safetensor_from_file(param_name, src_hf_dir, + hf_weight_map) + elif ".routed_experts.ffn.w_gate_hidden." 
in param_name: + if param_name.endswith(".weight"): + value, _ = self.get_safetensor_from_file(param_name, src_hf_dir, hf_weight_map) + value_list = [] + for experts_id in range(value.shape[0]): + value_list.append(self.split_weight_by_rank(value[experts_id, :, :], split_axis=1)) + value = np.stack(value_list, axis=0) + elif "matmul" in param_name: + value, _ = self.get_safetensor_from_file(param_name, src_hf_dir, hf_weight_map) + value_list = [] + for experts_id in range(value.shape[0]): + value_list.append(self.split_weight_by_rank(value[experts_id, :], split_axis=0)) + value = np.stack(value_list, axis=0) + else: + value, _ = self.get_safetensor_from_file(param_name, src_hf_dir, + hf_weight_map) + elif ".routed_experts.ffn.w2" in param_name: + if param_name.endswith(".weight"): + value, _ = self.get_safetensor_from_file(param_name, src_hf_dir, hf_weight_map) + value_list = [] + for experts_id in range(value.shape[0]): + value_list.append(self.split_weight_by_rank(value[experts_id, :, :], split_axis=0)) + value = np.stack(value_list, axis=0) + else: + value, _ = self.get_safetensor_from_file(param_name, src_hf_dir, + hf_weight_map) + elif any([name in param_name for name in ["lkv2kv_k_nope", "lkv2kv_v"]]): + value, _ = self.get_safetensor_from_file(param_name, src_hf_dir, hf_weight_map, + is_split_param=True, split_axis=0) + elif "lm_head" in param_name: + if not self.config.parallel_config.vocab_emb_dp: + value, _ = self.get_safetensor_from_file(param_name, src_hf_dir, hf_weight_map, + is_split_param=True, split_axis=0) + else: + value, _ = self.get_safetensor_from_file(param_name, src_hf_dir, hf_weight_map) + else: + raise ValueError(f"not found layer {param_name}, please check safetensors file.") value = self.infer_smooth_quant_get_value(param_name, src_hf_dir, hf_weight_map, no_need_split_layer) dst_dtype = convert_np_to_ms_dtype(value) @@ -1343,8 +1525,8 @@ class DeepseekV3WeightProcessor(BaseWeightProcessor): name=param_name, requires_grad=False) param_not_load, ckpt_not_load = ms.load_param_into_net(self.network, parameter_dict) - print(f"smoothquant param_not_load:{param_not_load}") - print(f"smoothquant ckpt_not_load:{ckpt_not_load}") + logger.info("smoothquant param_not_load: %s" % str(param_not_load)) + logger.info("smoothquant ckpt_not_load: %s" % str(ckpt_not_load)) def infer_gptq_quant_net_ms_convert_layer_weight(self, src_hf_dir, num_layers, hf_weight_map): """infer_gptq_quant_net_ms_convert_layer_weight""" @@ -1456,6 +1638,7 @@ class DeepseekV3WeightProcessor(BaseWeightProcessor): else: self.infer_convert_layer_weight(src_hf_dir, layer_id, hf_weight_map) - ms.load_param_into_net(self.network, self.parameter_dict) + param_not_load, ckpt_not_load = ms.load_param_into_net(self.network, self.parameter_dict) + logger.info("param_not_load: %s, ckpt_not_load: %s" % (str(param_not_load), str(ckpt_not_load))) del self.parameter_dict gc.collect() diff --git a/vllm_mindspore/model_executor/models/mf_models/weight_processor.py b/vllm_mindspore/model_executor/models/mf_models/weight_processor.py index 82a104f6..f46334fa 100644 --- a/vllm_mindspore/model_executor/models/mf_models/weight_processor.py +++ b/vllm_mindspore/model_executor/models/mf_models/weight_processor.py @@ -19,9 +19,9 @@ transform huggingface safetensor. 
import os from safetensors import safe_open -from mindspore.communication.management import get_rank, get_group_size - -from vllm.distributed import get_tensor_model_parallel_world_size, get_tensor_model_parallel_rank +from mindspore.communication.management import get_rank +from mindformers.experimental.infer.core.utils import get_tp_world_size +from mindformers.experimental.parallel_core.pynative.parallel_state import get_data_parallel_world_size class BaseWeightProcessor: r""" @@ -36,8 +36,29 @@ class BaseWeightProcessor: self.config = config self.network = network self.is_quant = is_quant - self.tp_group_size = get_tensor_model_parallel_world_size() - self.rank_id = get_tensor_model_parallel_rank() + self.global_rank_id = get_rank() + self.tp_group_size = get_tp_world_size() + self.dp_group_size = get_data_parallel_world_size() + self.moe_ep_size = self.config.moe_config.moe_expert_parallel + self.moe_tp_size = self.config.moe_config.moe_tensor_parallel + self.tp_rank_id = self.global_rank_id % self.tp_group_size + + num_router_experts = self.config.moe_config.expert_num + self.ep_group_nums = num_router_experts // self.moe_ep_size + self.moe_ep_rank_id = self.global_rank_id // self.moe_tp_size + self.moe_tp_rank_id = self.global_rank_id % self.moe_tp_size + + print(f"global_rank_id: {self.global_rank_id} \n" + f"tp_group_size: {self.tp_group_size} \n" + f"dp_group_size: {self.dp_group_size} \n" + f"tp_rank_id: {self.tp_rank_id} \n" + f"num_router_experts: {num_router_experts} \n" + f"ep_group_nums: {self.ep_group_nums} \n" + f"moe_ep_rank_id: {self.moe_ep_rank_id} \n" + f"moe_tp_rank_id: {self.moe_tp_rank_id} \n" + f"moe_ep_size: {self.moe_ep_size} \n" + f"moe_tp_size: {self.moe_tp_size}", flush=True) + self.parameter_dict = {} self.file_handles = {} @@ -50,6 +71,33 @@ class BaseWeightProcessor: def release_file_handles(self): del self.file_handles + def get_moe_safetensor_from_file(self, hf_param_name, src_hf_dir, hf_weight_map, is_split_param=False, split_axis=0): + safetensor_file = hf_weight_map[hf_param_name] + filename = os.path.join(src_hf_dir, safetensor_file) + sf_file = self.get_file_handles(filename) + qint4 = False + if sf_file.metadata() is not None and hf_param_name in sf_file.metadata().keys(): + qint4 = True + if not is_split_param or self.moe_tp_size == 1: + np_data = sf_file.get_tensor(hf_param_name) + return np_data, qint4 + + np_data = sf_file.get_slice(hf_param_name) + shape = np_data.get_shape() + if split_axis == 0: + split_size = shape[0] // self.moe_tp_size + start = self.moe_tp_rank_id * split_size + stop = (self.moe_tp_rank_id + 1) * split_size + split_data = np_data[start:stop] + elif split_axis == 1: + split_size = shape[1] // self.moe_tp_size + start = self.moe_tp_rank_id * split_size + stop = (self.moe_tp_rank_id + 1) * split_size + split_data = np_data[:, start:stop] + else: + raise ValueError("split_axis:{} is not supported.".format(split_axis)) + return split_data, qint4 + def get_safetensor_from_file(self, hf_param_name, src_hf_dir, hf_weight_map, is_split_param=False, split_axis=0): safetensor_file = hf_weight_map[hf_param_name] filename = os.path.join(src_hf_dir, safetensor_file) @@ -57,7 +105,7 @@ class BaseWeightProcessor: qint4 = False if sf_file.metadata() is not None and hf_param_name in sf_file.metadata().keys(): qint4 = True - if not is_split_param: + if not is_split_param or self.tp_group_size == 1: np_data = sf_file.get_tensor(hf_param_name) return np_data, qint4 @@ -65,13 +113,13 @@ class BaseWeightProcessor: shape = np_data.get_shape() 
if split_axis == 0: split_size = shape[0] // self.tp_group_size - start = self.rank_id * split_size - stop = (self.rank_id + 1) * split_size + start = self.tp_rank_id * split_size + stop = (self.tp_rank_id + 1) * split_size split_data = np_data[start:stop] elif split_axis == 1: split_size = shape[1] // self.tp_group_size - start = self.rank_id * split_size - stop = (self.rank_id + 1) * split_size + start = self.tp_rank_id * split_size + stop = (self.tp_rank_id + 1) * split_size split_data = np_data[:, start:stop] elif split_axis == 2: split_size = shape[2] // self.tp_group_size @@ -83,16 +131,19 @@ class BaseWeightProcessor: return split_data, qint4 def split_weight_by_rank(self, weight, split_axis=0): + if self.tp_group_size == 1: + return weight + shape = weight.shape if split_axis == 0: split_size = shape[0] // self.tp_group_size - start = self.rank_id * split_size - stop = (self.rank_id + 1) * split_size + start = self.tp_rank_id * split_size + stop = (self.tp_rank_id + 1) * split_size split_data = weight[start:stop] elif split_axis == 1: split_size = shape[1] // self.tp_group_size - start = self.rank_id * split_size - stop = (self.rank_id + 1) * split_size + start = self.tp_rank_id * split_size + stop = (self.tp_rank_id + 1) * split_size split_data = weight[:, start:stop] else: raise ValueError("split_axis:{} is not supported.".format(split_axis)) diff --git a/vllm_mindspore/scripts.py b/vllm_mindspore/scripts.py index f35190fa..3bbfbbc3 100644 --- a/vllm_mindspore/scripts.py +++ b/vllm_mindspore/scripts.py @@ -54,7 +54,7 @@ def env_setup(target_env_dict=None): def main(): env_setup() - from vllm.scripts import main as vllm_main + from vllm.entrypoints.cli.main import main as vllm_main vllm_main() -- Gitee From 6468923f6425c7da3b04a4f4aec4dfc56f8c309a Mon Sep 17 00:00:00 2001 From: Erpim Date: Sat, 26 Apr 2025 11:48:07 +0800 Subject: [PATCH 14/55] replace flatten --- vllm_mindspore/v1/worker/gpu_model_runner.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm_mindspore/v1/worker/gpu_model_runner.py b/vllm_mindspore/v1/worker/gpu_model_runner.py index 988ee71c..4928b56a 100644 --- a/vllm_mindspore/v1/worker/gpu_model_runner.py +++ b/vllm_mindspore/v1/worker/gpu_model_runner.py @@ -95,7 +95,7 @@ def _prepare_inputs( req_indices * self.input_batch.token_ids_cpu.shape[1]) self.input_ids[:total_num_scheduled_tokens] = torch.from_numpy( - np.take(self.input_batch.token_ids_cpu.flatten(), + np.take(self.input_batch.token_ids_cpu.ravel(), token_indices, 0) ) @@ -110,7 +110,7 @@ def _prepare_inputs( positions_np // self.block_size) - block_numbers = self.input_batch.block_table.block_table_np.flatten()[block_table_indices] + block_numbers = self.input_batch.block_table.block_table_np.ravel()[block_table_indices] block_offsets = positions_np % self.block_size np.add(block_numbers * self.block_size, block_offsets, -- Gitee From ba95224c137aa1158b7be613b690637fa952609e Mon Sep 17 00:00:00 2001 From: Erpim Date: Sat, 26 Apr 2025 15:48:05 +0800 Subject: [PATCH 15/55] support ep weight load --- .../mf_models/deepseekv3_weight_processor.py | 48 ++++++++++++++----- .../models/mf_models/weight_processor.py | 32 +++++++++---- 2 files changed, 60 insertions(+), 20 deletions(-) diff --git a/vllm_mindspore/model_executor/models/mf_models/deepseekv3_weight_processor.py b/vllm_mindspore/model_executor/models/mf_models/deepseekv3_weight_processor.py index 82e831d0..58a186b6 100644 --- a/vllm_mindspore/model_executor/models/mf_models/deepseekv3_weight_processor.py +++ 
b/vllm_mindspore/model_executor/models/mf_models/deepseekv3_weight_processor.py @@ -26,11 +26,11 @@ import mindspore as ms from mindspore import dtype from mindspore.communication.management import get_rank from mindformers.experimental.parallel_core.pynative.parallel_state import get_tensor_model_parallel_rank -from vllm_mindspore.model_executor.models.mf_models.weight_processor import BaseWeightProcessor +from vllm_mindspore.model_executor.models.mf_models.weight_processor import BaseWeightProcessor, EPMethod from vllm.logger import init_logger -logger = init_logger +logger = init_logger(__name__) def convert_np_to_ms_dtype(value): @@ -65,6 +65,11 @@ class DeepseekV3WeightProcessor(BaseWeightProcessor): self.expert_num = self.config.moe_config.expert_num self.moe_tensor_parallel = self.config.moe_config.moe_tensor_parallel self.moe_expert_parallel = self.config.moe_config.moe_expert_parallel + self.ep_method = EPMethod.DEFAULT + if self.dp_group_size > 1 and self.moe_expert_parallel == self.global_group_size: + self.ep_method = EPMethod.ALLTOALL + elif self.dp_group_size > 1: + self.ep_method = EPMethod.ALLGATHER def quant_convert_weight_name(self, weight_name: str): """replace quant net weight name""" @@ -363,24 +368,38 @@ class DeepseekV3WeightProcessor(BaseWeightProcessor): name=w2_scale_ms_name, requires_grad=False) + def get_moe_shared_expert_split_info(self): + split_num = -1 + rank_id = -1 + if self.ep_method == EPMethod.ALLGATHER: + split_num = self.global_group_size + rank_id = get_rank() + elif self.ep_method == EPMethod.ALLTOALL: + split_num = 1 + rank_id = 0 + return split_num, rank_id + def infer_quant_process_moe_shared_expert_ffn_weight(self, src_hf_dir, layer_id, hf_weight_map): """infer quant process moe shared expert ffn weight""" + split_num, rank_id = self.get_moe_shared_expert_split_info() ffn_concat = self.config.model.model_config.ffn_concat w1_hf_name = f"model.layers.{layer_id}.mlp.shared_experts.gate_proj.weight" w1_ms_name = self.quant_convert_weight_name(w1_hf_name) - w1_ms_param, _ = self.get_safetensor_from_file(w1_hf_name, src_hf_dir, hf_weight_map, - is_split_param=True, split_axis=0) + w1_ms_param, _ = self.get_safetensor_from_file(w1_hf_name, src_hf_dir, hf_weight_map, is_split_param=True, + split_axis=0, split_num=split_num, rank_id=rank_id) w1_scale_hf_name = f"model.layers.{layer_id}.mlp.shared_experts.gate_proj.weight_scale" w1_scale_ms_name = self.quant_convert_weight_name(w1_scale_hf_name) w1_scale_ms_param, _ = self.get_safetensor_from_file(w1_scale_hf_name, src_hf_dir, hf_weight_map, - is_split_param=True, split_axis=0) + is_split_param=True, split_axis=0, split_num=split_num, + rank_id=rank_id) w2_hf_name = f"model.layers.{layer_id}.mlp.shared_experts.down_proj.weight" w2_ms_name = self.quant_convert_weight_name(w2_hf_name) w2_ms_param, _ = self.get_safetensor_from_file(w2_hf_name, src_hf_dir, hf_weight_map, - is_split_param=True, split_axis=1) + is_split_param=True, split_axis=1, split_num=split_num, + rank_id=rank_id) w2_scale_hf_name = f"model.layers.{layer_id}.mlp.shared_experts.down_proj.weight_scale" w2_scale_ms_name = self.quant_convert_weight_name(w2_scale_hf_name) @@ -389,12 +408,14 @@ class DeepseekV3WeightProcessor(BaseWeightProcessor): w3_hf_name = f"model.layers.{layer_id}.mlp.shared_experts.up_proj.weight" w3_ms_name = self.quant_convert_weight_name(w3_hf_name) w3_ms_param, _ = self.get_safetensor_from_file(w3_hf_name, src_hf_dir, hf_weight_map, - is_split_param=True, split_axis=0) + is_split_param=True, split_axis=0, 
split_num=split_num, + rank_id=rank_id) w3_scale_hf_name = f"model.layers.{layer_id}.mlp.shared_experts.up_proj.weight_scale" w3_scale_ms_name = self.quant_convert_weight_name(w3_scale_hf_name) w3_scale_ms_param, _ = self.get_safetensor_from_file(w3_scale_hf_name, src_hf_dir, hf_weight_map, - is_split_param=True, split_axis=0) + is_split_param=True, split_axis=0, split_num=split_num, + rank_id=rank_id) w1_scale_ms_param = w1_scale_ms_param.squeeze(axis=-1) w2_scale_ms_param = w2_scale_ms_param.squeeze(axis=-1) @@ -888,21 +909,26 @@ class DeepseekV3WeightProcessor(BaseWeightProcessor): def infer_process_moe_shared_expert_ffn_weight(self, src_hf_dir, layer_id, hf_weight_map): """infer process moe shared expert ffn weight""" + split_num, rank_id = self.get_moe_shared_expert_split_info() + ffn_concat = self.config.model.model_config.ffn_concat w1_hf_name = f"model.layers.{layer_id}.mlp.shared_experts.gate_proj.weight" w1_ms_name = self.convert_weight_name(w1_hf_name) w1_ms_param, _ = self.get_safetensor_from_file(w1_hf_name, src_hf_dir, hf_weight_map, - is_split_param=True, split_axis=0) + is_split_param=True, split_axis=0, split_num=split_num, + rank_id=rank_id) w2_hf_name = f"model.layers.{layer_id}.mlp.shared_experts.down_proj.weight" w2_ms_name = self.convert_weight_name(w2_hf_name) w2_ms_param, _ = self.get_safetensor_from_file(w2_hf_name, src_hf_dir, hf_weight_map, - is_split_param=True, split_axis=1) + is_split_param=True, split_axis=1, split_num=split_num, + rank_id=rank_id) w3_hf_name = f"model.layers.{layer_id}.mlp.shared_experts.up_proj.weight" w3_ms_name = self.convert_weight_name(w3_hf_name) w3_ms_param, _ = self.get_safetensor_from_file(w3_hf_name, src_hf_dir, hf_weight_map, - is_split_param=True, split_axis=0) + is_split_param=True, split_axis=0, split_num=split_num, + rank_id=rank_id) if ffn_concat: w_gate_hidden_name = f"model.layers.{layer_id}.feed_forward.shared_experts.w_gate_hidden.weight" diff --git a/vllm_mindspore/model_executor/models/mf_models/weight_processor.py b/vllm_mindspore/model_executor/models/mf_models/weight_processor.py index f46334fa..696367ec 100644 --- a/vllm_mindspore/model_executor/models/mf_models/weight_processor.py +++ b/vllm_mindspore/model_executor/models/mf_models/weight_processor.py @@ -18,11 +18,20 @@ transform huggingface safetensor. """ import os +from enum import Enum from safetensors import safe_open -from mindspore.communication.management import get_rank +from mindspore.communication.management import get_rank, get_group_size from mindformers.experimental.infer.core.utils import get_tp_world_size from mindformers.experimental.parallel_core.pynative.parallel_state import get_data_parallel_world_size +class EPMethod(Enum): + """ + EP method enums + """ + DEFAULT = 'default' + ALLTOALL = 'alltoall' + ALLGATHER = 'allgather' + class BaseWeightProcessor: r""" Provide model weight load and shards. 
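The hunks that follow thread explicit `split_num` / `rank_id` arguments through `get_safetensor_from_file`, so the same lazy-slicing path can serve tensor-parallel, expert-parallel and global-group shards. A minimal sketch of that per-rank slicing, using only the safetensors lazy-slice API already used in this file (names such as `weight_path` and the even-divisibility assumption are illustrative, not part of the patch):

from safetensors import safe_open

def load_shard(weight_path, param_name, rank_id, split_num, split_axis=0):
    # Lazily open the checkpoint and read only this rank's shard,
    # mirroring the shape[axis] // split_num partitioning used above.
    with safe_open(weight_path, framework="np") as f:
        lazy = f.get_slice(param_name)        # no full-tensor read yet
        dim = lazy.get_shape()[split_axis]
        split_size = dim // split_num         # assumes dim divides evenly
        start, stop = rank_id * split_size, (rank_id + 1) * split_size
        if split_axis == 0:
            return lazy[start:stop]
        if split_axis == 1:
            return lazy[:, start:stop]
        raise ValueError("split_axis:{} is not supported.".format(split_axis))
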
@@ -37,10 +46,12 @@ class BaseWeightProcessor: self.network = network self.is_quant = is_quant self.global_rank_id = get_rank() + self.global_group_size = get_group_size() self.tp_group_size = get_tp_world_size() self.dp_group_size = get_data_parallel_world_size() self.moe_ep_size = self.config.moe_config.moe_expert_parallel self.moe_tp_size = self.config.moe_config.moe_tensor_parallel + self.ep_method = EPMethod.DEFAULT self.tp_rank_id = self.global_rank_id % self.tp_group_size num_router_experts = self.config.moe_config.expert_num @@ -98,28 +109,31 @@ class BaseWeightProcessor: raise ValueError("split_axis:{} is not supported.".format(split_axis)) return split_data, qint4 - def get_safetensor_from_file(self, hf_param_name, src_hf_dir, hf_weight_map, is_split_param=False, split_axis=0): + def get_safetensor_from_file(self, hf_param_name, src_hf_dir, hf_weight_map, is_split_param=False, split_axis=0, + split_num=-1, rank_id=-1): + rank_id = rank_id if rank_id != -1 else self.tp_rank_id + split_num = split_num if split_num != -1 else self.tp_group_size safetensor_file = hf_weight_map[hf_param_name] filename = os.path.join(src_hf_dir, safetensor_file) sf_file = self.get_file_handles(filename) qint4 = False if sf_file.metadata() is not None and hf_param_name in sf_file.metadata().keys(): qint4 = True - if not is_split_param or self.tp_group_size == 1: + if not is_split_param or split_num == 1: np_data = sf_file.get_tensor(hf_param_name) return np_data, qint4 np_data = sf_file.get_slice(hf_param_name) shape = np_data.get_shape() if split_axis == 0: - split_size = shape[0] // self.tp_group_size - start = self.tp_rank_id * split_size - stop = (self.tp_rank_id + 1) * split_size + split_size = shape[0] // split_num + start = rank_id * split_size + stop = (rank_id + 1) * split_size split_data = np_data[start:stop] elif split_axis == 1: - split_size = shape[1] // self.tp_group_size - start = self.tp_rank_id * split_size - stop = (self.tp_rank_id + 1) * split_size + split_size = shape[1] // split_num + start = rank_id * split_size + stop = (rank_id + 1) * split_size split_data = np_data[:, start:stop] elif split_axis == 2: split_size = shape[2] // self.tp_group_size -- Gitee From 37fd6cd52c5c8930024fa65eb029b600ff93c483 Mon Sep 17 00:00:00 2001 From: zlq2020 Date: Thu, 24 Apr 2025 09:54:29 +0800 Subject: [PATCH 16/55] update log config --- setup.py | 16 ++--- vllm_mindspore/__init__.py | 7 +- vllm_mindspore/config.py | 10 +-- vllm_mindspore/logger.py | 70 +++++++++++++++++++ .../mf_models/deepseekv3_infer_save_ckpt.py | 5 +- vllm_mindspore/scripts.py | 2 + vllm_mindspore/utils.py | 6 +- vllm_mindspore/v1/worker/gpu_model_runner.py | 1 - vllm_mindspore/worker/profile.py | 6 +- 9 files changed, 101 insertions(+), 22 deletions(-) create mode 100644 vllm_mindspore/logger.py diff --git a/setup.py b/setup.py index 2f304bcc..2f5080a2 100644 --- a/setup.py +++ b/setup.py @@ -40,6 +40,7 @@ def load_module_from_path(module_name, path): ROOT_DIR = os.path.dirname(__file__) +logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) @@ -121,7 +122,7 @@ class CustomBuildExt(build_ext): # "vllm_mindspore.npu_ops" --> "npu_ops" ext_name = ext.name.split('.')[-1] so_name = ext_name + ".so" - print(f"Building {so_name} ...") + logger.info(f"Building {so_name} ...") OPS_DIR = os.path.join(ROOT_DIR, "vllm_mindspore", "ops") BUILD_OPS_DIR = os.path.join(ROOT_DIR, "build", "ops") os.makedirs(BUILD_OPS_DIR, exist_ok=True) @@ -143,12 +144,12 @@ class CustomBuildExt(build_ext): try: # Run the combined 
cmake command - print(f"Running combined CMake commands:\n{cmake_cmd}") + logger.info(f"Running combined CMake commands:\n{cmake_cmd}") result = subprocess.run(cmake_cmd, cwd=self.ROOT_DIR, text=True, shell=True, capture_output=True) if result.returncode != 0: - print("CMake commands failed:") - print(result.stdout) # Print standard output - print(result.stderr) # Print error output + logger.info("CMake commands failed:") + logger.info(result.stdout) # Print standard output + logger.info(result.stderr) # Print error output raise RuntimeError(f"Combined CMake commands failed with exit code {result.returncode}") except subprocess.CalledProcessError as e: raise RuntimeError(f"Failed to build {so_name}: {e}") @@ -160,10 +161,7 @@ class CustomBuildExt(build_ext): if os.path.exists(dst_so_path): os.remove(dst_so_path) shutil.copy(src_so_path, dst_so_path) - print(f"Copied {so_name} to {dst_so_path}") - - -write_commit_id() + logger.info(f"Copied {so_name} to {dst_so_path}") package_data = { "": [ diff --git a/vllm_mindspore/__init__.py b/vllm_mindspore/__init__.py index 032415f0..3aae2091 100644 --- a/vllm_mindspore/__init__.py +++ b/vllm_mindspore/__init__.py @@ -25,14 +25,17 @@ if "vllm" in sys.modules: "vllm import before vllm_mindspore, vllm_mindspore cannot worker right!" ) +# 1. set env before import mindspore. from vllm_mindspore.scripts import env_setup - env_setup() -# should be place on the top of the file. +# 2. replace the inductor_pass module before import vllm. from vllm_mindspore.compilation import inductor_pass as ms_inductor_pass sys.modules.update({"vllm.compilation.inductor_pass": ms_inductor_pass}) +# 3. update the log configuration ahead of other modifications. +import vllm_mindspore.logger + from vllm_mindspore.platforms.ascend import AscendPlatform ascend_platform = AscendPlatform() diff --git a/vllm_mindspore/config.py b/vllm_mindspore/config.py index 8079ca35..0fd6ca23 100644 --- a/vllm_mindspore/config.py +++ b/vllm_mindspore/config.py @@ -30,7 +30,7 @@ from transformers import PretrainedConfig import vllm.envs as envs -from vllm.config import VllmConfig, CompilationConfig, CompilationLevel, logger, _STR_DTYPE_TO_TORCH_DTYPE +from vllm.config import VllmConfig, CompilationConfig, CompilationLevel, _STR_DTYPE_TO_TORCH_DTYPE from vllm.utils import random_uuid from vllm.logger import init_logger from vllm.compilation.inductor_pass import CallableInductorPass, InductorPass @@ -313,7 +313,7 @@ class SocketProcessGroup: self.server_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM) self.server_socket.bind((self.master_ip, self.master_port)) self.server_socket.listen(self.world_size - 1) - print(f"Master node listening on {self.master_ip}:{self.master_port}") + logger.info(f"Master node listening on {self.master_ip}:{self.master_port}") else: # Worker node: connect to the master self.client_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM) @@ -321,11 +321,11 @@ class SocketProcessGroup: while retries < self.max_retries: try: self.client_socket.connect((self.master_ip, self.master_port)) - print(f"Worker {self.rank} connected to master at {self.master_ip}:{self.master_port}") + logger.info(f"Worker {self.rank} connected to master at {self.master_ip}:{self.master_port}") break except ConnectionRefusedError: retries += 1 - print(f"Worker {self.rank} failed to connect to master. Retrying in {self.retry_interval} seconds... ({retries}/{self.max_retries})") + logger.warning(f"Worker {self.rank} failed to connect to master. 
Retrying in {self.retry_interval} seconds... ({retries}/{self.max_retries})") time.sleep(self.retry_interval) else: raise ConnectionError(f"Worker {self.rank} could not connect to master at {self.master_ip}:{self.master_port} after {self.max_retries} retries.") @@ -408,4 +408,4 @@ def stateless_destroy_socket_process_group(dp_group: "SocketProcessGroup") -> No """ if dp_group: dp_group.close() - print(f"Socket process group for rank {dp_group.rank} destroyed.") + logger.info(f"Socket process group for rank {dp_group.rank} destroyed.") diff --git a/vllm_mindspore/logger.py b/vllm_mindspore/logger.py new file mode 100644 index 00000000..fcdbe610 --- /dev/null +++ b/vllm_mindspore/logger.py @@ -0,0 +1,70 @@ +#!/usr/bin/env python3 +# encoding: utf-8 +# Copyright 2025 Huawei Technologies Co., Ltd +# Copyright 2024 The vLLM team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +"""init logger for vllm-mindspore.""" + +from logging.config import dictConfig +import vllm.envs as envs +from vllm.logger import DEFAULT_LOGGING_CONFIG, init_logger + +VLLM_CONFIGURE_LOGGING = envs.VLLM_CONFIGURE_LOGGING +VLLM_LOGGING_CONFIG_PATH = envs.VLLM_LOGGING_CONFIG_PATH +VLLM_LOGGING_LEVEL = envs.VLLM_LOGGING_LEVEL +VLLM_LOGGING_PREFIX = envs.VLLM_LOGGING_PREFIX + +_DATE_FORMAT = "%m-%d %H:%M:%S" +_MS_FORMAT = (f"{VLLM_LOGGING_PREFIX}%(levelname)s %(asctime)s " + "vllm-mindspore[%(filename)s:%(lineno)d] %(message)s") + +_MS_FORMATTERS = { + "vllm_mindspore": { + "class": "vllm.logging_utils.NewLineFormatter", + "datefmt": _DATE_FORMAT, + "format": _MS_FORMAT, + } +} + +_MS_HANDLERS = { + "vllm_mindspore": { + "class": "logging.StreamHandler", + "formatter": "vllm_mindspore", + "level": VLLM_LOGGING_LEVEL, + "stream": "ext://sys.stdout", + } +} + +_MS_LOGGERS = { + "vllm_mindspore": { + "handlers": ["vllm_mindspore"], + "level": "DEBUG", + "propagate": False, + } +} + +def _update_configure_vllm_root_logger() -> None: + if VLLM_CONFIGURE_LOGGING and not VLLM_LOGGING_CONFIG_PATH: + logging_config = DEFAULT_LOGGING_CONFIG + logging_config["formatters"].update(_MS_FORMATTERS) + logging_config["handlers"].update(_MS_HANDLERS) + logging_config["loggers"].update(_MS_LOGGERS) + + dictConfig(logging_config) + +_update_configure_vllm_root_logger() + +logger = init_logger(__name__) +logger.info("The config of vllm-mindspore logger has been updated successfully.") diff --git a/vllm_mindspore/model_executor/models/mf_models/deepseekv3_infer_save_ckpt.py b/vllm_mindspore/model_executor/models/mf_models/deepseekv3_infer_save_ckpt.py index 4b781a8c..81dd8ef3 100644 --- a/vllm_mindspore/model_executor/models/mf_models/deepseekv3_infer_save_ckpt.py +++ b/vllm_mindspore/model_executor/models/mf_models/deepseekv3_infer_save_ckpt.py @@ -17,11 +17,12 @@ import argparse import os from collections import OrderedDict +from vllm.logger import init_logger + import mindspore as ms from mindspore import dtype as msdtype from mindspore.communication.management import 
get_rank from mindformers.core.parallel_config import build_parallel_config -from mindformers.tools.logger import logger from mindformers import MindFormerConfig from mindformers import build_context from research.deepseek3.deepseekv3_infer_parallelism import DeepseekInferParallelism @@ -29,6 +30,8 @@ from research.deepseek3.deepseekv3_infer_parallelism import DeepseekInferParalle from research.deepseek3.deepseek3_config import DeepseekV3Config from research.deepseek3.deepseek3_model_infer import InferenceDeepseekV3ForCausalLM +logger = init_logger(__name__) + # for example # bash scripts/msrun_launcher.sh "python ./infer_save_ckpt_from_safetensor.py # --config /path/to/predict_deepseek_r1_671b.yaml diff --git a/vllm_mindspore/scripts.py b/vllm_mindspore/scripts.py index 3bbfbbc3..ef297418 100644 --- a/vllm_mindspore/scripts.py +++ b/vllm_mindspore/scripts.py @@ -19,6 +19,8 @@ import logging import os +# It's before the vllm import, so vllm.logger cannot be used here. +logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) diff --git a/vllm_mindspore/utils.py b/vllm_mindspore/utils.py index d32b525e..e0fc3b04 100644 --- a/vllm_mindspore/utils.py +++ b/vllm_mindspore/utils.py @@ -18,7 +18,6 @@ import contextlib import gc -import logging import os import sys from typing import ( @@ -39,6 +38,7 @@ if TYPE_CHECKING: else: Library = None +from vllm.logger import init_logger from vllm.utils import T, TORCH_DTYPE_TO_NUMPY_DTYPE, make_ndarray_with_pad import mindspore as ms @@ -50,7 +50,7 @@ from .scripts import env_setup MsKVCache = Tuple[ms.Tensor, ms.Tensor] -logger = logging.getLogger(__name__) +logger = init_logger(__name__) STR_DTYPE_TO_MS_DTYPE = { @@ -263,5 +263,5 @@ def convert_np_to_ms_dtype(value): # Replace the directly loaded module in vllm, such as 'from module import xxx' def update_modules(name, module): - logger.info(f"replace module {name} by {module}") + logger.debug(f"replace module {name} by {module}") sys.modules.update({name: module}) diff --git a/vllm_mindspore/v1/worker/gpu_model_runner.py b/vllm_mindspore/v1/worker/gpu_model_runner.py index 4928b56a..a21a2f73 100644 --- a/vllm_mindspore/v1/worker/gpu_model_runner.py +++ b/vllm_mindspore/v1/worker/gpu_model_runner.py @@ -14,7 +14,6 @@ from vllm_mindspore.utils import get_valid_dtype from vllm.v1.kv_cache_interface import FullAttentionSpec from vllm.v1.utils import bind_kv_cache from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargs -from vllm.logger import logger from vllm.distributed.parallel_state import get_pp_group from vllm.utils import cdiv from vllm.logger import init_logger diff --git a/vllm_mindspore/worker/profile.py b/vllm_mindspore/worker/profile.py index 728362d2..9958ebcb 100644 --- a/vllm_mindspore/worker/profile.py +++ b/vllm_mindspore/worker/profile.py @@ -2,10 +2,14 @@ import os import sys import subprocess +from vllm.logger import init_logger + from mindspore import Profiler from mindspore.profiler import ProfilerLevel, ProfilerActivity, AicoreMetrics from mindspore.profiler.common.profiler_context import ProfilerContext +logger = init_logger(__name__) + PROFILE_ENV_NAME = "VLLM_TORCH_PROFILER_DIR" def shell_analyse(path): @@ -55,7 +59,7 @@ def wrapper_worker_init_device(fun): self = arg[0] profile_output_path = os.getenv(PROFILE_ENV_NAME, "") if profile_output_path: - print(f"Profiling enabled. Traces will be saved to: {profile_output_path}") + logger.info(f"Profiling enabled. 
Traces will be saved to: {profile_output_path}") self.profiler = AdapterProfiler(profile_output_path) else: self.profiler = None -- Gitee From fff8f42c1542916a9d80e88fae5fff2342dde853 Mon Sep 17 00:00:00 2001 From: yangminghai Date: Thu, 10 Apr 2025 15:48:09 +0800 Subject: [PATCH 17/55] add commit info --- setup.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/setup.py b/setup.py index 2f5080a2..b80c7879 100644 --- a/setup.py +++ b/setup.py @@ -163,6 +163,9 @@ class CustomBuildExt(build_ext): shutil.copy(src_so_path, dst_so_path) logger.info(f"Copied {so_name} to {dst_so_path}") + +write_commit_id() + package_data = { "": [ "*.so", -- Gitee From 212a703f74f315b471ae77ac2d300cceb7faeacc Mon Sep 17 00:00:00 2001 From: twc Date: Mon, 28 Apr 2025 16:00:20 +0800 Subject: [PATCH 18/55] adapt to mf ep branch and support ep weight split --- .../models/mf_models/deepseek_v3.py | 26 +- .../mf_models/deepseekv3_weight_processor.py | 657 ++++++++++-------- .../models/mf_models/mf_model_base.py | 21 +- .../mf_models/qwen2_weight_processor.py | 58 +- .../models/mf_models/weight_processor.py | 170 +++-- 5 files changed, 535 insertions(+), 397 deletions(-) diff --git a/vllm_mindspore/model_executor/models/mf_models/deepseek_v3.py b/vllm_mindspore/model_executor/models/mf_models/deepseek_v3.py index fa39e0fb..e2112526 100644 --- a/vllm_mindspore/model_executor/models/mf_models/deepseek_v3.py +++ b/vllm_mindspore/model_executor/models/mf_models/deepseek_v3.py @@ -57,6 +57,19 @@ from vllm_mindspore.model_executor.models.mf_models.deepseekv3_weight_processor logger = init_logger(__name__) +def set_runtime_kernel_launch_group(): + kernel_launch_group = {'thread_num': 2, 'kernel_group_num': 8} + env_kernel_launch_group = os.getenv("EXPERIMENTAL_KERNEL_LAUNCH_GROUP", None) + if env_kernel_launch_group is not None: + pairs = env_kernel_launch_group.split(',') + for pair in pairs: + key, val = pair.split(':') + kernel_launch_group[key] = val + thread_num = int(kernel_launch_group.get('thread_num', 2)) + kernel_group_num = int(kernel_launch_group.get('kernel_group_num', 8)) + ms.runtime.set_kernel_launch_group(thread_num=thread_num, kernel_group_num=kernel_group_num) + + def _get_padding_index(q_seq_len): dp_size = get_dp_group().world_size tp_size = get_tensor_model_parallel_world_size() @@ -85,18 +98,19 @@ def _get_padding_index(q_seq_len): if dp_rank == 0: attn_unpadding_idx = arange_data last_arange_index = arange_data[-1] - ffn_padding_idx= np.pad(attn_unpadding_idx, (0, padding_size - attn_unpadding_idx.shape[0]), - mode='constant', constant_values=0) + ffn_padding_idx = np.pad(attn_unpadding_idx, (0, padding_size - attn_unpadding_idx.shape[0]), + mode='constant', constant_values=0) else: attn_offset_idx = arange_data + padding_size * dp_rank attn_unpadding_idx = np.concatenate((attn_unpadding_idx, attn_offset_idx), axis=0) ffn_offset_idx = arange_data + last_arange_index + 1 last_arange_index = ffn_offset_idx[-1] - ffn_offset_idx_pad_zero = np.pad( + ffn_offset_idx_pad_zero = np.pad( ffn_offset_idx, (0, padding_size - ffn_offset_idx.shape[0]), mode='constant', constant_values=0) ffn_padding_idx = np.concatenate((ffn_padding_idx, ffn_offset_idx_pad_zero), axis=0) return ms.from_numpy(attn_padding_idx), ms.from_numpy(attn_unpadding_idx), ms.from_numpy(ffn_padding_idx), \ - ms.from_numpy(ffn_unpadding_idx) + ms.from_numpy(ffn_unpadding_idx) + class DeepseekV3ForCausalLM(MfModelBase): @@ -123,6 +137,7 @@ class DeepseekV3ForCausalLM(MfModelBase): compilation_config.static_forward_context[str(i)] = 
self.kv_caches[i] self.set_flags = False + set_runtime_kernel_launch_group() def _generate_model_config(self): self.mf_config.load_checkpoint = self.get_model_path() @@ -139,7 +154,8 @@ class DeepseekV3ForCausalLM(MfModelBase): network = DeepseekV3ForCausalLM_MF(self.mf_model_config) # quant - if hasattr(self.mf_model_config, "quantization_config") and hasattr(self.mf_model_config.quantization_config, "quant_method"): + if hasattr(self.mf_model_config, "quantization_config") and hasattr(self.mf_model_config.quantization_config, + "quant_method"): ptq = self.create_ptq(self.mf_model_config.quantization_config.quant_method, PTQMode.DEPLOY) if ptq is not None: ptq.apply(network) diff --git a/vllm_mindspore/model_executor/models/mf_models/deepseekv3_weight_processor.py b/vllm_mindspore/model_executor/models/mf_models/deepseekv3_weight_processor.py index 58a186b6..1c6a99d9 100644 --- a/vllm_mindspore/model_executor/models/mf_models/deepseekv3_weight_processor.py +++ b/vllm_mindspore/model_executor/models/mf_models/deepseekv3_weight_processor.py @@ -29,7 +29,6 @@ from mindformers.experimental.parallel_core.pynative.parallel_state import get_t from vllm_mindspore.model_executor.models.mf_models.weight_processor import BaseWeightProcessor, EPMethod from vllm.logger import init_logger - logger = init_logger(__name__) @@ -63,13 +62,20 @@ class DeepseekV3WeightProcessor(BaseWeightProcessor): super().__init__(config, network, is_quant) self.num_layers = self.config.model.model_config.num_layers self.expert_num = self.config.moe_config.expert_num - self.moe_tensor_parallel = self.config.moe_config.moe_tensor_parallel - self.moe_expert_parallel = self.config.moe_config.moe_expert_parallel - self.ep_method = EPMethod.DEFAULT - if self.dp_group_size > 1 and self.moe_expert_parallel == self.global_group_size: - self.ep_method = EPMethod.ALLTOALL - elif self.dp_group_size > 1: - self.ep_method = EPMethod.ALLGATHER + self.moe_split_tp = self.moe_tp_size > 1 + self.moe_split_ep = self.moe_ep_size > 1 + logger.debug(f"Deepseekv3 weight split info:" + f"global_rank_id: {self.global_rank_id} \n" + f"tp_group_size: {self.tp_group_size} \n" + f"dp_group_size: {self.dp_group_size} \n" + f"tp_rank_id: {self.tp_rank_id} \n" + f"ep_method: {self.ep_method.name} \n" + f"num_router_experts: {self.num_router_experts} \n" + f"ep_group_nums: {self.ep_group_nums} \n" + f"moe_ep_rank_id: {self.moe_ep_rank_id} \n" + f"moe_tp_rank_id: {self.moe_tp_rank_id} \n" + f"moe_ep_size: {self.moe_ep_size} \n" + f"moe_tp_size: {self.moe_tp_size}") def quant_convert_weight_name(self, weight_name: str): """replace quant net weight name""" @@ -140,7 +146,7 @@ class DeepseekV3WeightProcessor(BaseWeightProcessor): weight[..., -qk_rope_head_dim:, :] = np.concatenate([w1, w2], axis=-2) return weight - def infer_quant_process_moe_with_tp(self, src_hf_dir, hf_weight_map, num_router_experts, layer_id): + def infer_quant_process_moe_with_ep(self, src_hf_dir, hf_weight_map, layer_id): w1_list = [] w2_list = [] w3_list = [] @@ -149,17 +155,14 @@ class DeepseekV3WeightProcessor(BaseWeightProcessor): w2_scale_list = [] w3_scale_list = [] - for index in range(0, num_router_experts): + for index in range(self.ep_start, self.ep_stop): w1_hf_name = f"model.layers.{layer_id}.mlp.experts.{index}.gate_proj.weight" w2_hf_name = f"model.layers.{layer_id}.mlp.experts.{index}.down_proj.weight" w3_hf_name = f"model.layers.{layer_id}.mlp.experts.{index}.up_proj.weight" - w1_ms_param, _ = self.get_moe_safetensor_from_file(w1_hf_name, src_hf_dir, hf_weight_map, - 
is_split_param=True, split_axis=0) - w2_ms_param, _ = self.get_moe_safetensor_from_file(w2_hf_name, src_hf_dir, hf_weight_map, - is_split_param=True, split_axis=1) - w3_ms_param, _ = self.get_moe_safetensor_from_file(w3_hf_name, src_hf_dir, hf_weight_map, - is_split_param=True, split_axis=0) + w1_ms_param, _ = self.get_safetensor_from_file(w1_hf_name, src_hf_dir, hf_weight_map) + w2_ms_param, _ = self.get_safetensor_from_file(w2_hf_name, src_hf_dir, hf_weight_map) + w3_ms_param, _ = self.get_safetensor_from_file(w3_hf_name, src_hf_dir, hf_weight_map) w1_list.append(w1_ms_param) w2_list.append(w2_ms_param) @@ -169,11 +172,9 @@ class DeepseekV3WeightProcessor(BaseWeightProcessor): w2_scale_hf_name = f"model.layers.{layer_id}.mlp.experts.{index}.down_proj.weight_scale" w3_scale_hf_name = f"model.layers.{layer_id}.mlp.experts.{index}.up_proj.weight_scale" - w1_scale_ms_param, _ = self.get_moe_safetensor_from_file(w1_scale_hf_name, src_hf_dir, hf_weight_map, - is_split_param=True, split_axis=0) - w2_scale_ms_param, _ = self.get_moe_safetensor_from_file(w2_scale_hf_name, src_hf_dir, hf_weight_map) - w3_scale_ms_param, _ = self.get_moe_safetensor_from_file(w3_scale_hf_name, src_hf_dir, hf_weight_map, - is_split_param=True, split_axis=0) + w1_scale_ms_param, _ = self.get_safetensor_from_file(w1_scale_hf_name, src_hf_dir, hf_weight_map) + w2_scale_ms_param, _ = self.get_safetensor_from_file(w2_scale_hf_name, src_hf_dir, hf_weight_map) + w3_scale_ms_param, _ = self.get_safetensor_from_file(w3_scale_hf_name, src_hf_dir, hf_weight_map) w1_scale_ms_param = w1_scale_ms_param.squeeze(axis=-1) w2_scale_ms_param = w2_scale_ms_param.squeeze(axis=-1) @@ -184,7 +185,7 @@ class DeepseekV3WeightProcessor(BaseWeightProcessor): return w1_list, w2_list, w3_list, w1_scale_list, w2_scale_list, w3_scale_list - def infer_quant_process_moe_with_ep(self, src_hf_dir, hf_weight_map, num_router_experts, layer_id): + def infer_quant_process_moe_with_ep_tp(self, src_hf_dir, hf_weight_map, layer_id): w1_list = [] w2_list = [] w3_list = [] @@ -193,16 +194,17 @@ class DeepseekV3WeightProcessor(BaseWeightProcessor): w2_scale_list = [] w3_scale_list = [] - ep_start = self.moe_ep_rank_id * self.ep_group_nums - ep_stop = (self.moe_ep_rank_id + 1) * self.ep_group_nums - for index in range(ep_start, ep_stop): + for index in range(self.ep_start, self.ep_stop): w1_hf_name = f"model.layers.{layer_id}.mlp.experts.{index}.gate_proj.weight" w2_hf_name = f"model.layers.{layer_id}.mlp.experts.{index}.down_proj.weight" w3_hf_name = f"model.layers.{layer_id}.mlp.experts.{index}.up_proj.weight" - w1_ms_param, _ = self.get_moe_safetensor_from_file(w1_hf_name, src_hf_dir, hf_weight_map) - w2_ms_param, _ = self.get_moe_safetensor_from_file(w2_hf_name, src_hf_dir, hf_weight_map) - w3_ms_param, _ = self.get_moe_safetensor_from_file(w3_hf_name, src_hf_dir, hf_weight_map) + w1_ms_param, _ = self.get_safetensor_from_file_split_moe_tp_group(w1_hf_name, src_hf_dir, hf_weight_map, + split_axis=0) + w2_ms_param, _ = self.get_safetensor_from_file_split_moe_tp_group(w2_hf_name, src_hf_dir, hf_weight_map, + split_axis=1) + w3_ms_param, _ = self.get_safetensor_from_file_split_moe_tp_group(w3_hf_name, src_hf_dir, hf_weight_map, + split_axis=0) w1_list.append(w1_ms_param) w2_list.append(w2_ms_param) @@ -212,9 +214,14 @@ class DeepseekV3WeightProcessor(BaseWeightProcessor): w2_scale_hf_name = f"model.layers.{layer_id}.mlp.experts.{index}.down_proj.weight_scale" w3_scale_hf_name = f"model.layers.{layer_id}.mlp.experts.{index}.up_proj.weight_scale" - 
w1_scale_ms_param, _ = self.get_moe_safetensor_from_file(w1_scale_hf_name, src_hf_dir, hf_weight_map) - w2_scale_ms_param, _ = self.get_moe_safetensor_from_file(w2_scale_hf_name, src_hf_dir, hf_weight_map) - w3_scale_ms_param, _ = self.get_moe_safetensor_from_file(w3_scale_hf_name, src_hf_dir, hf_weight_map) + w1_scale_ms_param, _ = self.get_safetensor_from_file_split_moe_tp_group(w1_scale_hf_name, src_hf_dir, + hf_weight_map, + split_axis=0) + w2_scale_ms_param, _ = self.get_safetensor_from_file(w2_scale_hf_name, src_hf_dir, + hf_weight_map) + w3_scale_ms_param, _ = self.get_safetensor_from_file_split_moe_tp_group(w3_scale_hf_name, src_hf_dir, + hf_weight_map, + split_axis=0) w1_scale_ms_param = w1_scale_ms_param.squeeze(axis=-1) w2_scale_ms_param = w2_scale_ms_param.squeeze(axis=-1) @@ -225,65 +232,15 @@ class DeepseekV3WeightProcessor(BaseWeightProcessor): return w1_list, w2_list, w3_list, w1_scale_list, w2_scale_list, w3_scale_list - def infer_quant_process_moe_with_ep_tp(self, src_hf_dir, hf_weight_map, num_router_experts, layer_id): - w1_list = [] - w2_list = [] - w3_list = [] - - w1_scale_list = [] - w2_scale_list = [] - w3_scale_list = [] - - ep_start = self.moe_ep_rank_id * self.ep_group_nums - ep_stop = (self.moe_ep_rank_id + 1) * self.ep_group_nums - - for index in range(ep_start, ep_stop): - w1_hf_name = f"model.layers.{layer_id}.mlp.experts.{index}.gate_proj.weight" - w2_hf_name = f"model.layers.{layer_id}.mlp.experts.{index}.down_proj.weight" - w3_hf_name = f"model.layers.{layer_id}.mlp.experts.{index}.up_proj.weight" - - w1_ms_param, _ = self.get_moe_safetensor_from_file(w1_hf_name, src_hf_dir, hf_weight_map, - is_split_param=True, split_axis=0) - w2_ms_param, _ = self.get_moe_safetensor_from_file(w2_hf_name, src_hf_dir, hf_weight_map, - is_split_param=True, split_axis=1) - w3_ms_param, _ = self.get_moe_safetensor_from_file(w3_hf_name, src_hf_dir, hf_weight_map, - is_split_param=True, split_axis=0) - - w1_list.append(w1_ms_param) - w2_list.append(w2_ms_param) - w3_list.append(w3_ms_param) - - w1_scale_hf_name = f"model.layers.{layer_id}.mlp.experts.{index}.gate_proj.weight_scale" - w2_scale_hf_name = f"model.layers.{layer_id}.mlp.experts.{index}.down_proj.weight_scale" - w3_scale_hf_name = f"model.layers.{layer_id}.mlp.experts.{index}.up_proj.weight_scale" - - w1_scale_ms_param, _ = self.get_moe_safetensor_from_file(w1_scale_hf_name, src_hf_dir, hf_weight_map, - is_split_param=True, split_axis=0) - w2_scale_ms_param, _ = self.get_moe_safetensor_from_file(w2_scale_hf_name, src_hf_dir, hf_weight_map) - w3_scale_ms_param, _ = self.get_moe_safetensor_from_file(w3_scale_hf_name, src_hf_dir, hf_weight_map, - is_split_param=True, split_axis=0) - - w1_scale_ms_param = w1_scale_ms_param.squeeze(axis=-1) - w2_scale_ms_param = w2_scale_ms_param.squeeze(axis=-1) - w3_scale_ms_param = w3_scale_ms_param.squeeze(axis=-1) - w1_scale_list.append(w1_scale_ms_param) - w2_scale_list.append(w2_scale_ms_param) - w3_scale_list.append(w3_scale_ms_param) - - return w1_list, w2_list, w3_list, w1_scale_list, w2_scale_list, w3_scale_list - - def infer_quant_process_moe(self, src_hf_dir, hf_weight_map, num_router_experts, layer_id): - if self.moe_expert_parallel > 1 and self.moe_tensor_parallel > 1: - return self.infer_quant_process_moe_with_ep_tp(src_hf_dir, hf_weight_map, num_router_experts, layer_id) - elif self.moe_tensor_parallel > 1: - return self.infer_quant_process_moe_with_tp(src_hf_dir, hf_weight_map, num_router_experts, layer_id) + def infer_quant_process_moe(self, src_hf_dir, hf_weight_map, 
layer_id): + if self.moe_tp_size > 1: + return self.infer_quant_process_moe_with_ep_tp(src_hf_dir, hf_weight_map, layer_id) else: - return self.infer_quant_process_moe_with_ep(src_hf_dir, hf_weight_map, num_router_experts, layer_id) + return self.infer_quant_process_moe_with_ep(src_hf_dir, hf_weight_map, layer_id) def infer_quant_process_moe_routed_expert_ffn_weight(self, src_hf_dir, layer_id, hf_weight_map): """process moe router expert weight""" ffn_concat = self.config.model.model_config.ffn_concat - num_router_experts = self.config.moe_config.expert_num # router expert dense router_dense_hf_name = f"model.layers.{layer_id}.mlp.gate.weight" @@ -311,7 +268,7 @@ class DeepseekV3WeightProcessor(BaseWeightProcessor): w3_scale_ms_name = f"model.layers.{layer_id}.feed_forward.routed_experts.ffn.w3._layer.matmul.weight_scale" w1_list, w2_list, w3_list, w1_scale_list, w2_scale_list, w3_scale_list = \ - self.infer_quant_process_moe(src_hf_dir, hf_weight_map, num_router_experts, layer_id) + self.infer_quant_process_moe(src_hf_dir, hf_weight_map, layer_id) w1_ms_stack_param = np.stack(w1_list, axis=0) w2_ms_stack_param = np.stack(w2_list, axis=0) @@ -368,54 +325,57 @@ class DeepseekV3WeightProcessor(BaseWeightProcessor): name=w2_scale_ms_name, requires_grad=False) - def get_moe_shared_expert_split_info(self): - split_num = -1 - rank_id = -1 - if self.ep_method == EPMethod.ALLGATHER: - split_num = self.global_group_size - rank_id = get_rank() + def get_quant_moe_shared_expert_weight(self, w1_hf_name, w2_hf_name, w3_hf_name, w1_scale_hf_name, w2_scale_hf_name, + w3_scale_hf_name, src_hf_dir, hf_weight_map): + if self.ep_method in [EPMethod.DEFAULT, EPMethod.ALLGATHER]: + w1_ms_param, _ = self.get_safetensor_from_file_split_global_group(w1_hf_name, src_hf_dir, hf_weight_map, + split_axis=0) + w2_ms_param, _ = self.get_safetensor_from_file_split_global_group(w2_hf_name, src_hf_dir, hf_weight_map, + split_axis=1) + w3_ms_param, _ = self.get_safetensor_from_file_split_global_group(w3_hf_name, src_hf_dir, hf_weight_map, + split_axis=0) + w1_scale_ms_param, _ = self.get_safetensor_from_file_split_global_group(w1_scale_hf_name, src_hf_dir, + hf_weight_map, split_axis=0) + w2_scale_ms_param, _ = self.get_safetensor_from_file(w2_scale_hf_name, src_hf_dir, hf_weight_map) + + w3_scale_ms_param, _ = self.get_safetensor_from_file_split_global_group(w3_scale_hf_name, src_hf_dir, + hf_weight_map, split_axis=0) elif self.ep_method == EPMethod.ALLTOALL: - split_num = 1 - rank_id = 0 - return split_num, rank_id + w1_ms_param, _ = self.get_safetensor_from_file(w1_hf_name, src_hf_dir, hf_weight_map) + w2_ms_param, _ = self.get_safetensor_from_file(w2_hf_name, src_hf_dir, hf_weight_map) + w3_ms_param, _ = self.get_safetensor_from_file(w3_hf_name, src_hf_dir, hf_weight_map) + + w1_scale_ms_param, _ = self.get_safetensor_from_file(w1_scale_hf_name, src_hf_dir, hf_weight_map) + w2_scale_ms_param, _ = self.get_safetensor_from_file(w2_scale_hf_name, src_hf_dir, hf_weight_map) + w3_scale_ms_param, _ = self.get_safetensor_from_file(w3_scale_hf_name, src_hf_dir, hf_weight_map) + else: + raise ValueError("Unsupported ep_method:{}".format(self.ep_method)) + + return w1_ms_param, w2_ms_param, w3_ms_param, w1_scale_ms_param, w2_scale_ms_param, w3_scale_ms_param def infer_quant_process_moe_shared_expert_ffn_weight(self, src_hf_dir, layer_id, hf_weight_map): """infer quant process moe shared expert ffn weight""" - split_num, rank_id = self.get_moe_shared_expert_split_info() - ffn_concat = self.config.model.model_config.ffn_concat 
w1_hf_name = f"model.layers.{layer_id}.mlp.shared_experts.gate_proj.weight" - w1_ms_name = self.quant_convert_weight_name(w1_hf_name) - w1_ms_param, _ = self.get_safetensor_from_file(w1_hf_name, src_hf_dir, hf_weight_map, is_split_param=True, - split_axis=0, split_num=split_num, rank_id=rank_id) - - w1_scale_hf_name = f"model.layers.{layer_id}.mlp.shared_experts.gate_proj.weight_scale" - w1_scale_ms_name = self.quant_convert_weight_name(w1_scale_hf_name) - w1_scale_ms_param, _ = self.get_safetensor_from_file(w1_scale_hf_name, src_hf_dir, hf_weight_map, - is_split_param=True, split_axis=0, split_num=split_num, - rank_id=rank_id) - w2_hf_name = f"model.layers.{layer_id}.mlp.shared_experts.down_proj.weight" - w2_ms_name = self.quant_convert_weight_name(w2_hf_name) - w2_ms_param, _ = self.get_safetensor_from_file(w2_hf_name, src_hf_dir, hf_weight_map, - is_split_param=True, split_axis=1, split_num=split_num, - rank_id=rank_id) + w3_hf_name = f"model.layers.{layer_id}.mlp.shared_experts.up_proj.weight" + w1_scale_hf_name = f"model.layers.{layer_id}.mlp.shared_experts.gate_proj.weight_scale" w2_scale_hf_name = f"model.layers.{layer_id}.mlp.shared_experts.down_proj.weight_scale" - w2_scale_ms_name = self.quant_convert_weight_name(w2_scale_hf_name) - w2_scale_ms_param, _ = self.get_safetensor_from_file(w2_scale_hf_name, src_hf_dir, hf_weight_map) + w3_scale_hf_name = f"model.layers.{layer_id}.mlp.shared_experts.up_proj.weight_scale" - w3_hf_name = f"model.layers.{layer_id}.mlp.shared_experts.up_proj.weight" + w1_ms_name = self.quant_convert_weight_name(w1_hf_name) + w2_ms_name = self.quant_convert_weight_name(w2_hf_name) w3_ms_name = self.quant_convert_weight_name(w3_hf_name) - w3_ms_param, _ = self.get_safetensor_from_file(w3_hf_name, src_hf_dir, hf_weight_map, - is_split_param=True, split_axis=0, split_num=split_num, - rank_id=rank_id) - w3_scale_hf_name = f"model.layers.{layer_id}.mlp.shared_experts.up_proj.weight_scale" + w1_scale_ms_name = self.quant_convert_weight_name(w1_scale_hf_name) + w2_scale_ms_name = self.quant_convert_weight_name(w2_scale_hf_name) w3_scale_ms_name = self.quant_convert_weight_name(w3_scale_hf_name) - w3_scale_ms_param, _ = self.get_safetensor_from_file(w3_scale_hf_name, src_hf_dir, hf_weight_map, - is_split_param=True, split_axis=0, split_num=split_num, - rank_id=rank_id) + + w1_ms_param, w2_ms_param, w3_ms_param, w1_scale_ms_param, w2_scale_ms_param, w3_scale_ms_param = \ + self.get_quant_moe_shared_expert_weight(w1_hf_name, w2_hf_name, w3_hf_name, w1_scale_hf_name, + w2_scale_hf_name, + w3_scale_hf_name, src_hf_dir, hf_weight_map) w1_scale_ms_param = w1_scale_ms_param.squeeze(axis=-1) w2_scale_ms_param = w2_scale_ms_param.squeeze(axis=-1) @@ -468,20 +428,17 @@ class DeepseekV3WeightProcessor(BaseWeightProcessor): ffn_concat = self.config.model.model_config.ffn_concat w1_hf_name = f"model.layers.{layer_id}.mlp.gate_proj.weight" w1_ms_name = self.quant_convert_weight_name(w1_hf_name) - w1_ms_param, _ = self.get_safetensor_from_file(w1_hf_name, src_hf_dir, hf_weight_map, - is_split_param=True, - split_axis=0) + w1_ms_param, _ = self.get_safetensor_from_file_split_tp_group(w1_hf_name, src_hf_dir, hf_weight_map, + split_axis=0) w1_scale_hf_name = f"model.layers.{layer_id}.mlp.gate_proj.weight_scale" w1_scale_ms_name = self.quant_convert_weight_name(w1_scale_hf_name) - w1_scale_ms_param, _ = self.get_safetensor_from_file(w1_scale_hf_name, src_hf_dir, hf_weight_map, - is_split_param=True, - split_axis=0) + w1_scale_ms_param, _ = 
self.get_safetensor_from_file_split_tp_group(w1_scale_hf_name, src_hf_dir, hf_weight_map, + split_axis=0) w2_hf_name = f"model.layers.{layer_id}.mlp.down_proj.weight" w2_ms_name = self.quant_convert_weight_name(w2_hf_name) - w2_ms_param, _ = self.get_safetensor_from_file(w2_hf_name, src_hf_dir, hf_weight_map, - is_split_param=True, - split_axis=1) + w2_ms_param, _ = self.get_safetensor_from_file_split_tp_group(w2_hf_name, src_hf_dir, hf_weight_map, + split_axis=1) w2_scale_hf_name = f"model.layers.{layer_id}.mlp.down_proj.weight_scale" w2_scale_ms_name = self.quant_convert_weight_name(w2_scale_hf_name) # shape:[7168,1] @@ -489,14 +446,12 @@ class DeepseekV3WeightProcessor(BaseWeightProcessor): w3_hf_name = f"model.layers.{layer_id}.mlp.up_proj.weight" w3_ms_name = self.quant_convert_weight_name(w3_hf_name) - w3_ms_param, _ = self.get_safetensor_from_file(w3_hf_name, src_hf_dir, hf_weight_map, - is_split_param=True, - split_axis=0) + w3_ms_param, _ = self.get_safetensor_from_file_split_tp_group(w3_hf_name, src_hf_dir, hf_weight_map, + split_axis=0) w3_scale_hf_name = f"model.layers.{layer_id}.mlp.up_proj.weight_scale" w3_scale_ms_name = self.quant_convert_weight_name(w3_scale_hf_name) - w3_scale_ms_param, _ = self.get_safetensor_from_file(w3_scale_hf_name, src_hf_dir, hf_weight_map, - is_split_param=True, - split_axis=0) + w3_scale_ms_param, _ = self.get_safetensor_from_file_split_tp_group(w3_scale_hf_name, src_hf_dir, hf_weight_map, + split_axis=0) w1_scale_ms_param = w1_scale_ms_param.squeeze(axis=-1) w2_scale_ms_param = w2_scale_ms_param.squeeze(axis=-1) @@ -561,8 +516,8 @@ class DeepseekV3WeightProcessor(BaseWeightProcessor): lm_head_hf_name = "lm_head.weight" lm_head_ms_name = self.quant_convert_weight_name(lm_head_hf_name) if not self.config.parallel_config.vocab_emb_dp: - np_data, _ = self.get_safetensor_from_file(lm_head_hf_name, src_hf_dir, hf_weight_map, - is_split_param=True, split_axis=0) + np_data, _ = self.get_safetensor_from_file_split_tp_group(lm_head_hf_name, src_hf_dir, hf_weight_map, + split_axis=0) else: np_data, _ = self.get_safetensor_from_file(lm_head_hf_name, src_hf_dir, hf_weight_map) self.parameter_dict[lm_head_ms_name] = ms.Parameter(ms.from_numpy(np_data).astype(ms.bfloat16), @@ -595,7 +550,7 @@ class DeepseekV3WeightProcessor(BaseWeightProcessor): quant_bias_hf_name = f"model.layers.{layer_id}.self_attn." + name + ".quant_bias" quant_bias_ms_name = self.quant_convert_weight_name(quant_bias_hf_name) quant_bias_ms_param, _ = self.get_safetensor_from_file(quant_bias_hf_name, src_hf_dir, hf_weight_map) - if name == "o_proj" and get_tensor_model_parallel_rank() != 0: + if name == "o_proj" and self.tp_rank_id != 0: quant_bias_ms_param.fill(0) dequant_scale_hf_name = f"model.layers.{layer_id}.self_attn." 
+ name + ".deq_scale" @@ -831,7 +786,6 @@ class DeepseekV3WeightProcessor(BaseWeightProcessor): def infer_process_moe_routed_expert_ffn_weight(self, src_hf_dir, layer_id, hf_weight_map): """process moe router expert weight""" ffn_concat = self.config.model.model_config.ffn_concat - num_router_experts = self.config.moe_config.expert_num # router expert dense router_dense_hf_name = f"model.layers.{layer_id}.mlp.gate.weight" @@ -861,18 +815,18 @@ class DeepseekV3WeightProcessor(BaseWeightProcessor): w3_ms_name = f"model.layers.{layer_id}.feed_forward.routed_experts.ffn.w3.weight" w3_ms_name = w3_ms_name if layer_id < self.num_layers else self.convert_mtp_weight_name(w3_ms_name) - for index in range(0, num_router_experts): + for index in range(0, self.num_router_experts): w1_hf_name = f"model.layers.{layer_id}.mlp.experts.{index}.gate_proj.weight" - w1_ms_param, _ = self.get_safetensor_from_file(w1_hf_name, src_hf_dir, hf_weight_map, - is_split_param=True, split_axis=0) + w1_ms_param, _ = self.get_safetensor_from_file_split_tp_group(w1_hf_name, src_hf_dir, hf_weight_map, + split_axis=0) w2_hf_name = f"model.layers.{layer_id}.mlp.experts.{index}.down_proj.weight" - w2_ms_param, _ = self.get_safetensor_from_file(w2_hf_name, src_hf_dir, hf_weight_map, - is_split_param=True, split_axis=1) + w2_ms_param, _ = self.get_safetensor_from_file_split_tp_group(w2_hf_name, src_hf_dir, hf_weight_map, + split_axis=1) w3_hf_name = f"model.layers.{layer_id}.mlp.experts.{index}.up_proj.weight" - w3_ms_param, _ = self.get_safetensor_from_file(w3_hf_name, src_hf_dir, hf_weight_map, - is_split_param=True, split_axis=0) + w3_ms_param, _ = self.get_safetensor_from_file_split_tp_group(w3_hf_name, src_hf_dir, hf_weight_map, + split_axis=0) w1_list.append(w1_ms_param) w2_list.append(w2_ms_param) @@ -907,28 +861,37 @@ class DeepseekV3WeightProcessor(BaseWeightProcessor): name=w2_ms_name, requires_grad=False) + def get_moe_shared_expert_weight(self, w1_hf_name, w2_hf_name, w3_hf_name, src_hf_dir, hf_weight_map): + if self.ep_method == EPMethod.ALLGATHER: + w1_ms_param, _ = self.get_safetensor_from_file_split_global_group(w1_hf_name, src_hf_dir, hf_weight_map, + split_axis=0) + w2_ms_param, _ = self.get_safetensor_from_file_split_global_group(w2_hf_name, src_hf_dir, hf_weight_map, + split_axis=1) + w3_ms_param, _ = self.get_safetensor_from_file_split_global_group(w3_hf_name, src_hf_dir, hf_weight_map, + split_axis=0) + elif self.ep_method == EPMethod.ALLTOALL: + w1_ms_param, _ = self.get_safetensor_from_file(w1_hf_name, src_hf_dir, hf_weight_map) + w2_ms_param, _ = self.get_safetensor_from_file(w2_hf_name, src_hf_dir, hf_weight_map) + w3_ms_param, _ = self.get_safetensor_from_file(w3_hf_name, src_hf_dir, hf_weight_map) + + else: + raise ValueError("Unsupported ep_method:{}".format(self.ep_method)) + + return w1_ms_param, w2_ms_param, w3_ms_param + def infer_process_moe_shared_expert_ffn_weight(self, src_hf_dir, layer_id, hf_weight_map): """infer process moe shared expert ffn weight""" - split_num, rank_id = self.get_moe_shared_expert_split_info() - ffn_concat = self.config.model.model_config.ffn_concat w1_hf_name = f"model.layers.{layer_id}.mlp.shared_experts.gate_proj.weight" - w1_ms_name = self.convert_weight_name(w1_hf_name) - w1_ms_param, _ = self.get_safetensor_from_file(w1_hf_name, src_hf_dir, hf_weight_map, - is_split_param=True, split_axis=0, split_num=split_num, - rank_id=rank_id) - w2_hf_name = f"model.layers.{layer_id}.mlp.shared_experts.down_proj.weight" - w2_ms_name = self.convert_weight_name(w2_hf_name) - 
w2_ms_param, _ = self.get_safetensor_from_file(w2_hf_name, src_hf_dir, hf_weight_map, - is_split_param=True, split_axis=1, split_num=split_num, - rank_id=rank_id) - w3_hf_name = f"model.layers.{layer_id}.mlp.shared_experts.up_proj.weight" + + w1_ms_name = self.convert_weight_name(w1_hf_name) + w2_ms_name = self.convert_weight_name(w2_hf_name) w3_ms_name = self.convert_weight_name(w3_hf_name) - w3_ms_param, _ = self.get_safetensor_from_file(w3_hf_name, src_hf_dir, hf_weight_map, - is_split_param=True, split_axis=0, split_num=split_num, - rank_id=rank_id) + + w1_ms_param, w2_ms_param, w3_ms_param = self.get_moe_shared_expert_weight(w1_hf_name, w2_hf_name, w3_hf_name, + src_hf_dir, hf_weight_map) if ffn_concat: w_gate_hidden_name = f"model.layers.{layer_id}.feed_forward.shared_experts.w_gate_hidden.weight" @@ -957,18 +920,18 @@ class DeepseekV3WeightProcessor(BaseWeightProcessor): w1_hf_name = f"model.layers.{layer_id}.mlp.gate_proj.weight" w1_ms_name = self.convert_weight_name(w1_hf_name) - w1_ms_param, _ = self.get_safetensor_from_file(w1_hf_name, src_hf_dir, hf_weight_map, is_split_param=True, - split_axis=0) + w1_ms_param, _ = self.get_safetensor_from_file_split_tp_group(w1_hf_name, src_hf_dir, hf_weight_map, + split_axis=0) w2_hf_name = f"model.layers.{layer_id}.mlp.down_proj.weight" w2_ms_name = self.convert_weight_name(w2_hf_name) - w2_ms_param, _ = self.get_safetensor_from_file(w2_hf_name, src_hf_dir, hf_weight_map, is_split_param=True, - split_axis=1) + w2_ms_param, _ = self.get_safetensor_from_file_split_tp_group(w2_hf_name, src_hf_dir, hf_weight_map, + split_axis=1) w3_hf_name = f"model.layers.{layer_id}.mlp.up_proj.weight" w3_ms_name = self.convert_weight_name(w3_hf_name) - w3_ms_param, _ = self.get_safetensor_from_file(w3_hf_name, src_hf_dir, hf_weight_map, is_split_param=True, - split_axis=0) + w3_ms_param, _ = self.get_safetensor_from_file_split_tp_group(w3_hf_name, src_hf_dir, hf_weight_map, + split_axis=0) if ffn_concat: w_gate_hidden_name = f"model.layers.{layer_id}.feed_forward.w_gate_hidden.weight" @@ -1118,8 +1081,8 @@ class DeepseekV3WeightProcessor(BaseWeightProcessor): hf_name = f"model.layers.{layer_id}.{prefix_name}" ms_name = self.convert_weight_name(hf_name) if prefix_name in head_names and not self.config.parallel_config.vocab_emb_dp: - ms_param, _ = self.get_safetensor_from_file(hf_name, src_hf_dir, hf_weight_map, - is_split_param=True, split_axis=0) + ms_param, _ = self.get_safetensor_from_file_split_tp_group(hf_name, src_hf_dir, hf_weight_map, + split_axis=0) else: ms_param, _ = self.get_safetensor_from_file(hf_name, src_hf_dir, hf_weight_map) parameter_dict[ms_name] = ms.Parameter(ms.Tensor(ms_param, ms.bfloat16), @@ -1145,45 +1108,49 @@ class DeepseekV3WeightProcessor(BaseWeightProcessor): def smooth_quant_process_route_ffn_weight(self, src_hf_dir, layer_id, hf_weight_map, parameter_dict, layer_type): """smooth_quant_process_route_ffn_weight""" + ffn_concat = self.config.model.model_config.ffn_concat w1_weight_name = f"model.layers.{layer_id}.{layer_type}.w1._layer.weight" - w1_weight_param, _ = self.get_safetensor_from_file(w1_weight_name, src_hf_dir, hf_weight_map, - is_split_param=True, - split_axis=2) - w1_bias_name = f"model.layers.{layer_id}.{layer_type}.w1._layer.matmul.quant_bias" - w1_bias_param, _ = self.get_safetensor_from_file(w1_bias_name, src_hf_dir, hf_weight_map, - is_split_param=True, - split_axis=1) w1_scale_name = f"model.layers.{layer_id}.{layer_type}.w1._layer.matmul.dequant_scale" - w1_scale_param, _ = 
self.get_safetensor_from_file(w1_scale_name, src_hf_dir, hf_weight_map, - is_split_param=True, - split_axis=1) - w1_quant_zp = f"model.layers.{layer_id}.{layer_type}.w1.quant_op.input_zp" w1_quant_scale = f"model.layers.{layer_id}.{layer_type}.w1.quant_op.input_scale" + w3_weight_name = f"model.layers.{layer_id}.{layer_type}.w3._layer.weight" + w3_bias_name = f"model.layers.{layer_id}.{layer_type}.w3._layer.matmul.quant_bias" + w3_scale_name = f"model.layers.{layer_id}.{layer_type}.w3._layer.matmul.dequant_scale" + w3_quant_zp = f"model.layers.{layer_id}.{layer_type}.w3.quant_op.input_zp" + w3_quant_scale = f"model.layers.{layer_id}.{layer_type}.w3.quant_op.input_scale" + w2_weight_name = f"model.layers.{layer_id}.{layer_type}.w2._layer.weight" + w2_scale_name = f"model.layers.{layer_id}.{layer_type}.w2._layer.matmul.weight_scale" + w1_weight_param, _ = self.get_routed_safetensor_3_dim(w1_weight_name, src_hf_dir, hf_weight_map, tp_axis=2, + split_ep=self.moe_split_ep, split_tp=self.moe_split_tp) + + w1_bias_param, _ = self.get_routed_safetensor_2_dim(w1_bias_name, src_hf_dir, hf_weight_map, tp_axis=1, + split_ep=self.moe_split_ep, split_tp=self.moe_split_tp) + + w1_scale_param, _ = self.get_routed_safetensor_2_dim(w1_scale_name, src_hf_dir, hf_weight_map, tp_axis=1, + split_ep=self.moe_split_ep, split_tp=self.moe_split_tp) + w1_quant_zp_param, _ = self.get_safetensor_from_file(w1_quant_zp, src_hf_dir, hf_weight_map) w1_quant_scale_param, _ = self.get_safetensor_from_file(w1_quant_scale, src_hf_dir, hf_weight_map) - w3_weight_name = f"model.layers.{layer_id}.{layer_type}.w3._layer.weight" - w3_weight_param, _ = self.get_safetensor_from_file(w3_weight_name, src_hf_dir, hf_weight_map, - is_split_param=True, - split_axis=2) + w3_weight_param, _ = self.get_routed_safetensor_3_dim(w3_weight_name, src_hf_dir, hf_weight_map, tp_axis=2, + split_ep=self.moe_split_ep, split_tp=self.moe_split_tp) - w3_bias_name = f"model.layers.{layer_id}.{layer_type}.w3._layer.matmul.quant_bias" - w3_bias_param, _ = self.get_safetensor_from_file(w3_bias_name, src_hf_dir, hf_weight_map, - is_split_param=True, - split_axis=1) + w3_bias_param, _ = self.get_routed_safetensor_2_dim(w3_bias_name, src_hf_dir, hf_weight_map, tp_axis=1, + split_ep=self.moe_split_ep, split_tp=self.moe_split_tp) - w3_scale_name = f"model.layers.{layer_id}.{layer_type}.w3._layer.matmul.dequant_scale" - w3_scale_param, _ = self.get_safetensor_from_file(w3_scale_name, src_hf_dir, hf_weight_map, - is_split_param=True, - split_axis=1) + w3_scale_param, _ = self.get_routed_safetensor_2_dim(w3_scale_name, src_hf_dir, hf_weight_map, tp_axis=1, + split_ep=self.moe_split_ep, split_tp=self.moe_split_tp) - w3_quant_zp = f"model.layers.{layer_id}.{layer_type}.w3.quant_op.input_zp" - w3_quant_scale = f"model.layers.{layer_id}.{layer_type}.w3.quant_op.input_scale" w3_quant_zp_param, _ = self.get_safetensor_from_file(w3_quant_zp, src_hf_dir, hf_weight_map) w3_quant_scale_param, _ = self.get_safetensor_from_file(w3_quant_scale, src_hf_dir, hf_weight_map) + + w2_weight_param, _ = self.get_routed_safetensor_3_dim(w2_weight_name, src_hf_dir, hf_weight_map, tp_axis=1, + split_ep=self.moe_split_ep, split_tp=self.moe_split_tp) + w2_scale_param, _ = self.get_routed_safetensor_2_dim(w2_scale_name, src_hf_dir, hf_weight_map, + split_ep=self.moe_split_ep, split_tp=False) + if ffn_concat: concat_weight_name = f"model.layers.{layer_id}.{layer_type}.w_gate_hidden._layer.weight" concat_weight_param = ms.Tensor(np.concatenate([w1_weight_param, w3_weight_param], axis=2), 
dtype=ms.int8) @@ -1237,22 +1204,157 @@ class DeepseekV3WeightProcessor(BaseWeightProcessor): parameter_dict[w3_quant_scale] = ms.Parameter(ms.Tensor(w3_quant_scale_param, ms.bfloat16), name=w3_quant_scale, requires_grad=False) + parameter_dict[w2_weight_name] = ms.Parameter(ms.Tensor(w2_weight_param, ms.int8), name=w2_weight_name, + requires_grad=False) + parameter_dict[w2_scale_name] = ms.Parameter(ms.Tensor(w2_scale_param, ms.bfloat16), + name=w2_scale_name, requires_grad=False) + + def get_smooth_quant_moe_shared_expert_weight(self, w1_weight_name, w1_bias_name, w1_scale_name, w3_weight_name, + w3_bias_name, w3_scale_name, w2_weight_name, src_hf_dir, + hf_weight_map): + + if self.ep_method == EPMethod.ALLGATHER: + w1_weight_param, _ = self.get_safetensor_from_file_split_moe_tp_group(w1_weight_name, src_hf_dir, + hf_weight_map, + split_axis=0) + w1_bias_param, _ = self.get_safetensor_from_file_split_moe_tp_group(w1_bias_name, src_hf_dir, hf_weight_map, + split_axis=0) + + w1_scale_param, _ = self.get_safetensor_from_file_split_moe_tp_group(w1_scale_name, src_hf_dir, + hf_weight_map, + split_axis=0) + + w3_weight_param, _ = self.get_safetensor_from_file_split_moe_tp_group(w3_weight_name, src_hf_dir, + hf_weight_map, + split_axis=0) + w3_bias_param, _ = self.get_safetensor_from_file_split_moe_tp_group(w3_bias_name, src_hf_dir, hf_weight_map, + split_axis=0) + w3_scale_param, _ = self.get_safetensor_from_file_split_moe_tp_group(w3_scale_name, src_hf_dir, + hf_weight_map, + split_axis=0) + + w2_weight_param, _ = self.get_safetensor_from_file_split_moe_tp_group(w2_weight_name, src_hf_dir, + hf_weight_map, + split_axis=1) + elif self.ep_method == EPMethod.ALLTOALL: + w1_weight_param, _ = self.get_safetensor_from_file(w1_weight_name, src_hf_dir, hf_weight_map) + w1_bias_param, _ = self.get_safetensor_from_file(w1_bias_name, src_hf_dir, hf_weight_map) + w1_scale_param, _ = self.get_safetensor_from_file(w1_scale_name, src_hf_dir, hf_weight_map) + + w3_weight_param, _ = self.get_safetensor_from_file(w3_weight_name, src_hf_dir, hf_weight_map) + w3_bias_param, _ = self.get_safetensor_from_file(w3_bias_name, src_hf_dir, hf_weight_map) + w3_scale_param, _ = self.get_safetensor_from_file(w3_scale_name, src_hf_dir, hf_weight_map) + + w2_weight_param, _ = self.get_safetensor_from_file(w2_weight_name, src_hf_dir, hf_weight_map) + else: + raise ValueError("Unsupported ep_method:{}".format(self.ep_method)) + + return w1_weight_param, w1_bias_param, w1_scale_param, w3_weight_param, w3_bias_param, w3_scale_param, w2_weight_param + + def smooth_quant_process_shared_ffn_weight(self, src_hf_dir, layer_id, hf_weight_map, parameter_dict, layer_type): + """smooth_quant_process_shared_ffn_weight""" + + ffn_concat = self.config.model.model_config.ffn_concat + w1_weight_name = f"model.layers.{layer_id}.{layer_type}.w1._layer.weight" + w2_weight_name = f"model.layers.{layer_id}.{layer_type}.w2._layer.weight" + w3_weight_name = f"model.layers.{layer_id}.{layer_type}.w3._layer.weight" + + w1_bias_name = f"model.layers.{layer_id}.{layer_type}.w1._layer.matmul.quant_bias" + w3_bias_name = f"model.layers.{layer_id}.{layer_type}.w3._layer.matmul.quant_bias" + + w1_scale_name = f"model.layers.{layer_id}.{layer_type}.w1._layer.matmul.dequant_scale" + w2_scale_name = f"model.layers.{layer_id}.{layer_type}.w2._layer.matmul.weight_scale" + w3_scale_name = f"model.layers.{layer_id}.{layer_type}.w3._layer.matmul.dequant_scale" + + w1_quant_zp = f"model.layers.{layer_id}.{layer_type}.w1.quant_op.input_zp" + w3_quant_zp = 
f"model.layers.{layer_id}.{layer_type}.w3.quant_op.input_zp" + + w1_quant_scale = f"model.layers.{layer_id}.{layer_type}.w1.quant_op.input_scale" + w3_quant_scale = f"model.layers.{layer_id}.{layer_type}.w3.quant_op.input_scale" + + w1_weight_param, w1_bias_param, w1_scale_param, w3_weight_param, w3_bias_param, w3_scale_param, w2_weight_param = \ + self.get_smooth_quant_moe_shared_expert_weight(w1_weight_name, w1_bias_name, w1_scale_name, w3_weight_name, + w3_bias_name, w3_scale_name, w2_weight_name, src_hf_dir, + hf_weight_map) + + w1_quant_zp_param, _ = self.get_safetensor_from_file(w1_quant_zp, src_hf_dir, hf_weight_map) + w1_quant_scale_param, _ = self.get_safetensor_from_file(w1_quant_scale, src_hf_dir, hf_weight_map) + + w2_scale_param, _ = self.get_safetensor_from_file(w2_scale_name, src_hf_dir, hf_weight_map) + + w3_quant_zp_param, _ = self.get_safetensor_from_file(w3_quant_zp, src_hf_dir, hf_weight_map) + w3_quant_scale_param, _ = self.get_safetensor_from_file(w3_quant_scale, src_hf_dir, hf_weight_map) + if ffn_concat: + concat_weight_name = f"model.layers.{layer_id}.{layer_type}.w_gate_hidden._layer.weight" + concat_weight_param = ms.Tensor(np.concatenate([w1_weight_param, w3_weight_param], axis=0), dtype=ms.int8) + parameter_dict[concat_weight_name] = ms.Parameter(concat_weight_param, name=concat_weight_name, + requires_grad=False) + + concat_bias_name = f"model.layers.{layer_id}.{layer_type}.w_gate_hidden._layer.matmul.quant_bias" + concat_bias_param = ms.Tensor(np.concatenate([w1_bias_param, w3_bias_param], axis=0), dtype=ms.int32) + parameter_dict[concat_bias_name] = ms.Parameter(concat_bias_param, name=concat_bias_name, + requires_grad=False) + + concat_scale_name = f"model.layers.{layer_id}.{layer_type}.w_gate_hidden._layer.matmul.dequant_scale" + concat_scale_param = ms.Tensor(np.concatenate([w1_scale_param, w3_scale_param], axis=0), dtype=ms.float32) + parameter_dict[concat_scale_name] = ms.Parameter(concat_scale_param, name=concat_scale_name, + requires_grad=False) + + concat_quant_zp_name = f"model.layers.{layer_id}.{layer_type}.w_gate_hidden.quant_op.input_zp" + concat_quant_zp_param = ms.Tensor(w1_quant_zp_param, dtype=ms.int8) + parameter_dict[concat_quant_zp_name] = ms.Parameter(concat_quant_zp_param, name=concat_quant_zp_name, + requires_grad=False) + + concat_quant_scale_name = f"model.layers.{layer_id}.{layer_type}.w_gate_hidden.quant_op.input_scale" + concat_quant_scale_param = ms.Tensor(w1_quant_scale_param, dtype=ms.bfloat16) + parameter_dict[concat_quant_scale_name] = ms.Parameter(concat_quant_scale_param, + name=concat_quant_scale_name, + requires_grad=False) + else: + # w1 w3 + parameter_dict[w1_weight_name] = ms.Parameter(ms.Tensor(w1_weight_param, ms.int8), name=w1_weight_name, + requires_grad=False) + parameter_dict[w3_weight_name] = ms.Parameter(ms.Tensor(w3_weight_param, ms.int8), name=w3_weight_name, + requires_grad=False) + + parameter_dict[w1_bias_name] = ms.Parameter(ms.Tensor(w1_bias_param, ms.int32), + name=w1_bias_name, requires_grad=False) + parameter_dict[w3_bias_name] = ms.Parameter(ms.Tensor(w3_bias_param, ms.int32), + name=w3_bias_name, requires_grad=False) + + parameter_dict[w1_scale_name] = ms.Parameter(ms.Tensor(w1_scale_param, ms.float32), + name=w1_scale_name, requires_grad=False) + parameter_dict[w3_scale_name] = ms.Parameter(ms.Tensor(w3_scale_param, ms.float32), + name=w3_scale_name, requires_grad=False) + + parameter_dict[w1_quant_zp] = ms.Parameter(ms.Tensor(w1_quant_zp_param, ms.int8), + name=w1_quant_zp, requires_grad=False) + 
parameter_dict[w3_quant_zp] = ms.Parameter(ms.Tensor(w3_quant_zp_param, ms.int8), + name=w3_quant_zp, requires_grad=False) + + parameter_dict[w1_quant_scale] = ms.Parameter(ms.Tensor(w1_quant_scale_param, ms.bfloat16), + name=w1_quant_scale, requires_grad=False) + parameter_dict[w3_quant_scale] = ms.Parameter(ms.Tensor(w3_quant_scale_param, ms.bfloat16), + name=w3_quant_scale, requires_grad=False) + parameter_dict[w2_weight_name] = ms.Parameter(ms.Tensor(w2_weight_param, ms.int8), name=w2_weight_name, + requires_grad=False) + parameter_dict[w2_scale_name] = ms.Parameter(ms.Tensor(w2_scale_param, ms.bfloat16), + name=w2_scale_name, requires_grad=False) + def smooth_quant_process_ffn_weight(self, src_hf_dir, layer_id, hf_weight_map, parameter_dict, layer_type): """smooth_quant_process_ffn_weight""" ffn_concat = self.config.model.model_config.ffn_concat w1_weight_name = f"model.layers.{layer_id}.{layer_type}.w1._layer.weight" - w1_weight_param, _ = self.get_safetensor_from_file(w1_weight_name, src_hf_dir, hf_weight_map, - is_split_param=True, - split_axis=0) + + w1_weight_param, _ = self.get_safetensor_from_file_split_tp_group(w1_weight_name, src_hf_dir, hf_weight_map, + split_axis=0) w1_bias_name = f"model.layers.{layer_id}.{layer_type}.w1._layer.matmul.quant_bias" - w1_bias_param, _ = self.get_safetensor_from_file(w1_bias_name, src_hf_dir, hf_weight_map, - is_split_param=True, - split_axis=0) + w1_bias_param, _ = self.get_safetensor_from_file_split_tp_group(w1_bias_name, src_hf_dir, hf_weight_map, + split_axis=0) w1_scale_name = f"model.layers.{layer_id}.{layer_type}.w1._layer.matmul.dequant_scale" - w1_scale_param, _ = self.get_safetensor_from_file(w1_scale_name, src_hf_dir, hf_weight_map, - is_split_param=True, - split_axis=0) + w1_scale_param, _ = self.get_safetensor_from_file_split_tp_group(w1_scale_name, src_hf_dir, hf_weight_map, + split_axis=0) w1_quant_zp = f"model.layers.{layer_id}.{layer_type}.w1.quant_op.input_zp" w1_quant_scale = f"model.layers.{layer_id}.{layer_type}.w1.quant_op.input_scale" @@ -1260,17 +1362,20 @@ class DeepseekV3WeightProcessor(BaseWeightProcessor): w1_quant_scale_param, _ = self.get_safetensor_from_file(w1_quant_scale, src_hf_dir, hf_weight_map) w3_weight_name = f"model.layers.{layer_id}.{layer_type}.w3._layer.weight" - w3_weight_param, _ = self.get_safetensor_from_file(w3_weight_name, src_hf_dir, hf_weight_map, - is_split_param=True, - split_axis=0) + + w3_weight_param, _ = self.get_safetensor_from_file_split_tp_group(w3_weight_name, src_hf_dir, hf_weight_map, + split_axis=0) w3_bias_name = f"model.layers.{layer_id}.{layer_type}.w3._layer.matmul.quant_bias" - w3_bias_param, _ = self.get_safetensor_from_file(w3_bias_name, src_hf_dir, hf_weight_map, - is_split_param=True, - split_axis=0) + w3_bias_param, _ = self.get_safetensor_from_file_split_tp_group(w3_bias_name, src_hf_dir, hf_weight_map, + split_axis=0) w3_scale_name = f"model.layers.{layer_id}.{layer_type}.w3._layer.matmul.dequant_scale" - w3_scale_param, _ = self.get_safetensor_from_file(w3_scale_name, src_hf_dir, hf_weight_map, - is_split_param=True, - split_axis=0) + w3_scale_param, _ = self.get_safetensor_from_file_split_tp_group(w3_scale_name, src_hf_dir, hf_weight_map, + split_axis=0) + w2_weight_name = f"model.layers.{layer_id}.{layer_type}.w2._layer.weight" + w2_scale_name = f"model.layers.{layer_id}.{layer_type}.w2._layer.matmul.weight_scale" + w2_weight_param, _ = self.get_safetensor_from_file_split_tp_group(w2_weight_name, src_hf_dir, hf_weight_map, + split_axis=1) + w2_scale_param, _ = 
self.get_safetensor_from_file(w2_scale_name, src_hf_dir, hf_weight_map) w3_quant_zp = f"model.layers.{layer_id}.{layer_type}.w3.quant_op.input_zp" w3_quant_scale = f"model.layers.{layer_id}.{layer_type}.w3.quant_op.input_scale" @@ -1329,6 +1434,11 @@ class DeepseekV3WeightProcessor(BaseWeightProcessor): parameter_dict[w3_quant_scale] = ms.Parameter(ms.Tensor(w3_quant_scale_param, ms.bfloat16), name=w3_quant_scale, requires_grad=False) + parameter_dict[w2_weight_name] = ms.Parameter(ms.Tensor(w2_weight_param, ms.int8), name=w2_weight_name, + requires_grad=False) + parameter_dict[w2_scale_name] = ms.Parameter(ms.Tensor(w2_scale_param, ms.bfloat16), + name=w2_scale_name, requires_grad=False) + def smooth_quant_process_qkv_weight(self, src_hf_dir, layer_id, hf_weight_map, parameter_dict): '''smooth_quant_process_qkv_weight''' qkv_concat = self.config.model.model_config.qkv_concat @@ -1402,13 +1512,13 @@ class DeepseekV3WeightProcessor(BaseWeightProcessor): def infer_smooth_quant_row_linear_split(self, param_name, src_hf_dir, hf_weight_map): '''infer_smooth_quant_row_linear_split''' if param_name.endswith(".weight"): - value, _ = self.get_safetensor_from_file(param_name, src_hf_dir, - hf_weight_map, is_split_param=True, - split_axis=1) + value, _ = self.get_safetensor_from_file_split_tp_group(param_name, src_hf_dir, + hf_weight_map, + split_axis=1) elif "quant_op" in param_name: - value, _ = self.get_safetensor_from_file(param_name, src_hf_dir, - hf_weight_map, is_split_param=True, - split_axis=0) + value, _ = self.get_safetensor_from_file_split_tp_group(param_name, src_hf_dir, + hf_weight_map, + split_axis=0) else: value, _ = self.get_safetensor_from_file(param_name, src_hf_dir, hf_weight_map) @@ -1416,6 +1526,7 @@ class DeepseekV3WeightProcessor(BaseWeightProcessor): if any([name in param_name for name in quant_bias_set_zero]) and \ get_tensor_model_parallel_rank() != 0: value.fill(0) + return value def infer_smooth_quant_get_value(self, param_name, src_hf_dir, hf_weight_map, no_need_split_layer): @@ -1426,28 +1537,21 @@ class DeepseekV3WeightProcessor(BaseWeightProcessor): hf_weight_map) elif any([name in param_name for name in [".l2q_proj."]]): if param_name.endswith(".weight") or "matmul" in param_name: - value, _ = self.get_safetensor_from_file(param_name, src_hf_dir, - hf_weight_map, is_split_param=True, - split_axis=0) + value, _ = self.get_safetensor_from_file_split_tp_group(param_name, src_hf_dir, + hf_weight_map, + split_axis=0) else: value, _ = self.get_safetensor_from_file(param_name, src_hf_dir, hf_weight_map) - elif any([name in param_name for name in [".feed_forward.w2.", ".wo.", "shared_experts.w2"]]): + elif any([name in param_name for name in [".wo."]]): value = self.infer_smooth_quant_row_linear_split(param_name, src_hf_dir, hf_weight_map) - elif ".routed_experts.ffn.w2" in param_name: - if param_name.endswith(".weight"): - value, _ = self.get_safetensor_from_file(param_name, src_hf_dir, hf_weight_map, - is_split_param=True, split_axis=1) - else: - value, _ = self.get_safetensor_from_file(param_name, src_hf_dir, - hf_weight_map) elif any([name in param_name for name in ["lkv2kv_k_nope", "lkv2kv_v"]]): - value, _ = self.get_safetensor_from_file(param_name, src_hf_dir, hf_weight_map, - is_split_param=True, split_axis=0) + value, _ = self.get_safetensor_from_file_split_tp_group(param_name, src_hf_dir, hf_weight_map, + split_axis=0) elif "lm_head" in param_name: if not self.config.parallel_config.vocab_emb_dp: - value, _ = self.get_safetensor_from_file(param_name, src_hf_dir, 
hf_weight_map, - is_split_param=True, split_axis=0) + value, _ = self.get_safetensor_from_file_split_tp_group(param_name, src_hf_dir, hf_weight_map, + split_axis=0) else: value, _ = self.get_safetensor_from_file(param_name, src_hf_dir, hf_weight_map) else: @@ -1465,84 +1569,23 @@ class DeepseekV3WeightProcessor(BaseWeightProcessor): if layer_id >= 3: self.smooth_quant_process_route_ffn_weight(src_hf_dir, layer_id, hf_weight_map, parameter_dict, "feed_forward.routed_experts.ffn") - self.smooth_quant_process_ffn_weight(src_hf_dir, layer_id, hf_weight_map, parameter_dict, - "feed_forward.shared_experts") + self.smooth_quant_process_shared_ffn_weight(src_hf_dir, layer_id, hf_weight_map, parameter_dict, + "feed_forward.shared_experts") else: self.smooth_quant_process_ffn_weight(src_hf_dir, layer_id, hf_weight_map, parameter_dict, "feed_forward") self.smooth_quant_process_qkv_weight(src_hf_dir, layer_id, hf_weight_map, parameter_dict) - skip_layer = ["feed_forward.routed_experts.ffn.w1", "feed_forward.shared_experts.w1", "feed_forward.w1", - "feed_forward.routed_experts.ffn.w3", "feed_forward.shared_experts.w3", "feed_forward.w3", - "feed_forward.routed_experts.ffn.w_gate_hidden", "feed_forward.shared_experts.w_gate_hidden", - "feed_forward.w_gate_hidden", "attention.kv2l", "attention.q2l_proj", "attention.qkv2l"] + skip_layer = ["feed_forward.routed_experts.ffn", "feed_forward.shared_experts", "feed_forward.w", + "attention.kv2l", "attention.q"] for param_name, _ in tqdm(hf_weight_map.items(), desc="remaining params load"): if "model.layers" in param_name and int(param_name.split('.')[2]) >= num_layers: continue - if any([name in param_name for name in no_need_split_layer]): - value, _ = self.get_safetensor_from_file(param_name, src_hf_dir, - hf_weight_map) - elif any([name in param_name for name in [".l2q_proj.", ".feed_forward.w_gate_hidden.", - "shared_experts.w_gate_hidden"]]): - if param_name.endswith(".weight") or "matmul" in param_name: - value, _ = self.get_safetensor_from_file(param_name, src_hf_dir, - hf_weight_map, is_split_param=True, - split_axis=0) - else: - value, _ = self.get_safetensor_from_file(param_name, src_hf_dir, - hf_weight_map) - elif any([name in param_name for name in [".feed_forward.w2.", ".wo.", "shared_experts.w2"]]): - if param_name.endswith(".weight"): - value, _ = self.get_safetensor_from_file(param_name, src_hf_dir, - hf_weight_map, is_split_param=True, - split_axis=1) - elif "quant_op" in param_name: - value, _ = self.get_safetensor_from_file(param_name, src_hf_dir, - hf_weight_map, is_split_param=True, - split_axis=0) - else: - value, _ = self.get_safetensor_from_file(param_name, src_hf_dir, - hf_weight_map) - elif ".routed_experts.ffn.w_gate_hidden." 
in param_name: - if param_name.endswith(".weight"): - value, _ = self.get_safetensor_from_file(param_name, src_hf_dir, hf_weight_map) - value_list = [] - for experts_id in range(value.shape[0]): - value_list.append(self.split_weight_by_rank(value[experts_id, :, :], split_axis=1)) - value = np.stack(value_list, axis=0) - elif "matmul" in param_name: - value, _ = self.get_safetensor_from_file(param_name, src_hf_dir, hf_weight_map) - value_list = [] - for experts_id in range(value.shape[0]): - value_list.append(self.split_weight_by_rank(value[experts_id, :], split_axis=0)) - value = np.stack(value_list, axis=0) - else: - value, _ = self.get_safetensor_from_file(param_name, src_hf_dir, - hf_weight_map) - elif ".routed_experts.ffn.w2" in param_name: - if param_name.endswith(".weight"): - value, _ = self.get_safetensor_from_file(param_name, src_hf_dir, hf_weight_map) - value_list = [] - for experts_id in range(value.shape[0]): - value_list.append(self.split_weight_by_rank(value[experts_id, :, :], split_axis=0)) - value = np.stack(value_list, axis=0) - else: - value, _ = self.get_safetensor_from_file(param_name, src_hf_dir, - hf_weight_map) - elif any([name in param_name for name in ["lkv2kv_k_nope", "lkv2kv_v"]]): - value, _ = self.get_safetensor_from_file(param_name, src_hf_dir, hf_weight_map, - is_split_param=True, split_axis=0) - elif "lm_head" in param_name: - if not self.config.parallel_config.vocab_emb_dp: - value, _ = self.get_safetensor_from_file(param_name, src_hf_dir, hf_weight_map, - is_split_param=True, split_axis=0) - else: - value, _ = self.get_safetensor_from_file(param_name, src_hf_dir, hf_weight_map) - else: - raise ValueError(f"not found layer {param_name}, please check safetensors file.") + if any([name in param_name for name in skip_layer]): + continue value = self.infer_smooth_quant_get_value(param_name, src_hf_dir, hf_weight_map, no_need_split_layer) dst_dtype = convert_np_to_ms_dtype(value) @@ -1551,8 +1594,8 @@ class DeepseekV3WeightProcessor(BaseWeightProcessor): name=param_name, requires_grad=False) param_not_load, ckpt_not_load = ms.load_param_into_net(self.network, parameter_dict) - logger.info("smoothquant param_not_load: %s" % str(param_not_load)) - logger.info("smoothquant ckpt_not_load: %s" % str(ckpt_not_load)) + logger.info(f"smoothquant param_not_load:{param_not_load}") + logger.info(f"smoothquant ckpt_not_load:{ckpt_not_load}") def infer_gptq_quant_net_ms_convert_layer_weight(self, src_hf_dir, num_layers, hf_weight_map): """infer_gptq_quant_net_ms_convert_layer_weight""" @@ -1595,12 +1638,12 @@ class DeepseekV3WeightProcessor(BaseWeightProcessor): value_list.append(self.split_weight_by_rank(value[experts_id, :, :], split_axis=0)) value = np.stack(value_list, axis=0) elif any([name in param_name for name in ["lkv2kv_k_nope", "lkv2kv_v"]]): - value, is_int4 = self.get_safetensor_from_file(param_name, src_hf_dir, hf_weight_map, - is_split_param=True, split_axis=0) + value, is_int4 = self.get_safetensor_from_file_split_tp_group(param_name, src_hf_dir, hf_weight_map, + split_axis=0) elif "lm_head" in param_name: if not self.config.parallel_config.vocab_emb_dp: - value, is_int4 = self.get_safetensor_from_file(param_name, src_hf_dir, hf_weight_map, - is_split_param=True, split_axis=0) + value, is_int4 = self.get_safetensor_from_file_split_tp_group(param_name, src_hf_dir, hf_weight_map, + split_axis=0) else: value, is_int4 = self.get_safetensor_from_file(param_name, src_hf_dir, hf_weight_map) else: diff --git 
a/vllm_mindspore/model_executor/models/mf_models/mf_model_base.py b/vllm_mindspore/model_executor/models/mf_models/mf_model_base.py index 79974f6f..533d16c7 100644 --- a/vllm_mindspore/model_executor/models/mf_models/mf_model_base.py +++ b/vllm_mindspore/model_executor/models/mf_models/mf_model_base.py @@ -46,17 +46,6 @@ from vllm_mindspore.v1.attention.backends.flash_attn import FlashAttentionMetada logger = init_logger(__name__) - -def _pad_to_max(x, max_len): - return x + [-1] * (max_len - len(x)) - - -def _batch_seq(input_tokens, prefill): - if prefill: - return ms.ops.expand_dims(input_tokens, 0).to(ms.int32) - - return ms.mint.reshape(input_tokens, (-1, 1)).to(ms.int32) - class MfModelBase(MsModelBase): def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None: super(MfModelBase, self).__init__( @@ -74,8 +63,10 @@ class MfModelBase(MsModelBase): ) self.mf_config.model.model_config.parallel_config.pipeline_stage = 1 self._generate_model_config() + self.casual_mask = LowerTriangularMask(mf_model_config=self.mf_model_config) self.network, self.lm_head = self._create_network() + affinity_config = self.mf_config.get('context', {}).get('affinity_cpu_list', {}) if isinstance(affinity_config, dict): ms.runtime.set_cpu_affinity(True, affinity_config) @@ -151,8 +142,8 @@ class MfModelBase(MsModelBase): attention_mask = self.casual_mask.gen_attention_mask(is_prefill, position_ids, query_lens) model_inputs = {} - model_inputs["input_ids"] = _batch_seq(input_ids, is_prefill) - model_inputs["batch_valid_length"] = ms.Tensor.from_numpy(np.expand_dims(seq_lens_np, 0)) + model_inputs["input_ids"] = input_ids.astype(ms.int32) + model_inputs["batch_valid_length"] = ms.from_numpy(seq_lens_np) model_inputs["block_tables"] = attn_metadata.block_tables model_inputs["slot_mapping"] = attn_metadata.slot_mapping model_inputs["position_ids"] = position_ids @@ -170,8 +161,8 @@ class MfModelBase(MsModelBase): attention_mask = self.casual_mask.gen_attention_mask(is_prefill, positions, query_lens_np) model_inputs = {} - model_inputs["input_ids"] = _batch_seq(input_ids, is_prefill) - model_inputs["batch_valid_length"] = ms.Tensor(np.expand_dims(attn_metadata.seq_lens_np, 0)) + model_inputs["input_ids"] = input_ids.astype(ms.int32) + model_inputs["batch_valid_length"] = ms.from_numpy(attn_metadata.seq_lens_np) model_inputs["block_tables"] = attn_metadata.block_tables model_inputs["slot_mapping"] = attn_metadata.slot_mapping model_inputs["position_ids"] = positions.to(ms.int32) diff --git a/vllm_mindspore/model_executor/models/mf_models/qwen2_weight_processor.py b/vllm_mindspore/model_executor/models/mf_models/qwen2_weight_processor.py index 59423eca..99b59a72 100644 --- a/vllm_mindspore/model_executor/models/mf_models/qwen2_weight_processor.py +++ b/vllm_mindspore/model_executor/models/mf_models/qwen2_weight_processor.py @@ -25,8 +25,10 @@ from safetensors import safe_open import mindspore as ms from mindspore.communication.management import get_rank -from vllm_mindspore.model_executor.models.mf_models.weight_processor import BaseWeightProcessor +from vllm_mindspore.model_executor.models.mf_models.weight_processor import BaseWeightProcessor, EPMethod +from vllm.logger import init_logger +logger = init_logger(__name__) class Qwen2WeightProcessor(BaseWeightProcessor): r""" @@ -47,8 +49,8 @@ class Qwen2WeightProcessor(BaseWeightProcessor): if self.config.parallel_config.vocab_emb_dp: np_data, _ = self.get_safetensor_from_file(embed_tokens_hf_name, src_hf_dir, hf_weight_map) else: - np_data, _ = 
self.get_safetensor_from_file(embed_tokens_hf_name, src_hf_dir, hf_weight_map, - is_split_param=True, split_axis=0) + np_data, _ = self.get_safetensor_from_file_split_tp_group(embed_tokens_hf_name, src_hf_dir, hf_weight_map, + split_axis=0) self.parameter_dict[embed_tokens_ms_name] = ms.Parameter(ms.from_numpy(np_data).astype(ms.bfloat16), name=embed_tokens_ms_name, requires_grad=False) @@ -64,8 +66,8 @@ class Qwen2WeightProcessor(BaseWeightProcessor): lm_head_ms_name = self.convert_weight_name(lm_head_hf_name) if not self.config.model.model_config.tie_word_embeddings: if not self.config.parallel_config.vocab_emb_dp: - np_data, _ = self.get_safetensor_from_file(lm_head_hf_name, src_hf_dir, hf_weight_map, - is_split_param=True, split_axis=0) + np_data, _ = self.get_safetensor_from_file_split_tp_group(lm_head_hf_name, src_hf_dir, hf_weight_map, + split_axis=0) else: np_data, _ = self.get_safetensor_from_file(lm_head_hf_name, src_hf_dir, hf_weight_map) self.parameter_dict[lm_head_ms_name] = ms.Parameter(ms.from_numpy(np_data).astype(ms.bfloat16), @@ -94,18 +96,18 @@ class Qwen2WeightProcessor(BaseWeightProcessor): ffn_concat = self.config.model.model_config.qkv_concat w1_hf_name = f"model.layers.{layer_id}.mlp.gate_proj.weight" w1_ms_name = self.convert_weight_name(w1_hf_name) - w1_ms_param, _ = self.get_safetensor_from_file(w1_hf_name, src_hf_dir, hf_weight_map, is_split_param=True, - split_axis=0) + w1_ms_param, _ = self.get_safetensor_from_file_split_tp_group(w1_hf_name, src_hf_dir, hf_weight_map, + split_axis=0) w2_hf_name = f"model.layers.{layer_id}.mlp.down_proj.weight" w2_ms_name = self.convert_weight_name(w2_hf_name) - w2_ms_param, _ = self.get_safetensor_from_file(w2_hf_name, src_hf_dir, hf_weight_map, is_split_param=True, - split_axis=1) + w2_ms_param, _ = self.get_safetensor_from_file_split_tp_group(w2_hf_name, src_hf_dir, hf_weight_map, + split_axis=1) w3_hf_name = f"model.layers.{layer_id}.mlp.up_proj.weight" w3_ms_name = self.convert_weight_name(w3_hf_name) - w3_ms_param, _ = self.get_safetensor_from_file(w3_hf_name, src_hf_dir, hf_weight_map, is_split_param=True, - split_axis=0) + w3_ms_param, _ = self.get_safetensor_from_file_split_tp_group(w3_hf_name, src_hf_dir, hf_weight_map, + split_axis=0) if ffn_concat: w_gate_hidden_name = f"model.layers.{layer_id}.feed_forward.w_gate_hidden.weight" @@ -130,38 +132,35 @@ class Qwen2WeightProcessor(BaseWeightProcessor): # wq wq_hf_name = f"model.layers.{layer_id}.self_attn.q_proj.weight" wq_ms_name = self.convert_weight_name(wq_hf_name) - wq_ms_param, _ = self.get_safetensor_from_file(wq_hf_name, src_hf_dir, hf_weight_map, is_split_param=True, - split_axis=0) + wq_ms_param, _ = self.get_safetensor_from_file_split_tp_group(wq_hf_name, src_hf_dir, hf_weight_map, + split_axis=0) # wq bias wq_bias_hf_name = f"model.layers.{layer_id}.self_attn.q_proj.bias" wq_bias_ms_name = self.convert_weight_name(wq_bias_hf_name) - wq_bias_ms_param, _ = self.get_safetensor_from_file(wq_bias_hf_name, src_hf_dir, hf_weight_map, - is_split_param=True, - split_axis=0) + wq_bias_ms_param, _ = self.get_safetensor_from_file_split_tp_group(wq_bias_hf_name, src_hf_dir, hf_weight_map, + split_axis=0) # wk wk_hf_name = f"model.layers.{layer_id}.self_attn.k_proj.weight" wk_ms_name = self.convert_weight_name(wk_hf_name) - wk_ms_param, _ = self.get_safetensor_from_file(wk_hf_name, src_hf_dir, hf_weight_map, is_split_param=True, - split_axis=0) + wk_ms_param, _ = self.get_safetensor_from_file_split_tp_group(wk_hf_name, src_hf_dir, hf_weight_map, + split_axis=0) # wk bias 
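+        # Like the q/k/v weights above, the biases are column-parallel and are
+        # sliced along axis 0 across the tensor-parallel group; the output
+        # projection (wo) further below is row-parallel and is sliced along
+        # axis 1. For example, with tp_group_size=4 a hypothetical [4096, 4096]
+        # q_proj weight leaves each rank with a [1024, 4096] shard.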
wk_bias_hf_name = f"model.layers.{layer_id}.self_attn.k_proj.bias" wk_bias_ms_name = self.convert_weight_name(wk_bias_hf_name) - wk_bias_ms_param, _ = self.get_safetensor_from_file(wk_bias_hf_name, src_hf_dir, hf_weight_map, - is_split_param=True, - split_axis=0) + wk_bias_ms_param, _ = self.get_safetensor_from_file_split_tp_group(wk_bias_hf_name, src_hf_dir, hf_weight_map, + split_axis=0) # wv wv_hf_name = f"model.layers.{layer_id}.self_attn.v_proj.weight" wv_ms_name = self.convert_weight_name(wv_hf_name) - wv_ms_param, _ = self.get_safetensor_from_file(wv_hf_name, src_hf_dir, hf_weight_map, is_split_param=True, - split_axis=0) + wv_ms_param, _ = self.get_safetensor_from_file_split_tp_group(wv_hf_name, src_hf_dir, hf_weight_map, + split_axis=0) # wv bias wv_bias_hf_name = f"model.layers.{layer_id}.self_attn.v_proj.bias" wv_bias_ms_name = self.convert_weight_name(wv_bias_hf_name) - wv_bias_ms_param, _ = self.get_safetensor_from_file(wv_bias_hf_name, src_hf_dir, hf_weight_map, - is_split_param=True, - split_axis=0) + wv_bias_ms_param, _ = self.get_safetensor_from_file_split_tp_group(wv_bias_hf_name, src_hf_dir, hf_weight_map, + split_axis=0) if qkv_concat: w_qkv_name = f"model.layers.{layer_id}.attention.w_qkv.weight" @@ -201,8 +200,8 @@ class Qwen2WeightProcessor(BaseWeightProcessor): # wo wo_hf_name = f"model.layers.{layer_id}.self_attn.o_proj.weight" wo_ms_name = self.convert_weight_name(wo_hf_name) - wo_ms_param, _ = self.get_safetensor_from_file(wo_hf_name, src_hf_dir, hf_weight_map, is_split_param=True, - split_axis=1) + wo_ms_param, _ = self.get_safetensor_from_file_split_tp_group(wo_hf_name, src_hf_dir, hf_weight_map, + split_axis=1) self.parameter_dict[wo_ms_name] = ms.Parameter(ms.from_numpy(wo_ms_param).astype(ms.bfloat16), name=wo_ms_name, requires_grad=False) @@ -262,6 +261,7 @@ class Qwen2WeightProcessor(BaseWeightProcessor): for layer_id in tqdm(range(num_layers), desc="Weight loading", disable=not enable_tqdm): self.infer_convert_layer_weight(src_hf_dir, layer_id, hf_weight_map) - ms.load_param_into_net(self.network, self.parameter_dict) + param_not_load, ckpt_not_load = ms.load_param_into_net(self.network, self.parameter_dict) + logger.info("param_not_load: %s, ckpt_not_load: %s" % (str(param_not_load), str(ckpt_not_load))) del self.parameter_dict gc.collect() diff --git a/vllm_mindspore/model_executor/models/mf_models/weight_processor.py b/vllm_mindspore/model_executor/models/mf_models/weight_processor.py index 696367ec..d96be356 100644 --- a/vllm_mindspore/model_executor/models/mf_models/weight_processor.py +++ b/vllm_mindspore/model_executor/models/mf_models/weight_processor.py @@ -21,9 +21,10 @@ import os from enum import Enum from safetensors import safe_open from mindspore.communication.management import get_rank, get_group_size -from mindformers.experimental.infer.core.utils import get_tp_world_size +from mindformers.experimental.infer.core.utils import get_tp_world_size, get_moe_tp_world_size, get_moe_ep_world_size from mindformers.experimental.parallel_core.pynative.parallel_state import get_data_parallel_world_size + class EPMethod(Enum): """ EP method enums @@ -49,26 +50,21 @@ class BaseWeightProcessor: self.global_group_size = get_group_size() self.tp_group_size = get_tp_world_size() self.dp_group_size = get_data_parallel_world_size() - self.moe_ep_size = self.config.moe_config.moe_expert_parallel - self.moe_tp_size = self.config.moe_config.moe_tensor_parallel + self.num_router_experts = self.config.moe_config.expert_num if hasattr(self.config.moe_config, 
"expert_num") else 1 + self.moe_ep_size = self.config.moe_config.moe_expert_parallel if hasattr(self.config.moe_config, "moe_expert_parallel") else 1 + self.moe_tp_size = self.config.moe_config.moe_tensor_parallel if hasattr(self.config.moe_config, "moe_tensor_parallel") else 1 self.ep_method = EPMethod.DEFAULT + if self.dp_group_size > 1 and self.moe_ep_size == self.global_group_size: + self.ep_method = EPMethod.ALLTOALL + elif self.dp_group_size > 1: + self.ep_method = EPMethod.ALLGATHER self.tp_rank_id = self.global_rank_id % self.tp_group_size - num_router_experts = self.config.moe_config.expert_num - self.ep_group_nums = num_router_experts // self.moe_ep_size + self.ep_group_nums = self.num_router_experts // self.moe_ep_size self.moe_ep_rank_id = self.global_rank_id // self.moe_tp_size self.moe_tp_rank_id = self.global_rank_id % self.moe_tp_size - - print(f"global_rank_id: {self.global_rank_id} \n" - f"tp_group_size: {self.tp_group_size} \n" - f"dp_group_size: {self.dp_group_size} \n" - f"tp_rank_id: {self.tp_rank_id} \n" - f"num_router_experts: {num_router_experts} \n" - f"ep_group_nums: {self.ep_group_nums} \n" - f"moe_ep_rank_id: {self.moe_ep_rank_id} \n" - f"moe_tp_rank_id: {self.moe_tp_rank_id} \n" - f"moe_ep_size: {self.moe_ep_size} \n" - f"moe_tp_size: {self.moe_tp_size}", flush=True) + self.ep_start = self.moe_ep_rank_id * self.ep_group_nums + self.ep_stop = (self.moe_ep_rank_id + 1) * self.ep_group_nums self.parameter_dict = {} self.file_handles = {} @@ -82,16 +78,82 @@ class BaseWeightProcessor: def release_file_handles(self): del self.file_handles - def get_moe_safetensor_from_file(self, hf_param_name, src_hf_dir, hf_weight_map, is_split_param=False, split_axis=0): + def get_safetensor_from_file(self, hf_param_name, src_hf_dir, hf_weight_map): + safetensor_file = hf_weight_map[hf_param_name] + filename = os.path.join(src_hf_dir, safetensor_file) + sf_file = self.get_file_handles(filename) + qint4 = False + if sf_file.metadata() is not None and hf_param_name in sf_file.metadata().keys(): + qint4 = True + + np_data = sf_file.get_tensor(hf_param_name) + return np_data, qint4 + + def get_safetensor_from_file_split_tp_group(self, hf_param_name, src_hf_dir, hf_weight_map, split_axis=0): + safetensor_file = hf_weight_map[hf_param_name] + filename = os.path.join(src_hf_dir, safetensor_file) + sf_file = self.get_file_handles(filename) + qint4 = False + if sf_file.metadata() is not None and hf_param_name in sf_file.metadata().keys(): + qint4 = True + + np_data = sf_file.get_slice(hf_param_name) + shape = np_data.get_shape() + if split_axis == 0: + split_size = shape[0] // self.tp_group_size + start = self.tp_rank_id * split_size + stop = (self.tp_rank_id + 1) * split_size + split_data = np_data[start:stop] + elif split_axis == 1: + split_size = shape[1] // self.tp_group_size + start = self.tp_rank_id * split_size + stop = (self.tp_rank_id + 1) * split_size + split_data = np_data[:, start:stop] + elif split_axis == 2: + split_size = shape[2] // self.tp_group_size + start = self.tp_rank_id * split_size + stop = (self.tp_rank_id + 1) * split_size + split_data = np_data[:, :, start:stop] + else: + raise ValueError("split_axis:{} is not supported.".format(split_axis)) + return split_data, qint4 + + def get_safetensor_from_file_split_global_group(self, hf_param_name, src_hf_dir, hf_weight_map, split_axis=0): + safetensor_file = hf_weight_map[hf_param_name] + filename = os.path.join(src_hf_dir, safetensor_file) + sf_file = self.get_file_handles(filename) + qint4 = False + if 
sf_file.metadata() is not None and hf_param_name in sf_file.metadata().keys(): + qint4 = True + + np_data = sf_file.get_slice(hf_param_name) + shape = np_data.get_shape() + if split_axis == 0: + split_size = shape[0] // self.global_group_size + start = self.global_rank_id * split_size + stop = (self.global_rank_id + 1) * split_size + split_data = np_data[start:stop] + elif split_axis == 1: + split_size = shape[1] // self.global_group_size + start = self.global_rank_id * split_size + stop = (self.global_rank_id + 1) * split_size + split_data = np_data[:, start:stop] + elif split_axis == 2: + split_size = shape[2] // self.global_group_size + start = self.global_rank_id * split_size + stop = (self.global_rank_id + 1) * split_size + split_data = np_data[:, :, start:stop] + else: + raise ValueError("split_axis:{} is not supported.".format(split_axis)) + return split_data, qint4 + + def get_safetensor_from_file_split_moe_tp_group(self, hf_param_name, src_hf_dir, hf_weight_map, split_axis=0): safetensor_file = hf_weight_map[hf_param_name] filename = os.path.join(src_hf_dir, safetensor_file) sf_file = self.get_file_handles(filename) qint4 = False if sf_file.metadata() is not None and hf_param_name in sf_file.metadata().keys(): qint4 = True - if not is_split_param or self.moe_tp_size == 1: - np_data = sf_file.get_tensor(hf_param_name) - return np_data, qint4 np_data = sf_file.get_slice(hf_param_name) shape = np_data.get_shape() @@ -109,39 +171,65 @@ class BaseWeightProcessor: raise ValueError("split_axis:{} is not supported.".format(split_axis)) return split_data, qint4 - def get_safetensor_from_file(self, hf_param_name, src_hf_dir, hf_weight_map, is_split_param=False, split_axis=0, - split_num=-1, rank_id=-1): - rank_id = rank_id if rank_id != -1 else self.tp_rank_id - split_num = split_num if split_num != -1 else self.tp_group_size + def get_routed_safetensor_3_dim(self, hf_param_name, src_hf_dir, hf_weight_map, split_ep=False, split_tp=False, + tp_axis=-1): + '''get_routed_safetensor_3_dim''' safetensor_file = hf_weight_map[hf_param_name] filename = os.path.join(src_hf_dir, safetensor_file) sf_file = self.get_file_handles(filename) qint4 = False if sf_file.metadata() is not None and hf_param_name in sf_file.metadata().keys(): qint4 = True - if not is_split_param or split_num == 1: + if not split_tp and not split_ep: np_data = sf_file.get_tensor(hf_param_name) return np_data, qint4 np_data = sf_file.get_slice(hf_param_name) + if not split_tp and split_ep: + split_data = np_data[self.ep_start:self.ep_stop, :, :] + return split_data, qint4 + shape = np_data.get_shape() - if split_axis == 0: - split_size = shape[0] // split_num - start = rank_id * split_size - stop = (rank_id + 1) * split_size - split_data = np_data[start:stop] - elif split_axis == 1: - split_size = shape[1] // split_num - start = rank_id * split_size - stop = (rank_id + 1) * split_size - split_data = np_data[:, start:stop] - elif split_axis == 2: - split_size = shape[2] // self.tp_group_size - start = self.rank_id * split_size - stop = (self.rank_id + 1) * split_size - split_data = np_data[:, :, start:stop] + if tp_axis == 1: + split_size = shape[1] // self.moe_tp_size + start = self.moe_tp_rank_id * split_size + stop = (self.moe_tp_rank_id + 1) * split_size + split_data = np_data[self.ep_start:self.ep_stop, start:stop, :] if split_ep else np_data[:, start:stop, :] + elif tp_axis == 2: + split_size = shape[2] // self.moe_tp_size + start = self.moe_tp_rank_id * split_size + stop = (self.moe_tp_rank_id + 1) * split_size + split_data 
= np_data[self.ep_start:self.ep_stop, :, start:stop] if split_ep else np_data[:, :, start:stop] else: - raise ValueError("split_axis:{} is not supported.".format(split_axis)) + raise ValueError("tp_axis:{} is not supported.".format(tp_axis)) + return split_data, qint4 + + def get_routed_safetensor_2_dim(self, hf_param_name, src_hf_dir, hf_weight_map, split_ep=False, split_tp=False, + tp_axis=-1): + '''get_moe_routed_safetensor_2_dim''' + safetensor_file = hf_weight_map[hf_param_name] + filename = os.path.join(src_hf_dir, safetensor_file) + sf_file = self.get_file_handles(filename) + qint4 = False + if sf_file.metadata() is not None and hf_param_name in sf_file.metadata().keys(): + qint4 = True + if not split_tp and not split_ep: + np_data = sf_file.get_tensor(hf_param_name) + return np_data, qint4 + + np_data = sf_file.get_slice(hf_param_name) + if not split_tp and split_ep: + split_data = np_data[self.ep_start:self.ep_stop, :] + return split_data, qint4 + + shape = np_data.get_shape() + if tp_axis == 1: + split_size = shape[1] // self.moe_tp_size + start = self.moe_tp_rank_id * split_size + stop = (self.moe_tp_rank_id + 1) * split_size + split_data = np_data[self.ep_start:self.ep_stop, start:stop] if split_ep else np_data[:, start:stop] + else: + raise ValueError("split_tp is True but tp_axis:{} is not supported.".format(tp_axis)) return split_data, qint4 def split_weight_by_rank(self, weight, split_axis=0): -- Gitee From 1a612d7220b9210d9fec51f62893ae2d772648a8 Mon Sep 17 00:00:00 2001 From: Erpim Date: Mon, 28 Apr 2025 15:32:53 +0800 Subject: [PATCH 19/55] opt get_padding_idx --- .../model_executor/models/mf_models/deepseek_v3.py | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/vllm_mindspore/model_executor/models/mf_models/deepseek_v3.py b/vllm_mindspore/model_executor/models/mf_models/deepseek_v3.py index e2112526..120bf5f3 100644 --- a/vllm_mindspore/model_executor/models/mf_models/deepseek_v3.py +++ b/vllm_mindspore/model_executor/models/mf_models/deepseek_v3.py @@ -92,22 +92,20 @@ def _get_padding_index(q_seq_len): arange_data = np.arange(0, int(tokens_length), dtype=np.int32) if dp_rank == dp_rank_id: ffn_unpadding_idx = arange_data - attn_padding_idx = np.pad( - arange_data, (0, padding_size - arange_data.shape[0]), mode='constant', constant_values=0) - + pad = np.zeros(padding_size - arange_data.shape[0], dtype=np.int32) + attn_padding_idx = np.concatenate((arange_data, pad), axis=0) if dp_rank == 0: attn_unpadding_idx = arange_data last_arange_index = arange_data[-1] - ffn_padding_idx = np.pad(attn_unpadding_idx, (0, padding_size - attn_unpadding_idx.shape[0]), - mode='constant', constant_values=0) + pad = np.zeros(padding_size - attn_unpadding_idx.shape[0], dtype=np.int32) + ffn_padding_idx = np.concatenate((attn_unpadding_idx, pad), axis=0) else: attn_offset_idx = arange_data + padding_size * dp_rank attn_unpadding_idx = np.concatenate((attn_unpadding_idx, attn_offset_idx), axis=0) ffn_offset_idx = arange_data + last_arange_index + 1 last_arange_index = ffn_offset_idx[-1] - ffn_offset_idx_pad_zero = np.pad( - ffn_offset_idx, (0, padding_size - ffn_offset_idx.shape[0]), mode='constant', constant_values=0) - ffn_padding_idx = np.concatenate((ffn_padding_idx, ffn_offset_idx_pad_zero), axis=0) + pad = np.zeros(padding_size - ffn_offset_idx.shape[0], dtype=np.int32) + ffn_padding_idx = np.concatenate((ffn_padding_idx, ffn_offset_idx, pad), axis=0) return ms.from_numpy(attn_padding_idx), ms.from_numpy(attn_unpadding_idx), 
ms.from_numpy(ffn_padding_idx), \ ms.from_numpy(ffn_unpadding_idx) -- Gitee From d3b7d62e4d3a3dee8a8608e2d31819065d6340b1 Mon Sep 17 00:00:00 2001 From: tronzhang Date: Mon, 28 Apr 2025 19:48:20 +0800 Subject: [PATCH 20/55] add exception log for dp --- vllm_dp/dp_scale_out.patch | 41 ++++++++++++++++++++++++++++---------- 1 file changed, 31 insertions(+), 10 deletions(-) mode change 100755 => 100644 vllm_dp/dp_scale_out.patch diff --git a/vllm_dp/dp_scale_out.patch b/vllm_dp/dp_scale_out.patch old mode 100755 new mode 100644 index fb6520d2..db5d145c --- a/vllm_dp/dp_scale_out.patch +++ b/vllm_dp/dp_scale_out.patch @@ -1,5 +1,5 @@ diff --git a/vllm/config.py b/vllm/config.py -index bd52fc90b..24fc1154d 100644 +index bd52fc9..24fc115 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -1429,16 +1429,27 @@ class LoadConfig: @@ -90,7 +90,7 @@ index bd52fc90b..24fc1154d 100644 import os os.environ["VLLM_ENABLE_V1_MULTIPROCESSING"] = "0" diff --git a/vllm/distributed/utils.py b/vllm/distributed/utils.py -index cae1a2551..2bdcdf4f1 100644 +index cae1a25..2bdcdf4 100644 --- a/vllm/distributed/utils.py +++ b/vllm/distributed/utils.py @@ -21,6 +21,7 @@ from torch.distributed.rendezvous import rendezvous @@ -130,7 +130,7 @@ index cae1a2551..2bdcdf4f1 100644 pg._register_backend(device, backend_type, backend_class) diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py -index 89c9b6747..d6173763d 100644 +index 89c9b67..d617376 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -113,11 +113,14 @@ class EngineArgs: @@ -207,7 +207,7 @@ index 89c9b6747..d6173763d 100644 max_parallel_loading_workers=self.max_parallel_loading_workers, disable_custom_all_reduce=self.disable_custom_all_reduce, diff --git a/vllm/entrypoints/cli/serve.py b/vllm/entrypoints/cli/serve.py -index e89ac4e21..ffcc2bb10 100644 +index e89ac4e..ffcc2bb 100644 --- a/vllm/entrypoints/cli/serve.py +++ b/vllm/entrypoints/cli/serve.py @@ -1,14 +1,24 @@ @@ -324,7 +324,7 @@ index e89ac4e21..ffcc2bb10 100644 + logger.info("Shutting down.") + engine_manager.close() diff --git a/vllm/forward_context.py b/vllm/forward_context.py -index e195a03c5..e5b9fd5fa 100644 +index e195a03..e5b9fd5 100644 --- a/vllm/forward_context.py +++ b/vllm/forward_context.py @@ -77,7 +77,8 @@ def set_forward_context(attn_metadata: Any, @@ -338,7 +338,7 @@ index e195a03c5..e5b9fd5fa 100644 batchsize = num_tokens num_tokens_across_dp = [0] * dp_size diff --git a/vllm/utils.py b/vllm/utils.py -index 5f32f8cb6..d38d02586 100644 +index 5f32f8c..d38d025 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -551,6 +551,10 @@ def is_valid_ipv6_address(address: str) -> bool: @@ -417,7 +417,7 @@ index 5f32f8cb6..d38d02586 100644 logger.debug("Got Keyboard Interrupt.") diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py -index 39caca0c2..32902a8b1 100644 +index 39caca0..32902a8 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -23,7 +23,7 @@ from vllm.lora.request import LoRARequest @@ -714,7 +714,7 @@ index 39caca0c2..32902a8b1 100644 super().shutdown() if dp_group := getattr(self, "dp_group", None): diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py -index e948e59b8..e443f45db 100644 +index e948e59..e443f45 100644 --- a/vllm/v1/engine/core_client.py +++ b/vllm/v1/engine/core_client.py @@ -8,26 +8,29 @@ import threading @@ -1228,8 +1228,26 @@ index e948e59b8..e443f45db 100644 - self.encoder.encode(request_ids))) + await self._send_input(EngineCoreRequestType.ABORT, request_ids, + engine) +diff --git 
a/vllm/v1/executor/multiproc_executor.py b/vllm/v1/executor/multiproc_executor.py +index 1d5175e..40efd27 100644 +--- a/vllm/v1/executor/multiproc_executor.py ++++ b/vllm/v1/executor/multiproc_executor.py +@@ -327,6 +327,13 @@ class WorkerProc: + logger.debug("Worker interrupted.") + + except Exception: ++ # Print exception and details. ++ import sys ++ import traceback ++ exec_type, exec_value, exec_traceback = sys.exc_info() ++ exception_str = "".join(traceback.format_exception(exec_type, exec_value, exec_traceback)) ++ logger.error("WorkerProc failed! %s" % exception_str) ++ + # worker_busy_loop sends exceptions exceptons to Executor + # for shutdown, but if there is an error in startup or an + # error with IPC itself, we need to alert the parent. diff --git a/vllm/v1/serial_utils.py b/vllm/v1/serial_utils.py -index 146d7d747..7c1d48494 100644 +index 146d7d7..7c1d484 100644 --- a/vllm/v1/serial_utils.py +++ b/vllm/v1/serial_utils.py @@ -2,9 +2,10 @@ @@ -1253,7 +1271,7 @@ index 146d7d747..7c1d48494 100644 class MsgpackEncoder: """Encoder with custom torch tensor serialization.""" diff --git a/vllm/v1/utils.py b/vllm/v1/utils.py -index f42b3501a..fbc0ee340 100644 +index f42b350..fbc0ee3 100644 --- a/vllm/v1/utils.py +++ b/vllm/v1/utils.py @@ -2,17 +2,21 @@ @@ -1414,3 +1432,6 @@ index f42b3501a..fbc0ee340 100644 if os and os.path.exists(socket_file): os.remove(socket_file) +-- +2.43.0 + -- Gitee From 3825f64f71b8b28c0b566eb87ec5d56622311224 Mon Sep 17 00:00:00 2001 From: twc Date: Tue, 29 Apr 2025 20:36:28 +0800 Subject: [PATCH 21/55] weight processor bug fix --- .../model_executor/models/mf_models/weight_processor.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/vllm_mindspore/model_executor/models/mf_models/weight_processor.py b/vllm_mindspore/model_executor/models/mf_models/weight_processor.py index d96be356..c70302f2 100644 --- a/vllm_mindspore/model_executor/models/mf_models/weight_processor.py +++ b/vllm_mindspore/model_executor/models/mf_models/weight_processor.py @@ -50,9 +50,9 @@ class BaseWeightProcessor: self.global_group_size = get_group_size() self.tp_group_size = get_tp_world_size() self.dp_group_size = get_data_parallel_world_size() - self.num_router_experts = self.config.moe_config.expert_num if hasattr(self.config.moe_config, "expert_num") else 1 - self.moe_ep_size = self.config.moe_config.moe_expert_parallel if hasattr(self.config.moe_config, "moe_expert_parallel") else 1 - self.moe_tp_size = self.config.moe_config.moe_tensor_parallel if hasattr(self.config.moe_config, "moe_tensor_parallel") else 1 + self.num_router_experts = self.config.moe_config.expert_num if self.config.moe_config.expert_num else 1 + self.moe_ep_size = self.config.moe_config.moe_expert_parallel if self.config.moe_config.moe_expert_parallel else 1 + self.moe_tp_size = self.config.moe_config.moe_tensor_parallel if self.config.moe_config.moe_tensor_parallel else 1 self.ep_method = EPMethod.DEFAULT if self.dp_group_size > 1 and self.moe_ep_size == self.global_group_size: self.ep_method = EPMethod.ALLTOALL -- Gitee From bc448f417b98d1948e8e05ca1ae0df0a23459246 Mon Sep 17 00:00:00 2001 From: jiahaochen666 Date: Wed, 30 Apr 2025 06:48:18 +0000 Subject: [PATCH 22/55] adding v0-style prefix-first scheduler --- vllm_mindspore/__init__.py | 4 + vllm_mindspore/v1/core/__init__.py | 0 vllm_mindspore/v1/core/sched/scheduler.py | 319 ++++++++++++++++++++++ 3 files changed, 323 insertions(+) create mode 100644 vllm_mindspore/v1/core/__init__.py create mode 100644 
vllm_mindspore/v1/core/sched/scheduler.py diff --git a/vllm_mindspore/__init__.py b/vllm_mindspore/__init__.py index 3aae2091..28f8469d 100644 --- a/vllm_mindspore/__init__.py +++ b/vllm_mindspore/__init__.py @@ -309,6 +309,10 @@ from vllm_mindspore.v1.worker.gpu_worker import compile_or_warm_up_model from vllm.v1.worker.gpu_worker import Worker Worker.compile_or_warm_up_model = compile_or_warm_up_model +from vllm_mindspore.v1.core.sched.scheduler import schedule +from vllm.v1.core.sched.scheduler import Scheduler +Scheduler.schedule = schedule + from .utils import check_ready from vllm_mindspore.engine.multiprocessing.engine import cleanup diff --git a/vllm_mindspore/v1/core/__init__.py b/vllm_mindspore/v1/core/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/vllm_mindspore/v1/core/sched/scheduler.py b/vllm_mindspore/v1/core/sched/scheduler.py new file mode 100644 index 00000000..c03f3469 --- /dev/null +++ b/vllm_mindspore/v1/core/sched/scheduler.py @@ -0,0 +1,319 @@ +#!/usr/bin/env python3 +# encoding: utf-8 +# Copyright 2025 Huawei Technologies Co., Ltd +# Copyright 2024 The vLLM team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# This file is based on vllm-ascend project. +# ============================================================================ + +from collections import deque + +import time +from vllm.logger import init_logger +from vllm.utils import cdiv +from vllm.v1.core.sched.output import NewRequestData, SchedulerOutput +from vllm.v1.core.sched.scheduler import Scheduler +from vllm.v1.request import Request, RequestStatus + +logger = init_logger(__name__) + +def _check_watermark_for_prefill(self, + request, + num_new_tokens, + computed_blocks, + watermark=0.01): + computed_blocks = computed_blocks or [] + watermark_blocks = self.kv_cache_config.num_blocks * watermark + num_computed_tokens = (request.num_computed_tokens + + len(computed_blocks) * self.block_size) + num_required_blocks = cdiv(num_new_tokens + num_computed_tokens, + self.block_size) + req_blocks = self.kv_cache_manager.req_to_blocks[request.request_id] + num_new_blocks = (num_required_blocks - len(req_blocks) - + len(computed_blocks)) + num_evictable_computed_blocks = sum(1 for blk in computed_blocks + if blk.ref_cnt == 0) + # If number of free blocks is less than water mark after allocating, don't allocate. + if (self.kv_cache_manager.block_pool.get_num_free_blocks() - + num_evictable_computed_blocks - + num_new_blocks) < watermark_blocks: + return False + return True + +def _get_prompt_limit(self, request: Request) -> int: + if (self.scheduler_config.chunked_prefill_enabled + and not self.scheduler_config.is_multi_step): + prompt_limit = self.scheduler_config.max_model_len + else: + prompt_limit = min( + self.scheduler_config.max_model_len, + self.scheduler_config.max_num_batched_tokens, + ) + + # Model is fine tuned with long context. Return the fine tuned max_len. 
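+    # A long-lora adapter carries its own context length, which is at least as
+    # large as the limit computed above, so it takes precedence for such
+    # requests (the assert below guards that invariant).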
+ if request.lora_request and request.lora_request.long_lora_max_len: + assert prompt_limit <= request.lora_request.long_lora_max_len + return request.lora_request.long_lora_max_len + else: + return prompt_limit + +def schedule(self) -> SchedulerOutput: + scheduled_new_reqs: list[Request] = [] + scheduled_resumed_reqs: list[Request] = [] + scheduled_running_reqs: list[Request] = [] + preempted_reqs: list[Request] = [] + + req_to_new_block_ids: dict[str, list[int]] = {} + num_scheduled_tokens: dict[str, int] = {} + token_budget = self.max_num_scheduled_tokens + # Spec decode-related. + scheduled_spec_decode_tokens: dict[str, list[int]] = {} + + # Record scheduled LoRA requests. + scheduled_loras: set[int] = set() + + # Use a temporary deque to collect requests that need to be skipped + # and put back at the head of the waiting queue later + skipped_waiting_requests: deque[Request] = deque() + + # Schedule prefill requests first. + while self.waiting and token_budget > 0: + if len(scheduled_new_reqs) == self.max_num_running_reqs: + break + + request = self.waiting[0] + + def skip_cur_request(): + self.waiting.popleft() + skipped_waiting_requests.appendleft(request) + + # Check that adding the request still respects the max_loras + # constraint. + if (self.lora_config and request.lora_request and + (len(scheduled_loras) == self.lora_config.max_loras + and request.lora_request.lora_int_id not in scheduled_loras)): + # Scheduling would exceed max_loras, skip. + skip_cur_request() + continue + + prompt_limit = _get_prompt_limit(self, request) + # Get already-cached tokens. + computed_blocks, num_computed_tokens = ( + self.kv_cache_manager.get_computed_blocks(request)) + num_new_tokens = request.num_prompt_tokens - num_computed_tokens + if (0 < self.scheduler_config.long_prefill_token_threshold < + num_new_tokens): + num_new_tokens = ( + self.scheduler_config.long_prefill_token_threshold) + max_tokens_in_kvcache = (self.kv_cache_config.num_blocks * + self.block_size) + prompt_limit = min(prompt_limit, max_tokens_in_kvcache) + + # Finish request that exceeds prompt_limit or kv cache size. + if num_new_tokens > prompt_limit: + logger.warning( + "Input prompt (%d tokens) is too long" + " and exceeds limit of %d", + num_new_tokens, + prompt_limit, + ) + request.status = RequestStatus.FINISHED_IGNORED + self.finished_req_ids.add(request.request_id) # type: ignore + self.waiting.popleft() + continue + + if num_new_tokens > token_budget: + # Scheduling would exceed token_budget, skip. + skip_cur_request() + continue + + assert num_new_tokens > 0 + watermark = getattr(self.scheduler_config, "watermark", 0.01) + if not _check_watermark_for_prefill(self, + request, num_new_tokens, computed_blocks, watermark): + # Scheduling would exceed watermark, skip. + skip_cur_request() + continue + + new_blocks = self.kv_cache_manager.allocate_slots( + request, num_new_tokens, computed_blocks) + if new_blocks is None: + # The request cannot be scheduled. + break + + self.waiting.popleft() + self.running.append(request) + self.scheduled_req_ids.add(request.request_id) + # Check request status. 
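+            # A WAITING request is entering the running set for the first time
+            # and is reported as a new request; a PREEMPTED request ran before,
+            # lost its KV cache blocks when it was preempted, and is reported
+            # as resumed so the model runner can rebuild its state.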
+ if request.status == RequestStatus.WAITING: + scheduled_new_reqs.append(request) + elif request.status == RequestStatus.PREEMPTED: + scheduled_resumed_reqs.append(request) + else: + raise RuntimeError(f"Invalid request status: {request.status}") + + if self.lora_config and request.lora_request: + scheduled_loras.add(request.lora_request.lora_int_id) + req_to_new_block_ids[request.request_id] = [ + b.block_id for b in computed_blocks + new_blocks + ] + # Update request info. + num_scheduled_tokens[request.request_id] = num_new_tokens + token_budget -= num_new_tokens + request.status = RequestStatus.RUNNING + request.num_computed_tokens = num_computed_tokens + + # Put back any skipped requests at the head of the waiting queue + if skipped_waiting_requests: + self.waiting.extendleft(skipped_waiting_requests) + + # If no prefill requests are scheduled, + # Schedule decode requests next. + if len(self.scheduled_req_ids) == 0: + req_index = 0 + while req_index < len(self.running) and token_budget > 0: + request = self.running[req_index] + if request.request_id in self.scheduled_req_ids: + # This request has already been scheduled. + req_index += 1 + continue + + num_new_tokens = (request.num_tokens_with_spec - + request.num_computed_tokens) + if (0 < self.scheduler_config.long_prefill_token_threshold < + num_new_tokens): + num_new_tokens = ( + self.scheduler_config.long_prefill_token_threshold) + num_new_tokens = min(num_new_tokens, token_budget) + assert num_new_tokens == 1 + + while True: + new_blocks = self.kv_cache_manager.allocate_slots( + request, num_new_tokens) + if new_blocks is None: + # The request cannot be scheduled. + # Preempt the lowest-priority request. + preempted_req = self.running.pop() + self.kv_cache_manager.free(preempted_req) + preempted_req.status = RequestStatus.PREEMPTED + preempted_req.num_computed_tokens = 0 + self.waiting.appendleft(preempted_req) + preempted_reqs.append(preempted_req) + if preempted_req == request: + # No more request to preempt. + can_schedule = False + break + else: + # The request can be scheduled. + can_schedule = True + break + if not can_schedule: + break + assert new_blocks is not None + + # Schedule the request. + scheduled_running_reqs.append(request) + self.scheduled_req_ids.add(request.request_id) + req_to_new_block_ids[request.request_id] = [ + b.block_id for b in new_blocks + ] + num_scheduled_tokens[request.request_id] = num_new_tokens + token_budget -= num_new_tokens + req_index += 1 + + # Speculative decode related. + if request.spec_token_ids: + num_scheduled_spec_tokens = (num_new_tokens + + request.num_computed_tokens - + request.num_tokens) + if num_scheduled_spec_tokens > 0: + # Trim spec_token_ids list to num_scheduled_spec_tokens. + del request.spec_token_ids[num_scheduled_spec_tokens:] + scheduled_spec_decode_tokens[request.request_id] = ( + request.spec_token_ids) + + # Check if the scheduling constraints are satisfied. + total_num_scheduled_tokens = sum(num_scheduled_tokens.values()) + assert total_num_scheduled_tokens <= self.max_num_scheduled_tokens + assert token_budget >= 0 + assert len(self.running) <= self.max_num_running_reqs + assert len(scheduled_new_reqs) + len(scheduled_resumed_reqs) + len( + scheduled_running_reqs) <= len(self.running) + + # Get the longest common prefix among all requests in the running queue. + # This can be potentially used for cascade attention. 
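+    # Any single running request works as the reference here: a block counts
+    # towards the common prefix only if it is referenced by every running
+    # request, so the first request's block list is as good as any other's.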
+ num_common_prefix_blocks = 0 + if self.running: + any_request = self.running[0] + num_common_prefix_blocks = ( + self.kv_cache_manager.get_num_common_prefix_blocks( + any_request, len(self.running))) + + # Construct the scheduler output. + new_reqs_data = [ + NewRequestData.from_request(req, + req_to_new_block_ids[req.request_id]) + for req in scheduled_new_reqs + ] + resumed_reqs_data = [ + self._make_cached_request_data( + req, + num_scheduled_tokens[req.request_id], + len(scheduled_spec_decode_tokens.get(req.request_id, ())), + req_to_new_block_ids[req.request_id], + resumed_from_preemption=True, + ) for req in scheduled_resumed_reqs + ] + running_reqs_data = [ + self._make_cached_request_data( + req, + num_scheduled_tokens[req.request_id], + len(scheduled_spec_decode_tokens.get(req.request_id, ())), + req_to_new_block_ids[req.request_id], + resumed_from_preemption=False, + ) for req in scheduled_running_reqs + ] + scheduler_output = SchedulerOutput( + scheduled_new_reqs=new_reqs_data, + scheduled_cached_reqs=resumed_reqs_data + running_reqs_data, + num_scheduled_tokens=num_scheduled_tokens, + total_num_scheduled_tokens=total_num_scheduled_tokens, + scheduled_spec_decode_tokens=scheduled_spec_decode_tokens, + scheduled_encoder_inputs={}, + num_common_prefix_blocks=num_common_prefix_blocks, + # finished_req_ids is an existing state in the scheduler, + # instead of being newly scheduled in this step. + # It contains the request IDs that are finished in between + # the previous and the current steps. + finished_req_ids=self.finished_req_ids, # type: ignore + free_encoder_input_ids=self.encoder_cache_manager.get_freed_ids(), + structured_output_request_ids={}, + grammar_bitmask=None, + ) + + # Advance the number of computed tokens for the request AFTER + # the request is scheduled. + # 1. The scheduler_output of the current step has to include the + # original number of scheduled tokens to determine input IDs. + # 2. Advance the number of computed tokens here allowing us to + # schedule the prefill request again immediately in the next + # scheduling step. + # 3. If some tokens (e.g. spec tokens) are rejected later, the number of + # computed tokens will be adjusted in update_from_output. 
+ for req_id, num_scheduled_token in num_scheduled_tokens.items(): + self.requests[req_id].num_computed_tokens += num_scheduled_token + + self.finished_req_ids = set() # type: ignore + return scheduler_output -- Gitee From 109edbf79a1405af1916d1ed46fe3020f71f5ed4 Mon Sep 17 00:00:00 2001 From: one_east Date: Wed, 7 May 2025 10:15:38 +0800 Subject: [PATCH 23/55] bugfix: add module init --- vllm_mindspore/v1/core/sched/__init__.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 vllm_mindspore/v1/core/sched/__init__.py diff --git a/vllm_mindspore/v1/core/sched/__init__.py b/vllm_mindspore/v1/core/sched/__init__.py new file mode 100644 index 00000000..e69de29b -- Gitee From 99effbffc222a38f80418f5760425108cba753ec Mon Sep 17 00:00:00 2001 From: hangangqiang Date: Tue, 29 Apr 2025 22:26:40 +0800 Subject: [PATCH 24/55] deepseekr1-int8 support qkvconcat --- .../mf_models/deepseekv3_weight_processor.py | 125 ++++++++++++++---- 1 file changed, 101 insertions(+), 24 deletions(-) diff --git a/vllm_mindspore/model_executor/models/mf_models/deepseekv3_weight_processor.py b/vllm_mindspore/model_executor/models/mf_models/deepseekv3_weight_processor.py index 1c6a99d9..966c569a 100644 --- a/vllm_mindspore/model_executor/models/mf_models/deepseekv3_weight_processor.py +++ b/vllm_mindspore/model_executor/models/mf_models/deepseekv3_weight_processor.py @@ -631,35 +631,11 @@ class DeepseekV3WeightProcessor(BaseWeightProcessor): def infer_quant_process_attention_weight(self, src_hf_dir, layer_id, hf_weight_map): """infer quant process attention weight""" num_heads = self.config.model.model_config.num_heads - kv_lora_rank = self.config.model.model_config.kv_lora_rank qk_rope_head_dim = self.config.model.model_config.qk_rope_head_dim v_head_dim = self.config.model.model_config.v_head_dim qk_nope_head_dim = self.config.model.model_config.qk_nope_head_dim rope_dim = qk_rope_head_dim + qk_nope_head_dim - kv_head_dim = kv_lora_rank + qk_rope_head_dim - - # q_a_proj->q2l_proj - q2l_proj_hf_name = f"model.layers.{layer_id}.self_attn.q_a_proj.weight" - q2l_proj_ms_name = self.quant_convert_weight_name(q2l_proj_hf_name) - q2l_proj_ms_param, _ = self.get_safetensor_from_file(q2l_proj_hf_name, src_hf_dir, hf_weight_map) - self.parameter_dict[q2l_proj_ms_name] = ms.Parameter( - ms.from_numpy(q2l_proj_ms_param).astype(ms.int8), - name=q2l_proj_ms_name, - requires_grad=False) - self.quant_special_attention_weight(layer_id, src_hf_dir, hf_weight_map, "q_a_proj") - - # kv_a_proj_with_mqa->kv2l - kv2l_hf_name = f"model.layers.{layer_id}.self_attn.kv_a_proj_with_mqa.weight" - kv2l_ms_name = self.quant_convert_weight_name(kv2l_hf_name) - kv2l_ms_param, _ = self.get_safetensor_from_file(kv2l_hf_name, src_hf_dir, hf_weight_map) - kv2l_ms_param = kv2l_ms_param.reshape(kv_head_dim, -1) - kv2l_ms_param = self.infer_trans_rope_weight(kv2l_ms_param, qk_rope_head_dim) - self.parameter_dict[kv2l_ms_name] = ms.Parameter(ms.from_numpy(kv2l_ms_param).astype(ms.int8), - name=kv2l_ms_name, - requires_grad=False) - self.quant_special_attention_weight(layer_id, src_hf_dir, hf_weight_map, "kv_a_proj_with_mqa", - is_trans_rope_weigh=True) # q_a_layernorm->lq_norm lq_norm_hf_name = f"model.layers.{layer_id}.self_attn.q_a_layernorm.weight" @@ -726,6 +702,106 @@ class DeepseekV3WeightProcessor(BaseWeightProcessor): requires_grad=False) self.quant_special_attention_weight(layer_id, src_hf_dir, hf_weight_map, "o_proj") + def infer_quant_process_dense_qkv_weight(self, src_hf_dir, layer_id, hf_weight_map): + 
"""infer_quant_process_dense_qkv_weight""" + parameter_dict = {} + kv_lora_rank = self.config.model.model_config.kv_lora_rank + qk_rope_head_dim = self.config.model.model_config.qk_rope_head_dim + kv_head_dim = kv_lora_rank + qk_rope_head_dim + + qkv_concat = self.config.model.model_config.qkv_concat + # q2l + q2l_hf_name = f"model.layers.{layer_id}.self_attn.q_a_proj.weight" + q2l_ms_name = self.quant_convert_weight_name(q2l_hf_name) + q2l_ms_param, _ = self.get_safetensor_from_file(q2l_hf_name, src_hf_dir, hf_weight_map) + + q2l_input_scale_hf_name = f"model.layers.{layer_id}.self_attn.q_a_proj.input_scale" + q2l_input_scale_ms_name = self.quant_convert_weight_name(q2l_input_scale_hf_name) + q2l_input_scale_ms_param, _ = self.get_safetensor_from_file(q2l_input_scale_hf_name, src_hf_dir, + hf_weight_map) + + q2l_input_zp_hf_name = f"model.layers.{layer_id}.self_attn.q_a_proj.input_offset" + q2l_input_zp_ms_name = self.quant_convert_weight_name(q2l_input_zp_hf_name) + q2l_input_zp_ms_param, _ = self.get_safetensor_from_file(q2l_input_zp_hf_name, src_hf_dir, hf_weight_map) + + q2l_quant_bias_hf_name = f"model.layers.{layer_id}.self_attn.q_a_proj.quant_bias" + q2l_quant_bias_ms_name = self.quant_convert_weight_name(q2l_quant_bias_hf_name) + q2l_quant_bias_ms_param, _ = self.get_safetensor_from_file(q2l_quant_bias_hf_name, src_hf_dir, + hf_weight_map) + + q2l_dequant_scale_hf_name = f"model.layers.{layer_id}.self_attn.q_a_proj.deq_scale" + q2l_dequant_scale_ms_name = self.quant_convert_weight_name(q2l_dequant_scale_hf_name) + q2l_dequant_scale_ms_param, _ = self.get_safetensor_from_file(q2l_dequant_scale_hf_name, src_hf_dir, + hf_weight_map) + # kv2l + kv2l_hf_name = f"model.layers.{layer_id}.self_attn.kv_a_proj_with_mqa.weight" + kv2l_ms_name = self.quant_convert_weight_name(kv2l_hf_name) + kv2l_ms_param, _ = self.get_safetensor_from_file(kv2l_hf_name, src_hf_dir, hf_weight_map) + kv2l_ms_param = kv2l_ms_param.reshape(kv_head_dim, -1) + kv2l_ms_param = self.infer_trans_rope_weight(kv2l_ms_param, qk_rope_head_dim) + + kv2l_input_scale_hf_name = f"model.layers.{layer_id}.self_attn.kv_a_proj_with_mqa.input_scale" + kv2l_input_scale_ms_name = self.quant_convert_weight_name(kv2l_input_scale_hf_name) + kv2l_input_scale_ms_param, _ = self.get_safetensor_from_file(kv2l_input_scale_hf_name, src_hf_dir, + hf_weight_map) + + kv2l_input_zp_hf_name = f"model.layers.{layer_id}.self_attn.kv_a_proj_with_mqa.input_offset" + kv2l_input_zp_ms_name = self.quant_convert_weight_name(kv2l_input_zp_hf_name) + kv2l_input_zp_ms_param, _ = self.get_safetensor_from_file(kv2l_input_zp_hf_name, src_hf_dir, hf_weight_map) + + kv2l_quant_bias_hf_name = f"model.layers.{layer_id}.self_attn.kv_a_proj_with_mqa.quant_bias" + kv2l_quant_bias_ms_name = self.quant_convert_weight_name(kv2l_quant_bias_hf_name) + kv2l_quant_bias_ms_param, _ = self.get_safetensor_from_file(kv2l_quant_bias_hf_name, src_hf_dir, + hf_weight_map) + kv2l_quant_bias_ms_param = kv2l_quant_bias_ms_param.reshape(kv_head_dim, -1) + kv2l_quant_bias_ms_param = self.infer_trans_rope_weight(kv2l_quant_bias_ms_param, + qk_rope_head_dim).reshape(-1) + + kv2l_dequant_scale_hf_name = f"model.layers.{layer_id}.self_attn.kv_a_proj_with_mqa.deq_scale" + kv2l_dequant_scale_ms_name = self.quant_convert_weight_name(kv2l_dequant_scale_hf_name) + kv2l_dequant_scale_ms_param, _ = self.get_safetensor_from_file(kv2l_dequant_scale_hf_name, src_hf_dir, + hf_weight_map) + kv2l_dequant_scale_ms_param = kv2l_dequant_scale_ms_param.reshape(kv_head_dim, -1) + kv2l_dequant_scale_ms_param 
= self.infer_trans_rope_weight(kv2l_dequant_scale_ms_param, + qk_rope_head_dim).reshape(-1) + + attn_rmsnorm_beta_hf_name = f"model.layers.{layer_id}.input_layernorm.bias" + attn_rmsnorm_beta_ms_name = self.quant_convert_weight_name(attn_rmsnorm_beta_hf_name) + attn_rmsnorm_beta_ms_param, _ = self.get_safetensor_from_file(attn_rmsnorm_beta_hf_name, src_hf_dir, hf_weight_map) + + if qkv_concat: + qkv2l_weight_name = f"model.layers.{layer_id}.attention.qkv2l._layer.weight" + qkv2l_bias_name = f"model.layers.{layer_id}.attention.qkv2l._layer.matmul.quant_bias" + qkv2l_scale_name = f"model.layers.{layer_id}.attention.qkv2l._layer.matmul.dequant_scale" + qkv2l_quant_zp_name = f"model.layers.{layer_id}.attention.qkv2l.quant_op.input_zp" + qkv2l_quant_scale_name = f"model.layers.{layer_id}.attention.qkv2l.quant_op.input_scale" + qkv2l_rmsnorm_beta_name = f"model.layers.{layer_id}.attention.qkv2l.quant_op.beta" + + qkv2l_weight = np.concatenate((q2l_ms_param, kv2l_ms_param), 0) + parameter_dict[qkv2l_weight_name] = ms.Parameter(ms.Tensor(qkv2l_weight, ms.int8), name=qkv2l_weight_name, requires_grad=False) + qkv2l_bias = np.concatenate((q2l_quant_bias_ms_param, kv2l_quant_bias_ms_param), 0) + parameter_dict[qkv2l_bias_name] = ms.Parameter(ms.Tensor(qkv2l_bias, ms.int32), name=qkv2l_bias_name,requires_grad=False) + qkv2l_scale = np.concatenate((q2l_dequant_scale_ms_param, kv2l_dequant_scale_ms_param), 0) + parameter_dict[qkv2l_scale_name] = ms.Parameter(ms.Tensor(qkv2l_scale, ms.float32), name=qkv2l_scale_name, requires_grad=False) + parameter_dict[qkv2l_quant_zp_name] = ms.Parameter(ms.Tensor(q2l_input_zp_ms_param, ms.int8),requires_grad=False) + parameter_dict[qkv2l_quant_scale_name] = ms.Parameter(ms.Tensor(q2l_input_scale_ms_param, ms.bfloat16), requires_grad=False) + parameter_dict[qkv2l_rmsnorm_beta_name] = ms.Parameter(ms.Tensor(attn_rmsnorm_beta_ms_param, ms.float32), requires_grad=False) + else: + parameter_dict[q2l_ms_name] = ms.Parameter(ms.Tensor(q2l_ms_param, ms.int8), name=q2l_ms_name,requires_grad=False) + parameter_dict[kv2l_ms_name] = ms.Parameter(ms.Tensor(kv2l_ms_param, ms.int8),requires_grad=False) + parameter_dict[q2l_quant_bias_ms_name] = ms.Parameter(ms.Tensor(q2l_quant_bias_ms_param, ms.int32),name=q2l_quant_bias_ms_name,requires_grad = False) + parameter_dict[kv2l_quant_bias_ms_name] = ms.Parameter(ms.Tensor(kv2l_quant_bias_ms_param, ms.int32),name=kv2l_quant_bias_ms_name,requires_grad = False) + parameter_dict[q2l_dequant_scale_ms_name] = ms.Parameter(ms.Tensor(q2l_dequant_scale_ms_param, ms.float32), name=q2l_dequant_scale_ms_name, requires_grad = False) + parameter_dict[kv2l_dequant_scale_ms_name] = ms.Parameter(ms.Tensor(kv2l_dequant_scale_ms_param, ms.float32),name = kv2l_dequant_scale_ms_name, requires_grad = False) + parameter_dict[q2l_input_zp_ms_name] = ms.Parameter(ms.Tensor(q2l_input_zp_ms_param, ms.int8),name=q2l_input_zp_ms_name, requires_grad = False) + parameter_dict[kv2l_input_zp_ms_name] = ms.Parameter(ms.Tensor(kv2l_input_zp_ms_param, ms.int8), name=kv2l_input_zp_ms_name, requires_grad = False) + parameter_dict[q2l_input_scale_ms_name] = ms.Parameter(ms.Tensor(q2l_input_scale_ms_param, ms.bfloat16), name = q2l_input_scale_ms_name, requires_grad = False) + parameter_dict[kv2l_input_scale_ms_name] = ms.Parameter(ms.Tensor(kv2l_input_scale_ms_param, ms.bfloat16), name = kv2l_input_scale_ms_name, requires_grad = False) + parameter_dict[attn_rmsnorm_beta_ms_name] = ms.Parameter(ms.Tensor(attn_rmsnorm_beta_ms_param, ms.float32), name=attn_rmsnorm_beta_ms_name, 
requires_grad=False) + _, _ = ms.load_param_into_net(self.network, parameter_dict) + del parameter_dict + gc.collect() + def infer_quant_net_convert_layer_weight(self, src_hf_dir, layer_id, hf_weight_map): """infer quant net convert layer weight""" @@ -735,6 +811,7 @@ class DeepseekV3WeightProcessor(BaseWeightProcessor): else: self.infer_quant_process_dense_ffn_weight(src_hf_dir, layer_id, hf_weight_map) + self.infer_quant_process_dense_qkv_weight(src_hf_dir, layer_id, hf_weight_map) self.infer_quant_process_attention_weight(src_hf_dir, layer_id, hf_weight_map) self.infer_quant_bias_weight(src_hf_dir, layer_id, hf_weight_map) self.infer_process_norm_weight(src_hf_dir, layer_id, hf_weight_map) -- Gitee From 31d20c344f5f9a85321bf8dec9826a187665accb Mon Sep 17 00:00:00 2001 From: tongl Date: Wed, 30 Apr 2025 19:44:03 +0800 Subject: [PATCH 25/55] Add osl for vllm-mindspore. --- .../models/mf_models/deepseek_v3.py | 16 ++++++++++++++++ .../mf_models/deepseekv3_weight_processor.py | 5 ++++- 2 files changed, 20 insertions(+), 1 deletion(-) diff --git a/vllm_mindspore/model_executor/models/mf_models/deepseek_v3.py b/vllm_mindspore/model_executor/models/mf_models/deepseek_v3.py index e7cda00c..82fa53d9 100644 --- a/vllm_mindspore/model_executor/models/mf_models/deepseek_v3.py +++ b/vllm_mindspore/model_executor/models/mf_models/deepseek_v3.py @@ -200,6 +200,18 @@ class DeepseekV3ForCausalLM(MfModelBase): act_quant_granularity=QuantGranularity.PER_TOKEN, weight_quant_granularity=QuantGranularity.PER_CHANNEL) layer_policies = OrderedDict({r'.*\.w2.*': w2_config}) + elif quant_type.lower() == 'osl': + cfg = PTQConfig(mode=quant_mode, backend=BackendTarget.ASCEND, weight_quant_dtype=msdtype.int8, + act_quant_dtype=msdtype.int8, + outliers_suppression=OutliersSuppressionType.OUTLIER_SUPPRESSION_LITE, + opname_blacklist=['lm_head', 'lkv2kv']) + w2_config = PTQConfig(mode=quant_mode, backend=BackendTarget.ASCEND, weight_quant_dtype=msdtype.int8, + act_quant_dtype=msdtype.int8, + outliers_suppression=OutliersSuppressionType.NONE, + precision_recovery=PrecisionRecovery.NONE, + act_quant_granularity=QuantGranularity.PER_TOKEN, + weight_quant_granularity=QuantGranularity.PER_CHANNEL) + layer_policies = OrderedDict({r'.*\.w2.*': w2_config}) elif quant_type.lower() == 'a16w8': cfg = PTQConfig(mode=quant_mode, backend=BackendTarget.ASCEND, weight_quant_dtype=msdtype.int8, opname_blacklist=['lm_head', 'lkv2kv']) @@ -220,6 +232,10 @@ class DeepseekV3ForCausalLM(MfModelBase): # pylint: disable=protected-access ptq._config.aclnn_quant_list = ["routed_experts.ffn.w_gate_hidden", "routed_experts.ffn.w1", "routed_experts.ffn.w3"] + if 'osl' in quant_type.lower(): + # pylint: disable=protected-access + ptq._config.aclnn_quant_list = ["routed_experts.ffn.w_gate_hidden", "routed_experts.ffn.w1", + "routed_experts.ffn.w3"] if 'gptq-pergroup' in quant_type.lower(): # pylint: disable=protected-access ptq.layer_policies[r'.*\.feed_forward\.w2.*'].aclnn_quant_list = ["w2"] diff --git a/vllm_mindspore/model_executor/models/mf_models/deepseekv3_weight_processor.py b/vllm_mindspore/model_executor/models/mf_models/deepseekv3_weight_processor.py index de7d70d0..c9bc5e1b 100644 --- a/vllm_mindspore/model_executor/models/mf_models/deepseekv3_weight_processor.py +++ b/vllm_mindspore/model_executor/models/mf_models/deepseekv3_weight_processor.py @@ -1434,7 +1434,7 @@ class DeepseekV3WeightProcessor(BaseWeightProcessor): quantization_config = self.config.model.model_config.quantization_config quant_method = 
quantization_config.quant_method if quantization_config else None - support_quant_method = ["gptq-pergroup", "smoothquant"] + support_quant_method = ["gptq-pergroup", "smoothquant", "osl"] if not quant_method or (quant_method not in support_quant_method) and \ not is_mtp_model: self.infer_convert_outer_weight(src_hf_dir, hf_weight_map) @@ -1445,6 +1445,9 @@ class DeepseekV3WeightProcessor(BaseWeightProcessor): if quant_method and quant_method == "smoothquant": self.infer_smooth_quant_net_ms_convert_layer_weight(src_hf_dir, self.num_layers, hf_weight_map) return + if quant_method and quant_method == "osl": + self.infer_smooth_quant_net_ms_convert_layer_weight(src_hf_dir, self.num_layers, hf_weight_map) + return enable_tqdm = rank_id == 0 mtp_layers = self.config.model.model_config.num_nextn_predict_layers -- Gitee From b35c4a37c5fc119307e4636342167b0e019e1575 Mon Sep 17 00:00:00 2001 From: lvhaoyu Date: Wed, 14 May 2025 10:09:25 +0800 Subject: [PATCH 26/55] add VLLM_USE_V1=0 for testcase --- tests/mindformers | 2 +- tests/st/python/test_vllm_deepseek_bf16_part.py | 3 ++- tests/st/python/test_vllm_deepseek_part.py | 5 ++++- tests/st/python/test_vllm_deepseek_smoothquant.py | 4 +++- tests/st/python/test_vllm_mf_qwen_7b.py | 3 ++- tests/st/python/test_vllm_mf_qwen_7b_chunk_prefill.py | 4 +++- tests/st/python/test_vllm_mf_qwen_7b_cp_pc_mss.py | 3 ++- tests/st/python/test_vllm_mf_qwen_7b_mss.py | 3 ++- tests/st/python/test_vllm_mf_qwen_7b_prefix_caching.py | 4 +++- tests/st/python/test_vllm_qwen_7b.py | 3 ++- 10 files changed, 24 insertions(+), 10 deletions(-) diff --git a/tests/mindformers b/tests/mindformers index 544c4009..16587217 160000 --- a/tests/mindformers +++ b/tests/mindformers @@ -1 +1 @@ -Subproject commit 544c4009573051e0e254efab71d212bfc77fc7b2 +Subproject commit 165872172ae5396cb4b66629614c85ff21038e11 diff --git a/tests/st/python/test_vllm_deepseek_bf16_part.py b/tests/st/python/test_vllm_deepseek_bf16_part.py index c19dd14a..86137637 100644 --- a/tests/st/python/test_vllm_deepseek_bf16_part.py +++ b/tests/st/python/test_vllm_deepseek_bf16_part.py @@ -30,7 +30,8 @@ env_vars = { "LCCL_DETERMINISTIC": "1", "HCCL_DETERMINISTIC": "true", "ATB_MATMUL_SHUFFLE_K_ENABLE": "0", - "ATB_LLM_LCOC_ENABLE": "0" + "ATB_LLM_LCOC_ENABLE": "0", + "VLLM_USE_V1": "0", } # set env env_manager.setup_ai_environment(env_vars) diff --git a/tests/st/python/test_vllm_deepseek_part.py b/tests/st/python/test_vllm_deepseek_part.py index 8dfa9563..f88ece2c 100644 --- a/tests/st/python/test_vllm_deepseek_part.py +++ b/tests/st/python/test_vllm_deepseek_part.py @@ -32,7 +32,8 @@ env_vars = { "LCCL_DETERMINISTIC": "1", "HCCL_DETERMINISTIC": "true", "ATB_MATMUL_SHUFFLE_K_ENABLE": "0", - "ATB_LLM_LCOC_ENABLE": "0" + "ATB_LLM_LCOC_ENABLE": "0", + "VLLM_USE_V1": "0", } # set env env_manager.setup_ai_environment(env_vars) @@ -47,6 +48,7 @@ class TestDeepSeek: @pytest.mark.level0 @pytest.mark.platform_arm_ascend910b_training @pytest.mark.env_single + @pytest.mark.skip(reason="gs master branch is not suit for the newest mindformers.") def test_deepseek_r1(self): """ test case deepseek r1 w8a8 @@ -87,6 +89,7 @@ class TestDeepSeekMTP: @pytest.mark.level0 @pytest.mark.platform_arm_ascend910b_training @pytest.mark.env_single + @pytest.mark.skip(reason="MTP need addition adaption on v0.8.3 V0") def test_deepseek_mtp(self): """ test case deepseek mtp with main model of r1-w8a8 diff --git a/tests/st/python/test_vllm_deepseek_smoothquant.py b/tests/st/python/test_vllm_deepseek_smoothquant.py index 7582e55b..7e3397e0 100644 --- 
a/tests/st/python/test_vllm_deepseek_smoothquant.py +++ b/tests/st/python/test_vllm_deepseek_smoothquant.py @@ -32,7 +32,8 @@ env_vars = { "LCCL_DETERMINISTIC": "1", "HCCL_DETERMINISTIC": "true", "ATB_MATMUL_SHUFFLE_K_ENABLE": "0", - "ATB_LLM_LCOC_ENABLE": "0" + "ATB_LLM_LCOC_ENABLE": "0", + "VLLM_USE_V1": "0", } # set env env_manager.setup_ai_environment(env_vars) @@ -47,6 +48,7 @@ class TestDeepSeek: @pytest.mark.level0 @pytest.mark.platform_arm_ascend910b_training @pytest.mark.env_single + @pytest.mark.skip(reason="gs master branch is not suit for the newest mindformers.") def test_deepseek_r1(self): """ test case deepseek r1 w8a8 diff --git a/tests/st/python/test_vllm_mf_qwen_7b.py b/tests/st/python/test_vllm_mf_qwen_7b.py index ddb545c7..bbb6aa46 100644 --- a/tests/st/python/test_vllm_mf_qwen_7b.py +++ b/tests/st/python/test_vllm_mf_qwen_7b.py @@ -32,7 +32,8 @@ env_vars = { "LCCL_DETERMINISTIC": "1", "HCCL_DETERMINISTIC": "true", "ATB_MATMUL_SHUFFLE_K_ENABLE": "0", - "ATB_LLM_LCOC_ENABLE": "0" + "ATB_LLM_LCOC_ENABLE": "0", + "VLLM_USE_V1": "0", } # set env env_manager.setup_ai_environment(env_vars) diff --git a/tests/st/python/test_vllm_mf_qwen_7b_chunk_prefill.py b/tests/st/python/test_vllm_mf_qwen_7b_chunk_prefill.py index 1523e46b..cc3cbcab 100644 --- a/tests/st/python/test_vllm_mf_qwen_7b_chunk_prefill.py +++ b/tests/st/python/test_vllm_mf_qwen_7b_chunk_prefill.py @@ -31,7 +31,8 @@ env_vars = { "LCCL_DETERMINISTIC": "1", "HCCL_DETERMINISTIC": "true", "ATB_MATMUL_SHUFFLE_K_ENABLE": "0", - "ATB_LLM_LCOC_ENABLE": "0" + "ATB_LLM_LCOC_ENABLE": "0", + "VLLM_USE_V1": "0", } # set env env_manager.setup_ai_environment(env_vars) @@ -47,6 +48,7 @@ class TestMfQwen_chunk_prefill: @pytest.mark.level0 @pytest.mark.platform_arm_ascend910b_training @pytest.mark.env_single + @pytest.mark.skip(reason="cp precision need to be fixed on v0.8.3 V0") def test_mf_qwen_7b_chunk_prefill(self): """ test case qwen_7b_chunk_prefill diff --git a/tests/st/python/test_vllm_mf_qwen_7b_cp_pc_mss.py b/tests/st/python/test_vllm_mf_qwen_7b_cp_pc_mss.py index 6292b22c..df61117d 100644 --- a/tests/st/python/test_vllm_mf_qwen_7b_cp_pc_mss.py +++ b/tests/st/python/test_vllm_mf_qwen_7b_cp_pc_mss.py @@ -30,7 +30,8 @@ env_vars = { "LCCL_DETERMINISTIC": "1", "HCCL_DETERMINISTIC": "true", "ATB_MATMUL_SHUFFLE_K_ENABLE": "0", - "ATB_LLM_LCOC_ENABLE": "0" + "ATB_LLM_LCOC_ENABLE": "0", + "VLLM_USE_V1": "0", } # set env env_manager.setup_ai_environment(env_vars) diff --git a/tests/st/python/test_vllm_mf_qwen_7b_mss.py b/tests/st/python/test_vllm_mf_qwen_7b_mss.py index b174804d..266e296a 100644 --- a/tests/st/python/test_vllm_mf_qwen_7b_mss.py +++ b/tests/st/python/test_vllm_mf_qwen_7b_mss.py @@ -32,7 +32,8 @@ env_vars = { "LCCL_DETERMINISTIC": "1", "HCCL_DETERMINISTIC": "true", "ATB_MATMUL_SHUFFLE_K_ENABLE": "0", - "ATB_LLM_LCOC_ENABLE": "0" + "ATB_LLM_LCOC_ENABLE": "0", + "VLLM_USE_V1": "0", } # set env env_manager.setup_ai_environment(env_vars) diff --git a/tests/st/python/test_vllm_mf_qwen_7b_prefix_caching.py b/tests/st/python/test_vllm_mf_qwen_7b_prefix_caching.py index 89ba64c0..28ec1058 100644 --- a/tests/st/python/test_vllm_mf_qwen_7b_prefix_caching.py +++ b/tests/st/python/test_vllm_mf_qwen_7b_prefix_caching.py @@ -31,7 +31,8 @@ env_vars = { "LCCL_DETERMINISTIC": "1", "HCCL_DETERMINISTIC": "true", "ATB_MATMUL_SHUFFLE_K_ENABLE": "0", - "ATB_LLM_LCOC_ENABLE": "0" + "ATB_LLM_LCOC_ENABLE": "0", + "VLLM_USE_V1": "0", } env_manager.setup_ai_environment(env_vars) import vllm_mindspore @@ -45,6 +46,7 @@ class 
TestMfQwen_prefix_caching: @pytest.mark.level0 @pytest.mark.platform_arm_ascend910b_training @pytest.mark.env_single + @pytest.mark.skip(reason="pc precision need to be fixed on v0.8.3 V0") def test_mf_qwen_7b_prefix_caching(self): """ test case qwen_7b_prefix_caching diff --git a/tests/st/python/test_vllm_qwen_7b.py b/tests/st/python/test_vllm_qwen_7b.py index bce75d3e..695e9cb6 100644 --- a/tests/st/python/test_vllm_qwen_7b.py +++ b/tests/st/python/test_vllm_qwen_7b.py @@ -28,7 +28,8 @@ env_vars = { "LCCL_DETERMINISTIC": "1", "HCCL_DETERMINISTIC": "true", "ATB_MATMUL_SHUFFLE_K_ENABLE": "0", - "ATB_LLM_LCOC_ENABLE": "0" + "ATB_LLM_LCOC_ENABLE": "0", + "VLLM_USE_V1": "0", } # set env env_manager.setup_ai_environment(env_vars) -- Gitee From ffe0562772872ea366159faa2f5c3fa77605c615 Mon Sep 17 00:00:00 2001 From: lvhaoyu Date: Wed, 14 May 2025 10:18:14 +0800 Subject: [PATCH 27/55] update codegate for vllm 0.8.3 --- .jenkins/test/config/dependent_packages.yaml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.jenkins/test/config/dependent_packages.yaml b/.jenkins/test/config/dependent_packages.yaml index 37511933..cdb73e74 100644 --- a/.jenkins/test/config/dependent_packages.yaml +++ b/.jenkins/test/config/dependent_packages.yaml @@ -1,11 +1,11 @@ mindspore: - 'https://repo.mindspore.cn/mindspore/mindspore/version/202504/20250417/br_infer_deepseek_os_20250417004508_38b6db6c3039b59153d52d5e353cd01fe774dc93_newest/' + 'https://repo.mindspore.cn/mindspore/mindspore/version/202505/20250514/br_infer_deepseek_os_20250514004506_0e705b79c36766d07889faa32bc6a3ef6ec79ef3_newest/' mindspore_gs: - 'https://repo.mindspore.cn/mindspore/golden-stick/version/202504/20250424/master_20250424010019_dc3222e266c572dce1070a112aa6e12155a45370_newest/' + 'https://repo.mindspore.cn/mindspore/golden-stick/version/202505/20250514/master_20250514010015_c6cede824328d0dd7069e735646ff4a1808a1c72_newest/' msadapter: - 'https://repo.mindspore.cn/mindspore/msadapter/version/202504/20250410/master_20250410120007_83e7214eb2b9598179135a4e98dce3b69ba27da2_newest/' + 'https://repo.mindspore.cn/mindspore/msadapter/version/202505/20250514/master_20250514010016_380ecadf0133da436503105d6e8e1db709472fe4_newest/' vllm: - 'https://repo.mindspore.cn/mirrors/vllm/version/202503/20250321/v0.7.3_20250321112504_ed6e9075d31e32c8548b480a47d1ffb77da1f54c_newest/' + 'https://repo.mindspore.cn/mirrors/vllm/version/202505/20250514/v0.8.4.dev0_newest/any/vllm-0.8.4.dev0+g296c657.d20250514.empty-py3-none-any.whl' -- Gitee From 50b54e4878030c413930da6dc57271a51a3411c6 Mon Sep 17 00:00:00 2001 From: lvhaoyu Date: Thu, 15 May 2025 10:12:22 +0800 Subject: [PATCH 28/55] Revert "adding v0-style prefix-first scheduler" This reverts commit bc448f417b98d1948e8e05ca1ae0df0a23459246. 
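For reference, a minimal sketch of the import-time hook removed by this revert
(module paths taken from the diff below); after the revert, vLLM's upstream V1
scheduler is used unmodified:

    # Hook previously installed in vllm_mindspore/__init__.py and dropped here:
    # it replaced the V1 scheduler's schedule() with the prefix-first variant.
    from vllm.v1.core.sched.scheduler import Scheduler
    from vllm_mindspore.v1.core.sched.scheduler import schedule
    Scheduler.schedule = schedule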
--- vllm_mindspore/__init__.py | 4 - vllm_mindspore/v1/core/__init__.py | 0 vllm_mindspore/v1/core/sched/scheduler.py | 319 ---------------------- 3 files changed, 323 deletions(-) delete mode 100644 vllm_mindspore/v1/core/__init__.py delete mode 100644 vllm_mindspore/v1/core/sched/scheduler.py diff --git a/vllm_mindspore/__init__.py b/vllm_mindspore/__init__.py index 28f8469d..3aae2091 100644 --- a/vllm_mindspore/__init__.py +++ b/vllm_mindspore/__init__.py @@ -309,10 +309,6 @@ from vllm_mindspore.v1.worker.gpu_worker import compile_or_warm_up_model from vllm.v1.worker.gpu_worker import Worker Worker.compile_or_warm_up_model = compile_or_warm_up_model -from vllm_mindspore.v1.core.sched.scheduler import schedule -from vllm.v1.core.sched.scheduler import Scheduler -Scheduler.schedule = schedule - from .utils import check_ready from vllm_mindspore.engine.multiprocessing.engine import cleanup diff --git a/vllm_mindspore/v1/core/__init__.py b/vllm_mindspore/v1/core/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/vllm_mindspore/v1/core/sched/scheduler.py b/vllm_mindspore/v1/core/sched/scheduler.py deleted file mode 100644 index c03f3469..00000000 --- a/vllm_mindspore/v1/core/sched/scheduler.py +++ /dev/null @@ -1,319 +0,0 @@ -#!/usr/bin/env python3 -# encoding: utf-8 -# Copyright 2025 Huawei Technologies Co., Ltd -# Copyright 2024 The vLLM team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -# This file is based on vllm-ascend project. -# ============================================================================ - -from collections import deque - -import time -from vllm.logger import init_logger -from vllm.utils import cdiv -from vllm.v1.core.sched.output import NewRequestData, SchedulerOutput -from vllm.v1.core.sched.scheduler import Scheduler -from vllm.v1.request import Request, RequestStatus - -logger = init_logger(__name__) - -def _check_watermark_for_prefill(self, - request, - num_new_tokens, - computed_blocks, - watermark=0.01): - computed_blocks = computed_blocks or [] - watermark_blocks = self.kv_cache_config.num_blocks * watermark - num_computed_tokens = (request.num_computed_tokens + - len(computed_blocks) * self.block_size) - num_required_blocks = cdiv(num_new_tokens + num_computed_tokens, - self.block_size) - req_blocks = self.kv_cache_manager.req_to_blocks[request.request_id] - num_new_blocks = (num_required_blocks - len(req_blocks) - - len(computed_blocks)) - num_evictable_computed_blocks = sum(1 for blk in computed_blocks - if blk.ref_cnt == 0) - # If number of free blocks is less than water mark after allocating, don't allocate. 
- if (self.kv_cache_manager.block_pool.get_num_free_blocks() - - num_evictable_computed_blocks - - num_new_blocks) < watermark_blocks: - return False - return True - -def _get_prompt_limit(self, request: Request) -> int: - if (self.scheduler_config.chunked_prefill_enabled - and not self.scheduler_config.is_multi_step): - prompt_limit = self.scheduler_config.max_model_len - else: - prompt_limit = min( - self.scheduler_config.max_model_len, - self.scheduler_config.max_num_batched_tokens, - ) - - # Model is fine tuned with long context. Return the fine tuned max_len. - if request.lora_request and request.lora_request.long_lora_max_len: - assert prompt_limit <= request.lora_request.long_lora_max_len - return request.lora_request.long_lora_max_len - else: - return prompt_limit - -def schedule(self) -> SchedulerOutput: - scheduled_new_reqs: list[Request] = [] - scheduled_resumed_reqs: list[Request] = [] - scheduled_running_reqs: list[Request] = [] - preempted_reqs: list[Request] = [] - - req_to_new_block_ids: dict[str, list[int]] = {} - num_scheduled_tokens: dict[str, int] = {} - token_budget = self.max_num_scheduled_tokens - # Spec decode-related. - scheduled_spec_decode_tokens: dict[str, list[int]] = {} - - # Record scheduled LoRA requests. - scheduled_loras: set[int] = set() - - # Use a temporary deque to collect requests that need to be skipped - # and put back at the head of the waiting queue later - skipped_waiting_requests: deque[Request] = deque() - - # Schedule prefill requests first. - while self.waiting and token_budget > 0: - if len(scheduled_new_reqs) == self.max_num_running_reqs: - break - - request = self.waiting[0] - - def skip_cur_request(): - self.waiting.popleft() - skipped_waiting_requests.appendleft(request) - - # Check that adding the request still respects the max_loras - # constraint. - if (self.lora_config and request.lora_request and - (len(scheduled_loras) == self.lora_config.max_loras - and request.lora_request.lora_int_id not in scheduled_loras)): - # Scheduling would exceed max_loras, skip. - skip_cur_request() - continue - - prompt_limit = _get_prompt_limit(self, request) - # Get already-cached tokens. - computed_blocks, num_computed_tokens = ( - self.kv_cache_manager.get_computed_blocks(request)) - num_new_tokens = request.num_prompt_tokens - num_computed_tokens - if (0 < self.scheduler_config.long_prefill_token_threshold < - num_new_tokens): - num_new_tokens = ( - self.scheduler_config.long_prefill_token_threshold) - max_tokens_in_kvcache = (self.kv_cache_config.num_blocks * - self.block_size) - prompt_limit = min(prompt_limit, max_tokens_in_kvcache) - - # Finish request that exceeds prompt_limit or kv cache size. - if num_new_tokens > prompt_limit: - logger.warning( - "Input prompt (%d tokens) is too long" - " and exceeds limit of %d", - num_new_tokens, - prompt_limit, - ) - request.status = RequestStatus.FINISHED_IGNORED - self.finished_req_ids.add(request.request_id) # type: ignore - self.waiting.popleft() - continue - - if num_new_tokens > token_budget: - # Scheduling would exceed token_budget, skip. - skip_cur_request() - continue - - assert num_new_tokens > 0 - watermark = getattr(self.scheduler_config, "watermark", 0.01) - if not _check_watermark_for_prefill(self, - request, num_new_tokens, computed_blocks, watermark): - # Scheduling would exceed watermark, skip. - skip_cur_request() - continue - - new_blocks = self.kv_cache_manager.allocate_slots( - request, num_new_tokens, computed_blocks) - if new_blocks is None: - # The request cannot be scheduled. 
- break - - self.waiting.popleft() - self.running.append(request) - self.scheduled_req_ids.add(request.request_id) - # Check request status. - if request.status == RequestStatus.WAITING: - scheduled_new_reqs.append(request) - elif request.status == RequestStatus.PREEMPTED: - scheduled_resumed_reqs.append(request) - else: - raise RuntimeError(f"Invalid request status: {request.status}") - - if self.lora_config and request.lora_request: - scheduled_loras.add(request.lora_request.lora_int_id) - req_to_new_block_ids[request.request_id] = [ - b.block_id for b in computed_blocks + new_blocks - ] - # Update request info. - num_scheduled_tokens[request.request_id] = num_new_tokens - token_budget -= num_new_tokens - request.status = RequestStatus.RUNNING - request.num_computed_tokens = num_computed_tokens - - # Put back any skipped requests at the head of the waiting queue - if skipped_waiting_requests: - self.waiting.extendleft(skipped_waiting_requests) - - # If no prefill requests are scheduled, - # Schedule decode requests next. - if len(self.scheduled_req_ids) == 0: - req_index = 0 - while req_index < len(self.running) and token_budget > 0: - request = self.running[req_index] - if request.request_id in self.scheduled_req_ids: - # This request has already been scheduled. - req_index += 1 - continue - - num_new_tokens = (request.num_tokens_with_spec - - request.num_computed_tokens) - if (0 < self.scheduler_config.long_prefill_token_threshold < - num_new_tokens): - num_new_tokens = ( - self.scheduler_config.long_prefill_token_threshold) - num_new_tokens = min(num_new_tokens, token_budget) - assert num_new_tokens == 1 - - while True: - new_blocks = self.kv_cache_manager.allocate_slots( - request, num_new_tokens) - if new_blocks is None: - # The request cannot be scheduled. - # Preempt the lowest-priority request. - preempted_req = self.running.pop() - self.kv_cache_manager.free(preempted_req) - preempted_req.status = RequestStatus.PREEMPTED - preempted_req.num_computed_tokens = 0 - self.waiting.appendleft(preempted_req) - preempted_reqs.append(preempted_req) - if preempted_req == request: - # No more request to preempt. - can_schedule = False - break - else: - # The request can be scheduled. - can_schedule = True - break - if not can_schedule: - break - assert new_blocks is not None - - # Schedule the request. - scheduled_running_reqs.append(request) - self.scheduled_req_ids.add(request.request_id) - req_to_new_block_ids[request.request_id] = [ - b.block_id for b in new_blocks - ] - num_scheduled_tokens[request.request_id] = num_new_tokens - token_budget -= num_new_tokens - req_index += 1 - - # Speculative decode related. - if request.spec_token_ids: - num_scheduled_spec_tokens = (num_new_tokens + - request.num_computed_tokens - - request.num_tokens) - if num_scheduled_spec_tokens > 0: - # Trim spec_token_ids list to num_scheduled_spec_tokens. - del request.spec_token_ids[num_scheduled_spec_tokens:] - scheduled_spec_decode_tokens[request.request_id] = ( - request.spec_token_ids) - - # Check if the scheduling constraints are satisfied. - total_num_scheduled_tokens = sum(num_scheduled_tokens.values()) - assert total_num_scheduled_tokens <= self.max_num_scheduled_tokens - assert token_budget >= 0 - assert len(self.running) <= self.max_num_running_reqs - assert len(scheduled_new_reqs) + len(scheduled_resumed_reqs) + len( - scheduled_running_reqs) <= len(self.running) - - # Get the longest common prefix among all requests in the running queue. - # This can be potentially used for cascade attention. 
- num_common_prefix_blocks = 0 - if self.running: - any_request = self.running[0] - num_common_prefix_blocks = ( - self.kv_cache_manager.get_num_common_prefix_blocks( - any_request, len(self.running))) - - # Construct the scheduler output. - new_reqs_data = [ - NewRequestData.from_request(req, - req_to_new_block_ids[req.request_id]) - for req in scheduled_new_reqs - ] - resumed_reqs_data = [ - self._make_cached_request_data( - req, - num_scheduled_tokens[req.request_id], - len(scheduled_spec_decode_tokens.get(req.request_id, ())), - req_to_new_block_ids[req.request_id], - resumed_from_preemption=True, - ) for req in scheduled_resumed_reqs - ] - running_reqs_data = [ - self._make_cached_request_data( - req, - num_scheduled_tokens[req.request_id], - len(scheduled_spec_decode_tokens.get(req.request_id, ())), - req_to_new_block_ids[req.request_id], - resumed_from_preemption=False, - ) for req in scheduled_running_reqs - ] - scheduler_output = SchedulerOutput( - scheduled_new_reqs=new_reqs_data, - scheduled_cached_reqs=resumed_reqs_data + running_reqs_data, - num_scheduled_tokens=num_scheduled_tokens, - total_num_scheduled_tokens=total_num_scheduled_tokens, - scheduled_spec_decode_tokens=scheduled_spec_decode_tokens, - scheduled_encoder_inputs={}, - num_common_prefix_blocks=num_common_prefix_blocks, - # finished_req_ids is an existing state in the scheduler, - # instead of being newly scheduled in this step. - # It contains the request IDs that are finished in between - # the previous and the current steps. - finished_req_ids=self.finished_req_ids, # type: ignore - free_encoder_input_ids=self.encoder_cache_manager.get_freed_ids(), - structured_output_request_ids={}, - grammar_bitmask=None, - ) - - # Advance the number of computed tokens for the request AFTER - # the request is scheduled. - # 1. The scheduler_output of the current step has to include the - # original number of scheduled tokens to determine input IDs. - # 2. Advance the number of computed tokens here allowing us to - # schedule the prefill request again immediately in the next - # scheduling step. - # 3. If some tokens (e.g. spec tokens) are rejected later, the number of - # computed tokens will be adjusted in update_from_output. - for req_id, num_scheduled_token in num_scheduled_tokens.items(): - self.requests[req_id].num_computed_tokens += num_scheduled_token - - self.finished_req_ids = set() # type: ignore - return scheduler_output -- Gitee From a7fedb2aadad90af22ee5a4cab3d7ebbaab8b7b8 Mon Sep 17 00:00:00 2001 From: lvhaoyu Date: Thu, 15 May 2025 10:22:31 +0800 Subject: [PATCH 29/55] Revert "bugfix: add module init" This reverts commit 109edbf79a1405af1916d1ed46fe3020f71f5ed4. 
--- vllm_mindspore/v1/core/sched/__init__.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 vllm_mindspore/v1/core/sched/__init__.py diff --git a/vllm_mindspore/v1/core/sched/__init__.py b/vllm_mindspore/v1/core/sched/__init__.py deleted file mode 100644 index e69de29b..00000000 -- Gitee From 4bdd0c791b67c06575e73b7acbc0472e36a103d6 Mon Sep 17 00:00:00 2001 From: zhanzhan1 Date: Wed, 14 May 2025 00:08:25 +0800 Subject: [PATCH 30/55] Update for EP parallel config changes --- .../models/mf_models/deepseekv3_weight_processor.py | 4 ++-- .../model_executor/models/mf_models/weight_processor.py | 6 ++++-- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/vllm_mindspore/model_executor/models/mf_models/deepseekv3_weight_processor.py b/vllm_mindspore/model_executor/models/mf_models/deepseekv3_weight_processor.py index 966c569a..c68bf4ad 100644 --- a/vllm_mindspore/model_executor/models/mf_models/deepseekv3_weight_processor.py +++ b/vllm_mindspore/model_executor/models/mf_models/deepseekv3_weight_processor.py @@ -939,7 +939,7 @@ class DeepseekV3WeightProcessor(BaseWeightProcessor): requires_grad=False) def get_moe_shared_expert_weight(self, w1_hf_name, w2_hf_name, w3_hf_name, src_hf_dir, hf_weight_map): - if self.ep_method == EPMethod.ALLGATHER: + if self.ep_method in [EPMethod.DEFAULT, EPMethod.ALLGATHER]: w1_ms_param, _ = self.get_safetensor_from_file_split_global_group(w1_hf_name, src_hf_dir, hf_weight_map, split_axis=0) w2_ms_param, _ = self.get_safetensor_from_file_split_global_group(w2_hf_name, src_hf_dir, hf_weight_map, @@ -1290,7 +1290,7 @@ class DeepseekV3WeightProcessor(BaseWeightProcessor): w3_bias_name, w3_scale_name, w2_weight_name, src_hf_dir, hf_weight_map): - if self.ep_method == EPMethod.ALLGATHER: + if self.ep_method in [EPMethod.DEFAULT, EPMethod.ALLGATHER]: w1_weight_param, _ = self.get_safetensor_from_file_split_moe_tp_group(w1_weight_name, src_hf_dir, hf_weight_map, split_axis=0) diff --git a/vllm_mindspore/model_executor/models/mf_models/weight_processor.py b/vllm_mindspore/model_executor/models/mf_models/weight_processor.py index c70302f2..542dbf77 100644 --- a/vllm_mindspore/model_executor/models/mf_models/weight_processor.py +++ b/vllm_mindspore/model_executor/models/mf_models/weight_processor.py @@ -33,6 +33,7 @@ class EPMethod(Enum): ALLTOALL = 'alltoall' ALLGATHER = 'allgather' + class BaseWeightProcessor: r""" Provide model weight load and shards. 
@@ -51,8 +52,9 @@ class BaseWeightProcessor: self.tp_group_size = get_tp_world_size() self.dp_group_size = get_data_parallel_world_size() self.num_router_experts = self.config.moe_config.expert_num if self.config.moe_config.expert_num else 1 - self.moe_ep_size = self.config.moe_config.moe_expert_parallel if self.config.moe_config.moe_expert_parallel else 1 - self.moe_tp_size = self.config.moe_config.moe_tensor_parallel if self.config.moe_config.moe_tensor_parallel else 1 + self.moe_ep_size = self.config.parallel_config.expert_parallel \ + if self.config.parallel_config.expert_parallel else 1 + self.moe_tp_size = self.global_group_size // self.moe_ep_size self.ep_method = EPMethod.DEFAULT if self.dp_group_size > 1 and self.moe_ep_size == self.global_group_size: self.ep_method = EPMethod.ALLTOALL -- Gitee From f7d9fabc5bbf17648d77dc5c7b0fe874c04ba4bb Mon Sep 17 00:00:00 2001 From: lvhaoyu1 Date: Fri, 16 May 2025 00:00:07 +0800 Subject: [PATCH 31/55] update yaml and some code --- .jenkins/test/config/dependent_packages.yaml | 2 +- tests/st/python/config/predict_deepseek_r1_671b.yaml | 4 ---- .../python/config/predict_deepseek_r1_671b_w8a8.yaml | 7 ++----- tests/st/python/test_vllm_deepseek_bf16_part.py | 2 +- tests/st/python/test_vllm_mf_qwen_7b_cp_pc_mss.py | 1 + tests/st/python/test_vllm_qwen_7b.py | 1 + .../model_executor/models/mf_models/mf_model_base.py | 2 +- vllm_mindspore/model_executor/models/model_base.py | 10 ---------- 8 files changed, 7 insertions(+), 22 deletions(-) diff --git a/.jenkins/test/config/dependent_packages.yaml b/.jenkins/test/config/dependent_packages.yaml index cdb73e74..5df5ff1f 100644 --- a/.jenkins/test/config/dependent_packages.yaml +++ b/.jenkins/test/config/dependent_packages.yaml @@ -8,4 +8,4 @@ msadapter: 'https://repo.mindspore.cn/mindspore/msadapter/version/202505/20250514/master_20250514010016_380ecadf0133da436503105d6e8e1db709472fe4_newest/' vllm: - 'https://repo.mindspore.cn/mirrors/vllm/version/202505/20250514/v0.8.4.dev0_newest/any/vllm-0.8.4.dev0+g296c657.d20250514.empty-py3-none-any.whl' + 'https://repo.mindspore.cn/mirrors/vllm/version/202505/20250514/v0.8.4.dev0_newest/' diff --git a/tests/st/python/config/predict_deepseek_r1_671b.yaml b/tests/st/python/config/predict_deepseek_r1_671b.yaml index 112375ef..a4d05570 100644 --- a/tests/st/python/config/predict_deepseek_r1_671b.yaml +++ b/tests/st/python/config/predict_deepseek_r1_671b.yaml @@ -26,10 +26,6 @@ context: device_id: 0 affinity_cpu_list: None -kernel_launch_group: - thread_num: 4 - kernel_group_num: 16 - # parallel context config parallel: parallel_mode: "STAND_ALONE" # use 'STAND_ALONE' mode for inference with parallelism in frontend diff --git a/tests/st/python/config/predict_deepseek_r1_671b_w8a8.yaml b/tests/st/python/config/predict_deepseek_r1_671b_w8a8.yaml index 5a5e9d60..239adc12 100644 --- a/tests/st/python/config/predict_deepseek_r1_671b_w8a8.yaml +++ b/tests/st/python/config/predict_deepseek_r1_671b_w8a8.yaml @@ -14,6 +14,7 @@ trainer: # default parallel of device num = 16 for Atlas 800T A2 parallel_config: + data_parallel: 1 model_parallel: 16 pipeline_stage: 1 expert_parallel: 1 @@ -22,14 +23,10 @@ parallel_config: # mindspore context init config context: mode: 0 # 0--Graph Mode; 1--Pynative Mode - max_device_memory: "61GB" + max_device_memory: "58GB" device_id: 0 affinity_cpu_list: None -kernel_launch_group: - thread_num: 4 - kernel_group_num: 16 - # parallel context config parallel: parallel_mode: "STAND_ALONE" # use 'STAND_ALONE' mode for inference with parallelism in 
frontend diff --git a/tests/st/python/test_vllm_deepseek_bf16_part.py b/tests/st/python/test_vllm_deepseek_bf16_part.py index 86137637..f4db891a 100644 --- a/tests/st/python/test_vllm_deepseek_bf16_part.py +++ b/tests/st/python/test_vllm_deepseek_bf16_part.py @@ -23,7 +23,7 @@ env_vars = { "MINDFORMERS_MODEL_CONFIG": "./config/predict_deepseek_r1_671b.yaml", "ASCEND_CUSTOM_PATH": os.path.expandvars("$ASCEND_HOME_PATH/../"), "vLLM_MODEL_BACKEND": "MindFormers", - "MS_ENABLE_LCCL": "off", + "MS_ENABLE_LCCL": "on", "HCCL_OP_EXPANSION_MODE": "AIV", "ASCEND_RT_VISIBLE_DEVICES": "0,1,2,3,4,5,6,7", "MS_ALLOC_CONF": "enable_vmm:True", diff --git a/tests/st/python/test_vllm_mf_qwen_7b_cp_pc_mss.py b/tests/st/python/test_vllm_mf_qwen_7b_cp_pc_mss.py index df61117d..856932d3 100644 --- a/tests/st/python/test_vllm_mf_qwen_7b_cp_pc_mss.py +++ b/tests/st/python/test_vllm_mf_qwen_7b_cp_pc_mss.py @@ -45,6 +45,7 @@ class TestMfQwen_cp_pc_mss: @pytest.mark.level0 @pytest.mark.platform_arm_ascend910b_training @pytest.mark.env_single + @pytest.mark.skip(reason="cp precision need to be fixed on v0.8.3 V0") def test_mf_qwen_7b_cp_pc_mss(self): """ test case mf_qwen_7b_cp_pc_mss diff --git a/tests/st/python/test_vllm_qwen_7b.py b/tests/st/python/test_vllm_qwen_7b.py index 695e9cb6..028f2ead 100644 --- a/tests/st/python/test_vllm_qwen_7b.py +++ b/tests/st/python/test_vllm_qwen_7b.py @@ -44,6 +44,7 @@ class TestQwen: @pytest.mark.level0 @pytest.mark.platform_arm_ascend910b_training @pytest.mark.env_single + @pytest.mark.skip(reason="qwen need to be supported on v0.8.3 V0") def test_vllm_qwen(self): """ test case qwen2.5 7B diff --git a/vllm_mindspore/model_executor/models/mf_models/mf_model_base.py b/vllm_mindspore/model_executor/models/mf_models/mf_model_base.py index 533d16c7..32cbb916 100644 --- a/vllm_mindspore/model_executor/models/mf_models/mf_model_base.py +++ b/vllm_mindspore/model_executor/models/mf_models/mf_model_base.py @@ -216,7 +216,7 @@ class MfModelBase(MsModelBase): dtype=self.mf_model_config.compute_dtype) else: hidden_states = hidden_states.index_select(0, selected_token_indices) - logits = self.network.lm_head(hidden_states) + logits = self.lm_head(hidden_states) logits = logits.reshape(-1, logits.shape[-1]) else: logits = self.lm_head(hidden_states) diff --git a/vllm_mindspore/model_executor/models/model_base.py b/vllm_mindspore/model_executor/models/model_base.py index 75e4648e..3df62d4c 100644 --- a/vllm_mindspore/model_executor/models/model_base.py +++ b/vllm_mindspore/model_executor/models/model_base.py @@ -66,16 +66,6 @@ class Fake_MLA(Fake_Attention): ] -class Fake_MLA(Fake_Attention): - def __init__(self): - super().__init__() - vllm_config = get_current_vllm_config() - self.kv_cache = [ - (torch.zeros(self.kv_shape, dtype=torch.bfloat16, device="Ascend"),) - for _ in range(vllm_config.parallel_config.pipeline_parallel_size) - ] - - class Fake_Attention_V1(Attention): def __init__(self): vllm_config = get_current_vllm_config() -- Gitee From 5c0e7dae5b7aec636c2391eca656da263827f9de Mon Sep 17 00:00:00 2001 From: moran Date: Thu, 15 May 2025 15:49:36 +0800 Subject: [PATCH 32/55] remove unused set in pyproject.toml --- .../pyproject.toml | 17 ----------------- codecheck_toolkits/vllm_codecheck.sh | 4 ++++ 2 files changed, 4 insertions(+), 17 deletions(-) rename pyproject.toml => codecheck_toolkits/pyproject.toml (85%) diff --git a/pyproject.toml b/codecheck_toolkits/pyproject.toml similarity index 85% rename from pyproject.toml rename to codecheck_toolkits/pyproject.toml index 
65e258dc..8bce21ec 100644 --- a/pyproject.toml +++ b/codecheck_toolkits/pyproject.toml @@ -1,20 +1,3 @@ -[build-system] -# Should be mirrored in requirements-build.txt -requires = [ - "cmake>=3.26", - "ninja", - "packaging", - "setuptools>=61", - "setuptools-scm>=8.0", - "torch == 2.5.1", - "wheel", - "jinja2", -] -build-backend = "setuptools.build_meta" - -[tool.setuptools_scm] -# version_file = "vllm/_version.py" # currently handled by `setup.py:get_version()` - [tool.ruff] # Allow lines to be as long as 80. line-length = 80 diff --git a/codecheck_toolkits/vllm_codecheck.sh b/codecheck_toolkits/vllm_codecheck.sh index 928c70bd..201b4d52 100644 --- a/codecheck_toolkits/vllm_codecheck.sh +++ b/codecheck_toolkits/vllm_codecheck.sh @@ -1,5 +1,7 @@ pip install -r codecheck_toolkits/requirements-lint.txt +ln -s codecheck_toolkits/pyproject.toml pyproject.toml + RET_FLAG=0 # yapf check @@ -69,4 +71,6 @@ else echo "mypy check success." fi +rm -f pyproject.toml + exit $RET_FLAG -- Gitee From 2b3fee5149e3b08edf5f306c6aa8dcc4c3220ffb Mon Sep 17 00:00:00 2001 From: zlq2020 Date: Wed, 7 May 2025 21:20:16 +0800 Subject: [PATCH 33/55] remove inductor_pass --- vllm_mindspore/__init__.py | 6 +- vllm_mindspore/compilation/__init__.py | 0 vllm_mindspore/compilation/inductor_pass.py | 73 ------------------- .../model_executor/models/registry.py | 3 - 4 files changed, 1 insertion(+), 81 deletions(-) delete mode 100644 vllm_mindspore/compilation/__init__.py delete mode 100644 vllm_mindspore/compilation/inductor_pass.py diff --git a/vllm_mindspore/__init__.py b/vllm_mindspore/__init__.py index 3aae2091..d7515632 100644 --- a/vllm_mindspore/__init__.py +++ b/vllm_mindspore/__init__.py @@ -29,11 +29,7 @@ if "vllm" in sys.modules: from vllm_mindspore.scripts import env_setup env_setup() -# 2. replace the inductor_pass module before import vllm. -from vllm_mindspore.compilation import inductor_pass as ms_inductor_pass -sys.modules.update({"vllm.compilation.inductor_pass": ms_inductor_pass}) - -# 3. update the log configuration ahead of other modifications. +# 2. update the log configuration ahead of other modifications. import vllm_mindspore.logger from vllm_mindspore.platforms.ascend import AscendPlatform diff --git a/vllm_mindspore/compilation/__init__.py b/vllm_mindspore/compilation/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/vllm_mindspore/compilation/inductor_pass.py b/vllm_mindspore/compilation/inductor_pass.py deleted file mode 100644 index 835a1e11..00000000 --- a/vllm_mindspore/compilation/inductor_pass.py +++ /dev/null @@ -1,73 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 - -import hashlib -import inspect -import json -import types -from typing import Any, Callable, Dict, Optional, Union - -import torch -from packaging.version import Version - - -class InductorPass(): - """ - A custom graph pass that uses a hash of its source as the UUID. - This is defined as a convenience and should work in most cases. - """ - - def uuid(self) -> Any: - """ - Provide a unique identifier for the pass, used in Inductor code cache. - This should depend on the pass implementation, so that changes to the - pass result in recompilation. - By default, the object source is hashed. - """ - return InductorPass.hash_source(self) - - @staticmethod - def hash_source(*srcs: Union[str, Any]): - """ - Utility method to hash the sources of functions or objects. - :param srcs: strings or objects to add to the hash. - Objects and functions have their source inspected. 
- :return: - """ - hasher = hashlib.sha256() - for src in srcs: - if isinstance(src, str): - src_str = src - elif isinstance(src, types.FunctionType): - src_str = inspect.getsource(src) - else: - src_str = inspect.getsource(src.__class__) - hasher.update(src_str.encode("utf-8")) - return hasher.hexdigest() - - @staticmethod - def hash_dict(dict_: Dict[Any, Any]): - """ - Utility method to hash a dictionary, can alternatively be used for uuid. - :return: A sha256 hash of the json rep of the dictionary. - """ - encoded = json.dumps(dict_, sort_keys=True).encode("utf-8") - return hashlib.sha256(encoded).hexdigest() - - -class CallableInductorPass(InductorPass): - """ - This class is a wrapper for a callable that automatically provides an - implementation of the UUID. - """ - - def __init__(self, - callable, - uuid: Optional[Any] = None): - self.callable = callable - self._uuid = self.hash_source(callable) if uuid is None else uuid - - def __call__(self, graph): - self.callable(graph) - - def uuid(self) -> Any: - return self._uuid diff --git a/vllm_mindspore/model_executor/models/registry.py b/vllm_mindspore/model_executor/models/registry.py index d58130ef..0f01305e 100644 --- a/vllm_mindspore/model_executor/models/registry.py +++ b/vllm_mindspore/model_executor/models/registry.py @@ -67,9 +67,6 @@ _SUBPROCESS_COMMAND = [ def _run() -> None: - import vllm_mindspore.compilation.inductor_pass as ms_inductor_pass - sys.modules["vllm.compilation.inductor_pass"] = ms_inductor_pass - # Setup plugins from vllm.plugins import load_general_plugins load_general_plugins() -- Gitee From 9da74957ee5981f7e8fb0bd4669a6c312721b230 Mon Sep 17 00:00:00 2001 From: twc Date: Fri, 16 May 2025 16:12:58 +0800 Subject: [PATCH 34/55] code sync from master 1. fix fa and pa mask 2. Support for enable_prefix_caching --- tests/mindformers | 2 +- .../st/python/test_vllm_deepseek_bf16_part.py | 2 +- tests/st/python/test_vllm_deepseek_part.py | 2 +- .../python/test_vllm_deepseek_smoothquant.py | 2 +- vllm_mindspore/attention/layer.py | 11 ++- .../model_executor/layers/logits_processor.py | 18 +++-- vllm_mindspore/model_executor/layers/utils.py | 2 + .../models/{mf_models => }/attention_mask.py | 56 ++++++++++---- .../models/mf_models/deepseek_v3.py | 3 + .../models/mf_models/mf_model_base.py | 9 +-- .../model_executor/models/model_base.py | 43 ----------- vllm_mindspore/model_executor/models/qwen2.py | 73 ++++++++++++++++--- vllm_mindspore/utils.py | 9 +-- 13 files changed, 137 insertions(+), 95 deletions(-) rename vllm_mindspore/model_executor/models/{mf_models => }/attention_mask.py (39%) diff --git a/tests/mindformers b/tests/mindformers index 16587217..bbddc170 160000 --- a/tests/mindformers +++ b/tests/mindformers @@ -1 +1 @@ -Subproject commit 165872172ae5396cb4b66629614c85ff21038e11 +Subproject commit bbddc170167ac6705f07bf0aea25977e10f8d760 diff --git a/tests/st/python/test_vllm_deepseek_bf16_part.py b/tests/st/python/test_vllm_deepseek_bf16_part.py index f4db891a..97904c7f 100644 --- a/tests/st/python/test_vllm_deepseek_bf16_part.py +++ b/tests/st/python/test_vllm_deepseek_bf16_part.py @@ -61,7 +61,7 @@ class TestDeepSeek: # Create an LLM. llm = LLM(model="/home/workspace/mindspore_dataset/weight/DeepSeek-R1-bf16", - trust_remote_code=True, gpu_memory_utilization=0.9, tensor_parallel_size=8) + trust_remote_code=True, gpu_memory_utilization=0.9, tensor_parallel_size=8, max_model_len=4096) # Generate texts from the prompts. 
The output is a list of RequestOutput objects # that contain the prompt, generated text, and other information. outputs = llm.generate(prompts, sampling_params) diff --git a/tests/st/python/test_vllm_deepseek_part.py b/tests/st/python/test_vllm_deepseek_part.py index f88ece2c..c882ae58 100644 --- a/tests/st/python/test_vllm_deepseek_part.py +++ b/tests/st/python/test_vllm_deepseek_part.py @@ -64,7 +64,7 @@ class TestDeepSeek: # Create an LLM. llm = LLM(model="/home/workspace/mindspore_dataset/weight/DeepSeek-R1-W8A8", - trust_remote_code=True, gpu_memory_utilization=0.9, tensor_parallel_size=8) + trust_remote_code=True, gpu_memory_utilization=0.9, tensor_parallel_size=8, max_model_len=4096) # Generate texts from the prompts. The output is a list of RequestOutput objects # that contain the prompt, generated text, and other information. outputs = llm.generate(prompts, sampling_params) diff --git a/tests/st/python/test_vllm_deepseek_smoothquant.py b/tests/st/python/test_vllm_deepseek_smoothquant.py index 7e3397e0..c79222d9 100644 --- a/tests/st/python/test_vllm_deepseek_smoothquant.py +++ b/tests/st/python/test_vllm_deepseek_smoothquant.py @@ -64,7 +64,7 @@ class TestDeepSeek: # Create an LLM. llm = LLM(model="/home/workspace/mindspore_dataset/weight/DeepSeek-R1-W8A8-smoothquant", - trust_remote_code=True, gpu_memory_utilization=0.9, tensor_parallel_size=8) + trust_remote_code=True, gpu_memory_utilization=0.9, tensor_parallel_size=8, max_model_len=4096) # Generate texts from the prompts. The output is a list of RequestOutput objects # that contain the prompt, generated text, and other information. outputs = llm.generate(prompts, sampling_params) diff --git a/vllm_mindspore/attention/layer.py b/vllm_mindspore/attention/layer.py index 4634727b..89914e97 100644 --- a/vllm_mindspore/attention/layer.py +++ b/vllm_mindspore/attention/layer.py @@ -157,11 +157,10 @@ class Attention(nn.Cell): value_cache: Tensor, is_prefill: bool, slot_mapping: Tensor, - batch_valid_length: Tuple[int], + attn_mask: Tensor, + batch_valid_length: Tensor, q_seq_lens: Tensor, block_tables: Tensor, - attn_mask: Tensor, - decode_mask: Tensor, ) -> Tensor: """Attention foward, support MHA and GQA. @@ -181,7 +180,7 @@ class Attention(nn.Cell): output = self._run_prefill_forward(query, key, value, attn_mask, batch_valid_length, batch_valid_length) else: output = self._run_decode_forward(query, key_cache, value_cache, block_tables, batch_valid_length, - decode_mask, q_seq_lens) + attn_mask, q_seq_lens) return output def _run_prefill_forward( @@ -228,7 +227,7 @@ class Attention(nn.Cell): value_cache: Tensor, block_tables: Tensor, batch_valid_length: Tensor, - decode_mask: Tensor, + attn_mask: Tensor, q_seq_lens: Tensor, ) -> Tensor: """Decode with PagedAttention. @@ -248,7 +247,7 @@ class Attention(nn.Cell): batch_valid_length, None, None, - decode_mask, + attn_mask, q_seq_lens ) return output diff --git a/vllm_mindspore/model_executor/layers/logits_processor.py b/vllm_mindspore/model_executor/layers/logits_processor.py index 647b4ac8..75f35d6d 100644 --- a/vllm_mindspore/model_executor/layers/logits_processor.py +++ b/vllm_mindspore/model_executor/layers/logits_processor.py @@ -41,6 +41,7 @@ if envs.VLLM_LOGITS_PROCESSOR_THREADS is not None: _logits_processor_threadpool = ThreadPoolExecutor( envs.VLLM_LOGITS_PROCESSOR_THREADS) + class LogitsProcessor(nn.Cell): """Process logits and apply logits processors from sampling metadata. 
@@ -88,6 +89,8 @@ class LogitsProcessor(nn.Cell): logits = hidden_states else: if sampling_metadata is not None: + if sampling_metadata.selected_token_indices.numel() <= 0: + return mint.zeros((0, self.vocab_size), dtype=hidden_states.dtype) hidden_states = _prune_hidden_states(hidden_states, sampling_metadata) # Get the logits for the next tokens. @@ -102,7 +105,7 @@ class LogitsProcessor(nn.Cell): logits *= self.scale # Apply logits processors (if any). - if sampling_metadata is not None: + if sampling_metadata.seq_groups is not None: logits = _apply_logits_processors(logits, sampling_metadata) return logits @@ -146,10 +149,10 @@ def _prune_hidden_states( # NOTE(kzawora): The if guard is needed for Gaudi - in some scenarios # (warmup, profile_run) we might not have selected_token_indices, # so we skip pruning. - if sampling_metadata.selected_token_indices is not None: - return ops.gather(hidden_states, sampling_metadata.selected_token_indices, 0) - else: - return hidden_states + indices = sampling_metadata.selected_token_indices + if indices is not None and indices.numel() > 0: + return mint.index_select(hidden_states, 0, sampling_metadata.selected_token_indices) + return hidden_states def _apply_logits_processors( @@ -187,7 +190,7 @@ def _apply_logits_processors( logits_processed += len(seq_group.sample_indices) + len( seq_group.prompt_logprob_indices ) - + for logits_row_idx, future in logits_row_ids_and_logits_row_futures: logits[logits_row_idx] = future.result() @@ -196,6 +199,7 @@ def _apply_logits_processors( assert logits_processed == logits.shape[0] return logits + def _apply_logits_processors_single_seq(logits_row, logits_processors, past_tokens_ids, prompt_tokens_ids) -> Tensor: @@ -206,4 +210,4 @@ def _apply_logits_processors_single_seq(logits_row, logits_processors, logits_row) else: logits_row = logits_processor(past_tokens_ids, logits_row) - return logits_row \ No newline at end of file + return logits_row diff --git a/vllm_mindspore/model_executor/layers/utils.py b/vllm_mindspore/model_executor/layers/utils.py index bbef8d9c..0edf165c 100644 --- a/vllm_mindspore/model_executor/layers/utils.py +++ b/vllm_mindspore/model_executor/layers/utils.py @@ -53,6 +53,8 @@ def apply_penalties(logits: torch.Tensor, prompt_tokens_tensor: torch.Tensor, frequency_penalties: The frequency penalties of shape (num_seqs, ) repetition_penalties: The repetition penalties of shape (num_seqs, ) """ + if logits.numel() <= 0: + return logits num_seqs, vocab_size = logits.shape _, prompt_mask = get_token_bin_counts_and_mask(prompt_tokens_tensor, vocab_size, num_seqs) diff --git a/vllm_mindspore/model_executor/models/mf_models/attention_mask.py b/vllm_mindspore/model_executor/models/attention_mask.py similarity index 39% rename from vllm_mindspore/model_executor/models/mf_models/attention_mask.py rename to vllm_mindspore/model_executor/models/attention_mask.py index 10fcd25e..40be1f46 100644 --- a/vllm_mindspore/model_executor/models/mf_models/attention_mask.py +++ b/vllm_mindspore/model_executor/models/attention_mask.py @@ -18,36 +18,66 @@ infer attention mask. """ import numpy as np -import mindspore as ms -from mindspore import Tensor, JitConfig, Model +from mindspore import Tensor, mint +from mindspore import dtype as mstype + +r""" +PA:ASD-V2.1.5 +1.MLA + Q_seqlen =1: no mask.(BF16 mask(0/-10000), FP16 mask(0/-10000)). 
+2.MLA + Q_seqlen > 1: (MTP/PC/CP), BF16 mask(0/1), FP16 mask (0/-10000) +3.normal + Q_seqlen=1: no mask +4.normal + Q_seqlen > 1: (MTP/PC/CP),BF16 mask(0/-10000), FP16 mask(0/-10000).; + +FA:ASD-V2.1.5 +1.MLA: not implement; +2.normal: mask BF16(0/1), FP16 mask(0/-10000); +""" class LowerTriangularMask: r""" Provide Infer model attention mask. Args: - mf_model_config (MF Config): The config of Infer model. - + dtype (ms.dtype): The compute type of Infer model. + max_model_len (int): The max model length of Infer model. """ - def __init__(self, mf_model_config): - compute_dtype = mf_model_config.compute_dtype - seq_length = mf_model_config.seq_length - self.prefill_mask = Tensor(np.triu(np.ones(shape=(128, 128), dtype=np.float16), k=1), dtype=compute_dtype) + def __init__(self, dtype, max_model_len): + self.dtype = dtype + self.max_model_len = max_model_len + + prefill_mask_coeff = 1.0 if self.dtype is mstype.bfloat16 else -10000.0 - self.decode_mask = Tensor(np.triu(np.ones(shape=(seq_length, seq_length), dtype=np.int8), k=1), - dtype=compute_dtype) + self.prefill_mask = Tensor(np.triu(np.ones(shape=(128, 128), dtype=np.float16), k=1) * prefill_mask_coeff, + dtype=self.dtype) - self.hard_mask = Tensor([0], dtype=compute_dtype).reshape(1, 1) + self.decode_mask = Tensor(np.triu(np.ones(shape=(self.max_model_len, self.max_model_len), dtype=np.int8), k=1), + dtype=self.dtype) * -10000 - self.gather = ms.ops.Gather() + self.hard_mask = mint.zeros((1, 1), dtype=dtype) def gen_attention_mask(self, is_prefill, position_ids, query_lens): if is_prefill: attention_mask = self.prefill_mask else: if max(query_lens) > 1: - attention_mask = self.gather(self.decode_mask, position_ids, 0) + attention_mask = mint.index_select(self.decode_mask, 0, position_ids) else: attention_mask = self.hard_mask return attention_mask + + +class MLALowerTriangularMask(LowerTriangularMask): + r""" + Provide MLA Infer model attention mask. + Args: + dtype (ms.dtype): The compute type of Infer model. + max_model_len (int): The max model length of Infer model. 
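+
+    Illustrative usage (a sketch; the position and length tensors are
+    placeholders, not part of the original code):
+        mask = MLALowerTriangularMask(dtype=mstype.bfloat16, max_model_len=4096)
+        attn_mask = mask.gen_attention_mask(is_prefill, position_ids, query_lens)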
+ """ + + def __init__(self, dtype, max_model_len): + + super().__init__(dtype, max_model_len) + decode_mask_coeff = 1.0 if self.dtype is mstype.bfloat16 else -10000.0 + self.decode_mask = Tensor(np.triu(np.ones(shape=(self.max_model_len, self.max_model_len), dtype=np.int8), k=1), + dtype=self.dtype) * decode_mask_coeff diff --git a/vllm_mindspore/model_executor/models/mf_models/deepseek_v3.py b/vllm_mindspore/model_executor/models/mf_models/deepseek_v3.py index 3a1b2be6..67b80339 100644 --- a/vllm_mindspore/model_executor/models/mf_models/deepseek_v3.py +++ b/vllm_mindspore/model_executor/models/mf_models/deepseek_v3.py @@ -53,6 +53,7 @@ from vllm_mindspore.model_executor.layers.sampler import get_sampler from vllm_mindspore.model_executor.models.model_base import Fake_MLA, Fake_MLA_V1 from vllm_mindspore.model_executor.models.mf_models.mf_model_base import MfModelBase from vllm_mindspore.model_executor.models.mf_models.deepseekv3_weight_processor import DeepseekV3WeightProcessor +from vllm_mindspore.model_executor.models.attention_mask import MLALowerTriangularMask logger = init_logger(__name__) @@ -136,6 +137,8 @@ class DeepseekV3ForCausalLM(MfModelBase): self.set_flags = False set_runtime_kernel_launch_group() + self.casual_mask = MLALowerTriangularMask(dtype=self.mf_model_config.compute_dtype, + max_model_len=self.model_config.max_model_len) def _generate_model_config(self): self.mf_config.load_checkpoint = self.get_model_path() diff --git a/vllm_mindspore/model_executor/models/mf_models/mf_model_base.py b/vllm_mindspore/model_executor/models/mf_models/mf_model_base.py index 32cbb916..d11f776a 100644 --- a/vllm_mindspore/model_executor/models/mf_models/mf_model_base.py +++ b/vllm_mindspore/model_executor/models/mf_models/mf_model_base.py @@ -17,11 +17,9 @@ # ============================================================================ import os -from types import MethodType -from typing import Iterable, List, Optional, Set, Tuple, Union +from typing import Iterable, Optional, Set, Tuple, Union from abc import abstractmethod import numpy as np -import math from vllm.config import VllmConfig from vllm.model_executor.layers.sampler import SamplerOutput @@ -41,7 +39,7 @@ from mindformers.core.context import build_mf_context from mindformers.core.parallel_config import build_parallel_config from vllm_mindspore.model_executor.models.model_base import MsModelBase -from vllm_mindspore.model_executor.models.mf_models.attention_mask import LowerTriangularMask +from vllm_mindspore.model_executor.models.attention_mask import LowerTriangularMask from vllm_mindspore.v1.attention.backends.flash_attn import FlashAttentionMetadata logger = init_logger(__name__) @@ -64,7 +62,8 @@ class MfModelBase(MsModelBase): self.mf_config.model.model_config.parallel_config.pipeline_stage = 1 self._generate_model_config() - self.casual_mask = LowerTriangularMask(mf_model_config=self.mf_model_config) + self.casual_mask = LowerTriangularMask(dtype=self.mf_model_config.compute_dtype, + max_model_len=self.model_config.max_model_len) self.network, self.lm_head = self._create_network() affinity_config = self.mf_config.get('context', {}).get('affinity_cpu_list', {}) diff --git a/vllm_mindspore/model_executor/models/model_base.py b/vllm_mindspore/model_executor/models/model_base.py index 3df62d4c..7aa1de4e 100644 --- a/vllm_mindspore/model_executor/models/model_base.py +++ b/vllm_mindspore/model_executor/models/model_base.py @@ -32,9 +32,7 @@ from vllm.attention.layer import Attention import torch from mindspore import 
Tensor, nn, mutable -from mindspore import dtype as mstype -from vllm_mindspore.utils import STR_DTYPE_TO_MS_DTYPE class Fake_Attention: def __init__(self): @@ -220,47 +218,6 @@ class MsModelBase(): ) -> Union[Tensor, IntermediateTensors]: raise NotImplementedError - def set_model_inputs(self, is_prefill): - dyn_input_ids = Tensor(shape=[None, None], dtype=mstype.int64) - dyn_position_ids = Tensor(shape=[None], dtype=mstype.int64) - - block_size = self.cache_config.block_size - num_kv_heads = self.model_config.get_num_kv_heads(self.parallel_config) - head_size = self.model_config.get_head_size() - kv_cache_shape = (None, block_size, num_kv_heads, head_size) - - kv_cache_dtype = self.model_config.dtype if self.cache_config.cache_dtype == "auto" \ - else self.cache_config.cache_dtype - if kv_cache_dtype in STR_DTYPE_TO_MS_DTYPE: - kv_cache_dtype = STR_DTYPE_TO_MS_DTYPE[kv_cache_dtype] - - num_layers = self.model_config.get_num_layers(self.parallel_config) - - dyn_key_cache = mutable(Tensor(shape=kv_cache_shape, dtype=kv_cache_dtype)) - dyn_value_cache = mutable(Tensor(shape=kv_cache_shape, dtype=kv_cache_dtype)) - dyn_key_caches = mutable([dyn_key_cache for _ in range(num_layers)]) - dyn_value_caches = mutable([dyn_value_cache for _ in range(num_layers)]) - - dyn_batch_valid_length = Tensor(shape=[None, ], dtype=mstype.int32) - dyn_q_seq_lens = Tensor(shape=[None, ], dtype=mstype.int32) - dyn_slot_mapping = Tensor(shape=[None, ], dtype=mstype.int32) - dyn_block_tables = Tensor(shape=[None, None], dtype=mstype.int32) - dyn_intermediate_tensors = None - dyn_inputs_embeds = None - - self.model.set_inputs( - dyn_input_ids, - dyn_position_ids, - dyn_key_caches, - dyn_value_caches, - is_prefill, - dyn_slot_mapping, - dyn_batch_valid_length, - dyn_q_seq_lens, - dyn_block_tables, - dyn_intermediate_tensors, - dyn_inputs_embeds - ) def get_kvcache(self): key_cache = [] diff --git a/vllm_mindspore/model_executor/models/qwen2.py b/vllm_mindspore/model_executor/models/qwen2.py index 32d9da8d..5eb70a82 100644 --- a/vllm_mindspore/model_executor/models/qwen2.py +++ b/vllm_mindspore/model_executor/models/qwen2.py @@ -15,7 +15,6 @@ # See the License for the specific language governing permissions and # limitations under the License. 
# ============================================================================ -from vllm.config import get_current_vllm_config from typing import TYPE_CHECKING, Dict, List, Optional, Set, Tuple, Union, Iterable if TYPE_CHECKING: @@ -25,7 +24,7 @@ else: import numpy as np -from mindspore import Parameter, Tensor, mint, nn, jit, ops +from mindspore import Parameter, Tensor, mint, nn, jit, ops, mutable from mindspore.common import dtype as mstype @@ -49,6 +48,8 @@ from vllm_mindspore.model_executor.models.utils import ( maybe_prefix) from vllm_mindspore.model_executor.sampling_metadata import SamplingMetadata from vllm_mindspore.model_executor.models.model_base import MsModelBase, Fake_Attention +from vllm_mindspore.model_executor.models.attention_mask import LowerTriangularMask +from vllm_mindspore.utils import STR_DTYPE_TO_MS_DTYPE from vllm.config import CacheConfig, VllmConfig @@ -173,8 +174,6 @@ class Qwen2Attention(nn.Cell): prefix=f"{prefix}.attn", attn_type=attn_type ) - self.attn_mask = mint.triu(mint.ones(size=(128, 128), dtype=mstype.bfloat16), 1) - self.hard_mask = Tensor([0], dtype=mstype.bfloat16).reshape(1, 1) @jit def construct( @@ -185,15 +184,16 @@ class Qwen2Attention(nn.Cell): value_cache: Tensor, is_prefill: bool, slot_mapping: Tensor, - batch_valid_length: Tuple[int], + attn_mask: Tensor, + batch_valid_length: Tensor, q_seq_lens: Tensor, block_tables: Tensor, ) -> Tensor: qkv, _ = self.qkv_proj(hidden_states) q, k, v = mint.split(qkv, (self.q_size, self.kv_size, self.kv_size), -1) - q, k = self.rotary_emb(positions, q, k, q_seq_lens, is_prefill) - attn_output = self.attn(q, k, v, key_cache, value_cache, is_prefill, slot_mapping, batch_valid_length, - q_seq_lens, block_tables, self.attn_mask, self.hard_mask) + q, k = self.rotary_emb(positions, q, k, batch_valid_length, is_prefill) + attn_output = self.attn(q, k, v, key_cache, value_cache, is_prefill, slot_mapping, attn_mask, + batch_valid_length, q_seq_lens, block_tables) output, _ = self.o_proj(attn_output) return output @@ -257,7 +257,8 @@ class Qwen2DecoderLayer(nn.Cell): value_cache: Tensor, is_prefill: bool, slot_mapping: Tensor, - batch_valid_length: Tuple[int], + attn_mask: Tensor, + batch_valid_length: Tensor, q_seq_lens: Tensor, block_tables: Tensor, residual: Optional[Tensor], @@ -275,6 +276,7 @@ class Qwen2DecoderLayer(nn.Cell): value_cache, is_prefill, slot_mapping, + attn_mask, batch_valid_length, q_seq_lens, block_tables @@ -342,6 +344,7 @@ class Qwen2Model(nn.Cell): value_caches: List[Tensor], is_prefill: bool, slot_mapping: Tensor, + attn_mask: Tensor, batch_valid_length: Tensor, q_seq_lens: Tensor, block_tables: Tensor, @@ -367,6 +370,7 @@ class Qwen2Model(nn.Cell): value_caches[i - self.start_layer], is_prefill, slot_mapping, + attn_mask, batch_valid_length, q_seq_lens, block_tables, @@ -486,6 +490,9 @@ class Qwen2ForCausalLM(MsModelBase): self.set_modules({"model": self.model, "lm_head": self.lm_head}) self.prefill = True + self.mstype = STR_DTYPE_TO_MS_DTYPE.get(self.model_config.dtype, self.model_config.dtype) + self.casual_mask = LowerTriangularMask(dtype=self.mstype, + max_model_len=self.model_config.max_model_len) self.set_model_inputs(self.prefill) self.kv_caches = [Fake_Attention() for i in range(config.num_hidden_layers)] compilation_config = vllm_config.compilation_config @@ -495,8 +502,47 @@ class Qwen2ForCausalLM(MsModelBase): for i in range(config.num_hidden_layers): compilation_config.static_forward_context[str(i)] = self.kv_caches[i] - def get_input_embeddings(self, input_ids: Tensor) -> 
Tensor: - return self.model.get_input_embeddings(input_ids) + def set_model_inputs(self, is_prefill): + dyn_input_ids = Tensor(shape=[None, None], dtype=mstype.int64) + dyn_position_ids = Tensor(shape=[None], dtype=mstype.int64) + + block_size = self.cache_config.block_size + num_kv_heads = self.model_config.get_num_kv_heads(self.parallel_config) + head_size = self.model_config.get_head_size() + kv_cache_shape = (None, block_size, num_kv_heads, head_size) + + kv_cache_dtype = self.model_config.dtype if self.cache_config.cache_dtype == "auto" \ + else self.cache_config.cache_dtype + kv_cache_dtype = STR_DTYPE_TO_MS_DTYPE[kv_cache_dtype] + + num_layers = self.model_config.get_num_layers(self.parallel_config) + + dyn_key_cache = Tensor(shape=kv_cache_shape, dtype=kv_cache_dtype) + dyn_value_cache = Tensor(shape=kv_cache_shape, dtype=kv_cache_dtype) + dyn_key_caches = mutable([dyn_key_cache for _ in range(num_layers)]) + dyn_value_caches = mutable([dyn_value_cache for _ in range(num_layers)]) + + dyn_slot_mapping = Tensor(shape=[None, ], dtype=mstype.int32) + dynamic_attention_mask = Tensor(shape=[None, None], dtype=self.mstype) + dyn_batch_valid_length = Tensor(shape=[None,], dtype=mstype.int32) + dyn_q_seq_lens = Tensor(shape=[None, ], dtype=mstype.int32) + dyn_block_tables = Tensor(shape=[None, None], dtype=mstype.int32) + dyn_intermediate_tensors = None + dyn_inputs_embeds = None + self.model.set_inputs( + dyn_input_ids, + dyn_position_ids, + dyn_key_caches, + dyn_value_caches, + is_prefill, + dyn_slot_mapping, + dynamic_attention_mask, + dyn_batch_valid_length, + dyn_q_seq_lens, + dyn_block_tables, + dyn_intermediate_tensors, + dyn_inputs_embeds + ) def forward( self, @@ -535,7 +581,9 @@ class Qwen2ForCausalLM(MsModelBase): self.set_model_inputs(self.prefill) slot_mapping = attn_metadata.slot_mapping - batch_valid_length = Tensor.from_numpy(np.array(attn_metadata.seq_lens, dtype=np.int32)) + attn_mask = self.casual_mask.gen_attention_mask(is_prefill, positions, query_lens) + seq_lens_np = np.array(attn_metadata.seq_lens, dtype=np.int32) + batch_valid_length = Tensor.from_numpy(seq_lens_np) q_seq_lens = Tensor.from_numpy(np.array(attn_metadata.query_lens, dtype=np.int32)) block_tables = attn_metadata.block_tables model_output = self.model(input_ids, @@ -544,6 +592,7 @@ class Qwen2ForCausalLM(MsModelBase): value_cache, is_prefill, slot_mapping, + attn_mask, batch_valid_length, q_seq_lens, block_tables, diff --git a/vllm_mindspore/utils.py b/vllm_mindspore/utils.py index e0fc3b04..da942479 100644 --- a/vllm_mindspore/utils.py +++ b/vllm_mindspore/utils.py @@ -222,6 +222,10 @@ def check_ready(): # Common environment variables of predict. set_context(jit_config={"jit_level": "O0", "infer_boost": "on"}) + default_env = { + "MS_INTERNAL_DISABLE_CUSTOM_KERNEL_LIST": "FlashAttentionScore,PagedAttention", + } + env_setup(default_env) if os.getenv("MS_MEMPOOL_BLOCK_SIZE"): set_context(mempool_block_size=f"{os.environ['MS_MEMPOOL_BLOCK_SIZE']}GB") @@ -236,11 +240,6 @@ def check_ready(): 'For "MindFormers" model backend, environments %s should be set!' 
% str(lost_envs) ) - - mindformers_default_env = { - "MS_INTERNAL_DISABLE_CUSTOM_KERNEL_LIST": "FlashAttentionScore,PagedAttention", - } - env_setup(mindformers_default_env) else: logger.info("Run with native model backend!") -- Gitee From 44eb435f2ce1a62984fe84f0aeef2de3671ef6dc Mon Sep 17 00:00:00 2001 From: lvhaoyu Date: Fri, 16 May 2025 14:47:53 +0800 Subject: [PATCH 35/55] add testcase for v1 --- tests/st/python/test_sampler_v1.py | 569 ++++++++++++++++++ .../python/test_vllm_deepseek_bf16_part_v1.py | 76 +++ tests/st/python/test_vllm_deepseek_part_v1.py | 79 +++ .../test_vllm_mf_qwen_7b_chunk_prefill_v1.py | 89 +++ .../test_vllm_mf_qwen_7b_prefix_caching_v1.py | 83 +++ tests/st/python/test_vllm_mf_qwen_7b_v1.py | 119 ++++ 6 files changed, 1015 insertions(+) create mode 100644 tests/st/python/test_sampler_v1.py create mode 100644 tests/st/python/test_vllm_deepseek_bf16_part_v1.py create mode 100644 tests/st/python/test_vllm_deepseek_part_v1.py create mode 100644 tests/st/python/test_vllm_mf_qwen_7b_chunk_prefill_v1.py create mode 100644 tests/st/python/test_vllm_mf_qwen_7b_prefix_caching_v1.py create mode 100644 tests/st/python/test_vllm_mf_qwen_7b_v1.py diff --git a/tests/st/python/test_sampler_v1.py b/tests/st/python/test_sampler_v1.py new file mode 100644 index 00000000..af2cd483 --- /dev/null +++ b/tests/st/python/test_sampler_v1.py @@ -0,0 +1,569 @@ +#!/usr/bin/env python3 +# encoding: utf-8 +# Copyright 2025 Huawei Technologies Co., Ltd +# Copyright 2025 The vLLM team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================ +from typing import Optional + +import numpy as np +import pytest +import torch + +import vllm_mindspore +from vllm.utils import make_tensor_with_pad +from vllm.v1.sample.metadata import SamplingMetadata +from vllm.v1.sample.sampler import Sampler + +VOCAB_SIZE = 1024 +NUM_OUTPUT_TOKENS = 20 +CUDA_DEVICES = [ + f"cuda:{0}" +] +MAX_NUM_PROMPT_TOKENS = 64 + + +def _create_fake_logits(batch_size: int, vocab_size: int) -> torch.Tensor: + fake_logits = torch.full((batch_size, vocab_size), 1e-2, dtype=torch.float) + return fake_logits + + +def _create_penalty_tensor(batch_size: int, penalty_value: float, + device: torch.device) -> torch.Tensor: + return torch.full((batch_size, ), + fill_value=penalty_value, + dtype=torch.float, + device=device) + + +def _create_prompt_tokens_tensor( + prompt_token_ids: list[list[int]], + vocab_size: int, + device: torch.device, +) -> torch.Tensor: + return make_tensor_with_pad( + prompt_token_ids, + pad=vocab_size, + device=device, + dtype=torch.int64, + pin_memory=False, + ) + + +def _create_logit_bias( + batch_size: int, + vocab_size: int, + bias_value: float, +) -> list[Optional[dict[int, float]]]: + res: list[Optional[dict[int, float]]] = [] + for i in range(batch_size): + logit_bias = {min(i, vocab_size - 1): bias_value} + res.append(logit_bias) + return res + + +def _create_allowed_token_ids( + batch_size: int, + vocab_size: int, + num_allowed_token_ids: int, + device: torch.device, +) -> Optional[torch.Tensor]: + mask: Optional[torch.Tensor] = None + for i in range(batch_size): + if i % 2 == 1: + continue + if mask is None: + mask = torch.zeros((batch_size, vocab_size), + dtype=torch.bool, + device=device) + start = min(i, vocab_size - 1) + end = min(i + num_allowed_token_ids, vocab_size - 1) + mask[i, start:end] = True + return mask + + +def _create_bad_words_token_ids( + batch_size: int, vocab_size: int, + bad_words_lengths: list[tuple[int]]) -> dict[int, list[list[int]]]: + bad_words_token_ids = {} + for batch_idx in range(batch_size): + token_ids_single_batch = [] + for bad_words_length in bad_words_lengths: + token_ids = np.random.choice(vocab_size, + size=bad_words_length, + replace=True).tolist() + token_ids_single_batch.append(token_ids) + bad_words_token_ids[batch_idx] = token_ids_single_batch + if batch_size >= 2: + # Test no bad_words for some batch + no_bad_words_batch_idx = np.random.choice(batch_size) + bad_words_token_ids.pop(no_bad_words_batch_idx, None) + return bad_words_token_ids + + +def _update_output_token_ids_for_bad_words( + metadata: SamplingMetadata, vocab_size: int) -> dict[int, list[int]]: + bad_words_last_tokens = {} + for batch_idx, bad_words_token_ids in metadata.bad_words_token_ids.items(): + output_token_ids = metadata.output_token_ids[batch_idx] + bad_words_last_token: list[int] = [] + for i, bad_word_token_ids in enumerate(bad_words_token_ids): + if len(bad_word_token_ids) == 1: + # Single token id always affects logits + bad_words_last_token.append(bad_word_token_ids[0]) + else: + prefix_length = len(bad_word_token_ids) - 1 + has_bad_words = np.random.choice([True, False]) + if has_bad_words: + output_token_ids[-prefix_length:] = bad_word_token_ids[:-1] + bad_words_last_token.append(bad_word_token_ids[-1]) + break # Maximum one update to output_token_ids + else: # Make sure no accidental match to bad words + output_token_ids[-1] = (bad_word_token_ids[-2] + + 1) % vocab_size + bad_words_last_tokens[batch_idx] = bad_words_last_token + return 
bad_words_last_tokens + + +def _create_default_sampling_metadata( + num_output_tokens: int, + batch_size: int, + vocab_size: int, + device: torch.device, +) -> SamplingMetadata: + output_token_ids: list[list[int]] = [] + prompt_token_ids: list[list[int]] = [] + for _ in range(batch_size): + output_token_ids.append( + np.random.randint(0, vocab_size, size=num_output_tokens).tolist()) + prompt_token_ids.append( + np.random.randint(0, + vocab_size, + size=np.random.randint( + 1, MAX_NUM_PROMPT_TOKENS)).tolist()) + fake_sampling_metadata = SamplingMetadata( + temperature=torch.full((batch_size, ), 0.0), + all_greedy=True, + all_random=False, + top_p=None, + top_k=None, + min_p=None, + generators={}, + max_num_logprobs=0, + prompt_token_ids=_create_prompt_tokens_tensor(prompt_token_ids, + vocab_size, device), + output_token_ids=output_token_ids, + frequency_penalties=_create_penalty_tensor(batch_size, 0.0, device), + presence_penalties=_create_penalty_tensor(batch_size, 0.0, device), + repetition_penalties=_create_penalty_tensor(batch_size, 1.0, device), + no_penalties=True, + min_tokens={}, + logit_bias=[None] * batch_size, + allowed_token_ids_mask=None, + bad_words_token_ids={}, + ) + return fake_sampling_metadata + + +def _generate_min_token_penalties_and_stop_tokens( + num_output_tokens: int, batch_size: int, vocab_size: int, + batch_indices_for_min_token_penalty: list[int] +) -> dict[int, tuple[int, set[int]]]: + """ + Generates and returns a dict of minimum token penalties and + corresponding stop token IDs (`min_tokens`, `stop_token_ids`) for each + batch. + + If a batch index is included in `batch_indices_for_min_token_penalty`, + a higher `min_tokens` value is assigned (within a randomized range), + and a random set of stop token IDs is created. Otherwise, a lower + `min_tokens` value is assigned, and the stop token IDs set is empty. + """ + min_tokens: dict[int, tuple[int, set[int]]] = {} + for index in range(batch_size): + if index in batch_indices_for_min_token_penalty: + min_tokens[index] = ( + np.random.randint(num_output_tokens + 1, + 2 * num_output_tokens), + set( + np.random.randint(0, vocab_size - 1) + for _ in range(np.random.randint(0, vocab_size)))) + else: + min_tokens[index] = (np.random.randint(0, + num_output_tokens), set()) + return min_tokens + + +def _create_weighted_output_token_list( + batch_size: int, + vocab_size: int) -> tuple[list[list[int]], list[list[int]]]: + """ + Creates an output token list where each token occurs a distinct + number of times. + + For each batch, a random subset of token IDs is selected from the + vocabulary. The selected tokens are then added to the output token + list, each with a different frequency. + + Returns: + tuple[list[list[int]], list[list[int]]]: + - The first element is the output token list, where each sublist + corresponds to a batch and contains tokens with weighted + frequencies. + - The second element is a list of distinct token IDs for each + batch, ordered by their frequency in the corresponding output + list. 
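+
+    Illustrative example (values are placeholders): if a batch draws the
+    distinct token IDs [7, 3], its output list becomes [7, 3, 3] (the i-th
+    distinct token appears i + 1 times) and its sorted-ID list is [7, 3].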
+ """ + output_token_ids: list[list[int]] = [] + sorted_token_ids_in_output: list[list[int]] = [] + for _ in range(batch_size): + distinct_token_ids = np.random.choice(vocab_size, + size=np.random.randint(1, 10), + replace=False).tolist() + sorted_token_ids_in_output.append(distinct_token_ids) + output_token_ids_for_batch = [] + for index, token_id in enumerate(distinct_token_ids): + output_token_ids_for_batch.extend( + [token_id for _ in range(index + 1)]) + output_token_ids.append(output_token_ids_for_batch) + return output_token_ids, sorted_token_ids_in_output + +@pytest.mark.level0 +@pytest.mark.platform_arm_ascend910b_training +@pytest.mark.env_single +@pytest.mark.parametrize("device", CUDA_DEVICES) +@pytest.mark.parametrize("batch_size", [1, 2]) +def test_sampler_min_tokens_penalty(device: str, batch_size: int): + """ + Tests that if the number of output tokens is less than + SamplingParams.min_tokens then we will set the logits for + the stop token ids to -inf. + """ + fake_logits = _create_fake_logits(batch_size, VOCAB_SIZE) + sampling_metadata = _create_default_sampling_metadata( + NUM_OUTPUT_TOKENS, batch_size, VOCAB_SIZE, torch.device(device)) + batch_indices_for_min_token_penalty = np.random.randint( + 0, batch_size - 1, size=np.random.randint(0, batch_size)).tolist() + min_tokens = _generate_min_token_penalties_and_stop_tokens( + NUM_OUTPUT_TOKENS, batch_size, VOCAB_SIZE, + batch_indices_for_min_token_penalty) + sampling_metadata.min_tokens = min_tokens + sampler = Sampler() + logits = sampler.apply_penalties(fake_logits, sampling_metadata) + logits = logits.cpu() + for batch_idx in range(batch_size): + for token_id in range(VOCAB_SIZE): + _, stop_token_ids = min_tokens.get(batch_idx, (0, set())) + if token_id in stop_token_ids: + assert logits[batch_idx][token_id] == -float("inf") + else: + assert logits[batch_idx][token_id] != -float("inf") + +@pytest.mark.level0 +@pytest.mark.platform_arm_ascend910b_training +@pytest.mark.env_single +@pytest.mark.parametrize("device", CUDA_DEVICES) +@pytest.mark.parametrize("batch_size", [1, 2]) +@pytest.mark.parametrize("presence_penalty", [-2.0, 2.0]) +def test_sampler_presence_penalty(device: str, batch_size: int, + presence_penalty: float): + """ + Test to verify that if presence penalty is enabled then tokens + are penalized as per their presence in the existing output. + """ + # Create fake logits where each token is assigned the same + # logit value. + fake_logits = _create_fake_logits(batch_size, VOCAB_SIZE) + sampling_metadata = _create_default_sampling_metadata( + NUM_OUTPUT_TOKENS, batch_size, VOCAB_SIZE, torch.device(device)) + output_token_ids = sampling_metadata.output_token_ids + sampling_metadata.presence_penalties = _create_penalty_tensor( + batch_size, presence_penalty, torch.device(device)) + sampling_metadata.no_penalties = False + sampler = Sampler() + logits = sampler.apply_penalties(fake_logits, sampling_metadata) + logits = logits.cpu() + for batch_idx in range(batch_size): + # Since all tokens initially have the same logits, the non-penalized + # token ID will be the one with the highest logit value, while the + # penalized token ID will be the one with the lowest logit value. + non_penalized_token_id = logits[batch_idx].argmax().item() + penalized_token_id = logits[batch_idx].argmin().item() + if presence_penalty > 0: + # If `presence_penalty` is set to a value greater than 0, it + # indicates a preference for new tokens over those already + # present in the output. 
+ # Verify that the penalized token ID exists in the output, while the + # non-penalized token ID does not. + assert penalized_token_id in output_token_ids[batch_idx] + assert non_penalized_token_id not in output_token_ids[batch_idx] + elif presence_penalty < 0: + # If `presence_penalty` is set to a value less than 0, it indicates + # a preference for existing tokens over new ones. Verify that the + # non-penalized token ID exists in the output, while the penalized + # token ID does not. + assert non_penalized_token_id in output_token_ids[batch_idx] + assert penalized_token_id not in output_token_ids[batch_idx] + +@pytest.mark.level0 +@pytest.mark.platform_arm_ascend910b_training +@pytest.mark.env_single +@pytest.mark.parametrize("device", CUDA_DEVICES) +@pytest.mark.parametrize("batch_size", [1, 2]) +@pytest.mark.parametrize("frequency_penalty", [-2.0, 2.0]) +def test_sampler_frequency_penalty(device: str, batch_size: int, + frequency_penalty: float): + """ + Test to verify that if frequency penalty is enabled then tokens are + penalized as per their frequency of occurrence. + """ + # Create fake logits where each token is assigned the same + # logit value. + fake_logits = _create_fake_logits(batch_size, VOCAB_SIZE) + sampling_metadata = _create_default_sampling_metadata( + NUM_OUTPUT_TOKENS, batch_size, VOCAB_SIZE, torch.device(device)) + sampling_metadata.frequency_penalties = _create_penalty_tensor( + batch_size, frequency_penalty, torch.device(device)) + output_token_ids, sorted_token_ids_in_output = \ + _create_weighted_output_token_list( + batch_size, + VOCAB_SIZE, + ) + sampling_metadata.output_token_ids = output_token_ids + sampling_metadata.no_penalties = False + sampler = Sampler() + logits = sampler.apply_penalties(fake_logits, sampling_metadata) + logits = logits.cpu() + for batch_idx in range(batch_size): + non_penalized_token_id = logits[batch_idx].argmax().item() + penalized_token_id = logits[batch_idx].argmin().item() + distinct_sorted_token_ids_in_output = sorted_token_ids_in_output[ + batch_idx] + most_frequent_token_id = distinct_sorted_token_ids_in_output[ + len(distinct_sorted_token_ids_in_output) - 1] + if frequency_penalty > 0: + # If `frequency_penalty` is set to > 0, it indicates + # a preference for new tokens over existing ones. Verify that the + # non-penalized token ID is not present in the output, while the + # most penalized token is the one that occurs most frequently in + # the output. + assert (non_penalized_token_id + not in distinct_sorted_token_ids_in_output) + assert penalized_token_id == most_frequent_token_id + elif frequency_penalty < 0: + # If `frequency_penalty` is set to < 0, it indicates + # a preference for existing tokens over new ones. Verify that the + # non-penalized token ID is the one that occurs most frequently + # in the output, while the penalized token ID is one that has not + # yet appeared. + assert non_penalized_token_id == most_frequent_token_id + assert penalized_token_id not in distinct_sorted_token_ids_in_output + +@pytest.mark.level0 +@pytest.mark.platform_arm_ascend910b_training +@pytest.mark.env_single +@pytest.mark.parametrize("device", CUDA_DEVICES) +@pytest.mark.parametrize("batch_size", [1, 2]) +@pytest.mark.parametrize("repetition_penalty", [0.1, 1.9]) +def test_sampler_repetition_penalty(device: str, batch_size: int, + repetition_penalty: float): + """ + Test to verify that when the repetition penalty is enabled, tokens + are penalized based on their presence in the prompt or the existing + output. 
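+
+    With the uniform, positive fake logits used here, a repetition penalty
+    greater than 1.0 leaves every previously seen token (from the prompt or
+    the output) scoring below the unseen ones, and a penalty below 1.0
+    reverses that ordering (an illustrative summary of the assertions below).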
+ """ + # Create fake logits where each token is assigned the same + # logit value. + fake_logits = _create_fake_logits(batch_size, VOCAB_SIZE) + sampling_metadata = _create_default_sampling_metadata( + NUM_OUTPUT_TOKENS, batch_size, VOCAB_SIZE, torch.device(device)) + sampling_metadata.repetition_penalties = _create_penalty_tensor( + batch_size, repetition_penalty, torch.device(device)) + sampling_metadata.no_penalties = False + sampler = Sampler() + logits = sampler.apply_penalties(fake_logits, sampling_metadata) + logits = logits.cpu() + for batch_idx in range(batch_size): + non_penalized_token_id = logits[batch_idx].argmax().item() + penalized_token_id = logits[batch_idx].argmin().item() + prompt_tokens = sampling_metadata.prompt_token_ids[ + batch_idx][:].tolist() + output_tokens = sampling_metadata.output_token_ids[batch_idx] + if repetition_penalty > 1.0: + # If `repetition_penalty` > 1.0, verify that the non-penalized + # token ID has not been seen before, while the penalized token ID + # exists either in the prompt or the output. + assert (non_penalized_token_id not in prompt_tokens + and non_penalized_token_id not in output_tokens) + assert (penalized_token_id in prompt_tokens + or penalized_token_id in output_tokens) + elif repetition_penalty < 1.0: + # If `repetition_penalty` < 1.0, verify that the penalized + # token ID has not been seen before, while the non-penalized + # token ID exists either in the prompt or the output. + assert (penalized_token_id not in prompt_tokens + and penalized_token_id not in output_tokens) + assert (non_penalized_token_id in prompt_tokens + or non_penalized_token_id in output_tokens) + +@pytest.mark.level0 +@pytest.mark.platform_arm_ascend910b_training +@pytest.mark.env_single +@pytest.mark.parametrize("device", CUDA_DEVICES) +@pytest.mark.parametrize("batch_size", [1, 2]) +@pytest.mark.parametrize("min_p", [0.0, 0.1]) +def test_sampler_min_p(device: str, batch_size: int, min_p: float): + """ + Tests that when min_p is applied, tokens with probability below + min_p * max_prob are masked with -inf. 
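+
+    Illustrative numbers (placeholders): the first token gets a dominant
+    logit, so with min_p = 0.1 every other token falls below
+    0.1 * max_prob and is masked, while with min_p = 0.0 nothing is masked.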
+ """ + fake_logits = _create_fake_logits(batch_size, VOCAB_SIZE) + + # Create one dominant token per batch + for i in range(batch_size): + fake_logits[i, 0] = 10.0 # High logit for first token + fake_logits[i, 1:] = 1e-2 # Others remain low + + sampling_metadata = _create_default_sampling_metadata( + NUM_OUTPUT_TOKENS, batch_size, VOCAB_SIZE, torch.device(device)) + + # Configure min_p parameters + sampling_metadata.min_p = torch.full((batch_size, ), min_p, device=device) + + sampler = Sampler() + logits = sampler.apply_min_p(fake_logits, sampling_metadata.min_p) + logits = logits.cpu() + + for batch_idx in range(batch_size): + for token_id in range(VOCAB_SIZE): + if token_id == 0: + # Dominant token should always be unmasked + assert logits[batch_idx][token_id] != -float("inf") + else: + if min_p > 0.0: + # Non-dominant tokens should be masked when min_p > 0 + assert logits[batch_idx][token_id] == -float("inf") + else: + # No masking when min_p is 0 + assert logits[batch_idx][token_id] != -float("inf") + +@pytest.mark.level0 +@pytest.mark.platform_arm_ascend910b_training +@pytest.mark.env_single +@pytest.mark.parametrize("device", CUDA_DEVICES) +@pytest.mark.parametrize("batch_size", [1, 2]) +@pytest.mark.parametrize("bias_value", [-0.1, 1.2]) +def test_sampler_logit_bias(device: str, batch_size: int, bias_value: float): + """ + Test to verify that when the repetition penalty is enabled, tokens + are penalized based on their presence in the prompt or the existing + output. + """ + # Create fake logits where each token is assigned the same + # logit value. + fake_logits = _create_fake_logits(batch_size, VOCAB_SIZE) + sampling_metadata = _create_default_sampling_metadata( + NUM_OUTPUT_TOKENS, batch_size, VOCAB_SIZE, torch.device(device)) + sampling_metadata.logit_bias = _create_logit_bias( + batch_size=batch_size, + vocab_size=VOCAB_SIZE, + bias_value=bias_value, + ) + sampler = Sampler() + logits = sampler.apply_logits_bias(fake_logits, sampling_metadata) + logits = logits.cpu() + for batch_idx in range(batch_size): + logits_for_req = logits[batch_idx] + biased_index = min(batch_idx, VOCAB_SIZE - 1) + for token_id in range(VOCAB_SIZE): + if biased_index == token_id: + assert logits_for_req[token_id].item() == pytest.approx(bias_value + + 1e-2) + else: + assert logits_for_req[token_id].item() == pytest.approx(1e-2) + +@pytest.mark.level0 +@pytest.mark.platform_arm_ascend910b_training +@pytest.mark.env_single +@pytest.mark.parametrize("device", CUDA_DEVICES) +@pytest.mark.parametrize("batch_size", [1, 2]) +@pytest.mark.parametrize("num_allowed_token_ids", [0, 1, 2]) +def test_sampler_allowed_token_ids(device: str, batch_size: int, + num_allowed_token_ids: int): + """ + Test to verify that when the repetition penalty is enabled, tokens + are penalized based on their presence in the prompt or the existing + output. + """ + # Create fake logits where each token is assigned the same + # logit value. 
+ fake_logits = _create_fake_logits(batch_size, VOCAB_SIZE) + sampling_metadata = _create_default_sampling_metadata( + NUM_OUTPUT_TOKENS, batch_size, VOCAB_SIZE, torch.device(device)) + mask = _create_allowed_token_ids( + batch_size=batch_size, + vocab_size=VOCAB_SIZE, + num_allowed_token_ids=num_allowed_token_ids, + device=device, + ) + sampling_metadata.allowed_token_ids_mask = mask + sampler = Sampler() + logits = sampler.apply_allowed_token_ids(fake_logits, sampling_metadata) + logits = logits.cpu() + for batch_idx in range(batch_size): + logits_for_req = logits[batch_idx] + if batch_idx % 2 == 1: + assert torch.all(logits_for_req != -float("inf")) + continue + for token_id in range(VOCAB_SIZE): + start = min(batch_idx, VOCAB_SIZE - 1) + end = min(batch_idx + num_allowed_token_ids, VOCAB_SIZE - 1) + if token_id >= start and token_id < end: + assert logits_for_req[token_id] == -float( + "inf"), f"{batch_idx}, {token_id}" + else: + assert logits_for_req[token_id] != -float("inf") + +@pytest.mark.level0 +@pytest.mark.platform_arm_ascend910b_training +@pytest.mark.env_single +@pytest.mark.parametrize("device", CUDA_DEVICES) +@pytest.mark.parametrize("batch_size", [1, 2]) +@pytest.mark.parametrize("bad_words_lengths", [(1, ), (1, 3), (2, 2)]) +def test_sampler_bad_words(device: str, batch_size: int, + bad_words_lengths: list[tuple[int]]): + """ + Test to verify that when the bad words restriction is present, tokens + are penalized based on their match with the bad words. + """ + # Create fake logits where each token is assigned the same + # logit value. + fake_logits = _create_fake_logits(batch_size, VOCAB_SIZE) + sampling_metadata = _create_default_sampling_metadata( + NUM_OUTPUT_TOKENS, batch_size, VOCAB_SIZE, torch.device(device)) + sampling_metadata.bad_words_token_ids = _create_bad_words_token_ids( + batch_size, VOCAB_SIZE, bad_words_lengths) + bad_words_last_tokens = _update_output_token_ids_for_bad_words( + sampling_metadata, VOCAB_SIZE) + sampler = Sampler() + logits = sampler.apply_bad_words(fake_logits, sampling_metadata) + logits = logits.cpu() + for batch_idx in range(batch_size): + logits_for_req = logits[batch_idx] + for token_id in range(VOCAB_SIZE): + if (batch_idx in bad_words_last_tokens + and token_id in bad_words_last_tokens[batch_idx]): + assert logits_for_req[token_id] == -float("inf") + else: + assert logits_for_req[token_id] != -float("inf") diff --git a/tests/st/python/test_vllm_deepseek_bf16_part_v1.py b/tests/st/python/test_vllm_deepseek_bf16_part_v1.py new file mode 100644 index 00000000..11167779 --- /dev/null +++ b/tests/st/python/test_vllm_deepseek_bf16_part_v1.py @@ -0,0 +1,76 @@ +# Copyright 2024 The vLLM team. +# Copyright 2024 Microsoft and the HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://wwww.apache.org/licenses/LICENSE-2.0 +# +# Unless required by application law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +"""test mf deepseek r1.""" +import pytest +import os +from . 
import set_env +env_manager = set_env.EnvVarManager() +# def env +env_vars = { + "MINDFORMERS_MODEL_CONFIG": "./config/predict_deepseek_r1_671b.yaml", + "ASCEND_CUSTOM_PATH": os.path.expandvars("$ASCEND_HOME_PATH/../"), + "vLLM_MODEL_BACKEND": "MindFormers", + "MS_ENABLE_LCCL": "on", + "HCCL_OP_EXPANSION_MODE": "AIV", + "ASCEND_RT_VISIBLE_DEVICES": "0,1,2,3,4,5,6,7", + "MS_ALLOC_CONF": "enable_vmm:True", + "LCCL_DETERMINISTIC": "1", + "HCCL_DETERMINISTIC": "true", + "ATB_MATMUL_SHUFFLE_K_ENABLE": "0", + "ATB_LLM_LCOC_ENABLE": "0", +} +# set env +env_manager.setup_ai_environment(env_vars) +import vllm_mindspore +from vllm import LLM, SamplingParams + +class TestDeepSeek: + """ + Test Deepseek. + """ + + @pytest.mark.level0 + @pytest.mark.platform_arm_ascend910b_training + @pytest.mark.env_single + def test_deepseek_r1_bf16(self): + """ + test case deepseek r1 bf16 + """ + + # Sample prompts. + prompts = [ + "You are a helpful assistant.<|User|>将文本分类为中性、负面或正面。 \n文本:我认为这次假期还可以。 \n情感:<|Assistant|>\n", + ] + + # Create a sampling params object. + sampling_params = SamplingParams(temperature=0.0, max_tokens=10, top_k=1) + + # Create an LLM. + llm = LLM(model="/home/workspace/mindspore_dataset/weight/DeepSeek-R1-bf16", + trust_remote_code=True, gpu_memory_utilization=0.9, tensor_parallel_size=8) + # Generate texts from the prompts. The output is a list of RequestOutput objects + # that contain the prompt, generated text, and other information. + outputs = llm.generate(prompts, sampling_params) + except_list=['ugs611ాలు sic辨hara的开璞 SquaresInsp'] + # Print the outputs. + for i, output in enumerate(outputs): + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") + assert generated_text == except_list[i] + + # unset env + env_manager.unset_all() diff --git a/tests/st/python/test_vllm_deepseek_part_v1.py b/tests/st/python/test_vllm_deepseek_part_v1.py new file mode 100644 index 00000000..3ec16fa7 --- /dev/null +++ b/tests/st/python/test_vllm_deepseek_part_v1.py @@ -0,0 +1,79 @@ +#!/usr/bin/env python3 +# encoding: utf-8 +# Copyright 2025 Huawei Technologies Co., Ltd +# Copyright 2024 The vLLM team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +"""test mf deepseek r1.""" +import pytest +import os +from . 
import set_env +env_manager = set_env.EnvVarManager() +# def env +env_vars = { + "MINDFORMERS_MODEL_CONFIG": "./config/predict_deepseek_r1_671b_w8a8.yaml", + "ASCEND_CUSTOM_PATH": os.path.expandvars("$ASCEND_HOME_PATH/../"), + "vLLM_MODEL_BACKEND": "MindFormers", + "MS_ENABLE_LCCL": "off", + "HCCL_OP_EXPANSION_MODE": "AIV", + "ASCEND_RT_VISIBLE_DEVICES": "0,1,2,3,4,5,6,7", + "MS_ALLOC_CONF": "enable_vmm:True", + "LCCL_DETERMINISTIC": "1", + "HCCL_DETERMINISTIC": "true", + "ATB_MATMUL_SHUFFLE_K_ENABLE": "0", + "ATB_LLM_LCOC_ENABLE": "0", +} +# set env +env_manager.setup_ai_environment(env_vars) +import vllm_mindspore +from vllm import LLM, SamplingParams + +class TestDeepSeek: + """ + Test Deepseek. + """ + + @pytest.mark.level0 + @pytest.mark.platform_arm_ascend910b_training + @pytest.mark.env_single + @pytest.mark.skip(reason="gs master branch is not suit for the newest mindformers.") + def test_deepseek_r1(self): + """ + test case deepseek r1 w8a8 + """ + + # Sample prompts. + prompts = [ + "You are a helpful assistant.<|User|>将文本分类为中性、负面或正面。 \n文本:我认为这次假期还可以。 \n情感:<|Assistant|>\n", + ] + + # Create a sampling params object. + sampling_params = SamplingParams(temperature=0.0, max_tokens=10, top_k=1) + + # Create an LLM. + llm = LLM(model="/home/workspace/mindspore_dataset/weight/DeepSeek-R1-W8A8", + trust_remote_code=True, gpu_memory_utilization=0.9, tensor_parallel_size=8) + # Generate texts from the prompts. The output is a list of RequestOutput objects + # that contain the prompt, generated text, and other information. + outputs = llm.generate(prompts, sampling_params) + except_list=['ugs611ాలు哒ాలు mahassisemaSTE的道德', 'ugs611ాలు哒ాలు mah战区rollerOVERlaid'] + # Print the outputs. + for i, output in enumerate(outputs): + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") + assert generated_text in except_list + + # unset env + env_manager.unset_all() diff --git a/tests/st/python/test_vllm_mf_qwen_7b_chunk_prefill_v1.py b/tests/st/python/test_vllm_mf_qwen_7b_chunk_prefill_v1.py new file mode 100644 index 00000000..dfc738b2 --- /dev/null +++ b/tests/st/python/test_vllm_mf_qwen_7b_chunk_prefill_v1.py @@ -0,0 +1,89 @@ +# Copyright 2024 The vLLM team. +# Copyright 2024 Microsoft and the HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://wwww.apache.org/licenses/LICENSE-2.0 +# +# Unless required by application law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +"""test mf qwen chunk prefill.""" +import pytest +import os +from . 
import set_env + +env_manager = set_env.EnvVarManager() +# def env +env_vars = { + "MINDFORMERS_MODEL_CONFIG": "./config/predict_qwen2_5_7b_instruct.yaml", + "ASCEND_CUSTOM_PATH": os.path.expandvars("$ASCEND_HOME_PATH/../"), + "vLLM_MODEL_BACKEND": "MindFormers", + "MS_ENABLE_LCCL": "off", + "HCCL_OP_EXPANSION_MODE": "AIV", + "ASCEND_RT_VISIBLE_DEVICES": "0,1", + "MS_ALLOC_CONF": "enable_vmm:True", + "LCCL_DETERMINISTIC": "1", + "HCCL_DETERMINISTIC": "true", + "ATB_MATMUL_SHUFFLE_K_ENABLE": "0", + "ATB_LLM_LCOC_ENABLE": "0", +} +# set env +env_manager.setup_ai_environment(env_vars) +import vllm_mindspore +from vllm import LLM, SamplingParams + + +class TestMfQwen_chunk_prefill_v1: + """ + Test qwen. + """ + + @pytest.mark.level0 + @pytest.mark.platform_arm_ascend910b_training + @pytest.mark.env_single + @pytest.mark.skip(reason="cp precision need to be fixed on v0.8.3 V0") + def test_mf_qwen_7b_chunk_prefill(self): + """ + test case qwen_7b_chunk_prefill + """ + + # Sample prompts. + batch_datas = [{ + "prompt": "I love Beijing, because it is a city with a long history and profound cultural heritage. Walking through " + "its ancient hutongs, one can almost feel the whispers of the past. The Forbidden City, an architectural " + "marvel that once housed emperors, stands as a testament to the city's imperial past. Meanwhile, the Great " + "Wall, though not within the city limits, is easily accessible from Beijing and offers a glimpse into the " + "strategic genius and resilience of ancient China.", + "answer": " The city's blend of traditional and modern architecture, bustling markets, and vibrant street life make it " + "a unique and fascinating destination. In short, Beijing is a city"}, + {"prompt": "I love Beijing, because", + "answer": " it is a city with a long history. Which of the following options correctly expresses this sentence?\nA. I love Beijing, because it is a city with a"}, + ] + + # Create a sampling params object. + sampling_params = SamplingParams(temperature=0.0, max_tokens=32, top_k=1) + + # Create an LLM. + llm = LLM(model="/home/workspace/mindspore_dataset/weight/Qwen2.5-7B-Instruct", + max_model_len=8192, max_num_seqs=16, max_num_batched_tokens=32, + block_size=32, gpu_memory_utilization=0.85, tensor_parallel_size=2) + # Generate texts from the prompts. The output is a list of RequestOutput objects + # that contain the prompt, generated text, and other information. + for batch_data in batch_datas: + prompt = batch_data["prompt"] + answer = batch_data["answer"] + outputs = llm.generate(prompt, sampling_params) + # Print the outputs. + for i, output in enumerate(outputs): + generated_text = output.outputs[0].text + print(f"Prompt: {output.prompt!r}, Generated text: {generated_text!r}") + assert generated_text == answer + + # unset env + env_manager.unset_all() diff --git a/tests/st/python/test_vllm_mf_qwen_7b_prefix_caching_v1.py b/tests/st/python/test_vllm_mf_qwen_7b_prefix_caching_v1.py new file mode 100644 index 00000000..2bce85ad --- /dev/null +++ b/tests/st/python/test_vllm_mf_qwen_7b_prefix_caching_v1.py @@ -0,0 +1,83 @@ +#!/usr/bin/env python3 +# encoding: utf-8 +# Copyright 2025 Huawei Technologies Co., Ltd +# Copyright 2024 The vLLM team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +"""test mf qwen prefix caching.""" +import pytest +import os +from . import set_env +env_manager = set_env.EnvVarManager() +env_vars = { + "MINDFORMERS_MODEL_CONFIG": "./config/predict_qwen2_5_7b_instruct.yaml", + "ASCEND_CUSTOM_PATH": os.path.expandvars("$ASCEND_HOME_PATH/../"), + "vLLM_MODEL_BACKEND": "MindFormers", + "MS_ENABLE_LCCL": "off", + "HCCL_OP_EXPANSION_MODE": "AIV", + "ASCEND_RT_VISIBLE_DEVICES": "0,1", + "LCCL_DETERMINISTIC": "1", + "HCCL_DETERMINISTIC": "true", + "ATB_MATMUL_SHUFFLE_K_ENABLE": "0", + "ATB_LLM_LCOC_ENABLE": "0", +} +env_manager.setup_ai_environment(env_vars) +import vllm_mindspore +from vllm import LLM, SamplingParams + + +class TestMfQwen_prefix_caching_v1: + """ + Test qwen7b enable prefix_caching + """ + @pytest.mark.level0 + @pytest.mark.platform_arm_ascend910b_training + @pytest.mark.env_single + @pytest.mark.skip(reason="pc precision need to be fixed on v0.8.3 V0") + def test_mf_qwen_7b_prefix_caching(self): + """ + test case qwen_7b_prefix_caching + """ + + # First prompts. + prompts = [ + "I love Beijing, because it is a city that has so much to offer. I have visited" + ] + #second prompts, the second prompt is a continuation of the first prompts, make sure prefix caching work. + second_prompts = [ + "I love Beijing, because it is a city that has so much to offer. I have visited many places" + ] + # Create a sampling params object. + sampling_params = SamplingParams(temperature=0.0, max_tokens=10, top_k=1) + + # Create an LLM. + llm = LLM(model="/home/workspace/mindspore_dataset/weight/Qwen2.5-7B-Instruct", + max_model_len=8192, block_size=16, tensor_parallel_size=2) + # Generate texts from the prompts. The output is a list of RequestOutput objects + # that contain the prompt, generated text, and other information. + outputs = llm.generate(prompts, sampling_params) + second_outputs = llm.generate(second_prompts, sampling_params) + except_list=[' many times and each time I have found something new'] + second_except_list=[' in Beijing, but I have to say that the'] + for i, (output, second_output) in enumerate(zip(outputs, second_outputs)): + generated_text = output.outputs[i].text + print(f"Output1 - Prompt: {prompts[i]!r}, Generated text: {generated_text!r}") + assert generated_text == except_list[i] + + second_generated_text = second_output.outputs[i].text + print(f"Output2 - Prompt: {second_prompts[i]!r}, Generated text: {second_generated_text!r}") + assert second_generated_text == second_except_list[i] + + env_manager.unset_all() diff --git a/tests/st/python/test_vllm_mf_qwen_7b_v1.py b/tests/st/python/test_vllm_mf_qwen_7b_v1.py new file mode 100644 index 00000000..3ad831ff --- /dev/null +++ b/tests/st/python/test_vllm_mf_qwen_7b_v1.py @@ -0,0 +1,119 @@ +#!/usr/bin/env python3 +# encoding: utf-8 +# Copyright 2025 Huawei Technologies Co., Ltd +# Copyright 2024 The vLLM team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +"""test mf qwen.""" +import pytest +import os +from . import set_env +env_manager = set_env.EnvVarManager() +# def env +env_vars = { + "MINDFORMERS_MODEL_CONFIG": "./config/predict_qwen2_5_7b_instruct.yaml", + "ASCEND_CUSTOM_PATH": os.path.expandvars("$ASCEND_HOME_PATH/../"), + "vLLM_MODEL_BACKEND": "MindFormers", + "MS_ENABLE_LCCL": "off", + "HCCL_OP_EXPANSION_MODE": "AIV", + "ASCEND_RT_VISIBLE_DEVICES": "0,1", + "MS_ALLOC_CONF": "enable_vmm:True", + "LCCL_DETERMINISTIC": "1", + "HCCL_DETERMINISTIC": "true", + "ATB_MATMUL_SHUFFLE_K_ENABLE": "0", + "ATB_LLM_LCOC_ENABLE": "0", +} +# set env +env_manager.setup_ai_environment(env_vars) +import vllm_mindspore +from vllm import LLM, SamplingParams + + +class TestMfQwenV1: + """ + Test Qwen. + """ + @pytest.mark.level0 + @pytest.mark.platform_arm_ascend910b_training + @pytest.mark.env_single + def test_mf_qwen(self): + """ + test case qwen2.5 7B + """ + + # Sample prompts. + prompts = [ + "You are a helpful assistant.<|User|>将文本分类为中性、负面或正面。 \n文本:我认为这次假期还可以。 \n情感:<|Assistant|>\n", + ] + + # Create a sampling params object. + sampling_params = SamplingParams(temperature=0.0, max_tokens=10, top_k=1) + + # Create an LLM. + llm = LLM(model="/home/workspace/mindspore_dataset/weight/Qwen2.5-7B-Instruct", + gpu_memory_utilization=0.9, tensor_parallel_size=2) + # Generate texts from the prompts. The output is a list of RequestOutput objects + # that contain the prompt, generated text, and other information. + outputs = llm.generate(prompts, sampling_params) + except_list=['中性<|Assistant|> 这句话'] + # Print the outputs. + for i, output in enumerate(outputs): + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") + assert generated_text == except_list[i] + + # unset env + env_manager.unset_all() + + @pytest.mark.level0 + @pytest.mark.platform_arm_ascend910b_training + @pytest.mark.env_single + @pytest.mark.skip(reason="pc precision need to be fixed on v0.8.3 V0") + def test_mf_qwen_batch(self): + """ + test case qwen2.5 7B, to test prefill and decode mixed, can trigger PA q_seq_len > 1 + """ + # Sample prompts. + prompts = [ + "北京烤鸭是", + "请介绍一下华为,华为是", + "今年似乎大模型之间的内卷已经有些偃旗息鼓了,各大技术公司逐渐聪单纯追求模型参数量的竞赛中抽身," + "转向更加注重模型的实际>应用效果和效率", + ] * 2 + + # Create a sampling params object. + sampling_params = SamplingParams(temperature=0.0, max_tokens=10, top_k=1) + + # Create an LLM. + llm = LLM(model="/home/workspace/mindspore_dataset/weight/Qwen2.5-7B-Instruct", block_size=32, + gpu_memory_utilization=0.9, tensor_parallel_size=2) + # Generate texts from the prompts. The output is a list of RequestOutput objects + # that contain the prompt, generated text, and other information. + outputs = llm.generate(prompts, sampling_params) + except_list=[ + "享誉世界的中华美食,其制作工艺独特,", + "做什么的? 华为是一家中国公司,", + "。 \n在这一背景下,阿里云发布了通", + ] * 2 + # Print the outputs. 
+ for i, output in enumerate(outputs): + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") + assert generated_text in except_list[i] + + # unset env + env_manager.unset_all() + -- Gitee From 6b7de351d70e11eaca309ce1607aff0a805258e7 Mon Sep 17 00:00:00 2001 From: fengtingyan Date: Mon, 19 May 2025 20:20:33 +0800 Subject: [PATCH 36/55] [Feature] V1 PD disaggregation, DLLM vllm v0.8.3 --- vllm_mindspore/__init__.py | 9 ++++ vllm_mindspore/engine/arg_utils.py | 6 --- .../models/mf_models/deepseek_v3.py | 17 ++++++ .../models/mf_models/mf_model_base.py | 52 ++++++++++++++++++- vllm_mindspore/v1/utils.py | 1 + vllm_mindspore/v1/worker/gpu_worker.py | 18 ++++--- vllm_mindspore/worker/worker.py | 1 - 7 files changed, 90 insertions(+), 14 deletions(-) diff --git a/vllm_mindspore/__init__.py b/vllm_mindspore/__init__.py index 3aae2091..224ae26e 100644 --- a/vllm_mindspore/__init__.py +++ b/vllm_mindspore/__init__.py @@ -315,4 +315,13 @@ from vllm_mindspore.engine.multiprocessing.engine import cleanup import vllm.engine.multiprocessing.engine vllm.engine.multiprocessing.engine.MQLLMEngine.cleanup = cleanup +try: + from vllm.distributed.kv_transfer.kv_connector.factory import KVConnectorFactory + + KVConnectorFactory.register_connector( + "DLLMDsConnector", + "dllm.dkvc.v1.dllm_ds_connector", + "DLLMDsConnector") +except: + pass check_ready() diff --git a/vllm_mindspore/engine/arg_utils.py b/vllm_mindspore/engine/arg_utils.py index ed74ba9e..9efb8923 100644 --- a/vllm_mindspore/engine/arg_utils.py +++ b/vllm_mindspore/engine/arg_utils.py @@ -164,12 +164,6 @@ def _is_v1_supported_oracle(self, model_config: ModelConfig) -> bool: recommend_to_remove=False) return False - # No Disaggregated Prefill so far. - if self.kv_transfer_config != EngineArgs.kv_transfer_config: - _raise_or_fallback(feature_name="--kv-transfer-config", - recommend_to_remove=False) - return False - # No FlashInfer or XFormers so far. 
V1_BACKENDS = [ "FLASH_ATTN_VLLM_V1", "FLASH_ATTN", "PALLAS", "PALLAS_VLLM_V1", diff --git a/vllm_mindspore/model_executor/models/mf_models/deepseek_v3.py b/vllm_mindspore/model_executor/models/mf_models/deepseek_v3.py index 3a1b2be6..e493c9c2 100644 --- a/vllm_mindspore/model_executor/models/mf_models/deepseek_v3.py +++ b/vllm_mindspore/model_executor/models/mf_models/deepseek_v3.py @@ -29,11 +29,13 @@ from vllm.config import get_current_vllm_config from vllm.distributed.parallel_state import get_dp_group, get_tensor_model_parallel_world_size from vllm.forward_context import get_forward_context from vllm.logger import init_logger +from vllm.attention.layer import Attention import mindspore as ms from mindspore import Tensor, JitConfig, Model, mutable from mindspore.common import dtype as msdtype from mindspore.nn.utils import no_init_parameters +from mindspore.common.api import _pynative_executor from mindspore_gs.ptq import PTQ from mindspore_gs.ptq import PTQMode, PTQConfig, OutliersSuppressionType, PrecisionRecovery, QuantGranularity, \ @@ -54,6 +56,13 @@ from vllm_mindspore.model_executor.models.model_base import Fake_MLA, Fake_MLA_V from vllm_mindspore.model_executor.models.mf_models.mf_model_base import MfModelBase from vllm_mindspore.model_executor.models.mf_models.deepseekv3_weight_processor import DeepseekV3WeightProcessor +try: + # Need to apply dllm pd patch on vllm to use pd disagg related functions + from vllm.attention.layer import maybe_save_kv_layer_to_connector +except ImportError: + pass + + logger = init_logger(__name__) @@ -168,6 +177,14 @@ class DeepseekV3ForCausalLM(MfModelBase): key_cache.append(k_cache) return mutable(key_cache), None + def connector_send_kvcache(self): + _pynative_executor.sync() + forward_context = get_forward_context() + for i in range(self.mf_model_config.num_layers): + kv_cache_module = self.kv_caches[i] + kv_cache = kv_cache_module.kv_cache[forward_context.virtual_engine][0] + maybe_save_kv_layer_to_connector("key." 
+ str(i), kv_cache) + def load_weights(self, weights: Iterable[Tuple[str, Tensor]]) -> Set[str]: if self.mf_config.load_ckpt_format == "ckpt": model = Model(self.network) diff --git a/vllm_mindspore/model_executor/models/mf_models/mf_model_base.py b/vllm_mindspore/model_executor/models/mf_models/mf_model_base.py index 32cbb916..51b0bc89 100644 --- a/vllm_mindspore/model_executor/models/mf_models/mf_model_base.py +++ b/vllm_mindspore/model_executor/models/mf_models/mf_model_base.py @@ -39,11 +39,20 @@ from mindspore.common.api import _pynative_executor from mindformers.tools.register.config import MindFormerConfig from mindformers.core.context import build_mf_context from mindformers.core.parallel_config import build_parallel_config - +from mindspore.common.api import _pynative_executor from vllm_mindspore.model_executor.models.model_base import MsModelBase from vllm_mindspore.model_executor.models.mf_models.attention_mask import LowerTriangularMask from vllm_mindspore.v1.attention.backends.flash_attn import FlashAttentionMetadata +try: + # Need to apply dllm pd patch on vllm to use pd disagg related functions + from vllm.attention.layer import maybe_save_kv_layer_to_connector, wait_for_kv_layer_from_connector + from vllm.distributed.kv_transfer import is_v1_kv_transfer_group + kv_transfer_supported = True +except: + kv_transfer_supported = False + + logger = init_logger(__name__) class MfModelBase(MsModelBase): @@ -52,6 +61,7 @@ class MfModelBase(MsModelBase): vllm_config=vllm_config, prefix=prefix ) + self.kv_transfer_config = vllm_config.kv_transfer_config self.mf_config = MindFormerConfig(os.getenv("MINDFORMERS_MODEL_CONFIG")) build_mf_context(self.mf_config) build_parallel_config(self.mf_config) @@ -80,6 +90,18 @@ class MfModelBase(MsModelBase): raise NotImplementedError("Function _create_network should be Implemented!") + def is_decoder_task(self) -> bool: + if self.kv_transfer_config is None: + return False + + return self.kv_transfer_config.is_kv_consumer + + def is_prefill_task(self) -> bool: + if self.kv_transfer_config is None: + return False + + return self.kv_transfer_config.is_kv_producer + def _dummy_attention_metadata(self, input_ids: Tensor, positions: Tensor) -> FlashAttentionMetadata: input_len = input_ids.shape[0] max_seq_len = ms.Tensor(input_len, dtype=ms.int32) @@ -176,6 +198,24 @@ class MfModelBase(MsModelBase): def update_model_inputs(self, model_inputs, **kwargs): return model_inputs + def connector_send_kvcache(self): + #TODO 可优化 + _pynative_executor.sync() + forward_context = get_forward_context() + for i in range(self.mf_model_config.num_layers): + kv_cache = self.kv_caches[i] + k_cache = kv_cache.kv_cache[forward_context.virtual_engine][0] + v_cache = kv_cache.kv_cache[forward_context.virtual_engine][1] + maybe_save_kv_layer_to_connector("key." + str(i), (k_cache, v_cache)) + + + def connector_wait_for_kv_layer(self): + logger.debug(f"connector_wait_for_kv_layer") + #TODO 可优化 + for i in range(self.mf_model_config.num_layers): + wait_for_kv_layer_from_connector("key." 
+ str(i)) + + def forward( self, input_ids: Tensor, @@ -199,7 +239,17 @@ class MfModelBase(MsModelBase): if not self.set_flags: self.network.add_flags_custom(is_first_iteration=False) self.set_flags = True + if kv_transfer_supported: + if is_v1_kv_transfer_group(): + self.connector_send_kvcache() else: + if kv_transfer_supported: + if is_v1_kv_transfer_group() and self.is_prefill_task(): + self.connector_send_kvcache() + + if is_v1_kv_transfer_group() and self.is_decoder_task(): + self.connector_wait_for_kv_layer() + logger.debug(f"connector_wait_for_kv_layer success") hidden_states = self.network(**model_inputs) return hidden_states diff --git a/vllm_mindspore/v1/utils.py b/vllm_mindspore/v1/utils.py index 6833c101..c13c292d 100644 --- a/vllm_mindspore/v1/utils.py +++ b/vllm_mindspore/v1/utils.py @@ -10,3 +10,4 @@ def copy_slice(from_tensor: torch.Tensor, to_tensor: torch.Tensor, """ to_tensor[:length] = from_tensor[:length] return to_tensor + diff --git a/vllm_mindspore/v1/worker/gpu_worker.py b/vllm_mindspore/v1/worker/gpu_worker.py index 0395c339..3ebde737 100644 --- a/vllm_mindspore/v1/worker/gpu_worker.py +++ b/vllm_mindspore/v1/worker/gpu_worker.py @@ -6,7 +6,6 @@ import torch from vllm.logger import init_logger from vllm.distributed.parallel_state import get_pp_group - logger = init_logger(__name__) @@ -19,8 +18,8 @@ def init_device(self): config = get_current_vllm_config() if config is not None and config.parallel_config.data_parallel_size > 1: - device_id = self.parallel_config.data_parallel_rank_local * self.parallel_config.world_size + self.local_rank - self.device = torch.device(f"cuda:{device_id}") + self.local_rank = self.parallel_config.data_parallel_rank_local * self.parallel_config.world_size + self.local_rank + self.device = torch.device(f"cuda:{self.local_rank}") else: self.device = torch.device(f"cuda:{self.local_rank}") torch.cuda.set_device(self.device) @@ -31,9 +30,16 @@ def init_device(self): self.init_gpu_memory = torch.cuda.mem_get_info()[0] # Initialize the distributed environment. - init_worker_distributed_environment(self.parallel_config, self.rank, - self.distributed_init_method, - self.local_rank) + try: + # not None -> Module found: DLLM patch applied + init_worker_distributed_environment(config, self.rank, + self.distributed_init_method, + self.local_rank) + except: + # None -> Module not found: Patch not applied + init_worker_distributed_environment(self.parallel_config, self.rank, + self.distributed_init_method, + self.local_rank) # Set random seed. 
set_random_seed(self.model_config.seed) diff --git a/vllm_mindspore/worker/worker.py b/vllm_mindspore/worker/worker.py index 8ce1bc91..2dc69fcd 100644 --- a/vllm_mindspore/worker/worker.py +++ b/vllm_mindspore/worker/worker.py @@ -26,7 +26,6 @@ import torch from vllm.config import VllmConfig from vllm.distributed import ( - ensure_kv_transfer_initialized, ensure_model_parallel_initialized, init_distributed_environment, set_custom_all_reduce, -- Gitee From 58d5dd926ae3d5507229e14d7897a48287101446 Mon Sep 17 00:00:00 2001 From: Erpim Date: Mon, 19 May 2025 23:14:38 +0800 Subject: [PATCH 37/55] update owner --- OWNERS | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/OWNERS b/OWNERS index 90ee21da..d6adb0dc 100644 --- a/OWNERS +++ b/OWNERS @@ -1,12 +1,16 @@ reviewers: -- wang_shaocong -- erpim -- zhang_xue_tong -- tan-wei-cheng +- zhaizhiqiang +- panshaowu +- zichun_ye +- ckey_dou approvers: -- tronzhang -- zichun_ye - zlq2020 -- panshaowu -- zhaizhiqiang \ No newline at end of file +- zhang_xue_tong +- tronzhang +- r1chardf1d0 +- wang_shaocong +- erpim +- tan-wei-cheng +- hangangqiang +- dayschan -- Gitee From 1d1f300b7c7513e6e26b635484497181c6c65112 Mon Sep 17 00:00:00 2001 From: lvhaoyu Date: Tue, 20 May 2025 09:33:32 +0800 Subject: [PATCH 38/55] set CPU communication waiting time to avoid timeout --- vllm_mindspore/scripts.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/vllm_mindspore/scripts.py b/vllm_mindspore/scripts.py index ef297418..bcd90780 100644 --- a/vllm_mindspore/scripts.py +++ b/vllm_mindspore/scripts.py @@ -44,7 +44,10 @@ def env_setup(target_env_dict=None): "HCCL_OP_EXPANSION_MODE": "AIV", "MS_JIT_MODULES": "vllm_mindspore,research", "GLOG_v": "3", - "RAY_CGRAPH_get_timeout": "360" + "RAY_CGRAPH_get_timeout": "360", + # For CPU communication timeout setting, default is 15s, change to 180s + # to avoid multi node timeout when starting service. + "MS_NODE_TIMEOUT": "180" } for key, value in target_env_dict.items(): -- Gitee From 4f963cc19c15c012169d66fe10f999b979bf34a2 Mon Sep 17 00:00:00 2001 From: ccsszz Date: Mon, 28 Apr 2025 20:43:30 +0800 Subject: [PATCH 39/55] change smoothquant config --- .../python/test_vllm_deepseek_smoothquant.py | 2 +- .../test_vllm_deepseek_smoothquant_mss.py | 78 +++++++ .../models/mf_models/deepseek_v3.py | 36 ++-- .../mf_models/deepseekv3_weight_processor.py | 190 ++---------------- 4 files changed, 115 insertions(+), 191 deletions(-) create mode 100644 tests/st/python/test_vllm_deepseek_smoothquant_mss.py diff --git a/tests/st/python/test_vllm_deepseek_smoothquant.py b/tests/st/python/test_vllm_deepseek_smoothquant.py index c79222d9..ec61d309 100644 --- a/tests/st/python/test_vllm_deepseek_smoothquant.py +++ b/tests/st/python/test_vllm_deepseek_smoothquant.py @@ -63,7 +63,7 @@ class TestDeepSeek: sampling_params = SamplingParams(temperature=0.0, max_tokens=10, top_k=1) # Create an LLM. - llm = LLM(model="/home/workspace/mindspore_dataset/weight/DeepSeek-R1-W8A8-smoothquant", + llm = LLM(model="/home/workspace/mindspore_dataset/weight/DeepSeek-R1-W8A8-smoothquant-newconfig", trust_remote_code=True, gpu_memory_utilization=0.9, tensor_parallel_size=8, max_model_len=4096) # Generate texts from the prompts. The output is a list of RequestOutput objects # that contain the prompt, generated text, and other information. 
diff --git a/tests/st/python/test_vllm_deepseek_smoothquant_mss.py b/tests/st/python/test_vllm_deepseek_smoothquant_mss.py
new file mode 100644
index 00000000..bc3ad06b
--- /dev/null
+++ b/tests/st/python/test_vllm_deepseek_smoothquant_mss.py
@@ -0,0 +1,78 @@
+#!/usr/bin/env python3
+# encoding: utf-8
+# Copyright 2025 Huawei Technologies Co., Ltd
+# Copyright 2024 The vLLM team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+"""test mf deepseek r1 smoothquant mss."""
+import pytest
+import os
+from . import set_env
+env_manager = set_env.EnvVarManager()
+# def env
+env_vars = {
+    "MINDFORMERS_MODEL_CONFIG": "./config/predict_deepseek_r1_671b_w8a8_smoothquant.yaml",
+    "ASCEND_CUSTOM_PATH": os.path.expandvars("$ASCEND_HOME_PATH/../"),
+    "vLLM_MODEL_BACKEND": "MindFormers",
+    "MS_ENABLE_LCCL": "off",
+    "HCCL_OP_EXPANSION_MODE": "AIV",
+    "ASCEND_RT_VISIBLE_DEVICES": "0,1,2,3,4,5,6,7",
+    "MS_ALLOC_CONF": "enable_vmm:True",
+    "LCCL_DETERMINISTIC": "1",
+    "HCCL_DETERMINISTIC": "true",
+    "ATB_MATMUL_SHUFFLE_K_ENABLE": "0",
+    "ATB_LLM_LCOC_ENABLE": "0",
+    "VLLM_USE_V1": "0",
+}
+# set env
+env_manager.setup_ai_environment(env_vars)
+import vllm_mindspore
+from vllm import LLM, SamplingParams
+
+class TestDeepSeekMss:
+    '''
+    Test DeepSeek R1 W8A8 smoothquant with multi-step scheduler (mss).
+    '''
+    @pytest.mark.level0
+    @pytest.mark.platform_arm_ascend910b_training
+    @pytest.mark.env_single
+    @pytest.mark.skip(reason="gs master branch is not suited to the newest mindformers.")
+    def test_deepseek_r1_mss(self):
+        """
+        test case deepseek r1 w8a8 mss
+        """
+
+        # Sample prompts.
+        prompts = [
+            "介绍下北京故宫",
+        ]
+
+        # Create a sampling params object.
+        sampling_params = SamplingParams(temperature=0.0, max_tokens=10, top_k=1)
+
+        # Create an LLM.
+        llm = LLM(model="/home/workspace/mindspore_dataset/weight/DeepSeek-R1-W8A8-smoothquant-newconfig",
+                  trust_remote_code=True, gpu_memory_utilization=0.9, tensor_parallel_size=8, num_scheduler_steps=8)
+        # Generate texts from the prompts. The output is a list of RequestOutput objects
+        # that contain the prompt, generated text, and other information.
+        outputs = llm.generate(prompts, sampling_params)
+        # Print the outputs.
+ for i, output in enumerate(outputs): + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") + assert "博物院" in generated_text + + # unset env + env_manager.unset_all() \ No newline at end of file diff --git a/vllm_mindspore/model_executor/models/mf_models/deepseek_v3.py b/vllm_mindspore/model_executor/models/mf_models/deepseek_v3.py index 67b80339..497e35ef 100644 --- a/vllm_mindspore/model_executor/models/mf_models/deepseek_v3.py +++ b/vllm_mindspore/model_executor/models/mf_models/deepseek_v3.py @@ -257,25 +257,25 @@ class DeepseekV3ForCausalLM(MfModelBase): cfg = PTQConfig(mode=quant_mode, backend=BackendTarget.ASCEND, weight_quant_dtype=msdtype.int8, act_quant_dtype=msdtype.int8, outliers_suppression=OutliersSuppressionType.SMOOTH, opname_blacklist=['lm_head', 'lkv2kv']) - w2_config = PTQConfig(mode=quant_mode, backend=BackendTarget.ASCEND, weight_quant_dtype=msdtype.int8, - act_quant_dtype=msdtype.int8, - outliers_suppression=OutliersSuppressionType.NONE, - precision_recovery=PrecisionRecovery.NONE, - act_quant_granularity=QuantGranularity.PER_TOKEN, - weight_quant_granularity=QuantGranularity.PER_CHANNEL) - layer_policies = OrderedDict({r'.*\.w2.*': w2_config}) + ffn_config = PTQConfig(mode=quant_mode, backend=BackendTarget.ASCEND, weight_quant_dtype=msdtype.int8, + act_quant_dtype=msdtype.int8, + outliers_suppression=OutliersSuppressionType.NONE, + precision_recovery=PrecisionRecovery.NONE, + act_quant_granularity=QuantGranularity.PER_TOKEN, + weight_quant_granularity=QuantGranularity.PER_CHANNEL) + layer_policies = OrderedDict({r'.*\.feed_forward\..*': ffn_config}) elif quant_type.lower() == 'osl': cfg = PTQConfig(mode=quant_mode, backend=BackendTarget.ASCEND, weight_quant_dtype=msdtype.int8, act_quant_dtype=msdtype.int8, outliers_suppression=OutliersSuppressionType.OUTLIER_SUPPRESSION_LITE, opname_blacklist=['lm_head', 'lkv2kv']) - w2_config = PTQConfig(mode=quant_mode, backend=BackendTarget.ASCEND, weight_quant_dtype=msdtype.int8, - act_quant_dtype=msdtype.int8, - outliers_suppression=OutliersSuppressionType.NONE, - precision_recovery=PrecisionRecovery.NONE, - act_quant_granularity=QuantGranularity.PER_TOKEN, - weight_quant_granularity=QuantGranularity.PER_CHANNEL) - layer_policies = OrderedDict({r'.*\.w2.*': w2_config}) + ffn_config = PTQConfig(mode=quant_mode, backend=BackendTarget.ASCEND, weight_quant_dtype=msdtype.int8, + act_quant_dtype=msdtype.int8, + outliers_suppression=OutliersSuppressionType.NONE, + precision_recovery=PrecisionRecovery.NONE, + act_quant_granularity=QuantGranularity.PER_TOKEN, + weight_quant_granularity=QuantGranularity.PER_CHANNEL) + layer_policies = OrderedDict({r'.*\.feed_forward\..*': ffn_config}) elif quant_type.lower() == 'a16w8': cfg = PTQConfig(mode=quant_mode, backend=BackendTarget.ASCEND, weight_quant_dtype=msdtype.int8, opname_blacklist=['lm_head', 'lkv2kv']) @@ -292,14 +292,6 @@ class DeepseekV3ForCausalLM(MfModelBase): if 'awq' in quant_type.lower(): # pylint: disable=protected-access ptq._config.weight_symmetric = False - if 'smoothquant' in quant_type.lower(): - # pylint: disable=protected-access - ptq._config.aclnn_quant_list = ["routed_experts.ffn.w_gate_hidden", "routed_experts.ffn.w1", - "routed_experts.ffn.w3"] - if 'osl' in quant_type.lower(): - # pylint: disable=protected-access - ptq._config.aclnn_quant_list = ["routed_experts.ffn.w_gate_hidden", "routed_experts.ffn.w1", - "routed_experts.ffn.w3"] if 'gptq-pergroup' in quant_type.lower(): # pylint: 
disable=protected-access ptq.layer_policies[r'.*\.feed_forward\.w2.*'].aclnn_quant_list = ["w2"] diff --git a/vllm_mindspore/model_executor/models/mf_models/deepseekv3_weight_processor.py b/vllm_mindspore/model_executor/models/mf_models/deepseekv3_weight_processor.py index b89e699b..28ebe6d6 100644 --- a/vllm_mindspore/model_executor/models/mf_models/deepseekv3_weight_processor.py +++ b/vllm_mindspore/model_executor/models/mf_models/deepseekv3_weight_processor.py @@ -1188,41 +1188,23 @@ class DeepseekV3WeightProcessor(BaseWeightProcessor): ffn_concat = self.config.model.model_config.ffn_concat w1_weight_name = f"model.layers.{layer_id}.{layer_type}.w1._layer.weight" - w1_bias_name = f"model.layers.{layer_id}.{layer_type}.w1._layer.matmul.quant_bias" - w1_scale_name = f"model.layers.{layer_id}.{layer_type}.w1._layer.matmul.dequant_scale" - w1_quant_zp = f"model.layers.{layer_id}.{layer_type}.w1.quant_op.input_zp" - w1_quant_scale = f"model.layers.{layer_id}.{layer_type}.w1.quant_op.input_scale" + w1_scale_name = f"model.layers.{layer_id}.{layer_type}.w1._layer.matmul.weight_scale" w3_weight_name = f"model.layers.{layer_id}.{layer_type}.w3._layer.weight" - w3_bias_name = f"model.layers.{layer_id}.{layer_type}.w3._layer.matmul.quant_bias" - w3_scale_name = f"model.layers.{layer_id}.{layer_type}.w3._layer.matmul.dequant_scale" - w3_quant_zp = f"model.layers.{layer_id}.{layer_type}.w3.quant_op.input_zp" - w3_quant_scale = f"model.layers.{layer_id}.{layer_type}.w3.quant_op.input_scale" + w3_scale_name = f"model.layers.{layer_id}.{layer_type}.w3._layer.matmul.weight_scale" w2_weight_name = f"model.layers.{layer_id}.{layer_type}.w2._layer.weight" w2_scale_name = f"model.layers.{layer_id}.{layer_type}.w2._layer.matmul.weight_scale" w1_weight_param, _ = self.get_routed_safetensor_3_dim(w1_weight_name, src_hf_dir, hf_weight_map, tp_axis=2, split_ep=self.moe_split_ep, split_tp=self.moe_split_tp) - w1_bias_param, _ = self.get_routed_safetensor_2_dim(w1_bias_name, src_hf_dir, hf_weight_map, tp_axis=1, - split_ep=self.moe_split_ep, split_tp=self.moe_split_tp) - w1_scale_param, _ = self.get_routed_safetensor_2_dim(w1_scale_name, src_hf_dir, hf_weight_map, tp_axis=1, split_ep=self.moe_split_ep, split_tp=self.moe_split_tp) - w1_quant_zp_param, _ = self.get_safetensor_from_file(w1_quant_zp, src_hf_dir, hf_weight_map) - w1_quant_scale_param, _ = self.get_safetensor_from_file(w1_quant_scale, src_hf_dir, hf_weight_map) - w3_weight_param, _ = self.get_routed_safetensor_3_dim(w3_weight_name, src_hf_dir, hf_weight_map, tp_axis=2, split_ep=self.moe_split_ep, split_tp=self.moe_split_tp) - w3_bias_param, _ = self.get_routed_safetensor_2_dim(w3_bias_name, src_hf_dir, hf_weight_map, tp_axis=1, - split_ep=self.moe_split_ep, split_tp=self.moe_split_tp) - w3_scale_param, _ = self.get_routed_safetensor_2_dim(w3_scale_name, src_hf_dir, hf_weight_map, tp_axis=1, split_ep=self.moe_split_ep, split_tp=self.moe_split_tp) - w3_quant_zp_param, _ = self.get_safetensor_from_file(w3_quant_zp, src_hf_dir, hf_weight_map) - w3_quant_scale_param, _ = self.get_safetensor_from_file(w3_quant_scale, src_hf_dir, hf_weight_map) - w2_weight_param, _ = self.get_routed_safetensor_3_dim(w2_weight_name, src_hf_dir, hf_weight_map, tp_axis=1, split_ep=self.moe_split_ep, split_tp=self.moe_split_tp) w2_scale_param, _ = self.get_routed_safetensor_2_dim(w2_scale_name, src_hf_dir, hf_weight_map, @@ -1234,26 +1216,10 @@ class DeepseekV3WeightProcessor(BaseWeightProcessor): parameter_dict[concat_weight_name] = ms.Parameter(concat_weight_param, 
name=concat_weight_name, requires_grad=False) - concat_bias_name = f"model.layers.{layer_id}.{layer_type}.w_gate_hidden._layer.matmul.quant_bias" - concat_bias_param = ms.Tensor(np.concatenate([w1_bias_param, w3_bias_param], axis=1), dtype=ms.int32) - parameter_dict[concat_bias_name] = ms.Parameter(concat_bias_param, name=concat_bias_name, - requires_grad=False) - - concat_scale_name = f"model.layers.{layer_id}.{layer_type}.w_gate_hidden._layer.matmul.dequant_scale" + concat_scale_name = f"model.layers.{layer_id}.{layer_type}.w_gate_hidden._layer.matmul.weight_scale" concat_scale_param = ms.Tensor(np.concatenate([w1_scale_param, w3_scale_param], axis=1), dtype=ms.bfloat16) parameter_dict[concat_scale_name] = ms.Parameter(concat_scale_param, name=concat_scale_name, requires_grad=False) - - concat_quant_zp_name = f"model.layers.{layer_id}.{layer_type}.w_gate_hidden.quant_op.input_zp" - concat_quant_zp_param = ms.Tensor(w1_quant_zp_param, dtype=ms.bfloat16) - parameter_dict[concat_quant_zp_name] = ms.Parameter(concat_quant_zp_param, name=concat_quant_zp_name, - requires_grad=False) - - concat_quant_scale_name = f"model.layers.{layer_id}.{layer_type}.w_gate_hidden.quant_op.input_scale" - concat_quant_scale_param = ms.Tensor(w1_quant_scale_param, dtype=ms.bfloat16) - parameter_dict[concat_quant_scale_name] = ms.Parameter(concat_quant_scale_param, - name=concat_quant_scale_name, - requires_grad=False) else: # w1 w3 parameter_dict[w1_weight_name] = ms.Parameter(ms.Tensor(w1_weight_param, ms.int8), name=w1_weight_name, @@ -1261,41 +1227,24 @@ class DeepseekV3WeightProcessor(BaseWeightProcessor): parameter_dict[w3_weight_name] = ms.Parameter(ms.Tensor(w3_weight_param, ms.int8), name=w3_weight_name, requires_grad=False) - parameter_dict[w1_bias_name] = ms.Parameter(ms.Tensor(w1_bias_param, ms.int32), - name=w1_bias_name, requires_grad=False) - parameter_dict[w3_bias_name] = ms.Parameter(ms.Tensor(w3_bias_param, ms.int32), - name=w3_bias_name, requires_grad=False) - parameter_dict[w1_scale_name] = ms.Parameter(ms.Tensor(w1_scale_param, ms.bfloat16), name=w1_scale_name, requires_grad=False) parameter_dict[w3_scale_name] = ms.Parameter(ms.Tensor(w3_scale_param, ms.bfloat16), name=w3_scale_name, requires_grad=False) - parameter_dict[w1_quant_zp] = ms.Parameter(ms.Tensor(w1_quant_zp_param, ms.bfloat16), - name=w1_quant_zp, requires_grad=False) - parameter_dict[w3_quant_zp] = ms.Parameter(ms.Tensor(w3_quant_zp_param, ms.bfloat16), - name=w3_quant_zp, requires_grad=False) - - parameter_dict[w1_quant_scale] = ms.Parameter(ms.Tensor(w1_quant_scale_param, ms.bfloat16), - name=w1_quant_scale, requires_grad=False) - parameter_dict[w3_quant_scale] = ms.Parameter(ms.Tensor(w3_quant_scale_param, ms.bfloat16), - name=w3_quant_scale, requires_grad=False) - parameter_dict[w2_weight_name] = ms.Parameter(ms.Tensor(w2_weight_param, ms.int8), name=w2_weight_name, requires_grad=False) parameter_dict[w2_scale_name] = ms.Parameter(ms.Tensor(w2_scale_param, ms.bfloat16), name=w2_scale_name, requires_grad=False) - def get_smooth_quant_moe_shared_expert_weight(self, w1_weight_name, w1_bias_name, w1_scale_name, w3_weight_name, - w3_bias_name, w3_scale_name, w2_weight_name, src_hf_dir, - hf_weight_map): + def get_smooth_quant_moe_shared_expert_weight(self, w1_weight_name, w1_scale_name, w3_weight_name,w3_scale_name, + w2_weight_name, src_hf_dir, hf_weight_map): + '''get_smooth_quant_moe_shared_expert_weight''' if self.ep_method in [EPMethod.DEFAULT, EPMethod.ALLGATHER]: w1_weight_param, _ = 
self.get_safetensor_from_file_split_moe_tp_group(w1_weight_name, src_hf_dir, hf_weight_map, split_axis=0) - w1_bias_param, _ = self.get_safetensor_from_file_split_moe_tp_group(w1_bias_name, src_hf_dir, hf_weight_map, - split_axis=0) w1_scale_param, _ = self.get_safetensor_from_file_split_moe_tp_group(w1_scale_name, src_hf_dir, hf_weight_map, @@ -1304,8 +1253,6 @@ class DeepseekV3WeightProcessor(BaseWeightProcessor): w3_weight_param, _ = self.get_safetensor_from_file_split_moe_tp_group(w3_weight_name, src_hf_dir, hf_weight_map, split_axis=0) - w3_bias_param, _ = self.get_safetensor_from_file_split_moe_tp_group(w3_bias_name, src_hf_dir, hf_weight_map, - split_axis=0) w3_scale_param, _ = self.get_safetensor_from_file_split_moe_tp_group(w3_scale_name, src_hf_dir, hf_weight_map, split_axis=0) @@ -1315,18 +1262,16 @@ class DeepseekV3WeightProcessor(BaseWeightProcessor): split_axis=1) elif self.ep_method == EPMethod.ALLTOALL: w1_weight_param, _ = self.get_safetensor_from_file(w1_weight_name, src_hf_dir, hf_weight_map) - w1_bias_param, _ = self.get_safetensor_from_file(w1_bias_name, src_hf_dir, hf_weight_map) w1_scale_param, _ = self.get_safetensor_from_file(w1_scale_name, src_hf_dir, hf_weight_map) w3_weight_param, _ = self.get_safetensor_from_file(w3_weight_name, src_hf_dir, hf_weight_map) - w3_bias_param, _ = self.get_safetensor_from_file(w3_bias_name, src_hf_dir, hf_weight_map) w3_scale_param, _ = self.get_safetensor_from_file(w3_scale_name, src_hf_dir, hf_weight_map) w2_weight_param, _ = self.get_safetensor_from_file(w2_weight_name, src_hf_dir, hf_weight_map) else: raise ValueError("Unsupported ep_method:{}".format(self.ep_method)) - return w1_weight_param, w1_bias_param, w1_scale_param, w3_weight_param, w3_bias_param, w3_scale_param, w2_weight_param + return w1_weight_param, w1_scale_param, w3_weight_param, w3_scale_param, w2_weight_param def smooth_quant_process_shared_ffn_weight(self, src_hf_dir, layer_id, hf_weight_map, parameter_dict, layer_type): """smooth_quant_process_shared_ffn_weight""" @@ -1336,57 +1281,26 @@ class DeepseekV3WeightProcessor(BaseWeightProcessor): w2_weight_name = f"model.layers.{layer_id}.{layer_type}.w2._layer.weight" w3_weight_name = f"model.layers.{layer_id}.{layer_type}.w3._layer.weight" - w1_bias_name = f"model.layers.{layer_id}.{layer_type}.w1._layer.matmul.quant_bias" - w3_bias_name = f"model.layers.{layer_id}.{layer_type}.w3._layer.matmul.quant_bias" - - w1_scale_name = f"model.layers.{layer_id}.{layer_type}.w1._layer.matmul.dequant_scale" + w1_scale_name = f"model.layers.{layer_id}.{layer_type}.w1._layer.matmul.weight_scale" w2_scale_name = f"model.layers.{layer_id}.{layer_type}.w2._layer.matmul.weight_scale" - w3_scale_name = f"model.layers.{layer_id}.{layer_type}.w3._layer.matmul.dequant_scale" - - w1_quant_zp = f"model.layers.{layer_id}.{layer_type}.w1.quant_op.input_zp" - w3_quant_zp = f"model.layers.{layer_id}.{layer_type}.w3.quant_op.input_zp" - - w1_quant_scale = f"model.layers.{layer_id}.{layer_type}.w1.quant_op.input_scale" - w3_quant_scale = f"model.layers.{layer_id}.{layer_type}.w3.quant_op.input_scale" - - w1_weight_param, w1_bias_param, w1_scale_param, w3_weight_param, w3_bias_param, w3_scale_param, w2_weight_param = \ - self.get_smooth_quant_moe_shared_expert_weight(w1_weight_name, w1_bias_name, w1_scale_name, w3_weight_name, - w3_bias_name, w3_scale_name, w2_weight_name, src_hf_dir, - hf_weight_map) - - w1_quant_zp_param, _ = self.get_safetensor_from_file(w1_quant_zp, src_hf_dir, hf_weight_map) - w1_quant_scale_param, _ = 
self.get_safetensor_from_file(w1_quant_scale, src_hf_dir, hf_weight_map) + w3_scale_name = f"model.layers.{layer_id}.{layer_type}.w3._layer.matmul.weight_scale" + w1_weight_param, w1_scale_param, w3_weight_param, w3_scale_param, w2_weight_param = \ + self.get_smooth_quant_moe_shared_expert_weight(w1_weight_name, w1_scale_name, w3_weight_name, w3_scale_name, + w2_weight_name, src_hf_dir, hf_weight_map) w2_scale_param, _ = self.get_safetensor_from_file(w2_scale_name, src_hf_dir, hf_weight_map) - w3_quant_zp_param, _ = self.get_safetensor_from_file(w3_quant_zp, src_hf_dir, hf_weight_map) - w3_quant_scale_param, _ = self.get_safetensor_from_file(w3_quant_scale, src_hf_dir, hf_weight_map) if ffn_concat: concat_weight_name = f"model.layers.{layer_id}.{layer_type}.w_gate_hidden._layer.weight" concat_weight_param = ms.Tensor(np.concatenate([w1_weight_param, w3_weight_param], axis=0), dtype=ms.int8) parameter_dict[concat_weight_name] = ms.Parameter(concat_weight_param, name=concat_weight_name, requires_grad=False) - concat_bias_name = f"model.layers.{layer_id}.{layer_type}.w_gate_hidden._layer.matmul.quant_bias" - concat_bias_param = ms.Tensor(np.concatenate([w1_bias_param, w3_bias_param], axis=0), dtype=ms.int32) - parameter_dict[concat_bias_name] = ms.Parameter(concat_bias_param, name=concat_bias_name, - requires_grad=False) - - concat_scale_name = f"model.layers.{layer_id}.{layer_type}.w_gate_hidden._layer.matmul.dequant_scale" - concat_scale_param = ms.Tensor(np.concatenate([w1_scale_param, w3_scale_param], axis=0), dtype=ms.float32) + concat_scale_name = f"model.layers.{layer_id}.{layer_type}.w_gate_hidden._layer.matmul.weight_scale" + concat_scale_param = ms.Tensor(np.concatenate([w1_scale_param, w3_scale_param], axis=0), dtype=ms.bfloat16) parameter_dict[concat_scale_name] = ms.Parameter(concat_scale_param, name=concat_scale_name, requires_grad=False) - concat_quant_zp_name = f"model.layers.{layer_id}.{layer_type}.w_gate_hidden.quant_op.input_zp" - concat_quant_zp_param = ms.Tensor(w1_quant_zp_param, dtype=ms.int8) - parameter_dict[concat_quant_zp_name] = ms.Parameter(concat_quant_zp_param, name=concat_quant_zp_name, - requires_grad=False) - - concat_quant_scale_name = f"model.layers.{layer_id}.{layer_type}.w_gate_hidden.quant_op.input_scale" - concat_quant_scale_param = ms.Tensor(w1_quant_scale_param, dtype=ms.bfloat16) - parameter_dict[concat_quant_scale_name] = ms.Parameter(concat_quant_scale_param, - name=concat_quant_scale_name, - requires_grad=False) else: # w1 w3 parameter_dict[w1_weight_name] = ms.Parameter(ms.Tensor(w1_weight_param, ms.int8), name=w1_weight_name, @@ -1394,25 +1308,11 @@ class DeepseekV3WeightProcessor(BaseWeightProcessor): parameter_dict[w3_weight_name] = ms.Parameter(ms.Tensor(w3_weight_param, ms.int8), name=w3_weight_name, requires_grad=False) - parameter_dict[w1_bias_name] = ms.Parameter(ms.Tensor(w1_bias_param, ms.int32), - name=w1_bias_name, requires_grad=False) - parameter_dict[w3_bias_name] = ms.Parameter(ms.Tensor(w3_bias_param, ms.int32), - name=w3_bias_name, requires_grad=False) - - parameter_dict[w1_scale_name] = ms.Parameter(ms.Tensor(w1_scale_param, ms.float32), + parameter_dict[w1_scale_name] = ms.Parameter(ms.Tensor(w1_scale_param, ms.bfloat16), name=w1_scale_name, requires_grad=False) - parameter_dict[w3_scale_name] = ms.Parameter(ms.Tensor(w3_scale_param, ms.float32), + parameter_dict[w3_scale_name] = ms.Parameter(ms.Tensor(w3_scale_param, ms.bfloat16), name=w3_scale_name, requires_grad=False) - parameter_dict[w1_quant_zp] = 
ms.Parameter(ms.Tensor(w1_quant_zp_param, ms.int8), - name=w1_quant_zp, requires_grad=False) - parameter_dict[w3_quant_zp] = ms.Parameter(ms.Tensor(w3_quant_zp_param, ms.int8), - name=w3_quant_zp, requires_grad=False) - - parameter_dict[w1_quant_scale] = ms.Parameter(ms.Tensor(w1_quant_scale_param, ms.bfloat16), - name=w1_quant_scale, requires_grad=False) - parameter_dict[w3_quant_scale] = ms.Parameter(ms.Tensor(w3_quant_scale_param, ms.bfloat16), - name=w3_quant_scale, requires_grad=False) parameter_dict[w2_weight_name] = ms.Parameter(ms.Tensor(w2_weight_param, ms.int8), name=w2_weight_name, requires_grad=False) parameter_dict[w2_scale_name] = ms.Parameter(ms.Tensor(w2_scale_param, ms.bfloat16), @@ -1426,26 +1326,15 @@ class DeepseekV3WeightProcessor(BaseWeightProcessor): w1_weight_param, _ = self.get_safetensor_from_file_split_tp_group(w1_weight_name, src_hf_dir, hf_weight_map, split_axis=0) - w1_bias_name = f"model.layers.{layer_id}.{layer_type}.w1._layer.matmul.quant_bias" - w1_bias_param, _ = self.get_safetensor_from_file_split_tp_group(w1_bias_name, src_hf_dir, hf_weight_map, - split_axis=0) - w1_scale_name = f"model.layers.{layer_id}.{layer_type}.w1._layer.matmul.dequant_scale" + w1_scale_name = f"model.layers.{layer_id}.{layer_type}.w1._layer.matmul.weight_scale" w1_scale_param, _ = self.get_safetensor_from_file_split_tp_group(w1_scale_name, src_hf_dir, hf_weight_map, split_axis=0) - w1_quant_zp = f"model.layers.{layer_id}.{layer_type}.w1.quant_op.input_zp" - w1_quant_scale = f"model.layers.{layer_id}.{layer_type}.w1.quant_op.input_scale" - w1_quant_zp_param, _ = self.get_safetensor_from_file(w1_quant_zp, src_hf_dir, hf_weight_map) - w1_quant_scale_param, _ = self.get_safetensor_from_file(w1_quant_scale, src_hf_dir, hf_weight_map) - w3_weight_name = f"model.layers.{layer_id}.{layer_type}.w3._layer.weight" w3_weight_param, _ = self.get_safetensor_from_file_split_tp_group(w3_weight_name, src_hf_dir, hf_weight_map, split_axis=0) - w3_bias_name = f"model.layers.{layer_id}.{layer_type}.w3._layer.matmul.quant_bias" - w3_bias_param, _ = self.get_safetensor_from_file_split_tp_group(w3_bias_name, src_hf_dir, hf_weight_map, - split_axis=0) - w3_scale_name = f"model.layers.{layer_id}.{layer_type}.w3._layer.matmul.dequant_scale" + w3_scale_name = f"model.layers.{layer_id}.{layer_type}.w3._layer.matmul.weight_scale" w3_scale_param, _ = self.get_safetensor_from_file_split_tp_group(w3_scale_name, src_hf_dir, hf_weight_map, split_axis=0) w2_weight_name = f"model.layers.{layer_id}.{layer_type}.w2._layer.weight" @@ -1454,36 +1343,16 @@ class DeepseekV3WeightProcessor(BaseWeightProcessor): split_axis=1) w2_scale_param, _ = self.get_safetensor_from_file(w2_scale_name, src_hf_dir, hf_weight_map) - w3_quant_zp = f"model.layers.{layer_id}.{layer_type}.w3.quant_op.input_zp" - w3_quant_scale = f"model.layers.{layer_id}.{layer_type}.w3.quant_op.input_scale" - w3_quant_zp_param, _ = self.get_safetensor_from_file(w3_quant_zp, src_hf_dir, hf_weight_map) - w3_quant_scale_param, _ = self.get_safetensor_from_file(w3_quant_scale, src_hf_dir, hf_weight_map) if ffn_concat: concat_weight_name = f"model.layers.{layer_id}.{layer_type}.w_gate_hidden._layer.weight" concat_weight_param = ms.Tensor(np.concatenate([w1_weight_param, w3_weight_param], axis=0), dtype=ms.int8) parameter_dict[concat_weight_name] = ms.Parameter(concat_weight_param, name=concat_weight_name, requires_grad=False) - concat_bias_name = f"model.layers.{layer_id}.{layer_type}.w_gate_hidden._layer.matmul.quant_bias" - concat_bias_param = 
ms.Tensor(np.concatenate([w1_bias_param, w3_bias_param], axis=0), dtype=ms.int32) - parameter_dict[concat_bias_name] = ms.Parameter(concat_bias_param, name=concat_bias_name, - requires_grad=False) - - concat_scale_name = f"model.layers.{layer_id}.{layer_type}.w_gate_hidden._layer.matmul.dequant_scale" - concat_scale_param = ms.Tensor(np.concatenate([w1_scale_param, w3_scale_param], axis=0), dtype=ms.float32) + concat_scale_name = f"model.layers.{layer_id}.{layer_type}.w_gate_hidden._layer.matmul.weight_scale" + concat_scale_param = ms.Tensor(np.concatenate([w1_scale_param, w3_scale_param], axis=0), dtype=ms.bfloat16) parameter_dict[concat_scale_name] = ms.Parameter(concat_scale_param, name=concat_scale_name, requires_grad=False) - - concat_quant_zp_name = f"model.layers.{layer_id}.{layer_type}.w_gate_hidden.quant_op.input_zp" - concat_quant_zp_param = ms.Tensor(w1_quant_zp_param, dtype=ms.int8) - parameter_dict[concat_quant_zp_name] = ms.Parameter(concat_quant_zp_param, name=concat_quant_zp_name, - requires_grad=False) - - concat_quant_scale_name = f"model.layers.{layer_id}.{layer_type}.w_gate_hidden.quant_op.input_scale" - concat_quant_scale_param = ms.Tensor(w1_quant_scale_param, dtype=ms.bfloat16) - parameter_dict[concat_quant_scale_name] = ms.Parameter(concat_quant_scale_param, - name=concat_quant_scale_name, - requires_grad=False) else: # w1 w3 parameter_dict[w1_weight_name] = ms.Parameter(ms.Tensor(w1_weight_param, ms.int8), name=w1_weight_name, @@ -1491,26 +1360,11 @@ class DeepseekV3WeightProcessor(BaseWeightProcessor): parameter_dict[w3_weight_name] = ms.Parameter(ms.Tensor(w3_weight_param, ms.int8), name=w3_weight_name, requires_grad=False) - parameter_dict[w1_bias_name] = ms.Parameter(ms.Tensor(w1_bias_param, ms.int32), - name=w1_bias_name, requires_grad=False) - parameter_dict[w3_bias_name] = ms.Parameter(ms.Tensor(w3_bias_param, ms.int32), - name=w3_bias_name, requires_grad=False) - - parameter_dict[w1_scale_name] = ms.Parameter(ms.Tensor(w1_scale_param, ms.float32), + parameter_dict[w1_scale_name] = ms.Parameter(ms.Tensor(w1_scale_param, ms.bfloat16), name=w1_scale_name, requires_grad=False) - parameter_dict[w3_scale_name] = ms.Parameter(ms.Tensor(w3_scale_param, ms.float32), + parameter_dict[w3_scale_name] = ms.Parameter(ms.Tensor(w3_scale_param, ms.bfloat16), name=w3_scale_name, requires_grad=False) - parameter_dict[w1_quant_zp] = ms.Parameter(ms.Tensor(w1_quant_zp_param, ms.int8), - name=w1_quant_zp, requires_grad=False) - parameter_dict[w3_quant_zp] = ms.Parameter(ms.Tensor(w3_quant_zp_param, ms.int8), - name=w3_quant_zp, requires_grad=False) - - parameter_dict[w1_quant_scale] = ms.Parameter(ms.Tensor(w1_quant_scale_param, ms.bfloat16), - name=w1_quant_scale, requires_grad=False) - parameter_dict[w3_quant_scale] = ms.Parameter(ms.Tensor(w3_quant_scale_param, ms.bfloat16), - name=w3_quant_scale, requires_grad=False) - parameter_dict[w2_weight_name] = ms.Parameter(ms.Tensor(w2_weight_param, ms.int8), name=w2_weight_name, requires_grad=False) parameter_dict[w2_scale_name] = ms.Parameter(ms.Tensor(w2_scale_param, ms.bfloat16), -- Gitee From db69fe94d804d9332eb2ea0382a76d50fee9cc09 Mon Sep 17 00:00:00 2001 From: lvhaoyu Date: Fri, 25 Apr 2025 12:10:24 +0800 Subject: [PATCH 40/55] Improve performence --- vllm_mindspore/__init__.py | 13 +++++- vllm_mindspore/model_executor/layers/utils.py | 23 ++++++---- .../models/mf_models/mf_model_base.py | 4 +- vllm_mindspore/platforms/ascend.py | 5 +-- .../v1/attention/backends/flash_attn.py | 11 +---- .../v1/sample/ops/topk_topp_sampler.py 
| 44 ++++++++++++++++++- vllm_mindspore/v1/utils.py | 18 ++++++-- vllm_mindspore/v1/worker/block_table.py | 2 - vllm_mindspore/v1/worker/gpu_input_batch.py | 22 ++++------ 9 files changed, 98 insertions(+), 44 deletions(-) diff --git a/vllm_mindspore/__init__.py b/vllm_mindspore/__init__.py index 224ae26e..98970a5f 100644 --- a/vllm_mindspore/__init__.py +++ b/vllm_mindspore/__init__.py @@ -291,12 +291,21 @@ vllm.v1.worker.gpu_input_batch.copy_slice = copy_slice from vllm_mindspore.v1.sample.ops.penalties import _convert_to_tensors import vllm.v1.sample.ops.penalties vllm.v1.sample.ops.penalties._convert_to_tensors = _convert_to_tensors +import vllm.model_executor.layers.utils +from vllm_mindspore.model_executor.layers.utils import apply_penalties +vllm.model_executor.layers.utils.apply_penalties = apply_penalties +vllm.v1.sample.ops.penalties.apply_penalties = apply_penalties + + +from vllm_mindspore.v1.sample.ops.topk_topp_sampler import apply_top_k_top_p, random_sample, \ + apply_top_k_only, topk_topp_sampler_forward_native -from vllm_mindspore.v1.sample.ops.topk_topp_sampler import apply_top_k_top_p, random_sample import vllm.v1.sample.ops.topk_topp_sampler +from vllm.v1.sample.ops.topk_topp_sampler import TopKTopPSampler +TopKTopPSampler.forward_native = topk_topp_sampler_forward_native vllm.v1.sample.ops.topk_topp_sampler.apply_top_k_top_p = apply_top_k_top_p vllm.v1.sample.ops.topk_topp_sampler.random_sample = random_sample - +vllm.v1.sample.ops.topk_topp_sampler.apply_top_k_only = apply_top_k_only from vllm_mindspore.v1.sample.sampler import apply_temperature import vllm.v1.sample.sampler vllm.v1.sample.sampler.Sampler.apply_temperature = apply_temperature diff --git a/vllm_mindspore/model_executor/layers/utils.py b/vllm_mindspore/model_executor/layers/utils.py index 0edf165c..3a95175a 100644 --- a/vllm_mindspore/model_executor/layers/utils.py +++ b/vllm_mindspore/model_executor/layers/utils.py @@ -18,6 +18,7 @@ """Utility methods for model layers.""" from typing import Tuple import torch +import mindspore as ms def get_token_bin_counts_and_mask( tokens: torch.Tensor, @@ -35,13 +36,14 @@ def get_token_bin_counts_and_mask( return bin_counts, mask + def apply_penalties(logits: torch.Tensor, prompt_tokens_tensor: torch.Tensor, output_tokens_tensor: torch.Tensor, presence_penalties: torch.Tensor, frequency_penalties: torch.Tensor, repetition_penalties: torch.Tensor) -> torch.Tensor: """ - Applies penalties in place to the logits tensor + Applies penalties out of place implement to imporve performance. logits : The input logits tensor of shape [num_seqs, vocab_size] prompt_tokens_tensor: A tensor containing the prompt tokens. 
The prompts are padded to the maximum prompt length within the batch using @@ -60,13 +62,18 @@ def apply_penalties(logits: torch.Tensor, prompt_tokens_tensor: torch.Tensor, vocab_size, num_seqs) output_bin_counts, output_mask = get_token_bin_counts_and_mask( output_tokens_tensor, vocab_size, num_seqs) - # repetition_penalties = repetition_penalties.unsqueeze_(dim=1).repeat( - # 1, vocab_size) - repetition_penalties = repetition_penalties.unsqueeze(dim=1).repeat(1, vocab_size) - logits[logits > 0] /= torch.where(prompt_mask | output_mask, - repetition_penalties, 1.0)[logits > 0] - logits[logits <= 0] *= torch.where(prompt_mask | output_mask, - repetition_penalties, 1.0)[logits <= 0] + + # use 'broadcast_to' to replace 'tensor.repeat' to imporve performance + # when tensor shape is (num,seqs, 1), then 'tensor.repeat(1, vocab_size)' + # is equal to 'broadcast_to(tensor, (num_seqs, vocab_size))' + repetition_penalties = ms.mint.broadcast_to(repetition_penalties.unsqueeze(dim=1), + (num_seqs, vocab_size)) + + # use out of place computation instead of inplace setitem to improve performance + # 'tensor[tensor > 0]' will result in setitem, which is slow. + mask = prompt_mask | output_mask + logits = torch.where(mask & (logits > 0), logits / repetition_penalties, logits) + logits = torch.where(mask & (logits <= 0), logits * repetition_penalties, logits) # We follow the definition in OpenAI API. # Refer to https://platform.openai.com/docs/api-reference/parameter-details logits -= frequency_penalties.unsqueeze(dim=1) * output_bin_counts diff --git a/vllm_mindspore/model_executor/models/mf_models/mf_model_base.py b/vllm_mindspore/model_executor/models/mf_models/mf_model_base.py index 2af6103d..ecc764be 100644 --- a/vllm_mindspore/model_executor/models/mf_models/mf_model_base.py +++ b/vllm_mindspore/model_executor/models/mf_models/mf_model_base.py @@ -266,10 +266,10 @@ class MfModelBase(MsModelBase): else: hidden_states = hidden_states.index_select(0, selected_token_indices) logits = self.lm_head(hidden_states) - logits = logits.reshape(-1, logits.shape[-1]) + logits = logits.view(-1, logits.shape[-1]) else: logits = self.lm_head(hidden_states) - logits = logits.reshape(-1, logits.shape[-1]) + logits = logits.view(-1, logits.shape[-1]) return logits def sample( diff --git a/vllm_mindspore/platforms/ascend.py b/vllm_mindspore/platforms/ascend.py index dddce58d..356a33a0 100644 --- a/vllm_mindspore/platforms/ascend.py +++ b/vllm_mindspore/platforms/ascend.py @@ -99,9 +99,8 @@ class AscendPlatform(Platform): if cache_config and cache_config.block_size is None: cache_config.block_size = 16 - - # if envs.VLLM_USE_V1: - # vllm_config.model_config.enforce_eager = True + model_config = vllm_config.model_config + model_config.disable_cascade_attn = True @classmethod def get_attn_backend_cls(cls, selected_backend, head_size, dtype, kv_cache_dtype, block_size, use_v1, use_mla): diff --git a/vllm_mindspore/v1/attention/backends/flash_attn.py b/vllm_mindspore/v1/attention/backends/flash_attn.py index 77f6d726..b5c5629e 100644 --- a/vllm_mindspore/v1/attention/backends/flash_attn.py +++ b/vllm_mindspore/v1/attention/backends/flash_attn.py @@ -122,10 +122,6 @@ class FlashAttentionMetadata: def __getitem__(self, key): if key == "batch_valid_length": key = "seq_lens" - if key == "block_tables": - if getattr(self, key).ndim == 1: - return mutable(getattr(self, key).expand_dims(0)) - return mutable(getattr(self, key)) return getattr(self, key) @@ -206,21 +202,18 @@ class FlashAttentionMetadataBuilder: def build(self, 
num_reqs: int, num_actual_tokens: int, max_query_len: int, common_prefix_len: int): + # do not manually call 'tensor.move_to("Ascend", blocking=False)' here, + # because it will cause a certain amount of host time. query_start_loc = ms.from_numpy(self.runner.query_start_loc_np[:num_reqs + 1]) - query_start_loc.move_to("Ascend", blocking=False) max_context_lens = self.runner.input_batch.num_computed_tokens_cpu[:num_reqs].max() slot_mapping = ms.from_numpy(self.runner.slot_mapping_np[:num_actual_tokens]) - slot_mapping.move_to("Ascend", blocking=False) seq_lens_np = self.runner.seq_lens_np[:num_reqs] max_seq_len = seq_lens_np.max() seq_lens = ms.from_numpy(seq_lens_np) - seq_lens.move_to("Ascend", blocking=False) context_lens = ms.from_numpy(self.runner.input_batch.num_computed_tokens_cpu[:num_reqs]) - context_lens.move_to("Ascend", blocking=False) q_seq_lens_np = np.diff(self.runner.query_start_loc_np[:num_reqs + 1]) q_seq_lens = ms.from_numpy(q_seq_lens_np) - q_seq_lens.move_to("Ascend", blocking=False) attn_metadata = FlashAttentionMetadata( seq_lens=seq_lens, diff --git a/vllm_mindspore/v1/sample/ops/topk_topp_sampler.py b/vllm_mindspore/v1/sample/ops/topk_topp_sampler.py index 8b0835c0..cbd218a2 100644 --- a/vllm_mindspore/v1/sample/ops/topk_topp_sampler.py +++ b/vllm_mindspore/v1/sample/ops/topk_topp_sampler.py @@ -1,6 +1,31 @@ from typing import Optional import torch -from vllm.v1.sample.ops.topk_topp_sampler import apply_top_k_only +from mindspore import mint + + +def apply_top_k_top_p_ms(logits, k, p): + """ + Apply top-k and top-p masks to the logits for high performance. + which is reference from 'apply_top_k_top_p_tpu' in vllm. + """ + if k is not None: + # use `apply_top_k_only` defined in this file. + logits = apply_top_k_only(logits, k) + + if p is not None: + probs = logits.softmax(dim=-1) + probs_sort, _ = mint.sort(probs, dim=-1, descending=False) + cumprob = mint.cumsum(probs_sort, dim=-1) + top_p_mask = cumprob <= 1 - p.unsqueeze(dim=1) + top_p_mask[:, -1] = False # at least one + + top_p_count = top_p_mask.sum(dim=-1).unsqueeze(1) + top_p_cutoff = probs_sort.gather(-1, top_p_count) + elements_to_discard = probs < top_p_cutoff + logits.masked_fill_(elements_to_discard, -float("inf")) + + return logits + def random_sample( probs: torch.Tensor, @@ -29,6 +54,18 @@ def random_sample( return probs.argmax(dim=-1).view(-1) +def topk_topp_sampler_forward_native( + self, + logits: torch.Tensor, + generators: dict[int, torch.Generator], + k: Optional[torch.Tensor], + p: Optional[torch.Tensor], +) -> torch.Tensor: + logits = apply_top_k_top_p_ms(logits, k, p) + probs = logits.softmax(dim=-1, dtype=torch.float32) + return random_sample(probs, generators) + + def apply_top_k_top_p( logits: torch.Tensor, k: Optional[torch.Tensor], @@ -87,11 +124,14 @@ def apply_top_k_only( # Set non-top-k rows to 1 so that we can gather. k = k.masked_fill(no_top_k_mask, 1) max_top_k = k.max() - int_max_top_k = max_top_k.item() # topk.values tensor has shape [batch_size, max_top_k]. # Convert top k to 0-based index in range [0, max_top_k). k_index = k.sub_(1).unsqueeze(1).expand(logits.shape[0], 1) + # tensor.item() will cause GPU-CPU Sync, so place as later as possible. + # can be deleted after logits.topk() support tensor-type input. + int_max_top_k = max_top_k.item() + top_k_mask = logits.topk(int_max_top_k, dim=1)[0].gather(1, k_index.long()) # Handle non-topk rows. 
top_k_mask.masked_fill_(no_top_k_mask.unsqueeze(1), -float("inf")) diff --git a/vllm_mindspore/v1/utils.py b/vllm_mindspore/v1/utils.py index c13c292d..daa6718c 100644 --- a/vllm_mindspore/v1/utils.py +++ b/vllm_mindspore/v1/utils.py @@ -1,7 +1,19 @@ +import numpy as np import torch +import mindspore as ms + +def _copy_slice_from_np(from_np: np.ndarray, to_tensor: torch.Tensor, + length: int) -> None: + """ + Copy the first length elements of a numpy array into a tensor in a + non-blocking manner. + """ + to_tensor[:length] = ms.from_numpy(from_np[:length]) + return to_tensor + def copy_slice(from_tensor: torch.Tensor, to_tensor: torch.Tensor, - length: int) -> None: + length: int, *, return_tensor=True) -> None: """ Copy the first length elements of a tensor into another tensor in a non-blocking manner. @@ -9,5 +21,5 @@ def copy_slice(from_tensor: torch.Tensor, to_tensor: torch.Tensor, Used to copy pinned CPU tensor data to pre-allocated GPU tensors. """ to_tensor[:length] = from_tensor[:length] - return to_tensor - + if return_tensor: + return to_tensor[:length] diff --git a/vllm_mindspore/v1/worker/block_table.py b/vllm_mindspore/v1/worker/block_table.py index b865bae3..d4563445 100644 --- a/vllm_mindspore/v1/worker/block_table.py +++ b/vllm_mindspore/v1/worker/block_table.py @@ -69,8 +69,6 @@ class BlockTable: self.block_table_np[[src, tgt]] = self.block_table_np[[tgt, src]] def commit(self, num_reqs: int) -> None: - self.block_table_cpu[:num_reqs] = torch.from_numpy(self.block_table_np[:num_reqs]) - # self.block_table[:num_reqs] = self.block_table_cpu[:num_reqs] self.block_table[:num_reqs] = torch.from_numpy(self.block_table_np[:num_reqs]) def clear(self) -> None: diff --git a/vllm_mindspore/v1/worker/gpu_input_batch.py b/vllm_mindspore/v1/worker/gpu_input_batch.py index 52c13344..a1a19156 100644 --- a/vllm_mindspore/v1/worker/gpu_input_batch.py +++ b/vllm_mindspore/v1/worker/gpu_input_batch.py @@ -3,11 +3,8 @@ from typing import Dict, List, Optional, Set, Tuple, cast import numpy as np import torch -from vllm.lora.request import LoRARequest -from vllm.sampling_params import SamplingType from vllm.v1.sample.metadata import SamplingMetadata -from vllm_mindspore.v1.utils import copy_slice -from vllm.v1.worker.block_table import BlockTable +from vllm_mindspore.v1.utils import _copy_slice_from_np, copy_slice _SAMPLING_EPS = 1e-5 @@ -15,26 +12,26 @@ _SAMPLING_EPS = 1e-5 def _make_sampling_metadata(self) -> SamplingMetadata: num_reqs = self.num_reqs if not self.all_greedy: - temperature = copy_slice(torch.from_numpy(self.temperature_cpu), self.temperature, num_reqs) + temperature = _copy_slice_from_np(self.temperature_cpu, self.temperature, num_reqs) temperature = temperature[:num_reqs] else: temperature = None if not self.no_top_p: - copy_slice(torch.from_numpy(self.top_p_cpu), self.top_p, num_reqs) + _copy_slice_from_np(self.top_p_cpu, self.top_p, num_reqs) if not self.no_top_k: - copy_slice(torch.from_numpy(self.top_k_cpu), self.top_k, num_reqs) + _copy_slice_from_np(self.top_k_cpu, self.top_k, num_reqs) if not self.no_min_p: - copy_slice(torch.from_numpy(self.min_p_cpu), self.min_p, num_reqs) + _copy_slice_from_np(self.min_p_cpu, self.min_p, num_reqs) if not self.no_penalties: # Since syncing these tensors is expensive only copy them # if necessary i.e. if there are requests which require # penalties to be applied during sampling. 
- copy_slice(torch.from_numpy(self.frequency_penalties_cpu), + _copy_slice_from_np(self.frequency_penalties_cpu, self.frequency_penalties, num_reqs) - copy_slice(torch.from_numpy(self.presence_penalties_cpu), + _copy_slice_from_np(self.presence_penalties_cpu, self.presence_penalties, num_reqs) - copy_slice(torch.from_numpy(self.repetition_penalties_cpu), + _copy_slice_from_np(self.repetition_penalties_cpu, self.repetition_penalties, num_reqs) # The prompt tokens are used only for applying penalties during @@ -48,7 +45,7 @@ def _make_sampling_metadata(self) -> SamplingMetadata: if not self.no_allowed_token_ids: assert self.allowed_token_ids_mask is not None copy_slice(self.allowed_token_ids_mask_cpu_tensor, - self.allowed_token_ids_mask, num_reqs) + self.allowed_token_ids_mask, num_reqs, return_tensor=False) allowed_token_ids_mask = self.allowed_token_ids_mask[:num_reqs] return SamplingMetadata( @@ -81,6 +78,5 @@ def _make_prompt_token_ids_tensor(self) -> torch.Tensor: for i in range(self.num_reqs): prompt_token_ids[i, self.num_prompt_tokens[i]:] = self.vocab_size prompt_token_ids_cpu_tensor = torch.from_numpy(prompt_token_ids) - prompt_token_ids_cpu_tensor.move_to("Ascend", blocking=False) return prompt_token_ids_cpu_tensor -- Gitee From 227411f6f1cb728dfd07fcf05a8ccecc1132f072 Mon Sep 17 00:00:00 2001 From: wusimin Date: Tue, 20 May 2025 17:55:19 +0800 Subject: [PATCH 41/55] =?UTF-8?q?[0.8.3=20v1]=E9=80=82=E9=85=8DV1=E5=8E=9F?= =?UTF-8?q?=E7=94=9FQwen?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- tests/st/python/test_vllm_qwen_7b_v1.py | 78 ++++++++++++++ vllm_mindspore/attention/ops/paged_attn.py | 6 +- .../model_executor/layers/linear.py | 22 +++- .../model_executor/layers/logits_processor.py | 17 ++- .../layers/vocab_parallel_embedding.py | 2 +- .../model_loader/weight_utils.py | 8 +- .../model_executor/models/model_base.py | 4 +- vllm_mindspore/model_executor/models/qwen2.py | 101 +++++++++++++----- 8 files changed, 186 insertions(+), 52 deletions(-) create mode 100644 tests/st/python/test_vllm_qwen_7b_v1.py diff --git a/tests/st/python/test_vllm_qwen_7b_v1.py b/tests/st/python/test_vllm_qwen_7b_v1.py new file mode 100644 index 00000000..cdc37bab --- /dev/null +++ b/tests/st/python/test_vllm_qwen_7b_v1.py @@ -0,0 +1,78 @@ +#!/usr/bin/env python3 +# encoding: utf-8 +# Copyright 2025 Huawei Technologies Co., Ltd +# Copyright 2024 The vLLM team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://wwww.apache.org/licenses/LICENSE-2.0 +# +# Unless required by application law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +"""test vllm qwen.""" +import pytest +import os +from . 
import set_env +env_manager = set_env.EnvVarManager() +# def env +env_vars = { + "ASCEND_CUSTOM_PATH": os.path.expandvars("$ASCEND_HOME_PATH/../"), + "MS_ENABLE_LCCL": "off", + "HCCL_OP_EXPANSION_MODE": "AIV", + "ASCEND_RT_VISIBLE_DEVICES": "0,1", + "MS_ALLOC_CONF": "enable_vmm:True", + "LCCL_DETERMINISTIC": "1", + "HCCL_DETERMINISTIC": "true", + "ATB_MATMUL_SHUFFLE_K_ENABLE": "0", + "ATB_LLM_LCOC_ENABLE": "0", + "VLLM_USE_V1": "1", +} +# set env +env_manager.setup_ai_environment(env_vars) +import vllm_mindspore +from vllm import LLM, SamplingParams + + +class TestQwen: + """ + Test Qwen. + """ + @pytest.mark.level0 + @pytest.mark.platform_arm_ascend910b_training + @pytest.mark.env_single + @pytest.mark.skip(reason="qwen need to be supported on new MS package") + def test_vllm_qwen(self): + """ + test case qwen2.5 7B + """ + + # Sample prompts. + prompts = [ + "You are a helpful assistant.<|User|>将文本分类为中性、负面或正面。 \n文本:我认为这次假期还可以。 \n情感:<|Assistant|>\n", + ] + + # Create a sampling params object. + sampling_params = SamplingParams(temperature=0.0, max_tokens=10, top_k=1) + + # Create an LLM. + llm = LLM(model="/home/workspace/mindspore_dataset/weight/Qwen2.5-7B-Instruct", + gpu_memory_utilization=0.9, tensor_parallel_size=2) + # Generate texts from the prompts. The output is a list of RequestOutput objects + # that contain the prompt, generated text, and other information. + outputs = llm.generate(prompts, sampling_params) + except_list=['中性<|Assistant|> 这句话'] + # Print the outputs. + for i, output in enumerate(outputs): + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") + assert generated_text == except_list[i] + + # unset env + env_manager.unset_all() diff --git a/vllm_mindspore/attention/ops/paged_attn.py b/vllm_mindspore/attention/ops/paged_attn.py index df9394c7..0d8a70c7 100644 --- a/vllm_mindspore/attention/ops/paged_attn.py +++ b/vllm_mindspore/attention/ops/paged_attn.py @@ -221,7 +221,6 @@ class PagedAttention: block_tables: torch.Tensor, query_start_loc: torch.Tensor, seq_lens_tensor: torch.Tensor, - context_lens: torch.Tensor, max_query_len: int, alibi_slopes: Optional[torch.Tensor], sliding_window: Optional[int], @@ -229,6 +228,7 @@ class PagedAttention: v_scale: float, ) -> torch.Tensor: output = torch.empty_like(query) + max_seq_len = None context_attention_fwd( query, key, @@ -239,9 +239,9 @@ class PagedAttention: value_cache, block_tables, # query_start_loc is (batch_size + 1,) - query_start_loc[:-1], + query_start_loc, seq_lens_tensor, - context_lens, + max_seq_len, max_query_len, k_scale, v_scale, diff --git a/vllm_mindspore/model_executor/layers/linear.py b/vllm_mindspore/model_executor/layers/linear.py index 45aa4c43..572f0e34 100644 --- a/vllm_mindspore/model_executor/layers/linear.py +++ b/vllm_mindspore/model_executor/layers/linear.py @@ -160,6 +160,8 @@ class LinearBase(ms.nn.Cell): params_dtype=None, quant_config: Optional[QuantizationConfig] = None, prefix: str = "", + *, + return_bias: bool = True, ): super().__init__() @@ -175,6 +177,7 @@ class LinearBase(ms.nn.Cell): self.quant_method: Optional[QuantizeMethodBase] = UnquantizedLinearMethod() else: self.quant_method = quant_config.get_quant_method(self, prefix=prefix) + self.return_bias = return_bias def construct(self, x: ms.Tensor) -> ms.Tensor: raise NotImplementedError @@ -195,9 +198,11 @@ class ColumnParallelLinear(LinearBase): quant_config: Optional[QuantizationConfig] = None, output_sizes: Optional[List[int]] = None, prefix: 
str = "", + *, + return_bias: bool = True, ): super().__init__( - input_size, output_size, skip_bias_add, params_dtype, quant_config, prefix + input_size, output_size, skip_bias_add, params_dtype, quant_config, prefix, return_bias=return_bias ) self.gather_output = gather_output @@ -256,6 +261,8 @@ class ColumnParallelLinear(LinearBase): else: output = output_parallel output_bias = self.bias if self.skip_bias_add else None + if not self.return_bias: + return output return output, output_bias def weight_loader(self, param, loaded_weight): @@ -326,6 +333,8 @@ class MergedColumnParallelLinear(ColumnParallelLinear): params_dtype=None, quant_config: Optional[QuantizationConfig] = None, prefix: str = "", + *, + return_bias: bool = True ): self.output_sizes = output_sizes tp_size = get_tensor_model_parallel_world_size() @@ -339,6 +348,7 @@ class MergedColumnParallelLinear(ColumnParallelLinear): params_dtype=params_dtype, quant_config=quant_config, prefix=prefix, + return_bias=return_bias ) def weight_loader( @@ -396,6 +406,8 @@ class QKVParallelLinear(ColumnParallelLinear): params_dtype=None, quant_config: Optional[QuantizationConfig] = None, prefix: str = "", + *, + return_bias: bool = True, ): self.hidden_size = hidden_size self.head_size = head_size @@ -431,6 +443,7 @@ class QKVParallelLinear(ColumnParallelLinear): params_dtype=params_dtype, quant_config=quant_config, prefix=prefix, + return_bias=return_bias ) def weight_loader(self, param, loaded_weight, loaded_shard_id): @@ -494,9 +507,11 @@ class RowParallelLinear(LinearBase): reduce_results: bool = True, quant_config: Optional[QuantizationConfig] = None, prefix: str = "", + *, + return_bias: bool = True, ): super().__init__( - input_size, output_size, skip_bias_add, params_dtype, quant_config, prefix + input_size, output_size, skip_bias_add, params_dtype, quant_config, prefix, return_bias=return_bias ) # Divide the weight matrix along the last dimension. @@ -566,7 +581,8 @@ class RowParallelLinear(LinearBase): output = output_parallel output_bias = self.bias if self.skip_bias_add else None - + if not self.return_bias: + return output return output, output_bias def weight_loader(self, param, loaded_weight): diff --git a/vllm_mindspore/model_executor/layers/logits_processor.py b/vllm_mindspore/model_executor/layers/logits_processor.py index 75f35d6d..32b02fb7 100644 --- a/vllm_mindspore/model_executor/layers/logits_processor.py +++ b/vllm_mindspore/model_executor/layers/logits_processor.py @@ -41,7 +41,6 @@ if envs.VLLM_LOGITS_PROCESSOR_THREADS is not None: _logits_processor_threadpool = ThreadPoolExecutor( envs.VLLM_LOGITS_PROCESSOR_THREADS) - class LogitsProcessor(nn.Cell): """Process logits and apply logits processors from sampling metadata. @@ -74,9 +73,8 @@ class LogitsProcessor(nn.Cell): self.soft_cap = soft_cap # Whether to use gather or all-gather to gather the logits. parallel_config = get_current_vllm_config().parallel_config - self.use_gather = not current_platform.is_tpu() \ - or envs.VLLM_USE_V1 \ - or parallel_config.distributed_executor_backend == "external_launcher" + self.use_all_gather = envs.VLLM_USE_V1 \ + or parallel_config.distributed_executor_backend == "external_launcher" def construct( self, @@ -105,7 +103,8 @@ class LogitsProcessor(nn.Cell): logits *= self.scale # Apply logits processors (if any). 
- if sampling_metadata.seq_groups is not None: + if sampling_metadata is not None and \ + sampling_metadata.seq_groups is not None: logits = _apply_logits_processors(logits, sampling_metadata) return logits @@ -120,16 +119,16 @@ class LogitsProcessor(nn.Cell): logits = lm_head.quant_method.apply( lm_head, hidden_states, bias=embedding_bias ) - if self.use_gather: - # None may be returned for rank > 0 - logits = tensor_model_parallel_gather(logits) - else: + if self.use_all_gather: # Gather is not supported for some devices such as TPUs. # Use all-gather instead. # NOTE(woosuk): Here, the outputs of every device should not be None # because XLA requires strict SPMD among all devices. Every device # should execute the same operations after gathering the logits. logits = tensor_model_parallel_all_gather(logits) + else: + # None may be returned for rank > 0 + logits = tensor_model_parallel_gather(logits) # Remove paddings in vocab (if any). if logits is not None: logits = logits[..., : self.org_vocab_size] diff --git a/vllm_mindspore/model_executor/layers/vocab_parallel_embedding.py b/vllm_mindspore/model_executor/layers/vocab_parallel_embedding.py index cec40d8b..e3407f51 100644 --- a/vllm_mindspore/model_executor/layers/vocab_parallel_embedding.py +++ b/vllm_mindspore/model_executor/layers/vocab_parallel_embedding.py @@ -231,7 +231,7 @@ class VocabParallelEmbedding(nn.Cell): # If we are making an embedding layer, then our quantization linear # method must implement the embedding operation. If we are another # layer type like ParallelLMHead, this is not important. - is_embedding_layer = type(self.__class__) is VocabParallelEmbedding + is_embedding_layer = type(self) is VocabParallelEmbedding quant_method_implements_embedding = method_has_implemented_embedding( type(quant_method) ) diff --git a/vllm_mindspore/model_executor/model_loader/weight_utils.py b/vllm_mindspore/model_executor/model_loader/weight_utils.py index 45fe4bdd..0fc4d3d2 100644 --- a/vllm_mindspore/model_executor/model_loader/weight_utils.py +++ b/vllm_mindspore/model_executor/model_loader/weight_utils.py @@ -27,18 +27,16 @@ from mindspore import Parameter, Tensor def safetensors_weights_iterator( hf_weights_files: List[str], + use_tqdm_on_load: bool, ) -> Generator[Tuple[str, torch.Tensor], None, None]: """Iterate over the weights in the model safetensor files.""" from safetensors import safe_open - from vllm.model_executor.model_loader.weight_utils import _BAR_FORMAT + from vllm.model_executor.model_loader.weight_utils import _BAR_FORMAT, enable_tqdm - enable_tqdm = ( - not torch.distributed.is_initialized() or torch.distributed.get_rank() == 0 - ) for st_file in tqdm( hf_weights_files, desc="Loading safetensors checkpoint shards", - disable=not enable_tqdm, + disable=not enable_tqdm(use_tqdm_on_load), bar_format=_BAR_FORMAT, ): with safe_open(st_file, framework="np") as f: diff --git a/vllm_mindspore/model_executor/models/model_base.py b/vllm_mindspore/model_executor/models/model_base.py index 7aa1de4e..a464e3e1 100644 --- a/vllm_mindspore/model_executor/models/model_base.py +++ b/vllm_mindspore/model_executor/models/model_base.py @@ -20,7 +20,6 @@ import os from abc import abstractmethod from typing import Iterable, List, Optional, Set, Tuple, Union, Dict -from vllm.attention import AttentionMetadata from vllm.config import VllmConfig, get_current_vllm_config from vllm.model_executor.layers.sampler import SamplerOutput from vllm.model_executor.sampling_metadata import SamplingMetadata @@ -82,6 +81,7 @@ class 
Fake_Attention_V1(Attention): for _ in range(vllm_config.parallel_config.pipeline_parallel_size) ] self.attn_type = AttentionType.DECODER + self.num_block = num_block self.num_kv_heads = num_kv_heads self.head_size = head_size self.dtype = vllm_config.model_config.dtype @@ -210,8 +210,6 @@ class MsModelBase(): self, input_ids: Tensor, positions: Tensor, - kv_caches: List[Tensor], - attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors] = None, inputs_embeds: Optional[Tensor] = None, **kwargs diff --git a/vllm_mindspore/model_executor/models/qwen2.py b/vllm_mindspore/model_executor/models/qwen2.py index 5eb70a82..8bf808a4 100644 --- a/vllm_mindspore/model_executor/models/qwen2.py +++ b/vllm_mindspore/model_executor/models/qwen2.py @@ -47,19 +47,21 @@ from vllm_mindspore.model_executor.models.utils import ( PPMissingLayer, make_empty_intermediate_tensors_factory, make_layers, maybe_prefix) from vllm_mindspore.model_executor.sampling_metadata import SamplingMetadata -from vllm_mindspore.model_executor.models.model_base import MsModelBase, Fake_Attention +from vllm_mindspore.model_executor.models.model_base import MsModelBase, Fake_Attention, Fake_Attention_V1 from vllm_mindspore.model_executor.models.attention_mask import LowerTriangularMask from vllm_mindspore.utils import STR_DTYPE_TO_MS_DTYPE from vllm.config import CacheConfig, VllmConfig +import vllm.envs as envs from vllm.model_executor.layers.quantization import \ QuantizationConfig from vllm.sequence import IntermediateTensors from vllm.attention.backends.abstract import AttentionType from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size -from vllm.attention.backends.abstract import AttentionMetadata - +from vllm.forward_context import get_forward_context +from vllm_mindspore.v1.attention.backends.flash_attn import FlashAttentionMetadata +import mindspore as ms class Qwen2MLP(nn.Cell): def __init__( @@ -299,7 +301,6 @@ class Qwen2Model(nn.Cell): self.config = config self.quant_config = quant_config - self.padding_idx = config.pad_token_id self.vocab_size = config.vocab_size if get_pp_group().is_first_rank or (config.tie_word_embeddings @@ -494,7 +495,10 @@ class Qwen2ForCausalLM(MsModelBase): self.casual_mask = LowerTriangularMask(dtype=self.mstype, max_model_len=self.model_config.max_model_len) self.set_model_inputs(self.prefill) - self.kv_caches = [Fake_Attention() for i in range(config.num_hidden_layers)] + if envs.VLLM_USE_V1: + self.kv_caches = [Fake_Attention_V1() for i in range(config.num_hidden_layers)] + else: + self.kv_caches = [Fake_Attention() for i in range(config.num_hidden_layers)] compilation_config = vllm_config.compilation_config if prefix in compilation_config.static_forward_context: @@ -513,7 +517,8 @@ class Qwen2ForCausalLM(MsModelBase): kv_cache_dtype = self.model_config.dtype if self.cache_config.cache_dtype == "auto" \ else self.cache_config.cache_dtype - kv_cache_dtype = STR_DTYPE_TO_MS_DTYPE[kv_cache_dtype] + if kv_cache_dtype in STR_DTYPE_TO_MS_DTYPE: + kv_cache_dtype = STR_DTYPE_TO_MS_DTYPE[kv_cache_dtype] num_layers = self.model_config.get_num_layers(self.parallel_config) @@ -548,27 +553,48 @@ class Qwen2ForCausalLM(MsModelBase): self, input_ids: Tensor, positions: Tensor, - kv_caches: List[Tuple[Tensor, Tensor]], - attn_metadata: AttentionMetadata, intermediate_tensors: IntermediateTensors = None, inputs_embeds: Tensor = None, **kwargs ) -> Union[Tensor, IntermediateTensors]: key_cache, value_cache = self.get_kvcache() - seq_lens = 
attn_metadata.seq_lens - max_query_len = attn_metadata.max_query_len - # When Mutli-Step is enabled with Chunked-Prefill, prefills and - # decodes are scheduled together. In the first step, all the - # prefills turn into decodes and max_query_len will be 1. - if self.is_multi_step_chunked_prefill and max_query_len == 1: - query_lens = [1] * len(seq_lens) + attn_metadata = get_forward_context().attn_metadata + input_ids = input_ids.to(ms.int64) + if attn_metadata is None: + attn_metadata = self._dummy_attention_metadata(input_ids, positions) + if not envs.VLLM_USE_V1: + seq_lens = attn_metadata.seq_lens + max_query_len = attn_metadata.max_query_len + # When Mutli-Step is enabled with Chunked-Prefill, prefills and + # decodes are scheduled together. In the first step, all the + # prefills turn into decodes and max_query_len will be 1. + if self.is_multi_step_chunked_prefill and max_query_len == 1: + query_lens = [1] * len(seq_lens) + else: + query_lens = attn_metadata.query_lens + + seq_lens_np = np.array(seq_lens, dtype=np.int32) + query_lens_np = np.array(query_lens, dtype=np.int32) + kv_cache_lens = seq_lens_np - query_lens_np + is_prefill = attn_metadata.num_decode_tokens == 0 and kv_cache_lens.max() == 0 + slot_mapping = attn_metadata.slot_mapping + batch_valid_length = Tensor.from_numpy(np.array(attn_metadata.seq_lens, dtype=np.int32)) + q_seq_lens = Tensor.from_numpy(np.array(attn_metadata.query_lens, dtype=np.int32)) + block_tables = attn_metadata.block_tables + position_ids = ms.Tensor(positions, dtype=ms.int32) + attn_mask = self.casual_mask.gen_attention_mask(is_prefill, position_ids, query_lens) else: - query_lens = attn_metadata.query_lens - - seq_lens_np = np.array(seq_lens, dtype=np.int32) - query_lens_np = np.array(query_lens, dtype=np.int32) - kv_cache_lens = seq_lens_np - query_lens_np - is_prefill = attn_metadata.num_decode_tokens == 0 and kv_cache_lens.max() == 0 + if attn_metadata.max_context_lens == 0: + is_prefill = True + else: + is_prefill = False + slot_mapping = attn_metadata.slot_mapping + batch_valid_length = Tensor.from_numpy(attn_metadata.seq_lens_np) + q_seq_lens = attn_metadata.q_seq_lens + block_tables = attn_metadata.block_tables + query_lens_np = attn_metadata.q_seq_lens_np + attn_mask = self.casual_mask.gen_attention_mask(is_prefill, positions, query_lens_np) + positions = positions.to(ms.int64) if is_prefill: input_ids = ops.expand_dims(input_ids, 0) if not self.prefill: @@ -579,13 +605,6 @@ class Qwen2ForCausalLM(MsModelBase): if self.prefill: self.prefill = False self.set_model_inputs(self.prefill) - - slot_mapping = attn_metadata.slot_mapping - attn_mask = self.casual_mask.gen_attention_mask(is_prefill, positions, query_lens) - seq_lens_np = np.array(attn_metadata.seq_lens, dtype=np.int32) - batch_valid_length = Tensor.from_numpy(seq_lens_np) - q_seq_lens = Tensor.from_numpy(np.array(attn_metadata.query_lens, dtype=np.int32)) - block_tables = attn_metadata.block_tables model_output = self.model(input_ids, positions, key_cache, @@ -604,6 +623,32 @@ class Qwen2ForCausalLM(MsModelBase): model_output = ops.squeeze(model_output, 1) return model_output + def _dummy_attention_metadata(self, input_ids: Tensor, positions: Tensor) -> FlashAttentionMetadata: + input_len = input_ids.shape[0] + max_seq_len = ms.Tensor(input_len, dtype=ms.int32) + seq_lengths = ms.Tensor([input_len], dtype=ms.int32) + q_seq_lens = ms.Tensor([input_len], dtype=ms.int32) + q_seq_lens_np = np.array([input_len], dtype=np.int32) + seq_lens_np = np.array([input_len], dtype=np.int32) + + 
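# NOTE (assumption): _dummy_attention_metadata() only backs warmup/profile runs in
# which get_forward_context().attn_metadata is None; it fabricates a single fake
# sequence of length input_len, and the slot_mapping filled with -1 below is taken
# to mean "do not write these tokens back into the KV cache".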
block_tables = ms.Tensor([[0]], dtype=ms.int32) + slot_mapping = [-1 for _ in range(input_len)] + slot_mapping = ms.Tensor(slot_mapping, dtype=ms.int32) + return FlashAttentionMetadata( + max_seq_len=max_seq_len, + seq_lens=seq_lengths, + seq_lens_np=seq_lens_np, + block_tables=block_tables, + slot_mapping=slot_mapping, + q_seq_lens=q_seq_lens, + q_seq_lens_np=q_seq_lens_np, + context_lens=0, + # To enforce prefill and decode are both complied in warmup process. + # So set max_context_lens to 0 for prefill and 1 for decode. + max_context_lens=0 if self.prefill else 1, + query_start_loc = None + ) + def load_weights(self, weights: Iterable[Tuple[str, Tensor]]) -> Set[str]: params_dict = self.get_params_dict() self.model.load_weights(weights, params_dict) -- Gitee From 79505056e76716ae3087fa6a66bbd3f53c523611 Mon Sep 17 00:00:00 2001 From: r1chardf1d0 Date: Fri, 16 May 2025 16:00:47 +0800 Subject: [PATCH 42/55] mtp support 0.8.3 --- tests/mindformers | 2 +- tests/st/python/test_vllm_deepseek_bf16_part_v1.py | 2 +- tests/st/python/test_vllm_deepseek_part.py | 8 ++++---- tests/st/python/test_vllm_deepseek_part_v1.py | 2 +- .../model_executor/models/mf_models/deepseek_mtp.py | 2 ++ 5 files changed, 9 insertions(+), 7 deletions(-) diff --git a/tests/mindformers b/tests/mindformers index bbddc170..d2df5295 160000 --- a/tests/mindformers +++ b/tests/mindformers @@ -1 +1 @@ -Subproject commit bbddc170167ac6705f07bf0aea25977e10f8d760 +Subproject commit d2df52951122a0328a7ccea4230e0b6f49f6c2e8 diff --git a/tests/st/python/test_vllm_deepseek_bf16_part_v1.py b/tests/st/python/test_vllm_deepseek_bf16_part_v1.py index 11167779..3e616242 100644 --- a/tests/st/python/test_vllm_deepseek_bf16_part_v1.py +++ b/tests/st/python/test_vllm_deepseek_bf16_part_v1.py @@ -60,7 +60,7 @@ class TestDeepSeek: # Create an LLM. llm = LLM(model="/home/workspace/mindspore_dataset/weight/DeepSeek-R1-bf16", - trust_remote_code=True, gpu_memory_utilization=0.9, tensor_parallel_size=8) + trust_remote_code=True, gpu_memory_utilization=0.9, tensor_parallel_size=8, max_model_len=4096) # Generate texts from the prompts. The output is a list of RequestOutput objects # that contain the prompt, generated text, and other information. outputs = llm.generate(prompts, sampling_params) diff --git a/tests/st/python/test_vllm_deepseek_part.py b/tests/st/python/test_vllm_deepseek_part.py index c882ae58..21ba4fe4 100644 --- a/tests/st/python/test_vllm_deepseek_part.py +++ b/tests/st/python/test_vllm_deepseek_part.py @@ -25,7 +25,7 @@ env_vars = { "MINDFORMERS_MODEL_CONFIG": "./config/predict_deepseek_r1_671b_w8a8.yaml", "ASCEND_CUSTOM_PATH": os.path.expandvars("$ASCEND_HOME_PATH/../"), "vLLM_MODEL_BACKEND": "MindFormers", - "MS_ENABLE_LCCL": "off", + "MS_ENABLE_LCCL": "on", "HCCL_OP_EXPANSION_MODE": "AIV", "ASCEND_RT_VISIBLE_DEVICES": "0,1,2,3,4,5,6,7", "MS_ALLOC_CONF": "enable_vmm:True", @@ -89,7 +89,7 @@ class TestDeepSeekMTP: @pytest.mark.level0 @pytest.mark.platform_arm_ascend910b_training @pytest.mark.env_single - @pytest.mark.skip(reason="MTP need addition adaption on v0.8.3 V0") + @pytest.mark.skip(reason="gs master branch is not suit for the newest mindformers.") def test_deepseek_mtp(self): """ test case deepseek mtp with main model of r1-w8a8 @@ -105,8 +105,8 @@ class TestDeepSeekMTP: # Create an LLM. 
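# NOTE (assumption): vLLM 0.8.x configures speculative decoding through a single
# speculative_config dict rather than the older top-level num_speculative_tokens
# argument, which is what the rewritten LLM(...) call below reflects, e.g.:
#
#     llm = LLM(model=model_path,  # hypothetical local checkpoint path
#               trust_remote_code=True,
#               tensor_parallel_size=8,
#               speculative_config={"num_speculative_tokens": 1})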
llm = LLM(model="/home/workspace/mindspore_dataset/weight/DeepSeek-R1-MTP", - trust_remote_code=True, gpu_memory_utilization=0.8, tensor_parallel_size=8, - num_speculative_tokens=1) + trust_remote_code=True, gpu_memory_utilization=0.7, tensor_parallel_size=8, + speculative_config={"num_speculative_tokens":1}) # Generate texts from the prompts. The output is a list of RequestOutput objects # that contain the prompt, generated text, and other information. outputs = llm.generate(prompts, sampling_params) diff --git a/tests/st/python/test_vllm_deepseek_part_v1.py b/tests/st/python/test_vllm_deepseek_part_v1.py index 3ec16fa7..889aae9b 100644 --- a/tests/st/python/test_vllm_deepseek_part_v1.py +++ b/tests/st/python/test_vllm_deepseek_part_v1.py @@ -63,7 +63,7 @@ class TestDeepSeek: # Create an LLM. llm = LLM(model="/home/workspace/mindspore_dataset/weight/DeepSeek-R1-W8A8", - trust_remote_code=True, gpu_memory_utilization=0.9, tensor_parallel_size=8) + trust_remote_code=True, gpu_memory_utilization=0.9, tensor_parallel_size=8, max_model_len=4096) # Generate texts from the prompts. The output is a list of RequestOutput objects # that contain the prompt, generated text, and other information. outputs = llm.generate(prompts, sampling_params) diff --git a/vllm_mindspore/model_executor/models/mf_models/deepseek_mtp.py b/vllm_mindspore/model_executor/models/mf_models/deepseek_mtp.py index fac2bf20..c0b72f4d 100644 --- a/vllm_mindspore/model_executor/models/mf_models/deepseek_mtp.py +++ b/vllm_mindspore/model_executor/models/mf_models/deepseek_mtp.py @@ -110,4 +110,6 @@ class DeepseekV3MTPForCausalLM(MfModelBase): weight_processor = DeepseekV3WeightProcessor(self.mf_config, self.network, False) weight_processor.load_safetensors_shard(self.mf_config.load_checkpoint, is_mtp_model=True) self.network.set_dynamic_inputs() + dynamic_hidden_states = Tensor(shape=[None, None], dtype=self.mf_model_config.compute_dtype) + self.lm_head.set_inputs(dynamic_hidden_states) return None -- Gitee From 60435452383233d0a23614c15b3e4ba8f6a157f2 Mon Sep 17 00:00:00 2001 From: huandong Date: Sat, 17 May 2025 15:58:37 +0800 Subject: [PATCH 43/55] add global_max_bs(use max_num_seqs) for dispatch op and combine op --- vllm_mindspore/model_executor/models/mf_models/deepseek_v3.py | 3 +++ vllm_mindspore/model_executor/models/model_base.py | 1 + 2 files changed, 4 insertions(+) diff --git a/vllm_mindspore/model_executor/models/mf_models/deepseek_v3.py b/vllm_mindspore/model_executor/models/mf_models/deepseek_v3.py index d1102ad9..c087d998 100644 --- a/vllm_mindspore/model_executor/models/mf_models/deepseek_v3.py +++ b/vllm_mindspore/model_executor/models/mf_models/deepseek_v3.py @@ -155,6 +155,9 @@ class DeepseekV3ForCausalLM(MfModelBase): self.mf_model_config = DeepseekV3Config_MF(**self.mf_config.model.model_config) if self.mf_config.moe_config: self.mf_model_config.moe_config = self.mf_config.moe_config + # dispatch/combine in moe need max_num_seqs as global_max_bs + if hasattr(self.mf_model_config.moe_config, "dispatch_global_max_bs"): + self.mf_model_config.moe_config.dispatch_global_max_bs = self.scheduler_config.max_num_seqs self.mf_model_config.return_hidden_states = True setattr(self.mf_model_config, 'npu_mem_size', -1) diff --git a/vllm_mindspore/model_executor/models/model_base.py b/vllm_mindspore/model_executor/models/model_base.py index 7aa1de4e..d211c3a9 100644 --- a/vllm_mindspore/model_executor/models/model_base.py +++ b/vllm_mindspore/model_executor/models/model_base.py @@ -111,6 +111,7 @@ class MsModelBase(): 
self.cache_config = vllm_config.cache_config self.parallel_config = vllm_config.parallel_config self.load_config = vllm_config.load_config + self.scheduler_config = vllm_config.scheduler_config self.modules_dict = None -- Gitee From 51fb3b5ea0331ab1acfc6791c9c38bacfc5098df Mon Sep 17 00:00:00 2001 From: r1chardf1d0 Date: Wed, 21 May 2025 11:03:52 +0800 Subject: [PATCH 44/55] reopen v0 testcase --- tests/st/python/test_vllm_deepseek_part.py | 2 +- tests/st/python/test_vllm_mf_qwen_7b_chunk_prefill.py | 6 +++--- tests/st/python/test_vllm_mf_qwen_7b_cp_pc_mss.py | 2 +- tests/st/python/test_vllm_mf_qwen_7b_prefix_caching.py | 4 ++-- 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/tests/st/python/test_vllm_deepseek_part.py b/tests/st/python/test_vllm_deepseek_part.py index 21ba4fe4..bf7d8d2a 100644 --- a/tests/st/python/test_vllm_deepseek_part.py +++ b/tests/st/python/test_vllm_deepseek_part.py @@ -105,7 +105,7 @@ class TestDeepSeekMTP: # Create an LLM. llm = LLM(model="/home/workspace/mindspore_dataset/weight/DeepSeek-R1-MTP", - trust_remote_code=True, gpu_memory_utilization=0.7, tensor_parallel_size=8, + trust_remote_code=True, gpu_memory_utilization=0.7, tensor_parallel_size=8, max_model_len=4096, speculative_config={"num_speculative_tokens":1}) # Generate texts from the prompts. The output is a list of RequestOutput objects # that contain the prompt, generated text, and other information. diff --git a/tests/st/python/test_vllm_mf_qwen_7b_chunk_prefill.py b/tests/st/python/test_vllm_mf_qwen_7b_chunk_prefill.py index cc3cbcab..7732bbbf 100644 --- a/tests/st/python/test_vllm_mf_qwen_7b_chunk_prefill.py +++ b/tests/st/python/test_vllm_mf_qwen_7b_chunk_prefill.py @@ -48,7 +48,7 @@ class TestMfQwen_chunk_prefill: @pytest.mark.level0 @pytest.mark.platform_arm_ascend910b_training @pytest.mark.env_single - @pytest.mark.skip(reason="cp precision need to be fixed on v0.8.3 V0") + @pytest.mark.skip(reason="mindspore not ready") def test_mf_qwen_7b_chunk_prefill(self): """ test case qwen_7b_chunk_prefill @@ -61,8 +61,8 @@ class TestMfQwen_chunk_prefill: "marvel that once housed emperors, stands as a testament to the city's imperial past. Meanwhile, the Great " "Wall, though not within the city limits, is easily accessible from Beijing and offers a glimpse into the " "strategic genius and resilience of ancient China.", - "answer": " The city's blend of traditional and modern architecture, vibrant street life, and rich culinary scene " - "make it a truly unique and captivating destination. I am always eager to"}, + "answer": " The city's blend of traditional and modern architecture, bustling markets, and vibrant street life make it " + "a unique and fascinating destination. In short, Beijing is a city"}, {"prompt": "I love Beijing, because", "answer": " it is a city with a long history. Which of the following options correctly expresses this sentence?\nA. 
I love Beijing, because it is a city with a"}, ] diff --git a/tests/st/python/test_vllm_mf_qwen_7b_cp_pc_mss.py b/tests/st/python/test_vllm_mf_qwen_7b_cp_pc_mss.py index 856932d3..90eb811f 100644 --- a/tests/st/python/test_vllm_mf_qwen_7b_cp_pc_mss.py +++ b/tests/st/python/test_vllm_mf_qwen_7b_cp_pc_mss.py @@ -45,7 +45,7 @@ class TestMfQwen_cp_pc_mss: @pytest.mark.level0 @pytest.mark.platform_arm_ascend910b_training @pytest.mark.env_single - @pytest.mark.skip(reason="cp precision need to be fixed on v0.8.3 V0") + @pytest.mark.skip(reason="mindspore not ready") def test_mf_qwen_7b_cp_pc_mss(self): """ test case mf_qwen_7b_cp_pc_mss diff --git a/tests/st/python/test_vllm_mf_qwen_7b_prefix_caching.py b/tests/st/python/test_vllm_mf_qwen_7b_prefix_caching.py index 28ec1058..80ea073f 100644 --- a/tests/st/python/test_vllm_mf_qwen_7b_prefix_caching.py +++ b/tests/st/python/test_vllm_mf_qwen_7b_prefix_caching.py @@ -46,7 +46,7 @@ class TestMfQwen_prefix_caching: @pytest.mark.level0 @pytest.mark.platform_arm_ascend910b_training @pytest.mark.env_single - @pytest.mark.skip(reason="pc precision need to be fixed on v0.8.3 V0") + @pytest.mark.skip(reason="mindspore not ready") def test_mf_qwen_7b_prefix_caching(self): """ test case qwen_7b_prefix_caching @@ -72,7 +72,7 @@ class TestMfQwen_prefix_caching: outputs = llm.generate(prompts, sampling_params) second_outputs = llm.generate(second_prompts, sampling_params) except_list=[' many times and each time I have found something new'] - second_except_list=[' to visit, such as the Forbidden City, the'] + second_except_list=[' in Beijing, but I have to say that the'] for i, (output, second_output) in enumerate(zip(outputs, second_outputs)): generated_text = output.outputs[i].text print(f"Output1 - Prompt: {prompts[i]!r}, Generated text: {generated_text!r}") -- Gitee From de9d18b6b421cd962bdfc596a9a7d98181f1d196 Mon Sep 17 00:00:00 2001 From: Erpim Date: Wed, 21 May 2025 14:47:24 +0800 Subject: [PATCH 45/55] support dp/tp case --- tests/mindformers | 2 +- vllm_mindspore/model_executor/models/mf_models/deepseek_v3.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/mindformers b/tests/mindformers index d2df5295..a926c39b 160000 --- a/tests/mindformers +++ b/tests/mindformers @@ -1 +1 @@ -Subproject commit d2df52951122a0328a7ccea4230e0b6f49f6c2e8 +Subproject commit a926c39bd0c97fa4ea145232da5cc65b7703f88a diff --git a/vllm_mindspore/model_executor/models/mf_models/deepseek_v3.py b/vllm_mindspore/model_executor/models/mf_models/deepseek_v3.py index 20f83e72..17a730bb 100644 --- a/vllm_mindspore/model_executor/models/mf_models/deepseek_v3.py +++ b/vllm_mindspore/model_executor/models/mf_models/deepseek_v3.py @@ -83,7 +83,7 @@ def set_runtime_kernel_launch_group(): def _get_padding_index(q_seq_len): dp_size = get_dp_group().world_size tp_size = get_tensor_model_parallel_world_size() - if dp_size == 1 or tp_size == 1: + if dp_size == 1: return None, None, None, None tokens_len_per_dp = q_seq_len.sum().reshape(-1) -- Gitee From 474e6ba6e0358b7dd4922ca7de937740915f43dd Mon Sep 17 00:00:00 2001 From: wusimin Date: Thu, 22 May 2025 10:02:31 +0800 Subject: [PATCH 46/55] =?UTF-8?q?[0.8.3=20v1]=E9=80=82=E9=85=8D=E5=8E=9F?= =?UTF-8?q?=E7=94=9FQwen,=E4=BF=AE=E5=A4=8D=E7=B2=BE=E5=BA=A6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- vllm_mindspore/model_executor/models/attention_mask.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git 
a/vllm_mindspore/model_executor/models/attention_mask.py b/vllm_mindspore/model_executor/models/attention_mask.py index 40be1f46..42d6e629 100644 --- a/vllm_mindspore/model_executor/models/attention_mask.py +++ b/vllm_mindspore/model_executor/models/attention_mask.py @@ -46,7 +46,7 @@ class LowerTriangularMask: self.dtype = dtype self.max_model_len = max_model_len - prefill_mask_coeff = 1.0 if self.dtype is mstype.bfloat16 else -10000.0 + prefill_mask_coeff = 1.0 if self.dtype == mstype.bfloat16 else -10000.0 self.prefill_mask = Tensor(np.triu(np.ones(shape=(128, 128), dtype=np.float16), k=1) * prefill_mask_coeff, dtype=self.dtype) @@ -78,6 +78,6 @@ class MLALowerTriangularMask(LowerTriangularMask): def __init__(self, dtype, max_model_len): super().__init__(dtype, max_model_len) - decode_mask_coeff = 1.0 if self.dtype is mstype.bfloat16 else -10000.0 + decode_mask_coeff = 1.0 if self.dtype == mstype.bfloat16 else -10000.0 self.decode_mask = Tensor(np.triu(np.ones(shape=(self.max_model_len, self.max_model_len), dtype=np.int8), k=1), dtype=self.dtype) * decode_mask_coeff -- Gitee From 88afb5f0ade4820d7dacaadc81b92483d367bd99 Mon Sep 17 00:00:00 2001 From: one_east Date: Fri, 23 May 2025 10:49:25 +0800 Subject: [PATCH 47/55] bugfix: process do not terminated correctly --- vllm_mindspore/__init__.py | 5 +++++ vllm_mindspore/executor/multiproc_worker_utils.py | 5 +++++ 2 files changed, 10 insertions(+) diff --git a/vllm_mindspore/__init__.py b/vllm_mindspore/__init__.py index 98970a5f..34e7bdf3 100644 --- a/vllm_mindspore/__init__.py +++ b/vllm_mindspore/__init__.py @@ -172,6 +172,7 @@ vllm.worker.multi_step_model_runner._get_supported_attention_backends = ( from vllm_mindspore.executor.multiproc_worker_utils import ( get_mp_context as ms_get_mp_context, + terminate_worker as ms_terminate_worker, ) # To patching the get_mp_context, should import it first. 
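# NOTE (assumption): a minimal sketch of the import-time patching idiom this file
# relies on -- import the MindSpore-friendly replacements first, then rebind the
# attributes on the vLLM module so every later caller resolves to the new
# behaviour. Names mirror the surrounding diff.
#
#     from vllm_mindspore.executor.multiproc_worker_utils import (
#         get_mp_context as ms_get_mp_context,
#         terminate_worker as ms_terminate_worker,
#     )
#     import vllm.executor.multiproc_worker_utils as mp_utils
#     mp_utils.get_mp_context = ms_get_mp_context
#     mp_utils.ProcessWorkerWrapper.terminate_worker = ms_terminate_worker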
@@ -179,6 +180,10 @@ from vllm.executor.multiproc_worker_utils import get_mp_context vllm.executor.multiproc_worker_utils.get_mp_context = ms_get_mp_context +import vllm.executor.multiproc_worker_utils + +vllm.executor.multiproc_worker_utils.ProcessWorkerWrapper.terminate_worker = ms_terminate_worker + import vllm.v1.executor.multiproc_executor vllm.v1.executor.multiproc_executor.get_mp_context = ms_get_mp_context import vllm.v1.utils diff --git a/vllm_mindspore/executor/multiproc_worker_utils.py b/vllm_mindspore/executor/multiproc_worker_utils.py index 86986fa6..30c7a597 100644 --- a/vllm_mindspore/executor/multiproc_worker_utils.py +++ b/vllm_mindspore/executor/multiproc_worker_utils.py @@ -21,3 +21,8 @@ import multiprocessing def get_mp_context(): return multiprocessing.get_context("fork") + + +def terminate_worker(self): + self.process.kill() + self._task_queue.close() -- Gitee From 18162c09666a06120839c5017f05920c5dea8ccd Mon Sep 17 00:00:00 2001 From: r1chardf1d0 Date: Fri, 23 May 2025 01:37:44 +0800 Subject: [PATCH 48/55] reopen v0 testcase --- .jenkins/test/config/dependent_packages.yaml | 4 ++-- tests/st/python/test_vllm_deepseek_part.py | 2 -- tests/st/python/test_vllm_mf_qwen_7b_chunk_prefill.py | 1 - tests/st/python/test_vllm_mf_qwen_7b_cp_pc_mss.py | 1 - tests/st/python/test_vllm_mf_qwen_7b_prefix_caching.py | 1 - 5 files changed, 2 insertions(+), 7 deletions(-) diff --git a/.jenkins/test/config/dependent_packages.yaml b/.jenkins/test/config/dependent_packages.yaml index 5df5ff1f..0425cf02 100644 --- a/.jenkins/test/config/dependent_packages.yaml +++ b/.jenkins/test/config/dependent_packages.yaml @@ -1,8 +1,8 @@ mindspore: - 'https://repo.mindspore.cn/mindspore/mindspore/version/202505/20250514/br_infer_deepseek_os_20250514004506_0e705b79c36766d07889faa32bc6a3ef6ec79ef3_newest/' + 'https://repo.mindspore.cn/mindspore/mindspore/version/202505/20250523/br_infer_deepseek_os_20250523150616_197336f8c8ab3ca63d02df74b31a080f521c0cab_newest/' mindspore_gs: - 'https://repo.mindspore.cn/mindspore/golden-stick/version/202505/20250514/master_20250514010015_c6cede824328d0dd7069e735646ff4a1808a1c72_newest/' + 'https://repo.mindspore.cn/mindspore/golden-stick/version/202505/20250521/develop_20250521153508_28a4a63203943d66d8c94b2b013e5cbed37f3e8a_newest/' msadapter: 'https://repo.mindspore.cn/mindspore/msadapter/version/202505/20250514/master_20250514010016_380ecadf0133da436503105d6e8e1db709472fe4_newest/' diff --git a/tests/st/python/test_vllm_deepseek_part.py b/tests/st/python/test_vllm_deepseek_part.py index bf7d8d2a..c6a7f339 100644 --- a/tests/st/python/test_vllm_deepseek_part.py +++ b/tests/st/python/test_vllm_deepseek_part.py @@ -48,7 +48,6 @@ class TestDeepSeek: @pytest.mark.level0 @pytest.mark.platform_arm_ascend910b_training @pytest.mark.env_single - @pytest.mark.skip(reason="gs master branch is not suit for the newest mindformers.") def test_deepseek_r1(self): """ test case deepseek r1 w8a8 @@ -89,7 +88,6 @@ class TestDeepSeekMTP: @pytest.mark.level0 @pytest.mark.platform_arm_ascend910b_training @pytest.mark.env_single - @pytest.mark.skip(reason="gs master branch is not suit for the newest mindformers.") def test_deepseek_mtp(self): """ test case deepseek mtp with main model of r1-w8a8 diff --git a/tests/st/python/test_vllm_mf_qwen_7b_chunk_prefill.py b/tests/st/python/test_vllm_mf_qwen_7b_chunk_prefill.py index 7732bbbf..daa57d93 100644 --- a/tests/st/python/test_vllm_mf_qwen_7b_chunk_prefill.py +++ b/tests/st/python/test_vllm_mf_qwen_7b_chunk_prefill.py @@ -48,7 +48,6 @@ class 
TestMfQwen_chunk_prefill: @pytest.mark.level0 @pytest.mark.platform_arm_ascend910b_training @pytest.mark.env_single - @pytest.mark.skip(reason="mindspore not ready") def test_mf_qwen_7b_chunk_prefill(self): """ test case qwen_7b_chunk_prefill diff --git a/tests/st/python/test_vllm_mf_qwen_7b_cp_pc_mss.py b/tests/st/python/test_vllm_mf_qwen_7b_cp_pc_mss.py index 90eb811f..df61117d 100644 --- a/tests/st/python/test_vllm_mf_qwen_7b_cp_pc_mss.py +++ b/tests/st/python/test_vllm_mf_qwen_7b_cp_pc_mss.py @@ -45,7 +45,6 @@ class TestMfQwen_cp_pc_mss: @pytest.mark.level0 @pytest.mark.platform_arm_ascend910b_training @pytest.mark.env_single - @pytest.mark.skip(reason="mindspore not ready") def test_mf_qwen_7b_cp_pc_mss(self): """ test case mf_qwen_7b_cp_pc_mss diff --git a/tests/st/python/test_vllm_mf_qwen_7b_prefix_caching.py b/tests/st/python/test_vllm_mf_qwen_7b_prefix_caching.py index 80ea073f..01736f85 100644 --- a/tests/st/python/test_vllm_mf_qwen_7b_prefix_caching.py +++ b/tests/st/python/test_vllm_mf_qwen_7b_prefix_caching.py @@ -46,7 +46,6 @@ class TestMfQwen_prefix_caching: @pytest.mark.level0 @pytest.mark.platform_arm_ascend910b_training @pytest.mark.env_single - @pytest.mark.skip(reason="mindspore not ready") def test_mf_qwen_7b_prefix_caching(self): """ test case qwen_7b_prefix_caching -- Gitee From ebc67cc32c10dd976bbe075ee4a532c7bc0753b7 Mon Sep 17 00:00:00 2001 From: tronzhang Date: Fri, 23 May 2025 20:22:39 +0800 Subject: [PATCH 49/55] fix pa error for v0 qwen in cp+mss case --- vllm_mindspore/model_executor/models/qwen2.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm_mindspore/model_executor/models/qwen2.py b/vllm_mindspore/model_executor/models/qwen2.py index 8bf808a4..444ddc5a 100644 --- a/vllm_mindspore/model_executor/models/qwen2.py +++ b/vllm_mindspore/model_executor/models/qwen2.py @@ -579,7 +579,7 @@ class Qwen2ForCausalLM(MsModelBase): is_prefill = attn_metadata.num_decode_tokens == 0 and kv_cache_lens.max() == 0 slot_mapping = attn_metadata.slot_mapping batch_valid_length = Tensor.from_numpy(np.array(attn_metadata.seq_lens, dtype=np.int32)) - q_seq_lens = Tensor.from_numpy(np.array(attn_metadata.query_lens, dtype=np.int32)) + q_seq_lens = ms.Tensor(query_lens_np, dtype=ms.int32) block_tables = attn_metadata.block_tables position_ids = ms.Tensor(positions, dtype=ms.int32) attn_mask = self.casual_mask.gen_attention_mask(is_prefill, position_ids, query_lens) -- Gitee From c308469de798e7c24df2dd602f5707f7c0f6f687 Mon Sep 17 00:00:00 2001 From: yangminghai Date: Sat, 24 May 2025 19:47:14 +0800 Subject: [PATCH 50/55] fix bug at pynative mode when decode and prefill phase change --- .../model_executor/models/mf_models/mf_model_base.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/vllm_mindspore/model_executor/models/mf_models/mf_model_base.py b/vllm_mindspore/model_executor/models/mf_models/mf_model_base.py index ecc764be..394032ad 100644 --- a/vllm_mindspore/model_executor/models/mf_models/mf_model_base.py +++ b/vllm_mindspore/model_executor/models/mf_models/mf_model_base.py @@ -38,6 +38,7 @@ from mindformers.tools.register.config import MindFormerConfig from mindformers.core.context import build_mf_context from mindformers.core.parallel_config import build_parallel_config from mindspore.common.api import _pynative_executor +from mindformers.tools.utils import is_pynative from vllm_mindspore.model_executor.models.model_base import MsModelBase from vllm_mindspore.model_executor.models.attention_mask import LowerTriangularMask from 
vllm_mindspore.v1.attention.backends.flash_attn import FlashAttentionMetadata @@ -231,11 +232,11 @@ class MfModelBase(MsModelBase): if is_prefill: self.network.phase = "prefill" - if not self.set_flags: + if not self.set_flags or is_pynative(): self.network.add_flags_custom(is_first_iteration=True) hidden_states = self.network(**model_inputs) self.network.phase = "increment" - if not self.set_flags: + if not self.set_flags or is_pynative(): self.network.add_flags_custom(is_first_iteration=False) self.set_flags = True if kv_transfer_supported: -- Gitee From 646e481dbb09f350b695ebd959293be8ce802c7d Mon Sep 17 00:00:00 2001 From: Erpim Date: Tue, 13 May 2025 15:36:50 +0800 Subject: [PATCH 51/55] add mix parallel st --- .../python/test_vllm_deepseek_mix_parallel.py | 351 ++++++++++++++++++ 1 file changed, 351 insertions(+) create mode 100644 tests/st/python/test_vllm_deepseek_mix_parallel.py diff --git a/tests/st/python/test_vllm_deepseek_mix_parallel.py b/tests/st/python/test_vllm_deepseek_mix_parallel.py new file mode 100644 index 00000000..32a8aee1 --- /dev/null +++ b/tests/st/python/test_vllm_deepseek_mix_parallel.py @@ -0,0 +1,351 @@ +# Copyright 2025 Huawei Technologies Co., Ltd +# +# This file is mainly Adapted from https://github.com/vllm-project/vllm/blob/main/examples/offline_inference/data_parallel.py +# Copyright 2025 The vLLM team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +"""test mf deepseek r1.""" +import pytest +import os +import tempfile +import re + +from . 
import set_env +from multiprocessing import Process, Queue + +env_manager = set_env.EnvVarManager() + +env_vars = { + "MINDFORMERS_MODEL_CONFIG": "./config/predict_deepseek_r1_671b_w8a8.yaml", + "ASCEND_CUSTOM_PATH": os.path.expandvars("$ASCEND_HOME_PATH/../"), + "vLLM_MODEL_BACKEND": "MindFormers", + "MS_ENABLE_LCCL": "on", + "HCCL_OP_EXPANSION_MODE": "AIV", + "ASCEND_RT_VISIBLE_DEVICES": "0,1,2,3,4,5,6,7", + "MS_ALLOC_CONF": "enable_vmm:True", + "LCCL_DETERMINISTIC": "1", + "HCCL_DETERMINISTIC": "true", + "ATB_MATMUL_SHUFFLE_K_ENABLE": "0", + "ATB_LLM_LCOC_ENABLE": "0" +} +env_manager.setup_ai_environment(env_vars) +import vllm_mindspore +from vllm import LLM, SamplingParams +from vllm.utils import get_open_port + + +def dp_func(dp_size, local_dp_rank, global_dp_rank, dp_master_ip, dp_master_port, + GPUs_per_dp_rank, prompts, except_list, result_q): + os.environ["VLLM_DP_RANK"] = str(global_dp_rank) + os.environ["VLLM_DP_LOCAL"] = str(local_dp_rank) + os.environ["VLLM_DP_SIZE"] = str(dp_size) + os.environ["VLLM_DP_MASTER_IP"] = dp_master_ip + os.environ["VLLM_DP_MASTER_PORT"] = str(dp_master_port) + + promts_per_rank = len(prompts) // dp_size + start = global_dp_rank * promts_per_rank + end = start + promts_per_rank + prompts = prompts[start:end] + except_list = except_list[start:end] + if len(prompts) == 0: + prompts = ["Placeholder"] + print(f"DP rank {global_dp_rank} needs to process {len(prompts)} prompts") + + sampling_params = SamplingParams(temperature=0.0, + top_p=1.0, + top_k=1, + repetition_penalty=1.0, + max_tokens=3) + + # Create an LLM. + llm = LLM(model="/home/workspace/mindspore_dataset/weight/DeepSeek-R1-W8A8", + tensor_parallel_size=GPUs_per_dp_rank, + max_model_len = 4096, + max_num_batched_tokens=8, + max_num_seqs=8, + trust_remote_code=True, + enforce_eager=True, + enable_expert_parallel=True) + outputs = llm.generate(prompts, sampling_params) + # Print the outputs. 
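# NOTE (assumption): VLLM_DP_RANK / VLLM_DP_SIZE / VLLM_DP_MASTER_IP /
# VLLM_DP_MASTER_PORT let every spawned process join the same data-parallel group,
# and each rank generates only for its own slice of the prompt list; a rank whose
# slice is empty still runs a "Placeholder" prompt so all ranks step together.
# Sketch of the sharding performed above:
#
#     per_rank = len(prompts) // dp_size
#     start = global_dp_rank * per_rank
#     shard = prompts[start:start + per_rank] or ["Placeholder"]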
+ for i, output in enumerate(outputs): + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"DP rank {global_dp_rank}, Prompt: {prompt!r}, " + f"Generated text: {generated_text!r}") + result_q.put(generated_text == except_list[i]) + + +def exec_ds_with_dp(new_yaml, replaced_pattern, dp_size, tp_size, prompts, except_list): + file = open('./config/predict_deepseek_r1_671b_w8a8.yaml', 'r') + content = file.read() + file.close() + + replace_data_parallel = re.compile(r'data_parallel: 1') + replace_model_parallel = re.compile(r'model_parallel: 16') + replace_expert_parallel = re.compile(r'expert_parallel: 1') + + content = replace_data_parallel.sub(replaced_pattern[0], content) + content = replace_model_parallel.sub(replaced_pattern[1], content) + content = replace_expert_parallel.sub(replaced_pattern[2], content) + + with tempfile.TemporaryDirectory() as tmp_dir: + new_yaml_path = os.path.join(tmp_dir, new_yaml) + with open(new_yaml_path, 'w') as f: + f.write(content) + env_manager.set_env_var("MINDFORMERS_MODEL_CONFIG", new_yaml_path) + + node_size = 1 + node_rank = 0 + dp_master_ip = "127.0.0.1" + dp_master_port = get_open_port() + + dp_per_node = dp_size // node_size + + result_q = Queue() + procs = [] + for local_dp_rank, global_dp_rank in enumerate( + range(node_rank * dp_per_node, (node_rank + 1) * dp_per_node)): + proc = Process(target=dp_func, + args=(dp_size, local_dp_rank, + global_dp_rank, dp_master_ip, dp_master_port, + tp_size, prompts, except_list, result_q)) + proc.start() + procs.append(proc) + exit_code = 0 + + for proc in procs: + proc.join(timeout=180) + if proc.exitcode is None: + print(f"Killing process {proc.pid} that " + f"didn't stop within 3 minutes.") + proc.kill() + exit_code = 1 + elif proc.exitcode: + exit_code = proc.exitcode + + assert exit_code == 0 + result = True + for proc in procs: + result = result and result_q.get() + assert result + + # unset env + env_manager.unset_all() + + +def exec_ds_without_dp(new_yaml, replaced_pattern, prompts, except_list): + file = open('./config/predict_deepseek_r1_671b_w8a8.yaml', 'r') + content = file.read() + file.close() + + replace_data_parallel = re.compile(r'data_parallel: 1') + replace_model_parallel = re.compile(r'model_parallel: 16') + replace_expert_parallel = re.compile(r'expert_parallel: 1') + + content = replace_data_parallel.sub(replaced_pattern[0], content) + content = replace_model_parallel.sub(replaced_pattern[1], content) + content = replace_expert_parallel.sub(replaced_pattern[2], content) + + with tempfile.TemporaryDirectory() as tmp_dir: + new_yaml_path = os.path.join(tmp_dir, new_yaml) + with open(new_yaml_path, 'w') as f: + f.write(content) + env_manager.set_env_var("MINDFORMERS_MODEL_CONFIG", new_yaml_path) + + + # Create a sampling params object. + sampling_params = SamplingParams(temperature=0.0, max_tokens=3, top_k=1, top_p=1.0, + repetition_penalty=1.0) + + # Create an LLM. + llm = LLM(model="/home/workspace/mindspore_dataset/weight/DeepSeek-R1-W8A8", + trust_remote_code=True, gpu_memory_utilization=0.9, tensor_parallel_size=8, max_model_len=4096) + # Generate texts from the prompts. The output is a list of RequestOutput objects + # that contain the prompt, generated text, and other information. + outputs = llm.generate(prompts, sampling_params) + # Print the outputs. 
+ for i, output in enumerate(outputs): + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") + assert generated_text == except_list[i] + + # unset env + env_manager.unset_all() + + + +@pytest.mark.level0 +@pytest.mark.platform_arm_ascend910b_training +@pytest.mark.allcards +def test_deepseek_r1_dp4_tp2_ep4(): + """ + test case deepseek r1 w8a8 dp4 tp2 ep4 + """ + new_yaml = "dp4_tp2_ep4.yaml" + replaced_pattern = ['data_parallel: 4', 'model_parallel: 2', 'expert_parallel: 4'] + dp_size = 4 + tp_size = 2 + # Sample prompts. + prompts = [ + "You are a helpful assistant.<|User|>将文本分类为中性、负面或正面。 \n文本:我认为这次假期还可以。 " + "\n情感:<|Assistant|>\n", + ] * 4 + + except_list = ['ugs611ాలు'] * 4 + exec_ds_with_dp(new_yaml, replaced_pattern, dp_size, tp_size, prompts, except_list) + + +@pytest.mark.level1 +@pytest.mark.platform_arm_ascend910b_training +@pytest.mark.allcards +def test_deepseek_r1_dp8_tp1_ep8(): + """ + test case deepseek r1 w8a8 Dp8 tp1 ep8 + """ + new_yaml = "dp8_tp1_ep8.yaml" + replaced_pattern = ['data_parallel: 8', 'model_parallel: 1', 'expert_parallel: 8'] + dp_size = 8 + tp_size = 1 + # Sample prompts. + prompts = [ + "You are a helpful assistant.<|User|>将文本分类为中性、负面或正面。 \n文本:我认为这次假期还可以。 " + "\n情感:<|Assistant|>\n", + ] * 8 + + except_list = ['ugs611ాలు'] * 8 + exec_ds_with_dp(new_yaml, replaced_pattern, dp_size, tp_size, prompts, except_list) + + +@pytest.mark.level1 +@pytest.mark.platform_arm_ascend910b_training +@pytest.mark.allcards +def test_deepseek_r1_dp2_tp4_ep1(): + """ + test case deepseek r1 w8a8 dp2 tp4 ep1 + """ + new_yaml = "dp2_tp4_ep1.yaml" + replaced_pattern = ['data_parallel: 2', 'model_parallel: 4', 'expert_parallel: 1'] + dp_size = 2 + tp_size = 4 + # Sample prompts. + prompts = [ + "You are a helpful assistant.<|User|>将文本分类为中性、负面或正面。 \n文本:我认为这次假期还可以。 " + "\n情感:<|Assistant|>\n", + ] * 2 + + except_list = ['ugs611ాలు'] * 2 + exec_ds_with_dp(new_yaml, replaced_pattern, dp_size, tp_size, prompts, except_list) + + +@pytest.mark.level1 +@pytest.mark.platform_arm_ascend910b_training +@pytest.mark.allcards +def test_deepseek_r1_dp4_tp2_ep8(): + """ + test case deepseek r1 w8a8 dp4 tp2 ep8 + """ + new_yaml = "dp4_tp2_ep8.yaml" + replaced_pattern = ['data_parallel: 4', 'model_parallel: 2', 'expert_parallel: 8'] + dp_size = 4 + tp_size = 2 + # Sample prompts. + prompts = [ + "You are a helpful assistant.<|User|>将文本分类为中性、负面或正面。 \n文本:我认为这次假期还可以。 " + "\n情感:<|Assistant|>\n", + ] * 4 + + except_list = ['ugs611ాలు'] * 4 + exec_ds_with_dp(new_yaml, replaced_pattern, dp_size, tp_size, prompts, except_list) + + +@pytest.mark.level1 +@pytest.mark.platform_arm_ascend910b_training +@pytest.mark.allcards +def test_deepseek_r1_dp8_tp1_ep1(): + """ + test case deepseek r1 w8a8 dp8 tp1 ep1 + """ + new_yaml = "dp8_tp1_ep1.yaml" + replaced_pattern = ['data_parallel: 8', 'model_parallel: 1', 'expert_parallel: 1'] + dp_size = 8 + tp_size = 1 + # Sample prompts. 
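# NOTE (assumption): with temperature=0.0, top_k=1 and the LCCL_DETERMINISTIC /
# HCCL_DETERMINISTIC settings exported above, greedy decoding is expected to be
# reproducible across runs, which is why each parallel layout can assert an exact
# 3-token match against except_list.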
+ prompts = [ + "You are a helpful assistant.<|User|>将文本分类为中性、负面或正面。 \n文本:我认为这次假期还可以。 " + "\n情感:<|Assistant|>\n", + ] * 8 + + except_list = ['ugs611ాలు'] * 8 + exec_ds_with_dp(new_yaml, replaced_pattern, dp_size, tp_size, prompts, except_list) + + +@pytest.mark.level1 +@pytest.mark.platform_arm_ascend910b_training +@pytest.mark.allcards +def test_deepseek_r1_dp8_tp1_ep4(): + """ + test case deepseek r1 w8a8 dp8 tp1 ep1 + """ + new_yaml = "dp8_tp1_ep4.yaml" + replaced_pattern = ['data_parallel: 8', 'model_parallel: 1', 'expert_parallel: 4'] + dp_size = 8 + tp_size = 1 + # Sample prompts. + prompts = [ + "You are a helpful assistant.<|User|>将文本分类为中性、负面或正面。 \n文本:我认为这次假期还可以。 " + "\n情感:<|Assistant|>\n", + ] * 8 + + except_list = ['ugs611ాలు'] * 8 + exec_ds_with_dp(new_yaml, replaced_pattern, dp_size, tp_size, prompts, except_list) + + +@pytest.mark.level1 +@pytest.mark.platform_arm_ascend910b_training +@pytest.mark.allcards +def test_deepseek_r1_tp8_ep8(): + """ + test case deepseek r1 w8a8 tp8 ep8 + """ + new_yaml = "tp8_ep8.yaml" + replaced_pattern = ['data_parallel: 1', 'model_parallel: 8', 'expert_parallel: 8'] + # Sample prompts. + prompts = [ + "You are a helpful assistant.<|User|>将文本分类为中性、负面或正面。 \n文本:我认为这次假期还可以。 " + "\n情感:<|Assistant|>\n", + ] + + except_list=['ugs611ాలు'] + exec_ds_without_dp(new_yaml, replaced_pattern, prompts, except_list) + + +@pytest.mark.level1 +@pytest.mark.platform_arm_ascend910b_training +@pytest.mark.allcards +def test_deepseek_r1_tp8_ep4(): + """ + test case deepseek r1 w8a8 tp8 ep4 + """ + new_yaml = "tp8_ep4.yaml" + replaced_pattern = ['data_parallel: 1', 'model_parallel: 8', 'expert_parallel: 4'] + # Sample prompts. + prompts = [ + "You are a helpful assistant.<|User|>将文本分类为中性、负面或正面。 \n文本:我认为这次假期还可以。 " + "\n情感:<|Assistant|>\n", + ] + + except_list=['ugs611ాలు'] + exec_ds_without_dp(new_yaml, replaced_pattern, prompts, except_list) -- Gitee From 0528bcc90ea4971e75f5054f863249fda61442ff Mon Sep 17 00:00:00 2001 From: zlq2020 Date: Sat, 17 May 2025 16:41:33 +0800 Subject: [PATCH 52/55] add install depended package scripts --- install_depend_pkgs.sh | 102 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 102 insertions(+) create mode 100644 install_depend_pkgs.sh diff --git a/install_depend_pkgs.sh b/install_depend_pkgs.sh new file mode 100644 index 00000000..a5d06518 --- /dev/null +++ b/install_depend_pkgs.sh @@ -0,0 +1,102 @@ +#!/bin/bash + +script_dir=$(cd "$(dirname $0)"; pwd) +yaml_file="$script_dir/.jenkins/test/config/dependent_packages.yaml" +work_dir="install_depend_pkgs" + +if [ ! -f "$yaml_file" ]; then + echo "$yaml_file does not exist." + exit 1 +fi + +if [ ! -d "$work_dir" ]; then + mkdir -p "$work_dir" + echo "Created $work_dir directory." +else + echo "$work_dir already exists. Removing existing whl packages." + rm -f "$work_dir"/*.whl +fi + +cd "$work_dir" || exit 1 + +get_yaml_value() { + local file="$1" + local key="$2" + + python3 -c " +import yaml +try: + with open('$file', 'r') as f: + data = yaml.safe_load(f) + print(data.get('$key', '')) +except Exception as e: + print(f'Error: {e}') + exit(1) +" +} + +echo "========= Installing vllm" +vllm_dir=vllm-v0.8.3 +if [ ! -d "$vllm_dir" ]; then + git clone https://github.com/vllm-project/vllm.git -b v0.8.3 "$vllm_dir" + cd "$vllm_dir" || { echo "Failed to git clone vllm!"; exit 1; } + git apply ../../vllm_dp/dp_scale_out.patch +else + echo "The $vllm_dir folder already exists and will not be re-downloaded." 
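# NOTE (assumption): installing vllm with VLLM_TARGET_DEVICE=empty builds no GPU
# kernels, and torch / torch-npu / torchvision are uninstalled afterwards because
# the torch namespace is expected to be supplied by MSAdapter on top of MindSpore;
# the dp_scale_out.patch applied above is the data-parallel scale-out patch kept
# in this repository under vllm_dp/.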
+ cd "$vllm_dir" || { echo "Failed to git clone vllm!"; exit 1; } +fi +pip uninstall msadapter -y +pip uninstall vllm -y +pip install -v -r requirements/cpu.txt --extra-index-url https://download.pytorch.org/whl/cpu +VLLM_TARGET_DEVICE=empty python setup.py install || { echo "Failed to install vllm"; exit 1; } +pip uninstall torch torch-npu torchvision -y +cd .. + + +echo "========= Installing mindspore" +python_v="cp$(python3 --version 2>&1 | grep -oP 'Python \K\d+\.\d+' | tr -d .)" +mindspore_path=$(get_yaml_value "$yaml_file" "mindspore") +mindspore_name="mindspore-2.6.0-${python_v}-${python_v}-linux_$(arch).whl" +mindspore_pkg="${mindspore_path}unified/$(arch)/${mindspore_name}" + +wget "$mindspore_pkg" --no-check-certificate || { echo "Failed to download mindspore"; exit 1; } +pip uninstall mindspore -y && pip install "$mindspore_name" || { echo "Failed to install mindspore"; exit 1; } + + +echo "========= Installing mindformers" +mf_dir=mindformers-os +if [ ! -d "$mf_dir" ]; then + git clone https://gitee.com/mindspore/mindformers.git -b br_infer_deepseek_os "$mf_dir" +else + echo "The $mf_dir folder already exists and will not be re-downloaded." +fi +if [ ! -d "$mf_dir" ]; then + echo "Failed to git clone mindformers!" + exit 1 +fi + + +echo "========= Installing mindspore golden-stick" +gs_dir=gs-develop +if [ ! -d "$gs_dir" ]; then + git clone https://gitee.com/mindspore/golden-stick.git -b develop "$gs_dir" +else + echo "The $gs_dir folder already exists and will not be re-downloaded." +fi +cd "$gs_dir" || { echo "Failed to git clone golden-stick!"; exit 1; } +pip uninstall mindspore-gs -y && pip install .|| { echo "Failed to install golden-stick"; exit 1; } +cd .. + + +echo "========= Installing msadapter" +msadapter_dir="MSAdapter" +if [ ! -d "$msadapter_dir" ]; then + git clone https://git.openi.org.cn/OpenI/MSAdapter.git +else + echo "The $msadapter_dir folder already exists and will not be re-downloaded." +fi +cd "$msadapter_dir" || { echo "Failed to git clone msadapter!"; exit 1; } +pip uninstall msadapter -y && python3 setup.py install || { echo "Failed to install msadapter"; exit 1; } +cd .. + +echo "========= All dependencies installed successfully!" -- Gitee From da953c3b89521092e94e276cd808894f60c5ea4b Mon Sep 17 00:00:00 2001 From: moran Date: Wed, 28 May 2025 10:09:11 +0800 Subject: [PATCH 53/55] update pr template --- .gitee/PULL_REQUEST_TEMPLATE.en.md | 38 +++++++++++++++++++++++++ .gitee/PULL_REQUEST_TEMPLATE.md | 38 +++++++++++++++++++++++++ .gitee/PULL_REQUEST_TEMPLATE.zh-CN.md | 41 +++++++++++++++++++++++++++ 3 files changed, 117 insertions(+) create mode 100644 .gitee/PULL_REQUEST_TEMPLATE.en.md create mode 100644 .gitee/PULL_REQUEST_TEMPLATE.md create mode 100644 .gitee/PULL_REQUEST_TEMPLATE.zh-CN.md diff --git a/.gitee/PULL_REQUEST_TEMPLATE.en.md b/.gitee/PULL_REQUEST_TEMPLATE.en.md new file mode 100644 index 00000000..c62f68d2 --- /dev/null +++ b/.gitee/PULL_REQUEST_TEMPLATE.en.md @@ -0,0 +1,38 @@ + + +**What type of PR is this?** + +/kind