From b3f77dab4dbc3331d0dba5fe99e442ece5a46e62 Mon Sep 17 00:00:00 2001 From: zlq2020 Date: Sun, 27 Apr 2025 14:57:13 +0800 Subject: [PATCH 01/55] update version 0.1->0.2.0 --- vllm_mindspore/version.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm_mindspore/version.txt b/vllm_mindspore/version.txt index ceab6e11..0ea3a944 100644 --- a/vllm_mindspore/version.txt +++ b/vllm_mindspore/version.txt @@ -1 +1 @@ -0.1 \ No newline at end of file +0.2.0 -- Gitee From beb8dece2912ee73eb79cc3eeec8ba6e909542da Mon Sep 17 00:00:00 2001 From: moran Date: Sun, 27 Apr 2025 14:32:21 +0800 Subject: [PATCH 02/55] fix codecheck --- codecheck_toolkits/vllm_codecheck.sh | 46 +++++++++---------- .../pyproject.toml => pyproject.toml | 32 ++++++------- 2 files changed, 36 insertions(+), 42 deletions(-) rename codecheck_toolkits/pyproject.toml => pyproject.toml (81%) diff --git a/codecheck_toolkits/vllm_codecheck.sh b/codecheck_toolkits/vllm_codecheck.sh index e67c7372..928c70bd 100644 --- a/codecheck_toolkits/vllm_codecheck.sh +++ b/codecheck_toolkits/vllm_codecheck.sh @@ -1,27 +1,26 @@ -pip install -r requirements-lint.txt +pip install -r codecheck_toolkits/requirements-lint.txt RET_FLAG=0 -cd .. -# yapf formats code automatically +# yapf check -MERGEBASE="$(git merge-base origin/master HEAD)" -if ! git diff --diff-filter=ACM --quiet --exit-code "$MERGEBASE" -- '*.py' '*.pyi' &> /dev/null; then - git diff --name-only --diff-filter=ACM "$MERGEBASE" -- '*.py' '*.pyi' | xargs -P 5 \ - yapf --in-place --recursive --parallel --exclude build/ +MERGEBASE="$(git merge-base origin/develop HEAD)" +if ! git diff --cached --diff-filter=ACM --quiet --exit-code "$MERGEBASE" -- '*.py' '*.pyi' &> /dev/null; then + git diff --cached --name-only --diff-filter=ACM "$MERGEBASE" -- '*.py' '*.pyi' | xargs -P 5 \ + yapf --diff --recursive --parallel --exclude tests/ fi if [[ $? -ne 0 ]]; then - echo "yapf run failed." + echo "yapf check failed." RET_FLAG=1 else - echo "yapf run success." + echo "yapf check success." fi # codespell check -if ! git diff --diff-filter=ACM --quiet --exit-code "$MERGEBASE" -- '*.py' '*.pyi' &> /dev/null; then - git diff --name-only --diff-filter=ACM "$MERGEBASE" -- '*.py' '*.pyi' | xargs -P 5 \ - codespell --skip ./vllm_mindspore/ops/ascendc/* +if ! git diff --cached --diff-filter=ACM --quiet --exit-code "$MERGEBASE" -- '*.py' '*.pyi' &> /dev/null; then + git diff --cached --name-only --diff-filter=ACM "$MERGEBASE" -- '*.py' '*.pyi' | xargs \ + codespell --skip "./vllm_mindspore/ops/ascendc/*" fi if [[ $? -ne 0 ]]; then echo "codespell check failed." @@ -31,8 +30,9 @@ else fi # ruff check -if ! git diff --diff-filter=ACM --quiet --exit-code "$MERGEBASE" -- '*.py' '*.pyi' &> /dev/null; then - git diff --name-only --diff-filter=ACM "$MERGEBASE" -- '*.py' '*.pyi' | xargs -P 5 \ +if ! git diff --cached --diff-filter=ACM --quiet --exit-code "$MERGEBASE" -- '*.py' '*.pyi' &> /dev/null; then + echo "ruff check is running..." + git diff --cached --name-only --diff-filter=ACM "$MERGEBASE" -- '*.py' '*.pyi' | xargs \ ruff check fi if [[ $? -ne 0 ]]; then @@ -42,24 +42,24 @@ else echo "ruff check success." fi -# isort fixed -if ! git diff --diff-filter=ACM --quiet --exit-code "$MERGEBASE" -- '*.py' '*.pyi' &> /dev/null; then - git diff --name-only --diff-filter=ACM "$MERGEBASE" -- '*.py' '*.pyi' | xargs -P 5 \ - isort +# isort check +if ! 
git diff --cached --diff-filter=ACM --quiet --exit-code "$MERGEBASE" -- '*.py' '*.pyi' &> /dev/null; then + git diff --cached --name-only --diff-filter=ACM "$MERGEBASE" -- '*.py' '*.pyi' | xargs \ + isort --check-only fi if [[ $? -ne 0 ]]; then - echo "isort fixed failed." + echo "isort check failed." RET_FLAG=1 else - echo "isort fixed success." + echo "isort check success." fi # mypy check type PYTHON_VERSION=$(python -c 'import sys; print(f"{sys.version_info.major}.{sys.version_info.minor}")') -if ! git diff --diff-filter=ACM --quiet --exit-code "$MERGEBASE" -- '*.py' '*.pyi' &> /dev/null; then - git diff --name-only --diff-filter=ACM "$MERGEBASE" -- '*.py' '*.pyi' | xargs -P 5 \ +if ! git diff --cached --diff-filter=ACM --quiet --exit-code "$MERGEBASE" -- '*.py' '*.pyi' &> /dev/null; then + git diff --cached --name-only --diff-filter=ACM "$MERGEBASE" -- '*.py' '*.pyi' | xargs \ mypy --follow-imports skip --python-version "${PYTHON_VERSION}" "$@" fi if [[ $? -ne 0 ]]; then @@ -69,4 +69,4 @@ else echo "mypy check success." fi -cd - || exit $RET_FLAG +exit $RET_FLAG diff --git a/codecheck_toolkits/pyproject.toml b/pyproject.toml similarity index 81% rename from codecheck_toolkits/pyproject.toml rename to pyproject.toml index 9a3c52de..65e258dc 100644 --- a/codecheck_toolkits/pyproject.toml +++ b/pyproject.toml @@ -18,15 +18,18 @@ build-backend = "setuptools.build_meta" [tool.ruff] # Allow lines to be as long as 80. line-length = 80 + exclude = [ # External file, leaving license intact "vllm_mindspore/__init__.py", - "tests/*" + "tests/*", + "setup.py" ] [tool.ruff.lint.per-file-ignores] "vllm_mindspore/version.txt" = ["F401"] "vllm_mindspore/_version.txt" = ["ALL"] +"setup.py" = ["ALL"] [tool.ruff.lint] select = [ @@ -64,28 +67,19 @@ follow_imports = "silent" # After fixing type errors resulting from follow_imports: "skip" -> "silent", # move the directory here and remove it from tools/mypy.sh -#files = [ -# "vllm/*.py", -# "vllm/adapter_commons", -# "vllm/assets", -# "vllm/entrypoints", -# "vllm/core", -# "vllm/inputs", -# "vllm/logging_utils", -# "vllm/multimodal", -# "vllm/platforms", -# "vllm/transformers_utils", -# "vllm/triton_utils", -# "vllm/usage", -#] -files= ["vllm_mindspore/*.py",] -# TODO(woosuk): Include the code from Megatron and HuggingFace. +files= ["vllm_mindspore/"] exclude = [ "vllm_mindspore/model_executor/parallel_utils/|vllm_mindspore/model_executor/models/", # Ignore triton kernels in ops. 
- 'vllm_mindspore/attention/ops/.*\.py$' + 'vllm_mindspore/attention/ops/.*\.py$', + 'setup.py', + 'tests/' ] +[[tool.mypy.overrides]] +module = "setup" +ignore_errors = true + [tool.codespell] ignore-words-list = "dout, te, indicies, subtile, ElementE, CANN" skip = "./tests/models/fixtures,./tests/prompts,./benchmarks/sonnet.txt,./tests/lora/data,./build" @@ -94,7 +88,7 @@ skip = "./tests/models/fixtures,./tests/prompts,./benchmarks/sonnet.txt,./tests/ use_parentheses = true skip_gitignore = true -skip_glob = ["tests/*", "vllm_mindspore/ops/*"] +skip_glob = ["tests/*", "vllm_mindspore/ops/*", "setup.py"] skip = ["vllm_mindspore/__init__.py"] [tool.pytest.ini_options] -- Gitee From aced8789e16b427ddbef81b83287889200e8fcfe Mon Sep 17 00:00:00 2001 From: zhanzhan1 Date: Mon, 28 Apr 2025 22:08:58 +0800 Subject: [PATCH 03/55] Bugfix for QWen --- .../model_executor/layers/vocab_parallel_embedding.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/vllm_mindspore/model_executor/layers/vocab_parallel_embedding.py b/vllm_mindspore/model_executor/layers/vocab_parallel_embedding.py index 81ebbe11..cec40d8b 100644 --- a/vllm_mindspore/model_executor/layers/vocab_parallel_embedding.py +++ b/vllm_mindspore/model_executor/layers/vocab_parallel_embedding.py @@ -87,12 +87,12 @@ def get_masked_input_and_mask( ) -> Tuple[Tensor, Tensor]: displaced_x = mint.sub(input_, org_vocab_start_index) down_truncated_x = mint.nn.functional.relu(displaced_x) - truncated_x = mint.minimum(down_truncated_x, org_vocab_end_index) + truncated_x = mint.minimum(down_truncated_x, (org_vocab_end_index - org_vocab_start_index - 1)) org_vocab_mask = mint.eq(displaced_x, truncated_x) displaced_x = mint.sub(input_, added_vocab_start_index) down_truncated_x = mint.nn.functional.relu(displaced_x) - truncated_x = mint.minimum(down_truncated_x, added_vocab_end_index) + truncated_x = mint.minimum(down_truncated_x, (added_vocab_end_index - added_vocab_start_index - 1)) added_vocab_mask = mint.eq(displaced_x, truncated_x) added_offset = added_vocab_start_index - ( org_vocab_end_index - org_vocab_start_index) - num_org_vocab_padding @@ -197,8 +197,8 @@ class VocabParallelEmbedding(nn.Cell): ): super().__init__() # Keep the input dimensions. 
- tp_rank = get_tensor_model_parallel_rank() # 获取tp并行的rank - self.tp_size = get_tensor_model_parallel_world_size() # 获取tp并行的world_size + self.tp_rank = get_tensor_model_parallel_rank() + self.tp_size = get_tensor_model_parallel_world_size() self.num_embeddings = num_embeddings self.padding_size = padding_size self.org_vocab_size = org_num_embeddings or num_embeddings @@ -216,7 +216,7 @@ class VocabParallelEmbedding(nn.Cell): self.org_vocab_size_padded, self.num_embeddings, self.org_vocab_size, - tp_rank, + self.tp_rank, self.tp_size, ) -- Gitee From 6ec2466a94a42da7353a60f92b38b6f12199793f Mon Sep 17 00:00:00 2001 From: yyyyrf Date: Mon, 28 Apr 2025 15:15:40 +0800 Subject: [PATCH 04/55] [feature] support high precision gptq algo slpit online --- .../model_executor/models/mf_models/deepseek_v3.py | 11 +++++++++-- .../models/mf_models/deepseekv3_weight_processor.py | 13 +++++++++---- 2 files changed, 18 insertions(+), 6 deletions(-) diff --git a/vllm_mindspore/model_executor/models/mf_models/deepseek_v3.py b/vllm_mindspore/model_executor/models/mf_models/deepseek_v3.py index e0ede946..e7cda00c 100644 --- a/vllm_mindspore/model_executor/models/mf_models/deepseek_v3.py +++ b/vllm_mindspore/model_executor/models/mf_models/deepseek_v3.py @@ -184,8 +184,11 @@ class DeepseekV3ForCausalLM(MfModelBase): cfg = PTQConfig(mode=quant_mode, backend=BackendTarget.ASCEND, weight_quant_dtype=msdtype.qint4x2, algo_args=gptq_config, act_quant_dtype=None, precision_recovery=PrecisionRecovery.GPTQ, weight_quant_granularity=QuantGranularity.PER_GROUP, opname_blacklist=['lm_head', 'lkv2kv'], - group_size=128) - layer_policies = OrderedDict() + group_size=64) + w2_config = PTQConfig(mode=quant_mode, backend=BackendTarget.ASCEND, weight_quant_dtype=msdtype.int8, + act_quant_dtype=msdtype.int8, outliers_suppression=OutliersSuppressionType.SMOOTH) + layer_policies = OrderedDict({r'.*\.feed_forward\.w2.*': w2_config, + r'.*\.shared_experts.w2.*': w2_config}) elif quant_type.lower() == 'smoothquant': cfg = PTQConfig(mode=quant_mode, backend=BackendTarget.ASCEND, weight_quant_dtype=msdtype.int8, act_quant_dtype=msdtype.int8, outliers_suppression=OutliersSuppressionType.SMOOTH, @@ -217,5 +220,9 @@ class DeepseekV3ForCausalLM(MfModelBase): # pylint: disable=protected-access ptq._config.aclnn_quant_list = ["routed_experts.ffn.w_gate_hidden", "routed_experts.ffn.w1", "routed_experts.ffn.w3"] + if 'gptq-pergroup' in quant_type.lower(): + # pylint: disable=protected-access + ptq.layer_policies[r'.*\.feed_forward\.w2.*'].aclnn_quant_list = ["w2"] + ptq.layer_policies[r'.*\.shared_experts.w2.*'].aclnn_quant_list = ["w2"] ptq.decoder_layer_types.append(DeepseekV3DecodeLayer) return ptq diff --git a/vllm_mindspore/model_executor/models/mf_models/deepseekv3_weight_processor.py b/vllm_mindspore/model_executor/models/mf_models/deepseekv3_weight_processor.py index 642897ed..de7d70d0 100644 --- a/vllm_mindspore/model_executor/models/mf_models/deepseekv3_weight_processor.py +++ b/vllm_mindspore/model_executor/models/mf_models/deepseekv3_weight_processor.py @@ -1263,7 +1263,9 @@ class DeepseekV3WeightProcessor(BaseWeightProcessor): else: value, _ = self.get_safetensor_from_file(param_name, src_hf_dir, hf_weight_map) - if "wo._layer.matmul.quant_bias" in param_name and get_tensor_model_parallel_rank() != 0: + quant_bias_set_zero = ["wo._layer.matmul.quant_bias", "w2._layer.matmul.quant_bias"] + if any([name in param_name for name in quant_bias_set_zero]) and \ + get_tensor_model_parallel_rank() != 0: value.fill(0) return value @@ 
-1365,11 +1367,13 @@ class DeepseekV3WeightProcessor(BaseWeightProcessor): value, is_int4 = self.get_safetensor_from_file(param_name, src_hf_dir, hf_weight_map, is_split_param=True, split_axis=1) - elif any([name in param_name for name in [".feed_forward.w2.", ".wo.", - "shared_experts.w2"]]): + elif any([name in param_name for name in [".wo."]]): value, is_int4 = self.get_safetensor_from_file(param_name, src_hf_dir, hf_weight_map, is_split_param=True, split_axis=0) + elif any([name in param_name for name in [".feed_forward.w2.","shared_experts.w2"]]): + value = self.infer_smooth_quant_row_linear_split(param_name, src_hf_dir, hf_weight_map) + is_int4 = False elif ".routed_experts.ffn.w_gate_hidden." in param_name: value, is_int4 = self.get_safetensor_from_file(param_name, src_hf_dir, hf_weight_map) value_list = [] @@ -1430,7 +1434,8 @@ class DeepseekV3WeightProcessor(BaseWeightProcessor): quantization_config = self.config.model.model_config.quantization_config quant_method = quantization_config.quant_method if quantization_config else None - if not quant_method or (quant_method != "gptq-pergroup" and quant_method != "smoothquant") and \ + support_quant_method = ["gptq-pergroup", "smoothquant"] + if not quant_method or (quant_method not in support_quant_method) and \ not is_mtp_model: self.infer_convert_outer_weight(src_hf_dir, hf_weight_map) -- Gitee From 7691430684c5d029af551cd676f09d7426a5ac73 Mon Sep 17 00:00:00 2001 From: dayschan Date: Sat, 10 May 2025 18:57:53 +0800 Subject: [PATCH 05/55] default building custom ops --- setup.py | 22 ++-- tests/st/python/test_custom_advstepflash.py | 117 +++++++++++++++++++ vllm_mindspore/attention/backends/ms_attn.py | 4 +- 3 files changed, 128 insertions(+), 15 deletions(-) create mode 100644 tests/st/python/test_custom_advstepflash.py diff --git a/setup.py b/setup.py index 8e2154b3..5296b18b 100644 --- a/setup.py +++ b/setup.py @@ -101,12 +101,11 @@ version = (Path("vllm_mindspore") / "version.txt").read_text() def _get_ascend_home_path(): return os.environ.get("ASCEND_HOME_PATH", "/usr/local/Ascend/ascend-toolkit/latest") -def _get_ascend_env_path(check_exists=True): - env_script_path = os.path.join(_get_ascend_home_path(), "bin", "setenv.bash") - if check_exists and not os.path.exists(env_script_path): - warnings.warn(f"The file '{env_script_path}' is not found, " - "please make sure env variable 'ASCEND_HOME_PATH' is set correctly.") - return None +def _get_ascend_env_path(): + env_script_path = os.path.realpath(os.path.join(_get_ascend_home_path(), "..", "set_env.sh")) + if not os.path.exists(env_script_path): + raise ValueError(f"The file '{env_script_path}' is not found, " + "please make sure environment variable 'ASCEND_HOME_PATH' is set correctly.") return env_script_path class CustomBuildExt(build_ext): @@ -128,7 +127,7 @@ class CustomBuildExt(build_ext): os.makedirs(BUILD_OPS_DIR, exist_ok=True) ascend_home_path = _get_ascend_home_path() - env_script_path = _get_ascend_env_path(False) + env_script_path = _get_ascend_env_path() build_extension_dir = os.path.join(BUILD_OPS_DIR, "kernel_meta", ext_name) # Combine all cmake commands into one string cmake_cmd = ( @@ -176,12 +175,9 @@ package_data = { def _get_ext_modules(): ext_modules = [] - # Currently, the CI environment does not support the compilation of custom operators. - # As a temporary solution, this is controlled via an environment variable. - # Once the CI environment adds support for custom operator compilation, - # this should be updated to enable compilation by default. 
-    if os.getenv("vLLM_USE_NPU_ADV_STEP_FLASH_OP", "off") == "on" and _get_ascend_env_path() is not None:
-        ext_modules.append(Extension("vllm_mindspore.npu_ops", sources=[]))  # sources are specified in CMakeLists.txt
+    if os.path.exists(_get_ascend_home_path()):
+        # sources are specified in CMakeLists.txt
+        ext_modules.append(Extension("vllm_mindspore.npu_ops", sources=[]))
     return ext_modules
 
 setup(
diff --git a/tests/st/python/test_custom_advstepflash.py b/tests/st/python/test_custom_advstepflash.py
new file mode 100644
index 00000000..4968ee46
--- /dev/null
+++ b/tests/st/python/test_custom_advstepflash.py
@@ -0,0 +1,117 @@
+# Copyright 2024 The vLLM team.
+# Copyright 2024 Microsoft and the HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+"""test case for custom op adv_step_flash"""
+
+import time
+import pytest
+from vllm_mindspore import npu_ops
+import numpy as np
+import mindspore as ms
+import torch
+
+def benchmark_advance_step_op(sampled_token_ids,
+                              input_tokens,
+                              input_positions,
+                              seq_lens_tensor,
+                              num_queries,
+                              block_size,
+                              block_tables,
+                              slot_mapping):
+    # update input_tokens
+    sampled_token_ids_list = sampled_token_ids[:num_queries].squeeze(-1)
+    input_tokens[:num_queries] = sampled_token_ids_list
+
+    # get seq_lens and input_positions
+    seq_lens = seq_lens_tensor[:num_queries]
+    next_seq_lens = seq_lens + 1
+    next_input_pos = next_seq_lens - 1
+
+    # update seq_lens and input_positions
+    seq_lens_tensor[:num_queries] = next_seq_lens
+    input_positions[:num_queries] = next_input_pos
+
+    block_idx = next_input_pos // block_size
+    block_offset = next_input_pos % block_size
+
+    current_block_table = block_tables.gather(1, block_idx.unsqueeze(-1)).squeeze(-1)
+    slot_num = current_block_table * block_size + block_offset
+
+    # update slot_mapping
+    slot_mapping[:num_queries] = slot_num
+
+def gendata(seed, num_seqs, block_size, block_num, make_tensor):
+    """generate inputs"""
+    np.random.seed(seed)
+    sampled_token_ids = np.random.randint(65536, size=(num_seqs,), dtype=np.int64)
+    input_tokens = np.random.randint(100, size=(num_seqs,), dtype=np.int64)  # out
+    input_positions = np.random.randint(100, size=(num_seqs,), dtype=np.int64)  # out
+    seq_lens_tensor = np.random.randint(block_size * block_num - 1, size=(num_seqs,), dtype=np.int64)  # inplace
+    block_tables = np.random.randint(1024, size=(num_seqs, block_num), dtype=np.int64)
+    slot_mapping = np.random.randint(100, size=(num_seqs,), dtype=np.int64)  # out
+    return (make_tensor(sampled_token_ids), \
+            make_tensor(input_tokens), \
+            make_tensor(input_positions), \
+            make_tensor(seq_lens_tensor), \
+            make_tensor(block_tables), \
+            make_tensor(slot_mapping))
+
+
+class TestCustomAdvStepFlash:
+    """
+    Test Custom op AdvStepFlash.
+ """ + @pytest.mark.level0 + @pytest.mark.platform_arm_ascend910b_training + @pytest.mark.env_single + def test_advstepflash(self): + """ + test case + """ + seed = int(time.time() * 1000) % 1000000009 + num_seqs = 256 + block_size = 32 + block_num = 4 + num_queries = num_seqs # no padding + print("test seed:", seed, flush=True) + sampled_token_ids1, input_tokens1, input_positions1, seq_lens_tensor1, block_tables1, slot_mapping1 = \ + gendata(seed, num_seqs, block_size, block_num, torch.Tensor) + benchmark_advance_step_op(sampled_token_ids1, + input_tokens1, + input_positions1, + seq_lens_tensor1, + num_queries, + block_size, + block_tables1, + slot_mapping1) + + sampled_token_ids2, input_tokens2, input_positions2, seq_lens_tensor2, block_tables2, slot_mapping2 = \ + gendata(seed, num_seqs, block_size, block_num, ms.Tensor) + npu_ops.adv_step_flash(num_seqs=num_seqs, + num_queries=num_queries, + block_size=block_size, + input_tokens=input_tokens2, + sampled_token_ids=sampled_token_ids2, + input_positions=input_positions2, + seq_lens=seq_lens_tensor2, + slot_mapping=slot_mapping2, + block_tables=block_tables2) + + assert np.allclose(sampled_token_ids1, sampled_token_ids2.asnumpy()) + assert np.allclose(input_tokens1, input_tokens2.asnumpy()) + assert np.allclose(input_positions1, input_positions2.asnumpy()) + assert np.allclose(seq_lens_tensor1, seq_lens_tensor2.asnumpy()) + assert np.allclose(block_tables1, block_tables2.asnumpy()) + assert np.allclose(slot_mapping1, slot_mapping2.asnumpy()) diff --git a/vllm_mindspore/attention/backends/ms_attn.py b/vllm_mindspore/attention/backends/ms_attn.py index 558882cd..d6123b0a 100644 --- a/vllm_mindspore/attention/backends/ms_attn.py +++ b/vllm_mindspore/attention/backends/ms_attn.py @@ -312,8 +312,8 @@ class MSAttentionMetadata(AttentionMetadata, PagedAttentionMetadata): self.seq_lens[i] += 1 self.max_decode_seq_len = max(self.seq_lens) - # default use python op - if os.getenv("vLLM_USE_NPU_ADV_STEP_FLASH_OP", "off") == "on": + # default use ascendc op + if os.getenv("vLLM_USE_NPU_ADV_STEP_FLASH_OP", "on") != "off": from vllm_mindspore import npu_ops npu_ops.adv_step_flash(num_seqs=num_seqs, num_queries=num_queries, -- Gitee From 6e0a970a7675f1cb494be6f2ef4544cf75ef5d0d Mon Sep 17 00:00:00 2001 From: lvhaoyu Date: Thu, 20 Mar 2025 19:47:55 +0800 Subject: [PATCH 06/55] update v1 --- vllm_mindspore/__init__.py | 80 ++- vllm_mindspore/compilation/__init__.py | 0 vllm_mindspore/compilation/inductor_pass.py | 73 ++ vllm_mindspore/config.py | 138 +++- vllm_mindspore/engine/arg_utils.py | 222 +++++++ .../model_executor/layers/sampler.py | 4 + .../model_executor/model_loader/utils.py | 2 +- .../models/mf_models/deepseek_v3.py | 10 +- .../models/mf_models/mf_model_base.py | 147 ++-- .../model_executor/models/mf_models/qwen2.py | 11 +- .../model_executor/models/model_base.py | 54 +- .../model_executor/models/registry.py | 9 +- vllm_mindspore/platforms/ascend.py | 37 +- vllm_mindspore/scripts.py | 3 +- vllm_mindspore/utils.py | 4 +- vllm_mindspore/v1/__init__.py | 0 vllm_mindspore/v1/attention/__init__.py | 0 .../v1/attention/backends/__init__.py | 0 .../v1/attention/backends/flash_attn.py | 237 +++++++ vllm_mindspore/v1/sample/__init__.py | 0 vllm_mindspore/v1/sample/ops/__init__.py | 0 vllm_mindspore/v1/sample/ops/penalties.py | 21 + .../v1/sample/ops/topk_topp_sampler.py | 98 +++ vllm_mindspore/v1/sample/rejection_sampler.py | 627 ++++++++++++++++++ vllm_mindspore/v1/sample/sampler.py | 10 + vllm_mindspore/v1/spec_decode/__init__.py | 0 
vllm_mindspore/v1/spec_decode/eagle.py | 258 +++++++ vllm_mindspore/v1/utils.py | 12 + vllm_mindspore/v1/worker/__init__.py | 0 vllm_mindspore/v1/worker/block_table.py | 93 +++ vllm_mindspore/v1/worker/gpu_input_batch.py | 85 +++ vllm_mindspore/v1/worker/gpu_model_runner.py | 420 ++++++++++++ 32 files changed, 2582 insertions(+), 73 deletions(-) create mode 100644 vllm_mindspore/compilation/__init__.py create mode 100644 vllm_mindspore/compilation/inductor_pass.py create mode 100644 vllm_mindspore/engine/arg_utils.py create mode 100644 vllm_mindspore/v1/__init__.py create mode 100644 vllm_mindspore/v1/attention/__init__.py create mode 100644 vllm_mindspore/v1/attention/backends/__init__.py create mode 100644 vllm_mindspore/v1/attention/backends/flash_attn.py create mode 100644 vllm_mindspore/v1/sample/__init__.py create mode 100644 vllm_mindspore/v1/sample/ops/__init__.py create mode 100644 vllm_mindspore/v1/sample/ops/penalties.py create mode 100644 vllm_mindspore/v1/sample/ops/topk_topp_sampler.py create mode 100644 vllm_mindspore/v1/sample/rejection_sampler.py create mode 100644 vllm_mindspore/v1/sample/sampler.py create mode 100644 vllm_mindspore/v1/spec_decode/__init__.py create mode 100644 vllm_mindspore/v1/spec_decode/eagle.py create mode 100644 vllm_mindspore/v1/utils.py create mode 100644 vllm_mindspore/v1/worker/__init__.py create mode 100644 vllm_mindspore/v1/worker/block_table.py create mode 100644 vllm_mindspore/v1/worker/gpu_input_batch.py create mode 100644 vllm_mindspore/v1/worker/gpu_model_runner.py diff --git a/vllm_mindspore/__init__.py b/vllm_mindspore/__init__.py index 47a9e4d5..a943c92f 100644 --- a/vllm_mindspore/__init__.py +++ b/vllm_mindspore/__init__.py @@ -29,6 +29,10 @@ from vllm_mindspore.scripts import env_setup env_setup() +# should be place on the top of the file. 
+from vllm_mindspore.compilation import inductor_pass as ms_inductor_pass +sys.modules.update({"vllm.compilation.inductor_pass": ms_inductor_pass}) + from vllm_mindspore.platforms.ascend import AscendPlatform ascend_platform = AscendPlatform() @@ -45,6 +49,13 @@ import vllm.utils vllm.utils.current_platform = ascend_platform +import vllm.attention.selector +vllm.attention.selector.current_platform = ascend_platform + +import vllm.engine.arg_utils +from vllm_mindspore.engine.arg_utils import _is_v1_supported_oracle +vllm.engine.arg_utils.EngineArgs._is_v1_supported_oracle = _is_v1_supported_oracle + from vllm_mindspore.utils import ( direct_register_custom_op, make_tensor_with_pad, @@ -71,6 +82,7 @@ from vllm_mindspore.model_executor.models.registry import ( _SUBPROCESS_COMMAND, ) + vllm.config.ModelRegistry = MindSporeModelRegistry import vllm.model_executor @@ -160,6 +172,11 @@ from vllm.executor.multiproc_worker_utils import get_mp_context vllm.executor.multiproc_worker_utils.get_mp_context = ms_get_mp_context +import vllm.v1.executor.multiproc_executor +vllm.v1.executor.multiproc_executor.get_mp_context = ms_get_mp_context +import vllm.v1.utils +vllm.v1.utils.get_mp_context = ms_get_mp_context + from vllm_mindspore.executor.ray_gpu_executor import ( ms_init_workers_ray, initialize_ray_cluster, @@ -179,11 +196,14 @@ vllm.engine.llm_engine.initialize_ray_cluster = initialize_ray_cluster vllm.engine.async_llm_engine.initialize_ray_cluster = initialize_ray_cluster -from .config import _verify_quantization, _verify_args, vllm_config_post_init +from .config import _verify_quantization, _verify_args, vllm_config_post_init, model_post_init, \ + _get_and_verify_dtype vllm.config.ModelConfig._verify_quantization = _verify_quantization vllm.config.VllmConfig.__post_init__ = vllm_config_post_init vllm.config.SchedulerConfig._verify_args = _verify_args +vllm.config.CompilationConfig.model_post_init = model_post_init +vllm.config._get_and_verify_dtype = _get_and_verify_dtype from .utils import update_modules from vllm_mindspore.attention.backends import ms_attn @@ -213,6 +233,64 @@ RejectionSampler._smallest_positive_value = _smallest_positive_value RejectionSampler._smallest_positive_value.__set_name__(RejectionSampler, '_smallest_positive_value') vllm.model_executor.layers.rejection_sampler._multinomial = _multinomial +from vllm_mindspore.v1.sample import rejection_sampler +update_modules("vllm.v1.sample.rejection_sampler", rejection_sampler) + +from vllm_mindspore.v1.spec_decode import eagle +update_modules("vllm.v1.spec_decode.eagle", eagle) + +from vllm_mindspore.v1.attention.backends import flash_attn +import vllm.v1.attention.backends +sys.modules['vllm.v1.attention.backends.flash_attn'] = flash_attn +import vllm.v1.attention.backends.flash_attn + +import vllm.v1.worker.gpu_model_runner + +from vllm_mindspore.v1.worker.gpu_model_runner import _prepare_inputs +vllm.v1.worker.gpu_model_runner.GPUModelRunner._prepare_inputs = _prepare_inputs + +from vllm_mindspore.v1.worker.gpu_model_runner import _update_states +vllm.v1.worker.gpu_model_runner.GPUModelRunner._update_states = _update_states + +from vllm_mindspore.v1.worker.gpu_model_runner import initialize_kv_cache +vllm.v1.worker.gpu_model_runner.GPUModelRunner.initialize_kv_cache = initialize_kv_cache + +import vllm.v1.worker.block_table +from vllm_mindspore.v1.worker.block_table import BlockTable +vllm.v1.worker.block_table.BlockTable = BlockTable +vllm.v1.worker.gpu_input_batch.BlockTable = BlockTable + +import vllm.v1.worker.gpu_input_batch 
+from vllm_mindspore.v1.worker.gpu_input_batch import _make_sampling_metadata, _make_prompt_token_ids_tensor +vllm.v1.worker.gpu_input_batch.InputBatch._make_sampling_metadata = _make_sampling_metadata +vllm.v1.worker.gpu_model_runner.InputBatch._make_sampling_metadata = _make_sampling_metadata +vllm.v1.worker.gpu_input_batch.InputBatch._make_prompt_token_ids_tensor = _make_prompt_token_ids_tensor +vllm.v1.worker.gpu_model_runner.InputBatch._make_prompt_token_ids_tensor = _make_prompt_token_ids_tensor + +from vllm.v1.worker.gpu_worker import Worker + +Worker.__init__ = wrapper_worker_init(Worker.__init__) +Worker.init_device = wrapper_worker_init_device(Worker.init_device) + + +import vllm.v1.utils +from vllm_mindspore.v1.utils import copy_slice +vllm.v1.utils.copy_slice = copy_slice +vllm.v1.worker.gpu_input_batch.copy_slice = copy_slice + +from vllm_mindspore.v1.sample.ops.penalties import _convert_to_tensors +import vllm.v1.sample.ops.penalties +vllm.v1.sample.ops.penalties._convert_to_tensors = _convert_to_tensors + +from vllm_mindspore.v1.sample.ops.topk_topp_sampler import apply_top_k_top_p, random_sample +import vllm.v1.sample.ops.topk_topp_sampler +vllm.v1.sample.ops.topk_topp_sampler.apply_top_k_top_p = apply_top_k_top_p +vllm.v1.sample.ops.topk_topp_sampler.random_sample = random_sample + +from vllm_mindspore.v1.sample.sampler import apply_temperature +import vllm.v1.sample.sampler +vllm.v1.sample.sampler.Sampler.apply_temperature = apply_temperature + from .utils import check_ready from vllm_mindspore.engine.multiprocessing.engine import cleanup diff --git a/vllm_mindspore/compilation/__init__.py b/vllm_mindspore/compilation/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/vllm_mindspore/compilation/inductor_pass.py b/vllm_mindspore/compilation/inductor_pass.py new file mode 100644 index 00000000..835a1e11 --- /dev/null +++ b/vllm_mindspore/compilation/inductor_pass.py @@ -0,0 +1,73 @@ +# SPDX-License-Identifier: Apache-2.0 + +import hashlib +import inspect +import json +import types +from typing import Any, Callable, Dict, Optional, Union + +import torch +from packaging.version import Version + + +class InductorPass(): + """ + A custom graph pass that uses a hash of its source as the UUID. + This is defined as a convenience and should work in most cases. + """ + + def uuid(self) -> Any: + """ + Provide a unique identifier for the pass, used in Inductor code cache. + This should depend on the pass implementation, so that changes to the + pass result in recompilation. + By default, the object source is hashed. + """ + return InductorPass.hash_source(self) + + @staticmethod + def hash_source(*srcs: Union[str, Any]): + """ + Utility method to hash the sources of functions or objects. + :param srcs: strings or objects to add to the hash. + Objects and functions have their source inspected. + :return: + """ + hasher = hashlib.sha256() + for src in srcs: + if isinstance(src, str): + src_str = src + elif isinstance(src, types.FunctionType): + src_str = inspect.getsource(src) + else: + src_str = inspect.getsource(src.__class__) + hasher.update(src_str.encode("utf-8")) + return hasher.hexdigest() + + @staticmethod + def hash_dict(dict_: Dict[Any, Any]): + """ + Utility method to hash a dictionary, can alternatively be used for uuid. + :return: A sha256 hash of the json rep of the dictionary. 
+        """
+        encoded = json.dumps(dict_, sort_keys=True).encode("utf-8")
+        return hashlib.sha256(encoded).hexdigest()
+
+
+class CallableInductorPass(InductorPass):
+    """
+    This class is a wrapper for a callable that automatically provides an
+    implementation of the UUID.
+    """
+
+    def __init__(self,
+                 callable,
+                 uuid: Optional[Any] = None):
+        self.callable = callable
+        self._uuid = self.hash_source(callable) if uuid is None else uuid
+
+    def __call__(self, graph):
+        self.callable(graph)
+
+    def uuid(self) -> Any:
+        return self._uuid
diff --git a/vllm_mindspore/config.py b/vllm_mindspore/config.py
index e702278e..b6366434 100644
--- a/vllm_mindspore/config.py
+++ b/vllm_mindspore/config.py
@@ -15,13 +15,22 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ============================================================================
+from collections import Counter
+from typing import Union
+import sys
+
 import torch
+from transformers import PretrainedConfig
+
+
 import vllm.envs as envs
-from vllm.config import VllmConfig, CompilationConfig, CompilationLevel, logger
+from vllm.config import VllmConfig, CompilationConfig, CompilationLevel, logger, _STR_DTYPE_TO_TORCH_DTYPE
 from vllm.utils import random_uuid
 from vllm.logger import init_logger
+from vllm.compilation.inductor_pass import CallableInductorPass, InductorPass
+from vllm.platforms import CpuArchEnum
 
 logger = init_logger(__name__)
 
@@ -80,8 +89,14 @@ def vllm_config_post_init(self):
         self.compilation_config.use_inductor = True
         self.compilation_config.cudagraph_num_of_warmups = 1
         self.compilation_config.pass_config.enable_fusion = False
-        self.compilation_config.pass_config.enable_reshape = False
-        self.compilation_config.level = CompilationLevel.PIECEWISE
+        self.compilation_config.pass_config.enable_noop = False
+        # When level is set to CompilationLevel.PIECEWISE, vllm will use cuda graph,
+        # which means the model inputs will be padded to a cuda graph acceptable size,
+        # but that is not the case for mindspore. So set it to CompilationLevel.DYNAMO_AS_IS here.
+        self.compilation_config.level = CompilationLevel.DYNAMO_AS_IS
+        # Set a small compile_sizes for warmup. '20' is not in 'cudagraph_capture_sizes'.
+        # So the warmup can be run.
+ self.compilation_config.compile_sizes = [20] self._set_cudagraph_sizes() @@ -160,3 +175,120 @@ def _verify_args(self) -> None: f"max_long_partial_prefills ({self.max_long_partial_prefills}) " "must be greater than or equal to 1 and less than or equal to " f"max_num_partial_prefills ({self.max_num_partial_prefills}).") + + +def model_post_init(self, __context) -> None: + + count_none = self.custom_ops.count("none") + count_all = self.custom_ops.count("all") + assert count_none + count_all <= 1, "Can only specify 'none' or 'all'" + + if self.splitting_ops is None: + self.splitting_ops = [] + + for k, v in self.inductor_passes.items(): + if not isinstance(v, str): + assert callable(v), ( + f"pass {k} should be callable or a qualified name") + self.inductor_compile_config[k] = v if isinstance( + v, InductorPass) else CallableInductorPass(v) + continue + + # resolve function from qualified name + names = v.split(".") + module = ".".join(names[:-1]) + func_name = names[-1] + func = __import__(module).__dict__[func_name] + self.inductor_compile_config[k] = func if isinstance( + func, InductorPass) else CallableInductorPass(func) + + self.enabled_custom_ops = Counter() + self.disabled_custom_ops = Counter() + self.traced_files = set() + self.static_forward_context = {} + self.compilation_time = 0.0 + + +def _get_and_verify_dtype( + config: PretrainedConfig, + dtype: Union[str, torch.dtype], +) -> torch.dtype: + # NOTE: getattr(config, "torch_dtype", torch.float32) is not correct + # because config.torch_dtype can be None. + config_dtype = getattr(config, "torch_dtype", None) + + # Fallbacks for multi-modal models if the root config + # does not define torch_dtype + if config_dtype is None and hasattr(config, "text_config"): + config_dtype = getattr(config.text_config, "torch_dtype", None) + if config_dtype is None and hasattr(config, "vision_config"): + config_dtype = getattr(config.vision_config, "torch_dtype", None) + + if config_dtype is None: + config_dtype = torch.float32 + + if isinstance(dtype, str): + dtype = dtype.lower() + if dtype == "auto": + if config_dtype == torch.float32: + # Following common practice, we use float16 for float32 models + torch_dtype = torch.float16 + else: + torch_dtype = config_dtype + + from vllm.platforms import current_platform + if (current_platform.is_cpu() + and current_platform.get_cpu_architecture() + == CpuArchEnum.POWERPC + and (config_dtype == torch.float16 + or config_dtype == torch.float32)): + logger.info( + "For POWERPC, we cast models to bfloat16 instead of " + "using float16 by default. Float16 is not currently " + "supported for POWERPC.") + torch_dtype = torch.bfloat16 + + # TODO: change this condition to check if the platform support bf16 + # instead of checking the OS. For instance M2 shall supports bf16 + # already. But we need to modify `cpu_extension.cmake` to activate + # the feature in the build. + if (current_platform.is_cpu() and sys.platform.startswith("darwin") + and current_platform.get_cpu_architecture() + == CpuArchEnum.ARM and config_dtype == torch.bfloat16): + logger.info("For macOS with Apple Silicon, currently bfloat16 " + "is not supported. Setting dtype to float16.") + torch_dtype = torch.float16 + + if current_platform.is_hpu() and config_dtype == torch.float16: + logger.info( + "For HPU, we cast models to bfloat16 instead of " + "using float16 by default. 
Please specify `dtype` if you " + "want to use float16.") + torch_dtype = torch.bfloat16 + else: + if dtype not in _STR_DTYPE_TO_TORCH_DTYPE: + raise ValueError(f"Unknown dtype: {dtype}") + torch_dtype = _STR_DTYPE_TO_TORCH_DTYPE[dtype] + elif isinstance(dtype, torch.dtype): + torch_dtype = dtype + else: + raise ValueError(f"Unknown dtype: {dtype}") + + # Verify the dtype. + if torch_dtype != config_dtype: + if torch_dtype == torch.float32: + # Upcasting to float32 is allowed. + logger.info("Upcasting %s to %s.", config_dtype, torch_dtype) + pass + elif config_dtype == torch.float32: + # Downcasting from float32 to float16 or bfloat16 is allowed. + logger.info("Downcasting %s to %s.", config_dtype, torch_dtype) + pass + else: + # Casting between float16 and bfloat16 is allowed with a warning. + logger.warning("Casting %s to %s.", config_dtype, torch_dtype) + + if torch_dtype in _STR_DTYPE_TO_TORCH_DTYPE: + torch_dtype = _STR_DTYPE_TO_TORCH_DTYPE[torch_dtype] + + return torch_dtype diff --git a/vllm_mindspore/engine/arg_utils.py b/vllm_mindspore/engine/arg_utils.py new file mode 100644 index 00000000..ed74ba9e --- /dev/null +++ b/vllm_mindspore/engine/arg_utils.py @@ -0,0 +1,222 @@ +import threading + +import torch + +import vllm.envs as envs +from vllm.engine.arg_utils import _raise_or_fallback, EngineArgs, _warn_or_fallback +from vllm.config import LoadFormat, ModelConfig + +def _is_v1_supported_oracle(self, model_config: ModelConfig) -> bool: + """Oracle for whether to use V0 or V1 Engine by default.""" + + ############################################################# + # Unsupported Feature Flags on V1. + + if (self.load_format == LoadFormat.TENSORIZER.value + or self.load_format == LoadFormat.SHARDED_STATE.value): + _raise_or_fallback( + feature_name=f"--load_format {self.load_format}", + recommend_to_remove=False) + return False + + if (self.logits_processor_pattern + != EngineArgs.logits_processor_pattern): + _raise_or_fallback(feature_name="--logits-processor-pattern", + recommend_to_remove=False) + return False + + if self.preemption_mode != EngineArgs.preemption_mode: + _raise_or_fallback(feature_name="--preemption-mode", + recommend_to_remove=True) + return False + + if (self.disable_async_output_proc + != EngineArgs.disable_async_output_proc): + _raise_or_fallback(feature_name="--disable-async-output-proc", + recommend_to_remove=True) + return False + + if self.scheduling_policy != EngineArgs.scheduling_policy: + _raise_or_fallback(feature_name="--scheduling-policy", + recommend_to_remove=False) + return False + + if self.num_scheduler_steps != EngineArgs.num_scheduler_steps: + _raise_or_fallback(feature_name="--num-scheduler-steps", + recommend_to_remove=True) + return False + + if self.scheduler_delay_factor != EngineArgs.scheduler_delay_factor: + _raise_or_fallback(feature_name="--scheduler-delay-factor", + recommend_to_remove=True) + return False + + if self.additional_config != EngineArgs.additional_config: + _raise_or_fallback(feature_name="--additional-config", + recommend_to_remove=False) + return False + + # Xgrammar and Guidance are supported. + SUPPORTED_GUIDED_DECODING = [ + "xgrammar", "xgrammar:disable-any-whitespace", "guidance", + "guidance:disable-any-whitespace", "auto" + ] + if self.guided_decoding_backend not in SUPPORTED_GUIDED_DECODING: + _raise_or_fallback(feature_name="--guided-decoding-backend", + recommend_to_remove=False) + return False + + # Need at least Ampere for now (FA support required). 
+ # Skip this check if we are running on a non-GPU platform, + # or if the device capability is not available + # (e.g. in a Ray actor without GPUs). + from vllm.platforms import current_platform + if (current_platform.is_cuda() + and current_platform.get_device_capability() + and current_platform.get_device_capability().major < 8): + _raise_or_fallback(feature_name="Compute Capability < 8.0", + recommend_to_remove=False) + return False + + # No Fp8 KV cache so far. + if self.kv_cache_dtype != "auto": + fp8_attention = self.kv_cache_dtype.startswith("fp8") + will_use_fa = ( + current_platform.is_cuda() + and not envs.is_set("VLLM_ATTENTION_BACKEND") + ) or envs.VLLM_ATTENTION_BACKEND == "FLASH_ATTN_VLLM_V1" + supported = False + if fp8_attention and will_use_fa: + from vllm.vllm_flash_attn.fa_utils import ( + flash_attn_supports_fp8) + supported = flash_attn_supports_fp8() + if not supported: + _raise_or_fallback(feature_name="--kv-cache-dtype", + recommend_to_remove=False) + return False + + # No Prompt Adapter so far. + if self.enable_prompt_adapter: + _raise_or_fallback(feature_name="--enable-prompt-adapter", + recommend_to_remove=False) + return False + + # Only Fp16 and Bf16 dtypes since we only support FA. + V1_SUPPORTED_DTYPES = [torch.bfloat16, torch.float16] + if model_config.dtype not in V1_SUPPORTED_DTYPES: + _raise_or_fallback(feature_name=f"--dtype {model_config.dtype}", + recommend_to_remove=False) + return False + + # Some quantization is not compatible with torch.compile. + V1_UNSUPPORTED_QUANT = ["gguf"] + if model_config.quantization in V1_UNSUPPORTED_QUANT: + _raise_or_fallback( + feature_name=f"--quantization {model_config.quantization}", + recommend_to_remove=False) + return False + + # No Embedding Models so far. + if model_config.task not in ["generate"]: + _raise_or_fallback(feature_name=f"--task {model_config.task}", + recommend_to_remove=False) + return False + + # No Mamba or Encoder-Decoder so far. + if not model_config.is_v1_compatible: + _raise_or_fallback(feature_name=model_config.architectures, + recommend_to_remove=False) + return False + + # No Concurrent Partial Prefills so far. + if (self.max_num_partial_prefills + != EngineArgs.max_num_partial_prefills + or self.max_long_partial_prefills + != EngineArgs.max_long_partial_prefills): + _raise_or_fallback(feature_name="Concurrent Partial Prefill", + recommend_to_remove=False) + return False + + # No OTLP observability so far. + if (self.otlp_traces_endpoint or self.collect_detailed_traces): + _raise_or_fallback(feature_name="--otlp-traces-endpoint", + recommend_to_remove=False) + return False + + # Only Ngram speculative decoding so far. + is_ngram_enabled = False + is_eagle_enabled = False + if self.speculative_config is not None: + # This is supported but experimental (handled below). + speculative_method = self.speculative_config.get("method") + if speculative_method: + if speculative_method in ("ngram", "[ngram]"): + is_ngram_enabled = True + elif speculative_method == "eagle": + is_eagle_enabled = True + else: + speculative_model = self.speculative_config.get("model") + if speculative_model in ("ngram", "[ngram]"): + is_ngram_enabled = True + if not (is_ngram_enabled or is_eagle_enabled): + # Other speculative decoding methods are not supported yet. + _raise_or_fallback(feature_name="Speculative Decoding", + recommend_to_remove=False) + return False + + # No Disaggregated Prefill so far. 
+ if self.kv_transfer_config != EngineArgs.kv_transfer_config: + _raise_or_fallback(feature_name="--kv-transfer-config", + recommend_to_remove=False) + return False + + # No FlashInfer or XFormers so far. + V1_BACKENDS = [ + "FLASH_ATTN_VLLM_V1", "FLASH_ATTN", "PALLAS", "PALLAS_VLLM_V1", + "TRITON_ATTN_VLLM_V1", "TRITON_MLA", "FLASHMLA" + ] + if (envs.is_set("VLLM_ATTENTION_BACKEND") + and envs.VLLM_ATTENTION_BACKEND not in V1_BACKENDS): + name = f"VLLM_ATTENTION_BACKEND={envs.VLLM_ATTENTION_BACKEND}" + _raise_or_fallback(feature_name=name, recommend_to_remove=True) + return False + + # Platforms must decide if they can support v1 for this model + if not current_platform.supports_v1(model_config=model_config): + _raise_or_fallback( + feature_name=f"device type={current_platform.device_type}", + recommend_to_remove=False) + return False + ############################################################# + # Experimental Features - allow users to opt in. + + # Signal Handlers requires running in main thread. + if (threading.current_thread() != threading.main_thread() + and _warn_or_fallback("Engine in background thread")): + return False + + # PP is supported on V1 with Ray distributed executor, + # but off for MP distributed executor for now. + if (self.pipeline_parallel_size > 1 + and self.distributed_executor_backend != "ray"): + name = "Pipeline Parallelism without Ray distributed executor" + _raise_or_fallback(feature_name=name, recommend_to_remove=False) + return False + + # ngram is supported on V1, but off by default for now. + if is_ngram_enabled and _warn_or_fallback("ngram"): + return False + + # Eagle is under development, so we don't support it yet. + if is_eagle_enabled and _warn_or_fallback("Eagle"): + return False + + # Non-CUDA is supported on V1, but off by default for now. 
+ # support vllm-mindspore defined AscendPlatform + not_cuda = not current_platform.is_cuda() and not current_platform.is_out_of_tree() + if not_cuda and _warn_or_fallback( # noqa: SIM103 + current_platform.device_name): + return False + ############################################################# + + return True diff --git a/vllm_mindspore/model_executor/layers/sampler.py b/vllm_mindspore/model_executor/layers/sampler.py index 354fb021..edfe6252 100644 --- a/vllm_mindspore/model_executor/layers/sampler.py +++ b/vllm_mindspore/model_executor/layers/sampler.py @@ -51,6 +51,10 @@ else: def get_sampler() -> torch.nn.Module: + if envs.VLLM_USE_V1: + # Lazy import: the v1 package isn't distributed + from vllm.v1.sample.sampler import Sampler as V1Sampler + return V1Sampler() return Sampler() diff --git a/vllm_mindspore/model_executor/model_loader/utils.py b/vllm_mindspore/model_executor/model_loader/utils.py index 66295a32..07a6cc32 100644 --- a/vllm_mindspore/model_executor/model_loader/utils.py +++ b/vllm_mindspore/model_executor/model_loader/utils.py @@ -24,7 +24,7 @@ from vllm.config import ModelConfig, ModelImpl from vllm.model_executor.models import ModelRegistry from vllm_mindspore.model_executor.models.registry import MindSporeModelRegistry -from vllm.model_executor.model_loader.utils import resolve_transformers_fallback +# from vllm.model_executor.model_loader.utils import resolve_transformers_fallback def get_ms_model_architecture(model_config: ModelConfig) -> Tuple[Type[nn.Module], str]: architectures = getattr(model_config.hf_config, "architectures", []) diff --git a/vllm_mindspore/model_executor/models/mf_models/deepseek_v3.py b/vllm_mindspore/model_executor/models/mf_models/deepseek_v3.py index e7cda00c..d0ac3c8c 100644 --- a/vllm_mindspore/model_executor/models/mf_models/deepseek_v3.py +++ b/vllm_mindspore/model_executor/models/mf_models/deepseek_v3.py @@ -26,6 +26,7 @@ from vllm.config import VllmConfig from vllm.config import get_current_vllm_config from vllm.forward_context import get_forward_context from vllm.logger import init_logger +import vllm.envs as envs import mindspore as ms from mindspore import Tensor, JitConfig, Model, mutable @@ -47,9 +48,8 @@ from research.deepseek3.deepseek3 import ( ) from vllm_mindspore.model_executor.layers.sampler import get_sampler -from vllm_mindspore.model_executor.models.model_base import Fake_MLA +from vllm_mindspore.model_executor.models.model_base import Fake_MLA, Fake_MLA_V1 from vllm_mindspore.model_executor.models.mf_models.mf_model_base import MfModelBase - from vllm_mindspore.model_executor.models.mf_models.deepseekv3_weight_processor import DeepseekV3WeightProcessor logger = init_logger(__name__) @@ -80,8 +80,10 @@ class DeepseekV3ForCausalLM(MfModelBase): self.sampler = get_sampler() self.set_modules({"model": self.network}) - - self.kv_caches = [Fake_MLA() for i in range(self.mf_model_config.num_layers)] + if envs.VLLM_USE_V1: + self.kv_caches = [Fake_MLA_V1() for i in range(self.mf_model_config.num_layers)] + else: + self.kv_caches = [Fake_MLA() for i in range(self.mf_model_config.num_layers)] compilation_config = get_current_vllm_config().compilation_config if prefix in compilation_config.static_forward_context: diff --git a/vllm_mindspore/model_executor/models/mf_models/mf_model_base.py b/vllm_mindspore/model_executor/models/mf_models/mf_model_base.py index 893d91a5..79974f6f 100644 --- a/vllm_mindspore/model_executor/models/mf_models/mf_model_base.py +++ b/vllm_mindspore/model_executor/models/mf_models/mf_model_base.py @@ 
-21,18 +21,19 @@ from types import MethodType from typing import Iterable, List, Optional, Set, Tuple, Union from abc import abstractmethod import numpy as np +import math -from vllm.attention import AttentionMetadata from vllm.config import VllmConfig from vllm.model_executor.layers.sampler import SamplerOutput from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors from vllm.distributed import get_tensor_model_parallel_world_size from vllm.logger import init_logger +from vllm.forward_context import get_forward_context +import vllm.envs as envs -import torch import mindspore as ms -from mindspore import Tensor, mutable +from mindspore import Tensor from mindspore.common.api import _pynative_executor from mindformers.tools.register.config import MindFormerConfig @@ -41,6 +42,7 @@ from mindformers.core.parallel_config import build_parallel_config from vllm_mindspore.model_executor.models.model_base import MsModelBase from vllm_mindspore.model_executor.models.mf_models.attention_mask import LowerTriangularMask +from vllm_mindspore.v1.attention.backends.flash_attn import FlashAttentionMetadata logger = init_logger(__name__) @@ -55,7 +57,6 @@ def _batch_seq(input_tokens, prefill): return ms.mint.reshape(input_tokens, (-1, 1)).to(ms.int32) - class MfModelBase(MsModelBase): def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None: super(MfModelBase, self).__init__( @@ -88,40 +89,96 @@ class MfModelBase(MsModelBase): raise NotImplementedError("Function _create_network should be Implemented!") + def _dummy_attention_metadata(self, input_ids: Tensor, positions: Tensor) -> FlashAttentionMetadata: + input_len = input_ids.shape[0] + max_seq_len = ms.Tensor(input_len, dtype=ms.int32) + seq_lengths = ms.Tensor([input_len], dtype=ms.int32) + q_seq_lens = ms.Tensor([input_len], dtype=ms.int32) + q_seq_lens_np = np.array([input_len], dtype=np.int32) + seq_lens_np = np.array([input_len], dtype=np.int32) + + block_tables = ms.Tensor([[0]], dtype=ms.int32) + slot_mapping = [-1 for _ in range(input_len)] + slot_mapping = ms.Tensor(slot_mapping, dtype=ms.int32) + return FlashAttentionMetadata( + max_seq_len=max_seq_len, + seq_lens=seq_lengths, + seq_lens_np=seq_lens_np, + block_tables=block_tables, + slot_mapping=slot_mapping, + q_seq_lens=q_seq_lens, + q_seq_lens_np=q_seq_lens_np, + context_lens=0, + # To enforce prefill and decode are both complied in warmup process. + # So set max_context_lens to 0 for prefill and 1 for decode. + max_context_lens=0 if not self.set_flags else 1, + query_start_loc = None + ) + def prepare_inputs(self, input_ids, positions, attn_metadata): key_cache, value_cache = self.get_kvcache() - seq_lens = attn_metadata.seq_lens - max_query_len = attn_metadata.max_query_len - # When Mutli-Step is enabled with Chunked-Prefill, prefills and - # decodes are scheduled together. In the first step, all the - # prefills turn into decodes and max_query_len will be 1. - if self.is_multi_step_chunked_prefill and max_query_len == 1: - query_lens = [1] * len(seq_lens) + if not envs.VLLM_USE_V1: + seq_lens = attn_metadata.seq_lens + max_query_len = attn_metadata.max_query_len + # When Mutli-Step is enabled with Chunked-Prefill, prefills and + # decodes are scheduled together. In the first step, all the + # prefills turn into decodes and max_query_len will be 1. 
+ if self.is_multi_step_chunked_prefill and max_query_len == 1: + query_lens = [1] * len(seq_lens) + else: + query_lens = attn_metadata.query_lens + + seq_lens = attn_metadata.seq_lens + max_query_len = attn_metadata.max_query_len + # When Mutli-Step is enabled with Chunked-Prefill, prefills and + # decodes are scheduled together. In the first step, all the + # prefills turn into decodes and max_query_len will be 1. + if self.is_multi_step_chunked_prefill and max_query_len == 1: + query_lens = [1] * len(seq_lens) + else: + query_lens = attn_metadata.query_lens + + seq_lens_np = np.array(seq_lens, dtype=np.int32) + query_lens_np = np.array(query_lens, dtype=np.int32) + kv_cache_lens = seq_lens_np - query_lens_np + if attn_metadata.num_decode_tokens == 0 and kv_cache_lens.max() == 0: + is_prefill = True + else: + is_prefill = False + + q_seq_lens = ms.Tensor(query_lens_np, dtype=ms.int32) + position_ids = ms.Tensor(positions, dtype=ms.int32) + attention_mask = self.casual_mask.gen_attention_mask(is_prefill, position_ids, query_lens) + + model_inputs = {} + model_inputs["input_ids"] = _batch_seq(input_ids, is_prefill) + model_inputs["batch_valid_length"] = ms.Tensor.from_numpy(np.expand_dims(seq_lens_np, 0)) + model_inputs["block_tables"] = attn_metadata.block_tables + model_inputs["slot_mapping"] = attn_metadata.slot_mapping + model_inputs["position_ids"] = position_ids + model_inputs["q_seq_lens"] = q_seq_lens + model_inputs["attention_mask"] = attention_mask + model_inputs["key_cache"] = key_cache + model_inputs["value_cache"] = value_cache else: - query_lens = attn_metadata.query_lens - - seq_lens_np = np.array(seq_lens, dtype=np.int32) - query_lens_np = np.array(query_lens, dtype=np.int32) - kv_cache_lens = seq_lens_np - query_lens_np - if attn_metadata.num_decode_tokens == 0 and kv_cache_lens.max() == 0: - is_prefill = True - else: - is_prefill = False - - q_seq_lens = ms.Tensor(query_lens_np, dtype=ms.int32) - position_ids = ms.Tensor(positions, dtype=ms.int32) - attention_mask = self.casual_mask.gen_attention_mask(is_prefill, position_ids, query_lens) - - model_inputs = {} - model_inputs["input_ids"] = _batch_seq(input_ids, is_prefill) - model_inputs["batch_valid_length"] = ms.Tensor.from_numpy(np.expand_dims(seq_lens_np, 0)) - model_inputs["block_tables"] = attn_metadata.block_tables - model_inputs["slot_mapping"] = attn_metadata.slot_mapping - model_inputs["position_ids"] = position_ids - model_inputs["q_seq_lens"] = q_seq_lens - model_inputs["attention_mask"] = attention_mask - model_inputs["key_cache"] = key_cache - model_inputs["value_cache"] = value_cache + if attn_metadata.max_context_lens == 0: + is_prefill = True + else: + is_prefill = False + q_seq_lens = attn_metadata.q_seq_lens + query_lens_np = attn_metadata.q_seq_lens_np + attention_mask = self.casual_mask.gen_attention_mask(is_prefill, positions, query_lens_np) + + model_inputs = {} + model_inputs["input_ids"] = _batch_seq(input_ids, is_prefill) + model_inputs["batch_valid_length"] = ms.Tensor(np.expand_dims(attn_metadata.seq_lens_np, 0)) + model_inputs["block_tables"] = attn_metadata.block_tables + model_inputs["slot_mapping"] = attn_metadata.slot_mapping + model_inputs["position_ids"] = positions.to(ms.int32) + model_inputs["q_seq_lens"] = q_seq_lens + model_inputs["attention_mask"] = attention_mask + model_inputs["key_cache"] = key_cache + model_inputs["value_cache"] = value_cache return model_inputs, is_prefill @@ -132,12 +189,13 @@ class MfModelBase(MsModelBase): self, input_ids: Tensor, positions: Tensor, - 
kv_caches: List[Tensor], - attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors] = None, inputs_embeds: Optional[Tensor] = None, **kwargs ) -> Union[Tensor, IntermediateTensors]: + attn_metadata = get_forward_context().attn_metadata + if attn_metadata is None: + attn_metadata = self._dummy_attention_metadata(input_ids, positions) model_inputs, is_prefill = self.prepare_inputs(input_ids, positions, attn_metadata) model_inputs = self.update_model_inputs(model_inputs, **kwargs) @@ -160,15 +218,18 @@ class MfModelBase(MsModelBase): hidden_states: Tensor, sampling_metadata: SamplingMetadata, ) -> Optional[Tensor]: - selected_token_indices = sampling_metadata.selected_token_indices - if selected_token_indices is not None and selected_token_indices.numel() <= 0: - logits = ms.mint.zeros((0, self.mf_model_config.vocab_size), - dtype=self.mf_model_config.compute_dtype) + if sampling_metadata is not None: + selected_token_indices = sampling_metadata.selected_token_indices + if selected_token_indices is not None and selected_token_indices.numel() <= 0: + logits = ms.mint.zeros((0, self.mf_model_config.vocab_size), + dtype=self.mf_model_config.compute_dtype) + else: + hidden_states = hidden_states.index_select(0, selected_token_indices) + logits = self.network.lm_head(hidden_states) + logits = logits.reshape(-1, logits.shape[-1]) else: - hidden_states = hidden_states.index_select(0, selected_token_indices) logits = self.lm_head(hidden_states) logits = logits.reshape(-1, logits.shape[-1]) - return logits def sample( diff --git a/vllm_mindspore/model_executor/models/mf_models/qwen2.py b/vllm_mindspore/model_executor/models/mf_models/qwen2.py index 18a865c1..ddd037cf 100644 --- a/vllm_mindspore/model_executor/models/mf_models/qwen2.py +++ b/vllm_mindspore/model_executor/models/mf_models/qwen2.py @@ -21,6 +21,8 @@ from typing import Iterable, Set, Tuple from vllm.config import VllmConfig from vllm.config import get_current_vllm_config from vllm.logger import init_logger +import vllm.envs as envs + from mindspore import Tensor, JitConfig from mindspore.nn.utils import no_init_parameters @@ -31,14 +33,14 @@ from research.qwen2_5.infer.qwen2_5 import ( ) from vllm_mindspore.model_executor.layers.sampler import get_sampler -from vllm_mindspore.model_executor.models.model_base import Fake_Attention +from vllm_mindspore.model_executor.models.model_base import Fake_Attention, Fake_Attention_V1 from vllm_mindspore.model_executor.models.mf_models.mf_model_base import MfModelBase + from vllm_mindspore.model_executor.models.mf_models.qwen2_weight_processor import Qwen2WeightProcessor logger = init_logger(__name__) - class Qwen2ForCausalLM(MfModelBase): def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None: super(Qwen2ForCausalLM, self).__init__(vllm_config=vllm_config, prefix=prefix) @@ -47,7 +49,10 @@ class Qwen2ForCausalLM(MfModelBase): self.sampler = get_sampler() self.set_modules({"model": self.network}) - self.kv_caches = [Fake_Attention() for i in range(self.mf_model_config.num_layers)] + if envs.VLLM_USE_V1: + self.kv_caches = [Fake_Attention_V1() for i in range(self.mf_model_config.num_layers)] + else: + self.kv_caches = [Fake_Attention() for i in range(self.mf_model_config.num_layers)] compilation_config = get_current_vllm_config().compilation_config if prefix in compilation_config.static_forward_context: diff --git a/vllm_mindspore/model_executor/models/model_base.py b/vllm_mindspore/model_executor/models/model_base.py index b97d7152..75e4648e 100644 --- 
a/vllm_mindspore/model_executor/models/model_base.py +++ b/vllm_mindspore/model_executor/models/model_base.py @@ -27,6 +27,7 @@ from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors from vllm.attention.backends.abstract import AttentionType from vllm.forward_context import get_forward_context +from vllm.attention.layer import Attention import torch @@ -64,6 +65,52 @@ class Fake_MLA(Fake_Attention): for _ in range(vllm_config.parallel_config.pipeline_parallel_size) ] + +class Fake_MLA(Fake_Attention): + def __init__(self): + super().__init__() + vllm_config = get_current_vllm_config() + self.kv_cache = [ + (torch.zeros(self.kv_shape, dtype=torch.bfloat16, device="Ascend"),) + for _ in range(vllm_config.parallel_config.pipeline_parallel_size) + ] + + +class Fake_Attention_V1(Attention): + def __init__(self): + vllm_config = get_current_vllm_config() + block_size = vllm_config.cache_config.block_size + num_kv_heads = vllm_config.model_config.get_num_kv_heads( + vllm_config.parallel_config + ) + head_size = vllm_config.model_config.get_head_size() + num_block = 0 + self.kv_shape = [num_block, block_size, num_kv_heads, head_size] + self.kv_cache = [ + ( + torch.zeros(self.kv_shape, dtype=torch.bfloat16, device="Ascend"), + torch.zeros(self.kv_shape, dtype=torch.bfloat16, device="Ascend"), + ) + for _ in range(vllm_config.parallel_config.pipeline_parallel_size) + ] + self.attn_type = AttentionType.DECODER + self.num_kv_heads = num_kv_heads + self.head_size = head_size + self.dtype = vllm_config.model_config.dtype + self.block_size = block_size + self.sliding_window = None + + +class Fake_MLA_V1(Fake_Attention_V1): + def __init__(self): + super().__init__() + vllm_config = get_current_vllm_config() + self.kv_cache = [ + (torch.zeros(self.kv_shape, dtype=torch.bfloat16, device="Ascend"),) + for _ in range(vllm_config.parallel_config.pipeline_parallel_size) + ] + + class MsModelBase(): def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None: super(MsModelBase, self).__init__() @@ -157,8 +204,6 @@ class MsModelBase(): self, input_ids: Tensor, positions: Tensor, - kv_caches: List[Tensor], - attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors] = None, inputs_embeds: Optional[Tensor] = None, previous_hidden_states: Optional[Tensor] = None, @@ -167,8 +212,6 @@ class MsModelBase(): return self.forward( input_ids, positions, - kv_caches, - attn_metadata, intermediate_tensors, inputs_embeds, previous_hidden_states=previous_hidden_states, @@ -198,7 +241,8 @@ class MsModelBase(): kv_cache_dtype = self.model_config.dtype if self.cache_config.cache_dtype == "auto" \ else self.cache_config.cache_dtype - kv_cache_dtype = STR_DTYPE_TO_MS_DTYPE[kv_cache_dtype] + if kv_cache_dtype in STR_DTYPE_TO_MS_DTYPE: + kv_cache_dtype = STR_DTYPE_TO_MS_DTYPE[kv_cache_dtype] num_layers = self.model_config.get_num_layers(self.parallel_config) diff --git a/vllm_mindspore/model_executor/models/registry.py b/vllm_mindspore/model_executor/models/registry.py index 1a9dbe9f..d58130ef 100644 --- a/vllm_mindspore/model_executor/models/registry.py +++ b/vllm_mindspore/model_executor/models/registry.py @@ -62,11 +62,18 @@ _T = TypeVar("_T") _SUBPROCESS_COMMAND = [ - sys.executable, "-m", "vllm.model_executor.models.registry" + sys.executable, "-m", "vllm_mindspore.model_executor.models.registry" ] def _run() -> None: + import vllm_mindspore.compilation.inductor_pass as ms_inductor_pass + 
sys.modules["vllm.compilation.inductor_pass"] = ms_inductor_pass + + # Setup plugins + from vllm.plugins import load_general_plugins + load_general_plugins() + fn, output_file = pickle.loads(sys.stdin.buffer.read()) result = fn() diff --git a/vllm_mindspore/platforms/ascend.py b/vllm_mindspore/platforms/ascend.py index b96403d4..dddce58d 100644 --- a/vllm_mindspore/platforms/ascend.py +++ b/vllm_mindspore/platforms/ascend.py @@ -25,10 +25,12 @@ import mindspore as ms from vllm.platforms.interface import DeviceCapability, Platform, PlatformEnum, _Backend from vllm.logger import init_logger +import vllm.envs as envs if TYPE_CHECKING: - from vllm.config import VllmConfig + from vllm.config import ModelConfig, VllmConfig else: + ModelConfig = None VllmConfig = None logger = init_logger(__name__) @@ -79,24 +81,35 @@ class AscendPlatform(Platform): parallel_config = vllm_config.parallel_config scheduler_config = vllm_config.scheduler_config - if parallel_config.worker_cls == "auto": - if scheduler_config.is_multi_step: - parallel_config.worker_cls = "vllm.worker.multi_step_worker.MultiStepWorker" - elif vllm_config.speculative_config: - parallel_config.worker_cls = "vllm.spec_decode.spec_decode_worker.create_spec_worker" - parallel_config.sd_worker_cls = "vllm.worker.worker.Worker" - else: - parallel_config.worker_cls = "vllm.worker.worker.Worker" + import vllm.envs as envs + if envs.VLLM_USE_V1: + parallel_config.worker_cls = \ + "vllm.v1.worker.gpu_worker.Worker" + else: + if parallel_config.worker_cls == "auto": + if scheduler_config.is_multi_step: + parallel_config.worker_cls = "vllm.worker.multi_step_worker.MultiStepWorker" + elif vllm_config.speculative_config: + parallel_config.worker_cls = "vllm.spec_decode.spec_decode_worker.create_spec_worker" + parallel_config.sd_worker_cls = "vllm.worker.worker.Worker" + else: + parallel_config.worker_cls = "vllm.worker.worker.Worker" cache_config = vllm_config.cache_config if cache_config and cache_config.block_size is None: cache_config.block_size = 16 + # if envs.VLLM_USE_V1: + # vllm_config.model_config.enforce_eager = True + @classmethod def get_attn_backend_cls(cls, selected_backend, head_size, dtype, kv_cache_dtype, block_size, use_v1, use_mla): """Get the attention backend class of a device.""" if use_v1: + if use_mla: + return "vllm_mindspore.v1.attention.backends.flash_attn.MLABackend" + return "vllm_mindspore.v1.attention.backends.flash_attn.FlashAttentionBackend" raise RuntimeError("vLLM-MindSpore do not support v1 egine now!") if use_mla: logger.info("Using MindSpore MLA backend.") @@ -120,6 +133,8 @@ class AscendPlatform(Platform): @classmethod def get_device_communicator_cls(cls) -> str: """Get device specific communicator class for distributed communication.""" + if envs.VLLM_USE_V1: + return "vllm.distributed.device_communicators.cuda_communicator.CudaCommunicator" return "vllm.distributed.device_communicators.base_device_communicator.DeviceCommunicatorBase" @classmethod @@ -127,3 +142,7 @@ class AscendPlatform(Platform): """Get the total memory of a device in bytes.""" device_props = torch.cuda.get_device_properties(device_id) return device_props.total_memory + + @classmethod + def supports_v1(cls, model_config: ModelConfig) -> bool: + return True \ No newline at end of file diff --git a/vllm_mindspore/scripts.py b/vllm_mindspore/scripts.py index 530c1e62..f35190fa 100644 --- a/vllm_mindspore/scripts.py +++ b/vllm_mindspore/scripts.py @@ -41,7 +41,8 @@ def env_setup(target_env_dict=None): "DEVICE_NUM_PER_NODE": "16", 
"HCCL_OP_EXPANSION_MODE": "AIV", "MS_JIT_MODULES": "vllm_mindspore,research", - "GLOG_v": "3" + "GLOG_v": "3", + "RAY_CGRAPH_get_timeout": "360" } for key, value in target_env_dict.items(): diff --git a/vllm_mindspore/utils.py b/vllm_mindspore/utils.py index 717416bb..d32b525e 100644 --- a/vllm_mindspore/utils.py +++ b/vllm_mindspore/utils.py @@ -217,8 +217,8 @@ def check_ready(): import vllm.envs as envs from mindspore import set_context - if envs.VLLM_USE_V1: - raise NotImplementedError("vLLM-MindSpore does not support VLLM V1 now!") + # if envs.VLLM_USE_V1: + # raise NotImplementedError("vLLM-MindSpore does not support VLLM V1 now!") # Common environment variables of predict. set_context(jit_config={"jit_level": "O0", "infer_boost": "on"}) diff --git a/vllm_mindspore/v1/__init__.py b/vllm_mindspore/v1/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/vllm_mindspore/v1/attention/__init__.py b/vllm_mindspore/v1/attention/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/vllm_mindspore/v1/attention/backends/__init__.py b/vllm_mindspore/v1/attention/backends/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/vllm_mindspore/v1/attention/backends/flash_attn.py b/vllm_mindspore/v1/attention/backends/flash_attn.py new file mode 100644 index 00000000..77f6d726 --- /dev/null +++ b/vllm_mindspore/v1/attention/backends/flash_attn.py @@ -0,0 +1,237 @@ +# SPDX-License-Identifier: Apache-2.0 +"""Attention layer with FlashAttention.""" +from dataclasses import dataclass +from typing import Any, Dict, List, Optional, Tuple, Type + +import numpy as np +import torch + +from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl, + AttentionMetadata, AttentionType) +from vllm.logger import init_logger + + +from vllm_mindspore.utils import MsKVCache + +import mindspore as ms +from mindspore import mutable +from mindspore._c_expression import swap_cache + + +logger = init_logger(__name__) + + +class FlashAttentionBackend(AttentionBackend): + + accept_output_buffer: bool = True + + @staticmethod + def get_supported_head_sizes() -> List[int]: + return [32, 64, 96, 128, 160, 192, 224, 256] + + @staticmethod + def get_name() -> str: + return "MS_ATTN" + + @staticmethod + def get_impl_cls() -> Type["AttentionImpl"]: + return MsAttentionImpl + + @staticmethod + def get_metadata_cls() -> Type["AttentionMetadata"]: + return FlashAttentionMetadata + + @staticmethod + def get_builder_cls() -> Type["AttentionMetadataBuilder"]: + return FlashAttentionMetadataBuilder + + @staticmethod + def get_kv_cache_shape( + num_blocks: int, + block_size: int, + num_kv_heads: int, + head_size: int, + ) -> Tuple[int, ...]: + if block_size % 16 != 0: + raise ValueError("Block size must be a multiple of 16.") + return (2, num_blocks, block_size, num_kv_heads, head_size) + + @staticmethod + def use_cascade_attention(*args, **kwargs) -> bool: + return False + + +class MLABackend(AttentionBackend): + @staticmethod + def get_name() -> str: + return "MS_MLA" + + @staticmethod + def get_impl_cls() -> Type["AttentionImpl"]: + return MsAttentionImpl + + @staticmethod + def get_metadata_cls() -> Type["AttentionMetadata"]: + return FlashAttentionMetadata + + @staticmethod + def get_builder_cls() -> Type["AttentionMetadataBuilder"]: + return FlashAttentionMetadataBuilder + + @staticmethod + def get_kv_cache_shape( + num_blocks: int, + block_size: int, + num_kv_heads: int, + head_size: int, + ) -> Tuple[int, ...]: + if block_size % 16 != 0: + raise ValueError("Block 
size must be a multiple of 16.") + return (1, num_blocks, block_size, 1, head_size) + + @staticmethod + def use_cascade_attention(*args, **kwargs) -> bool: + return False + + @staticmethod + def get_supported_head_sizes() -> List[int]: + return [576] + + +@dataclass +class FlashAttentionMetadata: + # NOTE(sang): Definition of context_len, query_len, and seq_len. + # |---------- N-1 iteration --------| + # |---------------- N iteration ---------------------| + # |- tokenA -|......................|-- newTokens ---| + # |---------- context_len ----------| + # |-------------------- seq_len ---------------------| + # |-- query_len ---| + + max_seq_len: int + seq_lens: torch.Tensor + seq_lens_np: np.ndarray + block_tables: torch.Tensor + slot_mapping: torch.Tensor + q_seq_lens: torch.Tensor + q_seq_lens_np: np.ndarray + context_lens: torch.Tensor + max_context_lens: int + query_start_loc: torch.Tensor + + def __getitem__(self, key): + if key == "batch_valid_length": + key = "seq_lens" + if key == "block_tables": + if getattr(self, key).ndim == 1: + return mutable(getattr(self, key).expand_dims(0)) + return mutable(getattr(self, key)) + return getattr(self, key) + + +class MsAttentionImpl(AttentionImpl): + """ + If the input tensors contain prompt tokens, the layout is as follows: + |<--------------- num_prefill_tokens ----------------->| + |<--prefill_0-->|<--prefill_1-->|...|<--prefill_N-1--->| + + Otherwise, the layout is as follows: + |<----------------- num_decode_tokens ------------------>| + |<--decode_0-->|..........|<--decode_M-1-->|<--padding-->| + + Generation tokens can contain padding when cuda-graph is used. + Currently, prompt tokens don't contain any padding. + + The prompts might have different lengths, while the generation tokens + always have length 1. + + If chunked prefill is enabled, prefill tokens and decode tokens can be + batched together in a flattened 1D query. + + |<----- num_prefill_tokens ---->|<------- num_decode_tokens --------->| + |<-prefill_0->|...|<-prefill_N-1->|<--decode_0-->|...|<--decode_M-1-->| + + Currently, cuda graph is disabled for chunked prefill, meaning there's no + padding between prefill and decode tokens. + """ + + def __init__( + self, + num_heads: int, + head_size: int, + scale: float, + num_kv_heads: int, + alibi_slopes: Optional[List[float]], + sliding_window: Optional[int], + kv_cache_dtype: str, + blocksparse_params: Optional[Dict[str, Any]] = None, + logits_soft_cap: Optional[float] = None, + attn_type: AttentionType = AttentionType.DECODER, + ) -> None: + pass + + def forward( + self, + layer: torch.nn.Module, + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + kv_cache: torch.Tensor, + attn_metadata: FlashAttentionMetadata, + output: Optional[torch.Tensor] = None, + ) -> torch.Tensor: + """Forward pass with FlashAttention. + + Args: + query: shape = [num_tokens, num_heads, head_size] + key: shape = [num_tokens, num_kv_heads, head_size] + value: shape = [num_tokens, num_kv_heads, head_size] + output: shape = [num_tokens, num_heads, head_size] + kv_cache = [2, num_blocks, block_size, num_kv_heads, head_size] + NOTE: kv_cache will be an empty tensor with shape [0] + for profiling run. + attn_metadata: Metadata for attention. + NOTE: It in-place updates the output tensor. 
+ """ + pass + + +class FlashAttentionMetadataBuilder: + def __init__(self, runner: "GPUModelRunner"): + self.runner = runner + + def reorder_batch(self, input_batch: "InputBatch", + scheduler_output: "SchedulerOutput") -> bool: + return False + + def build(self, num_reqs: int, num_actual_tokens: int, max_query_len: int, + common_prefix_len: int): + query_start_loc = ms.from_numpy(self.runner.query_start_loc_np[:num_reqs + 1]) + query_start_loc.move_to("Ascend", blocking=False) + max_context_lens = self.runner.input_batch.num_computed_tokens_cpu[:num_reqs].max() + slot_mapping = ms.from_numpy(self.runner.slot_mapping_np[:num_actual_tokens]) + slot_mapping.move_to("Ascend", blocking=False) + seq_lens_np = self.runner.seq_lens_np[:num_reqs] + max_seq_len = seq_lens_np.max() + seq_lens = ms.from_numpy(seq_lens_np) + seq_lens.move_to("Ascend", blocking=False) + context_lens = ms.from_numpy(self.runner.input_batch.num_computed_tokens_cpu[:num_reqs]) + context_lens.move_to("Ascend", blocking=False) + + q_seq_lens_np = np.diff(self.runner.query_start_loc_np[:num_reqs + 1]) + q_seq_lens = ms.from_numpy(q_seq_lens_np) + q_seq_lens.move_to("Ascend", blocking=False) + + attn_metadata = FlashAttentionMetadata( + seq_lens=seq_lens, + seq_lens_np=seq_lens_np, + block_tables=(self.runner.input_batch.block_table.get_device_tensor()[:num_reqs]), + slot_mapping=slot_mapping, + q_seq_lens=q_seq_lens, + q_seq_lens_np=q_seq_lens_np, + max_seq_len=max_seq_len, + context_lens=context_lens, + max_context_lens=max_context_lens, + query_start_loc = query_start_loc + ) + return attn_metadata diff --git a/vllm_mindspore/v1/sample/__init__.py b/vllm_mindspore/v1/sample/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/vllm_mindspore/v1/sample/ops/__init__.py b/vllm_mindspore/v1/sample/ops/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/vllm_mindspore/v1/sample/ops/penalties.py b/vllm_mindspore/v1/sample/ops/penalties.py new file mode 100644 index 00000000..a6efb974 --- /dev/null +++ b/vllm_mindspore/v1/sample/ops/penalties.py @@ -0,0 +1,21 @@ +from typing import List + +import torch +from vllm.utils import is_pin_memory_available, make_tensor_with_pad + + +def _convert_to_tensors(output_token_ids: List[List[int]], vocab_size: int, + device: torch.device) -> torch.Tensor: + """ + Convert the different list data structures to tensors. + """ + output_tokens_tensor = make_tensor_with_pad( + output_token_ids, + # Use the value of vocab_size as a pad since we don't have a + # token_id of this value. + pad=vocab_size, + device="cpu", + dtype=torch.int64, + pin_memory=is_pin_memory_available(), + ) + return output_tokens_tensor diff --git a/vllm_mindspore/v1/sample/ops/topk_topp_sampler.py b/vllm_mindspore/v1/sample/ops/topk_topp_sampler.py new file mode 100644 index 00000000..57f0a81c --- /dev/null +++ b/vllm_mindspore/v1/sample/ops/topk_topp_sampler.py @@ -0,0 +1,98 @@ +from typing import Optional +import torch +from vllm.v1.sample.ops.topk_topp_sampler import apply_top_k_only + +def random_sample( + probs: torch.Tensor, + generators: dict[int, torch.Generator], +) -> torch.Tensor: + """Randomly sample from the probabilities. + + We use this function instead of torch.multinomial because torch.multinomial + causes CPU-GPU synchronization. + """ + q = torch.empty_like(probs) + # NOTE(woosuk): To batch-process the requests without their own seeds, + # which is the common case, we first assume that every request does + # not have its own seed. 
Then, we overwrite the values for the requests + # that have their own seeds. + if len(generators) != probs.shape[0]: + q.exponential_() + if generators: + # TODO(woosuk): This can be slow because we handle each request + # one by one. Optimize this. + for i, generator in generators.items(): + q[i].exponential_(generator=generator) + # if use probs.div_(q) instead of probs = probs.div(q), it will cause + # a error when running. + probs = probs.div(q) + return probs.argmax(dim=-1).view(-1) + + +def apply_top_k_top_p( + logits: torch.Tensor, + k: Optional[torch.Tensor], + p: Optional[torch.Tensor], +) -> torch.Tensor: + """Apply top-k and top-p masks to the logits. + + If a top-p is used, this function will sort the logits tensor, + which can be slow for large batches. + + The logits tensor may be updated in-place. + """ + if p is None: + if k is None: + return logits + + # Avoid sorting vocab for top-k only case. + return apply_top_k_only(logits, k) + + logits_sort, logits_idx = logits.sort(dim=-1, descending=False) + + if k is not None: + # Apply top-k. + top_k_mask = logits_sort.size(1) - k.to(torch.long) # shape: B + # Get all the top_k values. + top_k_mask = logits_sort.gather(1, top_k_mask.unsqueeze(dim=1)) + top_k_mask = logits_sort < top_k_mask + logits_sort.masked_fill_(top_k_mask, -float("inf")) + + if p is not None: + # Apply top-p. + probs_sort = logits_sort.softmax(dim=-1) + probs_sum = torch.cumsum(probs_sort, dim=-1) + top_p_mask = probs_sum <= 1 - p.unsqueeze(dim=1) + # at least one + top_p_mask[:, -1] = False + logits_sort.masked_fill_(top_p_mask, -float("inf")) + + # Re-sort the probabilities. + logits = logits_sort.scatter(dim=-1, index=logits_idx, src=logits_sort) + return logits + + +def apply_top_k_only( + logits: torch.Tensor, + k: torch.Tensor, +) -> torch.Tensor: + """ + Apply top-k mask to the logits. + + This implementation doesn't involve sorting the entire vocab. + + The logits tensor may be updated in-place. + """ + no_top_k_mask = k == logits.shape[1] + # Set non-top-k rows to 1 so that we can gather. + k = k.masked_fill(no_top_k_mask, 1) + max_top_k = k.max() + # topk.values tensor has shape [batch_size, max_top_k]. + # Convert top k to 0-based index in range [0, max_top_k). + k_index = k.sub_(1).unsqueeze(1).expand(logits.shape[0], 1) + + top_k_mask = logits.topk(max_top_k, dim=1)[0].gather(1, k_index.long()) + # Handle non-topk rows. + top_k_mask.masked_fill_(no_top_k_mask.unsqueeze(1), -float("inf")) + logits.masked_fill_(logits < top_k_mask, -float("inf")) + return logits diff --git a/vllm_mindspore/v1/sample/rejection_sampler.py b/vllm_mindspore/v1/sample/rejection_sampler.py new file mode 100644 index 00000000..9fe3e7fb --- /dev/null +++ b/vllm_mindspore/v1/sample/rejection_sampler.py @@ -0,0 +1,627 @@ +# SPDX-License-Identifier: Apache-2.0 +from typing import Optional + +import torch +import torch.nn as nn + +from vllm.logger import init_logger +from vllm.v1.sample.metadata import SamplingMetadata +from vllm.v1.sample.ops.topk_topp_sampler import apply_top_k_top_p +from vllm.v1.spec_decode.metadata import SpecDecodeMetadata + +logger = init_logger(__name__) + +PLACEHOLDER_TOKEN_ID = -1 +GREEDY_TEMPERATURE = -1 +# Maximum number of speculative draft tokens allowed per request in a single +# step. This value is chosen to be large enough to handle typical use cases. +MAX_SPEC_LEN = 32 + + + + +class RejectionSampler(nn.Module): + """ + The implementation strictly follows the algorithm described in + https://arxiv.org/abs/2211.17192. 
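The exponential trick used by random_sample above is worth spelling out: argmax(probs / q) with q drawn from Exp(1) equals argmin(q / probs), q_i / probs_i is exponential with rate probs_i, and the minimum of independent exponentials lands on index i with probability probs_i / sum(probs), i.e. a categorical draw. A minimal, self-contained check of that equivalence (toy code, not part of the sampler in this patch):

import torch

def exponential_race_sample(probs: torch.Tensor) -> torch.Tensor:
    # Divide by Exp(1) noise and take the argmax; see the note above for why
    # this is an exact sample from Categorical(probs).
    q = torch.empty_like(probs).exponential_()
    return probs.div(q).argmax(dim=-1)

# Empirical check on a toy distribution (frequencies should be roughly 1:2:7).
probs = torch.tensor([[0.1, 0.2, 0.7]]).repeat(10000, 1)
counts = torch.bincount(exponential_race_sample(probs), minlength=3)
print(counts / counts.sum())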
+ However, we want to clarify the terminology used in the implementation: + accepted tokens: tokens that are accepted based on the relationship + between the "raw" draft and target probabilities. + recovered tokens: tokens that are sampled based on the adjusted probability + distribution, which is derived from both the draft and target + probabilities. + bonus tokens: + If all proposed tokens are accepted, the bonus token is added to the + end of the sequence. The bonus token is only sampled from the target + probabilities. We pass in the bonus tokens instead of sampling them + in the rejection sampler to allow for more flexibility in the + sampling process. For example, we can use top_p, top_k sampling for + bonus tokens, while spec decode does not support these sampling + strategies. + output tokens: + Tokens are finally generated with the rejection sampler. + output tokens = accepted tokens + recovered tokens + bonus tokens + """ + + def forward( + self, + metadata: SpecDecodeMetadata, + # [num_tokens, vocab_size] + draft_probs: Optional[torch.Tensor], + # [num_tokens, vocab_size] + target_logits: torch.Tensor, + # [batch_size, 1] + bonus_token_ids: torch.Tensor, + sampling_metadata: SamplingMetadata, + ) -> torch.Tensor: + ''' + Args: + metadata: + Metadata for spec decoding. + draft_probs (Optional[torch.Tensor]): + Probability distribution for the draft tokens. Shape is + [num_tokens, vocab_size]. Can be None if probabilities are + not provided, which is the case for ngram spec decode. + target_logits (torch.Tensor): + Target model's logits probability distribution. + Shape is [num_tokens, vocab_size]. Here, probabilities from + different requests are flattened into a single tensor because + this is the shape of the output logits. + NOTE: `target_logits` can be updated in place to save memory. + bonus_token_ids_tensor (torch.Tensor): + A tensor containing bonus tokens. Shape is [batch_size, 1]. + Bonus tokens are added to the end of the sequence if all + proposed tokens are accepted. We generate the bonus tokens + outside of the rejection sampler with the default sampling + strategy. It allows for more flexibility in the sampling + process such as top_p, top_k sampling. + sampling_metadata (SamplingMetadata): + Additional metadata needed for sampling, such as temperature, + top-k/top-p parameters, or other relevant information. + Returns: + output_token_ids (torch.Tensor): + A tensor containing the final output token IDs. + ''' + assert metadata.max_spec_len <= MAX_SPEC_LEN + # [num_tokens, vocab_size] + # NOTE(woosuk): `target_logits` can be updated in place inside the + # `compute_probs` function. + target_probs = compute_probs( + target_logits, + metadata.cu_num_draft_tokens, + sampling_metadata, + ) + + output_token_ids = rejection_sample( + metadata.draft_token_ids, + metadata.num_draft_tokens, + metadata.max_spec_len, + metadata.cu_num_draft_tokens, + draft_probs, + target_probs, + bonus_token_ids, + sampling_metadata, + ) + return output_token_ids + + @staticmethod + def parse_output( + output_token_ids: torch.Tensor, + vocab_size: int, + ) -> list[list[int]]: + """Parse the output of the rejection sampler. + + Args: + output_token_ids: The sampled token IDs in shape + [batch_size, max_spec_len + 1]. The rejected tokens are + replaced with `PLACEHOLDER_TOKEN_ID` by the rejection sampler + and will be filtered out in this function. + vocab_size: The size of the vocabulary. + + Returns: + A list of lists of token IDs. 
+ """ + output_token_ids_np = output_token_ids.cpu().numpy() + # Create mask for valid tokens. + valid_mask = ((output_token_ids_np != PLACEHOLDER_TOKEN_ID) & + (output_token_ids_np < vocab_size)) + outputs = [ + row[valid_mask[i]].tolist() + for i, row in enumerate(output_token_ids_np) + ] + return outputs + + +def rejection_sample( + # [num_tokens] + draft_token_ids: torch.Tensor, + # [batch_size] + num_draft_tokens: list[int], + max_spec_len: int, + # [batch_size] + cu_num_draft_tokens: torch.Tensor, + # [num_tokens, vocab_size] + draft_probs: Optional[torch.Tensor], + # [num_tokens, vocab_size] + target_probs: torch.Tensor, + # [batch_size, 1] + bonus_token_ids: torch.Tensor, + sampling_metadata: SamplingMetadata, +) -> torch.Tensor: + assert draft_token_ids.ndim == 1 + assert draft_probs is None or draft_probs.ndim == 2 + assert cu_num_draft_tokens.ndim == 1 + assert target_probs.ndim == 2 + + batch_size = len(num_draft_tokens) + num_tokens = draft_token_ids.shape[0] + vocab_size = target_probs.shape[-1] + device = target_probs.device + assert draft_token_ids.is_contiguous() + assert draft_probs is None or draft_probs.is_contiguous() + assert target_probs.is_contiguous() + assert bonus_token_ids.is_contiguous() + assert target_probs.shape == (num_tokens, vocab_size) + + # Create output buffer. + output_token_ids = torch.empty( + (batch_size, max_spec_len + 1), + dtype=torch.int32, # Consistent with SamplerOutput.sampled_token_ids. + device=device, + ) + output_token_ids.fill_(PLACEHOLDER_TOKEN_ID) + + if sampling_metadata.all_greedy: + is_greedy = None + else: + is_greedy = sampling_metadata.temperature == GREEDY_TEMPERATURE + if not sampling_metadata.all_random: + # Rejection sampling for greedy sampling requests. + target_argmax = target_probs.argmax(dim=-1) + rejection_greedy_sample_kernel[(batch_size, )]( + output_token_ids, + cu_num_draft_tokens, + draft_token_ids, + target_argmax, + bonus_token_ids, + is_greedy, + max_spec_len, + num_warps=1, + ) + if sampling_metadata.all_greedy: + return output_token_ids + + # Generate uniform probabilities for rejection sampling. + # [num_tokens] + uniform_probs = generate_uniform_probs( + num_tokens, + num_draft_tokens, + sampling_metadata.generators, + device, + ) + + # Sample recovered tokens for each position. + # [num_tokens] + recovered_token_ids = sample_recovered_tokens( + max_spec_len, + num_draft_tokens, + cu_num_draft_tokens, + draft_token_ids, + draft_probs, + target_probs, + sampling_metadata, + device, + ) + + # Rejection sampling for random sampling requests. + rejection_random_sample_kernel[(batch_size, )]( + output_token_ids, + cu_num_draft_tokens, + draft_token_ids, + draft_probs, + target_probs, + bonus_token_ids, + recovered_token_ids, + uniform_probs, + is_greedy, + max_spec_len, + vocab_size, + IS_NGRAM=draft_probs is None, + num_warps=1, + ) + return output_token_ids + + +def compute_probs( + logits: torch.Tensor, # [num_tokens, vocab_size] + cu_num_draft_tokens: torch.Tensor, # [batch_size] + sampling_metadata: SamplingMetadata, +) -> torch.Tensor: + """Compute probability distribution from logits based on sampling metadata. + + This function applies temperature scaling to the logits and converts + them to probabilities using softmax. For greedy decoding, it returns + the original logits. + + Args: + logits: Input logits tensor to be converted to probabilities. + cu_num_draft_tokens: Cumulative number of draft tokens. 
+ sampling_metadata: Metadata containing sampling parameters such as + temperature and whether greedy sampling is used. + + Returns: + torch.Tensor: Probability distribution (softmax of scaled logits) + if non-greedy sampling is used, otherwise returns the + original logits. + """ + assert logits.ndim == 2 + assert cu_num_draft_tokens.ndim == 1 + if sampling_metadata.all_greedy: + return logits + + num_tokens = logits.shape[0] + temperature = expand_batch_to_tokens( + sampling_metadata.temperature, + cu_num_draft_tokens, + num_tokens, + replace_from=GREEDY_TEMPERATURE, + replace_to=1, + ) + # NOTE(woosuk): Update `logits` in place to avoid allocating a new tensor. + logits.div_(temperature.unsqueeze(-1)) + + # Get expanded top_k and top_p tensors. + top_k = None + if sampling_metadata.top_k is not None: + top_k = expand_batch_to_tokens( + sampling_metadata.top_k, + cu_num_draft_tokens, + num_tokens, + ) + top_p = None + if sampling_metadata.top_p is not None: + top_p = expand_batch_to_tokens( + sampling_metadata.top_p, + cu_num_draft_tokens, + num_tokens, + ) + + # NOTE(woosuk): `apply_top_k_top_p` uses sorting to calculate the mask, + # which is slow for large vocab sizes. This may cause performance issues. + logits = apply_top_k_top_p(logits, top_k, top_p) + output_prob = logits.softmax(dim=-1, dtype=torch.float32) + return output_prob + + +def expand_batch_to_tokens( + x: torch.Tensor, # [batch_size] + cu_num_tokens: torch.Tensor, # [batch_size] + num_tokens: int, + replace_from: int = 0, + replace_to: int = 0, +) -> torch.Tensor: + """Expand [batch_size] tensor to [num_tokens] tensor based on the number of + tokens per batch in cu_num_tokens. + + For example, if x = [a, b, c] and cu_num_tokens = [2, 5, 6], then + num_tokens = 6, and expanded_x = [a, a, b, b, b, c]. + + Args: + x: [batch_size] tensor to expand. + cu_num_tokens: [batch_size] tensor containing the cumulative number of + tokens per batch. Each element represents the total number of + tokens up to and including that batch. + num_tokens: Total number of tokens. + replace_from: int = 0 + Value to be replaced if it is found in x. + replace_to: int = 0 + Value to replace with when replace_from is found. + Returns: + expanded_x: [num_tokens] tensor. + """ + batch_size = x.shape[0] + assert cu_num_tokens.shape[0] == batch_size + expanded_x = x.new_empty(num_tokens) + expand_kernel[(batch_size, )]( + expanded_x, + x, + cu_num_tokens, + replace_from, + replace_to, + MAX_NUM_TOKENS=MAX_SPEC_LEN, # To avoid recompilation. + num_warps=1, + ) + return expanded_x + + +def generate_uniform_probs( + num_tokens: int, + num_draft_tokens: list[int], + generators: dict[int, torch.Generator], + device: torch.device, +) -> torch.Tensor: + """ + Generates a batch of uniform random samples, with optional seeding + if available. + + This method creates a tensor of shape `(num_tokens, )` filled + with uniform random values in the range [0, 1). If `generators` is provided, + the requests with their own seeds will use the provided `torch.Generator` + for reproducibility. The samples for the other requests will be generated + without a seed. + + Args: + num_tokens : int + Total number of tokens. + num_draft_tokens : List[List[int]] + Number of draft tokens per request. + generators : Optional[Dict[int, torch.Generator]] + A dictionary mapping indices in the batch to + `torch.Generator` objects. + device : torch.device + The device on which to allocate the tensor. 
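A kernel-free NumPy counterpart of expand_batch_to_tokens above, using the [a, b, c] / cu_num_tokens = [2, 5, 6] example from its docstring (expand_batch_to_tokens_np is an illustrative name, not a helper from this patch):

import numpy as np

def expand_batch_to_tokens_np(x, cu_num_tokens, replace_from=0.0, replace_to=0.0):
    # Recover per-request counts from the cumulative sums: [2, 5, 6] -> [2, 3, 1].
    counts = np.diff(np.concatenate(([0], cu_num_tokens)))
    expanded = np.repeat(np.asarray(x), counts)
    # Mirror the replace_from/replace_to handling (e.g. greedy temperature -> 1).
    return np.where(expanded == replace_from, replace_to, expanded)

x = np.array([0.5, 0.9, -1.0])          # -1.0 marks a greedy request
cu = np.array([2, 5, 6])
print(expand_batch_to_tokens_np(x, cu, replace_from=-1.0, replace_to=1.0))
# -> [0.5 0.5 0.9 0.9 0.9 1. ]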
+ Returns: + uniform_rand : torch.Tensor + A tensor of shape `(num_tokens, )` containing uniform + random values in the range [0, 1). + """ + uniform_probs = torch.rand( + (num_tokens, ), + dtype=torch.float32, + device=device, + ) + start_idx = 0 + for req_idx, n in enumerate(num_draft_tokens): + # Do not generate random numbers for requests with no draft tokens. + # This can be important for reproducibility. + if n == 0: + continue + end_idx = start_idx + n + generator = generators.get(req_idx) + if generator is not None: + uniform_probs[start_idx:end_idx].uniform_(generator=generator) + start_idx = end_idx + return uniform_probs + + +def sample_recovered_tokens( + max_spec_len: int, + num_draft_tokens: list[int], + # [batch_size] + cu_num_draft_tokens: torch.Tensor, + # [num_tokens] + draft_token_ids: torch.Tensor, + # [num_tokens, vocab_size] + draft_probs: Optional[torch.Tensor], + # [num_tokens, vocab_size] + target_probs: torch.Tensor, + sampling_metadata: SamplingMetadata, + device: torch.device, +) -> torch.Tensor: + # NOTE(woosuk): Create only one distribution for each request. + batch_size = len(num_draft_tokens) + vocab_size = target_probs.shape[-1] + q = torch.empty( + (batch_size, vocab_size), + dtype=torch.float32, + device=device, + ) + q.exponential_() + for i, generator in sampling_metadata.generators.items(): + # Do not generate random numbers for requests with no draft tokens. + # This can be important for reproducibility. + if num_draft_tokens[i] > 0: + q[i].exponential_(generator=generator) + + recovered_token_ids = torch.empty_like(draft_token_ids) + sample_recovered_tokens_kernel[(batch_size, max_spec_len)]( + recovered_token_ids, + cu_num_draft_tokens, + draft_token_ids, + draft_probs, + target_probs, + q, + vocab_size, + triton.next_power_of_2(vocab_size), + IS_NGRAM=draft_probs is None, + ) + return recovered_token_ids + + +# NOTE(woosuk): Avoid specialization to prevent unnecessary recompilation. +def rejection_greedy_sample_kernel( + output_token_ids_ptr, # [batch_size, max_spec_len + 1] + cu_num_draft_tokens_ptr, # [batch_size] + draft_token_ids_ptr, # [num_tokens] + target_argmax_ptr, # [num_tokens] + bonus_token_ids_ptr, # [batch_size] + is_greedy_ptr, # [batch_size] or None + max_spec_len, +): + req_idx = tl.program_id(0) + # FIXME(woosuk): Because is_greedy_ptr is not None at profiling run, + # re-compilation may happen during runtime when is_greedy_ptr is None. + if is_greedy_ptr is None: + is_greedy = True + else: + is_greedy = tl.load(is_greedy_ptr + req_idx) + if not is_greedy: + # Early exit for non-greedy sampling requests. + return + + if req_idx == 0: + start_idx = 0 + else: + start_idx = tl.load(cu_num_draft_tokens_ptr + req_idx - 1) + end_idx = tl.load(cu_num_draft_tokens_ptr + req_idx) + num_draft_tokens = end_idx - start_idx + + rejected = False + for pos in range(num_draft_tokens): + if not rejected: + draft_token_id = tl.load(draft_token_ids_ptr + start_idx + pos) + target_argmax_id = tl.load(target_argmax_ptr + start_idx + pos) + tl.store(output_token_ids_ptr + req_idx * (max_spec_len + 1) + pos, + target_argmax_id) + if draft_token_id != target_argmax_id: + # Reject. + rejected = True + + if not rejected: + # If all tokens are accepted, append the bonus token. + bonus_token_id = tl.load(bonus_token_ids_ptr + req_idx) + tl.store( + output_token_ids_ptr + req_idx * (max_spec_len + 1) + + num_draft_tokens, bonus_token_id) + + +# NOTE(woosuk): Avoid specialization to prevent unnecessary recompilation. 
+def rejection_random_sample_kernel( + output_token_ids_ptr, # [batch_size, max_spec_len + 1] + cu_num_draft_tokens_ptr, # [batch_size] + draft_token_ids_ptr, # [num_tokens] + draft_probs_ptr, # [num_tokens, vocab_size] or None + target_probs_ptr, # [num_tokens, vocab_size] + bonus_token_ids_ptr, # [batch_size] + recovered_token_ids_ptr, # [num_tokens] + uniform_probs_ptr, # [num_tokens] + is_greedy_ptr, # [batch_size] + max_spec_len, + vocab_size, + IS_NGRAM, +): + req_idx = tl.program_id(0) + is_greedy = tl.load(is_greedy_ptr + req_idx) + if is_greedy: + # Early exit for greedy sampling requests. + return + + if req_idx == 0: + start_idx = 0 + else: + start_idx = tl.load(cu_num_draft_tokens_ptr + req_idx - 1) + end_idx = tl.load(cu_num_draft_tokens_ptr + req_idx) + num_draft_tokens = end_idx - start_idx + + rejected = False + for pos in range(num_draft_tokens): + if not rejected: + draft_token_id = tl.load(draft_token_ids_ptr + start_idx + pos) + if IS_NGRAM: + draft_prob = 1 + else: + draft_prob = tl.load(draft_probs_ptr + + (start_idx + pos) * vocab_size + + draft_token_id) + target_prob = tl.load(target_probs_ptr + + (start_idx + pos) * vocab_size + + draft_token_id) + uniform_prob = tl.load(uniform_probs_ptr + start_idx + pos) + # NOTE(woosuk): While the draft probability should never be 0, + # we check it to avoid NaNs. If it happens to be 0, we reject. + if draft_prob > 0 and target_prob / draft_prob >= uniform_prob: + # Accept. + token_id = draft_token_id + else: + # Reject. Use recovered token. + rejected = True + token_id = tl.load(recovered_token_ids_ptr + start_idx + pos) + tl.store(output_token_ids_ptr + req_idx * (max_spec_len + 1) + pos, + token_id) + + if not rejected: + # If all tokens are accepted, append the bonus token. + bonus_token_id = tl.load(bonus_token_ids_ptr + req_idx) + tl.store( + output_token_ids_ptr + req_idx * (max_spec_len + 1) + + num_draft_tokens, bonus_token_id) + + +# NOTE(woosuk): Avoid specialization to prevent unnecessary recompilation. +def expand_kernel( + output_ptr, # [num_tokens] + input_ptr, # [batch_size] + cu_num_tokens_ptr, # [batch_size] + replace_from, + replace_to, + MAX_NUM_TOKENS, +): + req_idx = tl.program_id(0) + if req_idx == 0: # noqa: SIM108 + start_idx = 0 + else: + start_idx = tl.load(cu_num_tokens_ptr + req_idx - 1) + end_idx = tl.load(cu_num_tokens_ptr + req_idx) + num_tokens = end_idx - start_idx + + src_val = tl.load(input_ptr + req_idx) + src_val = tl.where(src_val == replace_from, replace_to, src_val) + offset = tl.arange(0, MAX_NUM_TOKENS) + tl.store(output_ptr + start_idx + offset, + src_val, + mask=offset < num_tokens) + + +def sample_recovered_tokens_kernel( + output_token_ids_ptr, # [num_tokens] + cu_num_draft_tokens_ptr, # [batch_size] + draft_token_ids_ptr, # [num_tokens] + draft_probs_ptr, # [num_tokens, vocab_size] or None + target_probs_ptr, # [num_tokens, vocab_size] + q_ptr, # [batch_size, vocab_size] + vocab_size, + PADDED_VOCAB_SIZE, + IS_NGRAM, +): + req_idx = tl.program_id(0) + if req_idx == 0: + start_idx = 0 + else: + start_idx = tl.load(cu_num_draft_tokens_ptr + req_idx - 1) + end_idx = tl.load(cu_num_draft_tokens_ptr + req_idx) + num_draft_tokens = end_idx - start_idx + + # Early exit for out-of-range positions. 
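The accept/reject rule implemented by the random-sampling kernel above reduces, per request, to a short left-to-right scan: keep a draft token while target_prob / draft_prob >= u (and draft_prob > 0), emit the recovered token at the first rejection and stop, and append the bonus token only if nothing was rejected. A toy, host-side rendering of that rule (hypothetical helper with made-up probabilities, not the kernel itself):

def rejection_sample_one_request(draft_ids, draft_p, target_p, uniforms,
                                 recovered_ids, bonus_id):
    # Walk the draft tokens left to right; stop at the first rejection.
    out = []
    for pos, token in enumerate(draft_ids):
        accept = draft_p[pos] > 0 and target_p[pos] / draft_p[pos] >= uniforms[pos]
        if accept:
            out.append(token)
        else:
            out.append(recovered_ids[pos])  # recovered token replaces the draft
            return out
    out.append(bonus_id)                    # all drafts accepted -> bonus token
    return out

print(rejection_sample_one_request(
    draft_ids=[7, 3], draft_p=[0.5, 0.9], target_p=[0.6, 0.1],
    uniforms=[0.8, 0.5], recovered_ids=[11, 12], bonus_id=99))
# pos 0: 0.6/0.5 = 1.2 >= 0.8 -> accept 7; pos 1: 0.1/0.9 ~ 0.11 < 0.5 -> reject,
# emit recovered token 12 and stop, so the output is [7, 12].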
+ pos = tl.program_id(1) + if pos >= num_draft_tokens: + return + + vocab_offset = tl.arange(0, PADDED_VOCAB_SIZE) + if IS_NGRAM: + draft_token_id = tl.load(draft_token_ids_ptr + start_idx + pos) + orig_prob = tl.load(target_probs_ptr + (start_idx + pos) * vocab_size + + draft_token_id) + # Temporarily zero out the probability of the draft token. + # This is essentially the same as target_prob - draft_prob, except that + # n-gram does not have draft_prob. We regard it as 1. + tl.store( + target_probs_ptr + (start_idx + pos) * vocab_size + draft_token_id, + 0) + prob = tl.load(target_probs_ptr + (start_idx + pos) * vocab_size + + vocab_offset, + mask=vocab_offset < vocab_size, + other=0) + else: + draft_prob = tl.load(draft_probs_ptr + (start_idx + pos) * vocab_size + + vocab_offset, + mask=vocab_offset < vocab_size, + other=0) + target_prob = tl.load(target_probs_ptr + + (start_idx + pos) * vocab_size + vocab_offset, + mask=vocab_offset < vocab_size, + other=0) + prob = tl.maximum(target_prob - draft_prob, 0) + # NOTE(woosuk): We don't need `prob = prob / tl.sum(prob)` here because + # `tl.argmax` will select the maximum value. + + q = tl.load(q_ptr + req_idx * vocab_size + vocab_offset, + mask=vocab_offset < vocab_size, + other=float("-inf")) + recovered_id = tl.argmax(prob / q, axis=-1) + tl.store(output_token_ids_ptr + start_idx + pos, recovered_id) + + if IS_NGRAM: + # Restore the original probability. + tl.store( + target_probs_ptr + (start_idx + pos) * vocab_size + draft_token_id, + orig_prob) diff --git a/vllm_mindspore/v1/sample/sampler.py b/vllm_mindspore/v1/sample/sampler.py new file mode 100644 index 00000000..ed5dcb5b --- /dev/null +++ b/vllm_mindspore/v1/sample/sampler.py @@ -0,0 +1,10 @@ +import torch + +def apply_temperature( + self, + logits: torch.Tensor, + temp: torch.Tensor, +) -> torch.Tensor: + # logits.div_ will cause some error right now. + # So we use logits = logits.div instead of logits.div_. 
+ return logits.div(temp.unsqueeze(dim=1)) diff --git a/vllm_mindspore/v1/spec_decode/__init__.py b/vllm_mindspore/v1/spec_decode/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/vllm_mindspore/v1/spec_decode/eagle.py b/vllm_mindspore/v1/spec_decode/eagle.py new file mode 100644 index 00000000..7279bcaf --- /dev/null +++ b/vllm_mindspore/v1/spec_decode/eagle.py @@ -0,0 +1,258 @@ +# SPDX-License-Identifier: Apache-2.0 +import torch +import torch.nn as nn + +from vllm.config import VllmConfig +from vllm.forward_context import set_forward_context +from vllm_mindspore.v1.attention.backends.flash_attn import FlashAttentionMetadata +from vllm.v1.sample.metadata import SamplingMetadata + + +class EagleProposer: + + def __init__( + self, + vllm_config: VllmConfig, + device: torch.device, + ): + self.vllm_config = vllm_config + self.num_speculative_tokens = ( + vllm_config.speculative_config.num_speculative_tokens) + self.block_size = vllm_config.cache_config.block_size + self.arange = torch.arange(vllm_config.scheduler_config.max_num_seqs, + device=device) + + def propose( + self, + # [num_tokens] + target_token_ids: torch.Tensor, + # [num_tokens] + target_positions: torch.Tensor, + # [num_tokens, hidden_size] + target_hidden_states: torch.Tensor, + # [num_tokens] + target_slot_mapping: torch.Tensor, + # [batch_size] + next_token_ids: torch.Tensor, + # [batch_size + 1] starting with 0 + cu_num_tokens: torch.Tensor, + # [batch_size, max_num_blocks_per_req] + block_table: torch.Tensor, + sampling_metadata: SamplingMetadata, + ) -> tuple[torch.Tensor, torch.Tensor]: + num_tokens = target_token_ids.shape[0] + batch_size = next_token_ids.shape[0] + last_token_indices = cu_num_tokens[1:] - 1 + + input_ids = torch.empty_like(target_token_ids) + # Shift the input ids by one token. + # E.g., [a1, b1, b2, c1, c2, c3] -> [b1, b2, c1, c2, c3, c3] + input_ids[:-1] = target_token_ids[1:] + # Replace the last token with the next token. + # E.g., [b1, b2, c1, c2, c3, c3] -> [a2, b2, b3, c2, c3, c4] + input_ids[last_token_indices] = next_token_ids + + seq_lens = target_positions[last_token_indices] + 1 + # FIXME(woosuk): The below two ops cause synchronization. Optimize. + max_seq_len = seq_lens.max().item() + max_num_tokens = (cu_num_tokens[1:] - cu_num_tokens[:-1]).max().item() + attn_metadata = FlashAttentionMetadata( + num_actual_tokens=num_tokens, + max_query_len=max_num_tokens, + query_start_loc=cu_num_tokens, + max_seq_len=max_seq_len, + seq_lens=seq_lens, + block_table=block_table, + slot_mapping=target_slot_mapping, + # TODO(woosuk): Support cascade attention. + use_cascade=False, + common_prefix_len=0, + cu_prefix_query_lens=None, + prefix_kv_lens=None, + suffix_kv_lens=None, + ) + + with set_forward_context(attn_metadata, self.vllm_config): + hidden_states = self.model( + input_ids=input_ids, + hidden_states=target_hidden_states, + positions=target_positions, + ) + sample_hidden_states = hidden_states[last_token_indices] + logits = self.model.compute_logits(sample_hidden_states, None) + draft_token_ids, draft_probs = compute_probs_and_sample_next_token( + logits, sampling_metadata) + + # Early exit if there is only one draft token to be generated. + if self.num_speculative_tokens == 1: + # [batch_size, 1] and [batch_size, 1, vocab_size] + return draft_token_ids.view(-1, 1), draft_probs.unsqueeze(dim=1) + + # Generate the remaining draft tokens. 
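The shift-and-splice at the top of propose can be replayed on the example from its comments: with per-request prefix sums [0, 1, 3, 6], the last token of every request is replaced by the token sampled after it. A toy NumPy rendering (string labels stand in for token ids; this snippet is illustrative, not part of the patch):

import numpy as np

# Flattened token ids: request A = [a1], B = [b1, b2], C = [c1, c2, c3].
target_token_ids = np.array(["a1", "b1", "b2", "c1", "c2", "c3"])
cu_num_tokens = np.array([0, 1, 3, 6])          # prefix sums per request
next_token_ids = np.array(["a2", "b3", "c4"])   # token sampled after each request

input_ids = np.empty_like(target_token_ids)
input_ids[:-1] = target_token_ids[1:]           # shift left by one
input_ids[-1] = target_token_ids[-1]            # placeholder, overwritten below
last_token_indices = cu_num_tokens[1:] - 1      # [0, 2, 5]
input_ids[last_token_indices] = next_token_ids  # splice in the sampled tokens
print(input_ids)  # ['a2' 'b2' 'b3' 'c2' 'c3' 'c4']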
+ draft_token_ids_list = [draft_token_ids] + draft_probs_list = [draft_probs] + + positions = target_positions[last_token_indices] + hidden_states = sample_hidden_states + attn_metadata.num_actual_tokens = batch_size + attn_metadata.max_query_len = 1 + attn_metadata.query_start_loc = self.arange[:batch_size] + for _ in range(self.num_speculative_tokens - 1): + # Update the inputs. + input_ids = draft_token_ids_list[-1] + positions += 1 + attn_metadata.max_seq_len += 1 + attn_metadata.seq_lens += 1 + # Compute the slot mapping. + block_numbers = positions // self.block_size + block_ids = block_table.gather(dim=1, + index=block_numbers.view(-1, 1)) + block_ids = block_ids.view(-1) + attn_metadata.slot_mapping = (block_ids * self.block_size + + positions % self.block_size) + + # Run the model. + with set_forward_context(attn_metadata, self.vllm_config): + hidden_states = self.model( + input_ids=input_ids, + hidden_states=hidden_states, + positions=positions, + ) + logits = self.model.compute_logits(hidden_states, None) + draft_token_ids, probs = compute_probs_and_sample_next_token( + logits, sampling_metadata) + draft_token_ids_list.append(draft_token_ids) + draft_probs_list.append(probs) + + # [batch_size, num_speculative_tokens] + draft_token_ids = torch.stack(draft_token_ids_list, dim=1) + # [batch_size, num_speculative_tokens, vocab_size] + draft_probs = torch.stack(draft_probs_list, dim=1) + return draft_token_ids, draft_probs + + @staticmethod + def prepare_inputs( + # [batch_size + 1] + cu_target_query_lens: torch.Tensor, + # [batch_size] + num_rejected_tokens: torch.Tensor, + ) -> tuple[torch.Tensor, torch.Tensor]: + # cu_target_query_lens: [0, a, a + b, a + b + c] + # num_rejected_tokens: [n1, n2, n3] + # num_tokens_per_req: [a - n1, b - n2, c - n3] + # cu_num_tokens: [0, a - n1, a + b - n1 - n2, a + b + c - n1 - n2 - n3] + # token_indices: [0, 1, ..., a - n1 - 1, + # a, a + 1, ..., a + b - n2 - 1, + # a + b, a + b + 1, ..., a + b + c - n3 - 1] + + # [0, a, a + b, a + b + c] -> [a, b, c] + query_len_per_req = (cu_target_query_lens[1:] - + cu_target_query_lens[:-1]) + # [a, b, c] -> [a - n1, b - n2, c - n3] + num_tokens_per_req = query_len_per_req - num_rejected_tokens + + cu_num_tokens = torch.empty_like(cu_target_query_lens) + torch.cumsum(num_tokens_per_req, dim=0, out=cu_num_tokens[1:]) + cu_num_tokens[0] = 0 + + # FIXME(woosuk): Avoid synchronization. + num_tokens = cu_num_tokens[-1].item() + token_indices = torch.empty( + num_tokens, + dtype=torch.int32, + device=cu_num_tokens.device, + ) + + batch_size = num_rejected_tokens.shape[0] + BLOCK_SIZE = 1024 + prepare_input_kernel[(batch_size, )]( + token_indices, + cu_target_query_lens, + cu_num_tokens, + BLOCK_SIZE=BLOCK_SIZE, + ) + return cu_num_tokens, token_indices + + def load_model(self, target_model: nn.Module) -> None: + self.model = DummyEagleModel() + self.model.get_input_embeddings = target_model.get_input_embeddings + self.model.compute_logits = target_model.compute_logits + + +# FIXME(woosuk): This is a dummy model for testing. +# Remove this once we have a real model. +class DummyEagleModel(nn.Module): + + def __init__(self): + super().__init__() + + def forward( + self, + input_ids: torch.Tensor, + hidden_states: torch.Tensor, + positions: torch.Tensor, + ) -> torch.Tensor: + input_embeddings = self.get_input_embeddings(input_ids) + return hidden_states + input_embeddings # Dummy return. + + +# FIXME(woosuk): The logic here is duplicated with the main sampling code. 
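prepare_inputs above trims the rejected tail of every request and re-derives flat token indices on device. The same bookkeeping, done host-side in NumPy for one concrete case (prepare_inputs_np is an illustrative name; the real code uses a kernel precisely to avoid this host-side work):

import numpy as np

def prepare_inputs_np(cu_target_query_lens, num_rejected_tokens):
    # [0, a, a+b, a+b+c] -> [a, b, c] -> [a-n1, b-n2, c-n3]
    query_len_per_req = np.diff(cu_target_query_lens)
    num_tokens_per_req = query_len_per_req - num_rejected_tokens
    cu_num_tokens = np.concatenate(([0], np.cumsum(num_tokens_per_req)))
    # For each request keep the first (len - rejected) token positions.
    token_indices = np.concatenate([
        start + np.arange(keep)
        for start, keep in zip(cu_target_query_lens[:-1], num_tokens_per_req)
    ])
    return cu_num_tokens, token_indices

cu, idx = prepare_inputs_np(np.array([0, 3, 7, 10]), np.array([1, 2, 0]))
print(cu)   # [0 2 4 7]
print(idx)  # [0 1 3 4 7 8 9]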
+# We should refactor this to reuse the same sampling implementation. +def compute_probs_and_sample_next_token( + logits: torch.Tensor, + sampling_metadata: SamplingMetadata, +) -> tuple[torch.Tensor, torch.Tensor]: + if sampling_metadata.all_greedy: + # For greedy requests, draft_probs is not used in rejection sampling. + # Therefore, we can just return the logits. + probs = logits + next_token_ids = logits.argmax(dim=-1) + return next_token_ids, probs + + is_greedy = sampling_metadata.temperature == -1 + temperature = torch.where(is_greedy, 1.0, sampling_metadata.temperature) + logits.div_(temperature.view(-1, 1)) + probs = logits.softmax(dim=-1, dtype=torch.float32) + + # NOTE(woosuk): Currently, we ignore most of the sampling parameters in + # generating the draft tokens. We only use the temperature. While this + # could degrade the acceptance rate, it does not affect the distribution + # of the generated tokens after rejection sampling. + + # TODO(woosuk): Consider seeds. + q = torch.empty_like(probs) + q.exponential_() + next_token_ids = probs.div_(q).argmax(dim=-1).view(-1) + if not sampling_metadata.all_random: + greedy_token_ids = probs.argmax(dim=-1) + next_token_ids = torch.where( + is_greedy, + greedy_token_ids, + next_token_ids, + ) + return next_token_ids, probs + + +def prepare_input_kernel( + out_ptr, + cu_query_lens_ptr, + cu_num_tokens_ptr, + BLOCK_SIZE, +): + pid = tl.program_id(0) + + # [start_pos, end_pos) + start_pos = tl.load(cu_num_tokens_ptr + pid) + end_pos = tl.load(cu_num_tokens_ptr + pid + 1) + num_tokens = end_pos - start_pos + + index_start = tl.load(cu_query_lens_ptr + pid) + + num_blocks = tl.cdiv(num_tokens, BLOCK_SIZE) + for i in tl.range(num_blocks): + offset = i * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE) + tl.store( + out_ptr + start_pos + offset, + index_start + offset, + mask=offset < num_tokens, + ) diff --git a/vllm_mindspore/v1/utils.py b/vllm_mindspore/v1/utils.py new file mode 100644 index 00000000..6833c101 --- /dev/null +++ b/vllm_mindspore/v1/utils.py @@ -0,0 +1,12 @@ +import torch + +def copy_slice(from_tensor: torch.Tensor, to_tensor: torch.Tensor, + length: int) -> None: + """ + Copy the first length elements of a tensor into another tensor in a + non-blocking manner. + + Used to copy pinned CPU tensor data to pre-allocated GPU tensors. 
+ """ + to_tensor[:length] = from_tensor[:length] + return to_tensor diff --git a/vllm_mindspore/v1/worker/__init__.py b/vllm_mindspore/v1/worker/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/vllm_mindspore/v1/worker/block_table.py b/vllm_mindspore/v1/worker/block_table.py new file mode 100644 index 00000000..b865bae3 --- /dev/null +++ b/vllm_mindspore/v1/worker/block_table.py @@ -0,0 +1,93 @@ +# SPDX-License-Identifier: Apache-2.0 + +from typing import List + +import numpy as np +import torch + +from vllm.logger import init_logger + +logger = init_logger(__name__) + + +class BlockTable: + + def __init__( + self, + max_num_reqs: int, + max_num_blocks_per_req: int, + pin_memory: bool, + device: torch.device, + ): + self.max_num_reqs = max_num_reqs + self.max_num_blocks_per_req = max_num_blocks_per_req + self.pin_memory = pin_memory + self.device = device + + self.block_table = torch.zeros( + (max_num_reqs, max_num_blocks_per_req), + device=self.device, + dtype=torch.int32, + ) + self.block_table_cpu = torch.zeros( + (max_num_reqs, max_num_blocks_per_req), + device="cpu", + dtype=torch.int32, + pin_memory=pin_memory, + ) + self.block_table_np = self.block_table_cpu.numpy() + self.num_blocks_per_row = np.zeros(max_num_reqs, dtype=np.int32) + + def append_row( + self, + block_ids: List[int], + row_idx: int, + ) -> None: + if not block_ids: + return + num_blocks = len(block_ids) + start = self.num_blocks_per_row[row_idx] + self.num_blocks_per_row[row_idx] += num_blocks + self.block_table_np[row_idx, start:start + num_blocks] = block_ids + + def add_row(self, block_ids: List[int], row_idx: int) -> None: + self.num_blocks_per_row[row_idx] = 0 + self.append_row(block_ids, row_idx) + + def move_row(self, src: int, tgt: int) -> None: + num_blocks = self.num_blocks_per_row[src] + self.block_table_np[tgt, :num_blocks] = self.block_table_np[ + src, :num_blocks] + self.num_blocks_per_row[tgt] = num_blocks + + def swap_row(self, src: int, tgt: int) -> None: + num_blocks_src = self.num_blocks_per_row[src] + num_blocks_tgt = self.num_blocks_per_row[tgt] + self.num_blocks_per_row[src] = num_blocks_tgt + self.num_blocks_per_row[tgt] = num_blocks_src + + self.block_table_np[[src, tgt]] = self.block_table_np[[tgt, src]] + + def commit(self, num_reqs: int) -> None: + self.block_table_cpu[:num_reqs] = torch.from_numpy(self.block_table_np[:num_reqs]) + # self.block_table[:num_reqs] = self.block_table_cpu[:num_reqs] + self.block_table[:num_reqs] = torch.from_numpy(self.block_table_np[:num_reqs]) + + def clear(self) -> None: + self.block_table.fill_(0) + self.block_table_cpu.fill_(0) + self.block_table_np.fill(0) + + def get_device_tensor(self) -> torch.Tensor: + """Ruturns the device tensor of the block table.""" + return self.block_table + + def get_cpu_tensor(self) -> torch.Tensor: + """Returns the CPU tensor of the block table.""" + self.block_table_cpu.copy_(torch.from_numpy(self.block_table_np), + non_blocking=True) + return self.block_table_cpu + + def get_numpy_array(self) -> np.ndarray: + """Returns the numpy array of the block table.""" + return self.block_table_np diff --git a/vllm_mindspore/v1/worker/gpu_input_batch.py b/vllm_mindspore/v1/worker/gpu_input_batch.py new file mode 100644 index 00000000..6a2254f7 --- /dev/null +++ b/vllm_mindspore/v1/worker/gpu_input_batch.py @@ -0,0 +1,85 @@ +from typing import Dict, List, Optional, Set, Tuple, cast + +import numpy as np +import torch + +from vllm.lora.request import LoRARequest +from vllm.sampling_params import SamplingType +from 
vllm.v1.sample.metadata import SamplingMetadata +from vllm_mindspore.v1.utils import copy_slice +from vllm.v1.worker.block_table import BlockTable + +_SAMPLING_EPS = 1e-5 + + +def _make_sampling_metadata(self) -> SamplingMetadata: + num_reqs = self.num_reqs + if not self.all_greedy: + temperature = copy_slice(torch.from_numpy(self.temperature_cpu), self.temperature, num_reqs) + else: + temperature = None + if not self.no_top_p: + copy_slice(torch.from_numpy(self.top_p_cpu), self.top_p, num_reqs) + if not self.no_top_k: + copy_slice(torch.from_numpy(self.top_k_cpu), self.top_k, num_reqs) + if not self.no_min_p: + copy_slice(torch.from_numpy(self.min_p_cpu), self.min_p, num_reqs) + + if not self.no_penalties: + # Since syncing these tensors is expensive only copy them + # if necessary i.e. if there are requests which require + # penalties to be applied during sampling. + copy_slice(torch.from_numpy(self.frequency_penalties_cpu), + self.frequency_penalties, num_reqs) + copy_slice(torch.from_numpy(self.presence_penalties_cpu), + self.presence_penalties, num_reqs) + copy_slice(torch.from_numpy(self.repetition_penalties_cpu), + self.repetition_penalties, num_reqs) + + # The prompt tokens are used only for applying penalties during + # the sampling process. Hence copy these tensors only when + # there are requests which need penalties to be applied. + prompt_token_ids = self._make_prompt_token_ids_tensor() + else: + prompt_token_ids = None + + allowed_token_ids_mask: Optional[torch.Tensor] = None + if not self.no_allowed_token_ids: + assert self.allowed_token_ids_mask is not None + copy_slice(self.allowed_token_ids_mask_cpu_tensor, + self.allowed_token_ids_mask, num_reqs) + allowed_token_ids_mask = self.allowed_token_ids_mask[:num_reqs] + + return SamplingMetadata( + temperature=temperature, + all_greedy=self.all_greedy, + all_random=self.all_random, + top_p=None if self.no_top_p else self.top_p[:num_reqs], + top_k=None if self.no_top_k else self.top_k[:num_reqs], + min_p=None if self.no_min_p else self.min_p[:num_reqs], + generators=self.generators, + max_num_logprobs=self.max_num_logprobs, + prompt_token_ids=prompt_token_ids, + frequency_penalties=self.frequency_penalties[:num_reqs], + presence_penalties=self.presence_penalties[:num_reqs], + repetition_penalties=self.repetition_penalties[:num_reqs], + output_token_ids=cast(list[list[int]], self.req_output_token_ids), + min_tokens=self.min_tokens, + no_penalties=self.no_penalties, + logit_bias=self.logit_bias[:num_reqs], + allowed_token_ids_mask=allowed_token_ids_mask, + bad_words_token_ids=self.bad_words_token_ids, + ) + + +def _make_prompt_token_ids_tensor(self) -> torch.Tensor: + max_prompt_len = self.num_prompt_tokens[:self.num_reqs].max() + prompt_token_ids = np.empty((self.num_reqs, max_prompt_len), dtype=np.int64) + prompt_token_ids[:] = self.token_ids_cpu[:self. 
+ num_reqs, :max_prompt_len] + for i in range(self.num_reqs): + prompt_token_ids[i, self.num_prompt_tokens[i]:] = self.vocab_size + prompt_token_ids_cpu_tensor = torch.from_numpy(prompt_token_ids) + prompt_token_ids_cpu_tensor.move_to("Ascend", blocking=False) + return prompt_token_ids_cpu_tensor + diff --git a/vllm_mindspore/v1/worker/gpu_model_runner.py b/vllm_mindspore/v1/worker/gpu_model_runner.py new file mode 100644 index 00000000..988ee71c --- /dev/null +++ b/vllm_mindspore/v1/worker/gpu_model_runner.py @@ -0,0 +1,420 @@ + +from typing import Dict, Tuple, List +import gc +import numpy as np +import torch + +from mindspore import mutable +import mindspore as ms +from vllm_mindspore.v1.attention.backends.flash_attn import (FlashAttentionMetadata, + FlashAttentionBackend, + MLABackend) +from vllm_mindspore.utils import get_valid_dtype + +from vllm.v1.kv_cache_interface import FullAttentionSpec +from vllm.v1.utils import bind_kv_cache +from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargs +from vllm.logger import logger +from vllm.distributed.parallel_state import get_pp_group +from vllm.utils import cdiv +from vllm.logger import init_logger +from vllm.v1.worker.gpu_input_batch import CachedRequestState +from vllm.model_executor.layers.rotary_embedding import MRotaryEmbedding +from vllm.sampling_params import SamplingType + + +logger = init_logger(__name__) +def _prepare_inputs( + self, + scheduler_output: "SchedulerOutput", +) -> Tuple[FlashAttentionMetadata, torch.Tensor]: + total_num_scheduled_tokens = scheduler_output.total_num_scheduled_tokens + assert total_num_scheduled_tokens > 0 + num_reqs = self.input_batch.num_reqs + assert num_reqs > 0 + + modified_batch = self.attn_metadata_builder.reorder_batch( + self.input_batch, scheduler_output) + if modified_batch: + self.input_batch.refresh_sampling_metadata() + + # OPTIMIZATION: Start copying the block table first. + # This way, we can overlap the copy with the following CPU operations. + self.input_batch.block_table.commit(num_reqs) + + # Get the number of scheduled tokens for each request. + # TODO: The Python loop can be slow. Optimize. + num_scheduled_tokens = np.empty(num_reqs, dtype=np.int32) + max_num_scheduled_tokens = 0 + for i, req_id in enumerate(self.input_batch.req_ids): + num_tokens = scheduler_output.num_scheduled_tokens[req_id] + num_scheduled_tokens[i] = num_tokens + max_num_scheduled_tokens = max(max_num_scheduled_tokens, + num_tokens) + + # Get request indices. + # E.g., [2, 5, 3] -> [0, 0, 1, 1, 1, 1, 1, 2, 2, 2] + req_indices = np.repeat(self.arange_np[:num_reqs], + num_scheduled_tokens) + + # Get batched arange. + # E.g., [2, 5, 3] -> [0, 1, 0, 1, 2, 3, 4, 0, 1, 2] + # Equivalent to but faster than: + # np.concatenate([np.arange(n) for n in num_scheduled_tokens]) + # Step 1. [2, 5, 3] -> [2, 7, 10] + cu_num_tokens = np.cumsum(num_scheduled_tokens) + # Step 2. [2, 7, 10] -> [0, 0, 2, 2, 2, 2, 2, 7, 7, 7] + cumsums_offsets = np.repeat(cu_num_tokens - num_scheduled_tokens, + num_scheduled_tokens) + # Step 3. [0, 1, 0, 1, 2, 3, 4, 0, 1, 2] + arange = self.arange_np[:total_num_scheduled_tokens] - cumsums_offsets + + # Get positions. 
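The request-index and batched-arange construction above avoids a Python loop by combining np.repeat with cumulative sums. Replaying the [2, 5, 3] example from the comments as a standalone snippet:

import numpy as np

num_scheduled_tokens = np.array([2, 5, 3])
total = num_scheduled_tokens.sum()

# Request index of every scheduled token: [0 0 1 1 1 1 1 2 2 2]
req_indices = np.repeat(np.arange(len(num_scheduled_tokens)), num_scheduled_tokens)

# Batched arange without a loop: subtract each request's start offset from the
# global arange.  [2 5 3] -> cumsum [2 7 10] -> offsets [0 0 2 2 2 2 2 7 7 7]
cu_num_tokens = np.cumsum(num_scheduled_tokens)
offsets = np.repeat(cu_num_tokens - num_scheduled_tokens, num_scheduled_tokens)
arange = np.arange(total) - offsets

print(req_indices)  # [0 0 1 1 1 1 1 2 2 2]
print(arange)       # [0 1 0 1 2 3 4 0 1 2]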
+ positions_np = self.positions_np[:total_num_scheduled_tokens] + np.add(self.input_batch.num_computed_tokens_cpu[req_indices], + arange, + out=positions_np) + + if self.uses_mrope: + self._calc_mrope_positions(scheduler_output) + + if self.uses_mrope: + # Only relevant for models using M-RoPE (e.g, Qwen2-VL) + self.mrope_positions[:, :total_num_scheduled_tokens].copy_( + self.mrope_positions_cpu[:, :total_num_scheduled_tokens], + non_blocking=True) + else: + self.positions[:total_num_scheduled_tokens] = torch.from_numpy(positions_np) + + + # Get token indices. + # E.g., [0, 1, 0, 1, 2, 3, 4, 0, 1, 2] + # -> [0, 1, M, M + 1, M + 2, M + 3, M + 4, 2 * M, 2 * M + 1, 2 * M + 2] + # where M is the max_model_len. + token_indices = (positions_np + + req_indices * self.input_batch.token_ids_cpu.shape[1]) + + self.input_ids[:total_num_scheduled_tokens] = torch.from_numpy( + np.take(self.input_batch.token_ids_cpu.flatten(), + token_indices, + 0) + ) + + # Calculate the slot mapping. + # E.g., [0, 1, 0, 1, 2, 3, 4, 0, 1, 2] + # -> [0, 0, K, K, K + 1, K + 1, K + 2, 2 * K, 2 * K, 2 * K + 1] + # where K is the max_num_blocks_per_req and the block size is 2. + # NOTE(woosuk): We can't simply use `token_indices // block_size` here + # because M (max_model_len) is not necessarily divisible by block_size. + block_table_indices = (req_indices * self.max_num_blocks_per_req + + positions_np // self.block_size) + + + block_numbers = self.input_batch.block_table.block_table_np.flatten()[block_table_indices] + block_offsets = positions_np % self.block_size + np.add(block_numbers * self.block_size, + block_offsets, + out=self.slot_mapping_np[:total_num_scheduled_tokens]) + + # # Prepare the attention metadata. + self.query_start_loc_np[0] = 0 + self.query_start_loc_np[1:num_reqs + 1] = cu_num_tokens + + self.seq_lens_np[:num_reqs] = ( + self.input_batch.num_computed_tokens_cpu[:num_reqs] + + num_scheduled_tokens) + + common_prefix_len = 0 + if self.cascade_attn_enabled: + common_prefix_len = self._compute_cascade_attn_prefix_len( + num_scheduled_tokens, + scheduler_output.num_common_prefix_blocks, + ) + + attn_metadata = self.attn_metadata_builder.build( + num_reqs=num_reqs, + num_actual_tokens=total_num_scheduled_tokens, + max_query_len=max_num_scheduled_tokens, + common_prefix_len=common_prefix_len, + ) + + use_spec_decode = len( + scheduler_output.scheduled_spec_decode_tokens) > 0 + if not use_spec_decode: + # NOTE(woosuk): Due to chunked prefills, the batch may contain + # partial requests. While we should not sample any token + # from these partial requests, we do so for simplicity. + # We will ignore the sampled tokens from the partial requests. + # TODO: Support prompt logprobs. + logits_indices = attn_metadata.query_start_loc[1:] - 1 + spec_decode_metadata = None + else: + # Get the number of draft tokens for each request. + # Iterate over the dictionary rather than all requests since not all + # requests have draft tokens. 
+ num_draft_tokens = np.zeros(num_reqs, dtype=np.int32) + for req_id, draft_token_ids in ( + scheduler_output.scheduled_spec_decode_tokens.items()): + req_idx = self.input_batch.req_id_to_index[req_id] + num_draft_tokens[req_idx] = len(draft_token_ids) + + spec_decode_metadata = self._calc_spec_decode_metadata( + num_draft_tokens, cu_num_tokens) + logits_indices = spec_decode_metadata.logits_indices + + # Hot-Swap lora model + if self.lora_config: + self.set_active_loras(self.input_batch, num_scheduled_tokens) + + return attn_metadata, logits_indices, spec_decode_metadata + + +def create_block(shape, dtype, name=None, device=None): + from mindspore import mint + blocks = mint.empty(shape, dtype=dtype, device=device) + return blocks + +def initialize_kv_cache(self, kv_cache_config) -> None: + """ + Initialize KV cache based on `kv_cache_config`. + Args: + kv_cache_config: Configuration for the KV cache, including the KV + cache size of each layer + """ + if len(kv_cache_config.kv_cache_groups) > 1: + raise NotImplementedError( + "Hybrid models with more than one KV cache type are not " + "supported yet.") + + kv_caches: Dict[str, torch.Tensor] = {} + + for kv_cache_group in kv_cache_config.kv_cache_groups: + kv_cache_spec = kv_cache_group.kv_cache_spec + for layer_name in kv_cache_group.layer_names: + tensor_config = kv_cache_config.tensors[layer_name] + assert tensor_config.size % kv_cache_spec.page_size_bytes == 0 + num_blocks = tensor_config.size // kv_cache_spec.page_size_bytes + # `num_blocks` is the number of blocks the model runner can use. + # `kv_cache_config.num_blocks` is the number of blocks that + # KVCacheManager may allocate. + # Since different GPUs may have different number of layers and + # different memory capacities, `num_blocks` can be different on + # different GPUs, and `kv_cache_config.num_blocks` is set to + # the min of all `num_blocks`. Verify it here. + assert num_blocks >= kv_cache_config.num_blocks + if isinstance(kv_cache_spec, FullAttentionSpec): + kv_cache_shape = self.attn_backend.get_kv_cache_shape( + num_blocks, kv_cache_spec.block_size, kv_cache_spec.num_kv_heads, + kv_cache_spec.head_size) + dtype = kv_cache_spec.dtype + dtype = get_valid_dtype(dtype) + current_cache = [] + device_type = "CPU" if self.device.type == "cpu" else "Ascend" + for i in range(kv_cache_shape[0]): + cache_blocks = create_block( + kv_cache_shape[1:], dtype, device=device_type + ) + current_cache.append(mutable(cache_blocks)) + kv_caches[layer_name] = mutable(tuple(current_cache)) + else: + raise NotImplementedError + + bind_kv_cache( + kv_caches, + self.vllm_config.compilation_config.static_forward_context, + self.kv_caches) + + +def _update_states(self, scheduler_output: "SchedulerOutput") -> None: + """Update the cached states and the persistent batch with the scheduler + output. + + The updated states are used by the `_prepare_inputs` function to create + the input GPU tensors for the model. + + The SamplingMetadata is updated and copied to the GPU if there is a + new/resumed/paused/finished request in the batch. + """ + # Remove finished requests from the cached states. + for req_id in scheduler_output.finished_req_ids: + self.requests.pop(req_id, None) + self.encoder_cache.pop(req_id, None) + # Remove the finished requests from the persistent batch. + # NOTE(woosuk): There could be an edge case where finished_req_ids and + # scheduled_req_ids overlap. This happens when a request is aborted and + # then resubmitted with the same ID. 
In this case, we treat them as two + # distinct requests - clearing the cached states for the first request + # and handling the second as a new request. + removed_req_indices: List[int] = [] + for req_id in scheduler_output.finished_req_ids: + req_index = self.input_batch.remove_request(req_id) + if req_index is not None: + removed_req_indices.append(req_index) + + # Free the cached encoder outputs. + for req_id, input_id in scheduler_output.free_encoder_input_ids: + encoder_outputs = self.encoder_cache.get(req_id) + if encoder_outputs is not None: + encoder_outputs.pop(input_id, None) + if not encoder_outputs: + self.encoder_cache.pop(req_id, None) + + # Remove the unscheduled requests from the persistent batch. + # NOTE(woosuk): The unscheduled requests are either preempted requests + # or running requests that are not scheduled in this step. We remove + # them from the persistent batch but keep their cached states since + # they will be scheduled again sometime in the future. + scheduled_req_ids = scheduler_output.num_scheduled_tokens.keys() + cached_req_ids = self.input_batch.req_id_to_index.keys() + unscheduled_req_ids = cached_req_ids - scheduled_req_ids + # NOTE(woosuk): The persistent batch optimization assumes that + # consecutive batches contain mostly the same requests. If batches + # have low request overlap (e.g., alternating between two distinct + # sets of requests), this optimization becomes very inefficient. + for req_id in unscheduled_req_ids: + req_index = self.input_batch.remove_request(req_id) + assert req_index is not None + removed_req_indices.append(req_index) + + req_ids_to_add: List[str] = [] + # Add new requests to the cached states. + for new_req_data in scheduler_output.scheduled_new_reqs: + req_id = new_req_data.req_id + sampling_params = new_req_data.sampling_params + if sampling_params.sampling_type == SamplingType.RANDOM_SEED: + generator = torch.Generator(device=self.device) + generator.manual_seed(sampling_params.seed) + else: + generator = None + + self.requests[req_id] = CachedRequestState( + req_id=req_id, + prompt_token_ids=new_req_data.prompt_token_ids, + prompt=new_req_data.prompt, + mm_inputs=new_req_data.mm_inputs, + mm_positions=new_req_data.mm_positions, + sampling_params=sampling_params, + generator=generator, + block_ids=new_req_data.block_ids, + num_computed_tokens=new_req_data.num_computed_tokens, + output_token_ids=[], + lora_request=new_req_data.lora_request, + ) + + # Only relevant for models using M-RoPE (e.g, Qwen2-VL) + if self.uses_mrope: + image_grid_thw = [] + video_grid_thw = [] + second_per_grid_ts = [] + for mm_input in self.requests[req_id].mm_inputs: + if mm_input.get("image_grid_thw") is not None: + image_grid_thw.extend( + mm_input["image_grid_thw"].tolist()) + if mm_input.get("video_grid_thw") is not None: + video_grid_thw.extend( + mm_input["video_grid_thw"].tolist()) + if mm_input.get("second_per_grid_ts") is not None: + second_per_grid_ts.extend( + mm_input["second_per_grid_ts"]) + + hf_config = self.model_config.hf_config + + self.requests[req_id].mrope_positions, \ + self.requests[req_id].mrope_position_delta = \ + MRotaryEmbedding.get_input_positions_tensor( + self.requests[req_id].prompt_token_ids, + hf_config=hf_config, + image_grid_thw=image_grid_thw, + video_grid_thw=video_grid_thw, + second_per_grid_ts=second_per_grid_ts, + ) + + req_ids_to_add.append(req_id) + + # Update the states of the running/resumed requests. 
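+    # For each scheduled cached request: refresh num_computed_tokens, append the
+    # token(s) sampled in the previous step to output_token_ids, and extend or
+    # replace its block IDs. Requests that are no longer in the persistent batch
+    # are queued in req_ids_to_add and re-added further below.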
+ for req_data in scheduler_output.scheduled_cached_reqs: + req_id = req_data.req_id + req_state = self.requests[req_id] + + # Update the cached states. + num_computed_tokens = req_data.num_computed_tokens + req_state.num_computed_tokens = num_computed_tokens + # Add the sampled token(s) from the previous step (if any). + # This doesn't include "unverified" tokens like spec decode tokens. + num_new_tokens = (num_computed_tokens + + len(req_data.new_token_ids) - + req_state.num_tokens) + if num_new_tokens == 1: + # Avoid slicing list in most common case. + req_state.output_token_ids.append(req_data.new_token_ids[-1]) + elif num_new_tokens > 0: + req_state.output_token_ids.extend( + req_data.new_token_ids[-num_new_tokens:]) + # Update the block IDs. + if not req_data.resumed_from_preemption: + # Append the new blocks to the existing block IDs. + req_state.block_ids.extend(req_data.new_block_ids) + else: + # The request is resumed from preemption. + # Replace the existing block IDs with the new ones. + req_state.block_ids = req_data.new_block_ids + + req_index = self.input_batch.req_id_to_index.get(req_id) + if req_index is None: + # The request is not in the persistent batch. + # The request was either preempted and resumed later, or was not + # scheduled in the previous step and needs to be added again. + req_ids_to_add.append(req_id) + continue + + # Update the persistent batch. + self.input_batch.num_computed_tokens_cpu[req_index] = ( + num_computed_tokens) + start_index = (len(req_state.block_ids) - + len(req_data.new_block_ids)) + self.input_batch.block_table.append_row(req_data.new_block_ids, + req_index) + # Add new_token_ids to token_ids_cpu. + start_token_index = num_computed_tokens + end_token_index = num_computed_tokens + len(req_data.new_token_ids) + self.input_batch.token_ids_cpu[ + req_index, + start_token_index:end_token_index] = req_data.new_token_ids + + self.input_batch.num_tokens_no_spec[req_index] = end_token_index + # Add spec_token_ids to token_ids_cpu. + spec_token_ids = scheduler_output.scheduled_spec_decode_tokens.get( + req_id, ()) + if spec_token_ids: + start_index = end_token_index + end_token_index += len(spec_token_ids) + self.input_batch.token_ids_cpu[ + req_index, start_index:end_token_index] = spec_token_ids + # NOTE(woosuk): `num_tokens` here may include spec decode tokens. + self.input_batch.num_tokens[req_index] = end_token_index + + + # self.input_batch.token_ids_cpu_tensor.copy_(torch.from_numpy(self.input_batch.token_ids_cpu)) + # Check if the batch has changed. If not, we can skip copying the + # sampling metadata from CPU to GPU. + batch_changed = len(removed_req_indices) > 0 or len(req_ids_to_add) > 0 + + # Add the new or resumed requests to the persistent batch. + # The smaller empty indices are filled first. + removed_req_indices = sorted(removed_req_indices, reverse=True) + for req_id in req_ids_to_add: + req_state = self.requests[req_id] + if removed_req_indices: + # Fill the empty index. + req_index = removed_req_indices.pop() + else: + # Append to the end. + req_index = None + self.input_batch.add_request(req_state, req_index) + + # Condense the batched states if there are empty indices. 
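+    # Removals above leave holes in the persistent batch arrays; condensing
+    # compacts the remaining requests so their indices stay contiguous.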
+ if removed_req_indices: + self.input_batch.condense(removed_req_indices) + + if batch_changed: + self.input_batch.refresh_sampling_metadata() -- Gitee From ae926b82e7793a4ab0ef78ee995087194b91f4c2 Mon Sep 17 00:00:00 2001 From: lvhaoyu Date: Sun, 13 Apr 2025 10:31:39 +0800 Subject: [PATCH 07/55] add check before calling STR_DTYPE_TO_TENSOR_DTYPE[] --- vllm_mindspore/worker/model_runner.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm_mindspore/worker/model_runner.py b/vllm_mindspore/worker/model_runner.py index 561fd202..55bb26ec 100644 --- a/vllm_mindspore/worker/model_runner.py +++ b/vllm_mindspore/worker/model_runner.py @@ -132,7 +132,8 @@ def _dummy_run(self, # tensor aliasing. kv_cache_dtype = self.model_config.dtype if self.cache_config.cache_dtype == "auto" \ else self.cache_config.cache_dtype - kv_cache_dtype = STR_DTYPE_TO_TENSOR_DTYPE[kv_cache_dtype] + if kv_cache_dtype in STR_DTYPE_TO_TENSOR_DTYPE: + kv_cache_dtype = STR_DTYPE_TO_TENSOR_DTYPE[kv_cache_dtype] block_size = self.cache_config.block_size num_kv_heads = self.model_config.get_num_kv_heads(self.parallel_config) head_size = self.model_config.get_head_size() -- Gitee From 2814e08082d0634769b6528b4b2bd6c9b3801f4f Mon Sep 17 00:00:00 2001 From: candyhong <1102229410@qq.com> Date: Wed, 16 Apr 2025 10:10:22 +0000 Subject: [PATCH 08/55] Supporting mp for v1 --- vllm_mindspore/__init__.py | 4 ++ vllm_mindspore/distributed/shm_broadcast.py | 71 +++++++++++++++++++++ 2 files changed, 75 insertions(+) create mode 100644 vllm_mindspore/distributed/shm_broadcast.py diff --git a/vllm_mindspore/__init__.py b/vllm_mindspore/__init__.py index a943c92f..90243e23 100644 --- a/vllm_mindspore/__init__.py +++ b/vllm_mindspore/__init__.py @@ -291,6 +291,10 @@ from vllm_mindspore.v1.sample.sampler import apply_temperature import vllm.v1.sample.sampler vllm.v1.sample.sampler.Sampler.apply_temperature = apply_temperature +from vllm_mindspore.distributed.shm_broadcast import initialize_ShmRingBuffer +from vllm.distributed.device_communicators.shm_broadcast import ShmRingBuffer +ShmRingBuffer.__init__ = initialize_ShmRingBuffer + from .utils import check_ready from vllm_mindspore.engine.multiprocessing.engine import cleanup diff --git a/vllm_mindspore/distributed/shm_broadcast.py b/vllm_mindspore/distributed/shm_broadcast.py new file mode 100644 index 00000000..bf010812 --- /dev/null +++ b/vllm_mindspore/distributed/shm_broadcast.py @@ -0,0 +1,71 @@ +#!/usr/bin/env python3 +# encoding: utf-8 +# Copyright 2025 Huawei Technologies Co., Ltd +# Copyright 2024 The vLLM team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================ +import numpy as np +from typing import Optional +from multiprocessing import shared_memory +from unittest.mock import patch +from vllm.logger import init_logger + +logger = init_logger(__name__) + +def initialize_ShmRingBuffer(self, + n_reader: int, + max_chunk_bytes: int, + max_chunks: int, + name: Optional[str] = None): + logger.info("Entering mindspore shm_broadcast") + self.n_reader = n_reader + self.metadata_size = 1 + n_reader + self.max_chunk_bytes = max_chunk_bytes + self.max_chunks = max_chunks + self.total_bytes_of_buffer = (self.max_chunk_bytes + + self.metadata_size) * self.max_chunks + self.data_offset = 0 + self.metadata_offset = self.max_chunk_bytes * self.max_chunks + + if name is None: + # we are creating a buffer + self.is_creator = True + self.shared_memory = shared_memory.SharedMemory( + create=True, size=self.total_bytes_of_buffer) + # initialize the metadata section to 0 + with memoryview(self.shared_memory.buf[self.metadata_offset:] + ) as metadata_buffer: + np.frombuffer(metadata_buffer, dtype=np.uint8).fill(0) + else: + # we are opening an existing buffer + self.is_creator = False + # fix to https://stackoverflow.com/q/62748654/9191338 + # Python incorrectly tracks shared memory even if it is not + # created by the process. The following patch is a workaround. + with patch("multiprocessing.resource_tracker.register", + lambda *args, **kwargs: None): + try: + self.shared_memory = shared_memory.SharedMemory(name=name) + # See https://docs.python.org/3/library/multiprocessing.shared_memory.html # noqa + # Some platforms allocate memory based on page size, + # so the shared memory block size may be larger or equal + # to the requested size. The size parameter is ignored + # when attaching to an existing block. 
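+                # Hence only require the attached block to be at least as
+                # large as the ring buffer, not exactly equal.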
+ assert (self.shared_memory.size + >= self.total_bytes_of_buffer) + except FileNotFoundError: + # we might deserialize the object in a different node + # in this case, this object is not used, + # and we should suppress the error + pass -- Gitee From 14b5446e69ef41b2ba48621c2ea225a391e6be0b Mon Sep 17 00:00:00 2001 From: lvhaoyu Date: Sun, 13 Apr 2025 14:34:25 +0800 Subject: [PATCH 09/55] use cpu scoket to replace ProcessGroupGloo --- vllm_mindspore/__init__.py | 11 +- vllm_mindspore/config.py | 109 ++++++++++++++++++ vllm_mindspore/executor/ray_gpu_executor.py | 1 + .../models/mf_models/weight_processor.py | 5 +- vllm_mindspore/v1/engine/__init__.py | 0 vllm_mindspore/v1/engine/core.py | 6 + vllm_mindspore/v1/worker/gpu_worker.py | 41 +++++++ 7 files changed, 169 insertions(+), 4 deletions(-) create mode 100644 vllm_mindspore/v1/engine/__init__.py create mode 100644 vllm_mindspore/v1/engine/core.py create mode 100644 vllm_mindspore/v1/worker/gpu_worker.py diff --git a/vllm_mindspore/__init__.py b/vllm_mindspore/__init__.py index 90243e23..ce06a685 100644 --- a/vllm_mindspore/__init__.py +++ b/vllm_mindspore/__init__.py @@ -56,6 +56,10 @@ import vllm.engine.arg_utils from vllm_mindspore.engine.arg_utils import _is_v1_supported_oracle vllm.engine.arg_utils.EngineArgs._is_v1_supported_oracle = _is_v1_supported_oracle +import vllm.v1.engine.core +from vllm_mindspore.v1.engine.core import shutdown +vllm.v1.engine.core.DPEngineCoreProc.shutdown = shutdown + from vllm_mindspore.utils import ( direct_register_custom_op, make_tensor_with_pad, @@ -197,13 +201,15 @@ vllm.engine.async_llm_engine.initialize_ray_cluster = initialize_ray_cluster from .config import _verify_quantization, _verify_args, vllm_config_post_init, model_post_init, \ - _get_and_verify_dtype + _get_and_verify_dtype, stateless_init_dp_group, has_unfinished_dp vllm.config.ModelConfig._verify_quantization = _verify_quantization vllm.config.VllmConfig.__post_init__ = vllm_config_post_init vllm.config.SchedulerConfig._verify_args = _verify_args vllm.config.CompilationConfig.model_post_init = model_post_init vllm.config._get_and_verify_dtype = _get_and_verify_dtype +vllm.config.ParallelConfig.stateless_init_dp_group = stateless_init_dp_group +vllm.config.ParallelConfig.has_unfinished_dp = has_unfinished_dp from .utils import update_modules from vllm_mindspore.attention.backends import ms_attn @@ -268,9 +274,10 @@ vllm.v1.worker.gpu_input_batch.InputBatch._make_prompt_token_ids_tensor = _make_ vllm.v1.worker.gpu_model_runner.InputBatch._make_prompt_token_ids_tensor = _make_prompt_token_ids_tensor from vllm.v1.worker.gpu_worker import Worker +from vllm_mindspore.v1.worker.gpu_worker import init_device Worker.__init__ = wrapper_worker_init(Worker.__init__) -Worker.init_device = wrapper_worker_init_device(Worker.init_device) +Worker.init_device = wrapper_worker_init_device(init_device) import vllm.v1.utils diff --git a/vllm_mindspore/config.py b/vllm_mindspore/config.py index b6366434..21c2fded 100644 --- a/vllm_mindspore/config.py +++ b/vllm_mindspore/config.py @@ -18,6 +18,9 @@ from collections import Counter from typing import Union import sys +import socket +import pickle +import time import torch @@ -292,3 +295,109 @@ def _get_and_verify_dtype( torch_dtype = _STR_DTYPE_TO_TORCH_DTYPE[torch_dtype] return torch_dtype + + +class SocketProcessGroup: + def __init__(self, master_ip: str, master_port: int, rank: int, world_size: int): + self.master_ip = master_ip + self.master_port = master_port + self.rank = rank + self.world_size = 
world_size + self.sockets = [] + self.max_retries = 100 + self.retry_interval = 2 + + if self.rank == 0: + # Master node: create a server socket + self.server_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM) + self.server_socket.bind((self.master_ip, self.master_port)) + self.server_socket.listen(self.world_size - 1) + print(f"Master node listening on {self.master_ip}:{self.master_port}") + else: + # Worker node: connect to the master + self.client_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM) + retries = 0 + while retries < self.max_retries: + try: + self.client_socket.connect((self.master_ip, self.master_port)) + print(f"Worker {self.rank} connected to master at {self.master_ip}:{self.master_port}") + break + except ConnectionRefusedError: + retries += 1 + print(f"Worker {self.rank} failed to connect to master. Retrying in {self.retry_interval} seconds... ({retries}/{self.max_retries})") + time.sleep(self.retry_interval) + else: + raise ConnectionError(f"Worker {self.rank} could not connect to master at {self.master_ip}:{self.master_port} after {self.max_retries} retries.") + + def initialize_group(self): + if self.rank == 0: + # Master node: accept connections from workers + for _ in range(self.world_size - 1): + conn, addr = self.server_socket.accept() + print(f"Accepted connection from {addr}") + self.sockets.append(conn) + else: + # Worker node: no additional setup needed + pass + + def close(self): + if self.rank == 0: + # Master node: close all worker connections + for conn in self.sockets: + conn.close() + self.server_socket.close() + else: + # Worker node: close connection to master + self.client_socket.close() + + +def stateless_init_dp_group(self) -> SocketProcessGroup: + """ + Initialize a stateless data parallel process group using sockets. + """ + dp_group = SocketProcessGroup( + self.data_parallel_master_ip, + self.get_next_dp_init_port(), + self.data_parallel_rank, + self.data_parallel_size) + dp_group.initialize_group() + return dp_group + + +def has_unfinished_dp(dp_group: SocketProcessGroup, has_unfinished: bool) -> bool: + """ + Check if any process in the group has unfinished tasks. + """ + if dp_group.rank == 0: + # Master node: collect results from workers + results = [has_unfinished] + for conn in dp_group.sockets: + data = conn.recv(1024) + worker_result = pickle.loads(data) + results.append(worker_result) + + # Perform OR operation (any True means unfinished) + aggregated_result = any(results) + + # Broadcast the result back to workers + for conn in dp_group.sockets: + conn.send(pickle.dumps(aggregated_result)) + + return aggregated_result + else: + # Worker node: send result to master + dp_group.client_socket.send(pickle.dumps(has_unfinished)) + + # Receive aggregated result from master + data = dp_group.client_socket.recv(1024) + aggregated_result = pickle.loads(data) + return aggregated_result + +def stateless_destroy_socket_process_group(dp_group: "SocketProcessGroup") -> None: + """ + Destroy the socket-based data parallel process group. + This function closes all sockets and cleans up resources. 
+ """ + if dp_group: + dp_group.close() + print(f"Socket process group for rank {dp_group.rank} destroyed.") diff --git a/vllm_mindspore/executor/ray_gpu_executor.py b/vllm_mindspore/executor/ray_gpu_executor.py index d9c2affd..76a00322 100644 --- a/vllm_mindspore/executor/ray_gpu_executor.py +++ b/vllm_mindspore/executor/ray_gpu_executor.py @@ -225,6 +225,7 @@ def ms_init_workers_ray(self, placement_group: "PlacementGroup", "TPU_HOST_BOUNDS", "VLLM_USE_V1", "VLLM_TRACE_FUNCTION", + "ASCEND_RT_VISIBLE_DEVICES", ]: if name in os.environ: args[name] = os.environ[name] diff --git a/vllm_mindspore/model_executor/models/mf_models/weight_processor.py b/vllm_mindspore/model_executor/models/mf_models/weight_processor.py index 9b0aab3a..82a104f6 100644 --- a/vllm_mindspore/model_executor/models/mf_models/weight_processor.py +++ b/vllm_mindspore/model_executor/models/mf_models/weight_processor.py @@ -21,6 +21,7 @@ import os from safetensors import safe_open from mindspore.communication.management import get_rank, get_group_size +from vllm.distributed import get_tensor_model_parallel_world_size, get_tensor_model_parallel_rank class BaseWeightProcessor: r""" @@ -35,8 +36,8 @@ class BaseWeightProcessor: self.config = config self.network = network self.is_quant = is_quant - self.tp_group_size = get_group_size() - self.rank_id = get_rank() + self.tp_group_size = get_tensor_model_parallel_world_size() + self.rank_id = get_tensor_model_parallel_rank() self.parameter_dict = {} self.file_handles = {} diff --git a/vllm_mindspore/v1/engine/__init__.py b/vllm_mindspore/v1/engine/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/vllm_mindspore/v1/engine/core.py b/vllm_mindspore/v1/engine/core.py new file mode 100644 index 00000000..03c6ff77 --- /dev/null +++ b/vllm_mindspore/v1/engine/core.py @@ -0,0 +1,6 @@ +from vllm_mindspore.config import stateless_destroy_socket_process_group + +def shutdown(self): + super(self.__class__, self).shutdown() + if dp_group := getattr(self, "dp_group", None): + stateless_destroy_socket_process_group(dp_group) diff --git a/vllm_mindspore/v1/worker/gpu_worker.py b/vllm_mindspore/v1/worker/gpu_worker.py new file mode 100644 index 00000000..4cdeca4d --- /dev/null +++ b/vllm_mindspore/v1/worker/gpu_worker.py @@ -0,0 +1,41 @@ +# SPDX-License-Identifier: Apache-2.0 +"""A GPU worker class""" + +import gc +import torch +# import mindspore +from vllm.logger import init_logger + +logger = init_logger(__name__) + +def init_device(self): + from vllm.config import get_current_vllm_config + from vllm.model_executor import set_random_seed + from vllm.v1.worker.gpu_model_runner import GPUModelRunner + from vllm.v1.worker.gpu_worker import ( + _check_if_gpu_supports_dtype, init_worker_distributed_environment) + + config = get_current_vllm_config() + if config is not None and config.parallel_config.data_parallel_size > 1: + device_id = self.parallel_config.data_parallel_rank_local * self.parallel_config.world_size + self.local_rank + self.device = torch.device(f"cuda:{device_id}") + else: + self.device = torch.device(f"cuda:{self.local_rank}") + torch.cuda.set_device(self.device) + + _check_if_gpu_supports_dtype(self.model_config.dtype) + gc.collect() + torch.cuda.empty_cache() + self.init_gpu_memory = torch.cuda.mem_get_info()[0] + + # Initialize the distributed environment. + init_worker_distributed_environment(self.parallel_config, self.rank, + self.distributed_init_method, + self.local_rank) + + # Set random seed. 
+ set_random_seed(self.model_config.seed) + + # Construct the model runner + self.model_runner: GPUModelRunner = GPUModelRunner( + self.vllm_config, self.device) -- Gitee From a1ac04dd40ee8b6cab2ed858609a660c8ec9d4f7 Mon Sep 17 00:00:00 2001 From: wusimin Date: Tue, 22 Apr 2025 17:37:27 +0800 Subject: [PATCH 10/55] =?UTF-8?q?=E8=A7=A3=E5=86=B3DP=E5=90=8E=E9=87=87?= =?UTF-8?q?=E6=A0=B7=E5=A4=84=E7=90=86=E6=8A=A5logits=E5=92=8Ctemp=20shape?= =?UTF-8?q?=E4=B8=8D=E5=8C=B9=E9=85=8D=E9=97=AE=E9=A2=98?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- vllm_mindspore/v1/worker/gpu_input_batch.py | 1 + 1 file changed, 1 insertion(+) diff --git a/vllm_mindspore/v1/worker/gpu_input_batch.py b/vllm_mindspore/v1/worker/gpu_input_batch.py index 6a2254f7..52c13344 100644 --- a/vllm_mindspore/v1/worker/gpu_input_batch.py +++ b/vllm_mindspore/v1/worker/gpu_input_batch.py @@ -16,6 +16,7 @@ def _make_sampling_metadata(self) -> SamplingMetadata: num_reqs = self.num_reqs if not self.all_greedy: temperature = copy_slice(torch.from_numpy(self.temperature_cpu), self.temperature, num_reqs) + temperature = temperature[:num_reqs] else: temperature = None if not self.no_top_p: -- Gitee From 3bf6dc9181eb33436ae8d93acfebaa89c1f36640 Mon Sep 17 00:00:00 2001 From: can-gaa-hou Date: Tue, 22 Apr 2025 03:18:26 +0000 Subject: [PATCH 11/55] fixing mp out-of-memory bugs --- vllm_mindspore/__init__.py | 4 ++++ vllm_mindspore/v1/worker/gpu_worker.py | 13 ++++++++++++- 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/vllm_mindspore/__init__.py b/vllm_mindspore/__init__.py index ce06a685..032415f0 100644 --- a/vllm_mindspore/__init__.py +++ b/vllm_mindspore/__init__.py @@ -302,6 +302,10 @@ from vllm_mindspore.distributed.shm_broadcast import initialize_ShmRingBuffer from vllm.distributed.device_communicators.shm_broadcast import ShmRingBuffer ShmRingBuffer.__init__ = initialize_ShmRingBuffer +from vllm_mindspore.v1.worker.gpu_worker import compile_or_warm_up_model +from vllm.v1.worker.gpu_worker import Worker +Worker.compile_or_warm_up_model = compile_or_warm_up_model + from .utils import check_ready from vllm_mindspore.engine.multiprocessing.engine import cleanup diff --git a/vllm_mindspore/v1/worker/gpu_worker.py b/vllm_mindspore/v1/worker/gpu_worker.py index 4cdeca4d..0395c339 100644 --- a/vllm_mindspore/v1/worker/gpu_worker.py +++ b/vllm_mindspore/v1/worker/gpu_worker.py @@ -3,11 +3,13 @@ import gc import torch -# import mindspore from vllm.logger import init_logger +from vllm.distributed.parallel_state import get_pp_group + logger = init_logger(__name__) + def init_device(self): from vllm.config import get_current_vllm_config from vllm.model_executor import set_random_seed @@ -39,3 +41,12 @@ def init_device(self): # Construct the model runner self.model_runner: GPUModelRunner = GPUModelRunner( self.vllm_config, self.device) + + +def compile_or_warm_up_model(self) -> None: + # MindSpore does not support cuda graph. No need to warm up the model. + # Since prefill is done previously, we do decode here. + default_max_num_reqs = 1 # For MindSpore, we only do one more decode here. 
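+    # Only the last pipeline-parallel rank produces logits and runs the
+    # sampler, so the dummy sampler run is restricted to that rank.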
+ if get_pp_group().is_last_rank: + self.model_runner._dummy_sampler_run(self.model_runner._dummy_run( + num_tokens=default_max_num_reqs)) -- Gitee From 0c5acbd1b2c820d47f967a5e25557eeb5eee060d Mon Sep 17 00:00:00 2001 From: cs123abc Date: Fri, 25 Apr 2025 14:22:50 +0800 Subject: [PATCH 12/55] =?UTF-8?q?topk=20=E6=B5=8B=E8=AF=95=E9=97=AE?= =?UTF-8?q?=E9=A2=98=E4=BF=AE=E5=A4=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- vllm_mindspore/v1/sample/ops/topk_topp_sampler.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm_mindspore/v1/sample/ops/topk_topp_sampler.py b/vllm_mindspore/v1/sample/ops/topk_topp_sampler.py index 57f0a81c..8b0835c0 100644 --- a/vllm_mindspore/v1/sample/ops/topk_topp_sampler.py +++ b/vllm_mindspore/v1/sample/ops/topk_topp_sampler.py @@ -87,11 +87,12 @@ def apply_top_k_only( # Set non-top-k rows to 1 so that we can gather. k = k.masked_fill(no_top_k_mask, 1) max_top_k = k.max() + int_max_top_k = max_top_k.item() # topk.values tensor has shape [batch_size, max_top_k]. # Convert top k to 0-based index in range [0, max_top_k). k_index = k.sub_(1).unsqueeze(1).expand(logits.shape[0], 1) - top_k_mask = logits.topk(max_top_k, dim=1)[0].gather(1, k_index.long()) + top_k_mask = logits.topk(int_max_top_k, dim=1)[0].gather(1, k_index.long()) # Handle non-topk rows. top_k_mask.masked_fill_(no_top_k_mask.unsqueeze(1), -float("inf")) logits.masked_fill_(logits < top_k_mask, -float("inf")) -- Gitee From e9d96bacbe4925d549894972fb255bf9a49936c7 Mon Sep 17 00:00:00 2001 From: tronzhang Date: Wed, 23 Apr 2025 20:53:43 +0800 Subject: [PATCH 13/55] adapte multi-nodes dp serve --- setup.py | 5 + vllm_dp/dp_scale_out.patch | 1416 +++++++++++++++++ vllm_dp/install_dp_vllm.sh | 41 + vllm_mindspore/config.py | 18 +- .../models/mf_models/deepseek_v3.py | 71 +- .../mf_models/deepseekv3_weight_processor.py | 285 +++- .../models/mf_models/weight_processor.py | 79 +- vllm_mindspore/scripts.py | 2 +- 8 files changed, 1833 insertions(+), 84 deletions(-) create mode 100755 vllm_dp/dp_scale_out.patch create mode 100644 vllm_dp/install_dp_vllm.sh diff --git a/setup.py b/setup.py index 5296b18b..2f304bcc 100644 --- a/setup.py +++ b/setup.py @@ -214,4 +214,9 @@ setup( ext_modules=_get_ext_modules(), include_package_data=True, package_data=package_data, + entry_points={ + "console_scripts": [ + "vllm-mindspore=vllm_mindspore.scripts:main", + ], + }, ) diff --git a/vllm_dp/dp_scale_out.patch b/vllm_dp/dp_scale_out.patch new file mode 100755 index 00000000..fb6520d2 --- /dev/null +++ b/vllm_dp/dp_scale_out.patch @@ -0,0 +1,1416 @@ +diff --git a/vllm/config.py b/vllm/config.py +index bd52fc90b..24fc1154d 100644 +--- a/vllm/config.py ++++ b/vllm/config.py +@@ -1429,16 +1429,27 @@ class LoadConfig: + class ParallelConfig: + """Configuration for the distributed execution.""" + +- pipeline_parallel_size: int = 1 # Number of pipeline parallel groups. +- tensor_parallel_size: int = 1 # Number of tensor parallel groups. +- data_parallel_size: int = 1 # Number of data parallel groups. +- data_parallel_rank: int = 0 # Rank of the data parallel group. +- # Local rank of the data parallel group, defaults to global rank. ++ pipeline_parallel_size: int = 1 ++ """Number of pipeline parallel groups.""" ++ tensor_parallel_size: int = 1 ++ """Number of tensor parallel groups.""" ++ data_parallel_size: int = 1 ++ """Number of data parallel groups. 
MoE layers will be sharded according to ++ the product of the tensor parallel size and data parallel size.""" ++ data_parallel_size_local: int = 1 ++ """Number of local data parallel groups.""" ++ data_parallel_rank: int = 0 ++ """Rank of the data parallel group.""" + data_parallel_rank_local: Optional[int] = None + # IP of the data parallel master. + data_parallel_master_ip: str = "127.0.0.1" +- data_parallel_master_port: int = 29500 # Port of the data parallel master. +- enable_expert_parallel: bool = False # Use EP instead of TP for MoE layers. ++ """IP of the data parallel master.""" ++ data_parallel_rpc_port: int = 29550 ++ """Port for data parallel messaging.""" ++ data_parallel_master_port: int = 29500 ++ """Port of the data parallel master.""" ++ enable_expert_parallel: bool = False ++ """Use expert parallelism instead of tensor parallelism for MoE layers.""" + + # Maximum number of multiple batches + # when load model sequentially. To avoid RAM OOM when using tensor +@@ -1475,12 +1486,16 @@ class ParallelConfig: + + # world_size is TPxPP, it affects the number of workers we create. + world_size: int = field(init=False) +- # world_size_across_dp is TPxPPxDP, it is the size of the world +- # including data parallelism. +- world_size_across_dp: int = field(init=False) ++ """world_size is TPxPP, it affects the number of workers we create.""" + + rank: int = 0 + ++ @property ++ def world_size_across_dp(self) -> int: ++ """world_size_across_dp is TPxPPxDP, it is the size of the world ++ including data parallelism.""" ++ return self.world_size * self.data_parallel_size ++ + def get_next_dp_init_port(self) -> int: + """ + We might need to initialize process groups in multiple +@@ -1533,16 +1548,20 @@ class ParallelConfig: + factors: list[Any] = [] + factors.append(self.pipeline_parallel_size) + factors.append(self.tensor_parallel_size) ++ factors.append(self.data_parallel_size) + return hashlib.sha256(str(factors).encode()).hexdigest() + + def __post_init__(self) -> None: + self.world_size = self.pipeline_parallel_size * \ + self.tensor_parallel_size + +- if self.data_parallel_size > 1: ++ if self.data_parallel_size_local > self.data_parallel_size: ++ raise ValueError( ++ "data_parallel_size_local must be <= data_parallel_size") ++ ++ if self.data_parallel_size > 1 or self.data_parallel_size_local == 0: + # Data parallel was specified in the engine args. + self.data_parallel_master_port = get_open_port() +- # TODO multi-node + else: + # Otherwise fall back to env vars (e.g. for offline SPMD case). + self.data_parallel_size = envs.VLLM_DP_SIZE +@@ -1551,8 +1570,6 @@ class ParallelConfig: + self.data_parallel_master_ip = envs.VLLM_DP_MASTER_IP + self.data_parallel_master_port = envs.VLLM_DP_MASTER_PORT + +- self.world_size_across_dp = self.world_size * self.data_parallel_size +- + if self.distributed_executor_backend == "external_launcher": + import os + os.environ["VLLM_ENABLE_V1_MULTIPROCESSING"] = "0" +diff --git a/vllm/distributed/utils.py b/vllm/distributed/utils.py +index cae1a2551..2bdcdf4f1 100644 +--- a/vllm/distributed/utils.py ++++ b/vllm/distributed/utils.py +@@ -21,6 +21,7 @@ from torch.distributed.rendezvous import rendezvous + + import vllm.envs as envs + from vllm.logger import init_logger ++from vllm.utils import get_tcp_uri + + logger = init_logger(__name__) + +@@ -282,7 +283,7 @@ def stateless_init_torch_distributed_process_group( + always formed with process 1, 2, ..., 8, and the additional communication + channel is formed with process 9 and 10. 
+ """ +- init_method = f"tcp://{host}:{port}" ++ init_method = get_tcp_uri(host, port) + backend = Backend(backend) # it is basically string + timeout = _get_default_timeout(backend) + +@@ -301,6 +302,9 @@ def stateless_init_torch_distributed_process_group( + prefix_store, + group_rank, + group_size, ++ ProcessGroup.Options( ++ backend=backend ++ ) + ) + + if backend == "gloo": +@@ -325,7 +329,7 @@ def stateless_init_torch_distributed_process_group( + else: + raise RuntimeError(f"Unsupported torch distributed backend: {backend}") + +- pg._set_default_backend(backend_type) ++ #pg._set_default_backend(backend_type) + backend_class._set_sequence_number_for_group() + + pg._register_backend(device, backend_type, backend_class) +diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py +index 89c9b6747..d6173763d 100644 +--- a/vllm/engine/arg_utils.py ++++ b/vllm/engine/arg_utils.py +@@ -113,11 +113,14 @@ class EngineArgs: + distributed_executor_backend: Optional[Union[str, + Type[ExecutorBase]]] = None + # number of P/D disaggregation (or other disaggregation) workers +- pipeline_parallel_size: int = 1 +- tensor_parallel_size: int = 1 +- data_parallel_size: int = 1 +- enable_expert_parallel: bool = False +- max_parallel_loading_workers: Optional[int] = None ++ pipeline_parallel_size: int = ParallelConfig.pipeline_parallel_size ++ tensor_parallel_size: int = ParallelConfig.tensor_parallel_size ++ data_parallel_size: int = ParallelConfig.data_parallel_size ++ data_parallel_size_local: Optional[int] = None ++ data_parallel_address: Optional[str] = None ++ data_parallel_rpc_port: Optional[int] = None ++ enable_expert_parallel: bool = ParallelConfig.enable_expert_parallel ++ max_parallel_loading_workers: Optional[int] = ParallelConfig.max_parallel_loading_workers + block_size: Optional[int] = None + enable_prefix_caching: Optional[bool] = None + prefix_caching_hash_algo: str = "builtin" +@@ -434,6 +437,21 @@ class EngineArgs: + 'MoE layers will be sharded according to the ' + 'product of the tensor-parallel-size and ' + 'data-parallel-size.') ++ parser.add_argument('--data-parallel-size-local', ++ '-dpl', ++ type=int, ++ help='Number of data parallel replicas ' ++ 'to run on this node.') ++ parser.add_argument('--data-parallel-address', ++ '-dpa', ++ type=str, ++ help='Address of data parallel cluster ' ++ 'head-node.') ++ parser.add_argument('--data-parallel-rpc-port', ++ '-dpp', ++ type=int, ++ help='Port for data parallel RPC ' ++ 'communication.') + parser.add_argument( + '--enable-expert-parallel', + action='store_true', +@@ -1186,10 +1204,30 @@ class EngineArgs: + # but we should not do this here. + placement_group = ray.util.get_current_placement_group() + ++ # Local DP size defaults to global DP size if not set. ++ data_parallel_size_local = self.data_parallel_size if ( ++ self.data_parallel_size_local ++ is None) else self.data_parallel_size_local ++ ++ # DP address, used in multi-node case for torch distributed group ++ # and ZMQ sockets. ++ data_parallel_address = self.data_parallel_address if ( ++ self.data_parallel_address ++ is not None) else ParallelConfig.data_parallel_master_ip ++ ++ # This port is only used when there are remote data parallel engines, ++ # otherwise the local IPC transport is used. 
++ data_parallel_rpc_port = self.data_parallel_rpc_port if ( ++ self.data_parallel_rpc_port ++ is not None) else ParallelConfig.data_parallel_rpc_port ++ + parallel_config = ParallelConfig( + pipeline_parallel_size=self.pipeline_parallel_size, + tensor_parallel_size=self.tensor_parallel_size, + data_parallel_size=self.data_parallel_size, ++ data_parallel_size_local=data_parallel_size_local, ++ data_parallel_master_ip=data_parallel_address, ++ data_parallel_rpc_port=data_parallel_rpc_port, + enable_expert_parallel=self.enable_expert_parallel, + max_parallel_loading_workers=self.max_parallel_loading_workers, + disable_custom_all_reduce=self.disable_custom_all_reduce, +diff --git a/vllm/entrypoints/cli/serve.py b/vllm/entrypoints/cli/serve.py +index e89ac4e21..ffcc2bb10 100644 +--- a/vllm/entrypoints/cli/serve.py ++++ b/vllm/entrypoints/cli/serve.py +@@ -1,14 +1,24 @@ + # SPDX-License-Identifier: Apache-2.0 + + import argparse ++import signal + + import uvloop + ++import vllm.envs as envs ++from vllm import AsyncEngineArgs + from vllm.entrypoints.cli.types import CLISubcommand + from vllm.entrypoints.openai.api_server import run_server + from vllm.entrypoints.openai.cli_args import (make_arg_parser, + validate_parsed_serve_args) +-from vllm.utils import FlexibleArgumentParser ++from vllm.logger import init_logger ++from vllm.usage.usage_lib import UsageContext ++from vllm.utils import FlexibleArgumentParser, get_tcp_uri ++from vllm.v1.engine.core import EngineCoreProc ++from vllm.v1.engine.core_client import CoreEngineProcManager ++from vllm.v1.executor.abstract import Executor ++ ++logger = init_logger(__name__) + + + class ServeSubcommand(CLISubcommand): +@@ -24,7 +34,10 @@ class ServeSubcommand(CLISubcommand): + if hasattr(args, 'model_tag') and args.model_tag is not None: + args.model = args.model_tag + +- uvloop.run(run_server(args)) ++ if args.headless: ++ run_headless(args) ++ else: ++ uvloop.run(run_server(args)) + + def validate(self, args: argparse.Namespace) -> None: + validate_parsed_serve_args(args) +@@ -41,6 +54,18 @@ class ServeSubcommand(CLISubcommand): + nargs='?', + help="The model tag to serve " + "(optional if specified in config)") ++ serve_parser.add_argument( ++ "--headless", ++ action='store_true', ++ default=False, ++ help="Run in headless mode. See multi-node data parallel " ++ "documentation for more details.") ++ serve_parser.add_argument( ++ '--data-parallel-start-rank', ++ '-dpr', ++ type=int, ++ default=0, ++ help='Starting data parallel rank for secondary nodes.') + serve_parser.add_argument( + "--config", + type=str, +@@ -56,3 +81,55 @@ class ServeSubcommand(CLISubcommand): + + def cmd_init() -> list[CLISubcommand]: + return [ServeSubcommand()] ++ ++ ++def run_headless(args: argparse.Namespace): ++ ++ # Create the EngineConfig. ++ engine_args = AsyncEngineArgs.from_cli_args(args) ++ usage_context = UsageContext.OPENAI_API_SERVER ++ vllm_config = engine_args.create_engine_config(usage_context=usage_context) ++ ++ if not envs.VLLM_USE_V1: ++ raise RuntimeError("Headless mode is only supported for V1") ++ ++ parallel_config = vllm_config.parallel_config ++ local_engine_count = parallel_config.data_parallel_size_local ++ host = parallel_config.data_parallel_master_ip ++ port = engine_args.data_parallel_rpc_port # add to config too ++ input_address = get_tcp_uri(host, port) ++ ++ if local_engine_count <= 0: ++ raise RuntimeError("data_parallel_size_local must be > 0 in " ++ "headless mode") ++ ++ # Catch SIGTERM and SIGINT to allow graceful shutdown. 
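++    # The handler raises SystemExit so that the try/finally below can shut
++    # down the engine processes cleanly.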
++ def signal_handler(signum, frame): ++ logger.debug("Received %d signal.", signum) ++ raise SystemExit ++ ++ signal.signal(signal.SIGTERM, signal_handler) ++ signal.signal(signal.SIGINT, signal_handler) ++ ++ logger.info( ++ "Launching %d data parallel engine(s) in headless mode, " ++ "with head node address %s.", local_engine_count, input_address) ++ ++ # Create the engines. ++ engine_manager = CoreEngineProcManager( ++ target_fn=EngineCoreProc.run_engine_core, ++ local_engine_count=local_engine_count, ++ start_index=args.data_parallel_start_rank, ++ local_start_index=0, ++ vllm_config=vllm_config, ++ on_head_node=False, ++ input_address=input_address, ++ executor_class=Executor.get_class(vllm_config), ++ log_stats=not engine_args.disable_log_stats, ++ ) ++ ++ try: ++ engine_manager.join_first() ++ finally: ++ logger.info("Shutting down.") ++ engine_manager.close() +diff --git a/vllm/forward_context.py b/vllm/forward_context.py +index e195a03c5..e5b9fd5fa 100644 +--- a/vllm/forward_context.py ++++ b/vllm/forward_context.py +@@ -77,7 +77,8 @@ def set_forward_context(attn_metadata: Any, + attn_metadata.num_decode_tokens + else: + # for v1 attention backends +- batchsize = attn_metadata.num_input_tokens ++ # batchsize = attn_metadata.num_input_tokens ++ batchsize = len(attn_metadata.seq_lens) + else: + batchsize = num_tokens + num_tokens_across_dp = [0] * dp_size +diff --git a/vllm/utils.py b/vllm/utils.py +index 5f32f8cb6..d38d02586 100644 +--- a/vllm/utils.py ++++ b/vllm/utils.py +@@ -551,6 +551,10 @@ def is_valid_ipv6_address(address: str) -> bool: + + + def get_distributed_init_method(ip: str, port: int) -> str: ++ return get_tcp_uri(ip, port) ++ ++ ++def get_tcp_uri(ip: str, port: int) -> str: + # Brackets are not permitted in ipv4 addresses, + # see https://github.com/python/cpython/issues/103848 + return f"tcp://[{ip}]:{port}" if ":" in ip else f"tcp://{ip}:{port}" +@@ -2189,6 +2193,8 @@ def make_zmq_socket( + ctx: Union[zmq.asyncio.Context, zmq.Context], # type: ignore[name-defined] + path: str, + socket_type: Any, ++ bind: Optional[bool] = None, ++ identity: Optional[bytes] = None, + ) -> Union[zmq.Socket, zmq.asyncio.Socket]: # type: ignore[name-defined] + """Make a ZMQ socket with the proper bind/connect semantics.""" + +@@ -2207,16 +2213,24 @@ def make_zmq_socket( + else: + buf_size = -1 # Use system default buffer size + +- if socket_type == zmq.constants.PULL: +- socket.setsockopt(zmq.constants.RCVHWM, 0) +- socket.setsockopt(zmq.constants.RCVBUF, buf_size) ++ if bind is None: ++ bind = socket_type != zmq.PUSH ++ ++ if socket_type in (zmq.PULL, zmq.DEALER, zmq.ROUTER): ++ socket.setsockopt(zmq.RCVHWM, 0) ++ socket.setsockopt(zmq.RCVBUF, buf_size) ++ ++ if socket_type in (zmq.PUSH, zmq.DEALER, zmq.ROUTER): ++ socket.setsockopt(zmq.SNDHWM, 0) ++ socket.setsockopt(zmq.SNDBUF, buf_size) ++ ++ if identity is not None: ++ socket.setsockopt(zmq.IDENTITY, identity) ++ ++ if bind: + socket.bind(path) +- elif socket_type == zmq.constants.PUSH: +- socket.setsockopt(zmq.constants.SNDHWM, 0) +- socket.setsockopt(zmq.constants.SNDBUF, buf_size) +- socket.connect(path) + else: +- raise ValueError(f"Unknown Socket Type: {socket_type}") ++ socket.connect(path) + + return socket + +@@ -2225,14 +2239,19 @@ def make_zmq_socket( + def zmq_socket_ctx( + path: str, + socket_type: Any, ++ bind: Optional[bool] = None, + linger: int = 0, ++ identity: Optional[bytes] = None, + ) -> Iterator[zmq.Socket]: + """Context manager for a ZMQ socket""" + + ctx = zmq.Context() # type: ignore[attr-defined] + try: 
+- yield make_zmq_socket(ctx, path, socket_type) +- ++ yield make_zmq_socket(ctx, ++ path, ++ socket_type, ++ bind=bind, ++ identity=identity) + except KeyboardInterrupt: + logger.debug("Got Keyboard Interrupt.") + +diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py +index 39caca0c2..32902a8b1 100644 +--- a/vllm/v1/engine/core.py ++++ b/vllm/v1/engine/core.py +@@ -23,7 +23,7 @@ from vllm.lora.request import LoRARequest + from vllm.transformers_utils.config import ( + maybe_register_config_serialize_by_value) + from vllm.utils import (get_exception_traceback, resolve_obj_by_qualname, +- zmq_socket_ctx) ++ make_zmq_socket, resolve_obj_by_qualname, zmq_socket_ctx) + from vllm.v1.core.kv_cache_utils import (get_kv_cache_config, + unify_kv_cache_configs) + from vllm.v1.core.sched.interface import SchedulerInterface +@@ -43,6 +43,7 @@ from vllm.version import __version__ as VLLM_VERSION + logger = init_logger(__name__) + + POLLING_TIMEOUT_S = 2.5 ++HANDSHAKE_TIMEOUT_MINS = 5 + + _R = TypeVar('_R') # Return type for collective_rpc + +@@ -306,43 +307,111 @@ class EngineCore: + + class EngineCoreProc(EngineCore): + """ZMQ-wrapper for running EngineCore in background process.""" ++ ENGINE_CORE_DEAD = b'ENGINE_CORE_DEAD' + + def __init__( + self, +- input_path: str, +- output_path: str, + vllm_config: VllmConfig, ++ on_head_node: bool, ++ input_address: str, + executor_class: type[Executor], + log_stats: bool, + engine_index: int = 0, + ): +- super().__init__(vllm_config, executor_class, log_stats) +- +- # Background Threads and Queues for IO. These enable us to +- # overlap ZMQ socket IO with GPU since they release the GIL, +- # and to overlap some serialization/deserialization with the +- # model forward pass. +- # Threads handle Socket <-> Queues and core_busy_loop uses Queue. +- self.input_queue: queue.Queue[tuple[EngineCoreRequestType, +- Any]] = queue.Queue() +- self.output_queue: queue.Queue[EngineCoreOutputs] = queue.Queue() +- threading.Thread(target=self.process_input_socket, +- args=(input_path, ), +- daemon=True).start() +- threading.Thread(target=self.process_output_socket, +- args=(output_path, engine_index), +- daemon=True).start() +- +- self.global_unfinished_reqs = False +- +- self.step_fn = (self.step if self.batch_queue is None else +- self.step_with_batch_queue) ++ input_queue = queue.Queue[tuple[EngineCoreRequestType, Any]]() ++ ++# GZQ DP patch TODO: executor_fail_callback is useful sometimes for reliabiltiy issues, should add later ++ executor_fail_callback = lambda: input_queue.put_nowait( ++ (EngineCoreRequestType.EXECUTOR_FAILED, b'')) ++ ++ # Create input socket. ++ input_ctx = zmq.Context() ++ identity = engine_index.to_bytes(length=2, byteorder="little") ++ input_socket = make_zmq_socket(input_ctx, ++ input_address, ++ zmq.DEALER, ++ identity=identity, ++ bind=False) ++ try: ++ # Register engine with front-end. ++ output_address = self.startup_handshake( ++ input_socket, on_head_node, vllm_config.parallel_config) ++ ++ # Update config which may have changed from the handshake. ++ vllm_config.__post_init__() ++ ++ # Set up data parallel environment. ++ self._init_data_parallel(vllm_config) ++ ++ # Initialize engine core and model. ++ super().__init__(vllm_config, executor_class, log_stats) ++ ++ self.step_fn = (self.step if self.batch_queue is None else ++ self.step_with_batch_queue) ++ ++ self.global_unfinished_reqs = False ++ ++ # Send ready message. 
++ input_socket.send( ++ msgspec.msgpack.encode({ ++ "status": "READY", ++ "local": on_head_node ++ })) ++ ++ # Background Threads and Queues for IO. These enable us to ++ # overlap ZMQ socket IO with GPU since they release the GIL, ++ # and to overlap some serialization/deserialization with the ++ # model forward pass. ++ # Threads handle Socket <-> Queues and core_busy_loop uses Queue. ++ self.input_queue = input_queue ++ self.output_queue = queue.Queue[Union[EngineCoreOutputs, bytes]]() ++ threading.Thread(target=self.process_input_socket, ++ args=(input_socket, ), ++ daemon=True).start() ++ input_socket = None ++ self.output_thread = threading.Thread( ++ target=self.process_output_socket, ++ args=(output_address, engine_index), ++ daemon=True) ++ self.output_thread.start() ++ finally: ++ if input_socket is not None: ++ input_socket.close(linger=0) ++ ++ @staticmethod ++ def startup_handshake(input_socket: zmq.Socket, on_head_node: bool, ++ parallel_config: ParallelConfig) -> str: ++ ++ # Send registration message. ++ input_socket.send( ++ msgspec.msgpack.encode({ ++ "status": "HELLO", ++ "local": on_head_node, ++ })) ++ ++ # Receive initialization message. ++ logger.info("Waiting for init message from front-end.") ++ if not input_socket.poll(timeout=HANDSHAKE_TIMEOUT_MINS * 60 * 1000): ++ raise RuntimeError("Did not receive response from front-end " ++ f"process within {HANDSHAKE_TIMEOUT_MINS} " ++ f"minutes") ++ init_bytes = input_socket.recv() ++ init_message = msgspec.msgpack.decode(init_bytes) ++ logger.debug("Received init message: %s", init_message) ++ ++ output_socket_address = init_message["output_socket_address"] ++ #TBD(nick) maybe replace IP with configured head node address ++ ++ received_parallel_config = init_message["parallel_config"] ++ for key, value in received_parallel_config.items(): ++ setattr(parallel_config, key, value) ++ ++ return output_socket_address + + @staticmethod + def run_engine_core(*args, + dp_rank: int = 0, + local_dp_rank: int = 0, +- ready_pipe, + **kwargs): + """Launch EngineCore busy loop in background process.""" + +@@ -369,7 +438,7 @@ class EngineCoreProc(EngineCore): + try: + parallel_config: ParallelConfig = kwargs[ + "vllm_config"].parallel_config +- if parallel_config.data_parallel_size > 1: ++ if parallel_config.data_parallel_size > 1 or dp_rank > 0: + # Set data parallel rank for this engine process. + parallel_config.data_parallel_rank = dp_rank + parallel_config.data_parallel_rank_local = local_dp_rank +@@ -377,9 +446,6 @@ class EngineCoreProc(EngineCore): + else: + engine_core = EngineCoreProc(*args, **kwargs) + +- # Send Readiness signal to EngineClient. +- ready_pipe.send({"status": "READY"}) +- + engine_core.run_busy_loop() + + except SystemExit: +@@ -394,6 +460,9 @@ class EngineCoreProc(EngineCore): + if engine_core is not None: + engine_core.shutdown() + ++ def _init_data_parallel(self, vllm_config: VllmConfig): ++ pass ++ + def run_busy_loop(self): + """Core busy loop of the EngineCore.""" + +@@ -476,27 +545,37 @@ class EngineCoreProc(EngineCore): + and not isinstance(v, p.annotation) else v + for v, p in zip(args, arg_types)) + +- def process_input_socket(self, input_path: str): ++ def _send_engine_dead(self): ++ """Send EngineDead status to the EngineCoreClient.""" ++ ++ # Put ENGINE_CORE_DEAD in the queue. ++ self.output_queue.put_nowait(EngineCoreProc.ENGINE_CORE_DEAD) ++ ++ # Wait until msg sent by the daemon before shutdown. 
++ self.output_thread.join(timeout=5.0) ++ if self.output_thread.is_alive(): ++ logger.fatal("vLLM shutdown signal from EngineCore failed " ++ "to send. Please report this issue.") ++ ++ def process_input_socket(self, input_socket: zmq.Socket): + """Input socket IO thread.""" + + # Msgpack serialization decoding. + add_request_decoder = MsgpackDecoder(EngineCoreRequest) + generic_decoder = MsgpackDecoder() + +- with zmq_socket_ctx(input_path, zmq.constants.PULL) as socket: +- while True: +- # (RequestType, RequestData) +- type_frame, data_frame = socket.recv_multipart(copy=False) +- request_type = EngineCoreRequestType(bytes(type_frame.buffer)) ++ while True: ++ # (RequestType, RequestData) ++ type_frame, data_frames = input_socket.recv_multipart(copy=False) ++ request_type = EngineCoreRequestType(bytes(type_frame.buffer)) + +- # Deserialize the request data. +- decoder = add_request_decoder if ( +- request_type +- == EngineCoreRequestType.ADD) else generic_decoder +- request = decoder.decode(data_frame.buffer) ++ # Deserialize the request data. ++ decoder = add_request_decoder if ( ++ request_type == EngineCoreRequestType.ADD) else generic_decoder ++ request = decoder.decode(data_frames) + +- # Push to input queue for core busy loop. +- self.input_queue.put_nowait((request_type, request)) ++ # Push to input queue for core busy loop. ++ self.input_queue.put_nowait((request_type, request)) + + def process_output_socket(self, output_path: str, engine_index: int): + """Output socket IO thread.""" +@@ -523,9 +602,9 @@ class DPEngineCoreProc(EngineCoreProc): + + def __init__( + self, +- input_path: str, +- output_path: str, + vllm_config: VllmConfig, ++ on_head_node: bool, ++ input_address: str, + executor_class: type[Executor], + log_stats: bool, + ): +@@ -537,8 +616,20 @@ class DPEngineCoreProc(EngineCoreProc): + _add_prefix(sys.stdout, process_name, pid) + _add_prefix(sys.stderr, process_name, pid) + +- dp_size = vllm_config.parallel_config.data_parallel_size ++ # Counts forward-passes of the model so that we can synchronize ++ # finished with DP peers every N steps. ++ self.counter = 0 ++ ++ # Initialize the engine. ++ dp_rank = vllm_config.parallel_config.data_parallel_rank ++ super().__init__(vllm_config, on_head_node, input_address, ++ executor_class, log_stats, dp_rank) ++ ++ def _init_data_parallel(self, vllm_config: VllmConfig): ++ ++ # Configure GPUs and stateless process group for data parallel. + dp_rank = vllm_config.parallel_config.data_parallel_rank ++ dp_size = vllm_config.parallel_config.data_parallel_size + local_dp_rank = vllm_config.parallel_config.data_parallel_rank_local + + assert dp_size > 1 +@@ -547,22 +638,14 @@ class DPEngineCoreProc(EngineCoreProc): + from vllm.platforms import current_platform + if current_platform.is_cuda_alike(): + from vllm.platforms.cuda import device_id_to_physical_device_id +- tp_size = vllm_config.parallel_config.tensor_parallel_size ++ world_size = vllm_config.parallel_config.world_size + os.environ["CUDA_VISIBLE_DEVICES"] = ",".join( + str(device_id_to_physical_device_id(i)) +- for i in range(local_dp_rank * tp_size, (local_dp_rank + 1) * +- tp_size)) ++ for i in range(local_dp_rank * ++ world_size, (local_dp_rank + 1) * world_size)) + + self.dp_group = vllm_config.parallel_config.stateless_init_dp_group() + +- # Initialize the engine after setting up environment. 
+- super().__init__(input_path, output_path, vllm_config, executor_class, +- log_stats, dp_rank) +- +- # Counts forward-passes of the model so that we can synchronize +- # finished with DP peers every N steps. +- self.counter = 0 +- + def shutdown(self): + super().shutdown() + if dp_group := getattr(self, "dp_group", None): +diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py +index e948e59b8..e443f45db 100644 +--- a/vllm/v1/engine/core_client.py ++++ b/vllm/v1/engine/core_client.py +@@ -8,26 +8,29 @@ import threading + import uuid + import weakref + from abc import ABC, abstractmethod +-from collections.abc import Awaitable, Sequence ++from collections.abc import Awaitable + from concurrent.futures import Future +-from dataclasses import dataclass, field ++from dataclasses import dataclass ++from enum import Enum, auto + from threading import Thread + from typing import Any, Callable, Optional, TypeVar, Union + ++import msgspec + import zmq + import zmq.asyncio + +-from vllm.config import VllmConfig ++from vllm.config import ParallelConfig, VllmConfig + from vllm.logger import init_logger + from vllm.lora.request import LoRARequest +-from vllm.utils import (get_open_zmq_inproc_path, get_open_zmq_ipc_path, +- kill_process_tree, make_zmq_socket) ++ ++from vllm.utils import (get_open_port, get_open_zmq_inproc_path, ++ get_open_zmq_ipc_path, kill_process_tree, get_tcp_uri, make_zmq_socket) + from vllm.v1.engine import (EngineCoreOutputs, EngineCoreRequest, + EngineCoreRequestType, UtilityOutput) + from vllm.v1.engine.core import EngineCore, EngineCoreProc + from vllm.v1.executor.abstract import Executor +-from vllm.v1.serial_utils import MsgpackDecoder, MsgpackEncoder +-from vllm.v1.utils import BackgroundProcHandle ++from vllm.v1.serial_utils import MsgpackDecoder, MsgpackEncoder, bytestr ++from vllm.v1.utils import CoreEngineProcManager + + logger = init_logger(__name__) + +@@ -35,6 +38,8 @@ AnyFuture = Union[asyncio.Future[Any], Future[Any]] + + _R = TypeVar('_R') # Return type for collective_rpc + ++STARTUP_POLL_PERIOD_MS = 10000 ++ + + class EngineCoreClient(ABC): + """ +@@ -253,52 +258,21 @@ class InprocClient(EngineCoreClient): + return self.engine_core.collective_rpc(method, timeout, args, kwargs) + + +-class CoreEngine: +- """One per data parallel rank.""" ++class CoreEngineState(Enum): ++ NEW = auto() ++ CONNECTED = auto() ++ READY = auto() + +- def __init__( +- self, +- vllm_config: VllmConfig, +- executor_class: type[Executor], +- log_stats: bool, +- ctx: Union[zmq.Context, zmq.asyncio.Context], +- output_path: str, +- index: int = 0, +- local_dp_rank: int = 0, +- ): +- # Paths and sockets for IPC. +- input_path = get_open_zmq_ipc_path() +- self.input_socket = make_zmq_socket(ctx, input_path, +- zmq.constants.PUSH) +- try: +- # Start EngineCore in background process. +- self.proc_handle = BackgroundProcHandle( +- input_path=input_path, +- output_path=output_path, +- process_name=f"EngineCore_{index}", +- target_fn=EngineCoreProc.run_engine_core, +- process_kwargs={ +- "vllm_config": vllm_config, +- "dp_rank": index, +- "local_dp_rank": local_dp_rank, +- "executor_class": executor_class, +- "log_stats": log_stats, +- }) + +- self.num_reqs_in_flight = 0 +- finally: +- if not hasattr(self, "num_reqs_in_flight"): +- # Ensure socket is closed if process fails to start. 
+- self.close() ++class CoreEngine: ++ """One per data parallel rank.""" + +- def send_multipart(self, msg_parts: Sequence): +- return self.input_socket.send_multipart(msg_parts, copy=False) ++ def __init__(self, index: int = 0, local: bool = True): ++ self.local = local ++ self.identity = index.to_bytes(length=2, byteorder="little") + +- def close(self): +- if proc_handle := getattr(self, "proc_handle", None): +- proc_handle.shutdown() +- if socket := getattr(self, "input_socket", None): +- socket.close(linger=0) ++ self.state = CoreEngineState.NEW ++ self.num_reqs_in_flight = 0 + + + @dataclass +@@ -307,20 +281,23 @@ class BackgroundResources: + circular reference back to the client object.""" + + ctx: Union[zmq.Context] +- core_engines: list[CoreEngine] = field(default_factory=list) ++ local_engine_manager: Optional[CoreEngineProcManager] = None + output_socket: Optional[Union[zmq.Socket, zmq.asyncio.Socket]] = None ++ input_socket: Optional[Union[zmq.Socket, zmq.asyncio.Socket]] = None + shutdown_path: Optional[str] = None + + def __call__(self): + """Clean up background resources.""" + +- for core_engine in self.core_engines: +- core_engine.close() ++ if self.local_engine_manager is not None: ++ self.local_engine_manager.close() + + # ZMQ context termination can hang if the sockets + # aren't explicitly closed first. + if self.output_socket is not None: + self.output_socket.close(linger=0) ++ if self.input_socket is not None: ++ self.input_socket.close(linger=0) + if self.shutdown_path is not None: + # We must ensure that the sync output socket is + # closed cleanly in its own thread. +@@ -384,38 +361,169 @@ class MPClient(EngineCoreClient): + # exception is raised mid-construction. + self.resources = BackgroundResources(ctx=sync_ctx) + self._finalizer = weakref.finalize(self, self.resources) ++ success = False ++ try: ++ parallel_config = vllm_config.parallel_config ++ local_engine_count = parallel_config.data_parallel_size_local ++ start_index = parallel_config.data_parallel_rank ++ local_start_index = parallel_config.data_parallel_rank_local ++ ++ # SPMD mode is where there is an LLM instance per DP rank and ++ # one core engine per LLM, see ++ # examples/offline_inference/data_parallel.py. ++ spmd_mode = local_start_index is not None ++ if spmd_mode: ++ assert local_engine_count == 1 ++ self.core_engines = [ ++ CoreEngine(index=local_start_index, local=True) ++ ] ++ else: ++ assert start_index == 0 ++ local_start_index = 0 ++ self.core_engines = [ ++ CoreEngine(index=i, local=(i < local_engine_count)) ++ for i in range(parallel_config.data_parallel_size) ++ ] + +- # Paths and sockets for IPC. +- self.output_path = get_open_zmq_ipc_path() +- +- new_core_engine = lambda index, local_dp_rank=None: CoreEngine( +- vllm_config, executor_class, log_stats, self.ctx, self.output_path, +- index, local_dp_rank) +- +- # Start engine core process(es). +- self._init_core_engines(vllm_config, new_core_engine, +- self.resources.core_engines) +- +- # Wait for engine core process(es) to start. +- for engine in self.resources.core_engines: +- engine.proc_handle.wait_for_startup() +- +- self.utility_results: dict[int, AnyFuture] = {} ++ input_address, output_address = self._get_zmq_addresses( ++ parallel_config, spmd_mode) ++ ++ # Create input and output sockets. 
++ self.input_socket = self.resources.input_socket = make_zmq_socket( ++ self.ctx, input_address, zmq.ROUTER, bind=True) ++ ++ self.resources.output_socket = make_zmq_socket( ++ self.ctx, output_address, zmq.constants.PULL) ++ # Start local engines. ++ if local_engine_count: ++ # In server mode, start_index and local_start_index will ++ # both be 0. ++ self.resources.local_engine_manager = CoreEngineProcManager( ++ EngineCoreProc.run_engine_core, ++ vllm_config=vllm_config, ++ executor_class=executor_class, ++ log_stats=log_stats, ++ input_address=input_address, ++ on_head_node=True, ++ local_engine_count=local_engine_count, ++ start_index=start_index, ++ local_start_index=local_start_index) ++ ++ self.core_engine = self.core_engines[0] ++ ++ # Wait for engine core process(es) to start. ++ self._wait_for_engine_startup(output_address, parallel_config) ++ ++ self.utility_results: dict[int, AnyFuture] = {} ++ success = True ++ finally: ++ if not success: ++ self._finalizer() + +- def _init_core_engines( +- self, +- vllm_config: VllmConfig, +- new_core_engine: Callable[[int, Optional[int]], CoreEngine], +- core_engines: list[CoreEngine], +- ) -> None: +- +- # Default case - single core engine. +- dp_rank = vllm_config.parallel_config.data_parallel_rank +- local_dp_rank = vllm_config.parallel_config.data_parallel_rank_local +- core_engine = new_core_engine( +- dp_rank, local_dp_rank if local_dp_rank is not None else dp_rank) +- core_engines.append(core_engine) +- self.core_engine = core_engine ++ @staticmethod ++ def _get_zmq_addresses(parallel_config: ParallelConfig, ++ spmd_mode: bool) -> tuple[str, str]: ++ """Returns (input_address, output_address).""" ++ dp_size = parallel_config.data_parallel_size ++ local_engine_count = parallel_config.data_parallel_size_local ++ ++ if local_engine_count == dp_size or spmd_mode: ++ input_address = get_open_zmq_ipc_path() ++ output_address = get_open_zmq_ipc_path() ++ else: ++ host = parallel_config.data_parallel_master_ip ++ input_port = parallel_config.data_parallel_rpc_port ++ output_port = get_open_port() ++ input_address = get_tcp_uri(host, input_port) ++ output_address = get_tcp_uri(host, output_port) ++ ++ return input_address, output_address ++ ++ def _wait_for_engine_startup(self, output_address: str, ++ parallel_config: ParallelConfig): ++ # Get a sync handle to the socket which can be sync or async. ++ sync_input_socket = zmq.Socket.shadow(self.input_socket) ++ ++ # Wait for engine core process(es) to send ready messages. ++ local_count = parallel_config.data_parallel_size_local ++ remote_count = len(self.core_engines) - local_count ++ # [local, remote] counts ++ conn_pending, start_pending = [local_count, remote_count], [0, 0] ++ ++ poller = zmq.Poller() ++ poller.register(sync_input_socket, zmq.POLLIN) ++ proc_manager = self.resources.local_engine_manager ++ if proc_manager is not None: ++ for sentinel in proc_manager.sentinels(): ++ poller.register(sentinel, zmq.POLLIN) ++ while any(conn_pending) or any(start_pending): ++ events = poller.poll(STARTUP_POLL_PERIOD_MS) ++ if not events: ++ if any(conn_pending): ++ logger.debug( ++ "Waiting for %d local, %d remote core engine proc(s) " ++ "to connect.", *conn_pending) ++ if any(start_pending): ++ logger.debug( ++ "Waiting for %d local, %d remote core engine proc(s) " ++ "to start.", *start_pending) ++ continue ++ if len(events) > 1 or events[0][0] != sync_input_socket: ++ # One of the local core processes exited. 
++ finished = proc_manager.finished_procs( ++ ) if proc_manager else {} ++ raise RuntimeError("Engine core initialization failed. " ++ "See root cause above. " ++ f"Failed core proc(s): {finished}") ++ ++ # Receive HELLO and READY messages from the input socket. ++ eng_identity, ready_msg_bytes = sync_input_socket.recv_multipart() ++ eng_index = int.from_bytes(eng_identity, byteorder="little") ++ engine = next( ++ (e for e in self.core_engines if e.identity == eng_identity), ++ None) ++ if engine is None: ++ raise RuntimeError(f"Message from engine with unexpected data " ++ f"parallel rank: {eng_index}") ++ msg = msgspec.msgpack.decode(ready_msg_bytes) ++ status, local = msg["status"], msg["local"] ++ if local != engine.local: ++ raise RuntimeError(f"{status} message from " ++ f"{'local' if local else 'remote'} " ++ f"engine {eng_index}, expected it to be " ++ f"{'local' if engine.local else 'remote'}") ++ ++ if status == "HELLO" and engine.state == CoreEngineState.NEW: ++ ++ # Send init message with DP config info. ++ init_message = self.encoder.encode({ ++ "output_socket_address": output_address, ++ "parallel_config": { ++ "data_parallel_master_ip": ++ parallel_config.data_parallel_master_ip, ++ "data_parallel_master_port": ++ parallel_config.data_parallel_master_port, ++ "data_parallel_size": ++ parallel_config.data_parallel_size, ++ }, ++ }) ++ sync_input_socket.send_multipart((eng_identity, init_message), ++ copy=False) ++ conn_pending[0 if local else 1] -= 1 ++ start_pending[0 if local else 1] += 1 ++ engine.state = CoreEngineState.CONNECTED ++ elif status == "READY" and (engine.state ++ == CoreEngineState.CONNECTED): ++ start_pending[0 if local else 1] -= 1 ++ engine.state = CoreEngineState.READY ++ else: ++ raise RuntimeError(f"Unexpected {status} message for " ++ f"{'local' if local else 'remote'} engine " ++ f"{eng_index} in {engine.state} state.") ++ ++ logger.debug("%s from %s core engine process %s.", status, ++ "local" if local else "remote", eng_index) ++# >>>>>>> fbe7575cc... squashed commit of pr#15977 + + def shutdown(self): + self._finalizer() +@@ -448,7 +556,8 @@ class SyncMPClient(MPClient): + # Ensure that the outputs socket processing thread does not have + # a ref to the client which prevents gc. + ctx = self.ctx +- output_path = self.output_path ++ out_socket = self.resources.output_socket ++ assert out_socket is not None + decoder = self.decoder + utility_results = self.utility_results + outputs_queue = self.outputs_queue +@@ -458,7 +567,6 @@ class SyncMPClient(MPClient): + + def process_outputs_socket(): + shutdown_socket = ctx.socket(zmq.PAIR) +- out_socket = make_zmq_socket(ctx, output_path, zmq.constants.PULL) + try: + shutdown_socket.bind(shutdown_path) + poller = zmq.Poller() +@@ -490,13 +598,17 @@ class SyncMPClient(MPClient): + daemon=True) + self.output_queue_thread.start() + ++ # The thread takes on responsibility for closing the socket. 
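# Editor's illustrative sketch (not part of the patch): the HELLO/READY handshake that
# _wait_for_engine_startup() above drives, reduced to a single process over an inproc
# transport. The address "inproc://handshake-demo" and engine index 1 are assumptions
# made only for this demo; the real code runs the two halves in separate processes
# (EngineCoreProc vs. MPClient).
import msgspec
import zmq

ctx = zmq.Context()
frontend = ctx.socket(zmq.ROUTER)    # plays the MPClient input socket
frontend.bind("inproc://handshake-demo")

engine = ctx.socket(zmq.DEALER)      # plays the EngineCoreProc input socket
engine.setsockopt(zmq.IDENTITY, (1).to_bytes(2, "little"))
engine.connect("inproc://handshake-demo")

# Engine side: register with HELLO.
engine.send(msgspec.msgpack.encode({"status": "HELLO", "local": True}))

# Front-end side: read HELLO, reply with the init message (output socket address plus
# the DP fields that startup_handshake() copies into ParallelConfig).
identity, hello = frontend.recv_multipart()
assert msgspec.msgpack.decode(hello)["status"] == "HELLO"
frontend.send_multipart((identity, msgspec.msgpack.encode({
    "output_socket_address": "ipc:///tmp/demo-output",
    "parallel_config": {"data_parallel_size": 1},
})))

# Engine side: apply the config, then report READY once fully initialized.
init_message = msgspec.msgpack.decode(engine.recv())
assert "output_socket_address" in init_message
engine.send(msgspec.msgpack.encode({"status": "READY", "local": True}))

# Front-end side: the engine moves NEW -> CONNECTED -> READY.
identity, ready = frontend.recv_multipart()
assert msgspec.msgpack.decode(ready)["status"] == "READY"
ctx.destroy(linger=0)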
++ self.resources.output_socket = None ++ + def get_output(self) -> EngineCoreOutputs: + return self.outputs_queue.get() + + def _send_input(self, request_type: EngineCoreRequestType, request: Any): +- # (RequestType, SerializedRequest) +- msg = (request_type.value, self.encoder.encode(request)) +- self.core_engine.send_multipart(msg) ++ # (Identity, RequestType, SerializedRequest) ++ msg = (self.core_engine.identity, request_type.value, ++ self.encoder.encode(request)) ++ self.input_socket.send_multipart(msg, copy=False) + + def call_utility(self, method: str, *args) -> Any: + call_id = uuid.uuid1().int >> 64 +@@ -581,6 +693,7 @@ class AsyncMPClient(MPClient): + [AsyncMPClient, EngineCoreOutputs], Awaitable[None]]] = None + + def _ensure_output_queue_task(self): ++ resources = self.resources + if self.outputs_queue is not None: + return + +@@ -592,10 +705,8 @@ class AsyncMPClient(MPClient): + outputs_queue = self.outputs_queue + output_handler = self.outputs_handler + _self_ref = weakref.ref(self) if output_handler else None +- output_path = self.output_path +- output_socket = make_zmq_socket(self.ctx, output_path, +- zmq.constants.PULL) +- self.resources.output_socket = output_socket ++ output_socket = resources.output_socket ++ assert output_socket is not None + + async def process_outputs_socket(): + while True: +@@ -625,30 +736,34 @@ class AsyncMPClient(MPClient): + assert self.outputs_queue is not None + return await self.outputs_queue.get() + +- async def _send_input(self, request_type: EngineCoreRequestType, +- request: Any) -> None: +- await self.core_engine.send_multipart( +- (request_type.value, self.encoder.encode(request))) ++ def _send_input(self, ++ request_type: EngineCoreRequestType, ++ request: Any, ++ engine: Optional[CoreEngine] = None) -> Awaitable[None]: ++ if engine is None: ++ engine = self.core_engine + +- self._ensure_output_queue_task() ++ message = (request_type.value, self.encoder.encode(request)) ++ return self._send_input_message(message, engine) ++ ++ def _send_input_message(self, message: tuple[bytes, bytes], ++ engine: CoreEngine) -> Awaitable[None]: ++ message = (engine.identity, ) + message # type: ignore[assignment] ++ return self.input_socket.send_multipart(message, copy=False) + + async def call_utility_async(self, method: str, *args) -> Any: + return await self._call_utility_async(method, + *args, + engine=self.core_engine) + +- async def _call_utility_async( +- self, +- method: str, +- *args, +- engine: CoreEngine, +- ) -> Any: ++ async def _call_utility_async(self, method: str, *args, ++ engine: CoreEngine) -> Any: + call_id = uuid.uuid1().int >> 64 + future = asyncio.get_running_loop().create_future() + self.utility_results[call_id] = future + message = (EngineCoreRequestType.UTILITY.value, + self.encoder.encode((call_id, method, args))) +- await engine.send_multipart(message) ++ await self._send_input_message(message, engine) + self._ensure_output_queue_task() + return await future + +@@ -657,6 +772,7 @@ class AsyncMPClient(MPClient): + # tokenized. 
+ request.prompt = None + await self._send_input(EngineCoreRequestType.ADD, request) ++ self._ensure_output_queue_task() + + async def abort_requests_async(self, request_ids: list[str]) -> None: + if len(request_ids) > 0: +@@ -728,21 +844,6 @@ class DPAsyncMPClient(AsyncMPClient): + + self.outputs_handler = DPAsyncMPClient.process_engine_outputs # type: ignore[assignment] + +- def _init_core_engines( +- self, +- vllm_config: VllmConfig, +- new_core_engine: Callable[[int, Optional[int]], CoreEngine], +- core_engines: list[CoreEngine], +- ) -> None: +- +- # Launch a core engine for each data parallel rank. +- dp_size = vllm_config.parallel_config.data_parallel_size +- for i in range(dp_size): +- # Multi-node not yet supported so local_dp_rank == dp_rank. +- core_engines.append(new_core_engine(i, i)) +- +- self.core_engines = core_engines +- + async def call_utility_async(self, method: str, *args) -> Any: + # Only the result from the first engine is returned. + return (await asyncio.gather(*[ +@@ -761,15 +862,15 @@ class DPAsyncMPClient(AsyncMPClient): + self.reqs_in_flight[request.request_id] = chosen_engine + chosen_engine.num_reqs_in_flight += 1 + if self.num_engines_running >= len(self.core_engines): +- await chosen_engine.send_multipart(msg) ++ await self._send_input_message(msg, chosen_engine) + else: + # Send request to chosen engine and dp start loop + # control message to all other engines. + self.num_engines_running += len(self.core_engines) + await asyncio.gather(*[ +- engine.send_multipart(msg if engine is +- chosen_engine else self.start_dp_msg) +- for engine in self.core_engines ++ self._send_input_message( ++ msg if engine is chosen_engine else self.start_dp_msg, ++ engine) for engine in self.core_engines + ]) + + self._ensure_output_queue_task() +@@ -794,7 +895,7 @@ class DPAsyncMPClient(AsyncMPClient): + # sure to start the other engines: + self.num_engines_running = len(self.core_engines) + coros = [ +- engine.send_multipart(self.start_dp_msg) ++ self._send_input_message(self.start_dp_msg, engine) + for engine in self.core_engines + if not engine.num_reqs_in_flight + ] +@@ -820,5 +921,5 @@ class DPAsyncMPClient(AsyncMPClient): + + async def _abort_requests(self, request_ids: list[str], + engine: CoreEngine) -> None: +- await engine.send_multipart((EngineCoreRequestType.ABORT.value, +- self.encoder.encode(request_ids))) ++ await self._send_input(EngineCoreRequestType.ABORT, request_ids, ++ engine) +diff --git a/vllm/v1/serial_utils.py b/vllm/v1/serial_utils.py +index 146d7d747..7c1d48494 100644 +--- a/vllm/v1/serial_utils.py ++++ b/vllm/v1/serial_utils.py +@@ -2,9 +2,10 @@ + + import pickle + from types import FunctionType +-from typing import Any, Optional ++from typing import Any, Optional, Union + + import cloudpickle ++import zmq + import torch + from msgspec import msgpack + +@@ -12,6 +13,7 @@ CUSTOM_TYPE_TENSOR = 1 + CUSTOM_TYPE_PICKLE = 2 + CUSTOM_TYPE_CLOUDPICKLE = 3 + ++bytestr = Union[bytes, bytearray, memoryview, zmq.Frame] + + class MsgpackEncoder: + """Encoder with custom torch tensor serialization.""" +diff --git a/vllm/v1/utils.py b/vllm/v1/utils.py +index f42b3501a..fbc0ee340 100644 +--- a/vllm/v1/utils.py ++++ b/vllm/v1/utils.py +@@ -2,17 +2,21 @@ + + import multiprocessing + import os ++import time + import weakref + from collections import defaultdict + from collections.abc import Sequence ++from multiprocessing import Process, connection + from typing import (TYPE_CHECKING, Any, Callable, Generic, Optional, TypeVar, + Union, overload) + + import torch + 
++from vllm.config import VllmConfig + from vllm.logger import init_logger + from vllm.model_executor.models.utils import extract_layer_index + from vllm.utils import get_mp_context, kill_process_tree ++from vllm.v1.executor.abstract import Executor + + if TYPE_CHECKING: + from vllm.attention.layer import Attention +@@ -90,7 +94,7 @@ class ConstantList(Generic[T], Sequence): + return f"ConstantList({self._x})" + + +-class BackgroundProcHandle: ++class CoreEngineProcManager: + """ + Utility class to handle creation, readiness, and shutdown + of background processes used by the AsyncLLM and LLMEngine. +@@ -98,55 +102,91 @@ class BackgroundProcHandle: + + def __init__( + self, +- input_path: str, +- output_path: str, +- process_name: str, + target_fn: Callable, +- process_kwargs: dict[Any, Any], ++ local_engine_count: int, ++ start_index: int, ++ local_start_index: int, ++ vllm_config: VllmConfig, ++ on_head_node: bool, ++ input_address: str, ++ executor_class: type[Executor], ++ log_stats: bool, + ): + context = get_mp_context() +- self.reader, writer = context.Pipe(duplex=False) +- +- assert ("ready_pipe" not in process_kwargs +- and "input_path" not in process_kwargs +- and "output_path" not in process_kwargs) +- process_kwargs["ready_pipe"] = writer +- process_kwargs["input_path"] = input_path +- process_kwargs["output_path"] = output_path +- +- # Run busy loop in background process. +- self.proc = context.Process(target=target_fn, +- kwargs=process_kwargs, +- name=process_name) +- self._finalizer = weakref.finalize(self, shutdown, self.proc, +- input_path, output_path) +- self.proc.start() +- +- def wait_for_startup(self): +- # Wait for startup. +- if self.reader.recv()["status"] != "READY": +- raise RuntimeError(f"{self.proc.name} initialization failed. " +- "See root cause above.") +- +- def shutdown(self): ++ common_kwargs = { ++ "vllm_config": vllm_config, ++ "on_head_node": on_head_node, ++ "input_address": input_address, ++ "executor_class": executor_class, ++ "log_stats": log_stats, ++ } ++ ++ self.processes: list[Process] = [] ++ for index in range(local_engine_count): ++ local_index = local_start_index + index ++ global_index = start_index + index ++ # Start EngineCore in background process. ++ self.processes.append( ++ context.Process(target=target_fn, ++ name=f"EngineCore_{global_index}", ++ kwargs=common_kwargs | { ++ "dp_rank": global_index, ++ "local_dp_rank": local_index, ++ })) ++ ++ self._finalizer = weakref.finalize(self, shutdown, self.processes, ++ input_address) ++ try: ++ for proc in self.processes: ++ proc.start() ++ finally: ++ # Kill other procs if not all are running. ++ if self.finished_procs(): ++ self.close() ++ ++ def close(self): ++ """Shutdown all procs.""" + self._finalizer() + ++ def join_first(self): ++ """Wait for any process to exit.""" ++ connection.wait(proc.sentinel for proc in self.processes) ++ ++ def sentinels(self) -> list: ++ return [proc.sentinel for proc in self.processes] ++ ++ def finished_procs(self) -> dict[str, int]: ++ """Returns dict of proc name -> exit code for any finished procs.""" ++ return { ++ proc.name: proc.exitcode ++ for proc in self.processes if proc.exitcode is not None ++ } ++ + + # Note(rob): shutdown function cannot be a bound method, + # else the gc cannot collect the object. +-def shutdown(proc: multiprocessing.Process, input_path: str, output_path: str): ++def shutdown(procs: list[Process], input_address: str): + # Shutdown the process. 
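# Editor's illustrative sketch (not part of the patch): how the process sentinels
# exposed by CoreEngineProcManager.sentinels() behave. The worker function and the
# sleep durations are assumptions made only for this demo.
import time
from multiprocessing import connection, get_context

def _demo_worker(seconds: float) -> None:
    time.sleep(seconds)

if __name__ == "__main__":
    mp = get_context("spawn")
    procs = [mp.Process(target=_demo_worker, args=(s,)) for s in (0.1, 5.0)]
    for proc in procs:
        proc.start()

    # Like join_first(): block until at least one child process exits.
    connection.wait([proc.sentinel for proc in procs])

    # Like finished_procs(): name -> exit code for children that have ended.
    print({p.name: p.exitcode for p in procs if p.exitcode is not None})

    for proc in procs:
        proc.terminate()
        proc.join()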
+- if proc.is_alive(): +- proc.terminate() +- proc.join(5) ++ for proc in procs: ++ if proc.is_alive(): ++ proc.terminate() ++ ++ # Allow 5 seconds for remaining procs to terminate. ++ deadline = time.monotonic() + 5 ++ for proc in procs: ++ remaining = deadline - time.monotonic() ++ if remaining <= 0: ++ break ++ if proc.is_alive(): ++ proc.join(remaining) + ++ for proc in procs: + if proc.is_alive(): + kill_process_tree(proc.pid) + + # Remove zmq ipc socket files. +- ipc_sockets = [output_path, input_path] +- for ipc_socket in ipc_sockets: +- socket_file = ipc_socket.replace("ipc://", "") ++ if input_address.startswith("ipc://"): ++ socket_file = input_address[len("ipc://"):] + if os and os.path.exists(socket_file): + os.remove(socket_file) + diff --git a/vllm_dp/install_dp_vllm.sh b/vllm_dp/install_dp_vllm.sh new file mode 100644 index 00000000..ee02ea9f --- /dev/null +++ b/vllm_dp/install_dp_vllm.sh @@ -0,0 +1,41 @@ +#!/bin/bash +# Copyright 2025 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +# This bash is to apply njhill's Multi-node server solution +# (https://github.com/vllm-project/vllm/pull/15906, https://github.com/vllm-project/vllm/pull/15977) +# to vllm v0.8.3 and install it. + +script_dir=$(cd "$(dirname $0)"; pwd) +old_dir=$(pwd) + +vllm_tag="v0.8.3" + +vllm_source_dir="${script_dir}/vllm-${vllm_tag}" + +if [ -d "${vllm_source_dir}" ]; then + echo "The ${vllm_source_dir} already exists, install maybe done! If not, please remove and rename it first." + exit 1 +fi + +git clone https://github.com/vllm-project/vllm.git -b ${vllm_tag} --depth 1 ${vllm_source_dir} +cd ${vllm_source_dir} + +git apply "${script_dir}/dp_scale_out.patch" + +export VLLM_TARGET_DEVICE=empty +pip install . 
+ +cd ${old_dir} \ No newline at end of file diff --git a/vllm_mindspore/config.py b/vllm_mindspore/config.py index 21c2fded..8079ca35 100644 --- a/vllm_mindspore/config.py +++ b/vllm_mindspore/config.py @@ -19,6 +19,7 @@ from collections import Counter from typing import Union import sys import socket +import threading import pickle import time @@ -328,17 +329,21 @@ class SocketProcessGroup: time.sleep(self.retry_interval) else: raise ConnectionError(f"Worker {self.rank} could not connect to master at {self.master_ip}:{self.master_port} after {self.max_retries} retries.") + + def accept_connections(self): + for _ in range(self.world_size - 1): + conn, addr = self.server_socket.accept() + print(f"Accepted connection from {addr}") + self.sockets.append(conn) def initialize_group(self): if self.rank == 0: # Master node: accept connections from workers - for _ in range(self.world_size - 1): - conn, addr = self.server_socket.accept() - print(f"Accepted connection from {addr}") - self.sockets.append(conn) + self.conn_thread = threading.Thread(target=self.accept_connections, daemon=True) + self.conn_thread.start() else: # Worker node: no additional setup needed - pass + self.conn_thread = None def close(self): if self.rank == 0: @@ -370,6 +375,9 @@ def has_unfinished_dp(dp_group: SocketProcessGroup, has_unfinished: bool) -> boo """ if dp_group.rank == 0: # Master node: collect results from workers + assert dp_group.conn_thread is not None + # Wait for all dp engine connectioned. + dp_group.conn_thread.join() results = [has_unfinished] for conn in dp_group.sockets: data = conn.recv(1024) diff --git a/vllm_mindspore/model_executor/models/mf_models/deepseek_v3.py b/vllm_mindspore/model_executor/models/mf_models/deepseek_v3.py index d0ac3c8c..fa39e0fb 100644 --- a/vllm_mindspore/model_executor/models/mf_models/deepseek_v3.py +++ b/vllm_mindspore/model_executor/models/mf_models/deepseek_v3.py @@ -21,12 +21,14 @@ from typing import Iterable, Set, Tuple from collections import OrderedDict import numpy as np +import vllm.envs as envs +import mindspore as ms from vllm.config import VllmConfig from vllm.config import get_current_vllm_config +from vllm.distributed.parallel_state import get_dp_group, get_tensor_model_parallel_world_size from vllm.forward_context import get_forward_context from vllm.logger import init_logger -import vllm.envs as envs import mindspore as ms from mindspore import Tensor, JitConfig, Model, mutable @@ -55,17 +57,46 @@ from vllm_mindspore.model_executor.models.mf_models.deepseekv3_weight_processor logger = init_logger(__name__) -def set_runtime_kernel_launch_group(): - kernel_launch_group = {'thread_num' : 2, 'kernel_group_num' : 8} - env_kernel_launch_group = os.getenv("EXPERIMENTAL_KERNEL_LAUNCH_GROUP", None) - if env_kernel_launch_group is not None: - pairs = env_kernel_launch_group.split(',') - for pair in pairs: - key, val = pair.split(':') - kernel_launch_group[key] = val - thread_num = int(kernel_launch_group.get('thread_num', 2)) - kernel_group_num = int(kernel_launch_group.get('kernel_group_num', 8)) - ms.runtime.set_kernel_launch_group(thread_num=thread_num, kernel_group_num=kernel_group_num) +def _get_padding_index(q_seq_len): + dp_size = get_dp_group().world_size + tp_size = get_tensor_model_parallel_world_size() + if dp_size == 1 or tp_size == 1: + return None, None, None, None + + tokens_len_per_dp = q_seq_len.sum().reshape(-1) + tokens_len_per_dp = get_dp_group().all_gather(tokens_len_per_dp) + tokens_len_per_dp = tokens_len_per_dp.asnumpy() + padding_size = 
(tokens_len_per_dp.max() + tp_size - 1) // tp_size * tp_size + + dp_rank_id = get_dp_group().rank_in_group + attn_padding_idx = None + attn_unpadding_idx = None + ffn_padding_idx = None + ffn_unpadding_idx = None + last_arange_index = 0 + + for dp_rank, tokens_length in enumerate(tokens_len_per_dp): + arange_data = np.arange(0, int(tokens_length), dtype=np.int32) + if dp_rank == dp_rank_id: + ffn_unpadding_idx = arange_data + attn_padding_idx = np.pad( + arange_data, (0, padding_size - arange_data.shape[0]), mode='constant', constant_values=0) + + if dp_rank == 0: + attn_unpadding_idx = arange_data + last_arange_index = arange_data[-1] + ffn_padding_idx= np.pad(attn_unpadding_idx, (0, padding_size - attn_unpadding_idx.shape[0]), + mode='constant', constant_values=0) + else: + attn_offset_idx = arange_data + padding_size * dp_rank + attn_unpadding_idx = np.concatenate((attn_unpadding_idx, attn_offset_idx), axis=0) + ffn_offset_idx = arange_data + last_arange_index + 1 + last_arange_index = ffn_offset_idx[-1] + ffn_offset_idx_pad_zero = np.pad( + ffn_offset_idx, (0, padding_size - ffn_offset_idx.shape[0]), mode='constant', constant_values=0) + ffn_padding_idx = np.concatenate((ffn_padding_idx, ffn_offset_idx_pad_zero), axis=0) + return ms.from_numpy(attn_padding_idx), ms.from_numpy(attn_unpadding_idx), ms.from_numpy(ffn_padding_idx), \ + ms.from_numpy(ffn_unpadding_idx) class DeepseekV3ForCausalLM(MfModelBase): @@ -92,7 +123,6 @@ class DeepseekV3ForCausalLM(MfModelBase): compilation_config.static_forward_context[str(i)] = self.kv_caches[i] self.set_flags = False - set_runtime_kernel_launch_group() def _generate_model_config(self): self.mf_config.load_checkpoint = self.get_model_path() @@ -137,11 +167,26 @@ class DeepseekV3ForCausalLM(MfModelBase): else: weight_processor = DeepseekV3WeightProcessor(self.mf_config, self.network, self.is_quant) weight_processor.load_safetensors_shard(self.mf_config.load_checkpoint) + self.network.set_dynamic_inputs() dynamic_hidden_states = Tensor(shape=[None, None], dtype=self.mf_model_config.compute_dtype) self.lm_head.set_inputs(dynamic_hidden_states) + return None + def prepare_inputs(self, input_ids, positions, attn_metadata): + model_inputs, is_prefill = super().prepare_inputs( + input_ids, positions, attn_metadata) + + attn_padding_idx, attn_unpadding_idx, ffn_padding_idx, ffn_unpadding_idx = _get_padding_index( + model_inputs["q_seq_lens"]) + model_inputs["attn_padding_idx"] = attn_padding_idx + model_inputs["attn_unpadding_idx"] = attn_unpadding_idx + model_inputs["ffn_padding_idx"] = ffn_padding_idx + model_inputs["ffn_unpadding_idx"] = ffn_unpadding_idx + + return model_inputs, is_prefill + def get_model_path(self): model_name_or_path = self.model_config.model if os.path.isdir(model_name_or_path): diff --git a/vllm_mindspore/model_executor/models/mf_models/deepseekv3_weight_processor.py b/vllm_mindspore/model_executor/models/mf_models/deepseekv3_weight_processor.py index de7d70d0..82e831d0 100644 --- a/vllm_mindspore/model_executor/models/mf_models/deepseekv3_weight_processor.py +++ b/vllm_mindspore/model_executor/models/mf_models/deepseekv3_weight_processor.py @@ -27,7 +27,27 @@ from mindspore import dtype from mindspore.communication.management import get_rank from mindformers.experimental.parallel_core.pynative.parallel_state import get_tensor_model_parallel_rank from vllm_mindspore.model_executor.models.mf_models.weight_processor import BaseWeightProcessor -from vllm_mindspore.utils import convert_np_to_ms_dtype +from vllm.logger import 
init_logger + + +logger = init_logger + + +def convert_np_to_ms_dtype(value): + """convert_np_to_ms_dtype""" + if value.dtype == np.int8: + value_dtype = ms.int8 + elif value.dtype == np.int32: + value_dtype = ms.int32 + elif value.dtype == np.int64: + value_dtype = ms.int64 + elif value.dtype == np.float64: + value_dtype = ms.float64 + elif value.dtype == np.float32: + value_dtype = ms.float32 + else: + value_dtype = ms.bfloat16 + return value_dtype class DeepseekV3WeightProcessor(BaseWeightProcessor): @@ -42,6 +62,9 @@ class DeepseekV3WeightProcessor(BaseWeightProcessor): def __init__(self, config, network, is_quant): super().__init__(config, network, is_quant) self.num_layers = self.config.model.model_config.num_layers + self.expert_num = self.config.moe_config.expert_num + self.moe_tensor_parallel = self.config.moe_config.moe_tensor_parallel + self.moe_expert_parallel = self.config.moe_config.moe_expert_parallel def quant_convert_weight_name(self, weight_name: str): """replace quant net weight name""" @@ -112,28 +135,51 @@ class DeepseekV3WeightProcessor(BaseWeightProcessor): weight[..., -qk_rope_head_dim:, :] = np.concatenate([w1, w2], axis=-2) return weight - def infer_quant_process_moe_routed_expert_ffn_weight(self, src_hf_dir, layer_id, hf_weight_map): - """process moe router expert weight""" - ffn_concat = self.config.model.model_config.ffn_concat - num_router_experts = self.config.moe_config.expert_num + def infer_quant_process_moe_with_tp(self, src_hf_dir, hf_weight_map, num_router_experts, layer_id): + w1_list = [] + w2_list = [] + w3_list = [] - # router expert dense - router_dense_hf_name = f"model.layers.{layer_id}.mlp.gate.weight" - router_dense_ms_name = self.quant_convert_weight_name(router_dense_hf_name) - router_dense_ms_param, _ = self.get_safetensor_from_file(router_dense_hf_name, src_hf_dir, hf_weight_map) - self.parameter_dict[router_dense_ms_name] = ms.Parameter( - ms.from_numpy(router_dense_ms_param).astype(ms.bfloat16), - name=router_dense_ms_name, requires_grad=False) + w1_scale_list = [] + w2_scale_list = [] + w3_scale_list = [] - # e_score_correction_bias - e_score_correction_bias_hf_name = f"model.layers.{layer_id}.mlp.gate.e_score_correction_bias" - e_score_correction_bias_ms_name = self.quant_convert_weight_name(e_score_correction_bias_hf_name) - e_score_correction_bias_ms_param, _ = self.get_safetensor_from_file(e_score_correction_bias_hf_name, src_hf_dir, - hf_weight_map) - self.parameter_dict[e_score_correction_bias_ms_name] = ms.Parameter( - ms.from_numpy(e_score_correction_bias_ms_param).astype(ms.float32), - name=e_score_correction_bias_ms_name, requires_grad=False) + for index in range(0, num_router_experts): + w1_hf_name = f"model.layers.{layer_id}.mlp.experts.{index}.gate_proj.weight" + w2_hf_name = f"model.layers.{layer_id}.mlp.experts.{index}.down_proj.weight" + w3_hf_name = f"model.layers.{layer_id}.mlp.experts.{index}.up_proj.weight" + + w1_ms_param, _ = self.get_moe_safetensor_from_file(w1_hf_name, src_hf_dir, hf_weight_map, + is_split_param=True, split_axis=0) + w2_ms_param, _ = self.get_moe_safetensor_from_file(w2_hf_name, src_hf_dir, hf_weight_map, + is_split_param=True, split_axis=1) + w3_ms_param, _ = self.get_moe_safetensor_from_file(w3_hf_name, src_hf_dir, hf_weight_map, + is_split_param=True, split_axis=0) + + w1_list.append(w1_ms_param) + w2_list.append(w2_ms_param) + w3_list.append(w3_ms_param) + + w1_scale_hf_name = f"model.layers.{layer_id}.mlp.experts.{index}.gate_proj.weight_scale" + w2_scale_hf_name = 
f"model.layers.{layer_id}.mlp.experts.{index}.down_proj.weight_scale" + w3_scale_hf_name = f"model.layers.{layer_id}.mlp.experts.{index}.up_proj.weight_scale" + + w1_scale_ms_param, _ = self.get_moe_safetensor_from_file(w1_scale_hf_name, src_hf_dir, hf_weight_map, + is_split_param=True, split_axis=0) + w2_scale_ms_param, _ = self.get_moe_safetensor_from_file(w2_scale_hf_name, src_hf_dir, hf_weight_map) + w3_scale_ms_param, _ = self.get_moe_safetensor_from_file(w3_scale_hf_name, src_hf_dir, hf_weight_map, + is_split_param=True, split_axis=0) + + w1_scale_ms_param = w1_scale_ms_param.squeeze(axis=-1) + w2_scale_ms_param = w2_scale_ms_param.squeeze(axis=-1) + w3_scale_ms_param = w3_scale_ms_param.squeeze(axis=-1) + w1_scale_list.append(w1_scale_ms_param) + w2_scale_list.append(w2_scale_ms_param) + w3_scale_list.append(w3_scale_ms_param) + return w1_list, w2_list, w3_list, w1_scale_list, w2_scale_list, w3_scale_list + + def infer_quant_process_moe_with_ep(self, src_hf_dir, hf_weight_map, num_router_experts, layer_id): w1_list = [] w2_list = [] w3_list = [] @@ -142,41 +188,75 @@ class DeepseekV3WeightProcessor(BaseWeightProcessor): w2_scale_list = [] w3_scale_list = [] - w1_ms_name = f"model.layers.{layer_id}.feed_forward.routed_experts.ffn.w1._layer.weight" - w2_ms_name = f"model.layers.{layer_id}.feed_forward.routed_experts.ffn.w2._layer.weight" - w3_ms_name = f"model.layers.{layer_id}.feed_forward.routed_experts.ffn.w3._layer.weight" + ep_start = self.moe_ep_rank_id * self.ep_group_nums + ep_stop = (self.moe_ep_rank_id + 1) * self.ep_group_nums + for index in range(ep_start, ep_stop): + w1_hf_name = f"model.layers.{layer_id}.mlp.experts.{index}.gate_proj.weight" + w2_hf_name = f"model.layers.{layer_id}.mlp.experts.{index}.down_proj.weight" + w3_hf_name = f"model.layers.{layer_id}.mlp.experts.{index}.up_proj.weight" - w1_scale_ms_name = f"model.layers.{layer_id}.feed_forward.routed_experts.ffn.w1._layer.matmul.weight_scale" - w2_scale_ms_name = f"model.layers.{layer_id}.feed_forward.routed_experts.ffn.w2._layer.matmul.weight_scale" - w3_scale_ms_name = f"model.layers.{layer_id}.feed_forward.routed_experts.ffn.w3._layer.matmul.weight_scale" + w1_ms_param, _ = self.get_moe_safetensor_from_file(w1_hf_name, src_hf_dir, hf_weight_map) + w2_ms_param, _ = self.get_moe_safetensor_from_file(w2_hf_name, src_hf_dir, hf_weight_map) + w3_ms_param, _ = self.get_moe_safetensor_from_file(w3_hf_name, src_hf_dir, hf_weight_map) - for index in range(0, num_router_experts): - w1_hf_name = f"model.layers.{layer_id}.mlp.experts.{index}.gate_proj.weight" - w1_ms_param, _ = self.get_safetensor_from_file(w1_hf_name, src_hf_dir, hf_weight_map, - is_split_param=True, split_axis=0) + w1_list.append(w1_ms_param) + w2_list.append(w2_ms_param) + w3_list.append(w3_ms_param) - w2_hf_name = f"model.layers.{layer_id}.mlp.experts.{index}.down_proj.weight" - w2_ms_param, _ = self.get_safetensor_from_file(w2_hf_name, src_hf_dir, hf_weight_map, - is_split_param=True, split_axis=1) + w1_scale_hf_name = f"model.layers.{layer_id}.mlp.experts.{index}.gate_proj.weight_scale" + w2_scale_hf_name = f"model.layers.{layer_id}.mlp.experts.{index}.down_proj.weight_scale" + w3_scale_hf_name = f"model.layers.{layer_id}.mlp.experts.{index}.up_proj.weight_scale" + w1_scale_ms_param, _ = self.get_moe_safetensor_from_file(w1_scale_hf_name, src_hf_dir, hf_weight_map) + w2_scale_ms_param, _ = self.get_moe_safetensor_from_file(w2_scale_hf_name, src_hf_dir, hf_weight_map) + w3_scale_ms_param, _ = self.get_moe_safetensor_from_file(w3_scale_hf_name, 
src_hf_dir, hf_weight_map) + + w1_scale_ms_param = w1_scale_ms_param.squeeze(axis=-1) + w2_scale_ms_param = w2_scale_ms_param.squeeze(axis=-1) + w3_scale_ms_param = w3_scale_ms_param.squeeze(axis=-1) + w1_scale_list.append(w1_scale_ms_param) + w2_scale_list.append(w2_scale_ms_param) + w3_scale_list.append(w3_scale_ms_param) + + return w1_list, w2_list, w3_list, w1_scale_list, w2_scale_list, w3_scale_list + + def infer_quant_process_moe_with_ep_tp(self, src_hf_dir, hf_weight_map, num_router_experts, layer_id): + w1_list = [] + w2_list = [] + w3_list = [] + + w1_scale_list = [] + w2_scale_list = [] + w3_scale_list = [] + + ep_start = self.moe_ep_rank_id * self.ep_group_nums + ep_stop = (self.moe_ep_rank_id + 1) * self.ep_group_nums + + for index in range(ep_start, ep_stop): + w1_hf_name = f"model.layers.{layer_id}.mlp.experts.{index}.gate_proj.weight" + w2_hf_name = f"model.layers.{layer_id}.mlp.experts.{index}.down_proj.weight" w3_hf_name = f"model.layers.{layer_id}.mlp.experts.{index}.up_proj.weight" - w3_ms_param, _ = self.get_safetensor_from_file(w3_hf_name, src_hf_dir, hf_weight_map, - is_split_param=True, split_axis=0) + + w1_ms_param, _ = self.get_moe_safetensor_from_file(w1_hf_name, src_hf_dir, hf_weight_map, + is_split_param=True, split_axis=0) + w2_ms_param, _ = self.get_moe_safetensor_from_file(w2_hf_name, src_hf_dir, hf_weight_map, + is_split_param=True, split_axis=1) + w3_ms_param, _ = self.get_moe_safetensor_from_file(w3_hf_name, src_hf_dir, hf_weight_map, + is_split_param=True, split_axis=0) w1_list.append(w1_ms_param) w2_list.append(w2_ms_param) w3_list.append(w3_ms_param) w1_scale_hf_name = f"model.layers.{layer_id}.mlp.experts.{index}.gate_proj.weight_scale" - w1_scale_ms_param, _ = self.get_safetensor_from_file(w1_scale_hf_name, src_hf_dir, hf_weight_map, - is_split_param=True, split_axis=0) - w2_scale_hf_name = f"model.layers.{layer_id}.mlp.experts.{index}.down_proj.weight_scale" - w2_scale_ms_param, _ = self.get_safetensor_from_file(w2_scale_hf_name, src_hf_dir, hf_weight_map) - w3_scale_hf_name = f"model.layers.{layer_id}.mlp.experts.{index}.up_proj.weight_scale" - w3_scale_ms_param, _ = self.get_safetensor_from_file(w3_scale_hf_name, src_hf_dir, hf_weight_map, - is_split_param=True, split_axis=0) + + w1_scale_ms_param, _ = self.get_moe_safetensor_from_file(w1_scale_hf_name, src_hf_dir, hf_weight_map, + is_split_param=True, split_axis=0) + w2_scale_ms_param, _ = self.get_moe_safetensor_from_file(w2_scale_hf_name, src_hf_dir, hf_weight_map) + w3_scale_ms_param, _ = self.get_moe_safetensor_from_file(w3_scale_hf_name, src_hf_dir, hf_weight_map, + is_split_param=True, split_axis=0) w1_scale_ms_param = w1_scale_ms_param.squeeze(axis=-1) w2_scale_ms_param = w2_scale_ms_param.squeeze(axis=-1) @@ -185,6 +265,49 @@ class DeepseekV3WeightProcessor(BaseWeightProcessor): w2_scale_list.append(w2_scale_ms_param) w3_scale_list.append(w3_scale_ms_param) + return w1_list, w2_list, w3_list, w1_scale_list, w2_scale_list, w3_scale_list + + def infer_quant_process_moe(self, src_hf_dir, hf_weight_map, num_router_experts, layer_id): + if self.moe_expert_parallel > 1 and self.moe_tensor_parallel > 1: + return self.infer_quant_process_moe_with_ep_tp(src_hf_dir, hf_weight_map, num_router_experts, layer_id) + elif self.moe_tensor_parallel > 1: + return self.infer_quant_process_moe_with_tp(src_hf_dir, hf_weight_map, num_router_experts, layer_id) + else: + return self.infer_quant_process_moe_with_ep(src_hf_dir, hf_weight_map, num_router_experts, layer_id) + + def 
infer_quant_process_moe_routed_expert_ffn_weight(self, src_hf_dir, layer_id, hf_weight_map): + """process moe router expert weight""" + ffn_concat = self.config.model.model_config.ffn_concat + num_router_experts = self.config.moe_config.expert_num + + # router expert dense + router_dense_hf_name = f"model.layers.{layer_id}.mlp.gate.weight" + router_dense_ms_name = self.quant_convert_weight_name(router_dense_hf_name) + router_dense_ms_param, _ = self.get_safetensor_from_file(router_dense_hf_name, src_hf_dir, hf_weight_map) + self.parameter_dict[router_dense_ms_name] = ms.Parameter( + ms.from_numpy(router_dense_ms_param).astype(ms.bfloat16), + name=router_dense_ms_name, requires_grad=False) + + # e_score_correction_bias + e_score_correction_bias_hf_name = f"model.layers.{layer_id}.mlp.gate.e_score_correction_bias" + e_score_correction_bias_ms_name = self.quant_convert_weight_name(e_score_correction_bias_hf_name) + e_score_correction_bias_ms_param, _ = self.get_safetensor_from_file(e_score_correction_bias_hf_name, src_hf_dir, + hf_weight_map) + self.parameter_dict[e_score_correction_bias_ms_name] = ms.Parameter( + ms.from_numpy(e_score_correction_bias_ms_param).astype(ms.float32), + name=e_score_correction_bias_ms_name, requires_grad=False) + + w1_ms_name = f"model.layers.{layer_id}.feed_forward.routed_experts.ffn.w1._layer.weight" + w2_ms_name = f"model.layers.{layer_id}.feed_forward.routed_experts.ffn.w2._layer.weight" + w3_ms_name = f"model.layers.{layer_id}.feed_forward.routed_experts.ffn.w3._layer.weight" + + w1_scale_ms_name = f"model.layers.{layer_id}.feed_forward.routed_experts.ffn.w1._layer.matmul.weight_scale" + w2_scale_ms_name = f"model.layers.{layer_id}.feed_forward.routed_experts.ffn.w2._layer.matmul.weight_scale" + w3_scale_ms_name = f"model.layers.{layer_id}.feed_forward.routed_experts.ffn.w3._layer.matmul.weight_scale" + + w1_list, w2_list, w3_list, w1_scale_list, w2_scale_list, w3_scale_list = \ + self.infer_quant_process_moe(src_hf_dir, hf_weight_map, num_router_experts, layer_id) + w1_ms_stack_param = np.stack(w1_list, axis=0) w2_ms_stack_param = np.stack(w2_list, axis=0) w3_ms_stack_param = np.stack(w3_list, axis=0) @@ -668,7 +791,7 @@ class DeepseekV3WeightProcessor(BaseWeightProcessor): return weight_name def convert_mtp_weight_name(self, weight_name: str): - layer = 0 if 'layers.' not in weight_name else int(weight_name[weight_name.find('layers.') : ].split('.')[1]) + layer = 0 if 'layers.' 
not in weight_name else int(weight_name[weight_name.find('layers.'):].split('.')[1]) if layer < self.num_layers: return weight_name mtp_prefix = f'mtp_model' @@ -970,12 +1093,12 @@ class DeepseekV3WeightProcessor(BaseWeightProcessor): ms_name = self.convert_weight_name(hf_name) if prefix_name in head_names and not self.config.parallel_config.vocab_emb_dp: ms_param, _ = self.get_safetensor_from_file(hf_name, src_hf_dir, hf_weight_map, - is_split_param=True, split_axis=0) + is_split_param=True, split_axis=0) else: ms_param, _ = self.get_safetensor_from_file(hf_name, src_hf_dir, hf_weight_map) parameter_dict[ms_name] = ms.Parameter(ms.Tensor(ms_param, ms.bfloat16), - name=ms_name, - requires_grad=False) + name=ms_name, + requires_grad=False) _, ckpt_not_load = ms.load_param_into_net(self.network, parameter_dict) @@ -1333,8 +1456,67 @@ class DeepseekV3WeightProcessor(BaseWeightProcessor): if "model.layers" in param_name and int(param_name.split('.')[2]) >= num_layers: continue - if any([name in param_name for name in skip_layer]): - continue + if any([name in param_name for name in no_need_split_layer]): + value, _ = self.get_safetensor_from_file(param_name, src_hf_dir, + hf_weight_map) + elif any([name in param_name for name in [".l2q_proj.", ".feed_forward.w_gate_hidden.", + "shared_experts.w_gate_hidden"]]): + if param_name.endswith(".weight") or "matmul" in param_name: + value, _ = self.get_safetensor_from_file(param_name, src_hf_dir, + hf_weight_map, is_split_param=True, + split_axis=0) + else: + value, _ = self.get_safetensor_from_file(param_name, src_hf_dir, + hf_weight_map) + elif any([name in param_name for name in [".feed_forward.w2.", ".wo.", "shared_experts.w2"]]): + if param_name.endswith(".weight"): + value, _ = self.get_safetensor_from_file(param_name, src_hf_dir, + hf_weight_map, is_split_param=True, + split_axis=1) + elif "quant_op" in param_name: + value, _ = self.get_safetensor_from_file(param_name, src_hf_dir, + hf_weight_map, is_split_param=True, + split_axis=0) + else: + value, _ = self.get_safetensor_from_file(param_name, src_hf_dir, + hf_weight_map) + elif ".routed_experts.ffn.w_gate_hidden." 
in param_name: + if param_name.endswith(".weight"): + value, _ = self.get_safetensor_from_file(param_name, src_hf_dir, hf_weight_map) + value_list = [] + for experts_id in range(value.shape[0]): + value_list.append(self.split_weight_by_rank(value[experts_id, :, :], split_axis=1)) + value = np.stack(value_list, axis=0) + elif "matmul" in param_name: + value, _ = self.get_safetensor_from_file(param_name, src_hf_dir, hf_weight_map) + value_list = [] + for experts_id in range(value.shape[0]): + value_list.append(self.split_weight_by_rank(value[experts_id, :], split_axis=0)) + value = np.stack(value_list, axis=0) + else: + value, _ = self.get_safetensor_from_file(param_name, src_hf_dir, + hf_weight_map) + elif ".routed_experts.ffn.w2" in param_name: + if param_name.endswith(".weight"): + value, _ = self.get_safetensor_from_file(param_name, src_hf_dir, hf_weight_map) + value_list = [] + for experts_id in range(value.shape[0]): + value_list.append(self.split_weight_by_rank(value[experts_id, :, :], split_axis=0)) + value = np.stack(value_list, axis=0) + else: + value, _ = self.get_safetensor_from_file(param_name, src_hf_dir, + hf_weight_map) + elif any([name in param_name for name in ["lkv2kv_k_nope", "lkv2kv_v"]]): + value, _ = self.get_safetensor_from_file(param_name, src_hf_dir, hf_weight_map, + is_split_param=True, split_axis=0) + elif "lm_head" in param_name: + if not self.config.parallel_config.vocab_emb_dp: + value, _ = self.get_safetensor_from_file(param_name, src_hf_dir, hf_weight_map, + is_split_param=True, split_axis=0) + else: + value, _ = self.get_safetensor_from_file(param_name, src_hf_dir, hf_weight_map) + else: + raise ValueError(f"not found layer {param_name}, please check safetensors file.") value = self.infer_smooth_quant_get_value(param_name, src_hf_dir, hf_weight_map, no_need_split_layer) dst_dtype = convert_np_to_ms_dtype(value) @@ -1343,8 +1525,8 @@ class DeepseekV3WeightProcessor(BaseWeightProcessor): name=param_name, requires_grad=False) param_not_load, ckpt_not_load = ms.load_param_into_net(self.network, parameter_dict) - print(f"smoothquant param_not_load:{param_not_load}") - print(f"smoothquant ckpt_not_load:{ckpt_not_load}") + logger.info("smoothquant param_not_load: %s" % str(param_not_load)) + logger.info("smoothquant ckpt_not_load: %s" % str(ckpt_not_load)) def infer_gptq_quant_net_ms_convert_layer_weight(self, src_hf_dir, num_layers, hf_weight_map): """infer_gptq_quant_net_ms_convert_layer_weight""" @@ -1456,6 +1638,7 @@ class DeepseekV3WeightProcessor(BaseWeightProcessor): else: self.infer_convert_layer_weight(src_hf_dir, layer_id, hf_weight_map) - ms.load_param_into_net(self.network, self.parameter_dict) + param_not_load, ckpt_not_load = ms.load_param_into_net(self.network, self.parameter_dict) + logger.info("param_not_load: %s, ckpt_not_load: %s" % (str(param_not_load), str(ckpt_not_load))) del self.parameter_dict gc.collect() diff --git a/vllm_mindspore/model_executor/models/mf_models/weight_processor.py b/vllm_mindspore/model_executor/models/mf_models/weight_processor.py index 82a104f6..f46334fa 100644 --- a/vllm_mindspore/model_executor/models/mf_models/weight_processor.py +++ b/vllm_mindspore/model_executor/models/mf_models/weight_processor.py @@ -19,9 +19,9 @@ transform huggingface safetensor. 
import os from safetensors import safe_open -from mindspore.communication.management import get_rank, get_group_size - -from vllm.distributed import get_tensor_model_parallel_world_size, get_tensor_model_parallel_rank +from mindspore.communication.management import get_rank +from mindformers.experimental.infer.core.utils import get_tp_world_size +from mindformers.experimental.parallel_core.pynative.parallel_state import get_data_parallel_world_size class BaseWeightProcessor: r""" @@ -36,8 +36,29 @@ class BaseWeightProcessor: self.config = config self.network = network self.is_quant = is_quant - self.tp_group_size = get_tensor_model_parallel_world_size() - self.rank_id = get_tensor_model_parallel_rank() + self.global_rank_id = get_rank() + self.tp_group_size = get_tp_world_size() + self.dp_group_size = get_data_parallel_world_size() + self.moe_ep_size = self.config.moe_config.moe_expert_parallel + self.moe_tp_size = self.config.moe_config.moe_tensor_parallel + self.tp_rank_id = self.global_rank_id % self.tp_group_size + + num_router_experts = self.config.moe_config.expert_num + self.ep_group_nums = num_router_experts // self.moe_ep_size + self.moe_ep_rank_id = self.global_rank_id // self.moe_tp_size + self.moe_tp_rank_id = self.global_rank_id % self.moe_tp_size + + print(f"global_rank_id: {self.global_rank_id} \n" + f"tp_group_size: {self.tp_group_size} \n" + f"dp_group_size: {self.dp_group_size} \n" + f"tp_rank_id: {self.tp_rank_id} \n" + f"num_router_experts: {num_router_experts} \n" + f"ep_group_nums: {self.ep_group_nums} \n" + f"moe_ep_rank_id: {self.moe_ep_rank_id} \n" + f"moe_tp_rank_id: {self.moe_tp_rank_id} \n" + f"moe_ep_size: {self.moe_ep_size} \n" + f"moe_tp_size: {self.moe_tp_size}", flush=True) + self.parameter_dict = {} self.file_handles = {} @@ -50,6 +71,33 @@ class BaseWeightProcessor: def release_file_handles(self): del self.file_handles + def get_moe_safetensor_from_file(self, hf_param_name, src_hf_dir, hf_weight_map, is_split_param=False, split_axis=0): + safetensor_file = hf_weight_map[hf_param_name] + filename = os.path.join(src_hf_dir, safetensor_file) + sf_file = self.get_file_handles(filename) + qint4 = False + if sf_file.metadata() is not None and hf_param_name in sf_file.metadata().keys(): + qint4 = True + if not is_split_param or self.moe_tp_size == 1: + np_data = sf_file.get_tensor(hf_param_name) + return np_data, qint4 + + np_data = sf_file.get_slice(hf_param_name) + shape = np_data.get_shape() + if split_axis == 0: + split_size = shape[0] // self.moe_tp_size + start = self.moe_tp_rank_id * split_size + stop = (self.moe_tp_rank_id + 1) * split_size + split_data = np_data[start:stop] + elif split_axis == 1: + split_size = shape[1] // self.moe_tp_size + start = self.moe_tp_rank_id * split_size + stop = (self.moe_tp_rank_id + 1) * split_size + split_data = np_data[:, start:stop] + else: + raise ValueError("split_axis:{} is not supported.".format(split_axis)) + return split_data, qint4 + def get_safetensor_from_file(self, hf_param_name, src_hf_dir, hf_weight_map, is_split_param=False, split_axis=0): safetensor_file = hf_weight_map[hf_param_name] filename = os.path.join(src_hf_dir, safetensor_file) @@ -57,7 +105,7 @@ class BaseWeightProcessor: qint4 = False if sf_file.metadata() is not None and hf_param_name in sf_file.metadata().keys(): qint4 = True - if not is_split_param: + if not is_split_param or self.tp_group_size == 1: np_data = sf_file.get_tensor(hf_param_name) return np_data, qint4 @@ -65,13 +113,13 @@ class BaseWeightProcessor: shape = np_data.get_shape() 
if split_axis == 0: split_size = shape[0] // self.tp_group_size - start = self.rank_id * split_size - stop = (self.rank_id + 1) * split_size + start = self.tp_rank_id * split_size + stop = (self.tp_rank_id + 1) * split_size split_data = np_data[start:stop] elif split_axis == 1: split_size = shape[1] // self.tp_group_size - start = self.rank_id * split_size - stop = (self.rank_id + 1) * split_size + start = self.tp_rank_id * split_size + stop = (self.tp_rank_id + 1) * split_size split_data = np_data[:, start:stop] elif split_axis == 2: split_size = shape[2] // self.tp_group_size @@ -83,16 +131,19 @@ class BaseWeightProcessor: return split_data, qint4 def split_weight_by_rank(self, weight, split_axis=0): + if self.tp_group_size == 1: + return weight + shape = weight.shape if split_axis == 0: split_size = shape[0] // self.tp_group_size - start = self.rank_id * split_size - stop = (self.rank_id + 1) * split_size + start = self.tp_rank_id * split_size + stop = (self.tp_rank_id + 1) * split_size split_data = weight[start:stop] elif split_axis == 1: split_size = shape[1] // self.tp_group_size - start = self.rank_id * split_size - stop = (self.rank_id + 1) * split_size + start = self.tp_rank_id * split_size + stop = (self.tp_rank_id + 1) * split_size split_data = weight[:, start:stop] else: raise ValueError("split_axis:{} is not supported.".format(split_axis)) diff --git a/vllm_mindspore/scripts.py b/vllm_mindspore/scripts.py index f35190fa..3bbfbbc3 100644 --- a/vllm_mindspore/scripts.py +++ b/vllm_mindspore/scripts.py @@ -54,7 +54,7 @@ def env_setup(target_env_dict=None): def main(): env_setup() - from vllm.scripts import main as vllm_main + from vllm.entrypoints.cli.main import main as vllm_main vllm_main() -- Gitee From 6468923f6425c7da3b04a4f4aec4dfc56f8c309a Mon Sep 17 00:00:00 2001 From: Erpim Date: Sat, 26 Apr 2025 11:48:07 +0800 Subject: [PATCH 14/55] replace flatten --- vllm_mindspore/v1/worker/gpu_model_runner.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm_mindspore/v1/worker/gpu_model_runner.py b/vllm_mindspore/v1/worker/gpu_model_runner.py index 988ee71c..4928b56a 100644 --- a/vllm_mindspore/v1/worker/gpu_model_runner.py +++ b/vllm_mindspore/v1/worker/gpu_model_runner.py @@ -95,7 +95,7 @@ def _prepare_inputs( req_indices * self.input_batch.token_ids_cpu.shape[1]) self.input_ids[:total_num_scheduled_tokens] = torch.from_numpy( - np.take(self.input_batch.token_ids_cpu.flatten(), + np.take(self.input_batch.token_ids_cpu.ravel(), token_indices, 0) ) @@ -110,7 +110,7 @@ def _prepare_inputs( positions_np // self.block_size) - block_numbers = self.input_batch.block_table.block_table_np.flatten()[block_table_indices] + block_numbers = self.input_batch.block_table.block_table_np.ravel()[block_table_indices] block_offsets = positions_np % self.block_size np.add(block_numbers * self.block_size, block_offsets, -- Gitee From ba95224c137aa1158b7be613b690637fa952609e Mon Sep 17 00:00:00 2001 From: Erpim Date: Sat, 26 Apr 2025 15:48:05 +0800 Subject: [PATCH 15/55] support ep weight load --- .../mf_models/deepseekv3_weight_processor.py | 48 ++++++++++++++----- .../models/mf_models/weight_processor.py | 32 +++++++++---- 2 files changed, 60 insertions(+), 20 deletions(-) diff --git a/vllm_mindspore/model_executor/models/mf_models/deepseekv3_weight_processor.py b/vllm_mindspore/model_executor/models/mf_models/deepseekv3_weight_processor.py index 82e831d0..58a186b6 100644 --- a/vllm_mindspore/model_executor/models/mf_models/deepseekv3_weight_processor.py +++ 
b/vllm_mindspore/model_executor/models/mf_models/deepseekv3_weight_processor.py @@ -26,11 +26,11 @@ import mindspore as ms from mindspore import dtype from mindspore.communication.management import get_rank from mindformers.experimental.parallel_core.pynative.parallel_state import get_tensor_model_parallel_rank -from vllm_mindspore.model_executor.models.mf_models.weight_processor import BaseWeightProcessor +from vllm_mindspore.model_executor.models.mf_models.weight_processor import BaseWeightProcessor, EPMethod from vllm.logger import init_logger -logger = init_logger +logger = init_logger(__name__) def convert_np_to_ms_dtype(value): @@ -65,6 +65,11 @@ class DeepseekV3WeightProcessor(BaseWeightProcessor): self.expert_num = self.config.moe_config.expert_num self.moe_tensor_parallel = self.config.moe_config.moe_tensor_parallel self.moe_expert_parallel = self.config.moe_config.moe_expert_parallel + self.ep_method = EPMethod.DEFAULT + if self.dp_group_size > 1 and self.moe_expert_parallel == self.global_group_size: + self.ep_method = EPMethod.ALLTOALL + elif self.dp_group_size > 1: + self.ep_method = EPMethod.ALLGATHER def quant_convert_weight_name(self, weight_name: str): """replace quant net weight name""" @@ -363,24 +368,38 @@ class DeepseekV3WeightProcessor(BaseWeightProcessor): name=w2_scale_ms_name, requires_grad=False) + def get_moe_shared_expert_split_info(self): + split_num = -1 + rank_id = -1 + if self.ep_method == EPMethod.ALLGATHER: + split_num = self.global_group_size + rank_id = get_rank() + elif self.ep_method == EPMethod.ALLTOALL: + split_num = 1 + rank_id = 0 + return split_num, rank_id + def infer_quant_process_moe_shared_expert_ffn_weight(self, src_hf_dir, layer_id, hf_weight_map): """infer quant process moe shared expert ffn weight""" + split_num, rank_id = self.get_moe_shared_expert_split_info() ffn_concat = self.config.model.model_config.ffn_concat w1_hf_name = f"model.layers.{layer_id}.mlp.shared_experts.gate_proj.weight" w1_ms_name = self.quant_convert_weight_name(w1_hf_name) - w1_ms_param, _ = self.get_safetensor_from_file(w1_hf_name, src_hf_dir, hf_weight_map, - is_split_param=True, split_axis=0) + w1_ms_param, _ = self.get_safetensor_from_file(w1_hf_name, src_hf_dir, hf_weight_map, is_split_param=True, + split_axis=0, split_num=split_num, rank_id=rank_id) w1_scale_hf_name = f"model.layers.{layer_id}.mlp.shared_experts.gate_proj.weight_scale" w1_scale_ms_name = self.quant_convert_weight_name(w1_scale_hf_name) w1_scale_ms_param, _ = self.get_safetensor_from_file(w1_scale_hf_name, src_hf_dir, hf_weight_map, - is_split_param=True, split_axis=0) + is_split_param=True, split_axis=0, split_num=split_num, + rank_id=rank_id) w2_hf_name = f"model.layers.{layer_id}.mlp.shared_experts.down_proj.weight" w2_ms_name = self.quant_convert_weight_name(w2_hf_name) w2_ms_param, _ = self.get_safetensor_from_file(w2_hf_name, src_hf_dir, hf_weight_map, - is_split_param=True, split_axis=1) + is_split_param=True, split_axis=1, split_num=split_num, + rank_id=rank_id) w2_scale_hf_name = f"model.layers.{layer_id}.mlp.shared_experts.down_proj.weight_scale" w2_scale_ms_name = self.quant_convert_weight_name(w2_scale_hf_name) @@ -389,12 +408,14 @@ class DeepseekV3WeightProcessor(BaseWeightProcessor): w3_hf_name = f"model.layers.{layer_id}.mlp.shared_experts.up_proj.weight" w3_ms_name = self.quant_convert_weight_name(w3_hf_name) w3_ms_param, _ = self.get_safetensor_from_file(w3_hf_name, src_hf_dir, hf_weight_map, - is_split_param=True, split_axis=0) + is_split_param=True, split_axis=0, 
split_num=split_num, + rank_id=rank_id) w3_scale_hf_name = f"model.layers.{layer_id}.mlp.shared_experts.up_proj.weight_scale" w3_scale_ms_name = self.quant_convert_weight_name(w3_scale_hf_name) w3_scale_ms_param, _ = self.get_safetensor_from_file(w3_scale_hf_name, src_hf_dir, hf_weight_map, - is_split_param=True, split_axis=0) + is_split_param=True, split_axis=0, split_num=split_num, + rank_id=rank_id) w1_scale_ms_param = w1_scale_ms_param.squeeze(axis=-1) w2_scale_ms_param = w2_scale_ms_param.squeeze(axis=-1) @@ -888,21 +909,26 @@ class DeepseekV3WeightProcessor(BaseWeightProcessor): def infer_process_moe_shared_expert_ffn_weight(self, src_hf_dir, layer_id, hf_weight_map): """infer process moe shared expert ffn weight""" + split_num, rank_id = self.get_moe_shared_expert_split_info() + ffn_concat = self.config.model.model_config.ffn_concat w1_hf_name = f"model.layers.{layer_id}.mlp.shared_experts.gate_proj.weight" w1_ms_name = self.convert_weight_name(w1_hf_name) w1_ms_param, _ = self.get_safetensor_from_file(w1_hf_name, src_hf_dir, hf_weight_map, - is_split_param=True, split_axis=0) + is_split_param=True, split_axis=0, split_num=split_num, + rank_id=rank_id) w2_hf_name = f"model.layers.{layer_id}.mlp.shared_experts.down_proj.weight" w2_ms_name = self.convert_weight_name(w2_hf_name) w2_ms_param, _ = self.get_safetensor_from_file(w2_hf_name, src_hf_dir, hf_weight_map, - is_split_param=True, split_axis=1) + is_split_param=True, split_axis=1, split_num=split_num, + rank_id=rank_id) w3_hf_name = f"model.layers.{layer_id}.mlp.shared_experts.up_proj.weight" w3_ms_name = self.convert_weight_name(w3_hf_name) w3_ms_param, _ = self.get_safetensor_from_file(w3_hf_name, src_hf_dir, hf_weight_map, - is_split_param=True, split_axis=0) + is_split_param=True, split_axis=0, split_num=split_num, + rank_id=rank_id) if ffn_concat: w_gate_hidden_name = f"model.layers.{layer_id}.feed_forward.shared_experts.w_gate_hidden.weight" diff --git a/vllm_mindspore/model_executor/models/mf_models/weight_processor.py b/vllm_mindspore/model_executor/models/mf_models/weight_processor.py index f46334fa..696367ec 100644 --- a/vllm_mindspore/model_executor/models/mf_models/weight_processor.py +++ b/vllm_mindspore/model_executor/models/mf_models/weight_processor.py @@ -18,11 +18,20 @@ transform huggingface safetensor. """ import os +from enum import Enum from safetensors import safe_open -from mindspore.communication.management import get_rank +from mindspore.communication.management import get_rank, get_group_size from mindformers.experimental.infer.core.utils import get_tp_world_size from mindformers.experimental.parallel_core.pynative.parallel_state import get_data_parallel_world_size +class EPMethod(Enum): + """ + EP method enums + """ + DEFAULT = 'default' + ALLTOALL = 'alltoall' + ALLGATHER = 'allgather' + class BaseWeightProcessor: r""" Provide model weight load and shards. 
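The hunks that follow thread explicit `split_num` / `rank_id` arguments through `get_safetensor_from_file`, so the same lazy-slicing path can serve tensor-parallel, expert-parallel and global-group shards. A minimal sketch of that per-rank slicing, using only the safetensors lazy-slice API already used in this file (names such as `weight_path` and the even-divisibility assumption are illustrative, not part of the patch):

from safetensors import safe_open

def load_shard(weight_path, param_name, rank_id, split_num, split_axis=0):
    # Lazily open the checkpoint and read only this rank's shard,
    # mirroring the shape[axis] // split_num partitioning used above.
    with safe_open(weight_path, framework="np") as f:
        lazy = f.get_slice(param_name)        # no full-tensor read yet
        dim = lazy.get_shape()[split_axis]
        split_size = dim // split_num         # assumes dim divides evenly
        start, stop = rank_id * split_size, (rank_id + 1) * split_size
        if split_axis == 0:
            return lazy[start:stop]
        if split_axis == 1:
            return lazy[:, start:stop]
        raise ValueError("split_axis:{} is not supported.".format(split_axis))
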
@@ -37,10 +46,12 @@ class BaseWeightProcessor: self.network = network self.is_quant = is_quant self.global_rank_id = get_rank() + self.global_group_size = get_group_size() self.tp_group_size = get_tp_world_size() self.dp_group_size = get_data_parallel_world_size() self.moe_ep_size = self.config.moe_config.moe_expert_parallel self.moe_tp_size = self.config.moe_config.moe_tensor_parallel + self.ep_method = EPMethod.DEFAULT self.tp_rank_id = self.global_rank_id % self.tp_group_size num_router_experts = self.config.moe_config.expert_num @@ -98,28 +109,31 @@ class BaseWeightProcessor: raise ValueError("split_axis:{} is not supported.".format(split_axis)) return split_data, qint4 - def get_safetensor_from_file(self, hf_param_name, src_hf_dir, hf_weight_map, is_split_param=False, split_axis=0): + def get_safetensor_from_file(self, hf_param_name, src_hf_dir, hf_weight_map, is_split_param=False, split_axis=0, + split_num=-1, rank_id=-1): + rank_id = rank_id if rank_id != -1 else self.tp_rank_id + split_num = split_num if split_num != -1 else self.tp_group_size safetensor_file = hf_weight_map[hf_param_name] filename = os.path.join(src_hf_dir, safetensor_file) sf_file = self.get_file_handles(filename) qint4 = False if sf_file.metadata() is not None and hf_param_name in sf_file.metadata().keys(): qint4 = True - if not is_split_param or self.tp_group_size == 1: + if not is_split_param or split_num == 1: np_data = sf_file.get_tensor(hf_param_name) return np_data, qint4 np_data = sf_file.get_slice(hf_param_name) shape = np_data.get_shape() if split_axis == 0: - split_size = shape[0] // self.tp_group_size - start = self.tp_rank_id * split_size - stop = (self.tp_rank_id + 1) * split_size + split_size = shape[0] // split_num + start = rank_id * split_size + stop = (rank_id + 1) * split_size split_data = np_data[start:stop] elif split_axis == 1: - split_size = shape[1] // self.tp_group_size - start = self.tp_rank_id * split_size - stop = (self.tp_rank_id + 1) * split_size + split_size = shape[1] // split_num + start = rank_id * split_size + stop = (rank_id + 1) * split_size split_data = np_data[:, start:stop] elif split_axis == 2: split_size = shape[2] // self.tp_group_size -- Gitee From 37fd6cd52c5c8930024fa65eb029b600ff93c483 Mon Sep 17 00:00:00 2001 From: zlq2020 Date: Thu, 24 Apr 2025 09:54:29 +0800 Subject: [PATCH 16/55] update log config --- setup.py | 16 ++--- vllm_mindspore/__init__.py | 7 +- vllm_mindspore/config.py | 10 +-- vllm_mindspore/logger.py | 70 +++++++++++++++++++ .../mf_models/deepseekv3_infer_save_ckpt.py | 5 +- vllm_mindspore/scripts.py | 2 + vllm_mindspore/utils.py | 6 +- vllm_mindspore/v1/worker/gpu_model_runner.py | 1 - vllm_mindspore/worker/profile.py | 6 +- 9 files changed, 101 insertions(+), 22 deletions(-) create mode 100644 vllm_mindspore/logger.py diff --git a/setup.py b/setup.py index 2f304bcc..2f5080a2 100644 --- a/setup.py +++ b/setup.py @@ -40,6 +40,7 @@ def load_module_from_path(module_name, path): ROOT_DIR = os.path.dirname(__file__) +logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) @@ -121,7 +122,7 @@ class CustomBuildExt(build_ext): # "vllm_mindspore.npu_ops" --> "npu_ops" ext_name = ext.name.split('.')[-1] so_name = ext_name + ".so" - print(f"Building {so_name} ...") + logger.info(f"Building {so_name} ...") OPS_DIR = os.path.join(ROOT_DIR, "vllm_mindspore", "ops") BUILD_OPS_DIR = os.path.join(ROOT_DIR, "build", "ops") os.makedirs(BUILD_OPS_DIR, exist_ok=True) @@ -143,12 +144,12 @@ class CustomBuildExt(build_ext): try: # Run the combined 
cmake command - print(f"Running combined CMake commands:\n{cmake_cmd}") + logger.info(f"Running combined CMake commands:\n{cmake_cmd}") result = subprocess.run(cmake_cmd, cwd=self.ROOT_DIR, text=True, shell=True, capture_output=True) if result.returncode != 0: - print("CMake commands failed:") - print(result.stdout) # Print standard output - print(result.stderr) # Print error output + logger.info("CMake commands failed:") + logger.info(result.stdout) # Print standard output + logger.info(result.stderr) # Print error output raise RuntimeError(f"Combined CMake commands failed with exit code {result.returncode}") except subprocess.CalledProcessError as e: raise RuntimeError(f"Failed to build {so_name}: {e}") @@ -160,10 +161,7 @@ class CustomBuildExt(build_ext): if os.path.exists(dst_so_path): os.remove(dst_so_path) shutil.copy(src_so_path, dst_so_path) - print(f"Copied {so_name} to {dst_so_path}") - - -write_commit_id() + logger.info(f"Copied {so_name} to {dst_so_path}") package_data = { "": [ diff --git a/vllm_mindspore/__init__.py b/vllm_mindspore/__init__.py index 032415f0..3aae2091 100644 --- a/vllm_mindspore/__init__.py +++ b/vllm_mindspore/__init__.py @@ -25,14 +25,17 @@ if "vllm" in sys.modules: "vllm import before vllm_mindspore, vllm_mindspore cannot worker right!" ) +# 1. set env before import mindspore. from vllm_mindspore.scripts import env_setup - env_setup() -# should be place on the top of the file. +# 2. replace the inductor_pass module before import vllm. from vllm_mindspore.compilation import inductor_pass as ms_inductor_pass sys.modules.update({"vllm.compilation.inductor_pass": ms_inductor_pass}) +# 3. update the log configuration ahead of other modifications. +import vllm_mindspore.logger + from vllm_mindspore.platforms.ascend import AscendPlatform ascend_platform = AscendPlatform() diff --git a/vllm_mindspore/config.py b/vllm_mindspore/config.py index 8079ca35..0fd6ca23 100644 --- a/vllm_mindspore/config.py +++ b/vllm_mindspore/config.py @@ -30,7 +30,7 @@ from transformers import PretrainedConfig import vllm.envs as envs -from vllm.config import VllmConfig, CompilationConfig, CompilationLevel, logger, _STR_DTYPE_TO_TORCH_DTYPE +from vllm.config import VllmConfig, CompilationConfig, CompilationLevel, _STR_DTYPE_TO_TORCH_DTYPE from vllm.utils import random_uuid from vllm.logger import init_logger from vllm.compilation.inductor_pass import CallableInductorPass, InductorPass @@ -313,7 +313,7 @@ class SocketProcessGroup: self.server_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM) self.server_socket.bind((self.master_ip, self.master_port)) self.server_socket.listen(self.world_size - 1) - print(f"Master node listening on {self.master_ip}:{self.master_port}") + logger.info(f"Master node listening on {self.master_ip}:{self.master_port}") else: # Worker node: connect to the master self.client_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM) @@ -321,11 +321,11 @@ class SocketProcessGroup: while retries < self.max_retries: try: self.client_socket.connect((self.master_ip, self.master_port)) - print(f"Worker {self.rank} connected to master at {self.master_ip}:{self.master_port}") + logger.info(f"Worker {self.rank} connected to master at {self.master_ip}:{self.master_port}") break except ConnectionRefusedError: retries += 1 - print(f"Worker {self.rank} failed to connect to master. Retrying in {self.retry_interval} seconds... ({retries}/{self.max_retries})") + logger.warning(f"Worker {self.rank} failed to connect to master. 
Retrying in {self.retry_interval} seconds... ({retries}/{self.max_retries})") time.sleep(self.retry_interval) else: raise ConnectionError(f"Worker {self.rank} could not connect to master at {self.master_ip}:{self.master_port} after {self.max_retries} retries.") @@ -408,4 +408,4 @@ def stateless_destroy_socket_process_group(dp_group: "SocketProcessGroup") -> No """ if dp_group: dp_group.close() - print(f"Socket process group for rank {dp_group.rank} destroyed.") + logger.info(f"Socket process group for rank {dp_group.rank} destroyed.") diff --git a/vllm_mindspore/logger.py b/vllm_mindspore/logger.py new file mode 100644 index 00000000..fcdbe610 --- /dev/null +++ b/vllm_mindspore/logger.py @@ -0,0 +1,70 @@ +#!/usr/bin/env python3 +# encoding: utf-8 +# Copyright 2025 Huawei Technologies Co., Ltd +# Copyright 2024 The vLLM team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +"""init logger for vllm-mindspore.""" + +from logging.config import dictConfig +import vllm.envs as envs +from vllm.logger import DEFAULT_LOGGING_CONFIG, init_logger + +VLLM_CONFIGURE_LOGGING = envs.VLLM_CONFIGURE_LOGGING +VLLM_LOGGING_CONFIG_PATH = envs.VLLM_LOGGING_CONFIG_PATH +VLLM_LOGGING_LEVEL = envs.VLLM_LOGGING_LEVEL +VLLM_LOGGING_PREFIX = envs.VLLM_LOGGING_PREFIX + +_DATE_FORMAT = "%m-%d %H:%M:%S" +_MS_FORMAT = (f"{VLLM_LOGGING_PREFIX}%(levelname)s %(asctime)s " + "vllm-mindspore[%(filename)s:%(lineno)d] %(message)s") + +_MS_FORMATTERS = { + "vllm_mindspore": { + "class": "vllm.logging_utils.NewLineFormatter", + "datefmt": _DATE_FORMAT, + "format": _MS_FORMAT, + } +} + +_MS_HANDLERS = { + "vllm_mindspore": { + "class": "logging.StreamHandler", + "formatter": "vllm_mindspore", + "level": VLLM_LOGGING_LEVEL, + "stream": "ext://sys.stdout", + } +} + +_MS_LOGGERS = { + "vllm_mindspore": { + "handlers": ["vllm_mindspore"], + "level": "DEBUG", + "propagate": False, + } +} + +def _update_configure_vllm_root_logger() -> None: + if VLLM_CONFIGURE_LOGGING and not VLLM_LOGGING_CONFIG_PATH: + logging_config = DEFAULT_LOGGING_CONFIG + logging_config["formatters"].update(_MS_FORMATTERS) + logging_config["handlers"].update(_MS_HANDLERS) + logging_config["loggers"].update(_MS_LOGGERS) + + dictConfig(logging_config) + +_update_configure_vllm_root_logger() + +logger = init_logger(__name__) +logger.info("The config of vllm-mindspore logger has been updated successfully.") diff --git a/vllm_mindspore/model_executor/models/mf_models/deepseekv3_infer_save_ckpt.py b/vllm_mindspore/model_executor/models/mf_models/deepseekv3_infer_save_ckpt.py index 4b781a8c..81dd8ef3 100644 --- a/vllm_mindspore/model_executor/models/mf_models/deepseekv3_infer_save_ckpt.py +++ b/vllm_mindspore/model_executor/models/mf_models/deepseekv3_infer_save_ckpt.py @@ -17,11 +17,12 @@ import argparse import os from collections import OrderedDict +from vllm.logger import init_logger + import mindspore as ms from mindspore import dtype as msdtype from mindspore.communication.management import 
get_rank from mindformers.core.parallel_config import build_parallel_config -from mindformers.tools.logger import logger from mindformers import MindFormerConfig from mindformers import build_context from research.deepseek3.deepseekv3_infer_parallelism import DeepseekInferParallelism @@ -29,6 +30,8 @@ from research.deepseek3.deepseekv3_infer_parallelism import DeepseekInferParalle from research.deepseek3.deepseek3_config import DeepseekV3Config from research.deepseek3.deepseek3_model_infer import InferenceDeepseekV3ForCausalLM +logger = init_logger(__name__) + # for example # bash scripts/msrun_launcher.sh "python ./infer_save_ckpt_from_safetensor.py # --config /path/to/predict_deepseek_r1_671b.yaml diff --git a/vllm_mindspore/scripts.py b/vllm_mindspore/scripts.py index 3bbfbbc3..ef297418 100644 --- a/vllm_mindspore/scripts.py +++ b/vllm_mindspore/scripts.py @@ -19,6 +19,8 @@ import logging import os +# It's before the vllm import, so vllm.logger cannot be used here. +logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) diff --git a/vllm_mindspore/utils.py b/vllm_mindspore/utils.py index d32b525e..e0fc3b04 100644 --- a/vllm_mindspore/utils.py +++ b/vllm_mindspore/utils.py @@ -18,7 +18,6 @@ import contextlib import gc -import logging import os import sys from typing import ( @@ -39,6 +38,7 @@ if TYPE_CHECKING: else: Library = None +from vllm.logger import init_logger from vllm.utils import T, TORCH_DTYPE_TO_NUMPY_DTYPE, make_ndarray_with_pad import mindspore as ms @@ -50,7 +50,7 @@ from .scripts import env_setup MsKVCache = Tuple[ms.Tensor, ms.Tensor] -logger = logging.getLogger(__name__) +logger = init_logger(__name__) STR_DTYPE_TO_MS_DTYPE = { @@ -263,5 +263,5 @@ def convert_np_to_ms_dtype(value): # Replace the directly loaded module in vllm, such as 'from module import xxx' def update_modules(name, module): - logger.info(f"replace module {name} by {module}") + logger.debug(f"replace module {name} by {module}") sys.modules.update({name: module}) diff --git a/vllm_mindspore/v1/worker/gpu_model_runner.py b/vllm_mindspore/v1/worker/gpu_model_runner.py index 4928b56a..a21a2f73 100644 --- a/vllm_mindspore/v1/worker/gpu_model_runner.py +++ b/vllm_mindspore/v1/worker/gpu_model_runner.py @@ -14,7 +14,6 @@ from vllm_mindspore.utils import get_valid_dtype from vllm.v1.kv_cache_interface import FullAttentionSpec from vllm.v1.utils import bind_kv_cache from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalKwargs -from vllm.logger import logger from vllm.distributed.parallel_state import get_pp_group from vllm.utils import cdiv from vllm.logger import init_logger diff --git a/vllm_mindspore/worker/profile.py b/vllm_mindspore/worker/profile.py index 728362d2..9958ebcb 100644 --- a/vllm_mindspore/worker/profile.py +++ b/vllm_mindspore/worker/profile.py @@ -2,10 +2,14 @@ import os import sys import subprocess +from vllm.logger import init_logger + from mindspore import Profiler from mindspore.profiler import ProfilerLevel, ProfilerActivity, AicoreMetrics from mindspore.profiler.common.profiler_context import ProfilerContext +logger = init_logger(__name__) + PROFILE_ENV_NAME = "VLLM_TORCH_PROFILER_DIR" def shell_analyse(path): @@ -55,7 +59,7 @@ def wrapper_worker_init_device(fun): self = arg[0] profile_output_path = os.getenv(PROFILE_ENV_NAME, "") if profile_output_path: - print(f"Profiling enabled. Traces will be saved to: {profile_output_path}") + logger.info(f"Profiling enabled. 
Traces will be saved to: {profile_output_path}") self.profiler = AdapterProfiler(profile_output_path) else: self.profiler = None -- Gitee From fff8f42c1542916a9d80e88fae5fff2342dde853 Mon Sep 17 00:00:00 2001 From: yangminghai Date: Thu, 10 Apr 2025 15:48:09 +0800 Subject: [PATCH 17/55] add commit info --- setup.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/setup.py b/setup.py index 2f5080a2..b80c7879 100644 --- a/setup.py +++ b/setup.py @@ -163,6 +163,9 @@ class CustomBuildExt(build_ext): shutil.copy(src_so_path, dst_so_path) logger.info(f"Copied {so_name} to {dst_so_path}") + +write_commit_id() + package_data = { "": [ "*.so", -- Gitee From 212a703f74f315b471ae77ac2d300cceb7faeacc Mon Sep 17 00:00:00 2001 From: twc Date: Mon, 28 Apr 2025 16:00:20 +0800 Subject: [PATCH 18/55] adapt to mf ep branch and support ep weight split --- .../models/mf_models/deepseek_v3.py | 26 +- .../mf_models/deepseekv3_weight_processor.py | 657 ++++++++++-------- .../models/mf_models/mf_model_base.py | 21 +- .../mf_models/qwen2_weight_processor.py | 58 +- .../models/mf_models/weight_processor.py | 170 +++-- 5 files changed, 535 insertions(+), 397 deletions(-) diff --git a/vllm_mindspore/model_executor/models/mf_models/deepseek_v3.py b/vllm_mindspore/model_executor/models/mf_models/deepseek_v3.py index fa39e0fb..e2112526 100644 --- a/vllm_mindspore/model_executor/models/mf_models/deepseek_v3.py +++ b/vllm_mindspore/model_executor/models/mf_models/deepseek_v3.py @@ -57,6 +57,19 @@ from vllm_mindspore.model_executor.models.mf_models.deepseekv3_weight_processor logger = init_logger(__name__) +def set_runtime_kernel_launch_group(): + kernel_launch_group = {'thread_num': 2, 'kernel_group_num': 8} + env_kernel_launch_group = os.getenv("EXPERIMENTAL_KERNEL_LAUNCH_GROUP", None) + if env_kernel_launch_group is not None: + pairs = env_kernel_launch_group.split(',') + for pair in pairs: + key, val = pair.split(':') + kernel_launch_group[key] = val + thread_num = int(kernel_launch_group.get('thread_num', 2)) + kernel_group_num = int(kernel_launch_group.get('kernel_group_num', 8)) + ms.runtime.set_kernel_launch_group(thread_num=thread_num, kernel_group_num=kernel_group_num) + + def _get_padding_index(q_seq_len): dp_size = get_dp_group().world_size tp_size = get_tensor_model_parallel_world_size() @@ -85,18 +98,19 @@ def _get_padding_index(q_seq_len): if dp_rank == 0: attn_unpadding_idx = arange_data last_arange_index = arange_data[-1] - ffn_padding_idx= np.pad(attn_unpadding_idx, (0, padding_size - attn_unpadding_idx.shape[0]), - mode='constant', constant_values=0) + ffn_padding_idx = np.pad(attn_unpadding_idx, (0, padding_size - attn_unpadding_idx.shape[0]), + mode='constant', constant_values=0) else: attn_offset_idx = arange_data + padding_size * dp_rank attn_unpadding_idx = np.concatenate((attn_unpadding_idx, attn_offset_idx), axis=0) ffn_offset_idx = arange_data + last_arange_index + 1 last_arange_index = ffn_offset_idx[-1] - ffn_offset_idx_pad_zero = np.pad( + ffn_offset_idx_pad_zero = np.pad( ffn_offset_idx, (0, padding_size - ffn_offset_idx.shape[0]), mode='constant', constant_values=0) ffn_padding_idx = np.concatenate((ffn_padding_idx, ffn_offset_idx_pad_zero), axis=0) return ms.from_numpy(attn_padding_idx), ms.from_numpy(attn_unpadding_idx), ms.from_numpy(ffn_padding_idx), \ - ms.from_numpy(ffn_unpadding_idx) + ms.from_numpy(ffn_unpadding_idx) + class DeepseekV3ForCausalLM(MfModelBase): @@ -123,6 +137,7 @@ class DeepseekV3ForCausalLM(MfModelBase): compilation_config.static_forward_context[str(i)] = 
self.kv_caches[i] self.set_flags = False + set_runtime_kernel_launch_group() def _generate_model_config(self): self.mf_config.load_checkpoint = self.get_model_path() @@ -139,7 +154,8 @@ class DeepseekV3ForCausalLM(MfModelBase): network = DeepseekV3ForCausalLM_MF(self.mf_model_config) # quant - if hasattr(self.mf_model_config, "quantization_config") and hasattr(self.mf_model_config.quantization_config, "quant_method"): + if hasattr(self.mf_model_config, "quantization_config") and hasattr(self.mf_model_config.quantization_config, + "quant_method"): ptq = self.create_ptq(self.mf_model_config.quantization_config.quant_method, PTQMode.DEPLOY) if ptq is not None: ptq.apply(network) diff --git a/vllm_mindspore/model_executor/models/mf_models/deepseekv3_weight_processor.py b/vllm_mindspore/model_executor/models/mf_models/deepseekv3_weight_processor.py index 58a186b6..1c6a99d9 100644 --- a/vllm_mindspore/model_executor/models/mf_models/deepseekv3_weight_processor.py +++ b/vllm_mindspore/model_executor/models/mf_models/deepseekv3_weight_processor.py @@ -29,7 +29,6 @@ from mindformers.experimental.parallel_core.pynative.parallel_state import get_t from vllm_mindspore.model_executor.models.mf_models.weight_processor import BaseWeightProcessor, EPMethod from vllm.logger import init_logger - logger = init_logger(__name__) @@ -63,13 +62,20 @@ class DeepseekV3WeightProcessor(BaseWeightProcessor): super().__init__(config, network, is_quant) self.num_layers = self.config.model.model_config.num_layers self.expert_num = self.config.moe_config.expert_num - self.moe_tensor_parallel = self.config.moe_config.moe_tensor_parallel - self.moe_expert_parallel = self.config.moe_config.moe_expert_parallel - self.ep_method = EPMethod.DEFAULT - if self.dp_group_size > 1 and self.moe_expert_parallel == self.global_group_size: - self.ep_method = EPMethod.ALLTOALL - elif self.dp_group_size > 1: - self.ep_method = EPMethod.ALLGATHER + self.moe_split_tp = self.moe_tp_size > 1 + self.moe_split_ep = self.moe_ep_size > 1 + logger.debug(f"Deepseekv3 weight split info:" + f"global_rank_id: {self.global_rank_id} \n" + f"tp_group_size: {self.tp_group_size} \n" + f"dp_group_size: {self.dp_group_size} \n" + f"tp_rank_id: {self.tp_rank_id} \n" + f"ep_method: {self.ep_method.name} \n" + f"num_router_experts: {self.num_router_experts} \n" + f"ep_group_nums: {self.ep_group_nums} \n" + f"moe_ep_rank_id: {self.moe_ep_rank_id} \n" + f"moe_tp_rank_id: {self.moe_tp_rank_id} \n" + f"moe_ep_size: {self.moe_ep_size} \n" + f"moe_tp_size: {self.moe_tp_size}") def quant_convert_weight_name(self, weight_name: str): """replace quant net weight name""" @@ -140,7 +146,7 @@ class DeepseekV3WeightProcessor(BaseWeightProcessor): weight[..., -qk_rope_head_dim:, :] = np.concatenate([w1, w2], axis=-2) return weight - def infer_quant_process_moe_with_tp(self, src_hf_dir, hf_weight_map, num_router_experts, layer_id): + def infer_quant_process_moe_with_ep(self, src_hf_dir, hf_weight_map, layer_id): w1_list = [] w2_list = [] w3_list = [] @@ -149,17 +155,14 @@ class DeepseekV3WeightProcessor(BaseWeightProcessor): w2_scale_list = [] w3_scale_list = [] - for index in range(0, num_router_experts): + for index in range(self.ep_start, self.ep_stop): w1_hf_name = f"model.layers.{layer_id}.mlp.experts.{index}.gate_proj.weight" w2_hf_name = f"model.layers.{layer_id}.mlp.experts.{index}.down_proj.weight" w3_hf_name = f"model.layers.{layer_id}.mlp.experts.{index}.up_proj.weight" - w1_ms_param, _ = self.get_moe_safetensor_from_file(w1_hf_name, src_hf_dir, hf_weight_map, - 
is_split_param=True, split_axis=0) - w2_ms_param, _ = self.get_moe_safetensor_from_file(w2_hf_name, src_hf_dir, hf_weight_map, - is_split_param=True, split_axis=1) - w3_ms_param, _ = self.get_moe_safetensor_from_file(w3_hf_name, src_hf_dir, hf_weight_map, - is_split_param=True, split_axis=0) + w1_ms_param, _ = self.get_safetensor_from_file(w1_hf_name, src_hf_dir, hf_weight_map) + w2_ms_param, _ = self.get_safetensor_from_file(w2_hf_name, src_hf_dir, hf_weight_map) + w3_ms_param, _ = self.get_safetensor_from_file(w3_hf_name, src_hf_dir, hf_weight_map) w1_list.append(w1_ms_param) w2_list.append(w2_ms_param) @@ -169,11 +172,9 @@ class DeepseekV3WeightProcessor(BaseWeightProcessor): w2_scale_hf_name = f"model.layers.{layer_id}.mlp.experts.{index}.down_proj.weight_scale" w3_scale_hf_name = f"model.layers.{layer_id}.mlp.experts.{index}.up_proj.weight_scale" - w1_scale_ms_param, _ = self.get_moe_safetensor_from_file(w1_scale_hf_name, src_hf_dir, hf_weight_map, - is_split_param=True, split_axis=0) - w2_scale_ms_param, _ = self.get_moe_safetensor_from_file(w2_scale_hf_name, src_hf_dir, hf_weight_map) - w3_scale_ms_param, _ = self.get_moe_safetensor_from_file(w3_scale_hf_name, src_hf_dir, hf_weight_map, - is_split_param=True, split_axis=0) + w1_scale_ms_param, _ = self.get_safetensor_from_file(w1_scale_hf_name, src_hf_dir, hf_weight_map) + w2_scale_ms_param, _ = self.get_safetensor_from_file(w2_scale_hf_name, src_hf_dir, hf_weight_map) + w3_scale_ms_param, _ = self.get_safetensor_from_file(w3_scale_hf_name, src_hf_dir, hf_weight_map) w1_scale_ms_param = w1_scale_ms_param.squeeze(axis=-1) w2_scale_ms_param = w2_scale_ms_param.squeeze(axis=-1) @@ -184,7 +185,7 @@ class DeepseekV3WeightProcessor(BaseWeightProcessor): return w1_list, w2_list, w3_list, w1_scale_list, w2_scale_list, w3_scale_list - def infer_quant_process_moe_with_ep(self, src_hf_dir, hf_weight_map, num_router_experts, layer_id): + def infer_quant_process_moe_with_ep_tp(self, src_hf_dir, hf_weight_map, layer_id): w1_list = [] w2_list = [] w3_list = [] @@ -193,16 +194,17 @@ class DeepseekV3WeightProcessor(BaseWeightProcessor): w2_scale_list = [] w3_scale_list = [] - ep_start = self.moe_ep_rank_id * self.ep_group_nums - ep_stop = (self.moe_ep_rank_id + 1) * self.ep_group_nums - for index in range(ep_start, ep_stop): + for index in range(self.ep_start, self.ep_stop): w1_hf_name = f"model.layers.{layer_id}.mlp.experts.{index}.gate_proj.weight" w2_hf_name = f"model.layers.{layer_id}.mlp.experts.{index}.down_proj.weight" w3_hf_name = f"model.layers.{layer_id}.mlp.experts.{index}.up_proj.weight" - w1_ms_param, _ = self.get_moe_safetensor_from_file(w1_hf_name, src_hf_dir, hf_weight_map) - w2_ms_param, _ = self.get_moe_safetensor_from_file(w2_hf_name, src_hf_dir, hf_weight_map) - w3_ms_param, _ = self.get_moe_safetensor_from_file(w3_hf_name, src_hf_dir, hf_weight_map) + w1_ms_param, _ = self.get_safetensor_from_file_split_moe_tp_group(w1_hf_name, src_hf_dir, hf_weight_map, + split_axis=0) + w2_ms_param, _ = self.get_safetensor_from_file_split_moe_tp_group(w2_hf_name, src_hf_dir, hf_weight_map, + split_axis=1) + w3_ms_param, _ = self.get_safetensor_from_file_split_moe_tp_group(w3_hf_name, src_hf_dir, hf_weight_map, + split_axis=0) w1_list.append(w1_ms_param) w2_list.append(w2_ms_param) @@ -212,9 +214,14 @@ class DeepseekV3WeightProcessor(BaseWeightProcessor): w2_scale_hf_name = f"model.layers.{layer_id}.mlp.experts.{index}.down_proj.weight_scale" w3_scale_hf_name = f"model.layers.{layer_id}.mlp.experts.{index}.up_proj.weight_scale" - 
w1_scale_ms_param, _ = self.get_moe_safetensor_from_file(w1_scale_hf_name, src_hf_dir, hf_weight_map) - w2_scale_ms_param, _ = self.get_moe_safetensor_from_file(w2_scale_hf_name, src_hf_dir, hf_weight_map) - w3_scale_ms_param, _ = self.get_moe_safetensor_from_file(w3_scale_hf_name, src_hf_dir, hf_weight_map) + w1_scale_ms_param, _ = self.get_safetensor_from_file_split_moe_tp_group(w1_scale_hf_name, src_hf_dir, + hf_weight_map, + split_axis=0) + w2_scale_ms_param, _ = self.get_safetensor_from_file(w2_scale_hf_name, src_hf_dir, + hf_weight_map) + w3_scale_ms_param, _ = self.get_safetensor_from_file_split_moe_tp_group(w3_scale_hf_name, src_hf_dir, + hf_weight_map, + split_axis=0) w1_scale_ms_param = w1_scale_ms_param.squeeze(axis=-1) w2_scale_ms_param = w2_scale_ms_param.squeeze(axis=-1) @@ -225,65 +232,15 @@ class DeepseekV3WeightProcessor(BaseWeightProcessor): return w1_list, w2_list, w3_list, w1_scale_list, w2_scale_list, w3_scale_list - def infer_quant_process_moe_with_ep_tp(self, src_hf_dir, hf_weight_map, num_router_experts, layer_id): - w1_list = [] - w2_list = [] - w3_list = [] - - w1_scale_list = [] - w2_scale_list = [] - w3_scale_list = [] - - ep_start = self.moe_ep_rank_id * self.ep_group_nums - ep_stop = (self.moe_ep_rank_id + 1) * self.ep_group_nums - - for index in range(ep_start, ep_stop): - w1_hf_name = f"model.layers.{layer_id}.mlp.experts.{index}.gate_proj.weight" - w2_hf_name = f"model.layers.{layer_id}.mlp.experts.{index}.down_proj.weight" - w3_hf_name = f"model.layers.{layer_id}.mlp.experts.{index}.up_proj.weight" - - w1_ms_param, _ = self.get_moe_safetensor_from_file(w1_hf_name, src_hf_dir, hf_weight_map, - is_split_param=True, split_axis=0) - w2_ms_param, _ = self.get_moe_safetensor_from_file(w2_hf_name, src_hf_dir, hf_weight_map, - is_split_param=True, split_axis=1) - w3_ms_param, _ = self.get_moe_safetensor_from_file(w3_hf_name, src_hf_dir, hf_weight_map, - is_split_param=True, split_axis=0) - - w1_list.append(w1_ms_param) - w2_list.append(w2_ms_param) - w3_list.append(w3_ms_param) - - w1_scale_hf_name = f"model.layers.{layer_id}.mlp.experts.{index}.gate_proj.weight_scale" - w2_scale_hf_name = f"model.layers.{layer_id}.mlp.experts.{index}.down_proj.weight_scale" - w3_scale_hf_name = f"model.layers.{layer_id}.mlp.experts.{index}.up_proj.weight_scale" - - w1_scale_ms_param, _ = self.get_moe_safetensor_from_file(w1_scale_hf_name, src_hf_dir, hf_weight_map, - is_split_param=True, split_axis=0) - w2_scale_ms_param, _ = self.get_moe_safetensor_from_file(w2_scale_hf_name, src_hf_dir, hf_weight_map) - w3_scale_ms_param, _ = self.get_moe_safetensor_from_file(w3_scale_hf_name, src_hf_dir, hf_weight_map, - is_split_param=True, split_axis=0) - - w1_scale_ms_param = w1_scale_ms_param.squeeze(axis=-1) - w2_scale_ms_param = w2_scale_ms_param.squeeze(axis=-1) - w3_scale_ms_param = w3_scale_ms_param.squeeze(axis=-1) - w1_scale_list.append(w1_scale_ms_param) - w2_scale_list.append(w2_scale_ms_param) - w3_scale_list.append(w3_scale_ms_param) - - return w1_list, w2_list, w3_list, w1_scale_list, w2_scale_list, w3_scale_list - - def infer_quant_process_moe(self, src_hf_dir, hf_weight_map, num_router_experts, layer_id): - if self.moe_expert_parallel > 1 and self.moe_tensor_parallel > 1: - return self.infer_quant_process_moe_with_ep_tp(src_hf_dir, hf_weight_map, num_router_experts, layer_id) - elif self.moe_tensor_parallel > 1: - return self.infer_quant_process_moe_with_tp(src_hf_dir, hf_weight_map, num_router_experts, layer_id) + def infer_quant_process_moe(self, src_hf_dir, hf_weight_map, 
layer_id): + if self.moe_tp_size > 1: + return self.infer_quant_process_moe_with_ep_tp(src_hf_dir, hf_weight_map, layer_id) else: - return self.infer_quant_process_moe_with_ep(src_hf_dir, hf_weight_map, num_router_experts, layer_id) + return self.infer_quant_process_moe_with_ep(src_hf_dir, hf_weight_map, layer_id) def infer_quant_process_moe_routed_expert_ffn_weight(self, src_hf_dir, layer_id, hf_weight_map): """process moe router expert weight""" ffn_concat = self.config.model.model_config.ffn_concat - num_router_experts = self.config.moe_config.expert_num # router expert dense router_dense_hf_name = f"model.layers.{layer_id}.mlp.gate.weight" @@ -311,7 +268,7 @@ class DeepseekV3WeightProcessor(BaseWeightProcessor): w3_scale_ms_name = f"model.layers.{layer_id}.feed_forward.routed_experts.ffn.w3._layer.matmul.weight_scale" w1_list, w2_list, w3_list, w1_scale_list, w2_scale_list, w3_scale_list = \ - self.infer_quant_process_moe(src_hf_dir, hf_weight_map, num_router_experts, layer_id) + self.infer_quant_process_moe(src_hf_dir, hf_weight_map, layer_id) w1_ms_stack_param = np.stack(w1_list, axis=0) w2_ms_stack_param = np.stack(w2_list, axis=0) @@ -368,54 +325,57 @@ class DeepseekV3WeightProcessor(BaseWeightProcessor): name=w2_scale_ms_name, requires_grad=False) - def get_moe_shared_expert_split_info(self): - split_num = -1 - rank_id = -1 - if self.ep_method == EPMethod.ALLGATHER: - split_num = self.global_group_size - rank_id = get_rank() + def get_quant_moe_shared_expert_weight(self, w1_hf_name, w2_hf_name, w3_hf_name, w1_scale_hf_name, w2_scale_hf_name, + w3_scale_hf_name, src_hf_dir, hf_weight_map): + if self.ep_method in [EPMethod.DEFAULT, EPMethod.ALLGATHER]: + w1_ms_param, _ = self.get_safetensor_from_file_split_global_group(w1_hf_name, src_hf_dir, hf_weight_map, + split_axis=0) + w2_ms_param, _ = self.get_safetensor_from_file_split_global_group(w2_hf_name, src_hf_dir, hf_weight_map, + split_axis=1) + w3_ms_param, _ = self.get_safetensor_from_file_split_global_group(w3_hf_name, src_hf_dir, hf_weight_map, + split_axis=0) + w1_scale_ms_param, _ = self.get_safetensor_from_file_split_global_group(w1_scale_hf_name, src_hf_dir, + hf_weight_map, split_axis=0) + w2_scale_ms_param, _ = self.get_safetensor_from_file(w2_scale_hf_name, src_hf_dir, hf_weight_map) + + w3_scale_ms_param, _ = self.get_safetensor_from_file_split_global_group(w3_scale_hf_name, src_hf_dir, + hf_weight_map, split_axis=0) elif self.ep_method == EPMethod.ALLTOALL: - split_num = 1 - rank_id = 0 - return split_num, rank_id + w1_ms_param, _ = self.get_safetensor_from_file(w1_hf_name, src_hf_dir, hf_weight_map) + w2_ms_param, _ = self.get_safetensor_from_file(w2_hf_name, src_hf_dir, hf_weight_map) + w3_ms_param, _ = self.get_safetensor_from_file(w3_hf_name, src_hf_dir, hf_weight_map) + + w1_scale_ms_param, _ = self.get_safetensor_from_file(w1_scale_hf_name, src_hf_dir, hf_weight_map) + w2_scale_ms_param, _ = self.get_safetensor_from_file(w2_scale_hf_name, src_hf_dir, hf_weight_map) + w3_scale_ms_param, _ = self.get_safetensor_from_file(w3_scale_hf_name, src_hf_dir, hf_weight_map) + else: + raise ValueError("Unsupported ep_method:{}".format(self.ep_method)) + + return w1_ms_param, w2_ms_param, w3_ms_param, w1_scale_ms_param, w2_scale_ms_param, w3_scale_ms_param def infer_quant_process_moe_shared_expert_ffn_weight(self, src_hf_dir, layer_id, hf_weight_map): """infer quant process moe shared expert ffn weight""" - split_num, rank_id = self.get_moe_shared_expert_split_info() - ffn_concat = self.config.model.model_config.ffn_concat 
w1_hf_name = f"model.layers.{layer_id}.mlp.shared_experts.gate_proj.weight" - w1_ms_name = self.quant_convert_weight_name(w1_hf_name) - w1_ms_param, _ = self.get_safetensor_from_file(w1_hf_name, src_hf_dir, hf_weight_map, is_split_param=True, - split_axis=0, split_num=split_num, rank_id=rank_id) - - w1_scale_hf_name = f"model.layers.{layer_id}.mlp.shared_experts.gate_proj.weight_scale" - w1_scale_ms_name = self.quant_convert_weight_name(w1_scale_hf_name) - w1_scale_ms_param, _ = self.get_safetensor_from_file(w1_scale_hf_name, src_hf_dir, hf_weight_map, - is_split_param=True, split_axis=0, split_num=split_num, - rank_id=rank_id) - w2_hf_name = f"model.layers.{layer_id}.mlp.shared_experts.down_proj.weight" - w2_ms_name = self.quant_convert_weight_name(w2_hf_name) - w2_ms_param, _ = self.get_safetensor_from_file(w2_hf_name, src_hf_dir, hf_weight_map, - is_split_param=True, split_axis=1, split_num=split_num, - rank_id=rank_id) + w3_hf_name = f"model.layers.{layer_id}.mlp.shared_experts.up_proj.weight" + w1_scale_hf_name = f"model.layers.{layer_id}.mlp.shared_experts.gate_proj.weight_scale" w2_scale_hf_name = f"model.layers.{layer_id}.mlp.shared_experts.down_proj.weight_scale" - w2_scale_ms_name = self.quant_convert_weight_name(w2_scale_hf_name) - w2_scale_ms_param, _ = self.get_safetensor_from_file(w2_scale_hf_name, src_hf_dir, hf_weight_map) + w3_scale_hf_name = f"model.layers.{layer_id}.mlp.shared_experts.up_proj.weight_scale" - w3_hf_name = f"model.layers.{layer_id}.mlp.shared_experts.up_proj.weight" + w1_ms_name = self.quant_convert_weight_name(w1_hf_name) + w2_ms_name = self.quant_convert_weight_name(w2_hf_name) w3_ms_name = self.quant_convert_weight_name(w3_hf_name) - w3_ms_param, _ = self.get_safetensor_from_file(w3_hf_name, src_hf_dir, hf_weight_map, - is_split_param=True, split_axis=0, split_num=split_num, - rank_id=rank_id) - w3_scale_hf_name = f"model.layers.{layer_id}.mlp.shared_experts.up_proj.weight_scale" + w1_scale_ms_name = self.quant_convert_weight_name(w1_scale_hf_name) + w2_scale_ms_name = self.quant_convert_weight_name(w2_scale_hf_name) w3_scale_ms_name = self.quant_convert_weight_name(w3_scale_hf_name) - w3_scale_ms_param, _ = self.get_safetensor_from_file(w3_scale_hf_name, src_hf_dir, hf_weight_map, - is_split_param=True, split_axis=0, split_num=split_num, - rank_id=rank_id) + + w1_ms_param, w2_ms_param, w3_ms_param, w1_scale_ms_param, w2_scale_ms_param, w3_scale_ms_param = \ + self.get_quant_moe_shared_expert_weight(w1_hf_name, w2_hf_name, w3_hf_name, w1_scale_hf_name, + w2_scale_hf_name, + w3_scale_hf_name, src_hf_dir, hf_weight_map) w1_scale_ms_param = w1_scale_ms_param.squeeze(axis=-1) w2_scale_ms_param = w2_scale_ms_param.squeeze(axis=-1) @@ -468,20 +428,17 @@ class DeepseekV3WeightProcessor(BaseWeightProcessor): ffn_concat = self.config.model.model_config.ffn_concat w1_hf_name = f"model.layers.{layer_id}.mlp.gate_proj.weight" w1_ms_name = self.quant_convert_weight_name(w1_hf_name) - w1_ms_param, _ = self.get_safetensor_from_file(w1_hf_name, src_hf_dir, hf_weight_map, - is_split_param=True, - split_axis=0) + w1_ms_param, _ = self.get_safetensor_from_file_split_tp_group(w1_hf_name, src_hf_dir, hf_weight_map, + split_axis=0) w1_scale_hf_name = f"model.layers.{layer_id}.mlp.gate_proj.weight_scale" w1_scale_ms_name = self.quant_convert_weight_name(w1_scale_hf_name) - w1_scale_ms_param, _ = self.get_safetensor_from_file(w1_scale_hf_name, src_hf_dir, hf_weight_map, - is_split_param=True, - split_axis=0) + w1_scale_ms_param, _ = 
self.get_safetensor_from_file_split_tp_group(w1_scale_hf_name, src_hf_dir, hf_weight_map, + split_axis=0) w2_hf_name = f"model.layers.{layer_id}.mlp.down_proj.weight" w2_ms_name = self.quant_convert_weight_name(w2_hf_name) - w2_ms_param, _ = self.get_safetensor_from_file(w2_hf_name, src_hf_dir, hf_weight_map, - is_split_param=True, - split_axis=1) + w2_ms_param, _ = self.get_safetensor_from_file_split_tp_group(w2_hf_name, src_hf_dir, hf_weight_map, + split_axis=1) w2_scale_hf_name = f"model.layers.{layer_id}.mlp.down_proj.weight_scale" w2_scale_ms_name = self.quant_convert_weight_name(w2_scale_hf_name) # shape:[7168,1] @@ -489,14 +446,12 @@ class DeepseekV3WeightProcessor(BaseWeightProcessor): w3_hf_name = f"model.layers.{layer_id}.mlp.up_proj.weight" w3_ms_name = self.quant_convert_weight_name(w3_hf_name) - w3_ms_param, _ = self.get_safetensor_from_file(w3_hf_name, src_hf_dir, hf_weight_map, - is_split_param=True, - split_axis=0) + w3_ms_param, _ = self.get_safetensor_from_file_split_tp_group(w3_hf_name, src_hf_dir, hf_weight_map, + split_axis=0) w3_scale_hf_name = f"model.layers.{layer_id}.mlp.up_proj.weight_scale" w3_scale_ms_name = self.quant_convert_weight_name(w3_scale_hf_name) - w3_scale_ms_param, _ = self.get_safetensor_from_file(w3_scale_hf_name, src_hf_dir, hf_weight_map, - is_split_param=True, - split_axis=0) + w3_scale_ms_param, _ = self.get_safetensor_from_file_split_tp_group(w3_scale_hf_name, src_hf_dir, hf_weight_map, + split_axis=0) w1_scale_ms_param = w1_scale_ms_param.squeeze(axis=-1) w2_scale_ms_param = w2_scale_ms_param.squeeze(axis=-1) @@ -561,8 +516,8 @@ class DeepseekV3WeightProcessor(BaseWeightProcessor): lm_head_hf_name = "lm_head.weight" lm_head_ms_name = self.quant_convert_weight_name(lm_head_hf_name) if not self.config.parallel_config.vocab_emb_dp: - np_data, _ = self.get_safetensor_from_file(lm_head_hf_name, src_hf_dir, hf_weight_map, - is_split_param=True, split_axis=0) + np_data, _ = self.get_safetensor_from_file_split_tp_group(lm_head_hf_name, src_hf_dir, hf_weight_map, + split_axis=0) else: np_data, _ = self.get_safetensor_from_file(lm_head_hf_name, src_hf_dir, hf_weight_map) self.parameter_dict[lm_head_ms_name] = ms.Parameter(ms.from_numpy(np_data).astype(ms.bfloat16), @@ -595,7 +550,7 @@ class DeepseekV3WeightProcessor(BaseWeightProcessor): quant_bias_hf_name = f"model.layers.{layer_id}.self_attn." + name + ".quant_bias" quant_bias_ms_name = self.quant_convert_weight_name(quant_bias_hf_name) quant_bias_ms_param, _ = self.get_safetensor_from_file(quant_bias_hf_name, src_hf_dir, hf_weight_map) - if name == "o_proj" and get_tensor_model_parallel_rank() != 0: + if name == "o_proj" and self.tp_rank_id != 0: quant_bias_ms_param.fill(0) dequant_scale_hf_name = f"model.layers.{layer_id}.self_attn." 
+ name + ".deq_scale" @@ -831,7 +786,6 @@ class DeepseekV3WeightProcessor(BaseWeightProcessor): def infer_process_moe_routed_expert_ffn_weight(self, src_hf_dir, layer_id, hf_weight_map): """process moe router expert weight""" ffn_concat = self.config.model.model_config.ffn_concat - num_router_experts = self.config.moe_config.expert_num # router expert dense router_dense_hf_name = f"model.layers.{layer_id}.mlp.gate.weight" @@ -861,18 +815,18 @@ class DeepseekV3WeightProcessor(BaseWeightProcessor): w3_ms_name = f"model.layers.{layer_id}.feed_forward.routed_experts.ffn.w3.weight" w3_ms_name = w3_ms_name if layer_id < self.num_layers else self.convert_mtp_weight_name(w3_ms_name) - for index in range(0, num_router_experts): + for index in range(0, self.num_router_experts): w1_hf_name = f"model.layers.{layer_id}.mlp.experts.{index}.gate_proj.weight" - w1_ms_param, _ = self.get_safetensor_from_file(w1_hf_name, src_hf_dir, hf_weight_map, - is_split_param=True, split_axis=0) + w1_ms_param, _ = self.get_safetensor_from_file_split_tp_group(w1_hf_name, src_hf_dir, hf_weight_map, + split_axis=0) w2_hf_name = f"model.layers.{layer_id}.mlp.experts.{index}.down_proj.weight" - w2_ms_param, _ = self.get_safetensor_from_file(w2_hf_name, src_hf_dir, hf_weight_map, - is_split_param=True, split_axis=1) + w2_ms_param, _ = self.get_safetensor_from_file_split_tp_group(w2_hf_name, src_hf_dir, hf_weight_map, + split_axis=1) w3_hf_name = f"model.layers.{layer_id}.mlp.experts.{index}.up_proj.weight" - w3_ms_param, _ = self.get_safetensor_from_file(w3_hf_name, src_hf_dir, hf_weight_map, - is_split_param=True, split_axis=0) + w3_ms_param, _ = self.get_safetensor_from_file_split_tp_group(w3_hf_name, src_hf_dir, hf_weight_map, + split_axis=0) w1_list.append(w1_ms_param) w2_list.append(w2_ms_param) @@ -907,28 +861,37 @@ class DeepseekV3WeightProcessor(BaseWeightProcessor): name=w2_ms_name, requires_grad=False) + def get_moe_shared_expert_weight(self, w1_hf_name, w2_hf_name, w3_hf_name, src_hf_dir, hf_weight_map): + if self.ep_method == EPMethod.ALLGATHER: + w1_ms_param, _ = self.get_safetensor_from_file_split_global_group(w1_hf_name, src_hf_dir, hf_weight_map, + split_axis=0) + w2_ms_param, _ = self.get_safetensor_from_file_split_global_group(w2_hf_name, src_hf_dir, hf_weight_map, + split_axis=1) + w3_ms_param, _ = self.get_safetensor_from_file_split_global_group(w3_hf_name, src_hf_dir, hf_weight_map, + split_axis=0) + elif self.ep_method == EPMethod.ALLTOALL: + w1_ms_param, _ = self.get_safetensor_from_file(w1_hf_name, src_hf_dir, hf_weight_map) + w2_ms_param, _ = self.get_safetensor_from_file(w2_hf_name, src_hf_dir, hf_weight_map) + w3_ms_param, _ = self.get_safetensor_from_file(w3_hf_name, src_hf_dir, hf_weight_map) + + else: + raise ValueError("Unsupported ep_method:{}".format(self.ep_method)) + + return w1_ms_param, w2_ms_param, w3_ms_param + def infer_process_moe_shared_expert_ffn_weight(self, src_hf_dir, layer_id, hf_weight_map): """infer process moe shared expert ffn weight""" - split_num, rank_id = self.get_moe_shared_expert_split_info() - ffn_concat = self.config.model.model_config.ffn_concat w1_hf_name = f"model.layers.{layer_id}.mlp.shared_experts.gate_proj.weight" - w1_ms_name = self.convert_weight_name(w1_hf_name) - w1_ms_param, _ = self.get_safetensor_from_file(w1_hf_name, src_hf_dir, hf_weight_map, - is_split_param=True, split_axis=0, split_num=split_num, - rank_id=rank_id) - w2_hf_name = f"model.layers.{layer_id}.mlp.shared_experts.down_proj.weight" - w2_ms_name = self.convert_weight_name(w2_hf_name) - 
w2_ms_param, _ = self.get_safetensor_from_file(w2_hf_name, src_hf_dir, hf_weight_map, - is_split_param=True, split_axis=1, split_num=split_num, - rank_id=rank_id) - w3_hf_name = f"model.layers.{layer_id}.mlp.shared_experts.up_proj.weight" + + w1_ms_name = self.convert_weight_name(w1_hf_name) + w2_ms_name = self.convert_weight_name(w2_hf_name) w3_ms_name = self.convert_weight_name(w3_hf_name) - w3_ms_param, _ = self.get_safetensor_from_file(w3_hf_name, src_hf_dir, hf_weight_map, - is_split_param=True, split_axis=0, split_num=split_num, - rank_id=rank_id) + + w1_ms_param, w2_ms_param, w3_ms_param = self.get_moe_shared_expert_weight(w1_hf_name, w2_hf_name, w3_hf_name, + src_hf_dir, hf_weight_map) if ffn_concat: w_gate_hidden_name = f"model.layers.{layer_id}.feed_forward.shared_experts.w_gate_hidden.weight" @@ -957,18 +920,18 @@ class DeepseekV3WeightProcessor(BaseWeightProcessor): w1_hf_name = f"model.layers.{layer_id}.mlp.gate_proj.weight" w1_ms_name = self.convert_weight_name(w1_hf_name) - w1_ms_param, _ = self.get_safetensor_from_file(w1_hf_name, src_hf_dir, hf_weight_map, is_split_param=True, - split_axis=0) + w1_ms_param, _ = self.get_safetensor_from_file_split_tp_group(w1_hf_name, src_hf_dir, hf_weight_map, + split_axis=0) w2_hf_name = f"model.layers.{layer_id}.mlp.down_proj.weight" w2_ms_name = self.convert_weight_name(w2_hf_name) - w2_ms_param, _ = self.get_safetensor_from_file(w2_hf_name, src_hf_dir, hf_weight_map, is_split_param=True, - split_axis=1) + w2_ms_param, _ = self.get_safetensor_from_file_split_tp_group(w2_hf_name, src_hf_dir, hf_weight_map, + split_axis=1) w3_hf_name = f"model.layers.{layer_id}.mlp.up_proj.weight" w3_ms_name = self.convert_weight_name(w3_hf_name) - w3_ms_param, _ = self.get_safetensor_from_file(w3_hf_name, src_hf_dir, hf_weight_map, is_split_param=True, - split_axis=0) + w3_ms_param, _ = self.get_safetensor_from_file_split_tp_group(w3_hf_name, src_hf_dir, hf_weight_map, + split_axis=0) if ffn_concat: w_gate_hidden_name = f"model.layers.{layer_id}.feed_forward.w_gate_hidden.weight" @@ -1118,8 +1081,8 @@ class DeepseekV3WeightProcessor(BaseWeightProcessor): hf_name = f"model.layers.{layer_id}.{prefix_name}" ms_name = self.convert_weight_name(hf_name) if prefix_name in head_names and not self.config.parallel_config.vocab_emb_dp: - ms_param, _ = self.get_safetensor_from_file(hf_name, src_hf_dir, hf_weight_map, - is_split_param=True, split_axis=0) + ms_param, _ = self.get_safetensor_from_file_split_tp_group(hf_name, src_hf_dir, hf_weight_map, + split_axis=0) else: ms_param, _ = self.get_safetensor_from_file(hf_name, src_hf_dir, hf_weight_map) parameter_dict[ms_name] = ms.Parameter(ms.Tensor(ms_param, ms.bfloat16), @@ -1145,45 +1108,49 @@ class DeepseekV3WeightProcessor(BaseWeightProcessor): def smooth_quant_process_route_ffn_weight(self, src_hf_dir, layer_id, hf_weight_map, parameter_dict, layer_type): """smooth_quant_process_route_ffn_weight""" + ffn_concat = self.config.model.model_config.ffn_concat w1_weight_name = f"model.layers.{layer_id}.{layer_type}.w1._layer.weight" - w1_weight_param, _ = self.get_safetensor_from_file(w1_weight_name, src_hf_dir, hf_weight_map, - is_split_param=True, - split_axis=2) - w1_bias_name = f"model.layers.{layer_id}.{layer_type}.w1._layer.matmul.quant_bias" - w1_bias_param, _ = self.get_safetensor_from_file(w1_bias_name, src_hf_dir, hf_weight_map, - is_split_param=True, - split_axis=1) w1_scale_name = f"model.layers.{layer_id}.{layer_type}.w1._layer.matmul.dequant_scale" - w1_scale_param, _ = 
self.get_safetensor_from_file(w1_scale_name, src_hf_dir, hf_weight_map, - is_split_param=True, - split_axis=1) - w1_quant_zp = f"model.layers.{layer_id}.{layer_type}.w1.quant_op.input_zp" w1_quant_scale = f"model.layers.{layer_id}.{layer_type}.w1.quant_op.input_scale" + w3_weight_name = f"model.layers.{layer_id}.{layer_type}.w3._layer.weight" + w3_bias_name = f"model.layers.{layer_id}.{layer_type}.w3._layer.matmul.quant_bias" + w3_scale_name = f"model.layers.{layer_id}.{layer_type}.w3._layer.matmul.dequant_scale" + w3_quant_zp = f"model.layers.{layer_id}.{layer_type}.w3.quant_op.input_zp" + w3_quant_scale = f"model.layers.{layer_id}.{layer_type}.w3.quant_op.input_scale" + w2_weight_name = f"model.layers.{layer_id}.{layer_type}.w2._layer.weight" + w2_scale_name = f"model.layers.{layer_id}.{layer_type}.w2._layer.matmul.weight_scale" + w1_weight_param, _ = self.get_routed_safetensor_3_dim(w1_weight_name, src_hf_dir, hf_weight_map, tp_axis=2, + split_ep=self.moe_split_ep, split_tp=self.moe_split_tp) + + w1_bias_param, _ = self.get_routed_safetensor_2_dim(w1_bias_name, src_hf_dir, hf_weight_map, tp_axis=1, + split_ep=self.moe_split_ep, split_tp=self.moe_split_tp) + + w1_scale_param, _ = self.get_routed_safetensor_2_dim(w1_scale_name, src_hf_dir, hf_weight_map, tp_axis=1, + split_ep=self.moe_split_ep, split_tp=self.moe_split_tp) + w1_quant_zp_param, _ = self.get_safetensor_from_file(w1_quant_zp, src_hf_dir, hf_weight_map) w1_quant_scale_param, _ = self.get_safetensor_from_file(w1_quant_scale, src_hf_dir, hf_weight_map) - w3_weight_name = f"model.layers.{layer_id}.{layer_type}.w3._layer.weight" - w3_weight_param, _ = self.get_safetensor_from_file(w3_weight_name, src_hf_dir, hf_weight_map, - is_split_param=True, - split_axis=2) + w3_weight_param, _ = self.get_routed_safetensor_3_dim(w3_weight_name, src_hf_dir, hf_weight_map, tp_axis=2, + split_ep=self.moe_split_ep, split_tp=self.moe_split_tp) - w3_bias_name = f"model.layers.{layer_id}.{layer_type}.w3._layer.matmul.quant_bias" - w3_bias_param, _ = self.get_safetensor_from_file(w3_bias_name, src_hf_dir, hf_weight_map, - is_split_param=True, - split_axis=1) + w3_bias_param, _ = self.get_routed_safetensor_2_dim(w3_bias_name, src_hf_dir, hf_weight_map, tp_axis=1, + split_ep=self.moe_split_ep, split_tp=self.moe_split_tp) - w3_scale_name = f"model.layers.{layer_id}.{layer_type}.w3._layer.matmul.dequant_scale" - w3_scale_param, _ = self.get_safetensor_from_file(w3_scale_name, src_hf_dir, hf_weight_map, - is_split_param=True, - split_axis=1) + w3_scale_param, _ = self.get_routed_safetensor_2_dim(w3_scale_name, src_hf_dir, hf_weight_map, tp_axis=1, + split_ep=self.moe_split_ep, split_tp=self.moe_split_tp) - w3_quant_zp = f"model.layers.{layer_id}.{layer_type}.w3.quant_op.input_zp" - w3_quant_scale = f"model.layers.{layer_id}.{layer_type}.w3.quant_op.input_scale" w3_quant_zp_param, _ = self.get_safetensor_from_file(w3_quant_zp, src_hf_dir, hf_weight_map) w3_quant_scale_param, _ = self.get_safetensor_from_file(w3_quant_scale, src_hf_dir, hf_weight_map) + + w2_weight_param, _ = self.get_routed_safetensor_3_dim(w2_weight_name, src_hf_dir, hf_weight_map, tp_axis=1, + split_ep=self.moe_split_ep, split_tp=self.moe_split_tp) + w2_scale_param, _ = self.get_routed_safetensor_2_dim(w2_scale_name, src_hf_dir, hf_weight_map, + split_ep=self.moe_split_ep, split_tp=False) + if ffn_concat: concat_weight_name = f"model.layers.{layer_id}.{layer_type}.w_gate_hidden._layer.weight" concat_weight_param = ms.Tensor(np.concatenate([w1_weight_param, w3_weight_param], axis=2), 
dtype=ms.int8) @@ -1237,22 +1204,157 @@ class DeepseekV3WeightProcessor(BaseWeightProcessor): parameter_dict[w3_quant_scale] = ms.Parameter(ms.Tensor(w3_quant_scale_param, ms.bfloat16), name=w3_quant_scale, requires_grad=False) + parameter_dict[w2_weight_name] = ms.Parameter(ms.Tensor(w2_weight_param, ms.int8), name=w2_weight_name, + requires_grad=False) + parameter_dict[w2_scale_name] = ms.Parameter(ms.Tensor(w2_scale_param, ms.bfloat16), + name=w2_scale_name, requires_grad=False) + + def get_smooth_quant_moe_shared_expert_weight(self, w1_weight_name, w1_bias_name, w1_scale_name, w3_weight_name, + w3_bias_name, w3_scale_name, w2_weight_name, src_hf_dir, + hf_weight_map): + + if self.ep_method == EPMethod.ALLGATHER: + w1_weight_param, _ = self.get_safetensor_from_file_split_moe_tp_group(w1_weight_name, src_hf_dir, + hf_weight_map, + split_axis=0) + w1_bias_param, _ = self.get_safetensor_from_file_split_moe_tp_group(w1_bias_name, src_hf_dir, hf_weight_map, + split_axis=0) + + w1_scale_param, _ = self.get_safetensor_from_file_split_moe_tp_group(w1_scale_name, src_hf_dir, + hf_weight_map, + split_axis=0) + + w3_weight_param, _ = self.get_safetensor_from_file_split_moe_tp_group(w3_weight_name, src_hf_dir, + hf_weight_map, + split_axis=0) + w3_bias_param, _ = self.get_safetensor_from_file_split_moe_tp_group(w3_bias_name, src_hf_dir, hf_weight_map, + split_axis=0) + w3_scale_param, _ = self.get_safetensor_from_file_split_moe_tp_group(w3_scale_name, src_hf_dir, + hf_weight_map, + split_axis=0) + + w2_weight_param, _ = self.get_safetensor_from_file_split_moe_tp_group(w2_weight_name, src_hf_dir, + hf_weight_map, + split_axis=1) + elif self.ep_method == EPMethod.ALLTOALL: + w1_weight_param, _ = self.get_safetensor_from_file(w1_weight_name, src_hf_dir, hf_weight_map) + w1_bias_param, _ = self.get_safetensor_from_file(w1_bias_name, src_hf_dir, hf_weight_map) + w1_scale_param, _ = self.get_safetensor_from_file(w1_scale_name, src_hf_dir, hf_weight_map) + + w3_weight_param, _ = self.get_safetensor_from_file(w3_weight_name, src_hf_dir, hf_weight_map) + w3_bias_param, _ = self.get_safetensor_from_file(w3_bias_name, src_hf_dir, hf_weight_map) + w3_scale_param, _ = self.get_safetensor_from_file(w3_scale_name, src_hf_dir, hf_weight_map) + + w2_weight_param, _ = self.get_safetensor_from_file(w2_weight_name, src_hf_dir, hf_weight_map) + else: + raise ValueError("Unsupported ep_method:{}".format(self.ep_method)) + + return w1_weight_param, w1_bias_param, w1_scale_param, w3_weight_param, w3_bias_param, w3_scale_param, w2_weight_param + + def smooth_quant_process_shared_ffn_weight(self, src_hf_dir, layer_id, hf_weight_map, parameter_dict, layer_type): + """smooth_quant_process_shared_ffn_weight""" + + ffn_concat = self.config.model.model_config.ffn_concat + w1_weight_name = f"model.layers.{layer_id}.{layer_type}.w1._layer.weight" + w2_weight_name = f"model.layers.{layer_id}.{layer_type}.w2._layer.weight" + w3_weight_name = f"model.layers.{layer_id}.{layer_type}.w3._layer.weight" + + w1_bias_name = f"model.layers.{layer_id}.{layer_type}.w1._layer.matmul.quant_bias" + w3_bias_name = f"model.layers.{layer_id}.{layer_type}.w3._layer.matmul.quant_bias" + + w1_scale_name = f"model.layers.{layer_id}.{layer_type}.w1._layer.matmul.dequant_scale" + w2_scale_name = f"model.layers.{layer_id}.{layer_type}.w2._layer.matmul.weight_scale" + w3_scale_name = f"model.layers.{layer_id}.{layer_type}.w3._layer.matmul.dequant_scale" + + w1_quant_zp = f"model.layers.{layer_id}.{layer_type}.w1.quant_op.input_zp" + w3_quant_zp = 
f"model.layers.{layer_id}.{layer_type}.w3.quant_op.input_zp" + + w1_quant_scale = f"model.layers.{layer_id}.{layer_type}.w1.quant_op.input_scale" + w3_quant_scale = f"model.layers.{layer_id}.{layer_type}.w3.quant_op.input_scale" + + w1_weight_param, w1_bias_param, w1_scale_param, w3_weight_param, w3_bias_param, w3_scale_param, w2_weight_param = \ + self.get_smooth_quant_moe_shared_expert_weight(w1_weight_name, w1_bias_name, w1_scale_name, w3_weight_name, + w3_bias_name, w3_scale_name, w2_weight_name, src_hf_dir, + hf_weight_map) + + w1_quant_zp_param, _ = self.get_safetensor_from_file(w1_quant_zp, src_hf_dir, hf_weight_map) + w1_quant_scale_param, _ = self.get_safetensor_from_file(w1_quant_scale, src_hf_dir, hf_weight_map) + + w2_scale_param, _ = self.get_safetensor_from_file(w2_scale_name, src_hf_dir, hf_weight_map) + + w3_quant_zp_param, _ = self.get_safetensor_from_file(w3_quant_zp, src_hf_dir, hf_weight_map) + w3_quant_scale_param, _ = self.get_safetensor_from_file(w3_quant_scale, src_hf_dir, hf_weight_map) + if ffn_concat: + concat_weight_name = f"model.layers.{layer_id}.{layer_type}.w_gate_hidden._layer.weight" + concat_weight_param = ms.Tensor(np.concatenate([w1_weight_param, w3_weight_param], axis=0), dtype=ms.int8) + parameter_dict[concat_weight_name] = ms.Parameter(concat_weight_param, name=concat_weight_name, + requires_grad=False) + + concat_bias_name = f"model.layers.{layer_id}.{layer_type}.w_gate_hidden._layer.matmul.quant_bias" + concat_bias_param = ms.Tensor(np.concatenate([w1_bias_param, w3_bias_param], axis=0), dtype=ms.int32) + parameter_dict[concat_bias_name] = ms.Parameter(concat_bias_param, name=concat_bias_name, + requires_grad=False) + + concat_scale_name = f"model.layers.{layer_id}.{layer_type}.w_gate_hidden._layer.matmul.dequant_scale" + concat_scale_param = ms.Tensor(np.concatenate([w1_scale_param, w3_scale_param], axis=0), dtype=ms.float32) + parameter_dict[concat_scale_name] = ms.Parameter(concat_scale_param, name=concat_scale_name, + requires_grad=False) + + concat_quant_zp_name = f"model.layers.{layer_id}.{layer_type}.w_gate_hidden.quant_op.input_zp" + concat_quant_zp_param = ms.Tensor(w1_quant_zp_param, dtype=ms.int8) + parameter_dict[concat_quant_zp_name] = ms.Parameter(concat_quant_zp_param, name=concat_quant_zp_name, + requires_grad=False) + + concat_quant_scale_name = f"model.layers.{layer_id}.{layer_type}.w_gate_hidden.quant_op.input_scale" + concat_quant_scale_param = ms.Tensor(w1_quant_scale_param, dtype=ms.bfloat16) + parameter_dict[concat_quant_scale_name] = ms.Parameter(concat_quant_scale_param, + name=concat_quant_scale_name, + requires_grad=False) + else: + # w1 w3 + parameter_dict[w1_weight_name] = ms.Parameter(ms.Tensor(w1_weight_param, ms.int8), name=w1_weight_name, + requires_grad=False) + parameter_dict[w3_weight_name] = ms.Parameter(ms.Tensor(w3_weight_param, ms.int8), name=w3_weight_name, + requires_grad=False) + + parameter_dict[w1_bias_name] = ms.Parameter(ms.Tensor(w1_bias_param, ms.int32), + name=w1_bias_name, requires_grad=False) + parameter_dict[w3_bias_name] = ms.Parameter(ms.Tensor(w3_bias_param, ms.int32), + name=w3_bias_name, requires_grad=False) + + parameter_dict[w1_scale_name] = ms.Parameter(ms.Tensor(w1_scale_param, ms.float32), + name=w1_scale_name, requires_grad=False) + parameter_dict[w3_scale_name] = ms.Parameter(ms.Tensor(w3_scale_param, ms.float32), + name=w3_scale_name, requires_grad=False) + + parameter_dict[w1_quant_zp] = ms.Parameter(ms.Tensor(w1_quant_zp_param, ms.int8), + name=w1_quant_zp, requires_grad=False) + 
parameter_dict[w3_quant_zp] = ms.Parameter(ms.Tensor(w3_quant_zp_param, ms.int8), + name=w3_quant_zp, requires_grad=False) + + parameter_dict[w1_quant_scale] = ms.Parameter(ms.Tensor(w1_quant_scale_param, ms.bfloat16), + name=w1_quant_scale, requires_grad=False) + parameter_dict[w3_quant_scale] = ms.Parameter(ms.Tensor(w3_quant_scale_param, ms.bfloat16), + name=w3_quant_scale, requires_grad=False) + parameter_dict[w2_weight_name] = ms.Parameter(ms.Tensor(w2_weight_param, ms.int8), name=w2_weight_name, + requires_grad=False) + parameter_dict[w2_scale_name] = ms.Parameter(ms.Tensor(w2_scale_param, ms.bfloat16), + name=w2_scale_name, requires_grad=False) + def smooth_quant_process_ffn_weight(self, src_hf_dir, layer_id, hf_weight_map, parameter_dict, layer_type): """smooth_quant_process_ffn_weight""" ffn_concat = self.config.model.model_config.ffn_concat w1_weight_name = f"model.layers.{layer_id}.{layer_type}.w1._layer.weight" - w1_weight_param, _ = self.get_safetensor_from_file(w1_weight_name, src_hf_dir, hf_weight_map, - is_split_param=True, - split_axis=0) + + w1_weight_param, _ = self.get_safetensor_from_file_split_tp_group(w1_weight_name, src_hf_dir, hf_weight_map, + split_axis=0) w1_bias_name = f"model.layers.{layer_id}.{layer_type}.w1._layer.matmul.quant_bias" - w1_bias_param, _ = self.get_safetensor_from_file(w1_bias_name, src_hf_dir, hf_weight_map, - is_split_param=True, - split_axis=0) + w1_bias_param, _ = self.get_safetensor_from_file_split_tp_group(w1_bias_name, src_hf_dir, hf_weight_map, + split_axis=0) w1_scale_name = f"model.layers.{layer_id}.{layer_type}.w1._layer.matmul.dequant_scale" - w1_scale_param, _ = self.get_safetensor_from_file(w1_scale_name, src_hf_dir, hf_weight_map, - is_split_param=True, - split_axis=0) + w1_scale_param, _ = self.get_safetensor_from_file_split_tp_group(w1_scale_name, src_hf_dir, hf_weight_map, + split_axis=0) w1_quant_zp = f"model.layers.{layer_id}.{layer_type}.w1.quant_op.input_zp" w1_quant_scale = f"model.layers.{layer_id}.{layer_type}.w1.quant_op.input_scale" @@ -1260,17 +1362,20 @@ class DeepseekV3WeightProcessor(BaseWeightProcessor): w1_quant_scale_param, _ = self.get_safetensor_from_file(w1_quant_scale, src_hf_dir, hf_weight_map) w3_weight_name = f"model.layers.{layer_id}.{layer_type}.w3._layer.weight" - w3_weight_param, _ = self.get_safetensor_from_file(w3_weight_name, src_hf_dir, hf_weight_map, - is_split_param=True, - split_axis=0) + + w3_weight_param, _ = self.get_safetensor_from_file_split_tp_group(w3_weight_name, src_hf_dir, hf_weight_map, + split_axis=0) w3_bias_name = f"model.layers.{layer_id}.{layer_type}.w3._layer.matmul.quant_bias" - w3_bias_param, _ = self.get_safetensor_from_file(w3_bias_name, src_hf_dir, hf_weight_map, - is_split_param=True, - split_axis=0) + w3_bias_param, _ = self.get_safetensor_from_file_split_tp_group(w3_bias_name, src_hf_dir, hf_weight_map, + split_axis=0) w3_scale_name = f"model.layers.{layer_id}.{layer_type}.w3._layer.matmul.dequant_scale" - w3_scale_param, _ = self.get_safetensor_from_file(w3_scale_name, src_hf_dir, hf_weight_map, - is_split_param=True, - split_axis=0) + w3_scale_param, _ = self.get_safetensor_from_file_split_tp_group(w3_scale_name, src_hf_dir, hf_weight_map, + split_axis=0) + w2_weight_name = f"model.layers.{layer_id}.{layer_type}.w2._layer.weight" + w2_scale_name = f"model.layers.{layer_id}.{layer_type}.w2._layer.matmul.weight_scale" + w2_weight_param, _ = self.get_safetensor_from_file_split_tp_group(w2_weight_name, src_hf_dir, hf_weight_map, + split_axis=1) + w2_scale_param, _ = 
self.get_safetensor_from_file(w2_scale_name, src_hf_dir, hf_weight_map) w3_quant_zp = f"model.layers.{layer_id}.{layer_type}.w3.quant_op.input_zp" w3_quant_scale = f"model.layers.{layer_id}.{layer_type}.w3.quant_op.input_scale" @@ -1329,6 +1434,11 @@ class DeepseekV3WeightProcessor(BaseWeightProcessor): parameter_dict[w3_quant_scale] = ms.Parameter(ms.Tensor(w3_quant_scale_param, ms.bfloat16), name=w3_quant_scale, requires_grad=False) + parameter_dict[w2_weight_name] = ms.Parameter(ms.Tensor(w2_weight_param, ms.int8), name=w2_weight_name, + requires_grad=False) + parameter_dict[w2_scale_name] = ms.Parameter(ms.Tensor(w2_scale_param, ms.bfloat16), + name=w2_scale_name, requires_grad=False) + def smooth_quant_process_qkv_weight(self, src_hf_dir, layer_id, hf_weight_map, parameter_dict): '''smooth_quant_process_qkv_weight''' qkv_concat = self.config.model.model_config.qkv_concat @@ -1402,13 +1512,13 @@ class DeepseekV3WeightProcessor(BaseWeightProcessor): def infer_smooth_quant_row_linear_split(self, param_name, src_hf_dir, hf_weight_map): '''infer_smooth_quant_row_linear_split''' if param_name.endswith(".weight"): - value, _ = self.get_safetensor_from_file(param_name, src_hf_dir, - hf_weight_map, is_split_param=True, - split_axis=1) + value, _ = self.get_safetensor_from_file_split_tp_group(param_name, src_hf_dir, + hf_weight_map, + split_axis=1) elif "quant_op" in param_name: - value, _ = self.get_safetensor_from_file(param_name, src_hf_dir, - hf_weight_map, is_split_param=True, - split_axis=0) + value, _ = self.get_safetensor_from_file_split_tp_group(param_name, src_hf_dir, + hf_weight_map, + split_axis=0) else: value, _ = self.get_safetensor_from_file(param_name, src_hf_dir, hf_weight_map) @@ -1416,6 +1526,7 @@ class DeepseekV3WeightProcessor(BaseWeightProcessor): if any([name in param_name for name in quant_bias_set_zero]) and \ get_tensor_model_parallel_rank() != 0: value.fill(0) + return value def infer_smooth_quant_get_value(self, param_name, src_hf_dir, hf_weight_map, no_need_split_layer): @@ -1426,28 +1537,21 @@ class DeepseekV3WeightProcessor(BaseWeightProcessor): hf_weight_map) elif any([name in param_name for name in [".l2q_proj."]]): if param_name.endswith(".weight") or "matmul" in param_name: - value, _ = self.get_safetensor_from_file(param_name, src_hf_dir, - hf_weight_map, is_split_param=True, - split_axis=0) + value, _ = self.get_safetensor_from_file_split_tp_group(param_name, src_hf_dir, + hf_weight_map, + split_axis=0) else: value, _ = self.get_safetensor_from_file(param_name, src_hf_dir, hf_weight_map) - elif any([name in param_name for name in [".feed_forward.w2.", ".wo.", "shared_experts.w2"]]): + elif any([name in param_name for name in [".wo."]]): value = self.infer_smooth_quant_row_linear_split(param_name, src_hf_dir, hf_weight_map) - elif ".routed_experts.ffn.w2" in param_name: - if param_name.endswith(".weight"): - value, _ = self.get_safetensor_from_file(param_name, src_hf_dir, hf_weight_map, - is_split_param=True, split_axis=1) - else: - value, _ = self.get_safetensor_from_file(param_name, src_hf_dir, - hf_weight_map) elif any([name in param_name for name in ["lkv2kv_k_nope", "lkv2kv_v"]]): - value, _ = self.get_safetensor_from_file(param_name, src_hf_dir, hf_weight_map, - is_split_param=True, split_axis=0) + value, _ = self.get_safetensor_from_file_split_tp_group(param_name, src_hf_dir, hf_weight_map, + split_axis=0) elif "lm_head" in param_name: if not self.config.parallel_config.vocab_emb_dp: - value, _ = self.get_safetensor_from_file(param_name, src_hf_dir, 
hf_weight_map, - is_split_param=True, split_axis=0) + value, _ = self.get_safetensor_from_file_split_tp_group(param_name, src_hf_dir, hf_weight_map, + split_axis=0) else: value, _ = self.get_safetensor_from_file(param_name, src_hf_dir, hf_weight_map) else: @@ -1465,84 +1569,23 @@ class DeepseekV3WeightProcessor(BaseWeightProcessor): if layer_id >= 3: self.smooth_quant_process_route_ffn_weight(src_hf_dir, layer_id, hf_weight_map, parameter_dict, "feed_forward.routed_experts.ffn") - self.smooth_quant_process_ffn_weight(src_hf_dir, layer_id, hf_weight_map, parameter_dict, - "feed_forward.shared_experts") + self.smooth_quant_process_shared_ffn_weight(src_hf_dir, layer_id, hf_weight_map, parameter_dict, + "feed_forward.shared_experts") else: self.smooth_quant_process_ffn_weight(src_hf_dir, layer_id, hf_weight_map, parameter_dict, "feed_forward") self.smooth_quant_process_qkv_weight(src_hf_dir, layer_id, hf_weight_map, parameter_dict) - skip_layer = ["feed_forward.routed_experts.ffn.w1", "feed_forward.shared_experts.w1", "feed_forward.w1", - "feed_forward.routed_experts.ffn.w3", "feed_forward.shared_experts.w3", "feed_forward.w3", - "feed_forward.routed_experts.ffn.w_gate_hidden", "feed_forward.shared_experts.w_gate_hidden", - "feed_forward.w_gate_hidden", "attention.kv2l", "attention.q2l_proj", "attention.qkv2l"] + skip_layer = ["feed_forward.routed_experts.ffn", "feed_forward.shared_experts", "feed_forward.w", + "attention.kv2l", "attention.q"] for param_name, _ in tqdm(hf_weight_map.items(), desc="remaining params load"): if "model.layers" in param_name and int(param_name.split('.')[2]) >= num_layers: continue - if any([name in param_name for name in no_need_split_layer]): - value, _ = self.get_safetensor_from_file(param_name, src_hf_dir, - hf_weight_map) - elif any([name in param_name for name in [".l2q_proj.", ".feed_forward.w_gate_hidden.", - "shared_experts.w_gate_hidden"]]): - if param_name.endswith(".weight") or "matmul" in param_name: - value, _ = self.get_safetensor_from_file(param_name, src_hf_dir, - hf_weight_map, is_split_param=True, - split_axis=0) - else: - value, _ = self.get_safetensor_from_file(param_name, src_hf_dir, - hf_weight_map) - elif any([name in param_name for name in [".feed_forward.w2.", ".wo.", "shared_experts.w2"]]): - if param_name.endswith(".weight"): - value, _ = self.get_safetensor_from_file(param_name, src_hf_dir, - hf_weight_map, is_split_param=True, - split_axis=1) - elif "quant_op" in param_name: - value, _ = self.get_safetensor_from_file(param_name, src_hf_dir, - hf_weight_map, is_split_param=True, - split_axis=0) - else: - value, _ = self.get_safetensor_from_file(param_name, src_hf_dir, - hf_weight_map) - elif ".routed_experts.ffn.w_gate_hidden." 
in param_name: - if param_name.endswith(".weight"): - value, _ = self.get_safetensor_from_file(param_name, src_hf_dir, hf_weight_map) - value_list = [] - for experts_id in range(value.shape[0]): - value_list.append(self.split_weight_by_rank(value[experts_id, :, :], split_axis=1)) - value = np.stack(value_list, axis=0) - elif "matmul" in param_name: - value, _ = self.get_safetensor_from_file(param_name, src_hf_dir, hf_weight_map) - value_list = [] - for experts_id in range(value.shape[0]): - value_list.append(self.split_weight_by_rank(value[experts_id, :], split_axis=0)) - value = np.stack(value_list, axis=0) - else: - value, _ = self.get_safetensor_from_file(param_name, src_hf_dir, - hf_weight_map) - elif ".routed_experts.ffn.w2" in param_name: - if param_name.endswith(".weight"): - value, _ = self.get_safetensor_from_file(param_name, src_hf_dir, hf_weight_map) - value_list = [] - for experts_id in range(value.shape[0]): - value_list.append(self.split_weight_by_rank(value[experts_id, :, :], split_axis=0)) - value = np.stack(value_list, axis=0) - else: - value, _ = self.get_safetensor_from_file(param_name, src_hf_dir, - hf_weight_map) - elif any([name in param_name for name in ["lkv2kv_k_nope", "lkv2kv_v"]]): - value, _ = self.get_safetensor_from_file(param_name, src_hf_dir, hf_weight_map, - is_split_param=True, split_axis=0) - elif "lm_head" in param_name: - if not self.config.parallel_config.vocab_emb_dp: - value, _ = self.get_safetensor_from_file(param_name, src_hf_dir, hf_weight_map, - is_split_param=True, split_axis=0) - else: - value, _ = self.get_safetensor_from_file(param_name, src_hf_dir, hf_weight_map) - else: - raise ValueError(f"not found layer {param_name}, please check safetensors file.") + if any([name in param_name for name in skip_layer]): + continue value = self.infer_smooth_quant_get_value(param_name, src_hf_dir, hf_weight_map, no_need_split_layer) dst_dtype = convert_np_to_ms_dtype(value) @@ -1551,8 +1594,8 @@ class DeepseekV3WeightProcessor(BaseWeightProcessor): name=param_name, requires_grad=False) param_not_load, ckpt_not_load = ms.load_param_into_net(self.network, parameter_dict) - logger.info("smoothquant param_not_load: %s" % str(param_not_load)) - logger.info("smoothquant ckpt_not_load: %s" % str(ckpt_not_load)) + logger.info(f"smoothquant param_not_load:{param_not_load}") + logger.info(f"smoothquant ckpt_not_load:{ckpt_not_load}") def infer_gptq_quant_net_ms_convert_layer_weight(self, src_hf_dir, num_layers, hf_weight_map): """infer_gptq_quant_net_ms_convert_layer_weight""" @@ -1595,12 +1638,12 @@ class DeepseekV3WeightProcessor(BaseWeightProcessor): value_list.append(self.split_weight_by_rank(value[experts_id, :, :], split_axis=0)) value = np.stack(value_list, axis=0) elif any([name in param_name for name in ["lkv2kv_k_nope", "lkv2kv_v"]]): - value, is_int4 = self.get_safetensor_from_file(param_name, src_hf_dir, hf_weight_map, - is_split_param=True, split_axis=0) + value, is_int4 = self.get_safetensor_from_file_split_tp_group(param_name, src_hf_dir, hf_weight_map, + split_axis=0) elif "lm_head" in param_name: if not self.config.parallel_config.vocab_emb_dp: - value, is_int4 = self.get_safetensor_from_file(param_name, src_hf_dir, hf_weight_map, - is_split_param=True, split_axis=0) + value, is_int4 = self.get_safetensor_from_file_split_tp_group(param_name, src_hf_dir, hf_weight_map, + split_axis=0) else: value, is_int4 = self.get_safetensor_from_file(param_name, src_hf_dir, hf_weight_map) else: diff --git 
a/vllm_mindspore/model_executor/models/mf_models/mf_model_base.py b/vllm_mindspore/model_executor/models/mf_models/mf_model_base.py index 79974f6f..533d16c7 100644 --- a/vllm_mindspore/model_executor/models/mf_models/mf_model_base.py +++ b/vllm_mindspore/model_executor/models/mf_models/mf_model_base.py @@ -46,17 +46,6 @@ from vllm_mindspore.v1.attention.backends.flash_attn import FlashAttentionMetada logger = init_logger(__name__) - -def _pad_to_max(x, max_len): - return x + [-1] * (max_len - len(x)) - - -def _batch_seq(input_tokens, prefill): - if prefill: - return ms.ops.expand_dims(input_tokens, 0).to(ms.int32) - - return ms.mint.reshape(input_tokens, (-1, 1)).to(ms.int32) - class MfModelBase(MsModelBase): def __init__(self, *, vllm_config: VllmConfig, prefix: str = "") -> None: super(MfModelBase, self).__init__( @@ -74,8 +63,10 @@ class MfModelBase(MsModelBase): ) self.mf_config.model.model_config.parallel_config.pipeline_stage = 1 self._generate_model_config() + self.casual_mask = LowerTriangularMask(mf_model_config=self.mf_model_config) self.network, self.lm_head = self._create_network() + affinity_config = self.mf_config.get('context', {}).get('affinity_cpu_list', {}) if isinstance(affinity_config, dict): ms.runtime.set_cpu_affinity(True, affinity_config) @@ -151,8 +142,8 @@ class MfModelBase(MsModelBase): attention_mask = self.casual_mask.gen_attention_mask(is_prefill, position_ids, query_lens) model_inputs = {} - model_inputs["input_ids"] = _batch_seq(input_ids, is_prefill) - model_inputs["batch_valid_length"] = ms.Tensor.from_numpy(np.expand_dims(seq_lens_np, 0)) + model_inputs["input_ids"] = input_ids.astype(ms.int32) + model_inputs["batch_valid_length"] = ms.from_numpy(seq_lens_np) model_inputs["block_tables"] = attn_metadata.block_tables model_inputs["slot_mapping"] = attn_metadata.slot_mapping model_inputs["position_ids"] = position_ids @@ -170,8 +161,8 @@ class MfModelBase(MsModelBase): attention_mask = self.casual_mask.gen_attention_mask(is_prefill, positions, query_lens_np) model_inputs = {} - model_inputs["input_ids"] = _batch_seq(input_ids, is_prefill) - model_inputs["batch_valid_length"] = ms.Tensor(np.expand_dims(attn_metadata.seq_lens_np, 0)) + model_inputs["input_ids"] = input_ids.astype(ms.int32) + model_inputs["batch_valid_length"] = ms.from_numpy(attn_metadata.seq_lens_np) model_inputs["block_tables"] = attn_metadata.block_tables model_inputs["slot_mapping"] = attn_metadata.slot_mapping model_inputs["position_ids"] = positions.to(ms.int32) diff --git a/vllm_mindspore/model_executor/models/mf_models/qwen2_weight_processor.py b/vllm_mindspore/model_executor/models/mf_models/qwen2_weight_processor.py index 59423eca..99b59a72 100644 --- a/vllm_mindspore/model_executor/models/mf_models/qwen2_weight_processor.py +++ b/vllm_mindspore/model_executor/models/mf_models/qwen2_weight_processor.py @@ -25,8 +25,10 @@ from safetensors import safe_open import mindspore as ms from mindspore.communication.management import get_rank -from vllm_mindspore.model_executor.models.mf_models.weight_processor import BaseWeightProcessor +from vllm_mindspore.model_executor.models.mf_models.weight_processor import BaseWeightProcessor, EPMethod +from vllm.logger import init_logger +logger = init_logger(__name__) class Qwen2WeightProcessor(BaseWeightProcessor): r""" @@ -47,8 +49,8 @@ class Qwen2WeightProcessor(BaseWeightProcessor): if self.config.parallel_config.vocab_emb_dp: np_data, _ = self.get_safetensor_from_file(embed_tokens_hf_name, src_hf_dir, hf_weight_map) else: - np_data, _ = 
self.get_safetensor_from_file(embed_tokens_hf_name, src_hf_dir, hf_weight_map, - is_split_param=True, split_axis=0) + np_data, _ = self.get_safetensor_from_file_split_tp_group(embed_tokens_hf_name, src_hf_dir, hf_weight_map, + split_axis=0) self.parameter_dict[embed_tokens_ms_name] = ms.Parameter(ms.from_numpy(np_data).astype(ms.bfloat16), name=embed_tokens_ms_name, requires_grad=False) @@ -64,8 +66,8 @@ class Qwen2WeightProcessor(BaseWeightProcessor): lm_head_ms_name = self.convert_weight_name(lm_head_hf_name) if not self.config.model.model_config.tie_word_embeddings: if not self.config.parallel_config.vocab_emb_dp: - np_data, _ = self.get_safetensor_from_file(lm_head_hf_name, src_hf_dir, hf_weight_map, - is_split_param=True, split_axis=0) + np_data, _ = self.get_safetensor_from_file_split_tp_group(lm_head_hf_name, src_hf_dir, hf_weight_map, + split_axis=0) else: np_data, _ = self.get_safetensor_from_file(lm_head_hf_name, src_hf_dir, hf_weight_map) self.parameter_dict[lm_head_ms_name] = ms.Parameter(ms.from_numpy(np_data).astype(ms.bfloat16), @@ -94,18 +96,18 @@ class Qwen2WeightProcessor(BaseWeightProcessor): ffn_concat = self.config.model.model_config.qkv_concat w1_hf_name = f"model.layers.{layer_id}.mlp.gate_proj.weight" w1_ms_name = self.convert_weight_name(w1_hf_name) - w1_ms_param, _ = self.get_safetensor_from_file(w1_hf_name, src_hf_dir, hf_weight_map, is_split_param=True, - split_axis=0) + w1_ms_param, _ = self.get_safetensor_from_file_split_tp_group(w1_hf_name, src_hf_dir, hf_weight_map, + split_axis=0) w2_hf_name = f"model.layers.{layer_id}.mlp.down_proj.weight" w2_ms_name = self.convert_weight_name(w2_hf_name) - w2_ms_param, _ = self.get_safetensor_from_file(w2_hf_name, src_hf_dir, hf_weight_map, is_split_param=True, - split_axis=1) + w2_ms_param, _ = self.get_safetensor_from_file_split_tp_group(w2_hf_name, src_hf_dir, hf_weight_map, + split_axis=1) w3_hf_name = f"model.layers.{layer_id}.mlp.up_proj.weight" w3_ms_name = self.convert_weight_name(w3_hf_name) - w3_ms_param, _ = self.get_safetensor_from_file(w3_hf_name, src_hf_dir, hf_weight_map, is_split_param=True, - split_axis=0) + w3_ms_param, _ = self.get_safetensor_from_file_split_tp_group(w3_hf_name, src_hf_dir, hf_weight_map, + split_axis=0) if ffn_concat: w_gate_hidden_name = f"model.layers.{layer_id}.feed_forward.w_gate_hidden.weight" @@ -130,38 +132,35 @@ class Qwen2WeightProcessor(BaseWeightProcessor): # wq wq_hf_name = f"model.layers.{layer_id}.self_attn.q_proj.weight" wq_ms_name = self.convert_weight_name(wq_hf_name) - wq_ms_param, _ = self.get_safetensor_from_file(wq_hf_name, src_hf_dir, hf_weight_map, is_split_param=True, - split_axis=0) + wq_ms_param, _ = self.get_safetensor_from_file_split_tp_group(wq_hf_name, src_hf_dir, hf_weight_map, + split_axis=0) # wq bias wq_bias_hf_name = f"model.layers.{layer_id}.self_attn.q_proj.bias" wq_bias_ms_name = self.convert_weight_name(wq_bias_hf_name) - wq_bias_ms_param, _ = self.get_safetensor_from_file(wq_bias_hf_name, src_hf_dir, hf_weight_map, - is_split_param=True, - split_axis=0) + wq_bias_ms_param, _ = self.get_safetensor_from_file_split_tp_group(wq_bias_hf_name, src_hf_dir, hf_weight_map, + split_axis=0) # wk wk_hf_name = f"model.layers.{layer_id}.self_attn.k_proj.weight" wk_ms_name = self.convert_weight_name(wk_hf_name) - wk_ms_param, _ = self.get_safetensor_from_file(wk_hf_name, src_hf_dir, hf_weight_map, is_split_param=True, - split_axis=0) + wk_ms_param, _ = self.get_safetensor_from_file_split_tp_group(wk_hf_name, src_hf_dir, hf_weight_map, + split_axis=0) # wk bias 
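+        # Like the q/k/v weights above, the biases are column-parallel and are
+        # sliced along axis 0 across the tensor-parallel group; the output
+        # projection (wo) further below is row-parallel and is sliced along
+        # axis 1. For example, with tp_group_size=4 a hypothetical [4096, 4096]
+        # q_proj weight leaves each rank with a [1024, 4096] shard.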
wk_bias_hf_name = f"model.layers.{layer_id}.self_attn.k_proj.bias" wk_bias_ms_name = self.convert_weight_name(wk_bias_hf_name) - wk_bias_ms_param, _ = self.get_safetensor_from_file(wk_bias_hf_name, src_hf_dir, hf_weight_map, - is_split_param=True, - split_axis=0) + wk_bias_ms_param, _ = self.get_safetensor_from_file_split_tp_group(wk_bias_hf_name, src_hf_dir, hf_weight_map, + split_axis=0) # wv wv_hf_name = f"model.layers.{layer_id}.self_attn.v_proj.weight" wv_ms_name = self.convert_weight_name(wv_hf_name) - wv_ms_param, _ = self.get_safetensor_from_file(wv_hf_name, src_hf_dir, hf_weight_map, is_split_param=True, - split_axis=0) + wv_ms_param, _ = self.get_safetensor_from_file_split_tp_group(wv_hf_name, src_hf_dir, hf_weight_map, + split_axis=0) # wv bias wv_bias_hf_name = f"model.layers.{layer_id}.self_attn.v_proj.bias" wv_bias_ms_name = self.convert_weight_name(wv_bias_hf_name) - wv_bias_ms_param, _ = self.get_safetensor_from_file(wv_bias_hf_name, src_hf_dir, hf_weight_map, - is_split_param=True, - split_axis=0) + wv_bias_ms_param, _ = self.get_safetensor_from_file_split_tp_group(wv_bias_hf_name, src_hf_dir, hf_weight_map, + split_axis=0) if qkv_concat: w_qkv_name = f"model.layers.{layer_id}.attention.w_qkv.weight" @@ -201,8 +200,8 @@ class Qwen2WeightProcessor(BaseWeightProcessor): # wo wo_hf_name = f"model.layers.{layer_id}.self_attn.o_proj.weight" wo_ms_name = self.convert_weight_name(wo_hf_name) - wo_ms_param, _ = self.get_safetensor_from_file(wo_hf_name, src_hf_dir, hf_weight_map, is_split_param=True, - split_axis=1) + wo_ms_param, _ = self.get_safetensor_from_file_split_tp_group(wo_hf_name, src_hf_dir, hf_weight_map, + split_axis=1) self.parameter_dict[wo_ms_name] = ms.Parameter(ms.from_numpy(wo_ms_param).astype(ms.bfloat16), name=wo_ms_name, requires_grad=False) @@ -262,6 +261,7 @@ class Qwen2WeightProcessor(BaseWeightProcessor): for layer_id in tqdm(range(num_layers), desc="Weight loading", disable=not enable_tqdm): self.infer_convert_layer_weight(src_hf_dir, layer_id, hf_weight_map) - ms.load_param_into_net(self.network, self.parameter_dict) + param_not_load, ckpt_not_load = ms.load_param_into_net(self.network, self.parameter_dict) + logger.info("param_not_load: %s, ckpt_not_load: %s" % (str(param_not_load), str(ckpt_not_load))) del self.parameter_dict gc.collect() diff --git a/vllm_mindspore/model_executor/models/mf_models/weight_processor.py b/vllm_mindspore/model_executor/models/mf_models/weight_processor.py index 696367ec..d96be356 100644 --- a/vllm_mindspore/model_executor/models/mf_models/weight_processor.py +++ b/vllm_mindspore/model_executor/models/mf_models/weight_processor.py @@ -21,9 +21,10 @@ import os from enum import Enum from safetensors import safe_open from mindspore.communication.management import get_rank, get_group_size -from mindformers.experimental.infer.core.utils import get_tp_world_size +from mindformers.experimental.infer.core.utils import get_tp_world_size, get_moe_tp_world_size, get_moe_ep_world_size from mindformers.experimental.parallel_core.pynative.parallel_state import get_data_parallel_world_size + class EPMethod(Enum): """ EP method enums @@ -49,26 +50,21 @@ class BaseWeightProcessor: self.global_group_size = get_group_size() self.tp_group_size = get_tp_world_size() self.dp_group_size = get_data_parallel_world_size() - self.moe_ep_size = self.config.moe_config.moe_expert_parallel - self.moe_tp_size = self.config.moe_config.moe_tensor_parallel + self.num_router_experts = self.config.moe_config.expert_num if hasattr(self.config.moe_config, 
"expert_num") else 1 + self.moe_ep_size = self.config.moe_config.moe_expert_parallel if hasattr(self.config.moe_config, "moe_expert_parallel") else 1 + self.moe_tp_size = self.config.moe_config.moe_tensor_parallel if hasattr(self.config.moe_config, "moe_tensor_parallel") else 1 self.ep_method = EPMethod.DEFAULT + if self.dp_group_size > 1 and self.moe_ep_size == self.global_group_size: + self.ep_method = EPMethod.ALLTOALL + elif self.dp_group_size > 1: + self.ep_method = EPMethod.ALLGATHER self.tp_rank_id = self.global_rank_id % self.tp_group_size - num_router_experts = self.config.moe_config.expert_num - self.ep_group_nums = num_router_experts // self.moe_ep_size + self.ep_group_nums = self.num_router_experts // self.moe_ep_size self.moe_ep_rank_id = self.global_rank_id // self.moe_tp_size self.moe_tp_rank_id = self.global_rank_id % self.moe_tp_size - - print(f"global_rank_id: {self.global_rank_id} \n" - f"tp_group_size: {self.tp_group_size} \n" - f"dp_group_size: {self.dp_group_size} \n" - f"tp_rank_id: {self.tp_rank_id} \n" - f"num_router_experts: {num_router_experts} \n" - f"ep_group_nums: {self.ep_group_nums} \n" - f"moe_ep_rank_id: {self.moe_ep_rank_id} \n" - f"moe_tp_rank_id: {self.moe_tp_rank_id} \n" - f"moe_ep_size: {self.moe_ep_size} \n" - f"moe_tp_size: {self.moe_tp_size}", flush=True) + self.ep_start = self.moe_ep_rank_id * self.ep_group_nums + self.ep_stop = (self.moe_ep_rank_id + 1) * self.ep_group_nums self.parameter_dict = {} self.file_handles = {} @@ -82,16 +78,82 @@ class BaseWeightProcessor: def release_file_handles(self): del self.file_handles - def get_moe_safetensor_from_file(self, hf_param_name, src_hf_dir, hf_weight_map, is_split_param=False, split_axis=0): + def get_safetensor_from_file(self, hf_param_name, src_hf_dir, hf_weight_map): + safetensor_file = hf_weight_map[hf_param_name] + filename = os.path.join(src_hf_dir, safetensor_file) + sf_file = self.get_file_handles(filename) + qint4 = False + if sf_file.metadata() is not None and hf_param_name in sf_file.metadata().keys(): + qint4 = True + + np_data = sf_file.get_tensor(hf_param_name) + return np_data, qint4 + + def get_safetensor_from_file_split_tp_group(self, hf_param_name, src_hf_dir, hf_weight_map, split_axis=0): + safetensor_file = hf_weight_map[hf_param_name] + filename = os.path.join(src_hf_dir, safetensor_file) + sf_file = self.get_file_handles(filename) + qint4 = False + if sf_file.metadata() is not None and hf_param_name in sf_file.metadata().keys(): + qint4 = True + + np_data = sf_file.get_slice(hf_param_name) + shape = np_data.get_shape() + if split_axis == 0: + split_size = shape[0] // self.tp_group_size + start = self.tp_rank_id * split_size + stop = (self.tp_rank_id + 1) * split_size + split_data = np_data[start:stop] + elif split_axis == 1: + split_size = shape[1] // self.tp_group_size + start = self.tp_rank_id * split_size + stop = (self.tp_rank_id + 1) * split_size + split_data = np_data[:, start:stop] + elif split_axis == 2: + split_size = shape[2] // self.tp_group_size + start = self.tp_rank_id * split_size + stop = (self.tp_rank_id + 1) * split_size + split_data = np_data[:, :, start:stop] + else: + raise ValueError("split_axis:{} is not supported.".format(split_axis)) + return split_data, qint4 + + def get_safetensor_from_file_split_global_group(self, hf_param_name, src_hf_dir, hf_weight_map, split_axis=0): + safetensor_file = hf_weight_map[hf_param_name] + filename = os.path.join(src_hf_dir, safetensor_file) + sf_file = self.get_file_handles(filename) + qint4 = False + if 
sf_file.metadata() is not None and hf_param_name in sf_file.metadata().keys(): + qint4 = True + + np_data = sf_file.get_slice(hf_param_name) + shape = np_data.get_shape() + if split_axis == 0: + split_size = shape[0] // self.global_group_size + start = self.global_rank_id * split_size + stop = (self.global_rank_id + 1) * split_size + split_data = np_data[start:stop] + elif split_axis == 1: + split_size = shape[1] // self.global_group_size + start = self.global_rank_id * split_size + stop = (self.global_rank_id + 1) * split_size + split_data = np_data[:, start:stop] + elif split_axis == 2: + split_size = shape[2] // self.global_group_size + start = self.global_rank_id * split_size + stop = (self.global_rank_id + 1) * split_size + split_data = np_data[:, :, start:stop] + else: + raise ValueError("split_axis:{} is not supported.".format(split_axis)) + return split_data, qint4 + + def get_safetensor_from_file_split_moe_tp_group(self, hf_param_name, src_hf_dir, hf_weight_map, split_axis=0): safetensor_file = hf_weight_map[hf_param_name] filename = os.path.join(src_hf_dir, safetensor_file) sf_file = self.get_file_handles(filename) qint4 = False if sf_file.metadata() is not None and hf_param_name in sf_file.metadata().keys(): qint4 = True - if not is_split_param or self.moe_tp_size == 1: - np_data = sf_file.get_tensor(hf_param_name) - return np_data, qint4 np_data = sf_file.get_slice(hf_param_name) shape = np_data.get_shape() @@ -109,39 +171,65 @@ class BaseWeightProcessor: raise ValueError("split_axis:{} is not supported.".format(split_axis)) return split_data, qint4 - def get_safetensor_from_file(self, hf_param_name, src_hf_dir, hf_weight_map, is_split_param=False, split_axis=0, - split_num=-1, rank_id=-1): - rank_id = rank_id if rank_id != -1 else self.tp_rank_id - split_num = split_num if split_num != -1 else self.tp_group_size + def get_routed_safetensor_3_dim(self, hf_param_name, src_hf_dir, hf_weight_map, split_ep=False, split_tp=False, + tp_axis=-1): + '''get_routed_safetensor_3_dim''' safetensor_file = hf_weight_map[hf_param_name] filename = os.path.join(src_hf_dir, safetensor_file) sf_file = self.get_file_handles(filename) qint4 = False if sf_file.metadata() is not None and hf_param_name in sf_file.metadata().keys(): qint4 = True - if not is_split_param or split_num == 1: + if not split_tp and not split_ep: np_data = sf_file.get_tensor(hf_param_name) return np_data, qint4 np_data = sf_file.get_slice(hf_param_name) + if not split_tp and split_ep: + split_data = np_data[self.ep_start:self.ep_stop, :, :] + return split_data, qint4 + shape = np_data.get_shape() - if split_axis == 0: - split_size = shape[0] // split_num - start = rank_id * split_size - stop = (rank_id + 1) * split_size - split_data = np_data[start:stop] - elif split_axis == 1: - split_size = shape[1] // split_num - start = rank_id * split_size - stop = (rank_id + 1) * split_size - split_data = np_data[:, start:stop] - elif split_axis == 2: - split_size = shape[2] // self.tp_group_size - start = self.rank_id * split_size - stop = (self.rank_id + 1) * split_size - split_data = np_data[:, :, start:stop] + if tp_axis == 1: + split_size = shape[1] // self.moe_tp_size + start = self.moe_tp_rank_id * split_size + stop = (self.moe_tp_rank_id + 1) * split_size + split_data = np_data[self.ep_start:self.ep_stop, start:stop, :] if split_ep else np_data[:, start:stop, :] + elif tp_axis == 2: + split_size = shape[2] // self.moe_tp_size + start = self.moe_tp_rank_id * split_size + stop = (self.moe_tp_rank_id + 1) * split_size + split_data 
= np_data[self.ep_start:self.ep_stop, :, start:stop] if split_ep else np_data[:, :, start:stop] else: - raise ValueError("split_axis:{} is not supported.".format(split_axis)) + raise ValueError("tp_axis:{} is not supported.".format(tp_axis)) + return split_data, qint4 + + def get_routed_safetensor_2_dim(self, hf_param_name, src_hf_dir, hf_weight_map, split_ep=False, split_tp=False, + tp_axis=-1): + '''get_moe_routed_safetensor_2_dim''' + safetensor_file = hf_weight_map[hf_param_name] + filename = os.path.join(src_hf_dir, safetensor_file) + sf_file = self.get_file_handles(filename) + qint4 = False + if sf_file.metadata() is not None and hf_param_name in sf_file.metadata().keys(): + qint4 = True + if not split_tp and not split_ep: + np_data = sf_file.get_tensor(hf_param_name) + return np_data, qint4 + + np_data = sf_file.get_slice(hf_param_name) + if not split_tp and split_ep: + split_data = np_data[self.ep_start:self.ep_stop, :] + return split_data, qint4 + + shape = np_data.get_shape() + if tp_axis == 1: + split_size = shape[1] // self.moe_tp_size + start = self.moe_tp_rank_id * split_size + stop = (self.moe_tp_rank_id + 1) * split_size + split_data = np_data[self.ep_start:self.ep_stop, start:stop] if split_ep else np_data[:, start:stop] + else: + raise ValueError("split_tp is True but tp_axis:{} is not supported.".format(tp_axis)) return split_data, qint4 def split_weight_by_rank(self, weight, split_axis=0): -- Gitee From 1a612d7220b9210d9fec51f62893ae2d772648a8 Mon Sep 17 00:00:00 2001 From: Erpim Date: Mon, 28 Apr 2025 15:32:53 +0800 Subject: [PATCH 19/55] opt get_padding_idx --- .../model_executor/models/mf_models/deepseek_v3.py | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/vllm_mindspore/model_executor/models/mf_models/deepseek_v3.py b/vllm_mindspore/model_executor/models/mf_models/deepseek_v3.py index e2112526..120bf5f3 100644 --- a/vllm_mindspore/model_executor/models/mf_models/deepseek_v3.py +++ b/vllm_mindspore/model_executor/models/mf_models/deepseek_v3.py @@ -92,22 +92,20 @@ def _get_padding_index(q_seq_len): arange_data = np.arange(0, int(tokens_length), dtype=np.int32) if dp_rank == dp_rank_id: ffn_unpadding_idx = arange_data - attn_padding_idx = np.pad( - arange_data, (0, padding_size - arange_data.shape[0]), mode='constant', constant_values=0) - + pad = np.zeros(padding_size - arange_data.shape[0], dtype=np.int32) + attn_padding_idx = np.concatenate((arange_data, pad), axis=0) if dp_rank == 0: attn_unpadding_idx = arange_data last_arange_index = arange_data[-1] - ffn_padding_idx = np.pad(attn_unpadding_idx, (0, padding_size - attn_unpadding_idx.shape[0]), - mode='constant', constant_values=0) + pad = np.zeros(padding_size - attn_unpadding_idx.shape[0], dtype=np.int32) + ffn_padding_idx = np.concatenate((attn_unpadding_idx, pad), axis=0) else: attn_offset_idx = arange_data + padding_size * dp_rank attn_unpadding_idx = np.concatenate((attn_unpadding_idx, attn_offset_idx), axis=0) ffn_offset_idx = arange_data + last_arange_index + 1 last_arange_index = ffn_offset_idx[-1] - ffn_offset_idx_pad_zero = np.pad( - ffn_offset_idx, (0, padding_size - ffn_offset_idx.shape[0]), mode='constant', constant_values=0) - ffn_padding_idx = np.concatenate((ffn_padding_idx, ffn_offset_idx_pad_zero), axis=0) + pad = np.zeros(padding_size - ffn_offset_idx.shape[0], dtype=np.int32) + ffn_padding_idx = np.concatenate((ffn_padding_idx, ffn_offset_idx, pad), axis=0) return ms.from_numpy(attn_padding_idx), ms.from_numpy(attn_unpadding_idx), 
ms.from_numpy(ffn_padding_idx), \ ms.from_numpy(ffn_unpadding_idx) -- Gitee From d3b7d62e4d3a3dee8a8608e2d31819065d6340b1 Mon Sep 17 00:00:00 2001 From: tronzhang Date: Mon, 28 Apr 2025 19:48:20 +0800 Subject: [PATCH 20/55] add exception log for dp --- vllm_dp/dp_scale_out.patch | 41 ++++++++++++++++++++++++++++---------- 1 file changed, 31 insertions(+), 10 deletions(-) mode change 100755 => 100644 vllm_dp/dp_scale_out.patch diff --git a/vllm_dp/dp_scale_out.patch b/vllm_dp/dp_scale_out.patch old mode 100755 new mode 100644 index fb6520d2..db5d145c --- a/vllm_dp/dp_scale_out.patch +++ b/vllm_dp/dp_scale_out.patch @@ -1,5 +1,5 @@ diff --git a/vllm/config.py b/vllm/config.py -index bd52fc90b..24fc1154d 100644 +index bd52fc9..24fc115 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -1429,16 +1429,27 @@ class LoadConfig: @@ -90,7 +90,7 @@ index bd52fc90b..24fc1154d 100644 import os os.environ["VLLM_ENABLE_V1_MULTIPROCESSING"] = "0" diff --git a/vllm/distributed/utils.py b/vllm/distributed/utils.py -index cae1a2551..2bdcdf4f1 100644 +index cae1a25..2bdcdf4 100644 --- a/vllm/distributed/utils.py +++ b/vllm/distributed/utils.py @@ -21,6 +21,7 @@ from torch.distributed.rendezvous import rendezvous @@ -130,7 +130,7 @@ index cae1a2551..2bdcdf4f1 100644 pg._register_backend(device, backend_type, backend_class) diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py -index 89c9b6747..d6173763d 100644 +index 89c9b67..d617376 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -113,11 +113,14 @@ class EngineArgs: @@ -207,7 +207,7 @@ index 89c9b6747..d6173763d 100644 max_parallel_loading_workers=self.max_parallel_loading_workers, disable_custom_all_reduce=self.disable_custom_all_reduce, diff --git a/vllm/entrypoints/cli/serve.py b/vllm/entrypoints/cli/serve.py -index e89ac4e21..ffcc2bb10 100644 +index e89ac4e..ffcc2bb 100644 --- a/vllm/entrypoints/cli/serve.py +++ b/vllm/entrypoints/cli/serve.py @@ -1,14 +1,24 @@ @@ -324,7 +324,7 @@ index e89ac4e21..ffcc2bb10 100644 + logger.info("Shutting down.") + engine_manager.close() diff --git a/vllm/forward_context.py b/vllm/forward_context.py -index e195a03c5..e5b9fd5fa 100644 +index e195a03..e5b9fd5 100644 --- a/vllm/forward_context.py +++ b/vllm/forward_context.py @@ -77,7 +77,8 @@ def set_forward_context(attn_metadata: Any, @@ -338,7 +338,7 @@ index e195a03c5..e5b9fd5fa 100644 batchsize = num_tokens num_tokens_across_dp = [0] * dp_size diff --git a/vllm/utils.py b/vllm/utils.py -index 5f32f8cb6..d38d02586 100644 +index 5f32f8c..d38d025 100644 --- a/vllm/utils.py +++ b/vllm/utils.py @@ -551,6 +551,10 @@ def is_valid_ipv6_address(address: str) -> bool: @@ -417,7 +417,7 @@ index 5f32f8cb6..d38d02586 100644 logger.debug("Got Keyboard Interrupt.") diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py -index 39caca0c2..32902a8b1 100644 +index 39caca0..32902a8 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -23,7 +23,7 @@ from vllm.lora.request import LoRARequest @@ -714,7 +714,7 @@ index 39caca0c2..32902a8b1 100644 super().shutdown() if dp_group := getattr(self, "dp_group", None): diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py -index e948e59b8..e443f45db 100644 +index e948e59..e443f45 100644 --- a/vllm/v1/engine/core_client.py +++ b/vllm/v1/engine/core_client.py @@ -8,26 +8,29 @@ import threading @@ -1228,8 +1228,26 @@ index e948e59b8..e443f45db 100644 - self.encoder.encode(request_ids))) + await self._send_input(EngineCoreRequestType.ABORT, request_ids, + engine) +diff --git 
a/vllm/v1/executor/multiproc_executor.py b/vllm/v1/executor/multiproc_executor.py +index 1d5175e..40efd27 100644 +--- a/vllm/v1/executor/multiproc_executor.py ++++ b/vllm/v1/executor/multiproc_executor.py +@@ -327,6 +327,13 @@ class WorkerProc: + logger.debug("Worker interrupted.") + + except Exception: ++ # Print exception and details. ++ import sys ++ import traceback ++ exec_type, exec_value, exec_traceback = sys.exc_info() ++ exception_str = "".join(traceback.format_exception(exec_type, exec_value, exec_traceback)) ++ logger.error("WorkerProc failed! %s" % exception_str) ++ + # worker_busy_loop sends exceptions exceptons to Executor + # for shutdown, but if there is an error in startup or an + # error with IPC itself, we need to alert the parent. diff --git a/vllm/v1/serial_utils.py b/vllm/v1/serial_utils.py -index 146d7d747..7c1d48494 100644 +index 146d7d7..7c1d484 100644 --- a/vllm/v1/serial_utils.py +++ b/vllm/v1/serial_utils.py @@ -2,9 +2,10 @@ @@ -1253,7 +1271,7 @@ index 146d7d747..7c1d48494 100644 class MsgpackEncoder: """Encoder with custom torch tensor serialization.""" diff --git a/vllm/v1/utils.py b/vllm/v1/utils.py -index f42b3501a..fbc0ee340 100644 +index f42b350..fbc0ee3 100644 --- a/vllm/v1/utils.py +++ b/vllm/v1/utils.py @@ -2,17 +2,21 @@ @@ -1414,3 +1432,6 @@ index f42b3501a..fbc0ee340 100644 if os and os.path.exists(socket_file): os.remove(socket_file) +-- +2.43.0 + -- Gitee From 3825f64f71b8b28c0b566eb87ec5d56622311224 Mon Sep 17 00:00:00 2001 From: twc Date: Tue, 29 Apr 2025 20:36:28 +0800 Subject: [PATCH 21/55] weight processor bug fix --- .../model_executor/models/mf_models/weight_processor.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/vllm_mindspore/model_executor/models/mf_models/weight_processor.py b/vllm_mindspore/model_executor/models/mf_models/weight_processor.py index d96be356..c70302f2 100644 --- a/vllm_mindspore/model_executor/models/mf_models/weight_processor.py +++ b/vllm_mindspore/model_executor/models/mf_models/weight_processor.py @@ -50,9 +50,9 @@ class BaseWeightProcessor: self.global_group_size = get_group_size() self.tp_group_size = get_tp_world_size() self.dp_group_size = get_data_parallel_world_size() - self.num_router_experts = self.config.moe_config.expert_num if hasattr(self.config.moe_config, "expert_num") else 1 - self.moe_ep_size = self.config.moe_config.moe_expert_parallel if hasattr(self.config.moe_config, "moe_expert_parallel") else 1 - self.moe_tp_size = self.config.moe_config.moe_tensor_parallel if hasattr(self.config.moe_config, "moe_tensor_parallel") else 1 + self.num_router_experts = self.config.moe_config.expert_num if self.config.moe_config.expert_num else 1 + self.moe_ep_size = self.config.moe_config.moe_expert_parallel if self.config.moe_config.moe_expert_parallel else 1 + self.moe_tp_size = self.config.moe_config.moe_tensor_parallel if self.config.moe_config.moe_tensor_parallel else 1 self.ep_method = EPMethod.DEFAULT if self.dp_group_size > 1 and self.moe_ep_size == self.global_group_size: self.ep_method = EPMethod.ALLTOALL -- Gitee From bc448f417b98d1948e8e05ca1ae0df0a23459246 Mon Sep 17 00:00:00 2001 From: jiahaochen666 Date: Wed, 30 Apr 2025 06:48:18 +0000 Subject: [PATCH 22/55] adding v0-style prefix-first scheduler --- vllm_mindspore/__init__.py | 4 + vllm_mindspore/v1/core/__init__.py | 0 vllm_mindspore/v1/core/sched/scheduler.py | 319 ++++++++++++++++++++++ 3 files changed, 323 insertions(+) create mode 100644 vllm_mindspore/v1/core/__init__.py create mode 100644 
vllm_mindspore/v1/core/sched/scheduler.py diff --git a/vllm_mindspore/__init__.py b/vllm_mindspore/__init__.py index 3aae2091..28f8469d 100644 --- a/vllm_mindspore/__init__.py +++ b/vllm_mindspore/__init__.py @@ -309,6 +309,10 @@ from vllm_mindspore.v1.worker.gpu_worker import compile_or_warm_up_model from vllm.v1.worker.gpu_worker import Worker Worker.compile_or_warm_up_model = compile_or_warm_up_model +from vllm_mindspore.v1.core.sched.scheduler import schedule +from vllm.v1.core.sched.scheduler import Scheduler +Scheduler.schedule = schedule + from .utils import check_ready from vllm_mindspore.engine.multiprocessing.engine import cleanup diff --git a/vllm_mindspore/v1/core/__init__.py b/vllm_mindspore/v1/core/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/vllm_mindspore/v1/core/sched/scheduler.py b/vllm_mindspore/v1/core/sched/scheduler.py new file mode 100644 index 00000000..c03f3469 --- /dev/null +++ b/vllm_mindspore/v1/core/sched/scheduler.py @@ -0,0 +1,319 @@ +#!/usr/bin/env python3 +# encoding: utf-8 +# Copyright 2025 Huawei Technologies Co., Ltd +# Copyright 2024 The vLLM team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# This file is based on vllm-ascend project. +# ============================================================================ + +from collections import deque + +import time +from vllm.logger import init_logger +from vllm.utils import cdiv +from vllm.v1.core.sched.output import NewRequestData, SchedulerOutput +from vllm.v1.core.sched.scheduler import Scheduler +from vllm.v1.request import Request, RequestStatus + +logger = init_logger(__name__) + +def _check_watermark_for_prefill(self, + request, + num_new_tokens, + computed_blocks, + watermark=0.01): + computed_blocks = computed_blocks or [] + watermark_blocks = self.kv_cache_config.num_blocks * watermark + num_computed_tokens = (request.num_computed_tokens + + len(computed_blocks) * self.block_size) + num_required_blocks = cdiv(num_new_tokens + num_computed_tokens, + self.block_size) + req_blocks = self.kv_cache_manager.req_to_blocks[request.request_id] + num_new_blocks = (num_required_blocks - len(req_blocks) - + len(computed_blocks)) + num_evictable_computed_blocks = sum(1 for blk in computed_blocks + if blk.ref_cnt == 0) + # If number of free blocks is less than water mark after allocating, don't allocate. + if (self.kv_cache_manager.block_pool.get_num_free_blocks() - + num_evictable_computed_blocks - + num_new_blocks) < watermark_blocks: + return False + return True + +def _get_prompt_limit(self, request: Request) -> int: + if (self.scheduler_config.chunked_prefill_enabled + and not self.scheduler_config.is_multi_step): + prompt_limit = self.scheduler_config.max_model_len + else: + prompt_limit = min( + self.scheduler_config.max_model_len, + self.scheduler_config.max_num_batched_tokens, + ) + + # Model is fine tuned with long context. Return the fine tuned max_len. 
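+    # A long-lora adapter carries its own context length, which is at least as
+    # large as the limit computed above, so it takes precedence for such
+    # requests (the assert below guards that invariant).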
+ if request.lora_request and request.lora_request.long_lora_max_len: + assert prompt_limit <= request.lora_request.long_lora_max_len + return request.lora_request.long_lora_max_len + else: + return prompt_limit + +def schedule(self) -> SchedulerOutput: + scheduled_new_reqs: list[Request] = [] + scheduled_resumed_reqs: list[Request] = [] + scheduled_running_reqs: list[Request] = [] + preempted_reqs: list[Request] = [] + + req_to_new_block_ids: dict[str, list[int]] = {} + num_scheduled_tokens: dict[str, int] = {} + token_budget = self.max_num_scheduled_tokens + # Spec decode-related. + scheduled_spec_decode_tokens: dict[str, list[int]] = {} + + # Record scheduled LoRA requests. + scheduled_loras: set[int] = set() + + # Use a temporary deque to collect requests that need to be skipped + # and put back at the head of the waiting queue later + skipped_waiting_requests: deque[Request] = deque() + + # Schedule prefill requests first. + while self.waiting and token_budget > 0: + if len(scheduled_new_reqs) == self.max_num_running_reqs: + break + + request = self.waiting[0] + + def skip_cur_request(): + self.waiting.popleft() + skipped_waiting_requests.appendleft(request) + + # Check that adding the request still respects the max_loras + # constraint. + if (self.lora_config and request.lora_request and + (len(scheduled_loras) == self.lora_config.max_loras + and request.lora_request.lora_int_id not in scheduled_loras)): + # Scheduling would exceed max_loras, skip. + skip_cur_request() + continue + + prompt_limit = _get_prompt_limit(self, request) + # Get already-cached tokens. + computed_blocks, num_computed_tokens = ( + self.kv_cache_manager.get_computed_blocks(request)) + num_new_tokens = request.num_prompt_tokens - num_computed_tokens + if (0 < self.scheduler_config.long_prefill_token_threshold < + num_new_tokens): + num_new_tokens = ( + self.scheduler_config.long_prefill_token_threshold) + max_tokens_in_kvcache = (self.kv_cache_config.num_blocks * + self.block_size) + prompt_limit = min(prompt_limit, max_tokens_in_kvcache) + + # Finish request that exceeds prompt_limit or kv cache size. + if num_new_tokens > prompt_limit: + logger.warning( + "Input prompt (%d tokens) is too long" + " and exceeds limit of %d", + num_new_tokens, + prompt_limit, + ) + request.status = RequestStatus.FINISHED_IGNORED + self.finished_req_ids.add(request.request_id) # type: ignore + self.waiting.popleft() + continue + + if num_new_tokens > token_budget: + # Scheduling would exceed token_budget, skip. + skip_cur_request() + continue + + assert num_new_tokens > 0 + watermark = getattr(self.scheduler_config, "watermark", 0.01) + if not _check_watermark_for_prefill(self, + request, num_new_tokens, computed_blocks, watermark): + # Scheduling would exceed watermark, skip. + skip_cur_request() + continue + + new_blocks = self.kv_cache_manager.allocate_slots( + request, num_new_tokens, computed_blocks) + if new_blocks is None: + # The request cannot be scheduled. + break + + self.waiting.popleft() + self.running.append(request) + self.scheduled_req_ids.add(request.request_id) + # Check request status. 
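+            # A WAITING request is entering the running set for the first time
+            # and is reported as a new request; a PREEMPTED request ran before,
+            # lost its KV cache blocks when it was preempted, and is reported
+            # as resumed so the model runner can rebuild its state.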
+ if request.status == RequestStatus.WAITING: + scheduled_new_reqs.append(request) + elif request.status == RequestStatus.PREEMPTED: + scheduled_resumed_reqs.append(request) + else: + raise RuntimeError(f"Invalid request status: {request.status}") + + if self.lora_config and request.lora_request: + scheduled_loras.add(request.lora_request.lora_int_id) + req_to_new_block_ids[request.request_id] = [ + b.block_id for b in computed_blocks + new_blocks + ] + # Update request info. + num_scheduled_tokens[request.request_id] = num_new_tokens + token_budget -= num_new_tokens + request.status = RequestStatus.RUNNING + request.num_computed_tokens = num_computed_tokens + + # Put back any skipped requests at the head of the waiting queue + if skipped_waiting_requests: + self.waiting.extendleft(skipped_waiting_requests) + + # If no prefill requests are scheduled, + # Schedule decode requests next. + if len(self.scheduled_req_ids) == 0: + req_index = 0 + while req_index < len(self.running) and token_budget > 0: + request = self.running[req_index] + if request.request_id in self.scheduled_req_ids: + # This request has already been scheduled. + req_index += 1 + continue + + num_new_tokens = (request.num_tokens_with_spec - + request.num_computed_tokens) + if (0 < self.scheduler_config.long_prefill_token_threshold < + num_new_tokens): + num_new_tokens = ( + self.scheduler_config.long_prefill_token_threshold) + num_new_tokens = min(num_new_tokens, token_budget) + assert num_new_tokens == 1 + + while True: + new_blocks = self.kv_cache_manager.allocate_slots( + request, num_new_tokens) + if new_blocks is None: + # The request cannot be scheduled. + # Preempt the lowest-priority request. + preempted_req = self.running.pop() + self.kv_cache_manager.free(preempted_req) + preempted_req.status = RequestStatus.PREEMPTED + preempted_req.num_computed_tokens = 0 + self.waiting.appendleft(preempted_req) + preempted_reqs.append(preempted_req) + if preempted_req == request: + # No more request to preempt. + can_schedule = False + break + else: + # The request can be scheduled. + can_schedule = True + break + if not can_schedule: + break + assert new_blocks is not None + + # Schedule the request. + scheduled_running_reqs.append(request) + self.scheduled_req_ids.add(request.request_id) + req_to_new_block_ids[request.request_id] = [ + b.block_id for b in new_blocks + ] + num_scheduled_tokens[request.request_id] = num_new_tokens + token_budget -= num_new_tokens + req_index += 1 + + # Speculative decode related. + if request.spec_token_ids: + num_scheduled_spec_tokens = (num_new_tokens + + request.num_computed_tokens - + request.num_tokens) + if num_scheduled_spec_tokens > 0: + # Trim spec_token_ids list to num_scheduled_spec_tokens. + del request.spec_token_ids[num_scheduled_spec_tokens:] + scheduled_spec_decode_tokens[request.request_id] = ( + request.spec_token_ids) + + # Check if the scheduling constraints are satisfied. + total_num_scheduled_tokens = sum(num_scheduled_tokens.values()) + assert total_num_scheduled_tokens <= self.max_num_scheduled_tokens + assert token_budget >= 0 + assert len(self.running) <= self.max_num_running_reqs + assert len(scheduled_new_reqs) + len(scheduled_resumed_reqs) + len( + scheduled_running_reqs) <= len(self.running) + + # Get the longest common prefix among all requests in the running queue. + # This can be potentially used for cascade attention. 
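+    # Any single running request works as the reference here: a block counts
+    # towards the common prefix only if it is referenced by every running
+    # request, so the first request's block list is as good as any other's.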
+ num_common_prefix_blocks = 0 + if self.running: + any_request = self.running[0] + num_common_prefix_blocks = ( + self.kv_cache_manager.get_num_common_prefix_blocks( + any_request, len(self.running))) + + # Construct the scheduler output. + new_reqs_data = [ + NewRequestData.from_request(req, + req_to_new_block_ids[req.request_id]) + for req in scheduled_new_reqs + ] + resumed_reqs_data = [ + self._make_cached_request_data( + req, + num_scheduled_tokens[req.request_id], + len(scheduled_spec_decode_tokens.get(req.request_id, ())), + req_to_new_block_ids[req.request_id], + resumed_from_preemption=True, + ) for req in scheduled_resumed_reqs + ] + running_reqs_data = [ + self._make_cached_request_data( + req, + num_scheduled_tokens[req.request_id], + len(scheduled_spec_decode_tokens.get(req.request_id, ())), + req_to_new_block_ids[req.request_id], + resumed_from_preemption=False, + ) for req in scheduled_running_reqs + ] + scheduler_output = SchedulerOutput( + scheduled_new_reqs=new_reqs_data, + scheduled_cached_reqs=resumed_reqs_data + running_reqs_data, + num_scheduled_tokens=num_scheduled_tokens, + total_num_scheduled_tokens=total_num_scheduled_tokens, + scheduled_spec_decode_tokens=scheduled_spec_decode_tokens, + scheduled_encoder_inputs={}, + num_common_prefix_blocks=num_common_prefix_blocks, + # finished_req_ids is an existing state in the scheduler, + # instead of being newly scheduled in this step. + # It contains the request IDs that are finished in between + # the previous and the current steps. + finished_req_ids=self.finished_req_ids, # type: ignore + free_encoder_input_ids=self.encoder_cache_manager.get_freed_ids(), + structured_output_request_ids={}, + grammar_bitmask=None, + ) + + # Advance the number of computed tokens for the request AFTER + # the request is scheduled. + # 1. The scheduler_output of the current step has to include the + # original number of scheduled tokens to determine input IDs. + # 2. Advance the number of computed tokens here allowing us to + # schedule the prefill request again immediately in the next + # scheduling step. + # 3. If some tokens (e.g. spec tokens) are rejected later, the number of + # computed tokens will be adjusted in update_from_output. 
+ for req_id, num_scheduled_token in num_scheduled_tokens.items(): + self.requests[req_id].num_computed_tokens += num_scheduled_token + + self.finished_req_ids = set() # type: ignore + return scheduler_output -- Gitee From 109edbf79a1405af1916d1ed46fe3020f71f5ed4 Mon Sep 17 00:00:00 2001 From: one_east Date: Wed, 7 May 2025 10:15:38 +0800 Subject: [PATCH 23/55] bugfix: add module init --- vllm_mindspore/v1/core/sched/__init__.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 vllm_mindspore/v1/core/sched/__init__.py diff --git a/vllm_mindspore/v1/core/sched/__init__.py b/vllm_mindspore/v1/core/sched/__init__.py new file mode 100644 index 00000000..e69de29b -- Gitee From 99effbffc222a38f80418f5760425108cba753ec Mon Sep 17 00:00:00 2001 From: hangangqiang Date: Tue, 29 Apr 2025 22:26:40 +0800 Subject: [PATCH 24/55] deepseekr1-int8 support qkvconcat --- .../mf_models/deepseekv3_weight_processor.py | 125 ++++++++++++++---- 1 file changed, 101 insertions(+), 24 deletions(-) diff --git a/vllm_mindspore/model_executor/models/mf_models/deepseekv3_weight_processor.py b/vllm_mindspore/model_executor/models/mf_models/deepseekv3_weight_processor.py index 1c6a99d9..966c569a 100644 --- a/vllm_mindspore/model_executor/models/mf_models/deepseekv3_weight_processor.py +++ b/vllm_mindspore/model_executor/models/mf_models/deepseekv3_weight_processor.py @@ -631,35 +631,11 @@ class DeepseekV3WeightProcessor(BaseWeightProcessor): def infer_quant_process_attention_weight(self, src_hf_dir, layer_id, hf_weight_map): """infer quant process attention weight""" num_heads = self.config.model.model_config.num_heads - kv_lora_rank = self.config.model.model_config.kv_lora_rank qk_rope_head_dim = self.config.model.model_config.qk_rope_head_dim v_head_dim = self.config.model.model_config.v_head_dim qk_nope_head_dim = self.config.model.model_config.qk_nope_head_dim rope_dim = qk_rope_head_dim + qk_nope_head_dim - kv_head_dim = kv_lora_rank + qk_rope_head_dim - - # q_a_proj->q2l_proj - q2l_proj_hf_name = f"model.layers.{layer_id}.self_attn.q_a_proj.weight" - q2l_proj_ms_name = self.quant_convert_weight_name(q2l_proj_hf_name) - q2l_proj_ms_param, _ = self.get_safetensor_from_file(q2l_proj_hf_name, src_hf_dir, hf_weight_map) - self.parameter_dict[q2l_proj_ms_name] = ms.Parameter( - ms.from_numpy(q2l_proj_ms_param).astype(ms.int8), - name=q2l_proj_ms_name, - requires_grad=False) - self.quant_special_attention_weight(layer_id, src_hf_dir, hf_weight_map, "q_a_proj") - - # kv_a_proj_with_mqa->kv2l - kv2l_hf_name = f"model.layers.{layer_id}.self_attn.kv_a_proj_with_mqa.weight" - kv2l_ms_name = self.quant_convert_weight_name(kv2l_hf_name) - kv2l_ms_param, _ = self.get_safetensor_from_file(kv2l_hf_name, src_hf_dir, hf_weight_map) - kv2l_ms_param = kv2l_ms_param.reshape(kv_head_dim, -1) - kv2l_ms_param = self.infer_trans_rope_weight(kv2l_ms_param, qk_rope_head_dim) - self.parameter_dict[kv2l_ms_name] = ms.Parameter(ms.from_numpy(kv2l_ms_param).astype(ms.int8), - name=kv2l_ms_name, - requires_grad=False) - self.quant_special_attention_weight(layer_id, src_hf_dir, hf_weight_map, "kv_a_proj_with_mqa", - is_trans_rope_weigh=True) # q_a_layernorm->lq_norm lq_norm_hf_name = f"model.layers.{layer_id}.self_attn.q_a_layernorm.weight" @@ -726,6 +702,106 @@ class DeepseekV3WeightProcessor(BaseWeightProcessor): requires_grad=False) self.quant_special_attention_weight(layer_id, src_hf_dir, hf_weight_map, "o_proj") + def infer_quant_process_dense_qkv_weight(self, src_hf_dir, layer_id, hf_weight_map): + 
"""infer_quant_process_dense_qkv_weight""" + parameter_dict = {} + kv_lora_rank = self.config.model.model_config.kv_lora_rank + qk_rope_head_dim = self.config.model.model_config.qk_rope_head_dim + kv_head_dim = kv_lora_rank + qk_rope_head_dim + + qkv_concat = self.config.model.model_config.qkv_concat + # q2l + q2l_hf_name = f"model.layers.{layer_id}.self_attn.q_a_proj.weight" + q2l_ms_name = self.quant_convert_weight_name(q2l_hf_name) + q2l_ms_param, _ = self.get_safetensor_from_file(q2l_hf_name, src_hf_dir, hf_weight_map) + + q2l_input_scale_hf_name = f"model.layers.{layer_id}.self_attn.q_a_proj.input_scale" + q2l_input_scale_ms_name = self.quant_convert_weight_name(q2l_input_scale_hf_name) + q2l_input_scale_ms_param, _ = self.get_safetensor_from_file(q2l_input_scale_hf_name, src_hf_dir, + hf_weight_map) + + q2l_input_zp_hf_name = f"model.layers.{layer_id}.self_attn.q_a_proj.input_offset" + q2l_input_zp_ms_name = self.quant_convert_weight_name(q2l_input_zp_hf_name) + q2l_input_zp_ms_param, _ = self.get_safetensor_from_file(q2l_input_zp_hf_name, src_hf_dir, hf_weight_map) + + q2l_quant_bias_hf_name = f"model.layers.{layer_id}.self_attn.q_a_proj.quant_bias" + q2l_quant_bias_ms_name = self.quant_convert_weight_name(q2l_quant_bias_hf_name) + q2l_quant_bias_ms_param, _ = self.get_safetensor_from_file(q2l_quant_bias_hf_name, src_hf_dir, + hf_weight_map) + + q2l_dequant_scale_hf_name = f"model.layers.{layer_id}.self_attn.q_a_proj.deq_scale" + q2l_dequant_scale_ms_name = self.quant_convert_weight_name(q2l_dequant_scale_hf_name) + q2l_dequant_scale_ms_param, _ = self.get_safetensor_from_file(q2l_dequant_scale_hf_name, src_hf_dir, + hf_weight_map) + # kv2l + kv2l_hf_name = f"model.layers.{layer_id}.self_attn.kv_a_proj_with_mqa.weight" + kv2l_ms_name = self.quant_convert_weight_name(kv2l_hf_name) + kv2l_ms_param, _ = self.get_safetensor_from_file(kv2l_hf_name, src_hf_dir, hf_weight_map) + kv2l_ms_param = kv2l_ms_param.reshape(kv_head_dim, -1) + kv2l_ms_param = self.infer_trans_rope_weight(kv2l_ms_param, qk_rope_head_dim) + + kv2l_input_scale_hf_name = f"model.layers.{layer_id}.self_attn.kv_a_proj_with_mqa.input_scale" + kv2l_input_scale_ms_name = self.quant_convert_weight_name(kv2l_input_scale_hf_name) + kv2l_input_scale_ms_param, _ = self.get_safetensor_from_file(kv2l_input_scale_hf_name, src_hf_dir, + hf_weight_map) + + kv2l_input_zp_hf_name = f"model.layers.{layer_id}.self_attn.kv_a_proj_with_mqa.input_offset" + kv2l_input_zp_ms_name = self.quant_convert_weight_name(kv2l_input_zp_hf_name) + kv2l_input_zp_ms_param, _ = self.get_safetensor_from_file(kv2l_input_zp_hf_name, src_hf_dir, hf_weight_map) + + kv2l_quant_bias_hf_name = f"model.layers.{layer_id}.self_attn.kv_a_proj_with_mqa.quant_bias" + kv2l_quant_bias_ms_name = self.quant_convert_weight_name(kv2l_quant_bias_hf_name) + kv2l_quant_bias_ms_param, _ = self.get_safetensor_from_file(kv2l_quant_bias_hf_name, src_hf_dir, + hf_weight_map) + kv2l_quant_bias_ms_param = kv2l_quant_bias_ms_param.reshape(kv_head_dim, -1) + kv2l_quant_bias_ms_param = self.infer_trans_rope_weight(kv2l_quant_bias_ms_param, + qk_rope_head_dim).reshape(-1) + + kv2l_dequant_scale_hf_name = f"model.layers.{layer_id}.self_attn.kv_a_proj_with_mqa.deq_scale" + kv2l_dequant_scale_ms_name = self.quant_convert_weight_name(kv2l_dequant_scale_hf_name) + kv2l_dequant_scale_ms_param, _ = self.get_safetensor_from_file(kv2l_dequant_scale_hf_name, src_hf_dir, + hf_weight_map) + kv2l_dequant_scale_ms_param = kv2l_dequant_scale_ms_param.reshape(kv_head_dim, -1) + kv2l_dequant_scale_ms_param 
= self.infer_trans_rope_weight(kv2l_dequant_scale_ms_param, + qk_rope_head_dim).reshape(-1) + + attn_rmsnorm_beta_hf_name = f"model.layers.{layer_id}.input_layernorm.bias" + attn_rmsnorm_beta_ms_name = self.quant_convert_weight_name(attn_rmsnorm_beta_hf_name) + attn_rmsnorm_beta_ms_param, _ = self.get_safetensor_from_file(attn_rmsnorm_beta_hf_name, src_hf_dir, hf_weight_map) + + if qkv_concat: + qkv2l_weight_name = f"model.layers.{layer_id}.attention.qkv2l._layer.weight" + qkv2l_bias_name = f"model.layers.{layer_id}.attention.qkv2l._layer.matmul.quant_bias" + qkv2l_scale_name = f"model.layers.{layer_id}.attention.qkv2l._layer.matmul.dequant_scale" + qkv2l_quant_zp_name = f"model.layers.{layer_id}.attention.qkv2l.quant_op.input_zp" + qkv2l_quant_scale_name = f"model.layers.{layer_id}.attention.qkv2l.quant_op.input_scale" + qkv2l_rmsnorm_beta_name = f"model.layers.{layer_id}.attention.qkv2l.quant_op.beta" + + qkv2l_weight = np.concatenate((q2l_ms_param, kv2l_ms_param), 0) + parameter_dict[qkv2l_weight_name] = ms.Parameter(ms.Tensor(qkv2l_weight, ms.int8), name=qkv2l_weight_name, requires_grad=False) + qkv2l_bias = np.concatenate((q2l_quant_bias_ms_param, kv2l_quant_bias_ms_param), 0) + parameter_dict[qkv2l_bias_name] = ms.Parameter(ms.Tensor(qkv2l_bias, ms.int32), name=qkv2l_bias_name,requires_grad=False) + qkv2l_scale = np.concatenate((q2l_dequant_scale_ms_param, kv2l_dequant_scale_ms_param), 0) + parameter_dict[qkv2l_scale_name] = ms.Parameter(ms.Tensor(qkv2l_scale, ms.float32), name=qkv2l_scale_name, requires_grad=False) + parameter_dict[qkv2l_quant_zp_name] = ms.Parameter(ms.Tensor(q2l_input_zp_ms_param, ms.int8),requires_grad=False) + parameter_dict[qkv2l_quant_scale_name] = ms.Parameter(ms.Tensor(q2l_input_scale_ms_param, ms.bfloat16), requires_grad=False) + parameter_dict[qkv2l_rmsnorm_beta_name] = ms.Parameter(ms.Tensor(attn_rmsnorm_beta_ms_param, ms.float32), requires_grad=False) + else: + parameter_dict[q2l_ms_name] = ms.Parameter(ms.Tensor(q2l_ms_param, ms.int8), name=q2l_ms_name,requires_grad=False) + parameter_dict[kv2l_ms_name] = ms.Parameter(ms.Tensor(kv2l_ms_param, ms.int8),requires_grad=False) + parameter_dict[q2l_quant_bias_ms_name] = ms.Parameter(ms.Tensor(q2l_quant_bias_ms_param, ms.int32),name=q2l_quant_bias_ms_name,requires_grad = False) + parameter_dict[kv2l_quant_bias_ms_name] = ms.Parameter(ms.Tensor(kv2l_quant_bias_ms_param, ms.int32),name=kv2l_quant_bias_ms_name,requires_grad = False) + parameter_dict[q2l_dequant_scale_ms_name] = ms.Parameter(ms.Tensor(q2l_dequant_scale_ms_param, ms.float32), name=q2l_dequant_scale_ms_name, requires_grad = False) + parameter_dict[kv2l_dequant_scale_ms_name] = ms.Parameter(ms.Tensor(kv2l_dequant_scale_ms_param, ms.float32),name = kv2l_dequant_scale_ms_name, requires_grad = False) + parameter_dict[q2l_input_zp_ms_name] = ms.Parameter(ms.Tensor(q2l_input_zp_ms_param, ms.int8),name=q2l_input_zp_ms_name, requires_grad = False) + parameter_dict[kv2l_input_zp_ms_name] = ms.Parameter(ms.Tensor(kv2l_input_zp_ms_param, ms.int8), name=kv2l_input_zp_ms_name, requires_grad = False) + parameter_dict[q2l_input_scale_ms_name] = ms.Parameter(ms.Tensor(q2l_input_scale_ms_param, ms.bfloat16), name = q2l_input_scale_ms_name, requires_grad = False) + parameter_dict[kv2l_input_scale_ms_name] = ms.Parameter(ms.Tensor(kv2l_input_scale_ms_param, ms.bfloat16), name = kv2l_input_scale_ms_name, requires_grad = False) + parameter_dict[attn_rmsnorm_beta_ms_name] = ms.Parameter(ms.Tensor(attn_rmsnorm_beta_ms_param, ms.float32), name=attn_rmsnorm_beta_ms_name, 
requires_grad=False) + _, _ = ms.load_param_into_net(self.network, parameter_dict) + del parameter_dict + gc.collect() + def infer_quant_net_convert_layer_weight(self, src_hf_dir, layer_id, hf_weight_map): """infer quant net convert layer weight""" @@ -735,6 +811,7 @@ class DeepseekV3WeightProcessor(BaseWeightProcessor): else: self.infer_quant_process_dense_ffn_weight(src_hf_dir, layer_id, hf_weight_map) + self.infer_quant_process_dense_qkv_weight(src_hf_dir, layer_id, hf_weight_map) self.infer_quant_process_attention_weight(src_hf_dir, layer_id, hf_weight_map) self.infer_quant_bias_weight(src_hf_dir, layer_id, hf_weight_map) self.infer_process_norm_weight(src_hf_dir, layer_id, hf_weight_map) -- Gitee From 31d20c344f5f9a85321bf8dec9826a187665accb Mon Sep 17 00:00:00 2001 From: tongl Date: Wed, 30 Apr 2025 19:44:03 +0800 Subject: [PATCH 25/55] Add osl for vllm-mindspore. --- .../models/mf_models/deepseek_v3.py | 16 ++++++++++++++++ .../mf_models/deepseekv3_weight_processor.py | 5 ++++- 2 files changed, 20 insertions(+), 1 deletion(-) diff --git a/vllm_mindspore/model_executor/models/mf_models/deepseek_v3.py b/vllm_mindspore/model_executor/models/mf_models/deepseek_v3.py index e7cda00c..82fa53d9 100644 --- a/vllm_mindspore/model_executor/models/mf_models/deepseek_v3.py +++ b/vllm_mindspore/model_executor/models/mf_models/deepseek_v3.py @@ -200,6 +200,18 @@ class DeepseekV3ForCausalLM(MfModelBase): act_quant_granularity=QuantGranularity.PER_TOKEN, weight_quant_granularity=QuantGranularity.PER_CHANNEL) layer_policies = OrderedDict({r'.*\.w2.*': w2_config}) + elif quant_type.lower() == 'osl': + cfg = PTQConfig(mode=quant_mode, backend=BackendTarget.ASCEND, weight_quant_dtype=msdtype.int8, + act_quant_dtype=msdtype.int8, + outliers_suppression=OutliersSuppressionType.OUTLIER_SUPPRESSION_LITE, + opname_blacklist=['lm_head', 'lkv2kv']) + w2_config = PTQConfig(mode=quant_mode, backend=BackendTarget.ASCEND, weight_quant_dtype=msdtype.int8, + act_quant_dtype=msdtype.int8, + outliers_suppression=OutliersSuppressionType.NONE, + precision_recovery=PrecisionRecovery.NONE, + act_quant_granularity=QuantGranularity.PER_TOKEN, + weight_quant_granularity=QuantGranularity.PER_CHANNEL) + layer_policies = OrderedDict({r'.*\.w2.*': w2_config}) elif quant_type.lower() == 'a16w8': cfg = PTQConfig(mode=quant_mode, backend=BackendTarget.ASCEND, weight_quant_dtype=msdtype.int8, opname_blacklist=['lm_head', 'lkv2kv']) @@ -220,6 +232,10 @@ class DeepseekV3ForCausalLM(MfModelBase): # pylint: disable=protected-access ptq._config.aclnn_quant_list = ["routed_experts.ffn.w_gate_hidden", "routed_experts.ffn.w1", "routed_experts.ffn.w3"] + if 'osl' in quant_type.lower(): + # pylint: disable=protected-access + ptq._config.aclnn_quant_list = ["routed_experts.ffn.w_gate_hidden", "routed_experts.ffn.w1", + "routed_experts.ffn.w3"] if 'gptq-pergroup' in quant_type.lower(): # pylint: disable=protected-access ptq.layer_policies[r'.*\.feed_forward\.w2.*'].aclnn_quant_list = ["w2"] diff --git a/vllm_mindspore/model_executor/models/mf_models/deepseekv3_weight_processor.py b/vllm_mindspore/model_executor/models/mf_models/deepseekv3_weight_processor.py index de7d70d0..c9bc5e1b 100644 --- a/vllm_mindspore/model_executor/models/mf_models/deepseekv3_weight_processor.py +++ b/vllm_mindspore/model_executor/models/mf_models/deepseekv3_weight_processor.py @@ -1434,7 +1434,7 @@ class DeepseekV3WeightProcessor(BaseWeightProcessor): quantization_config = self.config.model.model_config.quantization_config quant_method = 
quantization_config.quant_method if quantization_config else None - support_quant_method = ["gptq-pergroup", "smoothquant"] + support_quant_method = ["gptq-pergroup", "smoothquant", "osl"] if not quant_method or (quant_method not in support_quant_method) and \ not is_mtp_model: self.infer_convert_outer_weight(src_hf_dir, hf_weight_map) @@ -1445,6 +1445,9 @@ class DeepseekV3WeightProcessor(BaseWeightProcessor): if quant_method and quant_method == "smoothquant": self.infer_smooth_quant_net_ms_convert_layer_weight(src_hf_dir, self.num_layers, hf_weight_map) return + if quant_method and quant_method == "osl": + self.infer_smooth_quant_net_ms_convert_layer_weight(src_hf_dir, self.num_layers, hf_weight_map) + return enable_tqdm = rank_id == 0 mtp_layers = self.config.model.model_config.num_nextn_predict_layers -- Gitee From b35c4a37c5fc119307e4636342167b0e019e1575 Mon Sep 17 00:00:00 2001 From: lvhaoyu Date: Wed, 14 May 2025 10:09:25 +0800 Subject: [PATCH 26/55] add VLLM_USE_V1=0 for testcase --- tests/mindformers | 2 +- tests/st/python/test_vllm_deepseek_bf16_part.py | 3 ++- tests/st/python/test_vllm_deepseek_part.py | 5 ++++- tests/st/python/test_vllm_deepseek_smoothquant.py | 4 +++- tests/st/python/test_vllm_mf_qwen_7b.py | 3 ++- tests/st/python/test_vllm_mf_qwen_7b_chunk_prefill.py | 4 +++- tests/st/python/test_vllm_mf_qwen_7b_cp_pc_mss.py | 3 ++- tests/st/python/test_vllm_mf_qwen_7b_mss.py | 3 ++- tests/st/python/test_vllm_mf_qwen_7b_prefix_caching.py | 4 +++- tests/st/python/test_vllm_qwen_7b.py | 3 ++- 10 files changed, 24 insertions(+), 10 deletions(-) diff --git a/tests/mindformers b/tests/mindformers index 544c4009..16587217 160000 --- a/tests/mindformers +++ b/tests/mindformers @@ -1 +1 @@ -Subproject commit 544c4009573051e0e254efab71d212bfc77fc7b2 +Subproject commit 165872172ae5396cb4b66629614c85ff21038e11 diff --git a/tests/st/python/test_vllm_deepseek_bf16_part.py b/tests/st/python/test_vllm_deepseek_bf16_part.py index c19dd14a..86137637 100644 --- a/tests/st/python/test_vllm_deepseek_bf16_part.py +++ b/tests/st/python/test_vllm_deepseek_bf16_part.py @@ -30,7 +30,8 @@ env_vars = { "LCCL_DETERMINISTIC": "1", "HCCL_DETERMINISTIC": "true", "ATB_MATMUL_SHUFFLE_K_ENABLE": "0", - "ATB_LLM_LCOC_ENABLE": "0" + "ATB_LLM_LCOC_ENABLE": "0", + "VLLM_USE_V1": "0", } # set env env_manager.setup_ai_environment(env_vars) diff --git a/tests/st/python/test_vllm_deepseek_part.py b/tests/st/python/test_vllm_deepseek_part.py index 8dfa9563..f88ece2c 100644 --- a/tests/st/python/test_vllm_deepseek_part.py +++ b/tests/st/python/test_vllm_deepseek_part.py @@ -32,7 +32,8 @@ env_vars = { "LCCL_DETERMINISTIC": "1", "HCCL_DETERMINISTIC": "true", "ATB_MATMUL_SHUFFLE_K_ENABLE": "0", - "ATB_LLM_LCOC_ENABLE": "0" + "ATB_LLM_LCOC_ENABLE": "0", + "VLLM_USE_V1": "0", } # set env env_manager.setup_ai_environment(env_vars) @@ -47,6 +48,7 @@ class TestDeepSeek: @pytest.mark.level0 @pytest.mark.platform_arm_ascend910b_training @pytest.mark.env_single + @pytest.mark.skip(reason="gs master branch is not suit for the newest mindformers.") def test_deepseek_r1(self): """ test case deepseek r1 w8a8 @@ -87,6 +89,7 @@ class TestDeepSeekMTP: @pytest.mark.level0 @pytest.mark.platform_arm_ascend910b_training @pytest.mark.env_single + @pytest.mark.skip(reason="MTP need addition adaption on v0.8.3 V0") def test_deepseek_mtp(self): """ test case deepseek mtp with main model of r1-w8a8 diff --git a/tests/st/python/test_vllm_deepseek_smoothquant.py b/tests/st/python/test_vllm_deepseek_smoothquant.py index 7582e55b..7e3397e0 100644 --- 
a/tests/st/python/test_vllm_deepseek_smoothquant.py +++ b/tests/st/python/test_vllm_deepseek_smoothquant.py @@ -32,7 +32,8 @@ env_vars = { "LCCL_DETERMINISTIC": "1", "HCCL_DETERMINISTIC": "true", "ATB_MATMUL_SHUFFLE_K_ENABLE": "0", - "ATB_LLM_LCOC_ENABLE": "0" + "ATB_LLM_LCOC_ENABLE": "0", + "VLLM_USE_V1": "0", } # set env env_manager.setup_ai_environment(env_vars) @@ -47,6 +48,7 @@ class TestDeepSeek: @pytest.mark.level0 @pytest.mark.platform_arm_ascend910b_training @pytest.mark.env_single + @pytest.mark.skip(reason="gs master branch is not suit for the newest mindformers.") def test_deepseek_r1(self): """ test case deepseek r1 w8a8 diff --git a/tests/st/python/test_vllm_mf_qwen_7b.py b/tests/st/python/test_vllm_mf_qwen_7b.py index ddb545c7..bbb6aa46 100644 --- a/tests/st/python/test_vllm_mf_qwen_7b.py +++ b/tests/st/python/test_vllm_mf_qwen_7b.py @@ -32,7 +32,8 @@ env_vars = { "LCCL_DETERMINISTIC": "1", "HCCL_DETERMINISTIC": "true", "ATB_MATMUL_SHUFFLE_K_ENABLE": "0", - "ATB_LLM_LCOC_ENABLE": "0" + "ATB_LLM_LCOC_ENABLE": "0", + "VLLM_USE_V1": "0", } # set env env_manager.setup_ai_environment(env_vars) diff --git a/tests/st/python/test_vllm_mf_qwen_7b_chunk_prefill.py b/tests/st/python/test_vllm_mf_qwen_7b_chunk_prefill.py index 1523e46b..cc3cbcab 100644 --- a/tests/st/python/test_vllm_mf_qwen_7b_chunk_prefill.py +++ b/tests/st/python/test_vllm_mf_qwen_7b_chunk_prefill.py @@ -31,7 +31,8 @@ env_vars = { "LCCL_DETERMINISTIC": "1", "HCCL_DETERMINISTIC": "true", "ATB_MATMUL_SHUFFLE_K_ENABLE": "0", - "ATB_LLM_LCOC_ENABLE": "0" + "ATB_LLM_LCOC_ENABLE": "0", + "VLLM_USE_V1": "0", } # set env env_manager.setup_ai_environment(env_vars) @@ -47,6 +48,7 @@ class TestMfQwen_chunk_prefill: @pytest.mark.level0 @pytest.mark.platform_arm_ascend910b_training @pytest.mark.env_single + @pytest.mark.skip(reason="cp precision need to be fixed on v0.8.3 V0") def test_mf_qwen_7b_chunk_prefill(self): """ test case qwen_7b_chunk_prefill diff --git a/tests/st/python/test_vllm_mf_qwen_7b_cp_pc_mss.py b/tests/st/python/test_vllm_mf_qwen_7b_cp_pc_mss.py index 6292b22c..df61117d 100644 --- a/tests/st/python/test_vllm_mf_qwen_7b_cp_pc_mss.py +++ b/tests/st/python/test_vllm_mf_qwen_7b_cp_pc_mss.py @@ -30,7 +30,8 @@ env_vars = { "LCCL_DETERMINISTIC": "1", "HCCL_DETERMINISTIC": "true", "ATB_MATMUL_SHUFFLE_K_ENABLE": "0", - "ATB_LLM_LCOC_ENABLE": "0" + "ATB_LLM_LCOC_ENABLE": "0", + "VLLM_USE_V1": "0", } # set env env_manager.setup_ai_environment(env_vars) diff --git a/tests/st/python/test_vllm_mf_qwen_7b_mss.py b/tests/st/python/test_vllm_mf_qwen_7b_mss.py index b174804d..266e296a 100644 --- a/tests/st/python/test_vllm_mf_qwen_7b_mss.py +++ b/tests/st/python/test_vllm_mf_qwen_7b_mss.py @@ -32,7 +32,8 @@ env_vars = { "LCCL_DETERMINISTIC": "1", "HCCL_DETERMINISTIC": "true", "ATB_MATMUL_SHUFFLE_K_ENABLE": "0", - "ATB_LLM_LCOC_ENABLE": "0" + "ATB_LLM_LCOC_ENABLE": "0", + "VLLM_USE_V1": "0", } # set env env_manager.setup_ai_environment(env_vars) diff --git a/tests/st/python/test_vllm_mf_qwen_7b_prefix_caching.py b/tests/st/python/test_vllm_mf_qwen_7b_prefix_caching.py index 89ba64c0..28ec1058 100644 --- a/tests/st/python/test_vllm_mf_qwen_7b_prefix_caching.py +++ b/tests/st/python/test_vllm_mf_qwen_7b_prefix_caching.py @@ -31,7 +31,8 @@ env_vars = { "LCCL_DETERMINISTIC": "1", "HCCL_DETERMINISTIC": "true", "ATB_MATMUL_SHUFFLE_K_ENABLE": "0", - "ATB_LLM_LCOC_ENABLE": "0" + "ATB_LLM_LCOC_ENABLE": "0", + "VLLM_USE_V1": "0", } env_manager.setup_ai_environment(env_vars) import vllm_mindspore @@ -45,6 +46,7 @@ class 
TestMfQwen_prefix_caching: @pytest.mark.level0 @pytest.mark.platform_arm_ascend910b_training @pytest.mark.env_single + @pytest.mark.skip(reason="pc precision need to be fixed on v0.8.3 V0") def test_mf_qwen_7b_prefix_caching(self): """ test case qwen_7b_prefix_caching diff --git a/tests/st/python/test_vllm_qwen_7b.py b/tests/st/python/test_vllm_qwen_7b.py index bce75d3e..695e9cb6 100644 --- a/tests/st/python/test_vllm_qwen_7b.py +++ b/tests/st/python/test_vllm_qwen_7b.py @@ -28,7 +28,8 @@ env_vars = { "LCCL_DETERMINISTIC": "1", "HCCL_DETERMINISTIC": "true", "ATB_MATMUL_SHUFFLE_K_ENABLE": "0", - "ATB_LLM_LCOC_ENABLE": "0" + "ATB_LLM_LCOC_ENABLE": "0", + "VLLM_USE_V1": "0", } # set env env_manager.setup_ai_environment(env_vars) -- Gitee From ffe0562772872ea366159faa2f5c3fa77605c615 Mon Sep 17 00:00:00 2001 From: lvhaoyu Date: Wed, 14 May 2025 10:18:14 +0800 Subject: [PATCH 27/55] update codegate for vllm 0.8.3 --- .jenkins/test/config/dependent_packages.yaml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.jenkins/test/config/dependent_packages.yaml b/.jenkins/test/config/dependent_packages.yaml index 37511933..cdb73e74 100644 --- a/.jenkins/test/config/dependent_packages.yaml +++ b/.jenkins/test/config/dependent_packages.yaml @@ -1,11 +1,11 @@ mindspore: - 'https://repo.mindspore.cn/mindspore/mindspore/version/202504/20250417/br_infer_deepseek_os_20250417004508_38b6db6c3039b59153d52d5e353cd01fe774dc93_newest/' + 'https://repo.mindspore.cn/mindspore/mindspore/version/202505/20250514/br_infer_deepseek_os_20250514004506_0e705b79c36766d07889faa32bc6a3ef6ec79ef3_newest/' mindspore_gs: - 'https://repo.mindspore.cn/mindspore/golden-stick/version/202504/20250424/master_20250424010019_dc3222e266c572dce1070a112aa6e12155a45370_newest/' + 'https://repo.mindspore.cn/mindspore/golden-stick/version/202505/20250514/master_20250514010015_c6cede824328d0dd7069e735646ff4a1808a1c72_newest/' msadapter: - 'https://repo.mindspore.cn/mindspore/msadapter/version/202504/20250410/master_20250410120007_83e7214eb2b9598179135a4e98dce3b69ba27da2_newest/' + 'https://repo.mindspore.cn/mindspore/msadapter/version/202505/20250514/master_20250514010016_380ecadf0133da436503105d6e8e1db709472fe4_newest/' vllm: - 'https://repo.mindspore.cn/mirrors/vllm/version/202503/20250321/v0.7.3_20250321112504_ed6e9075d31e32c8548b480a47d1ffb77da1f54c_newest/' + 'https://repo.mindspore.cn/mirrors/vllm/version/202505/20250514/v0.8.4.dev0_newest/any/vllm-0.8.4.dev0+g296c657.d20250514.empty-py3-none-any.whl' -- Gitee From 50b54e4878030c413930da6dc57271a51a3411c6 Mon Sep 17 00:00:00 2001 From: lvhaoyu Date: Thu, 15 May 2025 10:12:22 +0800 Subject: [PATCH 28/55] Revert "adding v0-style prefix-first scheduler" This reverts commit bc448f417b98d1948e8e05ca1ae0df0a23459246. 
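For reference, a minimal sketch of the import-time hook removed by this revert
(module paths taken from the diff below); after the revert, vLLM's upstream V1
scheduler is used unmodified:

    # Hook previously installed in vllm_mindspore/__init__.py and dropped here:
    # it replaced the V1 scheduler's schedule() with the prefix-first variant.
    from vllm.v1.core.sched.scheduler import Scheduler
    from vllm_mindspore.v1.core.sched.scheduler import schedule
    Scheduler.schedule = schedule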
--- vllm_mindspore/__init__.py | 4 - vllm_mindspore/v1/core/__init__.py | 0 vllm_mindspore/v1/core/sched/scheduler.py | 319 ---------------------- 3 files changed, 323 deletions(-) delete mode 100644 vllm_mindspore/v1/core/__init__.py delete mode 100644 vllm_mindspore/v1/core/sched/scheduler.py diff --git a/vllm_mindspore/__init__.py b/vllm_mindspore/__init__.py index 28f8469d..3aae2091 100644 --- a/vllm_mindspore/__init__.py +++ b/vllm_mindspore/__init__.py @@ -309,10 +309,6 @@ from vllm_mindspore.v1.worker.gpu_worker import compile_or_warm_up_model from vllm.v1.worker.gpu_worker import Worker Worker.compile_or_warm_up_model = compile_or_warm_up_model -from vllm_mindspore.v1.core.sched.scheduler import schedule -from vllm.v1.core.sched.scheduler import Scheduler -Scheduler.schedule = schedule - from .utils import check_ready from vllm_mindspore.engine.multiprocessing.engine import cleanup diff --git a/vllm_mindspore/v1/core/__init__.py b/vllm_mindspore/v1/core/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/vllm_mindspore/v1/core/sched/scheduler.py b/vllm_mindspore/v1/core/sched/scheduler.py deleted file mode 100644 index c03f3469..00000000 --- a/vllm_mindspore/v1/core/sched/scheduler.py +++ /dev/null @@ -1,319 +0,0 @@ -#!/usr/bin/env python3 -# encoding: utf-8 -# Copyright 2025 Huawei Technologies Co., Ltd -# Copyright 2024 The vLLM team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -# This file is based on vllm-ascend project. -# ============================================================================ - -from collections import deque - -import time -from vllm.logger import init_logger -from vllm.utils import cdiv -from vllm.v1.core.sched.output import NewRequestData, SchedulerOutput -from vllm.v1.core.sched.scheduler import Scheduler -from vllm.v1.request import Request, RequestStatus - -logger = init_logger(__name__) - -def _check_watermark_for_prefill(self, - request, - num_new_tokens, - computed_blocks, - watermark=0.01): - computed_blocks = computed_blocks or [] - watermark_blocks = self.kv_cache_config.num_blocks * watermark - num_computed_tokens = (request.num_computed_tokens + - len(computed_blocks) * self.block_size) - num_required_blocks = cdiv(num_new_tokens + num_computed_tokens, - self.block_size) - req_blocks = self.kv_cache_manager.req_to_blocks[request.request_id] - num_new_blocks = (num_required_blocks - len(req_blocks) - - len(computed_blocks)) - num_evictable_computed_blocks = sum(1 for blk in computed_blocks - if blk.ref_cnt == 0) - # If number of free blocks is less than water mark after allocating, don't allocate. 
- if (self.kv_cache_manager.block_pool.get_num_free_blocks() - - num_evictable_computed_blocks - - num_new_blocks) < watermark_blocks: - return False - return True - -def _get_prompt_limit(self, request: Request) -> int: - if (self.scheduler_config.chunked_prefill_enabled - and not self.scheduler_config.is_multi_step): - prompt_limit = self.scheduler_config.max_model_len - else: - prompt_limit = min( - self.scheduler_config.max_model_len, - self.scheduler_config.max_num_batched_tokens, - ) - - # Model is fine tuned with long context. Return the fine tuned max_len. - if request.lora_request and request.lora_request.long_lora_max_len: - assert prompt_limit <= request.lora_request.long_lora_max_len - return request.lora_request.long_lora_max_len - else: - return prompt_limit - -def schedule(self) -> SchedulerOutput: - scheduled_new_reqs: list[Request] = [] - scheduled_resumed_reqs: list[Request] = [] - scheduled_running_reqs: list[Request] = [] - preempted_reqs: list[Request] = [] - - req_to_new_block_ids: dict[str, list[int]] = {} - num_scheduled_tokens: dict[str, int] = {} - token_budget = self.max_num_scheduled_tokens - # Spec decode-related. - scheduled_spec_decode_tokens: dict[str, list[int]] = {} - - # Record scheduled LoRA requests. - scheduled_loras: set[int] = set() - - # Use a temporary deque to collect requests that need to be skipped - # and put back at the head of the waiting queue later - skipped_waiting_requests: deque[Request] = deque() - - # Schedule prefill requests first. - while self.waiting and token_budget > 0: - if len(scheduled_new_reqs) == self.max_num_running_reqs: - break - - request = self.waiting[0] - - def skip_cur_request(): - self.waiting.popleft() - skipped_waiting_requests.appendleft(request) - - # Check that adding the request still respects the max_loras - # constraint. - if (self.lora_config and request.lora_request and - (len(scheduled_loras) == self.lora_config.max_loras - and request.lora_request.lora_int_id not in scheduled_loras)): - # Scheduling would exceed max_loras, skip. - skip_cur_request() - continue - - prompt_limit = _get_prompt_limit(self, request) - # Get already-cached tokens. - computed_blocks, num_computed_tokens = ( - self.kv_cache_manager.get_computed_blocks(request)) - num_new_tokens = request.num_prompt_tokens - num_computed_tokens - if (0 < self.scheduler_config.long_prefill_token_threshold < - num_new_tokens): - num_new_tokens = ( - self.scheduler_config.long_prefill_token_threshold) - max_tokens_in_kvcache = (self.kv_cache_config.num_blocks * - self.block_size) - prompt_limit = min(prompt_limit, max_tokens_in_kvcache) - - # Finish request that exceeds prompt_limit or kv cache size. - if num_new_tokens > prompt_limit: - logger.warning( - "Input prompt (%d tokens) is too long" - " and exceeds limit of %d", - num_new_tokens, - prompt_limit, - ) - request.status = RequestStatus.FINISHED_IGNORED - self.finished_req_ids.add(request.request_id) # type: ignore - self.waiting.popleft() - continue - - if num_new_tokens > token_budget: - # Scheduling would exceed token_budget, skip. - skip_cur_request() - continue - - assert num_new_tokens > 0 - watermark = getattr(self.scheduler_config, "watermark", 0.01) - if not _check_watermark_for_prefill(self, - request, num_new_tokens, computed_blocks, watermark): - # Scheduling would exceed watermark, skip. - skip_cur_request() - continue - - new_blocks = self.kv_cache_manager.allocate_slots( - request, num_new_tokens, computed_blocks) - if new_blocks is None: - # The request cannot be scheduled. 
- break - - self.waiting.popleft() - self.running.append(request) - self.scheduled_req_ids.add(request.request_id) - # Check request status. - if request.status == RequestStatus.WAITING: - scheduled_new_reqs.append(request) - elif request.status == RequestStatus.PREEMPTED: - scheduled_resumed_reqs.append(request) - else: - raise RuntimeError(f"Invalid request status: {request.status}") - - if self.lora_config and request.lora_request: - scheduled_loras.add(request.lora_request.lora_int_id) - req_to_new_block_ids[request.request_id] = [ - b.block_id for b in computed_blocks + new_blocks - ] - # Update request info. - num_scheduled_tokens[request.request_id] = num_new_tokens - token_budget -= num_new_tokens - request.status = RequestStatus.RUNNING - request.num_computed_tokens = num_computed_tokens - - # Put back any skipped requests at the head of the waiting queue - if skipped_waiting_requests: - self.waiting.extendleft(skipped_waiting_requests) - - # If no prefill requests are scheduled, - # Schedule decode requests next. - if len(self.scheduled_req_ids) == 0: - req_index = 0 - while req_index < len(self.running) and token_budget > 0: - request = self.running[req_index] - if request.request_id in self.scheduled_req_ids: - # This request has already been scheduled. - req_index += 1 - continue - - num_new_tokens = (request.num_tokens_with_spec - - request.num_computed_tokens) - if (0 < self.scheduler_config.long_prefill_token_threshold < - num_new_tokens): - num_new_tokens = ( - self.scheduler_config.long_prefill_token_threshold) - num_new_tokens = min(num_new_tokens, token_budget) - assert num_new_tokens == 1 - - while True: - new_blocks = self.kv_cache_manager.allocate_slots( - request, num_new_tokens) - if new_blocks is None: - # The request cannot be scheduled. - # Preempt the lowest-priority request. - preempted_req = self.running.pop() - self.kv_cache_manager.free(preempted_req) - preempted_req.status = RequestStatus.PREEMPTED - preempted_req.num_computed_tokens = 0 - self.waiting.appendleft(preempted_req) - preempted_reqs.append(preempted_req) - if preempted_req == request: - # No more request to preempt. - can_schedule = False - break - else: - # The request can be scheduled. - can_schedule = True - break - if not can_schedule: - break - assert new_blocks is not None - - # Schedule the request. - scheduled_running_reqs.append(request) - self.scheduled_req_ids.add(request.request_id) - req_to_new_block_ids[request.request_id] = [ - b.block_id for b in new_blocks - ] - num_scheduled_tokens[request.request_id] = num_new_tokens - token_budget -= num_new_tokens - req_index += 1 - - # Speculative decode related. - if request.spec_token_ids: - num_scheduled_spec_tokens = (num_new_tokens + - request.num_computed_tokens - - request.num_tokens) - if num_scheduled_spec_tokens > 0: - # Trim spec_token_ids list to num_scheduled_spec_tokens. - del request.spec_token_ids[num_scheduled_spec_tokens:] - scheduled_spec_decode_tokens[request.request_id] = ( - request.spec_token_ids) - - # Check if the scheduling constraints are satisfied. - total_num_scheduled_tokens = sum(num_scheduled_tokens.values()) - assert total_num_scheduled_tokens <= self.max_num_scheduled_tokens - assert token_budget >= 0 - assert len(self.running) <= self.max_num_running_reqs - assert len(scheduled_new_reqs) + len(scheduled_resumed_reqs) + len( - scheduled_running_reqs) <= len(self.running) - - # Get the longest common prefix among all requests in the running queue. - # This can be potentially used for cascade attention. 
- num_common_prefix_blocks = 0 - if self.running: - any_request = self.running[0] - num_common_prefix_blocks = ( - self.kv_cache_manager.get_num_common_prefix_blocks( - any_request, len(self.running))) - - # Construct the scheduler output. - new_reqs_data = [ - NewRequestData.from_request(req, - req_to_new_block_ids[req.request_id]) - for req in scheduled_new_reqs - ] - resumed_reqs_data = [ - self._make_cached_request_data( - req, - num_scheduled_tokens[req.request_id], - len(scheduled_spec_decode_tokens.get(req.request_id, ())), - req_to_new_block_ids[req.request_id], - resumed_from_preemption=True, - ) for req in scheduled_resumed_reqs - ] - running_reqs_data = [ - self._make_cached_request_data( - req, - num_scheduled_tokens[req.request_id], - len(scheduled_spec_decode_tokens.get(req.request_id, ())), - req_to_new_block_ids[req.request_id], - resumed_from_preemption=False, - ) for req in scheduled_running_reqs - ] - scheduler_output = SchedulerOutput( - scheduled_new_reqs=new_reqs_data, - scheduled_cached_reqs=resumed_reqs_data + running_reqs_data, - num_scheduled_tokens=num_scheduled_tokens, - total_num_scheduled_tokens=total_num_scheduled_tokens, - scheduled_spec_decode_tokens=scheduled_spec_decode_tokens, - scheduled_encoder_inputs={}, - num_common_prefix_blocks=num_common_prefix_blocks, - # finished_req_ids is an existing state in the scheduler, - # instead of being newly scheduled in this step. - # It contains the request IDs that are finished in between - # the previous and the current steps. - finished_req_ids=self.finished_req_ids, # type: ignore - free_encoder_input_ids=self.encoder_cache_manager.get_freed_ids(), - structured_output_request_ids={}, - grammar_bitmask=None, - ) - - # Advance the number of computed tokens for the request AFTER - # the request is scheduled. - # 1. The scheduler_output of the current step has to include the - # original number of scheduled tokens to determine input IDs. - # 2. Advance the number of computed tokens here allowing us to - # schedule the prefill request again immediately in the next - # scheduling step. - # 3. If some tokens (e.g. spec tokens) are rejected later, the number of - # computed tokens will be adjusted in update_from_output. - for req_id, num_scheduled_token in num_scheduled_tokens.items(): - self.requests[req_id].num_computed_tokens += num_scheduled_token - - self.finished_req_ids = set() # type: ignore - return scheduler_output -- Gitee From a7fedb2aadad90af22ee5a4cab3d7ebbaab8b7b8 Mon Sep 17 00:00:00 2001 From: lvhaoyu Date: Thu, 15 May 2025 10:22:31 +0800 Subject: [PATCH 29/55] Revert "bugfix: add module init" This reverts commit 109edbf79a1405af1916d1ed46fe3020f71f5ed4. 
--- vllm_mindspore/v1/core/sched/__init__.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 vllm_mindspore/v1/core/sched/__init__.py diff --git a/vllm_mindspore/v1/core/sched/__init__.py b/vllm_mindspore/v1/core/sched/__init__.py deleted file mode 100644 index e69de29b..00000000 -- Gitee From 4bdd0c791b67c06575e73b7acbc0472e36a103d6 Mon Sep 17 00:00:00 2001 From: zhanzhan1 Date: Wed, 14 May 2025 00:08:25 +0800 Subject: [PATCH 30/55] Update for EP parallel config changes --- .../models/mf_models/deepseekv3_weight_processor.py | 4 ++-- .../model_executor/models/mf_models/weight_processor.py | 6 ++++-- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/vllm_mindspore/model_executor/models/mf_models/deepseekv3_weight_processor.py b/vllm_mindspore/model_executor/models/mf_models/deepseekv3_weight_processor.py index 966c569a..c68bf4ad 100644 --- a/vllm_mindspore/model_executor/models/mf_models/deepseekv3_weight_processor.py +++ b/vllm_mindspore/model_executor/models/mf_models/deepseekv3_weight_processor.py @@ -939,7 +939,7 @@ class DeepseekV3WeightProcessor(BaseWeightProcessor): requires_grad=False) def get_moe_shared_expert_weight(self, w1_hf_name, w2_hf_name, w3_hf_name, src_hf_dir, hf_weight_map): - if self.ep_method == EPMethod.ALLGATHER: + if self.ep_method in [EPMethod.DEFAULT, EPMethod.ALLGATHER]: w1_ms_param, _ = self.get_safetensor_from_file_split_global_group(w1_hf_name, src_hf_dir, hf_weight_map, split_axis=0) w2_ms_param, _ = self.get_safetensor_from_file_split_global_group(w2_hf_name, src_hf_dir, hf_weight_map, @@ -1290,7 +1290,7 @@ class DeepseekV3WeightProcessor(BaseWeightProcessor): w3_bias_name, w3_scale_name, w2_weight_name, src_hf_dir, hf_weight_map): - if self.ep_method == EPMethod.ALLGATHER: + if self.ep_method in [EPMethod.DEFAULT, EPMethod.ALLGATHER]: w1_weight_param, _ = self.get_safetensor_from_file_split_moe_tp_group(w1_weight_name, src_hf_dir, hf_weight_map, split_axis=0) diff --git a/vllm_mindspore/model_executor/models/mf_models/weight_processor.py b/vllm_mindspore/model_executor/models/mf_models/weight_processor.py index c70302f2..542dbf77 100644 --- a/vllm_mindspore/model_executor/models/mf_models/weight_processor.py +++ b/vllm_mindspore/model_executor/models/mf_models/weight_processor.py @@ -33,6 +33,7 @@ class EPMethod(Enum): ALLTOALL = 'alltoall' ALLGATHER = 'allgather' + class BaseWeightProcessor: r""" Provide model weight load and shards. 
@@ -51,8 +52,9 @@ class BaseWeightProcessor: self.tp_group_size = get_tp_world_size() self.dp_group_size = get_data_parallel_world_size() self.num_router_experts = self.config.moe_config.expert_num if self.config.moe_config.expert_num else 1 - self.moe_ep_size = self.config.moe_config.moe_expert_parallel if self.config.moe_config.moe_expert_parallel else 1 - self.moe_tp_size = self.config.moe_config.moe_tensor_parallel if self.config.moe_config.moe_tensor_parallel else 1 + self.moe_ep_size = self.config.parallel_config.expert_parallel \ + if self.config.parallel_config.expert_parallel else 1 + self.moe_tp_size = self.global_group_size // self.moe_ep_size self.ep_method = EPMethod.DEFAULT if self.dp_group_size > 1 and self.moe_ep_size == self.global_group_size: self.ep_method = EPMethod.ALLTOALL -- Gitee From f7d9fabc5bbf17648d77dc5c7b0fe874c04ba4bb Mon Sep 17 00:00:00 2001 From: lvhaoyu1 Date: Fri, 16 May 2025 00:00:07 +0800 Subject: [PATCH 31/55] update yaml and some code --- .jenkins/test/config/dependent_packages.yaml | 2 +- tests/st/python/config/predict_deepseek_r1_671b.yaml | 4 ---- .../python/config/predict_deepseek_r1_671b_w8a8.yaml | 7 ++----- tests/st/python/test_vllm_deepseek_bf16_part.py | 2 +- tests/st/python/test_vllm_mf_qwen_7b_cp_pc_mss.py | 1 + tests/st/python/test_vllm_qwen_7b.py | 1 + .../model_executor/models/mf_models/mf_model_base.py | 2 +- vllm_mindspore/model_executor/models/model_base.py | 10 ---------- 8 files changed, 7 insertions(+), 22 deletions(-) diff --git a/.jenkins/test/config/dependent_packages.yaml b/.jenkins/test/config/dependent_packages.yaml index cdb73e74..5df5ff1f 100644 --- a/.jenkins/test/config/dependent_packages.yaml +++ b/.jenkins/test/config/dependent_packages.yaml @@ -8,4 +8,4 @@ msadapter: 'https://repo.mindspore.cn/mindspore/msadapter/version/202505/20250514/master_20250514010016_380ecadf0133da436503105d6e8e1db709472fe4_newest/' vllm: - 'https://repo.mindspore.cn/mirrors/vllm/version/202505/20250514/v0.8.4.dev0_newest/any/vllm-0.8.4.dev0+g296c657.d20250514.empty-py3-none-any.whl' + 'https://repo.mindspore.cn/mirrors/vllm/version/202505/20250514/v0.8.4.dev0_newest/' diff --git a/tests/st/python/config/predict_deepseek_r1_671b.yaml b/tests/st/python/config/predict_deepseek_r1_671b.yaml index 112375ef..a4d05570 100644 --- a/tests/st/python/config/predict_deepseek_r1_671b.yaml +++ b/tests/st/python/config/predict_deepseek_r1_671b.yaml @@ -26,10 +26,6 @@ context: device_id: 0 affinity_cpu_list: None -kernel_launch_group: - thread_num: 4 - kernel_group_num: 16 - # parallel context config parallel: parallel_mode: "STAND_ALONE" # use 'STAND_ALONE' mode for inference with parallelism in frontend diff --git a/tests/st/python/config/predict_deepseek_r1_671b_w8a8.yaml b/tests/st/python/config/predict_deepseek_r1_671b_w8a8.yaml index 5a5e9d60..239adc12 100644 --- a/tests/st/python/config/predict_deepseek_r1_671b_w8a8.yaml +++ b/tests/st/python/config/predict_deepseek_r1_671b_w8a8.yaml @@ -14,6 +14,7 @@ trainer: # default parallel of device num = 16 for Atlas 800T A2 parallel_config: + data_parallel: 1 model_parallel: 16 pipeline_stage: 1 expert_parallel: 1 @@ -22,14 +23,10 @@ parallel_config: # mindspore context init config context: mode: 0 # 0--Graph Mode; 1--Pynative Mode - max_device_memory: "61GB" + max_device_memory: "58GB" device_id: 0 affinity_cpu_list: None -kernel_launch_group: - thread_num: 4 - kernel_group_num: 16 - # parallel context config parallel: parallel_mode: "STAND_ALONE" # use 'STAND_ALONE' mode for inference with parallelism in 
frontend diff --git a/tests/st/python/test_vllm_deepseek_bf16_part.py b/tests/st/python/test_vllm_deepseek_bf16_part.py index 86137637..f4db891a 100644 --- a/tests/st/python/test_vllm_deepseek_bf16_part.py +++ b/tests/st/python/test_vllm_deepseek_bf16_part.py @@ -23,7 +23,7 @@ env_vars = { "MINDFORMERS_MODEL_CONFIG": "./config/predict_deepseek_r1_671b.yaml", "ASCEND_CUSTOM_PATH": os.path.expandvars("$ASCEND_HOME_PATH/../"), "vLLM_MODEL_BACKEND": "MindFormers", - "MS_ENABLE_LCCL": "off", + "MS_ENABLE_LCCL": "on", "HCCL_OP_EXPANSION_MODE": "AIV", "ASCEND_RT_VISIBLE_DEVICES": "0,1,2,3,4,5,6,7", "MS_ALLOC_CONF": "enable_vmm:True", diff --git a/tests/st/python/test_vllm_mf_qwen_7b_cp_pc_mss.py b/tests/st/python/test_vllm_mf_qwen_7b_cp_pc_mss.py index df61117d..856932d3 100644 --- a/tests/st/python/test_vllm_mf_qwen_7b_cp_pc_mss.py +++ b/tests/st/python/test_vllm_mf_qwen_7b_cp_pc_mss.py @@ -45,6 +45,7 @@ class TestMfQwen_cp_pc_mss: @pytest.mark.level0 @pytest.mark.platform_arm_ascend910b_training @pytest.mark.env_single + @pytest.mark.skip(reason="cp precision need to be fixed on v0.8.3 V0") def test_mf_qwen_7b_cp_pc_mss(self): """ test case mf_qwen_7b_cp_pc_mss diff --git a/tests/st/python/test_vllm_qwen_7b.py b/tests/st/python/test_vllm_qwen_7b.py index 695e9cb6..028f2ead 100644 --- a/tests/st/python/test_vllm_qwen_7b.py +++ b/tests/st/python/test_vllm_qwen_7b.py @@ -44,6 +44,7 @@ class TestQwen: @pytest.mark.level0 @pytest.mark.platform_arm_ascend910b_training @pytest.mark.env_single + @pytest.mark.skip(reason="qwen need to be supported on v0.8.3 V0") def test_vllm_qwen(self): """ test case qwen2.5 7B diff --git a/vllm_mindspore/model_executor/models/mf_models/mf_model_base.py b/vllm_mindspore/model_executor/models/mf_models/mf_model_base.py index 533d16c7..32cbb916 100644 --- a/vllm_mindspore/model_executor/models/mf_models/mf_model_base.py +++ b/vllm_mindspore/model_executor/models/mf_models/mf_model_base.py @@ -216,7 +216,7 @@ class MfModelBase(MsModelBase): dtype=self.mf_model_config.compute_dtype) else: hidden_states = hidden_states.index_select(0, selected_token_indices) - logits = self.network.lm_head(hidden_states) + logits = self.lm_head(hidden_states) logits = logits.reshape(-1, logits.shape[-1]) else: logits = self.lm_head(hidden_states) diff --git a/vllm_mindspore/model_executor/models/model_base.py b/vllm_mindspore/model_executor/models/model_base.py index 75e4648e..3df62d4c 100644 --- a/vllm_mindspore/model_executor/models/model_base.py +++ b/vllm_mindspore/model_executor/models/model_base.py @@ -66,16 +66,6 @@ class Fake_MLA(Fake_Attention): ] -class Fake_MLA(Fake_Attention): - def __init__(self): - super().__init__() - vllm_config = get_current_vllm_config() - self.kv_cache = [ - (torch.zeros(self.kv_shape, dtype=torch.bfloat16, device="Ascend"),) - for _ in range(vllm_config.parallel_config.pipeline_parallel_size) - ] - - class Fake_Attention_V1(Attention): def __init__(self): vllm_config = get_current_vllm_config() -- Gitee From 5c0e7dae5b7aec636c2391eca656da263827f9de Mon Sep 17 00:00:00 2001 From: moran Date: Thu, 15 May 2025 15:49:36 +0800 Subject: [PATCH 32/55] remove unused set in pyproject.toml --- .../pyproject.toml | 17 ----------------- codecheck_toolkits/vllm_codecheck.sh | 4 ++++ 2 files changed, 4 insertions(+), 17 deletions(-) rename pyproject.toml => codecheck_toolkits/pyproject.toml (85%) diff --git a/pyproject.toml b/codecheck_toolkits/pyproject.toml similarity index 85% rename from pyproject.toml rename to codecheck_toolkits/pyproject.toml index 
65e258dc..8bce21ec 100644 --- a/pyproject.toml +++ b/codecheck_toolkits/pyproject.toml @@ -1,20 +1,3 @@ -[build-system] -# Should be mirrored in requirements-build.txt -requires = [ - "cmake>=3.26", - "ninja", - "packaging", - "setuptools>=61", - "setuptools-scm>=8.0", - "torch == 2.5.1", - "wheel", - "jinja2", -] -build-backend = "setuptools.build_meta" - -[tool.setuptools_scm] -# version_file = "vllm/_version.py" # currently handled by `setup.py:get_version()` - [tool.ruff] # Allow lines to be as long as 80. line-length = 80 diff --git a/codecheck_toolkits/vllm_codecheck.sh b/codecheck_toolkits/vllm_codecheck.sh index 928c70bd..201b4d52 100644 --- a/codecheck_toolkits/vllm_codecheck.sh +++ b/codecheck_toolkits/vllm_codecheck.sh @@ -1,5 +1,7 @@ pip install -r codecheck_toolkits/requirements-lint.txt +ln -s codecheck_toolkits/pyproject.toml pyproject.toml + RET_FLAG=0 # yapf check @@ -69,4 +71,6 @@ else echo "mypy check success." fi +rm -f pyproject.toml + exit $RET_FLAG -- Gitee From 2b3fee5149e3b08edf5f306c6aa8dcc4c3220ffb Mon Sep 17 00:00:00 2001 From: zlq2020 Date: Wed, 7 May 2025 21:20:16 +0800 Subject: [PATCH 33/55] remove inductor_pass --- vllm_mindspore/__init__.py | 6 +- vllm_mindspore/compilation/__init__.py | 0 vllm_mindspore/compilation/inductor_pass.py | 73 ------------------- .../model_executor/models/registry.py | 3 - 4 files changed, 1 insertion(+), 81 deletions(-) delete mode 100644 vllm_mindspore/compilation/__init__.py delete mode 100644 vllm_mindspore/compilation/inductor_pass.py diff --git a/vllm_mindspore/__init__.py b/vllm_mindspore/__init__.py index 3aae2091..d7515632 100644 --- a/vllm_mindspore/__init__.py +++ b/vllm_mindspore/__init__.py @@ -29,11 +29,7 @@ if "vllm" in sys.modules: from vllm_mindspore.scripts import env_setup env_setup() -# 2. replace the inductor_pass module before import vllm. -from vllm_mindspore.compilation import inductor_pass as ms_inductor_pass -sys.modules.update({"vllm.compilation.inductor_pass": ms_inductor_pass}) - -# 3. update the log configuration ahead of other modifications. +# 2. update the log configuration ahead of other modifications. import vllm_mindspore.logger from vllm_mindspore.platforms.ascend import AscendPlatform diff --git a/vllm_mindspore/compilation/__init__.py b/vllm_mindspore/compilation/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/vllm_mindspore/compilation/inductor_pass.py b/vllm_mindspore/compilation/inductor_pass.py deleted file mode 100644 index 835a1e11..00000000 --- a/vllm_mindspore/compilation/inductor_pass.py +++ /dev/null @@ -1,73 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 - -import hashlib -import inspect -import json -import types -from typing import Any, Callable, Dict, Optional, Union - -import torch -from packaging.version import Version - - -class InductorPass(): - """ - A custom graph pass that uses a hash of its source as the UUID. - This is defined as a convenience and should work in most cases. - """ - - def uuid(self) -> Any: - """ - Provide a unique identifier for the pass, used in Inductor code cache. - This should depend on the pass implementation, so that changes to the - pass result in recompilation. - By default, the object source is hashed. - """ - return InductorPass.hash_source(self) - - @staticmethod - def hash_source(*srcs: Union[str, Any]): - """ - Utility method to hash the sources of functions or objects. - :param srcs: strings or objects to add to the hash. - Objects and functions have their source inspected. 
- :return: - """ - hasher = hashlib.sha256() - for src in srcs: - if isinstance(src, str): - src_str = src - elif isinstance(src, types.FunctionType): - src_str = inspect.getsource(src) - else: - src_str = inspect.getsource(src.__class__) - hasher.update(src_str.encode("utf-8")) - return hasher.hexdigest() - - @staticmethod - def hash_dict(dict_: Dict[Any, Any]): - """ - Utility method to hash a dictionary, can alternatively be used for uuid. - :return: A sha256 hash of the json rep of the dictionary. - """ - encoded = json.dumps(dict_, sort_keys=True).encode("utf-8") - return hashlib.sha256(encoded).hexdigest() - - -class CallableInductorPass(InductorPass): - """ - This class is a wrapper for a callable that automatically provides an - implementation of the UUID. - """ - - def __init__(self, - callable, - uuid: Optional[Any] = None): - self.callable = callable - self._uuid = self.hash_source(callable) if uuid is None else uuid - - def __call__(self, graph): - self.callable(graph) - - def uuid(self) -> Any: - return self._uuid diff --git a/vllm_mindspore/model_executor/models/registry.py b/vllm_mindspore/model_executor/models/registry.py index d58130ef..0f01305e 100644 --- a/vllm_mindspore/model_executor/models/registry.py +++ b/vllm_mindspore/model_executor/models/registry.py @@ -67,9 +67,6 @@ _SUBPROCESS_COMMAND = [ def _run() -> None: - import vllm_mindspore.compilation.inductor_pass as ms_inductor_pass - sys.modules["vllm.compilation.inductor_pass"] = ms_inductor_pass - # Setup plugins from vllm.plugins import load_general_plugins load_general_plugins() -- Gitee From 9da74957ee5981f7e8fb0bd4669a6c312721b230 Mon Sep 17 00:00:00 2001 From: twc Date: Fri, 16 May 2025 16:12:58 +0800 Subject: [PATCH 34/55] code sync from master 1. fix fa and pa mask 2. Support for enable_prefix_caching --- tests/mindformers | 2 +- .../st/python/test_vllm_deepseek_bf16_part.py | 2 +- tests/st/python/test_vllm_deepseek_part.py | 2 +- .../python/test_vllm_deepseek_smoothquant.py | 2 +- vllm_mindspore/attention/layer.py | 11 ++- .../model_executor/layers/logits_processor.py | 18 +++-- vllm_mindspore/model_executor/layers/utils.py | 2 + .../models/{mf_models => }/attention_mask.py | 56 ++++++++++---- .../models/mf_models/deepseek_v3.py | 3 + .../models/mf_models/mf_model_base.py | 9 +-- .../model_executor/models/model_base.py | 43 ----------- vllm_mindspore/model_executor/models/qwen2.py | 73 ++++++++++++++++--- vllm_mindspore/utils.py | 9 +-- 13 files changed, 137 insertions(+), 95 deletions(-) rename vllm_mindspore/model_executor/models/{mf_models => }/attention_mask.py (39%) diff --git a/tests/mindformers b/tests/mindformers index 16587217..bbddc170 160000 --- a/tests/mindformers +++ b/tests/mindformers @@ -1 +1 @@ -Subproject commit 165872172ae5396cb4b66629614c85ff21038e11 +Subproject commit bbddc170167ac6705f07bf0aea25977e10f8d760 diff --git a/tests/st/python/test_vllm_deepseek_bf16_part.py b/tests/st/python/test_vllm_deepseek_bf16_part.py index f4db891a..97904c7f 100644 --- a/tests/st/python/test_vllm_deepseek_bf16_part.py +++ b/tests/st/python/test_vllm_deepseek_bf16_part.py @@ -61,7 +61,7 @@ class TestDeepSeek: # Create an LLM. llm = LLM(model="/home/workspace/mindspore_dataset/weight/DeepSeek-R1-bf16", - trust_remote_code=True, gpu_memory_utilization=0.9, tensor_parallel_size=8) + trust_remote_code=True, gpu_memory_utilization=0.9, tensor_parallel_size=8, max_model_len=4096) # Generate texts from the prompts. 
The output is a list of RequestOutput objects # that contain the prompt, generated text, and other information. outputs = llm.generate(prompts, sampling_params) diff --git a/tests/st/python/test_vllm_deepseek_part.py b/tests/st/python/test_vllm_deepseek_part.py index f88ece2c..c882ae58 100644 --- a/tests/st/python/test_vllm_deepseek_part.py +++ b/tests/st/python/test_vllm_deepseek_part.py @@ -64,7 +64,7 @@ class TestDeepSeek: # Create an LLM. llm = LLM(model="/home/workspace/mindspore_dataset/weight/DeepSeek-R1-W8A8", - trust_remote_code=True, gpu_memory_utilization=0.9, tensor_parallel_size=8) + trust_remote_code=True, gpu_memory_utilization=0.9, tensor_parallel_size=8, max_model_len=4096) # Generate texts from the prompts. The output is a list of RequestOutput objects # that contain the prompt, generated text, and other information. outputs = llm.generate(prompts, sampling_params) diff --git a/tests/st/python/test_vllm_deepseek_smoothquant.py b/tests/st/python/test_vllm_deepseek_smoothquant.py index 7e3397e0..c79222d9 100644 --- a/tests/st/python/test_vllm_deepseek_smoothquant.py +++ b/tests/st/python/test_vllm_deepseek_smoothquant.py @@ -64,7 +64,7 @@ class TestDeepSeek: # Create an LLM. llm = LLM(model="/home/workspace/mindspore_dataset/weight/DeepSeek-R1-W8A8-smoothquant", - trust_remote_code=True, gpu_memory_utilization=0.9, tensor_parallel_size=8) + trust_remote_code=True, gpu_memory_utilization=0.9, tensor_parallel_size=8, max_model_len=4096) # Generate texts from the prompts. The output is a list of RequestOutput objects # that contain the prompt, generated text, and other information. outputs = llm.generate(prompts, sampling_params) diff --git a/vllm_mindspore/attention/layer.py b/vllm_mindspore/attention/layer.py index 4634727b..89914e97 100644 --- a/vllm_mindspore/attention/layer.py +++ b/vllm_mindspore/attention/layer.py @@ -157,11 +157,10 @@ class Attention(nn.Cell): value_cache: Tensor, is_prefill: bool, slot_mapping: Tensor, - batch_valid_length: Tuple[int], + attn_mask: Tensor, + batch_valid_length: Tensor, q_seq_lens: Tensor, block_tables: Tensor, - attn_mask: Tensor, - decode_mask: Tensor, ) -> Tensor: """Attention foward, support MHA and GQA. @@ -181,7 +180,7 @@ class Attention(nn.Cell): output = self._run_prefill_forward(query, key, value, attn_mask, batch_valid_length, batch_valid_length) else: output = self._run_decode_forward(query, key_cache, value_cache, block_tables, batch_valid_length, - decode_mask, q_seq_lens) + attn_mask, q_seq_lens) return output def _run_prefill_forward( @@ -228,7 +227,7 @@ class Attention(nn.Cell): value_cache: Tensor, block_tables: Tensor, batch_valid_length: Tensor, - decode_mask: Tensor, + attn_mask: Tensor, q_seq_lens: Tensor, ) -> Tensor: """Decode with PagedAttention. @@ -248,7 +247,7 @@ class Attention(nn.Cell): batch_valid_length, None, None, - decode_mask, + attn_mask, q_seq_lens ) return output diff --git a/vllm_mindspore/model_executor/layers/logits_processor.py b/vllm_mindspore/model_executor/layers/logits_processor.py index 647b4ac8..75f35d6d 100644 --- a/vllm_mindspore/model_executor/layers/logits_processor.py +++ b/vllm_mindspore/model_executor/layers/logits_processor.py @@ -41,6 +41,7 @@ if envs.VLLM_LOGITS_PROCESSOR_THREADS is not None: _logits_processor_threadpool = ThreadPoolExecutor( envs.VLLM_LOGITS_PROCESSOR_THREADS) + class LogitsProcessor(nn.Cell): """Process logits and apply logits processors from sampling metadata. 
@@ -88,6 +89,8 @@ class LogitsProcessor(nn.Cell): logits = hidden_states else: if sampling_metadata is not None: + if sampling_metadata.selected_token_indices.numel() <= 0: + return mint.zeros((0, self.vocab_size), dtype=hidden_states.dtype) hidden_states = _prune_hidden_states(hidden_states, sampling_metadata) # Get the logits for the next tokens. @@ -102,7 +105,7 @@ class LogitsProcessor(nn.Cell): logits *= self.scale # Apply logits processors (if any). - if sampling_metadata is not None: + if sampling_metadata.seq_groups is not None: logits = _apply_logits_processors(logits, sampling_metadata) return logits @@ -146,10 +149,10 @@ def _prune_hidden_states( # NOTE(kzawora): The if guard is needed for Gaudi - in some scenarios # (warmup, profile_run) we might not have selected_token_indices, # so we skip pruning. - if sampling_metadata.selected_token_indices is not None: - return ops.gather(hidden_states, sampling_metadata.selected_token_indices, 0) - else: - return hidden_states + indices = sampling_metadata.selected_token_indices + if indices is not None and indices.numel() > 0: + return mint.index_select(hidden_states, 0, sampling_metadata.selected_token_indices) + return hidden_states def _apply_logits_processors( @@ -187,7 +190,7 @@ def _apply_logits_processors( logits_processed += len(seq_group.sample_indices) + len( seq_group.prompt_logprob_indices ) - + for logits_row_idx, future in logits_row_ids_and_logits_row_futures: logits[logits_row_idx] = future.result() @@ -196,6 +199,7 @@ def _apply_logits_processors( assert logits_processed == logits.shape[0] return logits + def _apply_logits_processors_single_seq(logits_row, logits_processors, past_tokens_ids, prompt_tokens_ids) -> Tensor: @@ -206,4 +210,4 @@ def _apply_logits_processors_single_seq(logits_row, logits_processors, logits_row) else: logits_row = logits_processor(past_tokens_ids, logits_row) - return logits_row \ No newline at end of file + return logits_row diff --git a/vllm_mindspore/model_executor/layers/utils.py b/vllm_mindspore/model_executor/layers/utils.py index bbef8d9c..0edf165c 100644 --- a/vllm_mindspore/model_executor/layers/utils.py +++ b/vllm_mindspore/model_executor/layers/utils.py @@ -53,6 +53,8 @@ def apply_penalties(logits: torch.Tensor, prompt_tokens_tensor: torch.Tensor, frequency_penalties: The frequency penalties of shape (num_seqs, ) repetition_penalties: The repetition penalties of shape (num_seqs, ) """ + if logits.numel() <= 0: + return logits num_seqs, vocab_size = logits.shape _, prompt_mask = get_token_bin_counts_and_mask(prompt_tokens_tensor, vocab_size, num_seqs) diff --git a/vllm_mindspore/model_executor/models/mf_models/attention_mask.py b/vllm_mindspore/model_executor/models/attention_mask.py similarity index 39% rename from vllm_mindspore/model_executor/models/mf_models/attention_mask.py rename to vllm_mindspore/model_executor/models/attention_mask.py index 10fcd25e..40be1f46 100644 --- a/vllm_mindspore/model_executor/models/mf_models/attention_mask.py +++ b/vllm_mindspore/model_executor/models/attention_mask.py @@ -18,36 +18,66 @@ infer attention mask. """ import numpy as np -import mindspore as ms -from mindspore import Tensor, JitConfig, Model +from mindspore import Tensor, mint +from mindspore import dtype as mstype + +r""" +PA:ASD-V2.1.5 +1.MLA + Q_seqlen =1: no mask.(BF16 mask(0/-10000), FP16 mask(0/-10000)). 
+2.MLA + Q_seqlen > 1: (MTP/PC/CP), BF16 mask(0/1), FP16 mask (0/-10000) +3.normal + Q_seqlen=1: no mask +4.normal + Q_seqlen > 1: (MTP/PC/CP),BF16 mask(0/-10000), FP16 mask(0/-10000).; + +FA:ASD-V2.1.5 +1.MLA: not implement; +2.normal: mask BF16(0/1), FP16 mask(0/-10000); +""" class LowerTriangularMask: r""" Provide Infer model attention mask. Args: - mf_model_config (MF Config): The config of Infer model. - + dtype (ms.dtype): The compute type of Infer model. + max_model_len (int): The max model length of Infer model. """ - def __init__(self, mf_model_config): - compute_dtype = mf_model_config.compute_dtype - seq_length = mf_model_config.seq_length - self.prefill_mask = Tensor(np.triu(np.ones(shape=(128, 128), dtype=np.float16), k=1), dtype=compute_dtype) + def __init__(self, dtype, max_model_len): + self.dtype = dtype + self.max_model_len = max_model_len + + prefill_mask_coeff = 1.0 if self.dtype is mstype.bfloat16 else -10000.0 - self.decode_mask = Tensor(np.triu(np.ones(shape=(seq_length, seq_length), dtype=np.int8), k=1), - dtype=compute_dtype) + self.prefill_mask = Tensor(np.triu(np.ones(shape=(128, 128), dtype=np.float16), k=1) * prefill_mask_coeff, + dtype=self.dtype) - self.hard_mask = Tensor([0], dtype=compute_dtype).reshape(1, 1) + self.decode_mask = Tensor(np.triu(np.ones(shape=(self.max_model_len, self.max_model_len), dtype=np.int8), k=1), + dtype=self.dtype) * -10000 - self.gather = ms.ops.Gather() + self.hard_mask = mint.zeros((1, 1), dtype=dtype) def gen_attention_mask(self, is_prefill, position_ids, query_lens): if is_prefill: attention_mask = self.prefill_mask else: if max(query_lens) > 1: - attention_mask = self.gather(self.decode_mask, position_ids, 0) + attention_mask = mint.index_select(self.decode_mask, 0, position_ids) else: attention_mask = self.hard_mask return attention_mask + + +class MLALowerTriangularMask(LowerTriangularMask): + r""" + Provide MLA Infer model attention mask. + Args: + dtype (ms.dtype): The compute type of Infer model. + max_model_len (int): The max model length of Infer model. 
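+
+    Illustrative usage (a sketch; the position and length tensors are
+    placeholders, not part of the original code):
+        mask = MLALowerTriangularMask(dtype=mstype.bfloat16, max_model_len=4096)
+        attn_mask = mask.gen_attention_mask(is_prefill, position_ids, query_lens)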
+ """ + + def __init__(self, dtype, max_model_len): + + super().__init__(dtype, max_model_len) + decode_mask_coeff = 1.0 if self.dtype is mstype.bfloat16 else -10000.0 + self.decode_mask = Tensor(np.triu(np.ones(shape=(self.max_model_len, self.max_model_len), dtype=np.int8), k=1), + dtype=self.dtype) * decode_mask_coeff diff --git a/vllm_mindspore/model_executor/models/mf_models/deepseek_v3.py b/vllm_mindspore/model_executor/models/mf_models/deepseek_v3.py index 3a1b2be6..67b80339 100644 --- a/vllm_mindspore/model_executor/models/mf_models/deepseek_v3.py +++ b/vllm_mindspore/model_executor/models/mf_models/deepseek_v3.py @@ -53,6 +53,7 @@ from vllm_mindspore.model_executor.layers.sampler import get_sampler from vllm_mindspore.model_executor.models.model_base import Fake_MLA, Fake_MLA_V1 from vllm_mindspore.model_executor.models.mf_models.mf_model_base import MfModelBase from vllm_mindspore.model_executor.models.mf_models.deepseekv3_weight_processor import DeepseekV3WeightProcessor +from vllm_mindspore.model_executor.models.attention_mask import MLALowerTriangularMask logger = init_logger(__name__) @@ -136,6 +137,8 @@ class DeepseekV3ForCausalLM(MfModelBase): self.set_flags = False set_runtime_kernel_launch_group() + self.casual_mask = MLALowerTriangularMask(dtype=self.mf_model_config.compute_dtype, + max_model_len=self.model_config.max_model_len) def _generate_model_config(self): self.mf_config.load_checkpoint = self.get_model_path() diff --git a/vllm_mindspore/model_executor/models/mf_models/mf_model_base.py b/vllm_mindspore/model_executor/models/mf_models/mf_model_base.py index 32cbb916..d11f776a 100644 --- a/vllm_mindspore/model_executor/models/mf_models/mf_model_base.py +++ b/vllm_mindspore/model_executor/models/mf_models/mf_model_base.py @@ -17,11 +17,9 @@ # ============================================================================ import os -from types import MethodType -from typing import Iterable, List, Optional, Set, Tuple, Union +from typing import Iterable, Optional, Set, Tuple, Union from abc import abstractmethod import numpy as np -import math from vllm.config import VllmConfig from vllm.model_executor.layers.sampler import SamplerOutput @@ -41,7 +39,7 @@ from mindformers.core.context import build_mf_context from mindformers.core.parallel_config import build_parallel_config from vllm_mindspore.model_executor.models.model_base import MsModelBase -from vllm_mindspore.model_executor.models.mf_models.attention_mask import LowerTriangularMask +from vllm_mindspore.model_executor.models.attention_mask import LowerTriangularMask from vllm_mindspore.v1.attention.backends.flash_attn import FlashAttentionMetadata logger = init_logger(__name__) @@ -64,7 +62,8 @@ class MfModelBase(MsModelBase): self.mf_config.model.model_config.parallel_config.pipeline_stage = 1 self._generate_model_config() - self.casual_mask = LowerTriangularMask(mf_model_config=self.mf_model_config) + self.casual_mask = LowerTriangularMask(dtype=self.mf_model_config.compute_dtype, + max_model_len=self.model_config.max_model_len) self.network, self.lm_head = self._create_network() affinity_config = self.mf_config.get('context', {}).get('affinity_cpu_list', {}) diff --git a/vllm_mindspore/model_executor/models/model_base.py b/vllm_mindspore/model_executor/models/model_base.py index 3df62d4c..7aa1de4e 100644 --- a/vllm_mindspore/model_executor/models/model_base.py +++ b/vllm_mindspore/model_executor/models/model_base.py @@ -32,9 +32,7 @@ from vllm.attention.layer import Attention import torch from mindspore import 
Tensor, nn, mutable -from mindspore import dtype as mstype -from vllm_mindspore.utils import STR_DTYPE_TO_MS_DTYPE class Fake_Attention: def __init__(self): @@ -220,47 +218,6 @@ class MsModelBase(): ) -> Union[Tensor, IntermediateTensors]: raise NotImplementedError - def set_model_inputs(self, is_prefill): - dyn_input_ids = Tensor(shape=[None, None], dtype=mstype.int64) - dyn_position_ids = Tensor(shape=[None], dtype=mstype.int64) - - block_size = self.cache_config.block_size - num_kv_heads = self.model_config.get_num_kv_heads(self.parallel_config) - head_size = self.model_config.get_head_size() - kv_cache_shape = (None, block_size, num_kv_heads, head_size) - - kv_cache_dtype = self.model_config.dtype if self.cache_config.cache_dtype == "auto" \ - else self.cache_config.cache_dtype - if kv_cache_dtype in STR_DTYPE_TO_MS_DTYPE: - kv_cache_dtype = STR_DTYPE_TO_MS_DTYPE[kv_cache_dtype] - - num_layers = self.model_config.get_num_layers(self.parallel_config) - - dyn_key_cache = mutable(Tensor(shape=kv_cache_shape, dtype=kv_cache_dtype)) - dyn_value_cache = mutable(Tensor(shape=kv_cache_shape, dtype=kv_cache_dtype)) - dyn_key_caches = mutable([dyn_key_cache for _ in range(num_layers)]) - dyn_value_caches = mutable([dyn_value_cache for _ in range(num_layers)]) - - dyn_batch_valid_length = Tensor(shape=[None, ], dtype=mstype.int32) - dyn_q_seq_lens = Tensor(shape=[None, ], dtype=mstype.int32) - dyn_slot_mapping = Tensor(shape=[None, ], dtype=mstype.int32) - dyn_block_tables = Tensor(shape=[None, None], dtype=mstype.int32) - dyn_intermediate_tensors = None - dyn_inputs_embeds = None - - self.model.set_inputs( - dyn_input_ids, - dyn_position_ids, - dyn_key_caches, - dyn_value_caches, - is_prefill, - dyn_slot_mapping, - dyn_batch_valid_length, - dyn_q_seq_lens, - dyn_block_tables, - dyn_intermediate_tensors, - dyn_inputs_embeds - ) def get_kvcache(self): key_cache = [] diff --git a/vllm_mindspore/model_executor/models/qwen2.py b/vllm_mindspore/model_executor/models/qwen2.py index 32d9da8d..5eb70a82 100644 --- a/vllm_mindspore/model_executor/models/qwen2.py +++ b/vllm_mindspore/model_executor/models/qwen2.py @@ -15,7 +15,6 @@ # See the License for the specific language governing permissions and # limitations under the License. 
# ============================================================================ -from vllm.config import get_current_vllm_config from typing import TYPE_CHECKING, Dict, List, Optional, Set, Tuple, Union, Iterable if TYPE_CHECKING: @@ -25,7 +24,7 @@ else: import numpy as np -from mindspore import Parameter, Tensor, mint, nn, jit, ops +from mindspore import Parameter, Tensor, mint, nn, jit, ops, mutable from mindspore.common import dtype as mstype @@ -49,6 +48,8 @@ from vllm_mindspore.model_executor.models.utils import ( maybe_prefix) from vllm_mindspore.model_executor.sampling_metadata import SamplingMetadata from vllm_mindspore.model_executor.models.model_base import MsModelBase, Fake_Attention +from vllm_mindspore.model_executor.models.attention_mask import LowerTriangularMask +from vllm_mindspore.utils import STR_DTYPE_TO_MS_DTYPE from vllm.config import CacheConfig, VllmConfig @@ -173,8 +174,6 @@ class Qwen2Attention(nn.Cell): prefix=f"{prefix}.attn", attn_type=attn_type ) - self.attn_mask = mint.triu(mint.ones(size=(128, 128), dtype=mstype.bfloat16), 1) - self.hard_mask = Tensor([0], dtype=mstype.bfloat16).reshape(1, 1) @jit def construct( @@ -185,15 +184,16 @@ class Qwen2Attention(nn.Cell): value_cache: Tensor, is_prefill: bool, slot_mapping: Tensor, - batch_valid_length: Tuple[int], + attn_mask: Tensor, + batch_valid_length: Tensor, q_seq_lens: Tensor, block_tables: Tensor, ) -> Tensor: qkv, _ = self.qkv_proj(hidden_states) q, k, v = mint.split(qkv, (self.q_size, self.kv_size, self.kv_size), -1) - q, k = self.rotary_emb(positions, q, k, q_seq_lens, is_prefill) - attn_output = self.attn(q, k, v, key_cache, value_cache, is_prefill, slot_mapping, batch_valid_length, - q_seq_lens, block_tables, self.attn_mask, self.hard_mask) + q, k = self.rotary_emb(positions, q, k, batch_valid_length, is_prefill) + attn_output = self.attn(q, k, v, key_cache, value_cache, is_prefill, slot_mapping, attn_mask, + batch_valid_length, q_seq_lens, block_tables) output, _ = self.o_proj(attn_output) return output @@ -257,7 +257,8 @@ class Qwen2DecoderLayer(nn.Cell): value_cache: Tensor, is_prefill: bool, slot_mapping: Tensor, - batch_valid_length: Tuple[int], + attn_mask: Tensor, + batch_valid_length: Tensor, q_seq_lens: Tensor, block_tables: Tensor, residual: Optional[Tensor], @@ -275,6 +276,7 @@ class Qwen2DecoderLayer(nn.Cell): value_cache, is_prefill, slot_mapping, + attn_mask, batch_valid_length, q_seq_lens, block_tables @@ -342,6 +344,7 @@ class Qwen2Model(nn.Cell): value_caches: List[Tensor], is_prefill: bool, slot_mapping: Tensor, + attn_mask: Tensor, batch_valid_length: Tensor, q_seq_lens: Tensor, block_tables: Tensor, @@ -367,6 +370,7 @@ class Qwen2Model(nn.Cell): value_caches[i - self.start_layer], is_prefill, slot_mapping, + attn_mask, batch_valid_length, q_seq_lens, block_tables, @@ -486,6 +490,9 @@ class Qwen2ForCausalLM(MsModelBase): self.set_modules({"model": self.model, "lm_head": self.lm_head}) self.prefill = True + self.mstype = STR_DTYPE_TO_MS_DTYPE.get(self.model_config.dtype, self.model_config.dtype) + self.casual_mask = LowerTriangularMask(dtype=self.mstype, + max_model_len=self.model_config.max_model_len) self.set_model_inputs(self.prefill) self.kv_caches = [Fake_Attention() for i in range(config.num_hidden_layers)] compilation_config = vllm_config.compilation_config @@ -495,8 +502,47 @@ class Qwen2ForCausalLM(MsModelBase): for i in range(config.num_hidden_layers): compilation_config.static_forward_context[str(i)] = self.kv_caches[i] - def get_input_embeddings(self, input_ids: Tensor) -> 
Tensor: - return self.model.get_input_embeddings(input_ids) + def set_model_inputs(self, is_prefill): + dyn_input_ids = Tensor(shape=[None, None], dtype=mstype.int64) + dyn_position_ids = Tensor(shape=[None], dtype=mstype.int64) + + block_size = self.cache_config.block_size + num_kv_heads = self.model_config.get_num_kv_heads(self.parallel_config) + head_size = self.model_config.get_head_size() + kv_cache_shape = (None, block_size, num_kv_heads, head_size) + + kv_cache_dtype = self.model_config.dtype if self.cache_config.cache_dtype == "auto" \ + else self.cache_config.cache_dtype + kv_cache_dtype = STR_DTYPE_TO_MS_DTYPE[kv_cache_dtype] + + num_layers = self.model_config.get_num_layers(self.parallel_config) + + dyn_key_cache = Tensor(shape=kv_cache_shape, dtype=kv_cache_dtype) + dyn_value_cache = Tensor(shape=kv_cache_shape, dtype=kv_cache_dtype) + dyn_key_caches = mutable([dyn_key_cache for _ in range(num_layers)]) + dyn_value_caches = mutable([dyn_value_cache for _ in range(num_layers)]) + + dyn_slot_mapping = Tensor(shape=[None, ], dtype=mstype.int32) + dynamic_attention_mask = Tensor(shape=[None, None], dtype=self.mstype) + dyn_batch_valid_length = Tensor(shape=[None,], dtype=mstype.int32) + dyn_q_seq_lens = Tensor(shape=[None, ], dtype=mstype.int32) + dyn_block_tables = Tensor(shape=[None, None], dtype=mstype.int32) + dyn_intermediate_tensors = None + dyn_inputs_embeds = None + self.model.set_inputs( + dyn_input_ids, + dyn_position_ids, + dyn_key_caches, + dyn_value_caches, + is_prefill, + dyn_slot_mapping, + dynamic_attention_mask, + dyn_batch_valid_length, + dyn_q_seq_lens, + dyn_block_tables, + dyn_intermediate_tensors, + dyn_inputs_embeds + ) def forward( self, @@ -535,7 +581,9 @@ class Qwen2ForCausalLM(MsModelBase): self.set_model_inputs(self.prefill) slot_mapping = attn_metadata.slot_mapping - batch_valid_length = Tensor.from_numpy(np.array(attn_metadata.seq_lens, dtype=np.int32)) + attn_mask = self.casual_mask.gen_attention_mask(is_prefill, positions, query_lens) + seq_lens_np = np.array(attn_metadata.seq_lens, dtype=np.int32) + batch_valid_length = Tensor.from_numpy(seq_lens_np) q_seq_lens = Tensor.from_numpy(np.array(attn_metadata.query_lens, dtype=np.int32)) block_tables = attn_metadata.block_tables model_output = self.model(input_ids, @@ -544,6 +592,7 @@ class Qwen2ForCausalLM(MsModelBase): value_cache, is_prefill, slot_mapping, + attn_mask, batch_valid_length, q_seq_lens, block_tables, diff --git a/vllm_mindspore/utils.py b/vllm_mindspore/utils.py index e0fc3b04..da942479 100644 --- a/vllm_mindspore/utils.py +++ b/vllm_mindspore/utils.py @@ -222,6 +222,10 @@ def check_ready(): # Common environment variables of predict. set_context(jit_config={"jit_level": "O0", "infer_boost": "on"}) + default_env = { + "MS_INTERNAL_DISABLE_CUSTOM_KERNEL_LIST": "FlashAttentionScore,PagedAttention", + } + env_setup(default_env) if os.getenv("MS_MEMPOOL_BLOCK_SIZE"): set_context(mempool_block_size=f"{os.environ['MS_MEMPOOL_BLOCK_SIZE']}GB") @@ -236,11 +240,6 @@ def check_ready(): 'For "MindFormers" model backend, environments %s should be set!' 
% str(lost_envs) ) - - mindformers_default_env = { - "MS_INTERNAL_DISABLE_CUSTOM_KERNEL_LIST": "FlashAttentionScore,PagedAttention", - } - env_setup(mindformers_default_env) else: logger.info("Run with native model backend!") -- Gitee From 44eb435f2ce1a62984fe84f0aeef2de3671ef6dc Mon Sep 17 00:00:00 2001 From: lvhaoyu Date: Fri, 16 May 2025 14:47:53 +0800 Subject: [PATCH 35/55] add testcase for v1 --- tests/st/python/test_sampler_v1.py | 569 ++++++++++++++++++ .../python/test_vllm_deepseek_bf16_part_v1.py | 76 +++ tests/st/python/test_vllm_deepseek_part_v1.py | 79 +++ .../test_vllm_mf_qwen_7b_chunk_prefill_v1.py | 89 +++ .../test_vllm_mf_qwen_7b_prefix_caching_v1.py | 83 +++ tests/st/python/test_vllm_mf_qwen_7b_v1.py | 119 ++++ 6 files changed, 1015 insertions(+) create mode 100644 tests/st/python/test_sampler_v1.py create mode 100644 tests/st/python/test_vllm_deepseek_bf16_part_v1.py create mode 100644 tests/st/python/test_vllm_deepseek_part_v1.py create mode 100644 tests/st/python/test_vllm_mf_qwen_7b_chunk_prefill_v1.py create mode 100644 tests/st/python/test_vllm_mf_qwen_7b_prefix_caching_v1.py create mode 100644 tests/st/python/test_vllm_mf_qwen_7b_v1.py diff --git a/tests/st/python/test_sampler_v1.py b/tests/st/python/test_sampler_v1.py new file mode 100644 index 00000000..af2cd483 --- /dev/null +++ b/tests/st/python/test_sampler_v1.py @@ -0,0 +1,569 @@ +#!/usr/bin/env python3 +# encoding: utf-8 +# Copyright 2025 Huawei Technologies Co., Ltd +# Copyright 2025 The vLLM team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================ +from typing import Optional + +import numpy as np +import pytest +import torch + +import vllm_mindspore +from vllm.utils import make_tensor_with_pad +from vllm.v1.sample.metadata import SamplingMetadata +from vllm.v1.sample.sampler import Sampler + +VOCAB_SIZE = 1024 +NUM_OUTPUT_TOKENS = 20 +CUDA_DEVICES = [ + f"cuda:{0}" +] +MAX_NUM_PROMPT_TOKENS = 64 + + +def _create_fake_logits(batch_size: int, vocab_size: int) -> torch.Tensor: + fake_logits = torch.full((batch_size, vocab_size), 1e-2, dtype=torch.float) + return fake_logits + + +def _create_penalty_tensor(batch_size: int, penalty_value: float, + device: torch.device) -> torch.Tensor: + return torch.full((batch_size, ), + fill_value=penalty_value, + dtype=torch.float, + device=device) + + +def _create_prompt_tokens_tensor( + prompt_token_ids: list[list[int]], + vocab_size: int, + device: torch.device, +) -> torch.Tensor: + return make_tensor_with_pad( + prompt_token_ids, + pad=vocab_size, + device=device, + dtype=torch.int64, + pin_memory=False, + ) + + +def _create_logit_bias( + batch_size: int, + vocab_size: int, + bias_value: float, +) -> list[Optional[dict[int, float]]]: + res: list[Optional[dict[int, float]]] = [] + for i in range(batch_size): + logit_bias = {min(i, vocab_size - 1): bias_value} + res.append(logit_bias) + return res + + +def _create_allowed_token_ids( + batch_size: int, + vocab_size: int, + num_allowed_token_ids: int, + device: torch.device, +) -> Optional[torch.Tensor]: + mask: Optional[torch.Tensor] = None + for i in range(batch_size): + if i % 2 == 1: + continue + if mask is None: + mask = torch.zeros((batch_size, vocab_size), + dtype=torch.bool, + device=device) + start = min(i, vocab_size - 1) + end = min(i + num_allowed_token_ids, vocab_size - 1) + mask[i, start:end] = True + return mask + + +def _create_bad_words_token_ids( + batch_size: int, vocab_size: int, + bad_words_lengths: list[tuple[int]]) -> dict[int, list[list[int]]]: + bad_words_token_ids = {} + for batch_idx in range(batch_size): + token_ids_single_batch = [] + for bad_words_length in bad_words_lengths: + token_ids = np.random.choice(vocab_size, + size=bad_words_length, + replace=True).tolist() + token_ids_single_batch.append(token_ids) + bad_words_token_ids[batch_idx] = token_ids_single_batch + if batch_size >= 2: + # Test no bad_words for some batch + no_bad_words_batch_idx = np.random.choice(batch_size) + bad_words_token_ids.pop(no_bad_words_batch_idx, None) + return bad_words_token_ids + + +def _update_output_token_ids_for_bad_words( + metadata: SamplingMetadata, vocab_size: int) -> dict[int, list[int]]: + bad_words_last_tokens = {} + for batch_idx, bad_words_token_ids in metadata.bad_words_token_ids.items(): + output_token_ids = metadata.output_token_ids[batch_idx] + bad_words_last_token: list[int] = [] + for i, bad_word_token_ids in enumerate(bad_words_token_ids): + if len(bad_word_token_ids) == 1: + # Single token id always affects logits + bad_words_last_token.append(bad_word_token_ids[0]) + else: + prefix_length = len(bad_word_token_ids) - 1 + has_bad_words = np.random.choice([True, False]) + if has_bad_words: + output_token_ids[-prefix_length:] = bad_word_token_ids[:-1] + bad_words_last_token.append(bad_word_token_ids[-1]) + break # Maximum one update to output_token_ids + else: # Make sure no accidental match to bad words + output_token_ids[-1] = (bad_word_token_ids[-2] + + 1) % vocab_size + bad_words_last_tokens[batch_idx] = bad_words_last_token + return 
bad_words_last_tokens + + +def _create_default_sampling_metadata( + num_output_tokens: int, + batch_size: int, + vocab_size: int, + device: torch.device, +) -> SamplingMetadata: + output_token_ids: list[list[int]] = [] + prompt_token_ids: list[list[int]] = [] + for _ in range(batch_size): + output_token_ids.append( + np.random.randint(0, vocab_size, size=num_output_tokens).tolist()) + prompt_token_ids.append( + np.random.randint(0, + vocab_size, + size=np.random.randint( + 1, MAX_NUM_PROMPT_TOKENS)).tolist()) + fake_sampling_metadata = SamplingMetadata( + temperature=torch.full((batch_size, ), 0.0), + all_greedy=True, + all_random=False, + top_p=None, + top_k=None, + min_p=None, + generators={}, + max_num_logprobs=0, + prompt_token_ids=_create_prompt_tokens_tensor(prompt_token_ids, + vocab_size, device), + output_token_ids=output_token_ids, + frequency_penalties=_create_penalty_tensor(batch_size, 0.0, device), + presence_penalties=_create_penalty_tensor(batch_size, 0.0, device), + repetition_penalties=_create_penalty_tensor(batch_size, 1.0, device), + no_penalties=True, + min_tokens={}, + logit_bias=[None] * batch_size, + allowed_token_ids_mask=None, + bad_words_token_ids={}, + ) + return fake_sampling_metadata + + +def _generate_min_token_penalties_and_stop_tokens( + num_output_tokens: int, batch_size: int, vocab_size: int, + batch_indices_for_min_token_penalty: list[int] +) -> dict[int, tuple[int, set[int]]]: + """ + Generates and returns a dict of minimum token penalties and + corresponding stop token IDs (`min_tokens`, `stop_token_ids`) for each + batch. + + If a batch index is included in `batch_indices_for_min_token_penalty`, + a higher `min_tokens` value is assigned (within a randomized range), + and a random set of stop token IDs is created. Otherwise, a lower + `min_tokens` value is assigned, and the stop token IDs set is empty. + """ + min_tokens: dict[int, tuple[int, set[int]]] = {} + for index in range(batch_size): + if index in batch_indices_for_min_token_penalty: + min_tokens[index] = ( + np.random.randint(num_output_tokens + 1, + 2 * num_output_tokens), + set( + np.random.randint(0, vocab_size - 1) + for _ in range(np.random.randint(0, vocab_size)))) + else: + min_tokens[index] = (np.random.randint(0, + num_output_tokens), set()) + return min_tokens + + +def _create_weighted_output_token_list( + batch_size: int, + vocab_size: int) -> tuple[list[list[int]], list[list[int]]]: + """ + Creates an output token list where each token occurs a distinct + number of times. + + For each batch, a random subset of token IDs is selected from the + vocabulary. The selected tokens are then added to the output token + list, each with a different frequency. + + Returns: + tuple[list[list[int]], list[list[int]]]: + - The first element is the output token list, where each sublist + corresponds to a batch and contains tokens with weighted + frequencies. + - The second element is a list of distinct token IDs for each + batch, ordered by their frequency in the corresponding output + list. 
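+
+    Illustrative example (values are placeholders): if a batch draws the
+    distinct token IDs [7, 3], its output list becomes [7, 3, 3] (the i-th
+    distinct token appears i + 1 times) and its sorted-ID list is [7, 3].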
+ """ + output_token_ids: list[list[int]] = [] + sorted_token_ids_in_output: list[list[int]] = [] + for _ in range(batch_size): + distinct_token_ids = np.random.choice(vocab_size, + size=np.random.randint(1, 10), + replace=False).tolist() + sorted_token_ids_in_output.append(distinct_token_ids) + output_token_ids_for_batch = [] + for index, token_id in enumerate(distinct_token_ids): + output_token_ids_for_batch.extend( + [token_id for _ in range(index + 1)]) + output_token_ids.append(output_token_ids_for_batch) + return output_token_ids, sorted_token_ids_in_output + +@pytest.mark.level0 +@pytest.mark.platform_arm_ascend910b_training +@pytest.mark.env_single +@pytest.mark.parametrize("device", CUDA_DEVICES) +@pytest.mark.parametrize("batch_size", [1, 2]) +def test_sampler_min_tokens_penalty(device: str, batch_size: int): + """ + Tests that if the number of output tokens is less than + SamplingParams.min_tokens then we will set the logits for + the stop token ids to -inf. + """ + fake_logits = _create_fake_logits(batch_size, VOCAB_SIZE) + sampling_metadata = _create_default_sampling_metadata( + NUM_OUTPUT_TOKENS, batch_size, VOCAB_SIZE, torch.device(device)) + batch_indices_for_min_token_penalty = np.random.randint( + 0, batch_size - 1, size=np.random.randint(0, batch_size)).tolist() + min_tokens = _generate_min_token_penalties_and_stop_tokens( + NUM_OUTPUT_TOKENS, batch_size, VOCAB_SIZE, + batch_indices_for_min_token_penalty) + sampling_metadata.min_tokens = min_tokens + sampler = Sampler() + logits = sampler.apply_penalties(fake_logits, sampling_metadata) + logits = logits.cpu() + for batch_idx in range(batch_size): + for token_id in range(VOCAB_SIZE): + _, stop_token_ids = min_tokens.get(batch_idx, (0, set())) + if token_id in stop_token_ids: + assert logits[batch_idx][token_id] == -float("inf") + else: + assert logits[batch_idx][token_id] != -float("inf") + +@pytest.mark.level0 +@pytest.mark.platform_arm_ascend910b_training +@pytest.mark.env_single +@pytest.mark.parametrize("device", CUDA_DEVICES) +@pytest.mark.parametrize("batch_size", [1, 2]) +@pytest.mark.parametrize("presence_penalty", [-2.0, 2.0]) +def test_sampler_presence_penalty(device: str, batch_size: int, + presence_penalty: float): + """ + Test to verify that if presence penalty is enabled then tokens + are penalized as per their presence in the existing output. + """ + # Create fake logits where each token is assigned the same + # logit value. + fake_logits = _create_fake_logits(batch_size, VOCAB_SIZE) + sampling_metadata = _create_default_sampling_metadata( + NUM_OUTPUT_TOKENS, batch_size, VOCAB_SIZE, torch.device(device)) + output_token_ids = sampling_metadata.output_token_ids + sampling_metadata.presence_penalties = _create_penalty_tensor( + batch_size, presence_penalty, torch.device(device)) + sampling_metadata.no_penalties = False + sampler = Sampler() + logits = sampler.apply_penalties(fake_logits, sampling_metadata) + logits = logits.cpu() + for batch_idx in range(batch_size): + # Since all tokens initially have the same logits, the non-penalized + # token ID will be the one with the highest logit value, while the + # penalized token ID will be the one with the lowest logit value. + non_penalized_token_id = logits[batch_idx].argmax().item() + penalized_token_id = logits[batch_idx].argmin().item() + if presence_penalty > 0: + # If `presence_penalty` is set to a value greater than 0, it + # indicates a preference for new tokens over those already + # present in the output. 
+ # Verify that the penalized token ID exists in the output, while the + # non-penalized token ID does not. + assert penalized_token_id in output_token_ids[batch_idx] + assert non_penalized_token_id not in output_token_ids[batch_idx] + elif presence_penalty < 0: + # If `presence_penalty` is set to a value less than 0, it indicates + # a preference for existing tokens over new ones. Verify that the + # non-penalized token ID exists in the output, while the penalized + # token ID does not. + assert non_penalized_token_id in output_token_ids[batch_idx] + assert penalized_token_id not in output_token_ids[batch_idx] + +@pytest.mark.level0 +@pytest.mark.platform_arm_ascend910b_training +@pytest.mark.env_single +@pytest.mark.parametrize("device", CUDA_DEVICES) +@pytest.mark.parametrize("batch_size", [1, 2]) +@pytest.mark.parametrize("frequency_penalty", [-2.0, 2.0]) +def test_sampler_frequency_penalty(device: str, batch_size: int, + frequency_penalty: float): + """ + Test to verify that if frequency penalty is enabled then tokens are + penalized as per their frequency of occurrence. + """ + # Create fake logits where each token is assigned the same + # logit value. + fake_logits = _create_fake_logits(batch_size, VOCAB_SIZE) + sampling_metadata = _create_default_sampling_metadata( + NUM_OUTPUT_TOKENS, batch_size, VOCAB_SIZE, torch.device(device)) + sampling_metadata.frequency_penalties = _create_penalty_tensor( + batch_size, frequency_penalty, torch.device(device)) + output_token_ids, sorted_token_ids_in_output = \ + _create_weighted_output_token_list( + batch_size, + VOCAB_SIZE, + ) + sampling_metadata.output_token_ids = output_token_ids + sampling_metadata.no_penalties = False + sampler = Sampler() + logits = sampler.apply_penalties(fake_logits, sampling_metadata) + logits = logits.cpu() + for batch_idx in range(batch_size): + non_penalized_token_id = logits[batch_idx].argmax().item() + penalized_token_id = logits[batch_idx].argmin().item() + distinct_sorted_token_ids_in_output = sorted_token_ids_in_output[ + batch_idx] + most_frequent_token_id = distinct_sorted_token_ids_in_output[ + len(distinct_sorted_token_ids_in_output) - 1] + if frequency_penalty > 0: + # If `frequency_penalty` is set to > 0, it indicates + # a preference for new tokens over existing ones. Verify that the + # non-penalized token ID is not present in the output, while the + # most penalized token is the one that occurs most frequently in + # the output. + assert (non_penalized_token_id + not in distinct_sorted_token_ids_in_output) + assert penalized_token_id == most_frequent_token_id + elif frequency_penalty < 0: + # If `frequency_penalty` is set to < 0, it indicates + # a preference for existing tokens over new ones. Verify that the + # non-penalized token ID is the one that occurs most frequently + # in the output, while the penalized token ID is one that has not + # yet appeared. + assert non_penalized_token_id == most_frequent_token_id + assert penalized_token_id not in distinct_sorted_token_ids_in_output + +@pytest.mark.level0 +@pytest.mark.platform_arm_ascend910b_training +@pytest.mark.env_single +@pytest.mark.parametrize("device", CUDA_DEVICES) +@pytest.mark.parametrize("batch_size", [1, 2]) +@pytest.mark.parametrize("repetition_penalty", [0.1, 1.9]) +def test_sampler_repetition_penalty(device: str, batch_size: int, + repetition_penalty: float): + """ + Test to verify that when the repetition penalty is enabled, tokens + are penalized based on their presence in the prompt or the existing + output. 
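+
+    With the uniform, positive fake logits used here, a repetition penalty
+    greater than 1.0 leaves every previously seen token (from the prompt or
+    the output) scoring below the unseen ones, and a penalty below 1.0
+    reverses that ordering (an illustrative summary of the assertions below).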
+ """ + # Create fake logits where each token is assigned the same + # logit value. + fake_logits = _create_fake_logits(batch_size, VOCAB_SIZE) + sampling_metadata = _create_default_sampling_metadata( + NUM_OUTPUT_TOKENS, batch_size, VOCAB_SIZE, torch.device(device)) + sampling_metadata.repetition_penalties = _create_penalty_tensor( + batch_size, repetition_penalty, torch.device(device)) + sampling_metadata.no_penalties = False + sampler = Sampler() + logits = sampler.apply_penalties(fake_logits, sampling_metadata) + logits = logits.cpu() + for batch_idx in range(batch_size): + non_penalized_token_id = logits[batch_idx].argmax().item() + penalized_token_id = logits[batch_idx].argmin().item() + prompt_tokens = sampling_metadata.prompt_token_ids[ + batch_idx][:].tolist() + output_tokens = sampling_metadata.output_token_ids[batch_idx] + if repetition_penalty > 1.0: + # If `repetition_penalty` > 1.0, verify that the non-penalized + # token ID has not been seen before, while the penalized token ID + # exists either in the prompt or the output. + assert (non_penalized_token_id not in prompt_tokens + and non_penalized_token_id not in output_tokens) + assert (penalized_token_id in prompt_tokens + or penalized_token_id in output_tokens) + elif repetition_penalty < 1.0: + # If `repetition_penalty` < 1.0, verify that the penalized + # token ID has not been seen before, while the non-penalized + # token ID exists either in the prompt or the output. + assert (penalized_token_id not in prompt_tokens + and penalized_token_id not in output_tokens) + assert (non_penalized_token_id in prompt_tokens + or non_penalized_token_id in output_tokens) + +@pytest.mark.level0 +@pytest.mark.platform_arm_ascend910b_training +@pytest.mark.env_single +@pytest.mark.parametrize("device", CUDA_DEVICES) +@pytest.mark.parametrize("batch_size", [1, 2]) +@pytest.mark.parametrize("min_p", [0.0, 0.1]) +def test_sampler_min_p(device: str, batch_size: int, min_p: float): + """ + Tests that when min_p is applied, tokens with probability below + min_p * max_prob are masked with -inf. 
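+
+    Illustrative numbers (placeholders): the first token gets a dominant
+    logit, so with min_p = 0.1 every other token falls below
+    0.1 * max_prob and is masked, while with min_p = 0.0 nothing is masked.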
+ """ + fake_logits = _create_fake_logits(batch_size, VOCAB_SIZE) + + # Create one dominant token per batch + for i in range(batch_size): + fake_logits[i, 0] = 10.0 # High logit for first token + fake_logits[i, 1:] = 1e-2 # Others remain low + + sampling_metadata = _create_default_sampling_metadata( + NUM_OUTPUT_TOKENS, batch_size, VOCAB_SIZE, torch.device(device)) + + # Configure min_p parameters + sampling_metadata.min_p = torch.full((batch_size, ), min_p, device=device) + + sampler = Sampler() + logits = sampler.apply_min_p(fake_logits, sampling_metadata.min_p) + logits = logits.cpu() + + for batch_idx in range(batch_size): + for token_id in range(VOCAB_SIZE): + if token_id == 0: + # Dominant token should always be unmasked + assert logits[batch_idx][token_id] != -float("inf") + else: + if min_p > 0.0: + # Non-dominant tokens should be masked when min_p > 0 + assert logits[batch_idx][token_id] == -float("inf") + else: + # No masking when min_p is 0 + assert logits[batch_idx][token_id] != -float("inf") + +@pytest.mark.level0 +@pytest.mark.platform_arm_ascend910b_training +@pytest.mark.env_single +@pytest.mark.parametrize("device", CUDA_DEVICES) +@pytest.mark.parametrize("batch_size", [1, 2]) +@pytest.mark.parametrize("bias_value", [-0.1, 1.2]) +def test_sampler_logit_bias(device: str, batch_size: int, bias_value: float): + """ + Test to verify that when the repetition penalty is enabled, tokens + are penalized based on their presence in the prompt or the existing + output. + """ + # Create fake logits where each token is assigned the same + # logit value. + fake_logits = _create_fake_logits(batch_size, VOCAB_SIZE) + sampling_metadata = _create_default_sampling_metadata( + NUM_OUTPUT_TOKENS, batch_size, VOCAB_SIZE, torch.device(device)) + sampling_metadata.logit_bias = _create_logit_bias( + batch_size=batch_size, + vocab_size=VOCAB_SIZE, + bias_value=bias_value, + ) + sampler = Sampler() + logits = sampler.apply_logits_bias(fake_logits, sampling_metadata) + logits = logits.cpu() + for batch_idx in range(batch_size): + logits_for_req = logits[batch_idx] + biased_index = min(batch_idx, VOCAB_SIZE - 1) + for token_id in range(VOCAB_SIZE): + if biased_index == token_id: + assert logits_for_req[token_id].item() == pytest.approx(bias_value + + 1e-2) + else: + assert logits_for_req[token_id].item() == pytest.approx(1e-2) + +@pytest.mark.level0 +@pytest.mark.platform_arm_ascend910b_training +@pytest.mark.env_single +@pytest.mark.parametrize("device", CUDA_DEVICES) +@pytest.mark.parametrize("batch_size", [1, 2]) +@pytest.mark.parametrize("num_allowed_token_ids", [0, 1, 2]) +def test_sampler_allowed_token_ids(device: str, batch_size: int, + num_allowed_token_ids: int): + """ + Test to verify that when the repetition penalty is enabled, tokens + are penalized based on their presence in the prompt or the existing + output. + """ + # Create fake logits where each token is assigned the same + # logit value. 
+ fake_logits = _create_fake_logits(batch_size, VOCAB_SIZE) + sampling_metadata = _create_default_sampling_metadata( + NUM_OUTPUT_TOKENS, batch_size, VOCAB_SIZE, torch.device(device)) + mask = _create_allowed_token_ids( + batch_size=batch_size, + vocab_size=VOCAB_SIZE, + num_allowed_token_ids=num_allowed_token_ids, + device=device, + ) + sampling_metadata.allowed_token_ids_mask = mask + sampler = Sampler() + logits = sampler.apply_allowed_token_ids(fake_logits, sampling_metadata) + logits = logits.cpu() + for batch_idx in range(batch_size): + logits_for_req = logits[batch_idx] + if batch_idx % 2 == 1: + assert torch.all(logits_for_req != -float("inf")) + continue + for token_id in range(VOCAB_SIZE): + start = min(batch_idx, VOCAB_SIZE - 1) + end = min(batch_idx + num_allowed_token_ids, VOCAB_SIZE - 1) + if token_id >= start and token_id < end: + assert logits_for_req[token_id] == -float( + "inf"), f"{batch_idx}, {token_id}" + else: + assert logits_for_req[token_id] != -float("inf") + +@pytest.mark.level0 +@pytest.mark.platform_arm_ascend910b_training +@pytest.mark.env_single +@pytest.mark.parametrize("device", CUDA_DEVICES) +@pytest.mark.parametrize("batch_size", [1, 2]) +@pytest.mark.parametrize("bad_words_lengths", [(1, ), (1, 3), (2, 2)]) +def test_sampler_bad_words(device: str, batch_size: int, + bad_words_lengths: list[tuple[int]]): + """ + Test to verify that when the bad words restriction is present, tokens + are penalized based on their match with the bad words. + """ + # Create fake logits where each token is assigned the same + # logit value. + fake_logits = _create_fake_logits(batch_size, VOCAB_SIZE) + sampling_metadata = _create_default_sampling_metadata( + NUM_OUTPUT_TOKENS, batch_size, VOCAB_SIZE, torch.device(device)) + sampling_metadata.bad_words_token_ids = _create_bad_words_token_ids( + batch_size, VOCAB_SIZE, bad_words_lengths) + bad_words_last_tokens = _update_output_token_ids_for_bad_words( + sampling_metadata, VOCAB_SIZE) + sampler = Sampler() + logits = sampler.apply_bad_words(fake_logits, sampling_metadata) + logits = logits.cpu() + for batch_idx in range(batch_size): + logits_for_req = logits[batch_idx] + for token_id in range(VOCAB_SIZE): + if (batch_idx in bad_words_last_tokens + and token_id in bad_words_last_tokens[batch_idx]): + assert logits_for_req[token_id] == -float("inf") + else: + assert logits_for_req[token_id] != -float("inf") diff --git a/tests/st/python/test_vllm_deepseek_bf16_part_v1.py b/tests/st/python/test_vllm_deepseek_bf16_part_v1.py new file mode 100644 index 00000000..11167779 --- /dev/null +++ b/tests/st/python/test_vllm_deepseek_bf16_part_v1.py @@ -0,0 +1,76 @@ +# Copyright 2024 The vLLM team. +# Copyright 2024 Microsoft and the HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://wwww.apache.org/licenses/LICENSE-2.0 +# +# Unless required by application law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +"""test mf deepseek r1.""" +import pytest +import os +from . 
import set_env +env_manager = set_env.EnvVarManager() +# def env +env_vars = { + "MINDFORMERS_MODEL_CONFIG": "./config/predict_deepseek_r1_671b.yaml", + "ASCEND_CUSTOM_PATH": os.path.expandvars("$ASCEND_HOME_PATH/../"), + "vLLM_MODEL_BACKEND": "MindFormers", + "MS_ENABLE_LCCL": "on", + "HCCL_OP_EXPANSION_MODE": "AIV", + "ASCEND_RT_VISIBLE_DEVICES": "0,1,2,3,4,5,6,7", + "MS_ALLOC_CONF": "enable_vmm:True", + "LCCL_DETERMINISTIC": "1", + "HCCL_DETERMINISTIC": "true", + "ATB_MATMUL_SHUFFLE_K_ENABLE": "0", + "ATB_LLM_LCOC_ENABLE": "0", +} +# set env +env_manager.setup_ai_environment(env_vars) +import vllm_mindspore +from vllm import LLM, SamplingParams + +class TestDeepSeek: + """ + Test Deepseek. + """ + + @pytest.mark.level0 + @pytest.mark.platform_arm_ascend910b_training + @pytest.mark.env_single + def test_deepseek_r1_bf16(self): + """ + test case deepseek r1 bf16 + """ + + # Sample prompts. + prompts = [ + "You are a helpful assistant.<|User|>将文本分类为中性、负面或正面。 \n文本:我认为这次假期还可以。 \n情感:<|Assistant|>\n", + ] + + # Create a sampling params object. + sampling_params = SamplingParams(temperature=0.0, max_tokens=10, top_k=1) + + # Create an LLM. + llm = LLM(model="/home/workspace/mindspore_dataset/weight/DeepSeek-R1-bf16", + trust_remote_code=True, gpu_memory_utilization=0.9, tensor_parallel_size=8) + # Generate texts from the prompts. The output is a list of RequestOutput objects + # that contain the prompt, generated text, and other information. + outputs = llm.generate(prompts, sampling_params) + except_list=['ugs611ాలు sic辨hara的开璞 SquaresInsp'] + # Print the outputs. + for i, output in enumerate(outputs): + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") + assert generated_text == except_list[i] + + # unset env + env_manager.unset_all() diff --git a/tests/st/python/test_vllm_deepseek_part_v1.py b/tests/st/python/test_vllm_deepseek_part_v1.py new file mode 100644 index 00000000..3ec16fa7 --- /dev/null +++ b/tests/st/python/test_vllm_deepseek_part_v1.py @@ -0,0 +1,79 @@ +#!/usr/bin/env python3 +# encoding: utf-8 +# Copyright 2025 Huawei Technologies Co., Ltd +# Copyright 2024 The vLLM team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +"""test mf deepseek r1.""" +import pytest +import os +from . 
import set_env +env_manager = set_env.EnvVarManager() +# def env +env_vars = { + "MINDFORMERS_MODEL_CONFIG": "./config/predict_deepseek_r1_671b_w8a8.yaml", + "ASCEND_CUSTOM_PATH": os.path.expandvars("$ASCEND_HOME_PATH/../"), + "vLLM_MODEL_BACKEND": "MindFormers", + "MS_ENABLE_LCCL": "off", + "HCCL_OP_EXPANSION_MODE": "AIV", + "ASCEND_RT_VISIBLE_DEVICES": "0,1,2,3,4,5,6,7", + "MS_ALLOC_CONF": "enable_vmm:True", + "LCCL_DETERMINISTIC": "1", + "HCCL_DETERMINISTIC": "true", + "ATB_MATMUL_SHUFFLE_K_ENABLE": "0", + "ATB_LLM_LCOC_ENABLE": "0", +} +# set env +env_manager.setup_ai_environment(env_vars) +import vllm_mindspore +from vllm import LLM, SamplingParams + +class TestDeepSeek: + """ + Test Deepseek. + """ + + @pytest.mark.level0 + @pytest.mark.platform_arm_ascend910b_training + @pytest.mark.env_single + @pytest.mark.skip(reason="gs master branch is not suit for the newest mindformers.") + def test_deepseek_r1(self): + """ + test case deepseek r1 w8a8 + """ + + # Sample prompts. + prompts = [ + "You are a helpful assistant.<|User|>将文本分类为中性、负面或正面。 \n文本:我认为这次假期还可以。 \n情感:<|Assistant|>\n", + ] + + # Create a sampling params object. + sampling_params = SamplingParams(temperature=0.0, max_tokens=10, top_k=1) + + # Create an LLM. + llm = LLM(model="/home/workspace/mindspore_dataset/weight/DeepSeek-R1-W8A8", + trust_remote_code=True, gpu_memory_utilization=0.9, tensor_parallel_size=8) + # Generate texts from the prompts. The output is a list of RequestOutput objects + # that contain the prompt, generated text, and other information. + outputs = llm.generate(prompts, sampling_params) + except_list=['ugs611ాలు哒ాలు mahassisemaSTE的道德', 'ugs611ాలు哒ాలు mah战区rollerOVERlaid'] + # Print the outputs. + for i, output in enumerate(outputs): + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") + assert generated_text in except_list + + # unset env + env_manager.unset_all() diff --git a/tests/st/python/test_vllm_mf_qwen_7b_chunk_prefill_v1.py b/tests/st/python/test_vllm_mf_qwen_7b_chunk_prefill_v1.py new file mode 100644 index 00000000..dfc738b2 --- /dev/null +++ b/tests/st/python/test_vllm_mf_qwen_7b_chunk_prefill_v1.py @@ -0,0 +1,89 @@ +# Copyright 2024 The vLLM team. +# Copyright 2024 Microsoft and the HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://wwww.apache.org/licenses/LICENSE-2.0 +# +# Unless required by application law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +"""test mf qwen chunk prefill.""" +import pytest +import os +from . 
import set_env + +env_manager = set_env.EnvVarManager() +# def env +env_vars = { + "MINDFORMERS_MODEL_CONFIG": "./config/predict_qwen2_5_7b_instruct.yaml", + "ASCEND_CUSTOM_PATH": os.path.expandvars("$ASCEND_HOME_PATH/../"), + "vLLM_MODEL_BACKEND": "MindFormers", + "MS_ENABLE_LCCL": "off", + "HCCL_OP_EXPANSION_MODE": "AIV", + "ASCEND_RT_VISIBLE_DEVICES": "0,1", + "MS_ALLOC_CONF": "enable_vmm:True", + "LCCL_DETERMINISTIC": "1", + "HCCL_DETERMINISTIC": "true", + "ATB_MATMUL_SHUFFLE_K_ENABLE": "0", + "ATB_LLM_LCOC_ENABLE": "0", +} +# set env +env_manager.setup_ai_environment(env_vars) +import vllm_mindspore +from vllm import LLM, SamplingParams + + +class TestMfQwen_chunk_prefill_v1: + """ + Test qwen. + """ + + @pytest.mark.level0 + @pytest.mark.platform_arm_ascend910b_training + @pytest.mark.env_single + @pytest.mark.skip(reason="cp precision need to be fixed on v0.8.3 V0") + def test_mf_qwen_7b_chunk_prefill(self): + """ + test case qwen_7b_chunk_prefill + """ + + # Sample prompts. + batch_datas = [{ + "prompt": "I love Beijing, because it is a city with a long history and profound cultural heritage. Walking through " + "its ancient hutongs, one can almost feel the whispers of the past. The Forbidden City, an architectural " + "marvel that once housed emperors, stands as a testament to the city's imperial past. Meanwhile, the Great " + "Wall, though not within the city limits, is easily accessible from Beijing and offers a glimpse into the " + "strategic genius and resilience of ancient China.", + "answer": " The city's blend of traditional and modern architecture, bustling markets, and vibrant street life make it " + "a unique and fascinating destination. In short, Beijing is a city"}, + {"prompt": "I love Beijing, because", + "answer": " it is a city with a long history. Which of the following options correctly expresses this sentence?\nA. I love Beijing, because it is a city with a"}, + ] + + # Create a sampling params object. + sampling_params = SamplingParams(temperature=0.0, max_tokens=32, top_k=1) + + # Create an LLM. + llm = LLM(model="/home/workspace/mindspore_dataset/weight/Qwen2.5-7B-Instruct", + max_model_len=8192, max_num_seqs=16, max_num_batched_tokens=32, + block_size=32, gpu_memory_utilization=0.85, tensor_parallel_size=2) + # Generate texts from the prompts. The output is a list of RequestOutput objects + # that contain the prompt, generated text, and other information. + for batch_data in batch_datas: + prompt = batch_data["prompt"] + answer = batch_data["answer"] + outputs = llm.generate(prompt, sampling_params) + # Print the outputs. + for i, output in enumerate(outputs): + generated_text = output.outputs[0].text + print(f"Prompt: {output.prompt!r}, Generated text: {generated_text!r}") + assert generated_text == answer + + # unset env + env_manager.unset_all() diff --git a/tests/st/python/test_vllm_mf_qwen_7b_prefix_caching_v1.py b/tests/st/python/test_vllm_mf_qwen_7b_prefix_caching_v1.py new file mode 100644 index 00000000..2bce85ad --- /dev/null +++ b/tests/st/python/test_vllm_mf_qwen_7b_prefix_caching_v1.py @@ -0,0 +1,83 @@ +#!/usr/bin/env python3 +# encoding: utf-8 +# Copyright 2025 Huawei Technologies Co., Ltd +# Copyright 2024 The vLLM team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +"""test mf qwen prefix caching.""" +import pytest +import os +from . import set_env +env_manager = set_env.EnvVarManager() +env_vars = { + "MINDFORMERS_MODEL_CONFIG": "./config/predict_qwen2_5_7b_instruct.yaml", + "ASCEND_CUSTOM_PATH": os.path.expandvars("$ASCEND_HOME_PATH/../"), + "vLLM_MODEL_BACKEND": "MindFormers", + "MS_ENABLE_LCCL": "off", + "HCCL_OP_EXPANSION_MODE": "AIV", + "ASCEND_RT_VISIBLE_DEVICES": "0,1", + "LCCL_DETERMINISTIC": "1", + "HCCL_DETERMINISTIC": "true", + "ATB_MATMUL_SHUFFLE_K_ENABLE": "0", + "ATB_LLM_LCOC_ENABLE": "0", +} +env_manager.setup_ai_environment(env_vars) +import vllm_mindspore +from vllm import LLM, SamplingParams + + +class TestMfQwen_prefix_caching_v1: + """ + Test qwen7b enable prefix_caching + """ + @pytest.mark.level0 + @pytest.mark.platform_arm_ascend910b_training + @pytest.mark.env_single + @pytest.mark.skip(reason="pc precision need to be fixed on v0.8.3 V0") + def test_mf_qwen_7b_prefix_caching(self): + """ + test case qwen_7b_prefix_caching + """ + + # First prompts. + prompts = [ + "I love Beijing, because it is a city that has so much to offer. I have visited" + ] + #second prompts, the second prompt is a continuation of the first prompts, make sure prefix caching work. + second_prompts = [ + "I love Beijing, because it is a city that has so much to offer. I have visited many places" + ] + # Create a sampling params object. + sampling_params = SamplingParams(temperature=0.0, max_tokens=10, top_k=1) + + # Create an LLM. + llm = LLM(model="/home/workspace/mindspore_dataset/weight/Qwen2.5-7B-Instruct", + max_model_len=8192, block_size=16, tensor_parallel_size=2) + # Generate texts from the prompts. The output is a list of RequestOutput objects + # that contain the prompt, generated text, and other information. + outputs = llm.generate(prompts, sampling_params) + second_outputs = llm.generate(second_prompts, sampling_params) + except_list=[' many times and each time I have found something new'] + second_except_list=[' in Beijing, but I have to say that the'] + for i, (output, second_output) in enumerate(zip(outputs, second_outputs)): + generated_text = output.outputs[i].text + print(f"Output1 - Prompt: {prompts[i]!r}, Generated text: {generated_text!r}") + assert generated_text == except_list[i] + + second_generated_text = second_output.outputs[i].text + print(f"Output2 - Prompt: {second_prompts[i]!r}, Generated text: {second_generated_text!r}") + assert second_generated_text == second_except_list[i] + + env_manager.unset_all() diff --git a/tests/st/python/test_vllm_mf_qwen_7b_v1.py b/tests/st/python/test_vllm_mf_qwen_7b_v1.py new file mode 100644 index 00000000..3ad831ff --- /dev/null +++ b/tests/st/python/test_vllm_mf_qwen_7b_v1.py @@ -0,0 +1,119 @@ +#!/usr/bin/env python3 +# encoding: utf-8 +# Copyright 2025 Huawei Technologies Co., Ltd +# Copyright 2024 The vLLM team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +"""test mf qwen.""" +import pytest +import os +from . import set_env +env_manager = set_env.EnvVarManager() +# def env +env_vars = { + "MINDFORMERS_MODEL_CONFIG": "./config/predict_qwen2_5_7b_instruct.yaml", + "ASCEND_CUSTOM_PATH": os.path.expandvars("$ASCEND_HOME_PATH/../"), + "vLLM_MODEL_BACKEND": "MindFormers", + "MS_ENABLE_LCCL": "off", + "HCCL_OP_EXPANSION_MODE": "AIV", + "ASCEND_RT_VISIBLE_DEVICES": "0,1", + "MS_ALLOC_CONF": "enable_vmm:True", + "LCCL_DETERMINISTIC": "1", + "HCCL_DETERMINISTIC": "true", + "ATB_MATMUL_SHUFFLE_K_ENABLE": "0", + "ATB_LLM_LCOC_ENABLE": "0", +} +# set env +env_manager.setup_ai_environment(env_vars) +import vllm_mindspore +from vllm import LLM, SamplingParams + + +class TestMfQwenV1: + """ + Test Qwen. + """ + @pytest.mark.level0 + @pytest.mark.platform_arm_ascend910b_training + @pytest.mark.env_single + def test_mf_qwen(self): + """ + test case qwen2.5 7B + """ + + # Sample prompts. + prompts = [ + "You are a helpful assistant.<|User|>将文本分类为中性、负面或正面。 \n文本:我认为这次假期还可以。 \n情感:<|Assistant|>\n", + ] + + # Create a sampling params object. + sampling_params = SamplingParams(temperature=0.0, max_tokens=10, top_k=1) + + # Create an LLM. + llm = LLM(model="/home/workspace/mindspore_dataset/weight/Qwen2.5-7B-Instruct", + gpu_memory_utilization=0.9, tensor_parallel_size=2) + # Generate texts from the prompts. The output is a list of RequestOutput objects + # that contain the prompt, generated text, and other information. + outputs = llm.generate(prompts, sampling_params) + except_list=['中性<|Assistant|> 这句话'] + # Print the outputs. + for i, output in enumerate(outputs): + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") + assert generated_text == except_list[i] + + # unset env + env_manager.unset_all() + + @pytest.mark.level0 + @pytest.mark.platform_arm_ascend910b_training + @pytest.mark.env_single + @pytest.mark.skip(reason="pc precision need to be fixed on v0.8.3 V0") + def test_mf_qwen_batch(self): + """ + test case qwen2.5 7B, to test prefill and decode mixed, can trigger PA q_seq_len > 1 + """ + # Sample prompts. + prompts = [ + "北京烤鸭是", + "请介绍一下华为,华为是", + "今年似乎大模型之间的内卷已经有些偃旗息鼓了,各大技术公司逐渐聪单纯追求模型参数量的竞赛中抽身," + "转向更加注重模型的实际>应用效果和效率", + ] * 2 + + # Create a sampling params object. + sampling_params = SamplingParams(temperature=0.0, max_tokens=10, top_k=1) + + # Create an LLM. + llm = LLM(model="/home/workspace/mindspore_dataset/weight/Qwen2.5-7B-Instruct", block_size=32, + gpu_memory_utilization=0.9, tensor_parallel_size=2) + # Generate texts from the prompts. The output is a list of RequestOutput objects + # that contain the prompt, generated text, and other information. + outputs = llm.generate(prompts, sampling_params) + except_list=[ + "享誉世界的中华美食,其制作工艺独特,", + "做什么的? 华为是一家中国公司,", + "。 \n在这一背景下,阿里云发布了通", + ] * 2 + # Print the outputs. 
+ for i, output in enumerate(outputs): + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") + assert generated_text in except_list[i] + + # unset env + env_manager.unset_all() + -- Gitee From 6b7de351d70e11eaca309ce1607aff0a805258e7 Mon Sep 17 00:00:00 2001 From: fengtingyan Date: Mon, 19 May 2025 20:20:33 +0800 Subject: [PATCH 36/55] [Feature] V1 PD disaggregation, DLLM vllm v0.8.3 --- vllm_mindspore/__init__.py | 9 ++++ vllm_mindspore/engine/arg_utils.py | 6 --- .../models/mf_models/deepseek_v3.py | 17 ++++++ .../models/mf_models/mf_model_base.py | 52 ++++++++++++++++++- vllm_mindspore/v1/utils.py | 1 + vllm_mindspore/v1/worker/gpu_worker.py | 18 ++++--- vllm_mindspore/worker/worker.py | 1 - 7 files changed, 90 insertions(+), 14 deletions(-) diff --git a/vllm_mindspore/__init__.py b/vllm_mindspore/__init__.py index 3aae2091..224ae26e 100644 --- a/vllm_mindspore/__init__.py +++ b/vllm_mindspore/__init__.py @@ -315,4 +315,13 @@ from vllm_mindspore.engine.multiprocessing.engine import cleanup import vllm.engine.multiprocessing.engine vllm.engine.multiprocessing.engine.MQLLMEngine.cleanup = cleanup +try: + from vllm.distributed.kv_transfer.kv_connector.factory import KVConnectorFactory + + KVConnectorFactory.register_connector( + "DLLMDsConnector", + "dllm.dkvc.v1.dllm_ds_connector", + "DLLMDsConnector") +except: + pass check_ready() diff --git a/vllm_mindspore/engine/arg_utils.py b/vllm_mindspore/engine/arg_utils.py index ed74ba9e..9efb8923 100644 --- a/vllm_mindspore/engine/arg_utils.py +++ b/vllm_mindspore/engine/arg_utils.py @@ -164,12 +164,6 @@ def _is_v1_supported_oracle(self, model_config: ModelConfig) -> bool: recommend_to_remove=False) return False - # No Disaggregated Prefill so far. - if self.kv_transfer_config != EngineArgs.kv_transfer_config: - _raise_or_fallback(feature_name="--kv-transfer-config", - recommend_to_remove=False) - return False - # No FlashInfer or XFormers so far. 
V1_BACKENDS = [ "FLASH_ATTN_VLLM_V1", "FLASH_ATTN", "PALLAS", "PALLAS_VLLM_V1", diff --git a/vllm_mindspore/model_executor/models/mf_models/deepseek_v3.py b/vllm_mindspore/model_executor/models/mf_models/deepseek_v3.py index 3a1b2be6..e493c9c2 100644 --- a/vllm_mindspore/model_executor/models/mf_models/deepseek_v3.py +++ b/vllm_mindspore/model_executor/models/mf_models/deepseek_v3.py @@ -29,11 +29,13 @@ from vllm.config import get_current_vllm_config from vllm.distributed.parallel_state import get_dp_group, get_tensor_model_parallel_world_size from vllm.forward_context import get_forward_context from vllm.logger import init_logger +from vllm.attention.layer import Attention import mindspore as ms from mindspore import Tensor, JitConfig, Model, mutable from mindspore.common import dtype as msdtype from mindspore.nn.utils import no_init_parameters +from mindspore.common.api import _pynative_executor from mindspore_gs.ptq import PTQ from mindspore_gs.ptq import PTQMode, PTQConfig, OutliersSuppressionType, PrecisionRecovery, QuantGranularity, \ @@ -54,6 +56,13 @@ from vllm_mindspore.model_executor.models.model_base import Fake_MLA, Fake_MLA_V from vllm_mindspore.model_executor.models.mf_models.mf_model_base import MfModelBase from vllm_mindspore.model_executor.models.mf_models.deepseekv3_weight_processor import DeepseekV3WeightProcessor +try: + # Need to apply dllm pd patch on vllm to use pd disagg related functions + from vllm.attention.layer import maybe_save_kv_layer_to_connector +except ImportError: + pass + + logger = init_logger(__name__) @@ -168,6 +177,14 @@ class DeepseekV3ForCausalLM(MfModelBase): key_cache.append(k_cache) return mutable(key_cache), None + def connector_send_kvcache(self): + _pynative_executor.sync() + forward_context = get_forward_context() + for i in range(self.mf_model_config.num_layers): + kv_cache_module = self.kv_caches[i] + kv_cache = kv_cache_module.kv_cache[forward_context.virtual_engine][0] + maybe_save_kv_layer_to_connector("key." 
+ str(i), kv_cache) + def load_weights(self, weights: Iterable[Tuple[str, Tensor]]) -> Set[str]: if self.mf_config.load_ckpt_format == "ckpt": model = Model(self.network) diff --git a/vllm_mindspore/model_executor/models/mf_models/mf_model_base.py b/vllm_mindspore/model_executor/models/mf_models/mf_model_base.py index 32cbb916..51b0bc89 100644 --- a/vllm_mindspore/model_executor/models/mf_models/mf_model_base.py +++ b/vllm_mindspore/model_executor/models/mf_models/mf_model_base.py @@ -39,11 +39,20 @@ from mindspore.common.api import _pynative_executor from mindformers.tools.register.config import MindFormerConfig from mindformers.core.context import build_mf_context from mindformers.core.parallel_config import build_parallel_config - +from mindspore.common.api import _pynative_executor from vllm_mindspore.model_executor.models.model_base import MsModelBase from vllm_mindspore.model_executor.models.mf_models.attention_mask import LowerTriangularMask from vllm_mindspore.v1.attention.backends.flash_attn import FlashAttentionMetadata +try: + # Need to apply dllm pd patch on vllm to use pd disagg related functions + from vllm.attention.layer import maybe_save_kv_layer_to_connector, wait_for_kv_layer_from_connector + from vllm.distributed.kv_transfer import is_v1_kv_transfer_group + kv_transfer_supported = True +except: + kv_transfer_supported = False + + logger = init_logger(__name__) class MfModelBase(MsModelBase): @@ -52,6 +61,7 @@ class MfModelBase(MsModelBase): vllm_config=vllm_config, prefix=prefix ) + self.kv_transfer_config = vllm_config.kv_transfer_config self.mf_config = MindFormerConfig(os.getenv("MINDFORMERS_MODEL_CONFIG")) build_mf_context(self.mf_config) build_parallel_config(self.mf_config) @@ -80,6 +90,18 @@ class MfModelBase(MsModelBase): raise NotImplementedError("Function _create_network should be Implemented!") + def is_decoder_task(self) -> bool: + if self.kv_transfer_config is None: + return False + + return self.kv_transfer_config.is_kv_consumer + + def is_prefill_task(self) -> bool: + if self.kv_transfer_config is None: + return False + + return self.kv_transfer_config.is_kv_producer + def _dummy_attention_metadata(self, input_ids: Tensor, positions: Tensor) -> FlashAttentionMetadata: input_len = input_ids.shape[0] max_seq_len = ms.Tensor(input_len, dtype=ms.int32) @@ -176,6 +198,24 @@ class MfModelBase(MsModelBase): def update_model_inputs(self, model_inputs, **kwargs): return model_inputs + def connector_send_kvcache(self): + #TODO 可优化 + _pynative_executor.sync() + forward_context = get_forward_context() + for i in range(self.mf_model_config.num_layers): + kv_cache = self.kv_caches[i] + k_cache = kv_cache.kv_cache[forward_context.virtual_engine][0] + v_cache = kv_cache.kv_cache[forward_context.virtual_engine][1] + maybe_save_kv_layer_to_connector("key." + str(i), (k_cache, v_cache)) + + + def connector_wait_for_kv_layer(self): + logger.debug(f"connector_wait_for_kv_layer") + #TODO 可优化 + for i in range(self.mf_model_config.num_layers): + wait_for_kv_layer_from_connector("key." 
+ str(i)) + + def forward( self, input_ids: Tensor, @@ -199,7 +239,17 @@ class MfModelBase(MsModelBase): if not self.set_flags: self.network.add_flags_custom(is_first_iteration=False) self.set_flags = True + if kv_transfer_supported: + if is_v1_kv_transfer_group(): + self.connector_send_kvcache() else: + if kv_transfer_supported: + if is_v1_kv_transfer_group() and self.is_prefill_task(): + self.connector_send_kvcache() + + if is_v1_kv_transfer_group() and self.is_decoder_task(): + self.connector_wait_for_kv_layer() + logger.debug(f"connector_wait_for_kv_layer success") hidden_states = self.network(**model_inputs) return hidden_states diff --git a/vllm_mindspore/v1/utils.py b/vllm_mindspore/v1/utils.py index 6833c101..c13c292d 100644 --- a/vllm_mindspore/v1/utils.py +++ b/vllm_mindspore/v1/utils.py @@ -10,3 +10,4 @@ def copy_slice(from_tensor: torch.Tensor, to_tensor: torch.Tensor, """ to_tensor[:length] = from_tensor[:length] return to_tensor + diff --git a/vllm_mindspore/v1/worker/gpu_worker.py b/vllm_mindspore/v1/worker/gpu_worker.py index 0395c339..3ebde737 100644 --- a/vllm_mindspore/v1/worker/gpu_worker.py +++ b/vllm_mindspore/v1/worker/gpu_worker.py @@ -6,7 +6,6 @@ import torch from vllm.logger import init_logger from vllm.distributed.parallel_state import get_pp_group - logger = init_logger(__name__) @@ -19,8 +18,8 @@ def init_device(self): config = get_current_vllm_config() if config is not None and config.parallel_config.data_parallel_size > 1: - device_id = self.parallel_config.data_parallel_rank_local * self.parallel_config.world_size + self.local_rank - self.device = torch.device(f"cuda:{device_id}") + self.local_rank = self.parallel_config.data_parallel_rank_local * self.parallel_config.world_size + self.local_rank + self.device = torch.device(f"cuda:{self.local_rank}") else: self.device = torch.device(f"cuda:{self.local_rank}") torch.cuda.set_device(self.device) @@ -31,9 +30,16 @@ def init_device(self): self.init_gpu_memory = torch.cuda.mem_get_info()[0] # Initialize the distributed environment. - init_worker_distributed_environment(self.parallel_config, self.rank, - self.distributed_init_method, - self.local_rank) + try: + # not None -> Module found: DLLM patch applied + init_worker_distributed_environment(config, self.rank, + self.distributed_init_method, + self.local_rank) + except: + # None -> Module not found: Patch not applied + init_worker_distributed_environment(self.parallel_config, self.rank, + self.distributed_init_method, + self.local_rank) # Set random seed. 
set_random_seed(self.model_config.seed) diff --git a/vllm_mindspore/worker/worker.py b/vllm_mindspore/worker/worker.py index 8ce1bc91..2dc69fcd 100644 --- a/vllm_mindspore/worker/worker.py +++ b/vllm_mindspore/worker/worker.py @@ -26,7 +26,6 @@ import torch from vllm.config import VllmConfig from vllm.distributed import ( - ensure_kv_transfer_initialized, ensure_model_parallel_initialized, init_distributed_environment, set_custom_all_reduce, -- Gitee From 58d5dd926ae3d5507229e14d7897a48287101446 Mon Sep 17 00:00:00 2001 From: Erpim Date: Mon, 19 May 2025 23:14:38 +0800 Subject: [PATCH 37/55] update owner --- OWNERS | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/OWNERS b/OWNERS index 90ee21da..d6adb0dc 100644 --- a/OWNERS +++ b/OWNERS @@ -1,12 +1,16 @@ reviewers: -- wang_shaocong -- erpim -- zhang_xue_tong -- tan-wei-cheng +- zhaizhiqiang +- panshaowu +- zichun_ye +- ckey_dou approvers: -- tronzhang -- zichun_ye - zlq2020 -- panshaowu -- zhaizhiqiang \ No newline at end of file +- zhang_xue_tong +- tronzhang +- r1chardf1d0 +- wang_shaocong +- erpim +- tan-wei-cheng +- hangangqiang +- dayschan -- Gitee From 1d1f300b7c7513e6e26b635484497181c6c65112 Mon Sep 17 00:00:00 2001 From: lvhaoyu Date: Tue, 20 May 2025 09:33:32 +0800 Subject: [PATCH 38/55] set CPU communication waiting time to avoid timeout --- vllm_mindspore/scripts.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/vllm_mindspore/scripts.py b/vllm_mindspore/scripts.py index ef297418..bcd90780 100644 --- a/vllm_mindspore/scripts.py +++ b/vllm_mindspore/scripts.py @@ -44,7 +44,10 @@ def env_setup(target_env_dict=None): "HCCL_OP_EXPANSION_MODE": "AIV", "MS_JIT_MODULES": "vllm_mindspore,research", "GLOG_v": "3", - "RAY_CGRAPH_get_timeout": "360" + "RAY_CGRAPH_get_timeout": "360", + # For CPU communication timeout setting, default is 15s, change to 180s + # to avoid multi node timeout when starting service. + "MS_NODE_TIMEOUT": "180" } for key, value in target_env_dict.items(): -- Gitee From 4f963cc19c15c012169d66fe10f999b979bf34a2 Mon Sep 17 00:00:00 2001 From: ccsszz Date: Mon, 28 Apr 2025 20:43:30 +0800 Subject: [PATCH 39/55] change smoothquant config --- .../python/test_vllm_deepseek_smoothquant.py | 2 +- .../test_vllm_deepseek_smoothquant_mss.py | 78 +++++++ .../models/mf_models/deepseek_v3.py | 36 ++-- .../mf_models/deepseekv3_weight_processor.py | 190 ++---------------- 4 files changed, 115 insertions(+), 191 deletions(-) create mode 100644 tests/st/python/test_vllm_deepseek_smoothquant_mss.py diff --git a/tests/st/python/test_vllm_deepseek_smoothquant.py b/tests/st/python/test_vllm_deepseek_smoothquant.py index c79222d9..ec61d309 100644 --- a/tests/st/python/test_vllm_deepseek_smoothquant.py +++ b/tests/st/python/test_vllm_deepseek_smoothquant.py @@ -63,7 +63,7 @@ class TestDeepSeek: sampling_params = SamplingParams(temperature=0.0, max_tokens=10, top_k=1) # Create an LLM. - llm = LLM(model="/home/workspace/mindspore_dataset/weight/DeepSeek-R1-W8A8-smoothquant", + llm = LLM(model="/home/workspace/mindspore_dataset/weight/DeepSeek-R1-W8A8-smoothquant-newconfig", trust_remote_code=True, gpu_memory_utilization=0.9, tensor_parallel_size=8, max_model_len=4096) # Generate texts from the prompts. The output is a list of RequestOutput objects # that contain the prompt, generated text, and other information. 
diff --git a/tests/st/python/test_vllm_deepseek_smoothquant_mss.py b/tests/st/python/test_vllm_deepseek_smoothquant_mss.py
new file mode 100644
index 00000000..bc3ad06b
--- /dev/null
+++ b/tests/st/python/test_vllm_deepseek_smoothquant_mss.py
@@ -0,0 +1,78 @@
+#!/usr/bin/env python3
+# encoding: utf-8
+# Copyright 2025 Huawei Technologies Co., Ltd
+# Copyright 2024 The vLLM team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+"""test mf deepseek r1 smoothquant mss."""
+import pytest
+import os
+from . import set_env
+env_manager = set_env.EnvVarManager()
+# def env
+env_vars = {
+    "MINDFORMERS_MODEL_CONFIG": "./config/predict_deepseek_r1_671b_w8a8_smoothquant.yaml",
+    "ASCEND_CUSTOM_PATH": os.path.expandvars("$ASCEND_HOME_PATH/../"),
+    "vLLM_MODEL_BACKEND": "MindFormers",
+    "MS_ENABLE_LCCL": "off",
+    "HCCL_OP_EXPANSION_MODE": "AIV",
+    "ASCEND_RT_VISIBLE_DEVICES": "0,1,2,3,4,5,6,7",
+    "MS_ALLOC_CONF": "enable_vmm:True",
+    "LCCL_DETERMINISTIC": "1",
+    "HCCL_DETERMINISTIC": "true",
+    "ATB_MATMUL_SHUFFLE_K_ENABLE": "0",
+    "ATB_LLM_LCOC_ENABLE": "0",
+    "VLLM_USE_V1": "0",
+}
+# set env
+env_manager.setup_ai_environment(env_vars)
+import vllm_mindspore
+from vllm import LLM, SamplingParams
+
+class TestDeepSeekMss:
+    '''
+    Test DeepSeek R1 W8A8 smoothquant with multi-step scheduler (mss).
+    '''
+    @pytest.mark.level0
+    @pytest.mark.platform_arm_ascend910b_training
+    @pytest.mark.env_single
+    @pytest.mark.skip(reason="gs master branch is not suited to the newest mindformers.")
+    def test_deepseek_r1_mss(self):
+        """
+        test case deepseek r1 w8a8 mss
+        """
+
+        # Sample prompts.
+        prompts = [
+            "介绍下北京故宫",
+        ]
+
+        # Create a sampling params object.
+        sampling_params = SamplingParams(temperature=0.0, max_tokens=10, top_k=1)
+
+        # Create an LLM.
+        llm = LLM(model="/home/workspace/mindspore_dataset/weight/DeepSeek-R1-W8A8-smoothquant-newconfig",
+                  trust_remote_code=True, gpu_memory_utilization=0.9, tensor_parallel_size=8, num_scheduler_steps=8)
+        # Generate texts from the prompts. The output is a list of RequestOutput objects
+        # that contain the prompt, generated text, and other information.
+        outputs = llm.generate(prompts, sampling_params)
+        # Print the outputs.
+ for i, output in enumerate(outputs): + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") + assert "博物院" in generated_text + + # unset env + env_manager.unset_all() \ No newline at end of file diff --git a/vllm_mindspore/model_executor/models/mf_models/deepseek_v3.py b/vllm_mindspore/model_executor/models/mf_models/deepseek_v3.py index 67b80339..497e35ef 100644 --- a/vllm_mindspore/model_executor/models/mf_models/deepseek_v3.py +++ b/vllm_mindspore/model_executor/models/mf_models/deepseek_v3.py @@ -257,25 +257,25 @@ class DeepseekV3ForCausalLM(MfModelBase): cfg = PTQConfig(mode=quant_mode, backend=BackendTarget.ASCEND, weight_quant_dtype=msdtype.int8, act_quant_dtype=msdtype.int8, outliers_suppression=OutliersSuppressionType.SMOOTH, opname_blacklist=['lm_head', 'lkv2kv']) - w2_config = PTQConfig(mode=quant_mode, backend=BackendTarget.ASCEND, weight_quant_dtype=msdtype.int8, - act_quant_dtype=msdtype.int8, - outliers_suppression=OutliersSuppressionType.NONE, - precision_recovery=PrecisionRecovery.NONE, - act_quant_granularity=QuantGranularity.PER_TOKEN, - weight_quant_granularity=QuantGranularity.PER_CHANNEL) - layer_policies = OrderedDict({r'.*\.w2.*': w2_config}) + ffn_config = PTQConfig(mode=quant_mode, backend=BackendTarget.ASCEND, weight_quant_dtype=msdtype.int8, + act_quant_dtype=msdtype.int8, + outliers_suppression=OutliersSuppressionType.NONE, + precision_recovery=PrecisionRecovery.NONE, + act_quant_granularity=QuantGranularity.PER_TOKEN, + weight_quant_granularity=QuantGranularity.PER_CHANNEL) + layer_policies = OrderedDict({r'.*\.feed_forward\..*': ffn_config}) elif quant_type.lower() == 'osl': cfg = PTQConfig(mode=quant_mode, backend=BackendTarget.ASCEND, weight_quant_dtype=msdtype.int8, act_quant_dtype=msdtype.int8, outliers_suppression=OutliersSuppressionType.OUTLIER_SUPPRESSION_LITE, opname_blacklist=['lm_head', 'lkv2kv']) - w2_config = PTQConfig(mode=quant_mode, backend=BackendTarget.ASCEND, weight_quant_dtype=msdtype.int8, - act_quant_dtype=msdtype.int8, - outliers_suppression=OutliersSuppressionType.NONE, - precision_recovery=PrecisionRecovery.NONE, - act_quant_granularity=QuantGranularity.PER_TOKEN, - weight_quant_granularity=QuantGranularity.PER_CHANNEL) - layer_policies = OrderedDict({r'.*\.w2.*': w2_config}) + ffn_config = PTQConfig(mode=quant_mode, backend=BackendTarget.ASCEND, weight_quant_dtype=msdtype.int8, + act_quant_dtype=msdtype.int8, + outliers_suppression=OutliersSuppressionType.NONE, + precision_recovery=PrecisionRecovery.NONE, + act_quant_granularity=QuantGranularity.PER_TOKEN, + weight_quant_granularity=QuantGranularity.PER_CHANNEL) + layer_policies = OrderedDict({r'.*\.feed_forward\..*': ffn_config}) elif quant_type.lower() == 'a16w8': cfg = PTQConfig(mode=quant_mode, backend=BackendTarget.ASCEND, weight_quant_dtype=msdtype.int8, opname_blacklist=['lm_head', 'lkv2kv']) @@ -292,14 +292,6 @@ class DeepseekV3ForCausalLM(MfModelBase): if 'awq' in quant_type.lower(): # pylint: disable=protected-access ptq._config.weight_symmetric = False - if 'smoothquant' in quant_type.lower(): - # pylint: disable=protected-access - ptq._config.aclnn_quant_list = ["routed_experts.ffn.w_gate_hidden", "routed_experts.ffn.w1", - "routed_experts.ffn.w3"] - if 'osl' in quant_type.lower(): - # pylint: disable=protected-access - ptq._config.aclnn_quant_list = ["routed_experts.ffn.w_gate_hidden", "routed_experts.ffn.w1", - "routed_experts.ffn.w3"] if 'gptq-pergroup' in quant_type.lower(): # pylint: 
disable=protected-access ptq.layer_policies[r'.*\.feed_forward\.w2.*'].aclnn_quant_list = ["w2"] diff --git a/vllm_mindspore/model_executor/models/mf_models/deepseekv3_weight_processor.py b/vllm_mindspore/model_executor/models/mf_models/deepseekv3_weight_processor.py index b89e699b..28ebe6d6 100644 --- a/vllm_mindspore/model_executor/models/mf_models/deepseekv3_weight_processor.py +++ b/vllm_mindspore/model_executor/models/mf_models/deepseekv3_weight_processor.py @@ -1188,41 +1188,23 @@ class DeepseekV3WeightProcessor(BaseWeightProcessor): ffn_concat = self.config.model.model_config.ffn_concat w1_weight_name = f"model.layers.{layer_id}.{layer_type}.w1._layer.weight" - w1_bias_name = f"model.layers.{layer_id}.{layer_type}.w1._layer.matmul.quant_bias" - w1_scale_name = f"model.layers.{layer_id}.{layer_type}.w1._layer.matmul.dequant_scale" - w1_quant_zp = f"model.layers.{layer_id}.{layer_type}.w1.quant_op.input_zp" - w1_quant_scale = f"model.layers.{layer_id}.{layer_type}.w1.quant_op.input_scale" + w1_scale_name = f"model.layers.{layer_id}.{layer_type}.w1._layer.matmul.weight_scale" w3_weight_name = f"model.layers.{layer_id}.{layer_type}.w3._layer.weight" - w3_bias_name = f"model.layers.{layer_id}.{layer_type}.w3._layer.matmul.quant_bias" - w3_scale_name = f"model.layers.{layer_id}.{layer_type}.w3._layer.matmul.dequant_scale" - w3_quant_zp = f"model.layers.{layer_id}.{layer_type}.w3.quant_op.input_zp" - w3_quant_scale = f"model.layers.{layer_id}.{layer_type}.w3.quant_op.input_scale" + w3_scale_name = f"model.layers.{layer_id}.{layer_type}.w3._layer.matmul.weight_scale" w2_weight_name = f"model.layers.{layer_id}.{layer_type}.w2._layer.weight" w2_scale_name = f"model.layers.{layer_id}.{layer_type}.w2._layer.matmul.weight_scale" w1_weight_param, _ = self.get_routed_safetensor_3_dim(w1_weight_name, src_hf_dir, hf_weight_map, tp_axis=2, split_ep=self.moe_split_ep, split_tp=self.moe_split_tp) - w1_bias_param, _ = self.get_routed_safetensor_2_dim(w1_bias_name, src_hf_dir, hf_weight_map, tp_axis=1, - split_ep=self.moe_split_ep, split_tp=self.moe_split_tp) - w1_scale_param, _ = self.get_routed_safetensor_2_dim(w1_scale_name, src_hf_dir, hf_weight_map, tp_axis=1, split_ep=self.moe_split_ep, split_tp=self.moe_split_tp) - w1_quant_zp_param, _ = self.get_safetensor_from_file(w1_quant_zp, src_hf_dir, hf_weight_map) - w1_quant_scale_param, _ = self.get_safetensor_from_file(w1_quant_scale, src_hf_dir, hf_weight_map) - w3_weight_param, _ = self.get_routed_safetensor_3_dim(w3_weight_name, src_hf_dir, hf_weight_map, tp_axis=2, split_ep=self.moe_split_ep, split_tp=self.moe_split_tp) - w3_bias_param, _ = self.get_routed_safetensor_2_dim(w3_bias_name, src_hf_dir, hf_weight_map, tp_axis=1, - split_ep=self.moe_split_ep, split_tp=self.moe_split_tp) - w3_scale_param, _ = self.get_routed_safetensor_2_dim(w3_scale_name, src_hf_dir, hf_weight_map, tp_axis=1, split_ep=self.moe_split_ep, split_tp=self.moe_split_tp) - w3_quant_zp_param, _ = self.get_safetensor_from_file(w3_quant_zp, src_hf_dir, hf_weight_map) - w3_quant_scale_param, _ = self.get_safetensor_from_file(w3_quant_scale, src_hf_dir, hf_weight_map) - w2_weight_param, _ = self.get_routed_safetensor_3_dim(w2_weight_name, src_hf_dir, hf_weight_map, tp_axis=1, split_ep=self.moe_split_ep, split_tp=self.moe_split_tp) w2_scale_param, _ = self.get_routed_safetensor_2_dim(w2_scale_name, src_hf_dir, hf_weight_map, @@ -1234,26 +1216,10 @@ class DeepseekV3WeightProcessor(BaseWeightProcessor): parameter_dict[concat_weight_name] = ms.Parameter(concat_weight_param, 
name=concat_weight_name, requires_grad=False) - concat_bias_name = f"model.layers.{layer_id}.{layer_type}.w_gate_hidden._layer.matmul.quant_bias" - concat_bias_param = ms.Tensor(np.concatenate([w1_bias_param, w3_bias_param], axis=1), dtype=ms.int32) - parameter_dict[concat_bias_name] = ms.Parameter(concat_bias_param, name=concat_bias_name, - requires_grad=False) - - concat_scale_name = f"model.layers.{layer_id}.{layer_type}.w_gate_hidden._layer.matmul.dequant_scale" + concat_scale_name = f"model.layers.{layer_id}.{layer_type}.w_gate_hidden._layer.matmul.weight_scale" concat_scale_param = ms.Tensor(np.concatenate([w1_scale_param, w3_scale_param], axis=1), dtype=ms.bfloat16) parameter_dict[concat_scale_name] = ms.Parameter(concat_scale_param, name=concat_scale_name, requires_grad=False) - - concat_quant_zp_name = f"model.layers.{layer_id}.{layer_type}.w_gate_hidden.quant_op.input_zp" - concat_quant_zp_param = ms.Tensor(w1_quant_zp_param, dtype=ms.bfloat16) - parameter_dict[concat_quant_zp_name] = ms.Parameter(concat_quant_zp_param, name=concat_quant_zp_name, - requires_grad=False) - - concat_quant_scale_name = f"model.layers.{layer_id}.{layer_type}.w_gate_hidden.quant_op.input_scale" - concat_quant_scale_param = ms.Tensor(w1_quant_scale_param, dtype=ms.bfloat16) - parameter_dict[concat_quant_scale_name] = ms.Parameter(concat_quant_scale_param, - name=concat_quant_scale_name, - requires_grad=False) else: # w1 w3 parameter_dict[w1_weight_name] = ms.Parameter(ms.Tensor(w1_weight_param, ms.int8), name=w1_weight_name, @@ -1261,41 +1227,24 @@ class DeepseekV3WeightProcessor(BaseWeightProcessor): parameter_dict[w3_weight_name] = ms.Parameter(ms.Tensor(w3_weight_param, ms.int8), name=w3_weight_name, requires_grad=False) - parameter_dict[w1_bias_name] = ms.Parameter(ms.Tensor(w1_bias_param, ms.int32), - name=w1_bias_name, requires_grad=False) - parameter_dict[w3_bias_name] = ms.Parameter(ms.Tensor(w3_bias_param, ms.int32), - name=w3_bias_name, requires_grad=False) - parameter_dict[w1_scale_name] = ms.Parameter(ms.Tensor(w1_scale_param, ms.bfloat16), name=w1_scale_name, requires_grad=False) parameter_dict[w3_scale_name] = ms.Parameter(ms.Tensor(w3_scale_param, ms.bfloat16), name=w3_scale_name, requires_grad=False) - parameter_dict[w1_quant_zp] = ms.Parameter(ms.Tensor(w1_quant_zp_param, ms.bfloat16), - name=w1_quant_zp, requires_grad=False) - parameter_dict[w3_quant_zp] = ms.Parameter(ms.Tensor(w3_quant_zp_param, ms.bfloat16), - name=w3_quant_zp, requires_grad=False) - - parameter_dict[w1_quant_scale] = ms.Parameter(ms.Tensor(w1_quant_scale_param, ms.bfloat16), - name=w1_quant_scale, requires_grad=False) - parameter_dict[w3_quant_scale] = ms.Parameter(ms.Tensor(w3_quant_scale_param, ms.bfloat16), - name=w3_quant_scale, requires_grad=False) - parameter_dict[w2_weight_name] = ms.Parameter(ms.Tensor(w2_weight_param, ms.int8), name=w2_weight_name, requires_grad=False) parameter_dict[w2_scale_name] = ms.Parameter(ms.Tensor(w2_scale_param, ms.bfloat16), name=w2_scale_name, requires_grad=False) - def get_smooth_quant_moe_shared_expert_weight(self, w1_weight_name, w1_bias_name, w1_scale_name, w3_weight_name, - w3_bias_name, w3_scale_name, w2_weight_name, src_hf_dir, - hf_weight_map): + def get_smooth_quant_moe_shared_expert_weight(self, w1_weight_name, w1_scale_name, w3_weight_name,w3_scale_name, + w2_weight_name, src_hf_dir, hf_weight_map): + '''get_smooth_quant_moe_shared_expert_weight''' if self.ep_method in [EPMethod.DEFAULT, EPMethod.ALLGATHER]: w1_weight_param, _ = 
self.get_safetensor_from_file_split_moe_tp_group(w1_weight_name, src_hf_dir, hf_weight_map, split_axis=0) - w1_bias_param, _ = self.get_safetensor_from_file_split_moe_tp_group(w1_bias_name, src_hf_dir, hf_weight_map, - split_axis=0) w1_scale_param, _ = self.get_safetensor_from_file_split_moe_tp_group(w1_scale_name, src_hf_dir, hf_weight_map, @@ -1304,8 +1253,6 @@ class DeepseekV3WeightProcessor(BaseWeightProcessor): w3_weight_param, _ = self.get_safetensor_from_file_split_moe_tp_group(w3_weight_name, src_hf_dir, hf_weight_map, split_axis=0) - w3_bias_param, _ = self.get_safetensor_from_file_split_moe_tp_group(w3_bias_name, src_hf_dir, hf_weight_map, - split_axis=0) w3_scale_param, _ = self.get_safetensor_from_file_split_moe_tp_group(w3_scale_name, src_hf_dir, hf_weight_map, split_axis=0) @@ -1315,18 +1262,16 @@ class DeepseekV3WeightProcessor(BaseWeightProcessor): split_axis=1) elif self.ep_method == EPMethod.ALLTOALL: w1_weight_param, _ = self.get_safetensor_from_file(w1_weight_name, src_hf_dir, hf_weight_map) - w1_bias_param, _ = self.get_safetensor_from_file(w1_bias_name, src_hf_dir, hf_weight_map) w1_scale_param, _ = self.get_safetensor_from_file(w1_scale_name, src_hf_dir, hf_weight_map) w3_weight_param, _ = self.get_safetensor_from_file(w3_weight_name, src_hf_dir, hf_weight_map) - w3_bias_param, _ = self.get_safetensor_from_file(w3_bias_name, src_hf_dir, hf_weight_map) w3_scale_param, _ = self.get_safetensor_from_file(w3_scale_name, src_hf_dir, hf_weight_map) w2_weight_param, _ = self.get_safetensor_from_file(w2_weight_name, src_hf_dir, hf_weight_map) else: raise ValueError("Unsupported ep_method:{}".format(self.ep_method)) - return w1_weight_param, w1_bias_param, w1_scale_param, w3_weight_param, w3_bias_param, w3_scale_param, w2_weight_param + return w1_weight_param, w1_scale_param, w3_weight_param, w3_scale_param, w2_weight_param def smooth_quant_process_shared_ffn_weight(self, src_hf_dir, layer_id, hf_weight_map, parameter_dict, layer_type): """smooth_quant_process_shared_ffn_weight""" @@ -1336,57 +1281,26 @@ class DeepseekV3WeightProcessor(BaseWeightProcessor): w2_weight_name = f"model.layers.{layer_id}.{layer_type}.w2._layer.weight" w3_weight_name = f"model.layers.{layer_id}.{layer_type}.w3._layer.weight" - w1_bias_name = f"model.layers.{layer_id}.{layer_type}.w1._layer.matmul.quant_bias" - w3_bias_name = f"model.layers.{layer_id}.{layer_type}.w3._layer.matmul.quant_bias" - - w1_scale_name = f"model.layers.{layer_id}.{layer_type}.w1._layer.matmul.dequant_scale" + w1_scale_name = f"model.layers.{layer_id}.{layer_type}.w1._layer.matmul.weight_scale" w2_scale_name = f"model.layers.{layer_id}.{layer_type}.w2._layer.matmul.weight_scale" - w3_scale_name = f"model.layers.{layer_id}.{layer_type}.w3._layer.matmul.dequant_scale" - - w1_quant_zp = f"model.layers.{layer_id}.{layer_type}.w1.quant_op.input_zp" - w3_quant_zp = f"model.layers.{layer_id}.{layer_type}.w3.quant_op.input_zp" - - w1_quant_scale = f"model.layers.{layer_id}.{layer_type}.w1.quant_op.input_scale" - w3_quant_scale = f"model.layers.{layer_id}.{layer_type}.w3.quant_op.input_scale" - - w1_weight_param, w1_bias_param, w1_scale_param, w3_weight_param, w3_bias_param, w3_scale_param, w2_weight_param = \ - self.get_smooth_quant_moe_shared_expert_weight(w1_weight_name, w1_bias_name, w1_scale_name, w3_weight_name, - w3_bias_name, w3_scale_name, w2_weight_name, src_hf_dir, - hf_weight_map) - - w1_quant_zp_param, _ = self.get_safetensor_from_file(w1_quant_zp, src_hf_dir, hf_weight_map) - w1_quant_scale_param, _ = 
self.get_safetensor_from_file(w1_quant_scale, src_hf_dir, hf_weight_map) + w3_scale_name = f"model.layers.{layer_id}.{layer_type}.w3._layer.matmul.weight_scale" + w1_weight_param, w1_scale_param, w3_weight_param, w3_scale_param, w2_weight_param = \ + self.get_smooth_quant_moe_shared_expert_weight(w1_weight_name, w1_scale_name, w3_weight_name, w3_scale_name, + w2_weight_name, src_hf_dir, hf_weight_map) w2_scale_param, _ = self.get_safetensor_from_file(w2_scale_name, src_hf_dir, hf_weight_map) - w3_quant_zp_param, _ = self.get_safetensor_from_file(w3_quant_zp, src_hf_dir, hf_weight_map) - w3_quant_scale_param, _ = self.get_safetensor_from_file(w3_quant_scale, src_hf_dir, hf_weight_map) if ffn_concat: concat_weight_name = f"model.layers.{layer_id}.{layer_type}.w_gate_hidden._layer.weight" concat_weight_param = ms.Tensor(np.concatenate([w1_weight_param, w3_weight_param], axis=0), dtype=ms.int8) parameter_dict[concat_weight_name] = ms.Parameter(concat_weight_param, name=concat_weight_name, requires_grad=False) - concat_bias_name = f"model.layers.{layer_id}.{layer_type}.w_gate_hidden._layer.matmul.quant_bias" - concat_bias_param = ms.Tensor(np.concatenate([w1_bias_param, w3_bias_param], axis=0), dtype=ms.int32) - parameter_dict[concat_bias_name] = ms.Parameter(concat_bias_param, name=concat_bias_name, - requires_grad=False) - - concat_scale_name = f"model.layers.{layer_id}.{layer_type}.w_gate_hidden._layer.matmul.dequant_scale" - concat_scale_param = ms.Tensor(np.concatenate([w1_scale_param, w3_scale_param], axis=0), dtype=ms.float32) + concat_scale_name = f"model.layers.{layer_id}.{layer_type}.w_gate_hidden._layer.matmul.weight_scale" + concat_scale_param = ms.Tensor(np.concatenate([w1_scale_param, w3_scale_param], axis=0), dtype=ms.bfloat16) parameter_dict[concat_scale_name] = ms.Parameter(concat_scale_param, name=concat_scale_name, requires_grad=False) - concat_quant_zp_name = f"model.layers.{layer_id}.{layer_type}.w_gate_hidden.quant_op.input_zp" - concat_quant_zp_param = ms.Tensor(w1_quant_zp_param, dtype=ms.int8) - parameter_dict[concat_quant_zp_name] = ms.Parameter(concat_quant_zp_param, name=concat_quant_zp_name, - requires_grad=False) - - concat_quant_scale_name = f"model.layers.{layer_id}.{layer_type}.w_gate_hidden.quant_op.input_scale" - concat_quant_scale_param = ms.Tensor(w1_quant_scale_param, dtype=ms.bfloat16) - parameter_dict[concat_quant_scale_name] = ms.Parameter(concat_quant_scale_param, - name=concat_quant_scale_name, - requires_grad=False) else: # w1 w3 parameter_dict[w1_weight_name] = ms.Parameter(ms.Tensor(w1_weight_param, ms.int8), name=w1_weight_name, @@ -1394,25 +1308,11 @@ class DeepseekV3WeightProcessor(BaseWeightProcessor): parameter_dict[w3_weight_name] = ms.Parameter(ms.Tensor(w3_weight_param, ms.int8), name=w3_weight_name, requires_grad=False) - parameter_dict[w1_bias_name] = ms.Parameter(ms.Tensor(w1_bias_param, ms.int32), - name=w1_bias_name, requires_grad=False) - parameter_dict[w3_bias_name] = ms.Parameter(ms.Tensor(w3_bias_param, ms.int32), - name=w3_bias_name, requires_grad=False) - - parameter_dict[w1_scale_name] = ms.Parameter(ms.Tensor(w1_scale_param, ms.float32), + parameter_dict[w1_scale_name] = ms.Parameter(ms.Tensor(w1_scale_param, ms.bfloat16), name=w1_scale_name, requires_grad=False) - parameter_dict[w3_scale_name] = ms.Parameter(ms.Tensor(w3_scale_param, ms.float32), + parameter_dict[w3_scale_name] = ms.Parameter(ms.Tensor(w3_scale_param, ms.bfloat16), name=w3_scale_name, requires_grad=False) - parameter_dict[w1_quant_zp] = 
ms.Parameter(ms.Tensor(w1_quant_zp_param, ms.int8), - name=w1_quant_zp, requires_grad=False) - parameter_dict[w3_quant_zp] = ms.Parameter(ms.Tensor(w3_quant_zp_param, ms.int8), - name=w3_quant_zp, requires_grad=False) - - parameter_dict[w1_quant_scale] = ms.Parameter(ms.Tensor(w1_quant_scale_param, ms.bfloat16), - name=w1_quant_scale, requires_grad=False) - parameter_dict[w3_quant_scale] = ms.Parameter(ms.Tensor(w3_quant_scale_param, ms.bfloat16), - name=w3_quant_scale, requires_grad=False) parameter_dict[w2_weight_name] = ms.Parameter(ms.Tensor(w2_weight_param, ms.int8), name=w2_weight_name, requires_grad=False) parameter_dict[w2_scale_name] = ms.Parameter(ms.Tensor(w2_scale_param, ms.bfloat16), @@ -1426,26 +1326,15 @@ class DeepseekV3WeightProcessor(BaseWeightProcessor): w1_weight_param, _ = self.get_safetensor_from_file_split_tp_group(w1_weight_name, src_hf_dir, hf_weight_map, split_axis=0) - w1_bias_name = f"model.layers.{layer_id}.{layer_type}.w1._layer.matmul.quant_bias" - w1_bias_param, _ = self.get_safetensor_from_file_split_tp_group(w1_bias_name, src_hf_dir, hf_weight_map, - split_axis=0) - w1_scale_name = f"model.layers.{layer_id}.{layer_type}.w1._layer.matmul.dequant_scale" + w1_scale_name = f"model.layers.{layer_id}.{layer_type}.w1._layer.matmul.weight_scale" w1_scale_param, _ = self.get_safetensor_from_file_split_tp_group(w1_scale_name, src_hf_dir, hf_weight_map, split_axis=0) - w1_quant_zp = f"model.layers.{layer_id}.{layer_type}.w1.quant_op.input_zp" - w1_quant_scale = f"model.layers.{layer_id}.{layer_type}.w1.quant_op.input_scale" - w1_quant_zp_param, _ = self.get_safetensor_from_file(w1_quant_zp, src_hf_dir, hf_weight_map) - w1_quant_scale_param, _ = self.get_safetensor_from_file(w1_quant_scale, src_hf_dir, hf_weight_map) - w3_weight_name = f"model.layers.{layer_id}.{layer_type}.w3._layer.weight" w3_weight_param, _ = self.get_safetensor_from_file_split_tp_group(w3_weight_name, src_hf_dir, hf_weight_map, split_axis=0) - w3_bias_name = f"model.layers.{layer_id}.{layer_type}.w3._layer.matmul.quant_bias" - w3_bias_param, _ = self.get_safetensor_from_file_split_tp_group(w3_bias_name, src_hf_dir, hf_weight_map, - split_axis=0) - w3_scale_name = f"model.layers.{layer_id}.{layer_type}.w3._layer.matmul.dequant_scale" + w3_scale_name = f"model.layers.{layer_id}.{layer_type}.w3._layer.matmul.weight_scale" w3_scale_param, _ = self.get_safetensor_from_file_split_tp_group(w3_scale_name, src_hf_dir, hf_weight_map, split_axis=0) w2_weight_name = f"model.layers.{layer_id}.{layer_type}.w2._layer.weight" @@ -1454,36 +1343,16 @@ class DeepseekV3WeightProcessor(BaseWeightProcessor): split_axis=1) w2_scale_param, _ = self.get_safetensor_from_file(w2_scale_name, src_hf_dir, hf_weight_map) - w3_quant_zp = f"model.layers.{layer_id}.{layer_type}.w3.quant_op.input_zp" - w3_quant_scale = f"model.layers.{layer_id}.{layer_type}.w3.quant_op.input_scale" - w3_quant_zp_param, _ = self.get_safetensor_from_file(w3_quant_zp, src_hf_dir, hf_weight_map) - w3_quant_scale_param, _ = self.get_safetensor_from_file(w3_quant_scale, src_hf_dir, hf_weight_map) if ffn_concat: concat_weight_name = f"model.layers.{layer_id}.{layer_type}.w_gate_hidden._layer.weight" concat_weight_param = ms.Tensor(np.concatenate([w1_weight_param, w3_weight_param], axis=0), dtype=ms.int8) parameter_dict[concat_weight_name] = ms.Parameter(concat_weight_param, name=concat_weight_name, requires_grad=False) - concat_bias_name = f"model.layers.{layer_id}.{layer_type}.w_gate_hidden._layer.matmul.quant_bias" - concat_bias_param = 
ms.Tensor(np.concatenate([w1_bias_param, w3_bias_param], axis=0), dtype=ms.int32) - parameter_dict[concat_bias_name] = ms.Parameter(concat_bias_param, name=concat_bias_name, - requires_grad=False) - - concat_scale_name = f"model.layers.{layer_id}.{layer_type}.w_gate_hidden._layer.matmul.dequant_scale" - concat_scale_param = ms.Tensor(np.concatenate([w1_scale_param, w3_scale_param], axis=0), dtype=ms.float32) + concat_scale_name = f"model.layers.{layer_id}.{layer_type}.w_gate_hidden._layer.matmul.weight_scale" + concat_scale_param = ms.Tensor(np.concatenate([w1_scale_param, w3_scale_param], axis=0), dtype=ms.bfloat16) parameter_dict[concat_scale_name] = ms.Parameter(concat_scale_param, name=concat_scale_name, requires_grad=False) - - concat_quant_zp_name = f"model.layers.{layer_id}.{layer_type}.w_gate_hidden.quant_op.input_zp" - concat_quant_zp_param = ms.Tensor(w1_quant_zp_param, dtype=ms.int8) - parameter_dict[concat_quant_zp_name] = ms.Parameter(concat_quant_zp_param, name=concat_quant_zp_name, - requires_grad=False) - - concat_quant_scale_name = f"model.layers.{layer_id}.{layer_type}.w_gate_hidden.quant_op.input_scale" - concat_quant_scale_param = ms.Tensor(w1_quant_scale_param, dtype=ms.bfloat16) - parameter_dict[concat_quant_scale_name] = ms.Parameter(concat_quant_scale_param, - name=concat_quant_scale_name, - requires_grad=False) else: # w1 w3 parameter_dict[w1_weight_name] = ms.Parameter(ms.Tensor(w1_weight_param, ms.int8), name=w1_weight_name, @@ -1491,26 +1360,11 @@ class DeepseekV3WeightProcessor(BaseWeightProcessor): parameter_dict[w3_weight_name] = ms.Parameter(ms.Tensor(w3_weight_param, ms.int8), name=w3_weight_name, requires_grad=False) - parameter_dict[w1_bias_name] = ms.Parameter(ms.Tensor(w1_bias_param, ms.int32), - name=w1_bias_name, requires_grad=False) - parameter_dict[w3_bias_name] = ms.Parameter(ms.Tensor(w3_bias_param, ms.int32), - name=w3_bias_name, requires_grad=False) - - parameter_dict[w1_scale_name] = ms.Parameter(ms.Tensor(w1_scale_param, ms.float32), + parameter_dict[w1_scale_name] = ms.Parameter(ms.Tensor(w1_scale_param, ms.bfloat16), name=w1_scale_name, requires_grad=False) - parameter_dict[w3_scale_name] = ms.Parameter(ms.Tensor(w3_scale_param, ms.float32), + parameter_dict[w3_scale_name] = ms.Parameter(ms.Tensor(w3_scale_param, ms.bfloat16), name=w3_scale_name, requires_grad=False) - parameter_dict[w1_quant_zp] = ms.Parameter(ms.Tensor(w1_quant_zp_param, ms.int8), - name=w1_quant_zp, requires_grad=False) - parameter_dict[w3_quant_zp] = ms.Parameter(ms.Tensor(w3_quant_zp_param, ms.int8), - name=w3_quant_zp, requires_grad=False) - - parameter_dict[w1_quant_scale] = ms.Parameter(ms.Tensor(w1_quant_scale_param, ms.bfloat16), - name=w1_quant_scale, requires_grad=False) - parameter_dict[w3_quant_scale] = ms.Parameter(ms.Tensor(w3_quant_scale_param, ms.bfloat16), - name=w3_quant_scale, requires_grad=False) - parameter_dict[w2_weight_name] = ms.Parameter(ms.Tensor(w2_weight_param, ms.int8), name=w2_weight_name, requires_grad=False) parameter_dict[w2_scale_name] = ms.Parameter(ms.Tensor(w2_scale_param, ms.bfloat16), -- Gitee From db69fe94d804d9332eb2ea0382a76d50fee9cc09 Mon Sep 17 00:00:00 2001 From: lvhaoyu Date: Fri, 25 Apr 2025 12:10:24 +0800 Subject: [PATCH 40/55] Improve performence --- vllm_mindspore/__init__.py | 13 +++++- vllm_mindspore/model_executor/layers/utils.py | 23 ++++++---- .../models/mf_models/mf_model_base.py | 4 +- vllm_mindspore/platforms/ascend.py | 5 +-- .../v1/attention/backends/flash_attn.py | 11 +---- .../v1/sample/ops/topk_topp_sampler.py 
| 44 ++++++++++++++++++- vllm_mindspore/v1/utils.py | 18 ++++++-- vllm_mindspore/v1/worker/block_table.py | 2 - vllm_mindspore/v1/worker/gpu_input_batch.py | 22 ++++------ 9 files changed, 98 insertions(+), 44 deletions(-) diff --git a/vllm_mindspore/__init__.py b/vllm_mindspore/__init__.py index 224ae26e..98970a5f 100644 --- a/vllm_mindspore/__init__.py +++ b/vllm_mindspore/__init__.py @@ -291,12 +291,21 @@ vllm.v1.worker.gpu_input_batch.copy_slice = copy_slice from vllm_mindspore.v1.sample.ops.penalties import _convert_to_tensors import vllm.v1.sample.ops.penalties vllm.v1.sample.ops.penalties._convert_to_tensors = _convert_to_tensors +import vllm.model_executor.layers.utils +from vllm_mindspore.model_executor.layers.utils import apply_penalties +vllm.model_executor.layers.utils.apply_penalties = apply_penalties +vllm.v1.sample.ops.penalties.apply_penalties = apply_penalties + + +from vllm_mindspore.v1.sample.ops.topk_topp_sampler import apply_top_k_top_p, random_sample, \ + apply_top_k_only, topk_topp_sampler_forward_native -from vllm_mindspore.v1.sample.ops.topk_topp_sampler import apply_top_k_top_p, random_sample import vllm.v1.sample.ops.topk_topp_sampler +from vllm.v1.sample.ops.topk_topp_sampler import TopKTopPSampler +TopKTopPSampler.forward_native = topk_topp_sampler_forward_native vllm.v1.sample.ops.topk_topp_sampler.apply_top_k_top_p = apply_top_k_top_p vllm.v1.sample.ops.topk_topp_sampler.random_sample = random_sample - +vllm.v1.sample.ops.topk_topp_sampler.apply_top_k_only = apply_top_k_only from vllm_mindspore.v1.sample.sampler import apply_temperature import vllm.v1.sample.sampler vllm.v1.sample.sampler.Sampler.apply_temperature = apply_temperature diff --git a/vllm_mindspore/model_executor/layers/utils.py b/vllm_mindspore/model_executor/layers/utils.py index 0edf165c..3a95175a 100644 --- a/vllm_mindspore/model_executor/layers/utils.py +++ b/vllm_mindspore/model_executor/layers/utils.py @@ -18,6 +18,7 @@ """Utility methods for model layers.""" from typing import Tuple import torch +import mindspore as ms def get_token_bin_counts_and_mask( tokens: torch.Tensor, @@ -35,13 +36,14 @@ def get_token_bin_counts_and_mask( return bin_counts, mask + def apply_penalties(logits: torch.Tensor, prompt_tokens_tensor: torch.Tensor, output_tokens_tensor: torch.Tensor, presence_penalties: torch.Tensor, frequency_penalties: torch.Tensor, repetition_penalties: torch.Tensor) -> torch.Tensor: """ - Applies penalties in place to the logits tensor + Applies penalties out of place implement to imporve performance. logits : The input logits tensor of shape [num_seqs, vocab_size] prompt_tokens_tensor: A tensor containing the prompt tokens. 
The prompts are padded to the maximum prompt length within the batch using @@ -60,13 +62,18 @@ def apply_penalties(logits: torch.Tensor, prompt_tokens_tensor: torch.Tensor, vocab_size, num_seqs) output_bin_counts, output_mask = get_token_bin_counts_and_mask( output_tokens_tensor, vocab_size, num_seqs) - # repetition_penalties = repetition_penalties.unsqueeze_(dim=1).repeat( - # 1, vocab_size) - repetition_penalties = repetition_penalties.unsqueeze(dim=1).repeat(1, vocab_size) - logits[logits > 0] /= torch.where(prompt_mask | output_mask, - repetition_penalties, 1.0)[logits > 0] - logits[logits <= 0] *= torch.where(prompt_mask | output_mask, - repetition_penalties, 1.0)[logits <= 0] + + # use 'broadcast_to' to replace 'tensor.repeat' to imporve performance + # when tensor shape is (num,seqs, 1), then 'tensor.repeat(1, vocab_size)' + # is equal to 'broadcast_to(tensor, (num_seqs, vocab_size))' + repetition_penalties = ms.mint.broadcast_to(repetition_penalties.unsqueeze(dim=1), + (num_seqs, vocab_size)) + + # use out of place computation instead of inplace setitem to improve performance + # 'tensor[tensor > 0]' will result in setitem, which is slow. + mask = prompt_mask | output_mask + logits = torch.where(mask & (logits > 0), logits / repetition_penalties, logits) + logits = torch.where(mask & (logits <= 0), logits * repetition_penalties, logits) # We follow the definition in OpenAI API. # Refer to https://platform.openai.com/docs/api-reference/parameter-details logits -= frequency_penalties.unsqueeze(dim=1) * output_bin_counts diff --git a/vllm_mindspore/model_executor/models/mf_models/mf_model_base.py b/vllm_mindspore/model_executor/models/mf_models/mf_model_base.py index 2af6103d..ecc764be 100644 --- a/vllm_mindspore/model_executor/models/mf_models/mf_model_base.py +++ b/vllm_mindspore/model_executor/models/mf_models/mf_model_base.py @@ -266,10 +266,10 @@ class MfModelBase(MsModelBase): else: hidden_states = hidden_states.index_select(0, selected_token_indices) logits = self.lm_head(hidden_states) - logits = logits.reshape(-1, logits.shape[-1]) + logits = logits.view(-1, logits.shape[-1]) else: logits = self.lm_head(hidden_states) - logits = logits.reshape(-1, logits.shape[-1]) + logits = logits.view(-1, logits.shape[-1]) return logits def sample( diff --git a/vllm_mindspore/platforms/ascend.py b/vllm_mindspore/platforms/ascend.py index dddce58d..356a33a0 100644 --- a/vllm_mindspore/platforms/ascend.py +++ b/vllm_mindspore/platforms/ascend.py @@ -99,9 +99,8 @@ class AscendPlatform(Platform): if cache_config and cache_config.block_size is None: cache_config.block_size = 16 - - # if envs.VLLM_USE_V1: - # vllm_config.model_config.enforce_eager = True + model_config = vllm_config.model_config + model_config.disable_cascade_attn = True @classmethod def get_attn_backend_cls(cls, selected_backend, head_size, dtype, kv_cache_dtype, block_size, use_v1, use_mla): diff --git a/vllm_mindspore/v1/attention/backends/flash_attn.py b/vllm_mindspore/v1/attention/backends/flash_attn.py index 77f6d726..b5c5629e 100644 --- a/vllm_mindspore/v1/attention/backends/flash_attn.py +++ b/vllm_mindspore/v1/attention/backends/flash_attn.py @@ -122,10 +122,6 @@ class FlashAttentionMetadata: def __getitem__(self, key): if key == "batch_valid_length": key = "seq_lens" - if key == "block_tables": - if getattr(self, key).ndim == 1: - return mutable(getattr(self, key).expand_dims(0)) - return mutable(getattr(self, key)) return getattr(self, key) @@ -206,21 +202,18 @@ class FlashAttentionMetadataBuilder: def build(self, 
num_reqs: int, num_actual_tokens: int, max_query_len: int, common_prefix_len: int): + # do not manually call 'tensor.move_to("Ascend", blocking=False)' here, + # because it will cause a certain amount of host time. query_start_loc = ms.from_numpy(self.runner.query_start_loc_np[:num_reqs + 1]) - query_start_loc.move_to("Ascend", blocking=False) max_context_lens = self.runner.input_batch.num_computed_tokens_cpu[:num_reqs].max() slot_mapping = ms.from_numpy(self.runner.slot_mapping_np[:num_actual_tokens]) - slot_mapping.move_to("Ascend", blocking=False) seq_lens_np = self.runner.seq_lens_np[:num_reqs] max_seq_len = seq_lens_np.max() seq_lens = ms.from_numpy(seq_lens_np) - seq_lens.move_to("Ascend", blocking=False) context_lens = ms.from_numpy(self.runner.input_batch.num_computed_tokens_cpu[:num_reqs]) - context_lens.move_to("Ascend", blocking=False) q_seq_lens_np = np.diff(self.runner.query_start_loc_np[:num_reqs + 1]) q_seq_lens = ms.from_numpy(q_seq_lens_np) - q_seq_lens.move_to("Ascend", blocking=False) attn_metadata = FlashAttentionMetadata( seq_lens=seq_lens, diff --git a/vllm_mindspore/v1/sample/ops/topk_topp_sampler.py b/vllm_mindspore/v1/sample/ops/topk_topp_sampler.py index 8b0835c0..cbd218a2 100644 --- a/vllm_mindspore/v1/sample/ops/topk_topp_sampler.py +++ b/vllm_mindspore/v1/sample/ops/topk_topp_sampler.py @@ -1,6 +1,31 @@ from typing import Optional import torch -from vllm.v1.sample.ops.topk_topp_sampler import apply_top_k_only +from mindspore import mint + + +def apply_top_k_top_p_ms(logits, k, p): + """ + Apply top-k and top-p masks to the logits for high performance. + which is reference from 'apply_top_k_top_p_tpu' in vllm. + """ + if k is not None: + # use `apply_top_k_only` defined in this file. + logits = apply_top_k_only(logits, k) + + if p is not None: + probs = logits.softmax(dim=-1) + probs_sort, _ = mint.sort(probs, dim=-1, descending=False) + cumprob = mint.cumsum(probs_sort, dim=-1) + top_p_mask = cumprob <= 1 - p.unsqueeze(dim=1) + top_p_mask[:, -1] = False # at least one + + top_p_count = top_p_mask.sum(dim=-1).unsqueeze(1) + top_p_cutoff = probs_sort.gather(-1, top_p_count) + elements_to_discard = probs < top_p_cutoff + logits.masked_fill_(elements_to_discard, -float("inf")) + + return logits + def random_sample( probs: torch.Tensor, @@ -29,6 +54,18 @@ def random_sample( return probs.argmax(dim=-1).view(-1) +def topk_topp_sampler_forward_native( + self, + logits: torch.Tensor, + generators: dict[int, torch.Generator], + k: Optional[torch.Tensor], + p: Optional[torch.Tensor], +) -> torch.Tensor: + logits = apply_top_k_top_p_ms(logits, k, p) + probs = logits.softmax(dim=-1, dtype=torch.float32) + return random_sample(probs, generators) + + def apply_top_k_top_p( logits: torch.Tensor, k: Optional[torch.Tensor], @@ -87,11 +124,14 @@ def apply_top_k_only( # Set non-top-k rows to 1 so that we can gather. k = k.masked_fill(no_top_k_mask, 1) max_top_k = k.max() - int_max_top_k = max_top_k.item() # topk.values tensor has shape [batch_size, max_top_k]. # Convert top k to 0-based index in range [0, max_top_k). k_index = k.sub_(1).unsqueeze(1).expand(logits.shape[0], 1) + # tensor.item() will cause GPU-CPU Sync, so place as later as possible. + # can be deleted after logits.topk() support tensor-type input. + int_max_top_k = max_top_k.item() + top_k_mask = logits.topk(int_max_top_k, dim=1)[0].gather(1, k_index.long()) # Handle non-topk rows. 
top_k_mask.masked_fill_(no_top_k_mask.unsqueeze(1), -float("inf")) diff --git a/vllm_mindspore/v1/utils.py b/vllm_mindspore/v1/utils.py index c13c292d..daa6718c 100644 --- a/vllm_mindspore/v1/utils.py +++ b/vllm_mindspore/v1/utils.py @@ -1,7 +1,19 @@ +import numpy as np import torch +import mindspore as ms + +def _copy_slice_from_np(from_np: np.ndarray, to_tensor: torch.Tensor, + length: int) -> None: + """ + Copy the first length elements of a numpy array into a tensor in a + non-blocking manner. + """ + to_tensor[:length] = ms.from_numpy(from_np[:length]) + return to_tensor + def copy_slice(from_tensor: torch.Tensor, to_tensor: torch.Tensor, - length: int) -> None: + length: int, *, return_tensor=True) -> None: """ Copy the first length elements of a tensor into another tensor in a non-blocking manner. @@ -9,5 +21,5 @@ def copy_slice(from_tensor: torch.Tensor, to_tensor: torch.Tensor, Used to copy pinned CPU tensor data to pre-allocated GPU tensors. """ to_tensor[:length] = from_tensor[:length] - return to_tensor - + if return_tensor: + return to_tensor[:length] diff --git a/vllm_mindspore/v1/worker/block_table.py b/vllm_mindspore/v1/worker/block_table.py index b865bae3..d4563445 100644 --- a/vllm_mindspore/v1/worker/block_table.py +++ b/vllm_mindspore/v1/worker/block_table.py @@ -69,8 +69,6 @@ class BlockTable: self.block_table_np[[src, tgt]] = self.block_table_np[[tgt, src]] def commit(self, num_reqs: int) -> None: - self.block_table_cpu[:num_reqs] = torch.from_numpy(self.block_table_np[:num_reqs]) - # self.block_table[:num_reqs] = self.block_table_cpu[:num_reqs] self.block_table[:num_reqs] = torch.from_numpy(self.block_table_np[:num_reqs]) def clear(self) -> None: diff --git a/vllm_mindspore/v1/worker/gpu_input_batch.py b/vllm_mindspore/v1/worker/gpu_input_batch.py index 52c13344..a1a19156 100644 --- a/vllm_mindspore/v1/worker/gpu_input_batch.py +++ b/vllm_mindspore/v1/worker/gpu_input_batch.py @@ -3,11 +3,8 @@ from typing import Dict, List, Optional, Set, Tuple, cast import numpy as np import torch -from vllm.lora.request import LoRARequest -from vllm.sampling_params import SamplingType from vllm.v1.sample.metadata import SamplingMetadata -from vllm_mindspore.v1.utils import copy_slice -from vllm.v1.worker.block_table import BlockTable +from vllm_mindspore.v1.utils import _copy_slice_from_np, copy_slice _SAMPLING_EPS = 1e-5 @@ -15,26 +12,26 @@ _SAMPLING_EPS = 1e-5 def _make_sampling_metadata(self) -> SamplingMetadata: num_reqs = self.num_reqs if not self.all_greedy: - temperature = copy_slice(torch.from_numpy(self.temperature_cpu), self.temperature, num_reqs) + temperature = _copy_slice_from_np(self.temperature_cpu, self.temperature, num_reqs) temperature = temperature[:num_reqs] else: temperature = None if not self.no_top_p: - copy_slice(torch.from_numpy(self.top_p_cpu), self.top_p, num_reqs) + _copy_slice_from_np(self.top_p_cpu, self.top_p, num_reqs) if not self.no_top_k: - copy_slice(torch.from_numpy(self.top_k_cpu), self.top_k, num_reqs) + _copy_slice_from_np(self.top_k_cpu, self.top_k, num_reqs) if not self.no_min_p: - copy_slice(torch.from_numpy(self.min_p_cpu), self.min_p, num_reqs) + _copy_slice_from_np(self.min_p_cpu, self.min_p, num_reqs) if not self.no_penalties: # Since syncing these tensors is expensive only copy them # if necessary i.e. if there are requests which require # penalties to be applied during sampling. 
- copy_slice(torch.from_numpy(self.frequency_penalties_cpu), + _copy_slice_from_np(self.frequency_penalties_cpu, self.frequency_penalties, num_reqs) - copy_slice(torch.from_numpy(self.presence_penalties_cpu), + _copy_slice_from_np(self.presence_penalties_cpu, self.presence_penalties, num_reqs) - copy_slice(torch.from_numpy(self.repetition_penalties_cpu), + _copy_slice_from_np(self.repetition_penalties_cpu, self.repetition_penalties, num_reqs) # The prompt tokens are used only for applying penalties during @@ -48,7 +45,7 @@ def _make_sampling_metadata(self) -> SamplingMetadata: if not self.no_allowed_token_ids: assert self.allowed_token_ids_mask is not None copy_slice(self.allowed_token_ids_mask_cpu_tensor, - self.allowed_token_ids_mask, num_reqs) + self.allowed_token_ids_mask, num_reqs, return_tensor=False) allowed_token_ids_mask = self.allowed_token_ids_mask[:num_reqs] return SamplingMetadata( @@ -81,6 +78,5 @@ def _make_prompt_token_ids_tensor(self) -> torch.Tensor: for i in range(self.num_reqs): prompt_token_ids[i, self.num_prompt_tokens[i]:] = self.vocab_size prompt_token_ids_cpu_tensor = torch.from_numpy(prompt_token_ids) - prompt_token_ids_cpu_tensor.move_to("Ascend", blocking=False) return prompt_token_ids_cpu_tensor -- Gitee From 227411f6f1cb728dfd07fcf05a8ccecc1132f072 Mon Sep 17 00:00:00 2001 From: wusimin Date: Tue, 20 May 2025 17:55:19 +0800 Subject: [PATCH 41/55] =?UTF-8?q?[0.8.3=20v1]=E9=80=82=E9=85=8DV1=E5=8E=9F?= =?UTF-8?q?=E7=94=9FQwen?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- tests/st/python/test_vllm_qwen_7b_v1.py | 78 ++++++++++++++ vllm_mindspore/attention/ops/paged_attn.py | 6 +- .../model_executor/layers/linear.py | 22 +++- .../model_executor/layers/logits_processor.py | 17 ++- .../layers/vocab_parallel_embedding.py | 2 +- .../model_loader/weight_utils.py | 8 +- .../model_executor/models/model_base.py | 4 +- vllm_mindspore/model_executor/models/qwen2.py | 101 +++++++++++++----- 8 files changed, 186 insertions(+), 52 deletions(-) create mode 100644 tests/st/python/test_vllm_qwen_7b_v1.py diff --git a/tests/st/python/test_vllm_qwen_7b_v1.py b/tests/st/python/test_vllm_qwen_7b_v1.py new file mode 100644 index 00000000..cdc37bab --- /dev/null +++ b/tests/st/python/test_vllm_qwen_7b_v1.py @@ -0,0 +1,78 @@ +#!/usr/bin/env python3 +# encoding: utf-8 +# Copyright 2025 Huawei Technologies Co., Ltd +# Copyright 2024 The vLLM team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://wwww.apache.org/licenses/LICENSE-2.0 +# +# Unless required by application law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +"""test vllm qwen.""" +import pytest +import os +from . 
import set_env +env_manager = set_env.EnvVarManager() +# def env +env_vars = { + "ASCEND_CUSTOM_PATH": os.path.expandvars("$ASCEND_HOME_PATH/../"), + "MS_ENABLE_LCCL": "off", + "HCCL_OP_EXPANSION_MODE": "AIV", + "ASCEND_RT_VISIBLE_DEVICES": "0,1", + "MS_ALLOC_CONF": "enable_vmm:True", + "LCCL_DETERMINISTIC": "1", + "HCCL_DETERMINISTIC": "true", + "ATB_MATMUL_SHUFFLE_K_ENABLE": "0", + "ATB_LLM_LCOC_ENABLE": "0", + "VLLM_USE_V1": "1", +} +# set env +env_manager.setup_ai_environment(env_vars) +import vllm_mindspore +from vllm import LLM, SamplingParams + + +class TestQwen: + """ + Test Qwen. + """ + @pytest.mark.level0 + @pytest.mark.platform_arm_ascend910b_training + @pytest.mark.env_single + @pytest.mark.skip(reason="qwen need to be supported on new MS package") + def test_vllm_qwen(self): + """ + test case qwen2.5 7B + """ + + # Sample prompts. + prompts = [ + "You are a helpful assistant.<|User|>将文本分类为中性、负面或正面。 \n文本:我认为这次假期还可以。 \n情感:<|Assistant|>\n", + ] + + # Create a sampling params object. + sampling_params = SamplingParams(temperature=0.0, max_tokens=10, top_k=1) + + # Create an LLM. + llm = LLM(model="/home/workspace/mindspore_dataset/weight/Qwen2.5-7B-Instruct", + gpu_memory_utilization=0.9, tensor_parallel_size=2) + # Generate texts from the prompts. The output is a list of RequestOutput objects + # that contain the prompt, generated text, and other information. + outputs = llm.generate(prompts, sampling_params) + except_list=['中性<|Assistant|> 这句话'] + # Print the outputs. + for i, output in enumerate(outputs): + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") + assert generated_text == except_list[i] + + # unset env + env_manager.unset_all() diff --git a/vllm_mindspore/attention/ops/paged_attn.py b/vllm_mindspore/attention/ops/paged_attn.py index df9394c7..0d8a70c7 100644 --- a/vllm_mindspore/attention/ops/paged_attn.py +++ b/vllm_mindspore/attention/ops/paged_attn.py @@ -221,7 +221,6 @@ class PagedAttention: block_tables: torch.Tensor, query_start_loc: torch.Tensor, seq_lens_tensor: torch.Tensor, - context_lens: torch.Tensor, max_query_len: int, alibi_slopes: Optional[torch.Tensor], sliding_window: Optional[int], @@ -229,6 +228,7 @@ class PagedAttention: v_scale: float, ) -> torch.Tensor: output = torch.empty_like(query) + max_seq_len = None context_attention_fwd( query, key, @@ -239,9 +239,9 @@ class PagedAttention: value_cache, block_tables, # query_start_loc is (batch_size + 1,) - query_start_loc[:-1], + query_start_loc, seq_lens_tensor, - context_lens, + max_seq_len, max_query_len, k_scale, v_scale, diff --git a/vllm_mindspore/model_executor/layers/linear.py b/vllm_mindspore/model_executor/layers/linear.py index 45aa4c43..572f0e34 100644 --- a/vllm_mindspore/model_executor/layers/linear.py +++ b/vllm_mindspore/model_executor/layers/linear.py @@ -160,6 +160,8 @@ class LinearBase(ms.nn.Cell): params_dtype=None, quant_config: Optional[QuantizationConfig] = None, prefix: str = "", + *, + return_bias: bool = True, ): super().__init__() @@ -175,6 +177,7 @@ class LinearBase(ms.nn.Cell): self.quant_method: Optional[QuantizeMethodBase] = UnquantizedLinearMethod() else: self.quant_method = quant_config.get_quant_method(self, prefix=prefix) + self.return_bias = return_bias def construct(self, x: ms.Tensor) -> ms.Tensor: raise NotImplementedError @@ -195,9 +198,11 @@ class ColumnParallelLinear(LinearBase): quant_config: Optional[QuantizationConfig] = None, output_sizes: Optional[List[int]] = None, prefix: 
str = "", + *, + return_bias: bool = True, ): super().__init__( - input_size, output_size, skip_bias_add, params_dtype, quant_config, prefix + input_size, output_size, skip_bias_add, params_dtype, quant_config, prefix, return_bias=return_bias ) self.gather_output = gather_output @@ -256,6 +261,8 @@ class ColumnParallelLinear(LinearBase): else: output = output_parallel output_bias = self.bias if self.skip_bias_add else None + if not self.return_bias: + return output return output, output_bias def weight_loader(self, param, loaded_weight): @@ -326,6 +333,8 @@ class MergedColumnParallelLinear(ColumnParallelLinear): params_dtype=None, quant_config: Optional[QuantizationConfig] = None, prefix: str = "", + *, + return_bias: bool = True ): self.output_sizes = output_sizes tp_size = get_tensor_model_parallel_world_size() @@ -339,6 +348,7 @@ class MergedColumnParallelLinear(ColumnParallelLinear): params_dtype=params_dtype, quant_config=quant_config, prefix=prefix, + return_bias=return_bias ) def weight_loader( @@ -396,6 +406,8 @@ class QKVParallelLinear(ColumnParallelLinear): params_dtype=None, quant_config: Optional[QuantizationConfig] = None, prefix: str = "", + *, + return_bias: bool = True, ): self.hidden_size = hidden_size self.head_size = head_size @@ -431,6 +443,7 @@ class QKVParallelLinear(ColumnParallelLinear): params_dtype=params_dtype, quant_config=quant_config, prefix=prefix, + return_bias=return_bias ) def weight_loader(self, param, loaded_weight, loaded_shard_id): @@ -494,9 +507,11 @@ class RowParallelLinear(LinearBase): reduce_results: bool = True, quant_config: Optional[QuantizationConfig] = None, prefix: str = "", + *, + return_bias: bool = True, ): super().__init__( - input_size, output_size, skip_bias_add, params_dtype, quant_config, prefix + input_size, output_size, skip_bias_add, params_dtype, quant_config, prefix, return_bias=return_bias ) # Divide the weight matrix along the last dimension. @@ -566,7 +581,8 @@ class RowParallelLinear(LinearBase): output = output_parallel output_bias = self.bias if self.skip_bias_add else None - + if not self.return_bias: + return output return output, output_bias def weight_loader(self, param, loaded_weight): diff --git a/vllm_mindspore/model_executor/layers/logits_processor.py b/vllm_mindspore/model_executor/layers/logits_processor.py index 75f35d6d..32b02fb7 100644 --- a/vllm_mindspore/model_executor/layers/logits_processor.py +++ b/vllm_mindspore/model_executor/layers/logits_processor.py @@ -41,7 +41,6 @@ if envs.VLLM_LOGITS_PROCESSOR_THREADS is not None: _logits_processor_threadpool = ThreadPoolExecutor( envs.VLLM_LOGITS_PROCESSOR_THREADS) - class LogitsProcessor(nn.Cell): """Process logits and apply logits processors from sampling metadata. @@ -74,9 +73,8 @@ class LogitsProcessor(nn.Cell): self.soft_cap = soft_cap # Whether to use gather or all-gather to gather the logits. parallel_config = get_current_vllm_config().parallel_config - self.use_gather = not current_platform.is_tpu() \ - or envs.VLLM_USE_V1 \ - or parallel_config.distributed_executor_backend == "external_launcher" + self.use_all_gather = envs.VLLM_USE_V1 \ + or parallel_config.distributed_executor_backend == "external_launcher" def construct( self, @@ -105,7 +103,8 @@ class LogitsProcessor(nn.Cell): logits *= self.scale # Apply logits processors (if any). 
- if sampling_metadata.seq_groups is not None: + if sampling_metadata is not None and \ + sampling_metadata.seq_groups is not None: logits = _apply_logits_processors(logits, sampling_metadata) return logits @@ -120,16 +119,16 @@ class LogitsProcessor(nn.Cell): logits = lm_head.quant_method.apply( lm_head, hidden_states, bias=embedding_bias ) - if self.use_gather: - # None may be returned for rank > 0 - logits = tensor_model_parallel_gather(logits) - else: + if self.use_all_gather: # Gather is not supported for some devices such as TPUs. # Use all-gather instead. # NOTE(woosuk): Here, the outputs of every device should not be None # because XLA requires strict SPMD among all devices. Every device # should execute the same operations after gathering the logits. logits = tensor_model_parallel_all_gather(logits) + else: + # None may be returned for rank > 0 + logits = tensor_model_parallel_gather(logits) # Remove paddings in vocab (if any). if logits is not None: logits = logits[..., : self.org_vocab_size] diff --git a/vllm_mindspore/model_executor/layers/vocab_parallel_embedding.py b/vllm_mindspore/model_executor/layers/vocab_parallel_embedding.py index cec40d8b..e3407f51 100644 --- a/vllm_mindspore/model_executor/layers/vocab_parallel_embedding.py +++ b/vllm_mindspore/model_executor/layers/vocab_parallel_embedding.py @@ -231,7 +231,7 @@ class VocabParallelEmbedding(nn.Cell): # If we are making an embedding layer, then our quantization linear # method must implement the embedding operation. If we are another # layer type like ParallelLMHead, this is not important. - is_embedding_layer = type(self.__class__) is VocabParallelEmbedding + is_embedding_layer = type(self) is VocabParallelEmbedding quant_method_implements_embedding = method_has_implemented_embedding( type(quant_method) ) diff --git a/vllm_mindspore/model_executor/model_loader/weight_utils.py b/vllm_mindspore/model_executor/model_loader/weight_utils.py index 45fe4bdd..0fc4d3d2 100644 --- a/vllm_mindspore/model_executor/model_loader/weight_utils.py +++ b/vllm_mindspore/model_executor/model_loader/weight_utils.py @@ -27,18 +27,16 @@ from mindspore import Parameter, Tensor def safetensors_weights_iterator( hf_weights_files: List[str], + use_tqdm_on_load: bool, ) -> Generator[Tuple[str, torch.Tensor], None, None]: """Iterate over the weights in the model safetensor files.""" from safetensors import safe_open - from vllm.model_executor.model_loader.weight_utils import _BAR_FORMAT + from vllm.model_executor.model_loader.weight_utils import _BAR_FORMAT, enable_tqdm - enable_tqdm = ( - not torch.distributed.is_initialized() or torch.distributed.get_rank() == 0 - ) for st_file in tqdm( hf_weights_files, desc="Loading safetensors checkpoint shards", - disable=not enable_tqdm, + disable=not enable_tqdm(use_tqdm_on_load), bar_format=_BAR_FORMAT, ): with safe_open(st_file, framework="np") as f: diff --git a/vllm_mindspore/model_executor/models/model_base.py b/vllm_mindspore/model_executor/models/model_base.py index 7aa1de4e..a464e3e1 100644 --- a/vllm_mindspore/model_executor/models/model_base.py +++ b/vllm_mindspore/model_executor/models/model_base.py @@ -20,7 +20,6 @@ import os from abc import abstractmethod from typing import Iterable, List, Optional, Set, Tuple, Union, Dict -from vllm.attention import AttentionMetadata from vllm.config import VllmConfig, get_current_vllm_config from vllm.model_executor.layers.sampler import SamplerOutput from vllm.model_executor.sampling_metadata import SamplingMetadata @@ -82,6 +81,7 @@ class 
Fake_Attention_V1(Attention): for _ in range(vllm_config.parallel_config.pipeline_parallel_size) ] self.attn_type = AttentionType.DECODER + self.num_block = num_block self.num_kv_heads = num_kv_heads self.head_size = head_size self.dtype = vllm_config.model_config.dtype @@ -210,8 +210,6 @@ class MsModelBase(): self, input_ids: Tensor, positions: Tensor, - kv_caches: List[Tensor], - attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors] = None, inputs_embeds: Optional[Tensor] = None, **kwargs diff --git a/vllm_mindspore/model_executor/models/qwen2.py b/vllm_mindspore/model_executor/models/qwen2.py index 5eb70a82..8bf808a4 100644 --- a/vllm_mindspore/model_executor/models/qwen2.py +++ b/vllm_mindspore/model_executor/models/qwen2.py @@ -47,19 +47,21 @@ from vllm_mindspore.model_executor.models.utils import ( PPMissingLayer, make_empty_intermediate_tensors_factory, make_layers, maybe_prefix) from vllm_mindspore.model_executor.sampling_metadata import SamplingMetadata -from vllm_mindspore.model_executor.models.model_base import MsModelBase, Fake_Attention +from vllm_mindspore.model_executor.models.model_base import MsModelBase, Fake_Attention, Fake_Attention_V1 from vllm_mindspore.model_executor.models.attention_mask import LowerTriangularMask from vllm_mindspore.utils import STR_DTYPE_TO_MS_DTYPE from vllm.config import CacheConfig, VllmConfig +import vllm.envs as envs from vllm.model_executor.layers.quantization import \ QuantizationConfig from vllm.sequence import IntermediateTensors from vllm.attention.backends.abstract import AttentionType from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size -from vllm.attention.backends.abstract import AttentionMetadata - +from vllm.forward_context import get_forward_context +from vllm_mindspore.v1.attention.backends.flash_attn import FlashAttentionMetadata +import mindspore as ms class Qwen2MLP(nn.Cell): def __init__( @@ -299,7 +301,6 @@ class Qwen2Model(nn.Cell): self.config = config self.quant_config = quant_config - self.padding_idx = config.pad_token_id self.vocab_size = config.vocab_size if get_pp_group().is_first_rank or (config.tie_word_embeddings @@ -494,7 +495,10 @@ class Qwen2ForCausalLM(MsModelBase): self.casual_mask = LowerTriangularMask(dtype=self.mstype, max_model_len=self.model_config.max_model_len) self.set_model_inputs(self.prefill) - self.kv_caches = [Fake_Attention() for i in range(config.num_hidden_layers)] + if envs.VLLM_USE_V1: + self.kv_caches = [Fake_Attention_V1() for i in range(config.num_hidden_layers)] + else: + self.kv_caches = [Fake_Attention() for i in range(config.num_hidden_layers)] compilation_config = vllm_config.compilation_config if prefix in compilation_config.static_forward_context: @@ -513,7 +517,8 @@ class Qwen2ForCausalLM(MsModelBase): kv_cache_dtype = self.model_config.dtype if self.cache_config.cache_dtype == "auto" \ else self.cache_config.cache_dtype - kv_cache_dtype = STR_DTYPE_TO_MS_DTYPE[kv_cache_dtype] + if kv_cache_dtype in STR_DTYPE_TO_MS_DTYPE: + kv_cache_dtype = STR_DTYPE_TO_MS_DTYPE[kv_cache_dtype] num_layers = self.model_config.get_num_layers(self.parallel_config) @@ -548,27 +553,48 @@ class Qwen2ForCausalLM(MsModelBase): self, input_ids: Tensor, positions: Tensor, - kv_caches: List[Tuple[Tensor, Tensor]], - attn_metadata: AttentionMetadata, intermediate_tensors: IntermediateTensors = None, inputs_embeds: Tensor = None, **kwargs ) -> Union[Tensor, IntermediateTensors]: key_cache, value_cache = self.get_kvcache() - seq_lens = 
attn_metadata.seq_lens - max_query_len = attn_metadata.max_query_len - # When Mutli-Step is enabled with Chunked-Prefill, prefills and - # decodes are scheduled together. In the first step, all the - # prefills turn into decodes and max_query_len will be 1. - if self.is_multi_step_chunked_prefill and max_query_len == 1: - query_lens = [1] * len(seq_lens) + attn_metadata = get_forward_context().attn_metadata + input_ids = input_ids.to(ms.int64) + if attn_metadata is None: + attn_metadata = self._dummy_attention_metadata(input_ids, positions) + if not envs.VLLM_USE_V1: + seq_lens = attn_metadata.seq_lens + max_query_len = attn_metadata.max_query_len + # When Mutli-Step is enabled with Chunked-Prefill, prefills and + # decodes are scheduled together. In the first step, all the + # prefills turn into decodes and max_query_len will be 1. + if self.is_multi_step_chunked_prefill and max_query_len == 1: + query_lens = [1] * len(seq_lens) + else: + query_lens = attn_metadata.query_lens + + seq_lens_np = np.array(seq_lens, dtype=np.int32) + query_lens_np = np.array(query_lens, dtype=np.int32) + kv_cache_lens = seq_lens_np - query_lens_np + is_prefill = attn_metadata.num_decode_tokens == 0 and kv_cache_lens.max() == 0 + slot_mapping = attn_metadata.slot_mapping + batch_valid_length = Tensor.from_numpy(np.array(attn_metadata.seq_lens, dtype=np.int32)) + q_seq_lens = Tensor.from_numpy(np.array(attn_metadata.query_lens, dtype=np.int32)) + block_tables = attn_metadata.block_tables + position_ids = ms.Tensor(positions, dtype=ms.int32) + attn_mask = self.casual_mask.gen_attention_mask(is_prefill, position_ids, query_lens) else: - query_lens = attn_metadata.query_lens - - seq_lens_np = np.array(seq_lens, dtype=np.int32) - query_lens_np = np.array(query_lens, dtype=np.int32) - kv_cache_lens = seq_lens_np - query_lens_np - is_prefill = attn_metadata.num_decode_tokens == 0 and kv_cache_lens.max() == 0 + if attn_metadata.max_context_lens == 0: + is_prefill = True + else: + is_prefill = False + slot_mapping = attn_metadata.slot_mapping + batch_valid_length = Tensor.from_numpy(attn_metadata.seq_lens_np) + q_seq_lens = attn_metadata.q_seq_lens + block_tables = attn_metadata.block_tables + query_lens_np = attn_metadata.q_seq_lens_np + attn_mask = self.casual_mask.gen_attention_mask(is_prefill, positions, query_lens_np) + positions = positions.to(ms.int64) if is_prefill: input_ids = ops.expand_dims(input_ids, 0) if not self.prefill: @@ -579,13 +605,6 @@ class Qwen2ForCausalLM(MsModelBase): if self.prefill: self.prefill = False self.set_model_inputs(self.prefill) - - slot_mapping = attn_metadata.slot_mapping - attn_mask = self.casual_mask.gen_attention_mask(is_prefill, positions, query_lens) - seq_lens_np = np.array(attn_metadata.seq_lens, dtype=np.int32) - batch_valid_length = Tensor.from_numpy(seq_lens_np) - q_seq_lens = Tensor.from_numpy(np.array(attn_metadata.query_lens, dtype=np.int32)) - block_tables = attn_metadata.block_tables model_output = self.model(input_ids, positions, key_cache, @@ -604,6 +623,32 @@ class Qwen2ForCausalLM(MsModelBase): model_output = ops.squeeze(model_output, 1) return model_output + def _dummy_attention_metadata(self, input_ids: Tensor, positions: Tensor) -> FlashAttentionMetadata: + input_len = input_ids.shape[0] + max_seq_len = ms.Tensor(input_len, dtype=ms.int32) + seq_lengths = ms.Tensor([input_len], dtype=ms.int32) + q_seq_lens = ms.Tensor([input_len], dtype=ms.int32) + q_seq_lens_np = np.array([input_len], dtype=np.int32) + seq_lens_np = np.array([input_len], dtype=np.int32) + + 
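# NOTE (assumption): _dummy_attention_metadata() only backs warmup/profile runs in
# which get_forward_context().attn_metadata is None; it fabricates a single fake
# sequence of length input_len, and the slot_mapping filled with -1 below is taken
# to mean "do not write these tokens back into the KV cache".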
block_tables = ms.Tensor([[0]], dtype=ms.int32) + slot_mapping = [-1 for _ in range(input_len)] + slot_mapping = ms.Tensor(slot_mapping, dtype=ms.int32) + return FlashAttentionMetadata( + max_seq_len=max_seq_len, + seq_lens=seq_lengths, + seq_lens_np=seq_lens_np, + block_tables=block_tables, + slot_mapping=slot_mapping, + q_seq_lens=q_seq_lens, + q_seq_lens_np=q_seq_lens_np, + context_lens=0, + # To enforce prefill and decode are both complied in warmup process. + # So set max_context_lens to 0 for prefill and 1 for decode. + max_context_lens=0 if self.prefill else 1, + query_start_loc = None + ) + def load_weights(self, weights: Iterable[Tuple[str, Tensor]]) -> Set[str]: params_dict = self.get_params_dict() self.model.load_weights(weights, params_dict) -- Gitee From 79505056e76716ae3087fa6a66bbd3f53c523611 Mon Sep 17 00:00:00 2001 From: r1chardf1d0 Date: Fri, 16 May 2025 16:00:47 +0800 Subject: [PATCH 42/55] mtp support 0.8.3 --- tests/mindformers | 2 +- tests/st/python/test_vllm_deepseek_bf16_part_v1.py | 2 +- tests/st/python/test_vllm_deepseek_part.py | 8 ++++---- tests/st/python/test_vllm_deepseek_part_v1.py | 2 +- .../model_executor/models/mf_models/deepseek_mtp.py | 2 ++ 5 files changed, 9 insertions(+), 7 deletions(-) diff --git a/tests/mindformers b/tests/mindformers index bbddc170..d2df5295 160000 --- a/tests/mindformers +++ b/tests/mindformers @@ -1 +1 @@ -Subproject commit bbddc170167ac6705f07bf0aea25977e10f8d760 +Subproject commit d2df52951122a0328a7ccea4230e0b6f49f6c2e8 diff --git a/tests/st/python/test_vllm_deepseek_bf16_part_v1.py b/tests/st/python/test_vllm_deepseek_bf16_part_v1.py index 11167779..3e616242 100644 --- a/tests/st/python/test_vllm_deepseek_bf16_part_v1.py +++ b/tests/st/python/test_vllm_deepseek_bf16_part_v1.py @@ -60,7 +60,7 @@ class TestDeepSeek: # Create an LLM. llm = LLM(model="/home/workspace/mindspore_dataset/weight/DeepSeek-R1-bf16", - trust_remote_code=True, gpu_memory_utilization=0.9, tensor_parallel_size=8) + trust_remote_code=True, gpu_memory_utilization=0.9, tensor_parallel_size=8, max_model_len=4096) # Generate texts from the prompts. The output is a list of RequestOutput objects # that contain the prompt, generated text, and other information. outputs = llm.generate(prompts, sampling_params) diff --git a/tests/st/python/test_vllm_deepseek_part.py b/tests/st/python/test_vllm_deepseek_part.py index c882ae58..21ba4fe4 100644 --- a/tests/st/python/test_vllm_deepseek_part.py +++ b/tests/st/python/test_vllm_deepseek_part.py @@ -25,7 +25,7 @@ env_vars = { "MINDFORMERS_MODEL_CONFIG": "./config/predict_deepseek_r1_671b_w8a8.yaml", "ASCEND_CUSTOM_PATH": os.path.expandvars("$ASCEND_HOME_PATH/../"), "vLLM_MODEL_BACKEND": "MindFormers", - "MS_ENABLE_LCCL": "off", + "MS_ENABLE_LCCL": "on", "HCCL_OP_EXPANSION_MODE": "AIV", "ASCEND_RT_VISIBLE_DEVICES": "0,1,2,3,4,5,6,7", "MS_ALLOC_CONF": "enable_vmm:True", @@ -89,7 +89,7 @@ class TestDeepSeekMTP: @pytest.mark.level0 @pytest.mark.platform_arm_ascend910b_training @pytest.mark.env_single - @pytest.mark.skip(reason="MTP need addition adaption on v0.8.3 V0") + @pytest.mark.skip(reason="gs master branch is not suit for the newest mindformers.") def test_deepseek_mtp(self): """ test case deepseek mtp with main model of r1-w8a8 @@ -105,8 +105,8 @@ class TestDeepSeekMTP: # Create an LLM. 
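# NOTE (assumption): vLLM 0.8.x configures speculative decoding through a single
# speculative_config dict rather than the older top-level num_speculative_tokens
# argument, which is what the rewritten LLM(...) call below reflects, e.g.:
#
#     llm = LLM(model=model_path,  # hypothetical local checkpoint path
#               trust_remote_code=True,
#               tensor_parallel_size=8,
#               speculative_config={"num_speculative_tokens": 1})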
llm = LLM(model="/home/workspace/mindspore_dataset/weight/DeepSeek-R1-MTP", - trust_remote_code=True, gpu_memory_utilization=0.8, tensor_parallel_size=8, - num_speculative_tokens=1) + trust_remote_code=True, gpu_memory_utilization=0.7, tensor_parallel_size=8, + speculative_config={"num_speculative_tokens":1}) # Generate texts from the prompts. The output is a list of RequestOutput objects # that contain the prompt, generated text, and other information. outputs = llm.generate(prompts, sampling_params) diff --git a/tests/st/python/test_vllm_deepseek_part_v1.py b/tests/st/python/test_vllm_deepseek_part_v1.py index 3ec16fa7..889aae9b 100644 --- a/tests/st/python/test_vllm_deepseek_part_v1.py +++ b/tests/st/python/test_vllm_deepseek_part_v1.py @@ -63,7 +63,7 @@ class TestDeepSeek: # Create an LLM. llm = LLM(model="/home/workspace/mindspore_dataset/weight/DeepSeek-R1-W8A8", - trust_remote_code=True, gpu_memory_utilization=0.9, tensor_parallel_size=8) + trust_remote_code=True, gpu_memory_utilization=0.9, tensor_parallel_size=8, max_model_len=4096) # Generate texts from the prompts. The output is a list of RequestOutput objects # that contain the prompt, generated text, and other information. outputs = llm.generate(prompts, sampling_params) diff --git a/vllm_mindspore/model_executor/models/mf_models/deepseek_mtp.py b/vllm_mindspore/model_executor/models/mf_models/deepseek_mtp.py index fac2bf20..c0b72f4d 100644 --- a/vllm_mindspore/model_executor/models/mf_models/deepseek_mtp.py +++ b/vllm_mindspore/model_executor/models/mf_models/deepseek_mtp.py @@ -110,4 +110,6 @@ class DeepseekV3MTPForCausalLM(MfModelBase): weight_processor = DeepseekV3WeightProcessor(self.mf_config, self.network, False) weight_processor.load_safetensors_shard(self.mf_config.load_checkpoint, is_mtp_model=True) self.network.set_dynamic_inputs() + dynamic_hidden_states = Tensor(shape=[None, None], dtype=self.mf_model_config.compute_dtype) + self.lm_head.set_inputs(dynamic_hidden_states) return None -- Gitee From 60435452383233d0a23614c15b3e4ba8f6a157f2 Mon Sep 17 00:00:00 2001 From: huandong Date: Sat, 17 May 2025 15:58:37 +0800 Subject: [PATCH 43/55] add global_max_bs(use max_num_seqs) for dispatch op and combine op --- vllm_mindspore/model_executor/models/mf_models/deepseek_v3.py | 3 +++ vllm_mindspore/model_executor/models/model_base.py | 1 + 2 files changed, 4 insertions(+) diff --git a/vllm_mindspore/model_executor/models/mf_models/deepseek_v3.py b/vllm_mindspore/model_executor/models/mf_models/deepseek_v3.py index d1102ad9..c087d998 100644 --- a/vllm_mindspore/model_executor/models/mf_models/deepseek_v3.py +++ b/vllm_mindspore/model_executor/models/mf_models/deepseek_v3.py @@ -155,6 +155,9 @@ class DeepseekV3ForCausalLM(MfModelBase): self.mf_model_config = DeepseekV3Config_MF(**self.mf_config.model.model_config) if self.mf_config.moe_config: self.mf_model_config.moe_config = self.mf_config.moe_config + # dispatch/combine in moe need max_num_seqs as global_max_bs + if hasattr(self.mf_model_config.moe_config, "dispatch_global_max_bs"): + self.mf_model_config.moe_config.dispatch_global_max_bs = self.scheduler_config.max_num_seqs self.mf_model_config.return_hidden_states = True setattr(self.mf_model_config, 'npu_mem_size', -1) diff --git a/vllm_mindspore/model_executor/models/model_base.py b/vllm_mindspore/model_executor/models/model_base.py index 7aa1de4e..d211c3a9 100644 --- a/vllm_mindspore/model_executor/models/model_base.py +++ b/vllm_mindspore/model_executor/models/model_base.py @@ -111,6 +111,7 @@ class MsModelBase(): 
self.cache_config = vllm_config.cache_config self.parallel_config = vllm_config.parallel_config self.load_config = vllm_config.load_config + self.scheduler_config = vllm_config.scheduler_config self.modules_dict = None -- Gitee From 51fb3b5ea0331ab1acfc6791c9c38bacfc5098df Mon Sep 17 00:00:00 2001 From: r1chardf1d0 Date: Wed, 21 May 2025 11:03:52 +0800 Subject: [PATCH 44/55] reopen v0 testcase --- tests/st/python/test_vllm_deepseek_part.py | 2 +- tests/st/python/test_vllm_mf_qwen_7b_chunk_prefill.py | 6 +++--- tests/st/python/test_vllm_mf_qwen_7b_cp_pc_mss.py | 2 +- tests/st/python/test_vllm_mf_qwen_7b_prefix_caching.py | 4 ++-- 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/tests/st/python/test_vllm_deepseek_part.py b/tests/st/python/test_vllm_deepseek_part.py index 21ba4fe4..bf7d8d2a 100644 --- a/tests/st/python/test_vllm_deepseek_part.py +++ b/tests/st/python/test_vllm_deepseek_part.py @@ -105,7 +105,7 @@ class TestDeepSeekMTP: # Create an LLM. llm = LLM(model="/home/workspace/mindspore_dataset/weight/DeepSeek-R1-MTP", - trust_remote_code=True, gpu_memory_utilization=0.7, tensor_parallel_size=8, + trust_remote_code=True, gpu_memory_utilization=0.7, tensor_parallel_size=8, max_model_len=4096, speculative_config={"num_speculative_tokens":1}) # Generate texts from the prompts. The output is a list of RequestOutput objects # that contain the prompt, generated text, and other information. diff --git a/tests/st/python/test_vllm_mf_qwen_7b_chunk_prefill.py b/tests/st/python/test_vllm_mf_qwen_7b_chunk_prefill.py index cc3cbcab..7732bbbf 100644 --- a/tests/st/python/test_vllm_mf_qwen_7b_chunk_prefill.py +++ b/tests/st/python/test_vllm_mf_qwen_7b_chunk_prefill.py @@ -48,7 +48,7 @@ class TestMfQwen_chunk_prefill: @pytest.mark.level0 @pytest.mark.platform_arm_ascend910b_training @pytest.mark.env_single - @pytest.mark.skip(reason="cp precision need to be fixed on v0.8.3 V0") + @pytest.mark.skip(reason="mindspore not ready") def test_mf_qwen_7b_chunk_prefill(self): """ test case qwen_7b_chunk_prefill @@ -61,8 +61,8 @@ class TestMfQwen_chunk_prefill: "marvel that once housed emperors, stands as a testament to the city's imperial past. Meanwhile, the Great " "Wall, though not within the city limits, is easily accessible from Beijing and offers a glimpse into the " "strategic genius and resilience of ancient China.", - "answer": " The city's blend of traditional and modern architecture, vibrant street life, and rich culinary scene " - "make it a truly unique and captivating destination. I am always eager to"}, + "answer": " The city's blend of traditional and modern architecture, bustling markets, and vibrant street life make it " + "a unique and fascinating destination. In short, Beijing is a city"}, {"prompt": "I love Beijing, because", "answer": " it is a city with a long history. Which of the following options correctly expresses this sentence?\nA. 
I love Beijing, because it is a city with a"}, ] diff --git a/tests/st/python/test_vllm_mf_qwen_7b_cp_pc_mss.py b/tests/st/python/test_vllm_mf_qwen_7b_cp_pc_mss.py index 856932d3..90eb811f 100644 --- a/tests/st/python/test_vllm_mf_qwen_7b_cp_pc_mss.py +++ b/tests/st/python/test_vllm_mf_qwen_7b_cp_pc_mss.py @@ -45,7 +45,7 @@ class TestMfQwen_cp_pc_mss: @pytest.mark.level0 @pytest.mark.platform_arm_ascend910b_training @pytest.mark.env_single - @pytest.mark.skip(reason="cp precision need to be fixed on v0.8.3 V0") + @pytest.mark.skip(reason="mindspore not ready") def test_mf_qwen_7b_cp_pc_mss(self): """ test case mf_qwen_7b_cp_pc_mss diff --git a/tests/st/python/test_vllm_mf_qwen_7b_prefix_caching.py b/tests/st/python/test_vllm_mf_qwen_7b_prefix_caching.py index 28ec1058..80ea073f 100644 --- a/tests/st/python/test_vllm_mf_qwen_7b_prefix_caching.py +++ b/tests/st/python/test_vllm_mf_qwen_7b_prefix_caching.py @@ -46,7 +46,7 @@ class TestMfQwen_prefix_caching: @pytest.mark.level0 @pytest.mark.platform_arm_ascend910b_training @pytest.mark.env_single - @pytest.mark.skip(reason="pc precision need to be fixed on v0.8.3 V0") + @pytest.mark.skip(reason="mindspore not ready") def test_mf_qwen_7b_prefix_caching(self): """ test case qwen_7b_prefix_caching @@ -72,7 +72,7 @@ class TestMfQwen_prefix_caching: outputs = llm.generate(prompts, sampling_params) second_outputs = llm.generate(second_prompts, sampling_params) except_list=[' many times and each time I have found something new'] - second_except_list=[' to visit, such as the Forbidden City, the'] + second_except_list=[' in Beijing, but I have to say that the'] for i, (output, second_output) in enumerate(zip(outputs, second_outputs)): generated_text = output.outputs[i].text print(f"Output1 - Prompt: {prompts[i]!r}, Generated text: {generated_text!r}") -- Gitee From de9d18b6b421cd962bdfc596a9a7d98181f1d196 Mon Sep 17 00:00:00 2001 From: Erpim Date: Wed, 21 May 2025 14:47:24 +0800 Subject: [PATCH 45/55] support dp/tp case --- tests/mindformers | 2 +- vllm_mindspore/model_executor/models/mf_models/deepseek_v3.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/mindformers b/tests/mindformers index d2df5295..a926c39b 160000 --- a/tests/mindformers +++ b/tests/mindformers @@ -1 +1 @@ -Subproject commit d2df52951122a0328a7ccea4230e0b6f49f6c2e8 +Subproject commit a926c39bd0c97fa4ea145232da5cc65b7703f88a diff --git a/vllm_mindspore/model_executor/models/mf_models/deepseek_v3.py b/vllm_mindspore/model_executor/models/mf_models/deepseek_v3.py index 20f83e72..17a730bb 100644 --- a/vllm_mindspore/model_executor/models/mf_models/deepseek_v3.py +++ b/vllm_mindspore/model_executor/models/mf_models/deepseek_v3.py @@ -83,7 +83,7 @@ def set_runtime_kernel_launch_group(): def _get_padding_index(q_seq_len): dp_size = get_dp_group().world_size tp_size = get_tensor_model_parallel_world_size() - if dp_size == 1 or tp_size == 1: + if dp_size == 1: return None, None, None, None tokens_len_per_dp = q_seq_len.sum().reshape(-1) -- Gitee From 474e6ba6e0358b7dd4922ca7de937740915f43dd Mon Sep 17 00:00:00 2001 From: wusimin Date: Thu, 22 May 2025 10:02:31 +0800 Subject: [PATCH 46/55] =?UTF-8?q?[0.8.3=20v1]=E9=80=82=E9=85=8D=E5=8E=9F?= =?UTF-8?q?=E7=94=9FQwen,=E4=BF=AE=E5=A4=8D=E7=B2=BE=E5=BA=A6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- vllm_mindspore/model_executor/models/attention_mask.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git 
a/vllm_mindspore/model_executor/models/attention_mask.py b/vllm_mindspore/model_executor/models/attention_mask.py index 40be1f46..42d6e629 100644 --- a/vllm_mindspore/model_executor/models/attention_mask.py +++ b/vllm_mindspore/model_executor/models/attention_mask.py @@ -46,7 +46,7 @@ class LowerTriangularMask: self.dtype = dtype self.max_model_len = max_model_len - prefill_mask_coeff = 1.0 if self.dtype is mstype.bfloat16 else -10000.0 + prefill_mask_coeff = 1.0 if self.dtype == mstype.bfloat16 else -10000.0 self.prefill_mask = Tensor(np.triu(np.ones(shape=(128, 128), dtype=np.float16), k=1) * prefill_mask_coeff, dtype=self.dtype) @@ -78,6 +78,6 @@ class MLALowerTriangularMask(LowerTriangularMask): def __init__(self, dtype, max_model_len): super().__init__(dtype, max_model_len) - decode_mask_coeff = 1.0 if self.dtype is mstype.bfloat16 else -10000.0 + decode_mask_coeff = 1.0 if self.dtype == mstype.bfloat16 else -10000.0 self.decode_mask = Tensor(np.triu(np.ones(shape=(self.max_model_len, self.max_model_len), dtype=np.int8), k=1), dtype=self.dtype) * decode_mask_coeff -- Gitee From 88afb5f0ade4820d7dacaadc81b92483d367bd99 Mon Sep 17 00:00:00 2001 From: one_east Date: Fri, 23 May 2025 10:49:25 +0800 Subject: [PATCH 47/55] bugfix: process do not terminated correctly --- vllm_mindspore/__init__.py | 5 +++++ vllm_mindspore/executor/multiproc_worker_utils.py | 5 +++++ 2 files changed, 10 insertions(+) diff --git a/vllm_mindspore/__init__.py b/vllm_mindspore/__init__.py index 98970a5f..34e7bdf3 100644 --- a/vllm_mindspore/__init__.py +++ b/vllm_mindspore/__init__.py @@ -172,6 +172,7 @@ vllm.worker.multi_step_model_runner._get_supported_attention_backends = ( from vllm_mindspore.executor.multiproc_worker_utils import ( get_mp_context as ms_get_mp_context, + terminate_worker as ms_terminate_worker, ) # To patching the get_mp_context, should import it first. 
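# NOTE (assumption): a minimal sketch of the import-time patching idiom this file
# relies on -- import the MindSpore-friendly replacements first, then rebind the
# attributes on the vLLM module so every later caller resolves to the new
# behaviour. Names mirror the surrounding diff.
#
#     from vllm_mindspore.executor.multiproc_worker_utils import (
#         get_mp_context as ms_get_mp_context,
#         terminate_worker as ms_terminate_worker,
#     )
#     import vllm.executor.multiproc_worker_utils as mp_utils
#     mp_utils.get_mp_context = ms_get_mp_context
#     mp_utils.ProcessWorkerWrapper.terminate_worker = ms_terminate_worker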
@@ -179,6 +180,10 @@ from vllm.executor.multiproc_worker_utils import get_mp_context vllm.executor.multiproc_worker_utils.get_mp_context = ms_get_mp_context +import vllm.executor.multiproc_worker_utils + +vllm.executor.multiproc_worker_utils.ProcessWorkerWrapper.terminate_worker = ms_terminate_worker + import vllm.v1.executor.multiproc_executor vllm.v1.executor.multiproc_executor.get_mp_context = ms_get_mp_context import vllm.v1.utils diff --git a/vllm_mindspore/executor/multiproc_worker_utils.py b/vllm_mindspore/executor/multiproc_worker_utils.py index 86986fa6..30c7a597 100644 --- a/vllm_mindspore/executor/multiproc_worker_utils.py +++ b/vllm_mindspore/executor/multiproc_worker_utils.py @@ -21,3 +21,8 @@ import multiprocessing def get_mp_context(): return multiprocessing.get_context("fork") + + +def terminate_worker(self): + self.process.kill() + self._task_queue.close() -- Gitee From 18162c09666a06120839c5017f05920c5dea8ccd Mon Sep 17 00:00:00 2001 From: r1chardf1d0 Date: Fri, 23 May 2025 01:37:44 +0800 Subject: [PATCH 48/55] reopen v0 testcase --- .jenkins/test/config/dependent_packages.yaml | 4 ++-- tests/st/python/test_vllm_deepseek_part.py | 2 -- tests/st/python/test_vllm_mf_qwen_7b_chunk_prefill.py | 1 - tests/st/python/test_vllm_mf_qwen_7b_cp_pc_mss.py | 1 - tests/st/python/test_vllm_mf_qwen_7b_prefix_caching.py | 1 - 5 files changed, 2 insertions(+), 7 deletions(-) diff --git a/.jenkins/test/config/dependent_packages.yaml b/.jenkins/test/config/dependent_packages.yaml index 5df5ff1f..0425cf02 100644 --- a/.jenkins/test/config/dependent_packages.yaml +++ b/.jenkins/test/config/dependent_packages.yaml @@ -1,8 +1,8 @@ mindspore: - 'https://repo.mindspore.cn/mindspore/mindspore/version/202505/20250514/br_infer_deepseek_os_20250514004506_0e705b79c36766d07889faa32bc6a3ef6ec79ef3_newest/' + 'https://repo.mindspore.cn/mindspore/mindspore/version/202505/20250523/br_infer_deepseek_os_20250523150616_197336f8c8ab3ca63d02df74b31a080f521c0cab_newest/' mindspore_gs: - 'https://repo.mindspore.cn/mindspore/golden-stick/version/202505/20250514/master_20250514010015_c6cede824328d0dd7069e735646ff4a1808a1c72_newest/' + 'https://repo.mindspore.cn/mindspore/golden-stick/version/202505/20250521/develop_20250521153508_28a4a63203943d66d8c94b2b013e5cbed37f3e8a_newest/' msadapter: 'https://repo.mindspore.cn/mindspore/msadapter/version/202505/20250514/master_20250514010016_380ecadf0133da436503105d6e8e1db709472fe4_newest/' diff --git a/tests/st/python/test_vllm_deepseek_part.py b/tests/st/python/test_vllm_deepseek_part.py index bf7d8d2a..c6a7f339 100644 --- a/tests/st/python/test_vllm_deepseek_part.py +++ b/tests/st/python/test_vllm_deepseek_part.py @@ -48,7 +48,6 @@ class TestDeepSeek: @pytest.mark.level0 @pytest.mark.platform_arm_ascend910b_training @pytest.mark.env_single - @pytest.mark.skip(reason="gs master branch is not suit for the newest mindformers.") def test_deepseek_r1(self): """ test case deepseek r1 w8a8 @@ -89,7 +88,6 @@ class TestDeepSeekMTP: @pytest.mark.level0 @pytest.mark.platform_arm_ascend910b_training @pytest.mark.env_single - @pytest.mark.skip(reason="gs master branch is not suit for the newest mindformers.") def test_deepseek_mtp(self): """ test case deepseek mtp with main model of r1-w8a8 diff --git a/tests/st/python/test_vllm_mf_qwen_7b_chunk_prefill.py b/tests/st/python/test_vllm_mf_qwen_7b_chunk_prefill.py index 7732bbbf..daa57d93 100644 --- a/tests/st/python/test_vllm_mf_qwen_7b_chunk_prefill.py +++ b/tests/st/python/test_vllm_mf_qwen_7b_chunk_prefill.py @@ -48,7 +48,6 @@ class 
TestMfQwen_chunk_prefill: @pytest.mark.level0 @pytest.mark.platform_arm_ascend910b_training @pytest.mark.env_single - @pytest.mark.skip(reason="mindspore not ready") def test_mf_qwen_7b_chunk_prefill(self): """ test case qwen_7b_chunk_prefill diff --git a/tests/st/python/test_vllm_mf_qwen_7b_cp_pc_mss.py b/tests/st/python/test_vllm_mf_qwen_7b_cp_pc_mss.py index 90eb811f..df61117d 100644 --- a/tests/st/python/test_vllm_mf_qwen_7b_cp_pc_mss.py +++ b/tests/st/python/test_vllm_mf_qwen_7b_cp_pc_mss.py @@ -45,7 +45,6 @@ class TestMfQwen_cp_pc_mss: @pytest.mark.level0 @pytest.mark.platform_arm_ascend910b_training @pytest.mark.env_single - @pytest.mark.skip(reason="mindspore not ready") def test_mf_qwen_7b_cp_pc_mss(self): """ test case mf_qwen_7b_cp_pc_mss diff --git a/tests/st/python/test_vllm_mf_qwen_7b_prefix_caching.py b/tests/st/python/test_vllm_mf_qwen_7b_prefix_caching.py index 80ea073f..01736f85 100644 --- a/tests/st/python/test_vllm_mf_qwen_7b_prefix_caching.py +++ b/tests/st/python/test_vllm_mf_qwen_7b_prefix_caching.py @@ -46,7 +46,6 @@ class TestMfQwen_prefix_caching: @pytest.mark.level0 @pytest.mark.platform_arm_ascend910b_training @pytest.mark.env_single - @pytest.mark.skip(reason="mindspore not ready") def test_mf_qwen_7b_prefix_caching(self): """ test case qwen_7b_prefix_caching -- Gitee From ebc67cc32c10dd976bbe075ee4a532c7bc0753b7 Mon Sep 17 00:00:00 2001 From: tronzhang Date: Fri, 23 May 2025 20:22:39 +0800 Subject: [PATCH 49/55] fix pa error for v0 qwen in cp+mss case --- vllm_mindspore/model_executor/models/qwen2.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm_mindspore/model_executor/models/qwen2.py b/vllm_mindspore/model_executor/models/qwen2.py index 8bf808a4..444ddc5a 100644 --- a/vllm_mindspore/model_executor/models/qwen2.py +++ b/vllm_mindspore/model_executor/models/qwen2.py @@ -579,7 +579,7 @@ class Qwen2ForCausalLM(MsModelBase): is_prefill = attn_metadata.num_decode_tokens == 0 and kv_cache_lens.max() == 0 slot_mapping = attn_metadata.slot_mapping batch_valid_length = Tensor.from_numpy(np.array(attn_metadata.seq_lens, dtype=np.int32)) - q_seq_lens = Tensor.from_numpy(np.array(attn_metadata.query_lens, dtype=np.int32)) + q_seq_lens = ms.Tensor(query_lens_np, dtype=ms.int32) block_tables = attn_metadata.block_tables position_ids = ms.Tensor(positions, dtype=ms.int32) attn_mask = self.casual_mask.gen_attention_mask(is_prefill, position_ids, query_lens) -- Gitee From c308469de798e7c24df2dd602f5707f7c0f6f687 Mon Sep 17 00:00:00 2001 From: yangminghai Date: Sat, 24 May 2025 19:47:14 +0800 Subject: [PATCH 50/55] fix bug at pynative mode when decode and prefill phase change --- .../model_executor/models/mf_models/mf_model_base.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/vllm_mindspore/model_executor/models/mf_models/mf_model_base.py b/vllm_mindspore/model_executor/models/mf_models/mf_model_base.py index ecc764be..394032ad 100644 --- a/vllm_mindspore/model_executor/models/mf_models/mf_model_base.py +++ b/vllm_mindspore/model_executor/models/mf_models/mf_model_base.py @@ -38,6 +38,7 @@ from mindformers.tools.register.config import MindFormerConfig from mindformers.core.context import build_mf_context from mindformers.core.parallel_config import build_parallel_config from mindspore.common.api import _pynative_executor +from mindformers.tools.utils import is_pynative from vllm_mindspore.model_executor.models.model_base import MsModelBase from vllm_mindspore.model_executor.models.attention_mask import LowerTriangularMask from 
vllm_mindspore.v1.attention.backends.flash_attn import FlashAttentionMetadata @@ -231,11 +232,11 @@ class MfModelBase(MsModelBase): if is_prefill: self.network.phase = "prefill" - if not self.set_flags: + if not self.set_flags or is_pynative(): self.network.add_flags_custom(is_first_iteration=True) hidden_states = self.network(**model_inputs) self.network.phase = "increment" - if not self.set_flags: + if not self.set_flags or is_pynative(): self.network.add_flags_custom(is_first_iteration=False) self.set_flags = True if kv_transfer_supported: -- Gitee From 646e481dbb09f350b695ebd959293be8ce802c7d Mon Sep 17 00:00:00 2001 From: Erpim Date: Tue, 13 May 2025 15:36:50 +0800 Subject: [PATCH 51/55] add mix parallel st --- .../python/test_vllm_deepseek_mix_parallel.py | 351 ++++++++++++++++++ 1 file changed, 351 insertions(+) create mode 100644 tests/st/python/test_vllm_deepseek_mix_parallel.py diff --git a/tests/st/python/test_vllm_deepseek_mix_parallel.py b/tests/st/python/test_vllm_deepseek_mix_parallel.py new file mode 100644 index 00000000..32a8aee1 --- /dev/null +++ b/tests/st/python/test_vllm_deepseek_mix_parallel.py @@ -0,0 +1,351 @@ +# Copyright 2025 Huawei Technologies Co., Ltd +# +# This file is mainly Adapted from https://github.com/vllm-project/vllm/blob/main/examples/offline_inference/data_parallel.py +# Copyright 2025 The vLLM team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +"""test mf deepseek r1.""" +import pytest +import os +import tempfile +import re + +from . 
import set_env +from multiprocessing import Process, Queue + +env_manager = set_env.EnvVarManager() + +env_vars = { + "MINDFORMERS_MODEL_CONFIG": "./config/predict_deepseek_r1_671b_w8a8.yaml", + "ASCEND_CUSTOM_PATH": os.path.expandvars("$ASCEND_HOME_PATH/../"), + "vLLM_MODEL_BACKEND": "MindFormers", + "MS_ENABLE_LCCL": "on", + "HCCL_OP_EXPANSION_MODE": "AIV", + "ASCEND_RT_VISIBLE_DEVICES": "0,1,2,3,4,5,6,7", + "MS_ALLOC_CONF": "enable_vmm:True", + "LCCL_DETERMINISTIC": "1", + "HCCL_DETERMINISTIC": "true", + "ATB_MATMUL_SHUFFLE_K_ENABLE": "0", + "ATB_LLM_LCOC_ENABLE": "0" +} +env_manager.setup_ai_environment(env_vars) +import vllm_mindspore +from vllm import LLM, SamplingParams +from vllm.utils import get_open_port + + +def dp_func(dp_size, local_dp_rank, global_dp_rank, dp_master_ip, dp_master_port, + GPUs_per_dp_rank, prompts, except_list, result_q): + os.environ["VLLM_DP_RANK"] = str(global_dp_rank) + os.environ["VLLM_DP_LOCAL"] = str(local_dp_rank) + os.environ["VLLM_DP_SIZE"] = str(dp_size) + os.environ["VLLM_DP_MASTER_IP"] = dp_master_ip + os.environ["VLLM_DP_MASTER_PORT"] = str(dp_master_port) + + promts_per_rank = len(prompts) // dp_size + start = global_dp_rank * promts_per_rank + end = start + promts_per_rank + prompts = prompts[start:end] + except_list = except_list[start:end] + if len(prompts) == 0: + prompts = ["Placeholder"] + print(f"DP rank {global_dp_rank} needs to process {len(prompts)} prompts") + + sampling_params = SamplingParams(temperature=0.0, + top_p=1.0, + top_k=1, + repetition_penalty=1.0, + max_tokens=3) + + # Create an LLM. + llm = LLM(model="/home/workspace/mindspore_dataset/weight/DeepSeek-R1-W8A8", + tensor_parallel_size=GPUs_per_dp_rank, + max_model_len = 4096, + max_num_batched_tokens=8, + max_num_seqs=8, + trust_remote_code=True, + enforce_eager=True, + enable_expert_parallel=True) + outputs = llm.generate(prompts, sampling_params) + # Print the outputs. 
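# NOTE (assumption): VLLM_DP_RANK / VLLM_DP_SIZE / VLLM_DP_MASTER_IP /
# VLLM_DP_MASTER_PORT let every spawned process join the same data-parallel group,
# and each rank generates only for its own slice of the prompt list; a rank whose
# slice is empty still runs a "Placeholder" prompt so all ranks step together.
# Sketch of the sharding performed above:
#
#     per_rank = len(prompts) // dp_size
#     start = global_dp_rank * per_rank
#     shard = prompts[start:start + per_rank] or ["Placeholder"]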
+ for i, output in enumerate(outputs): + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"DP rank {global_dp_rank}, Prompt: {prompt!r}, " + f"Generated text: {generated_text!r}") + result_q.put(generated_text == except_list[i]) + + +def exec_ds_with_dp(new_yaml, replaced_pattern, dp_size, tp_size, prompts, except_list): + file = open('./config/predict_deepseek_r1_671b_w8a8.yaml', 'r') + content = file.read() + file.close() + + replace_data_parallel = re.compile(r'data_parallel: 1') + replace_model_parallel = re.compile(r'model_parallel: 16') + replace_expert_parallel = re.compile(r'expert_parallel: 1') + + content = replace_data_parallel.sub(replaced_pattern[0], content) + content = replace_model_parallel.sub(replaced_pattern[1], content) + content = replace_expert_parallel.sub(replaced_pattern[2], content) + + with tempfile.TemporaryDirectory() as tmp_dir: + new_yaml_path = os.path.join(tmp_dir, new_yaml) + with open(new_yaml_path, 'w') as f: + f.write(content) + env_manager.set_env_var("MINDFORMERS_MODEL_CONFIG", new_yaml_path) + + node_size = 1 + node_rank = 0 + dp_master_ip = "127.0.0.1" + dp_master_port = get_open_port() + + dp_per_node = dp_size // node_size + + result_q = Queue() + procs = [] + for local_dp_rank, global_dp_rank in enumerate( + range(node_rank * dp_per_node, (node_rank + 1) * dp_per_node)): + proc = Process(target=dp_func, + args=(dp_size, local_dp_rank, + global_dp_rank, dp_master_ip, dp_master_port, + tp_size, prompts, except_list, result_q)) + proc.start() + procs.append(proc) + exit_code = 0 + + for proc in procs: + proc.join(timeout=180) + if proc.exitcode is None: + print(f"Killing process {proc.pid} that " + f"didn't stop within 3 minutes.") + proc.kill() + exit_code = 1 + elif proc.exitcode: + exit_code = proc.exitcode + + assert exit_code == 0 + result = True + for proc in procs: + result = result and result_q.get() + assert result + + # unset env + env_manager.unset_all() + + +def exec_ds_without_dp(new_yaml, replaced_pattern, prompts, except_list): + file = open('./config/predict_deepseek_r1_671b_w8a8.yaml', 'r') + content = file.read() + file.close() + + replace_data_parallel = re.compile(r'data_parallel: 1') + replace_model_parallel = re.compile(r'model_parallel: 16') + replace_expert_parallel = re.compile(r'expert_parallel: 1') + + content = replace_data_parallel.sub(replaced_pattern[0], content) + content = replace_model_parallel.sub(replaced_pattern[1], content) + content = replace_expert_parallel.sub(replaced_pattern[2], content) + + with tempfile.TemporaryDirectory() as tmp_dir: + new_yaml_path = os.path.join(tmp_dir, new_yaml) + with open(new_yaml_path, 'w') as f: + f.write(content) + env_manager.set_env_var("MINDFORMERS_MODEL_CONFIG", new_yaml_path) + + + # Create a sampling params object. + sampling_params = SamplingParams(temperature=0.0, max_tokens=3, top_k=1, top_p=1.0, + repetition_penalty=1.0) + + # Create an LLM. + llm = LLM(model="/home/workspace/mindspore_dataset/weight/DeepSeek-R1-W8A8", + trust_remote_code=True, gpu_memory_utilization=0.9, tensor_parallel_size=8, max_model_len=4096) + # Generate texts from the prompts. The output is a list of RequestOutput objects + # that contain the prompt, generated text, and other information. + outputs = llm.generate(prompts, sampling_params) + # Print the outputs. 
+ for i, output in enumerate(outputs): + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") + assert generated_text == except_list[i] + + # unset env + env_manager.unset_all() + + + +@pytest.mark.level0 +@pytest.mark.platform_arm_ascend910b_training +@pytest.mark.allcards +def test_deepseek_r1_dp4_tp2_ep4(): + """ + test case deepseek r1 w8a8 dp4 tp2 ep4 + """ + new_yaml = "dp4_tp2_ep4.yaml" + replaced_pattern = ['data_parallel: 4', 'model_parallel: 2', 'expert_parallel: 4'] + dp_size = 4 + tp_size = 2 + # Sample prompts. + prompts = [ + "You are a helpful assistant.<|User|>将文本分类为中性、负面或正面。 \n文本:我认为这次假期还可以。 " + "\n情感:<|Assistant|>\n", + ] * 4 + + except_list = ['ugs611ాలు'] * 4 + exec_ds_with_dp(new_yaml, replaced_pattern, dp_size, tp_size, prompts, except_list) + + +@pytest.mark.level1 +@pytest.mark.platform_arm_ascend910b_training +@pytest.mark.allcards +def test_deepseek_r1_dp8_tp1_ep8(): + """ + test case deepseek r1 w8a8 Dp8 tp1 ep8 + """ + new_yaml = "dp8_tp1_ep8.yaml" + replaced_pattern = ['data_parallel: 8', 'model_parallel: 1', 'expert_parallel: 8'] + dp_size = 8 + tp_size = 1 + # Sample prompts. + prompts = [ + "You are a helpful assistant.<|User|>将文本分类为中性、负面或正面。 \n文本:我认为这次假期还可以。 " + "\n情感:<|Assistant|>\n", + ] * 8 + + except_list = ['ugs611ాలు'] * 8 + exec_ds_with_dp(new_yaml, replaced_pattern, dp_size, tp_size, prompts, except_list) + + +@pytest.mark.level1 +@pytest.mark.platform_arm_ascend910b_training +@pytest.mark.allcards +def test_deepseek_r1_dp2_tp4_ep1(): + """ + test case deepseek r1 w8a8 dp2 tp4 ep1 + """ + new_yaml = "dp2_tp4_ep1.yaml" + replaced_pattern = ['data_parallel: 2', 'model_parallel: 4', 'expert_parallel: 1'] + dp_size = 2 + tp_size = 4 + # Sample prompts. + prompts = [ + "You are a helpful assistant.<|User|>将文本分类为中性、负面或正面。 \n文本:我认为这次假期还可以。 " + "\n情感:<|Assistant|>\n", + ] * 2 + + except_list = ['ugs611ాలు'] * 2 + exec_ds_with_dp(new_yaml, replaced_pattern, dp_size, tp_size, prompts, except_list) + + +@pytest.mark.level1 +@pytest.mark.platform_arm_ascend910b_training +@pytest.mark.allcards +def test_deepseek_r1_dp4_tp2_ep8(): + """ + test case deepseek r1 w8a8 dp4 tp2 ep8 + """ + new_yaml = "dp4_tp2_ep8.yaml" + replaced_pattern = ['data_parallel: 4', 'model_parallel: 2', 'expert_parallel: 8'] + dp_size = 4 + tp_size = 2 + # Sample prompts. + prompts = [ + "You are a helpful assistant.<|User|>将文本分类为中性、负面或正面。 \n文本:我认为这次假期还可以。 " + "\n情感:<|Assistant|>\n", + ] * 4 + + except_list = ['ugs611ాలు'] * 4 + exec_ds_with_dp(new_yaml, replaced_pattern, dp_size, tp_size, prompts, except_list) + + +@pytest.mark.level1 +@pytest.mark.platform_arm_ascend910b_training +@pytest.mark.allcards +def test_deepseek_r1_dp8_tp1_ep1(): + """ + test case deepseek r1 w8a8 dp8 tp1 ep1 + """ + new_yaml = "dp8_tp1_ep1.yaml" + replaced_pattern = ['data_parallel: 8', 'model_parallel: 1', 'expert_parallel: 1'] + dp_size = 8 + tp_size = 1 + # Sample prompts. 
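# NOTE (assumption): with temperature=0.0, top_k=1 and the LCCL_DETERMINISTIC /
# HCCL_DETERMINISTIC settings exported above, greedy decoding is expected to be
# reproducible across runs, which is why each parallel layout can assert an exact
# 3-token match against except_list.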
+ prompts = [ + "You are a helpful assistant.<|User|>将文本分类为中性、负面或正面。 \n文本:我认为这次假期还可以。 " + "\n情感:<|Assistant|>\n", + ] * 8 + + except_list = ['ugs611ాలు'] * 8 + exec_ds_with_dp(new_yaml, replaced_pattern, dp_size, tp_size, prompts, except_list) + + +@pytest.mark.level1 +@pytest.mark.platform_arm_ascend910b_training +@pytest.mark.allcards +def test_deepseek_r1_dp8_tp1_ep4(): + """ + test case deepseek r1 w8a8 dp8 tp1 ep1 + """ + new_yaml = "dp8_tp1_ep4.yaml" + replaced_pattern = ['data_parallel: 8', 'model_parallel: 1', 'expert_parallel: 4'] + dp_size = 8 + tp_size = 1 + # Sample prompts. + prompts = [ + "You are a helpful assistant.<|User|>将文本分类为中性、负面或正面。 \n文本:我认为这次假期还可以。 " + "\n情感:<|Assistant|>\n", + ] * 8 + + except_list = ['ugs611ాలు'] * 8 + exec_ds_with_dp(new_yaml, replaced_pattern, dp_size, tp_size, prompts, except_list) + + +@pytest.mark.level1 +@pytest.mark.platform_arm_ascend910b_training +@pytest.mark.allcards +def test_deepseek_r1_tp8_ep8(): + """ + test case deepseek r1 w8a8 tp8 ep8 + """ + new_yaml = "tp8_ep8.yaml" + replaced_pattern = ['data_parallel: 1', 'model_parallel: 8', 'expert_parallel: 8'] + # Sample prompts. + prompts = [ + "You are a helpful assistant.<|User|>将文本分类为中性、负面或正面。 \n文本:我认为这次假期还可以。 " + "\n情感:<|Assistant|>\n", + ] + + except_list=['ugs611ాలు'] + exec_ds_without_dp(new_yaml, replaced_pattern, prompts, except_list) + + +@pytest.mark.level1 +@pytest.mark.platform_arm_ascend910b_training +@pytest.mark.allcards +def test_deepseek_r1_tp8_ep4(): + """ + test case deepseek r1 w8a8 tp8 ep4 + """ + new_yaml = "tp8_ep4.yaml" + replaced_pattern = ['data_parallel: 1', 'model_parallel: 8', 'expert_parallel: 4'] + # Sample prompts. + prompts = [ + "You are a helpful assistant.<|User|>将文本分类为中性、负面或正面。 \n文本:我认为这次假期还可以。 " + "\n情感:<|Assistant|>\n", + ] + + except_list=['ugs611ాలు'] + exec_ds_without_dp(new_yaml, replaced_pattern, prompts, except_list) -- Gitee From 0528bcc90ea4971e75f5054f863249fda61442ff Mon Sep 17 00:00:00 2001 From: zlq2020 Date: Sat, 17 May 2025 16:41:33 +0800 Subject: [PATCH 52/55] add install depended package scripts --- install_depend_pkgs.sh | 102 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 102 insertions(+) create mode 100644 install_depend_pkgs.sh diff --git a/install_depend_pkgs.sh b/install_depend_pkgs.sh new file mode 100644 index 00000000..a5d06518 --- /dev/null +++ b/install_depend_pkgs.sh @@ -0,0 +1,102 @@ +#!/bin/bash + +script_dir=$(cd "$(dirname $0)"; pwd) +yaml_file="$script_dir/.jenkins/test/config/dependent_packages.yaml" +work_dir="install_depend_pkgs" + +if [ ! -f "$yaml_file" ]; then + echo "$yaml_file does not exist." + exit 1 +fi + +if [ ! -d "$work_dir" ]; then + mkdir -p "$work_dir" + echo "Created $work_dir directory." +else + echo "$work_dir already exists. Removing existing whl packages." + rm -f "$work_dir"/*.whl +fi + +cd "$work_dir" || exit 1 + +get_yaml_value() { + local file="$1" + local key="$2" + + python3 -c " +import yaml +try: + with open('$file', 'r') as f: + data = yaml.safe_load(f) + print(data.get('$key', '')) +except Exception as e: + print(f'Error: {e}') + exit(1) +" +} + +echo "========= Installing vllm" +vllm_dir=vllm-v0.8.3 +if [ ! -d "$vllm_dir" ]; then + git clone https://github.com/vllm-project/vllm.git -b v0.8.3 "$vllm_dir" + cd "$vllm_dir" || { echo "Failed to git clone vllm!"; exit 1; } + git apply ../../vllm_dp/dp_scale_out.patch +else + echo "The $vllm_dir folder already exists and will not be re-downloaded." 
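# NOTE (assumption): installing vllm with VLLM_TARGET_DEVICE=empty builds no GPU
# kernels, and torch / torch-npu / torchvision are uninstalled afterwards because
# the torch namespace is expected to be supplied by MSAdapter on top of MindSpore;
# the dp_scale_out.patch applied above is the data-parallel scale-out patch kept
# in this repository under vllm_dp/.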
+ cd "$vllm_dir" || { echo "Failed to git clone vllm!"; exit 1; } +fi +pip uninstall msadapter -y +pip uninstall vllm -y +pip install -v -r requirements/cpu.txt --extra-index-url https://download.pytorch.org/whl/cpu +VLLM_TARGET_DEVICE=empty python setup.py install || { echo "Failed to install vllm"; exit 1; } +pip uninstall torch torch-npu torchvision -y +cd .. + + +echo "========= Installing mindspore" +python_v="cp$(python3 --version 2>&1 | grep -oP 'Python \K\d+\.\d+' | tr -d .)" +mindspore_path=$(get_yaml_value "$yaml_file" "mindspore") +mindspore_name="mindspore-2.6.0-${python_v}-${python_v}-linux_$(arch).whl" +mindspore_pkg="${mindspore_path}unified/$(arch)/${mindspore_name}" + +wget "$mindspore_pkg" --no-check-certificate || { echo "Failed to download mindspore"; exit 1; } +pip uninstall mindspore -y && pip install "$mindspore_name" || { echo "Failed to install mindspore"; exit 1; } + + +echo "========= Installing mindformers" +mf_dir=mindformers-os +if [ ! -d "$mf_dir" ]; then + git clone https://gitee.com/mindspore/mindformers.git -b br_infer_deepseek_os "$mf_dir" +else + echo "The $mf_dir folder already exists and will not be re-downloaded." +fi +if [ ! -d "$mf_dir" ]; then + echo "Failed to git clone mindformers!" + exit 1 +fi + + +echo "========= Installing mindspore golden-stick" +gs_dir=gs-develop +if [ ! -d "$gs_dir" ]; then + git clone https://gitee.com/mindspore/golden-stick.git -b develop "$gs_dir" +else + echo "The $gs_dir folder already exists and will not be re-downloaded." +fi +cd "$gs_dir" || { echo "Failed to git clone golden-stick!"; exit 1; } +pip uninstall mindspore-gs -y && pip install .|| { echo "Failed to install golden-stick"; exit 1; } +cd .. + + +echo "========= Installing msadapter" +msadapter_dir="MSAdapter" +if [ ! -d "$msadapter_dir" ]; then + git clone https://git.openi.org.cn/OpenI/MSAdapter.git +else + echo "The $msadapter_dir folder already exists and will not be re-downloaded." +fi +cd "$msadapter_dir" || { echo "Failed to git clone msadapter!"; exit 1; } +pip uninstall msadapter -y && python3 setup.py install || { echo "Failed to install msadapter"; exit 1; } +cd .. + +echo "========= All dependencies installed successfully!" -- Gitee From da953c3b89521092e94e276cd808894f60c5ea4b Mon Sep 17 00:00:00 2001 From: moran Date: Wed, 28 May 2025 10:09:11 +0800 Subject: [PATCH 53/55] update pr template --- .gitee/PULL_REQUEST_TEMPLATE.en.md | 38 +++++++++++++++++++++++++ .gitee/PULL_REQUEST_TEMPLATE.md | 38 +++++++++++++++++++++++++ .gitee/PULL_REQUEST_TEMPLATE.zh-CN.md | 41 +++++++++++++++++++++++++++ 3 files changed, 117 insertions(+) create mode 100644 .gitee/PULL_REQUEST_TEMPLATE.en.md create mode 100644 .gitee/PULL_REQUEST_TEMPLATE.md create mode 100644 .gitee/PULL_REQUEST_TEMPLATE.zh-CN.md diff --git a/.gitee/PULL_REQUEST_TEMPLATE.en.md b/.gitee/PULL_REQUEST_TEMPLATE.en.md new file mode 100644 index 00000000..c62f68d2 --- /dev/null +++ b/.gitee/PULL_REQUEST_TEMPLATE.en.md @@ -0,0 +1,38 @@ + + +**What type of PR is this?** + +/kind