From b956ab00f7c4bf3a1af28174c7a2884721c45f4f Mon Sep 17 00:00:00 2001 From: ccsszz Date: Thu, 4 Sep 2025 16:05:31 +0800 Subject: [PATCH 1/3] change format_cast to transdata for A2/A3 nd->nz --- .../model_executor/models/model_base.py | 24 +++++++++---------- vllm_mindspore/v1/worker/gpu_model_runner.py | 7 ++++-- 2 files changed, 17 insertions(+), 14 deletions(-) diff --git a/vllm_mindspore/model_executor/models/model_base.py b/vllm_mindspore/model_executor/models/model_base.py index 31a94e7b..f40424ed 100644 --- a/vllm_mindspore/model_executor/models/model_base.py +++ b/vllm_mindspore/model_executor/models/model_base.py @@ -20,9 +20,10 @@ from collections.abc import Iterable from typing import Any, Optional, Union, cast import mindspore as ms +import ms_custom_ops import numpy as np import vllm.envs as envs -from mindspore import Tensor, mutable, nn, ops +from mindspore import Tensor, mutable, nn from mindspore.common import dtype as mstype from vllm.attention.backends.abstract import AttentionType from vllm.config import VllmConfig, get_current_vllm_config @@ -93,17 +94,16 @@ class MLAAttentionWrapper(AttentionWrapper): # format_cast ops may not recycle device memory k_shape = [1, *(self.kv_shape[1:-2]), kv_lora_rank] r_shape = [1, *(self.kv_shape[1:-2]), qk_rope_head_dim] - self.kv_cache = [ - (ops.auto_generate.format_cast( - ms.mint.zeros(k_shape, dtype=kv_cache_dtype), 29), - ops.auto_generate.format_cast( - ms.mint.zeros(r_shape, - dtype=vllm_config.model_config.dtype), - 29)) - for _ in range( - vllm_config.parallel_config.pipeline_parallel_size) - ] - + # Currently, transdata has a bug and ms.jit must be added. + # Later, ms.jit will be removed. + self.kv_cache = [(ms.jit(ms_custom_ops.trans_data)( + ms.mint.zeros(k_shape, dtype=kv_cache_dtype), + transdata_type=1), ms.jit(ms_custom_ops.trans_data)( + ms.mint.zeros(r_shape, + dtype=vllm_config.model_config.dtype), + transdata_type=1)) for _ in range( + vllm_config.parallel_config.pipeline_parallel_size) + ] else: k_shape = [*(self.kv_shape[0:-1]), kv_lora_rank] r_shape = [*(self.kv_shape[0:-1]), qk_rope_head_dim] diff --git a/vllm_mindspore/v1/worker/gpu_model_runner.py b/vllm_mindspore/v1/worker/gpu_model_runner.py index 54fb277c..cbbcf6e3 100644 --- a/vllm_mindspore/v1/worker/gpu_model_runner.py +++ b/vllm_mindspore/v1/worker/gpu_model_runner.py @@ -22,6 +22,7 @@ import traceback from typing import Any, Optional import mindspore as ms +import ms_custom_ops import numpy as np import torch from mindspore import Generator as msGenerator @@ -477,11 +478,13 @@ def _reshape_kv_cache_tensors( kv_cache_shape[1:]).permute(*inv_order[1:]) if fa3_quant: # for fa3_quant, kvcache need be nz format due to ops + # Currently, transdata has a bug and ms.jit must be + # added. Later, ms.jit will be removed. 
                        num_blocks, block_size, _, _ = cache_block.shape
                        cache_block = ops.reshape(cache_block,
                                                  (num_blocks, block_size, -1))
-                        cache_block_nz = ops.auto_generate.format_cast(
-                            cache_block, 29)
+                        cache_block_nz = ms.jit(ms_custom_ops.trans_data)\
+                            (cache_block, transdata_type=1)
                        kv_cache_layer.append(cache_block_nz)
                    else:
                        kv_cache_layer.append(cache_block)
-- Gitee

From f8785dfbe417e53bae5c80ad67f25e0dd24f92ac Mon Sep 17 00:00:00 2001
From: fanjibin
Date: Tue, 2 Sep 2025 20:17:26 +0800
Subject: [PATCH 2/3] add ms_custom_ops submodule

---
 .gitmodules   |  4 ++++
 ms_custom_ops |  1 +
 setup.py      | 28 ++++++++++++++++++++++++++++
 3 files changed, 33 insertions(+)
 create mode 160000 ms_custom_ops

diff --git a/.gitmodules b/.gitmodules
index d5f8b94d..885ebe56 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -2,3 +2,7 @@
 	path = tests/mindformers
 	url = https://gitee.com/mindspore/mindformers.git
 	branch = master
+[submodule "ms_custom_ops"]
+	path = ms_custom_ops
+	url = https://gitee.com/mindspore/akg.git
+	branch = ms_custom_ops
diff --git a/ms_custom_ops b/ms_custom_ops
new file mode 160000
index 00000000..7fec7a3a
--- /dev/null
+++ b/ms_custom_ops
@@ -0,0 +1 @@
+Subproject commit 7fec7a3a66c9936c37be39f90431c1fc7349d4da
diff --git a/setup.py b/setup.py
index f7d37dcd..510cfdee 100644
--- a/setup.py
+++ b/setup.py
@@ -125,9 +125,35 @@ class CustomBuildExt(build_ext):
     def build_extension(self, ext):
         if ext.name == "vllm_mindspore._C_ops":
             self.build_c_ops(ext)
+        elif ext.name == "vllm_mindspore.ms_custom_ops":
+            self.build_ms_custom_ops(ext)
         else:
             raise ValueError(f"Unknown extension name: {ext.name}")

+    def build_ms_custom_ops(self, ext):
+        # "vllm_mindspore.ms_custom_ops" --> "ms_custom_ops"
+        ext_dir = os.path.dirname(
+            os.path.realpath(self.get_ext_fullpath(ext.name)))
+        build_cmd = (
+            "git submodule update --init ms_custom_ops && "
+            "cd ms_custom_ops && "
+            "python setup.py build && "
+            f"cp -r build/*/ms_custom_ops {ext_dir}/..")
+        try:
+            logger.info("Running build ms_custom_ops commands:\n%s", build_cmd)
+            result = subprocess.run(build_cmd,
+                                    cwd=self.ROOT_DIR,
+                                    text=True,
+                                    shell=True,
+                                    capture_output=False)
+            if result.returncode != 0:
+                raise RuntimeError(
+                    "Build ms_custom_ops failed with exit code {}".format(
+                        result.returncode))
+        except subprocess.CalledProcessError as e:
+            raise RuntimeError("Failed to build {}: {}".format(
+                "ms_custom_ops", e)) from e
+
     def build_c_ops(self, ext):
         # "vllm_mindspore._C_ops" --> "_C_ops"
         ext_name = ext.name.split('.')[-1]
@@ -190,6 +216,8 @@ def _get_ext_modules():
     if os.path.exists(_get_ascend_home_path()):
         # sources are specified in CMakeLists.txt
         ext_modules.append(Extension("vllm_mindspore._C_ops", sources=[]))
+        ext_modules.append(
+            Extension("vllm_mindspore.ms_custom_ops", sources=[]))
     return ext_modules
-- Gitee

From b872b5e755906943f2803c3ad8fe9553295a9289 Mon Sep 17 00:00:00 2001
From: ccsszz
Date: Sat, 6 Sep 2025 19:04:47 +0800
Subject: [PATCH 3/3] change the prefill judgment for dp and update mindformers
 and msadapter

---
 .jenkins/test/config/dependent_packages.yaml       | 2 +-
 tests/mindformers                                  | 2 +-
 .../model_executor/models/mf_models/mindformers.py | 1 +
 vllm_mindspore/model_executor/models/model_base.py | 5 +++--
 vllm_mindspore/model_executor/models/utils.py      | 3 ++-
 vllm_mindspore/v1/worker/gpu_model_runner.py       | 2 +-
 6 files changed, 9 insertions(+), 6 deletions(-)

diff --git a/.jenkins/test/config/dependent_packages.yaml b/.jenkins/test/config/dependent_packages.yaml
index 7beac3e3..d873cf81 100644
---
a/.jenkins/test/config/dependent_packages.yaml +++ b/.jenkins/test/config/dependent_packages.yaml @@ -5,7 +5,7 @@ mindspore_gs: 'https://repo.mindspore.cn/mindspore/golden-stick/version/202509/20250901/master_20250901221800_3e34fd43040b0c5d296e6bc1a82212deae3ee041_newest/' msadapter: - 'https://repo.mindspore.cn/mindspore/msadapter/version/202509/20250904/master_20250904010017_666cf4b92070873b6b551de4ec1ae4263707de10_newest/' + 'https://repo.mindspore.cn/mindspore/msadapter/version/202509/20250906/master_20250906160017_75be4576b7f1081f95dd8b6bca95b0b9a6697f49_newest/' vllm: 'https://repo.mindspore.cn/mirrors/vllm/version/202507/20250715/v0.9.1/' diff --git a/tests/mindformers b/tests/mindformers index 21bf5ccd..28964c24 160000 --- a/tests/mindformers +++ b/tests/mindformers @@ -1 +1 @@ -Subproject commit 21bf5ccddb757e02ea139a6beb525c878dc03e9e +Subproject commit 28964c24ef390e26fad106ee876a3c4e5356f2ca diff --git a/vllm_mindspore/model_executor/models/mf_models/mindformers.py b/vllm_mindspore/model_executor/models/mf_models/mindformers.py index 8bfa7681..66e59b8c 100644 --- a/vllm_mindspore/model_executor/models/mf_models/mindformers.py +++ b/vllm_mindspore/model_executor/models/mf_models/mindformers.py @@ -61,6 +61,7 @@ class MindFormersForCausalLM(MsModelBase, SupportsPP): self.mla_config = self.mf_config.get('model', None).get( 'model_config', None).get('multi_latent_attention', False) self.use_ringmla = is_use_ringmla(vllm_config, mf_config) + self.mf_config.model.model_config.use_fused_mla = self.use_ringmla self.is_chunked = False build_mf_context(self.mf_config) diff --git a/vllm_mindspore/model_executor/models/model_base.py b/vllm_mindspore/model_executor/models/model_base.py index f40424ed..524e31ce 100644 --- a/vllm_mindspore/model_executor/models/model_base.py +++ b/vllm_mindspore/model_executor/models/model_base.py @@ -20,7 +20,6 @@ from collections.abc import Iterable from typing import Any, Optional, Union, cast import mindspore as ms -import ms_custom_ops import numpy as np import vllm.envs as envs from mindspore import Tensor, mutable, nn @@ -96,6 +95,7 @@ class MLAAttentionWrapper(AttentionWrapper): r_shape = [1, *(self.kv_shape[1:-2]), qk_rope_head_dim] # Currently, transdata has a bug and ms.jit must be added. # Later, ms.jit will be removed. 
+ import ms_custom_ops self.kv_cache = [(ms.jit(ms_custom_ops.trans_data)( ms.mint.zeros(k_shape, dtype=kv_cache_dtype), transdata_type=1), ms.jit(ms_custom_ops.trans_data)( @@ -295,7 +295,8 @@ class MsModelBase: seq_lengths = ms.Tensor([input_len], dtype=ms.int32) q_seq_lens_np = np.array([input_len], dtype=np.int32) seq_lens_np = np.array([input_len], dtype=np.int32) - context_lens_tensor = ms.Tensor([0], dtype=ms.int32) + context_lens_tensor = ms.Tensor([0], dtype=ms.int32) if not \ + self.set_flags else ms.Tensor([1], dtype=ms.int32) block_tables = ms.Tensor([[0]], dtype=ms.int32) slot_mapping = [-1 for _ in range(input_len)] diff --git a/vllm_mindspore/model_executor/models/utils.py b/vllm_mindspore/model_executor/models/utils.py index f099df88..5d9e56d3 100644 --- a/vllm_mindspore/model_executor/models/utils.py +++ b/vllm_mindspore/model_executor/models/utils.py @@ -24,6 +24,7 @@ from typing import Optional, Union import mindspore as ms from mindspore import mint, ops +from vllm import envs from vllm.sequence import IntermediateTensors from vllm_mindspore.multimodal.inputs import NestedTensors @@ -280,7 +281,7 @@ def is_use_ringmla(vllm_config, mf_config=None): if vllm_config.model_config.hf_config.model_type == "deepseek_mtp": # weight of deepseek mtp model has not been quantized return False - use_ringmla = (vllm_config.model_config.use_mla + use_ringmla = (vllm_config.model_config.use_mla and envs.VLLM_USE_V1 and vllm_config.model_config.quantization is not None and vllm_config.parallel_config.tensor_parallel_size < 16) return use_ringmla diff --git a/vllm_mindspore/v1/worker/gpu_model_runner.py b/vllm_mindspore/v1/worker/gpu_model_runner.py index cbbcf6e3..64973e5b 100644 --- a/vllm_mindspore/v1/worker/gpu_model_runner.py +++ b/vllm_mindspore/v1/worker/gpu_model_runner.py @@ -22,7 +22,6 @@ import traceback from typing import Any, Optional import mindspore as ms -import ms_custom_ops import numpy as np import torch from mindspore import Generator as msGenerator @@ -481,6 +480,7 @@ def _reshape_kv_cache_tensors( # Currently, transdata has a bug and ms.jit must be # added. Later, ms.jit will be removed. num_blocks, block_size, _, _ = cache_block.shape + import ms_custom_ops cache_block = ops.reshape(cache_block, (num_blocks, block_size, -1)) cache_block_nz = ms.jit(ms_custom_ops.trans_data)\ -- Gitee
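For reference, the snippet below is a minimal sketch of the nd->nz conversion these patches switch to, i.e. ms_custom_ops.trans_data with transdata_type=1 in place of format_cast(..., 29). It is not part of the patches: it assumes an Ascend A2/A3 device with MindSpore and the ms_custom_ops extension built by PATCH 2/3 available, and the tensor shape and dtype are placeholders rather than the real KV-cache block configuration.

# Minimal sketch of the trans_data call pattern used in model_base.py and
# gpu_model_runner.py above. Assumptions: Ascend device available,
# ms_custom_ops importable; shape/dtype below are illustrative placeholders.
import mindspore as ms
import ms_custom_ops

ms.set_context(device_target="Ascend")

# Build an ND zero-filled block, then convert it to the NZ layout.
# As noted in the patch comments, trans_data is currently wrapped in ms.jit.
block = ms.mint.zeros((1, 16, 128, 512), dtype=ms.bfloat16)
block_nz = ms.jit(ms_custom_ops.trans_data)(block, transdata_type=1)
print(block_nz.shape, block_nz.dtype)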