diff --git a/.gitmodules b/.gitmodules
index d5f8b94d0442ba1f95861b38419aaea8e2e85f36..885ebe56b41110fddfeaa237b5e11fbd66f92e3d 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -2,3 +2,7 @@
     path = tests/mindformers
     url = https://gitee.com/mindspore/mindformers.git
     branch = master
+[submodule "ms_custom_ops"]
+    path = ms_custom_ops
+    url = https://gitee.com/mindspore/akg.git
+    branch = ms_custom_ops
diff --git a/.jenkins/test/config/dependent_packages.yaml b/.jenkins/test/config/dependent_packages.yaml
index 7beac3e31e13e3c2206b9f2612b9c079ff5cea45..d873cf813eaff4f2123c987792974cdf8314f664 100644
--- a/.jenkins/test/config/dependent_packages.yaml
+++ b/.jenkins/test/config/dependent_packages.yaml
@@ -5,7 +5,7 @@ mindspore_gs:
   'https://repo.mindspore.cn/mindspore/golden-stick/version/202509/20250901/master_20250901221800_3e34fd43040b0c5d296e6bc1a82212deae3ee041_newest/'
 msadapter:
-  'https://repo.mindspore.cn/mindspore/msadapter/version/202509/20250904/master_20250904010017_666cf4b92070873b6b551de4ec1ae4263707de10_newest/'
+  'https://repo.mindspore.cn/mindspore/msadapter/version/202509/20250906/master_20250906160017_75be4576b7f1081f95dd8b6bca95b0b9a6697f49_newest/'
 vllm:
   'https://repo.mindspore.cn/mirrors/vllm/version/202507/20250715/v0.9.1/'
diff --git a/ms_custom_ops b/ms_custom_ops
new file mode 160000
index 0000000000000000000000000000000000000000..7fec7a3a66c9936c37be39f90431c1fc7349d4da
--- /dev/null
+++ b/ms_custom_ops
@@ -0,0 +1 @@
+Subproject commit 7fec7a3a66c9936c37be39f90431c1fc7349d4da
diff --git a/setup.py b/setup.py
index f7d37dcd1f72896ae00a59f6c85328812e932edd..510cfdee9ff28fbfd63fe1b993f9885c8896a0b4 100644
--- a/setup.py
+++ b/setup.py
@@ -125,9 +125,35 @@ class CustomBuildExt(build_ext):
     def build_extension(self, ext):
         if ext.name == "vllm_mindspore._C_ops":
             self.build_c_ops(ext)
+        elif ext.name == "vllm_mindspore.ms_custom_ops":
+            self.build_ms_custom_ops(ext)
         else:
             raise ValueError(f"Unknown extension name: {ext.name}")
 
+    def build_ms_custom_ops(self, ext):
+        # "vllm_mindspore.ms_custom_ops" --> "ms_custom_ops"
+        ext_dir = os.path.dirname(
+            os.path.realpath(self.get_ext_fullpath(ext.name)))
+        build_cmd = (
+            "git submodule update --init ms_custom_ops && "
+            "cd ms_custom_ops && "
+            "python setup.py build && "
+            f"cp -r build/*/ms_custom_ops {ext_dir}/..")
+        try:
+            logger.info("Running build ms_custom_ops commands:\n%s", build_cmd)
+            result = subprocess.run(build_cmd,
+                                    cwd=self.ROOT_DIR,
+                                    text=True,
+                                    shell=True,
+                                    capture_output=False)
+            if result.returncode != 0:
+                raise RuntimeError(
+                    "Build ms_custom_ops failed with exit code {}".format(
+                        result.returncode))
+        except subprocess.CalledProcessError as e:
+            raise RuntimeError("Failed to build {}: {}".format(
+                "ms_custom_ops", e)) from e
+
     def build_c_ops(self, ext):
         # "vllm_mindspore._C_ops" --> "_C_ops"
         ext_name = ext.name.split('.')[-1]
@@ -190,6 +216,8 @@ def _get_ext_modules():
     if os.path.exists(_get_ascend_home_path()):
         # sources are specified in CMakeLists.txt
         ext_modules.append(Extension("vllm_mindspore._C_ops", sources=[]))
+        ext_modules.append(
+            Extension("vllm_mindspore.ms_custom_ops", sources=[]))
     return ext_modules
diff --git a/tests/mindformers b/tests/mindformers
index 21bf5ccddb757e02ea139a6beb525c878dc03e9e..28964c24ef390e26fad106ee876a3c4e5356f2ca 160000
--- a/tests/mindformers
+++ b/tests/mindformers
@@ -1 +1 @@
-Subproject commit 21bf5ccddb757e02ea139a6beb525c878dc03e9e
+Subproject commit 28964c24ef390e26fad106ee876a3c4e5356f2ca
diff --git a/vllm_mindspore/model_executor/models/mf_models/mindformers.py b/vllm_mindspore/model_executor/models/mf_models/mindformers.py
index 8bfa76819a6b6575ddfadba088dfba970fcf1a9b..66e59b8cb69e12f80efea6562db6a266f39aeed8 100644
--- a/vllm_mindspore/model_executor/models/mf_models/mindformers.py
+++ b/vllm_mindspore/model_executor/models/mf_models/mindformers.py
@@ -61,6 +61,7 @@ class MindFormersForCausalLM(MsModelBase, SupportsPP):
         self.mla_config = self.mf_config.get('model', None).get(
             'model_config', None).get('multi_latent_attention', False)
         self.use_ringmla = is_use_ringmla(vllm_config, mf_config)
+        self.mf_config.model.model_config.use_fused_mla = self.use_ringmla
         self.is_chunked = False
 
         build_mf_context(self.mf_config)
diff --git a/vllm_mindspore/model_executor/models/model_base.py b/vllm_mindspore/model_executor/models/model_base.py
index 31a94e7bd29b68acbffefd627467822525fc976f..524e31cec9cc2f35d1827ff5f94b795b60598c86 100644
--- a/vllm_mindspore/model_executor/models/model_base.py
+++ b/vllm_mindspore/model_executor/models/model_base.py
@@ -22,7 +22,7 @@ from typing import Any, Optional, Union, cast
 import mindspore as ms
 import numpy as np
 import vllm.envs as envs
-from mindspore import Tensor, mutable, nn, ops
+from mindspore import Tensor, mutable, nn
 from mindspore.common import dtype as mstype
 from vllm.attention.backends.abstract import AttentionType
 from vllm.config import VllmConfig, get_current_vllm_config
@@ -93,17 +93,17 @@ class MLAAttentionWrapper(AttentionWrapper):
             # format_cast ops may not recycle device memory
             k_shape = [1, *(self.kv_shape[1:-2]), kv_lora_rank]
             r_shape = [1, *(self.kv_shape[1:-2]), qk_rope_head_dim]
-            self.kv_cache = [
-                (ops.auto_generate.format_cast(
-                    ms.mint.zeros(k_shape, dtype=kv_cache_dtype), 29),
-                 ops.auto_generate.format_cast(
-                     ms.mint.zeros(r_shape,
-                                   dtype=vllm_config.model_config.dtype),
-                     29))
-                for _ in range(
-                    vllm_config.parallel_config.pipeline_parallel_size)
-            ]
-
+            # Currently, trans_data has a bug and ms.jit must be added.
+            # Later, ms.jit will be removed.
+            import ms_custom_ops
+            self.kv_cache = [(ms.jit(ms_custom_ops.trans_data)(
+                ms.mint.zeros(k_shape, dtype=kv_cache_dtype),
+                transdata_type=1), ms.jit(ms_custom_ops.trans_data)(
+                    ms.mint.zeros(r_shape,
+                                  dtype=vllm_config.model_config.dtype),
+                    transdata_type=1)) for _ in range(
+                        vllm_config.parallel_config.pipeline_parallel_size)
+            ]
         else:
             k_shape = [*(self.kv_shape[0:-1]), kv_lora_rank]
             r_shape = [*(self.kv_shape[0:-1]), qk_rope_head_dim]
@@ -295,7 +295,8 @@ class MsModelBase:
         seq_lengths = ms.Tensor([input_len], dtype=ms.int32)
         q_seq_lens_np = np.array([input_len], dtype=np.int32)
         seq_lens_np = np.array([input_len], dtype=np.int32)
-        context_lens_tensor = ms.Tensor([0], dtype=ms.int32)
+        context_lens_tensor = ms.Tensor([0], dtype=ms.int32) if not \
+            self.set_flags else ms.Tensor([1], dtype=ms.int32)
         block_tables = ms.Tensor([[0]], dtype=ms.int32)
         slot_mapping = [-1 for _ in range(input_len)]
diff --git a/vllm_mindspore/model_executor/models/utils.py b/vllm_mindspore/model_executor/models/utils.py
index f099df8831c8a44431cbe460e982cc3625867d8d..5d9e56d3431257e81e07c47515b6084b5fa38f0a 100644
--- a/vllm_mindspore/model_executor/models/utils.py
+++ b/vllm_mindspore/model_executor/models/utils.py
@@ -24,6 +24,7 @@ from typing import Optional, Union
 
 import mindspore as ms
 from mindspore import mint, ops
+from vllm import envs
 from vllm.sequence import IntermediateTensors
 
 from vllm_mindspore.multimodal.inputs import NestedTensors
@@ -280,7 +281,7 @@ def is_use_ringmla(vllm_config, mf_config=None):
     if vllm_config.model_config.hf_config.model_type == "deepseek_mtp":
         # weight of deepseek mtp model has not been quantized
         return False
-    use_ringmla = (vllm_config.model_config.use_mla
+    use_ringmla = (vllm_config.model_config.use_mla and envs.VLLM_USE_V1
                    and vllm_config.model_config.quantization is not None
                    and vllm_config.parallel_config.tensor_parallel_size < 16)
     return use_ringmla
diff --git a/vllm_mindspore/v1/worker/gpu_model_runner.py b/vllm_mindspore/v1/worker/gpu_model_runner.py
index 54fb277cb9cd96e31c2fa395f6e4b276014ee595..64973e5b9dc0d41903cff85b216680afcc76287c 100644
--- a/vllm_mindspore/v1/worker/gpu_model_runner.py
+++ b/vllm_mindspore/v1/worker/gpu_model_runner.py
@@ -477,11 +477,14 @@ def _reshape_kv_cache_tensors(
                     kv_cache_shape[1:]).permute(*inv_order[1:])
                 if fa3_quant:
                     # for fa3_quant, kvcache need be nz format due to ops
+                    # Currently, trans_data has a bug and ms.jit must be
+                    # added. Later, ms.jit will be removed.
                     num_blocks, block_size, _, _ = cache_block.shape
+                    import ms_custom_ops
                     cache_block = ops.reshape(cache_block,
                                               (num_blocks, block_size, -1))
-                    cache_block_nz = ops.auto_generate.format_cast(
-                        cache_block, 29)
+                    cache_block_nz = ms.jit(ms_custom_ops.trans_data)(
+                        cache_block, transdata_type=1)
                     kv_cache_layer.append(cache_block_nz)
                 else:
                     kv_cache_layer.append(cache_block)
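
Note (not part of the patch): the model_base.py and gpu_model_runner.py hunks above replace `ops.auto_generate.format_cast(x, 29)` (ND to NZ layout) with the `trans_data` custom op from the new ms_custom_ops submodule. The sketch below restates that migration in isolation. It is an assumption-laden illustration, not repository code: it assumes ms_custom_ops has been built by the new setup.py hook and that `trans_data(x, transdata_type=1)` performs the same ND-to-NZ conversion used at both call sites; the shape and dtype values are illustrative only.

    # Minimal sketch of the format_cast -> trans_data migration.
    # Assumption: ms_custom_ops is importable after the submodule build.
    import mindspore as ms
    import ms_custom_ops

    def to_nz(x: ms.Tensor) -> ms.Tensor:
        # The ms.jit wrapper mirrors the temporary workaround noted in the
        # diff comments; it is expected to be dropped once the trans_data
        # bug is fixed.
        return ms.jit(ms_custom_ops.trans_data)(x, transdata_type=1)

    # Example: pre-allocate one NZ-format latent KV-cache block, in the
    # spirit of MLAAttentionWrapper (hypothetical shape and dtype).
    k_cache = to_nz(ms.mint.zeros((1, 128, 16, 512), dtype=ms.bfloat16))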
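
Likewise, a quick way to confirm that the new `vllm_mindspore.ms_custom_ops` extension hook in setup.py produced an importable package (assumption: run in the environment where the package was built or installed, e.g. after `pip install .`):

    # Post-build sanity check; the module and attribute names come from this
    # diff (an ms_custom_ops package providing trans_data), not from any
    # published API documentation.
    import importlib

    mod = importlib.import_module("ms_custom_ops")
    assert hasattr(mod, "trans_data"), "ms_custom_ops built without trans_data"
    print("ms_custom_ops loaded from", mod.__file__)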