diff --git a/vllm_mindspore/ops/CMakeLists.txt b/csrc/CMakeLists.txt
similarity index 94%
rename from vllm_mindspore/ops/CMakeLists.txt
rename to csrc/CMakeLists.txt
index 4c94b2c085b0be5ed4247e4c5829325531648ae9..86ae77716793e2965f1a91e74745a86b19eb1b8f 100644
--- a/vllm_mindspore/ops/CMakeLists.txt
+++ b/csrc/CMakeLists.txt
@@ -14,7 +14,7 @@ endif()
 add_subdirectory(ascendc)
 
 # Collect source files
-file(GLOB SRC_FILES ${CMAKE_CURRENT_SOURCE_DIR}/module/*.cpp)
+file(GLOB_RECURSE SRC_FILES ${CMAKE_CURRENT_SOURCE_DIR}/module/*.cpp)
 
 # Generate a temporary python script file to build custom ops with MindSpore's CustomOpBuilder
 set(PYTHON_SCRIPT_PATH "${CMAKE_BINARY_DIR}/build_custom_with_ms.py")
diff --git a/vllm_mindspore/ops/ascendc/CMakeLists.txt b/csrc/ascendc/CMakeLists.txt
similarity index 100%
rename from vllm_mindspore/ops/ascendc/CMakeLists.txt
rename to csrc/ascendc/CMakeLists.txt
diff --git a/vllm_mindspore/ops/ascendc/adv_step_flash.c b/csrc/ascendc/adv_step_flash.c
similarity index 100%
rename from vllm_mindspore/ops/ascendc/adv_step_flash.c
rename to csrc/ascendc/adv_step_flash.c
diff --git a/vllm_mindspore/ops/ascendc/adv_step_flash.h b/csrc/ascendc/adv_step_flash.h
similarity index 85%
rename from vllm_mindspore/ops/ascendc/adv_step_flash.h
rename to csrc/ascendc/adv_step_flash.h
index 3b80b3e4b63631265d91b1360f17a55d7f1e6432..3c601e04a0a8aea2cecde695f200c5769ebf0467 100644
--- a/vllm_mindspore/ops/ascendc/adv_step_flash.h
+++ b/csrc/ascendc/adv_step_flash.h
@@ -13,8 +13,8 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-#ifndef VLLM_MINDSPORE_OPS_ASCENDC_ADV_STEP_FLASH_H
-#define VLLM_MINDSPORE_OPS_ASCENDC_ADV_STEP_FLASH_H
+#ifndef VLLM_MINDSPORE_CSRC_ASCENDC_ADV_STEP_FLASH_H
+#define VLLM_MINDSPORE_CSRC_ASCENDC_ADV_STEP_FLASH_H
 
 extern void AdvStepFlashKernelEntry(
     uint32_t blockDims, void *l2ctrl, void *aclStream, uint8_t *sampledTokenIds,
@@ -22,4 +22,4 @@ extern void AdvStepFlashKernelEntry(
     uint8_t *inputPositions, uint8_t *seqLensOut, uint8_t *slotMapping,
     int32_t num_seqs, int32_t block_size, int32_t block_tables_stride);
 
-#endif  // VLLM_MINDSPORE_OPS_ASCENDC_ADV_STEP_FLASH_H
+#endif  // VLLM_MINDSPORE_CSRC_ASCENDC_ADV_STEP_FLASH_H
diff --git a/csrc/module/adv_step_flash.cpp b/csrc/module/adv_step_flash.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..899de8bd2fa57c2126ff90685b599870fb6e00e8
--- /dev/null
+++ b/csrc/module/adv_step_flash.cpp
@@ -0,0 +1,125 @@
+/**
+ * Copyright 2025 Huawei Technologies Co., Ltd.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <cstdint>
+#include <map>
+#include <string>
+
+#include "ms_extension/api.h"
+
+#include "ascendc/adv_step_flash.h"
+#include "module/module.h"
+
+struct DtypeCaster {
+  ms::Tensor CheckAndCast(const ms::Tensor &t, const std::string &name = "") {
+    if (t.data_type() != ms::TypeId::kNumberTypeInt32) {
+      if (!name.empty()) {
+        tensor_map_[name] = t;
+      }
+      return t.cast(ms::TypeId::kNumberTypeInt32);
+    }
+    return t;
+  }
+
+  ms::Tensor RecoveryTensorDtype(const ms::Tensor &t, const std::string &name) {
+    auto iter = tensor_map_.find(name);
+    if (iter == tensor_map_.end()) {
+      return t;
+    }
+    auto ori_tensor = iter->second;
+    auto ret = t.cast(ori_tensor.data_type());
+    ori_tensor.AssignTensor(ret);
+    return ori_tensor;
+  }
+  std::map<std::string, ms::Tensor> tensor_map_;
+};
+
+class AdvStepFlashOp : public ms::pynative::PyboostRunner {
+public:
+  using PyboostRunner::PyboostRunner;
+  void LaunchKernel() override {
+    uint8_t *sampledTokenIdsPtr =
+        static_cast<uint8_t *>(inputs()[0].GetDataPtr());
+    uint8_t *seqLensPtr = static_cast<uint8_t *>(inputs()[1].GetDataPtr());
+    uint8_t *blockTablesPtr = static_cast<uint8_t *>(inputs()[2].GetDataPtr());
+    uint8_t *inputTokensPtr = static_cast<uint8_t *>(outputs()[0].GetDataPtr());
+    uint8_t *inputPositionsPtr =
+        static_cast<uint8_t *>(outputs()[1].GetDataPtr());
+    uint8_t *slotMappingPtr = static_cast<uint8_t *>(outputs()[3].GetDataPtr());
+    auto stride = inputs()[2].stride();
+    int32_t block_tables_stride = stride.empty() ? 1 : stride[0];
+
+    uint32_t blockDims = 1;
+    void *l2ctrl = nullptr;
+    AdvStepFlashKernelEntry(blockDims, l2ctrl, stream(), sampledTokenIdsPtr,
+                            blockTablesPtr, seqLensPtr, inputTokensPtr,
+                            inputPositionsPtr, seqLensPtr, slotMappingPtr,
+                            num_seqs_, block_size_, block_tables_stride);
+  }
+
+  static void Eval(int32_t num_seqs, int32_t num_queries, int32_t block_size,
+                   ms::Tensor input_tokens,      // output
+                   ms::Tensor sampled_token_ids, // input
+                   ms::Tensor input_positions,   // output
+                   ms::Tensor seq_lens,          // input&output (inplace)
+                   ms::Tensor slot_mapping,      // output
+                   ms::Tensor block_tables       // input
+  ) {
+    // the AdvStepFlashKernelEntry only support int32 inputs.
+    DtypeCaster caster;
+    sampled_token_ids = caster.CheckAndCast(sampled_token_ids);
+    block_tables = caster.CheckAndCast(block_tables);
+    input_tokens = caster.CheckAndCast(input_tokens, "input_tokens");
+    input_positions = caster.CheckAndCast(input_positions, "input_positions");
+    slot_mapping = caster.CheckAndCast(slot_mapping, "slot_mapping");
+    seq_lens = caster.CheckAndCast(seq_lens, "seq_lens");
+
+    auto runner = std::make_shared<AdvStepFlashOp>("AdvanceStepFlashattn");
+    runner->num_seqs_ = num_seqs;
+    runner->num_queries_ = num_queries;
+    runner->block_size_ = block_size;
+    runner->Run({sampled_token_ids, seq_lens, block_tables},
+                {input_tokens, input_positions, seq_lens, slot_mapping});
+
+    input_tokens = caster.RecoveryTensorDtype(input_tokens, "input_tokens");
+    input_positions =
+        caster.RecoveryTensorDtype(input_positions, "input_positions");
+    slot_mapping = caster.RecoveryTensorDtype(slot_mapping, "slot_mapping");
+    seq_lens = caster.RecoveryTensorDtype(seq_lens, "seq_lens");
+  }
+  int32_t num_seqs_{0};
+  int32_t num_queries_{0};
+  int32_t block_size_{0};
+};
+
+auto pyboost_adv_step_flash(int32_t num_seqs, int32_t num_queries,
+                            int32_t block_size, ms::Tensor input_tokens,
+                            ms::Tensor sampled_token_ids,
+                            ms::Tensor input_positions, ms::Tensor seq_lens,
+                            ms::Tensor slot_mapping, ms::Tensor block_tables) {
+  return ms::pynative::PyboostRunner::Call<0>(
+      AdvStepFlashOp::Eval, num_seqs, num_queries, block_size, input_tokens,
+      sampled_token_ids, input_positions, seq_lens, slot_mapping, block_tables);
+}
+
+VLLM_MS_EXTENSION_MODULE(m) {
+  m.def("advance_step_flashattn", &pyboost_adv_step_flash,
+        "advance_step_flashattn", pybind11::arg("num_seqs"),
+        pybind11::arg("num_queries"), pybind11::arg("block_size"),
+        pybind11::arg("input_tokens"), pybind11::arg("sampled_token_ids"),
+        pybind11::arg("input_positions"), pybind11::arg("seq_lens"),
+        pybind11::arg("slot_mapping"), pybind11::arg("block_tables"));
+}
diff --git a/vllm_mindspore/ops/module/module.cpp b/csrc/module/module.cpp
similarity index 100%
rename from vllm_mindspore/ops/module/module.cpp
rename to csrc/module/module.cpp
diff --git a/vllm_mindspore/ops/module/module.h b/csrc/module/module.h
similarity index 70%
rename from vllm_mindspore/ops/module/module.h
rename to csrc/module/module.h
index b979d3d58dd005daade249e708d19b12893b1492..acda4235d8f72ce549df1c91071dae87bebfc423 100644
--- a/vllm_mindspore/ops/module/module.h
+++ b/csrc/module/module.h
@@ -13,8 +13,8 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-#ifndef VLLM_MINDSPORE_OPS_MODULE_MODULE_H
-#define VLLM_MINDSPORE_OPS_MODULE_MODULE_H
+#ifndef VLLM_MINDSPORE_CSRC_MODULE_MODULE_H
+#define VLLM_MINDSPORE_CSRC_MODULE_MODULE_H
 
 #include
 #include
@@ -57,17 +57,21 @@ private:
   std::vector functions_;
 };
 
-// Define a macro to register module functions
-#define MS_EXTENSION_MODULE(func) \
-  static void func##_register(pybind11::module_ &); \
+#define CONCATENATE_DETAIL(x, y) x##y
+#define CONCATENATE(x, y) CONCATENATE_DETAIL(x, y)
+
+#define VLLM_MS_EXTENSION_MODULE(m) \
+  static void CONCATENATE(func_register_, __LINE__)(pybind11::module_ &); \
   namespace { \
-  struct func##_registrar { \
-    func##_registrar() { \
-      ModuleRegistry::Instance().Register(func##_register); \
+  struct CONCATENATE(func_registrar_, __LINE__) { \
+    CONCATENATE(func_registrar_, __LINE__)() { \
+      ModuleRegistry::Instance().Register( \
+          CONCATENATE(func_register_, __LINE__)); \
     } \
   }; \
-  static func##_registrar registrar_instance; \
+  static CONCATENATE(func_registrar_, __LINE__) \
+      CONCATENATE(registrar_instance_, __LINE__); \
   } \
-  static void func##_register(pybind11::module_ &m)
+  static void CONCATENATE(func_register_, __LINE__)(pybind11::module_ & m)
 
-#endif  // VLLM_MINDSPORE_OPS_MODULE_MODULE_H
+#endif  // VLLM_MINDSPORE_CSRC_MODULE_MODULE_H
diff --git a/setup.py b/setup.py
index 11d9b3fcc84d26aa854bf8df4226cefee075e085..cc632c7eecd9d5caf421045c72c61783df2cd673 100644
--- a/setup.py
+++ b/setup.py
@@ -123,18 +123,20 @@ class CustomBuildExt(build_ext):
     ROOT_DIR = os.path.abspath(os.path.dirname(__file__))
 
     def build_extension(self, ext):
-        if ext.name == "vllm_mindspore.npu_ops":
-            self.build_npu_ops(ext)
+        if ext.name == "vllm_mindspore._C_ops":
+            self.build_c_ops(ext)
         else:
             raise ValueError(f"Unknown extension name: {ext.name}")
 
-    def build_npu_ops(self, ext):
-        # "vllm_mindspore.npu_ops" --> "npu_ops"
+    def build_c_ops(self, ext):
+        # "vllm_mindspore._C_ops" --> "_C_ops"
         ext_name = ext.name.split('.')[-1]
         so_name = ext_name + ".so"
         logger.info("Building %s ...", so_name)
-        OPS_DIR = os.path.join(ROOT_DIR, "vllm_mindspore", "ops")
-        BUILD_OPS_DIR = os.path.join(ROOT_DIR, "build", "ops")
+        OPS_DIR = os.path.join(ROOT_DIR, "csrc")
+        BUILD_OPS_DIR = os.path.join(ROOT_DIR, "build", "csrc_ops")
+        if os.path.exists(BUILD_OPS_DIR):
+            shutil.rmtree(BUILD_OPS_DIR)
         os.makedirs(BUILD_OPS_DIR, exist_ok=True)
 
         ascend_home_path = _get_ascend_home_path()
@@ -152,24 +154,21 @@ class CustomBuildExt(build_ext):
             f" -DASCEND_CANN_PACKAGE_PATH={ascend_home_path} && "
             f"cmake --build {BUILD_OPS_DIR} -j --verbose")
 
-        try:
-            # Run the combined cmake command
-            logger.info("Running combined CMake commands:\n%s", cmake_cmd)
-            result = subprocess.run(cmake_cmd,
+        # Run the combined cmake command
+        logger.info("Running commands: \n%s", cmake_cmd)
+        build_log_file = os.path.join(BUILD_OPS_DIR, "build_log.txt")
+        with open(build_log_file, "w") as log_file:
+            result = subprocess.run(["bash", "-c", cmake_cmd],
                                     cwd=self.ROOT_DIR,
                                     text=True,
-                                    shell=True,
-                                    capture_output=True)
-            if result.returncode != 0:
-                logger.info("CMake commands failed:")
-                logger.info(result.stdout)  # Print standard output
-                logger.info(result.stderr)  # Print error output
-                raise RuntimeError(
-                    "Combined CMake commands failed with exit code {}".format(
-                        result.returncode))
-        except subprocess.CalledProcessError as e:
-            raise RuntimeError("Failed to build {}: {}".format(so_name,
-                                                               e)) from e
+                                    stdout=log_file,
+                                    stderr=log_file)
+        if result.returncode != 0:
+            logger.error("Command failed: '%s' exited with code %d", cmake_cmd,
+                         result.returncode)
+            raise RuntimeError(
+                "Failed to build {}, check the build log for details: {}".
+                format(ext_name, build_log_file))
 
         # Copy the generated .so file to the target directory
         src_so_path = os.path.join(build_extension_dir, so_name)
@@ -178,7 +177,7 @@
         if os.path.exists(dst_so_path):
             os.remove(dst_so_path)
         shutil.copy(src_so_path, dst_so_path)
-        logger.info("Copied %s to %s", so_name, dst_so_path)
+        logger.info("Build %s succeeded.", dst_so_path)
 
         write_commit_id()
 
@@ -190,7 +189,7 @@ def _get_ext_modules():
     ext_modules = []
     if os.path.exists(_get_ascend_home_path()):
         # sources are specified in CMakeLists.txt
-        ext_modules.append(Extension("vllm_mindspore.npu_ops", sources=[]))
+        ext_modules.append(Extension("vllm_mindspore._C_ops", sources=[]))
     return ext_modules
 
diff --git a/tests/st/python/test_custom_advstepflash.py b/tests/st/python/test_custom_advstepflash.py
index b8a97e0c29c33b2722c954593abf45d34db598db..c1273ba078bf2e557821f183ff6b1855bcf81eab 100644
--- a/tests/st/python/test_custom_advstepflash.py
+++ b/tests/st/python/test_custom_advstepflash.py
@@ -14,13 +14,15 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """test case for custom op adv_step_flash"""
-import time
+# isort: skip_file
+
+import time
 
 import mindspore as ms
 import numpy as np
 import pytest
 
-from vllm_mindspore import npu_ops
+from vllm_mindspore import _custom_ops as custom_ops
 import torch
 
 from .utils import cleanup_subprocesses
@@ -103,15 +105,15 @@ def test_advstepflash():
     sampled_token_ids2, input_tokens2, input_positions2, seq_lens_tensor2, \
         block_tables2, slot_mapping2 = \
         gendata(seed, num_seqs, block_size, block_num, ms.Tensor)
-    npu_ops.adv_step_flash(num_seqs=num_seqs,
-                           num_queries=num_queries,
-                           block_size=block_size,
-                           input_tokens=input_tokens2,
-                           sampled_token_ids=sampled_token_ids2,
-                           input_positions=input_positions2,
-                           seq_lens=seq_lens_tensor2,
-                           slot_mapping=slot_mapping2,
-                           block_tables=block_tables2)
+    custom_ops.advance_step_flashattn(num_seqs=num_seqs,
+                                      num_queries=num_queries,
+                                      block_size=block_size,
+                                      input_tokens=input_tokens2,
+                                      sampled_token_ids=sampled_token_ids2,
+                                      input_positions=input_positions2,
+                                      seq_lens=seq_lens_tensor2,
+                                      slot_mapping=slot_mapping2,
+                                      block_tables=block_tables2)
 
     assert np.allclose(sampled_token_ids1, sampled_token_ids2.asnumpy())
     assert np.allclose(input_tokens1, input_tokens2.asnumpy())
diff --git a/vllm_mindspore/_custom_ops.py b/vllm_mindspore/_custom_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..1bf50507c608c1d1f20131e6076012436e4933b7
--- /dev/null
+++ b/vllm_mindspore/_custom_ops.py
@@ -0,0 +1,37 @@
+#!/usr/bin/env python3
+# SPDX-License-Identifier: Apache-2.0
+# Copyright 2025 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+
+import mindspore as ms
+
+
+def advance_step_flashattn(num_seqs: int, num_queries: int, block_size: int,
+                           input_tokens: ms.Tensor,
+                           sampled_token_ids: ms.Tensor,
+                           input_positions: ms.Tensor, seq_lens: ms.Tensor,
+                           slot_mapping: ms.Tensor,
+                           block_tables: ms.Tensor) -> None:
+    """Advance a step on Ascend for existing inputs for a multi-step runner"""
+    from vllm_mindspore import _C_ops as c_ops
+    c_ops.advance_step_flashattn(num_seqs=num_seqs,
+                                 num_queries=num_queries,
+                                 block_size=block_size,
+                                 input_tokens=input_tokens,
+                                 sampled_token_ids=sampled_token_ids,
+                                 input_positions=input_positions,
+                                 seq_lens=seq_lens,
+                                 slot_mapping=slot_mapping,
+                                 block_tables=block_tables)
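
Usage sketch (illustrative, not part of the diff): the snippet below shows how the new vllm_mindspore._custom_ops.advance_step_flashattn wrapper could be called once the vllm_mindspore._C_ops extension has been built on an Ascend device. The tensor shapes and fill values are assumptions for illustration (they follow vLLM's multi-step advance_step convention and are not fixed by this change); int32 dtypes are used so the C++ DtypeCaster does not need to cast and restore anything. The test tests/st/python/test_custom_advstepflash.py remains the authoritative example.

    import mindspore as ms
    import numpy as np

    from vllm_mindspore import _custom_ops as custom_ops

    # Assumed sizes, for illustration only.
    num_seqs, num_queries, block_size, block_num = 4, 4, 16, 8

    # int32 tensors avoid the DtypeCaster round-trip inside the C++ op.
    sampled_token_ids = ms.Tensor(
        np.random.randint(0, 1000, size=(num_queries, 1)).astype(np.int32))  # input
    input_tokens = ms.Tensor(np.zeros(num_seqs, dtype=np.int32))     # written by the op
    input_positions = ms.Tensor(np.zeros(num_seqs, dtype=np.int32))  # written by the op
    seq_lens = ms.Tensor(np.full(num_seqs, 8, dtype=np.int32))       # updated in place
    slot_mapping = ms.Tensor(np.zeros(num_seqs, dtype=np.int32))     # written by the op
    block_tables = ms.Tensor(
        np.arange(num_seqs * block_num, dtype=np.int32).reshape(num_seqs, block_num))

    custom_ops.advance_step_flashattn(num_seqs=num_seqs,
                                      num_queries=num_queries,
                                      block_size=block_size,
                                      input_tokens=input_tokens,
                                      sampled_token_ids=sampled_token_ids,
                                      input_positions=input_positions,
                                      seq_lens=seq_lens,
                                      slot_mapping=slot_mapping,
                                      block_tables=block_tables)

    print(input_tokens, input_positions, seq_lens, slot_mapping)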