From 4ef75d52db10fa717a518181b6319e4245ae2b8c Mon Sep 17 00:00:00 2001 From: hongshiyi Date: Fri, 5 Sep 2025 10:57:24 +0800 Subject: [PATCH 01/94] support core control --- .../platform/platform_infos_def.cpp | 72 +++++++++++-------- 1 file changed, 44 insertions(+), 28 deletions(-) diff --git a/src/kernels/tbe_adapter/platform/platform_infos_def.cpp b/src/kernels/tbe_adapter/platform/platform_infos_def.cpp index 268de985..816c822a 100644 --- a/src/kernels/tbe_adapter/platform/platform_infos_def.cpp +++ b/src/kernels/tbe_adapter/platform/platform_infos_def.cpp @@ -12,6 +12,7 @@ #include #include #include "platform_infos_impl.h" +#include "acl/acl_rt.h" namespace fe { constexpr uint32_t MAX_CORE_NUM = 128; @@ -102,46 +103,61 @@ void PlatFormInfos::SetFixPipeDtypeMap(const std::map lockGuard(g_asdopsFePlatMutex); - (void)GetPlatformRes("SoCInfo", coreTypeStr, coreNumStr); - MKI_LOG(DEBUG) << "Set PlatFormInfos::core_num_ to " << coreTypeStr << ": " << coreNumStr; - if (coreNumStr.empty()) { - core_num_ = 1; - MKI_LOG(ERROR) << "CoreNumStr is empty!"; - } else { - core_num_ = std::strtoul(coreNumStr.c_str(), nullptr, 10); // 10 进制 - if (core_num_ > MAX_CORE_NUM) { + std::string coreNumStr; + std::string coreTypeStr; + if (core_type == "VectorCore") { + coreTypeStr = "vector_core_cnt"; + } else { + coreTypeStr = "ai_core_cnt"; + } + std::lock_guardstd::mutex lockGuard(g_asdopsFePlatMutex); + (void)GetPlatformRes("SoCInfo", coreTypeStr, coreNumStr); + MKI_LOG(DEBUG) << "Set PlatFormInfos::core_num_ to " << coreTypeStr << ": " << coreNumStr; + if (coreNumStr.empty()) { core_num_ = 1; - MKI_LOG(ERROR) << "core_num is out of range : " << core_num_; + MKI_LOG(ERROR) << "CoreNumStr is empty!"; + } else { + core_num_ = std::strtoul(coreNumStr.c_str(), nullptr, 10); // 10 进制 } } + if (core_num_ == 0 || core_num_ > MAX_CORE_NUM) { + MKI_LOG(ERROR) << "core_num is out of range : " << core_num_; + core_num_ = 1; + } } uint32_t PlatFormInfos::GetCoreNumByType(const std::string 
&core_type) { - std::string coreNumStr; - std::string coreTypeStr = core_type == "VectorCore" ? "vector_core_cnt" : "ai_core_cnt"; - std::lock_guard lockGuard(g_asdopsFePlatMutex); - (void)GetPlatformRes("SoCInfo", coreTypeStr, coreNumStr); - MKI_LOG(DEBUG) << "Get PlatFormInfos::core_num_ to " << coreTypeStr << ": " << coreNumStr; - if (coreNumStr.empty()) { - MKI_LOG(ERROR) << "CoreNumStr is empty!"; - return 1; + uint32_t coreNum = 0; + aclrtDevResLimitType resType = core_type == "VectorCore" ? ACL_RT_DEV_RES_VECTOR_CORE : ACL_RT_DEV_RES_CUBE_CORE; + aclError getResRet = aclrtGetResInCurrentThread(resType, &coreNum); + if (getResRet == ACL_SUCCESS) { + MKI_LOG(DEBUG) << "Get ThreadResource::core_num_ to " << core_type << ": " << coreNum; } else { - uint32_t coreTypeNum = std::strtoul(coreNumStr.c_str(), nullptr, 10); // 10 进制 - if (coreTypeNum > MAX_CORE_NUM) { - MKI_LOG(ERROR) << "core_num is out of range : " << coreTypeNum; + std::string coreNumStr; + std::string coreTypeStr = core_type == "VectorCore" ? 
"vector_core_cnt" : "ai_core_cnt"; + std::lock_guardstd::mutex lockGuard(g_asdopsFePlatMutex); + (void)GetPlatformRes("SoCInfo", coreTypeStr, coreNumStr); + MKI_LOG(DEBUG) << "Get PlatFormInfos::core_num_ to " << coreTypeStr << ": " << coreNumStr; + if (coreNumStr.empty()) { + MKI_LOG(ERROR) << "CoreNumStr is empty!"; return 1; + } else { + coreNum = std::strtoul(coreNumStr.c_str(), nullptr, 10); // 10 进制 } - return coreTypeNum; } + if (coreNum > MAX_CORE_NUM) { + MKI_LOG(ERROR) << "core_num is out of range : " << coreNum; + return 1; + } + return coreNum; } void PlatFormInfos::SetCoreNum(const uint32_t &coreNum) -- Gitee From 9fdd0f5a1a2fd855c898d590c6ca23b2ba253c26 Mon Sep 17 00:00:00 2001 From: hongshiyi Date: Fri, 5 Sep 2025 14:16:28 +0800 Subject: [PATCH 02/94] add <> --- src/kernels/tbe_adapter/platform/platform_infos_def.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/kernels/tbe_adapter/platform/platform_infos_def.cpp b/src/kernels/tbe_adapter/platform/platform_infos_def.cpp index 816c822a..e0ac8527 100644 --- a/src/kernels/tbe_adapter/platform/platform_infos_def.cpp +++ b/src/kernels/tbe_adapter/platform/platform_infos_def.cpp @@ -117,7 +117,7 @@ void PlatFormInfos::SetCoreNumByCoreType(const std::string &core_type) } else { coreTypeStr = "ai_core_cnt"; } - std::lock_guardstd::mutex lockGuard(g_asdopsFePlatMutex); + std::lock_guard lockGuard(g_asdopsFePlatMutex); (void)GetPlatformRes("SoCInfo", coreTypeStr, coreNumStr); MKI_LOG(DEBUG) << "Set PlatFormInfos::core_num_ to " << coreTypeStr << ": " << coreNumStr; if (coreNumStr.empty()) { @@ -143,7 +143,7 @@ uint32_t PlatFormInfos::GetCoreNumByType(const std::string &core_type) } else { std::string coreNumStr; std::string coreTypeStr = core_type == "VectorCore" ? 
"vector_core_cnt" : "ai_core_cnt"; - std::lock_guardstd::mutex lockGuard(g_asdopsFePlatMutex); + std::lock_guard lockGuard(g_asdopsFePlatMutex); (void)GetPlatformRes("SoCInfo", coreTypeStr, coreNumStr); MKI_LOG(DEBUG) << "Get PlatFormInfos::core_num_ to " << coreTypeStr << ": " << coreNumStr; if (coreNumStr.empty()) { -- Gitee From 11179f21897d84b8d124f6f42e4200e0b0a918a3 Mon Sep 17 00:00:00 2001 From: hongshiyi Date: Tue, 9 Sep 2025 20:04:50 +0800 Subject: [PATCH 03/94] add zero judge --- src/kernels/tbe_adapter/platform/platform_infos_def.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/kernels/tbe_adapter/platform/platform_infos_def.cpp b/src/kernels/tbe_adapter/platform/platform_infos_def.cpp index e0ac8527..2cb40ebf 100644 --- a/src/kernels/tbe_adapter/platform/platform_infos_def.cpp +++ b/src/kernels/tbe_adapter/platform/platform_infos_def.cpp @@ -153,7 +153,7 @@ uint32_t PlatFormInfos::GetCoreNumByType(const std::string &core_type) coreNum = std::strtoul(coreNumStr.c_str(), nullptr, 10); // 10 进制 } } - if (coreNum > MAX_CORE_NUM) { + if (coreNum == 0 || coreNum > MAX_CORE_NUM) { MKI_LOG(ERROR) << "core_num is out of range : " << coreNum; return 1; } -- Gitee From ee0ae57f3d9ea14d627b9e34fd34c4b33441bd68 Mon Sep 17 00:00:00 2001 From: hongshiyi Date: Wed, 10 Sep 2025 15:22:07 +0800 Subject: [PATCH 04/94] add compile option --- src/torch_atb/CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/src/torch_atb/CMakeLists.txt b/src/torch_atb/CMakeLists.txt index 50111f2e..073e51f3 100644 --- a/src/torch_atb/CMakeLists.txt +++ b/src/torch_atb/CMakeLists.txt @@ -12,6 +12,7 @@ file(GLOB_RECURSE pybind11_source_files "*.cpp") pybind11_add_module(_C ${pybind11_source_files}) set_target_properties(_C PROPERTIES OUTPUT_NAME "_C" SUFFIX ".so") target_link_options(_C PRIVATE -rdynamic -ldl -Wl,-z,relro -Wl,-z,now -Wl,-z,noexecstack -Wl,--build-id=none -fexceptions) +target_compile_options(_C PRIVATE -Wno-odr-violation) 
target_link_libraries(_C PRIVATE torch_npu) target_include_directories(_C PRIVATE ${ATB_INCLUDE_DIR}) install(TARGETS _C DESTINATION ${CMAKE_SOURCE_DIR}/output/torch_atb) \ No newline at end of file -- Gitee From 2fb4bb1bd7cba5b1ffcbd49903c9973c4b2c5eb8 Mon Sep 17 00:00:00 2001 From: hongshiyi Date: Wed, 10 Sep 2025 17:51:03 +0800 Subject: [PATCH 05/94] change cmakelist --- src/torch_atb/CMakeLists.txt | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/torch_atb/CMakeLists.txt b/src/torch_atb/CMakeLists.txt index 073e51f3..f75429df 100644 --- a/src/torch_atb/CMakeLists.txt +++ b/src/torch_atb/CMakeLists.txt @@ -11,8 +11,7 @@ file(GLOB_RECURSE pybind11_source_files "*.cpp") pybind11_add_module(_C ${pybind11_source_files}) set_target_properties(_C PROPERTIES OUTPUT_NAME "_C" SUFFIX ".so") -target_link_options(_C PRIVATE -rdynamic -ldl -Wl,-z,relro -Wl,-z,now -Wl,-z,noexecstack -Wl,--build-id=none -fexceptions) -target_compile_options(_C PRIVATE -Wno-odr-violation) +target_link_options(_C PRIVATE -Wno-odr -rdynamic -ldl -Wl,-z,relro -Wl,-z,now -Wl,-z,noexecstack -Wl,--build-id=none -fexceptions) target_link_libraries(_C PRIVATE torch_npu) target_include_directories(_C PRIVATE ${ATB_INCLUDE_DIR}) install(TARGETS _C DESTINATION ${CMAKE_SOURCE_DIR}/output/torch_atb) \ No newline at end of file -- Gitee From 5ccc9703c790bb503aee7f92246b1971e9427365 Mon Sep 17 00:00:00 2001 From: He Changcheng Date: Thu, 11 Sep 2025 19:36:56 +0800 Subject: [PATCH 06/94] fix --- comm/lcal/src/CMakeLists.txt | 2 +- comm/lcal/src/lcal_internal.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/comm/lcal/src/CMakeLists.txt b/comm/lcal/src/CMakeLists.txt index 400edecd..17cdc9cd 100644 --- a/comm/lcal/src/CMakeLists.txt +++ b/comm/lcal/src/CMakeLists.txt @@ -26,7 +26,7 @@ target_link_libraries(lcal ascendcl runtime profapi c_sec mki) target_link_libraries(lcal_static ascendcl runtime profapi c_sec mki) message(STATUS "LCAL USE_MSSANITIZER = 
${USE_MSSANITIZER}") -set(LCAL_CCE_PATH "/tmp/lcal_cce.o") +set(LCAL_CCE_PATH "${CMAKE_CURRENT_BINARY_DIR}/lcal_cce.o") if(USE_MSSANITIZER) math(EXPR LCAL_1OP_BIN_SIZE "128 * 1024 * 1024") add_definitions(-DUSE_MSSANITIZER) diff --git a/comm/lcal/src/lcal_internal.cpp b/comm/lcal/src/lcal_internal.cpp index 5fd8b8b5..0dff180e 100644 --- a/comm/lcal/src/lcal_internal.cpp +++ b/comm/lcal/src/lcal_internal.cpp @@ -23,7 +23,7 @@ using namespace Mki; extern const int LCAL_CCE_BIN_STR[]; asm(R"(.section .rodata, "a", @progbits -LCAL_CCE_BIN_STR:.incbin "/tmp/lcal_cce.o" +LCAL_CCE_BIN_STR:.incbin "lcal_cce.o" .byte 0 .previous)"); -- Gitee From 4961585a1a329e6e16f641288cdcc6383a7dd2ee Mon Sep 17 00:00:00 2001 From: guo-jiong Date: Wed, 10 Sep 2025 17:51:16 +0800 Subject: [PATCH 07/94] move lcal and cinterface --- CMakeLists.txt | 13 +- src/CMakeLists.txt | 2 - .../utils}/atb_acl_util.cpp | 0 src/{cinterface => atb/utils}/atb_acl_util.h | 0 src/kernels/CMakeLists.txt | 1 + {comm => src/kernels}/lcal/CMakeLists.txt | 1 + .../lcal/cmake/CMakeCCECompiler.cmake.in | 0 .../lcal/cmake/CMakeCCEInformation.cmake | 0 .../cmake/CMakeDetermineCCECompiler.cmake | 0 .../lcal/cmake/CMakeTestCCECompiler.cmake | 0 .../kernels}/lcal/include/comm_args.h | 0 {comm => src/kernels}/lcal/include/lcal.h | 0 {comm => src/kernels}/lcal/include/lcal_api.h | 0 .../kernels}/lcal/include/lcal_comm.h | 0 .../kernels}/lcal/include/lcal_types.h | 0 {comm => src/kernels}/lcal/include/lccl.h | 0 .../kernels}/lcal/include/lcoc/lcoc.h | 0 .../kernels}/lcal/include/lcoc/lcoc_args.h | 0 .../kernels}/lcal/include/lcoc/lcoc_base.h | 0 .../kernels}/lcal/include/lcoc/lcoc_func.h | 0 .../lcal/include/lcoc/lcoc_workspace.h | 0 .../lcal/include/lcoc/tiling/tiling.h | 0 .../lcal/include/lcoc/tiling/tiling_91093.h | 0 .../lcal/include/lcoc/tiling/tiling_910B.h | 0 .../lcal/include/lcoc/tiling/tiling_args.h | 0 .../lcal/include/lcoc/tiling/tiling_func.h | 0 {comm => src/kernels}/lcal/src/CMakeLists.txt | 0 {comm => 
src/kernels}/lcal/src/ascendc.cmake | 0 .../ascendc_kernels/91093/all2all_hierarchy.h | 0 .../91093/all2all_hierarchy_small.h | 0 .../91093/allgather_hierarchy_double_ring.h | 0 .../91093/allreduce_big_data_sio.h | 0 .../91093/allreduce_hierarchy_double_ring.h | 0 .../reduce_scatter_big_data_91093_4step.h | 0 .../reduce_scatter_hierarchy_double_ring.h | 0 .../lcal/src/ascendc_kernels/CMakeLists.txt | 0 .../lcal/src/ascendc_kernels/allgather.h | 0 .../src/ascendc_kernels/allreduce_big_data.h | 0 .../src/ascendc_kernels/allreduce_one_shot.h | 0 .../src/ascendc_kernels/allreduce_quant.h | 0 .../src/ascendc_kernels/allreduce_two_shot.h | 0 .../lcal/src/ascendc_kernels/collectives.h | 0 .../lcal/src/ascendc_kernels/datacopy_gm2gm.h | 0 .../ascendc_kernels/datacopy_gm2gm_delay.h | 0 .../lcal/src/ascendc_kernels/ipc_queue.h | 0 .../lcal/src/ascendc_kernels/lccl_op.h | 0 .../lcal/src/ascendc_kernels/lccl_op1.cpp | 0 .../lcal/src/ascendc_kernels/lccl_op2.cpp | 0 .../lcal/src/ascendc_kernels/op_def.h | 0 .../lcal/src/ascendc_kernels/reduce_scatter.h | 0 .../src/ascendc_kernels/sync_collectives.h | 0 .../kernels}/lcal/src/ccl_kernel_args.h | 0 .../kernels}/lcal/src/coc_kernel_args.cpp | 0 .../kernels}/lcal/src/coc_kernel_args.h | 0 .../kernels}/lcal/src/kernels/CMakeLists.txt | 0 .../lcal/src/kernels/coc_add_bias_runner.cce | 0 .../lcal/src/kernels/coc_allgather.cce | 0 .../lcal/src/kernels/coc_allgather_matmul.cce | 0 .../coc_allgather_matmul_reduce_scatter.cce | 0 .../kernels/coc_allgather_reducescatter.cce | 0 .../lcal/src/kernels/coc_allgather_v2.cce | 0 .../lcal/src/kernels/coc_allreduce.cce | 0 .../kernels/coc_alltoall_allgather_hidden.cce | 0 .../coc_alltoall_reduce_scatter_hidden.cce | 0 .../src/kernels/coc_alltoallv_allgather.cce | 0 .../coc_alltoallv_allgather_matmul.cce | 0 .../lcal/src/kernels/coc_comm_base.cce | 0 .../lcal/src/kernels/coc_const_args.cce | 0 .../lcal/src/kernels/coc_dequant_runner.cce | 0 .../lcal/src/kernels/coc_internal.cce | 0 
.../lcal/src/kernels/coc_matmul_allreduce.cce | 0 .../src/kernels/coc_matmul_reduce_scatter.cce | 0 .../coc_matmul_reduce_scatter_alltoallv.cce | 0 .../lcal/src/kernels/coc_matmulmoe.cce | 0 .../lcal/src/kernels/coc_postprocessor.cce | 0 .../lcal/src/kernels/coc_ppmatmul.cce | 0 .../lcal/src/kernels/coc_ppmatmul_switch.cce | 0 .../lcal/src/kernels/coc_preprocessor.cce | 0 .../lcal/src/kernels/coc_pure_matmul.cce | 0 .../lcal/src/kernels/coc_reduce_scatter.cce | 0 .../kernels}/lcal/src/kernels/collectives.cce | 0 .../src/kernels/lcal_all2all_transpose.cce | 0 .../lcal/src/kernels/lcal_allgather.cce | 0 .../lcal/src/kernels/lcal_allgather_2npu.cce | 0 .../lcal_allgather_2npu_big_data_write.cce | 0 .../src/kernels/lcal_allgather_910B2C.cce | 0 .../src/kernels/lcal_allgather_big_data.cce | 0 .../lcal_allgather_big_data_910B2C.cce | 0 .../kernels/lcal_allreduce_2npu_big_write.cce | 0 .../src/kernels/lcal_allreduce_2npu_read.cce | 0 .../src/kernels/lcal_allreduce_2npu_write.cce | 0 .../src/kernels/lcal_allreduce_big_data.cce | 0 .../lcal_allreduce_big_data_910B2C.cce | 0 .../kernels/lcal_allreduce_deterministic.cce | 0 .../lcal_allreduce_deterministic_big_data.cce | 0 .../src/kernels/lcal_allreduce_two_shot.cce | 0 .../lcal_allreduce_two_shot_910B2C.cce | 0 .../src/kernels/lcal_broadcast_big_data.cce | 0 .../lcal/src/kernels/lcal_broadcast_write.cce | 0 .../lcal/src/kernels/lcal_reduce_scatter.cce | 0 .../kernels/lcal_reduce_scatter_big_data.cce | 0 .../lcal_reduce_scatter_big_data_write.cce | 0 .../src/kernels/lcal_reduce_scatter_write.cce | 0 {comm => src/kernels}/lcal/src/lcal_comm.cpp | 0 .../kernels}/lcal/src/lcal_internal.cpp | 0 .../kernels}/lcal/src/lcal_internal.h | 0 {comm => src/kernels}/lcal/src/lcal_wrap.cpp | 0 {comm => src/kernels}/lcal/src/lccl.cpp | 0 {comm => src/kernels}/lcal/src/lcoc.cpp | 0 {comm => src/kernels}/lcal/src/lcoc_func.cpp | 0 .../lcal/src/profiling/report_timing.h | 0 .../tiling/allgather_reducescatter_tiling.cpp | 0 
.../lcal/src/tiling/allgather_tiling.cpp | 0 .../src/tiling/allgather_tiling_91093.cpp | 0 .../lcal/src/tiling/allgather_tiling_910B.cpp | 0 .../src/tiling/allgatherv2_tiling_91093.cpp | 0 .../src/tiling/allgatherv2_tiling_910B.cpp | 0 .../lcal/src/tiling/allreduce_tiling.cpp | 0 .../src/tiling/allreduce_tiling_91093.cpp | 0 .../lcal/src/tiling/allreduce_tiling_910B.cpp | 0 .../alltoall_allgather_hidden_tiling.cpp | 0 .../src/tiling/alltoall_allgather_tiling.cpp | 0 .../reducescatter_alltoall_hidden_tiling.cpp | 0 .../lcal/src/tiling/reducescatter_tiling.cpp | 0 .../src/tiling/reducescatter_tiling_91093.cpp | 0 .../src/tiling/reducescatter_tiling_910B.cpp | 0 .../kernels}/lcal/src/tiling/tiling.cpp | 0 .../kernels}/lcal/src/tiling/tiling_args.cpp | 0 .../kernels}/lcal/src/tiling/tiling_func.cpp | 0 .../src/tools/socket/lcal_sock_exchange.cpp | 0 .../src/tools/socket/lcal_sock_exchange.h | 0 .../atb_acl_fused_add_topk_div.cpp | 2 +- .../atb_acl_mla_preprocess.cpp | 358 ++++++++-------- .../multi_latent_attention}/atb_acl_mla.cpp | 402 +++++++++--------- .../atb_acl_paged_cache_load.cpp | 2 +- .../ring_mla}/atb_acl_ring_mla.cpp | 198 ++++----- .../atb_acl_self_attention_prefix_encoder.cpp | 228 +++++----- 137 files changed, 603 insertions(+), 604 deletions(-) rename src/{cinterface => atb/utils}/atb_acl_util.cpp (100%) rename src/{cinterface => atb/utils}/atb_acl_util.h (100%) rename {comm => src/kernels}/lcal/CMakeLists.txt (98%) rename {comm => src/kernels}/lcal/cmake/CMakeCCECompiler.cmake.in (100%) rename {comm => src/kernels}/lcal/cmake/CMakeCCEInformation.cmake (100%) rename {comm => src/kernels}/lcal/cmake/CMakeDetermineCCECompiler.cmake (100%) rename {comm => src/kernels}/lcal/cmake/CMakeTestCCECompiler.cmake (100%) rename {comm => src/kernels}/lcal/include/comm_args.h (100%) rename {comm => src/kernels}/lcal/include/lcal.h (100%) rename {comm => src/kernels}/lcal/include/lcal_api.h (100%) rename {comm => src/kernels}/lcal/include/lcal_comm.h (100%) rename 
{comm => src/kernels}/lcal/include/lcal_types.h (100%) rename {comm => src/kernels}/lcal/include/lccl.h (100%) rename {comm => src/kernels}/lcal/include/lcoc/lcoc.h (100%) rename {comm => src/kernels}/lcal/include/lcoc/lcoc_args.h (100%) rename {comm => src/kernels}/lcal/include/lcoc/lcoc_base.h (100%) rename {comm => src/kernels}/lcal/include/lcoc/lcoc_func.h (100%) rename {comm => src/kernels}/lcal/include/lcoc/lcoc_workspace.h (100%) rename {comm => src/kernels}/lcal/include/lcoc/tiling/tiling.h (100%) rename {comm => src/kernels}/lcal/include/lcoc/tiling/tiling_91093.h (100%) rename {comm => src/kernels}/lcal/include/lcoc/tiling/tiling_910B.h (100%) rename {comm => src/kernels}/lcal/include/lcoc/tiling/tiling_args.h (100%) rename {comm => src/kernels}/lcal/include/lcoc/tiling/tiling_func.h (100%) rename {comm => src/kernels}/lcal/src/CMakeLists.txt (100%) rename {comm => src/kernels}/lcal/src/ascendc.cmake (100%) rename {comm => src/kernels}/lcal/src/ascendc_kernels/91093/all2all_hierarchy.h (100%) rename {comm => src/kernels}/lcal/src/ascendc_kernels/91093/all2all_hierarchy_small.h (100%) rename {comm => src/kernels}/lcal/src/ascendc_kernels/91093/allgather_hierarchy_double_ring.h (100%) rename {comm => src/kernels}/lcal/src/ascendc_kernels/91093/allreduce_big_data_sio.h (100%) rename {comm => src/kernels}/lcal/src/ascendc_kernels/91093/allreduce_hierarchy_double_ring.h (100%) rename {comm => src/kernels}/lcal/src/ascendc_kernels/91093/reduce_scatter_big_data_91093_4step.h (100%) rename {comm => src/kernels}/lcal/src/ascendc_kernels/91093/reduce_scatter_hierarchy_double_ring.h (100%) rename {comm => src/kernels}/lcal/src/ascendc_kernels/CMakeLists.txt (100%) rename {comm => src/kernels}/lcal/src/ascendc_kernels/allgather.h (100%) rename {comm => src/kernels}/lcal/src/ascendc_kernels/allreduce_big_data.h (100%) rename {comm => src/kernels}/lcal/src/ascendc_kernels/allreduce_one_shot.h (100%) rename {comm => 
src/kernels}/lcal/src/ascendc_kernels/allreduce_quant.h (100%) rename {comm => src/kernels}/lcal/src/ascendc_kernels/allreduce_two_shot.h (100%) rename {comm => src/kernels}/lcal/src/ascendc_kernels/collectives.h (100%) rename {comm => src/kernels}/lcal/src/ascendc_kernels/datacopy_gm2gm.h (100%) rename {comm => src/kernels}/lcal/src/ascendc_kernels/datacopy_gm2gm_delay.h (100%) rename {comm => src/kernels}/lcal/src/ascendc_kernels/ipc_queue.h (100%) rename {comm => src/kernels}/lcal/src/ascendc_kernels/lccl_op.h (100%) rename {comm => src/kernels}/lcal/src/ascendc_kernels/lccl_op1.cpp (100%) rename {comm => src/kernels}/lcal/src/ascendc_kernels/lccl_op2.cpp (100%) rename {comm => src/kernels}/lcal/src/ascendc_kernels/op_def.h (100%) rename {comm => src/kernels}/lcal/src/ascendc_kernels/reduce_scatter.h (100%) rename {comm => src/kernels}/lcal/src/ascendc_kernels/sync_collectives.h (100%) rename {comm => src/kernels}/lcal/src/ccl_kernel_args.h (100%) rename {comm => src/kernels}/lcal/src/coc_kernel_args.cpp (100%) rename {comm => src/kernels}/lcal/src/coc_kernel_args.h (100%) rename {comm => src/kernels}/lcal/src/kernels/CMakeLists.txt (100%) rename {comm => src/kernels}/lcal/src/kernels/coc_add_bias_runner.cce (100%) rename {comm => src/kernels}/lcal/src/kernels/coc_allgather.cce (100%) rename {comm => src/kernels}/lcal/src/kernels/coc_allgather_matmul.cce (100%) rename {comm => src/kernels}/lcal/src/kernels/coc_allgather_matmul_reduce_scatter.cce (100%) rename {comm => src/kernels}/lcal/src/kernels/coc_allgather_reducescatter.cce (100%) rename {comm => src/kernels}/lcal/src/kernels/coc_allgather_v2.cce (100%) rename {comm => src/kernels}/lcal/src/kernels/coc_allreduce.cce (100%) rename {comm => src/kernels}/lcal/src/kernels/coc_alltoall_allgather_hidden.cce (100%) rename {comm => src/kernels}/lcal/src/kernels/coc_alltoall_reduce_scatter_hidden.cce (100%) rename {comm => src/kernels}/lcal/src/kernels/coc_alltoallv_allgather.cce (100%) rename {comm => 
src/kernels}/lcal/src/kernels/coc_alltoallv_allgather_matmul.cce (100%) rename {comm => src/kernels}/lcal/src/kernels/coc_comm_base.cce (100%) rename {comm => src/kernels}/lcal/src/kernels/coc_const_args.cce (100%) rename {comm => src/kernels}/lcal/src/kernels/coc_dequant_runner.cce (100%) rename {comm => src/kernels}/lcal/src/kernels/coc_internal.cce (100%) rename {comm => src/kernels}/lcal/src/kernels/coc_matmul_allreduce.cce (100%) rename {comm => src/kernels}/lcal/src/kernels/coc_matmul_reduce_scatter.cce (100%) rename {comm => src/kernels}/lcal/src/kernels/coc_matmul_reduce_scatter_alltoallv.cce (100%) rename {comm => src/kernels}/lcal/src/kernels/coc_matmulmoe.cce (100%) rename {comm => src/kernels}/lcal/src/kernels/coc_postprocessor.cce (100%) rename {comm => src/kernels}/lcal/src/kernels/coc_ppmatmul.cce (100%) rename {comm => src/kernels}/lcal/src/kernels/coc_ppmatmul_switch.cce (100%) rename {comm => src/kernels}/lcal/src/kernels/coc_preprocessor.cce (100%) rename {comm => src/kernels}/lcal/src/kernels/coc_pure_matmul.cce (100%) rename {comm => src/kernels}/lcal/src/kernels/coc_reduce_scatter.cce (100%) rename {comm => src/kernels}/lcal/src/kernels/collectives.cce (100%) rename {comm => src/kernels}/lcal/src/kernels/lcal_all2all_transpose.cce (100%) rename {comm => src/kernels}/lcal/src/kernels/lcal_allgather.cce (100%) rename {comm => src/kernels}/lcal/src/kernels/lcal_allgather_2npu.cce (100%) rename {comm => src/kernels}/lcal/src/kernels/lcal_allgather_2npu_big_data_write.cce (100%) rename {comm => src/kernels}/lcal/src/kernels/lcal_allgather_910B2C.cce (100%) rename {comm => src/kernels}/lcal/src/kernels/lcal_allgather_big_data.cce (100%) rename {comm => src/kernels}/lcal/src/kernels/lcal_allgather_big_data_910B2C.cce (100%) rename {comm => src/kernels}/lcal/src/kernels/lcal_allreduce_2npu_big_write.cce (100%) rename {comm => src/kernels}/lcal/src/kernels/lcal_allreduce_2npu_read.cce (100%) rename {comm => 
src/kernels}/lcal/src/kernels/lcal_allreduce_2npu_write.cce (100%) rename {comm => src/kernels}/lcal/src/kernels/lcal_allreduce_big_data.cce (100%) rename {comm => src/kernels}/lcal/src/kernels/lcal_allreduce_big_data_910B2C.cce (100%) rename {comm => src/kernels}/lcal/src/kernels/lcal_allreduce_deterministic.cce (100%) rename {comm => src/kernels}/lcal/src/kernels/lcal_allreduce_deterministic_big_data.cce (100%) rename {comm => src/kernels}/lcal/src/kernels/lcal_allreduce_two_shot.cce (100%) rename {comm => src/kernels}/lcal/src/kernels/lcal_allreduce_two_shot_910B2C.cce (100%) rename {comm => src/kernels}/lcal/src/kernels/lcal_broadcast_big_data.cce (100%) rename {comm => src/kernels}/lcal/src/kernels/lcal_broadcast_write.cce (100%) rename {comm => src/kernels}/lcal/src/kernels/lcal_reduce_scatter.cce (100%) rename {comm => src/kernels}/lcal/src/kernels/lcal_reduce_scatter_big_data.cce (100%) rename {comm => src/kernels}/lcal/src/kernels/lcal_reduce_scatter_big_data_write.cce (100%) rename {comm => src/kernels}/lcal/src/kernels/lcal_reduce_scatter_write.cce (100%) rename {comm => src/kernels}/lcal/src/lcal_comm.cpp (100%) rename {comm => src/kernels}/lcal/src/lcal_internal.cpp (100%) rename {comm => src/kernels}/lcal/src/lcal_internal.h (100%) rename {comm => src/kernels}/lcal/src/lcal_wrap.cpp (100%) rename {comm => src/kernels}/lcal/src/lccl.cpp (100%) rename {comm => src/kernels}/lcal/src/lcoc.cpp (100%) rename {comm => src/kernels}/lcal/src/lcoc_func.cpp (100%) rename {comm => src/kernels}/lcal/src/profiling/report_timing.h (100%) rename {comm => src/kernels}/lcal/src/tiling/allgather_reducescatter_tiling.cpp (100%) rename {comm => src/kernels}/lcal/src/tiling/allgather_tiling.cpp (100%) rename {comm => src/kernels}/lcal/src/tiling/allgather_tiling_91093.cpp (100%) rename {comm => src/kernels}/lcal/src/tiling/allgather_tiling_910B.cpp (100%) rename {comm => src/kernels}/lcal/src/tiling/allgatherv2_tiling_91093.cpp (100%) rename {comm => 
src/kernels}/lcal/src/tiling/allgatherv2_tiling_910B.cpp (100%) rename {comm => src/kernels}/lcal/src/tiling/allreduce_tiling.cpp (100%) rename {comm => src/kernels}/lcal/src/tiling/allreduce_tiling_91093.cpp (100%) rename {comm => src/kernels}/lcal/src/tiling/allreduce_tiling_910B.cpp (100%) rename {comm => src/kernels}/lcal/src/tiling/alltoall_allgather_hidden_tiling.cpp (100%) rename {comm => src/kernels}/lcal/src/tiling/alltoall_allgather_tiling.cpp (100%) rename {comm => src/kernels}/lcal/src/tiling/reducescatter_alltoall_hidden_tiling.cpp (100%) rename {comm => src/kernels}/lcal/src/tiling/reducescatter_tiling.cpp (100%) rename {comm => src/kernels}/lcal/src/tiling/reducescatter_tiling_91093.cpp (100%) rename {comm => src/kernels}/lcal/src/tiling/reducescatter_tiling_910B.cpp (100%) rename {comm => src/kernels}/lcal/src/tiling/tiling.cpp (100%) rename {comm => src/kernels}/lcal/src/tiling/tiling_args.cpp (100%) rename {comm => src/kernels}/lcal/src/tiling/tiling_func.cpp (100%) rename {comm => src/kernels}/lcal/src/tools/socket/lcal_sock_exchange.cpp (100%) rename {comm => src/kernels}/lcal/src/tools/socket/lcal_sock_exchange.h (100%) rename src/{cinterface => ops_infer/fused_add_topk_div}/atb_acl_fused_add_topk_div.cpp (99%) rename src/{cinterface => ops_infer/mla_preprocess}/atb_acl_mla_preprocess.cpp (97%) rename src/{cinterface => ops_infer/multi_latent_attention}/atb_acl_mla.cpp (97%) rename src/{cinterface => ops_infer/paged_cache_load}/atb_acl_paged_cache_load.cpp (99%) rename src/{cinterface => ops_infer/ring_mla}/atb_acl_ring_mla.cpp (97%) rename src/{cinterface => ops_infer/self_attention}/atb_acl_self_attention_prefix_encoder.cpp (97%) diff --git a/CMakeLists.txt b/CMakeLists.txt index 60cc53e9..46e82ad4 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -74,6 +74,9 @@ else() set(cxx_abi 0) endif() +set(CMAKE_INSTALL_PREFIX "${CMAKE_SOURCE_DIR}/output/atb/cxx_abi_${cxx_abi}") +message(STATUS "CMAKE_INSTALL_PREFIX:${CMAKE_INSTALL_PREFIX}") + 
if(BUILD_PYBIND AND NOT USE_CXX11_ABI) add_subdirectory(${PROJECT_SOURCE_DIR}/3rdparty/pybind11) endif() @@ -83,9 +86,9 @@ include_directories( ${PROJECT_SOURCE_DIR}/include ${PROJECT_SOURCE_DIR}/src ${PROJECT_SOURCE_DIR}/src/kernels/include - ${PROJECT_SOURCE_DIR}/comm/lcal/include - ${PROJECT_SOURCE_DIR}/comm/lcal/include/lcoc - ${PROJECT_SOURCE_DIR}/comm/lcal/include/lcoc/tiling + ${PROJECT_SOURCE_DIR}/src/kernels/lcal/include + ${PROJECT_SOURCE_DIR}/src/kernels/lcal/include/lcoc + ${PROJECT_SOURCE_DIR}/src/kernels/lcal/include/lcoc/tiling ${PROJECT_SOURCE_DIR}/3rdparty/mki/include ${PROJECT_SOURCE_DIR}/3rdparty/nlohmannJson/include $ENV{ASCEND_HOME_PATH}/include/aclnn @@ -116,10 +119,6 @@ if (BUILD_CUSTOMIZE_OPS) add_subdirectory(ops_customize) endif() -set(CMAKE_INSTALL_PREFIX "${CMAKE_SOURCE_DIR}/output/atb/cxx_abi_${cxx_abi}") -add_subdirectory(comm/lcal) -message(STATUS "CMAKE_INSTALL_PREFIX:${CMAKE_INSTALL_PREFIX}") - install(FILES ${PROJECT_SOURCE_DIR}/scripts/set_env.sh DESTINATION ./..) 
install(DIRECTORY ${PROJECT_SOURCE_DIR}/ops_configs DESTINATION ./configs) install(FILES ${PROJECT_SOURCE_DIR}/3rdparty/mki/lib/libmki.so DESTINATION lib) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 208da858..389b643b 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -12,7 +12,6 @@ set(ops_train_directory ${CMAKE_CURRENT_LIST_DIR}/ops_train) set(ops_infer_directory ${CMAKE_CURRENT_LIST_DIR}/ops_infer) set(ops_common_directory ${CMAKE_CURRENT_LIST_DIR}/ops_common) set(atb_directory ${CMAKE_CURRENT_LIST_DIR}/atb) -set(c_interface_directory ${CMAKE_CURRENT_LIST_DIR}/cinterface) set(MSTX_PATH $ENV{ASCEND_HOME_PATH}/tools/mstx/include) set(ATB_INCLUDE_DIR $ENV{ASCEND_HOME_PATH}/include) @@ -22,7 +21,6 @@ file(GLOB_RECURSE INFER_OP_SOURCE "${ops_infer_directory}/*.cpp") file(GLOB_RECURSE TRAIN_OP_SOURCE "${ops_train_directory}/*.cpp") file(GLOB_RECURSE COMMON_OP_SOURCE "${ops_common_directory}/*.cpp") file(GLOB_RECURSE ATB_FRAMEWORK_SOURCE "${atb_directory}/*.cpp") -file(GLOB_RECURSE C_INTERFACE_SOURCE "${c_interface_directory}/*.cpp") add_subdirectory(kernels) diff --git a/src/cinterface/atb_acl_util.cpp b/src/atb/utils/atb_acl_util.cpp similarity index 100% rename from src/cinterface/atb_acl_util.cpp rename to src/atb/utils/atb_acl_util.cpp diff --git a/src/cinterface/atb_acl_util.h b/src/atb/utils/atb_acl_util.h similarity index 100% rename from src/cinterface/atb_acl_util.h rename to src/atb/utils/atb_acl_util.h diff --git a/src/kernels/CMakeLists.txt b/src/kernels/CMakeLists.txt index ce8bcf4a..57974f7e 100644 --- a/src/kernels/CMakeLists.txt +++ b/src/kernels/CMakeLists.txt @@ -30,6 +30,7 @@ include_directories( add_subdirectory(mixkernels) add_subdirectory(kernels) +add_subdirectory(lcal) if (BUILD_TBE_ADAPTER) add_subdirectory(tbe_adapter) endif() \ No newline at end of file diff --git a/comm/lcal/CMakeLists.txt b/src/kernels/lcal/CMakeLists.txt similarity index 98% rename from comm/lcal/CMakeLists.txt rename to 
src/kernels/lcal/CMakeLists.txt index a5e63434..6f874baf 100644 --- a/comm/lcal/CMakeLists.txt +++ b/src/kernels/lcal/CMakeLists.txt @@ -12,6 +12,7 @@ project(Lcal LANGUAGES CXX) set(CMAKE_CXX_STANDARD 14) list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake") +add_compile_options(-Wno-float-equal) option(USE_CXX11_ABI "USE_CXX11_ABI" 0) IF (CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "aarch64") diff --git a/comm/lcal/cmake/CMakeCCECompiler.cmake.in b/src/kernels/lcal/cmake/CMakeCCECompiler.cmake.in similarity index 100% rename from comm/lcal/cmake/CMakeCCECompiler.cmake.in rename to src/kernels/lcal/cmake/CMakeCCECompiler.cmake.in diff --git a/comm/lcal/cmake/CMakeCCEInformation.cmake b/src/kernels/lcal/cmake/CMakeCCEInformation.cmake similarity index 100% rename from comm/lcal/cmake/CMakeCCEInformation.cmake rename to src/kernels/lcal/cmake/CMakeCCEInformation.cmake diff --git a/comm/lcal/cmake/CMakeDetermineCCECompiler.cmake b/src/kernels/lcal/cmake/CMakeDetermineCCECompiler.cmake similarity index 100% rename from comm/lcal/cmake/CMakeDetermineCCECompiler.cmake rename to src/kernels/lcal/cmake/CMakeDetermineCCECompiler.cmake diff --git a/comm/lcal/cmake/CMakeTestCCECompiler.cmake b/src/kernels/lcal/cmake/CMakeTestCCECompiler.cmake similarity index 100% rename from comm/lcal/cmake/CMakeTestCCECompiler.cmake rename to src/kernels/lcal/cmake/CMakeTestCCECompiler.cmake diff --git a/comm/lcal/include/comm_args.h b/src/kernels/lcal/include/comm_args.h similarity index 100% rename from comm/lcal/include/comm_args.h rename to src/kernels/lcal/include/comm_args.h diff --git a/comm/lcal/include/lcal.h b/src/kernels/lcal/include/lcal.h similarity index 100% rename from comm/lcal/include/lcal.h rename to src/kernels/lcal/include/lcal.h diff --git a/comm/lcal/include/lcal_api.h b/src/kernels/lcal/include/lcal_api.h similarity index 100% rename from comm/lcal/include/lcal_api.h rename to src/kernels/lcal/include/lcal_api.h diff --git a/comm/lcal/include/lcal_comm.h 
b/src/kernels/lcal/include/lcal_comm.h similarity index 100% rename from comm/lcal/include/lcal_comm.h rename to src/kernels/lcal/include/lcal_comm.h diff --git a/comm/lcal/include/lcal_types.h b/src/kernels/lcal/include/lcal_types.h similarity index 100% rename from comm/lcal/include/lcal_types.h rename to src/kernels/lcal/include/lcal_types.h diff --git a/comm/lcal/include/lccl.h b/src/kernels/lcal/include/lccl.h similarity index 100% rename from comm/lcal/include/lccl.h rename to src/kernels/lcal/include/lccl.h diff --git a/comm/lcal/include/lcoc/lcoc.h b/src/kernels/lcal/include/lcoc/lcoc.h similarity index 100% rename from comm/lcal/include/lcoc/lcoc.h rename to src/kernels/lcal/include/lcoc/lcoc.h diff --git a/comm/lcal/include/lcoc/lcoc_args.h b/src/kernels/lcal/include/lcoc/lcoc_args.h similarity index 100% rename from comm/lcal/include/lcoc/lcoc_args.h rename to src/kernels/lcal/include/lcoc/lcoc_args.h diff --git a/comm/lcal/include/lcoc/lcoc_base.h b/src/kernels/lcal/include/lcoc/lcoc_base.h similarity index 100% rename from comm/lcal/include/lcoc/lcoc_base.h rename to src/kernels/lcal/include/lcoc/lcoc_base.h diff --git a/comm/lcal/include/lcoc/lcoc_func.h b/src/kernels/lcal/include/lcoc/lcoc_func.h similarity index 100% rename from comm/lcal/include/lcoc/lcoc_func.h rename to src/kernels/lcal/include/lcoc/lcoc_func.h diff --git a/comm/lcal/include/lcoc/lcoc_workspace.h b/src/kernels/lcal/include/lcoc/lcoc_workspace.h similarity index 100% rename from comm/lcal/include/lcoc/lcoc_workspace.h rename to src/kernels/lcal/include/lcoc/lcoc_workspace.h diff --git a/comm/lcal/include/lcoc/tiling/tiling.h b/src/kernels/lcal/include/lcoc/tiling/tiling.h similarity index 100% rename from comm/lcal/include/lcoc/tiling/tiling.h rename to src/kernels/lcal/include/lcoc/tiling/tiling.h diff --git a/comm/lcal/include/lcoc/tiling/tiling_91093.h b/src/kernels/lcal/include/lcoc/tiling/tiling_91093.h similarity index 100% rename from 
comm/lcal/include/lcoc/tiling/tiling_91093.h rename to src/kernels/lcal/include/lcoc/tiling/tiling_91093.h diff --git a/comm/lcal/include/lcoc/tiling/tiling_910B.h b/src/kernels/lcal/include/lcoc/tiling/tiling_910B.h similarity index 100% rename from comm/lcal/include/lcoc/tiling/tiling_910B.h rename to src/kernels/lcal/include/lcoc/tiling/tiling_910B.h diff --git a/comm/lcal/include/lcoc/tiling/tiling_args.h b/src/kernels/lcal/include/lcoc/tiling/tiling_args.h similarity index 100% rename from comm/lcal/include/lcoc/tiling/tiling_args.h rename to src/kernels/lcal/include/lcoc/tiling/tiling_args.h diff --git a/comm/lcal/include/lcoc/tiling/tiling_func.h b/src/kernels/lcal/include/lcoc/tiling/tiling_func.h similarity index 100% rename from comm/lcal/include/lcoc/tiling/tiling_func.h rename to src/kernels/lcal/include/lcoc/tiling/tiling_func.h diff --git a/comm/lcal/src/CMakeLists.txt b/src/kernels/lcal/src/CMakeLists.txt similarity index 100% rename from comm/lcal/src/CMakeLists.txt rename to src/kernels/lcal/src/CMakeLists.txt diff --git a/comm/lcal/src/ascendc.cmake b/src/kernels/lcal/src/ascendc.cmake similarity index 100% rename from comm/lcal/src/ascendc.cmake rename to src/kernels/lcal/src/ascendc.cmake diff --git a/comm/lcal/src/ascendc_kernels/91093/all2all_hierarchy.h b/src/kernels/lcal/src/ascendc_kernels/91093/all2all_hierarchy.h similarity index 100% rename from comm/lcal/src/ascendc_kernels/91093/all2all_hierarchy.h rename to src/kernels/lcal/src/ascendc_kernels/91093/all2all_hierarchy.h diff --git a/comm/lcal/src/ascendc_kernels/91093/all2all_hierarchy_small.h b/src/kernels/lcal/src/ascendc_kernels/91093/all2all_hierarchy_small.h similarity index 100% rename from comm/lcal/src/ascendc_kernels/91093/all2all_hierarchy_small.h rename to src/kernels/lcal/src/ascendc_kernels/91093/all2all_hierarchy_small.h diff --git a/comm/lcal/src/ascendc_kernels/91093/allgather_hierarchy_double_ring.h 
b/src/kernels/lcal/src/ascendc_kernels/91093/allgather_hierarchy_double_ring.h similarity index 100% rename from comm/lcal/src/ascendc_kernels/91093/allgather_hierarchy_double_ring.h rename to src/kernels/lcal/src/ascendc_kernels/91093/allgather_hierarchy_double_ring.h diff --git a/comm/lcal/src/ascendc_kernels/91093/allreduce_big_data_sio.h b/src/kernels/lcal/src/ascendc_kernels/91093/allreduce_big_data_sio.h similarity index 100% rename from comm/lcal/src/ascendc_kernels/91093/allreduce_big_data_sio.h rename to src/kernels/lcal/src/ascendc_kernels/91093/allreduce_big_data_sio.h diff --git a/comm/lcal/src/ascendc_kernels/91093/allreduce_hierarchy_double_ring.h b/src/kernels/lcal/src/ascendc_kernels/91093/allreduce_hierarchy_double_ring.h similarity index 100% rename from comm/lcal/src/ascendc_kernels/91093/allreduce_hierarchy_double_ring.h rename to src/kernels/lcal/src/ascendc_kernels/91093/allreduce_hierarchy_double_ring.h diff --git a/comm/lcal/src/ascendc_kernels/91093/reduce_scatter_big_data_91093_4step.h b/src/kernels/lcal/src/ascendc_kernels/91093/reduce_scatter_big_data_91093_4step.h similarity index 100% rename from comm/lcal/src/ascendc_kernels/91093/reduce_scatter_big_data_91093_4step.h rename to src/kernels/lcal/src/ascendc_kernels/91093/reduce_scatter_big_data_91093_4step.h diff --git a/comm/lcal/src/ascendc_kernels/91093/reduce_scatter_hierarchy_double_ring.h b/src/kernels/lcal/src/ascendc_kernels/91093/reduce_scatter_hierarchy_double_ring.h similarity index 100% rename from comm/lcal/src/ascendc_kernels/91093/reduce_scatter_hierarchy_double_ring.h rename to src/kernels/lcal/src/ascendc_kernels/91093/reduce_scatter_hierarchy_double_ring.h diff --git a/comm/lcal/src/ascendc_kernels/CMakeLists.txt b/src/kernels/lcal/src/ascendc_kernels/CMakeLists.txt similarity index 100% rename from comm/lcal/src/ascendc_kernels/CMakeLists.txt rename to src/kernels/lcal/src/ascendc_kernels/CMakeLists.txt diff --git a/comm/lcal/src/ascendc_kernels/allgather.h 
b/src/kernels/lcal/src/ascendc_kernels/allgather.h similarity index 100% rename from comm/lcal/src/ascendc_kernels/allgather.h rename to src/kernels/lcal/src/ascendc_kernels/allgather.h diff --git a/comm/lcal/src/ascendc_kernels/allreduce_big_data.h b/src/kernels/lcal/src/ascendc_kernels/allreduce_big_data.h similarity index 100% rename from comm/lcal/src/ascendc_kernels/allreduce_big_data.h rename to src/kernels/lcal/src/ascendc_kernels/allreduce_big_data.h diff --git a/comm/lcal/src/ascendc_kernels/allreduce_one_shot.h b/src/kernels/lcal/src/ascendc_kernels/allreduce_one_shot.h similarity index 100% rename from comm/lcal/src/ascendc_kernels/allreduce_one_shot.h rename to src/kernels/lcal/src/ascendc_kernels/allreduce_one_shot.h diff --git a/comm/lcal/src/ascendc_kernels/allreduce_quant.h b/src/kernels/lcal/src/ascendc_kernels/allreduce_quant.h similarity index 100% rename from comm/lcal/src/ascendc_kernels/allreduce_quant.h rename to src/kernels/lcal/src/ascendc_kernels/allreduce_quant.h diff --git a/comm/lcal/src/ascendc_kernels/allreduce_two_shot.h b/src/kernels/lcal/src/ascendc_kernels/allreduce_two_shot.h similarity index 100% rename from comm/lcal/src/ascendc_kernels/allreduce_two_shot.h rename to src/kernels/lcal/src/ascendc_kernels/allreduce_two_shot.h diff --git a/comm/lcal/src/ascendc_kernels/collectives.h b/src/kernels/lcal/src/ascendc_kernels/collectives.h similarity index 100% rename from comm/lcal/src/ascendc_kernels/collectives.h rename to src/kernels/lcal/src/ascendc_kernels/collectives.h diff --git a/comm/lcal/src/ascendc_kernels/datacopy_gm2gm.h b/src/kernels/lcal/src/ascendc_kernels/datacopy_gm2gm.h similarity index 100% rename from comm/lcal/src/ascendc_kernels/datacopy_gm2gm.h rename to src/kernels/lcal/src/ascendc_kernels/datacopy_gm2gm.h diff --git a/comm/lcal/src/ascendc_kernels/datacopy_gm2gm_delay.h b/src/kernels/lcal/src/ascendc_kernels/datacopy_gm2gm_delay.h similarity index 100% rename from 
comm/lcal/src/ascendc_kernels/datacopy_gm2gm_delay.h rename to src/kernels/lcal/src/ascendc_kernels/datacopy_gm2gm_delay.h diff --git a/comm/lcal/src/ascendc_kernels/ipc_queue.h b/src/kernels/lcal/src/ascendc_kernels/ipc_queue.h similarity index 100% rename from comm/lcal/src/ascendc_kernels/ipc_queue.h rename to src/kernels/lcal/src/ascendc_kernels/ipc_queue.h diff --git a/comm/lcal/src/ascendc_kernels/lccl_op.h b/src/kernels/lcal/src/ascendc_kernels/lccl_op.h similarity index 100% rename from comm/lcal/src/ascendc_kernels/lccl_op.h rename to src/kernels/lcal/src/ascendc_kernels/lccl_op.h diff --git a/comm/lcal/src/ascendc_kernels/lccl_op1.cpp b/src/kernels/lcal/src/ascendc_kernels/lccl_op1.cpp similarity index 100% rename from comm/lcal/src/ascendc_kernels/lccl_op1.cpp rename to src/kernels/lcal/src/ascendc_kernels/lccl_op1.cpp diff --git a/comm/lcal/src/ascendc_kernels/lccl_op2.cpp b/src/kernels/lcal/src/ascendc_kernels/lccl_op2.cpp similarity index 100% rename from comm/lcal/src/ascendc_kernels/lccl_op2.cpp rename to src/kernels/lcal/src/ascendc_kernels/lccl_op2.cpp diff --git a/comm/lcal/src/ascendc_kernels/op_def.h b/src/kernels/lcal/src/ascendc_kernels/op_def.h similarity index 100% rename from comm/lcal/src/ascendc_kernels/op_def.h rename to src/kernels/lcal/src/ascendc_kernels/op_def.h diff --git a/comm/lcal/src/ascendc_kernels/reduce_scatter.h b/src/kernels/lcal/src/ascendc_kernels/reduce_scatter.h similarity index 100% rename from comm/lcal/src/ascendc_kernels/reduce_scatter.h rename to src/kernels/lcal/src/ascendc_kernels/reduce_scatter.h diff --git a/comm/lcal/src/ascendc_kernels/sync_collectives.h b/src/kernels/lcal/src/ascendc_kernels/sync_collectives.h similarity index 100% rename from comm/lcal/src/ascendc_kernels/sync_collectives.h rename to src/kernels/lcal/src/ascendc_kernels/sync_collectives.h diff --git a/comm/lcal/src/ccl_kernel_args.h b/src/kernels/lcal/src/ccl_kernel_args.h similarity index 100% rename from comm/lcal/src/ccl_kernel_args.h 
rename to src/kernels/lcal/src/ccl_kernel_args.h diff --git a/comm/lcal/src/coc_kernel_args.cpp b/src/kernels/lcal/src/coc_kernel_args.cpp similarity index 100% rename from comm/lcal/src/coc_kernel_args.cpp rename to src/kernels/lcal/src/coc_kernel_args.cpp diff --git a/comm/lcal/src/coc_kernel_args.h b/src/kernels/lcal/src/coc_kernel_args.h similarity index 100% rename from comm/lcal/src/coc_kernel_args.h rename to src/kernels/lcal/src/coc_kernel_args.h diff --git a/comm/lcal/src/kernels/CMakeLists.txt b/src/kernels/lcal/src/kernels/CMakeLists.txt similarity index 100% rename from comm/lcal/src/kernels/CMakeLists.txt rename to src/kernels/lcal/src/kernels/CMakeLists.txt diff --git a/comm/lcal/src/kernels/coc_add_bias_runner.cce b/src/kernels/lcal/src/kernels/coc_add_bias_runner.cce similarity index 100% rename from comm/lcal/src/kernels/coc_add_bias_runner.cce rename to src/kernels/lcal/src/kernels/coc_add_bias_runner.cce diff --git a/comm/lcal/src/kernels/coc_allgather.cce b/src/kernels/lcal/src/kernels/coc_allgather.cce similarity index 100% rename from comm/lcal/src/kernels/coc_allgather.cce rename to src/kernels/lcal/src/kernels/coc_allgather.cce diff --git a/comm/lcal/src/kernels/coc_allgather_matmul.cce b/src/kernels/lcal/src/kernels/coc_allgather_matmul.cce similarity index 100% rename from comm/lcal/src/kernels/coc_allgather_matmul.cce rename to src/kernels/lcal/src/kernels/coc_allgather_matmul.cce diff --git a/comm/lcal/src/kernels/coc_allgather_matmul_reduce_scatter.cce b/src/kernels/lcal/src/kernels/coc_allgather_matmul_reduce_scatter.cce similarity index 100% rename from comm/lcal/src/kernels/coc_allgather_matmul_reduce_scatter.cce rename to src/kernels/lcal/src/kernels/coc_allgather_matmul_reduce_scatter.cce diff --git a/comm/lcal/src/kernels/coc_allgather_reducescatter.cce b/src/kernels/lcal/src/kernels/coc_allgather_reducescatter.cce similarity index 100% rename from comm/lcal/src/kernels/coc_allgather_reducescatter.cce rename to 
src/kernels/lcal/src/kernels/coc_allgather_reducescatter.cce diff --git a/comm/lcal/src/kernels/coc_allgather_v2.cce b/src/kernels/lcal/src/kernels/coc_allgather_v2.cce similarity index 100% rename from comm/lcal/src/kernels/coc_allgather_v2.cce rename to src/kernels/lcal/src/kernels/coc_allgather_v2.cce diff --git a/comm/lcal/src/kernels/coc_allreduce.cce b/src/kernels/lcal/src/kernels/coc_allreduce.cce similarity index 100% rename from comm/lcal/src/kernels/coc_allreduce.cce rename to src/kernels/lcal/src/kernels/coc_allreduce.cce diff --git a/comm/lcal/src/kernels/coc_alltoall_allgather_hidden.cce b/src/kernels/lcal/src/kernels/coc_alltoall_allgather_hidden.cce similarity index 100% rename from comm/lcal/src/kernels/coc_alltoall_allgather_hidden.cce rename to src/kernels/lcal/src/kernels/coc_alltoall_allgather_hidden.cce diff --git a/comm/lcal/src/kernels/coc_alltoall_reduce_scatter_hidden.cce b/src/kernels/lcal/src/kernels/coc_alltoall_reduce_scatter_hidden.cce similarity index 100% rename from comm/lcal/src/kernels/coc_alltoall_reduce_scatter_hidden.cce rename to src/kernels/lcal/src/kernels/coc_alltoall_reduce_scatter_hidden.cce diff --git a/comm/lcal/src/kernels/coc_alltoallv_allgather.cce b/src/kernels/lcal/src/kernels/coc_alltoallv_allgather.cce similarity index 100% rename from comm/lcal/src/kernels/coc_alltoallv_allgather.cce rename to src/kernels/lcal/src/kernels/coc_alltoallv_allgather.cce diff --git a/comm/lcal/src/kernels/coc_alltoallv_allgather_matmul.cce b/src/kernels/lcal/src/kernels/coc_alltoallv_allgather_matmul.cce similarity index 100% rename from comm/lcal/src/kernels/coc_alltoallv_allgather_matmul.cce rename to src/kernels/lcal/src/kernels/coc_alltoallv_allgather_matmul.cce diff --git a/comm/lcal/src/kernels/coc_comm_base.cce b/src/kernels/lcal/src/kernels/coc_comm_base.cce similarity index 100% rename from comm/lcal/src/kernels/coc_comm_base.cce rename to src/kernels/lcal/src/kernels/coc_comm_base.cce diff --git 
a/comm/lcal/src/kernels/coc_const_args.cce b/src/kernels/lcal/src/kernels/coc_const_args.cce similarity index 100% rename from comm/lcal/src/kernels/coc_const_args.cce rename to src/kernels/lcal/src/kernels/coc_const_args.cce diff --git a/comm/lcal/src/kernels/coc_dequant_runner.cce b/src/kernels/lcal/src/kernels/coc_dequant_runner.cce similarity index 100% rename from comm/lcal/src/kernels/coc_dequant_runner.cce rename to src/kernels/lcal/src/kernels/coc_dequant_runner.cce diff --git a/comm/lcal/src/kernels/coc_internal.cce b/src/kernels/lcal/src/kernels/coc_internal.cce similarity index 100% rename from comm/lcal/src/kernels/coc_internal.cce rename to src/kernels/lcal/src/kernels/coc_internal.cce diff --git a/comm/lcal/src/kernels/coc_matmul_allreduce.cce b/src/kernels/lcal/src/kernels/coc_matmul_allreduce.cce similarity index 100% rename from comm/lcal/src/kernels/coc_matmul_allreduce.cce rename to src/kernels/lcal/src/kernels/coc_matmul_allreduce.cce diff --git a/comm/lcal/src/kernels/coc_matmul_reduce_scatter.cce b/src/kernels/lcal/src/kernels/coc_matmul_reduce_scatter.cce similarity index 100% rename from comm/lcal/src/kernels/coc_matmul_reduce_scatter.cce rename to src/kernels/lcal/src/kernels/coc_matmul_reduce_scatter.cce diff --git a/comm/lcal/src/kernels/coc_matmul_reduce_scatter_alltoallv.cce b/src/kernels/lcal/src/kernels/coc_matmul_reduce_scatter_alltoallv.cce similarity index 100% rename from comm/lcal/src/kernels/coc_matmul_reduce_scatter_alltoallv.cce rename to src/kernels/lcal/src/kernels/coc_matmul_reduce_scatter_alltoallv.cce diff --git a/comm/lcal/src/kernels/coc_matmulmoe.cce b/src/kernels/lcal/src/kernels/coc_matmulmoe.cce similarity index 100% rename from comm/lcal/src/kernels/coc_matmulmoe.cce rename to src/kernels/lcal/src/kernels/coc_matmulmoe.cce diff --git a/comm/lcal/src/kernels/coc_postprocessor.cce b/src/kernels/lcal/src/kernels/coc_postprocessor.cce similarity index 100% rename from comm/lcal/src/kernels/coc_postprocessor.cce rename 
to src/kernels/lcal/src/kernels/coc_postprocessor.cce diff --git a/comm/lcal/src/kernels/coc_ppmatmul.cce b/src/kernels/lcal/src/kernels/coc_ppmatmul.cce similarity index 100% rename from comm/lcal/src/kernels/coc_ppmatmul.cce rename to src/kernels/lcal/src/kernels/coc_ppmatmul.cce diff --git a/comm/lcal/src/kernels/coc_ppmatmul_switch.cce b/src/kernels/lcal/src/kernels/coc_ppmatmul_switch.cce similarity index 100% rename from comm/lcal/src/kernels/coc_ppmatmul_switch.cce rename to src/kernels/lcal/src/kernels/coc_ppmatmul_switch.cce diff --git a/comm/lcal/src/kernels/coc_preprocessor.cce b/src/kernels/lcal/src/kernels/coc_preprocessor.cce similarity index 100% rename from comm/lcal/src/kernels/coc_preprocessor.cce rename to src/kernels/lcal/src/kernels/coc_preprocessor.cce diff --git a/comm/lcal/src/kernels/coc_pure_matmul.cce b/src/kernels/lcal/src/kernels/coc_pure_matmul.cce similarity index 100% rename from comm/lcal/src/kernels/coc_pure_matmul.cce rename to src/kernels/lcal/src/kernels/coc_pure_matmul.cce diff --git a/comm/lcal/src/kernels/coc_reduce_scatter.cce b/src/kernels/lcal/src/kernels/coc_reduce_scatter.cce similarity index 100% rename from comm/lcal/src/kernels/coc_reduce_scatter.cce rename to src/kernels/lcal/src/kernels/coc_reduce_scatter.cce diff --git a/comm/lcal/src/kernels/collectives.cce b/src/kernels/lcal/src/kernels/collectives.cce similarity index 100% rename from comm/lcal/src/kernels/collectives.cce rename to src/kernels/lcal/src/kernels/collectives.cce diff --git a/comm/lcal/src/kernels/lcal_all2all_transpose.cce b/src/kernels/lcal/src/kernels/lcal_all2all_transpose.cce similarity index 100% rename from comm/lcal/src/kernels/lcal_all2all_transpose.cce rename to src/kernels/lcal/src/kernels/lcal_all2all_transpose.cce diff --git a/comm/lcal/src/kernels/lcal_allgather.cce b/src/kernels/lcal/src/kernels/lcal_allgather.cce similarity index 100% rename from comm/lcal/src/kernels/lcal_allgather.cce rename to 
src/kernels/lcal/src/kernels/lcal_allgather.cce diff --git a/comm/lcal/src/kernels/lcal_allgather_2npu.cce b/src/kernels/lcal/src/kernels/lcal_allgather_2npu.cce similarity index 100% rename from comm/lcal/src/kernels/lcal_allgather_2npu.cce rename to src/kernels/lcal/src/kernels/lcal_allgather_2npu.cce diff --git a/comm/lcal/src/kernels/lcal_allgather_2npu_big_data_write.cce b/src/kernels/lcal/src/kernels/lcal_allgather_2npu_big_data_write.cce similarity index 100% rename from comm/lcal/src/kernels/lcal_allgather_2npu_big_data_write.cce rename to src/kernels/lcal/src/kernels/lcal_allgather_2npu_big_data_write.cce diff --git a/comm/lcal/src/kernels/lcal_allgather_910B2C.cce b/src/kernels/lcal/src/kernels/lcal_allgather_910B2C.cce similarity index 100% rename from comm/lcal/src/kernels/lcal_allgather_910B2C.cce rename to src/kernels/lcal/src/kernels/lcal_allgather_910B2C.cce diff --git a/comm/lcal/src/kernels/lcal_allgather_big_data.cce b/src/kernels/lcal/src/kernels/lcal_allgather_big_data.cce similarity index 100% rename from comm/lcal/src/kernels/lcal_allgather_big_data.cce rename to src/kernels/lcal/src/kernels/lcal_allgather_big_data.cce diff --git a/comm/lcal/src/kernels/lcal_allgather_big_data_910B2C.cce b/src/kernels/lcal/src/kernels/lcal_allgather_big_data_910B2C.cce similarity index 100% rename from comm/lcal/src/kernels/lcal_allgather_big_data_910B2C.cce rename to src/kernels/lcal/src/kernels/lcal_allgather_big_data_910B2C.cce diff --git a/comm/lcal/src/kernels/lcal_allreduce_2npu_big_write.cce b/src/kernels/lcal/src/kernels/lcal_allreduce_2npu_big_write.cce similarity index 100% rename from comm/lcal/src/kernels/lcal_allreduce_2npu_big_write.cce rename to src/kernels/lcal/src/kernels/lcal_allreduce_2npu_big_write.cce diff --git a/comm/lcal/src/kernels/lcal_allreduce_2npu_read.cce b/src/kernels/lcal/src/kernels/lcal_allreduce_2npu_read.cce similarity index 100% rename from comm/lcal/src/kernels/lcal_allreduce_2npu_read.cce rename to 
src/kernels/lcal/src/kernels/lcal_allreduce_2npu_read.cce diff --git a/comm/lcal/src/kernels/lcal_allreduce_2npu_write.cce b/src/kernels/lcal/src/kernels/lcal_allreduce_2npu_write.cce similarity index 100% rename from comm/lcal/src/kernels/lcal_allreduce_2npu_write.cce rename to src/kernels/lcal/src/kernels/lcal_allreduce_2npu_write.cce diff --git a/comm/lcal/src/kernels/lcal_allreduce_big_data.cce b/src/kernels/lcal/src/kernels/lcal_allreduce_big_data.cce similarity index 100% rename from comm/lcal/src/kernels/lcal_allreduce_big_data.cce rename to src/kernels/lcal/src/kernels/lcal_allreduce_big_data.cce diff --git a/comm/lcal/src/kernels/lcal_allreduce_big_data_910B2C.cce b/src/kernels/lcal/src/kernels/lcal_allreduce_big_data_910B2C.cce similarity index 100% rename from comm/lcal/src/kernels/lcal_allreduce_big_data_910B2C.cce rename to src/kernels/lcal/src/kernels/lcal_allreduce_big_data_910B2C.cce diff --git a/comm/lcal/src/kernels/lcal_allreduce_deterministic.cce b/src/kernels/lcal/src/kernels/lcal_allreduce_deterministic.cce similarity index 100% rename from comm/lcal/src/kernels/lcal_allreduce_deterministic.cce rename to src/kernels/lcal/src/kernels/lcal_allreduce_deterministic.cce diff --git a/comm/lcal/src/kernels/lcal_allreduce_deterministic_big_data.cce b/src/kernels/lcal/src/kernels/lcal_allreduce_deterministic_big_data.cce similarity index 100% rename from comm/lcal/src/kernels/lcal_allreduce_deterministic_big_data.cce rename to src/kernels/lcal/src/kernels/lcal_allreduce_deterministic_big_data.cce diff --git a/comm/lcal/src/kernels/lcal_allreduce_two_shot.cce b/src/kernels/lcal/src/kernels/lcal_allreduce_two_shot.cce similarity index 100% rename from comm/lcal/src/kernels/lcal_allreduce_two_shot.cce rename to src/kernels/lcal/src/kernels/lcal_allreduce_two_shot.cce diff --git a/comm/lcal/src/kernels/lcal_allreduce_two_shot_910B2C.cce b/src/kernels/lcal/src/kernels/lcal_allreduce_two_shot_910B2C.cce similarity index 100% rename from 
comm/lcal/src/kernels/lcal_allreduce_two_shot_910B2C.cce rename to src/kernels/lcal/src/kernels/lcal_allreduce_two_shot_910B2C.cce diff --git a/comm/lcal/src/kernels/lcal_broadcast_big_data.cce b/src/kernels/lcal/src/kernels/lcal_broadcast_big_data.cce similarity index 100% rename from comm/lcal/src/kernels/lcal_broadcast_big_data.cce rename to src/kernels/lcal/src/kernels/lcal_broadcast_big_data.cce diff --git a/comm/lcal/src/kernels/lcal_broadcast_write.cce b/src/kernels/lcal/src/kernels/lcal_broadcast_write.cce similarity index 100% rename from comm/lcal/src/kernels/lcal_broadcast_write.cce rename to src/kernels/lcal/src/kernels/lcal_broadcast_write.cce diff --git a/comm/lcal/src/kernels/lcal_reduce_scatter.cce b/src/kernels/lcal/src/kernels/lcal_reduce_scatter.cce similarity index 100% rename from comm/lcal/src/kernels/lcal_reduce_scatter.cce rename to src/kernels/lcal/src/kernels/lcal_reduce_scatter.cce diff --git a/comm/lcal/src/kernels/lcal_reduce_scatter_big_data.cce b/src/kernels/lcal/src/kernels/lcal_reduce_scatter_big_data.cce similarity index 100% rename from comm/lcal/src/kernels/lcal_reduce_scatter_big_data.cce rename to src/kernels/lcal/src/kernels/lcal_reduce_scatter_big_data.cce diff --git a/comm/lcal/src/kernels/lcal_reduce_scatter_big_data_write.cce b/src/kernels/lcal/src/kernels/lcal_reduce_scatter_big_data_write.cce similarity index 100% rename from comm/lcal/src/kernels/lcal_reduce_scatter_big_data_write.cce rename to src/kernels/lcal/src/kernels/lcal_reduce_scatter_big_data_write.cce diff --git a/comm/lcal/src/kernels/lcal_reduce_scatter_write.cce b/src/kernels/lcal/src/kernels/lcal_reduce_scatter_write.cce similarity index 100% rename from comm/lcal/src/kernels/lcal_reduce_scatter_write.cce rename to src/kernels/lcal/src/kernels/lcal_reduce_scatter_write.cce diff --git a/comm/lcal/src/lcal_comm.cpp b/src/kernels/lcal/src/lcal_comm.cpp similarity index 100% rename from comm/lcal/src/lcal_comm.cpp rename to src/kernels/lcal/src/lcal_comm.cpp 
diff --git a/comm/lcal/src/lcal_internal.cpp b/src/kernels/lcal/src/lcal_internal.cpp similarity index 100% rename from comm/lcal/src/lcal_internal.cpp rename to src/kernels/lcal/src/lcal_internal.cpp diff --git a/comm/lcal/src/lcal_internal.h b/src/kernels/lcal/src/lcal_internal.h similarity index 100% rename from comm/lcal/src/lcal_internal.h rename to src/kernels/lcal/src/lcal_internal.h diff --git a/comm/lcal/src/lcal_wrap.cpp b/src/kernels/lcal/src/lcal_wrap.cpp similarity index 100% rename from comm/lcal/src/lcal_wrap.cpp rename to src/kernels/lcal/src/lcal_wrap.cpp diff --git a/comm/lcal/src/lccl.cpp b/src/kernels/lcal/src/lccl.cpp similarity index 100% rename from comm/lcal/src/lccl.cpp rename to src/kernels/lcal/src/lccl.cpp diff --git a/comm/lcal/src/lcoc.cpp b/src/kernels/lcal/src/lcoc.cpp similarity index 100% rename from comm/lcal/src/lcoc.cpp rename to src/kernels/lcal/src/lcoc.cpp diff --git a/comm/lcal/src/lcoc_func.cpp b/src/kernels/lcal/src/lcoc_func.cpp similarity index 100% rename from comm/lcal/src/lcoc_func.cpp rename to src/kernels/lcal/src/lcoc_func.cpp diff --git a/comm/lcal/src/profiling/report_timing.h b/src/kernels/lcal/src/profiling/report_timing.h similarity index 100% rename from comm/lcal/src/profiling/report_timing.h rename to src/kernels/lcal/src/profiling/report_timing.h diff --git a/comm/lcal/src/tiling/allgather_reducescatter_tiling.cpp b/src/kernels/lcal/src/tiling/allgather_reducescatter_tiling.cpp similarity index 100% rename from comm/lcal/src/tiling/allgather_reducescatter_tiling.cpp rename to src/kernels/lcal/src/tiling/allgather_reducescatter_tiling.cpp diff --git a/comm/lcal/src/tiling/allgather_tiling.cpp b/src/kernels/lcal/src/tiling/allgather_tiling.cpp similarity index 100% rename from comm/lcal/src/tiling/allgather_tiling.cpp rename to src/kernels/lcal/src/tiling/allgather_tiling.cpp diff --git a/comm/lcal/src/tiling/allgather_tiling_91093.cpp b/src/kernels/lcal/src/tiling/allgather_tiling_91093.cpp similarity index 
100% rename from comm/lcal/src/tiling/allgather_tiling_91093.cpp rename to src/kernels/lcal/src/tiling/allgather_tiling_91093.cpp diff --git a/comm/lcal/src/tiling/allgather_tiling_910B.cpp b/src/kernels/lcal/src/tiling/allgather_tiling_910B.cpp similarity index 100% rename from comm/lcal/src/tiling/allgather_tiling_910B.cpp rename to src/kernels/lcal/src/tiling/allgather_tiling_910B.cpp diff --git a/comm/lcal/src/tiling/allgatherv2_tiling_91093.cpp b/src/kernels/lcal/src/tiling/allgatherv2_tiling_91093.cpp similarity index 100% rename from comm/lcal/src/tiling/allgatherv2_tiling_91093.cpp rename to src/kernels/lcal/src/tiling/allgatherv2_tiling_91093.cpp diff --git a/comm/lcal/src/tiling/allgatherv2_tiling_910B.cpp b/src/kernels/lcal/src/tiling/allgatherv2_tiling_910B.cpp similarity index 100% rename from comm/lcal/src/tiling/allgatherv2_tiling_910B.cpp rename to src/kernels/lcal/src/tiling/allgatherv2_tiling_910B.cpp diff --git a/comm/lcal/src/tiling/allreduce_tiling.cpp b/src/kernels/lcal/src/tiling/allreduce_tiling.cpp similarity index 100% rename from comm/lcal/src/tiling/allreduce_tiling.cpp rename to src/kernels/lcal/src/tiling/allreduce_tiling.cpp diff --git a/comm/lcal/src/tiling/allreduce_tiling_91093.cpp b/src/kernels/lcal/src/tiling/allreduce_tiling_91093.cpp similarity index 100% rename from comm/lcal/src/tiling/allreduce_tiling_91093.cpp rename to src/kernels/lcal/src/tiling/allreduce_tiling_91093.cpp diff --git a/comm/lcal/src/tiling/allreduce_tiling_910B.cpp b/src/kernels/lcal/src/tiling/allreduce_tiling_910B.cpp similarity index 100% rename from comm/lcal/src/tiling/allreduce_tiling_910B.cpp rename to src/kernels/lcal/src/tiling/allreduce_tiling_910B.cpp diff --git a/comm/lcal/src/tiling/alltoall_allgather_hidden_tiling.cpp b/src/kernels/lcal/src/tiling/alltoall_allgather_hidden_tiling.cpp similarity index 100% rename from comm/lcal/src/tiling/alltoall_allgather_hidden_tiling.cpp rename to 
src/kernels/lcal/src/tiling/alltoall_allgather_hidden_tiling.cpp diff --git a/comm/lcal/src/tiling/alltoall_allgather_tiling.cpp b/src/kernels/lcal/src/tiling/alltoall_allgather_tiling.cpp similarity index 100% rename from comm/lcal/src/tiling/alltoall_allgather_tiling.cpp rename to src/kernels/lcal/src/tiling/alltoall_allgather_tiling.cpp diff --git a/comm/lcal/src/tiling/reducescatter_alltoall_hidden_tiling.cpp b/src/kernels/lcal/src/tiling/reducescatter_alltoall_hidden_tiling.cpp similarity index 100% rename from comm/lcal/src/tiling/reducescatter_alltoall_hidden_tiling.cpp rename to src/kernels/lcal/src/tiling/reducescatter_alltoall_hidden_tiling.cpp diff --git a/comm/lcal/src/tiling/reducescatter_tiling.cpp b/src/kernels/lcal/src/tiling/reducescatter_tiling.cpp similarity index 100% rename from comm/lcal/src/tiling/reducescatter_tiling.cpp rename to src/kernels/lcal/src/tiling/reducescatter_tiling.cpp diff --git a/comm/lcal/src/tiling/reducescatter_tiling_91093.cpp b/src/kernels/lcal/src/tiling/reducescatter_tiling_91093.cpp similarity index 100% rename from comm/lcal/src/tiling/reducescatter_tiling_91093.cpp rename to src/kernels/lcal/src/tiling/reducescatter_tiling_91093.cpp diff --git a/comm/lcal/src/tiling/reducescatter_tiling_910B.cpp b/src/kernels/lcal/src/tiling/reducescatter_tiling_910B.cpp similarity index 100% rename from comm/lcal/src/tiling/reducescatter_tiling_910B.cpp rename to src/kernels/lcal/src/tiling/reducescatter_tiling_910B.cpp diff --git a/comm/lcal/src/tiling/tiling.cpp b/src/kernels/lcal/src/tiling/tiling.cpp similarity index 100% rename from comm/lcal/src/tiling/tiling.cpp rename to src/kernels/lcal/src/tiling/tiling.cpp diff --git a/comm/lcal/src/tiling/tiling_args.cpp b/src/kernels/lcal/src/tiling/tiling_args.cpp similarity index 100% rename from comm/lcal/src/tiling/tiling_args.cpp rename to src/kernels/lcal/src/tiling/tiling_args.cpp diff --git a/comm/lcal/src/tiling/tiling_func.cpp b/src/kernels/lcal/src/tiling/tiling_func.cpp 
similarity index 100% rename from comm/lcal/src/tiling/tiling_func.cpp rename to src/kernels/lcal/src/tiling/tiling_func.cpp diff --git a/comm/lcal/src/tools/socket/lcal_sock_exchange.cpp b/src/kernels/lcal/src/tools/socket/lcal_sock_exchange.cpp similarity index 100% rename from comm/lcal/src/tools/socket/lcal_sock_exchange.cpp rename to src/kernels/lcal/src/tools/socket/lcal_sock_exchange.cpp diff --git a/comm/lcal/src/tools/socket/lcal_sock_exchange.h b/src/kernels/lcal/src/tools/socket/lcal_sock_exchange.h similarity index 100% rename from comm/lcal/src/tools/socket/lcal_sock_exchange.h rename to src/kernels/lcal/src/tools/socket/lcal_sock_exchange.h diff --git a/src/cinterface/atb_acl_fused_add_topk_div.cpp b/src/ops_infer/fused_add_topk_div/atb_acl_fused_add_topk_div.cpp similarity index 99% rename from src/cinterface/atb_acl_fused_add_topk_div.cpp rename to src/ops_infer/fused_add_topk_div/atb_acl_fused_add_topk_div.cpp index 65f84243..c55b1d01 100644 --- a/src/cinterface/atb_acl_fused_add_topk_div.cpp +++ b/src/ops_infer/fused_add_topk_div/atb_acl_fused_add_topk_div.cpp @@ -8,7 +8,7 @@ * See LICENSE in the root of the software repository for the full text of the License. */ #include "atb/atb_acl.h" -#include "atb_acl_util.h" +#include "atb/utils/atb_acl_util.h" #include "atb/operation/operation_base.h" #ifdef __cplusplus diff --git a/src/cinterface/atb_acl_mla_preprocess.cpp b/src/ops_infer/mla_preprocess/atb_acl_mla_preprocess.cpp similarity index 97% rename from src/cinterface/atb_acl_mla_preprocess.cpp rename to src/ops_infer/mla_preprocess/atb_acl_mla_preprocess.cpp index 32bd22c6..bac5171a 100644 --- a/src/cinterface/atb_acl_mla_preprocess.cpp +++ b/src/ops_infer/mla_preprocess/atb_acl_mla_preprocess.cpp @@ -1,179 +1,179 @@ -/* - * Copyright (c) 2025 Huawei Technologies Co., Ltd. - * This file is a part of the CANN Open Software. - * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). 
- * Please refer to the License for details. You may not use this file except in compliance with the License. - * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, - * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. - * See LICENSE in the root of the software repository for the full text of the License. - */ -#include "atb/atb_acl.h" -#include "atb_acl_util.h" -#include "atb/operation/operation_base.h" - -#ifdef __cplusplus -extern "C" { -#endif - -const size_t g_MLAPPINTENSORNUM = 24; -const size_t g_MLAPPOUTTENSORNUMCACHEMODE = 4; -const size_t g_MLAPPOUTTENSORNUM = 2; - -atb::Status AtbMLAPreprocessGetWorkspaceSize( - const aclTensor *input, const aclTensor *gamma0, const aclTensor *beta0, const aclTensor *quantScale0, - const aclTensor *quantOffset0, const aclTensor *wdqkv, const aclTensor *deScale0, const aclTensor *bias0, - const aclTensor *gamma1, const aclTensor *beta1, const aclTensor *quantScale1, const aclTensor *quantOffset1, - const aclTensor *wuq, const aclTensor *deScale1, const aclTensor *bias1, const aclTensor *gamma2, - const aclTensor *cos, const aclTensor *sin, const aclTensor *wuk, const aclTensor *kvCache, - const aclTensor *kvCacheRope, const aclTensor *slotmapping, const aclTensor *ctkvScale, const aclTensor *qNopeScale, - uint32_t wdqDim, uint32_t qRopeDim, uint32_t kRopeDim, float epsilon, uint32_t qRotaryCoeff, uint32_t kRotaryCoeff, - bool transposeWdq, bool transposeWuq, bool transposeWuk, uint8_t cacheMode, uint16_t quantMode, aclTensor *qOut0, - aclTensor *kvCacheOut0, aclTensor *qOut1, aclTensor *kvCacheOut1, uint64_t *workspaceSize, atb::Operation **op, - atb::Context *context) -{ - atb::infer::MlaPreprocessParam param; - param.wdqDim = wdqDim; - param.qRopeDim = qRopeDim; - param.kRopeDim = kRopeDim; - param.epsilon = epsilon; - param.qRotaryCoeff = static_cast(qRotaryCoeff); - param.kRotaryCoeff = static_cast(kRotaryCoeff); 
- param.transposeWdq = transposeWdq; - param.transposeWuq = transposeWuq; - param.transposeWuk = transposeWuk; - param.cacheMode = atb::infer::MlaPreprocessParam::CacheMode(cacheMode); - param.quantMode = atb::infer::MlaPreprocessParam::QuantMode(quantMode); - - if (op != nullptr && *op == nullptr) { - auto st = CreateOperation(param, op); - if (st != atb::NO_ERROR) { - ATB_LOG(ERROR) << "Create MLAPreprocess Operation failed!"; - return st; - } - } - atb::VariantPack pack; - size_t i = 0; - pack.inTensors.resize(g_MLAPPINTENSORNUM); - auto status = aclTensorToAtbTensor(input, &(pack.inTensors[i++])); - ATB_CHECK(status == atb::NO_ERROR, "input create failed!", return status); - status = aclTensorToAtbTensor(gamma0, &(pack.inTensors[i++])); - ATB_CHECK(status == atb::NO_ERROR, "gamma0 create failed!", return status); - status = aclTensorToAtbTensor(beta0, &(pack.inTensors[i++])); - ATB_CHECK(status == atb::NO_ERROR, "beta0 create failed!", return status); - if (param.quantMode == atb::infer::MlaPreprocessParam::QuantMode::PER_TENSOR_QUANT_ASYMM) { - status = aclTensorToAtbTensor(quantScale0, &(pack.inTensors[i++])); - ATB_CHECK(status == atb::NO_ERROR, "quantScale0 create failed!", return status); - status = aclTensorToAtbTensor(quantOffset0, &(pack.inTensors[i++])); - ATB_CHECK(status == atb::NO_ERROR, "quantOffset0 create failed!", return status); - } else { - status = aclTensorToAtbTensor(nullptr, &(pack.inTensors[i++])); - status = aclTensorToAtbTensor(nullptr, &(pack.inTensors[i++])); - } - status = aclTensorToAtbTensor(wdqkv, &(pack.inTensors[i++])); - ATB_CHECK(status == atb::NO_ERROR, "wdqkv create failed!", return status); - status = aclTensorToAtbTensor(deScale0, &(pack.inTensors[i++])); - ATB_CHECK(status == atb::NO_ERROR, "deScale0 create failed!", return status); - if (param.quantMode != atb::infer::MlaPreprocessParam::QuantMode::PER_TOKEN_QUANT_SYMM && - param.quantMode != atb::infer::MlaPreprocessParam::QuantMode::UNQUANT) { - status = 
aclTensorToAtbTensor(bias0, &(pack.inTensors[i++])); - ATB_CHECK(status == atb::NO_ERROR, "bias0 create failed!", return status); - } else { - status = aclTensorToAtbTensor(nullptr, &(pack.inTensors[i++])); - } - status = aclTensorToAtbTensor(gamma1, &(pack.inTensors[i++])); - ATB_CHECK(status == atb::NO_ERROR, "gamma1 create failed!", return status); - status = aclTensorToAtbTensor(beta1, &(pack.inTensors[i++])); - ATB_CHECK(status == atb::NO_ERROR, "beta1 create failed!", return status); - - if (param.quantMode == atb::infer::MlaPreprocessParam::QuantMode::PER_TENSOR_QUANT_ASYMM) { - status = aclTensorToAtbTensor(quantScale1, &(pack.inTensors[i++])); - ATB_CHECK(status == atb::NO_ERROR, "quantScale1 create failed!", return status); - status = aclTensorToAtbTensor(quantOffset1, &(pack.inTensors[i++])); - ATB_CHECK(status == atb::NO_ERROR, "quantOffset1 create failed!", return status); - } else { - status = aclTensorToAtbTensor(nullptr, &(pack.inTensors[i++])); - status = aclTensorToAtbTensor(nullptr, &(pack.inTensors[i++])); - } - status = aclTensorToAtbTensor(wuq, &(pack.inTensors[i++])); - ATB_CHECK(status == atb::NO_ERROR, "wuq create failed!", return status); - status = aclTensorToAtbTensor(deScale1, &(pack.inTensors[i++])); - ATB_CHECK(status == atb::NO_ERROR, "deScale1 create failed!", return status); - if (param.quantMode != atb::infer::MlaPreprocessParam::QuantMode::PER_TOKEN_QUANT_SYMM && - param.quantMode != atb::infer::MlaPreprocessParam::QuantMode::UNQUANT) { - status = aclTensorToAtbTensor(bias1, &(pack.inTensors[i++])); - ATB_CHECK(status == atb::NO_ERROR, "bias1 create failed!", return status); - } else { - status = aclTensorToAtbTensor(nullptr, &(pack.inTensors[i++])); - } - status = aclTensorToAtbTensor(gamma2, &(pack.inTensors[i++])); - ATB_CHECK(status == atb::NO_ERROR, "gamma2 create failed!", return status); - - status = aclTensorToAtbTensor(cos, &(pack.inTensors[i++])); - ATB_CHECK(status == atb::NO_ERROR, "cos create failed!", return 
status); - - status = aclTensorToAtbTensor(sin, &(pack.inTensors[i++])); - ATB_CHECK(status == atb::NO_ERROR, "sin create failed!", return status); - - status = aclTensorToAtbTensor(wuk, &(pack.inTensors[i++])); - ATB_CHECK(status == atb::NO_ERROR, "wuk create failed!", return status); - - status = aclTensorToAtbTensor(kvCache, &(pack.inTensors[i++])); - ATB_CHECK(status == atb::NO_ERROR, "kvCache create failed!", return status); - - if (param.cacheMode != atb::infer::MlaPreprocessParam::CacheMode::KVCACHE) { - status = aclTensorToAtbTensor(kvCacheRope, &(pack.inTensors[i++])); - ATB_CHECK(status == atb::NO_ERROR, "kvCacheRope create failed!", return status); - } else { - status = aclTensorToAtbTensor(nullptr, &(pack.inTensors[i++])); - } - status = aclTensorToAtbTensor(slotmapping, &(pack.inTensors[i++])); - if (param.cacheMode == atb::infer::MlaPreprocessParam::CacheMode::INT8_NZCACHE) { - status = aclTensorToAtbTensor(ctkvScale, &(pack.inTensors[i++])); - ATB_CHECK(status == atb::NO_ERROR, "ctkvScale create failed!", return status); - status = aclTensorToAtbTensor(qNopeScale, &(pack.inTensors[i++])); - ATB_CHECK(status == atb::NO_ERROR, "qNopeScale create failed!", return status); - } else { - status = aclTensorToAtbTensor(nullptr, &(pack.inTensors[i++])); - status = aclTensorToAtbTensor(nullptr, &(pack.inTensors[i++])); - } - - i = 0; - if (param.cacheMode != atb::infer::MlaPreprocessParam::CacheMode::KVCACHE) { - pack.outTensors.resize(g_MLAPPOUTTENSORNUMCACHEMODE); - status = aclTensorToAtbTensor(qOut0, &(pack.outTensors[i++])); - ATB_CHECK(status == atb::NO_ERROR, "qOut0 create failed!", return status); - status = aclTensorToAtbTensor(kvCacheOut0, &(pack.outTensors[i++])); - ATB_CHECK(status == atb::NO_ERROR, "kvCacheOut0 create failed!", return status); - status = aclTensorToAtbTensor(qOut1, &(pack.outTensors[i++])); - ATB_CHECK(status == atb::NO_ERROR, "qOut1 create failed!", return status); - status = aclTensorToAtbTensor(kvCacheOut1, 
&(pack.outTensors[i++])); - ATB_CHECK(status == atb::NO_ERROR, "kvCacheOut1 create failed!", return status); - } else { - pack.outTensors.resize(g_MLAPPOUTTENSORNUM); - status = aclTensorToAtbTensor(qOut0, &(pack.outTensors[i++])); - ATB_CHECK(status == atb::NO_ERROR, "qOut0 create failed!", return status); - status = aclTensorToAtbTensor(kvCacheOut0, &(pack.outTensors[i++])); - ATB_CHECK(status == atb::NO_ERROR, "kvCacheOut0 create failed!", return status); - } - if (op == nullptr || *op == nullptr) { - ATB_LOG(ERROR) << "AtbMLAPreprocessGetWorkspaceSize opeartion pointer is nullptr!"; - return atb::ERROR_INVALID_OPERATION_ADDR; - } - atb::Status st = (*op)->Setup(pack, *workspaceSize, context); - ATB_CHECK(st == atb::NO_ERROR, "AtbMLAPreprocess Setup failed!", return st); - return atb::NO_ERROR; -} - -atb::Status AtbMLAPreprocess(void *workspace, uint64_t workspaceSize, atb::Operation *op, atb::Context *context) -{ - ATB_CHECK(op != nullptr, "AtbMLAPreprocess expect op pointer not to be null!", - return atb::ERROR_INVALID_OPERATION_ADDR); - atb::VariantPack pack; - atb::Status st = op->Execute(pack, (uint8_t *)(workspace), workspaceSize, context); - ATB_CHECK(st == atb::NO_ERROR, "AtbMLAPreprocess Execute failed!", return st); - return st; -} - -#ifdef __cplusplus -} -#endif +/* + * Copyright (c) 2025 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. 
+ */ +#include "atb/atb_acl.h" +#include "atb/utils/atb_acl_util.h" +#include "atb/operation/operation_base.h" + +#ifdef __cplusplus +extern "C" { +#endif + +const size_t g_MLAPPINTENSORNUM = 24; +const size_t g_MLAPPOUTTENSORNUMCACHEMODE = 4; +const size_t g_MLAPPOUTTENSORNUM = 2; + +atb::Status AtbMLAPreprocessGetWorkspaceSize( + const aclTensor *input, const aclTensor *gamma0, const aclTensor *beta0, const aclTensor *quantScale0, + const aclTensor *quantOffset0, const aclTensor *wdqkv, const aclTensor *deScale0, const aclTensor *bias0, + const aclTensor *gamma1, const aclTensor *beta1, const aclTensor *quantScale1, const aclTensor *quantOffset1, + const aclTensor *wuq, const aclTensor *deScale1, const aclTensor *bias1, const aclTensor *gamma2, + const aclTensor *cos, const aclTensor *sin, const aclTensor *wuk, const aclTensor *kvCache, + const aclTensor *kvCacheRope, const aclTensor *slotmapping, const aclTensor *ctkvScale, const aclTensor *qNopeScale, + uint32_t wdqDim, uint32_t qRopeDim, uint32_t kRopeDim, float epsilon, uint32_t qRotaryCoeff, uint32_t kRotaryCoeff, + bool transposeWdq, bool transposeWuq, bool transposeWuk, uint8_t cacheMode, uint16_t quantMode, aclTensor *qOut0, + aclTensor *kvCacheOut0, aclTensor *qOut1, aclTensor *kvCacheOut1, uint64_t *workspaceSize, atb::Operation **op, + atb::Context *context) +{ + atb::infer::MlaPreprocessParam param; + param.wdqDim = wdqDim; + param.qRopeDim = qRopeDim; + param.kRopeDim = kRopeDim; + param.epsilon = epsilon; + param.qRotaryCoeff = static_cast(qRotaryCoeff); + param.kRotaryCoeff = static_cast(kRotaryCoeff); + param.transposeWdq = transposeWdq; + param.transposeWuq = transposeWuq; + param.transposeWuk = transposeWuk; + param.cacheMode = atb::infer::MlaPreprocessParam::CacheMode(cacheMode); + param.quantMode = atb::infer::MlaPreprocessParam::QuantMode(quantMode); + + if (op != nullptr && *op == nullptr) { + auto st = CreateOperation(param, op); + if (st != atb::NO_ERROR) { + ATB_LOG(ERROR) << "Create 
MLAPreprocess Operation failed!"; + return st; + } + } + atb::VariantPack pack; + size_t i = 0; + pack.inTensors.resize(g_MLAPPINTENSORNUM); + auto status = aclTensorToAtbTensor(input, &(pack.inTensors[i++])); + ATB_CHECK(status == atb::NO_ERROR, "input create failed!", return status); + status = aclTensorToAtbTensor(gamma0, &(pack.inTensors[i++])); + ATB_CHECK(status == atb::NO_ERROR, "gamma0 create failed!", return status); + status = aclTensorToAtbTensor(beta0, &(pack.inTensors[i++])); + ATB_CHECK(status == atb::NO_ERROR, "beta0 create failed!", return status); + if (param.quantMode == atb::infer::MlaPreprocessParam::QuantMode::PER_TENSOR_QUANT_ASYMM) { + status = aclTensorToAtbTensor(quantScale0, &(pack.inTensors[i++])); + ATB_CHECK(status == atb::NO_ERROR, "quantScale0 create failed!", return status); + status = aclTensorToAtbTensor(quantOffset0, &(pack.inTensors[i++])); + ATB_CHECK(status == atb::NO_ERROR, "quantOffset0 create failed!", return status); + } else { + status = aclTensorToAtbTensor(nullptr, &(pack.inTensors[i++])); + status = aclTensorToAtbTensor(nullptr, &(pack.inTensors[i++])); + } + status = aclTensorToAtbTensor(wdqkv, &(pack.inTensors[i++])); + ATB_CHECK(status == atb::NO_ERROR, "wdqkv create failed!", return status); + status = aclTensorToAtbTensor(deScale0, &(pack.inTensors[i++])); + ATB_CHECK(status == atb::NO_ERROR, "deScale0 create failed!", return status); + if (param.quantMode != atb::infer::MlaPreprocessParam::QuantMode::PER_TOKEN_QUANT_SYMM && + param.quantMode != atb::infer::MlaPreprocessParam::QuantMode::UNQUANT) { + status = aclTensorToAtbTensor(bias0, &(pack.inTensors[i++])); + ATB_CHECK(status == atb::NO_ERROR, "bias0 create failed!", return status); + } else { + status = aclTensorToAtbTensor(nullptr, &(pack.inTensors[i++])); + } + status = aclTensorToAtbTensor(gamma1, &(pack.inTensors[i++])); + ATB_CHECK(status == atb::NO_ERROR, "gamma1 create failed!", return status); + status = aclTensorToAtbTensor(beta1, 
&(pack.inTensors[i++])); + ATB_CHECK(status == atb::NO_ERROR, "beta1 create failed!", return status); + + if (param.quantMode == atb::infer::MlaPreprocessParam::QuantMode::PER_TENSOR_QUANT_ASYMM) { + status = aclTensorToAtbTensor(quantScale1, &(pack.inTensors[i++])); + ATB_CHECK(status == atb::NO_ERROR, "quantScale1 create failed!", return status); + status = aclTensorToAtbTensor(quantOffset1, &(pack.inTensors[i++])); + ATB_CHECK(status == atb::NO_ERROR, "quantOffset1 create failed!", return status); + } else { + status = aclTensorToAtbTensor(nullptr, &(pack.inTensors[i++])); + status = aclTensorToAtbTensor(nullptr, &(pack.inTensors[i++])); + } + status = aclTensorToAtbTensor(wuq, &(pack.inTensors[i++])); + ATB_CHECK(status == atb::NO_ERROR, "wuq create failed!", return status); + status = aclTensorToAtbTensor(deScale1, &(pack.inTensors[i++])); + ATB_CHECK(status == atb::NO_ERROR, "deScale1 create failed!", return status); + if (param.quantMode != atb::infer::MlaPreprocessParam::QuantMode::PER_TOKEN_QUANT_SYMM && + param.quantMode != atb::infer::MlaPreprocessParam::QuantMode::UNQUANT) { + status = aclTensorToAtbTensor(bias1, &(pack.inTensors[i++])); + ATB_CHECK(status == atb::NO_ERROR, "bias1 create failed!", return status); + } else { + status = aclTensorToAtbTensor(nullptr, &(pack.inTensors[i++])); + } + status = aclTensorToAtbTensor(gamma2, &(pack.inTensors[i++])); + ATB_CHECK(status == atb::NO_ERROR, "gamma2 create failed!", return status); + + status = aclTensorToAtbTensor(cos, &(pack.inTensors[i++])); + ATB_CHECK(status == atb::NO_ERROR, "cos create failed!", return status); + + status = aclTensorToAtbTensor(sin, &(pack.inTensors[i++])); + ATB_CHECK(status == atb::NO_ERROR, "sin create failed!", return status); + + status = aclTensorToAtbTensor(wuk, &(pack.inTensors[i++])); + ATB_CHECK(status == atb::NO_ERROR, "wuk create failed!", return status); + + status = aclTensorToAtbTensor(kvCache, &(pack.inTensors[i++])); + ATB_CHECK(status == atb::NO_ERROR, "kvCache 
create failed!", return status); + + if (param.cacheMode != atb::infer::MlaPreprocessParam::CacheMode::KVCACHE) { + status = aclTensorToAtbTensor(kvCacheRope, &(pack.inTensors[i++])); + ATB_CHECK(status == atb::NO_ERROR, "kvCacheRope create failed!", return status); + } else { + status = aclTensorToAtbTensor(nullptr, &(pack.inTensors[i++])); + } + status = aclTensorToAtbTensor(slotmapping, &(pack.inTensors[i++])); + if (param.cacheMode == atb::infer::MlaPreprocessParam::CacheMode::INT8_NZCACHE) { + status = aclTensorToAtbTensor(ctkvScale, &(pack.inTensors[i++])); + ATB_CHECK(status == atb::NO_ERROR, "ctkvScale create failed!", return status); + status = aclTensorToAtbTensor(qNopeScale, &(pack.inTensors[i++])); + ATB_CHECK(status == atb::NO_ERROR, "qNopeScale create failed!", return status); + } else { + status = aclTensorToAtbTensor(nullptr, &(pack.inTensors[i++])); + status = aclTensorToAtbTensor(nullptr, &(pack.inTensors[i++])); + } + + i = 0; + if (param.cacheMode != atb::infer::MlaPreprocessParam::CacheMode::KVCACHE) { + pack.outTensors.resize(g_MLAPPOUTTENSORNUMCACHEMODE); + status = aclTensorToAtbTensor(qOut0, &(pack.outTensors[i++])); + ATB_CHECK(status == atb::NO_ERROR, "qOut0 create failed!", return status); + status = aclTensorToAtbTensor(kvCacheOut0, &(pack.outTensors[i++])); + ATB_CHECK(status == atb::NO_ERROR, "kvCacheOut0 create failed!", return status); + status = aclTensorToAtbTensor(qOut1, &(pack.outTensors[i++])); + ATB_CHECK(status == atb::NO_ERROR, "qOut1 create failed!", return status); + status = aclTensorToAtbTensor(kvCacheOut1, &(pack.outTensors[i++])); + ATB_CHECK(status == atb::NO_ERROR, "kvCacheOut1 create failed!", return status); + } else { + pack.outTensors.resize(g_MLAPPOUTTENSORNUM); + status = aclTensorToAtbTensor(qOut0, &(pack.outTensors[i++])); + ATB_CHECK(status == atb::NO_ERROR, "qOut0 create failed!", return status); + status = aclTensorToAtbTensor(kvCacheOut0, &(pack.outTensors[i++])); + ATB_CHECK(status == atb::NO_ERROR, 
"kvCacheOut0 create failed!", return status); + } + if (op == nullptr || *op == nullptr) { + ATB_LOG(ERROR) << "AtbMLAPreprocessGetWorkspaceSize operation pointer is nullptr!"; + return atb::ERROR_INVALID_OPERATION_ADDR; + } + atb::Status st = (*op)->Setup(pack, *workspaceSize, context); + ATB_CHECK(st == atb::NO_ERROR, "AtbMLAPreprocess Setup failed!", return st); + return atb::NO_ERROR; +} + +atb::Status AtbMLAPreprocess(void *workspace, uint64_t workspaceSize, atb::Operation *op, atb::Context *context) +{ + ATB_CHECK(op != nullptr, "AtbMLAPreprocess expect op pointer not to be null!", + return atb::ERROR_INVALID_OPERATION_ADDR); + atb::VariantPack pack; + atb::Status st = op->Execute(pack, (uint8_t *)(workspace), workspaceSize, context); + ATB_CHECK(st == atb::NO_ERROR, "AtbMLAPreprocess Execute failed!", return st); + return st; +} + +#ifdef __cplusplus +} +#endif diff --git a/src/cinterface/atb_acl_mla.cpp b/src/ops_infer/multi_latent_attention/atb_acl_mla.cpp similarity index 97% rename from src/cinterface/atb_acl_mla.cpp rename to src/ops_infer/multi_latent_attention/atb_acl_mla.cpp index e050b04c..54b9f909 100644 --- a/src/cinterface/atb_acl_mla.cpp +++ b/src/ops_infer/multi_latent_attention/atb_acl_mla.cpp @@ -1,201 +1,201 @@ -/* - * Copyright (c) 2025 Huawei Technologies Co., Ltd. - * This file is a part of the CANN Open Software. - * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). - * Please refer to the License for details. You may not use this file except in compliance with the License. - * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, - * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. - * See LICENSE in the root of the software repository for the full text of the License. 
- */ -#include "atb/atb_acl.h" -#include "atb_acl_util.h" -#include "atb/operation/operation_base.h" - -#ifdef __cplusplus -extern "C" { -#endif - -const size_t g_MLAINTENSORNUMINT8NOMASK = 9; -const size_t g_MLAINTENSORNUMINT8MASK = 10; -const size_t g_MLAINTENSORNUMNOMASK = 7; -const size_t g_MLAINTENSORNUMMASK = 8; -const size_t g_MLAOUTTENSORNUMCALCRING = 2; -const size_t g_MLAOUTTENSORNUMNOCALCRING = 1; - -atb::Status AtbMLAGetWorkspaceSize(const aclTensor *qNope, const aclTensor *qRope, const aclTensor *ctKV, - const aclTensor *kRope, const aclTensor *blockTables, const aclTensor *contextLens, - const aclTensor *mask, const aclTensor *qSeqLen, const aclTensor *qkDescale, - const aclTensor *pvDescale, int32_t headNum, float qkScale, int32_t kvHeadNum, - int maskType, int calcType, uint8_t cacheMode, aclTensor *attenOut, aclTensor *lse, - uint64_t *workspaceSize, atb::Operation **op, atb::Context *context) -{ - atb::infer::MultiLatentAttentionParam param; - param.headNum = headNum; - param.qkScale = qkScale; - param.kvHeadNum = kvHeadNum; - param.maskType = atb::infer::MultiLatentAttentionParam::MaskType(maskType); - param.calcType = atb::infer::MultiLatentAttentionParam::CalcType(calcType); - param.cacheMode = atb::infer::MultiLatentAttentionParam::CacheMode(cacheMode); - if (op != nullptr && *op == nullptr) { - auto st = CreateOperation(param, op); - if (st != atb::NO_ERROR) { - ATB_LOG(ERROR) << "Create MLA Operation failed!"; - return st; - } - } - atb::VariantPack pack; - size_t i = 0; - size_t counter = 0; - if (param.cacheMode == atb::infer::MultiLatentAttentionParam::CacheMode::INT8_NZCACHE) { - if (param.maskType == atb::infer::MultiLatentAttentionParam::MaskType::UNDEFINED) { - pack.inTensors.resize(g_MLAINTENSORNUMINT8NOMASK); - counter = g_MLAINTENSORNUMINT8NOMASK; - } else { - pack.inTensors.resize(g_MLAINTENSORNUMINT8MASK); - counter = g_MLAINTENSORNUMINT8MASK; - } - } else { - if (param.maskType == 
atb::infer::MultiLatentAttentionParam::MaskType::UNDEFINED) { - pack.inTensors.resize(g_MLAINTENSORNUMNOMASK); - counter = g_MLAINTENSORNUMNOMASK; - } else { - pack.inTensors.resize(g_MLAINTENSORNUMMASK); - counter = g_MLAINTENSORNUMMASK; - } - } - if (param.calcType != atb::infer::MultiLatentAttentionParam::CalcType::CALC_TYPE_SPEC) { - pack.inTensors.resize(counter - 1); - } - auto status = aclTensorToAtbTensor(qNope, &(pack.inTensors[i++])); - ATB_CHECK(status == atb::NO_ERROR, "qNope create failed!", return status); - status = aclTensorToAtbTensor(qRope, &(pack.inTensors[i++])); - ATB_CHECK(status == atb::NO_ERROR, "qRope create failed!", return status); - status = aclTensorToAtbTensor(ctKV, &(pack.inTensors[i++])); - ATB_CHECK(status == atb::NO_ERROR, "ctKV create failed!", return status); - status = aclTensorToAtbTensor(kRope, &(pack.inTensors[i++])); - ATB_CHECK(status == atb::NO_ERROR, "kRope create failed!", return status); - status = aclTensorToAtbTensor(blockTables, &(pack.inTensors[i++])); - ATB_CHECK(status == atb::NO_ERROR, "blockTables create failed!", return status); - status = aclTensorToAtbTensorHost(contextLens, &(pack.inTensors[i++])); - ATB_CHECK(status == atb::NO_ERROR, "contextLens create failed!", return status); - - if (param.maskType != atb::infer::MultiLatentAttentionParam::MaskType::UNDEFINED) { - status = aclTensorToAtbTensor(mask, &(pack.inTensors[i++])); - ATB_CHECK(status == atb::NO_ERROR, "mask create failed!", return status); - } - if (param.calcType == atb::infer::MultiLatentAttentionParam::CalcType::CALC_TYPE_SPEC) { - status = aclTensorToAtbTensorHost(qSeqLen, &(pack.inTensors[i++])); - ATB_CHECK(status == atb::NO_ERROR, "qSeqLen create failed!", return status); - } - if (param.cacheMode == atb::infer::MultiLatentAttentionParam::CacheMode::INT8_NZCACHE) { - status = aclTensorToAtbTensor(qkDescale, &(pack.inTensors[i++])); - ATB_CHECK(status == atb::NO_ERROR, "qkDescale create failed!", return status); - status = 
aclTensorToAtbTensor(pvDescale, &(pack.inTensors[i++])); - ATB_CHECK(status == atb::NO_ERROR, "pvDescale create failed!", return status); - } - i = 0; - if (param.calcType != atb::infer::MultiLatentAttentionParam::CalcType::CALC_TYPE_RING) { - pack.outTensors.resize(g_MLAOUTTENSORNUMNOCALCRING); - status = aclTensorToAtbTensor(attenOut, &(pack.outTensors[i++])); - ATB_CHECK(status == atb::NO_ERROR, "attenOut create failed!", return status); - } else { - pack.outTensors.resize(g_MLAOUTTENSORNUMCALCRING); - status = aclTensorToAtbTensor(attenOut, &(pack.outTensors[i++])); - ATB_CHECK(status == atb::NO_ERROR, "calc_type_ring attenOut create failed!", return status); - status = aclTensorToAtbTensor(lse, &(pack.outTensors[i++])); - ATB_CHECK(status == atb::NO_ERROR, "calc_type_ring lse create failed!", return status); - } - if (op == nullptr || *op == nullptr) { - ATB_LOG(ERROR) << "AtbMLAGetWorkspaceSize opeartion pointer is nullptr!"; - return atb::ERROR_INVALID_OPERATION_ADDR; - } - atb::Status st = (*op)->Setup(pack, *workspaceSize, context); - ATB_CHECK(st == atb::NO_ERROR, "AtbMLA Setup failed!", return st); - return atb::NO_ERROR; -} - -atb::Status AtbMLA(void *workSpcace, uint64_t workspaceSize, atb::Operation *op, atb::Context *context) -{ - atb::VariantPack pack; - atb::Status st = op->Execute(pack, (uint8_t *)(workSpcace), workspaceSize, context); - ATB_CHECK(st == atb::NO_ERROR, "AtbMLA Execute failed!", return st); - return st; -} - - -atb::Status AtbMLAPreFillGetWorkspaceSize(const aclTensor *q, const aclTensor *qRope, const aclTensor *k, - const aclTensor *kRope, const aclTensor *v, const aclTensor *qSeqLen, - const aclTensor *kvSeqLen, const aclTensor *mask, int32_t headNum, - float qkScale, int32_t kvHeadNum, int maskType, uint8_t cacheMode, - aclTensor *attenOut, uint64_t *workspaceSize, atb::Operation **op, - atb::Context *context) -{ - atb::infer::MultiLatentAttentionParam param; - param.headNum = headNum; - param.qkScale = qkScale; - param.kvHeadNum 
= kvHeadNum; - param.maskType = atb::infer::MultiLatentAttentionParam::MaskType(maskType); - param.calcType = atb::infer::MultiLatentAttentionParam::CalcType::CALC_TYPE_PREFILL; - param.cacheMode = atb::infer::MultiLatentAttentionParam::CacheMode(cacheMode); - if (op != nullptr && *op == nullptr) { - auto st = CreateOperation(param, op); - if (st != atb::NO_ERROR) { - ATB_LOG(ERROR) << "Create MLA Operation prefill failed!"; - return st; - } - } - atb::VariantPack pack; - size_t i = 0; - - if (param.maskType == atb::infer::MultiLatentAttentionParam::MaskType::UNDEFINED) { - pack.inTensors.resize(g_MLAINTENSORNUMNOMASK); - } else { - pack.inTensors.resize(g_MLAINTENSORNUMMASK); - } - - auto status = aclTensorToAtbTensor(q, &(pack.inTensors[i++])); - ATB_CHECK(status == atb::NO_ERROR, "qNope create failed!", return status); - status = aclTensorToAtbTensor(qRope, &(pack.inTensors[i++])); - ATB_CHECK(status == atb::NO_ERROR, "qRope create failed!", return status); - status = aclTensorToAtbTensor(k, &(pack.inTensors[i++])); - ATB_CHECK(status == atb::NO_ERROR, "key create failed!", return status); - status = aclTensorToAtbTensor(kRope, &(pack.inTensors[i++])); - ATB_CHECK(status == atb::NO_ERROR, "kRope create failed!", return status); - status = aclTensorToAtbTensor(v, &(pack.inTensors[i++])); - ATB_CHECK(status == atb::NO_ERROR, "value create failed!", return status); - status = aclTensorToAtbTensorHost(qSeqLen, &(pack.inTensors[i++])); - ATB_CHECK(status == atb::NO_ERROR, "qSeqLen create failed!", return status); - status = aclTensorToAtbTensorHost(kvSeqLen, &(pack.inTensors[i++])); - ATB_CHECK(status == atb::NO_ERROR, "kvSeqLen create failed!", return status); - - if (param.maskType == atb::infer::MultiLatentAttentionParam::MaskType::MASK_TYPE_MASK_FREE) { - status = aclTensorToAtbTensor(mask, &(pack.inTensors[i++])); - ATB_CHECK(status == atb::NO_ERROR, "mask create failed!", return status); - } - - pack.outTensors.resize(g_MLAOUTTENSORNUMNOCALCRING); - status = 
aclTensorToAtbTensor(attenOut, &(pack.outTensors[0])); - ATB_CHECK(status == atb::NO_ERROR, "attenOut create failed!", return status); - - if (op == nullptr || *op == nullptr) { - ATB_LOG(ERROR) << "AtbMLAPreFillGetWorkspaceSize opeartion pointer is nullptr!"; - return atb::ERROR_INVALID_OPERATION_ADDR; - } - atb::Status st = (*op)->Setup(pack, *workspaceSize, context); - ATB_CHECK(st == atb::NO_ERROR, "AtbMLAPreFill Setup failed!", return st); - return atb::NO_ERROR; -} - -atb::Status AtbMLAPreFill(void *workspace, uint64_t workspaceSize, atb::Operation *op, atb::Context *context) -{ - ATB_CHECK(op != nullptr, "AtbMLAPreFill expect op pointer not to be null!", - return atb::ERROR_INVALID_OPERATION_ADDR); - atb::VariantPack pack; - atb::Status st = op->Execute(pack, (uint8_t *)(workspace), workspaceSize, context); - ATB_CHECK(st == atb::NO_ERROR, "AtbMLAPreFill Execute failed!", return st); - return st; -} - -#ifdef __cplusplus -} -#endif +/* + * Copyright (c) 2025 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. 
+ */ +#include "atb/atb_acl.h" +#include "atb/utils/atb_acl_util.h" +#include "atb/operation/operation_base.h" + +#ifdef __cplusplus +extern "C" { +#endif + +const size_t g_MLAINTENSORNUMINT8NOMASK = 9; +const size_t g_MLAINTENSORNUMINT8MASK = 10; +const size_t g_MLAINTENSORNUMNOMASK = 7; +const size_t g_MLAINTENSORNUMMASK = 8; +const size_t g_MLAOUTTENSORNUMCALCRING = 2; +const size_t g_MLAOUTTENSORNUMNOCALCRING = 1; + +atb::Status AtbMLAGetWorkspaceSize(const aclTensor *qNope, const aclTensor *qRope, const aclTensor *ctKV, + const aclTensor *kRope, const aclTensor *blockTables, const aclTensor *contextLens, + const aclTensor *mask, const aclTensor *qSeqLen, const aclTensor *qkDescale, + const aclTensor *pvDescale, int32_t headNum, float qkScale, int32_t kvHeadNum, + int maskType, int calcType, uint8_t cacheMode, aclTensor *attenOut, aclTensor *lse, + uint64_t *workspaceSize, atb::Operation **op, atb::Context *context) +{ + atb::infer::MultiLatentAttentionParam param; + param.headNum = headNum; + param.qkScale = qkScale; + param.kvHeadNum = kvHeadNum; + param.maskType = atb::infer::MultiLatentAttentionParam::MaskType(maskType); + param.calcType = atb::infer::MultiLatentAttentionParam::CalcType(calcType); + param.cacheMode = atb::infer::MultiLatentAttentionParam::CacheMode(cacheMode); + if (op != nullptr && *op == nullptr) { + auto st = CreateOperation(param, op); + if (st != atb::NO_ERROR) { + ATB_LOG(ERROR) << "Create MLA Operation failed!"; + return st; + } + } + atb::VariantPack pack; + size_t i = 0; + size_t counter = 0; + if (param.cacheMode == atb::infer::MultiLatentAttentionParam::CacheMode::INT8_NZCACHE) { + if (param.maskType == atb::infer::MultiLatentAttentionParam::MaskType::UNDEFINED) { + pack.inTensors.resize(g_MLAINTENSORNUMINT8NOMASK); + counter = g_MLAINTENSORNUMINT8NOMASK; + } else { + pack.inTensors.resize(g_MLAINTENSORNUMINT8MASK); + counter = g_MLAINTENSORNUMINT8MASK; + } + } else { + if (param.maskType == 
atb::infer::MultiLatentAttentionParam::MaskType::UNDEFINED) { + pack.inTensors.resize(g_MLAINTENSORNUMNOMASK); + counter = g_MLAINTENSORNUMNOMASK; + } else { + pack.inTensors.resize(g_MLAINTENSORNUMMASK); + counter = g_MLAINTENSORNUMMASK; + } + } + if (param.calcType != atb::infer::MultiLatentAttentionParam::CalcType::CALC_TYPE_SPEC) { + pack.inTensors.resize(counter - 1); + } + auto status = aclTensorToAtbTensor(qNope, &(pack.inTensors[i++])); + ATB_CHECK(status == atb::NO_ERROR, "qNope create failed!", return status); + status = aclTensorToAtbTensor(qRope, &(pack.inTensors[i++])); + ATB_CHECK(status == atb::NO_ERROR, "qRope create failed!", return status); + status = aclTensorToAtbTensor(ctKV, &(pack.inTensors[i++])); + ATB_CHECK(status == atb::NO_ERROR, "ctKV create failed!", return status); + status = aclTensorToAtbTensor(kRope, &(pack.inTensors[i++])); + ATB_CHECK(status == atb::NO_ERROR, "kRope create failed!", return status); + status = aclTensorToAtbTensor(blockTables, &(pack.inTensors[i++])); + ATB_CHECK(status == atb::NO_ERROR, "blockTables create failed!", return status); + status = aclTensorToAtbTensorHost(contextLens, &(pack.inTensors[i++])); + ATB_CHECK(status == atb::NO_ERROR, "contextLens create failed!", return status); + + if (param.maskType != atb::infer::MultiLatentAttentionParam::MaskType::UNDEFINED) { + status = aclTensorToAtbTensor(mask, &(pack.inTensors[i++])); + ATB_CHECK(status == atb::NO_ERROR, "mask create failed!", return status); + } + if (param.calcType == atb::infer::MultiLatentAttentionParam::CalcType::CALC_TYPE_SPEC) { + status = aclTensorToAtbTensorHost(qSeqLen, &(pack.inTensors[i++])); + ATB_CHECK(status == atb::NO_ERROR, "qSeqLen create failed!", return status); + } + if (param.cacheMode == atb::infer::MultiLatentAttentionParam::CacheMode::INT8_NZCACHE) { + status = aclTensorToAtbTensor(qkDescale, &(pack.inTensors[i++])); + ATB_CHECK(status == atb::NO_ERROR, "qkDescale create failed!", return status); + status = 
 aclTensorToAtbTensor(pvDescale, &(pack.inTensors[i++])); + ATB_CHECK(status == atb::NO_ERROR, "pvDescale create failed!", return status); + } + i = 0; + if (param.calcType != atb::infer::MultiLatentAttentionParam::CalcType::CALC_TYPE_RING) { + pack.outTensors.resize(g_MLAOUTTENSORNUMNOCALCRING); + status = aclTensorToAtbTensor(attenOut, &(pack.outTensors[i++])); + ATB_CHECK(status == atb::NO_ERROR, "attenOut create failed!", return status); + } else { + pack.outTensors.resize(g_MLAOUTTENSORNUMCALCRING); + status = aclTensorToAtbTensor(attenOut, &(pack.outTensors[i++])); + ATB_CHECK(status == atb::NO_ERROR, "calc_type_ring attenOut create failed!", return status); + status = aclTensorToAtbTensor(lse, &(pack.outTensors[i++])); + ATB_CHECK(status == atb::NO_ERROR, "calc_type_ring lse create failed!", return status); + } + if (op == nullptr || *op == nullptr) { + ATB_LOG(ERROR) << "AtbMLAGetWorkspaceSize operation pointer is nullptr!"; + return atb::ERROR_INVALID_OPERATION_ADDR; + } + atb::Status st = (*op)->Setup(pack, *workspaceSize, context); + ATB_CHECK(st == atb::NO_ERROR, "AtbMLA Setup failed!", return st); + return atb::NO_ERROR; +} + +atb::Status AtbMLA(void *workspace, uint64_t workspaceSize, atb::Operation *op, atb::Context *context) +{ + atb::VariantPack pack; + atb::Status st = op->Execute(pack, (uint8_t *)(workspace), workspaceSize, context); + ATB_CHECK(st == atb::NO_ERROR, "AtbMLA Execute failed!", return st); + return st; +} + + +atb::Status AtbMLAPreFillGetWorkspaceSize(const aclTensor *q, const aclTensor *qRope, const aclTensor *k, + const aclTensor *kRope, const aclTensor *v, const aclTensor *qSeqLen, + const aclTensor *kvSeqLen, const aclTensor *mask, int32_t headNum, + float qkScale, int32_t kvHeadNum, int maskType, uint8_t cacheMode, + aclTensor *attenOut, uint64_t *workspaceSize, atb::Operation **op, + atb::Context *context) +{ + atb::infer::MultiLatentAttentionParam param; + param.headNum = headNum; + param.qkScale = qkScale; + param.kvHeadNum 
= kvHeadNum; + param.maskType = atb::infer::MultiLatentAttentionParam::MaskType(maskType); + param.calcType = atb::infer::MultiLatentAttentionParam::CalcType::CALC_TYPE_PREFILL; + param.cacheMode = atb::infer::MultiLatentAttentionParam::CacheMode(cacheMode); + if (op != nullptr && *op == nullptr) { + auto st = CreateOperation(param, op); + if (st != atb::NO_ERROR) { + ATB_LOG(ERROR) << "Create MLA Operation prefill failed!"; + return st; + } + } + atb::VariantPack pack; + size_t i = 0; + + if (param.maskType == atb::infer::MultiLatentAttentionParam::MaskType::UNDEFINED) { + pack.inTensors.resize(g_MLAINTENSORNUMNOMASK); + } else { + pack.inTensors.resize(g_MLAINTENSORNUMMASK); + } + + auto status = aclTensorToAtbTensor(q, &(pack.inTensors[i++])); + ATB_CHECK(status == atb::NO_ERROR, "qNope create failed!", return status); + status = aclTensorToAtbTensor(qRope, &(pack.inTensors[i++])); + ATB_CHECK(status == atb::NO_ERROR, "qRope create failed!", return status); + status = aclTensorToAtbTensor(k, &(pack.inTensors[i++])); + ATB_CHECK(status == atb::NO_ERROR, "key create failed!", return status); + status = aclTensorToAtbTensor(kRope, &(pack.inTensors[i++])); + ATB_CHECK(status == atb::NO_ERROR, "kRope create failed!", return status); + status = aclTensorToAtbTensor(v, &(pack.inTensors[i++])); + ATB_CHECK(status == atb::NO_ERROR, "value create failed!", return status); + status = aclTensorToAtbTensorHost(qSeqLen, &(pack.inTensors[i++])); + ATB_CHECK(status == atb::NO_ERROR, "qSeqLen create failed!", return status); + status = aclTensorToAtbTensorHost(kvSeqLen, &(pack.inTensors[i++])); + ATB_CHECK(status == atb::NO_ERROR, "kvSeqLen create failed!", return status); + + if (param.maskType == atb::infer::MultiLatentAttentionParam::MaskType::MASK_TYPE_MASK_FREE) { + status = aclTensorToAtbTensor(mask, &(pack.inTensors[i++])); + ATB_CHECK(status == atb::NO_ERROR, "mask create failed!", return status); + } + + pack.outTensors.resize(g_MLAOUTTENSORNUMNOCALCRING); + status = 
aclTensorToAtbTensor(attenOut, &(pack.outTensors[0])); + ATB_CHECK(status == atb::NO_ERROR, "attenOut create failed!", return status); + + if (op == nullptr || *op == nullptr) { + ATB_LOG(ERROR) << "AtbMLAPreFillGetWorkspaceSize operation pointer is nullptr!"; + return atb::ERROR_INVALID_OPERATION_ADDR; + } + atb::Status st = (*op)->Setup(pack, *workspaceSize, context); + ATB_CHECK(st == atb::NO_ERROR, "AtbMLAPreFill Setup failed!", return st); + return atb::NO_ERROR; +} + +atb::Status AtbMLAPreFill(void *workspace, uint64_t workspaceSize, atb::Operation *op, atb::Context *context) +{ + ATB_CHECK(op != nullptr, "AtbMLAPreFill expect op pointer not to be null!", + return atb::ERROR_INVALID_OPERATION_ADDR); + atb::VariantPack pack; + atb::Status st = op->Execute(pack, (uint8_t *)(workspace), workspaceSize, context); + ATB_CHECK(st == atb::NO_ERROR, "AtbMLAPreFill Execute failed!", return st); + return st; +} + +#ifdef __cplusplus +} +#endif diff --git a/src/cinterface/atb_acl_paged_cache_load.cpp b/src/ops_infer/paged_cache_load/atb_acl_paged_cache_load.cpp similarity index 99% rename from src/cinterface/atb_acl_paged_cache_load.cpp rename to src/ops_infer/paged_cache_load/atb_acl_paged_cache_load.cpp index df6d86d5..07580b09 100644 --- a/src/cinterface/atb_acl_paged_cache_load.cpp +++ b/src/ops_infer/paged_cache_load/atb_acl_paged_cache_load.cpp @@ -8,7 +8,7 @@ * See LICENSE in the root of the software repository for the full text of the License. 
*/ #include "atb/atb_acl.h" -#include "atb_acl_util.h" +#include "atb/utils/atb_acl_util.h" #include "atb/operation/operation_base.h" #ifdef __cplusplus diff --git a/src/cinterface/atb_acl_ring_mla.cpp b/src/ops_infer/ring_mla/atb_acl_ring_mla.cpp similarity index 97% rename from src/cinterface/atb_acl_ring_mla.cpp rename to src/ops_infer/ring_mla/atb_acl_ring_mla.cpp index 62468810..041888a1 100644 --- a/src/cinterface/atb_acl_ring_mla.cpp +++ b/src/ops_infer/ring_mla/atb_acl_ring_mla.cpp @@ -1,99 +1,99 @@ -/* - * Copyright (c) 2025 Huawei Technologies Co., Ltd. - * This file is a part of the CANN Open Software. - * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). - * Please refer to the License for details. You may not use this file except in compliance with the License. - * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, - * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. - * See LICENSE in the root of the software repository for the full text of the License. 
- */ -#include "atb/atb_acl.h" -#include "atb_acl_util.h" -#include "atb/operation/operation_base.h" - -#ifdef __cplusplus -extern "C" { -#endif - -const size_t g_RING_MLA_INTENSOR_NUM = 7; -const size_t g_RING_MLA_OUTTENSOR_NUM = 2; - -atb::Status AtbRingMLAGetWorkspaceSize(const aclTensor *querySplit1, const aclTensor *querySplit2, - const aclTensor *keySplit1, const aclTensor *keySplit2, const aclTensor *value, - const aclTensor *mask, const aclTensor *seqLen, const aclTensor *prevOut, - const aclTensor *prevLse, int32_t headNum, int32_t kvHeadNum, float qkScale, - int kernelType, int maskType, int inputLayout, int calcType, aclTensor *output, - aclTensor *softmaxLse, uint64_t *workspaceSize, atb::Operation **op, - atb::Context *context) -{ - atb::infer::RingMLAParam param; - param.headNum = headNum; - param.kvHeadNum = kvHeadNum; - param.qkScale = qkScale; - param.kernelType = atb::infer::RingMLAParam::KernelType(kernelType); - param.maskType = atb::infer::RingMLAParam::MaskType(maskType); - param.inputLayout = atb::infer::InputLayout(inputLayout); - param.calcType = atb::infer::RingMLAParam::CalcType(calcType); - if (op != nullptr && *op == nullptr) { - auto st = CreateOperation(param, op); - if (st != atb::NO_ERROR) { - ATB_LOG(ERROR) << "Create RingMLA Operation failed!"; - return st; - } - } - atb::VariantPack pack; - size_t index = 0; - if (param.calcType == atb::infer::RingMLAParam::CalcType::CALC_TYPE_DEFAULT) { - pack.inTensors.resize(g_RING_MLA_INTENSOR_NUM + 2); // 2: prevOut, prevLse - } else { - pack.inTensors.resize(g_RING_MLA_INTENSOR_NUM); - } - - auto status = aclTensorToAtbTensor(querySplit1, &(pack.inTensors[index++])); - ATB_CHECK(status == atb::NO_ERROR, "querySplit1 create failed!", return status); - status = aclTensorToAtbTensor(querySplit2, &(pack.inTensors[index++])); - ATB_CHECK(status == atb::NO_ERROR, "querySplit2 create failed!", return status); - status = aclTensorToAtbTensor(keySplit1, &(pack.inTensors[index++])); - 
ATB_CHECK(status == atb::NO_ERROR, "keySplit1 create failed!", return status); - status = aclTensorToAtbTensor(keySplit2, &(pack.inTensors[index++])); - ATB_CHECK(status == atb::NO_ERROR, "keySplit2 create failed!", return status); - status = aclTensorToAtbTensor(value, &(pack.inTensors[index++])); - ATB_CHECK(status == atb::NO_ERROR, "value create failed!", return status); - status = aclTensorToAtbTensor(mask, &(pack.inTensors[index++])); - ATB_CHECK(status == atb::NO_ERROR, "mask create failed!", return status); - status = aclTensorToAtbTensorHost(seqLen, &(pack.inTensors[index++])); - ATB_CHECK(status == atb::NO_ERROR, "seqLen create failed!", return status); - if (param.calcType == atb::infer::RingMLAParam::CalcType::CALC_TYPE_DEFAULT) { - status = aclTensorToAtbTensor(prevOut, &(pack.inTensors[index++])); - ATB_CHECK(status == atb::NO_ERROR, "prevOut create failed!", return status); - status = aclTensorToAtbTensor(prevLse, &(pack.inTensors[index++])); - ATB_CHECK(status == atb::NO_ERROR, "prevLse create failed!", return status); - } - - index = 0; - pack.outTensors.resize(g_RING_MLA_OUTTENSOR_NUM); - status = aclTensorToAtbTensor(output, &(pack.outTensors[index++])); - ATB_CHECK(status == atb::NO_ERROR, "output create failed!", return status); - status = aclTensorToAtbTensor(softmaxLse, &(pack.outTensors[index++])); - ATB_CHECK(status == atb::NO_ERROR, "softmaxLse create failed!", return status); - if (op == nullptr || *op == nullptr) { - ATB_LOG(ERROR) << "AtbRingMLAGetWorkspaceSize opeartion pointer is nullptr!"; - return atb::ERROR_INVALID_OPERATION_ADDR; - } - status = (*op)->Setup(pack, *workspaceSize, context); - ATB_CHECK(status == atb::NO_ERROR, "AtbRingMLA Setup failed!", return status); - return atb::NO_ERROR; -} - -atb::Status AtbRingMLA(void *workspace, uint64_t workspaceSize, atb::Operation *op, atb::Context *context) -{ - ATB_CHECK(op != nullptr, "AtbRingMLA expect op pointer not to be null!", return atb::ERROR_INVALID_OPERATION_ADDR); - 
atb::VariantPack pack; - atb::Status st = op->Execute(pack, (uint8_t *)(workspace), workspaceSize, context); - ATB_CHECK(st == atb::NO_ERROR, "AtbRingMLA Execute failed!", return st); - return st; -} - -#ifdef __cplusplus -} -#endif +/* + * Copyright (c) 2025 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ +#include "atb/atb_acl.h" +#include "atb/utils/atb_acl_util.h" +#include "atb/operation/operation_base.h" + +#ifdef __cplusplus +extern "C" { +#endif + +const size_t g_RING_MLA_INTENSOR_NUM = 7; +const size_t g_RING_MLA_OUTTENSOR_NUM = 2; + +atb::Status AtbRingMLAGetWorkspaceSize(const aclTensor *querySplit1, const aclTensor *querySplit2, + const aclTensor *keySplit1, const aclTensor *keySplit2, const aclTensor *value, + const aclTensor *mask, const aclTensor *seqLen, const aclTensor *prevOut, + const aclTensor *prevLse, int32_t headNum, int32_t kvHeadNum, float qkScale, + int kernelType, int maskType, int inputLayout, int calcType, aclTensor *output, + aclTensor *softmaxLse, uint64_t *workspaceSize, atb::Operation **op, + atb::Context *context) +{ + atb::infer::RingMLAParam param; + param.headNum = headNum; + param.kvHeadNum = kvHeadNum; + param.qkScale = qkScale; + param.kernelType = atb::infer::RingMLAParam::KernelType(kernelType); + param.maskType = atb::infer::RingMLAParam::MaskType(maskType); + param.inputLayout = atb::infer::InputLayout(inputLayout); + param.calcType = atb::infer::RingMLAParam::CalcType(calcType); + if (op != 
nullptr && *op == nullptr) { + auto st = CreateOperation(param, op); + if (st != atb::NO_ERROR) { + ATB_LOG(ERROR) << "Create RingMLA Operation failed!"; + return st; + } + } + atb::VariantPack pack; + size_t index = 0; + if (param.calcType == atb::infer::RingMLAParam::CalcType::CALC_TYPE_DEFAULT) { + pack.inTensors.resize(g_RING_MLA_INTENSOR_NUM + 2); // 2: prevOut, prevLse + } else { + pack.inTensors.resize(g_RING_MLA_INTENSOR_NUM); + } + + auto status = aclTensorToAtbTensor(querySplit1, &(pack.inTensors[index++])); + ATB_CHECK(status == atb::NO_ERROR, "querySplit1 create failed!", return status); + status = aclTensorToAtbTensor(querySplit2, &(pack.inTensors[index++])); + ATB_CHECK(status == atb::NO_ERROR, "querySplit2 create failed!", return status); + status = aclTensorToAtbTensor(keySplit1, &(pack.inTensors[index++])); + ATB_CHECK(status == atb::NO_ERROR, "keySplit1 create failed!", return status); + status = aclTensorToAtbTensor(keySplit2, &(pack.inTensors[index++])); + ATB_CHECK(status == atb::NO_ERROR, "keySplit2 create failed!", return status); + status = aclTensorToAtbTensor(value, &(pack.inTensors[index++])); + ATB_CHECK(status == atb::NO_ERROR, "value create failed!", return status); + status = aclTensorToAtbTensor(mask, &(pack.inTensors[index++])); + ATB_CHECK(status == atb::NO_ERROR, "mask create failed!", return status); + status = aclTensorToAtbTensorHost(seqLen, &(pack.inTensors[index++])); + ATB_CHECK(status == atb::NO_ERROR, "seqLen create failed!", return status); + if (param.calcType == atb::infer::RingMLAParam::CalcType::CALC_TYPE_DEFAULT) { + status = aclTensorToAtbTensor(prevOut, &(pack.inTensors[index++])); + ATB_CHECK(status == atb::NO_ERROR, "prevOut create failed!", return status); + status = aclTensorToAtbTensor(prevLse, &(pack.inTensors[index++])); + ATB_CHECK(status == atb::NO_ERROR, "prevLse create failed!", return status); + } + + index = 0; + pack.outTensors.resize(g_RING_MLA_OUTTENSOR_NUM); + status = aclTensorToAtbTensor(output, 
&(pack.outTensors[index++])); + ATB_CHECK(status == atb::NO_ERROR, "output create failed!", return status); + status = aclTensorToAtbTensor(softmaxLse, &(pack.outTensors[index++])); + ATB_CHECK(status == atb::NO_ERROR, "softmaxLse create failed!", return status); + if (op == nullptr || *op == nullptr) { + ATB_LOG(ERROR) << "AtbRingMLAGetWorkspaceSize operation pointer is nullptr!"; + return atb::ERROR_INVALID_OPERATION_ADDR; + } + status = (*op)->Setup(pack, *workspaceSize, context); + ATB_CHECK(status == atb::NO_ERROR, "AtbRingMLA Setup failed!", return status); + return atb::NO_ERROR; +} + +atb::Status AtbRingMLA(void *workspace, uint64_t workspaceSize, atb::Operation *op, atb::Context *context) +{ + ATB_CHECK(op != nullptr, "AtbRingMLA expect op pointer not to be null!", return atb::ERROR_INVALID_OPERATION_ADDR); + atb::VariantPack pack; + atb::Status st = op->Execute(pack, (uint8_t *)(workspace), workspaceSize, context); + ATB_CHECK(st == atb::NO_ERROR, "AtbRingMLA Execute failed!", return st); + return st; +} + +#ifdef __cplusplus +} +#endif diff --git a/src/cinterface/atb_acl_self_attention_prefix_encoder.cpp b/src/ops_infer/self_attention/atb_acl_self_attention_prefix_encoder.cpp similarity index 97% rename from src/cinterface/atb_acl_self_attention_prefix_encoder.cpp rename to src/ops_infer/self_attention/atb_acl_self_attention_prefix_encoder.cpp index 73e4e366..44da3af4 100644 --- a/src/cinterface/atb_acl_self_attention_prefix_encoder.cpp +++ b/src/ops_infer/self_attention/atb_acl_self_attention_prefix_encoder.cpp @@ -1,115 +1,115 @@ -/* - -Copyright (c) 2025 Huawei Technologies Co., Ltd. -This file is a part of the CANN Open Software. -Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). -Please refer to the License for details. You may not use this file except in compliance with the License. 
-THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -See LICENSE in the root of the software repository for the full text of the License. -*/ -#include "atb/atb_acl.h" -#include "atb_acl_util.h" -#include "atb/operation/operation_base.h" -#ifdef __cplusplus -extern "C" { -#endif - -const size_t g_SELF_ATTENTION_PREFIX_ENCODER_INTENSOR_NUM = 6; -const size_t g_SELF_ATTENTION_PREFIX_ENCODER_OUTTENSOR_NUM = 1; - -atb::Status AtbSelfAttentionPrefixEncoderGetWorkspaceSize(const aclTensor *query, const aclTensor *key, - const aclTensor *value, const aclTensor *blockTables, - const aclTensor *mask, const aclTensor *seqLen, - const aclTensor *kvSeqLen, const aclTensor *slopes, - int maskType, int32_t headNum, int32_t kvHeadNum, - float qkScale, aclTensor *attnOut, uint64_t *workspaceSize, - atb::Operation **op, atb::Context *context) -{ - atb::infer::SelfAttentionParam param; - param.maskType = atb::infer::SelfAttentionParam::MaskType(maskType); - param.headNum = headNum; - param.kvHeadNum = kvHeadNum; - param.qkScale = qkScale; - param.quantType = atb::infer::SelfAttentionParam::QuantType::TYPE_QUANT_UNDEFINED; - param.outDataType = ACL_DT_UNDEFINED; - param.qScale = 1; - param.batchRunStatusEnable = false; - param.isTriuMask = 1; - param.calcType = atb::infer::SelfAttentionParam::CalcType::PREFIX_ENCODER; - param.kernelType = atb::infer::SelfAttentionParam::KernelType::KERNELTYPE_HIGH_PRECISION; - param.clampType = atb::infer::SelfAttentionParam::ClampType::CLAMP_TYPE_UNDEFINED; - param.clampMin = 0; - param.clampMax = 0; - param.kvcacheCfg = atb::infer::SelfAttentionParam::KvCacheCfg::K_CACHE_V_CACHE; - param.scaleType = atb::infer::SelfAttentionParam::ScaleType::SCALE_TYPE_TOR; - param.inputLayout = atb::infer::InputLayout::TYPE_BSND; - - if (op != nullptr && *op == nullptr) { - auto st = CreateOperation(param, 
op); - if (st != atb::NO_ERROR) { - ATB_LOG(ERROR) << "Create SelfAttention Operation Prefix Encoder failed!"; - return st; - } - } - atb::VariantPack pack; - size_t index = 0; - bool isAlibiMask = param.maskType == atb::infer::SelfAttentionParam::MaskType::MASK_TYPE_ALIBI_COMPRESS || - param.maskType == atb::infer::SelfAttentionParam::MaskType::MASK_TYPE_ALIBI_COMPRESS_SQRT; - if (isAlibiMask) { - pack.inTensors.resize(g_SELF_ATTENTION_PREFIX_ENCODER_INTENSOR_NUM + 2); // 2: mask, slopes - } else if (param.maskType == atb::infer::SelfAttentionParam::MaskType::MASK_TYPE_CAUSAL_MASK) { - pack.inTensors.resize(g_SELF_ATTENTION_PREFIX_ENCODER_INTENSOR_NUM); // mask auto-generated - } else { - pack.inTensors.resize(g_SELF_ATTENTION_PREFIX_ENCODER_INTENSOR_NUM + 1); // 1: mask - } - - auto status = aclTensorToAtbTensor(query, &(pack.inTensors[index++])); - ATB_CHECK(status == atb::NO_ERROR, "query create failed!", return status); - status = aclTensorToAtbTensor(key, &(pack.inTensors[index++])); - ATB_CHECK(status == atb::NO_ERROR, "key create failed!", return status); - status = aclTensorToAtbTensor(value, &(pack.inTensors[index++])); - ATB_CHECK(status == atb::NO_ERROR, "value create failed!", return status); - status = aclTensorToAtbTensor(blockTables, &(pack.inTensors[index++])); - ATB_CHECK(status == atb::NO_ERROR, "blockTables create failed!", return status); - if (param.maskType != atb::infer::SelfAttentionParam::MaskType::MASK_TYPE_CAUSAL_MASK) { - status = aclTensorToAtbTensor(mask, &(pack.inTensors[index++])); - ATB_CHECK(status == atb::NO_ERROR, "mask create failed!", return status); - } - status = aclTensorToAtbTensorHost(seqLen, &(pack.inTensors[index++])); - ATB_CHECK(status == atb::NO_ERROR, "seqLen create failed!", return status); - status = aclTensorToAtbTensorHost(kvSeqLen, &(pack.inTensors[index++])); - ATB_CHECK(status == atb::NO_ERROR, "kvSeqLen create failed!", return status); - if (isAlibiMask) { - status = aclTensorToAtbTensor(slopes, 
&(pack.inTensors[index++])); - ATB_CHECK(status == atb::NO_ERROR, "slopes create failed!", return status); - } - - index = 0; - pack.outTensors.resize(g_SELF_ATTENTION_PREFIX_ENCODER_OUTTENSOR_NUM); - status = aclTensorToAtbTensor(attnOut, &(pack.outTensors[index])); - ATB_CHECK(status == atb::NO_ERROR, "attnOut create failed!", return status); - - if (op == nullptr || *op == nullptr) { - ATB_LOG(ERROR) << "AtbSelfAttentionPrefixEncoderGetWorkspaceSize opeartion pointer is nullptr!"; - return atb::ERROR_INVALID_OPERATION_ADDR; - } - status = (*op)->Setup(pack, *workspaceSize, context); - ATB_CHECK(status == atb::NO_ERROR, "AtbSelfAttentionPrefixEncoder Setup failed!", return status); - return atb::NO_ERROR; -} - -atb::Status AtbSelfAttentionPrefixEncoder(void *workspace, uint64_t workspaceSize, atb::Operation *op, - atb::Context *context) -{ - ATB_CHECK(op != nullptr, "AtbSelfAttentionPrefixEncoder expect op pointer not to be null!", - return atb::ERROR_INVALID_OPERATION_ADDR); - atb::VariantPack pack; - atb::Status st = op->Execute(pack, (uint8_t *)(workspace), workspaceSize, context); - ATB_CHECK(st == atb::NO_ERROR, "AtbSelfAttentionPrefixEncoder Execute failed!", return st); - return st; -} - -#ifdef __cplusplus -} +/* + +Copyright (c) 2025 Huawei Technologies Co., Ltd. +This file is a part of the CANN Open Software. +Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). +Please refer to the License for details. You may not use this file except in compliance with the License. +THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +See LICENSE in the root of the software repository for the full text of the License. 
+*/ +#include "atb/atb_acl.h" +#include "atb/utils/atb_acl_util.h" +#include "atb/operation/operation_base.h" +#ifdef __cplusplus +extern "C" { +#endif + +const size_t g_SELF_ATTENTION_PREFIX_ENCODER_INTENSOR_NUM = 6; +const size_t g_SELF_ATTENTION_PREFIX_ENCODER_OUTTENSOR_NUM = 1; + +atb::Status AtbSelfAttentionPrefixEncoderGetWorkspaceSize(const aclTensor *query, const aclTensor *key, + const aclTensor *value, const aclTensor *blockTables, + const aclTensor *mask, const aclTensor *seqLen, + const aclTensor *kvSeqLen, const aclTensor *slopes, + int maskType, int32_t headNum, int32_t kvHeadNum, + float qkScale, aclTensor *attnOut, uint64_t *workspaceSize, + atb::Operation **op, atb::Context *context) +{ + atb::infer::SelfAttentionParam param; + param.maskType = atb::infer::SelfAttentionParam::MaskType(maskType); + param.headNum = headNum; + param.kvHeadNum = kvHeadNum; + param.qkScale = qkScale; + param.quantType = atb::infer::SelfAttentionParam::QuantType::TYPE_QUANT_UNDEFINED; + param.outDataType = ACL_DT_UNDEFINED; + param.qScale = 1; + param.batchRunStatusEnable = false; + param.isTriuMask = 1; + param.calcType = atb::infer::SelfAttentionParam::CalcType::PREFIX_ENCODER; + param.kernelType = atb::infer::SelfAttentionParam::KernelType::KERNELTYPE_HIGH_PRECISION; + param.clampType = atb::infer::SelfAttentionParam::ClampType::CLAMP_TYPE_UNDEFINED; + param.clampMin = 0; + param.clampMax = 0; + param.kvcacheCfg = atb::infer::SelfAttentionParam::KvCacheCfg::K_CACHE_V_CACHE; + param.scaleType = atb::infer::SelfAttentionParam::ScaleType::SCALE_TYPE_TOR; + param.inputLayout = atb::infer::InputLayout::TYPE_BSND; + + if (op != nullptr && *op == nullptr) { + auto st = CreateOperation(param, op); + if (st != atb::NO_ERROR) { + ATB_LOG(ERROR) << "Create SelfAttention Operation Prefix Encoder failed!"; + return st; + } + } + atb::VariantPack pack; + size_t index = 0; + bool isAlibiMask = param.maskType == atb::infer::SelfAttentionParam::MaskType::MASK_TYPE_ALIBI_COMPRESS || + 
param.maskType == atb::infer::SelfAttentionParam::MaskType::MASK_TYPE_ALIBI_COMPRESS_SQRT; + if (isAlibiMask) { + pack.inTensors.resize(g_SELF_ATTENTION_PREFIX_ENCODER_INTENSOR_NUM + 2); // 2: mask, slopes + } else if (param.maskType == atb::infer::SelfAttentionParam::MaskType::MASK_TYPE_CAUSAL_MASK) { + pack.inTensors.resize(g_SELF_ATTENTION_PREFIX_ENCODER_INTENSOR_NUM); // mask auto-generated + } else { + pack.inTensors.resize(g_SELF_ATTENTION_PREFIX_ENCODER_INTENSOR_NUM + 1); // 1: mask + } + + auto status = aclTensorToAtbTensor(query, &(pack.inTensors[index++])); + ATB_CHECK(status == atb::NO_ERROR, "query create failed!", return status); + status = aclTensorToAtbTensor(key, &(pack.inTensors[index++])); + ATB_CHECK(status == atb::NO_ERROR, "key create failed!", return status); + status = aclTensorToAtbTensor(value, &(pack.inTensors[index++])); + ATB_CHECK(status == atb::NO_ERROR, "value create failed!", return status); + status = aclTensorToAtbTensor(blockTables, &(pack.inTensors[index++])); + ATB_CHECK(status == atb::NO_ERROR, "blockTables create failed!", return status); + if (param.maskType != atb::infer::SelfAttentionParam::MaskType::MASK_TYPE_CAUSAL_MASK) { + status = aclTensorToAtbTensor(mask, &(pack.inTensors[index++])); + ATB_CHECK(status == atb::NO_ERROR, "mask create failed!", return status); + } + status = aclTensorToAtbTensorHost(seqLen, &(pack.inTensors[index++])); + ATB_CHECK(status == atb::NO_ERROR, "seqLen create failed!", return status); + status = aclTensorToAtbTensorHost(kvSeqLen, &(pack.inTensors[index++])); + ATB_CHECK(status == atb::NO_ERROR, "kvSeqLen create failed!", return status); + if (isAlibiMask) { + status = aclTensorToAtbTensor(slopes, &(pack.inTensors[index++])); + ATB_CHECK(status == atb::NO_ERROR, "slopes create failed!", return status); + } + + index = 0; + pack.outTensors.resize(g_SELF_ATTENTION_PREFIX_ENCODER_OUTTENSOR_NUM); + status = aclTensorToAtbTensor(attnOut, &(pack.outTensors[index])); + ATB_CHECK(status == 
atb::NO_ERROR, "attnOut create failed!", return status); + + if (op == nullptr || *op == nullptr) { + ATB_LOG(ERROR) << "AtbSelfAttentionPrefixEncoderGetWorkspaceSize operation pointer is nullptr!"; + return atb::ERROR_INVALID_OPERATION_ADDR; + } + status = (*op)->Setup(pack, *workspaceSize, context); + ATB_CHECK(status == atb::NO_ERROR, "AtbSelfAttentionPrefixEncoder Setup failed!", return status); + return atb::NO_ERROR; +} + +atb::Status AtbSelfAttentionPrefixEncoder(void *workspace, uint64_t workspaceSize, atb::Operation *op, + atb::Context *context) +{ + ATB_CHECK(op != nullptr, "AtbSelfAttentionPrefixEncoder expect op pointer not to be null!", + return atb::ERROR_INVALID_OPERATION_ADDR); + atb::VariantPack pack; + atb::Status st = op->Execute(pack, (uint8_t *)(workspace), workspaceSize, context); + ATB_CHECK(st == atb::NO_ERROR, "AtbSelfAttentionPrefixEncoder Execute failed!", return st); + return st; +} + +#ifdef __cplusplus +} #endif \ No newline at end of file -- Gitee From d911cf29c751080ad7473623be896fac04915aba Mon Sep 17 00:00:00 2001 From: caixilong Date: Sun, 14 Sep 2025 20:30:18 +0800 Subject: [PATCH 08/94] add self_attention combined test --- .../test_self_attention_combine.py | 1913 +++++++++++++++++ 1 file changed, 1913 insertions(+) create mode 100644 tests/apitest/opstest/python/operations/self_attention/test_self_attention_combine.py diff --git a/tests/apitest/opstest/python/operations/self_attention/test_self_attention_combine.py b/tests/apitest/opstest/python/operations/self_attention/test_self_attention_combine.py new file mode 100644 index 00000000..bfbfb539 --- /dev/null +++ b/tests/apitest/opstest/python/operations/self_attention/test_self_attention_combine.py @@ -0,0 +1,1913 @@ +# +# Copyright (c) 2024 Huawei Technologies Co., Ltd. +# This file is a part of the CANN Open Software. +# Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). +# Please refer to the License for details. 
You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. +# + +import time +import json +from enum import Enum +import torch +import logging +import unittest +import math +import numpy as np +import sys +import os +import random +sys.path.append(os.path.join(os.path.dirname(__file__), "../")) +from self_attention.self_attention_test_data_generator import SelfAttentionTestDataGenerator + +import operation_test # NOQA: E402 +torch.set_printoptions(profile="full") +np.set_printoptions(threshold=np.inf) +sys.path.append("./tests/pythontest") +save_path = "./" +from golden_compare_cv import compare_cv + +class ScaleType(Enum): + SCALE_TOR = 0 + SCALE_LOGN = 1 + SCALE_LOGN_FP32 = 2 +np.random.seed(123) +MASK_TYPE_NO_MASK = 0 +MASK_TYPE_NO_HEAD = 1 +MASK_TYPE_NO_BATCH = 2 +MASK_TYPE_ALIBI_WITH_BATCH = 3 +MASK_TYPE_ALIBI_NO_BATCH = 4 +MASK_TYPE_NO_HEAD_DECODER = 5 +MASK_TYPE_SWA = 6 +MASK_TYPE_SWA_DECODER = 7 +MASK_TYPE_ALIBI_WITH_PREFIX_BATCH = 8 +MASK_TYPE_NO_BATCH_WITH_PREFIX = 9 +MASK_TYPE_ALIBI_NO_BATCH_WITH_PREFIX = 10 +MASK_TYPE_RAZOR_FUSION = 11 +UNPAD_FLASH_ATTENTION_ENCODER_PREFIX_CACHE_ND = 2012 +CAL_TYPE_PREFIX_ENCODER = 4 +MASK_TYPE_ALIBI_COMPRESS = 4 +MASK_TYPE_CAUSAL_MASK = 9 +MASK_TYPE_ALIBI_COMPRESS_SQRT = 5 +KERNELTYPE_HIGH_PRECISION = 1 + +def gen_seq_len(batch, max_seq, variate_seq=False): + if variate_seq: + num = max_seq // 16 + seqlen_aligned_arange = np.arange(1, num) * 16 + if batch > num: + seqlen_aligned_remain = np.random.randint(1, max_seq, size=(batch - num)) + seqlen_aligned_remain[:] = ((seqlen_aligned_remain[:] + 15) // 16) * 16 + seqlen_aligned = np.concatenate((seqlen_aligned_arange, seqlen_aligned_remain), 0) + else: + seqlen_aligned 
= seqlen_aligned_arange + sp_list = np.random.randint(0, 15, size=(num - 1)) + seqlen = seqlen_aligned - sp_list + seqlen = seqlen[-batch:] + seqlen_aligned = seqlen_aligned[-batch:] + print(seqlen) + else: + max_seq_aligned = (max_seq + 15) // 16 * 16 + sp_list = np.ones((batch,)) * (max_seq_aligned - max_seq) + sp_list = sp_list.astype(np.int32) + seqlen = np.ones((batch,)) * max_seq + seqlen = seqlen.astype(np.int32) + print(seqlen) + seqlen_aligned = np.ones((batch,)) * max_seq_aligned + seqlen_aligned = seqlen_aligned.astype(np.int32) + + ntokens = seqlen.sum() + print("ntokens:", ntokens) + return seqlen, seqlen_aligned, ntokens + +def group_matmul(heads, group_num, A, B): + group_head = heads // group_num + score = None + for i in range(group_num): + group_score = np.matmul(A[i * group_head: (i + 1) * group_head, :, :].astype(np.float32), + B[i:(i + 1), :, :].astype(np.float32)).astype(np.float16) + if score is None: + score = group_score + else: + score = np.concatenate((score, group_score), 0) + print(score.shape) + return score + +def gen_swa_cmp(window_size, embeddim): + swa_mask = np.ones(shape=(1, 512, 512)) * -10000.0 + pp_n = 128 if embeddim <= 128 else 64 + # pp_n = 128 + if window_size <= pp_n * 3: + true_size = window_size + else: + if window_size % pp_n == 0: + true_size = pp_n * 3 + else: + true_size = pp_n * 2 + window_size % pp_n + triu_mask = np.triu(swa_mask, 1) + tril_mask = np.tril(swa_mask, -true_size) + swa_mask = triu_mask + tril_mask + swa_mask = swa_mask.reshape(512,512) + return swa_mask + +class TestFlashAttention(operation_test.OperationTest): + + def close_pack(self, in_data, seq_len): + kv = in_data.numpy() + dim1len = np.size(kv, -2) + if max(seq_len) > dim1len: + return None + kv = kv.reshape(np.prod(kv.shape[0:-1]), kv.shape[-1]) + c_offset = 0 + s_offset = 0 + for i, len in enumerate(seq_len): + kv[c_offset:c_offset + seq_len[i]][:] = kv[s_offset:s_offset + seq_len[i]][:] + c_offset += seq_len[i] + s_offset += dim1len + 
return torch.from_numpy(kv[0:sum(seq_len)][:]) + + def calc_expect_func(self, batch, seqlen, heads, embed, window_size, mask_type, group_num=32): + is_mask = True + self.is_mask = is_mask + variate_seq = False + is_decoder = False + self.is_decoder = is_decoder + max_seq = 2048 + self.max_seq = max_seq + src_type = 'float16' + fp32 = True + print(f"group_num: {group_num}") + print("q_seq is:") + if is_decoder: + q_seqlen, q_seqlen_aligned, q_ntokens = gen_seq_len(batch, 1, variate_seq) + kv_seqlen, kv_seqlen_aligned, kv_ntokens = gen_seq_len(batch, seqlen, variate_seq) + else: + q_seqlen, q_seqlen_aligned, q_ntokens = gen_seq_len(batch, seqlen, variate_seq) + kv_seqlen, kv_seqlen_aligned, kv_ntokens = q_seqlen, q_seqlen_aligned, q_ntokens # crossattention时,q_seqlen != k_seqlen + + self.q_seqlen, self.q_seqlen_aligned, self.q_ntokens, self.kv_seqLen = q_seqlen, q_seqlen_aligned, q_ntokens, kv_seqlen + print("qseqlen is ", self.q_seqlen) + self.kv_seqlen_aligned, self.kv_ntokens = kv_seqlen_aligned, kv_ntokens + max_s = np.max(q_seqlen) + ntokens2 = (q_seqlen * kv_seqlen).sum() + embed_v = embed + + q = np.random.uniform(-1.0, 1.0, size=(q_ntokens, heads * embed)).astype(np.float16) + k = np.random.uniform(-1.0, 1.0, size=(kv_ntokens, group_num * embed)).astype(np.float16) + v = np.random.uniform(-1.0, 1.0, size=(kv_ntokens, group_num * embed_v)).astype(np.float16) + self.heads, self.embeddim, self.embeddimv = heads, embed, embed_v + mask = np.ones(shape=(1, max_s, max_s)).astype(np.float16) # 使用当前最大seqlen生成mask + mask_u = np.triu(mask, 1) + mask_l = np.tril(mask, -window_size) + mask = mask_u + mask_l + mask *= -10000.0 + + q_offset = 0 + k_offset = 0 + v_offset = 0 + + s = None + _p = None + out = None + + for idx in range(batch): + q_s = q_seqlen[idx] + kv_s = kv_seqlen[idx] + q_slice = q[q_offset:q_offset + q_s][:] + q_slice = q_slice.reshape(q_s, heads, embed) + q_slice = np.transpose(q_slice, (1, 0, 2)) # (heads, q_seq, embed) + k_slice = k[k_offset:k_offset + 
kv_s][:] + k_slice = k_slice.reshape(kv_s, group_num, embed) + k_slice = np.transpose(k_slice, (1, 0, 2)) + k_slice_t = np.transpose(k_slice, (0, 2, 1)) # get K^T (kv_heads, embed, k_seq) + v_slice = v[v_offset:v_offset + kv_s][:] + v_slice = v_slice.reshape(kv_s, group_num, embed_v) + v_slice = np.transpose(v_slice, (1, 0, 2)) + score = group_matmul(heads, group_num, q_slice, k_slice_t) + if s is None: + s = score.reshape([-1, ]) + else: + s = np.concatenate((s, score.reshape([-1, ])), 0) + + tor = np.float16(1.0 / math.sqrt(1.0 * embed)) + score = score * tor + if is_mask: + score = score + mask[:, :q_s, :kv_s] + score_max = np.max(score, axis=-1) + score = score - score_max.reshape((heads, q_s, 1)) + score_exp = np.exp(score.astype(np.float32)) + if not fp32: + score_sum = np.sum(score_exp.astype(np.float16), axis=-1) + if _p is None: + _p = score_exp.astype(np.float16).reshape([-1, ]) + else: + _p = np.concatenate((_p, score_exp.astype(np.float16).reshape([-1, ])), 0) + p = score_exp.astype(np.float16) / score_sum.reshape((heads, q_s, 1)).astype(np.float16) + out_sub = group_matmul(heads, group_num, p, v_slice) + else: + score_sum = np.sum(score_exp, axis=-1) + if _p is None: + _p = score_exp.astype(np.float16).reshape([-1, ]) + else: + _p = np.concatenate((_p, score_exp.astype(np.float16).reshape([-1, ])), 0) + p = score_exp.astype(np.float16) + out_sub = group_matmul(heads, group_num, p, v_slice) + out_sub = out_sub / score_sum.reshape((heads, q_s, 1)).astype(np.float16) + + out_sub = out_sub.reshape(heads, q_s, embed_v) + out_sub = np.transpose(out_sub, (1, 0, 2)) + out_sub = np.ascontiguousarray(out_sub) + if out is None: + out = out_sub + else: + out = np.concatenate((out, out_sub), 0) + + q_offset += q_s + k_offset += kv_s + v_offset += kv_s + + print("==> data generate finished!") + + q = q.astype(src_type).reshape(-1, heads, embed) + k = k.astype(src_type).reshape(-1, group_num, embed) + v = v.astype(src_type).reshape(-1, group_num, embed_v) + # mask = 
mask.astype(src_type).reshape(max_s, max_s) + mask = gen_swa_cmp(window_size, embed).astype(src_type) + q_len = q_seqlen.astype(np.int32) + out = out.astype(src_type).reshape(-1, heads, embed_v) + ret_data = q, k, v, mask, q_len, out + return ret_data + + def set_data_params(self, dynamic_batch=False, batch_state=None, window_size=0, cache_type=0, + is_mask=True, is_decoder=False, is_alibi=False, is_razor_fusion = False, alibi_dim=4, + batch = 1, kv_head = 1, heads = 1, embeddim = 128, embeddimv = 0, max_seq = 2048, + kv_seqLen = [], is_clamp = 0, clamp_min = 0, preTokens = 0, nextTokens = 0, + tileQ = 0, tileKv = 0, razorLen = 0, baseM = 0, textQLen = 0, textKvLen = 0, + is_splitm = False, + clamp_max = 0, data_type = torch.float16, op_type = 0, mask_type = 0, + no_cache = False, long_seq = False, is_triu_mask = False, is_multi_layer = False, + is_sqrt = False, left_align = False, scaleType = ScaleType.SCALE_TOR.value, fav3 = False, + tor = 1, bnsd = False, is_compress = False, q_seqlens=None, num_blocks=None, + block_size=None): + self.dynamic_batch = dynamic_batch + self.batch_state = batch_state + self.is_mask = is_mask + self.is_decoder = is_decoder + self.is_alibi = is_alibi + self.preTokens = preTokens + self.nextTokens = nextTokens + self.tileQ = tileQ + self.tileKv = tileKv + self.razorLen = razorLen + self.baseM = baseM + self.textQLen = textQLen + self.textKvLen = textKvLen + self.is_razor_fusion = is_razor_fusion + self.alibi_dim = alibi_dim + self.batch = batch + self.kv_head = kv_head + self.heads = heads + self.embeddim = embeddim + self.embeddimv = embeddimv + self.max_seq = max_seq + self.kv_seqLen = kv_seqLen + self.dynamic_batch = dynamic_batch + self.is_clamp = is_clamp + self.clamp_min = clamp_min + self.clamp_max = clamp_max + self.data_type = data_type + self.no_cache = no_cache + self.long_seq = long_seq + self.mask_type = mask_type + self.is_triu_mask = is_triu_mask + self.is_multi_layer = is_multi_layer + self.is_sqrt = is_sqrt + 
self.left_align = left_align + self.fav3 = fav3 + self.scaleType = scaleType + self.tor = tor + self.is_int8_flag = False + self.online = False + self.bnsd = bnsd + self.window_size = window_size + self.is_compress = is_compress + self.cache_type = cache_type + self.q_seqlens = q_seqlens if q_seqlens is not None else kv_seqLen + + if self.embeddimv == 0: + self.embeddimv = self.embeddim + if is_decoder: + self.q_seqlen, self.q_ntokens = self.gen_seq_len(batch, [1] * batch) + else: + self.q_seqlen, self.q_ntokens = self.gen_seq_len(batch, self.q_seqlens) + self.kv_seqlen, self.kv_ntokens = self.gen_seq_len(batch, kv_seqLen) + # gen intensor for fa kernel + if is_multi_layer: + self.layer_id = torch.from_numpy(np.array([1], dtype=np.int32)).to(torch.int32) + else: + self.layer_id = torch.from_numpy(np.array([0], dtype=np.int32)).to(torch.int32) + print("here is ", self.q_seqlen) + self.q_max_seq = np.max(self.q_seqlen) + self.kv_max_seq = np.max(self.kv_seqlen) + q = torch.from_numpy(np.random.uniform(-5.0, 5.0, size=(self.q_ntokens, heads * self.embeddim))) + + self.q = q.to(data_type) + if num_blocks is None: + self.k = torch.from_numpy(np.random.uniform(-5.0, 5.0, size=(self.layer_id[0] + 1, batch, self.max_seq, kv_head * self.embeddim))).to(data_type) + self.v = torch.from_numpy(np.random.uniform(-5.0, 5.0, size=(self.layer_id[0] + 1, batch, self.max_seq, kv_head * self.embeddimv))).to(data_type) + if is_splitm: + maxKvSeqlen = max(self.kv_seqlen) + self.k = torch.from_numpy(np.random.uniform(-5.0, 5.0, size=(self.layer_id[0] + 1, batch, maxKvSeqlen, kv_head * self.embeddim))).to(data_type) + self.v = torch.from_numpy(np.random.uniform(-5.0, 5.0, size=(self.layer_id[0] + 1, batch, maxKvSeqlen, kv_head * self.embeddimv))).to(data_type) + else: + # kv cache shape: (num_blocks, block_size, num_heads, head_size) + self.k_cache = torch.from_numpy(np.random.uniform(-5.0, 5.0, size=(num_blocks, block_size, kv_head, embeddim))).to(data_type) + self.v_cache = 
torch.from_numpy(np.random.uniform(-5.0, 5.0, size=(num_blocks, block_size, kv_head, embeddim))).to(data_type) + + batch = len(kv_seqLen) + max_context_len = max(kv_seqLen) + max_num_blocks_per_seq = (max_context_len + block_size - 1) // block_size + block_tables = [] # (batch, max_num_blocks_per_seq) + offset = 0 + for i in range(batch): + num_blocks_cur_seq = (kv_seqLen[i] + block_size - 1) // block_size + # padding block table with 0 + block_table = [ + random.randint(0, num_blocks-1) if j < num_blocks_cur_seq else 0 for j in range(max_num_blocks_per_seq) + ] + offset += num_blocks_cur_seq + block_tables.append(block_table) + self.block_tables = torch.from_numpy(np.array(block_tables)).to(torch.int32) + self.k = torch.stack([self.k_cache[self.block_tables[torch.tensor(i, dtype=torch.long)].to(torch.long)].reshape(-1, kv_head * self.embeddim)[:max_context_len, :] for i in range(batch)]) + self.v = torch.stack([self.v_cache[self.block_tables[torch.tensor(i, dtype=torch.long)].to(torch.long)].reshape(-1, kv_head * self.embeddim)[:max_context_len, :] for i in range(batch)]) + self.k = self.k.reshape(1, batch, max_context_len, kv_head * self.embeddim) + self.v = self.v.reshape(1, batch, max_context_len, kv_head * self.embeddim) + + if self.fav3: + self.is_int8_flag = True + self.q_scale, self.q_offset, _ = self.quant_per_head(self.q, heads, embeddim, (self.q_ntokens, heads * self.embeddim)) + self.k_scale, self.k_offset, _ = self.quant_per_head(self.k, kv_head, embeddim, (self.layer_id[0] + 1, batch, self.max_seq, kv_head * self.embeddim)) + self.v_scale, self.v_offset, _ = self.quant_per_head(self.v, kv_head, embeddim, (self.layer_id[0] + 1, batch, self.max_seq, kv_head * self.embeddim)) + self.k_scale = (self.k_scale.view(kv_head, 1) * torch.ones([kv_head, heads // kv_head])).view(-1) + self.k_offset= (self.k_offset.view(kv_head, 1) * torch.ones([kv_head, heads // kv_head])).view(-1) + self.v_scale = (self.v_scale.view(kv_head, 1) * torch.ones([kv_head, heads // 
kv_head])).view(-1) + self.v_offset= (self.v_offset.view(kv_head, 1) * torch.ones([kv_head, heads // kv_head])).view(-1) + self.offline_scale = torch.from_numpy(np.random.uniform(1 / 127, 3 / 127, size=(heads))).to(torch.float32) + + self.q_int8 = torch.from_numpy(np.random.uniform(-5.0, 5.0, size=(self.q_ntokens, heads * self.embeddim))).to(torch.int8) + self.k_int8 = torch.from_numpy(np.random.uniform(-5.0, 5.0, size=(self.layer_id[0] + 1, batch, self.max_seq, kv_head * self.embeddim))).to(torch.int8) + self.v_int8 = torch.from_numpy(np.random.uniform(-5.0, 5.0, size=(self.layer_id[0] + 1, batch, self.max_seq, kv_head * self.embeddimv))).to(torch.int8) + + self.gen_mask(batch, heads, data_type, mask_type, window_size, is_compress, cache_type) + print("**********data gen shape***********") + print(f"q shape: {self.q.shape}") + print(f"k shape: {self.k.shape}") + print(f"v shape: {self.v.shape}") + print(f"layer_id shape: {self.layer_id.shape}") + print(f"mask shape: {self.mask.shape}") + + def quant_per_head(self, data, heads, embeddim, shape): + temp = data.view(-1, heads, self.embeddim).to(torch.float32) + scale = torch.stack([self.fav3_quant(temp[:, i, :], data_min = -1, data_max = 1, symmetric = True)[0] for i in range(heads)]) + offset = torch.stack([self.fav3_quant(temp[:, i, :], data_min = -1, data_max = 1, symmetric = True)[1] for i in range(heads)]) + int8_data = torch.zeros_like(temp) + for i in range(heads): + int8_data[:, i, :] = ((temp[:, i, :] / scale[i]).round_() + offset[i]) + int8_data = int8_data.view(shape).to(torch.int8) + return scale, offset, int8_data + + def fav3_quant(self, data, data_min = 0, data_max = 0, symmetric = False, bit = 8): + n = 2 ** (bit - 1) + if symmetric: + quant_min, quant_max = -(n - 1), (n - 1) + else: + quant_min, quant_max = -n, (n - 1) + span = quant_max - quant_min + if data_min == data_max: + data_max = data.max().item() + data_min = data.min().item() + if symmetric: + scale = max(data_max, -data_min) / 
(float(span) / 2) + offset = 0 + else: + scale = (data_max - data_min) / float(span) + offset = (data_min * quant_min + data_max * quant_max) / (data_min - data_max) + # 量化公式:x / scale + offset + return torch.tensor(float(scale), dtype = torch.float), torch.tensor(int(offset), dtype = torch.float) + + def get_alibi_slopes(self, n_heads): + n = 2 ** math.floor(math.log2(n_heads)) + m0 = 2.0 ** (-8.0 / n) + slopes = torch.pow(m0, torch.arange(1, n + 1)) + if n < n_heads: + m1 = 2.0 ** ( -4.0 / n) + mm = torch.pow(m1, torch.arange(1, 1 + 2 * (n_heads - n), 2)) + slopes = torch.cat([slopes, mm]) + return slopes + + def get_alibi_bias(self, n_heads, max_seqlen): + if not self.left_align: + self.bias = torch.arange(max_seqlen) + self.bias = self.bias[None, :] - self.bias[:, None] + if (self.is_sqrt): + self.bias = torch.sqrt(torch.abs(self.bias)) * torch.sign(self.bias) + bias = torch.empty( + n_heads, + max_seqlen, + max_seqlen + )[:, :max_seqlen, :max_seqlen].copy_(self.bias) + self.alibi_slopes = self.get_alibi_slopes(n_heads) + else: + self.bias = torch.arange(max_seqlen, dtype=torch.float32).unsqueeze(0).unsqueeze(0).expand(n_heads, max_seqlen, -1) + self.alibi_slopes = torch.Tensor(self.get_interleave(n_heads)) + bias = self.bias + bias = bias * self.alibi_slopes[:, None, None] + return bias + + def get_interleave(self, n, alibi_bias_max=8.0): + def get_interleave_power_of_2(n, alibi_bias_max): + if n == 0: + return 0 + start = (2 ** (-2 ** -(math.log2(n) - 3))) + ratio = start + return [start * ratio ** i for i in range(n)] + if math.log2(n).is_integer(): + return get_interleave_power_of_2(n, alibi_bias_max) + else: + closest_power_of_2 = 2 ** math.floor(math.log2(n)) + return get_interleave_power_of_2(closest_power_of_2, alibi_bias_max) + \ + self.get_interleave(2 * closest_power_of_2)[0::2][:n - closest_power_of_2] + + def gen_swa_cmp(self, max_seq, window_size): + print("self.pre_mask_coff", self.pre_mask_coff) + swa_mask = np.ones(shape=(1, 512, 512)) * 
self.pre_mask_coff + print("gen_swa_cmp ", swa_mask.shape) + pp_n = 128 if self.embeddim <= 128 else 64 + pp_n = 128 if self.embeddim != self.embeddimv else pp_n + if window_size <= pp_n * 3: + true_size = window_size + elif window_size % pp_n == 0: + true_size = pp_n * 3 + else: + true_size = pp_n * 2 + window_size % pp_n + triu_mask = np.triu(swa_mask, 1) + tril_mask = np.tril(swa_mask, -true_size) + print("gen_swa_cmp ", swa_mask.shape) + print("gen_swa_cmp ", tril_mask.shape) + swa_mask = triu_mask + tril_mask + swa_mask = torch.from_numpy(swa_mask).to(torch.float32) + print("gen_swa_cmp ", swa_mask.shape) + return swa_mask + + def gen_razor_fusion_mask(self, razorLen, tileQ, tileKv, textQLen, textKvLen, preTokens, nextTokens, baseM): + np.set_printoptions(threshold=np.inf) + + mask_sizeQ = razorLen * tileQ + textQLen + mask_sizeK = razorLen * tileKv + textKvLen + logging.info("generate razor mask:", razorLen, tileQ, tileKv, textQLen, textKvLen, preTokens, nextTokens, baseM) + mask = np.zeros((mask_sizeQ, mask_sizeK), dtype=int) + preTokensBlock = preTokens // baseM + nextTokensBlock = nextTokens // baseM + idx = razorLen // baseM * baseM + mask[:, int(idx) : int(razorLen)] = 0 + mask[int(idx) : int(razorLen), :] = 0 + for i in range((razorLen + baseM - 1) // baseM): + start = i - preTokensBlock + 1 if i >= preTokensBlock else 0 + end = i + nextTokensBlock if i < preTokensBlock else start + preTokensBlock + nextTokensBlock - 1 + end = (razorLen + baseM - 1) // baseM if end > (razorLen + baseM - 1) // baseM else end + for j in range(start, end): + mask[i * baseM : (i + 1) * baseM, j * baseM : (j + 1) * baseM] = 1 + mask[razorLen :, :] = 0 + mask[:, razorLen :] = 0 + for i in range(tileQ): + for j in range(tileKv): + mask[i * razorLen : (i + 1) * razorLen, j * razorLen : (j + 1) * razorLen] = mask[0 : razorLen, 0 : razorLen] + + mask[razorLen * tileQ : , :] = 1 + mask[: , razorLen * tileKv :] = 1 + mask = mask[None, None, :] + mask = 1 - mask + return mask * 
-10000 + + def gen_swa_mask(self, max_seq, window_size, pre_mask_coff, cache_type=0): + swa_mask = np.ones(shape=self.mask_info[0]) * pre_mask_coff + logging.info(f"gen_swa_mask: window_size {window_size} max_seq {max_seq} self.kv_seqLen {self.kv_seqLen}") + if window_size < max_seq and self.is_decoder: + if cache_type == 1: + for idx, kvseqlen in enumerate(self.kv_seqLen): + swa_mask[idx, :, :window_size] = 0 + else: + for idx, kvseqlen in enumerate(self.kv_seqLen): + swa_mask[idx, :, kvseqlen - window_size: kvseqlen] = 0 + elif window_size < max_seq or self.is_compress: + triu_mask = np.triu(swa_mask, 1) + tril_mask = np.tril(swa_mask, -window_size) + swa_mask = triu_mask + tril_mask + else: + swa_mask = np.triu(swa_mask, 1) + return swa_mask + + def gen_mask(self, batch, heads, data_type, mask_type, window_size, is_compress, cache_type=0): + import random + q_max_seq = self.max_seq + kv_max_seq = self.max_seq + mask_type_dict = { + # 四维的alibi mask + MASK_TYPE_ALIBI_WITH_BATCH : ((batch, heads, q_max_seq, kv_max_seq), (lambda mask, idx, q_s, kv_s: (mask[idx, :, :q_s, :kv_s]))), + MASK_TYPE_ALIBI_WITH_PREFIX_BATCH : ((batch, heads, q_max_seq, kv_max_seq), (lambda mask, idx, q_s, kv_s: (mask[idx, :, kv_s-q_s:kv_s, :kv_s]))), + # 三维的alibi mask + MASK_TYPE_ALIBI_NO_BATCH : ((heads, q_max_seq, kv_max_seq), (lambda mask, idx, q_s, kv_s: (mask[:, :q_s, :kv_s]))), + MASK_TYPE_ALIBI_NO_BATCH_WITH_PREFIX : ((heads, q_max_seq, kv_max_seq), (lambda mask, idx, q_s, kv_s: (mask[:, kv_s-q_s:kv_s, :kv_s]))), + MASK_TYPE_NO_HEAD : ((batch, q_max_seq, kv_max_seq), (lambda mask, idx, q_s, kv_s: (mask[idx, :q_s, :kv_s]))), + MASK_TYPE_NO_HEAD_DECODER : ((batch, 1, kv_max_seq), (lambda mask, idx, q_s, kv_s: (mask[idx, :q_s, :kv_s]))), + MASK_TYPE_NO_BATCH : ((1, q_max_seq, kv_max_seq), (lambda mask, idx, q_s, kv_s: (mask[:, :q_s, :kv_s]))), + MASK_TYPE_NO_BATCH_WITH_PREFIX : ((1, q_max_seq, kv_max_seq), (lambda mask, idx, q_s, kv_s: (mask[:, kv_s-q_s:kv_s, :kv_s]))), + MASK_TYPE_SWA 
: ((1, q_max_seq, kv_max_seq), (lambda mask, idx, q_s, kv_s: (mask[:, :q_s, :kv_s]))), + MASK_TYPE_SWA_DECODER : ((batch, 1, kv_max_seq), (lambda mask, idx, q_s, kv_s: (mask[idx, :q_s, :kv_s]))), + # 不加mask + MASK_TYPE_RAZOR_FUSION : ((1, q_max_seq, kv_max_seq), (lambda mask, idx, q_s, kv_s: (mask[:q_s, :kv_s]))), + MASK_TYPE_NO_MASK : ((1, q_max_seq, kv_max_seq), (lambda mask, idx, q_s, kv_s: 0)) + } + # kernel中mask的系数 + if data_type == torch.float16: + post_mask_coff = 1 + pre_mask_coff = -10000.0 + elif data_type == torch.bfloat16 and self.is_alibi: + post_mask_coff = 1 + pre_mask_coff = -float("inf") + elif data_type == torch.float32 and self.is_alibi: + post_mask_coff = 1 + pre_mask_coff = 1 + else: + post_mask_coff = -3e38 + pre_mask_coff = 1 + if data_type == torch.float16: + if self.window_size > 0: + select_zero = False + elif self.is_alibi or self.long_seq: + select_zero = False + else: + select_zero = True + elif data_type == torch.bfloat16: + if self.window_size > 0: + select_zero = False + elif self.is_alibi: + select_zero = False + elif self.dynamic_batch or self.is_decoder: + select_zero = True + else: + select_zero = False + else: + if self.is_alibi or self.is_decoder: + select_zero = True + else: + select_zero = False + if self.is_triu_mask: + select_zero = False + + self.mask_info = mask_type_dict[mask_type] + mask = np.ones(shape=self.mask_info[0]) * pre_mask_coff + mask = np.triu(mask, 1) + zero_indice = random.choices(range(self.max_seq), k = 300) + if self.window_size > 0: + mask = self.gen_swa_mask(self.max_seq, window_size, pre_mask_coff, cache_type) + if self.is_alibi: + self.alibi_bias = self.get_alibi_bias(heads, self.max_seq) + mask += self.alibi_bias.numpy() + if select_zero: + mask.flat[zero_indice] = 0 + if self.is_razor_fusion: + mask = self.gen_razor_fusion_mask(self.razorLen, self.tileQ, self.tileKv, self.textQLen, self.textKvLen, + self.preTokens, self.nextTokens, self.baseM) + post_mask_coff = 1 + self.mask = 
torch.from_numpy(mask).to(torch.float32) + self.post_mask_coff = post_mask_coff + self.pre_mask_coff = pre_mask_coff + + def quantize_tensor_symmetric(self, x, prev_max_abs_vals=None, num_bits=8): + if x.dtype != torch.float: + x = x.to(torch.float) + + quant_min = -2 ** (num_bits - 1) + quant_max = 2 ** (num_bits - 1) - 1 + + current_max_abs_vals = x.abs().max(dim=1).values + if prev_max_abs_vals is not None: + max_abs_vals = torch.max(prev_max_abs_vals, current_max_abs_vals) + else: + max_abs_vals = current_max_abs_vals + scales = max_abs_vals / (quant_max) + x_q = torch.clamp(torch.round(x / scales.unsqueeze(1)), quant_min, quant_max) + x_q = torch.round(x_q) + x_q = x_q.to(torch.int8) + return x_q, scales, max_abs_vals + + def dequantize_tensor(self, x_q, scales, value): + x_deq = x_q.to(torch.float32) + scales = scales.unsqueeze(1) + x_deq = x_deq * value + x_deq = x_deq * scales + return x_deq + + def online_softmax(self, s_qk, q_s, v_slice, heads, kv_head, embed, online, dtype): + ans = None + group_num = heads // kv_head + for head_idx in range(heads): + s_head_idx = s_qk[head_idx] + O = torch.zeros((q_s, embed)).to(dtype) + Br = q_s + Bc = 128 + self.row_block_size = Br + self.col_block_size = Bc + d = embed + V_mat = v_slice[head_idx // group_num] + Tr = q_s // Br + Tc = q_s // Bc + + d = embed + Tr = q_s // Br + Tc = q_s // Bc + + start_row_idx = 0 + start_col_idx = 0 + + for i in range(Tr): + + Oi = torch.zeros((Br, d)).to(dtype) # shape Br x d + li = torch.zeros((Br, 1)).to(dtype) # shape Br x 1 + mi = torch.full((Br, 1), -torch.inf).to(dtype) # shape Br x 1 + pp_max_num = None + + for j in range(Tc): + + Sij = s_head_idx[i * Br : (i + 1) * Br, start_col_idx + j * Bc : start_col_idx + (j + 1) * Bc].to(dtype) + + Vj = V_mat[start_col_idx + j * Bc : start_col_idx + (j + 1) * Bc, :] + + mi_new = torch.max( + torch.column_stack([mi, torch.max(Sij, dim=1).values[:, None]]), dim=1 + ).values[:, None].to(dtype) + Pij_hat = torch.exp((Sij - 
mi_new).to(torch.float32)) + Pij_hat = Pij_hat.to(dtype) + li = torch.exp((mi - mi_new).to(torch.float32)).to(dtype) * li + torch.sum(Pij_hat, dim=1)[:, None] + if self.is_int8_flag: + if online: + x_q, scales, pp_max_num = self.quantize_tensor_symmetric(Pij_hat, pp_max_num) + if pp_max_num == None: + pp_max_num = pp_max_num + pv = x_q.to(torch.int32) @ Vj.to(torch.int32) + Oi = Oi * torch.exp((mi - mi_new).to(torch.float32)).to(dtype) + self.dequantize_tensor(pv, scales, self.v_scale[head_idx]).to(dtype) + else: + x_q = Pij_hat / self.offline_scale[head_idx] + x_q = torch.round(x_q.to(torch.float32)) + pv = x_q.to(torch.int32) @ Vj.to(torch.int32) + pv = pv.to(torch.float32) + value = self.v_scale[head_idx] * self.offline_scale[head_idx] + Oi = Oi * torch.exp((mi - mi_new).to(torch.float32)).to(dtype) + (pv * value).to(dtype) + else: + Oi = Oi * torch.exp((mi - mi_new).to(torch.float32)).to(dtype) + Pij_hat @ Vj.to(dtype) + + mi = mi_new + + if (q_s % Bc != 0): + Bc = q_s % Bc + start_row_idx = (q_s // self.row_block_size) * self.row_block_size + start_col_idx = (q_s // self.col_block_size) * self.col_block_size + + Sij = s_head_idx[i * Br : (i + 1) * Br, start_col_idx : start_col_idx + Bc].to(dtype) + Vj = V_mat[start_col_idx : start_col_idx + Bc, :] + mi_new = torch.max( + torch.column_stack([mi, torch.max(Sij, dim=1).values[:, None]]), dim=1 + ).values[:, None].to(dtype) + Pij_hat = torch.exp((Sij - mi_new).to(torch.float32)) + Pij_hat = Pij_hat.to(dtype) + li = torch.exp((mi - mi_new).to(torch.float32)).to(dtype) * li + torch.sum(Pij_hat, dim=1)[:, None] + if self.is_int8_flag: + if online: + x_q, scales, pp_max_num = self.quantize_tensor_symmetric(Pij_hat, pp_max_num) + if pp_max_num == None: + pp_max_num = pp_max_num + pv = x_q.to(torch.int32) @ Vj.to(torch.int32) + Oi = Oi * torch.exp((mi - mi_new).to(torch.float32)).to(dtype) + self.dequantize_tensor(pv, scales, self.v_scale[head_idx]).to(dtype) + else: + x_q = Pij_hat / self.offline_scale[head_idx] + x_q 
= torch.round(x_q.to(torch.float32)) + pv = x_q.to(torch.int32) @ Vj.to(torch.int32) + pv = pv.to(torch.float32) + value = self.v_scale[head_idx] * self.offline_scale[head_idx] + Oi = Oi * torch.exp((mi - mi_new).to(torch.float32)).to(dtype) + (pv * value).to(dtype) + else: + Oi = Oi * torch.exp((mi - mi_new).to(torch.float32)).to(dtype) + Pij_hat @ Vj.to(dtype) + Oi = Oi / li + + O[i * Br : (i + 1) * Br, :] = Oi + + if ans is None: + ans = O + else: + ans = torch.cat((ans, O), 1) + return ans + + def gen_out_tensor(self, online=False): + q_offset = 0 + k_offset = 0 + v_offset = 0 + batch = self.batch + dynamic_batch = self.dynamic_batch + batch_state = self.batch_state + heads = self.heads + is_decoder = self.is_decoder + embed = self.embeddim + embedv = self.embeddimv + max_seq = self.max_seq + q_seqlen = self.q_seqlen + kv_seqlen = self.kv_seqLen + kv_head = self.kv_head + mask = self.mask + is_mask = self.is_mask + is_razor_fusion = self.is_razor_fusion + q = self.q + k = self.k + v = self.v + if self.fav3: + q = self.q_int8 + k = self.k_int8 + v = self.v_int8 + q_ntokens = self.q_ntokens + kv_ntokens = self.kv_ntokens + layer_id = self.layer_id[0] + s = None + _p = None + out = None + ans_concat = None + ans_concat_true = None + out_true = None + + self.encoder_logN = torch.tensor([2.0] * self.max_seq).to(torch.float32) + self.encoder_logN.uniform_(1, 2) + self.decoder_logN = torch.tensor([2.0] * batch).to(torch.float32) + self.decoder_logN.uniform_(1, 2) + for idx in range(batch): + if dynamic_batch and batch_state[idx] == 0 and not is_decoder: + continue + if dynamic_batch and batch_state[idx] == 0: + output = torch.zeros([heads, q_s, embedv]) + output = torch.permute(output, (1, 0, 2)) + if out is None: + out = output + if not self.fav3: + out_true = output + else: + out = torch.cat((out, output), 0) + if not self.fav3: + out_true = torch.cat((out_true, output), 0) + q_offset += q_s + k_offset += max_seq + v_offset += max_seq + continue + q_s = 
q_seqlen[idx] + kv_s = kv_seqlen[idx] + q_slice = q[q_offset:q_offset + q_s][:] + q_slice = q_slice.view(q_s, heads, embed) + q_slice = torch.permute(q_slice, (1, 0, 2)) + k_slice = k[layer_id][idx][:kv_s][:] + k_slice = k_slice.view(kv_s, kv_head, embed) + k_slice_t = torch.permute(k_slice, (1, 2, 0)) # get K^T + v_slice = v[layer_id][idx][:kv_s][:] + v_slice = v_slice.view(kv_s, kv_head, embedv) + v_slice = torch.permute(v_slice, (1, 0, 2)) + + if self.fav3: + score = self.group_mm_torch(heads, kv_head, q_slice, k_slice_t, torch.int32) + else: + score = self.group_mm_torch(heads, kv_head, q_slice, k_slice_t) + if self.fav3: + # score:[heads,m,n] + score = score.to(torch.float32) + score = score * self.q_scale.view(heads, 1, 1) + score = score.to(torch.float16) + + if s is None: + s = score.view([-1, ]) + else: + s = torch.cat((s, score.view([-1, ])), 0) + + if self.scaleType == ScaleType.SCALE_LOGN_FP32.value: + if is_decoder: + score *= self.decoder_logN[idx] + else: + score *= self.encoder_logN[None, :q_s, None] + + if self.fav3: + score = score * torch.tensor(self.tor, dtype=torch.float16) + else: + score *= self.tor + + if self.is_clamp == 1: + clamp_min_brc = np.ones((score.shape)) * self.clamp_min + clamp_max_brc = np.ones((score.shape)) * self.clamp_max + score = np.float16(np.maximum(score, clamp_min_brc)) + score = torch.from_numpy(np.float16(np.minimum(score, clamp_max_brc))) + temp_mask = self.mask_info[1](self.mask, idx, q_s, kv_s) * self.post_mask_coff + if is_mask or is_razor_fusion: + score = score + temp_mask + + s_qk = score + s_qk_true = score.to(torch.float32) + score = score.numpy().astype(np.float32) + + if self.is_int8_flag: + ans = self.online_softmax(s_qk, q_s, v_slice, heads, kv_head, embed, online, torch.float16) + if ans_concat is None: + ans_concat = ans + else: + ans_concat = torch.cat((ans_concat, ans), 0) + + ans_true = self.online_softmax(s_qk_true, q_s, v_slice, heads, kv_head, embed, online, torch.float32) + if ans_concat_true is 
None: + ans_concat_true = ans_true + else: + ans_concat_true = torch.cat((ans_concat_true, ans_true), 0) + + score_max = np.max(score, axis=-1) + score = score - score_max.reshape((heads, q_s, 1)) + score_exp = np.exp(score) + score_sum = np.sum(score_exp, axis=-1) + + if _p is None: + _p = score_exp.astype(np.float32).reshape([-1, ]) + else: + _p = np.concatenate( + (_p, score_exp.astype(np.float32).reshape([-1, ])), 0) + if self.fav3: + p = score_exp + p = p * 127 + p = torch.from_numpy(p).to(torch.int8) + else: + p_true = (score_exp / score_sum.reshape((heads, q_s, 1))) + p_true = torch.from_numpy(p_true) + p = p_true.to(torch.bfloat16) + o_true = self.group_mm_torch(heads, kv_head, p_true, v_slice) + + o = self.group_mm_torch(heads, kv_head, p, v_slice) + if self.fav3: + o = o.to(torch.float) + v_scale = self.v_scale + v_scale = v_scale.view(heads, 1, 1) + o = o * v_scale + o = o / 127 + o = o / score_sum.reshape((heads, q_s, 1)) + else: + o_true = o_true.view(heads, q_s, embedv) + o_true = torch.permute(o_true, (1, 0, 2)).contiguous() + o = o.view(heads, q_s, embedv) + o = torch.permute(o, (1, 0, 2)).contiguous() + if out is None: + out = o + if not self.fav3: + out_true = o_true + else: + out = torch.cat((out, o), 0) + if not self.fav3: + out_true = torch.cat((out_true, o_true), 0) + + q_offset += q_s + k_offset += max_seq + v_offset += max_seq + # golden data + print("now is: ", q_ntokens, heads, embedv) + + if self.is_int8_flag: + ans_concat = ans_concat.view(q_ntokens, heads * embedv) + ans_concat_true = ans_concat_true.view(q_ntokens, heads * embedv) + self.golden_out = ans_concat + self.golden_out_true = ans_concat_true + else: + out = out.view(q_ntokens, heads * embedv) + self.golden_out = out.to(self.data_type) + out_true = out_true.view(q_ntokens, heads * embedv) + self.golden_out_true = out_true.to(torch.float32) + + if self.no_cache: + self.k = self.close_pack(self.k.to(torch.float32), kv_seqlen).to(self.data_type) + self.v = 
self.close_pack(self.v.to(torch.float32), kv_seqlen).to(self.data_type) + if self.fav3: + self.k_int8 = self.close_pack(self.k_int8.to(torch.float32), kv_seqlen).to(torch.int8) + self.v_int8 = self.close_pack(self.v_int8.to(torch.float32), kv_seqlen).to(torch.int8) + if self.long_seq: + self.max_seq = 128 + self.gen_mask(self.batch, self.heads, self.data_type, self.mask_type, 0, False, 0) + + def gen_out_tensor_bnsd(self): + q_offset = 0 + k_offset = 0 + v_offset = 0 + batch = self.batch + dynamic_batch = self.dynamic_batch + batch_state = self.batch_state + heads = self.heads + is_decoder = self.is_decoder + embed = self.embeddim + embedv = self.embeddimv + max_seq = self.max_seq + q_seqlen = self.q_seqlen + kv_seqlen = self.kv_seqLen + kv_head = self.kv_head + mask = self.mask + is_mask = self.is_mask + q = self.q + k = self.k + v = self.v + q_ntokens = self.q_ntokens + kv_ntokens = self.kv_ntokens + layer_id = self.layer_id[0] + s = None + _p = None + out = None + obsnd = torch.zeros(batch, max_seq, heads, embedv) + out_true_bnsd = torch.zeros(batch, max_seq, heads, embedv) + kbsnd=k.view(layer_id+1,batch,max_seq,kv_head,embed) + vbsnd=v.view(layer_id+1,batch,max_seq,kv_head,embedv) + qbsnd = torch.zeros(batch, max_seq, heads, embed) + self.encoder_logN = torch.tensor([2.0] * self.max_seq).to(torch.float32) + self.encoder_logN.uniform_(1, 2) + self.decoder_logN = torch.tensor([2.0] * batch).to(torch.float32) + self.decoder_logN.uniform_(1, 2) + for idx in range(batch): + if dynamic_batch and batch_state[idx] == 0 and not is_decoder: + continue + if dynamic_batch and batch_state[idx] == 0: + output = torch.zeros([heads, q_s, embedv]) + output = torch.permute(output, (1, 0, 2)) + if out is None: + out = output + else: + out = torch.cat((out, output), 0) + q_offset += q_s + k_offset += max_seq + v_offset += max_seq + continue + # todo bs,n,d 转b,n,s,d + q_s = q_seqlen[idx] + kv_s = kv_seqlen[idx] + q_slice = q[q_offset:q_offset + q_s][:] + q_slice = 
q_slice.view(q_s, heads, embed) + for q_s_idx in range(q_s): + qbsnd[idx][q_s_idx] = q_slice[q_s_idx][:] + q_slice = torch.permute(q_slice, (1, 0, 2)) + k_slice = k[layer_id][idx][:kv_s][:] + k_slice = k_slice.view(kv_s, kv_head, embed) + k_slice_t = torch.permute(k_slice, (1, 2, 0)) # get K^T + v_slice = v[layer_id][idx][:kv_s][:] + v_slice = v_slice.view(kv_s, kv_head, embedv) + v_slice = torch.permute(v_slice, (1, 0, 2)) + + score = self.group_mm_torch(heads, kv_head, q_slice, k_slice_t) + if s is None: + s = score.view([-1, ]) + else: + s = torch.cat((s, score.view([-1, ])), 0) + score = score * self.tor + if self.scaleType == ScaleType.SCALE_LOGN_FP32.value: + if is_decoder: + score *= self.decoder_logN[idx] + else: + score *= self.encoder_logN[None, :q_s, None] + if self.is_clamp == 1: + clamp_min_brc = np.ones((score.shape)) * self.clamp_min + clamp_max_brc = np.ones((score.shape)) * self.clamp_max + score = np.float16(np.maximum(score, clamp_min_brc)) + score = torch.from_numpy(np.float16(np.minimum(score, clamp_max_brc))) + temp_mask = self.mask_info[1](self.mask, idx, q_s, kv_s) * self.post_mask_coff + if is_mask: + score = score + temp_mask + score = score.numpy().astype(np.float32) + score_max = np.max(score, axis=-1) + score = score - score_max.reshape((heads, q_s, 1)) + score_exp = np.exp(score) + score_sum = np.sum(score_exp, axis=-1) + + if _p is None: + _p = score_exp.astype(np.float32).reshape([-1, ]) + else: + _p = np.concatenate( + (_p, score_exp.astype(np.float32).reshape([-1, ])), 0) + + p_true = (score_exp / score_sum.reshape((heads, q_s, 1))) + p_true = torch.from_numpy(p_true) + o_true = self.group_mm_torch(heads, kv_head, p_true, v_slice) + o_true = o_true.view(heads, q_s, embedv) + o_true = torch.permute(o_true, (1, 0, 2)).contiguous() + + #根据数据类型转换 + p = p_true.to(torch.bfloat16) + o = self.group_mm_torch(heads, kv_head, p, v_slice) + o = o.view(heads, q_s, embedv) + o = torch.permute(o, (1, 0, 2)).contiguous() + if out is None: + out = 
o + out_true = o_true + else: + out = torch.cat((out, o), 0) + out_true = torch.cat((out_true, o_true), 0) + + for i in range(0,q_s): + obsnd[idx][i] = o[i] + out_true_bnsd[idx]=out_true[i] + q_offset += q_s + k_offset += max_seq + v_offset += max_seq + obnsd = torch.permute(obsnd, (0, 2, 1,3)) + out_true_bnsd = torch.permute(out_true_bnsd, (0, 2, 1,3)) + self.qbnsd = torch.permute(qbsnd, (0, 2, 1, 3)).to(self.data_type) + self.kbnsd = torch.permute(kbsnd, (0, 1, 3, 2, 4)).to(self.data_type) + self.vbnsd = torch.permute(vbsnd, (0, 1, 3, 2, 4)).to(self.data_type) + # golden data + out = out.view(q_ntokens, heads * embedv) + out_true = out_true.view(q_ntokens, heads * embedv) + if(self.is_decoder == 1): + self.golden_out = out + self.golden_out_true = out_true.to(torch.float32) + else: + self.golden_out = obnsd.to(self.data_type) + self.golden_out_true = out_true_bnsd.to(torch.float32) + logging.debug(f"golden_out shape: {self.golden_out.shape}") + + if self.no_cache: + self.k = self.close_pack(self.k.to(torch.float32), kv_seqlen).to(self.data_type) + self.v = self.close_pack(self.v.to(torch.float32), kv_seqlen).to(self.data_type) + if self.long_seq: + self.max_seq = 128 + self.gen_mask(self.batch, self.heads, self.data_type, self.mask_type) + + def gen_out_tensor_bnsd_splitm(self): + q_offset = 0 + k_offset = 0 + v_offset = 0 + batch = self.batch + dynamic_batch = self.dynamic_batch + batch_state = self.batch_state + heads = self.heads + is_decoder = self.is_decoder + embed = self.embeddim + embedv = self.embeddimv + max_seq = self.max_seq + q_seqlen = self.q_seqlen + kv_seqlen = self.kv_seqLen + kv_head = self.kv_head + mask = self.mask + is_mask = self.is_mask + q = self.q + k = self.k + v = self.v + q_ntokens = self.q_ntokens + kv_ntokens = self.kv_ntokens + layer_id = self.layer_id[0] + s = None + _p = None + out = None + maxQSeqlen = max(q_seqlen) + obsnd = torch.zeros(batch, maxQSeqlen, heads, embedv) + out_true_bnsd = torch.zeros(batch, maxQSeqlen, heads, 
embedv) + maxKvSeqlen = max(kv_seqlen) + kbsnd=k.view(layer_id+1,batch,maxKvSeqlen,kv_head,embed) + vbsnd=v.view(layer_id+1,batch,maxKvSeqlen,kv_head,embedv) + qbsnd = torch.zeros(batch, maxQSeqlen, heads, embed) + for idx in range(batch): + if dynamic_batch and batch_state[idx] == 0 and not is_decoder: + continue + if dynamic_batch and batch_state[idx] == 0: + output = torch.zeros([heads, q_s, embedv]) + output = torch.permute(output, (1, 0, 2)) + if out is None: + out = output + else: + out = torch.cat((out, output), 0) + q_offset += q_s + k_offset += max_seq + v_offset += max_seq + continue + # todo bs,n,d 转b,n,s,d + q_s = q_seqlen[idx] + kv_s = kv_seqlen[idx] + q_slice = q[q_offset:q_offset + q_s][:] + q_slice = q_slice.view(q_s, heads, embed) + for q_s_idx in range(q_s): + qbsnd[idx][q_s_idx] = q_slice[q_s_idx][:] + q_slice = torch.permute(q_slice, (1, 0, 2)) + k_slice = k[layer_id][idx][:kv_s][:] + k_slice = k_slice.view(kv_s, kv_head, embed) + k_slice_t = torch.permute(k_slice, (1, 2, 0)) # get K^T + v_slice = v[layer_id][idx][:kv_s][:] + v_slice = v_slice.view(kv_s, kv_head, embedv) + v_slice = torch.permute(v_slice, (1, 0, 2)) + + score = self.group_mm_torch(heads, kv_head, q_slice, k_slice_t) + if s is None: + s = score.view([-1, ]) + else: + s = torch.cat((s, score.view([-1, ])), 0) + score = score * self.tor + score = score.numpy().astype(np.float32) + score_max = np.max(score, axis=-1) + score = score - score_max.reshape((heads, q_s, 1)) + score_exp = np.exp(score) + score_sum = np.sum(score_exp, axis=-1) + + if _p is None: + _p = score_exp.astype(np.float32).reshape([-1, ]) + else: + _p = np.concatenate( + (_p, score_exp.astype(np.float32).reshape([-1, ])), 0) + + p_true = (score_exp / score_sum.reshape((heads, q_s, 1))) + p_true = torch.from_numpy(p_true) + o_true = self.group_mm_torch(heads, kv_head, p_true, v_slice) + o_true = o_true.view(heads, q_s, embedv) + o_true = torch.permute(o_true, (1, 0, 2)).contiguous() + + #根据数据类型转换 + p = 
p_true.to(torch.bfloat16) + o = self.group_mm_torch(heads, kv_head, p, v_slice) + o = o.view(heads, q_s, embedv) + o = torch.permute(o, (1, 0, 2)).contiguous() + + if out is None: + out = o + out_true = o_true + else: + out = torch.cat((out, o), 0) + out_true = torch.cat((out_true, o_true), 0) + for i in range(0,q_s): + obsnd[idx][i] = o[i] + out_true_bnsd[idx][i]=out_true[i] + q_offset += q_s + k_offset += kv_s + v_offset += kv_s + obnsd = torch.permute(obsnd, (0, 2, 1,3)) + out_true_bnsd = torch.permute(out_true_bnsd, (0, 2, 1,3)) + self.qbnsd = torch.permute(qbsnd, (0, 2, 1, 3)).to(self.data_type) + self.kbnsd = torch.permute(kbsnd, (0, 1, 3, 2, 4)).to(self.data_type) + self.vbnsd = torch.permute(vbsnd, (0, 1, 3, 2, 4)).to(self.data_type) + # golden data + out = out.view(q_ntokens, heads * embedv) + out_true = out_true.view(q_ntokens, heads * embedv) + self.golden_out = obnsd.to(self.data_type) + self.golden_out_true = out_true_bnsd.to(torch.float32) + logging.debug(f"golden_out shape: {self.golden_out.shape}") + + if self.no_cache: + self.k = self.close_pack(self.k.to(torch.float32), kv_seqlen).to(self.data_type) + self.v = self.close_pack(self.v.to(torch.float32), kv_seqlen).to(self.data_type) + + def gen_seq_len(self, batch, seq_len): + ntokens = sum(seq_len) + return seq_len, ntokens + + def compare_output_data(self, out, golden, ratios): + error_count = 0 + strict_error_count = 0 + fp16_min_normal = 1.0 / (1 << 14) + golden = golden.flatten().to(torch.float32) + out = out.flatten().to(torch.float32) + out_len = out.shape[0] + diff = torch.abs(golden - out) + max_diff = diff.max().item() + limit_error = torch.maximum(torch.abs(golden * ratios[0]), torch.tensor(ratios[1])) + strict_limit_error = torch.maximum(torch.abs(golden * ratios[2]), torch.tensor(ratios[3])) + error_count = torch.gt(diff, limit_error).sum().item() + strict_error_count = torch.gt(diff, strict_limit_error).sum().item() + logging.info(f"maxDiff {max_diff}") + logging.info("1/1000 Accuracy 
is %f", 1 - float(error_count) / out_len) + logging.info("5/1000 Accuracy is %f", 1 - float(strict_error_count) / out_len) + if self.data_type == torch.bfloat16: + logging.debug("accuracy is correct in old standard: %r", (float(strict_error_count) / out_len) <= ratios[2]) + else: + logging.debug("accuracy is correct in old standard: %r", (float(strict_error_count) / out_len) <= ratios[0]) + calc_times = self.heads * self.max_seq + 4 + if self.data_type == torch.bfloat16: + if calc_times < 2048: + error = 2**(-7) + else : + error = 2**(-6) + error_threshold = torch.clamp(torch.abs(golden), min = 1) * error + res = (diff <= error_threshold).all().item() + logging.debug("accuracy is correct in new standard: %r", res) + return res + elif self.data_type == torch.float16: + if calc_times < 2048: + error = 2**(-8) + else : + error = 2**(-7) + error_threshold = torch.clamp(torch.abs(golden), min = 1) * error + res = (diff <= error_threshold).all().item() + logging.debug("accuracy is correct in new standard: %r", res) + return res + else : + if calc_times < 2048: + error = 2**(-11) + elif calc_times >= 2048 and calc_times < 16384: + error = 2**(-10) + else: + error = 2**(-14) + error_threshold = torch.clamp(torch.abs(golden), min = 1) * error + res = (diff <= error_threshold).all().item() + logging.debug("accuracy is correct in new standard: %r", res) + return res + + def group_mm_torch(self, heads, group_num, A, B, dtype=torch.float32): + group_head = heads // group_num + score = None + for i in range(group_num): + group_score = torch.matmul(A[i * group_head: (i + 1) * group_head, :, :].to(dtype), B[i:(i + 1), :, :].to(dtype)) + if score is None: + score = group_score + else: + score = torch.cat((score, group_score), 0) + return score + + def golden_calc(self, in_tensors): + golden_out = self.golden_out.clone().detach().requires_grad_(True).half().npu() + return [golden_out] + + def golden_compare(self, out_tensors, golden_tensors): + print("max(golden_out): ", 
torch.max(golden_tensors[0].clone().detach().half().npu()).item(),) + print("min(golden_out): ", torch.min(golden_tensors[0].clone().detach().half().npu()).item(),) + print("max(actual out): ", torch.max(out_tensors[0].clone().detach().half().npu()).item(),) + print("min(actual out): ", torch.min(out_tensors[0].clone().detach().half().npu()).item(),) + # nan/inf + result_single = self.compare_output_data(out_tensors[0].clone().detach().half().npu(), + golden_tensors[0].clone().detach().half().npu(), + [0.001, 0.001, 0.005, 0.005]) + if self.is_int8_flag: + result_double = compare_cv(self.golden_out_true.clone().detach().half().npu(), + golden_tensors[0].clone().detach().half().npu(), out_tensors[0]) + return (result_double or result_single) + else: + result_double = compare_cv(self.golden_out_true.clone().detach().half().npu(), + golden_tensors[0].clone().detach().half().npu(), out_tensors[0]) + return (result_double or result_single) + + + def test_swa_decoder(self): + """ + is_decoder = 1, no_cache=False, "maskType": MASK_TYPE_SLIDING_WINDOW_NORM + qselen[i] = 1 for all i (decoder) + kv_seqLen[i] = 114 for all i + """ + if not operation_test.get_soc_version() == 'Ascend910B': + print("this testcase only supports Ascend910B") + return + self.data_type = torch.float16 + data_type = self.data_type + self.batch = 8 + batch = self.batch + self.kv_head = 32 # kv_head num + kv_head = self.kv_head + self.is_decoder = 1 # prefill or decoder + self.heads = 32 # llama7b hidden_size 4096 + self.embeddim = 128 + self.embeddim_v = self.embeddim + tor = 1 + self.dynamic_batch = False + kv_seqLen = [114] * batch + qSeqLen = [1] * batch + self.max_seq = max(max(kv_seqLen), max(qSeqLen)) + self.window_size = 16 + self.q_seqlen, self.q_ntokens = self.gen_seq_len(batch, [1] * batch) + self.kv_seqlen, self.kv_ntokens = self.gen_seq_len(batch, kv_seqLen) + self.layer_id = torch.from_numpy(np.array([0], dtype=np.int32)).to(torch.int32) + self.q_max_seq = np.max(self.q_seqlen) + 
self.kv_max_seq = np.max(self.kv_seqlen) + tor = np.float32(1.0 / math.sqrt(1.0 * self.embeddim)) + + self.q_scale = 1 + self.qk_scale = tor + self.cache_type = 1 + + OP_NAME = "SelfAttentionOperation" + OP_PARAM = {"type": 1} + self.set_data_params(cache_type=self.cache_type, is_decoder=self.is_decoder, batch=batch, kv_head=kv_head, heads=self.heads, + embeddim=self.embeddim, max_seq=self.max_seq, kv_seqLen=kv_seqLen, + data_type=data_type, long_seq = True, + op_type=OP_PARAM["type"], mask_type = MASK_TYPE_SWA, + no_cache=False, is_sqrt=False, tor=tor, q_seqlens=self.q_seqlen) + self.gen_out_tensor() + self.window_size = 16 + param = json.dumps( + {"headNum": self.heads, "qScale": float(self.q_scale), "qkScale": float(self.qk_scale), "maskType": 7, + "kvcacheCfg": self.cache_type, "calcType": 2, "windowSize": self.window_size}) + + self.param_seqlen = self.q_seqlen + self.param_token_offset = self.kv_seqlen + run_param = json.dumps({"tokenOffset": self.param_token_offset, "seqLen": self.param_seqlen, "maskType": 7}) + self.execute_with_param(OP_NAME, param, run_param, + [self.q.npu(), self.k.npu(), self.v.npu(), + torch.tensor(self.kv_seqlen).to(torch.int32).npu(), + torch.tensor(self.q_seqlen).to(torch.int32).npu(), self.layer_id.npu()]) + + + def test_swa_encoder_cache(self): + """ + is_decoder = 0, no_cache=False, "maskType": MASK_TYPE_SLIDING_WINDOW_NORM, cacheType = 1 + qselen = kv_seqLen = [33, 512, ...] 
+ """ + if not operation_test.get_soc_version() == 'Ascend910B': + print("this testcase only supports Ascend910B") + return + self.data_type = torch.float16 + data_type = self.data_type + self.batch = 8 + batch = self.batch + self.kv_head = 33 # kv_head num + kv_head = self.kv_head + self.is_decoder = 0 # prefill or decoder + self.heads = 33 # llama7b hidden_size 4096 + self.embeddim = 128 + self.embeddim_v = self.embeddim + self.dynamic_batch = False + kv_seqLen = [self.heads, 512] * (self.batch // 2) + self.max_seq = max(kv_seqLen) + + self.window_size = 16 + self.cacheType = 1 + self.kv_seqlen, self.kv_ntokens = self.gen_seq_len(batch, kv_seqLen) + self.q_seqlen, self.q_ntokens = self.kv_seqlen, self.kv_ntokens + + self.q_max_seq = np.max(self.q_seqlen) + self.kv_max_seq = np.max(self.kv_seqlen) + tor = np.float32(1.0 / math.sqrt(1.0 * self.embeddim)) + + self.q_scale = 1 + self.qk_scale = tor + self.cache_type = 1 + + OP_NAME = "SelfAttentionOperation" + OP_PARAM = {"type": 1} + print(f" self.q_ntokens 1 {self.q_ntokens}") + self.set_data_params(cache_type=self.cache_type, is_decoder=self.is_decoder, batch=batch, kv_head=kv_head, + heads=self.heads, embeddim=self.embeddim, max_seq=self.max_seq, kv_seqLen=kv_seqLen, + data_type=data_type, long_seq = False, op_type=OP_PARAM["type"], mask_type = MASK_TYPE_SWA, + no_cache=False, is_sqrt=False, tor=tor) + self.gen_out_tensor() + self.window_size = 16 + self.q_scale = 1 + self.qk_scale = tor + param = json.dumps( + {"headNum": self.heads, "qScale": float(self.q_scale), "qkScale": float(self.qk_scale), "maskType": 7, + "kvcacheCfg": 1, "calcType": 1, + "windowSize": self.window_size, "cacheType": self.cacheType}) + self.param_seqlen = self.kv_seqlen + self.param_token_offset = self.kv_seqlen + run_param = json.dumps({"tokenOffset": self.param_token_offset, "seqLen": self.param_seqlen, "maskType": 7}) + self.execute_with_param(OP_NAME, param, run_param, + [self.q.npu(), self.k.npu(), self.v.npu(), + 
self.mask.reshape(self.q_max_seq, self.kv_max_seq).to(data_type).npu(), + torch.tensor(self.kv_seqlen).to(torch.int32).npu(), + torch.tensor(self.kv_seqlen).to(torch.int32).npu(), self.layer_id.npu()]) + + def test_swa_decoder_cache(self): + """ + is_decoder = 0, no_cache=False, "maskType": MASK_TYPE_SLIDING_WINDOW_NORM, cacheType = 1 + qselen = kv_seqLen = [33, 512, ...] + """ + if not operation_test.get_soc_version() == 'Ascend910B': + print("this testcase only supports Ascend910B") + return + self.data_type = torch.bfloat16 + data_type = self.data_type + self.batch = 8 + batch = self.batch + self.kv_head = 32 # kv_head num + self.is_decoder = 1 # prefill or decoder + self.heads = 32 # llama7b hidden_size 4096 + self.embeddim = 128 + self.embeddim_v = self.embeddim + self.dynamic_batch = False + kv_seqLen = [32, 1024] * 4 + self.max_seq = 1024 + self.window_size = 64 + self.cacheType = 1 + self.q_seqlen, self.q_ntokens = self.gen_seq_len(batch, [1] * batch) + self.kv_seqlen, self.kv_ntokens = self.gen_seq_len(batch, kv_seqLen) + self.layer_id = torch.from_numpy(np.array([0], dtype=np.int32)).to(torch.int32) + self.q_max_seq = np.max(self.q_seqlen) + self.kv_max_seq = np.max(self.kv_seqlen) + tor = np.float32(1.0 / math.sqrt(1.0 * self.embeddim)) + + self.q_scale = 1 + self.qk_scale = tor + self.cache_type = 1 + + OP_NAME = "SelfAttentionOperation" + OP_PARAM = {"type": 1} + self.set_data_params(cache_type=self.cache_type, is_decoder=self.is_decoder, batch=batch, kv_head=self.kv_head, + heads=self.heads, embeddim=self.embeddim, max_seq=self.max_seq, kv_seqLen=kv_seqLen, + data_type=data_type, long_seq = False, op_type=OP_PARAM["type"], mask_type = MASK_TYPE_SWA, + no_cache=False, is_sqrt=False, tor=tor, q_seqlens=self.q_seqlen) + self.gen_out_tensor() + self.window_size = 64 + + param = json.dumps( + {"headNum": self.heads, "qScale": float(self.q_scale), "qkScale": float(self.qk_scale), "maskType": 7, + "kvcacheCfg": 1, "calcType": 2, "windowSize": 
self.window_size, "cacheType": 1}) + self.param_seqlen = self.q_seqlen + self.param_token_offset = self.kv_seqlen + run_param = json.dumps({"tokenOffset": self.param_token_offset, "seqLen": self.param_seqlen, "maskType": 7}) + self.execute_with_param(OP_NAME, param, run_param, + [self.q.npu(), self.k.npu(), self.v.npu(), + torch.tensor(self.kv_seqlen).to(torch.int32).npu(), + torch.tensor(self.q_seqlen).to(torch.int32).npu(), self.layer_id.npu()]) + + def test_swa_encoder(self): + """ + is_decoder = 0, no_cache=False, "maskType": MASK_TYPE_SLIDING_WINDOW_NORM, cacheType = 0 + qselen = kv_seqLen = [32, 256, ...] + norm swa mask + """ + if not operation_test.get_soc_version() == 'Ascend910B': + print("this testcase only supports Ascend910B") + return + self.data_type = torch.bfloat16 + data_type = self.data_type + self.batch = 8 + batch = self.batch + self.kv_head = 32 # kv_head num + kv_head = self.kv_head + self.is_decoder = 0 # prefill or decoder + self.heads = 32 # llama7b hidden_size 4096 + self.embeddim = 128 + self.embeddim_v = self.embeddim + self.dynamic_batch = False + kv_seqLen = [32, 256] * 4 + self.max_seq = 256 + self.window_size = 16 + self.cacheType = 0 + self.q_seqlen, self.q_ntokens = self.gen_seq_len(batch, kv_seqLen) + self.kv_seqlen, self.kv_ntokens = self.gen_seq_len(batch, kv_seqLen) + self.layer_id = torch.from_numpy(np.array([0], dtype=np.int32)).to(torch.int32) + self.q_max_seq = np.max(self.q_seqlen) + self.kv_max_seq = np.max(self.kv_seqlen) + tor = np.float32(1.0 / math.sqrt(1.0 * self.embeddim)) + + self.q_scale = 1 + self.qk_scale = tor + self.cache_type = 1 + OP_NAME = "SelfAttentionOperation" + OP_PARAM = {"type": 1} + self.set_data_params(cache_type=self.cache_type, is_decoder=self.is_decoder, batch=batch, kv_head=self.kv_head, + heads=self.heads, embeddim=self.embeddim, max_seq=self.max_seq, kv_seqLen=kv_seqLen, + data_type=data_type, long_seq = False, op_type=OP_PARAM["type"], mask_type = MASK_TYPE_SWA, + no_cache=False, 
is_sqrt=False, tor=tor, q_seqlens=self.q_seqlen) + self.gen_out_tensor() + self.window_size = 16 + + param = json.dumps( + {"headNum": self.heads, "qScale": float(self.q_scale), "qkScale": float(self.qk_scale), "maskType": 7, + "kvcacheCfg": 1, "calcType": 1, "windowSize": self.window_size, "cacheType": 0}) + self.param_seqlen = self.q_seqlen + self.param_token_offset = self.kv_seqlen + run_param = json.dumps({"tokenOffset": self.param_token_offset, "seqLen": self.param_seqlen, "maskType": 7}) + self.execute_with_param(OP_NAME, param, run_param, + [self.q.npu(), self.k.npu(), self.v.npu(), + self.mask.reshape(self.q_max_seq, self.kv_max_seq).to(data_type).npu(), + torch.tensor(self.kv_seqlen).to(torch.int32).npu(), + torch.tensor(self.q_seqlen).to(torch.int32).npu(), self.layer_id.npu()]) + + + def test_swa_encoder_compress_mask(self): + """ + is_decoder = 0, no_cache=False, "maskType": MASK_TYPE_SLIDING_WINDOW_COMPRESS, cacheType = 0 + qselen = kv_seqLen = [32, 256, ...] + compress swa mask + """ + if not operation_test.get_soc_version() == 'Ascend910B': + print("this testcase only supports Ascend910B") + return + self.data_type = torch.bfloat16 + data_type = self.data_type + self.batch = 8 + batch = self.batch + self.kv_head = 32 # kv_head num + kv_head = self.kv_head + self.is_decoder = 0 # prefill or decoder + self.heads = 32 # llama7b hidden_size 4096 + self.embeddim = 128 + self.embeddim_v = self.embeddim + kv_seqLen = [32, 256] * 4 + self.max_seq = 256 + self.window_size = 16 + self.cacheType = 0 + self.q_seqlen, self.q_ntokens = self.gen_seq_len(batch, kv_seqLen) + self.kv_seqlen, self.kv_ntokens = self.gen_seq_len(batch, kv_seqLen) + self.layer_id = torch.from_numpy(np.array([0], dtype=np.int32)).to(torch.int32) + self.q_max_seq = np.max(self.q_seqlen) + self.kv_max_seq = np.max(self.kv_seqlen) + tor = np.float32(1.0 / math.sqrt(1.0 * self.embeddim)) + + self.q_scale = 1 + self.qk_scale = tor + self.cache_type = 1 + OP_NAME = "SelfAttentionOperation" + 
OP_PARAM = {"type": 1} + self.set_data_params(cache_type=self.cache_type, is_decoder=self.is_decoder, batch=batch, kv_head=self.kv_head, + heads=self.heads, embeddim=self.embeddim, max_seq=self.max_seq, kv_seqLen=kv_seqLen, + data_type=data_type, long_seq = False, op_type=OP_PARAM["type"], mask_type = MASK_TYPE_SWA, + no_cache=False, is_sqrt=False, tor=tor, q_seqlens=self.q_seqlen) + self.gen_out_tensor() + self.window_size = 16 + attention_mask = self.gen_swa_cmp(self.max_seq, self.window_size).to(data_type).npu() + + param = json.dumps( + {"headNum": self.heads, "qScale": float(self.q_scale), "qkScale": float(self.qk_scale), "maskType": 8, + "kvcacheCfg": 1, "calcType": 1, "windowSize": self.window_size, "cacheType": 0}) + self.param_seqlen = self.q_seqlen + self.param_token_offset = self.kv_seqlen + run_param = json.dumps({"tokenOffset": self.param_token_offset, "seqLen": self.param_seqlen, "maskType": 7}) + self.execute_with_param(OP_NAME, param, run_param, + [self.q.npu(), self.k.npu(), self.v.npu(), + attention_mask.reshape(512, 512).to(data_type).npu(), + torch.tensor(self.kv_seqlen).to(torch.int32).npu(), + torch.tensor(self.q_seqlen).to(torch.int32).npu(), self.layer_id.npu()]) + + def test_operation_logn(self): + """ + is_decoder = 1, no_cache=False, "maskType": MASK_TYPE_NORM + qseqlen = [1] * batch + kv_seqLen = [32, 1024] * 4 + """ + if not operation_test.get_soc_version() == 'Ascend910B': + print("this testcase only supports Ascend910B") + return + mask_type = MASK_TYPE_NO_HEAD_DECODER + data_type = torch.bfloat16 + batch = 8 + kv_head = 32 # kv_head num + is_decoder = 1 # prefill or decoder + heads = 32 # llama7b hidden_size 4096 + embeddim = 128 + embeddimv = np.random.randint(1, embeddim) + max_seq = 1024 + tor = 1 + dynamic_batch = False + kv_seqLen = [32, 1024] * 4 + qSeqLen = [1] * batch + self.q_seqlen, self.q_ntokens = self.gen_seq_len(batch, [1] * batch) + self.kv_seqlen, self.kv_ntokens = self.gen_seq_len(batch, kv_seqLen) + self.q_max_seq = 
np.max(self.q_seqlen) + self.kv_max_seq = np.max(self.kv_seqlen) + tor = np.float32(1.0 / math.sqrt(1.0 * embeddim)) + self.set_data_params(mask_type=mask_type, tor=tor, q_seqlens=self.q_seqlen, kv_seqLen=self.kv_seqlen, data_type=data_type, batch=batch, kv_head=kv_head, + is_decoder=is_decoder, heads=heads, embeddim=embeddim, embeddimv=embeddimv, max_seq=max_seq, dynamic_batch=dynamic_batch) + self.gen_out_tensor() + OP_NAME = "SelfAttentionOperation" + self.q_scale = 1 + self.qk_scale = tor + param = json.dumps( + {"headNum": self.heads, "qScale": float(self.q_scale), "qkScale": float(self.qk_scale), "maskType": 1, + "kvcacheCfg": 1, "calcType": 2}) + self.param_seqlen = self.q_seqlen + self.param_token_offset = self.kv_seqlen + run_param = json.dumps({"tokenOffset": self.param_token_offset, "seqLen": self.param_seqlen}) + # pdb.set_trace() + self.execute_with_param(OP_NAME, param, run_param, + [self.q.npu(), self.k.npu(), self.v.npu(), self.mask.to(data_type).npu(), + torch.tensor(self.kv_seqlen).to(torch.int32).npu(), + torch.tensor(self.q_seqlen).to(torch.int32).npu(), self.layer_id.npu()]) + + def test_operation_split_kvcache_success_float16(self): + """ + is_decoder = 1, no_cache=False, "maskType": MASK_TYPE_NO_HEAD_DECODER, cache_type = 1 + qseqlen = [1, ...] + kv_seqLen = [114, ...] 
+ """ + if not operation_test.get_soc_version() == 'Ascend910B': + print("this testcase only supports Ascend910B") + return + mask_type = MASK_TYPE_NO_HEAD_DECODER + self.data_type = torch.float16 + data_type = self.data_type + self.batch = 22 + batch = self.batch + self.kv_head = 44 # kv_head num + kv_head = self.kv_head + self.is_decoder = 1 # prefill or decoder + self.heads = 44 # llama7b hidden_size 4096 + self.embeddim = 256 + self.embeddim_v = self.embeddim + self.max_seq = 256 + tor = 1 + self.dynamic_batch = False + kv_seqLen = [114] * batch + qSeqLen = [1] * batch + self.is_clamp = 0 + self.clamp_min = 0 + self.clamp_max = 0 + self.is_triu_mask = False + self.long_seq = False + self.is_alibi = False + self.q_seqlen, self.q_ntokens = self.gen_seq_len(batch, [1] * batch) + self.kv_seqlen, self.kv_ntokens = self.gen_seq_len(batch, kv_seqLen) + self.layer_id = torch.from_numpy(np.array([0], dtype=np.int32)).to(torch.int32) + self.q_max_seq = np.max(self.q_seqlen) + self.kv_max_seq = np.max(self.kv_seqlen) + self.cache_type = 1 + self.window_size = 0 + self.is_compress = False + OP_NAME = "SelfAttentionOperation" + OP_PARAM = {"type": 1} + tor = np.float32(1.0 / math.sqrt(1.0 * self.embeddim)) + self.set_data_params(cache_type=self.cache_type, is_decoder=self.is_decoder, batch=batch, kv_head=self.kv_head, + heads=self.heads, embeddim=self.embeddim, embeddimv=self.embeddim_v, max_seq=self.max_seq, kv_seqLen=kv_seqLen, + data_type=data_type, op_type=OP_PARAM["type"], mask_type = mask_type, + no_cache=False, is_sqrt=False, tor=tor, q_seqlens=self.q_seqlen) + q = torch.from_numpy(np.random.uniform(-5.0, 5.0, size=(self.q_ntokens, self.heads * self.embeddim))) + + self.q = q.to(data_type) + self.k_list = [] + self.v_list = [] + for i in range(self.batch): + self.k_list.append(torch.from_numpy(np.random.uniform(-5.0, 5.0, size=(1, 1, self.max_seq, kv_head * self.embeddim))).to(data_type).npu()) + self.v_list.append(torch.from_numpy(np.random.uniform(-5.0, 5.0, 
size=(1, 1, self.max_seq, kv_head * self.embeddim_v))).to(data_type).npu()) + + self.k = torch.cat(self.k_list, 1).cpu() + self.v = torch.cat(self.v_list, 1).cpu() + + for i in range(self.batch): + self.k_list[i] = self.k_list[i].squeeze().npu() + self.v_list[i] = self.v_list[i].squeeze().npu() + self.gen_out_tensor() + + self.q_scale = 1 + self.qk_scale = tor + param = json.dumps({"headNum": self.heads, "qScale": float(self.q_scale), "qkScale": float(self.qk_scale), "maskType": 1, "kvcacheCfg":1, "calcType":2}) + self.param_seqlen = self.q_seqlen + self.param_token_offset = self.kv_seqlen + run_param = json.dumps({"tokenOffset": self.param_token_offset, "seqLen": self.param_seqlen, "byPass": "true"}) + #pdb.set_trace() + self.execute_with_param_and_tensor_list(OP_NAME, param, run_param, + [self.q.npu(), self.k.npu(), self.v.npu(),self.mask.to(data_type).npu(),torch.tensor(self.kv_seqlen).to(torch.int32).npu(), torch.tensor(self.q_seqlen).to(torch.int32).npu(), self.layer_id.npu()], + [self.k_list, self.v_list], ["kCache", "vCache"]) + + def test_operation_split_kvcache_success_bfloat16(self): + """ + is_decoder = 1, no_cache=False, "maskType": MASK_TYPE_NO_HEAD_DECODER, cache_type = 1 + qseqlen = [1, ...] + kv_seqLen = [114, ...] 
+ """ + if not operation_test.get_soc_version() == 'Ascend910B': + print("this testcase only supports Ascend910B") + return + mask_type = MASK_TYPE_NO_HEAD_DECODER + self.data_type = torch.bfloat16 + data_type = self.data_type + self.batch = 22 + batch = self.batch + self.kv_head = 44 # kv_head num + kv_head = self.kv_head + self.is_decoder = 1 # prefill or decoder + self.heads = 44 # llama7b hidden_size 4096 + self.embeddim = 256 + self.embeddim_v = self.embeddim + self.max_seq = 256 + tor = 1 + self.dynamic_batch = False + kv_seqLen = [114] * batch + qSeqLen = [1] * batch + self.is_clamp = 0 + self.clamp_min = 0 + self.clamp_max = 0 + self.is_triu_mask = False + self.long_seq = False + self.is_alibi = False + self.q_seqlen, self.q_ntokens = self.gen_seq_len(batch, [1] * batch) + self.kv_seqlen, self.kv_ntokens = self.gen_seq_len(batch, kv_seqLen) + self.layer_id = torch.from_numpy(np.array([0], dtype=np.int32)).to(torch.int32) + self.q_max_seq = np.max(self.q_seqlen) + self.kv_max_seq = np.max(self.kv_seqlen) + self.cache_type = 1 + self.window_size = 0 + self.is_compress = False + + q = torch.from_numpy(np.random.uniform(-5.0, 5.0, size=(self.q_ntokens, self.heads * self.embeddim))) + tor = np.float32(1.0 / math.sqrt(1.0 * self.embeddim)) + #self.q = (q * tor).to(data_type) + + OP_NAME = "SelfAttentionOperation" + OP_PARAM = {"type": 1} + self.set_data_params(cache_type=self.cache_type, is_decoder=self.is_decoder, batch=batch, kv_head=self.kv_head, + heads=self.heads, embeddim=self.embeddim, embeddimv=self.embeddim_v, max_seq=self.max_seq, kv_seqLen=kv_seqLen, + data_type=data_type, long_seq = False, op_type=OP_PARAM["type"], mask_type = mask_type, + tor=tor, q_seqlens=self.q_seqlen) + self.q = q.to(data_type) + self.k_list = [] + self.v_list = [] + for i in range(self.batch): + self.k_list.append(torch.from_numpy(np.random.uniform(-5.0, 5.0, size=(1, 1, self.max_seq, self.kv_head * self.embeddim))).to(data_type).npu()) + 
self.v_list.append(torch.from_numpy(np.random.uniform(-5.0, 5.0, size=(1, 1, self.max_seq, self.kv_head * self.embeddim_v))).to(data_type).npu()) + + self.k = torch.cat(self.k_list, 1).cpu() + self.v = torch.cat(self.v_list, 1).cpu() + + for i in range(self.batch): + self.k_list[i] = self.k_list[i].squeeze().npu() + self.v_list[i] = self.v_list[i].squeeze().npu() + self.gen_out_tensor() + + self.q_scale = 1 + self.qk_scale = tor + param = json.dumps({"headNum": self.heads, "qScale": float(self.q_scale), "qkScale": float(self.qk_scale), "maskType": 1, "kvcacheCfg":1, "calcType":2}) + self.param_seqlen = self.q_seqlen + self.param_token_offset = self.kv_seqlen + run_param = json.dumps({"tokenOffset": self.param_token_offset, "seqLen": self.param_seqlen, "byPass": "true"}) + #pdb.set_trace() + self.execute_with_param_and_tensor_list(OP_NAME, param, run_param, + [self.q.npu(), self.k.npu(), self.v.npu(), self.mask.to(data_type).npu(),torch.tensor(self.kv_seqlen).to(torch.int32).npu(), torch.tensor(self.q_seqlen).to(torch.int32).npu(), self.layer_id.npu()], + [self.k_list, self.v_list], ["kCache", "vCache"]) + + def test_encoder_operation_mask_free_fp16(self): + """ + is_decoder = 0, no_cache=True, "maskType": MASK_TYPE_NO_BATCH, + qseqlen = kv_seqLen = [1024] + """ + if not operation_test.get_soc_version() == 'Ascend910B': + print("this testcase only supports Ascend910B") + return + + batch = 1 + kv_head = 1 # kv_head num + isdecoder = 0 # prefill or decoder + heads = 12 + embeddim = 128 + max_seq = 1024 + tor = 1 + kv_seqLen = [1024] + is_clamp = 0 + clamp_min = 0 + clamp_max = 0 + dynamic_batch = False + data_type = torch.float16 + + + self.set_data_params(dynamic_batch = dynamic_batch, + is_decoder = isdecoder, batch = batch, kv_head = kv_head, heads = heads, + embeddim = embeddim, max_seq = max_seq, kv_seqLen = kv_seqLen, + is_clamp = is_clamp, clamp_max = clamp_max, clamp_min = clamp_min, + data_type = data_type, is_alibi = True, + op_type = 2001, mask_type = 
MASK_TYPE_ALIBI_WITH_BATCH, no_cache = True) + print("embeddimv: ", self.embeddimv) + self.gen_out_tensor() + param_seqlen = self.kv_seqLen + self.alibi_slopes *= -1 + mask = np.ones((256,256)) * 60000 + mask = np.triu(mask, 1) + self.mask = self.bias[:256, :256] * -1 + mask + self.mask = self.mask.to(torch.float16) + print(f"===============self.mask {self.mask.shape}") + print(f"===============self.mask {torch.max(self.mask)} {torch.min(self.mask)}") + print(self.alibi_slopes) + OP_NAME = "SelfAttentionOperation" + PARAM = json.dumps({"headNum": 12, "qkScale": 1, "kvHeadNum": 1, + "calcType": 3, "maskType": 4, "isTriuMask": 1, "kernelType": 1}) + RUN_PARAM = json.dumps({"seqLen": param_seqlen}) + print(self.q.npu().contiguous().shape, self.k.npu().contiguous().shape, self.v.npu().contiguous().shape, self.mask.npu().contiguous().shape, torch.from_numpy(np.array(self.kv_seqLen).astype(np.int32)).npu().contiguous().shape, self.alibi_slopes.npu().contiguous().shape, param_seqlen) + self.execute_with_param(OP_NAME, PARAM, RUN_PARAM, [ + self.q.npu().contiguous(), self.k.npu().contiguous(), self.v.npu().contiguous(), self.mask.npu().contiguous(), torch.from_numpy(np.array(self.kv_seqLen).astype(np.int32)).npu().contiguous(), self.alibi_slopes.npu().contiguous() + ]) + + def test_flash_attention_case_fa_encoder_withcache_bf16_maskfree(self): + """ + is_decoder = 0, no_cache=True, "maskType": MASK_TYPE_CAUSAL_MASK, + qseqlen = [seqlen] * batch + kv_seqLen = [seqlen + 128 * random.randint(1, 4)] * batch + """ + if not operation_test.get_soc_version() == 'Ascend910B': + print("this testcase only supports Ascend910B") + return + # [b,ms,ms] + for _ in range(700): + batch = random.randint(1, 16) + kv_head = random.randint(1, 5) # kv_head num + isdecoder = 0 # prefill or decoder + heads = kv_head * random.randint(1, 4) # head num + embeddim = 128 + max_seq = 128 * 100 + tor = 1.0 / math.sqrt(1.0 * embeddim) + seqlen = random.randint(1, 4096) + q_seqlens = [seqlen] * batch + 
kv_seqLen = [seqlen + 128 * random.randint(1, 4)] * batch + is_clamp = 0 + clamp_min = 0 + clamp_max = 0 + dynamic_batch = False + block_size = 128 + num_blocks = 1024 + OP_NAME = "SelfAttentionOperation" + OP_PARAM = {"type": UNPAD_FLASH_ATTENTION_ENCODER_PREFIX_CACHE_ND, "qSeqLen": q_seqlens, + "kvSeqLen": kv_seqLen, "headSize": heads, "tor": tor, + "isClamp": is_clamp, "clampMin": clamp_min, "clampMax": clamp_max, + "maskType": MASK_TYPE_CAUSAL_MASK, "kvHead": kv_head, + "isTriuMask": 1, "alibiLeftAlign": 0, "isAlibiMaskSqrt": 0} + data_type = random.choice([torch.bfloat16, torch.float16]) + print( + f"---batch:{batch}---kv_head:{kv_head}---q_seqlens:{q_seqlens}---kv_seqLen:{kv_seqLen}---kv_head:{kv_head}---heads:{heads}---data_type:{data_type}---") + self.set_data_params(dynamic_batch=dynamic_batch, + is_decoder=isdecoder, batch=batch, kv_head=kv_head, heads=heads, + embeddim=embeddim, max_seq=max_seq, kv_seqLen=kv_seqLen, + is_clamp=is_clamp, clamp_max=clamp_max, clamp_min=clamp_min, + data_type=data_type, + op_type=OP_PARAM["type"], mask_type=MASK_TYPE_ALIBI_NO_BATCH_WITH_PREFIX, + no_cache=True, tor=tor, q_seqlens=q_seqlens, + num_blocks=num_blocks, block_size=block_size, is_triu_mask=True, is_mask=True) + self.gen_out_tensor() + PARAM = json.dumps( + {"headNum": heads, "calcType": CAL_TYPE_PREFIX_ENCODER, "maskType": MASK_TYPE_CAUSAL_MASK, + "kvHeadNum": kv_head, "isTriuMask": 1, "qkScale": tor, "kernelType": KERNELTYPE_HIGH_PRECISION}) + RUN_PARAM = json.dumps({"seqLen": q_seqlens, "kvSeqLen": kv_seqLen, "CalcType": CAL_TYPE_PREFIX_ENCODER, + "maskType": MASK_TYPE_CAUSAL_MASK}) + q_seqlen = np.array(q_seqlens) + q_seqlen = torch.from_numpy(q_seqlen).to(torch.int32).npu() + kv_seqLen = np.array(kv_seqLen) + kv_seqLen = torch.from_numpy(kv_seqLen).to(torch.int32).npu() + self.execute_with_param(OP_NAME, PARAM, RUN_PARAM, [self.q.npu(), self.k_cache.npu(), self.v_cache.npu(), + self.block_tables.npu(), q_seqlen, kv_seqLen]) + def 
test_self_attention_encoder_operation_alibi_bf16(self): + """ + is_decoder = 1, no_cache=False, "maskType": MASK_TYPE_NO_HEAD_DECODER, cache_type = 1 + qseqlen = kv_seqLen = [1024] + """ + batch = 1 + kv_head = 1 # kv_head num + isdecoder = 0 # prefill or decoder + heads = 12 + embeddim = 128 + max_seq = 1024 + tor = 1 + kv_seqLen = [1024] + is_clamp = 0 + clamp_min = 0 + clamp_max = 0 + dynamic_batch = False + data_type = torch.bfloat16 + + self.set_data_params(dynamic_batch = dynamic_batch, + is_decoder = isdecoder, batch = batch, kv_head = kv_head, heads = heads, + embeddim = embeddim, max_seq = max_seq, kv_seqLen = kv_seqLen, + is_clamp = is_clamp, clamp_max = clamp_max, clamp_min = clamp_min, tor=tor, + data_type = data_type) + self.gen_out_tensor() + self.mask = self.mask.to(torch.bfloat16) + data = [self.q, self.k, self.v, self.mask, self.kv_seqLen, self.golden_out] + param_seqlen = data[4] + data[4] = torch.from_numpy(np.array(data[4]).astype(np.int32)) + data[1], data[2] = torch.reshape(data[1], (max_seq, embeddim)), torch.reshape(data[2], (max_seq, embeddim)) + in_tensors = [tensor.npu().contiguous() for tensor in data] + + OP_NAME = "SelfAttentionOperation" + PARAM = json.dumps({"headNum": 12, "qkScale": 1, "kvHeadNum": 1, + "calcType": 3, "maskType": 2, "isTriuMask": 1, "kernelType": 0}) + RUN_PARAM = json.dumps({"seqLen": param_seqlen}) + if not operation_test.get_soc_version() == 'Ascend910B': + print("this testcase only supports Ascend910B") + return + self.execute_with_param(OP_NAME, PARAM, RUN_PARAM, [ + in_tensors[0], in_tensors[1], in_tensors[2], in_tensors[3], in_tensors[4] + ]) + + def test_self_attention_encoder_operation_compress_mask_swa_cycle_cache(self): + """ + is_decoder = 0, no_cache=False, "maskType": MASK_TYPE_ALIBI_WITH_PREFIX_BATCH, cache_type = 1 + qseqlen = kv_seqLen = [128...] 
+ """ + if operation_test.get_soc_version() == 'Ascend910B': + kv_head = 2 + window_size = 32 + mask_type = 8 + data = self.calc_expect_func(2, 1024, 2, 128, window_size, mask_type, group_num=kv_head) + param_seqlen = data[4].tolist() + in_tensors = [torch.from_numpy(tensor) for tensor in data] + in_tensors = [tensor.npu() for tensor in in_tensors] + a = [print(tensor.dtype, tensor.device) for tensor in in_tensors] + + OP_NAME = "SelfAttentionOperation" + print("now qseqlen is ", self.q_seqlen) + self.set_data_params(kv_head=kv_head, mask_type=mask_type, heads=self.heads, embeddim=self.embeddim, embeddimv=self.embeddimv, kv_seqLen=self.kv_seqLen, batch=2, window_size=window_size, + no_cache=True) + self.gen_out_tensor() + PARAM = json.dumps({"headNum": kv_head, "qkScale": (1 / float(math.sqrt(128))), "kvHeadNum": kv_head, \ + "maskType": mask_type, "calcType": 3, "windowSize": 32, "cacheType": 1}) + RUN_PARAM = json.dumps({"seqLen": param_seqlen}) + print(PARAM, RUN_PARAM) + if not operation_test.get_soc_version() == 'Ascend910B': + print("this testcase only supports Ascend910B") + return + + self.mask = gen_swa_cmp(window_size, self.embeddim).astype('float16') + self.golden_out = torch.reshape(self.golden_out, (2048, 2, 128)) + self.golden_out_true = torch.reshape(self.golden_out_true, (2048, 2, 128)) + self.execute_with_param(OP_NAME, PARAM, RUN_PARAM, [ + torch.reshape(self.q, (2048, 2, 128)).npu(), torch.reshape(self.k, (2048, 2, 128)).npu(), torch.reshape(self.v, (2048, 2, 128)).npu(), torch.from_numpy(self.mask).npu(), torch.from_numpy(self.q_seqlen.astype(np.int32)).npu() + ]) + +if __name__ == '__main__': + unittest.main() -- Gitee From 2b497cecbaca2c3d8ee19ad9ecf24c23c0906024 Mon Sep 17 00:00:00 2001 From: caixilong Date: Mon, 15 Sep 2025 09:40:05 +0800 Subject: [PATCH 09/94] remove loop from fa_encoder_withcache_bf16_maskfree --- .../test_self_attention_combine.py | 89 +++++++++---------- 1 file changed, 44 insertions(+), 45 deletions(-) diff --git 
a/tests/apitest/opstest/python/operations/self_attention/test_self_attention_combine.py b/tests/apitest/opstest/python/operations/self_attention/test_self_attention_combine.py index bfbfb539..d4c61b3f 100644 --- a/tests/apitest/opstest/python/operations/self_attention/test_self_attention_combine.py +++ b/tests/apitest/opstest/python/operations/self_attention/test_self_attention_combine.py @@ -1785,51 +1785,50 @@ class TestFlashAttention(operation_test.OperationTest): print("this testcase only supports Ascend910B") return # [b,ms,ms] - for _ in range(700): - batch = random.randint(1, 16) - kv_head = random.randint(1, 5) # kv_head num - isdecoder = 0 # prefill or decoder - heads = kv_head * random.randint(1, 4) # head num - embeddim = 128 - max_seq = 128 * 100 - tor = 1.0 / math.sqrt(1.0 * embeddim) - seqlen = random.randint(1, 4096) - q_seqlens = [seqlen] * batch - kv_seqLen = [seqlen + 128 * random.randint(1, 4)] * batch - is_clamp = 0 - clamp_min = 0 - clamp_max = 0 - dynamic_batch = False - block_size = 128 - num_blocks = 1024 - OP_NAME = "SelfAttentionOperation" - OP_PARAM = {"type": UNPAD_FLASH_ATTENTION_ENCODER_PREFIX_CACHE_ND, "qSeqLen": q_seqlens, - "kvSeqLen": kv_seqLen, "headSize": heads, "tor": tor, - "isClamp": is_clamp, "clampMin": clamp_min, "clampMax": clamp_max, - "maskType": MASK_TYPE_CAUSAL_MASK, "kvHead": kv_head, - "isTriuMask": 1, "alibiLeftAlign": 0, "isAlibiMaskSqrt": 0} - data_type = random.choice([torch.bfloat16, torch.float16]) - print( - f"---batch:{batch}---kv_head:{kv_head}---q_seqlens:{q_seqlens}---kv_seqLen:{kv_seqLen}---kv_head:{kv_head}---heads:{heads}---data_type:{data_type}---") - self.set_data_params(dynamic_batch=dynamic_batch, - is_decoder=isdecoder, batch=batch, kv_head=kv_head, heads=heads, - embeddim=embeddim, max_seq=max_seq, kv_seqLen=kv_seqLen, - is_clamp=is_clamp, clamp_max=clamp_max, clamp_min=clamp_min, - data_type=data_type, - op_type=OP_PARAM["type"], mask_type=MASK_TYPE_ALIBI_NO_BATCH_WITH_PREFIX, - no_cache=True, 
tor=tor, q_seqlens=q_seqlens, - num_blocks=num_blocks, block_size=block_size, is_triu_mask=True, is_mask=True) - self.gen_out_tensor() - PARAM = json.dumps( - {"headNum": heads, "calcType": CAL_TYPE_PREFIX_ENCODER, "maskType": MASK_TYPE_CAUSAL_MASK, - "kvHeadNum": kv_head, "isTriuMask": 1, "qkScale": tor, "kernelType": KERNELTYPE_HIGH_PRECISION}) - RUN_PARAM = json.dumps({"seqLen": q_seqlens, "kvSeqLen": kv_seqLen, "CalcType": CAL_TYPE_PREFIX_ENCODER, - "maskType": MASK_TYPE_CAUSAL_MASK}) - q_seqlen = np.array(q_seqlens) - q_seqlen = torch.from_numpy(q_seqlen).to(torch.int32).npu() - kv_seqLen = np.array(kv_seqLen) - kv_seqLen = torch.from_numpy(kv_seqLen).to(torch.int32).npu() - self.execute_with_param(OP_NAME, PARAM, RUN_PARAM, [self.q.npu(), self.k_cache.npu(), self.v_cache.npu(), + batch = random.randint(1, 16) + kv_head = random.randint(1, 5) # kv_head num + isdecoder = 0 # prefill or decoder + heads = kv_head * random.randint(1, 4) # head num + embeddim = 128 + max_seq = 128 * 100 + tor = 1.0 / math.sqrt(1.0 * embeddim) + seqlen = random.randint(1, 4096) + q_seqlens = [seqlen] * batch + kv_seqLen = [seqlen + 128 * random.randint(1, 4)] * batch + is_clamp = 0 + clamp_min = 0 + clamp_max = 0 + dynamic_batch = False + block_size = 128 + num_blocks = 1024 + OP_NAME = "SelfAttentionOperation" + OP_PARAM = {"type": UNPAD_FLASH_ATTENTION_ENCODER_PREFIX_CACHE_ND, "qSeqLen": q_seqlens, + "kvSeqLen": kv_seqLen, "headSize": heads, "tor": tor, + "isClamp": is_clamp, "clampMin": clamp_min, "clampMax": clamp_max, + "maskType": MASK_TYPE_CAUSAL_MASK, "kvHead": kv_head, + "isTriuMask": 1, "alibiLeftAlign": 0, "isAlibiMaskSqrt": 0} + data_type = random.choice([torch.bfloat16, torch.float16]) + print( + f"---batch:{batch}---kv_head:{kv_head}---q_seqlens:{q_seqlens}---kv_seqLen:{kv_seqLen}---kv_head:{kv_head}---heads:{heads}---data_type:{data_type}---") + self.set_data_params(dynamic_batch=dynamic_batch, + is_decoder=isdecoder, batch=batch, kv_head=kv_head, heads=heads, + 
embeddim=embeddim, max_seq=max_seq, kv_seqLen=kv_seqLen, + is_clamp=is_clamp, clamp_max=clamp_max, clamp_min=clamp_min, + data_type=data_type, + op_type=OP_PARAM["type"], mask_type=MASK_TYPE_ALIBI_NO_BATCH_WITH_PREFIX, + no_cache=True, tor=tor, q_seqlens=q_seqlens, + num_blocks=num_blocks, block_size=block_size, is_triu_mask=True, is_mask=True) + self.gen_out_tensor() + PARAM = json.dumps( + {"headNum": heads, "calcType": CAL_TYPE_PREFIX_ENCODER, "maskType": MASK_TYPE_CAUSAL_MASK, + "kvHeadNum": kv_head, "isTriuMask": 1, "qkScale": tor, "kernelType": KERNELTYPE_HIGH_PRECISION}) + RUN_PARAM = json.dumps({"seqLen": q_seqlens, "kvSeqLen": kv_seqLen, "CalcType": CAL_TYPE_PREFIX_ENCODER, + "maskType": MASK_TYPE_CAUSAL_MASK}) + q_seqlen = np.array(q_seqlens) + q_seqlen = torch.from_numpy(q_seqlen).to(torch.int32).npu() + kv_seqLen = np.array(kv_seqLen) + kv_seqLen = torch.from_numpy(kv_seqLen).to(torch.int32).npu() + self.execute_with_param(OP_NAME, PARAM, RUN_PARAM, [self.q.npu(), self.k_cache.npu(), self.v_cache.npu(), self.block_tables.npu(), q_seqlen, kv_seqLen]) def test_self_attention_encoder_operation_alibi_bf16(self): """ -- Gitee From 58d087303b54aa6231cf156510eaab518820c9eb Mon Sep 17 00:00:00 2001 From: caixilong Date: Tue, 16 Sep 2025 11:17:39 +0800 Subject: [PATCH 10/94] fix alibi 16 bug --- .../test_self_attention_combine.py | 32 +++++++++---------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/tests/apitest/opstest/python/operations/self_attention/test_self_attention_combine.py b/tests/apitest/opstest/python/operations/self_attention/test_self_attention_combine.py index d4c61b3f..0102de9b 100644 --- a/tests/apitest/opstest/python/operations/self_attention/test_self_attention_combine.py +++ b/tests/apitest/opstest/python/operations/self_attention/test_self_attention_combine.py @@ -1835,35 +1835,35 @@ class TestFlashAttention(operation_test.OperationTest): is_decoder = 1, no_cache=False, "maskType": MASK_TYPE_NO_HEAD_DECODER, cache_type = 1 
qseqlen = kv_seqLen = [1024] """ - batch = 1 - kv_head = 1 # kv_head num - isdecoder = 0 # prefill or decoder - heads = 12 - embeddim = 128 - max_seq = 1024 - tor = 1 - kv_seqLen = [1024] + batch = random.randint(1, 10) + kv_head = random.randint(1, 32) # kv_head num + isdecoder = 0 # prefill or decoder + heads = kv_head * random.randint(1, 5) + embeddim = random.choice([32, 64, 128]) + max_seq = random.randint(1, 2048) + tor = 1.0 / math.sqrt(1.0 * embeddim) + kv_seqLen = [max_seq] * batch is_clamp = 0 clamp_min = 0 clamp_max = 0 dynamic_batch = False data_type = torch.bfloat16 + print(f"--batch:{batch}--kv_head:{kv_head}--heads:{heads}--embeddim:{embeddim}--max_seq:{max_seq}") - self.set_data_params(dynamic_batch = dynamic_batch, - is_decoder = isdecoder, batch = batch, kv_head = kv_head, heads = heads, - embeddim = embeddim, max_seq = max_seq, kv_seqLen = kv_seqLen, - is_clamp = is_clamp, clamp_max = clamp_max, clamp_min = clamp_min, tor=tor, - data_type = data_type) + self.set_data_params(dynamic_batch=dynamic_batch, + is_decoder=isdecoder, batch=batch, kv_head=kv_head, heads=heads, + embeddim=embeddim, max_seq=max_seq, kv_seqLen=kv_seqLen, + is_clamp=is_clamp, clamp_max=clamp_max, clamp_min=clamp_min, + data_type=data_type, is_alibi=True, tor=tor, + op_type=10, mask_type=MASK_TYPE_ALIBI_WITH_BATCH, no_cache=True) self.gen_out_tensor() self.mask = self.mask.to(torch.bfloat16) data = [self.q, self.k, self.v, self.mask, self.kv_seqLen, self.golden_out] param_seqlen = data[4] data[4] = torch.from_numpy(np.array(data[4]).astype(np.int32)) - data[1], data[2] = torch.reshape(data[1], (max_seq, embeddim)), torch.reshape(data[2], (max_seq, embeddim)) in_tensors = [tensor.npu().contiguous() for tensor in data] - OP_NAME = "SelfAttentionOperation" - PARAM = json.dumps({"headNum": 12, "qkScale": 1, "kvHeadNum": 1, + PARAM = json.dumps({"headNum": heads, "qkScale": 1.0 / math.sqrt(1.0 * embeddim), "kvHeadNum": kv_head, "calcType": 3, "maskType": 2, "isTriuMask": 1, 
"kernelType": 0}) RUN_PARAM = json.dumps({"seqLen": param_seqlen}) if not operation_test.get_soc_version() == 'Ascend910B': -- Gitee From 99a120bed4fc01f8128edb9edbb02b26478be795 Mon Sep 17 00:00:00 2001 From: caixilong Date: Tue, 16 Sep 2025 19:45:37 +0800 Subject: [PATCH 11/94] fix swa_decoder and swa_decoder_cache bug --- .../test_self_attention_combine.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/tests/apitest/opstest/python/operations/self_attention/test_self_attention_combine.py b/tests/apitest/opstest/python/operations/self_attention/test_self_attention_combine.py index 0102de9b..af32382b 100644 --- a/tests/apitest/opstest/python/operations/self_attention/test_self_attention_combine.py +++ b/tests/apitest/opstest/python/operations/self_attention/test_self_attention_combine.py @@ -1274,7 +1274,7 @@ class TestFlashAttention(operation_test.OperationTest): self.dynamic_batch = False kv_seqLen = [114] * batch qSeqLen = [1] * batch - self.max_seq = max(max(kv_seqLen), max(qSeqLen)) + self.max_seq = 256 self.window_size = 16 self.q_seqlen, self.q_ntokens = self.gen_seq_len(batch, [1] * batch) self.kv_seqlen, self.kv_ntokens = self.gen_seq_len(batch, kv_seqLen) @@ -1285,20 +1285,20 @@ class TestFlashAttention(operation_test.OperationTest): self.q_scale = 1 self.qk_scale = tor - self.cache_type = 1 + self.cache_type = 0 OP_NAME = "SelfAttentionOperation" OP_PARAM = {"type": 1} self.set_data_params(cache_type=self.cache_type, is_decoder=self.is_decoder, batch=batch, kv_head=kv_head, heads=self.heads, embeddim=self.embeddim, max_seq=self.max_seq, kv_seqLen=kv_seqLen, - data_type=data_type, long_seq = True, - op_type=OP_PARAM["type"], mask_type = MASK_TYPE_SWA, + data_type=data_type, long_seq = True, window_size=self.window_size, + op_type=OP_PARAM["type"], mask_type = MASK_TYPE_NO_HEAD_DECODER, no_cache=False, is_sqrt=False, tor=tor, q_seqlens=self.q_seqlen) self.gen_out_tensor() self.window_size = 16 param = json.dumps( 
{"headNum": self.heads, "qScale": float(self.q_scale), "qkScale": float(self.qk_scale), "maskType": 7, - "kvcacheCfg": self.cache_type, "calcType": 2, "windowSize": self.window_size}) + "kvcacheCfg": 1, "calcType": 2, "windowSize": self.window_size}) self.param_seqlen = self.q_seqlen self.param_token_offset = self.kv_seqlen @@ -1389,7 +1389,7 @@ class TestFlashAttention(operation_test.OperationTest): kv_seqLen = [32, 1024] * 4 self.max_seq = 1024 self.window_size = 64 - self.cacheType = 1 + self.cache_type = 1 self.q_seqlen, self.q_ntokens = self.gen_seq_len(batch, [1] * batch) self.kv_seqlen, self.kv_ntokens = self.gen_seq_len(batch, kv_seqLen) self.layer_id = torch.from_numpy(np.array([0], dtype=np.int32)).to(torch.int32) @@ -1405,8 +1405,8 @@ class TestFlashAttention(operation_test.OperationTest): OP_PARAM = {"type": 1} self.set_data_params(cache_type=self.cache_type, is_decoder=self.is_decoder, batch=batch, kv_head=self.kv_head, heads=self.heads, embeddim=self.embeddim, max_seq=self.max_seq, kv_seqLen=kv_seqLen, - data_type=data_type, long_seq = False, op_type=OP_PARAM["type"], mask_type = MASK_TYPE_SWA, - no_cache=False, is_sqrt=False, tor=tor, q_seqlens=self.q_seqlen) + data_type=data_type, long_seq = False, op_type=OP_PARAM["type"], mask_type = MASK_TYPE_NO_HEAD_DECODER, + no_cache=False, is_sqrt=False, tor=tor, q_seqlens=self.q_seqlen, window_size=self.window_size) self.gen_out_tensor() self.window_size = 64 -- Gitee From fc2e25eb7f3f58a69da2cc979c69f3a5ff4dc066 Mon Sep 17 00:00:00 2001 From: hongshiyi Date: Wed, 17 Sep 2025 19:49:02 +0800 Subject: [PATCH 12/94] add dlopen to resolve acl function --- .../platform/platform_infos_def.cpp | 105 +++++++++++------- 1 file changed, 66 insertions(+), 39 deletions(-) diff --git a/src/kernels/tbe_adapter/platform/platform_infos_def.cpp b/src/kernels/tbe_adapter/platform/platform_infos_def.cpp index 2cb40ebf..0e1d7637 100644 --- a/src/kernels/tbe_adapter/platform/platform_infos_def.cpp +++ 
b/src/kernels/tbe_adapter/platform/platform_infos_def.cpp @@ -9,10 +9,11 @@ */ #include "platform/platform_infos_def.h" +#include "mki/utils/dl/dl.h" +#include "mki/utils/env/env.h" #include #include #include "platform_infos_impl.h" -#include "acl/acl_rt.h" namespace fe { constexpr uint32_t MAX_CORE_NUM = 128; @@ -101,59 +102,85 @@ void PlatFormInfos::SetFixPipeDtypeMap(const std::mapSetFixPipeDtypeMap(fixpipeDtypeMap); } +typedef int (*aclrtGetResInCurrentThreadFunc)(int, uint32_t*); + void PlatFormInfos::SetCoreNumByCoreType(const std::string &core_type) { uint32_t coreNum = 0; - aclrtDevResLimitType resType = core_type == "VectorCore" ? ACL_RT_DEV_RES_VECTOR_CORE : ACL_RT_DEV_RES_CUBE_CORE; - aclError getResRet = aclrtGetResInCurrentThread(resType, &coreNum); - if (getResRet == ACL_SUCCESS) { - core_num_ = coreNum; - MKI_LOG(DEBUG) << "Get ThreadResource::core_num_ to " << core_type << ": " << coreNum; - } else { - std::string coreNumStr; - std::string coreTypeStr; - if (core_type == "VectorCore") { - coreTypeStr = "vector_core_cnt"; + Mki::Dl dl = Mki::Dl(std::string(Mki::GetEnv("ASCEND_HOME_PATH")) + "/runtime/lib64/libascendcl.so", false); + aclrtGetResInCurrentThreadFunc aclrtGetResInCurrentThread = (aclrtGetResInCurrentThreadFunc)dl.GetSymbol("aclrtGetResInCurrentThread"); + if (aclrtGetResInCurrentThread != nullptr) { + int8_t resType = core_type == "VectorCore" ? 
1 : 0; + int getResRet = aclrtGetResInCurrentThread(resType, &coreNum); + if (getResRet == 0) { + core_num_ = coreNum; + MKI_LOG(DEBUG) << "Get ThreadResource::core_num_ to " << core_type << ": " << coreNum; + if (core_num_ == 0 || core_num_ > MAX_CORE_NUM) { + MKI_LOG(ERROR) << "core_num is out of range : " << core_num_; + core_num_ = 1; + } + return; } else { - coreTypeStr = "ai_core_cnt"; - } - std::lock_guard lockGuard(g_asdopsFePlatMutex); - (void)GetPlatformRes("SoCInfo", coreTypeStr, coreNumStr); - MKI_LOG(DEBUG) << "Set PlatFormInfos::core_num_ to " << coreTypeStr << ": " << coreNumStr; - if (coreNumStr.empty()) { - core_num_ = 1; - MKI_LOG(ERROR) << "CoreNumStr is empty!"; - } else { - core_num_ = std::strtoul(coreNumStr.c_str(), nullptr, 10); // 10 进制 + MKI_LOG(WARN) << "Failed to get thread core num!"; } + } else { + MKI_LOG(WARN) << "Failed to acl function!"; } + std::string coreNumStr; + std::string coreTypeStr; + if (core_type == "VectorCore") { + coreTypeStr = "vector_core_cnt"; + } else { + coreTypeStr = "ai_core_cnt"; + } + std::lock_guard lockGuard(g_asdopsFePlatMutex); + (void)GetPlatformRes("SoCInfo", coreTypeStr, coreNumStr); + MKI_LOG(DEBUG) << "Set PlatFormInfos::core_num_ to " << coreTypeStr << ": " << coreNumStr; + if (coreNumStr.empty()) { + core_num_ = 1; + MKI_LOG(ERROR) << "CoreNumStr is empty!"; + } else { + core_num_ = std::strtoul(coreNumStr.c_str(), nullptr, 10); // 10 进制 + } if (core_num_ == 0 || core_num_ > MAX_CORE_NUM) { MKI_LOG(ERROR) << "core_num is out of range : " << core_num_; - core_num_ = 1; - } + core_num_ = 1; + } } uint32_t PlatFormInfos::GetCoreNumByType(const std::string &core_type) -{ +{ uint32_t coreNum = 0; - aclrtDevResLimitType resType = core_type == "VectorCore" ? 
ACL_RT_DEV_RES_VECTOR_CORE : ACL_RT_DEV_RES_CUBE_CORE; - aclError getResRet = aclrtGetResInCurrentThread(resType, &coreNum); - if (getResRet == ACL_SUCCESS) { - MKI_LOG(DEBUG) << "Get ThreadResource::core_num_ to " << core_type << ": " << coreNum; - } else { - std::string coreNumStr; - std::string coreTypeStr = core_type == "VectorCore" ? "vector_core_cnt" : "ai_core_cnt"; - std::lock_guard lockGuard(g_asdopsFePlatMutex); - (void)GetPlatformRes("SoCInfo", coreTypeStr, coreNumStr); - MKI_LOG(DEBUG) << "Get PlatFormInfos::core_num_ to " << coreTypeStr << ": " << coreNumStr; - if (coreNumStr.empty()) { - MKI_LOG(ERROR) << "CoreNumStr is empty!"; - return 1; + Mki::Dl dl = Mki::Dl(std::string(Mki::GetEnv("ASCEND_HOME_PATH")) + "/runtime/lib64/libascendcl.so", false); + aclrtGetResInCurrentThreadFunc aclrtGetResInCurrentThread = (aclrtGetResInCurrentThreadFunc)dl.GetSymbol("aclrtGetResInCurrentThread"); + if (aclrtGetResInCurrentThread != nullptr) { + int resType = core_type == "VectorCore" ? 1 : 0; + int getResRet = aclrtGetResInCurrentThread(resType, &coreNum); + if (getResRet == 0) { + MKI_LOG(DEBUG) << "Get ThreadResource::core_num_ to " << core_type << ": " << coreNum; + if (coreNum > MAX_CORE_NUM) { + MKI_LOG(ERROR) << "core_num is out of range : " << coreNum; + return 1; + } + return coreNum; } else { - coreNum = std::strtoul(coreNumStr.c_str(), nullptr, 10); // 10 进制 + MKI_LOG(WARN) << "Failed to get thread resource! "; } + } else { + MKI_LOG(WARN) << "Failed to load acl Function!"; + } + std::string coreNumStr; + std::string coreTypeStr = core_type == "VectorCore" ? 
"vector_core_cnt" : "ai_core_cnt"; + std::lock_guard lockGuard(g_asdopsFePlatMutex); + (void)GetPlatformRes("SoCInfo", coreTypeStr, coreNumStr); + MKI_LOG(DEBUG) << "Get PlatFormInfos::core_num_ to " << coreTypeStr << ": " << coreNumStr; + if (coreNumStr.empty()) { + MKI_LOG(ERROR) << "CoreNumStr is empty!"; + return 1; + } else { + coreNum = std::strtoul(coreNumStr.c_str(), nullptr, 10); // 10 进制 } - if (coreNum == 0 || coreNum > MAX_CORE_NUM) { + if (coreNum > MAX_CORE_NUM) { MKI_LOG(ERROR) << "core_num is out of range : " << coreNum; return 1; } -- Gitee From a77b572846d808d6628595c18950068b2e0c14a6 Mon Sep 17 00:00:00 2001 From: hongshiyi Date: Wed, 17 Sep 2025 20:22:30 +0800 Subject: [PATCH 13/94] fix codecheck problem --- src/kernels/tbe_adapter/platform/platform_infos_def.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/kernels/tbe_adapter/platform/platform_infos_def.cpp b/src/kernels/tbe_adapter/platform/platform_infos_def.cpp index 0e1d7637..71b1cbdd 100644 --- a/src/kernels/tbe_adapter/platform/platform_infos_def.cpp +++ b/src/kernels/tbe_adapter/platform/platform_infos_def.cpp @@ -102,13 +102,14 @@ void PlatFormInfos::SetFixPipeDtypeMap(const std::mapSetFixPipeDtypeMap(fixpipeDtypeMap); } -typedef int (*aclrtGetResInCurrentThreadFunc)(int, uint32_t*); +using AclrtGetResInCurrentThreadFunc = int(*)(int, uint32_t*); void PlatFormInfos::SetCoreNumByCoreType(const std::string &core_type) { uint32_t coreNum = 0; Mki::Dl dl = Mki::Dl(std::string(Mki::GetEnv("ASCEND_HOME_PATH")) + "/runtime/lib64/libascendcl.so", false); - aclrtGetResInCurrentThreadFunc aclrtGetResInCurrentThread = (aclrtGetResInCurrentThreadFunc)dl.GetSymbol("aclrtGetResInCurrentThread"); + AclrtGetResInCurrentThreadFunc aclrtGetResInCurrentThread = + (AclrtGetResInCurrentThreadFunc)dl.GetSymbol("aclrtGetResInCurrentThread"); if (aclrtGetResInCurrentThread != nullptr) { int8_t resType = core_type == "VectorCore" ? 
1 : 0; int getResRet = aclrtGetResInCurrentThread(resType, &coreNum); -- Gitee From e198375459cb40e5ba98a743f5149b9d88f8027f Mon Sep 17 00:00:00 2001 From: hongshiyi Date: Thu, 18 Sep 2025 08:58:53 +0800 Subject: [PATCH 14/94] delete extra spaces --- .../tbe_adapter/platform/platform_infos_def.cpp | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/kernels/tbe_adapter/platform/platform_infos_def.cpp b/src/kernels/tbe_adapter/platform/platform_infos_def.cpp index 71b1cbdd..49a5cba5 100644 --- a/src/kernels/tbe_adapter/platform/platform_infos_def.cpp +++ b/src/kernels/tbe_adapter/platform/platform_infos_def.cpp @@ -118,8 +118,8 @@ void PlatFormInfos::SetCoreNumByCoreType(const std::string &core_type) MKI_LOG(DEBUG) << "Get ThreadResource::core_num_ to " << core_type << ": " << coreNum; if (core_num_ == 0 || core_num_ > MAX_CORE_NUM) { MKI_LOG(ERROR) << "core_num is out of range : " << core_num_; - core_num_ = 1; - } + core_num_ = 1; + } return; } else { MKI_LOG(WARN) << "Failed to get thread core num!"; @@ -142,15 +142,15 @@ void PlatFormInfos::SetCoreNumByCoreType(const std::string &core_type) MKI_LOG(ERROR) << "CoreNumStr is empty!"; } else { core_num_ = std::strtoul(coreNumStr.c_str(), nullptr, 10); // 10 进制 - } + } if (core_num_ == 0 || core_num_ > MAX_CORE_NUM) { MKI_LOG(ERROR) << "core_num is out of range : " << core_num_; - core_num_ = 1; - } + core_num_ = 1; + } } uint32_t PlatFormInfos::GetCoreNumByType(const std::string &core_type) -{ +{ uint32_t coreNum = 0; Mki::Dl dl = Mki::Dl(std::string(Mki::GetEnv("ASCEND_HOME_PATH")) + "/runtime/lib64/libascendcl.so", false); aclrtGetResInCurrentThreadFunc aclrtGetResInCurrentThread = (aclrtGetResInCurrentThreadFunc)dl.GetSymbol("aclrtGetResInCurrentThread"); @@ -179,7 +179,7 @@ uint32_t PlatFormInfos::GetCoreNumByType(const std::string &core_type) MKI_LOG(ERROR) << "CoreNumStr is empty!"; return 1; } else { - coreNum = std::strtoul(coreNumStr.c_str(), nullptr, 10); // 10 进制 + coreNum 
= std::strtoul(coreNumStr.c_str(), nullptr, 10); // 10 进制 } if (coreNum > MAX_CORE_NUM) { MKI_LOG(ERROR) << "core_num is out of range : " << coreNum; -- Gitee From bf6c6148b9959ed8716b043aa563de56bffda31d Mon Sep 17 00:00:00 2001 From: hongshiyi Date: Thu, 18 Sep 2025 11:14:59 +0800 Subject: [PATCH 15/94] change header order --- src/kernels/tbe_adapter/platform/platform_infos_def.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/kernels/tbe_adapter/platform/platform_infos_def.cpp b/src/kernels/tbe_adapter/platform/platform_infos_def.cpp index 49a5cba5..c24edb60 100644 --- a/src/kernels/tbe_adapter/platform/platform_infos_def.cpp +++ b/src/kernels/tbe_adapter/platform/platform_infos_def.cpp @@ -9,10 +9,10 @@ */ #include "platform/platform_infos_def.h" +#include #include "mki/utils/dl/dl.h" #include "mki/utils/env/env.h" -#include -#include +#include "mki/utils/log/log.h" #include "platform_infos_impl.h" namespace fe { @@ -127,8 +127,8 @@ void PlatFormInfos::SetCoreNumByCoreType(const std::string &core_type) } else { MKI_LOG(WARN) << "Failed to acl function!"; } - std::string coreNumStr; - std::string coreTypeStr; + std::string coreNumStr = ""; + std::string coreTypeStr = ""; if (core_type == "VectorCore") { coreTypeStr = "vector_core_cnt"; } else { -- Gitee From 96e5a249a06627866326b9334b460e352ed1f03e Mon Sep 17 00:00:00 2001 From: hongshiyi Date: Thu, 18 Sep 2025 11:16:52 +0800 Subject: [PATCH 16/94] init string --- src/kernels/tbe_adapter/platform/platform_infos_def.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/kernels/tbe_adapter/platform/platform_infos_def.cpp b/src/kernels/tbe_adapter/platform/platform_infos_def.cpp index c24edb60..3464a4f6 100644 --- a/src/kernels/tbe_adapter/platform/platform_infos_def.cpp +++ b/src/kernels/tbe_adapter/platform/platform_infos_def.cpp @@ -170,7 +170,7 @@ uint32_t PlatFormInfos::GetCoreNumByType(const std::string &core_type) } else { MKI_LOG(WARN) << "Failed to load 
acl Function!"; } - std::string coreNumStr; + std::string coreNumStr = ""; std::string coreTypeStr = core_type == "VectorCore" ? "vector_core_cnt" : "ai_core_cnt"; std::lock_guard lockGuard(g_asdopsFePlatMutex); (void)GetPlatformRes("SoCInfo", coreTypeStr, coreNumStr); -- Gitee From d677a364ecedb672c673c63f7af3668402d266a0 Mon Sep 17 00:00:00 2001 From: godantshen_ Date: Wed, 17 Sep 2025 16:34:12 +0800 Subject: [PATCH 17/94] icsl modification --- .../kernels/matmul/pp_matmul_i8_kernel/pp_matmul_i8_kernel.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/kernels/kernels/matmul/pp_matmul_i8_kernel/pp_matmul_i8_kernel.cpp b/src/kernels/kernels/matmul/pp_matmul_i8_kernel/pp_matmul_i8_kernel.cpp index e4c4f5cc..82328c5e 100644 --- a/src/kernels/kernels/matmul/pp_matmul_i8_kernel/pp_matmul_i8_kernel.cpp +++ b/src/kernels/kernels/matmul/pp_matmul_i8_kernel/pp_matmul_i8_kernel.cpp @@ -140,6 +140,7 @@ public: Status InitImpl(const LaunchParam &launchParam) override { Status status = PpMatmulTiling(launchParam, kernelInfo_); + MKI_CHECK(status.Ok(), "tiling return invalid value.", return status); kernelInfo_.SetHwsyncIdx(0); return status; } -- Gitee From d5e5cab9d80c2a95f0fa5916caf1d7990e2e4feb Mon Sep 17 00:00:00 2001 From: Hall Date: Thu, 18 Sep 2025 14:50:56 +0800 Subject: [PATCH 18/94] fix(rope): support rope test when rotaryCoeff equals headsize --- example/op_demo/rms_norm/README.md | 22 ++++---- example/op_demo/rope/README.md | 14 ++--- tests/apitest/opstest/csv/rope.csv | 16 +++++- .../python/CsvOpsTestTool/data_generation.py | 54 +++++++++---------- 4 files changed, 60 insertions(+), 46 deletions(-) diff --git a/example/op_demo/rms_norm/README.md b/example/op_demo/rms_norm/README.md index 11826914..7e5a3f73 100644 --- a/example/op_demo/rms_norm/README.md +++ b/example/op_demo/rms_norm/README.md @@ -1,6 +1,6 @@ # 加速库RmsNormOperation C++ Demo ## 介绍 -该目录下为加速库RmsNormOperation C++调用示例,以下示例仅适用于Atlas A2/A3训练系列产品、Atlas 800I A2推理产品、Atlas A3 推理系列产品。 
+该目录下为加速库RmsNormOperation C++调用示例。 ## 使用说明 - 首先source 对应的CANN和nnal包 @@ -46,8 +46,8 @@ tests/apitest/opstest/python/operations/rms_norm/ - **rms_norm_qwen_demo_0.cpp** - 【注】:编译脚本内替换 rms_norm_demo.cpp 为 rms_norm_qwen_demo_0.cpp 可编译运行 - + 【注】:编译脚本内替换 rms_norm_demo.cpp 为 rms_norm_qwen_demo_0.cpp 可编译运行; + 该示例仅适用于Atlas A2/A3训练系列产品、Atlas 800I A2推理产品、Atlas A3 推理系列产品。 **参数设置** | Param | value | | :-----------------: | :--------------------------------------------------: | @@ -70,8 +70,8 @@ tests/apitest/opstest/python/operations/rms_norm/ - **rms_norm_qwen_demo_1.cpp** - 【注】:编译脚本内替换 rms_norm_demo.cpp 为 rms_norm_qwen_demo_1.cpp 可编译运行 - + 【注】:编译脚本内替换 rms_norm_demo.cpp 为 rms_norm_qwen_demo_1.cpp 可编译运行; + 该示例仅适用于Atlas A2/A3训练系列产品、Atlas 800I A2推理产品、Atlas A3 推理系列产品。 **参数设置** | Param | value | | :-----------------: | :--------------------------------------------------: | @@ -94,8 +94,8 @@ tests/apitest/opstest/python/operations/rms_norm/ - **rms_norm_qwen_demo_2.cpp** - 【注】:编译脚本内替换 rms_norm_demo.cpp 为 rms_norm_qwen_demo_2.cpp 可编译运行 - + 【注】:编译脚本内替换 rms_norm_demo.cpp 为 rms_norm_qwen_demo_2.cpp 可编译运行; + 该示例仅适用于Atlas A2/A3训练系列产品、Atlas 800I A2推理产品、Atlas A3 推理系列产品。 **参数设置** | Param | value | | :-----------------: | :--------------------------------------------------: | @@ -118,8 +118,8 @@ tests/apitest/opstest/python/operations/rms_norm/ - **rms_norm_deepseek_demo_0.cpp** - 【注】:编译脚本内替换 rms_norm_demo.cpp 为 rms_norm_deepseek_demo_0.cpp 可编译运行 - + 【注】:编译脚本内替换 rms_norm_demo.cpp 为 rms_norm_deepseek_demo_0.cpp 可编译运行; + 该示例仅适用于Atlas A2/A3训练系列产品、Atlas 800I A2推理产品、Atlas A3 推理系列产品。 **参数设置** | Param | value | | :-----------------: | :--------------------------------------------------: | @@ -142,8 +142,8 @@ tests/apitest/opstest/python/operations/rms_norm/ - **rms_norm_deepseek_demo_1.cpp** - 【注】:编译脚本内替换 rms_norm_demo.cpp 为 rms_norm_deepseek_demo_1.cpp 可编译运行 - + 【注】:编译脚本内替换 rms_norm_demo.cpp 为 rms_norm_deepseek_demo_1.cpp 可编译运行; + 该示例仅适用于Atlas A2/A3训练系列产品、Atlas 800I A2推理产品、Atlas A3 推理系列产品。 **参数设置** 
| Param | value | | :-----------------: | :--------------------------------------------------: | diff --git a/example/op_demo/rope/README.md b/example/op_demo/rope/README.md index 93362e45..6898cb29 100644 --- a/example/op_demo/rope/README.md +++ b/example/op_demo/rope/README.md @@ -1,6 +1,6 @@ # 加速库RopeOperation C++ Demo ## 介绍 -该目录下为加速库RopeOperation C++调用示例, 以下示例仅适用于Atlas A2/A3训练系列产品、Atlas 800I A2推理产品、Atlas A3 推理系列产品。 +该目录下为加速库RopeOperation C++调用示例。 ## 使用说明 - 首先source 对应的CANN和nnal包 @@ -49,8 +49,8 @@ tests/apitest/opstest/python/operations/rope/ - **rope_qwen_demo_0.cpp** - 【注】:编译脚本内替换 rope_demo.cpp 为 rope_qwen_demo_0.cpp 可编译运行 - + 【注】:编译脚本内替换 rope_demo.cpp 为 rope_qwen_demo_0.cpp 可编译运行; + 该示例仅适用于Atlas A2/A3训练系列产品、Atlas 800I A2推理产品、Atlas A3 推理系列产品。 **参数设置** | Param | value | | :---------: | :---: | @@ -76,8 +76,8 @@ tests/apitest/opstest/python/operations/rope/ - **rope_qwen_demo_1.cpp** - 【注】:编译脚本内替换 rope_demo.cpp 为 rope_qwen_demo_1.cpp 可编译运行 - + 【注】:编译脚本内替换 rope_demo.cpp 为 rope_qwen_demo_1.cpp 可编译运行; + 该示例仅适用于Atlas A2/A3训练系列产品、Atlas 800I A2推理产品、Atlas A3 推理系列产品。 **参数设置** | Param | value | | :---------: | :---: | @@ -103,8 +103,8 @@ tests/apitest/opstest/python/operations/rope/ - **rope_qwen_demo_2.cpp** - 【注】:编译脚本内替换 rope_demo.cpp 为 rope_qwen_demo_2.cpp 可编译运行 - + 【注】:编译脚本内替换 rope_demo.cpp 为 rope_qwen_demo_2.cpp 可编译运行; + 该示例仅适用于Atlas A2/A3训练系列产品、Atlas 800I A2推理产品、Atlas A3 推理系列产品。 **参数设置** | Param | value | | :---------: | :---: | diff --git a/tests/apitest/opstest/csv/rope.csv b/tests/apitest/opstest/csv/rope.csv index a24ba7b4..e84c2f23 100644 --- a/tests/apitest/opstest/csv/rope.csv +++ b/tests/apitest/opstest/csv/rope.csv @@ -23,4 +23,18 @@ CaseNum|CaseName |OpName |OpParam |InNum 22 |rsv|RopeOperation|{"rsv":[1]}|0||||0||||||||C:ERROR_INVALID_PARAM||| | 23 |310Brope2Error |RopeOperation|{"rotaryCoeff": 64} |5 |float16;float16;float16;float16;int32 |nd;nd;nd;nd;nd |2,256,32,64;2,256,32,64;512,64;512,64;2 |2 |float16;float16|nd;nd |2,256,32,64;2,256,32,64 
|random;random;customize;customize;random |0,1;0,1;0,1;0,1;256,256 | | |NO_ERROR | |ChatGLM-6B |Ascend310B |Function 24 |310BunpadropeError|RopeOperation|{"rotaryCoeff": 4} |5 |float16;float16;float16;float16;int32 |nd;nd;nd;nd;nd |4,4096;4,4096;4,128;4,128;1 |2 |float16;float16|nd;nd |4,4096;4,4096 |customize;customize;customize;customize;customize|0,1;0,1;0,1;0,1;4,4 | | |NO_ERROR | | |Ascend310B |Function -25 |deepseek |RopeOperation|{"rotaryCoeff": 2} |5 |float16;float16;float16;float16;uint32 |nd;nd;nd;nd;nd |3072,8192;3072,64;3072,64;3072,64;1 |2 |float16;float16|nd;nd |3072,8192;3072,64 |random;random;random;random;random |0,1;0,1;0,1;0,1;2,2 | | |NO_ERROR | |deepseek |Ascend910B |Function \ No newline at end of file +25 |deepseek |RopeOperation|{"rotaryCoeff": 2} |5 |float16;float16;float16;float16;uint32 |nd;nd;nd;nd;nd |3072,8192;3072,64;3072,64;3072,64;1 |2 |float16;float16|nd;nd |3072,8192;3072,64 |random;random;random;random;random |0,1;0,1;0,1;0,1;2,2 | | |NO_ERROR | |deepseek |Ascend910B |Function +26 |rotaryCoeff16 |RopeOperation|{"rotaryCoeff": 16, "cosFormat": 0}|5 |bf16;bf16;bf16;bf16;uint32 |nd;nd;nd;nd;nd |4,448;4,448;4,16;4,16;1 |2 |bf16;bf16 |nd;nd |4,448;4,448 |random;random;customize;customize;random |-100,100;-100,100;-100,100;-100,100;4,4|| |NO_ERROR | | |Ascend910B |Function +27 |rotaryCoeff16 |RopeOperation|{"rotaryCoeff": 16, "cosFormat": 1}|5 |bf16;bf16;bf16;bf16;uint32 |nd;nd;nd;nd;nd |4,448;4,448;4,16;4,16;1 |2 |bf16;bf16 |nd;nd |4,448;4,448 |random;random;customize;customize;random |-100,100;-100,100;-100,100;-100,100;4,4|| |NO_ERROR | | |Ascend910B |Function +28 |rotaryCoeff32 |RopeOperation|{"rotaryCoeff": 32, "cosFormat": 0}|5 |bf16;bf16;bf16;bf16;uint32 |nd;nd;nd;nd;nd |4,1312;4,1312;4,32;4,32;1 |2 |bf16;bf16 |nd;nd |4,1312;4,1312 |random;random;customize;customize;random |-100,100;-100,100;-100,100;-100,100;4,4|| |NO_ERROR | | |Ascend910B |Function +29 |rotaryCoeff32 |RopeOperation|{"rotaryCoeff": 32, "cosFormat": 1}|5 
|bf16;bf16;bf16;bf16;uint32 |nd;nd;nd;nd;nd |4,1312;4,1312;4,32;4,32;1 |2 |bf16;bf16 |nd;nd |4,1312;4,1312 |random;random;customize;customize;random |-100,100;-100,100;-100,100;-100,100;4,4|| |NO_ERROR | | |Ascend910B |Function +30 |rotaryCoeff64 |RopeOperation|{"rotaryCoeff": 64, "cosFormat": 0}|5 |bf16;bf16;bf16;bf16;uint32 |nd;nd;nd;nd;nd |4,4416;4,4416;4,64;4,64;1 |2 |bf16;bf16 |nd;nd |4,4416;4,4416 |random;random;customize;customize;random |-100,100;-100,100;-100,100;-100,100;4,4|| |NO_ERROR | | |Ascend910B |Function +31 |rotaryCoeff64 |RopeOperation|{"rotaryCoeff": 64, "cosFormat": 1}|5 |bf16;bf16;bf16;bf16;uint32 |nd;nd;nd;nd;nd |4,4416;4,4416;4,64;4,64;1 |2 |bf16;bf16 |nd;nd |4,4416;4,4416 |random;random;customize;customize;random |-100,100;-100,100;-100,100;-100,100;4,4|| |NO_ERROR | | |Ascend910B |Function +32 |rotaryCoeff128 |RopeOperation|{"rotaryCoeff": 128, "cosFormat": 0}|5 |bf16;bf16;bf16;bf16;uint32 |nd;nd;nd;nd;nd |4,2048;4,2048;4,128;4,128;1 |2 |bf16;bf16 |nd;nd |4,2048;4,2048 |random;random;customize;customize;random |-100,100;-100,100;-100,100;-100,100;4,4|| |NO_ERROR | | |Ascend910B |Function +33 |rotaryCoeff128 |RopeOperation|{"rotaryCoeff": 128, "cosFormat": 1}|5 |bf16;bf16;bf16;bf16;uint32 |nd;nd;nd;nd;nd |4,2048;4,2048;4,128;4,128;1 |2 |bf16;bf16 |nd;nd |4,2048;4,2048 |random;random;customize;customize;random |-100,100;-100,100;-100,100;-100,100;4,4|| |NO_ERROR | | |Ascend910B |Function +34 |rotaryCoeff256 |RopeOperation|{"rotaryCoeff": 256, "cosFormat": 0}|5 |bf16;bf16;bf16;bf16;uint32 |nd;nd;nd;nd;nd |4,32512;4,32512;4,256;4,256;1 |2 |bf16;bf16 |nd;nd |4,32512;4,32512 |random;random;customize;customize;random |-100,100;-100,100;-100,100;-100,100;4,4|| |NO_ERROR | | |Ascend910B |Function +35 |rotaryCoeff256 |RopeOperation|{"rotaryCoeff": 256, "cosFormat": 1}|5 |bf16;bf16;bf16;bf16;uint32 |nd;nd;nd;nd;nd |4,32512;4,32512;4,256;4,256;1 |2 |bf16;bf16 |nd;nd |4,32512;4,32512 |random;random;customize;customize;random 
|-100,100;-100,100;-100,100;-100,100;4,4|| |NO_ERROR | | |Ascend910B |Function +36 |rotaryCoeff512 |RopeOperation|{"rotaryCoeff": 512, "cosFormat": 0}|5 |bf16;bf16;bf16;bf16;uint32 |nd;nd;nd;nd;nd |4,22528;4,22528;4,512;4,512;1 |2 |bf16;bf16 |nd;nd |4,22528;4,22528 |random;random;customize;customize;random |-100,100;-100,100;-100,100;-100,100;4,4|| |NO_ERROR | | |Ascend910B |Function +37 |rotaryCoeff512 |RopeOperation|{"rotaryCoeff": 512, "cosFormat": 1}|5 |bf16;bf16;bf16;bf16;uint32 |nd;nd;nd;nd;nd |4,22528;4,22528;4,512;4,512;1 |2 |bf16;bf16 |nd;nd |4,22528;4,22528 |random;random;customize;customize;random |-100,100;-100,100;-100,100;-100,100;4,4|| |NO_ERROR | | |Ascend910B |Function +38 |rotaryCoeff1024 |RopeOperation|{"rotaryCoeff": 1024, "cosFormat": 0}|5 |bf16;bf16;bf16;bf16;uint32 |nd;nd;nd;nd;nd |4,55296;4,55296;4,1024;4,1024;1 |2 |bf16;bf16 |nd;nd |4,55296;4,55296 |random;random;customize;customize;random |-100,100;-100,100;-100,100;-100,100;4,4|| |NO_ERROR | | |Ascend910B |Function +39 |rotaryCoeff1024 |RopeOperation|{"rotaryCoeff": 1024, "cosFormat": 1}|5 |bf16;bf16;bf16;bf16;uint32 |nd;nd;nd;nd;nd |4,55296;4,55296;4,1024;4,1024;1 |2 |bf16;bf16 |nd;nd |4,55296;4,55296 |random;random;customize;customize;random |-100,100;-100,100;-100,100;-100,100;4,4|| |NO_ERROR | | |Ascend910B |Function \ No newline at end of file diff --git a/tests/framework/python/CsvOpsTestTool/data_generation.py b/tests/framework/python/CsvOpsTestTool/data_generation.py index 1fcbc6ea..0d4a1df2 100644 --- a/tests/framework/python/CsvOpsTestTool/data_generation.py +++ b/tests/framework/python/CsvOpsTestTool/data_generation.py @@ -3297,7 +3297,7 @@ class ReduceOperation(DataGen): class RopeOperation(DataGen): @staticmethod def customize(shapes, i, datatype, format, data_gen_ranges, op_params): - if (i == 2 or i == 3) and json.loads(op_params)["rotaryCoeff"] == 64: + if (i == 2 or i == 3) and json.loads(op_params)["rotaryCoeff"] != 2 and json.loads(op_params)["rotaryCoeff"] != 4: ntoken 
= shapes[i][0] head_size = shapes[i][1] # op需要cos/sin重复一次 @@ -3396,7 +3396,32 @@ class RopeOperation(DataGen): q = torch.concat([q0, q1], dim=(q0.ndim - 1)).view(ntoken, hidden_size) k = torch.concat([k0, k1], dim=(k0.ndim - 1)).view(ntoken, hidden_size) return [q, k] - elif json_data['rotaryCoeff'] == 64: + elif json_data['rotaryCoeff'] == 2: + dtype = in_tensors[0].dtype + if dtype == torch.bfloat16: + in_tensors[0] = in_tensors[0].to(torch.float32) + in_tensors[1] = in_tensors[1].to(torch.float32) + in_tensors[2] = in_tensors[2].to(torch.float32) + in_tensors[3] = in_tensors[3].to(torch.float32) + ntoken = in_tensors[0].size()[0] + seqlen = int(in_tensors[4][0]) + batch = ntoken // seqlen + if batch == 0: + batch = 1 + seqlen = ntoken + hidden_size = in_tensors[0].size()[1] + hidden_size1 = in_tensors[1].size()[1] + head_size = in_tensors[2].size()[1] + head_num = hidden_size // head_size + head_num1 = hidden_size1 // head_size + q = in_tensors[0].view(batch, seqlen, head_num, head_size) + k = in_tensors[1].view(batch, seqlen, head_num1, head_size) + cos = in_tensors[2].view(batch, seqlen, head_size).unsqueeze(2) + sin = in_tensors[3].view(batch, seqlen, head_size).unsqueeze(2) + q_embed = ((q * cos) + (RopeOperation.rotate_half(q) * sin)).view(ntoken, hidden_size) + k_embed = ((k * cos) + (RopeOperation.rotate_half(k) * sin)).view(ntoken, hidden_size1) + return [q_embed.to(dtype), k_embed.to(dtype)] + else: if len(in_tensors[0].size()) == 4: seqlen = in_tensors[0].size()[1] batch = in_tensors[0].size()[0] @@ -3451,31 +3476,6 @@ class RopeOperation(DataGen): return [q_out2, k_out2] else: return [q_out2.view(ntoken, hidden_sizeq), k_out2.view(ntoken, hidden_sizek)] - else: - dtype = in_tensors[0].dtype - if dtype == torch.bfloat16: - in_tensors[0] = in_tensors[0].to(torch.float32) - in_tensors[1] = in_tensors[1].to(torch.float32) - in_tensors[2] = in_tensors[2].to(torch.float32) - in_tensors[3] = in_tensors[3].to(torch.float32) - ntoken = in_tensors[0].size()[0] 
- seqlen = int(in_tensors[4][0]) - batch = ntoken // seqlen - if batch == 0: - batch = 1 - seqlen = ntoken - hidden_size = in_tensors[0].size()[1] - hidden_size1 = in_tensors[1].size()[1] - head_size = in_tensors[2].size()[1] - head_num = hidden_size // head_size - head_num1 = hidden_size1 // head_size - q = in_tensors[0].view(batch, seqlen, head_num, head_size) - k = in_tensors[1].view(batch, seqlen, head_num1, head_size) - cos = in_tensors[2].view(batch, seqlen, head_size).unsqueeze(2) - sin = in_tensors[3].view(batch, seqlen, head_size).unsqueeze(2) - q_embed = ((q * cos) + (RopeOperation.rotate_half(q) * sin)).view(ntoken, hidden_size) - k_embed = ((k * cos) + (RopeOperation.rotate_half(k) * sin)).view(ntoken, hidden_size1) - return [q_embed.to(dtype), k_embed.to(dtype)] @staticmethod def get_op_type(op_params): -- Gitee From bf23a364a36024b6d361343f3ebe10b2942871f1 Mon Sep 17 00:00:00 2001 From: hongshiyi Date: Thu, 18 Sep 2025 16:33:17 +0800 Subject: [PATCH 19/94] change acl to Acl --- src/kernels/tbe_adapter/platform/platform_infos_def.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/kernels/tbe_adapter/platform/platform_infos_def.cpp b/src/kernels/tbe_adapter/platform/platform_infos_def.cpp index 3464a4f6..7444715d 100644 --- a/src/kernels/tbe_adapter/platform/platform_infos_def.cpp +++ b/src/kernels/tbe_adapter/platform/platform_infos_def.cpp @@ -153,7 +153,7 @@ uint32_t PlatFormInfos::GetCoreNumByType(const std::string &core_type) { uint32_t coreNum = 0; Mki::Dl dl = Mki::Dl(std::string(Mki::GetEnv("ASCEND_HOME_PATH")) + "/runtime/lib64/libascendcl.so", false); - aclrtGetResInCurrentThreadFunc aclrtGetResInCurrentThread = (aclrtGetResInCurrentThreadFunc)dl.GetSymbol("aclrtGetResInCurrentThread"); + AclrtGetResInCurrentThreadFunc aclrtGetResInCurrentThread = (AclrtGetResInCurrentThreadFunc)dl.GetSymbol("aclrtGetResInCurrentThread"); if (aclrtGetResInCurrentThread != nullptr) { int resType = core_type == "VectorCore" ? 
1 : 0; int getResRet = aclrtGetResInCurrentThread(resType, &coreNum); -- Gitee From 6e3bde03081fdfa253980cbd5c2eb3cfc4a4611e Mon Sep 17 00:00:00 2001 From: huangxiaolan Date: Fri, 19 Sep 2025 15:54:37 +0800 Subject: [PATCH 20/94] fix function point --- .../src/tools/socket/lcal_sock_exchange.cpp | 8 +- src/atb/operation/operation_base.cpp | 4 +- src/atb/runner/ops_runner.cpp | 8 +- src/cinterface/atb_acl_fused_add_topk_div.cpp | 4 +- src/cinterface/atb_acl_mla.cpp | 10 +- src/cinterface/atb_acl_mla_preprocess.cpp | 4 +- src/cinterface/atb_acl_paged_cache_load.cpp | 4 +- src/cinterface/atb_acl_ring_mla.cpp | 4 +- .../atb_acl_self_attention_prefix_encoder.cpp | 4 +- src/cinterface/atb_acl_util.cpp | 4 +- .../tiling/faster_gelu_tiling.cpp | 16 +- .../gelu_forward/tiling/gelu_tiling.cpp | 20 +-- .../dynamic_quant_tiling.cpp | 118 +++++++------- .../tiling/cohere_layer_norm_tiling.cpp | 66 ++++---- .../norm/rmsnorm/tiling/rms_norm_tiling.cpp | 30 ++-- .../tiling/fused_add_topk_div_tiling.cpp | 148 +++++++++--------- .../tiling/mla_preprocess_tiling.cpp | 76 ++++----- .../tiling/mla_tiling_dependency.cpp | 4 +- .../tiling/ring_mla_tiling_dependency.cpp | 4 +- .../mixkernels/rope/tiling/rope_tiling.cpp | 14 +- .../tiling/rope_q_concat_tiling.cpp | 30 ++-- .../tiling/swi_glu_quant_tiling.cpp | 46 +++--- .../tiling/swi_glu_quant_tiling_utils.h | 22 +-- .../tbe_adapter/platform/platform_ascendc.cpp | 6 +- 24 files changed, 328 insertions(+), 326 deletions(-) diff --git a/comm/lcal/src/tools/socket/lcal_sock_exchange.cpp b/comm/lcal/src/tools/socket/lcal_sock_exchange.cpp index 552fde6b..ff5dec47 100644 --- a/comm/lcal/src/tools/socket/lcal_sock_exchange.cpp +++ b/comm/lcal/src/tools/socket/lcal_sock_exchange.cpp @@ -335,7 +335,7 @@ void LcalSockExchange::Cleanup() } } -int GetAddrFromString(LcalSocketAddress* ua, const char* ipPortPair) +int GetAddrFromString(LcalSocketAddress& ua, const char* ipPortPair) { std::string ip; uint16_t port; @@ -344,9 +344,9 @@ int 
GetAddrFromString(LcalSocketAddress* ua, const char* ipPortPair) MKI_LOG(ERROR) << "lcal ParseIpAndPort failed!"; return LCAL_ERROR_INTERNAL; } - ua->sin.sin_family = AF_INET; - ua->sin.sin_addr.s_addr = inet_addr(ip.c_str()); - ua->sin.sin_port = htons(port); + ua.sin.sin_family = AF_INET; + ua.sin.sin_addr.s_addr = inet_addr(ip.c_str()); + ua.sin.sin_port = htons(port); return LCAL_SUCCESS; } diff --git a/src/atb/operation/operation_base.cpp b/src/atb/operation/operation_base.cpp index 8668f9e5..b3afa56d 100644 --- a/src/atb/operation/operation_base.cpp +++ b/src/atb/operation/operation_base.cpp @@ -1064,12 +1064,12 @@ Status OperationBase::GraphModeLaunch() } Status OperationBase::Execute(const VariantPack &variantPack, uint8_t *workspace, uint64_t workspaceSize, - Context *context) + Context &context) { const uint64_t beginTime = GetSingleton().GetProfilingLevel0Status() ? GetSingleton().ProfSysCycleTime() : 0; - ExecuteType executeType = context->GetExecuteType(); + ExecuteType executeType = context.GetExecuteType(); ProfilingFuncName profType = executeType == EXECUTE_NORMAL ? OPERATION_EXECUTE : (executeType == EXECUTE_PRELAUNCH ? 
OPERATION_PRELAUNCH : OPERATION_LAUNCH); diff --git a/src/atb/runner/ops_runner.cpp b/src/atb/runner/ops_runner.cpp index e181a319..28347ad8 100644 --- a/src/atb/runner/ops_runner.cpp +++ b/src/atb/runner/ops_runner.cpp @@ -279,9 +279,9 @@ Status OpsRunner::FillHostTilingBufferImpl(uint8_t *hostTilingBuffer, uint64_t t Status OpsRunner::FillSingleKernelHostTilingBuffer(KernelGraphNode &node, size_t nodeId, uint8_t *kernelHostTilingBuffer, size_t tilingSize, - ContextBase *context) + ContextBase &context) { - bool ifGraphLaunchNeedCalcTiling = needKernelGraphModify_ && (context->GetLaunchMode() == GRAPH_LAUNCH_MODE); + bool ifGraphLaunchNeedCalcTiling = needKernelGraphModify_ && (context.GetLaunchMode() == GRAPH_LAUNCH_MODE); if (node.impl->GetTilingFilledFlag() && !ifGraphLaunchNeedCalcTiling) { return NO_ERROR; } @@ -289,7 +289,7 @@ Status OpsRunner::FillSingleKernelHostTilingBuffer(KernelGraphNode &node, size_t ATB_LOG(DEBUG) << GetLogPrefix() << " node[" << nodeId << "] InitHostLaunchBuffer start"; GetOpSetupStatistic().tilingCacheMissCount += 1; Mki::Timer fillTimer; - bool launchWithTiling = context->GetLaunchWithTilingStatus(); + bool launchWithTiling = context.GetLaunchWithTilingStatus(); Status status = node.impl->InitKernelInfo(kernelHostTilingBuffer, tilingSize, launchWithTiling); if (status != NO_ERROR) { ATB_LOG(ERROR) << GetLogPrefix() << " node[" << nodeId << "] InitRunInfo failed!"; @@ -300,7 +300,7 @@ Status OpsRunner::FillSingleKernelHostTilingBuffer(KernelGraphNode &node, size_t ATB_LOG(DEBUG) << GetLogPrefix() << " node[" << nodeId << "] InitHostLaunchBuffer end, time:" << fillTime; UpdateCacheTiling(node, nodeId, kernelHostTilingBuffer, tilingSize); - if (context->GetLaunchMode() == GRAPH_LAUNCH_MODE) { + if (context.GetLaunchMode() == GRAPH_LAUNCH_MODE) { // 整图下发模式下绝大部分算子tiling只需计算一次,少部分需要多次计算的用needKernelGraphModify_进行标记 node.impl->SetTilingFilledFlag(true); } diff --git a/src/cinterface/atb_acl_fused_add_topk_div.cpp 
b/src/cinterface/atb_acl_fused_add_topk_div.cpp index 65f84243..fbbfca90 100644 --- a/src/cinterface/atb_acl_fused_add_topk_div.cpp +++ b/src/cinterface/atb_acl_fused_add_topk_div.cpp @@ -21,7 +21,7 @@ atb::Status AtbFusedAddTopkDivGetWorkspaceSize(const aclTensor *x, const aclTens const aclTensor *mappingTable, uint32_t groupNum, uint32_t groupTopk, uint32_t n, uint32_t k, int activationType, bool isNorm, float scale, bool enableExpertMapping, aclTensor *y, aclTensor *indices, - uint64_t *workspaceSize, atb::Operation **op, atb::Context *context) + uint64_t &workspaceSize, atb::Operation **op, atb::Context *context) { atb::infer::FusedAddTopkDivParam param; param.groupNum = groupNum; @@ -69,7 +69,7 @@ atb::Status AtbFusedAddTopkDivGetWorkspaceSize(const aclTensor *x, const aclTens ATB_LOG(ERROR) << "AtbFusedAddTopkDivGetWorkspaceSize opeartion pointer is nullptr!"; return atb::ERROR_INVALID_OPERATION_ADDR; } - status = (*op)->Setup(pack, *workspaceSize, context); + status = (*op)->Setup(pack, workspaceSize, context); ATB_CHECK(status == atb::NO_ERROR, "AtbFusedAddTopkDiv Setup failed!", return status); return atb::NO_ERROR; } diff --git a/src/cinterface/atb_acl_mla.cpp b/src/cinterface/atb_acl_mla.cpp index e050b04c..1b115833 100644 --- a/src/cinterface/atb_acl_mla.cpp +++ b/src/cinterface/atb_acl_mla.cpp @@ -27,7 +27,7 @@ atb::Status AtbMLAGetWorkspaceSize(const aclTensor *qNope, const aclTensor *qRop const aclTensor *mask, const aclTensor *qSeqLen, const aclTensor *qkDescale, const aclTensor *pvDescale, int32_t headNum, float qkScale, int32_t kvHeadNum, int maskType, int calcType, uint8_t cacheMode, aclTensor *attenOut, aclTensor *lse, - uint64_t *workspaceSize, atb::Operation **op, atb::Context *context) + uint64_t &workspaceSize, atb::Operation **op, atb::Context *context) { atb::infer::MultiLatentAttentionParam param; param.headNum = headNum; @@ -109,13 +109,15 @@ atb::Status AtbMLAGetWorkspaceSize(const aclTensor *qNope, const aclTensor *qRop ATB_LOG(ERROR) 
<< "AtbMLAGetWorkspaceSize opeartion pointer is nullptr!"; return atb::ERROR_INVALID_OPERATION_ADDR; } - atb::Status st = (*op)->Setup(pack, *workspaceSize, context); + atb::Status st = (*op)->Setup(pack, workspaceSize, context); ATB_CHECK(st == atb::NO_ERROR, "AtbMLA Setup failed!", return st); return atb::NO_ERROR; } atb::Status AtbMLA(void *workSpcace, uint64_t workspaceSize, atb::Operation *op, atb::Context *context) { + ATB_CHECK(op != nullptr, "AtbMLA expect op pointer not to be null!", + return atb::ERROR_INVALID_OPERATION_ADDR); atb::VariantPack pack; atb::Status st = op->Execute(pack, (uint8_t *)(workSpcace), workspaceSize, context); ATB_CHECK(st == atb::NO_ERROR, "AtbMLA Execute failed!", return st); @@ -127,7 +129,7 @@ atb::Status AtbMLAPreFillGetWorkspaceSize(const aclTensor *q, const aclTensor *q const aclTensor *kRope, const aclTensor *v, const aclTensor *qSeqLen, const aclTensor *kvSeqLen, const aclTensor *mask, int32_t headNum, float qkScale, int32_t kvHeadNum, int maskType, uint8_t cacheMode, - aclTensor *attenOut, uint64_t *workspaceSize, atb::Operation **op, + aclTensor *attenOut, uint64_t &workspaceSize, atb::Operation **op, atb::Context *context) { atb::infer::MultiLatentAttentionParam param; @@ -181,7 +183,7 @@ atb::Status AtbMLAPreFillGetWorkspaceSize(const aclTensor *q, const aclTensor *q ATB_LOG(ERROR) << "AtbMLAPreFillGetWorkspaceSize opeartion pointer is nullptr!"; return atb::ERROR_INVALID_OPERATION_ADDR; } - atb::Status st = (*op)->Setup(pack, *workspaceSize, context); + atb::Status st = (*op)->Setup(pack, workspaceSize, context); ATB_CHECK(st == atb::NO_ERROR, "AtbMLAPreFill Setup failed!", return st); return atb::NO_ERROR; } diff --git a/src/cinterface/atb_acl_mla_preprocess.cpp b/src/cinterface/atb_acl_mla_preprocess.cpp index 32bd22c6..be40ee8a 100644 --- a/src/cinterface/atb_acl_mla_preprocess.cpp +++ b/src/cinterface/atb_acl_mla_preprocess.cpp @@ -28,7 +28,7 @@ atb::Status AtbMLAPreprocessGetWorkspaceSize( const aclTensor 
*kvCacheRope, const aclTensor *slotmapping, const aclTensor *ctkvScale, const aclTensor *qNopeScale, uint32_t wdqDim, uint32_t qRopeDim, uint32_t kRopeDim, float epsilon, uint32_t qRotaryCoeff, uint32_t kRotaryCoeff, bool transposeWdq, bool transposeWuq, bool transposeWuk, uint8_t cacheMode, uint16_t quantMode, aclTensor *qOut0, - aclTensor *kvCacheOut0, aclTensor *qOut1, aclTensor *kvCacheOut1, uint64_t *workspaceSize, atb::Operation **op, + aclTensor *kvCacheOut0, aclTensor *qOut1, aclTensor *kvCacheOut1, uint64_t &workspaceSize, atb::Operation **op, atb::Context *context) { atb::infer::MlaPreprocessParam param; @@ -159,7 +159,7 @@ atb::Status AtbMLAPreprocessGetWorkspaceSize( ATB_LOG(ERROR) << "AtbMLAPreprocessGetWorkspaceSize opeartion pointer is nullptr!"; return atb::ERROR_INVALID_OPERATION_ADDR; } - atb::Status st = (*op)->Setup(pack, *workspaceSize, context); + atb::Status st = (*op)->Setup(pack, workspaceSize, context); ATB_CHECK(st == atb::NO_ERROR, "AtbMLAPreprocess Setup failed!", return st); return atb::NO_ERROR; } diff --git a/src/cinterface/atb_acl_paged_cache_load.cpp b/src/cinterface/atb_acl_paged_cache_load.cpp index df6d86d5..834c8f4d 100644 --- a/src/cinterface/atb_acl_paged_cache_load.cpp +++ b/src/cinterface/atb_acl_paged_cache_load.cpp @@ -22,7 +22,7 @@ atb::Status AtbPagedCacheLoadGetWorkspaceSize(const aclTensor *keyCache, const a const aclTensor *blockTables, const aclTensor *contextLens, const aclTensor *key, const aclTensor *value, const aclTensor *seqStarts, int8_t kvCacheCfg, bool isSeqLensCumsumType, bool hasSeqStarts, - uint64_t *workspaceSize, atb::Operation **op, atb::Context *context) + uint64_t &workspaceSize, atb::Operation **op, atb::Context *context) { atb::infer::PagedCacheLoadParam param; param.kvCacheCfg = atb::infer::PagedCacheLoadParam::KvCacheCfg(kvCacheCfg); @@ -72,7 +72,7 @@ atb::Status AtbPagedCacheLoadGetWorkspaceSize(const aclTensor *keyCache, const a ATB_LOG(ERROR) << "AtbPagedCacheLoadGetWorkspaceSize opeartion 
pointer is nullptr!"; return atb::ERROR_INVALID_OPERATION_ADDR; } - atb::Status st = (*op)->Setup(pack, *workspaceSize, context); + atb::Status st = (*op)->Setup(pack, workspaceSize, context); ATB_CHECK(st == atb::NO_ERROR, "AtbPagedCacheLoad Setup failed!", return st); return atb::NO_ERROR; } diff --git a/src/cinterface/atb_acl_ring_mla.cpp b/src/cinterface/atb_acl_ring_mla.cpp index 62468810..677fcf1b 100644 --- a/src/cinterface/atb_acl_ring_mla.cpp +++ b/src/cinterface/atb_acl_ring_mla.cpp @@ -23,7 +23,7 @@ atb::Status AtbRingMLAGetWorkspaceSize(const aclTensor *querySplit1, const aclTe const aclTensor *mask, const aclTensor *seqLen, const aclTensor *prevOut, const aclTensor *prevLse, int32_t headNum, int32_t kvHeadNum, float qkScale, int kernelType, int maskType, int inputLayout, int calcType, aclTensor *output, - aclTensor *softmaxLse, uint64_t *workspaceSize, atb::Operation **op, + aclTensor *softmaxLse, uint64_t &workspaceSize, atb::Operation **op, atb::Context *context) { atb::infer::RingMLAParam param; @@ -80,7 +80,7 @@ atb::Status AtbRingMLAGetWorkspaceSize(const aclTensor *querySplit1, const aclTe ATB_LOG(ERROR) << "AtbRingMLAGetWorkspaceSize opeartion pointer is nullptr!"; return atb::ERROR_INVALID_OPERATION_ADDR; } - status = (*op)->Setup(pack, *workspaceSize, context); + status = (*op)->Setup(pack, workspaceSize, context); ATB_CHECK(status == atb::NO_ERROR, "AtbRingMLA Setup failed!", return status); return atb::NO_ERROR; } diff --git a/src/cinterface/atb_acl_self_attention_prefix_encoder.cpp b/src/cinterface/atb_acl_self_attention_prefix_encoder.cpp index 73e4e366..ffc3e5df 100644 --- a/src/cinterface/atb_acl_self_attention_prefix_encoder.cpp +++ b/src/cinterface/atb_acl_self_attention_prefix_encoder.cpp @@ -23,7 +23,7 @@ atb::Status AtbSelfAttentionPrefixEncoderGetWorkspaceSize(const aclTensor *query const aclTensor *mask, const aclTensor *seqLen, const aclTensor *kvSeqLen, const aclTensor *slopes, int maskType, int32_t headNum, int32_t kvHeadNum, - 
float qkScale, aclTensor *attnOut, uint64_t *workspaceSize, + float qkScale, aclTensor *attnOut, uint64_t &workspaceSize, atb::Operation **op, atb::Context *context) { atb::infer::SelfAttentionParam param; @@ -94,7 +94,7 @@ atb::Status AtbSelfAttentionPrefixEncoderGetWorkspaceSize(const aclTensor *query ATB_LOG(ERROR) << "AtbSelfAttentionPrefixEncoderGetWorkspaceSize opeartion pointer is nullptr!"; return atb::ERROR_INVALID_OPERATION_ADDR; } - status = (*op)->Setup(pack, *workspaceSize, context); + status = (*op)->Setup(pack, workspaceSize, context); ATB_CHECK(status == atb::NO_ERROR, "AtbSelfAttentionPrefixEncoder Setup failed!", return status); return atb::NO_ERROR; } diff --git a/src/cinterface/atb_acl_util.cpp b/src/cinterface/atb_acl_util.cpp index d4061128..c4b28caf 100644 --- a/src/cinterface/atb_acl_util.cpp +++ b/src/cinterface/atb_acl_util.cpp @@ -17,9 +17,9 @@ extern "C" { // 256GB const int64_t MAX_TENSOR_SIZE = 256uLL * 1024uLL * 1024uLL * 1024uLL; -int64_t GetTensorSize(const aclTensor *input) +int64_t GetTensorSize(const aclTensor &input) { - const op::Shape shape = input->GetViewShape(); + const op::Shape shape = input.GetViewShape(); const size_t dims = shape.GetDimNum(); int64_t size = 1; for (size_t i = 0; i < dims; ++i) { diff --git a/src/kernels/kernels/activation/faster_gelu_forward/tiling/faster_gelu_tiling.cpp b/src/kernels/kernels/activation/faster_gelu_forward/tiling/faster_gelu_tiling.cpp index 9f65346d..fec67067 100644 --- a/src/kernels/kernels/activation/faster_gelu_forward/tiling/faster_gelu_tiling.cpp +++ b/src/kernels/kernels/activation/faster_gelu_forward/tiling/faster_gelu_tiling.cpp @@ -23,7 +23,7 @@ constexpr uint32_t TQUE_NUM = 2; constexpr uint32_t UB_RESERVED_BUFF = 0; // reserve ubSize constexpr uint32_t ALIGN_SIZE = 32; -void CalcVectorTiling512Align(const LaunchParam &launchParam, FasterGeluForwardTilingData *tilingDataPtr, +void CalcVectorTiling512Align(const LaunchParam &launchParam, FasterGeluForwardTilingData 
&tilingDataPtr, uint32_t &usedCoreNum) { uint64_t dataLen = static_cast(launchParam.GetInTensor(0).Numel()); @@ -71,20 +71,20 @@ void CalcVectorTiling512Align(const LaunchParam &launchParam, FasterGeluForwardT // 每个核计算的block_length 最均匀的分法 uint64_t baseBlockLength = dataLenAlign / (usedCoreNum * packLen) * packLen; // 搬运向下512B对齐 uint64_t resDataLenAlign = dataLenAlign - usedCoreNum * baseBlockLength; - tilingDataPtr->usedCoreNum = usedCoreNum; - std::fill(tilingDataPtr->singleCoreDataLen, tilingDataPtr->singleCoreDataLen + usedCoreNum, baseBlockLength); + tilingDataPtr.usedCoreNum = usedCoreNum; + std::fill(tilingDataPtr.singleCoreDataLen, tilingDataPtr.singleCoreDataLen + usedCoreNum, baseBlockLength); uint32_t index = 0; for (uint32_t i = packLen; i <= resDataLenAlign; i += packLen) { - tilingDataPtr->singleCoreDataLen[index % usedCoreNum] += packLen; + tilingDataPtr.singleCoreDataLen[index % usedCoreNum] += packLen; index++; } - tilingDataPtr->singleCoreDataLen[usedCoreNum - 1] += resDataLenAlign % packLen; + tilingDataPtr.singleCoreDataLen[usedCoreNum - 1] += resDataLenAlign % packLen; - tilingDataPtr->maxTileLen = - availableUB > tilingDataPtr->singleCoreDataLen[0] ? tilingDataPtr->singleCoreDataLen[0] : availableUB; + tilingDataPtr.maxTileLen = + availableUB > tilingDataPtr.singleCoreDataLen[0] ? tilingDataPtr.singleCoreDataLen[0] : availableUB; // 如果只用一个核心,对齐数量置为0,防止核内计算偏移时访问非法内存 - tilingDataPtr->alignDataNum = usedCoreNum > 1 ? alignDataNum : 0; + tilingDataPtr.alignDataNum = usedCoreNum > 1 ? 
alignDataNum : 0; } Status FasterGeluForwardTiling(const LaunchParam &launchParam, KernelInfo &kernelInfo) diff --git a/src/kernels/kernels/activation/gelu_forward/tiling/gelu_tiling.cpp b/src/kernels/kernels/activation/gelu_forward/tiling/gelu_tiling.cpp index 9a842c4c..f6b3c3b4 100644 --- a/src/kernels/kernels/activation/gelu_forward/tiling/gelu_tiling.cpp +++ b/src/kernels/kernels/activation/gelu_forward/tiling/gelu_tiling.cpp @@ -31,9 +31,9 @@ using namespace Mki; 3. 数据在core上能放的下 4. 数据在core上放不下,多次循环,不同的length */ -bool FillTilingParam(const LaunchParam &launchParam, GeluForwardTilingData *tilingDataPtr, uint32_t &coreNum) +bool FillTilingParam(const LaunchParam &launchParam, GeluForwardTilingData &tilingDataPtr, uint32_t &coreNum) { - tilingDataPtr->bufferNum = AsdOps::GELU_FORWARD_BUFF_NUM; + tilingDataPtr.bufferNum = AsdOps::GELU_FORWARD_BUFF_NUM; // 获取可用核数 coreNum = PlatformInfo::Instance().GetCoreNum(CoreType::CORE_TYPE_VECTOR); // UB空间大小,输入数据信息 @@ -61,19 +61,19 @@ bool FillTilingParam(const LaunchParam &launchParam, GeluForwardTilingData *tili alignSize * alignSize; // 判断是否为小shape来决定是否重置单次搬运数据 if (alignDataLen <= maxPerElemBytes * GELU_FORWARD_BUFF_NUM * coreNum) { - tilingDataPtr->bufferNum = 1; + tilingDataPtr.bufferNum = 1; maxPerElemBytes = maxPerElemBytes * NUM_2; } - tilingDataPtr->blockLength = (static_cast(launchParam.GetInTensor(0).Numel()) + coreNum - 1) / coreNum; - tilingDataPtr->blockLength = (tilingDataPtr->blockLength + alignSize - 1) / alignSize * alignSize; + tilingDataPtr.blockLength = (static_cast(launchParam.GetInTensor(0).Numel()) + coreNum - 1) / coreNum; + tilingDataPtr.blockLength = (tilingDataPtr.blockLength + alignSize - 1) / alignSize * alignSize; // 每个核要算的数据能否在UB上放的下 - tilingDataPtr->tileNum = tilingDataPtr->blockLength / maxPerElemBytes; - tilingDataPtr->tailLength = tilingDataPtr->blockLength % maxPerElemBytes; - if (tilingDataPtr->tileNum == 0) { - tilingDataPtr->tileLength = tilingDataPtr->tailLength; + tilingDataPtr.tileNum 
= tilingDataPtr.blockLength / maxPerElemBytes; + tilingDataPtr.tailLength = tilingDataPtr.blockLength % maxPerElemBytes; + if (tilingDataPtr.tileNum == 0) { + tilingDataPtr.tileLength = tilingDataPtr.tailLength; } else { - tilingDataPtr->tileLength = maxPerElemBytes; + tilingDataPtr.tileLength = maxPerElemBytes; } return true; } diff --git a/src/kernels/kernels/elewise/dynamic_quant/dynamic_quant_tiling/dynamic_quant_tiling.cpp b/src/kernels/kernels/elewise/dynamic_quant/dynamic_quant_tiling/dynamic_quant_tiling.cpp index bd67e59e..ff72a00e 100644 --- a/src/kernels/kernels/elewise/dynamic_quant/dynamic_quant_tiling/dynamic_quant_tiling.cpp +++ b/src/kernels/kernels/elewise/dynamic_quant/dynamic_quant_tiling/dynamic_quant_tiling.cpp @@ -37,8 +37,8 @@ uint64_t ComputeTilingKey(uint32_t alignType, const LaunchParam &launchParam) return tilingKey; } -Status ParseShape(const LaunchParam &launchParam, DynamicQuantTilingData *tilingDataPtr, - uint64_t *rowNumTotal) +Status ParseShape(const LaunchParam &launchParam, DynamicQuantTilingData &tilingDataPtr, + uint64_t &rowNumTotal) { const Mki::SVector &shape = launchParam.GetInTensor(0).desc.dims; MKI_CHECK(!shape.empty(), "shape should not be empty", @@ -50,19 +50,19 @@ Status ParseShape(const LaunchParam &launchParam, DynamicQuantTilingData *tiling MKI_CHECK(shape[i] > 0 && *rowNumTotal < static_cast(UINT32_MAX / shape[i]), "rowNumTotal or shape is invalid!", return Status::FailStatus(ERROR_INVALID_VALUE, "rowNumTotal or shape is invalid!")); - *rowNumTotal *= shape[i]; + rowNumTotal *= shape[i]; } else { - tilingDataPtr->sizeH = shape[i]; + tilingDataPtr.sizeH = shape[i]; } } if (launchParam.GetInTensor(0).desc.dtype == TENSOR_DTYPE_BF16) { - MKI_CHECK(tilingDataPtr->sizeH <= DYNAMIC_QUANT_BF16_LAST_DIM_LIMITATION, + MKI_CHECK(tilingDataPtr.sizeH <= DYNAMIC_QUANT_BF16_LAST_DIM_LIMITATION, "Ascend910B BF16 input last dim is bigger than limitation!", return Status::FailStatus(ERROR_INVALID_VALUE, "Ascend910B BF16 input last 
dim is bigger than limitation!")); } if (PlatformInfo::Instance().GetPlatformType() == PlatformType::ASCEND_310P) { - MKI_CHECK(tilingDataPtr->sizeH <= DYNAMIC_QUANT_FP16_LAST_DIM_LIMITATION_310P, + MKI_CHECK(tilingDataPtr.sizeH <= DYNAMIC_QUANT_FP16_LAST_DIM_LIMITATION_310P, "Ascend310P F16 input last dim is bigger than limitation!", return Status::FailStatus(ERROR_INVALID_VALUE, "Ascend310P F16 input last dim is bigger than limitation!")); @@ -78,33 +78,33 @@ Status ParseShape(const LaunchParam &launchParam, DynamicQuantTilingData *tiling * 3. sizeH > 512 -> numCopyRow: 64 ; 8 <= sizeH < 64 -> numCopyRow: 192 * 64 <= sizeH <= 512 -> numCopyRow: Utils::RoundUp(213 - sizeX * 2 / 7, 8) */ -void SetSuitNumCopyRow(DynamicQuantTilingData *tilingDataPtr) +void SetSuitNumCopyRow(DynamicQuantTilingData &tilingDataPtr) { - tilingDataPtr->sizeX = Utils::RoundUp(tilingDataPtr->sizeH, DYNAMIC_QUANT_ALIGN_NUM_X); - tilingDataPtr->sizeZOut = Utils::RoundUp(tilingDataPtr->sizeH); + tilingDataPtr.sizeX = Utils::RoundUp(tilingDataPtr.sizeH, DYNAMIC_QUANT_ALIGN_NUM_X); + tilingDataPtr.sizeZOut = Utils::RoundUp(tilingDataPtr.sizeH); uint32_t ubSize = PlatformInfo::Instance().GetUbSize(); - tilingDataPtr->numCopyRow = (ubSize - tilingDataPtr->sizeX * DYNAMIC_QUANT_FP16_BUF_SCALE - \ - DYNAMIC_QUANT_HEADSPACE) / (tilingDataPtr->sizeX * DYNAMIC_QUANT_COPY_ROW_SCALE); - MKI_LOG(INFO) << "numCopyRow = " << tilingDataPtr->numCopyRow; - uint32_t rowSuit = DYNAMIC_QUANT_ROW_SUIT_ADD - tilingDataPtr->sizeX * \ + tilingDataPtr.numCopyRow = (ubSize - tilingDataPtr.sizeX * DYNAMIC_QUANT_FP16_BUF_SCALE - \ + DYNAMIC_QUANT_HEADSPACE) / (tilingDataPtr.sizeX * DYNAMIC_QUANT_COPY_ROW_SCALE); + MKI_LOG(INFO) << "numCopyRow = " << tilingDataPtr.numCopyRow; + uint32_t rowSuit = DYNAMIC_QUANT_ROW_SUIT_ADD - tilingDataPtr.sizeX * \ DYNAMIC_QUANT_ROW_SUIT_MUL / DYNAMIC_QUANT_ROW_SUIT_DIV; rowSuit = rowSuit - rowSuit % DYNAMIC_QUANT_ALIGN_NUM_SCALE; - if (tilingDataPtr->numCopyRow > 
DYNAMIC_QUANT_COPY_ROW_LONG && - tilingDataPtr->sizeX >= DYNAMIC_QUANT_LEN_H_LONG) { - tilingDataPtr->numCopyRow = DYNAMIC_QUANT_COPY_ROW_LONG; - } else if (tilingDataPtr->numCopyRow > rowSuit && rowSuit > DYNAMIC_QUANT_ALIGN_NUM_SCALE && - tilingDataPtr->sizeX >= DYNAMIC_QUANT_LEN_H_SHORT) { - tilingDataPtr->numCopyRow = rowSuit; - } else if (tilingDataPtr->numCopyRow > DYNAMIC_QUANT_COPY_ROW_SHORT && - tilingDataPtr->sizeX < DYNAMIC_QUANT_LEN_H_SHORT && - tilingDataPtr->sizeX > DYNAMIC_QUANT_ALIGN_NUM_SCALE) { - tilingDataPtr->numCopyRow = DYNAMIC_QUANT_COPY_ROW_SHORT; - } else if (tilingDataPtr->numCopyRow > DYNAMIC_QUANT_ALIGN_NUM_SCALE) { - tilingDataPtr->numCopyRow = tilingDataPtr->numCopyRow - tilingDataPtr->numCopyRow % \ + if (tilingDataPtr.numCopyRow > DYNAMIC_QUANT_COPY_ROW_LONG && + tilingDataPtr.sizeX >= DYNAMIC_QUANT_LEN_H_LONG) { + tilingDataPtr.numCopyRow = DYNAMIC_QUANT_COPY_ROW_LONG; + } else if (tilingDataPtr.numCopyRow > rowSuit && rowSuit > DYNAMIC_QUANT_ALIGN_NUM_SCALE && + tilingDataPtr.sizeX >= DYNAMIC_QUANT_LEN_H_SHORT) { + tilingDataPtr.numCopyRow = rowSuit; + } else if (tilingDataPtr.numCopyRow > DYNAMIC_QUANT_COPY_ROW_SHORT && + tilingDataPtr.sizeX < DYNAMIC_QUANT_LEN_H_SHORT && + tilingDataPtr.sizeX > DYNAMIC_QUANT_ALIGN_NUM_SCALE) { + tilingDataPtr.numCopyRow = DYNAMIC_QUANT_COPY_ROW_SHORT; + } else if (tilingDataPtr.numCopyRow > DYNAMIC_QUANT_ALIGN_NUM_SCALE) { + tilingDataPtr.numCopyRow = tilingDataPtr.numCopyRow - tilingDataPtr.numCopyRow % \ DYNAMIC_QUANT_ALIGN_NUM_SCALE; } - MKI_LOG(INFO) << "numCopyRow = " << tilingDataPtr->numCopyRow; + MKI_LOG(INFO) << "numCopyRow = " << tilingDataPtr.numCopyRow; } /** @@ -116,44 +116,44 @@ void SetSuitNumCopyRow(DynamicQuantTilingData *tilingDataPtr) * numCopyRow > alignRowNum, perRowNum < alignRowNum -> numCopyRow = perRowNum * numCopyRow > perRowNum, perRowNum < 8 -> numCopyRow = perRowNum */ -Status CorrectNumCopyRow(DynamicQuantTilingData *tilingDataPtr, uint64_t rowNumTotal) +Status 
CorrectNumCopyRow(DynamicQuantTilingData &tilingDataPtr, uint64_t rowNumTotal) { - uint32_t perRowNum = Utils::CeilDiv(static_cast(rowNumTotal), tilingDataPtr->numCore); + uint32_t perRowNum = Utils::CeilDiv(static_cast(rowNumTotal), tilingDataPtr.numCore); uint32_t alignRowNum = Utils::RoundUp(perRowNum, DYNAMIC_QUANT_ALIGN_NUM_SCALE); MKI_LOG(INFO) << "perRowNum = " << perRowNum; if (PlatformInfo::Instance().GetPlatformType() == PlatformType::ASCEND_310P) { - tilingDataPtr->alignType = DYNAMIC_QUANT_STATUS_UNALIGN_310P; - if (tilingDataPtr->numCopyRow >= DYNAMIC_QUANT_ALIGN_NUM_SCALE && + tilingDataPtr.alignType = DYNAMIC_QUANT_STATUS_UNALIGN_310P; + if (tilingDataPtr.numCopyRow >= DYNAMIC_QUANT_ALIGN_NUM_SCALE && perRowNum <= DYNAMIC_QUANT_ALIGN_NUM_SCALE) { - tilingDataPtr->numCopyRow = DYNAMIC_QUANT_ALIGN_NUM_SCALE; - } else if (tilingDataPtr->numCopyRow >= alignRowNum) { - tilingDataPtr->numCopyRow = alignRowNum; + tilingDataPtr.numCopyRow = DYNAMIC_QUANT_ALIGN_NUM_SCALE; + } else if (tilingDataPtr.numCopyRow >= alignRowNum) { + tilingDataPtr.numCopyRow = alignRowNum; } - if (tilingDataPtr->sizeH % DYNAMIC_QUANT_ALIGN_SIZE != 0 || - tilingDataPtr->numCopyRow < DYNAMIC_QUANT_ALIGN_NUM_SCALE) { + if (tilingDataPtr.sizeH % DYNAMIC_QUANT_ALIGN_SIZE != 0 || + tilingDataPtr.numCopyRow < DYNAMIC_QUANT_ALIGN_NUM_SCALE) { return Status::FailStatus(ERROR_INVALID_VALUE, "Ascend310P input last dim must 64Byte alignment"); } } else { - tilingDataPtr->alignType = DYNAMIC_QUANT_STATUS_UNALIGN_910B; - if (perRowNum <= 0 && tilingDataPtr->numCopyRow > 0) { - tilingDataPtr->numCopyRow = 1; - } else if (tilingDataPtr->numCopyRow > alignRowNum && perRowNum > alignRowNum) { - tilingDataPtr->numCopyRow = alignRowNum; - } else if (tilingDataPtr->numCopyRow > alignRowNum && perRowNum < alignRowNum) { - tilingDataPtr->numCopyRow = perRowNum; - } else if (tilingDataPtr->numCopyRow > perRowNum && perRowNum < DYNAMIC_QUANT_ALIGN_NUM_SCALE) { - tilingDataPtr->numCopyRow = perRowNum; + 
tilingDataPtr.alignType = DYNAMIC_QUANT_STATUS_UNALIGN_910B; + if (perRowNum <= 0 && tilingDataPtr.numCopyRow > 0) { + tilingDataPtr.numCopyRow = 1; + } else if (tilingDataPtr.numCopyRow > alignRowNum && perRowNum > alignRowNum) { + tilingDataPtr.numCopyRow = alignRowNum; + } else if (tilingDataPtr.numCopyRow > alignRowNum && perRowNum < alignRowNum) { + tilingDataPtr.numCopyRow = perRowNum; + } else if (tilingDataPtr.numCopyRow > perRowNum && perRowNum < DYNAMIC_QUANT_ALIGN_NUM_SCALE) { + tilingDataPtr.numCopyRow = perRowNum; } - if (tilingDataPtr->numCopyRow == 0) { + if (tilingDataPtr.numCopyRow == 0) { return Status::FailStatus(ERROR_INVALID_VALUE, "Ascend910B input last dim is bigger than limitation"); } } - MKI_LOG(INFO) << "numCopyRow = " << tilingDataPtr->numCopyRow; - tilingDataPtr->sizeCopyRow = Utils::RoundUp(tilingDataPtr->numCopyRow, DYNAMIC_QUANT_ALIGN_NUM_SCALE); + MKI_LOG(INFO) << "numCopyRow = " << tilingDataPtr.numCopyRow; + tilingDataPtr.sizeCopyRow = Utils::RoundUp(tilingDataPtr.numCopyRow, DYNAMIC_QUANT_ALIGN_NUM_SCALE); return Status::OkStatus(); } -Status SetTilingData(DynamicQuantTilingData *tilingDataPtr, uint64_t rowNumTotal) +Status SetTilingData(DynamicQuantTilingData &tilingDataPtr, uint64_t rowNumTotal) { SetSuitNumCopyRow(tilingDataPtr); @@ -162,17 +162,17 @@ Status SetTilingData(DynamicQuantTilingData *tilingDataPtr, uint64_t rowNumTotal return status; } - uint32_t patchTotal = rowNumTotal / tilingDataPtr->numCopyRow; - tilingDataPtr->numLastTailRow = rowNumTotal % tilingDataPtr->numCopyRow; - tilingDataPtr->numTailTimes = patchTotal / tilingDataPtr->numCore; - tilingDataPtr->numHeadCore = patchTotal % tilingDataPtr->numCore; - tilingDataPtr->numTailCore = tilingDataPtr->numCore - tilingDataPtr->numHeadCore; - tilingDataPtr->numHeadTimes = tilingDataPtr->numTailTimes + 1; - - if (tilingDataPtr->numLastTailRow == 0 && - tilingDataPtr->numCopyRow % DYNAMIC_QUANT_ALIGN_NUM_SCALE == 0 && - tilingDataPtr->sizeH % DYNAMIC_QUANT_ALIGN_SIZE 
== 0) { - tilingDataPtr->alignType = DYNAMIC_QUANT_STATUS_ALIGN; + uint32_t patchTotal = rowNumTotal / tilingDataPtr.numCopyRow; + tilingDataPtr.numLastTailRow = rowNumTotal % tilingDataPtr.numCopyRow; + tilingDataPtr.numTailTimes = patchTotal / tilingDataPtr.numCore; + tilingDataPtr.numHeadCore = patchTotal % tilingDataPtr.numCore; + tilingDataPtr.numTailCore = tilingDataPtr.numCore - tilingDataPtr.numHeadCore; + tilingDataPtr.numHeadTimes = tilingDataPtr.numTailTimes + 1; + + if (tilingDataPtr.numLastTailRow == 0 && + tilingDataPtr.numCopyRow % DYNAMIC_QUANT_ALIGN_NUM_SCALE == 0 && + tilingDataPtr.sizeH % DYNAMIC_QUANT_ALIGN_SIZE == 0) { + tilingDataPtr.alignType = DYNAMIC_QUANT_STATUS_ALIGN; } return Status::OkStatus(); } diff --git a/src/kernels/kernels/norm/coherelayernorm/tiling/cohere_layer_norm_tiling.cpp b/src/kernels/kernels/norm/coherelayernorm/tiling/cohere_layer_norm_tiling.cpp index e3264884..a0465d01 100644 --- a/src/kernels/kernels/norm/coherelayernorm/tiling/cohere_layer_norm_tiling.cpp +++ b/src/kernels/kernels/norm/coherelayernorm/tiling/cohere_layer_norm_tiling.cpp @@ -45,36 +45,36 @@ void CohereLayerNormPrintTilingInfo(const CohereLayerNormTilingData &tilingDataP << "averageFactor = " << tilingDataPtr.averageFactor; } -Status MultipleRowMovedTiling(NormTilingDataPtrCon &layerNormPtrCon, CohereLayerNormTilingData *tilingDataPtr, +Status MultipleRowMovedTiling(NormTilingDataPtrCon &layerNormPtrCon, CohereLayerNormTilingData &tilingDataPtr, uint32_t singleRowMovedBufferSize, uint32_t multipleRowMovedBufferSize, uint32_t miscBuffersSize) { uint32_t numResidualCoreRows = layerNormPtrCon.numRow - - tilingDataPtr->numCoreRows * (tilingDataPtr->numCore - 1); + tilingDataPtr.numCoreRows * (tilingDataPtr.numCore - 1); uint32_t calcCoreRowStrides = (layerNormPtrCon.maxUbSize - singleRowMovedBufferSize - miscBuffersSize) / multipleRowMovedBufferSize; - tilingDataPtr->coreRowStrides = std::min(tilingDataPtr->numCoreRows, calcCoreRowStrides); - 
MKI_CHECK(tilingDataPtr->coreRowStrides != 0, "coreRowStrides is equal to 0", + tilingDataPtr.coreRowStrides = std::min(tilingDataPtr.numCoreRows, calcCoreRowStrides); + MKI_CHECK(tilingDataPtr.coreRowStrides != 0, "coreRowStrides is equal to 0", return Status::FailStatus(ERROR_INVALID_VALUE)); - tilingDataPtr->coreRowRepeats = tilingDataPtr->numCoreRows / tilingDataPtr->coreRowStrides; - tilingDataPtr->coreRowTailStrides = tilingDataPtr->numCoreRows % tilingDataPtr->coreRowStrides; - tilingDataPtr->coreRowTailRepeats = tilingDataPtr->coreRowTailStrides == 0 ? 0 : 1; - tilingDataPtr->residualCoreRowStrides = std::min(numResidualCoreRows, calcCoreRowStrides); - MKI_CHECK(tilingDataPtr->residualCoreRowStrides != 0, "residualCoreRowStrides is equal to 0", + tilingDataPtr.coreRowRepeats = tilingDataPtr.numCoreRows / tilingDataPtr.coreRowStrides; + tilingDataPtr.coreRowTailStrides = tilingDataPtr.numCoreRows % tilingDataPtr.coreRowStrides; + tilingDataPtr.coreRowTailRepeats = tilingDataPtr.coreRowTailStrides == 0 ? 0 : 1; + tilingDataPtr.residualCoreRowStrides = std::min(numResidualCoreRows, calcCoreRowStrides); + MKI_CHECK(tilingDataPtr.residualCoreRowStrides != 0, "residualCoreRowStrides is equal to 0", return Status::FailStatus(ERROR_INVALID_VALUE)); - tilingDataPtr->residualCoreRowRepeats = numResidualCoreRows / - tilingDataPtr->residualCoreRowStrides; - tilingDataPtr->residualCoreRowTailStrides = numResidualCoreRows % - tilingDataPtr->residualCoreRowStrides; - tilingDataPtr->residualCoreRowTailRepeats = tilingDataPtr->residualCoreRowTailStrides == 0 ? 
0 : 1; - tilingDataPtr->columnStrides = layerNormPtrCon.numCol; - tilingDataPtr->columnRepeats = 1; - tilingDataPtr->residualColumnStrides = 0; - tilingDataPtr->residualColumnRepeats = 0; + tilingDataPtr.residualCoreRowRepeats = numResidualCoreRows / + tilingDataPtr.residualCoreRowStrides; + tilingDataPtr.residualCoreRowTailStrides = numResidualCoreRows % + tilingDataPtr.residualCoreRowStrides; + tilingDataPtr.residualCoreRowTailRepeats = tilingDataPtr.residualCoreRowTailStrides == 0 ? 0 : 1; + tilingDataPtr.columnStrides = layerNormPtrCon.numCol; + tilingDataPtr.columnRepeats = 1; + tilingDataPtr.residualColumnStrides = 0; + tilingDataPtr.residualColumnRepeats = 0; return Status::OkStatus(); } -Status SingleRowMovedTiling(NormTilingDataPtrCon &layerNormPtrCon, CohereLayerNormTilingData *tilingDataPtr, +Status SingleRowMovedTiling(NormTilingDataPtrCon &layerNormPtrCon, CohereLayerNormTilingData &tilingDataPtr, uint32_t singleRowMovedElemSize, uint32_t multipleRowMovedElemSize, uint32_t miscBuffersSize) { @@ -83,21 +83,21 @@ Status SingleRowMovedTiling(NormTilingDataPtrCon &layerNormPtrCon, CohereLayerNo (singleRowMovedElemSize + multipleRowMovedElemSize), oneRepeatElemCount); uint32_t numResidualCoreRows = layerNormPtrCon.numRow - - tilingDataPtr->numCoreRows * (tilingDataPtr->numCore - 1); - tilingDataPtr->columnStrides = std::min(tilingDataPtr->numColumns, calcColumnStrides); - MKI_CHECK(tilingDataPtr->columnStrides != 0, "columnStrides is equal to 0", + tilingDataPtr.numCoreRows * (tilingDataPtr.numCore - 1); + tilingDataPtr.columnStrides = std::min(tilingDataPtr.numColumns, calcColumnStrides); + MKI_CHECK(tilingDataPtr.columnStrides != 0, "columnStrides is equal to 0", return Status::FailStatus(ERROR_INVALID_VALUE)); - tilingDataPtr->columnRepeats = layerNormPtrCon.numCol / tilingDataPtr->columnStrides; - tilingDataPtr->residualColumnStrides = layerNormPtrCon.numCol % tilingDataPtr->columnStrides; - tilingDataPtr->residualColumnRepeats = 
tilingDataPtr->residualColumnStrides == 0 ? 0 : 1; - tilingDataPtr->coreRowStrides = 1; - tilingDataPtr->coreRowRepeats = tilingDataPtr->numCoreRows; - tilingDataPtr->coreRowTailStrides = 0; - tilingDataPtr->coreRowTailRepeats = 0; - tilingDataPtr->residualCoreRowStrides = 1; - tilingDataPtr->residualCoreRowRepeats = numResidualCoreRows; - tilingDataPtr->residualCoreRowTailStrides = 0; - tilingDataPtr->residualCoreRowTailRepeats = 0; + tilingDataPtr.columnRepeats = layerNormPtrCon.numCol / tilingDataPtr.columnStrides; + tilingDataPtr.residualColumnStrides = layerNormPtrCon.numCol % tilingDataPtr.columnStrides; + tilingDataPtr.residualColumnRepeats = tilingDataPtr.residualColumnStrides == 0 ? 0 : 1; + tilingDataPtr.coreRowStrides = 1; + tilingDataPtr.coreRowRepeats = tilingDataPtr.numCoreRows; + tilingDataPtr.coreRowTailStrides = 0; + tilingDataPtr.coreRowTailRepeats = 0; + tilingDataPtr.residualCoreRowStrides = 1; + tilingDataPtr.residualCoreRowRepeats = numResidualCoreRows; + tilingDataPtr.residualCoreRowTailStrides = 0; + tilingDataPtr.residualCoreRowTailRepeats = 0; return Status::OkStatus(); } diff --git a/src/kernels/kernels/norm/rmsnorm/tiling/rms_norm_tiling.cpp b/src/kernels/kernels/norm/rmsnorm/tiling/rms_norm_tiling.cpp index 5963fc98..840efb97 100644 --- a/src/kernels/kernels/norm/rmsnorm/tiling/rms_norm_tiling.cpp +++ b/src/kernels/kernels/norm/rmsnorm/tiling/rms_norm_tiling.cpp @@ -34,18 +34,18 @@ constexpr uint32_t RMS_NORM_TILING_KEY_GEMMAMODE = 1000; // 0:gemmamode no; 1 constexpr uint32_t RMS_NORM_TILING_KEY_PRECISIONMODE = 100; // 0:precisionmode; 1:performance mode constexpr uint32_t RMS_NORM_TILING_KEY_DTYPE = 10; // 0:fp16;1:bf16 -void PrintRmsNormTiling(const RmsNormCommonTilingData *tilingDataPtr) +void PrintRmsNormTiling(const RmsNormCommonTilingData &tilingDataPtr) { MKI_LOG(INFO) << "RmsNorm Tiling Data:" - << " numCore " << tilingDataPtr->numCore - << " numCol " << tilingDataPtr->numCol - << " numRow " << tilingDataPtr->numRow - << " 
avgFactor " << tilingDataPtr->avgFactor - << " epsilon " << tilingDataPtr->epsilon - << " sliceSize " << tilingDataPtr->sliceSize - << " mode " << tilingDataPtr->mode - << " precisionMode " << tilingDataPtr->precisionMode - << " gemmaMode " << tilingDataPtr->gemmaMode; + << " numCore " << tilingDataPtr.numCore + << " numCol " << tilingDataPtr.numCol + << " numRow " << tilingDataPtr.numRow + << " avgFactor " << tilingDataPtr.avgFactor + << " epsilon " << tilingDataPtr.epsilon + << " sliceSize " << tilingDataPtr.sliceSize + << " mode " << tilingDataPtr.mode + << " precisionMode " << tilingDataPtr.precisionMode + << " gemmaMode " << tilingDataPtr.gemmaMode; } uint64_t ComputeTilingKey(uint32_t gemmaMode, uint32_t precisionMode, bool isShortTail, const LaunchParam &launchParam) { @@ -60,19 +60,19 @@ uint64_t ComputeTilingKey(uint32_t gemmaMode, uint32_t precisionMode, bool isSho return tilingKey; } -void SetNonContiguousTenor(RmsNormCommonTilingData *tilingDataPtr, const LaunchParam &launchParam) +void SetNonContiguousTenor(RmsNormCommonTilingData &tilingDataPtr, const LaunchParam &launchParam) { const auto& xStrides = launchParam.GetInTensor(0).desc.strides; const auto& shape = launchParam.GetInTensor(0).desc.dims; uint32_t dimNum = xStrides.size(); if (xStrides.empty() || dimNum == 1 || xStrides[dimNum - NUM_TWO] == shape[dimNum - 1]) { - tilingDataPtr->xDimNum = 0; + tilingDataPtr.xDimNum = 0; } else { for (size_t i = 0; i < xStrides.size(); ++ i) { - tilingDataPtr->xStrides[i] = xStrides[i]; + tilingDataPtr.xStrides[i] = xStrides[i]; } - tilingDataPtr->xDimNum = dimNum; - tilingDataPtr->xOffset = launchParam.GetInTensor(0).desc.offset; + tilingDataPtr.xDimNum = dimNum; + tilingDataPtr.xOffset = launchParam.GetInTensor(0).desc.offset; } } diff --git a/src/kernels/mixkernels/fused_add_topk_div/tiling/fused_add_topk_div_tiling.cpp b/src/kernels/mixkernels/fused_add_topk_div/tiling/fused_add_topk_div_tiling.cpp index 782bc8f5..0b1c929b 100644 --- 
a/src/kernels/mixkernels/fused_add_topk_div/tiling/fused_add_topk_div_tiling.cpp +++ b/src/kernels/mixkernels/fused_add_topk_div/tiling/fused_add_topk_div_tiling.cpp @@ -48,21 +48,21 @@ namespace AtbOps { template class FusedAddTopkDivTiling { public: - explicit FusedAddTopkDivTiling(FusedAddTopkDivTilingData *tilingDataPtr, + explicit FusedAddTopkDivTiling(FusedAddTopkDivTilingData &tilingDataPtr, const uint32_t inputCoreNum, const uint32_t inputUbSize) { - this->firstDimSize = tilingDataPtr->firstDimSize; - this->secondDimSize = tilingDataPtr->secondDimSize; - this->addNumDimSize = tilingDataPtr->addNumDimSize; - this->groupNum = tilingDataPtr->groupNum; - this->groupTopk = tilingDataPtr->groupTopk; - this->n = tilingDataPtr->n; - this->k = tilingDataPtr->k; - this->activateType = tilingDataPtr->activateType; - this->isNorm = tilingDataPtr->isNorm; - this->scale = tilingDataPtr->scale; - this->groupEles = tilingDataPtr->groupEles; - this->dtype = tilingDataPtr->dtype; + this->firstDimSize = tilingDataPtr.firstDimSize; + this->secondDimSize = tilingDataPtr.secondDimSize; + this->addNumDimSize = tilingDataPtr.addNumDimSize; + this->groupNum = tilingDataPtr.groupNum; + this->groupTopk = tilingDataPtr.groupTopk; + this->n = tilingDataPtr.n; + this->k = tilingDataPtr.k; + this->activateType = tilingDataPtr.activateType; + this->isNorm = tilingDataPtr.isNorm; + this->scale = tilingDataPtr.scale; + this->groupEles = tilingDataPtr.groupEles; + this->dtype = tilingDataPtr.dtype; this->ubSize = FloorAlign(inputUbSize, BYTE_BLOCK); this->coreNum = inputCoreNum; return; @@ -110,9 +110,9 @@ namespace AtbOps { }; template - void FusedAddTopkDivTiling::GetTilingKey(TilingData *tilingDataPtr) + void FusedAddTopkDivTiling::GetTilingKey(TilingData &tilingDataPtr) { - tilingKey = tilingDataPtr->enableExpertMapping * NUM_TEN + dtype; + tilingKey = tilingDataPtr.enableExpertMapping * NUM_TEN + dtype; } template @@ -139,27 +139,27 @@ namespace AtbOps { } template - void 
FusedAddTopkDivTiling::FillTilingData(TilingData *tilingDataPtr) + void FusedAddTopkDivTiling::FillTilingData(TilingData &tilingDataPtr) { - tilingDataPtr->firstDimSize = firstDimSize; - tilingDataPtr->secondDimSize = secondDimSize; - tilingDataPtr->addNumDimSize = addNumDimSize; - tilingDataPtr->groupNum = groupNum; - tilingDataPtr->groupTopk = groupTopk; - tilingDataPtr->n = n; - tilingDataPtr->k = k; - tilingDataPtr->activateType = activateType; - tilingDataPtr->isNorm = isNorm; - tilingDataPtr->scale = scale; - tilingDataPtr->groupEles = groupEles; - tilingDataPtr->blockNum = usedCoreNum; - tilingDataPtr->ubFactorElement = ubFactorElement; - tilingDataPtr->batchPerCore = batchPerCore; - tilingDataPtr->tailBatch = tailBatch; - tilingDataPtr->tilingKey = tilingKey; + tilingDataPtr.firstDimSize = firstDimSize; + tilingDataPtr.secondDimSize = secondDimSize; + tilingDataPtr.addNumDimSize = addNumDimSize; + tilingDataPtr.groupNum = groupNum; + tilingDataPtr.groupTopk = groupTopk; + tilingDataPtr.n = n; + tilingDataPtr.k = k; + tilingDataPtr.activateType = activateType; + tilingDataPtr.isNorm = isNorm; + tilingDataPtr.scale = scale; + tilingDataPtr.groupEles = groupEles; + tilingDataPtr.blockNum = usedCoreNum; + tilingDataPtr.ubFactorElement = ubFactorElement; + tilingDataPtr.batchPerCore = batchPerCore; + tilingDataPtr.tailBatch = tailBatch; + tilingDataPtr.tilingKey = tilingKey; uint64_t wsSize = BASE_COUNT * FLOAT_BYTES; - tilingDataPtr->workspacePerCore = wsSize; - tilingDataPtr->tempSize = firstDimSize * secondDimSize * FLOAT_BYTES; + tilingDataPtr.workspacePerCore = wsSize; + tilingDataPtr.tempSize = firstDimSize * secondDimSize * FLOAT_BYTES; } template @@ -178,30 +178,30 @@ namespace AtbOps { tilingObj.GetTiling(tilingDataPtr); } - static void PrintTilingData(const FusedAddTopkDivTilingData *tilingDataPtr) + static void PrintTilingData(const FusedAddTopkDivTilingData &tilingDataPtr) { - MKI_LOG(INFO) << "firstDimSize is: " << tilingDataPtr->firstDimSize << 
"\n" - << "secondDimSize is: " << tilingDataPtr->secondDimSize << "\n" - << "addNumDimSize is: " << tilingDataPtr->addNumDimSize << "\n" - << "groupNum is: " << tilingDataPtr->groupNum << "\n" - << "grouptopk is: " << tilingDataPtr->groupTopk << "\n" - << "n is: " << tilingDataPtr->n << "\n" - << "k is: " << tilingDataPtr->k << "\n" - << "activateType is: " << tilingDataPtr->activateType << "\n" - << "isNorm is: " << tilingDataPtr->isNorm << "\n" - << "scale is: " << tilingDataPtr->scale << "\n" - << "groupEles is: " << tilingDataPtr->groupEles << "\n" - << "blockNum is: " << tilingDataPtr->blockNum << "\n" - << "dtype is: " << tilingDataPtr->dtype << "\n" - << "ubFactorElement is: " << tilingDataPtr->ubFactorElement << "\n" - << "batchPerCore is: " << tilingDataPtr->batchPerCore << "\n" - << "tailBatch is: " << tilingDataPtr->tailBatch << "\n" - << "tilingKey is: " << tilingDataPtr->tilingKey << "\n" - << "tempSize is: " << tilingDataPtr->tempSize << "\n" - << "enableExpertMapping is: " << tilingDataPtr->enableExpertMapping << "\n" - << "expertNum is: " << tilingDataPtr->expertNum << "\n" - << "tableDim is: " << tilingDataPtr->tableDim << "\n" - << "workspacePerCore is: " << tilingDataPtr->workspacePerCore; + MKI_LOG(INFO) << "firstDimSize is: " << tilingDataPtr.firstDimSize << "\n" + << "secondDimSize is: " << tilingDataPtr.secondDimSize << "\n" + << "addNumDimSize is: " << tilingDataPtr.addNumDimSize << "\n" + << "groupNum is: " << tilingDataPtr.groupNum << "\n" + << "grouptopk is: " << tilingDataPtr.groupTopk << "\n" + << "n is: " << tilingDataPtr.n << "\n" + << "k is: " << tilingDataPtr.k << "\n" + << "activateType is: " << tilingDataPtr.activateType << "\n" + << "isNorm is: " << tilingDataPtr.isNorm << "\n" + << "scale is: " << tilingDataPtr.scale << "\n" + << "groupEles is: " << tilingDataPtr.groupEles << "\n" + << "blockNum is: " << tilingDataPtr.blockNum << "\n" + << "dtype is: " << tilingDataPtr.dtype << "\n" + << "ubFactorElement is: " << 
tilingDataPtr.ubFactorElement << "\n" + << "batchPerCore is: " << tilingDataPtr.batchPerCore << "\n" + << "tailBatch is: " << tilingDataPtr.tailBatch << "\n" + << "tilingKey is: " << tilingDataPtr.tilingKey << "\n" + << "tempSize is: " << tilingDataPtr.tempSize << "\n" + << "enableExpertMapping is: " << tilingDataPtr.enableExpertMapping << "\n" + << "expertNum is: " << tilingDataPtr.expertNum << "\n" + << "tableDim is: " << tilingDataPtr.tableDim << "\n" + << "workspacePerCore is: " << tilingDataPtr.workspacePerCore; } template Status CeilAlign(T1 a, T2 b) @@ -209,29 +209,29 @@ namespace AtbOps { return b == 0 ? a : (a + b - 1) / b * b; } - Status GetInputInfo(const LaunchParam &launchParam, FusedAddTopkDivTilingData *tilingDataPtr) + Status GetInputInfo(const LaunchParam &launchParam, FusedAddTopkDivTilingData &tilingDataPtr) { auto inTensor0 = launchParam.GetInTensor(X_INPUT_INDEX).desc; auto inTensor1 = launchParam.GetInTensor(ADD_NUM_INPUT_INDEX).desc; - tilingDataPtr->firstDimSize = inTensor0.dims[DIM_INDEX0]; - tilingDataPtr->secondDimSize = inTensor0.dims[DIM_INDEX1]; - tilingDataPtr->addNumDimSize = inTensor1.dims[DIM_INDEX0]; + tilingDataPtr.firstDimSize = inTensor0.dims[DIM_INDEX0]; + tilingDataPtr.secondDimSize = inTensor0.dims[DIM_INDEX1]; + tilingDataPtr.addNumDimSize = inTensor1.dims[DIM_INDEX0]; auto param = AnyCast(launchParam.GetParam()); - tilingDataPtr->groupNum = static_cast(param.groupNum); - tilingDataPtr->groupTopk = static_cast(param.groupTopk); - tilingDataPtr->n = static_cast(param.n); - tilingDataPtr->k = static_cast(param.k); - tilingDataPtr->activateType = static_cast(param.activateType); - tilingDataPtr->isNorm = static_cast(param.isNorm); - tilingDataPtr->enableExpertMapping = static_cast(param.enableExpertMapping); - tilingDataPtr->groupEles = tilingDataPtr->groupNum == 0 ? 
tilingDataPtr->secondDimSize : - tilingDataPtr->secondDimSize / tilingDataPtr->groupNum; - tilingDataPtr->scale = param.scale; - if (tilingDataPtr->enableExpertMapping) { + tilingDataPtr.groupNum = static_cast(param.groupNum); + tilingDataPtr.groupTopk = static_cast(param.groupTopk); + tilingDataPtr.n = static_cast(param.n); + tilingDataPtr.k = static_cast(param.k); + tilingDataPtr.activateType = static_cast(param.activateType); + tilingDataPtr.isNorm = static_cast(param.isNorm); + tilingDataPtr.enableExpertMapping = static_cast(param.enableExpertMapping); + tilingDataPtr.groupEles = tilingDataPtr.groupNum == 0 ? tilingDataPtr.secondDimSize : + tilingDataPtr.secondDimSize / tilingDataPtr.groupNum; + tilingDataPtr.scale = param.scale; + if (tilingDataPtr.enableExpertMapping) { const Tensor &inTensor3 = launchParam.GetInTensor(MAPPING_TABLE_INPUT_INDEX); - tilingDataPtr->expertNum = inTensor3.desc.dims[DIM_INDEX0]; - tilingDataPtr->tableDim = inTensor3.desc.dims[DIM_INDEX1]; + tilingDataPtr.expertNum = inTensor3.desc.dims[DIM_INDEX0]; + tilingDataPtr.tableDim = inTensor3.desc.dims[DIM_INDEX1]; } return Status::OkStatus(); } diff --git a/src/kernels/mixkernels/mla_preprocess/tiling/mla_preprocess_tiling.cpp b/src/kernels/mixkernels/mla_preprocess/tiling/mla_preprocess_tiling.cpp index 9fe26640..eee7b7db 100644 --- a/src/kernels/mixkernels/mla_preprocess/tiling/mla_preprocess_tiling.cpp +++ b/src/kernels/mixkernels/mla_preprocess/tiling/mla_preprocess_tiling.cpp @@ -436,45 +436,45 @@ void MlaPreprocessTiling::EinSumQuantTiling(const OpParam::MlaPreprocess ¶m, tilingData.esqColTail = esqColTail; } -void MlaPreprocessTiling::SetTiling(AtbOps::MlaTilingData *tilingParam) +void MlaPreprocessTiling::SetTiling(AtbOps::MlaTilingData &tilingParam) { - tilingParam->n = tilingData.n; - tilingParam->perTaskNum = tilingData.perTaskNum; - tilingParam->resTaskNum = tilingData.resTaskNum; - tilingParam->numCore = tilingData.numCore; - - tilingParam->rmsNumCore1 = 
tilingData.rmsNumCore1; - tilingParam->rmsNumCol1 = tilingData.rmsNumCol1; - - tilingParam->rmsNumCore2 = tilingData.rmsNumCore2; - tilingParam->rmsNumCol2 = tilingData.rmsNumCol2; - - tilingParam->hiddenSizeQ = tilingData.hiddenSizeQ; - tilingParam->headNumQ = tilingData.headNumQ; - tilingParam->headDim = tilingData.headDim; - tilingParam->concatSize = tilingData.concatSize; - tilingParam->rotaryCoeff = tilingData.rotaryCoeff; - tilingParam->ntokens = tilingData.ntokens; - tilingParam->realCore = tilingData.realCore; - tilingParam->nlCoreRun = tilingData.nlCoreRun; - tilingParam->lCoreRun = tilingData.lCoreRun; - tilingParam->maxNPerLoopForUb = tilingData.maxNPerLoopForUb; - tilingParam->preCoreLoopTime = tilingData.preCoreLoopTime; - tilingParam->preCoreLoopNLast = tilingData.preCoreLoopNLast; - tilingParam->lastCoreLoopTime = tilingData.lastCoreLoopTime; - tilingParam->lastCoreLoopNLast = tilingData.lastCoreLoopNLast; - - tilingParam->esqFrontCore = tilingData.esqFrontCore; - tilingParam->esqTailCore = tilingData.esqTailCore; - tilingParam->esqFrontCoreBatch = tilingData.esqFrontCoreBatch; - tilingParam->esqTailCoreBatch = tilingData.esqTailCoreBatch; - tilingParam->esqHeadNum = tilingData.esqHeadNum; - tilingParam->esqColNum = tilingData.esqColNum; - tilingParam->esqUbHeadLoop = tilingData.esqUbHeadLoop; - tilingParam->esqHeadPerLoop = tilingData.esqHeadPerLoop; - tilingParam->esqHeadTail = tilingData.esqHeadTail; - tilingParam->esqColLoop = tilingData.esqColLoop; - tilingParam->esqColTail = tilingData.esqColTail; + tilingParam.n = tilingData.n; + tilingParam.perTaskNum = tilingData.perTaskNum; + tilingParam.resTaskNum = tilingData.resTaskNum; + tilingParam.numCore = tilingData.numCore; + + tilingParam.rmsNumCore1 = tilingData.rmsNumCore1; + tilingParam.rmsNumCol1 = tilingData.rmsNumCol1; + + tilingParam.rmsNumCore2 = tilingData.rmsNumCore2; + tilingParam.rmsNumCol2 = tilingData.rmsNumCol2; + + tilingParam.hiddenSizeQ = tilingData.hiddenSizeQ; + 
tilingParam.headNumQ = tilingData.headNumQ; + tilingParam.headDim = tilingData.headDim; + tilingParam.concatSize = tilingData.concatSize; + tilingParam.rotaryCoeff = tilingData.rotaryCoeff; + tilingParam.ntokens = tilingData.ntokens; + tilingParam.realCore = tilingData.realCore; + tilingParam.nlCoreRun = tilingData.nlCoreRun; + tilingParam.lCoreRun = tilingData.lCoreRun; + tilingParam.maxNPerLoopForUb = tilingData.maxNPerLoopForUb; + tilingParam.preCoreLoopTime = tilingData.preCoreLoopTime; + tilingParam.preCoreLoopNLast = tilingData.preCoreLoopNLast; + tilingParam.lastCoreLoopTime = tilingData.lastCoreLoopTime; + tilingParam.lastCoreLoopNLast = tilingData.lastCoreLoopNLast; + + tilingParam.esqFrontCore = tilingData.esqFrontCore; + tilingParam.esqTailCore = tilingData.esqTailCore; + tilingParam.esqFrontCoreBatch = tilingData.esqFrontCoreBatch; + tilingParam.esqTailCoreBatch = tilingData.esqTailCoreBatch; + tilingParam.esqHeadNum = tilingData.esqHeadNum; + tilingParam.esqColNum = tilingData.esqColNum; + tilingParam.esqUbHeadLoop = tilingData.esqUbHeadLoop; + tilingParam.esqHeadPerLoop = tilingData.esqHeadPerLoop; + tilingParam.esqHeadTail = tilingData.esqHeadTail; + tilingParam.esqColLoop = tilingData.esqColLoop; + tilingParam.esqColTail = tilingData.esqColTail; } void MlaPreprocessTiling::SetTilingKey(const Mki::LaunchParam &launchParam, Mki::KernelInfo &kernelInfo) diff --git a/src/kernels/mixkernels/multi_latent_attention/tiling/mla_tiling_dependency.cpp b/src/kernels/mixkernels/multi_latent_attention/tiling/mla_tiling_dependency.cpp index f90a72dd..c662de13 100644 --- a/src/kernels/mixkernels/multi_latent_attention/tiling/mla_tiling_dependency.cpp +++ b/src/kernels/mixkernels/multi_latent_attention/tiling/mla_tiling_dependency.cpp @@ -318,7 +318,7 @@ void GetNdMLAMtpTilingTP1(const MLAInfo &mmInfo, uint32_t &blockDim, uint32_t *t } void GetTilingHead(const MLAInfo &mmInfo, const OpParam::MLA ¶m, uint32_t *tilingParam, - const uint32_t *torPtr, uint32_t blockDim) 
+ const uint32_t &torPtr, uint32_t blockDim) { tilingParam[TILING_BATCH] = static_cast(mmInfo.batch); tilingParam[TILING_HEADSIZE] = static_cast(TILING_HEAD_SIZE); @@ -329,7 +329,7 @@ void GetTilingHead(const MLAInfo &mmInfo, const OpParam::MLA ¶m, uint32_t *t tilingParam[TILING_NUMBLOKS] = static_cast(mmInfo.numBlocks); tilingParam[TILING_BLOCKSIZE] = static_cast(mmInfo.blockSize); tilingParam[TILING_MAXBLOCKS] = static_cast(mmInfo.maxNumBlocksPerQuery); - tilingParam[TILING_TOR] = *torPtr; + tilingParam[TILING_TOR] = torPtr; tilingParam[TILING_KVHEADS] = (mmInfo.kvHeads == 0) ? mmInfo.numHeads : mmInfo.kvHeads; tilingParam[TILING_MASK_TYPE_ND] = static_cast(mmInfo.maskType); diff --git a/src/kernels/mixkernels/ring_mla/tiling/ring_mla_tiling_dependency.cpp b/src/kernels/mixkernels/ring_mla/tiling/ring_mla_tiling_dependency.cpp index 8327547e..de6273b7 100644 --- a/src/kernels/mixkernels/ring_mla/tiling/ring_mla_tiling_dependency.cpp +++ b/src/kernels/mixkernels/ring_mla/tiling/ring_mla_tiling_dependency.cpp @@ -241,7 +241,7 @@ void GetNdMLAMtpTilingTP1(const RINGMLAInfo &mmInfo, uint32_t &blockDim, uint32_ } void GetTilingHead(const RINGMLAInfo &mmInfo, const OpParam::RINGMLA ¶m, uint32_t *tilingParam, - const uint32_t *torPtr) + const uint32_t &torPtr) { tilingParam[TILING_BATCH] = static_cast(mmInfo.batch); tilingParam[TILING_HEADSIZE] = static_cast(TILING_HEAD_SIZE); @@ -252,7 +252,7 @@ void GetTilingHead(const RINGMLAInfo &mmInfo, const OpParam::RINGMLA ¶m, uin tilingParam[TILING_NUMBLOKS] = static_cast(mmInfo.numBlocks); tilingParam[TILING_BLOCKSIZE] = static_cast(mmInfo.blockSize); tilingParam[TILING_MAXBLOCKS] = static_cast(mmInfo.maxNumBlocksPerQuery); - tilingParam[TILING_TOR] = *torPtr; + tilingParam[TILING_TOR] = torPtr; tilingParam[TILING_KVHEADS] = (mmInfo.kvHeads == 0) ? 
mmInfo.numHeads : mmInfo.kvHeads; tilingParam[TILING_MASK_TYPE_ND] = static_cast(mmInfo.maskType); diff --git a/src/kernels/mixkernels/rope/tiling/rope_tiling.cpp b/src/kernels/mixkernels/rope/tiling/rope_tiling.cpp index 91d3b448..c932da29 100644 --- a/src/kernels/mixkernels/rope/tiling/rope_tiling.cpp +++ b/src/kernels/mixkernels/rope/tiling/rope_tiling.cpp @@ -85,7 +85,7 @@ void RopeNdProcess(const LaunchParam &launchParam, KernelInfo &kernelInfo, RopeT MKI_LOG(DEBUG) << "Multiple is " << multiple; MKI_LOG(DEBUG) << "RealCore is " << realCore; } -Status TilingKeyChose(const LaunchParam &launchParam, KernelInfo &kernelInfo, const RopeTilingData *tilingDataPtr) +Status TilingKeyChose(const LaunchParam &launchParam, KernelInfo &kernelInfo, const RopeTilingData &tilingDataPtr) { auto platformType = PlatformInfo::Instance().GetPlatformType(); auto cosSize = launchParam.GetInTensor(NUM_COSIN).desc.dims.size(); @@ -95,8 +95,8 @@ Status TilingKeyChose(const LaunchParam &launchParam, KernelInfo &kernelInfo, co MKI_LOG(ERROR) << "BF16 only supports 800I A2"; return Status::FailStatus(ERROR_INVALID_VALUE); } - uint32_t alignRotary = (tilingDataPtr->headDim / tilingDataPtr->rotaryCoeff) % ELE_NUM_FP16; - bool condition = (alignRotary == 0) && (tilingDataPtr->ntokens >= LARGE_NTOKENS_THRESHOLD); + uint32_t alignRotary = (tilingDataPtr.headDim / tilingDataPtr.rotaryCoeff) % ELE_NUM_FP16; + bool condition = (alignRotary == 0) && (tilingDataPtr.ntokens >= LARGE_NTOKENS_THRESHOLD); if (condition) { // ntokens >= 64时,走TILING_BF16_ALIGN kernelInfo.SetTilingId(TILING_BF16_ALIGN); // first 2 for shape dims of cos } else { @@ -105,8 +105,8 @@ Status TilingKeyChose(const LaunchParam &launchParam, KernelInfo &kernelInfo, co } else if (launchParam.GetInTensor(NUM_COSIN).desc.dtype == TENSOR_DTYPE_FLOAT) { kernelInfo.SetTilingId(TILING_HIGH_PREC); // second 1 for FP32 } else { - bool condition = tilingDataPtr->ntokens * tilingDataPtr->multiple >= LARGE_NTOKENS_THRESHOLD && - 
tilingDataPtr->cosFormat == 0; + bool condition = tilingDataPtr.ntokens * tilingDataPtr.multiple >= LARGE_NTOKENS_THRESHOLD && + tilingDataPtr.cosFormat == 0; if (condition) { // ntokens >= 64时,走TILING_HIGH_PERF_LARGE_NTOKENS kernelInfo.SetTilingId(TILING_HIGH_PERF_LARGE_NTOKENS); } else { @@ -119,8 +119,8 @@ Status TilingKeyChose(const LaunchParam &launchParam, KernelInfo &kernelInfo, co MKI_LOG(ERROR) << "BF16 only supports 800I A2"; return Status::FailStatus(ERROR_INVALID_VALUE); } - uint32_t alignRotary = (tilingDataPtr->headDim / tilingDataPtr->rotaryCoeff) % ELE_NUM_FP16; - bool condition = (alignRotary == 0) && (tilingDataPtr->ntokens >= LARGE_NTOKENS_THRESHOLD); + uint32_t alignRotary = (tilingDataPtr.headDim / tilingDataPtr.rotaryCoeff) % ELE_NUM_FP16; + bool condition = (alignRotary == 0) && (tilingDataPtr.ntokens >= LARGE_NTOKENS_THRESHOLD); if (condition) { // ntokens >= 64时,走TILING_BF16_ALIGN_BROARD kernelInfo.SetTilingId(TILING_BF16_ALIGN_BROARD); // first 2 for shape dims of cos } else { diff --git a/src/kernels/mixkernels/rope_q_concat/tiling/rope_q_concat_tiling.cpp b/src/kernels/mixkernels/rope_q_concat/tiling/rope_q_concat_tiling.cpp index f0fc38e5..01b09360 100644 --- a/src/kernels/mixkernels/rope_q_concat/tiling/rope_q_concat_tiling.cpp +++ b/src/kernels/mixkernels/rope_q_concat/tiling/rope_q_concat_tiling.cpp @@ -39,7 +39,7 @@ Status TilingKeyChose(const LaunchParam &launchParam, KernelInfo &kernelInfo) return Status::OkStatus(); } -Status RopeNdProcess(const LaunchParam &launchParam, RopeQConcatTilingData *tilingDataPtr) +Status RopeNdProcess(const LaunchParam &launchParam, RopeQConcatTilingData &tilingDataPtr) { auto &inTensor0 = launchParam.GetInTensor(0).desc; auto &inTensor1 = launchParam.GetInTensor(DIM_1).desc; @@ -54,7 +54,7 @@ Status RopeNdProcess(const LaunchParam &launchParam, RopeQConcatTilingData *tili uint32_t concatSize = inTensor3.dims[DIM_2]; // 当前场景只支持rotaryCoeff = 2的情况 - tilingDataPtr->rotaryCoeff = 2; + 
tilingDataPtr.rotaryCoeff = 2; uint32_t maxCore = static_cast(PlatformInfo::Instance().GetCoreNum(CoreType::CORE_TYPE_VECTOR)); auto maxUbSize = static_cast(PlatformInfo::Instance().GetUbSize()) - REMAIN_TILING_SIZE; @@ -76,19 +76,19 @@ Status RopeNdProcess(const LaunchParam &launchParam, RopeQConcatTilingData *tili uint32_t preCoreLoopNLast = nlCoreRun - (preCoreLoopTime - 1) * maxNPerLoopForUb; // 前核最后一批处理数据行数 uint32_t lastCoreLoopTime = (lCoreRun + maxNPerLoopForUb - 1) / maxNPerLoopForUb; // 尾核循环次数 uint32_t lastCoreLoopNLast = lCoreRun - (lastCoreLoopTime - 1) * maxNPerLoopForUb; // 尾核最后一批处理数据行数 - tilingDataPtr->hiddenSizeQ = hiddenSizeQ; - tilingDataPtr->headNumQ = headNumQ; - tilingDataPtr->headDim = headDim; - tilingDataPtr->concatSize = concatSize; - tilingDataPtr->ntokens = ntokens; - tilingDataPtr->realCore = realCore; - tilingDataPtr->nlCoreRun = nlCoreRun; - tilingDataPtr->lCoreRun = lCoreRun; - tilingDataPtr->maxNPerLoopForUb = maxNPerLoopForUb; - tilingDataPtr->preCoreLoopTime = preCoreLoopTime; - tilingDataPtr->preCoreLoopNLast = preCoreLoopNLast; - tilingDataPtr->lastCoreLoopTime = lastCoreLoopTime; - tilingDataPtr->lastCoreLoopNLast = lastCoreLoopNLast; + tilingDataPtr.hiddenSizeQ = hiddenSizeQ; + tilingDataPtr.headNumQ = headNumQ; + tilingDataPtr.headDim = headDim; + tilingDataPtr.concatSize = concatSize; + tilingDataPtr.ntokens = ntokens; + tilingDataPtr.realCore = realCore; + tilingDataPtr.nlCoreRun = nlCoreRun; + tilingDataPtr.lCoreRun = lCoreRun; + tilingDataPtr.maxNPerLoopForUb = maxNPerLoopForUb; + tilingDataPtr.preCoreLoopTime = preCoreLoopTime; + tilingDataPtr.preCoreLoopNLast = preCoreLoopNLast; + tilingDataPtr.lastCoreLoopTime = lastCoreLoopTime; + tilingDataPtr.lastCoreLoopNLast = lastCoreLoopNLast; return Status::OkStatus(); } diff --git a/src/kernels/mixkernels/swi_glu_quant/tiling/swi_glu_quant_tiling.cpp b/src/kernels/mixkernels/swi_glu_quant/tiling/swi_glu_quant_tiling.cpp index 2d5b5389..46579d28 100644 --- 
a/src/kernels/mixkernels/swi_glu_quant/tiling/swi_glu_quant_tiling.cpp +++ b/src/kernels/mixkernels/swi_glu_quant/tiling/swi_glu_quant_tiling.cpp @@ -32,37 +32,37 @@ static constexpr int TILING_KEY_FP32_QUANT_MODE = 306; // Tiling key for FP32 qu namespace AtbOps { using namespace Mki; -void SetTilingData(SwiGluQuantTilingData *tilingData) +void SetTilingData(SwiGluQuantTilingData &tilingData) { - tilingData->basicRowLenHeadCore = tilingData->optBaseRowLenHeadCore; - tilingData->basicRowLenTailCore = tilingData->optBaseRowLenTailCore; - tilingData->basicColLen = tilingData->optBaseColLen; - tilingData->realCoreNum = tilingData->coreNumUsed; + tilingData.basicRowLenHeadCore = tilingData.optBaseRowLenHeadCore; + tilingData.basicRowLenTailCore = tilingData.optBaseRowLenTailCore; + tilingData.basicColLen = tilingData.optBaseColLen; + tilingData.realCoreNum = tilingData.coreNumUsed; } -bool CalTilingData(SwiGluQuantTilingData *tilingData) +bool CalTilingData(SwiGluQuantTilingData& tilingData) { - uint32_t rowLen = tilingData->rowLen; - tilingData->coreNumUsed = Max(Min(tilingData->totalCore, rowLen), ONE); - tilingData->headCoreNum = rowLen % tilingData->coreNumUsed; - tilingData->rowLenPerHeadCore = (rowLen + tilingData->coreNumUsed - 1) / tilingData->coreNumUsed; - tilingData->rowLenPerTailCore = rowLen / tilingData->coreNumUsed; + uint32_t rowLen = tilingData.rowLen; + tilingData.coreNumUsed = Max(Min(tilingData.totalCore, rowLen), ONE); + tilingData.headCoreNum = rowLen % tilingData.coreNumUsed; + tilingData.rowLenPerHeadCore = (rowLen + tilingData.coreNumUsed - 1) / tilingData.coreNumUsed; + tilingData.rowLenPerTailCore = rowLen / tilingData.coreNumUsed; return CalculateMaxUbSizePerRow(tilingData); } -void PrintSwiQuantTiling(SwiGluQuantTilingData *tilingData) +void PrintSwiQuantTiling(const SwiGluQuantTilingData &tilingData) { MKI_LOG(INFO) << "SwiGlu Tiling Data:" << "\n" - << " groupLen " << tilingData->groupLen << "\n" - << " rowLen " << tilingData->rowLen << 
"\n" - << " colLen " << tilingData->colLen << "\n" - << " rowLenPerHeadCore " << tilingData->rowLenPerHeadCore << "\n" - << " rowLenPerTailCore " << tilingData->rowLenPerTailCore << "\n" - << " basicRowLenHeadCore " << tilingData->basicRowLenHeadCore << "\n" - << " basicRowLenTailCore " << tilingData->basicRowLenTailCore << "\n" - << " basicColLen " << tilingData->basicColLen << "\n" - << " headCoreNum " << tilingData->headCoreNum << "\n" - << " realCoreNum " << tilingData->realCoreNum << "\n" - << " totalCore " << tilingData->totalCore; + << " groupLen " << tilingData.groupLen << "\n" + << " rowLen " << tilingData.rowLen << "\n" + << " colLen " << tilingData.colLen << "\n" + << " rowLenPerHeadCore " << tilingData.rowLenPerHeadCore << "\n" + << " rowLenPerTailCore " << tilingData.rowLenPerTailCore << "\n" + << " basicRowLenHeadCore " << tilingData.basicRowLenHeadCore << "\n" + << " basicRowLenTailCore " << tilingData.basicRowLenTailCore << "\n" + << " basicColLen " << tilingData.basicColLen << "\n" + << " headCoreNum " << tilingData.headCoreNum << "\n" + << " realCoreNum " << tilingData.realCoreNum << "\n" + << " totalCore " << tilingData.totalCore; } void SwigluQuantTilingKeyChose(const LaunchParam &launchParam, KernelInfo &kernelInfo) diff --git a/src/kernels/mixkernels/swi_glu_quant/tiling/swi_glu_quant_tiling_utils.h b/src/kernels/mixkernels/swi_glu_quant/tiling/swi_glu_quant_tiling_utils.h index 0e4756af..c1e1b60f 100644 --- a/src/kernels/mixkernels/swi_glu_quant/tiling/swi_glu_quant_tiling_utils.h +++ b/src/kernels/mixkernels/swi_glu_quant/tiling/swi_glu_quant_tiling_utils.h @@ -54,7 +54,7 @@ template T Min(T num, T div) { return num < div ? num : div; } template T Max(T num, T div) { return num < div ? 
div : num; } -inline bool SetTotalShape(const Mki::SVector &inShape, SwiGluQuantTilingData *tilingData) +inline bool SetTotalShape(const Mki::SVector &inShape, SwiGluQuantTilingData& tilingData) { int64_t shapeBefore = 1; int64_t shapeAfter = 1; @@ -69,27 +69,27 @@ inline bool SetTotalShape(const Mki::SVector &inShape, SwiGluQuantTilin } MKI_CHECK(shapeAfter % EVEN_FACTOR == 0, "shapeAfter % 2 != 0", return false); MKI_CHECK(shapeAfter != 0, "shapeAfter == 0", return false); - tilingData->rowLen = static_cast(shapeBefore); - tilingData->colLen = static_cast(shapeAfter / EVEN_FACTOR); + tilingData.rowLen = static_cast(shapeBefore); + tilingData.colLen = static_cast(shapeAfter / EVEN_FACTOR); return true; } -inline bool CalculateMaxUbSizePerRow(SwiGluQuantTilingData *tilingData) +inline bool CalculateMaxUbSizePerRow(SwiGluQuantTilingData& tilingData) { - uint32_t colLen = tilingData->colLen; - uint32_t alignedColLen = AlignUp(colLen, tilingData->blockNum); + uint32_t colLen = tilingData.colLen; + uint32_t alignedColLen = AlignUp(colLen, tilingData.blockNum); MKI_CHECK(alignedColLen != 0, "CalculateMaxUbSizePerRow Unsupported alignedColLen == 0", return false); MKI_LOG(INFO) << "alignedColLen:" << alignedColLen << "\n"; - uint32_t ubAvail = tilingData->dataNumSingleUb / alignedColLen; - MKI_LOG(INFO) << "tilingData->dataNumSingleUb:" << tilingData->dataNumSingleUb << "\n"; + uint32_t ubAvail = tilingData.dataNumSingleUb / alignedColLen; + MKI_LOG(INFO) << "tilingData.dataNumSingleUb:" << tilingData.dataNumSingleUb << "\n"; MKI_LOG(INFO) << "ubAvail:" << ubAvail << "\n"; MKI_CHECK(ubAvail != 0, "The input vector is too large. 
It is not supported currently.", return false); - tilingData->optBaseColLen = colLen; + tilingData.optBaseColLen = colLen; ubAvail = Max(ubAvail, ONE); - tilingData->optBaseRowLenHeadCore = Min(Min(ubAvail, tilingData->rowLenPerHeadCore), COMPARE_INT); - tilingData->optBaseRowLenTailCore = Min(Min(ubAvail, tilingData->rowLenPerTailCore), COMPARE_INT); + tilingData.optBaseRowLenHeadCore = Min(Min(ubAvail, tilingData.rowLenPerHeadCore), COMPARE_INT); + tilingData.optBaseRowLenTailCore = Min(Min(ubAvail, tilingData.rowLenPerTailCore), COMPARE_INT); return true; } diff --git a/src/kernels/tbe_adapter/platform/platform_ascendc.cpp b/src/kernels/tbe_adapter/platform/platform_ascendc.cpp index e1d924f2..15c771eb 100644 --- a/src/kernels/tbe_adapter/platform/platform_ascendc.cpp +++ b/src/kernels/tbe_adapter/platform/platform_ascendc.cpp @@ -41,11 +41,11 @@ const static std::map CONVERT_MAP = { {"Ascend910_93", SocVersion::ASCEND910B}, }; -static inline uint32_t GetCoreNumByType(fe::PlatFormInfos *platformInfo, bool isAiv) +static inline uint32_t GetCoreNumByType(const fe::PlatFormInfos &platformInfo, bool isAiv) { std::string key; std::string val; - bool ret = platformInfo->GetPlatformResWithLock(STR_SOC_INFO, STR_SPLIT_KEY, val); + bool ret = platformInfo.GetPlatformResWithLock(STR_SOC_INFO, STR_SPLIT_KEY, val); MKI_LOG_IF(!ret, ERROR) << "get platform failed, val is " << val; if (STR_SPLIT_VAL.compare(val) != 0) { @@ -55,7 +55,7 @@ static inline uint32_t GetCoreNumByType(fe::PlatFormInfos *platformInfo, bool is } else { key = STR_CORE_CNT_CUB; } - ret = platformInfo->GetPlatformResWithLock(STR_SOC_INFO, key, val); + ret = platformInfo.GetPlatformResWithLock(STR_SOC_INFO, key, val); MKI_LOG_IF(!ret, ERROR) << "get platform failed, key is " << key << ", val is" << val; return val.empty() ? 
0 : static_cast(std::atoi(val.c_str())); } -- Gitee From 4cd4b4103414138ee87659ec071699777bf2faaa Mon Sep 17 00:00:00 2001 From: huangxiaolan Date: Fri, 19 Sep 2025 16:16:45 +0800 Subject: [PATCH 21/94] fix function point --- src/atb/operation/if_operation.cpp | 6 +++--- src/atb/utils/dl_manager.cpp | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/atb/operation/if_operation.cpp b/src/atb/operation/if_operation.cpp index 3189eb59..aa929ade 100644 --- a/src/atb/operation/if_operation.cpp +++ b/src/atb/operation/if_operation.cpp @@ -16,7 +16,7 @@ namespace atb { -Status IfOperation::GetOperationFromCondition(Operation **op) const +Status IfOperation::GetOperationFromCondition(Operation *&op) const { bool cond = true; try { @@ -28,10 +28,10 @@ Status IfOperation::GetOperationFromCondition(Operation **op) const if (cond && param_.opA) { ATB_LOG(INFO) << GetLogPrefix() << "Condition met (true), selecting opA..."; - *op = param_.opA; + op = param_.opA; } else if (!cond && param_.opB) { ATB_LOG(INFO) << GetLogPrefix() << "Condition not met (false), selecting opB..."; - *op = param_.opB; + op = param_.opB; } else { ATB_LOG(ERROR) << GetLogPrefix() << "Please check the intended operation is valid, opA: " << param_.opA << " opB: " << param_.opB; diff --git a/src/atb/utils/dl_manager.cpp b/src/atb/utils/dl_manager.cpp index ad84d945..2872d781 100644 --- a/src/atb/utils/dl_manager.cpp +++ b/src/atb/utils/dl_manager.cpp @@ -29,15 +29,15 @@ DlManager::~DlManager() } } -Status DlManager::getSymbol(const std::string &symbol, void **symbolPtr) const +Status DlManager::getSymbol(const std::string &symbol, void *&symbolPtr) const { if (handle_ == nullptr) { ATB_LOG(ERROR) << "Dynamic library handle is null, please check the path: " << path_; return ERROR_CANN_ERROR; } - *symbolPtr = dlsym(handle_, symbol.c_str()); + symbolPtr = dlsym(handle_, symbol.c_str()); char *errorInfo = dlerror(); - if (*symbolPtr == nullptr || errorInfo != nullptr) { + if (symbolPtr 
== nullptr || errorInfo != nullptr) { ATB_LOG(ERROR) << "Failed to find symbol " << symbol << " from path: " << path_ << ", error: " << errorInfo; return ERROR_CANN_ERROR; } -- Gitee From c78854b3095441ab7e219fefdfc8ddf71d9ef922 Mon Sep 17 00:00:00 2001 From: huangxiaolan Date: Fri, 19 Sep 2025 16:20:19 +0800 Subject: [PATCH 22/94] fix function declaration --- src/atb/operation/if_operation.h | 2 +- src/atb/utils/dl_manager.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/atb/operation/if_operation.h b/src/atb/operation/if_operation.h index d1a4d414..eab4a45f 100644 --- a/src/atb/operation/if_operation.h +++ b/src/atb/operation/if_operation.h @@ -34,7 +34,7 @@ protected: std::shared_ptr CreateRunner(Context &context) const override; private: - Status GetOperationFromCondition(Operation **op) const; + Status GetOperationFromCondition(Operation *&op) const; private: common::IfCondParam param_; diff --git a/src/atb/utils/dl_manager.h b/src/atb/utils/dl_manager.h index 626d630e..c1d00253 100644 --- a/src/atb/utils/dl_manager.h +++ b/src/atb/utils/dl_manager.h @@ -19,7 +19,7 @@ class DlManager { public: DlManager(std::string path); ~DlManager(); - Status getSymbol(const std::string &symbol, void **symbolPtr) const; + Status getSymbol(const std::string &symbol, void *&symbolPtr) const; private: std::string path_; -- Gitee From a23cc295df97c4d3b8046840b5aabea14126efaf Mon Sep 17 00:00:00 2001 From: wanyukang Date: Fri, 19 Sep 2025 16:22:12 +0800 Subject: [PATCH 23/94] maxbatch --- .../mixkernels/toppsample/op_kernel/toppsample.cpp | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/src/kernels/mixkernels/toppsample/op_kernel/toppsample.cpp b/src/kernels/mixkernels/toppsample/op_kernel/toppsample.cpp index 77357126..fbc9eda7 100644 --- a/src/kernels/mixkernels/toppsample/op_kernel/toppsample.cpp +++ b/src/kernels/mixkernels/toppsample/op_kernel/toppsample.cpp @@ -20,7 +20,6 @@ static constexpr uint32_t DEFAULT_STRIDE = 8; 
static constexpr uint32_t FP32_PER_REPEAT = 64; static constexpr uint32_t FP16_PER_REPEAT = 128; static constexpr uint32_t FP16_PER_BLOCK = 16; -static constexpr uint32_t MAX_BATCH = 1024; static constexpr uint32_t NUM_4 = 4; using AscendC::HardEvent; @@ -45,6 +44,7 @@ public: nlCoreRun_ = (firstDim_ + realCore_ - 1) / realCore_; lCoreRun_ = firstDim_ - (realCore_ - 1) * nlCoreRun_; dynamicRound_ = (blockIdx_ == realCore_ - 1) ? lCoreRun_ : nlCoreRun_; + maxBatch_ = (firstDim_ + FP16_PER_BLOCK - 1) / FP16_PER_BLOCK * FP16_PER_BLOCK; xGm_.SetGlobalBuffer((__gm__ T *)cumsumed_probs); yGm_.SetGlobalBuffer((__gm__ T *)topp); // batch,num_samples @@ -54,8 +54,8 @@ public: pipe_.InitBuffer(inputBuf_, tempUbEleAligened_ * DATA_BYTE); pipe_.InitBuffer(tempBuf_, tempUbEleAligened_ * DATA_BYTE * DATA_BYTE); pipe_.InitBuffer(fp32Buf_, tempUbEleAligened_ * DATA_BYTE * DATA_BYTE); - pipe_.InitBuffer(yBuf_, MAX_BATCH * DATA_BYTE); // topp - pipe_.InitBuffer(yF32Buf_, MAX_BATCH * DATA_BYTE * DATA_BYTE); // toppfp32 + pipe_.InitBuffer(yBuf_, maxBatch_ * DATA_BYTE); // topp + pipe_.InitBuffer(yF32Buf_, maxBatch_ * DATA_BYTE * DATA_BYTE); // toppfp32 pipe_.InitBuffer(int8Buf_, tempUbEleAligened_ / DEFAULT_STRIDE); // compare pipe_.InitBuffer(blockBuf_, BLK_SIZE); // 存下标 pipe_.InitBuffer(int32Buf_, MAX_CORE_NUM * DATA_BYTE * DATA_BYTE); // 每个核做几个batch @@ -65,7 +65,7 @@ public: __aicore__ inline void PickUpRand() { AscendC::LocalTensor buf = yBuf_.Get(); - DataCopy(buf, yGm_, MAX_BATCH); + DataCopy(buf, yGm_, maxBatch_); } __aicore__ inline void FirstPick(uint32_t cid, uint32_t offset) @@ -127,7 +127,7 @@ public: Duplicate(uint32Buf_, uint32_t(0), tempUbEleAligened_ / BLK_SIZE); // 截断数可能是batch个,也可能是1个 // 每个batch往后取一个随机数。(*(tilingUb_ + batchOffset)) - Cast(toppBufF32_, toppBuf_, AscendC::RoundMode::CAST_NONE, MAX_BATCH); + Cast(toppBufF32_, toppBuf_, AscendC::RoundMode::CAST_NONE, maxBatch_); for (int cid = 0; cid < dynamicRound_; cid++) { // 每个核做多少次 absIdx_ = 0; uint32_t batchOffset = 
(blockIdx_ * nlCoreRun_ + cid) % MAX_CORE_NUM; @@ -336,6 +336,7 @@ private: uint32_t expandLastDim_{0}; uint32_t numSamplesMax_{0}; uint32_t firstDim_{0}; + uint32_t maxBatch_{0}; float maxNum_{0}; float tempValue_{0}; uint32_t perCoreRunNum_{0}; -- Gitee From 8dae4d53fecae188107ad058c371705c0765c238 Mon Sep 17 00:00:00 2001 From: huangxiaolan Date: Fri, 19 Sep 2025 16:55:45 +0800 Subject: [PATCH 24/94] fix function declaration --- .../faster_gelu_forward/tiling/faster_gelu_tiling.cpp | 2 +- .../kernels/activation/gelu_forward/tiling/gelu_tiling.cpp | 2 +- .../dynamic_quant_tiling/dynamic_quant_tiling.cpp | 6 +++--- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/kernels/kernels/activation/faster_gelu_forward/tiling/faster_gelu_tiling.cpp b/src/kernels/kernels/activation/faster_gelu_forward/tiling/faster_gelu_tiling.cpp index fec67067..a64b1f92 100644 --- a/src/kernels/kernels/activation/faster_gelu_forward/tiling/faster_gelu_tiling.cpp +++ b/src/kernels/kernels/activation/faster_gelu_forward/tiling/faster_gelu_tiling.cpp @@ -95,7 +95,7 @@ Status FasterGeluForwardTiling(const LaunchParam &launchParam, KernelInfo &kerne reinterpret_cast(kernelInfo.GetTilingHostAddr()); MKI_CHECK(tilingDataPtr != nullptr, "tilingDataPtr should not be empty", return Status::FailStatus(ERROR_INVALID_VALUE, "tilingDataPtr should not be empty")); - CalcVectorTiling512Align(launchParam, tilingDataPtr, blockDim); + CalcVectorTiling512Align(launchParam, *tilingDataPtr, blockDim); for (uint32_t i = 0; i < tilingDataPtr->usedCoreNum; i++) { MKI_LOG(INFO) << "Core-" << i << " singleCoreDataLen num is " << tilingDataPtr->singleCoreDataLen[i]; diff --git a/src/kernels/kernels/activation/gelu_forward/tiling/gelu_tiling.cpp b/src/kernels/kernels/activation/gelu_forward/tiling/gelu_tiling.cpp index f6b3c3b4..a51cd6be 100644 --- a/src/kernels/kernels/activation/gelu_forward/tiling/gelu_tiling.cpp +++ b/src/kernels/kernels/activation/gelu_forward/tiling/gelu_tiling.cpp @@ -85,7 
+85,7 @@ Status GeluForwardTiling(const LaunchParam &launchParam, KernelInfo &kernelInfo) reinterpret_cast(kernelInfo.GetTilingHostAddr()); MKI_CHECK(tilingDataPtr != nullptr, "tilingDataPtr should not be empty", return Status::FailStatus(ERROR_INVALID_VALUE, "tilingDataPtr should not be empty")); - MKI_CHECK(FillTilingParam(launchParam, tilingDataPtr, blockDim), "FillTilingParam Failed.", + MKI_CHECK(FillTilingParam(launchParam, *tilingDataPtr, blockDim), "FillTilingParam Failed.", return Status::FailStatus(ERROR_INVALID_VALUE, "FillTilingParam Failed.")); kernelInfo.SetBlockDim(blockDim); kernelInfo.SetTilingId(dataType); // 不同的数据类型用不同的分核策略,所以暂时用数据类型的枚举来表示分核ID diff --git a/src/kernels/kernels/elewise/dynamic_quant/dynamic_quant_tiling/dynamic_quant_tiling.cpp b/src/kernels/kernels/elewise/dynamic_quant/dynamic_quant_tiling/dynamic_quant_tiling.cpp index ff72a00e..8e8c2a1a 100644 --- a/src/kernels/kernels/elewise/dynamic_quant/dynamic_quant_tiling/dynamic_quant_tiling.cpp +++ b/src/kernels/kernels/elewise/dynamic_quant/dynamic_quant_tiling/dynamic_quant_tiling.cpp @@ -47,7 +47,7 @@ Status ParseShape(const LaunchParam &launchParam, DynamicQuantTilingData &tiling size_t dims = shape.size(); for (size_t i = 0; i < dims; ++i) { if (i < dims - 1) { - MKI_CHECK(shape[i] > 0 && *rowNumTotal < static_cast(UINT32_MAX / shape[i]), + MKI_CHECK(shape[i] > 0 && rowNumTotal < static_cast(UINT32_MAX / shape[i]), "rowNumTotal or shape is invalid!", return Status::FailStatus(ERROR_INVALID_VALUE, "rowNumTotal or shape is invalid!")); rowNumTotal *= shape[i]; @@ -194,10 +194,10 @@ Status DynamicQuantTiling(const LaunchParam &launchParam, KernelInfo &kernelInfo tilingDataPtr->asymmetric = *reinterpret_cast(&attrs.asymmetric); uint64_t rowNumTotal = 1; - Status res = ParseShape(launchParam, tilingDataPtr, &rowNumTotal); + Status res = ParseShape(launchParam, *tilingDataPtr, &rowNumTotal); OP_TILING_CHECK_STATUS_RETURN(res); - Status ret = SetTilingData(tilingDataPtr, rowNumTotal); + 
Status ret = SetTilingData(*tilingDataPtr, rowNumTotal); OP_TILING_CHECK_STATUS_RETURN(ret); MKI_LOG(INFO) << "numCore = " << tilingDataPtr->numCore -- Gitee From da0e862560b6ebf17c504d4172ec10c1c9699e62 Mon Sep 17 00:00:00 2001 From: huangxiaolan Date: Fri, 19 Sep 2025 17:45:04 +0800 Subject: [PATCH 25/94] fix function point --- .../dynamic_quant_tiling.cpp | 2 +- .../tiling/cohere_layer_norm_tiling.cpp | 4 ++-- .../norm/rmsnorm/tiling/rms_norm_tiling.cpp | 4 ++-- .../tiling/fused_add_topk_div_tiling.cpp | 10 ++++---- .../tiling/ring_mla_tiling_dependency.cpp | 2 +- .../mixkernels/rope/tiling/rope_tiling.cpp | 24 +++++++++---------- .../tiling/rope_q_concat_tiling.cpp | 2 +- 7 files changed, 24 insertions(+), 24 deletions(-) diff --git a/src/kernels/kernels/elewise/dynamic_quant/dynamic_quant_tiling/dynamic_quant_tiling.cpp b/src/kernels/kernels/elewise/dynamic_quant/dynamic_quant_tiling/dynamic_quant_tiling.cpp index 8e8c2a1a..fab2d4bd 100644 --- a/src/kernels/kernels/elewise/dynamic_quant/dynamic_quant_tiling/dynamic_quant_tiling.cpp +++ b/src/kernels/kernels/elewise/dynamic_quant/dynamic_quant_tiling/dynamic_quant_tiling.cpp @@ -194,7 +194,7 @@ Status DynamicQuantTiling(const LaunchParam &launchParam, KernelInfo &kernelInfo tilingDataPtr->asymmetric = *reinterpret_cast(&attrs.asymmetric); uint64_t rowNumTotal = 1; - Status res = ParseShape(launchParam, *tilingDataPtr, &rowNumTotal); + Status res = ParseShape(launchParam, *tilingDataPtr, rowNumTotal); OP_TILING_CHECK_STATUS_RETURN(res); Status ret = SetTilingData(*tilingDataPtr, rowNumTotal); diff --git a/src/kernels/kernels/norm/coherelayernorm/tiling/cohere_layer_norm_tiling.cpp b/src/kernels/kernels/norm/coherelayernorm/tiling/cohere_layer_norm_tiling.cpp index a0465d01..8095aa44 100644 --- a/src/kernels/kernels/norm/coherelayernorm/tiling/cohere_layer_norm_tiling.cpp +++ b/src/kernels/kernels/norm/coherelayernorm/tiling/cohere_layer_norm_tiling.cpp @@ -145,10 +145,10 @@ Status CohereLayerNormTiling(const 
LaunchParam &launchParam, KernelInfo &kernelI uint64_t tilingKey = LAYER_NORM_TILING_KEY_BASE; if (fixedUsedBufferSize < layerNormPtrCon.maxUbSize) { // multiple rows moved simultaneously - MultipleRowMovedTiling(layerNormPtrCon, tilingDataPtr, + MultipleRowMovedTiling(layerNormPtrCon, *tilingDataPtr, singleRowMovedBufferSize, multipleRowMovedBufferSize, MISC_BUFFERS_SIZE); } else { // single row moved - SingleRowMovedTiling(layerNormPtrCon, tilingDataPtr, + SingleRowMovedTiling(layerNormPtrCon, *tilingDataPtr, singleRowMovedElemSize, multipleRowMovedElemSize, MISC_BUFFERS_SIZE); tilingKey += LAYER_NORM_TILING_KEY_FAST; } diff --git a/src/kernels/kernels/norm/rmsnorm/tiling/rms_norm_tiling.cpp b/src/kernels/kernels/norm/rmsnorm/tiling/rms_norm_tiling.cpp index 840efb97..b8b70570 100644 --- a/src/kernels/kernels/norm/rmsnorm/tiling/rms_norm_tiling.cpp +++ b/src/kernels/kernels/norm/rmsnorm/tiling/rms_norm_tiling.cpp @@ -118,12 +118,12 @@ Status RmsNormTiling(const LaunchParam &launchParam, KernelInfo &kernelInfo) tilingDataPtr->quantMin = -127; // set int8 min to -127 } kernelInfo.SetBlockDim(tilingDataPtr->numCore); - SetNonContiguousTenor(tilingDataPtr, launchParam); + SetNonContiguousTenor(*tilingDataPtr, launchParam); uint64_t tilingKey = ComputeTilingKey(tilingDataPtr->gemmaMode, tilingDataPtr->precisionMode, isShortTail, launchParam); MKI_LOG(INFO) << "post rmsnorm tilingKey is : " << tilingKey; kernelInfo.SetTilingId(tilingKey); - PrintRmsNormTiling(tilingDataPtr); + PrintRmsNormTiling(*tilingDataPtr); return Status::OkStatus(); } diff --git a/src/kernels/mixkernels/fused_add_topk_div/tiling/fused_add_topk_div_tiling.cpp b/src/kernels/mixkernels/fused_add_topk_div/tiling/fused_add_topk_div_tiling.cpp index 0b1c929b..e3a57c7a 100644 --- a/src/kernels/mixkernels/fused_add_topk_div/tiling/fused_add_topk_div_tiling.cpp +++ b/src/kernels/mixkernels/fused_add_topk_div/tiling/fused_add_topk_div_tiling.cpp @@ -71,10 +71,10 @@ namespace AtbOps { void 
GetTiling(TilingData *tilingDataPtr); private: - void GetTilingKey(TilingData *tilingDataPtr); + void GetTilingKey(TilingData &tilingDataPtr); void GetUsedCore(); void SplitUb(); - void FillTilingData(TilingData *tilingDataPtr); + void FillTilingData(TilingData &tilingDataPtr); template inline T1 CeilAlign(T1 a, T2 b) const { @@ -174,7 +174,7 @@ namespace AtbOps { template void GetFusedAddTopkDivTiling(TilingData *tilingDataPtr, uint32_t coreNum, uint32_t ubSize) { - class FusedAddTopkDivTiling tilingObj(tilingDataPtr, coreNum, ubSize); + class FusedAddTopkDivTiling tilingObj(*tilingDataPtr, coreNum, ubSize); tilingObj.GetTiling(tilingDataPtr); } @@ -291,7 +291,7 @@ namespace AtbOps { auto inputDatatype = launchParam.GetInTensor(X_INPUT_INDEX).desc.dtype; tilingDataPtr->dtype = g_dtypeMap[inputDatatype]; - auto checkInputInfo = GetInputInfo(launchParam, tilingDataPtr); + auto checkInputInfo = GetInputInfo(launchParam, *tilingDataPtr); if (!checkInputInfo.Ok()) { return Status::FailStatus(ERROR_INVALID_VALUE); } @@ -303,7 +303,7 @@ namespace AtbOps { kernelInfo.SetTilingId(tilingKey); uint32_t syncWorkspaceSize = sysWorkspaceSize + blockNum * tilingDataPtr->workspacePerCore; kernelInfo.GetScratchSizes() = {syncWorkspaceSize}; - PrintTilingData(tilingDataPtr); + PrintTilingData(*tilingDataPtr); return Status::OkStatus(); } } // namespace AtbOps \ No newline at end of file diff --git a/src/kernels/mixkernels/ring_mla/tiling/ring_mla_tiling_dependency.cpp b/src/kernels/mixkernels/ring_mla/tiling/ring_mla_tiling_dependency.cpp index de6273b7..ea58a71a 100644 --- a/src/kernels/mixkernels/ring_mla/tiling/ring_mla_tiling_dependency.cpp +++ b/src/kernels/mixkernels/ring_mla/tiling/ring_mla_tiling_dependency.cpp @@ -288,7 +288,7 @@ Status GetRINGMLATilingParam(const LaunchParam &launchParam, const RINGMLAInfo & GetNdMLATiling(mmInfo, blockDim, tilingParam, param); blockDim = mmInfo.batch == BATCH_MLA ? 
BLOCK_DIM_MLA : blockDim; } - GetTilingHead(mmInfo, param, tilingParam, torPtr); + GetTilingHead(mmInfo, param, tilingParam, *torPtr); return AtbOps::Status::OkStatus(); } diff --git a/src/kernels/mixkernels/rope/tiling/rope_tiling.cpp b/src/kernels/mixkernels/rope/tiling/rope_tiling.cpp index c932da29..147efa1e 100644 --- a/src/kernels/mixkernels/rope/tiling/rope_tiling.cpp +++ b/src/kernels/mixkernels/rope/tiling/rope_tiling.cpp @@ -34,7 +34,7 @@ static constexpr uint32_t TILING_HIGH_PREC = 21; static constexpr uint32_t TILING_HIGH_PERF = 20; static constexpr uint32_t TILING_HIGH_PERF_LARGE_NTOKENS = 23; -void RopeNdProcess(const LaunchParam &launchParam, KernelInfo &kernelInfo, RopeTilingData *tilingDataPtr) +void RopeNdProcess(const LaunchParam &launchParam, KernelInfo &kernelInfo, RopeTilingData &tilingDataPtr) { uint32_t hiddenSizeQ = static_cast(launchParam.GetInTensor(0).desc.dims[1]); uint32_t hiddenSizeK = static_cast(launchParam.GetInTensor(1).desc.dims[1]); @@ -44,12 +44,12 @@ void RopeNdProcess(const LaunchParam &launchParam, KernelInfo &kernelInfo, RopeT uint32_t batch = static_cast(launchParam.GetInTensor(4).desc.dims[0]); uint32_t maxCore = static_cast(PlatformInfo::Instance().GetCoreNum(CoreType::CORE_TYPE_VECTOR)); auto maxUbSize = static_cast(PlatformInfo::Instance().GetUbSize()); - tilingDataPtr->maxUbSize = maxUbSize; + tilingDataPtr.maxUbSize = maxUbSize; uint32_t multiple = 1; - bool condition = tilingDataPtr->cosFormat == 0 && cosSize == NUM_COSIN && + bool condition = tilingDataPtr.cosFormat == 0 && cosSize == NUM_COSIN && launchParam.GetInTensor(NUM_COSIN).desc.dtype == TENSOR_DTYPE_FLOAT16 && - ntokens >= LARGE_NTOKENS_THRESHOLD && headDim / tilingDataPtr->rotaryCoeff % ELE_NUM_FP16 == 0; + ntokens >= LARGE_NTOKENS_THRESHOLD && headDim / tilingDataPtr.rotaryCoeff % ELE_NUM_FP16 == 0; if (condition) { // 不对齐场景, multiple为1 uint32_t hiddenSize = hiddenSizeK > hiddenSizeQ ? 
hiddenSizeK : hiddenSizeQ; multiple = SLICE_SIZE_FP16_LARGE_NTOKENS / hiddenSize; @@ -69,13 +69,13 @@ void RopeNdProcess(const LaunchParam &launchParam, KernelInfo &kernelInfo, RopeT } uint32_t tempCore = (ntokens + maxCore - 1) / maxCore; uint32_t realCore = (ntokens + tempCore - 1) / tempCore; - tilingDataPtr->realCore = realCore; - tilingDataPtr->hiddenSizeQ = hiddenSizeQ; - tilingDataPtr->hiddenSizeK = hiddenSizeK; - tilingDataPtr->headDim = headDim; - tilingDataPtr->ntokens = ntokens; - tilingDataPtr->batch = batch; - tilingDataPtr->multiple = multiple; + tilingDataPtr.realCore = realCore; + tilingDataPtr.hiddenSizeQ = hiddenSizeQ; + tilingDataPtr.hiddenSizeK = hiddenSizeK; + tilingDataPtr.headDim = headDim; + tilingDataPtr.ntokens = ntokens; + tilingDataPtr.batch = batch; + tilingDataPtr.multiple = multiple; kernelInfo.SetBlockDim(realCore); MKI_LOG(DEBUG) << "Ntokens is " << ntokens; MKI_LOG(DEBUG) << "Batch is " << batch; @@ -157,7 +157,7 @@ Status RopeTiling(const LaunchParam &launchParam, KernelInfo &kernelInfo) } tilingDataPtr->headNumQ = headNumQ; tilingDataPtr->headNumK = headNumK; - auto ret = TilingKeyChose(launchParam, kernelInfo, tilingDataPtr); + auto ret = TilingKeyChose(launchParam, kernelInfo, *tilingDataPtr); if (!ret.Ok()) { return Status::FailStatus(ERROR_INVALID_VALUE); } diff --git a/src/kernels/mixkernels/rope_q_concat/tiling/rope_q_concat_tiling.cpp b/src/kernels/mixkernels/rope_q_concat/tiling/rope_q_concat_tiling.cpp index 01b09360..97ad7741 100644 --- a/src/kernels/mixkernels/rope_q_concat/tiling/rope_q_concat_tiling.cpp +++ b/src/kernels/mixkernels/rope_q_concat/tiling/rope_q_concat_tiling.cpp @@ -104,7 +104,7 @@ Status RopeQConcatTiling(const LaunchParam &launchParam, KernelInfo &kernelInfo) if (!ret.Ok()) { return Status::FailStatus(ERROR_INVALID_VALUE); } - auto retProcess = RopeNdProcess(launchParam, tilingDataPtr); + auto retProcess = RopeNdProcess(launchParam, *tilingDataPtr); if (!retProcess.Ok()) { return 
Status::FailStatus(ERROR_INVALID_VALUE); } -- Gitee From 4a422fb6092b946f6ab5393b94a25ed5b89bc4ad Mon Sep 17 00:00:00 2001 From: Vector Date: Sat, 20 Sep 2025 16:16:00 +0800 Subject: [PATCH 26/94] update --- example/op_demo/activation/README.md | 6 +++--- example/op_demo/all_gather/README.md | 6 +++--- example/op_demo/all_reduce/README.md | 6 +++--- example/op_demo/concat/README.md | 6 +++--- example/op_demo/elewise/README.md | 6 +++--- example/op_demo/faupdate/README.md | 6 +++--- example/op_demo/fused_add_topk_div/README.md | 6 +++--- example/op_demo/gather/README.md | 6 +++--- example/op_demo/layer_norm/README.md | 6 +++--- example/op_demo/linear/README.md | 6 +++--- example/op_demo/linear_parallel/README.md | 6 +++--- example/op_demo/mla_preprocess/README.md | 6 +++--- example/op_demo/multi_latent_attention/README.md | 6 +++--- example/op_demo/paged_attention/README.md | 6 +++--- example/op_demo/paged_cache_load/README.md | 2 +- example/op_demo/reshape_and_cache/README.md | 6 +++--- example/op_demo/ring_mla/README.md | 6 +++--- example/op_demo/rms_norm/README.md | 2 +- example/op_demo/rms_norm_backward/README.md | 6 +++--- example/op_demo/rope/README.md | 2 +- example/op_demo/self_attention/README.md | 6 +++--- example/op_demo/slice/README.md | 6 +++--- example/op_demo/split/README.md | 6 +++--- example/op_demo/transdata/README.md | 6 +++--- example/op_demo/transpose/README.md | 6 +++--- 25 files changed, 69 insertions(+), 69 deletions(-) diff --git a/example/op_demo/activation/README.md b/example/op_demo/activation/README.md index 2a8b3ecf..2616d541 100644 --- a/example/op_demo/activation/README.md +++ b/example/op_demo/activation/README.md @@ -9,16 +9,16 @@ 2. source [nnal安装路径]/set_env.sh 默认:source /usr/local/Ascend/nnal/atb/set_env.sh 1. 如果使用加速库源码编译,source [加速库源码路径]/output/atb/set_env.sh - e.g. 
source ./ascend-transformer-boost/output/atb/set_env.sh + 例如: source ./ascend-transformer-boost/output/atb/set_env.sh - 运行demo - bash build.sh **注意**: - - 使用cxx_abi=0(默认)时,设置`D_GLIBCXX_USE_CXX11_ABI`为0,i.e. + - 使用cxx_abi=0(默认)时,设置`D_GLIBCXX_USE_CXX11_ABI`为0,即: ```sh g++ -D_GLIBCXX_USE_CXX11_ABI=0 -I ... ``` - - 使用cxx_abi=1时,更改`D_GLIBCXX_USE_CXX11_ABI`为1,i.e. + - 使用cxx_abi=1时,更改`D_GLIBCXX_USE_CXX11_ABI`为1,即: ```sh g++ -D_GLIBCXX_USE_CXX11_ABI=1 -I ... ``` diff --git a/example/op_demo/all_gather/README.md b/example/op_demo/all_gather/README.md index feb6b94b..8897b7bf 100644 --- a/example/op_demo/all_gather/README.md +++ b/example/op_demo/all_gather/README.md @@ -9,16 +9,16 @@ 2. source [nnal安装路径]/set_env.sh 默认:source /usr/local/Ascend/nnal/atb/set_env.sh 1. 如果使用加速库源码编译,source [加速库源码路径]/output/atb/set_env.sh - e.g. source ./ascend-transformer-boost/output/atb/set_env.sh + 例如: source ./ascend-transformer-boost/output/atb/set_env.sh - 运行demo - bash build.sh **注意**: - - 使用cxx_abi=0(默认)时,设置`D_GLIBCXX_USE_CXX11_ABI`为0,i.e. + - 使用cxx_abi=0(默认)时,设置`D_GLIBCXX_USE_CXX11_ABI`为0,即: ```sh g++ -D_GLIBCXX_USE_CXX11_ABI=0 -I ... ``` - - 使用cxx_abi=1时,更改`D_GLIBCXX_USE_CXX11_ABI`为1,i.e. + - 使用cxx_abi=1时,更改`D_GLIBCXX_USE_CXX11_ABI`为1,即: ```sh g++ -D_GLIBCXX_USE_CXX11_ABI=1 -I ... ``` diff --git a/example/op_demo/all_reduce/README.md b/example/op_demo/all_reduce/README.md index 44ade794..17d8d4b0 100644 --- a/example/op_demo/all_reduce/README.md +++ b/example/op_demo/all_reduce/README.md @@ -9,16 +9,16 @@ 2. source [nnal安装路径]/set_env.sh 默认:source /usr/local/Ascend/nnal/atb/set_env.sh 1. 如果使用加速库源码编译,source [加速库源码路径]/output/atb/set_env.sh - e.g. source ./ascend-transformer-boost/output/atb/set_env.sh + 例如: source ./ascend-transformer-boost/output/atb/set_env.sh - 运行demo - bash build.sh **注意**: - - 使用cxx_abi=0(默认)时,设置`D_GLIBCXX_USE_CXX11_ABI`为0,i.e. + - 使用cxx_abi=0(默认)时,设置`D_GLIBCXX_USE_CXX11_ABI`为0,即: ```sh g++ -D_GLIBCXX_USE_CXX11_ABI=0 -I ... 
``` - - 使用cxx_abi=1时,更改`D_GLIBCXX_USE_CXX11_ABI`为1,i.e. + - 使用cxx_abi=1时,更改`D_GLIBCXX_USE_CXX11_ABI`为1,即: ```sh g++ -D_GLIBCXX_USE_CXX11_ABI=1 -I ... ``` diff --git a/example/op_demo/concat/README.md b/example/op_demo/concat/README.md index cc39ea1c..dd76fc32 100644 --- a/example/op_demo/concat/README.md +++ b/example/op_demo/concat/README.md @@ -9,16 +9,16 @@ 2. source [nnal安装路径]/set_env.sh 默认:source /usr/local/Ascend/nnal/atb/set_env.sh 1. 如果使用加速库源码编译,source [加速库源码路径]/output/atb/set_env.sh - e.g. source ./ascend-transformer-boost/output/atb/set_env.sh + 例如: source ./ascend-transformer-boost/output/atb/set_env.sh - 运行demo - bash build.sh **注意**: - - 使用cxx_abi=0(默认)时,设置`D_GLIBCXX_USE_CXX11_ABI`为0,i.e. + - 使用cxx_abi=0(默认)时,设置`D_GLIBCXX_USE_CXX11_ABI`为0,即: ```sh g++ -D_GLIBCXX_USE_CXX11_ABI=0 -I ... ``` - - 使用cxx_abi=1时,更改`D_GLIBCXX_USE_CXX11_ABI`为1,i.e. + - 使用cxx_abi=1时,更改`D_GLIBCXX_USE_CXX11_ABI`为1,即: ```sh g++ -D_GLIBCXX_USE_CXX11_ABI=1 -I ... ``` diff --git a/example/op_demo/elewise/README.md b/example/op_demo/elewise/README.md index 7a0fe069..cd9c719b 100644 --- a/example/op_demo/elewise/README.md +++ b/example/op_demo/elewise/README.md @@ -9,16 +9,16 @@ 2. source [nnal安装路径]/set_env.sh 默认:source /usr/local/Ascend/nnal/atb/set_env.sh 1. 如果使用加速库源码编译,source [加速库源码路径]/output/atb/set_env.sh - e.g. source ./ascend-transformer-boost/output/atb/set_env.sh + 例如: source ./ascend-transformer-boost/output/atb/set_env.sh - 运行demo - bash build.sh **注意**: - - 使用cxx_abi=0(默认)时,设置`D_GLIBCXX_USE_CXX11_ABI`为0,i.e. + - 使用cxx_abi=0(默认)时,设置`D_GLIBCXX_USE_CXX11_ABI`为0,即: ```sh g++ -D_GLIBCXX_USE_CXX11_ABI=0 -I ... ``` - - 使用cxx_abi=1时,更改`D_GLIBCXX_USE_CXX11_ABI`为1,i.e. + - 使用cxx_abi=1时,更改`D_GLIBCXX_USE_CXX11_ABI`为1,即: ```sh g++ -D_GLIBCXX_USE_CXX11_ABI=1 -I ... ``` diff --git a/example/op_demo/faupdate/README.md b/example/op_demo/faupdate/README.md index bc74850a..b65d1b9a 100644 --- a/example/op_demo/faupdate/README.md +++ b/example/op_demo/faupdate/README.md @@ -9,18 +9,18 @@ 2. 
source [nnal安装路径]/set_env.sh 默认:source /usr/local/Ascend/nnal/atb/set_env.sh 1. 如果使用加速库源码编译,source [加速库源码路径]/output/atb/set_env.sh - e.g. source ./ascend-transformer-boost/output/atb/set_env.sh + 例如: source ./ascend-transformer-boost/output/atb/set_env.sh - 运行demo ```sh bash build.sh ``` **注意**: - - 使用cxx_abi=0(默认)时,设置`D_GLIBCXX_USE_CXX11_ABI`为0,i.e. + - 使用cxx_abi=0(默认)时,设置`D_GLIBCXX_USE_CXX11_ABI`为0,即: ```sh g++ -D_GLIBCXX_USE_CXX11_ABI=0 -I ... ``` - - 使用cxx_abi=1时,更改`D_GLIBCXX_USE_CXX11_ABI`为1,i.e. + - 使用cxx_abi=1时,更改`D_GLIBCXX_USE_CXX11_ABI`为1,即: ```sh g++ -D_GLIBCXX_USE_CXX11_ABI=1 -I ... ``` diff --git a/example/op_demo/fused_add_topk_div/README.md b/example/op_demo/fused_add_topk_div/README.md index ac07c0f9..c4a50d74 100644 --- a/example/op_demo/fused_add_topk_div/README.md +++ b/example/op_demo/fused_add_topk_div/README.md @@ -9,16 +9,16 @@ 2. source [nnal安装路径]/set_env.sh 默认:source /usr/local/Ascend/nnal/atb/set_env.sh 1. 如果使用加速库源码编译,source [加速库源码路径]/output/atb/set_env.sh - e.g. source ./ascend-transformer-boost/output/atb/set_env.sh + 例如: source ./ascend-transformer-boost/output/atb/set_env.sh - 运行demo - bash build.sh **注意**: - - 使用cxx_abi=0(默认)时,设置`D_GLIBCXX_USE_CXX11_ABI`为0,i.e. + - 使用cxx_abi=0(默认)时,设置`D_GLIBCXX_USE_CXX11_ABI`为0,即: ```sh g++ -D_GLIBCXX_USE_CXX11_ABI=0 -I ... ``` - - 使用cxx_abi=1时,更改`D_GLIBCXX_USE_CXX11_ABI`为1,i.e. + - 使用cxx_abi=1时,更改`D_GLIBCXX_USE_CXX11_ABI`为1,即: ```sh g++ -D_GLIBCXX_USE_CXX11_ABI=1 -I ... ``` diff --git a/example/op_demo/gather/README.md b/example/op_demo/gather/README.md index 84cce479..b0e16a71 100644 --- a/example/op_demo/gather/README.md +++ b/example/op_demo/gather/README.md @@ -9,16 +9,16 @@ 2. source [nnal安装路径]/set_env.sh 默认:source /usr/local/Ascend/nnal/atb/set_env.sh 1. 如果使用加速库源码编译,source [加速库源码路径]/output/atb/set_env.sh - e.g. 
source ./ascend-transformer-boost/output/atb/set_env.sh + 例如: source ./ascend-transformer-boost/output/atb/set_env.sh - 运行demo - bash build.sh **注意**: - - 使用cxx_abi=0(默认)时,设置`D_GLIBCXX_USE_CXX11_ABI`为0,i.e. + - 使用cxx_abi=0(默认)时,设置`D_GLIBCXX_USE_CXX11_ABI`为0,即: ```sh g++ -D_GLIBCXX_USE_CXX11_ABI=0 -I ... ``` - - 使用cxx_abi=1时,更改`D_GLIBCXX_USE_CXX11_ABI`为1,i.e. + - 使用cxx_abi=1时,更改`D_GLIBCXX_USE_CXX11_ABI`为1,即: ```sh g++ -D_GLIBCXX_USE_CXX11_ABI=1 -I ... ``` diff --git a/example/op_demo/layer_norm/README.md b/example/op_demo/layer_norm/README.md index 76f0f57c..cf4437eb 100644 --- a/example/op_demo/layer_norm/README.md +++ b/example/op_demo/layer_norm/README.md @@ -9,16 +9,16 @@ 2. source [nnal安装路径]/set_env.sh 默认:source /usr/local/Ascend/nnal/atb/set_env.sh 1. 如果使用加速库源码编译,source [加速库源码路径]/output/atb/set_env.sh - e.g. source ./ascend-transformer-boost/output/atb/set_env.sh + 例如: source ./ascend-transformer-boost/output/atb/set_env.sh - 运行demo - bash build.sh **注意**: - - 使用cxx_abi=0(默认)时,设置`D_GLIBCXX_USE_CXX11_ABI`为0,i.e. + - 使用cxx_abi=0(默认)时,设置`D_GLIBCXX_USE_CXX11_ABI`为0,即: ```sh g++ -D_GLIBCXX_USE_CXX11_ABI=0 -I ... ``` - - 使用cxx_abi=1时,更改`D_GLIBCXX_USE_CXX11_ABI`为1,i.e. + - 使用cxx_abi=1时,更改`D_GLIBCXX_USE_CXX11_ABI`为1,即: ```sh g++ -D_GLIBCXX_USE_CXX11_ABI=1 -I ... ``` diff --git a/example/op_demo/linear/README.md b/example/op_demo/linear/README.md index a4f517e7..f7c45a0d 100644 --- a/example/op_demo/linear/README.md +++ b/example/op_demo/linear/README.md @@ -11,7 +11,7 @@ 2. source [nnal安装路径]/set_env.sh 默认:source /usr/local/Ascend/nnal/atb/set_env.sh 1. 如果使用加速库源码编译,source [加速库源码路径]/output/atb/set_env.sh - e.g. source ./ascend-transformer-boost/output/atb/set_env.sh + 例如: source ./ascend-transformer-boost/output/atb/set_env.sh - 运行demo @@ -20,12 +20,12 @@ ``` **注意**: - - 使用cxx_abi=0(默认)时,设置`D_GLIBCXX_USE_CXX11_ABI`为0,i.e. + - 使用cxx_abi=0(默认)时,设置`D_GLIBCXX_USE_CXX11_ABI`为0,即: ```sh g++ -D_GLIBCXX_USE_CXX11_ABI=0 -I ... 
``` - - 使用cxx_abi=1时,更改`D_GLIBCXX_USE_CXX11_ABI`为1,i.e. + - 使用cxx_abi=1时,更改`D_GLIBCXX_USE_CXX11_ABI`为1,即: ```sh g++ -D_GLIBCXX_USE_CXX11_ABI=1 -I ... diff --git a/example/op_demo/linear_parallel/README.md b/example/op_demo/linear_parallel/README.md index 7e2eb96b..3885aac5 100644 --- a/example/op_demo/linear_parallel/README.md +++ b/example/op_demo/linear_parallel/README.md @@ -9,16 +9,16 @@ 2. source [nnal安装路径]/set_env.sh 默认:source /usr/local/Ascend/nnal/atb/set_env.sh 1. 如果使用加速库源码编译,source [加速库源码路径]/output/atb/set_env.sh - e.g. source ./ascend-transformer-boost/output/atb/set_env.sh + 例如: source ./ascend-transformer-boost/output/atb/set_env.sh - 运行demo - bash build.sh **注意**: - - 使用cxx_abi=0(默认)时,设置`D_GLIBCXX_USE_CXX11_ABI`为0,i.e. + - 使用cxx_abi=0(默认)时,设置`D_GLIBCXX_USE_CXX11_ABI`为0,即: ```sh g++ -D_GLIBCXX_USE_CXX11_ABI=0 -I ... ``` - - 使用cxx_abi=1时,更改`D_GLIBCXX_USE_CXX11_ABI`为1,i.e. + - 使用cxx_abi=1时,更改`D_GLIBCXX_USE_CXX11_ABI`为1,即: ```sh g++ -D_GLIBCXX_USE_CXX11_ABI=1 -I ... ``` diff --git a/example/op_demo/mla_preprocess/README.md b/example/op_demo/mla_preprocess/README.md index 752d62c7..87c24619 100644 --- a/example/op_demo/mla_preprocess/README.md +++ b/example/op_demo/mla_preprocess/README.md @@ -9,16 +9,16 @@ 2. source [nnal安装路径]/set_env.sh 默认:source /usr/local/Ascend/nnal/atb/set_env.sh 1. 如果使用加速库源码编译,source [加速库源码路径]/output/atb/set_env.sh - e.g. source ./ascend-transformer-boost/output/atb/set_env.sh + 例如: source ./ascend-transformer-boost/output/atb/set_env.sh - 运行demo - bash build.sh **注意**: - - 使用cxx_abi=0(默认)时,设置`D_GLIBCXX_USE_CXX11_ABI`为0,i.e. + - 使用cxx_abi=0(默认)时,设置`D_GLIBCXX_USE_CXX11_ABI`为0,即: ```sh g++ -D_GLIBCXX_USE_CXX11_ABI=0 -I ... ``` - - 使用cxx_abi=1时,更改`D_GLIBCXX_USE_CXX11_ABI`为1,i.e. + - 使用cxx_abi=1时,更改`D_GLIBCXX_USE_CXX11_ABI`为1,即: ```sh g++ -D_GLIBCXX_USE_CXX11_ABI=1 -I ... 
``` diff --git a/example/op_demo/multi_latent_attention/README.md b/example/op_demo/multi_latent_attention/README.md index bc2c43a7..55ef7ce4 100644 --- a/example/op_demo/multi_latent_attention/README.md +++ b/example/op_demo/multi_latent_attention/README.md @@ -9,15 +9,15 @@ 2. source [nnal安装路径]/set_env.sh 默认:source /usr/local/Ascend/nnal/atb/set_env.sh 1. 如果使用加速库源码编译,source [加速库源码路径]/output/atb/set_env.sh - e.g. source ./ascend-transformer-boost/output/atb/set_env.sh + 例如: source ./ascend-transformer-boost/output/atb/set_env.sh - 运行demo - bash build.sh **注意**: - - 使用cxx_abi=0(默认)时,设置`D_GLIBCXX_USE_CXX11_ABI`为0,i.e. + - 使用cxx_abi=0(默认)时,设置`D_GLIBCXX_USE_CXX11_ABI`为0,即: ```sh g++ -D_GLIBCXX_USE_CXX11_ABI=0 -I ... ``` - - 使用cxx_abi=1时,更改`D_GLIBCXX_USE_CXX11_ABI`为1,i.e. + - 使用cxx_abi=1时,更改`D_GLIBCXX_USE_CXX11_ABI`为1,即: ```sh g++ -D_GLIBCXX_USE_CXX11_ABI=1 -I ... ``` diff --git a/example/op_demo/paged_attention/README.md b/example/op_demo/paged_attention/README.md index ac6cb71b..49fcbd6b 100644 --- a/example/op_demo/paged_attention/README.md +++ b/example/op_demo/paged_attention/README.md @@ -9,16 +9,16 @@ 2. source [nnal安装路径]/set_env.sh 默认:source /usr/local/Ascend/nnal/atb/set_env.sh 1. 如果使用加速库源码编译,source [加速库源码路径]/output/atb/set_env.sh - e.g. source ./ascend-transformer-boost/output/atb/set_env.sh + 例如: source ./ascend-transformer-boost/output/atb/set_env.sh - 运行demo - bash build.sh **注意**: - - 使用cxx_abi=0(默认)时,设置`D_GLIBCXX_USE_CXX11_ABI`为0,i.e. + - 使用cxx_abi=0(默认)时,设置`D_GLIBCXX_USE_CXX11_ABI`为0,即: ```sh g++ -D_GLIBCXX_USE_CXX11_ABI=0 -I ... ``` - - 使用cxx_abi=1时,更改`D_GLIBCXX_USE_CXX11_ABI`为1,i.e. + - 使用cxx_abi=1时,更改`D_GLIBCXX_USE_CXX11_ABI`为1,即: ```sh g++ -D_GLIBCXX_USE_CXX11_ABI=1 -I ... ``` diff --git a/example/op_demo/paged_cache_load/README.md b/example/op_demo/paged_cache_load/README.md index 13b7e62b..055e0bef 100644 --- a/example/op_demo/paged_cache_load/README.md +++ b/example/op_demo/paged_cache_load/README.md @@ -9,7 +9,7 @@ 2. 
source [nnal安装路径]/set_env.sh 默认:source /usr/local/Ascend/nnal/atb/set_env.sh 1. 如果使用加速库源码编译,source [加速库源码路径]/output/atb/set_env.sh - e.g. source ./ascend-transformer-boost/output/atb/set_env.sh + 例如: source ./ascend-transformer-boost/output/atb/set_env.sh - 编译、运行demo - bash build.sh diff --git a/example/op_demo/reshape_and_cache/README.md b/example/op_demo/reshape_and_cache/README.md index 42c48a87..f4e4ede9 100644 --- a/example/op_demo/reshape_and_cache/README.md +++ b/example/op_demo/reshape_and_cache/README.md @@ -9,16 +9,16 @@ 2. source [nnal安装路径]/set_env.sh 默认:source /usr/local/Ascend/nnal/atb/set_env.sh 1. 如果使用加速库源码编译,source [加速库源码路径]/output/atb/set_env.sh - e.g. source ./ascend-transformer-boost/output/atb/set_env.sh + 例如: source ./ascend-transformer-boost/output/atb/set_env.sh - 运行demo - bash build.sh **注意**: - - 使用cxx_abi=0(默认)时,设置`D_GLIBCXX_USE_CXX11_ABI`为0,i.e. + - 使用cxx_abi=0(默认)时,设置`D_GLIBCXX_USE_CXX11_ABI`为0,即: ```sh g++ -D_GLIBCXX_USE_CXX11_ABI=0 -I ... ``` - - 使用cxx_abi=1时,更改`D_GLIBCXX_USE_CXX11_ABI`为1,i.e. + - 使用cxx_abi=1时,更改`D_GLIBCXX_USE_CXX11_ABI`为1,即: ```sh g++ -D_GLIBCXX_USE_CXX11_ABI=1 -I ... ``` diff --git a/example/op_demo/ring_mla/README.md b/example/op_demo/ring_mla/README.md index cc89d794..333678bd 100644 --- a/example/op_demo/ring_mla/README.md +++ b/example/op_demo/ring_mla/README.md @@ -9,16 +9,16 @@ 2. source [nnal安装路径]/set_env.sh 默认:source /usr/local/Ascend/nnal/atb/set_env.sh 1. 如果使用加速库源码编译,source [加速库源码路径]/output/atb/set_env.sh - e.g. source ./ascend-transformer-boost/output/atb/set_env.sh + 例如: source ./ascend-transformer-boost/output/atb/set_env.sh - 运行demo - bash build.sh **注意**: - - 使用cxx_abi=0时,设置`D_GLIBCXX_USE_CXX11_ABI`为0,i.e. + - 使用cxx_abi=0时,设置`D_GLIBCXX_USE_CXX11_ABI`为0,即: ```sh g++ -D_GLIBCXX_USE_CXX11_ABI=0 -I ... ``` - - 使用cxx_abi=1时,更改`D_GLIBCXX_USE_CXX11_ABI`为1,i.e. + - 使用cxx_abi=1时,更改`D_GLIBCXX_USE_CXX11_ABI`为1,即: ```sh g++ -D_GLIBCXX_USE_CXX11_ABI=1 -I ... 
``` diff --git a/example/op_demo/rms_norm/README.md b/example/op_demo/rms_norm/README.md index 7e5a3f73..f3b43873 100644 --- a/example/op_demo/rms_norm/README.md +++ b/example/op_demo/rms_norm/README.md @@ -9,7 +9,7 @@ 2. source [nnal安装路径]/set_env.sh 默认:source /usr/local/Ascend/nnal/atb/set_env.sh 1. 如果使用加速库源码编译,source [加速库源码路径]/output/atb/set_env.sh - e.g. source ./ascend-transformer-boost/output/atb/set_env.sh + 例如: source ./ascend-transformer-boost/output/atb/set_env.sh - 编译、运行demo - bash build.sh diff --git a/example/op_demo/rms_norm_backward/README.md b/example/op_demo/rms_norm_backward/README.md index 8ab2c459..78daf3f9 100644 --- a/example/op_demo/rms_norm_backward/README.md +++ b/example/op_demo/rms_norm_backward/README.md @@ -9,16 +9,16 @@ 2. source [nnal安装路径]/set_env.sh 默认:source /usr/local/Ascend/nnal/atb/set_env.sh 1. 如果使用加速库源码编译,source [加速库源码路径]/output/atb/set_env.sh - e.g. source ./ascend-transformer-boost/output/atb/set_env.sh + 例如: source ./ascend-transformer-boost/output/atb/set_env.sh - 运行demo - bash build.sh **注意**: - - 使用cxx_abi=0(默认)时,设置`D_GLIBCXX_USE_CXX11_ABI`为0,i.e. + - 使用cxx_abi=0(默认)时,设置`D_GLIBCXX_USE_CXX11_ABI`为0,即: ```sh g++ -D_GLIBCXX_USE_CXX11_ABI=0 -I ... ``` - - 使用cxx_abi=1时,更改`D_GLIBCXX_USE_CXX11_ABI`为1,i.e. + - 使用cxx_abi=1时,更改`D_GLIBCXX_USE_CXX11_ABI`为1,即: ```sh g++ -D_GLIBCXX_USE_CXX11_ABI=1 -I ... ``` diff --git a/example/op_demo/rope/README.md b/example/op_demo/rope/README.md index 6898cb29..c6c1ec27 100644 --- a/example/op_demo/rope/README.md +++ b/example/op_demo/rope/README.md @@ -9,7 +9,7 @@ 2. source [nnal安装路径]/set_env.sh 默认:source /usr/local/Ascend/nnal/atb/set_env.sh 1. 如果使用加速库源码编译,source [加速库源码路径]/output/atb/set_env.sh - e.g. 
source ./ascend-transformer-boost/output/atb/set_env.sh + 例如: source ./ascend-transformer-boost/output/atb/set_env.sh - 编译、运行demo - bash build.sh diff --git a/example/op_demo/self_attention/README.md b/example/op_demo/self_attention/README.md index a6d8ac9a..8f428192 100644 --- a/example/op_demo/self_attention/README.md +++ b/example/op_demo/self_attention/README.md @@ -9,16 +9,16 @@ 2. source [nnal安装路径]/set_env.sh 默认:source /usr/local/Ascend/nnal/atb/set_env.sh 1. 如果使用加速库源码编译,source [加速库源码路径]/output/atb/set_env.sh - e.g. source ./ascend-transformer-boost/output/atb/set_env.sh + 例如: source ./ascend-transformer-boost/output/atb/set_env.sh - 运行demo - bash build.sh **注意**: - - 使用cxx_abi=0时,设置`D_GLIBCXX_USE_CXX11_ABI`为0,i.e. + - 使用cxx_abi=0时,设置`D_GLIBCXX_USE_CXX11_ABI`为0,即: ```sh g++ -D_GLIBCXX_USE_CXX11_ABI=0 -I ... ``` - - 使用cxx_abi=1时,更改`D_GLIBCXX_USE_CXX11_ABI`为1,i.e. + - 使用cxx_abi=1时,更改`D_GLIBCXX_USE_CXX11_ABI`为1,即: ```sh g++ -D_GLIBCXX_USE_CXX11_ABI=1 -I ... ``` diff --git a/example/op_demo/slice/README.md b/example/op_demo/slice/README.md index 9b3826c0..aa0ebb81 100644 --- a/example/op_demo/slice/README.md +++ b/example/op_demo/slice/README.md @@ -9,16 +9,16 @@ 2. source [nnal安装路径]/set_env.sh 默认:source /usr/local/Ascend/nnal/atb/set_env.sh 1. 如果使用加速库源码编译,source [加速库源码路径]/output/atb/set_env.sh - e.g. source ./ascend-transformer-boost/output/atb/set_env.sh + 例如: source ./ascend-transformer-boost/output/atb/set_env.sh - 运行demo - bash build.sh **注意**: - - 使用cxx_abi=0(默认)时,设置`D_GLIBCXX_USE_CXX11_ABI`为0,i.e. + - 使用cxx_abi=0(默认)时,设置`D_GLIBCXX_USE_CXX11_ABI`为0,即: ```sh g++ -D_GLIBCXX_USE_CXX11_ABI=0 -I ... ``` - - 使用cxx_abi=1时,更改`D_GLIBCXX_USE_CXX11_ABI`为1,i.e. + - 使用cxx_abi=1时,更改`D_GLIBCXX_USE_CXX11_ABI`为1,即: ```sh g++ -D_GLIBCXX_USE_CXX11_ABI=1 -I ... ``` diff --git a/example/op_demo/split/README.md b/example/op_demo/split/README.md index 3a1db3b2..39fa1e02 100644 --- a/example/op_demo/split/README.md +++ b/example/op_demo/split/README.md @@ -9,16 +9,16 @@ 2. 
source [nnal安装路径]/set_env.sh 默认:source /usr/local/Ascend/nnal/atb/set_env.sh 1. 如果使用加速库源码编译,source [加速库源码路径]/output/atb/set_env.sh - e.g. source ./ascend-transformer-boost/output/atb/set_env.sh + 例如: source ./ascend-transformer-boost/output/atb/set_env.sh - 运行demo - bash build.sh **注意**: - - 使用cxx_abi=0(默认)时,设置`D_GLIBCXX_USE_CXX11_ABI`为0,i.e. + - 使用cxx_abi=0(默认)时,设置`D_GLIBCXX_USE_CXX11_ABI`为0,即: ```sh g++ -D_GLIBCXX_USE_CXX11_ABI=0 -I ... ``` - - 使用cxx_abi=1时,更改`D_GLIBCXX_USE_CXX11_ABI`为1,i.e. + - 使用cxx_abi=1时,更改`D_GLIBCXX_USE_CXX11_ABI`为1,即: ```sh g++ -D_GLIBCXX_USE_CXX11_ABI=1 -I ... ``` diff --git a/example/op_demo/transdata/README.md b/example/op_demo/transdata/README.md index e162bc70..99b54219 100644 --- a/example/op_demo/transdata/README.md +++ b/example/op_demo/transdata/README.md @@ -9,16 +9,16 @@ 2. source [nnal安装路径]/set_env.sh 默认:source /usr/local/Ascend/nnal/atb/set_env.sh 1. 如果使用加速库源码编译,source [加速库源码路径]/output/atb/set_env.sh - e.g. source ./ascend-transformer-boost/output/atb/set_env.sh + 例如: source ./ascend-transformer-boost/output/atb/set_env.sh - 运行demo - bash build.sh **注意**: - - 使用cxx_abi=0(默认)时,设置`D_GLIBCXX_USE_CXX11_ABI`为0,i.e. + - 使用cxx_abi=0(默认)时,设置`D_GLIBCXX_USE_CXX11_ABI`为0,即: ```sh g++ -D_GLIBCXX_USE_CXX11_ABI=0 -I ... ``` - - 使用cxx_abi=1时,更改`D_GLIBCXX_USE_CXX11_ABI`为1,i.e. + - 使用cxx_abi=1时,更改`D_GLIBCXX_USE_CXX11_ABI`为1,即: ```sh g++ -D_GLIBCXX_USE_CXX11_ABI=1 -I ... ``` diff --git a/example/op_demo/transpose/README.md b/example/op_demo/transpose/README.md index 8a1e141f..d73e19e9 100644 --- a/example/op_demo/transpose/README.md +++ b/example/op_demo/transpose/README.md @@ -9,16 +9,16 @@ 2. source [nnal安装路径]/set_env.sh 默认:source /usr/local/Ascend/nnal/atb/set_env.sh 1. 如果使用加速库源码编译,source [加速库源码路径]/output/atb/set_env.sh - e.g. source ./ascend-transformer-boost/output/atb/set_env.sh + 例如: source ./ascend-transformer-boost/output/atb/set_env.sh - 运行demo - bash build.sh **注意**: - - 使用cxx_abi=0(默认)时,设置`D_GLIBCXX_USE_CXX11_ABI`为0,i.e. 
+ - 使用cxx_abi=0(默认)时,设置`D_GLIBCXX_USE_CXX11_ABI`为0,即: ```sh g++ -D_GLIBCXX_USE_CXX11_ABI=0 -I ... ``` - - 使用cxx_abi=1时,更改`D_GLIBCXX_USE_CXX11_ABI`为1,i.e. + - 使用cxx_abi=1时,更改`D_GLIBCXX_USE_CXX11_ABI`为1,即: ```sh g++ -D_GLIBCXX_USE_CXX11_ABI=1 -I ... ``` -- Gitee From 3615ab3f4fcecce6ade9f71dd9220dd9827db012 Mon Sep 17 00:00:00 2001 From: huangxiaolan Date: Mon, 22 Sep 2025 09:29:48 +0800 Subject: [PATCH 27/94] fix function point --- .../mla_preprocess/tiling/mla_preprocess_tiling.cpp | 2 +- .../multi_latent_attention/tiling/mla_tiling_dependency.cpp | 2 +- .../swi_glu_quant/tiling/swi_glu_quant_tiling.cpp | 4 ++-- .../swi_glu_quant/tiling/swi_glu_quant_tiling_utils.h | 6 +++--- .../linear_parallel/linear_parallel_aclnn_runner.cpp | 4 ++-- .../mla_preprocess/mla_preprocess_aclnn_runner.cpp | 4 ++-- 6 files changed, 11 insertions(+), 11 deletions(-) diff --git a/src/kernels/mixkernels/mla_preprocess/tiling/mla_preprocess_tiling.cpp b/src/kernels/mixkernels/mla_preprocess/tiling/mla_preprocess_tiling.cpp index eee7b7db..4c9b0727 100644 --- a/src/kernels/mixkernels/mla_preprocess/tiling/mla_preprocess_tiling.cpp +++ b/src/kernels/mixkernels/mla_preprocess/tiling/mla_preprocess_tiling.cpp @@ -320,7 +320,7 @@ public: void RmsNormQuantTiling(const uint32_t numTokens); void RopeConcatTiling(const OpParam::MlaPreprocess ¶m, const uint32_t &aicNum); void EinSumQuantTiling(const OpParam::MlaPreprocess ¶m, const uint32_t &aicNum, const TensorDType inDtype); - void SetTiling(AtbOps::MlaTilingData *tilingParam); + void SetTiling(AtbOps::MlaTilingData &tilingParam); void SetTilingKey(const Mki::LaunchParam &launchParam, Mki::KernelInfo &kernelInfo); void SetMlapoWorkSpace(const TensorDType inDtype, const OpParam::MlaPreprocess ¶m, Mki::KernelInfo &kernelInfo); }; diff --git a/src/kernels/mixkernels/multi_latent_attention/tiling/mla_tiling_dependency.cpp b/src/kernels/mixkernels/multi_latent_attention/tiling/mla_tiling_dependency.cpp index c662de13..896c23eb 100644 --- 
a/src/kernels/mixkernels/multi_latent_attention/tiling/mla_tiling_dependency.cpp +++ b/src/kernels/mixkernels/multi_latent_attention/tiling/mla_tiling_dependency.cpp @@ -377,7 +377,7 @@ Status GetMLATilingParam(const LaunchParam &launchParam, MLAInfo &mmInfo, GetNdMLATiling(mmInfo, blockDim, tilingParam, param); blockDim = mmInfo.batch == BATCH_MLA ? BLOCK_DIM_MLA : blockDim; } - GetTilingHead(mmInfo, param, tilingParam, torPtr, blockDim); + GetTilingHead(mmInfo, param, tilingParam, *torPtr, blockDim); return AtbOps::Status::OkStatus(); } diff --git a/src/kernels/mixkernels/swi_glu_quant/tiling/swi_glu_quant_tiling.cpp b/src/kernels/mixkernels/swi_glu_quant/tiling/swi_glu_quant_tiling.cpp index 46579d28..51b6cc14 100644 --- a/src/kernels/mixkernels/swi_glu_quant/tiling/swi_glu_quant_tiling.cpp +++ b/src/kernels/mixkernels/swi_glu_quant/tiling/swi_glu_quant_tiling.cpp @@ -93,13 +93,13 @@ Status SwiGluQuantTiling(const LaunchParam &launchParam, KernelInfo &kernelInfo) tilingData->blockNum = BLOCK_SIZE / SIZE_OF_FLOAT16; tilingData->cacheLineLen = L2_CACHE_LINE_SIZE / SIZE_OF_FLOAT16; const Mki::SVector &xShape = launchParam.GetInTensor(0).desc.dims; - MKI_CHECK_NO_LOG(SetTotalShape(xShape, tilingData), return Status::FailStatus(ERROR_INVALID_VALUE)); + MKI_CHECK_NO_LOG(SetTotalShape(xShape, *tilingData), return Status::FailStatus(ERROR_INVALID_VALUE)); MKI_CHECK_NO_LOG(CalTilingData(tilingData), return Status::FailStatus(ERROR_INVALID_VALUE)); SetTilingData(tilingData); SwigluQuantTilingKeyChose(launchParam, kernelInfo); kernelInfo.SetBlockDim(tilingData->coreNumUsed); - PrintSwiQuantTiling(tilingData); + PrintSwiQuantTiling(*tilingData); return Status::OkStatus(); } } \ No newline at end of file diff --git a/src/kernels/mixkernels/swi_glu_quant/tiling/swi_glu_quant_tiling_utils.h b/src/kernels/mixkernels/swi_glu_quant/tiling/swi_glu_quant_tiling_utils.h index c1e1b60f..829268fd 100644 --- a/src/kernels/mixkernels/swi_glu_quant/tiling/swi_glu_quant_tiling_utils.h +++ 
b/src/kernels/mixkernels/swi_glu_quant/tiling/swi_glu_quant_tiling_utils.h @@ -80,7 +80,7 @@ inline bool CalculateMaxUbSizePerRow(SwiGluQuantTilingData& tilingData) uint32_t alignedColLen = AlignUp(colLen, tilingData.blockNum); MKI_CHECK(alignedColLen != 0, "CalculateMaxUbSizePerRow Unsupported alignedColLen == 0", return false); MKI_LOG(INFO) << "alignedColLen:" << alignedColLen << "\n"; - uint32_t ubAvail = tilingDat.dataNumSingleUb / alignedColLen; + uint32_t ubAvail = tilingData.dataNumSingleUb / alignedColLen; MKI_LOG(INFO) << "tilingData.dataNumSingleUb:" << tilingData.dataNumSingleUb << "\n"; MKI_LOG(INFO) << "ubAvail:" << ubAvail << "\n"; MKI_CHECK(ubAvail != 0, "The input vector is too large. It is not supported currently.", return false); @@ -93,9 +93,9 @@ inline bool CalculateMaxUbSizePerRow(SwiGluQuantTilingData& tilingData) return true; } -bool CalTilingData(SwiGluQuantTilingData *tilingData); +bool CalTilingData(SwiGluQuantTilingData &tilingData); -void SetTilingData(SwiGluQuantTilingData *tilingData); +void SetTilingData(SwiGluQuantTilingData &tilingData); } // namespace AsdOps #endif // OPS_SWI_GLU_QUANT_TILING_H \ No newline at end of file diff --git a/src/ops_infer/linear_parallel/linear_parallel_aclnn_runner.cpp b/src/ops_infer/linear_parallel/linear_parallel_aclnn_runner.cpp index 7e151f31..c98fb564 100644 --- a/src/ops_infer/linear_parallel/linear_parallel_aclnn_runner.cpp +++ b/src/ops_infer/linear_parallel/linear_parallel_aclnn_runner.cpp @@ -193,13 +193,13 @@ Status LinearParallelAclnnRunner::LoadMethodMatmulReduceScatter() static DlManager dlManager = DlManager(std::string(std::getenv("ASCEND_HOME_PATH")) + "/lib64/libopapi.so"); Status ret = dlManager.getSymbol("aclnnMatmulReduceScatterV2GetWorkspaceSize", - (void **)&LinearParallelAclnnRunner::aclnnMatmulReduceScatterV2GetWorkspaceSizeFunc_); + (void *&)&LinearParallelAclnnRunner::aclnnMatmulReduceScatterV2GetWorkspaceSizeFunc_); if (ret != NO_ERROR) { ATB_LOG(ERROR) << "load 
aclnnMatmulReduceScatterV2GetWorkspaceSize failed! Consider upgrade the CANN first!"; return ret; } ret = dlManager.getSymbol("aclnnMatmulReduceScatterV2", - (void **)&LinearParallelAclnnRunner::aclnnMatmulReduceScatterV2Func_); + (void *&)&LinearParallelAclnnRunner::aclnnMatmulReduceScatterV2Func_); if (ret != NO_ERROR) { ATB_LOG(ERROR) << "load aclnnMatmulReduceScatterV2 failed! Consider upgrade the CANN first!"; return ret; diff --git a/src/ops_infer/mla_preprocess/mla_preprocess_aclnn_runner.cpp b/src/ops_infer/mla_preprocess/mla_preprocess_aclnn_runner.cpp index 4c5a102e..7f7957e0 100644 --- a/src/ops_infer/mla_preprocess/mla_preprocess_aclnn_runner.cpp +++ b/src/ops_infer/mla_preprocess/mla_preprocess_aclnn_runner.cpp @@ -206,12 +206,12 @@ Status MlaPreprocessAclnnRunner::LoadMethod() } static DlManager dlManager = DlManager(std::string(std::getenv("ASCEND_HOME_PATH")) + "/lib64/libopapi.so"); Status ret = dlManager.getSymbol("aclnnMlaPreprocessGetWorkspaceSize", - (void **)&MlaPreprocessAclnnRunner::aclnnGetWorkspaceSizeFunc_); + (void *&)&MlaPreprocessAclnnRunner::aclnnGetWorkspaceSizeFunc_); if (ret != NO_ERROR) { ATB_LOG(ERROR) << "load aclnnMlaPreprocessGetWorkspaceSize failed! Consider upgrade the CANN first!"; return ret; } - ret = dlManager.getSymbol("aclnnMlaPreprocess", (void **)&MlaPreprocessAclnnRunner::aclnnExecuteFunc_); + ret = dlManager.getSymbol("aclnnMlaPreprocess", (void *&)&MlaPreprocessAclnnRunner::aclnnExecuteFunc_); if (ret != NO_ERROR) { ATB_LOG(ERROR) << "load aclnnMlaPreprocess failed! 
Consider upgrade the CANN first!"; return ret; -- Gitee From 107e31a04d7d4c688b9516c25cac0576699f0804 Mon Sep 17 00:00:00 2001 From: huangxiaolan Date: Mon, 22 Sep 2025 11:28:32 +0800 Subject: [PATCH 28/94] fix function point --- include/atb/atb_acl.h | 14 +++++++------- src/atb/operation/if_operation.cpp | 4 ++-- src/atb/operation/operation_base.h | 2 +- src/atb/runner/ops_runner.h | 2 +- src/cinterface/atb_acl_util.cpp | 4 ++-- .../tiling/fused_add_topk_div_tiling.cpp | 4 ++-- .../tiling/mla_preprocess_tiling.cpp | 2 +- src/kernels/mixkernels/rope/tiling/rope_tiling.cpp | 2 +- .../swi_glu_quant/tiling/swi_glu_quant_tiling.cpp | 4 ++-- .../linear_parallel_aclnn_runner.cpp | 4 ++-- .../mla_preprocess/mla_preprocess_aclnn_runner.cpp | 4 ++-- 11 files changed, 23 insertions(+), 23 deletions(-) diff --git a/include/atb/atb_acl.h b/include/atb/atb_acl.h index dfd8f0d4..377d37d7 100644 --- a/include/atb/atb_acl.h +++ b/include/atb/atb_acl.h @@ -55,7 +55,7 @@ atb::Status AtbFusedAddTopkDivGetWorkspaceSize(const aclTensor *x, const aclTens const aclTensor *mappingTable, uint32_t groupNum, uint32_t groupTopk, uint32_t n, uint32_t k, int activationType, bool isNorm, float scale, bool enableExpertMapping, aclTensor *y, aclTensor *indices, - uint64_t *workspaceSize, atb::Operation **op, atb::Context *context); + uint64_t &workspaceSize, atb::Operation **op, atb::Context *context); //! //! \brief 关于FusedAddTopkDiv算子使用aclnn风格调用的2段式接口的第2段, @@ -101,7 +101,7 @@ atb::Status AtbMLAGetWorkspaceSize(const aclTensor *qNope, const aclTensor *qRop const aclTensor *mask, const aclTensor *qSeqLen, const aclTensor *qkDescale, const aclTensor *pvDescale, int32_t headNum, float qkScale, int32_t kvHeadNum, int maskType, int calcType, uint8_t cacheMode, aclTensor *attenOut, aclTensor *lse, - uint64_t *workspaceSize, atb::Operation **op, atb::Context *context); + uint64_t &workspaceSize, atb::Operation **op, atb::Context &context); //! //! 
\brief 关于MLA算子使用aclnn风格调用的2段式接口的第2段, @@ -142,7 +142,7 @@ atb::Status AtbMLAPreFillGetWorkspaceSize(const aclTensor *q, const aclTensor *q const aclTensor *kRope, const aclTensor *v, const aclTensor *qSeqLen, const aclTensor *kvSeqLen, const aclTensor *mask, int32_t headNum, float qkScale, int32_t kvHeadNum, int maskType, uint8_t cacheMode, aclTensor *attenOut, - uint64_t *workspaceSize, atb::Operation **op, atb::Context *context); + uint64_t &workspaceSize, atb::Operation **op, atb::Context *context); //! //! \brief MLA prefill 处理接口 @@ -214,7 +214,7 @@ atb::Status AtbMLAPreprocessGetWorkspaceSize( const aclTensor *kvCacheRope, const aclTensor *slotmapping, const aclTensor *ctkvScale, const aclTensor *qNopeScale, uint32_t wdqDim, uint32_t qRopeDim, uint32_t kRopeDim, float epsilon, uint32_t qRotaryCoeff, uint32_t kRotaryCoeff, bool transposeWdq, bool transposeWuq, bool transposeWuk, uint8_t cacheMode, uint16_t quantMode, aclTensor *qOut0, - aclTensor *kvCacheOut0, aclTensor *qOut1, aclTensor *kvCacheOut1, uint64_t *workspaceSize, atb::Operation **op, + aclTensor *kvCacheOut0, aclTensor *qOut1, aclTensor *kvCacheOut1, uint64_t &workspaceSize, atb::Operation **op, atb::Context *context); //! @@ -252,7 +252,7 @@ atb::Status AtbPagedCacheLoadGetWorkspaceSize(const aclTensor *keyCache, const a const aclTensor *blockTables, const aclTensor *contextLens, const aclTensor *key, const aclTensor *value, const aclTensor *seqStarts, int8_t kvCacheCfg, bool isSeqLensCumsumType, bool hasSeqStarts, - uint64_t *workspaceSize, atb::Operation **op, atb::Context *context); + uint64_t &workspaceSize, atb::Operation **op, atb::Context *context); //! //! 
\brief 关于PagedCacheLoad算子使用aclnn风格调用的2段式接口的第2段, @@ -300,7 +300,7 @@ atb::Status AtbRingMLAGetWorkspaceSize(const aclTensor *querySplit1, const aclTe const aclTensor *mask, const aclTensor *seqLen, const aclTensor *prevOut, const aclTensor *prevLse, int32_t headNum, int32_t kvHeadNum, float qkScale, int kernelType, int maskType, int inputLayout, int calcType, aclTensor *output, - aclTensor *softmaxLse, uint64_t *workspaceSize, atb::Operation **op, + aclTensor *softmaxLse, uint64_t &workspaceSize, atb::Operation **op, atb::Context *context); //! @@ -344,7 +344,7 @@ atb::Status AtbSelfAttentionPrefixEncoderGetWorkspaceSize(const aclTensor *query const aclTensor *mask, const aclTensor *seqLen, const aclTensor *kvSeqLen, const aclTensor *slopes, int maskType, int32_t headNum, int32_t kvHeadNum, - float qkScale, aclTensor *attnOut, uint64_t *workspaceSize, + float qkScale, aclTensor *attnOut, uint64_t &workspaceSize, atb::Operation **op, atb::Context *context); //! diff --git a/src/atb/operation/if_operation.cpp b/src/atb/operation/if_operation.cpp index aa929ade..a9c8f406 100644 --- a/src/atb/operation/if_operation.cpp +++ b/src/atb/operation/if_operation.cpp @@ -95,7 +95,7 @@ Status IfOperation::Setup(const VariantPack &variantPack, uint64_t &workspaceSiz } else { ATB_LOG(WARN) << GetLogPrefix() << "Operation already selected, resetting opSelected_..."; } - Status st = GetOperationFromCondition(&opSelected_); + Status st = GetOperationFromCondition(opSelected_); if (st != NO_ERROR) { ATB_LOG(ERROR) << GetLogPrefix() << "Failed to select operation based on condition!"; } @@ -155,7 +155,7 @@ std::shared_ptr IfOperation::CreateRunner(Context &context) const if (!opSelected_) { ATB_LOG(INFO) << GetLogPrefix() << "Operation not selected yet, executing create runner as part of graph, setting opSelected_..."; - Status st = GetOperationFromCondition(&opSelected_); + Status st = GetOperationFromCondition(opSelected_); if (st != NO_ERROR) { ATB_LOG(ERROR) << GetLogPrefix() << 
"Failed to select operation based on condition!"; } diff --git a/src/atb/operation/operation_base.h b/src/atb/operation/operation_base.h index 309fc0bf..f0f5d791 100644 --- a/src/atb/operation/operation_base.h +++ b/src/atb/operation/operation_base.h @@ -39,7 +39,7 @@ public: Status InferShape(const SVector &inTensorDescs, SVector &outTensorDescs) const override; Status Setup(const VariantPack &variantPack, uint64_t &workspaceSize, Context *context) override; Status Execute(const VariantPack &variantPack, uint8_t *workspace, uint64_t workspaceSize, - Context *context) override; + Context &context) override; Status SetOperationBaseIds(const std::vector &operationBaseIds, const int64_t nodeId); virtual nlohmann::json GetParamJson() const; const std::vector &GetOperationBaseIds(); diff --git a/src/atb/runner/ops_runner.h b/src/atb/runner/ops_runner.h index e2b93669..89a13a8d 100644 --- a/src/atb/runner/ops_runner.h +++ b/src/atb/runner/ops_runner.h @@ -99,7 +99,7 @@ private: Status UpdateDeviceRealAddr(const RunnerVariantPack &runnerVariantPack); Status RunKernel(KernelGraphNode &node, size_t nodeId, ContextBase *context) const; Status FillSingleKernelHostTilingBuffer(KernelGraphNode &node, size_t nodeId, uint8_t *kernelHostTilingBuffer, - size_t tilingSize, ContextBase *context); + size_t tilingSize, ContextBase &context); void MallocLocalInternalTensor(const KernelGraphNode &node, size_t nodeId, size_t tensorId, const Mki::Tensor &infershapedOutTensor, Mki::Tensor *outTensor); void MallocGlobalInternalTensor(const KernelGraphNode &node, size_t nodeId, size_t tensorId, diff --git a/src/cinterface/atb_acl_util.cpp b/src/cinterface/atb_acl_util.cpp index c4b28caf..40ec98f6 100644 --- a/src/cinterface/atb_acl_util.cpp +++ b/src/cinterface/atb_acl_util.cpp @@ -57,7 +57,7 @@ atb::Status aclTensorToAtbTensor(const aclTensor *aclTensorSrc, atb::Tensor *atb atbTensorDst->desc = desc; atbTensorDst->deviceData = aclTensorSrc->GetData(); atbTensorDst->hostData = nullptr; - 
int64_t tensorSize = GetTensorSize(aclTensorSrc); + int64_t tensorSize = GetTensorSize(*aclTensorSrc); int64_t dataTypeSize = static_cast(aclDataTypeSize(dataType)); if (tensorSize > MAX_TENSOR_SIZE / dataTypeSize) { ATB_LOG(ERROR) << "The size of a tensor * dataTypeSize should be no more than 256GB, but got tensor size: " @@ -97,7 +97,7 @@ atb::Status aclTensorToAtbTensorHost(const aclTensor *aclTensorSrc, atb::Tensor atbTensorDst->desc = desc; atbTensorDst->deviceData = nullptr; atbTensorDst->hostData = aclTensorSrc->GetData(); - int64_t tensorSize = GetTensorSize(aclTensorSrc); + int64_t tensorSize = GetTensorSize(*aclTensorSrc); int64_t dataTypeSize = static_cast(aclDataTypeSize(dataType)); if (tensorSize > MAX_TENSOR_SIZE / dataTypeSize) { ATB_LOG(ERROR) << "The size of a tensor * dataTypeSize should be no more than 256GB, but got tensor size: " diff --git a/src/kernels/mixkernels/fused_add_topk_div/tiling/fused_add_topk_div_tiling.cpp b/src/kernels/mixkernels/fused_add_topk_div/tiling/fused_add_topk_div_tiling.cpp index e3a57c7a..da682dcf 100644 --- a/src/kernels/mixkernels/fused_add_topk_div/tiling/fused_add_topk_div_tiling.cpp +++ b/src/kernels/mixkernels/fused_add_topk_div/tiling/fused_add_topk_div_tiling.cpp @@ -165,10 +165,10 @@ namespace AtbOps { template void FusedAddTopkDivTiling::GetTiling(TilingData *tilingDataPtr) { - GetTilingKey(tilingDataPtr); + GetTilingKey(*tilingDataPtr); GetUsedCore(); SplitUb(); - FillTilingData(tilingDataPtr); + FillTilingData(*tilingDataPtr); } template diff --git a/src/kernels/mixkernels/mla_preprocess/tiling/mla_preprocess_tiling.cpp b/src/kernels/mixkernels/mla_preprocess/tiling/mla_preprocess_tiling.cpp index 4c9b0727..2b264942 100644 --- a/src/kernels/mixkernels/mla_preprocess/tiling/mla_preprocess_tiling.cpp +++ b/src/kernels/mixkernels/mla_preprocess/tiling/mla_preprocess_tiling.cpp @@ -624,7 +624,7 @@ Mki::Status MlaPreprocessTiling::Init(const Mki::LaunchParam &launchParam, Mki:: false, // enDequant deqOnTheFly); 
// in bf16.cce? mm3TilingApi.GetTilingData(tilingParam->mm3); - SetTiling(tilingParam); + SetTiling(*tilingParam); MKI_LOG(INFO) << *tilingParam; SetMlapoWorkSpace(inDtype, param, kernelInfo); kernelInfo.SetBlockDim(aicNum); diff --git a/src/kernels/mixkernels/rope/tiling/rope_tiling.cpp b/src/kernels/mixkernels/rope/tiling/rope_tiling.cpp index 147efa1e..43704ab2 100644 --- a/src/kernels/mixkernels/rope/tiling/rope_tiling.cpp +++ b/src/kernels/mixkernels/rope/tiling/rope_tiling.cpp @@ -148,7 +148,7 @@ Status RopeTiling(const LaunchParam &launchParam, KernelInfo &kernelInfo) MKI_LOG(DEBUG) << "cosFormat is " << tilingDataPtr->cosFormat; uint32_t headNumQ = 1; uint32_t headNumK = 1; - RopeNdProcess(launchParam, kernelInfo, tilingDataPtr); + RopeNdProcess(launchParam, kernelInfo, *tilingDataPtr); if (tilingDataPtr->headDim != 0) { headNumQ = tilingDataPtr->hiddenSizeQ / tilingDataPtr->headDim; headNumK = tilingDataPtr->hiddenSizeK / tilingDataPtr->headDim; diff --git a/src/kernels/mixkernels/swi_glu_quant/tiling/swi_glu_quant_tiling.cpp b/src/kernels/mixkernels/swi_glu_quant/tiling/swi_glu_quant_tiling.cpp index 51b6cc14..05924aae 100644 --- a/src/kernels/mixkernels/swi_glu_quant/tiling/swi_glu_quant_tiling.cpp +++ b/src/kernels/mixkernels/swi_glu_quant/tiling/swi_glu_quant_tiling.cpp @@ -94,8 +94,8 @@ Status SwiGluQuantTiling(const LaunchParam &launchParam, KernelInfo &kernelInfo) tilingData->cacheLineLen = L2_CACHE_LINE_SIZE / SIZE_OF_FLOAT16; const Mki::SVector &xShape = launchParam.GetInTensor(0).desc.dims; MKI_CHECK_NO_LOG(SetTotalShape(xShape, *tilingData), return Status::FailStatus(ERROR_INVALID_VALUE)); - MKI_CHECK_NO_LOG(CalTilingData(tilingData), return Status::FailStatus(ERROR_INVALID_VALUE)); - SetTilingData(tilingData); + MKI_CHECK_NO_LOG(CalTilingData(*tilingData), return Status::FailStatus(ERROR_INVALID_VALUE)); + SetTilingData(*tilingData); SwigluQuantTilingKeyChose(launchParam, kernelInfo); kernelInfo.SetBlockDim(tilingData->coreNumUsed); diff --git 
a/src/ops_infer/linear_parallel/linear_parallel_aclnn_runner.cpp b/src/ops_infer/linear_parallel/linear_parallel_aclnn_runner.cpp index c98fb564..333f1bb8 100644 --- a/src/ops_infer/linear_parallel/linear_parallel_aclnn_runner.cpp +++ b/src/ops_infer/linear_parallel/linear_parallel_aclnn_runner.cpp @@ -193,13 +193,13 @@ Status LinearParallelAclnnRunner::LoadMethodMatmulReduceScatter() static DlManager dlManager = DlManager(std::string(std::getenv("ASCEND_HOME_PATH")) + "/lib64/libopapi.so"); Status ret = dlManager.getSymbol("aclnnMatmulReduceScatterV2GetWorkspaceSize", - (void *&)&LinearParallelAclnnRunner::aclnnMatmulReduceScatterV2GetWorkspaceSizeFunc_); + (void *&)LinearParallelAclnnRunner::aclnnMatmulReduceScatterV2GetWorkspaceSizeFunc_); if (ret != NO_ERROR) { ATB_LOG(ERROR) << "load aclnnMatmulReduceScatterV2GetWorkspaceSize failed! Consider upgrade the CANN first!"; return ret; } ret = dlManager.getSymbol("aclnnMatmulReduceScatterV2", - (void *&)&LinearParallelAclnnRunner::aclnnMatmulReduceScatterV2Func_); + (void *&)LinearParallelAclnnRunner::aclnnMatmulReduceScatterV2Func_); if (ret != NO_ERROR) { ATB_LOG(ERROR) << "load aclnnMatmulReduceScatterV2 failed! 
Consider upgrade the CANN first!"; return ret; diff --git a/src/ops_infer/mla_preprocess/mla_preprocess_aclnn_runner.cpp b/src/ops_infer/mla_preprocess/mla_preprocess_aclnn_runner.cpp index 7f7957e0..552d2594 100644 --- a/src/ops_infer/mla_preprocess/mla_preprocess_aclnn_runner.cpp +++ b/src/ops_infer/mla_preprocess/mla_preprocess_aclnn_runner.cpp @@ -206,12 +206,12 @@ Status MlaPreprocessAclnnRunner::LoadMethod() } static DlManager dlManager = DlManager(std::string(std::getenv("ASCEND_HOME_PATH")) + "/lib64/libopapi.so"); Status ret = dlManager.getSymbol("aclnnMlaPreprocessGetWorkspaceSize", - (void *&)&MlaPreprocessAclnnRunner::aclnnGetWorkspaceSizeFunc_); + (void *&)MlaPreprocessAclnnRunner::aclnnGetWorkspaceSizeFunc_); if (ret != NO_ERROR) { ATB_LOG(ERROR) << "load aclnnMlaPreprocessGetWorkspaceSize failed! Consider upgrade the CANN first!"; return ret; } - ret = dlManager.getSymbol("aclnnMlaPreprocess", (void *&)&MlaPreprocessAclnnRunner::aclnnExecuteFunc_); + ret = dlManager.getSymbol("aclnnMlaPreprocess", (void *&)MlaPreprocessAclnnRunner::aclnnExecuteFunc_); if (ret != NO_ERROR) { ATB_LOG(ERROR) << "load aclnnMlaPreprocess failed! Consider upgrade the CANN first!"; return ret; -- Gitee From edf688c291f5c48b35e89425635c3241fe35da32 Mon Sep 17 00:00:00 2001 From: huangxiaolan Date: Mon, 22 Sep 2025 15:14:14 +0800 Subject: [PATCH 29/94] fix operation.h --- include/atb/operation.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/atb/operation.h b/include/atb/operation.h index f3aeec56..c1b48e1d 100644 --- a/include/atb/operation.h +++ b/include/atb/operation.h @@ -95,7 +95,7 @@ public: //! \return 状态值,如果成功,返回NO_ERROR //! virtual Status Execute(const VariantPack &variantPack, uint8_t *workspace, uint64_t workspaceSize, - Context *context) = 0; + Context &context) = 0; }; //! 
-- Gitee From 8a69220c6c6c275d1e860557c489d1c175f86c90 Mon Sep 17 00:00:00 2001 From: He Changcheng Date: Tue, 9 Sep 2025 11:36:49 +0800 Subject: [PATCH 30/94] fix --- .../src/ascendc_kernels/allreduce_big_data.h | 18 ++++++++++++++++++ src/kernels/lcal/src/ascendc_kernels/lccl_op.h | 9 ++++++++- 2 files changed, 26 insertions(+), 1 deletion(-) diff --git a/src/kernels/lcal/src/ascendc_kernels/allreduce_big_data.h b/src/kernels/lcal/src/ascendc_kernels/allreduce_big_data.h index f8ce0276..5623ab45 100644 --- a/src/kernels/lcal/src/ascendc_kernels/allreduce_big_data.h +++ b/src/kernels/lcal/src/ascendc_kernels/allreduce_big_data.h @@ -30,6 +30,8 @@ public: DumpLcclLogInfo(LogId::INIT, static_cast(op)); if constexpr(!std::is_same_v) { BuildScaleOffset(scale, scaleCount, offset); + this->input = input; + this->output = output; } if (blockIdx >= PING_PONG_SIZE * rankSize) { @@ -124,6 +126,22 @@ public: } DumpLcclLogInfo(LogId::PROCESS, static_cast(atomOp)); } + + FORCE_INLINE_AICORE void SupportBigScale() + { + if constexpr(!std::is_same_v) { + constexpr int32_t bigScaleFlagOffset = 2; + if (blockIdx == 0) { + inputGt.SetGlobalBuffer((__gm__ U*)input); + outputGt.SetGlobalBuffer((__gm__ T*)output); + CpGM2GMWithScale(len, inputGt, outputGt, COPYONLY); + sync.SetSyncFlag(magic, 0, blockNum * bigScaleFlagOffset, rank); + } else { + sync.WaitSyncFlag(magic, 0, blockNum * bigScaleFlagOffset, rank); + } + } + return; + } private: FORCE_INLINE_AICORE void Producer() { diff --git a/src/kernels/lcal/src/ascendc_kernels/lccl_op.h b/src/kernels/lcal/src/ascendc_kernels/lccl_op.h index bf54ce2b..115c2690 100644 --- a/src/kernels/lcal/src/ascendc_kernels/lccl_op.h +++ b/src/kernels/lcal/src/ascendc_kernels/lccl_op.h @@ -129,6 +129,7 @@ extern "C" __global__ __aicore__ void LcalAllReduce_##type##suffix(KERNELS_ARGS_ constexpr int32_t cceSmallDataSize = 2 * 1024 * 1024; \ constexpr int32_t smallDataSize910a3 = 32 * 1024 * 1024; \ constexpr int32_t rankSize910a3 = 16; \ + constexpr 
int32_t scaleCountMax = 12 * 1024 * 1024; \ __gm__ type * shareAddrs[LCAL_MAX_RANK_SIZE]; \ GET_IPC_MEM_ARGS(type); \ if ((extraFlag & ExtraFlag::TOPO_PCIE) != 0) { \ @@ -142,8 +143,14 @@ extern "C" __global__ __aicore__ void LcalAllReduce_##type##suffix(KERNELS_ARGS_ CLASS_OP_QUANT_LAUNCH(AllReduceOneShot, half, int8_t); \ } else if (len * sizeof(type) <= quantSmallDataSize) { \ CLASS_OP_QUANT_LAUNCH(AllReduceTwoShot, half, int8_t); \ - } else { \ + } else if (scaleCount * rankSize <= scaleCountMax) { \ CLASS_OP_QUANT_LAUNCH(AllReduceBigData, half, int8_t); \ + } else { \ + AllReduceBigData opTmp(localRank, localRankSize, extraFlag); \ + opTmp.Init(KERNELS_ARGS_CALL()); \ + opTmp.SupportBigScale(); \ + input = output; \ + CLASS_OP_LAUNCH(AllReduceBigData, half); \ } \ } else if ((extraFlag & ExtraFlag::TOPO_910B2C) != 0 && rankSize > smallRankSize) { \ if (len * sizeof(type) < cceSmallDataSize) { \ -- Gitee From 74c01e931f88f3a5e99b271d1b39b8281dd95037 Mon Sep 17 00:00:00 2001 From: He Changcheng Date: Tue, 9 Sep 2025 11:54:52 +0800 Subject: [PATCH 31/94] fix --- src/kernels/lcal/src/ascendc_kernels/allreduce_big_data.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/kernels/lcal/src/ascendc_kernels/allreduce_big_data.h b/src/kernels/lcal/src/ascendc_kernels/allreduce_big_data.h index 5623ab45..f8d7c9d5 100644 --- a/src/kernels/lcal/src/ascendc_kernels/allreduce_big_data.h +++ b/src/kernels/lcal/src/ascendc_kernels/allreduce_big_data.h @@ -269,6 +269,8 @@ private: T offset = 0; bool isEnableScale = false; bool isVectorScale = false; + GM_ADDR input = nullptr; + GM_ADDR output = nullptr; }; #endif // LCCL_ALLREDUCE_BIG_DATA_H \ No newline at end of file -- Gitee From 1fb0916239760d2a32227275ee48375f0a19d2fd Mon Sep 17 00:00:00 2001 From: He Changcheng Date: Thu, 4 Sep 2025 11:14:04 +0800 Subject: [PATCH 32/94] add kernel control --- src/kernels/lcal/src/lccl.cpp | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git 
a/src/kernels/lcal/src/lccl.cpp b/src/kernels/lcal/src/lccl.cpp index 694bfc74..48abdad3 100644 --- a/src/kernels/lcal/src/lccl.cpp +++ b/src/kernels/lcal/src/lccl.cpp @@ -229,6 +229,8 @@ uint32_t Lccl::GetBlockNum(LcalType cclType, uint32_t rankSize, int64_t dataSize MKI_LOG(ERROR) << "comm is nullptr" << __LINE__; return 0; } + uint32_t limitVal = 0; + aclrtDevResLimitType limitType = aclrtDevResLimitType::ACL_RT_DEV_RES_VECTOR_CORE; uint32_t blockNum = GetKernelBlockNum(cclType, rankSize, dataSize, localRankSize, extraFlag); if (comm_->isEnableMix_) { constexpr uint32_t aivNumPerAic = 2; @@ -236,10 +238,16 @@ uint32_t Lccl::GetBlockNum(LcalType cclType, uint32_t rankSize, int64_t dataSize MKI_LOG(ERROR) << "Lccl not support odd block number at msprof op enabled!"; return 0; } - return blockNum / aivNumPerAic; - } else { - return blockNum; + blockNum = blockNum / aivNumPerAic; + limitType = aclrtDevResLimitType::ACL_RT_DEV_RES_CUBE_CORE; + } + aclrtGetResInCurrentThread(limitType, &limitVal); + if (blockNum > limitVal) { + MKI_LOG(ERROR) << "Insufficient blockDim: Required blockNum(" << blockNum << + ") exceeds limit (limitVal=" << limitVal << ", limitType=" << static_cast(limitType) << ")"; + return 0; } + return blockNum; } int Lccl::LoopBack(const void *sendBuff, void *recvBuff, int64_t count, HcclDataType dataType, aclrtStream stream) const -- Gitee From a8de569ea145ad97604650f8e3dbfda6d7ad06d3 Mon Sep 17 00:00:00 2001 From: He Changcheng Date: Mon, 15 Sep 2025 20:48:35 +0800 Subject: [PATCH 33/94] fix --- src/kernels/lcal/src/lccl.cpp | 51 +++++++++++++++++++++++++++++++---- 1 file changed, 46 insertions(+), 5 deletions(-) diff --git a/src/kernels/lcal/src/lccl.cpp b/src/kernels/lcal/src/lccl.cpp index 48abdad3..7faffd67 100644 --- a/src/kernels/lcal/src/lccl.cpp +++ b/src/kernels/lcal/src/lccl.cpp @@ -13,6 +13,7 @@ #include #include #include +#include #include #include @@ -25,6 +26,42 @@ using namespace chrono; using namespace Mki; namespace Lcal { +using 
PFN_aclrtGetResInCurrentThread = int(*)(aclrtDevResLimitType type, uint32_t *); +static PFN_aclrtGetResInCurrentThread g_aclGetResFunc = nullptr; +static void *g_libHandle = nullptr; +static std::mutex g_initMutex; + +bool InitAclFunctions() +{ + std::lock_guard lock(g_initMutex); + + if (g_libHandle != nullptr) { + return true; + } + + const char *libPath = "libascendcl.so"; + g_libHandle = dlopen(libPath, RTLD_LAZY | RTLD_LOCAL); + if (g_libHandle == nullptr) { + MKI_LOG(ERROR) << "Failed to load " << libPath << ": " << dlerror(); + return false; + } + + dlerror(); + + const char *funcName = "aclrtGetResInCurrentThread"; + g_aclGetResFunc = reinterpret_cast(dlsym(g_libHandle, funcName)); + const char *dlsymError = dlerror(); + if (dlsymError != nullptr) { + MKI_LOG(WARN) << "Failed to load " << funcName << ": " << dlsymError; + dlclose(g_libHandle); + g_libHandle = nullptr; + g_aclGetResFunc = nullptr; + return false; + } + + MKI_LOG(DEBUG) << "Successfully loaded " << libPath << "::" << funcName; + return true; +} uint32_t GetLocalReduceBlockDum(int64_t dataSize) { @@ -241,11 +278,15 @@ uint32_t Lccl::GetBlockNum(LcalType cclType, uint32_t rankSize, int64_t dataSize blockNum = blockNum / aivNumPerAic; limitType = aclrtDevResLimitType::ACL_RT_DEV_RES_CUBE_CORE; } - aclrtGetResInCurrentThread(limitType, &limitVal); - if (blockNum > limitVal) { - MKI_LOG(ERROR) << "Insufficient blockDim: Required blockNum(" << blockNum << - ") exceeds limit (limitVal=" << limitVal << ", limitType=" << static_cast(limitType) << ")"; - return 0; + if (InitAclFunctions() && g_aclGetResFunc != nullptr) { + g_aclGetResFunc(limitType, &limitVal); + MKI_LOG(ERROR) << "Required blockNum(" << blockNum << + ") limit:(limitVal=" << limitVal << ", limitType=" << static_cast(limitType) << ")"; + if (blockNum > limitVal) { + MKI_LOG(ERROR) << "Insufficient blockDim: Required blockNum(" << blockNum << + ") exceeds limit (limitVal=" << limitVal << ", limitType=" << static_cast(limitType) << ")"; + 
return 0; + } } return blockNum; } -- Gitee From 21fa5806b79d33bc5e00c133deaf7fb94933fa49 Mon Sep 17 00:00:00 2001 From: He Changcheng Date: Tue, 16 Sep 2025 11:18:10 +0800 Subject: [PATCH 34/94] fix clean code --- src/kernels/lcal/src/lccl.cpp | 25 ++++++++++++++++++++----- 1 file changed, 20 insertions(+), 5 deletions(-) diff --git a/src/kernels/lcal/src/lccl.cpp b/src/kernels/lcal/src/lccl.cpp index 7faffd67..0f9eb276 100644 --- a/src/kernels/lcal/src/lccl.cpp +++ b/src/kernels/lcal/src/lccl.cpp @@ -26,8 +26,8 @@ using namespace chrono; using namespace Mki; namespace Lcal { -using PFN_aclrtGetResInCurrentThread = int(*)(aclrtDevResLimitType type, uint32_t *); -static PFN_aclrtGetResInCurrentThread g_aclGetResFunc = nullptr; +using LCAL_GET_RES_IN_CUR_THREAD = int(*)(aclrtDevResLimitType type, uint32_t *); +static LCAL_GET_RES_IN_CUR_THREAD g_aclGetResFunc = nullptr; static void *g_libHandle = nullptr; static std::mutex g_initMutex; @@ -45,11 +45,12 @@ bool InitAclFunctions() MKI_LOG(ERROR) << "Failed to load " << libPath << ": " << dlerror(); return false; } - + + // 清理错误信息 dlerror(); const char *funcName = "aclrtGetResInCurrentThread"; - g_aclGetResFunc = reinterpret_cast(dlsym(g_libHandle, funcName)); + g_aclGetResFunc = reinterpret_cast(dlsym(g_libHandle, funcName)); const char *dlsymError = dlerror(); if (dlsymError != nullptr) { MKI_LOG(WARN) << "Failed to load " << funcName << ": " << dlsymError; @@ -63,6 +64,17 @@ bool InitAclFunctions() return true; } +void CleanupAclFunctions() +{ + std::lock_guard lock(g_initMutex); + + if (g_libHandle != nullptr) { + dlclose(g_libHandle); + g_libHandle = nullptr; + } + g_aclGetResFunc = nullptr; +} + uint32_t GetLocalReduceBlockDum(int64_t dataSize) { constexpr int oneDataSize = 190 * 1024; @@ -263,7 +275,7 @@ uint32_t Lccl::GetBlockNum(LcalType cclType, uint32_t rankSize, int64_t dataSize int localRankSize, uint32_t extraFlag) const { if (comm_ == nullptr) { - MKI_LOG(ERROR) << "comm is nullptr" << __LINE__; + 
MKI_LOG(ERROR) << "comm is nullptr " << __LINE__; return 0; } uint32_t limitVal = 0; @@ -530,6 +542,7 @@ Lccl::~Lccl() if (rankSize_ == -1 and comm_ != nullptr) { delete comm_; } + CleanupAclFunctions(); } Lccl::Lccl(LcalComm *comm) : comm_(comm) @@ -545,11 +558,13 @@ Lccl::Lccl(LcalComm *comm) : comm_(comm) } rankSize_ = -1; } + InitAclFunctions(); } Lccl::Lccl(LcalComm &comm) : comm_(&comm) { rank_ = comm.rank_; rankSize_ = comm.rankSize_; + InitAclFunctions(); } } \ No newline at end of file -- Gitee From 085d6f7ed12fa86f6b527d1dbc5fcd7d75185130 Mon Sep 17 00:00:00 2001 From: He Changcheng Date: Tue, 16 Sep 2025 19:09:05 +0800 Subject: [PATCH 35/94] fix --- src/kernels/lcal/include/lcal_comm.h | 1 + src/kernels/lcal/src/lcal_comm.cpp | 67 ++++++++++++++++++++++++++++ src/kernels/lcal/src/lccl.cpp | 52 --------------------- 3 files changed, 68 insertions(+), 52 deletions(-) diff --git a/src/kernels/lcal/include/lcal_comm.h b/src/kernels/lcal/include/lcal_comm.h index 6ec0fbd7..b5d0918c 100644 --- a/src/kernels/lcal/include/lcal_comm.h +++ b/src/kernels/lcal/include/lcal_comm.h @@ -63,6 +63,7 @@ private: int GetName(std::string &name, char names[LCAL_MAX_RANK_SIZE][IPC_NAME_SIZE]) const; int SyncCommArgs(); int InitDumpAddr(); + int CallAclRtGetRes(int type, uint32_t *resource) const; private: int rank_ = 0; // global rank id diff --git a/src/kernels/lcal/src/lcal_comm.cpp b/src/kernels/lcal/src/lcal_comm.cpp index b54380a0..ce1c2251 100644 --- a/src/kernels/lcal/src/lcal_comm.cpp +++ b/src/kernels/lcal/src/lcal_comm.cpp @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include @@ -57,6 +58,69 @@ static map g_localPeerMemMap; static map g_devList; static std::mutex g_mtx; +using LCAL_GET_RES_IN_CUR_THREAD = int(*)(aclrtDevResLimitType type, uint32_t *resource); +static LCAL_GET_RES_IN_CUR_THREAD g_aclGetResFunc = nullptr; +static void *g_libHandle = nullptr; +static std::mutex g_initMutex; + +bool InitAclFunctions() +{ + std::lock_guard 
lock(g_initMutex); + + if (g_libHandle != nullptr) { + return true; + } + + const char *libPath = "libascendcl.so"; + g_libHandle = dlopen(libPath, RTLD_LAZY | RTLD_LOCAL); + if (g_libHandle == nullptr) { + MKI_LOG(ERROR) << "Failed to load " << libPath << ": " << dlerror(); + return false; + } + + // 清理错误信息 + dlerror(); + + const char *funcName = "aclrtGetResInCurrentThread"; + g_aclGetResFunc = reinterpret_cast(dlsym(g_libHandle, funcName)); + const char *dlsymError = dlerror(); + if (dlsymError != nullptr) { + MKI_LOG(WARN) << "Failed to load " << funcName << ": " << dlsymError; + dlclose(g_libHandle); + g_libHandle = nullptr; + g_aclGetResFunc = nullptr; + return false; + } + + MKI_LOG(DEBUG) << "Successfully loaded " << libPath << "::" << funcName; + return true; +} + +void CleanupAclFunctions() +{ + std::lock_guard lock(g_initMutex); + + if (g_libHandle != nullptr) { + dlclose(g_libHandle); + g_libHandle = nullptr; + } + g_aclGetResFunc = nullptr; +} + +int LcalComm::CallAclRtGetRes(int type, uint32_t *resource) const +{ + if (g_aclGetResFunc != nullptr) { + if (type == ACL_RT_DEV_RES_CUBE_CORE || type == ACL_RT_DEV_RES_VECTOR_CORE) { + g_aclGetResFunc(static_cast(type), resource); + return LCAL_SUCCESS; + } else { + MKI_LOG(ERROR) << "aclrtGetResInCurrentThread not support type " << type; + return LCAL_ERROR_INTERNAL; + } + } + return LCAL_ERROR_NOT_FOUND; +} + static const std::unordered_map CHIP_MAP = { {"Ascend310P", ChipName::CHIP_310P3}, {"Ascend910B1", ChipName::CHIP_910B1}, @@ -303,6 +367,7 @@ int LcalComm::Init() if (inited_) { return LCAL_SUCCESS; } + InitAclFunctions(); if (rank_ < 0 || rank_ >= rankSize_ || rankSize_ <= 0 || rankSize_ > LCAL_MAX_RANK_SIZE) { MKI_LOG(ERROR) << "The rank is invalid! 
rank:" << rank_ << " rankSize:" << rankSize_; return LCAL_ERROR_PARA_CHECK_FAIL; @@ -351,6 +416,7 @@ int LcalComm::InitThread(const std::string &uid) if (inited_) { return LCAL_SUCCESS; } + InitAclFunctions(); if (rank_ < 0 || rank_ >= rankSize_ || rankSize_ <= 0 || rankSize_ > LCAL_MAX_RANK_SIZE) { MKI_LOG(ERROR) << "The rank is invalid! rank:" << rank_ << "rankSize:" << rankSize_; return LCAL_ERROR_PARA_CHECK_FAIL; @@ -723,6 +789,7 @@ LcalComm::~LcalComm() FreePeerMem(commArgs_.dumpAddr); FreePeerMem(peerMem_[rank_]); FreePeerMem(commArgsPtr_); + CleanupAclFunctions(); } LcalComm::LcalComm(int rank, int rankSize) : rank_(rank), rankSize_(rankSize) diff --git a/src/kernels/lcal/src/lccl.cpp b/src/kernels/lcal/src/lccl.cpp index 0f9eb276..93a090c9 100644 --- a/src/kernels/lcal/src/lccl.cpp +++ b/src/kernels/lcal/src/lccl.cpp @@ -26,55 +26,6 @@ using namespace chrono; using namespace Mki; namespace Lcal { -using LCAL_GET_RES_IN_CUR_THREAD = int(*)(aclrtDevResLimitType type, uint32_t *); -static LCAL_GET_RES_IN_CUR_THREAD g_aclGetResFunc = nullptr; -static void *g_libHandle = nullptr; -static std::mutex g_initMutex; - -bool InitAclFunctions() -{ - std::lock_guard lock(g_initMutex); - - if (g_libHandle != nullptr) { - return true; - } - - const char *libPath = "libascendcl.so"; - g_libHandle = dlopen(libPath, RTLD_LAZY | RTLD_LOCAL); - if (g_libHandle == nullptr) { - MKI_LOG(ERROR) << "Failed to load " << libPath << ": " << dlerror(); - return false; - } - - // 清理错误信息 - dlerror(); - - const char *funcName = "aclrtGetResInCurrentThread"; - g_aclGetResFunc = reinterpret_cast(dlsym(g_libHandle, funcName)); - const char *dlsymError = dlerror(); - if (dlsymError != nullptr) { - MKI_LOG(WARN) << "Failed to load " << funcName << ": " << dlsymError; - dlclose(g_libHandle); - g_libHandle = nullptr; - g_aclGetResFunc = nullptr; - return false; - } - - MKI_LOG(DEBUG) << "Successfully loaded " << libPath << "::" << funcName; - return true; -} - -void CleanupAclFunctions() -{ - 
std::lock_guard lock(g_initMutex); - - if (g_libHandle != nullptr) { - dlclose(g_libHandle); - g_libHandle = nullptr; - } - g_aclGetResFunc = nullptr; -} - uint32_t GetLocalReduceBlockDum(int64_t dataSize) { constexpr int oneDataSize = 190 * 1024; @@ -542,7 +493,6 @@ Lccl::~Lccl() if (rankSize_ == -1 and comm_ != nullptr) { delete comm_; } - CleanupAclFunctions(); } Lccl::Lccl(LcalComm *comm) : comm_(comm) @@ -558,13 +508,11 @@ Lccl::Lccl(LcalComm *comm) : comm_(comm) } rankSize_ = -1; } - InitAclFunctions(); } Lccl::Lccl(LcalComm &comm) : comm_(&comm) { rank_ = comm.rank_; rankSize_ = comm.rankSize_; - InitAclFunctions(); } } \ No newline at end of file -- Gitee From 243a96120979278367a73986ed6f989e5ee20810 Mon Sep 17 00:00:00 2001 From: He Changcheng Date: Tue, 16 Sep 2025 19:13:51 +0800 Subject: [PATCH 36/94] fix --- src/kernels/lcal/src/lccl.cpp | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/kernels/lcal/src/lccl.cpp b/src/kernels/lcal/src/lccl.cpp index 93a090c9..5c6d9235 100644 --- a/src/kernels/lcal/src/lccl.cpp +++ b/src/kernels/lcal/src/lccl.cpp @@ -13,7 +13,6 @@ #include #include #include -#include #include #include @@ -26,6 +25,7 @@ using namespace chrono; using namespace Mki; namespace Lcal { + uint32_t GetLocalReduceBlockDum(int64_t dataSize) { constexpr int oneDataSize = 190 * 1024; @@ -241,9 +241,10 @@ uint32_t Lccl::GetBlockNum(LcalType cclType, uint32_t rankSize, int64_t dataSize blockNum = blockNum / aivNumPerAic; limitType = aclrtDevResLimitType::ACL_RT_DEV_RES_CUBE_CORE; } - if (InitAclFunctions() && g_aclGetResFunc != nullptr) { - g_aclGetResFunc(limitType, &limitVal); - MKI_LOG(ERROR) << "Required blockNum(" << blockNum << + + int res = comm_->CallAclRtGetRes(static_cast(limitType), &limitVal); + if (res == LCAL_SUCCESS) { + MKI_LOG(DEBUG) << "Required blockNum(" << blockNum << ") limit:(limitVal=" << limitVal << ", limitType=" << static_cast(limitType) << ")"; if (blockNum > limitVal) { MKI_LOG(ERROR) << 
"Insufficient blockDim: Required blockNum(" << blockNum << -- Gitee From fb83038cd01803f060644114cc057aea5884a052 Mon Sep 17 00:00:00 2001 From: He Changcheng Date: Tue, 16 Sep 2025 19:21:49 +0800 Subject: [PATCH 37/94] fix --- src/kernels/lcal/src/lccl.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/kernels/lcal/src/lccl.cpp b/src/kernels/lcal/src/lccl.cpp index 5c6d9235..66eb8d8e 100644 --- a/src/kernels/lcal/src/lccl.cpp +++ b/src/kernels/lcal/src/lccl.cpp @@ -241,7 +241,7 @@ uint32_t Lccl::GetBlockNum(LcalType cclType, uint32_t rankSize, int64_t dataSize blockNum = blockNum / aivNumPerAic; limitType = aclrtDevResLimitType::ACL_RT_DEV_RES_CUBE_CORE; } - + int res = comm_->CallAclRtGetRes(static_cast(limitType), &limitVal); if (res == LCAL_SUCCESS) { MKI_LOG(DEBUG) << "Required blockNum(" << blockNum << -- Gitee From b7e850f52cde34b51f35629264407dc94309eec6 Mon Sep 17 00:00:00 2001 From: He Changcheng Date: Thu, 18 Sep 2025 16:32:06 +0800 Subject: [PATCH 38/94] include order --- src/kernels/lcal/src/lcal_comm.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/kernels/lcal/src/lcal_comm.cpp b/src/kernels/lcal/src/lcal_comm.cpp index ce1c2251..57a3a15f 100644 --- a/src/kernels/lcal/src/lcal_comm.cpp +++ b/src/kernels/lcal/src/lcal_comm.cpp @@ -14,13 +14,13 @@ #include #include #include -#include #include #include #include #include #include +#include #include #include "mki/utils/log/log.h" #include "mki/utils/env/env.h" -- Gitee From ce08766da0f35debafe9fb59ffff9cc9c79eca38 Mon Sep 17 00:00:00 2001 From: huangxiaolan Date: Mon, 22 Sep 2025 16:43:12 +0800 Subject: [PATCH 39/94] recover changes --- include/atb/atb_acl.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/atb/atb_acl.h b/include/atb/atb_acl.h index 377d37d7..bf62b985 100644 --- a/include/atb/atb_acl.h +++ b/include/atb/atb_acl.h @@ -101,7 +101,7 @@ atb::Status AtbMLAGetWorkspaceSize(const aclTensor *qNope, const aclTensor *qRop 
const aclTensor *mask, const aclTensor *qSeqLen, const aclTensor *qkDescale, const aclTensor *pvDescale, int32_t headNum, float qkScale, int32_t kvHeadNum, int maskType, int calcType, uint8_t cacheMode, aclTensor *attenOut, aclTensor *lse, - uint64_t &workspaceSize, atb::Operation **op, atb::Context &context); + uint64_t &workspaceSize, atb::Operation **op, atb::Context *context); //! //! \brief 关于MLA算子使用aclnn风格调用的2段式接口的第2段, -- Gitee From d3a634074c2c9c7437f196034b41eabf91988393 Mon Sep 17 00:00:00 2001 From: Vector Date: Mon, 22 Sep 2025 17:06:21 +0800 Subject: [PATCH 40/94] update --- example/op_demo/linear/README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/example/op_demo/linear/README.md b/example/op_demo/linear/README.md index f7c45a0d..e1547d92 100644 --- a/example/op_demo/linear/README.md +++ b/example/op_demo/linear/README.md @@ -185,6 +185,7 @@ - linear_dequant_ds_demo.cpp + 该demo支持Atlas A2/A3系列和Atlas 推理系列产品上运行。 **参数设置**: | 成员名称 | 取值 | @@ -204,4 +205,4 @@ | `weight` | int8 | nd | [7168, 16384] | npu | | `bias` | int32 | nd | [1, 7168] | npu | | `deqScale` | int64 | nd | [1, 7168] | npu | - | `output` | fp16 | nd | [32, 7168] | npu | + | `output` | float16 | nd | [32, 7168] | npu | -- Gitee From 09751e58329573bfa0868b8803869787509a57a3 Mon Sep 17 00:00:00 2001 From: He Changcheng Date: Mon, 22 Sep 2025 17:21:56 +0800 Subject: [PATCH 41/94] fix --- src/kernels/lcal/include/lcal_comm.h | 3 +- src/kernels/lcal/src/lcal_comm.cpp | 73 ++-------------------------- src/kernels/lcal/src/lccl.cpp | 40 +++++++++++++++ 3 files changed, 44 insertions(+), 72 deletions(-) diff --git a/src/kernels/lcal/include/lcal_comm.h b/src/kernels/lcal/include/lcal_comm.h index b5d0918c..bff77eea 100644 --- a/src/kernels/lcal/include/lcal_comm.h +++ b/src/kernels/lcal/include/lcal_comm.h @@ -63,8 +63,7 @@ private: int GetName(std::string &name, char names[LCAL_MAX_RANK_SIZE][IPC_NAME_SIZE]) const; int SyncCommArgs(); int InitDumpAddr(); - int 
CallAclRtGetRes(int type, uint32_t *resource) const; - + private: int rank_ = 0; // global rank id int rankSize_ = 0; // global rank size diff --git a/src/kernels/lcal/src/lcal_comm.cpp b/src/kernels/lcal/src/lcal_comm.cpp index 57a3a15f..8b77500a 100644 --- a/src/kernels/lcal/src/lcal_comm.cpp +++ b/src/kernels/lcal/src/lcal_comm.cpp @@ -20,7 +20,6 @@ #include #include -#include #include #include "mki/utils/log/log.h" #include "mki/utils/env/env.h" @@ -58,69 +57,6 @@ static map g_localPeerMemMap; static map g_devList; static std::mutex g_mtx; -using LCAL_GET_RES_IN_CUR_THREAD = int(*)(aclrtDevResLimitType type, uint32_t *resource); -static LCAL_GET_RES_IN_CUR_THREAD g_aclGetResFunc = nullptr; -static void *g_libHandle = nullptr; -static std::mutex g_initMutex; - -bool InitAclFunctions() -{ - std::lock_guard lock(g_initMutex); - - if (g_libHandle != nullptr) { - return true; - } - - const char *libPath = "libascendcl.so"; - g_libHandle = dlopen(libPath, RTLD_LAZY | RTLD_LOCAL); - if (g_libHandle == nullptr) { - MKI_LOG(ERROR) << "Failed to load " << libPath << ": " << dlerror(); - return false; - } - - // 清理错误信息 - dlerror(); - - const char *funcName = "aclrtGetResInCurrentThread"; - g_aclGetResFunc = reinterpret_cast(dlsym(g_libHandle, funcName)); - const char *dlsymError = dlerror(); - if (dlsymError != nullptr) { - MKI_LOG(WARN) << "Failed to load " << funcName << ": " << dlsymError; - dlclose(g_libHandle); - g_libHandle = nullptr; - g_aclGetResFunc = nullptr; - return false; - } - - MKI_LOG(DEBUG) << "Successfully loaded " << libPath << "::" << funcName; - return true; -} - -void CleanupAclFunctions() -{ - std::lock_guard lock(g_initMutex); - - if (g_libHandle != nullptr) { - dlclose(g_libHandle); - g_libHandle = nullptr; - } - g_aclGetResFunc = nullptr; -} - -int LcalComm::CallAclRtGetRes(int type, uint32_t *resource) const -{ - if (g_aclGetResFunc != nullptr) { - if (type == ACL_RT_DEV_RES_CUBE_CORE || type == ACL_RT_DEV_RES_VECTOR_CORE) { - 
g_aclGetResFunc(static_cast(type), resource); - return LCAL_SUCCESS; - } else { - MKI_LOG(ERROR) << "aclrtGetResInCurrentThread not support type " << type; - return LCAL_ERROR_INTERNAL; - } - } - return LCAL_ERROR_NOT_FOUND; -} - static const std::unordered_map CHIP_MAP = { {"Ascend310P", ChipName::CHIP_310P3}, {"Ascend910B1", ChipName::CHIP_910B1}, @@ -367,8 +303,7 @@ int LcalComm::Init() if (inited_) { return LCAL_SUCCESS; } - InitAclFunctions(); - if (rank_ < 0 || rank_ >= rankSize_ || rankSize_ <= 0 || rankSize_ > LCAL_MAX_RANK_SIZE) { + if (rank_ < 0 || rank_ >= rankSize_ || rankSize_ <= 0 || rankSize_ > LCAL_MAX_RANK_SIZE) { MKI_LOG(ERROR) << "The rank is invalid! rank:" << rank_ << " rankSize:" << rankSize_; return LCAL_ERROR_PARA_CHECK_FAIL; } @@ -416,8 +351,7 @@ int LcalComm::InitThread(const std::string &uid) if (inited_) { return LCAL_SUCCESS; } - InitAclFunctions(); - if (rank_ < 0 || rank_ >= rankSize_ || rankSize_ <= 0 || rankSize_ > LCAL_MAX_RANK_SIZE) { + if (rank_ < 0 || rank_ >= rankSize_ || rankSize_ <= 0 || rankSize_ > LCAL_MAX_RANK_SIZE) { MKI_LOG(ERROR) << "The rank is invalid! 
rank:" << rank_ << "rankSize:" << rankSize_; return LCAL_ERROR_PARA_CHECK_FAIL; } @@ -789,8 +723,7 @@ LcalComm::~LcalComm() FreePeerMem(commArgs_.dumpAddr); FreePeerMem(peerMem_[rank_]); FreePeerMem(commArgsPtr_); - CleanupAclFunctions(); -} + } LcalComm::LcalComm(int rank, int rankSize) : rank_(rank), rankSize_(rankSize) { diff --git a/src/kernels/lcal/src/lccl.cpp b/src/kernels/lcal/src/lccl.cpp index 66eb8d8e..62cf73d7 100644 --- a/src/kernels/lcal/src/lccl.cpp +++ b/src/kernels/lcal/src/lccl.cpp @@ -17,6 +17,7 @@ #include #include +#include #include "profiling/report_timing.h" @@ -26,6 +27,45 @@ using namespace Mki; namespace Lcal { +using AclrtGetResInCurrentThreadFunc = int(*)(int, uint32_t*); + +int GetAclResInCurThread(int type, uint32_t *resource) +{ + // 静态变量:保存函数指针和库句柄 + static std::unique_ptr mkiDl; + static AclrtGetResInCurrentThreadFunc aclrtGetResInCurrentThread = nullptr; + static std::mutex localMutex; // 线程安全锁 + + std::lock_guard lock(localMutex); // 加锁 + + // 首次调用时初始化 + if (!mkiDl) { + std::string libPath = std::string(Mki::GetEnv("ASCEND_HOME_PATH")) + "/runtime/lib64/libascendcl.so"; + mkiDl = std::make_unique(libPath, false); + if (!mkiDl->IsValid()) { // 检查库是否加载成功 + MKI_LOG(WARN) << "Failed to load libascendcl.so!"; + return LCAL_ERROR_NOT_FOUND; + } + aclrtGetResInCurrentThread = + (AclrtGetResInCurrentThreadFunc)mkiDl->GetSymbol("aclrtGetResInCurrentThread"); + if (aclrtGetResInCurrentThread == nullptr) { + MKI_LOG(WARN) << "Failed to get acl function!"; + return LCAL_ERROR_NOT_FOUND; + } + MKI_LOG(DEBUG) << "Successfully loaded libascendcl.so and resolved aclrtGetResInCurrentThread"; + } + + // 调用函数 + int getResRet = aclrtGetResInCurrentThread(type, resource); + if (getResRet != ACL_SUCCESS) { + MKI_LOG(ERROR) << "Failed to get resource in current thread for type:" << type << " err:" << getResRet; + return LCAL_ERROR_INTERNAL; + } else { + MKI_LOG(DEBUG) << "Get resource in current thread for type:" << type << " resource:" << *resource; + 
return LCAL_SUCCESS; + } +} + uint32_t GetLocalReduceBlockDum(int64_t dataSize) { constexpr int oneDataSize = 190 * 1024; -- Gitee From b1689b3c53f9c0ec9cb61abfc5f535e3d2ed114c Mon Sep 17 00:00:00 2001 From: huangxiaolan Date: Mon, 22 Sep 2025 17:28:27 +0800 Subject: [PATCH 42/94] fix point --- src/atb/operation/if_operation.cpp | 2 +- src/atb/operation/if_operation.h | 2 +- src/atb/operation/operation_base.cpp | 2 +- src/atb/runner/ops_runner.cpp | 2 +- src/atb/runner/plugin_runner.cpp | 2 +- src/cinterface/atb_acl_fused_add_topk_div.cpp | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/atb/operation/if_operation.cpp b/src/atb/operation/if_operation.cpp index a9c8f406..7a70570e 100644 --- a/src/atb/operation/if_operation.cpp +++ b/src/atb/operation/if_operation.cpp @@ -104,7 +104,7 @@ Status IfOperation::Setup(const VariantPack &variantPack, uint64_t &workspaceSiz } Status IfOperation::Execute(const VariantPack &variantPack, uint8_t *workspace, uint64_t workspaceSize, - Context *context) + Context &context) { ATB_LOG(INFO) << GetLogPrefix() << "Calling Execute..."; return opSelected_->Execute(variantPack, workspace, workspaceSize, context); diff --git a/src/atb/operation/if_operation.h b/src/atb/operation/if_operation.h index eab4a45f..8f5dfbe7 100644 --- a/src/atb/operation/if_operation.h +++ b/src/atb/operation/if_operation.h @@ -24,7 +24,7 @@ public: std::string GetName() const override; Status Setup(const VariantPack &variantPack, uint64_t &workspaceSize, Context *context) override; Status Execute(const VariantPack &variantPack, uint8_t *workspace, uint64_t workspaceSize, - Context *context) override; + Context &context) override; uint32_t GetInputNum() const override; uint32_t GetOutputNum() const override; void SetExecuteStreamId(uint32_t streamId) override; diff --git a/src/atb/operation/operation_base.cpp b/src/atb/operation/operation_base.cpp index b3afa56d..526b1be6 100644 --- a/src/atb/operation/operation_base.cpp +++ 
b/src/atb/operation/operation_base.cpp @@ -1083,7 +1083,7 @@ Status OperationBase::Execute(const VariantPack &variantPack, uint8_t *workspace } Status st = NO_ERROR; if (executeType == EXECUTE_NORMAL || executeType == EXECUTE_PRELAUNCH) { - st = PreLaunch(variantPack, workspace, workspaceSize, context); + st = PreLaunch(variantPack, workspace, workspaceSize, &context); if (st != NO_ERROR) { ATB_LOG(ERROR) << GetLogPrefix() << "PreLaunch fail, error code: " << st; return st; diff --git a/src/atb/runner/ops_runner.cpp b/src/atb/runner/ops_runner.cpp index 28347ad8..4c4fcbd4 100644 --- a/src/atb/runner/ops_runner.cpp +++ b/src/atb/runner/ops_runner.cpp @@ -263,7 +263,7 @@ Status OpsRunner::FillHostTilingBufferImpl(uint8_t *hostTilingBuffer, uint64_t t } uint8_t *kernelHostTilingBuffer = hostTilingBuffer + offset; - Status ret = FillSingleKernelHostTilingBuffer(node, nodeId, kernelHostTilingBuffer, tilingSize, context); + Status ret = FillSingleKernelHostTilingBuffer(node, nodeId, kernelHostTilingBuffer, tilingSize, *context); if (ret != NO_ERROR) { ATB_LOG(ERROR) << GetLogPrefix() << " node[" << nodeId << "] fill tiling buffer fail, error code:" << ret; return ret; diff --git a/src/atb/runner/plugin_runner.cpp b/src/atb/runner/plugin_runner.cpp index aeaa20c6..fb26b4f6 100644 --- a/src/atb/runner/plugin_runner.cpp +++ b/src/atb/runner/plugin_runner.cpp @@ -37,7 +37,7 @@ Status PluginRunner::ExecuteImpl(RunnerVariantPack &runnerVariantPack) variantPack_.inTensors = runnerVariantPack.inTensors; variantPack_.outTensors = runnerVariantPack.outTensors; return operation_->Execute(variantPack_, runnerVariantPack.workspaceBuffer, - runnerVariantPack.workspaceBufferSize, runnerVariantPack.context); + runnerVariantPack.workspaceBufferSize, *runnerVariantPack.context); } return ERROR_INVALID_PARAM; diff --git a/src/cinterface/atb_acl_fused_add_topk_div.cpp b/src/cinterface/atb_acl_fused_add_topk_div.cpp index fbbfca90..7d54559a 100644 --- 
a/src/cinterface/atb_acl_fused_add_topk_div.cpp +++ b/src/cinterface/atb_acl_fused_add_topk_div.cpp @@ -79,7 +79,7 @@ atb::Status AtbFusedAddTopkDiv(void *workspace, uint64_t workspaceSize, atb::Ope ATB_CHECK(op != nullptr, "AtbFusedAddTopkDiv expect op pointer not to be null!", return atb::ERROR_INVALID_OPERATION_ADDR); atb::VariantPack pack; - atb::Status st = op->Execute(pack, (uint8_t *)(workspace), workspaceSize, context); + atb::Status st = op->Execute(pack, (uint8_t *)(workspace), workspaceSize, *context); ATB_CHECK(st == atb::NO_ERROR, "AtbFusedAddTopkDiv Execute failed!", return st); return st; } -- Gitee From 6d92e12ed867ef22be1b3652a4d0d1c0b116baa3 Mon Sep 17 00:00:00 2001 From: huangxiaolan Date: Mon, 22 Sep 2025 17:42:43 +0800 Subject: [PATCH 43/94] recover extern changes --- comm/lcal/src/tools/socket/lcal_sock_exchange.cpp | 8 ++++---- include/atb/atb_acl.h | 14 +++++++------- include/atb/operation.h | 2 +- src/cinterface/atb_acl_fused_add_topk_div.cpp | 6 +++--- src/cinterface/atb_acl_mla.cpp | 8 ++++---- src/cinterface/atb_acl_mla_preprocess.cpp | 4 ++-- src/cinterface/atb_acl_paged_cache_load.cpp | 4 ++-- src/cinterface/atb_acl_ring_mla.cpp | 4 ++-- .../atb_acl_self_attention_prefix_encoder.cpp | 4 ++-- src/cinterface/atb_acl_util.cpp | 8 ++++---- 10 files changed, 31 insertions(+), 31 deletions(-) diff --git a/comm/lcal/src/tools/socket/lcal_sock_exchange.cpp b/comm/lcal/src/tools/socket/lcal_sock_exchange.cpp index ff5dec47..552fde6b 100644 --- a/comm/lcal/src/tools/socket/lcal_sock_exchange.cpp +++ b/comm/lcal/src/tools/socket/lcal_sock_exchange.cpp @@ -335,7 +335,7 @@ void LcalSockExchange::Cleanup() } } -int GetAddrFromString(LcalSocketAddress& ua, const char* ipPortPair) +int GetAddrFromString(LcalSocketAddress* ua, const char* ipPortPair) { std::string ip; uint16_t port; @@ -344,9 +344,9 @@ int GetAddrFromString(LcalSocketAddress& ua, const char* ipPortPair) MKI_LOG(ERROR) << "lcal ParseIpAndPort failed!"; return LCAL_ERROR_INTERNAL; } - 
ua.sin.sin_family = AF_INET; - ua.sin.sin_addr.s_addr = inet_addr(ip.c_str()); - ua.sin.sin_port = htons(port); + ua->sin.sin_family = AF_INET; + ua->sin.sin_addr.s_addr = inet_addr(ip.c_str()); + ua->sin.sin_port = htons(port); return LCAL_SUCCESS; } diff --git a/include/atb/atb_acl.h b/include/atb/atb_acl.h index bf62b985..dfd8f0d4 100644 --- a/include/atb/atb_acl.h +++ b/include/atb/atb_acl.h @@ -55,7 +55,7 @@ atb::Status AtbFusedAddTopkDivGetWorkspaceSize(const aclTensor *x, const aclTens const aclTensor *mappingTable, uint32_t groupNum, uint32_t groupTopk, uint32_t n, uint32_t k, int activationType, bool isNorm, float scale, bool enableExpertMapping, aclTensor *y, aclTensor *indices, - uint64_t &workspaceSize, atb::Operation **op, atb::Context *context); + uint64_t *workspaceSize, atb::Operation **op, atb::Context *context); //! //! \brief 关于FusedAddTopkDiv算子使用aclnn风格调用的2段式接口的第2段, @@ -101,7 +101,7 @@ atb::Status AtbMLAGetWorkspaceSize(const aclTensor *qNope, const aclTensor *qRop const aclTensor *mask, const aclTensor *qSeqLen, const aclTensor *qkDescale, const aclTensor *pvDescale, int32_t headNum, float qkScale, int32_t kvHeadNum, int maskType, int calcType, uint8_t cacheMode, aclTensor *attenOut, aclTensor *lse, - uint64_t &workspaceSize, atb::Operation **op, atb::Context *context); + uint64_t *workspaceSize, atb::Operation **op, atb::Context *context); //! //! \brief 关于MLA算子使用aclnn风格调用的2段式接口的第2段, @@ -142,7 +142,7 @@ atb::Status AtbMLAPreFillGetWorkspaceSize(const aclTensor *q, const aclTensor *q const aclTensor *kRope, const aclTensor *v, const aclTensor *qSeqLen, const aclTensor *kvSeqLen, const aclTensor *mask, int32_t headNum, float qkScale, int32_t kvHeadNum, int maskType, uint8_t cacheMode, aclTensor *attenOut, - uint64_t &workspaceSize, atb::Operation **op, atb::Context *context); + uint64_t *workspaceSize, atb::Operation **op, atb::Context *context); //! //! 
\brief MLA prefill 处理接口 @@ -214,7 +214,7 @@ atb::Status AtbMLAPreprocessGetWorkspaceSize( const aclTensor *kvCacheRope, const aclTensor *slotmapping, const aclTensor *ctkvScale, const aclTensor *qNopeScale, uint32_t wdqDim, uint32_t qRopeDim, uint32_t kRopeDim, float epsilon, uint32_t qRotaryCoeff, uint32_t kRotaryCoeff, bool transposeWdq, bool transposeWuq, bool transposeWuk, uint8_t cacheMode, uint16_t quantMode, aclTensor *qOut0, - aclTensor *kvCacheOut0, aclTensor *qOut1, aclTensor *kvCacheOut1, uint64_t &workspaceSize, atb::Operation **op, + aclTensor *kvCacheOut0, aclTensor *qOut1, aclTensor *kvCacheOut1, uint64_t *workspaceSize, atb::Operation **op, atb::Context *context); //! @@ -252,7 +252,7 @@ atb::Status AtbPagedCacheLoadGetWorkspaceSize(const aclTensor *keyCache, const a const aclTensor *blockTables, const aclTensor *contextLens, const aclTensor *key, const aclTensor *value, const aclTensor *seqStarts, int8_t kvCacheCfg, bool isSeqLensCumsumType, bool hasSeqStarts, - uint64_t &workspaceSize, atb::Operation **op, atb::Context *context); + uint64_t *workspaceSize, atb::Operation **op, atb::Context *context); //! //! \brief 关于PagedCacheLoad算子使用aclnn风格调用的2段式接口的第2段, @@ -300,7 +300,7 @@ atb::Status AtbRingMLAGetWorkspaceSize(const aclTensor *querySplit1, const aclTe const aclTensor *mask, const aclTensor *seqLen, const aclTensor *prevOut, const aclTensor *prevLse, int32_t headNum, int32_t kvHeadNum, float qkScale, int kernelType, int maskType, int inputLayout, int calcType, aclTensor *output, - aclTensor *softmaxLse, uint64_t &workspaceSize, atb::Operation **op, + aclTensor *softmaxLse, uint64_t *workspaceSize, atb::Operation **op, atb::Context *context); //! 
@@ -344,7 +344,7 @@ atb::Status AtbSelfAttentionPrefixEncoderGetWorkspaceSize(const aclTensor *query const aclTensor *mask, const aclTensor *seqLen, const aclTensor *kvSeqLen, const aclTensor *slopes, int maskType, int32_t headNum, int32_t kvHeadNum, - float qkScale, aclTensor *attnOut, uint64_t &workspaceSize, + float qkScale, aclTensor *attnOut, uint64_t *workspaceSize, atb::Operation **op, atb::Context *context); //! diff --git a/include/atb/operation.h b/include/atb/operation.h index c1b48e1d..f3aeec56 100644 --- a/include/atb/operation.h +++ b/include/atb/operation.h @@ -95,7 +95,7 @@ public: //! \return 状态值,如果成功,返回NO_ERROR //! virtual Status Execute(const VariantPack &variantPack, uint8_t *workspace, uint64_t workspaceSize, - Context &context) = 0; + Context *context) = 0; }; //! diff --git a/src/cinterface/atb_acl_fused_add_topk_div.cpp b/src/cinterface/atb_acl_fused_add_topk_div.cpp index 7d54559a..65f84243 100644 --- a/src/cinterface/atb_acl_fused_add_topk_div.cpp +++ b/src/cinterface/atb_acl_fused_add_topk_div.cpp @@ -21,7 +21,7 @@ atb::Status AtbFusedAddTopkDivGetWorkspaceSize(const aclTensor *x, const aclTens const aclTensor *mappingTable, uint32_t groupNum, uint32_t groupTopk, uint32_t n, uint32_t k, int activationType, bool isNorm, float scale, bool enableExpertMapping, aclTensor *y, aclTensor *indices, - uint64_t &workspaceSize, atb::Operation **op, atb::Context *context) + uint64_t *workspaceSize, atb::Operation **op, atb::Context *context) { atb::infer::FusedAddTopkDivParam param; param.groupNum = groupNum; @@ -69,7 +69,7 @@ atb::Status AtbFusedAddTopkDivGetWorkspaceSize(const aclTensor *x, const aclTens ATB_LOG(ERROR) << "AtbFusedAddTopkDivGetWorkspaceSize opeartion pointer is nullptr!"; return atb::ERROR_INVALID_OPERATION_ADDR; } - status = (*op)->Setup(pack, workspaceSize, context); + status = (*op)->Setup(pack, *workspaceSize, context); ATB_CHECK(status == atb::NO_ERROR, "AtbFusedAddTopkDiv Setup failed!", return status); return atb::NO_ERROR; 
} @@ -79,7 +79,7 @@ atb::Status AtbFusedAddTopkDiv(void *workspace, uint64_t workspaceSize, atb::Ope ATB_CHECK(op != nullptr, "AtbFusedAddTopkDiv expect op pointer not to be null!", return atb::ERROR_INVALID_OPERATION_ADDR); atb::VariantPack pack; - atb::Status st = op->Execute(pack, (uint8_t *)(workspace), workspaceSize, *context); + atb::Status st = op->Execute(pack, (uint8_t *)(workspace), workspaceSize, context); ATB_CHECK(st == atb::NO_ERROR, "AtbFusedAddTopkDiv Execute failed!", return st); return st; } diff --git a/src/cinterface/atb_acl_mla.cpp b/src/cinterface/atb_acl_mla.cpp index 1b115833..c8fa9695 100644 --- a/src/cinterface/atb_acl_mla.cpp +++ b/src/cinterface/atb_acl_mla.cpp @@ -27,7 +27,7 @@ atb::Status AtbMLAGetWorkspaceSize(const aclTensor *qNope, const aclTensor *qRop const aclTensor *mask, const aclTensor *qSeqLen, const aclTensor *qkDescale, const aclTensor *pvDescale, int32_t headNum, float qkScale, int32_t kvHeadNum, int maskType, int calcType, uint8_t cacheMode, aclTensor *attenOut, aclTensor *lse, - uint64_t &workspaceSize, atb::Operation **op, atb::Context *context) + uint64_t *workspaceSize, atb::Operation **op, atb::Context *context) { atb::infer::MultiLatentAttentionParam param; param.headNum = headNum; @@ -109,7 +109,7 @@ atb::Status AtbMLAGetWorkspaceSize(const aclTensor *qNope, const aclTensor *qRop ATB_LOG(ERROR) << "AtbMLAGetWorkspaceSize opeartion pointer is nullptr!"; return atb::ERROR_INVALID_OPERATION_ADDR; } - atb::Status st = (*op)->Setup(pack, workspaceSize, context); + atb::Status st = (*op)->Setup(pack, *workspaceSize, context); ATB_CHECK(st == atb::NO_ERROR, "AtbMLA Setup failed!", return st); return atb::NO_ERROR; } @@ -129,7 +129,7 @@ atb::Status AtbMLAPreFillGetWorkspaceSize(const aclTensor *q, const aclTensor *q const aclTensor *kRope, const aclTensor *v, const aclTensor *qSeqLen, const aclTensor *kvSeqLen, const aclTensor *mask, int32_t headNum, float qkScale, int32_t kvHeadNum, int maskType, uint8_t cacheMode, - 
aclTensor *attenOut, uint64_t &workspaceSize, atb::Operation **op, + aclTensor *attenOut, uint64_t *workspaceSize, atb::Operation **op, atb::Context *context) { atb::infer::MultiLatentAttentionParam param; @@ -183,7 +183,7 @@ atb::Status AtbMLAPreFillGetWorkspaceSize(const aclTensor *q, const aclTensor *q ATB_LOG(ERROR) << "AtbMLAPreFillGetWorkspaceSize opeartion pointer is nullptr!"; return atb::ERROR_INVALID_OPERATION_ADDR; } - atb::Status st = (*op)->Setup(pack, workspaceSize, context); + atb::Status st = (*op)->Setup(pack, *workspaceSize, context); ATB_CHECK(st == atb::NO_ERROR, "AtbMLAPreFill Setup failed!", return st); return atb::NO_ERROR; } diff --git a/src/cinterface/atb_acl_mla_preprocess.cpp b/src/cinterface/atb_acl_mla_preprocess.cpp index be40ee8a..32bd22c6 100644 --- a/src/cinterface/atb_acl_mla_preprocess.cpp +++ b/src/cinterface/atb_acl_mla_preprocess.cpp @@ -28,7 +28,7 @@ atb::Status AtbMLAPreprocessGetWorkspaceSize( const aclTensor *kvCacheRope, const aclTensor *slotmapping, const aclTensor *ctkvScale, const aclTensor *qNopeScale, uint32_t wdqDim, uint32_t qRopeDim, uint32_t kRopeDim, float epsilon, uint32_t qRotaryCoeff, uint32_t kRotaryCoeff, bool transposeWdq, bool transposeWuq, bool transposeWuk, uint8_t cacheMode, uint16_t quantMode, aclTensor *qOut0, - aclTensor *kvCacheOut0, aclTensor *qOut1, aclTensor *kvCacheOut1, uint64_t &workspaceSize, atb::Operation **op, + aclTensor *kvCacheOut0, aclTensor *qOut1, aclTensor *kvCacheOut1, uint64_t *workspaceSize, atb::Operation **op, atb::Context *context) { atb::infer::MlaPreprocessParam param; @@ -159,7 +159,7 @@ atb::Status AtbMLAPreprocessGetWorkspaceSize( ATB_LOG(ERROR) << "AtbMLAPreprocessGetWorkspaceSize opeartion pointer is nullptr!"; return atb::ERROR_INVALID_OPERATION_ADDR; } - atb::Status st = (*op)->Setup(pack, workspaceSize, context); + atb::Status st = (*op)->Setup(pack, *workspaceSize, context); ATB_CHECK(st == atb::NO_ERROR, "AtbMLAPreprocess Setup failed!", return st); return 
atb::NO_ERROR; } diff --git a/src/cinterface/atb_acl_paged_cache_load.cpp b/src/cinterface/atb_acl_paged_cache_load.cpp index 834c8f4d..df6d86d5 100644 --- a/src/cinterface/atb_acl_paged_cache_load.cpp +++ b/src/cinterface/atb_acl_paged_cache_load.cpp @@ -22,7 +22,7 @@ atb::Status AtbPagedCacheLoadGetWorkspaceSize(const aclTensor *keyCache, const a const aclTensor *blockTables, const aclTensor *contextLens, const aclTensor *key, const aclTensor *value, const aclTensor *seqStarts, int8_t kvCacheCfg, bool isSeqLensCumsumType, bool hasSeqStarts, - uint64_t &workspaceSize, atb::Operation **op, atb::Context *context) + uint64_t *workspaceSize, atb::Operation **op, atb::Context *context) { atb::infer::PagedCacheLoadParam param; param.kvCacheCfg = atb::infer::PagedCacheLoadParam::KvCacheCfg(kvCacheCfg); @@ -72,7 +72,7 @@ atb::Status AtbPagedCacheLoadGetWorkspaceSize(const aclTensor *keyCache, const a ATB_LOG(ERROR) << "AtbPagedCacheLoadGetWorkspaceSize opeartion pointer is nullptr!"; return atb::ERROR_INVALID_OPERATION_ADDR; } - atb::Status st = (*op)->Setup(pack, workspaceSize, context); + atb::Status st = (*op)->Setup(pack, *workspaceSize, context); ATB_CHECK(st == atb::NO_ERROR, "AtbPagedCacheLoad Setup failed!", return st); return atb::NO_ERROR; } diff --git a/src/cinterface/atb_acl_ring_mla.cpp b/src/cinterface/atb_acl_ring_mla.cpp index 677fcf1b..62468810 100644 --- a/src/cinterface/atb_acl_ring_mla.cpp +++ b/src/cinterface/atb_acl_ring_mla.cpp @@ -23,7 +23,7 @@ atb::Status AtbRingMLAGetWorkspaceSize(const aclTensor *querySplit1, const aclTe const aclTensor *mask, const aclTensor *seqLen, const aclTensor *prevOut, const aclTensor *prevLse, int32_t headNum, int32_t kvHeadNum, float qkScale, int kernelType, int maskType, int inputLayout, int calcType, aclTensor *output, - aclTensor *softmaxLse, uint64_t &workspaceSize, atb::Operation **op, + aclTensor *softmaxLse, uint64_t *workspaceSize, atb::Operation **op, atb::Context *context) { atb::infer::RingMLAParam param; @@ 
-80,7 +80,7 @@ atb::Status AtbRingMLAGetWorkspaceSize(const aclTensor *querySplit1, const aclTe ATB_LOG(ERROR) << "AtbRingMLAGetWorkspaceSize opeartion pointer is nullptr!"; return atb::ERROR_INVALID_OPERATION_ADDR; } - status = (*op)->Setup(pack, workspaceSize, context); + status = (*op)->Setup(pack, *workspaceSize, context); ATB_CHECK(status == atb::NO_ERROR, "AtbRingMLA Setup failed!", return status); return atb::NO_ERROR; } diff --git a/src/cinterface/atb_acl_self_attention_prefix_encoder.cpp b/src/cinterface/atb_acl_self_attention_prefix_encoder.cpp index ffc3e5df..73e4e366 100644 --- a/src/cinterface/atb_acl_self_attention_prefix_encoder.cpp +++ b/src/cinterface/atb_acl_self_attention_prefix_encoder.cpp @@ -23,7 +23,7 @@ atb::Status AtbSelfAttentionPrefixEncoderGetWorkspaceSize(const aclTensor *query const aclTensor *mask, const aclTensor *seqLen, const aclTensor *kvSeqLen, const aclTensor *slopes, int maskType, int32_t headNum, int32_t kvHeadNum, - float qkScale, aclTensor *attnOut, uint64_t &workspaceSize, + float qkScale, aclTensor *attnOut, uint64_t *workspaceSize, atb::Operation **op, atb::Context *context) { atb::infer::SelfAttentionParam param; @@ -94,7 +94,7 @@ atb::Status AtbSelfAttentionPrefixEncoderGetWorkspaceSize(const aclTensor *query ATB_LOG(ERROR) << "AtbSelfAttentionPrefixEncoderGetWorkspaceSize opeartion pointer is nullptr!"; return atb::ERROR_INVALID_OPERATION_ADDR; } - status = (*op)->Setup(pack, workspaceSize, context); + status = (*op)->Setup(pack, *workspaceSize, context); ATB_CHECK(status == atb::NO_ERROR, "AtbSelfAttentionPrefixEncoder Setup failed!", return status); return atb::NO_ERROR; } diff --git a/src/cinterface/atb_acl_util.cpp b/src/cinterface/atb_acl_util.cpp index 40ec98f6..d4061128 100644 --- a/src/cinterface/atb_acl_util.cpp +++ b/src/cinterface/atb_acl_util.cpp @@ -17,9 +17,9 @@ extern "C" { // 256GB const int64_t MAX_TENSOR_SIZE = 256uLL * 1024uLL * 1024uLL * 1024uLL; -int64_t GetTensorSize(const aclTensor &input) 
+int64_t GetTensorSize(const aclTensor *input) { - const op::Shape shape = input.GetViewShape(); + const op::Shape shape = input->GetViewShape(); const size_t dims = shape.GetDimNum(); int64_t size = 1; for (size_t i = 0; i < dims; ++i) { @@ -57,7 +57,7 @@ atb::Status aclTensorToAtbTensor(const aclTensor *aclTensorSrc, atb::Tensor *atb atbTensorDst->desc = desc; atbTensorDst->deviceData = aclTensorSrc->GetData(); atbTensorDst->hostData = nullptr; - int64_t tensorSize = GetTensorSize(*aclTensorSrc); + int64_t tensorSize = GetTensorSize(aclTensorSrc); int64_t dataTypeSize = static_cast(aclDataTypeSize(dataType)); if (tensorSize > MAX_TENSOR_SIZE / dataTypeSize) { ATB_LOG(ERROR) << "The size of a tensor * dataTypeSize should be no more than 256GB, but got tensor size: " @@ -97,7 +97,7 @@ atb::Status aclTensorToAtbTensorHost(const aclTensor *aclTensorSrc, atb::Tensor atbTensorDst->desc = desc; atbTensorDst->deviceData = nullptr; atbTensorDst->hostData = aclTensorSrc->GetData(); - int64_t tensorSize = GetTensorSize(*aclTensorSrc); + int64_t tensorSize = GetTensorSize(aclTensorSrc); int64_t dataTypeSize = static_cast(aclDataTypeSize(dataType)); if (tensorSize > MAX_TENSOR_SIZE / dataTypeSize) { ATB_LOG(ERROR) << "The size of a tensor * dataTypeSize should be no more than 256GB, but got tensor size: " -- Gitee From 2a10ca4d1fa83b0ffa6265002e7425246c6f6e48 Mon Sep 17 00:00:00 2001 From: He Changcheng Date: Mon, 22 Sep 2025 17:46:39 +0800 Subject: [PATCH 44/94] fix --- src/kernels/lcal/src/lccl.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/kernels/lcal/src/lccl.cpp b/src/kernels/lcal/src/lccl.cpp index 62cf73d7..8d515fb2 100644 --- a/src/kernels/lcal/src/lccl.cpp +++ b/src/kernels/lcal/src/lccl.cpp @@ -282,7 +282,7 @@ uint32_t Lccl::GetBlockNum(LcalType cclType, uint32_t rankSize, int64_t dataSize limitType = aclrtDevResLimitType::ACL_RT_DEV_RES_CUBE_CORE; } - int res = comm_->CallAclRtGetRes(static_cast(limitType), &limitVal); + int res = 
GetAclResInCurThread(static_cast(limitType), &limitVal); if (res == LCAL_SUCCESS) { MKI_LOG(DEBUG) << "Required blockNum(" << blockNum << ") limit:(limitVal=" << limitVal << ", limitType=" << static_cast(limitType) << ")"; -- Gitee From 623add70587cb6b198144530fc1a41d6efef5cc1 Mon Sep 17 00:00:00 2001 From: guo-jiong Date: Sat, 20 Sep 2025 17:06:43 +0800 Subject: [PATCH 45/94] change ops dir --- src/CMakeLists.txt | 6 +-- .../event_operation/event_operation.cpp | 0 .../event_operation/event_operation.h | 0 .../event_operation/event_runner.cpp | 0 .../ops_common/event_operation/event_runner.h | 0 .../activation/activation_operation.cpp | 0 .../activation/activation_operation.h | 0 .../activation/activation_ops_runner.cpp | 0 .../activation/activation_ops_runner.h | 0 .../all_gather/all_gather_hccl_runner.cpp | 0 .../all_gather/all_gather_hccl_runner.h | 0 .../all_gather/all_gather_lccl_runner.cpp | 0 .../all_gather/all_gather_lccl_runner.h | 0 .../all_gather/all_gather_operation.cpp | 0 .../all_gather/all_gather_operation.h | 0 .../all_gatherv/all_gatherv_hccl_runner.cpp | 0 .../all_gatherv/all_gatherv_hccl_runner.h | 0 .../all_gatherv/all_gatherv_operation.cpp | 0 .../all_gatherv/all_gatherv_operation.h | 0 .../all_reduce/all_reduce_hccl_runner.cpp | 0 .../all_reduce/all_reduce_hccl_runner.h | 0 .../all_reduce/all_reduce_lccl_runner.cpp | 0 .../all_reduce/all_reduce_lccl_runner.h | 0 .../all_reduce/all_reduce_operation.cpp | 0 .../all_reduce/all_reduce_operation.h | 0 .../all_to_all/all_to_all_hccl_runner.cpp | 0 .../all_to_all/all_to_all_hccl_runner.h | 0 .../all_to_all/all_to_all_lccl_runner.cpp | 0 .../all_to_all/all_to_all_lccl_runner.h | 0 .../all_to_all/all_to_all_operation.cpp | 0 .../all_to_all/all_to_all_operation.h | 0 .../all_to_allv/all_to_allv_hccl_runner.cpp | 0 .../all_to_allv/all_to_allv_hccl_runner.h | 0 .../all_to_allv/all_to_allv_operation.cpp | 0 .../all_to_allv/all_to_allv_operation.h | 0 .../all_to_allvv2_hccl_runner.cpp | 0 
.../all_to_allvv2/all_to_allvv2_hccl_runner.h | 0 .../all_to_allvv2/all_to_allvv2_operation.cpp | 0 .../all_to_allvv2/all_to_allvv2_operation.h | 0 .../as_strided/as_strided_operation.cpp | 0 .../as_strided/as_strided_operation.h | 0 .../as_strided/as_strided_ops_runner.cpp | 0 .../as_strided/as_strided_ops_runner.h | 0 .../block_copy/block_copy_operation.cpp | 0 .../block_copy/block_copy_operation.h | 0 .../block_copy/block_copy_ops_runner.cpp | 0 .../block_copy/block_copy_ops_runner.h | 0 .../broadcast/broadcast_hccl_runner.cpp | 0 .../broadcast/broadcast_hccl_runner.h | 0 .../broadcast/broadcast_lccl_runner.cpp | 0 .../broadcast/broadcast_lccl_runner.h | 0 .../broadcast/broadcast_operation.cpp | 0 .../ops_infer/broadcast/broadcast_operation.h | 0 .../cohere_layernorm_operation.cpp | 0 .../cohere_layernorm_operation.h | 0 .../cohere_layernorm_runner.cpp | 0 .../cohere_layernorm_runner.h | 0 .../ops_infer/concat/concat_operation.cpp | 0 .../ops_infer/concat/concat_operation.h | 0 .../ops_infer/concat/concat_ops_runner.cpp | 0 .../ops_infer/concat/concat_ops_runner.h | 0 .../ops_infer/cumsum/cumsum_operation.cpp | 0 .../ops_infer/cumsum/cumsum_operation.h | 0 .../ops_infer/cumsum/cumsum_ops_runner.cpp | 0 .../ops_infer/cumsum/cumsum_ops_runner.h | 0 .../dynamic_ntk/dynamic_ntk_operation.cpp | 0 .../dynamic_ntk/dynamic_ntk_operation.h | 0 .../dynamic_ntk/dynamic_ntk_ops_runner.cpp | 0 .../dynamic_ntk/dynamic_ntk_ops_runner.h | 0 .../ops_infer/elewise/elewise_operation.cpp | 0 .../ops_infer/elewise/elewise_operation.h | 0 .../ops_infer/elewise/elewise_ops_runner.cpp | 0 .../ops_infer/elewise/elewise_ops_runner.h | 0 .../ops_infer/faupdate/faupdate_operation.cpp | 0 .../ops_infer/faupdate/faupdate_operation.h | 0 .../faupdate/faupdate_ops_runner.cpp | 0 .../ops_infer/faupdate/faupdate_ops_runner.h | 0 .../ops_infer/fill/fill_operation.cpp | 0 src/{ => ops}/ops_infer/fill/fill_operation.h | 0 .../ops_infer/fill/fill_ops_runner.cpp | 0 
.../ops_infer/fill/fill_ops_runner.h | 0 .../atb_acl_fused_add_topk_div.cpp | 0 .../fused_add_topk_div_operation.cpp | 0 .../fused_add_topk_div_operation.h | 0 .../fused_add_topk_div_ops_runner.cpp | 0 .../fused_add_topk_div_ops_runner.h | 0 .../ops_infer/gather/gather_operation.cpp | 0 .../ops_infer/gather/gather_operation.h | 0 .../ops_infer/gather/gather_ops_runner.cpp | 0 .../ops_infer/gather/gather_ops_runner.h | 0 .../gather_pre_rms_norm_operation.cpp | 0 .../gather_pre_rms_norm_operation.h | 0 .../gather_pre_rms_norm_ops_runner.cpp | 0 .../gather_pre_rms_norm_ops_runner.h | 0 .../ops_infer/gating/gating_operation.cpp | 0 .../ops_infer/gating/gating_operation.h | 0 .../ops_infer/gating/gating_ops_runner.cpp | 0 .../ops_infer/gating/gating_ops_runner.h | 0 ...gmm_deq_swiglu_quant_gmm_deq_operation.cpp | 0 .../gmm_deq_swiglu_quant_gmm_deq_operation.h | 0 ...mm_deq_swiglu_quant_gmm_deq_ops_runner.cpp | 0 .../gmm_deq_swiglu_quant_gmm_deq_ops_runner.h | 0 .../group_topk/group_topk_operation.cpp | 0 .../group_topk/group_topk_operation.h | 0 .../group_topk/group_topk_ops_runner.cpp | 0 .../group_topk/group_topk_ops_runner.h | 0 .../grouped_matmul_inplace_add_operation.cpp | 0 .../grouped_matmul_inplace_add_operation.h | 0 .../grouped_matmul_inplace_add_ops_runner.cpp | 0 .../grouped_matmul_inplace_add_ops_runner.h | 0 .../grouped_matmul_with_routing_operation.cpp | 0 .../grouped_matmul_with_routing_operation.h | 0 .../grouped_matmul_with_routing_runner.cpp | 0 .../grouped_matmul_with_routing_runner.h | 0 .../index_add/index_add_operation.cpp | 0 .../ops_infer/index_add/index_add_operation.h | 0 .../index_add/index_add_ops_runner.cpp | 0 .../index_add/index_add_ops_runner.h | 0 .../ops_infer/kv_cache/kv_cache_operation.cpp | 0 .../ops_infer/kv_cache/kv_cache_operation.h | 0 .../kv_cache/kv_cache_ops_runner.cpp | 0 .../ops_infer/kv_cache/kv_cache_ops_runner.h | 0 .../layer_norm/layer_norm_operation.cpp | 0 .../layer_norm/layer_norm_operation.h | 0 
.../layer_norm/layer_norm_ops_runner.cpp | 0 .../layer_norm/layer_norm_ops_runner.h | 0 .../layer_norm_with_stride_operation.cpp | 0 .../layer_norm_with_stride_operation.h | 0 .../layer_norm_with_stride_ops_runner.cpp | 0 .../layer_norm_with_stride_ops_runner.h | 0 .../ops_infer/linear/linear_operation.cpp | 0 .../ops_infer/linear/linear_operation.h | 0 .../ops_infer/linear/linear_ops_runner.cpp | 0 .../ops_infer/linear/linear_ops_runner.h | 0 .../linear_parallel_aclnn_runner.cpp | 0 .../linear_parallel_aclnn_runner.h | 0 .../linear_parallel_graph_runner.cpp | 0 .../linear_parallel_graph_runner.h | 0 .../linear_parallel_lcoc_runner.cpp | 0 .../linear_parallel_lcoc_runner.h | 0 .../linear_parallel_operation.cpp | 0 .../linear_parallel_operation.h | 0 .../linear_sparse/linear_sparse_operation.cpp | 0 .../linear_sparse/linear_sparse_operation.h | 0 .../linear_sparse_ops_runner.cpp | 0 .../linear_sparse/linear_sparse_ops_runner.h | 0 .../mla_preprocess/atb_acl_mla_preprocess.cpp | 0 .../mla_preprocess_aclnn_runner.cpp | 0 .../mla_preprocess_aclnn_runner.h | 0 .../mla_preprocess_operation.cpp | 0 .../mla_preprocess/mla_preprocess_operation.h | 0 .../mla_preprocess_ops_runner.cpp | 0 .../mla_preprocess_ops_runner.h | 0 .../mla_preprocess_ops_runner_split.cpp | 0 .../mla_preprocess_ops_runner_split.h | 0 .../mm_deq_swiglu_quant_mm_deq_operation.cpp | 0 .../mm_deq_swiglu_quant_mm_deq_operation.h | 0 .../mm_deq_swiglu_quant_mm_deq_ops_runner.cpp | 0 .../mm_deq_swiglu_quant_mm_deq_ops_runner.h | 0 .../multi_latent_attention/atb_acl_mla.cpp | 0 .../multi_latent_attention_operation.cpp | 0 .../multi_latent_attention_operation.h | 0 .../multi_latent_attention_ops_runner.cpp | 0 .../multi_latent_attention_ops_runner.h | 0 ...ti_latent_attention_ops_runner_prefill.cpp | 0 ...ulti_latent_attention_ops_runner_prefill.h | 0 .../multi_latent_attention/param.cpp | 0 .../ops_infer/multi_latent_attention/param.h | 0 .../multinomial/multinomial_operation.cpp | 0 
.../multinomial/multinomial_operation.h | 0 .../multinomial/multinomial_ops_runner.cpp | 0 .../multinomial/multinomial_ops_runner.h | 0 .../ops_infer/nonzero/nonzero_operation.cpp | 0 .../ops_infer/nonzero/nonzero_operation.h | 0 .../ops_infer/nonzero/nonzero_runner.cpp | 0 .../ops_infer/nonzero/nonzero_runner.h | 0 .../norm_rope_reshape_operation.cpp | 7 ++- .../norm_rope_reshape_operation.h | 0 .../norm_rope_reshape_ops_runner.cpp | 7 ++- .../norm_rope_reshape_ops_runner.h | 0 .../ops_infer/onehot/onehot_operation.cpp | 0 .../ops_infer/onehot/onehot_operation.h | 0 .../ops_infer/onehot/onehot_ops_runner.cpp | 0 .../ops_infer/onehot/onehot_ops_runner.h | 0 src/{ => ops}/ops_infer/pad/pad_operation.cpp | 0 src/{ => ops}/ops_infer/pad/pad_operation.h | 0 .../ops_infer/pad/pad_ops_runner.cpp | 0 src/{ => ops}/ops_infer/pad/pad_ops_runner.h | 0 .../paged_attention_operation.cpp | 0 .../paged_attention_operation.h | 0 .../paged_attention_ops_runner.cpp | 0 .../paged_attention_ops_runner.h | 0 .../paged_attention_ops_runner_910a.cpp | 0 .../paged_attention_ops_runner_910a.h | 0 .../paged_attention_runner_utils.cpp | 0 .../paged_attention_runner_utils.h | 0 .../ops_infer/paged_attention/param.cpp | 0 .../ops_infer/paged_attention/param.h | 0 .../atb_acl_paged_cache_load.cpp | 0 .../paged_cache_load_operation.cpp | 0 .../paged_cache_load_operation.h | 0 .../paged_cache_load_ops_runner.cpp | 0 .../paged_cache_load_ops_runner.h | 0 .../razor_fusion_attention_operation.cpp | 0 .../razor_fusion_attention_operation.h | 0 .../razor_fusion_attention_ops_runner.cpp | 0 .../razor_fusion_attention_ops_runner.h | 0 .../ops_infer/recv/recv_hccl_runner.cpp | 0 .../ops_infer/recv/recv_hccl_runner.h | 0 .../ops_infer/recv/recv_operation.cpp | 0 src/{ => ops}/ops_infer/recv/recv_operation.h | 0 .../ops_infer/reduce/reduce_operation.cpp | 0 .../ops_infer/reduce/reduce_operation.h | 0 .../ops_infer/reduce/reduce_ops_runner.cpp | 0 .../ops_infer/reduce/reduce_ops_runner.h | 0 
.../reduce_scatter_hccl_runner.cpp | 0 .../reduce_scatter_hccl_runner.h | 0 .../reduce_scatter_lccl_runner.cpp | 0 .../reduce_scatter_lccl_runner.h | 0 .../reduce_scatter_operation.cpp | 0 .../reduce_scatter/reduce_scatter_operation.h | 0 .../reduce_scatterv_hccl_runner.cpp | 0 .../reduce_scatterv_hccl_runner.h | 0 .../reduce_scatterv_operation.cpp | 0 .../reduce_scatterv_operation.h | 0 .../ops_infer/relay_attention/param.cpp | 0 .../ops_infer/relay_attention/param.h | 0 .../relay_attention_operation.cpp | 0 .../relay_attention_operation.h | 0 .../relay_attention_ops_runner.cpp | 0 .../relay_attention_ops_runner.h | 0 .../ops_infer/repeat/repeat_operation.cpp | 0 .../ops_infer/repeat/repeat_operation.h | 0 .../ops_infer/repeat/repeat_ops_runner.cpp | 0 .../ops_infer/repeat/repeat_ops_runner.h | 0 .../reshape_and_cache_operation.cpp | 0 .../reshape_and_cache_operation.h | 0 .../reshape_and_cache_ops_runner.cpp | 0 .../reshape_and_cache_ops_runner.h | 0 .../reshape_and_cache_ops_runner_310p.cpp | 0 .../reshape_and_cache_ops_runner_310p.h | 0 .../reshape_and_cache_ops_runner_A2_NZ.cpp | 0 .../reshape_and_cache_ops_runner_A2_NZ.h | 0 .../reshape_and_cache_ops_runner_SISO.cpp | 0 .../reshape_and_cache_ops_runner_SISO.h | 0 .../reshape_and_cache_omni_operation.cpp | 0 .../reshape_and_cache_omni_operation.h | 0 .../reshape_and_cache_omni_ops_runner.cpp | 0 .../reshape_and_cache_omni_ops_runner.h | 0 ...eshape_and_cache_with_stride_operation.cpp | 0 .../reshape_and_cache_with_stride_operation.h | 0 ...shape_and_cache_with_stride_ops_runner.cpp | 0 ...reshape_and_cache_with_stride_ops_runner.h | 0 ..._and_cache_with_stride_ops_runner_SISO.cpp | 0 ...pe_and_cache_with_stride_ops_runner_SISO.h | 0 .../ops_infer/ring_mla/atb_acl_ring_mla.cpp | 0 src/{ => ops}/ops_infer/ring_mla/param.cpp | 0 src/{ => ops}/ops_infer/ring_mla/param.h | 0 .../ops_infer/ring_mla/ring_mla_operation.cpp | 0 .../ops_infer/ring_mla/ring_mla_operation.h | 0 .../ring_mla/ring_mla_ops_runner.cpp | 0 
.../ops_infer/ring_mla/ring_mla_ops_runner.h | 0 .../ops_infer/rms_norm/rms_norm_operation.cpp | 0 .../ops_infer/rms_norm/rms_norm_operation.h | 0 .../rms_norm/rms_norm_ops_runner.cpp | 0 .../ops_infer/rms_norm/rms_norm_ops_runner.h | 0 .../rms_norm_with_stride_operation.cpp | 0 .../rms_norm_with_stride_operation.h | 0 .../rms_norm_with_stride_ops_runner.cpp | 0 .../rms_norm_with_stride_ops_runner.h | 0 .../ops_infer/rope/rope_operation.cpp | 0 src/{ => ops}/ops_infer/rope/rope_operation.h | 0 .../ops_infer/rope/rope_ops_runner.cpp | 0 .../ops_infer/rope/rope_ops_runner.h | 0 .../rope_q_concat/rope_q_concat_operation.cpp | 0 .../rope_q_concat/rope_q_concat_operation.h | 0 .../rope_q_concat_ops_runner.cpp | 0 .../rope_q_concat/rope_q_concat_ops_runner.h | 0 .../scatter_elements_v2_operation.cpp | 0 .../scatter_elements_v2_operation.h | 0 .../scatter_elements_v2_ops_runner.cpp | 0 .../scatter_elements_v2_ops_runner.h | 0 .../atb_acl_self_attention_prefix_encoder.cpp | 0 .../ops_infer/self_attention/param.cpp | 0 .../ops_infer/self_attention/param.h | 0 ...tention_encoder_fuison_ops_runner_910a.cpp | 0 ...lf_attention_encoder_fusion_ops_runner.cpp | 0 ...self_attention_encoder_fusion_ops_runner.h | 0 ...attention_encoder_fusion_ops_runner_910a.h | 0 ...elf_attention_fusion_bypass_ops_runner.cpp | 0 .../self_attention_fusion_bypass_ops_runner.h | 0 ...ttention_fusion_bypass_ops_runner_910a.cpp | 0 ..._attention_fusion_bypass_ops_runner_910a.h | 0 ...ttention_fusion_bypass_ops_runner_BNSD.cpp | 0 ..._attention_fusion_bypass_ops_runner_BNSD.h | 0 ...ion_fusion_bypass_ops_runner_BNSD_910a.cpp | 0 ...ntion_fusion_bypass_ops_runner_BNSD_910a.h | 0 .../self_attention_fusion_ops_runner.cpp | 0 .../self_attention_fusion_ops_runner.h | 0 .../self_attention_fusion_ops_runner_910a.cpp | 0 .../self_attention_fusion_ops_runner_910a.h | 0 .../self_attention_operation.cpp | 0 .../self_attention/self_attention_operation.h | 0 ...lf_attention_prefix_encoder_ops_runner.cpp | 0 
...self_attention_prefix_encoder_ops_runner.h | 0 .../self_attention_runner_utils.cpp | 0 .../self_attention_runner_utils.h | 0 .../ops_infer/send/send_hccl_runner.cpp | 0 .../ops_infer/send/send_hccl_runner.h | 0 .../ops_infer/send/send_operation.cpp | 0 src/{ => ops}/ops_infer/send/send_operation.h | 0 .../set_value/set_value_operation.cpp | 0 .../ops_infer/set_value/set_value_operation.h | 0 .../set_value/set_value_ops_runner.cpp | 0 .../set_value/set_value_ops_runner.h | 0 .../ops_infer/slice/slice_operation.cpp | 0 .../ops_infer/slice/slice_operation.h | 0 .../ops_infer/slice/slice_ops_runner.cpp | 0 .../ops_infer/slice/slice_ops_runner.h | 0 .../ops_infer/softmax/softmax_operation.cpp | 0 .../ops_infer/softmax/softmax_operation.h | 0 .../ops_infer/softmax/softmax_ops_runner.cpp | 0 .../ops_infer/softmax/softmax_ops_runner.h | 0 .../ops_infer/sort/sort_operation.cpp | 0 src/{ => ops}/ops_infer/sort/sort_operation.h | 0 .../ops_infer/sort/sort_ops_runner.cpp | 0 .../ops_infer/sort/sort_ops_runner.h | 52 +++++++++---------- .../ops_infer/split/split_operation.cpp | 0 .../ops_infer/split/split_operation.h | 0 .../ops_infer/split/split_ops_runner.cpp | 0 .../ops_infer/split/split_ops_runner.h | 0 .../swiglu_quant/swiglu_quant_operation.cpp | 0 .../swiglu_quant/swiglu_quant_operation.h | 0 .../swiglu_quant/swiglu_quant_ops_runner.cpp | 0 .../swiglu_quant/swiglu_quant_ops_runner.h | 0 .../topk_topp_sampling_operation.cpp | 0 .../topk_topp_sampling_operation.h | 0 .../topk_topp_sampling_ops_runner.cpp | 0 .../topk_topp_sampling_ops_runner.h | 0 .../transdata/transdata_operation.cpp | 0 .../ops_infer/transdata/transdata_operation.h | 0 .../transdata/transdata_ops_runner.cpp | 0 .../transdata/transdata_ops_runner.h | 0 .../transpose/transpose_operation.cpp | 0 .../ops_infer/transpose/transpose_operation.h | 0 .../transpose/transpose_ops_runner.cpp | 0 .../transpose/transpose_ops_runner.h | 0 .../ops_infer/unpad/unpad_operation.cpp | 0 
.../ops_infer/unpad/unpad_operation.h | 0 .../ops_infer/unpad/unpad_ops_runner.cpp | 0 .../ops_infer/unpad/unpad_ops_runner.h | 0 .../ops_infer/where/where_operation.cpp | 0 .../ops_infer/where/where_operation.h | 0 .../ops_infer/where/where_ops_runner.cpp | 0 .../ops_infer/where/where_ops_runner.h | 0 .../fast_soft_max/fastsoftmax_operation.cpp | 0 .../fast_soft_max/fastsoftmax_operation.h | 0 .../fast_soft_max/fastsoftmax_ops_runner.cpp | 0 .../fast_soft_max/fastsoftmax_ops_runner.h | 0 .../fastsoftmaxgrad_operation.cpp | 0 .../fastsoftmaxgrad_operation.h | 0 .../fastsoftmaxgrad_ops_runner.cpp | 0 .../fastsoftmaxgrad_ops_runner.h | 0 .../genattentionmask_operation.cpp | 0 .../genattentionmask_operation.h | 0 .../genattentionmask_ops_runner.cpp | 0 .../genattentionmask_ops_runner.h | 0 .../laser_attention_operation.cpp | 0 .../laser_attention_operation.h | 0 .../laser_attention_ops_runner.cpp | 0 .../laser_attention_ops_runner.h | 0 .../laser_attention_grad_operation.cpp | 0 .../laser_attention_grad_operation.h | 0 .../laser_attention_grad_ops_runner.cpp | 0 .../laser_attention_grad_ops_runner.h | 0 .../pad_with_hidden_state_operation.cpp | 0 .../pad_with_hidden_state_operation.h | 0 .../pad_with_hidden_state_ops_runner.cpp | 0 .../pad_with_hidden_state_ops_runner.h | 0 .../rms_norm_backward_operation.cpp | 0 .../rms_norm_backward_operation.h | 0 .../rms_norm_backward_ops_runner.cpp | 0 .../rms_norm_backward_ops_runner.h | 0 .../rope_grad/rope_grad_operation.cpp | 0 .../ops_train/rope_grad/rope_grad_operation.h | 0 .../rope_grad/rope_grad_ops_runner.cpp | 0 .../rope_grad/rope_grad_ops_runner.h | 0 .../stridedbatchmatmul_operation.cpp | 0 .../stridedbatchmatmul_operation.h | 0 .../stridedbatchmatmul_ops_runner.cpp | 0 .../stridedbatchmatmul_ops_runner.h | 0 .../unpad_with_hidden_state_operation.cpp | 0 .../unpad_with_hidden_state_operation.h | 0 .../unpad_with_hidden_state_ops_runner.cpp | 0 .../unpad_with_hidden_state_ops_runner.h | 0 395 files changed, 35 
insertions(+), 37 deletions(-) rename src/{ => ops}/ops_common/event_operation/event_operation.cpp (100%) rename src/{ => ops}/ops_common/event_operation/event_operation.h (100%) rename src/{ => ops}/ops_common/event_operation/event_runner.cpp (100%) rename src/{ => ops}/ops_common/event_operation/event_runner.h (100%) rename src/{ => ops}/ops_infer/activation/activation_operation.cpp (100%) rename src/{ => ops}/ops_infer/activation/activation_operation.h (100%) rename src/{ => ops}/ops_infer/activation/activation_ops_runner.cpp (100%) rename src/{ => ops}/ops_infer/activation/activation_ops_runner.h (100%) rename src/{ => ops}/ops_infer/all_gather/all_gather_hccl_runner.cpp (100%) rename src/{ => ops}/ops_infer/all_gather/all_gather_hccl_runner.h (100%) rename src/{ => ops}/ops_infer/all_gather/all_gather_lccl_runner.cpp (100%) rename src/{ => ops}/ops_infer/all_gather/all_gather_lccl_runner.h (100%) rename src/{ => ops}/ops_infer/all_gather/all_gather_operation.cpp (100%) rename src/{ => ops}/ops_infer/all_gather/all_gather_operation.h (100%) rename src/{ => ops}/ops_infer/all_gatherv/all_gatherv_hccl_runner.cpp (100%) rename src/{ => ops}/ops_infer/all_gatherv/all_gatherv_hccl_runner.h (100%) rename src/{ => ops}/ops_infer/all_gatherv/all_gatherv_operation.cpp (100%) rename src/{ => ops}/ops_infer/all_gatherv/all_gatherv_operation.h (100%) rename src/{ => ops}/ops_infer/all_reduce/all_reduce_hccl_runner.cpp (100%) rename src/{ => ops}/ops_infer/all_reduce/all_reduce_hccl_runner.h (100%) rename src/{ => ops}/ops_infer/all_reduce/all_reduce_lccl_runner.cpp (100%) rename src/{ => ops}/ops_infer/all_reduce/all_reduce_lccl_runner.h (100%) rename src/{ => ops}/ops_infer/all_reduce/all_reduce_operation.cpp (100%) rename src/{ => ops}/ops_infer/all_reduce/all_reduce_operation.h (100%) rename src/{ => ops}/ops_infer/all_to_all/all_to_all_hccl_runner.cpp (100%) rename src/{ => ops}/ops_infer/all_to_all/all_to_all_hccl_runner.h (100%) rename src/{ => 
ops}/ops_infer/all_to_all/all_to_all_lccl_runner.cpp (100%) rename src/{ => ops}/ops_infer/all_to_all/all_to_all_lccl_runner.h (100%) rename src/{ => ops}/ops_infer/all_to_all/all_to_all_operation.cpp (100%) rename src/{ => ops}/ops_infer/all_to_all/all_to_all_operation.h (100%) rename src/{ => ops}/ops_infer/all_to_allv/all_to_allv_hccl_runner.cpp (100%) rename src/{ => ops}/ops_infer/all_to_allv/all_to_allv_hccl_runner.h (100%) rename src/{ => ops}/ops_infer/all_to_allv/all_to_allv_operation.cpp (100%) rename src/{ => ops}/ops_infer/all_to_allv/all_to_allv_operation.h (100%) rename src/{ => ops}/ops_infer/all_to_allvv2/all_to_allvv2_hccl_runner.cpp (100%) rename src/{ => ops}/ops_infer/all_to_allvv2/all_to_allvv2_hccl_runner.h (100%) rename src/{ => ops}/ops_infer/all_to_allvv2/all_to_allvv2_operation.cpp (100%) rename src/{ => ops}/ops_infer/all_to_allvv2/all_to_allvv2_operation.h (100%) rename src/{ => ops}/ops_infer/as_strided/as_strided_operation.cpp (100%) rename src/{ => ops}/ops_infer/as_strided/as_strided_operation.h (100%) rename src/{ => ops}/ops_infer/as_strided/as_strided_ops_runner.cpp (100%) rename src/{ => ops}/ops_infer/as_strided/as_strided_ops_runner.h (100%) rename src/{ => ops}/ops_infer/block_copy/block_copy_operation.cpp (100%) rename src/{ => ops}/ops_infer/block_copy/block_copy_operation.h (100%) rename src/{ => ops}/ops_infer/block_copy/block_copy_ops_runner.cpp (100%) rename src/{ => ops}/ops_infer/block_copy/block_copy_ops_runner.h (100%) rename src/{ => ops}/ops_infer/broadcast/broadcast_hccl_runner.cpp (100%) rename src/{ => ops}/ops_infer/broadcast/broadcast_hccl_runner.h (100%) rename src/{ => ops}/ops_infer/broadcast/broadcast_lccl_runner.cpp (100%) rename src/{ => ops}/ops_infer/broadcast/broadcast_lccl_runner.h (100%) rename src/{ => ops}/ops_infer/broadcast/broadcast_operation.cpp (100%) rename src/{ => ops}/ops_infer/broadcast/broadcast_operation.h (100%) rename src/{ => 
ops}/ops_infer/cohere_layernorm/cohere_layernorm_operation.cpp (100%) rename src/{ => ops}/ops_infer/cohere_layernorm/cohere_layernorm_operation.h (100%) rename src/{ => ops}/ops_infer/cohere_layernorm/cohere_layernorm_runner.cpp (100%) rename src/{ => ops}/ops_infer/cohere_layernorm/cohere_layernorm_runner.h (100%) rename src/{ => ops}/ops_infer/concat/concat_operation.cpp (100%) rename src/{ => ops}/ops_infer/concat/concat_operation.h (100%) rename src/{ => ops}/ops_infer/concat/concat_ops_runner.cpp (100%) rename src/{ => ops}/ops_infer/concat/concat_ops_runner.h (100%) rename src/{ => ops}/ops_infer/cumsum/cumsum_operation.cpp (100%) rename src/{ => ops}/ops_infer/cumsum/cumsum_operation.h (100%) rename src/{ => ops}/ops_infer/cumsum/cumsum_ops_runner.cpp (100%) rename src/{ => ops}/ops_infer/cumsum/cumsum_ops_runner.h (100%) rename src/{ => ops}/ops_infer/dynamic_ntk/dynamic_ntk_operation.cpp (100%) rename src/{ => ops}/ops_infer/dynamic_ntk/dynamic_ntk_operation.h (100%) rename src/{ => ops}/ops_infer/dynamic_ntk/dynamic_ntk_ops_runner.cpp (100%) rename src/{ => ops}/ops_infer/dynamic_ntk/dynamic_ntk_ops_runner.h (100%) rename src/{ => ops}/ops_infer/elewise/elewise_operation.cpp (100%) rename src/{ => ops}/ops_infer/elewise/elewise_operation.h (100%) rename src/{ => ops}/ops_infer/elewise/elewise_ops_runner.cpp (100%) rename src/{ => ops}/ops_infer/elewise/elewise_ops_runner.h (100%) rename src/{ => ops}/ops_infer/faupdate/faupdate_operation.cpp (100%) rename src/{ => ops}/ops_infer/faupdate/faupdate_operation.h (100%) rename src/{ => ops}/ops_infer/faupdate/faupdate_ops_runner.cpp (100%) rename src/{ => ops}/ops_infer/faupdate/faupdate_ops_runner.h (100%) rename src/{ => ops}/ops_infer/fill/fill_operation.cpp (100%) rename src/{ => ops}/ops_infer/fill/fill_operation.h (100%) rename src/{ => ops}/ops_infer/fill/fill_ops_runner.cpp (100%) rename src/{ => ops}/ops_infer/fill/fill_ops_runner.h (100%) rename src/{ => 
ops}/ops_infer/fused_add_topk_div/atb_acl_fused_add_topk_div.cpp (100%) rename src/{ => ops}/ops_infer/fused_add_topk_div/fused_add_topk_div_operation.cpp (100%) rename src/{ => ops}/ops_infer/fused_add_topk_div/fused_add_topk_div_operation.h (100%) rename src/{ => ops}/ops_infer/fused_add_topk_div/fused_add_topk_div_ops_runner.cpp (100%) rename src/{ => ops}/ops_infer/fused_add_topk_div/fused_add_topk_div_ops_runner.h (100%) rename src/{ => ops}/ops_infer/gather/gather_operation.cpp (100%) rename src/{ => ops}/ops_infer/gather/gather_operation.h (100%) rename src/{ => ops}/ops_infer/gather/gather_ops_runner.cpp (100%) rename src/{ => ops}/ops_infer/gather/gather_ops_runner.h (100%) rename src/{ => ops}/ops_infer/gather_pre_rms_norm/gather_pre_rms_norm_operation.cpp (100%) rename src/{ => ops}/ops_infer/gather_pre_rms_norm/gather_pre_rms_norm_operation.h (100%) rename src/{ => ops}/ops_infer/gather_pre_rms_norm/gather_pre_rms_norm_ops_runner.cpp (100%) rename src/{ => ops}/ops_infer/gather_pre_rms_norm/gather_pre_rms_norm_ops_runner.h (100%) rename src/{ => ops}/ops_infer/gating/gating_operation.cpp (100%) rename src/{ => ops}/ops_infer/gating/gating_operation.h (100%) rename src/{ => ops}/ops_infer/gating/gating_ops_runner.cpp (100%) rename src/{ => ops}/ops_infer/gating/gating_ops_runner.h (100%) rename src/{ => ops}/ops_infer/gmm_deq_swiglu_quant_gmm_deq/gmm_deq_swiglu_quant_gmm_deq_operation.cpp (100%) rename src/{ => ops}/ops_infer/gmm_deq_swiglu_quant_gmm_deq/gmm_deq_swiglu_quant_gmm_deq_operation.h (100%) rename src/{ => ops}/ops_infer/gmm_deq_swiglu_quant_gmm_deq/gmm_deq_swiglu_quant_gmm_deq_ops_runner.cpp (100%) rename src/{ => ops}/ops_infer/gmm_deq_swiglu_quant_gmm_deq/gmm_deq_swiglu_quant_gmm_deq_ops_runner.h (100%) rename src/{ => ops}/ops_infer/group_topk/group_topk_operation.cpp (100%) rename src/{ => ops}/ops_infer/group_topk/group_topk_operation.h (100%) rename src/{ => ops}/ops_infer/group_topk/group_topk_ops_runner.cpp (100%) rename src/{ => 
ops}/ops_infer/group_topk/group_topk_ops_runner.h (100%) rename src/{ => ops}/ops_infer/grouped_matmul_inplace_add/grouped_matmul_inplace_add_operation.cpp (100%) rename src/{ => ops}/ops_infer/grouped_matmul_inplace_add/grouped_matmul_inplace_add_operation.h (100%) rename src/{ => ops}/ops_infer/grouped_matmul_inplace_add/grouped_matmul_inplace_add_ops_runner.cpp (100%) rename src/{ => ops}/ops_infer/grouped_matmul_inplace_add/grouped_matmul_inplace_add_ops_runner.h (100%) rename src/{ => ops}/ops_infer/grouped_matmul_with_routing/grouped_matmul_with_routing_operation.cpp (100%) rename src/{ => ops}/ops_infer/grouped_matmul_with_routing/grouped_matmul_with_routing_operation.h (100%) rename src/{ => ops}/ops_infer/grouped_matmul_with_routing/grouped_matmul_with_routing_runner.cpp (100%) rename src/{ => ops}/ops_infer/grouped_matmul_with_routing/grouped_matmul_with_routing_runner.h (100%) rename src/{ => ops}/ops_infer/index_add/index_add_operation.cpp (100%) rename src/{ => ops}/ops_infer/index_add/index_add_operation.h (100%) rename src/{ => ops}/ops_infer/index_add/index_add_ops_runner.cpp (100%) rename src/{ => ops}/ops_infer/index_add/index_add_ops_runner.h (100%) rename src/{ => ops}/ops_infer/kv_cache/kv_cache_operation.cpp (100%) rename src/{ => ops}/ops_infer/kv_cache/kv_cache_operation.h (100%) rename src/{ => ops}/ops_infer/kv_cache/kv_cache_ops_runner.cpp (100%) rename src/{ => ops}/ops_infer/kv_cache/kv_cache_ops_runner.h (100%) rename src/{ => ops}/ops_infer/layer_norm/layer_norm_operation.cpp (100%) rename src/{ => ops}/ops_infer/layer_norm/layer_norm_operation.h (100%) rename src/{ => ops}/ops_infer/layer_norm/layer_norm_ops_runner.cpp (100%) rename src/{ => ops}/ops_infer/layer_norm/layer_norm_ops_runner.h (100%) rename src/{ => ops}/ops_infer/layer_norm_with_stride/layer_norm_with_stride_operation.cpp (100%) rename src/{ => ops}/ops_infer/layer_norm_with_stride/layer_norm_with_stride_operation.h (100%) rename src/{ => 
ops}/ops_infer/layer_norm_with_stride/layer_norm_with_stride_ops_runner.cpp (100%) rename src/{ => ops}/ops_infer/layer_norm_with_stride/layer_norm_with_stride_ops_runner.h (100%) rename src/{ => ops}/ops_infer/linear/linear_operation.cpp (100%) rename src/{ => ops}/ops_infer/linear/linear_operation.h (100%) rename src/{ => ops}/ops_infer/linear/linear_ops_runner.cpp (100%) rename src/{ => ops}/ops_infer/linear/linear_ops_runner.h (100%) rename src/{ => ops}/ops_infer/linear_parallel/linear_parallel_aclnn_runner.cpp (100%) rename src/{ => ops}/ops_infer/linear_parallel/linear_parallel_aclnn_runner.h (100%) rename src/{ => ops}/ops_infer/linear_parallel/linear_parallel_graph_runner.cpp (100%) rename src/{ => ops}/ops_infer/linear_parallel/linear_parallel_graph_runner.h (100%) rename src/{ => ops}/ops_infer/linear_parallel/linear_parallel_lcoc_runner.cpp (100%) rename src/{ => ops}/ops_infer/linear_parallel/linear_parallel_lcoc_runner.h (100%) rename src/{ => ops}/ops_infer/linear_parallel/linear_parallel_operation.cpp (100%) rename src/{ => ops}/ops_infer/linear_parallel/linear_parallel_operation.h (100%) rename src/{ => ops}/ops_infer/linear_sparse/linear_sparse_operation.cpp (100%) rename src/{ => ops}/ops_infer/linear_sparse/linear_sparse_operation.h (100%) rename src/{ => ops}/ops_infer/linear_sparse/linear_sparse_ops_runner.cpp (100%) rename src/{ => ops}/ops_infer/linear_sparse/linear_sparse_ops_runner.h (100%) rename src/{ => ops}/ops_infer/mla_preprocess/atb_acl_mla_preprocess.cpp (100%) rename src/{ => ops}/ops_infer/mla_preprocess/mla_preprocess_aclnn_runner.cpp (100%) rename src/{ => ops}/ops_infer/mla_preprocess/mla_preprocess_aclnn_runner.h (100%) rename src/{ => ops}/ops_infer/mla_preprocess/mla_preprocess_operation.cpp (100%) rename src/{ => ops}/ops_infer/mla_preprocess/mla_preprocess_operation.h (100%) rename src/{ => ops}/ops_infer/mla_preprocess/mla_preprocess_ops_runner.cpp (100%) rename src/{ => 
ops}/ops_infer/mla_preprocess/mla_preprocess_ops_runner.h (100%) rename src/{ => ops}/ops_infer/mla_preprocess/mla_preprocess_ops_runner_split.cpp (100%) rename src/{ => ops}/ops_infer/mla_preprocess/mla_preprocess_ops_runner_split.h (100%) rename src/{ => ops}/ops_infer/mm_deq_swiglu_quant_mm_deq/mm_deq_swiglu_quant_mm_deq_operation.cpp (100%) rename src/{ => ops}/ops_infer/mm_deq_swiglu_quant_mm_deq/mm_deq_swiglu_quant_mm_deq_operation.h (100%) rename src/{ => ops}/ops_infer/mm_deq_swiglu_quant_mm_deq/mm_deq_swiglu_quant_mm_deq_ops_runner.cpp (100%) rename src/{ => ops}/ops_infer/mm_deq_swiglu_quant_mm_deq/mm_deq_swiglu_quant_mm_deq_ops_runner.h (100%) rename src/{ => ops}/ops_infer/multi_latent_attention/atb_acl_mla.cpp (100%) rename src/{ => ops}/ops_infer/multi_latent_attention/multi_latent_attention_operation.cpp (100%) rename src/{ => ops}/ops_infer/multi_latent_attention/multi_latent_attention_operation.h (100%) rename src/{ => ops}/ops_infer/multi_latent_attention/multi_latent_attention_ops_runner.cpp (100%) rename src/{ => ops}/ops_infer/multi_latent_attention/multi_latent_attention_ops_runner.h (100%) rename src/{ => ops}/ops_infer/multi_latent_attention/multi_latent_attention_ops_runner_prefill.cpp (100%) rename src/{ => ops}/ops_infer/multi_latent_attention/multi_latent_attention_ops_runner_prefill.h (100%) rename src/{ => ops}/ops_infer/multi_latent_attention/param.cpp (100%) rename src/{ => ops}/ops_infer/multi_latent_attention/param.h (100%) rename src/{ => ops}/ops_infer/multinomial/multinomial_operation.cpp (100%) rename src/{ => ops}/ops_infer/multinomial/multinomial_operation.h (100%) rename src/{ => ops}/ops_infer/multinomial/multinomial_ops_runner.cpp (100%) rename src/{ => ops}/ops_infer/multinomial/multinomial_ops_runner.h (100%) rename src/{ => ops}/ops_infer/nonzero/nonzero_operation.cpp (100%) rename src/{ => ops}/ops_infer/nonzero/nonzero_operation.h (100%) rename src/{ => ops}/ops_infer/nonzero/nonzero_runner.cpp (100%) rename src/{ => 
ops}/ops_infer/nonzero/nonzero_runner.h (100%) rename src/{ => ops}/ops_infer/norm_rope_reshape/norm_rope_reshape_operation.cpp (97%) rename src/{ => ops}/ops_infer/norm_rope_reshape/norm_rope_reshape_operation.h (100%) rename src/{ => ops}/ops_infer/norm_rope_reshape/norm_rope_reshape_ops_runner.cpp (92%) rename src/{ => ops}/ops_infer/norm_rope_reshape/norm_rope_reshape_ops_runner.h (100%) rename src/{ => ops}/ops_infer/onehot/onehot_operation.cpp (100%) rename src/{ => ops}/ops_infer/onehot/onehot_operation.h (100%) rename src/{ => ops}/ops_infer/onehot/onehot_ops_runner.cpp (100%) rename src/{ => ops}/ops_infer/onehot/onehot_ops_runner.h (100%) rename src/{ => ops}/ops_infer/pad/pad_operation.cpp (100%) rename src/{ => ops}/ops_infer/pad/pad_operation.h (100%) rename src/{ => ops}/ops_infer/pad/pad_ops_runner.cpp (100%) rename src/{ => ops}/ops_infer/pad/pad_ops_runner.h (100%) rename src/{ => ops}/ops_infer/paged_attention/paged_attention_operation.cpp (100%) rename src/{ => ops}/ops_infer/paged_attention/paged_attention_operation.h (100%) rename src/{ => ops}/ops_infer/paged_attention/paged_attention_ops_runner.cpp (100%) rename src/{ => ops}/ops_infer/paged_attention/paged_attention_ops_runner.h (100%) rename src/{ => ops}/ops_infer/paged_attention/paged_attention_ops_runner_910a.cpp (100%) rename src/{ => ops}/ops_infer/paged_attention/paged_attention_ops_runner_910a.h (100%) rename src/{ => ops}/ops_infer/paged_attention/paged_attention_runner_utils.cpp (100%) rename src/{ => ops}/ops_infer/paged_attention/paged_attention_runner_utils.h (100%) rename src/{ => ops}/ops_infer/paged_attention/param.cpp (100%) rename src/{ => ops}/ops_infer/paged_attention/param.h (100%) rename src/{ => ops}/ops_infer/paged_cache_load/atb_acl_paged_cache_load.cpp (100%) rename src/{ => ops}/ops_infer/paged_cache_load/paged_cache_load_operation.cpp (100%) rename src/{ => ops}/ops_infer/paged_cache_load/paged_cache_load_operation.h (100%) rename src/{ => 
ops}/ops_infer/paged_cache_load/paged_cache_load_ops_runner.cpp (100%) rename src/{ => ops}/ops_infer/paged_cache_load/paged_cache_load_ops_runner.h (100%) rename src/{ => ops}/ops_infer/razor_fusion_attention/razor_fusion_attention_operation.cpp (100%) rename src/{ => ops}/ops_infer/razor_fusion_attention/razor_fusion_attention_operation.h (100%) rename src/{ => ops}/ops_infer/razor_fusion_attention/razor_fusion_attention_ops_runner.cpp (100%) rename src/{ => ops}/ops_infer/razor_fusion_attention/razor_fusion_attention_ops_runner.h (100%) rename src/{ => ops}/ops_infer/recv/recv_hccl_runner.cpp (100%) rename src/{ => ops}/ops_infer/recv/recv_hccl_runner.h (100%) rename src/{ => ops}/ops_infer/recv/recv_operation.cpp (100%) rename src/{ => ops}/ops_infer/recv/recv_operation.h (100%) rename src/{ => ops}/ops_infer/reduce/reduce_operation.cpp (100%) rename src/{ => ops}/ops_infer/reduce/reduce_operation.h (100%) rename src/{ => ops}/ops_infer/reduce/reduce_ops_runner.cpp (100%) rename src/{ => ops}/ops_infer/reduce/reduce_ops_runner.h (100%) rename src/{ => ops}/ops_infer/reduce_scatter/reduce_scatter_hccl_runner.cpp (100%) rename src/{ => ops}/ops_infer/reduce_scatter/reduce_scatter_hccl_runner.h (100%) rename src/{ => ops}/ops_infer/reduce_scatter/reduce_scatter_lccl_runner.cpp (100%) rename src/{ => ops}/ops_infer/reduce_scatter/reduce_scatter_lccl_runner.h (100%) rename src/{ => ops}/ops_infer/reduce_scatter/reduce_scatter_operation.cpp (100%) rename src/{ => ops}/ops_infer/reduce_scatter/reduce_scatter_operation.h (100%) rename src/{ => ops}/ops_infer/reduce_scatterv/reduce_scatterv_hccl_runner.cpp (100%) rename src/{ => ops}/ops_infer/reduce_scatterv/reduce_scatterv_hccl_runner.h (100%) rename src/{ => ops}/ops_infer/reduce_scatterv/reduce_scatterv_operation.cpp (100%) rename src/{ => ops}/ops_infer/reduce_scatterv/reduce_scatterv_operation.h (100%) rename src/{ => ops}/ops_infer/relay_attention/param.cpp (100%) rename src/{ => 
ops}/ops_infer/relay_attention/param.h (100%) rename src/{ => ops}/ops_infer/relay_attention/relay_attention_operation.cpp (100%) rename src/{ => ops}/ops_infer/relay_attention/relay_attention_operation.h (100%) rename src/{ => ops}/ops_infer/relay_attention/relay_attention_ops_runner.cpp (100%) rename src/{ => ops}/ops_infer/relay_attention/relay_attention_ops_runner.h (100%) rename src/{ => ops}/ops_infer/repeat/repeat_operation.cpp (100%) rename src/{ => ops}/ops_infer/repeat/repeat_operation.h (100%) rename src/{ => ops}/ops_infer/repeat/repeat_ops_runner.cpp (100%) rename src/{ => ops}/ops_infer/repeat/repeat_ops_runner.h (100%) rename src/{ => ops}/ops_infer/reshape_and_cache/reshape_and_cache_operation.cpp (100%) rename src/{ => ops}/ops_infer/reshape_and_cache/reshape_and_cache_operation.h (100%) rename src/{ => ops}/ops_infer/reshape_and_cache/reshape_and_cache_ops_runner.cpp (100%) rename src/{ => ops}/ops_infer/reshape_and_cache/reshape_and_cache_ops_runner.h (100%) rename src/{ => ops}/ops_infer/reshape_and_cache/reshape_and_cache_ops_runner_310p.cpp (100%) rename src/{ => ops}/ops_infer/reshape_and_cache/reshape_and_cache_ops_runner_310p.h (100%) rename src/{ => ops}/ops_infer/reshape_and_cache/reshape_and_cache_ops_runner_A2_NZ.cpp (100%) rename src/{ => ops}/ops_infer/reshape_and_cache/reshape_and_cache_ops_runner_A2_NZ.h (100%) rename src/{ => ops}/ops_infer/reshape_and_cache/reshape_and_cache_ops_runner_SISO.cpp (100%) rename src/{ => ops}/ops_infer/reshape_and_cache/reshape_and_cache_ops_runner_SISO.h (100%) rename src/{ => ops}/ops_infer/reshape_and_cache_omni/reshape_and_cache_omni_operation.cpp (100%) rename src/{ => ops}/ops_infer/reshape_and_cache_omni/reshape_and_cache_omni_operation.h (100%) rename src/{ => ops}/ops_infer/reshape_and_cache_omni/reshape_and_cache_omni_ops_runner.cpp (100%) rename src/{ => ops}/ops_infer/reshape_and_cache_omni/reshape_and_cache_omni_ops_runner.h (100%) rename src/{ => 
ops}/ops_infer/reshape_and_cache_with_stride/reshape_and_cache_with_stride_operation.cpp (100%) rename src/{ => ops}/ops_infer/reshape_and_cache_with_stride/reshape_and_cache_with_stride_operation.h (100%) rename src/{ => ops}/ops_infer/reshape_and_cache_with_stride/reshape_and_cache_with_stride_ops_runner.cpp (100%) rename src/{ => ops}/ops_infer/reshape_and_cache_with_stride/reshape_and_cache_with_stride_ops_runner.h (100%) rename src/{ => ops}/ops_infer/reshape_and_cache_with_stride/reshape_and_cache_with_stride_ops_runner_SISO.cpp (100%) rename src/{ => ops}/ops_infer/reshape_and_cache_with_stride/reshape_and_cache_with_stride_ops_runner_SISO.h (100%) rename src/{ => ops}/ops_infer/ring_mla/atb_acl_ring_mla.cpp (100%) rename src/{ => ops}/ops_infer/ring_mla/param.cpp (100%) rename src/{ => ops}/ops_infer/ring_mla/param.h (100%) rename src/{ => ops}/ops_infer/ring_mla/ring_mla_operation.cpp (100%) rename src/{ => ops}/ops_infer/ring_mla/ring_mla_operation.h (100%) rename src/{ => ops}/ops_infer/ring_mla/ring_mla_ops_runner.cpp (100%) rename src/{ => ops}/ops_infer/ring_mla/ring_mla_ops_runner.h (100%) rename src/{ => ops}/ops_infer/rms_norm/rms_norm_operation.cpp (100%) rename src/{ => ops}/ops_infer/rms_norm/rms_norm_operation.h (100%) rename src/{ => ops}/ops_infer/rms_norm/rms_norm_ops_runner.cpp (100%) rename src/{ => ops}/ops_infer/rms_norm/rms_norm_ops_runner.h (100%) rename src/{ => ops}/ops_infer/rms_norm_with_stride/rms_norm_with_stride_operation.cpp (100%) rename src/{ => ops}/ops_infer/rms_norm_with_stride/rms_norm_with_stride_operation.h (100%) rename src/{ => ops}/ops_infer/rms_norm_with_stride/rms_norm_with_stride_ops_runner.cpp (100%) rename src/{ => ops}/ops_infer/rms_norm_with_stride/rms_norm_with_stride_ops_runner.h (100%) rename src/{ => ops}/ops_infer/rope/rope_operation.cpp (100%) rename src/{ => ops}/ops_infer/rope/rope_operation.h (100%) rename src/{ => ops}/ops_infer/rope/rope_ops_runner.cpp (100%) rename src/{ => 
ops}/ops_infer/rope/rope_ops_runner.h (100%) rename src/{ => ops}/ops_infer/rope_q_concat/rope_q_concat_operation.cpp (100%) rename src/{ => ops}/ops_infer/rope_q_concat/rope_q_concat_operation.h (100%) rename src/{ => ops}/ops_infer/rope_q_concat/rope_q_concat_ops_runner.cpp (100%) rename src/{ => ops}/ops_infer/rope_q_concat/rope_q_concat_ops_runner.h (100%) rename src/{ => ops}/ops_infer/scatter_elements_v2/scatter_elements_v2_operation.cpp (100%) rename src/{ => ops}/ops_infer/scatter_elements_v2/scatter_elements_v2_operation.h (100%) rename src/{ => ops}/ops_infer/scatter_elements_v2/scatter_elements_v2_ops_runner.cpp (100%) rename src/{ => ops}/ops_infer/scatter_elements_v2/scatter_elements_v2_ops_runner.h (100%) rename src/{ => ops}/ops_infer/self_attention/atb_acl_self_attention_prefix_encoder.cpp (100%) rename src/{ => ops}/ops_infer/self_attention/param.cpp (100%) rename src/{ => ops}/ops_infer/self_attention/param.h (100%) rename src/{ => ops}/ops_infer/self_attention/self_attention_encoder_fuison_ops_runner_910a.cpp (100%) rename src/{ => ops}/ops_infer/self_attention/self_attention_encoder_fusion_ops_runner.cpp (100%) rename src/{ => ops}/ops_infer/self_attention/self_attention_encoder_fusion_ops_runner.h (100%) rename src/{ => ops}/ops_infer/self_attention/self_attention_encoder_fusion_ops_runner_910a.h (100%) rename src/{ => ops}/ops_infer/self_attention/self_attention_fusion_bypass_ops_runner.cpp (100%) rename src/{ => ops}/ops_infer/self_attention/self_attention_fusion_bypass_ops_runner.h (100%) rename src/{ => ops}/ops_infer/self_attention/self_attention_fusion_bypass_ops_runner_910a.cpp (100%) rename src/{ => ops}/ops_infer/self_attention/self_attention_fusion_bypass_ops_runner_910a.h (100%) rename src/{ => ops}/ops_infer/self_attention/self_attention_fusion_bypass_ops_runner_BNSD.cpp (100%) rename src/{ => ops}/ops_infer/self_attention/self_attention_fusion_bypass_ops_runner_BNSD.h (100%) rename src/{ => 
ops}/ops_infer/self_attention/self_attention_fusion_bypass_ops_runner_BNSD_910a.cpp (100%) rename src/{ => ops}/ops_infer/self_attention/self_attention_fusion_bypass_ops_runner_BNSD_910a.h (100%) rename src/{ => ops}/ops_infer/self_attention/self_attention_fusion_ops_runner.cpp (100%) rename src/{ => ops}/ops_infer/self_attention/self_attention_fusion_ops_runner.h (100%) rename src/{ => ops}/ops_infer/self_attention/self_attention_fusion_ops_runner_910a.cpp (100%) rename src/{ => ops}/ops_infer/self_attention/self_attention_fusion_ops_runner_910a.h (100%) rename src/{ => ops}/ops_infer/self_attention/self_attention_operation.cpp (100%) rename src/{ => ops}/ops_infer/self_attention/self_attention_operation.h (100%) rename src/{ => ops}/ops_infer/self_attention/self_attention_prefix_encoder_ops_runner.cpp (100%) rename src/{ => ops}/ops_infer/self_attention/self_attention_prefix_encoder_ops_runner.h (100%) rename src/{ => ops}/ops_infer/self_attention/self_attention_runner_utils.cpp (100%) rename src/{ => ops}/ops_infer/self_attention/self_attention_runner_utils.h (100%) rename src/{ => ops}/ops_infer/send/send_hccl_runner.cpp (100%) rename src/{ => ops}/ops_infer/send/send_hccl_runner.h (100%) rename src/{ => ops}/ops_infer/send/send_operation.cpp (100%) rename src/{ => ops}/ops_infer/send/send_operation.h (100%) rename src/{ => ops}/ops_infer/set_value/set_value_operation.cpp (100%) rename src/{ => ops}/ops_infer/set_value/set_value_operation.h (100%) rename src/{ => ops}/ops_infer/set_value/set_value_ops_runner.cpp (100%) rename src/{ => ops}/ops_infer/set_value/set_value_ops_runner.h (100%) rename src/{ => ops}/ops_infer/slice/slice_operation.cpp (100%) rename src/{ => ops}/ops_infer/slice/slice_operation.h (100%) rename src/{ => ops}/ops_infer/slice/slice_ops_runner.cpp (100%) rename src/{ => ops}/ops_infer/slice/slice_ops_runner.h (100%) rename src/{ => ops}/ops_infer/softmax/softmax_operation.cpp (100%) rename src/{ => 
ops}/ops_infer/softmax/softmax_operation.h (100%) rename src/{ => ops}/ops_infer/softmax/softmax_ops_runner.cpp (100%) rename src/{ => ops}/ops_infer/softmax/softmax_ops_runner.h (100%) rename src/{ => ops}/ops_infer/sort/sort_operation.cpp (100%) rename src/{ => ops}/ops_infer/sort/sort_operation.h (100%) rename src/{ => ops}/ops_infer/sort/sort_ops_runner.cpp (100%) rename src/{ => ops}/ops_infer/sort/sort_ops_runner.h (97%) rename src/{ => ops}/ops_infer/split/split_operation.cpp (100%) rename src/{ => ops}/ops_infer/split/split_operation.h (100%) rename src/{ => ops}/ops_infer/split/split_ops_runner.cpp (100%) rename src/{ => ops}/ops_infer/split/split_ops_runner.h (100%) rename src/{ => ops}/ops_infer/swiglu_quant/swiglu_quant_operation.cpp (100%) rename src/{ => ops}/ops_infer/swiglu_quant/swiglu_quant_operation.h (100%) rename src/{ => ops}/ops_infer/swiglu_quant/swiglu_quant_ops_runner.cpp (100%) rename src/{ => ops}/ops_infer/swiglu_quant/swiglu_quant_ops_runner.h (100%) rename src/{ => ops}/ops_infer/topk_topp_sampling/topk_topp_sampling_operation.cpp (100%) rename src/{ => ops}/ops_infer/topk_topp_sampling/topk_topp_sampling_operation.h (100%) rename src/{ => ops}/ops_infer/topk_topp_sampling/topk_topp_sampling_ops_runner.cpp (100%) rename src/{ => ops}/ops_infer/topk_topp_sampling/topk_topp_sampling_ops_runner.h (100%) rename src/{ => ops}/ops_infer/transdata/transdata_operation.cpp (100%) rename src/{ => ops}/ops_infer/transdata/transdata_operation.h (100%) rename src/{ => ops}/ops_infer/transdata/transdata_ops_runner.cpp (100%) rename src/{ => ops}/ops_infer/transdata/transdata_ops_runner.h (100%) rename src/{ => ops}/ops_infer/transpose/transpose_operation.cpp (100%) rename src/{ => ops}/ops_infer/transpose/transpose_operation.h (100%) rename src/{ => ops}/ops_infer/transpose/transpose_ops_runner.cpp (100%) rename src/{ => ops}/ops_infer/transpose/transpose_ops_runner.h (100%) rename src/{ => ops}/ops_infer/unpad/unpad_operation.cpp (100%) rename 
src/{ => ops}/ops_infer/unpad/unpad_operation.h (100%) rename src/{ => ops}/ops_infer/unpad/unpad_ops_runner.cpp (100%) rename src/{ => ops}/ops_infer/unpad/unpad_ops_runner.h (100%) rename src/{ => ops}/ops_infer/where/where_operation.cpp (100%) rename src/{ => ops}/ops_infer/where/where_operation.h (100%) rename src/{ => ops}/ops_infer/where/where_ops_runner.cpp (100%) rename src/{ => ops}/ops_infer/where/where_ops_runner.h (100%) rename src/{ => ops}/ops_train/fast_soft_max/fastsoftmax_operation.cpp (100%) rename src/{ => ops}/ops_train/fast_soft_max/fastsoftmax_operation.h (100%) rename src/{ => ops}/ops_train/fast_soft_max/fastsoftmax_ops_runner.cpp (100%) rename src/{ => ops}/ops_train/fast_soft_max/fastsoftmax_ops_runner.h (100%) rename src/{ => ops}/ops_train/fast_soft_max_grad/fastsoftmaxgrad_operation.cpp (100%) rename src/{ => ops}/ops_train/fast_soft_max_grad/fastsoftmaxgrad_operation.h (100%) rename src/{ => ops}/ops_train/fast_soft_max_grad/fastsoftmaxgrad_ops_runner.cpp (100%) rename src/{ => ops}/ops_train/fast_soft_max_grad/fastsoftmaxgrad_ops_runner.h (100%) rename src/{ => ops}/ops_train/gen_attention_mask/genattentionmask_operation.cpp (100%) rename src/{ => ops}/ops_train/gen_attention_mask/genattentionmask_operation.h (100%) rename src/{ => ops}/ops_train/gen_attention_mask/genattentionmask_ops_runner.cpp (100%) rename src/{ => ops}/ops_train/gen_attention_mask/genattentionmask_ops_runner.h (100%) rename src/{ => ops}/ops_train/laser_attention/laser_attention_operation.cpp (100%) rename src/{ => ops}/ops_train/laser_attention/laser_attention_operation.h (100%) rename src/{ => ops}/ops_train/laser_attention/laser_attention_ops_runner.cpp (100%) rename src/{ => ops}/ops_train/laser_attention/laser_attention_ops_runner.h (100%) rename src/{ => ops}/ops_train/laser_attention_grad/laser_attention_grad_operation.cpp (100%) rename src/{ => ops}/ops_train/laser_attention_grad/laser_attention_grad_operation.h (100%) rename src/{ => 
ops}/ops_train/laser_attention_grad/laser_attention_grad_ops_runner.cpp (100%) rename src/{ => ops}/ops_train/laser_attention_grad/laser_attention_grad_ops_runner.h (100%) rename src/{ => ops}/ops_train/pad_with_hidden_state/pad_with_hidden_state_operation.cpp (100%) rename src/{ => ops}/ops_train/pad_with_hidden_state/pad_with_hidden_state_operation.h (100%) rename src/{ => ops}/ops_train/pad_with_hidden_state/pad_with_hidden_state_ops_runner.cpp (100%) rename src/{ => ops}/ops_train/pad_with_hidden_state/pad_with_hidden_state_ops_runner.h (100%) rename src/{ => ops}/ops_train/rms_norm_backward/rms_norm_backward_operation.cpp (100%) rename src/{ => ops}/ops_train/rms_norm_backward/rms_norm_backward_operation.h (100%) rename src/{ => ops}/ops_train/rms_norm_backward/rms_norm_backward_ops_runner.cpp (100%) rename src/{ => ops}/ops_train/rms_norm_backward/rms_norm_backward_ops_runner.h (100%) rename src/{ => ops}/ops_train/rope_grad/rope_grad_operation.cpp (100%) rename src/{ => ops}/ops_train/rope_grad/rope_grad_operation.h (100%) rename src/{ => ops}/ops_train/rope_grad/rope_grad_ops_runner.cpp (100%) rename src/{ => ops}/ops_train/rope_grad/rope_grad_ops_runner.h (100%) rename src/{ => ops}/ops_train/strided_batch_matmul/stridedbatchmatmul_operation.cpp (100%) rename src/{ => ops}/ops_train/strided_batch_matmul/stridedbatchmatmul_operation.h (100%) rename src/{ => ops}/ops_train/strided_batch_matmul/stridedbatchmatmul_ops_runner.cpp (100%) rename src/{ => ops}/ops_train/strided_batch_matmul/stridedbatchmatmul_ops_runner.h (100%) rename src/{ => ops}/ops_train/unpad_with_hidden_state/unpad_with_hidden_state_operation.cpp (100%) rename src/{ => ops}/ops_train/unpad_with_hidden_state/unpad_with_hidden_state_operation.h (100%) rename src/{ => ops}/ops_train/unpad_with_hidden_state/unpad_with_hidden_state_ops_runner.cpp (100%) rename src/{ => ops}/ops_train/unpad_with_hidden_state/unpad_with_hidden_state_ops_runner.h (100%) diff --git a/src/CMakeLists.txt 
b/src/CMakeLists.txt index 389b643b..fb980df9 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -8,9 +8,9 @@ # See LICENSE in the root of the software repository for the full text of the License. # -set(ops_train_directory ${CMAKE_CURRENT_LIST_DIR}/ops_train) -set(ops_infer_directory ${CMAKE_CURRENT_LIST_DIR}/ops_infer) -set(ops_common_directory ${CMAKE_CURRENT_LIST_DIR}/ops_common) +set(ops_train_directory ${CMAKE_CURRENT_LIST_DIR}/ops/ops_train) +set(ops_infer_directory ${CMAKE_CURRENT_LIST_DIR}/ops/ops_infer) +set(ops_common_directory ${CMAKE_CURRENT_LIST_DIR}/ops/ops_common) set(atb_directory ${CMAKE_CURRENT_LIST_DIR}/atb) set(MSTX_PATH $ENV{ASCEND_HOME_PATH}/tools/mstx/include) set(ATB_INCLUDE_DIR $ENV{ASCEND_HOME_PATH}/include) diff --git a/src/ops_common/event_operation/event_operation.cpp b/src/ops/ops_common/event_operation/event_operation.cpp similarity index 100% rename from src/ops_common/event_operation/event_operation.cpp rename to src/ops/ops_common/event_operation/event_operation.cpp diff --git a/src/ops_common/event_operation/event_operation.h b/src/ops/ops_common/event_operation/event_operation.h similarity index 100% rename from src/ops_common/event_operation/event_operation.h rename to src/ops/ops_common/event_operation/event_operation.h diff --git a/src/ops_common/event_operation/event_runner.cpp b/src/ops/ops_common/event_operation/event_runner.cpp similarity index 100% rename from src/ops_common/event_operation/event_runner.cpp rename to src/ops/ops_common/event_operation/event_runner.cpp diff --git a/src/ops_common/event_operation/event_runner.h b/src/ops/ops_common/event_operation/event_runner.h similarity index 100% rename from src/ops_common/event_operation/event_runner.h rename to src/ops/ops_common/event_operation/event_runner.h diff --git a/src/ops_infer/activation/activation_operation.cpp b/src/ops/ops_infer/activation/activation_operation.cpp similarity index 100% rename from 
src/ops_infer/activation/activation_operation.cpp rename to src/ops/ops_infer/activation/activation_operation.cpp diff --git a/src/ops_infer/activation/activation_operation.h b/src/ops/ops_infer/activation/activation_operation.h similarity index 100% rename from src/ops_infer/activation/activation_operation.h rename to src/ops/ops_infer/activation/activation_operation.h diff --git a/src/ops_infer/activation/activation_ops_runner.cpp b/src/ops/ops_infer/activation/activation_ops_runner.cpp similarity index 100% rename from src/ops_infer/activation/activation_ops_runner.cpp rename to src/ops/ops_infer/activation/activation_ops_runner.cpp diff --git a/src/ops_infer/activation/activation_ops_runner.h b/src/ops/ops_infer/activation/activation_ops_runner.h similarity index 100% rename from src/ops_infer/activation/activation_ops_runner.h rename to src/ops/ops_infer/activation/activation_ops_runner.h diff --git a/src/ops_infer/all_gather/all_gather_hccl_runner.cpp b/src/ops/ops_infer/all_gather/all_gather_hccl_runner.cpp similarity index 100% rename from src/ops_infer/all_gather/all_gather_hccl_runner.cpp rename to src/ops/ops_infer/all_gather/all_gather_hccl_runner.cpp diff --git a/src/ops_infer/all_gather/all_gather_hccl_runner.h b/src/ops/ops_infer/all_gather/all_gather_hccl_runner.h similarity index 100% rename from src/ops_infer/all_gather/all_gather_hccl_runner.h rename to src/ops/ops_infer/all_gather/all_gather_hccl_runner.h diff --git a/src/ops_infer/all_gather/all_gather_lccl_runner.cpp b/src/ops/ops_infer/all_gather/all_gather_lccl_runner.cpp similarity index 100% rename from src/ops_infer/all_gather/all_gather_lccl_runner.cpp rename to src/ops/ops_infer/all_gather/all_gather_lccl_runner.cpp diff --git a/src/ops_infer/all_gather/all_gather_lccl_runner.h b/src/ops/ops_infer/all_gather/all_gather_lccl_runner.h similarity index 100% rename from src/ops_infer/all_gather/all_gather_lccl_runner.h rename to src/ops/ops_infer/all_gather/all_gather_lccl_runner.h diff 
--git a/src/ops_infer/all_gather/all_gather_operation.cpp b/src/ops/ops_infer/all_gather/all_gather_operation.cpp similarity index 100% rename from src/ops_infer/all_gather/all_gather_operation.cpp rename to src/ops/ops_infer/all_gather/all_gather_operation.cpp diff --git a/src/ops_infer/all_gather/all_gather_operation.h b/src/ops/ops_infer/all_gather/all_gather_operation.h similarity index 100% rename from src/ops_infer/all_gather/all_gather_operation.h rename to src/ops/ops_infer/all_gather/all_gather_operation.h diff --git a/src/ops_infer/all_gatherv/all_gatherv_hccl_runner.cpp b/src/ops/ops_infer/all_gatherv/all_gatherv_hccl_runner.cpp similarity index 100% rename from src/ops_infer/all_gatherv/all_gatherv_hccl_runner.cpp rename to src/ops/ops_infer/all_gatherv/all_gatherv_hccl_runner.cpp diff --git a/src/ops_infer/all_gatherv/all_gatherv_hccl_runner.h b/src/ops/ops_infer/all_gatherv/all_gatherv_hccl_runner.h similarity index 100% rename from src/ops_infer/all_gatherv/all_gatherv_hccl_runner.h rename to src/ops/ops_infer/all_gatherv/all_gatherv_hccl_runner.h diff --git a/src/ops_infer/all_gatherv/all_gatherv_operation.cpp b/src/ops/ops_infer/all_gatherv/all_gatherv_operation.cpp similarity index 100% rename from src/ops_infer/all_gatherv/all_gatherv_operation.cpp rename to src/ops/ops_infer/all_gatherv/all_gatherv_operation.cpp diff --git a/src/ops_infer/all_gatherv/all_gatherv_operation.h b/src/ops/ops_infer/all_gatherv/all_gatherv_operation.h similarity index 100% rename from src/ops_infer/all_gatherv/all_gatherv_operation.h rename to src/ops/ops_infer/all_gatherv/all_gatherv_operation.h diff --git a/src/ops_infer/all_reduce/all_reduce_hccl_runner.cpp b/src/ops/ops_infer/all_reduce/all_reduce_hccl_runner.cpp similarity index 100% rename from src/ops_infer/all_reduce/all_reduce_hccl_runner.cpp rename to src/ops/ops_infer/all_reduce/all_reduce_hccl_runner.cpp diff --git a/src/ops_infer/all_reduce/all_reduce_hccl_runner.h 
b/src/ops/ops_infer/all_reduce/all_reduce_hccl_runner.h similarity index 100% rename from src/ops_infer/all_reduce/all_reduce_hccl_runner.h rename to src/ops/ops_infer/all_reduce/all_reduce_hccl_runner.h diff --git a/src/ops_infer/all_reduce/all_reduce_lccl_runner.cpp b/src/ops/ops_infer/all_reduce/all_reduce_lccl_runner.cpp similarity index 100% rename from src/ops_infer/all_reduce/all_reduce_lccl_runner.cpp rename to src/ops/ops_infer/all_reduce/all_reduce_lccl_runner.cpp diff --git a/src/ops_infer/all_reduce/all_reduce_lccl_runner.h b/src/ops/ops_infer/all_reduce/all_reduce_lccl_runner.h similarity index 100% rename from src/ops_infer/all_reduce/all_reduce_lccl_runner.h rename to src/ops/ops_infer/all_reduce/all_reduce_lccl_runner.h diff --git a/src/ops_infer/all_reduce/all_reduce_operation.cpp b/src/ops/ops_infer/all_reduce/all_reduce_operation.cpp similarity index 100% rename from src/ops_infer/all_reduce/all_reduce_operation.cpp rename to src/ops/ops_infer/all_reduce/all_reduce_operation.cpp diff --git a/src/ops_infer/all_reduce/all_reduce_operation.h b/src/ops/ops_infer/all_reduce/all_reduce_operation.h similarity index 100% rename from src/ops_infer/all_reduce/all_reduce_operation.h rename to src/ops/ops_infer/all_reduce/all_reduce_operation.h diff --git a/src/ops_infer/all_to_all/all_to_all_hccl_runner.cpp b/src/ops/ops_infer/all_to_all/all_to_all_hccl_runner.cpp similarity index 100% rename from src/ops_infer/all_to_all/all_to_all_hccl_runner.cpp rename to src/ops/ops_infer/all_to_all/all_to_all_hccl_runner.cpp diff --git a/src/ops_infer/all_to_all/all_to_all_hccl_runner.h b/src/ops/ops_infer/all_to_all/all_to_all_hccl_runner.h similarity index 100% rename from src/ops_infer/all_to_all/all_to_all_hccl_runner.h rename to src/ops/ops_infer/all_to_all/all_to_all_hccl_runner.h diff --git a/src/ops_infer/all_to_all/all_to_all_lccl_runner.cpp b/src/ops/ops_infer/all_to_all/all_to_all_lccl_runner.cpp similarity index 100% rename from 
src/ops_infer/all_to_all/all_to_all_lccl_runner.cpp rename to src/ops/ops_infer/all_to_all/all_to_all_lccl_runner.cpp diff --git a/src/ops_infer/all_to_all/all_to_all_lccl_runner.h b/src/ops/ops_infer/all_to_all/all_to_all_lccl_runner.h similarity index 100% rename from src/ops_infer/all_to_all/all_to_all_lccl_runner.h rename to src/ops/ops_infer/all_to_all/all_to_all_lccl_runner.h diff --git a/src/ops_infer/all_to_all/all_to_all_operation.cpp b/src/ops/ops_infer/all_to_all/all_to_all_operation.cpp similarity index 100% rename from src/ops_infer/all_to_all/all_to_all_operation.cpp rename to src/ops/ops_infer/all_to_all/all_to_all_operation.cpp diff --git a/src/ops_infer/all_to_all/all_to_all_operation.h b/src/ops/ops_infer/all_to_all/all_to_all_operation.h similarity index 100% rename from src/ops_infer/all_to_all/all_to_all_operation.h rename to src/ops/ops_infer/all_to_all/all_to_all_operation.h diff --git a/src/ops_infer/all_to_allv/all_to_allv_hccl_runner.cpp b/src/ops/ops_infer/all_to_allv/all_to_allv_hccl_runner.cpp similarity index 100% rename from src/ops_infer/all_to_allv/all_to_allv_hccl_runner.cpp rename to src/ops/ops_infer/all_to_allv/all_to_allv_hccl_runner.cpp diff --git a/src/ops_infer/all_to_allv/all_to_allv_hccl_runner.h b/src/ops/ops_infer/all_to_allv/all_to_allv_hccl_runner.h similarity index 100% rename from src/ops_infer/all_to_allv/all_to_allv_hccl_runner.h rename to src/ops/ops_infer/all_to_allv/all_to_allv_hccl_runner.h diff --git a/src/ops_infer/all_to_allv/all_to_allv_operation.cpp b/src/ops/ops_infer/all_to_allv/all_to_allv_operation.cpp similarity index 100% rename from src/ops_infer/all_to_allv/all_to_allv_operation.cpp rename to src/ops/ops_infer/all_to_allv/all_to_allv_operation.cpp diff --git a/src/ops_infer/all_to_allv/all_to_allv_operation.h b/src/ops/ops_infer/all_to_allv/all_to_allv_operation.h similarity index 100% rename from src/ops_infer/all_to_allv/all_to_allv_operation.h rename to 
src/ops/ops_infer/all_to_allv/all_to_allv_operation.h diff --git a/src/ops_infer/all_to_allvv2/all_to_allvv2_hccl_runner.cpp b/src/ops/ops_infer/all_to_allvv2/all_to_allvv2_hccl_runner.cpp similarity index 100% rename from src/ops_infer/all_to_allvv2/all_to_allvv2_hccl_runner.cpp rename to src/ops/ops_infer/all_to_allvv2/all_to_allvv2_hccl_runner.cpp diff --git a/src/ops_infer/all_to_allvv2/all_to_allvv2_hccl_runner.h b/src/ops/ops_infer/all_to_allvv2/all_to_allvv2_hccl_runner.h similarity index 100% rename from src/ops_infer/all_to_allvv2/all_to_allvv2_hccl_runner.h rename to src/ops/ops_infer/all_to_allvv2/all_to_allvv2_hccl_runner.h diff --git a/src/ops_infer/all_to_allvv2/all_to_allvv2_operation.cpp b/src/ops/ops_infer/all_to_allvv2/all_to_allvv2_operation.cpp similarity index 100% rename from src/ops_infer/all_to_allvv2/all_to_allvv2_operation.cpp rename to src/ops/ops_infer/all_to_allvv2/all_to_allvv2_operation.cpp diff --git a/src/ops_infer/all_to_allvv2/all_to_allvv2_operation.h b/src/ops/ops_infer/all_to_allvv2/all_to_allvv2_operation.h similarity index 100% rename from src/ops_infer/all_to_allvv2/all_to_allvv2_operation.h rename to src/ops/ops_infer/all_to_allvv2/all_to_allvv2_operation.h diff --git a/src/ops_infer/as_strided/as_strided_operation.cpp b/src/ops/ops_infer/as_strided/as_strided_operation.cpp similarity index 100% rename from src/ops_infer/as_strided/as_strided_operation.cpp rename to src/ops/ops_infer/as_strided/as_strided_operation.cpp diff --git a/src/ops_infer/as_strided/as_strided_operation.h b/src/ops/ops_infer/as_strided/as_strided_operation.h similarity index 100% rename from src/ops_infer/as_strided/as_strided_operation.h rename to src/ops/ops_infer/as_strided/as_strided_operation.h diff --git a/src/ops_infer/as_strided/as_strided_ops_runner.cpp b/src/ops/ops_infer/as_strided/as_strided_ops_runner.cpp similarity index 100% rename from src/ops_infer/as_strided/as_strided_ops_runner.cpp rename to 
src/ops/ops_infer/as_strided/as_strided_ops_runner.cpp diff --git a/src/ops_infer/as_strided/as_strided_ops_runner.h b/src/ops/ops_infer/as_strided/as_strided_ops_runner.h similarity index 100% rename from src/ops_infer/as_strided/as_strided_ops_runner.h rename to src/ops/ops_infer/as_strided/as_strided_ops_runner.h diff --git a/src/ops_infer/block_copy/block_copy_operation.cpp b/src/ops/ops_infer/block_copy/block_copy_operation.cpp similarity index 100% rename from src/ops_infer/block_copy/block_copy_operation.cpp rename to src/ops/ops_infer/block_copy/block_copy_operation.cpp diff --git a/src/ops_infer/block_copy/block_copy_operation.h b/src/ops/ops_infer/block_copy/block_copy_operation.h similarity index 100% rename from src/ops_infer/block_copy/block_copy_operation.h rename to src/ops/ops_infer/block_copy/block_copy_operation.h diff --git a/src/ops_infer/block_copy/block_copy_ops_runner.cpp b/src/ops/ops_infer/block_copy/block_copy_ops_runner.cpp similarity index 100% rename from src/ops_infer/block_copy/block_copy_ops_runner.cpp rename to src/ops/ops_infer/block_copy/block_copy_ops_runner.cpp diff --git a/src/ops_infer/block_copy/block_copy_ops_runner.h b/src/ops/ops_infer/block_copy/block_copy_ops_runner.h similarity index 100% rename from src/ops_infer/block_copy/block_copy_ops_runner.h rename to src/ops/ops_infer/block_copy/block_copy_ops_runner.h diff --git a/src/ops_infer/broadcast/broadcast_hccl_runner.cpp b/src/ops/ops_infer/broadcast/broadcast_hccl_runner.cpp similarity index 100% rename from src/ops_infer/broadcast/broadcast_hccl_runner.cpp rename to src/ops/ops_infer/broadcast/broadcast_hccl_runner.cpp diff --git a/src/ops_infer/broadcast/broadcast_hccl_runner.h b/src/ops/ops_infer/broadcast/broadcast_hccl_runner.h similarity index 100% rename from src/ops_infer/broadcast/broadcast_hccl_runner.h rename to src/ops/ops_infer/broadcast/broadcast_hccl_runner.h diff --git a/src/ops_infer/broadcast/broadcast_lccl_runner.cpp 
b/src/ops/ops_infer/broadcast/broadcast_lccl_runner.cpp similarity index 100% rename from src/ops_infer/broadcast/broadcast_lccl_runner.cpp rename to src/ops/ops_infer/broadcast/broadcast_lccl_runner.cpp diff --git a/src/ops_infer/broadcast/broadcast_lccl_runner.h b/src/ops/ops_infer/broadcast/broadcast_lccl_runner.h similarity index 100% rename from src/ops_infer/broadcast/broadcast_lccl_runner.h rename to src/ops/ops_infer/broadcast/broadcast_lccl_runner.h diff --git a/src/ops_infer/broadcast/broadcast_operation.cpp b/src/ops/ops_infer/broadcast/broadcast_operation.cpp similarity index 100% rename from src/ops_infer/broadcast/broadcast_operation.cpp rename to src/ops/ops_infer/broadcast/broadcast_operation.cpp diff --git a/src/ops_infer/broadcast/broadcast_operation.h b/src/ops/ops_infer/broadcast/broadcast_operation.h similarity index 100% rename from src/ops_infer/broadcast/broadcast_operation.h rename to src/ops/ops_infer/broadcast/broadcast_operation.h diff --git a/src/ops_infer/cohere_layernorm/cohere_layernorm_operation.cpp b/src/ops/ops_infer/cohere_layernorm/cohere_layernorm_operation.cpp similarity index 100% rename from src/ops_infer/cohere_layernorm/cohere_layernorm_operation.cpp rename to src/ops/ops_infer/cohere_layernorm/cohere_layernorm_operation.cpp diff --git a/src/ops_infer/cohere_layernorm/cohere_layernorm_operation.h b/src/ops/ops_infer/cohere_layernorm/cohere_layernorm_operation.h similarity index 100% rename from src/ops_infer/cohere_layernorm/cohere_layernorm_operation.h rename to src/ops/ops_infer/cohere_layernorm/cohere_layernorm_operation.h diff --git a/src/ops_infer/cohere_layernorm/cohere_layernorm_runner.cpp b/src/ops/ops_infer/cohere_layernorm/cohere_layernorm_runner.cpp similarity index 100% rename from src/ops_infer/cohere_layernorm/cohere_layernorm_runner.cpp rename to src/ops/ops_infer/cohere_layernorm/cohere_layernorm_runner.cpp diff --git a/src/ops_infer/cohere_layernorm/cohere_layernorm_runner.h 
b/src/ops/ops_infer/cohere_layernorm/cohere_layernorm_runner.h similarity index 100% rename from src/ops_infer/cohere_layernorm/cohere_layernorm_runner.h rename to src/ops/ops_infer/cohere_layernorm/cohere_layernorm_runner.h diff --git a/src/ops_infer/concat/concat_operation.cpp b/src/ops/ops_infer/concat/concat_operation.cpp similarity index 100% rename from src/ops_infer/concat/concat_operation.cpp rename to src/ops/ops_infer/concat/concat_operation.cpp diff --git a/src/ops_infer/concat/concat_operation.h b/src/ops/ops_infer/concat/concat_operation.h similarity index 100% rename from src/ops_infer/concat/concat_operation.h rename to src/ops/ops_infer/concat/concat_operation.h diff --git a/src/ops_infer/concat/concat_ops_runner.cpp b/src/ops/ops_infer/concat/concat_ops_runner.cpp similarity index 100% rename from src/ops_infer/concat/concat_ops_runner.cpp rename to src/ops/ops_infer/concat/concat_ops_runner.cpp diff --git a/src/ops_infer/concat/concat_ops_runner.h b/src/ops/ops_infer/concat/concat_ops_runner.h similarity index 100% rename from src/ops_infer/concat/concat_ops_runner.h rename to src/ops/ops_infer/concat/concat_ops_runner.h diff --git a/src/ops_infer/cumsum/cumsum_operation.cpp b/src/ops/ops_infer/cumsum/cumsum_operation.cpp similarity index 100% rename from src/ops_infer/cumsum/cumsum_operation.cpp rename to src/ops/ops_infer/cumsum/cumsum_operation.cpp diff --git a/src/ops_infer/cumsum/cumsum_operation.h b/src/ops/ops_infer/cumsum/cumsum_operation.h similarity index 100% rename from src/ops_infer/cumsum/cumsum_operation.h rename to src/ops/ops_infer/cumsum/cumsum_operation.h diff --git a/src/ops_infer/cumsum/cumsum_ops_runner.cpp b/src/ops/ops_infer/cumsum/cumsum_ops_runner.cpp similarity index 100% rename from src/ops_infer/cumsum/cumsum_ops_runner.cpp rename to src/ops/ops_infer/cumsum/cumsum_ops_runner.cpp diff --git a/src/ops_infer/cumsum/cumsum_ops_runner.h b/src/ops/ops_infer/cumsum/cumsum_ops_runner.h similarity index 100% rename from 
src/ops_infer/cumsum/cumsum_ops_runner.h rename to src/ops/ops_infer/cumsum/cumsum_ops_runner.h diff --git a/src/ops_infer/dynamic_ntk/dynamic_ntk_operation.cpp b/src/ops/ops_infer/dynamic_ntk/dynamic_ntk_operation.cpp similarity index 100% rename from src/ops_infer/dynamic_ntk/dynamic_ntk_operation.cpp rename to src/ops/ops_infer/dynamic_ntk/dynamic_ntk_operation.cpp diff --git a/src/ops_infer/dynamic_ntk/dynamic_ntk_operation.h b/src/ops/ops_infer/dynamic_ntk/dynamic_ntk_operation.h similarity index 100% rename from src/ops_infer/dynamic_ntk/dynamic_ntk_operation.h rename to src/ops/ops_infer/dynamic_ntk/dynamic_ntk_operation.h diff --git a/src/ops_infer/dynamic_ntk/dynamic_ntk_ops_runner.cpp b/src/ops/ops_infer/dynamic_ntk/dynamic_ntk_ops_runner.cpp similarity index 100% rename from src/ops_infer/dynamic_ntk/dynamic_ntk_ops_runner.cpp rename to src/ops/ops_infer/dynamic_ntk/dynamic_ntk_ops_runner.cpp diff --git a/src/ops_infer/dynamic_ntk/dynamic_ntk_ops_runner.h b/src/ops/ops_infer/dynamic_ntk/dynamic_ntk_ops_runner.h similarity index 100% rename from src/ops_infer/dynamic_ntk/dynamic_ntk_ops_runner.h rename to src/ops/ops_infer/dynamic_ntk/dynamic_ntk_ops_runner.h diff --git a/src/ops_infer/elewise/elewise_operation.cpp b/src/ops/ops_infer/elewise/elewise_operation.cpp similarity index 100% rename from src/ops_infer/elewise/elewise_operation.cpp rename to src/ops/ops_infer/elewise/elewise_operation.cpp diff --git a/src/ops_infer/elewise/elewise_operation.h b/src/ops/ops_infer/elewise/elewise_operation.h similarity index 100% rename from src/ops_infer/elewise/elewise_operation.h rename to src/ops/ops_infer/elewise/elewise_operation.h diff --git a/src/ops_infer/elewise/elewise_ops_runner.cpp b/src/ops/ops_infer/elewise/elewise_ops_runner.cpp similarity index 100% rename from src/ops_infer/elewise/elewise_ops_runner.cpp rename to src/ops/ops_infer/elewise/elewise_ops_runner.cpp diff --git a/src/ops_infer/elewise/elewise_ops_runner.h 
b/src/ops/ops_infer/elewise/elewise_ops_runner.h similarity index 100% rename from src/ops_infer/elewise/elewise_ops_runner.h rename to src/ops/ops_infer/elewise/elewise_ops_runner.h diff --git a/src/ops_infer/faupdate/faupdate_operation.cpp b/src/ops/ops_infer/faupdate/faupdate_operation.cpp similarity index 100% rename from src/ops_infer/faupdate/faupdate_operation.cpp rename to src/ops/ops_infer/faupdate/faupdate_operation.cpp diff --git a/src/ops_infer/faupdate/faupdate_operation.h b/src/ops/ops_infer/faupdate/faupdate_operation.h similarity index 100% rename from src/ops_infer/faupdate/faupdate_operation.h rename to src/ops/ops_infer/faupdate/faupdate_operation.h diff --git a/src/ops_infer/faupdate/faupdate_ops_runner.cpp b/src/ops/ops_infer/faupdate/faupdate_ops_runner.cpp similarity index 100% rename from src/ops_infer/faupdate/faupdate_ops_runner.cpp rename to src/ops/ops_infer/faupdate/faupdate_ops_runner.cpp diff --git a/src/ops_infer/faupdate/faupdate_ops_runner.h b/src/ops/ops_infer/faupdate/faupdate_ops_runner.h similarity index 100% rename from src/ops_infer/faupdate/faupdate_ops_runner.h rename to src/ops/ops_infer/faupdate/faupdate_ops_runner.h diff --git a/src/ops_infer/fill/fill_operation.cpp b/src/ops/ops_infer/fill/fill_operation.cpp similarity index 100% rename from src/ops_infer/fill/fill_operation.cpp rename to src/ops/ops_infer/fill/fill_operation.cpp diff --git a/src/ops_infer/fill/fill_operation.h b/src/ops/ops_infer/fill/fill_operation.h similarity index 100% rename from src/ops_infer/fill/fill_operation.h rename to src/ops/ops_infer/fill/fill_operation.h diff --git a/src/ops_infer/fill/fill_ops_runner.cpp b/src/ops/ops_infer/fill/fill_ops_runner.cpp similarity index 100% rename from src/ops_infer/fill/fill_ops_runner.cpp rename to src/ops/ops_infer/fill/fill_ops_runner.cpp diff --git a/src/ops_infer/fill/fill_ops_runner.h b/src/ops/ops_infer/fill/fill_ops_runner.h similarity index 100% rename from src/ops_infer/fill/fill_ops_runner.h 
rename to src/ops/ops_infer/fill/fill_ops_runner.h diff --git a/src/ops_infer/fused_add_topk_div/atb_acl_fused_add_topk_div.cpp b/src/ops/ops_infer/fused_add_topk_div/atb_acl_fused_add_topk_div.cpp similarity index 100% rename from src/ops_infer/fused_add_topk_div/atb_acl_fused_add_topk_div.cpp rename to src/ops/ops_infer/fused_add_topk_div/atb_acl_fused_add_topk_div.cpp diff --git a/src/ops_infer/fused_add_topk_div/fused_add_topk_div_operation.cpp b/src/ops/ops_infer/fused_add_topk_div/fused_add_topk_div_operation.cpp similarity index 100% rename from src/ops_infer/fused_add_topk_div/fused_add_topk_div_operation.cpp rename to src/ops/ops_infer/fused_add_topk_div/fused_add_topk_div_operation.cpp diff --git a/src/ops_infer/fused_add_topk_div/fused_add_topk_div_operation.h b/src/ops/ops_infer/fused_add_topk_div/fused_add_topk_div_operation.h similarity index 100% rename from src/ops_infer/fused_add_topk_div/fused_add_topk_div_operation.h rename to src/ops/ops_infer/fused_add_topk_div/fused_add_topk_div_operation.h diff --git a/src/ops_infer/fused_add_topk_div/fused_add_topk_div_ops_runner.cpp b/src/ops/ops_infer/fused_add_topk_div/fused_add_topk_div_ops_runner.cpp similarity index 100% rename from src/ops_infer/fused_add_topk_div/fused_add_topk_div_ops_runner.cpp rename to src/ops/ops_infer/fused_add_topk_div/fused_add_topk_div_ops_runner.cpp diff --git a/src/ops_infer/fused_add_topk_div/fused_add_topk_div_ops_runner.h b/src/ops/ops_infer/fused_add_topk_div/fused_add_topk_div_ops_runner.h similarity index 100% rename from src/ops_infer/fused_add_topk_div/fused_add_topk_div_ops_runner.h rename to src/ops/ops_infer/fused_add_topk_div/fused_add_topk_div_ops_runner.h diff --git a/src/ops_infer/gather/gather_operation.cpp b/src/ops/ops_infer/gather/gather_operation.cpp similarity index 100% rename from src/ops_infer/gather/gather_operation.cpp rename to src/ops/ops_infer/gather/gather_operation.cpp diff --git a/src/ops_infer/gather/gather_operation.h 
b/src/ops/ops_infer/gather/gather_operation.h similarity index 100% rename from src/ops_infer/gather/gather_operation.h rename to src/ops/ops_infer/gather/gather_operation.h diff --git a/src/ops_infer/gather/gather_ops_runner.cpp b/src/ops/ops_infer/gather/gather_ops_runner.cpp similarity index 100% rename from src/ops_infer/gather/gather_ops_runner.cpp rename to src/ops/ops_infer/gather/gather_ops_runner.cpp diff --git a/src/ops_infer/gather/gather_ops_runner.h b/src/ops/ops_infer/gather/gather_ops_runner.h similarity index 100% rename from src/ops_infer/gather/gather_ops_runner.h rename to src/ops/ops_infer/gather/gather_ops_runner.h diff --git a/src/ops_infer/gather_pre_rms_norm/gather_pre_rms_norm_operation.cpp b/src/ops/ops_infer/gather_pre_rms_norm/gather_pre_rms_norm_operation.cpp similarity index 100% rename from src/ops_infer/gather_pre_rms_norm/gather_pre_rms_norm_operation.cpp rename to src/ops/ops_infer/gather_pre_rms_norm/gather_pre_rms_norm_operation.cpp diff --git a/src/ops_infer/gather_pre_rms_norm/gather_pre_rms_norm_operation.h b/src/ops/ops_infer/gather_pre_rms_norm/gather_pre_rms_norm_operation.h similarity index 100% rename from src/ops_infer/gather_pre_rms_norm/gather_pre_rms_norm_operation.h rename to src/ops/ops_infer/gather_pre_rms_norm/gather_pre_rms_norm_operation.h diff --git a/src/ops_infer/gather_pre_rms_norm/gather_pre_rms_norm_ops_runner.cpp b/src/ops/ops_infer/gather_pre_rms_norm/gather_pre_rms_norm_ops_runner.cpp similarity index 100% rename from src/ops_infer/gather_pre_rms_norm/gather_pre_rms_norm_ops_runner.cpp rename to src/ops/ops_infer/gather_pre_rms_norm/gather_pre_rms_norm_ops_runner.cpp diff --git a/src/ops_infer/gather_pre_rms_norm/gather_pre_rms_norm_ops_runner.h b/src/ops/ops_infer/gather_pre_rms_norm/gather_pre_rms_norm_ops_runner.h similarity index 100% rename from src/ops_infer/gather_pre_rms_norm/gather_pre_rms_norm_ops_runner.h rename to src/ops/ops_infer/gather_pre_rms_norm/gather_pre_rms_norm_ops_runner.h diff 
--git a/src/ops_infer/gating/gating_operation.cpp b/src/ops/ops_infer/gating/gating_operation.cpp similarity index 100% rename from src/ops_infer/gating/gating_operation.cpp rename to src/ops/ops_infer/gating/gating_operation.cpp diff --git a/src/ops_infer/gating/gating_operation.h b/src/ops/ops_infer/gating/gating_operation.h similarity index 100% rename from src/ops_infer/gating/gating_operation.h rename to src/ops/ops_infer/gating/gating_operation.h diff --git a/src/ops_infer/gating/gating_ops_runner.cpp b/src/ops/ops_infer/gating/gating_ops_runner.cpp similarity index 100% rename from src/ops_infer/gating/gating_ops_runner.cpp rename to src/ops/ops_infer/gating/gating_ops_runner.cpp diff --git a/src/ops_infer/gating/gating_ops_runner.h b/src/ops/ops_infer/gating/gating_ops_runner.h similarity index 100% rename from src/ops_infer/gating/gating_ops_runner.h rename to src/ops/ops_infer/gating/gating_ops_runner.h diff --git a/src/ops_infer/gmm_deq_swiglu_quant_gmm_deq/gmm_deq_swiglu_quant_gmm_deq_operation.cpp b/src/ops/ops_infer/gmm_deq_swiglu_quant_gmm_deq/gmm_deq_swiglu_quant_gmm_deq_operation.cpp similarity index 100% rename from src/ops_infer/gmm_deq_swiglu_quant_gmm_deq/gmm_deq_swiglu_quant_gmm_deq_operation.cpp rename to src/ops/ops_infer/gmm_deq_swiglu_quant_gmm_deq/gmm_deq_swiglu_quant_gmm_deq_operation.cpp diff --git a/src/ops_infer/gmm_deq_swiglu_quant_gmm_deq/gmm_deq_swiglu_quant_gmm_deq_operation.h b/src/ops/ops_infer/gmm_deq_swiglu_quant_gmm_deq/gmm_deq_swiglu_quant_gmm_deq_operation.h similarity index 100% rename from src/ops_infer/gmm_deq_swiglu_quant_gmm_deq/gmm_deq_swiglu_quant_gmm_deq_operation.h rename to src/ops/ops_infer/gmm_deq_swiglu_quant_gmm_deq/gmm_deq_swiglu_quant_gmm_deq_operation.h diff --git a/src/ops_infer/gmm_deq_swiglu_quant_gmm_deq/gmm_deq_swiglu_quant_gmm_deq_ops_runner.cpp b/src/ops/ops_infer/gmm_deq_swiglu_quant_gmm_deq/gmm_deq_swiglu_quant_gmm_deq_ops_runner.cpp similarity index 100% rename from 
src/ops_infer/gmm_deq_swiglu_quant_gmm_deq/gmm_deq_swiglu_quant_gmm_deq_ops_runner.cpp rename to src/ops/ops_infer/gmm_deq_swiglu_quant_gmm_deq/gmm_deq_swiglu_quant_gmm_deq_ops_runner.cpp diff --git a/src/ops_infer/gmm_deq_swiglu_quant_gmm_deq/gmm_deq_swiglu_quant_gmm_deq_ops_runner.h b/src/ops/ops_infer/gmm_deq_swiglu_quant_gmm_deq/gmm_deq_swiglu_quant_gmm_deq_ops_runner.h similarity index 100% rename from src/ops_infer/gmm_deq_swiglu_quant_gmm_deq/gmm_deq_swiglu_quant_gmm_deq_ops_runner.h rename to src/ops/ops_infer/gmm_deq_swiglu_quant_gmm_deq/gmm_deq_swiglu_quant_gmm_deq_ops_runner.h diff --git a/src/ops_infer/group_topk/group_topk_operation.cpp b/src/ops/ops_infer/group_topk/group_topk_operation.cpp similarity index 100% rename from src/ops_infer/group_topk/group_topk_operation.cpp rename to src/ops/ops_infer/group_topk/group_topk_operation.cpp diff --git a/src/ops_infer/group_topk/group_topk_operation.h b/src/ops/ops_infer/group_topk/group_topk_operation.h similarity index 100% rename from src/ops_infer/group_topk/group_topk_operation.h rename to src/ops/ops_infer/group_topk/group_topk_operation.h diff --git a/src/ops_infer/group_topk/group_topk_ops_runner.cpp b/src/ops/ops_infer/group_topk/group_topk_ops_runner.cpp similarity index 100% rename from src/ops_infer/group_topk/group_topk_ops_runner.cpp rename to src/ops/ops_infer/group_topk/group_topk_ops_runner.cpp diff --git a/src/ops_infer/group_topk/group_topk_ops_runner.h b/src/ops/ops_infer/group_topk/group_topk_ops_runner.h similarity index 100% rename from src/ops_infer/group_topk/group_topk_ops_runner.h rename to src/ops/ops_infer/group_topk/group_topk_ops_runner.h diff --git a/src/ops_infer/grouped_matmul_inplace_add/grouped_matmul_inplace_add_operation.cpp b/src/ops/ops_infer/grouped_matmul_inplace_add/grouped_matmul_inplace_add_operation.cpp similarity index 100% rename from src/ops_infer/grouped_matmul_inplace_add/grouped_matmul_inplace_add_operation.cpp rename to 
src/ops/ops_infer/grouped_matmul_inplace_add/grouped_matmul_inplace_add_operation.cpp diff --git a/src/ops_infer/grouped_matmul_inplace_add/grouped_matmul_inplace_add_operation.h b/src/ops/ops_infer/grouped_matmul_inplace_add/grouped_matmul_inplace_add_operation.h similarity index 100% rename from src/ops_infer/grouped_matmul_inplace_add/grouped_matmul_inplace_add_operation.h rename to src/ops/ops_infer/grouped_matmul_inplace_add/grouped_matmul_inplace_add_operation.h diff --git a/src/ops_infer/grouped_matmul_inplace_add/grouped_matmul_inplace_add_ops_runner.cpp b/src/ops/ops_infer/grouped_matmul_inplace_add/grouped_matmul_inplace_add_ops_runner.cpp similarity index 100% rename from src/ops_infer/grouped_matmul_inplace_add/grouped_matmul_inplace_add_ops_runner.cpp rename to src/ops/ops_infer/grouped_matmul_inplace_add/grouped_matmul_inplace_add_ops_runner.cpp diff --git a/src/ops_infer/grouped_matmul_inplace_add/grouped_matmul_inplace_add_ops_runner.h b/src/ops/ops_infer/grouped_matmul_inplace_add/grouped_matmul_inplace_add_ops_runner.h similarity index 100% rename from src/ops_infer/grouped_matmul_inplace_add/grouped_matmul_inplace_add_ops_runner.h rename to src/ops/ops_infer/grouped_matmul_inplace_add/grouped_matmul_inplace_add_ops_runner.h diff --git a/src/ops_infer/grouped_matmul_with_routing/grouped_matmul_with_routing_operation.cpp b/src/ops/ops_infer/grouped_matmul_with_routing/grouped_matmul_with_routing_operation.cpp similarity index 100% rename from src/ops_infer/grouped_matmul_with_routing/grouped_matmul_with_routing_operation.cpp rename to src/ops/ops_infer/grouped_matmul_with_routing/grouped_matmul_with_routing_operation.cpp diff --git a/src/ops_infer/grouped_matmul_with_routing/grouped_matmul_with_routing_operation.h b/src/ops/ops_infer/grouped_matmul_with_routing/grouped_matmul_with_routing_operation.h similarity index 100% rename from src/ops_infer/grouped_matmul_with_routing/grouped_matmul_with_routing_operation.h rename to 
src/ops/ops_infer/grouped_matmul_with_routing/grouped_matmul_with_routing_operation.h diff --git a/src/ops_infer/grouped_matmul_with_routing/grouped_matmul_with_routing_runner.cpp b/src/ops/ops_infer/grouped_matmul_with_routing/grouped_matmul_with_routing_runner.cpp similarity index 100% rename from src/ops_infer/grouped_matmul_with_routing/grouped_matmul_with_routing_runner.cpp rename to src/ops/ops_infer/grouped_matmul_with_routing/grouped_matmul_with_routing_runner.cpp diff --git a/src/ops_infer/grouped_matmul_with_routing/grouped_matmul_with_routing_runner.h b/src/ops/ops_infer/grouped_matmul_with_routing/grouped_matmul_with_routing_runner.h similarity index 100% rename from src/ops_infer/grouped_matmul_with_routing/grouped_matmul_with_routing_runner.h rename to src/ops/ops_infer/grouped_matmul_with_routing/grouped_matmul_with_routing_runner.h diff --git a/src/ops_infer/index_add/index_add_operation.cpp b/src/ops/ops_infer/index_add/index_add_operation.cpp similarity index 100% rename from src/ops_infer/index_add/index_add_operation.cpp rename to src/ops/ops_infer/index_add/index_add_operation.cpp diff --git a/src/ops_infer/index_add/index_add_operation.h b/src/ops/ops_infer/index_add/index_add_operation.h similarity index 100% rename from src/ops_infer/index_add/index_add_operation.h rename to src/ops/ops_infer/index_add/index_add_operation.h diff --git a/src/ops_infer/index_add/index_add_ops_runner.cpp b/src/ops/ops_infer/index_add/index_add_ops_runner.cpp similarity index 100% rename from src/ops_infer/index_add/index_add_ops_runner.cpp rename to src/ops/ops_infer/index_add/index_add_ops_runner.cpp diff --git a/src/ops_infer/index_add/index_add_ops_runner.h b/src/ops/ops_infer/index_add/index_add_ops_runner.h similarity index 100% rename from src/ops_infer/index_add/index_add_ops_runner.h rename to src/ops/ops_infer/index_add/index_add_ops_runner.h diff --git a/src/ops_infer/kv_cache/kv_cache_operation.cpp b/src/ops/ops_infer/kv_cache/kv_cache_operation.cpp 
similarity index 100% rename from src/ops_infer/kv_cache/kv_cache_operation.cpp rename to src/ops/ops_infer/kv_cache/kv_cache_operation.cpp diff --git a/src/ops_infer/kv_cache/kv_cache_operation.h b/src/ops/ops_infer/kv_cache/kv_cache_operation.h similarity index 100% rename from src/ops_infer/kv_cache/kv_cache_operation.h rename to src/ops/ops_infer/kv_cache/kv_cache_operation.h diff --git a/src/ops_infer/kv_cache/kv_cache_ops_runner.cpp b/src/ops/ops_infer/kv_cache/kv_cache_ops_runner.cpp similarity index 100% rename from src/ops_infer/kv_cache/kv_cache_ops_runner.cpp rename to src/ops/ops_infer/kv_cache/kv_cache_ops_runner.cpp diff --git a/src/ops_infer/kv_cache/kv_cache_ops_runner.h b/src/ops/ops_infer/kv_cache/kv_cache_ops_runner.h similarity index 100% rename from src/ops_infer/kv_cache/kv_cache_ops_runner.h rename to src/ops/ops_infer/kv_cache/kv_cache_ops_runner.h diff --git a/src/ops_infer/layer_norm/layer_norm_operation.cpp b/src/ops/ops_infer/layer_norm/layer_norm_operation.cpp similarity index 100% rename from src/ops_infer/layer_norm/layer_norm_operation.cpp rename to src/ops/ops_infer/layer_norm/layer_norm_operation.cpp diff --git a/src/ops_infer/layer_norm/layer_norm_operation.h b/src/ops/ops_infer/layer_norm/layer_norm_operation.h similarity index 100% rename from src/ops_infer/layer_norm/layer_norm_operation.h rename to src/ops/ops_infer/layer_norm/layer_norm_operation.h diff --git a/src/ops_infer/layer_norm/layer_norm_ops_runner.cpp b/src/ops/ops_infer/layer_norm/layer_norm_ops_runner.cpp similarity index 100% rename from src/ops_infer/layer_norm/layer_norm_ops_runner.cpp rename to src/ops/ops_infer/layer_norm/layer_norm_ops_runner.cpp diff --git a/src/ops_infer/layer_norm/layer_norm_ops_runner.h b/src/ops/ops_infer/layer_norm/layer_norm_ops_runner.h similarity index 100% rename from src/ops_infer/layer_norm/layer_norm_ops_runner.h rename to src/ops/ops_infer/layer_norm/layer_norm_ops_runner.h diff --git 
a/src/ops_infer/layer_norm_with_stride/layer_norm_with_stride_operation.cpp b/src/ops/ops_infer/layer_norm_with_stride/layer_norm_with_stride_operation.cpp similarity index 100% rename from src/ops_infer/layer_norm_with_stride/layer_norm_with_stride_operation.cpp rename to src/ops/ops_infer/layer_norm_with_stride/layer_norm_with_stride_operation.cpp diff --git a/src/ops_infer/layer_norm_with_stride/layer_norm_with_stride_operation.h b/src/ops/ops_infer/layer_norm_with_stride/layer_norm_with_stride_operation.h similarity index 100% rename from src/ops_infer/layer_norm_with_stride/layer_norm_with_stride_operation.h rename to src/ops/ops_infer/layer_norm_with_stride/layer_norm_with_stride_operation.h diff --git a/src/ops_infer/layer_norm_with_stride/layer_norm_with_stride_ops_runner.cpp b/src/ops/ops_infer/layer_norm_with_stride/layer_norm_with_stride_ops_runner.cpp similarity index 100% rename from src/ops_infer/layer_norm_with_stride/layer_norm_with_stride_ops_runner.cpp rename to src/ops/ops_infer/layer_norm_with_stride/layer_norm_with_stride_ops_runner.cpp diff --git a/src/ops_infer/layer_norm_with_stride/layer_norm_with_stride_ops_runner.h b/src/ops/ops_infer/layer_norm_with_stride/layer_norm_with_stride_ops_runner.h similarity index 100% rename from src/ops_infer/layer_norm_with_stride/layer_norm_with_stride_ops_runner.h rename to src/ops/ops_infer/layer_norm_with_stride/layer_norm_with_stride_ops_runner.h diff --git a/src/ops_infer/linear/linear_operation.cpp b/src/ops/ops_infer/linear/linear_operation.cpp similarity index 100% rename from src/ops_infer/linear/linear_operation.cpp rename to src/ops/ops_infer/linear/linear_operation.cpp diff --git a/src/ops_infer/linear/linear_operation.h b/src/ops/ops_infer/linear/linear_operation.h similarity index 100% rename from src/ops_infer/linear/linear_operation.h rename to src/ops/ops_infer/linear/linear_operation.h diff --git a/src/ops_infer/linear/linear_ops_runner.cpp b/src/ops/ops_infer/linear/linear_ops_runner.cpp 
similarity index 100% rename from src/ops_infer/linear/linear_ops_runner.cpp rename to src/ops/ops_infer/linear/linear_ops_runner.cpp diff --git a/src/ops_infer/linear/linear_ops_runner.h b/src/ops/ops_infer/linear/linear_ops_runner.h similarity index 100% rename from src/ops_infer/linear/linear_ops_runner.h rename to src/ops/ops_infer/linear/linear_ops_runner.h diff --git a/src/ops_infer/linear_parallel/linear_parallel_aclnn_runner.cpp b/src/ops/ops_infer/linear_parallel/linear_parallel_aclnn_runner.cpp similarity index 100% rename from src/ops_infer/linear_parallel/linear_parallel_aclnn_runner.cpp rename to src/ops/ops_infer/linear_parallel/linear_parallel_aclnn_runner.cpp diff --git a/src/ops_infer/linear_parallel/linear_parallel_aclnn_runner.h b/src/ops/ops_infer/linear_parallel/linear_parallel_aclnn_runner.h similarity index 100% rename from src/ops_infer/linear_parallel/linear_parallel_aclnn_runner.h rename to src/ops/ops_infer/linear_parallel/linear_parallel_aclnn_runner.h diff --git a/src/ops_infer/linear_parallel/linear_parallel_graph_runner.cpp b/src/ops/ops_infer/linear_parallel/linear_parallel_graph_runner.cpp similarity index 100% rename from src/ops_infer/linear_parallel/linear_parallel_graph_runner.cpp rename to src/ops/ops_infer/linear_parallel/linear_parallel_graph_runner.cpp diff --git a/src/ops_infer/linear_parallel/linear_parallel_graph_runner.h b/src/ops/ops_infer/linear_parallel/linear_parallel_graph_runner.h similarity index 100% rename from src/ops_infer/linear_parallel/linear_parallel_graph_runner.h rename to src/ops/ops_infer/linear_parallel/linear_parallel_graph_runner.h diff --git a/src/ops_infer/linear_parallel/linear_parallel_lcoc_runner.cpp b/src/ops/ops_infer/linear_parallel/linear_parallel_lcoc_runner.cpp similarity index 100% rename from src/ops_infer/linear_parallel/linear_parallel_lcoc_runner.cpp rename to src/ops/ops_infer/linear_parallel/linear_parallel_lcoc_runner.cpp diff --git 
a/src/ops_infer/linear_parallel/linear_parallel_lcoc_runner.h b/src/ops/ops_infer/linear_parallel/linear_parallel_lcoc_runner.h similarity index 100% rename from src/ops_infer/linear_parallel/linear_parallel_lcoc_runner.h rename to src/ops/ops_infer/linear_parallel/linear_parallel_lcoc_runner.h diff --git a/src/ops_infer/linear_parallel/linear_parallel_operation.cpp b/src/ops/ops_infer/linear_parallel/linear_parallel_operation.cpp similarity index 100% rename from src/ops_infer/linear_parallel/linear_parallel_operation.cpp rename to src/ops/ops_infer/linear_parallel/linear_parallel_operation.cpp diff --git a/src/ops_infer/linear_parallel/linear_parallel_operation.h b/src/ops/ops_infer/linear_parallel/linear_parallel_operation.h similarity index 100% rename from src/ops_infer/linear_parallel/linear_parallel_operation.h rename to src/ops/ops_infer/linear_parallel/linear_parallel_operation.h diff --git a/src/ops_infer/linear_sparse/linear_sparse_operation.cpp b/src/ops/ops_infer/linear_sparse/linear_sparse_operation.cpp similarity index 100% rename from src/ops_infer/linear_sparse/linear_sparse_operation.cpp rename to src/ops/ops_infer/linear_sparse/linear_sparse_operation.cpp diff --git a/src/ops_infer/linear_sparse/linear_sparse_operation.h b/src/ops/ops_infer/linear_sparse/linear_sparse_operation.h similarity index 100% rename from src/ops_infer/linear_sparse/linear_sparse_operation.h rename to src/ops/ops_infer/linear_sparse/linear_sparse_operation.h diff --git a/src/ops_infer/linear_sparse/linear_sparse_ops_runner.cpp b/src/ops/ops_infer/linear_sparse/linear_sparse_ops_runner.cpp similarity index 100% rename from src/ops_infer/linear_sparse/linear_sparse_ops_runner.cpp rename to src/ops/ops_infer/linear_sparse/linear_sparse_ops_runner.cpp diff --git a/src/ops_infer/linear_sparse/linear_sparse_ops_runner.h b/src/ops/ops_infer/linear_sparse/linear_sparse_ops_runner.h similarity index 100% rename from src/ops_infer/linear_sparse/linear_sparse_ops_runner.h rename to 
src/ops/ops_infer/linear_sparse/linear_sparse_ops_runner.h diff --git a/src/ops_infer/mla_preprocess/atb_acl_mla_preprocess.cpp b/src/ops/ops_infer/mla_preprocess/atb_acl_mla_preprocess.cpp similarity index 100% rename from src/ops_infer/mla_preprocess/atb_acl_mla_preprocess.cpp rename to src/ops/ops_infer/mla_preprocess/atb_acl_mla_preprocess.cpp diff --git a/src/ops_infer/mla_preprocess/mla_preprocess_aclnn_runner.cpp b/src/ops/ops_infer/mla_preprocess/mla_preprocess_aclnn_runner.cpp similarity index 100% rename from src/ops_infer/mla_preprocess/mla_preprocess_aclnn_runner.cpp rename to src/ops/ops_infer/mla_preprocess/mla_preprocess_aclnn_runner.cpp diff --git a/src/ops_infer/mla_preprocess/mla_preprocess_aclnn_runner.h b/src/ops/ops_infer/mla_preprocess/mla_preprocess_aclnn_runner.h similarity index 100% rename from src/ops_infer/mla_preprocess/mla_preprocess_aclnn_runner.h rename to src/ops/ops_infer/mla_preprocess/mla_preprocess_aclnn_runner.h diff --git a/src/ops_infer/mla_preprocess/mla_preprocess_operation.cpp b/src/ops/ops_infer/mla_preprocess/mla_preprocess_operation.cpp similarity index 100% rename from src/ops_infer/mla_preprocess/mla_preprocess_operation.cpp rename to src/ops/ops_infer/mla_preprocess/mla_preprocess_operation.cpp diff --git a/src/ops_infer/mla_preprocess/mla_preprocess_operation.h b/src/ops/ops_infer/mla_preprocess/mla_preprocess_operation.h similarity index 100% rename from src/ops_infer/mla_preprocess/mla_preprocess_operation.h rename to src/ops/ops_infer/mla_preprocess/mla_preprocess_operation.h diff --git a/src/ops_infer/mla_preprocess/mla_preprocess_ops_runner.cpp b/src/ops/ops_infer/mla_preprocess/mla_preprocess_ops_runner.cpp similarity index 100% rename from src/ops_infer/mla_preprocess/mla_preprocess_ops_runner.cpp rename to src/ops/ops_infer/mla_preprocess/mla_preprocess_ops_runner.cpp diff --git a/src/ops_infer/mla_preprocess/mla_preprocess_ops_runner.h b/src/ops/ops_infer/mla_preprocess/mla_preprocess_ops_runner.h 
similarity index 100% rename from src/ops_infer/mla_preprocess/mla_preprocess_ops_runner.h rename to src/ops/ops_infer/mla_preprocess/mla_preprocess_ops_runner.h diff --git a/src/ops_infer/mla_preprocess/mla_preprocess_ops_runner_split.cpp b/src/ops/ops_infer/mla_preprocess/mla_preprocess_ops_runner_split.cpp similarity index 100% rename from src/ops_infer/mla_preprocess/mla_preprocess_ops_runner_split.cpp rename to src/ops/ops_infer/mla_preprocess/mla_preprocess_ops_runner_split.cpp diff --git a/src/ops_infer/mla_preprocess/mla_preprocess_ops_runner_split.h b/src/ops/ops_infer/mla_preprocess/mla_preprocess_ops_runner_split.h similarity index 100% rename from src/ops_infer/mla_preprocess/mla_preprocess_ops_runner_split.h rename to src/ops/ops_infer/mla_preprocess/mla_preprocess_ops_runner_split.h diff --git a/src/ops_infer/mm_deq_swiglu_quant_mm_deq/mm_deq_swiglu_quant_mm_deq_operation.cpp b/src/ops/ops_infer/mm_deq_swiglu_quant_mm_deq/mm_deq_swiglu_quant_mm_deq_operation.cpp similarity index 100% rename from src/ops_infer/mm_deq_swiglu_quant_mm_deq/mm_deq_swiglu_quant_mm_deq_operation.cpp rename to src/ops/ops_infer/mm_deq_swiglu_quant_mm_deq/mm_deq_swiglu_quant_mm_deq_operation.cpp diff --git a/src/ops_infer/mm_deq_swiglu_quant_mm_deq/mm_deq_swiglu_quant_mm_deq_operation.h b/src/ops/ops_infer/mm_deq_swiglu_quant_mm_deq/mm_deq_swiglu_quant_mm_deq_operation.h similarity index 100% rename from src/ops_infer/mm_deq_swiglu_quant_mm_deq/mm_deq_swiglu_quant_mm_deq_operation.h rename to src/ops/ops_infer/mm_deq_swiglu_quant_mm_deq/mm_deq_swiglu_quant_mm_deq_operation.h diff --git a/src/ops_infer/mm_deq_swiglu_quant_mm_deq/mm_deq_swiglu_quant_mm_deq_ops_runner.cpp b/src/ops/ops_infer/mm_deq_swiglu_quant_mm_deq/mm_deq_swiglu_quant_mm_deq_ops_runner.cpp similarity index 100% rename from src/ops_infer/mm_deq_swiglu_quant_mm_deq/mm_deq_swiglu_quant_mm_deq_ops_runner.cpp rename to src/ops/ops_infer/mm_deq_swiglu_quant_mm_deq/mm_deq_swiglu_quant_mm_deq_ops_runner.cpp diff --git 
a/src/ops_infer/mm_deq_swiglu_quant_mm_deq/mm_deq_swiglu_quant_mm_deq_ops_runner.h b/src/ops/ops_infer/mm_deq_swiglu_quant_mm_deq/mm_deq_swiglu_quant_mm_deq_ops_runner.h similarity index 100% rename from src/ops_infer/mm_deq_swiglu_quant_mm_deq/mm_deq_swiglu_quant_mm_deq_ops_runner.h rename to src/ops/ops_infer/mm_deq_swiglu_quant_mm_deq/mm_deq_swiglu_quant_mm_deq_ops_runner.h diff --git a/src/ops_infer/multi_latent_attention/atb_acl_mla.cpp b/src/ops/ops_infer/multi_latent_attention/atb_acl_mla.cpp similarity index 100% rename from src/ops_infer/multi_latent_attention/atb_acl_mla.cpp rename to src/ops/ops_infer/multi_latent_attention/atb_acl_mla.cpp diff --git a/src/ops_infer/multi_latent_attention/multi_latent_attention_operation.cpp b/src/ops/ops_infer/multi_latent_attention/multi_latent_attention_operation.cpp similarity index 100% rename from src/ops_infer/multi_latent_attention/multi_latent_attention_operation.cpp rename to src/ops/ops_infer/multi_latent_attention/multi_latent_attention_operation.cpp diff --git a/src/ops_infer/multi_latent_attention/multi_latent_attention_operation.h b/src/ops/ops_infer/multi_latent_attention/multi_latent_attention_operation.h similarity index 100% rename from src/ops_infer/multi_latent_attention/multi_latent_attention_operation.h rename to src/ops/ops_infer/multi_latent_attention/multi_latent_attention_operation.h diff --git a/src/ops_infer/multi_latent_attention/multi_latent_attention_ops_runner.cpp b/src/ops/ops_infer/multi_latent_attention/multi_latent_attention_ops_runner.cpp similarity index 100% rename from src/ops_infer/multi_latent_attention/multi_latent_attention_ops_runner.cpp rename to src/ops/ops_infer/multi_latent_attention/multi_latent_attention_ops_runner.cpp diff --git a/src/ops_infer/multi_latent_attention/multi_latent_attention_ops_runner.h b/src/ops/ops_infer/multi_latent_attention/multi_latent_attention_ops_runner.h similarity index 100% rename from 
src/ops_infer/multi_latent_attention/multi_latent_attention_ops_runner.h rename to src/ops/ops_infer/multi_latent_attention/multi_latent_attention_ops_runner.h diff --git a/src/ops_infer/multi_latent_attention/multi_latent_attention_ops_runner_prefill.cpp b/src/ops/ops_infer/multi_latent_attention/multi_latent_attention_ops_runner_prefill.cpp similarity index 100% rename from src/ops_infer/multi_latent_attention/multi_latent_attention_ops_runner_prefill.cpp rename to src/ops/ops_infer/multi_latent_attention/multi_latent_attention_ops_runner_prefill.cpp diff --git a/src/ops_infer/multi_latent_attention/multi_latent_attention_ops_runner_prefill.h b/src/ops/ops_infer/multi_latent_attention/multi_latent_attention_ops_runner_prefill.h similarity index 100% rename from src/ops_infer/multi_latent_attention/multi_latent_attention_ops_runner_prefill.h rename to src/ops/ops_infer/multi_latent_attention/multi_latent_attention_ops_runner_prefill.h diff --git a/src/ops_infer/multi_latent_attention/param.cpp b/src/ops/ops_infer/multi_latent_attention/param.cpp similarity index 100% rename from src/ops_infer/multi_latent_attention/param.cpp rename to src/ops/ops_infer/multi_latent_attention/param.cpp diff --git a/src/ops_infer/multi_latent_attention/param.h b/src/ops/ops_infer/multi_latent_attention/param.h similarity index 100% rename from src/ops_infer/multi_latent_attention/param.h rename to src/ops/ops_infer/multi_latent_attention/param.h diff --git a/src/ops_infer/multinomial/multinomial_operation.cpp b/src/ops/ops_infer/multinomial/multinomial_operation.cpp similarity index 100% rename from src/ops_infer/multinomial/multinomial_operation.cpp rename to src/ops/ops_infer/multinomial/multinomial_operation.cpp diff --git a/src/ops_infer/multinomial/multinomial_operation.h b/src/ops/ops_infer/multinomial/multinomial_operation.h similarity index 100% rename from src/ops_infer/multinomial/multinomial_operation.h rename to src/ops/ops_infer/multinomial/multinomial_operation.h diff 
--git a/src/ops_infer/multinomial/multinomial_ops_runner.cpp b/src/ops/ops_infer/multinomial/multinomial_ops_runner.cpp similarity index 100% rename from src/ops_infer/multinomial/multinomial_ops_runner.cpp rename to src/ops/ops_infer/multinomial/multinomial_ops_runner.cpp diff --git a/src/ops_infer/multinomial/multinomial_ops_runner.h b/src/ops/ops_infer/multinomial/multinomial_ops_runner.h similarity index 100% rename from src/ops_infer/multinomial/multinomial_ops_runner.h rename to src/ops/ops_infer/multinomial/multinomial_ops_runner.h diff --git a/src/ops_infer/nonzero/nonzero_operation.cpp b/src/ops/ops_infer/nonzero/nonzero_operation.cpp similarity index 100% rename from src/ops_infer/nonzero/nonzero_operation.cpp rename to src/ops/ops_infer/nonzero/nonzero_operation.cpp diff --git a/src/ops_infer/nonzero/nonzero_operation.h b/src/ops/ops_infer/nonzero/nonzero_operation.h similarity index 100% rename from src/ops_infer/nonzero/nonzero_operation.h rename to src/ops/ops_infer/nonzero/nonzero_operation.h diff --git a/src/ops_infer/nonzero/nonzero_runner.cpp b/src/ops/ops_infer/nonzero/nonzero_runner.cpp similarity index 100% rename from src/ops_infer/nonzero/nonzero_runner.cpp rename to src/ops/ops_infer/nonzero/nonzero_runner.cpp diff --git a/src/ops_infer/nonzero/nonzero_runner.h b/src/ops/ops_infer/nonzero/nonzero_runner.h similarity index 100% rename from src/ops_infer/nonzero/nonzero_runner.h rename to src/ops/ops_infer/nonzero/nonzero_runner.h diff --git a/src/ops_infer/norm_rope_reshape/norm_rope_reshape_operation.cpp b/src/ops/ops_infer/norm_rope_reshape/norm_rope_reshape_operation.cpp similarity index 97% rename from src/ops_infer/norm_rope_reshape/norm_rope_reshape_operation.cpp rename to src/ops/ops_infer/norm_rope_reshape/norm_rope_reshape_operation.cpp index 00278615..864df1ca 100644 --- a/src/ops_infer/norm_rope_reshape/norm_rope_reshape_operation.cpp +++ b/src/ops/ops_infer/norm_rope_reshape/norm_rope_reshape_operation.cpp @@ -55,8 +55,7 @@ 
template <> Status CreateOperation(const infer::NormRopeReshapeParam &opParam, O return NO_ERROR; } -NormRopeReshapeOperation::NormRopeReshapeOperation -(const infer::NormRopeReshapeParam ¶m) +NormRopeReshapeOperation::NormRopeReshapeOperation(const infer::NormRopeReshapeParam ¶m) : OperationBase("NormRopeReshapeOperation"), param_(param) { operationIr_ = GetSingleton().GetOperationIr("NormRopeReshapeOperation"); @@ -206,8 +205,8 @@ Status NormRopeReshapeOperation::CheckOutTensorSame return NO_ERROR; } -bool NormRopeReshapeOperation::GammaBetaTensorCheck -(const TensorDesc &xTensorDesc, const TensorDesc &tensorDesc2) const +bool NormRopeReshapeOperation::GammaBetaTensorCheck( + const TensorDesc &xTensorDesc, const TensorDesc &tensorDesc2) const { int embedDim = xTensorDesc.shape.dims[xTensorDesc.shape.dimNum - 1]; if (xTensorDesc.dtype != tensorDesc2.dtype || xTensorDesc.format != tensorDesc2.format) { diff --git a/src/ops_infer/norm_rope_reshape/norm_rope_reshape_operation.h b/src/ops/ops_infer/norm_rope_reshape/norm_rope_reshape_operation.h similarity index 100% rename from src/ops_infer/norm_rope_reshape/norm_rope_reshape_operation.h rename to src/ops/ops_infer/norm_rope_reshape/norm_rope_reshape_operation.h diff --git a/src/ops_infer/norm_rope_reshape/norm_rope_reshape_ops_runner.cpp b/src/ops/ops_infer/norm_rope_reshape/norm_rope_reshape_ops_runner.cpp similarity index 92% rename from src/ops_infer/norm_rope_reshape/norm_rope_reshape_ops_runner.cpp rename to src/ops/ops_infer/norm_rope_reshape/norm_rope_reshape_ops_runner.cpp index 8bcf8c45..bca072d9 100644 --- a/src/ops_infer/norm_rope_reshape/norm_rope_reshape_ops_runner.cpp +++ b/src/ops/ops_infer/norm_rope_reshape/norm_rope_reshape_ops_runner.cpp @@ -22,8 +22,8 @@ void NormRopeReshapeOpsRunner::SetNormRopeReshapeParam( asdopsParam.rotaryCoeff = inferParam.rotaryCoeff; } -void NormRopeReshapeOpsRunner::BuildNormRopeReshapeGraph -(const AtbOps::OpParam::RmsNormAndRopeAndReshapeAndCache &normRopeReshapeParam) 
+void NormRopeReshapeOpsRunner::BuildNormRopeReshapeGraph( + const AtbOps::OpParam::RmsNormAndRopeAndReshapeAndCache &normRopeReshapeParam) { kernelGraph_.inTensors.resize(IN_TENSOR_COUNT_SEVEN); size_t inId = 0; @@ -47,8 +47,7 @@ void NormRopeReshapeOpsRunner::BuildNormRopeReshapeGraph normRopeReshapeNode.outTensors = {&keycacheOutTensor}; } -NormRopeReshapeOpsRunner::NormRopeReshapeOpsRunner -(const infer::NormRopeReshapeParam ¶m) +NormRopeReshapeOpsRunner::NormRopeReshapeOpsRunner(const infer::NormRopeReshapeParam ¶m) : OpsRunner("NormRopeReshapeOpsRunner", RUNNER_TYPE_NORM_ROPE_RESHAPE), param_(param) { AtbOps::OpParam::RmsNormAndRopeAndReshapeAndCache rmsNormAndRopeAndReshapeAndCacheParam; diff --git a/src/ops_infer/norm_rope_reshape/norm_rope_reshape_ops_runner.h b/src/ops/ops_infer/norm_rope_reshape/norm_rope_reshape_ops_runner.h similarity index 100% rename from src/ops_infer/norm_rope_reshape/norm_rope_reshape_ops_runner.h rename to src/ops/ops_infer/norm_rope_reshape/norm_rope_reshape_ops_runner.h diff --git a/src/ops_infer/onehot/onehot_operation.cpp b/src/ops/ops_infer/onehot/onehot_operation.cpp similarity index 100% rename from src/ops_infer/onehot/onehot_operation.cpp rename to src/ops/ops_infer/onehot/onehot_operation.cpp diff --git a/src/ops_infer/onehot/onehot_operation.h b/src/ops/ops_infer/onehot/onehot_operation.h similarity index 100% rename from src/ops_infer/onehot/onehot_operation.h rename to src/ops/ops_infer/onehot/onehot_operation.h diff --git a/src/ops_infer/onehot/onehot_ops_runner.cpp b/src/ops/ops_infer/onehot/onehot_ops_runner.cpp similarity index 100% rename from src/ops_infer/onehot/onehot_ops_runner.cpp rename to src/ops/ops_infer/onehot/onehot_ops_runner.cpp diff --git a/src/ops_infer/onehot/onehot_ops_runner.h b/src/ops/ops_infer/onehot/onehot_ops_runner.h similarity index 100% rename from src/ops_infer/onehot/onehot_ops_runner.h rename to src/ops/ops_infer/onehot/onehot_ops_runner.h diff --git 
a/src/ops_infer/pad/pad_operation.cpp b/src/ops/ops_infer/pad/pad_operation.cpp similarity index 100% rename from src/ops_infer/pad/pad_operation.cpp rename to src/ops/ops_infer/pad/pad_operation.cpp diff --git a/src/ops_infer/pad/pad_operation.h b/src/ops/ops_infer/pad/pad_operation.h similarity index 100% rename from src/ops_infer/pad/pad_operation.h rename to src/ops/ops_infer/pad/pad_operation.h diff --git a/src/ops_infer/pad/pad_ops_runner.cpp b/src/ops/ops_infer/pad/pad_ops_runner.cpp similarity index 100% rename from src/ops_infer/pad/pad_ops_runner.cpp rename to src/ops/ops_infer/pad/pad_ops_runner.cpp diff --git a/src/ops_infer/pad/pad_ops_runner.h b/src/ops/ops_infer/pad/pad_ops_runner.h similarity index 100% rename from src/ops_infer/pad/pad_ops_runner.h rename to src/ops/ops_infer/pad/pad_ops_runner.h diff --git a/src/ops_infer/paged_attention/paged_attention_operation.cpp b/src/ops/ops_infer/paged_attention/paged_attention_operation.cpp similarity index 100% rename from src/ops_infer/paged_attention/paged_attention_operation.cpp rename to src/ops/ops_infer/paged_attention/paged_attention_operation.cpp diff --git a/src/ops_infer/paged_attention/paged_attention_operation.h b/src/ops/ops_infer/paged_attention/paged_attention_operation.h similarity index 100% rename from src/ops_infer/paged_attention/paged_attention_operation.h rename to src/ops/ops_infer/paged_attention/paged_attention_operation.h diff --git a/src/ops_infer/paged_attention/paged_attention_ops_runner.cpp b/src/ops/ops_infer/paged_attention/paged_attention_ops_runner.cpp similarity index 100% rename from src/ops_infer/paged_attention/paged_attention_ops_runner.cpp rename to src/ops/ops_infer/paged_attention/paged_attention_ops_runner.cpp diff --git a/src/ops_infer/paged_attention/paged_attention_ops_runner.h b/src/ops/ops_infer/paged_attention/paged_attention_ops_runner.h similarity index 100% rename from src/ops_infer/paged_attention/paged_attention_ops_runner.h rename to 
src/ops/ops_infer/paged_attention/paged_attention_ops_runner.h diff --git a/src/ops_infer/paged_attention/paged_attention_ops_runner_910a.cpp b/src/ops/ops_infer/paged_attention/paged_attention_ops_runner_910a.cpp similarity index 100% rename from src/ops_infer/paged_attention/paged_attention_ops_runner_910a.cpp rename to src/ops/ops_infer/paged_attention/paged_attention_ops_runner_910a.cpp diff --git a/src/ops_infer/paged_attention/paged_attention_ops_runner_910a.h b/src/ops/ops_infer/paged_attention/paged_attention_ops_runner_910a.h similarity index 100% rename from src/ops_infer/paged_attention/paged_attention_ops_runner_910a.h rename to src/ops/ops_infer/paged_attention/paged_attention_ops_runner_910a.h diff --git a/src/ops_infer/paged_attention/paged_attention_runner_utils.cpp b/src/ops/ops_infer/paged_attention/paged_attention_runner_utils.cpp similarity index 100% rename from src/ops_infer/paged_attention/paged_attention_runner_utils.cpp rename to src/ops/ops_infer/paged_attention/paged_attention_runner_utils.cpp diff --git a/src/ops_infer/paged_attention/paged_attention_runner_utils.h b/src/ops/ops_infer/paged_attention/paged_attention_runner_utils.h similarity index 100% rename from src/ops_infer/paged_attention/paged_attention_runner_utils.h rename to src/ops/ops_infer/paged_attention/paged_attention_runner_utils.h diff --git a/src/ops_infer/paged_attention/param.cpp b/src/ops/ops_infer/paged_attention/param.cpp similarity index 100% rename from src/ops_infer/paged_attention/param.cpp rename to src/ops/ops_infer/paged_attention/param.cpp diff --git a/src/ops_infer/paged_attention/param.h b/src/ops/ops_infer/paged_attention/param.h similarity index 100% rename from src/ops_infer/paged_attention/param.h rename to src/ops/ops_infer/paged_attention/param.h diff --git a/src/ops_infer/paged_cache_load/atb_acl_paged_cache_load.cpp b/src/ops/ops_infer/paged_cache_load/atb_acl_paged_cache_load.cpp similarity index 100% rename from 
src/ops_infer/paged_cache_load/atb_acl_paged_cache_load.cpp rename to src/ops/ops_infer/paged_cache_load/atb_acl_paged_cache_load.cpp diff --git a/src/ops_infer/paged_cache_load/paged_cache_load_operation.cpp b/src/ops/ops_infer/paged_cache_load/paged_cache_load_operation.cpp similarity index 100% rename from src/ops_infer/paged_cache_load/paged_cache_load_operation.cpp rename to src/ops/ops_infer/paged_cache_load/paged_cache_load_operation.cpp diff --git a/src/ops_infer/paged_cache_load/paged_cache_load_operation.h b/src/ops/ops_infer/paged_cache_load/paged_cache_load_operation.h similarity index 100% rename from src/ops_infer/paged_cache_load/paged_cache_load_operation.h rename to src/ops/ops_infer/paged_cache_load/paged_cache_load_operation.h diff --git a/src/ops_infer/paged_cache_load/paged_cache_load_ops_runner.cpp b/src/ops/ops_infer/paged_cache_load/paged_cache_load_ops_runner.cpp similarity index 100% rename from src/ops_infer/paged_cache_load/paged_cache_load_ops_runner.cpp rename to src/ops/ops_infer/paged_cache_load/paged_cache_load_ops_runner.cpp diff --git a/src/ops_infer/paged_cache_load/paged_cache_load_ops_runner.h b/src/ops/ops_infer/paged_cache_load/paged_cache_load_ops_runner.h similarity index 100% rename from src/ops_infer/paged_cache_load/paged_cache_load_ops_runner.h rename to src/ops/ops_infer/paged_cache_load/paged_cache_load_ops_runner.h diff --git a/src/ops_infer/razor_fusion_attention/razor_fusion_attention_operation.cpp b/src/ops/ops_infer/razor_fusion_attention/razor_fusion_attention_operation.cpp similarity index 100% rename from src/ops_infer/razor_fusion_attention/razor_fusion_attention_operation.cpp rename to src/ops/ops_infer/razor_fusion_attention/razor_fusion_attention_operation.cpp diff --git a/src/ops_infer/razor_fusion_attention/razor_fusion_attention_operation.h b/src/ops/ops_infer/razor_fusion_attention/razor_fusion_attention_operation.h similarity index 100% rename from 
src/ops_infer/razor_fusion_attention/razor_fusion_attention_operation.h rename to src/ops/ops_infer/razor_fusion_attention/razor_fusion_attention_operation.h diff --git a/src/ops_infer/razor_fusion_attention/razor_fusion_attention_ops_runner.cpp b/src/ops/ops_infer/razor_fusion_attention/razor_fusion_attention_ops_runner.cpp similarity index 100% rename from src/ops_infer/razor_fusion_attention/razor_fusion_attention_ops_runner.cpp rename to src/ops/ops_infer/razor_fusion_attention/razor_fusion_attention_ops_runner.cpp diff --git a/src/ops_infer/razor_fusion_attention/razor_fusion_attention_ops_runner.h b/src/ops/ops_infer/razor_fusion_attention/razor_fusion_attention_ops_runner.h similarity index 100% rename from src/ops_infer/razor_fusion_attention/razor_fusion_attention_ops_runner.h rename to src/ops/ops_infer/razor_fusion_attention/razor_fusion_attention_ops_runner.h diff --git a/src/ops_infer/recv/recv_hccl_runner.cpp b/src/ops/ops_infer/recv/recv_hccl_runner.cpp similarity index 100% rename from src/ops_infer/recv/recv_hccl_runner.cpp rename to src/ops/ops_infer/recv/recv_hccl_runner.cpp diff --git a/src/ops_infer/recv/recv_hccl_runner.h b/src/ops/ops_infer/recv/recv_hccl_runner.h similarity index 100% rename from src/ops_infer/recv/recv_hccl_runner.h rename to src/ops/ops_infer/recv/recv_hccl_runner.h diff --git a/src/ops_infer/recv/recv_operation.cpp b/src/ops/ops_infer/recv/recv_operation.cpp similarity index 100% rename from src/ops_infer/recv/recv_operation.cpp rename to src/ops/ops_infer/recv/recv_operation.cpp diff --git a/src/ops_infer/recv/recv_operation.h b/src/ops/ops_infer/recv/recv_operation.h similarity index 100% rename from src/ops_infer/recv/recv_operation.h rename to src/ops/ops_infer/recv/recv_operation.h diff --git a/src/ops_infer/reduce/reduce_operation.cpp b/src/ops/ops_infer/reduce/reduce_operation.cpp similarity index 100% rename from src/ops_infer/reduce/reduce_operation.cpp rename to src/ops/ops_infer/reduce/reduce_operation.cpp diff 
--git a/src/ops_infer/reduce/reduce_operation.h b/src/ops/ops_infer/reduce/reduce_operation.h similarity index 100% rename from src/ops_infer/reduce/reduce_operation.h rename to src/ops/ops_infer/reduce/reduce_operation.h diff --git a/src/ops_infer/reduce/reduce_ops_runner.cpp b/src/ops/ops_infer/reduce/reduce_ops_runner.cpp similarity index 100% rename from src/ops_infer/reduce/reduce_ops_runner.cpp rename to src/ops/ops_infer/reduce/reduce_ops_runner.cpp diff --git a/src/ops_infer/reduce/reduce_ops_runner.h b/src/ops/ops_infer/reduce/reduce_ops_runner.h similarity index 100% rename from src/ops_infer/reduce/reduce_ops_runner.h rename to src/ops/ops_infer/reduce/reduce_ops_runner.h diff --git a/src/ops_infer/reduce_scatter/reduce_scatter_hccl_runner.cpp b/src/ops/ops_infer/reduce_scatter/reduce_scatter_hccl_runner.cpp similarity index 100% rename from src/ops_infer/reduce_scatter/reduce_scatter_hccl_runner.cpp rename to src/ops/ops_infer/reduce_scatter/reduce_scatter_hccl_runner.cpp diff --git a/src/ops_infer/reduce_scatter/reduce_scatter_hccl_runner.h b/src/ops/ops_infer/reduce_scatter/reduce_scatter_hccl_runner.h similarity index 100% rename from src/ops_infer/reduce_scatter/reduce_scatter_hccl_runner.h rename to src/ops/ops_infer/reduce_scatter/reduce_scatter_hccl_runner.h diff --git a/src/ops_infer/reduce_scatter/reduce_scatter_lccl_runner.cpp b/src/ops/ops_infer/reduce_scatter/reduce_scatter_lccl_runner.cpp similarity index 100% rename from src/ops_infer/reduce_scatter/reduce_scatter_lccl_runner.cpp rename to src/ops/ops_infer/reduce_scatter/reduce_scatter_lccl_runner.cpp diff --git a/src/ops_infer/reduce_scatter/reduce_scatter_lccl_runner.h b/src/ops/ops_infer/reduce_scatter/reduce_scatter_lccl_runner.h similarity index 100% rename from src/ops_infer/reduce_scatter/reduce_scatter_lccl_runner.h rename to src/ops/ops_infer/reduce_scatter/reduce_scatter_lccl_runner.h diff --git a/src/ops_infer/reduce_scatter/reduce_scatter_operation.cpp 
b/src/ops/ops_infer/reduce_scatter/reduce_scatter_operation.cpp similarity index 100% rename from src/ops_infer/reduce_scatter/reduce_scatter_operation.cpp rename to src/ops/ops_infer/reduce_scatter/reduce_scatter_operation.cpp diff --git a/src/ops_infer/reduce_scatter/reduce_scatter_operation.h b/src/ops/ops_infer/reduce_scatter/reduce_scatter_operation.h similarity index 100% rename from src/ops_infer/reduce_scatter/reduce_scatter_operation.h rename to src/ops/ops_infer/reduce_scatter/reduce_scatter_operation.h diff --git a/src/ops_infer/reduce_scatterv/reduce_scatterv_hccl_runner.cpp b/src/ops/ops_infer/reduce_scatterv/reduce_scatterv_hccl_runner.cpp similarity index 100% rename from src/ops_infer/reduce_scatterv/reduce_scatterv_hccl_runner.cpp rename to src/ops/ops_infer/reduce_scatterv/reduce_scatterv_hccl_runner.cpp diff --git a/src/ops_infer/reduce_scatterv/reduce_scatterv_hccl_runner.h b/src/ops/ops_infer/reduce_scatterv/reduce_scatterv_hccl_runner.h similarity index 100% rename from src/ops_infer/reduce_scatterv/reduce_scatterv_hccl_runner.h rename to src/ops/ops_infer/reduce_scatterv/reduce_scatterv_hccl_runner.h diff --git a/src/ops_infer/reduce_scatterv/reduce_scatterv_operation.cpp b/src/ops/ops_infer/reduce_scatterv/reduce_scatterv_operation.cpp similarity index 100% rename from src/ops_infer/reduce_scatterv/reduce_scatterv_operation.cpp rename to src/ops/ops_infer/reduce_scatterv/reduce_scatterv_operation.cpp diff --git a/src/ops_infer/reduce_scatterv/reduce_scatterv_operation.h b/src/ops/ops_infer/reduce_scatterv/reduce_scatterv_operation.h similarity index 100% rename from src/ops_infer/reduce_scatterv/reduce_scatterv_operation.h rename to src/ops/ops_infer/reduce_scatterv/reduce_scatterv_operation.h diff --git a/src/ops_infer/relay_attention/param.cpp b/src/ops/ops_infer/relay_attention/param.cpp similarity index 100% rename from src/ops_infer/relay_attention/param.cpp rename to src/ops/ops_infer/relay_attention/param.cpp diff --git 
a/src/ops_infer/relay_attention/param.h b/src/ops/ops_infer/relay_attention/param.h similarity index 100% rename from src/ops_infer/relay_attention/param.h rename to src/ops/ops_infer/relay_attention/param.h diff --git a/src/ops_infer/relay_attention/relay_attention_operation.cpp b/src/ops/ops_infer/relay_attention/relay_attention_operation.cpp similarity index 100% rename from src/ops_infer/relay_attention/relay_attention_operation.cpp rename to src/ops/ops_infer/relay_attention/relay_attention_operation.cpp diff --git a/src/ops_infer/relay_attention/relay_attention_operation.h b/src/ops/ops_infer/relay_attention/relay_attention_operation.h similarity index 100% rename from src/ops_infer/relay_attention/relay_attention_operation.h rename to src/ops/ops_infer/relay_attention/relay_attention_operation.h diff --git a/src/ops_infer/relay_attention/relay_attention_ops_runner.cpp b/src/ops/ops_infer/relay_attention/relay_attention_ops_runner.cpp similarity index 100% rename from src/ops_infer/relay_attention/relay_attention_ops_runner.cpp rename to src/ops/ops_infer/relay_attention/relay_attention_ops_runner.cpp diff --git a/src/ops_infer/relay_attention/relay_attention_ops_runner.h b/src/ops/ops_infer/relay_attention/relay_attention_ops_runner.h similarity index 100% rename from src/ops_infer/relay_attention/relay_attention_ops_runner.h rename to src/ops/ops_infer/relay_attention/relay_attention_ops_runner.h diff --git a/src/ops_infer/repeat/repeat_operation.cpp b/src/ops/ops_infer/repeat/repeat_operation.cpp similarity index 100% rename from src/ops_infer/repeat/repeat_operation.cpp rename to src/ops/ops_infer/repeat/repeat_operation.cpp diff --git a/src/ops_infer/repeat/repeat_operation.h b/src/ops/ops_infer/repeat/repeat_operation.h similarity index 100% rename from src/ops_infer/repeat/repeat_operation.h rename to src/ops/ops_infer/repeat/repeat_operation.h diff --git a/src/ops_infer/repeat/repeat_ops_runner.cpp b/src/ops/ops_infer/repeat/repeat_ops_runner.cpp 
similarity index 100% rename from src/ops_infer/repeat/repeat_ops_runner.cpp rename to src/ops/ops_infer/repeat/repeat_ops_runner.cpp diff --git a/src/ops_infer/repeat/repeat_ops_runner.h b/src/ops/ops_infer/repeat/repeat_ops_runner.h similarity index 100% rename from src/ops_infer/repeat/repeat_ops_runner.h rename to src/ops/ops_infer/repeat/repeat_ops_runner.h diff --git a/src/ops_infer/reshape_and_cache/reshape_and_cache_operation.cpp b/src/ops/ops_infer/reshape_and_cache/reshape_and_cache_operation.cpp similarity index 100% rename from src/ops_infer/reshape_and_cache/reshape_and_cache_operation.cpp rename to src/ops/ops_infer/reshape_and_cache/reshape_and_cache_operation.cpp diff --git a/src/ops_infer/reshape_and_cache/reshape_and_cache_operation.h b/src/ops/ops_infer/reshape_and_cache/reshape_and_cache_operation.h similarity index 100% rename from src/ops_infer/reshape_and_cache/reshape_and_cache_operation.h rename to src/ops/ops_infer/reshape_and_cache/reshape_and_cache_operation.h diff --git a/src/ops_infer/reshape_and_cache/reshape_and_cache_ops_runner.cpp b/src/ops/ops_infer/reshape_and_cache/reshape_and_cache_ops_runner.cpp similarity index 100% rename from src/ops_infer/reshape_and_cache/reshape_and_cache_ops_runner.cpp rename to src/ops/ops_infer/reshape_and_cache/reshape_and_cache_ops_runner.cpp diff --git a/src/ops_infer/reshape_and_cache/reshape_and_cache_ops_runner.h b/src/ops/ops_infer/reshape_and_cache/reshape_and_cache_ops_runner.h similarity index 100% rename from src/ops_infer/reshape_and_cache/reshape_and_cache_ops_runner.h rename to src/ops/ops_infer/reshape_and_cache/reshape_and_cache_ops_runner.h diff --git a/src/ops_infer/reshape_and_cache/reshape_and_cache_ops_runner_310p.cpp b/src/ops/ops_infer/reshape_and_cache/reshape_and_cache_ops_runner_310p.cpp similarity index 100% rename from src/ops_infer/reshape_and_cache/reshape_and_cache_ops_runner_310p.cpp rename to src/ops/ops_infer/reshape_and_cache/reshape_and_cache_ops_runner_310p.cpp 
diff --git a/src/ops_infer/reshape_and_cache/reshape_and_cache_ops_runner_310p.h b/src/ops/ops_infer/reshape_and_cache/reshape_and_cache_ops_runner_310p.h similarity index 100% rename from src/ops_infer/reshape_and_cache/reshape_and_cache_ops_runner_310p.h rename to src/ops/ops_infer/reshape_and_cache/reshape_and_cache_ops_runner_310p.h diff --git a/src/ops_infer/reshape_and_cache/reshape_and_cache_ops_runner_A2_NZ.cpp b/src/ops/ops_infer/reshape_and_cache/reshape_and_cache_ops_runner_A2_NZ.cpp similarity index 100% rename from src/ops_infer/reshape_and_cache/reshape_and_cache_ops_runner_A2_NZ.cpp rename to src/ops/ops_infer/reshape_and_cache/reshape_and_cache_ops_runner_A2_NZ.cpp diff --git a/src/ops_infer/reshape_and_cache/reshape_and_cache_ops_runner_A2_NZ.h b/src/ops/ops_infer/reshape_and_cache/reshape_and_cache_ops_runner_A2_NZ.h similarity index 100% rename from src/ops_infer/reshape_and_cache/reshape_and_cache_ops_runner_A2_NZ.h rename to src/ops/ops_infer/reshape_and_cache/reshape_and_cache_ops_runner_A2_NZ.h diff --git a/src/ops_infer/reshape_and_cache/reshape_and_cache_ops_runner_SISO.cpp b/src/ops/ops_infer/reshape_and_cache/reshape_and_cache_ops_runner_SISO.cpp similarity index 100% rename from src/ops_infer/reshape_and_cache/reshape_and_cache_ops_runner_SISO.cpp rename to src/ops/ops_infer/reshape_and_cache/reshape_and_cache_ops_runner_SISO.cpp diff --git a/src/ops_infer/reshape_and_cache/reshape_and_cache_ops_runner_SISO.h b/src/ops/ops_infer/reshape_and_cache/reshape_and_cache_ops_runner_SISO.h similarity index 100% rename from src/ops_infer/reshape_and_cache/reshape_and_cache_ops_runner_SISO.h rename to src/ops/ops_infer/reshape_and_cache/reshape_and_cache_ops_runner_SISO.h diff --git a/src/ops_infer/reshape_and_cache_omni/reshape_and_cache_omni_operation.cpp b/src/ops/ops_infer/reshape_and_cache_omni/reshape_and_cache_omni_operation.cpp similarity index 100% rename from src/ops_infer/reshape_and_cache_omni/reshape_and_cache_omni_operation.cpp 
rename to src/ops/ops_infer/reshape_and_cache_omni/reshape_and_cache_omni_operation.cpp diff --git a/src/ops_infer/reshape_and_cache_omni/reshape_and_cache_omni_operation.h b/src/ops/ops_infer/reshape_and_cache_omni/reshape_and_cache_omni_operation.h similarity index 100% rename from src/ops_infer/reshape_and_cache_omni/reshape_and_cache_omni_operation.h rename to src/ops/ops_infer/reshape_and_cache_omni/reshape_and_cache_omni_operation.h diff --git a/src/ops_infer/reshape_and_cache_omni/reshape_and_cache_omni_ops_runner.cpp b/src/ops/ops_infer/reshape_and_cache_omni/reshape_and_cache_omni_ops_runner.cpp similarity index 100% rename from src/ops_infer/reshape_and_cache_omni/reshape_and_cache_omni_ops_runner.cpp rename to src/ops/ops_infer/reshape_and_cache_omni/reshape_and_cache_omni_ops_runner.cpp diff --git a/src/ops_infer/reshape_and_cache_omni/reshape_and_cache_omni_ops_runner.h b/src/ops/ops_infer/reshape_and_cache_omni/reshape_and_cache_omni_ops_runner.h similarity index 100% rename from src/ops_infer/reshape_and_cache_omni/reshape_and_cache_omni_ops_runner.h rename to src/ops/ops_infer/reshape_and_cache_omni/reshape_and_cache_omni_ops_runner.h diff --git a/src/ops_infer/reshape_and_cache_with_stride/reshape_and_cache_with_stride_operation.cpp b/src/ops/ops_infer/reshape_and_cache_with_stride/reshape_and_cache_with_stride_operation.cpp similarity index 100% rename from src/ops_infer/reshape_and_cache_with_stride/reshape_and_cache_with_stride_operation.cpp rename to src/ops/ops_infer/reshape_and_cache_with_stride/reshape_and_cache_with_stride_operation.cpp diff --git a/src/ops_infer/reshape_and_cache_with_stride/reshape_and_cache_with_stride_operation.h b/src/ops/ops_infer/reshape_and_cache_with_stride/reshape_and_cache_with_stride_operation.h similarity index 100% rename from src/ops_infer/reshape_and_cache_with_stride/reshape_and_cache_with_stride_operation.h rename to src/ops/ops_infer/reshape_and_cache_with_stride/reshape_and_cache_with_stride_operation.h 
diff --git a/src/ops_infer/reshape_and_cache_with_stride/reshape_and_cache_with_stride_ops_runner.cpp b/src/ops/ops_infer/reshape_and_cache_with_stride/reshape_and_cache_with_stride_ops_runner.cpp similarity index 100% rename from src/ops_infer/reshape_and_cache_with_stride/reshape_and_cache_with_stride_ops_runner.cpp rename to src/ops/ops_infer/reshape_and_cache_with_stride/reshape_and_cache_with_stride_ops_runner.cpp diff --git a/src/ops_infer/reshape_and_cache_with_stride/reshape_and_cache_with_stride_ops_runner.h b/src/ops/ops_infer/reshape_and_cache_with_stride/reshape_and_cache_with_stride_ops_runner.h similarity index 100% rename from src/ops_infer/reshape_and_cache_with_stride/reshape_and_cache_with_stride_ops_runner.h rename to src/ops/ops_infer/reshape_and_cache_with_stride/reshape_and_cache_with_stride_ops_runner.h diff --git a/src/ops_infer/reshape_and_cache_with_stride/reshape_and_cache_with_stride_ops_runner_SISO.cpp b/src/ops/ops_infer/reshape_and_cache_with_stride/reshape_and_cache_with_stride_ops_runner_SISO.cpp similarity index 100% rename from src/ops_infer/reshape_and_cache_with_stride/reshape_and_cache_with_stride_ops_runner_SISO.cpp rename to src/ops/ops_infer/reshape_and_cache_with_stride/reshape_and_cache_with_stride_ops_runner_SISO.cpp diff --git a/src/ops_infer/reshape_and_cache_with_stride/reshape_and_cache_with_stride_ops_runner_SISO.h b/src/ops/ops_infer/reshape_and_cache_with_stride/reshape_and_cache_with_stride_ops_runner_SISO.h similarity index 100% rename from src/ops_infer/reshape_and_cache_with_stride/reshape_and_cache_with_stride_ops_runner_SISO.h rename to src/ops/ops_infer/reshape_and_cache_with_stride/reshape_and_cache_with_stride_ops_runner_SISO.h diff --git a/src/ops_infer/ring_mla/atb_acl_ring_mla.cpp b/src/ops/ops_infer/ring_mla/atb_acl_ring_mla.cpp similarity index 100% rename from src/ops_infer/ring_mla/atb_acl_ring_mla.cpp rename to src/ops/ops_infer/ring_mla/atb_acl_ring_mla.cpp diff --git 
a/src/ops_infer/ring_mla/param.cpp b/src/ops/ops_infer/ring_mla/param.cpp similarity index 100% rename from src/ops_infer/ring_mla/param.cpp rename to src/ops/ops_infer/ring_mla/param.cpp diff --git a/src/ops_infer/ring_mla/param.h b/src/ops/ops_infer/ring_mla/param.h similarity index 100% rename from src/ops_infer/ring_mla/param.h rename to src/ops/ops_infer/ring_mla/param.h diff --git a/src/ops_infer/ring_mla/ring_mla_operation.cpp b/src/ops/ops_infer/ring_mla/ring_mla_operation.cpp similarity index 100% rename from src/ops_infer/ring_mla/ring_mla_operation.cpp rename to src/ops/ops_infer/ring_mla/ring_mla_operation.cpp diff --git a/src/ops_infer/ring_mla/ring_mla_operation.h b/src/ops/ops_infer/ring_mla/ring_mla_operation.h similarity index 100% rename from src/ops_infer/ring_mla/ring_mla_operation.h rename to src/ops/ops_infer/ring_mla/ring_mla_operation.h diff --git a/src/ops_infer/ring_mla/ring_mla_ops_runner.cpp b/src/ops/ops_infer/ring_mla/ring_mla_ops_runner.cpp similarity index 100% rename from src/ops_infer/ring_mla/ring_mla_ops_runner.cpp rename to src/ops/ops_infer/ring_mla/ring_mla_ops_runner.cpp diff --git a/src/ops_infer/ring_mla/ring_mla_ops_runner.h b/src/ops/ops_infer/ring_mla/ring_mla_ops_runner.h similarity index 100% rename from src/ops_infer/ring_mla/ring_mla_ops_runner.h rename to src/ops/ops_infer/ring_mla/ring_mla_ops_runner.h diff --git a/src/ops_infer/rms_norm/rms_norm_operation.cpp b/src/ops/ops_infer/rms_norm/rms_norm_operation.cpp similarity index 100% rename from src/ops_infer/rms_norm/rms_norm_operation.cpp rename to src/ops/ops_infer/rms_norm/rms_norm_operation.cpp diff --git a/src/ops_infer/rms_norm/rms_norm_operation.h b/src/ops/ops_infer/rms_norm/rms_norm_operation.h similarity index 100% rename from src/ops_infer/rms_norm/rms_norm_operation.h rename to src/ops/ops_infer/rms_norm/rms_norm_operation.h diff --git a/src/ops_infer/rms_norm/rms_norm_ops_runner.cpp b/src/ops/ops_infer/rms_norm/rms_norm_ops_runner.cpp similarity index 
100% rename from src/ops_infer/rms_norm/rms_norm_ops_runner.cpp rename to src/ops/ops_infer/rms_norm/rms_norm_ops_runner.cpp diff --git a/src/ops_infer/rms_norm/rms_norm_ops_runner.h b/src/ops/ops_infer/rms_norm/rms_norm_ops_runner.h similarity index 100% rename from src/ops_infer/rms_norm/rms_norm_ops_runner.h rename to src/ops/ops_infer/rms_norm/rms_norm_ops_runner.h diff --git a/src/ops_infer/rms_norm_with_stride/rms_norm_with_stride_operation.cpp b/src/ops/ops_infer/rms_norm_with_stride/rms_norm_with_stride_operation.cpp similarity index 100% rename from src/ops_infer/rms_norm_with_stride/rms_norm_with_stride_operation.cpp rename to src/ops/ops_infer/rms_norm_with_stride/rms_norm_with_stride_operation.cpp diff --git a/src/ops_infer/rms_norm_with_stride/rms_norm_with_stride_operation.h b/src/ops/ops_infer/rms_norm_with_stride/rms_norm_with_stride_operation.h similarity index 100% rename from src/ops_infer/rms_norm_with_stride/rms_norm_with_stride_operation.h rename to src/ops/ops_infer/rms_norm_with_stride/rms_norm_with_stride_operation.h diff --git a/src/ops_infer/rms_norm_with_stride/rms_norm_with_stride_ops_runner.cpp b/src/ops/ops_infer/rms_norm_with_stride/rms_norm_with_stride_ops_runner.cpp similarity index 100% rename from src/ops_infer/rms_norm_with_stride/rms_norm_with_stride_ops_runner.cpp rename to src/ops/ops_infer/rms_norm_with_stride/rms_norm_with_stride_ops_runner.cpp diff --git a/src/ops_infer/rms_norm_with_stride/rms_norm_with_stride_ops_runner.h b/src/ops/ops_infer/rms_norm_with_stride/rms_norm_with_stride_ops_runner.h similarity index 100% rename from src/ops_infer/rms_norm_with_stride/rms_norm_with_stride_ops_runner.h rename to src/ops/ops_infer/rms_norm_with_stride/rms_norm_with_stride_ops_runner.h diff --git a/src/ops_infer/rope/rope_operation.cpp b/src/ops/ops_infer/rope/rope_operation.cpp similarity index 100% rename from src/ops_infer/rope/rope_operation.cpp rename to src/ops/ops_infer/rope/rope_operation.cpp diff --git 
a/src/ops_infer/rope/rope_operation.h b/src/ops/ops_infer/rope/rope_operation.h similarity index 100% rename from src/ops_infer/rope/rope_operation.h rename to src/ops/ops_infer/rope/rope_operation.h diff --git a/src/ops_infer/rope/rope_ops_runner.cpp b/src/ops/ops_infer/rope/rope_ops_runner.cpp similarity index 100% rename from src/ops_infer/rope/rope_ops_runner.cpp rename to src/ops/ops_infer/rope/rope_ops_runner.cpp diff --git a/src/ops_infer/rope/rope_ops_runner.h b/src/ops/ops_infer/rope/rope_ops_runner.h similarity index 100% rename from src/ops_infer/rope/rope_ops_runner.h rename to src/ops/ops_infer/rope/rope_ops_runner.h diff --git a/src/ops_infer/rope_q_concat/rope_q_concat_operation.cpp b/src/ops/ops_infer/rope_q_concat/rope_q_concat_operation.cpp similarity index 100% rename from src/ops_infer/rope_q_concat/rope_q_concat_operation.cpp rename to src/ops/ops_infer/rope_q_concat/rope_q_concat_operation.cpp diff --git a/src/ops_infer/rope_q_concat/rope_q_concat_operation.h b/src/ops/ops_infer/rope_q_concat/rope_q_concat_operation.h similarity index 100% rename from src/ops_infer/rope_q_concat/rope_q_concat_operation.h rename to src/ops/ops_infer/rope_q_concat/rope_q_concat_operation.h diff --git a/src/ops_infer/rope_q_concat/rope_q_concat_ops_runner.cpp b/src/ops/ops_infer/rope_q_concat/rope_q_concat_ops_runner.cpp similarity index 100% rename from src/ops_infer/rope_q_concat/rope_q_concat_ops_runner.cpp rename to src/ops/ops_infer/rope_q_concat/rope_q_concat_ops_runner.cpp diff --git a/src/ops_infer/rope_q_concat/rope_q_concat_ops_runner.h b/src/ops/ops_infer/rope_q_concat/rope_q_concat_ops_runner.h similarity index 100% rename from src/ops_infer/rope_q_concat/rope_q_concat_ops_runner.h rename to src/ops/ops_infer/rope_q_concat/rope_q_concat_ops_runner.h diff --git a/src/ops_infer/scatter_elements_v2/scatter_elements_v2_operation.cpp b/src/ops/ops_infer/scatter_elements_v2/scatter_elements_v2_operation.cpp similarity index 100% rename from 
src/ops_infer/scatter_elements_v2/scatter_elements_v2_operation.cpp rename to src/ops/ops_infer/scatter_elements_v2/scatter_elements_v2_operation.cpp diff --git a/src/ops_infer/scatter_elements_v2/scatter_elements_v2_operation.h b/src/ops/ops_infer/scatter_elements_v2/scatter_elements_v2_operation.h similarity index 100% rename from src/ops_infer/scatter_elements_v2/scatter_elements_v2_operation.h rename to src/ops/ops_infer/scatter_elements_v2/scatter_elements_v2_operation.h diff --git a/src/ops_infer/scatter_elements_v2/scatter_elements_v2_ops_runner.cpp b/src/ops/ops_infer/scatter_elements_v2/scatter_elements_v2_ops_runner.cpp similarity index 100% rename from src/ops_infer/scatter_elements_v2/scatter_elements_v2_ops_runner.cpp rename to src/ops/ops_infer/scatter_elements_v2/scatter_elements_v2_ops_runner.cpp diff --git a/src/ops_infer/scatter_elements_v2/scatter_elements_v2_ops_runner.h b/src/ops/ops_infer/scatter_elements_v2/scatter_elements_v2_ops_runner.h similarity index 100% rename from src/ops_infer/scatter_elements_v2/scatter_elements_v2_ops_runner.h rename to src/ops/ops_infer/scatter_elements_v2/scatter_elements_v2_ops_runner.h diff --git a/src/ops_infer/self_attention/atb_acl_self_attention_prefix_encoder.cpp b/src/ops/ops_infer/self_attention/atb_acl_self_attention_prefix_encoder.cpp similarity index 100% rename from src/ops_infer/self_attention/atb_acl_self_attention_prefix_encoder.cpp rename to src/ops/ops_infer/self_attention/atb_acl_self_attention_prefix_encoder.cpp diff --git a/src/ops_infer/self_attention/param.cpp b/src/ops/ops_infer/self_attention/param.cpp similarity index 100% rename from src/ops_infer/self_attention/param.cpp rename to src/ops/ops_infer/self_attention/param.cpp diff --git a/src/ops_infer/self_attention/param.h b/src/ops/ops_infer/self_attention/param.h similarity index 100% rename from src/ops_infer/self_attention/param.h rename to src/ops/ops_infer/self_attention/param.h diff --git 
a/src/ops_infer/self_attention/self_attention_encoder_fuison_ops_runner_910a.cpp b/src/ops/ops_infer/self_attention/self_attention_encoder_fuison_ops_runner_910a.cpp similarity index 100% rename from src/ops_infer/self_attention/self_attention_encoder_fuison_ops_runner_910a.cpp rename to src/ops/ops_infer/self_attention/self_attention_encoder_fuison_ops_runner_910a.cpp diff --git a/src/ops_infer/self_attention/self_attention_encoder_fusion_ops_runner.cpp b/src/ops/ops_infer/self_attention/self_attention_encoder_fusion_ops_runner.cpp similarity index 100% rename from src/ops_infer/self_attention/self_attention_encoder_fusion_ops_runner.cpp rename to src/ops/ops_infer/self_attention/self_attention_encoder_fusion_ops_runner.cpp diff --git a/src/ops_infer/self_attention/self_attention_encoder_fusion_ops_runner.h b/src/ops/ops_infer/self_attention/self_attention_encoder_fusion_ops_runner.h similarity index 100% rename from src/ops_infer/self_attention/self_attention_encoder_fusion_ops_runner.h rename to src/ops/ops_infer/self_attention/self_attention_encoder_fusion_ops_runner.h diff --git a/src/ops_infer/self_attention/self_attention_encoder_fusion_ops_runner_910a.h b/src/ops/ops_infer/self_attention/self_attention_encoder_fusion_ops_runner_910a.h similarity index 100% rename from src/ops_infer/self_attention/self_attention_encoder_fusion_ops_runner_910a.h rename to src/ops/ops_infer/self_attention/self_attention_encoder_fusion_ops_runner_910a.h diff --git a/src/ops_infer/self_attention/self_attention_fusion_bypass_ops_runner.cpp b/src/ops/ops_infer/self_attention/self_attention_fusion_bypass_ops_runner.cpp similarity index 100% rename from src/ops_infer/self_attention/self_attention_fusion_bypass_ops_runner.cpp rename to src/ops/ops_infer/self_attention/self_attention_fusion_bypass_ops_runner.cpp diff --git a/src/ops_infer/self_attention/self_attention_fusion_bypass_ops_runner.h b/src/ops/ops_infer/self_attention/self_attention_fusion_bypass_ops_runner.h similarity 
index 100% rename from src/ops_infer/self_attention/self_attention_fusion_bypass_ops_runner.h rename to src/ops/ops_infer/self_attention/self_attention_fusion_bypass_ops_runner.h diff --git a/src/ops_infer/self_attention/self_attention_fusion_bypass_ops_runner_910a.cpp b/src/ops/ops_infer/self_attention/self_attention_fusion_bypass_ops_runner_910a.cpp similarity index 100% rename from src/ops_infer/self_attention/self_attention_fusion_bypass_ops_runner_910a.cpp rename to src/ops/ops_infer/self_attention/self_attention_fusion_bypass_ops_runner_910a.cpp diff --git a/src/ops_infer/self_attention/self_attention_fusion_bypass_ops_runner_910a.h b/src/ops/ops_infer/self_attention/self_attention_fusion_bypass_ops_runner_910a.h similarity index 100% rename from src/ops_infer/self_attention/self_attention_fusion_bypass_ops_runner_910a.h rename to src/ops/ops_infer/self_attention/self_attention_fusion_bypass_ops_runner_910a.h diff --git a/src/ops_infer/self_attention/self_attention_fusion_bypass_ops_runner_BNSD.cpp b/src/ops/ops_infer/self_attention/self_attention_fusion_bypass_ops_runner_BNSD.cpp similarity index 100% rename from src/ops_infer/self_attention/self_attention_fusion_bypass_ops_runner_BNSD.cpp rename to src/ops/ops_infer/self_attention/self_attention_fusion_bypass_ops_runner_BNSD.cpp diff --git a/src/ops_infer/self_attention/self_attention_fusion_bypass_ops_runner_BNSD.h b/src/ops/ops_infer/self_attention/self_attention_fusion_bypass_ops_runner_BNSD.h similarity index 100% rename from src/ops_infer/self_attention/self_attention_fusion_bypass_ops_runner_BNSD.h rename to src/ops/ops_infer/self_attention/self_attention_fusion_bypass_ops_runner_BNSD.h diff --git a/src/ops_infer/self_attention/self_attention_fusion_bypass_ops_runner_BNSD_910a.cpp b/src/ops/ops_infer/self_attention/self_attention_fusion_bypass_ops_runner_BNSD_910a.cpp similarity index 100% rename from src/ops_infer/self_attention/self_attention_fusion_bypass_ops_runner_BNSD_910a.cpp rename to 
src/ops/ops_infer/self_attention/self_attention_fusion_bypass_ops_runner_BNSD_910a.cpp diff --git a/src/ops_infer/self_attention/self_attention_fusion_bypass_ops_runner_BNSD_910a.h b/src/ops/ops_infer/self_attention/self_attention_fusion_bypass_ops_runner_BNSD_910a.h similarity index 100% rename from src/ops_infer/self_attention/self_attention_fusion_bypass_ops_runner_BNSD_910a.h rename to src/ops/ops_infer/self_attention/self_attention_fusion_bypass_ops_runner_BNSD_910a.h diff --git a/src/ops_infer/self_attention/self_attention_fusion_ops_runner.cpp b/src/ops/ops_infer/self_attention/self_attention_fusion_ops_runner.cpp similarity index 100% rename from src/ops_infer/self_attention/self_attention_fusion_ops_runner.cpp rename to src/ops/ops_infer/self_attention/self_attention_fusion_ops_runner.cpp diff --git a/src/ops_infer/self_attention/self_attention_fusion_ops_runner.h b/src/ops/ops_infer/self_attention/self_attention_fusion_ops_runner.h similarity index 100% rename from src/ops_infer/self_attention/self_attention_fusion_ops_runner.h rename to src/ops/ops_infer/self_attention/self_attention_fusion_ops_runner.h diff --git a/src/ops_infer/self_attention/self_attention_fusion_ops_runner_910a.cpp b/src/ops/ops_infer/self_attention/self_attention_fusion_ops_runner_910a.cpp similarity index 100% rename from src/ops_infer/self_attention/self_attention_fusion_ops_runner_910a.cpp rename to src/ops/ops_infer/self_attention/self_attention_fusion_ops_runner_910a.cpp diff --git a/src/ops_infer/self_attention/self_attention_fusion_ops_runner_910a.h b/src/ops/ops_infer/self_attention/self_attention_fusion_ops_runner_910a.h similarity index 100% rename from src/ops_infer/self_attention/self_attention_fusion_ops_runner_910a.h rename to src/ops/ops_infer/self_attention/self_attention_fusion_ops_runner_910a.h diff --git a/src/ops_infer/self_attention/self_attention_operation.cpp b/src/ops/ops_infer/self_attention/self_attention_operation.cpp similarity index 100% rename from 
src/ops_infer/self_attention/self_attention_operation.cpp rename to src/ops/ops_infer/self_attention/self_attention_operation.cpp diff --git a/src/ops_infer/self_attention/self_attention_operation.h b/src/ops/ops_infer/self_attention/self_attention_operation.h similarity index 100% rename from src/ops_infer/self_attention/self_attention_operation.h rename to src/ops/ops_infer/self_attention/self_attention_operation.h diff --git a/src/ops_infer/self_attention/self_attention_prefix_encoder_ops_runner.cpp b/src/ops/ops_infer/self_attention/self_attention_prefix_encoder_ops_runner.cpp similarity index 100% rename from src/ops_infer/self_attention/self_attention_prefix_encoder_ops_runner.cpp rename to src/ops/ops_infer/self_attention/self_attention_prefix_encoder_ops_runner.cpp diff --git a/src/ops_infer/self_attention/self_attention_prefix_encoder_ops_runner.h b/src/ops/ops_infer/self_attention/self_attention_prefix_encoder_ops_runner.h similarity index 100% rename from src/ops_infer/self_attention/self_attention_prefix_encoder_ops_runner.h rename to src/ops/ops_infer/self_attention/self_attention_prefix_encoder_ops_runner.h diff --git a/src/ops_infer/self_attention/self_attention_runner_utils.cpp b/src/ops/ops_infer/self_attention/self_attention_runner_utils.cpp similarity index 100% rename from src/ops_infer/self_attention/self_attention_runner_utils.cpp rename to src/ops/ops_infer/self_attention/self_attention_runner_utils.cpp diff --git a/src/ops_infer/self_attention/self_attention_runner_utils.h b/src/ops/ops_infer/self_attention/self_attention_runner_utils.h similarity index 100% rename from src/ops_infer/self_attention/self_attention_runner_utils.h rename to src/ops/ops_infer/self_attention/self_attention_runner_utils.h diff --git a/src/ops_infer/send/send_hccl_runner.cpp b/src/ops/ops_infer/send/send_hccl_runner.cpp similarity index 100% rename from src/ops_infer/send/send_hccl_runner.cpp rename to src/ops/ops_infer/send/send_hccl_runner.cpp diff --git 
a/src/ops_infer/send/send_hccl_runner.h b/src/ops/ops_infer/send/send_hccl_runner.h similarity index 100% rename from src/ops_infer/send/send_hccl_runner.h rename to src/ops/ops_infer/send/send_hccl_runner.h diff --git a/src/ops_infer/send/send_operation.cpp b/src/ops/ops_infer/send/send_operation.cpp similarity index 100% rename from src/ops_infer/send/send_operation.cpp rename to src/ops/ops_infer/send/send_operation.cpp diff --git a/src/ops_infer/send/send_operation.h b/src/ops/ops_infer/send/send_operation.h similarity index 100% rename from src/ops_infer/send/send_operation.h rename to src/ops/ops_infer/send/send_operation.h diff --git a/src/ops_infer/set_value/set_value_operation.cpp b/src/ops/ops_infer/set_value/set_value_operation.cpp similarity index 100% rename from src/ops_infer/set_value/set_value_operation.cpp rename to src/ops/ops_infer/set_value/set_value_operation.cpp diff --git a/src/ops_infer/set_value/set_value_operation.h b/src/ops/ops_infer/set_value/set_value_operation.h similarity index 100% rename from src/ops_infer/set_value/set_value_operation.h rename to src/ops/ops_infer/set_value/set_value_operation.h diff --git a/src/ops_infer/set_value/set_value_ops_runner.cpp b/src/ops/ops_infer/set_value/set_value_ops_runner.cpp similarity index 100% rename from src/ops_infer/set_value/set_value_ops_runner.cpp rename to src/ops/ops_infer/set_value/set_value_ops_runner.cpp diff --git a/src/ops_infer/set_value/set_value_ops_runner.h b/src/ops/ops_infer/set_value/set_value_ops_runner.h similarity index 100% rename from src/ops_infer/set_value/set_value_ops_runner.h rename to src/ops/ops_infer/set_value/set_value_ops_runner.h diff --git a/src/ops_infer/slice/slice_operation.cpp b/src/ops/ops_infer/slice/slice_operation.cpp similarity index 100% rename from src/ops_infer/slice/slice_operation.cpp rename to src/ops/ops_infer/slice/slice_operation.cpp diff --git a/src/ops_infer/slice/slice_operation.h b/src/ops/ops_infer/slice/slice_operation.h similarity 
index 100% rename from src/ops_infer/slice/slice_operation.h rename to src/ops/ops_infer/slice/slice_operation.h diff --git a/src/ops_infer/slice/slice_ops_runner.cpp b/src/ops/ops_infer/slice/slice_ops_runner.cpp similarity index 100% rename from src/ops_infer/slice/slice_ops_runner.cpp rename to src/ops/ops_infer/slice/slice_ops_runner.cpp diff --git a/src/ops_infer/slice/slice_ops_runner.h b/src/ops/ops_infer/slice/slice_ops_runner.h similarity index 100% rename from src/ops_infer/slice/slice_ops_runner.h rename to src/ops/ops_infer/slice/slice_ops_runner.h diff --git a/src/ops_infer/softmax/softmax_operation.cpp b/src/ops/ops_infer/softmax/softmax_operation.cpp similarity index 100% rename from src/ops_infer/softmax/softmax_operation.cpp rename to src/ops/ops_infer/softmax/softmax_operation.cpp diff --git a/src/ops_infer/softmax/softmax_operation.h b/src/ops/ops_infer/softmax/softmax_operation.h similarity index 100% rename from src/ops_infer/softmax/softmax_operation.h rename to src/ops/ops_infer/softmax/softmax_operation.h diff --git a/src/ops_infer/softmax/softmax_ops_runner.cpp b/src/ops/ops_infer/softmax/softmax_ops_runner.cpp similarity index 100% rename from src/ops_infer/softmax/softmax_ops_runner.cpp rename to src/ops/ops_infer/softmax/softmax_ops_runner.cpp diff --git a/src/ops_infer/softmax/softmax_ops_runner.h b/src/ops/ops_infer/softmax/softmax_ops_runner.h similarity index 100% rename from src/ops_infer/softmax/softmax_ops_runner.h rename to src/ops/ops_infer/softmax/softmax_ops_runner.h diff --git a/src/ops_infer/sort/sort_operation.cpp b/src/ops/ops_infer/sort/sort_operation.cpp similarity index 100% rename from src/ops_infer/sort/sort_operation.cpp rename to src/ops/ops_infer/sort/sort_operation.cpp diff --git a/src/ops_infer/sort/sort_operation.h b/src/ops/ops_infer/sort/sort_operation.h similarity index 100% rename from src/ops_infer/sort/sort_operation.h rename to src/ops/ops_infer/sort/sort_operation.h diff --git 
a/src/ops_infer/sort/sort_ops_runner.cpp b/src/ops/ops_infer/sort/sort_ops_runner.cpp similarity index 100% rename from src/ops_infer/sort/sort_ops_runner.cpp rename to src/ops/ops_infer/sort/sort_ops_runner.cpp diff --git a/src/ops_infer/sort/sort_ops_runner.h b/src/ops/ops_infer/sort/sort_ops_runner.h similarity index 97% rename from src/ops_infer/sort/sort_ops_runner.h rename to src/ops/ops_infer/sort/sort_ops_runner.h index c2e9cb39..d4d2228c 100644 --- a/src/ops_infer/sort/sort_ops_runner.h +++ b/src/ops/ops_infer/sort/sort_ops_runner.h @@ -1,27 +1,27 @@ -/* - * Copyright (c) 2024 Huawei Technologies Co., Ltd. - * This file is a part of the CANN Open Software. - * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). - * Please refer to the License for details. You may not use this file except in compliance with the License. - * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, - * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. - * See LICENSE in the root of the software repository for the full text of the License. - */ - -#ifndef ATB_SORT_OPS_RUNNER_H -#define ATB_SORT_OPS_RUNNER_H - -#include "atb/runner/ops_runner.h" -#include "atb/infer_op_params.h" - -namespace atb { -class SortOpsRunner : public OpsRunner { -public: - explicit SortOpsRunner(const infer::SortParam ¶m); - ~SortOpsRunner() override; - -private: - infer::SortParam param_; -}; -} // namespace atb +/* + * Copyright (c) 2024 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. 
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ + +#ifndef ATB_SORT_OPS_RUNNER_H +#define ATB_SORT_OPS_RUNNER_H + +#include "atb/runner/ops_runner.h" +#include "atb/infer_op_params.h" + +namespace atb { +class SortOpsRunner : public OpsRunner { +public: + explicit SortOpsRunner(const infer::SortParam ¶m); + ~SortOpsRunner() override; + +private: + infer::SortParam param_; +}; +} // namespace atb #endif \ No newline at end of file diff --git a/src/ops_infer/split/split_operation.cpp b/src/ops/ops_infer/split/split_operation.cpp similarity index 100% rename from src/ops_infer/split/split_operation.cpp rename to src/ops/ops_infer/split/split_operation.cpp diff --git a/src/ops_infer/split/split_operation.h b/src/ops/ops_infer/split/split_operation.h similarity index 100% rename from src/ops_infer/split/split_operation.h rename to src/ops/ops_infer/split/split_operation.h diff --git a/src/ops_infer/split/split_ops_runner.cpp b/src/ops/ops_infer/split/split_ops_runner.cpp similarity index 100% rename from src/ops_infer/split/split_ops_runner.cpp rename to src/ops/ops_infer/split/split_ops_runner.cpp diff --git a/src/ops_infer/split/split_ops_runner.h b/src/ops/ops_infer/split/split_ops_runner.h similarity index 100% rename from src/ops_infer/split/split_ops_runner.h rename to src/ops/ops_infer/split/split_ops_runner.h diff --git a/src/ops_infer/swiglu_quant/swiglu_quant_operation.cpp b/src/ops/ops_infer/swiglu_quant/swiglu_quant_operation.cpp similarity index 100% rename from src/ops_infer/swiglu_quant/swiglu_quant_operation.cpp rename to src/ops/ops_infer/swiglu_quant/swiglu_quant_operation.cpp diff --git a/src/ops_infer/swiglu_quant/swiglu_quant_operation.h 
b/src/ops/ops_infer/swiglu_quant/swiglu_quant_operation.h similarity index 100% rename from src/ops_infer/swiglu_quant/swiglu_quant_operation.h rename to src/ops/ops_infer/swiglu_quant/swiglu_quant_operation.h diff --git a/src/ops_infer/swiglu_quant/swiglu_quant_ops_runner.cpp b/src/ops/ops_infer/swiglu_quant/swiglu_quant_ops_runner.cpp similarity index 100% rename from src/ops_infer/swiglu_quant/swiglu_quant_ops_runner.cpp rename to src/ops/ops_infer/swiglu_quant/swiglu_quant_ops_runner.cpp diff --git a/src/ops_infer/swiglu_quant/swiglu_quant_ops_runner.h b/src/ops/ops_infer/swiglu_quant/swiglu_quant_ops_runner.h similarity index 100% rename from src/ops_infer/swiglu_quant/swiglu_quant_ops_runner.h rename to src/ops/ops_infer/swiglu_quant/swiglu_quant_ops_runner.h diff --git a/src/ops_infer/topk_topp_sampling/topk_topp_sampling_operation.cpp b/src/ops/ops_infer/topk_topp_sampling/topk_topp_sampling_operation.cpp similarity index 100% rename from src/ops_infer/topk_topp_sampling/topk_topp_sampling_operation.cpp rename to src/ops/ops_infer/topk_topp_sampling/topk_topp_sampling_operation.cpp diff --git a/src/ops_infer/topk_topp_sampling/topk_topp_sampling_operation.h b/src/ops/ops_infer/topk_topp_sampling/topk_topp_sampling_operation.h similarity index 100% rename from src/ops_infer/topk_topp_sampling/topk_topp_sampling_operation.h rename to src/ops/ops_infer/topk_topp_sampling/topk_topp_sampling_operation.h diff --git a/src/ops_infer/topk_topp_sampling/topk_topp_sampling_ops_runner.cpp b/src/ops/ops_infer/topk_topp_sampling/topk_topp_sampling_ops_runner.cpp similarity index 100% rename from src/ops_infer/topk_topp_sampling/topk_topp_sampling_ops_runner.cpp rename to src/ops/ops_infer/topk_topp_sampling/topk_topp_sampling_ops_runner.cpp diff --git a/src/ops_infer/topk_topp_sampling/topk_topp_sampling_ops_runner.h b/src/ops/ops_infer/topk_topp_sampling/topk_topp_sampling_ops_runner.h similarity index 100% rename from 
src/ops_infer/topk_topp_sampling/topk_topp_sampling_ops_runner.h rename to src/ops/ops_infer/topk_topp_sampling/topk_topp_sampling_ops_runner.h diff --git a/src/ops_infer/transdata/transdata_operation.cpp b/src/ops/ops_infer/transdata/transdata_operation.cpp similarity index 100% rename from src/ops_infer/transdata/transdata_operation.cpp rename to src/ops/ops_infer/transdata/transdata_operation.cpp diff --git a/src/ops_infer/transdata/transdata_operation.h b/src/ops/ops_infer/transdata/transdata_operation.h similarity index 100% rename from src/ops_infer/transdata/transdata_operation.h rename to src/ops/ops_infer/transdata/transdata_operation.h diff --git a/src/ops_infer/transdata/transdata_ops_runner.cpp b/src/ops/ops_infer/transdata/transdata_ops_runner.cpp similarity index 100% rename from src/ops_infer/transdata/transdata_ops_runner.cpp rename to src/ops/ops_infer/transdata/transdata_ops_runner.cpp diff --git a/src/ops_infer/transdata/transdata_ops_runner.h b/src/ops/ops_infer/transdata/transdata_ops_runner.h similarity index 100% rename from src/ops_infer/transdata/transdata_ops_runner.h rename to src/ops/ops_infer/transdata/transdata_ops_runner.h diff --git a/src/ops_infer/transpose/transpose_operation.cpp b/src/ops/ops_infer/transpose/transpose_operation.cpp similarity index 100% rename from src/ops_infer/transpose/transpose_operation.cpp rename to src/ops/ops_infer/transpose/transpose_operation.cpp diff --git a/src/ops_infer/transpose/transpose_operation.h b/src/ops/ops_infer/transpose/transpose_operation.h similarity index 100% rename from src/ops_infer/transpose/transpose_operation.h rename to src/ops/ops_infer/transpose/transpose_operation.h diff --git a/src/ops_infer/transpose/transpose_ops_runner.cpp b/src/ops/ops_infer/transpose/transpose_ops_runner.cpp similarity index 100% rename from src/ops_infer/transpose/transpose_ops_runner.cpp rename to src/ops/ops_infer/transpose/transpose_ops_runner.cpp diff --git 
a/src/ops_infer/transpose/transpose_ops_runner.h b/src/ops/ops_infer/transpose/transpose_ops_runner.h similarity index 100% rename from src/ops_infer/transpose/transpose_ops_runner.h rename to src/ops/ops_infer/transpose/transpose_ops_runner.h diff --git a/src/ops_infer/unpad/unpad_operation.cpp b/src/ops/ops_infer/unpad/unpad_operation.cpp similarity index 100% rename from src/ops_infer/unpad/unpad_operation.cpp rename to src/ops/ops_infer/unpad/unpad_operation.cpp diff --git a/src/ops_infer/unpad/unpad_operation.h b/src/ops/ops_infer/unpad/unpad_operation.h similarity index 100% rename from src/ops_infer/unpad/unpad_operation.h rename to src/ops/ops_infer/unpad/unpad_operation.h diff --git a/src/ops_infer/unpad/unpad_ops_runner.cpp b/src/ops/ops_infer/unpad/unpad_ops_runner.cpp similarity index 100% rename from src/ops_infer/unpad/unpad_ops_runner.cpp rename to src/ops/ops_infer/unpad/unpad_ops_runner.cpp diff --git a/src/ops_infer/unpad/unpad_ops_runner.h b/src/ops/ops_infer/unpad/unpad_ops_runner.h similarity index 100% rename from src/ops_infer/unpad/unpad_ops_runner.h rename to src/ops/ops_infer/unpad/unpad_ops_runner.h diff --git a/src/ops_infer/where/where_operation.cpp b/src/ops/ops_infer/where/where_operation.cpp similarity index 100% rename from src/ops_infer/where/where_operation.cpp rename to src/ops/ops_infer/where/where_operation.cpp diff --git a/src/ops_infer/where/where_operation.h b/src/ops/ops_infer/where/where_operation.h similarity index 100% rename from src/ops_infer/where/where_operation.h rename to src/ops/ops_infer/where/where_operation.h diff --git a/src/ops_infer/where/where_ops_runner.cpp b/src/ops/ops_infer/where/where_ops_runner.cpp similarity index 100% rename from src/ops_infer/where/where_ops_runner.cpp rename to src/ops/ops_infer/where/where_ops_runner.cpp diff --git a/src/ops_infer/where/where_ops_runner.h b/src/ops/ops_infer/where/where_ops_runner.h similarity index 100% rename from src/ops_infer/where/where_ops_runner.h rename 
to src/ops/ops_infer/where/where_ops_runner.h diff --git a/src/ops_train/fast_soft_max/fastsoftmax_operation.cpp b/src/ops/ops_train/fast_soft_max/fastsoftmax_operation.cpp similarity index 100% rename from src/ops_train/fast_soft_max/fastsoftmax_operation.cpp rename to src/ops/ops_train/fast_soft_max/fastsoftmax_operation.cpp diff --git a/src/ops_train/fast_soft_max/fastsoftmax_operation.h b/src/ops/ops_train/fast_soft_max/fastsoftmax_operation.h similarity index 100% rename from src/ops_train/fast_soft_max/fastsoftmax_operation.h rename to src/ops/ops_train/fast_soft_max/fastsoftmax_operation.h diff --git a/src/ops_train/fast_soft_max/fastsoftmax_ops_runner.cpp b/src/ops/ops_train/fast_soft_max/fastsoftmax_ops_runner.cpp similarity index 100% rename from src/ops_train/fast_soft_max/fastsoftmax_ops_runner.cpp rename to src/ops/ops_train/fast_soft_max/fastsoftmax_ops_runner.cpp diff --git a/src/ops_train/fast_soft_max/fastsoftmax_ops_runner.h b/src/ops/ops_train/fast_soft_max/fastsoftmax_ops_runner.h similarity index 100% rename from src/ops_train/fast_soft_max/fastsoftmax_ops_runner.h rename to src/ops/ops_train/fast_soft_max/fastsoftmax_ops_runner.h diff --git a/src/ops_train/fast_soft_max_grad/fastsoftmaxgrad_operation.cpp b/src/ops/ops_train/fast_soft_max_grad/fastsoftmaxgrad_operation.cpp similarity index 100% rename from src/ops_train/fast_soft_max_grad/fastsoftmaxgrad_operation.cpp rename to src/ops/ops_train/fast_soft_max_grad/fastsoftmaxgrad_operation.cpp diff --git a/src/ops_train/fast_soft_max_grad/fastsoftmaxgrad_operation.h b/src/ops/ops_train/fast_soft_max_grad/fastsoftmaxgrad_operation.h similarity index 100% rename from src/ops_train/fast_soft_max_grad/fastsoftmaxgrad_operation.h rename to src/ops/ops_train/fast_soft_max_grad/fastsoftmaxgrad_operation.h diff --git a/src/ops_train/fast_soft_max_grad/fastsoftmaxgrad_ops_runner.cpp b/src/ops/ops_train/fast_soft_max_grad/fastsoftmaxgrad_ops_runner.cpp similarity index 100% rename from 
src/ops_train/fast_soft_max_grad/fastsoftmaxgrad_ops_runner.cpp rename to src/ops/ops_train/fast_soft_max_grad/fastsoftmaxgrad_ops_runner.cpp diff --git a/src/ops_train/fast_soft_max_grad/fastsoftmaxgrad_ops_runner.h b/src/ops/ops_train/fast_soft_max_grad/fastsoftmaxgrad_ops_runner.h similarity index 100% rename from src/ops_train/fast_soft_max_grad/fastsoftmaxgrad_ops_runner.h rename to src/ops/ops_train/fast_soft_max_grad/fastsoftmaxgrad_ops_runner.h diff --git a/src/ops_train/gen_attention_mask/genattentionmask_operation.cpp b/src/ops/ops_train/gen_attention_mask/genattentionmask_operation.cpp similarity index 100% rename from src/ops_train/gen_attention_mask/genattentionmask_operation.cpp rename to src/ops/ops_train/gen_attention_mask/genattentionmask_operation.cpp diff --git a/src/ops_train/gen_attention_mask/genattentionmask_operation.h b/src/ops/ops_train/gen_attention_mask/genattentionmask_operation.h similarity index 100% rename from src/ops_train/gen_attention_mask/genattentionmask_operation.h rename to src/ops/ops_train/gen_attention_mask/genattentionmask_operation.h diff --git a/src/ops_train/gen_attention_mask/genattentionmask_ops_runner.cpp b/src/ops/ops_train/gen_attention_mask/genattentionmask_ops_runner.cpp similarity index 100% rename from src/ops_train/gen_attention_mask/genattentionmask_ops_runner.cpp rename to src/ops/ops_train/gen_attention_mask/genattentionmask_ops_runner.cpp diff --git a/src/ops_train/gen_attention_mask/genattentionmask_ops_runner.h b/src/ops/ops_train/gen_attention_mask/genattentionmask_ops_runner.h similarity index 100% rename from src/ops_train/gen_attention_mask/genattentionmask_ops_runner.h rename to src/ops/ops_train/gen_attention_mask/genattentionmask_ops_runner.h diff --git a/src/ops_train/laser_attention/laser_attention_operation.cpp b/src/ops/ops_train/laser_attention/laser_attention_operation.cpp similarity index 100% rename from src/ops_train/laser_attention/laser_attention_operation.cpp rename to 
src/ops/ops_train/laser_attention/laser_attention_operation.cpp diff --git a/src/ops_train/laser_attention/laser_attention_operation.h b/src/ops/ops_train/laser_attention/laser_attention_operation.h similarity index 100% rename from src/ops_train/laser_attention/laser_attention_operation.h rename to src/ops/ops_train/laser_attention/laser_attention_operation.h diff --git a/src/ops_train/laser_attention/laser_attention_ops_runner.cpp b/src/ops/ops_train/laser_attention/laser_attention_ops_runner.cpp similarity index 100% rename from src/ops_train/laser_attention/laser_attention_ops_runner.cpp rename to src/ops/ops_train/laser_attention/laser_attention_ops_runner.cpp diff --git a/src/ops_train/laser_attention/laser_attention_ops_runner.h b/src/ops/ops_train/laser_attention/laser_attention_ops_runner.h similarity index 100% rename from src/ops_train/laser_attention/laser_attention_ops_runner.h rename to src/ops/ops_train/laser_attention/laser_attention_ops_runner.h diff --git a/src/ops_train/laser_attention_grad/laser_attention_grad_operation.cpp b/src/ops/ops_train/laser_attention_grad/laser_attention_grad_operation.cpp similarity index 100% rename from src/ops_train/laser_attention_grad/laser_attention_grad_operation.cpp rename to src/ops/ops_train/laser_attention_grad/laser_attention_grad_operation.cpp diff --git a/src/ops_train/laser_attention_grad/laser_attention_grad_operation.h b/src/ops/ops_train/laser_attention_grad/laser_attention_grad_operation.h similarity index 100% rename from src/ops_train/laser_attention_grad/laser_attention_grad_operation.h rename to src/ops/ops_train/laser_attention_grad/laser_attention_grad_operation.h diff --git a/src/ops_train/laser_attention_grad/laser_attention_grad_ops_runner.cpp b/src/ops/ops_train/laser_attention_grad/laser_attention_grad_ops_runner.cpp similarity index 100% rename from src/ops_train/laser_attention_grad/laser_attention_grad_ops_runner.cpp rename to 
src/ops/ops_train/laser_attention_grad/laser_attention_grad_ops_runner.cpp diff --git a/src/ops_train/laser_attention_grad/laser_attention_grad_ops_runner.h b/src/ops/ops_train/laser_attention_grad/laser_attention_grad_ops_runner.h similarity index 100% rename from src/ops_train/laser_attention_grad/laser_attention_grad_ops_runner.h rename to src/ops/ops_train/laser_attention_grad/laser_attention_grad_ops_runner.h diff --git a/src/ops_train/pad_with_hidden_state/pad_with_hidden_state_operation.cpp b/src/ops/ops_train/pad_with_hidden_state/pad_with_hidden_state_operation.cpp similarity index 100% rename from src/ops_train/pad_with_hidden_state/pad_with_hidden_state_operation.cpp rename to src/ops/ops_train/pad_with_hidden_state/pad_with_hidden_state_operation.cpp diff --git a/src/ops_train/pad_with_hidden_state/pad_with_hidden_state_operation.h b/src/ops/ops_train/pad_with_hidden_state/pad_with_hidden_state_operation.h similarity index 100% rename from src/ops_train/pad_with_hidden_state/pad_with_hidden_state_operation.h rename to src/ops/ops_train/pad_with_hidden_state/pad_with_hidden_state_operation.h diff --git a/src/ops_train/pad_with_hidden_state/pad_with_hidden_state_ops_runner.cpp b/src/ops/ops_train/pad_with_hidden_state/pad_with_hidden_state_ops_runner.cpp similarity index 100% rename from src/ops_train/pad_with_hidden_state/pad_with_hidden_state_ops_runner.cpp rename to src/ops/ops_train/pad_with_hidden_state/pad_with_hidden_state_ops_runner.cpp diff --git a/src/ops_train/pad_with_hidden_state/pad_with_hidden_state_ops_runner.h b/src/ops/ops_train/pad_with_hidden_state/pad_with_hidden_state_ops_runner.h similarity index 100% rename from src/ops_train/pad_with_hidden_state/pad_with_hidden_state_ops_runner.h rename to src/ops/ops_train/pad_with_hidden_state/pad_with_hidden_state_ops_runner.h diff --git a/src/ops_train/rms_norm_backward/rms_norm_backward_operation.cpp b/src/ops/ops_train/rms_norm_backward/rms_norm_backward_operation.cpp similarity index 100% 
rename from src/ops_train/rms_norm_backward/rms_norm_backward_operation.cpp rename to src/ops/ops_train/rms_norm_backward/rms_norm_backward_operation.cpp diff --git a/src/ops_train/rms_norm_backward/rms_norm_backward_operation.h b/src/ops/ops_train/rms_norm_backward/rms_norm_backward_operation.h similarity index 100% rename from src/ops_train/rms_norm_backward/rms_norm_backward_operation.h rename to src/ops/ops_train/rms_norm_backward/rms_norm_backward_operation.h diff --git a/src/ops_train/rms_norm_backward/rms_norm_backward_ops_runner.cpp b/src/ops/ops_train/rms_norm_backward/rms_norm_backward_ops_runner.cpp similarity index 100% rename from src/ops_train/rms_norm_backward/rms_norm_backward_ops_runner.cpp rename to src/ops/ops_train/rms_norm_backward/rms_norm_backward_ops_runner.cpp diff --git a/src/ops_train/rms_norm_backward/rms_norm_backward_ops_runner.h b/src/ops/ops_train/rms_norm_backward/rms_norm_backward_ops_runner.h similarity index 100% rename from src/ops_train/rms_norm_backward/rms_norm_backward_ops_runner.h rename to src/ops/ops_train/rms_norm_backward/rms_norm_backward_ops_runner.h diff --git a/src/ops_train/rope_grad/rope_grad_operation.cpp b/src/ops/ops_train/rope_grad/rope_grad_operation.cpp similarity index 100% rename from src/ops_train/rope_grad/rope_grad_operation.cpp rename to src/ops/ops_train/rope_grad/rope_grad_operation.cpp diff --git a/src/ops_train/rope_grad/rope_grad_operation.h b/src/ops/ops_train/rope_grad/rope_grad_operation.h similarity index 100% rename from src/ops_train/rope_grad/rope_grad_operation.h rename to src/ops/ops_train/rope_grad/rope_grad_operation.h diff --git a/src/ops_train/rope_grad/rope_grad_ops_runner.cpp b/src/ops/ops_train/rope_grad/rope_grad_ops_runner.cpp similarity index 100% rename from src/ops_train/rope_grad/rope_grad_ops_runner.cpp rename to src/ops/ops_train/rope_grad/rope_grad_ops_runner.cpp diff --git a/src/ops_train/rope_grad/rope_grad_ops_runner.h 
b/src/ops/ops_train/rope_grad/rope_grad_ops_runner.h similarity index 100% rename from src/ops_train/rope_grad/rope_grad_ops_runner.h rename to src/ops/ops_train/rope_grad/rope_grad_ops_runner.h diff --git a/src/ops_train/strided_batch_matmul/stridedbatchmatmul_operation.cpp b/src/ops/ops_train/strided_batch_matmul/stridedbatchmatmul_operation.cpp similarity index 100% rename from src/ops_train/strided_batch_matmul/stridedbatchmatmul_operation.cpp rename to src/ops/ops_train/strided_batch_matmul/stridedbatchmatmul_operation.cpp diff --git a/src/ops_train/strided_batch_matmul/stridedbatchmatmul_operation.h b/src/ops/ops_train/strided_batch_matmul/stridedbatchmatmul_operation.h similarity index 100% rename from src/ops_train/strided_batch_matmul/stridedbatchmatmul_operation.h rename to src/ops/ops_train/strided_batch_matmul/stridedbatchmatmul_operation.h diff --git a/src/ops_train/strided_batch_matmul/stridedbatchmatmul_ops_runner.cpp b/src/ops/ops_train/strided_batch_matmul/stridedbatchmatmul_ops_runner.cpp similarity index 100% rename from src/ops_train/strided_batch_matmul/stridedbatchmatmul_ops_runner.cpp rename to src/ops/ops_train/strided_batch_matmul/stridedbatchmatmul_ops_runner.cpp diff --git a/src/ops_train/strided_batch_matmul/stridedbatchmatmul_ops_runner.h b/src/ops/ops_train/strided_batch_matmul/stridedbatchmatmul_ops_runner.h similarity index 100% rename from src/ops_train/strided_batch_matmul/stridedbatchmatmul_ops_runner.h rename to src/ops/ops_train/strided_batch_matmul/stridedbatchmatmul_ops_runner.h diff --git a/src/ops_train/unpad_with_hidden_state/unpad_with_hidden_state_operation.cpp b/src/ops/ops_train/unpad_with_hidden_state/unpad_with_hidden_state_operation.cpp similarity index 100% rename from src/ops_train/unpad_with_hidden_state/unpad_with_hidden_state_operation.cpp rename to src/ops/ops_train/unpad_with_hidden_state/unpad_with_hidden_state_operation.cpp diff --git 
a/src/ops_train/unpad_with_hidden_state/unpad_with_hidden_state_operation.h b/src/ops/ops_train/unpad_with_hidden_state/unpad_with_hidden_state_operation.h similarity index 100% rename from src/ops_train/unpad_with_hidden_state/unpad_with_hidden_state_operation.h rename to src/ops/ops_train/unpad_with_hidden_state/unpad_with_hidden_state_operation.h diff --git a/src/ops_train/unpad_with_hidden_state/unpad_with_hidden_state_ops_runner.cpp b/src/ops/ops_train/unpad_with_hidden_state/unpad_with_hidden_state_ops_runner.cpp similarity index 100% rename from src/ops_train/unpad_with_hidden_state/unpad_with_hidden_state_ops_runner.cpp rename to src/ops/ops_train/unpad_with_hidden_state/unpad_with_hidden_state_ops_runner.cpp diff --git a/src/ops_train/unpad_with_hidden_state/unpad_with_hidden_state_ops_runner.h b/src/ops/ops_train/unpad_with_hidden_state/unpad_with_hidden_state_ops_runner.h similarity index 100% rename from src/ops_train/unpad_with_hidden_state/unpad_with_hidden_state_ops_runner.h rename to src/ops/ops_train/unpad_with_hidden_state/unpad_with_hidden_state_ops_runner.h -- Gitee From 5b0b600ab317cdee116f2ebc52227f4426ff7738 Mon Sep 17 00:00:00 2001 From: zhuhaozhecool Date: Mon, 22 Sep 2025 20:11:59 +0800 Subject: [PATCH 46/94] fix pp matmul I8 Kernel memory illegal read --- .../pp_matmul_i8_kernel/op_kernel/pp_matmul.cce | 9 ++++++++- .../op_kernel/pp_matmul_bf16.cce | 7 ++++--- .../op_kernel/pp_matmul_i8_weight_nz.cce | 13 ++++++++++--- .../op_kernel/pp_matmul_nz_m300.cce | 13 ++++++++++--- 4 files changed, 32 insertions(+), 10 deletions(-) diff --git a/src/kernels/kernels/matmul/pp_matmul_i8_kernel/op_kernel/pp_matmul.cce b/src/kernels/kernels/matmul/pp_matmul_i8_kernel/op_kernel/pp_matmul.cce index 1cd15bfb..1beff8e7 100644 --- a/src/kernels/kernels/matmul/pp_matmul_i8_kernel/op_kernel/pp_matmul.cce +++ b/src/kernels/kernels/matmul/pp_matmul_i8_kernel/op_kernel/pp_matmul.cce @@ -24,6 +24,7 @@ constexpr uint32_t L0AB_PINGPONG_BUFFER_LEN_FP16 = 16384; // 
32 KB constexpr uint32_t L0AB_PINGPONG_BUFFER_LEN_INT8 = 32768; // 32 KB +constexpr uint32_t BLOCK_SIZE_8 = 8; constexpr uint32_t BLOCK_SIZE_16 = 16; constexpr uint32_t BLOCK_SIZE_32 = 32; constexpr uint32_t CUBE_MATRIX_SIZE_256 = 256; // 16 * 16 @@ -55,6 +56,11 @@ __aicore__ __force_inline__ uint32_t RoundUp16(const uint32_t val) return (val + BLOCK_SIZE_16 - 1) / BLOCK_SIZE_16 * BLOCK_SIZE_16; } +__aicore__ __force_inline__ uint32_t RoundUp8(const uint32_t val) +{ + return (val + BLOCK_SIZE_8 - 1) / BLOCK_SIZE_8 * BLOCK_SIZE_8; +} + template class PpMatmulInt { @@ -163,6 +169,7 @@ public: uint64_t offset_c = batch_idx * m * n + m_idx * m0 * n + n_idx * n0; uint32_t m_actual = (m_idx == (m_loop - 1)) ? (m - m_idx * m0) : m0; uint32_t n_actual = (n_idx == (n_loop - 1)) ? (n - n_idx * n0) : n0; + uint32_t bias_n_round = RoundUp8(n_actual); uint32_t m_round = 0; uint32_t n_round = 0; uint64_t shuffle_k = en_shuffle_k ? core_idx % k_loop : 0; @@ -212,7 +219,7 @@ public: WAIT_FLAG(MTE1, MTE2, EVENT_ID7); gm_to_l1(bias_l1, // dst gm_bias[offset_bias], // src - 1, RoundUp16(1), 1, n_round, + 1, RoundUp16(1), 1, bias_n_round, RoundUp16(n_round), n_round); SET_FLAG(MTE2, MTE1, EVENT_ID6); } diff --git a/src/kernels/kernels/matmul/pp_matmul_i8_kernel/op_kernel/pp_matmul_bf16.cce b/src/kernels/kernels/matmul/pp_matmul_i8_kernel/op_kernel/pp_matmul_bf16.cce index 85d802c1..44e2475b 100644 --- a/src/kernels/kernels/matmul/pp_matmul_i8_kernel/op_kernel/pp_matmul_bf16.cce +++ b/src/kernels/kernels/matmul/pp_matmul_i8_kernel/op_kernel/pp_matmul_bf16.cce @@ -180,6 +180,7 @@ public: uint64_t offset_c = batch_idx * m * n + m_idx * m0 * n + n_idx * n0; uint32_t m_actual = (m_idx == (m_loop - 1)) ? (m - m_idx * m0) : m0; uint32_t n_actual = (n_idx == (n_loop - 1)) ? (n - n_idx * n0) : n0; + uint32_t bias_n_round = RoundUp8(n_actual); uint32_t m_round = 0; uint32_t n_round = 0; uint64_t shuffle_k = en_shuffle_k ? 
core_idx % k_loop : 0; @@ -240,7 +241,7 @@ public: WAIT_FLAG(MTE1, MTE2, EVENT_ID7); gm_to_l1(bias_l1, // dst gm_bias[offset_bias], // src - 1, RoundUp16(1), 1, n_round, + 1, RoundUp16(1), 1, bias_n_round, RoundUp16(n_round), n_round); SET_FLAG(MTE2, MTE1, EVENT_ID6); } @@ -279,7 +280,7 @@ public: l1_buf_b, gm_b[offset_b], k_actual, k_round, k, n_actual, n_round, n); } else { gm_to_l1( - l1_buf_b, gm_b[offset_b], k_actual, k_round, RoundUp16(k), n_actual, n_round, RoundUp32(n)); + l1_buf_b, gm_b[offset_b], k_actual, RoundUp16(k_actual), RoundUp16(k), n_actual, n_round, RoundUp32(n)); } } SET_FLAG(MTE2, MTE1, event_id + CONST_2); @@ -372,7 +373,7 @@ public: gm_to_l1(l1_buf_b_next, gm_b[offset_b_next], k_actual_next, - k_round_next, + RoundUp16(k_actual_next), RoundUp16(k), n_actual, n_round, diff --git a/src/kernels/kernels/matmul/pp_matmul_i8_kernel/op_kernel/pp_matmul_i8_weight_nz.cce b/src/kernels/kernels/matmul/pp_matmul_i8_kernel/op_kernel/pp_matmul_i8_weight_nz.cce index d3d58a48..a498351b 100644 --- a/src/kernels/kernels/matmul/pp_matmul_i8_kernel/op_kernel/pp_matmul_i8_weight_nz.cce +++ b/src/kernels/kernels/matmul/pp_matmul_i8_kernel/op_kernel/pp_matmul_i8_weight_nz.cce @@ -23,6 +23,7 @@ constexpr uint32_t L0AB_PINGPONG_BUFFER_LEN_FP16 = 16384; // 32 KB constexpr uint32_t L0AB_PINGPONG_BUFFER_LEN_INT8 = 32768; // 32 KB +constexpr uint32_t BLOCK_SIZE_8 = 8; constexpr uint32_t BLOCK_SIZE_16 = 16; constexpr uint32_t BLOCK_SIZE_32 = 32; constexpr uint32_t CUBE_MATRIX_SIZE_256 = 256; // 16 * 16 @@ -70,6 +71,11 @@ __aicore__ __force_inline__ uint32_t RoundUp16(const uint32_t val) return (val + BLOCK_SIZE_16 - 1) / BLOCK_SIZE_16 * BLOCK_SIZE_16; } +__aicore__ __force_inline__ uint32_t RoundUp8(const uint32_t val) +{ + return (val + BLOCK_SIZE_8 - 1) / BLOCK_SIZE_8 * BLOCK_SIZE_8; +} + template ( - l1_buf_b, gm_b[offset_b], k_actual, k_round, RoundUp16(k), n_actual, n_round, RoundUp32(n)); + l1_buf_b, gm_b[offset_b], k_actual, RoundUp16(k_actual), RoundUp16(k), 
n_actual, n_round, RoundUp32(n)); } SET_FLAG(MTE2, MTE1, event_id + CONST_2); @@ -383,7 +390,7 @@ public: gm_to_l1(l1_buf_b_next, gm_b[offset_b_next], k_actual_next, - k_round_next, + RoundUp16(k_actual_next), RoundUp16(k), n_actual, n_round, diff --git a/src/kernels/kernels/matmul/pp_matmul_i8_kernel/op_kernel/pp_matmul_nz_m300.cce b/src/kernels/kernels/matmul/pp_matmul_i8_kernel/op_kernel/pp_matmul_nz_m300.cce index c21ac739..c373e68f 100644 --- a/src/kernels/kernels/matmul/pp_matmul_i8_kernel/op_kernel/pp_matmul_nz_m300.cce +++ b/src/kernels/kernels/matmul/pp_matmul_i8_kernel/op_kernel/pp_matmul_nz_m300.cce @@ -24,6 +24,7 @@ constexpr uint32_t L0AB_PINGPONG_BUFFER_LEN_FP16 = 16384; // 32 KB constexpr uint32_t L0AB_PINGPONG_BUFFER_LEN_INT8 = 32768; // 32 KB +constexpr uint32_t BLOCK_SIZE_8 = 8; constexpr uint32_t BLOCK_SIZE_16 = 16; constexpr uint32_t BLOCK_SIZE_32 = 32; constexpr uint32_t CUBE_MATRIX_SIZE_256 = 256; // 16 * 16 @@ -55,6 +56,11 @@ __aicore__ __force_inline__ uint32_t RoundUp16(const uint32_t val) return (val + BLOCK_SIZE_16 - 1) / BLOCK_SIZE_16 * BLOCK_SIZE_16; } +__aicore__ __force_inline__ uint32_t RoundUp8(const uint32_t val) +{ + return (val + BLOCK_SIZE_8 - 1) / BLOCK_SIZE_8 * BLOCK_SIZE_8; +} + template class PpMatmulInt { @@ -163,6 +169,7 @@ public: uint64_t offset_c = batch_idx * m * n + m_idx * m0 * n + n_idx * n0; uint32_t m_actual = (m_idx == (m_loop - 1)) ? (m - m_idx * m0) : m0; uint32_t n_actual = (n_idx == (n_loop - 1)) ? (n - n_idx * n0) : n0; + uint32_t bias_n_round = RoundUp8(n_actual); uint32_t m_round = 0; uint32_t n_round = 0; uint64_t shuffle_k = en_shuffle_k ? 
core_idx % k_loop : 0; @@ -212,7 +219,7 @@ public: WAIT_FLAG(MTE1, MTE2, EVENT_ID7); gm_to_l1(bias_l1, // dst gm_bias[offset_bias], // src - 1, RoundUp16(1), 1, n_round, + 1, RoundUp16(1), 1, bias_n_round, RoundUp16(n_round), n_round); SET_FLAG(MTE2, MTE1, EVENT_ID6); } @@ -242,7 +249,7 @@ public: l1_buf_b, gm_b[offset_b], n_actual, n_round, RoundUp16(n), k_actual, k_round, RoundUp32(k)); } else { gm_to_l1( - l1_buf_b, gm_b[offset_b], k_actual, k_round, RoundUp16(k), n_actual, n_round, RoundUp32(n)); + l1_buf_b, gm_b[offset_b], k_actual, RoundUp16(k_actual), RoundUp16(k), n_actual, n_round, RoundUp32(n)); } SET_FLAG(MTE2, MTE1, event_id + CONST_2); @@ -323,7 +330,7 @@ public: l1_buf_b_next, gm_b[offset_b_next], n_actual, n_round, RoundUp16(n), k_actual_next, k_round_next, RoundUp32(k)); } else { gm_to_l1( - l1_buf_b_next, gm_b[offset_b_next], k_actual_next, k_round_next, RoundUp16(k), n_actual, n_round, RoundUp32(n)); + l1_buf_b_next, gm_b[offset_b_next], k_actual_next, RoundUp16(k_actual_next), RoundUp16(k), n_actual, n_round, RoundUp32(n)); } SET_FLAG(MTE2, MTE1, event_id_next + CONST_2); } -- Gitee From 1ae0cd8240fb4e4559dbcdaae09fb44d2616abe4 Mon Sep 17 00:00:00 2001 From: He Changcheng Date: Mon, 22 Sep 2025 20:18:11 +0800 Subject: [PATCH 47/94] fix --- src/kernels/lcal/src/lccl.cpp | 36 ++++++++++++++++++++--------------- 1 file changed, 21 insertions(+), 15 deletions(-) diff --git a/src/kernels/lcal/src/lccl.cpp b/src/kernels/lcal/src/lccl.cpp index 8d515fb2..bf79bf66 100644 --- a/src/kernels/lcal/src/lccl.cpp +++ b/src/kernels/lcal/src/lccl.cpp @@ -31,39 +31,45 @@ using AclrtGetResInCurrentThreadFunc = int(*)(int, uint32_t*); int GetAclResInCurThread(int type, uint32_t *resource) { - // 静态变量:保存函数指针和库句柄 - static std::unique_ptr mkiDl; - static AclrtGetResInCurrentThreadFunc aclrtGetResInCurrentThread = nullptr; static std::mutex localMutex; // 线程安全锁 - - std::lock_guard lock(localMutex); // 加锁 + static AclrtGetResInCurrentThreadFunc aclrtGetResInCurrentThread 
= nullptr; + static int res = -1; // 首次调用时初始化 - if (!mkiDl) { + if (res == -1) { + std::lock_guard lock(localMutex); // 加锁 + std::unique_ptr mkiDl; std::string libPath = std::string(Mki::GetEnv("ASCEND_HOME_PATH")) + "/runtime/lib64/libascendcl.so"; mkiDl = std::make_unique(libPath, false); if (!mkiDl->IsValid()) { // 检查库是否加载成功 - MKI_LOG(WARN) << "Failed to load libascendcl.so!"; - return LCAL_ERROR_NOT_FOUND; + MKI_LOG(ERROR) << "Failed to load libascendcl.so!"; + return LCAL_ERROR_INTERNAL; } aclrtGetResInCurrentThread = (AclrtGetResInCurrentThreadFunc)mkiDl->GetSymbol("aclrtGetResInCurrentThread"); if (aclrtGetResInCurrentThread == nullptr) { - MKI_LOG(WARN) << "Failed to get acl function!"; + MKI_LOG(WARN) << "Failed to get aclrtGetResInCurrentThread function!"; + res = LCAL_ERROR_NOT_FOUND; return LCAL_ERROR_NOT_FOUND; } + res = LCAL_SUCCESS; MKI_LOG(DEBUG) << "Successfully loaded libascendcl.so and resolved aclrtGetResInCurrentThread"; } // 调用函数 - int getResRet = aclrtGetResInCurrentThread(type, resource); - if (getResRet != ACL_SUCCESS) { - MKI_LOG(ERROR) << "Failed to get resource in current thread for type:" << type << " err:" << getResRet; - return LCAL_ERROR_INTERNAL; + if (res == LCAL_SUCCESS) { + int getResRet = aclrtGetResInCurrentThread(type, resource); + if (getResRet != ACL_SUCCESS) { + MKI_LOG(ERROR) << "Failed to get resource in current thread for type:" << type << " err:" << getResRet; + return LCAL_ERROR_INTERNAL; + } else { + MKI_LOG(DEBUG) << "Get resource in current thread for type:" << type << " resource:" << *resource; + return LCAL_SUCCESS; + } } else { - MKI_LOG(DEBUG) << "Get resource in current thread for type:" << type << " resource:" << *resource; - return LCAL_SUCCESS; + return res; } + } uint32_t GetLocalReduceBlockDum(int64_t dataSize) -- Gitee From c08b4e7b6a31704398a3812057ef0755f5f63466 Mon Sep 17 00:00:00 2001 From: guo-jiong Date: Mon, 22 Sep 2025 20:20:45 +0800 Subject: [PATCH 48/94] move configs dir --- 
configs/build_config.json | 8 -------- scripts/build_util.py | 2 +- scripts/update_tbe_tactic_json.py | 2 +- src/kernels/configs/build_config.json | 2 +- .../kernels/configs}/mixops/tbe_tactic_info.ini | 0 {configs => src/kernels/configs}/ops/tbe_tactic_info.ini | 0 src/kernels/tbe_adapter/CMakeLists.txt | 6 +++--- 7 files changed, 6 insertions(+), 14 deletions(-) delete mode 100644 configs/build_config.json rename {configs => src/kernels/configs}/mixops/tbe_tactic_info.ini (100%) rename {configs => src/kernels/configs}/ops/tbe_tactic_info.ini (100%) diff --git a/configs/build_config.json b/configs/build_config.json deleted file mode 100644 index 8f8b66dc..00000000 --- a/configs/build_config.json +++ /dev/null @@ -1,8 +0,0 @@ -{ - "targets": { - "ascend310b": true, - "ascend310p": true, - "ascend910b": true, - "ascend910": true - } -} diff --git a/scripts/build_util.py b/scripts/build_util.py index 424038f5..fc4f55a8 100644 --- a/scripts/build_util.py +++ b/scripts/build_util.py @@ -23,7 +23,7 @@ def get_build_target_list(): if usr_config_file_path == '': script_file_path = os.path.realpath(__file__) build_config_json_file_path = os.path.join(os.path.dirname( - script_file_path), "../configs/build_config.json") + script_file_path), "../src/kernels/configs/build_config.json") else: build_config_json_file_path = usr_config_file_path device_list = [] diff --git a/scripts/update_tbe_tactic_json.py b/scripts/update_tbe_tactic_json.py index 965e4476..89395ea4 100644 --- a/scripts/update_tbe_tactic_json.py +++ b/scripts/update_tbe_tactic_json.py @@ -339,7 +339,7 @@ def write_tbe_tactic_json(input_args, json_paths_info): def main(): code_root_dir = get_code_root() - tactic_info_path = os.path.join(code_root_dir, "configs/ops/tbe_tactic_info.ini") + tactic_info_path = os.path.join(code_root_dir, "src/kernels/configs/ops/tbe_tactic_info.ini") build_cache_dir, _ = get_build_cache_path() tactic_json_path = os.path.join(build_cache_dir, "tbe_tactic_json.ini") diff --git 
a/src/kernels/configs/build_config.json b/src/kernels/configs/build_config.json index 73c00e47..6b8dcdad 100644 --- a/src/kernels/configs/build_config.json +++ b/src/kernels/configs/build_config.json @@ -5,4 +5,4 @@ "ascend910b": true, "ascend910": true } -} \ No newline at end of file +} diff --git a/configs/mixops/tbe_tactic_info.ini b/src/kernels/configs/mixops/tbe_tactic_info.ini similarity index 100% rename from configs/mixops/tbe_tactic_info.ini rename to src/kernels/configs/mixops/tbe_tactic_info.ini diff --git a/configs/ops/tbe_tactic_info.ini b/src/kernels/configs/ops/tbe_tactic_info.ini similarity index 100% rename from configs/ops/tbe_tactic_info.ini rename to src/kernels/configs/ops/tbe_tactic_info.ini diff --git a/src/kernels/tbe_adapter/CMakeLists.txt b/src/kernels/tbe_adapter/CMakeLists.txt index 0276e210..7251dbf9 100644 --- a/src/kernels/tbe_adapter/CMakeLists.txt +++ b/src/kernels/tbe_adapter/CMakeLists.txt @@ -565,7 +565,7 @@ set_source_files_properties( # MIX OPS file(MAKE_DIRECTORY ${CMAKE_BINARY_DIR}/mixops/) execute_process(COMMAND python3 ${PROJECT_SOURCE_DIR}/scripts/update_tbe_tactic_json.py - --src_ini_path ${PROJECT_SOURCE_DIR}/configs/mixops/tbe_tactic_info.ini + --src_ini_path ${PROJECT_SOURCE_DIR}/src/kernels/configs/mixops/tbe_tactic_info.ini --dst_ini_path ${CMAKE_BINARY_DIR}/mixops/tbe_tactic_json.ini OUTPUT_VARIABLE MIX_PYTHON_OUTPUT ERROR_VARIABLE RESULT_INFO @@ -580,7 +580,7 @@ set_source_files_properties(${MIX_REUSE_BINARY_LIST} PROPERTIES GENERATED TRUE) add_custom_command( OUTPUT ${MIX_REUSE_BINARY_LIST} ${CMAKE_BINARY_DIR}/mix_wait_flag.cpp - DEPENDS ${PROJECT_SOURCE_DIR}/configs/mixops/tbe_tactic_info.ini + DEPENDS ${PROJECT_SOURCE_DIR}/src/kernels/configs/mixops/tbe_tactic_info.ini COMMAND python3 ${MKI_PACKAGE_DIR}/scripts/build_util.py --binary_dir ${CMAKE_BINARY_DIR} --op_type tbe --tbe_ini_path ${CMAKE_BINARY_DIR}/mixops/tbe_tactic_json.ini COMMAND cmake -E sleep 10 @@ -610,7 +610,7 @@ 
set_source_files_properties(${OPS_REUSE_BINARY_LIST} PROPERTIES GENERATED TRUE) add_custom_command( OUTPUT ${OPS_REUSE_BINARY_LIST} ${CMAKE_BINARY_DIR}/ops_wait_flag.cpp - DEPENDS ${PROJECT_SOURCE_DIR}/configs/ops/tbe_tactic_info.ini + DEPENDS ${PROJECT_SOURCE_DIR}/src/kernels/configs/ops/tbe_tactic_info.ini COMMAND python3 ${MKI_PACKAGE_DIR}/scripts/build_util.py --binary_dir ${CMAKE_BINARY_DIR} --op_type tbe --tbe_ini_path ${CMAKE_BINARY_DIR}/tbe_tactic_json.ini COMMAND cmake -E sleep 10 -- Gitee From 935002b6b829d37726110af77611565e2c4ff7c1 Mon Sep 17 00:00:00 2001 From: huangxiaolan Date: Mon, 22 Sep 2025 20:24:58 +0800 Subject: [PATCH 49/94] recover operation_base --- src/atb/operation/operation_base.cpp | 6 +++--- src/atb/operation/operation_base.h | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/atb/operation/operation_base.cpp b/src/atb/operation/operation_base.cpp index 526b1be6..8668f9e5 100644 --- a/src/atb/operation/operation_base.cpp +++ b/src/atb/operation/operation_base.cpp @@ -1064,12 +1064,12 @@ Status OperationBase::GraphModeLaunch() } Status OperationBase::Execute(const VariantPack &variantPack, uint8_t *workspace, uint64_t workspaceSize, - Context &context) + Context *context) { const uint64_t beginTime = GetSingleton().GetProfilingLevel0Status() ? GetSingleton().ProfSysCycleTime() : 0; - ExecuteType executeType = context.GetExecuteType(); + ExecuteType executeType = context->GetExecuteType(); ProfilingFuncName profType = executeType == EXECUTE_NORMAL ? OPERATION_EXECUTE : (executeType == EXECUTE_PRELAUNCH ? 
OPERATION_PRELAUNCH : OPERATION_LAUNCH); @@ -1083,7 +1083,7 @@ Status OperationBase::Execute(const VariantPack &variantPack, uint8_t *workspace } Status st = NO_ERROR; if (executeType == EXECUTE_NORMAL || executeType == EXECUTE_PRELAUNCH) { - st = PreLaunch(variantPack, workspace, workspaceSize, &context); + st = PreLaunch(variantPack, workspace, workspaceSize, context); if (st != NO_ERROR) { ATB_LOG(ERROR) << GetLogPrefix() << "PreLaunch fail, error code: " << st; return st; diff --git a/src/atb/operation/operation_base.h b/src/atb/operation/operation_base.h index f0f5d791..309fc0bf 100644 --- a/src/atb/operation/operation_base.h +++ b/src/atb/operation/operation_base.h @@ -39,7 +39,7 @@ public: Status InferShape(const SVector &inTensorDescs, SVector &outTensorDescs) const override; Status Setup(const VariantPack &variantPack, uint64_t &workspaceSize, Context *context) override; Status Execute(const VariantPack &variantPack, uint8_t *workspace, uint64_t workspaceSize, - Context &context) override; + Context *context) override; Status SetOperationBaseIds(const std::vector &operationBaseIds, const int64_t nodeId); virtual nlohmann::json GetParamJson() const; const std::vector &GetOperationBaseIds(); -- Gitee From d6ae7ce0eb2d185187a56f7cf75bcd3a35c2f209 Mon Sep 17 00:00:00 2001 From: guanguan Date: Mon, 22 Sep 2025 20:26:35 +0800 Subject: [PATCH 50/94] fix error --- .../linear_parallel/linear_parallel_operation.cpp | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/src/ops_infer/linear_parallel/linear_parallel_operation.cpp b/src/ops_infer/linear_parallel/linear_parallel_operation.cpp index 9ce37e92..6a7fd77b 100644 --- a/src/ops_infer/linear_parallel/linear_parallel_operation.cpp +++ b/src/ops_infer/linear_parallel/linear_parallel_operation.cpp @@ -145,7 +145,7 @@ template <> Status CreateOperation(const infer::LinearParallelParam &opParam, Op return ERROR_INVALID_PARAM; } int rankSize = opParam.rankSize; - if (opParam.rankSize <= 0 || 
(rankSize & (rankSize - 1)) != 0) { + if ((opParam.rankSize <= 0 || (rankSize & (rankSize - 1)) != 0) && opParam.backend == "lcoc") { ATB_LOG(ERROR) << "LinearParallel rankSize support power of 2 but got [" << opParam.rankSize << "]"; return ERROR_INVALID_PARAM; } @@ -409,12 +409,6 @@ Status LinearParallelOperation::InferShapeCheckLinearReduceScatter(const SVector return ERROR_INVALID_TENSOR_INI_MATCH; } - int64_t xTensorM = OperationUtil::GetXTensorM(inTensorDescs.at(0)); - if (xTensorM % param_.rankSize != 0) { - ATB_LOG(ERROR) << GetLogPrefix() << "inTensor0 m [" << xTensorM - << "] should be an integer multiple of rankSize :" << param_.rankSize; - return ERROR_INVALID_TENSOR_DIM; - } if (param_.backend == "mc2") { int64_t xTensorK = OperationUtil::GetXTensorK(inTensorDescs.at(0)); if (xTensorK < 256 || xTensorK > 65535) { -- Gitee From 9f2bdcd05ed784795de587bfed547e5a7bdad232 Mon Sep 17 00:00:00 2001 From: huangxiaolan Date: Mon, 22 Sep 2025 20:30:49 +0800 Subject: [PATCH 51/94] recover extern changes --- src/atb/operation/if_operation.cpp | 2 +- src/atb/operation/if_operation.h | 2 +- src/atb/runner/plugin_runner.cpp | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/atb/operation/if_operation.cpp b/src/atb/operation/if_operation.cpp index 7a70570e..8a950c22 100644 --- a/src/atb/operation/if_operation.cpp +++ b/src/atb/operation/if_operation.cpp @@ -107,7 +107,7 @@ Status IfOperation::Execute(const VariantPack &variantPack, uint8_t *workspace, Context &context) { ATB_LOG(INFO) << GetLogPrefix() << "Calling Execute..."; - return opSelected_->Execute(variantPack, workspace, workspaceSize, context); + return opSelected_->Execute(variantPack, workspace, workspaceSize, *context); } uint32_t IfOperation::GetInputNum() const diff --git a/src/atb/operation/if_operation.h b/src/atb/operation/if_operation.h index 8f5dfbe7..eab4a45f 100644 --- a/src/atb/operation/if_operation.h +++ b/src/atb/operation/if_operation.h @@ -24,7 +24,7 @@ public: 
std::string GetName() const override; Status Setup(const VariantPack &variantPack, uint64_t &workspaceSize, Context *context) override; Status Execute(const VariantPack &variantPack, uint8_t *workspace, uint64_t workspaceSize, - Context &context) override; + Context *context) override; uint32_t GetInputNum() const override; uint32_t GetOutputNum() const override; void SetExecuteStreamId(uint32_t streamId) override; diff --git a/src/atb/runner/plugin_runner.cpp b/src/atb/runner/plugin_runner.cpp index fb26b4f6..aeaa20c6 100644 --- a/src/atb/runner/plugin_runner.cpp +++ b/src/atb/runner/plugin_runner.cpp @@ -37,7 +37,7 @@ Status PluginRunner::ExecuteImpl(RunnerVariantPack &runnerVariantPack) variantPack_.inTensors = runnerVariantPack.inTensors; variantPack_.outTensors = runnerVariantPack.outTensors; return operation_->Execute(variantPack_, runnerVariantPack.workspaceBuffer, - runnerVariantPack.workspaceBufferSize, *runnerVariantPack.context); + runnerVariantPack.workspaceBufferSize, runnerVariantPack.context); } return ERROR_INVALID_PARAM; -- Gitee From 1ca11b7c1b773930a7423a36551aefa0f179c3db Mon Sep 17 00:00:00 2001 From: huangxiaolan Date: Mon, 22 Sep 2025 20:32:38 +0800 Subject: [PATCH 52/94] recover extern changes --- src/atb/operation/if_operation.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/atb/operation/if_operation.cpp b/src/atb/operation/if_operation.cpp index 8a950c22..a9c8f406 100644 --- a/src/atb/operation/if_operation.cpp +++ b/src/atb/operation/if_operation.cpp @@ -104,10 +104,10 @@ Status IfOperation::Setup(const VariantPack &variantPack, uint64_t &workspaceSiz } Status IfOperation::Execute(const VariantPack &variantPack, uint8_t *workspace, uint64_t workspaceSize, - Context &context) + Context *context) { ATB_LOG(INFO) << GetLogPrefix() << "Calling Execute..."; - return opSelected_->Execute(variantPack, workspace, workspaceSize, *context); + return opSelected_->Execute(variantPack, workspace, workspaceSize, context); 
} uint32_t IfOperation::GetInputNum() const -- Gitee From a0bde3130497d6bc37f5e0c33b0d864454b63b6b Mon Sep 17 00:00:00 2001 From: guanguan Date: Mon, 22 Sep 2025 20:40:21 +0800 Subject: [PATCH 53/94] add nz check --- .../linear_parallel_operation.cpp | 38 +++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/src/ops_infer/linear_parallel/linear_parallel_operation.cpp b/src/ops_infer/linear_parallel/linear_parallel_operation.cpp index 6a7fd77b..ea29279e 100644 --- a/src/ops_infer/linear_parallel/linear_parallel_operation.cpp +++ b/src/ops_infer/linear_parallel/linear_parallel_operation.cpp @@ -378,8 +378,26 @@ Status LinearParallelOperation::CheckResidual(const SVector &inTenso return NO_ERROR; } +Status LinearParallelOperation::CheckWeightNzFormat(const SVector &inTensorDescs) const +{ + const TensorDesc &weight = inTensorDescs.at(1); + bool weightNz = (weight.format == ACL_FORMAT_FRACTAL_NZ); + if (weightNz) { + if (weight.shape.dimNum != DIM_4) { + ATB_LOG(ERROR) << GetLogPrefix() << "fractal_nz shape dim should be 4. 
now is "<< weight.shape.dimNum; + return ERROR_INVALID_TENSOR_DIM; + } + } + return NO_ERROR; +} + Status LinearParallelOperation::InferShapeCheckLinearAllReduce(const SVector &inTensorDescs) const { + Status st = CheckWeightNzFormat(inTensorDescs); + if (st != NO_ERROR) { + return st; + } + if (!OperationUtil::MatmulInTensorDescsCheck(inTensorDescs, GetLogPrefix(), commonCheckParam_)) { return ERROR_INVALID_TENSOR_DIM; } @@ -397,6 +415,11 @@ Status LinearParallelOperation::InferShapeCheckLinearAllReduce(const SVector &inTensorDescs) const { + Status st = CheckWeightNzFormat(inTensorDescs); + if (st != NO_ERROR) { + return st; + } + if (!OperationUtil::MatmulInTensorDescsCheck(inTensorDescs, GetLogPrefix(), commonCheckParam_)) { return ERROR_INVALID_TENSOR_DIM; } @@ -422,6 +445,11 @@ Status LinearParallelOperation::InferShapeCheckLinearReduceScatter(const SVector Status LinearParallelOperation::InferShapeCheckAllGatherLinear(const SVector &inTensorDescs) const { + Status st = CheckWeightNzFormat(inTensorDescs); + if (st != NO_ERROR) { + return st; + } + bool isQuant = param_.quantType > infer::LinearParallelParam::QuantType::QUANT_TYPE_UNQUANT && param_.quantType < infer::LinearParallelParam::QuantType::QUANT_TYPE_MAX; if (isQuant && inTensorDescs.at(3).dtype == ACL_FLOAT && param_.outDataType == ACL_FLOAT16) { @@ -443,6 +471,11 @@ Status LinearParallelOperation::InferShapeCheckAllGatherLinear(const SVector &inTensorDescs) const { + Status st = CheckWeightNzFormat(inTensorDescs); + if (st != NO_ERROR) { + return st; + } + if (param_.twoDimTPInfo.rsDim * param_.twoDimTPInfo.agDim != param_.rankSize) { ATB_LOG(ERROR) << "agDim * rsDim should equal to rankSize"; return ERROR_INVALID_PARAM; @@ -470,6 +503,11 @@ LinearParallelOperation::InferShapeCheckAllGatherLinearReduceScatter(const SVect Status LinearParallelOperation::InferShapeCheckAllToAllvcAllGatherGmm(const SVector &inTensorDescs) const { + Status st = CheckWeightNzFormat(inTensorDescs); + if (st != NO_ERROR) { 
+ return st; + } + bool isQuant = param_.quantType > infer::LinearParallelParam::QuantType::QUANT_TYPE_UNQUANT && param_.quantType < infer::LinearParallelParam::QuantType::QUANT_TYPE_MAX; if (isQuant && inTensorDescs.at(2).dtype == ACL_FLOAT && param_.outDataType == ACL_FLOAT16) { -- Gitee From 789e4a897a266c37447b7b3dd3cf403997383374 Mon Sep 17 00:00:00 2001 From: He Changcheng Date: Mon, 22 Sep 2025 20:54:53 +0800 Subject: [PATCH 54/94] fix --- src/kernels/lcal/src/lccl.cpp | 82 ++++++++++++++++++++--------------- 1 file changed, 47 insertions(+), 35 deletions(-) diff --git a/src/kernels/lcal/src/lccl.cpp b/src/kernels/lcal/src/lccl.cpp index bf79bf66..f6af2d3b 100644 --- a/src/kernels/lcal/src/lccl.cpp +++ b/src/kernels/lcal/src/lccl.cpp @@ -29,47 +29,59 @@ namespace Lcal { using AclrtGetResInCurrentThreadFunc = int(*)(int, uint32_t*); -int GetAclResInCurThread(int type, uint32_t *resource) +int GetAclResInCurThread(int type, uint32_t &resource) { - static std::mutex localMutex; // 线程安全锁 - static AclrtGetResInCurrentThreadFunc aclrtGetResInCurrentThread = nullptr; - static int res = -1; - - // 首次调用时初始化 - if (res == -1) { - std::lock_guard lock(localMutex); // 加锁 - std::unique_ptr mkiDl; - std::string libPath = std::string(Mki::GetEnv("ASCEND_HOME_PATH")) + "/runtime/lib64/libascendcl.so"; - mkiDl = std::make_unique(libPath, false); - if (!mkiDl->IsValid()) { // 检查库是否加载成功 - MKI_LOG(ERROR) << "Failed to load libascendcl.so!"; - return LCAL_ERROR_INTERNAL; + static std::once_flag onceFlag; + static std::atomic initFlag(LCAL_ERROR_NOT_INITIALIZED); // -1 + static std::shared_ptr mkiDl; + static AclrtGetResInCurrentThreadFunc aclFn = nullptr; + + std::call_once(onceFlag, []() { + std::string home = Mki::GetEnv("ASCEND_HOME_PATH"); + std::vector candidates; + if (!home.empty()) { + candidates.push_back(home + "/runtime/lib64/libascendcl.so"); } - aclrtGetResInCurrentThread = - (AclrtGetResInCurrentThreadFunc)mkiDl->GetSymbol("aclrtGetResInCurrentThread"); - if 
(aclrtGetResInCurrentThread == nullptr) { - MKI_LOG(WARN) << "Failed to get aclrtGetResInCurrentThread function!"; - res = LCAL_ERROR_NOT_FOUND; - return LCAL_ERROR_NOT_FOUND; + candidates.emplace_back("libascendcl.so"); + + for (const auto &p : candidates) { + auto dl = std::make_unique(p, false); + if (!dl->IsValid()) { + MKI_LOG(WARN) << "Try load libascendcl.so failed: " << p; + continue; + } + auto sym = dl->GetSymbol("aclrtGetResInCurrentThread"); + if (sym == nullptr) { + MKI_LOG(WARN) << "Symbol aclrtGetResInCurrentThread not found in: " << p; + continue; + } + mkiDl = std::move(dl); + aclFn = reinterpret_cast(sym); + initFlag.store(LCAL_SUCCESS, std::memory_order_release); + MKI_LOG(DEBUG) << "Loaded libascendcl.so and resolved aclrtGetResInCurrentThread from: " << p; + return; } - res = LCAL_SUCCESS; - MKI_LOG(DEBUG) << "Successfully loaded libascendcl.so and resolved aclrtGetResInCurrentThread"; + initFlag.store(LCAL_ERROR_NOT_FOUND, std::memory_order_release); + MKI_LOG(ERROR) << "Failed to load libascendcl.so or resolve aclrtGetResInCurrentThread."; + }); + + int rc = initFlag.load(std::memory_order_acquire); + if (rc != LCAL_SUCCESS) { + return rc; } - // 调用函数 - if (res == LCAL_SUCCESS) { - int getResRet = aclrtGetResInCurrentThread(type, resource); - if (getResRet != ACL_SUCCESS) { - MKI_LOG(ERROR) << "Failed to get resource in current thread for type:" << type << " err:" << getResRet; - return LCAL_ERROR_INTERNAL; - } else { - MKI_LOG(DEBUG) << "Get resource in current thread for type:" << type << " resource:" << *resource; - return LCAL_SUCCESS; - } - } else { - return res; + if (type != ACL_RT_DEV_RES_CUBE_CORE && type != ACL_RT_DEV_RES_VECTOR_CORE) { + MKI_LOG(ERROR) << "aclrtGetResInCurrentThread not support resource type:" << type; + return LCAL_ERROR_PARA_CHECK_FAIL; } + const int ret = aclFn(type, &resource); + if (ret != ACL_SUCCESS) { + MKI_LOG(ERROR) << "aclrtGetResInCurrentThread failed. 
type:" << type << " err:" << ret; + return LCAL_ERROR_INTERNAL; + } + MKI_LOG(DEBUG) << "Got resource in current thread. type:" << type << " resource:" << resource; + return LCAL_SUCCESS; } uint32_t GetLocalReduceBlockDum(int64_t dataSize) @@ -288,7 +300,7 @@ uint32_t Lccl::GetBlockNum(LcalType cclType, uint32_t rankSize, int64_t dataSize limitType = aclrtDevResLimitType::ACL_RT_DEV_RES_CUBE_CORE; } - int res = GetAclResInCurThread(static_cast(limitType), &limitVal); + int res = GetAclResInCurThread(static_cast(limitType), limitVal); if (res == LCAL_SUCCESS) { MKI_LOG(DEBUG) << "Required blockNum(" << blockNum << ") limit:(limitVal=" << limitVal << ", limitType=" << static_cast(limitType) << ")"; -- Gitee From bc56bcf4b0d1271eb79b580e8e4e74ec9a0b2881 Mon Sep 17 00:00:00 2001 From: He Changcheng Date: Mon, 22 Sep 2025 20:59:06 +0800 Subject: [PATCH 55/94] fix --- src/kernels/lcal/src/lccl.cpp | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/src/kernels/lcal/src/lccl.cpp b/src/kernels/lcal/src/lccl.cpp index f6af2d3b..9e44d0b5 100644 --- a/src/kernels/lcal/src/lccl.cpp +++ b/src/kernels/lcal/src/lccl.cpp @@ -26,14 +26,13 @@ using namespace chrono; using namespace Mki; namespace Lcal { - using AclrtGetResInCurrentThreadFunc = int(*)(int, uint32_t*); int GetAclResInCurThread(int type, uint32_t &resource) { static std::once_flag onceFlag; - static std::atomic initFlag(LCAL_ERROR_NOT_INITIALIZED); // -1 - static std::shared_ptr mkiDl; + static std::atomic initFlag{LCAL_ERROR_NOT_INITIALIZED}; // -1 + static std::unique_ptr mkiDl; static AclrtGetResInCurrentThreadFunc aclFn = nullptr; std::call_once(onceFlag, []() { @@ -61,8 +60,9 @@ int GetAclResInCurThread(int type, uint32_t &resource) MKI_LOG(DEBUG) << "Loaded libascendcl.so and resolved aclrtGetResInCurrentThread from: " << p; return; } + MKI_LOG(ERROR) << "Failed to load libascendcl.so or resolve aclrtGetResInCurrentThread. 
Tried paths: " + << boost::algorithm::join(candidates, ", "); initFlag.store(LCAL_ERROR_NOT_FOUND, std::memory_order_release); - MKI_LOG(ERROR) << "Failed to load libascendcl.so or resolve aclrtGetResInCurrentThread."; }); int rc = initFlag.load(std::memory_order_acquire); @@ -71,16 +71,17 @@ int GetAclResInCurThread(int type, uint32_t &resource) } if (type != ACL_RT_DEV_RES_CUBE_CORE && type != ACL_RT_DEV_RES_VECTOR_CORE) { - MKI_LOG(ERROR) << "aclrtGetResInCurrentThread not support resource type:" << type; + MKI_LOG(ERROR) << "aclrtGetResInCurrentThread not support resource type: " << type; return LCAL_ERROR_PARA_CHECK_FAIL; } const int ret = aclFn(type, &resource); if (ret != ACL_SUCCESS) { - MKI_LOG(ERROR) << "aclrtGetResInCurrentThread failed. type:" << type << " err:" << ret; + MKI_LOG(ERROR) << "aclrtGetResInCurrentThread failed. type: " << type << " err: " << ret; return LCAL_ERROR_INTERNAL; } - MKI_LOG(DEBUG) << "Got resource in current thread. type:" << type << " resource:" << resource; + + MKI_LOG(DEBUG) << "Got resource in current thread. 
type: " << type << " resource: " << resource; return LCAL_SUCCESS; } -- Gitee From b5256c9d5be1189dd6f500563f457f985c367efd Mon Sep 17 00:00:00 2001 From: He Changcheng Date: Mon, 22 Sep 2025 21:05:02 +0800 Subject: [PATCH 56/94] fix --- src/kernels/lcal/src/lccl.cpp | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/src/kernels/lcal/src/lccl.cpp b/src/kernels/lcal/src/lccl.cpp index 9e44d0b5..11fd73e7 100644 --- a/src/kernels/lcal/src/lccl.cpp +++ b/src/kernels/lcal/src/lccl.cpp @@ -32,14 +32,14 @@ int GetAclResInCurThread(int type, uint32_t &resource) { static std::once_flag onceFlag; static std::atomic initFlag{LCAL_ERROR_NOT_INITIALIZED}; // -1 - static std::unique_ptr mkiDl; + static std::unique_ptr mkiDl; // 持久保存,避免库被卸载 static AclrtGetResInCurrentThreadFunc aclFn = nullptr; std::call_once(onceFlag, []() { std::string home = Mki::GetEnv("ASCEND_HOME_PATH"); std::vector candidates; if (!home.empty()) { - candidates.push_back(home + "/runtime/lib64/libascendcl.so"); + candidates.emplace_back(home + "/runtime/lib64/libascendcl.so"); } candidates.emplace_back("libascendcl.so"); @@ -54,17 +54,19 @@ int GetAclResInCurThread(int type, uint32_t &resource) MKI_LOG(WARN) << "Symbol aclrtGetResInCurrentThread not found in: " << p; continue; } - mkiDl = std::move(dl); + mkiDl = std::move(dl); // 保留句柄,防止卸载 aclFn = reinterpret_cast(sym); initFlag.store(LCAL_SUCCESS, std::memory_order_release); MKI_LOG(DEBUG) << "Loaded libascendcl.so and resolved aclrtGetResInCurrentThread from: " << p; - return; + return; // 成功 } + // 失败 MKI_LOG(ERROR) << "Failed to load libascendcl.so or resolve aclrtGetResInCurrentThread. 
Tried paths: " << boost::algorithm::join(candidates, ", "); initFlag.store(LCAL_ERROR_NOT_FOUND, std::memory_order_release); }); + // 初始化结果判定 int rc = initFlag.load(std::memory_order_acquire); if (rc != LCAL_SUCCESS) { return rc; @@ -75,6 +77,7 @@ int GetAclResInCurThread(int type, uint32_t &resource) return LCAL_ERROR_PARA_CHECK_FAIL; } + // 调用底层函数 const int ret = aclFn(type, &resource); if (ret != ACL_SUCCESS) { MKI_LOG(ERROR) << "aclrtGetResInCurrentThread failed. type: " << type << " err: " << ret; -- Gitee From 9bff1329bccbb3f5327ffcd6218a6ee36c69be6b Mon Sep 17 00:00:00 2001 From: He Changcheng Date: Mon, 22 Sep 2025 21:17:40 +0800 Subject: [PATCH 57/94] fix --- src/kernels/lcal/src/lccl.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/kernels/lcal/src/lccl.cpp b/src/kernels/lcal/src/lccl.cpp index 11fd73e7..9da09500 100644 --- a/src/kernels/lcal/src/lccl.cpp +++ b/src/kernels/lcal/src/lccl.cpp @@ -61,8 +61,7 @@ int GetAclResInCurThread(int type, uint32_t &resource) return; // 成功 } // 失败 - MKI_LOG(ERROR) << "Failed to load libascendcl.so or resolve aclrtGetResInCurrentThread. 
Tried paths: " - << boost::algorithm::join(candidates, ", "); + MKI_LOG(ERROR) << "Failed to load libascendcl.so or resolve aclrtGetResInCurrentThread."; initFlag.store(LCAL_ERROR_NOT_FOUND, std::memory_order_release); }); -- Gitee From 1eb367e23c40314d9f1d6f3675c63c9f9222f9fa Mon Sep 17 00:00:00 2001 From: He Changcheng Date: Mon, 22 Sep 2025 21:29:39 +0800 Subject: [PATCH 58/94] fix --- src/kernels/lcal/src/lccl.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/kernels/lcal/src/lccl.cpp b/src/kernels/lcal/src/lccl.cpp index 9da09500..e6ab4f2f 100644 --- a/src/kernels/lcal/src/lccl.cpp +++ b/src/kernels/lcal/src/lccl.cpp @@ -40,8 +40,9 @@ int GetAclResInCurThread(int type, uint32_t &resource) std::vector candidates; if (!home.empty()) { candidates.emplace_back(home + "/runtime/lib64/libascendcl.so"); + } else { + MKI_LOG(ERROR) << "ASCEND_HOME_PATH is empty."; } - candidates.emplace_back("libascendcl.so"); for (const auto &p : candidates) { auto dl = std::make_unique(p, false); -- Gitee From 2da65a527d683ecac79f9ac512fa5e660d2b9921 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=BC=A0=E7=A1=95=E7=B4=AF?= Date: Mon, 22 Sep 2025 21:37:57 +0800 Subject: [PATCH 59/94] fix rms_norm_grad --- .../norm/rmsnormbackward/tiling/rms_norm_grad_tiling.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/kernels/kernels/norm/rmsnormbackward/tiling/rms_norm_grad_tiling.cpp b/src/kernels/kernels/norm/rmsnormbackward/tiling/rms_norm_grad_tiling.cpp index 48eeaaad..c15daf1a 100644 --- a/src/kernels/kernels/norm/rmsnormbackward/tiling/rms_norm_grad_tiling.cpp +++ b/src/kernels/kernels/norm/rmsnormbackward/tiling/rms_norm_grad_tiling.cpp @@ -231,7 +231,7 @@ Status RmsNormGradTiling(const LaunchParam &launchParam, KernelInfo &kernelInfo) tilingDataPointer->avg = avgVal; kernelInfo.SetTilingId(tilingKey); uint64_t sysWorkspaceSize = - static_cast(BLOCK_SIZE + tilingDataPointer->blockDim * BLOCK_SIZE * TWICE_WORKSPACE); + 
static_cast((BLOCK_SIZE + tilingDataPointer->blockDim * BLOCK_SIZE * TWICE_WORKSPACE) * sizeof(int)); kernelInfo.GetScratchSizes().push_back(sysWorkspaceSize); kernelInfo.SetMemsetInfo(WORK_SPACE_INDEX, sysWorkspaceSize); return Status::OkStatus(); -- Gitee From 245fb24a8ec6a7e14fb2aa8bfdc3c89925b4f9d4 Mon Sep 17 00:00:00 2001 From: He Changcheng Date: Mon, 22 Sep 2025 21:38:44 +0800 Subject: [PATCH 60/94] fix --- src/kernels/lcal/src/lccl.cpp | 42 +++++++++++++---------------------- 1 file changed, 16 insertions(+), 26 deletions(-) diff --git a/src/kernels/lcal/src/lccl.cpp b/src/kernels/lcal/src/lccl.cpp index e6ab4f2f..77d5d6d6 100644 --- a/src/kernels/lcal/src/lccl.cpp +++ b/src/kernels/lcal/src/lccl.cpp @@ -36,34 +36,24 @@ int GetAclResInCurThread(int type, uint32_t &resource) static AclrtGetResInCurrentThreadFunc aclFn = nullptr; std::call_once(onceFlag, []() { - std::string home = Mki::GetEnv("ASCEND_HOME_PATH"); - std::vector candidates; - if (!home.empty()) { - candidates.emplace_back(home + "/runtime/lib64/libascendcl.so"); - } else { - MKI_LOG(ERROR) << "ASCEND_HOME_PATH is empty."; + std::string p = Mki::GetEnv("ASCEND_HOME_PATH") + "/runtime/lib64/libascendcl.so"; + auto dl = std::make_unique(p, false); + if (!dl->IsValid()) { + MKI_LOG(ERROR) << "Try load libascendcl.so failed: " << p; + initFlag.store(LCAL_ERROR_NOT_FOUND, std::memory_order_release); + return; } - - for (const auto &p : candidates) { - auto dl = std::make_unique(p, false); - if (!dl->IsValid()) { - MKI_LOG(WARN) << "Try load libascendcl.so failed: " << p; - continue; - } - auto sym = dl->GetSymbol("aclrtGetResInCurrentThread"); - if (sym == nullptr) { - MKI_LOG(WARN) << "Symbol aclrtGetResInCurrentThread not found in: " << p; - continue; - } - mkiDl = std::move(dl); // 保留句柄,防止卸载 - aclFn = reinterpret_cast(sym); - initFlag.store(LCAL_SUCCESS, std::memory_order_release); - MKI_LOG(DEBUG) << "Loaded libascendcl.so and resolved aclrtGetResInCurrentThread from: " << p; - return; // 成功 + 
auto sym = dl->GetSymbol("aclrtGetResInCurrentThread"); + if (sym == nullptr) { + MKI_LOG(WARN) << "Symbol aclrtGetResInCurrentThread not found in: " << p; + initFlag.store(LCAL_ERROR_NOT_FOUND, std::memory_order_release); + return; } - // 失败 - MKI_LOG(ERROR) << "Failed to load libascendcl.so or resolve aclrtGetResInCurrentThread."; - initFlag.store(LCAL_ERROR_NOT_FOUND, std::memory_order_release); + mkiDl = std::move(dl); // 保留句柄,防止卸载 + aclFn = reinterpret_cast(sym); + initFlag.store(LCAL_SUCCESS, std::memory_order_release); + MKI_LOG(DEBUG) << "Loaded libascendcl.so and resolved aclrtGetResInCurrentThread from: " << p; + return; // 成功 }); // 初始化结果判定 -- Gitee From bc11a5e97f6314da30d304d891d6750e7cc9d2e5 Mon Sep 17 00:00:00 2001 From: He Changcheng Date: Mon, 22 Sep 2025 21:48:59 +0800 Subject: [PATCH 61/94] fix --- src/kernels/lcal/include/lcal_comm.h | 2 +- src/kernels/lcal/src/lcal_comm.cpp | 6 +++--- src/kernels/lcal/src/lccl.cpp | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/kernels/lcal/include/lcal_comm.h b/src/kernels/lcal/include/lcal_comm.h index bff77eea..6ec0fbd7 100644 --- a/src/kernels/lcal/include/lcal_comm.h +++ b/src/kernels/lcal/include/lcal_comm.h @@ -63,7 +63,7 @@ private: int GetName(std::string &name, char names[LCAL_MAX_RANK_SIZE][IPC_NAME_SIZE]) const; int SyncCommArgs(); int InitDumpAddr(); - + private: int rank_ = 0; // global rank id int rankSize_ = 0; // global rank size diff --git a/src/kernels/lcal/src/lcal_comm.cpp b/src/kernels/lcal/src/lcal_comm.cpp index 8b77500a..b54380a0 100644 --- a/src/kernels/lcal/src/lcal_comm.cpp +++ b/src/kernels/lcal/src/lcal_comm.cpp @@ -303,7 +303,7 @@ int LcalComm::Init() if (inited_) { return LCAL_SUCCESS; } - if (rank_ < 0 || rank_ >= rankSize_ || rankSize_ <= 0 || rankSize_ > LCAL_MAX_RANK_SIZE) { + if (rank_ < 0 || rank_ >= rankSize_ || rankSize_ <= 0 || rankSize_ > LCAL_MAX_RANK_SIZE) { MKI_LOG(ERROR) << "The rank is invalid! 
rank:" << rank_ << " rankSize:" << rankSize_; return LCAL_ERROR_PARA_CHECK_FAIL; } @@ -351,7 +351,7 @@ int LcalComm::InitThread(const std::string &uid) if (inited_) { return LCAL_SUCCESS; } - if (rank_ < 0 || rank_ >= rankSize_ || rankSize_ <= 0 || rankSize_ > LCAL_MAX_RANK_SIZE) { + if (rank_ < 0 || rank_ >= rankSize_ || rankSize_ <= 0 || rankSize_ > LCAL_MAX_RANK_SIZE) { MKI_LOG(ERROR) << "The rank is invalid! rank:" << rank_ << "rankSize:" << rankSize_; return LCAL_ERROR_PARA_CHECK_FAIL; } @@ -723,7 +723,7 @@ LcalComm::~LcalComm() FreePeerMem(commArgs_.dumpAddr); FreePeerMem(peerMem_[rank_]); FreePeerMem(commArgsPtr_); - } +} LcalComm::LcalComm(int rank, int rankSize) : rank_(rank), rankSize_(rankSize) { diff --git a/src/kernels/lcal/src/lccl.cpp b/src/kernels/lcal/src/lccl.cpp index 77d5d6d6..8bf1255b 100644 --- a/src/kernels/lcal/src/lccl.cpp +++ b/src/kernels/lcal/src/lccl.cpp @@ -36,7 +36,7 @@ int GetAclResInCurThread(int type, uint32_t &resource) static AclrtGetResInCurrentThreadFunc aclFn = nullptr; std::call_once(onceFlag, []() { - std::string p = Mki::GetEnv("ASCEND_HOME_PATH") + "/runtime/lib64/libascendcl.so"; + std::string p = std::string(Mki::GetEnv("ASCEND_HOME_PATH")) + "/runtime/lib64/libascendcl.so"; auto dl = std::make_unique(p, false); if (!dl->IsValid()) { MKI_LOG(ERROR) << "Try load libascendcl.so failed: " << p; -- Gitee From 8dfdb58f34c4f243325bf518047484aad5869a60 Mon Sep 17 00:00:00 2001 From: huangxiaolan Date: Tue, 23 Sep 2025 09:36:42 +0800 Subject: [PATCH 62/94] fix include --- src/ops_infer/multi_latent_attention/atb_acl_mla.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ops_infer/multi_latent_attention/atb_acl_mla.cpp b/src/ops_infer/multi_latent_attention/atb_acl_mla.cpp index 2784d656..54b9f909 100644 --- a/src/ops_infer/multi_latent_attention/atb_acl_mla.cpp +++ b/src/ops_infer/multi_latent_attention/atb_acl_mla.cpp @@ -8,7 +8,7 @@ * See LICENSE in the root of the software repository for the full text 
of the License. */ #include "atb/atb_acl.h" -#include "atb_acl_util.h" +#include "atb/utils/atb_acl_util.h" #include "atb/operation/operation_base.h" #ifdef __cplusplus -- Gitee From 460a7ed85c495033b85de9037f2e1fa1bb7b9dbd Mon Sep 17 00:00:00 2001 From: huangxiaolan Date: Tue, 23 Sep 2025 11:19:52 +0800 Subject: [PATCH 63/94] fix expression --- .../mixkernels/fusion/fusion_kernel.cpp | 4 ++-- .../mixkernels/fusion/fusion_operation.cpp | 18 +++++++++--------- .../mixkernels/fusion/tiling/fusion_tiling.cpp | 4 ++-- .../paged_cache_load_operation.cpp | 16 ++++++++-------- 4 files changed, 21 insertions(+), 21 deletions(-) diff --git a/src/kernels/mixkernels/fusion/fusion_kernel.cpp b/src/kernels/mixkernels/fusion/fusion_kernel.cpp index 7fb940a5..7f2d9e8b 100644 --- a/src/kernels/mixkernels/fusion/fusion_kernel.cpp +++ b/src/kernels/mixkernels/fusion/fusion_kernel.cpp @@ -31,7 +31,7 @@ public: bool CanSupport(const LaunchParam &launchParam) const override { OpParam::Fusion fusionType = launchParam.GetParam(); - if (OpParam::Fusion::MATMUL_ADD == fusionType.fusionType) { + if (fusionType.fusionType == OpParam::Fusion::MATMUL_ADD) { MKI_CHECK(launchParam.GetInTensorCount() == TENSOR_INPUT_NUM_MATMUL_ADD, "in tensor num invalid", return false); MKI_CHECK(launchParam.GetOutTensorCount() == TENSOR_OUTPUT_NUM, "out tensor num invalid", return false); @@ -41,7 +41,7 @@ public: MKI_CHECK(inTensor1.desc.dtype == TENSOR_DTYPE_FLOAT16, "in tensor 1 dtype invalid", return false); auto inTensor2 = launchParam.GetInTensor(DIM_2); MKI_CHECK(inTensor2.desc.dtype == TENSOR_DTYPE_FLOAT16, "in tensor 2 dtype invalid", return false); - } else if (OpParam::Fusion::MATMUL_GELU == fusionType.fusionType) { + } else if (fusionType.fusionType == OpParam::Fusion::MATMUL_GELU) { MKI_CHECK(launchParam.GetInTensorCount() == TENSOR_INPUT_NUM_MATMUL_ACTIVATE, "in tensor num invalid", return false); MKI_CHECK(launchParam.GetOutTensorCount() == TENSOR_OUTPUT_NUM, "out tensor num invalid", return 
false); diff --git a/src/kernels/mixkernels/fusion/fusion_operation.cpp b/src/kernels/mixkernels/fusion/fusion_operation.cpp index 3f61161d..ff035217 100644 --- a/src/kernels/mixkernels/fusion/fusion_operation.cpp +++ b/src/kernels/mixkernels/fusion/fusion_operation.cpp @@ -37,13 +37,13 @@ public: { std::string kernelName = "FusionMatmulAddKernel"; OpParam::Fusion fusionType = launchParam.GetParam(); - if (OpParam::Fusion::MATMUL_GELU == fusionType.fusionType) { + if (fusionType.fusionType == OpParam::Fusion::MATMUL_GELU) { kernelName = "FusionMatmulGeluKernel"; - } else if (OpParam::Fusion::MATMUL_SIGMOID == fusionType.fusionType) { + } else if (fusionType.fusionType == OpParam::Fusion::MATMUL_SIGMOID) { kernelName = "FusionMatmulSigmoidKernel"; - } else if (OpParam::Fusion::MATMUL_SWIGLU == fusionType.fusionType) { + } else if (fusionType.fusionType == OpParam::Fusion::MATMUL_SWIGLU) { kernelName = "FusionMatmulSwiGluKernel"; - } else if (OpParam::Fusion::NON_FUSION == fusionType.fusionType) { + } else if (fusionType.fusionType == OpParam::Fusion::NON_FUSION) { kernelName = "FusionErasedKernel"; } MKI_LOG(INFO) << "getBestKernel " << kernelName; @@ -242,13 +242,13 @@ public: { OpParam::Fusion fusionType = launchParam.GetParam(); std::string deviceVersion = PlatformInfo::Instance().GetPlatformName(); - if (OpParam::Fusion::MATMUL_ADD == fusionType.fusionType) { + if (fusionType.fusionType == OpParam::Fusion::MATMUL_ADD) { MatMulAddFusion(); - } else if (OpParam::Fusion::MATMUL_GELU == fusionType.fusionType) { + } else if (fusionType.fusionType == OpParam::Fusion::MATMUL_GELU) { MatMulGeluFusion(); - } else if (OpParam::Fusion::MATMUL_SIGMOID == fusionType.fusionType) { + } else if (fusionType.fusionType == OpParam::Fusion::MATMUL_SIGMOID) { MatMulSigmoidFusion(); - } else if (OpParam::Fusion::MATMUL_SWIGLU == fusionType.fusionType) { + } else if (fusionType.fusionType == OpParam::Fusion::MATMUL_SWIGLU) { MatMulSwigluFusion(); } else { ErasedFusion(); @@ -270,7 
+270,7 @@ protected: Status InferShapeImpl(const LaunchParam &launchParam, SVector &outTensors) const override { OpParam::Fusion fusionType = launchParam.GetParam(); - if (OpParam::Fusion::MATMUL_ADD == fusionType.fusionType) { + if (fusionType.fusionType == OpParam::Fusion::MATMUL_ADD) { auto inTensorDescA = launchParam.GetInTensor(2).desc; TensorDesc &tensorDescOut = outTensors[0].desc; tensorDescOut.dtype = TENSOR_DTYPE_FLOAT16; diff --git a/src/kernels/mixkernels/fusion/tiling/fusion_tiling.cpp b/src/kernels/mixkernels/fusion/tiling/fusion_tiling.cpp index 5145c511..130128ed 100644 --- a/src/kernels/mixkernels/fusion/tiling/fusion_tiling.cpp +++ b/src/kernels/mixkernels/fusion/tiling/fusion_tiling.cpp @@ -22,9 +22,9 @@ Status FusionTiling(const LaunchParam &launchParam, KernelInfo &kernelInfo) OpParam::Fusion fusionType = launchParam.GetParam(); std::string path(std::getenv("HOME")); path += std::string("/.atb_auto_fusion/bishengir_bin/") + - (OpParam::Fusion::MATMUL_ADD == fusionType.fusionType ? "libmatmul_add.so" : "libmatmul_gelu.so"); + (fusionType.fusionType ? "libmatmul_add.so" : "libmatmul_gelu.so" == OpParam::Fusion::MATMUL_ADD); std::string inferWorkspaceFuncName = - (OpParam::Fusion::MATMUL_ADD == fusionType.fusionType ? "matmul_add_" : "matmul_gelu_"); + (fusionType.fusionType ? 
"matmul_add_" : "matmul_gelu_" == OpParam::Fusion::MATMUL_ADD); FusionTilingData *tilingDataPtr = reinterpret_cast(kernelInfo.GetTilingHostAddr()); void *handle = dlopen(path.c_str(), RTLD_LAZY); if (!handle) { diff --git a/src/ops_infer/paged_cache_load/paged_cache_load_operation.cpp b/src/ops_infer/paged_cache_load/paged_cache_load_operation.cpp index 7daf6d45..6ab4f5e2 100644 --- a/src/ops_infer/paged_cache_load/paged_cache_load_operation.cpp +++ b/src/ops_infer/paged_cache_load/paged_cache_load_operation.cpp @@ -220,22 +220,22 @@ Status PagedCacheLoadOperation::KVCacheDimCheck910BNZ(const SVector return ERROR_INVALID_TENSOR_DIM_NUM; } if (inTensorDescs.at(IN_TENSOR_0_KEYCACHE).dtype == ACL_INT8) { - if (THIRTYTWO != inTensorDescs.at(IN_TENSOR_0_KEYCACHE).shape.dims[OUT_DIM] || - THIRTYTWO!= inTensorDescs.at(IN_TENSOR_1_VALUECACHE).shape.dims[OUT_DIM]) { // 1: valueCache + if (inTensorDescs.at(IN_TENSOR_0_KEYCACHE).shape.dims[OUT_DIM] != THIRTYTWO || + inTensorDescs.at(IN_TENSOR_1_VALUECACHE).shape.dims[OUT_DIM] != THIRTYTWO) { // 1: valueCache ATB_LOG(ERROR) << GetLogPrefix() << "The last dimension of keycache and valuecache must be 32"; return ERROR_INVALID_TENSOR_DIM; } - if (MAX_k < inTensorDescs.at(IN_TENSOR_0_KEYCACHE).shape.dims[1] * THIRTYTWO || - MAX_v < inTensorDescs.at(IN_TENSOR_1_VALUECACHE).shape.dims[1] * THIRTYTWO) { + if (inTensorDescs.at(IN_TENSOR_0_KEYCACHE).shape.dims[1] * THIRTYTWO > MAX_k || + inTensorDescs.at(IN_TENSOR_1_VALUECACHE).shape.dims[1] * THIRTYTWO > MAX_v) { ATB_LOG(ERROR) << GetLogPrefix() << "The scend dimension of blocktables must be less than 147456"; return ERROR_INVALID_TENSOR_DIM; } - } else if (SIXTEEN != inTensorDescs.at(IN_TENSOR_0_KEYCACHE).shape.dims[OUT_DIM] || - SIXTEEN!= inTensorDescs.at(IN_TENSOR_1_VALUECACHE).shape.dims[OUT_DIM]) { // 1: valueCache + } else if (inTensorDescs.at(IN_TENSOR_0_KEYCACHE).shape.dims[OUT_DIM] != SIXTEEN || + inTensorDescs.at(IN_TENSOR_1_VALUECACHE).shape.dims[OUT_DIM] != SIXTEEN) { // 
1: valueCache ATB_LOG(ERROR) << GetLogPrefix() << "The last dimension of keycache and valuecache must be 16"; return ERROR_INVALID_TENSOR_DIM; - } else if (MAX_k < inTensorDescs.at(IN_TENSOR_0_KEYCACHE).shape.dims[1] * SIXTEEN || - MAX_v < inTensorDescs.at(IN_TENSOR_1_VALUECACHE).shape.dims[1] * SIXTEEN) { + } else if (inTensorDescs.at(IN_TENSOR_0_KEYCACHE).shape.dims[1] * SIXTEEN > MAX_k || + inTensorDescs.at(IN_TENSOR_1_VALUECACHE).shape.dims[1] * SIXTEEN > MAX_v) { ATB_LOG(ERROR) << GetLogPrefix() << "The scend dimension of blocktables must be less than 147456"; return ERROR_INVALID_TENSOR_DIM; } -- Gitee From 82e4298c723c70ca9f3524a771983bd1007ccf8b Mon Sep 17 00:00:00 2001 From: guanguan Date: Tue, 23 Sep 2025 11:26:49 +0800 Subject: [PATCH 64/94] fix error --- src/ops_infer/linear_parallel/linear_parallel_operation.cpp | 1 + src/ops_infer/linear_parallel/linear_parallel_operation.h | 1 + 2 files changed, 2 insertions(+) mode change 100644 => 100755 src/ops_infer/linear_parallel/linear_parallel_operation.h diff --git a/src/ops_infer/linear_parallel/linear_parallel_operation.cpp b/src/ops_infer/linear_parallel/linear_parallel_operation.cpp index ea29279e..810c8d75 100644 --- a/src/ops_infer/linear_parallel/linear_parallel_operation.cpp +++ b/src/ops_infer/linear_parallel/linear_parallel_operation.cpp @@ -37,6 +37,7 @@ static const uint32_t RESIDUAL_TENSOR_INDEX_3 = 3; static const uint32_t RESIDUAL_TENSOR_INDEX_4 = 4; static const uint32_t MAX_OUTPUT_SIZE = 204800; static const uint32_t MAX_K = 24000; +static const uint32_t DIM_4 = 4; static bool AllToAllvcAllGatherGmmOutTensorCheck(const SVector &inTensorDescs, const TensorDesc &outTensorDesc, const std::string &logPrefix) diff --git a/src/ops_infer/linear_parallel/linear_parallel_operation.h b/src/ops_infer/linear_parallel/linear_parallel_operation.h old mode 100644 new mode 100755 index d9e17af5..2658ac3f --- a/src/ops_infer/linear_parallel/linear_parallel_operation.h +++ 
b/src/ops_infer/linear_parallel/linear_parallel_operation.h @@ -38,6 +38,7 @@ private: Status InferShapeAllToAllvcAllGatherGmm(const SVector &inTensorDescs, SVector &outTensorDescs) const; Status CheckResidual(const SVector &inTensorDescs) const; + Status CheckWeightNzFormat(const SVector &inTensorDescs) const; Status InferShapeCheckLinearAllReduce(const SVector &inTensorDescs) const; Status InferShapeCheckLinearReduceScatter(const SVector &inTensorDescs) const; Status InferShapeCheckAllGatherLinear(const SVector &inTensorDescs) const; -- Gitee From 4ea1802054b32cc80946423072e6e4448a3255b6 Mon Sep 17 00:00:00 2001 From: He Changcheng Date: Tue, 23 Sep 2025 11:35:24 +0800 Subject: [PATCH 65/94] fix --- src/kernels/lcal/src/lccl.cpp | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/src/kernels/lcal/src/lccl.cpp b/src/kernels/lcal/src/lccl.cpp index 8bf1255b..b1d81330 100644 --- a/src/kernels/lcal/src/lccl.cpp +++ b/src/kernels/lcal/src/lccl.cpp @@ -26,7 +26,7 @@ using namespace chrono; using namespace Mki; namespace Lcal { -using AclrtGetResInCurrentThreadFunc = int(*)(int, uint32_t*); +using AclrtGetResInCurrentThreadFunc = int(*)(int, uint32_t *); int GetAclResInCurThread(int type, uint32_t &resource) { @@ -36,7 +36,13 @@ int GetAclResInCurThread(int type, uint32_t &resource) static AclrtGetResInCurrentThreadFunc aclFn = nullptr; std::call_once(onceFlag, []() { - std::string p = std::string(Mki::GetEnv("ASCEND_HOME_PATH")) + "/runtime/lib64/libascendcl.so"; + std::string p; + const char *c = Mki::GetEnv("ASCEND_HOME_PATH"); + if (c) { + p = std::string(c) + "/runtime/lib64/libascendcl.so"; + } else { + p = "libascendcl.so"; + } auto dl = std::make_unique(p, false); if (!dl->IsValid()) { MKI_LOG(ERROR) << "Try load libascendcl.so failed: " << p; -- Gitee From 90bb43b677343b0d3f395c429a612d0adc99a498 Mon Sep 17 00:00:00 2001 From: qq_44359711 Date: Tue, 23 Sep 2025 15:13:06 +0800 Subject: [PATCH 66/94] update --- 
example/op_demo/activation/README.md | 4 +++- example/op_demo/all_gather/README.md | 4 +++- example/op_demo/all_reduce/README.md | 4 +++- example/op_demo/concat/README.md | 4 +++- example/op_demo/elewise/README.md | 4 +++- example/op_demo/fused_add_topk_div/README.md | 4 +++- example/op_demo/gather/README.md | 4 +++- example/op_demo/layer_norm/README.md | 4 +++- example/op_demo/linear/README.md | 7 ++----- example/op_demo/linear_parallel/README.md | 4 +++- example/op_demo/mla_preprocess/README.md | 5 ++++- .../op_demo/multi_latent_attention/README.md | 5 ++++- example/op_demo/paged_attention/README.md | 4 +++- example/op_demo/paged_cache_load/README.md | 16 ++++++++++++++-- example/op_demo/reshape_and_cache/README.md | 4 +++- example/op_demo/ring_mla/README.md | 6 ++++-- example/op_demo/rms_norm/README.md | 17 +++++++++++++++-- example/op_demo/rms_norm_backward/README.md | 4 +++- example/op_demo/rope/README.md | 16 ++++++++++++++-- example/op_demo/self_attention/README.md | 6 ++++-- example/op_demo/slice/README.md | 4 +++- example/op_demo/split/README.md | 4 +++- example/op_demo/transdata/README.md | 4 +++- example/op_demo/transpose/README.md | 5 +++-- 24 files changed, 109 insertions(+), 34 deletions(-) diff --git a/example/op_demo/activation/README.md b/example/op_demo/activation/README.md index 2616d541..af988f86 100644 --- a/example/op_demo/activation/README.md +++ b/example/op_demo/activation/README.md @@ -12,7 +12,9 @@ 例如: source ./ascend-transformer-boost/output/atb/set_env.sh - 运行demo - - bash build.sh + ```sh + bash build.sh + ``` **注意**: - 使用cxx_abi=0(默认)时,设置`D_GLIBCXX_USE_CXX11_ABI`为0,即: ```sh diff --git a/example/op_demo/all_gather/README.md b/example/op_demo/all_gather/README.md index 8897b7bf..0d071aa3 100644 --- a/example/op_demo/all_gather/README.md +++ b/example/op_demo/all_gather/README.md @@ -12,7 +12,9 @@ 例如: source ./ascend-transformer-boost/output/atb/set_env.sh - 运行demo - - bash build.sh + ```sh + bash build.sh + ``` **注意**: - 
使用cxx_abi=0(默认)时,设置`D_GLIBCXX_USE_CXX11_ABI`为0,即: ```sh diff --git a/example/op_demo/all_reduce/README.md b/example/op_demo/all_reduce/README.md index 17d8d4b0..75fc37fc 100644 --- a/example/op_demo/all_reduce/README.md +++ b/example/op_demo/all_reduce/README.md @@ -12,7 +12,9 @@ 例如: source ./ascend-transformer-boost/output/atb/set_env.sh - 运行demo - - bash build.sh + ```sh + bash build.sh + ``` **注意**: - 使用cxx_abi=0(默认)时,设置`D_GLIBCXX_USE_CXX11_ABI`为0,即: ```sh diff --git a/example/op_demo/concat/README.md b/example/op_demo/concat/README.md index dd76fc32..73ed0a5c 100644 --- a/example/op_demo/concat/README.md +++ b/example/op_demo/concat/README.md @@ -12,7 +12,9 @@ 例如: source ./ascend-transformer-boost/output/atb/set_env.sh - 运行demo - - bash build.sh + ```sh + bash build.sh + ``` **注意**: - 使用cxx_abi=0(默认)时,设置`D_GLIBCXX_USE_CXX11_ABI`为0,即: ```sh diff --git a/example/op_demo/elewise/README.md b/example/op_demo/elewise/README.md index cd9c719b..d86f53ed 100644 --- a/example/op_demo/elewise/README.md +++ b/example/op_demo/elewise/README.md @@ -12,7 +12,9 @@ 例如: source ./ascend-transformer-boost/output/atb/set_env.sh - 运行demo - - bash build.sh + ```sh + bash build.sh + ``` **注意**: - 使用cxx_abi=0(默认)时,设置`D_GLIBCXX_USE_CXX11_ABI`为0,即: ```sh diff --git a/example/op_demo/fused_add_topk_div/README.md b/example/op_demo/fused_add_topk_div/README.md index c4a50d74..ad529f6e 100644 --- a/example/op_demo/fused_add_topk_div/README.md +++ b/example/op_demo/fused_add_topk_div/README.md @@ -12,7 +12,9 @@ 例如: source ./ascend-transformer-boost/output/atb/set_env.sh - 运行demo - - bash build.sh + ```sh + bash build.sh + ``` **注意**: - 使用cxx_abi=0(默认)时,设置`D_GLIBCXX_USE_CXX11_ABI`为0,即: ```sh diff --git a/example/op_demo/gather/README.md b/example/op_demo/gather/README.md index b0e16a71..cdcd67a5 100644 --- a/example/op_demo/gather/README.md +++ b/example/op_demo/gather/README.md @@ -12,7 +12,9 @@ 例如: source ./ascend-transformer-boost/output/atb/set_env.sh - 运行demo - - bash build.sh + ```sh + 
bash build.sh + ``` **注意**: - 使用cxx_abi=0(默认)时,设置`D_GLIBCXX_USE_CXX11_ABI`为0,即: ```sh diff --git a/example/op_demo/layer_norm/README.md b/example/op_demo/layer_norm/README.md index cf4437eb..429c9c78 100644 --- a/example/op_demo/layer_norm/README.md +++ b/example/op_demo/layer_norm/README.md @@ -12,7 +12,9 @@ 例如: source ./ascend-transformer-boost/output/atb/set_env.sh - 运行demo - - bash build.sh + ```sh + bash build.sh + ``` **注意**: - 使用cxx_abi=0(默认)时,设置`D_GLIBCXX_USE_CXX11_ABI`为0,即: ```sh diff --git a/example/op_demo/linear/README.md b/example/op_demo/linear/README.md index e1547d92..5f244c0a 100644 --- a/example/op_demo/linear/README.md +++ b/example/op_demo/linear/README.md @@ -18,15 +18,12 @@ ```sh bash build.sh ``` - **注意**: - - 使用cxx_abi=0(默认)时,设置`D_GLIBCXX_USE_CXX11_ABI`为0,即: - + - 使用cxx_abi=0(默认)时,设置`D_GLIBCXX_USE_CXX11_ABI`为0,即: ```sh g++ -D_GLIBCXX_USE_CXX11_ABI=0 -I ... ``` - - 使用cxx_abi=1时,更改`D_GLIBCXX_USE_CXX11_ABI`为1,即: - + - 使用cxx_abi=1时,更改`D_GLIBCXX_USE_CXX11_ABI`为1,即: ```sh g++ -D_GLIBCXX_USE_CXX11_ABI=1 -I ... ``` diff --git a/example/op_demo/linear_parallel/README.md b/example/op_demo/linear_parallel/README.md index 3885aac5..238d1651 100644 --- a/example/op_demo/linear_parallel/README.md +++ b/example/op_demo/linear_parallel/README.md @@ -12,7 +12,9 @@ 例如: source ./ascend-transformer-boost/output/atb/set_env.sh - 运行demo - - bash build.sh + ```sh + bash build.sh + ``` **注意**: - 使用cxx_abi=0(默认)时,设置`D_GLIBCXX_USE_CXX11_ABI`为0,即: ```sh diff --git a/example/op_demo/mla_preprocess/README.md b/example/op_demo/mla_preprocess/README.md index 87c24619..50f59fc4 100644 --- a/example/op_demo/mla_preprocess/README.md +++ b/example/op_demo/mla_preprocess/README.md @@ -12,7 +12,9 @@ 例如: source ./ascend-transformer-boost/output/atb/set_env.sh - 运行demo - - bash build.sh + ```sh + bash build.sh + ``` **注意**: - 使用cxx_abi=0(默认)时,设置`D_GLIBCXX_USE_CXX11_ABI`为0,即: ```sh @@ -22,6 +24,7 @@ ```sh g++ -D_GLIBCXX_USE_CXX11_ABI=1 -I ... 
``` + - 提供的build脚本仅用于编译和运行mlapo_demo.cpp,如需编译其他demo,需要替换“mlapo_demo”为对应的cpp文件名 ## 额外说明 示例中生成的数据不代表实际场景,如需数据生成参考请查看python用例目录: diff --git a/example/op_demo/multi_latent_attention/README.md b/example/op_demo/multi_latent_attention/README.md index 55ef7ce4..cd042604 100644 --- a/example/op_demo/multi_latent_attention/README.md +++ b/example/op_demo/multi_latent_attention/README.md @@ -11,7 +11,9 @@ 1. 如果使用加速库源码编译,source [加速库源码路径]/output/atb/set_env.sh 例如: source ./ascend-transformer-boost/output/atb/set_env.sh - 运行demo - - bash build.sh + ```sh + bash build.sh + ``` **注意**: - 使用cxx_abi=0(默认)时,设置`D_GLIBCXX_USE_CXX11_ABI`为0,即: ```sh @@ -21,6 +23,7 @@ ```sh g++ -D_GLIBCXX_USE_CXX11_ABI=1 -I ... ``` + - 提供的build脚本仅用于编译和运行mlapo_demo.cpp,如需编译其他demo,需要替换“mlapo_demo”为对应的cpp文件名 ## 额外说明 示例中生成的数据不代表实际场景,如需数据生成参考请查看python用例目录: diff --git a/example/op_demo/paged_attention/README.md b/example/op_demo/paged_attention/README.md index 49fcbd6b..136271c2 100644 --- a/example/op_demo/paged_attention/README.md +++ b/example/op_demo/paged_attention/README.md @@ -12,7 +12,9 @@ 例如: source ./ascend-transformer-boost/output/atb/set_env.sh - 运行demo - - bash build.sh + ```sh + bash build.sh + ``` **注意**: - 使用cxx_abi=0(默认)时,设置`D_GLIBCXX_USE_CXX11_ABI`为0,即: ```sh diff --git a/example/op_demo/paged_cache_load/README.md b/example/op_demo/paged_cache_load/README.md index 055e0bef..66745e0c 100644 --- a/example/op_demo/paged_cache_load/README.md +++ b/example/op_demo/paged_cache_load/README.md @@ -11,8 +11,20 @@ 1. 如果使用加速库源码编译,source [加速库源码路径]/output/atb/set_env.sh 例如: source ./ascend-transformer-boost/output/atb/set_env.sh -- 编译、运行demo - - bash build.sh +- 运行demo + ```sh + bash build.sh + ``` + **注意**: + - 使用cxx_abi=0(默认)时,设置`D_GLIBCXX_USE_CXX11_ABI`为0,即: + ```sh + g++ -D_GLIBCXX_USE_CXX11_ABI=0 -I ... + ``` + - 使用cxx_abi=1时,更改`D_GLIBCXX_USE_CXX11_ABI`为1,即: + ```sh + g++ -D_GLIBCXX_USE_CXX11_ABI=1 -I ... 
+ ``` + - 提供的build脚本仅用于编译和运行paged_cache_load_demo.cpp,如需编译其他demo,需要替换“paged_cache_load_demo”为对应的cpp文件名 ## 额外说明 示例中生成的数据不代表实际场景,如需数据生成参考请查看python用例目录: diff --git a/example/op_demo/reshape_and_cache/README.md b/example/op_demo/reshape_and_cache/README.md index f4e4ede9..75580358 100644 --- a/example/op_demo/reshape_and_cache/README.md +++ b/example/op_demo/reshape_and_cache/README.md @@ -12,7 +12,9 @@ 例如: source ./ascend-transformer-boost/output/atb/set_env.sh - 运行demo - - bash build.sh + ```sh + bash build.sh + ``` **注意**: - 使用cxx_abi=0(默认)时,设置`D_GLIBCXX_USE_CXX11_ABI`为0,即: ```sh diff --git a/example/op_demo/ring_mla/README.md b/example/op_demo/ring_mla/README.md index 333678bd..4011ef7e 100644 --- a/example/op_demo/ring_mla/README.md +++ b/example/op_demo/ring_mla/README.md @@ -12,9 +12,11 @@ 例如: source ./ascend-transformer-boost/output/atb/set_env.sh - 运行demo - - bash build.sh + ```sh + bash build.sh + ``` **注意**: - - 使用cxx_abi=0时,设置`D_GLIBCXX_USE_CXX11_ABI`为0,即: + - 使用cxx_abi=0(默认)时,设置`D_GLIBCXX_USE_CXX11_ABI`为0,即: ```sh g++ -D_GLIBCXX_USE_CXX11_ABI=0 -I ... ``` diff --git a/example/op_demo/rms_norm/README.md b/example/op_demo/rms_norm/README.md index f3b43873..b0a35b58 100644 --- a/example/op_demo/rms_norm/README.md +++ b/example/op_demo/rms_norm/README.md @@ -11,8 +11,21 @@ 1. 如果使用加速库源码编译,source [加速库源码路径]/output/atb/set_env.sh 例如: source ./ascend-transformer-boost/output/atb/set_env.sh -- 编译、运行demo - - bash build.sh +- 运行demo + ```sh + bash build.sh + ``` + **注意**: + - 使用cxx_abi=0(默认)时,设置`D_GLIBCXX_USE_CXX11_ABI`为0,即: + ```sh + g++ -D_GLIBCXX_USE_CXX11_ABI=0 -I ... + ``` + - 使用cxx_abi=1时,更改`D_GLIBCXX_USE_CXX11_ABI`为1,即: + ```sh + g++ -D_GLIBCXX_USE_CXX11_ABI=1 -I ... 
+ ``` + - 提供的build脚本仅用于编译和运行rms_norm_demo.cpp,如需编译其他demo,需要替换“rms_norm_demo”为对应的cpp文件名 + ## 额外说明 示例中生成的数据不代表实际场景,如需数据生成参考请查看python用例目录: diff --git a/example/op_demo/rms_norm_backward/README.md b/example/op_demo/rms_norm_backward/README.md index 78daf3f9..409088a7 100644 --- a/example/op_demo/rms_norm_backward/README.md +++ b/example/op_demo/rms_norm_backward/README.md @@ -12,7 +12,9 @@ 例如: source ./ascend-transformer-boost/output/atb/set_env.sh - 运行demo - - bash build.sh + ```sh + bash build.sh + ``` **注意**: - 使用cxx_abi=0(默认)时,设置`D_GLIBCXX_USE_CXX11_ABI`为0,即: ```sh diff --git a/example/op_demo/rope/README.md b/example/op_demo/rope/README.md index c6c1ec27..e2072c77 100644 --- a/example/op_demo/rope/README.md +++ b/example/op_demo/rope/README.md @@ -11,8 +11,20 @@ 1. 如果使用加速库源码编译,source [加速库源码路径]/output/atb/set_env.sh 例如: source ./ascend-transformer-boost/output/atb/set_env.sh -- 编译、运行demo - - bash build.sh +- 运行demo + ```sh + bash build.sh + ``` + **注意**: + - 使用cxx_abi=0(默认)时,设置`D_GLIBCXX_USE_CXX11_ABI`为0,即: + ```sh + g++ -D_GLIBCXX_USE_CXX11_ABI=0 -I ... + ``` + - 使用cxx_abi=1时,更改`D_GLIBCXX_USE_CXX11_ABI`为1,即: + ```sh + g++ -D_GLIBCXX_USE_CXX11_ABI=1 -I ... + ``` + - 提供的build脚本仅用于编译和运行rope_demo.cpp,如需编译其他demo,需要替换“rope_demo”为对应的cpp文件名 ## 额外说明 示例中生成的数据不代表实际场景,如需数据生成参考请查看python用例目录: diff --git a/example/op_demo/self_attention/README.md b/example/op_demo/self_attention/README.md index 8f428192..1b56c3af 100644 --- a/example/op_demo/self_attention/README.md +++ b/example/op_demo/self_attention/README.md @@ -12,9 +12,11 @@ 例如: source ./ascend-transformer-boost/output/atb/set_env.sh - 运行demo - - bash build.sh + ```sh + bash build.sh + ``` **注意**: - - 使用cxx_abi=0时,设置`D_GLIBCXX_USE_CXX11_ABI`为0,即: + - 使用cxx_abi=0(默认)时,设置`D_GLIBCXX_USE_CXX11_ABI`为0,即: ```sh g++ -D_GLIBCXX_USE_CXX11_ABI=0 -I ... 
``` diff --git a/example/op_demo/slice/README.md b/example/op_demo/slice/README.md index aa0ebb81..352a04a9 100644 --- a/example/op_demo/slice/README.md +++ b/example/op_demo/slice/README.md @@ -12,7 +12,9 @@ 例如: source ./ascend-transformer-boost/output/atb/set_env.sh - 运行demo - - bash build.sh + ```sh + bash build.sh + ``` **注意**: - 使用cxx_abi=0(默认)时,设置`D_GLIBCXX_USE_CXX11_ABI`为0,即: ```sh diff --git a/example/op_demo/split/README.md b/example/op_demo/split/README.md index 39fa1e02..d33fa164 100644 --- a/example/op_demo/split/README.md +++ b/example/op_demo/split/README.md @@ -12,7 +12,9 @@ 例如: source ./ascend-transformer-boost/output/atb/set_env.sh - 运行demo - - bash build.sh + ```sh + bash build.sh + ``` **注意**: - 使用cxx_abi=0(默认)时,设置`D_GLIBCXX_USE_CXX11_ABI`为0,即: ```sh diff --git a/example/op_demo/transdata/README.md b/example/op_demo/transdata/README.md index 99b54219..6db3ab25 100644 --- a/example/op_demo/transdata/README.md +++ b/example/op_demo/transdata/README.md @@ -12,7 +12,9 @@ 例如: source ./ascend-transformer-boost/output/atb/set_env.sh - 运行demo - - bash build.sh + ```sh + bash build.sh + ``` **注意**: - 使用cxx_abi=0(默认)时,设置`D_GLIBCXX_USE_CXX11_ABI`为0,即: ```sh diff --git a/example/op_demo/transpose/README.md b/example/op_demo/transpose/README.md index d73e19e9..bc2a8a56 100644 --- a/example/op_demo/transpose/README.md +++ b/example/op_demo/transpose/README.md @@ -12,7 +12,9 @@ 例如: source ./ascend-transformer-boost/output/atb/set_env.sh - 运行demo - - bash build.sh + ```sh + bash build.sh + ``` **注意**: - 使用cxx_abi=0(默认)时,设置`D_GLIBCXX_USE_CXX11_ABI`为0,即: ```sh @@ -22,7 +24,6 @@ ```sh g++ -D_GLIBCXX_USE_CXX11_ABI=1 -I ... 
``` - ## 额外说明 示例中生成的数据不代表实际场景,如需数据生成参考请查看python用例目录: tests/apitest/opstest/python/operations/transpose_demo/ -- Gitee From 12257c40941ef0af7e43cfee4313b4315dccaf08 Mon Sep 17 00:00:00 2001 From: huangxiaolan Date: Tue, 23 Sep 2025 15:55:54 +0800 Subject: [PATCH 67/94] add fix predefine --- .../tbe_adapter/platform/tiling/platform/platform_ascendc.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/kernels/tbe_adapter/platform/tiling/platform/platform_ascendc.h b/src/kernels/tbe_adapter/platform/tiling/platform/platform_ascendc.h index 6eda0979..6fa0fcfd 100644 --- a/src/kernels/tbe_adapter/platform/tiling/platform/platform_ascendc.h +++ b/src/kernels/tbe_adapter/platform/tiling/platform/platform_ascendc.h @@ -18,6 +18,7 @@ #include #include +#include "stubs\include\metadef\inc\external\platform\platform_info.h" #define ASCENDC_ASSERT(cond, behavior) \ do { \ @@ -26,9 +27,6 @@ raise(SIGABRT); \ } \ } while (0) -namespace fe { -class PlatFormInfos; -} namespace platform_ascendc { enum class CoreMemType { -- Gitee From 2a2d8b6936b63f734da476a005438c2ed0396e06 Mon Sep 17 00:00:00 2001 From: huangxiaolan Date: Tue, 23 Sep 2025 15:58:54 +0800 Subject: [PATCH 68/94] fix expression --- src/kernels/mixkernels/fusion/tiling/fusion_tiling.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/kernels/mixkernels/fusion/tiling/fusion_tiling.cpp b/src/kernels/mixkernels/fusion/tiling/fusion_tiling.cpp index 130128ed..142e6a26 100644 --- a/src/kernels/mixkernels/fusion/tiling/fusion_tiling.cpp +++ b/src/kernels/mixkernels/fusion/tiling/fusion_tiling.cpp @@ -22,9 +22,9 @@ Status FusionTiling(const LaunchParam &launchParam, KernelInfo &kernelInfo) OpParam::Fusion fusionType = launchParam.GetParam(); std::string path(std::getenv("HOME")); path += std::string("/.atb_auto_fusion/bishengir_bin/") + - (fusionType.fusionType ? "libmatmul_add.so" : "libmatmul_gelu.so" == OpParam::Fusion::MATMUL_ADD); + ((fusionType.fusionType ? 
"libmatmul_add.so" : "libmatmul_gelu.so") == OpParam::Fusion::MATMUL_ADD); std::string inferWorkspaceFuncName = - (fusionType.fusionType ? "matmul_add_" : "matmul_gelu_" == OpParam::Fusion::MATMUL_ADD); + ((fusionType.fusionType ? "matmul_add_" : "matmul_gelu_") == OpParam::Fusion::MATMUL_ADD); FusionTilingData *tilingDataPtr = reinterpret_cast(kernelInfo.GetTilingHostAddr()); void *handle = dlopen(path.c_str(), RTLD_LAZY); if (!handle) { -- Gitee From e4e9517544540766c8be93cf2d32361786fa2868 Mon Sep 17 00:00:00 2001 From: wanyukang Date: Tue, 23 Sep 2025 16:02:49 +0800 Subject: [PATCH 69/94] compute --- src/kernels/mixkernels/toppsample/op_kernel/toppsample.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/kernels/mixkernels/toppsample/op_kernel/toppsample.cpp b/src/kernels/mixkernels/toppsample/op_kernel/toppsample.cpp index fbc9eda7..0b1d7c09 100644 --- a/src/kernels/mixkernels/toppsample/op_kernel/toppsample.cpp +++ b/src/kernels/mixkernels/toppsample/op_kernel/toppsample.cpp @@ -18,7 +18,6 @@ static constexpr uint32_t MAX_CORE_NUM = 512; static constexpr uint32_t BLK_SIZE = 32; static constexpr uint32_t DEFAULT_STRIDE = 8; static constexpr uint32_t FP32_PER_REPEAT = 64; -static constexpr uint32_t FP16_PER_REPEAT = 128; static constexpr uint32_t FP16_PER_BLOCK = 16; static constexpr uint32_t NUM_4 = 4; @@ -183,7 +182,7 @@ private: AscendC::LocalTensor fp32Buf = fp32Buf_.Get(); AscendC::LocalTensor fp16Buf = fp32Buf_.Get(); AscendC::LocalTensor fp32TempBuf = tempBuf_.Get(); - uint32_t copyEleNumAlignF16_ = (copyEleNum + FP16_PER_REPEAT - 1) / FP16_PER_REPEAT * FP16_PER_REPEAT; + uint32_t copyEleNumAlignF16_ = (copyEleNum + FP16_PER_BLOCK - 1) / FP16_PER_BLOCK * FP16_PER_BLOCK; uint32_t copyEleNumAlignF32_ = (copyEleNum + FP32_PER_REPEAT - 1) / FP32_PER_REPEAT * FP32_PER_REPEAT; for (uint32_t dupVal = copyEleNum; dupVal < copyEleNumAlignF16_; dupVal++) { buf.SetValue(dupVal, T(1)); -- Gitee From 65a220b3bc4bb99914137bab4493364b94b06997 
Mon Sep 17 00:00:00 2001 From: huangxiaolan Date: Tue, 23 Sep 2025 16:20:09 +0800 Subject: [PATCH 70/94] fix expression --- src/kernels/mixkernels/fusion/tiling/fusion_tiling.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/kernels/mixkernels/fusion/tiling/fusion_tiling.cpp b/src/kernels/mixkernels/fusion/tiling/fusion_tiling.cpp index 142e6a26..2f85cfd7 100644 --- a/src/kernels/mixkernels/fusion/tiling/fusion_tiling.cpp +++ b/src/kernels/mixkernels/fusion/tiling/fusion_tiling.cpp @@ -22,9 +22,9 @@ Status FusionTiling(const LaunchParam &launchParam, KernelInfo &kernelInfo) OpParam::Fusion fusionType = launchParam.GetParam(); std::string path(std::getenv("HOME")); path += std::string("/.atb_auto_fusion/bishengir_bin/") + - ((fusionType.fusionType ? "libmatmul_add.so" : "libmatmul_gelu.so") == OpParam::Fusion::MATMUL_ADD); + (fusionType.fusionType == OpParam::Fusion::MATMUL_ADD ? "libmatmul_add.so" : "libmatmul_gelu.so"); std::string inferWorkspaceFuncName = - ((fusionType.fusionType ? "matmul_add_" : "matmul_gelu_") == OpParam::Fusion::MATMUL_ADD); + (fusionType.fusionType == OpParam::Fusion::MATMUL_ADD ? 
"matmul_add_" : "matmul_gelu_"); FusionTilingData *tilingDataPtr = reinterpret_cast(kernelInfo.GetTilingHostAddr()); void *handle = dlopen(path.c_str(), RTLD_LAZY); if (!handle) { -- Gitee From 6cfe7b3baf028b0b248a287adcf4d2aaeef1b884 Mon Sep 17 00:00:00 2001 From: huangxiaolan Date: Tue, 23 Sep 2025 16:27:48 +0800 Subject: [PATCH 71/94] fix include order --- src/atb/utils/aclnn_util.cpp | 2 +- src/kernels/mixkernels/blockcopy/tiling/blockcopy_tiling.cpp | 2 +- src/ops_infer/linear_parallel/linear_parallel_aclnn_runner.cpp | 2 +- src/ops_infer/mla_preprocess/mla_preprocess_aclnn_runner.cpp | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/atb/utils/aclnn_util.cpp b/src/atb/utils/aclnn_util.cpp index dcbd1ac7..83ba2254 100644 --- a/src/atb/utils/aclnn_util.cpp +++ b/src/atb/utils/aclnn_util.cpp @@ -7,12 +7,12 @@ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. * See LICENSE in the root of the software repository for the full text of the License. */ +#include "aclnn_util.h" #include #include #include -#include "aclnn_util.h" #include "log.h" namespace { diff --git a/src/kernels/mixkernels/blockcopy/tiling/blockcopy_tiling.cpp b/src/kernels/mixkernels/blockcopy/tiling/blockcopy_tiling.cpp index 2ccdb31e..c3cb907e 100644 --- a/src/kernels/mixkernels/blockcopy/tiling/blockcopy_tiling.cpp +++ b/src/kernels/mixkernels/blockcopy/tiling/blockcopy_tiling.cpp @@ -7,8 +7,8 @@ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. * See LICENSE in the root of the software repository for the full text of the License. 
*/ -#include #include "blockcopy_tiling.h" +#include #include #include #include diff --git a/src/ops_infer/linear_parallel/linear_parallel_aclnn_runner.cpp b/src/ops_infer/linear_parallel/linear_parallel_aclnn_runner.cpp index 7e151f31..a2eabc4e 100644 --- a/src/ops_infer/linear_parallel/linear_parallel_aclnn_runner.cpp +++ b/src/ops_infer/linear_parallel/linear_parallel_aclnn_runner.cpp @@ -9,9 +9,9 @@ */ #include "linear_parallel_aclnn_runner.h" +#include #include "atb/utils/dl_manager.h" #include "atb/utils/aclnn_util.h" -#include namespace atb { diff --git a/src/ops_infer/mla_preprocess/mla_preprocess_aclnn_runner.cpp b/src/ops_infer/mla_preprocess/mla_preprocess_aclnn_runner.cpp index 4c5a102e..eaf43ae3 100644 --- a/src/ops_infer/mla_preprocess/mla_preprocess_aclnn_runner.cpp +++ b/src/ops_infer/mla_preprocess/mla_preprocess_aclnn_runner.cpp @@ -11,7 +11,7 @@ #include "atb/utils/dl_manager.h" #include "atb/utils/aclnn_util.h" #include "atb/utils/log.h" -#include +#include "atbops/params/params.h" namespace { static const uint32_t IN_TENSOR_NUM = 24; -- Gitee From dc1214869ecd028fc4f99e7a38d276e6ab012cf2 Mon Sep 17 00:00:00 2001 From: huangxiaolan Date: Tue, 23 Sep 2025 16:42:55 +0800 Subject: [PATCH 72/94] fix bool expression --- .../gmm_deq_swiglu_quant_gmm_deq_operation.cpp | 4 ++-- .../mm_deq_swiglu_quant_mm_deq_operation.cpp | 4 ++-- .../gmm_deq_swiglu_quant_gmm_deq_operation.cpp | 4 ++-- .../mm_deq_swiglu_quant_mm_deq_operation.cpp | 4 ++-- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/src/kernels/mixkernels/gmm_deq_swiglu_quant_gmm_deq/gmm_deq_swiglu_quant_gmm_deq_operation.cpp b/src/kernels/mixkernels/gmm_deq_swiglu_quant_gmm_deq/gmm_deq_swiglu_quant_gmm_deq_operation.cpp index c03469b1..1337ebf5 100644 --- a/src/kernels/mixkernels/gmm_deq_swiglu_quant_gmm_deq/gmm_deq_swiglu_quant_gmm_deq_operation.cpp +++ b/src/kernels/mixkernels/gmm_deq_swiglu_quant_gmm_deq/gmm_deq_swiglu_quant_gmm_deq_operation.cpp @@ -102,9 +102,9 @@ private: "Param 
groupListType only support GROUP_LIST_CUM_SUM (0).", return false); MKI_CHECK(param.weightUpPermuteType != OpParam::GmmDeqSwigluQuantGmmDeq::PERMUTE_INVALID, "Param weightUpPermuteType has invalid value.", return false); - MKI_CHECK(param.transposeWeightUp == false, + MKI_CHECK(!param.transposeWeightUp, "Param transposeWeightUp only support false.", return false); - MKI_CHECK(param.transposeWeightDown == true, + MKI_CHECK(param.transposeWeightDown, "Param transposeWeightDown only support true.", return false); return true; } diff --git a/src/kernels/mixkernels/mm_deq_swiglu_quant_mm_deq/mm_deq_swiglu_quant_mm_deq_operation.cpp b/src/kernels/mixkernels/mm_deq_swiglu_quant_mm_deq/mm_deq_swiglu_quant_mm_deq_operation.cpp index 2a402181..135beb06 100644 --- a/src/kernels/mixkernels/mm_deq_swiglu_quant_mm_deq/mm_deq_swiglu_quant_mm_deq_operation.cpp +++ b/src/kernels/mixkernels/mm_deq_swiglu_quant_mm_deq/mm_deq_swiglu_quant_mm_deq_operation.cpp @@ -94,9 +94,9 @@ private: "Param outputType only support OUTPUT_FLOAT16 (0).", return false); MKI_CHECK(param.weightUpPermuteType != OpParam::MmDeqSwigluQuantMmDeq::PERMUTE_INVALID, "Param weightUpPermuteType has invalid value.", return false); - MKI_CHECK(param.transposeWeightUp == false, + MKI_CHECK(!param.transposeWeightUp, "Param transposeWeightUp only support false.", return false); - MKI_CHECK(param.transposeWeightDown == true, + MKI_CHECK(param.transposeWeightDown, "Param transposeWeightDown only support true.", return false); return true; } diff --git a/src/ops_infer/gmm_deq_swiglu_quant_gmm_deq/gmm_deq_swiglu_quant_gmm_deq_operation.cpp b/src/ops_infer/gmm_deq_swiglu_quant_gmm_deq/gmm_deq_swiglu_quant_gmm_deq_operation.cpp index e63c92e9..de8be698 100644 --- a/src/ops_infer/gmm_deq_swiglu_quant_gmm_deq/gmm_deq_swiglu_quant_gmm_deq_operation.cpp +++ b/src/ops_infer/gmm_deq_swiglu_quant_gmm_deq/gmm_deq_swiglu_quant_gmm_deq_operation.cpp @@ -102,12 +102,12 @@ bool ParamCheck(const atb::infer::GmmDeqSwigluQuantGmmDeqParam 
&opParam) return false; } - if (opParam.transposeWeightUp != false) { + if (opParam.transposeWeightUp) { ATB_LOG(ERROR) << "Param transposeWeightUp only support false."; return false; } - if (opParam.transposeWeightDown != true) { + if (!opParam.transposeWeightDown) { ATB_LOG(ERROR) << "Param transposeWeightDown only support true."; return false; } diff --git a/src/ops_infer/mm_deq_swiglu_quant_mm_deq/mm_deq_swiglu_quant_mm_deq_operation.cpp b/src/ops_infer/mm_deq_swiglu_quant_mm_deq/mm_deq_swiglu_quant_mm_deq_operation.cpp index d96602ba..64cf34fb 100644 --- a/src/ops_infer/mm_deq_swiglu_quant_mm_deq/mm_deq_swiglu_quant_mm_deq_operation.cpp +++ b/src/ops_infer/mm_deq_swiglu_quant_mm_deq/mm_deq_swiglu_quant_mm_deq_operation.cpp @@ -87,12 +87,12 @@ bool ParamCheck(const atb::infer::MmDeqSwigluQuantMmDeqParam &opParam) return false; } - if (opParam.transposeWeightUp != false) { + if (opParam.transposeWeightUp) { ATB_LOG(ERROR) << "Param transposeWeightUp only support false."; return false; } - if (opParam.transposeWeightDown != true) { + if (!opParam.transposeWeightDown) { ATB_LOG(ERROR) << "Param transposeWeightDown only support true."; return false; } -- Gitee From 9dc70dec45ace7186f54303064fe0e76a7b6e61e Mon Sep 17 00:00:00 2001 From: huangxiaolan Date: Tue, 23 Sep 2025 16:46:32 +0800 Subject: [PATCH 73/94] fix variable init --- example/multiStream/multiStream_multiGraph_demo.cpp | 3 +-- example/multiStream/multiStream_singleGraph_demo.cpp | 3 +-- src/ops_infer/ring_mla/ring_mla_operation.cpp | 3 +-- 3 files changed, 3 insertions(+), 6 deletions(-) diff --git a/example/multiStream/multiStream_multiGraph_demo.cpp b/example/multiStream/multiStream_multiGraph_demo.cpp index ff6491a0..17a0e039 100644 --- a/example/multiStream/multiStream_multiGraph_demo.cpp +++ b/example/multiStream/multiStream_multiGraph_demo.cpp @@ -237,8 +237,7 @@ int main() packRW.outTensors.resize(outTensorNum); operationWR->InferShape(intensorDescs, outtensorDescs); - aclError ret; - ret = 
CreateInTensors(packWR.inTensors, intensorDescs); + aclError ret = CreateInTensors(packWR.inTensors, intensorDescs); if (ret != 0) { exit(ret); } diff --git a/example/multiStream/multiStream_singleGraph_demo.cpp b/example/multiStream/multiStream_singleGraph_demo.cpp index d95873fd..e1eb53af 100644 --- a/example/multiStream/multiStream_singleGraph_demo.cpp +++ b/example/multiStream/multiStream_singleGraph_demo.cpp @@ -264,8 +264,7 @@ int main() outtensorDescs.resize(outTensorNum); pack.outTensors.resize(outTensorNum); operation->InferShape(intensorDescs, outtensorDescs); - aclError ret; - ret = CreateOutTensors(pack.outTensors, outtensorDescs); + aclError ret = CreateOutTensors(pack.outTensors, outtensorDescs); if (ret != 0) { exit(ret); } diff --git a/src/ops_infer/ring_mla/ring_mla_operation.cpp b/src/ops_infer/ring_mla/ring_mla_operation.cpp index 8106fdd3..0376eabb 100644 --- a/src/ops_infer/ring_mla/ring_mla_operation.cpp +++ b/src/ops_infer/ring_mla/ring_mla_operation.cpp @@ -394,8 +394,7 @@ bool RingMLAOperation::InputLseDimCheck(const SVector &inTensorDescs Status RingMLAOperation::InferShapeCheckImpl(const SVector &inTensorDescs) const { - Status st; - st = DimCheck(inTensorDescs); + Status st = DimCheck(inTensorDescs); if (st != NO_ERROR) { return st; } -- Gitee From 61968c9d25d1d79c774982a50e0d5b04d4e9664f Mon Sep 17 00:00:00 2001 From: huangxiaolan Date: Tue, 23 Sep 2025 16:55:10 +0800 Subject: [PATCH 74/94] add brackets --- .../multi_latent_attention/multi_latent_attention_operation.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ops_infer/multi_latent_attention/multi_latent_attention_operation.cpp b/src/ops_infer/multi_latent_attention/multi_latent_attention_operation.cpp index 8bdfceaf..4cfbc6cd 100644 --- a/src/ops_infer/multi_latent_attention/multi_latent_attention_operation.cpp +++ b/src/ops_infer/multi_latent_attention/multi_latent_attention_operation.cpp @@ -461,7 +461,7 @@ Status 
MultiLatentAttentionOperation::DimCheckInt8Nz(const SVector & return ERROR_INVALID_TENSOR_DIM; } if (inTensorDesc.at(idx + 1).shape.dims[0] != param_.headNum) { - ATB_LOG(ERROR) << GetLogPrefix() << "dim 0 of of pvDescale(intensor" << idx + 1 + ATB_LOG(ERROR) << GetLogPrefix() << "dim 0 of of pvDescale(intensor" << (idx + 1) << ") should be equal to dim0 of headNum"; return ERROR_INVALID_TENSOR_DIM; } -- Gitee From 8c535d08d8f36b0b32c2f3f37e78cbec3bd08072 Mon Sep 17 00:00:00 2001 From: zouyanlong Date: Tue, 23 Sep 2025 17:04:29 +0800 Subject: [PATCH 75/94] fix --- .../linear_parallel_aclnn_runner.cpp | 18 +- .../linear_parallel_aclnn_runner.h | 1 - .../linear_parallel_operation.cpp | 31 ++- .../linear_parallel_generation.cpp | 261 ++++++++++++++++++ ...near_parallel_mc2_linear_reduce_scatter.py | 112 ++++++++ .../mc2_linear_reduce_scatter/run_test.sh | 94 +++++++ tests/apitest/opstest/cpp/precision_calcu.py | 147 ++++++++++ tests/apitest/opstest/csv/linear_parallel.csv | 9 + 8 files changed, 651 insertions(+), 22 deletions(-) create mode 100644 tests/apitest/opstest/cpp/linear_parallel/mc2_linear_reduce_scatter/linear_parallel_generation.cpp create mode 100644 tests/apitest/opstest/cpp/linear_parallel/mc2_linear_reduce_scatter/linear_parallel_mc2_linear_reduce_scatter.py create mode 100644 tests/apitest/opstest/cpp/linear_parallel/mc2_linear_reduce_scatter/run_test.sh create mode 100644 tests/apitest/opstest/cpp/precision_calcu.py diff --git a/src/ops_infer/linear_parallel/linear_parallel_aclnn_runner.cpp b/src/ops_infer/linear_parallel/linear_parallel_aclnn_runner.cpp index 7e151f31..d8f4abb1 100644 --- a/src/ops_infer/linear_parallel/linear_parallel_aclnn_runner.cpp +++ b/src/ops_infer/linear_parallel/linear_parallel_aclnn_runner.cpp @@ -19,10 +19,6 @@ namespace atb { static const uint32_t LINEAR_REDUCE_SCATTER_IN_TENSOR_NUM = 6; static const uint32_t LINEAR_REDUCE_SCATTER_OUT_TENSOR_NUM = 2; - -static const uint32_t BIAS_TENSOR_INDEX = 2; - - aclnnStatus 
(*LinearParallelAclnnRunner::aclnnMatmulReduceScatterV2GetWorkspaceSizeFunc_)( const aclTensor *, const aclTensor *, const aclTensor *, const aclTensor *, const aclTensor *, const aclTensor *, int64_t, const char *, const char *, int64_t, int64_t, int64_t, const char *, const aclTensor *, const aclTensor *, @@ -60,15 +56,21 @@ Status LinearParallelAclnnRunner::BuildAclnnVariantPack(const RunnerVariantPack this->aclnnVariantPack_.aclInTensors.resize(LINEAR_REDUCE_SCATTER_IN_TENSOR_NUM); for (size_t i = 0; i < this->aclnnVariantPack_.aclInTensors.size(); ++i) { std::shared_ptr aclnnTensorPtr = std::make_shared(); - if (i >= 3 || (!param_.hasResidual && i == BIAS_TENSOR_INDEX)) { + if (i > 1) { this->aclnnVariantPack_.aclInTensors[i] = aclnnTensorPtr; continue; } atb::Tensor atbTensor = runnerVariantPack.inTensors.at(i); aclnnTensorPtr->atbTensor = atbTensor; - aclnnTensorPtr->strides = (i == 1 && param_.transWeight) ? GetTransposeTensorStride(atbTensor.desc.shape) : - GetCopyTensorStride(atbTensor.desc.shape); - ret = CallAclCreateTensor(atbTensor.desc.shape, atbTensor.desc.shape, atbTensor, aclnnTensorPtr); + atb::Dims viewDims = atbTensor.desc.shape; + if (i == 1 && param_.transWeight) { + aclnnTensorPtr->strides = GetTransposeTensorStride(viewDims); + viewDims.dims[0] = atbTensor.desc.shape.dims[1]; + viewDims.dims[1] = atbTensor.desc.shape.dims[0]; + } else { + aclnnTensorPtr->strides = GetCopyTensorStride(viewDims); + } + ret = CallAclCreateTensor(viewDims, atbTensor.desc.shape, atbTensor, aclnnTensorPtr); if (ret != NO_ERROR) { ATB_LOG(ERROR) << GetLogPrefix() << "create aclTensor by aclCreateTensor failed!"; return ret; diff --git a/src/ops_infer/linear_parallel/linear_parallel_aclnn_runner.h b/src/ops_infer/linear_parallel/linear_parallel_aclnn_runner.h index b8f55e79..3a8cfb74 100644 --- a/src/ops_infer/linear_parallel/linear_parallel_aclnn_runner.h +++ b/src/ops_infer/linear_parallel/linear_parallel_aclnn_runner.h @@ -29,7 +29,6 @@ protected: aclnnStatus 
SetAclNNWorkspaceExecutor() override; Status LaunchAclnnKernel() override; - private: HcclRunner hcclRunner_; infer::LinearParallelParam param_; diff --git a/src/ops_infer/linear_parallel/linear_parallel_operation.cpp b/src/ops_infer/linear_parallel/linear_parallel_operation.cpp index 9ce37e92..21655c11 100644 --- a/src/ops_infer/linear_parallel/linear_parallel_operation.cpp +++ b/src/ops_infer/linear_parallel/linear_parallel_operation.cpp @@ -104,8 +104,8 @@ bool CheckType(const infer::LinearParallelParam &opParam, Status &isOK) bool CheckTypeMc2(const infer::LinearParallelParam &opParam, Status &isOK) { - if (opParam.transWeight) { - ATB_LOG(ERROR) << "When LinearParallel backend is mc2, not support transWeight"; + if (opParam.hasResidual) { + ATB_LOG(ERROR) << "When LinearParallel backend is mc2, not support residual"; isOK = ERROR_INVALID_PARAM; return true; } @@ -114,8 +114,10 @@ bool CheckTypeMc2(const infer::LinearParallelParam &opParam, Status &isOK) isOK = ERROR_INVALID_PARAM; return true; } - if (opParam.quantType != atb::infer::LinearParallelParam::QuantType::QUANT_TYPE_UNQUANT) { - ATB_LOG(ERROR) << "When LinearParallel backend is mc2, only support quantType[QUANT_TYPE_UNQUANT]"; + if (opParam.quantType != atb::infer::LinearParallelParam::QuantType::QUANT_TYPE_UNDEFINED || + opParam.quantType != atb::infer::LinearParallelParam::QuantType::QUANT_TYPE_UNQUANT) { + ATB_LOG(ERROR) + << "When LinearParallel backend is mc2, only support quantType[QUANT_TYPE_UNDEFINED][QUANT_TYPE_UNQUANT]"; isOK = ERROR_INVALID_PARAM; return true; } @@ -126,11 +128,6 @@ bool CheckTypeMc2(const infer::LinearParallelParam &opParam, Status &isOK) isOK = ERROR_INVALID_PARAM; return true; } - if (opParam.quantType == atb::infer::LinearParallelParam::QuantType::QUANT_TYPE_PER_TOKEN) { - ATB_LOG(ERROR) << "When LinearParallel backend is mc2, not support quantType[QUANT_TYPE_PER_TOKEN]"; - isOK = ERROR_INVALID_PARAM; - return true; - } return false; } @@ -416,10 +413,18 @@ Status 
LinearParallelOperation::InferShapeCheckLinearReduceScatter(const SVector return ERROR_INVALID_TENSOR_DIM; } if (param_.backend == "mc2") { + if (inTensorDescs.at(0).shape.dimNum != IN_TENSOR_DIM_NUM) { + ATB_LOG(ERROR) << GetLogPrefix() << "inTensor0 dimNum should be equal to 2"; + return ERROR_INVALID_TENSOR_DIM_NUM; + } + if (inTensorDescs.at(1).shape.dimNum != IN_TENSOR_DIM_NUM) { + ATB_LOG(ERROR) << GetLogPrefix() << "inTensor1 dimNum should be equal to 2"; + return ERROR_INVALID_TENSOR_DIM_NUM; + } int64_t xTensorK = OperationUtil::GetXTensorK(inTensorDescs.at(0)); - if (xTensorK < 256 || xTensorK > 65535) { + if (xTensorK < 256 || xTensorK >= 65535) { ATB_LOG(ERROR) << GetLogPrefix() << "inTensor0 k [" << xTensorK - << "] should be an integer between [256 ~ 65535]"; + << "] should be an integer between [256 ~ 65535)"; return ERROR_INVALID_TENSOR_DIM; } } @@ -446,8 +451,8 @@ Status LinearParallelOperation::InferShapeCheckAllGatherLinear(const SVector &inTensorDescs) const +Status LinearParallelOperation::InferShapeCheckAllGatherLinearReduceScatter( + const SVector &inTensorDescs) const { if (param_.twoDimTPInfo.rsDim * param_.twoDimTPInfo.agDim != param_.rankSize) { ATB_LOG(ERROR) << "agDim * rsDim should equal to rankSize"; diff --git a/tests/apitest/opstest/cpp/linear_parallel/mc2_linear_reduce_scatter/linear_parallel_generation.cpp b/tests/apitest/opstest/cpp/linear_parallel/mc2_linear_reduce_scatter/linear_parallel_generation.cpp new file mode 100644 index 00000000..cf744275 --- /dev/null +++ b/tests/apitest/opstest/cpp/linear_parallel/mc2_linear_reduce_scatter/linear_parallel_generation.cpp @@ -0,0 +1,261 @@ +/* + * Copyright (c) 2025 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. 
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include "acl/acl.h" +#include "atb/types.h" +#include "atb/atb_infer.h" +#include "atb/operation.h" + + +#define CHECK_STATUS(status) \ + do { \ + if ((status) != 0) { \ + std::cout << __FILE__ << ":" << __LINE__ << " [error]: " << (status) << std::endl; \ + return status; \ + } \ + } while (0) + +const int32_t DEV_NUM = 2; + +const int32_t M = 2; +const int32_t K = 256; +const int32_t N = 2; + +const aclDataType DATA_TYPE = aclDataType::ACL_FLOAT16; + +typedef uint16_t float16; +typedef uint16_t bfloat16; + +float16 FloatToFloat16(float fp32) +{ + if (fp32 == 0.0f) { + return (std::signbit(fp32) ? 0x8000 : 0x0000); + } + + uint32_t float_bits; + static_assert(sizeof(float) == sizeof(uint32_t), "Float size mismatch"); + std::memcpy(&float_bits, &fp32, sizeof(float)); + + const uint32_t sign = (float_bits >> 31) & 0x1; + const uint32_t exp = (float_bits >> 23) & 0xFF; + const uint32_t mant = float_bits & 0x7FFFFF; + if (exp == 0xFF) { + if (mant == 0) { + return (sign << 15) | 0x7C00; + } else { + return (sign << 15) | 0x7C00 | (mant >> 13); + } + } + + int32_t exp_fp16 = static_cast(exp) - 127 + 15; + if (exp_fp16 <= 0) { + return (sign << 15); + } + + if (exp_fp16 >= 0x1F) { + return (sign < 15) | 0x7C00; + } + + uint32_t mant24 = (1 << 23) | mant; + uint32_t round_bits = mant24 & 0x1FFF; + uint32_t base = (mant24 >> 13) & 0x3FF; + + if (round_bits > 0x1000 || (round_bits == 0x1000 && (base & 1))) { + base++; + if (base > 0xFF) { + base = 0; + exp_fp16++; + if (exp_fp16 >= 0x1F) { + return (sign << 15) | 0x7C00; + } + } + } + + return (sign << 15) | (exp_fp16 << 10) | base; +} + 
+bfloat16 FloatToBfloat16(float fp32) +{ + if (fp32 == 0.0f) { + return (std::signbit(fp32) ? 0x8000 : 0x0000); + } + + uint32_t float_bits; + static_assert(sizeof(float) == sizeof(uint32_t), "Float size mismatch"); + std::memcpy(&float_bits, &fp32, sizeof(float)); + + bfloat16 bfloat16_bits = static_cast(float_bits >> 16); + + const uint32_t exp = (float_bits >> 23) & 0xFF; + const uint32_t mant = float_bits & 0x7FFFFF; + if (exp == 0xFF && mant != 0) { + bfloat16_bits |= 0x01; + } + + return bfloat16_bits; +} + +size_t GetDataItemSize(aclDataType dtype) +{ + switch (dtype) { + case ACL_DT_UNDEFINED: + return sizeof(bool); + case ACL_FLOAT16: + return sizeof(uint16_t); + case ACL_BF16: + return sizeof(uint16_t); + default: + return 0; + } +} + +static std::mt19937 gen(0); + +template T random_float(float min, float max) +{ + std::uniform_real_distribution dist(min, max); + return dist(gen); +} + +atb::Tensor FillTensorDataRandomly(const atb::TensorDesc &desc, float range_min, float range_max) +{ + atb::Tensor tensor{desc, nullptr, nullptr, 0}; + tensor.dataSize = atb::Utils::GetTensorSize(desc); + aclrtMallocHost((void **)&tensor.hostData, tensor.dataSize); + { + size_t dataItemSize = GetDataItemSize(desc.dtype); + uint64_t tensorNumel = atb::Utils::GetTensorNumel(desc); + void *basePtr = static_cast(tensor.hostData); + for (uint64_t i = 0; i < tensorNumel; ++i) { + void *elementPtr = static_cast(basePtr) + i * dataItemSize; + switch (desc.dtype) { + case ACL_FLOAT16: + *static_cast(elementPtr) = FloatToFloat16(random_float(range_min, range_max)); + break; + case ACL_BF16: + *static_cast(elementPtr) = FloatToBfloat16(random_float(range_min, range_max)); + break; + default: + break; + } + } + } + aclrtMalloc((void **)&tensor.deviceData, tensor.dataSize, ACL_MEM_MALLOC_HUGE_FIRST); + aclrtMemcpy(tensor.deviceData, tensor.dataSize, tensor.hostData, tensor.dataSize, ACL_MEMCPY_HOST_TO_DEVICE); + + return tensor; +} + +atb::Status saveTensor(atb::Tensor tensor, 
std::string path) +{ + if (tensor.deviceData == nullptr) {} + void *hostData = nullptr; + aclrtMallocHost((void **)&hostData, tensor.dataSize); + aclrtMemcpy(hostData, tensor.dataSize, tensor.deviceData, tensor.dataSize, ACL_MEMCPY_DEVICE_TO_HOST); + std::ofstream file(path, std::ios::binary); + file.write(static_cast(hostData), tensor.dataSize); + file.close(); + aclrtFreeHost(hostData); + return atb::ErrorType::NO_ERROR; +} + +atb::Status ExcuteImpl(atb::Operation *op, atb::VariantPack variantPack, atb::Context *context, aclrtStream &stream) +{ + uint64_t workspaceSize = 0; + CHECK_STATUS(op->Setup(variantPack, workspaceSize, context)); + void *workspace = nullptr; + if (workspaceSize > 0) { + CHECK_STATUS(aclrtMalloc(&workspace, workspaceSize, ACL_MEM_MALLOC_HUGE_FIRST)); + } + CHECK_STATUS(op->Execute(variantPack, (uint8_t *)workspace, workspaceSize, context)); + CHECK_STATUS(aclrtSynchronizeStream(stream)); // 流同步,等待device侧任务计算完成 + + if (workspace) { + CHECK_STATUS(aclrtFree(workspace)); // 销毁workspace + } + return atb::ErrorType::NO_ERROR; +} + +atb::Status LinearParallelOneThread(int rank, int rankSize) +{ + int deviceId = rank; + CHECK_STATUS(aclrtSetDevice(deviceId)); + atb::Context *context = nullptr; + CHECK_STATUS(atb::CreateContext(&context)); + aclrtStream stream = nullptr; + CHECK_STATUS(aclrtCreateStream(&stream)); + context->SetExecuteStream(stream); + + atb::TensorDesc inputTensorDesc{ + .dtype = DATA_TYPE, .format = aclFormat::ACL_FORMAT_ND, .shape{.dims = {M, K}, .dimNum = 2}}; + atb::Tensor input = FillTensorDataRandomly(inputTensorDesc, -10, 10); + + atb::TensorDesc weightTensorDesc{ + .dtype = DATA_TYPE, .format = aclFormat::ACL_FORMAT_ND, .shape{.dims = {K, N}, .dimNum = 2}}; + atb::Tensor weight = FillTensorDataRandomly(weightTensorDesc, -10, 10); + + atb::Tensor output; + output.desc.dtype = DATA_TYPE; + output.desc.format = ACL_FORMAT_ND; + output.desc.shape.dimNum = 2; + output.desc.shape.dims[0] = M / DEV_NUM; + 
output.desc.shape.dims[1] = N; + output.dataSize = atb::Utils::GetTensorSize(output); + CHECK_STATUS(aclrtMalloc(&output.deviceData, output.dataSize, ACL_MEM_MALLOC_HUGE_FIRST)); + + atb::infer::LinearParallelParam param; + param.transWeight = false; + param.rank = rank; + param.rankRoot = 0; + param.commMode = atb::infer::CommMode::COMM_MULTI_THREAD; + param.rankSize = rankSize; + param.backend = "mc2"; + param.type = atb::infer::LinearParallelParam::ParallelType::LINEAR_REDUCE_SCATTER; + atb::Operation *op = nullptr; + CHECK_STATUS(atb::CreateOperation(param, &op)); + + atb::VariantPack variantPack; + variantPack.inTensors = {input, weight}; + variantPack.outTensors = {output}; + ExcuteImpl(op, variantPack, context, stream); + std::cout << "rank: " << rank << " executed END." << std::endl; + saveTensor(input, "rank" + std::to_string(rank) + "_inTensor0.bin"); + saveTensor(weight, "rank" + std::to_string(rank) + "_inTensor1.bin"); + saveTensor(output, "rank" + std::to_string(rank) + "_outTensor0.bin"); + // 资源释放 + CHECK_STATUS(atb::DestroyOperation(op)); // 销毁op对象 + CHECK_STATUS(aclrtDestroyStream(stream)); // 销毁stream + CHECK_STATUS(atb::DestroyContext(context)); // 销毁context + return atb::ErrorType::NO_ERROR; +} + +int main(int argc, const char *argv[]) +{ + int ret = aclInit(nullptr); + + std::vector> threads(DEV_NUM); + for (size_t i = 0; i < DEV_NUM; i++) { + threads[i].reset(new (std::nothrow) std::thread(LinearParallelOneThread, i, DEV_NUM)); + } + for (size_t i = 0; i < DEV_NUM; ++i) { + threads[i]->join(); + } + + CHECK_STATUS(aclFinalize()); + return 0; +} diff --git a/tests/apitest/opstest/cpp/linear_parallel/mc2_linear_reduce_scatter/linear_parallel_mc2_linear_reduce_scatter.py b/tests/apitest/opstest/cpp/linear_parallel/mc2_linear_reduce_scatter/linear_parallel_mc2_linear_reduce_scatter.py new file mode 100644 index 00000000..cf0470d9 --- /dev/null +++ 
b/tests/apitest/opstest/cpp/linear_parallel/mc2_linear_reduce_scatter/linear_parallel_mc2_linear_reduce_scatter.py @@ -0,0 +1,112 @@ +# +# Copyright (c) 2025 Huawei Technologies Co., Ltd. +# This file is a part of the CANN Open Software. +# Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. +# + +import builtins +import os +import json +import unittest +import sys +import numpy as np +import torch +import torch_npu +import torch.multiprocessing as mp + +sys.path.append(os.path.join(os.path.dirname(__file__), "../../")) +from precision_calcu import * + +sys.path.append(os.path.join(os.path.dirname(__file__), "../../../python/operations/")) +import operation_test # NOQA: E402 + +sys.path.append(os.path.join(os.path.dirname(__file__), "../../../python/")) + +ATB_HOME_PATH = os.environ.get("ATB_HOME_PATH") +if ATB_HOME_PATH is None: + raise RuntimeError( + "env ATB_HOME_PATH not exist, source set_env.sh") +LIBTORCH_PATH = os.path.join(ATB_HOME_PATH, "lib/libatb_test_framework.so") +LIB_PATH = os.path.join(ATB_HOME_PATH, "lib/libatb.so") +torch.classes.load_library(LIBTORCH_PATH) + +DEV_NUM = 2 + +M = 2 +K = 256 +N = 2 + +DATA_TYPE = torch.float16 + +def load_tensor(data_size,data_type,data_path): + with open(data_path, 'rb') as f: + data=f.read() + if data_type == torch.float16: + np_data = np.frombuffer(data, dtype=np.float16).copy() + tensor = torch.from_numpy(np_data) + elif data_type == torch.bfloat16: + tensor = torch.frombuffer(bytearray(data), dtype=torch.bfloat16) + else: + tensor = torch.zeros(data_size) + + tensor = 
tensor.view(data_size) + + return tensor + + +def main_worker(rank, world_size, data_type, data_size): + torch_npu.npu.set_device(rank) + print(f'Process {rank} started, using device npu:{rank}.') + golden_out_tensor_high = None + golden_out_tensor_low = None + + for i in range(world_size): + input_tensor = load_tensor(data_size=data_size[0],data_type=data_type,data_path=f"rank{i}_inTensor{0}.bin") + weight_tensor = load_tensor(data_size=data_size[1],data_type=data_type,data_path=f"rank{i}_inTensor{1}.bin") + out_single_tensor = torch.matmul(input_tensor.to(torch.float), weight_tensor.to(torch.float)) + if golden_out_tensor_high is None: + golden_out_tensor_high = out_single_tensor.clone() + golden_out_tensor_low = out_single_tensor.clone().to(data_type) + in_tensors_desc = [input_tensor.shape, weight_tensor.shape] + else: + golden_out_tensor_high = torch.add(golden_out_tensor_high,out_single_tensor) + golden_out_tensor_low = torch.add(golden_out_tensor_low,out_single_tensor.to(data_type)) + chunks_size = int(data_size[0][0] // world_size) + chunks_high = torch.split(golden_out_tensor_high, chunks_size) + chunks_low = torch.split(golden_out_tensor_low, chunks_size) + golden_result_high = chunks_high[rank] + golden_result_low = chunks_low[rank] + + acl_out_tensor = load_tensor(data_size=data_size[2],data_type=data_type,data_path=f"rank{rank}_outTensor{0}.bin") + + assert check_precision_new(in_tensors_desc, acl_out_tensor.float(), golden_result_high.float(), golden_result_low.float(), rank) + +def check_precision_new(in_tensors_desc, out_tensor, golden_out_tensor_high, golden_out_tensor_low, rank): + if rank == 0: + print(in_tensors_desc) + print(out_tensor) + result_double = compare_cv(golden_out_tensor_high, golden_out_tensor_low, out_tensor) + return result_double + +class LinearParallelCoverOperationTest(operation_test.OperationTest): + + def test_linear_parallel(self): + if not operation_test.get_soc_version() == 'Ascend910B': + return + print(f"———————— 
LinearParallelCoverOp test start ————————") + + world_size = DEV_NUM + + data_type = DATA_TYPE + + data_size = [[M, K], [K, N], [M // DEV_NUM, N]] + + mp.spawn(main_worker, nprocs=world_size, args=(world_size, data_type, data_size)) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/apitest/opstest/cpp/linear_parallel/mc2_linear_reduce_scatter/run_test.sh b/tests/apitest/opstest/cpp/linear_parallel/mc2_linear_reduce_scatter/run_test.sh new file mode 100644 index 00000000..6597d672 --- /dev/null +++ b/tests/apitest/opstest/cpp/linear_parallel/mc2_linear_reduce_scatter/run_test.sh @@ -0,0 +1,94 @@ +#!/bin/bash +# +# Copyright (c) 2025 Huawei Technologies Co., Ltd. +# This file is a part of the CANN Open Software. +# Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. 
+# + +handle_error(){ + rm -rf linear_parallel_generation + rm -rf *.bin + + cd $current_dir +} + +trap handle_error ERR + +set -e + +current_dir=$(pwd) + +cd "$(dirname "$0")" + +cxx_abi=$(python3 -c ' +try: + import torch + print("1" if torch.compiled_with_cxx11_abi() else "0") +except ImportError: + print("1") +') + +echo "Using cxx_abi=$cxx_abi" + +DEV_NUM=2 + +M=2 +K=256 +N=2 + +DATA_TYPE="FLOAT16" + +DATA_TYPE_CPP="" +DATA_TYPE_PY="" + +case ${DATA_TYPE} in + FLOAT16) + DATA_TYPE_CPP="aclDataType::ACL_FLOAT16" + DATA_TYPE_PY="torch.float16" + ;; + BF16) + DATA_TYPE_CPP="aclDataType::ACL_BF16" + DATA_TYPE_PY="torch.bfloat16" + ;; + *) + DATA_TYPE_CPP="" + DATA_TYPE_PY="" + ;; +esac + +# 修改 DEV_NUM 的值 +sed -i "s/const int32_t DEV_NUM = .*;/const int32_t DEV_NUM = ${DEV_NUM};/" linear_parallel_generation.cpp +# 修改 M 的值 +sed -i "s/const int32_t M = .*;/const int32_t M = ${M};/" linear_parallel_generation.cpp +# 修改 K 的值 +sed -i "s/const int32_t K = .*;/const int32_t K = ${K};/" linear_parallel_generation.cpp +# 修改 N 的值 +sed -i "s/const int32_t N = .*;/const int32_t N = ${N};/" linear_parallel_generation.cpp +# 修改 DATA_TYPE 的值 +sed -i "s/const aclDataType DATA_TYPE = .*;/const aclDataType DATA_TYPE = ${DATA_TYPE_CPP};/" linear_parallel_generation.cpp + +# 修改 DEV_NUM 的值 +sed -i "s/DEV_NUM = .*/DEV_NUM = ${DEV_NUM}/" linear_parallel_mc2_linear_reduce_scatter.py +# 修改 M 的值 +sed -i "s/M = .*/M = ${M}/" linear_parallel_mc2_linear_reduce_scatter.py +# 修改 K 的值 +sed -i "s/K = .*/K = ${K}/" linear_parallel_mc2_linear_reduce_scatter.py +# 修改 N 的值 +sed -i "s/N = .*/N = ${N}/" linear_parallel_mc2_linear_reduce_scatter.py +# 修改 DATA_TYPE 的值 +sed -i "s/DATA_TYPE = .*/DATA_TYPE = ${DATA_TYPE_PY}/" linear_parallel_mc2_linear_reduce_scatter.py + +g++ -D_GLIBCXX_USE_CXX11_ABI=$cxx_abi -I "${ATB_HOME_PATH}/include" -I "${ASCEND_HOME_PATH}/include" -L "${ATB_HOME_PATH}/lib" -L "${ASCEND_HOME_PATH}/lib64" \ +linear_parallel_generation.cpp -l atb -l ascendcl -l hccl -l nnopbase -l opapi 
-o linear_parallel_generation +./linear_parallel_generation + +python linear_parallel_mc2_linear_reduce_scatter.py + +rm -rf linear_parallel_generation +rm -rf *.bin + +cd $current_dir \ No newline at end of file diff --git a/tests/apitest/opstest/cpp/precision_calcu.py b/tests/apitest/opstest/cpp/precision_calcu.py new file mode 100644 index 00000000..7d67ad58 --- /dev/null +++ b/tests/apitest/opstest/cpp/precision_calcu.py @@ -0,0 +1,147 @@ +# +# Copyright (c) 2025 Huawei Technologies Co., Ltd. +# This file is a part of the CANN Open Software. +# Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. 
+# + +import os +import unittest +import logging +import json +import re +import numpy +import torch +import torch_npu +import math +import sys +import shutil +from enum import Enum + +MIN_ERR = 1e-7 +class OpTypes(Enum): + NA = 0 # new standard is not available + MOVE = 1 + RAND = 2 + CAST = 3 + COMPUTE_INTEGER = 4 + COMPUTE_QUANT = 5 + COMPUTE_FLOAT = 6 + COMPUTE_FLOAT_HIGH_PRECISION = 7 + VECTOR_FUSION = 8 + CV_FUSION = 9 + +dtype_dict = {"float": torch.float32, "float16": torch.float16, "int8": torch.int8, "int32": torch.int32, "uint8": torch.uint8, + "int16": torch.int16, "uint16": torch.int16, "uint32": torch.int32, "int64": torch.int64, "uint64": torch.int64, + "double": torch.double, "bool": torch.bool, "complex64": torch.complex64, "complex128": torch.complex128, "bf16": torch.bfloat16} + +def get_eb_threshold(dtype:torch.dtype): + eb_threshold = 0 + if dtype in [torch.bfloat16]: + eb_threshold = 2**(-7) + if dtype in [torch.float16]: + eb_threshold = 2**(-10) + if dtype in [torch.float32]: + eb_threshold = 2**(-14) + return eb_threshold + +def get_err_threshold(op_type:OpTypes, dtype:torch.dtype): + err_threshold = 0 + if op_type in [OpTypes.MOVE, OpTypes.RAND, OpTypes.CAST, OpTypes.COMPUTE_INTEGER]: + pass + if op_type in [OpTypes.COMPUTE_QUANT, OpTypes.COMPUTE_FLOAT]: + if dtype in [torch.bfloat16]: + err_threshold = 2**(-7) + if dtype in [torch.float16]: + err_threshold = 2**(-8) + if dtype in [torch.float32]: + err_threshold = 2**(-11) + if op_type in [OpTypes.CV_FUSION]: + if dtype in [torch.bfloat16]: + err_threshold = 2**(-8) + if dtype in [torch.float16]: + err_threshold = 2**(-11) + if dtype in [torch.float32]: + err_threshold = 2**(-14) + return err_threshold + + +#误差均衡性(EB) +def get_eb(golden:torch.Tensor, actual:torch.Tensor): + golden = golden.to(torch.float32) + golden_nmax = torch.clamp(torch.abs(golden), min = 1) + actual_error = actual.to(torch.float32) - golden + EB = torch.mean(actual_error / golden_nmax) + return EB + 
+#单标杆、浮点比对方法|actual - expected| <= err × max(1, | expected |) +def ref_compare(golden:torch.Tensor, actual:torch.Tensor, err): + golden = golden.to(torch.float32) + golden_nmax = torch.clamp(torch.abs(golden), min = 1) + abs_error = torch.abs(actual.to(torch.float32) - golden) + result = (abs_error <= err * golden_nmax).all() + logging.info(f"new golden result:{result}") + return result + + +#最大相对误差:max relative error,MARE +def get_mare(golden:torch.Tensor, actual:torch.Tensor): + golden = golden.to(torch.float32) + abs_error = torch.abs(actual.to(torch.float32) - golden) / (torch.abs(golden) + MIN_ERR) + mare = torch.max(abs_error.flatten()) + return mare + +#平均相对误差:mean relative error,MERE +def get_mere(golden:torch.Tensor, actual:torch.Tensor): + golden = golden.to(torch.float32) + abs_error = torch.abs(actual.to(torch.float32) - golden) / (torch.abs(golden) + MIN_ERR) + mere = torch.mean(abs_error) + return mere + +#均方根误差:Root Mean Squared Error,RMSE +def get_rmse(golden:torch.Tensor, actual:torch.Tensor): + golden = golden.to(torch.float32) + sqr_err = torch.pow((actual.to(torch.float32) - golden), 2) + rmse = torch.sqrt(torch.mean(sqr_err)) + return rmse + +def compare_cv(golden:torch.Tensor, gpu:torch.Tensor, actual:torch.Tensor): + op_type = OpTypes.CV_FUSION + judge_threshold = 522 + eb_threshold = get_eb_threshold(actual.dtype) + err_threshold = get_err_threshold(op_type, actual.dtype) + logging.info(f"err_threshold:{err_threshold} eb_threshold:{eb_threshold}") + mare_npu = get_mare(golden, actual) + mare_gpu = get_mare(golden, gpu) + + mere_npu = get_mere(golden, actual) + mere_gpu = get_mere(golden, gpu) + + rmse_npu = get_rmse(golden, actual) + rmse_gpu = get_rmse(golden, gpu) + + mare_rate = mare_npu / max(mare_gpu, err_threshold) + mere_rate = mere_npu / max(mere_gpu, err_threshold) + rmse_rate = rmse_npu / max(rmse_gpu, err_threshold) + + EB = get_eb(gpu, actual) + result = (mare_rate < 10) and (mere_rate < 2) and (rmse_rate < 2) and (EB < 
eb_threshold) + + print(f"eb_threshold:{eb_threshold} err_threshold:{err_threshold}") + print(f"mere_npu:{mere_npu} mere_gpu:{mere_gpu}") + print(f"rmse_npu:{rmse_npu} rmse_gpu:{rmse_gpu}") + print(f"MARE:{mare_rate} MERE:{mere_rate} RMSE:{rmse_rate} EB:{EB}") + print(f"new golden cv result:{result}") + return result + +if __name__ == '__main__': + logging.basicConfig(level=logging.INFO, + format='%(asctime)s - %(levelname)s - %(message)s') + gloden = torch.rand((128,128), dtype=torch.float32) + actual = gloden.to(torch.float16) + gpu = actual + compare_cv(gloden, gpu, actual) + \ No newline at end of file diff --git a/tests/apitest/opstest/csv/linear_parallel.csv b/tests/apitest/opstest/csv/linear_parallel.csv index ae51e1c4..6e3efc07 100644 --- a/tests/apitest/opstest/csv/linear_parallel.csv +++ b/tests/apitest/opstest/csv/linear_parallel.csv @@ -103,3 +103,12 @@ CaseNum|CaseName|OpName|OpParam|InNum|InDType|InFormat|InShape|OutNum|OutDType|O 102|SErrorGroupMatmulReduceScatterAlltoallvc|LinearParallelOperation|{"transWeight":false,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc","type":6,"quantType":-1,"moeInfo":{"epSize":2,"tpSize":1,"localExpertNums":16}}|7|int8;int8;int32;int64;float;int32;int32|nd;nd;nd;nd;nd;nd;nd|1024,1024;16,1024,1024;1024;1024;1024;2,32;32768|1|float16|nd|32768,1024|random;random;random;random;random;random;random|-5,5;-5,5;-10,10;-10,10;-1,1;-10,10;-10,10||||||Ascend910B|I:ERROR_INVALID_IN_TENSOR_NUM 103|SErrorGroupMatmulReduceScatterAlltoallvc|LinearParallelOperation|{"transWeight":false,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc","type":6,"quantType":-1,"moeInfo":{"epSize":3,"tpSize":1,"localExpertNums":16}}|4|float16;float16;int32;int32|nd;nd;nd;nd|1024,1024;2,32;32768|1|float16|nd|32768,1024|random;random;random;random|-1,1;-1,1;-10,10;-10,10||||||Ascend910B|C:ERROR_INVALID_PARAM 
104|PureMatmulW8A8Fp16_3_float|LinearParallelOperation|{"transWeight":false,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc","type":3,"quantType":0,"outDataType":1}|4|int8;int8;int32;float|nd;nd;nd;nd|2,4;4,4;1;1|1|float16|nd|2,4|random;random;random;random|-5,5;-5,5;-10,10;1,2||||||Ascend910B|I:ERROR_INVALID_TENSOR_INI_MATCH +105|ErrorCase0LinearReduceScatter|LinearParallelOperation|{"transWeight":true,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":true,"backend":"mc2","type":0,"commMode":1}|2|float16;float16|nd;nd|2,16;32,16|1|float16|nd|1,32|customize;customize|-1,1;-1,1||||||Ascend910B|C:ERROR_INVALID_PARAM +106|ErrorCase1LinearReduceScatter|LinearParallelOperation|{"transWeight":true,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"mc2","type":0,"commMode":1}|2|float16;float16|nd;nd|2,16;32,16|1|float16|nd|1,32|customize;customize|-1,1;-1,1||||||Ascend910B|C:ERROR_INVALID_PARAM +107|ErrorCase2LinearReduceScatter|LinearParallelOperation|{"transWeight":true,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"mc2","type":1,"commMode":0}|2|float16;float16|nd;nd|2,16;32,16|1|float16|nd|1,32|customize;customize|-1,1;-1,1||||||Ascend910B|C:ERROR_INVALID_PARAM +108|ErrorCase3LinearReduceScatter|LinearParallelOperation|{"transWeight":true,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"mc2","type":1,"commMode":1,"quantType":0}|2|float16;float16|nd;nd|2,256;32,256|1|float16|nd|1,32|customize;customize|-1,1;-1,1||||||Ascend910B|C:ERROR_INVALID_PARAM +109|ErrorCase4LinearReduceScatter|LinearParallelOperation|{"transWeight":true,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"mc2","type":1,"commMode":1,"quantType":-1}|2|float16;float16|nd;nd|2,255;32,255|1|float16|nd|1,32|customize;customize|-1,1;-1,1||||||Ascend910B|I:ERROR_INVALID_TENSOR_DIM 
+110|ErrorCase5LinearReduceScatter|LinearParallelOperation|{"transWeight":true,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"mc2","type":1,"commMode":1,"quantType":-1}|2|float16;float16|nd;nd|2,65535;32,65535|1|float16|nd|1,32|customize;customize|-1,1;-1,1||||||Ascend910B|I:ERROR_INVALID_TENSOR_DIM +111|ErrorCase6LinearReduceScatter|LinearParallelOperation|{"transWeight":true,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"mc2","type":1,"commMode":1,"quantType":-1}|2|float16;float16|nd;nd|1,2,256;32,256|1|float16|nd|1,32|customize;customize|-1,1;-1,1||||||Ascend910B|I:ERROR_INVALID_TENSOR_DIM_NUM +112|ErrorCase7LinearReduceScatter|LinearParallelOperation|{"transWeight":true,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"mc2","type":1,"commMode":1,"quantType":-1}|2|float16;float16|nd;nd|2,256;1,32,256|1|float16|nd|1,32|customize;customize|-1,1;-1,1||||||Ascend910B|I:ERROR_INVALID_TENSOR_DIM_NUM +112|ErrorCase7LinearReduceScatter|LinearParallelOperation|{"transWeight":true,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"mc2","type":1,"commMode":1,"quantType":-1}|2|float16;float16|nd;fractal_nz|2,256;1,32,256|1|float16|nd|1,32|customize;customize|-1,1;-1,1||||||Ascend910B|I:ERROR_INVALID_TENSOR_INI_MATCH -- Gitee From f08c76f6643e9c832ec0315ff12826549f7f9fb2 Mon Sep 17 00:00:00 2001 From: zouyanlong Date: Tue, 23 Sep 2025 17:45:55 +0800 Subject: [PATCH 76/94] fix --- tests/apitest/opstest/csv/linear_parallel.csv | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/apitest/opstest/csv/linear_parallel.csv b/tests/apitest/opstest/csv/linear_parallel.csv index 6e3efc07..5ac11486 100644 --- a/tests/apitest/opstest/csv/linear_parallel.csv +++ b/tests/apitest/opstest/csv/linear_parallel.csv @@ -111,4 +111,3 @@ CaseNum|CaseName|OpName|OpParam|InNum|InDType|InFormat|InShape|OutNum|OutDType|O 
110|ErrorCase5LinearReduceScatter|LinearParallelOperation|{"transWeight":true,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"mc2","type":1,"commMode":1,"quantType":-1}|2|float16;float16|nd;nd|2,65535;32,65535|1|float16|nd|1,32|customize;customize|-1,1;-1,1||||||Ascend910B|I:ERROR_INVALID_TENSOR_DIM 111|ErrorCase6LinearReduceScatter|LinearParallelOperation|{"transWeight":true,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"mc2","type":1,"commMode":1,"quantType":-1}|2|float16;float16|nd;nd|1,2,256;32,256|1|float16|nd|1,32|customize;customize|-1,1;-1,1||||||Ascend910B|I:ERROR_INVALID_TENSOR_DIM_NUM 112|ErrorCase7LinearReduceScatter|LinearParallelOperation|{"transWeight":true,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"mc2","type":1,"commMode":1,"quantType":-1}|2|float16;float16|nd;nd|2,256;1,32,256|1|float16|nd|1,32|customize;customize|-1,1;-1,1||||||Ascend910B|I:ERROR_INVALID_TENSOR_DIM_NUM -112|ErrorCase7LinearReduceScatter|LinearParallelOperation|{"transWeight":true,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"mc2","type":1,"commMode":1,"quantType":-1}|2|float16;float16|nd;fractal_nz|2,256;1,32,256|1|float16|nd|1,32|customize;customize|-1,1;-1,1||||||Ascend910B|I:ERROR_INVALID_TENSOR_INI_MATCH -- Gitee From 00981e831305e57ebf409f84ec190ba86fa32520 Mon Sep 17 00:00:00 2001 From: zouyanlong Date: Tue, 23 Sep 2025 18:58:41 +0800 Subject: [PATCH 77/94] fix --- .../mc2_linear_reduce_scatter/run_test.sh | 20 +++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/tests/apitest/opstest/cpp/linear_parallel/mc2_linear_reduce_scatter/run_test.sh b/tests/apitest/opstest/cpp/linear_parallel/mc2_linear_reduce_scatter/run_test.sh index 6597d672..e1e5f6f8 100644 --- a/tests/apitest/opstest/cpp/linear_parallel/mc2_linear_reduce_scatter/run_test.sh +++ b/tests/apitest/opstest/cpp/linear_parallel/mc2_linear_reduce_scatter/run_test.sh @@ -61,26 +61,26 @@ case ${DATA_TYPE} in 
esac # 修改 DEV_NUM 的值 -sed -i "s/const int32_t DEV_NUM = .*;/const int32_t DEV_NUM = ${DEV_NUM};/" linear_parallel_generation.cpp +sed -i "s/^const int32_t DEV_NUM = .*;/const int32_t DEV_NUM = ${DEV_NUM};/" linear_parallel_generation.cpp # 修改 M 的值 -sed -i "s/const int32_t M = .*;/const int32_t M = ${M};/" linear_parallel_generation.cpp +sed -i "s/^const int32_t M = .*;/const int32_t M = ${M};/" linear_parallel_generation.cpp # 修改 K 的值 -sed -i "s/const int32_t K = .*;/const int32_t K = ${K};/" linear_parallel_generation.cpp +sed -i "s/^const int32_t K = .*;/const int32_t K = ${K};/" linear_parallel_generation.cpp # 修改 N 的值 -sed -i "s/const int32_t N = .*;/const int32_t N = ${N};/" linear_parallel_generation.cpp +sed -i "s/^const int32_t N = .*;/const int32_t N = ${N};/" linear_parallel_generation.cpp # 修改 DATA_TYPE 的值 -sed -i "s/const aclDataType DATA_TYPE = .*;/const aclDataType DATA_TYPE = ${DATA_TYPE_CPP};/" linear_parallel_generation.cpp +sed -i "s/^const aclDataType DATA_TYPE = .*;/const aclDataType DATA_TYPE = ${DATA_TYPE_CPP};/" linear_parallel_generation.cpp # 修改 DEV_NUM 的值 -sed -i "s/DEV_NUM = .*/DEV_NUM = ${DEV_NUM}/" linear_parallel_mc2_linear_reduce_scatter.py +sed -i "s/^DEV_NUM = .*/DEV_NUM = ${DEV_NUM}/" linear_parallel_mc2_linear_reduce_scatter.py # 修改 M 的值 -sed -i "s/M = .*/M = ${M}/" linear_parallel_mc2_linear_reduce_scatter.py +sed -i "s/^M = .*/M = ${M}/" linear_parallel_mc2_linear_reduce_scatter.py # 修改 K 的值 -sed -i "s/K = .*/K = ${K}/" linear_parallel_mc2_linear_reduce_scatter.py +sed -i "s/^K = .*/K = ${K}/" linear_parallel_mc2_linear_reduce_scatter.py # 修改 N 的值 -sed -i "s/N = .*/N = ${N}/" linear_parallel_mc2_linear_reduce_scatter.py +sed -i "s/^N = .*/N = ${N}/" linear_parallel_mc2_linear_reduce_scatter.py # 修改 DATA_TYPE 的值 -sed -i "s/DATA_TYPE = .*/DATA_TYPE = ${DATA_TYPE_PY}/" linear_parallel_mc2_linear_reduce_scatter.py +sed -i "s/^DATA_TYPE = .*/DATA_TYPE = ${DATA_TYPE_PY}/" linear_parallel_mc2_linear_reduce_scatter.py g++ 
-D_GLIBCXX_USE_CXX11_ABI=$cxx_abi -I "${ATB_HOME_PATH}/include" -I "${ASCEND_HOME_PATH}/include" -L "${ATB_HOME_PATH}/lib" -L "${ASCEND_HOME_PATH}/lib64" \ linear_parallel_generation.cpp -l atb -l ascendcl -l hccl -l nnopbase -l opapi -o linear_parallel_generation -- Gitee From fc6d7ef058e5973dfeb72b3673d36b239c6bee51 Mon Sep 17 00:00:00 2001 From: guanguan Date: Tue, 23 Sep 2025 19:17:38 +0800 Subject: [PATCH 78/94] fix --- .../linear_parallel_operation.cpp | 38 ------------------- .../linear_parallel_operation.h | 1 - 2 files changed, 39 deletions(-) diff --git a/src/ops_infer/linear_parallel/linear_parallel_operation.cpp b/src/ops_infer/linear_parallel/linear_parallel_operation.cpp index 810c8d75..a4396794 100644 --- a/src/ops_infer/linear_parallel/linear_parallel_operation.cpp +++ b/src/ops_infer/linear_parallel/linear_parallel_operation.cpp @@ -37,7 +37,6 @@ static const uint32_t RESIDUAL_TENSOR_INDEX_3 = 3; static const uint32_t RESIDUAL_TENSOR_INDEX_4 = 4; static const uint32_t MAX_OUTPUT_SIZE = 204800; static const uint32_t MAX_K = 24000; -static const uint32_t DIM_4 = 4; static bool AllToAllvcAllGatherGmmOutTensorCheck(const SVector &inTensorDescs, const TensorDesc &outTensorDesc, const std::string &logPrefix) @@ -379,26 +378,9 @@ Status LinearParallelOperation::CheckResidual(const SVector &inTenso return NO_ERROR; } -Status LinearParallelOperation::CheckWeightNzFormat(const SVector &inTensorDescs) const -{ - const TensorDesc &weight = inTensorDescs.at(1); - bool weightNz = (weight.format == ACL_FORMAT_FRACTAL_NZ); - if (weightNz) { - if (weight.shape.dimNum != DIM_4) { - ATB_LOG(ERROR) << GetLogPrefix() << "fractal_nz shape dim should be 4. 
now is "<< weight.shape.dimNum; - return ERROR_INVALID_TENSOR_DIM; - } - } - return NO_ERROR; -} Status LinearParallelOperation::InferShapeCheckLinearAllReduce(const SVector &inTensorDescs) const { - Status st = CheckWeightNzFormat(inTensorDescs); - if (st != NO_ERROR) { - return st; - } - if (!OperationUtil::MatmulInTensorDescsCheck(inTensorDescs, GetLogPrefix(), commonCheckParam_)) { return ERROR_INVALID_TENSOR_DIM; } @@ -416,11 +398,6 @@ Status LinearParallelOperation::InferShapeCheckLinearAllReduce(const SVector &inTensorDescs) const { - Status st = CheckWeightNzFormat(inTensorDescs); - if (st != NO_ERROR) { - return st; - } - if (!OperationUtil::MatmulInTensorDescsCheck(inTensorDescs, GetLogPrefix(), commonCheckParam_)) { return ERROR_INVALID_TENSOR_DIM; } @@ -446,11 +423,6 @@ Status LinearParallelOperation::InferShapeCheckLinearReduceScatter(const SVector Status LinearParallelOperation::InferShapeCheckAllGatherLinear(const SVector &inTensorDescs) const { - Status st = CheckWeightNzFormat(inTensorDescs); - if (st != NO_ERROR) { - return st; - } - bool isQuant = param_.quantType > infer::LinearParallelParam::QuantType::QUANT_TYPE_UNQUANT && param_.quantType < infer::LinearParallelParam::QuantType::QUANT_TYPE_MAX; if (isQuant && inTensorDescs.at(3).dtype == ACL_FLOAT && param_.outDataType == ACL_FLOAT16) { @@ -472,11 +444,6 @@ Status LinearParallelOperation::InferShapeCheckAllGatherLinear(const SVector &inTensorDescs) const { - Status st = CheckWeightNzFormat(inTensorDescs); - if (st != NO_ERROR) { - return st; - } - if (param_.twoDimTPInfo.rsDim * param_.twoDimTPInfo.agDim != param_.rankSize) { ATB_LOG(ERROR) << "agDim * rsDim should equal to rankSize"; return ERROR_INVALID_PARAM; @@ -504,11 +471,6 @@ LinearParallelOperation::InferShapeCheckAllGatherLinearReduceScatter(const SVect Status LinearParallelOperation::InferShapeCheckAllToAllvcAllGatherGmm(const SVector &inTensorDescs) const { - Status st = CheckWeightNzFormat(inTensorDescs); - if (st != NO_ERROR) { - 
return st; - } - bool isQuant = param_.quantType > infer::LinearParallelParam::QuantType::QUANT_TYPE_UNQUANT && param_.quantType < infer::LinearParallelParam::QuantType::QUANT_TYPE_MAX; if (isQuant && inTensorDescs.at(2).dtype == ACL_FLOAT && param_.outDataType == ACL_FLOAT16) { diff --git a/src/ops_infer/linear_parallel/linear_parallel_operation.h b/src/ops_infer/linear_parallel/linear_parallel_operation.h index 2658ac3f..d9e17af5 100755 --- a/src/ops_infer/linear_parallel/linear_parallel_operation.h +++ b/src/ops_infer/linear_parallel/linear_parallel_operation.h @@ -38,7 +38,6 @@ private: Status InferShapeAllToAllvcAllGatherGmm(const SVector &inTensorDescs, SVector &outTensorDescs) const; Status CheckResidual(const SVector &inTensorDescs) const; - Status CheckWeightNzFormat(const SVector &inTensorDescs) const; Status InferShapeCheckLinearAllReduce(const SVector &inTensorDescs) const; Status InferShapeCheckLinearReduceScatter(const SVector &inTensorDescs) const; Status InferShapeCheckAllGatherLinear(const SVector &inTensorDescs) const; -- Gitee From b59fa3aae909b074942adb6e5729853c1d465e8a Mon Sep 17 00:00:00 2001 From: guanguan Date: Tue, 23 Sep 2025 19:31:59 +0800 Subject: [PATCH 79/94] fix --- src/ops_infer/linear_parallel/linear_parallel_lcoc_runner.cpp | 4 ++++ 1 file changed, 4 insertions(+) mode change 100644 => 100755 src/ops_infer/linear_parallel/linear_parallel_lcoc_runner.cpp diff --git a/src/ops_infer/linear_parallel/linear_parallel_lcoc_runner.cpp b/src/ops_infer/linear_parallel/linear_parallel_lcoc_runner.cpp old mode 100644 new mode 100755 index 3d3ebdc8..9d4ac7da --- a/src/ops_infer/linear_parallel/linear_parallel_lcoc_runner.cpp +++ b/src/ops_infer/linear_parallel/linear_parallel_lcoc_runner.cpp @@ -118,6 +118,10 @@ Status LinearParallelLcocRunner::SetupImpl(RunnerVariantPack &runnerVariantPack) ATB_LOG(ERROR) << GetLogPrefix() << "GetCoCDataTypeDesc failed."; return ERROR_INVALID_PARAM; } + if (mmInfo.transB && mmInfo.weightNz) { + ATB_LOG(ERROR) 
<< GetLogPrefix() << "transWeight and weightNz can not be true at the same time."; + return ERROR_INVALID_PARAM; + } Lcal::CoCParamDesc coCParamDesc{ .dataTypeDesc = dataTypeDesc, .mmInfo = mmInfo, -- Gitee From a96807388e40e9a663f9ced9d705906c698116df Mon Sep 17 00:00:00 2001 From: guanguan Date: Tue, 23 Sep 2025 19:55:17 +0800 Subject: [PATCH 80/94] fix --- tests/apitest/opstest/csv/linear_parallel.csv | 24 +++++++++---------- 1 file changed, 12 insertions(+), 12 deletions(-) mode change 100644 => 100755 tests/apitest/opstest/csv/linear_parallel.csv diff --git a/tests/apitest/opstest/csv/linear_parallel.csv b/tests/apitest/opstest/csv/linear_parallel.csv old mode 100644 new mode 100755 index ae51e1c4..1e0de978 --- a/tests/apitest/opstest/csv/linear_parallel.csv +++ b/tests/apitest/opstest/csv/linear_parallel.csv @@ -20,7 +20,7 @@ CaseNum|CaseName|OpName|OpParam|InNum|InDType|InFormat|InShape|OutNum|OutDType|O 19|IErrorIniMatchCase3|LinearParallelOperation|{"transWeight":true,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"hccl"}|2|float16;float16|nd;nhwc|2,16;32,16|0||||zero;zero|0,0;0,0|||||||I:ERROR_INVALID_TENSOR_INI_MATCH 20|IErrorIniMatchCase4|LinearParallelOperation|{"transWeight":true,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":true,"backend":"hccl"}|3|float16;float16;bool|nd;nd;nd|2,16;32,16;1,32|0||||zero;zero;zero|0,0;0,0;0,0|||||||I:ERROR_INVALID_TENSOR_INI_MATCH 21|IErrorIniMatchCase5|LinearParallelOperation|{"transWeight":true,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":true,"backend":"hccl"}|3|float16;float16;float16|nd;nd;nhwc|2,16;32,16;1,32|0||||zero;zero;zero|0,0;0,0;0,0|||||||I:ERROR_INVALID_TENSOR_INI_MATCH -22|LinearParallelLcocSupportWeightNz|LinearParallelOperation|{"transWeight":true,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc"}|2|float16;float16|nd;fractal_nz|2,16;32,16|1|float16|nd|2,32|customize;customize;customize|-1,1;-1,1;-2,2||||||Ascend910B|NO_ERROR 
+22|LinearParallelLcocSupportWeightNz|LinearParallelOperation|{"transWeight":false,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc"}|2|float16;float16|nd;fractal_nz|2,16;16,32|1|float16|nd|2,32|customize;customize;customize|-1,1;-1,1;-2,2||||||Ascend910B|NO_ERROR 23|IErrorDimCase0|LinearParallelOperation|{"transWeight":true,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"hccl"}|2|float16;float16|nd;nd|16;32,16|0||||zero;zero|0,0;0,0|||||||I:ERROR_INVALID_TENSOR_DIM 24|IErrorDimCase1|LinearParallelOperation|{"transWeight":true,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"hccl"}|2|float16;float16|nd;nd|2,16;32|0||||zero;zero|0,0;0,0|||||||I:ERROR_INVALID_TENSOR_DIM 25|LinearParallelHcclSupportWeightNz|LinearParallelOperation|{"transWeight":true,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"hccl"}|2|float16;float16|nd;fractal_nz|2,16;32,16|1|float16|nd|2,32|customize;customize;customize|-1,1;-1,1;-2,2||||||Ascend910B|NO_ERROR @@ -67,17 +67,17 @@ CaseNum|CaseName|OpName|OpParam|InNum|InDType|InFormat|InShape|OutNum|OutDType|O 66|PureMatmulW8A8Bf16PerChannel|LinearParallelOperation|{"transWeight":false,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc","type":3,"quantType":1,"outDataType":27}|4|int8;int8;int32;int64|nd;nd;nd;nd|2,4;4,4;4;4|1|bf16|nd|2,4|customize;customize;customize;customize|-5,5;-5,5;-10,10;1,2||||||Ascend910B|NO_ERROR 67|PureMatmulW8A8InvalidQuantType|LinearParallelOperation|{"transWeight":false,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc","type":3,"quantType":2,"outDataType":27}|4|int8;int8;int32;int64|nd;nd;nd;nd|2,4;4,4;4;4|1|bf16|nd|2,4|customize;customize;customize;customize|-5,5;-5,5;-10,10;1,2||||||Ascend910B|S:ERROR_INVALID_PARAM 
68|PureMatmulKeepIntermediateInValid|LinearParallelOperation|{"transWeight":false,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc","keepIntermediate":true,"type":3,"quantType":0,"outDataType":27}|4|int8;int8;int32;int64|nd;nd;nd;nd|2,4;4,4;1;1|1|bf16|nd|2,4|customize;customize;customize;customize|-5,5;-5,5;-10,10;1,2||||||Ascend910B|C:ERROR_INVALID_PARAM -69|MatmulAllReduceNzCase1|LinearParallelOperation|{"transWeight":true,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc"}|2|float16;float16|nd;fractal_nz|1,5120;16,5120|1|float16|nd|1,16|customize;customize|-1,1;-1,1||||||Ascend910B|NO_ERROR -70|MatmulAllReduceNzCase2|LinearParallelOperation|{"transWeight":true,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc"}|2|float16;float16|nd;fractal_nz|28,5,1024;8192,1024|1|float16|nd|28,5,8192|customize;customize|-1,1;-1,1||||||Ascend910B|NO_ERROR -71|MatmulAllReduceNzCase3|LinearParallelOperation|{"transWeight":true,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc"}|2|float16;float16|nd;fractal_nz|2,16;1,1,32,16|1|float16|nd|2,32|customize;customize|-1,1;-1,1||||||Ascend910B|NO_ERROR -72|MatmulAllReduceNzCase4|LinearParallelOperation|{"transWeight":true,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc"}|2|float16;float16|nd;fractal_nz|2,2,16;1,1,32,16|1|float16|nd|2,2,32|customize;customize|-1,1;-1,1||||||Ascend910B|NO_ERROR -73|MatmulAllReduceNzCase5|LinearParallelOperation|{"transWeight":true,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc"}|2|bf16;bf16|nd;fractal_nz|1,5120;16,5120|1|bf16|nd|1,16|customize;customize|-1,1;-1,1||||||Ascend910B|NO_ERROR -74|MatmulAllReduceNzCase6|LinearParallelOperation|{"transWeight":true,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc"}|2|bf16;bf16|nd;fractal_nz|2,2,16;1,1,32,16|1|bf16|nd|2,2,32|customize;customize|-1,1;-1,1||||||Ascend910B|NO_ERROR 
-75|PureMatmulNz|LinearParallelOperation|{"transWeight":true,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc","type": 3}|2|float16;float16|nd;fractal_nz|1,5120;16,5120|1|float16|nd|1,16|customize;customize|-1,1;-1,1||||||Ascend910B|NO_ERROR -76|PureMatmulNzBf16|LinearParallelOperation|{"transWeight":true,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc","type": 3}|2|bf16;bf16|nd;fractal_nz|1,5120;16,5120|1|bf16|nd|1,16|customize;customize|-1,1;-1,1||||||Ascend910B|NO_ERROR -77|MatmulReduceScatterNz|LinearParallelOperation|{"transWeight":true,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc","type":1}|2|float16;float16|nd;fractal_nz|2,16;32,16|1|float16|nd|1,32|customize;customize|-1,1;-1,1||||||Ascend910B|NO_ERROR -78|MatmulReduceScatterNzBf16|LinearParallelOperation|{"transWeight":true,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc","type":1}|2|bf16;bf16|nd;fractal_nz|2,16;32,16|1|bf16|nd|1,32|customize;customize|-1,1;-1,1||||||Ascend910B|NO_ERROR -79|AllGatherMatmulNz|LinearParallelOperation|{"transWeight":true,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc","type":2}|2|float16;float16|nd;fractal_nz|2,16;32,16|1|float16|nd|4,32|customize;customize|-1,1;-1,1||||||Ascend910B|NO_ERROR +69|MatmulAllReduceNzCase1|LinearParallelOperation|{"transWeight":false,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc"}|2|float16;float16|nd;fractal_nz|1,5120;5120,16|1|float16|nd|1,16|customize;customize|-1,1;-1,1||||||Ascend910B|NO_ERROR +70|MatmulAllReduceNzCase2|LinearParallelOperation|{"transWeight":false,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc"}|2|float16;float16|nd;fractal_nz|28,5,1024;1024,8192|1|float16|nd|28,5,8192|customize;customize|-1,1;-1,1||||||Ascend910B|NO_ERROR 
+71|MatmulAllReduceNzCase3|LinearParallelOperation|{"transWeight":false,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc"}|2|float16;float16|nd;fractal_nz|2,16;1,2,16,16|1|float16|nd|2,32|customize;customize|-1,1;-1,1||||||Ascend910B|NO_ERROR +72|MatmulAllReduceNzCase4|LinearParallelOperation|{"transWeight":false,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc"}|2|float16;float16|nd;fractal_nz|2,2,16;1,2,16,16|1|float16|nd|2,2,32|customize;customize|-1,1;-1,1||||||Ascend910B|NO_ERROR +73|MatmulAllReduceNzCase5|LinearParallelOperation|{"transWeight":false,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc"}|2|bf16;bf16|nd;fractal_nz|1,5120;5120,16|1|bf16|nd|1,16|customize;customize|-1,1;-1,1||||||Ascend910B|NO_ERROR +74|MatmulAllReduceNzCase6|LinearParallelOperation|{"transWeight":false,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc"}|2|bf16;bf16|nd;fractal_nz|2,2,16;1,2,16,16|1|bf16|nd|2,2,32|customize;customize|-1,1;-1,1||||||Ascend910B|NO_ERROR +75|PureMatmulNz|LinearParallelOperation|{"transWeight":false,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc","type": 3}|2|float16;float16|nd;fractal_nz|1,5120;5120,16|1|float16|nd|1,16|customize;customize|-1,1;-1,1||||||Ascend910B|NO_ERROR +76|PureMatmulNzBf16|LinearParallelOperation|{"transWeight":false,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc","type": 3}|2|bf16;bf16|nd;fractal_nz|1,5120;5120,16|1|bf16|nd|1,16|customize;customize|-1,1;-1,1||||||Ascend910B|NO_ERROR +77|MatmulReduceScatterNz|LinearParallelOperation|{"transWeight":false,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc","type":1}|2|float16;float16|nd;fractal_nz|2,16;16,32|1|float16|nd|1,32|customize;customize|-1,1;-1,1||||||Ascend910B|NO_ERROR 
+78|MatmulReduceScatterNzBf16|LinearParallelOperation|{"transWeight":false,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc","type":1}|2|bf16;bf16|nd;fractal_nz|2,16;16,32|1|bf16|nd|1,32|customize;customize|-1,1;-1,1||||||Ascend910B|NO_ERROR +79|AllGatherMatmulNz|LinearParallelOperation|{"transWeight":false,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc","type":2}|2|float16;float16|nd;fractal_nz|2,16;16,32|1|float16|nd|4,32|customize;customize|-1,1;-1,1||||||Ascend910B|NO_ERROR 80|LinearParallelBf16Error|LinearParallelOperation|{"transWeight":true,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"hccl"}|2|bf16;bf16|nd;nd|2,16;32,16|1|bf16|nd|1,32|customize;customize|-1,1;-1,1||||||Ascend310P|I:ERROR_INVALID_TENSOR_DTYPE 81|rsv|LinearParallelOperation|{"rank":0,"rankSize":2,"rsv":[1]}|0||||0||||||||||||C:ERROR_INVALID_PARAM 82|NoErrorCase0AllGatherLinearReduceScatter|LinearParallelOperation|{"transWeight":true,"rank":0,"rankSize":8,"rankRoot":0,"hasResidual":false,"backend":"lcoc","type":4,"twoDimTPInfo":{"agDim":4,"rsDim":2,"innerDimIsAg":1}}|2|float16;float16|nd;nd|2,16;32,16|1|float16|nd|4,32|customize;customize|-1,1;-1,1||||||Ascend910B|NO_ERROR -- Gitee From c570a872bad5928197dc871728312ed5456c4d2a Mon Sep 17 00:00:00 2001 From: huangxiaolan Date: Wed, 24 Sep 2025 09:35:01 +0800 Subject: [PATCH 81/94] fix include --- .../tbe_adapter/platform/tiling/platform/platform_ascendc.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/kernels/tbe_adapter/platform/tiling/platform/platform_ascendc.h b/src/kernels/tbe_adapter/platform/tiling/platform/platform_ascendc.h index 6fa0fcfd..8e2b0a53 100644 --- a/src/kernels/tbe_adapter/platform/tiling/platform/platform_ascendc.h +++ b/src/kernels/tbe_adapter/platform/tiling/platform/platform_ascendc.h @@ -18,7 +18,7 @@ #include #include -#include "stubs\include\metadef\inc\external\platform\platform_info.h" +#include 
"stubs/include/metadef/inc/external/platform/platform_info.h" #define ASCENDC_ASSERT(cond, behavior) \ do { \ -- Gitee From 3f578716360cc1f0606b241010d3dab6def84c8a Mon Sep 17 00:00:00 2001 From: x30073543 Date: Mon, 15 Sep 2025 11:31:52 +0800 Subject: [PATCH 82/94] null ptr check --- .../mixkernels/fusion/tiling/fusion_tiling.cpp | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/src/kernels/mixkernels/fusion/tiling/fusion_tiling.cpp b/src/kernels/mixkernels/fusion/tiling/fusion_tiling.cpp index 5145c511..a6acecd4 100644 --- a/src/kernels/mixkernels/fusion/tiling/fusion_tiling.cpp +++ b/src/kernels/mixkernels/fusion/tiling/fusion_tiling.cpp @@ -36,9 +36,12 @@ Status FusionTiling(const LaunchParam &launchParam, KernelInfo &kernelInfo) std::string tilingFuncName = inferWorkspaceFuncName + "tiling_func"; *(void **)(&tilingFunc) = dlsym(handle, tilingFuncName.c_str()); errorInfo = dlerror(); + if (errorInfo != nullptr || tilingFunc == nullptr) { + return Status::FailStatus(-1, "Get tilingFunc failed!"); + } KernelArgs *kernelArgs = new (std::nothrow) KernelArgs; - if (errorInfo != nullptr || tilingFunc == nullptr || kernelArgs == nullptr) { - return Status::FailStatus(-1, "Get tilingFunc or Malloc for binary params failed!"); + if (kernelArgs == nullptr) { + return Status::FailStatus(-1, "Malloc for binary params failed!"); } kernelArgs->tilingDevice = static_cast(tilingDataPtr); kernelArgs->tilingDeviceDup = kernelArgs->tilingDevice; @@ -48,10 +51,15 @@ Status FusionTiling(const LaunchParam &launchParam, KernelInfo &kernelInfo) MKI_LOG(INFO) << "now inferWorkspaceFuncName is" << inferWorkspaceFuncName; *(void **)(&inferworkspaceFunc) = dlsym(handle, inferWorkspaceFuncName.c_str()); errorInfo = dlerror(); + if (errorInfo != nullptr || inferworkspaceFunc == nullptr) { + delete kernelArgs; + return Status::FailStatus(-1, "Get workspaceFunc failed!"); + } KernelArgsForInferShapeWorkspaceWithTiling *wsWithTiling = new (std::nothrow) 
KernelArgsForInferShapeWorkspaceWithTiling; - if (errorInfo != nullptr || inferworkspaceFunc == nullptr || wsWithTiling == nullptr) { - return Status::FailStatus(-1, "Get workspaceFunc or Get workspace tiling failed!"); + if (wsWithTiling == nullptr) { + delete kernelArgs; + return Status::FailStatus(-1, "Get workspace tiling space failed!"); } wsWithTiling->tilingDevice = tilingDataPtr; wsWithTiling->tilingDeviceDup = tilingDataPtr; -- Gitee From b7f46d29536e431174b31e4058a037487ace32d3 Mon Sep 17 00:00:00 2001 From: zouyanlong Date: Wed, 24 Sep 2025 10:45:53 +0800 Subject: [PATCH 83/94] fix --- example/op_demo/fused_add_topk_div/README.md | 26 +++++++++++++++++++ example/op_demo/fused_add_topk_div/build.sh | 4 +-- ...pk_div.cpp => fused_add_topk_div_demo.cpp} | 0 3 files changed, 28 insertions(+), 2 deletions(-) rename example/op_demo/fused_add_topk_div/{fused_add_topk_div.cpp => fused_add_topk_div_demo.cpp} (100%) diff --git a/example/op_demo/fused_add_topk_div/README.md b/example/op_demo/fused_add_topk_div/README.md index ad529f6e..f2f62b1f 100644 --- a/example/op_demo/fused_add_topk_div/README.md +++ b/example/op_demo/fused_add_topk_div/README.md @@ -28,3 +28,29 @@ ## 额外说明 示例中生成的数据不代表实际场景,如需数据生成参考请查看python用例目录: tests/apitest/opstest/python/operations/fused_add_topk_div/ + +## 场景说明 + + 该算子所给demo仅支持在Atlas A2/A3系列产品上运行,demo的场景说明如下: + +- fused_add_topk_div_demo + + **参数设置**: + | 成员名称 | 取值 | + | :------------------ | :------------------- | + | groupNum | 8 | + | groupTopk | 4 | + | n | 2 | + | k | 8 | + | activationType | `ACTIVATION_SIGMOID` | + | isNorm | `true` | + | scale | 2.5 | + | enableExpertMapping | `false` | + + **数据规格**: + | tensor名字 | 数据类型 | 数据格式 | 维度信息 | cpu/npu | + | ---------- | -------- | -------- | ---------- | ------- | + | `x` | float16 | nd | [512, 256] | npu | + | `add_num` | float16 | nd | [256] | npu | + | `y` | float | nd | [512, 8] | npu | + | `indices` | int32 | nd | [512, 8] | npu | diff --git 
a/example/op_demo/fused_add_topk_div/build.sh b/example/op_demo/fused_add_topk_div/build.sh index 351a64e8..fc37e66c 100644 --- a/example/op_demo/fused_add_topk_div/build.sh +++ b/example/op_demo/fused_add_topk_div/build.sh @@ -20,5 +20,5 @@ except ImportError: echo "Using cxx_abi=$cxx_abi" g++ -D_GLIBCXX_USE_CXX11_ABI=$cxx_abi -I "${ATB_HOME_PATH}/include" -I "${ASCEND_HOME_PATH}/include" -L "${ATB_HOME_PATH}/lib" -L "${ASCEND_HOME_PATH}/lib64" \ -fused_add_topk_div.cpp ../demo_util.h -l atb -l ascendcl -o fused_add_topk_div -./fused_add_topk_div +fused_add_topk_div_demo.cpp ../demo_util.h -l atb -l ascendcl -o fused_add_topk_div_demo +./fused_add_topk_div_demo diff --git a/example/op_demo/fused_add_topk_div/fused_add_topk_div.cpp b/example/op_demo/fused_add_topk_div/fused_add_topk_div_demo.cpp similarity index 100% rename from example/op_demo/fused_add_topk_div/fused_add_topk_div.cpp rename to example/op_demo/fused_add_topk_div/fused_add_topk_div_demo.cpp -- Gitee From 28d702a865aa32bc7e80d70d165d954462942537 Mon Sep 17 00:00:00 2001 From: He Changcheng Date: Wed, 24 Sep 2025 15:24:02 +0800 Subject: [PATCH 84/94] fix --- src/kernels/lcal/src/lccl.cpp | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/kernels/lcal/src/lccl.cpp b/src/kernels/lcal/src/lccl.cpp index b1d81330..eea814a5 100644 --- a/src/kernels/lcal/src/lccl.cpp +++ b/src/kernels/lcal/src/lccl.cpp @@ -59,7 +59,6 @@ int GetAclResInCurThread(int type, uint32_t &resource) aclFn = reinterpret_cast(sym); initFlag.store(LCAL_SUCCESS, std::memory_order_release); MKI_LOG(DEBUG) << "Loaded libascendcl.so and resolved aclrtGetResInCurrentThread from: " << p; - return; // 成功 }); // 初始化结果判定 @@ -73,6 +72,12 @@ int GetAclResInCurThread(int type, uint32_t &resource) return LCAL_ERROR_PARA_CHECK_FAIL; } + // 调用前检查函数指针有效性 + if (aclFn == nullptr) { + MKI_LOG(ERROR) << "aclrtGetResInCurrentThread function pointer is null."; + return LCAL_ERROR_INTERNAL; + } + // 调用底层函数 const int ret = 
aclFn(type, &resource); if (ret != ACL_SUCCESS) { -- Gitee From 4858fc7f94c003ad557e8bf6cb6d0e6e0c0e9c04 Mon Sep 17 00:00:00 2001 From: huangxiaolan Date: Wed, 24 Sep 2025 17:23:02 +0800 Subject: [PATCH 85/94] recover class defination --- .../tbe_adapter/platform/tiling/platform/platform_ascendc.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/kernels/tbe_adapter/platform/tiling/platform/platform_ascendc.h b/src/kernels/tbe_adapter/platform/tiling/platform/platform_ascendc.h index 8e2b0a53..ad6082e9 100644 --- a/src/kernels/tbe_adapter/platform/tiling/platform/platform_ascendc.h +++ b/src/kernels/tbe_adapter/platform/tiling/platform/platform_ascendc.h @@ -18,7 +18,6 @@ #include #include -#include "stubs/include/metadef/inc/external/platform/platform_info.h" #define ASCENDC_ASSERT(cond, behavior) \ do { \ @@ -27,6 +26,9 @@ raise(SIGABRT); \ } \ } while (0) +namespace fe { +class PlatformInfo; +} namespace platform_ascendc { enum class CoreMemType { -- Gitee From 2a6076f201577e02def5ff9027cb0d5de467e412 Mon Sep 17 00:00:00 2001 From: huangxiaolan Date: Wed, 24 Sep 2025 17:24:38 +0800 Subject: [PATCH 86/94] recover class defination --- .../tbe_adapter/platform/tiling/platform/platform_ascendc.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/kernels/tbe_adapter/platform/tiling/platform/platform_ascendc.h b/src/kernels/tbe_adapter/platform/tiling/platform/platform_ascendc.h index ad6082e9..6eda0979 100644 --- a/src/kernels/tbe_adapter/platform/tiling/platform/platform_ascendc.h +++ b/src/kernels/tbe_adapter/platform/tiling/platform/platform_ascendc.h @@ -27,7 +27,7 @@ } \ } while (0) namespace fe { -class PlatformInfo; +class PlatFormInfos; } namespace platform_ascendc { -- Gitee From 97ad742b4f5323e07bc5c652299c3623748ea056 Mon Sep 17 00:00:00 2001 From: qiuqianjin Date: Wed, 24 Sep 2025 17:54:50 +0800 Subject: [PATCH 87/94] [task]pa test support range -5 to 5 --- .../paged_attention/precision_calcu.py | 2 +- 
.../test_paged_attention_operation_range_5.py | 1411 +++++++++++++++++ 2 files changed, 1412 insertions(+), 1 deletion(-) create mode 100644 tests/apitest/opstest/python/operations/paged_attention/test_paged_attention_operation_range_5.py diff --git a/tests/apitest/opstest/python/operations/paged_attention/precision_calcu.py b/tests/apitest/opstest/python/operations/paged_attention/precision_calcu.py index 2db3c4b3..9ad48e21 100644 --- a/tests/apitest/opstest/python/operations/paged_attention/precision_calcu.py +++ b/tests/apitest/opstest/python/operations/paged_attention/precision_calcu.py @@ -134,7 +134,7 @@ def compare_cv(golden:torch.Tensor, gpu:torch.Tensor, actual:torch.Tensor): logging.info(f"mere_npu:{mere_npu} mere_gpu:{mere_gpu}") logging.info(f"rmse_npu:{rmse_npu} rmse_gpu:{rmse_gpu}") logging.info(f"MARE:{mare_rate} MERE:{mere_rate} RMSE:{rmse_rate} EB:{EB}") - logging.info(f"new golden cv result:{result}") + print(f"new golden cv result:{result}") return result if __name__ == '__main__': diff --git a/tests/apitest/opstest/python/operations/paged_attention/test_paged_attention_operation_range_5.py b/tests/apitest/opstest/python/operations/paged_attention/test_paged_attention_operation_range_5.py new file mode 100644 index 00000000..d613fa65 --- /dev/null +++ b/tests/apitest/opstest/python/operations/paged_attention/test_paged_attention_operation_range_5.py @@ -0,0 +1,1411 @@ +# +# Copyright (c) 2025 Huawei Technologies Co., Ltd. +# This file is a part of the CANN Open Software. +# Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. 
+# See LICENSE in the root of the software repository for the full text of the License. +# +import json +import math +import os +import random +import sys +import unittest +import collections +import numpy as np +import torch +import torch_npu + +sys.path.append(os.path.join(os.path.dirname(__file__), "../")) +import operation_test # NOQA: E402 +from precision_calcu import * + +MAX_SEQ_LEN = 1024 + +class TestPagedAttention(operation_test.OperationTest): + + def compare_output_data(self, out, golden, ratios): + error_count = 0 + strict_error_count = 0 + fp16_min_normal = 1.0 / (1 << 14) + golden = golden.flatten().to(torch.float32) + out = out.flatten().to(torch.float32) + len = out.shape[0] + diff = torch.abs(golden - out) + max_diff = diff.max().item() + limit_error = torch.maximum(torch.abs(golden * ratios[0]), torch.tensor(ratios[1])) + strict_limit_error = torch.maximum(torch.abs(golden * ratios[2]), torch.tensor(ratios[3])) + error_count = torch.gt(diff, limit_error).sum().item() + strict_error_count = torch.gt(diff, strict_limit_error).sum().item() + print(f"maxDiff {max_diff}") + print("1/1000 Accuracy is %f", 1 - float(error_count) / len) + print("5/1000 Accuracy is %f", 1 - float(strict_error_count) / len) + if self.data_type == torch.bfloat16 or self.is_int8_flag: + print("accuracy is correct in old standard: %r", (float(strict_error_count) / len) <= ratios[2]) + else: + print("accuracy is correct in old standard: %r", (float(strict_error_count) / len) <= ratios[0]) + calc_times = self.head_size * self.max_context_len + 4 + if self.data_type == torch.bfloat16: + if calc_times < 2048: + error = 2**(-7) + else : + error = 2**(-6) + error_threshold = torch.clamp(torch.abs(golden), min = 1) * error + res = (diff <= error_threshold).all().item() + print("accuracy is correct in new standard: %r", res) + return res + else: + if calc_times < 2048: + error = 2**(-8) + else : + error = 2**(-7) + error_threshold = torch.clamp(torch.abs(golden), min = 1) * error + 
res = (diff <= error_threshold).all().item() + print("accuracy is correct in new standard: %r", res) + return res + + def get_alibi_slopes(self, n_heads): + n = 2 ** math.floor(math.log2(n_heads)) + m0 = 2.0 ** (-8.0 / n) + slopes = torch.pow(m0, torch.arange(1, n + 1)) + if n < n_heads: + m1 = 2.0 ** ( -4.0 / n) + mm = torch.pow(m1, torch.arange(1, 1 + 2 * (n_heads - n), 2)) + slopes = torch.cat([slopes, mm]) + # slopes = torch.ones(n_heads) + return slopes + + def group_mm_torch(self, heads, kv_head, A, B, razor_mod, is_k): + group_head = heads // kv_head + score_high = None + for i in range(kv_head): + if self.is_int8_flag: + int8_B = B[i: (i+1), :, :, ] + head_dim = int8_B.shape[2] + float32_B = int8_B.to(torch.float32) + if is_k: + if self.has_bias: + float32_B = float32_B + self.offset1[(i + razor_mod) * head_dim : (i + razor_mod + 1) * head_dim].to(torch.float32) + fp32_B = float32_B.to(torch.float32) * self.de_scale1_fp32[(i + razor_mod) * head_dim : (i + razor_mod + 1) * head_dim] + fp32_B = torch.permute(fp32_B, (0, 2, 1)) + else: + if self.has_bias: + float32_B = float32_B + self.offset2[(i + razor_mod) * head_dim : (i + razor_mod + 1) * head_dim] + fp32_B = float32_B.to(torch.float32) * self.de_scale2_fp32[(i + razor_mod) * head_dim : (i + razor_mod + 1) * head_dim] + group_score_high = torch.matmul(A[i * group_head: (i + 1) * group_head, :, :].to(torch.float32), + fp32_B) + elif self.is_quant_flag: + group_score_int32 = torch.matmul(A[i*group_head: (i + 1)*group_head, :, :].to(torch.int32), + B[i: (i+1), :, :].to(torch.int32)).to(torch.int32) + if is_k: + group_score_high = group_score_int32.to(torch.float32) * self.de_scale1_fp32[(i * group_head): (i + 1) * group_head].reshape(group_head, 1, 1).to(torch.float32) + else: + group_score_high = group_score_int32.to(torch.float32) * self.de_scalev[(i * group_head): (i + 1) * group_head].reshape(group_head, 1, 1).to(torch.float32) + else: + group_score_high = torch.matmul(A[i * group_head: (i + 1) * 
group_head, :, :].to(torch.float32), + B[i:(i + 1), :, :].to(torch.float32)) + if score_high is None: + score_high = group_score_high + else: + score_high = torch.cat((score_high, group_score_high), 0) + return score_high + + def process_deq_scale(self, deq_scale) -> np.ndarray: + new_deq_scale = np.frombuffer(deq_scale.tobytes(), dtype=np.uint32) + return new_deq_scale.astype(np.uint64) + + def softmax(self, sim): + row_max = torch.max(sim, axis=-1, keepdims=True)[0] + sim_sub = sim - row_max + sim_sub = torch.exp(sim_sub) + row_sum = torch.sum(sim_sub, axis=-1, keepdims=True) + soft_res = sim_sub / row_sum + return soft_res + + def softmax_numpy(self, sim): + sim = sim.cpu().numpy() + row_max = np.max(sim, axis=-1, keepdims=True) + sim_sub = sim - row_max + sim_sub = np.exp(sim_sub) + row_sum = np.sum(sim_sub, axis=-1, keepdims=True) + soft_res = sim_sub / row_sum + return soft_res + + def softmax_quant_numpy(self, sim, is_first): + lm = np.max(sim, axis=-1, keepdims=True) + if is_first: + hm = lm + self.dm = 0 + else: + hm = np.maximum(self.gm, lm) + self.dm = self.gm - hm + self.gm = hm + sim_sub = sim - hm + sim_sub = np.exp(sim_sub) + row_sum = np.sum(sim_sub, axis=-1, keepdims=True) + row_maxp = np.max(sim_sub, axis=-1, keepdims=True) + if not self.is_quant_offiline: + scale = row_maxp.astype("float32") / 127.0 + sim_int8 = sim_sub / scale + soft_res = sim_int8.astype("float16") + soft_res = np.rint(soft_res).astype("int8") + de_scalev = self.de_scale2_fp32 * row_maxp[:,0,0] / 127 + else: + soft_res = sim_sub * self.scale.reshape(self.scale.shape[0], 1, 1).numpy() + soft_res = soft_res.astype("float16") + soft_res = np.rint(soft_res).astype("int8") + de_scalev = self.de_scale2_fp32 + return soft_res, row_sum, de_scalev, hm, self.dm + + + def softmax_quant_numpy_online(self, sim, heads, kv_head, value, razor_mod): + group_head = heads // kv_head + score_high = None + # (kv_heads, context_len, head_size) + kv_seqlen = value.shape[1] + cur_kv_seqlen = kv_seqlen 
+ n_loop = (cur_kv_seqlen + self.block_size_calc - 1) // self.block_size_calc + qk_n = self.block_size_calc + self.tmp_l_list = [] + self.tmp_o_list = [] + for cur_nIndx in range(self.kvsplit): + kv_seqlen_align = (kv_seqlen + self.block_size - 1) // self.block_size * self.block_size + start_kv = cur_nIndx * self.kv_split_per_core + cur_kv_seqlen = self.kv_split_per_core + kv_loop = (kv_seqlen_align + self.kv_split_per_core - 1) // self.kv_split_per_core + if cur_nIndx >= kv_loop: + continue + if cur_nIndx == (kv_loop - 1): + cur_kv_seqlen = kv_seqlen - cur_nIndx * self.kv_split_per_core + n_loop = (cur_kv_seqlen + self.block_size_calc - 1) // self.block_size_calc + qk_n = self.block_size_calc + end_kv = start_kv + for n_idx in range(n_loop): + is_first = (n_idx == 0) + if n_idx == n_loop - 1: + qk_n = cur_kv_seqlen - n_idx * self.block_size_calc + end_kv = end_kv + qk_n + sim_block = sim[:, :, start_kv : end_kv] + p_block, ll, de_scalev, hm, dm = self.softmax_quant_numpy(sim_block, is_first) + self.de_scalev = de_scalev + value_block = value[:, start_kv : end_kv, :] + lo = self.group_mm_torch(heads, kv_head, torch.from_numpy(p_block), value_block, razor_mod, 0) + lo = lo.cpu().numpy() + if n_idx == 0: + self.gl = ll + self.go = lo + else: + dm = np.exp(dm) + self.gl = self.gl * dm + self.gl = self.gl + ll + self.go = self.go * dm + self.go = self.go + lo + start_kv = start_kv + qk_n + self.go = self.go / self.gl + self.tmp_o_list.append(self.go.reshape([1, self.num_heads, 1, value.shape[2]])) + ls = np.log(self.gl) + self.gm + self.tmp_l_list.append(ls.reshape([1, self.num_heads])) + if self.kvsplit > 1: + l = np.concatenate(self.tmp_l_list, 0) + o = np.concatenate(self.tmp_o_list, 0) + l = np.transpose(l, (1, 0)) + lse_max = np.max(l, axis=1, keepdims=True) + l_tmp = np.exp(l - lse_max) + lse_sum = np.sum(l_tmp, axis=1, keepdims=True) + lse_logsum = np.log(lse_sum) + lse_max + scale = np.exp(l - lse_logsum) + o = o * scale.transpose(1, 
0)[:,:,np.newaxis,np.newaxis] + self.go = np.sum(o, axis=0, keepdims=True) + self.go = np.squeeze(self.go, axis=0) + return torch.from_numpy(self.go) + + def ref_masked_attention(self, + query, # (1, num_heads, head_size) + key, # (context_len, kv_heads, head_size) + value, + scale: float, + alibi_bias, + razor_rope, + razor_offset_list, + razor_mod, + mask_data_type = torch.bfloat16, + ): + # Q * K.T + query = query + query = torch.permute(query, (1, 0, 2)) + if not self.is_int8_flag: + key = torch.permute(key, (1, 2, 0)) # 0 1 2 + else: + key = torch.permute(key, (1, 0, 2)) + sim_high = self.group_mm_torch(query.shape[0], key.shape[0], query, key, razor_mod, 1) # (head_num, q_seqlen, k_seqlen) + + if razor_rope: + razor_offset_list = razor_offset_list.view(1, 1, razor_offset_list.shape[0]) + sim_high = sim_high.to(torch.float32) + razor_offset_list + sim_high = sim_high.to(torch.float32) * scale + if alibi_bias is not None: + sim_high = sim_high + alibi_bias.to(torch.float32) + # softmax + if self.is_quant_flag: + self.gm = np.full([query.shape[0] , 1, 1], np.finfo(np.float32).min) + p_high, row_sum, de_scalev, _, _ = self.softmax_quant_numpy(sim_high.numpy(), 1) + self.de_scalev = de_scalev + value = torch.permute(value, (1, 0, 2)) + out_high = self.group_mm_torch(query.shape[0], key.shape[0], torch.from_numpy(p_high), value, razor_mod, 0) + out_high = out_high / row_sum + out_high = torch.permute(out_high, (1, 0, 2)) + s_qk = sim_high.numpy() + out = self.softmax_quant_numpy_online(s_qk, query.shape[0], key.shape[0], value, razor_mod) + else: + p_high = self.softmax_numpy(sim_high) + p = torch.from_numpy(p_high).to(mask_data_type) + p_high = torch.from_numpy(p_high) + # P * V + value = torch.permute(value, (1, 0, 2)) + out = self.group_mm_torch(query.shape[0], key.shape[0], p, value, razor_mod, 0) + out_high = self.group_mm_torch(query.shape[0], key.shape[0], p_high, value, razor_mod, 0) + out = torch.permute(out, (1, 0, 2)) + out_high = torch.permute(out_high, 
(1, 0, 2)) + return out, out_high + + def ref_single_query_cached_kv_attention(self, + output, + true_out, + query, + key_cache, # (num_blocks, block_size, num_heads, head_size) + value_cache, # (num_blocks, block_size, num_heads, head_size) + block_tables, + context_lens, + mask, + razor_offset, + razor_rope, + mask_dim = 4, + mask_data_type = torch.bfloat16 + ) -> None: + mask_index_coff = 1 + if self.compressHead: + query = query.view(self.num_tokens * self.kv_heads, self.num_heads // self.kv_heads, self.head_size) + output = output.view(self.num_tokens * self.kv_heads, self.num_heads // self.kv_heads, self.head_size) + true_out = true_out.view(self.num_tokens * self.kv_heads, self.num_heads // self.kv_heads, self.head_size) + if mask_dim == 4: + mask_shape = mask.shape + mask = mask.view(mask_shape[0] * self.kv_heads, self.num_heads // self.kv_heads, 1, self.max_context_len) + else: + mask_index_coff = self.kv_heads + num_heads = query.shape[1] + kv_heads = value_cache.shape[2] + head_size = value_cache.shape[3] + block_size = value_cache.shape[1] + + num_input_tokens = query.shape[0] + index = 0 + razor_mod = 0 + if self.scaleType == 2: + self.logN = torch.tensor([2.0] * len(context_lens)).to(torch.float32) + self.logN.uniform_(1, 2) + for i in range(len(context_lens)): + block_table = block_tables[i] + context_len = int(context_lens[i]) + if context_len == 0: + continue + q = query[index].view(1, num_heads, head_size) + keys = [] + values = [] + razor_offset_list = [] + for j in range(context_len): + block_number = int(block_table[j // block_size]) + block_offset = j % block_size + + k = key_cache[block_number, block_offset, :, :] + k = k.reshape(kv_heads, head_size) + keys.append(k) + + v = value_cache[block_number, block_offset, :, :] + v = v.reshape(kv_heads, head_size) + values.append(v) + + if razor_rope: + offset = razor_offset[block_number, block_offset] + razor_offset_list.append(offset) + + keys = torch.stack(keys, axis=0) + values = 
torch.stack(values, axis=0) + + if razor_rope: + razor_mod = i % self.kv_heads + razor_offset_list = torch.stack(razor_offset_list, axis=0) + self.razor_start_head = (i * num_heads) % self.num_heads + elif self.compressHead: + razor_mod = i % self.kv_heads + self.razor_start_head = (i * num_heads) % self.num_heads + else: + self.razor_start_head = 0 + scale = np.float32(1.0 / (head_size ** 0.5)) + if self.scaleType == 2: + scale *= self.logN[i] + if mask_dim == 4: + out, out_high = self.ref_masked_attention(q, keys, values, scale, mask[i, :, :, :context_len], razor_rope, razor_offset_list, razor_mod, mask_data_type) + out = out.reshape(num_heads, head_size) + elif mask_dim == 3: + out,out_high = self.ref_masked_attention(q, keys, values, scale, mask[i // mask_index_coff, :, :context_len], razor_rope, razor_offset_list, razor_mod, mask_data_type) + out = out.reshape(num_heads, head_size) + else: + out,out_high = self.ref_masked_attention(q, keys, values, scale, mask, razor_rope, razor_offset_list, razor_mod, mask_data_type) + out = out.reshape(num_heads, head_size) + out_high = out_high.reshape(num_heads, head_size) + output[index] = out.to(mask_data_type) + true_out[index] = out_high + index = index + 1 + + def get_blockszie_calc(self, max_context_len, block_size, embeddingSize, embeddingSizeV): + embedQKSplit = 256 if embeddingSize > 256 else embeddingSize + embedVOSplit = 256 if embeddingSizeV > 256 else embeddingSizeV + BLOCK_LIMIT = 128 * 128 + KV_SEQLEN_SLICE = 128 + KV_SEQLEN_SLICE_256 = 256 + KV_SEQLEN_SLICE_512 = 512 + BLOCK_LIMIT_NO_PINGPONG = 128 * 256; + block_size_calc = block_size + headdimMax = np.maximum(embedQKSplit, embedVOSplit) + if block_size <= KV_SEQLEN_SLICE / 2 and \ + block_size * 2 * embedQKSplit <= BLOCK_LIMIT and \ + block_size * 2 * embedVOSplit <= BLOCK_LIMIT: + block_size_calc = block_size * 2 + if not self.is_int8_flag and \ + max_context_len >= KV_SEQLEN_SLICE_256 and \ + self.kv_split_per_core >= KV_SEQLEN_SLICE_256 and \ + 
KV_SEQLEN_SLICE_256 * embedQKSplit <= BLOCK_LIMIT_NO_PINGPONG and \ + KV_SEQLEN_SLICE_256 * embedVOSplit <= BLOCK_LIMIT_NO_PINGPONG and \ + (block_size == KV_SEQLEN_SLICE_256 // 4 or block_size == KV_SEQLEN_SLICE_256 // 2): + block_size_calc = 256 + + if self.is_quant_flag and \ + max_context_len >= KV_SEQLEN_SLICE_512 and \ + self.kv_split_per_core >= KV_SEQLEN_SLICE_512 and \ + KV_SEQLEN_SLICE_512 * embedQKSplit <= BLOCK_LIMIT_NO_PINGPONG * 2 and \ + KV_SEQLEN_SLICE_512 * embedVOSplit <= BLOCK_LIMIT_NO_PINGPONG * 2 and \ + (block_size == KV_SEQLEN_SLICE_256 // 4 or block_size == KV_SEQLEN_SLICE_256 // 2) and \ + KV_SEQLEN_SLICE_512 * headdimMax <= BLOCK_LIMIT_NO_PINGPONG and self.head_num_move < 4: + block_size_calc = KV_SEQLEN_SLICE_512 + return block_size_calc + + def getkvsplit(self, num_tokens, num_heads, max_context_len, block_size, blocknum, isLongSeq): + if isLongSeq: + kvSeqklenMaxAlign = (max_context_len + block_size - 1) // block_size * block_size + kvSeqBlockNum = int(kvSeqklenMaxAlign / block_size) + kvBlockPreCore = int((kvSeqBlockNum + blocknum - 1)) // blocknum + kvSplitPerCore = int(kvBlockPreCore * block_size) + kvSplitCoreNum = int(kvSeqklenMaxAlign + kvSplitPerCore - 1) // kvSplitPerCore + headSplit = int((num_heads + kvSplitCoreNum - 1) // kvSplitCoreNum) + else: + coreNumPerBatch = int((blocknum + num_tokens - 1) // num_tokens) + kvSeqklenMaxAlign = (max_context_len + block_size - 1) // block_size * block_size + kvSeqBlockNum = int(kvSeqklenMaxAlign / block_size) + kvBlockPreCore = int((kvSeqBlockNum + coreNumPerBatch - 1)) // coreNumPerBatch + kvSplitPerCore = int(kvBlockPreCore * block_size) + kvSplitCoreNum = int(kvSeqklenMaxAlign + kvSplitPerCore - 1) // kvSplitPerCore + headSplit = int((num_heads + kvSplitCoreNum - 1) // kvSplitCoreNum) + return kvSplitCoreNum, kvSplitPerCore + + def get_head_num_move(self, num_heads, kvhead, embeddingSize, embeddingSizeV): + if embeddingSize % 32 == 0 and embeddingSizeV % 32 == 0 and embeddingSize <= 
128 and embeddingSizeV <= 128 and num_heads == kvhead: + head_num_move = 4 + else: + head_num_move = 1 + return head_num_move + + def calc_data(self, num_tokens, num_heads, kv_heads, head_size, block_size, num_blocks, k_seqlen,\ + dtype, mask_dim = 4, mask_data_type = torch.bfloat16,\ + dynamic_batch = False, dynamic_seqlen = None, is_int8_flag = False, has_bias = False, + compressHead = False, razor_rope = False, blocknum = 20, is_quant_flag = 0, is_quant_offiline = 0, scaleType = 0): + self.num_heads = num_heads + self.kv_heads = kv_heads + self.num_tokens = num_tokens + self.compressHead = compressHead + self.head_size = head_size + self.scaleType = scaleType + self.group_num = num_heads / kv_heads + logging.debug(f'input info: {num_tokens}, {num_heads}, {kv_heads}, {head_size}, {block_size}, {num_blocks}, {k_seqlen}, {dtype}') + + q_min_range = -5.0 + q_max_range = 5.0 + kv_min_range = -5.0 + kv_max_range = 5.0 + kv_type = dtype + self.is_quant_flag = is_quant_flag + self.is_quant_offiline = is_quant_offiline + if self.is_quant_flag: + q_min_range = -5 + q_max_range = 5 + kv_min_range = -5 + kv_max_range = 5 + dtype = torch.int8 + kv_type = torch.int8 + if is_int8_flag: + kv_min_range = -5 + kv_max_range = 5 + kv_type = torch.int8 + query = torch.from_numpy(np.random.uniform(q_min_range, q_max_range, size=(num_tokens, num_heads, head_size))).to(dtype) + # (num_blocks, block_size, num_heads, head_size) + if not compressHead: + key_cache = torch.from_numpy(np.random.uniform(kv_min_range, kv_max_range, size=(num_blocks, block_size, kv_heads, head_size))).to(kv_type) + # # (num_blocks, block_size, num_heads, head_size) + value_cache = torch.from_numpy(np.random.uniform(kv_min_range, kv_max_range, size=(num_blocks, block_size, kv_heads, head_size))).to(kv_type) + # (num_blocks, block_size, num_heads, head_size) + else: + key_cache = torch.from_numpy(np.random.uniform(kv_min_range, kv_max_range, size=(num_blocks * kv_heads, block_size, 1, head_size))).to(kv_type) + # 
# (num_blocks, block_size, num_heads, head_size) + value_cache = torch.from_numpy(np.random.uniform(kv_min_range, kv_max_range, size=(num_blocks * kv_heads, block_size, 1, head_size))).to(kv_type) + # (num_blocks, block_size, num_heads, head_size) + self.data_type = dtype + + razor_offset = torch.tensor([], dtype=torch.float32) + if razor_rope: + razor_offset = torch.zeros(num_blocks * kv_heads, block_size) + mask = np.random.choice([False, True], size=num_blocks * kv_heads, p=[0.2, 0.8]) + + random_indices = np.random.randint(0, block_size, size=np.sum(mask)) + random_values = np.random.uniform(0, 20, size=np.sum(mask)) + + active_rows = np.where(mask)[0] + razor_offset[active_rows, random_indices] = torch.from_numpy(random_values).to(torch.float32) + + if dynamic_batch: + context_lens = dynamic_seqlen + else: + context_lens = [k_seqlen] * num_tokens + max_context_len = max(context_lens) + self.max_context_len = max_context_len + batch = len(context_lens) + + # alibi mask + if mask_dim == 4: + mask = np.zeros((batch, num_heads, 1, self.max_context_len), dtype=np.float32) + alibi_slopes = self.get_alibi_slopes(num_heads) + for i, context_len in enumerate(context_lens): + if context_len == 0: + continue + position_ids = np.arange(context_len).astype(np.int32) + alibi_bias = (position_ids - context_len + 1).astype(np.float32) + alibi_bias = alibi_slopes.reshape(-1, 1, 1) * alibi_bias.reshape(1, 1, -1) # (head_num, 1, context) + mask[i, :, :, :context_len] = alibi_bias + mask = torch.from_numpy(mask).to(mask_data_type) + # normal mask headnum, 1, maxS + elif mask_dim == 3: + mask = np.zeros((batch, 1, max_context_len), dtype=np.float16) + for i in range(batch): + mask[i, :, :i] = -10000 + mask = torch.from_numpy(mask).to(mask_data_type) + else: # no mask + mask = None + + if compressHead: + context_lens = [val for val in context_lens for _ in range(kv_heads)] + batch = len(context_lens) + max_num_blocks_per_seq = (max_context_len + block_size - 1) // block_size + 
block_tables = [] # (num_tokens, max_num_blocks_per_seq) + for _ in range(batch): + block_table = [ + random.randint(0, num_blocks - 1) for _ in range(max_num_blocks_per_seq) + ] + block_tables.append(block_table) + self.is_int8_flag = is_int8_flag + + if is_int8_flag: + de_scale1_fp32 = np.random.randint(-1, 2, size=(kv_heads * head_size)).astype(np.float32) + de_scale1_int64 = self.process_deq_scale(de_scale1_fp32) + + de_scale2_fp32 = np.random.randint(-1, 2, size=(kv_heads * head_size)).astype(np.float32) + de_scale2_int64 = self.process_deq_scale(de_scale2_fp32) + + offset1 = np.random.randint(-20, 20, size=(kv_heads * head_size)).astype(np.int32) + + offset2 = np.random.randint(-20, 20, size=(kv_heads * head_size)).astype(np.int32) + + self.de_scale1_int64 = torch.tensor(list(de_scale1_int64), dtype=torch.int64) + self.de_scale2_int64 = torch.tensor(list(de_scale2_int64), dtype=torch.int64) + self.de_scale1_fp32 = torch.from_numpy(de_scale1_fp32) + self.de_scale2_fp32 = torch.from_numpy(de_scale2_fp32) + self.offset1 = torch.from_numpy(offset1) + self.offset2 = torch.from_numpy(offset2) + self.has_bias = has_bias + + if self.is_quant_flag: + self.de_scale1_fp32 = torch.from_numpy(np.random.uniform(-5/127, 5/127, size=(num_heads)).astype(np.float32)).to(torch.float32) + self.de_scale2_fp32 = torch.from_numpy(np.random.uniform(-5/127, 5/127, size=(num_heads)).astype(np.float32)).to(torch.float32) + self.scale = torch.from_numpy(np.random.uniform(0, 127, size=(num_heads)).astype(np.float32)).to(torch.float32) + isLongSeq = max_context_len > blocknum * 128 * 2 and num_tokens < blocknum * 0.8 + if num_tokens * num_heads < 0.8 * blocknum or isLongSeq: + self.kvsplit, self.kv_split_per_core = self.getkvsplit(num_tokens, num_heads, max_context_len, block_size, blocknum, isLongSeq) + else: + self.kvsplit = 1 + self.kv_split_per_core = max_context_len + self.head_num_move = self.get_head_num_move(num_heads, kv_heads, head_size, head_size) + self.block_size_calc = 
self.get_blockszie_calc(max_context_len, block_size, head_size, head_size) + self.block_size = block_size + + ref_output = torch.zeros_like(query).to(torch.float32) + true_out = torch.zeros_like(query, dtype=torch.float32) + + self.ref_single_query_cached_kv_attention( + ref_output, + true_out, + query, + key_cache, + value_cache, + block_tables, + context_lens, + mask, + razor_offset, + razor_rope, + mask_dim, + mask_data_type + ) + self.q = query + self.key_cache = key_cache + self.value_cache = value_cache + self.block_tables = np.array(block_tables).astype(np.int32) + self.contex_lens = np.array(context_lens).astype(np.int32) + self.mask = mask + self.golden_out = ref_output + self.true_out = true_out + self.razor_offset = razor_offset + + def calc_data_bnsd(self, num_tokens, num_heads, kv_heads, head_size, block_size, num_blocks, k_seqlen,\ + dtype, mask_dim = 4, mask_data_type = torch.bfloat16,\ + dynamic_batch = False, dynamic_seqlen = None, is_int8_flag = False, has_bias = False, + compressHead = False, razor_rope = False, scaleType = 0): + self.num_heads = num_heads + self.kv_heads = kv_heads + self.num_tokens = num_tokens + self.compressHead = compressHead + self.head_size = head_size + self.is_quant_flag = 0 + self.scaleType = scaleType + logging.debug(f'input info: {num_tokens}, {num_heads}, {kv_heads}, {head_size}, {block_size}, {num_blocks}, {k_seqlen}, {dtype}') + + query = torch.from_numpy(np.random.uniform(-1.0, 1.0, size=(num_tokens, num_heads, head_size))).to(dtype) + # (num_blocks, block_size, num_heads, head_size) + kv_range = 5.0 + kv_type = dtype + if is_int8_flag: + kv_range = 4.0 + kv_type = torch.int8 + if not compressHead: + key_cache = torch.from_numpy(np.random.uniform(-kv_range, kv_range, size=(num_blocks, block_size, kv_heads, head_size))).to(kv_type) + # (num_blocks, block_size, num_heads, head_size) + value_cache = torch.from_numpy(np.random.uniform(-kv_range, kv_range, size=(num_blocks, block_size, kv_heads, 
head_size))).to(kv_type) + else: + key_cache = torch.from_numpy(np.random.uniform(-kv_range, kv_range, size=(num_blocks * kv_heads, block_size, 1, head_size))).to(kv_type) + # (num_blocks, block_size, num_heads, head_size) + value_cache = torch.from_numpy(np.random.uniform(-kv_range, kv_range, size=(num_blocks * kv_heads, block_size, 1, head_size))).to(kv_type) + self.data_type = dtype + + razor_offset = torch.tensor([], dtype=torch.float32) + if razor_rope: + razor_offset = torch.zeros(num_blocks * kv_heads, block_size) + mask = np.random.choice([False, True], size=num_blocks * kv_heads, p=[0.2, 0.8]) + + random_indices = np.random.randint(0, block_size, size=np.sum(mask)) + random_values = np.random.uniform(0, 20, size=np.sum(mask)) + + active_rows = np.where(mask)[0] + razor_offset[active_rows, random_indices] = torch.from_numpy(random_values).to(torch.float32) + + if dynamic_batch: + context_lens = dynamic_seqlen + else: + context_lens = [k_seqlen] * num_tokens + max_context_len = max(context_lens) + self.max_context_len = max_context_len + batch = len(context_lens) + + # alibi mask + if mask_dim == 4: + mask = np.zeros((batch, num_heads, 1, self.max_context_len), dtype=np.float32) + alibi_slopes = self.get_alibi_slopes(num_heads) + for i, context_len in enumerate(context_lens): + if context_len == 0: + continue + position_ids = np.arange(context_len).astype(np.int32) + alibi_bias = (position_ids - context_len + 1).astype(np.float32) + alibi_bias = alibi_slopes.reshape(-1, 1, 1) * alibi_bias.reshape(1, 1, -1) # (head_num, 1, context) + mask[i, :, :, :context_len] = alibi_bias + mask = torch.from_numpy(mask).to(mask_data_type) + # normal mask + elif mask_dim == 3: + mask = np.zeros((batch, 1, max_context_len), dtype=np.float16) + for i in range(batch): + mask[i, :, :i] = -10000 + mask = torch.from_numpy(mask).to(mask_data_type) + else: # no mask + mask = None + + if compressHead: + context_lens = [val for val in context_lens for _ in range(kv_heads)] + batch = 
len(context_lens) + max_num_blocks_per_seq = (max_context_len + block_size - 1) // block_size + block_tables = [] # (num_tokens, max_num_blocks_per_seq) + for _ in range(batch): + block_table = [ + random.randint(0, num_blocks - 1) for _ in range(max_num_blocks_per_seq) + ] + block_tables.append(block_table) + + self.is_int8_flag = is_int8_flag + if is_int8_flag: + de_scale1_fp32 = np.random.randint(-1, 2, size=(kv_heads * head_size)).astype(np.float32) + de_scale1_int64 = self.process_deq_scale(de_scale1_fp32) + + de_scale2_fp32 = np.random.randint(-1, 2, size=(kv_heads * head_size)).astype(np.float32) + de_scale2_int64 = self.process_deq_scale(de_scale2_fp32) + + offset1 = np.random.randint(-20, 20, size=(kv_heads * head_size)).astype(np.int32) + + offset2 = np.random.randint(-20, 20, size=(kv_heads * head_size)).astype(np.int32) + + self.de_scale1_int64 = torch.tensor(list(de_scale1_int64), dtype=torch.int64) + self.de_scale2_int64 = torch.tensor(list(de_scale2_int64), dtype=torch.int64) + self.de_scale1_fp32 = torch.from_numpy(de_scale1_fp32) + self.de_scale2_fp32 = torch.from_numpy(de_scale2_fp32) + self.offset1 = torch.from_numpy(offset1) + self.offset2 = torch.from_numpy(offset2) + self.has_bias = has_bias + + + ref_output = torch.zeros_like(query) + true_out = torch.zeros_like(query, dtype=torch.float32) + self.ref_single_query_cached_kv_attention( + ref_output, + true_out, + query, + key_cache, + value_cache, + block_tables, + context_lens, + mask, + razor_offset, + razor_rope, + mask_dim, + mask_data_type + ) + + self.q = query + self.key_cache = key_cache + self.key_cache_bnsd = torch.permute(key_cache, (0, 2, 1,3)) + self.value_cache = value_cache + self.value_cache_bnsd = torch.permute(value_cache, (0, 2, 1,3)) + self.block_tables = np.array(block_tables).astype(np.int32) + self.contex_lens = np.array(context_lens).astype(np.int32) + self.mask = mask + self.golden_out = ref_output + self.true_out = true_out + self.razor_offset = razor_offset + + def 
golden_calc(self, in_tensors): + golden_out = torch.tensor(self.golden_out) + return [golden_out.npu()] + + def golden_compare(self, out_tensors, golden_tensors): + if self.data_type == torch.bfloat16 and self.is_int8_flag is True: + result_old = self.compare_output_data(out_tensors, self.true_out.npu(), [0.001, 0.001, 0.005, 0.005]) + else: + result_old = self.compare_output_data(out_tensors, golden_tensors, [0.001, 0.001, 0.005, 0.005]) + result_double = compare_cv(self.true_out.npu(), golden_tensors.npu(), out_tensors.npu()) + return (result_double or result_old) + + def test_paged_fp16_nomask(self): + if not operation_test.get_soc_version() == 'Ascend910B': + print("this testcase only supports Ascend910B") + return + num_tokens = 2 + num_heads = 32 + kv_heads = 16 + block_size = 128 + head_size = 288 + num_blocks = 64 + k_seqlen = 256 + tor = float(1.0 / (head_size ** 0.5)) + mask_dim = 0 + dtype = torch.float16 + + self.calc_data(num_tokens, num_heads, kv_heads, head_size, block_size, num_blocks, k_seqlen, dtype, mask_dim, dtype) + + OP_NAME = "PagedAttentionOperation" + + PARAM = json.dumps({"headNum": num_heads, "qkScale": tor, "kvHeadNum": kv_heads, "maskType":0}) + RUN_PARAM = json.dumps({"contextLens": self.contex_lens.tolist()}) + self.execute_with_param(OP_NAME, PARAM, RUN_PARAM, + [ + self.q.npu(), + self.key_cache.npu(), + self.value_cache.npu(), + torch.from_numpy(self.block_tables.astype(np.int32)).npu(), + torch.from_numpy(self.contex_lens).npu() + ]) + + def test_paged_bf16_nomask(self): + if not operation_test.get_soc_version() == 'Ascend910B': + print("this testcase only supports Ascend910B") + return + num_tokens = 2 + num_heads = 32 + kv_heads = 16 + block_size = 128 + head_size = 288 + num_blocks = 64 + k_seqlen = 256 + tor = float(1.0 / (head_size ** 0.5)) + mask_dim = 0 + dtype = torch.bfloat16 + + self.calc_data(num_tokens, num_heads, kv_heads, head_size, block_size, num_blocks, k_seqlen, dtype, mask_dim, dtype) + + OP_NAME = 
"PagedAttentionOperation" + + PARAM = json.dumps({"headNum": num_heads, "qkScale": tor, "kvHeadNum": kv_heads, "maskType":0}) + RUN_PARAM = json.dumps({"contextLens": self.contex_lens.tolist()}) + self.execute_with_param(OP_NAME, PARAM, RUN_PARAM, + [ + self.q.npu(), + self.key_cache.npu(), + self.value_cache.npu(), + torch.from_numpy(self.block_tables.astype(np.int32)).npu(), + torch.from_numpy(self.contex_lens).npu() + ]) + + def test_paged_fp16_normmask(self): + if not operation_test.get_soc_version() == 'Ascend910B': + print("this testcase only supports Ascend910B") + return + num_tokens = 2 + num_heads = 32 + kv_heads = 16 + block_size = 128 + head_size = 288 + num_blocks = 64 + k_seqlen = 256 + tor = float(1.0 / (head_size ** 0.5)) + mask_dim = 3 + dtype = torch.float16 + + self.calc_data(num_tokens, num_heads, kv_heads, head_size, block_size, num_blocks, k_seqlen, dtype, mask_dim, dtype) + + OP_NAME = "PagedAttentionOperation" + + PARAM = json.dumps({"headNum": num_heads, "qkScale": tor, "kvHeadNum": kv_heads, "maskType":1}) + RUN_PARAM = json.dumps({"contextLens": self.contex_lens.tolist(), "maskType":1}) + self.execute_with_param(OP_NAME, PARAM, RUN_PARAM, + [ + self.q.npu(), + self.key_cache.npu(), + self.value_cache.npu(), + torch.from_numpy(self.block_tables.astype(np.int32)).npu(), + torch.from_numpy(self.contex_lens).npu(), + self.mask.npu() + ]) + + def test_paged_bf16_normmask(self): + if not operation_test.get_soc_version() == 'Ascend910B': + print("this testcase only supports Ascend910B") + return + num_tokens = 2 + num_heads = 32 + kv_heads = 16 + block_size = 128 + head_size = 288 + num_blocks = 64 + k_seqlen = 256 + tor = float(1.0 / (head_size ** 0.5)) + mask_dim = 3 + dtype = torch.bfloat16 + + self.calc_data(num_tokens, num_heads, kv_heads, head_size, block_size, num_blocks, k_seqlen, dtype, mask_dim, dtype) + + OP_NAME = "PagedAttentionOperation" + + PARAM = json.dumps({"headNum": num_heads, "qkScale": tor, "kvHeadNum": kv_heads, 
"maskType":1}) + RUN_PARAM = json.dumps({"contextLens": self.contex_lens.tolist(), "maskType":1}) + self.execute_with_param(OP_NAME, PARAM, RUN_PARAM, + [ + self.q.npu(), + self.key_cache.npu(), + self.value_cache.npu(), + torch.from_numpy(self.block_tables.astype(np.int32)).npu(), + torch.from_numpy(self.contex_lens).npu(), + self.mask.npu() + ]) + + def test_paged_fp16_alibmask(self): + if not operation_test.get_soc_version() == 'Ascend910B': + print("this testcase only supports Ascend910B") + return + num_tokens = 2 + num_heads = 32 + kv_heads = 16 + block_size = 128 + head_size = 288 + num_blocks = 64 + k_seqlen = 256 + tor = float(1.0 / (head_size ** 0.5)) + mask_dim = 4 + dtype = torch.float16 + + self.calc_data(num_tokens, num_heads, kv_heads, head_size, block_size, num_blocks, k_seqlen, dtype, mask_dim, dtype) + + OP_NAME = "PagedAttentionOperation" + + PARAM = json.dumps({"headNum": num_heads, "qkScale": tor, "kvHeadNum": kv_heads, "maskType":2}) + RUN_PARAM = json.dumps({"contextLens": self.contex_lens.tolist(), "maskType":2}) + self.execute_with_param(OP_NAME, PARAM, RUN_PARAM, + [ + self.q.npu(), + self.key_cache.npu(), + self.value_cache.npu(), + torch.from_numpy(self.block_tables.astype(np.int32)).npu(), + torch.from_numpy(self.contex_lens).npu(), + self.mask.npu() + ]) + + def test_paged_bf16_alibmask(self): + if not operation_test.get_soc_version() == 'Ascend910B': + print("this testcase only supports Ascend910B") + return + num_tokens = 2 + num_heads = 32 + kv_heads = 16 + block_size = 128 + head_size = 288 + num_blocks = 64 + k_seqlen = 256 + tor = float(1.0 / (head_size ** 0.5)) + mask_dim = 4 + dtype = torch.bfloat16 + + self.calc_data(num_tokens, num_heads, kv_heads, head_size, block_size, num_blocks, k_seqlen, dtype, mask_dim, dtype) + + OP_NAME = "PagedAttentionOperation" + + PARAM = json.dumps({"headNum": num_heads, "qkScale": tor, "kvHeadNum": kv_heads, "maskType":2}) + RUN_PARAM = json.dumps({"contextLens": self.contex_lens.tolist(), 
"maskType":2}) + self.execute_with_param(OP_NAME, PARAM, RUN_PARAM, + [ + self.q.npu(), + self.key_cache.npu(), + self.value_cache.npu(), + torch.from_numpy(self.block_tables.astype(np.int32)).npu(), + torch.from_numpy(self.contex_lens).npu(), + self.mask.npu() + ]) + + def test_paged_fp16_dequant_nomask(self): + if not operation_test.get_soc_version() == 'Ascend910B': + print("this testcase only supports Ascend910B") + return + num_tokens = 2 + num_heads = 32 + kv_heads = 16 + block_size = 128 + head_size = 128 + num_blocks = 64 + k_seqlen = 256 + tor = float(1.0 / (head_size ** 0.5)) + mask_dim = 0 + dynamic_batch = False + is_int8_flag = True + has_bias = True + dtype = torch.float16 + + self.calc_data(num_tokens, num_heads, kv_heads, head_size, block_size, num_blocks, k_seqlen, dtype, mask_dim, dtype, + is_int8_flag=is_int8_flag, has_bias=has_bias) + + OP_NAME = "PagedAttentionOperation" + + PARAM = json.dumps({"headNum": num_heads, "qkScale": tor, "kvHeadNum": kv_heads, "maskType":0, "quantType":1, "hasQuantOffset":True}) + RUN_PARAM = json.dumps({"contextLens": self.contex_lens.tolist(), "maskType":0}) + self.execute_with_param(OP_NAME, PARAM, RUN_PARAM, + [ + self.q.npu(), + self.key_cache.npu(), + self.value_cache.npu(), + torch.from_numpy(self.block_tables.astype(np.int32)).npu(), + torch.from_numpy(self.contex_lens).npu(), + self.de_scale1_int64.npu(), + self.offset1.npu(), + self.de_scale2_int64.npu(), + self.offset2.npu(), + ]) + + def test_paged_bf16_dequant_nomask(self): + if not operation_test.get_soc_version() == 'Ascend910B': + print("this testcase only supports Ascend910B") + return + num_tokens = 2 + num_heads = 32 + kv_heads = 16 + block_size = 128 + head_size = 128 + num_blocks = 64 + k_seqlen = 256 + tor = float(1.0 / (head_size ** 0.5)) + mask_dim = 0 + dynamic_batch = False + is_int8_flag = True + has_bias = True + dtype = torch.bfloat16 + + self.calc_data(num_tokens, num_heads, kv_heads, head_size, block_size, num_blocks, k_seqlen, dtype, 
mask_dim, dtype, + is_int8_flag=is_int8_flag, has_bias=has_bias) + + OP_NAME = "PagedAttentionOperation" + + PARAM = json.dumps({"headNum": num_heads, "qkScale": tor, "kvHeadNum": kv_heads, "maskType":0, "quantType":1, "hasQuantOffset":True}) + RUN_PARAM = json.dumps({"contextLens": self.contex_lens.tolist(), "maskType":0}) + self.execute_with_param(OP_NAME, PARAM, RUN_PARAM, + [ + self.q.npu(), + self.key_cache.npu(), + self.value_cache.npu(), + torch.from_numpy(self.block_tables.astype(np.int32)).npu(), + torch.from_numpy(self.contex_lens).npu(), + self.de_scale1_fp32.npu(), + self.offset1.npu(), + self.de_scale2_fp32.npu(), + self.offset2.npu(), + ]) + + def test_paged_fp16_dequant_normmask(self): + if not operation_test.get_soc_version() == 'Ascend910B': + print("this testcase only supports Ascend910B") + return + num_tokens = 2 + num_heads = 32 + kv_heads = 16 + block_size = 128 + head_size = 128 + num_blocks = 64 + k_seqlen = 256 + tor = float(1.0 / (head_size ** 0.5)) + mask_dim = 3 + dynamic_batch = False + is_int8_flag = True + has_bias = True + dtype = torch.float16 + + self.calc_data(num_tokens, num_heads, kv_heads, head_size, block_size, num_blocks, k_seqlen, dtype, mask_dim, dtype, + is_int8_flag=is_int8_flag, has_bias=has_bias) + + OP_NAME = "PagedAttentionOperation" + + PARAM = json.dumps({"headNum": num_heads, "qkScale": tor, "kvHeadNum": kv_heads, "maskType":1, "quantType":1, "hasQuantOffset":True}) + RUN_PARAM = json.dumps({"contextLens": self.contex_lens.tolist(), "maskType":1}) + self.execute_with_param(OP_NAME, PARAM, RUN_PARAM, + [ + self.q.npu(), + self.key_cache.npu(), + self.value_cache.npu(), + torch.from_numpy(self.block_tables.astype(np.int32)).npu(), + torch.from_numpy(self.contex_lens).npu(), + self.mask.npu(), + self.de_scale1_int64.npu(), + self.offset1.npu(), + self.de_scale2_int64.npu(), + self.offset2.npu(), + ]) + + def test_paged_bf16_dequant_normmask(self): + if not operation_test.get_soc_version() == 'Ascend910B': + 
print("this testcase only supports Ascend910B") + return + num_tokens = 2 + num_heads = 32 + kv_heads = 16 + block_size = 128 + head_size = 128 + num_blocks = 64 + k_seqlen = 256 + tor = float(1.0 / (head_size ** 0.5)) + mask_dim = 3 + dynamic_batch = False + is_int8_flag = True + has_bias = True + dtype = torch.bfloat16 + + self.calc_data(num_tokens, num_heads, kv_heads, head_size, block_size, num_blocks, k_seqlen, dtype, mask_dim, dtype, + is_int8_flag=is_int8_flag, has_bias=has_bias) + + OP_NAME = "PagedAttentionOperation" + + PARAM = json.dumps({"headNum": num_heads, "qkScale": tor, "kvHeadNum": kv_heads, "maskType":1, "quantType":1, "hasQuantOffset":True}) + RUN_PARAM = json.dumps({"contextLens": self.contex_lens.tolist(), "maskType":1}) + self.execute_with_param(OP_NAME, PARAM, RUN_PARAM, + [ + self.q.npu(), + self.key_cache.npu(), + self.value_cache.npu(), + torch.from_numpy(self.block_tables.astype(np.int32)).npu(), + torch.from_numpy(self.contex_lens).npu(), + self.mask.npu(), + self.de_scale1_fp32.npu(), + self.offset1.npu(), + self.de_scale2_fp32.npu(), + self.offset2.npu(), + ]) + + def test_paged_fp16_dequant_alibmask(self): + if not operation_test.get_soc_version() == 'Ascend910B': + print("this testcase only supports Ascend910B") + return + num_tokens = 2 + num_heads = 32 + kv_heads = 16 + block_size = 128 + head_size = 128 + num_blocks = 64 + k_seqlen = 256 + tor = float(1.0 / (head_size ** 0.5)) + mask_dim = 4 + dynamic_batch = False + is_int8_flag = True + has_bias = True + dtype = torch.float16 + + self.calc_data(num_tokens, num_heads, kv_heads, head_size, block_size, num_blocks, k_seqlen, dtype, mask_dim, dtype, + is_int8_flag=is_int8_flag, has_bias=has_bias) + + OP_NAME = "PagedAttentionOperation" + + PARAM = json.dumps({"headNum": num_heads, "qkScale": tor, "kvHeadNum": kv_heads, "maskType":2, "quantType":1, "hasQuantOffset":True}) + RUN_PARAM = json.dumps({"contextLens": self.contex_lens.tolist(), "maskType":2}) + 
self.execute_with_param(OP_NAME, PARAM, RUN_PARAM, + [ + self.q.npu(), + self.key_cache.npu(), + self.value_cache.npu(), + torch.from_numpy(self.block_tables.astype(np.int32)).npu(), + torch.from_numpy(self.contex_lens).npu(), + self.mask.npu(), + self.de_scale1_int64.npu(), + self.offset1.npu(), + self.de_scale2_int64.npu(), + self.offset2.npu(), + ]) + + def test_paged_bf16_dequant_alibmask(self): + if not operation_test.get_soc_version() == 'Ascend910B': + print("this testcase only supports Ascend910B") + return + num_tokens = 2 + num_heads = 32 + kv_heads = 16 + block_size = 128 + head_size = 128 + num_blocks = 64 + k_seqlen = 256 + tor = float(1.0 / (head_size ** 0.5)) + mask_dim = 4 + dynamic_batch = False + is_int8_flag = True + has_bias = True + dtype = torch.bfloat16 + + self.calc_data(num_tokens, num_heads, kv_heads, head_size, block_size, num_blocks, k_seqlen, dtype, mask_dim, dtype, + is_int8_flag=is_int8_flag, has_bias=has_bias) + + OP_NAME = "PagedAttentionOperation" + + PARAM = json.dumps({"headNum": num_heads, "qkScale": tor, "kvHeadNum": kv_heads, "maskType":2, "quantType":1, "hasQuantOffset":True}) + RUN_PARAM = json.dumps({"contextLens": self.contex_lens.tolist(), "maskType":2}) + self.execute_with_param(OP_NAME, PARAM, RUN_PARAM, + [ + self.q.npu(), + self.key_cache.npu(), + self.value_cache.npu(), + torch.from_numpy(self.block_tables.astype(np.int32)).npu(), + torch.from_numpy(self.contex_lens).npu(), + self.mask.npu(), + self.de_scale1_fp32.npu(), + self.offset1.npu(), + self.de_scale2_fp32.npu(), + self.offset2.npu(), + ]) + + def test_paged_fp16_BNSD(self): + if not operation_test.get_soc_version() == 'Ascend910B': + print("this testcase only supports Ascend910B") + return + num_tokens = 2 + num_heads = 32 + kv_heads = 16 + block_size = 128 + head_size = 128 + num_blocks = 64 + k_seqlen = 256 + tor = float(1.0 / (head_size ** 0.5)) + mask_dim = 0 + dtype = torch.float16 + + self.calc_data_bnsd(num_tokens, num_heads, kv_heads, head_size, 
block_size, num_blocks, k_seqlen, dtype, mask_dim, dtype) + + OP_NAME = "PagedAttentionOperation" + + PARAM = json.dumps({"headNum": num_heads, "qkScale": tor, "kvHeadNum": kv_heads, "maskType":0, "inputLayout":1}) + RUN_PARAM = json.dumps({"contextLens": self.contex_lens.tolist(), "maskType":0}) + self.execute_with_param(OP_NAME, PARAM, RUN_PARAM, + [ + self.q.npu(), + self.key_cache_bnsd.npu(), + self.value_cache_bnsd.npu(), + torch.from_numpy(self.block_tables.astype(np.int32)).npu(), + torch.from_numpy(self.contex_lens).npu() + ]) + + def test_paged_bf16_BNSD(self): + if not operation_test.get_soc_version() == 'Ascend910B': + print("this testcase only supports Ascend910B") + return + num_tokens = 2 + num_heads = 32 + kv_heads = 16 + block_size = 128 + head_size = 128 + num_blocks = 64 + k_seqlen = 256 + tor = float(1.0 / (head_size ** 0.5)) + mask_dim = 0 + dtype = torch.bfloat16 + + self.calc_data_bnsd(num_tokens, num_heads, kv_heads, head_size, block_size, num_blocks, k_seqlen, dtype, mask_dim, dtype) + + OP_NAME = "PagedAttentionOperation" + + PARAM = json.dumps({"headNum": num_heads, "qkScale": tor, "kvHeadNum": kv_heads, "maskType":0, "inputLayout":1}) + RUN_PARAM = json.dumps({"contextLens": self.contex_lens.tolist(), "maskType":0}) + self.execute_with_param(OP_NAME, PARAM, RUN_PARAM, + [ + self.q.npu(), + self.key_cache_bnsd.npu(), + self.value_cache_bnsd.npu(), + torch.from_numpy(self.block_tables.astype(np.int32)).npu(), + torch.from_numpy(self.contex_lens).npu() + ]) + + def test_paged_fp16_BNSD_normmask(self): + if not operation_test.get_soc_version() == 'Ascend910B': + print("this testcase only supports Ascend910B") + return + num_tokens = 2 + num_heads = 32 + kv_heads = 16 + block_size = 128 + head_size = 128 + num_blocks = 64 + k_seqlen = 256 + tor = float(1.0 / (head_size ** 0.5)) + mask_dim = 3 + dtype = torch.float16 + + self.calc_data_bnsd(num_tokens, num_heads, kv_heads, head_size, block_size, num_blocks, k_seqlen, dtype, mask_dim, dtype) + + 
OP_NAME = "PagedAttentionOperation" + + PARAM = json.dumps({"headNum": num_heads, "qkScale": tor, "kvHeadNum": kv_heads, "maskType":1, "inputLayout":1}) + RUN_PARAM = json.dumps({"contextLens": self.contex_lens.tolist(), "maskType":1}) + self.execute_with_param(OP_NAME, PARAM, RUN_PARAM, + [ + self.q.npu(), + self.key_cache_bnsd.npu(), + self.value_cache_bnsd.npu(), + torch.from_numpy(self.block_tables.astype(np.int32)).npu(), + torch.from_numpy(self.contex_lens).npu(), + self.mask.npu() + ]) + + def test_paged_bf16_BNSD_normmask(self): + if not operation_test.get_soc_version() == 'Ascend910B': + print("this testcase only supports Ascend910B") + return + num_tokens = 2 + num_heads = 32 + kv_heads = 16 + block_size = 128 + head_size = 128 + num_blocks = 64 + k_seqlen = 256 + tor = float(1.0 / (head_size ** 0.5)) + mask_dim = 3 + dtype = torch.bfloat16 + + self.calc_data_bnsd(num_tokens, num_heads, kv_heads, head_size, block_size, num_blocks, k_seqlen, dtype, mask_dim, dtype) + + OP_NAME = "PagedAttentionOperation" + + PARAM = json.dumps({"headNum": num_heads, "qkScale": tor, "kvHeadNum": kv_heads, "maskType":1, "inputLayout":1}) + RUN_PARAM = json.dumps({"contextLens": self.contex_lens.tolist(), "maskType":1}) + self.execute_with_param(OP_NAME, PARAM, RUN_PARAM, + [ + self.q.npu(), + self.key_cache_bnsd.npu(), + self.value_cache_bnsd.npu(), + torch.from_numpy(self.block_tables.astype(np.int32)).npu(), + torch.from_numpy(self.contex_lens).npu(), + self.mask.npu() + ]) + + def test_paged_fp16_quant_case_normal_mask(self): + if not operation_test.get_soc_version() == 'Ascend910B': + print("this testcase only supports Ascend910B") + return + num_tokens = 9 + num_heads = 32 + kv_heads = 2 + block_size = 128 + head_size = 128 + num_blocks = 64 + dynamic_batch = True + batch_tatus = [1] * num_tokens + k_seqlen = [3000, 300, 14000, 33, 65, 1, 16, 14000, 300] + tor = 1.0 / (head_size ** 0.5) + dtype = torch.float16 + outDtype = torch.float16 + mask_dim = 3 + is_quant_flag = 
1 + + self.calc_data(num_tokens, num_heads, kv_heads, head_size, block_size, num_blocks, k_seqlen, dtype, mask_dim, outDtype, + dynamic_batch, k_seqlen, is_quant_flag = is_quant_flag) + + OP_NAME = "PagedAttentionOperation" + + PARAM = json.dumps({"headNum": num_heads, "qkScale": tor, "kvHeadNum": kv_heads, "maskType":1, "quantType":3, "outDataType": 1}) + RUN_PARAM = json.dumps({"contextLens": self.contex_lens.tolist(), "maskType":1}) + self.execute_with_param(OP_NAME, PARAM, RUN_PARAM, + [ + self.q.npu(), + self.key_cache.npu(), + self.value_cache.npu(), + torch.from_numpy(self.block_tables.astype(np.int32)).npu(), + torch.from_numpy(self.contex_lens).npu(), + self.mask.npu(), + self.de_scale1_fp32.npu(), + self.de_scale2_fp32.npu(), + ]) + + def test_paged_bf16_quant_case_normal_mask(self): + if not operation_test.get_soc_version() == 'Ascend910B': + print("this testcase only supports Ascend910B") + return + num_tokens = 9 + num_heads = 32 + kv_heads = 2 + block_size = 128 + head_size = 128 + num_blocks = 64 + dynamic_batch = True + batch_tatus = [1] * num_tokens + k_seqlen = [3000, 300, 14000, 33, 65, 1, 16, 14000, 300] + tor = 1.0 / (head_size ** 0.5) + dtype = torch.bfloat16 + outDtype = torch.bfloat16 + mask_dim = 3 + is_quant_flag = 1 + + self.calc_data(num_tokens, num_heads, kv_heads, head_size, block_size, num_blocks, k_seqlen, dtype, mask_dim, outDtype, + dynamic_batch, k_seqlen, is_quant_flag = is_quant_flag) + + OP_NAME = "PagedAttentionOperation" + + PARAM = json.dumps({"headNum": num_heads, "qkScale": tor, "kvHeadNum": kv_heads, "maskType":1, "quantType":3, "outDataType": 27}) + RUN_PARAM = json.dumps({"contextLens": self.contex_lens.tolist(), "maskType":1}) + self.execute_with_param(OP_NAME, PARAM, RUN_PARAM, + [ + self.q.npu(), + self.key_cache.npu(), + self.value_cache.npu(), + torch.from_numpy(self.block_tables.astype(np.int32)).npu(), + torch.from_numpy(self.contex_lens).npu(), + self.mask.npu(), + self.de_scale1_fp32.npu(), + 
self.de_scale2_fp32.npu(), + ]) + + def test_paged_fp16_quantoffline_case_normal_mask(self): + if not operation_test.get_soc_version() == 'Ascend910B': + print("this testcase only supports Ascend910B") + return + num_tokens = 9 + num_heads = 2 + kv_heads = 2 + block_size = 16 + head_size = 128 + num_blocks = 64 + dynamic_batch = True + batch_tatus = [1] * num_tokens + k_seqlen = [3000, 300, 14000, 33, 65, 1, 16, 14000, 300] + tor = 1.0 / (head_size ** 0.5) + dtype = torch.float16 + outDtype = torch.float16 + mask_dim = 3 + is_quant_flag = 1 + + self.calc_data(num_tokens, num_heads, kv_heads, head_size, block_size, num_blocks, k_seqlen, dtype, mask_dim, outDtype, + dynamic_batch, k_seqlen, is_quant_flag = is_quant_flag, is_quant_offiline = 1) + + OP_NAME = "PagedAttentionOperation" + + PARAM = json.dumps({"headNum": num_heads, "qkScale": tor, "kvHeadNum": kv_heads, "maskType":1, "quantType":2, "outDataType": 1}) + RUN_PARAM = json.dumps({"contextLens": self.contex_lens.tolist(), "maskType":1}) + self.execute_with_param(OP_NAME, PARAM, RUN_PARAM, + [ + self.q.npu(), + self.key_cache.npu(), + self.value_cache.npu(), + torch.from_numpy(self.block_tables.astype(np.int32)).npu(), + torch.from_numpy(self.contex_lens).npu(), + self.mask.npu(), + self.de_scale1_fp32.npu(), + self.de_scale2_fp32.npu(), + self.scale.npu(), + ]) + + def test_paged_bf16_quantoffline_case_normal_mask(self): + if not operation_test.get_soc_version() == 'Ascend910B': + print("this testcase only supports Ascend910B") + return + num_tokens = 9 + num_heads = 2 + kv_heads = 2 + block_size = 16 + head_size = 128 + num_blocks = 64 + dynamic_batch = True + batch_tatus = [1] * num_tokens + k_seqlen = [3000, 300, 14000, 33, 65, 1, 16, 14000, 300] + tor = 1.0 / (head_size ** 0.5) + dtype = torch.bfloat16 + outDtype = torch.bfloat16 + mask_dim = 3 + is_quant_flag = 1 + + self.calc_data(num_tokens, num_heads, kv_heads, head_size, block_size, num_blocks, k_seqlen, dtype, mask_dim, outDtype, + dynamic_batch, 
k_seqlen, is_quant_flag = is_quant_flag, is_quant_offiline = 1) + + OP_NAME = "PagedAttentionOperation" + + PARAM = json.dumps({"headNum": num_heads, "qkScale": tor, "kvHeadNum": kv_heads, "maskType":1, "quantType":2, "outDataType": 27}) + RUN_PARAM = json.dumps({"contextLens": self.contex_lens.tolist(), "maskType":1}) + self.execute_with_param(OP_NAME, PARAM, RUN_PARAM, + [ + self.q.npu(), + self.key_cache.npu(), + self.value_cache.npu(), + torch.from_numpy(self.block_tables.astype(np.int32)).npu(), + torch.from_numpy(self.contex_lens).npu(), + self.mask.npu(), + self.de_scale1_fp32.npu(), + self.de_scale2_fp32.npu(), + self.scale.npu(), + ]) +if __name__ == '__main__': + unittest.main() \ No newline at end of file -- Gitee From a48a3857394fbce98606e69f8473162602aa84c2 Mon Sep 17 00:00:00 2001 From: ivanshan_8170 Date: Wed, 24 Sep 2025 18:57:09 +0800 Subject: [PATCH 88/94] bug: mla remove nz for ring --- .../multi_latent_attention_operation.cpp | 7 - .../test_multi_latent_attention_lse.py | 470 ++++++++++++++++++ 2 files changed, 470 insertions(+), 7 deletions(-) create mode 100644 tests/apitest/opstest/python/operations/multi_latent_attention/test_multi_latent_attention_lse.py diff --git a/src/ops_infer/multi_latent_attention/multi_latent_attention_operation.cpp b/src/ops_infer/multi_latent_attention/multi_latent_attention_operation.cpp index 8bdfceaf..35e0643c 100644 --- a/src/ops_infer/multi_latent_attention/multi_latent_attention_operation.cpp +++ b/src/ops_infer/multi_latent_attention/multi_latent_attention_operation.cpp @@ -106,13 +106,6 @@ static bool ParamCheck(const infer::MultiLatentAttentionParam &opParam) ATB_LOG(ERROR) << "only mtp(CALC_TYPE_SPEC) support mask"; return false; } - if ((opParam.calcType == infer::MultiLatentAttentionParam::CalcType::CALC_TYPE_RING || - opParam.calcType == infer::MultiLatentAttentionParam::CalcType::CALC_TYPE_SPEC_AND_RING) && - (opParam.cacheMode != infer::MultiLatentAttentionParam::CacheMode::KROPE_CTKV && - 
opParam.cacheMode != infer::MultiLatentAttentionParam::CacheMode::INT8_NZCACHE)) { - ATB_LOG(ERROR) << "CalcType is ring only support krppe ctkv and int8 nzcache"; - return false; - } if ((opParam.cacheMode == infer::MultiLatentAttentionParam::CacheMode::INT8_NZCACHE) && (opParam.calcType == infer::MultiLatentAttentionParam::CalcType::CALC_TYPE_RING || opParam.calcType == infer::MultiLatentAttentionParam::CalcType::CALC_TYPE_SPEC_AND_RING) && diff --git a/tests/apitest/opstest/python/operations/multi_latent_attention/test_multi_latent_attention_lse.py b/tests/apitest/opstest/python/operations/multi_latent_attention/test_multi_latent_attention_lse.py new file mode 100644 index 00000000..27bba90a --- /dev/null +++ b/tests/apitest/opstest/python/operations/multi_latent_attention/test_multi_latent_attention_lse.py @@ -0,0 +1,470 @@ +# Copyright (c) 2024 Huawei Technologies Co., Ltd. +# This file is a part of the CANN Open Software. +# Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. 
+# + +import logging +import sys +import os +import unittest +import math +import numpy as np +import torch +import random +import json +import torch.nn.functional as F +import torch_npu +sys.path.append(os.path.join(os.path.dirname(__file__), "../")) +import operation_test +from precision_calcu import * + +torch.set_printoptions(precision=4, sci_mode=False) +# torch_npu.npu.set_device() + +class TestPagedAttentionMLA(operation_test.OperationTest): + + def compare_output_data(self, out, golden, ratios): + error_count = 0 + strict_error_count = 0 + fp16_min_normal = 1.0 / (1 << 14) + golden = golden.flatten().to(torch.float32) + out = out.flatten().to(torch.float32) + len = out.shape[0] + diff = torch.abs(golden - out) + max_diff = diff.max().item() + limit_error = torch.maximum(torch.abs(golden * ratios[0]), torch.tensor(ratios[1])) + strict_limit_error = torch.maximum(torch.abs(golden * ratios[2]), torch.tensor(ratios[3])) + error_count = torch.gt(diff, limit_error).sum().item() + strict_error_count = torch.gt(diff, strict_limit_error).sum().item() + logging.info(f"maxDiff {max_diff}") + logging.info("1/1000 Accuracy is %f", 1 - float(error_count) / len) + logging.info("5/1000 Accuracy is %f", 1 - float(strict_error_count) / len) + if self.data_type == torch.bfloat16 or self.is_int8_flag: + logging.info("accuracy is correct in old standard: %r", (float(strict_error_count) / len) <= ratios[2]) + else: + logging.info("accuracy is correct in old standard: %r", (float(strict_error_count) / len) <= ratios[0]) + calc_times = self.head_size_qk * self.max_context_len + 4 + if self.data_type == torch.bfloat16: + if calc_times < 2048: + error = 2 ** (-7) + else: + error = 2 ** (-6) + error_threshold = torch.clamp(torch.abs(golden), min=1) * error + res = (diff <= error_threshold).all().item() + logging.debug("accuracy is correct in new standard: %r", res) + return res + else: + if calc_times < 2048: + error = 2 ** (-8) + else: + error = 2 ** (-7) + error_threshold = 
torch.clamp(torch.abs(golden), min=1) * error + res = (diff <= error_threshold).all().item() + logging.debug("accuracy is correct in new standard: %r", res) + return res + + def get_alibi_slopes(self, n_heads): + n = 2 ** math.floor(math.log2(n_heads)) + m0 = 2.0 ** (-8.0 / n) + slopes = torch.pow(m0, torch.arange(1, n + 1)) + if n < n_heads: + m1 = 2.0 ** (-4.0 / n) + mm = torch.pow(m1, torch.arange(1, 1 + 2 * (n_heads - n), 2)) + slopes = torch.cat([slopes, mm]) + # slopes = torch.ones(n_heads) + return slopes + + def group_mm_torch(self, heads, group_num, A, B, is_k): + group_head = heads // group_num + score_high = None + for i in range(group_num): + if self.is_int8_flag: + int8_B = B[i: (i + 1), :, :, ] + head_dim = int8_B.shape[2] + int32_B = torch.matmul(torch.eye(int8_B.shape[1]).to(torch.float32), int8_B.to(torch.float32)).to( + torch.int32) + if is_k: + if self.has_bias: + int32_B = int32_B + self.offset1[i * head_dim:(i + 1) * head_dim] + fp32_B = int32_B.to(torch.float32) * self.de_scale1_fp32[i * head_dim:(i + 1) * head_dim] + fp32_B = torch.permute(fp32_B, (0, 2, 1)) + else: + if self.has_bias: + int32_B = int32_B + self.offset2[i * head_dim:(i + 1) * head_dim] + fp32_B = int32_B.to(torch.float32) * self.de_scale2_fp32[i * head_dim:(i + 1) * head_dim] + group_score_high = torch.matmul(A[i * group_head: (i + 1) * group_head, :, :].to(torch.float32), + fp32_B) + else: + group_score_high = torch.matmul(A[i * group_head: (i + 1) * group_head, :, :].to(torch.float32), + B[i:(i + 1), :, :].to(torch.float32)) + if score_high is None: + score_high = group_score_high + else: + score_high = torch.cat((score_high, group_score_high), 0) + return score_high + + def process_deq_scale(self, deq_scale) -> np.ndarray: + new_deq_scale = np.frombuffer(deq_scale.tobytes(), dtype=np.uint32) + return new_deq_scale.astype(np.uint64) + + def softmax(self, sim): + row_max = torch.max(sim, axis=-1, keepdims=True)[0] + sim_sub = sim - row_max + sim_sub = torch.exp(sim_sub) + 
row_sum = torch.sum(sim_sub, axis=-1, keepdims=True) + soft_res = sim_sub / row_sum + return soft_res + + def softmax_numpy(self, sim): + sim = sim.cpu().numpy() + row_max = np.max(sim, axis=-1, keepdims=True) + sim_sub = sim - row_max + sim_sub = np.exp(sim_sub) + # print(sim_sub) + row_sum = np.sum(sim_sub, axis=-1, keepdims=True) + soft_res = sim_sub / row_sum + return soft_res, row_max + np.log(row_sum) + + def shape_nd_to_nz(self, shape, dtype='float16'): + assert len(shape) >= 2 + batch = shape[:-2] # 最后两维nd->nz + a, b = shape[-2], shape[-1] + a0, b0 = 16, 16 + return list(batch) + [math.ceil(b / b0), math.ceil(a / a0), a0, b0] + + def gen_axes_for_transpose(self,offset, base): + return [x for x in range(offset)] + [x + offset for x in base] + + def convert_nd_to_nz(self, x): + array_trans = self.gen_axes_for_transpose(len(x.shape) - 2, [2, 0, 1, 3]) # (m1, m0, n1, n0) -> (n1, m1, m0, n0) + x_shape = self.shape_nd_to_nz(x.shape, dtype=x.dtype) + *_, n1, m1, m0, n0 = x_shape + return x.reshape(x_shape[:-4] + [m1, m0, n1, n0]).permute(*array_trans) # x原始需要对齐,才能reshape + + def ref_masked_attention(self, + query, # (1, num_heads, head_size) + key, # (context_len, kv_heads, head_size) + value, + scale: float, + alibi_bias, + mask_data_type=torch.bfloat16 + ): + # Q * K.T + query = query + query = torch.permute(query, (1, 0, 2)) + if not self.is_int8_flag: + key = torch.permute(key, (1, 2, 0)) # 0 1 2 + else: + key = torch.permute(key, (1, 0, 2)) + sim_high = self.group_mm_torch(query.shape[0], key.shape[0], query, key, 1) # (head_num, q_seqlen, k_seqlen) + sim_out = sim_high.to(torch.float32) + sim_high = sim_high.to(torch.float32) * scale + if alibi_bias is not None: + sim_high = sim_high + alibi_bias.to(torch.float32) + # softmax + p_high, lse = self.softmax_numpy(sim_high) + p = torch.from_numpy(p_high).to(mask_data_type) + p_high = torch.from_numpy(p_high) + + lse = torch.permute(torch.from_numpy(lse).to(mask_data_type), (1, 0, 2)) # (q_seqlen, head_num, 1) + 
+ # P * V + value = torch.permute(value, (1, 0, 2)) + out = self.group_mm_torch(query.shape[0], key.shape[0], p, value, 0) + out_high = self.group_mm_torch(query.shape[0], key.shape[0], p_high, value, 0) + out = torch.permute(out, (1, 0, 2)) + out_high = torch.permute(out_high, (1, 0, 2)) + sim_out = torch.permute(sim_out, (1, 0, 2)) + return out, out_high, sim_out, lse + + def ref_single_query_cached_kv_attention(self, + sim, + output, + true_out, + lse, # (num_tokens, num_heads, 1) + query, + key_cache, # (num_blocks, block_size, num_heads, head_size) + value_cache, # (num_blocks, block_size, num_heads, head_size) + block_tables, + context_lens, + mask, + mask_dim=4, + mask_data_type=torch.bfloat16 + ) -> None: + mask_index_coff = 1 + if self.compressHead: + query = query.view(self.num_tokens * self.kv_heads, self.num_heads // self.kv_heads, self.head_size_qk) + output = output.view(self.num_tokens * self.kv_heads, self.num_heads // self.kv_heads, self.head_size_vo) + true_out = true_out.view(self.num_tokens * self.kv_heads, self.num_heads // self.kv_heads, + self.head_size_vo) + if mask_dim == 4: + mask_shape = mask.shape + mask = mask.view(mask_shape[0] * self.kv_heads, self.num_heads // self.kv_heads, 1, + self.max_context_len) + else: + mask_index_coff = self.kv_heads + num_heads = query.shape[1] + kv_heads = value_cache.shape[2] + head_size_qk = key_cache.shape[3] + head_size_vo = value_cache.shape[3] + block_size = value_cache.shape[1] + + num_input_tokens = query.shape[0] + index = 0 + for i in range(len(context_lens)): + block_table = block_tables[i] + context_len = int(context_lens[i]) + if context_len == 0: + continue + + q = query[index].view(1, num_heads, head_size_qk) + keys = [] + values = [] + for j in range(context_len): + block_number = int(block_table[j // block_size]) + block_offset = j % block_size + + k = key_cache[block_number, block_offset, :, :] + k = k.reshape(kv_heads, head_size_qk) + keys.append(k) + + v = value_cache[block_number, 
block_offset, :, :] + v = v.reshape(kv_heads, head_size_vo) + values.append(v) + keys = torch.stack(keys, axis=0) + values = torch.stack(values, axis=0) + scale = np.float32(1.0 / (head_size_qk ** 0.5)) + if mask_dim == 4: + out, out_high, sim_out, _ = self.ref_masked_attention(q, keys, values, scale, + mask[i, :, :, :context_len], mask_data_type) + out = out.reshape(num_heads, head_size_vo) + elif mask_dim == 3: + out, out_high, sim_out, _ = self.ref_masked_attention(q, keys, values, scale, + mask[i // mask_index_coff, :, :context_len], + mask_data_type) + out = out.reshape(num_heads, head_size_vo) + else: + out, out_high, sim_out, lse_i = self.ref_masked_attention(q, keys, values, scale, mask, + mask_data_type) + out = out.reshape(num_heads, head_size_vo) + lse_i = lse_i.reshape(num_heads, 1) + lse[index] = lse_i.to(mask_data_type) + out_high = out_high.reshape(num_heads, head_size_vo) + sim_out = sim_out.reshape(1, num_heads * context_len) + output[index] = out.to(mask_data_type) + true_out[index] = out_high + sim[index] = sim_out + index = index + 1 + + def calc_data(self, num_tokens, num_heads, kv_heads, head_size_qk, head_size_vo, block_size, num_blocks, k_seqlen,\ + dtype, mask_dim = 0, mask_data_type = torch.bfloat16,\ + dynamic_batch = False, dynamic_seqlen = None, is_int8_flag = False, has_bias = False, + compressHead = False, is_kv_combined = True, is_nz_in = False): + self.num_heads = num_heads + self.kv_heads = kv_heads + self.num_tokens = num_tokens + self.compressHead = compressHead + self.head_size_qk = head_size_qk + self.head_size_vo = head_size_vo + + logging.debug( + f'input info: {num_tokens}, {num_heads}, {kv_heads}, {head_size_qk}, {head_size_vo}, {block_size}, {num_blocks}, {k_seqlen}, {dtype}') + + q_range = 5.0 + query = torch.from_numpy(np.random.uniform(-q_range, q_range, size=(num_tokens, num_heads, head_size_qk))).to(dtype) + # (num_blocks, block_size, num_heads, head_size) + kv_range = 5.0 + kv_type = dtype + if is_int8_flag: + 
kv_type = torch.int8 + if not compressHead: + key_cache = torch.from_numpy( + np.random.uniform(-kv_range, kv_range, size=(num_blocks, block_size, kv_heads, head_size_qk))).to( + kv_type) + # (num_blocks, block_size, num_heads, head_size) + if not is_kv_combined: + value_cache = torch.from_numpy( + np.random.uniform(-kv_range, kv_range, size=(num_blocks, block_size, kv_heads, head_size_vo))).to( + kv_type) + else: + value_cache = key_cache[:, :, :, :head_size_vo] + else: + key_cache = torch.from_numpy( + np.random.uniform(-kv_range, kv_range, size=(num_blocks * kv_heads, block_size, 1, head_size_qk))).to( + kv_type) + # (num_blocks, block_size, num_heads, head_size) + if not is_kv_combined: + value_cache = torch.from_numpy(np.random.uniform(-kv_range, kv_range, size=( + num_blocks * kv_heads, block_size, 1, head_size_vo))).to(kv_type) + else: + value_cache = key_cache[:, :, :, :head_size_vo] + self.data_type = dtype + + if dynamic_batch: + context_lens = dynamic_seqlen + else: + context_lens = [k_seqlen] * num_tokens + max_context_len = max(context_lens) + self.max_context_len = max_context_len + batch = len(context_lens) + + # alibi mask + if mask_dim == 4: + mask = np.zeros((batch, num_heads, 1, self.max_context_len), dtype=np.float32) + alibi_slopes = self.get_alibi_slopes(num_heads) + for i, context_len in enumerate(context_lens): + if context_len == 0: + continue + position_ids = np.arange(context_len).astype(np.int32) + alibi_bias = (position_ids - context_len + 1).astype(np.float32) + alibi_bias = alibi_slopes.reshape(-1, 1, 1) * alibi_bias.reshape(1, 1, -1) # (head_num, 1, context) + mask[i, :, :, :context_len] = alibi_bias + mask = torch.from_numpy(mask).to(mask_data_type) + # normal mask + elif mask_dim == 3: + mask = np.zeros((batch, 1, max_context_len), dtype=np.float16) + for i in range(batch): + mask[i, :, :i] = -10000 + mask = torch.from_numpy(mask).to(mask_data_type) + else: # no mask + mask = None + + if compressHead: + context_lens = [val for val 
in context_lens for _ in range(kv_heads)] + batch = len(context_lens) + max_num_blocks_per_seq = (max_context_len + block_size - 1) // block_size + block_tables = [] # (num_tokens, max_num_blocks_per_seq) + for i in range(batch): + block_table = [ + i * max_num_blocks_per_seq + _ for _ in range(max_num_blocks_per_seq) + ] + block_tables.append(block_table) + + self.is_int8_flag = is_int8_flag + if is_int8_flag: + de_scale1_fp32 = np.random.randint(-1, 2, size=(kv_heads * head_size)).astype(np.float32) + de_scale1_int64 = self.process_deq_scale(de_scale1_fp32) + + de_scale2_fp32 = np.random.randint(-1, 2, size=(kv_heads * head_size)).astype(np.float32) + de_scale2_int64 = self.process_deq_scale(de_scale2_fp32) + + offset1 = np.random.randint(-20, 20, size=(kv_heads * head_size)).astype(np.int32) + + offset2 = np.random.randint(-20, 20, size=(kv_heads * head_size)).astype(np.int32) + + self.de_scale1_int64 = torch.tensor(list(de_scale1_int64), dtype=torch.int64) + self.de_scale2_int64 = torch.tensor(list(de_scale2_int64), dtype=torch.int64) + self.de_scale1_fp32 = torch.from_numpy(de_scale1_fp32) + self.de_scale2_fp32 = torch.from_numpy(de_scale2_fp32) + self.offset1 = torch.from_numpy(offset1) + self.offset2 = torch.from_numpy(offset2) + self.has_bias = has_bias + + shape_out = (num_tokens, num_heads, head_size_vo) + ref_output = torch.zeros(shape_out, dtype=dtype) + true_out = torch.zeros(shape_out, dtype=torch.float32) + sim = torch.zeros((num_tokens, num_heads * k_seqlen), dtype=torch.float32) + lse = torch.zeros((num_tokens, num_heads, 1), dtype=dtype) + self.ref_single_query_cached_kv_attention( + sim, + ref_output, + true_out, + lse, + query, + key_cache, + value_cache, + block_tables, + context_lens, + mask, + mask_dim, + mask_data_type + ) + + self.q_split1, self.q_split2 = torch.split(query, [512, 64], dim=2) + self.key_cache_split1, self.key_cache_split2 = torch.split(key_cache, [512, 64], dim=3) + self.value_cache = value_cache + + if (is_nz_in): + 
key_cache_split1, key_cache_split2 = torch.split(key_cache, [512, 64], dim=3) + key_cache_split1 = key_cache_split1.reshape(num_blocks, block_size, -1) + key_cache_split2 = key_cache_split2.reshape(num_blocks, block_size, -1) + key_cache_split1_nz = self.convert_nd_to_nz(key_cache_split1) + key_cache_split2_nz = self.convert_nd_to_nz(key_cache_split2) + self.key_cache_split1 = key_cache_split1_nz.to(mask_data_type).reshape(num_blocks, -1, block_size, 16) + self.key_cache_split2 = key_cache_split2_nz.to(mask_data_type).reshape(num_blocks, -1, block_size, 16) + + self.block_tables = np.array(block_tables).astype(np.int32) + self.contex_lens = np.array(context_lens).astype(np.int32) + self.alib_mask = mask + self.golden_out = ref_output + self.true_out = true_out + self.lse = lse + + def golden_calc(self, in_tensors): + golden_out = torch.tensor(self.golden_out) + return [golden_out, self.lse] + + def golden_compare(self, out_tensors, golden_tensors): + go_double = compare_cv(self.true_out, golden_tensors[0].cpu(), out_tensors[0].cpu()) + result_old = self.compare_output_data(out_tensors[0].npu(), golden_tensors[0].npu(), [0.001, 0.001, 0.005, 0.005]) + lse_double = True + lse_old = True + if self.is_ring: + lse_double = compare_cv(golden_tensors[1].npu(), golden_tensors[1].npu(), out_tensors[1].npu()) + lse_old = self.compare_output_data(out_tensors[1].npu(), golden_tensors[1].npu(), [0.001, 0.001, 0.005, 0.005]) + return (result_old) and (lse_double or lse_old) + + def test_paged_mla_combine_cache_norm_128_nz(self): + if not operation_test.get_soc_version() == 'Ascend910B': + print("this testcase only supports Ascend910B") + return + num_tokens = 32 + num_heads = 32 + kv_heads = 1 + block_size = 128 + head_size_qk = 576 + head_size_vo = 512 + num_blocks = 64 + k_seqlen = 256 + tor = 1.0 / (head_size_qk ** 0.5) + mask_dim = 0 + dtype = torch.float16 + is_kv_combined = True + self.is_ring = 1 + is_nz_in = True + + self.calc_data(num_tokens, num_heads, kv_heads, 
head_size_qk, head_size_vo, block_size, num_blocks, k_seqlen, + dtype, mask_dim, dtype, + is_kv_combined=is_kv_combined, is_nz_in=is_nz_in) + + OP_NAME = "MLAOperation" + OP_PARAM = {"type": 0, "kvHead": kv_heads, "headSize": num_heads, "tor": tor, + "kvSeqLen": self.contex_lens.tolist(), "isRing": self.is_ring} + logging.debug(f"blcok_tables shape: {self.block_tables}") + logging.debug(f"contex_lens shape: {self.contex_lens}") + logging.debug(f"numTokens: {num_tokens}, numHeads: {num_heads}, kvHead: {kv_heads}" + f", blockSize: {block_size}, headSizeQK: {head_size_qk}, headSizeVO: {head_size_vo}, numBlocks: {num_blocks}") + logging.info(f"Q1 shape: {self.q_split1.shape}") + logging.info(f"Q2 shape: {self.q_split2.shape}") + logging.info(f"K1 shape: {self.key_cache_split1.shape}") + logging.info(f"K2 shape: {self.key_cache_split2.shape}") + + OP_NAME = "MultiLatentAttentionOperation" + PARAM = json.dumps({"headNum": num_heads, "qkScale":tor, "kvHeadNum":kv_heads, "maskType": 0, "cacheMode": 3, "calcType": 2}) + RUN_PARAM = json.dumps({"contextLens": self.contex_lens.tolist()}) + self.execute_with_param(OP_NAME, PARAM, RUN_PARAM, + [ + self.q_split1.npu(), + self.q_split2.npu(), + torch.tensor(self.key_cache_split1).npu(), + torch.tensor(self.key_cache_split2).npu(), + torch.tensor(self.block_tables).int().npu(), + torch.tensor(self.contex_lens).npu() + ]) + +if __name__ == '__main__': + unittest.main() -- Gitee From 60c06b4c9e3af50357453e4cb2423bad36f5c443 Mon Sep 17 00:00:00 2001 From: ivanshan_8170 Date: Wed, 24 Sep 2025 20:12:21 +0800 Subject: [PATCH 89/94] remove test case --- tests/apitest/opstest/csv/multi_latent_attention.csv | 2 -- 1 file changed, 2 deletions(-) diff --git a/tests/apitest/opstest/csv/multi_latent_attention.csv b/tests/apitest/opstest/csv/multi_latent_attention.csv index 09c3ff7f..f5a43959 100644 --- a/tests/apitest/opstest/csv/multi_latent_attention.csv +++ b/tests/apitest/opstest/csv/multi_latent_attention.csv @@ -1,14 +1,12 @@ CaseNum 
|CaseName |OpName |OpParam |InNum |InDType |InFormat |InShape |OutNum |OutDType |OutFormat |OutShape |DataGenType |DataGenRange |InTensorFile |OutTensorFile |TestType |TestLevel |FromModel |SocVersion |ExpectedError 1 |MultiLatentAttentionBadCaseHeadNum |MultiLatentAttentionOperation |{"maskType":1,"calcType":2,"cacheMode":1} | 6 |float16;float16;float16;float16;int32;int32 |nd;nd;nd;nd;nd;nd|32,32,512;32,32,64;64,128,1,512;64,128,1,64;32,2;32|2|float16;float16|nd;nd|32,32,512;32,32,1|random;random;random;random;random;random|-100,100;-100,100;-100,100;-100,100;-100,100;-100,100| | | | | |Ascend910B |C:ERROR_INVALID_PARAM 2 |MultiLatentAttentionBadCaseKvHeadNumNot1 |MultiLatentAttentionOperation |{"maskType":0,"calcType":2,"cacheMode":3,"headNum":8,"kvHeadNum":2} | 6 |float16;float16;float16;float16;int32;int32 |nd;nd;nd;nd;nd;nd|32,32,512;32,32,64;64,128,1,512;64,128,1,64;32,2;32|2 |float16;float16|nd;nd|32,32,512;32,32,1|random;random;random;random;random;random|-100,100;-100,100;-100,100;-100,100;-100,100;-100,100| | | | | |Ascend910B |C:ERROR_INVALID_PARAM -3 |MultiLatentAttentionBadCaseInt8NzCacheHeadNum128 |MultiLatentAttentionOperation |{"maskType":0,"calcType":2,"cacheMode":3,"headNum":128,"kvHeadNum":1} | 6 |float16;float16;float16;float16;int32;int32 |nd;nd;nd;nd;nd;nd|32,32,512;32,32,64;64,128,1,512;64,128,1,64;32,2;32|1 |float16|nd|32,32,512|random;random;random;random;random;random|-100,100;-100,100;-100,100;-100,100;-100,100;-100,100| | | | | |Ascend910B |C:ERROR_INVALID_PARAM 4 |MultiLatentAttentionErrorQkScale |MultiLatentAttentionOperation |{"maskType":0,"calcType":2,"cacheMode":1,"headNum":16,"kvHeadNum":1,"qkScale":100} | 6 |float16;float16;float16;float16;int32;int32 |nd;nd;nd;nd;nd;nd|32,32;32,32,64;64,128,1,512;64,128,1,64;32,2;32|2 |float16;float16|nd;nd|32,32,512;32,32,1|random;random;random;random;random;random|-100,100;-100,100;-100,100;-100,100;-100,100;-100,100| | | | | |Ascend910B |C:ERROR_INVALID_PARAM 5 
|MultiLatentAttentionErrorMaskType |MultiLatentAttentionOperation |{"maskType":3,"calcType":2,"cacheMode":1,"headNum":16,"kvHeadNum":1} | 6 |float16;float16;float16;float16;int32;int32 |nd;nd;nd;nd;nd;nd|32,32;32,32,64;64,128,1,512;64,128,1,64;32,2;32|2 |float16;float16|nd;nd|32,32,512;32,32,1|random;random;random;random;random;random|-100,100;-100,100;-100,100;-100,100;-100,100;-100,100| | | | | |Ascend910B |C:ERROR_INVALID_PARAM 6 |MultiLatentAttentionInvalidCalcType |MultiLatentAttentionOperation |{"maskType":0,"calcType":5,"cacheMode":3,"headNum":16,"kvHeadNum":1} | 6 |float16;float16;float16;float16;int32;int32 |nd;nd;nd;nd;nd;nd|32,32;32,32,64;64,128,1,512;64,128,1,64;32,2;32|2 |float16;float16|nd;nd|32,32,512;32,32,1|random;random;random;random;random;random|-100,100;-100,100;-100,100;-100,100;-100,100;-100,100| | | | | |Ascend910B |C:ERROR_INVALID_PARAM 7 |MultiLatentAttentionInvalidCacheType |MultiLatentAttentionOperation |{"maskType":0,"calcType":2,"cacheMode":4,"headNum":16,"kvHeadNum":1} | 6 |float16;float16;float16;float16;int32;int32 |nd;nd;nd;nd;nd;nd|32,32;32,32,64;64,128,1,512;64,128,1,64;32,2;32|2 |float16;float16|nd;nd|32,32,512;32,32,1|random;random;random;random;random;random|-100,100;-100,100;-100,100;-100,100;-100,100;-100,100| | | | | |Ascend910B |C:ERROR_INVALID_PARAM 8 |MultiLatentAttentionBadCaseNotSupportedKvCache |MultiLatentAttentionOperation |{"maskType":0,"calcType":0,"cacheMode":0, "headNum": 8, "kvHeadNum": 1} | 6 |float16;float16;float16;float16;int32;int32 |nd;nd;nd;nd;nd;nd|32,32,512;32,32,64;64,128,1,512;64,128,1,64;32,2;32|1 |float16|nd|32,32,512|random;random;random;random;random;random|-100,100;-100,100;-100,100;-100,100;-100,100;-100,100| | | | | |Ascend910B |C:ERROR_INVALID_PARAM 9 |MultiLatentAttentionNoError |MultiLatentAttentionOperation |{"maskType":1,"calcType":2,"cacheMode":1,"headNum":16,"kvHeadNum":1} | 6 |float16;float16;float16;float16;int32;int32 
|nd;nd;nd;nd;nd;nd|32,32;32,32,64;64,128,1,512;64,128,1,64;32,2;32|2 |float16;float16|nd;nd|32,32,512;32,32,1|random;random;random;random;random;random|-100,100;-100,100;-100,100;-100,100;-100,100;-100,100| | | | | |Ascend910B |C:ERROR_INVALID_PARAM -10 |MultiLatentAttentionNoError |MultiLatentAttentionOperation |{"maskType":0,"calcType":2,"cacheMode":3,"headNum":16,"kvHeadNum":1} | 6 |float16;float16;float16;float16;int32;int32 |nd;nd;nd;nd;nd;nd|32,32;32,32,64;64,128,1,512;64,128,1,64;32,2;32|2 |float16;float16|nd;nd|32,32,512;32,32,1|random;random;random;random;random;random|-100,100;-100,100;-100,100;-100,100;-100,100;-100,100| | | | | |Ascend910B |C:ERROR_INVALID_PARAM 11 |MultiLatentAttentionWrongDimNum |MultiLatentAttentionOperation |{"maskType":0,"calcType":2,"cacheMode":1,"headNum":32,"kvHeadNum":1} | 6 |float16;float16;float16;float16;int32;int32 |nd;nd;nd;nd;nd;nd|32,32,512;32,32,64;64,128,1,512;64,128,1;32,2;32|2 |float16;float16|nd;nd|32,32,512;32,32,1|random;random;random;random;random;random|-100,100;-100,100;-100,100;-100,100;-100,100;-100,100| | | | | |Ascend910B |I:ERROR_INVALID_TENSOR_DIM_NUM 12 |MultiLatentAttentionErrorBatchExceeded |MultiLatentAttentionOperation |{"maskType":0,"calcType":2,"cacheMode":1,"headNum":32,"kvHeadNum":1} | 6 |float16;float16;float16;float16;int32;int32 |nd;nd;nd;nd;nd;nd|32,32,512;32,32,64;64,128,1,512;64,128,1,64;8200,2;32|2 |float16;float16|nd;nd|32,32,512;32,32,1|random;random;random;random;random;random|-100,100;-100,100;-100,100;-100,100;-100,100;-100,100| | | | | |Ascend910B |I:ERROR_INVALID_TENSOR_DIM 13 |MultiLatentAttentionBatchNotSame |MultiLatentAttentionOperation |{"maskType":0,"calcType":2,"cacheMode":1,"headNum":32,"kvHeadNum":1} | 6 |float16;float16;float16;float16;int32;int32 |nd;nd;nd;nd;nd;nd|32,32,512;32,32,64;64,128,1,512;64,128,1,64;64,2;32|2 |float16;float16|nd;nd|32,32,512;32,32,1|random;random;random;random;random;random|-100,100;-100,100;-100,100;-100,100;-100,100;-100,100| | | | | 
|Ascend910B |I:ERROR_INVALID_TENSOR_DIM -- Gitee From 2dfb00bd446517b915cd3bd67f67614181636270 Mon Sep 17 00:00:00 2001 From: huangxiaolan Date: Thu, 25 Sep 2025 11:49:57 +0800 Subject: [PATCH 90/94] delete const --- src/kernels/tbe_adapter/platform/platform_ascendc.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/kernels/tbe_adapter/platform/platform_ascendc.cpp b/src/kernels/tbe_adapter/platform/platform_ascendc.cpp index 15c771eb..142befbe 100644 --- a/src/kernels/tbe_adapter/platform/platform_ascendc.cpp +++ b/src/kernels/tbe_adapter/platform/platform_ascendc.cpp @@ -41,7 +41,7 @@ const static std::map CONVERT_MAP = { {"Ascend910_93", SocVersion::ASCEND910B}, }; -static inline uint32_t GetCoreNumByType(const fe::PlatFormInfos &platformInfo, bool isAiv) +static inline uint32_t GetCoreNumByType(fe::PlatFormInfos &platformInfo, bool isAiv) { std::string key; std::string val; -- Gitee From d105acfca6601bc3e142a5753cddf76e4476a67a Mon Sep 17 00:00:00 2001 From: huangxiaolan Date: Thu, 25 Sep 2025 13:05:25 +0800 Subject: [PATCH 91/94] recover tbe_adapter --- src/kernels/tbe_adapter/platform/platform_ascendc.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/kernels/tbe_adapter/platform/platform_ascendc.cpp b/src/kernels/tbe_adapter/platform/platform_ascendc.cpp index 142befbe..e1d924f2 100644 --- a/src/kernels/tbe_adapter/platform/platform_ascendc.cpp +++ b/src/kernels/tbe_adapter/platform/platform_ascendc.cpp @@ -41,11 +41,11 @@ const static std::map CONVERT_MAP = { {"Ascend910_93", SocVersion::ASCEND910B}, }; -static inline uint32_t GetCoreNumByType(fe::PlatFormInfos &platformInfo, bool isAiv) +static inline uint32_t GetCoreNumByType(fe::PlatFormInfos *platformInfo, bool isAiv) { std::string key; std::string val; - bool ret = platformInfo.GetPlatformResWithLock(STR_SOC_INFO, STR_SPLIT_KEY, val); + bool ret = platformInfo->GetPlatformResWithLock(STR_SOC_INFO, STR_SPLIT_KEY, val); MKI_LOG_IF(!ret, ERROR) << "get 
platform failed, val is " << val; if (STR_SPLIT_VAL.compare(val) != 0) { @@ -55,7 +55,7 @@ static inline uint32_t GetCoreNumByType(fe::PlatFormInfos &platformInfo, bool is } else { key = STR_CORE_CNT_CUB; } - ret = platformInfo.GetPlatformResWithLock(STR_SOC_INFO, key, val); + ret = platformInfo->GetPlatformResWithLock(STR_SOC_INFO, key, val); MKI_LOG_IF(!ret, ERROR) << "get platform failed, key is " << key << ", val is" << val; return val.empty() ? 0 : static_cast(std::atoi(val.c_str())); } -- Gitee From 102822bbdecb71b2cd3d53f78b62e67b2661fddd Mon Sep 17 00:00:00 2001 From: huangxiaolan Date: Thu, 25 Sep 2025 14:27:27 +0800 Subject: [PATCH 92/94] fix func para --- src/kernels/tbe_adapter/platform/platform_ascendc.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/kernels/tbe_adapter/platform/platform_ascendc.cpp b/src/kernels/tbe_adapter/platform/platform_ascendc.cpp index e1d924f2..8b5fe1a1 100644 --- a/src/kernels/tbe_adapter/platform/platform_ascendc.cpp +++ b/src/kernels/tbe_adapter/platform/platform_ascendc.cpp @@ -41,11 +41,11 @@ const static std::map CONVERT_MAP = { {"Ascend910_93", SocVersion::ASCEND910B}, }; -static inline uint32_t GetCoreNumByType(fe::PlatFormInfos *platformInfo, bool isAiv) +static inline uint32_t GetCoreNumByType(const fe::PlatFormInfos &platformInfo, bool isAiv) { std::string key; std::string val; - bool ret = platformInfo->GetPlatformResWithLock(STR_SOC_INFO, STR_SPLIT_KEY, val); + bool ret = platformInfo.GetPlatformResWithLock(STR_SOC_INFO, STR_SPLIT_KEY, val); MKI_LOG_IF(!ret, ERROR) << "get platform failed, val is " << val; if (STR_SPLIT_VAL.compare(val) != 0) { @@ -55,7 +55,7 @@ static inline uint32_t GetCoreNumByType(fe::PlatFormInfos *platformInfo, bool is } else { key = STR_CORE_CNT_CUB; } - ret = platformInfo->GetPlatformResWithLock(STR_SOC_INFO, key, val); + ret = platformInfo.GetPlatformResWithLock(STR_SOC_INFO, key, val); MKI_LOG_IF(!ret, ERROR) << "get platform failed, key is " << key << 
", val is" << val; return val.empty() ? 0 : static_cast(std::atoi(val.c_str())); } @@ -73,12 +73,12 @@ uint32_t PlatformAscendC::GetCoreNumVector(void) const uint32_t PlatformAscendC::GetCoreNumAic(void) const { - return GetCoreNumByType(GetPlatFormInfo(), false); + return GetCoreNumByType(*GetPlatFormInfo(), false); } uint32_t PlatformAscendC::GetCoreNumAiv(void) const { - return GetCoreNumByType(GetPlatFormInfo(), true); + return GetCoreNumByType(*GetPlatFormInfo(), true); } uint32_t PlatformAscendC::GetCoreNum(void) const -- Gitee From b903bd0f764d10e8eac1c02c084c0291e0c08c58 Mon Sep 17 00:00:00 2001 From: huangxiaolan Date: Thu, 25 Sep 2025 16:20:04 +0800 Subject: [PATCH 93/94] fix func para --- src/kernels/tbe_adapter/platform/platform_ascendc.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/kernels/tbe_adapter/platform/platform_ascendc.cpp b/src/kernels/tbe_adapter/platform/platform_ascendc.cpp index 8b5fe1a1..9fced58f 100644 --- a/src/kernels/tbe_adapter/platform/platform_ascendc.cpp +++ b/src/kernels/tbe_adapter/platform/platform_ascendc.cpp @@ -41,7 +41,7 @@ const static std::map CONVERT_MAP = { {"Ascend910_93", SocVersion::ASCEND910B}, }; -static inline uint32_t GetCoreNumByType(const fe::PlatFormInfos &platformInfo, bool isAiv) +static inline uint32_t GetCoreNumByType(fe::PlatFormInfos &platformInfo, bool isAiv) { std::string key; std::string val; -- Gitee From 75a76e0286036384aa47f264a52201d03da0ff58 Mon Sep 17 00:00:00 2001 From: huangxiaolan Date: Thu, 25 Sep 2025 19:08:40 +0800 Subject: [PATCH 94/94] recover tbe_adapter --- src/kernels/tbe_adapter/platform/platform_ascendc.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/kernels/tbe_adapter/platform/platform_ascendc.cpp b/src/kernels/tbe_adapter/platform/platform_ascendc.cpp index 9fced58f..e1d924f2 100644 --- a/src/kernels/tbe_adapter/platform/platform_ascendc.cpp +++ b/src/kernels/tbe_adapter/platform/platform_ascendc.cpp @@ -41,11 +41,11 
@@ const static std::map CONVERT_MAP = { {"Ascend910_93", SocVersion::ASCEND910B}, }; -static inline uint32_t GetCoreNumByType(fe::PlatFormInfos &platformInfo, bool isAiv) +static inline uint32_t GetCoreNumByType(fe::PlatFormInfos *platformInfo, bool isAiv) { std::string key; std::string val; - bool ret = platformInfo.GetPlatformResWithLock(STR_SOC_INFO, STR_SPLIT_KEY, val); + bool ret = platformInfo->GetPlatformResWithLock(STR_SOC_INFO, STR_SPLIT_KEY, val); MKI_LOG_IF(!ret, ERROR) << "get platform failed, val is " << val; if (STR_SPLIT_VAL.compare(val) != 0) { @@ -55,7 +55,7 @@ static inline uint32_t GetCoreNumByType(fe::PlatFormInfos &platformInfo, bool is } else { key = STR_CORE_CNT_CUB; } - ret = platformInfo.GetPlatformResWithLock(STR_SOC_INFO, key, val); + ret = platformInfo->GetPlatformResWithLock(STR_SOC_INFO, key, val); MKI_LOG_IF(!ret, ERROR) << "get platform failed, key is " << key << ", val is" << val; return val.empty() ? 0 : static_cast(std::atoi(val.c_str())); } @@ -73,12 +73,12 @@ uint32_t PlatformAscendC::GetCoreNumVector(void) const uint32_t PlatformAscendC::GetCoreNumAic(void) const { - return GetCoreNumByType(*GetPlatFormInfo(), false); + return GetCoreNumByType(GetPlatFormInfo(), false); } uint32_t PlatformAscendC::GetCoreNumAiv(void) const { - return GetCoreNumByType(*GetPlatFormInfo(), true); + return GetCoreNumByType(GetPlatFormInfo(), true); } uint32_t PlatformAscendC::GetCoreNum(void) const -- Gitee