From 4ef75d52db10fa717a518181b6319e4245ae2b8c Mon Sep 17 00:00:00 2001 From: hongshiyi Date: Fri, 5 Sep 2025 10:57:24 +0800 Subject: [PATCH 01/94] support core control --- .../platform/platform_infos_def.cpp | 72 +++++++++++-------- 1 file changed, 44 insertions(+), 28 deletions(-) diff --git a/src/kernels/tbe_adapter/platform/platform_infos_def.cpp b/src/kernels/tbe_adapter/platform/platform_infos_def.cpp index 268de985..816c822a 100644 --- a/src/kernels/tbe_adapter/platform/platform_infos_def.cpp +++ b/src/kernels/tbe_adapter/platform/platform_infos_def.cpp @@ -12,6 +12,7 @@ #include #include #include "platform_infos_impl.h" +#include "acl/acl_rt.h" namespace fe { constexpr uint32_t MAX_CORE_NUM = 128; @@ -102,46 +103,61 @@ void PlatFormInfos::SetFixPipeDtypeMap(const std::map lockGuard(g_asdopsFePlatMutex); - (void)GetPlatformRes("SoCInfo", coreTypeStr, coreNumStr); - MKI_LOG(DEBUG) << "Set PlatFormInfos::core_num_ to " << coreTypeStr << ": " << coreNumStr; - if (coreNumStr.empty()) { - core_num_ = 1; - MKI_LOG(ERROR) << "CoreNumStr is empty!"; - } else { - core_num_ = std::strtoul(coreNumStr.c_str(), nullptr, 10); // 10 进制 - if (core_num_ > MAX_CORE_NUM) { + std::string coreNumStr; + std::string coreTypeStr; + if (core_type == "VectorCore") { + coreTypeStr = "vector_core_cnt"; + } else { + coreTypeStr = "ai_core_cnt"; + } + std::lock_guardstd::mutex lockGuard(g_asdopsFePlatMutex); + (void)GetPlatformRes("SoCInfo", coreTypeStr, coreNumStr); + MKI_LOG(DEBUG) << "Set PlatFormInfos::core_num_ to " << coreTypeStr << ": " << coreNumStr; + if (coreNumStr.empty()) { core_num_ = 1; - MKI_LOG(ERROR) << "core_num is out of range : " << core_num_; + MKI_LOG(ERROR) << "CoreNumStr is empty!"; + } else { + core_num_ = std::strtoul(coreNumStr.c_str(), nullptr, 10); // 10 进制 } } + if (core_num_ == 0 || core_num_ > MAX_CORE_NUM) { + MKI_LOG(ERROR) << "core_num is out of range : " << core_num_; + core_num_ = 1; + } } uint32_t PlatFormInfos::GetCoreNumByType(const std::string 
&core_type) { - std::string coreNumStr; - std::string coreTypeStr = core_type == "VectorCore" ? "vector_core_cnt" : "ai_core_cnt"; - std::lock_guard lockGuard(g_asdopsFePlatMutex); - (void)GetPlatformRes("SoCInfo", coreTypeStr, coreNumStr); - MKI_LOG(DEBUG) << "Get PlatFormInfos::core_num_ to " << coreTypeStr << ": " << coreNumStr; - if (coreNumStr.empty()) { - MKI_LOG(ERROR) << "CoreNumStr is empty!"; - return 1; + uint32_t coreNum = 0; + aclrtDevResLimitType resType = core_type == "VectorCore" ? ACL_RT_DEV_RES_VECTOR_CORE : ACL_RT_DEV_RES_CUBE_CORE; + aclError getResRet = aclrtGetResInCurrentThread(resType, &coreNum); + if (getResRet == ACL_SUCCESS) { + MKI_LOG(DEBUG) << "Get ThreadResource::core_num_ to " << core_type << ": " << coreNum; } else { - uint32_t coreTypeNum = std::strtoul(coreNumStr.c_str(), nullptr, 10); // 10 进制 - if (coreTypeNum > MAX_CORE_NUM) { - MKI_LOG(ERROR) << "core_num is out of range : " << coreTypeNum; + std::string coreNumStr; + std::string coreTypeStr = core_type == "VectorCore" ? 
"vector_core_cnt" : "ai_core_cnt"; + std::lock_guardstd::mutex lockGuard(g_asdopsFePlatMutex); + (void)GetPlatformRes("SoCInfo", coreTypeStr, coreNumStr); + MKI_LOG(DEBUG) << "Get PlatFormInfos::core_num_ to " << coreTypeStr << ": " << coreNumStr; + if (coreNumStr.empty()) { + MKI_LOG(ERROR) << "CoreNumStr is empty!"; return 1; + } else { + coreNum = std::strtoul(coreNumStr.c_str(), nullptr, 10); // 10 进制 } - return coreTypeNum; } + if (coreNum > MAX_CORE_NUM) { + MKI_LOG(ERROR) << "core_num is out of range : " << coreNum; + return 1; + } + return coreNum; } void PlatFormInfos::SetCoreNum(const uint32_t &coreNum) -- Gitee From 9fdd0f5a1a2fd855c898d590c6ca23b2ba253c26 Mon Sep 17 00:00:00 2001 From: hongshiyi Date: Fri, 5 Sep 2025 14:16:28 +0800 Subject: [PATCH 02/94] add <> --- src/kernels/tbe_adapter/platform/platform_infos_def.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/kernels/tbe_adapter/platform/platform_infos_def.cpp b/src/kernels/tbe_adapter/platform/platform_infos_def.cpp index 816c822a..e0ac8527 100644 --- a/src/kernels/tbe_adapter/platform/platform_infos_def.cpp +++ b/src/kernels/tbe_adapter/platform/platform_infos_def.cpp @@ -117,7 +117,7 @@ void PlatFormInfos::SetCoreNumByCoreType(const std::string &core_type) } else { coreTypeStr = "ai_core_cnt"; } - std::lock_guardstd::mutex lockGuard(g_asdopsFePlatMutex); + std::lock_guard lockGuard(g_asdopsFePlatMutex); (void)GetPlatformRes("SoCInfo", coreTypeStr, coreNumStr); MKI_LOG(DEBUG) << "Set PlatFormInfos::core_num_ to " << coreTypeStr << ": " << coreNumStr; if (coreNumStr.empty()) { @@ -143,7 +143,7 @@ uint32_t PlatFormInfos::GetCoreNumByType(const std::string &core_type) } else { std::string coreNumStr; std::string coreTypeStr = core_type == "VectorCore" ? 
"vector_core_cnt" : "ai_core_cnt"; - std::lock_guardstd::mutex lockGuard(g_asdopsFePlatMutex); + std::lock_guard lockGuard(g_asdopsFePlatMutex); (void)GetPlatformRes("SoCInfo", coreTypeStr, coreNumStr); MKI_LOG(DEBUG) << "Get PlatFormInfos::core_num_ to " << coreTypeStr << ": " << coreNumStr; if (coreNumStr.empty()) { -- Gitee From 11179f21897d84b8d124f6f42e4200e0b0a918a3 Mon Sep 17 00:00:00 2001 From: hongshiyi Date: Tue, 9 Sep 2025 20:04:50 +0800 Subject: [PATCH 03/94] add zero judge --- src/kernels/tbe_adapter/platform/platform_infos_def.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/kernels/tbe_adapter/platform/platform_infos_def.cpp b/src/kernels/tbe_adapter/platform/platform_infos_def.cpp index e0ac8527..2cb40ebf 100644 --- a/src/kernels/tbe_adapter/platform/platform_infos_def.cpp +++ b/src/kernels/tbe_adapter/platform/platform_infos_def.cpp @@ -153,7 +153,7 @@ uint32_t PlatFormInfos::GetCoreNumByType(const std::string &core_type) coreNum = std::strtoul(coreNumStr.c_str(), nullptr, 10); // 10 进制 } } - if (coreNum > MAX_CORE_NUM) { + if (coreNum == 0 || coreNum > MAX_CORE_NUM) { MKI_LOG(ERROR) << "core_num is out of range : " << coreNum; return 1; } -- Gitee From ee0ae57f3d9ea14d627b9e34fd34c4b33441bd68 Mon Sep 17 00:00:00 2001 From: hongshiyi Date: Wed, 10 Sep 2025 15:22:07 +0800 Subject: [PATCH 04/94] add compile option --- src/torch_atb/CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/src/torch_atb/CMakeLists.txt b/src/torch_atb/CMakeLists.txt index 50111f2e..073e51f3 100644 --- a/src/torch_atb/CMakeLists.txt +++ b/src/torch_atb/CMakeLists.txt @@ -12,6 +12,7 @@ file(GLOB_RECURSE pybind11_source_files "*.cpp") pybind11_add_module(_C ${pybind11_source_files}) set_target_properties(_C PROPERTIES OUTPUT_NAME "_C" SUFFIX ".so") target_link_options(_C PRIVATE -rdynamic -ldl -Wl,-z,relro -Wl,-z,now -Wl,-z,noexecstack -Wl,--build-id=none -fexceptions) +target_compile_options(_C PRIVATE -Wno-odr-violation) 
target_link_libraries(_C PRIVATE torch_npu) target_include_directories(_C PRIVATE ${ATB_INCLUDE_DIR}) install(TARGETS _C DESTINATION ${CMAKE_SOURCE_DIR}/output/torch_atb) \ No newline at end of file -- Gitee From 2fb4bb1bd7cba5b1ffcbd49903c9973c4b2c5eb8 Mon Sep 17 00:00:00 2001 From: hongshiyi Date: Wed, 10 Sep 2025 17:51:03 +0800 Subject: [PATCH 05/94] change cmakelist --- src/torch_atb/CMakeLists.txt | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/torch_atb/CMakeLists.txt b/src/torch_atb/CMakeLists.txt index 073e51f3..f75429df 100644 --- a/src/torch_atb/CMakeLists.txt +++ b/src/torch_atb/CMakeLists.txt @@ -11,8 +11,7 @@ file(GLOB_RECURSE pybind11_source_files "*.cpp") pybind11_add_module(_C ${pybind11_source_files}) set_target_properties(_C PROPERTIES OUTPUT_NAME "_C" SUFFIX ".so") -target_link_options(_C PRIVATE -rdynamic -ldl -Wl,-z,relro -Wl,-z,now -Wl,-z,noexecstack -Wl,--build-id=none -fexceptions) -target_compile_options(_C PRIVATE -Wno-odr-violation) +target_link_options(_C PRIVATE -Wno-odr -rdynamic -ldl -Wl,-z,relro -Wl,-z,now -Wl,-z,noexecstack -Wl,--build-id=none -fexceptions) target_link_libraries(_C PRIVATE torch_npu) target_include_directories(_C PRIVATE ${ATB_INCLUDE_DIR}) install(TARGETS _C DESTINATION ${CMAKE_SOURCE_DIR}/output/torch_atb) \ No newline at end of file -- Gitee From 5ccc9703c790bb503aee7f92246b1971e9427365 Mon Sep 17 00:00:00 2001 From: He Changcheng Date: Thu, 11 Sep 2025 19:36:56 +0800 Subject: [PATCH 06/94] fix --- comm/lcal/src/CMakeLists.txt | 2 +- comm/lcal/src/lcal_internal.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/comm/lcal/src/CMakeLists.txt b/comm/lcal/src/CMakeLists.txt index 400edecd..17cdc9cd 100644 --- a/comm/lcal/src/CMakeLists.txt +++ b/comm/lcal/src/CMakeLists.txt @@ -26,7 +26,7 @@ target_link_libraries(lcal ascendcl runtime profapi c_sec mki) target_link_libraries(lcal_static ascendcl runtime profapi c_sec mki) message(STATUS "LCAL USE_MSSANITIZER = 
${USE_MSSANITIZER}") -set(LCAL_CCE_PATH "/tmp/lcal_cce.o") +set(LCAL_CCE_PATH "${CMAKE_CURRENT_BINARY_DIR}/lcal_cce.o") if(USE_MSSANITIZER) math(EXPR LCAL_1OP_BIN_SIZE "128 * 1024 * 1024") add_definitions(-DUSE_MSSANITIZER) diff --git a/comm/lcal/src/lcal_internal.cpp b/comm/lcal/src/lcal_internal.cpp index 5fd8b8b5..0dff180e 100644 --- a/comm/lcal/src/lcal_internal.cpp +++ b/comm/lcal/src/lcal_internal.cpp @@ -23,7 +23,7 @@ using namespace Mki; extern const int LCAL_CCE_BIN_STR[]; asm(R"(.section .rodata, "a", @progbits -LCAL_CCE_BIN_STR:.incbin "/tmp/lcal_cce.o" +LCAL_CCE_BIN_STR:.incbin "lcal_cce.o" .byte 0 .previous)"); -- Gitee From 4961585a1a329e6e16f641288cdcc6383a7dd2ee Mon Sep 17 00:00:00 2001 From: guo-jiong Date: Wed, 10 Sep 2025 17:51:16 +0800 Subject: [PATCH 07/94] move lcal and cinterface --- CMakeLists.txt | 13 +- src/CMakeLists.txt | 2 - .../utils}/atb_acl_util.cpp | 0 src/{cinterface => atb/utils}/atb_acl_util.h | 0 src/kernels/CMakeLists.txt | 1 + {comm => src/kernels}/lcal/CMakeLists.txt | 1 + .../lcal/cmake/CMakeCCECompiler.cmake.in | 0 .../lcal/cmake/CMakeCCEInformation.cmake | 0 .../cmake/CMakeDetermineCCECompiler.cmake | 0 .../lcal/cmake/CMakeTestCCECompiler.cmake | 0 .../kernels}/lcal/include/comm_args.h | 0 {comm => src/kernels}/lcal/include/lcal.h | 0 {comm => src/kernels}/lcal/include/lcal_api.h | 0 .../kernels}/lcal/include/lcal_comm.h | 0 .../kernels}/lcal/include/lcal_types.h | 0 {comm => src/kernels}/lcal/include/lccl.h | 0 .../kernels}/lcal/include/lcoc/lcoc.h | 0 .../kernels}/lcal/include/lcoc/lcoc_args.h | 0 .../kernels}/lcal/include/lcoc/lcoc_base.h | 0 .../kernels}/lcal/include/lcoc/lcoc_func.h | 0 .../lcal/include/lcoc/lcoc_workspace.h | 0 .../lcal/include/lcoc/tiling/tiling.h | 0 .../lcal/include/lcoc/tiling/tiling_91093.h | 0 .../lcal/include/lcoc/tiling/tiling_910B.h | 0 .../lcal/include/lcoc/tiling/tiling_args.h | 0 .../lcal/include/lcoc/tiling/tiling_func.h | 0 {comm => src/kernels}/lcal/src/CMakeLists.txt | 0 {comm => 
src/kernels}/lcal/src/ascendc.cmake | 0 .../ascendc_kernels/91093/all2all_hierarchy.h | 0 .../91093/all2all_hierarchy_small.h | 0 .../91093/allgather_hierarchy_double_ring.h | 0 .../91093/allreduce_big_data_sio.h | 0 .../91093/allreduce_hierarchy_double_ring.h | 0 .../reduce_scatter_big_data_91093_4step.h | 0 .../reduce_scatter_hierarchy_double_ring.h | 0 .../lcal/src/ascendc_kernels/CMakeLists.txt | 0 .../lcal/src/ascendc_kernels/allgather.h | 0 .../src/ascendc_kernels/allreduce_big_data.h | 0 .../src/ascendc_kernels/allreduce_one_shot.h | 0 .../src/ascendc_kernels/allreduce_quant.h | 0 .../src/ascendc_kernels/allreduce_two_shot.h | 0 .../lcal/src/ascendc_kernels/collectives.h | 0 .../lcal/src/ascendc_kernels/datacopy_gm2gm.h | 0 .../ascendc_kernels/datacopy_gm2gm_delay.h | 0 .../lcal/src/ascendc_kernels/ipc_queue.h | 0 .../lcal/src/ascendc_kernels/lccl_op.h | 0 .../lcal/src/ascendc_kernels/lccl_op1.cpp | 0 .../lcal/src/ascendc_kernels/lccl_op2.cpp | 0 .../lcal/src/ascendc_kernels/op_def.h | 0 .../lcal/src/ascendc_kernels/reduce_scatter.h | 0 .../src/ascendc_kernels/sync_collectives.h | 0 .../kernels}/lcal/src/ccl_kernel_args.h | 0 .../kernels}/lcal/src/coc_kernel_args.cpp | 0 .../kernels}/lcal/src/coc_kernel_args.h | 0 .../kernels}/lcal/src/kernels/CMakeLists.txt | 0 .../lcal/src/kernels/coc_add_bias_runner.cce | 0 .../lcal/src/kernels/coc_allgather.cce | 0 .../lcal/src/kernels/coc_allgather_matmul.cce | 0 .../coc_allgather_matmul_reduce_scatter.cce | 0 .../kernels/coc_allgather_reducescatter.cce | 0 .../lcal/src/kernels/coc_allgather_v2.cce | 0 .../lcal/src/kernels/coc_allreduce.cce | 0 .../kernels/coc_alltoall_allgather_hidden.cce | 0 .../coc_alltoall_reduce_scatter_hidden.cce | 0 .../src/kernels/coc_alltoallv_allgather.cce | 0 .../coc_alltoallv_allgather_matmul.cce | 0 .../lcal/src/kernels/coc_comm_base.cce | 0 .../lcal/src/kernels/coc_const_args.cce | 0 .../lcal/src/kernels/coc_dequant_runner.cce | 0 .../lcal/src/kernels/coc_internal.cce | 0 
.../lcal/src/kernels/coc_matmul_allreduce.cce | 0 .../src/kernels/coc_matmul_reduce_scatter.cce | 0 .../coc_matmul_reduce_scatter_alltoallv.cce | 0 .../lcal/src/kernels/coc_matmulmoe.cce | 0 .../lcal/src/kernels/coc_postprocessor.cce | 0 .../lcal/src/kernels/coc_ppmatmul.cce | 0 .../lcal/src/kernels/coc_ppmatmul_switch.cce | 0 .../lcal/src/kernels/coc_preprocessor.cce | 0 .../lcal/src/kernels/coc_pure_matmul.cce | 0 .../lcal/src/kernels/coc_reduce_scatter.cce | 0 .../kernels}/lcal/src/kernels/collectives.cce | 0 .../src/kernels/lcal_all2all_transpose.cce | 0 .../lcal/src/kernels/lcal_allgather.cce | 0 .../lcal/src/kernels/lcal_allgather_2npu.cce | 0 .../lcal_allgather_2npu_big_data_write.cce | 0 .../src/kernels/lcal_allgather_910B2C.cce | 0 .../src/kernels/lcal_allgather_big_data.cce | 0 .../lcal_allgather_big_data_910B2C.cce | 0 .../kernels/lcal_allreduce_2npu_big_write.cce | 0 .../src/kernels/lcal_allreduce_2npu_read.cce | 0 .../src/kernels/lcal_allreduce_2npu_write.cce | 0 .../src/kernels/lcal_allreduce_big_data.cce | 0 .../lcal_allreduce_big_data_910B2C.cce | 0 .../kernels/lcal_allreduce_deterministic.cce | 0 .../lcal_allreduce_deterministic_big_data.cce | 0 .../src/kernels/lcal_allreduce_two_shot.cce | 0 .../lcal_allreduce_two_shot_910B2C.cce | 0 .../src/kernels/lcal_broadcast_big_data.cce | 0 .../lcal/src/kernels/lcal_broadcast_write.cce | 0 .../lcal/src/kernels/lcal_reduce_scatter.cce | 0 .../kernels/lcal_reduce_scatter_big_data.cce | 0 .../lcal_reduce_scatter_big_data_write.cce | 0 .../src/kernels/lcal_reduce_scatter_write.cce | 0 {comm => src/kernels}/lcal/src/lcal_comm.cpp | 0 .../kernels}/lcal/src/lcal_internal.cpp | 0 .../kernels}/lcal/src/lcal_internal.h | 0 {comm => src/kernels}/lcal/src/lcal_wrap.cpp | 0 {comm => src/kernels}/lcal/src/lccl.cpp | 0 {comm => src/kernels}/lcal/src/lcoc.cpp | 0 {comm => src/kernels}/lcal/src/lcoc_func.cpp | 0 .../lcal/src/profiling/report_timing.h | 0 .../tiling/allgather_reducescatter_tiling.cpp | 0 
.../lcal/src/tiling/allgather_tiling.cpp | 0 .../src/tiling/allgather_tiling_91093.cpp | 0 .../lcal/src/tiling/allgather_tiling_910B.cpp | 0 .../src/tiling/allgatherv2_tiling_91093.cpp | 0 .../src/tiling/allgatherv2_tiling_910B.cpp | 0 .../lcal/src/tiling/allreduce_tiling.cpp | 0 .../src/tiling/allreduce_tiling_91093.cpp | 0 .../lcal/src/tiling/allreduce_tiling_910B.cpp | 0 .../alltoall_allgather_hidden_tiling.cpp | 0 .../src/tiling/alltoall_allgather_tiling.cpp | 0 .../reducescatter_alltoall_hidden_tiling.cpp | 0 .../lcal/src/tiling/reducescatter_tiling.cpp | 0 .../src/tiling/reducescatter_tiling_91093.cpp | 0 .../src/tiling/reducescatter_tiling_910B.cpp | 0 .../kernels}/lcal/src/tiling/tiling.cpp | 0 .../kernels}/lcal/src/tiling/tiling_args.cpp | 0 .../kernels}/lcal/src/tiling/tiling_func.cpp | 0 .../src/tools/socket/lcal_sock_exchange.cpp | 0 .../src/tools/socket/lcal_sock_exchange.h | 0 .../atb_acl_fused_add_topk_div.cpp | 2 +- .../atb_acl_mla_preprocess.cpp | 358 ++++++++-------- .../multi_latent_attention}/atb_acl_mla.cpp | 402 +++++++++--------- .../atb_acl_paged_cache_load.cpp | 2 +- .../ring_mla}/atb_acl_ring_mla.cpp | 198 ++++----- .../atb_acl_self_attention_prefix_encoder.cpp | 228 +++++----- 137 files changed, 603 insertions(+), 604 deletions(-) rename src/{cinterface => atb/utils}/atb_acl_util.cpp (100%) rename src/{cinterface => atb/utils}/atb_acl_util.h (100%) rename {comm => src/kernels}/lcal/CMakeLists.txt (98%) rename {comm => src/kernels}/lcal/cmake/CMakeCCECompiler.cmake.in (100%) rename {comm => src/kernels}/lcal/cmake/CMakeCCEInformation.cmake (100%) rename {comm => src/kernels}/lcal/cmake/CMakeDetermineCCECompiler.cmake (100%) rename {comm => src/kernels}/lcal/cmake/CMakeTestCCECompiler.cmake (100%) rename {comm => src/kernels}/lcal/include/comm_args.h (100%) rename {comm => src/kernels}/lcal/include/lcal.h (100%) rename {comm => src/kernels}/lcal/include/lcal_api.h (100%) rename {comm => src/kernels}/lcal/include/lcal_comm.h (100%) rename 
{comm => src/kernels}/lcal/include/lcal_types.h (100%) rename {comm => src/kernels}/lcal/include/lccl.h (100%) rename {comm => src/kernels}/lcal/include/lcoc/lcoc.h (100%) rename {comm => src/kernels}/lcal/include/lcoc/lcoc_args.h (100%) rename {comm => src/kernels}/lcal/include/lcoc/lcoc_base.h (100%) rename {comm => src/kernels}/lcal/include/lcoc/lcoc_func.h (100%) rename {comm => src/kernels}/lcal/include/lcoc/lcoc_workspace.h (100%) rename {comm => src/kernels}/lcal/include/lcoc/tiling/tiling.h (100%) rename {comm => src/kernels}/lcal/include/lcoc/tiling/tiling_91093.h (100%) rename {comm => src/kernels}/lcal/include/lcoc/tiling/tiling_910B.h (100%) rename {comm => src/kernels}/lcal/include/lcoc/tiling/tiling_args.h (100%) rename {comm => src/kernels}/lcal/include/lcoc/tiling/tiling_func.h (100%) rename {comm => src/kernels}/lcal/src/CMakeLists.txt (100%) rename {comm => src/kernels}/lcal/src/ascendc.cmake (100%) rename {comm => src/kernels}/lcal/src/ascendc_kernels/91093/all2all_hierarchy.h (100%) rename {comm => src/kernels}/lcal/src/ascendc_kernels/91093/all2all_hierarchy_small.h (100%) rename {comm => src/kernels}/lcal/src/ascendc_kernels/91093/allgather_hierarchy_double_ring.h (100%) rename {comm => src/kernels}/lcal/src/ascendc_kernels/91093/allreduce_big_data_sio.h (100%) rename {comm => src/kernels}/lcal/src/ascendc_kernels/91093/allreduce_hierarchy_double_ring.h (100%) rename {comm => src/kernels}/lcal/src/ascendc_kernels/91093/reduce_scatter_big_data_91093_4step.h (100%) rename {comm => src/kernels}/lcal/src/ascendc_kernels/91093/reduce_scatter_hierarchy_double_ring.h (100%) rename {comm => src/kernels}/lcal/src/ascendc_kernels/CMakeLists.txt (100%) rename {comm => src/kernels}/lcal/src/ascendc_kernels/allgather.h (100%) rename {comm => src/kernels}/lcal/src/ascendc_kernels/allreduce_big_data.h (100%) rename {comm => src/kernels}/lcal/src/ascendc_kernels/allreduce_one_shot.h (100%) rename {comm => 
src/kernels}/lcal/src/ascendc_kernels/allreduce_quant.h (100%) rename {comm => src/kernels}/lcal/src/ascendc_kernels/allreduce_two_shot.h (100%) rename {comm => src/kernels}/lcal/src/ascendc_kernels/collectives.h (100%) rename {comm => src/kernels}/lcal/src/ascendc_kernels/datacopy_gm2gm.h (100%) rename {comm => src/kernels}/lcal/src/ascendc_kernels/datacopy_gm2gm_delay.h (100%) rename {comm => src/kernels}/lcal/src/ascendc_kernels/ipc_queue.h (100%) rename {comm => src/kernels}/lcal/src/ascendc_kernels/lccl_op.h (100%) rename {comm => src/kernels}/lcal/src/ascendc_kernels/lccl_op1.cpp (100%) rename {comm => src/kernels}/lcal/src/ascendc_kernels/lccl_op2.cpp (100%) rename {comm => src/kernels}/lcal/src/ascendc_kernels/op_def.h (100%) rename {comm => src/kernels}/lcal/src/ascendc_kernels/reduce_scatter.h (100%) rename {comm => src/kernels}/lcal/src/ascendc_kernels/sync_collectives.h (100%) rename {comm => src/kernels}/lcal/src/ccl_kernel_args.h (100%) rename {comm => src/kernels}/lcal/src/coc_kernel_args.cpp (100%) rename {comm => src/kernels}/lcal/src/coc_kernel_args.h (100%) rename {comm => src/kernels}/lcal/src/kernels/CMakeLists.txt (100%) rename {comm => src/kernels}/lcal/src/kernels/coc_add_bias_runner.cce (100%) rename {comm => src/kernels}/lcal/src/kernels/coc_allgather.cce (100%) rename {comm => src/kernels}/lcal/src/kernels/coc_allgather_matmul.cce (100%) rename {comm => src/kernels}/lcal/src/kernels/coc_allgather_matmul_reduce_scatter.cce (100%) rename {comm => src/kernels}/lcal/src/kernels/coc_allgather_reducescatter.cce (100%) rename {comm => src/kernels}/lcal/src/kernels/coc_allgather_v2.cce (100%) rename {comm => src/kernels}/lcal/src/kernels/coc_allreduce.cce (100%) rename {comm => src/kernels}/lcal/src/kernels/coc_alltoall_allgather_hidden.cce (100%) rename {comm => src/kernels}/lcal/src/kernels/coc_alltoall_reduce_scatter_hidden.cce (100%) rename {comm => src/kernels}/lcal/src/kernels/coc_alltoallv_allgather.cce (100%) rename {comm => 
src/kernels}/lcal/src/kernels/coc_alltoallv_allgather_matmul.cce (100%) rename {comm => src/kernels}/lcal/src/kernels/coc_comm_base.cce (100%) rename {comm => src/kernels}/lcal/src/kernels/coc_const_args.cce (100%) rename {comm => src/kernels}/lcal/src/kernels/coc_dequant_runner.cce (100%) rename {comm => src/kernels}/lcal/src/kernels/coc_internal.cce (100%) rename {comm => src/kernels}/lcal/src/kernels/coc_matmul_allreduce.cce (100%) rename {comm => src/kernels}/lcal/src/kernels/coc_matmul_reduce_scatter.cce (100%) rename {comm => src/kernels}/lcal/src/kernels/coc_matmul_reduce_scatter_alltoallv.cce (100%) rename {comm => src/kernels}/lcal/src/kernels/coc_matmulmoe.cce (100%) rename {comm => src/kernels}/lcal/src/kernels/coc_postprocessor.cce (100%) rename {comm => src/kernels}/lcal/src/kernels/coc_ppmatmul.cce (100%) rename {comm => src/kernels}/lcal/src/kernels/coc_ppmatmul_switch.cce (100%) rename {comm => src/kernels}/lcal/src/kernels/coc_preprocessor.cce (100%) rename {comm => src/kernels}/lcal/src/kernels/coc_pure_matmul.cce (100%) rename {comm => src/kernels}/lcal/src/kernels/coc_reduce_scatter.cce (100%) rename {comm => src/kernels}/lcal/src/kernels/collectives.cce (100%) rename {comm => src/kernels}/lcal/src/kernels/lcal_all2all_transpose.cce (100%) rename {comm => src/kernels}/lcal/src/kernels/lcal_allgather.cce (100%) rename {comm => src/kernels}/lcal/src/kernels/lcal_allgather_2npu.cce (100%) rename {comm => src/kernels}/lcal/src/kernels/lcal_allgather_2npu_big_data_write.cce (100%) rename {comm => src/kernels}/lcal/src/kernels/lcal_allgather_910B2C.cce (100%) rename {comm => src/kernels}/lcal/src/kernels/lcal_allgather_big_data.cce (100%) rename {comm => src/kernels}/lcal/src/kernels/lcal_allgather_big_data_910B2C.cce (100%) rename {comm => src/kernels}/lcal/src/kernels/lcal_allreduce_2npu_big_write.cce (100%) rename {comm => src/kernels}/lcal/src/kernels/lcal_allreduce_2npu_read.cce (100%) rename {comm => 
src/kernels}/lcal/src/kernels/lcal_allreduce_2npu_write.cce (100%) rename {comm => src/kernels}/lcal/src/kernels/lcal_allreduce_big_data.cce (100%) rename {comm => src/kernels}/lcal/src/kernels/lcal_allreduce_big_data_910B2C.cce (100%) rename {comm => src/kernels}/lcal/src/kernels/lcal_allreduce_deterministic.cce (100%) rename {comm => src/kernels}/lcal/src/kernels/lcal_allreduce_deterministic_big_data.cce (100%) rename {comm => src/kernels}/lcal/src/kernels/lcal_allreduce_two_shot.cce (100%) rename {comm => src/kernels}/lcal/src/kernels/lcal_allreduce_two_shot_910B2C.cce (100%) rename {comm => src/kernels}/lcal/src/kernels/lcal_broadcast_big_data.cce (100%) rename {comm => src/kernels}/lcal/src/kernels/lcal_broadcast_write.cce (100%) rename {comm => src/kernels}/lcal/src/kernels/lcal_reduce_scatter.cce (100%) rename {comm => src/kernels}/lcal/src/kernels/lcal_reduce_scatter_big_data.cce (100%) rename {comm => src/kernels}/lcal/src/kernels/lcal_reduce_scatter_big_data_write.cce (100%) rename {comm => src/kernels}/lcal/src/kernels/lcal_reduce_scatter_write.cce (100%) rename {comm => src/kernels}/lcal/src/lcal_comm.cpp (100%) rename {comm => src/kernels}/lcal/src/lcal_internal.cpp (100%) rename {comm => src/kernels}/lcal/src/lcal_internal.h (100%) rename {comm => src/kernels}/lcal/src/lcal_wrap.cpp (100%) rename {comm => src/kernels}/lcal/src/lccl.cpp (100%) rename {comm => src/kernels}/lcal/src/lcoc.cpp (100%) rename {comm => src/kernels}/lcal/src/lcoc_func.cpp (100%) rename {comm => src/kernels}/lcal/src/profiling/report_timing.h (100%) rename {comm => src/kernels}/lcal/src/tiling/allgather_reducescatter_tiling.cpp (100%) rename {comm => src/kernels}/lcal/src/tiling/allgather_tiling.cpp (100%) rename {comm => src/kernels}/lcal/src/tiling/allgather_tiling_91093.cpp (100%) rename {comm => src/kernels}/lcal/src/tiling/allgather_tiling_910B.cpp (100%) rename {comm => src/kernels}/lcal/src/tiling/allgatherv2_tiling_91093.cpp (100%) rename {comm => 
src/kernels}/lcal/src/tiling/allgatherv2_tiling_910B.cpp (100%) rename {comm => src/kernels}/lcal/src/tiling/allreduce_tiling.cpp (100%) rename {comm => src/kernels}/lcal/src/tiling/allreduce_tiling_91093.cpp (100%) rename {comm => src/kernels}/lcal/src/tiling/allreduce_tiling_910B.cpp (100%) rename {comm => src/kernels}/lcal/src/tiling/alltoall_allgather_hidden_tiling.cpp (100%) rename {comm => src/kernels}/lcal/src/tiling/alltoall_allgather_tiling.cpp (100%) rename {comm => src/kernels}/lcal/src/tiling/reducescatter_alltoall_hidden_tiling.cpp (100%) rename {comm => src/kernels}/lcal/src/tiling/reducescatter_tiling.cpp (100%) rename {comm => src/kernels}/lcal/src/tiling/reducescatter_tiling_91093.cpp (100%) rename {comm => src/kernels}/lcal/src/tiling/reducescatter_tiling_910B.cpp (100%) rename {comm => src/kernels}/lcal/src/tiling/tiling.cpp (100%) rename {comm => src/kernels}/lcal/src/tiling/tiling_args.cpp (100%) rename {comm => src/kernels}/lcal/src/tiling/tiling_func.cpp (100%) rename {comm => src/kernels}/lcal/src/tools/socket/lcal_sock_exchange.cpp (100%) rename {comm => src/kernels}/lcal/src/tools/socket/lcal_sock_exchange.h (100%) rename src/{cinterface => ops_infer/fused_add_topk_div}/atb_acl_fused_add_topk_div.cpp (99%) rename src/{cinterface => ops_infer/mla_preprocess}/atb_acl_mla_preprocess.cpp (97%) rename src/{cinterface => ops_infer/multi_latent_attention}/atb_acl_mla.cpp (97%) rename src/{cinterface => ops_infer/paged_cache_load}/atb_acl_paged_cache_load.cpp (99%) rename src/{cinterface => ops_infer/ring_mla}/atb_acl_ring_mla.cpp (97%) rename src/{cinterface => ops_infer/self_attention}/atb_acl_self_attention_prefix_encoder.cpp (97%) diff --git a/CMakeLists.txt b/CMakeLists.txt index 60cc53e9..46e82ad4 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -74,6 +74,9 @@ else() set(cxx_abi 0) endif() +set(CMAKE_INSTALL_PREFIX "${CMAKE_SOURCE_DIR}/output/atb/cxx_abi_${cxx_abi}") +message(STATUS "CMAKE_INSTALL_PREFIX:${CMAKE_INSTALL_PREFIX}") + 
if(BUILD_PYBIND AND NOT USE_CXX11_ABI) add_subdirectory(${PROJECT_SOURCE_DIR}/3rdparty/pybind11) endif() @@ -83,9 +86,9 @@ include_directories( ${PROJECT_SOURCE_DIR}/include ${PROJECT_SOURCE_DIR}/src ${PROJECT_SOURCE_DIR}/src/kernels/include - ${PROJECT_SOURCE_DIR}/comm/lcal/include - ${PROJECT_SOURCE_DIR}/comm/lcal/include/lcoc - ${PROJECT_SOURCE_DIR}/comm/lcal/include/lcoc/tiling + ${PROJECT_SOURCE_DIR}/src/kernels/lcal/include + ${PROJECT_SOURCE_DIR}/src/kernels/lcal/include/lcoc + ${PROJECT_SOURCE_DIR}/src/kernels/lcal/include/lcoc/tiling ${PROJECT_SOURCE_DIR}/3rdparty/mki/include ${PROJECT_SOURCE_DIR}/3rdparty/nlohmannJson/include $ENV{ASCEND_HOME_PATH}/include/aclnn @@ -116,10 +119,6 @@ if (BUILD_CUSTOMIZE_OPS) add_subdirectory(ops_customize) endif() -set(CMAKE_INSTALL_PREFIX "${CMAKE_SOURCE_DIR}/output/atb/cxx_abi_${cxx_abi}") -add_subdirectory(comm/lcal) -message(STATUS "CMAKE_INSTALL_PREFIX:${CMAKE_INSTALL_PREFIX}") - install(FILES ${PROJECT_SOURCE_DIR}/scripts/set_env.sh DESTINATION ./..) 
install(DIRECTORY ${PROJECT_SOURCE_DIR}/ops_configs DESTINATION ./configs) install(FILES ${PROJECT_SOURCE_DIR}/3rdparty/mki/lib/libmki.so DESTINATION lib) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 208da858..389b643b 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -12,7 +12,6 @@ set(ops_train_directory ${CMAKE_CURRENT_LIST_DIR}/ops_train) set(ops_infer_directory ${CMAKE_CURRENT_LIST_DIR}/ops_infer) set(ops_common_directory ${CMAKE_CURRENT_LIST_DIR}/ops_common) set(atb_directory ${CMAKE_CURRENT_LIST_DIR}/atb) -set(c_interface_directory ${CMAKE_CURRENT_LIST_DIR}/cinterface) set(MSTX_PATH $ENV{ASCEND_HOME_PATH}/tools/mstx/include) set(ATB_INCLUDE_DIR $ENV{ASCEND_HOME_PATH}/include) @@ -22,7 +21,6 @@ file(GLOB_RECURSE INFER_OP_SOURCE "${ops_infer_directory}/*.cpp") file(GLOB_RECURSE TRAIN_OP_SOURCE "${ops_train_directory}/*.cpp") file(GLOB_RECURSE COMMON_OP_SOURCE "${ops_common_directory}/*.cpp") file(GLOB_RECURSE ATB_FRAMEWORK_SOURCE "${atb_directory}/*.cpp") -file(GLOB_RECURSE C_INTERFACE_SOURCE "${c_interface_directory}/*.cpp") add_subdirectory(kernels) diff --git a/src/cinterface/atb_acl_util.cpp b/src/atb/utils/atb_acl_util.cpp similarity index 100% rename from src/cinterface/atb_acl_util.cpp rename to src/atb/utils/atb_acl_util.cpp diff --git a/src/cinterface/atb_acl_util.h b/src/atb/utils/atb_acl_util.h similarity index 100% rename from src/cinterface/atb_acl_util.h rename to src/atb/utils/atb_acl_util.h diff --git a/src/kernels/CMakeLists.txt b/src/kernels/CMakeLists.txt index ce8bcf4a..57974f7e 100644 --- a/src/kernels/CMakeLists.txt +++ b/src/kernels/CMakeLists.txt @@ -30,6 +30,7 @@ include_directories( add_subdirectory(mixkernels) add_subdirectory(kernels) +add_subdirectory(lcal) if (BUILD_TBE_ADAPTER) add_subdirectory(tbe_adapter) endif() \ No newline at end of file diff --git a/comm/lcal/CMakeLists.txt b/src/kernels/lcal/CMakeLists.txt similarity index 98% rename from comm/lcal/CMakeLists.txt rename to 
src/kernels/lcal/CMakeLists.txt index a5e63434..6f874baf 100644 --- a/comm/lcal/CMakeLists.txt +++ b/src/kernels/lcal/CMakeLists.txt @@ -12,6 +12,7 @@ project(Lcal LANGUAGES CXX) set(CMAKE_CXX_STANDARD 14) list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake") +add_compile_options(-Wno-float-equal) option(USE_CXX11_ABI "USE_CXX11_ABI" 0) IF (CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "aarch64") diff --git a/comm/lcal/cmake/CMakeCCECompiler.cmake.in b/src/kernels/lcal/cmake/CMakeCCECompiler.cmake.in similarity index 100% rename from comm/lcal/cmake/CMakeCCECompiler.cmake.in rename to src/kernels/lcal/cmake/CMakeCCECompiler.cmake.in diff --git a/comm/lcal/cmake/CMakeCCEInformation.cmake b/src/kernels/lcal/cmake/CMakeCCEInformation.cmake similarity index 100% rename from comm/lcal/cmake/CMakeCCEInformation.cmake rename to src/kernels/lcal/cmake/CMakeCCEInformation.cmake diff --git a/comm/lcal/cmake/CMakeDetermineCCECompiler.cmake b/src/kernels/lcal/cmake/CMakeDetermineCCECompiler.cmake similarity index 100% rename from comm/lcal/cmake/CMakeDetermineCCECompiler.cmake rename to src/kernels/lcal/cmake/CMakeDetermineCCECompiler.cmake diff --git a/comm/lcal/cmake/CMakeTestCCECompiler.cmake b/src/kernels/lcal/cmake/CMakeTestCCECompiler.cmake similarity index 100% rename from comm/lcal/cmake/CMakeTestCCECompiler.cmake rename to src/kernels/lcal/cmake/CMakeTestCCECompiler.cmake diff --git a/comm/lcal/include/comm_args.h b/src/kernels/lcal/include/comm_args.h similarity index 100% rename from comm/lcal/include/comm_args.h rename to src/kernels/lcal/include/comm_args.h diff --git a/comm/lcal/include/lcal.h b/src/kernels/lcal/include/lcal.h similarity index 100% rename from comm/lcal/include/lcal.h rename to src/kernels/lcal/include/lcal.h diff --git a/comm/lcal/include/lcal_api.h b/src/kernels/lcal/include/lcal_api.h similarity index 100% rename from comm/lcal/include/lcal_api.h rename to src/kernels/lcal/include/lcal_api.h diff --git a/comm/lcal/include/lcal_comm.h 
b/src/kernels/lcal/include/lcal_comm.h similarity index 100% rename from comm/lcal/include/lcal_comm.h rename to src/kernels/lcal/include/lcal_comm.h diff --git a/comm/lcal/include/lcal_types.h b/src/kernels/lcal/include/lcal_types.h similarity index 100% rename from comm/lcal/include/lcal_types.h rename to src/kernels/lcal/include/lcal_types.h diff --git a/comm/lcal/include/lccl.h b/src/kernels/lcal/include/lccl.h similarity index 100% rename from comm/lcal/include/lccl.h rename to src/kernels/lcal/include/lccl.h diff --git a/comm/lcal/include/lcoc/lcoc.h b/src/kernels/lcal/include/lcoc/lcoc.h similarity index 100% rename from comm/lcal/include/lcoc/lcoc.h rename to src/kernels/lcal/include/lcoc/lcoc.h diff --git a/comm/lcal/include/lcoc/lcoc_args.h b/src/kernels/lcal/include/lcoc/lcoc_args.h similarity index 100% rename from comm/lcal/include/lcoc/lcoc_args.h rename to src/kernels/lcal/include/lcoc/lcoc_args.h diff --git a/comm/lcal/include/lcoc/lcoc_base.h b/src/kernels/lcal/include/lcoc/lcoc_base.h similarity index 100% rename from comm/lcal/include/lcoc/lcoc_base.h rename to src/kernels/lcal/include/lcoc/lcoc_base.h diff --git a/comm/lcal/include/lcoc/lcoc_func.h b/src/kernels/lcal/include/lcoc/lcoc_func.h similarity index 100% rename from comm/lcal/include/lcoc/lcoc_func.h rename to src/kernels/lcal/include/lcoc/lcoc_func.h diff --git a/comm/lcal/include/lcoc/lcoc_workspace.h b/src/kernels/lcal/include/lcoc/lcoc_workspace.h similarity index 100% rename from comm/lcal/include/lcoc/lcoc_workspace.h rename to src/kernels/lcal/include/lcoc/lcoc_workspace.h diff --git a/comm/lcal/include/lcoc/tiling/tiling.h b/src/kernels/lcal/include/lcoc/tiling/tiling.h similarity index 100% rename from comm/lcal/include/lcoc/tiling/tiling.h rename to src/kernels/lcal/include/lcoc/tiling/tiling.h diff --git a/comm/lcal/include/lcoc/tiling/tiling_91093.h b/src/kernels/lcal/include/lcoc/tiling/tiling_91093.h similarity index 100% rename from 
comm/lcal/include/lcoc/tiling/tiling_91093.h rename to src/kernels/lcal/include/lcoc/tiling/tiling_91093.h diff --git a/comm/lcal/include/lcoc/tiling/tiling_910B.h b/src/kernels/lcal/include/lcoc/tiling/tiling_910B.h similarity index 100% rename from comm/lcal/include/lcoc/tiling/tiling_910B.h rename to src/kernels/lcal/include/lcoc/tiling/tiling_910B.h diff --git a/comm/lcal/include/lcoc/tiling/tiling_args.h b/src/kernels/lcal/include/lcoc/tiling/tiling_args.h similarity index 100% rename from comm/lcal/include/lcoc/tiling/tiling_args.h rename to src/kernels/lcal/include/lcoc/tiling/tiling_args.h diff --git a/comm/lcal/include/lcoc/tiling/tiling_func.h b/src/kernels/lcal/include/lcoc/tiling/tiling_func.h similarity index 100% rename from comm/lcal/include/lcoc/tiling/tiling_func.h rename to src/kernels/lcal/include/lcoc/tiling/tiling_func.h diff --git a/comm/lcal/src/CMakeLists.txt b/src/kernels/lcal/src/CMakeLists.txt similarity index 100% rename from comm/lcal/src/CMakeLists.txt rename to src/kernels/lcal/src/CMakeLists.txt diff --git a/comm/lcal/src/ascendc.cmake b/src/kernels/lcal/src/ascendc.cmake similarity index 100% rename from comm/lcal/src/ascendc.cmake rename to src/kernels/lcal/src/ascendc.cmake diff --git a/comm/lcal/src/ascendc_kernels/91093/all2all_hierarchy.h b/src/kernels/lcal/src/ascendc_kernels/91093/all2all_hierarchy.h similarity index 100% rename from comm/lcal/src/ascendc_kernels/91093/all2all_hierarchy.h rename to src/kernels/lcal/src/ascendc_kernels/91093/all2all_hierarchy.h diff --git a/comm/lcal/src/ascendc_kernels/91093/all2all_hierarchy_small.h b/src/kernels/lcal/src/ascendc_kernels/91093/all2all_hierarchy_small.h similarity index 100% rename from comm/lcal/src/ascendc_kernels/91093/all2all_hierarchy_small.h rename to src/kernels/lcal/src/ascendc_kernels/91093/all2all_hierarchy_small.h diff --git a/comm/lcal/src/ascendc_kernels/91093/allgather_hierarchy_double_ring.h 
b/src/kernels/lcal/src/ascendc_kernels/91093/allgather_hierarchy_double_ring.h similarity index 100% rename from comm/lcal/src/ascendc_kernels/91093/allgather_hierarchy_double_ring.h rename to src/kernels/lcal/src/ascendc_kernels/91093/allgather_hierarchy_double_ring.h diff --git a/comm/lcal/src/ascendc_kernels/91093/allreduce_big_data_sio.h b/src/kernels/lcal/src/ascendc_kernels/91093/allreduce_big_data_sio.h similarity index 100% rename from comm/lcal/src/ascendc_kernels/91093/allreduce_big_data_sio.h rename to src/kernels/lcal/src/ascendc_kernels/91093/allreduce_big_data_sio.h diff --git a/comm/lcal/src/ascendc_kernels/91093/allreduce_hierarchy_double_ring.h b/src/kernels/lcal/src/ascendc_kernels/91093/allreduce_hierarchy_double_ring.h similarity index 100% rename from comm/lcal/src/ascendc_kernels/91093/allreduce_hierarchy_double_ring.h rename to src/kernels/lcal/src/ascendc_kernels/91093/allreduce_hierarchy_double_ring.h diff --git a/comm/lcal/src/ascendc_kernels/91093/reduce_scatter_big_data_91093_4step.h b/src/kernels/lcal/src/ascendc_kernels/91093/reduce_scatter_big_data_91093_4step.h similarity index 100% rename from comm/lcal/src/ascendc_kernels/91093/reduce_scatter_big_data_91093_4step.h rename to src/kernels/lcal/src/ascendc_kernels/91093/reduce_scatter_big_data_91093_4step.h diff --git a/comm/lcal/src/ascendc_kernels/91093/reduce_scatter_hierarchy_double_ring.h b/src/kernels/lcal/src/ascendc_kernels/91093/reduce_scatter_hierarchy_double_ring.h similarity index 100% rename from comm/lcal/src/ascendc_kernels/91093/reduce_scatter_hierarchy_double_ring.h rename to src/kernels/lcal/src/ascendc_kernels/91093/reduce_scatter_hierarchy_double_ring.h diff --git a/comm/lcal/src/ascendc_kernels/CMakeLists.txt b/src/kernels/lcal/src/ascendc_kernels/CMakeLists.txt similarity index 100% rename from comm/lcal/src/ascendc_kernels/CMakeLists.txt rename to src/kernels/lcal/src/ascendc_kernels/CMakeLists.txt diff --git a/comm/lcal/src/ascendc_kernels/allgather.h 
b/src/kernels/lcal/src/ascendc_kernels/allgather.h similarity index 100% rename from comm/lcal/src/ascendc_kernels/allgather.h rename to src/kernels/lcal/src/ascendc_kernels/allgather.h diff --git a/comm/lcal/src/ascendc_kernels/allreduce_big_data.h b/src/kernels/lcal/src/ascendc_kernels/allreduce_big_data.h similarity index 100% rename from comm/lcal/src/ascendc_kernels/allreduce_big_data.h rename to src/kernels/lcal/src/ascendc_kernels/allreduce_big_data.h diff --git a/comm/lcal/src/ascendc_kernels/allreduce_one_shot.h b/src/kernels/lcal/src/ascendc_kernels/allreduce_one_shot.h similarity index 100% rename from comm/lcal/src/ascendc_kernels/allreduce_one_shot.h rename to src/kernels/lcal/src/ascendc_kernels/allreduce_one_shot.h diff --git a/comm/lcal/src/ascendc_kernels/allreduce_quant.h b/src/kernels/lcal/src/ascendc_kernels/allreduce_quant.h similarity index 100% rename from comm/lcal/src/ascendc_kernels/allreduce_quant.h rename to src/kernels/lcal/src/ascendc_kernels/allreduce_quant.h diff --git a/comm/lcal/src/ascendc_kernels/allreduce_two_shot.h b/src/kernels/lcal/src/ascendc_kernels/allreduce_two_shot.h similarity index 100% rename from comm/lcal/src/ascendc_kernels/allreduce_two_shot.h rename to src/kernels/lcal/src/ascendc_kernels/allreduce_two_shot.h diff --git a/comm/lcal/src/ascendc_kernels/collectives.h b/src/kernels/lcal/src/ascendc_kernels/collectives.h similarity index 100% rename from comm/lcal/src/ascendc_kernels/collectives.h rename to src/kernels/lcal/src/ascendc_kernels/collectives.h diff --git a/comm/lcal/src/ascendc_kernels/datacopy_gm2gm.h b/src/kernels/lcal/src/ascendc_kernels/datacopy_gm2gm.h similarity index 100% rename from comm/lcal/src/ascendc_kernels/datacopy_gm2gm.h rename to src/kernels/lcal/src/ascendc_kernels/datacopy_gm2gm.h diff --git a/comm/lcal/src/ascendc_kernels/datacopy_gm2gm_delay.h b/src/kernels/lcal/src/ascendc_kernels/datacopy_gm2gm_delay.h similarity index 100% rename from 
comm/lcal/src/ascendc_kernels/datacopy_gm2gm_delay.h rename to src/kernels/lcal/src/ascendc_kernels/datacopy_gm2gm_delay.h diff --git a/comm/lcal/src/ascendc_kernels/ipc_queue.h b/src/kernels/lcal/src/ascendc_kernels/ipc_queue.h similarity index 100% rename from comm/lcal/src/ascendc_kernels/ipc_queue.h rename to src/kernels/lcal/src/ascendc_kernels/ipc_queue.h diff --git a/comm/lcal/src/ascendc_kernels/lccl_op.h b/src/kernels/lcal/src/ascendc_kernels/lccl_op.h similarity index 100% rename from comm/lcal/src/ascendc_kernels/lccl_op.h rename to src/kernels/lcal/src/ascendc_kernels/lccl_op.h diff --git a/comm/lcal/src/ascendc_kernels/lccl_op1.cpp b/src/kernels/lcal/src/ascendc_kernels/lccl_op1.cpp similarity index 100% rename from comm/lcal/src/ascendc_kernels/lccl_op1.cpp rename to src/kernels/lcal/src/ascendc_kernels/lccl_op1.cpp diff --git a/comm/lcal/src/ascendc_kernels/lccl_op2.cpp b/src/kernels/lcal/src/ascendc_kernels/lccl_op2.cpp similarity index 100% rename from comm/lcal/src/ascendc_kernels/lccl_op2.cpp rename to src/kernels/lcal/src/ascendc_kernels/lccl_op2.cpp diff --git a/comm/lcal/src/ascendc_kernels/op_def.h b/src/kernels/lcal/src/ascendc_kernels/op_def.h similarity index 100% rename from comm/lcal/src/ascendc_kernels/op_def.h rename to src/kernels/lcal/src/ascendc_kernels/op_def.h diff --git a/comm/lcal/src/ascendc_kernels/reduce_scatter.h b/src/kernels/lcal/src/ascendc_kernels/reduce_scatter.h similarity index 100% rename from comm/lcal/src/ascendc_kernels/reduce_scatter.h rename to src/kernels/lcal/src/ascendc_kernels/reduce_scatter.h diff --git a/comm/lcal/src/ascendc_kernels/sync_collectives.h b/src/kernels/lcal/src/ascendc_kernels/sync_collectives.h similarity index 100% rename from comm/lcal/src/ascendc_kernels/sync_collectives.h rename to src/kernels/lcal/src/ascendc_kernels/sync_collectives.h diff --git a/comm/lcal/src/ccl_kernel_args.h b/src/kernels/lcal/src/ccl_kernel_args.h similarity index 100% rename from comm/lcal/src/ccl_kernel_args.h 
rename to src/kernels/lcal/src/ccl_kernel_args.h diff --git a/comm/lcal/src/coc_kernel_args.cpp b/src/kernels/lcal/src/coc_kernel_args.cpp similarity index 100% rename from comm/lcal/src/coc_kernel_args.cpp rename to src/kernels/lcal/src/coc_kernel_args.cpp diff --git a/comm/lcal/src/coc_kernel_args.h b/src/kernels/lcal/src/coc_kernel_args.h similarity index 100% rename from comm/lcal/src/coc_kernel_args.h rename to src/kernels/lcal/src/coc_kernel_args.h diff --git a/comm/lcal/src/kernels/CMakeLists.txt b/src/kernels/lcal/src/kernels/CMakeLists.txt similarity index 100% rename from comm/lcal/src/kernels/CMakeLists.txt rename to src/kernels/lcal/src/kernels/CMakeLists.txt diff --git a/comm/lcal/src/kernels/coc_add_bias_runner.cce b/src/kernels/lcal/src/kernels/coc_add_bias_runner.cce similarity index 100% rename from comm/lcal/src/kernels/coc_add_bias_runner.cce rename to src/kernels/lcal/src/kernels/coc_add_bias_runner.cce diff --git a/comm/lcal/src/kernels/coc_allgather.cce b/src/kernels/lcal/src/kernels/coc_allgather.cce similarity index 100% rename from comm/lcal/src/kernels/coc_allgather.cce rename to src/kernels/lcal/src/kernels/coc_allgather.cce diff --git a/comm/lcal/src/kernels/coc_allgather_matmul.cce b/src/kernels/lcal/src/kernels/coc_allgather_matmul.cce similarity index 100% rename from comm/lcal/src/kernels/coc_allgather_matmul.cce rename to src/kernels/lcal/src/kernels/coc_allgather_matmul.cce diff --git a/comm/lcal/src/kernels/coc_allgather_matmul_reduce_scatter.cce b/src/kernels/lcal/src/kernels/coc_allgather_matmul_reduce_scatter.cce similarity index 100% rename from comm/lcal/src/kernels/coc_allgather_matmul_reduce_scatter.cce rename to src/kernels/lcal/src/kernels/coc_allgather_matmul_reduce_scatter.cce diff --git a/comm/lcal/src/kernels/coc_allgather_reducescatter.cce b/src/kernels/lcal/src/kernels/coc_allgather_reducescatter.cce similarity index 100% rename from comm/lcal/src/kernels/coc_allgather_reducescatter.cce rename to 
src/kernels/lcal/src/kernels/coc_allgather_reducescatter.cce diff --git a/comm/lcal/src/kernels/coc_allgather_v2.cce b/src/kernels/lcal/src/kernels/coc_allgather_v2.cce similarity index 100% rename from comm/lcal/src/kernels/coc_allgather_v2.cce rename to src/kernels/lcal/src/kernels/coc_allgather_v2.cce diff --git a/comm/lcal/src/kernels/coc_allreduce.cce b/src/kernels/lcal/src/kernels/coc_allreduce.cce similarity index 100% rename from comm/lcal/src/kernels/coc_allreduce.cce rename to src/kernels/lcal/src/kernels/coc_allreduce.cce diff --git a/comm/lcal/src/kernels/coc_alltoall_allgather_hidden.cce b/src/kernels/lcal/src/kernels/coc_alltoall_allgather_hidden.cce similarity index 100% rename from comm/lcal/src/kernels/coc_alltoall_allgather_hidden.cce rename to src/kernels/lcal/src/kernels/coc_alltoall_allgather_hidden.cce diff --git a/comm/lcal/src/kernels/coc_alltoall_reduce_scatter_hidden.cce b/src/kernels/lcal/src/kernels/coc_alltoall_reduce_scatter_hidden.cce similarity index 100% rename from comm/lcal/src/kernels/coc_alltoall_reduce_scatter_hidden.cce rename to src/kernels/lcal/src/kernels/coc_alltoall_reduce_scatter_hidden.cce diff --git a/comm/lcal/src/kernels/coc_alltoallv_allgather.cce b/src/kernels/lcal/src/kernels/coc_alltoallv_allgather.cce similarity index 100% rename from comm/lcal/src/kernels/coc_alltoallv_allgather.cce rename to src/kernels/lcal/src/kernels/coc_alltoallv_allgather.cce diff --git a/comm/lcal/src/kernels/coc_alltoallv_allgather_matmul.cce b/src/kernels/lcal/src/kernels/coc_alltoallv_allgather_matmul.cce similarity index 100% rename from comm/lcal/src/kernels/coc_alltoallv_allgather_matmul.cce rename to src/kernels/lcal/src/kernels/coc_alltoallv_allgather_matmul.cce diff --git a/comm/lcal/src/kernels/coc_comm_base.cce b/src/kernels/lcal/src/kernels/coc_comm_base.cce similarity index 100% rename from comm/lcal/src/kernels/coc_comm_base.cce rename to src/kernels/lcal/src/kernels/coc_comm_base.cce diff --git 
a/comm/lcal/src/kernels/coc_const_args.cce b/src/kernels/lcal/src/kernels/coc_const_args.cce similarity index 100% rename from comm/lcal/src/kernels/coc_const_args.cce rename to src/kernels/lcal/src/kernels/coc_const_args.cce diff --git a/comm/lcal/src/kernels/coc_dequant_runner.cce b/src/kernels/lcal/src/kernels/coc_dequant_runner.cce similarity index 100% rename from comm/lcal/src/kernels/coc_dequant_runner.cce rename to src/kernels/lcal/src/kernels/coc_dequant_runner.cce diff --git a/comm/lcal/src/kernels/coc_internal.cce b/src/kernels/lcal/src/kernels/coc_internal.cce similarity index 100% rename from comm/lcal/src/kernels/coc_internal.cce rename to src/kernels/lcal/src/kernels/coc_internal.cce diff --git a/comm/lcal/src/kernels/coc_matmul_allreduce.cce b/src/kernels/lcal/src/kernels/coc_matmul_allreduce.cce similarity index 100% rename from comm/lcal/src/kernels/coc_matmul_allreduce.cce rename to src/kernels/lcal/src/kernels/coc_matmul_allreduce.cce diff --git a/comm/lcal/src/kernels/coc_matmul_reduce_scatter.cce b/src/kernels/lcal/src/kernels/coc_matmul_reduce_scatter.cce similarity index 100% rename from comm/lcal/src/kernels/coc_matmul_reduce_scatter.cce rename to src/kernels/lcal/src/kernels/coc_matmul_reduce_scatter.cce diff --git a/comm/lcal/src/kernels/coc_matmul_reduce_scatter_alltoallv.cce b/src/kernels/lcal/src/kernels/coc_matmul_reduce_scatter_alltoallv.cce similarity index 100% rename from comm/lcal/src/kernels/coc_matmul_reduce_scatter_alltoallv.cce rename to src/kernels/lcal/src/kernels/coc_matmul_reduce_scatter_alltoallv.cce diff --git a/comm/lcal/src/kernels/coc_matmulmoe.cce b/src/kernels/lcal/src/kernels/coc_matmulmoe.cce similarity index 100% rename from comm/lcal/src/kernels/coc_matmulmoe.cce rename to src/kernels/lcal/src/kernels/coc_matmulmoe.cce diff --git a/comm/lcal/src/kernels/coc_postprocessor.cce b/src/kernels/lcal/src/kernels/coc_postprocessor.cce similarity index 100% rename from comm/lcal/src/kernels/coc_postprocessor.cce rename 
to src/kernels/lcal/src/kernels/coc_postprocessor.cce diff --git a/comm/lcal/src/kernels/coc_ppmatmul.cce b/src/kernels/lcal/src/kernels/coc_ppmatmul.cce similarity index 100% rename from comm/lcal/src/kernels/coc_ppmatmul.cce rename to src/kernels/lcal/src/kernels/coc_ppmatmul.cce diff --git a/comm/lcal/src/kernels/coc_ppmatmul_switch.cce b/src/kernels/lcal/src/kernels/coc_ppmatmul_switch.cce similarity index 100% rename from comm/lcal/src/kernels/coc_ppmatmul_switch.cce rename to src/kernels/lcal/src/kernels/coc_ppmatmul_switch.cce diff --git a/comm/lcal/src/kernels/coc_preprocessor.cce b/src/kernels/lcal/src/kernels/coc_preprocessor.cce similarity index 100% rename from comm/lcal/src/kernels/coc_preprocessor.cce rename to src/kernels/lcal/src/kernels/coc_preprocessor.cce diff --git a/comm/lcal/src/kernels/coc_pure_matmul.cce b/src/kernels/lcal/src/kernels/coc_pure_matmul.cce similarity index 100% rename from comm/lcal/src/kernels/coc_pure_matmul.cce rename to src/kernels/lcal/src/kernels/coc_pure_matmul.cce diff --git a/comm/lcal/src/kernels/coc_reduce_scatter.cce b/src/kernels/lcal/src/kernels/coc_reduce_scatter.cce similarity index 100% rename from comm/lcal/src/kernels/coc_reduce_scatter.cce rename to src/kernels/lcal/src/kernels/coc_reduce_scatter.cce diff --git a/comm/lcal/src/kernels/collectives.cce b/src/kernels/lcal/src/kernels/collectives.cce similarity index 100% rename from comm/lcal/src/kernels/collectives.cce rename to src/kernels/lcal/src/kernels/collectives.cce diff --git a/comm/lcal/src/kernels/lcal_all2all_transpose.cce b/src/kernels/lcal/src/kernels/lcal_all2all_transpose.cce similarity index 100% rename from comm/lcal/src/kernels/lcal_all2all_transpose.cce rename to src/kernels/lcal/src/kernels/lcal_all2all_transpose.cce diff --git a/comm/lcal/src/kernels/lcal_allgather.cce b/src/kernels/lcal/src/kernels/lcal_allgather.cce similarity index 100% rename from comm/lcal/src/kernels/lcal_allgather.cce rename to 
src/kernels/lcal/src/kernels/lcal_allgather.cce diff --git a/comm/lcal/src/kernels/lcal_allgather_2npu.cce b/src/kernels/lcal/src/kernels/lcal_allgather_2npu.cce similarity index 100% rename from comm/lcal/src/kernels/lcal_allgather_2npu.cce rename to src/kernels/lcal/src/kernels/lcal_allgather_2npu.cce diff --git a/comm/lcal/src/kernels/lcal_allgather_2npu_big_data_write.cce b/src/kernels/lcal/src/kernels/lcal_allgather_2npu_big_data_write.cce similarity index 100% rename from comm/lcal/src/kernels/lcal_allgather_2npu_big_data_write.cce rename to src/kernels/lcal/src/kernels/lcal_allgather_2npu_big_data_write.cce diff --git a/comm/lcal/src/kernels/lcal_allgather_910B2C.cce b/src/kernels/lcal/src/kernels/lcal_allgather_910B2C.cce similarity index 100% rename from comm/lcal/src/kernels/lcal_allgather_910B2C.cce rename to src/kernels/lcal/src/kernels/lcal_allgather_910B2C.cce diff --git a/comm/lcal/src/kernels/lcal_allgather_big_data.cce b/src/kernels/lcal/src/kernels/lcal_allgather_big_data.cce similarity index 100% rename from comm/lcal/src/kernels/lcal_allgather_big_data.cce rename to src/kernels/lcal/src/kernels/lcal_allgather_big_data.cce diff --git a/comm/lcal/src/kernels/lcal_allgather_big_data_910B2C.cce b/src/kernels/lcal/src/kernels/lcal_allgather_big_data_910B2C.cce similarity index 100% rename from comm/lcal/src/kernels/lcal_allgather_big_data_910B2C.cce rename to src/kernels/lcal/src/kernels/lcal_allgather_big_data_910B2C.cce diff --git a/comm/lcal/src/kernels/lcal_allreduce_2npu_big_write.cce b/src/kernels/lcal/src/kernels/lcal_allreduce_2npu_big_write.cce similarity index 100% rename from comm/lcal/src/kernels/lcal_allreduce_2npu_big_write.cce rename to src/kernels/lcal/src/kernels/lcal_allreduce_2npu_big_write.cce diff --git a/comm/lcal/src/kernels/lcal_allreduce_2npu_read.cce b/src/kernels/lcal/src/kernels/lcal_allreduce_2npu_read.cce similarity index 100% rename from comm/lcal/src/kernels/lcal_allreduce_2npu_read.cce rename to 
src/kernels/lcal/src/kernels/lcal_allreduce_2npu_read.cce diff --git a/comm/lcal/src/kernels/lcal_allreduce_2npu_write.cce b/src/kernels/lcal/src/kernels/lcal_allreduce_2npu_write.cce similarity index 100% rename from comm/lcal/src/kernels/lcal_allreduce_2npu_write.cce rename to src/kernels/lcal/src/kernels/lcal_allreduce_2npu_write.cce diff --git a/comm/lcal/src/kernels/lcal_allreduce_big_data.cce b/src/kernels/lcal/src/kernels/lcal_allreduce_big_data.cce similarity index 100% rename from comm/lcal/src/kernels/lcal_allreduce_big_data.cce rename to src/kernels/lcal/src/kernels/lcal_allreduce_big_data.cce diff --git a/comm/lcal/src/kernels/lcal_allreduce_big_data_910B2C.cce b/src/kernels/lcal/src/kernels/lcal_allreduce_big_data_910B2C.cce similarity index 100% rename from comm/lcal/src/kernels/lcal_allreduce_big_data_910B2C.cce rename to src/kernels/lcal/src/kernels/lcal_allreduce_big_data_910B2C.cce diff --git a/comm/lcal/src/kernels/lcal_allreduce_deterministic.cce b/src/kernels/lcal/src/kernels/lcal_allreduce_deterministic.cce similarity index 100% rename from comm/lcal/src/kernels/lcal_allreduce_deterministic.cce rename to src/kernels/lcal/src/kernels/lcal_allreduce_deterministic.cce diff --git a/comm/lcal/src/kernels/lcal_allreduce_deterministic_big_data.cce b/src/kernels/lcal/src/kernels/lcal_allreduce_deterministic_big_data.cce similarity index 100% rename from comm/lcal/src/kernels/lcal_allreduce_deterministic_big_data.cce rename to src/kernels/lcal/src/kernels/lcal_allreduce_deterministic_big_data.cce diff --git a/comm/lcal/src/kernels/lcal_allreduce_two_shot.cce b/src/kernels/lcal/src/kernels/lcal_allreduce_two_shot.cce similarity index 100% rename from comm/lcal/src/kernels/lcal_allreduce_two_shot.cce rename to src/kernels/lcal/src/kernels/lcal_allreduce_two_shot.cce diff --git a/comm/lcal/src/kernels/lcal_allreduce_two_shot_910B2C.cce b/src/kernels/lcal/src/kernels/lcal_allreduce_two_shot_910B2C.cce similarity index 100% rename from 
comm/lcal/src/kernels/lcal_allreduce_two_shot_910B2C.cce rename to src/kernels/lcal/src/kernels/lcal_allreduce_two_shot_910B2C.cce diff --git a/comm/lcal/src/kernels/lcal_broadcast_big_data.cce b/src/kernels/lcal/src/kernels/lcal_broadcast_big_data.cce similarity index 100% rename from comm/lcal/src/kernels/lcal_broadcast_big_data.cce rename to src/kernels/lcal/src/kernels/lcal_broadcast_big_data.cce diff --git a/comm/lcal/src/kernels/lcal_broadcast_write.cce b/src/kernels/lcal/src/kernels/lcal_broadcast_write.cce similarity index 100% rename from comm/lcal/src/kernels/lcal_broadcast_write.cce rename to src/kernels/lcal/src/kernels/lcal_broadcast_write.cce diff --git a/comm/lcal/src/kernels/lcal_reduce_scatter.cce b/src/kernels/lcal/src/kernels/lcal_reduce_scatter.cce similarity index 100% rename from comm/lcal/src/kernels/lcal_reduce_scatter.cce rename to src/kernels/lcal/src/kernels/lcal_reduce_scatter.cce diff --git a/comm/lcal/src/kernels/lcal_reduce_scatter_big_data.cce b/src/kernels/lcal/src/kernels/lcal_reduce_scatter_big_data.cce similarity index 100% rename from comm/lcal/src/kernels/lcal_reduce_scatter_big_data.cce rename to src/kernels/lcal/src/kernels/lcal_reduce_scatter_big_data.cce diff --git a/comm/lcal/src/kernels/lcal_reduce_scatter_big_data_write.cce b/src/kernels/lcal/src/kernels/lcal_reduce_scatter_big_data_write.cce similarity index 100% rename from comm/lcal/src/kernels/lcal_reduce_scatter_big_data_write.cce rename to src/kernels/lcal/src/kernels/lcal_reduce_scatter_big_data_write.cce diff --git a/comm/lcal/src/kernels/lcal_reduce_scatter_write.cce b/src/kernels/lcal/src/kernels/lcal_reduce_scatter_write.cce similarity index 100% rename from comm/lcal/src/kernels/lcal_reduce_scatter_write.cce rename to src/kernels/lcal/src/kernels/lcal_reduce_scatter_write.cce diff --git a/comm/lcal/src/lcal_comm.cpp b/src/kernels/lcal/src/lcal_comm.cpp similarity index 100% rename from comm/lcal/src/lcal_comm.cpp rename to src/kernels/lcal/src/lcal_comm.cpp 
diff --git a/comm/lcal/src/lcal_internal.cpp b/src/kernels/lcal/src/lcal_internal.cpp similarity index 100% rename from comm/lcal/src/lcal_internal.cpp rename to src/kernels/lcal/src/lcal_internal.cpp diff --git a/comm/lcal/src/lcal_internal.h b/src/kernels/lcal/src/lcal_internal.h similarity index 100% rename from comm/lcal/src/lcal_internal.h rename to src/kernels/lcal/src/lcal_internal.h diff --git a/comm/lcal/src/lcal_wrap.cpp b/src/kernels/lcal/src/lcal_wrap.cpp similarity index 100% rename from comm/lcal/src/lcal_wrap.cpp rename to src/kernels/lcal/src/lcal_wrap.cpp diff --git a/comm/lcal/src/lccl.cpp b/src/kernels/lcal/src/lccl.cpp similarity index 100% rename from comm/lcal/src/lccl.cpp rename to src/kernels/lcal/src/lccl.cpp diff --git a/comm/lcal/src/lcoc.cpp b/src/kernels/lcal/src/lcoc.cpp similarity index 100% rename from comm/lcal/src/lcoc.cpp rename to src/kernels/lcal/src/lcoc.cpp diff --git a/comm/lcal/src/lcoc_func.cpp b/src/kernels/lcal/src/lcoc_func.cpp similarity index 100% rename from comm/lcal/src/lcoc_func.cpp rename to src/kernels/lcal/src/lcoc_func.cpp diff --git a/comm/lcal/src/profiling/report_timing.h b/src/kernels/lcal/src/profiling/report_timing.h similarity index 100% rename from comm/lcal/src/profiling/report_timing.h rename to src/kernels/lcal/src/profiling/report_timing.h diff --git a/comm/lcal/src/tiling/allgather_reducescatter_tiling.cpp b/src/kernels/lcal/src/tiling/allgather_reducescatter_tiling.cpp similarity index 100% rename from comm/lcal/src/tiling/allgather_reducescatter_tiling.cpp rename to src/kernels/lcal/src/tiling/allgather_reducescatter_tiling.cpp diff --git a/comm/lcal/src/tiling/allgather_tiling.cpp b/src/kernels/lcal/src/tiling/allgather_tiling.cpp similarity index 100% rename from comm/lcal/src/tiling/allgather_tiling.cpp rename to src/kernels/lcal/src/tiling/allgather_tiling.cpp diff --git a/comm/lcal/src/tiling/allgather_tiling_91093.cpp b/src/kernels/lcal/src/tiling/allgather_tiling_91093.cpp similarity index 
100% rename from comm/lcal/src/tiling/allgather_tiling_91093.cpp rename to src/kernels/lcal/src/tiling/allgather_tiling_91093.cpp diff --git a/comm/lcal/src/tiling/allgather_tiling_910B.cpp b/src/kernels/lcal/src/tiling/allgather_tiling_910B.cpp similarity index 100% rename from comm/lcal/src/tiling/allgather_tiling_910B.cpp rename to src/kernels/lcal/src/tiling/allgather_tiling_910B.cpp diff --git a/comm/lcal/src/tiling/allgatherv2_tiling_91093.cpp b/src/kernels/lcal/src/tiling/allgatherv2_tiling_91093.cpp similarity index 100% rename from comm/lcal/src/tiling/allgatherv2_tiling_91093.cpp rename to src/kernels/lcal/src/tiling/allgatherv2_tiling_91093.cpp diff --git a/comm/lcal/src/tiling/allgatherv2_tiling_910B.cpp b/src/kernels/lcal/src/tiling/allgatherv2_tiling_910B.cpp similarity index 100% rename from comm/lcal/src/tiling/allgatherv2_tiling_910B.cpp rename to src/kernels/lcal/src/tiling/allgatherv2_tiling_910B.cpp diff --git a/comm/lcal/src/tiling/allreduce_tiling.cpp b/src/kernels/lcal/src/tiling/allreduce_tiling.cpp similarity index 100% rename from comm/lcal/src/tiling/allreduce_tiling.cpp rename to src/kernels/lcal/src/tiling/allreduce_tiling.cpp diff --git a/comm/lcal/src/tiling/allreduce_tiling_91093.cpp b/src/kernels/lcal/src/tiling/allreduce_tiling_91093.cpp similarity index 100% rename from comm/lcal/src/tiling/allreduce_tiling_91093.cpp rename to src/kernels/lcal/src/tiling/allreduce_tiling_91093.cpp diff --git a/comm/lcal/src/tiling/allreduce_tiling_910B.cpp b/src/kernels/lcal/src/tiling/allreduce_tiling_910B.cpp similarity index 100% rename from comm/lcal/src/tiling/allreduce_tiling_910B.cpp rename to src/kernels/lcal/src/tiling/allreduce_tiling_910B.cpp diff --git a/comm/lcal/src/tiling/alltoall_allgather_hidden_tiling.cpp b/src/kernels/lcal/src/tiling/alltoall_allgather_hidden_tiling.cpp similarity index 100% rename from comm/lcal/src/tiling/alltoall_allgather_hidden_tiling.cpp rename to 
src/kernels/lcal/src/tiling/alltoall_allgather_hidden_tiling.cpp diff --git a/comm/lcal/src/tiling/alltoall_allgather_tiling.cpp b/src/kernels/lcal/src/tiling/alltoall_allgather_tiling.cpp similarity index 100% rename from comm/lcal/src/tiling/alltoall_allgather_tiling.cpp rename to src/kernels/lcal/src/tiling/alltoall_allgather_tiling.cpp diff --git a/comm/lcal/src/tiling/reducescatter_alltoall_hidden_tiling.cpp b/src/kernels/lcal/src/tiling/reducescatter_alltoall_hidden_tiling.cpp similarity index 100% rename from comm/lcal/src/tiling/reducescatter_alltoall_hidden_tiling.cpp rename to src/kernels/lcal/src/tiling/reducescatter_alltoall_hidden_tiling.cpp diff --git a/comm/lcal/src/tiling/reducescatter_tiling.cpp b/src/kernels/lcal/src/tiling/reducescatter_tiling.cpp similarity index 100% rename from comm/lcal/src/tiling/reducescatter_tiling.cpp rename to src/kernels/lcal/src/tiling/reducescatter_tiling.cpp diff --git a/comm/lcal/src/tiling/reducescatter_tiling_91093.cpp b/src/kernels/lcal/src/tiling/reducescatter_tiling_91093.cpp similarity index 100% rename from comm/lcal/src/tiling/reducescatter_tiling_91093.cpp rename to src/kernels/lcal/src/tiling/reducescatter_tiling_91093.cpp diff --git a/comm/lcal/src/tiling/reducescatter_tiling_910B.cpp b/src/kernels/lcal/src/tiling/reducescatter_tiling_910B.cpp similarity index 100% rename from comm/lcal/src/tiling/reducescatter_tiling_910B.cpp rename to src/kernels/lcal/src/tiling/reducescatter_tiling_910B.cpp diff --git a/comm/lcal/src/tiling/tiling.cpp b/src/kernels/lcal/src/tiling/tiling.cpp similarity index 100% rename from comm/lcal/src/tiling/tiling.cpp rename to src/kernels/lcal/src/tiling/tiling.cpp diff --git a/comm/lcal/src/tiling/tiling_args.cpp b/src/kernels/lcal/src/tiling/tiling_args.cpp similarity index 100% rename from comm/lcal/src/tiling/tiling_args.cpp rename to src/kernels/lcal/src/tiling/tiling_args.cpp diff --git a/comm/lcal/src/tiling/tiling_func.cpp b/src/kernels/lcal/src/tiling/tiling_func.cpp 
similarity index 100% rename from comm/lcal/src/tiling/tiling_func.cpp rename to src/kernels/lcal/src/tiling/tiling_func.cpp diff --git a/comm/lcal/src/tools/socket/lcal_sock_exchange.cpp b/src/kernels/lcal/src/tools/socket/lcal_sock_exchange.cpp similarity index 100% rename from comm/lcal/src/tools/socket/lcal_sock_exchange.cpp rename to src/kernels/lcal/src/tools/socket/lcal_sock_exchange.cpp diff --git a/comm/lcal/src/tools/socket/lcal_sock_exchange.h b/src/kernels/lcal/src/tools/socket/lcal_sock_exchange.h similarity index 100% rename from comm/lcal/src/tools/socket/lcal_sock_exchange.h rename to src/kernels/lcal/src/tools/socket/lcal_sock_exchange.h diff --git a/src/cinterface/atb_acl_fused_add_topk_div.cpp b/src/ops_infer/fused_add_topk_div/atb_acl_fused_add_topk_div.cpp similarity index 99% rename from src/cinterface/atb_acl_fused_add_topk_div.cpp rename to src/ops_infer/fused_add_topk_div/atb_acl_fused_add_topk_div.cpp index 65f84243..c55b1d01 100644 --- a/src/cinterface/atb_acl_fused_add_topk_div.cpp +++ b/src/ops_infer/fused_add_topk_div/atb_acl_fused_add_topk_div.cpp @@ -8,7 +8,7 @@ * See LICENSE in the root of the software repository for the full text of the License. */ #include "atb/atb_acl.h" -#include "atb_acl_util.h" +#include "atb/utils/atb_acl_util.h" #include "atb/operation/operation_base.h" #ifdef __cplusplus diff --git a/src/cinterface/atb_acl_mla_preprocess.cpp b/src/ops_infer/mla_preprocess/atb_acl_mla_preprocess.cpp similarity index 97% rename from src/cinterface/atb_acl_mla_preprocess.cpp rename to src/ops_infer/mla_preprocess/atb_acl_mla_preprocess.cpp index 32bd22c6..bac5171a 100644 --- a/src/cinterface/atb_acl_mla_preprocess.cpp +++ b/src/ops_infer/mla_preprocess/atb_acl_mla_preprocess.cpp @@ -1,179 +1,179 @@ -/* - * Copyright (c) 2025 Huawei Technologies Co., Ltd. - * This file is a part of the CANN Open Software. - * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). 
- * Please refer to the License for details. You may not use this file except in compliance with the License. - * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, - * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. - * See LICENSE in the root of the software repository for the full text of the License. - */ -#include "atb/atb_acl.h" -#include "atb_acl_util.h" -#include "atb/operation/operation_base.h" - -#ifdef __cplusplus -extern "C" { -#endif - -const size_t g_MLAPPINTENSORNUM = 24; -const size_t g_MLAPPOUTTENSORNUMCACHEMODE = 4; -const size_t g_MLAPPOUTTENSORNUM = 2; - -atb::Status AtbMLAPreprocessGetWorkspaceSize( - const aclTensor *input, const aclTensor *gamma0, const aclTensor *beta0, const aclTensor *quantScale0, - const aclTensor *quantOffset0, const aclTensor *wdqkv, const aclTensor *deScale0, const aclTensor *bias0, - const aclTensor *gamma1, const aclTensor *beta1, const aclTensor *quantScale1, const aclTensor *quantOffset1, - const aclTensor *wuq, const aclTensor *deScale1, const aclTensor *bias1, const aclTensor *gamma2, - const aclTensor *cos, const aclTensor *sin, const aclTensor *wuk, const aclTensor *kvCache, - const aclTensor *kvCacheRope, const aclTensor *slotmapping, const aclTensor *ctkvScale, const aclTensor *qNopeScale, - uint32_t wdqDim, uint32_t qRopeDim, uint32_t kRopeDim, float epsilon, uint32_t qRotaryCoeff, uint32_t kRotaryCoeff, - bool transposeWdq, bool transposeWuq, bool transposeWuk, uint8_t cacheMode, uint16_t quantMode, aclTensor *qOut0, - aclTensor *kvCacheOut0, aclTensor *qOut1, aclTensor *kvCacheOut1, uint64_t *workspaceSize, atb::Operation **op, - atb::Context *context) -{ - atb::infer::MlaPreprocessParam param; - param.wdqDim = wdqDim; - param.qRopeDim = qRopeDim; - param.kRopeDim = kRopeDim; - param.epsilon = epsilon; - param.qRotaryCoeff = static_cast(qRotaryCoeff); - param.kRotaryCoeff = static_cast(kRotaryCoeff); 
- param.transposeWdq = transposeWdq; - param.transposeWuq = transposeWuq; - param.transposeWuk = transposeWuk; - param.cacheMode = atb::infer::MlaPreprocessParam::CacheMode(cacheMode); - param.quantMode = atb::infer::MlaPreprocessParam::QuantMode(quantMode); - - if (op != nullptr && *op == nullptr) { - auto st = CreateOperation(param, op); - if (st != atb::NO_ERROR) { - ATB_LOG(ERROR) << "Create MLAPreprocess Operation failed!"; - return st; - } - } - atb::VariantPack pack; - size_t i = 0; - pack.inTensors.resize(g_MLAPPINTENSORNUM); - auto status = aclTensorToAtbTensor(input, &(pack.inTensors[i++])); - ATB_CHECK(status == atb::NO_ERROR, "input create failed!", return status); - status = aclTensorToAtbTensor(gamma0, &(pack.inTensors[i++])); - ATB_CHECK(status == atb::NO_ERROR, "gamma0 create failed!", return status); - status = aclTensorToAtbTensor(beta0, &(pack.inTensors[i++])); - ATB_CHECK(status == atb::NO_ERROR, "beta0 create failed!", return status); - if (param.quantMode == atb::infer::MlaPreprocessParam::QuantMode::PER_TENSOR_QUANT_ASYMM) { - status = aclTensorToAtbTensor(quantScale0, &(pack.inTensors[i++])); - ATB_CHECK(status == atb::NO_ERROR, "quantScale0 create failed!", return status); - status = aclTensorToAtbTensor(quantOffset0, &(pack.inTensors[i++])); - ATB_CHECK(status == atb::NO_ERROR, "quantOffset0 create failed!", return status); - } else { - status = aclTensorToAtbTensor(nullptr, &(pack.inTensors[i++])); - status = aclTensorToAtbTensor(nullptr, &(pack.inTensors[i++])); - } - status = aclTensorToAtbTensor(wdqkv, &(pack.inTensors[i++])); - ATB_CHECK(status == atb::NO_ERROR, "wdqkv create failed!", return status); - status = aclTensorToAtbTensor(deScale0, &(pack.inTensors[i++])); - ATB_CHECK(status == atb::NO_ERROR, "deScale0 create failed!", return status); - if (param.quantMode != atb::infer::MlaPreprocessParam::QuantMode::PER_TOKEN_QUANT_SYMM && - param.quantMode != atb::infer::MlaPreprocessParam::QuantMode::UNQUANT) { - status = 
aclTensorToAtbTensor(bias0, &(pack.inTensors[i++])); - ATB_CHECK(status == atb::NO_ERROR, "bias0 create failed!", return status); - } else { - status = aclTensorToAtbTensor(nullptr, &(pack.inTensors[i++])); - } - status = aclTensorToAtbTensor(gamma1, &(pack.inTensors[i++])); - ATB_CHECK(status == atb::NO_ERROR, "gamma1 create failed!", return status); - status = aclTensorToAtbTensor(beta1, &(pack.inTensors[i++])); - ATB_CHECK(status == atb::NO_ERROR, "beta1 create failed!", return status); - - if (param.quantMode == atb::infer::MlaPreprocessParam::QuantMode::PER_TENSOR_QUANT_ASYMM) { - status = aclTensorToAtbTensor(quantScale1, &(pack.inTensors[i++])); - ATB_CHECK(status == atb::NO_ERROR, "quantScale1 create failed!", return status); - status = aclTensorToAtbTensor(quantOffset1, &(pack.inTensors[i++])); - ATB_CHECK(status == atb::NO_ERROR, "quantOffset1 create failed!", return status); - } else { - status = aclTensorToAtbTensor(nullptr, &(pack.inTensors[i++])); - status = aclTensorToAtbTensor(nullptr, &(pack.inTensors[i++])); - } - status = aclTensorToAtbTensor(wuq, &(pack.inTensors[i++])); - ATB_CHECK(status == atb::NO_ERROR, "wuq create failed!", return status); - status = aclTensorToAtbTensor(deScale1, &(pack.inTensors[i++])); - ATB_CHECK(status == atb::NO_ERROR, "deScale1 create failed!", return status); - if (param.quantMode != atb::infer::MlaPreprocessParam::QuantMode::PER_TOKEN_QUANT_SYMM && - param.quantMode != atb::infer::MlaPreprocessParam::QuantMode::UNQUANT) { - status = aclTensorToAtbTensor(bias1, &(pack.inTensors[i++])); - ATB_CHECK(status == atb::NO_ERROR, "bias1 create failed!", return status); - } else { - status = aclTensorToAtbTensor(nullptr, &(pack.inTensors[i++])); - } - status = aclTensorToAtbTensor(gamma2, &(pack.inTensors[i++])); - ATB_CHECK(status == atb::NO_ERROR, "gamma2 create failed!", return status); - - status = aclTensorToAtbTensor(cos, &(pack.inTensors[i++])); - ATB_CHECK(status == atb::NO_ERROR, "cos create failed!", return 
status); - - status = aclTensorToAtbTensor(sin, &(pack.inTensors[i++])); - ATB_CHECK(status == atb::NO_ERROR, "sin create failed!", return status); - - status = aclTensorToAtbTensor(wuk, &(pack.inTensors[i++])); - ATB_CHECK(status == atb::NO_ERROR, "wuk create failed!", return status); - - status = aclTensorToAtbTensor(kvCache, &(pack.inTensors[i++])); - ATB_CHECK(status == atb::NO_ERROR, "kvCache create failed!", return status); - - if (param.cacheMode != atb::infer::MlaPreprocessParam::CacheMode::KVCACHE) { - status = aclTensorToAtbTensor(kvCacheRope, &(pack.inTensors[i++])); - ATB_CHECK(status == atb::NO_ERROR, "kvCacheRope create failed!", return status); - } else { - status = aclTensorToAtbTensor(nullptr, &(pack.inTensors[i++])); - } - status = aclTensorToAtbTensor(slotmapping, &(pack.inTensors[i++])); - if (param.cacheMode == atb::infer::MlaPreprocessParam::CacheMode::INT8_NZCACHE) { - status = aclTensorToAtbTensor(ctkvScale, &(pack.inTensors[i++])); - ATB_CHECK(status == atb::NO_ERROR, "ctkvScale create failed!", return status); - status = aclTensorToAtbTensor(qNopeScale, &(pack.inTensors[i++])); - ATB_CHECK(status == atb::NO_ERROR, "qNopeScale create failed!", return status); - } else { - status = aclTensorToAtbTensor(nullptr, &(pack.inTensors[i++])); - status = aclTensorToAtbTensor(nullptr, &(pack.inTensors[i++])); - } - - i = 0; - if (param.cacheMode != atb::infer::MlaPreprocessParam::CacheMode::KVCACHE) { - pack.outTensors.resize(g_MLAPPOUTTENSORNUMCACHEMODE); - status = aclTensorToAtbTensor(qOut0, &(pack.outTensors[i++])); - ATB_CHECK(status == atb::NO_ERROR, "qOut0 create failed!", return status); - status = aclTensorToAtbTensor(kvCacheOut0, &(pack.outTensors[i++])); - ATB_CHECK(status == atb::NO_ERROR, "kvCacheOut0 create failed!", return status); - status = aclTensorToAtbTensor(qOut1, &(pack.outTensors[i++])); - ATB_CHECK(status == atb::NO_ERROR, "qOut1 create failed!", return status); - status = aclTensorToAtbTensor(kvCacheOut1, 
&(pack.outTensors[i++])); - ATB_CHECK(status == atb::NO_ERROR, "kvCacheOut1 create failed!", return status); - } else { - pack.outTensors.resize(g_MLAPPOUTTENSORNUM); - status = aclTensorToAtbTensor(qOut0, &(pack.outTensors[i++])); - ATB_CHECK(status == atb::NO_ERROR, "qOut0 create failed!", return status); - status = aclTensorToAtbTensor(kvCacheOut0, &(pack.outTensors[i++])); - ATB_CHECK(status == atb::NO_ERROR, "kvCacheOut0 create failed!", return status); - } - if (op == nullptr || *op == nullptr) { - ATB_LOG(ERROR) << "AtbMLAPreprocessGetWorkspaceSize opeartion pointer is nullptr!"; - return atb::ERROR_INVALID_OPERATION_ADDR; - } - atb::Status st = (*op)->Setup(pack, *workspaceSize, context); - ATB_CHECK(st == atb::NO_ERROR, "AtbMLAPreprocess Setup failed!", return st); - return atb::NO_ERROR; -} - -atb::Status AtbMLAPreprocess(void *workspace, uint64_t workspaceSize, atb::Operation *op, atb::Context *context) -{ - ATB_CHECK(op != nullptr, "AtbMLAPreprocess expect op pointer not to be null!", - return atb::ERROR_INVALID_OPERATION_ADDR); - atb::VariantPack pack; - atb::Status st = op->Execute(pack, (uint8_t *)(workspace), workspaceSize, context); - ATB_CHECK(st == atb::NO_ERROR, "AtbMLAPreprocess Execute failed!", return st); - return st; -} - -#ifdef __cplusplus -} -#endif +/* + * Copyright (c) 2025 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. 
+ */ +#include "atb/atb_acl.h" +#include "atb/utils/atb_acl_util.h" +#include "atb/operation/operation_base.h" + +#ifdef __cplusplus +extern "C" { +#endif + +const size_t g_MLAPPINTENSORNUM = 24; +const size_t g_MLAPPOUTTENSORNUMCACHEMODE = 4; +const size_t g_MLAPPOUTTENSORNUM = 2; + +atb::Status AtbMLAPreprocessGetWorkspaceSize( + const aclTensor *input, const aclTensor *gamma0, const aclTensor *beta0, const aclTensor *quantScale0, + const aclTensor *quantOffset0, const aclTensor *wdqkv, const aclTensor *deScale0, const aclTensor *bias0, + const aclTensor *gamma1, const aclTensor *beta1, const aclTensor *quantScale1, const aclTensor *quantOffset1, + const aclTensor *wuq, const aclTensor *deScale1, const aclTensor *bias1, const aclTensor *gamma2, + const aclTensor *cos, const aclTensor *sin, const aclTensor *wuk, const aclTensor *kvCache, + const aclTensor *kvCacheRope, const aclTensor *slotmapping, const aclTensor *ctkvScale, const aclTensor *qNopeScale, + uint32_t wdqDim, uint32_t qRopeDim, uint32_t kRopeDim, float epsilon, uint32_t qRotaryCoeff, uint32_t kRotaryCoeff, + bool transposeWdq, bool transposeWuq, bool transposeWuk, uint8_t cacheMode, uint16_t quantMode, aclTensor *qOut0, + aclTensor *kvCacheOut0, aclTensor *qOut1, aclTensor *kvCacheOut1, uint64_t *workspaceSize, atb::Operation **op, + atb::Context *context) +{ + atb::infer::MlaPreprocessParam param; + param.wdqDim = wdqDim; + param.qRopeDim = qRopeDim; + param.kRopeDim = kRopeDim; + param.epsilon = epsilon; + param.qRotaryCoeff = static_cast(qRotaryCoeff); + param.kRotaryCoeff = static_cast(kRotaryCoeff); + param.transposeWdq = transposeWdq; + param.transposeWuq = transposeWuq; + param.transposeWuk = transposeWuk; + param.cacheMode = atb::infer::MlaPreprocessParam::CacheMode(cacheMode); + param.quantMode = atb::infer::MlaPreprocessParam::QuantMode(quantMode); + + if (op != nullptr && *op == nullptr) { + auto st = CreateOperation(param, op); + if (st != atb::NO_ERROR) { + ATB_LOG(ERROR) << "Create 
MLAPreprocess Operation failed!"; + return st; + } + } + atb::VariantPack pack; + size_t i = 0; + pack.inTensors.resize(g_MLAPPINTENSORNUM); + auto status = aclTensorToAtbTensor(input, &(pack.inTensors[i++])); + ATB_CHECK(status == atb::NO_ERROR, "input create failed!", return status); + status = aclTensorToAtbTensor(gamma0, &(pack.inTensors[i++])); + ATB_CHECK(status == atb::NO_ERROR, "gamma0 create failed!", return status); + status = aclTensorToAtbTensor(beta0, &(pack.inTensors[i++])); + ATB_CHECK(status == atb::NO_ERROR, "beta0 create failed!", return status); + if (param.quantMode == atb::infer::MlaPreprocessParam::QuantMode::PER_TENSOR_QUANT_ASYMM) { + status = aclTensorToAtbTensor(quantScale0, &(pack.inTensors[i++])); + ATB_CHECK(status == atb::NO_ERROR, "quantScale0 create failed!", return status); + status = aclTensorToAtbTensor(quantOffset0, &(pack.inTensors[i++])); + ATB_CHECK(status == atb::NO_ERROR, "quantOffset0 create failed!", return status); + } else { + status = aclTensorToAtbTensor(nullptr, &(pack.inTensors[i++])); + status = aclTensorToAtbTensor(nullptr, &(pack.inTensors[i++])); + } + status = aclTensorToAtbTensor(wdqkv, &(pack.inTensors[i++])); + ATB_CHECK(status == atb::NO_ERROR, "wdqkv create failed!", return status); + status = aclTensorToAtbTensor(deScale0, &(pack.inTensors[i++])); + ATB_CHECK(status == atb::NO_ERROR, "deScale0 create failed!", return status); + if (param.quantMode != atb::infer::MlaPreprocessParam::QuantMode::PER_TOKEN_QUANT_SYMM && + param.quantMode != atb::infer::MlaPreprocessParam::QuantMode::UNQUANT) { + status = aclTensorToAtbTensor(bias0, &(pack.inTensors[i++])); + ATB_CHECK(status == atb::NO_ERROR, "bias0 create failed!", return status); + } else { + status = aclTensorToAtbTensor(nullptr, &(pack.inTensors[i++])); + } + status = aclTensorToAtbTensor(gamma1, &(pack.inTensors[i++])); + ATB_CHECK(status == atb::NO_ERROR, "gamma1 create failed!", return status); + status = aclTensorToAtbTensor(beta1, 
&(pack.inTensors[i++])); + ATB_CHECK(status == atb::NO_ERROR, "beta1 create failed!", return status); + + if (param.quantMode == atb::infer::MlaPreprocessParam::QuantMode::PER_TENSOR_QUANT_ASYMM) { + status = aclTensorToAtbTensor(quantScale1, &(pack.inTensors[i++])); + ATB_CHECK(status == atb::NO_ERROR, "quantScale1 create failed!", return status); + status = aclTensorToAtbTensor(quantOffset1, &(pack.inTensors[i++])); + ATB_CHECK(status == atb::NO_ERROR, "quantOffset1 create failed!", return status); + } else { + status = aclTensorToAtbTensor(nullptr, &(pack.inTensors[i++])); + status = aclTensorToAtbTensor(nullptr, &(pack.inTensors[i++])); + } + status = aclTensorToAtbTensor(wuq, &(pack.inTensors[i++])); + ATB_CHECK(status == atb::NO_ERROR, "wuq create failed!", return status); + status = aclTensorToAtbTensor(deScale1, &(pack.inTensors[i++])); + ATB_CHECK(status == atb::NO_ERROR, "deScale1 create failed!", return status); + if (param.quantMode != atb::infer::MlaPreprocessParam::QuantMode::PER_TOKEN_QUANT_SYMM && + param.quantMode != atb::infer::MlaPreprocessParam::QuantMode::UNQUANT) { + status = aclTensorToAtbTensor(bias1, &(pack.inTensors[i++])); + ATB_CHECK(status == atb::NO_ERROR, "bias1 create failed!", return status); + } else { + status = aclTensorToAtbTensor(nullptr, &(pack.inTensors[i++])); + } + status = aclTensorToAtbTensor(gamma2, &(pack.inTensors[i++])); + ATB_CHECK(status == atb::NO_ERROR, "gamma2 create failed!", return status); + + status = aclTensorToAtbTensor(cos, &(pack.inTensors[i++])); + ATB_CHECK(status == atb::NO_ERROR, "cos create failed!", return status); + + status = aclTensorToAtbTensor(sin, &(pack.inTensors[i++])); + ATB_CHECK(status == atb::NO_ERROR, "sin create failed!", return status); + + status = aclTensorToAtbTensor(wuk, &(pack.inTensors[i++])); + ATB_CHECK(status == atb::NO_ERROR, "wuk create failed!", return status); + + status = aclTensorToAtbTensor(kvCache, &(pack.inTensors[i++])); + ATB_CHECK(status == atb::NO_ERROR, "kvCache 
create failed!", return status); + + if (param.cacheMode != atb::infer::MlaPreprocessParam::CacheMode::KVCACHE) { + status = aclTensorToAtbTensor(kvCacheRope, &(pack.inTensors[i++])); + ATB_CHECK(status == atb::NO_ERROR, "kvCacheRope create failed!", return status); + } else { + status = aclTensorToAtbTensor(nullptr, &(pack.inTensors[i++])); + } + status = aclTensorToAtbTensor(slotmapping, &(pack.inTensors[i++])); + if (param.cacheMode == atb::infer::MlaPreprocessParam::CacheMode::INT8_NZCACHE) { + status = aclTensorToAtbTensor(ctkvScale, &(pack.inTensors[i++])); + ATB_CHECK(status == atb::NO_ERROR, "ctkvScale create failed!", return status); + status = aclTensorToAtbTensor(qNopeScale, &(pack.inTensors[i++])); + ATB_CHECK(status == atb::NO_ERROR, "qNopeScale create failed!", return status); + } else { + status = aclTensorToAtbTensor(nullptr, &(pack.inTensors[i++])); + status = aclTensorToAtbTensor(nullptr, &(pack.inTensors[i++])); + } + + i = 0; + if (param.cacheMode != atb::infer::MlaPreprocessParam::CacheMode::KVCACHE) { + pack.outTensors.resize(g_MLAPPOUTTENSORNUMCACHEMODE); + status = aclTensorToAtbTensor(qOut0, &(pack.outTensors[i++])); + ATB_CHECK(status == atb::NO_ERROR, "qOut0 create failed!", return status); + status = aclTensorToAtbTensor(kvCacheOut0, &(pack.outTensors[i++])); + ATB_CHECK(status == atb::NO_ERROR, "kvCacheOut0 create failed!", return status); + status = aclTensorToAtbTensor(qOut1, &(pack.outTensors[i++])); + ATB_CHECK(status == atb::NO_ERROR, "qOut1 create failed!", return status); + status = aclTensorToAtbTensor(kvCacheOut1, &(pack.outTensors[i++])); + ATB_CHECK(status == atb::NO_ERROR, "kvCacheOut1 create failed!", return status); + } else { + pack.outTensors.resize(g_MLAPPOUTTENSORNUM); + status = aclTensorToAtbTensor(qOut0, &(pack.outTensors[i++])); + ATB_CHECK(status == atb::NO_ERROR, "qOut0 create failed!", return status); + status = aclTensorToAtbTensor(kvCacheOut0, &(pack.outTensors[i++])); + ATB_CHECK(status == atb::NO_ERROR, 
"kvCacheOut0 create failed!", return status); + } + if (op == nullptr || *op == nullptr) { + ATB_LOG(ERROR) << "AtbMLAPreprocessGetWorkspaceSize operation pointer is nullptr!"; + return atb::ERROR_INVALID_OPERATION_ADDR; + } + atb::Status st = (*op)->Setup(pack, *workspaceSize, context); + ATB_CHECK(st == atb::NO_ERROR, "AtbMLAPreprocess Setup failed!", return st); + return atb::NO_ERROR; +} + +atb::Status AtbMLAPreprocess(void *workspace, uint64_t workspaceSize, atb::Operation *op, atb::Context *context) +{ + ATB_CHECK(op != nullptr, "AtbMLAPreprocess expect op pointer not to be null!", + return atb::ERROR_INVALID_OPERATION_ADDR); + atb::VariantPack pack; + atb::Status st = op->Execute(pack, (uint8_t *)(workspace), workspaceSize, context); + ATB_CHECK(st == atb::NO_ERROR, "AtbMLAPreprocess Execute failed!", return st); + return st; +} + +#ifdef __cplusplus +} +#endif diff --git a/src/cinterface/atb_acl_mla.cpp b/src/ops_infer/multi_latent_attention/atb_acl_mla.cpp similarity index 97% rename from src/cinterface/atb_acl_mla.cpp rename to src/ops_infer/multi_latent_attention/atb_acl_mla.cpp index e050b04c..54b9f909 100644 --- a/src/cinterface/atb_acl_mla.cpp +++ b/src/ops_infer/multi_latent_attention/atb_acl_mla.cpp @@ -1,201 +1,201 @@ -/* - * Copyright (c) 2025 Huawei Technologies Co., Ltd. - * This file is a part of the CANN Open Software. - * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). - * Please refer to the License for details. You may not use this file except in compliance with the License. - * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, - * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. - * See LICENSE in the root of the software repository for the full text of the License. 
- */ -#include "atb/atb_acl.h" -#include "atb_acl_util.h" -#include "atb/operation/operation_base.h" - -#ifdef __cplusplus -extern "C" { -#endif - -const size_t g_MLAINTENSORNUMINT8NOMASK = 9; -const size_t g_MLAINTENSORNUMINT8MASK = 10; -const size_t g_MLAINTENSORNUMNOMASK = 7; -const size_t g_MLAINTENSORNUMMASK = 8; -const size_t g_MLAOUTTENSORNUMCALCRING = 2; -const size_t g_MLAOUTTENSORNUMNOCALCRING = 1; - -atb::Status AtbMLAGetWorkspaceSize(const aclTensor *qNope, const aclTensor *qRope, const aclTensor *ctKV, - const aclTensor *kRope, const aclTensor *blockTables, const aclTensor *contextLens, - const aclTensor *mask, const aclTensor *qSeqLen, const aclTensor *qkDescale, - const aclTensor *pvDescale, int32_t headNum, float qkScale, int32_t kvHeadNum, - int maskType, int calcType, uint8_t cacheMode, aclTensor *attenOut, aclTensor *lse, - uint64_t *workspaceSize, atb::Operation **op, atb::Context *context) -{ - atb::infer::MultiLatentAttentionParam param; - param.headNum = headNum; - param.qkScale = qkScale; - param.kvHeadNum = kvHeadNum; - param.maskType = atb::infer::MultiLatentAttentionParam::MaskType(maskType); - param.calcType = atb::infer::MultiLatentAttentionParam::CalcType(calcType); - param.cacheMode = atb::infer::MultiLatentAttentionParam::CacheMode(cacheMode); - if (op != nullptr && *op == nullptr) { - auto st = CreateOperation(param, op); - if (st != atb::NO_ERROR) { - ATB_LOG(ERROR) << "Create MLA Operation failed!"; - return st; - } - } - atb::VariantPack pack; - size_t i = 0; - size_t counter = 0; - if (param.cacheMode == atb::infer::MultiLatentAttentionParam::CacheMode::INT8_NZCACHE) { - if (param.maskType == atb::infer::MultiLatentAttentionParam::MaskType::UNDEFINED) { - pack.inTensors.resize(g_MLAINTENSORNUMINT8NOMASK); - counter = g_MLAINTENSORNUMINT8NOMASK; - } else { - pack.inTensors.resize(g_MLAINTENSORNUMINT8MASK); - counter = g_MLAINTENSORNUMINT8MASK; - } - } else { - if (param.maskType == 
atb::infer::MultiLatentAttentionParam::MaskType::UNDEFINED) { - pack.inTensors.resize(g_MLAINTENSORNUMNOMASK); - counter = g_MLAINTENSORNUMNOMASK; - } else { - pack.inTensors.resize(g_MLAINTENSORNUMMASK); - counter = g_MLAINTENSORNUMMASK; - } - } - if (param.calcType != atb::infer::MultiLatentAttentionParam::CalcType::CALC_TYPE_SPEC) { - pack.inTensors.resize(counter - 1); - } - auto status = aclTensorToAtbTensor(qNope, &(pack.inTensors[i++])); - ATB_CHECK(status == atb::NO_ERROR, "qNope create failed!", return status); - status = aclTensorToAtbTensor(qRope, &(pack.inTensors[i++])); - ATB_CHECK(status == atb::NO_ERROR, "qRope create failed!", return status); - status = aclTensorToAtbTensor(ctKV, &(pack.inTensors[i++])); - ATB_CHECK(status == atb::NO_ERROR, "ctKV create failed!", return status); - status = aclTensorToAtbTensor(kRope, &(pack.inTensors[i++])); - ATB_CHECK(status == atb::NO_ERROR, "kRope create failed!", return status); - status = aclTensorToAtbTensor(blockTables, &(pack.inTensors[i++])); - ATB_CHECK(status == atb::NO_ERROR, "blockTables create failed!", return status); - status = aclTensorToAtbTensorHost(contextLens, &(pack.inTensors[i++])); - ATB_CHECK(status == atb::NO_ERROR, "contextLens create failed!", return status); - - if (param.maskType != atb::infer::MultiLatentAttentionParam::MaskType::UNDEFINED) { - status = aclTensorToAtbTensor(mask, &(pack.inTensors[i++])); - ATB_CHECK(status == atb::NO_ERROR, "mask create failed!", return status); - } - if (param.calcType == atb::infer::MultiLatentAttentionParam::CalcType::CALC_TYPE_SPEC) { - status = aclTensorToAtbTensorHost(qSeqLen, &(pack.inTensors[i++])); - ATB_CHECK(status == atb::NO_ERROR, "qSeqLen create failed!", return status); - } - if (param.cacheMode == atb::infer::MultiLatentAttentionParam::CacheMode::INT8_NZCACHE) { - status = aclTensorToAtbTensor(qkDescale, &(pack.inTensors[i++])); - ATB_CHECK(status == atb::NO_ERROR, "qkDescale create failed!", return status); - status = 
aclTensorToAtbTensor(pvDescale, &(pack.inTensors[i++])); - ATB_CHECK(status == atb::NO_ERROR, "pvDescale create failed!", return status); - } - i = 0; - if (param.calcType != atb::infer::MultiLatentAttentionParam::CalcType::CALC_TYPE_RING) { - pack.outTensors.resize(g_MLAOUTTENSORNUMNOCALCRING); - status = aclTensorToAtbTensor(attenOut, &(pack.outTensors[i++])); - ATB_CHECK(status == atb::NO_ERROR, "attenOut create failed!", return status); - } else { - pack.outTensors.resize(g_MLAOUTTENSORNUMCALCRING); - status = aclTensorToAtbTensor(attenOut, &(pack.outTensors[i++])); - ATB_CHECK(status == atb::NO_ERROR, "calc_type_ring attenOut create failed!", return status); - status = aclTensorToAtbTensor(lse, &(pack.outTensors[i++])); - ATB_CHECK(status == atb::NO_ERROR, "calc_type_ring lse create failed!", return status); - } - if (op == nullptr || *op == nullptr) { - ATB_LOG(ERROR) << "AtbMLAGetWorkspaceSize opeartion pointer is nullptr!"; - return atb::ERROR_INVALID_OPERATION_ADDR; - } - atb::Status st = (*op)->Setup(pack, *workspaceSize, context); - ATB_CHECK(st == atb::NO_ERROR, "AtbMLA Setup failed!", return st); - return atb::NO_ERROR; -} - -atb::Status AtbMLA(void *workSpcace, uint64_t workspaceSize, atb::Operation *op, atb::Context *context) -{ - atb::VariantPack pack; - atb::Status st = op->Execute(pack, (uint8_t *)(workSpcace), workspaceSize, context); - ATB_CHECK(st == atb::NO_ERROR, "AtbMLA Execute failed!", return st); - return st; -} - - -atb::Status AtbMLAPreFillGetWorkspaceSize(const aclTensor *q, const aclTensor *qRope, const aclTensor *k, - const aclTensor *kRope, const aclTensor *v, const aclTensor *qSeqLen, - const aclTensor *kvSeqLen, const aclTensor *mask, int32_t headNum, - float qkScale, int32_t kvHeadNum, int maskType, uint8_t cacheMode, - aclTensor *attenOut, uint64_t *workspaceSize, atb::Operation **op, - atb::Context *context) -{ - atb::infer::MultiLatentAttentionParam param; - param.headNum = headNum; - param.qkScale = qkScale; - param.kvHeadNum 
= kvHeadNum; - param.maskType = atb::infer::MultiLatentAttentionParam::MaskType(maskType); - param.calcType = atb::infer::MultiLatentAttentionParam::CalcType::CALC_TYPE_PREFILL; - param.cacheMode = atb::infer::MultiLatentAttentionParam::CacheMode(cacheMode); - if (op != nullptr && *op == nullptr) { - auto st = CreateOperation(param, op); - if (st != atb::NO_ERROR) { - ATB_LOG(ERROR) << "Create MLA Operation prefill failed!"; - return st; - } - } - atb::VariantPack pack; - size_t i = 0; - - if (param.maskType == atb::infer::MultiLatentAttentionParam::MaskType::UNDEFINED) { - pack.inTensors.resize(g_MLAINTENSORNUMNOMASK); - } else { - pack.inTensors.resize(g_MLAINTENSORNUMMASK); - } - - auto status = aclTensorToAtbTensor(q, &(pack.inTensors[i++])); - ATB_CHECK(status == atb::NO_ERROR, "qNope create failed!", return status); - status = aclTensorToAtbTensor(qRope, &(pack.inTensors[i++])); - ATB_CHECK(status == atb::NO_ERROR, "qRope create failed!", return status); - status = aclTensorToAtbTensor(k, &(pack.inTensors[i++])); - ATB_CHECK(status == atb::NO_ERROR, "key create failed!", return status); - status = aclTensorToAtbTensor(kRope, &(pack.inTensors[i++])); - ATB_CHECK(status == atb::NO_ERROR, "kRope create failed!", return status); - status = aclTensorToAtbTensor(v, &(pack.inTensors[i++])); - ATB_CHECK(status == atb::NO_ERROR, "value create failed!", return status); - status = aclTensorToAtbTensorHost(qSeqLen, &(pack.inTensors[i++])); - ATB_CHECK(status == atb::NO_ERROR, "qSeqLen create failed!", return status); - status = aclTensorToAtbTensorHost(kvSeqLen, &(pack.inTensors[i++])); - ATB_CHECK(status == atb::NO_ERROR, "kvSeqLen create failed!", return status); - - if (param.maskType == atb::infer::MultiLatentAttentionParam::MaskType::MASK_TYPE_MASK_FREE) { - status = aclTensorToAtbTensor(mask, &(pack.inTensors[i++])); - ATB_CHECK(status == atb::NO_ERROR, "mask create failed!", return status); - } - - pack.outTensors.resize(g_MLAOUTTENSORNUMNOCALCRING); - status = 
aclTensorToAtbTensor(attenOut, &(pack.outTensors[0])); - ATB_CHECK(status == atb::NO_ERROR, "attenOut create failed!", return status); - - if (op == nullptr || *op == nullptr) { - ATB_LOG(ERROR) << "AtbMLAPreFillGetWorkspaceSize opeartion pointer is nullptr!"; - return atb::ERROR_INVALID_OPERATION_ADDR; - } - atb::Status st = (*op)->Setup(pack, *workspaceSize, context); - ATB_CHECK(st == atb::NO_ERROR, "AtbMLAPreFill Setup failed!", return st); - return atb::NO_ERROR; -} - -atb::Status AtbMLAPreFill(void *workspace, uint64_t workspaceSize, atb::Operation *op, atb::Context *context) -{ - ATB_CHECK(op != nullptr, "AtbMLAPreFill expect op pointer not to be null!", - return atb::ERROR_INVALID_OPERATION_ADDR); - atb::VariantPack pack; - atb::Status st = op->Execute(pack, (uint8_t *)(workspace), workspaceSize, context); - ATB_CHECK(st == atb::NO_ERROR, "AtbMLAPreFill Execute failed!", return st); - return st; -} - -#ifdef __cplusplus -} -#endif +/* + * Copyright (c) 2025 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. 
+ */ +#include "atb/atb_acl.h" +#include "atb/utils/atb_acl_util.h" +#include "atb/operation/operation_base.h" + +#ifdef __cplusplus +extern "C" { +#endif + +const size_t g_MLAINTENSORNUMINT8NOMASK = 9; +const size_t g_MLAINTENSORNUMINT8MASK = 10; +const size_t g_MLAINTENSORNUMNOMASK = 7; +const size_t g_MLAINTENSORNUMMASK = 8; +const size_t g_MLAOUTTENSORNUMCALCRING = 2; +const size_t g_MLAOUTTENSORNUMNOCALCRING = 1; + +atb::Status AtbMLAGetWorkspaceSize(const aclTensor *qNope, const aclTensor *qRope, const aclTensor *ctKV, + const aclTensor *kRope, const aclTensor *blockTables, const aclTensor *contextLens, + const aclTensor *mask, const aclTensor *qSeqLen, const aclTensor *qkDescale, + const aclTensor *pvDescale, int32_t headNum, float qkScale, int32_t kvHeadNum, + int maskType, int calcType, uint8_t cacheMode, aclTensor *attenOut, aclTensor *lse, + uint64_t *workspaceSize, atb::Operation **op, atb::Context *context) +{ + atb::infer::MultiLatentAttentionParam param; + param.headNum = headNum; + param.qkScale = qkScale; + param.kvHeadNum = kvHeadNum; + param.maskType = atb::infer::MultiLatentAttentionParam::MaskType(maskType); + param.calcType = atb::infer::MultiLatentAttentionParam::CalcType(calcType); + param.cacheMode = atb::infer::MultiLatentAttentionParam::CacheMode(cacheMode); + if (op != nullptr && *op == nullptr) { + auto st = CreateOperation(param, op); + if (st != atb::NO_ERROR) { + ATB_LOG(ERROR) << "Create MLA Operation failed!"; + return st; + } + } + atb::VariantPack pack; + size_t i = 0; + size_t counter = 0; + if (param.cacheMode == atb::infer::MultiLatentAttentionParam::CacheMode::INT8_NZCACHE) { + if (param.maskType == atb::infer::MultiLatentAttentionParam::MaskType::UNDEFINED) { + pack.inTensors.resize(g_MLAINTENSORNUMINT8NOMASK); + counter = g_MLAINTENSORNUMINT8NOMASK; + } else { + pack.inTensors.resize(g_MLAINTENSORNUMINT8MASK); + counter = g_MLAINTENSORNUMINT8MASK; + } + } else { + if (param.maskType == 
atb::infer::MultiLatentAttentionParam::MaskType::UNDEFINED) { + pack.inTensors.resize(g_MLAINTENSORNUMNOMASK); + counter = g_MLAINTENSORNUMNOMASK; + } else { + pack.inTensors.resize(g_MLAINTENSORNUMMASK); + counter = g_MLAINTENSORNUMMASK; + } + } + if (param.calcType != atb::infer::MultiLatentAttentionParam::CalcType::CALC_TYPE_SPEC) { + pack.inTensors.resize(counter - 1); + } + auto status = aclTensorToAtbTensor(qNope, &(pack.inTensors[i++])); + ATB_CHECK(status == atb::NO_ERROR, "qNope create failed!", return status); + status = aclTensorToAtbTensor(qRope, &(pack.inTensors[i++])); + ATB_CHECK(status == atb::NO_ERROR, "qRope create failed!", return status); + status = aclTensorToAtbTensor(ctKV, &(pack.inTensors[i++])); + ATB_CHECK(status == atb::NO_ERROR, "ctKV create failed!", return status); + status = aclTensorToAtbTensor(kRope, &(pack.inTensors[i++])); + ATB_CHECK(status == atb::NO_ERROR, "kRope create failed!", return status); + status = aclTensorToAtbTensor(blockTables, &(pack.inTensors[i++])); + ATB_CHECK(status == atb::NO_ERROR, "blockTables create failed!", return status); + status = aclTensorToAtbTensorHost(contextLens, &(pack.inTensors[i++])); + ATB_CHECK(status == atb::NO_ERROR, "contextLens create failed!", return status); + + if (param.maskType != atb::infer::MultiLatentAttentionParam::MaskType::UNDEFINED) { + status = aclTensorToAtbTensor(mask, &(pack.inTensors[i++])); + ATB_CHECK(status == atb::NO_ERROR, "mask create failed!", return status); + } + if (param.calcType == atb::infer::MultiLatentAttentionParam::CalcType::CALC_TYPE_SPEC) { + status = aclTensorToAtbTensorHost(qSeqLen, &(pack.inTensors[i++])); + ATB_CHECK(status == atb::NO_ERROR, "qSeqLen create failed!", return status); + } + if (param.cacheMode == atb::infer::MultiLatentAttentionParam::CacheMode::INT8_NZCACHE) { + status = aclTensorToAtbTensor(qkDescale, &(pack.inTensors[i++])); + ATB_CHECK(status == atb::NO_ERROR, "qkDescale create failed!", return status); + status = 
 aclTensorToAtbTensor(pvDescale, &(pack.inTensors[i++])); + ATB_CHECK(status == atb::NO_ERROR, "pvDescale create failed!", return status); + } + i = 0; + if (param.calcType != atb::infer::MultiLatentAttentionParam::CalcType::CALC_TYPE_RING) { + pack.outTensors.resize(g_MLAOUTTENSORNUMNOCALCRING); + status = aclTensorToAtbTensor(attenOut, &(pack.outTensors[i++])); + ATB_CHECK(status == atb::NO_ERROR, "attenOut create failed!", return status); + } else { + pack.outTensors.resize(g_MLAOUTTENSORNUMCALCRING); + status = aclTensorToAtbTensor(attenOut, &(pack.outTensors[i++])); + ATB_CHECK(status == atb::NO_ERROR, "calc_type_ring attenOut create failed!", return status); + status = aclTensorToAtbTensor(lse, &(pack.outTensors[i++])); + ATB_CHECK(status == atb::NO_ERROR, "calc_type_ring lse create failed!", return status); + } + if (op == nullptr || *op == nullptr) { + ATB_LOG(ERROR) << "AtbMLAGetWorkspaceSize operation pointer is nullptr!"; + return atb::ERROR_INVALID_OPERATION_ADDR; + } + atb::Status st = (*op)->Setup(pack, *workspaceSize, context); + ATB_CHECK(st == atb::NO_ERROR, "AtbMLA Setup failed!", return st); + return atb::NO_ERROR; +} + +atb::Status AtbMLA(void *workspace, uint64_t workspaceSize, atb::Operation *op, atb::Context *context) +{ + atb::VariantPack pack; + atb::Status st = op->Execute(pack, (uint8_t *)(workspace), workspaceSize, context); + ATB_CHECK(st == atb::NO_ERROR, "AtbMLA Execute failed!", return st); + return st; +} + + +atb::Status AtbMLAPreFillGetWorkspaceSize(const aclTensor *q, const aclTensor *qRope, const aclTensor *k, + const aclTensor *kRope, const aclTensor *v, const aclTensor *qSeqLen, + const aclTensor *kvSeqLen, const aclTensor *mask, int32_t headNum, + float qkScale, int32_t kvHeadNum, int maskType, uint8_t cacheMode, + aclTensor *attenOut, uint64_t *workspaceSize, atb::Operation **op, + atb::Context *context) +{ + atb::infer::MultiLatentAttentionParam param; + param.headNum = headNum; + param.qkScale = qkScale; + param.kvHeadNum 
= kvHeadNum; + param.maskType = atb::infer::MultiLatentAttentionParam::MaskType(maskType); + param.calcType = atb::infer::MultiLatentAttentionParam::CalcType::CALC_TYPE_PREFILL; + param.cacheMode = atb::infer::MultiLatentAttentionParam::CacheMode(cacheMode); + if (op != nullptr && *op == nullptr) { + auto st = CreateOperation(param, op); + if (st != atb::NO_ERROR) { + ATB_LOG(ERROR) << "Create MLA Operation prefill failed!"; + return st; + } + } + atb::VariantPack pack; + size_t i = 0; + + if (param.maskType == atb::infer::MultiLatentAttentionParam::MaskType::UNDEFINED) { + pack.inTensors.resize(g_MLAINTENSORNUMNOMASK); + } else { + pack.inTensors.resize(g_MLAINTENSORNUMMASK); + } + + auto status = aclTensorToAtbTensor(q, &(pack.inTensors[i++])); + ATB_CHECK(status == atb::NO_ERROR, "qNope create failed!", return status); + status = aclTensorToAtbTensor(qRope, &(pack.inTensors[i++])); + ATB_CHECK(status == atb::NO_ERROR, "qRope create failed!", return status); + status = aclTensorToAtbTensor(k, &(pack.inTensors[i++])); + ATB_CHECK(status == atb::NO_ERROR, "key create failed!", return status); + status = aclTensorToAtbTensor(kRope, &(pack.inTensors[i++])); + ATB_CHECK(status == atb::NO_ERROR, "kRope create failed!", return status); + status = aclTensorToAtbTensor(v, &(pack.inTensors[i++])); + ATB_CHECK(status == atb::NO_ERROR, "value create failed!", return status); + status = aclTensorToAtbTensorHost(qSeqLen, &(pack.inTensors[i++])); + ATB_CHECK(status == atb::NO_ERROR, "qSeqLen create failed!", return status); + status = aclTensorToAtbTensorHost(kvSeqLen, &(pack.inTensors[i++])); + ATB_CHECK(status == atb::NO_ERROR, "kvSeqLen create failed!", return status); + + if (param.maskType == atb::infer::MultiLatentAttentionParam::MaskType::MASK_TYPE_MASK_FREE) { + status = aclTensorToAtbTensor(mask, &(pack.inTensors[i++])); + ATB_CHECK(status == atb::NO_ERROR, "mask create failed!", return status); + } + + pack.outTensors.resize(g_MLAOUTTENSORNUMNOCALCRING); + status = 
aclTensorToAtbTensor(attenOut, &(pack.outTensors[0])); + ATB_CHECK(status == atb::NO_ERROR, "attenOut create failed!", return status); + + if (op == nullptr || *op == nullptr) { + ATB_LOG(ERROR) << "AtbMLAPreFillGetWorkspaceSize operation pointer is nullptr!"; + return atb::ERROR_INVALID_OPERATION_ADDR; + } + atb::Status st = (*op)->Setup(pack, *workspaceSize, context); + ATB_CHECK(st == atb::NO_ERROR, "AtbMLAPreFill Setup failed!", return st); + return atb::NO_ERROR; +} + +atb::Status AtbMLAPreFill(void *workspace, uint64_t workspaceSize, atb::Operation *op, atb::Context *context) +{ + ATB_CHECK(op != nullptr, "AtbMLAPreFill expect op pointer not to be null!", + return atb::ERROR_INVALID_OPERATION_ADDR); + atb::VariantPack pack; + atb::Status st = op->Execute(pack, (uint8_t *)(workspace), workspaceSize, context); + ATB_CHECK(st == atb::NO_ERROR, "AtbMLAPreFill Execute failed!", return st); + return st; +} + +#ifdef __cplusplus +} +#endif diff --git a/src/cinterface/atb_acl_paged_cache_load.cpp b/src/ops_infer/paged_cache_load/atb_acl_paged_cache_load.cpp similarity index 99% rename from src/cinterface/atb_acl_paged_cache_load.cpp rename to src/ops_infer/paged_cache_load/atb_acl_paged_cache_load.cpp index df6d86d5..07580b09 100644 --- a/src/cinterface/atb_acl_paged_cache_load.cpp +++ b/src/ops_infer/paged_cache_load/atb_acl_paged_cache_load.cpp @@ -8,7 +8,7 @@ * See LICENSE in the root of the software repository for the full text of the License. 
*/ #include "atb/atb_acl.h" -#include "atb_acl_util.h" +#include "atb/utils/atb_acl_util.h" #include "atb/operation/operation_base.h" #ifdef __cplusplus diff --git a/src/cinterface/atb_acl_ring_mla.cpp b/src/ops_infer/ring_mla/atb_acl_ring_mla.cpp similarity index 97% rename from src/cinterface/atb_acl_ring_mla.cpp rename to src/ops_infer/ring_mla/atb_acl_ring_mla.cpp index 62468810..041888a1 100644 --- a/src/cinterface/atb_acl_ring_mla.cpp +++ b/src/ops_infer/ring_mla/atb_acl_ring_mla.cpp @@ -1,99 +1,99 @@ -/* - * Copyright (c) 2025 Huawei Technologies Co., Ltd. - * This file is a part of the CANN Open Software. - * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). - * Please refer to the License for details. You may not use this file except in compliance with the License. - * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, - * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. - * See LICENSE in the root of the software repository for the full text of the License. 
- */ -#include "atb/atb_acl.h" -#include "atb_acl_util.h" -#include "atb/operation/operation_base.h" - -#ifdef __cplusplus -extern "C" { -#endif - -const size_t g_RING_MLA_INTENSOR_NUM = 7; -const size_t g_RING_MLA_OUTTENSOR_NUM = 2; - -atb::Status AtbRingMLAGetWorkspaceSize(const aclTensor *querySplit1, const aclTensor *querySplit2, - const aclTensor *keySplit1, const aclTensor *keySplit2, const aclTensor *value, - const aclTensor *mask, const aclTensor *seqLen, const aclTensor *prevOut, - const aclTensor *prevLse, int32_t headNum, int32_t kvHeadNum, float qkScale, - int kernelType, int maskType, int inputLayout, int calcType, aclTensor *output, - aclTensor *softmaxLse, uint64_t *workspaceSize, atb::Operation **op, - atb::Context *context) -{ - atb::infer::RingMLAParam param; - param.headNum = headNum; - param.kvHeadNum = kvHeadNum; - param.qkScale = qkScale; - param.kernelType = atb::infer::RingMLAParam::KernelType(kernelType); - param.maskType = atb::infer::RingMLAParam::MaskType(maskType); - param.inputLayout = atb::infer::InputLayout(inputLayout); - param.calcType = atb::infer::RingMLAParam::CalcType(calcType); - if (op != nullptr && *op == nullptr) { - auto st = CreateOperation(param, op); - if (st != atb::NO_ERROR) { - ATB_LOG(ERROR) << "Create RingMLA Operation failed!"; - return st; - } - } - atb::VariantPack pack; - size_t index = 0; - if (param.calcType == atb::infer::RingMLAParam::CalcType::CALC_TYPE_DEFAULT) { - pack.inTensors.resize(g_RING_MLA_INTENSOR_NUM + 2); // 2: prevOut, prevLse - } else { - pack.inTensors.resize(g_RING_MLA_INTENSOR_NUM); - } - - auto status = aclTensorToAtbTensor(querySplit1, &(pack.inTensors[index++])); - ATB_CHECK(status == atb::NO_ERROR, "querySplit1 create failed!", return status); - status = aclTensorToAtbTensor(querySplit2, &(pack.inTensors[index++])); - ATB_CHECK(status == atb::NO_ERROR, "querySplit2 create failed!", return status); - status = aclTensorToAtbTensor(keySplit1, &(pack.inTensors[index++])); - 
ATB_CHECK(status == atb::NO_ERROR, "keySplit1 create failed!", return status); - status = aclTensorToAtbTensor(keySplit2, &(pack.inTensors[index++])); - ATB_CHECK(status == atb::NO_ERROR, "keySplit2 create failed!", return status); - status = aclTensorToAtbTensor(value, &(pack.inTensors[index++])); - ATB_CHECK(status == atb::NO_ERROR, "value create failed!", return status); - status = aclTensorToAtbTensor(mask, &(pack.inTensors[index++])); - ATB_CHECK(status == atb::NO_ERROR, "mask create failed!", return status); - status = aclTensorToAtbTensorHost(seqLen, &(pack.inTensors[index++])); - ATB_CHECK(status == atb::NO_ERROR, "seqLen create failed!", return status); - if (param.calcType == atb::infer::RingMLAParam::CalcType::CALC_TYPE_DEFAULT) { - status = aclTensorToAtbTensor(prevOut, &(pack.inTensors[index++])); - ATB_CHECK(status == atb::NO_ERROR, "prevOut create failed!", return status); - status = aclTensorToAtbTensor(prevLse, &(pack.inTensors[index++])); - ATB_CHECK(status == atb::NO_ERROR, "prevLse create failed!", return status); - } - - index = 0; - pack.outTensors.resize(g_RING_MLA_OUTTENSOR_NUM); - status = aclTensorToAtbTensor(output, &(pack.outTensors[index++])); - ATB_CHECK(status == atb::NO_ERROR, "output create failed!", return status); - status = aclTensorToAtbTensor(softmaxLse, &(pack.outTensors[index++])); - ATB_CHECK(status == atb::NO_ERROR, "softmaxLse create failed!", return status); - if (op == nullptr || *op == nullptr) { - ATB_LOG(ERROR) << "AtbRingMLAGetWorkspaceSize opeartion pointer is nullptr!"; - return atb::ERROR_INVALID_OPERATION_ADDR; - } - status = (*op)->Setup(pack, *workspaceSize, context); - ATB_CHECK(status == atb::NO_ERROR, "AtbRingMLA Setup failed!", return status); - return atb::NO_ERROR; -} - -atb::Status AtbRingMLA(void *workspace, uint64_t workspaceSize, atb::Operation *op, atb::Context *context) -{ - ATB_CHECK(op != nullptr, "AtbRingMLA expect op pointer not to be null!", return atb::ERROR_INVALID_OPERATION_ADDR); - 
atb::VariantPack pack; - atb::Status st = op->Execute(pack, (uint8_t *)(workspace), workspaceSize, context); - ATB_CHECK(st == atb::NO_ERROR, "AtbRingMLA Execute failed!", return st); - return st; -} - -#ifdef __cplusplus -} -#endif +/* + * Copyright (c) 2025 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ +#include "atb/atb_acl.h" +#include "atb/utils/atb_acl_util.h" +#include "atb/operation/operation_base.h" + +#ifdef __cplusplus +extern "C" { +#endif + +const size_t g_RING_MLA_INTENSOR_NUM = 7; +const size_t g_RING_MLA_OUTTENSOR_NUM = 2; + +atb::Status AtbRingMLAGetWorkspaceSize(const aclTensor *querySplit1, const aclTensor *querySplit2, + const aclTensor *keySplit1, const aclTensor *keySplit2, const aclTensor *value, + const aclTensor *mask, const aclTensor *seqLen, const aclTensor *prevOut, + const aclTensor *prevLse, int32_t headNum, int32_t kvHeadNum, float qkScale, + int kernelType, int maskType, int inputLayout, int calcType, aclTensor *output, + aclTensor *softmaxLse, uint64_t *workspaceSize, atb::Operation **op, + atb::Context *context) +{ + atb::infer::RingMLAParam param; + param.headNum = headNum; + param.kvHeadNum = kvHeadNum; + param.qkScale = qkScale; + param.kernelType = atb::infer::RingMLAParam::KernelType(kernelType); + param.maskType = atb::infer::RingMLAParam::MaskType(maskType); + param.inputLayout = atb::infer::InputLayout(inputLayout); + param.calcType = atb::infer::RingMLAParam::CalcType(calcType); + if (op != 
nullptr && *op == nullptr) { + auto st = CreateOperation(param, op); + if (st != atb::NO_ERROR) { + ATB_LOG(ERROR) << "Create RingMLA Operation failed!"; + return st; + } + } + atb::VariantPack pack; + size_t index = 0; + if (param.calcType == atb::infer::RingMLAParam::CalcType::CALC_TYPE_DEFAULT) { + pack.inTensors.resize(g_RING_MLA_INTENSOR_NUM + 2); // 2: prevOut, prevLse + } else { + pack.inTensors.resize(g_RING_MLA_INTENSOR_NUM); + } + + auto status = aclTensorToAtbTensor(querySplit1, &(pack.inTensors[index++])); + ATB_CHECK(status == atb::NO_ERROR, "querySplit1 create failed!", return status); + status = aclTensorToAtbTensor(querySplit2, &(pack.inTensors[index++])); + ATB_CHECK(status == atb::NO_ERROR, "querySplit2 create failed!", return status); + status = aclTensorToAtbTensor(keySplit1, &(pack.inTensors[index++])); + ATB_CHECK(status == atb::NO_ERROR, "keySplit1 create failed!", return status); + status = aclTensorToAtbTensor(keySplit2, &(pack.inTensors[index++])); + ATB_CHECK(status == atb::NO_ERROR, "keySplit2 create failed!", return status); + status = aclTensorToAtbTensor(value, &(pack.inTensors[index++])); + ATB_CHECK(status == atb::NO_ERROR, "value create failed!", return status); + status = aclTensorToAtbTensor(mask, &(pack.inTensors[index++])); + ATB_CHECK(status == atb::NO_ERROR, "mask create failed!", return status); + status = aclTensorToAtbTensorHost(seqLen, &(pack.inTensors[index++])); + ATB_CHECK(status == atb::NO_ERROR, "seqLen create failed!", return status); + if (param.calcType == atb::infer::RingMLAParam::CalcType::CALC_TYPE_DEFAULT) { + status = aclTensorToAtbTensor(prevOut, &(pack.inTensors[index++])); + ATB_CHECK(status == atb::NO_ERROR, "prevOut create failed!", return status); + status = aclTensorToAtbTensor(prevLse, &(pack.inTensors[index++])); + ATB_CHECK(status == atb::NO_ERROR, "prevLse create failed!", return status); + } + + index = 0; + pack.outTensors.resize(g_RING_MLA_OUTTENSOR_NUM); + status = aclTensorToAtbTensor(output, 
&(pack.outTensors[index++])); + ATB_CHECK(status == atb::NO_ERROR, "output create failed!", return status); + status = aclTensorToAtbTensor(softmaxLse, &(pack.outTensors[index++])); + ATB_CHECK(status == atb::NO_ERROR, "softmaxLse create failed!", return status); + if (op == nullptr || *op == nullptr) { + ATB_LOG(ERROR) << "AtbRingMLAGetWorkspaceSize operation pointer is nullptr!"; + return atb::ERROR_INVALID_OPERATION_ADDR; + } + status = (*op)->Setup(pack, *workspaceSize, context); + ATB_CHECK(status == atb::NO_ERROR, "AtbRingMLA Setup failed!", return status); + return atb::NO_ERROR; +} + +atb::Status AtbRingMLA(void *workspace, uint64_t workspaceSize, atb::Operation *op, atb::Context *context) +{ + ATB_CHECK(op != nullptr, "AtbRingMLA expect op pointer not to be null!", return atb::ERROR_INVALID_OPERATION_ADDR); + atb::VariantPack pack; + atb::Status st = op->Execute(pack, (uint8_t *)(workspace), workspaceSize, context); + ATB_CHECK(st == atb::NO_ERROR, "AtbRingMLA Execute failed!", return st); + return st; +} + +#ifdef __cplusplus +} +#endif diff --git a/src/cinterface/atb_acl_self_attention_prefix_encoder.cpp b/src/ops_infer/self_attention/atb_acl_self_attention_prefix_encoder.cpp similarity index 97% rename from src/cinterface/atb_acl_self_attention_prefix_encoder.cpp rename to src/ops_infer/self_attention/atb_acl_self_attention_prefix_encoder.cpp index 73e4e366..44da3af4 100644 --- a/src/cinterface/atb_acl_self_attention_prefix_encoder.cpp +++ b/src/ops_infer/self_attention/atb_acl_self_attention_prefix_encoder.cpp @@ -1,115 +1,115 @@ -/* - -Copyright (c) 2025 Huawei Technologies Co., Ltd. -This file is a part of the CANN Open Software. -Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). -Please refer to the License for details. You may not use this file except in compliance with the License. 
-THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, -INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. -See LICENSE in the root of the software repository for the full text of the License. -*/ -#include "atb/atb_acl.h" -#include "atb_acl_util.h" -#include "atb/operation/operation_base.h" -#ifdef __cplusplus -extern "C" { -#endif - -const size_t g_SELF_ATTENTION_PREFIX_ENCODER_INTENSOR_NUM = 6; -const size_t g_SELF_ATTENTION_PREFIX_ENCODER_OUTTENSOR_NUM = 1; - -atb::Status AtbSelfAttentionPrefixEncoderGetWorkspaceSize(const aclTensor *query, const aclTensor *key, - const aclTensor *value, const aclTensor *blockTables, - const aclTensor *mask, const aclTensor *seqLen, - const aclTensor *kvSeqLen, const aclTensor *slopes, - int maskType, int32_t headNum, int32_t kvHeadNum, - float qkScale, aclTensor *attnOut, uint64_t *workspaceSize, - atb::Operation **op, atb::Context *context) -{ - atb::infer::SelfAttentionParam param; - param.maskType = atb::infer::SelfAttentionParam::MaskType(maskType); - param.headNum = headNum; - param.kvHeadNum = kvHeadNum; - param.qkScale = qkScale; - param.quantType = atb::infer::SelfAttentionParam::QuantType::TYPE_QUANT_UNDEFINED; - param.outDataType = ACL_DT_UNDEFINED; - param.qScale = 1; - param.batchRunStatusEnable = false; - param.isTriuMask = 1; - param.calcType = atb::infer::SelfAttentionParam::CalcType::PREFIX_ENCODER; - param.kernelType = atb::infer::SelfAttentionParam::KernelType::KERNELTYPE_HIGH_PRECISION; - param.clampType = atb::infer::SelfAttentionParam::ClampType::CLAMP_TYPE_UNDEFINED; - param.clampMin = 0; - param.clampMax = 0; - param.kvcacheCfg = atb::infer::SelfAttentionParam::KvCacheCfg::K_CACHE_V_CACHE; - param.scaleType = atb::infer::SelfAttentionParam::ScaleType::SCALE_TYPE_TOR; - param.inputLayout = atb::infer::InputLayout::TYPE_BSND; - - if (op != nullptr && *op == nullptr) { - auto st = CreateOperation(param, 
op); - if (st != atb::NO_ERROR) { - ATB_LOG(ERROR) << "Create SelfAttention Operation Prefix Encoder failed!"; - return st; - } - } - atb::VariantPack pack; - size_t index = 0; - bool isAlibiMask = param.maskType == atb::infer::SelfAttentionParam::MaskType::MASK_TYPE_ALIBI_COMPRESS || - param.maskType == atb::infer::SelfAttentionParam::MaskType::MASK_TYPE_ALIBI_COMPRESS_SQRT; - if (isAlibiMask) { - pack.inTensors.resize(g_SELF_ATTENTION_PREFIX_ENCODER_INTENSOR_NUM + 2); // 2: mask, slopes - } else if (param.maskType == atb::infer::SelfAttentionParam::MaskType::MASK_TYPE_CAUSAL_MASK) { - pack.inTensors.resize(g_SELF_ATTENTION_PREFIX_ENCODER_INTENSOR_NUM); // mask auto-generated - } else { - pack.inTensors.resize(g_SELF_ATTENTION_PREFIX_ENCODER_INTENSOR_NUM + 1); // 1: mask - } - - auto status = aclTensorToAtbTensor(query, &(pack.inTensors[index++])); - ATB_CHECK(status == atb::NO_ERROR, "query create failed!", return status); - status = aclTensorToAtbTensor(key, &(pack.inTensors[index++])); - ATB_CHECK(status == atb::NO_ERROR, "key create failed!", return status); - status = aclTensorToAtbTensor(value, &(pack.inTensors[index++])); - ATB_CHECK(status == atb::NO_ERROR, "value create failed!", return status); - status = aclTensorToAtbTensor(blockTables, &(pack.inTensors[index++])); - ATB_CHECK(status == atb::NO_ERROR, "blockTables create failed!", return status); - if (param.maskType != atb::infer::SelfAttentionParam::MaskType::MASK_TYPE_CAUSAL_MASK) { - status = aclTensorToAtbTensor(mask, &(pack.inTensors[index++])); - ATB_CHECK(status == atb::NO_ERROR, "mask create failed!", return status); - } - status = aclTensorToAtbTensorHost(seqLen, &(pack.inTensors[index++])); - ATB_CHECK(status == atb::NO_ERROR, "seqLen create failed!", return status); - status = aclTensorToAtbTensorHost(kvSeqLen, &(pack.inTensors[index++])); - ATB_CHECK(status == atb::NO_ERROR, "kvSeqLen create failed!", return status); - if (isAlibiMask) { - status = aclTensorToAtbTensor(slopes, 
&(pack.inTensors[index++])); - ATB_CHECK(status == atb::NO_ERROR, "slopes create failed!", return status); - } - - index = 0; - pack.outTensors.resize(g_SELF_ATTENTION_PREFIX_ENCODER_OUTTENSOR_NUM); - status = aclTensorToAtbTensor(attnOut, &(pack.outTensors[index])); - ATB_CHECK(status == atb::NO_ERROR, "attnOut create failed!", return status); - - if (op == nullptr || *op == nullptr) { - ATB_LOG(ERROR) << "AtbSelfAttentionPrefixEncoderGetWorkspaceSize opeartion pointer is nullptr!"; - return atb::ERROR_INVALID_OPERATION_ADDR; - } - status = (*op)->Setup(pack, *workspaceSize, context); - ATB_CHECK(status == atb::NO_ERROR, "AtbSelfAttentionPrefixEncoder Setup failed!", return status); - return atb::NO_ERROR; -} - -atb::Status AtbSelfAttentionPrefixEncoder(void *workspace, uint64_t workspaceSize, atb::Operation *op, - atb::Context *context) -{ - ATB_CHECK(op != nullptr, "AtbSelfAttentionPrefixEncoder expect op pointer not to be null!", - return atb::ERROR_INVALID_OPERATION_ADDR); - atb::VariantPack pack; - atb::Status st = op->Execute(pack, (uint8_t *)(workspace), workspaceSize, context); - ATB_CHECK(st == atb::NO_ERROR, "AtbSelfAttentionPrefixEncoder Execute failed!", return st); - return st; -} - -#ifdef __cplusplus -} +/* + +Copyright (c) 2025 Huawei Technologies Co., Ltd. +This file is a part of the CANN Open Software. +Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). +Please refer to the License for details. You may not use this file except in compliance with the License. +THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +See LICENSE in the root of the software repository for the full text of the License. 
+*/ +#include "atb/atb_acl.h" +#include "atb/utils/atb_acl_util.h" +#include "atb/operation/operation_base.h" +#ifdef __cplusplus +extern "C" { +#endif + +const size_t g_SELF_ATTENTION_PREFIX_ENCODER_INTENSOR_NUM = 6; +const size_t g_SELF_ATTENTION_PREFIX_ENCODER_OUTTENSOR_NUM = 1; + +atb::Status AtbSelfAttentionPrefixEncoderGetWorkspaceSize(const aclTensor *query, const aclTensor *key, + const aclTensor *value, const aclTensor *blockTables, + const aclTensor *mask, const aclTensor *seqLen, + const aclTensor *kvSeqLen, const aclTensor *slopes, + int maskType, int32_t headNum, int32_t kvHeadNum, + float qkScale, aclTensor *attnOut, uint64_t *workspaceSize, + atb::Operation **op, atb::Context *context) +{ + atb::infer::SelfAttentionParam param; + param.maskType = atb::infer::SelfAttentionParam::MaskType(maskType); + param.headNum = headNum; + param.kvHeadNum = kvHeadNum; + param.qkScale = qkScale; + param.quantType = atb::infer::SelfAttentionParam::QuantType::TYPE_QUANT_UNDEFINED; + param.outDataType = ACL_DT_UNDEFINED; + param.qScale = 1; + param.batchRunStatusEnable = false; + param.isTriuMask = 1; + param.calcType = atb::infer::SelfAttentionParam::CalcType::PREFIX_ENCODER; + param.kernelType = atb::infer::SelfAttentionParam::KernelType::KERNELTYPE_HIGH_PRECISION; + param.clampType = atb::infer::SelfAttentionParam::ClampType::CLAMP_TYPE_UNDEFINED; + param.clampMin = 0; + param.clampMax = 0; + param.kvcacheCfg = atb::infer::SelfAttentionParam::KvCacheCfg::K_CACHE_V_CACHE; + param.scaleType = atb::infer::SelfAttentionParam::ScaleType::SCALE_TYPE_TOR; + param.inputLayout = atb::infer::InputLayout::TYPE_BSND; + + if (op != nullptr && *op == nullptr) { + auto st = CreateOperation(param, op); + if (st != atb::NO_ERROR) { + ATB_LOG(ERROR) << "Create SelfAttention Operation Prefix Encoder failed!"; + return st; + } + } + atb::VariantPack pack; + size_t index = 0; + bool isAlibiMask = param.maskType == atb::infer::SelfAttentionParam::MaskType::MASK_TYPE_ALIBI_COMPRESS || + 
param.maskType == atb::infer::SelfAttentionParam::MaskType::MASK_TYPE_ALIBI_COMPRESS_SQRT; + if (isAlibiMask) { + pack.inTensors.resize(g_SELF_ATTENTION_PREFIX_ENCODER_INTENSOR_NUM + 2); // 2: mask, slopes + } else if (param.maskType == atb::infer::SelfAttentionParam::MaskType::MASK_TYPE_CAUSAL_MASK) { + pack.inTensors.resize(g_SELF_ATTENTION_PREFIX_ENCODER_INTENSOR_NUM); // mask auto-generated + } else { + pack.inTensors.resize(g_SELF_ATTENTION_PREFIX_ENCODER_INTENSOR_NUM + 1); // 1: mask + } + + auto status = aclTensorToAtbTensor(query, &(pack.inTensors[index++])); + ATB_CHECK(status == atb::NO_ERROR, "query create failed!", return status); + status = aclTensorToAtbTensor(key, &(pack.inTensors[index++])); + ATB_CHECK(status == atb::NO_ERROR, "key create failed!", return status); + status = aclTensorToAtbTensor(value, &(pack.inTensors[index++])); + ATB_CHECK(status == atb::NO_ERROR, "value create failed!", return status); + status = aclTensorToAtbTensor(blockTables, &(pack.inTensors[index++])); + ATB_CHECK(status == atb::NO_ERROR, "blockTables create failed!", return status); + if (param.maskType != atb::infer::SelfAttentionParam::MaskType::MASK_TYPE_CAUSAL_MASK) { + status = aclTensorToAtbTensor(mask, &(pack.inTensors[index++])); + ATB_CHECK(status == atb::NO_ERROR, "mask create failed!", return status); + } + status = aclTensorToAtbTensorHost(seqLen, &(pack.inTensors[index++])); + ATB_CHECK(status == atb::NO_ERROR, "seqLen create failed!", return status); + status = aclTensorToAtbTensorHost(kvSeqLen, &(pack.inTensors[index++])); + ATB_CHECK(status == atb::NO_ERROR, "kvSeqLen create failed!", return status); + if (isAlibiMask) { + status = aclTensorToAtbTensor(slopes, &(pack.inTensors[index++])); + ATB_CHECK(status == atb::NO_ERROR, "slopes create failed!", return status); + } + + index = 0; + pack.outTensors.resize(g_SELF_ATTENTION_PREFIX_ENCODER_OUTTENSOR_NUM); + status = aclTensorToAtbTensor(attnOut, &(pack.outTensors[index])); + ATB_CHECK(status == 
atb::NO_ERROR, "attnOut create failed!", return status); + + if (op == nullptr || *op == nullptr) { + ATB_LOG(ERROR) << "AtbSelfAttentionPrefixEncoderGetWorkspaceSize operation pointer is nullptr!"; + return atb::ERROR_INVALID_OPERATION_ADDR; + } + status = (*op)->Setup(pack, *workspaceSize, context); + ATB_CHECK(status == atb::NO_ERROR, "AtbSelfAttentionPrefixEncoder Setup failed!", return status); + return atb::NO_ERROR; +} + +atb::Status AtbSelfAttentionPrefixEncoder(void *workspace, uint64_t workspaceSize, atb::Operation *op, + atb::Context *context) +{ + ATB_CHECK(op != nullptr, "AtbSelfAttentionPrefixEncoder expect op pointer not to be null!", + return atb::ERROR_INVALID_OPERATION_ADDR); + atb::VariantPack pack; + atb::Status st = op->Execute(pack, (uint8_t *)(workspace), workspaceSize, context); + ATB_CHECK(st == atb::NO_ERROR, "AtbSelfAttentionPrefixEncoder Execute failed!", return st); + return st; +} + +#ifdef __cplusplus +} #endif \ No newline at end of file -- Gitee From d911cf29c751080ad7473623be896fac04915aba Mon Sep 17 00:00:00 2001 From: caixilong Date: Sun, 14 Sep 2025 20:30:18 +0800 Subject: [PATCH 08/94] add self_attention combined test --- .../test_self_attention_combine.py | 1913 +++++++++++++++++ 1 file changed, 1913 insertions(+) create mode 100644 tests/apitest/opstest/python/operations/self_attention/test_self_attention_combine.py diff --git a/tests/apitest/opstest/python/operations/self_attention/test_self_attention_combine.py b/tests/apitest/opstest/python/operations/self_attention/test_self_attention_combine.py new file mode 100644 index 00000000..bfbfb539 --- /dev/null +++ b/tests/apitest/opstest/python/operations/self_attention/test_self_attention_combine.py @@ -0,0 +1,1913 @@ +# +# Copyright (c) 2024 Huawei Technologies Co., Ltd. +# This file is a part of the CANN Open Software. +# Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). +# Please refer to the License for details. 
You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. +# + +import time +import json +from enum import Enum +import torch +import logging +import unittest +import math +import numpy as np +import sys +import os +import random +sys.path.append(os.path.join(os.path.dirname(__file__), "../")) +from self_attention.self_attention_test_data_generator import SelfAttentionTestDataGenerator + +import operation_test # NOQA: E402 +torch.set_printoptions(profile="full") +np.set_printoptions(threshold=np.inf) +sys.path.append("./tests/pythontest") +save_path = "./" +from golden_compare_cv import compare_cv + +class ScaleType(Enum): + SCALE_TOR = 0 + SCALE_LOGN = 1 + SCALE_LOGN_FP32 = 2 +np.random.seed(123) +MASK_TYPE_NO_MASK = 0 +MASK_TYPE_NO_HEAD = 1 +MASK_TYPE_NO_BATCH = 2 +MASK_TYPE_ALIBI_WITH_BATCH = 3 +MASK_TYPE_ALIBI_NO_BATCH = 4 +MASK_TYPE_NO_HEAD_DECODER = 5 +MASK_TYPE_SWA = 6 +MASK_TYPE_SWA_DECODER = 7 +MASK_TYPE_ALIBI_WITH_PREFIX_BATCH = 8 +MASK_TYPE_NO_BATCH_WITH_PREFIX = 9 +MASK_TYPE_ALIBI_NO_BATCH_WITH_PREFIX = 10 +MASK_TYPE_RAZOR_FUSION = 11 +UNPAD_FLASH_ATTENTION_ENCODER_PREFIX_CACHE_ND = 2012 +CAL_TYPE_PREFIX_ENCODER = 4 +MASK_TYPE_ALIBI_COMPRESS = 4 +MASK_TYPE_CAUSAL_MASK = 9 +MASK_TYPE_ALIBI_COMPRESS_SQRT = 5 +KERNELTYPE_HIGH_PRECISION = 1 + +def gen_seq_len(batch, max_seq, variate_seq=False): + if variate_seq: + num = max_seq // 16 + seqlen_aligned_arange = np.arange(1, num) * 16 + if batch > num: + seqlen_aligned_remain = np.random.randint(1, max_seq, size=(batch - num)) + seqlen_aligned_remain[:] = ((seqlen_aligned_remain[:] + 15) // 16) * 16 + seqlen_aligned = np.concatenate((seqlen_aligned_arange, seqlen_aligned_remain), 0) + else: + seqlen_aligned 
= seqlen_aligned_arange + sp_list = np.random.randint(0, 15, size=(num - 1)) + seqlen = seqlen_aligned - sp_list + seqlen = seqlen[-batch:] + seqlen_aligned = seqlen_aligned[-batch:] + print(seqlen) + else: + max_seq_aligned = (max_seq + 15) // 16 * 16 + sp_list = np.ones((batch,)) * (max_seq_aligned - max_seq) + sp_list = sp_list.astype(np.int32) + seqlen = np.ones((batch,)) * max_seq + seqlen = seqlen.astype(np.int32) + print(seqlen) + seqlen_aligned = np.ones((batch,)) * max_seq_aligned + seqlen_aligned = seqlen_aligned.astype(np.int32) + + ntokens = seqlen.sum() + print("ntokens:", ntokens) + return seqlen, seqlen_aligned, ntokens + +def group_matmul(heads, group_num, A, B): + group_head = heads // group_num + score = None + for i in range(group_num): + group_score = np.matmul(A[i * group_head: (i + 1) * group_head, :, :].astype(np.float32), + B[i:(i + 1), :, :].astype(np.float32)).astype(np.float16) + if score is None: + score = group_score + else: + score = np.concatenate((score, group_score), 0) + print(score.shape) + return score + +def gen_swa_cmp(window_size, embeddim): + swa_mask = np.ones(shape=(1, 512, 512)) * -10000.0 + pp_n = 128 if embeddim <= 128 else 64 + # pp_n = 128 + if window_size <= pp_n * 3: + true_size = window_size + else: + if window_size % pp_n == 0: + true_size = pp_n * 3 + else: + true_size = pp_n * 2 + window_size % pp_n + triu_mask = np.triu(swa_mask, 1) + tril_mask = np.tril(swa_mask, -true_size) + swa_mask = triu_mask + tril_mask + swa_mask = swa_mask.reshape(512,512) + return swa_mask + +class TestFlashAttention(operation_test.OperationTest): + + def close_pack(self, in_data, seq_len): + kv = in_data.numpy() + dim1len = np.size(kv, -2) + if max(seq_len) > dim1len: + return None + kv = kv.reshape(np.prod(kv.shape[0:-1]), kv.shape[-1]) + c_offset = 0 + s_offset = 0 + for i, len in enumerate(seq_len): + kv[c_offset:c_offset + seq_len[i]][:] = kv[s_offset:s_offset + seq_len[i]][:] + c_offset += seq_len[i] + s_offset += dim1len + 
return torch.from_numpy(kv[0:sum(seq_len)][:]) + + def calc_expect_func(self, batch, seqlen, heads, embed, window_size, mask_type, group_num=32): + is_mask = True + self.is_mask = is_mask + variate_seq = False + is_decoder = False + self.is_decoder = is_decoder + max_seq = 2048 + self.max_seq = max_seq + src_type = 'float16' + fp32 = True + print(f"group_num: {group_num}") + print("q_seq is:") + if is_decoder: + q_seqlen, q_seqlen_aligned, q_ntokens = gen_seq_len(batch, 1, variate_seq) + kv_seqlen, kv_seqlen_aligned, kv_ntokens = gen_seq_len(batch, seqlen, variate_seq) + else: + q_seqlen, q_seqlen_aligned, q_ntokens = gen_seq_len(batch, seqlen, variate_seq) + kv_seqlen, kv_seqlen_aligned, kv_ntokens = q_seqlen, q_seqlen_aligned, q_ntokens # crossattention时,q_seqlen != k_seqlen + + self.q_seqlen, self.q_seqlen_aligned, self.q_ntokens, self.kv_seqLen = q_seqlen, q_seqlen_aligned, q_ntokens, kv_seqlen + print("qseqlen is ", self.q_seqlen) + self.kv_seqlen_aligned, self.kv_ntokens = kv_seqlen_aligned, kv_ntokens + max_s = np.max(q_seqlen) + ntokens2 = (q_seqlen * kv_seqlen).sum() + embed_v = embed + + q = np.random.uniform(-1.0, 1.0, size=(q_ntokens, heads * embed)).astype(np.float16) + k = np.random.uniform(-1.0, 1.0, size=(kv_ntokens, group_num * embed)).astype(np.float16) + v = np.random.uniform(-1.0, 1.0, size=(kv_ntokens, group_num * embed_v)).astype(np.float16) + self.heads, self.embeddim, self.embeddimv = heads, embed, embed_v + mask = np.ones(shape=(1, max_s, max_s)).astype(np.float16) # 使用当前最大seqlen生成mask + mask_u = np.triu(mask, 1) + mask_l = np.tril(mask, -window_size) + mask = mask_u + mask_l + mask *= -10000.0 + + q_offset = 0 + k_offset = 0 + v_offset = 0 + + s = None + _p = None + out = None + + for idx in range(batch): + q_s = q_seqlen[idx] + kv_s = kv_seqlen[idx] + q_slice = q[q_offset:q_offset + q_s][:] + q_slice = q_slice.reshape(q_s, heads, embed) + q_slice = np.transpose(q_slice, (1, 0, 2)) # (heads, q_seq, embed) + k_slice = k[k_offset:k_offset + 
kv_s][:] + k_slice = k_slice.reshape(kv_s, group_num, embed) + k_slice = np.transpose(k_slice, (1, 0, 2)) + k_slice_t = np.transpose(k_slice, (0, 2, 1)) # get K^T (kv_heads, embed, k_seq) + v_slice = v[v_offset:v_offset + kv_s][:] + v_slice = v_slice.reshape(kv_s, group_num, embed_v) + v_slice = np.transpose(v_slice, (1, 0, 2)) + score = group_matmul(heads, group_num, q_slice, k_slice_t) + if s is None: + s = score.reshape([-1, ]) + else: + s = np.concatenate((s, score.reshape([-1, ])), 0) + + tor = np.float16(1.0 / math.sqrt(1.0 * embed)) + score = score * tor + if is_mask: + score = score + mask[:, :q_s, :kv_s] + score_max = np.max(score, axis=-1) + score = score - score_max.reshape((heads, q_s, 1)) + score_exp = np.exp(score.astype(np.float32)) + if not fp32: + score_sum = np.sum(score_exp.astype(np.float16), axis=-1) + if _p is None: + _p = score_exp.astype(np.float16).reshape([-1, ]) + else: + _p = np.concatenate((_p, score_exp.astype(np.float16).reshape([-1, ])), 0) + p = score_exp.astype(np.float16) / score_sum.reshape((heads, q_s, 1)).astype(np.float16) + out_sub = group_matmul(heads, group_num, p, v_slice) + else: + score_sum = np.sum(score_exp, axis=-1) + if _p is None: + _p = score_exp.astype(np.float16).reshape([-1, ]) + else: + _p = np.concatenate((_p, score_exp.astype(np.float16).reshape([-1, ])), 0) + p = score_exp.astype(np.float16) + out_sub = group_matmul(heads, group_num, p, v_slice) + out_sub = out_sub / score_sum.reshape((heads, q_s, 1)).astype(np.float16) + + out_sub = out_sub.reshape(heads, q_s, embed_v) + out_sub = np.transpose(out_sub, (1, 0, 2)) + out_sub = np.ascontiguousarray(out_sub) + if out is None: + out = out_sub + else: + out = np.concatenate((out, out_sub), 0) + + q_offset += q_s + k_offset += kv_s + v_offset += kv_s + + print("==> data generate finished!") + + q = q.astype(src_type).reshape(-1, heads, embed) + k = k.astype(src_type).reshape(-1, group_num, embed) + v = v.astype(src_type).reshape(-1, group_num, embed_v) + # mask = 
mask.astype(src_type).reshape(max_s, max_s) + mask = gen_swa_cmp(window_size, embed).astype(src_type) + q_len = q_seqlen.astype(np.int32) + out = out.astype(src_type).reshape(-1, heads, embed_v) + ret_data = q, k, v, mask, q_len, out + return ret_data + + def set_data_params(self, dynamic_batch=False, batch_state=None, window_size=0, cache_type=0, + is_mask=True, is_decoder=False, is_alibi=False, is_razor_fusion = False, alibi_dim=4, + batch = 1, kv_head = 1, heads = 1, embeddim = 128, embeddimv = 0, max_seq = 2048, + kv_seqLen = [], is_clamp = 0, clamp_min = 0, preTokens = 0, nextTokens = 0, + tileQ = 0, tileKv = 0, razorLen = 0, baseM = 0, textQLen = 0, textKvLen = 0, + is_splitm = False, + clamp_max = 0, data_type = torch.float16, op_type = 0, mask_type = 0, + no_cache = False, long_seq = False, is_triu_mask = False, is_multi_layer = False, + is_sqrt = False, left_align = False, scaleType = ScaleType.SCALE_TOR.value, fav3 = False, + tor = 1, bnsd = False, is_compress = False, q_seqlens=None, num_blocks=None, + block_size=None): + self.dynamic_batch = dynamic_batch + self.batch_state = batch_state + self.is_mask = is_mask + self.is_decoder = is_decoder + self.is_alibi = is_alibi + self.preTokens = preTokens + self.nextTokens = nextTokens + self.tileQ = tileQ + self.tileKv = tileKv + self.razorLen = razorLen + self.baseM = baseM + self.textQLen = textQLen + self.textKvLen = textKvLen + self.is_razor_fusion = is_razor_fusion + self.alibi_dim = alibi_dim + self.batch = batch + self.kv_head = kv_head + self.heads = heads + self.embeddim = embeddim + self.embeddimv = embeddimv + self.max_seq = max_seq + self.kv_seqLen = kv_seqLen + self.dynamic_batch = dynamic_batch + self.is_clamp = is_clamp + self.clamp_min = clamp_min + self.clamp_max = clamp_max + self.data_type = data_type + self.no_cache = no_cache + self.long_seq = long_seq + self.mask_type = mask_type + self.is_triu_mask = is_triu_mask + self.is_multi_layer = is_multi_layer + self.is_sqrt = is_sqrt + 
self.left_align = left_align + self.fav3 = fav3 + self.scaleType = scaleType + self.tor = tor + self.is_int8_flag = False + self.online = False + self.bnsd = bnsd + self.window_size = window_size + self.is_compress = is_compress + self.cache_type = cache_type + self.q_seqlens = q_seqlens if q_seqlens is not None else kv_seqLen + + if self.embeddimv == 0: + self.embeddimv = self.embeddim + if is_decoder: + self.q_seqlen, self.q_ntokens = self.gen_seq_len(batch, [1] * batch) + else: + self.q_seqlen, self.q_ntokens = self.gen_seq_len(batch, self.q_seqlens) + self.kv_seqlen, self.kv_ntokens = self.gen_seq_len(batch, kv_seqLen) + # gen intensor for fa kernel + if is_multi_layer: + self.layer_id = torch.from_numpy(np.array([1], dtype=np.int32)).to(torch.int32) + else: + self.layer_id = torch.from_numpy(np.array([0], dtype=np.int32)).to(torch.int32) + print("here is ", self.q_seqlen) + self.q_max_seq = np.max(self.q_seqlen) + self.kv_max_seq = np.max(self.kv_seqlen) + q = torch.from_numpy(np.random.uniform(-5.0, 5.0, size=(self.q_ntokens, heads * self.embeddim))) + + self.q = q.to(data_type) + if num_blocks is None: + self.k = torch.from_numpy(np.random.uniform(-5.0, 5.0, size=(self.layer_id[0] + 1, batch, self.max_seq, kv_head * self.embeddim))).to(data_type) + self.v = torch.from_numpy(np.random.uniform(-5.0, 5.0, size=(self.layer_id[0] + 1, batch, self.max_seq, kv_head * self.embeddimv))).to(data_type) + if is_splitm: + maxKvSeqlen = max(self.kv_seqlen) + self.k = torch.from_numpy(np.random.uniform(-5.0, 5.0, size=(self.layer_id[0] + 1, batch, maxKvSeqlen, kv_head * self.embeddim))).to(data_type) + self.v = torch.from_numpy(np.random.uniform(-5.0, 5.0, size=(self.layer_id[0] + 1, batch, maxKvSeqlen, kv_head * self.embeddimv))).to(data_type) + else: + # kv cache shape: (num_blocks, block_size, num_heads, head_size) + self.k_cache = torch.from_numpy(np.random.uniform(-5.0, 5.0, size=(num_blocks, block_size, kv_head, embeddim))).to(data_type) + self.v_cache = 
torch.from_numpy(np.random.uniform(-5.0, 5.0, size=(num_blocks, block_size, kv_head, embeddim))).to(data_type) + + batch = len(kv_seqLen) + max_context_len = max(kv_seqLen) + max_num_blocks_per_seq = (max_context_len + block_size - 1) // block_size + block_tables = [] # (batch, max_num_blocks_per_seq) + offset = 0 + for i in range(batch): + num_blocks_cur_seq = (kv_seqLen[i] + block_size - 1) // block_size + # padding block table with 0 + block_table = [ + random.randint(0, num_blocks-1) if j < num_blocks_cur_seq else 0 for j in range(max_num_blocks_per_seq) + ] + offset += num_blocks_cur_seq + block_tables.append(block_table) + self.block_tables = torch.from_numpy(np.array(block_tables)).to(torch.int32) + self.k = torch.stack([self.k_cache[self.block_tables[torch.tensor(i, dtype=torch.long)].to(torch.long)].reshape(-1, kv_head * self.embeddim)[:max_context_len, :] for i in range(batch)]) + self.v = torch.stack([self.v_cache[self.block_tables[torch.tensor(i, dtype=torch.long)].to(torch.long)].reshape(-1, kv_head * self.embeddim)[:max_context_len, :] for i in range(batch)]) + self.k = self.k.reshape(1, batch, max_context_len, kv_head * self.embeddim) + self.v = self.v.reshape(1, batch, max_context_len, kv_head * self.embeddim) + + if self.fav3: + self.is_int8_flag = True + self.q_scale, self.q_offset, _ = self.quant_per_head(self.q, heads, embeddim, (self.q_ntokens, heads * self.embeddim)) + self.k_scale, self.k_offset, _ = self.quant_per_head(self.k, kv_head, embeddim, (self.layer_id[0] + 1, batch, self.max_seq, kv_head * self.embeddim)) + self.v_scale, self.v_offset, _ = self.quant_per_head(self.v, kv_head, embeddim, (self.layer_id[0] + 1, batch, self.max_seq, kv_head * self.embeddim)) + self.k_scale = (self.k_scale.view(kv_head, 1) * torch.ones([kv_head, heads // kv_head])).view(-1) + self.k_offset= (self.k_offset.view(kv_head, 1) * torch.ones([kv_head, heads // kv_head])).view(-1) + self.v_scale = (self.v_scale.view(kv_head, 1) * torch.ones([kv_head, heads // 
kv_head])).view(-1) + self.v_offset= (self.v_offset.view(kv_head, 1) * torch.ones([kv_head, heads // kv_head])).view(-1) + self.offline_scale = torch.from_numpy(np.random.uniform(1 / 127, 3 / 127, size=(heads))).to(torch.float32) + + self.q_int8 = torch.from_numpy(np.random.uniform(-5.0, 5.0, size=(self.q_ntokens, heads * self.embeddim))).to(torch.int8) + self.k_int8 = torch.from_numpy(np.random.uniform(-5.0, 5.0, size=(self.layer_id[0] + 1, batch, self.max_seq, kv_head * self.embeddim))).to(torch.int8) + self.v_int8 = torch.from_numpy(np.random.uniform(-5.0, 5.0, size=(self.layer_id[0] + 1, batch, self.max_seq, kv_head * self.embeddimv))).to(torch.int8) + + self.gen_mask(batch, heads, data_type, mask_type, window_size, is_compress, cache_type) + print("**********data gen shape***********") + print(f"q shape: {self.q.shape}") + print(f"k shape: {self.k.shape}") + print(f"v shape: {self.v.shape}") + print(f"layer_id shape: {self.layer_id.shape}") + print(f"mask shape: {self.mask.shape}") + + def quant_per_head(self, data, heads, embeddim, shape): + temp = data.view(-1, heads, self.embeddim).to(torch.float32) + scale = torch.stack([self.fav3_quant(temp[:, i, :], data_min = -1, data_max = 1, symmetric = True)[0] for i in range(heads)]) + offset = torch.stack([self.fav3_quant(temp[:, i, :], data_min = -1, data_max = 1, symmetric = True)[1] for i in range(heads)]) + int8_data = torch.zeros_like(temp) + for i in range(heads): + int8_data[:, i, :] = ((temp[:, i, :] / scale[i]).round_() + offset[i]) + int8_data = int8_data.view(shape).to(torch.int8) + return scale, offset, int8_data + + def fav3_quant(self, data, data_min = 0, data_max = 0, symmetric = False, bit = 8): + n = 2 ** (bit - 1) + if symmetric: + quant_min, quant_max = -(n - 1), (n - 1) + else: + quant_min, quant_max = -n, (n - 1) + span = quant_max - quant_min + if data_min == data_max: + data_max = data.max().item() + data_min = data.min().item() + if symmetric: + scale = max(data_max, -data_min) / 
(float(span) / 2) + offset = 0 + else: + scale = (data_max - data_min) / float(span) + offset = (data_min * quant_min + data_max * quant_max) / (data_min - data_max) + # 量化公式:x / scale + offset + return torch.tensor(float(scale), dtype = torch.float), torch.tensor(int(offset), dtype = torch.float) + + def get_alibi_slopes(self, n_heads): + n = 2 ** math.floor(math.log2(n_heads)) + m0 = 2.0 ** (-8.0 / n) + slopes = torch.pow(m0, torch.arange(1, n + 1)) + if n < n_heads: + m1 = 2.0 ** ( -4.0 / n) + mm = torch.pow(m1, torch.arange(1, 1 + 2 * (n_heads - n), 2)) + slopes = torch.cat([slopes, mm]) + return slopes + + def get_alibi_bias(self, n_heads, max_seqlen): + if not self.left_align: + self.bias = torch.arange(max_seqlen) + self.bias = self.bias[None, :] - self.bias[:, None] + if (self.is_sqrt): + self.bias = torch.sqrt(torch.abs(self.bias)) * torch.sign(self.bias) + bias = torch.empty( + n_heads, + max_seqlen, + max_seqlen + )[:, :max_seqlen, :max_seqlen].copy_(self.bias) + self.alibi_slopes = self.get_alibi_slopes(n_heads) + else: + self.bias = torch.arange(max_seqlen, dtype=torch.float32).unsqueeze(0).unsqueeze(0).expand(n_heads, max_seqlen, -1) + self.alibi_slopes = torch.Tensor(self.get_interleave(n_heads)) + bias = self.bias + bias = bias * self.alibi_slopes[:, None, None] + return bias + + def get_interleave(self, n, alibi_bias_max=8.0): + def get_interleave_power_of_2(n, alibi_bias_max): + if n == 0: + return 0 + start = (2 ** (-2 ** -(math.log2(n) - 3))) + ratio = start + return [start * ratio ** i for i in range(n)] + if math.log2(n).is_integer(): + return get_interleave_power_of_2(n, alibi_bias_max) + else: + closest_power_of_2 = 2 ** math.floor(math.log2(n)) + return get_interleave_power_of_2(closest_power_of_2, alibi_bias_max) + \ + self.get_interleave(2 * closest_power_of_2)[0::2][:n - closest_power_of_2] + + def gen_swa_cmp(self, max_seq, window_size): + print("self.pre_mask_coff", self.pre_mask_coff) + swa_mask = np.ones(shape=(1, 512, 512)) * 
self.pre_mask_coff + print("gen_swa_cmp ", swa_mask.shape) + pp_n = 128 if self.embeddim <= 128 else 64 + pp_n = 128 if self.embeddim != self.embeddimv else pp_n + if window_size <= pp_n * 3: + true_size = window_size + elif window_size % pp_n == 0: + true_size = pp_n * 3 + else: + true_size = pp_n * 2 + window_size % pp_n + triu_mask = np.triu(swa_mask, 1) + tril_mask = np.tril(swa_mask, -true_size) + print("gen_swa_cmp ", swa_mask.shape) + print("gen_swa_cmp ", tril_mask.shape) + swa_mask = triu_mask + tril_mask + swa_mask = torch.from_numpy(swa_mask).to(torch.float32) + print("gen_swa_cmp ", swa_mask.shape) + return swa_mask + + def gen_razor_fusion_mask(self, razorLen, tileQ, tileKv, textQLen, textKvLen, preTokens, nextTokens, baseM): + np.set_printoptions(threshold=np.inf) + + mask_sizeQ = razorLen * tileQ + textQLen + mask_sizeK = razorLen * tileKv + textKvLen + logging.info("generate razor mask:", razorLen, tileQ, tileKv, textQLen, textKvLen, preTokens, nextTokens, baseM) + mask = np.zeros((mask_sizeQ, mask_sizeK), dtype=int) + preTokensBlock = preTokens // baseM + nextTokensBlock = nextTokens // baseM + idx = razorLen // baseM * baseM + mask[:, int(idx) : int(razorLen)] = 0 + mask[int(idx) : int(razorLen), :] = 0 + for i in range((razorLen + baseM - 1) // baseM): + start = i - preTokensBlock + 1 if i >= preTokensBlock else 0 + end = i + nextTokensBlock if i < preTokensBlock else start + preTokensBlock + nextTokensBlock - 1 + end = (razorLen + baseM - 1) // baseM if end > (razorLen + baseM - 1) // baseM else end + for j in range(start, end): + mask[i * baseM : (i + 1) * baseM, j * baseM : (j + 1) * baseM] = 1 + mask[razorLen :, :] = 0 + mask[:, razorLen :] = 0 + for i in range(tileQ): + for j in range(tileKv): + mask[i * razorLen : (i + 1) * razorLen, j * razorLen : (j + 1) * razorLen] = mask[0 : razorLen, 0 : razorLen] + + mask[razorLen * tileQ : , :] = 1 + mask[: , razorLen * tileKv :] = 1 + mask = mask[None, None, :] + mask = 1 - mask + return mask * 
-10000 + + def gen_swa_mask(self, max_seq, window_size, pre_mask_coff, cache_type=0): + swa_mask = np.ones(shape=self.mask_info[0]) * pre_mask_coff + logging.info(f"gen_swa_mask: window_size {window_size} max_seq {max_seq} self.kv_seqLen {self.kv_seqLen}") + if window_size < max_seq and self.is_decoder: + if cache_type == 1: + for idx, kvseqlen in enumerate(self.kv_seqLen): + swa_mask[idx, :, :window_size] = 0 + else: + for idx, kvseqlen in enumerate(self.kv_seqLen): + swa_mask[idx, :, kvseqlen - window_size: kvseqlen] = 0 + elif window_size < max_seq or self.is_compress: + triu_mask = np.triu(swa_mask, 1) + tril_mask = np.tril(swa_mask, -window_size) + swa_mask = triu_mask + tril_mask + else: + swa_mask = np.triu(swa_mask, 1) + return swa_mask + + def gen_mask(self, batch, heads, data_type, mask_type, window_size, is_compress, cache_type=0): + import random + q_max_seq = self.max_seq + kv_max_seq = self.max_seq + mask_type_dict = { + # 四维的alibi mask + MASK_TYPE_ALIBI_WITH_BATCH : ((batch, heads, q_max_seq, kv_max_seq), (lambda mask, idx, q_s, kv_s: (mask[idx, :, :q_s, :kv_s]))), + MASK_TYPE_ALIBI_WITH_PREFIX_BATCH : ((batch, heads, q_max_seq, kv_max_seq), (lambda mask, idx, q_s, kv_s: (mask[idx, :, kv_s-q_s:kv_s, :kv_s]))), + # 三维的alibi mask + MASK_TYPE_ALIBI_NO_BATCH : ((heads, q_max_seq, kv_max_seq), (lambda mask, idx, q_s, kv_s: (mask[:, :q_s, :kv_s]))), + MASK_TYPE_ALIBI_NO_BATCH_WITH_PREFIX : ((heads, q_max_seq, kv_max_seq), (lambda mask, idx, q_s, kv_s: (mask[:, kv_s-q_s:kv_s, :kv_s]))), + MASK_TYPE_NO_HEAD : ((batch, q_max_seq, kv_max_seq), (lambda mask, idx, q_s, kv_s: (mask[idx, :q_s, :kv_s]))), + MASK_TYPE_NO_HEAD_DECODER : ((batch, 1, kv_max_seq), (lambda mask, idx, q_s, kv_s: (mask[idx, :q_s, :kv_s]))), + MASK_TYPE_NO_BATCH : ((1, q_max_seq, kv_max_seq), (lambda mask, idx, q_s, kv_s: (mask[:, :q_s, :kv_s]))), + MASK_TYPE_NO_BATCH_WITH_PREFIX : ((1, q_max_seq, kv_max_seq), (lambda mask, idx, q_s, kv_s: (mask[:, kv_s-q_s:kv_s, :kv_s]))), + MASK_TYPE_SWA 
: ((1, q_max_seq, kv_max_seq), (lambda mask, idx, q_s, kv_s: (mask[:, :q_s, :kv_s]))), + MASK_TYPE_SWA_DECODER : ((batch, 1, kv_max_seq), (lambda mask, idx, q_s, kv_s: (mask[idx, :q_s, :kv_s]))), + # 不加mask + MASK_TYPE_RAZOR_FUSION : ((1, q_max_seq, kv_max_seq), (lambda mask, idx, q_s, kv_s: (mask[:q_s, :kv_s]))), + MASK_TYPE_NO_MASK : ((1, q_max_seq, kv_max_seq), (lambda mask, idx, q_s, kv_s: 0)) + } + # kernel中mask的系数 + if data_type == torch.float16: + post_mask_coff = 1 + pre_mask_coff = -10000.0 + elif data_type == torch.bfloat16 and self.is_alibi: + post_mask_coff = 1 + pre_mask_coff = -float("inf") + elif data_type == torch.float32 and self.is_alibi: + post_mask_coff = 1 + pre_mask_coff = 1 + else: + post_mask_coff = -3e38 + pre_mask_coff = 1 + if data_type == torch.float16: + if self.window_size > 0: + select_zero = False + elif self.is_alibi or self.long_seq: + select_zero = False + else: + select_zero = True + elif data_type == torch.bfloat16: + if self.window_size > 0: + select_zero = False + elif self.is_alibi: + select_zero = False + elif self.dynamic_batch or self.is_decoder: + select_zero = True + else: + select_zero = False + else: + if self.is_alibi or self.is_decoder: + select_zero = True + else: + select_zero = False + if self.is_triu_mask: + select_zero = False + + self.mask_info = mask_type_dict[mask_type] + mask = np.ones(shape=self.mask_info[0]) * pre_mask_coff + mask = np.triu(mask, 1) + zero_indice = random.choices(range(self.max_seq), k = 300) + if self.window_size > 0: + mask = self.gen_swa_mask(self.max_seq, window_size, pre_mask_coff, cache_type) + if self.is_alibi: + self.alibi_bias = self.get_alibi_bias(heads, self.max_seq) + mask += self.alibi_bias.numpy() + if select_zero: + mask.flat[zero_indice] = 0 + if self.is_razor_fusion: + mask = self.gen_razor_fusion_mask(self.razorLen, self.tileQ, self.tileKv, self.textQLen, self.textKvLen, + self.preTokens, self.nextTokens, self.baseM) + post_mask_coff = 1 + self.mask = 
torch.from_numpy(mask).to(torch.float32) + self.post_mask_coff = post_mask_coff + self.pre_mask_coff = pre_mask_coff + + def quantize_tensor_symmetric(self, x, prev_max_abs_vals=None, num_bits=8): + if x.dtype != torch.float: + x = x.to(torch.float) + + quant_min = -2 ** (num_bits - 1) + quant_max = 2 ** (num_bits - 1) - 1 + + current_max_abs_vals = x.abs().max(dim=1).values + if prev_max_abs_vals is not None: + max_abs_vals = torch.max(prev_max_abs_vals, current_max_abs_vals) + else: + max_abs_vals = current_max_abs_vals + scales = max_abs_vals / (quant_max) + x_q = torch.clamp(torch.round(x / scales.unsqueeze(1)), quant_min, quant_max) + x_q = torch.round(x_q) + x_q = x_q.to(torch.int8) + return x_q, scales, max_abs_vals + + def dequantize_tensor(self, x_q, scales, value): + x_deq = x_q.to(torch.float32) + scales = scales.unsqueeze(1) + x_deq = x_deq * value + x_deq = x_deq * scales + return x_deq + + def online_softmax(self, s_qk, q_s, v_slice, heads, kv_head, embed, online, dtype): + ans = None + group_num = heads // kv_head + for head_idx in range(heads): + s_head_idx = s_qk[head_idx] + O = torch.zeros((q_s, embed)).to(dtype) + Br = q_s + Bc = 128 + self.row_block_size = Br + self.col_block_size = Bc + d = embed + V_mat = v_slice[head_idx // group_num] + Tr = q_s // Br + Tc = q_s // Bc + + d = embed + Tr = q_s // Br + Tc = q_s // Bc + + start_row_idx = 0 + start_col_idx = 0 + + for i in range(Tr): + + Oi = torch.zeros((Br, d)).to(dtype) # shape Br x d + li = torch.zeros((Br, 1)).to(dtype) # shape Br x 1 + mi = torch.full((Br, 1), -torch.inf).to(dtype) # shape Br x 1 + pp_max_num = None + + for j in range(Tc): + + Sij = s_head_idx[i * Br : (i + 1) * Br, start_col_idx + j * Bc : start_col_idx + (j + 1) * Bc].to(dtype) + + Vj = V_mat[start_col_idx + j * Bc : start_col_idx + (j + 1) * Bc, :] + + mi_new = torch.max( + torch.column_stack([mi, torch.max(Sij, dim=1).values[:, None]]), dim=1 + ).values[:, None].to(dtype) + Pij_hat = torch.exp((Sij - 
mi_new).to(torch.float32)) + Pij_hat = Pij_hat.to(dtype) + li = torch.exp((mi - mi_new).to(torch.float32)).to(dtype) * li + torch.sum(Pij_hat, dim=1)[:, None] + if self.is_int8_flag: + if online: + x_q, scales, pp_max_num = self.quantize_tensor_symmetric(Pij_hat, pp_max_num) + if pp_max_num == None: + pp_max_num = pp_max_num + pv = x_q.to(torch.int32) @ Vj.to(torch.int32) + Oi = Oi * torch.exp((mi - mi_new).to(torch.float32)).to(dtype) + self.dequantize_tensor(pv, scales, self.v_scale[head_idx]).to(dtype) + else: + x_q = Pij_hat / self.offline_scale[head_idx] + x_q = torch.round(x_q.to(torch.float32)) + pv = x_q.to(torch.int32) @ Vj.to(torch.int32) + pv = pv.to(torch.float32) + value = self.v_scale[head_idx] * self.offline_scale[head_idx] + Oi = Oi * torch.exp((mi - mi_new).to(torch.float32)).to(dtype) + (pv * value).to(dtype) + else: + Oi = Oi * torch.exp((mi - mi_new).to(torch.float32)).to(dtype) + Pij_hat @ Vj.to(dtype) + + mi = mi_new + + if (q_s % Bc != 0): + Bc = q_s % Bc + start_row_idx = (q_s // self.row_block_size) * self.row_block_size + start_col_idx = (q_s // self.col_block_size) * self.col_block_size + + Sij = s_head_idx[i * Br : (i + 1) * Br, start_col_idx : start_col_idx + Bc].to(dtype) + Vj = V_mat[start_col_idx : start_col_idx + Bc, :] + mi_new = torch.max( + torch.column_stack([mi, torch.max(Sij, dim=1).values[:, None]]), dim=1 + ).values[:, None].to(dtype) + Pij_hat = torch.exp((Sij - mi_new).to(torch.float32)) + Pij_hat = Pij_hat.to(dtype) + li = torch.exp((mi - mi_new).to(torch.float32)).to(dtype) * li + torch.sum(Pij_hat, dim=1)[:, None] + if self.is_int8_flag: + if online: + x_q, scales, pp_max_num = self.quantize_tensor_symmetric(Pij_hat, pp_max_num) + if pp_max_num == None: + pp_max_num = pp_max_num + pv = x_q.to(torch.int32) @ Vj.to(torch.int32) + Oi = Oi * torch.exp((mi - mi_new).to(torch.float32)).to(dtype) + self.dequantize_tensor(pv, scales, self.v_scale[head_idx]).to(dtype) + else: + x_q = Pij_hat / self.offline_scale[head_idx] + x_q 
= torch.round(x_q.to(torch.float32)) + pv = x_q.to(torch.int32) @ Vj.to(torch.int32) + pv = pv.to(torch.float32) + value = self.v_scale[head_idx] * self.offline_scale[head_idx] + Oi = Oi * torch.exp((mi - mi_new).to(torch.float32)).to(dtype) + (pv * value).to(dtype) + else: + Oi = Oi * torch.exp((mi - mi_new).to(torch.float32)).to(dtype) + Pij_hat @ Vj.to(dtype) + Oi = Oi / li + + O[i * Br : (i + 1) * Br, :] = Oi + + if ans is None: + ans = O + else: + ans = torch.cat((ans, O), 1) + return ans + + def gen_out_tensor(self, online=False): + q_offset = 0 + k_offset = 0 + v_offset = 0 + batch = self.batch + dynamic_batch = self.dynamic_batch + batch_state = self.batch_state + heads = self.heads + is_decoder = self.is_decoder + embed = self.embeddim + embedv = self.embeddimv + max_seq = self.max_seq + q_seqlen = self.q_seqlen + kv_seqlen = self.kv_seqLen + kv_head = self.kv_head + mask = self.mask + is_mask = self.is_mask + is_razor_fusion = self.is_razor_fusion + q = self.q + k = self.k + v = self.v + if self.fav3: + q = self.q_int8 + k = self.k_int8 + v = self.v_int8 + q_ntokens = self.q_ntokens + kv_ntokens = self.kv_ntokens + layer_id = self.layer_id[0] + s = None + _p = None + out = None + ans_concat = None + ans_concat_true = None + out_true = None + + self.encoder_logN = torch.tensor([2.0] * self.max_seq).to(torch.float32) + self.encoder_logN.uniform_(1, 2) + self.decoder_logN = torch.tensor([2.0] * batch).to(torch.float32) + self.decoder_logN.uniform_(1, 2) + for idx in range(batch): + if dynamic_batch and batch_state[idx] == 0 and not is_decoder: + continue + if dynamic_batch and batch_state[idx] == 0: + output = torch.zeros([heads, q_s, embedv]) + output = torch.permute(output, (1, 0, 2)) + if out is None: + out = output + if not self.fav3: + out_true = output + else: + out = torch.cat((out, output), 0) + if not self.fav3: + out_true = torch.cat((out_true, output), 0) + q_offset += q_s + k_offset += max_seq + v_offset += max_seq + continue + q_s = 
q_seqlen[idx] + kv_s = kv_seqlen[idx] + q_slice = q[q_offset:q_offset + q_s][:] + q_slice = q_slice.view(q_s, heads, embed) + q_slice = torch.permute(q_slice, (1, 0, 2)) + k_slice = k[layer_id][idx][:kv_s][:] + k_slice = k_slice.view(kv_s, kv_head, embed) + k_slice_t = torch.permute(k_slice, (1, 2, 0)) # get K^T + v_slice = v[layer_id][idx][:kv_s][:] + v_slice = v_slice.view(kv_s, kv_head, embedv) + v_slice = torch.permute(v_slice, (1, 0, 2)) + + if self.fav3: + score = self.group_mm_torch(heads, kv_head, q_slice, k_slice_t, torch.int32) + else: + score = self.group_mm_torch(heads, kv_head, q_slice, k_slice_t) + if self.fav3: + # score:[heads,m,n] + score = score.to(torch.float32) + score = score * self.q_scale.view(heads, 1, 1) + score = score.to(torch.float16) + + if s is None: + s = score.view([-1, ]) + else: + s = torch.cat((s, score.view([-1, ])), 0) + + if self.scaleType == ScaleType.SCALE_LOGN_FP32.value: + if is_decoder: + score *= self.decoder_logN[idx] + else: + score *= self.encoder_logN[None, :q_s, None] + + if self.fav3: + score = score * torch.tensor(self.tor, dtype=torch.float16) + else: + score *= self.tor + + if self.is_clamp == 1: + clamp_min_brc = np.ones((score.shape)) * self.clamp_min + clamp_max_brc = np.ones((score.shape)) * self.clamp_max + score = np.float16(np.maximum(score, clamp_min_brc)) + score = torch.from_numpy(np.float16(np.minimum(score, clamp_max_brc))) + temp_mask = self.mask_info[1](self.mask, idx, q_s, kv_s) * self.post_mask_coff + if is_mask or is_razor_fusion: + score = score + temp_mask + + s_qk = score + s_qk_true = score.to(torch.float32) + score = score.numpy().astype(np.float32) + + if self.is_int8_flag: + ans = self.online_softmax(s_qk, q_s, v_slice, heads, kv_head, embed, online, torch.float16) + if ans_concat is None: + ans_concat = ans + else: + ans_concat = torch.cat((ans_concat, ans), 0) + + ans_true = self.online_softmax(s_qk_true, q_s, v_slice, heads, kv_head, embed, online, torch.float32) + if ans_concat_true is 
None: + ans_concat_true = ans_true + else: + ans_concat_true = torch.cat((ans_concat_true, ans_true), 0) + + score_max = np.max(score, axis=-1) + score = score - score_max.reshape((heads, q_s, 1)) + score_exp = np.exp(score) + score_sum = np.sum(score_exp, axis=-1) + + if _p is None: + _p = score_exp.astype(np.float32).reshape([-1, ]) + else: + _p = np.concatenate( + (_p, score_exp.astype(np.float32).reshape([-1, ])), 0) + if self.fav3: + p = score_exp + p = p * 127 + p = torch.from_numpy(p).to(torch.int8) + else: + p_true = (score_exp / score_sum.reshape((heads, q_s, 1))) + p_true = torch.from_numpy(p_true) + p = p_true.to(torch.bfloat16) + o_true = self.group_mm_torch(heads, kv_head, p_true, v_slice) + + o = self.group_mm_torch(heads, kv_head, p, v_slice) + if self.fav3: + o = o.to(torch.float) + v_scale = self.v_scale + v_scale = v_scale.view(heads, 1, 1) + o = o * v_scale + o = o / 127 + o = o / score_sum.reshape((heads, q_s, 1)) + else: + o_true = o_true.view(heads, q_s, embedv) + o_true = torch.permute(o_true, (1, 0, 2)).contiguous() + o = o.view(heads, q_s, embedv) + o = torch.permute(o, (1, 0, 2)).contiguous() + if out is None: + out = o + if not self.fav3: + out_true = o_true + else: + out = torch.cat((out, o), 0) + if not self.fav3: + out_true = torch.cat((out_true, o_true), 0) + + q_offset += q_s + k_offset += max_seq + v_offset += max_seq + # golden data + print("now is: ", q_ntokens, heads, embedv) + + if self.is_int8_flag: + ans_concat = ans_concat.view(q_ntokens, heads * embedv) + ans_concat_true = ans_concat_true.view(q_ntokens, heads * embedv) + self.golden_out = ans_concat + self.golden_out_true = ans_concat_true + else: + out = out.view(q_ntokens, heads * embedv) + self.golden_out = out.to(self.data_type) + out_true = out_true.view(q_ntokens, heads * embedv) + self.golden_out_true = out_true.to(torch.float32) + + if self.no_cache: + self.k = self.close_pack(self.k.to(torch.float32), kv_seqlen).to(self.data_type) + self.v = 
self.close_pack(self.v.to(torch.float32), kv_seqlen).to(self.data_type) + if self.fav3: + self.k_int8 = self.close_pack(self.k_int8.to(torch.float32), kv_seqlen).to(torch.int8) + self.v_int8 = self.close_pack(self.v_int8.to(torch.float32), kv_seqlen).to(torch.int8) + if self.long_seq: + self.max_seq = 128 + self.gen_mask(self.batch, self.heads, self.data_type, self.mask_type, 0, False, 0) + + def gen_out_tensor_bnsd(self): + q_offset = 0 + k_offset = 0 + v_offset = 0 + batch = self.batch + dynamic_batch = self.dynamic_batch + batch_state = self.batch_state + heads = self.heads + is_decoder = self.is_decoder + embed = self.embeddim + embedv = self.embeddimv + max_seq = self.max_seq + q_seqlen = self.q_seqlen + kv_seqlen = self.kv_seqLen + kv_head = self.kv_head + mask = self.mask + is_mask = self.is_mask + q = self.q + k = self.k + v = self.v + q_ntokens = self.q_ntokens + kv_ntokens = self.kv_ntokens + layer_id = self.layer_id[0] + s = None + _p = None + out = None + obsnd = torch.zeros(batch, max_seq, heads, embedv) + out_true_bnsd = torch.zeros(batch, max_seq, heads, embedv) + kbsnd=k.view(layer_id+1,batch,max_seq,kv_head,embed) + vbsnd=v.view(layer_id+1,batch,max_seq,kv_head,embedv) + qbsnd = torch.zeros(batch, max_seq, heads, embed) + self.encoder_logN = torch.tensor([2.0] * self.max_seq).to(torch.float32) + self.encoder_logN.uniform_(1, 2) + self.decoder_logN = torch.tensor([2.0] * batch).to(torch.float32) + self.decoder_logN.uniform_(1, 2) + for idx in range(batch): + if dynamic_batch and batch_state[idx] == 0 and not is_decoder: + continue + if dynamic_batch and batch_state[idx] == 0: + output = torch.zeros([heads, q_s, embedv]) + output = torch.permute(output, (1, 0, 2)) + if out is None: + out = output + else: + out = torch.cat((out, output), 0) + q_offset += q_s + k_offset += max_seq + v_offset += max_seq + continue + # todo bs,n,d 转b,n,s,d + q_s = q_seqlen[idx] + kv_s = kv_seqlen[idx] + q_slice = q[q_offset:q_offset + q_s][:] + q_slice = 
q_slice.view(q_s, heads, embed) + for q_s_idx in range(q_s): + qbsnd[idx][q_s_idx] = q_slice[q_s_idx][:] + q_slice = torch.permute(q_slice, (1, 0, 2)) + k_slice = k[layer_id][idx][:kv_s][:] + k_slice = k_slice.view(kv_s, kv_head, embed) + k_slice_t = torch.permute(k_slice, (1, 2, 0)) # get K^T + v_slice = v[layer_id][idx][:kv_s][:] + v_slice = v_slice.view(kv_s, kv_head, embedv) + v_slice = torch.permute(v_slice, (1, 0, 2)) + + score = self.group_mm_torch(heads, kv_head, q_slice, k_slice_t) + if s is None: + s = score.view([-1, ]) + else: + s = torch.cat((s, score.view([-1, ])), 0) + score = score * self.tor + if self.scaleType == ScaleType.SCALE_LOGN_FP32.value: + if is_decoder: + score *= self.decoder_logN[idx] + else: + score *= self.encoder_logN[None, :q_s, None] + if self.is_clamp == 1: + clamp_min_brc = np.ones((score.shape)) * self.clamp_min + clamp_max_brc = np.ones((score.shape)) * self.clamp_max + score = np.float16(np.maximum(score, clamp_min_brc)) + score = torch.from_numpy(np.float16(np.minimum(score, clamp_max_brc))) + temp_mask = self.mask_info[1](self.mask, idx, q_s, kv_s) * self.post_mask_coff + if is_mask: + score = score + temp_mask + score = score.numpy().astype(np.float32) + score_max = np.max(score, axis=-1) + score = score - score_max.reshape((heads, q_s, 1)) + score_exp = np.exp(score) + score_sum = np.sum(score_exp, axis=-1) + + if _p is None: + _p = score_exp.astype(np.float32).reshape([-1, ]) + else: + _p = np.concatenate( + (_p, score_exp.astype(np.float32).reshape([-1, ])), 0) + + p_true = (score_exp / score_sum.reshape((heads, q_s, 1))) + p_true = torch.from_numpy(p_true) + o_true = self.group_mm_torch(heads, kv_head, p_true, v_slice) + o_true = o_true.view(heads, q_s, embedv) + o_true = torch.permute(o_true, (1, 0, 2)).contiguous() + + #根据数据类型转换 + p = p_true.to(torch.bfloat16) + o = self.group_mm_torch(heads, kv_head, p, v_slice) + o = o.view(heads, q_s, embedv) + o = torch.permute(o, (1, 0, 2)).contiguous() + if out is None: + out = 
o + out_true = o_true + else: + out = torch.cat((out, o), 0) + out_true = torch.cat((out_true, o_true), 0) + + for i in range(0,q_s): + obsnd[idx][i] = o[i] + out_true_bnsd[idx]=out_true[i] + q_offset += q_s + k_offset += max_seq + v_offset += max_seq + obnsd = torch.permute(obsnd, (0, 2, 1,3)) + out_true_bnsd = torch.permute(out_true_bnsd, (0, 2, 1,3)) + self.qbnsd = torch.permute(qbsnd, (0, 2, 1, 3)).to(self.data_type) + self.kbnsd = torch.permute(kbsnd, (0, 1, 3, 2, 4)).to(self.data_type) + self.vbnsd = torch.permute(vbsnd, (0, 1, 3, 2, 4)).to(self.data_type) + # golden data + out = out.view(q_ntokens, heads * embedv) + out_true = out_true.view(q_ntokens, heads * embedv) + if(self.is_decoder == 1): + self.golden_out = out + self.golden_out_true = out_true.to(torch.float32) + else: + self.golden_out = obnsd.to(self.data_type) + self.golden_out_true = out_true_bnsd.to(torch.float32) + logging.debug(f"golden_out shape: {self.golden_out.shape}") + + if self.no_cache: + self.k = self.close_pack(self.k.to(torch.float32), kv_seqlen).to(self.data_type) + self.v = self.close_pack(self.v.to(torch.float32), kv_seqlen).to(self.data_type) + if self.long_seq: + self.max_seq = 128 + self.gen_mask(self.batch, self.heads, self.data_type, self.mask_type) + + def gen_out_tensor_bnsd_splitm(self): + q_offset = 0 + k_offset = 0 + v_offset = 0 + batch = self.batch + dynamic_batch = self.dynamic_batch + batch_state = self.batch_state + heads = self.heads + is_decoder = self.is_decoder + embed = self.embeddim + embedv = self.embeddimv + max_seq = self.max_seq + q_seqlen = self.q_seqlen + kv_seqlen = self.kv_seqLen + kv_head = self.kv_head + mask = self.mask + is_mask = self.is_mask + q = self.q + k = self.k + v = self.v + q_ntokens = self.q_ntokens + kv_ntokens = self.kv_ntokens + layer_id = self.layer_id[0] + s = None + _p = None + out = None + maxQSeqlen = max(q_seqlen) + obsnd = torch.zeros(batch, maxQSeqlen, heads, embedv) + out_true_bnsd = torch.zeros(batch, maxQSeqlen, heads, 
embedv) + maxKvSeqlen = max(kv_seqlen) + kbsnd=k.view(layer_id+1,batch,maxKvSeqlen,kv_head,embed) + vbsnd=v.view(layer_id+1,batch,maxKvSeqlen,kv_head,embedv) + qbsnd = torch.zeros(batch, maxQSeqlen, heads, embed) + for idx in range(batch): + if dynamic_batch and batch_state[idx] == 0 and not is_decoder: + continue + if dynamic_batch and batch_state[idx] == 0: + output = torch.zeros([heads, q_s, embedv]) + output = torch.permute(output, (1, 0, 2)) + if out is None: + out = output + else: + out = torch.cat((out, output), 0) + q_offset += q_s + k_offset += max_seq + v_offset += max_seq + continue + # todo bs,n,d 转b,n,s,d + q_s = q_seqlen[idx] + kv_s = kv_seqlen[idx] + q_slice = q[q_offset:q_offset + q_s][:] + q_slice = q_slice.view(q_s, heads, embed) + for q_s_idx in range(q_s): + qbsnd[idx][q_s_idx] = q_slice[q_s_idx][:] + q_slice = torch.permute(q_slice, (1, 0, 2)) + k_slice = k[layer_id][idx][:kv_s][:] + k_slice = k_slice.view(kv_s, kv_head, embed) + k_slice_t = torch.permute(k_slice, (1, 2, 0)) # get K^T + v_slice = v[layer_id][idx][:kv_s][:] + v_slice = v_slice.view(kv_s, kv_head, embedv) + v_slice = torch.permute(v_slice, (1, 0, 2)) + + score = self.group_mm_torch(heads, kv_head, q_slice, k_slice_t) + if s is None: + s = score.view([-1, ]) + else: + s = torch.cat((s, score.view([-1, ])), 0) + score = score * self.tor + score = score.numpy().astype(np.float32) + score_max = np.max(score, axis=-1) + score = score - score_max.reshape((heads, q_s, 1)) + score_exp = np.exp(score) + score_sum = np.sum(score_exp, axis=-1) + + if _p is None: + _p = score_exp.astype(np.float32).reshape([-1, ]) + else: + _p = np.concatenate( + (_p, score_exp.astype(np.float32).reshape([-1, ])), 0) + + p_true = (score_exp / score_sum.reshape((heads, q_s, 1))) + p_true = torch.from_numpy(p_true) + o_true = self.group_mm_torch(heads, kv_head, p_true, v_slice) + o_true = o_true.view(heads, q_s, embedv) + o_true = torch.permute(o_true, (1, 0, 2)).contiguous() + + #根据数据类型转换 + p = 
p_true.to(torch.bfloat16) + o = self.group_mm_torch(heads, kv_head, p, v_slice) + o = o.view(heads, q_s, embedv) + o = torch.permute(o, (1, 0, 2)).contiguous() + + if out is None: + out = o + out_true = o_true + else: + out = torch.cat((out, o), 0) + out_true = torch.cat((out_true, o_true), 0) + for i in range(0,q_s): + obsnd[idx][i] = o[i] + out_true_bnsd[idx][i]=out_true[i] + q_offset += q_s + k_offset += kv_s + v_offset += kv_s + obnsd = torch.permute(obsnd, (0, 2, 1,3)) + out_true_bnsd = torch.permute(out_true_bnsd, (0, 2, 1,3)) + self.qbnsd = torch.permute(qbsnd, (0, 2, 1, 3)).to(self.data_type) + self.kbnsd = torch.permute(kbsnd, (0, 1, 3, 2, 4)).to(self.data_type) + self.vbnsd = torch.permute(vbsnd, (0, 1, 3, 2, 4)).to(self.data_type) + # golden data + out = out.view(q_ntokens, heads * embedv) + out_true = out_true.view(q_ntokens, heads * embedv) + self.golden_out = obnsd.to(self.data_type) + self.golden_out_true = out_true_bnsd.to(torch.float32) + logging.debug(f"golden_out shape: {self.golden_out.shape}") + + if self.no_cache: + self.k = self.close_pack(self.k.to(torch.float32), kv_seqlen).to(self.data_type) + self.v = self.close_pack(self.v.to(torch.float32), kv_seqlen).to(self.data_type) + + def gen_seq_len(self, batch, seq_len): + ntokens = sum(seq_len) + return seq_len, ntokens + + def compare_output_data(self, out, golden, ratios): + error_count = 0 + strict_error_count = 0 + fp16_min_normal = 1.0 / (1 << 14) + golden = golden.flatten().to(torch.float32) + out = out.flatten().to(torch.float32) + out_len = out.shape[0] + diff = torch.abs(golden - out) + max_diff = diff.max().item() + limit_error = torch.maximum(torch.abs(golden * ratios[0]), torch.tensor(ratios[1])) + strict_limit_error = torch.maximum(torch.abs(golden * ratios[2]), torch.tensor(ratios[3])) + error_count = torch.gt(diff, limit_error).sum().item() + strict_error_count = torch.gt(diff, strict_limit_error).sum().item() + logging.info(f"maxDiff {max_diff}") + logging.info("1/1000 Accuracy 
is %f", 1 - float(error_count) / out_len) + logging.info("5/1000 Accuracy is %f", 1 - float(strict_error_count) / out_len) + if self.data_type == torch.bfloat16: + logging.debug("accuracy is correct in old standard: %r", (float(strict_error_count) / out_len) <= ratios[2]) + else: + logging.debug("accuracy is correct in old standard: %r", (float(strict_error_count) / out_len) <= ratios[0]) + calc_times = self.heads * self.max_seq + 4 + if self.data_type == torch.bfloat16: + if calc_times < 2048: + error = 2**(-7) + else : + error = 2**(-6) + error_threshold = torch.clamp(torch.abs(golden), min = 1) * error + res = (diff <= error_threshold).all().item() + logging.debug("accuracy is correct in new standard: %r", res) + return res + elif self.data_type == torch.float16: + if calc_times < 2048: + error = 2**(-8) + else : + error = 2**(-7) + error_threshold = torch.clamp(torch.abs(golden), min = 1) * error + res = (diff <= error_threshold).all().item() + logging.debug("accuracy is correct in new standard: %r", res) + return res + else : + if calc_times < 2048: + error = 2**(-11) + elif calc_times >= 2048 and calc_times < 16384: + error = 2**(-10) + else: + error = 2**(-14) + error_threshold = torch.clamp(torch.abs(golden), min = 1) * error + res = (diff <= error_threshold).all().item() + logging.debug("accuracy is correct in new standard: %r", res) + return res + + def group_mm_torch(self, heads, group_num, A, B, dtype=torch.float32): + group_head = heads // group_num + score = None + for i in range(group_num): + group_score = torch.matmul(A[i * group_head: (i + 1) * group_head, :, :].to(dtype), B[i:(i + 1), :, :].to(dtype)) + if score is None: + score = group_score + else: + score = torch.cat((score, group_score), 0) + return score + + def golden_calc(self, in_tensors): + golden_out = self.golden_out.clone().detach().requires_grad_(True).half().npu() + return [golden_out] + + def golden_compare(self, out_tensors, golden_tensors): + print("max(golden_out): ", 
torch.max(golden_tensors[0].clone().detach().half().npu()).item(),) + print("min(golden_out): ", torch.min(golden_tensors[0].clone().detach().half().npu()).item(),) + print("max(actual out): ", torch.max(out_tensors[0].clone().detach().half().npu()).item(),) + print("min(actual out): ", torch.min(out_tensors[0].clone().detach().half().npu()).item(),) + # nan/inf + result_single = self.compare_output_data(out_tensors[0].clone().detach().half().npu(), + golden_tensors[0].clone().detach().half().npu(), + [0.001, 0.001, 0.005, 0.005]) + if self.is_int8_flag: + result_double = compare_cv(self.golden_out_true.clone().detach().half().npu(), + golden_tensors[0].clone().detach().half().npu(), out_tensors[0]) + return (result_double or result_single) + else: + result_double = compare_cv(self.golden_out_true.clone().detach().half().npu(), + golden_tensors[0].clone().detach().half().npu(), out_tensors[0]) + return (result_double or result_single) + + + def test_swa_decoder(self): + """ + is_decoder = 1, no_cache=False, "maskType": MASK_TYPE_SLIDING_WINDOW_NORM + qselen[i] = 1 for all i (decoder) + kv_seqLen[i] = 114 for all i + """ + if not operation_test.get_soc_version() == 'Ascend910B': + print("this testcase only supports Ascend910B") + return + self.data_type = torch.float16 + data_type = self.data_type + self.batch = 8 + batch = self.batch + self.kv_head = 32 # kv_head num + kv_head = self.kv_head + self.is_decoder = 1 # prefill or decoder + self.heads = 32 # llama7b hidden_size 4096 + self.embeddim = 128 + self.embeddim_v = self.embeddim + tor = 1 + self.dynamic_batch = False + kv_seqLen = [114] * batch + qSeqLen = [1] * batch + self.max_seq = max(max(kv_seqLen), max(qSeqLen)) + self.window_size = 16 + self.q_seqlen, self.q_ntokens = self.gen_seq_len(batch, [1] * batch) + self.kv_seqlen, self.kv_ntokens = self.gen_seq_len(batch, kv_seqLen) + self.layer_id = torch.from_numpy(np.array([0], dtype=np.int32)).to(torch.int32) + self.q_max_seq = np.max(self.q_seqlen) + 
self.kv_max_seq = np.max(self.kv_seqlen) + tor = np.float32(1.0 / math.sqrt(1.0 * self.embeddim)) + + self.q_scale = 1 + self.qk_scale = tor + self.cache_type = 1 + + OP_NAME = "SelfAttentionOperation" + OP_PARAM = {"type": 1} + self.set_data_params(cache_type=self.cache_type, is_decoder=self.is_decoder, batch=batch, kv_head=kv_head, heads=self.heads, + embeddim=self.embeddim, max_seq=self.max_seq, kv_seqLen=kv_seqLen, + data_type=data_type, long_seq = True, + op_type=OP_PARAM["type"], mask_type = MASK_TYPE_SWA, + no_cache=False, is_sqrt=False, tor=tor, q_seqlens=self.q_seqlen) + self.gen_out_tensor() + self.window_size = 16 + param = json.dumps( + {"headNum": self.heads, "qScale": float(self.q_scale), "qkScale": float(self.qk_scale), "maskType": 7, + "kvcacheCfg": self.cache_type, "calcType": 2, "windowSize": self.window_size}) + + self.param_seqlen = self.q_seqlen + self.param_token_offset = self.kv_seqlen + run_param = json.dumps({"tokenOffset": self.param_token_offset, "seqLen": self.param_seqlen, "maskType": 7}) + self.execute_with_param(OP_NAME, param, run_param, + [self.q.npu(), self.k.npu(), self.v.npu(), + torch.tensor(self.kv_seqlen).to(torch.int32).npu(), + torch.tensor(self.q_seqlen).to(torch.int32).npu(), self.layer_id.npu()]) + + + def test_swa_encoder_cache(self): + """ + is_decoder = 0, no_cache=False, "maskType": MASK_TYPE_SLIDING_WINDOW_NORM, cacheType = 1 + qselen = kv_seqLen = [33, 512, ...] 
+ """ + if not operation_test.get_soc_version() == 'Ascend910B': + print("this testcase only supports Ascend910B") + return + self.data_type = torch.float16 + data_type = self.data_type + self.batch = 8 + batch = self.batch + self.kv_head = 33 # kv_head num + kv_head = self.kv_head + self.is_decoder = 0 # prefill or decoder + self.heads = 33 # llama7b hidden_size 4096 + self.embeddim = 128 + self.embeddim_v = self.embeddim + self.dynamic_batch = False + kv_seqLen = [self.heads, 512] * (self.batch // 2) + self.max_seq = max(kv_seqLen) + + self.window_size = 16 + self.cacheType = 1 + self.kv_seqlen, self.kv_ntokens = self.gen_seq_len(batch, kv_seqLen) + self.q_seqlen, self.q_ntokens = self.kv_seqlen, self.kv_ntokens + + self.q_max_seq = np.max(self.q_seqlen) + self.kv_max_seq = np.max(self.kv_seqlen) + tor = np.float32(1.0 / math.sqrt(1.0 * self.embeddim)) + + self.q_scale = 1 + self.qk_scale = tor + self.cache_type = 1 + + OP_NAME = "SelfAttentionOperation" + OP_PARAM = {"type": 1} + print(f" self.q_ntokens 1 {self.q_ntokens}") + self.set_data_params(cache_type=self.cache_type, is_decoder=self.is_decoder, batch=batch, kv_head=kv_head, + heads=self.heads, embeddim=self.embeddim, max_seq=self.max_seq, kv_seqLen=kv_seqLen, + data_type=data_type, long_seq = False, op_type=OP_PARAM["type"], mask_type = MASK_TYPE_SWA, + no_cache=False, is_sqrt=False, tor=tor) + self.gen_out_tensor() + self.window_size = 16 + self.q_scale = 1 + self.qk_scale = tor + param = json.dumps( + {"headNum": self.heads, "qScale": float(self.q_scale), "qkScale": float(self.qk_scale), "maskType": 7, + "kvcacheCfg": 1, "calcType": 1, + "windowSize": self.window_size, "cacheType": self.cacheType}) + self.param_seqlen = self.kv_seqlen + self.param_token_offset = self.kv_seqlen + run_param = json.dumps({"tokenOffset": self.param_token_offset, "seqLen": self.param_seqlen, "maskType": 7}) + self.execute_with_param(OP_NAME, param, run_param, + [self.q.npu(), self.k.npu(), self.v.npu(), + 
self.mask.reshape(self.q_max_seq, self.kv_max_seq).to(data_type).npu(), + torch.tensor(self.kv_seqlen).to(torch.int32).npu(), + torch.tensor(self.kv_seqlen).to(torch.int32).npu(), self.layer_id.npu()]) + + def test_swa_decoder_cache(self): + """ + is_decoder = 0, no_cache=False, "maskType": MASK_TYPE_SLIDING_WINDOW_NORM, cacheType = 1 + qselen = kv_seqLen = [33, 512, ...] + """ + if not operation_test.get_soc_version() == 'Ascend910B': + print("this testcase only supports Ascend910B") + return + self.data_type = torch.bfloat16 + data_type = self.data_type + self.batch = 8 + batch = self.batch + self.kv_head = 32 # kv_head num + self.is_decoder = 1 # prefill or decoder + self.heads = 32 # llama7b hidden_size 4096 + self.embeddim = 128 + self.embeddim_v = self.embeddim + self.dynamic_batch = False + kv_seqLen = [32, 1024] * 4 + self.max_seq = 1024 + self.window_size = 64 + self.cacheType = 1 + self.q_seqlen, self.q_ntokens = self.gen_seq_len(batch, [1] * batch) + self.kv_seqlen, self.kv_ntokens = self.gen_seq_len(batch, kv_seqLen) + self.layer_id = torch.from_numpy(np.array([0], dtype=np.int32)).to(torch.int32) + self.q_max_seq = np.max(self.q_seqlen) + self.kv_max_seq = np.max(self.kv_seqlen) + tor = np.float32(1.0 / math.sqrt(1.0 * self.embeddim)) + + self.q_scale = 1 + self.qk_scale = tor + self.cache_type = 1 + + OP_NAME = "SelfAttentionOperation" + OP_PARAM = {"type": 1} + self.set_data_params(cache_type=self.cache_type, is_decoder=self.is_decoder, batch=batch, kv_head=self.kv_head, + heads=self.heads, embeddim=self.embeddim, max_seq=self.max_seq, kv_seqLen=kv_seqLen, + data_type=data_type, long_seq = False, op_type=OP_PARAM["type"], mask_type = MASK_TYPE_SWA, + no_cache=False, is_sqrt=False, tor=tor, q_seqlens=self.q_seqlen) + self.gen_out_tensor() + self.window_size = 64 + + param = json.dumps( + {"headNum": self.heads, "qScale": float(self.q_scale), "qkScale": float(self.qk_scale), "maskType": 7, + "kvcacheCfg": 1, "calcType": 2, "windowSize": 
self.window_size, "cacheType": 1}) + self.param_seqlen = self.q_seqlen + self.param_token_offset = self.kv_seqlen + run_param = json.dumps({"tokenOffset": self.param_token_offset, "seqLen": self.param_seqlen, "maskType": 7}) + self.execute_with_param(OP_NAME, param, run_param, + [self.q.npu(), self.k.npu(), self.v.npu(), + torch.tensor(self.kv_seqlen).to(torch.int32).npu(), + torch.tensor(self.q_seqlen).to(torch.int32).npu(), self.layer_id.npu()]) + + def test_swa_encoder(self): + """ + is_decoder = 0, no_cache=False, "maskType": MASK_TYPE_SLIDING_WINDOW_NORM, cacheType = 0 + qselen = kv_seqLen = [32, 256, ...] + norm swa mask + """ + if not operation_test.get_soc_version() == 'Ascend910B': + print("this testcase only supports Ascend910B") + return + self.data_type = torch.bfloat16 + data_type = self.data_type + self.batch = 8 + batch = self.batch + self.kv_head = 32 # kv_head num + kv_head = self.kv_head + self.is_decoder = 0 # prefill or decoder + self.heads = 32 # llama7b hidden_size 4096 + self.embeddim = 128 + self.embeddim_v = self.embeddim + self.dynamic_batch = False + kv_seqLen = [32, 256] * 4 + self.max_seq = 256 + self.window_size = 16 + self.cacheType = 0 + self.q_seqlen, self.q_ntokens = self.gen_seq_len(batch, kv_seqLen) + self.kv_seqlen, self.kv_ntokens = self.gen_seq_len(batch, kv_seqLen) + self.layer_id = torch.from_numpy(np.array([0], dtype=np.int32)).to(torch.int32) + self.q_max_seq = np.max(self.q_seqlen) + self.kv_max_seq = np.max(self.kv_seqlen) + tor = np.float32(1.0 / math.sqrt(1.0 * self.embeddim)) + + self.q_scale = 1 + self.qk_scale = tor + self.cache_type = 1 + OP_NAME = "SelfAttentionOperation" + OP_PARAM = {"type": 1} + self.set_data_params(cache_type=self.cache_type, is_decoder=self.is_decoder, batch=batch, kv_head=self.kv_head, + heads=self.heads, embeddim=self.embeddim, max_seq=self.max_seq, kv_seqLen=kv_seqLen, + data_type=data_type, long_seq = False, op_type=OP_PARAM["type"], mask_type = MASK_TYPE_SWA, + no_cache=False, 
is_sqrt=False, tor=tor, q_seqlens=self.q_seqlen) + self.gen_out_tensor() + self.window_size = 16 + + param = json.dumps( + {"headNum": self.heads, "qScale": float(self.q_scale), "qkScale": float(self.qk_scale), "maskType": 7, + "kvcacheCfg": 1, "calcType": 1, "windowSize": self.window_size, "cacheType": 0}) + self.param_seqlen = self.q_seqlen + self.param_token_offset = self.kv_seqlen + run_param = json.dumps({"tokenOffset": self.param_token_offset, "seqLen": self.param_seqlen, "maskType": 7}) + self.execute_with_param(OP_NAME, param, run_param, + [self.q.npu(), self.k.npu(), self.v.npu(), + self.mask.reshape(self.q_max_seq, self.kv_max_seq).to(data_type).npu(), + torch.tensor(self.kv_seqlen).to(torch.int32).npu(), + torch.tensor(self.q_seqlen).to(torch.int32).npu(), self.layer_id.npu()]) + + + def test_swa_encoder_compress_mask(self): + """ + is_decoder = 0, no_cache=False, "maskType": MASK_TYPE_SLIDING_WINDOW_COMPRESS, cacheType = 0 + qselen = kv_seqLen = [32, 256, ...] + compress swa mask + """ + if not operation_test.get_soc_version() == 'Ascend910B': + print("this testcase only supports Ascend910B") + return + self.data_type = torch.bfloat16 + data_type = self.data_type + self.batch = 8 + batch = self.batch + self.kv_head = 32 # kv_head num + kv_head = self.kv_head + self.is_decoder = 0 # prefill or decoder + self.heads = 32 # llama7b hidden_size 4096 + self.embeddim = 128 + self.embeddim_v = self.embeddim + kv_seqLen = [32, 256] * 4 + self.max_seq = 256 + self.window_size = 16 + self.cacheType = 0 + self.q_seqlen, self.q_ntokens = self.gen_seq_len(batch, kv_seqLen) + self.kv_seqlen, self.kv_ntokens = self.gen_seq_len(batch, kv_seqLen) + self.layer_id = torch.from_numpy(np.array([0], dtype=np.int32)).to(torch.int32) + self.q_max_seq = np.max(self.q_seqlen) + self.kv_max_seq = np.max(self.kv_seqlen) + tor = np.float32(1.0 / math.sqrt(1.0 * self.embeddim)) + + self.q_scale = 1 + self.qk_scale = tor + self.cache_type = 1 + OP_NAME = "SelfAttentionOperation" + 
OP_PARAM = {"type": 1} + self.set_data_params(cache_type=self.cache_type, is_decoder=self.is_decoder, batch=batch, kv_head=self.kv_head, + heads=self.heads, embeddim=self.embeddim, max_seq=self.max_seq, kv_seqLen=kv_seqLen, + data_type=data_type, long_seq = False, op_type=OP_PARAM["type"], mask_type = MASK_TYPE_SWA, + no_cache=False, is_sqrt=False, tor=tor, q_seqlens=self.q_seqlen) + self.gen_out_tensor() + self.window_size = 16 + attention_mask = self.gen_swa_cmp(self.max_seq, self.window_size).to(data_type).npu() + + param = json.dumps( + {"headNum": self.heads, "qScale": float(self.q_scale), "qkScale": float(self.qk_scale), "maskType": 8, + "kvcacheCfg": 1, "calcType": 1, "windowSize": self.window_size, "cacheType": 0}) + self.param_seqlen = self.q_seqlen + self.param_token_offset = self.kv_seqlen + run_param = json.dumps({"tokenOffset": self.param_token_offset, "seqLen": self.param_seqlen, "maskType": 7}) + self.execute_with_param(OP_NAME, param, run_param, + [self.q.npu(), self.k.npu(), self.v.npu(), + attention_mask.reshape(512, 512).to(data_type).npu(), + torch.tensor(self.kv_seqlen).to(torch.int32).npu(), + torch.tensor(self.q_seqlen).to(torch.int32).npu(), self.layer_id.npu()]) + + def test_operation_logn(self): + """ + is_decoder = 1, no_cache=False, "maskType": MASK_TYPE_NORM + qseqlen = [1] * batch + kv_seqLen = [32, 1024] * 4 + """ + if not operation_test.get_soc_version() == 'Ascend910B': + print("this testcase only supports Ascend910B") + return + mask_type = MASK_TYPE_NO_HEAD_DECODER + data_type = torch.bfloat16 + batch = 8 + kv_head = 32 # kv_head num + is_decoder = 1 # prefill or decoder + heads = 32 # llama7b hidden_size 4096 + embeddim = 128 + embeddimv = np.random.randint(1, embeddim) + max_seq = 1024 + tor = 1 + dynamic_batch = False + kv_seqLen = [32, 1024] * 4 + qSeqLen = [1] * batch + self.q_seqlen, self.q_ntokens = self.gen_seq_len(batch, [1] * batch) + self.kv_seqlen, self.kv_ntokens = self.gen_seq_len(batch, kv_seqLen) + self.q_max_seq = 
np.max(self.q_seqlen) + self.kv_max_seq = np.max(self.kv_seqlen) + tor = np.float32(1.0 / math.sqrt(1.0 * embeddim)) + self.set_data_params(mask_type=mask_type, tor=tor, q_seqlens=self.q_seqlen, kv_seqLen=self.kv_seqlen, data_type=data_type, batch=batch, kv_head=kv_head, + is_decoder=is_decoder, heads=heads, embeddim=embeddim, embeddimv=embeddimv, max_seq=max_seq, dynamic_batch=dynamic_batch) + self.gen_out_tensor() + OP_NAME = "SelfAttentionOperation" + self.q_scale = 1 + self.qk_scale = tor + param = json.dumps( + {"headNum": self.heads, "qScale": float(self.q_scale), "qkScale": float(self.qk_scale), "maskType": 1, + "kvcacheCfg": 1, "calcType": 2}) + self.param_seqlen = self.q_seqlen + self.param_token_offset = self.kv_seqlen + run_param = json.dumps({"tokenOffset": self.param_token_offset, "seqLen": self.param_seqlen}) + # pdb.set_trace() + self.execute_with_param(OP_NAME, param, run_param, + [self.q.npu(), self.k.npu(), self.v.npu(), self.mask.to(data_type).npu(), + torch.tensor(self.kv_seqlen).to(torch.int32).npu(), + torch.tensor(self.q_seqlen).to(torch.int32).npu(), self.layer_id.npu()]) + + def test_operation_split_kvcache_success_float16(self): + """ + is_decoder = 1, no_cache=False, "maskType": MASK_TYPE_NO_HEAD_DECODER, cache_type = 1 + qseqlen = [1, ...] + kv_seqLen = [114, ...] 
+ """ + if not operation_test.get_soc_version() == 'Ascend910B': + print("this testcase only supports Ascend910B") + return + mask_type = MASK_TYPE_NO_HEAD_DECODER + self.data_type = torch.float16 + data_type = self.data_type + self.batch = 22 + batch = self.batch + self.kv_head = 44 # kv_head num + kv_head = self.kv_head + self.is_decoder = 1 # prefill or decoder + self.heads = 44 # llama7b hidden_size 4096 + self.embeddim = 256 + self.embeddim_v = self.embeddim + self.max_seq = 256 + tor = 1 + self.dynamic_batch = False + kv_seqLen = [114] * batch + qSeqLen = [1] * batch + self.is_clamp = 0 + self.clamp_min = 0 + self.clamp_max = 0 + self.is_triu_mask = False + self.long_seq = False + self.is_alibi = False + self.q_seqlen, self.q_ntokens = self.gen_seq_len(batch, [1] * batch) + self.kv_seqlen, self.kv_ntokens = self.gen_seq_len(batch, kv_seqLen) + self.layer_id = torch.from_numpy(np.array([0], dtype=np.int32)).to(torch.int32) + self.q_max_seq = np.max(self.q_seqlen) + self.kv_max_seq = np.max(self.kv_seqlen) + self.cache_type = 1 + self.window_size = 0 + self.is_compress = False + OP_NAME = "SelfAttentionOperation" + OP_PARAM = {"type": 1} + tor = np.float32(1.0 / math.sqrt(1.0 * self.embeddim)) + self.set_data_params(cache_type=self.cache_type, is_decoder=self.is_decoder, batch=batch, kv_head=self.kv_head, + heads=self.heads, embeddim=self.embeddim, embeddimv=self.embeddim_v, max_seq=self.max_seq, kv_seqLen=kv_seqLen, + data_type=data_type, op_type=OP_PARAM["type"], mask_type = mask_type, + no_cache=False, is_sqrt=False, tor=tor, q_seqlens=self.q_seqlen) + q = torch.from_numpy(np.random.uniform(-5.0, 5.0, size=(self.q_ntokens, self.heads * self.embeddim))) + + self.q = q.to(data_type) + self.k_list = [] + self.v_list = [] + for i in range(self.batch): + self.k_list.append(torch.from_numpy(np.random.uniform(-5.0, 5.0, size=(1, 1, self.max_seq, kv_head * self.embeddim))).to(data_type).npu()) + self.v_list.append(torch.from_numpy(np.random.uniform(-5.0, 5.0, 
size=(1, 1, self.max_seq, kv_head * self.embeddim_v))).to(data_type).npu()) + + self.k = torch.cat(self.k_list, 1).cpu() + self.v = torch.cat(self.v_list, 1).cpu() + + for i in range(self.batch): + self.k_list[i] = self.k_list[i].squeeze().npu() + self.v_list[i] = self.v_list[i].squeeze().npu() + self.gen_out_tensor() + + self.q_scale = 1 + self.qk_scale = tor + param = json.dumps({"headNum": self.heads, "qScale": float(self.q_scale), "qkScale": float(self.qk_scale), "maskType": 1, "kvcacheCfg":1, "calcType":2}) + self.param_seqlen = self.q_seqlen + self.param_token_offset = self.kv_seqlen + run_param = json.dumps({"tokenOffset": self.param_token_offset, "seqLen": self.param_seqlen, "byPass": "true"}) + #pdb.set_trace() + self.execute_with_param_and_tensor_list(OP_NAME, param, run_param, + [self.q.npu(), self.k.npu(), self.v.npu(),self.mask.to(data_type).npu(),torch.tensor(self.kv_seqlen).to(torch.int32).npu(), torch.tensor(self.q_seqlen).to(torch.int32).npu(), self.layer_id.npu()], + [self.k_list, self.v_list], ["kCache", "vCache"]) + + def test_operation_split_kvcache_success_bfloat16(self): + """ + is_decoder = 1, no_cache=False, "maskType": MASK_TYPE_NO_HEAD_DECODER, cache_type = 1 + qseqlen = [1, ...] + kv_seqLen = [114, ...] 
+ """ + if not operation_test.get_soc_version() == 'Ascend910B': + print("this testcase only supports Ascend910B") + return + mask_type = MASK_TYPE_NO_HEAD_DECODER + self.data_type = torch.bfloat16 + data_type = self.data_type + self.batch = 22 + batch = self.batch + self.kv_head = 44 # kv_head num + kv_head = self.kv_head + self.is_decoder = 1 # prefill or decoder + self.heads = 44 # llama7b hidden_size 4096 + self.embeddim = 256 + self.embeddim_v = self.embeddim + self.max_seq = 256 + tor = 1 + self.dynamic_batch = False + kv_seqLen = [114] * batch + qSeqLen = [1] * batch + self.is_clamp = 0 + self.clamp_min = 0 + self.clamp_max = 0 + self.is_triu_mask = False + self.long_seq = False + self.is_alibi = False + self.q_seqlen, self.q_ntokens = self.gen_seq_len(batch, [1] * batch) + self.kv_seqlen, self.kv_ntokens = self.gen_seq_len(batch, kv_seqLen) + self.layer_id = torch.from_numpy(np.array([0], dtype=np.int32)).to(torch.int32) + self.q_max_seq = np.max(self.q_seqlen) + self.kv_max_seq = np.max(self.kv_seqlen) + self.cache_type = 1 + self.window_size = 0 + self.is_compress = False + + q = torch.from_numpy(np.random.uniform(-5.0, 5.0, size=(self.q_ntokens, self.heads * self.embeddim))) + tor = np.float32(1.0 / math.sqrt(1.0 * self.embeddim)) + #self.q = (q * tor).to(data_type) + + OP_NAME = "SelfAttentionOperation" + OP_PARAM = {"type": 1} + self.set_data_params(cache_type=self.cache_type, is_decoder=self.is_decoder, batch=batch, kv_head=self.kv_head, + heads=self.heads, embeddim=self.embeddim, embeddimv=self.embeddim_v, max_seq=self.max_seq, kv_seqLen=kv_seqLen, + data_type=data_type, long_seq = False, op_type=OP_PARAM["type"], mask_type = mask_type, + tor=tor, q_seqlens=self.q_seqlen) + self.q = q.to(data_type) + self.k_list = [] + self.v_list = [] + for i in range(self.batch): + self.k_list.append(torch.from_numpy(np.random.uniform(-5.0, 5.0, size=(1, 1, self.max_seq, self.kv_head * self.embeddim))).to(data_type).npu()) + 
self.v_list.append(torch.from_numpy(np.random.uniform(-5.0, 5.0, size=(1, 1, self.max_seq, self.kv_head * self.embeddim_v))).to(data_type).npu()) + + self.k = torch.cat(self.k_list, 1).cpu() + self.v = torch.cat(self.v_list, 1).cpu() + + for i in range(self.batch): + self.k_list[i] = self.k_list[i].squeeze().npu() + self.v_list[i] = self.v_list[i].squeeze().npu() + self.gen_out_tensor() + + self.q_scale = 1 + self.qk_scale = tor + param = json.dumps({"headNum": self.heads, "qScale": float(self.q_scale), "qkScale": float(self.qk_scale), "maskType": 1, "kvcacheCfg":1, "calcType":2}) + self.param_seqlen = self.q_seqlen + self.param_token_offset = self.kv_seqlen + run_param = json.dumps({"tokenOffset": self.param_token_offset, "seqLen": self.param_seqlen, "byPass": "true"}) + #pdb.set_trace() + self.execute_with_param_and_tensor_list(OP_NAME, param, run_param, + [self.q.npu(), self.k.npu(), self.v.npu(), self.mask.to(data_type).npu(),torch.tensor(self.kv_seqlen).to(torch.int32).npu(), torch.tensor(self.q_seqlen).to(torch.int32).npu(), self.layer_id.npu()], + [self.k_list, self.v_list], ["kCache", "vCache"]) + + def test_encoder_operation_mask_free_fp16(self): + """ + is_decoder = 0, no_cache=True, "maskType": MASK_TYPE_NO_BATCH, + qseqlen = kv_seqLen = [1024] + """ + if not operation_test.get_soc_version() == 'Ascend910B': + print("this testcase only supports Ascend910B") + return + + batch = 1 + kv_head = 1 # kv_head num + isdecoder = 0 # prefill or decoder + heads = 12 + embeddim = 128 + max_seq = 1024 + tor = 1 + kv_seqLen = [1024] + is_clamp = 0 + clamp_min = 0 + clamp_max = 0 + dynamic_batch = False + data_type = torch.float16 + + + self.set_data_params(dynamic_batch = dynamic_batch, + is_decoder = isdecoder, batch = batch, kv_head = kv_head, heads = heads, + embeddim = embeddim, max_seq = max_seq, kv_seqLen = kv_seqLen, + is_clamp = is_clamp, clamp_max = clamp_max, clamp_min = clamp_min, + data_type = data_type, is_alibi = True, + op_type = 2001, mask_type = 
MASK_TYPE_ALIBI_WITH_BATCH, no_cache = True) + print("embeddimv: ", self.embeddimv) + self.gen_out_tensor() + param_seqlen = self.kv_seqLen + self.alibi_slopes *= -1 + mask = np.ones((256,256)) * 60000 + mask = np.triu(mask, 1) + self.mask = self.bias[:256, :256] * -1 + mask + self.mask = self.mask.to(torch.float16) + print(f"===============self.mask {self.mask.shape}") + print(f"===============self.mask {torch.max(self.mask)} {torch.min(self.mask)}") + print(self.alibi_slopes) + OP_NAME = "SelfAttentionOperation" + PARAM = json.dumps({"headNum": 12, "qkScale": 1, "kvHeadNum": 1, + "calcType": 3, "maskType": 4, "isTriuMask": 1, "kernelType": 1}) + RUN_PARAM = json.dumps({"seqLen": param_seqlen}) + print(self.q.npu().contiguous().shape, self.k.npu().contiguous().shape, self.v.npu().contiguous().shape, self.mask.npu().contiguous().shape, torch.from_numpy(np.array(self.kv_seqLen).astype(np.int32)).npu().contiguous().shape, self.alibi_slopes.npu().contiguous().shape, param_seqlen) + self.execute_with_param(OP_NAME, PARAM, RUN_PARAM, [ + self.q.npu().contiguous(), self.k.npu().contiguous(), self.v.npu().contiguous(), self.mask.npu().contiguous(), torch.from_numpy(np.array(self.kv_seqLen).astype(np.int32)).npu().contiguous(), self.alibi_slopes.npu().contiguous() + ]) + + def test_flash_attention_case_fa_encoder_withcache_bf16_maskfree(self): + """ + is_decoder = 0, no_cache=True, "maskType": MASK_TYPE_CAUSAL_MASK, + qseqlen = [seqlen] * batch + kv_seqLen = [seqlen + 128 * random.randint(1, 4)] * batch + """ + if not operation_test.get_soc_version() == 'Ascend910B': + print("this testcase only supports Ascend910B") + return + # [b,ms,ms] + for _ in range(700): + batch = random.randint(1, 16) + kv_head = random.randint(1, 5) # kv_head num + isdecoder = 0 # prefill or decoder + heads = kv_head * random.randint(1, 4) # head num + embeddim = 128 + max_seq = 128 * 100 + tor = 1.0 / math.sqrt(1.0 * embeddim) + seqlen = random.randint(1, 4096) + q_seqlens = [seqlen] * batch + 
kv_seqLen = [seqlen + 128 * random.randint(1, 4)] * batch + is_clamp = 0 + clamp_min = 0 + clamp_max = 0 + dynamic_batch = False + block_size = 128 + num_blocks = 1024 + OP_NAME = "SelfAttentionOperation" + OP_PARAM = {"type": UNPAD_FLASH_ATTENTION_ENCODER_PREFIX_CACHE_ND, "qSeqLen": q_seqlens, + "kvSeqLen": kv_seqLen, "headSize": heads, "tor": tor, + "isClamp": is_clamp, "clampMin": clamp_min, "clampMax": clamp_max, + "maskType": MASK_TYPE_CAUSAL_MASK, "kvHead": kv_head, + "isTriuMask": 1, "alibiLeftAlign": 0, "isAlibiMaskSqrt": 0} + data_type = random.choice([torch.bfloat16, torch.float16]) + print( + f"---batch:{batch}---kv_head:{kv_head}---q_seqlens:{q_seqlens}---kv_seqLen:{kv_seqLen}---kv_head:{kv_head}---heads:{heads}---data_type:{data_type}---") + self.set_data_params(dynamic_batch=dynamic_batch, + is_decoder=isdecoder, batch=batch, kv_head=kv_head, heads=heads, + embeddim=embeddim, max_seq=max_seq, kv_seqLen=kv_seqLen, + is_clamp=is_clamp, clamp_max=clamp_max, clamp_min=clamp_min, + data_type=data_type, + op_type=OP_PARAM["type"], mask_type=MASK_TYPE_ALIBI_NO_BATCH_WITH_PREFIX, + no_cache=True, tor=tor, q_seqlens=q_seqlens, + num_blocks=num_blocks, block_size=block_size, is_triu_mask=True, is_mask=True) + self.gen_out_tensor() + PARAM = json.dumps( + {"headNum": heads, "calcType": CAL_TYPE_PREFIX_ENCODER, "maskType": MASK_TYPE_CAUSAL_MASK, + "kvHeadNum": kv_head, "isTriuMask": 1, "qkScale": tor, "kernelType": KERNELTYPE_HIGH_PRECISION}) + RUN_PARAM = json.dumps({"seqLen": q_seqlens, "kvSeqLen": kv_seqLen, "CalcType": CAL_TYPE_PREFIX_ENCODER, + "maskType": MASK_TYPE_CAUSAL_MASK}) + q_seqlen = np.array(q_seqlens) + q_seqlen = torch.from_numpy(q_seqlen).to(torch.int32).npu() + kv_seqLen = np.array(kv_seqLen) + kv_seqLen = torch.from_numpy(kv_seqLen).to(torch.int32).npu() + self.execute_with_param(OP_NAME, PARAM, RUN_PARAM, [self.q.npu(), self.k_cache.npu(), self.v_cache.npu(), + self.block_tables.npu(), q_seqlen, kv_seqLen]) + def 
test_self_attention_encoder_operation_alibi_bf16(self): + """ + is_decoder = 1, no_cache=False, "maskType": MASK_TYPE_NO_HEAD_DECODER, cache_type = 1 + qseqlen = kv_seqLen = [1024] + """ + batch = 1 + kv_head = 1 # kv_head num + isdecoder = 0 # prefill or decoder + heads = 12 + embeddim = 128 + max_seq = 1024 + tor = 1 + kv_seqLen = [1024] + is_clamp = 0 + clamp_min = 0 + clamp_max = 0 + dynamic_batch = False + data_type = torch.bfloat16 + + self.set_data_params(dynamic_batch = dynamic_batch, + is_decoder = isdecoder, batch = batch, kv_head = kv_head, heads = heads, + embeddim = embeddim, max_seq = max_seq, kv_seqLen = kv_seqLen, + is_clamp = is_clamp, clamp_max = clamp_max, clamp_min = clamp_min, tor=tor, + data_type = data_type) + self.gen_out_tensor() + self.mask = self.mask.to(torch.bfloat16) + data = [self.q, self.k, self.v, self.mask, self.kv_seqLen, self.golden_out] + param_seqlen = data[4] + data[4] = torch.from_numpy(np.array(data[4]).astype(np.int32)) + data[1], data[2] = torch.reshape(data[1], (max_seq, embeddim)), torch.reshape(data[2], (max_seq, embeddim)) + in_tensors = [tensor.npu().contiguous() for tensor in data] + + OP_NAME = "SelfAttentionOperation" + PARAM = json.dumps({"headNum": 12, "qkScale": 1, "kvHeadNum": 1, + "calcType": 3, "maskType": 2, "isTriuMask": 1, "kernelType": 0}) + RUN_PARAM = json.dumps({"seqLen": param_seqlen}) + if not operation_test.get_soc_version() == 'Ascend910B': + print("this testcase only supports Ascend910B") + return + self.execute_with_param(OP_NAME, PARAM, RUN_PARAM, [ + in_tensors[0], in_tensors[1], in_tensors[2], in_tensors[3], in_tensors[4] + ]) + + def test_self_attention_encoder_operation_compress_mask_swa_cycle_cache(self): + """ + is_decoder = 0, no_cache=False, "maskType": MASK_TYPE_ALIBI_WITH_PREFIX_BATCH, cache_type = 1 + qseqlen = kv_seqLen = [128...] 
+ """ + if operation_test.get_soc_version() == 'Ascend910B': + kv_head = 2 + window_size = 32 + mask_type = 8 + data = self.calc_expect_func(2, 1024, 2, 128, window_size, mask_type, group_num=kv_head) + param_seqlen = data[4].tolist() + in_tensors = [torch.from_numpy(tensor) for tensor in data] + in_tensors = [tensor.npu() for tensor in in_tensors] + a = [print(tensor.dtype, tensor.device) for tensor in in_tensors] + + OP_NAME = "SelfAttentionOperation" + print("now qseqlen is ", self.q_seqlen) + self.set_data_params(kv_head=kv_head, mask_type=mask_type, heads=self.heads, embeddim=self.embeddim, embeddimv=self.embeddimv, kv_seqLen=self.kv_seqLen, batch=2, window_size=window_size, + no_cache=True) + self.gen_out_tensor() + PARAM = json.dumps({"headNum": kv_head, "qkScale": (1 / float(math.sqrt(128))), "kvHeadNum": kv_head, \ + "maskType": mask_type, "calcType": 3, "windowSize": 32, "cacheType": 1}) + RUN_PARAM = json.dumps({"seqLen": param_seqlen}) + print(PARAM, RUN_PARAM) + if not operation_test.get_soc_version() == 'Ascend910B': + print("this testcase only supports Ascend910B") + return + + self.mask = gen_swa_cmp(window_size, self.embeddim).astype('float16') + self.golden_out = torch.reshape(self.golden_out, (2048, 2, 128)) + self.golden_out_true = torch.reshape(self.golden_out_true, (2048, 2, 128)) + self.execute_with_param(OP_NAME, PARAM, RUN_PARAM, [ + torch.reshape(self.q, (2048, 2, 128)).npu(), torch.reshape(self.k, (2048, 2, 128)).npu(), torch.reshape(self.v, (2048, 2, 128)).npu(), torch.from_numpy(self.mask).npu(), torch.from_numpy(self.q_seqlen.astype(np.int32)).npu() + ]) + +if __name__ == '__main__': + unittest.main() -- Gitee From 2b497cecbaca2c3d8ee19ad9ecf24c23c0906024 Mon Sep 17 00:00:00 2001 From: caixilong Date: Mon, 15 Sep 2025 09:40:05 +0800 Subject: [PATCH 09/94] remove loop from fa_encoder_withcache_bf16_maskfree --- .../test_self_attention_combine.py | 89 +++++++++---------- 1 file changed, 44 insertions(+), 45 deletions(-) diff --git 
a/tests/apitest/opstest/python/operations/self_attention/test_self_attention_combine.py b/tests/apitest/opstest/python/operations/self_attention/test_self_attention_combine.py index bfbfb539..d4c61b3f 100644 --- a/tests/apitest/opstest/python/operations/self_attention/test_self_attention_combine.py +++ b/tests/apitest/opstest/python/operations/self_attention/test_self_attention_combine.py @@ -1785,51 +1785,50 @@ class TestFlashAttention(operation_test.OperationTest): print("this testcase only supports Ascend910B") return # [b,ms,ms] - for _ in range(700): - batch = random.randint(1, 16) - kv_head = random.randint(1, 5) # kv_head num - isdecoder = 0 # prefill or decoder - heads = kv_head * random.randint(1, 4) # head num - embeddim = 128 - max_seq = 128 * 100 - tor = 1.0 / math.sqrt(1.0 * embeddim) - seqlen = random.randint(1, 4096) - q_seqlens = [seqlen] * batch - kv_seqLen = [seqlen + 128 * random.randint(1, 4)] * batch - is_clamp = 0 - clamp_min = 0 - clamp_max = 0 - dynamic_batch = False - block_size = 128 - num_blocks = 1024 - OP_NAME = "SelfAttentionOperation" - OP_PARAM = {"type": UNPAD_FLASH_ATTENTION_ENCODER_PREFIX_CACHE_ND, "qSeqLen": q_seqlens, - "kvSeqLen": kv_seqLen, "headSize": heads, "tor": tor, - "isClamp": is_clamp, "clampMin": clamp_min, "clampMax": clamp_max, - "maskType": MASK_TYPE_CAUSAL_MASK, "kvHead": kv_head, - "isTriuMask": 1, "alibiLeftAlign": 0, "isAlibiMaskSqrt": 0} - data_type = random.choice([torch.bfloat16, torch.float16]) - print( - f"---batch:{batch}---kv_head:{kv_head}---q_seqlens:{q_seqlens}---kv_seqLen:{kv_seqLen}---kv_head:{kv_head}---heads:{heads}---data_type:{data_type}---") - self.set_data_params(dynamic_batch=dynamic_batch, - is_decoder=isdecoder, batch=batch, kv_head=kv_head, heads=heads, - embeddim=embeddim, max_seq=max_seq, kv_seqLen=kv_seqLen, - is_clamp=is_clamp, clamp_max=clamp_max, clamp_min=clamp_min, - data_type=data_type, - op_type=OP_PARAM["type"], mask_type=MASK_TYPE_ALIBI_NO_BATCH_WITH_PREFIX, - no_cache=True, 
tor=tor, q_seqlens=q_seqlens, - num_blocks=num_blocks, block_size=block_size, is_triu_mask=True, is_mask=True) - self.gen_out_tensor() - PARAM = json.dumps( - {"headNum": heads, "calcType": CAL_TYPE_PREFIX_ENCODER, "maskType": MASK_TYPE_CAUSAL_MASK, - "kvHeadNum": kv_head, "isTriuMask": 1, "qkScale": tor, "kernelType": KERNELTYPE_HIGH_PRECISION}) - RUN_PARAM = json.dumps({"seqLen": q_seqlens, "kvSeqLen": kv_seqLen, "CalcType": CAL_TYPE_PREFIX_ENCODER, - "maskType": MASK_TYPE_CAUSAL_MASK}) - q_seqlen = np.array(q_seqlens) - q_seqlen = torch.from_numpy(q_seqlen).to(torch.int32).npu() - kv_seqLen = np.array(kv_seqLen) - kv_seqLen = torch.from_numpy(kv_seqLen).to(torch.int32).npu() - self.execute_with_param(OP_NAME, PARAM, RUN_PARAM, [self.q.npu(), self.k_cache.npu(), self.v_cache.npu(), + batch = random.randint(1, 16) + kv_head = random.randint(1, 5) # kv_head num + isdecoder = 0 # prefill or decoder + heads = kv_head * random.randint(1, 4) # head num + embeddim = 128 + max_seq = 128 * 100 + tor = 1.0 / math.sqrt(1.0 * embeddim) + seqlen = random.randint(1, 4096) + q_seqlens = [seqlen] * batch + kv_seqLen = [seqlen + 128 * random.randint(1, 4)] * batch + is_clamp = 0 + clamp_min = 0 + clamp_max = 0 + dynamic_batch = False + block_size = 128 + num_blocks = 1024 + OP_NAME = "SelfAttentionOperation" + OP_PARAM = {"type": UNPAD_FLASH_ATTENTION_ENCODER_PREFIX_CACHE_ND, "qSeqLen": q_seqlens, + "kvSeqLen": kv_seqLen, "headSize": heads, "tor": tor, + "isClamp": is_clamp, "clampMin": clamp_min, "clampMax": clamp_max, + "maskType": MASK_TYPE_CAUSAL_MASK, "kvHead": kv_head, + "isTriuMask": 1, "alibiLeftAlign": 0, "isAlibiMaskSqrt": 0} + data_type = random.choice([torch.bfloat16, torch.float16]) + print( + f"---batch:{batch}---kv_head:{kv_head}---q_seqlens:{q_seqlens}---kv_seqLen:{kv_seqLen}---kv_head:{kv_head}---heads:{heads}---data_type:{data_type}---") + self.set_data_params(dynamic_batch=dynamic_batch, + is_decoder=isdecoder, batch=batch, kv_head=kv_head, heads=heads, + 
embeddim=embeddim, max_seq=max_seq, kv_seqLen=kv_seqLen, + is_clamp=is_clamp, clamp_max=clamp_max, clamp_min=clamp_min, + data_type=data_type, + op_type=OP_PARAM["type"], mask_type=MASK_TYPE_ALIBI_NO_BATCH_WITH_PREFIX, + no_cache=True, tor=tor, q_seqlens=q_seqlens, + num_blocks=num_blocks, block_size=block_size, is_triu_mask=True, is_mask=True) + self.gen_out_tensor() + PARAM = json.dumps( + {"headNum": heads, "calcType": CAL_TYPE_PREFIX_ENCODER, "maskType": MASK_TYPE_CAUSAL_MASK, + "kvHeadNum": kv_head, "isTriuMask": 1, "qkScale": tor, "kernelType": KERNELTYPE_HIGH_PRECISION}) + RUN_PARAM = json.dumps({"seqLen": q_seqlens, "kvSeqLen": kv_seqLen, "CalcType": CAL_TYPE_PREFIX_ENCODER, + "maskType": MASK_TYPE_CAUSAL_MASK}) + q_seqlen = np.array(q_seqlens) + q_seqlen = torch.from_numpy(q_seqlen).to(torch.int32).npu() + kv_seqLen = np.array(kv_seqLen) + kv_seqLen = torch.from_numpy(kv_seqLen).to(torch.int32).npu() + self.execute_with_param(OP_NAME, PARAM, RUN_PARAM, [self.q.npu(), self.k_cache.npu(), self.v_cache.npu(), self.block_tables.npu(), q_seqlen, kv_seqLen]) def test_self_attention_encoder_operation_alibi_bf16(self): """ -- Gitee From 58d087303b54aa6231cf156510eaab518820c9eb Mon Sep 17 00:00:00 2001 From: caixilong Date: Tue, 16 Sep 2025 11:17:39 +0800 Subject: [PATCH 10/94] fix alibi 16 bug --- .../test_self_attention_combine.py | 32 +++++++++---------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/tests/apitest/opstest/python/operations/self_attention/test_self_attention_combine.py b/tests/apitest/opstest/python/operations/self_attention/test_self_attention_combine.py index d4c61b3f..0102de9b 100644 --- a/tests/apitest/opstest/python/operations/self_attention/test_self_attention_combine.py +++ b/tests/apitest/opstest/python/operations/self_attention/test_self_attention_combine.py @@ -1835,35 +1835,35 @@ class TestFlashAttention(operation_test.OperationTest): is_decoder = 1, no_cache=False, "maskType": MASK_TYPE_NO_HEAD_DECODER, cache_type = 1 
qseqlen = kv_seqLen = [1024] """ - batch = 1 - kv_head = 1 # kv_head num - isdecoder = 0 # prefill or decoder - heads = 12 - embeddim = 128 - max_seq = 1024 - tor = 1 - kv_seqLen = [1024] + batch = random.randint(1, 10) + kv_head = random.randint(1, 32) # kv_head num + isdecoder = 0 # prefill or decoder + heads = kv_head * random.randint(1, 5) + embeddim = random.choice([32, 64, 128]) + max_seq = random.randint(1, 2048) + tor = 1.0 / math.sqrt(1.0 * embeddim) + kv_seqLen = [max_seq] * batch is_clamp = 0 clamp_min = 0 clamp_max = 0 dynamic_batch = False data_type = torch.bfloat16 + print(f"--batch:{batch}--kv_head:{kv_head}--heads:{heads}--embeddim:{embeddim}--max_seq:{max_seq}") - self.set_data_params(dynamic_batch = dynamic_batch, - is_decoder = isdecoder, batch = batch, kv_head = kv_head, heads = heads, - embeddim = embeddim, max_seq = max_seq, kv_seqLen = kv_seqLen, - is_clamp = is_clamp, clamp_max = clamp_max, clamp_min = clamp_min, tor=tor, - data_type = data_type) + self.set_data_params(dynamic_batch=dynamic_batch, + is_decoder=isdecoder, batch=batch, kv_head=kv_head, heads=heads, + embeddim=embeddim, max_seq=max_seq, kv_seqLen=kv_seqLen, + is_clamp=is_clamp, clamp_max=clamp_max, clamp_min=clamp_min, + data_type=data_type, is_alibi=True, tor=tor, + op_type=10, mask_type=MASK_TYPE_ALIBI_WITH_BATCH, no_cache=True) self.gen_out_tensor() self.mask = self.mask.to(torch.bfloat16) data = [self.q, self.k, self.v, self.mask, self.kv_seqLen, self.golden_out] param_seqlen = data[4] data[4] = torch.from_numpy(np.array(data[4]).astype(np.int32)) - data[1], data[2] = torch.reshape(data[1], (max_seq, embeddim)), torch.reshape(data[2], (max_seq, embeddim)) in_tensors = [tensor.npu().contiguous() for tensor in data] - OP_NAME = "SelfAttentionOperation" - PARAM = json.dumps({"headNum": 12, "qkScale": 1, "kvHeadNum": 1, + PARAM = json.dumps({"headNum": heads, "qkScale": 1.0 / math.sqrt(1.0 * embeddim), "kvHeadNum": kv_head, "calcType": 3, "maskType": 2, "isTriuMask": 1, 
"kernelType": 0}) RUN_PARAM = json.dumps({"seqLen": param_seqlen}) if not operation_test.get_soc_version() == 'Ascend910B': -- Gitee From 99a120bed4fc01f8128edb9edbb02b26478be795 Mon Sep 17 00:00:00 2001 From: caixilong Date: Tue, 16 Sep 2025 19:45:37 +0800 Subject: [PATCH 11/94] fix swa_decoder and swa_decoder_cache bug --- .../test_self_attention_combine.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/tests/apitest/opstest/python/operations/self_attention/test_self_attention_combine.py b/tests/apitest/opstest/python/operations/self_attention/test_self_attention_combine.py index 0102de9b..af32382b 100644 --- a/tests/apitest/opstest/python/operations/self_attention/test_self_attention_combine.py +++ b/tests/apitest/opstest/python/operations/self_attention/test_self_attention_combine.py @@ -1274,7 +1274,7 @@ class TestFlashAttention(operation_test.OperationTest): self.dynamic_batch = False kv_seqLen = [114] * batch qSeqLen = [1] * batch - self.max_seq = max(max(kv_seqLen), max(qSeqLen)) + self.max_seq = 256 self.window_size = 16 self.q_seqlen, self.q_ntokens = self.gen_seq_len(batch, [1] * batch) self.kv_seqlen, self.kv_ntokens = self.gen_seq_len(batch, kv_seqLen) @@ -1285,20 +1285,20 @@ class TestFlashAttention(operation_test.OperationTest): self.q_scale = 1 self.qk_scale = tor - self.cache_type = 1 + self.cache_type = 0 OP_NAME = "SelfAttentionOperation" OP_PARAM = {"type": 1} self.set_data_params(cache_type=self.cache_type, is_decoder=self.is_decoder, batch=batch, kv_head=kv_head, heads=self.heads, embeddim=self.embeddim, max_seq=self.max_seq, kv_seqLen=kv_seqLen, - data_type=data_type, long_seq = True, - op_type=OP_PARAM["type"], mask_type = MASK_TYPE_SWA, + data_type=data_type, long_seq = True, window_size=self.window_size, + op_type=OP_PARAM["type"], mask_type = MASK_TYPE_NO_HEAD_DECODER, no_cache=False, is_sqrt=False, tor=tor, q_seqlens=self.q_seqlen) self.gen_out_tensor() self.window_size = 16 param = json.dumps( 
{"headNum": self.heads, "qScale": float(self.q_scale), "qkScale": float(self.qk_scale), "maskType": 7, - "kvcacheCfg": self.cache_type, "calcType": 2, "windowSize": self.window_size}) + "kvcacheCfg": 1, "calcType": 2, "windowSize": self.window_size}) self.param_seqlen = self.q_seqlen self.param_token_offset = self.kv_seqlen @@ -1389,7 +1389,7 @@ class TestFlashAttention(operation_test.OperationTest): kv_seqLen = [32, 1024] * 4 self.max_seq = 1024 self.window_size = 64 - self.cacheType = 1 + self.cache_type = 1 self.q_seqlen, self.q_ntokens = self.gen_seq_len(batch, [1] * batch) self.kv_seqlen, self.kv_ntokens = self.gen_seq_len(batch, kv_seqLen) self.layer_id = torch.from_numpy(np.array([0], dtype=np.int32)).to(torch.int32) @@ -1405,8 +1405,8 @@ class TestFlashAttention(operation_test.OperationTest): OP_PARAM = {"type": 1} self.set_data_params(cache_type=self.cache_type, is_decoder=self.is_decoder, batch=batch, kv_head=self.kv_head, heads=self.heads, embeddim=self.embeddim, max_seq=self.max_seq, kv_seqLen=kv_seqLen, - data_type=data_type, long_seq = False, op_type=OP_PARAM["type"], mask_type = MASK_TYPE_SWA, - no_cache=False, is_sqrt=False, tor=tor, q_seqlens=self.q_seqlen) + data_type=data_type, long_seq = False, op_type=OP_PARAM["type"], mask_type = MASK_TYPE_NO_HEAD_DECODER, + no_cache=False, is_sqrt=False, tor=tor, q_seqlens=self.q_seqlen, window_size=self.window_size) self.gen_out_tensor() self.window_size = 64 -- Gitee From fc2e25eb7f3f58a69da2cc979c69f3a5ff4dc066 Mon Sep 17 00:00:00 2001 From: hongshiyi Date: Wed, 17 Sep 2025 19:49:02 +0800 Subject: [PATCH 12/94] add dlopen to resolve acl function --- .../platform/platform_infos_def.cpp | 105 +++++++++++------- 1 file changed, 66 insertions(+), 39 deletions(-) diff --git a/src/kernels/tbe_adapter/platform/platform_infos_def.cpp b/src/kernels/tbe_adapter/platform/platform_infos_def.cpp index 2cb40ebf..0e1d7637 100644 --- a/src/kernels/tbe_adapter/platform/platform_infos_def.cpp +++ 
b/src/kernels/tbe_adapter/platform/platform_infos_def.cpp @@ -9,10 +9,11 @@ */ #include "platform/platform_infos_def.h" +#include "mki/utils/dl/dl.h" +#include "mki/utils/env/env.h" #include #include #include "platform_infos_impl.h" -#include "acl/acl_rt.h" namespace fe { constexpr uint32_t MAX_CORE_NUM = 128; @@ -101,59 +102,85 @@ void PlatFormInfos::SetFixPipeDtypeMap(const std::mapSetFixPipeDtypeMap(fixpipeDtypeMap); } +typedef int (*aclrtGetResInCurrentThreadFunc)(int, uint32_t*); + void PlatFormInfos::SetCoreNumByCoreType(const std::string &core_type) { uint32_t coreNum = 0; - aclrtDevResLimitType resType = core_type == "VectorCore" ? ACL_RT_DEV_RES_VECTOR_CORE : ACL_RT_DEV_RES_CUBE_CORE; - aclError getResRet = aclrtGetResInCurrentThread(resType, &coreNum); - if (getResRet == ACL_SUCCESS) { - core_num_ = coreNum; - MKI_LOG(DEBUG) << "Get ThreadResource::core_num_ to " << core_type << ": " << coreNum; - } else { - std::string coreNumStr; - std::string coreTypeStr; - if (core_type == "VectorCore") { - coreTypeStr = "vector_core_cnt"; + Mki::Dl dl = Mki::Dl(std::string(Mki::GetEnv("ASCEND_HOME_PATH")) + "/runtime/lib64/libascendcl.so", false); + aclrtGetResInCurrentThreadFunc aclrtGetResInCurrentThread = (aclrtGetResInCurrentThreadFunc)dl.GetSymbol("aclrtGetResInCurrentThread"); + if (aclrtGetResInCurrentThread != nullptr) { + int8_t resType = core_type == "VectorCore" ? 
1 : 0; + int getResRet = aclrtGetResInCurrentThread(resType, &coreNum); + if (getResRet == 0) { + core_num_ = coreNum; + MKI_LOG(DEBUG) << "Get ThreadResource::core_num_ to " << core_type << ": " << coreNum; + if (core_num_ == 0 || core_num_ > MAX_CORE_NUM) { + MKI_LOG(ERROR) << "core_num is out of range : " << core_num_; + core_num_ = 1; + } + return; } else { - coreTypeStr = "ai_core_cnt"; - } - std::lock_guard lockGuard(g_asdopsFePlatMutex); - (void)GetPlatformRes("SoCInfo", coreTypeStr, coreNumStr); - MKI_LOG(DEBUG) << "Set PlatFormInfos::core_num_ to " << coreTypeStr << ": " << coreNumStr; - if (coreNumStr.empty()) { - core_num_ = 1; - MKI_LOG(ERROR) << "CoreNumStr is empty!"; - } else { - core_num_ = std::strtoul(coreNumStr.c_str(), nullptr, 10); // 10 进制 + MKI_LOG(WARN) << "Failed to get thread core num!"; } + } else { + MKI_LOG(WARN) << "Failed to acl function!"; } + std::string coreNumStr; + std::string coreTypeStr; + if (core_type == "VectorCore") { + coreTypeStr = "vector_core_cnt"; + } else { + coreTypeStr = "ai_core_cnt"; + } + std::lock_guard lockGuard(g_asdopsFePlatMutex); + (void)GetPlatformRes("SoCInfo", coreTypeStr, coreNumStr); + MKI_LOG(DEBUG) << "Set PlatFormInfos::core_num_ to " << coreTypeStr << ": " << coreNumStr; + if (coreNumStr.empty()) { + core_num_ = 1; + MKI_LOG(ERROR) << "CoreNumStr is empty!"; + } else { + core_num_ = std::strtoul(coreNumStr.c_str(), nullptr, 10); // 10 进制 + } if (core_num_ == 0 || core_num_ > MAX_CORE_NUM) { MKI_LOG(ERROR) << "core_num is out of range : " << core_num_; - core_num_ = 1; - } + core_num_ = 1; + } } uint32_t PlatFormInfos::GetCoreNumByType(const std::string &core_type) -{ +{ uint32_t coreNum = 0; - aclrtDevResLimitType resType = core_type == "VectorCore" ? 
ACL_RT_DEV_RES_VECTOR_CORE : ACL_RT_DEV_RES_CUBE_CORE; - aclError getResRet = aclrtGetResInCurrentThread(resType, &coreNum); - if (getResRet == ACL_SUCCESS) { - MKI_LOG(DEBUG) << "Get ThreadResource::core_num_ to " << core_type << ": " << coreNum; - } else { - std::string coreNumStr; - std::string coreTypeStr = core_type == "VectorCore" ? "vector_core_cnt" : "ai_core_cnt"; - std::lock_guard lockGuard(g_asdopsFePlatMutex); - (void)GetPlatformRes("SoCInfo", coreTypeStr, coreNumStr); - MKI_LOG(DEBUG) << "Get PlatFormInfos::core_num_ to " << coreTypeStr << ": " << coreNumStr; - if (coreNumStr.empty()) { - MKI_LOG(ERROR) << "CoreNumStr is empty!"; - return 1; + Mki::Dl dl = Mki::Dl(std::string(Mki::GetEnv("ASCEND_HOME_PATH")) + "/runtime/lib64/libascendcl.so", false); + aclrtGetResInCurrentThreadFunc aclrtGetResInCurrentThread = (aclrtGetResInCurrentThreadFunc)dl.GetSymbol("aclrtGetResInCurrentThread"); + if (aclrtGetResInCurrentThread != nullptr) { + int resType = core_type == "VectorCore" ? 1 : 0; + int getResRet = aclrtGetResInCurrentThread(resType, &coreNum); + if (getResRet == 0) { + MKI_LOG(DEBUG) << "Get ThreadResource::core_num_ to " << core_type << ": " << coreNum; + if (coreNum > MAX_CORE_NUM) { + MKI_LOG(ERROR) << "core_num is out of range : " << coreNum; + return 1; + } + return coreNum; } else { - coreNum = std::strtoul(coreNumStr.c_str(), nullptr, 10); // 10 进制 + MKI_LOG(WARN) << "Failed to get thread resource! "; } + } else { + MKI_LOG(WARN) << "Failed to load acl Function!"; + } + std::string coreNumStr; + std::string coreTypeStr = core_type == "VectorCore" ? 
"vector_core_cnt" : "ai_core_cnt"; + std::lock_guard lockGuard(g_asdopsFePlatMutex); + (void)GetPlatformRes("SoCInfo", coreTypeStr, coreNumStr); + MKI_LOG(DEBUG) << "Get PlatFormInfos::core_num_ to " << coreTypeStr << ": " << coreNumStr; + if (coreNumStr.empty()) { + MKI_LOG(ERROR) << "CoreNumStr is empty!"; + return 1; + } else { + coreNum = std::strtoul(coreNumStr.c_str(), nullptr, 10); // 10 进制 } - if (coreNum == 0 || coreNum > MAX_CORE_NUM) { + if (coreNum > MAX_CORE_NUM) { MKI_LOG(ERROR) << "core_num is out of range : " << coreNum; return 1; } -- Gitee From a77b572846d808d6628595c18950068b2e0c14a6 Mon Sep 17 00:00:00 2001 From: hongshiyi Date: Wed, 17 Sep 2025 20:22:30 +0800 Subject: [PATCH 13/94] fix codecheck problem --- src/kernels/tbe_adapter/platform/platform_infos_def.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/kernels/tbe_adapter/platform/platform_infos_def.cpp b/src/kernels/tbe_adapter/platform/platform_infos_def.cpp index 0e1d7637..71b1cbdd 100644 --- a/src/kernels/tbe_adapter/platform/platform_infos_def.cpp +++ b/src/kernels/tbe_adapter/platform/platform_infos_def.cpp @@ -102,13 +102,14 @@ void PlatFormInfos::SetFixPipeDtypeMap(const std::mapSetFixPipeDtypeMap(fixpipeDtypeMap); } -typedef int (*aclrtGetResInCurrentThreadFunc)(int, uint32_t*); +using AclrtGetResInCurrentThreadFunc = int(*)(int, uint32_t*); void PlatFormInfos::SetCoreNumByCoreType(const std::string &core_type) { uint32_t coreNum = 0; Mki::Dl dl = Mki::Dl(std::string(Mki::GetEnv("ASCEND_HOME_PATH")) + "/runtime/lib64/libascendcl.so", false); - aclrtGetResInCurrentThreadFunc aclrtGetResInCurrentThread = (aclrtGetResInCurrentThreadFunc)dl.GetSymbol("aclrtGetResInCurrentThread"); + AclrtGetResInCurrentThreadFunc aclrtGetResInCurrentThread = + (AclrtGetResInCurrentThreadFunc)dl.GetSymbol("aclrtGetResInCurrentThread"); if (aclrtGetResInCurrentThread != nullptr) { int8_t resType = core_type == "VectorCore" ? 
1 : 0; int getResRet = aclrtGetResInCurrentThread(resType, &coreNum); -- Gitee From e198375459cb40e5ba98a743f5149b9d88f8027f Mon Sep 17 00:00:00 2001 From: hongshiyi Date: Thu, 18 Sep 2025 08:58:53 +0800 Subject: [PATCH 14/94] delete extra spaces --- .../tbe_adapter/platform/platform_infos_def.cpp | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/kernels/tbe_adapter/platform/platform_infos_def.cpp b/src/kernels/tbe_adapter/platform/platform_infos_def.cpp index 71b1cbdd..49a5cba5 100644 --- a/src/kernels/tbe_adapter/platform/platform_infos_def.cpp +++ b/src/kernels/tbe_adapter/platform/platform_infos_def.cpp @@ -118,8 +118,8 @@ void PlatFormInfos::SetCoreNumByCoreType(const std::string &core_type) MKI_LOG(DEBUG) << "Get ThreadResource::core_num_ to " << core_type << ": " << coreNum; if (core_num_ == 0 || core_num_ > MAX_CORE_NUM) { MKI_LOG(ERROR) << "core_num is out of range : " << core_num_; - core_num_ = 1; - } + core_num_ = 1; + } return; } else { MKI_LOG(WARN) << "Failed to get thread core num!"; @@ -142,15 +142,15 @@ void PlatFormInfos::SetCoreNumByCoreType(const std::string &core_type) MKI_LOG(ERROR) << "CoreNumStr is empty!"; } else { core_num_ = std::strtoul(coreNumStr.c_str(), nullptr, 10); // 10 进制 - } + } if (core_num_ == 0 || core_num_ > MAX_CORE_NUM) { MKI_LOG(ERROR) << "core_num is out of range : " << core_num_; - core_num_ = 1; - } + core_num_ = 1; + } } uint32_t PlatFormInfos::GetCoreNumByType(const std::string &core_type) -{ +{ uint32_t coreNum = 0; Mki::Dl dl = Mki::Dl(std::string(Mki::GetEnv("ASCEND_HOME_PATH")) + "/runtime/lib64/libascendcl.so", false); aclrtGetResInCurrentThreadFunc aclrtGetResInCurrentThread = (aclrtGetResInCurrentThreadFunc)dl.GetSymbol("aclrtGetResInCurrentThread"); @@ -179,7 +179,7 @@ uint32_t PlatFormInfos::GetCoreNumByType(const std::string &core_type) MKI_LOG(ERROR) << "CoreNumStr is empty!"; return 1; } else { - coreNum = std::strtoul(coreNumStr.c_str(), nullptr, 10); // 10 进制 + coreNum 
= std::strtoul(coreNumStr.c_str(), nullptr, 10); // 10 进制 } if (coreNum > MAX_CORE_NUM) { MKI_LOG(ERROR) << "core_num is out of range : " << coreNum; -- Gitee From bf6c6148b9959ed8716b043aa563de56bffda31d Mon Sep 17 00:00:00 2001 From: hongshiyi Date: Thu, 18 Sep 2025 11:14:59 +0800 Subject: [PATCH 15/94] change header order --- src/kernels/tbe_adapter/platform/platform_infos_def.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/kernels/tbe_adapter/platform/platform_infos_def.cpp b/src/kernels/tbe_adapter/platform/platform_infos_def.cpp index 49a5cba5..c24edb60 100644 --- a/src/kernels/tbe_adapter/platform/platform_infos_def.cpp +++ b/src/kernels/tbe_adapter/platform/platform_infos_def.cpp @@ -9,10 +9,10 @@ */ #include "platform/platform_infos_def.h" +#include #include "mki/utils/dl/dl.h" #include "mki/utils/env/env.h" -#include -#include +#include "mki/utils/log/log.h" #include "platform_infos_impl.h" namespace fe { @@ -127,8 +127,8 @@ void PlatFormInfos::SetCoreNumByCoreType(const std::string &core_type) } else { MKI_LOG(WARN) << "Failed to acl function!"; } - std::string coreNumStr; - std::string coreTypeStr; + std::string coreNumStr = ""; + std::string coreTypeStr = ""; if (core_type == "VectorCore") { coreTypeStr = "vector_core_cnt"; } else { -- Gitee From 96e5a249a06627866326b9334b460e352ed1f03e Mon Sep 17 00:00:00 2001 From: hongshiyi Date: Thu, 18 Sep 2025 11:16:52 +0800 Subject: [PATCH 16/94] init string --- src/kernels/tbe_adapter/platform/platform_infos_def.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/kernels/tbe_adapter/platform/platform_infos_def.cpp b/src/kernels/tbe_adapter/platform/platform_infos_def.cpp index c24edb60..3464a4f6 100644 --- a/src/kernels/tbe_adapter/platform/platform_infos_def.cpp +++ b/src/kernels/tbe_adapter/platform/platform_infos_def.cpp @@ -170,7 +170,7 @@ uint32_t PlatFormInfos::GetCoreNumByType(const std::string &core_type) } else { MKI_LOG(WARN) << "Failed to load 
acl Function!"; } - std::string coreNumStr; + std::string coreNumStr = ""; std::string coreTypeStr = core_type == "VectorCore" ? "vector_core_cnt" : "ai_core_cnt"; std::lock_guard lockGuard(g_asdopsFePlatMutex); (void)GetPlatformRes("SoCInfo", coreTypeStr, coreNumStr); -- Gitee From d677a364ecedb672c673c63f7af3668402d266a0 Mon Sep 17 00:00:00 2001 From: godantshen_ Date: Wed, 17 Sep 2025 16:34:12 +0800 Subject: [PATCH 17/94] icsl modification --- .../kernels/matmul/pp_matmul_i8_kernel/pp_matmul_i8_kernel.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/kernels/kernels/matmul/pp_matmul_i8_kernel/pp_matmul_i8_kernel.cpp b/src/kernels/kernels/matmul/pp_matmul_i8_kernel/pp_matmul_i8_kernel.cpp index e4c4f5cc..82328c5e 100644 --- a/src/kernels/kernels/matmul/pp_matmul_i8_kernel/pp_matmul_i8_kernel.cpp +++ b/src/kernels/kernels/matmul/pp_matmul_i8_kernel/pp_matmul_i8_kernel.cpp @@ -140,6 +140,7 @@ public: Status InitImpl(const LaunchParam &launchParam) override { Status status = PpMatmulTiling(launchParam, kernelInfo_); + MKI_CHECK(status.Ok(), "tiling return invalid value.", return status); kernelInfo_.SetHwsyncIdx(0); return status; } -- Gitee From d5e5cab9d80c2a95f0fa5916caf1d7990e2e4feb Mon Sep 17 00:00:00 2001 From: Hall Date: Thu, 18 Sep 2025 14:50:56 +0800 Subject: [PATCH 18/94] fix(rope): support rope test when rotaryCoeff equals headsize --- example/op_demo/rms_norm/README.md | 22 ++++---- example/op_demo/rope/README.md | 14 ++--- tests/apitest/opstest/csv/rope.csv | 16 +++++- .../python/CsvOpsTestTool/data_generation.py | 54 +++++++++---------- 4 files changed, 60 insertions(+), 46 deletions(-) diff --git a/example/op_demo/rms_norm/README.md b/example/op_demo/rms_norm/README.md index 11826914..7e5a3f73 100644 --- a/example/op_demo/rms_norm/README.md +++ b/example/op_demo/rms_norm/README.md @@ -1,6 +1,6 @@ # 加速库RmsNormOperation C++ Demo ## 介绍 -该目录下为加速库RmsNormOperation C++调用示例,以下示例仅适用于Atlas A2/A3训练系列产品、Atlas 800I A2推理产品、Atlas A3 推理系列产品。 
+该目录下为加速库RmsNormOperation C++调用示例。 ## 使用说明 - 首先source 对应的CANN和nnal包 @@ -46,8 +46,8 @@ tests/apitest/opstest/python/operations/rms_norm/ - **rms_norm_qwen_demo_0.cpp** - 【注】:编译脚本内替换 rms_norm_demo.cpp 为 rms_norm_qwen_demo_0.cpp 可编译运行 - + 【注】:编译脚本内替换 rms_norm_demo.cpp 为 rms_norm_qwen_demo_0.cpp 可编译运行; + 该示例仅适用于Atlas A2/A3训练系列产品、Atlas 800I A2推理产品、Atlas A3 推理系列产品。 **参数设置** | Param | value | | :-----------------: | :--------------------------------------------------: | @@ -70,8 +70,8 @@ tests/apitest/opstest/python/operations/rms_norm/ - **rms_norm_qwen_demo_1.cpp** - 【注】:编译脚本内替换 rms_norm_demo.cpp 为 rms_norm_qwen_demo_1.cpp 可编译运行 - + 【注】:编译脚本内替换 rms_norm_demo.cpp 为 rms_norm_qwen_demo_1.cpp 可编译运行; + 该示例仅适用于Atlas A2/A3训练系列产品、Atlas 800I A2推理产品、Atlas A3 推理系列产品。 **参数设置** | Param | value | | :-----------------: | :--------------------------------------------------: | @@ -94,8 +94,8 @@ tests/apitest/opstest/python/operations/rms_norm/ - **rms_norm_qwen_demo_2.cpp** - 【注】:编译脚本内替换 rms_norm_demo.cpp 为 rms_norm_qwen_demo_2.cpp 可编译运行 - + 【注】:编译脚本内替换 rms_norm_demo.cpp 为 rms_norm_qwen_demo_2.cpp 可编译运行; + 该示例仅适用于Atlas A2/A3训练系列产品、Atlas 800I A2推理产品、Atlas A3 推理系列产品。 **参数设置** | Param | value | | :-----------------: | :--------------------------------------------------: | @@ -118,8 +118,8 @@ tests/apitest/opstest/python/operations/rms_norm/ - **rms_norm_deepseek_demo_0.cpp** - 【注】:编译脚本内替换 rms_norm_demo.cpp 为 rms_norm_deepseek_demo_0.cpp 可编译运行 - + 【注】:编译脚本内替换 rms_norm_demo.cpp 为 rms_norm_deepseek_demo_0.cpp 可编译运行; + 该示例仅适用于Atlas A2/A3训练系列产品、Atlas 800I A2推理产品、Atlas A3 推理系列产品。 **参数设置** | Param | value | | :-----------------: | :--------------------------------------------------: | @@ -142,8 +142,8 @@ tests/apitest/opstest/python/operations/rms_norm/ - **rms_norm_deepseek_demo_1.cpp** - 【注】:编译脚本内替换 rms_norm_demo.cpp 为 rms_norm_deepseek_demo_1.cpp 可编译运行 - + 【注】:编译脚本内替换 rms_norm_demo.cpp 为 rms_norm_deepseek_demo_1.cpp 可编译运行; + 该示例仅适用于Atlas A2/A3训练系列产品、Atlas 800I A2推理产品、Atlas A3 推理系列产品。 **参数设置** 
| Param | value | | :-----------------: | :--------------------------------------------------: | diff --git a/example/op_demo/rope/README.md b/example/op_demo/rope/README.md index 93362e45..6898cb29 100644 --- a/example/op_demo/rope/README.md +++ b/example/op_demo/rope/README.md @@ -1,6 +1,6 @@ # 加速库RopeOperation C++ Demo ## 介绍 -该目录下为加速库RopeOperation C++调用示例, 以下示例仅适用于Atlas A2/A3训练系列产品、Atlas 800I A2推理产品、Atlas A3 推理系列产品。 +该目录下为加速库RopeOperation C++调用示例。 ## 使用说明 - 首先source 对应的CANN和nnal包 @@ -49,8 +49,8 @@ tests/apitest/opstest/python/operations/rope/ - **rope_qwen_demo_0.cpp** - 【注】:编译脚本内替换 rope_demo.cpp 为 rope_qwen_demo_0.cpp 可编译运行 - + 【注】:编译脚本内替换 rope_demo.cpp 为 rope_qwen_demo_0.cpp 可编译运行; + 该示例仅适用于Atlas A2/A3训练系列产品、Atlas 800I A2推理产品、Atlas A3 推理系列产品。 **参数设置** | Param | value | | :---------: | :---: | @@ -76,8 +76,8 @@ tests/apitest/opstest/python/operations/rope/ - **rope_qwen_demo_1.cpp** - 【注】:编译脚本内替换 rope_demo.cpp 为 rope_qwen_demo_1.cpp 可编译运行 - + 【注】:编译脚本内替换 rope_demo.cpp 为 rope_qwen_demo_1.cpp 可编译运行; + 该示例仅适用于Atlas A2/A3训练系列产品、Atlas 800I A2推理产品、Atlas A3 推理系列产品。 **参数设置** | Param | value | | :---------: | :---: | @@ -103,8 +103,8 @@ tests/apitest/opstest/python/operations/rope/ - **rope_qwen_demo_2.cpp** - 【注】:编译脚本内替换 rope_demo.cpp 为 rope_qwen_demo_2.cpp 可编译运行 - + 【注】:编译脚本内替换 rope_demo.cpp 为 rope_qwen_demo_2.cpp 可编译运行; + 该示例仅适用于Atlas A2/A3训练系列产品、Atlas 800I A2推理产品、Atlas A3 推理系列产品。 **参数设置** | Param | value | | :---------: | :---: | diff --git a/tests/apitest/opstest/csv/rope.csv b/tests/apitest/opstest/csv/rope.csv index a24ba7b4..e84c2f23 100644 --- a/tests/apitest/opstest/csv/rope.csv +++ b/tests/apitest/opstest/csv/rope.csv @@ -23,4 +23,18 @@ CaseNum|CaseName |OpName |OpParam |InNum 22 |rsv|RopeOperation|{"rsv":[1]}|0||||0||||||||C:ERROR_INVALID_PARAM||| | 23 |310Brope2Error |RopeOperation|{"rotaryCoeff": 64} |5 |float16;float16;float16;float16;int32 |nd;nd;nd;nd;nd |2,256,32,64;2,256,32,64;512,64;512,64;2 |2 |float16;float16|nd;nd |2,256,32,64;2,256,32,64 
|random;random;customize;customize;random |0,1;0,1;0,1;0,1;256,256 | | |NO_ERROR | |ChatGLM-6B |Ascend310B |Function 24 |310BunpadropeError|RopeOperation|{"rotaryCoeff": 4} |5 |float16;float16;float16;float16;int32 |nd;nd;nd;nd;nd |4,4096;4,4096;4,128;4,128;1 |2 |float16;float16|nd;nd |4,4096;4,4096 |customize;customize;customize;customize;customize|0,1;0,1;0,1;0,1;4,4 | | |NO_ERROR | | |Ascend310B |Function -25 |deepseek |RopeOperation|{"rotaryCoeff": 2} |5 |float16;float16;float16;float16;uint32 |nd;nd;nd;nd;nd |3072,8192;3072,64;3072,64;3072,64;1 |2 |float16;float16|nd;nd |3072,8192;3072,64 |random;random;random;random;random |0,1;0,1;0,1;0,1;2,2 | | |NO_ERROR | |deepseek |Ascend910B |Function \ No newline at end of file +25 |deepseek |RopeOperation|{"rotaryCoeff": 2} |5 |float16;float16;float16;float16;uint32 |nd;nd;nd;nd;nd |3072,8192;3072,64;3072,64;3072,64;1 |2 |float16;float16|nd;nd |3072,8192;3072,64 |random;random;random;random;random |0,1;0,1;0,1;0,1;2,2 | | |NO_ERROR | |deepseek |Ascend910B |Function +26 |rotaryCoeff16 |RopeOperation|{"rotaryCoeff": 16, "cosFormat": 0}|5 |bf16;bf16;bf16;bf16;uint32 |nd;nd;nd;nd;nd |4,448;4,448;4,16;4,16;1 |2 |bf16;bf16 |nd;nd |4,448;4,448 |random;random;customize;customize;random |-100,100;-100,100;-100,100;-100,100;4,4|| |NO_ERROR | | |Ascend910B |Function +27 |rotaryCoeff16 |RopeOperation|{"rotaryCoeff": 16, "cosFormat": 1}|5 |bf16;bf16;bf16;bf16;uint32 |nd;nd;nd;nd;nd |4,448;4,448;4,16;4,16;1 |2 |bf16;bf16 |nd;nd |4,448;4,448 |random;random;customize;customize;random |-100,100;-100,100;-100,100;-100,100;4,4|| |NO_ERROR | | |Ascend910B |Function +28 |rotaryCoeff32 |RopeOperation|{"rotaryCoeff": 32, "cosFormat": 0}|5 |bf16;bf16;bf16;bf16;uint32 |nd;nd;nd;nd;nd |4,1312;4,1312;4,32;4,32;1 |2 |bf16;bf16 |nd;nd |4,1312;4,1312 |random;random;customize;customize;random |-100,100;-100,100;-100,100;-100,100;4,4|| |NO_ERROR | | |Ascend910B |Function +29 |rotaryCoeff32 |RopeOperation|{"rotaryCoeff": 32, "cosFormat": 1}|5 
|bf16;bf16;bf16;bf16;uint32 |nd;nd;nd;nd;nd |4,1312;4,1312;4,32;4,32;1 |2 |bf16;bf16 |nd;nd |4,1312;4,1312 |random;random;customize;customize;random |-100,100;-100,100;-100,100;-100,100;4,4|| |NO_ERROR | | |Ascend910B |Function +30 |rotaryCoeff64 |RopeOperation|{"rotaryCoeff": 64, "cosFormat": 0}|5 |bf16;bf16;bf16;bf16;uint32 |nd;nd;nd;nd;nd |4,4416;4,4416;4,64;4,64;1 |2 |bf16;bf16 |nd;nd |4,4416;4,4416 |random;random;customize;customize;random |-100,100;-100,100;-100,100;-100,100;4,4|| |NO_ERROR | | |Ascend910B |Function +31 |rotaryCoeff64 |RopeOperation|{"rotaryCoeff": 64, "cosFormat": 1}|5 |bf16;bf16;bf16;bf16;uint32 |nd;nd;nd;nd;nd |4,4416;4,4416;4,64;4,64;1 |2 |bf16;bf16 |nd;nd |4,4416;4,4416 |random;random;customize;customize;random |-100,100;-100,100;-100,100;-100,100;4,4|| |NO_ERROR | | |Ascend910B |Function +32 |rotaryCoeff128 |RopeOperation|{"rotaryCoeff": 128, "cosFormat": 0}|5 |bf16;bf16;bf16;bf16;uint32 |nd;nd;nd;nd;nd |4,2048;4,2048;4,128;4,128;1 |2 |bf16;bf16 |nd;nd |4,2048;4,2048 |random;random;customize;customize;random |-100,100;-100,100;-100,100;-100,100;4,4|| |NO_ERROR | | |Ascend910B |Function +33 |rotaryCoeff128 |RopeOperation|{"rotaryCoeff": 128, "cosFormat": 1}|5 |bf16;bf16;bf16;bf16;uint32 |nd;nd;nd;nd;nd |4,2048;4,2048;4,128;4,128;1 |2 |bf16;bf16 |nd;nd |4,2048;4,2048 |random;random;customize;customize;random |-100,100;-100,100;-100,100;-100,100;4,4|| |NO_ERROR | | |Ascend910B |Function +34 |rotaryCoeff256 |RopeOperation|{"rotaryCoeff": 256, "cosFormat": 0}|5 |bf16;bf16;bf16;bf16;uint32 |nd;nd;nd;nd;nd |4,32512;4,32512;4,256;4,256;1 |2 |bf16;bf16 |nd;nd |4,32512;4,32512 |random;random;customize;customize;random |-100,100;-100,100;-100,100;-100,100;4,4|| |NO_ERROR | | |Ascend910B |Function +35 |rotaryCoeff256 |RopeOperation|{"rotaryCoeff": 256, "cosFormat": 1}|5 |bf16;bf16;bf16;bf16;uint32 |nd;nd;nd;nd;nd |4,32512;4,32512;4,256;4,256;1 |2 |bf16;bf16 |nd;nd |4,32512;4,32512 |random;random;customize;customize;random 
|-100,100;-100,100;-100,100;-100,100;4,4|| |NO_ERROR | | |Ascend910B |Function +36 |rotaryCoeff512 |RopeOperation|{"rotaryCoeff": 512, "cosFormat": 0}|5 |bf16;bf16;bf16;bf16;uint32 |nd;nd;nd;nd;nd |4,22528;4,22528;4,512;4,512;1 |2 |bf16;bf16 |nd;nd |4,22528;4,22528 |random;random;customize;customize;random |-100,100;-100,100;-100,100;-100,100;4,4|| |NO_ERROR | | |Ascend910B |Function +37 |rotaryCoeff512 |RopeOperation|{"rotaryCoeff": 512, "cosFormat": 1}|5 |bf16;bf16;bf16;bf16;uint32 |nd;nd;nd;nd;nd |4,22528;4,22528;4,512;4,512;1 |2 |bf16;bf16 |nd;nd |4,22528;4,22528 |random;random;customize;customize;random |-100,100;-100,100;-100,100;-100,100;4,4|| |NO_ERROR | | |Ascend910B |Function +38 |rotaryCoeff1024 |RopeOperation|{"rotaryCoeff": 1024, "cosFormat": 0}|5 |bf16;bf16;bf16;bf16;uint32 |nd;nd;nd;nd;nd |4,55296;4,55296;4,1024;4,1024;1 |2 |bf16;bf16 |nd;nd |4,55296;4,55296 |random;random;customize;customize;random |-100,100;-100,100;-100,100;-100,100;4,4|| |NO_ERROR | | |Ascend910B |Function +39 |rotaryCoeff1024 |RopeOperation|{"rotaryCoeff": 1024, "cosFormat": 1}|5 |bf16;bf16;bf16;bf16;uint32 |nd;nd;nd;nd;nd |4,55296;4,55296;4,1024;4,1024;1 |2 |bf16;bf16 |nd;nd |4,55296;4,55296 |random;random;customize;customize;random |-100,100;-100,100;-100,100;-100,100;4,4|| |NO_ERROR | | |Ascend910B |Function \ No newline at end of file diff --git a/tests/framework/python/CsvOpsTestTool/data_generation.py b/tests/framework/python/CsvOpsTestTool/data_generation.py index 1fcbc6ea..0d4a1df2 100644 --- a/tests/framework/python/CsvOpsTestTool/data_generation.py +++ b/tests/framework/python/CsvOpsTestTool/data_generation.py @@ -3297,7 +3297,7 @@ class ReduceOperation(DataGen): class RopeOperation(DataGen): @staticmethod def customize(shapes, i, datatype, format, data_gen_ranges, op_params): - if (i == 2 or i == 3) and json.loads(op_params)["rotaryCoeff"] == 64: + if (i == 2 or i == 3) and json.loads(op_params)["rotaryCoeff"] != 2 and json.loads(op_params)["rotaryCoeff"] != 4: ntoken 
= shapes[i][0] head_size = shapes[i][1] # op需要cos/sin重复一次 @@ -3396,7 +3396,32 @@ class RopeOperation(DataGen): q = torch.concat([q0, q1], dim=(q0.ndim - 1)).view(ntoken, hidden_size) k = torch.concat([k0, k1], dim=(k0.ndim - 1)).view(ntoken, hidden_size) return [q, k] - elif json_data['rotaryCoeff'] == 64: + elif json_data['rotaryCoeff'] == 2: + dtype = in_tensors[0].dtype + if dtype == torch.bfloat16: + in_tensors[0] = in_tensors[0].to(torch.float32) + in_tensors[1] = in_tensors[1].to(torch.float32) + in_tensors[2] = in_tensors[2].to(torch.float32) + in_tensors[3] = in_tensors[3].to(torch.float32) + ntoken = in_tensors[0].size()[0] + seqlen = int(in_tensors[4][0]) + batch = ntoken // seqlen + if batch == 0: + batch = 1 + seqlen = ntoken + hidden_size = in_tensors[0].size()[1] + hidden_size1 = in_tensors[1].size()[1] + head_size = in_tensors[2].size()[1] + head_num = hidden_size // head_size + head_num1 = hidden_size1 // head_size + q = in_tensors[0].view(batch, seqlen, head_num, head_size) + k = in_tensors[1].view(batch, seqlen, head_num1, head_size) + cos = in_tensors[2].view(batch, seqlen, head_size).unsqueeze(2) + sin = in_tensors[3].view(batch, seqlen, head_size).unsqueeze(2) + q_embed = ((q * cos) + (RopeOperation.rotate_half(q) * sin)).view(ntoken, hidden_size) + k_embed = ((k * cos) + (RopeOperation.rotate_half(k) * sin)).view(ntoken, hidden_size1) + return [q_embed.to(dtype), k_embed.to(dtype)] + else: if len(in_tensors[0].size()) == 4: seqlen = in_tensors[0].size()[1] batch = in_tensors[0].size()[0] @@ -3451,31 +3476,6 @@ class RopeOperation(DataGen): return [q_out2, k_out2] else: return [q_out2.view(ntoken, hidden_sizeq), k_out2.view(ntoken, hidden_sizek)] - else: - dtype = in_tensors[0].dtype - if dtype == torch.bfloat16: - in_tensors[0] = in_tensors[0].to(torch.float32) - in_tensors[1] = in_tensors[1].to(torch.float32) - in_tensors[2] = in_tensors[2].to(torch.float32) - in_tensors[3] = in_tensors[3].to(torch.float32) - ntoken = in_tensors[0].size()[0] 
- seqlen = int(in_tensors[4][0]) - batch = ntoken // seqlen - if batch == 0: - batch = 1 - seqlen = ntoken - hidden_size = in_tensors[0].size()[1] - hidden_size1 = in_tensors[1].size()[1] - head_size = in_tensors[2].size()[1] - head_num = hidden_size // head_size - head_num1 = hidden_size1 // head_size - q = in_tensors[0].view(batch, seqlen, head_num, head_size) - k = in_tensors[1].view(batch, seqlen, head_num1, head_size) - cos = in_tensors[2].view(batch, seqlen, head_size).unsqueeze(2) - sin = in_tensors[3].view(batch, seqlen, head_size).unsqueeze(2) - q_embed = ((q * cos) + (RopeOperation.rotate_half(q) * sin)).view(ntoken, hidden_size) - k_embed = ((k * cos) + (RopeOperation.rotate_half(k) * sin)).view(ntoken, hidden_size1) - return [q_embed.to(dtype), k_embed.to(dtype)] @staticmethod def get_op_type(op_params): -- Gitee From bf23a364a36024b6d361343f3ebe10b2942871f1 Mon Sep 17 00:00:00 2001 From: hongshiyi Date: Thu, 18 Sep 2025 16:33:17 +0800 Subject: [PATCH 19/94] change acl to Acl --- src/kernels/tbe_adapter/platform/platform_infos_def.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/kernels/tbe_adapter/platform/platform_infos_def.cpp b/src/kernels/tbe_adapter/platform/platform_infos_def.cpp index 3464a4f6..7444715d 100644 --- a/src/kernels/tbe_adapter/platform/platform_infos_def.cpp +++ b/src/kernels/tbe_adapter/platform/platform_infos_def.cpp @@ -153,7 +153,7 @@ uint32_t PlatFormInfos::GetCoreNumByType(const std::string &core_type) { uint32_t coreNum = 0; Mki::Dl dl = Mki::Dl(std::string(Mki::GetEnv("ASCEND_HOME_PATH")) + "/runtime/lib64/libascendcl.so", false); - aclrtGetResInCurrentThreadFunc aclrtGetResInCurrentThread = (aclrtGetResInCurrentThreadFunc)dl.GetSymbol("aclrtGetResInCurrentThread"); + AclrtGetResInCurrentThreadFunc aclrtGetResInCurrentThread = (AclrtGetResInCurrentThreadFunc)dl.GetSymbol("aclrtGetResInCurrentThread"); if (aclrtGetResInCurrentThread != nullptr) { int resType = core_type == "VectorCore" ? 
1 : 0; int getResRet = aclrtGetResInCurrentThread(resType, &coreNum); -- Gitee From 6e3bde03081fdfa253980cbd5c2eb3cfc4a4611e Mon Sep 17 00:00:00 2001 From: huangxiaolan Date: Fri, 19 Sep 2025 15:54:37 +0800 Subject: [PATCH 20/94] fix function point --- .../src/tools/socket/lcal_sock_exchange.cpp | 8 +- src/atb/operation/operation_base.cpp | 4 +- src/atb/runner/ops_runner.cpp | 8 +- src/cinterface/atb_acl_fused_add_topk_div.cpp | 4 +- src/cinterface/atb_acl_mla.cpp | 10 +- src/cinterface/atb_acl_mla_preprocess.cpp | 4 +- src/cinterface/atb_acl_paged_cache_load.cpp | 4 +- src/cinterface/atb_acl_ring_mla.cpp | 4 +- .../atb_acl_self_attention_prefix_encoder.cpp | 4 +- src/cinterface/atb_acl_util.cpp | 4 +- .../tiling/faster_gelu_tiling.cpp | 16 +- .../gelu_forward/tiling/gelu_tiling.cpp | 20 +-- .../dynamic_quant_tiling.cpp | 118 +++++++------- .../tiling/cohere_layer_norm_tiling.cpp | 66 ++++---- .../norm/rmsnorm/tiling/rms_norm_tiling.cpp | 30 ++-- .../tiling/fused_add_topk_div_tiling.cpp | 148 +++++++++--------- .../tiling/mla_preprocess_tiling.cpp | 76 ++++----- .../tiling/mla_tiling_dependency.cpp | 4 +- .../tiling/ring_mla_tiling_dependency.cpp | 4 +- .../mixkernels/rope/tiling/rope_tiling.cpp | 14 +- .../tiling/rope_q_concat_tiling.cpp | 30 ++-- .../tiling/swi_glu_quant_tiling.cpp | 46 +++--- .../tiling/swi_glu_quant_tiling_utils.h | 22 +-- .../tbe_adapter/platform/platform_ascendc.cpp | 6 +- 24 files changed, 328 insertions(+), 326 deletions(-) diff --git a/comm/lcal/src/tools/socket/lcal_sock_exchange.cpp b/comm/lcal/src/tools/socket/lcal_sock_exchange.cpp index 552fde6b..ff5dec47 100644 --- a/comm/lcal/src/tools/socket/lcal_sock_exchange.cpp +++ b/comm/lcal/src/tools/socket/lcal_sock_exchange.cpp @@ -335,7 +335,7 @@ void LcalSockExchange::Cleanup() } } -int GetAddrFromString(LcalSocketAddress* ua, const char* ipPortPair) +int GetAddrFromString(LcalSocketAddress& ua, const char* ipPortPair) { std::string ip; uint16_t port; @@ -344,9 +344,9 @@ int 
GetAddrFromString(LcalSocketAddress* ua, const char* ipPortPair) MKI_LOG(ERROR) << "lcal ParseIpAndPort failed!"; return LCAL_ERROR_INTERNAL; } - ua->sin.sin_family = AF_INET; - ua->sin.sin_addr.s_addr = inet_addr(ip.c_str()); - ua->sin.sin_port = htons(port); + ua.sin.sin_family = AF_INET; + ua.sin.sin_addr.s_addr = inet_addr(ip.c_str()); + ua.sin.sin_port = htons(port); return LCAL_SUCCESS; } diff --git a/src/atb/operation/operation_base.cpp b/src/atb/operation/operation_base.cpp index 8668f9e5..b3afa56d 100644 --- a/src/atb/operation/operation_base.cpp +++ b/src/atb/operation/operation_base.cpp @@ -1064,12 +1064,12 @@ Status OperationBase::GraphModeLaunch() } Status OperationBase::Execute(const VariantPack &variantPack, uint8_t *workspace, uint64_t workspaceSize, - Context *context) + Context &context) { const uint64_t beginTime = GetSingleton().GetProfilingLevel0Status() ? GetSingleton().ProfSysCycleTime() : 0; - ExecuteType executeType = context->GetExecuteType(); + ExecuteType executeType = context.GetExecuteType(); ProfilingFuncName profType = executeType == EXECUTE_NORMAL ? OPERATION_EXECUTE : (executeType == EXECUTE_PRELAUNCH ? 
OPERATION_PRELAUNCH : OPERATION_LAUNCH); diff --git a/src/atb/runner/ops_runner.cpp b/src/atb/runner/ops_runner.cpp index e181a319..28347ad8 100644 --- a/src/atb/runner/ops_runner.cpp +++ b/src/atb/runner/ops_runner.cpp @@ -279,9 +279,9 @@ Status OpsRunner::FillHostTilingBufferImpl(uint8_t *hostTilingBuffer, uint64_t t Status OpsRunner::FillSingleKernelHostTilingBuffer(KernelGraphNode &node, size_t nodeId, uint8_t *kernelHostTilingBuffer, size_t tilingSize, - ContextBase *context) + ContextBase &context) { - bool ifGraphLaunchNeedCalcTiling = needKernelGraphModify_ && (context->GetLaunchMode() == GRAPH_LAUNCH_MODE); + bool ifGraphLaunchNeedCalcTiling = needKernelGraphModify_ && (context.GetLaunchMode() == GRAPH_LAUNCH_MODE); if (node.impl->GetTilingFilledFlag() && !ifGraphLaunchNeedCalcTiling) { return NO_ERROR; } @@ -289,7 +289,7 @@ Status OpsRunner::FillSingleKernelHostTilingBuffer(KernelGraphNode &node, size_t ATB_LOG(DEBUG) << GetLogPrefix() << " node[" << nodeId << "] InitHostLaunchBuffer start"; GetOpSetupStatistic().tilingCacheMissCount += 1; Mki::Timer fillTimer; - bool launchWithTiling = context->GetLaunchWithTilingStatus(); + bool launchWithTiling = context.GetLaunchWithTilingStatus(); Status status = node.impl->InitKernelInfo(kernelHostTilingBuffer, tilingSize, launchWithTiling); if (status != NO_ERROR) { ATB_LOG(ERROR) << GetLogPrefix() << " node[" << nodeId << "] InitRunInfo failed!"; @@ -300,7 +300,7 @@ Status OpsRunner::FillSingleKernelHostTilingBuffer(KernelGraphNode &node, size_t ATB_LOG(DEBUG) << GetLogPrefix() << " node[" << nodeId << "] InitHostLaunchBuffer end, time:" << fillTime; UpdateCacheTiling(node, nodeId, kernelHostTilingBuffer, tilingSize); - if (context->GetLaunchMode() == GRAPH_LAUNCH_MODE) { + if (context.GetLaunchMode() == GRAPH_LAUNCH_MODE) { // 整图下发模式下绝大部分算子tiling只需计算一次,少部分需要多次计算的用needKernelGraphModify_进行标记 node.impl->SetTilingFilledFlag(true); } diff --git a/src/cinterface/atb_acl_fused_add_topk_div.cpp 
b/src/cinterface/atb_acl_fused_add_topk_div.cpp index 65f84243..fbbfca90 100644 --- a/src/cinterface/atb_acl_fused_add_topk_div.cpp +++ b/src/cinterface/atb_acl_fused_add_topk_div.cpp @@ -21,7 +21,7 @@ atb::Status AtbFusedAddTopkDivGetWorkspaceSize(const aclTensor *x, const aclTens const aclTensor *mappingTable, uint32_t groupNum, uint32_t groupTopk, uint32_t n, uint32_t k, int activationType, bool isNorm, float scale, bool enableExpertMapping, aclTensor *y, aclTensor *indices, - uint64_t *workspaceSize, atb::Operation **op, atb::Context *context) + uint64_t &workspaceSize, atb::Operation **op, atb::Context *context) { atb::infer::FusedAddTopkDivParam param; param.groupNum = groupNum; @@ -69,7 +69,7 @@ atb::Status AtbFusedAddTopkDivGetWorkspaceSize(const aclTensor *x, const aclTens ATB_LOG(ERROR) << "AtbFusedAddTopkDivGetWorkspaceSize opeartion pointer is nullptr!"; return atb::ERROR_INVALID_OPERATION_ADDR; } - status = (*op)->Setup(pack, *workspaceSize, context); + status = (*op)->Setup(pack, workspaceSize, context); ATB_CHECK(status == atb::NO_ERROR, "AtbFusedAddTopkDiv Setup failed!", return status); return atb::NO_ERROR; } diff --git a/src/cinterface/atb_acl_mla.cpp b/src/cinterface/atb_acl_mla.cpp index e050b04c..1b115833 100644 --- a/src/cinterface/atb_acl_mla.cpp +++ b/src/cinterface/atb_acl_mla.cpp @@ -27,7 +27,7 @@ atb::Status AtbMLAGetWorkspaceSize(const aclTensor *qNope, const aclTensor *qRop const aclTensor *mask, const aclTensor *qSeqLen, const aclTensor *qkDescale, const aclTensor *pvDescale, int32_t headNum, float qkScale, int32_t kvHeadNum, int maskType, int calcType, uint8_t cacheMode, aclTensor *attenOut, aclTensor *lse, - uint64_t *workspaceSize, atb::Operation **op, atb::Context *context) + uint64_t &workspaceSize, atb::Operation **op, atb::Context *context) { atb::infer::MultiLatentAttentionParam param; param.headNum = headNum; @@ -109,13 +109,15 @@ atb::Status AtbMLAGetWorkspaceSize(const aclTensor *qNope, const aclTensor *qRop ATB_LOG(ERROR) 
<< "AtbMLAGetWorkspaceSize opeartion pointer is nullptr!"; return atb::ERROR_INVALID_OPERATION_ADDR; } - atb::Status st = (*op)->Setup(pack, *workspaceSize, context); + atb::Status st = (*op)->Setup(pack, workspaceSize, context); ATB_CHECK(st == atb::NO_ERROR, "AtbMLA Setup failed!", return st); return atb::NO_ERROR; } atb::Status AtbMLA(void *workSpcace, uint64_t workspaceSize, atb::Operation *op, atb::Context *context) { + ATB_CHECK(op != nullptr, "AtbMLA expect op pointer not to be null!", + return atb::ERROR_INVALID_OPERATION_ADDR); atb::VariantPack pack; atb::Status st = op->Execute(pack, (uint8_t *)(workSpcace), workspaceSize, context); ATB_CHECK(st == atb::NO_ERROR, "AtbMLA Execute failed!", return st); @@ -127,7 +129,7 @@ atb::Status AtbMLAPreFillGetWorkspaceSize(const aclTensor *q, const aclTensor *q const aclTensor *kRope, const aclTensor *v, const aclTensor *qSeqLen, const aclTensor *kvSeqLen, const aclTensor *mask, int32_t headNum, float qkScale, int32_t kvHeadNum, int maskType, uint8_t cacheMode, - aclTensor *attenOut, uint64_t *workspaceSize, atb::Operation **op, + aclTensor *attenOut, uint64_t &workspaceSize, atb::Operation **op, atb::Context *context) { atb::infer::MultiLatentAttentionParam param; @@ -181,7 +183,7 @@ atb::Status AtbMLAPreFillGetWorkspaceSize(const aclTensor *q, const aclTensor *q ATB_LOG(ERROR) << "AtbMLAPreFillGetWorkspaceSize opeartion pointer is nullptr!"; return atb::ERROR_INVALID_OPERATION_ADDR; } - atb::Status st = (*op)->Setup(pack, *workspaceSize, context); + atb::Status st = (*op)->Setup(pack, workspaceSize, context); ATB_CHECK(st == atb::NO_ERROR, "AtbMLAPreFill Setup failed!", return st); return atb::NO_ERROR; } diff --git a/src/cinterface/atb_acl_mla_preprocess.cpp b/src/cinterface/atb_acl_mla_preprocess.cpp index 32bd22c6..be40ee8a 100644 --- a/src/cinterface/atb_acl_mla_preprocess.cpp +++ b/src/cinterface/atb_acl_mla_preprocess.cpp @@ -28,7 +28,7 @@ atb::Status AtbMLAPreprocessGetWorkspaceSize( const aclTensor 
*kvCacheRope, const aclTensor *slotmapping, const aclTensor *ctkvScale, const aclTensor *qNopeScale, uint32_t wdqDim, uint32_t qRopeDim, uint32_t kRopeDim, float epsilon, uint32_t qRotaryCoeff, uint32_t kRotaryCoeff, bool transposeWdq, bool transposeWuq, bool transposeWuk, uint8_t cacheMode, uint16_t quantMode, aclTensor *qOut0, - aclTensor *kvCacheOut0, aclTensor *qOut1, aclTensor *kvCacheOut1, uint64_t *workspaceSize, atb::Operation **op, + aclTensor *kvCacheOut0, aclTensor *qOut1, aclTensor *kvCacheOut1, uint64_t &workspaceSize, atb::Operation **op, atb::Context *context) { atb::infer::MlaPreprocessParam param; @@ -159,7 +159,7 @@ atb::Status AtbMLAPreprocessGetWorkspaceSize( ATB_LOG(ERROR) << "AtbMLAPreprocessGetWorkspaceSize opeartion pointer is nullptr!"; return atb::ERROR_INVALID_OPERATION_ADDR; } - atb::Status st = (*op)->Setup(pack, *workspaceSize, context); + atb::Status st = (*op)->Setup(pack, workspaceSize, context); ATB_CHECK(st == atb::NO_ERROR, "AtbMLAPreprocess Setup failed!", return st); return atb::NO_ERROR; } diff --git a/src/cinterface/atb_acl_paged_cache_load.cpp b/src/cinterface/atb_acl_paged_cache_load.cpp index df6d86d5..834c8f4d 100644 --- a/src/cinterface/atb_acl_paged_cache_load.cpp +++ b/src/cinterface/atb_acl_paged_cache_load.cpp @@ -22,7 +22,7 @@ atb::Status AtbPagedCacheLoadGetWorkspaceSize(const aclTensor *keyCache, const a const aclTensor *blockTables, const aclTensor *contextLens, const aclTensor *key, const aclTensor *value, const aclTensor *seqStarts, int8_t kvCacheCfg, bool isSeqLensCumsumType, bool hasSeqStarts, - uint64_t *workspaceSize, atb::Operation **op, atb::Context *context) + uint64_t &workspaceSize, atb::Operation **op, atb::Context *context) { atb::infer::PagedCacheLoadParam param; param.kvCacheCfg = atb::infer::PagedCacheLoadParam::KvCacheCfg(kvCacheCfg); @@ -72,7 +72,7 @@ atb::Status AtbPagedCacheLoadGetWorkspaceSize(const aclTensor *keyCache, const a ATB_LOG(ERROR) << "AtbPagedCacheLoadGetWorkspaceSize opeartion 
pointer is nullptr!"; return atb::ERROR_INVALID_OPERATION_ADDR; } - atb::Status st = (*op)->Setup(pack, *workspaceSize, context); + atb::Status st = (*op)->Setup(pack, workspaceSize, context); ATB_CHECK(st == atb::NO_ERROR, "AtbPagedCacheLoad Setup failed!", return st); return atb::NO_ERROR; } diff --git a/src/cinterface/atb_acl_ring_mla.cpp b/src/cinterface/atb_acl_ring_mla.cpp index 62468810..677fcf1b 100644 --- a/src/cinterface/atb_acl_ring_mla.cpp +++ b/src/cinterface/atb_acl_ring_mla.cpp @@ -23,7 +23,7 @@ atb::Status AtbRingMLAGetWorkspaceSize(const aclTensor *querySplit1, const aclTe const aclTensor *mask, const aclTensor *seqLen, const aclTensor *prevOut, const aclTensor *prevLse, int32_t headNum, int32_t kvHeadNum, float qkScale, int kernelType, int maskType, int inputLayout, int calcType, aclTensor *output, - aclTensor *softmaxLse, uint64_t *workspaceSize, atb::Operation **op, + aclTensor *softmaxLse, uint64_t &workspaceSize, atb::Operation **op, atb::Context *context) { atb::infer::RingMLAParam param; @@ -80,7 +80,7 @@ atb::Status AtbRingMLAGetWorkspaceSize(const aclTensor *querySplit1, const aclTe ATB_LOG(ERROR) << "AtbRingMLAGetWorkspaceSize opeartion pointer is nullptr!"; return atb::ERROR_INVALID_OPERATION_ADDR; } - status = (*op)->Setup(pack, *workspaceSize, context); + status = (*op)->Setup(pack, workspaceSize, context); ATB_CHECK(status == atb::NO_ERROR, "AtbRingMLA Setup failed!", return status); return atb::NO_ERROR; } diff --git a/src/cinterface/atb_acl_self_attention_prefix_encoder.cpp b/src/cinterface/atb_acl_self_attention_prefix_encoder.cpp index 73e4e366..ffc3e5df 100644 --- a/src/cinterface/atb_acl_self_attention_prefix_encoder.cpp +++ b/src/cinterface/atb_acl_self_attention_prefix_encoder.cpp @@ -23,7 +23,7 @@ atb::Status AtbSelfAttentionPrefixEncoderGetWorkspaceSize(const aclTensor *query const aclTensor *mask, const aclTensor *seqLen, const aclTensor *kvSeqLen, const aclTensor *slopes, int maskType, int32_t headNum, int32_t kvHeadNum, - 
float qkScale, aclTensor *attnOut, uint64_t *workspaceSize, + float qkScale, aclTensor *attnOut, uint64_t &workspaceSize, atb::Operation **op, atb::Context *context) { atb::infer::SelfAttentionParam param; @@ -94,7 +94,7 @@ atb::Status AtbSelfAttentionPrefixEncoderGetWorkspaceSize(const aclTensor *query ATB_LOG(ERROR) << "AtbSelfAttentionPrefixEncoderGetWorkspaceSize opeartion pointer is nullptr!"; return atb::ERROR_INVALID_OPERATION_ADDR; } - status = (*op)->Setup(pack, *workspaceSize, context); + status = (*op)->Setup(pack, workspaceSize, context); ATB_CHECK(status == atb::NO_ERROR, "AtbSelfAttentionPrefixEncoder Setup failed!", return status); return atb::NO_ERROR; } diff --git a/src/cinterface/atb_acl_util.cpp b/src/cinterface/atb_acl_util.cpp index d4061128..c4b28caf 100644 --- a/src/cinterface/atb_acl_util.cpp +++ b/src/cinterface/atb_acl_util.cpp @@ -17,9 +17,9 @@ extern "C" { // 256GB const int64_t MAX_TENSOR_SIZE = 256uLL * 1024uLL * 1024uLL * 1024uLL; -int64_t GetTensorSize(const aclTensor *input) +int64_t GetTensorSize(const aclTensor &input) { - const op::Shape shape = input->GetViewShape(); + const op::Shape shape = input.GetViewShape(); const size_t dims = shape.GetDimNum(); int64_t size = 1; for (size_t i = 0; i < dims; ++i) { diff --git a/src/kernels/kernels/activation/faster_gelu_forward/tiling/faster_gelu_tiling.cpp b/src/kernels/kernels/activation/faster_gelu_forward/tiling/faster_gelu_tiling.cpp index 9f65346d..fec67067 100644 --- a/src/kernels/kernels/activation/faster_gelu_forward/tiling/faster_gelu_tiling.cpp +++ b/src/kernels/kernels/activation/faster_gelu_forward/tiling/faster_gelu_tiling.cpp @@ -23,7 +23,7 @@ constexpr uint32_t TQUE_NUM = 2; constexpr uint32_t UB_RESERVED_BUFF = 0; // reserve ubSize constexpr uint32_t ALIGN_SIZE = 32; -void CalcVectorTiling512Align(const LaunchParam &launchParam, FasterGeluForwardTilingData *tilingDataPtr, +void CalcVectorTiling512Align(const LaunchParam &launchParam, FasterGeluForwardTilingData 
&tilingDataPtr, uint32_t &usedCoreNum) { uint64_t dataLen = static_cast(launchParam.GetInTensor(0).Numel()); @@ -71,20 +71,20 @@ void CalcVectorTiling512Align(const LaunchParam &launchParam, FasterGeluForwardT // 每个核计算的block_length 最均匀的分法 uint64_t baseBlockLength = dataLenAlign / (usedCoreNum * packLen) * packLen; // 搬运向下512B对齐 uint64_t resDataLenAlign = dataLenAlign - usedCoreNum * baseBlockLength; - tilingDataPtr->usedCoreNum = usedCoreNum; - std::fill(tilingDataPtr->singleCoreDataLen, tilingDataPtr->singleCoreDataLen + usedCoreNum, baseBlockLength); + tilingDataPtr.usedCoreNum = usedCoreNum; + std::fill(tilingDataPtr.singleCoreDataLen, tilingDataPtr.singleCoreDataLen + usedCoreNum, baseBlockLength); uint32_t index = 0; for (uint32_t i = packLen; i <= resDataLenAlign; i += packLen) { - tilingDataPtr->singleCoreDataLen[index % usedCoreNum] += packLen; + tilingDataPtr.singleCoreDataLen[index % usedCoreNum] += packLen; index++; } - tilingDataPtr->singleCoreDataLen[usedCoreNum - 1] += resDataLenAlign % packLen; + tilingDataPtr.singleCoreDataLen[usedCoreNum - 1] += resDataLenAlign % packLen; - tilingDataPtr->maxTileLen = - availableUB > tilingDataPtr->singleCoreDataLen[0] ? tilingDataPtr->singleCoreDataLen[0] : availableUB; + tilingDataPtr.maxTileLen = + availableUB > tilingDataPtr.singleCoreDataLen[0] ? tilingDataPtr.singleCoreDataLen[0] : availableUB; // 如果只用一个核心,对齐数量置为0,防止核内计算偏移时访问非法内存 - tilingDataPtr->alignDataNum = usedCoreNum > 1 ? alignDataNum : 0; + tilingDataPtr.alignDataNum = usedCoreNum > 1 ? 
alignDataNum : 0; } Status FasterGeluForwardTiling(const LaunchParam &launchParam, KernelInfo &kernelInfo) diff --git a/src/kernels/kernels/activation/gelu_forward/tiling/gelu_tiling.cpp b/src/kernels/kernels/activation/gelu_forward/tiling/gelu_tiling.cpp index 9a842c4c..f6b3c3b4 100644 --- a/src/kernels/kernels/activation/gelu_forward/tiling/gelu_tiling.cpp +++ b/src/kernels/kernels/activation/gelu_forward/tiling/gelu_tiling.cpp @@ -31,9 +31,9 @@ using namespace Mki; 3. 数据在core上能放的下 4. 数据在core上放不下,多次循环,不同的length */ -bool FillTilingParam(const LaunchParam &launchParam, GeluForwardTilingData *tilingDataPtr, uint32_t &coreNum) +bool FillTilingParam(const LaunchParam &launchParam, GeluForwardTilingData &tilingDataPtr, uint32_t &coreNum) { - tilingDataPtr->bufferNum = AsdOps::GELU_FORWARD_BUFF_NUM; + tilingDataPtr.bufferNum = AsdOps::GELU_FORWARD_BUFF_NUM; // 获取可用核数 coreNum = PlatformInfo::Instance().GetCoreNum(CoreType::CORE_TYPE_VECTOR); // UB空间大小,输入数据信息 @@ -61,19 +61,19 @@ bool FillTilingParam(const LaunchParam &launchParam, GeluForwardTilingData *tili alignSize * alignSize; // 判断是否为小shape来决定是否重置单次搬运数据 if (alignDataLen <= maxPerElemBytes * GELU_FORWARD_BUFF_NUM * coreNum) { - tilingDataPtr->bufferNum = 1; + tilingDataPtr.bufferNum = 1; maxPerElemBytes = maxPerElemBytes * NUM_2; } - tilingDataPtr->blockLength = (static_cast(launchParam.GetInTensor(0).Numel()) + coreNum - 1) / coreNum; - tilingDataPtr->blockLength = (tilingDataPtr->blockLength + alignSize - 1) / alignSize * alignSize; + tilingDataPtr.blockLength = (static_cast(launchParam.GetInTensor(0).Numel()) + coreNum - 1) / coreNum; + tilingDataPtr.blockLength = (tilingDataPtr.blockLength + alignSize - 1) / alignSize * alignSize; // 每个核要算的数据能否在UB上放的下 - tilingDataPtr->tileNum = tilingDataPtr->blockLength / maxPerElemBytes; - tilingDataPtr->tailLength = tilingDataPtr->blockLength % maxPerElemBytes; - if (tilingDataPtr->tileNum == 0) { - tilingDataPtr->tileLength = tilingDataPtr->tailLength; + tilingDataPtr.tileNum 
= tilingDataPtr.blockLength / maxPerElemBytes; + tilingDataPtr.tailLength = tilingDataPtr.blockLength % maxPerElemBytes; + if (tilingDataPtr.tileNum == 0) { + tilingDataPtr.tileLength = tilingDataPtr.tailLength; } else { - tilingDataPtr->tileLength = maxPerElemBytes; + tilingDataPtr.tileLength = maxPerElemBytes; } return true; } diff --git a/src/kernels/kernels/elewise/dynamic_quant/dynamic_quant_tiling/dynamic_quant_tiling.cpp b/src/kernels/kernels/elewise/dynamic_quant/dynamic_quant_tiling/dynamic_quant_tiling.cpp index bd67e59e..ff72a00e 100644 --- a/src/kernels/kernels/elewise/dynamic_quant/dynamic_quant_tiling/dynamic_quant_tiling.cpp +++ b/src/kernels/kernels/elewise/dynamic_quant/dynamic_quant_tiling/dynamic_quant_tiling.cpp @@ -37,8 +37,8 @@ uint64_t ComputeTilingKey(uint32_t alignType, const LaunchParam &launchParam) return tilingKey; } -Status ParseShape(const LaunchParam &launchParam, DynamicQuantTilingData *tilingDataPtr, - uint64_t *rowNumTotal) +Status ParseShape(const LaunchParam &launchParam, DynamicQuantTilingData &tilingDataPtr, + uint64_t &rowNumTotal) { const Mki::SVector &shape = launchParam.GetInTensor(0).desc.dims; MKI_CHECK(!shape.empty(), "shape should not be empty", @@ -50,19 +50,19 @@ Status ParseShape(const LaunchParam &launchParam, DynamicQuantTilingData *tiling MKI_CHECK(shape[i] > 0 && *rowNumTotal < static_cast(UINT32_MAX / shape[i]), "rowNumTotal or shape is invalid!", return Status::FailStatus(ERROR_INVALID_VALUE, "rowNumTotal or shape is invalid!")); - *rowNumTotal *= shape[i]; + rowNumTotal *= shape[i]; } else { - tilingDataPtr->sizeH = shape[i]; + tilingDataPtr.sizeH = shape[i]; } } if (launchParam.GetInTensor(0).desc.dtype == TENSOR_DTYPE_BF16) { - MKI_CHECK(tilingDataPtr->sizeH <= DYNAMIC_QUANT_BF16_LAST_DIM_LIMITATION, + MKI_CHECK(tilingDataPtr.sizeH <= DYNAMIC_QUANT_BF16_LAST_DIM_LIMITATION, "Ascend910B BF16 input last dim is bigger than limitation!", return Status::FailStatus(ERROR_INVALID_VALUE, "Ascend910B BF16 input last 
dim is bigger than limitation!")); } if (PlatformInfo::Instance().GetPlatformType() == PlatformType::ASCEND_310P) { - MKI_CHECK(tilingDataPtr->sizeH <= DYNAMIC_QUANT_FP16_LAST_DIM_LIMITATION_310P, + MKI_CHECK(tilingDataPtr.sizeH <= DYNAMIC_QUANT_FP16_LAST_DIM_LIMITATION_310P, "Ascend310P F16 input last dim is bigger than limitation!", return Status::FailStatus(ERROR_INVALID_VALUE, "Ascend310P F16 input last dim is bigger than limitation!")); @@ -78,33 +78,33 @@ Status ParseShape(const LaunchParam &launchParam, DynamicQuantTilingData *tiling * 3. sizeH > 512 -> numCopyRow: 64 ; 8 <= sizeH < 64 -> numCopyRow: 192 * 64 <= sizeH <= 512 -> numCopyRow: Utils::RoundUp(213 - sizeX * 2 / 7, 8) */ -void SetSuitNumCopyRow(DynamicQuantTilingData *tilingDataPtr) +void SetSuitNumCopyRow(DynamicQuantTilingData &tilingDataPtr) { - tilingDataPtr->sizeX = Utils::RoundUp(tilingDataPtr->sizeH, DYNAMIC_QUANT_ALIGN_NUM_X); - tilingDataPtr->sizeZOut = Utils::RoundUp(tilingDataPtr->sizeH); + tilingDataPtr.sizeX = Utils::RoundUp(tilingDataPtr.sizeH, DYNAMIC_QUANT_ALIGN_NUM_X); + tilingDataPtr.sizeZOut = Utils::RoundUp(tilingDataPtr.sizeH); uint32_t ubSize = PlatformInfo::Instance().GetUbSize(); - tilingDataPtr->numCopyRow = (ubSize - tilingDataPtr->sizeX * DYNAMIC_QUANT_FP16_BUF_SCALE - \ - DYNAMIC_QUANT_HEADSPACE) / (tilingDataPtr->sizeX * DYNAMIC_QUANT_COPY_ROW_SCALE); - MKI_LOG(INFO) << "numCopyRow = " << tilingDataPtr->numCopyRow; - uint32_t rowSuit = DYNAMIC_QUANT_ROW_SUIT_ADD - tilingDataPtr->sizeX * \ + tilingDataPtr.numCopyRow = (ubSize - tilingDataPtr.sizeX * DYNAMIC_QUANT_FP16_BUF_SCALE - \ + DYNAMIC_QUANT_HEADSPACE) / (tilingDataPtr.sizeX * DYNAMIC_QUANT_COPY_ROW_SCALE); + MKI_LOG(INFO) << "numCopyRow = " << tilingDataPtr.numCopyRow; + uint32_t rowSuit = DYNAMIC_QUANT_ROW_SUIT_ADD - tilingDataPtr.sizeX * \ DYNAMIC_QUANT_ROW_SUIT_MUL / DYNAMIC_QUANT_ROW_SUIT_DIV; rowSuit = rowSuit - rowSuit % DYNAMIC_QUANT_ALIGN_NUM_SCALE; - if (tilingDataPtr->numCopyRow > 
DYNAMIC_QUANT_COPY_ROW_LONG && - tilingDataPtr->sizeX >= DYNAMIC_QUANT_LEN_H_LONG) { - tilingDataPtr->numCopyRow = DYNAMIC_QUANT_COPY_ROW_LONG; - } else if (tilingDataPtr->numCopyRow > rowSuit && rowSuit > DYNAMIC_QUANT_ALIGN_NUM_SCALE && - tilingDataPtr->sizeX >= DYNAMIC_QUANT_LEN_H_SHORT) { - tilingDataPtr->numCopyRow = rowSuit; - } else if (tilingDataPtr->numCopyRow > DYNAMIC_QUANT_COPY_ROW_SHORT && - tilingDataPtr->sizeX < DYNAMIC_QUANT_LEN_H_SHORT && - tilingDataPtr->sizeX > DYNAMIC_QUANT_ALIGN_NUM_SCALE) { - tilingDataPtr->numCopyRow = DYNAMIC_QUANT_COPY_ROW_SHORT; - } else if (tilingDataPtr->numCopyRow > DYNAMIC_QUANT_ALIGN_NUM_SCALE) { - tilingDataPtr->numCopyRow = tilingDataPtr->numCopyRow - tilingDataPtr->numCopyRow % \ + if (tilingDataPtr.numCopyRow > DYNAMIC_QUANT_COPY_ROW_LONG && + tilingDataPtr.sizeX >= DYNAMIC_QUANT_LEN_H_LONG) { + tilingDataPtr.numCopyRow = DYNAMIC_QUANT_COPY_ROW_LONG; + } else if (tilingDataPtr.numCopyRow > rowSuit && rowSuit > DYNAMIC_QUANT_ALIGN_NUM_SCALE && + tilingDataPtr.sizeX >= DYNAMIC_QUANT_LEN_H_SHORT) { + tilingDataPtr.numCopyRow = rowSuit; + } else if (tilingDataPtr.numCopyRow > DYNAMIC_QUANT_COPY_ROW_SHORT && + tilingDataPtr.sizeX < DYNAMIC_QUANT_LEN_H_SHORT && + tilingDataPtr.sizeX > DYNAMIC_QUANT_ALIGN_NUM_SCALE) { + tilingDataPtr.numCopyRow = DYNAMIC_QUANT_COPY_ROW_SHORT; + } else if (tilingDataPtr.numCopyRow > DYNAMIC_QUANT_ALIGN_NUM_SCALE) { + tilingDataPtr.numCopyRow = tilingDataPtr.numCopyRow - tilingDataPtr.numCopyRow % \ DYNAMIC_QUANT_ALIGN_NUM_SCALE; } - MKI_LOG(INFO) << "numCopyRow = " << tilingDataPtr->numCopyRow; + MKI_LOG(INFO) << "numCopyRow = " << tilingDataPtr.numCopyRow; } /** @@ -116,44 +116,44 @@ void SetSuitNumCopyRow(DynamicQuantTilingData *tilingDataPtr) * numCopyRow > alignRowNum, perRowNum < alignRowNum -> numCopyRow = perRowNum * numCopyRow > perRowNum, perRowNum < 8 -> numCopyRow = perRowNum */ -Status CorrectNumCopyRow(DynamicQuantTilingData *tilingDataPtr, uint64_t rowNumTotal) +Status 
CorrectNumCopyRow(DynamicQuantTilingData &tilingDataPtr, uint64_t rowNumTotal) { - uint32_t perRowNum = Utils::CeilDiv(static_cast(rowNumTotal), tilingDataPtr->numCore); + uint32_t perRowNum = Utils::CeilDiv(static_cast(rowNumTotal), tilingDataPtr.numCore); uint32_t alignRowNum = Utils::RoundUp(perRowNum, DYNAMIC_QUANT_ALIGN_NUM_SCALE); MKI_LOG(INFO) << "perRowNum = " << perRowNum; if (PlatformInfo::Instance().GetPlatformType() == PlatformType::ASCEND_310P) { - tilingDataPtr->alignType = DYNAMIC_QUANT_STATUS_UNALIGN_310P; - if (tilingDataPtr->numCopyRow >= DYNAMIC_QUANT_ALIGN_NUM_SCALE && + tilingDataPtr.alignType = DYNAMIC_QUANT_STATUS_UNALIGN_310P; + if (tilingDataPtr.numCopyRow >= DYNAMIC_QUANT_ALIGN_NUM_SCALE && perRowNum <= DYNAMIC_QUANT_ALIGN_NUM_SCALE) { - tilingDataPtr->numCopyRow = DYNAMIC_QUANT_ALIGN_NUM_SCALE; - } else if (tilingDataPtr->numCopyRow >= alignRowNum) { - tilingDataPtr->numCopyRow = alignRowNum; + tilingDataPtr.numCopyRow = DYNAMIC_QUANT_ALIGN_NUM_SCALE; + } else if (tilingDataPtr.numCopyRow >= alignRowNum) { + tilingDataPtr.numCopyRow = alignRowNum; } - if (tilingDataPtr->sizeH % DYNAMIC_QUANT_ALIGN_SIZE != 0 || - tilingDataPtr->numCopyRow < DYNAMIC_QUANT_ALIGN_NUM_SCALE) { + if (tilingDataPtr.sizeH % DYNAMIC_QUANT_ALIGN_SIZE != 0 || + tilingDataPtr.numCopyRow < DYNAMIC_QUANT_ALIGN_NUM_SCALE) { return Status::FailStatus(ERROR_INVALID_VALUE, "Ascend310P input last dim must 64Byte alignment"); } } else { - tilingDataPtr->alignType = DYNAMIC_QUANT_STATUS_UNALIGN_910B; - if (perRowNum <= 0 && tilingDataPtr->numCopyRow > 0) { - tilingDataPtr->numCopyRow = 1; - } else if (tilingDataPtr->numCopyRow > alignRowNum && perRowNum > alignRowNum) { - tilingDataPtr->numCopyRow = alignRowNum; - } else if (tilingDataPtr->numCopyRow > alignRowNum && perRowNum < alignRowNum) { - tilingDataPtr->numCopyRow = perRowNum; - } else if (tilingDataPtr->numCopyRow > perRowNum && perRowNum < DYNAMIC_QUANT_ALIGN_NUM_SCALE) { - tilingDataPtr->numCopyRow = perRowNum; + 
tilingDataPtr.alignType = DYNAMIC_QUANT_STATUS_UNALIGN_910B; + if (perRowNum <= 0 && tilingDataPtr.numCopyRow > 0) { + tilingDataPtr.numCopyRow = 1; + } else if (tilingDataPtr.numCopyRow > alignRowNum && perRowNum > alignRowNum) { + tilingDataPtr.numCopyRow = alignRowNum; + } else if (tilingDataPtr.numCopyRow > alignRowNum && perRowNum < alignRowNum) { + tilingDataPtr.numCopyRow = perRowNum; + } else if (tilingDataPtr.numCopyRow > perRowNum && perRowNum < DYNAMIC_QUANT_ALIGN_NUM_SCALE) { + tilingDataPtr.numCopyRow = perRowNum; } - if (tilingDataPtr->numCopyRow == 0) { + if (tilingDataPtr.numCopyRow == 0) { return Status::FailStatus(ERROR_INVALID_VALUE, "Ascend910B input last dim is bigger than limitation"); } } - MKI_LOG(INFO) << "numCopyRow = " << tilingDataPtr->numCopyRow; - tilingDataPtr->sizeCopyRow = Utils::RoundUp(tilingDataPtr->numCopyRow, DYNAMIC_QUANT_ALIGN_NUM_SCALE); + MKI_LOG(INFO) << "numCopyRow = " << tilingDataPtr.numCopyRow; + tilingDataPtr.sizeCopyRow = Utils::RoundUp(tilingDataPtr.numCopyRow, DYNAMIC_QUANT_ALIGN_NUM_SCALE); return Status::OkStatus(); } -Status SetTilingData(DynamicQuantTilingData *tilingDataPtr, uint64_t rowNumTotal) +Status SetTilingData(DynamicQuantTilingData &tilingDataPtr, uint64_t rowNumTotal) { SetSuitNumCopyRow(tilingDataPtr); @@ -162,17 +162,17 @@ Status SetTilingData(DynamicQuantTilingData *tilingDataPtr, uint64_t rowNumTotal return status; } - uint32_t patchTotal = rowNumTotal / tilingDataPtr->numCopyRow; - tilingDataPtr->numLastTailRow = rowNumTotal % tilingDataPtr->numCopyRow; - tilingDataPtr->numTailTimes = patchTotal / tilingDataPtr->numCore; - tilingDataPtr->numHeadCore = patchTotal % tilingDataPtr->numCore; - tilingDataPtr->numTailCore = tilingDataPtr->numCore - tilingDataPtr->numHeadCore; - tilingDataPtr->numHeadTimes = tilingDataPtr->numTailTimes + 1; - - if (tilingDataPtr->numLastTailRow == 0 && - tilingDataPtr->numCopyRow % DYNAMIC_QUANT_ALIGN_NUM_SCALE == 0 && - tilingDataPtr->sizeH % DYNAMIC_QUANT_ALIGN_SIZE 
== 0) { - tilingDataPtr->alignType = DYNAMIC_QUANT_STATUS_ALIGN; + uint32_t patchTotal = rowNumTotal / tilingDataPtr.numCopyRow; + tilingDataPtr.numLastTailRow = rowNumTotal % tilingDataPtr.numCopyRow; + tilingDataPtr.numTailTimes = patchTotal / tilingDataPtr.numCore; + tilingDataPtr.numHeadCore = patchTotal % tilingDataPtr.numCore; + tilingDataPtr.numTailCore = tilingDataPtr.numCore - tilingDataPtr.numHeadCore; + tilingDataPtr.numHeadTimes = tilingDataPtr.numTailTimes + 1; + + if (tilingDataPtr.numLastTailRow == 0 && + tilingDataPtr.numCopyRow % DYNAMIC_QUANT_ALIGN_NUM_SCALE == 0 && + tilingDataPtr.sizeH % DYNAMIC_QUANT_ALIGN_SIZE == 0) { + tilingDataPtr.alignType = DYNAMIC_QUANT_STATUS_ALIGN; } return Status::OkStatus(); } diff --git a/src/kernels/kernels/norm/coherelayernorm/tiling/cohere_layer_norm_tiling.cpp b/src/kernels/kernels/norm/coherelayernorm/tiling/cohere_layer_norm_tiling.cpp index e3264884..a0465d01 100644 --- a/src/kernels/kernels/norm/coherelayernorm/tiling/cohere_layer_norm_tiling.cpp +++ b/src/kernels/kernels/norm/coherelayernorm/tiling/cohere_layer_norm_tiling.cpp @@ -45,36 +45,36 @@ void CohereLayerNormPrintTilingInfo(const CohereLayerNormTilingData &tilingDataP << "averageFactor = " << tilingDataPtr.averageFactor; } -Status MultipleRowMovedTiling(NormTilingDataPtrCon &layerNormPtrCon, CohereLayerNormTilingData *tilingDataPtr, +Status MultipleRowMovedTiling(NormTilingDataPtrCon &layerNormPtrCon, CohereLayerNormTilingData &tilingDataPtr, uint32_t singleRowMovedBufferSize, uint32_t multipleRowMovedBufferSize, uint32_t miscBuffersSize) { uint32_t numResidualCoreRows = layerNormPtrCon.numRow - - tilingDataPtr->numCoreRows * (tilingDataPtr->numCore - 1); + tilingDataPtr.numCoreRows * (tilingDataPtr.numCore - 1); uint32_t calcCoreRowStrides = (layerNormPtrCon.maxUbSize - singleRowMovedBufferSize - miscBuffersSize) / multipleRowMovedBufferSize; - tilingDataPtr->coreRowStrides = std::min(tilingDataPtr->numCoreRows, calcCoreRowStrides); - 
MKI_CHECK(tilingDataPtr->coreRowStrides != 0, "coreRowStrides is equal to 0", + tilingDataPtr.coreRowStrides = std::min(tilingDataPtr.numCoreRows, calcCoreRowStrides); + MKI_CHECK(tilingDataPtr.coreRowStrides != 0, "coreRowStrides is equal to 0", return Status::FailStatus(ERROR_INVALID_VALUE)); - tilingDataPtr->coreRowRepeats = tilingDataPtr->numCoreRows / tilingDataPtr->coreRowStrides; - tilingDataPtr->coreRowTailStrides = tilingDataPtr->numCoreRows % tilingDataPtr->coreRowStrides; - tilingDataPtr->coreRowTailRepeats = tilingDataPtr->coreRowTailStrides == 0 ? 0 : 1; - tilingDataPtr->residualCoreRowStrides = std::min(numResidualCoreRows, calcCoreRowStrides); - MKI_CHECK(tilingDataPtr->residualCoreRowStrides != 0, "residualCoreRowStrides is equal to 0", + tilingDataPtr.coreRowRepeats = tilingDataPtr.numCoreRows / tilingDataPtr.coreRowStrides; + tilingDataPtr.coreRowTailStrides = tilingDataPtr.numCoreRows % tilingDataPtr.coreRowStrides; + tilingDataPtr.coreRowTailRepeats = tilingDataPtr.coreRowTailStrides == 0 ? 0 : 1; + tilingDataPtr.residualCoreRowStrides = std::min(numResidualCoreRows, calcCoreRowStrides); + MKI_CHECK(tilingDataPtr.residualCoreRowStrides != 0, "residualCoreRowStrides is equal to 0", return Status::FailStatus(ERROR_INVALID_VALUE)); - tilingDataPtr->residualCoreRowRepeats = numResidualCoreRows / - tilingDataPtr->residualCoreRowStrides; - tilingDataPtr->residualCoreRowTailStrides = numResidualCoreRows % - tilingDataPtr->residualCoreRowStrides; - tilingDataPtr->residualCoreRowTailRepeats = tilingDataPtr->residualCoreRowTailStrides == 0 ? 
0 : 1; - tilingDataPtr->columnStrides = layerNormPtrCon.numCol; - tilingDataPtr->columnRepeats = 1; - tilingDataPtr->residualColumnStrides = 0; - tilingDataPtr->residualColumnRepeats = 0; + tilingDataPtr.residualCoreRowRepeats = numResidualCoreRows / + tilingDataPtr.residualCoreRowStrides; + tilingDataPtr.residualCoreRowTailStrides = numResidualCoreRows % + tilingDataPtr.residualCoreRowStrides; + tilingDataPtr.residualCoreRowTailRepeats = tilingDataPtr.residualCoreRowTailStrides == 0 ? 0 : 1; + tilingDataPtr.columnStrides = layerNormPtrCon.numCol; + tilingDataPtr.columnRepeats = 1; + tilingDataPtr.residualColumnStrides = 0; + tilingDataPtr.residualColumnRepeats = 0; return Status::OkStatus(); } -Status SingleRowMovedTiling(NormTilingDataPtrCon &layerNormPtrCon, CohereLayerNormTilingData *tilingDataPtr, +Status SingleRowMovedTiling(NormTilingDataPtrCon &layerNormPtrCon, CohereLayerNormTilingData &tilingDataPtr, uint32_t singleRowMovedElemSize, uint32_t multipleRowMovedElemSize, uint32_t miscBuffersSize) { @@ -83,21 +83,21 @@ Status SingleRowMovedTiling(NormTilingDataPtrCon &layerNormPtrCon, CohereLayerNo (singleRowMovedElemSize + multipleRowMovedElemSize), oneRepeatElemCount); uint32_t numResidualCoreRows = layerNormPtrCon.numRow - - tilingDataPtr->numCoreRows * (tilingDataPtr->numCore - 1); - tilingDataPtr->columnStrides = std::min(tilingDataPtr->numColumns, calcColumnStrides); - MKI_CHECK(tilingDataPtr->columnStrides != 0, "columnStrides is equal to 0", + tilingDataPtr.numCoreRows * (tilingDataPtr.numCore - 1); + tilingDataPtr.columnStrides = std::min(tilingDataPtr.numColumns, calcColumnStrides); + MKI_CHECK(tilingDataPtr.columnStrides != 0, "columnStrides is equal to 0", return Status::FailStatus(ERROR_INVALID_VALUE)); - tilingDataPtr->columnRepeats = layerNormPtrCon.numCol / tilingDataPtr->columnStrides; - tilingDataPtr->residualColumnStrides = layerNormPtrCon.numCol % tilingDataPtr->columnStrides; - tilingDataPtr->residualColumnRepeats = 
tilingDataPtr->residualColumnStrides == 0 ? 0 : 1; - tilingDataPtr->coreRowStrides = 1; - tilingDataPtr->coreRowRepeats = tilingDataPtr->numCoreRows; - tilingDataPtr->coreRowTailStrides = 0; - tilingDataPtr->coreRowTailRepeats = 0; - tilingDataPtr->residualCoreRowStrides = 1; - tilingDataPtr->residualCoreRowRepeats = numResidualCoreRows; - tilingDataPtr->residualCoreRowTailStrides = 0; - tilingDataPtr->residualCoreRowTailRepeats = 0; + tilingDataPtr.columnRepeats = layerNormPtrCon.numCol / tilingDataPtr.columnStrides; + tilingDataPtr.residualColumnStrides = layerNormPtrCon.numCol % tilingDataPtr.columnStrides; + tilingDataPtr.residualColumnRepeats = tilingDataPtr.residualColumnStrides == 0 ? 0 : 1; + tilingDataPtr.coreRowStrides = 1; + tilingDataPtr.coreRowRepeats = tilingDataPtr.numCoreRows; + tilingDataPtr.coreRowTailStrides = 0; + tilingDataPtr.coreRowTailRepeats = 0; + tilingDataPtr.residualCoreRowStrides = 1; + tilingDataPtr.residualCoreRowRepeats = numResidualCoreRows; + tilingDataPtr.residualCoreRowTailStrides = 0; + tilingDataPtr.residualCoreRowTailRepeats = 0; return Status::OkStatus(); } diff --git a/src/kernels/kernels/norm/rmsnorm/tiling/rms_norm_tiling.cpp b/src/kernels/kernels/norm/rmsnorm/tiling/rms_norm_tiling.cpp index 5963fc98..840efb97 100644 --- a/src/kernels/kernels/norm/rmsnorm/tiling/rms_norm_tiling.cpp +++ b/src/kernels/kernels/norm/rmsnorm/tiling/rms_norm_tiling.cpp @@ -34,18 +34,18 @@ constexpr uint32_t RMS_NORM_TILING_KEY_GEMMAMODE = 1000; // 0:gemmamode no; 1 constexpr uint32_t RMS_NORM_TILING_KEY_PRECISIONMODE = 100; // 0:precisionmode; 1:performance mode constexpr uint32_t RMS_NORM_TILING_KEY_DTYPE = 10; // 0:fp16;1:bf16 -void PrintRmsNormTiling(const RmsNormCommonTilingData *tilingDataPtr) +void PrintRmsNormTiling(const RmsNormCommonTilingData &tilingDataPtr) { MKI_LOG(INFO) << "RmsNorm Tiling Data:" - << " numCore " << tilingDataPtr->numCore - << " numCol " << tilingDataPtr->numCol - << " numRow " << tilingDataPtr->numRow - << " 
avgFactor " << tilingDataPtr->avgFactor - << " epsilon " << tilingDataPtr->epsilon - << " sliceSize " << tilingDataPtr->sliceSize - << " mode " << tilingDataPtr->mode - << " precisionMode " << tilingDataPtr->precisionMode - << " gemmaMode " << tilingDataPtr->gemmaMode; + << " numCore " << tilingDataPtr.numCore + << " numCol " << tilingDataPtr.numCol + << " numRow " << tilingDataPtr.numRow + << " avgFactor " << tilingDataPtr.avgFactor + << " epsilon " << tilingDataPtr.epsilon + << " sliceSize " << tilingDataPtr.sliceSize + << " mode " << tilingDataPtr.mode + << " precisionMode " << tilingDataPtr.precisionMode + << " gemmaMode " << tilingDataPtr.gemmaMode; } uint64_t ComputeTilingKey(uint32_t gemmaMode, uint32_t precisionMode, bool isShortTail, const LaunchParam &launchParam) { @@ -60,19 +60,19 @@ uint64_t ComputeTilingKey(uint32_t gemmaMode, uint32_t precisionMode, bool isSho return tilingKey; } -void SetNonContiguousTenor(RmsNormCommonTilingData *tilingDataPtr, const LaunchParam &launchParam) +void SetNonContiguousTenor(RmsNormCommonTilingData &tilingDataPtr, const LaunchParam &launchParam) { const auto& xStrides = launchParam.GetInTensor(0).desc.strides; const auto& shape = launchParam.GetInTensor(0).desc.dims; uint32_t dimNum = xStrides.size(); if (xStrides.empty() || dimNum == 1 || xStrides[dimNum - NUM_TWO] == shape[dimNum - 1]) { - tilingDataPtr->xDimNum = 0; + tilingDataPtr.xDimNum = 0; } else { for (size_t i = 0; i < xStrides.size(); ++ i) { - tilingDataPtr->xStrides[i] = xStrides[i]; + tilingDataPtr.xStrides[i] = xStrides[i]; } - tilingDataPtr->xDimNum = dimNum; - tilingDataPtr->xOffset = launchParam.GetInTensor(0).desc.offset; + tilingDataPtr.xDimNum = dimNum; + tilingDataPtr.xOffset = launchParam.GetInTensor(0).desc.offset; } } diff --git a/src/kernels/mixkernels/fused_add_topk_div/tiling/fused_add_topk_div_tiling.cpp b/src/kernels/mixkernels/fused_add_topk_div/tiling/fused_add_topk_div_tiling.cpp index 782bc8f5..0b1c929b 100644 --- 
a/src/kernels/mixkernels/fused_add_topk_div/tiling/fused_add_topk_div_tiling.cpp +++ b/src/kernels/mixkernels/fused_add_topk_div/tiling/fused_add_topk_div_tiling.cpp @@ -48,21 +48,21 @@ namespace AtbOps { template class FusedAddTopkDivTiling { public: - explicit FusedAddTopkDivTiling(FusedAddTopkDivTilingData *tilingDataPtr, + explicit FusedAddTopkDivTiling(FusedAddTopkDivTilingData &tilingDataPtr, const uint32_t inputCoreNum, const uint32_t inputUbSize) { - this->firstDimSize = tilingDataPtr->firstDimSize; - this->secondDimSize = tilingDataPtr->secondDimSize; - this->addNumDimSize = tilingDataPtr->addNumDimSize; - this->groupNum = tilingDataPtr->groupNum; - this->groupTopk = tilingDataPtr->groupTopk; - this->n = tilingDataPtr->n; - this->k = tilingDataPtr->k; - this->activateType = tilingDataPtr->activateType; - this->isNorm = tilingDataPtr->isNorm; - this->scale = tilingDataPtr->scale; - this->groupEles = tilingDataPtr->groupEles; - this->dtype = tilingDataPtr->dtype; + this->firstDimSize = tilingDataPtr.firstDimSize; + this->secondDimSize = tilingDataPtr.secondDimSize; + this->addNumDimSize = tilingDataPtr.addNumDimSize; + this->groupNum = tilingDataPtr.groupNum; + this->groupTopk = tilingDataPtr.groupTopk; + this->n = tilingDataPtr.n; + this->k = tilingDataPtr.k; + this->activateType = tilingDataPtr.activateType; + this->isNorm = tilingDataPtr.isNorm; + this->scale = tilingDataPtr.scale; + this->groupEles = tilingDataPtr.groupEles; + this->dtype = tilingDataPtr.dtype; this->ubSize = FloorAlign(inputUbSize, BYTE_BLOCK); this->coreNum = inputCoreNum; return; @@ -110,9 +110,9 @@ namespace AtbOps { }; template - void FusedAddTopkDivTiling::GetTilingKey(TilingData *tilingDataPtr) + void FusedAddTopkDivTiling::GetTilingKey(TilingData &tilingDataPtr) { - tilingKey = tilingDataPtr->enableExpertMapping * NUM_TEN + dtype; + tilingKey = tilingDataPtr.enableExpertMapping * NUM_TEN + dtype; } template @@ -139,27 +139,27 @@ namespace AtbOps { } template - void 
FusedAddTopkDivTiling::FillTilingData(TilingData *tilingDataPtr) + void FusedAddTopkDivTiling::FillTilingData(TilingData &tilingDataPtr) { - tilingDataPtr->firstDimSize = firstDimSize; - tilingDataPtr->secondDimSize = secondDimSize; - tilingDataPtr->addNumDimSize = addNumDimSize; - tilingDataPtr->groupNum = groupNum; - tilingDataPtr->groupTopk = groupTopk; - tilingDataPtr->n = n; - tilingDataPtr->k = k; - tilingDataPtr->activateType = activateType; - tilingDataPtr->isNorm = isNorm; - tilingDataPtr->scale = scale; - tilingDataPtr->groupEles = groupEles; - tilingDataPtr->blockNum = usedCoreNum; - tilingDataPtr->ubFactorElement = ubFactorElement; - tilingDataPtr->batchPerCore = batchPerCore; - tilingDataPtr->tailBatch = tailBatch; - tilingDataPtr->tilingKey = tilingKey; + tilingDataPtr.firstDimSize = firstDimSize; + tilingDataPtr.secondDimSize = secondDimSize; + tilingDataPtr.addNumDimSize = addNumDimSize; + tilingDataPtr.groupNum = groupNum; + tilingDataPtr.groupTopk = groupTopk; + tilingDataPtr.n = n; + tilingDataPtr.k = k; + tilingDataPtr.activateType = activateType; + tilingDataPtr.isNorm = isNorm; + tilingDataPtr.scale = scale; + tilingDataPtr.groupEles = groupEles; + tilingDataPtr.blockNum = usedCoreNum; + tilingDataPtr.ubFactorElement = ubFactorElement; + tilingDataPtr.batchPerCore = batchPerCore; + tilingDataPtr.tailBatch = tailBatch; + tilingDataPtr.tilingKey = tilingKey; uint64_t wsSize = BASE_COUNT * FLOAT_BYTES; - tilingDataPtr->workspacePerCore = wsSize; - tilingDataPtr->tempSize = firstDimSize * secondDimSize * FLOAT_BYTES; + tilingDataPtr.workspacePerCore = wsSize; + tilingDataPtr.tempSize = firstDimSize * secondDimSize * FLOAT_BYTES; } template @@ -178,30 +178,30 @@ namespace AtbOps { tilingObj.GetTiling(tilingDataPtr); } - static void PrintTilingData(const FusedAddTopkDivTilingData *tilingDataPtr) + static void PrintTilingData(const FusedAddTopkDivTilingData &tilingDataPtr) { - MKI_LOG(INFO) << "firstDimSize is: " << tilingDataPtr->firstDimSize << 
"\n" - << "secondDimSize is: " << tilingDataPtr->secondDimSize << "\n" - << "addNumDimSize is: " << tilingDataPtr->addNumDimSize << "\n" - << "groupNum is: " << tilingDataPtr->groupNum << "\n" - << "grouptopk is: " << tilingDataPtr->groupTopk << "\n" - << "n is: " << tilingDataPtr->n << "\n" - << "k is: " << tilingDataPtr->k << "\n" - << "activateType is: " << tilingDataPtr->activateType << "\n" - << "isNorm is: " << tilingDataPtr->isNorm << "\n" - << "scale is: " << tilingDataPtr->scale << "\n" - << "groupEles is: " << tilingDataPtr->groupEles << "\n" - << "blockNum is: " << tilingDataPtr->blockNum << "\n" - << "dtype is: " << tilingDataPtr->dtype << "\n" - << "ubFactorElement is: " << tilingDataPtr->ubFactorElement << "\n" - << "batchPerCore is: " << tilingDataPtr->batchPerCore << "\n" - << "tailBatch is: " << tilingDataPtr->tailBatch << "\n" - << "tilingKey is: " << tilingDataPtr->tilingKey << "\n" - << "tempSize is: " << tilingDataPtr->tempSize << "\n" - << "enableExpertMapping is: " << tilingDataPtr->enableExpertMapping << "\n" - << "expertNum is: " << tilingDataPtr->expertNum << "\n" - << "tableDim is: " << tilingDataPtr->tableDim << "\n" - << "workspacePerCore is: " << tilingDataPtr->workspacePerCore; + MKI_LOG(INFO) << "firstDimSize is: " << tilingDataPtr.firstDimSize << "\n" + << "secondDimSize is: " << tilingDataPtr.secondDimSize << "\n" + << "addNumDimSize is: " << tilingDataPtr.addNumDimSize << "\n" + << "groupNum is: " << tilingDataPtr.groupNum << "\n" + << "grouptopk is: " << tilingDataPtr.groupTopk << "\n" + << "n is: " << tilingDataPtr.n << "\n" + << "k is: " << tilingDataPtr.k << "\n" + << "activateType is: " << tilingDataPtr.activateType << "\n" + << "isNorm is: " << tilingDataPtr.isNorm << "\n" + << "scale is: " << tilingDataPtr.scale << "\n" + << "groupEles is: " << tilingDataPtr.groupEles << "\n" + << "blockNum is: " << tilingDataPtr.blockNum << "\n" + << "dtype is: " << tilingDataPtr.dtype << "\n" + << "ubFactorElement is: " << 
tilingDataPtr.ubFactorElement << "\n" + << "batchPerCore is: " << tilingDataPtr.batchPerCore << "\n" + << "tailBatch is: " << tilingDataPtr.tailBatch << "\n" + << "tilingKey is: " << tilingDataPtr.tilingKey << "\n" + << "tempSize is: " << tilingDataPtr.tempSize << "\n" + << "enableExpertMapping is: " << tilingDataPtr.enableExpertMapping << "\n" + << "expertNum is: " << tilingDataPtr.expertNum << "\n" + << "tableDim is: " << tilingDataPtr.tableDim << "\n" + << "workspacePerCore is: " << tilingDataPtr.workspacePerCore; } template Status CeilAlign(T1 a, T2 b) @@ -209,29 +209,29 @@ namespace AtbOps { return b == 0 ? a : (a + b - 1) / b * b; } - Status GetInputInfo(const LaunchParam &launchParam, FusedAddTopkDivTilingData *tilingDataPtr) + Status GetInputInfo(const LaunchParam &launchParam, FusedAddTopkDivTilingData &tilingDataPtr) { auto inTensor0 = launchParam.GetInTensor(X_INPUT_INDEX).desc; auto inTensor1 = launchParam.GetInTensor(ADD_NUM_INPUT_INDEX).desc; - tilingDataPtr->firstDimSize = inTensor0.dims[DIM_INDEX0]; - tilingDataPtr->secondDimSize = inTensor0.dims[DIM_INDEX1]; - tilingDataPtr->addNumDimSize = inTensor1.dims[DIM_INDEX0]; + tilingDataPtr.firstDimSize = inTensor0.dims[DIM_INDEX0]; + tilingDataPtr.secondDimSize = inTensor0.dims[DIM_INDEX1]; + tilingDataPtr.addNumDimSize = inTensor1.dims[DIM_INDEX0]; auto param = AnyCast(launchParam.GetParam()); - tilingDataPtr->groupNum = static_cast(param.groupNum); - tilingDataPtr->groupTopk = static_cast(param.groupTopk); - tilingDataPtr->n = static_cast(param.n); - tilingDataPtr->k = static_cast(param.k); - tilingDataPtr->activateType = static_cast(param.activateType); - tilingDataPtr->isNorm = static_cast(param.isNorm); - tilingDataPtr->enableExpertMapping = static_cast(param.enableExpertMapping); - tilingDataPtr->groupEles = tilingDataPtr->groupNum == 0 ? 
tilingDataPtr->secondDimSize : - tilingDataPtr->secondDimSize / tilingDataPtr->groupNum; - tilingDataPtr->scale = param.scale; - if (tilingDataPtr->enableExpertMapping) { + tilingDataPtr.groupNum = static_cast(param.groupNum); + tilingDataPtr.groupTopk = static_cast(param.groupTopk); + tilingDataPtr.n = static_cast(param.n); + tilingDataPtr.k = static_cast(param.k); + tilingDataPtr.activateType = static_cast(param.activateType); + tilingDataPtr.isNorm = static_cast(param.isNorm); + tilingDataPtr.enableExpertMapping = static_cast(param.enableExpertMapping); + tilingDataPtr.groupEles = tilingDataPtr.groupNum == 0 ? tilingDataPtr.secondDimSize : + tilingDataPtr.secondDimSize / tilingDataPtr.groupNum; + tilingDataPtr.scale = param.scale; + if (tilingDataPtr.enableExpertMapping) { const Tensor &inTensor3 = launchParam.GetInTensor(MAPPING_TABLE_INPUT_INDEX); - tilingDataPtr->expertNum = inTensor3.desc.dims[DIM_INDEX0]; - tilingDataPtr->tableDim = inTensor3.desc.dims[DIM_INDEX1]; + tilingDataPtr.expertNum = inTensor3.desc.dims[DIM_INDEX0]; + tilingDataPtr.tableDim = inTensor3.desc.dims[DIM_INDEX1]; } return Status::OkStatus(); } diff --git a/src/kernels/mixkernels/mla_preprocess/tiling/mla_preprocess_tiling.cpp b/src/kernels/mixkernels/mla_preprocess/tiling/mla_preprocess_tiling.cpp index 9fe26640..eee7b7db 100644 --- a/src/kernels/mixkernels/mla_preprocess/tiling/mla_preprocess_tiling.cpp +++ b/src/kernels/mixkernels/mla_preprocess/tiling/mla_preprocess_tiling.cpp @@ -436,45 +436,45 @@ void MlaPreprocessTiling::EinSumQuantTiling(const OpParam::MlaPreprocess ¶m, tilingData.esqColTail = esqColTail; } -void MlaPreprocessTiling::SetTiling(AtbOps::MlaTilingData *tilingParam) +void MlaPreprocessTiling::SetTiling(AtbOps::MlaTilingData &tilingParam) { - tilingParam->n = tilingData.n; - tilingParam->perTaskNum = tilingData.perTaskNum; - tilingParam->resTaskNum = tilingData.resTaskNum; - tilingParam->numCore = tilingData.numCore; - - tilingParam->rmsNumCore1 = 
tilingData.rmsNumCore1; - tilingParam->rmsNumCol1 = tilingData.rmsNumCol1; - - tilingParam->rmsNumCore2 = tilingData.rmsNumCore2; - tilingParam->rmsNumCol2 = tilingData.rmsNumCol2; - - tilingParam->hiddenSizeQ = tilingData.hiddenSizeQ; - tilingParam->headNumQ = tilingData.headNumQ; - tilingParam->headDim = tilingData.headDim; - tilingParam->concatSize = tilingData.concatSize; - tilingParam->rotaryCoeff = tilingData.rotaryCoeff; - tilingParam->ntokens = tilingData.ntokens; - tilingParam->realCore = tilingData.realCore; - tilingParam->nlCoreRun = tilingData.nlCoreRun; - tilingParam->lCoreRun = tilingData.lCoreRun; - tilingParam->maxNPerLoopForUb = tilingData.maxNPerLoopForUb; - tilingParam->preCoreLoopTime = tilingData.preCoreLoopTime; - tilingParam->preCoreLoopNLast = tilingData.preCoreLoopNLast; - tilingParam->lastCoreLoopTime = tilingData.lastCoreLoopTime; - tilingParam->lastCoreLoopNLast = tilingData.lastCoreLoopNLast; - - tilingParam->esqFrontCore = tilingData.esqFrontCore; - tilingParam->esqTailCore = tilingData.esqTailCore; - tilingParam->esqFrontCoreBatch = tilingData.esqFrontCoreBatch; - tilingParam->esqTailCoreBatch = tilingData.esqTailCoreBatch; - tilingParam->esqHeadNum = tilingData.esqHeadNum; - tilingParam->esqColNum = tilingData.esqColNum; - tilingParam->esqUbHeadLoop = tilingData.esqUbHeadLoop; - tilingParam->esqHeadPerLoop = tilingData.esqHeadPerLoop; - tilingParam->esqHeadTail = tilingData.esqHeadTail; - tilingParam->esqColLoop = tilingData.esqColLoop; - tilingParam->esqColTail = tilingData.esqColTail; + tilingParam.n = tilingData.n; + tilingParam.perTaskNum = tilingData.perTaskNum; + tilingParam.resTaskNum = tilingData.resTaskNum; + tilingParam.numCore = tilingData.numCore; + + tilingParam.rmsNumCore1 = tilingData.rmsNumCore1; + tilingParam.rmsNumCol1 = tilingData.rmsNumCol1; + + tilingParam.rmsNumCore2 = tilingData.rmsNumCore2; + tilingParam.rmsNumCol2 = tilingData.rmsNumCol2; + + tilingParam.hiddenSizeQ = tilingData.hiddenSizeQ; + 
tilingParam.headNumQ = tilingData.headNumQ; + tilingParam.headDim = tilingData.headDim; + tilingParam.concatSize = tilingData.concatSize; + tilingParam.rotaryCoeff = tilingData.rotaryCoeff; + tilingParam.ntokens = tilingData.ntokens; + tilingParam.realCore = tilingData.realCore; + tilingParam.nlCoreRun = tilingData.nlCoreRun; + tilingParam.lCoreRun = tilingData.lCoreRun; + tilingParam.maxNPerLoopForUb = tilingData.maxNPerLoopForUb; + tilingParam.preCoreLoopTime = tilingData.preCoreLoopTime; + tilingParam.preCoreLoopNLast = tilingData.preCoreLoopNLast; + tilingParam.lastCoreLoopTime = tilingData.lastCoreLoopTime; + tilingParam.lastCoreLoopNLast = tilingData.lastCoreLoopNLast; + + tilingParam.esqFrontCore = tilingData.esqFrontCore; + tilingParam.esqTailCore = tilingData.esqTailCore; + tilingParam.esqFrontCoreBatch = tilingData.esqFrontCoreBatch; + tilingParam.esqTailCoreBatch = tilingData.esqTailCoreBatch; + tilingParam.esqHeadNum = tilingData.esqHeadNum; + tilingParam.esqColNum = tilingData.esqColNum; + tilingParam.esqUbHeadLoop = tilingData.esqUbHeadLoop; + tilingParam.esqHeadPerLoop = tilingData.esqHeadPerLoop; + tilingParam.esqHeadTail = tilingData.esqHeadTail; + tilingParam.esqColLoop = tilingData.esqColLoop; + tilingParam.esqColTail = tilingData.esqColTail; } void MlaPreprocessTiling::SetTilingKey(const Mki::LaunchParam &launchParam, Mki::KernelInfo &kernelInfo) diff --git a/src/kernels/mixkernels/multi_latent_attention/tiling/mla_tiling_dependency.cpp b/src/kernels/mixkernels/multi_latent_attention/tiling/mla_tiling_dependency.cpp index f90a72dd..c662de13 100644 --- a/src/kernels/mixkernels/multi_latent_attention/tiling/mla_tiling_dependency.cpp +++ b/src/kernels/mixkernels/multi_latent_attention/tiling/mla_tiling_dependency.cpp @@ -318,7 +318,7 @@ void GetNdMLAMtpTilingTP1(const MLAInfo &mmInfo, uint32_t &blockDim, uint32_t *t } void GetTilingHead(const MLAInfo &mmInfo, const OpParam::MLA ¶m, uint32_t *tilingParam, - const uint32_t *torPtr, uint32_t blockDim) 
+ const uint32_t &torPtr, uint32_t blockDim) { tilingParam[TILING_BATCH] = static_cast(mmInfo.batch); tilingParam[TILING_HEADSIZE] = static_cast(TILING_HEAD_SIZE); @@ -329,7 +329,7 @@ void GetTilingHead(const MLAInfo &mmInfo, const OpParam::MLA ¶m, uint32_t *t tilingParam[TILING_NUMBLOKS] = static_cast(mmInfo.numBlocks); tilingParam[TILING_BLOCKSIZE] = static_cast(mmInfo.blockSize); tilingParam[TILING_MAXBLOCKS] = static_cast(mmInfo.maxNumBlocksPerQuery); - tilingParam[TILING_TOR] = *torPtr; + tilingParam[TILING_TOR] = torPtr; tilingParam[TILING_KVHEADS] = (mmInfo.kvHeads == 0) ? mmInfo.numHeads : mmInfo.kvHeads; tilingParam[TILING_MASK_TYPE_ND] = static_cast(mmInfo.maskType); diff --git a/src/kernels/mixkernels/ring_mla/tiling/ring_mla_tiling_dependency.cpp b/src/kernels/mixkernels/ring_mla/tiling/ring_mla_tiling_dependency.cpp index 8327547e..de6273b7 100644 --- a/src/kernels/mixkernels/ring_mla/tiling/ring_mla_tiling_dependency.cpp +++ b/src/kernels/mixkernels/ring_mla/tiling/ring_mla_tiling_dependency.cpp @@ -241,7 +241,7 @@ void GetNdMLAMtpTilingTP1(const RINGMLAInfo &mmInfo, uint32_t &blockDim, uint32_ } void GetTilingHead(const RINGMLAInfo &mmInfo, const OpParam::RINGMLA ¶m, uint32_t *tilingParam, - const uint32_t *torPtr) + const uint32_t &torPtr) { tilingParam[TILING_BATCH] = static_cast(mmInfo.batch); tilingParam[TILING_HEADSIZE] = static_cast(TILING_HEAD_SIZE); @@ -252,7 +252,7 @@ void GetTilingHead(const RINGMLAInfo &mmInfo, const OpParam::RINGMLA ¶m, uin tilingParam[TILING_NUMBLOKS] = static_cast(mmInfo.numBlocks); tilingParam[TILING_BLOCKSIZE] = static_cast(mmInfo.blockSize); tilingParam[TILING_MAXBLOCKS] = static_cast(mmInfo.maxNumBlocksPerQuery); - tilingParam[TILING_TOR] = *torPtr; + tilingParam[TILING_TOR] = torPtr; tilingParam[TILING_KVHEADS] = (mmInfo.kvHeads == 0) ? 
mmInfo.numHeads : mmInfo.kvHeads; tilingParam[TILING_MASK_TYPE_ND] = static_cast(mmInfo.maskType); diff --git a/src/kernels/mixkernels/rope/tiling/rope_tiling.cpp b/src/kernels/mixkernels/rope/tiling/rope_tiling.cpp index 91d3b448..c932da29 100644 --- a/src/kernels/mixkernels/rope/tiling/rope_tiling.cpp +++ b/src/kernels/mixkernels/rope/tiling/rope_tiling.cpp @@ -85,7 +85,7 @@ void RopeNdProcess(const LaunchParam &launchParam, KernelInfo &kernelInfo, RopeT MKI_LOG(DEBUG) << "Multiple is " << multiple; MKI_LOG(DEBUG) << "RealCore is " << realCore; } -Status TilingKeyChose(const LaunchParam &launchParam, KernelInfo &kernelInfo, const RopeTilingData *tilingDataPtr) +Status TilingKeyChose(const LaunchParam &launchParam, KernelInfo &kernelInfo, const RopeTilingData &tilingDataPtr) { auto platformType = PlatformInfo::Instance().GetPlatformType(); auto cosSize = launchParam.GetInTensor(NUM_COSIN).desc.dims.size(); @@ -95,8 +95,8 @@ Status TilingKeyChose(const LaunchParam &launchParam, KernelInfo &kernelInfo, co MKI_LOG(ERROR) << "BF16 only supports 800I A2"; return Status::FailStatus(ERROR_INVALID_VALUE); } - uint32_t alignRotary = (tilingDataPtr->headDim / tilingDataPtr->rotaryCoeff) % ELE_NUM_FP16; - bool condition = (alignRotary == 0) && (tilingDataPtr->ntokens >= LARGE_NTOKENS_THRESHOLD); + uint32_t alignRotary = (tilingDataPtr.headDim / tilingDataPtr.rotaryCoeff) % ELE_NUM_FP16; + bool condition = (alignRotary == 0) && (tilingDataPtr.ntokens >= LARGE_NTOKENS_THRESHOLD); if (condition) { // ntokens >= 64时,走TILING_BF16_ALIGN kernelInfo.SetTilingId(TILING_BF16_ALIGN); // first 2 for shape dims of cos } else { @@ -105,8 +105,8 @@ Status TilingKeyChose(const LaunchParam &launchParam, KernelInfo &kernelInfo, co } else if (launchParam.GetInTensor(NUM_COSIN).desc.dtype == TENSOR_DTYPE_FLOAT) { kernelInfo.SetTilingId(TILING_HIGH_PREC); // second 1 for FP32 } else { - bool condition = tilingDataPtr->ntokens * tilingDataPtr->multiple >= LARGE_NTOKENS_THRESHOLD && - 
tilingDataPtr->cosFormat == 0; + bool condition = tilingDataPtr.ntokens * tilingDataPtr.multiple >= LARGE_NTOKENS_THRESHOLD && + tilingDataPtr.cosFormat == 0; if (condition) { // ntokens >= 64时,走TILING_HIGH_PERF_LARGE_NTOKENS kernelInfo.SetTilingId(TILING_HIGH_PERF_LARGE_NTOKENS); } else { @@ -119,8 +119,8 @@ Status TilingKeyChose(const LaunchParam &launchParam, KernelInfo &kernelInfo, co MKI_LOG(ERROR) << "BF16 only supports 800I A2"; return Status::FailStatus(ERROR_INVALID_VALUE); } - uint32_t alignRotary = (tilingDataPtr->headDim / tilingDataPtr->rotaryCoeff) % ELE_NUM_FP16; - bool condition = (alignRotary == 0) && (tilingDataPtr->ntokens >= LARGE_NTOKENS_THRESHOLD); + uint32_t alignRotary = (tilingDataPtr.headDim / tilingDataPtr.rotaryCoeff) % ELE_NUM_FP16; + bool condition = (alignRotary == 0) && (tilingDataPtr.ntokens >= LARGE_NTOKENS_THRESHOLD); if (condition) { // ntokens >= 64时,走TILING_BF16_ALIGN_BROARD kernelInfo.SetTilingId(TILING_BF16_ALIGN_BROARD); // first 2 for shape dims of cos } else { diff --git a/src/kernels/mixkernels/rope_q_concat/tiling/rope_q_concat_tiling.cpp b/src/kernels/mixkernels/rope_q_concat/tiling/rope_q_concat_tiling.cpp index f0fc38e5..01b09360 100644 --- a/src/kernels/mixkernels/rope_q_concat/tiling/rope_q_concat_tiling.cpp +++ b/src/kernels/mixkernels/rope_q_concat/tiling/rope_q_concat_tiling.cpp @@ -39,7 +39,7 @@ Status TilingKeyChose(const LaunchParam &launchParam, KernelInfo &kernelInfo) return Status::OkStatus(); } -Status RopeNdProcess(const LaunchParam &launchParam, RopeQConcatTilingData *tilingDataPtr) +Status RopeNdProcess(const LaunchParam &launchParam, RopeQConcatTilingData &tilingDataPtr) { auto &inTensor0 = launchParam.GetInTensor(0).desc; auto &inTensor1 = launchParam.GetInTensor(DIM_1).desc; @@ -54,7 +54,7 @@ Status RopeNdProcess(const LaunchParam &launchParam, RopeQConcatTilingData *tili uint32_t concatSize = inTensor3.dims[DIM_2]; // 当前场景只支持rotaryCoeff = 2的情况 - tilingDataPtr->rotaryCoeff = 2; + 
tilingDataPtr.rotaryCoeff = 2; uint32_t maxCore = static_cast(PlatformInfo::Instance().GetCoreNum(CoreType::CORE_TYPE_VECTOR)); auto maxUbSize = static_cast(PlatformInfo::Instance().GetUbSize()) - REMAIN_TILING_SIZE; @@ -76,19 +76,19 @@ Status RopeNdProcess(const LaunchParam &launchParam, RopeQConcatTilingData *tili uint32_t preCoreLoopNLast = nlCoreRun - (preCoreLoopTime - 1) * maxNPerLoopForUb; // 前核最后一批处理数据行数 uint32_t lastCoreLoopTime = (lCoreRun + maxNPerLoopForUb - 1) / maxNPerLoopForUb; // 尾核循环次数 uint32_t lastCoreLoopNLast = lCoreRun - (lastCoreLoopTime - 1) * maxNPerLoopForUb; // 尾核最后一批处理数据行数 - tilingDataPtr->hiddenSizeQ = hiddenSizeQ; - tilingDataPtr->headNumQ = headNumQ; - tilingDataPtr->headDim = headDim; - tilingDataPtr->concatSize = concatSize; - tilingDataPtr->ntokens = ntokens; - tilingDataPtr->realCore = realCore; - tilingDataPtr->nlCoreRun = nlCoreRun; - tilingDataPtr->lCoreRun = lCoreRun; - tilingDataPtr->maxNPerLoopForUb = maxNPerLoopForUb; - tilingDataPtr->preCoreLoopTime = preCoreLoopTime; - tilingDataPtr->preCoreLoopNLast = preCoreLoopNLast; - tilingDataPtr->lastCoreLoopTime = lastCoreLoopTime; - tilingDataPtr->lastCoreLoopNLast = lastCoreLoopNLast; + tilingDataPtr.hiddenSizeQ = hiddenSizeQ; + tilingDataPtr.headNumQ = headNumQ; + tilingDataPtr.headDim = headDim; + tilingDataPtr.concatSize = concatSize; + tilingDataPtr.ntokens = ntokens; + tilingDataPtr.realCore = realCore; + tilingDataPtr.nlCoreRun = nlCoreRun; + tilingDataPtr.lCoreRun = lCoreRun; + tilingDataPtr.maxNPerLoopForUb = maxNPerLoopForUb; + tilingDataPtr.preCoreLoopTime = preCoreLoopTime; + tilingDataPtr.preCoreLoopNLast = preCoreLoopNLast; + tilingDataPtr.lastCoreLoopTime = lastCoreLoopTime; + tilingDataPtr.lastCoreLoopNLast = lastCoreLoopNLast; return Status::OkStatus(); } diff --git a/src/kernels/mixkernels/swi_glu_quant/tiling/swi_glu_quant_tiling.cpp b/src/kernels/mixkernels/swi_glu_quant/tiling/swi_glu_quant_tiling.cpp index 2d5b5389..46579d28 100644 --- 
a/src/kernels/mixkernels/swi_glu_quant/tiling/swi_glu_quant_tiling.cpp +++ b/src/kernels/mixkernels/swi_glu_quant/tiling/swi_glu_quant_tiling.cpp @@ -32,37 +32,37 @@ static constexpr int TILING_KEY_FP32_QUANT_MODE = 306; // Tiling key for FP32 qu namespace AtbOps { using namespace Mki; -void SetTilingData(SwiGluQuantTilingData *tilingData) +void SetTilingData(SwiGluQuantTilingData &tilingData) { - tilingData->basicRowLenHeadCore = tilingData->optBaseRowLenHeadCore; - tilingData->basicRowLenTailCore = tilingData->optBaseRowLenTailCore; - tilingData->basicColLen = tilingData->optBaseColLen; - tilingData->realCoreNum = tilingData->coreNumUsed; + tilingData.basicRowLenHeadCore = tilingData.optBaseRowLenHeadCore; + tilingData.basicRowLenTailCore = tilingData.optBaseRowLenTailCore; + tilingData.basicColLen = tilingData.optBaseColLen; + tilingData.realCoreNum = tilingData.coreNumUsed; } -bool CalTilingData(SwiGluQuantTilingData *tilingData) +bool CalTilingData(SwiGluQuantTilingData& tilingData) { - uint32_t rowLen = tilingData->rowLen; - tilingData->coreNumUsed = Max(Min(tilingData->totalCore, rowLen), ONE); - tilingData->headCoreNum = rowLen % tilingData->coreNumUsed; - tilingData->rowLenPerHeadCore = (rowLen + tilingData->coreNumUsed - 1) / tilingData->coreNumUsed; - tilingData->rowLenPerTailCore = rowLen / tilingData->coreNumUsed; + uint32_t rowLen = tilingData.rowLen; + tilingData.coreNumUsed = Max(Min(tilingData.totalCore, rowLen), ONE); + tilingData.headCoreNum = rowLen % tilingData.coreNumUsed; + tilingData.rowLenPerHeadCore = (rowLen + tilingData.coreNumUsed - 1) / tilingData.coreNumUsed; + tilingData.rowLenPerTailCore = rowLen / tilingData.coreNumUsed; return CalculateMaxUbSizePerRow(tilingData); } -void PrintSwiQuantTiling(SwiGluQuantTilingData *tilingData) +void PrintSwiQuantTiling(const SwiGluQuantTilingData &tilingData) { MKI_LOG(INFO) << "SwiGlu Tiling Data:" << "\n" - << " groupLen " << tilingData->groupLen << "\n" - << " rowLen " << tilingData->rowLen << 
"\n" - << " colLen " << tilingData->colLen << "\n" - << " rowLenPerHeadCore " << tilingData->rowLenPerHeadCore << "\n" - << " rowLenPerTailCore " << tilingData->rowLenPerTailCore << "\n" - << " basicRowLenHeadCore " << tilingData->basicRowLenHeadCore << "\n" - << " basicRowLenTailCore " << tilingData->basicRowLenTailCore << "\n" - << " basicColLen " << tilingData->basicColLen << "\n" - << " headCoreNum " << tilingData->headCoreNum << "\n" - << " realCoreNum " << tilingData->realCoreNum << "\n" - << " totalCore " << tilingData->totalCore; + << " groupLen " << tilingData.groupLen << "\n" + << " rowLen " << tilingData.rowLen << "\n" + << " colLen " << tilingData.colLen << "\n" + << " rowLenPerHeadCore " << tilingData.rowLenPerHeadCore << "\n" + << " rowLenPerTailCore " << tilingData.rowLenPerTailCore << "\n" + << " basicRowLenHeadCore " << tilingData.basicRowLenHeadCore << "\n" + << " basicRowLenTailCore " << tilingData.basicRowLenTailCore << "\n" + << " basicColLen " << tilingData.basicColLen << "\n" + << " headCoreNum " << tilingData.headCoreNum << "\n" + << " realCoreNum " << tilingData.realCoreNum << "\n" + << " totalCore " << tilingData.totalCore; } void SwigluQuantTilingKeyChose(const LaunchParam &launchParam, KernelInfo &kernelInfo) diff --git a/src/kernels/mixkernels/swi_glu_quant/tiling/swi_glu_quant_tiling_utils.h b/src/kernels/mixkernels/swi_glu_quant/tiling/swi_glu_quant_tiling_utils.h index 0e4756af..c1e1b60f 100644 --- a/src/kernels/mixkernels/swi_glu_quant/tiling/swi_glu_quant_tiling_utils.h +++ b/src/kernels/mixkernels/swi_glu_quant/tiling/swi_glu_quant_tiling_utils.h @@ -54,7 +54,7 @@ template T Min(T num, T div) { return num < div ? num : div; } template T Max(T num, T div) { return num < div ? 
div : num; } -inline bool SetTotalShape(const Mki::SVector &inShape, SwiGluQuantTilingData *tilingData) +inline bool SetTotalShape(const Mki::SVector &inShape, SwiGluQuantTilingData& tilingData) { int64_t shapeBefore = 1; int64_t shapeAfter = 1; @@ -69,27 +69,27 @@ inline bool SetTotalShape(const Mki::SVector &inShape, SwiGluQuantTilin } MKI_CHECK(shapeAfter % EVEN_FACTOR == 0, "shapeAfter % 2 != 0", return false); MKI_CHECK(shapeAfter != 0, "shapeAfter == 0", return false); - tilingData->rowLen = static_cast(shapeBefore); - tilingData->colLen = static_cast(shapeAfter / EVEN_FACTOR); + tilingData.rowLen = static_cast(shapeBefore); + tilingData.colLen = static_cast(shapeAfter / EVEN_FACTOR); return true; } -inline bool CalculateMaxUbSizePerRow(SwiGluQuantTilingData *tilingData) +inline bool CalculateMaxUbSizePerRow(SwiGluQuantTilingData& tilingData) { - uint32_t colLen = tilingData->colLen; - uint32_t alignedColLen = AlignUp(colLen, tilingData->blockNum); + uint32_t colLen = tilingData.colLen; + uint32_t alignedColLen = AlignUp(colLen, tilingData.blockNum); MKI_CHECK(alignedColLen != 0, "CalculateMaxUbSizePerRow Unsupported alignedColLen == 0", return false); MKI_LOG(INFO) << "alignedColLen:" << alignedColLen << "\n"; - uint32_t ubAvail = tilingData->dataNumSingleUb / alignedColLen; - MKI_LOG(INFO) << "tilingData->dataNumSingleUb:" << tilingData->dataNumSingleUb << "\n"; + uint32_t ubAvail = tilingData.dataNumSingleUb / alignedColLen; + MKI_LOG(INFO) << "tilingData.dataNumSingleUb:" << tilingData.dataNumSingleUb << "\n"; MKI_LOG(INFO) << "ubAvail:" << ubAvail << "\n"; MKI_CHECK(ubAvail != 0, "The input vector is too large. 
It is not supported currently.", return false); - tilingData->optBaseColLen = colLen; + tilingData.optBaseColLen = colLen; ubAvail = Max(ubAvail, ONE); - tilingData->optBaseRowLenHeadCore = Min(Min(ubAvail, tilingData->rowLenPerHeadCore), COMPARE_INT); - tilingData->optBaseRowLenTailCore = Min(Min(ubAvail, tilingData->rowLenPerTailCore), COMPARE_INT); + tilingData.optBaseRowLenHeadCore = Min(Min(ubAvail, tilingData.rowLenPerHeadCore), COMPARE_INT); + tilingData.optBaseRowLenTailCore = Min(Min(ubAvail, tilingData.rowLenPerTailCore), COMPARE_INT); return true; } diff --git a/src/kernels/tbe_adapter/platform/platform_ascendc.cpp b/src/kernels/tbe_adapter/platform/platform_ascendc.cpp index e1d924f2..15c771eb 100644 --- a/src/kernels/tbe_adapter/platform/platform_ascendc.cpp +++ b/src/kernels/tbe_adapter/platform/platform_ascendc.cpp @@ -41,11 +41,11 @@ const static std::map CONVERT_MAP = { {"Ascend910_93", SocVersion::ASCEND910B}, }; -static inline uint32_t GetCoreNumByType(fe::PlatFormInfos *platformInfo, bool isAiv) +static inline uint32_t GetCoreNumByType(const fe::PlatFormInfos &platformInfo, bool isAiv) { std::string key; std::string val; - bool ret = platformInfo->GetPlatformResWithLock(STR_SOC_INFO, STR_SPLIT_KEY, val); + bool ret = platformInfo.GetPlatformResWithLock(STR_SOC_INFO, STR_SPLIT_KEY, val); MKI_LOG_IF(!ret, ERROR) << "get platform failed, val is " << val; if (STR_SPLIT_VAL.compare(val) != 0) { @@ -55,7 +55,7 @@ static inline uint32_t GetCoreNumByType(fe::PlatFormInfos *platformInfo, bool is } else { key = STR_CORE_CNT_CUB; } - ret = platformInfo->GetPlatformResWithLock(STR_SOC_INFO, key, val); + ret = platformInfo.GetPlatformResWithLock(STR_SOC_INFO, key, val); MKI_LOG_IF(!ret, ERROR) << "get platform failed, key is " << key << ", val is" << val; return val.empty() ? 
0 : static_cast(std::atoi(val.c_str())); } -- Gitee From 4cd4b4103414138ee87659ec071699777bf2faaa Mon Sep 17 00:00:00 2001 From: huangxiaolan Date: Fri, 19 Sep 2025 16:16:45 +0800 Subject: [PATCH 21/94] fix function point --- src/atb/operation/if_operation.cpp | 6 +++--- src/atb/utils/dl_manager.cpp | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/atb/operation/if_operation.cpp b/src/atb/operation/if_operation.cpp index 3189eb59..aa929ade 100644 --- a/src/atb/operation/if_operation.cpp +++ b/src/atb/operation/if_operation.cpp @@ -16,7 +16,7 @@ namespace atb { -Status IfOperation::GetOperationFromCondition(Operation **op) const +Status IfOperation::GetOperationFromCondition(Operation *&op) const { bool cond = true; try { @@ -28,10 +28,10 @@ Status IfOperation::GetOperationFromCondition(Operation **op) const if (cond && param_.opA) { ATB_LOG(INFO) << GetLogPrefix() << "Condition met (true), selecting opA..."; - *op = param_.opA; + op = param_.opA; } else if (!cond && param_.opB) { ATB_LOG(INFO) << GetLogPrefix() << "Condition not met (false), selecting opB..."; - *op = param_.opB; + op = param_.opB; } else { ATB_LOG(ERROR) << GetLogPrefix() << "Please check the intended operation is valid, opA: " << param_.opA << " opB: " << param_.opB; diff --git a/src/atb/utils/dl_manager.cpp b/src/atb/utils/dl_manager.cpp index ad84d945..2872d781 100644 --- a/src/atb/utils/dl_manager.cpp +++ b/src/atb/utils/dl_manager.cpp @@ -29,15 +29,15 @@ DlManager::~DlManager() } } -Status DlManager::getSymbol(const std::string &symbol, void **symbolPtr) const +Status DlManager::getSymbol(const std::string &symbol, void *&symbolPtr) const { if (handle_ == nullptr) { ATB_LOG(ERROR) << "Dynamic library handle is null, please check the path: " << path_; return ERROR_CANN_ERROR; } - *symbolPtr = dlsym(handle_, symbol.c_str()); + symbolPtr = dlsym(handle_, symbol.c_str()); char *errorInfo = dlerror(); - if (*symbolPtr == nullptr || errorInfo != nullptr) { + if (symbolPtr 
== nullptr || errorInfo != nullptr) { ATB_LOG(ERROR) << "Failed to find symbol " << symbol << " from path: " << path_ << ", error: " << errorInfo; return ERROR_CANN_ERROR; } -- Gitee From c78854b3095441ab7e219fefdfc8ddf71d9ef922 Mon Sep 17 00:00:00 2001 From: huangxiaolan Date: Fri, 19 Sep 2025 16:20:19 +0800 Subject: [PATCH 22/94] fix function declaration --- src/atb/operation/if_operation.h | 2 +- src/atb/utils/dl_manager.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/atb/operation/if_operation.h b/src/atb/operation/if_operation.h index d1a4d414..eab4a45f 100644 --- a/src/atb/operation/if_operation.h +++ b/src/atb/operation/if_operation.h @@ -34,7 +34,7 @@ protected: std::shared_ptr CreateRunner(Context &context) const override; private: - Status GetOperationFromCondition(Operation **op) const; + Status GetOperationFromCondition(Operation *&op) const; private: common::IfCondParam param_; diff --git a/src/atb/utils/dl_manager.h b/src/atb/utils/dl_manager.h index 626d630e..c1d00253 100644 --- a/src/atb/utils/dl_manager.h +++ b/src/atb/utils/dl_manager.h @@ -19,7 +19,7 @@ class DlManager { public: DlManager(std::string path); ~DlManager(); - Status getSymbol(const std::string &symbol, void **symbolPtr) const; + Status getSymbol(const std::string &symbol, void *&symbolPtr) const; private: std::string path_; -- Gitee From a23cc295df97c4d3b8046840b5aabea14126efaf Mon Sep 17 00:00:00 2001 From: wanyukang Date: Fri, 19 Sep 2025 16:22:12 +0800 Subject: [PATCH 23/94] maxbatch --- .../mixkernels/toppsample/op_kernel/toppsample.cpp | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/src/kernels/mixkernels/toppsample/op_kernel/toppsample.cpp b/src/kernels/mixkernels/toppsample/op_kernel/toppsample.cpp index 77357126..fbc9eda7 100644 --- a/src/kernels/mixkernels/toppsample/op_kernel/toppsample.cpp +++ b/src/kernels/mixkernels/toppsample/op_kernel/toppsample.cpp @@ -20,7 +20,6 @@ static constexpr uint32_t DEFAULT_STRIDE = 8; 
static constexpr uint32_t FP32_PER_REPEAT = 64; static constexpr uint32_t FP16_PER_REPEAT = 128; static constexpr uint32_t FP16_PER_BLOCK = 16; -static constexpr uint32_t MAX_BATCH = 1024; static constexpr uint32_t NUM_4 = 4; using AscendC::HardEvent; @@ -45,6 +44,7 @@ public: nlCoreRun_ = (firstDim_ + realCore_ - 1) / realCore_; lCoreRun_ = firstDim_ - (realCore_ - 1) * nlCoreRun_; dynamicRound_ = (blockIdx_ == realCore_ - 1) ? lCoreRun_ : nlCoreRun_; + maxBatch_ = (firstDim_ + FP16_PER_BLOCK - 1) / FP16_PER_BLOCK * FP16_PER_BLOCK; xGm_.SetGlobalBuffer((__gm__ T *)cumsumed_probs); yGm_.SetGlobalBuffer((__gm__ T *)topp); // batch,num_samples @@ -54,8 +54,8 @@ public: pipe_.InitBuffer(inputBuf_, tempUbEleAligened_ * DATA_BYTE); pipe_.InitBuffer(tempBuf_, tempUbEleAligened_ * DATA_BYTE * DATA_BYTE); pipe_.InitBuffer(fp32Buf_, tempUbEleAligened_ * DATA_BYTE * DATA_BYTE); - pipe_.InitBuffer(yBuf_, MAX_BATCH * DATA_BYTE); // topp - pipe_.InitBuffer(yF32Buf_, MAX_BATCH * DATA_BYTE * DATA_BYTE); // toppfp32 + pipe_.InitBuffer(yBuf_, maxBatch_ * DATA_BYTE); // topp + pipe_.InitBuffer(yF32Buf_, maxBatch_ * DATA_BYTE * DATA_BYTE); // toppfp32 pipe_.InitBuffer(int8Buf_, tempUbEleAligened_ / DEFAULT_STRIDE); // compare pipe_.InitBuffer(blockBuf_, BLK_SIZE); // 存下标 pipe_.InitBuffer(int32Buf_, MAX_CORE_NUM * DATA_BYTE * DATA_BYTE); // 每个核做几个batch @@ -65,7 +65,7 @@ public: __aicore__ inline void PickUpRand() { AscendC::LocalTensor buf = yBuf_.Get(); - DataCopy(buf, yGm_, MAX_BATCH); + DataCopy(buf, yGm_, maxBatch_); } __aicore__ inline void FirstPick(uint32_t cid, uint32_t offset) @@ -127,7 +127,7 @@ public: Duplicate(uint32Buf_, uint32_t(0), tempUbEleAligened_ / BLK_SIZE); // 截断数可能是batch个,也可能是1个 // 每个batch往后取一个随机数。(*(tilingUb_ + batchOffset)) - Cast(toppBufF32_, toppBuf_, AscendC::RoundMode::CAST_NONE, MAX_BATCH); + Cast(toppBufF32_, toppBuf_, AscendC::RoundMode::CAST_NONE, maxBatch_); for (int cid = 0; cid < dynamicRound_; cid++) { // 每个核做多少次 absIdx_ = 0; uint32_t batchOffset = 
(blockIdx_ * nlCoreRun_ + cid) % MAX_CORE_NUM; @@ -336,6 +336,7 @@ private: uint32_t expandLastDim_{0}; uint32_t numSamplesMax_{0}; uint32_t firstDim_{0}; + uint32_t maxBatch_{0}; float maxNum_{0}; float tempValue_{0}; uint32_t perCoreRunNum_{0}; -- Gitee From 8dae4d53fecae188107ad058c371705c0765c238 Mon Sep 17 00:00:00 2001 From: huangxiaolan Date: Fri, 19 Sep 2025 16:55:45 +0800 Subject: [PATCH 24/94] fix function declaration --- .../faster_gelu_forward/tiling/faster_gelu_tiling.cpp | 2 +- .../kernels/activation/gelu_forward/tiling/gelu_tiling.cpp | 2 +- .../dynamic_quant_tiling/dynamic_quant_tiling.cpp | 6 +++--- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/kernels/kernels/activation/faster_gelu_forward/tiling/faster_gelu_tiling.cpp b/src/kernels/kernels/activation/faster_gelu_forward/tiling/faster_gelu_tiling.cpp index fec67067..a64b1f92 100644 --- a/src/kernels/kernels/activation/faster_gelu_forward/tiling/faster_gelu_tiling.cpp +++ b/src/kernels/kernels/activation/faster_gelu_forward/tiling/faster_gelu_tiling.cpp @@ -95,7 +95,7 @@ Status FasterGeluForwardTiling(const LaunchParam &launchParam, KernelInfo &kerne reinterpret_cast(kernelInfo.GetTilingHostAddr()); MKI_CHECK(tilingDataPtr != nullptr, "tilingDataPtr should not be empty", return Status::FailStatus(ERROR_INVALID_VALUE, "tilingDataPtr should not be empty")); - CalcVectorTiling512Align(launchParam, tilingDataPtr, blockDim); + CalcVectorTiling512Align(launchParam, *tilingDataPtr, blockDim); for (uint32_t i = 0; i < tilingDataPtr->usedCoreNum; i++) { MKI_LOG(INFO) << "Core-" << i << " singleCoreDataLen num is " << tilingDataPtr->singleCoreDataLen[i]; diff --git a/src/kernels/kernels/activation/gelu_forward/tiling/gelu_tiling.cpp b/src/kernels/kernels/activation/gelu_forward/tiling/gelu_tiling.cpp index f6b3c3b4..a51cd6be 100644 --- a/src/kernels/kernels/activation/gelu_forward/tiling/gelu_tiling.cpp +++ b/src/kernels/kernels/activation/gelu_forward/tiling/gelu_tiling.cpp @@ -85,7 
+85,7 @@ Status GeluForwardTiling(const LaunchParam &launchParam, KernelInfo &kernelInfo) reinterpret_cast(kernelInfo.GetTilingHostAddr()); MKI_CHECK(tilingDataPtr != nullptr, "tilingDataPtr should not be empty", return Status::FailStatus(ERROR_INVALID_VALUE, "tilingDataPtr should not be empty")); - MKI_CHECK(FillTilingParam(launchParam, tilingDataPtr, blockDim), "FillTilingParam Failed.", + MKI_CHECK(FillTilingParam(launchParam, *tilingDataPtr, blockDim), "FillTilingParam Failed.", return Status::FailStatus(ERROR_INVALID_VALUE, "FillTilingParam Failed.")); kernelInfo.SetBlockDim(blockDim); kernelInfo.SetTilingId(dataType); // 不同的数据类型用不同的分核策略,所以暂时用数据类型的枚举来表示分核ID diff --git a/src/kernels/kernels/elewise/dynamic_quant/dynamic_quant_tiling/dynamic_quant_tiling.cpp b/src/kernels/kernels/elewise/dynamic_quant/dynamic_quant_tiling/dynamic_quant_tiling.cpp index ff72a00e..8e8c2a1a 100644 --- a/src/kernels/kernels/elewise/dynamic_quant/dynamic_quant_tiling/dynamic_quant_tiling.cpp +++ b/src/kernels/kernels/elewise/dynamic_quant/dynamic_quant_tiling/dynamic_quant_tiling.cpp @@ -47,7 +47,7 @@ Status ParseShape(const LaunchParam &launchParam, DynamicQuantTilingData &tiling size_t dims = shape.size(); for (size_t i = 0; i < dims; ++i) { if (i < dims - 1) { - MKI_CHECK(shape[i] > 0 && *rowNumTotal < static_cast(UINT32_MAX / shape[i]), + MKI_CHECK(shape[i] > 0 && rowNumTotal < static_cast(UINT32_MAX / shape[i]), "rowNumTotal or shape is invalid!", return Status::FailStatus(ERROR_INVALID_VALUE, "rowNumTotal or shape is invalid!")); rowNumTotal *= shape[i]; @@ -194,10 +194,10 @@ Status DynamicQuantTiling(const LaunchParam &launchParam, KernelInfo &kernelInfo tilingDataPtr->asymmetric = *reinterpret_cast(&attrs.asymmetric); uint64_t rowNumTotal = 1; - Status res = ParseShape(launchParam, tilingDataPtr, &rowNumTotal); + Status res = ParseShape(launchParam, *tilingDataPtr, &rowNumTotal); OP_TILING_CHECK_STATUS_RETURN(res); - Status ret = SetTilingData(tilingDataPtr, rowNumTotal); + 
Status ret = SetTilingData(*tilingDataPtr, rowNumTotal); OP_TILING_CHECK_STATUS_RETURN(ret); MKI_LOG(INFO) << "numCore = " << tilingDataPtr->numCore -- Gitee From da0e862560b6ebf17c504d4172ec10c1c9699e62 Mon Sep 17 00:00:00 2001 From: huangxiaolan Date: Fri, 19 Sep 2025 17:45:04 +0800 Subject: [PATCH 25/94] fix function point --- .../dynamic_quant_tiling.cpp | 2 +- .../tiling/cohere_layer_norm_tiling.cpp | 4 ++-- .../norm/rmsnorm/tiling/rms_norm_tiling.cpp | 4 ++-- .../tiling/fused_add_topk_div_tiling.cpp | 10 ++++---- .../tiling/ring_mla_tiling_dependency.cpp | 2 +- .../mixkernels/rope/tiling/rope_tiling.cpp | 24 +++++++++---------- .../tiling/rope_q_concat_tiling.cpp | 2 +- 7 files changed, 24 insertions(+), 24 deletions(-) diff --git a/src/kernels/kernels/elewise/dynamic_quant/dynamic_quant_tiling/dynamic_quant_tiling.cpp b/src/kernels/kernels/elewise/dynamic_quant/dynamic_quant_tiling/dynamic_quant_tiling.cpp index 8e8c2a1a..fab2d4bd 100644 --- a/src/kernels/kernels/elewise/dynamic_quant/dynamic_quant_tiling/dynamic_quant_tiling.cpp +++ b/src/kernels/kernels/elewise/dynamic_quant/dynamic_quant_tiling/dynamic_quant_tiling.cpp @@ -194,7 +194,7 @@ Status DynamicQuantTiling(const LaunchParam &launchParam, KernelInfo &kernelInfo tilingDataPtr->asymmetric = *reinterpret_cast(&attrs.asymmetric); uint64_t rowNumTotal = 1; - Status res = ParseShape(launchParam, *tilingDataPtr, &rowNumTotal); + Status res = ParseShape(launchParam, *tilingDataPtr, rowNumTotal); OP_TILING_CHECK_STATUS_RETURN(res); Status ret = SetTilingData(*tilingDataPtr, rowNumTotal); diff --git a/src/kernels/kernels/norm/coherelayernorm/tiling/cohere_layer_norm_tiling.cpp b/src/kernels/kernels/norm/coherelayernorm/tiling/cohere_layer_norm_tiling.cpp index a0465d01..8095aa44 100644 --- a/src/kernels/kernels/norm/coherelayernorm/tiling/cohere_layer_norm_tiling.cpp +++ b/src/kernels/kernels/norm/coherelayernorm/tiling/cohere_layer_norm_tiling.cpp @@ -145,10 +145,10 @@ Status CohereLayerNormTiling(const 
LaunchParam &launchParam, KernelInfo &kernelI uint64_t tilingKey = LAYER_NORM_TILING_KEY_BASE; if (fixedUsedBufferSize < layerNormPtrCon.maxUbSize) { // multiple rows moved simultaneously - MultipleRowMovedTiling(layerNormPtrCon, tilingDataPtr, + MultipleRowMovedTiling(layerNormPtrCon, *tilingDataPtr, singleRowMovedBufferSize, multipleRowMovedBufferSize, MISC_BUFFERS_SIZE); } else { // single row moved - SingleRowMovedTiling(layerNormPtrCon, tilingDataPtr, + SingleRowMovedTiling(layerNormPtrCon, *tilingDataPtr, singleRowMovedElemSize, multipleRowMovedElemSize, MISC_BUFFERS_SIZE); tilingKey += LAYER_NORM_TILING_KEY_FAST; } diff --git a/src/kernels/kernels/norm/rmsnorm/tiling/rms_norm_tiling.cpp b/src/kernels/kernels/norm/rmsnorm/tiling/rms_norm_tiling.cpp index 840efb97..b8b70570 100644 --- a/src/kernels/kernels/norm/rmsnorm/tiling/rms_norm_tiling.cpp +++ b/src/kernels/kernels/norm/rmsnorm/tiling/rms_norm_tiling.cpp @@ -118,12 +118,12 @@ Status RmsNormTiling(const LaunchParam &launchParam, KernelInfo &kernelInfo) tilingDataPtr->quantMin = -127; // set int8 min to -127 } kernelInfo.SetBlockDim(tilingDataPtr->numCore); - SetNonContiguousTenor(tilingDataPtr, launchParam); + SetNonContiguousTenor(*tilingDataPtr, launchParam); uint64_t tilingKey = ComputeTilingKey(tilingDataPtr->gemmaMode, tilingDataPtr->precisionMode, isShortTail, launchParam); MKI_LOG(INFO) << "post rmsnorm tilingKey is : " << tilingKey; kernelInfo.SetTilingId(tilingKey); - PrintRmsNormTiling(tilingDataPtr); + PrintRmsNormTiling(*tilingDataPtr); return Status::OkStatus(); } diff --git a/src/kernels/mixkernels/fused_add_topk_div/tiling/fused_add_topk_div_tiling.cpp b/src/kernels/mixkernels/fused_add_topk_div/tiling/fused_add_topk_div_tiling.cpp index 0b1c929b..e3a57c7a 100644 --- a/src/kernels/mixkernels/fused_add_topk_div/tiling/fused_add_topk_div_tiling.cpp +++ b/src/kernels/mixkernels/fused_add_topk_div/tiling/fused_add_topk_div_tiling.cpp @@ -71,10 +71,10 @@ namespace AtbOps { void 
GetTiling(TilingData *tilingDataPtr); private: - void GetTilingKey(TilingData *tilingDataPtr); + void GetTilingKey(TilingData &tilingDataPtr); void GetUsedCore(); void SplitUb(); - void FillTilingData(TilingData *tilingDataPtr); + void FillTilingData(TilingData &tilingDataPtr); template inline T1 CeilAlign(T1 a, T2 b) const { @@ -174,7 +174,7 @@ namespace AtbOps { template void GetFusedAddTopkDivTiling(TilingData *tilingDataPtr, uint32_t coreNum, uint32_t ubSize) { - class FusedAddTopkDivTiling tilingObj(tilingDataPtr, coreNum, ubSize); + class FusedAddTopkDivTiling tilingObj(*tilingDataPtr, coreNum, ubSize); tilingObj.GetTiling(tilingDataPtr); } @@ -291,7 +291,7 @@ namespace AtbOps { auto inputDatatype = launchParam.GetInTensor(X_INPUT_INDEX).desc.dtype; tilingDataPtr->dtype = g_dtypeMap[inputDatatype]; - auto checkInputInfo = GetInputInfo(launchParam, tilingDataPtr); + auto checkInputInfo = GetInputInfo(launchParam, *tilingDataPtr); if (!checkInputInfo.Ok()) { return Status::FailStatus(ERROR_INVALID_VALUE); } @@ -303,7 +303,7 @@ namespace AtbOps { kernelInfo.SetTilingId(tilingKey); uint32_t syncWorkspaceSize = sysWorkspaceSize + blockNum * tilingDataPtr->workspacePerCore; kernelInfo.GetScratchSizes() = {syncWorkspaceSize}; - PrintTilingData(tilingDataPtr); + PrintTilingData(*tilingDataPtr); return Status::OkStatus(); } } // namespace AtbOps \ No newline at end of file diff --git a/src/kernels/mixkernels/ring_mla/tiling/ring_mla_tiling_dependency.cpp b/src/kernels/mixkernels/ring_mla/tiling/ring_mla_tiling_dependency.cpp index de6273b7..ea58a71a 100644 --- a/src/kernels/mixkernels/ring_mla/tiling/ring_mla_tiling_dependency.cpp +++ b/src/kernels/mixkernels/ring_mla/tiling/ring_mla_tiling_dependency.cpp @@ -288,7 +288,7 @@ Status GetRINGMLATilingParam(const LaunchParam &launchParam, const RINGMLAInfo & GetNdMLATiling(mmInfo, blockDim, tilingParam, param); blockDim = mmInfo.batch == BATCH_MLA ? 
BLOCK_DIM_MLA : blockDim; } - GetTilingHead(mmInfo, param, tilingParam, torPtr); + GetTilingHead(mmInfo, param, tilingParam, *torPtr); return AtbOps::Status::OkStatus(); } diff --git a/src/kernels/mixkernels/rope/tiling/rope_tiling.cpp b/src/kernels/mixkernels/rope/tiling/rope_tiling.cpp index c932da29..147efa1e 100644 --- a/src/kernels/mixkernels/rope/tiling/rope_tiling.cpp +++ b/src/kernels/mixkernels/rope/tiling/rope_tiling.cpp @@ -34,7 +34,7 @@ static constexpr uint32_t TILING_HIGH_PREC = 21; static constexpr uint32_t TILING_HIGH_PERF = 20; static constexpr uint32_t TILING_HIGH_PERF_LARGE_NTOKENS = 23; -void RopeNdProcess(const LaunchParam &launchParam, KernelInfo &kernelInfo, RopeTilingData *tilingDataPtr) +void RopeNdProcess(const LaunchParam &launchParam, KernelInfo &kernelInfo, RopeTilingData &tilingDataPtr) { uint32_t hiddenSizeQ = static_cast(launchParam.GetInTensor(0).desc.dims[1]); uint32_t hiddenSizeK = static_cast(launchParam.GetInTensor(1).desc.dims[1]); @@ -44,12 +44,12 @@ void RopeNdProcess(const LaunchParam &launchParam, KernelInfo &kernelInfo, RopeT uint32_t batch = static_cast(launchParam.GetInTensor(4).desc.dims[0]); uint32_t maxCore = static_cast(PlatformInfo::Instance().GetCoreNum(CoreType::CORE_TYPE_VECTOR)); auto maxUbSize = static_cast(PlatformInfo::Instance().GetUbSize()); - tilingDataPtr->maxUbSize = maxUbSize; + tilingDataPtr.maxUbSize = maxUbSize; uint32_t multiple = 1; - bool condition = tilingDataPtr->cosFormat == 0 && cosSize == NUM_COSIN && + bool condition = tilingDataPtr.cosFormat == 0 && cosSize == NUM_COSIN && launchParam.GetInTensor(NUM_COSIN).desc.dtype == TENSOR_DTYPE_FLOAT16 && - ntokens >= LARGE_NTOKENS_THRESHOLD && headDim / tilingDataPtr->rotaryCoeff % ELE_NUM_FP16 == 0; + ntokens >= LARGE_NTOKENS_THRESHOLD && headDim / tilingDataPtr.rotaryCoeff % ELE_NUM_FP16 == 0; if (condition) { // 不对齐场景, multiple为1 uint32_t hiddenSize = hiddenSizeK > hiddenSizeQ ? 
hiddenSizeK : hiddenSizeQ; multiple = SLICE_SIZE_FP16_LARGE_NTOKENS / hiddenSize; @@ -69,13 +69,13 @@ void RopeNdProcess(const LaunchParam &launchParam, KernelInfo &kernelInfo, RopeT } uint32_t tempCore = (ntokens + maxCore - 1) / maxCore; uint32_t realCore = (ntokens + tempCore - 1) / tempCore; - tilingDataPtr->realCore = realCore; - tilingDataPtr->hiddenSizeQ = hiddenSizeQ; - tilingDataPtr->hiddenSizeK = hiddenSizeK; - tilingDataPtr->headDim = headDim; - tilingDataPtr->ntokens = ntokens; - tilingDataPtr->batch = batch; - tilingDataPtr->multiple = multiple; + tilingDataPtr.realCore = realCore; + tilingDataPtr.hiddenSizeQ = hiddenSizeQ; + tilingDataPtr.hiddenSizeK = hiddenSizeK; + tilingDataPtr.headDim = headDim; + tilingDataPtr.ntokens = ntokens; + tilingDataPtr.batch = batch; + tilingDataPtr.multiple = multiple; kernelInfo.SetBlockDim(realCore); MKI_LOG(DEBUG) << "Ntokens is " << ntokens; MKI_LOG(DEBUG) << "Batch is " << batch; @@ -157,7 +157,7 @@ Status RopeTiling(const LaunchParam &launchParam, KernelInfo &kernelInfo) } tilingDataPtr->headNumQ = headNumQ; tilingDataPtr->headNumK = headNumK; - auto ret = TilingKeyChose(launchParam, kernelInfo, tilingDataPtr); + auto ret = TilingKeyChose(launchParam, kernelInfo, *tilingDataPtr); if (!ret.Ok()) { return Status::FailStatus(ERROR_INVALID_VALUE); } diff --git a/src/kernels/mixkernels/rope_q_concat/tiling/rope_q_concat_tiling.cpp b/src/kernels/mixkernels/rope_q_concat/tiling/rope_q_concat_tiling.cpp index 01b09360..97ad7741 100644 --- a/src/kernels/mixkernels/rope_q_concat/tiling/rope_q_concat_tiling.cpp +++ b/src/kernels/mixkernels/rope_q_concat/tiling/rope_q_concat_tiling.cpp @@ -104,7 +104,7 @@ Status RopeQConcatTiling(const LaunchParam &launchParam, KernelInfo &kernelInfo) if (!ret.Ok()) { return Status::FailStatus(ERROR_INVALID_VALUE); } - auto retProcess = RopeNdProcess(launchParam, tilingDataPtr); + auto retProcess = RopeNdProcess(launchParam, *tilingDataPtr); if (!retProcess.Ok()) { return 
Status::FailStatus(ERROR_INVALID_VALUE); } -- Gitee From 4a422fb6092b946f6ab5393b94a25ed5b89bc4ad Mon Sep 17 00:00:00 2001 From: Vector Date: Sat, 20 Sep 2025 16:16:00 +0800 Subject: [PATCH 26/94] update --- example/op_demo/activation/README.md | 6 +++--- example/op_demo/all_gather/README.md | 6 +++--- example/op_demo/all_reduce/README.md | 6 +++--- example/op_demo/concat/README.md | 6 +++--- example/op_demo/elewise/README.md | 6 +++--- example/op_demo/faupdate/README.md | 6 +++--- example/op_demo/fused_add_topk_div/README.md | 6 +++--- example/op_demo/gather/README.md | 6 +++--- example/op_demo/layer_norm/README.md | 6 +++--- example/op_demo/linear/README.md | 6 +++--- example/op_demo/linear_parallel/README.md | 6 +++--- example/op_demo/mla_preprocess/README.md | 6 +++--- example/op_demo/multi_latent_attention/README.md | 6 +++--- example/op_demo/paged_attention/README.md | 6 +++--- example/op_demo/paged_cache_load/README.md | 2 +- example/op_demo/reshape_and_cache/README.md | 6 +++--- example/op_demo/ring_mla/README.md | 6 +++--- example/op_demo/rms_norm/README.md | 2 +- example/op_demo/rms_norm_backward/README.md | 6 +++--- example/op_demo/rope/README.md | 2 +- example/op_demo/self_attention/README.md | 6 +++--- example/op_demo/slice/README.md | 6 +++--- example/op_demo/split/README.md | 6 +++--- example/op_demo/transdata/README.md | 6 +++--- example/op_demo/transpose/README.md | 6 +++--- 25 files changed, 69 insertions(+), 69 deletions(-) diff --git a/example/op_demo/activation/README.md b/example/op_demo/activation/README.md index 2a8b3ecf..2616d541 100644 --- a/example/op_demo/activation/README.md +++ b/example/op_demo/activation/README.md @@ -9,16 +9,16 @@ 2. source [nnal安装路径]/set_env.sh 默认:source /usr/local/Ascend/nnal/atb/set_env.sh 1. 如果使用加速库源码编译,source [加速库源码路径]/output/atb/set_env.sh - e.g. 
source ./ascend-transformer-boost/output/atb/set_env.sh + 例如: source ./ascend-transformer-boost/output/atb/set_env.sh - 运行demo - bash build.sh **注意**: - - 使用cxx_abi=0(默认)时,设置`D_GLIBCXX_USE_CXX11_ABI`为0,i.e. + - 使用cxx_abi=0(默认)时,设置`D_GLIBCXX_USE_CXX11_ABI`为0,即: ```sh g++ -D_GLIBCXX_USE_CXX11_ABI=0 -I ... ``` - - 使用cxx_abi=1时,更改`D_GLIBCXX_USE_CXX11_ABI`为1,i.e. + - 使用cxx_abi=1时,更改`D_GLIBCXX_USE_CXX11_ABI`为1,即: ```sh g++ -D_GLIBCXX_USE_CXX11_ABI=1 -I ... ``` diff --git a/example/op_demo/all_gather/README.md b/example/op_demo/all_gather/README.md index feb6b94b..8897b7bf 100644 --- a/example/op_demo/all_gather/README.md +++ b/example/op_demo/all_gather/README.md @@ -9,16 +9,16 @@ 2. source [nnal安装路径]/set_env.sh 默认:source /usr/local/Ascend/nnal/atb/set_env.sh 1. 如果使用加速库源码编译,source [加速库源码路径]/output/atb/set_env.sh - e.g. source ./ascend-transformer-boost/output/atb/set_env.sh + 例如: source ./ascend-transformer-boost/output/atb/set_env.sh - 运行demo - bash build.sh **注意**: - - 使用cxx_abi=0(默认)时,设置`D_GLIBCXX_USE_CXX11_ABI`为0,i.e. + - 使用cxx_abi=0(默认)时,设置`D_GLIBCXX_USE_CXX11_ABI`为0,即: ```sh g++ -D_GLIBCXX_USE_CXX11_ABI=0 -I ... ``` - - 使用cxx_abi=1时,更改`D_GLIBCXX_USE_CXX11_ABI`为1,i.e. + - 使用cxx_abi=1时,更改`D_GLIBCXX_USE_CXX11_ABI`为1,即: ```sh g++ -D_GLIBCXX_USE_CXX11_ABI=1 -I ... ``` diff --git a/example/op_demo/all_reduce/README.md b/example/op_demo/all_reduce/README.md index 44ade794..17d8d4b0 100644 --- a/example/op_demo/all_reduce/README.md +++ b/example/op_demo/all_reduce/README.md @@ -9,16 +9,16 @@ 2. source [nnal安装路径]/set_env.sh 默认:source /usr/local/Ascend/nnal/atb/set_env.sh 1. 如果使用加速库源码编译,source [加速库源码路径]/output/atb/set_env.sh - e.g. source ./ascend-transformer-boost/output/atb/set_env.sh + 例如: source ./ascend-transformer-boost/output/atb/set_env.sh - 运行demo - bash build.sh **注意**: - - 使用cxx_abi=0(默认)时,设置`D_GLIBCXX_USE_CXX11_ABI`为0,i.e. + - 使用cxx_abi=0(默认)时,设置`D_GLIBCXX_USE_CXX11_ABI`为0,即: ```sh g++ -D_GLIBCXX_USE_CXX11_ABI=0 -I ... 
``` - - 使用cxx_abi=1时,更改`D_GLIBCXX_USE_CXX11_ABI`为1,i.e. + - 使用cxx_abi=1时,更改`D_GLIBCXX_USE_CXX11_ABI`为1,即: ```sh g++ -D_GLIBCXX_USE_CXX11_ABI=1 -I ... ``` diff --git a/example/op_demo/concat/README.md b/example/op_demo/concat/README.md index cc39ea1c..dd76fc32 100644 --- a/example/op_demo/concat/README.md +++ b/example/op_demo/concat/README.md @@ -9,16 +9,16 @@ 2. source [nnal安装路径]/set_env.sh 默认:source /usr/local/Ascend/nnal/atb/set_env.sh 1. 如果使用加速库源码编译,source [加速库源码路径]/output/atb/set_env.sh - e.g. source ./ascend-transformer-boost/output/atb/set_env.sh + 例如: source ./ascend-transformer-boost/output/atb/set_env.sh - 运行demo - bash build.sh **注意**: - - 使用cxx_abi=0(默认)时,设置`D_GLIBCXX_USE_CXX11_ABI`为0,i.e. + - 使用cxx_abi=0(默认)时,设置`D_GLIBCXX_USE_CXX11_ABI`为0,即: ```sh g++ -D_GLIBCXX_USE_CXX11_ABI=0 -I ... ``` - - 使用cxx_abi=1时,更改`D_GLIBCXX_USE_CXX11_ABI`为1,i.e. + - 使用cxx_abi=1时,更改`D_GLIBCXX_USE_CXX11_ABI`为1,即: ```sh g++ -D_GLIBCXX_USE_CXX11_ABI=1 -I ... ``` diff --git a/example/op_demo/elewise/README.md b/example/op_demo/elewise/README.md index 7a0fe069..cd9c719b 100644 --- a/example/op_demo/elewise/README.md +++ b/example/op_demo/elewise/README.md @@ -9,16 +9,16 @@ 2. source [nnal安装路径]/set_env.sh 默认:source /usr/local/Ascend/nnal/atb/set_env.sh 1. 如果使用加速库源码编译,source [加速库源码路径]/output/atb/set_env.sh - e.g. source ./ascend-transformer-boost/output/atb/set_env.sh + 例如: source ./ascend-transformer-boost/output/atb/set_env.sh - 运行demo - bash build.sh **注意**: - - 使用cxx_abi=0(默认)时,设置`D_GLIBCXX_USE_CXX11_ABI`为0,i.e. + - 使用cxx_abi=0(默认)时,设置`D_GLIBCXX_USE_CXX11_ABI`为0,即: ```sh g++ -D_GLIBCXX_USE_CXX11_ABI=0 -I ... ``` - - 使用cxx_abi=1时,更改`D_GLIBCXX_USE_CXX11_ABI`为1,i.e. + - 使用cxx_abi=1时,更改`D_GLIBCXX_USE_CXX11_ABI`为1,即: ```sh g++ -D_GLIBCXX_USE_CXX11_ABI=1 -I ... ``` diff --git a/example/op_demo/faupdate/README.md b/example/op_demo/faupdate/README.md index bc74850a..b65d1b9a 100644 --- a/example/op_demo/faupdate/README.md +++ b/example/op_demo/faupdate/README.md @@ -9,18 +9,18 @@ 2. 
source [nnal安装路径]/set_env.sh 默认:source /usr/local/Ascend/nnal/atb/set_env.sh 1. 如果使用加速库源码编译,source [加速库源码路径]/output/atb/set_env.sh - e.g. source ./ascend-transformer-boost/output/atb/set_env.sh + 例如: source ./ascend-transformer-boost/output/atb/set_env.sh - 运行demo ```sh bash build.sh ``` **注意**: - - 使用cxx_abi=0(默认)时,设置`D_GLIBCXX_USE_CXX11_ABI`为0,i.e. + - 使用cxx_abi=0(默认)时,设置`D_GLIBCXX_USE_CXX11_ABI`为0,即: ```sh g++ -D_GLIBCXX_USE_CXX11_ABI=0 -I ... ``` - - 使用cxx_abi=1时,更改`D_GLIBCXX_USE_CXX11_ABI`为1,i.e. + - 使用cxx_abi=1时,更改`D_GLIBCXX_USE_CXX11_ABI`为1,即: ```sh g++ -D_GLIBCXX_USE_CXX11_ABI=1 -I ... ``` diff --git a/example/op_demo/fused_add_topk_div/README.md b/example/op_demo/fused_add_topk_div/README.md index ac07c0f9..c4a50d74 100644 --- a/example/op_demo/fused_add_topk_div/README.md +++ b/example/op_demo/fused_add_topk_div/README.md @@ -9,16 +9,16 @@ 2. source [nnal安装路径]/set_env.sh 默认:source /usr/local/Ascend/nnal/atb/set_env.sh 1. 如果使用加速库源码编译,source [加速库源码路径]/output/atb/set_env.sh - e.g. source ./ascend-transformer-boost/output/atb/set_env.sh + 例如: source ./ascend-transformer-boost/output/atb/set_env.sh - 运行demo - bash build.sh **注意**: - - 使用cxx_abi=0(默认)时,设置`D_GLIBCXX_USE_CXX11_ABI`为0,i.e. + - 使用cxx_abi=0(默认)时,设置`D_GLIBCXX_USE_CXX11_ABI`为0,即: ```sh g++ -D_GLIBCXX_USE_CXX11_ABI=0 -I ... ``` - - 使用cxx_abi=1时,更改`D_GLIBCXX_USE_CXX11_ABI`为1,i.e. + - 使用cxx_abi=1时,更改`D_GLIBCXX_USE_CXX11_ABI`为1,即: ```sh g++ -D_GLIBCXX_USE_CXX11_ABI=1 -I ... ``` diff --git a/example/op_demo/gather/README.md b/example/op_demo/gather/README.md index 84cce479..b0e16a71 100644 --- a/example/op_demo/gather/README.md +++ b/example/op_demo/gather/README.md @@ -9,16 +9,16 @@ 2. source [nnal安装路径]/set_env.sh 默认:source /usr/local/Ascend/nnal/atb/set_env.sh 1. 如果使用加速库源码编译,source [加速库源码路径]/output/atb/set_env.sh - e.g. 
source ./ascend-transformer-boost/output/atb/set_env.sh + 例如: source ./ascend-transformer-boost/output/atb/set_env.sh - 运行demo - bash build.sh **注意**: - - 使用cxx_abi=0(默认)时,设置`D_GLIBCXX_USE_CXX11_ABI`为0,i.e. + - 使用cxx_abi=0(默认)时,设置`D_GLIBCXX_USE_CXX11_ABI`为0,即: ```sh g++ -D_GLIBCXX_USE_CXX11_ABI=0 -I ... ``` - - 使用cxx_abi=1时,更改`D_GLIBCXX_USE_CXX11_ABI`为1,i.e. + - 使用cxx_abi=1时,更改`D_GLIBCXX_USE_CXX11_ABI`为1,即: ```sh g++ -D_GLIBCXX_USE_CXX11_ABI=1 -I ... ``` diff --git a/example/op_demo/layer_norm/README.md b/example/op_demo/layer_norm/README.md index 76f0f57c..cf4437eb 100644 --- a/example/op_demo/layer_norm/README.md +++ b/example/op_demo/layer_norm/README.md @@ -9,16 +9,16 @@ 2. source [nnal安装路径]/set_env.sh 默认:source /usr/local/Ascend/nnal/atb/set_env.sh 1. 如果使用加速库源码编译,source [加速库源码路径]/output/atb/set_env.sh - e.g. source ./ascend-transformer-boost/output/atb/set_env.sh + 例如: source ./ascend-transformer-boost/output/atb/set_env.sh - 运行demo - bash build.sh **注意**: - - 使用cxx_abi=0(默认)时,设置`D_GLIBCXX_USE_CXX11_ABI`为0,i.e. + - 使用cxx_abi=0(默认)时,设置`D_GLIBCXX_USE_CXX11_ABI`为0,即: ```sh g++ -D_GLIBCXX_USE_CXX11_ABI=0 -I ... ``` - - 使用cxx_abi=1时,更改`D_GLIBCXX_USE_CXX11_ABI`为1,i.e. + - 使用cxx_abi=1时,更改`D_GLIBCXX_USE_CXX11_ABI`为1,即: ```sh g++ -D_GLIBCXX_USE_CXX11_ABI=1 -I ... ``` diff --git a/example/op_demo/linear/README.md b/example/op_demo/linear/README.md index a4f517e7..f7c45a0d 100644 --- a/example/op_demo/linear/README.md +++ b/example/op_demo/linear/README.md @@ -11,7 +11,7 @@ 2. source [nnal安装路径]/set_env.sh 默认:source /usr/local/Ascend/nnal/atb/set_env.sh 1. 如果使用加速库源码编译,source [加速库源码路径]/output/atb/set_env.sh - e.g. source ./ascend-transformer-boost/output/atb/set_env.sh + 例如: source ./ascend-transformer-boost/output/atb/set_env.sh - 运行demo @@ -20,12 +20,12 @@ ``` **注意**: - - 使用cxx_abi=0(默认)时,设置`D_GLIBCXX_USE_CXX11_ABI`为0,i.e. + - 使用cxx_abi=0(默认)时,设置`D_GLIBCXX_USE_CXX11_ABI`为0,即: ```sh g++ -D_GLIBCXX_USE_CXX11_ABI=0 -I ... 
``` - - 使用cxx_abi=1时,更改`D_GLIBCXX_USE_CXX11_ABI`为1,i.e. + - 使用cxx_abi=1时,更改`D_GLIBCXX_USE_CXX11_ABI`为1,即: ```sh g++ -D_GLIBCXX_USE_CXX11_ABI=1 -I ... diff --git a/example/op_demo/linear_parallel/README.md b/example/op_demo/linear_parallel/README.md index 7e2eb96b..3885aac5 100644 --- a/example/op_demo/linear_parallel/README.md +++ b/example/op_demo/linear_parallel/README.md @@ -9,16 +9,16 @@ 2. source [nnal安装路径]/set_env.sh 默认:source /usr/local/Ascend/nnal/atb/set_env.sh 1. 如果使用加速库源码编译,source [加速库源码路径]/output/atb/set_env.sh - e.g. source ./ascend-transformer-boost/output/atb/set_env.sh + 例如: source ./ascend-transformer-boost/output/atb/set_env.sh - 运行demo - bash build.sh **注意**: - - 使用cxx_abi=0(默认)时,设置`D_GLIBCXX_USE_CXX11_ABI`为0,i.e. + - 使用cxx_abi=0(默认)时,设置`D_GLIBCXX_USE_CXX11_ABI`为0,即: ```sh g++ -D_GLIBCXX_USE_CXX11_ABI=0 -I ... ``` - - 使用cxx_abi=1时,更改`D_GLIBCXX_USE_CXX11_ABI`为1,i.e. + - 使用cxx_abi=1时,更改`D_GLIBCXX_USE_CXX11_ABI`为1,即: ```sh g++ -D_GLIBCXX_USE_CXX11_ABI=1 -I ... ``` diff --git a/example/op_demo/mla_preprocess/README.md b/example/op_demo/mla_preprocess/README.md index 752d62c7..87c24619 100644 --- a/example/op_demo/mla_preprocess/README.md +++ b/example/op_demo/mla_preprocess/README.md @@ -9,16 +9,16 @@ 2. source [nnal安装路径]/set_env.sh 默认:source /usr/local/Ascend/nnal/atb/set_env.sh 1. 如果使用加速库源码编译,source [加速库源码路径]/output/atb/set_env.sh - e.g. source ./ascend-transformer-boost/output/atb/set_env.sh + 例如: source ./ascend-transformer-boost/output/atb/set_env.sh - 运行demo - bash build.sh **注意**: - - 使用cxx_abi=0(默认)时,设置`D_GLIBCXX_USE_CXX11_ABI`为0,i.e. + - 使用cxx_abi=0(默认)时,设置`D_GLIBCXX_USE_CXX11_ABI`为0,即: ```sh g++ -D_GLIBCXX_USE_CXX11_ABI=0 -I ... ``` - - 使用cxx_abi=1时,更改`D_GLIBCXX_USE_CXX11_ABI`为1,i.e. + - 使用cxx_abi=1时,更改`D_GLIBCXX_USE_CXX11_ABI`为1,即: ```sh g++ -D_GLIBCXX_USE_CXX11_ABI=1 -I ... 
``` diff --git a/example/op_demo/multi_latent_attention/README.md b/example/op_demo/multi_latent_attention/README.md index bc2c43a7..55ef7ce4 100644 --- a/example/op_demo/multi_latent_attention/README.md +++ b/example/op_demo/multi_latent_attention/README.md @@ -9,15 +9,15 @@ 2. source [nnal安装路径]/set_env.sh 默认:source /usr/local/Ascend/nnal/atb/set_env.sh 1. 如果使用加速库源码编译,source [加速库源码路径]/output/atb/set_env.sh - e.g. source ./ascend-transformer-boost/output/atb/set_env.sh + 例如: source ./ascend-transformer-boost/output/atb/set_env.sh - 运行demo - bash build.sh **注意**: - - 使用cxx_abi=0(默认)时,设置`D_GLIBCXX_USE_CXX11_ABI`为0,i.e. + - 使用cxx_abi=0(默认)时,设置`D_GLIBCXX_USE_CXX11_ABI`为0,即: ```sh g++ -D_GLIBCXX_USE_CXX11_ABI=0 -I ... ``` - - 使用cxx_abi=1时,更改`D_GLIBCXX_USE_CXX11_ABI`为1,i.e. + - 使用cxx_abi=1时,更改`D_GLIBCXX_USE_CXX11_ABI`为1,即: ```sh g++ -D_GLIBCXX_USE_CXX11_ABI=1 -I ... ``` diff --git a/example/op_demo/paged_attention/README.md b/example/op_demo/paged_attention/README.md index ac6cb71b..49fcbd6b 100644 --- a/example/op_demo/paged_attention/README.md +++ b/example/op_demo/paged_attention/README.md @@ -9,16 +9,16 @@ 2. source [nnal安装路径]/set_env.sh 默认:source /usr/local/Ascend/nnal/atb/set_env.sh 1. 如果使用加速库源码编译,source [加速库源码路径]/output/atb/set_env.sh - e.g. source ./ascend-transformer-boost/output/atb/set_env.sh + 例如: source ./ascend-transformer-boost/output/atb/set_env.sh - 运行demo - bash build.sh **注意**: - - 使用cxx_abi=0(默认)时,设置`D_GLIBCXX_USE_CXX11_ABI`为0,i.e. + - 使用cxx_abi=0(默认)时,设置`D_GLIBCXX_USE_CXX11_ABI`为0,即: ```sh g++ -D_GLIBCXX_USE_CXX11_ABI=0 -I ... ``` - - 使用cxx_abi=1时,更改`D_GLIBCXX_USE_CXX11_ABI`为1,i.e. + - 使用cxx_abi=1时,更改`D_GLIBCXX_USE_CXX11_ABI`为1,即: ```sh g++ -D_GLIBCXX_USE_CXX11_ABI=1 -I ... ``` diff --git a/example/op_demo/paged_cache_load/README.md b/example/op_demo/paged_cache_load/README.md index 13b7e62b..055e0bef 100644 --- a/example/op_demo/paged_cache_load/README.md +++ b/example/op_demo/paged_cache_load/README.md @@ -9,7 +9,7 @@ 2. 
source [nnal安装路径]/set_env.sh 默认:source /usr/local/Ascend/nnal/atb/set_env.sh 1. 如果使用加速库源码编译,source [加速库源码路径]/output/atb/set_env.sh - e.g. source ./ascend-transformer-boost/output/atb/set_env.sh + 例如: source ./ascend-transformer-boost/output/atb/set_env.sh - 编译、运行demo - bash build.sh diff --git a/example/op_demo/reshape_and_cache/README.md b/example/op_demo/reshape_and_cache/README.md index 42c48a87..f4e4ede9 100644 --- a/example/op_demo/reshape_and_cache/README.md +++ b/example/op_demo/reshape_and_cache/README.md @@ -9,16 +9,16 @@ 2. source [nnal安装路径]/set_env.sh 默认:source /usr/local/Ascend/nnal/atb/set_env.sh 1. 如果使用加速库源码编译,source [加速库源码路径]/output/atb/set_env.sh - e.g. source ./ascend-transformer-boost/output/atb/set_env.sh + 例如: source ./ascend-transformer-boost/output/atb/set_env.sh - 运行demo - bash build.sh **注意**: - - 使用cxx_abi=0(默认)时,设置`D_GLIBCXX_USE_CXX11_ABI`为0,i.e. + - 使用cxx_abi=0(默认)时,设置`D_GLIBCXX_USE_CXX11_ABI`为0,即: ```sh g++ -D_GLIBCXX_USE_CXX11_ABI=0 -I ... ``` - - 使用cxx_abi=1时,更改`D_GLIBCXX_USE_CXX11_ABI`为1,i.e. + - 使用cxx_abi=1时,更改`D_GLIBCXX_USE_CXX11_ABI`为1,即: ```sh g++ -D_GLIBCXX_USE_CXX11_ABI=1 -I ... ``` diff --git a/example/op_demo/ring_mla/README.md b/example/op_demo/ring_mla/README.md index cc89d794..333678bd 100644 --- a/example/op_demo/ring_mla/README.md +++ b/example/op_demo/ring_mla/README.md @@ -9,16 +9,16 @@ 2. source [nnal安装路径]/set_env.sh 默认:source /usr/local/Ascend/nnal/atb/set_env.sh 1. 如果使用加速库源码编译,source [加速库源码路径]/output/atb/set_env.sh - e.g. source ./ascend-transformer-boost/output/atb/set_env.sh + 例如: source ./ascend-transformer-boost/output/atb/set_env.sh - 运行demo - bash build.sh **注意**: - - 使用cxx_abi=0时,设置`D_GLIBCXX_USE_CXX11_ABI`为0,i.e. + - 使用cxx_abi=0时,设置`D_GLIBCXX_USE_CXX11_ABI`为0,即: ```sh g++ -D_GLIBCXX_USE_CXX11_ABI=0 -I ... ``` - - 使用cxx_abi=1时,更改`D_GLIBCXX_USE_CXX11_ABI`为1,i.e. + - 使用cxx_abi=1时,更改`D_GLIBCXX_USE_CXX11_ABI`为1,即: ```sh g++ -D_GLIBCXX_USE_CXX11_ABI=1 -I ... 
``` diff --git a/example/op_demo/rms_norm/README.md b/example/op_demo/rms_norm/README.md index 7e5a3f73..f3b43873 100644 --- a/example/op_demo/rms_norm/README.md +++ b/example/op_demo/rms_norm/README.md @@ -9,7 +9,7 @@ 2. source [nnal安装路径]/set_env.sh 默认:source /usr/local/Ascend/nnal/atb/set_env.sh 1. 如果使用加速库源码编译,source [加速库源码路径]/output/atb/set_env.sh - e.g. source ./ascend-transformer-boost/output/atb/set_env.sh + 例如: source ./ascend-transformer-boost/output/atb/set_env.sh - 编译、运行demo - bash build.sh diff --git a/example/op_demo/rms_norm_backward/README.md b/example/op_demo/rms_norm_backward/README.md index 8ab2c459..78daf3f9 100644 --- a/example/op_demo/rms_norm_backward/README.md +++ b/example/op_demo/rms_norm_backward/README.md @@ -9,16 +9,16 @@ 2. source [nnal安装路径]/set_env.sh 默认:source /usr/local/Ascend/nnal/atb/set_env.sh 1. 如果使用加速库源码编译,source [加速库源码路径]/output/atb/set_env.sh - e.g. source ./ascend-transformer-boost/output/atb/set_env.sh + 例如: source ./ascend-transformer-boost/output/atb/set_env.sh - 运行demo - bash build.sh **注意**: - - 使用cxx_abi=0(默认)时,设置`D_GLIBCXX_USE_CXX11_ABI`为0,i.e. + - 使用cxx_abi=0(默认)时,设置`D_GLIBCXX_USE_CXX11_ABI`为0,即: ```sh g++ -D_GLIBCXX_USE_CXX11_ABI=0 -I ... ``` - - 使用cxx_abi=1时,更改`D_GLIBCXX_USE_CXX11_ABI`为1,i.e. + - 使用cxx_abi=1时,更改`D_GLIBCXX_USE_CXX11_ABI`为1,即: ```sh g++ -D_GLIBCXX_USE_CXX11_ABI=1 -I ... ``` diff --git a/example/op_demo/rope/README.md b/example/op_demo/rope/README.md index 6898cb29..c6c1ec27 100644 --- a/example/op_demo/rope/README.md +++ b/example/op_demo/rope/README.md @@ -9,7 +9,7 @@ 2. source [nnal安装路径]/set_env.sh 默认:source /usr/local/Ascend/nnal/atb/set_env.sh 1. 如果使用加速库源码编译,source [加速库源码路径]/output/atb/set_env.sh - e.g. 
source ./ascend-transformer-boost/output/atb/set_env.sh + 例如: source ./ascend-transformer-boost/output/atb/set_env.sh - 编译、运行demo - bash build.sh diff --git a/example/op_demo/self_attention/README.md b/example/op_demo/self_attention/README.md index a6d8ac9a..8f428192 100644 --- a/example/op_demo/self_attention/README.md +++ b/example/op_demo/self_attention/README.md @@ -9,16 +9,16 @@ 2. source [nnal安装路径]/set_env.sh 默认:source /usr/local/Ascend/nnal/atb/set_env.sh 1. 如果使用加速库源码编译,source [加速库源码路径]/output/atb/set_env.sh - e.g. source ./ascend-transformer-boost/output/atb/set_env.sh + 例如: source ./ascend-transformer-boost/output/atb/set_env.sh - 运行demo - bash build.sh **注意**: - - 使用cxx_abi=0时,设置`D_GLIBCXX_USE_CXX11_ABI`为0,i.e. + - 使用cxx_abi=0时,设置`D_GLIBCXX_USE_CXX11_ABI`为0,即: ```sh g++ -D_GLIBCXX_USE_CXX11_ABI=0 -I ... ``` - - 使用cxx_abi=1时,更改`D_GLIBCXX_USE_CXX11_ABI`为1,i.e. + - 使用cxx_abi=1时,更改`D_GLIBCXX_USE_CXX11_ABI`为1,即: ```sh g++ -D_GLIBCXX_USE_CXX11_ABI=1 -I ... ``` diff --git a/example/op_demo/slice/README.md b/example/op_demo/slice/README.md index 9b3826c0..aa0ebb81 100644 --- a/example/op_demo/slice/README.md +++ b/example/op_demo/slice/README.md @@ -9,16 +9,16 @@ 2. source [nnal安装路径]/set_env.sh 默认:source /usr/local/Ascend/nnal/atb/set_env.sh 1. 如果使用加速库源码编译,source [加速库源码路径]/output/atb/set_env.sh - e.g. source ./ascend-transformer-boost/output/atb/set_env.sh + 例如: source ./ascend-transformer-boost/output/atb/set_env.sh - 运行demo - bash build.sh **注意**: - - 使用cxx_abi=0(默认)时,设置`D_GLIBCXX_USE_CXX11_ABI`为0,i.e. + - 使用cxx_abi=0(默认)时,设置`D_GLIBCXX_USE_CXX11_ABI`为0,即: ```sh g++ -D_GLIBCXX_USE_CXX11_ABI=0 -I ... ``` - - 使用cxx_abi=1时,更改`D_GLIBCXX_USE_CXX11_ABI`为1,i.e. + - 使用cxx_abi=1时,更改`D_GLIBCXX_USE_CXX11_ABI`为1,即: ```sh g++ -D_GLIBCXX_USE_CXX11_ABI=1 -I ... ``` diff --git a/example/op_demo/split/README.md b/example/op_demo/split/README.md index 3a1db3b2..39fa1e02 100644 --- a/example/op_demo/split/README.md +++ b/example/op_demo/split/README.md @@ -9,16 +9,16 @@ 2. 
source [nnal安装路径]/set_env.sh 默认:source /usr/local/Ascend/nnal/atb/set_env.sh 1. 如果使用加速库源码编译,source [加速库源码路径]/output/atb/set_env.sh - e.g. source ./ascend-transformer-boost/output/atb/set_env.sh + 例如: source ./ascend-transformer-boost/output/atb/set_env.sh - 运行demo - bash build.sh **注意**: - - 使用cxx_abi=0(默认)时,设置`D_GLIBCXX_USE_CXX11_ABI`为0,i.e. + - 使用cxx_abi=0(默认)时,设置`D_GLIBCXX_USE_CXX11_ABI`为0,即: ```sh g++ -D_GLIBCXX_USE_CXX11_ABI=0 -I ... ``` - - 使用cxx_abi=1时,更改`D_GLIBCXX_USE_CXX11_ABI`为1,i.e. + - 使用cxx_abi=1时,更改`D_GLIBCXX_USE_CXX11_ABI`为1,即: ```sh g++ -D_GLIBCXX_USE_CXX11_ABI=1 -I ... ``` diff --git a/example/op_demo/transdata/README.md b/example/op_demo/transdata/README.md index e162bc70..99b54219 100644 --- a/example/op_demo/transdata/README.md +++ b/example/op_demo/transdata/README.md @@ -9,16 +9,16 @@ 2. source [nnal安装路径]/set_env.sh 默认:source /usr/local/Ascend/nnal/atb/set_env.sh 1. 如果使用加速库源码编译,source [加速库源码路径]/output/atb/set_env.sh - e.g. source ./ascend-transformer-boost/output/atb/set_env.sh + 例如: source ./ascend-transformer-boost/output/atb/set_env.sh - 运行demo - bash build.sh **注意**: - - 使用cxx_abi=0(默认)时,设置`D_GLIBCXX_USE_CXX11_ABI`为0,i.e. + - 使用cxx_abi=0(默认)时,设置`D_GLIBCXX_USE_CXX11_ABI`为0,即: ```sh g++ -D_GLIBCXX_USE_CXX11_ABI=0 -I ... ``` - - 使用cxx_abi=1时,更改`D_GLIBCXX_USE_CXX11_ABI`为1,i.e. + - 使用cxx_abi=1时,更改`D_GLIBCXX_USE_CXX11_ABI`为1,即: ```sh g++ -D_GLIBCXX_USE_CXX11_ABI=1 -I ... ``` diff --git a/example/op_demo/transpose/README.md b/example/op_demo/transpose/README.md index 8a1e141f..d73e19e9 100644 --- a/example/op_demo/transpose/README.md +++ b/example/op_demo/transpose/README.md @@ -9,16 +9,16 @@ 2. source [nnal安装路径]/set_env.sh 默认:source /usr/local/Ascend/nnal/atb/set_env.sh 1. 如果使用加速库源码编译,source [加速库源码路径]/output/atb/set_env.sh - e.g. source ./ascend-transformer-boost/output/atb/set_env.sh + 例如: source ./ascend-transformer-boost/output/atb/set_env.sh - 运行demo - bash build.sh **注意**: - - 使用cxx_abi=0(默认)时,设置`D_GLIBCXX_USE_CXX11_ABI`为0,i.e. 
+ - 使用cxx_abi=0(默认)时,设置`D_GLIBCXX_USE_CXX11_ABI`为0,即: ```sh g++ -D_GLIBCXX_USE_CXX11_ABI=0 -I ... ``` - - 使用cxx_abi=1时,更改`D_GLIBCXX_USE_CXX11_ABI`为1,i.e. + - 使用cxx_abi=1时,更改`D_GLIBCXX_USE_CXX11_ABI`为1,即: ```sh g++ -D_GLIBCXX_USE_CXX11_ABI=1 -I ... ``` -- Gitee From 3615ab3f4fcecce6ade9f71dd9220dd9827db012 Mon Sep 17 00:00:00 2001 From: huangxiaolan Date: Mon, 22 Sep 2025 09:29:48 +0800 Subject: [PATCH 27/94] fix function point --- .../mla_preprocess/tiling/mla_preprocess_tiling.cpp | 2 +- .../multi_latent_attention/tiling/mla_tiling_dependency.cpp | 2 +- .../swi_glu_quant/tiling/swi_glu_quant_tiling.cpp | 4 ++-- .../swi_glu_quant/tiling/swi_glu_quant_tiling_utils.h | 6 +++--- .../linear_parallel/linear_parallel_aclnn_runner.cpp | 4 ++-- .../mla_preprocess/mla_preprocess_aclnn_runner.cpp | 4 ++-- 6 files changed, 11 insertions(+), 11 deletions(-) diff --git a/src/kernels/mixkernels/mla_preprocess/tiling/mla_preprocess_tiling.cpp b/src/kernels/mixkernels/mla_preprocess/tiling/mla_preprocess_tiling.cpp index eee7b7db..4c9b0727 100644 --- a/src/kernels/mixkernels/mla_preprocess/tiling/mla_preprocess_tiling.cpp +++ b/src/kernels/mixkernels/mla_preprocess/tiling/mla_preprocess_tiling.cpp @@ -320,7 +320,7 @@ public: void RmsNormQuantTiling(const uint32_t numTokens); void RopeConcatTiling(const OpParam::MlaPreprocess ¶m, const uint32_t &aicNum); void EinSumQuantTiling(const OpParam::MlaPreprocess ¶m, const uint32_t &aicNum, const TensorDType inDtype); - void SetTiling(AtbOps::MlaTilingData *tilingParam); + void SetTiling(AtbOps::MlaTilingData &tilingParam); void SetTilingKey(const Mki::LaunchParam &launchParam, Mki::KernelInfo &kernelInfo); void SetMlapoWorkSpace(const TensorDType inDtype, const OpParam::MlaPreprocess ¶m, Mki::KernelInfo &kernelInfo); }; diff --git a/src/kernels/mixkernels/multi_latent_attention/tiling/mla_tiling_dependency.cpp b/src/kernels/mixkernels/multi_latent_attention/tiling/mla_tiling_dependency.cpp index c662de13..896c23eb 100644 --- 
a/src/kernels/mixkernels/multi_latent_attention/tiling/mla_tiling_dependency.cpp +++ b/src/kernels/mixkernels/multi_latent_attention/tiling/mla_tiling_dependency.cpp @@ -377,7 +377,7 @@ Status GetMLATilingParam(const LaunchParam &launchParam, MLAInfo &mmInfo, GetNdMLATiling(mmInfo, blockDim, tilingParam, param); blockDim = mmInfo.batch == BATCH_MLA ? BLOCK_DIM_MLA : blockDim; } - GetTilingHead(mmInfo, param, tilingParam, torPtr, blockDim); + GetTilingHead(mmInfo, param, tilingParam, *torPtr, blockDim); return AtbOps::Status::OkStatus(); } diff --git a/src/kernels/mixkernels/swi_glu_quant/tiling/swi_glu_quant_tiling.cpp b/src/kernels/mixkernels/swi_glu_quant/tiling/swi_glu_quant_tiling.cpp index 46579d28..51b6cc14 100644 --- a/src/kernels/mixkernels/swi_glu_quant/tiling/swi_glu_quant_tiling.cpp +++ b/src/kernels/mixkernels/swi_glu_quant/tiling/swi_glu_quant_tiling.cpp @@ -93,13 +93,13 @@ Status SwiGluQuantTiling(const LaunchParam &launchParam, KernelInfo &kernelInfo) tilingData->blockNum = BLOCK_SIZE / SIZE_OF_FLOAT16; tilingData->cacheLineLen = L2_CACHE_LINE_SIZE / SIZE_OF_FLOAT16; const Mki::SVector &xShape = launchParam.GetInTensor(0).desc.dims; - MKI_CHECK_NO_LOG(SetTotalShape(xShape, tilingData), return Status::FailStatus(ERROR_INVALID_VALUE)); + MKI_CHECK_NO_LOG(SetTotalShape(xShape, *tilingData), return Status::FailStatus(ERROR_INVALID_VALUE)); MKI_CHECK_NO_LOG(CalTilingData(tilingData), return Status::FailStatus(ERROR_INVALID_VALUE)); SetTilingData(tilingData); SwigluQuantTilingKeyChose(launchParam, kernelInfo); kernelInfo.SetBlockDim(tilingData->coreNumUsed); - PrintSwiQuantTiling(tilingData); + PrintSwiQuantTiling(*tilingData); return Status::OkStatus(); } } \ No newline at end of file diff --git a/src/kernels/mixkernels/swi_glu_quant/tiling/swi_glu_quant_tiling_utils.h b/src/kernels/mixkernels/swi_glu_quant/tiling/swi_glu_quant_tiling_utils.h index c1e1b60f..829268fd 100644 --- a/src/kernels/mixkernels/swi_glu_quant/tiling/swi_glu_quant_tiling_utils.h +++ 
b/src/kernels/mixkernels/swi_glu_quant/tiling/swi_glu_quant_tiling_utils.h @@ -80,7 +80,7 @@ inline bool CalculateMaxUbSizePerRow(SwiGluQuantTilingData& tilingData) uint32_t alignedColLen = AlignUp(colLen, tilingData.blockNum); MKI_CHECK(alignedColLen != 0, "CalculateMaxUbSizePerRow Unsupported alignedColLen == 0", return false); MKI_LOG(INFO) << "alignedColLen:" << alignedColLen << "\n"; - uint32_t ubAvail = tilingDat.dataNumSingleUb / alignedColLen; + uint32_t ubAvail = tilingData.dataNumSingleUb / alignedColLen; MKI_LOG(INFO) << "tilingData.dataNumSingleUb:" << tilingData.dataNumSingleUb << "\n"; MKI_LOG(INFO) << "ubAvail:" << ubAvail << "\n"; MKI_CHECK(ubAvail != 0, "The input vector is too large. It is not supported currently.", return false); @@ -93,9 +93,9 @@ inline bool CalculateMaxUbSizePerRow(SwiGluQuantTilingData& tilingData) return true; } -bool CalTilingData(SwiGluQuantTilingData *tilingData); +bool CalTilingData(SwiGluQuantTilingData &tilingData); -void SetTilingData(SwiGluQuantTilingData *tilingData); +void SetTilingData(SwiGluQuantTilingData &tilingData); } // namespace AsdOps #endif // OPS_SWI_GLU_QUANT_TILING_H \ No newline at end of file diff --git a/src/ops_infer/linear_parallel/linear_parallel_aclnn_runner.cpp b/src/ops_infer/linear_parallel/linear_parallel_aclnn_runner.cpp index 7e151f31..c98fb564 100644 --- a/src/ops_infer/linear_parallel/linear_parallel_aclnn_runner.cpp +++ b/src/ops_infer/linear_parallel/linear_parallel_aclnn_runner.cpp @@ -193,13 +193,13 @@ Status LinearParallelAclnnRunner::LoadMethodMatmulReduceScatter() static DlManager dlManager = DlManager(std::string(std::getenv("ASCEND_HOME_PATH")) + "/lib64/libopapi.so"); Status ret = dlManager.getSymbol("aclnnMatmulReduceScatterV2GetWorkspaceSize", - (void **)&LinearParallelAclnnRunner::aclnnMatmulReduceScatterV2GetWorkspaceSizeFunc_); + (void *&)&LinearParallelAclnnRunner::aclnnMatmulReduceScatterV2GetWorkspaceSizeFunc_); if (ret != NO_ERROR) { ATB_LOG(ERROR) << "load 
aclnnMatmulReduceScatterV2GetWorkspaceSize failed! Consider upgrade the CANN first!"; return ret; } ret = dlManager.getSymbol("aclnnMatmulReduceScatterV2", - (void **)&LinearParallelAclnnRunner::aclnnMatmulReduceScatterV2Func_); + (void *&)&LinearParallelAclnnRunner::aclnnMatmulReduceScatterV2Func_); if (ret != NO_ERROR) { ATB_LOG(ERROR) << "load aclnnMatmulReduceScatterV2 failed! Consider upgrade the CANN first!"; return ret; diff --git a/src/ops_infer/mla_preprocess/mla_preprocess_aclnn_runner.cpp b/src/ops_infer/mla_preprocess/mla_preprocess_aclnn_runner.cpp index 4c5a102e..7f7957e0 100644 --- a/src/ops_infer/mla_preprocess/mla_preprocess_aclnn_runner.cpp +++ b/src/ops_infer/mla_preprocess/mla_preprocess_aclnn_runner.cpp @@ -206,12 +206,12 @@ Status MlaPreprocessAclnnRunner::LoadMethod() } static DlManager dlManager = DlManager(std::string(std::getenv("ASCEND_HOME_PATH")) + "/lib64/libopapi.so"); Status ret = dlManager.getSymbol("aclnnMlaPreprocessGetWorkspaceSize", - (void **)&MlaPreprocessAclnnRunner::aclnnGetWorkspaceSizeFunc_); + (void *&)&MlaPreprocessAclnnRunner::aclnnGetWorkspaceSizeFunc_); if (ret != NO_ERROR) { ATB_LOG(ERROR) << "load aclnnMlaPreprocessGetWorkspaceSize failed! Consider upgrade the CANN first!"; return ret; } - ret = dlManager.getSymbol("aclnnMlaPreprocess", (void **)&MlaPreprocessAclnnRunner::aclnnExecuteFunc_); + ret = dlManager.getSymbol("aclnnMlaPreprocess", (void *&)&MlaPreprocessAclnnRunner::aclnnExecuteFunc_); if (ret != NO_ERROR) { ATB_LOG(ERROR) << "load aclnnMlaPreprocess failed! 
Consider upgrade the CANN first!"; return ret; -- Gitee From 107e31a04d7d4c688b9516c25cac0576699f0804 Mon Sep 17 00:00:00 2001 From: huangxiaolan Date: Mon, 22 Sep 2025 11:28:32 +0800 Subject: [PATCH 28/94] fix function point --- include/atb/atb_acl.h | 14 +++++++------- src/atb/operation/if_operation.cpp | 4 ++-- src/atb/operation/operation_base.h | 2 +- src/atb/runner/ops_runner.h | 2 +- src/cinterface/atb_acl_util.cpp | 4 ++-- .../tiling/fused_add_topk_div_tiling.cpp | 4 ++-- .../tiling/mla_preprocess_tiling.cpp | 2 +- src/kernels/mixkernels/rope/tiling/rope_tiling.cpp | 2 +- .../swi_glu_quant/tiling/swi_glu_quant_tiling.cpp | 4 ++-- .../linear_parallel_aclnn_runner.cpp | 4 ++-- .../mla_preprocess/mla_preprocess_aclnn_runner.cpp | 4 ++-- 11 files changed, 23 insertions(+), 23 deletions(-) diff --git a/include/atb/atb_acl.h b/include/atb/atb_acl.h index dfd8f0d4..377d37d7 100644 --- a/include/atb/atb_acl.h +++ b/include/atb/atb_acl.h @@ -55,7 +55,7 @@ atb::Status AtbFusedAddTopkDivGetWorkspaceSize(const aclTensor *x, const aclTens const aclTensor *mappingTable, uint32_t groupNum, uint32_t groupTopk, uint32_t n, uint32_t k, int activationType, bool isNorm, float scale, bool enableExpertMapping, aclTensor *y, aclTensor *indices, - uint64_t *workspaceSize, atb::Operation **op, atb::Context *context); + uint64_t &workspaceSize, atb::Operation **op, atb::Context *context); //! //! \brief 关于FusedAddTopkDiv算子使用aclnn风格调用的2段式接口的第2段, @@ -101,7 +101,7 @@ atb::Status AtbMLAGetWorkspaceSize(const aclTensor *qNope, const aclTensor *qRop const aclTensor *mask, const aclTensor *qSeqLen, const aclTensor *qkDescale, const aclTensor *pvDescale, int32_t headNum, float qkScale, int32_t kvHeadNum, int maskType, int calcType, uint8_t cacheMode, aclTensor *attenOut, aclTensor *lse, - uint64_t *workspaceSize, atb::Operation **op, atb::Context *context); + uint64_t &workspaceSize, atb::Operation **op, atb::Context &context); //! //! 
\brief 关于MLA算子使用aclnn风格调用的2段式接口的第2段, @@ -142,7 +142,7 @@ atb::Status AtbMLAPreFillGetWorkspaceSize(const aclTensor *q, const aclTensor *q const aclTensor *kRope, const aclTensor *v, const aclTensor *qSeqLen, const aclTensor *kvSeqLen, const aclTensor *mask, int32_t headNum, float qkScale, int32_t kvHeadNum, int maskType, uint8_t cacheMode, aclTensor *attenOut, - uint64_t *workspaceSize, atb::Operation **op, atb::Context *context); + uint64_t &workspaceSize, atb::Operation **op, atb::Context *context); //! //! \brief MLA prefill 处理接口 @@ -214,7 +214,7 @@ atb::Status AtbMLAPreprocessGetWorkspaceSize( const aclTensor *kvCacheRope, const aclTensor *slotmapping, const aclTensor *ctkvScale, const aclTensor *qNopeScale, uint32_t wdqDim, uint32_t qRopeDim, uint32_t kRopeDim, float epsilon, uint32_t qRotaryCoeff, uint32_t kRotaryCoeff, bool transposeWdq, bool transposeWuq, bool transposeWuk, uint8_t cacheMode, uint16_t quantMode, aclTensor *qOut0, - aclTensor *kvCacheOut0, aclTensor *qOut1, aclTensor *kvCacheOut1, uint64_t *workspaceSize, atb::Operation **op, + aclTensor *kvCacheOut0, aclTensor *qOut1, aclTensor *kvCacheOut1, uint64_t &workspaceSize, atb::Operation **op, atb::Context *context); //! @@ -252,7 +252,7 @@ atb::Status AtbPagedCacheLoadGetWorkspaceSize(const aclTensor *keyCache, const a const aclTensor *blockTables, const aclTensor *contextLens, const aclTensor *key, const aclTensor *value, const aclTensor *seqStarts, int8_t kvCacheCfg, bool isSeqLensCumsumType, bool hasSeqStarts, - uint64_t *workspaceSize, atb::Operation **op, atb::Context *context); + uint64_t &workspaceSize, atb::Operation **op, atb::Context *context); //! //! 
\brief 关于PagedCacheLoad算子使用aclnn风格调用的2段式接口的第2段, @@ -300,7 +300,7 @@ atb::Status AtbRingMLAGetWorkspaceSize(const aclTensor *querySplit1, const aclTe const aclTensor *mask, const aclTensor *seqLen, const aclTensor *prevOut, const aclTensor *prevLse, int32_t headNum, int32_t kvHeadNum, float qkScale, int kernelType, int maskType, int inputLayout, int calcType, aclTensor *output, - aclTensor *softmaxLse, uint64_t *workspaceSize, atb::Operation **op, + aclTensor *softmaxLse, uint64_t &workspaceSize, atb::Operation **op, atb::Context *context); //! @@ -344,7 +344,7 @@ atb::Status AtbSelfAttentionPrefixEncoderGetWorkspaceSize(const aclTensor *query const aclTensor *mask, const aclTensor *seqLen, const aclTensor *kvSeqLen, const aclTensor *slopes, int maskType, int32_t headNum, int32_t kvHeadNum, - float qkScale, aclTensor *attnOut, uint64_t *workspaceSize, + float qkScale, aclTensor *attnOut, uint64_t &workspaceSize, atb::Operation **op, atb::Context *context); //! diff --git a/src/atb/operation/if_operation.cpp b/src/atb/operation/if_operation.cpp index aa929ade..a9c8f406 100644 --- a/src/atb/operation/if_operation.cpp +++ b/src/atb/operation/if_operation.cpp @@ -95,7 +95,7 @@ Status IfOperation::Setup(const VariantPack &variantPack, uint64_t &workspaceSiz } else { ATB_LOG(WARN) << GetLogPrefix() << "Operation already selected, resetting opSelected_..."; } - Status st = GetOperationFromCondition(&opSelected_); + Status st = GetOperationFromCondition(opSelected_); if (st != NO_ERROR) { ATB_LOG(ERROR) << GetLogPrefix() << "Failed to select operation based on condition!"; } @@ -155,7 +155,7 @@ std::shared_ptr IfOperation::CreateRunner(Context &context) const if (!opSelected_) { ATB_LOG(INFO) << GetLogPrefix() << "Operation not selected yet, executing create runner as part of graph, setting opSelected_..."; - Status st = GetOperationFromCondition(&opSelected_); + Status st = GetOperationFromCondition(opSelected_); if (st != NO_ERROR) { ATB_LOG(ERROR) << GetLogPrefix() << 
"Failed to select operation based on condition!"; } diff --git a/src/atb/operation/operation_base.h b/src/atb/operation/operation_base.h index 309fc0bf..f0f5d791 100644 --- a/src/atb/operation/operation_base.h +++ b/src/atb/operation/operation_base.h @@ -39,7 +39,7 @@ public: Status InferShape(const SVector &inTensorDescs, SVector &outTensorDescs) const override; Status Setup(const VariantPack &variantPack, uint64_t &workspaceSize, Context *context) override; Status Execute(const VariantPack &variantPack, uint8_t *workspace, uint64_t workspaceSize, - Context *context) override; + Context &context) override; Status SetOperationBaseIds(const std::vector &operationBaseIds, const int64_t nodeId); virtual nlohmann::json GetParamJson() const; const std::vector &GetOperationBaseIds(); diff --git a/src/atb/runner/ops_runner.h b/src/atb/runner/ops_runner.h index e2b93669..89a13a8d 100644 --- a/src/atb/runner/ops_runner.h +++ b/src/atb/runner/ops_runner.h @@ -99,7 +99,7 @@ private: Status UpdateDeviceRealAddr(const RunnerVariantPack &runnerVariantPack); Status RunKernel(KernelGraphNode &node, size_t nodeId, ContextBase *context) const; Status FillSingleKernelHostTilingBuffer(KernelGraphNode &node, size_t nodeId, uint8_t *kernelHostTilingBuffer, - size_t tilingSize, ContextBase *context); + size_t tilingSize, ContextBase &context); void MallocLocalInternalTensor(const KernelGraphNode &node, size_t nodeId, size_t tensorId, const Mki::Tensor &infershapedOutTensor, Mki::Tensor *outTensor); void MallocGlobalInternalTensor(const KernelGraphNode &node, size_t nodeId, size_t tensorId, diff --git a/src/cinterface/atb_acl_util.cpp b/src/cinterface/atb_acl_util.cpp index c4b28caf..40ec98f6 100644 --- a/src/cinterface/atb_acl_util.cpp +++ b/src/cinterface/atb_acl_util.cpp @@ -57,7 +57,7 @@ atb::Status aclTensorToAtbTensor(const aclTensor *aclTensorSrc, atb::Tensor *atb atbTensorDst->desc = desc; atbTensorDst->deviceData = aclTensorSrc->GetData(); atbTensorDst->hostData = nullptr; - 
int64_t tensorSize = GetTensorSize(aclTensorSrc); + int64_t tensorSize = GetTensorSize(*aclTensorSrc); int64_t dataTypeSize = static_cast(aclDataTypeSize(dataType)); if (tensorSize > MAX_TENSOR_SIZE / dataTypeSize) { ATB_LOG(ERROR) << "The size of a tensor * dataTypeSize should be no more than 256GB, but got tensor size: " @@ -97,7 +97,7 @@ atb::Status aclTensorToAtbTensorHost(const aclTensor *aclTensorSrc, atb::Tensor atbTensorDst->desc = desc; atbTensorDst->deviceData = nullptr; atbTensorDst->hostData = aclTensorSrc->GetData(); - int64_t tensorSize = GetTensorSize(aclTensorSrc); + int64_t tensorSize = GetTensorSize(*aclTensorSrc); int64_t dataTypeSize = static_cast(aclDataTypeSize(dataType)); if (tensorSize > MAX_TENSOR_SIZE / dataTypeSize) { ATB_LOG(ERROR) << "The size of a tensor * dataTypeSize should be no more than 256GB, but got tensor size: " diff --git a/src/kernels/mixkernels/fused_add_topk_div/tiling/fused_add_topk_div_tiling.cpp b/src/kernels/mixkernels/fused_add_topk_div/tiling/fused_add_topk_div_tiling.cpp index e3a57c7a..da682dcf 100644 --- a/src/kernels/mixkernels/fused_add_topk_div/tiling/fused_add_topk_div_tiling.cpp +++ b/src/kernels/mixkernels/fused_add_topk_div/tiling/fused_add_topk_div_tiling.cpp @@ -165,10 +165,10 @@ namespace AtbOps { template void FusedAddTopkDivTiling::GetTiling(TilingData *tilingDataPtr) { - GetTilingKey(tilingDataPtr); + GetTilingKey(*tilingDataPtr); GetUsedCore(); SplitUb(); - FillTilingData(tilingDataPtr); + FillTilingData(*tilingDataPtr); } template diff --git a/src/kernels/mixkernels/mla_preprocess/tiling/mla_preprocess_tiling.cpp b/src/kernels/mixkernels/mla_preprocess/tiling/mla_preprocess_tiling.cpp index 4c9b0727..2b264942 100644 --- a/src/kernels/mixkernels/mla_preprocess/tiling/mla_preprocess_tiling.cpp +++ b/src/kernels/mixkernels/mla_preprocess/tiling/mla_preprocess_tiling.cpp @@ -624,7 +624,7 @@ Mki::Status MlaPreprocessTiling::Init(const Mki::LaunchParam &launchParam, Mki:: false, // enDequant deqOnTheFly); 
// in bf16.cce? mm3TilingApi.GetTilingData(tilingParam->mm3); - SetTiling(tilingParam); + SetTiling(*tilingParam); MKI_LOG(INFO) << *tilingParam; SetMlapoWorkSpace(inDtype, param, kernelInfo); kernelInfo.SetBlockDim(aicNum); diff --git a/src/kernels/mixkernels/rope/tiling/rope_tiling.cpp b/src/kernels/mixkernels/rope/tiling/rope_tiling.cpp index 147efa1e..43704ab2 100644 --- a/src/kernels/mixkernels/rope/tiling/rope_tiling.cpp +++ b/src/kernels/mixkernels/rope/tiling/rope_tiling.cpp @@ -148,7 +148,7 @@ Status RopeTiling(const LaunchParam &launchParam, KernelInfo &kernelInfo) MKI_LOG(DEBUG) << "cosFormat is " << tilingDataPtr->cosFormat; uint32_t headNumQ = 1; uint32_t headNumK = 1; - RopeNdProcess(launchParam, kernelInfo, tilingDataPtr); + RopeNdProcess(launchParam, kernelInfo, *tilingDataPtr); if (tilingDataPtr->headDim != 0) { headNumQ = tilingDataPtr->hiddenSizeQ / tilingDataPtr->headDim; headNumK = tilingDataPtr->hiddenSizeK / tilingDataPtr->headDim; diff --git a/src/kernels/mixkernels/swi_glu_quant/tiling/swi_glu_quant_tiling.cpp b/src/kernels/mixkernels/swi_glu_quant/tiling/swi_glu_quant_tiling.cpp index 51b6cc14..05924aae 100644 --- a/src/kernels/mixkernels/swi_glu_quant/tiling/swi_glu_quant_tiling.cpp +++ b/src/kernels/mixkernels/swi_glu_quant/tiling/swi_glu_quant_tiling.cpp @@ -94,8 +94,8 @@ Status SwiGluQuantTiling(const LaunchParam &launchParam, KernelInfo &kernelInfo) tilingData->cacheLineLen = L2_CACHE_LINE_SIZE / SIZE_OF_FLOAT16; const Mki::SVector &xShape = launchParam.GetInTensor(0).desc.dims; MKI_CHECK_NO_LOG(SetTotalShape(xShape, *tilingData), return Status::FailStatus(ERROR_INVALID_VALUE)); - MKI_CHECK_NO_LOG(CalTilingData(tilingData), return Status::FailStatus(ERROR_INVALID_VALUE)); - SetTilingData(tilingData); + MKI_CHECK_NO_LOG(CalTilingData(*tilingData), return Status::FailStatus(ERROR_INVALID_VALUE)); + SetTilingData(*tilingData); SwigluQuantTilingKeyChose(launchParam, kernelInfo); kernelInfo.SetBlockDim(tilingData->coreNumUsed); diff --git 
a/src/ops_infer/linear_parallel/linear_parallel_aclnn_runner.cpp b/src/ops_infer/linear_parallel/linear_parallel_aclnn_runner.cpp index c98fb564..333f1bb8 100644 --- a/src/ops_infer/linear_parallel/linear_parallel_aclnn_runner.cpp +++ b/src/ops_infer/linear_parallel/linear_parallel_aclnn_runner.cpp @@ -193,13 +193,13 @@ Status LinearParallelAclnnRunner::LoadMethodMatmulReduceScatter() static DlManager dlManager = DlManager(std::string(std::getenv("ASCEND_HOME_PATH")) + "/lib64/libopapi.so"); Status ret = dlManager.getSymbol("aclnnMatmulReduceScatterV2GetWorkspaceSize", - (void *&)&LinearParallelAclnnRunner::aclnnMatmulReduceScatterV2GetWorkspaceSizeFunc_); + (void *&)LinearParallelAclnnRunner::aclnnMatmulReduceScatterV2GetWorkspaceSizeFunc_); if (ret != NO_ERROR) { ATB_LOG(ERROR) << "load aclnnMatmulReduceScatterV2GetWorkspaceSize failed! Consider upgrade the CANN first!"; return ret; } ret = dlManager.getSymbol("aclnnMatmulReduceScatterV2", - (void *&)&LinearParallelAclnnRunner::aclnnMatmulReduceScatterV2Func_); + (void *&)LinearParallelAclnnRunner::aclnnMatmulReduceScatterV2Func_); if (ret != NO_ERROR) { ATB_LOG(ERROR) << "load aclnnMatmulReduceScatterV2 failed! 
Consider upgrade the CANN first!"; return ret; diff --git a/src/ops_infer/mla_preprocess/mla_preprocess_aclnn_runner.cpp b/src/ops_infer/mla_preprocess/mla_preprocess_aclnn_runner.cpp index 7f7957e0..552d2594 100644 --- a/src/ops_infer/mla_preprocess/mla_preprocess_aclnn_runner.cpp +++ b/src/ops_infer/mla_preprocess/mla_preprocess_aclnn_runner.cpp @@ -206,12 +206,12 @@ Status MlaPreprocessAclnnRunner::LoadMethod() } static DlManager dlManager = DlManager(std::string(std::getenv("ASCEND_HOME_PATH")) + "/lib64/libopapi.so"); Status ret = dlManager.getSymbol("aclnnMlaPreprocessGetWorkspaceSize", - (void *&)&MlaPreprocessAclnnRunner::aclnnGetWorkspaceSizeFunc_); + (void *&)MlaPreprocessAclnnRunner::aclnnGetWorkspaceSizeFunc_); if (ret != NO_ERROR) { ATB_LOG(ERROR) << "load aclnnMlaPreprocessGetWorkspaceSize failed! Consider upgrade the CANN first!"; return ret; } - ret = dlManager.getSymbol("aclnnMlaPreprocess", (void *&)&MlaPreprocessAclnnRunner::aclnnExecuteFunc_); + ret = dlManager.getSymbol("aclnnMlaPreprocess", (void *&)MlaPreprocessAclnnRunner::aclnnExecuteFunc_); if (ret != NO_ERROR) { ATB_LOG(ERROR) << "load aclnnMlaPreprocess failed! Consider upgrade the CANN first!"; return ret; -- Gitee From edf688c291f5c48b35e89425635c3241fe35da32 Mon Sep 17 00:00:00 2001 From: huangxiaolan Date: Mon, 22 Sep 2025 15:14:14 +0800 Subject: [PATCH 29/94] fix operation.h --- include/atb/operation.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/atb/operation.h b/include/atb/operation.h index f3aeec56..c1b48e1d 100644 --- a/include/atb/operation.h +++ b/include/atb/operation.h @@ -95,7 +95,7 @@ public: //! \return 状态值,如果成功,返回NO_ERROR //! virtual Status Execute(const VariantPack &variantPack, uint8_t *workspace, uint64_t workspaceSize, - Context *context) = 0; + Context &context) = 0; }; //! 
-- Gitee From 8a69220c6c6c275d1e860557c489d1c175f86c90 Mon Sep 17 00:00:00 2001 From: He Changcheng Date: Tue, 9 Sep 2025 11:36:49 +0800 Subject: [PATCH 30/94] fix --- .../src/ascendc_kernels/allreduce_big_data.h | 18 ++++++++++++++++++ src/kernels/lcal/src/ascendc_kernels/lccl_op.h | 9 ++++++++- 2 files changed, 26 insertions(+), 1 deletion(-) diff --git a/src/kernels/lcal/src/ascendc_kernels/allreduce_big_data.h b/src/kernels/lcal/src/ascendc_kernels/allreduce_big_data.h index f8ce0276..5623ab45 100644 --- a/src/kernels/lcal/src/ascendc_kernels/allreduce_big_data.h +++ b/src/kernels/lcal/src/ascendc_kernels/allreduce_big_data.h @@ -30,6 +30,8 @@ public: DumpLcclLogInfo(LogId::INIT, static_cast(op)); if constexpr(!std::is_same_v) { BuildScaleOffset(scale, scaleCount, offset); + this->input = input; + this->output = output; } if (blockIdx >= PING_PONG_SIZE * rankSize) { @@ -124,6 +126,22 @@ public: } DumpLcclLogInfo(LogId::PROCESS, static_cast(atomOp)); } + + FORCE_INLINE_AICORE void SupportBigScale() + { + if constexpr(!std::is_same_v) { + constexpr int32_t bigScaleFlagOffset = 2; + if (blockIdx == 0) { + inputGt.SetGlobalBuffer((__gm__ U*)input); + outputGt.SetGlobalBuffer((__gm__ T*)output); + CpGM2GMWithScale(len, inputGt, outputGt, COPYONLY); + sync.SetSyncFlag(magic, 0, blockNum * bigScaleFlagOffset, rank); + } else { + sync.WaitSyncFlag(magic, 0, blockNum * bigScaleFlagOffset, rank); + } + } + return; + } private: FORCE_INLINE_AICORE void Producer() { diff --git a/src/kernels/lcal/src/ascendc_kernels/lccl_op.h b/src/kernels/lcal/src/ascendc_kernels/lccl_op.h index bf54ce2b..115c2690 100644 --- a/src/kernels/lcal/src/ascendc_kernels/lccl_op.h +++ b/src/kernels/lcal/src/ascendc_kernels/lccl_op.h @@ -129,6 +129,7 @@ extern "C" __global__ __aicore__ void LcalAllReduce_##type##suffix(KERNELS_ARGS_ constexpr int32_t cceSmallDataSize = 2 * 1024 * 1024; \ constexpr int32_t smallDataSize910a3 = 32 * 1024 * 1024; \ constexpr int32_t rankSize910a3 = 16; \ + constexpr 
int32_t scaleCountMax = 12 * 1024 * 1024; \ __gm__ type * shareAddrs[LCAL_MAX_RANK_SIZE]; \ GET_IPC_MEM_ARGS(type); \ if ((extraFlag & ExtraFlag::TOPO_PCIE) != 0) { \ @@ -142,8 +143,14 @@ extern "C" __global__ __aicore__ void LcalAllReduce_##type##suffix(KERNELS_ARGS_ CLASS_OP_QUANT_LAUNCH(AllReduceOneShot, half, int8_t); \ } else if (len * sizeof(type) <= quantSmallDataSize) { \ CLASS_OP_QUANT_LAUNCH(AllReduceTwoShot, half, int8_t); \ - } else { \ + } else if (scaleCount * rankSize <= scaleCountMax) { \ CLASS_OP_QUANT_LAUNCH(AllReduceBigData, half, int8_t); \ + } else { \ + AllReduceBigData opTmp(localRank, localRankSize, extraFlag); \ + opTmp.Init(KERNELS_ARGS_CALL()); \ + opTmp.SupportBigScale(); \ + input = output; \ + CLASS_OP_LAUNCH(AllReduceBigData, half); \ } \ } else if ((extraFlag & ExtraFlag::TOPO_910B2C) != 0 && rankSize > smallRankSize) { \ if (len * sizeof(type) < cceSmallDataSize) { \ -- Gitee From 74c01e931f88f3a5e99b271d1b39b8281dd95037 Mon Sep 17 00:00:00 2001 From: He Changcheng Date: Tue, 9 Sep 2025 11:54:52 +0800 Subject: [PATCH 31/94] fix --- src/kernels/lcal/src/ascendc_kernels/allreduce_big_data.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/kernels/lcal/src/ascendc_kernels/allreduce_big_data.h b/src/kernels/lcal/src/ascendc_kernels/allreduce_big_data.h index 5623ab45..f8d7c9d5 100644 --- a/src/kernels/lcal/src/ascendc_kernels/allreduce_big_data.h +++ b/src/kernels/lcal/src/ascendc_kernels/allreduce_big_data.h @@ -269,6 +269,8 @@ private: T offset = 0; bool isEnableScale = false; bool isVectorScale = false; + GM_ADDR input = nullptr; + GM_ADDR output = nullptr; }; #endif // LCCL_ALLREDUCE_BIG_DATA_H \ No newline at end of file -- Gitee From 1fb0916239760d2a32227275ee48375f0a19d2fd Mon Sep 17 00:00:00 2001 From: He Changcheng Date: Thu, 4 Sep 2025 11:14:04 +0800 Subject: [PATCH 32/94] add kernel control --- src/kernels/lcal/src/lccl.cpp | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git 
a/src/kernels/lcal/src/lccl.cpp b/src/kernels/lcal/src/lccl.cpp index 694bfc74..48abdad3 100644 --- a/src/kernels/lcal/src/lccl.cpp +++ b/src/kernels/lcal/src/lccl.cpp @@ -229,6 +229,8 @@ uint32_t Lccl::GetBlockNum(LcalType cclType, uint32_t rankSize, int64_t dataSize MKI_LOG(ERROR) << "comm is nullptr" << __LINE__; return 0; } + uint32_t limitVal = 0; + aclrtDevResLimitType limitType = aclrtDevResLimitType::ACL_RT_DEV_RES_VECTOR_CORE; uint32_t blockNum = GetKernelBlockNum(cclType, rankSize, dataSize, localRankSize, extraFlag); if (comm_->isEnableMix_) { constexpr uint32_t aivNumPerAic = 2; @@ -236,10 +238,16 @@ uint32_t Lccl::GetBlockNum(LcalType cclType, uint32_t rankSize, int64_t dataSize MKI_LOG(ERROR) << "Lccl not support odd block number at msprof op enabled!"; return 0; } - return blockNum / aivNumPerAic; - } else { - return blockNum; + blockNum = blockNum / aivNumPerAic; + limitType = aclrtDevResLimitType::ACL_RT_DEV_RES_CUBE_CORE; + } + aclrtGetResInCurrentThread(limitType, &limitVal); + if (blockNum > limitVal) { + MKI_LOG(ERROR) << "Insufficient blockDim: Required blockNum(" << blockNum << + ") exceeds limit (limitVal=" << limitVal << ", limitType=" << static_cast(limitType) << ")"; + return 0; } + return blockNum; } int Lccl::LoopBack(const void *sendBuff, void *recvBuff, int64_t count, HcclDataType dataType, aclrtStream stream) const -- Gitee From a8de569ea145ad97604650f8e3dbfda6d7ad06d3 Mon Sep 17 00:00:00 2001 From: He Changcheng Date: Mon, 15 Sep 2025 20:48:35 +0800 Subject: [PATCH 33/94] fix --- src/kernels/lcal/src/lccl.cpp | 51 +++++++++++++++++++++++++++++++---- 1 file changed, 46 insertions(+), 5 deletions(-) diff --git a/src/kernels/lcal/src/lccl.cpp b/src/kernels/lcal/src/lccl.cpp index 48abdad3..7faffd67 100644 --- a/src/kernels/lcal/src/lccl.cpp +++ b/src/kernels/lcal/src/lccl.cpp @@ -13,6 +13,7 @@ #include #include #include +#include #include #include @@ -25,6 +26,42 @@ using namespace chrono; using namespace Mki; namespace Lcal { +using 
PFN_aclrtGetResInCurrentThread = int(*)(aclrtDevResLimitType type, uint32_t *); +static PFN_aclrtGetResInCurrentThread g_aclGetResFunc = nullptr; +static void *g_libHandle = nullptr; +static std::mutex g_initMutex; + +bool InitAclFunctions() +{ + std::lock_guard lock(g_initMutex); + + if (g_libHandle != nullptr) { + return true; + } + + const char *libPath = "libascendcl.so"; + g_libHandle = dlopen(libPath, RTLD_LAZY | RTLD_LOCAL); + if (g_libHandle == nullptr) { + MKI_LOG(ERROR) << "Failed to load " << libPath << ": " << dlerror(); + return false; + } + + dlerror(); + + const char *funcName = "aclrtGetResInCurrentThread"; + g_aclGetResFunc = reinterpret_cast(dlsym(g_libHandle, funcName)); + const char *dlsymError = dlerror(); + if (dlsymError != nullptr) { + MKI_LOG(WARN) << "Failed to load " << funcName << ": " << dlsymError; + dlclose(g_libHandle); + g_libHandle = nullptr; + g_aclGetResFunc = nullptr; + return false; + } + + MKI_LOG(DEBUG) << "Successfully loaded " << libPath << "::" << funcName; + return true; +} uint32_t GetLocalReduceBlockDum(int64_t dataSize) { @@ -241,11 +278,15 @@ uint32_t Lccl::GetBlockNum(LcalType cclType, uint32_t rankSize, int64_t dataSize blockNum = blockNum / aivNumPerAic; limitType = aclrtDevResLimitType::ACL_RT_DEV_RES_CUBE_CORE; } - aclrtGetResInCurrentThread(limitType, &limitVal); - if (blockNum > limitVal) { - MKI_LOG(ERROR) << "Insufficient blockDim: Required blockNum(" << blockNum << - ") exceeds limit (limitVal=" << limitVal << ", limitType=" << static_cast(limitType) << ")"; - return 0; + if (InitAclFunctions() && g_aclGetResFunc != nullptr) { + g_aclGetResFunc(limitType, &limitVal); + MKI_LOG(ERROR) << "Required blockNum(" << blockNum << + ") limit:(limitVal=" << limitVal << ", limitType=" << static_cast(limitType) << ")"; + if (blockNum > limitVal) { + MKI_LOG(ERROR) << "Insufficient blockDim: Required blockNum(" << blockNum << + ") exceeds limit (limitVal=" << limitVal << ", limitType=" << static_cast(limitType) << ")"; + 
return 0; + } } return blockNum; } -- Gitee From 21fa5806b79d33bc5e00c133deaf7fb94933fa49 Mon Sep 17 00:00:00 2001 From: He Changcheng Date: Tue, 16 Sep 2025 11:18:10 +0800 Subject: [PATCH 34/94] fix clean code --- src/kernels/lcal/src/lccl.cpp | 25 ++++++++++++++++++++----- 1 file changed, 20 insertions(+), 5 deletions(-) diff --git a/src/kernels/lcal/src/lccl.cpp b/src/kernels/lcal/src/lccl.cpp index 7faffd67..0f9eb276 100644 --- a/src/kernels/lcal/src/lccl.cpp +++ b/src/kernels/lcal/src/lccl.cpp @@ -26,8 +26,8 @@ using namespace chrono; using namespace Mki; namespace Lcal { -using PFN_aclrtGetResInCurrentThread = int(*)(aclrtDevResLimitType type, uint32_t *); -static PFN_aclrtGetResInCurrentThread g_aclGetResFunc = nullptr; +using LCAL_GET_RES_IN_CUR_THREAD = int(*)(aclrtDevResLimitType type, uint32_t *); +static LCAL_GET_RES_IN_CUR_THREAD g_aclGetResFunc = nullptr; static void *g_libHandle = nullptr; static std::mutex g_initMutex; @@ -45,11 +45,12 @@ bool InitAclFunctions() MKI_LOG(ERROR) << "Failed to load " << libPath << ": " << dlerror(); return false; } - + + // 清理错误信息 dlerror(); const char *funcName = "aclrtGetResInCurrentThread"; - g_aclGetResFunc = reinterpret_cast(dlsym(g_libHandle, funcName)); + g_aclGetResFunc = reinterpret_cast(dlsym(g_libHandle, funcName)); const char *dlsymError = dlerror(); if (dlsymError != nullptr) { MKI_LOG(WARN) << "Failed to load " << funcName << ": " << dlsymError; @@ -63,6 +64,17 @@ bool InitAclFunctions() return true; } +void CleanupAclFunctions() +{ + std::lock_guard lock(g_initMutex); + + if (g_libHandle != nullptr) { + dlclose(g_libHandle); + g_libHandle = nullptr; + } + g_aclGetResFunc = nullptr; +} + uint32_t GetLocalReduceBlockDum(int64_t dataSize) { constexpr int oneDataSize = 190 * 1024; @@ -263,7 +275,7 @@ uint32_t Lccl::GetBlockNum(LcalType cclType, uint32_t rankSize, int64_t dataSize int localRankSize, uint32_t extraFlag) const { if (comm_ == nullptr) { - MKI_LOG(ERROR) << "comm is nullptr" << __LINE__; + 
MKI_LOG(ERROR) << "comm is nullptr " << __LINE__; return 0; } uint32_t limitVal = 0; @@ -530,6 +542,7 @@ Lccl::~Lccl() if (rankSize_ == -1 and comm_ != nullptr) { delete comm_; } + CleanupAclFunctions(); } Lccl::Lccl(LcalComm *comm) : comm_(comm) @@ -545,11 +558,13 @@ Lccl::Lccl(LcalComm *comm) : comm_(comm) } rankSize_ = -1; } + InitAclFunctions(); } Lccl::Lccl(LcalComm &comm) : comm_(&comm) { rank_ = comm.rank_; rankSize_ = comm.rankSize_; + InitAclFunctions(); } } \ No newline at end of file -- Gitee From 085d6f7ed12fa86f6b527d1dbc5fcd7d75185130 Mon Sep 17 00:00:00 2001 From: He Changcheng Date: Tue, 16 Sep 2025 19:09:05 +0800 Subject: [PATCH 35/94] fix --- src/kernels/lcal/include/lcal_comm.h | 1 + src/kernels/lcal/src/lcal_comm.cpp | 67 ++++++++++++++++++++++++++++ src/kernels/lcal/src/lccl.cpp | 52 --------------------- 3 files changed, 68 insertions(+), 52 deletions(-) diff --git a/src/kernels/lcal/include/lcal_comm.h b/src/kernels/lcal/include/lcal_comm.h index 6ec0fbd7..b5d0918c 100644 --- a/src/kernels/lcal/include/lcal_comm.h +++ b/src/kernels/lcal/include/lcal_comm.h @@ -63,6 +63,7 @@ private: int GetName(std::string &name, char names[LCAL_MAX_RANK_SIZE][IPC_NAME_SIZE]) const; int SyncCommArgs(); int InitDumpAddr(); + int CallAclRtGetRes(int type, uint32_t *resource) const; private: int rank_ = 0; // global rank id diff --git a/src/kernels/lcal/src/lcal_comm.cpp b/src/kernels/lcal/src/lcal_comm.cpp index b54380a0..ce1c2251 100644 --- a/src/kernels/lcal/src/lcal_comm.cpp +++ b/src/kernels/lcal/src/lcal_comm.cpp @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include @@ -57,6 +58,69 @@ static map g_localPeerMemMap; static map g_devList; static std::mutex g_mtx; +using LCAL_GET_RES_IN_CUR_THREAD = int(*)(aclrtDevResLimitType type, uint32_t *resource); +static LCAL_GET_RES_IN_CUR_THREAD g_aclGetResFunc = nullptr; +static void *g_libHandle = nullptr; +static std::mutex g_initMutex; + +bool InitAclFunctions() +{ + std::lock_guard 
lock(g_initMutex); + + if (g_libHandle != nullptr) { + return true; + } + + const char *libPath = "libascendcl.so"; + g_libHandle = dlopen(libPath, RTLD_LAZY | RTLD_LOCAL); + if (g_libHandle == nullptr) { + MKI_LOG(ERROR) << "Failed to load " << libPath << ": " << dlerror(); + return false; + } + + // 清理错误信息 + dlerror(); + + const char *funcName = "aclrtGetResInCurrentThread"; + g_aclGetResFunc = reinterpret_cast(dlsym(g_libHandle, funcName)); + const char *dlsymError = dlerror(); + if (dlsymError != nullptr) { + MKI_LOG(WARN) << "Failed to load " << funcName << ": " << dlsymError; + dlclose(g_libHandle); + g_libHandle = nullptr; + g_aclGetResFunc = nullptr; + return false; + } + + MKI_LOG(DEBUG) << "Successfully loaded " << libPath << "::" << funcName; + return true; +} + +void CleanupAclFunctions() +{ + std::lock_guard lock(g_initMutex); + + if (g_libHandle != nullptr) { + dlclose(g_libHandle); + g_libHandle = nullptr; + } + g_aclGetResFunc = nullptr; +} + +int LcalComm::CallAclRtGetRes(int type, uint32_t *resource) const +{ + if (g_aclGetResFunc != nullptr) { + if (type == ACL_RT_DEV_RES_CUBE_CORE || type == ACL_RT_DEV_RES_VECTOR_CORE) { + g_aclGetResFunc(static_cast(type), resource); + return LCAL_SUCCESS; + } else { + MKI_LOG(ERROR) << "aclrtGetResInCurrentThread not support type " << type; + return LCAL_ERROR_INTERNAL; + } + } + return LCAL_ERROR_NOT_FOUND; +} + static const std::unordered_map CHIP_MAP = { {"Ascend310P", ChipName::CHIP_310P3}, {"Ascend910B1", ChipName::CHIP_910B1}, @@ -303,6 +367,7 @@ int LcalComm::Init() if (inited_) { return LCAL_SUCCESS; } + InitAclFunctions(); if (rank_ < 0 || rank_ >= rankSize_ || rankSize_ <= 0 || rankSize_ > LCAL_MAX_RANK_SIZE) { MKI_LOG(ERROR) << "The rank is invalid! 
rank:" << rank_ << " rankSize:" << rankSize_; return LCAL_ERROR_PARA_CHECK_FAIL; @@ -351,6 +416,7 @@ int LcalComm::InitThread(const std::string &uid) if (inited_) { return LCAL_SUCCESS; } + InitAclFunctions(); if (rank_ < 0 || rank_ >= rankSize_ || rankSize_ <= 0 || rankSize_ > LCAL_MAX_RANK_SIZE) { MKI_LOG(ERROR) << "The rank is invalid! rank:" << rank_ << "rankSize:" << rankSize_; return LCAL_ERROR_PARA_CHECK_FAIL; @@ -723,6 +789,7 @@ LcalComm::~LcalComm() FreePeerMem(commArgs_.dumpAddr); FreePeerMem(peerMem_[rank_]); FreePeerMem(commArgsPtr_); + CleanupAclFunctions(); } LcalComm::LcalComm(int rank, int rankSize) : rank_(rank), rankSize_(rankSize) diff --git a/src/kernels/lcal/src/lccl.cpp b/src/kernels/lcal/src/lccl.cpp index 0f9eb276..93a090c9 100644 --- a/src/kernels/lcal/src/lccl.cpp +++ b/src/kernels/lcal/src/lccl.cpp @@ -26,55 +26,6 @@ using namespace chrono; using namespace Mki; namespace Lcal { -using LCAL_GET_RES_IN_CUR_THREAD = int(*)(aclrtDevResLimitType type, uint32_t *); -static LCAL_GET_RES_IN_CUR_THREAD g_aclGetResFunc = nullptr; -static void *g_libHandle = nullptr; -static std::mutex g_initMutex; - -bool InitAclFunctions() -{ - std::lock_guard lock(g_initMutex); - - if (g_libHandle != nullptr) { - return true; - } - - const char *libPath = "libascendcl.so"; - g_libHandle = dlopen(libPath, RTLD_LAZY | RTLD_LOCAL); - if (g_libHandle == nullptr) { - MKI_LOG(ERROR) << "Failed to load " << libPath << ": " << dlerror(); - return false; - } - - // 清理错误信息 - dlerror(); - - const char *funcName = "aclrtGetResInCurrentThread"; - g_aclGetResFunc = reinterpret_cast(dlsym(g_libHandle, funcName)); - const char *dlsymError = dlerror(); - if (dlsymError != nullptr) { - MKI_LOG(WARN) << "Failed to load " << funcName << ": " << dlsymError; - dlclose(g_libHandle); - g_libHandle = nullptr; - g_aclGetResFunc = nullptr; - return false; - } - - MKI_LOG(DEBUG) << "Successfully loaded " << libPath << "::" << funcName; - return true; -} - -void CleanupAclFunctions() -{ - 
std::lock_guard lock(g_initMutex); - - if (g_libHandle != nullptr) { - dlclose(g_libHandle); - g_libHandle = nullptr; - } - g_aclGetResFunc = nullptr; -} - uint32_t GetLocalReduceBlockDum(int64_t dataSize) { constexpr int oneDataSize = 190 * 1024; @@ -542,7 +493,6 @@ Lccl::~Lccl() if (rankSize_ == -1 and comm_ != nullptr) { delete comm_; } - CleanupAclFunctions(); } Lccl::Lccl(LcalComm *comm) : comm_(comm) @@ -558,13 +508,11 @@ Lccl::Lccl(LcalComm *comm) : comm_(comm) } rankSize_ = -1; } - InitAclFunctions(); } Lccl::Lccl(LcalComm &comm) : comm_(&comm) { rank_ = comm.rank_; rankSize_ = comm.rankSize_; - InitAclFunctions(); } } \ No newline at end of file -- Gitee From 243a96120979278367a73986ed6f989e5ee20810 Mon Sep 17 00:00:00 2001 From: He Changcheng Date: Tue, 16 Sep 2025 19:13:51 +0800 Subject: [PATCH 36/94] fix --- src/kernels/lcal/src/lccl.cpp | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/kernels/lcal/src/lccl.cpp b/src/kernels/lcal/src/lccl.cpp index 93a090c9..5c6d9235 100644 --- a/src/kernels/lcal/src/lccl.cpp +++ b/src/kernels/lcal/src/lccl.cpp @@ -13,7 +13,6 @@ #include #include #include -#include #include #include @@ -26,6 +25,7 @@ using namespace chrono; using namespace Mki; namespace Lcal { + uint32_t GetLocalReduceBlockDum(int64_t dataSize) { constexpr int oneDataSize = 190 * 1024; @@ -241,9 +241,10 @@ uint32_t Lccl::GetBlockNum(LcalType cclType, uint32_t rankSize, int64_t dataSize blockNum = blockNum / aivNumPerAic; limitType = aclrtDevResLimitType::ACL_RT_DEV_RES_CUBE_CORE; } - if (InitAclFunctions() && g_aclGetResFunc != nullptr) { - g_aclGetResFunc(limitType, &limitVal); - MKI_LOG(ERROR) << "Required blockNum(" << blockNum << + + int res = comm_->CallAclRtGetRes(static_cast(limitType), &limitVal); + if (res == LCAL_SUCCESS) { + MKI_LOG(DEBUG) << "Required blockNum(" << blockNum << ") limit:(limitVal=" << limitVal << ", limitType=" << static_cast(limitType) << ")"; if (blockNum > limitVal) { MKI_LOG(ERROR) << 
"Insufficient blockDim: Required blockNum(" << blockNum << -- Gitee From fb83038cd01803f060644114cc057aea5884a052 Mon Sep 17 00:00:00 2001 From: He Changcheng Date: Tue, 16 Sep 2025 19:21:49 +0800 Subject: [PATCH 37/94] fix --- src/kernels/lcal/src/lccl.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/kernels/lcal/src/lccl.cpp b/src/kernels/lcal/src/lccl.cpp index 5c6d9235..66eb8d8e 100644 --- a/src/kernels/lcal/src/lccl.cpp +++ b/src/kernels/lcal/src/lccl.cpp @@ -241,7 +241,7 @@ uint32_t Lccl::GetBlockNum(LcalType cclType, uint32_t rankSize, int64_t dataSize blockNum = blockNum / aivNumPerAic; limitType = aclrtDevResLimitType::ACL_RT_DEV_RES_CUBE_CORE; } - + int res = comm_->CallAclRtGetRes(static_cast(limitType), &limitVal); if (res == LCAL_SUCCESS) { MKI_LOG(DEBUG) << "Required blockNum(" << blockNum << -- Gitee From b7e850f52cde34b51f35629264407dc94309eec6 Mon Sep 17 00:00:00 2001 From: He Changcheng Date: Thu, 18 Sep 2025 16:32:06 +0800 Subject: [PATCH 38/94] include order --- src/kernels/lcal/src/lcal_comm.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/kernels/lcal/src/lcal_comm.cpp b/src/kernels/lcal/src/lcal_comm.cpp index ce1c2251..57a3a15f 100644 --- a/src/kernels/lcal/src/lcal_comm.cpp +++ b/src/kernels/lcal/src/lcal_comm.cpp @@ -14,13 +14,13 @@ #include #include #include -#include #include #include #include #include #include +#include #include #include "mki/utils/log/log.h" #include "mki/utils/env/env.h" -- Gitee From ce08766da0f35debafe9fb59ffff9cc9c79eca38 Mon Sep 17 00:00:00 2001 From: huangxiaolan Date: Mon, 22 Sep 2025 16:43:12 +0800 Subject: [PATCH 39/94] recover changes --- include/atb/atb_acl.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/atb/atb_acl.h b/include/atb/atb_acl.h index 377d37d7..bf62b985 100644 --- a/include/atb/atb_acl.h +++ b/include/atb/atb_acl.h @@ -101,7 +101,7 @@ atb::Status AtbMLAGetWorkspaceSize(const aclTensor *qNope, const aclTensor *qRop 
const aclTensor *mask, const aclTensor *qSeqLen, const aclTensor *qkDescale, const aclTensor *pvDescale, int32_t headNum, float qkScale, int32_t kvHeadNum, int maskType, int calcType, uint8_t cacheMode, aclTensor *attenOut, aclTensor *lse, - uint64_t &workspaceSize, atb::Operation **op, atb::Context &context); + uint64_t &workspaceSize, atb::Operation **op, atb::Context *context); //! //! \brief 关于MLA算子使用aclnn风格调用的2段式接口的第2段, -- Gitee From d3a634074c2c9c7437f196034b41eabf91988393 Mon Sep 17 00:00:00 2001 From: Vector Date: Mon, 22 Sep 2025 17:06:21 +0800 Subject: [PATCH 40/94] update --- example/op_demo/linear/README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/example/op_demo/linear/README.md b/example/op_demo/linear/README.md index f7c45a0d..e1547d92 100644 --- a/example/op_demo/linear/README.md +++ b/example/op_demo/linear/README.md @@ -185,6 +185,7 @@ - linear_dequant_ds_demo.cpp + 该demo支持Atlas A2/A3系列和Atlas 推理系列产品上运行。 **参数设置**: | 成员名称 | 取值 | @@ -204,4 +205,4 @@ | `weight` | int8 | nd | [7168, 16384] | npu | | `bias` | int32 | nd | [1, 7168] | npu | | `deqScale` | int64 | nd | [1, 7168] | npu | - | `output` | fp16 | nd | [32, 7168] | npu | + | `output` | float16 | nd | [32, 7168] | npu | -- Gitee From 09751e58329573bfa0868b8803869787509a57a3 Mon Sep 17 00:00:00 2001 From: He Changcheng Date: Mon, 22 Sep 2025 17:21:56 +0800 Subject: [PATCH 41/94] fix --- src/kernels/lcal/include/lcal_comm.h | 3 +- src/kernels/lcal/src/lcal_comm.cpp | 73 ++-------------------------- src/kernels/lcal/src/lccl.cpp | 40 +++++++++++++++ 3 files changed, 44 insertions(+), 72 deletions(-) diff --git a/src/kernels/lcal/include/lcal_comm.h b/src/kernels/lcal/include/lcal_comm.h index b5d0918c..bff77eea 100644 --- a/src/kernels/lcal/include/lcal_comm.h +++ b/src/kernels/lcal/include/lcal_comm.h @@ -63,8 +63,7 @@ private: int GetName(std::string &name, char names[LCAL_MAX_RANK_SIZE][IPC_NAME_SIZE]) const; int SyncCommArgs(); int InitDumpAddr(); - int 
CallAclRtGetRes(int type, uint32_t *resource) const; - + private: int rank_ = 0; // global rank id int rankSize_ = 0; // global rank size diff --git a/src/kernels/lcal/src/lcal_comm.cpp b/src/kernels/lcal/src/lcal_comm.cpp index 57a3a15f..8b77500a 100644 --- a/src/kernels/lcal/src/lcal_comm.cpp +++ b/src/kernels/lcal/src/lcal_comm.cpp @@ -20,7 +20,6 @@ #include #include -#include #include #include "mki/utils/log/log.h" #include "mki/utils/env/env.h" @@ -58,69 +57,6 @@ static map g_localPeerMemMap; static map g_devList; static std::mutex g_mtx; -using LCAL_GET_RES_IN_CUR_THREAD = int(*)(aclrtDevResLimitType type, uint32_t *resource); -static LCAL_GET_RES_IN_CUR_THREAD g_aclGetResFunc = nullptr; -static void *g_libHandle = nullptr; -static std::mutex g_initMutex; - -bool InitAclFunctions() -{ - std::lock_guard lock(g_initMutex); - - if (g_libHandle != nullptr) { - return true; - } - - const char *libPath = "libascendcl.so"; - g_libHandle = dlopen(libPath, RTLD_LAZY | RTLD_LOCAL); - if (g_libHandle == nullptr) { - MKI_LOG(ERROR) << "Failed to load " << libPath << ": " << dlerror(); - return false; - } - - // 清理错误信息 - dlerror(); - - const char *funcName = "aclrtGetResInCurrentThread"; - g_aclGetResFunc = reinterpret_cast(dlsym(g_libHandle, funcName)); - const char *dlsymError = dlerror(); - if (dlsymError != nullptr) { - MKI_LOG(WARN) << "Failed to load " << funcName << ": " << dlsymError; - dlclose(g_libHandle); - g_libHandle = nullptr; - g_aclGetResFunc = nullptr; - return false; - } - - MKI_LOG(DEBUG) << "Successfully loaded " << libPath << "::" << funcName; - return true; -} - -void CleanupAclFunctions() -{ - std::lock_guard lock(g_initMutex); - - if (g_libHandle != nullptr) { - dlclose(g_libHandle); - g_libHandle = nullptr; - } - g_aclGetResFunc = nullptr; -} - -int LcalComm::CallAclRtGetRes(int type, uint32_t *resource) const -{ - if (g_aclGetResFunc != nullptr) { - if (type == ACL_RT_DEV_RES_CUBE_CORE || type == ACL_RT_DEV_RES_VECTOR_CORE) { - 
g_aclGetResFunc(static_cast(type), resource); - return LCAL_SUCCESS; - } else { - MKI_LOG(ERROR) << "aclrtGetResInCurrentThread not support type " << type; - return LCAL_ERROR_INTERNAL; - } - } - return LCAL_ERROR_NOT_FOUND; -} - static const std::unordered_map CHIP_MAP = { {"Ascend310P", ChipName::CHIP_310P3}, {"Ascend910B1", ChipName::CHIP_910B1}, @@ -367,8 +303,7 @@ int LcalComm::Init() if (inited_) { return LCAL_SUCCESS; } - InitAclFunctions(); - if (rank_ < 0 || rank_ >= rankSize_ || rankSize_ <= 0 || rankSize_ > LCAL_MAX_RANK_SIZE) { + if (rank_ < 0 || rank_ >= rankSize_ || rankSize_ <= 0 || rankSize_ > LCAL_MAX_RANK_SIZE) { MKI_LOG(ERROR) << "The rank is invalid! rank:" << rank_ << " rankSize:" << rankSize_; return LCAL_ERROR_PARA_CHECK_FAIL; } @@ -416,8 +351,7 @@ int LcalComm::InitThread(const std::string &uid) if (inited_) { return LCAL_SUCCESS; } - InitAclFunctions(); - if (rank_ < 0 || rank_ >= rankSize_ || rankSize_ <= 0 || rankSize_ > LCAL_MAX_RANK_SIZE) { + if (rank_ < 0 || rank_ >= rankSize_ || rankSize_ <= 0 || rankSize_ > LCAL_MAX_RANK_SIZE) { MKI_LOG(ERROR) << "The rank is invalid! 
rank:" << rank_ << "rankSize:" << rankSize_; return LCAL_ERROR_PARA_CHECK_FAIL; } @@ -789,8 +723,7 @@ LcalComm::~LcalComm() FreePeerMem(commArgs_.dumpAddr); FreePeerMem(peerMem_[rank_]); FreePeerMem(commArgsPtr_); - CleanupAclFunctions(); -} + } LcalComm::LcalComm(int rank, int rankSize) : rank_(rank), rankSize_(rankSize) { diff --git a/src/kernels/lcal/src/lccl.cpp b/src/kernels/lcal/src/lccl.cpp index 66eb8d8e..62cf73d7 100644 --- a/src/kernels/lcal/src/lccl.cpp +++ b/src/kernels/lcal/src/lccl.cpp @@ -17,6 +17,7 @@ #include #include +#include #include "profiling/report_timing.h" @@ -26,6 +27,45 @@ using namespace Mki; namespace Lcal { +using AclrtGetResInCurrentThreadFunc = int(*)(int, uint32_t*); + +int GetAclResInCurThread(int type, uint32_t *resource) +{ + // 静态变量:保存函数指针和库句柄 + static std::unique_ptr mkiDl; + static AclrtGetResInCurrentThreadFunc aclrtGetResInCurrentThread = nullptr; + static std::mutex localMutex; // 线程安全锁 + + std::lock_guard lock(localMutex); // 加锁 + + // 首次调用时初始化 + if (!mkiDl) { + std::string libPath = std::string(Mki::GetEnv("ASCEND_HOME_PATH")) + "/runtime/lib64/libascendcl.so"; + mkiDl = std::make_unique(libPath, false); + if (!mkiDl->IsValid()) { // 检查库是否加载成功 + MKI_LOG(WARN) << "Failed to load libascendcl.so!"; + return LCAL_ERROR_NOT_FOUND; + } + aclrtGetResInCurrentThread = + (AclrtGetResInCurrentThreadFunc)mkiDl->GetSymbol("aclrtGetResInCurrentThread"); + if (aclrtGetResInCurrentThread == nullptr) { + MKI_LOG(WARN) << "Failed to get acl function!"; + return LCAL_ERROR_NOT_FOUND; + } + MKI_LOG(DEBUG) << "Successfully loaded libascendcl.so and resolved aclrtGetResInCurrentThread"; + } + + // 调用函数 + int getResRet = aclrtGetResInCurrentThread(type, resource); + if (getResRet != ACL_SUCCESS) { + MKI_LOG(ERROR) << "Failed to get resource in current thread for type:" << type << " err:" << getResRet; + return LCAL_ERROR_INTERNAL; + } else { + MKI_LOG(DEBUG) << "Get resource in current thread for type:" << type << " resource:" << *resource; + 
return LCAL_SUCCESS; + } +} + uint32_t GetLocalReduceBlockDum(int64_t dataSize) { constexpr int oneDataSize = 190 * 1024; -- Gitee From b1689b3c53f9c0ec9cb61abfc5f535e3d2ed114c Mon Sep 17 00:00:00 2001 From: huangxiaolan Date: Mon, 22 Sep 2025 17:28:27 +0800 Subject: [PATCH 42/94] fix point --- src/atb/operation/if_operation.cpp | 2 +- src/atb/operation/if_operation.h | 2 +- src/atb/operation/operation_base.cpp | 2 +- src/atb/runner/ops_runner.cpp | 2 +- src/atb/runner/plugin_runner.cpp | 2 +- src/cinterface/atb_acl_fused_add_topk_div.cpp | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/atb/operation/if_operation.cpp b/src/atb/operation/if_operation.cpp index a9c8f406..7a70570e 100644 --- a/src/atb/operation/if_operation.cpp +++ b/src/atb/operation/if_operation.cpp @@ -104,7 +104,7 @@ Status IfOperation::Setup(const VariantPack &variantPack, uint64_t &workspaceSiz } Status IfOperation::Execute(const VariantPack &variantPack, uint8_t *workspace, uint64_t workspaceSize, - Context *context) + Context &context) { ATB_LOG(INFO) << GetLogPrefix() << "Calling Execute..."; return opSelected_->Execute(variantPack, workspace, workspaceSize, context); diff --git a/src/atb/operation/if_operation.h b/src/atb/operation/if_operation.h index eab4a45f..8f5dfbe7 100644 --- a/src/atb/operation/if_operation.h +++ b/src/atb/operation/if_operation.h @@ -24,7 +24,7 @@ public: std::string GetName() const override; Status Setup(const VariantPack &variantPack, uint64_t &workspaceSize, Context *context) override; Status Execute(const VariantPack &variantPack, uint8_t *workspace, uint64_t workspaceSize, - Context *context) override; + Context &context) override; uint32_t GetInputNum() const override; uint32_t GetOutputNum() const override; void SetExecuteStreamId(uint32_t streamId) override; diff --git a/src/atb/operation/operation_base.cpp b/src/atb/operation/operation_base.cpp index b3afa56d..526b1be6 100644 --- a/src/atb/operation/operation_base.cpp +++ 
b/src/atb/operation/operation_base.cpp @@ -1083,7 +1083,7 @@ Status OperationBase::Execute(const VariantPack &variantPack, uint8_t *workspace } Status st = NO_ERROR; if (executeType == EXECUTE_NORMAL || executeType == EXECUTE_PRELAUNCH) { - st = PreLaunch(variantPack, workspace, workspaceSize, context); + st = PreLaunch(variantPack, workspace, workspaceSize, &context); if (st != NO_ERROR) { ATB_LOG(ERROR) << GetLogPrefix() << "PreLaunch fail, error code: " << st; return st; diff --git a/src/atb/runner/ops_runner.cpp b/src/atb/runner/ops_runner.cpp index 28347ad8..4c4fcbd4 100644 --- a/src/atb/runner/ops_runner.cpp +++ b/src/atb/runner/ops_runner.cpp @@ -263,7 +263,7 @@ Status OpsRunner::FillHostTilingBufferImpl(uint8_t *hostTilingBuffer, uint64_t t } uint8_t *kernelHostTilingBuffer = hostTilingBuffer + offset; - Status ret = FillSingleKernelHostTilingBuffer(node, nodeId, kernelHostTilingBuffer, tilingSize, context); + Status ret = FillSingleKernelHostTilingBuffer(node, nodeId, kernelHostTilingBuffer, tilingSize, *context); if (ret != NO_ERROR) { ATB_LOG(ERROR) << GetLogPrefix() << " node[" << nodeId << "] fill tiling buffer fail, error code:" << ret; return ret; diff --git a/src/atb/runner/plugin_runner.cpp b/src/atb/runner/plugin_runner.cpp index aeaa20c6..fb26b4f6 100644 --- a/src/atb/runner/plugin_runner.cpp +++ b/src/atb/runner/plugin_runner.cpp @@ -37,7 +37,7 @@ Status PluginRunner::ExecuteImpl(RunnerVariantPack &runnerVariantPack) variantPack_.inTensors = runnerVariantPack.inTensors; variantPack_.outTensors = runnerVariantPack.outTensors; return operation_->Execute(variantPack_, runnerVariantPack.workspaceBuffer, - runnerVariantPack.workspaceBufferSize, runnerVariantPack.context); + runnerVariantPack.workspaceBufferSize, *runnerVariantPack.context); } return ERROR_INVALID_PARAM; diff --git a/src/cinterface/atb_acl_fused_add_topk_div.cpp b/src/cinterface/atb_acl_fused_add_topk_div.cpp index fbbfca90..7d54559a 100644 --- 
a/src/cinterface/atb_acl_fused_add_topk_div.cpp +++ b/src/cinterface/atb_acl_fused_add_topk_div.cpp @@ -79,7 +79,7 @@ atb::Status AtbFusedAddTopkDiv(void *workspace, uint64_t workspaceSize, atb::Ope ATB_CHECK(op != nullptr, "AtbFusedAddTopkDiv expect op pointer not to be null!", return atb::ERROR_INVALID_OPERATION_ADDR); atb::VariantPack pack; - atb::Status st = op->Execute(pack, (uint8_t *)(workspace), workspaceSize, context); + atb::Status st = op->Execute(pack, (uint8_t *)(workspace), workspaceSize, *context); ATB_CHECK(st == atb::NO_ERROR, "AtbFusedAddTopkDiv Execute failed!", return st); return st; } -- Gitee From 6d92e12ed867ef22be1b3652a4d0d1c0b116baa3 Mon Sep 17 00:00:00 2001 From: huangxiaolan Date: Mon, 22 Sep 2025 17:42:43 +0800 Subject: [PATCH 43/94] recover extern changes --- comm/lcal/src/tools/socket/lcal_sock_exchange.cpp | 8 ++++---- include/atb/atb_acl.h | 14 +++++++------- include/atb/operation.h | 2 +- src/cinterface/atb_acl_fused_add_topk_div.cpp | 6 +++--- src/cinterface/atb_acl_mla.cpp | 8 ++++---- src/cinterface/atb_acl_mla_preprocess.cpp | 4 ++-- src/cinterface/atb_acl_paged_cache_load.cpp | 4 ++-- src/cinterface/atb_acl_ring_mla.cpp | 4 ++-- .../atb_acl_self_attention_prefix_encoder.cpp | 4 ++-- src/cinterface/atb_acl_util.cpp | 8 ++++---- 10 files changed, 31 insertions(+), 31 deletions(-) diff --git a/comm/lcal/src/tools/socket/lcal_sock_exchange.cpp b/comm/lcal/src/tools/socket/lcal_sock_exchange.cpp index ff5dec47..552fde6b 100644 --- a/comm/lcal/src/tools/socket/lcal_sock_exchange.cpp +++ b/comm/lcal/src/tools/socket/lcal_sock_exchange.cpp @@ -335,7 +335,7 @@ void LcalSockExchange::Cleanup() } } -int GetAddrFromString(LcalSocketAddress& ua, const char* ipPortPair) +int GetAddrFromString(LcalSocketAddress* ua, const char* ipPortPair) { std::string ip; uint16_t port; @@ -344,9 +344,9 @@ int GetAddrFromString(LcalSocketAddress& ua, const char* ipPortPair) MKI_LOG(ERROR) << "lcal ParseIpAndPort failed!"; return LCAL_ERROR_INTERNAL; } - 
ua.sin.sin_family = AF_INET; - ua.sin.sin_addr.s_addr = inet_addr(ip.c_str()); - ua.sin.sin_port = htons(port); + ua->sin.sin_family = AF_INET; + ua->sin.sin_addr.s_addr = inet_addr(ip.c_str()); + ua->sin.sin_port = htons(port); return LCAL_SUCCESS; } diff --git a/include/atb/atb_acl.h b/include/atb/atb_acl.h index bf62b985..dfd8f0d4 100644 --- a/include/atb/atb_acl.h +++ b/include/atb/atb_acl.h @@ -55,7 +55,7 @@ atb::Status AtbFusedAddTopkDivGetWorkspaceSize(const aclTensor *x, const aclTens const aclTensor *mappingTable, uint32_t groupNum, uint32_t groupTopk, uint32_t n, uint32_t k, int activationType, bool isNorm, float scale, bool enableExpertMapping, aclTensor *y, aclTensor *indices, - uint64_t &workspaceSize, atb::Operation **op, atb::Context *context); + uint64_t *workspaceSize, atb::Operation **op, atb::Context *context); //! //! \brief 关于FusedAddTopkDiv算子使用aclnn风格调用的2段式接口的第2段, @@ -101,7 +101,7 @@ atb::Status AtbMLAGetWorkspaceSize(const aclTensor *qNope, const aclTensor *qRop const aclTensor *mask, const aclTensor *qSeqLen, const aclTensor *qkDescale, const aclTensor *pvDescale, int32_t headNum, float qkScale, int32_t kvHeadNum, int maskType, int calcType, uint8_t cacheMode, aclTensor *attenOut, aclTensor *lse, - uint64_t &workspaceSize, atb::Operation **op, atb::Context *context); + uint64_t *workspaceSize, atb::Operation **op, atb::Context *context); //! //! \brief 关于MLA算子使用aclnn风格调用的2段式接口的第2段, @@ -142,7 +142,7 @@ atb::Status AtbMLAPreFillGetWorkspaceSize(const aclTensor *q, const aclTensor *q const aclTensor *kRope, const aclTensor *v, const aclTensor *qSeqLen, const aclTensor *kvSeqLen, const aclTensor *mask, int32_t headNum, float qkScale, int32_t kvHeadNum, int maskType, uint8_t cacheMode, aclTensor *attenOut, - uint64_t &workspaceSize, atb::Operation **op, atb::Context *context); + uint64_t *workspaceSize, atb::Operation **op, atb::Context *context); //! //! 
\brief MLA prefill 处理接口 @@ -214,7 +214,7 @@ atb::Status AtbMLAPreprocessGetWorkspaceSize( const aclTensor *kvCacheRope, const aclTensor *slotmapping, const aclTensor *ctkvScale, const aclTensor *qNopeScale, uint32_t wdqDim, uint32_t qRopeDim, uint32_t kRopeDim, float epsilon, uint32_t qRotaryCoeff, uint32_t kRotaryCoeff, bool transposeWdq, bool transposeWuq, bool transposeWuk, uint8_t cacheMode, uint16_t quantMode, aclTensor *qOut0, - aclTensor *kvCacheOut0, aclTensor *qOut1, aclTensor *kvCacheOut1, uint64_t &workspaceSize, atb::Operation **op, + aclTensor *kvCacheOut0, aclTensor *qOut1, aclTensor *kvCacheOut1, uint64_t *workspaceSize, atb::Operation **op, atb::Context *context); //! @@ -252,7 +252,7 @@ atb::Status AtbPagedCacheLoadGetWorkspaceSize(const aclTensor *keyCache, const a const aclTensor *blockTables, const aclTensor *contextLens, const aclTensor *key, const aclTensor *value, const aclTensor *seqStarts, int8_t kvCacheCfg, bool isSeqLensCumsumType, bool hasSeqStarts, - uint64_t &workspaceSize, atb::Operation **op, atb::Context *context); + uint64_t *workspaceSize, atb::Operation **op, atb::Context *context); //! //! \brief 关于PagedCacheLoad算子使用aclnn风格调用的2段式接口的第2段, @@ -300,7 +300,7 @@ atb::Status AtbRingMLAGetWorkspaceSize(const aclTensor *querySplit1, const aclTe const aclTensor *mask, const aclTensor *seqLen, const aclTensor *prevOut, const aclTensor *prevLse, int32_t headNum, int32_t kvHeadNum, float qkScale, int kernelType, int maskType, int inputLayout, int calcType, aclTensor *output, - aclTensor *softmaxLse, uint64_t &workspaceSize, atb::Operation **op, + aclTensor *softmaxLse, uint64_t *workspaceSize, atb::Operation **op, atb::Context *context); //! 
@@ -344,7 +344,7 @@ atb::Status AtbSelfAttentionPrefixEncoderGetWorkspaceSize(const aclTensor *query const aclTensor *mask, const aclTensor *seqLen, const aclTensor *kvSeqLen, const aclTensor *slopes, int maskType, int32_t headNum, int32_t kvHeadNum, - float qkScale, aclTensor *attnOut, uint64_t &workspaceSize, + float qkScale, aclTensor *attnOut, uint64_t *workspaceSize, atb::Operation **op, atb::Context *context); //! diff --git a/include/atb/operation.h b/include/atb/operation.h index c1b48e1d..f3aeec56 100644 --- a/include/atb/operation.h +++ b/include/atb/operation.h @@ -95,7 +95,7 @@ public: //! \return 状态值,如果成功,返回NO_ERROR //! virtual Status Execute(const VariantPack &variantPack, uint8_t *workspace, uint64_t workspaceSize, - Context &context) = 0; + Context *context) = 0; }; //! diff --git a/src/cinterface/atb_acl_fused_add_topk_div.cpp b/src/cinterface/atb_acl_fused_add_topk_div.cpp index 7d54559a..65f84243 100644 --- a/src/cinterface/atb_acl_fused_add_topk_div.cpp +++ b/src/cinterface/atb_acl_fused_add_topk_div.cpp @@ -21,7 +21,7 @@ atb::Status AtbFusedAddTopkDivGetWorkspaceSize(const aclTensor *x, const aclTens const aclTensor *mappingTable, uint32_t groupNum, uint32_t groupTopk, uint32_t n, uint32_t k, int activationType, bool isNorm, float scale, bool enableExpertMapping, aclTensor *y, aclTensor *indices, - uint64_t &workspaceSize, atb::Operation **op, atb::Context *context) + uint64_t *workspaceSize, atb::Operation **op, atb::Context *context) { atb::infer::FusedAddTopkDivParam param; param.groupNum = groupNum; @@ -69,7 +69,7 @@ atb::Status AtbFusedAddTopkDivGetWorkspaceSize(const aclTensor *x, const aclTens ATB_LOG(ERROR) << "AtbFusedAddTopkDivGetWorkspaceSize opeartion pointer is nullptr!"; return atb::ERROR_INVALID_OPERATION_ADDR; } - status = (*op)->Setup(pack, workspaceSize, context); + status = (*op)->Setup(pack, *workspaceSize, context); ATB_CHECK(status == atb::NO_ERROR, "AtbFusedAddTopkDiv Setup failed!", return status); return atb::NO_ERROR; 
} @@ -79,7 +79,7 @@ atb::Status AtbFusedAddTopkDiv(void *workspace, uint64_t workspaceSize, atb::Ope ATB_CHECK(op != nullptr, "AtbFusedAddTopkDiv expect op pointer not to be null!", return atb::ERROR_INVALID_OPERATION_ADDR); atb::VariantPack pack; - atb::Status st = op->Execute(pack, (uint8_t *)(workspace), workspaceSize, *context); + atb::Status st = op->Execute(pack, (uint8_t *)(workspace), workspaceSize, context); ATB_CHECK(st == atb::NO_ERROR, "AtbFusedAddTopkDiv Execute failed!", return st); return st; } diff --git a/src/cinterface/atb_acl_mla.cpp b/src/cinterface/atb_acl_mla.cpp index 1b115833..c8fa9695 100644 --- a/src/cinterface/atb_acl_mla.cpp +++ b/src/cinterface/atb_acl_mla.cpp @@ -27,7 +27,7 @@ atb::Status AtbMLAGetWorkspaceSize(const aclTensor *qNope, const aclTensor *qRop const aclTensor *mask, const aclTensor *qSeqLen, const aclTensor *qkDescale, const aclTensor *pvDescale, int32_t headNum, float qkScale, int32_t kvHeadNum, int maskType, int calcType, uint8_t cacheMode, aclTensor *attenOut, aclTensor *lse, - uint64_t &workspaceSize, atb::Operation **op, atb::Context *context) + uint64_t *workspaceSize, atb::Operation **op, atb::Context *context) { atb::infer::MultiLatentAttentionParam param; param.headNum = headNum; @@ -109,7 +109,7 @@ atb::Status AtbMLAGetWorkspaceSize(const aclTensor *qNope, const aclTensor *qRop ATB_LOG(ERROR) << "AtbMLAGetWorkspaceSize opeartion pointer is nullptr!"; return atb::ERROR_INVALID_OPERATION_ADDR; } - atb::Status st = (*op)->Setup(pack, workspaceSize, context); + atb::Status st = (*op)->Setup(pack, *workspaceSize, context); ATB_CHECK(st == atb::NO_ERROR, "AtbMLA Setup failed!", return st); return atb::NO_ERROR; } @@ -129,7 +129,7 @@ atb::Status AtbMLAPreFillGetWorkspaceSize(const aclTensor *q, const aclTensor *q const aclTensor *kRope, const aclTensor *v, const aclTensor *qSeqLen, const aclTensor *kvSeqLen, const aclTensor *mask, int32_t headNum, float qkScale, int32_t kvHeadNum, int maskType, uint8_t cacheMode, - 
aclTensor *attenOut, uint64_t &workspaceSize, atb::Operation **op, + aclTensor *attenOut, uint64_t *workspaceSize, atb::Operation **op, atb::Context *context) { atb::infer::MultiLatentAttentionParam param; @@ -183,7 +183,7 @@ atb::Status AtbMLAPreFillGetWorkspaceSize(const aclTensor *q, const aclTensor *q ATB_LOG(ERROR) << "AtbMLAPreFillGetWorkspaceSize opeartion pointer is nullptr!"; return atb::ERROR_INVALID_OPERATION_ADDR; } - atb::Status st = (*op)->Setup(pack, workspaceSize, context); + atb::Status st = (*op)->Setup(pack, *workspaceSize, context); ATB_CHECK(st == atb::NO_ERROR, "AtbMLAPreFill Setup failed!", return st); return atb::NO_ERROR; } diff --git a/src/cinterface/atb_acl_mla_preprocess.cpp b/src/cinterface/atb_acl_mla_preprocess.cpp index be40ee8a..32bd22c6 100644 --- a/src/cinterface/atb_acl_mla_preprocess.cpp +++ b/src/cinterface/atb_acl_mla_preprocess.cpp @@ -28,7 +28,7 @@ atb::Status AtbMLAPreprocessGetWorkspaceSize( const aclTensor *kvCacheRope, const aclTensor *slotmapping, const aclTensor *ctkvScale, const aclTensor *qNopeScale, uint32_t wdqDim, uint32_t qRopeDim, uint32_t kRopeDim, float epsilon, uint32_t qRotaryCoeff, uint32_t kRotaryCoeff, bool transposeWdq, bool transposeWuq, bool transposeWuk, uint8_t cacheMode, uint16_t quantMode, aclTensor *qOut0, - aclTensor *kvCacheOut0, aclTensor *qOut1, aclTensor *kvCacheOut1, uint64_t &workspaceSize, atb::Operation **op, + aclTensor *kvCacheOut0, aclTensor *qOut1, aclTensor *kvCacheOut1, uint64_t *workspaceSize, atb::Operation **op, atb::Context *context) { atb::infer::MlaPreprocessParam param; @@ -159,7 +159,7 @@ atb::Status AtbMLAPreprocessGetWorkspaceSize( ATB_LOG(ERROR) << "AtbMLAPreprocessGetWorkspaceSize opeartion pointer is nullptr!"; return atb::ERROR_INVALID_OPERATION_ADDR; } - atb::Status st = (*op)->Setup(pack, workspaceSize, context); + atb::Status st = (*op)->Setup(pack, *workspaceSize, context); ATB_CHECK(st == atb::NO_ERROR, "AtbMLAPreprocess Setup failed!", return st); return 
atb::NO_ERROR; } diff --git a/src/cinterface/atb_acl_paged_cache_load.cpp b/src/cinterface/atb_acl_paged_cache_load.cpp index 834c8f4d..df6d86d5 100644 --- a/src/cinterface/atb_acl_paged_cache_load.cpp +++ b/src/cinterface/atb_acl_paged_cache_load.cpp @@ -22,7 +22,7 @@ atb::Status AtbPagedCacheLoadGetWorkspaceSize(const aclTensor *keyCache, const a const aclTensor *blockTables, const aclTensor *contextLens, const aclTensor *key, const aclTensor *value, const aclTensor *seqStarts, int8_t kvCacheCfg, bool isSeqLensCumsumType, bool hasSeqStarts, - uint64_t &workspaceSize, atb::Operation **op, atb::Context *context) + uint64_t *workspaceSize, atb::Operation **op, atb::Context *context) { atb::infer::PagedCacheLoadParam param; param.kvCacheCfg = atb::infer::PagedCacheLoadParam::KvCacheCfg(kvCacheCfg); @@ -72,7 +72,7 @@ atb::Status AtbPagedCacheLoadGetWorkspaceSize(const aclTensor *keyCache, const a ATB_LOG(ERROR) << "AtbPagedCacheLoadGetWorkspaceSize opeartion pointer is nullptr!"; return atb::ERROR_INVALID_OPERATION_ADDR; } - atb::Status st = (*op)->Setup(pack, workspaceSize, context); + atb::Status st = (*op)->Setup(pack, *workspaceSize, context); ATB_CHECK(st == atb::NO_ERROR, "AtbPagedCacheLoad Setup failed!", return st); return atb::NO_ERROR; } diff --git a/src/cinterface/atb_acl_ring_mla.cpp b/src/cinterface/atb_acl_ring_mla.cpp index 677fcf1b..62468810 100644 --- a/src/cinterface/atb_acl_ring_mla.cpp +++ b/src/cinterface/atb_acl_ring_mla.cpp @@ -23,7 +23,7 @@ atb::Status AtbRingMLAGetWorkspaceSize(const aclTensor *querySplit1, const aclTe const aclTensor *mask, const aclTensor *seqLen, const aclTensor *prevOut, const aclTensor *prevLse, int32_t headNum, int32_t kvHeadNum, float qkScale, int kernelType, int maskType, int inputLayout, int calcType, aclTensor *output, - aclTensor *softmaxLse, uint64_t &workspaceSize, atb::Operation **op, + aclTensor *softmaxLse, uint64_t *workspaceSize, atb::Operation **op, atb::Context *context) { atb::infer::RingMLAParam param; @@ 
-80,7 +80,7 @@ atb::Status AtbRingMLAGetWorkspaceSize(const aclTensor *querySplit1, const aclTe ATB_LOG(ERROR) << "AtbRingMLAGetWorkspaceSize opeartion pointer is nullptr!"; return atb::ERROR_INVALID_OPERATION_ADDR; } - status = (*op)->Setup(pack, workspaceSize, context); + status = (*op)->Setup(pack, *workspaceSize, context); ATB_CHECK(status == atb::NO_ERROR, "AtbRingMLA Setup failed!", return status); return atb::NO_ERROR; } diff --git a/src/cinterface/atb_acl_self_attention_prefix_encoder.cpp b/src/cinterface/atb_acl_self_attention_prefix_encoder.cpp index ffc3e5df..73e4e366 100644 --- a/src/cinterface/atb_acl_self_attention_prefix_encoder.cpp +++ b/src/cinterface/atb_acl_self_attention_prefix_encoder.cpp @@ -23,7 +23,7 @@ atb::Status AtbSelfAttentionPrefixEncoderGetWorkspaceSize(const aclTensor *query const aclTensor *mask, const aclTensor *seqLen, const aclTensor *kvSeqLen, const aclTensor *slopes, int maskType, int32_t headNum, int32_t kvHeadNum, - float qkScale, aclTensor *attnOut, uint64_t &workspaceSize, + float qkScale, aclTensor *attnOut, uint64_t *workspaceSize, atb::Operation **op, atb::Context *context) { atb::infer::SelfAttentionParam param; @@ -94,7 +94,7 @@ atb::Status AtbSelfAttentionPrefixEncoderGetWorkspaceSize(const aclTensor *query ATB_LOG(ERROR) << "AtbSelfAttentionPrefixEncoderGetWorkspaceSize opeartion pointer is nullptr!"; return atb::ERROR_INVALID_OPERATION_ADDR; } - status = (*op)->Setup(pack, workspaceSize, context); + status = (*op)->Setup(pack, *workspaceSize, context); ATB_CHECK(status == atb::NO_ERROR, "AtbSelfAttentionPrefixEncoder Setup failed!", return status); return atb::NO_ERROR; } diff --git a/src/cinterface/atb_acl_util.cpp b/src/cinterface/atb_acl_util.cpp index 40ec98f6..d4061128 100644 --- a/src/cinterface/atb_acl_util.cpp +++ b/src/cinterface/atb_acl_util.cpp @@ -17,9 +17,9 @@ extern "C" { // 256GB const int64_t MAX_TENSOR_SIZE = 256uLL * 1024uLL * 1024uLL * 1024uLL; -int64_t GetTensorSize(const aclTensor &input) 
+int64_t GetTensorSize(const aclTensor *input) { - const op::Shape shape = input.GetViewShape(); + const op::Shape shape = input->GetViewShape(); const size_t dims = shape.GetDimNum(); int64_t size = 1; for (size_t i = 0; i < dims; ++i) { @@ -57,7 +57,7 @@ atb::Status aclTensorToAtbTensor(const aclTensor *aclTensorSrc, atb::Tensor *atb atbTensorDst->desc = desc; atbTensorDst->deviceData = aclTensorSrc->GetData(); atbTensorDst->hostData = nullptr; - int64_t tensorSize = GetTensorSize(*aclTensorSrc); + int64_t tensorSize = GetTensorSize(aclTensorSrc); int64_t dataTypeSize = static_cast(aclDataTypeSize(dataType)); if (tensorSize > MAX_TENSOR_SIZE / dataTypeSize) { ATB_LOG(ERROR) << "The size of a tensor * dataTypeSize should be no more than 256GB, but got tensor size: " @@ -97,7 +97,7 @@ atb::Status aclTensorToAtbTensorHost(const aclTensor *aclTensorSrc, atb::Tensor atbTensorDst->desc = desc; atbTensorDst->deviceData = nullptr; atbTensorDst->hostData = aclTensorSrc->GetData(); - int64_t tensorSize = GetTensorSize(*aclTensorSrc); + int64_t tensorSize = GetTensorSize(aclTensorSrc); int64_t dataTypeSize = static_cast(aclDataTypeSize(dataType)); if (tensorSize > MAX_TENSOR_SIZE / dataTypeSize) { ATB_LOG(ERROR) << "The size of a tensor * dataTypeSize should be no more than 256GB, but got tensor size: " -- Gitee From 2a10ca4d1fa83b0ffa6265002e7425246c6f6e48 Mon Sep 17 00:00:00 2001 From: He Changcheng Date: Mon, 22 Sep 2025 17:46:39 +0800 Subject: [PATCH 44/94] fix --- src/kernels/lcal/src/lccl.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/kernels/lcal/src/lccl.cpp b/src/kernels/lcal/src/lccl.cpp index 62cf73d7..8d515fb2 100644 --- a/src/kernels/lcal/src/lccl.cpp +++ b/src/kernels/lcal/src/lccl.cpp @@ -282,7 +282,7 @@ uint32_t Lccl::GetBlockNum(LcalType cclType, uint32_t rankSize, int64_t dataSize limitType = aclrtDevResLimitType::ACL_RT_DEV_RES_CUBE_CORE; } - int res = comm_->CallAclRtGetRes(static_cast(limitType), &limitVal); + int res = 
GetAclResInCurThread(static_cast(limitType), &limitVal); if (res == LCAL_SUCCESS) { MKI_LOG(DEBUG) << "Required blockNum(" << blockNum << ") limit:(limitVal=" << limitVal << ", limitType=" << static_cast(limitType) << ")"; -- Gitee From 623add70587cb6b198144530fc1a41d6efef5cc1 Mon Sep 17 00:00:00 2001 From: guo-jiong Date: Sat, 20 Sep 2025 17:06:43 +0800 Subject: [PATCH 45/94] change ops dir --- src/CMakeLists.txt | 6 +-- .../event_operation/event_operation.cpp | 0 .../event_operation/event_operation.h | 0 .../event_operation/event_runner.cpp | 0 .../ops_common/event_operation/event_runner.h | 0 .../activation/activation_operation.cpp | 0 .../activation/activation_operation.h | 0 .../activation/activation_ops_runner.cpp | 0 .../activation/activation_ops_runner.h | 0 .../all_gather/all_gather_hccl_runner.cpp | 0 .../all_gather/all_gather_hccl_runner.h | 0 .../all_gather/all_gather_lccl_runner.cpp | 0 .../all_gather/all_gather_lccl_runner.h | 0 .../all_gather/all_gather_operation.cpp | 0 .../all_gather/all_gather_operation.h | 0 .../all_gatherv/all_gatherv_hccl_runner.cpp | 0 .../all_gatherv/all_gatherv_hccl_runner.h | 0 .../all_gatherv/all_gatherv_operation.cpp | 0 .../all_gatherv/all_gatherv_operation.h | 0 .../all_reduce/all_reduce_hccl_runner.cpp | 0 .../all_reduce/all_reduce_hccl_runner.h | 0 .../all_reduce/all_reduce_lccl_runner.cpp | 0 .../all_reduce/all_reduce_lccl_runner.h | 0 .../all_reduce/all_reduce_operation.cpp | 0 .../all_reduce/all_reduce_operation.h | 0 .../all_to_all/all_to_all_hccl_runner.cpp | 0 .../all_to_all/all_to_all_hccl_runner.h | 0 .../all_to_all/all_to_all_lccl_runner.cpp | 0 .../all_to_all/all_to_all_lccl_runner.h | 0 .../all_to_all/all_to_all_operation.cpp | 0 .../all_to_all/all_to_all_operation.h | 0 .../all_to_allv/all_to_allv_hccl_runner.cpp | 0 .../all_to_allv/all_to_allv_hccl_runner.h | 0 .../all_to_allv/all_to_allv_operation.cpp | 0 .../all_to_allv/all_to_allv_operation.h | 0 .../all_to_allvv2_hccl_runner.cpp | 0 
.../all_to_allvv2/all_to_allvv2_hccl_runner.h | 0 .../all_to_allvv2/all_to_allvv2_operation.cpp | 0 .../all_to_allvv2/all_to_allvv2_operation.h | 0 .../as_strided/as_strided_operation.cpp | 0 .../as_strided/as_strided_operation.h | 0 .../as_strided/as_strided_ops_runner.cpp | 0 .../as_strided/as_strided_ops_runner.h | 0 .../block_copy/block_copy_operation.cpp | 0 .../block_copy/block_copy_operation.h | 0 .../block_copy/block_copy_ops_runner.cpp | 0 .../block_copy/block_copy_ops_runner.h | 0 .../broadcast/broadcast_hccl_runner.cpp | 0 .../broadcast/broadcast_hccl_runner.h | 0 .../broadcast/broadcast_lccl_runner.cpp | 0 .../broadcast/broadcast_lccl_runner.h | 0 .../broadcast/broadcast_operation.cpp | 0 .../ops_infer/broadcast/broadcast_operation.h | 0 .../cohere_layernorm_operation.cpp | 0 .../cohere_layernorm_operation.h | 0 .../cohere_layernorm_runner.cpp | 0 .../cohere_layernorm_runner.h | 0 .../ops_infer/concat/concat_operation.cpp | 0 .../ops_infer/concat/concat_operation.h | 0 .../ops_infer/concat/concat_ops_runner.cpp | 0 .../ops_infer/concat/concat_ops_runner.h | 0 .../ops_infer/cumsum/cumsum_operation.cpp | 0 .../ops_infer/cumsum/cumsum_operation.h | 0 .../ops_infer/cumsum/cumsum_ops_runner.cpp | 0 .../ops_infer/cumsum/cumsum_ops_runner.h | 0 .../dynamic_ntk/dynamic_ntk_operation.cpp | 0 .../dynamic_ntk/dynamic_ntk_operation.h | 0 .../dynamic_ntk/dynamic_ntk_ops_runner.cpp | 0 .../dynamic_ntk/dynamic_ntk_ops_runner.h | 0 .../ops_infer/elewise/elewise_operation.cpp | 0 .../ops_infer/elewise/elewise_operation.h | 0 .../ops_infer/elewise/elewise_ops_runner.cpp | 0 .../ops_infer/elewise/elewise_ops_runner.h | 0 .../ops_infer/faupdate/faupdate_operation.cpp | 0 .../ops_infer/faupdate/faupdate_operation.h | 0 .../faupdate/faupdate_ops_runner.cpp | 0 .../ops_infer/faupdate/faupdate_ops_runner.h | 0 .../ops_infer/fill/fill_operation.cpp | 0 src/{ => ops}/ops_infer/fill/fill_operation.h | 0 .../ops_infer/fill/fill_ops_runner.cpp | 0 
.../ops_infer/fill/fill_ops_runner.h | 0 .../atb_acl_fused_add_topk_div.cpp | 0 .../fused_add_topk_div_operation.cpp | 0 .../fused_add_topk_div_operation.h | 0 .../fused_add_topk_div_ops_runner.cpp | 0 .../fused_add_topk_div_ops_runner.h | 0 .../ops_infer/gather/gather_operation.cpp | 0 .../ops_infer/gather/gather_operation.h | 0 .../ops_infer/gather/gather_ops_runner.cpp | 0 .../ops_infer/gather/gather_ops_runner.h | 0 .../gather_pre_rms_norm_operation.cpp | 0 .../gather_pre_rms_norm_operation.h | 0 .../gather_pre_rms_norm_ops_runner.cpp | 0 .../gather_pre_rms_norm_ops_runner.h | 0 .../ops_infer/gating/gating_operation.cpp | 0 .../ops_infer/gating/gating_operation.h | 0 .../ops_infer/gating/gating_ops_runner.cpp | 0 .../ops_infer/gating/gating_ops_runner.h | 0 ...gmm_deq_swiglu_quant_gmm_deq_operation.cpp | 0 .../gmm_deq_swiglu_quant_gmm_deq_operation.h | 0 ...mm_deq_swiglu_quant_gmm_deq_ops_runner.cpp | 0 .../gmm_deq_swiglu_quant_gmm_deq_ops_runner.h | 0 .../group_topk/group_topk_operation.cpp | 0 .../group_topk/group_topk_operation.h | 0 .../group_topk/group_topk_ops_runner.cpp | 0 .../group_topk/group_topk_ops_runner.h | 0 .../grouped_matmul_inplace_add_operation.cpp | 0 .../grouped_matmul_inplace_add_operation.h | 0 .../grouped_matmul_inplace_add_ops_runner.cpp | 0 .../grouped_matmul_inplace_add_ops_runner.h | 0 .../grouped_matmul_with_routing_operation.cpp | 0 .../grouped_matmul_with_routing_operation.h | 0 .../grouped_matmul_with_routing_runner.cpp | 0 .../grouped_matmul_with_routing_runner.h | 0 .../index_add/index_add_operation.cpp | 0 .../ops_infer/index_add/index_add_operation.h | 0 .../index_add/index_add_ops_runner.cpp | 0 .../index_add/index_add_ops_runner.h | 0 .../ops_infer/kv_cache/kv_cache_operation.cpp | 0 .../ops_infer/kv_cache/kv_cache_operation.h | 0 .../kv_cache/kv_cache_ops_runner.cpp | 0 .../ops_infer/kv_cache/kv_cache_ops_runner.h | 0 .../layer_norm/layer_norm_operation.cpp | 0 .../layer_norm/layer_norm_operation.h | 0 
.../layer_norm/layer_norm_ops_runner.cpp | 0 .../layer_norm/layer_norm_ops_runner.h | 0 .../layer_norm_with_stride_operation.cpp | 0 .../layer_norm_with_stride_operation.h | 0 .../layer_norm_with_stride_ops_runner.cpp | 0 .../layer_norm_with_stride_ops_runner.h | 0 .../ops_infer/linear/linear_operation.cpp | 0 .../ops_infer/linear/linear_operation.h | 0 .../ops_infer/linear/linear_ops_runner.cpp | 0 .../ops_infer/linear/linear_ops_runner.h | 0 .../linear_parallel_aclnn_runner.cpp | 0 .../linear_parallel_aclnn_runner.h | 0 .../linear_parallel_graph_runner.cpp | 0 .../linear_parallel_graph_runner.h | 0 .../linear_parallel_lcoc_runner.cpp | 0 .../linear_parallel_lcoc_runner.h | 0 .../linear_parallel_operation.cpp | 0 .../linear_parallel_operation.h | 0 .../linear_sparse/linear_sparse_operation.cpp | 0 .../linear_sparse/linear_sparse_operation.h | 0 .../linear_sparse_ops_runner.cpp | 0 .../linear_sparse/linear_sparse_ops_runner.h | 0 .../mla_preprocess/atb_acl_mla_preprocess.cpp | 0 .../mla_preprocess_aclnn_runner.cpp | 0 .../mla_preprocess_aclnn_runner.h | 0 .../mla_preprocess_operation.cpp | 0 .../mla_preprocess/mla_preprocess_operation.h | 0 .../mla_preprocess_ops_runner.cpp | 0 .../mla_preprocess_ops_runner.h | 0 .../mla_preprocess_ops_runner_split.cpp | 0 .../mla_preprocess_ops_runner_split.h | 0 .../mm_deq_swiglu_quant_mm_deq_operation.cpp | 0 .../mm_deq_swiglu_quant_mm_deq_operation.h | 0 .../mm_deq_swiglu_quant_mm_deq_ops_runner.cpp | 0 .../mm_deq_swiglu_quant_mm_deq_ops_runner.h | 0 .../multi_latent_attention/atb_acl_mla.cpp | 0 .../multi_latent_attention_operation.cpp | 0 .../multi_latent_attention_operation.h | 0 .../multi_latent_attention_ops_runner.cpp | 0 .../multi_latent_attention_ops_runner.h | 0 ...ti_latent_attention_ops_runner_prefill.cpp | 0 ...ulti_latent_attention_ops_runner_prefill.h | 0 .../multi_latent_attention/param.cpp | 0 .../ops_infer/multi_latent_attention/param.h | 0 .../multinomial/multinomial_operation.cpp | 0 
.../multinomial/multinomial_operation.h | 0 .../multinomial/multinomial_ops_runner.cpp | 0 .../multinomial/multinomial_ops_runner.h | 0 .../ops_infer/nonzero/nonzero_operation.cpp | 0 .../ops_infer/nonzero/nonzero_operation.h | 0 .../ops_infer/nonzero/nonzero_runner.cpp | 0 .../ops_infer/nonzero/nonzero_runner.h | 0 .../norm_rope_reshape_operation.cpp | 7 ++- .../norm_rope_reshape_operation.h | 0 .../norm_rope_reshape_ops_runner.cpp | 7 ++- .../norm_rope_reshape_ops_runner.h | 0 .../ops_infer/onehot/onehot_operation.cpp | 0 .../ops_infer/onehot/onehot_operation.h | 0 .../ops_infer/onehot/onehot_ops_runner.cpp | 0 .../ops_infer/onehot/onehot_ops_runner.h | 0 src/{ => ops}/ops_infer/pad/pad_operation.cpp | 0 src/{ => ops}/ops_infer/pad/pad_operation.h | 0 .../ops_infer/pad/pad_ops_runner.cpp | 0 src/{ => ops}/ops_infer/pad/pad_ops_runner.h | 0 .../paged_attention_operation.cpp | 0 .../paged_attention_operation.h | 0 .../paged_attention_ops_runner.cpp | 0 .../paged_attention_ops_runner.h | 0 .../paged_attention_ops_runner_910a.cpp | 0 .../paged_attention_ops_runner_910a.h | 0 .../paged_attention_runner_utils.cpp | 0 .../paged_attention_runner_utils.h | 0 .../ops_infer/paged_attention/param.cpp | 0 .../ops_infer/paged_attention/param.h | 0 .../atb_acl_paged_cache_load.cpp | 0 .../paged_cache_load_operation.cpp | 0 .../paged_cache_load_operation.h | 0 .../paged_cache_load_ops_runner.cpp | 0 .../paged_cache_load_ops_runner.h | 0 .../razor_fusion_attention_operation.cpp | 0 .../razor_fusion_attention_operation.h | 0 .../razor_fusion_attention_ops_runner.cpp | 0 .../razor_fusion_attention_ops_runner.h | 0 .../ops_infer/recv/recv_hccl_runner.cpp | 0 .../ops_infer/recv/recv_hccl_runner.h | 0 .../ops_infer/recv/recv_operation.cpp | 0 src/{ => ops}/ops_infer/recv/recv_operation.h | 0 .../ops_infer/reduce/reduce_operation.cpp | 0 .../ops_infer/reduce/reduce_operation.h | 0 .../ops_infer/reduce/reduce_ops_runner.cpp | 0 .../ops_infer/reduce/reduce_ops_runner.h | 0 
.../reduce_scatter_hccl_runner.cpp | 0 .../reduce_scatter_hccl_runner.h | 0 .../reduce_scatter_lccl_runner.cpp | 0 .../reduce_scatter_lccl_runner.h | 0 .../reduce_scatter_operation.cpp | 0 .../reduce_scatter/reduce_scatter_operation.h | 0 .../reduce_scatterv_hccl_runner.cpp | 0 .../reduce_scatterv_hccl_runner.h | 0 .../reduce_scatterv_operation.cpp | 0 .../reduce_scatterv_operation.h | 0 .../ops_infer/relay_attention/param.cpp | 0 .../ops_infer/relay_attention/param.h | 0 .../relay_attention_operation.cpp | 0 .../relay_attention_operation.h | 0 .../relay_attention_ops_runner.cpp | 0 .../relay_attention_ops_runner.h | 0 .../ops_infer/repeat/repeat_operation.cpp | 0 .../ops_infer/repeat/repeat_operation.h | 0 .../ops_infer/repeat/repeat_ops_runner.cpp | 0 .../ops_infer/repeat/repeat_ops_runner.h | 0 .../reshape_and_cache_operation.cpp | 0 .../reshape_and_cache_operation.h | 0 .../reshape_and_cache_ops_runner.cpp | 0 .../reshape_and_cache_ops_runner.h | 0 .../reshape_and_cache_ops_runner_310p.cpp | 0 .../reshape_and_cache_ops_runner_310p.h | 0 .../reshape_and_cache_ops_runner_A2_NZ.cpp | 0 .../reshape_and_cache_ops_runner_A2_NZ.h | 0 .../reshape_and_cache_ops_runner_SISO.cpp | 0 .../reshape_and_cache_ops_runner_SISO.h | 0 .../reshape_and_cache_omni_operation.cpp | 0 .../reshape_and_cache_omni_operation.h | 0 .../reshape_and_cache_omni_ops_runner.cpp | 0 .../reshape_and_cache_omni_ops_runner.h | 0 ...eshape_and_cache_with_stride_operation.cpp | 0 .../reshape_and_cache_with_stride_operation.h | 0 ...shape_and_cache_with_stride_ops_runner.cpp | 0 ...reshape_and_cache_with_stride_ops_runner.h | 0 ..._and_cache_with_stride_ops_runner_SISO.cpp | 0 ...pe_and_cache_with_stride_ops_runner_SISO.h | 0 .../ops_infer/ring_mla/atb_acl_ring_mla.cpp | 0 src/{ => ops}/ops_infer/ring_mla/param.cpp | 0 src/{ => ops}/ops_infer/ring_mla/param.h | 0 .../ops_infer/ring_mla/ring_mla_operation.cpp | 0 .../ops_infer/ring_mla/ring_mla_operation.h | 0 .../ring_mla/ring_mla_ops_runner.cpp | 0 
.../ops_infer/ring_mla/ring_mla_ops_runner.h | 0 .../ops_infer/rms_norm/rms_norm_operation.cpp | 0 .../ops_infer/rms_norm/rms_norm_operation.h | 0 .../rms_norm/rms_norm_ops_runner.cpp | 0 .../ops_infer/rms_norm/rms_norm_ops_runner.h | 0 .../rms_norm_with_stride_operation.cpp | 0 .../rms_norm_with_stride_operation.h | 0 .../rms_norm_with_stride_ops_runner.cpp | 0 .../rms_norm_with_stride_ops_runner.h | 0 .../ops_infer/rope/rope_operation.cpp | 0 src/{ => ops}/ops_infer/rope/rope_operation.h | 0 .../ops_infer/rope/rope_ops_runner.cpp | 0 .../ops_infer/rope/rope_ops_runner.h | 0 .../rope_q_concat/rope_q_concat_operation.cpp | 0 .../rope_q_concat/rope_q_concat_operation.h | 0 .../rope_q_concat_ops_runner.cpp | 0 .../rope_q_concat/rope_q_concat_ops_runner.h | 0 .../scatter_elements_v2_operation.cpp | 0 .../scatter_elements_v2_operation.h | 0 .../scatter_elements_v2_ops_runner.cpp | 0 .../scatter_elements_v2_ops_runner.h | 0 .../atb_acl_self_attention_prefix_encoder.cpp | 0 .../ops_infer/self_attention/param.cpp | 0 .../ops_infer/self_attention/param.h | 0 ...tention_encoder_fuison_ops_runner_910a.cpp | 0 ...lf_attention_encoder_fusion_ops_runner.cpp | 0 ...self_attention_encoder_fusion_ops_runner.h | 0 ...attention_encoder_fusion_ops_runner_910a.h | 0 ...elf_attention_fusion_bypass_ops_runner.cpp | 0 .../self_attention_fusion_bypass_ops_runner.h | 0 ...ttention_fusion_bypass_ops_runner_910a.cpp | 0 ..._attention_fusion_bypass_ops_runner_910a.h | 0 ...ttention_fusion_bypass_ops_runner_BNSD.cpp | 0 ..._attention_fusion_bypass_ops_runner_BNSD.h | 0 ...ion_fusion_bypass_ops_runner_BNSD_910a.cpp | 0 ...ntion_fusion_bypass_ops_runner_BNSD_910a.h | 0 .../self_attention_fusion_ops_runner.cpp | 0 .../self_attention_fusion_ops_runner.h | 0 .../self_attention_fusion_ops_runner_910a.cpp | 0 .../self_attention_fusion_ops_runner_910a.h | 0 .../self_attention_operation.cpp | 0 .../self_attention/self_attention_operation.h | 0 ...lf_attention_prefix_encoder_ops_runner.cpp | 0 
...self_attention_prefix_encoder_ops_runner.h | 0 .../self_attention_runner_utils.cpp | 0 .../self_attention_runner_utils.h | 0 .../ops_infer/send/send_hccl_runner.cpp | 0 .../ops_infer/send/send_hccl_runner.h | 0 .../ops_infer/send/send_operation.cpp | 0 src/{ => ops}/ops_infer/send/send_operation.h | 0 .../set_value/set_value_operation.cpp | 0 .../ops_infer/set_value/set_value_operation.h | 0 .../set_value/set_value_ops_runner.cpp | 0 .../set_value/set_value_ops_runner.h | 0 .../ops_infer/slice/slice_operation.cpp | 0 .../ops_infer/slice/slice_operation.h | 0 .../ops_infer/slice/slice_ops_runner.cpp | 0 .../ops_infer/slice/slice_ops_runner.h | 0 .../ops_infer/softmax/softmax_operation.cpp | 0 .../ops_infer/softmax/softmax_operation.h | 0 .../ops_infer/softmax/softmax_ops_runner.cpp | 0 .../ops_infer/softmax/softmax_ops_runner.h | 0 .../ops_infer/sort/sort_operation.cpp | 0 src/{ => ops}/ops_infer/sort/sort_operation.h | 0 .../ops_infer/sort/sort_ops_runner.cpp | 0 .../ops_infer/sort/sort_ops_runner.h | 52 +++++++++---------- .../ops_infer/split/split_operation.cpp | 0 .../ops_infer/split/split_operation.h | 0 .../ops_infer/split/split_ops_runner.cpp | 0 .../ops_infer/split/split_ops_runner.h | 0 .../swiglu_quant/swiglu_quant_operation.cpp | 0 .../swiglu_quant/swiglu_quant_operation.h | 0 .../swiglu_quant/swiglu_quant_ops_runner.cpp | 0 .../swiglu_quant/swiglu_quant_ops_runner.h | 0 .../topk_topp_sampling_operation.cpp | 0 .../topk_topp_sampling_operation.h | 0 .../topk_topp_sampling_ops_runner.cpp | 0 .../topk_topp_sampling_ops_runner.h | 0 .../transdata/transdata_operation.cpp | 0 .../ops_infer/transdata/transdata_operation.h | 0 .../transdata/transdata_ops_runner.cpp | 0 .../transdata/transdata_ops_runner.h | 0 .../transpose/transpose_operation.cpp | 0 .../ops_infer/transpose/transpose_operation.h | 0 .../transpose/transpose_ops_runner.cpp | 0 .../transpose/transpose_ops_runner.h | 0 .../ops_infer/unpad/unpad_operation.cpp | 0 
.../ops_infer/unpad/unpad_operation.h | 0 .../ops_infer/unpad/unpad_ops_runner.cpp | 0 .../ops_infer/unpad/unpad_ops_runner.h | 0 .../ops_infer/where/where_operation.cpp | 0 .../ops_infer/where/where_operation.h | 0 .../ops_infer/where/where_ops_runner.cpp | 0 .../ops_infer/where/where_ops_runner.h | 0 .../fast_soft_max/fastsoftmax_operation.cpp | 0 .../fast_soft_max/fastsoftmax_operation.h | 0 .../fast_soft_max/fastsoftmax_ops_runner.cpp | 0 .../fast_soft_max/fastsoftmax_ops_runner.h | 0 .../fastsoftmaxgrad_operation.cpp | 0 .../fastsoftmaxgrad_operation.h | 0 .../fastsoftmaxgrad_ops_runner.cpp | 0 .../fastsoftmaxgrad_ops_runner.h | 0 .../genattentionmask_operation.cpp | 0 .../genattentionmask_operation.h | 0 .../genattentionmask_ops_runner.cpp | 0 .../genattentionmask_ops_runner.h | 0 .../laser_attention_operation.cpp | 0 .../laser_attention_operation.h | 0 .../laser_attention_ops_runner.cpp | 0 .../laser_attention_ops_runner.h | 0 .../laser_attention_grad_operation.cpp | 0 .../laser_attention_grad_operation.h | 0 .../laser_attention_grad_ops_runner.cpp | 0 .../laser_attention_grad_ops_runner.h | 0 .../pad_with_hidden_state_operation.cpp | 0 .../pad_with_hidden_state_operation.h | 0 .../pad_with_hidden_state_ops_runner.cpp | 0 .../pad_with_hidden_state_ops_runner.h | 0 .../rms_norm_backward_operation.cpp | 0 .../rms_norm_backward_operation.h | 0 .../rms_norm_backward_ops_runner.cpp | 0 .../rms_norm_backward_ops_runner.h | 0 .../rope_grad/rope_grad_operation.cpp | 0 .../ops_train/rope_grad/rope_grad_operation.h | 0 .../rope_grad/rope_grad_ops_runner.cpp | 0 .../rope_grad/rope_grad_ops_runner.h | 0 .../stridedbatchmatmul_operation.cpp | 0 .../stridedbatchmatmul_operation.h | 0 .../stridedbatchmatmul_ops_runner.cpp | 0 .../stridedbatchmatmul_ops_runner.h | 0 .../unpad_with_hidden_state_operation.cpp | 0 .../unpad_with_hidden_state_operation.h | 0 .../unpad_with_hidden_state_ops_runner.cpp | 0 .../unpad_with_hidden_state_ops_runner.h | 0 395 files changed, 35 
insertions(+), 37 deletions(-) rename src/{ => ops}/ops_common/event_operation/event_operation.cpp (100%) rename src/{ => ops}/ops_common/event_operation/event_operation.h (100%) rename src/{ => ops}/ops_common/event_operation/event_runner.cpp (100%) rename src/{ => ops}/ops_common/event_operation/event_runner.h (100%) rename src/{ => ops}/ops_infer/activation/activation_operation.cpp (100%) rename src/{ => ops}/ops_infer/activation/activation_operation.h (100%) rename src/{ => ops}/ops_infer/activation/activation_ops_runner.cpp (100%) rename src/{ => ops}/ops_infer/activation/activation_ops_runner.h (100%) rename src/{ => ops}/ops_infer/all_gather/all_gather_hccl_runner.cpp (100%) rename src/{ => ops}/ops_infer/all_gather/all_gather_hccl_runner.h (100%) rename src/{ => ops}/ops_infer/all_gather/all_gather_lccl_runner.cpp (100%) rename src/{ => ops}/ops_infer/all_gather/all_gather_lccl_runner.h (100%) rename src/{ => ops}/ops_infer/all_gather/all_gather_operation.cpp (100%) rename src/{ => ops}/ops_infer/all_gather/all_gather_operation.h (100%) rename src/{ => ops}/ops_infer/all_gatherv/all_gatherv_hccl_runner.cpp (100%) rename src/{ => ops}/ops_infer/all_gatherv/all_gatherv_hccl_runner.h (100%) rename src/{ => ops}/ops_infer/all_gatherv/all_gatherv_operation.cpp (100%) rename src/{ => ops}/ops_infer/all_gatherv/all_gatherv_operation.h (100%) rename src/{ => ops}/ops_infer/all_reduce/all_reduce_hccl_runner.cpp (100%) rename src/{ => ops}/ops_infer/all_reduce/all_reduce_hccl_runner.h (100%) rename src/{ => ops}/ops_infer/all_reduce/all_reduce_lccl_runner.cpp (100%) rename src/{ => ops}/ops_infer/all_reduce/all_reduce_lccl_runner.h (100%) rename src/{ => ops}/ops_infer/all_reduce/all_reduce_operation.cpp (100%) rename src/{ => ops}/ops_infer/all_reduce/all_reduce_operation.h (100%) rename src/{ => ops}/ops_infer/all_to_all/all_to_all_hccl_runner.cpp (100%) rename src/{ => ops}/ops_infer/all_to_all/all_to_all_hccl_runner.h (100%) rename src/{ => 
ops}/ops_infer/all_to_all/all_to_all_lccl_runner.cpp (100%) rename src/{ => ops}/ops_infer/all_to_all/all_to_all_lccl_runner.h (100%) rename src/{ => ops}/ops_infer/all_to_all/all_to_all_operation.cpp (100%) rename src/{ => ops}/ops_infer/all_to_all/all_to_all_operation.h (100%) rename src/{ => ops}/ops_infer/all_to_allv/all_to_allv_hccl_runner.cpp (100%) rename src/{ => ops}/ops_infer/all_to_allv/all_to_allv_hccl_runner.h (100%) rename src/{ => ops}/ops_infer/all_to_allv/all_to_allv_operation.cpp (100%) rename src/{ => ops}/ops_infer/all_to_allv/all_to_allv_operation.h (100%) rename src/{ => ops}/ops_infer/all_to_allvv2/all_to_allvv2_hccl_runner.cpp (100%) rename src/{ => ops}/ops_infer/all_to_allvv2/all_to_allvv2_hccl_runner.h (100%) rename src/{ => ops}/ops_infer/all_to_allvv2/all_to_allvv2_operation.cpp (100%) rename src/{ => ops}/ops_infer/all_to_allvv2/all_to_allvv2_operation.h (100%) rename src/{ => ops}/ops_infer/as_strided/as_strided_operation.cpp (100%) rename src/{ => ops}/ops_infer/as_strided/as_strided_operation.h (100%) rename src/{ => ops}/ops_infer/as_strided/as_strided_ops_runner.cpp (100%) rename src/{ => ops}/ops_infer/as_strided/as_strided_ops_runner.h (100%) rename src/{ => ops}/ops_infer/block_copy/block_copy_operation.cpp (100%) rename src/{ => ops}/ops_infer/block_copy/block_copy_operation.h (100%) rename src/{ => ops}/ops_infer/block_copy/block_copy_ops_runner.cpp (100%) rename src/{ => ops}/ops_infer/block_copy/block_copy_ops_runner.h (100%) rename src/{ => ops}/ops_infer/broadcast/broadcast_hccl_runner.cpp (100%) rename src/{ => ops}/ops_infer/broadcast/broadcast_hccl_runner.h (100%) rename src/{ => ops}/ops_infer/broadcast/broadcast_lccl_runner.cpp (100%) rename src/{ => ops}/ops_infer/broadcast/broadcast_lccl_runner.h (100%) rename src/{ => ops}/ops_infer/broadcast/broadcast_operation.cpp (100%) rename src/{ => ops}/ops_infer/broadcast/broadcast_operation.h (100%) rename src/{ => 
ops}/ops_infer/cohere_layernorm/cohere_layernorm_operation.cpp (100%) rename src/{ => ops}/ops_infer/cohere_layernorm/cohere_layernorm_operation.h (100%) rename src/{ => ops}/ops_infer/cohere_layernorm/cohere_layernorm_runner.cpp (100%) rename src/{ => ops}/ops_infer/cohere_layernorm/cohere_layernorm_runner.h (100%) rename src/{ => ops}/ops_infer/concat/concat_operation.cpp (100%) rename src/{ => ops}/ops_infer/concat/concat_operation.h (100%) rename src/{ => ops}/ops_infer/concat/concat_ops_runner.cpp (100%) rename src/{ => ops}/ops_infer/concat/concat_ops_runner.h (100%) rename src/{ => ops}/ops_infer/cumsum/cumsum_operation.cpp (100%) rename src/{ => ops}/ops_infer/cumsum/cumsum_operation.h (100%) rename src/{ => ops}/ops_infer/cumsum/cumsum_ops_runner.cpp (100%) rename src/{ => ops}/ops_infer/cumsum/cumsum_ops_runner.h (100%) rename src/{ => ops}/ops_infer/dynamic_ntk/dynamic_ntk_operation.cpp (100%) rename src/{ => ops}/ops_infer/dynamic_ntk/dynamic_ntk_operation.h (100%) rename src/{ => ops}/ops_infer/dynamic_ntk/dynamic_ntk_ops_runner.cpp (100%) rename src/{ => ops}/ops_infer/dynamic_ntk/dynamic_ntk_ops_runner.h (100%) rename src/{ => ops}/ops_infer/elewise/elewise_operation.cpp (100%) rename src/{ => ops}/ops_infer/elewise/elewise_operation.h (100%) rename src/{ => ops}/ops_infer/elewise/elewise_ops_runner.cpp (100%) rename src/{ => ops}/ops_infer/elewise/elewise_ops_runner.h (100%) rename src/{ => ops}/ops_infer/faupdate/faupdate_operation.cpp (100%) rename src/{ => ops}/ops_infer/faupdate/faupdate_operation.h (100%) rename src/{ => ops}/ops_infer/faupdate/faupdate_ops_runner.cpp (100%) rename src/{ => ops}/ops_infer/faupdate/faupdate_ops_runner.h (100%) rename src/{ => ops}/ops_infer/fill/fill_operation.cpp (100%) rename src/{ => ops}/ops_infer/fill/fill_operation.h (100%) rename src/{ => ops}/ops_infer/fill/fill_ops_runner.cpp (100%) rename src/{ => ops}/ops_infer/fill/fill_ops_runner.h (100%) rename src/{ => 
ops}/ops_infer/fused_add_topk_div/atb_acl_fused_add_topk_div.cpp (100%) rename src/{ => ops}/ops_infer/fused_add_topk_div/fused_add_topk_div_operation.cpp (100%) rename src/{ => ops}/ops_infer/fused_add_topk_div/fused_add_topk_div_operation.h (100%) rename src/{ => ops}/ops_infer/fused_add_topk_div/fused_add_topk_div_ops_runner.cpp (100%) rename src/{ => ops}/ops_infer/fused_add_topk_div/fused_add_topk_div_ops_runner.h (100%) rename src/{ => ops}/ops_infer/gather/gather_operation.cpp (100%) rename src/{ => ops}/ops_infer/gather/gather_operation.h (100%) rename src/{ => ops}/ops_infer/gather/gather_ops_runner.cpp (100%) rename src/{ => ops}/ops_infer/gather/gather_ops_runner.h (100%) rename src/{ => ops}/ops_infer/gather_pre_rms_norm/gather_pre_rms_norm_operation.cpp (100%) rename src/{ => ops}/ops_infer/gather_pre_rms_norm/gather_pre_rms_norm_operation.h (100%) rename src/{ => ops}/ops_infer/gather_pre_rms_norm/gather_pre_rms_norm_ops_runner.cpp (100%) rename src/{ => ops}/ops_infer/gather_pre_rms_norm/gather_pre_rms_norm_ops_runner.h (100%) rename src/{ => ops}/ops_infer/gating/gating_operation.cpp (100%) rename src/{ => ops}/ops_infer/gating/gating_operation.h (100%) rename src/{ => ops}/ops_infer/gating/gating_ops_runner.cpp (100%) rename src/{ => ops}/ops_infer/gating/gating_ops_runner.h (100%) rename src/{ => ops}/ops_infer/gmm_deq_swiglu_quant_gmm_deq/gmm_deq_swiglu_quant_gmm_deq_operation.cpp (100%) rename src/{ => ops}/ops_infer/gmm_deq_swiglu_quant_gmm_deq/gmm_deq_swiglu_quant_gmm_deq_operation.h (100%) rename src/{ => ops}/ops_infer/gmm_deq_swiglu_quant_gmm_deq/gmm_deq_swiglu_quant_gmm_deq_ops_runner.cpp (100%) rename src/{ => ops}/ops_infer/gmm_deq_swiglu_quant_gmm_deq/gmm_deq_swiglu_quant_gmm_deq_ops_runner.h (100%) rename src/{ => ops}/ops_infer/group_topk/group_topk_operation.cpp (100%) rename src/{ => ops}/ops_infer/group_topk/group_topk_operation.h (100%) rename src/{ => ops}/ops_infer/group_topk/group_topk_ops_runner.cpp (100%) rename src/{ => 
ops}/ops_infer/group_topk/group_topk_ops_runner.h (100%) rename src/{ => ops}/ops_infer/grouped_matmul_inplace_add/grouped_matmul_inplace_add_operation.cpp (100%) rename src/{ => ops}/ops_infer/grouped_matmul_inplace_add/grouped_matmul_inplace_add_operation.h (100%) rename src/{ => ops}/ops_infer/grouped_matmul_inplace_add/grouped_matmul_inplace_add_ops_runner.cpp (100%) rename src/{ => ops}/ops_infer/grouped_matmul_inplace_add/grouped_matmul_inplace_add_ops_runner.h (100%) rename src/{ => ops}/ops_infer/grouped_matmul_with_routing/grouped_matmul_with_routing_operation.cpp (100%) rename src/{ => ops}/ops_infer/grouped_matmul_with_routing/grouped_matmul_with_routing_operation.h (100%) rename src/{ => ops}/ops_infer/grouped_matmul_with_routing/grouped_matmul_with_routing_runner.cpp (100%) rename src/{ => ops}/ops_infer/grouped_matmul_with_routing/grouped_matmul_with_routing_runner.h (100%) rename src/{ => ops}/ops_infer/index_add/index_add_operation.cpp (100%) rename src/{ => ops}/ops_infer/index_add/index_add_operation.h (100%) rename src/{ => ops}/ops_infer/index_add/index_add_ops_runner.cpp (100%) rename src/{ => ops}/ops_infer/index_add/index_add_ops_runner.h (100%) rename src/{ => ops}/ops_infer/kv_cache/kv_cache_operation.cpp (100%) rename src/{ => ops}/ops_infer/kv_cache/kv_cache_operation.h (100%) rename src/{ => ops}/ops_infer/kv_cache/kv_cache_ops_runner.cpp (100%) rename src/{ => ops}/ops_infer/kv_cache/kv_cache_ops_runner.h (100%) rename src/{ => ops}/ops_infer/layer_norm/layer_norm_operation.cpp (100%) rename src/{ => ops}/ops_infer/layer_norm/layer_norm_operation.h (100%) rename src/{ => ops}/ops_infer/layer_norm/layer_norm_ops_runner.cpp (100%) rename src/{ => ops}/ops_infer/layer_norm/layer_norm_ops_runner.h (100%) rename src/{ => ops}/ops_infer/layer_norm_with_stride/layer_norm_with_stride_operation.cpp (100%) rename src/{ => ops}/ops_infer/layer_norm_with_stride/layer_norm_with_stride_operation.h (100%) rename src/{ => 
ops}/ops_infer/layer_norm_with_stride/layer_norm_with_stride_ops_runner.cpp (100%) rename src/{ => ops}/ops_infer/layer_norm_with_stride/layer_norm_with_stride_ops_runner.h (100%) rename src/{ => ops}/ops_infer/linear/linear_operation.cpp (100%) rename src/{ => ops}/ops_infer/linear/linear_operation.h (100%) rename src/{ => ops}/ops_infer/linear/linear_ops_runner.cpp (100%) rename src/{ => ops}/ops_infer/linear/linear_ops_runner.h (100%) rename src/{ => ops}/ops_infer/linear_parallel/linear_parallel_aclnn_runner.cpp (100%) rename src/{ => ops}/ops_infer/linear_parallel/linear_parallel_aclnn_runner.h (100%) rename src/{ => ops}/ops_infer/linear_parallel/linear_parallel_graph_runner.cpp (100%) rename src/{ => ops}/ops_infer/linear_parallel/linear_parallel_graph_runner.h (100%) rename src/{ => ops}/ops_infer/linear_parallel/linear_parallel_lcoc_runner.cpp (100%) rename src/{ => ops}/ops_infer/linear_parallel/linear_parallel_lcoc_runner.h (100%) rename src/{ => ops}/ops_infer/linear_parallel/linear_parallel_operation.cpp (100%) rename src/{ => ops}/ops_infer/linear_parallel/linear_parallel_operation.h (100%) rename src/{ => ops}/ops_infer/linear_sparse/linear_sparse_operation.cpp (100%) rename src/{ => ops}/ops_infer/linear_sparse/linear_sparse_operation.h (100%) rename src/{ => ops}/ops_infer/linear_sparse/linear_sparse_ops_runner.cpp (100%) rename src/{ => ops}/ops_infer/linear_sparse/linear_sparse_ops_runner.h (100%) rename src/{ => ops}/ops_infer/mla_preprocess/atb_acl_mla_preprocess.cpp (100%) rename src/{ => ops}/ops_infer/mla_preprocess/mla_preprocess_aclnn_runner.cpp (100%) rename src/{ => ops}/ops_infer/mla_preprocess/mla_preprocess_aclnn_runner.h (100%) rename src/{ => ops}/ops_infer/mla_preprocess/mla_preprocess_operation.cpp (100%) rename src/{ => ops}/ops_infer/mla_preprocess/mla_preprocess_operation.h (100%) rename src/{ => ops}/ops_infer/mla_preprocess/mla_preprocess_ops_runner.cpp (100%) rename src/{ => 
ops}/ops_infer/mla_preprocess/mla_preprocess_ops_runner.h (100%) rename src/{ => ops}/ops_infer/mla_preprocess/mla_preprocess_ops_runner_split.cpp (100%) rename src/{ => ops}/ops_infer/mla_preprocess/mla_preprocess_ops_runner_split.h (100%) rename src/{ => ops}/ops_infer/mm_deq_swiglu_quant_mm_deq/mm_deq_swiglu_quant_mm_deq_operation.cpp (100%) rename src/{ => ops}/ops_infer/mm_deq_swiglu_quant_mm_deq/mm_deq_swiglu_quant_mm_deq_operation.h (100%) rename src/{ => ops}/ops_infer/mm_deq_swiglu_quant_mm_deq/mm_deq_swiglu_quant_mm_deq_ops_runner.cpp (100%) rename src/{ => ops}/ops_infer/mm_deq_swiglu_quant_mm_deq/mm_deq_swiglu_quant_mm_deq_ops_runner.h (100%) rename src/{ => ops}/ops_infer/multi_latent_attention/atb_acl_mla.cpp (100%) rename src/{ => ops}/ops_infer/multi_latent_attention/multi_latent_attention_operation.cpp (100%) rename src/{ => ops}/ops_infer/multi_latent_attention/multi_latent_attention_operation.h (100%) rename src/{ => ops}/ops_infer/multi_latent_attention/multi_latent_attention_ops_runner.cpp (100%) rename src/{ => ops}/ops_infer/multi_latent_attention/multi_latent_attention_ops_runner.h (100%) rename src/{ => ops}/ops_infer/multi_latent_attention/multi_latent_attention_ops_runner_prefill.cpp (100%) rename src/{ => ops}/ops_infer/multi_latent_attention/multi_latent_attention_ops_runner_prefill.h (100%) rename src/{ => ops}/ops_infer/multi_latent_attention/param.cpp (100%) rename src/{ => ops}/ops_infer/multi_latent_attention/param.h (100%) rename src/{ => ops}/ops_infer/multinomial/multinomial_operation.cpp (100%) rename src/{ => ops}/ops_infer/multinomial/multinomial_operation.h (100%) rename src/{ => ops}/ops_infer/multinomial/multinomial_ops_runner.cpp (100%) rename src/{ => ops}/ops_infer/multinomial/multinomial_ops_runner.h (100%) rename src/{ => ops}/ops_infer/nonzero/nonzero_operation.cpp (100%) rename src/{ => ops}/ops_infer/nonzero/nonzero_operation.h (100%) rename src/{ => ops}/ops_infer/nonzero/nonzero_runner.cpp (100%) rename src/{ => 
ops}/ops_infer/nonzero/nonzero_runner.h (100%) rename src/{ => ops}/ops_infer/norm_rope_reshape/norm_rope_reshape_operation.cpp (97%) rename src/{ => ops}/ops_infer/norm_rope_reshape/norm_rope_reshape_operation.h (100%) rename src/{ => ops}/ops_infer/norm_rope_reshape/norm_rope_reshape_ops_runner.cpp (92%) rename src/{ => ops}/ops_infer/norm_rope_reshape/norm_rope_reshape_ops_runner.h (100%) rename src/{ => ops}/ops_infer/onehot/onehot_operation.cpp (100%) rename src/{ => ops}/ops_infer/onehot/onehot_operation.h (100%) rename src/{ => ops}/ops_infer/onehot/onehot_ops_runner.cpp (100%) rename src/{ => ops}/ops_infer/onehot/onehot_ops_runner.h (100%) rename src/{ => ops}/ops_infer/pad/pad_operation.cpp (100%) rename src/{ => ops}/ops_infer/pad/pad_operation.h (100%) rename src/{ => ops}/ops_infer/pad/pad_ops_runner.cpp (100%) rename src/{ => ops}/ops_infer/pad/pad_ops_runner.h (100%) rename src/{ => ops}/ops_infer/paged_attention/paged_attention_operation.cpp (100%) rename src/{ => ops}/ops_infer/paged_attention/paged_attention_operation.h (100%) rename src/{ => ops}/ops_infer/paged_attention/paged_attention_ops_runner.cpp (100%) rename src/{ => ops}/ops_infer/paged_attention/paged_attention_ops_runner.h (100%) rename src/{ => ops}/ops_infer/paged_attention/paged_attention_ops_runner_910a.cpp (100%) rename src/{ => ops}/ops_infer/paged_attention/paged_attention_ops_runner_910a.h (100%) rename src/{ => ops}/ops_infer/paged_attention/paged_attention_runner_utils.cpp (100%) rename src/{ => ops}/ops_infer/paged_attention/paged_attention_runner_utils.h (100%) rename src/{ => ops}/ops_infer/paged_attention/param.cpp (100%) rename src/{ => ops}/ops_infer/paged_attention/param.h (100%) rename src/{ => ops}/ops_infer/paged_cache_load/atb_acl_paged_cache_load.cpp (100%) rename src/{ => ops}/ops_infer/paged_cache_load/paged_cache_load_operation.cpp (100%) rename src/{ => ops}/ops_infer/paged_cache_load/paged_cache_load_operation.h (100%) rename src/{ => 
ops}/ops_infer/paged_cache_load/paged_cache_load_ops_runner.cpp (100%) rename src/{ => ops}/ops_infer/paged_cache_load/paged_cache_load_ops_runner.h (100%) rename src/{ => ops}/ops_infer/razor_fusion_attention/razor_fusion_attention_operation.cpp (100%) rename src/{ => ops}/ops_infer/razor_fusion_attention/razor_fusion_attention_operation.h (100%) rename src/{ => ops}/ops_infer/razor_fusion_attention/razor_fusion_attention_ops_runner.cpp (100%) rename src/{ => ops}/ops_infer/razor_fusion_attention/razor_fusion_attention_ops_runner.h (100%) rename src/{ => ops}/ops_infer/recv/recv_hccl_runner.cpp (100%) rename src/{ => ops}/ops_infer/recv/recv_hccl_runner.h (100%) rename src/{ => ops}/ops_infer/recv/recv_operation.cpp (100%) rename src/{ => ops}/ops_infer/recv/recv_operation.h (100%) rename src/{ => ops}/ops_infer/reduce/reduce_operation.cpp (100%) rename src/{ => ops}/ops_infer/reduce/reduce_operation.h (100%) rename src/{ => ops}/ops_infer/reduce/reduce_ops_runner.cpp (100%) rename src/{ => ops}/ops_infer/reduce/reduce_ops_runner.h (100%) rename src/{ => ops}/ops_infer/reduce_scatter/reduce_scatter_hccl_runner.cpp (100%) rename src/{ => ops}/ops_infer/reduce_scatter/reduce_scatter_hccl_runner.h (100%) rename src/{ => ops}/ops_infer/reduce_scatter/reduce_scatter_lccl_runner.cpp (100%) rename src/{ => ops}/ops_infer/reduce_scatter/reduce_scatter_lccl_runner.h (100%) rename src/{ => ops}/ops_infer/reduce_scatter/reduce_scatter_operation.cpp (100%) rename src/{ => ops}/ops_infer/reduce_scatter/reduce_scatter_operation.h (100%) rename src/{ => ops}/ops_infer/reduce_scatterv/reduce_scatterv_hccl_runner.cpp (100%) rename src/{ => ops}/ops_infer/reduce_scatterv/reduce_scatterv_hccl_runner.h (100%) rename src/{ => ops}/ops_infer/reduce_scatterv/reduce_scatterv_operation.cpp (100%) rename src/{ => ops}/ops_infer/reduce_scatterv/reduce_scatterv_operation.h (100%) rename src/{ => ops}/ops_infer/relay_attention/param.cpp (100%) rename src/{ => 
ops}/ops_infer/relay_attention/param.h (100%) rename src/{ => ops}/ops_infer/relay_attention/relay_attention_operation.cpp (100%) rename src/{ => ops}/ops_infer/relay_attention/relay_attention_operation.h (100%) rename src/{ => ops}/ops_infer/relay_attention/relay_attention_ops_runner.cpp (100%) rename src/{ => ops}/ops_infer/relay_attention/relay_attention_ops_runner.h (100%) rename src/{ => ops}/ops_infer/repeat/repeat_operation.cpp (100%) rename src/{ => ops}/ops_infer/repeat/repeat_operation.h (100%) rename src/{ => ops}/ops_infer/repeat/repeat_ops_runner.cpp (100%) rename src/{ => ops}/ops_infer/repeat/repeat_ops_runner.h (100%) rename src/{ => ops}/ops_infer/reshape_and_cache/reshape_and_cache_operation.cpp (100%) rename src/{ => ops}/ops_infer/reshape_and_cache/reshape_and_cache_operation.h (100%) rename src/{ => ops}/ops_infer/reshape_and_cache/reshape_and_cache_ops_runner.cpp (100%) rename src/{ => ops}/ops_infer/reshape_and_cache/reshape_and_cache_ops_runner.h (100%) rename src/{ => ops}/ops_infer/reshape_and_cache/reshape_and_cache_ops_runner_310p.cpp (100%) rename src/{ => ops}/ops_infer/reshape_and_cache/reshape_and_cache_ops_runner_310p.h (100%) rename src/{ => ops}/ops_infer/reshape_and_cache/reshape_and_cache_ops_runner_A2_NZ.cpp (100%) rename src/{ => ops}/ops_infer/reshape_and_cache/reshape_and_cache_ops_runner_A2_NZ.h (100%) rename src/{ => ops}/ops_infer/reshape_and_cache/reshape_and_cache_ops_runner_SISO.cpp (100%) rename src/{ => ops}/ops_infer/reshape_and_cache/reshape_and_cache_ops_runner_SISO.h (100%) rename src/{ => ops}/ops_infer/reshape_and_cache_omni/reshape_and_cache_omni_operation.cpp (100%) rename src/{ => ops}/ops_infer/reshape_and_cache_omni/reshape_and_cache_omni_operation.h (100%) rename src/{ => ops}/ops_infer/reshape_and_cache_omni/reshape_and_cache_omni_ops_runner.cpp (100%) rename src/{ => ops}/ops_infer/reshape_and_cache_omni/reshape_and_cache_omni_ops_runner.h (100%) rename src/{ => 
ops}/ops_infer/reshape_and_cache_with_stride/reshape_and_cache_with_stride_operation.cpp (100%) rename src/{ => ops}/ops_infer/reshape_and_cache_with_stride/reshape_and_cache_with_stride_operation.h (100%) rename src/{ => ops}/ops_infer/reshape_and_cache_with_stride/reshape_and_cache_with_stride_ops_runner.cpp (100%) rename src/{ => ops}/ops_infer/reshape_and_cache_with_stride/reshape_and_cache_with_stride_ops_runner.h (100%) rename src/{ => ops}/ops_infer/reshape_and_cache_with_stride/reshape_and_cache_with_stride_ops_runner_SISO.cpp (100%) rename src/{ => ops}/ops_infer/reshape_and_cache_with_stride/reshape_and_cache_with_stride_ops_runner_SISO.h (100%) rename src/{ => ops}/ops_infer/ring_mla/atb_acl_ring_mla.cpp (100%) rename src/{ => ops}/ops_infer/ring_mla/param.cpp (100%) rename src/{ => ops}/ops_infer/ring_mla/param.h (100%) rename src/{ => ops}/ops_infer/ring_mla/ring_mla_operation.cpp (100%) rename src/{ => ops}/ops_infer/ring_mla/ring_mla_operation.h (100%) rename src/{ => ops}/ops_infer/ring_mla/ring_mla_ops_runner.cpp (100%) rename src/{ => ops}/ops_infer/ring_mla/ring_mla_ops_runner.h (100%) rename src/{ => ops}/ops_infer/rms_norm/rms_norm_operation.cpp (100%) rename src/{ => ops}/ops_infer/rms_norm/rms_norm_operation.h (100%) rename src/{ => ops}/ops_infer/rms_norm/rms_norm_ops_runner.cpp (100%) rename src/{ => ops}/ops_infer/rms_norm/rms_norm_ops_runner.h (100%) rename src/{ => ops}/ops_infer/rms_norm_with_stride/rms_norm_with_stride_operation.cpp (100%) rename src/{ => ops}/ops_infer/rms_norm_with_stride/rms_norm_with_stride_operation.h (100%) rename src/{ => ops}/ops_infer/rms_norm_with_stride/rms_norm_with_stride_ops_runner.cpp (100%) rename src/{ => ops}/ops_infer/rms_norm_with_stride/rms_norm_with_stride_ops_runner.h (100%) rename src/{ => ops}/ops_infer/rope/rope_operation.cpp (100%) rename src/{ => ops}/ops_infer/rope/rope_operation.h (100%) rename src/{ => ops}/ops_infer/rope/rope_ops_runner.cpp (100%) rename src/{ => 
ops}/ops_infer/rope/rope_ops_runner.h (100%) rename src/{ => ops}/ops_infer/rope_q_concat/rope_q_concat_operation.cpp (100%) rename src/{ => ops}/ops_infer/rope_q_concat/rope_q_concat_operation.h (100%) rename src/{ => ops}/ops_infer/rope_q_concat/rope_q_concat_ops_runner.cpp (100%) rename src/{ => ops}/ops_infer/rope_q_concat/rope_q_concat_ops_runner.h (100%) rename src/{ => ops}/ops_infer/scatter_elements_v2/scatter_elements_v2_operation.cpp (100%) rename src/{ => ops}/ops_infer/scatter_elements_v2/scatter_elements_v2_operation.h (100%) rename src/{ => ops}/ops_infer/scatter_elements_v2/scatter_elements_v2_ops_runner.cpp (100%) rename src/{ => ops}/ops_infer/scatter_elements_v2/scatter_elements_v2_ops_runner.h (100%) rename src/{ => ops}/ops_infer/self_attention/atb_acl_self_attention_prefix_encoder.cpp (100%) rename src/{ => ops}/ops_infer/self_attention/param.cpp (100%) rename src/{ => ops}/ops_infer/self_attention/param.h (100%) rename src/{ => ops}/ops_infer/self_attention/self_attention_encoder_fuison_ops_runner_910a.cpp (100%) rename src/{ => ops}/ops_infer/self_attention/self_attention_encoder_fusion_ops_runner.cpp (100%) rename src/{ => ops}/ops_infer/self_attention/self_attention_encoder_fusion_ops_runner.h (100%) rename src/{ => ops}/ops_infer/self_attention/self_attention_encoder_fusion_ops_runner_910a.h (100%) rename src/{ => ops}/ops_infer/self_attention/self_attention_fusion_bypass_ops_runner.cpp (100%) rename src/{ => ops}/ops_infer/self_attention/self_attention_fusion_bypass_ops_runner.h (100%) rename src/{ => ops}/ops_infer/self_attention/self_attention_fusion_bypass_ops_runner_910a.cpp (100%) rename src/{ => ops}/ops_infer/self_attention/self_attention_fusion_bypass_ops_runner_910a.h (100%) rename src/{ => ops}/ops_infer/self_attention/self_attention_fusion_bypass_ops_runner_BNSD.cpp (100%) rename src/{ => ops}/ops_infer/self_attention/self_attention_fusion_bypass_ops_runner_BNSD.h (100%) rename src/{ => 
ops}/ops_infer/self_attention/self_attention_fusion_bypass_ops_runner_BNSD_910a.cpp (100%) rename src/{ => ops}/ops_infer/self_attention/self_attention_fusion_bypass_ops_runner_BNSD_910a.h (100%) rename src/{ => ops}/ops_infer/self_attention/self_attention_fusion_ops_runner.cpp (100%) rename src/{ => ops}/ops_infer/self_attention/self_attention_fusion_ops_runner.h (100%) rename src/{ => ops}/ops_infer/self_attention/self_attention_fusion_ops_runner_910a.cpp (100%) rename src/{ => ops}/ops_infer/self_attention/self_attention_fusion_ops_runner_910a.h (100%) rename src/{ => ops}/ops_infer/self_attention/self_attention_operation.cpp (100%) rename src/{ => ops}/ops_infer/self_attention/self_attention_operation.h (100%) rename src/{ => ops}/ops_infer/self_attention/self_attention_prefix_encoder_ops_runner.cpp (100%) rename src/{ => ops}/ops_infer/self_attention/self_attention_prefix_encoder_ops_runner.h (100%) rename src/{ => ops}/ops_infer/self_attention/self_attention_runner_utils.cpp (100%) rename src/{ => ops}/ops_infer/self_attention/self_attention_runner_utils.h (100%) rename src/{ => ops}/ops_infer/send/send_hccl_runner.cpp (100%) rename src/{ => ops}/ops_infer/send/send_hccl_runner.h (100%) rename src/{ => ops}/ops_infer/send/send_operation.cpp (100%) rename src/{ => ops}/ops_infer/send/send_operation.h (100%) rename src/{ => ops}/ops_infer/set_value/set_value_operation.cpp (100%) rename src/{ => ops}/ops_infer/set_value/set_value_operation.h (100%) rename src/{ => ops}/ops_infer/set_value/set_value_ops_runner.cpp (100%) rename src/{ => ops}/ops_infer/set_value/set_value_ops_runner.h (100%) rename src/{ => ops}/ops_infer/slice/slice_operation.cpp (100%) rename src/{ => ops}/ops_infer/slice/slice_operation.h (100%) rename src/{ => ops}/ops_infer/slice/slice_ops_runner.cpp (100%) rename src/{ => ops}/ops_infer/slice/slice_ops_runner.h (100%) rename src/{ => ops}/ops_infer/softmax/softmax_operation.cpp (100%) rename src/{ => 
ops}/ops_infer/softmax/softmax_operation.h (100%) rename src/{ => ops}/ops_infer/softmax/softmax_ops_runner.cpp (100%) rename src/{ => ops}/ops_infer/softmax/softmax_ops_runner.h (100%) rename src/{ => ops}/ops_infer/sort/sort_operation.cpp (100%) rename src/{ => ops}/ops_infer/sort/sort_operation.h (100%) rename src/{ => ops}/ops_infer/sort/sort_ops_runner.cpp (100%) rename src/{ => ops}/ops_infer/sort/sort_ops_runner.h (97%) rename src/{ => ops}/ops_infer/split/split_operation.cpp (100%) rename src/{ => ops}/ops_infer/split/split_operation.h (100%) rename src/{ => ops}/ops_infer/split/split_ops_runner.cpp (100%) rename src/{ => ops}/ops_infer/split/split_ops_runner.h (100%) rename src/{ => ops}/ops_infer/swiglu_quant/swiglu_quant_operation.cpp (100%) rename src/{ => ops}/ops_infer/swiglu_quant/swiglu_quant_operation.h (100%) rename src/{ => ops}/ops_infer/swiglu_quant/swiglu_quant_ops_runner.cpp (100%) rename src/{ => ops}/ops_infer/swiglu_quant/swiglu_quant_ops_runner.h (100%) rename src/{ => ops}/ops_infer/topk_topp_sampling/topk_topp_sampling_operation.cpp (100%) rename src/{ => ops}/ops_infer/topk_topp_sampling/topk_topp_sampling_operation.h (100%) rename src/{ => ops}/ops_infer/topk_topp_sampling/topk_topp_sampling_ops_runner.cpp (100%) rename src/{ => ops}/ops_infer/topk_topp_sampling/topk_topp_sampling_ops_runner.h (100%) rename src/{ => ops}/ops_infer/transdata/transdata_operation.cpp (100%) rename src/{ => ops}/ops_infer/transdata/transdata_operation.h (100%) rename src/{ => ops}/ops_infer/transdata/transdata_ops_runner.cpp (100%) rename src/{ => ops}/ops_infer/transdata/transdata_ops_runner.h (100%) rename src/{ => ops}/ops_infer/transpose/transpose_operation.cpp (100%) rename src/{ => ops}/ops_infer/transpose/transpose_operation.h (100%) rename src/{ => ops}/ops_infer/transpose/transpose_ops_runner.cpp (100%) rename src/{ => ops}/ops_infer/transpose/transpose_ops_runner.h (100%) rename src/{ => ops}/ops_infer/unpad/unpad_operation.cpp (100%) rename 
src/{ => ops}/ops_infer/unpad/unpad_operation.h (100%) rename src/{ => ops}/ops_infer/unpad/unpad_ops_runner.cpp (100%) rename src/{ => ops}/ops_infer/unpad/unpad_ops_runner.h (100%) rename src/{ => ops}/ops_infer/where/where_operation.cpp (100%) rename src/{ => ops}/ops_infer/where/where_operation.h (100%) rename src/{ => ops}/ops_infer/where/where_ops_runner.cpp (100%) rename src/{ => ops}/ops_infer/where/where_ops_runner.h (100%) rename src/{ => ops}/ops_train/fast_soft_max/fastsoftmax_operation.cpp (100%) rename src/{ => ops}/ops_train/fast_soft_max/fastsoftmax_operation.h (100%) rename src/{ => ops}/ops_train/fast_soft_max/fastsoftmax_ops_runner.cpp (100%) rename src/{ => ops}/ops_train/fast_soft_max/fastsoftmax_ops_runner.h (100%) rename src/{ => ops}/ops_train/fast_soft_max_grad/fastsoftmaxgrad_operation.cpp (100%) rename src/{ => ops}/ops_train/fast_soft_max_grad/fastsoftmaxgrad_operation.h (100%) rename src/{ => ops}/ops_train/fast_soft_max_grad/fastsoftmaxgrad_ops_runner.cpp (100%) rename src/{ => ops}/ops_train/fast_soft_max_grad/fastsoftmaxgrad_ops_runner.h (100%) rename src/{ => ops}/ops_train/gen_attention_mask/genattentionmask_operation.cpp (100%) rename src/{ => ops}/ops_train/gen_attention_mask/genattentionmask_operation.h (100%) rename src/{ => ops}/ops_train/gen_attention_mask/genattentionmask_ops_runner.cpp (100%) rename src/{ => ops}/ops_train/gen_attention_mask/genattentionmask_ops_runner.h (100%) rename src/{ => ops}/ops_train/laser_attention/laser_attention_operation.cpp (100%) rename src/{ => ops}/ops_train/laser_attention/laser_attention_operation.h (100%) rename src/{ => ops}/ops_train/laser_attention/laser_attention_ops_runner.cpp (100%) rename src/{ => ops}/ops_train/laser_attention/laser_attention_ops_runner.h (100%) rename src/{ => ops}/ops_train/laser_attention_grad/laser_attention_grad_operation.cpp (100%) rename src/{ => ops}/ops_train/laser_attention_grad/laser_attention_grad_operation.h (100%) rename src/{ => 
ops}/ops_train/laser_attention_grad/laser_attention_grad_ops_runner.cpp (100%) rename src/{ => ops}/ops_train/laser_attention_grad/laser_attention_grad_ops_runner.h (100%) rename src/{ => ops}/ops_train/pad_with_hidden_state/pad_with_hidden_state_operation.cpp (100%) rename src/{ => ops}/ops_train/pad_with_hidden_state/pad_with_hidden_state_operation.h (100%) rename src/{ => ops}/ops_train/pad_with_hidden_state/pad_with_hidden_state_ops_runner.cpp (100%) rename src/{ => ops}/ops_train/pad_with_hidden_state/pad_with_hidden_state_ops_runner.h (100%) rename src/{ => ops}/ops_train/rms_norm_backward/rms_norm_backward_operation.cpp (100%) rename src/{ => ops}/ops_train/rms_norm_backward/rms_norm_backward_operation.h (100%) rename src/{ => ops}/ops_train/rms_norm_backward/rms_norm_backward_ops_runner.cpp (100%) rename src/{ => ops}/ops_train/rms_norm_backward/rms_norm_backward_ops_runner.h (100%) rename src/{ => ops}/ops_train/rope_grad/rope_grad_operation.cpp (100%) rename src/{ => ops}/ops_train/rope_grad/rope_grad_operation.h (100%) rename src/{ => ops}/ops_train/rope_grad/rope_grad_ops_runner.cpp (100%) rename src/{ => ops}/ops_train/rope_grad/rope_grad_ops_runner.h (100%) rename src/{ => ops}/ops_train/strided_batch_matmul/stridedbatchmatmul_operation.cpp (100%) rename src/{ => ops}/ops_train/strided_batch_matmul/stridedbatchmatmul_operation.h (100%) rename src/{ => ops}/ops_train/strided_batch_matmul/stridedbatchmatmul_ops_runner.cpp (100%) rename src/{ => ops}/ops_train/strided_batch_matmul/stridedbatchmatmul_ops_runner.h (100%) rename src/{ => ops}/ops_train/unpad_with_hidden_state/unpad_with_hidden_state_operation.cpp (100%) rename src/{ => ops}/ops_train/unpad_with_hidden_state/unpad_with_hidden_state_operation.h (100%) rename src/{ => ops}/ops_train/unpad_with_hidden_state/unpad_with_hidden_state_ops_runner.cpp (100%) rename src/{ => ops}/ops_train/unpad_with_hidden_state/unpad_with_hidden_state_ops_runner.h (100%) diff --git a/src/CMakeLists.txt 
b/src/CMakeLists.txt index 389b643b..fb980df9 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -8,9 +8,9 @@ # See LICENSE in the root of the software repository for the full text of the License. # -set(ops_train_directory ${CMAKE_CURRENT_LIST_DIR}/ops_train) -set(ops_infer_directory ${CMAKE_CURRENT_LIST_DIR}/ops_infer) -set(ops_common_directory ${CMAKE_CURRENT_LIST_DIR}/ops_common) +set(ops_train_directory ${CMAKE_CURRENT_LIST_DIR}/ops/ops_train) +set(ops_infer_directory ${CMAKE_CURRENT_LIST_DIR}/ops/ops_infer) +set(ops_common_directory ${CMAKE_CURRENT_LIST_DIR}/ops/ops_common) set(atb_directory ${CMAKE_CURRENT_LIST_DIR}/atb) set(MSTX_PATH $ENV{ASCEND_HOME_PATH}/tools/mstx/include) set(ATB_INCLUDE_DIR $ENV{ASCEND_HOME_PATH}/include) diff --git a/src/ops_common/event_operation/event_operation.cpp b/src/ops/ops_common/event_operation/event_operation.cpp similarity index 100% rename from src/ops_common/event_operation/event_operation.cpp rename to src/ops/ops_common/event_operation/event_operation.cpp diff --git a/src/ops_common/event_operation/event_operation.h b/src/ops/ops_common/event_operation/event_operation.h similarity index 100% rename from src/ops_common/event_operation/event_operation.h rename to src/ops/ops_common/event_operation/event_operation.h diff --git a/src/ops_common/event_operation/event_runner.cpp b/src/ops/ops_common/event_operation/event_runner.cpp similarity index 100% rename from src/ops_common/event_operation/event_runner.cpp rename to src/ops/ops_common/event_operation/event_runner.cpp diff --git a/src/ops_common/event_operation/event_runner.h b/src/ops/ops_common/event_operation/event_runner.h similarity index 100% rename from src/ops_common/event_operation/event_runner.h rename to src/ops/ops_common/event_operation/event_runner.h diff --git a/src/ops_infer/activation/activation_operation.cpp b/src/ops/ops_infer/activation/activation_operation.cpp similarity index 100% rename from 
src/ops_infer/activation/activation_operation.cpp rename to src/ops/ops_infer/activation/activation_operation.cpp diff --git a/src/ops_infer/activation/activation_operation.h b/src/ops/ops_infer/activation/activation_operation.h similarity index 100% rename from src/ops_infer/activation/activation_operation.h rename to src/ops/ops_infer/activation/activation_operation.h diff --git a/src/ops_infer/activation/activation_ops_runner.cpp b/src/ops/ops_infer/activation/activation_ops_runner.cpp similarity index 100% rename from src/ops_infer/activation/activation_ops_runner.cpp rename to src/ops/ops_infer/activation/activation_ops_runner.cpp diff --git a/src/ops_infer/activation/activation_ops_runner.h b/src/ops/ops_infer/activation/activation_ops_runner.h similarity index 100% rename from src/ops_infer/activation/activation_ops_runner.h rename to src/ops/ops_infer/activation/activation_ops_runner.h diff --git a/src/ops_infer/all_gather/all_gather_hccl_runner.cpp b/src/ops/ops_infer/all_gather/all_gather_hccl_runner.cpp similarity index 100% rename from src/ops_infer/all_gather/all_gather_hccl_runner.cpp rename to src/ops/ops_infer/all_gather/all_gather_hccl_runner.cpp diff --git a/src/ops_infer/all_gather/all_gather_hccl_runner.h b/src/ops/ops_infer/all_gather/all_gather_hccl_runner.h similarity index 100% rename from src/ops_infer/all_gather/all_gather_hccl_runner.h rename to src/ops/ops_infer/all_gather/all_gather_hccl_runner.h diff --git a/src/ops_infer/all_gather/all_gather_lccl_runner.cpp b/src/ops/ops_infer/all_gather/all_gather_lccl_runner.cpp similarity index 100% rename from src/ops_infer/all_gather/all_gather_lccl_runner.cpp rename to src/ops/ops_infer/all_gather/all_gather_lccl_runner.cpp diff --git a/src/ops_infer/all_gather/all_gather_lccl_runner.h b/src/ops/ops_infer/all_gather/all_gather_lccl_runner.h similarity index 100% rename from src/ops_infer/all_gather/all_gather_lccl_runner.h rename to src/ops/ops_infer/all_gather/all_gather_lccl_runner.h diff 
--git a/src/ops_infer/all_gather/all_gather_operation.cpp b/src/ops/ops_infer/all_gather/all_gather_operation.cpp similarity index 100% rename from src/ops_infer/all_gather/all_gather_operation.cpp rename to src/ops/ops_infer/all_gather/all_gather_operation.cpp diff --git a/src/ops_infer/all_gather/all_gather_operation.h b/src/ops/ops_infer/all_gather/all_gather_operation.h similarity index 100% rename from src/ops_infer/all_gather/all_gather_operation.h rename to src/ops/ops_infer/all_gather/all_gather_operation.h diff --git a/src/ops_infer/all_gatherv/all_gatherv_hccl_runner.cpp b/src/ops/ops_infer/all_gatherv/all_gatherv_hccl_runner.cpp similarity index 100% rename from src/ops_infer/all_gatherv/all_gatherv_hccl_runner.cpp rename to src/ops/ops_infer/all_gatherv/all_gatherv_hccl_runner.cpp diff --git a/src/ops_infer/all_gatherv/all_gatherv_hccl_runner.h b/src/ops/ops_infer/all_gatherv/all_gatherv_hccl_runner.h similarity index 100% rename from src/ops_infer/all_gatherv/all_gatherv_hccl_runner.h rename to src/ops/ops_infer/all_gatherv/all_gatherv_hccl_runner.h diff --git a/src/ops_infer/all_gatherv/all_gatherv_operation.cpp b/src/ops/ops_infer/all_gatherv/all_gatherv_operation.cpp similarity index 100% rename from src/ops_infer/all_gatherv/all_gatherv_operation.cpp rename to src/ops/ops_infer/all_gatherv/all_gatherv_operation.cpp diff --git a/src/ops_infer/all_gatherv/all_gatherv_operation.h b/src/ops/ops_infer/all_gatherv/all_gatherv_operation.h similarity index 100% rename from src/ops_infer/all_gatherv/all_gatherv_operation.h rename to src/ops/ops_infer/all_gatherv/all_gatherv_operation.h diff --git a/src/ops_infer/all_reduce/all_reduce_hccl_runner.cpp b/src/ops/ops_infer/all_reduce/all_reduce_hccl_runner.cpp similarity index 100% rename from src/ops_infer/all_reduce/all_reduce_hccl_runner.cpp rename to src/ops/ops_infer/all_reduce/all_reduce_hccl_runner.cpp diff --git a/src/ops_infer/all_reduce/all_reduce_hccl_runner.h 
b/src/ops/ops_infer/all_reduce/all_reduce_hccl_runner.h similarity index 100% rename from src/ops_infer/all_reduce/all_reduce_hccl_runner.h rename to src/ops/ops_infer/all_reduce/all_reduce_hccl_runner.h diff --git a/src/ops_infer/all_reduce/all_reduce_lccl_runner.cpp b/src/ops/ops_infer/all_reduce/all_reduce_lccl_runner.cpp similarity index 100% rename from src/ops_infer/all_reduce/all_reduce_lccl_runner.cpp rename to src/ops/ops_infer/all_reduce/all_reduce_lccl_runner.cpp diff --git a/src/ops_infer/all_reduce/all_reduce_lccl_runner.h b/src/ops/ops_infer/all_reduce/all_reduce_lccl_runner.h similarity index 100% rename from src/ops_infer/all_reduce/all_reduce_lccl_runner.h rename to src/ops/ops_infer/all_reduce/all_reduce_lccl_runner.h diff --git a/src/ops_infer/all_reduce/all_reduce_operation.cpp b/src/ops/ops_infer/all_reduce/all_reduce_operation.cpp similarity index 100% rename from src/ops_infer/all_reduce/all_reduce_operation.cpp rename to src/ops/ops_infer/all_reduce/all_reduce_operation.cpp diff --git a/src/ops_infer/all_reduce/all_reduce_operation.h b/src/ops/ops_infer/all_reduce/all_reduce_operation.h similarity index 100% rename from src/ops_infer/all_reduce/all_reduce_operation.h rename to src/ops/ops_infer/all_reduce/all_reduce_operation.h diff --git a/src/ops_infer/all_to_all/all_to_all_hccl_runner.cpp b/src/ops/ops_infer/all_to_all/all_to_all_hccl_runner.cpp similarity index 100% rename from src/ops_infer/all_to_all/all_to_all_hccl_runner.cpp rename to src/ops/ops_infer/all_to_all/all_to_all_hccl_runner.cpp diff --git a/src/ops_infer/all_to_all/all_to_all_hccl_runner.h b/src/ops/ops_infer/all_to_all/all_to_all_hccl_runner.h similarity index 100% rename from src/ops_infer/all_to_all/all_to_all_hccl_runner.h rename to src/ops/ops_infer/all_to_all/all_to_all_hccl_runner.h diff --git a/src/ops_infer/all_to_all/all_to_all_lccl_runner.cpp b/src/ops/ops_infer/all_to_all/all_to_all_lccl_runner.cpp similarity index 100% rename from 
src/ops_infer/all_to_all/all_to_all_lccl_runner.cpp rename to src/ops/ops_infer/all_to_all/all_to_all_lccl_runner.cpp diff --git a/src/ops_infer/all_to_all/all_to_all_lccl_runner.h b/src/ops/ops_infer/all_to_all/all_to_all_lccl_runner.h similarity index 100% rename from src/ops_infer/all_to_all/all_to_all_lccl_runner.h rename to src/ops/ops_infer/all_to_all/all_to_all_lccl_runner.h diff --git a/src/ops_infer/all_to_all/all_to_all_operation.cpp b/src/ops/ops_infer/all_to_all/all_to_all_operation.cpp similarity index 100% rename from src/ops_infer/all_to_all/all_to_all_operation.cpp rename to src/ops/ops_infer/all_to_all/all_to_all_operation.cpp diff --git a/src/ops_infer/all_to_all/all_to_all_operation.h b/src/ops/ops_infer/all_to_all/all_to_all_operation.h similarity index 100% rename from src/ops_infer/all_to_all/all_to_all_operation.h rename to src/ops/ops_infer/all_to_all/all_to_all_operation.h diff --git a/src/ops_infer/all_to_allv/all_to_allv_hccl_runner.cpp b/src/ops/ops_infer/all_to_allv/all_to_allv_hccl_runner.cpp similarity index 100% rename from src/ops_infer/all_to_allv/all_to_allv_hccl_runner.cpp rename to src/ops/ops_infer/all_to_allv/all_to_allv_hccl_runner.cpp diff --git a/src/ops_infer/all_to_allv/all_to_allv_hccl_runner.h b/src/ops/ops_infer/all_to_allv/all_to_allv_hccl_runner.h similarity index 100% rename from src/ops_infer/all_to_allv/all_to_allv_hccl_runner.h rename to src/ops/ops_infer/all_to_allv/all_to_allv_hccl_runner.h diff --git a/src/ops_infer/all_to_allv/all_to_allv_operation.cpp b/src/ops/ops_infer/all_to_allv/all_to_allv_operation.cpp similarity index 100% rename from src/ops_infer/all_to_allv/all_to_allv_operation.cpp rename to src/ops/ops_infer/all_to_allv/all_to_allv_operation.cpp diff --git a/src/ops_infer/all_to_allv/all_to_allv_operation.h b/src/ops/ops_infer/all_to_allv/all_to_allv_operation.h similarity index 100% rename from src/ops_infer/all_to_allv/all_to_allv_operation.h rename to 
src/ops/ops_infer/all_to_allv/all_to_allv_operation.h diff --git a/src/ops_infer/all_to_allvv2/all_to_allvv2_hccl_runner.cpp b/src/ops/ops_infer/all_to_allvv2/all_to_allvv2_hccl_runner.cpp similarity index 100% rename from src/ops_infer/all_to_allvv2/all_to_allvv2_hccl_runner.cpp rename to src/ops/ops_infer/all_to_allvv2/all_to_allvv2_hccl_runner.cpp diff --git a/src/ops_infer/all_to_allvv2/all_to_allvv2_hccl_runner.h b/src/ops/ops_infer/all_to_allvv2/all_to_allvv2_hccl_runner.h similarity index 100% rename from src/ops_infer/all_to_allvv2/all_to_allvv2_hccl_runner.h rename to src/ops/ops_infer/all_to_allvv2/all_to_allvv2_hccl_runner.h diff --git a/src/ops_infer/all_to_allvv2/all_to_allvv2_operation.cpp b/src/ops/ops_infer/all_to_allvv2/all_to_allvv2_operation.cpp similarity index 100% rename from src/ops_infer/all_to_allvv2/all_to_allvv2_operation.cpp rename to src/ops/ops_infer/all_to_allvv2/all_to_allvv2_operation.cpp diff --git a/src/ops_infer/all_to_allvv2/all_to_allvv2_operation.h b/src/ops/ops_infer/all_to_allvv2/all_to_allvv2_operation.h similarity index 100% rename from src/ops_infer/all_to_allvv2/all_to_allvv2_operation.h rename to src/ops/ops_infer/all_to_allvv2/all_to_allvv2_operation.h diff --git a/src/ops_infer/as_strided/as_strided_operation.cpp b/src/ops/ops_infer/as_strided/as_strided_operation.cpp similarity index 100% rename from src/ops_infer/as_strided/as_strided_operation.cpp rename to src/ops/ops_infer/as_strided/as_strided_operation.cpp diff --git a/src/ops_infer/as_strided/as_strided_operation.h b/src/ops/ops_infer/as_strided/as_strided_operation.h similarity index 100% rename from src/ops_infer/as_strided/as_strided_operation.h rename to src/ops/ops_infer/as_strided/as_strided_operation.h diff --git a/src/ops_infer/as_strided/as_strided_ops_runner.cpp b/src/ops/ops_infer/as_strided/as_strided_ops_runner.cpp similarity index 100% rename from src/ops_infer/as_strided/as_strided_ops_runner.cpp rename to 
src/ops/ops_infer/as_strided/as_strided_ops_runner.cpp diff --git a/src/ops_infer/as_strided/as_strided_ops_runner.h b/src/ops/ops_infer/as_strided/as_strided_ops_runner.h similarity index 100% rename from src/ops_infer/as_strided/as_strided_ops_runner.h rename to src/ops/ops_infer/as_strided/as_strided_ops_runner.h diff --git a/src/ops_infer/block_copy/block_copy_operation.cpp b/src/ops/ops_infer/block_copy/block_copy_operation.cpp similarity index 100% rename from src/ops_infer/block_copy/block_copy_operation.cpp rename to src/ops/ops_infer/block_copy/block_copy_operation.cpp diff --git a/src/ops_infer/block_copy/block_copy_operation.h b/src/ops/ops_infer/block_copy/block_copy_operation.h similarity index 100% rename from src/ops_infer/block_copy/block_copy_operation.h rename to src/ops/ops_infer/block_copy/block_copy_operation.h diff --git a/src/ops_infer/block_copy/block_copy_ops_runner.cpp b/src/ops/ops_infer/block_copy/block_copy_ops_runner.cpp similarity index 100% rename from src/ops_infer/block_copy/block_copy_ops_runner.cpp rename to src/ops/ops_infer/block_copy/block_copy_ops_runner.cpp diff --git a/src/ops_infer/block_copy/block_copy_ops_runner.h b/src/ops/ops_infer/block_copy/block_copy_ops_runner.h similarity index 100% rename from src/ops_infer/block_copy/block_copy_ops_runner.h rename to src/ops/ops_infer/block_copy/block_copy_ops_runner.h diff --git a/src/ops_infer/broadcast/broadcast_hccl_runner.cpp b/src/ops/ops_infer/broadcast/broadcast_hccl_runner.cpp similarity index 100% rename from src/ops_infer/broadcast/broadcast_hccl_runner.cpp rename to src/ops/ops_infer/broadcast/broadcast_hccl_runner.cpp diff --git a/src/ops_infer/broadcast/broadcast_hccl_runner.h b/src/ops/ops_infer/broadcast/broadcast_hccl_runner.h similarity index 100% rename from src/ops_infer/broadcast/broadcast_hccl_runner.h rename to src/ops/ops_infer/broadcast/broadcast_hccl_runner.h diff --git a/src/ops_infer/broadcast/broadcast_lccl_runner.cpp 
b/src/ops/ops_infer/broadcast/broadcast_lccl_runner.cpp similarity index 100% rename from src/ops_infer/broadcast/broadcast_lccl_runner.cpp rename to src/ops/ops_infer/broadcast/broadcast_lccl_runner.cpp diff --git a/src/ops_infer/broadcast/broadcast_lccl_runner.h b/src/ops/ops_infer/broadcast/broadcast_lccl_runner.h similarity index 100% rename from src/ops_infer/broadcast/broadcast_lccl_runner.h rename to src/ops/ops_infer/broadcast/broadcast_lccl_runner.h diff --git a/src/ops_infer/broadcast/broadcast_operation.cpp b/src/ops/ops_infer/broadcast/broadcast_operation.cpp similarity index 100% rename from src/ops_infer/broadcast/broadcast_operation.cpp rename to src/ops/ops_infer/broadcast/broadcast_operation.cpp diff --git a/src/ops_infer/broadcast/broadcast_operation.h b/src/ops/ops_infer/broadcast/broadcast_operation.h similarity index 100% rename from src/ops_infer/broadcast/broadcast_operation.h rename to src/ops/ops_infer/broadcast/broadcast_operation.h diff --git a/src/ops_infer/cohere_layernorm/cohere_layernorm_operation.cpp b/src/ops/ops_infer/cohere_layernorm/cohere_layernorm_operation.cpp similarity index 100% rename from src/ops_infer/cohere_layernorm/cohere_layernorm_operation.cpp rename to src/ops/ops_infer/cohere_layernorm/cohere_layernorm_operation.cpp diff --git a/src/ops_infer/cohere_layernorm/cohere_layernorm_operation.h b/src/ops/ops_infer/cohere_layernorm/cohere_layernorm_operation.h similarity index 100% rename from src/ops_infer/cohere_layernorm/cohere_layernorm_operation.h rename to src/ops/ops_infer/cohere_layernorm/cohere_layernorm_operation.h diff --git a/src/ops_infer/cohere_layernorm/cohere_layernorm_runner.cpp b/src/ops/ops_infer/cohere_layernorm/cohere_layernorm_runner.cpp similarity index 100% rename from src/ops_infer/cohere_layernorm/cohere_layernorm_runner.cpp rename to src/ops/ops_infer/cohere_layernorm/cohere_layernorm_runner.cpp diff --git a/src/ops_infer/cohere_layernorm/cohere_layernorm_runner.h 
b/src/ops/ops_infer/cohere_layernorm/cohere_layernorm_runner.h similarity index 100% rename from src/ops_infer/cohere_layernorm/cohere_layernorm_runner.h rename to src/ops/ops_infer/cohere_layernorm/cohere_layernorm_runner.h diff --git a/src/ops_infer/concat/concat_operation.cpp b/src/ops/ops_infer/concat/concat_operation.cpp similarity index 100% rename from src/ops_infer/concat/concat_operation.cpp rename to src/ops/ops_infer/concat/concat_operation.cpp diff --git a/src/ops_infer/concat/concat_operation.h b/src/ops/ops_infer/concat/concat_operation.h similarity index 100% rename from src/ops_infer/concat/concat_operation.h rename to src/ops/ops_infer/concat/concat_operation.h diff --git a/src/ops_infer/concat/concat_ops_runner.cpp b/src/ops/ops_infer/concat/concat_ops_runner.cpp similarity index 100% rename from src/ops_infer/concat/concat_ops_runner.cpp rename to src/ops/ops_infer/concat/concat_ops_runner.cpp diff --git a/src/ops_infer/concat/concat_ops_runner.h b/src/ops/ops_infer/concat/concat_ops_runner.h similarity index 100% rename from src/ops_infer/concat/concat_ops_runner.h rename to src/ops/ops_infer/concat/concat_ops_runner.h diff --git a/src/ops_infer/cumsum/cumsum_operation.cpp b/src/ops/ops_infer/cumsum/cumsum_operation.cpp similarity index 100% rename from src/ops_infer/cumsum/cumsum_operation.cpp rename to src/ops/ops_infer/cumsum/cumsum_operation.cpp diff --git a/src/ops_infer/cumsum/cumsum_operation.h b/src/ops/ops_infer/cumsum/cumsum_operation.h similarity index 100% rename from src/ops_infer/cumsum/cumsum_operation.h rename to src/ops/ops_infer/cumsum/cumsum_operation.h diff --git a/src/ops_infer/cumsum/cumsum_ops_runner.cpp b/src/ops/ops_infer/cumsum/cumsum_ops_runner.cpp similarity index 100% rename from src/ops_infer/cumsum/cumsum_ops_runner.cpp rename to src/ops/ops_infer/cumsum/cumsum_ops_runner.cpp diff --git a/src/ops_infer/cumsum/cumsum_ops_runner.h b/src/ops/ops_infer/cumsum/cumsum_ops_runner.h similarity index 100% rename from 
src/ops_infer/cumsum/cumsum_ops_runner.h rename to src/ops/ops_infer/cumsum/cumsum_ops_runner.h diff --git a/src/ops_infer/dynamic_ntk/dynamic_ntk_operation.cpp b/src/ops/ops_infer/dynamic_ntk/dynamic_ntk_operation.cpp similarity index 100% rename from src/ops_infer/dynamic_ntk/dynamic_ntk_operation.cpp rename to src/ops/ops_infer/dynamic_ntk/dynamic_ntk_operation.cpp diff --git a/src/ops_infer/dynamic_ntk/dynamic_ntk_operation.h b/src/ops/ops_infer/dynamic_ntk/dynamic_ntk_operation.h similarity index 100% rename from src/ops_infer/dynamic_ntk/dynamic_ntk_operation.h rename to src/ops/ops_infer/dynamic_ntk/dynamic_ntk_operation.h diff --git a/src/ops_infer/dynamic_ntk/dynamic_ntk_ops_runner.cpp b/src/ops/ops_infer/dynamic_ntk/dynamic_ntk_ops_runner.cpp similarity index 100% rename from src/ops_infer/dynamic_ntk/dynamic_ntk_ops_runner.cpp rename to src/ops/ops_infer/dynamic_ntk/dynamic_ntk_ops_runner.cpp diff --git a/src/ops_infer/dynamic_ntk/dynamic_ntk_ops_runner.h b/src/ops/ops_infer/dynamic_ntk/dynamic_ntk_ops_runner.h similarity index 100% rename from src/ops_infer/dynamic_ntk/dynamic_ntk_ops_runner.h rename to src/ops/ops_infer/dynamic_ntk/dynamic_ntk_ops_runner.h diff --git a/src/ops_infer/elewise/elewise_operation.cpp b/src/ops/ops_infer/elewise/elewise_operation.cpp similarity index 100% rename from src/ops_infer/elewise/elewise_operation.cpp rename to src/ops/ops_infer/elewise/elewise_operation.cpp diff --git a/src/ops_infer/elewise/elewise_operation.h b/src/ops/ops_infer/elewise/elewise_operation.h similarity index 100% rename from src/ops_infer/elewise/elewise_operation.h rename to src/ops/ops_infer/elewise/elewise_operation.h diff --git a/src/ops_infer/elewise/elewise_ops_runner.cpp b/src/ops/ops_infer/elewise/elewise_ops_runner.cpp similarity index 100% rename from src/ops_infer/elewise/elewise_ops_runner.cpp rename to src/ops/ops_infer/elewise/elewise_ops_runner.cpp diff --git a/src/ops_infer/elewise/elewise_ops_runner.h 
b/src/ops/ops_infer/elewise/elewise_ops_runner.h similarity index 100% rename from src/ops_infer/elewise/elewise_ops_runner.h rename to src/ops/ops_infer/elewise/elewise_ops_runner.h diff --git a/src/ops_infer/faupdate/faupdate_operation.cpp b/src/ops/ops_infer/faupdate/faupdate_operation.cpp similarity index 100% rename from src/ops_infer/faupdate/faupdate_operation.cpp rename to src/ops/ops_infer/faupdate/faupdate_operation.cpp diff --git a/src/ops_infer/faupdate/faupdate_operation.h b/src/ops/ops_infer/faupdate/faupdate_operation.h similarity index 100% rename from src/ops_infer/faupdate/faupdate_operation.h rename to src/ops/ops_infer/faupdate/faupdate_operation.h diff --git a/src/ops_infer/faupdate/faupdate_ops_runner.cpp b/src/ops/ops_infer/faupdate/faupdate_ops_runner.cpp similarity index 100% rename from src/ops_infer/faupdate/faupdate_ops_runner.cpp rename to src/ops/ops_infer/faupdate/faupdate_ops_runner.cpp diff --git a/src/ops_infer/faupdate/faupdate_ops_runner.h b/src/ops/ops_infer/faupdate/faupdate_ops_runner.h similarity index 100% rename from src/ops_infer/faupdate/faupdate_ops_runner.h rename to src/ops/ops_infer/faupdate/faupdate_ops_runner.h diff --git a/src/ops_infer/fill/fill_operation.cpp b/src/ops/ops_infer/fill/fill_operation.cpp similarity index 100% rename from src/ops_infer/fill/fill_operation.cpp rename to src/ops/ops_infer/fill/fill_operation.cpp diff --git a/src/ops_infer/fill/fill_operation.h b/src/ops/ops_infer/fill/fill_operation.h similarity index 100% rename from src/ops_infer/fill/fill_operation.h rename to src/ops/ops_infer/fill/fill_operation.h diff --git a/src/ops_infer/fill/fill_ops_runner.cpp b/src/ops/ops_infer/fill/fill_ops_runner.cpp similarity index 100% rename from src/ops_infer/fill/fill_ops_runner.cpp rename to src/ops/ops_infer/fill/fill_ops_runner.cpp diff --git a/src/ops_infer/fill/fill_ops_runner.h b/src/ops/ops_infer/fill/fill_ops_runner.h similarity index 100% rename from src/ops_infer/fill/fill_ops_runner.h 
rename to src/ops/ops_infer/fill/fill_ops_runner.h diff --git a/src/ops_infer/fused_add_topk_div/atb_acl_fused_add_topk_div.cpp b/src/ops/ops_infer/fused_add_topk_div/atb_acl_fused_add_topk_div.cpp similarity index 100% rename from src/ops_infer/fused_add_topk_div/atb_acl_fused_add_topk_div.cpp rename to src/ops/ops_infer/fused_add_topk_div/atb_acl_fused_add_topk_div.cpp diff --git a/src/ops_infer/fused_add_topk_div/fused_add_topk_div_operation.cpp b/src/ops/ops_infer/fused_add_topk_div/fused_add_topk_div_operation.cpp similarity index 100% rename from src/ops_infer/fused_add_topk_div/fused_add_topk_div_operation.cpp rename to src/ops/ops_infer/fused_add_topk_div/fused_add_topk_div_operation.cpp diff --git a/src/ops_infer/fused_add_topk_div/fused_add_topk_div_operation.h b/src/ops/ops_infer/fused_add_topk_div/fused_add_topk_div_operation.h similarity index 100% rename from src/ops_infer/fused_add_topk_div/fused_add_topk_div_operation.h rename to src/ops/ops_infer/fused_add_topk_div/fused_add_topk_div_operation.h diff --git a/src/ops_infer/fused_add_topk_div/fused_add_topk_div_ops_runner.cpp b/src/ops/ops_infer/fused_add_topk_div/fused_add_topk_div_ops_runner.cpp similarity index 100% rename from src/ops_infer/fused_add_topk_div/fused_add_topk_div_ops_runner.cpp rename to src/ops/ops_infer/fused_add_topk_div/fused_add_topk_div_ops_runner.cpp diff --git a/src/ops_infer/fused_add_topk_div/fused_add_topk_div_ops_runner.h b/src/ops/ops_infer/fused_add_topk_div/fused_add_topk_div_ops_runner.h similarity index 100% rename from src/ops_infer/fused_add_topk_div/fused_add_topk_div_ops_runner.h rename to src/ops/ops_infer/fused_add_topk_div/fused_add_topk_div_ops_runner.h diff --git a/src/ops_infer/gather/gather_operation.cpp b/src/ops/ops_infer/gather/gather_operation.cpp similarity index 100% rename from src/ops_infer/gather/gather_operation.cpp rename to src/ops/ops_infer/gather/gather_operation.cpp diff --git a/src/ops_infer/gather/gather_operation.h 
b/src/ops/ops_infer/gather/gather_operation.h similarity index 100% rename from src/ops_infer/gather/gather_operation.h rename to src/ops/ops_infer/gather/gather_operation.h diff --git a/src/ops_infer/gather/gather_ops_runner.cpp b/src/ops/ops_infer/gather/gather_ops_runner.cpp similarity index 100% rename from src/ops_infer/gather/gather_ops_runner.cpp rename to src/ops/ops_infer/gather/gather_ops_runner.cpp diff --git a/src/ops_infer/gather/gather_ops_runner.h b/src/ops/ops_infer/gather/gather_ops_runner.h similarity index 100% rename from src/ops_infer/gather/gather_ops_runner.h rename to src/ops/ops_infer/gather/gather_ops_runner.h diff --git a/src/ops_infer/gather_pre_rms_norm/gather_pre_rms_norm_operation.cpp b/src/ops/ops_infer/gather_pre_rms_norm/gather_pre_rms_norm_operation.cpp similarity index 100% rename from src/ops_infer/gather_pre_rms_norm/gather_pre_rms_norm_operation.cpp rename to src/ops/ops_infer/gather_pre_rms_norm/gather_pre_rms_norm_operation.cpp diff --git a/src/ops_infer/gather_pre_rms_norm/gather_pre_rms_norm_operation.h b/src/ops/ops_infer/gather_pre_rms_norm/gather_pre_rms_norm_operation.h similarity index 100% rename from src/ops_infer/gather_pre_rms_norm/gather_pre_rms_norm_operation.h rename to src/ops/ops_infer/gather_pre_rms_norm/gather_pre_rms_norm_operation.h diff --git a/src/ops_infer/gather_pre_rms_norm/gather_pre_rms_norm_ops_runner.cpp b/src/ops/ops_infer/gather_pre_rms_norm/gather_pre_rms_norm_ops_runner.cpp similarity index 100% rename from src/ops_infer/gather_pre_rms_norm/gather_pre_rms_norm_ops_runner.cpp rename to src/ops/ops_infer/gather_pre_rms_norm/gather_pre_rms_norm_ops_runner.cpp diff --git a/src/ops_infer/gather_pre_rms_norm/gather_pre_rms_norm_ops_runner.h b/src/ops/ops_infer/gather_pre_rms_norm/gather_pre_rms_norm_ops_runner.h similarity index 100% rename from src/ops_infer/gather_pre_rms_norm/gather_pre_rms_norm_ops_runner.h rename to src/ops/ops_infer/gather_pre_rms_norm/gather_pre_rms_norm_ops_runner.h diff 
--git a/src/ops_infer/gating/gating_operation.cpp b/src/ops/ops_infer/gating/gating_operation.cpp similarity index 100% rename from src/ops_infer/gating/gating_operation.cpp rename to src/ops/ops_infer/gating/gating_operation.cpp diff --git a/src/ops_infer/gating/gating_operation.h b/src/ops/ops_infer/gating/gating_operation.h similarity index 100% rename from src/ops_infer/gating/gating_operation.h rename to src/ops/ops_infer/gating/gating_operation.h diff --git a/src/ops_infer/gating/gating_ops_runner.cpp b/src/ops/ops_infer/gating/gating_ops_runner.cpp similarity index 100% rename from src/ops_infer/gating/gating_ops_runner.cpp rename to src/ops/ops_infer/gating/gating_ops_runner.cpp diff --git a/src/ops_infer/gating/gating_ops_runner.h b/src/ops/ops_infer/gating/gating_ops_runner.h similarity index 100% rename from src/ops_infer/gating/gating_ops_runner.h rename to src/ops/ops_infer/gating/gating_ops_runner.h diff --git a/src/ops_infer/gmm_deq_swiglu_quant_gmm_deq/gmm_deq_swiglu_quant_gmm_deq_operation.cpp b/src/ops/ops_infer/gmm_deq_swiglu_quant_gmm_deq/gmm_deq_swiglu_quant_gmm_deq_operation.cpp similarity index 100% rename from src/ops_infer/gmm_deq_swiglu_quant_gmm_deq/gmm_deq_swiglu_quant_gmm_deq_operation.cpp rename to src/ops/ops_infer/gmm_deq_swiglu_quant_gmm_deq/gmm_deq_swiglu_quant_gmm_deq_operation.cpp diff --git a/src/ops_infer/gmm_deq_swiglu_quant_gmm_deq/gmm_deq_swiglu_quant_gmm_deq_operation.h b/src/ops/ops_infer/gmm_deq_swiglu_quant_gmm_deq/gmm_deq_swiglu_quant_gmm_deq_operation.h similarity index 100% rename from src/ops_infer/gmm_deq_swiglu_quant_gmm_deq/gmm_deq_swiglu_quant_gmm_deq_operation.h rename to src/ops/ops_infer/gmm_deq_swiglu_quant_gmm_deq/gmm_deq_swiglu_quant_gmm_deq_operation.h diff --git a/src/ops_infer/gmm_deq_swiglu_quant_gmm_deq/gmm_deq_swiglu_quant_gmm_deq_ops_runner.cpp b/src/ops/ops_infer/gmm_deq_swiglu_quant_gmm_deq/gmm_deq_swiglu_quant_gmm_deq_ops_runner.cpp similarity index 100% rename from 
src/ops_infer/gmm_deq_swiglu_quant_gmm_deq/gmm_deq_swiglu_quant_gmm_deq_ops_runner.cpp rename to src/ops/ops_infer/gmm_deq_swiglu_quant_gmm_deq/gmm_deq_swiglu_quant_gmm_deq_ops_runner.cpp diff --git a/src/ops_infer/gmm_deq_swiglu_quant_gmm_deq/gmm_deq_swiglu_quant_gmm_deq_ops_runner.h b/src/ops/ops_infer/gmm_deq_swiglu_quant_gmm_deq/gmm_deq_swiglu_quant_gmm_deq_ops_runner.h similarity index 100% rename from src/ops_infer/gmm_deq_swiglu_quant_gmm_deq/gmm_deq_swiglu_quant_gmm_deq_ops_runner.h rename to src/ops/ops_infer/gmm_deq_swiglu_quant_gmm_deq/gmm_deq_swiglu_quant_gmm_deq_ops_runner.h diff --git a/src/ops_infer/group_topk/group_topk_operation.cpp b/src/ops/ops_infer/group_topk/group_topk_operation.cpp similarity index 100% rename from src/ops_infer/group_topk/group_topk_operation.cpp rename to src/ops/ops_infer/group_topk/group_topk_operation.cpp diff --git a/src/ops_infer/group_topk/group_topk_operation.h b/src/ops/ops_infer/group_topk/group_topk_operation.h similarity index 100% rename from src/ops_infer/group_topk/group_topk_operation.h rename to src/ops/ops_infer/group_topk/group_topk_operation.h diff --git a/src/ops_infer/group_topk/group_topk_ops_runner.cpp b/src/ops/ops_infer/group_topk/group_topk_ops_runner.cpp similarity index 100% rename from src/ops_infer/group_topk/group_topk_ops_runner.cpp rename to src/ops/ops_infer/group_topk/group_topk_ops_runner.cpp diff --git a/src/ops_infer/group_topk/group_topk_ops_runner.h b/src/ops/ops_infer/group_topk/group_topk_ops_runner.h similarity index 100% rename from src/ops_infer/group_topk/group_topk_ops_runner.h rename to src/ops/ops_infer/group_topk/group_topk_ops_runner.h diff --git a/src/ops_infer/grouped_matmul_inplace_add/grouped_matmul_inplace_add_operation.cpp b/src/ops/ops_infer/grouped_matmul_inplace_add/grouped_matmul_inplace_add_operation.cpp similarity index 100% rename from src/ops_infer/grouped_matmul_inplace_add/grouped_matmul_inplace_add_operation.cpp rename to 
src/ops/ops_infer/grouped_matmul_inplace_add/grouped_matmul_inplace_add_operation.cpp diff --git a/src/ops_infer/grouped_matmul_inplace_add/grouped_matmul_inplace_add_operation.h b/src/ops/ops_infer/grouped_matmul_inplace_add/grouped_matmul_inplace_add_operation.h similarity index 100% rename from src/ops_infer/grouped_matmul_inplace_add/grouped_matmul_inplace_add_operation.h rename to src/ops/ops_infer/grouped_matmul_inplace_add/grouped_matmul_inplace_add_operation.h diff --git a/src/ops_infer/grouped_matmul_inplace_add/grouped_matmul_inplace_add_ops_runner.cpp b/src/ops/ops_infer/grouped_matmul_inplace_add/grouped_matmul_inplace_add_ops_runner.cpp similarity index 100% rename from src/ops_infer/grouped_matmul_inplace_add/grouped_matmul_inplace_add_ops_runner.cpp rename to src/ops/ops_infer/grouped_matmul_inplace_add/grouped_matmul_inplace_add_ops_runner.cpp diff --git a/src/ops_infer/grouped_matmul_inplace_add/grouped_matmul_inplace_add_ops_runner.h b/src/ops/ops_infer/grouped_matmul_inplace_add/grouped_matmul_inplace_add_ops_runner.h similarity index 100% rename from src/ops_infer/grouped_matmul_inplace_add/grouped_matmul_inplace_add_ops_runner.h rename to src/ops/ops_infer/grouped_matmul_inplace_add/grouped_matmul_inplace_add_ops_runner.h diff --git a/src/ops_infer/grouped_matmul_with_routing/grouped_matmul_with_routing_operation.cpp b/src/ops/ops_infer/grouped_matmul_with_routing/grouped_matmul_with_routing_operation.cpp similarity index 100% rename from src/ops_infer/grouped_matmul_with_routing/grouped_matmul_with_routing_operation.cpp rename to src/ops/ops_infer/grouped_matmul_with_routing/grouped_matmul_with_routing_operation.cpp diff --git a/src/ops_infer/grouped_matmul_with_routing/grouped_matmul_with_routing_operation.h b/src/ops/ops_infer/grouped_matmul_with_routing/grouped_matmul_with_routing_operation.h similarity index 100% rename from src/ops_infer/grouped_matmul_with_routing/grouped_matmul_with_routing_operation.h rename to 
src/ops/ops_infer/grouped_matmul_with_routing/grouped_matmul_with_routing_operation.h diff --git a/src/ops_infer/grouped_matmul_with_routing/grouped_matmul_with_routing_runner.cpp b/src/ops/ops_infer/grouped_matmul_with_routing/grouped_matmul_with_routing_runner.cpp similarity index 100% rename from src/ops_infer/grouped_matmul_with_routing/grouped_matmul_with_routing_runner.cpp rename to src/ops/ops_infer/grouped_matmul_with_routing/grouped_matmul_with_routing_runner.cpp diff --git a/src/ops_infer/grouped_matmul_with_routing/grouped_matmul_with_routing_runner.h b/src/ops/ops_infer/grouped_matmul_with_routing/grouped_matmul_with_routing_runner.h similarity index 100% rename from src/ops_infer/grouped_matmul_with_routing/grouped_matmul_with_routing_runner.h rename to src/ops/ops_infer/grouped_matmul_with_routing/grouped_matmul_with_routing_runner.h diff --git a/src/ops_infer/index_add/index_add_operation.cpp b/src/ops/ops_infer/index_add/index_add_operation.cpp similarity index 100% rename from src/ops_infer/index_add/index_add_operation.cpp rename to src/ops/ops_infer/index_add/index_add_operation.cpp diff --git a/src/ops_infer/index_add/index_add_operation.h b/src/ops/ops_infer/index_add/index_add_operation.h similarity index 100% rename from src/ops_infer/index_add/index_add_operation.h rename to src/ops/ops_infer/index_add/index_add_operation.h diff --git a/src/ops_infer/index_add/index_add_ops_runner.cpp b/src/ops/ops_infer/index_add/index_add_ops_runner.cpp similarity index 100% rename from src/ops_infer/index_add/index_add_ops_runner.cpp rename to src/ops/ops_infer/index_add/index_add_ops_runner.cpp diff --git a/src/ops_infer/index_add/index_add_ops_runner.h b/src/ops/ops_infer/index_add/index_add_ops_runner.h similarity index 100% rename from src/ops_infer/index_add/index_add_ops_runner.h rename to src/ops/ops_infer/index_add/index_add_ops_runner.h diff --git a/src/ops_infer/kv_cache/kv_cache_operation.cpp b/src/ops/ops_infer/kv_cache/kv_cache_operation.cpp 
similarity index 100% rename from src/ops_infer/kv_cache/kv_cache_operation.cpp rename to src/ops/ops_infer/kv_cache/kv_cache_operation.cpp diff --git a/src/ops_infer/kv_cache/kv_cache_operation.h b/src/ops/ops_infer/kv_cache/kv_cache_operation.h similarity index 100% rename from src/ops_infer/kv_cache/kv_cache_operation.h rename to src/ops/ops_infer/kv_cache/kv_cache_operation.h diff --git a/src/ops_infer/kv_cache/kv_cache_ops_runner.cpp b/src/ops/ops_infer/kv_cache/kv_cache_ops_runner.cpp similarity index 100% rename from src/ops_infer/kv_cache/kv_cache_ops_runner.cpp rename to src/ops/ops_infer/kv_cache/kv_cache_ops_runner.cpp diff --git a/src/ops_infer/kv_cache/kv_cache_ops_runner.h b/src/ops/ops_infer/kv_cache/kv_cache_ops_runner.h similarity index 100% rename from src/ops_infer/kv_cache/kv_cache_ops_runner.h rename to src/ops/ops_infer/kv_cache/kv_cache_ops_runner.h diff --git a/src/ops_infer/layer_norm/layer_norm_operation.cpp b/src/ops/ops_infer/layer_norm/layer_norm_operation.cpp similarity index 100% rename from src/ops_infer/layer_norm/layer_norm_operation.cpp rename to src/ops/ops_infer/layer_norm/layer_norm_operation.cpp diff --git a/src/ops_infer/layer_norm/layer_norm_operation.h b/src/ops/ops_infer/layer_norm/layer_norm_operation.h similarity index 100% rename from src/ops_infer/layer_norm/layer_norm_operation.h rename to src/ops/ops_infer/layer_norm/layer_norm_operation.h diff --git a/src/ops_infer/layer_norm/layer_norm_ops_runner.cpp b/src/ops/ops_infer/layer_norm/layer_norm_ops_runner.cpp similarity index 100% rename from src/ops_infer/layer_norm/layer_norm_ops_runner.cpp rename to src/ops/ops_infer/layer_norm/layer_norm_ops_runner.cpp diff --git a/src/ops_infer/layer_norm/layer_norm_ops_runner.h b/src/ops/ops_infer/layer_norm/layer_norm_ops_runner.h similarity index 100% rename from src/ops_infer/layer_norm/layer_norm_ops_runner.h rename to src/ops/ops_infer/layer_norm/layer_norm_ops_runner.h diff --git 
a/src/ops_infer/layer_norm_with_stride/layer_norm_with_stride_operation.cpp b/src/ops/ops_infer/layer_norm_with_stride/layer_norm_with_stride_operation.cpp similarity index 100% rename from src/ops_infer/layer_norm_with_stride/layer_norm_with_stride_operation.cpp rename to src/ops/ops_infer/layer_norm_with_stride/layer_norm_with_stride_operation.cpp diff --git a/src/ops_infer/layer_norm_with_stride/layer_norm_with_stride_operation.h b/src/ops/ops_infer/layer_norm_with_stride/layer_norm_with_stride_operation.h similarity index 100% rename from src/ops_infer/layer_norm_with_stride/layer_norm_with_stride_operation.h rename to src/ops/ops_infer/layer_norm_with_stride/layer_norm_with_stride_operation.h diff --git a/src/ops_infer/layer_norm_with_stride/layer_norm_with_stride_ops_runner.cpp b/src/ops/ops_infer/layer_norm_with_stride/layer_norm_with_stride_ops_runner.cpp similarity index 100% rename from src/ops_infer/layer_norm_with_stride/layer_norm_with_stride_ops_runner.cpp rename to src/ops/ops_infer/layer_norm_with_stride/layer_norm_with_stride_ops_runner.cpp diff --git a/src/ops_infer/layer_norm_with_stride/layer_norm_with_stride_ops_runner.h b/src/ops/ops_infer/layer_norm_with_stride/layer_norm_with_stride_ops_runner.h similarity index 100% rename from src/ops_infer/layer_norm_with_stride/layer_norm_with_stride_ops_runner.h rename to src/ops/ops_infer/layer_norm_with_stride/layer_norm_with_stride_ops_runner.h diff --git a/src/ops_infer/linear/linear_operation.cpp b/src/ops/ops_infer/linear/linear_operation.cpp similarity index 100% rename from src/ops_infer/linear/linear_operation.cpp rename to src/ops/ops_infer/linear/linear_operation.cpp diff --git a/src/ops_infer/linear/linear_operation.h b/src/ops/ops_infer/linear/linear_operation.h similarity index 100% rename from src/ops_infer/linear/linear_operation.h rename to src/ops/ops_infer/linear/linear_operation.h diff --git a/src/ops_infer/linear/linear_ops_runner.cpp b/src/ops/ops_infer/linear/linear_ops_runner.cpp 
similarity index 100% rename from src/ops_infer/linear/linear_ops_runner.cpp rename to src/ops/ops_infer/linear/linear_ops_runner.cpp diff --git a/src/ops_infer/linear/linear_ops_runner.h b/src/ops/ops_infer/linear/linear_ops_runner.h similarity index 100% rename from src/ops_infer/linear/linear_ops_runner.h rename to src/ops/ops_infer/linear/linear_ops_runner.h diff --git a/src/ops_infer/linear_parallel/linear_parallel_aclnn_runner.cpp b/src/ops/ops_infer/linear_parallel/linear_parallel_aclnn_runner.cpp similarity index 100% rename from src/ops_infer/linear_parallel/linear_parallel_aclnn_runner.cpp rename to src/ops/ops_infer/linear_parallel/linear_parallel_aclnn_runner.cpp diff --git a/src/ops_infer/linear_parallel/linear_parallel_aclnn_runner.h b/src/ops/ops_infer/linear_parallel/linear_parallel_aclnn_runner.h similarity index 100% rename from src/ops_infer/linear_parallel/linear_parallel_aclnn_runner.h rename to src/ops/ops_infer/linear_parallel/linear_parallel_aclnn_runner.h diff --git a/src/ops_infer/linear_parallel/linear_parallel_graph_runner.cpp b/src/ops/ops_infer/linear_parallel/linear_parallel_graph_runner.cpp similarity index 100% rename from src/ops_infer/linear_parallel/linear_parallel_graph_runner.cpp rename to src/ops/ops_infer/linear_parallel/linear_parallel_graph_runner.cpp diff --git a/src/ops_infer/linear_parallel/linear_parallel_graph_runner.h b/src/ops/ops_infer/linear_parallel/linear_parallel_graph_runner.h similarity index 100% rename from src/ops_infer/linear_parallel/linear_parallel_graph_runner.h rename to src/ops/ops_infer/linear_parallel/linear_parallel_graph_runner.h diff --git a/src/ops_infer/linear_parallel/linear_parallel_lcoc_runner.cpp b/src/ops/ops_infer/linear_parallel/linear_parallel_lcoc_runner.cpp similarity index 100% rename from src/ops_infer/linear_parallel/linear_parallel_lcoc_runner.cpp rename to src/ops/ops_infer/linear_parallel/linear_parallel_lcoc_runner.cpp diff --git 
a/src/ops_infer/linear_parallel/linear_parallel_lcoc_runner.h b/src/ops/ops_infer/linear_parallel/linear_parallel_lcoc_runner.h similarity index 100% rename from src/ops_infer/linear_parallel/linear_parallel_lcoc_runner.h rename to src/ops/ops_infer/linear_parallel/linear_parallel_lcoc_runner.h diff --git a/src/ops_infer/linear_parallel/linear_parallel_operation.cpp b/src/ops/ops_infer/linear_parallel/linear_parallel_operation.cpp similarity index 100% rename from src/ops_infer/linear_parallel/linear_parallel_operation.cpp rename to src/ops/ops_infer/linear_parallel/linear_parallel_operation.cpp diff --git a/src/ops_infer/linear_parallel/linear_parallel_operation.h b/src/ops/ops_infer/linear_parallel/linear_parallel_operation.h similarity index 100% rename from src/ops_infer/linear_parallel/linear_parallel_operation.h rename to src/ops/ops_infer/linear_parallel/linear_parallel_operation.h diff --git a/src/ops_infer/linear_sparse/linear_sparse_operation.cpp b/src/ops/ops_infer/linear_sparse/linear_sparse_operation.cpp similarity index 100% rename from src/ops_infer/linear_sparse/linear_sparse_operation.cpp rename to src/ops/ops_infer/linear_sparse/linear_sparse_operation.cpp diff --git a/src/ops_infer/linear_sparse/linear_sparse_operation.h b/src/ops/ops_infer/linear_sparse/linear_sparse_operation.h similarity index 100% rename from src/ops_infer/linear_sparse/linear_sparse_operation.h rename to src/ops/ops_infer/linear_sparse/linear_sparse_operation.h diff --git a/src/ops_infer/linear_sparse/linear_sparse_ops_runner.cpp b/src/ops/ops_infer/linear_sparse/linear_sparse_ops_runner.cpp similarity index 100% rename from src/ops_infer/linear_sparse/linear_sparse_ops_runner.cpp rename to src/ops/ops_infer/linear_sparse/linear_sparse_ops_runner.cpp diff --git a/src/ops_infer/linear_sparse/linear_sparse_ops_runner.h b/src/ops/ops_infer/linear_sparse/linear_sparse_ops_runner.h similarity index 100% rename from src/ops_infer/linear_sparse/linear_sparse_ops_runner.h rename to 
src/ops/ops_infer/linear_sparse/linear_sparse_ops_runner.h diff --git a/src/ops_infer/mla_preprocess/atb_acl_mla_preprocess.cpp b/src/ops/ops_infer/mla_preprocess/atb_acl_mla_preprocess.cpp similarity index 100% rename from src/ops_infer/mla_preprocess/atb_acl_mla_preprocess.cpp rename to src/ops/ops_infer/mla_preprocess/atb_acl_mla_preprocess.cpp diff --git a/src/ops_infer/mla_preprocess/mla_preprocess_aclnn_runner.cpp b/src/ops/ops_infer/mla_preprocess/mla_preprocess_aclnn_runner.cpp similarity index 100% rename from src/ops_infer/mla_preprocess/mla_preprocess_aclnn_runner.cpp rename to src/ops/ops_infer/mla_preprocess/mla_preprocess_aclnn_runner.cpp diff --git a/src/ops_infer/mla_preprocess/mla_preprocess_aclnn_runner.h b/src/ops/ops_infer/mla_preprocess/mla_preprocess_aclnn_runner.h similarity index 100% rename from src/ops_infer/mla_preprocess/mla_preprocess_aclnn_runner.h rename to src/ops/ops_infer/mla_preprocess/mla_preprocess_aclnn_runner.h diff --git a/src/ops_infer/mla_preprocess/mla_preprocess_operation.cpp b/src/ops/ops_infer/mla_preprocess/mla_preprocess_operation.cpp similarity index 100% rename from src/ops_infer/mla_preprocess/mla_preprocess_operation.cpp rename to src/ops/ops_infer/mla_preprocess/mla_preprocess_operation.cpp diff --git a/src/ops_infer/mla_preprocess/mla_preprocess_operation.h b/src/ops/ops_infer/mla_preprocess/mla_preprocess_operation.h similarity index 100% rename from src/ops_infer/mla_preprocess/mla_preprocess_operation.h rename to src/ops/ops_infer/mla_preprocess/mla_preprocess_operation.h diff --git a/src/ops_infer/mla_preprocess/mla_preprocess_ops_runner.cpp b/src/ops/ops_infer/mla_preprocess/mla_preprocess_ops_runner.cpp similarity index 100% rename from src/ops_infer/mla_preprocess/mla_preprocess_ops_runner.cpp rename to src/ops/ops_infer/mla_preprocess/mla_preprocess_ops_runner.cpp diff --git a/src/ops_infer/mla_preprocess/mla_preprocess_ops_runner.h b/src/ops/ops_infer/mla_preprocess/mla_preprocess_ops_runner.h 
similarity index 100% rename from src/ops_infer/mla_preprocess/mla_preprocess_ops_runner.h rename to src/ops/ops_infer/mla_preprocess/mla_preprocess_ops_runner.h diff --git a/src/ops_infer/mla_preprocess/mla_preprocess_ops_runner_split.cpp b/src/ops/ops_infer/mla_preprocess/mla_preprocess_ops_runner_split.cpp similarity index 100% rename from src/ops_infer/mla_preprocess/mla_preprocess_ops_runner_split.cpp rename to src/ops/ops_infer/mla_preprocess/mla_preprocess_ops_runner_split.cpp diff --git a/src/ops_infer/mla_preprocess/mla_preprocess_ops_runner_split.h b/src/ops/ops_infer/mla_preprocess/mla_preprocess_ops_runner_split.h similarity index 100% rename from src/ops_infer/mla_preprocess/mla_preprocess_ops_runner_split.h rename to src/ops/ops_infer/mla_preprocess/mla_preprocess_ops_runner_split.h diff --git a/src/ops_infer/mm_deq_swiglu_quant_mm_deq/mm_deq_swiglu_quant_mm_deq_operation.cpp b/src/ops/ops_infer/mm_deq_swiglu_quant_mm_deq/mm_deq_swiglu_quant_mm_deq_operation.cpp similarity index 100% rename from src/ops_infer/mm_deq_swiglu_quant_mm_deq/mm_deq_swiglu_quant_mm_deq_operation.cpp rename to src/ops/ops_infer/mm_deq_swiglu_quant_mm_deq/mm_deq_swiglu_quant_mm_deq_operation.cpp diff --git a/src/ops_infer/mm_deq_swiglu_quant_mm_deq/mm_deq_swiglu_quant_mm_deq_operation.h b/src/ops/ops_infer/mm_deq_swiglu_quant_mm_deq/mm_deq_swiglu_quant_mm_deq_operation.h similarity index 100% rename from src/ops_infer/mm_deq_swiglu_quant_mm_deq/mm_deq_swiglu_quant_mm_deq_operation.h rename to src/ops/ops_infer/mm_deq_swiglu_quant_mm_deq/mm_deq_swiglu_quant_mm_deq_operation.h diff --git a/src/ops_infer/mm_deq_swiglu_quant_mm_deq/mm_deq_swiglu_quant_mm_deq_ops_runner.cpp b/src/ops/ops_infer/mm_deq_swiglu_quant_mm_deq/mm_deq_swiglu_quant_mm_deq_ops_runner.cpp similarity index 100% rename from src/ops_infer/mm_deq_swiglu_quant_mm_deq/mm_deq_swiglu_quant_mm_deq_ops_runner.cpp rename to src/ops/ops_infer/mm_deq_swiglu_quant_mm_deq/mm_deq_swiglu_quant_mm_deq_ops_runner.cpp diff --git 
a/src/ops_infer/mm_deq_swiglu_quant_mm_deq/mm_deq_swiglu_quant_mm_deq_ops_runner.h b/src/ops/ops_infer/mm_deq_swiglu_quant_mm_deq/mm_deq_swiglu_quant_mm_deq_ops_runner.h similarity index 100% rename from src/ops_infer/mm_deq_swiglu_quant_mm_deq/mm_deq_swiglu_quant_mm_deq_ops_runner.h rename to src/ops/ops_infer/mm_deq_swiglu_quant_mm_deq/mm_deq_swiglu_quant_mm_deq_ops_runner.h diff --git a/src/ops_infer/multi_latent_attention/atb_acl_mla.cpp b/src/ops/ops_infer/multi_latent_attention/atb_acl_mla.cpp similarity index 100% rename from src/ops_infer/multi_latent_attention/atb_acl_mla.cpp rename to src/ops/ops_infer/multi_latent_attention/atb_acl_mla.cpp diff --git a/src/ops_infer/multi_latent_attention/multi_latent_attention_operation.cpp b/src/ops/ops_infer/multi_latent_attention/multi_latent_attention_operation.cpp similarity index 100% rename from src/ops_infer/multi_latent_attention/multi_latent_attention_operation.cpp rename to src/ops/ops_infer/multi_latent_attention/multi_latent_attention_operation.cpp diff --git a/src/ops_infer/multi_latent_attention/multi_latent_attention_operation.h b/src/ops/ops_infer/multi_latent_attention/multi_latent_attention_operation.h similarity index 100% rename from src/ops_infer/multi_latent_attention/multi_latent_attention_operation.h rename to src/ops/ops_infer/multi_latent_attention/multi_latent_attention_operation.h diff --git a/src/ops_infer/multi_latent_attention/multi_latent_attention_ops_runner.cpp b/src/ops/ops_infer/multi_latent_attention/multi_latent_attention_ops_runner.cpp similarity index 100% rename from src/ops_infer/multi_latent_attention/multi_latent_attention_ops_runner.cpp rename to src/ops/ops_infer/multi_latent_attention/multi_latent_attention_ops_runner.cpp diff --git a/src/ops_infer/multi_latent_attention/multi_latent_attention_ops_runner.h b/src/ops/ops_infer/multi_latent_attention/multi_latent_attention_ops_runner.h similarity index 100% rename from 
src/ops_infer/multi_latent_attention/multi_latent_attention_ops_runner.h rename to src/ops/ops_infer/multi_latent_attention/multi_latent_attention_ops_runner.h diff --git a/src/ops_infer/multi_latent_attention/multi_latent_attention_ops_runner_prefill.cpp b/src/ops/ops_infer/multi_latent_attention/multi_latent_attention_ops_runner_prefill.cpp similarity index 100% rename from src/ops_infer/multi_latent_attention/multi_latent_attention_ops_runner_prefill.cpp rename to src/ops/ops_infer/multi_latent_attention/multi_latent_attention_ops_runner_prefill.cpp diff --git a/src/ops_infer/multi_latent_attention/multi_latent_attention_ops_runner_prefill.h b/src/ops/ops_infer/multi_latent_attention/multi_latent_attention_ops_runner_prefill.h similarity index 100% rename from src/ops_infer/multi_latent_attention/multi_latent_attention_ops_runner_prefill.h rename to src/ops/ops_infer/multi_latent_attention/multi_latent_attention_ops_runner_prefill.h diff --git a/src/ops_infer/multi_latent_attention/param.cpp b/src/ops/ops_infer/multi_latent_attention/param.cpp similarity index 100% rename from src/ops_infer/multi_latent_attention/param.cpp rename to src/ops/ops_infer/multi_latent_attention/param.cpp diff --git a/src/ops_infer/multi_latent_attention/param.h b/src/ops/ops_infer/multi_latent_attention/param.h similarity index 100% rename from src/ops_infer/multi_latent_attention/param.h rename to src/ops/ops_infer/multi_latent_attention/param.h diff --git a/src/ops_infer/multinomial/multinomial_operation.cpp b/src/ops/ops_infer/multinomial/multinomial_operation.cpp similarity index 100% rename from src/ops_infer/multinomial/multinomial_operation.cpp rename to src/ops/ops_infer/multinomial/multinomial_operation.cpp diff --git a/src/ops_infer/multinomial/multinomial_operation.h b/src/ops/ops_infer/multinomial/multinomial_operation.h similarity index 100% rename from src/ops_infer/multinomial/multinomial_operation.h rename to src/ops/ops_infer/multinomial/multinomial_operation.h diff 
--git a/src/ops_infer/multinomial/multinomial_ops_runner.cpp b/src/ops/ops_infer/multinomial/multinomial_ops_runner.cpp similarity index 100% rename from src/ops_infer/multinomial/multinomial_ops_runner.cpp rename to src/ops/ops_infer/multinomial/multinomial_ops_runner.cpp diff --git a/src/ops_infer/multinomial/multinomial_ops_runner.h b/src/ops/ops_infer/multinomial/multinomial_ops_runner.h similarity index 100% rename from src/ops_infer/multinomial/multinomial_ops_runner.h rename to src/ops/ops_infer/multinomial/multinomial_ops_runner.h diff --git a/src/ops_infer/nonzero/nonzero_operation.cpp b/src/ops/ops_infer/nonzero/nonzero_operation.cpp similarity index 100% rename from src/ops_infer/nonzero/nonzero_operation.cpp rename to src/ops/ops_infer/nonzero/nonzero_operation.cpp diff --git a/src/ops_infer/nonzero/nonzero_operation.h b/src/ops/ops_infer/nonzero/nonzero_operation.h similarity index 100% rename from src/ops_infer/nonzero/nonzero_operation.h rename to src/ops/ops_infer/nonzero/nonzero_operation.h diff --git a/src/ops_infer/nonzero/nonzero_runner.cpp b/src/ops/ops_infer/nonzero/nonzero_runner.cpp similarity index 100% rename from src/ops_infer/nonzero/nonzero_runner.cpp rename to src/ops/ops_infer/nonzero/nonzero_runner.cpp diff --git a/src/ops_infer/nonzero/nonzero_runner.h b/src/ops/ops_infer/nonzero/nonzero_runner.h similarity index 100% rename from src/ops_infer/nonzero/nonzero_runner.h rename to src/ops/ops_infer/nonzero/nonzero_runner.h diff --git a/src/ops_infer/norm_rope_reshape/norm_rope_reshape_operation.cpp b/src/ops/ops_infer/norm_rope_reshape/norm_rope_reshape_operation.cpp similarity index 97% rename from src/ops_infer/norm_rope_reshape/norm_rope_reshape_operation.cpp rename to src/ops/ops_infer/norm_rope_reshape/norm_rope_reshape_operation.cpp index 00278615..864df1ca 100644 --- a/src/ops_infer/norm_rope_reshape/norm_rope_reshape_operation.cpp +++ b/src/ops/ops_infer/norm_rope_reshape/norm_rope_reshape_operation.cpp @@ -55,8 +55,7 @@ 
template <> Status CreateOperation(const infer::NormRopeReshapeParam &opParam, O return NO_ERROR; } -NormRopeReshapeOperation::NormRopeReshapeOperation -(const infer::NormRopeReshapeParam ¶m) +NormRopeReshapeOperation::NormRopeReshapeOperation(const infer::NormRopeReshapeParam ¶m) : OperationBase("NormRopeReshapeOperation"), param_(param) { operationIr_ = GetSingleton().GetOperationIr("NormRopeReshapeOperation"); @@ -206,8 +205,8 @@ Status NormRopeReshapeOperation::CheckOutTensorSame return NO_ERROR; } -bool NormRopeReshapeOperation::GammaBetaTensorCheck -(const TensorDesc &xTensorDesc, const TensorDesc &tensorDesc2) const +bool NormRopeReshapeOperation::GammaBetaTensorCheck( + const TensorDesc &xTensorDesc, const TensorDesc &tensorDesc2) const { int embedDim = xTensorDesc.shape.dims[xTensorDesc.shape.dimNum - 1]; if (xTensorDesc.dtype != tensorDesc2.dtype || xTensorDesc.format != tensorDesc2.format) { diff --git a/src/ops_infer/norm_rope_reshape/norm_rope_reshape_operation.h b/src/ops/ops_infer/norm_rope_reshape/norm_rope_reshape_operation.h similarity index 100% rename from src/ops_infer/norm_rope_reshape/norm_rope_reshape_operation.h rename to src/ops/ops_infer/norm_rope_reshape/norm_rope_reshape_operation.h diff --git a/src/ops_infer/norm_rope_reshape/norm_rope_reshape_ops_runner.cpp b/src/ops/ops_infer/norm_rope_reshape/norm_rope_reshape_ops_runner.cpp similarity index 92% rename from src/ops_infer/norm_rope_reshape/norm_rope_reshape_ops_runner.cpp rename to src/ops/ops_infer/norm_rope_reshape/norm_rope_reshape_ops_runner.cpp index 8bcf8c45..bca072d9 100644 --- a/src/ops_infer/norm_rope_reshape/norm_rope_reshape_ops_runner.cpp +++ b/src/ops/ops_infer/norm_rope_reshape/norm_rope_reshape_ops_runner.cpp @@ -22,8 +22,8 @@ void NormRopeReshapeOpsRunner::SetNormRopeReshapeParam( asdopsParam.rotaryCoeff = inferParam.rotaryCoeff; } -void NormRopeReshapeOpsRunner::BuildNormRopeReshapeGraph -(const AtbOps::OpParam::RmsNormAndRopeAndReshapeAndCache &normRopeReshapeParam) 
+void NormRopeReshapeOpsRunner::BuildNormRopeReshapeGraph( + const AtbOps::OpParam::RmsNormAndRopeAndReshapeAndCache &normRopeReshapeParam) { kernelGraph_.inTensors.resize(IN_TENSOR_COUNT_SEVEN); size_t inId = 0; @@ -47,8 +47,7 @@ void NormRopeReshapeOpsRunner::BuildNormRopeReshapeGraph normRopeReshapeNode.outTensors = {&keycacheOutTensor}; } -NormRopeReshapeOpsRunner::NormRopeReshapeOpsRunner -(const infer::NormRopeReshapeParam ¶m) +NormRopeReshapeOpsRunner::NormRopeReshapeOpsRunner(const infer::NormRopeReshapeParam ¶m) : OpsRunner("NormRopeReshapeOpsRunner", RUNNER_TYPE_NORM_ROPE_RESHAPE), param_(param) { AtbOps::OpParam::RmsNormAndRopeAndReshapeAndCache rmsNormAndRopeAndReshapeAndCacheParam; diff --git a/src/ops_infer/norm_rope_reshape/norm_rope_reshape_ops_runner.h b/src/ops/ops_infer/norm_rope_reshape/norm_rope_reshape_ops_runner.h similarity index 100% rename from src/ops_infer/norm_rope_reshape/norm_rope_reshape_ops_runner.h rename to src/ops/ops_infer/norm_rope_reshape/norm_rope_reshape_ops_runner.h diff --git a/src/ops_infer/onehot/onehot_operation.cpp b/src/ops/ops_infer/onehot/onehot_operation.cpp similarity index 100% rename from src/ops_infer/onehot/onehot_operation.cpp rename to src/ops/ops_infer/onehot/onehot_operation.cpp diff --git a/src/ops_infer/onehot/onehot_operation.h b/src/ops/ops_infer/onehot/onehot_operation.h similarity index 100% rename from src/ops_infer/onehot/onehot_operation.h rename to src/ops/ops_infer/onehot/onehot_operation.h diff --git a/src/ops_infer/onehot/onehot_ops_runner.cpp b/src/ops/ops_infer/onehot/onehot_ops_runner.cpp similarity index 100% rename from src/ops_infer/onehot/onehot_ops_runner.cpp rename to src/ops/ops_infer/onehot/onehot_ops_runner.cpp diff --git a/src/ops_infer/onehot/onehot_ops_runner.h b/src/ops/ops_infer/onehot/onehot_ops_runner.h similarity index 100% rename from src/ops_infer/onehot/onehot_ops_runner.h rename to src/ops/ops_infer/onehot/onehot_ops_runner.h diff --git 
a/src/ops_infer/pad/pad_operation.cpp b/src/ops/ops_infer/pad/pad_operation.cpp similarity index 100% rename from src/ops_infer/pad/pad_operation.cpp rename to src/ops/ops_infer/pad/pad_operation.cpp diff --git a/src/ops_infer/pad/pad_operation.h b/src/ops/ops_infer/pad/pad_operation.h similarity index 100% rename from src/ops_infer/pad/pad_operation.h rename to src/ops/ops_infer/pad/pad_operation.h diff --git a/src/ops_infer/pad/pad_ops_runner.cpp b/src/ops/ops_infer/pad/pad_ops_runner.cpp similarity index 100% rename from src/ops_infer/pad/pad_ops_runner.cpp rename to src/ops/ops_infer/pad/pad_ops_runner.cpp diff --git a/src/ops_infer/pad/pad_ops_runner.h b/src/ops/ops_infer/pad/pad_ops_runner.h similarity index 100% rename from src/ops_infer/pad/pad_ops_runner.h rename to src/ops/ops_infer/pad/pad_ops_runner.h diff --git a/src/ops_infer/paged_attention/paged_attention_operation.cpp b/src/ops/ops_infer/paged_attention/paged_attention_operation.cpp similarity index 100% rename from src/ops_infer/paged_attention/paged_attention_operation.cpp rename to src/ops/ops_infer/paged_attention/paged_attention_operation.cpp diff --git a/src/ops_infer/paged_attention/paged_attention_operation.h b/src/ops/ops_infer/paged_attention/paged_attention_operation.h similarity index 100% rename from src/ops_infer/paged_attention/paged_attention_operation.h rename to src/ops/ops_infer/paged_attention/paged_attention_operation.h diff --git a/src/ops_infer/paged_attention/paged_attention_ops_runner.cpp b/src/ops/ops_infer/paged_attention/paged_attention_ops_runner.cpp similarity index 100% rename from src/ops_infer/paged_attention/paged_attention_ops_runner.cpp rename to src/ops/ops_infer/paged_attention/paged_attention_ops_runner.cpp diff --git a/src/ops_infer/paged_attention/paged_attention_ops_runner.h b/src/ops/ops_infer/paged_attention/paged_attention_ops_runner.h similarity index 100% rename from src/ops_infer/paged_attention/paged_attention_ops_runner.h rename to 
src/ops/ops_infer/paged_attention/paged_attention_ops_runner.h diff --git a/src/ops_infer/paged_attention/paged_attention_ops_runner_910a.cpp b/src/ops/ops_infer/paged_attention/paged_attention_ops_runner_910a.cpp similarity index 100% rename from src/ops_infer/paged_attention/paged_attention_ops_runner_910a.cpp rename to src/ops/ops_infer/paged_attention/paged_attention_ops_runner_910a.cpp diff --git a/src/ops_infer/paged_attention/paged_attention_ops_runner_910a.h b/src/ops/ops_infer/paged_attention/paged_attention_ops_runner_910a.h similarity index 100% rename from src/ops_infer/paged_attention/paged_attention_ops_runner_910a.h rename to src/ops/ops_infer/paged_attention/paged_attention_ops_runner_910a.h diff --git a/src/ops_infer/paged_attention/paged_attention_runner_utils.cpp b/src/ops/ops_infer/paged_attention/paged_attention_runner_utils.cpp similarity index 100% rename from src/ops_infer/paged_attention/paged_attention_runner_utils.cpp rename to src/ops/ops_infer/paged_attention/paged_attention_runner_utils.cpp diff --git a/src/ops_infer/paged_attention/paged_attention_runner_utils.h b/src/ops/ops_infer/paged_attention/paged_attention_runner_utils.h similarity index 100% rename from src/ops_infer/paged_attention/paged_attention_runner_utils.h rename to src/ops/ops_infer/paged_attention/paged_attention_runner_utils.h diff --git a/src/ops_infer/paged_attention/param.cpp b/src/ops/ops_infer/paged_attention/param.cpp similarity index 100% rename from src/ops_infer/paged_attention/param.cpp rename to src/ops/ops_infer/paged_attention/param.cpp diff --git a/src/ops_infer/paged_attention/param.h b/src/ops/ops_infer/paged_attention/param.h similarity index 100% rename from src/ops_infer/paged_attention/param.h rename to src/ops/ops_infer/paged_attention/param.h diff --git a/src/ops_infer/paged_cache_load/atb_acl_paged_cache_load.cpp b/src/ops/ops_infer/paged_cache_load/atb_acl_paged_cache_load.cpp similarity index 100% rename from 
src/ops_infer/paged_cache_load/atb_acl_paged_cache_load.cpp rename to src/ops/ops_infer/paged_cache_load/atb_acl_paged_cache_load.cpp diff --git a/src/ops_infer/paged_cache_load/paged_cache_load_operation.cpp b/src/ops/ops_infer/paged_cache_load/paged_cache_load_operation.cpp similarity index 100% rename from src/ops_infer/paged_cache_load/paged_cache_load_operation.cpp rename to src/ops/ops_infer/paged_cache_load/paged_cache_load_operation.cpp diff --git a/src/ops_infer/paged_cache_load/paged_cache_load_operation.h b/src/ops/ops_infer/paged_cache_load/paged_cache_load_operation.h similarity index 100% rename from src/ops_infer/paged_cache_load/paged_cache_load_operation.h rename to src/ops/ops_infer/paged_cache_load/paged_cache_load_operation.h diff --git a/src/ops_infer/paged_cache_load/paged_cache_load_ops_runner.cpp b/src/ops/ops_infer/paged_cache_load/paged_cache_load_ops_runner.cpp similarity index 100% rename from src/ops_infer/paged_cache_load/paged_cache_load_ops_runner.cpp rename to src/ops/ops_infer/paged_cache_load/paged_cache_load_ops_runner.cpp diff --git a/src/ops_infer/paged_cache_load/paged_cache_load_ops_runner.h b/src/ops/ops_infer/paged_cache_load/paged_cache_load_ops_runner.h similarity index 100% rename from src/ops_infer/paged_cache_load/paged_cache_load_ops_runner.h rename to src/ops/ops_infer/paged_cache_load/paged_cache_load_ops_runner.h diff --git a/src/ops_infer/razor_fusion_attention/razor_fusion_attention_operation.cpp b/src/ops/ops_infer/razor_fusion_attention/razor_fusion_attention_operation.cpp similarity index 100% rename from src/ops_infer/razor_fusion_attention/razor_fusion_attention_operation.cpp rename to src/ops/ops_infer/razor_fusion_attention/razor_fusion_attention_operation.cpp diff --git a/src/ops_infer/razor_fusion_attention/razor_fusion_attention_operation.h b/src/ops/ops_infer/razor_fusion_attention/razor_fusion_attention_operation.h similarity index 100% rename from 
src/ops_infer/razor_fusion_attention/razor_fusion_attention_operation.h rename to src/ops/ops_infer/razor_fusion_attention/razor_fusion_attention_operation.h diff --git a/src/ops_infer/razor_fusion_attention/razor_fusion_attention_ops_runner.cpp b/src/ops/ops_infer/razor_fusion_attention/razor_fusion_attention_ops_runner.cpp similarity index 100% rename from src/ops_infer/razor_fusion_attention/razor_fusion_attention_ops_runner.cpp rename to src/ops/ops_infer/razor_fusion_attention/razor_fusion_attention_ops_runner.cpp diff --git a/src/ops_infer/razor_fusion_attention/razor_fusion_attention_ops_runner.h b/src/ops/ops_infer/razor_fusion_attention/razor_fusion_attention_ops_runner.h similarity index 100% rename from src/ops_infer/razor_fusion_attention/razor_fusion_attention_ops_runner.h rename to src/ops/ops_infer/razor_fusion_attention/razor_fusion_attention_ops_runner.h diff --git a/src/ops_infer/recv/recv_hccl_runner.cpp b/src/ops/ops_infer/recv/recv_hccl_runner.cpp similarity index 100% rename from src/ops_infer/recv/recv_hccl_runner.cpp rename to src/ops/ops_infer/recv/recv_hccl_runner.cpp diff --git a/src/ops_infer/recv/recv_hccl_runner.h b/src/ops/ops_infer/recv/recv_hccl_runner.h similarity index 100% rename from src/ops_infer/recv/recv_hccl_runner.h rename to src/ops/ops_infer/recv/recv_hccl_runner.h diff --git a/src/ops_infer/recv/recv_operation.cpp b/src/ops/ops_infer/recv/recv_operation.cpp similarity index 100% rename from src/ops_infer/recv/recv_operation.cpp rename to src/ops/ops_infer/recv/recv_operation.cpp diff --git a/src/ops_infer/recv/recv_operation.h b/src/ops/ops_infer/recv/recv_operation.h similarity index 100% rename from src/ops_infer/recv/recv_operation.h rename to src/ops/ops_infer/recv/recv_operation.h diff --git a/src/ops_infer/reduce/reduce_operation.cpp b/src/ops/ops_infer/reduce/reduce_operation.cpp similarity index 100% rename from src/ops_infer/reduce/reduce_operation.cpp rename to src/ops/ops_infer/reduce/reduce_operation.cpp diff 
--git a/src/ops_infer/reduce/reduce_operation.h b/src/ops/ops_infer/reduce/reduce_operation.h similarity index 100% rename from src/ops_infer/reduce/reduce_operation.h rename to src/ops/ops_infer/reduce/reduce_operation.h diff --git a/src/ops_infer/reduce/reduce_ops_runner.cpp b/src/ops/ops_infer/reduce/reduce_ops_runner.cpp similarity index 100% rename from src/ops_infer/reduce/reduce_ops_runner.cpp rename to src/ops/ops_infer/reduce/reduce_ops_runner.cpp diff --git a/src/ops_infer/reduce/reduce_ops_runner.h b/src/ops/ops_infer/reduce/reduce_ops_runner.h similarity index 100% rename from src/ops_infer/reduce/reduce_ops_runner.h rename to src/ops/ops_infer/reduce/reduce_ops_runner.h diff --git a/src/ops_infer/reduce_scatter/reduce_scatter_hccl_runner.cpp b/src/ops/ops_infer/reduce_scatter/reduce_scatter_hccl_runner.cpp similarity index 100% rename from src/ops_infer/reduce_scatter/reduce_scatter_hccl_runner.cpp rename to src/ops/ops_infer/reduce_scatter/reduce_scatter_hccl_runner.cpp diff --git a/src/ops_infer/reduce_scatter/reduce_scatter_hccl_runner.h b/src/ops/ops_infer/reduce_scatter/reduce_scatter_hccl_runner.h similarity index 100% rename from src/ops_infer/reduce_scatter/reduce_scatter_hccl_runner.h rename to src/ops/ops_infer/reduce_scatter/reduce_scatter_hccl_runner.h diff --git a/src/ops_infer/reduce_scatter/reduce_scatter_lccl_runner.cpp b/src/ops/ops_infer/reduce_scatter/reduce_scatter_lccl_runner.cpp similarity index 100% rename from src/ops_infer/reduce_scatter/reduce_scatter_lccl_runner.cpp rename to src/ops/ops_infer/reduce_scatter/reduce_scatter_lccl_runner.cpp diff --git a/src/ops_infer/reduce_scatter/reduce_scatter_lccl_runner.h b/src/ops/ops_infer/reduce_scatter/reduce_scatter_lccl_runner.h similarity index 100% rename from src/ops_infer/reduce_scatter/reduce_scatter_lccl_runner.h rename to src/ops/ops_infer/reduce_scatter/reduce_scatter_lccl_runner.h diff --git a/src/ops_infer/reduce_scatter/reduce_scatter_operation.cpp 
b/src/ops/ops_infer/reduce_scatter/reduce_scatter_operation.cpp similarity index 100% rename from src/ops_infer/reduce_scatter/reduce_scatter_operation.cpp rename to src/ops/ops_infer/reduce_scatter/reduce_scatter_operation.cpp diff --git a/src/ops_infer/reduce_scatter/reduce_scatter_operation.h b/src/ops/ops_infer/reduce_scatter/reduce_scatter_operation.h similarity index 100% rename from src/ops_infer/reduce_scatter/reduce_scatter_operation.h rename to src/ops/ops_infer/reduce_scatter/reduce_scatter_operation.h diff --git a/src/ops_infer/reduce_scatterv/reduce_scatterv_hccl_runner.cpp b/src/ops/ops_infer/reduce_scatterv/reduce_scatterv_hccl_runner.cpp similarity index 100% rename from src/ops_infer/reduce_scatterv/reduce_scatterv_hccl_runner.cpp rename to src/ops/ops_infer/reduce_scatterv/reduce_scatterv_hccl_runner.cpp diff --git a/src/ops_infer/reduce_scatterv/reduce_scatterv_hccl_runner.h b/src/ops/ops_infer/reduce_scatterv/reduce_scatterv_hccl_runner.h similarity index 100% rename from src/ops_infer/reduce_scatterv/reduce_scatterv_hccl_runner.h rename to src/ops/ops_infer/reduce_scatterv/reduce_scatterv_hccl_runner.h diff --git a/src/ops_infer/reduce_scatterv/reduce_scatterv_operation.cpp b/src/ops/ops_infer/reduce_scatterv/reduce_scatterv_operation.cpp similarity index 100% rename from src/ops_infer/reduce_scatterv/reduce_scatterv_operation.cpp rename to src/ops/ops_infer/reduce_scatterv/reduce_scatterv_operation.cpp diff --git a/src/ops_infer/reduce_scatterv/reduce_scatterv_operation.h b/src/ops/ops_infer/reduce_scatterv/reduce_scatterv_operation.h similarity index 100% rename from src/ops_infer/reduce_scatterv/reduce_scatterv_operation.h rename to src/ops/ops_infer/reduce_scatterv/reduce_scatterv_operation.h diff --git a/src/ops_infer/relay_attention/param.cpp b/src/ops/ops_infer/relay_attention/param.cpp similarity index 100% rename from src/ops_infer/relay_attention/param.cpp rename to src/ops/ops_infer/relay_attention/param.cpp diff --git 
a/src/ops_infer/relay_attention/param.h b/src/ops/ops_infer/relay_attention/param.h similarity index 100% rename from src/ops_infer/relay_attention/param.h rename to src/ops/ops_infer/relay_attention/param.h diff --git a/src/ops_infer/relay_attention/relay_attention_operation.cpp b/src/ops/ops_infer/relay_attention/relay_attention_operation.cpp similarity index 100% rename from src/ops_infer/relay_attention/relay_attention_operation.cpp rename to src/ops/ops_infer/relay_attention/relay_attention_operation.cpp diff --git a/src/ops_infer/relay_attention/relay_attention_operation.h b/src/ops/ops_infer/relay_attention/relay_attention_operation.h similarity index 100% rename from src/ops_infer/relay_attention/relay_attention_operation.h rename to src/ops/ops_infer/relay_attention/relay_attention_operation.h diff --git a/src/ops_infer/relay_attention/relay_attention_ops_runner.cpp b/src/ops/ops_infer/relay_attention/relay_attention_ops_runner.cpp similarity index 100% rename from src/ops_infer/relay_attention/relay_attention_ops_runner.cpp rename to src/ops/ops_infer/relay_attention/relay_attention_ops_runner.cpp diff --git a/src/ops_infer/relay_attention/relay_attention_ops_runner.h b/src/ops/ops_infer/relay_attention/relay_attention_ops_runner.h similarity index 100% rename from src/ops_infer/relay_attention/relay_attention_ops_runner.h rename to src/ops/ops_infer/relay_attention/relay_attention_ops_runner.h diff --git a/src/ops_infer/repeat/repeat_operation.cpp b/src/ops/ops_infer/repeat/repeat_operation.cpp similarity index 100% rename from src/ops_infer/repeat/repeat_operation.cpp rename to src/ops/ops_infer/repeat/repeat_operation.cpp diff --git a/src/ops_infer/repeat/repeat_operation.h b/src/ops/ops_infer/repeat/repeat_operation.h similarity index 100% rename from src/ops_infer/repeat/repeat_operation.h rename to src/ops/ops_infer/repeat/repeat_operation.h diff --git a/src/ops_infer/repeat/repeat_ops_runner.cpp b/src/ops/ops_infer/repeat/repeat_ops_runner.cpp 
similarity index 100% rename from src/ops_infer/repeat/repeat_ops_runner.cpp rename to src/ops/ops_infer/repeat/repeat_ops_runner.cpp diff --git a/src/ops_infer/repeat/repeat_ops_runner.h b/src/ops/ops_infer/repeat/repeat_ops_runner.h similarity index 100% rename from src/ops_infer/repeat/repeat_ops_runner.h rename to src/ops/ops_infer/repeat/repeat_ops_runner.h diff --git a/src/ops_infer/reshape_and_cache/reshape_and_cache_operation.cpp b/src/ops/ops_infer/reshape_and_cache/reshape_and_cache_operation.cpp similarity index 100% rename from src/ops_infer/reshape_and_cache/reshape_and_cache_operation.cpp rename to src/ops/ops_infer/reshape_and_cache/reshape_and_cache_operation.cpp diff --git a/src/ops_infer/reshape_and_cache/reshape_and_cache_operation.h b/src/ops/ops_infer/reshape_and_cache/reshape_and_cache_operation.h similarity index 100% rename from src/ops_infer/reshape_and_cache/reshape_and_cache_operation.h rename to src/ops/ops_infer/reshape_and_cache/reshape_and_cache_operation.h diff --git a/src/ops_infer/reshape_and_cache/reshape_and_cache_ops_runner.cpp b/src/ops/ops_infer/reshape_and_cache/reshape_and_cache_ops_runner.cpp similarity index 100% rename from src/ops_infer/reshape_and_cache/reshape_and_cache_ops_runner.cpp rename to src/ops/ops_infer/reshape_and_cache/reshape_and_cache_ops_runner.cpp diff --git a/src/ops_infer/reshape_and_cache/reshape_and_cache_ops_runner.h b/src/ops/ops_infer/reshape_and_cache/reshape_and_cache_ops_runner.h similarity index 100% rename from src/ops_infer/reshape_and_cache/reshape_and_cache_ops_runner.h rename to src/ops/ops_infer/reshape_and_cache/reshape_and_cache_ops_runner.h diff --git a/src/ops_infer/reshape_and_cache/reshape_and_cache_ops_runner_310p.cpp b/src/ops/ops_infer/reshape_and_cache/reshape_and_cache_ops_runner_310p.cpp similarity index 100% rename from src/ops_infer/reshape_and_cache/reshape_and_cache_ops_runner_310p.cpp rename to src/ops/ops_infer/reshape_and_cache/reshape_and_cache_ops_runner_310p.cpp 
diff --git a/src/ops_infer/reshape_and_cache/reshape_and_cache_ops_runner_310p.h b/src/ops/ops_infer/reshape_and_cache/reshape_and_cache_ops_runner_310p.h similarity index 100% rename from src/ops_infer/reshape_and_cache/reshape_and_cache_ops_runner_310p.h rename to src/ops/ops_infer/reshape_and_cache/reshape_and_cache_ops_runner_310p.h diff --git a/src/ops_infer/reshape_and_cache/reshape_and_cache_ops_runner_A2_NZ.cpp b/src/ops/ops_infer/reshape_and_cache/reshape_and_cache_ops_runner_A2_NZ.cpp similarity index 100% rename from src/ops_infer/reshape_and_cache/reshape_and_cache_ops_runner_A2_NZ.cpp rename to src/ops/ops_infer/reshape_and_cache/reshape_and_cache_ops_runner_A2_NZ.cpp diff --git a/src/ops_infer/reshape_and_cache/reshape_and_cache_ops_runner_A2_NZ.h b/src/ops/ops_infer/reshape_and_cache/reshape_and_cache_ops_runner_A2_NZ.h similarity index 100% rename from src/ops_infer/reshape_and_cache/reshape_and_cache_ops_runner_A2_NZ.h rename to src/ops/ops_infer/reshape_and_cache/reshape_and_cache_ops_runner_A2_NZ.h diff --git a/src/ops_infer/reshape_and_cache/reshape_and_cache_ops_runner_SISO.cpp b/src/ops/ops_infer/reshape_and_cache/reshape_and_cache_ops_runner_SISO.cpp similarity index 100% rename from src/ops_infer/reshape_and_cache/reshape_and_cache_ops_runner_SISO.cpp rename to src/ops/ops_infer/reshape_and_cache/reshape_and_cache_ops_runner_SISO.cpp diff --git a/src/ops_infer/reshape_and_cache/reshape_and_cache_ops_runner_SISO.h b/src/ops/ops_infer/reshape_and_cache/reshape_and_cache_ops_runner_SISO.h similarity index 100% rename from src/ops_infer/reshape_and_cache/reshape_and_cache_ops_runner_SISO.h rename to src/ops/ops_infer/reshape_and_cache/reshape_and_cache_ops_runner_SISO.h diff --git a/src/ops_infer/reshape_and_cache_omni/reshape_and_cache_omni_operation.cpp b/src/ops/ops_infer/reshape_and_cache_omni/reshape_and_cache_omni_operation.cpp similarity index 100% rename from src/ops_infer/reshape_and_cache_omni/reshape_and_cache_omni_operation.cpp 
rename to src/ops/ops_infer/reshape_and_cache_omni/reshape_and_cache_omni_operation.cpp diff --git a/src/ops_infer/reshape_and_cache_omni/reshape_and_cache_omni_operation.h b/src/ops/ops_infer/reshape_and_cache_omni/reshape_and_cache_omni_operation.h similarity index 100% rename from src/ops_infer/reshape_and_cache_omni/reshape_and_cache_omni_operation.h rename to src/ops/ops_infer/reshape_and_cache_omni/reshape_and_cache_omni_operation.h diff --git a/src/ops_infer/reshape_and_cache_omni/reshape_and_cache_omni_ops_runner.cpp b/src/ops/ops_infer/reshape_and_cache_omni/reshape_and_cache_omni_ops_runner.cpp similarity index 100% rename from src/ops_infer/reshape_and_cache_omni/reshape_and_cache_omni_ops_runner.cpp rename to src/ops/ops_infer/reshape_and_cache_omni/reshape_and_cache_omni_ops_runner.cpp diff --git a/src/ops_infer/reshape_and_cache_omni/reshape_and_cache_omni_ops_runner.h b/src/ops/ops_infer/reshape_and_cache_omni/reshape_and_cache_omni_ops_runner.h similarity index 100% rename from src/ops_infer/reshape_and_cache_omni/reshape_and_cache_omni_ops_runner.h rename to src/ops/ops_infer/reshape_and_cache_omni/reshape_and_cache_omni_ops_runner.h diff --git a/src/ops_infer/reshape_and_cache_with_stride/reshape_and_cache_with_stride_operation.cpp b/src/ops/ops_infer/reshape_and_cache_with_stride/reshape_and_cache_with_stride_operation.cpp similarity index 100% rename from src/ops_infer/reshape_and_cache_with_stride/reshape_and_cache_with_stride_operation.cpp rename to src/ops/ops_infer/reshape_and_cache_with_stride/reshape_and_cache_with_stride_operation.cpp diff --git a/src/ops_infer/reshape_and_cache_with_stride/reshape_and_cache_with_stride_operation.h b/src/ops/ops_infer/reshape_and_cache_with_stride/reshape_and_cache_with_stride_operation.h similarity index 100% rename from src/ops_infer/reshape_and_cache_with_stride/reshape_and_cache_with_stride_operation.h rename to src/ops/ops_infer/reshape_and_cache_with_stride/reshape_and_cache_with_stride_operation.h 
diff --git a/src/ops_infer/reshape_and_cache_with_stride/reshape_and_cache_with_stride_ops_runner.cpp b/src/ops/ops_infer/reshape_and_cache_with_stride/reshape_and_cache_with_stride_ops_runner.cpp similarity index 100% rename from src/ops_infer/reshape_and_cache_with_stride/reshape_and_cache_with_stride_ops_runner.cpp rename to src/ops/ops_infer/reshape_and_cache_with_stride/reshape_and_cache_with_stride_ops_runner.cpp diff --git a/src/ops_infer/reshape_and_cache_with_stride/reshape_and_cache_with_stride_ops_runner.h b/src/ops/ops_infer/reshape_and_cache_with_stride/reshape_and_cache_with_stride_ops_runner.h similarity index 100% rename from src/ops_infer/reshape_and_cache_with_stride/reshape_and_cache_with_stride_ops_runner.h rename to src/ops/ops_infer/reshape_and_cache_with_stride/reshape_and_cache_with_stride_ops_runner.h diff --git a/src/ops_infer/reshape_and_cache_with_stride/reshape_and_cache_with_stride_ops_runner_SISO.cpp b/src/ops/ops_infer/reshape_and_cache_with_stride/reshape_and_cache_with_stride_ops_runner_SISO.cpp similarity index 100% rename from src/ops_infer/reshape_and_cache_with_stride/reshape_and_cache_with_stride_ops_runner_SISO.cpp rename to src/ops/ops_infer/reshape_and_cache_with_stride/reshape_and_cache_with_stride_ops_runner_SISO.cpp diff --git a/src/ops_infer/reshape_and_cache_with_stride/reshape_and_cache_with_stride_ops_runner_SISO.h b/src/ops/ops_infer/reshape_and_cache_with_stride/reshape_and_cache_with_stride_ops_runner_SISO.h similarity index 100% rename from src/ops_infer/reshape_and_cache_with_stride/reshape_and_cache_with_stride_ops_runner_SISO.h rename to src/ops/ops_infer/reshape_and_cache_with_stride/reshape_and_cache_with_stride_ops_runner_SISO.h diff --git a/src/ops_infer/ring_mla/atb_acl_ring_mla.cpp b/src/ops/ops_infer/ring_mla/atb_acl_ring_mla.cpp similarity index 100% rename from src/ops_infer/ring_mla/atb_acl_ring_mla.cpp rename to src/ops/ops_infer/ring_mla/atb_acl_ring_mla.cpp diff --git 
a/src/ops_infer/ring_mla/param.cpp b/src/ops/ops_infer/ring_mla/param.cpp similarity index 100% rename from src/ops_infer/ring_mla/param.cpp rename to src/ops/ops_infer/ring_mla/param.cpp diff --git a/src/ops_infer/ring_mla/param.h b/src/ops/ops_infer/ring_mla/param.h similarity index 100% rename from src/ops_infer/ring_mla/param.h rename to src/ops/ops_infer/ring_mla/param.h diff --git a/src/ops_infer/ring_mla/ring_mla_operation.cpp b/src/ops/ops_infer/ring_mla/ring_mla_operation.cpp similarity index 100% rename from src/ops_infer/ring_mla/ring_mla_operation.cpp rename to src/ops/ops_infer/ring_mla/ring_mla_operation.cpp diff --git a/src/ops_infer/ring_mla/ring_mla_operation.h b/src/ops/ops_infer/ring_mla/ring_mla_operation.h similarity index 100% rename from src/ops_infer/ring_mla/ring_mla_operation.h rename to src/ops/ops_infer/ring_mla/ring_mla_operation.h diff --git a/src/ops_infer/ring_mla/ring_mla_ops_runner.cpp b/src/ops/ops_infer/ring_mla/ring_mla_ops_runner.cpp similarity index 100% rename from src/ops_infer/ring_mla/ring_mla_ops_runner.cpp rename to src/ops/ops_infer/ring_mla/ring_mla_ops_runner.cpp diff --git a/src/ops_infer/ring_mla/ring_mla_ops_runner.h b/src/ops/ops_infer/ring_mla/ring_mla_ops_runner.h similarity index 100% rename from src/ops_infer/ring_mla/ring_mla_ops_runner.h rename to src/ops/ops_infer/ring_mla/ring_mla_ops_runner.h diff --git a/src/ops_infer/rms_norm/rms_norm_operation.cpp b/src/ops/ops_infer/rms_norm/rms_norm_operation.cpp similarity index 100% rename from src/ops_infer/rms_norm/rms_norm_operation.cpp rename to src/ops/ops_infer/rms_norm/rms_norm_operation.cpp diff --git a/src/ops_infer/rms_norm/rms_norm_operation.h b/src/ops/ops_infer/rms_norm/rms_norm_operation.h similarity index 100% rename from src/ops_infer/rms_norm/rms_norm_operation.h rename to src/ops/ops_infer/rms_norm/rms_norm_operation.h diff --git a/src/ops_infer/rms_norm/rms_norm_ops_runner.cpp b/src/ops/ops_infer/rms_norm/rms_norm_ops_runner.cpp similarity index 
100% rename from src/ops_infer/rms_norm/rms_norm_ops_runner.cpp rename to src/ops/ops_infer/rms_norm/rms_norm_ops_runner.cpp diff --git a/src/ops_infer/rms_norm/rms_norm_ops_runner.h b/src/ops/ops_infer/rms_norm/rms_norm_ops_runner.h similarity index 100% rename from src/ops_infer/rms_norm/rms_norm_ops_runner.h rename to src/ops/ops_infer/rms_norm/rms_norm_ops_runner.h diff --git a/src/ops_infer/rms_norm_with_stride/rms_norm_with_stride_operation.cpp b/src/ops/ops_infer/rms_norm_with_stride/rms_norm_with_stride_operation.cpp similarity index 100% rename from src/ops_infer/rms_norm_with_stride/rms_norm_with_stride_operation.cpp rename to src/ops/ops_infer/rms_norm_with_stride/rms_norm_with_stride_operation.cpp diff --git a/src/ops_infer/rms_norm_with_stride/rms_norm_with_stride_operation.h b/src/ops/ops_infer/rms_norm_with_stride/rms_norm_with_stride_operation.h similarity index 100% rename from src/ops_infer/rms_norm_with_stride/rms_norm_with_stride_operation.h rename to src/ops/ops_infer/rms_norm_with_stride/rms_norm_with_stride_operation.h diff --git a/src/ops_infer/rms_norm_with_stride/rms_norm_with_stride_ops_runner.cpp b/src/ops/ops_infer/rms_norm_with_stride/rms_norm_with_stride_ops_runner.cpp similarity index 100% rename from src/ops_infer/rms_norm_with_stride/rms_norm_with_stride_ops_runner.cpp rename to src/ops/ops_infer/rms_norm_with_stride/rms_norm_with_stride_ops_runner.cpp diff --git a/src/ops_infer/rms_norm_with_stride/rms_norm_with_stride_ops_runner.h b/src/ops/ops_infer/rms_norm_with_stride/rms_norm_with_stride_ops_runner.h similarity index 100% rename from src/ops_infer/rms_norm_with_stride/rms_norm_with_stride_ops_runner.h rename to src/ops/ops_infer/rms_norm_with_stride/rms_norm_with_stride_ops_runner.h diff --git a/src/ops_infer/rope/rope_operation.cpp b/src/ops/ops_infer/rope/rope_operation.cpp similarity index 100% rename from src/ops_infer/rope/rope_operation.cpp rename to src/ops/ops_infer/rope/rope_operation.cpp diff --git 
a/src/ops_infer/rope/rope_operation.h b/src/ops/ops_infer/rope/rope_operation.h similarity index 100% rename from src/ops_infer/rope/rope_operation.h rename to src/ops/ops_infer/rope/rope_operation.h diff --git a/src/ops_infer/rope/rope_ops_runner.cpp b/src/ops/ops_infer/rope/rope_ops_runner.cpp similarity index 100% rename from src/ops_infer/rope/rope_ops_runner.cpp rename to src/ops/ops_infer/rope/rope_ops_runner.cpp diff --git a/src/ops_infer/rope/rope_ops_runner.h b/src/ops/ops_infer/rope/rope_ops_runner.h similarity index 100% rename from src/ops_infer/rope/rope_ops_runner.h rename to src/ops/ops_infer/rope/rope_ops_runner.h diff --git a/src/ops_infer/rope_q_concat/rope_q_concat_operation.cpp b/src/ops/ops_infer/rope_q_concat/rope_q_concat_operation.cpp similarity index 100% rename from src/ops_infer/rope_q_concat/rope_q_concat_operation.cpp rename to src/ops/ops_infer/rope_q_concat/rope_q_concat_operation.cpp diff --git a/src/ops_infer/rope_q_concat/rope_q_concat_operation.h b/src/ops/ops_infer/rope_q_concat/rope_q_concat_operation.h similarity index 100% rename from src/ops_infer/rope_q_concat/rope_q_concat_operation.h rename to src/ops/ops_infer/rope_q_concat/rope_q_concat_operation.h diff --git a/src/ops_infer/rope_q_concat/rope_q_concat_ops_runner.cpp b/src/ops/ops_infer/rope_q_concat/rope_q_concat_ops_runner.cpp similarity index 100% rename from src/ops_infer/rope_q_concat/rope_q_concat_ops_runner.cpp rename to src/ops/ops_infer/rope_q_concat/rope_q_concat_ops_runner.cpp diff --git a/src/ops_infer/rope_q_concat/rope_q_concat_ops_runner.h b/src/ops/ops_infer/rope_q_concat/rope_q_concat_ops_runner.h similarity index 100% rename from src/ops_infer/rope_q_concat/rope_q_concat_ops_runner.h rename to src/ops/ops_infer/rope_q_concat/rope_q_concat_ops_runner.h diff --git a/src/ops_infer/scatter_elements_v2/scatter_elements_v2_operation.cpp b/src/ops/ops_infer/scatter_elements_v2/scatter_elements_v2_operation.cpp similarity index 100% rename from 
src/ops_infer/scatter_elements_v2/scatter_elements_v2_operation.cpp rename to src/ops/ops_infer/scatter_elements_v2/scatter_elements_v2_operation.cpp diff --git a/src/ops_infer/scatter_elements_v2/scatter_elements_v2_operation.h b/src/ops/ops_infer/scatter_elements_v2/scatter_elements_v2_operation.h similarity index 100% rename from src/ops_infer/scatter_elements_v2/scatter_elements_v2_operation.h rename to src/ops/ops_infer/scatter_elements_v2/scatter_elements_v2_operation.h diff --git a/src/ops_infer/scatter_elements_v2/scatter_elements_v2_ops_runner.cpp b/src/ops/ops_infer/scatter_elements_v2/scatter_elements_v2_ops_runner.cpp similarity index 100% rename from src/ops_infer/scatter_elements_v2/scatter_elements_v2_ops_runner.cpp rename to src/ops/ops_infer/scatter_elements_v2/scatter_elements_v2_ops_runner.cpp diff --git a/src/ops_infer/scatter_elements_v2/scatter_elements_v2_ops_runner.h b/src/ops/ops_infer/scatter_elements_v2/scatter_elements_v2_ops_runner.h similarity index 100% rename from src/ops_infer/scatter_elements_v2/scatter_elements_v2_ops_runner.h rename to src/ops/ops_infer/scatter_elements_v2/scatter_elements_v2_ops_runner.h diff --git a/src/ops_infer/self_attention/atb_acl_self_attention_prefix_encoder.cpp b/src/ops/ops_infer/self_attention/atb_acl_self_attention_prefix_encoder.cpp similarity index 100% rename from src/ops_infer/self_attention/atb_acl_self_attention_prefix_encoder.cpp rename to src/ops/ops_infer/self_attention/atb_acl_self_attention_prefix_encoder.cpp diff --git a/src/ops_infer/self_attention/param.cpp b/src/ops/ops_infer/self_attention/param.cpp similarity index 100% rename from src/ops_infer/self_attention/param.cpp rename to src/ops/ops_infer/self_attention/param.cpp diff --git a/src/ops_infer/self_attention/param.h b/src/ops/ops_infer/self_attention/param.h similarity index 100% rename from src/ops_infer/self_attention/param.h rename to src/ops/ops_infer/self_attention/param.h diff --git 
a/src/ops_infer/self_attention/self_attention_encoder_fuison_ops_runner_910a.cpp b/src/ops/ops_infer/self_attention/self_attention_encoder_fuison_ops_runner_910a.cpp similarity index 100% rename from src/ops_infer/self_attention/self_attention_encoder_fuison_ops_runner_910a.cpp rename to src/ops/ops_infer/self_attention/self_attention_encoder_fuison_ops_runner_910a.cpp diff --git a/src/ops_infer/self_attention/self_attention_encoder_fusion_ops_runner.cpp b/src/ops/ops_infer/self_attention/self_attention_encoder_fusion_ops_runner.cpp similarity index 100% rename from src/ops_infer/self_attention/self_attention_encoder_fusion_ops_runner.cpp rename to src/ops/ops_infer/self_attention/self_attention_encoder_fusion_ops_runner.cpp diff --git a/src/ops_infer/self_attention/self_attention_encoder_fusion_ops_runner.h b/src/ops/ops_infer/self_attention/self_attention_encoder_fusion_ops_runner.h similarity index 100% rename from src/ops_infer/self_attention/self_attention_encoder_fusion_ops_runner.h rename to src/ops/ops_infer/self_attention/self_attention_encoder_fusion_ops_runner.h diff --git a/src/ops_infer/self_attention/self_attention_encoder_fusion_ops_runner_910a.h b/src/ops/ops_infer/self_attention/self_attention_encoder_fusion_ops_runner_910a.h similarity index 100% rename from src/ops_infer/self_attention/self_attention_encoder_fusion_ops_runner_910a.h rename to src/ops/ops_infer/self_attention/self_attention_encoder_fusion_ops_runner_910a.h diff --git a/src/ops_infer/self_attention/self_attention_fusion_bypass_ops_runner.cpp b/src/ops/ops_infer/self_attention/self_attention_fusion_bypass_ops_runner.cpp similarity index 100% rename from src/ops_infer/self_attention/self_attention_fusion_bypass_ops_runner.cpp rename to src/ops/ops_infer/self_attention/self_attention_fusion_bypass_ops_runner.cpp diff --git a/src/ops_infer/self_attention/self_attention_fusion_bypass_ops_runner.h b/src/ops/ops_infer/self_attention/self_attention_fusion_bypass_ops_runner.h similarity 
index 100% rename from src/ops_infer/self_attention/self_attention_fusion_bypass_ops_runner.h rename to src/ops/ops_infer/self_attention/self_attention_fusion_bypass_ops_runner.h diff --git a/src/ops_infer/self_attention/self_attention_fusion_bypass_ops_runner_910a.cpp b/src/ops/ops_infer/self_attention/self_attention_fusion_bypass_ops_runner_910a.cpp similarity index 100% rename from src/ops_infer/self_attention/self_attention_fusion_bypass_ops_runner_910a.cpp rename to src/ops/ops_infer/self_attention/self_attention_fusion_bypass_ops_runner_910a.cpp diff --git a/src/ops_infer/self_attention/self_attention_fusion_bypass_ops_runner_910a.h b/src/ops/ops_infer/self_attention/self_attention_fusion_bypass_ops_runner_910a.h similarity index 100% rename from src/ops_infer/self_attention/self_attention_fusion_bypass_ops_runner_910a.h rename to src/ops/ops_infer/self_attention/self_attention_fusion_bypass_ops_runner_910a.h diff --git a/src/ops_infer/self_attention/self_attention_fusion_bypass_ops_runner_BNSD.cpp b/src/ops/ops_infer/self_attention/self_attention_fusion_bypass_ops_runner_BNSD.cpp similarity index 100% rename from src/ops_infer/self_attention/self_attention_fusion_bypass_ops_runner_BNSD.cpp rename to src/ops/ops_infer/self_attention/self_attention_fusion_bypass_ops_runner_BNSD.cpp diff --git a/src/ops_infer/self_attention/self_attention_fusion_bypass_ops_runner_BNSD.h b/src/ops/ops_infer/self_attention/self_attention_fusion_bypass_ops_runner_BNSD.h similarity index 100% rename from src/ops_infer/self_attention/self_attention_fusion_bypass_ops_runner_BNSD.h rename to src/ops/ops_infer/self_attention/self_attention_fusion_bypass_ops_runner_BNSD.h diff --git a/src/ops_infer/self_attention/self_attention_fusion_bypass_ops_runner_BNSD_910a.cpp b/src/ops/ops_infer/self_attention/self_attention_fusion_bypass_ops_runner_BNSD_910a.cpp similarity index 100% rename from src/ops_infer/self_attention/self_attention_fusion_bypass_ops_runner_BNSD_910a.cpp rename to 
src/ops/ops_infer/self_attention/self_attention_fusion_bypass_ops_runner_BNSD_910a.cpp diff --git a/src/ops_infer/self_attention/self_attention_fusion_bypass_ops_runner_BNSD_910a.h b/src/ops/ops_infer/self_attention/self_attention_fusion_bypass_ops_runner_BNSD_910a.h similarity index 100% rename from src/ops_infer/self_attention/self_attention_fusion_bypass_ops_runner_BNSD_910a.h rename to src/ops/ops_infer/self_attention/self_attention_fusion_bypass_ops_runner_BNSD_910a.h diff --git a/src/ops_infer/self_attention/self_attention_fusion_ops_runner.cpp b/src/ops/ops_infer/self_attention/self_attention_fusion_ops_runner.cpp similarity index 100% rename from src/ops_infer/self_attention/self_attention_fusion_ops_runner.cpp rename to src/ops/ops_infer/self_attention/self_attention_fusion_ops_runner.cpp diff --git a/src/ops_infer/self_attention/self_attention_fusion_ops_runner.h b/src/ops/ops_infer/self_attention/self_attention_fusion_ops_runner.h similarity index 100% rename from src/ops_infer/self_attention/self_attention_fusion_ops_runner.h rename to src/ops/ops_infer/self_attention/self_attention_fusion_ops_runner.h diff --git a/src/ops_infer/self_attention/self_attention_fusion_ops_runner_910a.cpp b/src/ops/ops_infer/self_attention/self_attention_fusion_ops_runner_910a.cpp similarity index 100% rename from src/ops_infer/self_attention/self_attention_fusion_ops_runner_910a.cpp rename to src/ops/ops_infer/self_attention/self_attention_fusion_ops_runner_910a.cpp diff --git a/src/ops_infer/self_attention/self_attention_fusion_ops_runner_910a.h b/src/ops/ops_infer/self_attention/self_attention_fusion_ops_runner_910a.h similarity index 100% rename from src/ops_infer/self_attention/self_attention_fusion_ops_runner_910a.h rename to src/ops/ops_infer/self_attention/self_attention_fusion_ops_runner_910a.h diff --git a/src/ops_infer/self_attention/self_attention_operation.cpp b/src/ops/ops_infer/self_attention/self_attention_operation.cpp similarity index 100% rename from 
src/ops_infer/self_attention/self_attention_operation.cpp rename to src/ops/ops_infer/self_attention/self_attention_operation.cpp diff --git a/src/ops_infer/self_attention/self_attention_operation.h b/src/ops/ops_infer/self_attention/self_attention_operation.h similarity index 100% rename from src/ops_infer/self_attention/self_attention_operation.h rename to src/ops/ops_infer/self_attention/self_attention_operation.h diff --git a/src/ops_infer/self_attention/self_attention_prefix_encoder_ops_runner.cpp b/src/ops/ops_infer/self_attention/self_attention_prefix_encoder_ops_runner.cpp similarity index 100% rename from src/ops_infer/self_attention/self_attention_prefix_encoder_ops_runner.cpp rename to src/ops/ops_infer/self_attention/self_attention_prefix_encoder_ops_runner.cpp diff --git a/src/ops_infer/self_attention/self_attention_prefix_encoder_ops_runner.h b/src/ops/ops_infer/self_attention/self_attention_prefix_encoder_ops_runner.h similarity index 100% rename from src/ops_infer/self_attention/self_attention_prefix_encoder_ops_runner.h rename to src/ops/ops_infer/self_attention/self_attention_prefix_encoder_ops_runner.h diff --git a/src/ops_infer/self_attention/self_attention_runner_utils.cpp b/src/ops/ops_infer/self_attention/self_attention_runner_utils.cpp similarity index 100% rename from src/ops_infer/self_attention/self_attention_runner_utils.cpp rename to src/ops/ops_infer/self_attention/self_attention_runner_utils.cpp diff --git a/src/ops_infer/self_attention/self_attention_runner_utils.h b/src/ops/ops_infer/self_attention/self_attention_runner_utils.h similarity index 100% rename from src/ops_infer/self_attention/self_attention_runner_utils.h rename to src/ops/ops_infer/self_attention/self_attention_runner_utils.h diff --git a/src/ops_infer/send/send_hccl_runner.cpp b/src/ops/ops_infer/send/send_hccl_runner.cpp similarity index 100% rename from src/ops_infer/send/send_hccl_runner.cpp rename to src/ops/ops_infer/send/send_hccl_runner.cpp diff --git 
a/src/ops_infer/send/send_hccl_runner.h b/src/ops/ops_infer/send/send_hccl_runner.h similarity index 100% rename from src/ops_infer/send/send_hccl_runner.h rename to src/ops/ops_infer/send/send_hccl_runner.h diff --git a/src/ops_infer/send/send_operation.cpp b/src/ops/ops_infer/send/send_operation.cpp similarity index 100% rename from src/ops_infer/send/send_operation.cpp rename to src/ops/ops_infer/send/send_operation.cpp diff --git a/src/ops_infer/send/send_operation.h b/src/ops/ops_infer/send/send_operation.h similarity index 100% rename from src/ops_infer/send/send_operation.h rename to src/ops/ops_infer/send/send_operation.h diff --git a/src/ops_infer/set_value/set_value_operation.cpp b/src/ops/ops_infer/set_value/set_value_operation.cpp similarity index 100% rename from src/ops_infer/set_value/set_value_operation.cpp rename to src/ops/ops_infer/set_value/set_value_operation.cpp diff --git a/src/ops_infer/set_value/set_value_operation.h b/src/ops/ops_infer/set_value/set_value_operation.h similarity index 100% rename from src/ops_infer/set_value/set_value_operation.h rename to src/ops/ops_infer/set_value/set_value_operation.h diff --git a/src/ops_infer/set_value/set_value_ops_runner.cpp b/src/ops/ops_infer/set_value/set_value_ops_runner.cpp similarity index 100% rename from src/ops_infer/set_value/set_value_ops_runner.cpp rename to src/ops/ops_infer/set_value/set_value_ops_runner.cpp diff --git a/src/ops_infer/set_value/set_value_ops_runner.h b/src/ops/ops_infer/set_value/set_value_ops_runner.h similarity index 100% rename from src/ops_infer/set_value/set_value_ops_runner.h rename to src/ops/ops_infer/set_value/set_value_ops_runner.h diff --git a/src/ops_infer/slice/slice_operation.cpp b/src/ops/ops_infer/slice/slice_operation.cpp similarity index 100% rename from src/ops_infer/slice/slice_operation.cpp rename to src/ops/ops_infer/slice/slice_operation.cpp diff --git a/src/ops_infer/slice/slice_operation.h b/src/ops/ops_infer/slice/slice_operation.h similarity 
index 100% rename from src/ops_infer/slice/slice_operation.h rename to src/ops/ops_infer/slice/slice_operation.h diff --git a/src/ops_infer/slice/slice_ops_runner.cpp b/src/ops/ops_infer/slice/slice_ops_runner.cpp similarity index 100% rename from src/ops_infer/slice/slice_ops_runner.cpp rename to src/ops/ops_infer/slice/slice_ops_runner.cpp diff --git a/src/ops_infer/slice/slice_ops_runner.h b/src/ops/ops_infer/slice/slice_ops_runner.h similarity index 100% rename from src/ops_infer/slice/slice_ops_runner.h rename to src/ops/ops_infer/slice/slice_ops_runner.h diff --git a/src/ops_infer/softmax/softmax_operation.cpp b/src/ops/ops_infer/softmax/softmax_operation.cpp similarity index 100% rename from src/ops_infer/softmax/softmax_operation.cpp rename to src/ops/ops_infer/softmax/softmax_operation.cpp diff --git a/src/ops_infer/softmax/softmax_operation.h b/src/ops/ops_infer/softmax/softmax_operation.h similarity index 100% rename from src/ops_infer/softmax/softmax_operation.h rename to src/ops/ops_infer/softmax/softmax_operation.h diff --git a/src/ops_infer/softmax/softmax_ops_runner.cpp b/src/ops/ops_infer/softmax/softmax_ops_runner.cpp similarity index 100% rename from src/ops_infer/softmax/softmax_ops_runner.cpp rename to src/ops/ops_infer/softmax/softmax_ops_runner.cpp diff --git a/src/ops_infer/softmax/softmax_ops_runner.h b/src/ops/ops_infer/softmax/softmax_ops_runner.h similarity index 100% rename from src/ops_infer/softmax/softmax_ops_runner.h rename to src/ops/ops_infer/softmax/softmax_ops_runner.h diff --git a/src/ops_infer/sort/sort_operation.cpp b/src/ops/ops_infer/sort/sort_operation.cpp similarity index 100% rename from src/ops_infer/sort/sort_operation.cpp rename to src/ops/ops_infer/sort/sort_operation.cpp diff --git a/src/ops_infer/sort/sort_operation.h b/src/ops/ops_infer/sort/sort_operation.h similarity index 100% rename from src/ops_infer/sort/sort_operation.h rename to src/ops/ops_infer/sort/sort_operation.h diff --git 
a/src/ops_infer/sort/sort_ops_runner.cpp b/src/ops/ops_infer/sort/sort_ops_runner.cpp similarity index 100% rename from src/ops_infer/sort/sort_ops_runner.cpp rename to src/ops/ops_infer/sort/sort_ops_runner.cpp diff --git a/src/ops_infer/sort/sort_ops_runner.h b/src/ops/ops_infer/sort/sort_ops_runner.h similarity index 97% rename from src/ops_infer/sort/sort_ops_runner.h rename to src/ops/ops_infer/sort/sort_ops_runner.h index c2e9cb39..d4d2228c 100644 --- a/src/ops_infer/sort/sort_ops_runner.h +++ b/src/ops/ops_infer/sort/sort_ops_runner.h @@ -1,27 +1,27 @@ -/* - * Copyright (c) 2024 Huawei Technologies Co., Ltd. - * This file is a part of the CANN Open Software. - * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). - * Please refer to the License for details. You may not use this file except in compliance with the License. - * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, - * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. - * See LICENSE in the root of the software repository for the full text of the License. - */ - -#ifndef ATB_SORT_OPS_RUNNER_H -#define ATB_SORT_OPS_RUNNER_H - -#include "atb/runner/ops_runner.h" -#include "atb/infer_op_params.h" - -namespace atb { -class SortOpsRunner : public OpsRunner { -public: - explicit SortOpsRunner(const infer::SortParam ¶m); - ~SortOpsRunner() override; - -private: - infer::SortParam param_; -}; -} // namespace atb +/* + * Copyright (c) 2024 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. 
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ + +#ifndef ATB_SORT_OPS_RUNNER_H +#define ATB_SORT_OPS_RUNNER_H + +#include "atb/runner/ops_runner.h" +#include "atb/infer_op_params.h" + +namespace atb { +class SortOpsRunner : public OpsRunner { +public: + explicit SortOpsRunner(const infer::SortParam ¶m); + ~SortOpsRunner() override; + +private: + infer::SortParam param_; +}; +} // namespace atb #endif \ No newline at end of file diff --git a/src/ops_infer/split/split_operation.cpp b/src/ops/ops_infer/split/split_operation.cpp similarity index 100% rename from src/ops_infer/split/split_operation.cpp rename to src/ops/ops_infer/split/split_operation.cpp diff --git a/src/ops_infer/split/split_operation.h b/src/ops/ops_infer/split/split_operation.h similarity index 100% rename from src/ops_infer/split/split_operation.h rename to src/ops/ops_infer/split/split_operation.h diff --git a/src/ops_infer/split/split_ops_runner.cpp b/src/ops/ops_infer/split/split_ops_runner.cpp similarity index 100% rename from src/ops_infer/split/split_ops_runner.cpp rename to src/ops/ops_infer/split/split_ops_runner.cpp diff --git a/src/ops_infer/split/split_ops_runner.h b/src/ops/ops_infer/split/split_ops_runner.h similarity index 100% rename from src/ops_infer/split/split_ops_runner.h rename to src/ops/ops_infer/split/split_ops_runner.h diff --git a/src/ops_infer/swiglu_quant/swiglu_quant_operation.cpp b/src/ops/ops_infer/swiglu_quant/swiglu_quant_operation.cpp similarity index 100% rename from src/ops_infer/swiglu_quant/swiglu_quant_operation.cpp rename to src/ops/ops_infer/swiglu_quant/swiglu_quant_operation.cpp diff --git a/src/ops_infer/swiglu_quant/swiglu_quant_operation.h 
b/src/ops/ops_infer/swiglu_quant/swiglu_quant_operation.h similarity index 100% rename from src/ops_infer/swiglu_quant/swiglu_quant_operation.h rename to src/ops/ops_infer/swiglu_quant/swiglu_quant_operation.h diff --git a/src/ops_infer/swiglu_quant/swiglu_quant_ops_runner.cpp b/src/ops/ops_infer/swiglu_quant/swiglu_quant_ops_runner.cpp similarity index 100% rename from src/ops_infer/swiglu_quant/swiglu_quant_ops_runner.cpp rename to src/ops/ops_infer/swiglu_quant/swiglu_quant_ops_runner.cpp diff --git a/src/ops_infer/swiglu_quant/swiglu_quant_ops_runner.h b/src/ops/ops_infer/swiglu_quant/swiglu_quant_ops_runner.h similarity index 100% rename from src/ops_infer/swiglu_quant/swiglu_quant_ops_runner.h rename to src/ops/ops_infer/swiglu_quant/swiglu_quant_ops_runner.h diff --git a/src/ops_infer/topk_topp_sampling/topk_topp_sampling_operation.cpp b/src/ops/ops_infer/topk_topp_sampling/topk_topp_sampling_operation.cpp similarity index 100% rename from src/ops_infer/topk_topp_sampling/topk_topp_sampling_operation.cpp rename to src/ops/ops_infer/topk_topp_sampling/topk_topp_sampling_operation.cpp diff --git a/src/ops_infer/topk_topp_sampling/topk_topp_sampling_operation.h b/src/ops/ops_infer/topk_topp_sampling/topk_topp_sampling_operation.h similarity index 100% rename from src/ops_infer/topk_topp_sampling/topk_topp_sampling_operation.h rename to src/ops/ops_infer/topk_topp_sampling/topk_topp_sampling_operation.h diff --git a/src/ops_infer/topk_topp_sampling/topk_topp_sampling_ops_runner.cpp b/src/ops/ops_infer/topk_topp_sampling/topk_topp_sampling_ops_runner.cpp similarity index 100% rename from src/ops_infer/topk_topp_sampling/topk_topp_sampling_ops_runner.cpp rename to src/ops/ops_infer/topk_topp_sampling/topk_topp_sampling_ops_runner.cpp diff --git a/src/ops_infer/topk_topp_sampling/topk_topp_sampling_ops_runner.h b/src/ops/ops_infer/topk_topp_sampling/topk_topp_sampling_ops_runner.h similarity index 100% rename from 
src/ops_infer/topk_topp_sampling/topk_topp_sampling_ops_runner.h rename to src/ops/ops_infer/topk_topp_sampling/topk_topp_sampling_ops_runner.h diff --git a/src/ops_infer/transdata/transdata_operation.cpp b/src/ops/ops_infer/transdata/transdata_operation.cpp similarity index 100% rename from src/ops_infer/transdata/transdata_operation.cpp rename to src/ops/ops_infer/transdata/transdata_operation.cpp diff --git a/src/ops_infer/transdata/transdata_operation.h b/src/ops/ops_infer/transdata/transdata_operation.h similarity index 100% rename from src/ops_infer/transdata/transdata_operation.h rename to src/ops/ops_infer/transdata/transdata_operation.h diff --git a/src/ops_infer/transdata/transdata_ops_runner.cpp b/src/ops/ops_infer/transdata/transdata_ops_runner.cpp similarity index 100% rename from src/ops_infer/transdata/transdata_ops_runner.cpp rename to src/ops/ops_infer/transdata/transdata_ops_runner.cpp diff --git a/src/ops_infer/transdata/transdata_ops_runner.h b/src/ops/ops_infer/transdata/transdata_ops_runner.h similarity index 100% rename from src/ops_infer/transdata/transdata_ops_runner.h rename to src/ops/ops_infer/transdata/transdata_ops_runner.h diff --git a/src/ops_infer/transpose/transpose_operation.cpp b/src/ops/ops_infer/transpose/transpose_operation.cpp similarity index 100% rename from src/ops_infer/transpose/transpose_operation.cpp rename to src/ops/ops_infer/transpose/transpose_operation.cpp diff --git a/src/ops_infer/transpose/transpose_operation.h b/src/ops/ops_infer/transpose/transpose_operation.h similarity index 100% rename from src/ops_infer/transpose/transpose_operation.h rename to src/ops/ops_infer/transpose/transpose_operation.h diff --git a/src/ops_infer/transpose/transpose_ops_runner.cpp b/src/ops/ops_infer/transpose/transpose_ops_runner.cpp similarity index 100% rename from src/ops_infer/transpose/transpose_ops_runner.cpp rename to src/ops/ops_infer/transpose/transpose_ops_runner.cpp diff --git 
a/src/ops_infer/transpose/transpose_ops_runner.h b/src/ops/ops_infer/transpose/transpose_ops_runner.h similarity index 100% rename from src/ops_infer/transpose/transpose_ops_runner.h rename to src/ops/ops_infer/transpose/transpose_ops_runner.h diff --git a/src/ops_infer/unpad/unpad_operation.cpp b/src/ops/ops_infer/unpad/unpad_operation.cpp similarity index 100% rename from src/ops_infer/unpad/unpad_operation.cpp rename to src/ops/ops_infer/unpad/unpad_operation.cpp diff --git a/src/ops_infer/unpad/unpad_operation.h b/src/ops/ops_infer/unpad/unpad_operation.h similarity index 100% rename from src/ops_infer/unpad/unpad_operation.h rename to src/ops/ops_infer/unpad/unpad_operation.h diff --git a/src/ops_infer/unpad/unpad_ops_runner.cpp b/src/ops/ops_infer/unpad/unpad_ops_runner.cpp similarity index 100% rename from src/ops_infer/unpad/unpad_ops_runner.cpp rename to src/ops/ops_infer/unpad/unpad_ops_runner.cpp diff --git a/src/ops_infer/unpad/unpad_ops_runner.h b/src/ops/ops_infer/unpad/unpad_ops_runner.h similarity index 100% rename from src/ops_infer/unpad/unpad_ops_runner.h rename to src/ops/ops_infer/unpad/unpad_ops_runner.h diff --git a/src/ops_infer/where/where_operation.cpp b/src/ops/ops_infer/where/where_operation.cpp similarity index 100% rename from src/ops_infer/where/where_operation.cpp rename to src/ops/ops_infer/where/where_operation.cpp diff --git a/src/ops_infer/where/where_operation.h b/src/ops/ops_infer/where/where_operation.h similarity index 100% rename from src/ops_infer/where/where_operation.h rename to src/ops/ops_infer/where/where_operation.h diff --git a/src/ops_infer/where/where_ops_runner.cpp b/src/ops/ops_infer/where/where_ops_runner.cpp similarity index 100% rename from src/ops_infer/where/where_ops_runner.cpp rename to src/ops/ops_infer/where/where_ops_runner.cpp diff --git a/src/ops_infer/where/where_ops_runner.h b/src/ops/ops_infer/where/where_ops_runner.h similarity index 100% rename from src/ops_infer/where/where_ops_runner.h rename 
to src/ops/ops_infer/where/where_ops_runner.h diff --git a/src/ops_train/fast_soft_max/fastsoftmax_operation.cpp b/src/ops/ops_train/fast_soft_max/fastsoftmax_operation.cpp similarity index 100% rename from src/ops_train/fast_soft_max/fastsoftmax_operation.cpp rename to src/ops/ops_train/fast_soft_max/fastsoftmax_operation.cpp diff --git a/src/ops_train/fast_soft_max/fastsoftmax_operation.h b/src/ops/ops_train/fast_soft_max/fastsoftmax_operation.h similarity index 100% rename from src/ops_train/fast_soft_max/fastsoftmax_operation.h rename to src/ops/ops_train/fast_soft_max/fastsoftmax_operation.h diff --git a/src/ops_train/fast_soft_max/fastsoftmax_ops_runner.cpp b/src/ops/ops_train/fast_soft_max/fastsoftmax_ops_runner.cpp similarity index 100% rename from src/ops_train/fast_soft_max/fastsoftmax_ops_runner.cpp rename to src/ops/ops_train/fast_soft_max/fastsoftmax_ops_runner.cpp diff --git a/src/ops_train/fast_soft_max/fastsoftmax_ops_runner.h b/src/ops/ops_train/fast_soft_max/fastsoftmax_ops_runner.h similarity index 100% rename from src/ops_train/fast_soft_max/fastsoftmax_ops_runner.h rename to src/ops/ops_train/fast_soft_max/fastsoftmax_ops_runner.h diff --git a/src/ops_train/fast_soft_max_grad/fastsoftmaxgrad_operation.cpp b/src/ops/ops_train/fast_soft_max_grad/fastsoftmaxgrad_operation.cpp similarity index 100% rename from src/ops_train/fast_soft_max_grad/fastsoftmaxgrad_operation.cpp rename to src/ops/ops_train/fast_soft_max_grad/fastsoftmaxgrad_operation.cpp diff --git a/src/ops_train/fast_soft_max_grad/fastsoftmaxgrad_operation.h b/src/ops/ops_train/fast_soft_max_grad/fastsoftmaxgrad_operation.h similarity index 100% rename from src/ops_train/fast_soft_max_grad/fastsoftmaxgrad_operation.h rename to src/ops/ops_train/fast_soft_max_grad/fastsoftmaxgrad_operation.h diff --git a/src/ops_train/fast_soft_max_grad/fastsoftmaxgrad_ops_runner.cpp b/src/ops/ops_train/fast_soft_max_grad/fastsoftmaxgrad_ops_runner.cpp similarity index 100% rename from 
src/ops_train/fast_soft_max_grad/fastsoftmaxgrad_ops_runner.cpp rename to src/ops/ops_train/fast_soft_max_grad/fastsoftmaxgrad_ops_runner.cpp diff --git a/src/ops_train/fast_soft_max_grad/fastsoftmaxgrad_ops_runner.h b/src/ops/ops_train/fast_soft_max_grad/fastsoftmaxgrad_ops_runner.h similarity index 100% rename from src/ops_train/fast_soft_max_grad/fastsoftmaxgrad_ops_runner.h rename to src/ops/ops_train/fast_soft_max_grad/fastsoftmaxgrad_ops_runner.h diff --git a/src/ops_train/gen_attention_mask/genattentionmask_operation.cpp b/src/ops/ops_train/gen_attention_mask/genattentionmask_operation.cpp similarity index 100% rename from src/ops_train/gen_attention_mask/genattentionmask_operation.cpp rename to src/ops/ops_train/gen_attention_mask/genattentionmask_operation.cpp diff --git a/src/ops_train/gen_attention_mask/genattentionmask_operation.h b/src/ops/ops_train/gen_attention_mask/genattentionmask_operation.h similarity index 100% rename from src/ops_train/gen_attention_mask/genattentionmask_operation.h rename to src/ops/ops_train/gen_attention_mask/genattentionmask_operation.h diff --git a/src/ops_train/gen_attention_mask/genattentionmask_ops_runner.cpp b/src/ops/ops_train/gen_attention_mask/genattentionmask_ops_runner.cpp similarity index 100% rename from src/ops_train/gen_attention_mask/genattentionmask_ops_runner.cpp rename to src/ops/ops_train/gen_attention_mask/genattentionmask_ops_runner.cpp diff --git a/src/ops_train/gen_attention_mask/genattentionmask_ops_runner.h b/src/ops/ops_train/gen_attention_mask/genattentionmask_ops_runner.h similarity index 100% rename from src/ops_train/gen_attention_mask/genattentionmask_ops_runner.h rename to src/ops/ops_train/gen_attention_mask/genattentionmask_ops_runner.h diff --git a/src/ops_train/laser_attention/laser_attention_operation.cpp b/src/ops/ops_train/laser_attention/laser_attention_operation.cpp similarity index 100% rename from src/ops_train/laser_attention/laser_attention_operation.cpp rename to 
src/ops/ops_train/laser_attention/laser_attention_operation.cpp diff --git a/src/ops_train/laser_attention/laser_attention_operation.h b/src/ops/ops_train/laser_attention/laser_attention_operation.h similarity index 100% rename from src/ops_train/laser_attention/laser_attention_operation.h rename to src/ops/ops_train/laser_attention/laser_attention_operation.h diff --git a/src/ops_train/laser_attention/laser_attention_ops_runner.cpp b/src/ops/ops_train/laser_attention/laser_attention_ops_runner.cpp similarity index 100% rename from src/ops_train/laser_attention/laser_attention_ops_runner.cpp rename to src/ops/ops_train/laser_attention/laser_attention_ops_runner.cpp diff --git a/src/ops_train/laser_attention/laser_attention_ops_runner.h b/src/ops/ops_train/laser_attention/laser_attention_ops_runner.h similarity index 100% rename from src/ops_train/laser_attention/laser_attention_ops_runner.h rename to src/ops/ops_train/laser_attention/laser_attention_ops_runner.h diff --git a/src/ops_train/laser_attention_grad/laser_attention_grad_operation.cpp b/src/ops/ops_train/laser_attention_grad/laser_attention_grad_operation.cpp similarity index 100% rename from src/ops_train/laser_attention_grad/laser_attention_grad_operation.cpp rename to src/ops/ops_train/laser_attention_grad/laser_attention_grad_operation.cpp diff --git a/src/ops_train/laser_attention_grad/laser_attention_grad_operation.h b/src/ops/ops_train/laser_attention_grad/laser_attention_grad_operation.h similarity index 100% rename from src/ops_train/laser_attention_grad/laser_attention_grad_operation.h rename to src/ops/ops_train/laser_attention_grad/laser_attention_grad_operation.h diff --git a/src/ops_train/laser_attention_grad/laser_attention_grad_ops_runner.cpp b/src/ops/ops_train/laser_attention_grad/laser_attention_grad_ops_runner.cpp similarity index 100% rename from src/ops_train/laser_attention_grad/laser_attention_grad_ops_runner.cpp rename to 
src/ops/ops_train/laser_attention_grad/laser_attention_grad_ops_runner.cpp diff --git a/src/ops_train/laser_attention_grad/laser_attention_grad_ops_runner.h b/src/ops/ops_train/laser_attention_grad/laser_attention_grad_ops_runner.h similarity index 100% rename from src/ops_train/laser_attention_grad/laser_attention_grad_ops_runner.h rename to src/ops/ops_train/laser_attention_grad/laser_attention_grad_ops_runner.h diff --git a/src/ops_train/pad_with_hidden_state/pad_with_hidden_state_operation.cpp b/src/ops/ops_train/pad_with_hidden_state/pad_with_hidden_state_operation.cpp similarity index 100% rename from src/ops_train/pad_with_hidden_state/pad_with_hidden_state_operation.cpp rename to src/ops/ops_train/pad_with_hidden_state/pad_with_hidden_state_operation.cpp diff --git a/src/ops_train/pad_with_hidden_state/pad_with_hidden_state_operation.h b/src/ops/ops_train/pad_with_hidden_state/pad_with_hidden_state_operation.h similarity index 100% rename from src/ops_train/pad_with_hidden_state/pad_with_hidden_state_operation.h rename to src/ops/ops_train/pad_with_hidden_state/pad_with_hidden_state_operation.h diff --git a/src/ops_train/pad_with_hidden_state/pad_with_hidden_state_ops_runner.cpp b/src/ops/ops_train/pad_with_hidden_state/pad_with_hidden_state_ops_runner.cpp similarity index 100% rename from src/ops_train/pad_with_hidden_state/pad_with_hidden_state_ops_runner.cpp rename to src/ops/ops_train/pad_with_hidden_state/pad_with_hidden_state_ops_runner.cpp diff --git a/src/ops_train/pad_with_hidden_state/pad_with_hidden_state_ops_runner.h b/src/ops/ops_train/pad_with_hidden_state/pad_with_hidden_state_ops_runner.h similarity index 100% rename from src/ops_train/pad_with_hidden_state/pad_with_hidden_state_ops_runner.h rename to src/ops/ops_train/pad_with_hidden_state/pad_with_hidden_state_ops_runner.h diff --git a/src/ops_train/rms_norm_backward/rms_norm_backward_operation.cpp b/src/ops/ops_train/rms_norm_backward/rms_norm_backward_operation.cpp similarity index 100% 
rename from src/ops_train/rms_norm_backward/rms_norm_backward_operation.cpp rename to src/ops/ops_train/rms_norm_backward/rms_norm_backward_operation.cpp diff --git a/src/ops_train/rms_norm_backward/rms_norm_backward_operation.h b/src/ops/ops_train/rms_norm_backward/rms_norm_backward_operation.h similarity index 100% rename from src/ops_train/rms_norm_backward/rms_norm_backward_operation.h rename to src/ops/ops_train/rms_norm_backward/rms_norm_backward_operation.h diff --git a/src/ops_train/rms_norm_backward/rms_norm_backward_ops_runner.cpp b/src/ops/ops_train/rms_norm_backward/rms_norm_backward_ops_runner.cpp similarity index 100% rename from src/ops_train/rms_norm_backward/rms_norm_backward_ops_runner.cpp rename to src/ops/ops_train/rms_norm_backward/rms_norm_backward_ops_runner.cpp diff --git a/src/ops_train/rms_norm_backward/rms_norm_backward_ops_runner.h b/src/ops/ops_train/rms_norm_backward/rms_norm_backward_ops_runner.h similarity index 100% rename from src/ops_train/rms_norm_backward/rms_norm_backward_ops_runner.h rename to src/ops/ops_train/rms_norm_backward/rms_norm_backward_ops_runner.h diff --git a/src/ops_train/rope_grad/rope_grad_operation.cpp b/src/ops/ops_train/rope_grad/rope_grad_operation.cpp similarity index 100% rename from src/ops_train/rope_grad/rope_grad_operation.cpp rename to src/ops/ops_train/rope_grad/rope_grad_operation.cpp diff --git a/src/ops_train/rope_grad/rope_grad_operation.h b/src/ops/ops_train/rope_grad/rope_grad_operation.h similarity index 100% rename from src/ops_train/rope_grad/rope_grad_operation.h rename to src/ops/ops_train/rope_grad/rope_grad_operation.h diff --git a/src/ops_train/rope_grad/rope_grad_ops_runner.cpp b/src/ops/ops_train/rope_grad/rope_grad_ops_runner.cpp similarity index 100% rename from src/ops_train/rope_grad/rope_grad_ops_runner.cpp rename to src/ops/ops_train/rope_grad/rope_grad_ops_runner.cpp diff --git a/src/ops_train/rope_grad/rope_grad_ops_runner.h 
b/src/ops/ops_train/rope_grad/rope_grad_ops_runner.h similarity index 100% rename from src/ops_train/rope_grad/rope_grad_ops_runner.h rename to src/ops/ops_train/rope_grad/rope_grad_ops_runner.h diff --git a/src/ops_train/strided_batch_matmul/stridedbatchmatmul_operation.cpp b/src/ops/ops_train/strided_batch_matmul/stridedbatchmatmul_operation.cpp similarity index 100% rename from src/ops_train/strided_batch_matmul/stridedbatchmatmul_operation.cpp rename to src/ops/ops_train/strided_batch_matmul/stridedbatchmatmul_operation.cpp diff --git a/src/ops_train/strided_batch_matmul/stridedbatchmatmul_operation.h b/src/ops/ops_train/strided_batch_matmul/stridedbatchmatmul_operation.h similarity index 100% rename from src/ops_train/strided_batch_matmul/stridedbatchmatmul_operation.h rename to src/ops/ops_train/strided_batch_matmul/stridedbatchmatmul_operation.h diff --git a/src/ops_train/strided_batch_matmul/stridedbatchmatmul_ops_runner.cpp b/src/ops/ops_train/strided_batch_matmul/stridedbatchmatmul_ops_runner.cpp similarity index 100% rename from src/ops_train/strided_batch_matmul/stridedbatchmatmul_ops_runner.cpp rename to src/ops/ops_train/strided_batch_matmul/stridedbatchmatmul_ops_runner.cpp diff --git a/src/ops_train/strided_batch_matmul/stridedbatchmatmul_ops_runner.h b/src/ops/ops_train/strided_batch_matmul/stridedbatchmatmul_ops_runner.h similarity index 100% rename from src/ops_train/strided_batch_matmul/stridedbatchmatmul_ops_runner.h rename to src/ops/ops_train/strided_batch_matmul/stridedbatchmatmul_ops_runner.h diff --git a/src/ops_train/unpad_with_hidden_state/unpad_with_hidden_state_operation.cpp b/src/ops/ops_train/unpad_with_hidden_state/unpad_with_hidden_state_operation.cpp similarity index 100% rename from src/ops_train/unpad_with_hidden_state/unpad_with_hidden_state_operation.cpp rename to src/ops/ops_train/unpad_with_hidden_state/unpad_with_hidden_state_operation.cpp diff --git 
a/src/ops_train/unpad_with_hidden_state/unpad_with_hidden_state_operation.h b/src/ops/ops_train/unpad_with_hidden_state/unpad_with_hidden_state_operation.h similarity index 100% rename from src/ops_train/unpad_with_hidden_state/unpad_with_hidden_state_operation.h rename to src/ops/ops_train/unpad_with_hidden_state/unpad_with_hidden_state_operation.h diff --git a/src/ops_train/unpad_with_hidden_state/unpad_with_hidden_state_ops_runner.cpp b/src/ops/ops_train/unpad_with_hidden_state/unpad_with_hidden_state_ops_runner.cpp similarity index 100% rename from src/ops_train/unpad_with_hidden_state/unpad_with_hidden_state_ops_runner.cpp rename to src/ops/ops_train/unpad_with_hidden_state/unpad_with_hidden_state_ops_runner.cpp diff --git a/src/ops_train/unpad_with_hidden_state/unpad_with_hidden_state_ops_runner.h b/src/ops/ops_train/unpad_with_hidden_state/unpad_with_hidden_state_ops_runner.h similarity index 100% rename from src/ops_train/unpad_with_hidden_state/unpad_with_hidden_state_ops_runner.h rename to src/ops/ops_train/unpad_with_hidden_state/unpad_with_hidden_state_ops_runner.h -- Gitee From 5b0b600ab317cdee116f2ebc52227f4426ff7738 Mon Sep 17 00:00:00 2001 From: zhuhaozhecool Date: Mon, 22 Sep 2025 20:11:59 +0800 Subject: [PATCH 46/94] fix pp matmul I8 Kernel memory illegal read --- .../pp_matmul_i8_kernel/op_kernel/pp_matmul.cce | 9 ++++++++- .../op_kernel/pp_matmul_bf16.cce | 7 ++++--- .../op_kernel/pp_matmul_i8_weight_nz.cce | 13 ++++++++++--- .../op_kernel/pp_matmul_nz_m300.cce | 13 ++++++++++--- 4 files changed, 32 insertions(+), 10 deletions(-) diff --git a/src/kernels/kernels/matmul/pp_matmul_i8_kernel/op_kernel/pp_matmul.cce b/src/kernels/kernels/matmul/pp_matmul_i8_kernel/op_kernel/pp_matmul.cce index 1cd15bfb..1beff8e7 100644 --- a/src/kernels/kernels/matmul/pp_matmul_i8_kernel/op_kernel/pp_matmul.cce +++ b/src/kernels/kernels/matmul/pp_matmul_i8_kernel/op_kernel/pp_matmul.cce @@ -24,6 +24,7 @@ constexpr uint32_t L0AB_PINGPONG_BUFFER_LEN_FP16 = 16384; // 
32 KB constexpr uint32_t L0AB_PINGPONG_BUFFER_LEN_INT8 = 32768; // 32 KB +constexpr uint32_t BLOCK_SIZE_8 = 8; constexpr uint32_t BLOCK_SIZE_16 = 16; constexpr uint32_t BLOCK_SIZE_32 = 32; constexpr uint32_t CUBE_MATRIX_SIZE_256 = 256; // 16 * 16 @@ -55,6 +56,11 @@ __aicore__ __force_inline__ uint32_t RoundUp16(const uint32_t val) return (val + BLOCK_SIZE_16 - 1) / BLOCK_SIZE_16 * BLOCK_SIZE_16; } +__aicore__ __force_inline__ uint32_t RoundUp8(const uint32_t val) +{ + return (val + BLOCK_SIZE_8 - 1) / BLOCK_SIZE_8 * BLOCK_SIZE_8; +} + template class PpMatmulInt { @@ -163,6 +169,7 @@ public: uint64_t offset_c = batch_idx * m * n + m_idx * m0 * n + n_idx * n0; uint32_t m_actual = (m_idx == (m_loop - 1)) ? (m - m_idx * m0) : m0; uint32_t n_actual = (n_idx == (n_loop - 1)) ? (n - n_idx * n0) : n0; + uint32_t bias_n_round = RoundUp8(n_actual); uint32_t m_round = 0; uint32_t n_round = 0; uint64_t shuffle_k = en_shuffle_k ? core_idx % k_loop : 0; @@ -212,7 +219,7 @@ public: WAIT_FLAG(MTE1, MTE2, EVENT_ID7); gm_to_l1(bias_l1, // dst gm_bias[offset_bias], // src - 1, RoundUp16(1), 1, n_round, + 1, RoundUp16(1), 1, bias_n_round, RoundUp16(n_round), n_round); SET_FLAG(MTE2, MTE1, EVENT_ID6); } diff --git a/src/kernels/kernels/matmul/pp_matmul_i8_kernel/op_kernel/pp_matmul_bf16.cce b/src/kernels/kernels/matmul/pp_matmul_i8_kernel/op_kernel/pp_matmul_bf16.cce index 85d802c1..44e2475b 100644 --- a/src/kernels/kernels/matmul/pp_matmul_i8_kernel/op_kernel/pp_matmul_bf16.cce +++ b/src/kernels/kernels/matmul/pp_matmul_i8_kernel/op_kernel/pp_matmul_bf16.cce @@ -180,6 +180,7 @@ public: uint64_t offset_c = batch_idx * m * n + m_idx * m0 * n + n_idx * n0; uint32_t m_actual = (m_idx == (m_loop - 1)) ? (m - m_idx * m0) : m0; uint32_t n_actual = (n_idx == (n_loop - 1)) ? (n - n_idx * n0) : n0; + uint32_t bias_n_round = RoundUp8(n_actual); uint32_t m_round = 0; uint32_t n_round = 0; uint64_t shuffle_k = en_shuffle_k ? 
core_idx % k_loop : 0; @@ -240,7 +241,7 @@ public: WAIT_FLAG(MTE1, MTE2, EVENT_ID7); gm_to_l1(bias_l1, // dst gm_bias[offset_bias], // src - 1, RoundUp16(1), 1, n_round, + 1, RoundUp16(1), 1, bias_n_round, RoundUp16(n_round), n_round); SET_FLAG(MTE2, MTE1, EVENT_ID6); } @@ -279,7 +280,7 @@ public: l1_buf_b, gm_b[offset_b], k_actual, k_round, k, n_actual, n_round, n); } else { gm_to_l1( - l1_buf_b, gm_b[offset_b], k_actual, k_round, RoundUp16(k), n_actual, n_round, RoundUp32(n)); + l1_buf_b, gm_b[offset_b], k_actual, RoundUp16(k_actual), RoundUp16(k), n_actual, n_round, RoundUp32(n)); } } SET_FLAG(MTE2, MTE1, event_id + CONST_2); @@ -372,7 +373,7 @@ public: gm_to_l1(l1_buf_b_next, gm_b[offset_b_next], k_actual_next, - k_round_next, + RoundUp16(k_actual_next), RoundUp16(k), n_actual, n_round, diff --git a/src/kernels/kernels/matmul/pp_matmul_i8_kernel/op_kernel/pp_matmul_i8_weight_nz.cce b/src/kernels/kernels/matmul/pp_matmul_i8_kernel/op_kernel/pp_matmul_i8_weight_nz.cce index d3d58a48..a498351b 100644 --- a/src/kernels/kernels/matmul/pp_matmul_i8_kernel/op_kernel/pp_matmul_i8_weight_nz.cce +++ b/src/kernels/kernels/matmul/pp_matmul_i8_kernel/op_kernel/pp_matmul_i8_weight_nz.cce @@ -23,6 +23,7 @@ constexpr uint32_t L0AB_PINGPONG_BUFFER_LEN_FP16 = 16384; // 32 KB constexpr uint32_t L0AB_PINGPONG_BUFFER_LEN_INT8 = 32768; // 32 KB +constexpr uint32_t BLOCK_SIZE_8 = 8; constexpr uint32_t BLOCK_SIZE_16 = 16; constexpr uint32_t BLOCK_SIZE_32 = 32; constexpr uint32_t CUBE_MATRIX_SIZE_256 = 256; // 16 * 16 @@ -70,6 +71,11 @@ __aicore__ __force_inline__ uint32_t RoundUp16(const uint32_t val) return (val + BLOCK_SIZE_16 - 1) / BLOCK_SIZE_16 * BLOCK_SIZE_16; } +__aicore__ __force_inline__ uint32_t RoundUp8(const uint32_t val) +{ + return (val + BLOCK_SIZE_8 - 1) / BLOCK_SIZE_8 * BLOCK_SIZE_8; +} + template ( - l1_buf_b, gm_b[offset_b], k_actual, k_round, RoundUp16(k), n_actual, n_round, RoundUp32(n)); + l1_buf_b, gm_b[offset_b], k_actual, RoundUp16(k_actual), RoundUp16(k), 
n_actual, n_round, RoundUp32(n)); } SET_FLAG(MTE2, MTE1, event_id + CONST_2); @@ -383,7 +390,7 @@ public: gm_to_l1(l1_buf_b_next, gm_b[offset_b_next], k_actual_next, - k_round_next, + RoundUp16(k_actual_next), RoundUp16(k), n_actual, n_round, diff --git a/src/kernels/kernels/matmul/pp_matmul_i8_kernel/op_kernel/pp_matmul_nz_m300.cce b/src/kernels/kernels/matmul/pp_matmul_i8_kernel/op_kernel/pp_matmul_nz_m300.cce index c21ac739..c373e68f 100644 --- a/src/kernels/kernels/matmul/pp_matmul_i8_kernel/op_kernel/pp_matmul_nz_m300.cce +++ b/src/kernels/kernels/matmul/pp_matmul_i8_kernel/op_kernel/pp_matmul_nz_m300.cce @@ -24,6 +24,7 @@ constexpr uint32_t L0AB_PINGPONG_BUFFER_LEN_FP16 = 16384; // 32 KB constexpr uint32_t L0AB_PINGPONG_BUFFER_LEN_INT8 = 32768; // 32 KB +constexpr uint32_t BLOCK_SIZE_8 = 8; constexpr uint32_t BLOCK_SIZE_16 = 16; constexpr uint32_t BLOCK_SIZE_32 = 32; constexpr uint32_t CUBE_MATRIX_SIZE_256 = 256; // 16 * 16 @@ -55,6 +56,11 @@ __aicore__ __force_inline__ uint32_t RoundUp16(const uint32_t val) return (val + BLOCK_SIZE_16 - 1) / BLOCK_SIZE_16 * BLOCK_SIZE_16; } +__aicore__ __force_inline__ uint32_t RoundUp8(const uint32_t val) +{ + return (val + BLOCK_SIZE_8 - 1) / BLOCK_SIZE_8 * BLOCK_SIZE_8; +} + template class PpMatmulInt { @@ -163,6 +169,7 @@ public: uint64_t offset_c = batch_idx * m * n + m_idx * m0 * n + n_idx * n0; uint32_t m_actual = (m_idx == (m_loop - 1)) ? (m - m_idx * m0) : m0; uint32_t n_actual = (n_idx == (n_loop - 1)) ? (n - n_idx * n0) : n0; + uint32_t bias_n_round = RoundUp8(n_actual); uint32_t m_round = 0; uint32_t n_round = 0; uint64_t shuffle_k = en_shuffle_k ? 
core_idx % k_loop : 0; @@ -212,7 +219,7 @@ public: WAIT_FLAG(MTE1, MTE2, EVENT_ID7); gm_to_l1(bias_l1, // dst gm_bias[offset_bias], // src - 1, RoundUp16(1), 1, n_round, + 1, RoundUp16(1), 1, bias_n_round, RoundUp16(n_round), n_round); SET_FLAG(MTE2, MTE1, EVENT_ID6); } @@ -242,7 +249,7 @@ public: l1_buf_b, gm_b[offset_b], n_actual, n_round, RoundUp16(n), k_actual, k_round, RoundUp32(k)); } else { gm_to_l1( - l1_buf_b, gm_b[offset_b], k_actual, k_round, RoundUp16(k), n_actual, n_round, RoundUp32(n)); + l1_buf_b, gm_b[offset_b], k_actual, RoundUp16(k_actual), RoundUp16(k), n_actual, n_round, RoundUp32(n)); } SET_FLAG(MTE2, MTE1, event_id + CONST_2); @@ -323,7 +330,7 @@ public: l1_buf_b_next, gm_b[offset_b_next], n_actual, n_round, RoundUp16(n), k_actual_next, k_round_next, RoundUp32(k)); } else { gm_to_l1( - l1_buf_b_next, gm_b[offset_b_next], k_actual_next, k_round_next, RoundUp16(k), n_actual, n_round, RoundUp32(n)); + l1_buf_b_next, gm_b[offset_b_next], k_actual_next, RoundUp16(k_actual_next), RoundUp16(k), n_actual, n_round, RoundUp32(n)); } SET_FLAG(MTE2, MTE1, event_id_next + CONST_2); } -- Gitee From 1ae0cd8240fb4e4559dbcdaae09fb44d2616abe4 Mon Sep 17 00:00:00 2001 From: He Changcheng Date: Mon, 22 Sep 2025 20:18:11 +0800 Subject: [PATCH 47/94] fix --- src/kernels/lcal/src/lccl.cpp | 36 ++++++++++++++++++++--------------- 1 file changed, 21 insertions(+), 15 deletions(-) diff --git a/src/kernels/lcal/src/lccl.cpp b/src/kernels/lcal/src/lccl.cpp index 8d515fb2..bf79bf66 100644 --- a/src/kernels/lcal/src/lccl.cpp +++ b/src/kernels/lcal/src/lccl.cpp @@ -31,39 +31,45 @@ using AclrtGetResInCurrentThreadFunc = int(*)(int, uint32_t*); int GetAclResInCurThread(int type, uint32_t *resource) { - // 静态变量:保存函数指针和库句柄 - static std::unique_ptr mkiDl; - static AclrtGetResInCurrentThreadFunc aclrtGetResInCurrentThread = nullptr; static std::mutex localMutex; // 线程安全锁 - - std::lock_guard lock(localMutex); // 加锁 + static AclrtGetResInCurrentThreadFunc aclrtGetResInCurrentThread 
= nullptr; + static int res = -1; // 首次调用时初始化 - if (!mkiDl) { + if (res == -1) { + std::lock_guard lock(localMutex); // 加锁 + std::unique_ptr mkiDl; std::string libPath = std::string(Mki::GetEnv("ASCEND_HOME_PATH")) + "/runtime/lib64/libascendcl.so"; mkiDl = std::make_unique(libPath, false); if (!mkiDl->IsValid()) { // 检查库是否加载成功 - MKI_LOG(WARN) << "Failed to load libascendcl.so!"; - return LCAL_ERROR_NOT_FOUND; + MKI_LOG(ERROR) << "Failed to load libascendcl.so!"; + return LCAL_ERROR_INTERNAL; } aclrtGetResInCurrentThread = (AclrtGetResInCurrentThreadFunc)mkiDl->GetSymbol("aclrtGetResInCurrentThread"); if (aclrtGetResInCurrentThread == nullptr) { - MKI_LOG(WARN) << "Failed to get acl function!"; + MKI_LOG(WARN) << "Failed to get aclrtGetResInCurrentThread function!"; + res = LCAL_ERROR_NOT_FOUND; return LCAL_ERROR_NOT_FOUND; } + res = LCAL_SUCCESS; MKI_LOG(DEBUG) << "Successfully loaded libascendcl.so and resolved aclrtGetResInCurrentThread"; } // 调用函数 - int getResRet = aclrtGetResInCurrentThread(type, resource); - if (getResRet != ACL_SUCCESS) { - MKI_LOG(ERROR) << "Failed to get resource in current thread for type:" << type << " err:" << getResRet; - return LCAL_ERROR_INTERNAL; + if (res == LCAL_SUCCESS) { + int getResRet = aclrtGetResInCurrentThread(type, resource); + if (getResRet != ACL_SUCCESS) { + MKI_LOG(ERROR) << "Failed to get resource in current thread for type:" << type << " err:" << getResRet; + return LCAL_ERROR_INTERNAL; + } else { + MKI_LOG(DEBUG) << "Get resource in current thread for type:" << type << " resource:" << *resource; + return LCAL_SUCCESS; + } } else { - MKI_LOG(DEBUG) << "Get resource in current thread for type:" << type << " resource:" << *resource; - return LCAL_SUCCESS; + return res; } + } uint32_t GetLocalReduceBlockDum(int64_t dataSize) -- Gitee From c08b4e7b6a31704398a3812057ef0755f5f63466 Mon Sep 17 00:00:00 2001 From: guo-jiong Date: Mon, 22 Sep 2025 20:20:45 +0800 Subject: [PATCH 48/94] move configs dir --- 
configs/build_config.json | 8 -------- scripts/build_util.py | 2 +- scripts/update_tbe_tactic_json.py | 2 +- src/kernels/configs/build_config.json | 2 +- .../kernels/configs}/mixops/tbe_tactic_info.ini | 0 {configs => src/kernels/configs}/ops/tbe_tactic_info.ini | 0 src/kernels/tbe_adapter/CMakeLists.txt | 6 +++--- 7 files changed, 6 insertions(+), 14 deletions(-) delete mode 100644 configs/build_config.json rename {configs => src/kernels/configs}/mixops/tbe_tactic_info.ini (100%) rename {configs => src/kernels/configs}/ops/tbe_tactic_info.ini (100%) diff --git a/configs/build_config.json b/configs/build_config.json deleted file mode 100644 index 8f8b66dc..00000000 --- a/configs/build_config.json +++ /dev/null @@ -1,8 +0,0 @@ -{ - "targets": { - "ascend310b": true, - "ascend310p": true, - "ascend910b": true, - "ascend910": true - } -} diff --git a/scripts/build_util.py b/scripts/build_util.py index 424038f5..fc4f55a8 100644 --- a/scripts/build_util.py +++ b/scripts/build_util.py @@ -23,7 +23,7 @@ def get_build_target_list(): if usr_config_file_path == '': script_file_path = os.path.realpath(__file__) build_config_json_file_path = os.path.join(os.path.dirname( - script_file_path), "../configs/build_config.json") + script_file_path), "../src/kernels/configs/build_config.json") else: build_config_json_file_path = usr_config_file_path device_list = [] diff --git a/scripts/update_tbe_tactic_json.py b/scripts/update_tbe_tactic_json.py index 965e4476..89395ea4 100644 --- a/scripts/update_tbe_tactic_json.py +++ b/scripts/update_tbe_tactic_json.py @@ -339,7 +339,7 @@ def write_tbe_tactic_json(input_args, json_paths_info): def main(): code_root_dir = get_code_root() - tactic_info_path = os.path.join(code_root_dir, "configs/ops/tbe_tactic_info.ini") + tactic_info_path = os.path.join(code_root_dir, "src/kernels/configs/ops/tbe_tactic_info.ini") build_cache_dir, _ = get_build_cache_path() tactic_json_path = os.path.join(build_cache_dir, "tbe_tactic_json.ini") diff --git 
a/src/kernels/configs/build_config.json b/src/kernels/configs/build_config.json index 73c00e47..6b8dcdad 100644 --- a/src/kernels/configs/build_config.json +++ b/src/kernels/configs/build_config.json @@ -5,4 +5,4 @@ "ascend910b": true, "ascend910": true } -} \ No newline at end of file +} diff --git a/configs/mixops/tbe_tactic_info.ini b/src/kernels/configs/mixops/tbe_tactic_info.ini similarity index 100% rename from configs/mixops/tbe_tactic_info.ini rename to src/kernels/configs/mixops/tbe_tactic_info.ini diff --git a/configs/ops/tbe_tactic_info.ini b/src/kernels/configs/ops/tbe_tactic_info.ini similarity index 100% rename from configs/ops/tbe_tactic_info.ini rename to src/kernels/configs/ops/tbe_tactic_info.ini diff --git a/src/kernels/tbe_adapter/CMakeLists.txt b/src/kernels/tbe_adapter/CMakeLists.txt index 0276e210..7251dbf9 100644 --- a/src/kernels/tbe_adapter/CMakeLists.txt +++ b/src/kernels/tbe_adapter/CMakeLists.txt @@ -565,7 +565,7 @@ set_source_files_properties( # MIX OPS file(MAKE_DIRECTORY ${CMAKE_BINARY_DIR}/mixops/) execute_process(COMMAND python3 ${PROJECT_SOURCE_DIR}/scripts/update_tbe_tactic_json.py - --src_ini_path ${PROJECT_SOURCE_DIR}/configs/mixops/tbe_tactic_info.ini + --src_ini_path ${PROJECT_SOURCE_DIR}/src/kernels/configs/mixops/tbe_tactic_info.ini --dst_ini_path ${CMAKE_BINARY_DIR}/mixops/tbe_tactic_json.ini OUTPUT_VARIABLE MIX_PYTHON_OUTPUT ERROR_VARIABLE RESULT_INFO @@ -580,7 +580,7 @@ set_source_files_properties(${MIX_REUSE_BINARY_LIST} PROPERTIES GENERATED TRUE) add_custom_command( OUTPUT ${MIX_REUSE_BINARY_LIST} ${CMAKE_BINARY_DIR}/mix_wait_flag.cpp - DEPENDS ${PROJECT_SOURCE_DIR}/configs/mixops/tbe_tactic_info.ini + DEPENDS ${PROJECT_SOURCE_DIR}/src/kernels/configs/mixops/tbe_tactic_info.ini COMMAND python3 ${MKI_PACKAGE_DIR}/scripts/build_util.py --binary_dir ${CMAKE_BINARY_DIR} --op_type tbe --tbe_ini_path ${CMAKE_BINARY_DIR}/mixops/tbe_tactic_json.ini COMMAND cmake -E sleep 10 @@ -610,7 +610,7 @@ 
set_source_files_properties(${OPS_REUSE_BINARY_LIST} PROPERTIES GENERATED TRUE) add_custom_command( OUTPUT ${OPS_REUSE_BINARY_LIST} ${CMAKE_BINARY_DIR}/ops_wait_flag.cpp - DEPENDS ${PROJECT_SOURCE_DIR}/configs/ops/tbe_tactic_info.ini + DEPENDS ${PROJECT_SOURCE_DIR}/src/kernels/configs/ops/tbe_tactic_info.ini COMMAND python3 ${MKI_PACKAGE_DIR}/scripts/build_util.py --binary_dir ${CMAKE_BINARY_DIR} --op_type tbe --tbe_ini_path ${CMAKE_BINARY_DIR}/tbe_tactic_json.ini COMMAND cmake -E sleep 10 -- Gitee From 935002b6b829d37726110af77611565e2c4ff7c1 Mon Sep 17 00:00:00 2001 From: huangxiaolan Date: Mon, 22 Sep 2025 20:24:58 +0800 Subject: [PATCH 49/94] recover operation_base --- src/atb/operation/operation_base.cpp | 6 +++--- src/atb/operation/operation_base.h | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/atb/operation/operation_base.cpp b/src/atb/operation/operation_base.cpp index 526b1be6..8668f9e5 100644 --- a/src/atb/operation/operation_base.cpp +++ b/src/atb/operation/operation_base.cpp @@ -1064,12 +1064,12 @@ Status OperationBase::GraphModeLaunch() } Status OperationBase::Execute(const VariantPack &variantPack, uint8_t *workspace, uint64_t workspaceSize, - Context &context) + Context *context) { const uint64_t beginTime = GetSingleton().GetProfilingLevel0Status() ? GetSingleton().ProfSysCycleTime() : 0; - ExecuteType executeType = context.GetExecuteType(); + ExecuteType executeType = context->GetExecuteType(); ProfilingFuncName profType = executeType == EXECUTE_NORMAL ? OPERATION_EXECUTE : (executeType == EXECUTE_PRELAUNCH ? 
OPERATION_PRELAUNCH : OPERATION_LAUNCH); @@ -1083,7 +1083,7 @@ Status OperationBase::Execute(const VariantPack &variantPack, uint8_t *workspace } Status st = NO_ERROR; if (executeType == EXECUTE_NORMAL || executeType == EXECUTE_PRELAUNCH) { - st = PreLaunch(variantPack, workspace, workspaceSize, &context); + st = PreLaunch(variantPack, workspace, workspaceSize, context); if (st != NO_ERROR) { ATB_LOG(ERROR) << GetLogPrefix() << "PreLaunch fail, error code: " << st; return st; diff --git a/src/atb/operation/operation_base.h b/src/atb/operation/operation_base.h index f0f5d791..309fc0bf 100644 --- a/src/atb/operation/operation_base.h +++ b/src/atb/operation/operation_base.h @@ -39,7 +39,7 @@ public: Status InferShape(const SVector &inTensorDescs, SVector &outTensorDescs) const override; Status Setup(const VariantPack &variantPack, uint64_t &workspaceSize, Context *context) override; Status Execute(const VariantPack &variantPack, uint8_t *workspace, uint64_t workspaceSize, - Context &context) override; + Context *context) override; Status SetOperationBaseIds(const std::vector &operationBaseIds, const int64_t nodeId); virtual nlohmann::json GetParamJson() const; const std::vector &GetOperationBaseIds(); -- Gitee From d6ae7ce0eb2d185187a56f7cf75bcd3a35c2f209 Mon Sep 17 00:00:00 2001 From: guanguan Date: Mon, 22 Sep 2025 20:26:35 +0800 Subject: [PATCH 50/94] fix error --- .../linear_parallel/linear_parallel_operation.cpp | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/src/ops_infer/linear_parallel/linear_parallel_operation.cpp b/src/ops_infer/linear_parallel/linear_parallel_operation.cpp index 9ce37e92..6a7fd77b 100644 --- a/src/ops_infer/linear_parallel/linear_parallel_operation.cpp +++ b/src/ops_infer/linear_parallel/linear_parallel_operation.cpp @@ -145,7 +145,7 @@ template <> Status CreateOperation(const infer::LinearParallelParam &opParam, Op return ERROR_INVALID_PARAM; } int rankSize = opParam.rankSize; - if (opParam.rankSize <= 0 || 
(rankSize & (rankSize - 1)) != 0) { + if ((opParam.rankSize <= 0 || (rankSize & (rankSize - 1)) != 0) && opParam.backend == "lcoc") { ATB_LOG(ERROR) << "LinearParallel rankSize support power of 2 but got [" << opParam.rankSize << "]"; return ERROR_INVALID_PARAM; } @@ -409,12 +409,6 @@ Status LinearParallelOperation::InferShapeCheckLinearReduceScatter(const SVector return ERROR_INVALID_TENSOR_INI_MATCH; } - int64_t xTensorM = OperationUtil::GetXTensorM(inTensorDescs.at(0)); - if (xTensorM % param_.rankSize != 0) { - ATB_LOG(ERROR) << GetLogPrefix() << "inTensor0 m [" << xTensorM - << "] should be an integer multiple of rankSize :" << param_.rankSize; - return ERROR_INVALID_TENSOR_DIM; - } if (param_.backend == "mc2") { int64_t xTensorK = OperationUtil::GetXTensorK(inTensorDescs.at(0)); if (xTensorK < 256 || xTensorK > 65535) { -- Gitee From 9f2bdcd05ed784795de587bfed547e5a7bdad232 Mon Sep 17 00:00:00 2001 From: huangxiaolan Date: Mon, 22 Sep 2025 20:30:49 +0800 Subject: [PATCH 51/94] recover extern changes --- src/atb/operation/if_operation.cpp | 2 +- src/atb/operation/if_operation.h | 2 +- src/atb/runner/plugin_runner.cpp | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/atb/operation/if_operation.cpp b/src/atb/operation/if_operation.cpp index 7a70570e..8a950c22 100644 --- a/src/atb/operation/if_operation.cpp +++ b/src/atb/operation/if_operation.cpp @@ -107,7 +107,7 @@ Status IfOperation::Execute(const VariantPack &variantPack, uint8_t *workspace, Context &context) { ATB_LOG(INFO) << GetLogPrefix() << "Calling Execute..."; - return opSelected_->Execute(variantPack, workspace, workspaceSize, context); + return opSelected_->Execute(variantPack, workspace, workspaceSize, *context); } uint32_t IfOperation::GetInputNum() const diff --git a/src/atb/operation/if_operation.h b/src/atb/operation/if_operation.h index 8f5dfbe7..eab4a45f 100644 --- a/src/atb/operation/if_operation.h +++ b/src/atb/operation/if_operation.h @@ -24,7 +24,7 @@ public: 
std::string GetName() const override; Status Setup(const VariantPack &variantPack, uint64_t &workspaceSize, Context *context) override; Status Execute(const VariantPack &variantPack, uint8_t *workspace, uint64_t workspaceSize, - Context &context) override; + Context *context) override; uint32_t GetInputNum() const override; uint32_t GetOutputNum() const override; void SetExecuteStreamId(uint32_t streamId) override; diff --git a/src/atb/runner/plugin_runner.cpp b/src/atb/runner/plugin_runner.cpp index fb26b4f6..aeaa20c6 100644 --- a/src/atb/runner/plugin_runner.cpp +++ b/src/atb/runner/plugin_runner.cpp @@ -37,7 +37,7 @@ Status PluginRunner::ExecuteImpl(RunnerVariantPack &runnerVariantPack) variantPack_.inTensors = runnerVariantPack.inTensors; variantPack_.outTensors = runnerVariantPack.outTensors; return operation_->Execute(variantPack_, runnerVariantPack.workspaceBuffer, - runnerVariantPack.workspaceBufferSize, *runnerVariantPack.context); + runnerVariantPack.workspaceBufferSize, runnerVariantPack.context); } return ERROR_INVALID_PARAM; -- Gitee From 1ca11b7c1b773930a7423a36551aefa0f179c3db Mon Sep 17 00:00:00 2001 From: huangxiaolan Date: Mon, 22 Sep 2025 20:32:38 +0800 Subject: [PATCH 52/94] recover extern changes --- src/atb/operation/if_operation.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/atb/operation/if_operation.cpp b/src/atb/operation/if_operation.cpp index 8a950c22..a9c8f406 100644 --- a/src/atb/operation/if_operation.cpp +++ b/src/atb/operation/if_operation.cpp @@ -104,10 +104,10 @@ Status IfOperation::Setup(const VariantPack &variantPack, uint64_t &workspaceSiz } Status IfOperation::Execute(const VariantPack &variantPack, uint8_t *workspace, uint64_t workspaceSize, - Context &context) + Context *context) { ATB_LOG(INFO) << GetLogPrefix() << "Calling Execute..."; - return opSelected_->Execute(variantPack, workspace, workspaceSize, *context); + return opSelected_->Execute(variantPack, workspace, workspaceSize, context); 
} uint32_t IfOperation::GetInputNum() const -- Gitee From a0bde3130497d6bc37f5e0c33b0d864454b63b6b Mon Sep 17 00:00:00 2001 From: guanguan Date: Mon, 22 Sep 2025 20:40:21 +0800 Subject: [PATCH 53/94] add nz check --- .../linear_parallel_operation.cpp | 38 +++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/src/ops_infer/linear_parallel/linear_parallel_operation.cpp b/src/ops_infer/linear_parallel/linear_parallel_operation.cpp index 6a7fd77b..ea29279e 100644 --- a/src/ops_infer/linear_parallel/linear_parallel_operation.cpp +++ b/src/ops_infer/linear_parallel/linear_parallel_operation.cpp @@ -378,8 +378,26 @@ Status LinearParallelOperation::CheckResidual(const SVector &inTenso return NO_ERROR; } +Status LinearParallelOperation::CheckWeightNzFormat(const SVector &inTensorDescs) const +{ + const TensorDesc &weight = inTensorDescs.at(1); + bool weightNz = (weight.format == ACL_FORMAT_FRACTAL_NZ); + if (weightNz) { + if (weight.shape.dimNum != DIM_4) { + ATB_LOG(ERROR) << GetLogPrefix() << "fractal_nz shape dim should be 4. 
now is "<< weight.shape.dimNum; + return ERROR_INVALID_TENSOR_DIM; + } + } + return NO_ERROR; +} + Status LinearParallelOperation::InferShapeCheckLinearAllReduce(const SVector &inTensorDescs) const { + Status st = CheckWeightNzFormat(inTensorDescs); + if (st != NO_ERROR) { + return st; + } + if (!OperationUtil::MatmulInTensorDescsCheck(inTensorDescs, GetLogPrefix(), commonCheckParam_)) { return ERROR_INVALID_TENSOR_DIM; } @@ -397,6 +415,11 @@ Status LinearParallelOperation::InferShapeCheckLinearAllReduce(const SVector &inTensorDescs) const { + Status st = CheckWeightNzFormat(inTensorDescs); + if (st != NO_ERROR) { + return st; + } + if (!OperationUtil::MatmulInTensorDescsCheck(inTensorDescs, GetLogPrefix(), commonCheckParam_)) { return ERROR_INVALID_TENSOR_DIM; } @@ -422,6 +445,11 @@ Status LinearParallelOperation::InferShapeCheckLinearReduceScatter(const SVector Status LinearParallelOperation::InferShapeCheckAllGatherLinear(const SVector &inTensorDescs) const { + Status st = CheckWeightNzFormat(inTensorDescs); + if (st != NO_ERROR) { + return st; + } + bool isQuant = param_.quantType > infer::LinearParallelParam::QuantType::QUANT_TYPE_UNQUANT && param_.quantType < infer::LinearParallelParam::QuantType::QUANT_TYPE_MAX; if (isQuant && inTensorDescs.at(3).dtype == ACL_FLOAT && param_.outDataType == ACL_FLOAT16) { @@ -443,6 +471,11 @@ Status LinearParallelOperation::InferShapeCheckAllGatherLinear(const SVector &inTensorDescs) const { + Status st = CheckWeightNzFormat(inTensorDescs); + if (st != NO_ERROR) { + return st; + } + if (param_.twoDimTPInfo.rsDim * param_.twoDimTPInfo.agDim != param_.rankSize) { ATB_LOG(ERROR) << "agDim * rsDim should equal to rankSize"; return ERROR_INVALID_PARAM; @@ -470,6 +503,11 @@ LinearParallelOperation::InferShapeCheckAllGatherLinearReduceScatter(const SVect Status LinearParallelOperation::InferShapeCheckAllToAllvcAllGatherGmm(const SVector &inTensorDescs) const { + Status st = CheckWeightNzFormat(inTensorDescs); + if (st != NO_ERROR) { 
+ return st; + } + bool isQuant = param_.quantType > infer::LinearParallelParam::QuantType::QUANT_TYPE_UNQUANT && param_.quantType < infer::LinearParallelParam::QuantType::QUANT_TYPE_MAX; if (isQuant && inTensorDescs.at(2).dtype == ACL_FLOAT && param_.outDataType == ACL_FLOAT16) { -- Gitee From 789e4a897a266c37447b7b3dd3cf403997383374 Mon Sep 17 00:00:00 2001 From: He Changcheng Date: Mon, 22 Sep 2025 20:54:53 +0800 Subject: [PATCH 54/94] fix --- src/kernels/lcal/src/lccl.cpp | 82 ++++++++++++++++++++--------------- 1 file changed, 47 insertions(+), 35 deletions(-) diff --git a/src/kernels/lcal/src/lccl.cpp b/src/kernels/lcal/src/lccl.cpp index bf79bf66..f6af2d3b 100644 --- a/src/kernels/lcal/src/lccl.cpp +++ b/src/kernels/lcal/src/lccl.cpp @@ -29,47 +29,59 @@ namespace Lcal { using AclrtGetResInCurrentThreadFunc = int(*)(int, uint32_t*); -int GetAclResInCurThread(int type, uint32_t *resource) +int GetAclResInCurThread(int type, uint32_t &resource) { - static std::mutex localMutex; // 线程安全锁 - static AclrtGetResInCurrentThreadFunc aclrtGetResInCurrentThread = nullptr; - static int res = -1; - - // 首次调用时初始化 - if (res == -1) { - std::lock_guard lock(localMutex); // 加锁 - std::unique_ptr mkiDl; - std::string libPath = std::string(Mki::GetEnv("ASCEND_HOME_PATH")) + "/runtime/lib64/libascendcl.so"; - mkiDl = std::make_unique(libPath, false); - if (!mkiDl->IsValid()) { // 检查库是否加载成功 - MKI_LOG(ERROR) << "Failed to load libascendcl.so!"; - return LCAL_ERROR_INTERNAL; + static std::once_flag onceFlag; + static std::atomic initFlag(LCAL_ERROR_NOT_INITIALIZED); // -1 + static std::shared_ptr mkiDl; + static AclrtGetResInCurrentThreadFunc aclFn = nullptr; + + std::call_once(onceFlag, []() { + std::string home = Mki::GetEnv("ASCEND_HOME_PATH"); + std::vector candidates; + if (!home.empty()) { + candidates.push_back(home + "/runtime/lib64/libascendcl.so"); } - aclrtGetResInCurrentThread = - (AclrtGetResInCurrentThreadFunc)mkiDl->GetSymbol("aclrtGetResInCurrentThread"); - if 
(aclrtGetResInCurrentThread == nullptr) { - MKI_LOG(WARN) << "Failed to get aclrtGetResInCurrentThread function!"; - res = LCAL_ERROR_NOT_FOUND; - return LCAL_ERROR_NOT_FOUND; + candidates.emplace_back("libascendcl.so"); + + for (const auto &p : candidates) { + auto dl = std::make_unique(p, false); + if (!dl->IsValid()) { + MKI_LOG(WARN) << "Try load libascendcl.so failed: " << p; + continue; + } + auto sym = dl->GetSymbol("aclrtGetResInCurrentThread"); + if (sym == nullptr) { + MKI_LOG(WARN) << "Symbol aclrtGetResInCurrentThread not found in: " << p; + continue; + } + mkiDl = std::move(dl); + aclFn = reinterpret_cast(sym); + initFlag.store(LCAL_SUCCESS, std::memory_order_release); + MKI_LOG(DEBUG) << "Loaded libascendcl.so and resolved aclrtGetResInCurrentThread from: " << p; + return; } - res = LCAL_SUCCESS; - MKI_LOG(DEBUG) << "Successfully loaded libascendcl.so and resolved aclrtGetResInCurrentThread"; + initFlag.store(LCAL_ERROR_NOT_FOUND, std::memory_order_release); + MKI_LOG(ERROR) << "Failed to load libascendcl.so or resolve aclrtGetResInCurrentThread."; + }); + + int rc = initFlag.load(std::memory_order_acquire); + if (rc != LCAL_SUCCESS) { + return rc; } - // 调用函数 - if (res == LCAL_SUCCESS) { - int getResRet = aclrtGetResInCurrentThread(type, resource); - if (getResRet != ACL_SUCCESS) { - MKI_LOG(ERROR) << "Failed to get resource in current thread for type:" << type << " err:" << getResRet; - return LCAL_ERROR_INTERNAL; - } else { - MKI_LOG(DEBUG) << "Get resource in current thread for type:" << type << " resource:" << *resource; - return LCAL_SUCCESS; - } - } else { - return res; + if (type != ACL_RT_DEV_RES_CUBE_CORE && type != ACL_RT_DEV_RES_VECTOR_CORE) { + MKI_LOG(ERROR) << "aclrtGetResInCurrentThread not support resource type:" << type; + return LCAL_ERROR_PARA_CHECK_FAIL; } + const int ret = aclFn(type, &resource); + if (ret != ACL_SUCCESS) { + MKI_LOG(ERROR) << "aclrtGetResInCurrentThread failed. 
type:" << type << " err:" << ret; + return LCAL_ERROR_INTERNAL; + } + MKI_LOG(DEBUG) << "Got resource in current thread. type:" << type << " resource:" << resource; + return LCAL_SUCCESS; } uint32_t GetLocalReduceBlockDum(int64_t dataSize) @@ -288,7 +300,7 @@ uint32_t Lccl::GetBlockNum(LcalType cclType, uint32_t rankSize, int64_t dataSize limitType = aclrtDevResLimitType::ACL_RT_DEV_RES_CUBE_CORE; } - int res = GetAclResInCurThread(static_cast(limitType), &limitVal); + int res = GetAclResInCurThread(static_cast(limitType), limitVal); if (res == LCAL_SUCCESS) { MKI_LOG(DEBUG) << "Required blockNum(" << blockNum << ") limit:(limitVal=" << limitVal << ", limitType=" << static_cast(limitType) << ")"; -- Gitee From bc56bcf4b0d1271eb79b580e8e4e74ec9a0b2881 Mon Sep 17 00:00:00 2001 From: He Changcheng Date: Mon, 22 Sep 2025 20:59:06 +0800 Subject: [PATCH 55/94] fix --- src/kernels/lcal/src/lccl.cpp | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/src/kernels/lcal/src/lccl.cpp b/src/kernels/lcal/src/lccl.cpp index f6af2d3b..9e44d0b5 100644 --- a/src/kernels/lcal/src/lccl.cpp +++ b/src/kernels/lcal/src/lccl.cpp @@ -26,14 +26,13 @@ using namespace chrono; using namespace Mki; namespace Lcal { - using AclrtGetResInCurrentThreadFunc = int(*)(int, uint32_t*); int GetAclResInCurThread(int type, uint32_t &resource) { static std::once_flag onceFlag; - static std::atomic initFlag(LCAL_ERROR_NOT_INITIALIZED); // -1 - static std::shared_ptr mkiDl; + static std::atomic initFlag{LCAL_ERROR_NOT_INITIALIZED}; // -1 + static std::unique_ptr mkiDl; static AclrtGetResInCurrentThreadFunc aclFn = nullptr; std::call_once(onceFlag, []() { @@ -61,8 +60,9 @@ int GetAclResInCurThread(int type, uint32_t &resource) MKI_LOG(DEBUG) << "Loaded libascendcl.so and resolved aclrtGetResInCurrentThread from: " << p; return; } + MKI_LOG(ERROR) << "Failed to load libascendcl.so or resolve aclrtGetResInCurrentThread. 
Tried paths: " + << boost::algorithm::join(candidates, ", "); initFlag.store(LCAL_ERROR_NOT_FOUND, std::memory_order_release); - MKI_LOG(ERROR) << "Failed to load libascendcl.so or resolve aclrtGetResInCurrentThread."; }); int rc = initFlag.load(std::memory_order_acquire); @@ -71,16 +71,17 @@ int GetAclResInCurThread(int type, uint32_t &resource) } if (type != ACL_RT_DEV_RES_CUBE_CORE && type != ACL_RT_DEV_RES_VECTOR_CORE) { - MKI_LOG(ERROR) << "aclrtGetResInCurrentThread not support resource type:" << type; + MKI_LOG(ERROR) << "aclrtGetResInCurrentThread not support resource type: " << type; return LCAL_ERROR_PARA_CHECK_FAIL; } const int ret = aclFn(type, &resource); if (ret != ACL_SUCCESS) { - MKI_LOG(ERROR) << "aclrtGetResInCurrentThread failed. type:" << type << " err:" << ret; + MKI_LOG(ERROR) << "aclrtGetResInCurrentThread failed. type: " << type << " err: " << ret; return LCAL_ERROR_INTERNAL; } - MKI_LOG(DEBUG) << "Got resource in current thread. type:" << type << " resource:" << resource; + + MKI_LOG(DEBUG) << "Got resource in current thread. 
type: " << type << " resource: " << resource; return LCAL_SUCCESS; } -- Gitee From b5256c9d5be1189dd6f500563f457f985c367efd Mon Sep 17 00:00:00 2001 From: He Changcheng Date: Mon, 22 Sep 2025 21:05:02 +0800 Subject: [PATCH 56/94] fix --- src/kernels/lcal/src/lccl.cpp | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/src/kernels/lcal/src/lccl.cpp b/src/kernels/lcal/src/lccl.cpp index 9e44d0b5..11fd73e7 100644 --- a/src/kernels/lcal/src/lccl.cpp +++ b/src/kernels/lcal/src/lccl.cpp @@ -32,14 +32,14 @@ int GetAclResInCurThread(int type, uint32_t &resource) { static std::once_flag onceFlag; static std::atomic initFlag{LCAL_ERROR_NOT_INITIALIZED}; // -1 - static std::unique_ptr mkiDl; + static std::unique_ptr mkiDl; // 持久保存,避免库被卸载 static AclrtGetResInCurrentThreadFunc aclFn = nullptr; std::call_once(onceFlag, []() { std::string home = Mki::GetEnv("ASCEND_HOME_PATH"); std::vector candidates; if (!home.empty()) { - candidates.push_back(home + "/runtime/lib64/libascendcl.so"); + candidates.emplace_back(home + "/runtime/lib64/libascendcl.so"); } candidates.emplace_back("libascendcl.so"); @@ -54,17 +54,19 @@ int GetAclResInCurThread(int type, uint32_t &resource) MKI_LOG(WARN) << "Symbol aclrtGetResInCurrentThread not found in: " << p; continue; } - mkiDl = std::move(dl); + mkiDl = std::move(dl); // 保留句柄,防止卸载 aclFn = reinterpret_cast(sym); initFlag.store(LCAL_SUCCESS, std::memory_order_release); MKI_LOG(DEBUG) << "Loaded libascendcl.so and resolved aclrtGetResInCurrentThread from: " << p; - return; + return; // 成功 } + // 失败 MKI_LOG(ERROR) << "Failed to load libascendcl.so or resolve aclrtGetResInCurrentThread. 
Tried paths: " << boost::algorithm::join(candidates, ", "); initFlag.store(LCAL_ERROR_NOT_FOUND, std::memory_order_release); }); + // 初始化结果判定 int rc = initFlag.load(std::memory_order_acquire); if (rc != LCAL_SUCCESS) { return rc; @@ -75,6 +77,7 @@ int GetAclResInCurThread(int type, uint32_t &resource) return LCAL_ERROR_PARA_CHECK_FAIL; } + // 调用底层函数 const int ret = aclFn(type, &resource); if (ret != ACL_SUCCESS) { MKI_LOG(ERROR) << "aclrtGetResInCurrentThread failed. type: " << type << " err: " << ret; -- Gitee From 9bff1329bccbb3f5327ffcd6218a6ee36c69be6b Mon Sep 17 00:00:00 2001 From: He Changcheng Date: Mon, 22 Sep 2025 21:17:40 +0800 Subject: [PATCH 57/94] fix --- src/kernels/lcal/src/lccl.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/kernels/lcal/src/lccl.cpp b/src/kernels/lcal/src/lccl.cpp index 11fd73e7..9da09500 100644 --- a/src/kernels/lcal/src/lccl.cpp +++ b/src/kernels/lcal/src/lccl.cpp @@ -61,8 +61,7 @@ int GetAclResInCurThread(int type, uint32_t &resource) return; // 成功 } // 失败 - MKI_LOG(ERROR) << "Failed to load libascendcl.so or resolve aclrtGetResInCurrentThread. 
Tried paths: " - << boost::algorithm::join(candidates, ", "); + MKI_LOG(ERROR) << "Failed to load libascendcl.so or resolve aclrtGetResInCurrentThread."; initFlag.store(LCAL_ERROR_NOT_FOUND, std::memory_order_release); }); -- Gitee From 1eb367e23c40314d9f1d6f3675c63c9f9222f9fa Mon Sep 17 00:00:00 2001 From: He Changcheng Date: Mon, 22 Sep 2025 21:29:39 +0800 Subject: [PATCH 58/94] fix --- src/kernels/lcal/src/lccl.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/kernels/lcal/src/lccl.cpp b/src/kernels/lcal/src/lccl.cpp index 9da09500..e6ab4f2f 100644 --- a/src/kernels/lcal/src/lccl.cpp +++ b/src/kernels/lcal/src/lccl.cpp @@ -40,8 +40,9 @@ int GetAclResInCurThread(int type, uint32_t &resource) std::vector candidates; if (!home.empty()) { candidates.emplace_back(home + "/runtime/lib64/libascendcl.so"); + } else { + MKI_LOG(ERROR) << "ASCEND_HOME_PATH is empty."; } - candidates.emplace_back("libascendcl.so"); for (const auto &p : candidates) { auto dl = std::make_unique(p, false); -- Gitee From 2da65a527d683ecac79f9ac512fa5e660d2b9921 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=BC=A0=E7=A1=95=E7=B4=AF?= Date: Mon, 22 Sep 2025 21:37:57 +0800 Subject: [PATCH 59/94] fix rms_norm_grad --- .../norm/rmsnormbackward/tiling/rms_norm_grad_tiling.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/kernels/kernels/norm/rmsnormbackward/tiling/rms_norm_grad_tiling.cpp b/src/kernels/kernels/norm/rmsnormbackward/tiling/rms_norm_grad_tiling.cpp index 48eeaaad..c15daf1a 100644 --- a/src/kernels/kernels/norm/rmsnormbackward/tiling/rms_norm_grad_tiling.cpp +++ b/src/kernels/kernels/norm/rmsnormbackward/tiling/rms_norm_grad_tiling.cpp @@ -231,7 +231,7 @@ Status RmsNormGradTiling(const LaunchParam &launchParam, KernelInfo &kernelInfo) tilingDataPointer->avg = avgVal; kernelInfo.SetTilingId(tilingKey); uint64_t sysWorkspaceSize = - static_cast(BLOCK_SIZE + tilingDataPointer->blockDim * BLOCK_SIZE * TWICE_WORKSPACE); + 
static_cast((BLOCK_SIZE + tilingDataPointer->blockDim * BLOCK_SIZE * TWICE_WORKSPACE) * sizeof(int)); kernelInfo.GetScratchSizes().push_back(sysWorkspaceSize); kernelInfo.SetMemsetInfo(WORK_SPACE_INDEX, sysWorkspaceSize); return Status::OkStatus(); -- Gitee From 245fb24a8ec6a7e14fb2aa8bfdc3c89925b4f9d4 Mon Sep 17 00:00:00 2001 From: He Changcheng Date: Mon, 22 Sep 2025 21:38:44 +0800 Subject: [PATCH 60/94] fix --- src/kernels/lcal/src/lccl.cpp | 42 +++++++++++++---------------------- 1 file changed, 16 insertions(+), 26 deletions(-) diff --git a/src/kernels/lcal/src/lccl.cpp b/src/kernels/lcal/src/lccl.cpp index e6ab4f2f..77d5d6d6 100644 --- a/src/kernels/lcal/src/lccl.cpp +++ b/src/kernels/lcal/src/lccl.cpp @@ -36,34 +36,24 @@ int GetAclResInCurThread(int type, uint32_t &resource) static AclrtGetResInCurrentThreadFunc aclFn = nullptr; std::call_once(onceFlag, []() { - std::string home = Mki::GetEnv("ASCEND_HOME_PATH"); - std::vector candidates; - if (!home.empty()) { - candidates.emplace_back(home + "/runtime/lib64/libascendcl.so"); - } else { - MKI_LOG(ERROR) << "ASCEND_HOME_PATH is empty."; + std::string p = Mki::GetEnv("ASCEND_HOME_PATH") + "/runtime/lib64/libascendcl.so"; + auto dl = std::make_unique(p, false); + if (!dl->IsValid()) { + MKI_LOG(ERROR) << "Try load libascendcl.so failed: " << p; + initFlag.store(LCAL_ERROR_NOT_FOUND, std::memory_order_release); + return; } - - for (const auto &p : candidates) { - auto dl = std::make_unique(p, false); - if (!dl->IsValid()) { - MKI_LOG(WARN) << "Try load libascendcl.so failed: " << p; - continue; - } - auto sym = dl->GetSymbol("aclrtGetResInCurrentThread"); - if (sym == nullptr) { - MKI_LOG(WARN) << "Symbol aclrtGetResInCurrentThread not found in: " << p; - continue; - } - mkiDl = std::move(dl); // 保留句柄,防止卸载 - aclFn = reinterpret_cast(sym); - initFlag.store(LCAL_SUCCESS, std::memory_order_release); - MKI_LOG(DEBUG) << "Loaded libascendcl.so and resolved aclrtGetResInCurrentThread from: " << p; - return; // 成功 + 
auto sym = dl->GetSymbol("aclrtGetResInCurrentThread"); + if (sym == nullptr) { + MKI_LOG(WARN) << "Symbol aclrtGetResInCurrentThread not found in: " << p; + initFlag.store(LCAL_ERROR_NOT_FOUND, std::memory_order_release); + return; } - // 失败 - MKI_LOG(ERROR) << "Failed to load libascendcl.so or resolve aclrtGetResInCurrentThread."; - initFlag.store(LCAL_ERROR_NOT_FOUND, std::memory_order_release); + mkiDl = std::move(dl); // 保留句柄,防止卸载 + aclFn = reinterpret_cast(sym); + initFlag.store(LCAL_SUCCESS, std::memory_order_release); + MKI_LOG(DEBUG) << "Loaded libascendcl.so and resolved aclrtGetResInCurrentThread from: " << p; + return; // 成功 }); // 初始化结果判定 -- Gitee From bc11a5e97f6314da30d304d891d6750e7cc9d2e5 Mon Sep 17 00:00:00 2001 From: He Changcheng Date: Mon, 22 Sep 2025 21:48:59 +0800 Subject: [PATCH 61/94] fix --- src/kernels/lcal/include/lcal_comm.h | 2 +- src/kernels/lcal/src/lcal_comm.cpp | 6 +++--- src/kernels/lcal/src/lccl.cpp | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/kernels/lcal/include/lcal_comm.h b/src/kernels/lcal/include/lcal_comm.h index bff77eea..6ec0fbd7 100644 --- a/src/kernels/lcal/include/lcal_comm.h +++ b/src/kernels/lcal/include/lcal_comm.h @@ -63,7 +63,7 @@ private: int GetName(std::string &name, char names[LCAL_MAX_RANK_SIZE][IPC_NAME_SIZE]) const; int SyncCommArgs(); int InitDumpAddr(); - + private: int rank_ = 0; // global rank id int rankSize_ = 0; // global rank size diff --git a/src/kernels/lcal/src/lcal_comm.cpp b/src/kernels/lcal/src/lcal_comm.cpp index 8b77500a..b54380a0 100644 --- a/src/kernels/lcal/src/lcal_comm.cpp +++ b/src/kernels/lcal/src/lcal_comm.cpp @@ -303,7 +303,7 @@ int LcalComm::Init() if (inited_) { return LCAL_SUCCESS; } - if (rank_ < 0 || rank_ >= rankSize_ || rankSize_ <= 0 || rankSize_ > LCAL_MAX_RANK_SIZE) { + if (rank_ < 0 || rank_ >= rankSize_ || rankSize_ <= 0 || rankSize_ > LCAL_MAX_RANK_SIZE) { MKI_LOG(ERROR) << "The rank is invalid! 
rank:" << rank_ << " rankSize:" << rankSize_; return LCAL_ERROR_PARA_CHECK_FAIL; } @@ -351,7 +351,7 @@ int LcalComm::InitThread(const std::string &uid) if (inited_) { return LCAL_SUCCESS; } - if (rank_ < 0 || rank_ >= rankSize_ || rankSize_ <= 0 || rankSize_ > LCAL_MAX_RANK_SIZE) { + if (rank_ < 0 || rank_ >= rankSize_ || rankSize_ <= 0 || rankSize_ > LCAL_MAX_RANK_SIZE) { MKI_LOG(ERROR) << "The rank is invalid! rank:" << rank_ << "rankSize:" << rankSize_; return LCAL_ERROR_PARA_CHECK_FAIL; } @@ -723,7 +723,7 @@ LcalComm::~LcalComm() FreePeerMem(commArgs_.dumpAddr); FreePeerMem(peerMem_[rank_]); FreePeerMem(commArgsPtr_); - } +} LcalComm::LcalComm(int rank, int rankSize) : rank_(rank), rankSize_(rankSize) { diff --git a/src/kernels/lcal/src/lccl.cpp b/src/kernels/lcal/src/lccl.cpp index 77d5d6d6..8bf1255b 100644 --- a/src/kernels/lcal/src/lccl.cpp +++ b/src/kernels/lcal/src/lccl.cpp @@ -36,7 +36,7 @@ int GetAclResInCurThread(int type, uint32_t &resource) static AclrtGetResInCurrentThreadFunc aclFn = nullptr; std::call_once(onceFlag, []() { - std::string p = Mki::GetEnv("ASCEND_HOME_PATH") + "/runtime/lib64/libascendcl.so"; + std::string p = std::string(Mki::GetEnv("ASCEND_HOME_PATH")) + "/runtime/lib64/libascendcl.so"; auto dl = std::make_unique(p, false); if (!dl->IsValid()) { MKI_LOG(ERROR) << "Try load libascendcl.so failed: " << p; -- Gitee From 8dfdb58f34c4f243325bf518047484aad5869a60 Mon Sep 17 00:00:00 2001 From: huangxiaolan Date: Tue, 23 Sep 2025 09:36:42 +0800 Subject: [PATCH 62/94] fix include --- src/ops_infer/multi_latent_attention/atb_acl_mla.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ops_infer/multi_latent_attention/atb_acl_mla.cpp b/src/ops_infer/multi_latent_attention/atb_acl_mla.cpp index 2784d656..54b9f909 100644 --- a/src/ops_infer/multi_latent_attention/atb_acl_mla.cpp +++ b/src/ops_infer/multi_latent_attention/atb_acl_mla.cpp @@ -8,7 +8,7 @@ * See LICENSE in the root of the software repository for the full text 
of the License. */ #include "atb/atb_acl.h" -#include "atb_acl_util.h" +#include "atb/utils/atb_acl_util.h" #include "atb/operation/operation_base.h" #ifdef __cplusplus -- Gitee From 460a7ed85c495033b85de9037f2e1fa1bb7b9dbd Mon Sep 17 00:00:00 2001 From: huangxiaolan Date: Tue, 23 Sep 2025 11:19:52 +0800 Subject: [PATCH 63/94] fix expression --- .../mixkernels/fusion/fusion_kernel.cpp | 4 ++-- .../mixkernels/fusion/fusion_operation.cpp | 18 +++++++++--------- .../mixkernels/fusion/tiling/fusion_tiling.cpp | 4 ++-- .../paged_cache_load_operation.cpp | 16 ++++++++-------- 4 files changed, 21 insertions(+), 21 deletions(-) diff --git a/src/kernels/mixkernels/fusion/fusion_kernel.cpp b/src/kernels/mixkernels/fusion/fusion_kernel.cpp index 7fb940a5..7f2d9e8b 100644 --- a/src/kernels/mixkernels/fusion/fusion_kernel.cpp +++ b/src/kernels/mixkernels/fusion/fusion_kernel.cpp @@ -31,7 +31,7 @@ public: bool CanSupport(const LaunchParam &launchParam) const override { OpParam::Fusion fusionType = launchParam.GetParam(); - if (OpParam::Fusion::MATMUL_ADD == fusionType.fusionType) { + if (fusionType.fusionType == OpParam::Fusion::MATMUL_ADD) { MKI_CHECK(launchParam.GetInTensorCount() == TENSOR_INPUT_NUM_MATMUL_ADD, "in tensor num invalid", return false); MKI_CHECK(launchParam.GetOutTensorCount() == TENSOR_OUTPUT_NUM, "out tensor num invalid", return false); @@ -41,7 +41,7 @@ public: MKI_CHECK(inTensor1.desc.dtype == TENSOR_DTYPE_FLOAT16, "in tensor 1 dtype invalid", return false); auto inTensor2 = launchParam.GetInTensor(DIM_2); MKI_CHECK(inTensor2.desc.dtype == TENSOR_DTYPE_FLOAT16, "in tensor 2 dtype invalid", return false); - } else if (OpParam::Fusion::MATMUL_GELU == fusionType.fusionType) { + } else if (fusionType.fusionType == OpParam::Fusion::MATMUL_GELU) { MKI_CHECK(launchParam.GetInTensorCount() == TENSOR_INPUT_NUM_MATMUL_ACTIVATE, "in tensor num invalid", return false); MKI_CHECK(launchParam.GetOutTensorCount() == TENSOR_OUTPUT_NUM, "out tensor num invalid", return 
false); diff --git a/src/kernels/mixkernels/fusion/fusion_operation.cpp b/src/kernels/mixkernels/fusion/fusion_operation.cpp index 3f61161d..ff035217 100644 --- a/src/kernels/mixkernels/fusion/fusion_operation.cpp +++ b/src/kernels/mixkernels/fusion/fusion_operation.cpp @@ -37,13 +37,13 @@ public: { std::string kernelName = "FusionMatmulAddKernel"; OpParam::Fusion fusionType = launchParam.GetParam(); - if (OpParam::Fusion::MATMUL_GELU == fusionType.fusionType) { + if (fusionType.fusionType == OpParam::Fusion::MATMUL_GELU) { kernelName = "FusionMatmulGeluKernel"; - } else if (OpParam::Fusion::MATMUL_SIGMOID == fusionType.fusionType) { + } else if (fusionType.fusionType == OpParam::Fusion::MATMUL_SIGMOID) { kernelName = "FusionMatmulSigmoidKernel"; - } else if (OpParam::Fusion::MATMUL_SWIGLU == fusionType.fusionType) { + } else if (fusionType.fusionType == OpParam::Fusion::MATMUL_SWIGLU) { kernelName = "FusionMatmulSwiGluKernel"; - } else if (OpParam::Fusion::NON_FUSION == fusionType.fusionType) { + } else if (fusionType.fusionType == OpParam::Fusion::NON_FUSION) { kernelName = "FusionErasedKernel"; } MKI_LOG(INFO) << "getBestKernel " << kernelName; @@ -242,13 +242,13 @@ public: { OpParam::Fusion fusionType = launchParam.GetParam(); std::string deviceVersion = PlatformInfo::Instance().GetPlatformName(); - if (OpParam::Fusion::MATMUL_ADD == fusionType.fusionType) { + if (fusionType.fusionType == OpParam::Fusion::MATMUL_ADD) { MatMulAddFusion(); - } else if (OpParam::Fusion::MATMUL_GELU == fusionType.fusionType) { + } else if (fusionType.fusionType == OpParam::Fusion::MATMUL_GELU) { MatMulGeluFusion(); - } else if (OpParam::Fusion::MATMUL_SIGMOID == fusionType.fusionType) { + } else if (fusionType.fusionType == OpParam::Fusion::MATMUL_SIGMOID) { MatMulSigmoidFusion(); - } else if (OpParam::Fusion::MATMUL_SWIGLU == fusionType.fusionType) { + } else if (fusionType.fusionType == OpParam::Fusion::MATMUL_SWIGLU) { MatMulSwigluFusion(); } else { ErasedFusion(); @@ -270,7 
+270,7 @@ protected: Status InferShapeImpl(const LaunchParam &launchParam, SVector &outTensors) const override { OpParam::Fusion fusionType = launchParam.GetParam(); - if (OpParam::Fusion::MATMUL_ADD == fusionType.fusionType) { + if (fusionType.fusionType == OpParam::Fusion::MATMUL_ADD) { auto inTensorDescA = launchParam.GetInTensor(2).desc; TensorDesc &tensorDescOut = outTensors[0].desc; tensorDescOut.dtype = TENSOR_DTYPE_FLOAT16; diff --git a/src/kernels/mixkernels/fusion/tiling/fusion_tiling.cpp b/src/kernels/mixkernels/fusion/tiling/fusion_tiling.cpp index 5145c511..130128ed 100644 --- a/src/kernels/mixkernels/fusion/tiling/fusion_tiling.cpp +++ b/src/kernels/mixkernels/fusion/tiling/fusion_tiling.cpp @@ -22,9 +22,9 @@ Status FusionTiling(const LaunchParam &launchParam, KernelInfo &kernelInfo) OpParam::Fusion fusionType = launchParam.GetParam(); std::string path(std::getenv("HOME")); path += std::string("/.atb_auto_fusion/bishengir_bin/") + - (OpParam::Fusion::MATMUL_ADD == fusionType.fusionType ? "libmatmul_add.so" : "libmatmul_gelu.so"); + (fusionType.fusionType ? "libmatmul_add.so" : "libmatmul_gelu.so" == OpParam::Fusion::MATMUL_ADD); std::string inferWorkspaceFuncName = - (OpParam::Fusion::MATMUL_ADD == fusionType.fusionType ? "matmul_add_" : "matmul_gelu_"); + (fusionType.fusionType ? 
"matmul_add_" : "matmul_gelu_" == OpParam::Fusion::MATMUL_ADD); FusionTilingData *tilingDataPtr = reinterpret_cast(kernelInfo.GetTilingHostAddr()); void *handle = dlopen(path.c_str(), RTLD_LAZY); if (!handle) { diff --git a/src/ops_infer/paged_cache_load/paged_cache_load_operation.cpp b/src/ops_infer/paged_cache_load/paged_cache_load_operation.cpp index 7daf6d45..6ab4f5e2 100644 --- a/src/ops_infer/paged_cache_load/paged_cache_load_operation.cpp +++ b/src/ops_infer/paged_cache_load/paged_cache_load_operation.cpp @@ -220,22 +220,22 @@ Status PagedCacheLoadOperation::KVCacheDimCheck910BNZ(const SVector return ERROR_INVALID_TENSOR_DIM_NUM; } if (inTensorDescs.at(IN_TENSOR_0_KEYCACHE).dtype == ACL_INT8) { - if (THIRTYTWO != inTensorDescs.at(IN_TENSOR_0_KEYCACHE).shape.dims[OUT_DIM] || - THIRTYTWO!= inTensorDescs.at(IN_TENSOR_1_VALUECACHE).shape.dims[OUT_DIM]) { // 1: valueCache + if (inTensorDescs.at(IN_TENSOR_0_KEYCACHE).shape.dims[OUT_DIM] != THIRTYTWO || + inTensorDescs.at(IN_TENSOR_1_VALUECACHE).shape.dims[OUT_DIM] != THIRTYTWO) { // 1: valueCache ATB_LOG(ERROR) << GetLogPrefix() << "The last dimension of keycache and valuecache must be 32"; return ERROR_INVALID_TENSOR_DIM; } - if (MAX_k < inTensorDescs.at(IN_TENSOR_0_KEYCACHE).shape.dims[1] * THIRTYTWO || - MAX_v < inTensorDescs.at(IN_TENSOR_1_VALUECACHE).shape.dims[1] * THIRTYTWO) { + if (inTensorDescs.at(IN_TENSOR_0_KEYCACHE).shape.dims[1] * THIRTYTWO > MAX_k || + inTensorDescs.at(IN_TENSOR_1_VALUECACHE).shape.dims[1] * THIRTYTWO > MAX_v) { ATB_LOG(ERROR) << GetLogPrefix() << "The scend dimension of blocktables must be less than 147456"; return ERROR_INVALID_TENSOR_DIM; } - } else if (SIXTEEN != inTensorDescs.at(IN_TENSOR_0_KEYCACHE).shape.dims[OUT_DIM] || - SIXTEEN!= inTensorDescs.at(IN_TENSOR_1_VALUECACHE).shape.dims[OUT_DIM]) { // 1: valueCache + } else if (inTensorDescs.at(IN_TENSOR_0_KEYCACHE).shape.dims[OUT_DIM] != SIXTEEN || + inTensorDescs.at(IN_TENSOR_1_VALUECACHE).shape.dims[OUT_DIM] != SIXTEEN) { // 
1: valueCache ATB_LOG(ERROR) << GetLogPrefix() << "The last dimension of keycache and valuecache must be 16"; return ERROR_INVALID_TENSOR_DIM; - } else if (MAX_k < inTensorDescs.at(IN_TENSOR_0_KEYCACHE).shape.dims[1] * SIXTEEN || - MAX_v < inTensorDescs.at(IN_TENSOR_1_VALUECACHE).shape.dims[1] * SIXTEEN) { + } else if (inTensorDescs.at(IN_TENSOR_0_KEYCACHE).shape.dims[1] * SIXTEEN > MAX_k || + inTensorDescs.at(IN_TENSOR_1_VALUECACHE).shape.dims[1] * SIXTEEN > MAX_v) { ATB_LOG(ERROR) << GetLogPrefix() << "The scend dimension of blocktables must be less than 147456"; return ERROR_INVALID_TENSOR_DIM; } -- Gitee From 82e4298c723c70ca9f3524a771983bd1007ccf8b Mon Sep 17 00:00:00 2001 From: guanguan Date: Tue, 23 Sep 2025 11:26:49 +0800 Subject: [PATCH 64/94] fix error --- src/ops_infer/linear_parallel/linear_parallel_operation.cpp | 1 + src/ops_infer/linear_parallel/linear_parallel_operation.h | 1 + 2 files changed, 2 insertions(+) mode change 100644 => 100755 src/ops_infer/linear_parallel/linear_parallel_operation.h diff --git a/src/ops_infer/linear_parallel/linear_parallel_operation.cpp b/src/ops_infer/linear_parallel/linear_parallel_operation.cpp index ea29279e..810c8d75 100644 --- a/src/ops_infer/linear_parallel/linear_parallel_operation.cpp +++ b/src/ops_infer/linear_parallel/linear_parallel_operation.cpp @@ -37,6 +37,7 @@ static const uint32_t RESIDUAL_TENSOR_INDEX_3 = 3; static const uint32_t RESIDUAL_TENSOR_INDEX_4 = 4; static const uint32_t MAX_OUTPUT_SIZE = 204800; static const uint32_t MAX_K = 24000; +static const uint32_t DIM_4 = 4; static bool AllToAllvcAllGatherGmmOutTensorCheck(const SVector &inTensorDescs, const TensorDesc &outTensorDesc, const std::string &logPrefix) diff --git a/src/ops_infer/linear_parallel/linear_parallel_operation.h b/src/ops_infer/linear_parallel/linear_parallel_operation.h old mode 100644 new mode 100755 index d9e17af5..2658ac3f --- a/src/ops_infer/linear_parallel/linear_parallel_operation.h +++ 
b/src/ops_infer/linear_parallel/linear_parallel_operation.h @@ -38,6 +38,7 @@ private: Status InferShapeAllToAllvcAllGatherGmm(const SVector &inTensorDescs, SVector &outTensorDescs) const; Status CheckResidual(const SVector &inTensorDescs) const; + Status CheckWeightNzFormat(const SVector &inTensorDescs) const; Status InferShapeCheckLinearAllReduce(const SVector &inTensorDescs) const; Status InferShapeCheckLinearReduceScatter(const SVector &inTensorDescs) const; Status InferShapeCheckAllGatherLinear(const SVector &inTensorDescs) const; -- Gitee From 4ea1802054b32cc80946423072e6e4448a3255b6 Mon Sep 17 00:00:00 2001 From: He Changcheng Date: Tue, 23 Sep 2025 11:35:24 +0800 Subject: [PATCH 65/94] fix --- src/kernels/lcal/src/lccl.cpp | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/src/kernels/lcal/src/lccl.cpp b/src/kernels/lcal/src/lccl.cpp index 8bf1255b..b1d81330 100644 --- a/src/kernels/lcal/src/lccl.cpp +++ b/src/kernels/lcal/src/lccl.cpp @@ -26,7 +26,7 @@ using namespace chrono; using namespace Mki; namespace Lcal { -using AclrtGetResInCurrentThreadFunc = int(*)(int, uint32_t*); +using AclrtGetResInCurrentThreadFunc = int(*)(int, uint32_t *); int GetAclResInCurThread(int type, uint32_t &resource) { @@ -36,7 +36,13 @@ int GetAclResInCurThread(int type, uint32_t &resource) static AclrtGetResInCurrentThreadFunc aclFn = nullptr; std::call_once(onceFlag, []() { - std::string p = std::string(Mki::GetEnv("ASCEND_HOME_PATH")) + "/runtime/lib64/libascendcl.so"; + std::string p; + const char *c = Mki::GetEnv("ASCEND_HOME_PATH"); + if (c) { + p = std::string(c) + "/runtime/lib64/libascendcl.so"; + } else { + p = "libascendcl.so"; + } auto dl = std::make_unique(p, false); if (!dl->IsValid()) { MKI_LOG(ERROR) << "Try load libascendcl.so failed: " << p; -- Gitee From 90bb43b677343b0d3f395c429a612d0adc99a498 Mon Sep 17 00:00:00 2001 From: qq_44359711 Date: Tue, 23 Sep 2025 15:13:06 +0800 Subject: [PATCH 66/94] update --- 
example/op_demo/activation/README.md | 4 +++- example/op_demo/all_gather/README.md | 4 +++- example/op_demo/all_reduce/README.md | 4 +++- example/op_demo/concat/README.md | 4 +++- example/op_demo/elewise/README.md | 4 +++- example/op_demo/fused_add_topk_div/README.md | 4 +++- example/op_demo/gather/README.md | 4 +++- example/op_demo/layer_norm/README.md | 4 +++- example/op_demo/linear/README.md | 7 ++----- example/op_demo/linear_parallel/README.md | 4 +++- example/op_demo/mla_preprocess/README.md | 5 ++++- .../op_demo/multi_latent_attention/README.md | 5 ++++- example/op_demo/paged_attention/README.md | 4 +++- example/op_demo/paged_cache_load/README.md | 16 ++++++++++++++-- example/op_demo/reshape_and_cache/README.md | 4 +++- example/op_demo/ring_mla/README.md | 6 ++++-- example/op_demo/rms_norm/README.md | 17 +++++++++++++++-- example/op_demo/rms_norm_backward/README.md | 4 +++- example/op_demo/rope/README.md | 16 ++++++++++++++-- example/op_demo/self_attention/README.md | 6 ++++-- example/op_demo/slice/README.md | 4 +++- example/op_demo/split/README.md | 4 +++- example/op_demo/transdata/README.md | 4 +++- example/op_demo/transpose/README.md | 5 +++-- 24 files changed, 109 insertions(+), 34 deletions(-) diff --git a/example/op_demo/activation/README.md b/example/op_demo/activation/README.md index 2616d541..af988f86 100644 --- a/example/op_demo/activation/README.md +++ b/example/op_demo/activation/README.md @@ -12,7 +12,9 @@ 例如: source ./ascend-transformer-boost/output/atb/set_env.sh - 运行demo - - bash build.sh + ```sh + bash build.sh + ``` **注意**: - 使用cxx_abi=0(默认)时,设置`D_GLIBCXX_USE_CXX11_ABI`为0,即: ```sh diff --git a/example/op_demo/all_gather/README.md b/example/op_demo/all_gather/README.md index 8897b7bf..0d071aa3 100644 --- a/example/op_demo/all_gather/README.md +++ b/example/op_demo/all_gather/README.md @@ -12,7 +12,9 @@ 例如: source ./ascend-transformer-boost/output/atb/set_env.sh - 运行demo - - bash build.sh + ```sh + bash build.sh + ``` **注意**: - 
使用cxx_abi=0(默认)时,设置`D_GLIBCXX_USE_CXX11_ABI`为0,即: ```sh diff --git a/example/op_demo/all_reduce/README.md b/example/op_demo/all_reduce/README.md index 17d8d4b0..75fc37fc 100644 --- a/example/op_demo/all_reduce/README.md +++ b/example/op_demo/all_reduce/README.md @@ -12,7 +12,9 @@ 例如: source ./ascend-transformer-boost/output/atb/set_env.sh - 运行demo - - bash build.sh + ```sh + bash build.sh + ``` **注意**: - 使用cxx_abi=0(默认)时,设置`D_GLIBCXX_USE_CXX11_ABI`为0,即: ```sh diff --git a/example/op_demo/concat/README.md b/example/op_demo/concat/README.md index dd76fc32..73ed0a5c 100644 --- a/example/op_demo/concat/README.md +++ b/example/op_demo/concat/README.md @@ -12,7 +12,9 @@ 例如: source ./ascend-transformer-boost/output/atb/set_env.sh - 运行demo - - bash build.sh + ```sh + bash build.sh + ``` **注意**: - 使用cxx_abi=0(默认)时,设置`D_GLIBCXX_USE_CXX11_ABI`为0,即: ```sh diff --git a/example/op_demo/elewise/README.md b/example/op_demo/elewise/README.md index cd9c719b..d86f53ed 100644 --- a/example/op_demo/elewise/README.md +++ b/example/op_demo/elewise/README.md @@ -12,7 +12,9 @@ 例如: source ./ascend-transformer-boost/output/atb/set_env.sh - 运行demo - - bash build.sh + ```sh + bash build.sh + ``` **注意**: - 使用cxx_abi=0(默认)时,设置`D_GLIBCXX_USE_CXX11_ABI`为0,即: ```sh diff --git a/example/op_demo/fused_add_topk_div/README.md b/example/op_demo/fused_add_topk_div/README.md index c4a50d74..ad529f6e 100644 --- a/example/op_demo/fused_add_topk_div/README.md +++ b/example/op_demo/fused_add_topk_div/README.md @@ -12,7 +12,9 @@ 例如: source ./ascend-transformer-boost/output/atb/set_env.sh - 运行demo - - bash build.sh + ```sh + bash build.sh + ``` **注意**: - 使用cxx_abi=0(默认)时,设置`D_GLIBCXX_USE_CXX11_ABI`为0,即: ```sh diff --git a/example/op_demo/gather/README.md b/example/op_demo/gather/README.md index b0e16a71..cdcd67a5 100644 --- a/example/op_demo/gather/README.md +++ b/example/op_demo/gather/README.md @@ -12,7 +12,9 @@ 例如: source ./ascend-transformer-boost/output/atb/set_env.sh - 运行demo - - bash build.sh + ```sh + 
bash build.sh + ``` **注意**: - 使用cxx_abi=0(默认)时,设置`D_GLIBCXX_USE_CXX11_ABI`为0,即: ```sh diff --git a/example/op_demo/layer_norm/README.md b/example/op_demo/layer_norm/README.md index cf4437eb..429c9c78 100644 --- a/example/op_demo/layer_norm/README.md +++ b/example/op_demo/layer_norm/README.md @@ -12,7 +12,9 @@ 例如: source ./ascend-transformer-boost/output/atb/set_env.sh - 运行demo - - bash build.sh + ```sh + bash build.sh + ``` **注意**: - 使用cxx_abi=0(默认)时,设置`D_GLIBCXX_USE_CXX11_ABI`为0,即: ```sh diff --git a/example/op_demo/linear/README.md b/example/op_demo/linear/README.md index e1547d92..5f244c0a 100644 --- a/example/op_demo/linear/README.md +++ b/example/op_demo/linear/README.md @@ -18,15 +18,12 @@ ```sh bash build.sh ``` - **注意**: - - 使用cxx_abi=0(默认)时,设置`D_GLIBCXX_USE_CXX11_ABI`为0,即: - + - 使用cxx_abi=0(默认)时,设置`D_GLIBCXX_USE_CXX11_ABI`为0,即: ```sh g++ -D_GLIBCXX_USE_CXX11_ABI=0 -I ... ``` - - 使用cxx_abi=1时,更改`D_GLIBCXX_USE_CXX11_ABI`为1,即: - + - 使用cxx_abi=1时,更改`D_GLIBCXX_USE_CXX11_ABI`为1,即: ```sh g++ -D_GLIBCXX_USE_CXX11_ABI=1 -I ... ``` diff --git a/example/op_demo/linear_parallel/README.md b/example/op_demo/linear_parallel/README.md index 3885aac5..238d1651 100644 --- a/example/op_demo/linear_parallel/README.md +++ b/example/op_demo/linear_parallel/README.md @@ -12,7 +12,9 @@ 例如: source ./ascend-transformer-boost/output/atb/set_env.sh - 运行demo - - bash build.sh + ```sh + bash build.sh + ``` **注意**: - 使用cxx_abi=0(默认)时,设置`D_GLIBCXX_USE_CXX11_ABI`为0,即: ```sh diff --git a/example/op_demo/mla_preprocess/README.md b/example/op_demo/mla_preprocess/README.md index 87c24619..50f59fc4 100644 --- a/example/op_demo/mla_preprocess/README.md +++ b/example/op_demo/mla_preprocess/README.md @@ -12,7 +12,9 @@ 例如: source ./ascend-transformer-boost/output/atb/set_env.sh - 运行demo - - bash build.sh + ```sh + bash build.sh + ``` **注意**: - 使用cxx_abi=0(默认)时,设置`D_GLIBCXX_USE_CXX11_ABI`为0,即: ```sh @@ -22,6 +24,7 @@ ```sh g++ -D_GLIBCXX_USE_CXX11_ABI=1 -I ... 
``` + - 提供的build脚本仅用于编译和运行mlapo_demo.cpp,如需编译其他demo,需要替换“mlapo_demo”为对应的cpp文件名 ## 额外说明 示例中生成的数据不代表实际场景,如需数据生成参考请查看python用例目录: diff --git a/example/op_demo/multi_latent_attention/README.md b/example/op_demo/multi_latent_attention/README.md index 55ef7ce4..cd042604 100644 --- a/example/op_demo/multi_latent_attention/README.md +++ b/example/op_demo/multi_latent_attention/README.md @@ -11,7 +11,9 @@ 1. 如果使用加速库源码编译,source [加速库源码路径]/output/atb/set_env.sh 例如: source ./ascend-transformer-boost/output/atb/set_env.sh - 运行demo - - bash build.sh + ```sh + bash build.sh + ``` **注意**: - 使用cxx_abi=0(默认)时,设置`D_GLIBCXX_USE_CXX11_ABI`为0,即: ```sh @@ -21,6 +23,7 @@ ```sh g++ -D_GLIBCXX_USE_CXX11_ABI=1 -I ... ``` + - 提供的build脚本仅用于编译和运行mlapo_demo.cpp,如需编译其他demo,需要替换“mlapo_demo”为对应的cpp文件名 ## 额外说明 示例中生成的数据不代表实际场景,如需数据生成参考请查看python用例目录: diff --git a/example/op_demo/paged_attention/README.md b/example/op_demo/paged_attention/README.md index 49fcbd6b..136271c2 100644 --- a/example/op_demo/paged_attention/README.md +++ b/example/op_demo/paged_attention/README.md @@ -12,7 +12,9 @@ 例如: source ./ascend-transformer-boost/output/atb/set_env.sh - 运行demo - - bash build.sh + ```sh + bash build.sh + ``` **注意**: - 使用cxx_abi=0(默认)时,设置`D_GLIBCXX_USE_CXX11_ABI`为0,即: ```sh diff --git a/example/op_demo/paged_cache_load/README.md b/example/op_demo/paged_cache_load/README.md index 055e0bef..66745e0c 100644 --- a/example/op_demo/paged_cache_load/README.md +++ b/example/op_demo/paged_cache_load/README.md @@ -11,8 +11,20 @@ 1. 如果使用加速库源码编译,source [加速库源码路径]/output/atb/set_env.sh 例如: source ./ascend-transformer-boost/output/atb/set_env.sh -- 编译、运行demo - - bash build.sh +- 运行demo + ```sh + bash build.sh + ``` + **注意**: + - 使用cxx_abi=0(默认)时,设置`D_GLIBCXX_USE_CXX11_ABI`为0,即: + ```sh + g++ -D_GLIBCXX_USE_CXX11_ABI=0 -I ... + ``` + - 使用cxx_abi=1时,更改`D_GLIBCXX_USE_CXX11_ABI`为1,即: + ```sh + g++ -D_GLIBCXX_USE_CXX11_ABI=1 -I ... 
+ ``` + - 提供的build脚本仅用于编译和运行paged_cache_load_demo.cpp,如需编译其他demo,需要替换“paged_cache_load_demo”为对应的cpp文件名 ## 额外说明 示例中生成的数据不代表实际场景,如需数据生成参考请查看python用例目录: diff --git a/example/op_demo/reshape_and_cache/README.md b/example/op_demo/reshape_and_cache/README.md index f4e4ede9..75580358 100644 --- a/example/op_demo/reshape_and_cache/README.md +++ b/example/op_demo/reshape_and_cache/README.md @@ -12,7 +12,9 @@ 例如: source ./ascend-transformer-boost/output/atb/set_env.sh - 运行demo - - bash build.sh + ```sh + bash build.sh + ``` **注意**: - 使用cxx_abi=0(默认)时,设置`D_GLIBCXX_USE_CXX11_ABI`为0,即: ```sh diff --git a/example/op_demo/ring_mla/README.md b/example/op_demo/ring_mla/README.md index 333678bd..4011ef7e 100644 --- a/example/op_demo/ring_mla/README.md +++ b/example/op_demo/ring_mla/README.md @@ -12,9 +12,11 @@ 例如: source ./ascend-transformer-boost/output/atb/set_env.sh - 运行demo - - bash build.sh + ```sh + bash build.sh + ``` **注意**: - - 使用cxx_abi=0时,设置`D_GLIBCXX_USE_CXX11_ABI`为0,即: + - 使用cxx_abi=0(默认)时,设置`D_GLIBCXX_USE_CXX11_ABI`为0,即: ```sh g++ -D_GLIBCXX_USE_CXX11_ABI=0 -I ... ``` diff --git a/example/op_demo/rms_norm/README.md b/example/op_demo/rms_norm/README.md index f3b43873..b0a35b58 100644 --- a/example/op_demo/rms_norm/README.md +++ b/example/op_demo/rms_norm/README.md @@ -11,8 +11,21 @@ 1. 如果使用加速库源码编译,source [加速库源码路径]/output/atb/set_env.sh 例如: source ./ascend-transformer-boost/output/atb/set_env.sh -- 编译、运行demo - - bash build.sh +- 运行demo + ```sh + bash build.sh + ``` + **注意**: + - 使用cxx_abi=0(默认)时,设置`D_GLIBCXX_USE_CXX11_ABI`为0,即: + ```sh + g++ -D_GLIBCXX_USE_CXX11_ABI=0 -I ... + ``` + - 使用cxx_abi=1时,更改`D_GLIBCXX_USE_CXX11_ABI`为1,即: + ```sh + g++ -D_GLIBCXX_USE_CXX11_ABI=1 -I ... 
+ ``` + - 提供的build脚本仅用于编译和运行rms_norm_demo.cpp,如需编译其他demo,需要替换“rms_norm_demo”为对应的cpp文件名 + ## 额外说明 示例中生成的数据不代表实际场景,如需数据生成参考请查看python用例目录: diff --git a/example/op_demo/rms_norm_backward/README.md b/example/op_demo/rms_norm_backward/README.md index 78daf3f9..409088a7 100644 --- a/example/op_demo/rms_norm_backward/README.md +++ b/example/op_demo/rms_norm_backward/README.md @@ -12,7 +12,9 @@ 例如: source ./ascend-transformer-boost/output/atb/set_env.sh - 运行demo - - bash build.sh + ```sh + bash build.sh + ``` **注意**: - 使用cxx_abi=0(默认)时,设置`D_GLIBCXX_USE_CXX11_ABI`为0,即: ```sh diff --git a/example/op_demo/rope/README.md b/example/op_demo/rope/README.md index c6c1ec27..e2072c77 100644 --- a/example/op_demo/rope/README.md +++ b/example/op_demo/rope/README.md @@ -11,8 +11,20 @@ 1. 如果使用加速库源码编译,source [加速库源码路径]/output/atb/set_env.sh 例如: source ./ascend-transformer-boost/output/atb/set_env.sh -- 编译、运行demo - - bash build.sh +- 运行demo + ```sh + bash build.sh + ``` + **注意**: + - 使用cxx_abi=0(默认)时,设置`D_GLIBCXX_USE_CXX11_ABI`为0,即: + ```sh + g++ -D_GLIBCXX_USE_CXX11_ABI=0 -I ... + ``` + - 使用cxx_abi=1时,更改`D_GLIBCXX_USE_CXX11_ABI`为1,即: + ```sh + g++ -D_GLIBCXX_USE_CXX11_ABI=1 -I ... + ``` + - 提供的build脚本仅用于编译和运行rope_demo.cpp,如需编译其他demo,需要替换“rope_demo”为对应的cpp文件名 ## 额外说明 示例中生成的数据不代表实际场景,如需数据生成参考请查看python用例目录: diff --git a/example/op_demo/self_attention/README.md b/example/op_demo/self_attention/README.md index 8f428192..1b56c3af 100644 --- a/example/op_demo/self_attention/README.md +++ b/example/op_demo/self_attention/README.md @@ -12,9 +12,11 @@ 例如: source ./ascend-transformer-boost/output/atb/set_env.sh - 运行demo - - bash build.sh + ```sh + bash build.sh + ``` **注意**: - - 使用cxx_abi=0时,设置`D_GLIBCXX_USE_CXX11_ABI`为0,即: + - 使用cxx_abi=0(默认)时,设置`D_GLIBCXX_USE_CXX11_ABI`为0,即: ```sh g++ -D_GLIBCXX_USE_CXX11_ABI=0 -I ... 
``` diff --git a/example/op_demo/slice/README.md b/example/op_demo/slice/README.md index aa0ebb81..352a04a9 100644 --- a/example/op_demo/slice/README.md +++ b/example/op_demo/slice/README.md @@ -12,7 +12,9 @@ 例如: source ./ascend-transformer-boost/output/atb/set_env.sh - 运行demo - - bash build.sh + ```sh + bash build.sh + ``` **注意**: - 使用cxx_abi=0(默认)时,设置`D_GLIBCXX_USE_CXX11_ABI`为0,即: ```sh diff --git a/example/op_demo/split/README.md b/example/op_demo/split/README.md index 39fa1e02..d33fa164 100644 --- a/example/op_demo/split/README.md +++ b/example/op_demo/split/README.md @@ -12,7 +12,9 @@ 例如: source ./ascend-transformer-boost/output/atb/set_env.sh - 运行demo - - bash build.sh + ```sh + bash build.sh + ``` **注意**: - 使用cxx_abi=0(默认)时,设置`D_GLIBCXX_USE_CXX11_ABI`为0,即: ```sh diff --git a/example/op_demo/transdata/README.md b/example/op_demo/transdata/README.md index 99b54219..6db3ab25 100644 --- a/example/op_demo/transdata/README.md +++ b/example/op_demo/transdata/README.md @@ -12,7 +12,9 @@ 例如: source ./ascend-transformer-boost/output/atb/set_env.sh - 运行demo - - bash build.sh + ```sh + bash build.sh + ``` **注意**: - 使用cxx_abi=0(默认)时,设置`D_GLIBCXX_USE_CXX11_ABI`为0,即: ```sh diff --git a/example/op_demo/transpose/README.md b/example/op_demo/transpose/README.md index d73e19e9..bc2a8a56 100644 --- a/example/op_demo/transpose/README.md +++ b/example/op_demo/transpose/README.md @@ -12,7 +12,9 @@ 例如: source ./ascend-transformer-boost/output/atb/set_env.sh - 运行demo - - bash build.sh + ```sh + bash build.sh + ``` **注意**: - 使用cxx_abi=0(默认)时,设置`D_GLIBCXX_USE_CXX11_ABI`为0,即: ```sh @@ -22,7 +24,6 @@ ```sh g++ -D_GLIBCXX_USE_CXX11_ABI=1 -I ... 
``` - ## 额外说明 示例中生成的数据不代表实际场景,如需数据生成参考请查看python用例目录: tests/apitest/opstest/python/operations/transpose_demo/ -- Gitee From 12257c40941ef0af7e43cfee4313b4315dccaf08 Mon Sep 17 00:00:00 2001 From: huangxiaolan Date: Tue, 23 Sep 2025 15:55:54 +0800 Subject: [PATCH 67/94] add fix predefine --- .../tbe_adapter/platform/tiling/platform/platform_ascendc.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/kernels/tbe_adapter/platform/tiling/platform/platform_ascendc.h b/src/kernels/tbe_adapter/platform/tiling/platform/platform_ascendc.h index 6eda0979..6fa0fcfd 100644 --- a/src/kernels/tbe_adapter/platform/tiling/platform/platform_ascendc.h +++ b/src/kernels/tbe_adapter/platform/tiling/platform/platform_ascendc.h @@ -18,6 +18,7 @@ #include #include +#include "stubs\include\metadef\inc\external\platform\platform_info.h" #define ASCENDC_ASSERT(cond, behavior) \ do { \ @@ -26,9 +27,6 @@ raise(SIGABRT); \ } \ } while (0) -namespace fe { -class PlatFormInfos; -} namespace platform_ascendc { enum class CoreMemType { -- Gitee From 2a2d8b6936b63f734da476a005438c2ed0396e06 Mon Sep 17 00:00:00 2001 From: huangxiaolan Date: Tue, 23 Sep 2025 15:58:54 +0800 Subject: [PATCH 68/94] fix expression --- src/kernels/mixkernels/fusion/tiling/fusion_tiling.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/kernels/mixkernels/fusion/tiling/fusion_tiling.cpp b/src/kernels/mixkernels/fusion/tiling/fusion_tiling.cpp index 130128ed..142e6a26 100644 --- a/src/kernels/mixkernels/fusion/tiling/fusion_tiling.cpp +++ b/src/kernels/mixkernels/fusion/tiling/fusion_tiling.cpp @@ -22,9 +22,9 @@ Status FusionTiling(const LaunchParam &launchParam, KernelInfo &kernelInfo) OpParam::Fusion fusionType = launchParam.GetParam(); std::string path(std::getenv("HOME")); path += std::string("/.atb_auto_fusion/bishengir_bin/") + - (fusionType.fusionType ? "libmatmul_add.so" : "libmatmul_gelu.so" == OpParam::Fusion::MATMUL_ADD); + ((fusionType.fusionType ? 
"libmatmul_add.so" : "libmatmul_gelu.so") == OpParam::Fusion::MATMUL_ADD); std::string inferWorkspaceFuncName = - (fusionType.fusionType ? "matmul_add_" : "matmul_gelu_" == OpParam::Fusion::MATMUL_ADD); + ((fusionType.fusionType ? "matmul_add_" : "matmul_gelu_") == OpParam::Fusion::MATMUL_ADD); FusionTilingData *tilingDataPtr = reinterpret_cast(kernelInfo.GetTilingHostAddr()); void *handle = dlopen(path.c_str(), RTLD_LAZY); if (!handle) { -- Gitee From e4e9517544540766c8be93cf2d32361786fa2868 Mon Sep 17 00:00:00 2001 From: wanyukang Date: Tue, 23 Sep 2025 16:02:49 +0800 Subject: [PATCH 69/94] compute --- src/kernels/mixkernels/toppsample/op_kernel/toppsample.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/kernels/mixkernels/toppsample/op_kernel/toppsample.cpp b/src/kernels/mixkernels/toppsample/op_kernel/toppsample.cpp index fbc9eda7..0b1d7c09 100644 --- a/src/kernels/mixkernels/toppsample/op_kernel/toppsample.cpp +++ b/src/kernels/mixkernels/toppsample/op_kernel/toppsample.cpp @@ -18,7 +18,6 @@ static constexpr uint32_t MAX_CORE_NUM = 512; static constexpr uint32_t BLK_SIZE = 32; static constexpr uint32_t DEFAULT_STRIDE = 8; static constexpr uint32_t FP32_PER_REPEAT = 64; -static constexpr uint32_t FP16_PER_REPEAT = 128; static constexpr uint32_t FP16_PER_BLOCK = 16; static constexpr uint32_t NUM_4 = 4; @@ -183,7 +182,7 @@ private: AscendC::LocalTensor fp32Buf = fp32Buf_.Get(); AscendC::LocalTensor fp16Buf = fp32Buf_.Get(); AscendC::LocalTensor fp32TempBuf = tempBuf_.Get(); - uint32_t copyEleNumAlignF16_ = (copyEleNum + FP16_PER_REPEAT - 1) / FP16_PER_REPEAT * FP16_PER_REPEAT; + uint32_t copyEleNumAlignF16_ = (copyEleNum + FP16_PER_BLOCK - 1) / FP16_PER_BLOCK * FP16_PER_BLOCK; uint32_t copyEleNumAlignF32_ = (copyEleNum + FP32_PER_REPEAT - 1) / FP32_PER_REPEAT * FP32_PER_REPEAT; for (uint32_t dupVal = copyEleNum; dupVal < copyEleNumAlignF16_; dupVal++) { buf.SetValue(dupVal, T(1)); -- Gitee From 65a220b3bc4bb99914137bab4493364b94b06997 
Mon Sep 17 00:00:00 2001 From: huangxiaolan Date: Tue, 23 Sep 2025 16:20:09 +0800 Subject: [PATCH 70/94] fix expression --- src/kernels/mixkernels/fusion/tiling/fusion_tiling.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/kernels/mixkernels/fusion/tiling/fusion_tiling.cpp b/src/kernels/mixkernels/fusion/tiling/fusion_tiling.cpp index 142e6a26..2f85cfd7 100644 --- a/src/kernels/mixkernels/fusion/tiling/fusion_tiling.cpp +++ b/src/kernels/mixkernels/fusion/tiling/fusion_tiling.cpp @@ -22,9 +22,9 @@ Status FusionTiling(const LaunchParam &launchParam, KernelInfo &kernelInfo) OpParam::Fusion fusionType = launchParam.GetParam(); std::string path(std::getenv("HOME")); path += std::string("/.atb_auto_fusion/bishengir_bin/") + - ((fusionType.fusionType ? "libmatmul_add.so" : "libmatmul_gelu.so") == OpParam::Fusion::MATMUL_ADD); + (fusionType.fusionType == OpParam::Fusion::MATMUL_ADD ? "libmatmul_add.so" : "libmatmul_gelu.so"); std::string inferWorkspaceFuncName = - ((fusionType.fusionType ? "matmul_add_" : "matmul_gelu_") == OpParam::Fusion::MATMUL_ADD); + (fusionType.fusionType == OpParam::Fusion::MATMUL_ADD ? 
"matmul_add_" : "matmul_gelu_"); FusionTilingData *tilingDataPtr = reinterpret_cast(kernelInfo.GetTilingHostAddr()); void *handle = dlopen(path.c_str(), RTLD_LAZY); if (!handle) { -- Gitee From 6cfe7b3baf028b0b248a287adcf4d2aaeef1b884 Mon Sep 17 00:00:00 2001 From: huangxiaolan Date: Tue, 23 Sep 2025 16:27:48 +0800 Subject: [PATCH 71/94] fix include order --- src/atb/utils/aclnn_util.cpp | 2 +- src/kernels/mixkernels/blockcopy/tiling/blockcopy_tiling.cpp | 2 +- src/ops_infer/linear_parallel/linear_parallel_aclnn_runner.cpp | 2 +- src/ops_infer/mla_preprocess/mla_preprocess_aclnn_runner.cpp | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/atb/utils/aclnn_util.cpp b/src/atb/utils/aclnn_util.cpp index dcbd1ac7..83ba2254 100644 --- a/src/atb/utils/aclnn_util.cpp +++ b/src/atb/utils/aclnn_util.cpp @@ -7,12 +7,12 @@ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. * See LICENSE in the root of the software repository for the full text of the License. */ +#include "aclnn_util.h" #include #include #include -#include "aclnn_util.h" #include "log.h" namespace { diff --git a/src/kernels/mixkernels/blockcopy/tiling/blockcopy_tiling.cpp b/src/kernels/mixkernels/blockcopy/tiling/blockcopy_tiling.cpp index 2ccdb31e..c3cb907e 100644 --- a/src/kernels/mixkernels/blockcopy/tiling/blockcopy_tiling.cpp +++ b/src/kernels/mixkernels/blockcopy/tiling/blockcopy_tiling.cpp @@ -7,8 +7,8 @@ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. * See LICENSE in the root of the software repository for the full text of the License. 
*/ -#include #include "blockcopy_tiling.h" +#include #include #include #include diff --git a/src/ops_infer/linear_parallel/linear_parallel_aclnn_runner.cpp b/src/ops_infer/linear_parallel/linear_parallel_aclnn_runner.cpp index 7e151f31..a2eabc4e 100644 --- a/src/ops_infer/linear_parallel/linear_parallel_aclnn_runner.cpp +++ b/src/ops_infer/linear_parallel/linear_parallel_aclnn_runner.cpp @@ -9,9 +9,9 @@ */ #include "linear_parallel_aclnn_runner.h" +#include #include "atb/utils/dl_manager.h" #include "atb/utils/aclnn_util.h" -#include namespace atb { diff --git a/src/ops_infer/mla_preprocess/mla_preprocess_aclnn_runner.cpp b/src/ops_infer/mla_preprocess/mla_preprocess_aclnn_runner.cpp index 4c5a102e..eaf43ae3 100644 --- a/src/ops_infer/mla_preprocess/mla_preprocess_aclnn_runner.cpp +++ b/src/ops_infer/mla_preprocess/mla_preprocess_aclnn_runner.cpp @@ -11,7 +11,7 @@ #include "atb/utils/dl_manager.h" #include "atb/utils/aclnn_util.h" #include "atb/utils/log.h" -#include +#include "atbops/params/params.h" namespace { static const uint32_t IN_TENSOR_NUM = 24; -- Gitee From dc1214869ecd028fc4f99e7a38d276e6ab012cf2 Mon Sep 17 00:00:00 2001 From: huangxiaolan Date: Tue, 23 Sep 2025 16:42:55 +0800 Subject: [PATCH 72/94] fix bool expression --- .../gmm_deq_swiglu_quant_gmm_deq_operation.cpp | 4 ++-- .../mm_deq_swiglu_quant_mm_deq_operation.cpp | 4 ++-- .../gmm_deq_swiglu_quant_gmm_deq_operation.cpp | 4 ++-- .../mm_deq_swiglu_quant_mm_deq_operation.cpp | 4 ++-- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/src/kernels/mixkernels/gmm_deq_swiglu_quant_gmm_deq/gmm_deq_swiglu_quant_gmm_deq_operation.cpp b/src/kernels/mixkernels/gmm_deq_swiglu_quant_gmm_deq/gmm_deq_swiglu_quant_gmm_deq_operation.cpp index c03469b1..1337ebf5 100644 --- a/src/kernels/mixkernels/gmm_deq_swiglu_quant_gmm_deq/gmm_deq_swiglu_quant_gmm_deq_operation.cpp +++ b/src/kernels/mixkernels/gmm_deq_swiglu_quant_gmm_deq/gmm_deq_swiglu_quant_gmm_deq_operation.cpp @@ -102,9 +102,9 @@ private: "Param 
groupListType only support GROUP_LIST_CUM_SUM (0).", return false); MKI_CHECK(param.weightUpPermuteType != OpParam::GmmDeqSwigluQuantGmmDeq::PERMUTE_INVALID, "Param weightUpPermuteType has invalid value.", return false); - MKI_CHECK(param.transposeWeightUp == false, + MKI_CHECK(!param.transposeWeightUp, "Param transposeWeightUp only support false.", return false); - MKI_CHECK(param.transposeWeightDown == true, + MKI_CHECK(param.transposeWeightDown, "Param transposeWeightDown only support true.", return false); return true; } diff --git a/src/kernels/mixkernels/mm_deq_swiglu_quant_mm_deq/mm_deq_swiglu_quant_mm_deq_operation.cpp b/src/kernels/mixkernels/mm_deq_swiglu_quant_mm_deq/mm_deq_swiglu_quant_mm_deq_operation.cpp index 2a402181..135beb06 100644 --- a/src/kernels/mixkernels/mm_deq_swiglu_quant_mm_deq/mm_deq_swiglu_quant_mm_deq_operation.cpp +++ b/src/kernels/mixkernels/mm_deq_swiglu_quant_mm_deq/mm_deq_swiglu_quant_mm_deq_operation.cpp @@ -94,9 +94,9 @@ private: "Param outputType only support OUTPUT_FLOAT16 (0).", return false); MKI_CHECK(param.weightUpPermuteType != OpParam::MmDeqSwigluQuantMmDeq::PERMUTE_INVALID, "Param weightUpPermuteType has invalid value.", return false); - MKI_CHECK(param.transposeWeightUp == false, + MKI_CHECK(!param.transposeWeightUp, "Param transposeWeightUp only support false.", return false); - MKI_CHECK(param.transposeWeightDown == true, + MKI_CHECK(param.transposeWeightDown, "Param transposeWeightDown only support true.", return false); return true; } diff --git a/src/ops_infer/gmm_deq_swiglu_quant_gmm_deq/gmm_deq_swiglu_quant_gmm_deq_operation.cpp b/src/ops_infer/gmm_deq_swiglu_quant_gmm_deq/gmm_deq_swiglu_quant_gmm_deq_operation.cpp index e63c92e9..de8be698 100644 --- a/src/ops_infer/gmm_deq_swiglu_quant_gmm_deq/gmm_deq_swiglu_quant_gmm_deq_operation.cpp +++ b/src/ops_infer/gmm_deq_swiglu_quant_gmm_deq/gmm_deq_swiglu_quant_gmm_deq_operation.cpp @@ -102,12 +102,12 @@ bool ParamCheck(const atb::infer::GmmDeqSwigluQuantGmmDeqParam 
&opParam) return false; } - if (opParam.transposeWeightUp != false) { + if (opParam.transposeWeightUp) { ATB_LOG(ERROR) << "Param transposeWeightUp only support false."; return false; } - if (opParam.transposeWeightDown != true) { + if (!opParam.transposeWeightDown) { ATB_LOG(ERROR) << "Param transposeWeightDown only support true."; return false; } diff --git a/src/ops_infer/mm_deq_swiglu_quant_mm_deq/mm_deq_swiglu_quant_mm_deq_operation.cpp b/src/ops_infer/mm_deq_swiglu_quant_mm_deq/mm_deq_swiglu_quant_mm_deq_operation.cpp index d96602ba..64cf34fb 100644 --- a/src/ops_infer/mm_deq_swiglu_quant_mm_deq/mm_deq_swiglu_quant_mm_deq_operation.cpp +++ b/src/ops_infer/mm_deq_swiglu_quant_mm_deq/mm_deq_swiglu_quant_mm_deq_operation.cpp @@ -87,12 +87,12 @@ bool ParamCheck(const atb::infer::MmDeqSwigluQuantMmDeqParam &opParam) return false; } - if (opParam.transposeWeightUp != false) { + if (opParam.transposeWeightUp) { ATB_LOG(ERROR) << "Param transposeWeightUp only support false."; return false; } - if (opParam.transposeWeightDown != true) { + if (!opParam.transposeWeightDown) { ATB_LOG(ERROR) << "Param transposeWeightDown only support true."; return false; } -- Gitee From 9dc70dec45ace7186f54303064fe0e76a7b6e61e Mon Sep 17 00:00:00 2001 From: huangxiaolan Date: Tue, 23 Sep 2025 16:46:32 +0800 Subject: [PATCH 73/94] fix variable init --- example/multiStream/multiStream_multiGraph_demo.cpp | 3 +-- example/multiStream/multiStream_singleGraph_demo.cpp | 3 +-- src/ops_infer/ring_mla/ring_mla_operation.cpp | 3 +-- 3 files changed, 3 insertions(+), 6 deletions(-) diff --git a/example/multiStream/multiStream_multiGraph_demo.cpp b/example/multiStream/multiStream_multiGraph_demo.cpp index ff6491a0..17a0e039 100644 --- a/example/multiStream/multiStream_multiGraph_demo.cpp +++ b/example/multiStream/multiStream_multiGraph_demo.cpp @@ -237,8 +237,7 @@ int main() packRW.outTensors.resize(outTensorNum); operationWR->InferShape(intensorDescs, outtensorDescs); - aclError ret; - ret = 
CreateInTensors(packWR.inTensors, intensorDescs); + aclError ret = CreateInTensors(packWR.inTensors, intensorDescs); if (ret != 0) { exit(ret); } diff --git a/example/multiStream/multiStream_singleGraph_demo.cpp b/example/multiStream/multiStream_singleGraph_demo.cpp index d95873fd..e1eb53af 100644 --- a/example/multiStream/multiStream_singleGraph_demo.cpp +++ b/example/multiStream/multiStream_singleGraph_demo.cpp @@ -264,8 +264,7 @@ int main() outtensorDescs.resize(outTensorNum); pack.outTensors.resize(outTensorNum); operation->InferShape(intensorDescs, outtensorDescs); - aclError ret; - ret = CreateOutTensors(pack.outTensors, outtensorDescs); + aclError ret = CreateOutTensors(pack.outTensors, outtensorDescs); if (ret != 0) { exit(ret); } diff --git a/src/ops_infer/ring_mla/ring_mla_operation.cpp b/src/ops_infer/ring_mla/ring_mla_operation.cpp index 8106fdd3..0376eabb 100644 --- a/src/ops_infer/ring_mla/ring_mla_operation.cpp +++ b/src/ops_infer/ring_mla/ring_mla_operation.cpp @@ -394,8 +394,7 @@ bool RingMLAOperation::InputLseDimCheck(const SVector &inTensorDescs Status RingMLAOperation::InferShapeCheckImpl(const SVector &inTensorDescs) const { - Status st; - st = DimCheck(inTensorDescs); + Status st = DimCheck(inTensorDescs); if (st != NO_ERROR) { return st; } -- Gitee From 61968c9d25d1d79c774982a50e0d5b04d4e9664f Mon Sep 17 00:00:00 2001 From: huangxiaolan Date: Tue, 23 Sep 2025 16:55:10 +0800 Subject: [PATCH 74/94] add brackets --- .../multi_latent_attention/multi_latent_attention_operation.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ops_infer/multi_latent_attention/multi_latent_attention_operation.cpp b/src/ops_infer/multi_latent_attention/multi_latent_attention_operation.cpp index 8bdfceaf..4cfbc6cd 100644 --- a/src/ops_infer/multi_latent_attention/multi_latent_attention_operation.cpp +++ b/src/ops_infer/multi_latent_attention/multi_latent_attention_operation.cpp @@ -461,7 +461,7 @@ Status 
MultiLatentAttentionOperation::DimCheckInt8Nz(const SVector & return ERROR_INVALID_TENSOR_DIM; } if (inTensorDesc.at(idx + 1).shape.dims[0] != param_.headNum) { - ATB_LOG(ERROR) << GetLogPrefix() << "dim 0 of of pvDescale(intensor" << idx + 1 + ATB_LOG(ERROR) << GetLogPrefix() << "dim 0 of of pvDescale(intensor" << (idx + 1) << ") should be equal to dim0 of headNum"; return ERROR_INVALID_TENSOR_DIM; } -- Gitee From 8c535d08d8f36b0b32c2f3f37e78cbec3bd08072 Mon Sep 17 00:00:00 2001 From: zouyanlong Date: Tue, 23 Sep 2025 17:04:29 +0800 Subject: [PATCH 75/94] fix --- .../linear_parallel_aclnn_runner.cpp | 18 +- .../linear_parallel_aclnn_runner.h | 1 - .../linear_parallel_operation.cpp | 31 ++- .../linear_parallel_generation.cpp | 261 ++++++++++++++++++ ...near_parallel_mc2_linear_reduce_scatter.py | 112 ++++++++ .../mc2_linear_reduce_scatter/run_test.sh | 94 +++++++ tests/apitest/opstest/cpp/precision_calcu.py | 147 ++++++++++ tests/apitest/opstest/csv/linear_parallel.csv | 9 + 8 files changed, 651 insertions(+), 22 deletions(-) create mode 100644 tests/apitest/opstest/cpp/linear_parallel/mc2_linear_reduce_scatter/linear_parallel_generation.cpp create mode 100644 tests/apitest/opstest/cpp/linear_parallel/mc2_linear_reduce_scatter/linear_parallel_mc2_linear_reduce_scatter.py create mode 100644 tests/apitest/opstest/cpp/linear_parallel/mc2_linear_reduce_scatter/run_test.sh create mode 100644 tests/apitest/opstest/cpp/precision_calcu.py diff --git a/src/ops_infer/linear_parallel/linear_parallel_aclnn_runner.cpp b/src/ops_infer/linear_parallel/linear_parallel_aclnn_runner.cpp index 7e151f31..d8f4abb1 100644 --- a/src/ops_infer/linear_parallel/linear_parallel_aclnn_runner.cpp +++ b/src/ops_infer/linear_parallel/linear_parallel_aclnn_runner.cpp @@ -19,10 +19,6 @@ namespace atb { static const uint32_t LINEAR_REDUCE_SCATTER_IN_TENSOR_NUM = 6; static const uint32_t LINEAR_REDUCE_SCATTER_OUT_TENSOR_NUM = 2; - -static const uint32_t BIAS_TENSOR_INDEX = 2; - - aclnnStatus 
(*LinearParallelAclnnRunner::aclnnMatmulReduceScatterV2GetWorkspaceSizeFunc_)( const aclTensor *, const aclTensor *, const aclTensor *, const aclTensor *, const aclTensor *, const aclTensor *, int64_t, const char *, const char *, int64_t, int64_t, int64_t, const char *, const aclTensor *, const aclTensor *, @@ -60,15 +56,21 @@ Status LinearParallelAclnnRunner::BuildAclnnVariantPack(const RunnerVariantPack this->aclnnVariantPack_.aclInTensors.resize(LINEAR_REDUCE_SCATTER_IN_TENSOR_NUM); for (size_t i = 0; i < this->aclnnVariantPack_.aclInTensors.size(); ++i) { std::shared_ptr aclnnTensorPtr = std::make_shared(); - if (i >= 3 || (!param_.hasResidual && i == BIAS_TENSOR_INDEX)) { + if (i > 1) { this->aclnnVariantPack_.aclInTensors[i] = aclnnTensorPtr; continue; } atb::Tensor atbTensor = runnerVariantPack.inTensors.at(i); aclnnTensorPtr->atbTensor = atbTensor; - aclnnTensorPtr->strides = (i == 1 && param_.transWeight) ? GetTransposeTensorStride(atbTensor.desc.shape) : - GetCopyTensorStride(atbTensor.desc.shape); - ret = CallAclCreateTensor(atbTensor.desc.shape, atbTensor.desc.shape, atbTensor, aclnnTensorPtr); + atb::Dims viewDims = atbTensor.desc.shape; + if (i == 1 && param_.transWeight) { + aclnnTensorPtr->strides = GetTransposeTensorStride(viewDims); + viewDims.dims[0] = atbTensor.desc.shape.dims[1]; + viewDims.dims[1] = atbTensor.desc.shape.dims[0]; + } else { + aclnnTensorPtr->strides = GetCopyTensorStride(viewDims); + } + ret = CallAclCreateTensor(viewDims, atbTensor.desc.shape, atbTensor, aclnnTensorPtr); if (ret != NO_ERROR) { ATB_LOG(ERROR) << GetLogPrefix() << "create aclTensor by aclCreateTensor failed!"; return ret; diff --git a/src/ops_infer/linear_parallel/linear_parallel_aclnn_runner.h b/src/ops_infer/linear_parallel/linear_parallel_aclnn_runner.h index b8f55e79..3a8cfb74 100644 --- a/src/ops_infer/linear_parallel/linear_parallel_aclnn_runner.h +++ b/src/ops_infer/linear_parallel/linear_parallel_aclnn_runner.h @@ -29,7 +29,6 @@ protected: aclnnStatus 
SetAclNNWorkspaceExecutor() override; Status LaunchAclnnKernel() override; - private: HcclRunner hcclRunner_; infer::LinearParallelParam param_; diff --git a/src/ops_infer/linear_parallel/linear_parallel_operation.cpp b/src/ops_infer/linear_parallel/linear_parallel_operation.cpp index 9ce37e92..21655c11 100644 --- a/src/ops_infer/linear_parallel/linear_parallel_operation.cpp +++ b/src/ops_infer/linear_parallel/linear_parallel_operation.cpp @@ -104,8 +104,8 @@ bool CheckType(const infer::LinearParallelParam &opParam, Status &isOK) bool CheckTypeMc2(const infer::LinearParallelParam &opParam, Status &isOK) { - if (opParam.transWeight) { - ATB_LOG(ERROR) << "When LinearParallel backend is mc2, not support transWeight"; + if (opParam.hasResidual) { + ATB_LOG(ERROR) << "When LinearParallel backend is mc2, not support residual"; isOK = ERROR_INVALID_PARAM; return true; } @@ -114,8 +114,10 @@ bool CheckTypeMc2(const infer::LinearParallelParam &opParam, Status &isOK) isOK = ERROR_INVALID_PARAM; return true; } - if (opParam.quantType != atb::infer::LinearParallelParam::QuantType::QUANT_TYPE_UNQUANT) { - ATB_LOG(ERROR) << "When LinearParallel backend is mc2, only support quantType[QUANT_TYPE_UNQUANT]"; + if (opParam.quantType != atb::infer::LinearParallelParam::QuantType::QUANT_TYPE_UNDEFINED || + opParam.quantType != atb::infer::LinearParallelParam::QuantType::QUANT_TYPE_UNQUANT) { + ATB_LOG(ERROR) + << "When LinearParallel backend is mc2, only support quantType[QUANT_TYPE_UNDEFINED][QUANT_TYPE_UNQUANT]"; isOK = ERROR_INVALID_PARAM; return true; } @@ -126,11 +128,6 @@ bool CheckTypeMc2(const infer::LinearParallelParam &opParam, Status &isOK) isOK = ERROR_INVALID_PARAM; return true; } - if (opParam.quantType == atb::infer::LinearParallelParam::QuantType::QUANT_TYPE_PER_TOKEN) { - ATB_LOG(ERROR) << "When LinearParallel backend is mc2, not support quantType[QUANT_TYPE_PER_TOKEN]"; - isOK = ERROR_INVALID_PARAM; - return true; - } return false; } @@ -416,10 +413,18 @@ Status 
LinearParallelOperation::InferShapeCheckLinearReduceScatter(const SVector return ERROR_INVALID_TENSOR_DIM; } if (param_.backend == "mc2") { + if (inTensorDescs.at(0).shape.dimNum != IN_TENSOR_DIM_NUM) { + ATB_LOG(ERROR) << GetLogPrefix() << "inTensor0 dimNum should be equal to 2"; + return ERROR_INVALID_TENSOR_DIM_NUM; + } + if (inTensorDescs.at(1).shape.dimNum != IN_TENSOR_DIM_NUM) { + ATB_LOG(ERROR) << GetLogPrefix() << "inTensor1 dimNum should be equal to 2"; + return ERROR_INVALID_TENSOR_DIM_NUM; + } int64_t xTensorK = OperationUtil::GetXTensorK(inTensorDescs.at(0)); - if (xTensorK < 256 || xTensorK > 65535) { + if (xTensorK < 256 || xTensorK >= 65535) { ATB_LOG(ERROR) << GetLogPrefix() << "inTensor0 k [" << xTensorK - << "] should be an integer between [256 ~ 65535]"; + << "] should be an integer between [256 ~ 65535)"; return ERROR_INVALID_TENSOR_DIM; } } @@ -446,8 +451,8 @@ Status LinearParallelOperation::InferShapeCheckAllGatherLinear(const SVector &inTensorDescs) const +Status LinearParallelOperation::InferShapeCheckAllGatherLinearReduceScatter( + const SVector &inTensorDescs) const { if (param_.twoDimTPInfo.rsDim * param_.twoDimTPInfo.agDim != param_.rankSize) { ATB_LOG(ERROR) << "agDim * rsDim should equal to rankSize"; diff --git a/tests/apitest/opstest/cpp/linear_parallel/mc2_linear_reduce_scatter/linear_parallel_generation.cpp b/tests/apitest/opstest/cpp/linear_parallel/mc2_linear_reduce_scatter/linear_parallel_generation.cpp new file mode 100644 index 00000000..cf744275 --- /dev/null +++ b/tests/apitest/opstest/cpp/linear_parallel/mc2_linear_reduce_scatter/linear_parallel_generation.cpp @@ -0,0 +1,261 @@ +/* + * Copyright (c) 2025 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. 
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include "acl/acl.h" +#include "atb/types.h" +#include "atb/atb_infer.h" +#include "atb/operation.h" + + +#define CHECK_STATUS(status) \ + do { \ + if ((status) != 0) { \ + std::cout << __FILE__ << ":" << __LINE__ << " [error]: " << (status) << std::endl; \ + return status; \ + } \ + } while (0) + +const int32_t DEV_NUM = 2; + +const int32_t M = 2; +const int32_t K = 256; +const int32_t N = 2; + +const aclDataType DATA_TYPE = aclDataType::ACL_FLOAT16; + +typedef uint16_t float16; +typedef uint16_t bfloat16; + +float16 FloatToFloat16(float fp32) +{ + if (fp32 == 0.0f) { + return (std::signbit(fp32) ? 0x8000 : 0x0000); + } + + uint32_t float_bits; + static_assert(sizeof(float) == sizeof(uint32_t), "Float size mismatch"); + std::memcpy(&float_bits, &fp32, sizeof(float)); + + const uint32_t sign = (float_bits >> 31) & 0x1; + const uint32_t exp = (float_bits >> 23) & 0xFF; + const uint32_t mant = float_bits & 0x7FFFFF; + if (exp == 0xFF) { + if (mant == 0) { + return (sign << 15) | 0x7C00; + } else { + return (sign << 15) | 0x7C00 | (mant >> 13); + } + } + + int32_t exp_fp16 = static_cast(exp) - 127 + 15; + if (exp_fp16 <= 0) { + return (sign << 15); + } + + if (exp_fp16 >= 0x1F) { + return (sign < 15) | 0x7C00; + } + + uint32_t mant24 = (1 << 23) | mant; + uint32_t round_bits = mant24 & 0x1FFF; + uint32_t base = (mant24 >> 13) & 0x3FF; + + if (round_bits > 0x1000 || (round_bits == 0x1000 && (base & 1))) { + base++; + if (base > 0xFF) { + base = 0; + exp_fp16++; + if (exp_fp16 >= 0x1F) { + return (sign << 15) | 0x7C00; + } + } + } + + return (sign << 15) | (exp_fp16 << 10) | base; +} + 
+bfloat16 FloatToBfloat16(float fp32) +{ + if (fp32 == 0.0f) { + return (std::signbit(fp32) ? 0x8000 : 0x0000); + } + + uint32_t float_bits; + static_assert(sizeof(float) == sizeof(uint32_t), "Float size mismatch"); + std::memcpy(&float_bits, &fp32, sizeof(float)); + + bfloat16 bfloat16_bits = static_cast(float_bits >> 16); + + const uint32_t exp = (float_bits >> 23) & 0xFF; + const uint32_t mant = float_bits & 0x7FFFFF; + if (exp == 0xFF && mant != 0) { + bfloat16_bits |= 0x01; + } + + return bfloat16_bits; +} + +size_t GetDataItemSize(aclDataType dtype) +{ + switch (dtype) { + case ACL_DT_UNDEFINED: + return sizeof(bool); + case ACL_FLOAT16: + return sizeof(uint16_t); + case ACL_BF16: + return sizeof(uint16_t); + default: + return 0; + } +} + +static std::mt19937 gen(0); + +template T random_float(float min, float max) +{ + std::uniform_real_distribution dist(min, max); + return dist(gen); +} + +atb::Tensor FillTensorDataRandomly(const atb::TensorDesc &desc, float range_min, float range_max) +{ + atb::Tensor tensor{desc, nullptr, nullptr, 0}; + tensor.dataSize = atb::Utils::GetTensorSize(desc); + aclrtMallocHost((void **)&tensor.hostData, tensor.dataSize); + { + size_t dataItemSize = GetDataItemSize(desc.dtype); + uint64_t tensorNumel = atb::Utils::GetTensorNumel(desc); + void *basePtr = static_cast(tensor.hostData); + for (uint64_t i = 0; i < tensorNumel; ++i) { + void *elementPtr = static_cast(basePtr) + i * dataItemSize; + switch (desc.dtype) { + case ACL_FLOAT16: + *static_cast(elementPtr) = FloatToFloat16(random_float(range_min, range_max)); + break; + case ACL_BF16: + *static_cast(elementPtr) = FloatToBfloat16(random_float(range_min, range_max)); + break; + default: + break; + } + } + } + aclrtMalloc((void **)&tensor.deviceData, tensor.dataSize, ACL_MEM_MALLOC_HUGE_FIRST); + aclrtMemcpy(tensor.deviceData, tensor.dataSize, tensor.hostData, tensor.dataSize, ACL_MEMCPY_HOST_TO_DEVICE); + + return tensor; +} + +atb::Status saveTensor(atb::Tensor tensor, 
std::string path) +{ + if (tensor.deviceData == nullptr) {} + void *hostData = nullptr; + aclrtMallocHost((void **)&hostData, tensor.dataSize); + aclrtMemcpy(hostData, tensor.dataSize, tensor.deviceData, tensor.dataSize, ACL_MEMCPY_DEVICE_TO_HOST); + std::ofstream file(path, std::ios::binary); + file.write(static_cast(hostData), tensor.dataSize); + file.close(); + aclrtFreeHost(hostData); + return atb::ErrorType::NO_ERROR; +} + +atb::Status ExcuteImpl(atb::Operation *op, atb::VariantPack variantPack, atb::Context *context, aclrtStream &stream) +{ + uint64_t workspaceSize = 0; + CHECK_STATUS(op->Setup(variantPack, workspaceSize, context)); + void *workspace = nullptr; + if (workspaceSize > 0) { + CHECK_STATUS(aclrtMalloc(&workspace, workspaceSize, ACL_MEM_MALLOC_HUGE_FIRST)); + } + CHECK_STATUS(op->Execute(variantPack, (uint8_t *)workspace, workspaceSize, context)); + CHECK_STATUS(aclrtSynchronizeStream(stream)); // 流同步,等待device侧任务计算完成 + + if (workspace) { + CHECK_STATUS(aclrtFree(workspace)); // 销毁workspace + } + return atb::ErrorType::NO_ERROR; +} + +atb::Status LinearParallelOneThread(int rank, int rankSize) +{ + int deviceId = rank; + CHECK_STATUS(aclrtSetDevice(deviceId)); + atb::Context *context = nullptr; + CHECK_STATUS(atb::CreateContext(&context)); + aclrtStream stream = nullptr; + CHECK_STATUS(aclrtCreateStream(&stream)); + context->SetExecuteStream(stream); + + atb::TensorDesc inputTensorDesc{ + .dtype = DATA_TYPE, .format = aclFormat::ACL_FORMAT_ND, .shape{.dims = {M, K}, .dimNum = 2}}; + atb::Tensor input = FillTensorDataRandomly(inputTensorDesc, -10, 10); + + atb::TensorDesc weightTensorDesc{ + .dtype = DATA_TYPE, .format = aclFormat::ACL_FORMAT_ND, .shape{.dims = {K, N}, .dimNum = 2}}; + atb::Tensor weight = FillTensorDataRandomly(weightTensorDesc, -10, 10); + + atb::Tensor output; + output.desc.dtype = DATA_TYPE; + output.desc.format = ACL_FORMAT_ND; + output.desc.shape.dimNum = 2; + output.desc.shape.dims[0] = M / DEV_NUM; + 
output.desc.shape.dims[1] = N; + output.dataSize = atb::Utils::GetTensorSize(output); + CHECK_STATUS(aclrtMalloc(&output.deviceData, output.dataSize, ACL_MEM_MALLOC_HUGE_FIRST)); + + atb::infer::LinearParallelParam param; + param.transWeight = false; + param.rank = rank; + param.rankRoot = 0; + param.commMode = atb::infer::CommMode::COMM_MULTI_THREAD; + param.rankSize = rankSize; + param.backend = "mc2"; + param.type = atb::infer::LinearParallelParam::ParallelType::LINEAR_REDUCE_SCATTER; + atb::Operation *op = nullptr; + CHECK_STATUS(atb::CreateOperation(param, &op)); + + atb::VariantPack variantPack; + variantPack.inTensors = {input, weight}; + variantPack.outTensors = {output}; + ExcuteImpl(op, variantPack, context, stream); + std::cout << "rank: " << rank << " executed END." << std::endl; + saveTensor(input, "rank" + std::to_string(rank) + "_inTensor0.bin"); + saveTensor(weight, "rank" + std::to_string(rank) + "_inTensor1.bin"); + saveTensor(output, "rank" + std::to_string(rank) + "_outTensor0.bin"); + // 资源释放 + CHECK_STATUS(atb::DestroyOperation(op)); // 销毁op对象 + CHECK_STATUS(aclrtDestroyStream(stream)); // 销毁stream + CHECK_STATUS(atb::DestroyContext(context)); // 销毁context + return atb::ErrorType::NO_ERROR; +} + +int main(int argc, const char *argv[]) +{ + int ret = aclInit(nullptr); + + std::vector> threads(DEV_NUM); + for (size_t i = 0; i < DEV_NUM; i++) { + threads[i].reset(new (std::nothrow) std::thread(LinearParallelOneThread, i, DEV_NUM)); + } + for (size_t i = 0; i < DEV_NUM; ++i) { + threads[i]->join(); + } + + CHECK_STATUS(aclFinalize()); + return 0; +} diff --git a/tests/apitest/opstest/cpp/linear_parallel/mc2_linear_reduce_scatter/linear_parallel_mc2_linear_reduce_scatter.py b/tests/apitest/opstest/cpp/linear_parallel/mc2_linear_reduce_scatter/linear_parallel_mc2_linear_reduce_scatter.py new file mode 100644 index 00000000..cf0470d9 --- /dev/null +++ 
b/tests/apitest/opstest/cpp/linear_parallel/mc2_linear_reduce_scatter/linear_parallel_mc2_linear_reduce_scatter.py @@ -0,0 +1,112 @@ +# +# Copyright (c) 2025 Huawei Technologies Co., Ltd. +# This file is a part of the CANN Open Software. +# Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. +# + +import builtins +import os +import json +import unittest +import sys +import numpy as np +import torch +import torch_npu +import torch.multiprocessing as mp + +sys.path.append(os.path.join(os.path.dirname(__file__), "../../")) +from precision_calcu import * + +sys.path.append(os.path.join(os.path.dirname(__file__), "../../../python/operations/")) +import operation_test # NOQA: E402 + +sys.path.append(os.path.join(os.path.dirname(__file__), "../../../python/")) + +ATB_HOME_PATH = os.environ.get("ATB_HOME_PATH") +if ATB_HOME_PATH is None: + raise RuntimeError( + "env ATB_HOME_PATH not exist, source set_env.sh") +LIBTORCH_PATH = os.path.join(ATB_HOME_PATH, "lib/libatb_test_framework.so") +LIB_PATH = os.path.join(ATB_HOME_PATH, "lib/libatb.so") +torch.classes.load_library(LIBTORCH_PATH) + +DEV_NUM = 2 + +M = 2 +K = 256 +N = 2 + +DATA_TYPE = torch.float16 + +def load_tensor(data_size,data_type,data_path): + with open(data_path, 'rb') as f: + data=f.read() + if data_type == torch.float16: + np_data = np.frombuffer(data, dtype=np.float16).copy() + tensor = torch.from_numpy(np_data) + elif data_type == torch.bfloat16: + tensor = torch.frombuffer(bytearray(data), dtype=torch.bfloat16) + else: + tensor = torch.zeros(data_size) + + tensor = 
tensor.view(data_size) + + return tensor + + +def main_worker(rank, world_size, data_type, data_size): + torch_npu.npu.set_device(rank) + print(f'Process {rank} started, using device npu:{rank}.') + golden_out_tensor_high = None + golden_out_tensor_low = None + + for i in range(world_size): + input_tensor = load_tensor(data_size=data_size[0],data_type=data_type,data_path=f"rank{i}_inTensor{0}.bin") + weight_tensor = load_tensor(data_size=data_size[1],data_type=data_type,data_path=f"rank{i}_inTensor{1}.bin") + out_single_tensor = torch.matmul(input_tensor.to(torch.float), weight_tensor.to(torch.float)) + if golden_out_tensor_high is None: + golden_out_tensor_high = out_single_tensor.clone() + golden_out_tensor_low = out_single_tensor.clone().to(data_type) + in_tensors_desc = [input_tensor.shape, weight_tensor.shape] + else: + golden_out_tensor_high = torch.add(golden_out_tensor_high,out_single_tensor) + golden_out_tensor_low = torch.add(golden_out_tensor_low,out_single_tensor.to(data_type)) + chunks_size = int(data_size[0][0] // world_size) + chunks_high = torch.split(golden_out_tensor_high, chunks_size) + chunks_low = torch.split(golden_out_tensor_low, chunks_size) + golden_result_high = chunks_high[rank] + golden_result_low = chunks_low[rank] + + acl_out_tensor = load_tensor(data_size=data_size[2],data_type=data_type,data_path=f"rank{rank}_outTensor{0}.bin") + + assert check_precision_new(in_tensors_desc, acl_out_tensor.float(), golden_result_high.float(), golden_result_low.float(), rank) + +def check_precision_new(in_tensors_desc, out_tensor, golden_out_tensor_high, golden_out_tensor_low, rank): + if rank == 0: + print(in_tensors_desc) + print(out_tensor) + result_double = compare_cv(golden_out_tensor_high, golden_out_tensor_low, out_tensor) + return result_double + +class LinearParallelCoverOperationTest(operation_test.OperationTest): + + def test_linear_parallel(self): + if not operation_test.get_soc_version() == 'Ascend910B': + return + print(f"———————— 
LinearParallelCoverOp test start ————————") + + world_size = DEV_NUM + + data_type = DATA_TYPE + + data_size = [[M, K], [K, N], [M // DEV_NUM, N]] + + mp.spawn(main_worker, nprocs=world_size, args=(world_size, data_type, data_size)) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/apitest/opstest/cpp/linear_parallel/mc2_linear_reduce_scatter/run_test.sh b/tests/apitest/opstest/cpp/linear_parallel/mc2_linear_reduce_scatter/run_test.sh new file mode 100644 index 00000000..6597d672 --- /dev/null +++ b/tests/apitest/opstest/cpp/linear_parallel/mc2_linear_reduce_scatter/run_test.sh @@ -0,0 +1,94 @@ +#!/bin/bash +# +# Copyright (c) 2025 Huawei Technologies Co., Ltd. +# This file is a part of the CANN Open Software. +# Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. 
+# + +handle_error(){ + rm -rf linear_parallel_generation + rm -rf *.bin + + cd $current_dir +} + +trap handle_error ERR + +set -e + +current_dir=$(pwd) + +cd "$(dirname "$0")" + +cxx_abi=$(python3 -c ' +try: + import torch + print("1" if torch.compiled_with_cxx11_abi() else "0") +except ImportError: + print("1") +') + +echo "Using cxx_abi=$cxx_abi" + +DEV_NUM=2 + +M=2 +K=256 +N=2 + +DATA_TYPE="FLOAT16" + +DATA_TYPE_CPP="" +DATA_TYPE_PY="" + +case ${DATA_TYPE} in + FLOAT16) + DATA_TYPE_CPP="aclDataType::ACL_FLOAT16" + DATA_TYPE_PY="torch.float16" + ;; + BF16) + DATA_TYPE_CPP="aclDataType::ACL_BF16" + DATA_TYPE_PY="torch.bfloat16" + ;; + *) + DATA_TYPE_CPP="" + DATA_TYPE_PY="" + ;; +esac + +# 修改 DEV_NUM 的值 +sed -i "s/const int32_t DEV_NUM = .*;/const int32_t DEV_NUM = ${DEV_NUM};/" linear_parallel_generation.cpp +# 修改 M 的值 +sed -i "s/const int32_t M = .*;/const int32_t M = ${M};/" linear_parallel_generation.cpp +# 修改 K 的值 +sed -i "s/const int32_t K = .*;/const int32_t K = ${K};/" linear_parallel_generation.cpp +# 修改 N 的值 +sed -i "s/const int32_t N = .*;/const int32_t N = ${N};/" linear_parallel_generation.cpp +# 修改 DATA_TYPE 的值 +sed -i "s/const aclDataType DATA_TYPE = .*;/const aclDataType DATA_TYPE = ${DATA_TYPE_CPP};/" linear_parallel_generation.cpp + +# 修改 DEV_NUM 的值 +sed -i "s/DEV_NUM = .*/DEV_NUM = ${DEV_NUM}/" linear_parallel_mc2_linear_reduce_scatter.py +# 修改 M 的值 +sed -i "s/M = .*/M = ${M}/" linear_parallel_mc2_linear_reduce_scatter.py +# 修改 K 的值 +sed -i "s/K = .*/K = ${K}/" linear_parallel_mc2_linear_reduce_scatter.py +# 修改 N 的值 +sed -i "s/N = .*/N = ${N}/" linear_parallel_mc2_linear_reduce_scatter.py +# 修改 DATA_TYPE 的值 +sed -i "s/DATA_TYPE = .*/DATA_TYPE = ${DATA_TYPE_PY}/" linear_parallel_mc2_linear_reduce_scatter.py + +g++ -D_GLIBCXX_USE_CXX11_ABI=$cxx_abi -I "${ATB_HOME_PATH}/include" -I "${ASCEND_HOME_PATH}/include" -L "${ATB_HOME_PATH}/lib" -L "${ASCEND_HOME_PATH}/lib64" \ +linear_parallel_generation.cpp -l atb -l ascendcl -l hccl -l nnopbase -l opapi 
-o linear_parallel_generation +./linear_parallel_generation + +python linear_parallel_mc2_linear_reduce_scatter.py + +rm -rf linear_parallel_generation +rm -rf *.bin + +cd $current_dir \ No newline at end of file diff --git a/tests/apitest/opstest/cpp/precision_calcu.py b/tests/apitest/opstest/cpp/precision_calcu.py new file mode 100644 index 00000000..7d67ad58 --- /dev/null +++ b/tests/apitest/opstest/cpp/precision_calcu.py @@ -0,0 +1,147 @@ +# +# Copyright (c) 2025 Huawei Technologies Co., Ltd. +# This file is a part of the CANN Open Software. +# Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. 
+# + +import os +import unittest +import logging +import json +import re +import numpy +import torch +import torch_npu +import math +import sys +import shutil +from enum import Enum + +MIN_ERR = 1e-7 +class OpTypes(Enum): + NA = 0 # new standard is not available + MOVE = 1 + RAND = 2 + CAST = 3 + COMPUTE_INTEGER = 4 + COMPUTE_QUANT = 5 + COMPUTE_FLOAT = 6 + COMPUTE_FLOAT_HIGH_PRECISION = 7 + VECTOR_FUSION = 8 + CV_FUSION = 9 + +dtype_dict = {"float": torch.float32, "float16": torch.float16, "int8": torch.int8, "int32": torch.int32, "uint8": torch.uint8, + "int16": torch.int16, "uint16": torch.int16, "uint32": torch.int32, "int64": torch.int64, "uint64": torch.int64, + "double": torch.double, "bool": torch.bool, "complex64": torch.complex64, "complex128": torch.complex128, "bf16": torch.bfloat16} + +def get_eb_threshold(dtype:torch.dtype): + eb_threshold = 0 + if dtype in [torch.bfloat16]: + eb_threshold = 2**(-7) + if dtype in [torch.float16]: + eb_threshold = 2**(-10) + if dtype in [torch.float32]: + eb_threshold = 2**(-14) + return eb_threshold + +def get_err_threshold(op_type:OpTypes, dtype:torch.dtype): + err_threshold = 0 + if op_type in [OpTypes.MOVE, OpTypes.RAND, OpTypes.CAST, OpTypes.COMPUTE_INTEGER]: + pass + if op_type in [OpTypes.COMPUTE_QUANT, OpTypes.COMPUTE_FLOAT]: + if dtype in [torch.bfloat16]: + err_threshold = 2**(-7) + if dtype in [torch.float16]: + err_threshold = 2**(-8) + if dtype in [torch.float32]: + err_threshold = 2**(-11) + if op_type in [OpTypes.CV_FUSION]: + if dtype in [torch.bfloat16]: + err_threshold = 2**(-8) + if dtype in [torch.float16]: + err_threshold = 2**(-11) + if dtype in [torch.float32]: + err_threshold = 2**(-14) + return err_threshold + + +#误差均衡性(EB) +def get_eb(golden:torch.Tensor, actual:torch.Tensor): + golden = golden.to(torch.float32) + golden_nmax = torch.clamp(torch.abs(golden), min = 1) + actual_error = actual.to(torch.float32) - golden + EB = torch.mean(actual_error / golden_nmax) + return EB + 
+#单标杆、浮点比对方法|actual - expected| <= err × max(1, | expected |) +def ref_compare(golden:torch.Tensor, actual:torch.Tensor, err): + golden = golden.to(torch.float32) + golden_nmax = torch.clamp(torch.abs(golden), min = 1) + abs_error = torch.abs(actual.to(torch.float32) - golden) + result = (abs_error <= err * golden_nmax).all() + logging.info(f"new golden result:{result}") + return result + + +#最大相对误差:max relative error,MARE +def get_mare(golden:torch.Tensor, actual:torch.Tensor): + golden = golden.to(torch.float32) + abs_error = torch.abs(actual.to(torch.float32) - golden) / (torch.abs(golden) + MIN_ERR) + mare = torch.max(abs_error.flatten()) + return mare + +#平均相对误差:mean relative error,MERE +def get_mere(golden:torch.Tensor, actual:torch.Tensor): + golden = golden.to(torch.float32) + abs_error = torch.abs(actual.to(torch.float32) - golden) / (torch.abs(golden) + MIN_ERR) + mere = torch.mean(abs_error) + return mere + +#均方根误差:Root Mean Squared Error,RMSE +def get_rmse(golden:torch.Tensor, actual:torch.Tensor): + golden = golden.to(torch.float32) + sqr_err = torch.pow((actual.to(torch.float32) - golden), 2) + rmse = torch.sqrt(torch.mean(sqr_err)) + return rmse + +def compare_cv(golden:torch.Tensor, gpu:torch.Tensor, actual:torch.Tensor): + op_type = OpTypes.CV_FUSION + judge_threshold = 522 + eb_threshold = get_eb_threshold(actual.dtype) + err_threshold = get_err_threshold(op_type, actual.dtype) + logging.info(f"err_threshold:{err_threshold} eb_threshold:{eb_threshold}") + mare_npu = get_mare(golden, actual) + mare_gpu = get_mare(golden, gpu) + + mere_npu = get_mere(golden, actual) + mere_gpu = get_mere(golden, gpu) + + rmse_npu = get_rmse(golden, actual) + rmse_gpu = get_rmse(golden, gpu) + + mare_rate = mare_npu / max(mare_gpu, err_threshold) + mere_rate = mere_npu / max(mere_gpu, err_threshold) + rmse_rate = rmse_npu / max(rmse_gpu, err_threshold) + + EB = get_eb(gpu, actual) + result = (mare_rate < 10) and (mere_rate < 2) and (rmse_rate < 2) and (EB < 
eb_threshold) + + print(f"eb_threshold:{eb_threshold} err_threshold:{err_threshold}") + print(f"mere_npu:{mere_npu} mere_gpu:{mere_gpu}") + print(f"rmse_npu:{rmse_npu} rmse_gpu:{rmse_gpu}") + print(f"MARE:{mare_rate} MERE:{mere_rate} RMSE:{rmse_rate} EB:{EB}") + print(f"new golden cv result:{result}") + return result + +if __name__ == '__main__': + logging.basicConfig(level=logging.INFO, + format='%(asctime)s - %(levelname)s - %(message)s') + gloden = torch.rand((128,128), dtype=torch.float32) + actual = gloden.to(torch.float16) + gpu = actual + compare_cv(gloden, gpu, actual) + \ No newline at end of file diff --git a/tests/apitest/opstest/csv/linear_parallel.csv b/tests/apitest/opstest/csv/linear_parallel.csv index ae51e1c4..6e3efc07 100644 --- a/tests/apitest/opstest/csv/linear_parallel.csv +++ b/tests/apitest/opstest/csv/linear_parallel.csv @@ -103,3 +103,12 @@ CaseNum|CaseName|OpName|OpParam|InNum|InDType|InFormat|InShape|OutNum|OutDType|O 102|SErrorGroupMatmulReduceScatterAlltoallvc|LinearParallelOperation|{"transWeight":false,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc","type":6,"quantType":-1,"moeInfo":{"epSize":2,"tpSize":1,"localExpertNums":16}}|7|int8;int8;int32;int64;float;int32;int32|nd;nd;nd;nd;nd;nd;nd|1024,1024;16,1024,1024;1024;1024;1024;2,32;32768|1|float16|nd|32768,1024|random;random;random;random;random;random;random|-5,5;-5,5;-10,10;-10,10;-1,1;-10,10;-10,10||||||Ascend910B|I:ERROR_INVALID_IN_TENSOR_NUM 103|SErrorGroupMatmulReduceScatterAlltoallvc|LinearParallelOperation|{"transWeight":false,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc","type":6,"quantType":-1,"moeInfo":{"epSize":3,"tpSize":1,"localExpertNums":16}}|4|float16;float16;int32;int32|nd;nd;nd;nd|1024,1024;2,32;32768|1|float16|nd|32768,1024|random;random;random;random|-1,1;-1,1;-10,10;-10,10||||||Ascend910B|C:ERROR_INVALID_PARAM 
104|PureMatmulW8A8Fp16_3_float|LinearParallelOperation|{"transWeight":false,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc","type":3,"quantType":0,"outDataType":1}|4|int8;int8;int32;float|nd;nd;nd;nd|2,4;4,4;1;1|1|float16|nd|2,4|random;random;random;random|-5,5;-5,5;-10,10;1,2||||||Ascend910B|I:ERROR_INVALID_TENSOR_INI_MATCH +105|ErrorCase0LinearReduceScatter|LinearParallelOperation|{"transWeight":true,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":true,"backend":"mc2","type":0,"commMode":1}|2|float16;float16|nd;nd|2,16;32,16|1|float16|nd|1,32|customize;customize|-1,1;-1,1||||||Ascend910B|C:ERROR_INVALID_PARAM +106|ErrorCase1LinearReduceScatter|LinearParallelOperation|{"transWeight":true,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"mc2","type":0,"commMode":1}|2|float16;float16|nd;nd|2,16;32,16|1|float16|nd|1,32|customize;customize|-1,1;-1,1||||||Ascend910B|C:ERROR_INVALID_PARAM +107|ErrorCase2LinearReduceScatter|LinearParallelOperation|{"transWeight":true,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"mc2","type":1,"commMode":0}|2|float16;float16|nd;nd|2,16;32,16|1|float16|nd|1,32|customize;customize|-1,1;-1,1||||||Ascend910B|C:ERROR_INVALID_PARAM +108|ErrorCase3LinearReduceScatter|LinearParallelOperation|{"transWeight":true,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"mc2","type":1,"commMode":1,"quantType":0}|2|float16;float16|nd;nd|2,256;32,256|1|float16|nd|1,32|customize;customize|-1,1;-1,1||||||Ascend910B|C:ERROR_INVALID_PARAM +109|ErrorCase4LinearReduceScatter|LinearParallelOperation|{"transWeight":true,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"mc2","type":1,"commMode":1,"quantType":-1}|2|float16;float16|nd;nd|2,255;32,255|1|float16|nd|1,32|customize;customize|-1,1;-1,1||||||Ascend910B|I:ERROR_INVALID_TENSOR_DIM 
+110|ErrorCase5LinearReduceScatter|LinearParallelOperation|{"transWeight":true,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"mc2","type":1,"commMode":1,"quantType":-1}|2|float16;float16|nd;nd|2,65535;32,65535|1|float16|nd|1,32|customize;customize|-1,1;-1,1||||||Ascend910B|I:ERROR_INVALID_TENSOR_DIM +111|ErrorCase6LinearReduceScatter|LinearParallelOperation|{"transWeight":true,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"mc2","type":1,"commMode":1,"quantType":-1}|2|float16;float16|nd;nd|1,2,256;32,256|1|float16|nd|1,32|customize;customize|-1,1;-1,1||||||Ascend910B|I:ERROR_INVALID_TENSOR_DIM_NUM +112|ErrorCase7LinearReduceScatter|LinearParallelOperation|{"transWeight":true,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"mc2","type":1,"commMode":1,"quantType":-1}|2|float16;float16|nd;nd|2,256;1,32,256|1|float16|nd|1,32|customize;customize|-1,1;-1,1||||||Ascend910B|I:ERROR_INVALID_TENSOR_DIM_NUM +112|ErrorCase7LinearReduceScatter|LinearParallelOperation|{"transWeight":true,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"mc2","type":1,"commMode":1,"quantType":-1}|2|float16;float16|nd;fractal_nz|2,256;1,32,256|1|float16|nd|1,32|customize;customize|-1,1;-1,1||||||Ascend910B|I:ERROR_INVALID_TENSOR_INI_MATCH -- Gitee From f08c76f6643e9c832ec0315ff12826549f7f9fb2 Mon Sep 17 00:00:00 2001 From: zouyanlong Date: Tue, 23 Sep 2025 17:45:55 +0800 Subject: [PATCH 76/94] fix --- tests/apitest/opstest/csv/linear_parallel.csv | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/apitest/opstest/csv/linear_parallel.csv b/tests/apitest/opstest/csv/linear_parallel.csv index 6e3efc07..5ac11486 100644 --- a/tests/apitest/opstest/csv/linear_parallel.csv +++ b/tests/apitest/opstest/csv/linear_parallel.csv @@ -111,4 +111,3 @@ CaseNum|CaseName|OpName|OpParam|InNum|InDType|InFormat|InShape|OutNum|OutDType|O 
110|ErrorCase5LinearReduceScatter|LinearParallelOperation|{"transWeight":true,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"mc2","type":1,"commMode":1,"quantType":-1}|2|float16;float16|nd;nd|2,65535;32,65535|1|float16|nd|1,32|customize;customize|-1,1;-1,1||||||Ascend910B|I:ERROR_INVALID_TENSOR_DIM 111|ErrorCase6LinearReduceScatter|LinearParallelOperation|{"transWeight":true,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"mc2","type":1,"commMode":1,"quantType":-1}|2|float16;float16|nd;nd|1,2,256;32,256|1|float16|nd|1,32|customize;customize|-1,1;-1,1||||||Ascend910B|I:ERROR_INVALID_TENSOR_DIM_NUM 112|ErrorCase7LinearReduceScatter|LinearParallelOperation|{"transWeight":true,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"mc2","type":1,"commMode":1,"quantType":-1}|2|float16;float16|nd;nd|2,256;1,32,256|1|float16|nd|1,32|customize;customize|-1,1;-1,1||||||Ascend910B|I:ERROR_INVALID_TENSOR_DIM_NUM -112|ErrorCase7LinearReduceScatter|LinearParallelOperation|{"transWeight":true,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"mc2","type":1,"commMode":1,"quantType":-1}|2|float16;float16|nd;fractal_nz|2,256;1,32,256|1|float16|nd|1,32|customize;customize|-1,1;-1,1||||||Ascend910B|I:ERROR_INVALID_TENSOR_INI_MATCH -- Gitee From 00981e831305e57ebf409f84ec190ba86fa32520 Mon Sep 17 00:00:00 2001 From: zouyanlong Date: Tue, 23 Sep 2025 18:58:41 +0800 Subject: [PATCH 77/94] fix --- .../mc2_linear_reduce_scatter/run_test.sh | 20 +++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/tests/apitest/opstest/cpp/linear_parallel/mc2_linear_reduce_scatter/run_test.sh b/tests/apitest/opstest/cpp/linear_parallel/mc2_linear_reduce_scatter/run_test.sh index 6597d672..e1e5f6f8 100644 --- a/tests/apitest/opstest/cpp/linear_parallel/mc2_linear_reduce_scatter/run_test.sh +++ b/tests/apitest/opstest/cpp/linear_parallel/mc2_linear_reduce_scatter/run_test.sh @@ -61,26 +61,26 @@ case ${DATA_TYPE} in 
esac # 修改 DEV_NUM 的值 -sed -i "s/const int32_t DEV_NUM = .*;/const int32_t DEV_NUM = ${DEV_NUM};/" linear_parallel_generation.cpp +sed -i "s/^const int32_t DEV_NUM = .*;/const int32_t DEV_NUM = ${DEV_NUM};/" linear_parallel_generation.cpp # 修改 M 的值 -sed -i "s/const int32_t M = .*;/const int32_t M = ${M};/" linear_parallel_generation.cpp +sed -i "s/^const int32_t M = .*;/const int32_t M = ${M};/" linear_parallel_generation.cpp # 修改 K 的值 -sed -i "s/const int32_t K = .*;/const int32_t K = ${K};/" linear_parallel_generation.cpp +sed -i "s/^const int32_t K = .*;/const int32_t K = ${K};/" linear_parallel_generation.cpp # 修改 N 的值 -sed -i "s/const int32_t N = .*;/const int32_t N = ${N};/" linear_parallel_generation.cpp +sed -i "s/^const int32_t N = .*;/const int32_t N = ${N};/" linear_parallel_generation.cpp # 修改 DATA_TYPE 的值 -sed -i "s/const aclDataType DATA_TYPE = .*;/const aclDataType DATA_TYPE = ${DATA_TYPE_CPP};/" linear_parallel_generation.cpp +sed -i "s/^const aclDataType DATA_TYPE = .*;/const aclDataType DATA_TYPE = ${DATA_TYPE_CPP};/" linear_parallel_generation.cpp # 修改 DEV_NUM 的值 -sed -i "s/DEV_NUM = .*/DEV_NUM = ${DEV_NUM}/" linear_parallel_mc2_linear_reduce_scatter.py +sed -i "s/^DEV_NUM = .*/DEV_NUM = ${DEV_NUM}/" linear_parallel_mc2_linear_reduce_scatter.py # 修改 M 的值 -sed -i "s/M = .*/M = ${M}/" linear_parallel_mc2_linear_reduce_scatter.py +sed -i "s/^M = .*/M = ${M}/" linear_parallel_mc2_linear_reduce_scatter.py # 修改 K 的值 -sed -i "s/K = .*/K = ${K}/" linear_parallel_mc2_linear_reduce_scatter.py +sed -i "s/^K = .*/K = ${K}/" linear_parallel_mc2_linear_reduce_scatter.py # 修改 N 的值 -sed -i "s/N = .*/N = ${N}/" linear_parallel_mc2_linear_reduce_scatter.py +sed -i "s/^N = .*/N = ${N}/" linear_parallel_mc2_linear_reduce_scatter.py # 修改 DATA_TYPE 的值 -sed -i "s/DATA_TYPE = .*/DATA_TYPE = ${DATA_TYPE_PY}/" linear_parallel_mc2_linear_reduce_scatter.py +sed -i "s/^DATA_TYPE = .*/DATA_TYPE = ${DATA_TYPE_PY}/" linear_parallel_mc2_linear_reduce_scatter.py g++ 
-D_GLIBCXX_USE_CXX11_ABI=$cxx_abi -I "${ATB_HOME_PATH}/include" -I "${ASCEND_HOME_PATH}/include" -L "${ATB_HOME_PATH}/lib" -L "${ASCEND_HOME_PATH}/lib64" \ linear_parallel_generation.cpp -l atb -l ascendcl -l hccl -l nnopbase -l opapi -o linear_parallel_generation -- Gitee From fc6d7ef058e5973dfeb72b3673d36b239c6bee51 Mon Sep 17 00:00:00 2001 From: guanguan Date: Tue, 23 Sep 2025 19:17:38 +0800 Subject: [PATCH 78/94] fix --- .../linear_parallel_operation.cpp | 38 ------------------- .../linear_parallel_operation.h | 1 - 2 files changed, 39 deletions(-) diff --git a/src/ops_infer/linear_parallel/linear_parallel_operation.cpp b/src/ops_infer/linear_parallel/linear_parallel_operation.cpp index 810c8d75..a4396794 100644 --- a/src/ops_infer/linear_parallel/linear_parallel_operation.cpp +++ b/src/ops_infer/linear_parallel/linear_parallel_operation.cpp @@ -37,7 +37,6 @@ static const uint32_t RESIDUAL_TENSOR_INDEX_3 = 3; static const uint32_t RESIDUAL_TENSOR_INDEX_4 = 4; static const uint32_t MAX_OUTPUT_SIZE = 204800; static const uint32_t MAX_K = 24000; -static const uint32_t DIM_4 = 4; static bool AllToAllvcAllGatherGmmOutTensorCheck(const SVector &inTensorDescs, const TensorDesc &outTensorDesc, const std::string &logPrefix) @@ -379,26 +378,9 @@ Status LinearParallelOperation::CheckResidual(const SVector &inTenso return NO_ERROR; } -Status LinearParallelOperation::CheckWeightNzFormat(const SVector &inTensorDescs) const -{ - const TensorDesc &weight = inTensorDescs.at(1); - bool weightNz = (weight.format == ACL_FORMAT_FRACTAL_NZ); - if (weightNz) { - if (weight.shape.dimNum != DIM_4) { - ATB_LOG(ERROR) << GetLogPrefix() << "fractal_nz shape dim should be 4. 
now is "<< weight.shape.dimNum; - return ERROR_INVALID_TENSOR_DIM; - } - } - return NO_ERROR; -} Status LinearParallelOperation::InferShapeCheckLinearAllReduce(const SVector &inTensorDescs) const { - Status st = CheckWeightNzFormat(inTensorDescs); - if (st != NO_ERROR) { - return st; - } - if (!OperationUtil::MatmulInTensorDescsCheck(inTensorDescs, GetLogPrefix(), commonCheckParam_)) { return ERROR_INVALID_TENSOR_DIM; } @@ -416,11 +398,6 @@ Status LinearParallelOperation::InferShapeCheckLinearAllReduce(const SVector &inTensorDescs) const { - Status st = CheckWeightNzFormat(inTensorDescs); - if (st != NO_ERROR) { - return st; - } - if (!OperationUtil::MatmulInTensorDescsCheck(inTensorDescs, GetLogPrefix(), commonCheckParam_)) { return ERROR_INVALID_TENSOR_DIM; } @@ -446,11 +423,6 @@ Status LinearParallelOperation::InferShapeCheckLinearReduceScatter(const SVector Status LinearParallelOperation::InferShapeCheckAllGatherLinear(const SVector &inTensorDescs) const { - Status st = CheckWeightNzFormat(inTensorDescs); - if (st != NO_ERROR) { - return st; - } - bool isQuant = param_.quantType > infer::LinearParallelParam::QuantType::QUANT_TYPE_UNQUANT && param_.quantType < infer::LinearParallelParam::QuantType::QUANT_TYPE_MAX; if (isQuant && inTensorDescs.at(3).dtype == ACL_FLOAT && param_.outDataType == ACL_FLOAT16) { @@ -472,11 +444,6 @@ Status LinearParallelOperation::InferShapeCheckAllGatherLinear(const SVector &inTensorDescs) const { - Status st = CheckWeightNzFormat(inTensorDescs); - if (st != NO_ERROR) { - return st; - } - if (param_.twoDimTPInfo.rsDim * param_.twoDimTPInfo.agDim != param_.rankSize) { ATB_LOG(ERROR) << "agDim * rsDim should equal to rankSize"; return ERROR_INVALID_PARAM; @@ -504,11 +471,6 @@ LinearParallelOperation::InferShapeCheckAllGatherLinearReduceScatter(const SVect Status LinearParallelOperation::InferShapeCheckAllToAllvcAllGatherGmm(const SVector &inTensorDescs) const { - Status st = CheckWeightNzFormat(inTensorDescs); - if (st != NO_ERROR) { - 
return st; - } - bool isQuant = param_.quantType > infer::LinearParallelParam::QuantType::QUANT_TYPE_UNQUANT && param_.quantType < infer::LinearParallelParam::QuantType::QUANT_TYPE_MAX; if (isQuant && inTensorDescs.at(2).dtype == ACL_FLOAT && param_.outDataType == ACL_FLOAT16) { diff --git a/src/ops_infer/linear_parallel/linear_parallel_operation.h b/src/ops_infer/linear_parallel/linear_parallel_operation.h index 2658ac3f..d9e17af5 100755 --- a/src/ops_infer/linear_parallel/linear_parallel_operation.h +++ b/src/ops_infer/linear_parallel/linear_parallel_operation.h @@ -38,7 +38,6 @@ private: Status InferShapeAllToAllvcAllGatherGmm(const SVector &inTensorDescs, SVector &outTensorDescs) const; Status CheckResidual(const SVector &inTensorDescs) const; - Status CheckWeightNzFormat(const SVector &inTensorDescs) const; Status InferShapeCheckLinearAllReduce(const SVector &inTensorDescs) const; Status InferShapeCheckLinearReduceScatter(const SVector &inTensorDescs) const; Status InferShapeCheckAllGatherLinear(const SVector &inTensorDescs) const; -- Gitee From b59fa3aae909b074942adb6e5729853c1d465e8a Mon Sep 17 00:00:00 2001 From: guanguan Date: Tue, 23 Sep 2025 19:31:59 +0800 Subject: [PATCH 79/94] fix --- src/ops_infer/linear_parallel/linear_parallel_lcoc_runner.cpp | 4 ++++ 1 file changed, 4 insertions(+) mode change 100644 => 100755 src/ops_infer/linear_parallel/linear_parallel_lcoc_runner.cpp diff --git a/src/ops_infer/linear_parallel/linear_parallel_lcoc_runner.cpp b/src/ops_infer/linear_parallel/linear_parallel_lcoc_runner.cpp old mode 100644 new mode 100755 index 3d3ebdc8..9d4ac7da --- a/src/ops_infer/linear_parallel/linear_parallel_lcoc_runner.cpp +++ b/src/ops_infer/linear_parallel/linear_parallel_lcoc_runner.cpp @@ -118,6 +118,10 @@ Status LinearParallelLcocRunner::SetupImpl(RunnerVariantPack &runnerVariantPack) ATB_LOG(ERROR) << GetLogPrefix() << "GetCoCDataTypeDesc failed."; return ERROR_INVALID_PARAM; } + if (mmInfo.transB && mmInfo.weightNz) { + ATB_LOG(ERROR) 
<< GetLogPrefix() << "transWeight and weightNz can not be true at the same time."; + return ERROR_INVALID_PARAM; + } Lcal::CoCParamDesc coCParamDesc{ .dataTypeDesc = dataTypeDesc, .mmInfo = mmInfo, -- Gitee From a96807388e40e9a663f9ced9d705906c698116df Mon Sep 17 00:00:00 2001 From: guanguan Date: Tue, 23 Sep 2025 19:55:17 +0800 Subject: [PATCH 80/94] fix --- tests/apitest/opstest/csv/linear_parallel.csv | 24 +++++++++---------- 1 file changed, 12 insertions(+), 12 deletions(-) mode change 100644 => 100755 tests/apitest/opstest/csv/linear_parallel.csv diff --git a/tests/apitest/opstest/csv/linear_parallel.csv b/tests/apitest/opstest/csv/linear_parallel.csv old mode 100644 new mode 100755 index ae51e1c4..1e0de978 --- a/tests/apitest/opstest/csv/linear_parallel.csv +++ b/tests/apitest/opstest/csv/linear_parallel.csv @@ -20,7 +20,7 @@ CaseNum|CaseName|OpName|OpParam|InNum|InDType|InFormat|InShape|OutNum|OutDType|O 19|IErrorIniMatchCase3|LinearParallelOperation|{"transWeight":true,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"hccl"}|2|float16;float16|nd;nhwc|2,16;32,16|0||||zero;zero|0,0;0,0|||||||I:ERROR_INVALID_TENSOR_INI_MATCH 20|IErrorIniMatchCase4|LinearParallelOperation|{"transWeight":true,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":true,"backend":"hccl"}|3|float16;float16;bool|nd;nd;nd|2,16;32,16;1,32|0||||zero;zero;zero|0,0;0,0;0,0|||||||I:ERROR_INVALID_TENSOR_INI_MATCH 21|IErrorIniMatchCase5|LinearParallelOperation|{"transWeight":true,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":true,"backend":"hccl"}|3|float16;float16;float16|nd;nd;nhwc|2,16;32,16;1,32|0||||zero;zero;zero|0,0;0,0;0,0|||||||I:ERROR_INVALID_TENSOR_INI_MATCH -22|LinearParallelLcocSupportWeightNz|LinearParallelOperation|{"transWeight":true,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc"}|2|float16;float16|nd;fractal_nz|2,16;32,16|1|float16|nd|2,32|customize;customize;customize|-1,1;-1,1;-2,2||||||Ascend910B|NO_ERROR 
+22|LinearParallelLcocSupportWeightNz|LinearParallelOperation|{"transWeight":false,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc"}|2|float16;float16|nd;fractal_nz|2,16;16,32|1|float16|nd|2,32|customize;customize;customize|-1,1;-1,1;-2,2||||||Ascend910B|NO_ERROR 23|IErrorDimCase0|LinearParallelOperation|{"transWeight":true,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"hccl"}|2|float16;float16|nd;nd|16;32,16|0||||zero;zero|0,0;0,0|||||||I:ERROR_INVALID_TENSOR_DIM 24|IErrorDimCase1|LinearParallelOperation|{"transWeight":true,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"hccl"}|2|float16;float16|nd;nd|2,16;32|0||||zero;zero|0,0;0,0|||||||I:ERROR_INVALID_TENSOR_DIM 25|LinearParallelHcclSupportWeightNz|LinearParallelOperation|{"transWeight":true,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"hccl"}|2|float16;float16|nd;fractal_nz|2,16;32,16|1|float16|nd|2,32|customize;customize;customize|-1,1;-1,1;-2,2||||||Ascend910B|NO_ERROR @@ -67,17 +67,17 @@ CaseNum|CaseName|OpName|OpParam|InNum|InDType|InFormat|InShape|OutNum|OutDType|O 66|PureMatmulW8A8Bf16PerChannel|LinearParallelOperation|{"transWeight":false,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc","type":3,"quantType":1,"outDataType":27}|4|int8;int8;int32;int64|nd;nd;nd;nd|2,4;4,4;4;4|1|bf16|nd|2,4|customize;customize;customize;customize|-5,5;-5,5;-10,10;1,2||||||Ascend910B|NO_ERROR 67|PureMatmulW8A8InvalidQuantType|LinearParallelOperation|{"transWeight":false,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc","type":3,"quantType":2,"outDataType":27}|4|int8;int8;int32;int64|nd;nd;nd;nd|2,4;4,4;4;4|1|bf16|nd|2,4|customize;customize;customize;customize|-5,5;-5,5;-10,10;1,2||||||Ascend910B|S:ERROR_INVALID_PARAM 
68|PureMatmulKeepIntermediateInValid|LinearParallelOperation|{"transWeight":false,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc","keepIntermediate":true,"type":3,"quantType":0,"outDataType":27}|4|int8;int8;int32;int64|nd;nd;nd;nd|2,4;4,4;1;1|1|bf16|nd|2,4|customize;customize;customize;customize|-5,5;-5,5;-10,10;1,2||||||Ascend910B|C:ERROR_INVALID_PARAM -69|MatmulAllReduceNzCase1|LinearParallelOperation|{"transWeight":true,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc"}|2|float16;float16|nd;fractal_nz|1,5120;16,5120|1|float16|nd|1,16|customize;customize|-1,1;-1,1||||||Ascend910B|NO_ERROR -70|MatmulAllReduceNzCase2|LinearParallelOperation|{"transWeight":true,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc"}|2|float16;float16|nd;fractal_nz|28,5,1024;8192,1024|1|float16|nd|28,5,8192|customize;customize|-1,1;-1,1||||||Ascend910B|NO_ERROR -71|MatmulAllReduceNzCase3|LinearParallelOperation|{"transWeight":true,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc"}|2|float16;float16|nd;fractal_nz|2,16;1,1,32,16|1|float16|nd|2,32|customize;customize|-1,1;-1,1||||||Ascend910B|NO_ERROR -72|MatmulAllReduceNzCase4|LinearParallelOperation|{"transWeight":true,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc"}|2|float16;float16|nd;fractal_nz|2,2,16;1,1,32,16|1|float16|nd|2,2,32|customize;customize|-1,1;-1,1||||||Ascend910B|NO_ERROR -73|MatmulAllReduceNzCase5|LinearParallelOperation|{"transWeight":true,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc"}|2|bf16;bf16|nd;fractal_nz|1,5120;16,5120|1|bf16|nd|1,16|customize;customize|-1,1;-1,1||||||Ascend910B|NO_ERROR -74|MatmulAllReduceNzCase6|LinearParallelOperation|{"transWeight":true,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc"}|2|bf16;bf16|nd;fractal_nz|2,2,16;1,1,32,16|1|bf16|nd|2,2,32|customize;customize|-1,1;-1,1||||||Ascend910B|NO_ERROR 
-75|PureMatmulNz|LinearParallelOperation|{"transWeight":true,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc","type": 3}|2|float16;float16|nd;fractal_nz|1,5120;16,5120|1|float16|nd|1,16|customize;customize|-1,1;-1,1||||||Ascend910B|NO_ERROR -76|PureMatmulNzBf16|LinearParallelOperation|{"transWeight":true,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc","type": 3}|2|bf16;bf16|nd;fractal_nz|1,5120;16,5120|1|bf16|nd|1,16|customize;customize|-1,1;-1,1||||||Ascend910B|NO_ERROR -77|MatmulReduceScatterNz|LinearParallelOperation|{"transWeight":true,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc","type":1}|2|float16;float16|nd;fractal_nz|2,16;32,16|1|float16|nd|1,32|customize;customize|-1,1;-1,1||||||Ascend910B|NO_ERROR -78|MatmulReduceScatterNzBf16|LinearParallelOperation|{"transWeight":true,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc","type":1}|2|bf16;bf16|nd;fractal_nz|2,16;32,16|1|bf16|nd|1,32|customize;customize|-1,1;-1,1||||||Ascend910B|NO_ERROR -79|AllGatherMatmulNz|LinearParallelOperation|{"transWeight":true,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc","type":2}|2|float16;float16|nd;fractal_nz|2,16;32,16|1|float16|nd|4,32|customize;customize|-1,1;-1,1||||||Ascend910B|NO_ERROR +69|MatmulAllReduceNzCase1|LinearParallelOperation|{"transWeight":false,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc"}|2|float16;float16|nd;fractal_nz|1,5120;5120,16|1|float16|nd|1,16|customize;customize|-1,1;-1,1||||||Ascend910B|NO_ERROR +70|MatmulAllReduceNzCase2|LinearParallelOperation|{"transWeight":false,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc"}|2|float16;float16|nd;fractal_nz|28,5,1024;1024,8192|1|float16|nd|28,5,8192|customize;customize|-1,1;-1,1||||||Ascend910B|NO_ERROR 
+71|MatmulAllReduceNzCase3|LinearParallelOperation|{"transWeight":false,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc"}|2|float16;float16|nd;fractal_nz|2,16;1,2,16,16|1|float16|nd|2,32|customize;customize|-1,1;-1,1||||||Ascend910B|NO_ERROR +72|MatmulAllReduceNzCase4|LinearParallelOperation|{"transWeight":false,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc"}|2|float16;float16|nd;fractal_nz|2,2,16;1,2,16,16|1|float16|nd|2,2,32|customize;customize|-1,1;-1,1||||||Ascend910B|NO_ERROR +73|MatmulAllReduceNzCase5|LinearParallelOperation|{"transWeight":false,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc"}|2|bf16;bf16|nd;fractal_nz|1,5120;5120,16|1|bf16|nd|1,16|customize;customize|-1,1;-1,1||||||Ascend910B|NO_ERROR +74|MatmulAllReduceNzCase6|LinearParallelOperation|{"transWeight":false,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc"}|2|bf16;bf16|nd;fractal_nz|2,2,16;1,2,16,16|1|bf16|nd|2,2,32|customize;customize|-1,1;-1,1||||||Ascend910B|NO_ERROR +75|PureMatmulNz|LinearParallelOperation|{"transWeight":false,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc","type": 3}|2|float16;float16|nd;fractal_nz|1,5120;5120,16|1|float16|nd|1,16|customize;customize|-1,1;-1,1||||||Ascend910B|NO_ERROR +76|PureMatmulNzBf16|LinearParallelOperation|{"transWeight":false,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc","type": 3}|2|bf16;bf16|nd;fractal_nz|1,5120;5120,16|1|bf16|nd|1,16|customize;customize|-1,1;-1,1||||||Ascend910B|NO_ERROR +77|MatmulReduceScatterNz|LinearParallelOperation|{"transWeight":false,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc","type":1}|2|float16;float16|nd;fractal_nz|2,16;16,32|1|float16|nd|1,32|customize;customize|-1,1;-1,1||||||Ascend910B|NO_ERROR 
+78|MatmulReduceScatterNzBf16|LinearParallelOperation|{"transWeight":false,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc","type":1}|2|bf16;bf16|nd;fractal_nz|2,16;16,32|1|bf16|nd|1,32|customize;customize|-1,1;-1,1||||||Ascend910B|NO_ERROR +79|AllGatherMatmulNz|LinearParallelOperation|{"transWeight":false,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"lcoc","type":2}|2|float16;float16|nd;fractal_nz|2,16;16,32|1|float16|nd|4,32|customize;customize|-1,1;-1,1||||||Ascend910B|NO_ERROR 80|LinearParallelBf16Error|LinearParallelOperation|{"transWeight":true,"rank":0,"rankSize":2,"rankRoot":0,"hasResidual":false,"backend":"hccl"}|2|bf16;bf16|nd;nd|2,16;32,16|1|bf16|nd|1,32|customize;customize|-1,1;-1,1||||||Ascend310P|I:ERROR_INVALID_TENSOR_DTYPE 81|rsv|LinearParallelOperation|{"rank":0,"rankSize":2,"rsv":[1]}|0||||0||||||||||||C:ERROR_INVALID_PARAM 82|NoErrorCase0AllGatherLinearReduceScatter|LinearParallelOperation|{"transWeight":true,"rank":0,"rankSize":8,"rankRoot":0,"hasResidual":false,"backend":"lcoc","type":4,"twoDimTPInfo":{"agDim":4,"rsDim":2,"innerDimIsAg":1}}|2|float16;float16|nd;nd|2,16;32,16|1|float16|nd|4,32|customize;customize|-1,1;-1,1||||||Ascend910B|NO_ERROR -- Gitee From c570a872bad5928197dc871728312ed5456c4d2a Mon Sep 17 00:00:00 2001 From: huangxiaolan Date: Wed, 24 Sep 2025 09:35:01 +0800 Subject: [PATCH 81/94] fix include --- .../tbe_adapter/platform/tiling/platform/platform_ascendc.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/kernels/tbe_adapter/platform/tiling/platform/platform_ascendc.h b/src/kernels/tbe_adapter/platform/tiling/platform/platform_ascendc.h index 6fa0fcfd..8e2b0a53 100644 --- a/src/kernels/tbe_adapter/platform/tiling/platform/platform_ascendc.h +++ b/src/kernels/tbe_adapter/platform/tiling/platform/platform_ascendc.h @@ -18,7 +18,7 @@ #include #include -#include "stubs\include\metadef\inc\external\platform\platform_info.h" +#include 
"stubs/include/metadef/inc/external/platform/platform_info.h" #define ASCENDC_ASSERT(cond, behavior) \ do { \ -- Gitee From 3f578716360cc1f0606b241010d3dab6def84c8a Mon Sep 17 00:00:00 2001 From: x30073543 Date: Mon, 15 Sep 2025 11:31:52 +0800 Subject: [PATCH 82/94] null ptr check --- .../mixkernels/fusion/tiling/fusion_tiling.cpp | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/src/kernels/mixkernels/fusion/tiling/fusion_tiling.cpp b/src/kernels/mixkernels/fusion/tiling/fusion_tiling.cpp index 5145c511..a6acecd4 100644 --- a/src/kernels/mixkernels/fusion/tiling/fusion_tiling.cpp +++ b/src/kernels/mixkernels/fusion/tiling/fusion_tiling.cpp @@ -36,9 +36,12 @@ Status FusionTiling(const LaunchParam &launchParam, KernelInfo &kernelInfo) std::string tilingFuncName = inferWorkspaceFuncName + "tiling_func"; *(void **)(&tilingFunc) = dlsym(handle, tilingFuncName.c_str()); errorInfo = dlerror(); + if (errorInfo != nullptr || tilingFunc == nullptr) { + return Status::FailStatus(-1, "Get tilingFunc failed!"); + } KernelArgs *kernelArgs = new (std::nothrow) KernelArgs; - if (errorInfo != nullptr || tilingFunc == nullptr || kernelArgs == nullptr) { - return Status::FailStatus(-1, "Get tilingFunc or Malloc for binary params failed!"); + if (kernelArgs == nullptr) { + return Status::FailStatus(-1, "Malloc for binary params failed!"); } kernelArgs->tilingDevice = static_cast(tilingDataPtr); kernelArgs->tilingDeviceDup = kernelArgs->tilingDevice; @@ -48,10 +51,15 @@ Status FusionTiling(const LaunchParam &launchParam, KernelInfo &kernelInfo) MKI_LOG(INFO) << "now inferWorkspaceFuncName is" << inferWorkspaceFuncName; *(void **)(&inferworkspaceFunc) = dlsym(handle, inferWorkspaceFuncName.c_str()); errorInfo = dlerror(); + if (errorInfo != nullptr || inferworkspaceFunc == nullptr) { + delete kernelArgs; + return Status::FailStatus(-1, "Get workspaceFunc failed!"); + } KernelArgsForInferShapeWorkspaceWithTiling *wsWithTiling = new (std::nothrow) 
KernelArgsForInferShapeWorkspaceWithTiling; - if (errorInfo != nullptr || inferworkspaceFunc == nullptr || wsWithTiling == nullptr) { - return Status::FailStatus(-1, "Get workspaceFunc or Get workspace tiling failed!"); + if (wsWithTiling == nullptr) { + delete kernelArgs; + return Status::FailStatus(-1, "Get workspace tiling space failed!"); } wsWithTiling->tilingDevice = tilingDataPtr; wsWithTiling->tilingDeviceDup = tilingDataPtr; -- Gitee From b7f46d29536e431174b31e4058a037487ace32d3 Mon Sep 17 00:00:00 2001 From: zouyanlong Date: Wed, 24 Sep 2025 10:45:53 +0800 Subject: [PATCH 83/94] fix --- example/op_demo/fused_add_topk_div/README.md | 26 +++++++++++++++++++ example/op_demo/fused_add_topk_div/build.sh | 4 +-- ...pk_div.cpp => fused_add_topk_div_demo.cpp} | 0 3 files changed, 28 insertions(+), 2 deletions(-) rename example/op_demo/fused_add_topk_div/{fused_add_topk_div.cpp => fused_add_topk_div_demo.cpp} (100%) diff --git a/example/op_demo/fused_add_topk_div/README.md b/example/op_demo/fused_add_topk_div/README.md index ad529f6e..f2f62b1f 100644 --- a/example/op_demo/fused_add_topk_div/README.md +++ b/example/op_demo/fused_add_topk_div/README.md @@ -28,3 +28,29 @@ ## 额外说明 示例中生成的数据不代表实际场景,如需数据生成参考请查看python用例目录: tests/apitest/opstest/python/operations/fused_add_topk_div/ + +## 场景说明 + + 该算子所给demo仅支持在Atlas A2/A3系列产品上运行,demo的场景说明如下: + +- fused_add_topk_div_demo + + **参数设置**: + | 成员名称 | 取值 | + | :------------------ | :------------------- | + | groupNum | 8 | + | groupTopk | 4 | + | n | 2 | + | k | 8 | + | activationType | `ACTIVATION_SIGMOID` | + | isNorm | `true` | + | scale | 2.5 | + | enableExpertMapping | `false` | + + **数据规格**: + | tensor名字 | 数据类型 | 数据格式 | 维度信息 | cpu/npu | + | ---------- | -------- | -------- | ---------- | ------- | + | `x` | float16 | nd | [512, 256] | npu | + | `add_num` | float16 | nd | [256] | npu | + | `y` | float | nd | [512, 8] | npu | + | `indices` | int32 | nd | [512, 8] | npu | diff --git 
a/example/op_demo/fused_add_topk_div/build.sh b/example/op_demo/fused_add_topk_div/build.sh index 351a64e8..fc37e66c 100644 --- a/example/op_demo/fused_add_topk_div/build.sh +++ b/example/op_demo/fused_add_topk_div/build.sh @@ -20,5 +20,5 @@ except ImportError: echo "Using cxx_abi=$cxx_abi" g++ -D_GLIBCXX_USE_CXX11_ABI=$cxx_abi -I "${ATB_HOME_PATH}/include" -I "${ASCEND_HOME_PATH}/include" -L "${ATB_HOME_PATH}/lib" -L "${ASCEND_HOME_PATH}/lib64" \ -fused_add_topk_div.cpp ../demo_util.h -l atb -l ascendcl -o fused_add_topk_div -./fused_add_topk_div +fused_add_topk_div_demo.cpp ../demo_util.h -l atb -l ascendcl -o fused_add_topk_div_demo +./fused_add_topk_div_demo diff --git a/example/op_demo/fused_add_topk_div/fused_add_topk_div.cpp b/example/op_demo/fused_add_topk_div/fused_add_topk_div_demo.cpp similarity index 100% rename from example/op_demo/fused_add_topk_div/fused_add_topk_div.cpp rename to example/op_demo/fused_add_topk_div/fused_add_topk_div_demo.cpp -- Gitee From 28d702a865aa32bc7e80d70d165d954462942537 Mon Sep 17 00:00:00 2001 From: He Changcheng Date: Wed, 24 Sep 2025 15:24:02 +0800 Subject: [PATCH 84/94] fix --- src/kernels/lcal/src/lccl.cpp | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/kernels/lcal/src/lccl.cpp b/src/kernels/lcal/src/lccl.cpp index b1d81330..eea814a5 100644 --- a/src/kernels/lcal/src/lccl.cpp +++ b/src/kernels/lcal/src/lccl.cpp @@ -59,7 +59,6 @@ int GetAclResInCurThread(int type, uint32_t &resource) aclFn = reinterpret_cast(sym); initFlag.store(LCAL_SUCCESS, std::memory_order_release); MKI_LOG(DEBUG) << "Loaded libascendcl.so and resolved aclrtGetResInCurrentThread from: " << p; - return; // 成功 }); // 初始化结果判定 @@ -73,6 +72,12 @@ int GetAclResInCurThread(int type, uint32_t &resource) return LCAL_ERROR_PARA_CHECK_FAIL; } + // 调用前检查函数指针有效性 + if (aclFn == nullptr) { + MKI_LOG(ERROR) << "aclrtGetResInCurrentThread function pointer is null."; + return LCAL_ERROR_INTERNAL; + } + // 调用底层函数 const int ret = 
aclFn(type, &resource); if (ret != ACL_SUCCESS) { -- Gitee From 4858fc7f94c003ad557e8bf6cb6d0e6e0c0e9c04 Mon Sep 17 00:00:00 2001 From: huangxiaolan Date: Wed, 24 Sep 2025 17:23:02 +0800 Subject: [PATCH 85/94] recover class defination --- .../tbe_adapter/platform/tiling/platform/platform_ascendc.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/kernels/tbe_adapter/platform/tiling/platform/platform_ascendc.h b/src/kernels/tbe_adapter/platform/tiling/platform/platform_ascendc.h index 8e2b0a53..ad6082e9 100644 --- a/src/kernels/tbe_adapter/platform/tiling/platform/platform_ascendc.h +++ b/src/kernels/tbe_adapter/platform/tiling/platform/platform_ascendc.h @@ -18,7 +18,6 @@ #include #include -#include "stubs/include/metadef/inc/external/platform/platform_info.h" #define ASCENDC_ASSERT(cond, behavior) \ do { \ @@ -27,6 +26,9 @@ raise(SIGABRT); \ } \ } while (0) +namespace fe { +class PlatformInfo; +} namespace platform_ascendc { enum class CoreMemType { -- Gitee From 2a6076f201577e02def5ff9027cb0d5de467e412 Mon Sep 17 00:00:00 2001 From: huangxiaolan Date: Wed, 24 Sep 2025 17:24:38 +0800 Subject: [PATCH 86/94] recover class defination --- .../tbe_adapter/platform/tiling/platform/platform_ascendc.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/kernels/tbe_adapter/platform/tiling/platform/platform_ascendc.h b/src/kernels/tbe_adapter/platform/tiling/platform/platform_ascendc.h index ad6082e9..6eda0979 100644 --- a/src/kernels/tbe_adapter/platform/tiling/platform/platform_ascendc.h +++ b/src/kernels/tbe_adapter/platform/tiling/platform/platform_ascendc.h @@ -27,7 +27,7 @@ } \ } while (0) namespace fe { -class PlatformInfo; +class PlatFormInfos; } namespace platform_ascendc { -- Gitee From 97ad742b4f5323e07bc5c652299c3623748ea056 Mon Sep 17 00:00:00 2001 From: qiuqianjin Date: Wed, 24 Sep 2025 17:54:50 +0800 Subject: [PATCH 87/94] [task]pa test support range -5 to 5 --- .../paged_attention/precision_calcu.py | 2 +- 
.../test_paged_attention_operation_range_5.py | 1411 +++++++++++++++++ 2 files changed, 1412 insertions(+), 1 deletion(-) create mode 100644 tests/apitest/opstest/python/operations/paged_attention/test_paged_attention_operation_range_5.py diff --git a/tests/apitest/opstest/python/operations/paged_attention/precision_calcu.py b/tests/apitest/opstest/python/operations/paged_attention/precision_calcu.py index 2db3c4b3..9ad48e21 100644 --- a/tests/apitest/opstest/python/operations/paged_attention/precision_calcu.py +++ b/tests/apitest/opstest/python/operations/paged_attention/precision_calcu.py @@ -134,7 +134,7 @@ def compare_cv(golden:torch.Tensor, gpu:torch.Tensor, actual:torch.Tensor): logging.info(f"mere_npu:{mere_npu} mere_gpu:{mere_gpu}") logging.info(f"rmse_npu:{rmse_npu} rmse_gpu:{rmse_gpu}") logging.info(f"MARE:{mare_rate} MERE:{mere_rate} RMSE:{rmse_rate} EB:{EB}") - logging.info(f"new golden cv result:{result}") + print(f"new golden cv result:{result}") return result if __name__ == '__main__': diff --git a/tests/apitest/opstest/python/operations/paged_attention/test_paged_attention_operation_range_5.py b/tests/apitest/opstest/python/operations/paged_attention/test_paged_attention_operation_range_5.py new file mode 100644 index 00000000..d613fa65 --- /dev/null +++ b/tests/apitest/opstest/python/operations/paged_attention/test_paged_attention_operation_range_5.py @@ -0,0 +1,1411 @@ +# +# Copyright (c) 2025 Huawei Technologies Co., Ltd. +# This file is a part of the CANN Open Software. +# Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. 
+# See LICENSE in the root of the software repository for the full text of the License. +# +import json +import math +import os +import random +import sys +import unittest +import collections +import numpy as np +import torch +import torch_npu + +sys.path.append(os.path.join(os.path.dirname(__file__), "../")) +import operation_test # NOQA: E402 +from precision_calcu import * + +MAX_SEQ_LEN = 1024 + +class TestPagedAttention(operation_test.OperationTest): + + def compare_output_data(self, out, golden, ratios): + error_count = 0 + strict_error_count = 0 + fp16_min_normal = 1.0 / (1 << 14) + golden = golden.flatten().to(torch.float32) + out = out.flatten().to(torch.float32) + len = out.shape[0] + diff = torch.abs(golden - out) + max_diff = diff.max().item() + limit_error = torch.maximum(torch.abs(golden * ratios[0]), torch.tensor(ratios[1])) + strict_limit_error = torch.maximum(torch.abs(golden * ratios[2]), torch.tensor(ratios[3])) + error_count = torch.gt(diff, limit_error).sum().item() + strict_error_count = torch.gt(diff, strict_limit_error).sum().item() + print(f"maxDiff {max_diff}") + print("1/1000 Accuracy is %f", 1 - float(error_count) / len) + print("5/1000 Accuracy is %f", 1 - float(strict_error_count) / len) + if self.data_type == torch.bfloat16 or self.is_int8_flag: + print("accuracy is correct in old standard: %r", (float(strict_error_count) / len) <= ratios[2]) + else: + print("accuracy is correct in old standard: %r", (float(strict_error_count) / len) <= ratios[0]) + calc_times = self.head_size * self.max_context_len + 4 + if self.data_type == torch.bfloat16: + if calc_times < 2048: + error = 2**(-7) + else : + error = 2**(-6) + error_threshold = torch.clamp(torch.abs(golden), min = 1) * error + res = (diff <= error_threshold).all().item() + print("accuracy is correct in new standard: %r", res) + return res + else: + if calc_times < 2048: + error = 2**(-8) + else : + error = 2**(-7) + error_threshold = torch.clamp(torch.abs(golden), min = 1) * error + 
res = (diff <= error_threshold).all().item() + print("accuracy is correct in new standard: %r", res) + return res + + def get_alibi_slopes(self, n_heads): + n = 2 ** math.floor(math.log2(n_heads)) + m0 = 2.0 ** (-8.0 / n) + slopes = torch.pow(m0, torch.arange(1, n + 1)) + if n < n_heads: + m1 = 2.0 ** ( -4.0 / n) + mm = torch.pow(m1, torch.arange(1, 1 + 2 * (n_heads - n), 2)) + slopes = torch.cat([slopes, mm]) + # slopes = torch.ones(n_heads) + return slopes + + def group_mm_torch(self, heads, kv_head, A, B, razor_mod, is_k): + group_head = heads // kv_head + score_high = None + for i in range(kv_head): + if self.is_int8_flag: + int8_B = B[i: (i+1), :, :, ] + head_dim = int8_B.shape[2] + float32_B = int8_B.to(torch.float32) + if is_k: + if self.has_bias: + float32_B = float32_B + self.offset1[(i + razor_mod) * head_dim : (i + razor_mod + 1) * head_dim].to(torch.float32) + fp32_B = float32_B.to(torch.float32) * self.de_scale1_fp32[(i + razor_mod) * head_dim : (i + razor_mod + 1) * head_dim] + fp32_B = torch.permute(fp32_B, (0, 2, 1)) + else: + if self.has_bias: + float32_B = float32_B + self.offset2[(i + razor_mod) * head_dim : (i + razor_mod + 1) * head_dim] + fp32_B = float32_B.to(torch.float32) * self.de_scale2_fp32[(i + razor_mod) * head_dim : (i + razor_mod + 1) * head_dim] + group_score_high = torch.matmul(A[i * group_head: (i + 1) * group_head, :, :].to(torch.float32), + fp32_B) + elif self.is_quant_flag: + group_score_int32 = torch.matmul(A[i*group_head: (i + 1)*group_head, :, :].to(torch.int32), + B[i: (i+1), :, :].to(torch.int32)).to(torch.int32) + if is_k: + group_score_high = group_score_int32.to(torch.float32) * self.de_scale1_fp32[(i * group_head): (i + 1) * group_head].reshape(group_head, 1, 1).to(torch.float32) + else: + group_score_high = group_score_int32.to(torch.float32) * self.de_scalev[(i * group_head): (i + 1) * group_head].reshape(group_head, 1, 1).to(torch.float32) + else: + group_score_high = torch.matmul(A[i * group_head: (i + 1) * 
group_head, :, :].to(torch.float32), + B[i:(i + 1), :, :].to(torch.float32)) + if score_high is None: + score_high = group_score_high + else: + score_high = torch.cat((score_high, group_score_high), 0) + return score_high + + def process_deq_scale(self, deq_scale) -> np.ndarray: + new_deq_scale = np.frombuffer(deq_scale.tobytes(), dtype=np.uint32) + return new_deq_scale.astype(np.uint64) + + def softmax(self, sim): + row_max = torch.max(sim, axis=-1, keepdims=True)[0] + sim_sub = sim - row_max + sim_sub = torch.exp(sim_sub) + row_sum = torch.sum(sim_sub, axis=-1, keepdims=True) + soft_res = sim_sub / row_sum + return soft_res + + def softmax_numpy(self, sim): + sim = sim.cpu().numpy() + row_max = np.max(sim, axis=-1, keepdims=True) + sim_sub = sim - row_max + sim_sub = np.exp(sim_sub) + row_sum = np.sum(sim_sub, axis=-1, keepdims=True) + soft_res = sim_sub / row_sum + return soft_res + + def softmax_quant_numpy(self, sim, is_first): + lm = np.max(sim, axis=-1, keepdims=True) + if is_first: + hm = lm + self.dm = 0 + else: + hm = np.maximum(self.gm, lm) + self.dm = self.gm - hm + self.gm = hm + sim_sub = sim - hm + sim_sub = np.exp(sim_sub) + row_sum = np.sum(sim_sub, axis=-1, keepdims=True) + row_maxp = np.max(sim_sub, axis=-1, keepdims=True) + if not self.is_quant_offiline: + scale = row_maxp.astype("float32") / 127.0 + sim_int8 = sim_sub / scale + soft_res = sim_int8.astype("float16") + soft_res = np.rint(soft_res).astype("int8") + de_scalev = self.de_scale2_fp32 * row_maxp[:,0,0] / 127 + else: + soft_res = sim_sub * self.scale.reshape(self.scale.shape[0], 1, 1).numpy() + soft_res = soft_res.astype("float16") + soft_res = np.rint(soft_res).astype("int8") + de_scalev = self.de_scale2_fp32 + return soft_res, row_sum, de_scalev, hm, self.dm + + + def softmax_quant_numpy_online(self, sim, heads, kv_head, value, razor_mod): + group_head = heads // kv_head + score_high = None + # (kv_heads, context_len, head_size) + kv_seqlen = value.shape[1] + cur_kv_seqlen = kv_seqlen 
+ n_loop = (cur_kv_seqlen + self.block_size_calc - 1) // self.block_size_calc + qk_n = self.block_size_calc + self.tmp_l_list = [] + self.tmp_o_list = [] + for cur_nIndx in range(self.kvsplit): + kv_seqlen_align = (kv_seqlen + self.block_size - 1) // self.block_size * self.block_size + start_kv = cur_nIndx * self.kv_split_per_core + cur_kv_seqlen = self.kv_split_per_core + kv_loop = (kv_seqlen_align + self.kv_split_per_core - 1) // self.kv_split_per_core + if cur_nIndx >= kv_loop: + continue + if cur_nIndx == (kv_loop - 1): + cur_kv_seqlen = kv_seqlen - cur_nIndx * self.kv_split_per_core + n_loop = (cur_kv_seqlen + self.block_size_calc - 1) // self.block_size_calc + qk_n = self.block_size_calc + end_kv = start_kv + for n_idx in range(n_loop): + is_first = (n_idx == 0) + if n_idx == n_loop - 1: + qk_n = cur_kv_seqlen - n_idx * self.block_size_calc + end_kv = end_kv + qk_n + sim_block = sim[:, :, start_kv : end_kv] + p_block, ll, de_scalev, hm, dm = self.softmax_quant_numpy(sim_block, is_first) + self.de_scalev = de_scalev + value_block = value[:, start_kv : end_kv, :] + lo = self.group_mm_torch(heads, kv_head, torch.from_numpy(p_block), value_block, razor_mod, 0) + lo = lo.cpu().numpy() + if n_idx == 0: + self.gl = ll + self.go = lo + else: + dm = np.exp(dm) + self.gl = self.gl * dm + self.gl = self.gl + ll + self.go = self.go * dm + self.go = self.go + lo + start_kv = start_kv + qk_n + self.go = self.go / self.gl + self.tmp_o_list.append(self.go.reshape([1, self.num_heads, 1, value.shape[2]])) + ls = np.log(self.gl) + self.gm + self.tmp_l_list.append(ls.reshape([1, self.num_heads])) + if self.kvsplit > 1: + l = np.concatenate(self.tmp_l_list, 0) + o = np.concatenate(self.tmp_o_list, 0) + l = np.transpose(l, (1, 0)) + lse_max = np.max(l, axis=1, keepdims=True) + l_tmp = np.exp(l - lse_max) + lse_sum = np.sum(l_tmp, axis=1, keepdims=True) + lse_logsum = np.log(lse_sum) + lse_max + scale = np.exp(l - lse_logsum) + o = o * scale.transpose(1, 
0)[:,:,np.newaxis,np.newaxis] + self.go = np.sum(o, axis=0, keepdims=True) + self.go = np.squeeze(self.go, axis=0) + return torch.from_numpy(self.go) + + def ref_masked_attention(self, + query, # (1, num_heads, head_size) + key, # (context_len, kv_heads, head_size) + value, + scale: float, + alibi_bias, + razor_rope, + razor_offset_list, + razor_mod, + mask_data_type = torch.bfloat16, + ): + # Q * K.T + query = query + query = torch.permute(query, (1, 0, 2)) + if not self.is_int8_flag: + key = torch.permute(key, (1, 2, 0)) # 0 1 2 + else: + key = torch.permute(key, (1, 0, 2)) + sim_high = self.group_mm_torch(query.shape[0], key.shape[0], query, key, razor_mod, 1) # (head_num, q_seqlen, k_seqlen) + + if razor_rope: + razor_offset_list = razor_offset_list.view(1, 1, razor_offset_list.shape[0]) + sim_high = sim_high.to(torch.float32) + razor_offset_list + sim_high = sim_high.to(torch.float32) * scale + if alibi_bias is not None: + sim_high = sim_high + alibi_bias.to(torch.float32) + # softmax + if self.is_quant_flag: + self.gm = np.full([query.shape[0] , 1, 1], np.finfo(np.float32).min) + p_high, row_sum, de_scalev, _, _ = self.softmax_quant_numpy(sim_high.numpy(), 1) + self.de_scalev = de_scalev + value = torch.permute(value, (1, 0, 2)) + out_high = self.group_mm_torch(query.shape[0], key.shape[0], torch.from_numpy(p_high), value, razor_mod, 0) + out_high = out_high / row_sum + out_high = torch.permute(out_high, (1, 0, 2)) + s_qk = sim_high.numpy() + out = self.softmax_quant_numpy_online(s_qk, query.shape[0], key.shape[0], value, razor_mod) + else: + p_high = self.softmax_numpy(sim_high) + p = torch.from_numpy(p_high).to(mask_data_type) + p_high = torch.from_numpy(p_high) + # P * V + value = torch.permute(value, (1, 0, 2)) + out = self.group_mm_torch(query.shape[0], key.shape[0], p, value, razor_mod, 0) + out_high = self.group_mm_torch(query.shape[0], key.shape[0], p_high, value, razor_mod, 0) + out = torch.permute(out, (1, 0, 2)) + out_high = torch.permute(out_high, 
(1, 0, 2)) + return out, out_high + + def ref_single_query_cached_kv_attention(self, + output, + true_out, + query, + key_cache, # (num_blocks, block_size, num_heads, head_size) + value_cache, # (num_blocks, block_size, num_heads, head_size) + block_tables, + context_lens, + mask, + razor_offset, + razor_rope, + mask_dim = 4, + mask_data_type = torch.bfloat16 + ) -> None: + mask_index_coff = 1 + if self.compressHead: + query = query.view(self.num_tokens * self.kv_heads, self.num_heads // self.kv_heads, self.head_size) + output = output.view(self.num_tokens * self.kv_heads, self.num_heads // self.kv_heads, self.head_size) + true_out = true_out.view(self.num_tokens * self.kv_heads, self.num_heads // self.kv_heads, self.head_size) + if mask_dim == 4: + mask_shape = mask.shape + mask = mask.view(mask_shape[0] * self.kv_heads, self.num_heads // self.kv_heads, 1, self.max_context_len) + else: + mask_index_coff = self.kv_heads + num_heads = query.shape[1] + kv_heads = value_cache.shape[2] + head_size = value_cache.shape[3] + block_size = value_cache.shape[1] + + num_input_tokens = query.shape[0] + index = 0 + razor_mod = 0 + if self.scaleType == 2: + self.logN = torch.tensor([2.0] * len(context_lens)).to(torch.float32) + self.logN.uniform_(1, 2) + for i in range(len(context_lens)): + block_table = block_tables[i] + context_len = int(context_lens[i]) + if context_len == 0: + continue + q = query[index].view(1, num_heads, head_size) + keys = [] + values = [] + razor_offset_list = [] + for j in range(context_len): + block_number = int(block_table[j // block_size]) + block_offset = j % block_size + + k = key_cache[block_number, block_offset, :, :] + k = k.reshape(kv_heads, head_size) + keys.append(k) + + v = value_cache[block_number, block_offset, :, :] + v = v.reshape(kv_heads, head_size) + values.append(v) + + if razor_rope: + offset = razor_offset[block_number, block_offset] + razor_offset_list.append(offset) + + keys = torch.stack(keys, axis=0) + values = 
torch.stack(values, axis=0) + + if razor_rope: + razor_mod = i % self.kv_heads + razor_offset_list = torch.stack(razor_offset_list, axis=0) + self.razor_start_head = (i * num_heads) % self.num_heads + elif self.compressHead: + razor_mod = i % self.kv_heads + self.razor_start_head = (i * num_heads) % self.num_heads + else: + self.razor_start_head = 0 + scale = np.float32(1.0 / (head_size ** 0.5)) + if self.scaleType == 2: + scale *= self.logN[i] + if mask_dim == 4: + out, out_high = self.ref_masked_attention(q, keys, values, scale, mask[i, :, :, :context_len], razor_rope, razor_offset_list, razor_mod, mask_data_type) + out = out.reshape(num_heads, head_size) + elif mask_dim == 3: + out,out_high = self.ref_masked_attention(q, keys, values, scale, mask[i // mask_index_coff, :, :context_len], razor_rope, razor_offset_list, razor_mod, mask_data_type) + out = out.reshape(num_heads, head_size) + else: + out,out_high = self.ref_masked_attention(q, keys, values, scale, mask, razor_rope, razor_offset_list, razor_mod, mask_data_type) + out = out.reshape(num_heads, head_size) + out_high = out_high.reshape(num_heads, head_size) + output[index] = out.to(mask_data_type) + true_out[index] = out_high + index = index + 1 + + def get_blockszie_calc(self, max_context_len, block_size, embeddingSize, embeddingSizeV): + embedQKSplit = 256 if embeddingSize > 256 else embeddingSize + embedVOSplit = 256 if embeddingSizeV > 256 else embeddingSizeV + BLOCK_LIMIT = 128 * 128 + KV_SEQLEN_SLICE = 128 + KV_SEQLEN_SLICE_256 = 256 + KV_SEQLEN_SLICE_512 = 512 + BLOCK_LIMIT_NO_PINGPONG = 128 * 256; + block_size_calc = block_size + headdimMax = np.maximum(embedQKSplit, embedVOSplit) + if block_size <= KV_SEQLEN_SLICE / 2 and \ + block_size * 2 * embedQKSplit <= BLOCK_LIMIT and \ + block_size * 2 * embedVOSplit <= BLOCK_LIMIT: + block_size_calc = block_size * 2 + if not self.is_int8_flag and \ + max_context_len >= KV_SEQLEN_SLICE_256 and \ + self.kv_split_per_core >= KV_SEQLEN_SLICE_256 and \ + 
KV_SEQLEN_SLICE_256 * embedQKSplit <= BLOCK_LIMIT_NO_PINGPONG and \ + KV_SEQLEN_SLICE_256 * embedVOSplit <= BLOCK_LIMIT_NO_PINGPONG and \ + (block_size == KV_SEQLEN_SLICE_256 // 4 or block_size == KV_SEQLEN_SLICE_256 // 2): + block_size_calc = 256 + + if self.is_quant_flag and \ + max_context_len >= KV_SEQLEN_SLICE_512 and \ + self.kv_split_per_core >= KV_SEQLEN_SLICE_512 and \ + KV_SEQLEN_SLICE_512 * embedQKSplit <= BLOCK_LIMIT_NO_PINGPONG * 2 and \ + KV_SEQLEN_SLICE_512 * embedVOSplit <= BLOCK_LIMIT_NO_PINGPONG * 2 and \ + (block_size == KV_SEQLEN_SLICE_256 // 4 or block_size == KV_SEQLEN_SLICE_256 // 2) and \ + KV_SEQLEN_SLICE_512 * headdimMax <= BLOCK_LIMIT_NO_PINGPONG and self.head_num_move < 4: + block_size_calc = KV_SEQLEN_SLICE_512 + return block_size_calc + + def getkvsplit(self, num_tokens, num_heads, max_context_len, block_size, blocknum, isLongSeq): + if isLongSeq: + kvSeqklenMaxAlign = (max_context_len + block_size - 1) // block_size * block_size + kvSeqBlockNum = int(kvSeqklenMaxAlign / block_size) + kvBlockPreCore = int((kvSeqBlockNum + blocknum - 1)) // blocknum + kvSplitPerCore = int(kvBlockPreCore * block_size) + kvSplitCoreNum = int(kvSeqklenMaxAlign + kvSplitPerCore - 1) // kvSplitPerCore + headSplit = int((num_heads + kvSplitCoreNum - 1) // kvSplitCoreNum) + else: + coreNumPerBatch = int((blocknum + num_tokens - 1) // num_tokens) + kvSeqklenMaxAlign = (max_context_len + block_size - 1) // block_size * block_size + kvSeqBlockNum = int(kvSeqklenMaxAlign / block_size) + kvBlockPreCore = int((kvSeqBlockNum + coreNumPerBatch - 1)) // coreNumPerBatch + kvSplitPerCore = int(kvBlockPreCore * block_size) + kvSplitCoreNum = int(kvSeqklenMaxAlign + kvSplitPerCore - 1) // kvSplitPerCore + headSplit = int((num_heads + kvSplitCoreNum - 1) // kvSplitCoreNum) + return kvSplitCoreNum, kvSplitPerCore + + def get_head_num_move(self, num_heads, kvhead, embeddingSize, embeddingSizeV): + if embeddingSize % 32 == 0 and embeddingSizeV % 32 == 0 and embeddingSize <= 
128 and embeddingSizeV <= 128 and num_heads == kvhead: + head_num_move = 4 + else: + head_num_move = 1 + return head_num_move + + def calc_data(self, num_tokens, num_heads, kv_heads, head_size, block_size, num_blocks, k_seqlen,\ + dtype, mask_dim = 4, mask_data_type = torch.bfloat16,\ + dynamic_batch = False, dynamic_seqlen = None, is_int8_flag = False, has_bias = False, + compressHead = False, razor_rope = False, blocknum = 20, is_quant_flag = 0, is_quant_offiline = 0, scaleType = 0): + self.num_heads = num_heads + self.kv_heads = kv_heads + self.num_tokens = num_tokens + self.compressHead = compressHead + self.head_size = head_size + self.scaleType = scaleType + self.group_num = num_heads / kv_heads + logging.debug(f'input info: {num_tokens}, {num_heads}, {kv_heads}, {head_size}, {block_size}, {num_blocks}, {k_seqlen}, {dtype}') + + q_min_range = -5.0 + q_max_range = 5.0 + kv_min_range = -5.0 + kv_max_range = 5.0 + kv_type = dtype + self.is_quant_flag = is_quant_flag + self.is_quant_offiline = is_quant_offiline + if self.is_quant_flag: + q_min_range = -5 + q_max_range = 5 + kv_min_range = -5 + kv_max_range = 5 + dtype = torch.int8 + kv_type = torch.int8 + if is_int8_flag: + kv_min_range = -5 + kv_max_range = 5 + kv_type = torch.int8 + query = torch.from_numpy(np.random.uniform(q_min_range, q_max_range, size=(num_tokens, num_heads, head_size))).to(dtype) + # (num_blocks, block_size, num_heads, head_size) + if not compressHead: + key_cache = torch.from_numpy(np.random.uniform(kv_min_range, kv_max_range, size=(num_blocks, block_size, kv_heads, head_size))).to(kv_type) + # # (num_blocks, block_size, num_heads, head_size) + value_cache = torch.from_numpy(np.random.uniform(kv_min_range, kv_max_range, size=(num_blocks, block_size, kv_heads, head_size))).to(kv_type) + # (num_blocks, block_size, num_heads, head_size) + else: + key_cache = torch.from_numpy(np.random.uniform(kv_min_range, kv_max_range, size=(num_blocks * kv_heads, block_size, 1, head_size))).to(kv_type) + # 
# (num_blocks, block_size, num_heads, head_size) + value_cache = torch.from_numpy(np.random.uniform(kv_min_range, kv_max_range, size=(num_blocks * kv_heads, block_size, 1, head_size))).to(kv_type) + # (num_blocks, block_size, num_heads, head_size) + self.data_type = dtype + + razor_offset = torch.tensor([], dtype=torch.float32) + if razor_rope: + razor_offset = torch.zeros(num_blocks * kv_heads, block_size) + mask = np.random.choice([False, True], size=num_blocks * kv_heads, p=[0.2, 0.8]) + + random_indices = np.random.randint(0, block_size, size=np.sum(mask)) + random_values = np.random.uniform(0, 20, size=np.sum(mask)) + + active_rows = np.where(mask)[0] + razor_offset[active_rows, random_indices] = torch.from_numpy(random_values).to(torch.float32) + + if dynamic_batch: + context_lens = dynamic_seqlen + else: + context_lens = [k_seqlen] * num_tokens + max_context_len = max(context_lens) + self.max_context_len = max_context_len + batch = len(context_lens) + + # alibi mask + if mask_dim == 4: + mask = np.zeros((batch, num_heads, 1, self.max_context_len), dtype=np.float32) + alibi_slopes = self.get_alibi_slopes(num_heads) + for i, context_len in enumerate(context_lens): + if context_len == 0: + continue + position_ids = np.arange(context_len).astype(np.int32) + alibi_bias = (position_ids - context_len + 1).astype(np.float32) + alibi_bias = alibi_slopes.reshape(-1, 1, 1) * alibi_bias.reshape(1, 1, -1) # (head_num, 1, context) + mask[i, :, :, :context_len] = alibi_bias + mask = torch.from_numpy(mask).to(mask_data_type) + # normal mask headnum, 1, maxS + elif mask_dim == 3: + mask = np.zeros((batch, 1, max_context_len), dtype=np.float16) + for i in range(batch): + mask[i, :, :i] = -10000 + mask = torch.from_numpy(mask).to(mask_data_type) + else: # no mask + mask = None + + if compressHead: + context_lens = [val for val in context_lens for _ in range(kv_heads)] + batch = len(context_lens) + max_num_blocks_per_seq = (max_context_len + block_size - 1) // block_size + 
block_tables = [] # (num_tokens, max_num_blocks_per_seq) + for _ in range(batch): + block_table = [ + random.randint(0, num_blocks - 1) for _ in range(max_num_blocks_per_seq) + ] + block_tables.append(block_table) + self.is_int8_flag = is_int8_flag + + if is_int8_flag: + de_scale1_fp32 = np.random.randint(-1, 2, size=(kv_heads * head_size)).astype(np.float32) + de_scale1_int64 = self.process_deq_scale(de_scale1_fp32) + + de_scale2_fp32 = np.random.randint(-1, 2, size=(kv_heads * head_size)).astype(np.float32) + de_scale2_int64 = self.process_deq_scale(de_scale2_fp32) + + offset1 = np.random.randint(-20, 20, size=(kv_heads * head_size)).astype(np.int32) + + offset2 = np.random.randint(-20, 20, size=(kv_heads * head_size)).astype(np.int32) + + self.de_scale1_int64 = torch.tensor(list(de_scale1_int64), dtype=torch.int64) + self.de_scale2_int64 = torch.tensor(list(de_scale2_int64), dtype=torch.int64) + self.de_scale1_fp32 = torch.from_numpy(de_scale1_fp32) + self.de_scale2_fp32 = torch.from_numpy(de_scale2_fp32) + self.offset1 = torch.from_numpy(offset1) + self.offset2 = torch.from_numpy(offset2) + self.has_bias = has_bias + + if self.is_quant_flag: + self.de_scale1_fp32 = torch.from_numpy(np.random.uniform(-5/127, 5/127, size=(num_heads)).astype(np.float32)).to(torch.float32) + self.de_scale2_fp32 = torch.from_numpy(np.random.uniform(-5/127, 5/127, size=(num_heads)).astype(np.float32)).to(torch.float32) + self.scale = torch.from_numpy(np.random.uniform(0, 127, size=(num_heads)).astype(np.float32)).to(torch.float32) + isLongSeq = max_context_len > blocknum * 128 * 2 and num_tokens < blocknum * 0.8 + if num_tokens * num_heads < 0.8 * blocknum or isLongSeq: + self.kvsplit, self.kv_split_per_core = self.getkvsplit(num_tokens, num_heads, max_context_len, block_size, blocknum, isLongSeq) + else: + self.kvsplit = 1 + self.kv_split_per_core = max_context_len + self.head_num_move = self.get_head_num_move(num_heads, kv_heads, head_size, head_size) + self.block_size_calc = 
self.get_blockszie_calc(max_context_len, block_size, head_size, head_size) + self.block_size = block_size + + ref_output = torch.zeros_like(query).to(torch.float32) + true_out = torch.zeros_like(query, dtype=torch.float32) + + self.ref_single_query_cached_kv_attention( + ref_output, + true_out, + query, + key_cache, + value_cache, + block_tables, + context_lens, + mask, + razor_offset, + razor_rope, + mask_dim, + mask_data_type + ) + self.q = query + self.key_cache = key_cache + self.value_cache = value_cache + self.block_tables = np.array(block_tables).astype(np.int32) + self.contex_lens = np.array(context_lens).astype(np.int32) + self.mask = mask + self.golden_out = ref_output + self.true_out = true_out + self.razor_offset = razor_offset + + def calc_data_bnsd(self, num_tokens, num_heads, kv_heads, head_size, block_size, num_blocks, k_seqlen,\ + dtype, mask_dim = 4, mask_data_type = torch.bfloat16,\ + dynamic_batch = False, dynamic_seqlen = None, is_int8_flag = False, has_bias = False, + compressHead = False, razor_rope = False, scaleType = 0): + self.num_heads = num_heads + self.kv_heads = kv_heads + self.num_tokens = num_tokens + self.compressHead = compressHead + self.head_size = head_size + self.is_quant_flag = 0 + self.scaleType = scaleType + logging.debug(f'input info: {num_tokens}, {num_heads}, {kv_heads}, {head_size}, {block_size}, {num_blocks}, {k_seqlen}, {dtype}') + + query = torch.from_numpy(np.random.uniform(-1.0, 1.0, size=(num_tokens, num_heads, head_size))).to(dtype) + # (num_blocks, block_size, num_heads, head_size) + kv_range = 5.0 + kv_type = dtype + if is_int8_flag: + kv_range = 4.0 + kv_type = torch.int8 + if not compressHead: + key_cache = torch.from_numpy(np.random.uniform(-kv_range, kv_range, size=(num_blocks, block_size, kv_heads, head_size))).to(kv_type) + # (num_blocks, block_size, num_heads, head_size) + value_cache = torch.from_numpy(np.random.uniform(-kv_range, kv_range, size=(num_blocks, block_size, kv_heads, 
head_size))).to(kv_type) + else: + key_cache = torch.from_numpy(np.random.uniform(-kv_range, kv_range, size=(num_blocks * kv_heads, block_size, 1, head_size))).to(kv_type) + # (num_blocks, block_size, num_heads, head_size) + value_cache = torch.from_numpy(np.random.uniform(-kv_range, kv_range, size=(num_blocks * kv_heads, block_size, 1, head_size))).to(kv_type) + self.data_type = dtype + + razor_offset = torch.tensor([], dtype=torch.float32) + if razor_rope: + razor_offset = torch.zeros(num_blocks * kv_heads, block_size) + mask = np.random.choice([False, True], size=num_blocks * kv_heads, p=[0.2, 0.8]) + + random_indices = np.random.randint(0, block_size, size=np.sum(mask)) + random_values = np.random.uniform(0, 20, size=np.sum(mask)) + + active_rows = np.where(mask)[0] + razor_offset[active_rows, random_indices] = torch.from_numpy(random_values).to(torch.float32) + + if dynamic_batch: + context_lens = dynamic_seqlen + else: + context_lens = [k_seqlen] * num_tokens + max_context_len = max(context_lens) + self.max_context_len = max_context_len + batch = len(context_lens) + + # alibi mask + if mask_dim == 4: + mask = np.zeros((batch, num_heads, 1, self.max_context_len), dtype=np.float32) + alibi_slopes = self.get_alibi_slopes(num_heads) + for i, context_len in enumerate(context_lens): + if context_len == 0: + continue + position_ids = np.arange(context_len).astype(np.int32) + alibi_bias = (position_ids - context_len + 1).astype(np.float32) + alibi_bias = alibi_slopes.reshape(-1, 1, 1) * alibi_bias.reshape(1, 1, -1) # (head_num, 1, context) + mask[i, :, :, :context_len] = alibi_bias + mask = torch.from_numpy(mask).to(mask_data_type) + # normal mask + elif mask_dim == 3: + mask = np.zeros((batch, 1, max_context_len), dtype=np.float16) + for i in range(batch): + mask[i, :, :i] = -10000 + mask = torch.from_numpy(mask).to(mask_data_type) + else: # no mask + mask = None + + if compressHead: + context_lens = [val for val in context_lens for _ in range(kv_heads)] + batch = 
len(context_lens) + max_num_blocks_per_seq = (max_context_len + block_size - 1) // block_size + block_tables = [] # (num_tokens, max_num_blocks_per_seq) + for _ in range(batch): + block_table = [ + random.randint(0, num_blocks - 1) for _ in range(max_num_blocks_per_seq) + ] + block_tables.append(block_table) + + self.is_int8_flag = is_int8_flag + if is_int8_flag: + de_scale1_fp32 = np.random.randint(-1, 2, size=(kv_heads * head_size)).astype(np.float32) + de_scale1_int64 = self.process_deq_scale(de_scale1_fp32) + + de_scale2_fp32 = np.random.randint(-1, 2, size=(kv_heads * head_size)).astype(np.float32) + de_scale2_int64 = self.process_deq_scale(de_scale2_fp32) + + offset1 = np.random.randint(-20, 20, size=(kv_heads * head_size)).astype(np.int32) + + offset2 = np.random.randint(-20, 20, size=(kv_heads * head_size)).astype(np.int32) + + self.de_scale1_int64 = torch.tensor(list(de_scale1_int64), dtype=torch.int64) + self.de_scale2_int64 = torch.tensor(list(de_scale2_int64), dtype=torch.int64) + self.de_scale1_fp32 = torch.from_numpy(de_scale1_fp32) + self.de_scale2_fp32 = torch.from_numpy(de_scale2_fp32) + self.offset1 = torch.from_numpy(offset1) + self.offset2 = torch.from_numpy(offset2) + self.has_bias = has_bias + + + ref_output = torch.zeros_like(query) + true_out = torch.zeros_like(query, dtype=torch.float32) + self.ref_single_query_cached_kv_attention( + ref_output, + true_out, + query, + key_cache, + value_cache, + block_tables, + context_lens, + mask, + razor_offset, + razor_rope, + mask_dim, + mask_data_type + ) + + self.q = query + self.key_cache = key_cache + self.key_cache_bnsd = torch.permute(key_cache, (0, 2, 1,3)) + self.value_cache = value_cache + self.value_cache_bnsd = torch.permute(value_cache, (0, 2, 1,3)) + self.block_tables = np.array(block_tables).astype(np.int32) + self.contex_lens = np.array(context_lens).astype(np.int32) + self.mask = mask + self.golden_out = ref_output + self.true_out = true_out + self.razor_offset = razor_offset + + def 
golden_calc(self, in_tensors): + golden_out = torch.tensor(self.golden_out) + return [golden_out.npu()] + + def golden_compare(self, out_tensors, golden_tensors): + if self.data_type == torch.bfloat16 and self.is_int8_flag is True: + result_old = self.compare_output_data(out_tensors, self.true_out.npu(), [0.001, 0.001, 0.005, 0.005]) + else: + result_old = self.compare_output_data(out_tensors, golden_tensors, [0.001, 0.001, 0.005, 0.005]) + result_double = compare_cv(self.true_out.npu(), golden_tensors.npu(), out_tensors.npu()) + return (result_double or result_old) + + def test_paged_fp16_nomask(self): + if not operation_test.get_soc_version() == 'Ascend910B': + print("this testcase only supports Ascend910B") + return + num_tokens = 2 + num_heads = 32 + kv_heads = 16 + block_size = 128 + head_size = 288 + num_blocks = 64 + k_seqlen = 256 + tor = float(1.0 / (head_size ** 0.5)) + mask_dim = 0 + dtype = torch.float16 + + self.calc_data(num_tokens, num_heads, kv_heads, head_size, block_size, num_blocks, k_seqlen, dtype, mask_dim, dtype) + + OP_NAME = "PagedAttentionOperation" + + PARAM = json.dumps({"headNum": num_heads, "qkScale": tor, "kvHeadNum": kv_heads, "maskType":0}) + RUN_PARAM = json.dumps({"contextLens": self.contex_lens.tolist()}) + self.execute_with_param(OP_NAME, PARAM, RUN_PARAM, + [ + self.q.npu(), + self.key_cache.npu(), + self.value_cache.npu(), + torch.from_numpy(self.block_tables.astype(np.int32)).npu(), + torch.from_numpy(self.contex_lens).npu() + ]) + + def test_paged_bf16_nomask(self): + if not operation_test.get_soc_version() == 'Ascend910B': + print("this testcase only supports Ascend910B") + return + num_tokens = 2 + num_heads = 32 + kv_heads = 16 + block_size = 128 + head_size = 288 + num_blocks = 64 + k_seqlen = 256 + tor = float(1.0 / (head_size ** 0.5)) + mask_dim = 0 + dtype = torch.bfloat16 + + self.calc_data(num_tokens, num_heads, kv_heads, head_size, block_size, num_blocks, k_seqlen, dtype, mask_dim, dtype) + + OP_NAME = 
"PagedAttentionOperation" + + PARAM = json.dumps({"headNum": num_heads, "qkScale": tor, "kvHeadNum": kv_heads, "maskType":0}) + RUN_PARAM = json.dumps({"contextLens": self.contex_lens.tolist()}) + self.execute_with_param(OP_NAME, PARAM, RUN_PARAM, + [ + self.q.npu(), + self.key_cache.npu(), + self.value_cache.npu(), + torch.from_numpy(self.block_tables.astype(np.int32)).npu(), + torch.from_numpy(self.contex_lens).npu() + ]) + + def test_paged_fp16_normmask(self): + if not operation_test.get_soc_version() == 'Ascend910B': + print("this testcase only supports Ascend910B") + return + num_tokens = 2 + num_heads = 32 + kv_heads = 16 + block_size = 128 + head_size = 288 + num_blocks = 64 + k_seqlen = 256 + tor = float(1.0 / (head_size ** 0.5)) + mask_dim = 3 + dtype = torch.float16 + + self.calc_data(num_tokens, num_heads, kv_heads, head_size, block_size, num_blocks, k_seqlen, dtype, mask_dim, dtype) + + OP_NAME = "PagedAttentionOperation" + + PARAM = json.dumps({"headNum": num_heads, "qkScale": tor, "kvHeadNum": kv_heads, "maskType":1}) + RUN_PARAM = json.dumps({"contextLens": self.contex_lens.tolist(), "maskType":1}) + self.execute_with_param(OP_NAME, PARAM, RUN_PARAM, + [ + self.q.npu(), + self.key_cache.npu(), + self.value_cache.npu(), + torch.from_numpy(self.block_tables.astype(np.int32)).npu(), + torch.from_numpy(self.contex_lens).npu(), + self.mask.npu() + ]) + + def test_paged_bf16_normmask(self): + if not operation_test.get_soc_version() == 'Ascend910B': + print("this testcase only supports Ascend910B") + return + num_tokens = 2 + num_heads = 32 + kv_heads = 16 + block_size = 128 + head_size = 288 + num_blocks = 64 + k_seqlen = 256 + tor = float(1.0 / (head_size ** 0.5)) + mask_dim = 3 + dtype = torch.bfloat16 + + self.calc_data(num_tokens, num_heads, kv_heads, head_size, block_size, num_blocks, k_seqlen, dtype, mask_dim, dtype) + + OP_NAME = "PagedAttentionOperation" + + PARAM = json.dumps({"headNum": num_heads, "qkScale": tor, "kvHeadNum": kv_heads, 
"maskType":1}) + RUN_PARAM = json.dumps({"contextLens": self.contex_lens.tolist(), "maskType":1}) + self.execute_with_param(OP_NAME, PARAM, RUN_PARAM, + [ + self.q.npu(), + self.key_cache.npu(), + self.value_cache.npu(), + torch.from_numpy(self.block_tables.astype(np.int32)).npu(), + torch.from_numpy(self.contex_lens).npu(), + self.mask.npu() + ]) + + def test_paged_fp16_alibmask(self): + if not operation_test.get_soc_version() == 'Ascend910B': + print("this testcase only supports Ascend910B") + return + num_tokens = 2 + num_heads = 32 + kv_heads = 16 + block_size = 128 + head_size = 288 + num_blocks = 64 + k_seqlen = 256 + tor = float(1.0 / (head_size ** 0.5)) + mask_dim = 4 + dtype = torch.float16 + + self.calc_data(num_tokens, num_heads, kv_heads, head_size, block_size, num_blocks, k_seqlen, dtype, mask_dim, dtype) + + OP_NAME = "PagedAttentionOperation" + + PARAM = json.dumps({"headNum": num_heads, "qkScale": tor, "kvHeadNum": kv_heads, "maskType":2}) + RUN_PARAM = json.dumps({"contextLens": self.contex_lens.tolist(), "maskType":2}) + self.execute_with_param(OP_NAME, PARAM, RUN_PARAM, + [ + self.q.npu(), + self.key_cache.npu(), + self.value_cache.npu(), + torch.from_numpy(self.block_tables.astype(np.int32)).npu(), + torch.from_numpy(self.contex_lens).npu(), + self.mask.npu() + ]) + + def test_paged_bf16_alibmask(self): + if not operation_test.get_soc_version() == 'Ascend910B': + print("this testcase only supports Ascend910B") + return + num_tokens = 2 + num_heads = 32 + kv_heads = 16 + block_size = 128 + head_size = 288 + num_blocks = 64 + k_seqlen = 256 + tor = float(1.0 / (head_size ** 0.5)) + mask_dim = 4 + dtype = torch.bfloat16 + + self.calc_data(num_tokens, num_heads, kv_heads, head_size, block_size, num_blocks, k_seqlen, dtype, mask_dim, dtype) + + OP_NAME = "PagedAttentionOperation" + + PARAM = json.dumps({"headNum": num_heads, "qkScale": tor, "kvHeadNum": kv_heads, "maskType":2}) + RUN_PARAM = json.dumps({"contextLens": self.contex_lens.tolist(), 
"maskType":2}) + self.execute_with_param(OP_NAME, PARAM, RUN_PARAM, + [ + self.q.npu(), + self.key_cache.npu(), + self.value_cache.npu(), + torch.from_numpy(self.block_tables.astype(np.int32)).npu(), + torch.from_numpy(self.contex_lens).npu(), + self.mask.npu() + ]) + + def test_paged_fp16_dequant_nomask(self): + if not operation_test.get_soc_version() == 'Ascend910B': + print("this testcase only supports Ascend910B") + return + num_tokens = 2 + num_heads = 32 + kv_heads = 16 + block_size = 128 + head_size = 128 + num_blocks = 64 + k_seqlen = 256 + tor = float(1.0 / (head_size ** 0.5)) + mask_dim = 0 + dynamic_batch = False + is_int8_flag = True + has_bias = True + dtype = torch.float16 + + self.calc_data(num_tokens, num_heads, kv_heads, head_size, block_size, num_blocks, k_seqlen, dtype, mask_dim, dtype, + is_int8_flag=is_int8_flag, has_bias=has_bias) + + OP_NAME = "PagedAttentionOperation" + + PARAM = json.dumps({"headNum": num_heads, "qkScale": tor, "kvHeadNum": kv_heads, "maskType":0, "quantType":1, "hasQuantOffset":True}) + RUN_PARAM = json.dumps({"contextLens": self.contex_lens.tolist(), "maskType":0}) + self.execute_with_param(OP_NAME, PARAM, RUN_PARAM, + [ + self.q.npu(), + self.key_cache.npu(), + self.value_cache.npu(), + torch.from_numpy(self.block_tables.astype(np.int32)).npu(), + torch.from_numpy(self.contex_lens).npu(), + self.de_scale1_int64.npu(), + self.offset1.npu(), + self.de_scale2_int64.npu(), + self.offset2.npu(), + ]) + + def test_paged_bf16_dequant_nomask(self): + if not operation_test.get_soc_version() == 'Ascend910B': + print("this testcase only supports Ascend910B") + return + num_tokens = 2 + num_heads = 32 + kv_heads = 16 + block_size = 128 + head_size = 128 + num_blocks = 64 + k_seqlen = 256 + tor = float(1.0 / (head_size ** 0.5)) + mask_dim = 0 + dynamic_batch = False + is_int8_flag = True + has_bias = True + dtype = torch.bfloat16 + + self.calc_data(num_tokens, num_heads, kv_heads, head_size, block_size, num_blocks, k_seqlen, dtype, 
mask_dim, dtype, + is_int8_flag=is_int8_flag, has_bias=has_bias) + + OP_NAME = "PagedAttentionOperation" + + PARAM = json.dumps({"headNum": num_heads, "qkScale": tor, "kvHeadNum": kv_heads, "maskType":0, "quantType":1, "hasQuantOffset":True}) + RUN_PARAM = json.dumps({"contextLens": self.contex_lens.tolist(), "maskType":0}) + self.execute_with_param(OP_NAME, PARAM, RUN_PARAM, + [ + self.q.npu(), + self.key_cache.npu(), + self.value_cache.npu(), + torch.from_numpy(self.block_tables.astype(np.int32)).npu(), + torch.from_numpy(self.contex_lens).npu(), + self.de_scale1_fp32.npu(), + self.offset1.npu(), + self.de_scale2_fp32.npu(), + self.offset2.npu(), + ]) + + def test_paged_fp16_dequant_normmask(self): + if not operation_test.get_soc_version() == 'Ascend910B': + print("this testcase only supports Ascend910B") + return + num_tokens = 2 + num_heads = 32 + kv_heads = 16 + block_size = 128 + head_size = 128 + num_blocks = 64 + k_seqlen = 256 + tor = float(1.0 / (head_size ** 0.5)) + mask_dim = 3 + dynamic_batch = False + is_int8_flag = True + has_bias = True + dtype = torch.float16 + + self.calc_data(num_tokens, num_heads, kv_heads, head_size, block_size, num_blocks, k_seqlen, dtype, mask_dim, dtype, + is_int8_flag=is_int8_flag, has_bias=has_bias) + + OP_NAME = "PagedAttentionOperation" + + PARAM = json.dumps({"headNum": num_heads, "qkScale": tor, "kvHeadNum": kv_heads, "maskType":1, "quantType":1, "hasQuantOffset":True}) + RUN_PARAM = json.dumps({"contextLens": self.contex_lens.tolist(), "maskType":1}) + self.execute_with_param(OP_NAME, PARAM, RUN_PARAM, + [ + self.q.npu(), + self.key_cache.npu(), + self.value_cache.npu(), + torch.from_numpy(self.block_tables.astype(np.int32)).npu(), + torch.from_numpy(self.contex_lens).npu(), + self.mask.npu(), + self.de_scale1_int64.npu(), + self.offset1.npu(), + self.de_scale2_int64.npu(), + self.offset2.npu(), + ]) + + def test_paged_bf16_dequant_normmask(self): + if not operation_test.get_soc_version() == 'Ascend910B': + 
print("this testcase only supports Ascend910B") + return + num_tokens = 2 + num_heads = 32 + kv_heads = 16 + block_size = 128 + head_size = 128 + num_blocks = 64 + k_seqlen = 256 + tor = float(1.0 / (head_size ** 0.5)) + mask_dim = 3 + dynamic_batch = False + is_int8_flag = True + has_bias = True + dtype = torch.bfloat16 + + self.calc_data(num_tokens, num_heads, kv_heads, head_size, block_size, num_blocks, k_seqlen, dtype, mask_dim, dtype, + is_int8_flag=is_int8_flag, has_bias=has_bias) + + OP_NAME = "PagedAttentionOperation" + + PARAM = json.dumps({"headNum": num_heads, "qkScale": tor, "kvHeadNum": kv_heads, "maskType":1, "quantType":1, "hasQuantOffset":True}) + RUN_PARAM = json.dumps({"contextLens": self.contex_lens.tolist(), "maskType":1}) + self.execute_with_param(OP_NAME, PARAM, RUN_PARAM, + [ + self.q.npu(), + self.key_cache.npu(), + self.value_cache.npu(), + torch.from_numpy(self.block_tables.astype(np.int32)).npu(), + torch.from_numpy(self.contex_lens).npu(), + self.mask.npu(), + self.de_scale1_fp32.npu(), + self.offset1.npu(), + self.de_scale2_fp32.npu(), + self.offset2.npu(), + ]) + + def test_paged_fp16_dequant_alibmask(self): + if not operation_test.get_soc_version() == 'Ascend910B': + print("this testcase only supports Ascend910B") + return + num_tokens = 2 + num_heads = 32 + kv_heads = 16 + block_size = 128 + head_size = 128 + num_blocks = 64 + k_seqlen = 256 + tor = float(1.0 / (head_size ** 0.5)) + mask_dim = 4 + dynamic_batch = False + is_int8_flag = True + has_bias = True + dtype = torch.float16 + + self.calc_data(num_tokens, num_heads, kv_heads, head_size, block_size, num_blocks, k_seqlen, dtype, mask_dim, dtype, + is_int8_flag=is_int8_flag, has_bias=has_bias) + + OP_NAME = "PagedAttentionOperation" + + PARAM = json.dumps({"headNum": num_heads, "qkScale": tor, "kvHeadNum": kv_heads, "maskType":2, "quantType":1, "hasQuantOffset":True}) + RUN_PARAM = json.dumps({"contextLens": self.contex_lens.tolist(), "maskType":2}) + 
self.execute_with_param(OP_NAME, PARAM, RUN_PARAM, + [ + self.q.npu(), + self.key_cache.npu(), + self.value_cache.npu(), + torch.from_numpy(self.block_tables.astype(np.int32)).npu(), + torch.from_numpy(self.contex_lens).npu(), + self.mask.npu(), + self.de_scale1_int64.npu(), + self.offset1.npu(), + self.de_scale2_int64.npu(), + self.offset2.npu(), + ]) + + def test_paged_bf16_dequant_alibmask(self): + if not operation_test.get_soc_version() == 'Ascend910B': + print("this testcase only supports Ascend910B") + return + num_tokens = 2 + num_heads = 32 + kv_heads = 16 + block_size = 128 + head_size = 128 + num_blocks = 64 + k_seqlen = 256 + tor = float(1.0 / (head_size ** 0.5)) + mask_dim = 4 + dynamic_batch = False + is_int8_flag = True + has_bias = True + dtype = torch.bfloat16 + + self.calc_data(num_tokens, num_heads, kv_heads, head_size, block_size, num_blocks, k_seqlen, dtype, mask_dim, dtype, + is_int8_flag=is_int8_flag, has_bias=has_bias) + + OP_NAME = "PagedAttentionOperation" + + PARAM = json.dumps({"headNum": num_heads, "qkScale": tor, "kvHeadNum": kv_heads, "maskType":2, "quantType":1, "hasQuantOffset":True}) + RUN_PARAM = json.dumps({"contextLens": self.contex_lens.tolist(), "maskType":2}) + self.execute_with_param(OP_NAME, PARAM, RUN_PARAM, + [ + self.q.npu(), + self.key_cache.npu(), + self.value_cache.npu(), + torch.from_numpy(self.block_tables.astype(np.int32)).npu(), + torch.from_numpy(self.contex_lens).npu(), + self.mask.npu(), + self.de_scale1_fp32.npu(), + self.offset1.npu(), + self.de_scale2_fp32.npu(), + self.offset2.npu(), + ]) + + def test_paged_fp16_BNSD(self): + if not operation_test.get_soc_version() == 'Ascend910B': + print("this testcase only supports Ascend910B") + return + num_tokens = 2 + num_heads = 32 + kv_heads = 16 + block_size = 128 + head_size = 128 + num_blocks = 64 + k_seqlen = 256 + tor = float(1.0 / (head_size ** 0.5)) + mask_dim = 0 + dtype = torch.float16 + + self.calc_data_bnsd(num_tokens, num_heads, kv_heads, head_size, 
block_size, num_blocks, k_seqlen, dtype, mask_dim, dtype) + + OP_NAME = "PagedAttentionOperation" + + PARAM = json.dumps({"headNum": num_heads, "qkScale": tor, "kvHeadNum": kv_heads, "maskType":0, "inputLayout":1}) + RUN_PARAM = json.dumps({"contextLens": self.contex_lens.tolist(), "maskType":0}) + self.execute_with_param(OP_NAME, PARAM, RUN_PARAM, + [ + self.q.npu(), + self.key_cache_bnsd.npu(), + self.value_cache_bnsd.npu(), + torch.from_numpy(self.block_tables.astype(np.int32)).npu(), + torch.from_numpy(self.contex_lens).npu() + ]) + + def test_paged_bf16_BNSD(self): + if not operation_test.get_soc_version() == 'Ascend910B': + print("this testcase only supports Ascend910B") + return + num_tokens = 2 + num_heads = 32 + kv_heads = 16 + block_size = 128 + head_size = 128 + num_blocks = 64 + k_seqlen = 256 + tor = float(1.0 / (head_size ** 0.5)) + mask_dim = 0 + dtype = torch.bfloat16 + + self.calc_data_bnsd(num_tokens, num_heads, kv_heads, head_size, block_size, num_blocks, k_seqlen, dtype, mask_dim, dtype) + + OP_NAME = "PagedAttentionOperation" + + PARAM = json.dumps({"headNum": num_heads, "qkScale": tor, "kvHeadNum": kv_heads, "maskType":0, "inputLayout":1}) + RUN_PARAM = json.dumps({"contextLens": self.contex_lens.tolist(), "maskType":0}) + self.execute_with_param(OP_NAME, PARAM, RUN_PARAM, + [ + self.q.npu(), + self.key_cache_bnsd.npu(), + self.value_cache_bnsd.npu(), + torch.from_numpy(self.block_tables.astype(np.int32)).npu(), + torch.from_numpy(self.contex_lens).npu() + ]) + + def test_paged_fp16_BNSD_normmask(self): + if not operation_test.get_soc_version() == 'Ascend910B': + print("this testcase only supports Ascend910B") + return + num_tokens = 2 + num_heads = 32 + kv_heads = 16 + block_size = 128 + head_size = 128 + num_blocks = 64 + k_seqlen = 256 + tor = float(1.0 / (head_size ** 0.5)) + mask_dim = 3 + dtype = torch.float16 + + self.calc_data_bnsd(num_tokens, num_heads, kv_heads, head_size, block_size, num_blocks, k_seqlen, dtype, mask_dim, dtype) + + 
OP_NAME = "PagedAttentionOperation" + + PARAM = json.dumps({"headNum": num_heads, "qkScale": tor, "kvHeadNum": kv_heads, "maskType":1, "inputLayout":1}) + RUN_PARAM = json.dumps({"contextLens": self.contex_lens.tolist(), "maskType":1}) + self.execute_with_param(OP_NAME, PARAM, RUN_PARAM, + [ + self.q.npu(), + self.key_cache_bnsd.npu(), + self.value_cache_bnsd.npu(), + torch.from_numpy(self.block_tables.astype(np.int32)).npu(), + torch.from_numpy(self.contex_lens).npu(), + self.mask.npu() + ]) + + def test_paged_bf16_BNSD_normmask(self): + if not operation_test.get_soc_version() == 'Ascend910B': + print("this testcase only supports Ascend910B") + return + num_tokens = 2 + num_heads = 32 + kv_heads = 16 + block_size = 128 + head_size = 128 + num_blocks = 64 + k_seqlen = 256 + tor = float(1.0 / (head_size ** 0.5)) + mask_dim = 3 + dtype = torch.bfloat16 + + self.calc_data_bnsd(num_tokens, num_heads, kv_heads, head_size, block_size, num_blocks, k_seqlen, dtype, mask_dim, dtype) + + OP_NAME = "PagedAttentionOperation" + + PARAM = json.dumps({"headNum": num_heads, "qkScale": tor, "kvHeadNum": kv_heads, "maskType":1, "inputLayout":1}) + RUN_PARAM = json.dumps({"contextLens": self.contex_lens.tolist(), "maskType":1}) + self.execute_with_param(OP_NAME, PARAM, RUN_PARAM, + [ + self.q.npu(), + self.key_cache_bnsd.npu(), + self.value_cache_bnsd.npu(), + torch.from_numpy(self.block_tables.astype(np.int32)).npu(), + torch.from_numpy(self.contex_lens).npu(), + self.mask.npu() + ]) + + def test_paged_fp16_quant_case_normal_mask(self): + if not operation_test.get_soc_version() == 'Ascend910B': + print("this testcase only supports Ascend910B") + return + num_tokens = 9 + num_heads = 32 + kv_heads = 2 + block_size = 128 + head_size = 128 + num_blocks = 64 + dynamic_batch = True + batch_tatus = [1] * num_tokens + k_seqlen = [3000, 300, 14000, 33, 65, 1, 16, 14000, 300] + tor = 1.0 / (head_size ** 0.5) + dtype = torch.float16 + outDtype = torch.float16 + mask_dim = 3 + is_quant_flag = 
1 + + self.calc_data(num_tokens, num_heads, kv_heads, head_size, block_size, num_blocks, k_seqlen, dtype, mask_dim, outDtype, + dynamic_batch, k_seqlen, is_quant_flag = is_quant_flag) + + OP_NAME = "PagedAttentionOperation" + + PARAM = json.dumps({"headNum": num_heads, "qkScale": tor, "kvHeadNum": kv_heads, "maskType":1, "quantType":3, "outDataType": 1}) + RUN_PARAM = json.dumps({"contextLens": self.contex_lens.tolist(), "maskType":1}) + self.execute_with_param(OP_NAME, PARAM, RUN_PARAM, + [ + self.q.npu(), + self.key_cache.npu(), + self.value_cache.npu(), + torch.from_numpy(self.block_tables.astype(np.int32)).npu(), + torch.from_numpy(self.contex_lens).npu(), + self.mask.npu(), + self.de_scale1_fp32.npu(), + self.de_scale2_fp32.npu(), + ]) + + def test_paged_bf16_quant_case_normal_mask(self): + if not operation_test.get_soc_version() == 'Ascend910B': + print("this testcase only supports Ascend910B") + return + num_tokens = 9 + num_heads = 32 + kv_heads = 2 + block_size = 128 + head_size = 128 + num_blocks = 64 + dynamic_batch = True + batch_tatus = [1] * num_tokens + k_seqlen = [3000, 300, 14000, 33, 65, 1, 16, 14000, 300] + tor = 1.0 / (head_size ** 0.5) + dtype = torch.bfloat16 + outDtype = torch.bfloat16 + mask_dim = 3 + is_quant_flag = 1 + + self.calc_data(num_tokens, num_heads, kv_heads, head_size, block_size, num_blocks, k_seqlen, dtype, mask_dim, outDtype, + dynamic_batch, k_seqlen, is_quant_flag = is_quant_flag) + + OP_NAME = "PagedAttentionOperation" + + PARAM = json.dumps({"headNum": num_heads, "qkScale": tor, "kvHeadNum": kv_heads, "maskType":1, "quantType":3, "outDataType": 27}) + RUN_PARAM = json.dumps({"contextLens": self.contex_lens.tolist(), "maskType":1}) + self.execute_with_param(OP_NAME, PARAM, RUN_PARAM, + [ + self.q.npu(), + self.key_cache.npu(), + self.value_cache.npu(), + torch.from_numpy(self.block_tables.astype(np.int32)).npu(), + torch.from_numpy(self.contex_lens).npu(), + self.mask.npu(), + self.de_scale1_fp32.npu(), + 
self.de_scale2_fp32.npu(), + ]) + + def test_paged_fp16_quantoffline_case_normal_mask(self): + if not operation_test.get_soc_version() == 'Ascend910B': + print("this testcase only supports Ascend910B") + return + num_tokens = 9 + num_heads = 2 + kv_heads = 2 + block_size = 16 + head_size = 128 + num_blocks = 64 + dynamic_batch = True + batch_tatus = [1] * num_tokens + k_seqlen = [3000, 300, 14000, 33, 65, 1, 16, 14000, 300] + tor = 1.0 / (head_size ** 0.5) + dtype = torch.float16 + outDtype = torch.float16 + mask_dim = 3 + is_quant_flag = 1 + + self.calc_data(num_tokens, num_heads, kv_heads, head_size, block_size, num_blocks, k_seqlen, dtype, mask_dim, outDtype, + dynamic_batch, k_seqlen, is_quant_flag = is_quant_flag, is_quant_offiline = 1) + + OP_NAME = "PagedAttentionOperation" + + PARAM = json.dumps({"headNum": num_heads, "qkScale": tor, "kvHeadNum": kv_heads, "maskType":1, "quantType":2, "outDataType": 1}) + RUN_PARAM = json.dumps({"contextLens": self.contex_lens.tolist(), "maskType":1}) + self.execute_with_param(OP_NAME, PARAM, RUN_PARAM, + [ + self.q.npu(), + self.key_cache.npu(), + self.value_cache.npu(), + torch.from_numpy(self.block_tables.astype(np.int32)).npu(), + torch.from_numpy(self.contex_lens).npu(), + self.mask.npu(), + self.de_scale1_fp32.npu(), + self.de_scale2_fp32.npu(), + self.scale.npu(), + ]) + + def test_paged_bf16_quantoffline_case_normal_mask(self): + if not operation_test.get_soc_version() == 'Ascend910B': + print("this testcase only supports Ascend910B") + return + num_tokens = 9 + num_heads = 2 + kv_heads = 2 + block_size = 16 + head_size = 128 + num_blocks = 64 + dynamic_batch = True + batch_tatus = [1] * num_tokens + k_seqlen = [3000, 300, 14000, 33, 65, 1, 16, 14000, 300] + tor = 1.0 / (head_size ** 0.5) + dtype = torch.bfloat16 + outDtype = torch.bfloat16 + mask_dim = 3 + is_quant_flag = 1 + + self.calc_data(num_tokens, num_heads, kv_heads, head_size, block_size, num_blocks, k_seqlen, dtype, mask_dim, outDtype, + dynamic_batch, 
k_seqlen, is_quant_flag = is_quant_flag, is_quant_offiline = 1) + + OP_NAME = "PagedAttentionOperation" + + PARAM = json.dumps({"headNum": num_heads, "qkScale": tor, "kvHeadNum": kv_heads, "maskType":1, "quantType":2, "outDataType": 27}) + RUN_PARAM = json.dumps({"contextLens": self.contex_lens.tolist(), "maskType":1}) + self.execute_with_param(OP_NAME, PARAM, RUN_PARAM, + [ + self.q.npu(), + self.key_cache.npu(), + self.value_cache.npu(), + torch.from_numpy(self.block_tables.astype(np.int32)).npu(), + torch.from_numpy(self.contex_lens).npu(), + self.mask.npu(), + self.de_scale1_fp32.npu(), + self.de_scale2_fp32.npu(), + self.scale.npu(), + ]) +if __name__ == '__main__': + unittest.main() \ No newline at end of file -- Gitee From a48a3857394fbce98606e69f8473162602aa84c2 Mon Sep 17 00:00:00 2001 From: ivanshan_8170 Date: Wed, 24 Sep 2025 18:57:09 +0800 Subject: [PATCH 88/94] bug: mla remove nz for ring --- .../multi_latent_attention_operation.cpp | 7 - .../test_multi_latent_attention_lse.py | 470 ++++++++++++++++++ 2 files changed, 470 insertions(+), 7 deletions(-) create mode 100644 tests/apitest/opstest/python/operations/multi_latent_attention/test_multi_latent_attention_lse.py diff --git a/src/ops_infer/multi_latent_attention/multi_latent_attention_operation.cpp b/src/ops_infer/multi_latent_attention/multi_latent_attention_operation.cpp index 8bdfceaf..35e0643c 100644 --- a/src/ops_infer/multi_latent_attention/multi_latent_attention_operation.cpp +++ b/src/ops_infer/multi_latent_attention/multi_latent_attention_operation.cpp @@ -106,13 +106,6 @@ static bool ParamCheck(const infer::MultiLatentAttentionParam &opParam) ATB_LOG(ERROR) << "only mtp(CALC_TYPE_SPEC) support mask"; return false; } - if ((opParam.calcType == infer::MultiLatentAttentionParam::CalcType::CALC_TYPE_RING || - opParam.calcType == infer::MultiLatentAttentionParam::CalcType::CALC_TYPE_SPEC_AND_RING) && - (opParam.cacheMode != infer::MultiLatentAttentionParam::CacheMode::KROPE_CTKV && - 
opParam.cacheMode != infer::MultiLatentAttentionParam::CacheMode::INT8_NZCACHE)) { - ATB_LOG(ERROR) << "CalcType is ring only support krppe ctkv and int8 nzcache"; - return false; - } if ((opParam.cacheMode == infer::MultiLatentAttentionParam::CacheMode::INT8_NZCACHE) && (opParam.calcType == infer::MultiLatentAttentionParam::CalcType::CALC_TYPE_RING || opParam.calcType == infer::MultiLatentAttentionParam::CalcType::CALC_TYPE_SPEC_AND_RING) && diff --git a/tests/apitest/opstest/python/operations/multi_latent_attention/test_multi_latent_attention_lse.py b/tests/apitest/opstest/python/operations/multi_latent_attention/test_multi_latent_attention_lse.py new file mode 100644 index 00000000..27bba90a --- /dev/null +++ b/tests/apitest/opstest/python/operations/multi_latent_attention/test_multi_latent_attention_lse.py @@ -0,0 +1,470 @@ +# Copyright (c) 2024 Huawei Technologies Co., Ltd. +# This file is a part of the CANN Open Software. +# Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. 
+# + +import logging +import sys +import os +import unittest +import math +import numpy as np +import torch +import random +import json +import torch.nn.functional as F +import torch_npu +sys.path.append(os.path.join(os.path.dirname(__file__), "../")) +import operation_test +from precision_calcu import * + +torch.set_printoptions(precision=4, sci_mode=False) +# torch_npu.npu.set_device() + +class TestPagedAttentionMLA(operation_test.OperationTest): + + def compare_output_data(self, out, golden, ratios): + error_count = 0 + strict_error_count = 0 + fp16_min_normal = 1.0 / (1 << 14) + golden = golden.flatten().to(torch.float32) + out = out.flatten().to(torch.float32) + len = out.shape[0] + diff = torch.abs(golden - out) + max_diff = diff.max().item() + limit_error = torch.maximum(torch.abs(golden * ratios[0]), torch.tensor(ratios[1])) + strict_limit_error = torch.maximum(torch.abs(golden * ratios[2]), torch.tensor(ratios[3])) + error_count = torch.gt(diff, limit_error).sum().item() + strict_error_count = torch.gt(diff, strict_limit_error).sum().item() + logging.info(f"maxDiff {max_diff}") + logging.info("1/1000 Accuracy is %f", 1 - float(error_count) / len) + logging.info("5/1000 Accuracy is %f", 1 - float(strict_error_count) / len) + if self.data_type == torch.bfloat16 or self.is_int8_flag: + logging.info("accuracy is correct in old standard: %r", (float(strict_error_count) / len) <= ratios[2]) + else: + logging.info("accuracy is correct in old standard: %r", (float(strict_error_count) / len) <= ratios[0]) + calc_times = self.head_size_qk * self.max_context_len + 4 + if self.data_type == torch.bfloat16: + if calc_times < 2048: + error = 2 ** (-7) + else: + error = 2 ** (-6) + error_threshold = torch.clamp(torch.abs(golden), min=1) * error + res = (diff <= error_threshold).all().item() + logging.debug("accuracy is correct in new standard: %r", res) + return res + else: + if calc_times < 2048: + error = 2 ** (-8) + else: + error = 2 ** (-7) + error_threshold = 
torch.clamp(torch.abs(golden), min=1) * error + res = (diff <= error_threshold).all().item() + logging.debug("accuracy is correct in new standard: %r", res) + return res + + def get_alibi_slopes(self, n_heads): + n = 2 ** math.floor(math.log2(n_heads)) + m0 = 2.0 ** (-8.0 / n) + slopes = torch.pow(m0, torch.arange(1, n + 1)) + if n < n_heads: + m1 = 2.0 ** (-4.0 / n) + mm = torch.pow(m1, torch.arange(1, 1 + 2 * (n_heads - n), 2)) + slopes = torch.cat([slopes, mm]) + # slopes = torch.ones(n_heads) + return slopes + + def group_mm_torch(self, heads, group_num, A, B, is_k): + group_head = heads // group_num + score_high = None + for i in range(group_num): + if self.is_int8_flag: + int8_B = B[i: (i + 1), :, :, ] + head_dim = int8_B.shape[2] + int32_B = torch.matmul(torch.eye(int8_B.shape[1]).to(torch.float32), int8_B.to(torch.float32)).to( + torch.int32) + if is_k: + if self.has_bias: + int32_B = int32_B + self.offset1[i * head_dim:(i + 1) * head_dim] + fp32_B = int32_B.to(torch.float32) * self.de_scale1_fp32[i * head_dim:(i + 1) * head_dim] + fp32_B = torch.permute(fp32_B, (0, 2, 1)) + else: + if self.has_bias: + int32_B = int32_B + self.offset2[i * head_dim:(i + 1) * head_dim] + fp32_B = int32_B.to(torch.float32) * self.de_scale2_fp32[i * head_dim:(i + 1) * head_dim] + group_score_high = torch.matmul(A[i * group_head: (i + 1) * group_head, :, :].to(torch.float32), + fp32_B) + else: + group_score_high = torch.matmul(A[i * group_head: (i + 1) * group_head, :, :].to(torch.float32), + B[i:(i + 1), :, :].to(torch.float32)) + if score_high is None: + score_high = group_score_high + else: + score_high = torch.cat((score_high, group_score_high), 0) + return score_high + + def process_deq_scale(self, deq_scale) -> np.ndarray: + new_deq_scale = np.frombuffer(deq_scale.tobytes(), dtype=np.uint32) + return new_deq_scale.astype(np.uint64) + + def softmax(self, sim): + row_max = torch.max(sim, axis=-1, keepdims=True)[0] + sim_sub = sim - row_max + sim_sub = torch.exp(sim_sub) + 
row_sum = torch.sum(sim_sub, axis=-1, keepdims=True) + soft_res = sim_sub / row_sum + return soft_res + + def softmax_numpy(self, sim): + sim = sim.cpu().numpy() + row_max = np.max(sim, axis=-1, keepdims=True) + sim_sub = sim - row_max + sim_sub = np.exp(sim_sub) + # print(sim_sub) + row_sum = np.sum(sim_sub, axis=-1, keepdims=True) + soft_res = sim_sub / row_sum + return soft_res, row_max + np.log(row_sum) + + def shape_nd_to_nz(self, shape, dtype='float16'): + assert len(shape) >= 2 + batch = shape[:-2] # 最后两维nd->nz + a, b = shape[-2], shape[-1] + a0, b0 = 16, 16 + return list(batch) + [math.ceil(b / b0), math.ceil(a / a0), a0, b0] + + def gen_axes_for_transpose(self,offset, base): + return [x for x in range(offset)] + [x + offset for x in base] + + def convert_nd_to_nz(self, x): + array_trans = self.gen_axes_for_transpose(len(x.shape) - 2, [2, 0, 1, 3]) # (m1, m0, n1, n0) -> (n1, m1, m0, n0) + x_shape = self.shape_nd_to_nz(x.shape, dtype=x.dtype) + *_, n1, m1, m0, n0 = x_shape + return x.reshape(x_shape[:-4] + [m1, m0, n1, n0]).permute(*array_trans) # x原始需要对齐,才能reshape + + def ref_masked_attention(self, + query, # (1, num_heads, head_size) + key, # (context_len, kv_heads, head_size) + value, + scale: float, + alibi_bias, + mask_data_type=torch.bfloat16 + ): + # Q * K.T + query = query + query = torch.permute(query, (1, 0, 2)) + if not self.is_int8_flag: + key = torch.permute(key, (1, 2, 0)) # 0 1 2 + else: + key = torch.permute(key, (1, 0, 2)) + sim_high = self.group_mm_torch(query.shape[0], key.shape[0], query, key, 1) # (head_num, q_seqlen, k_seqlen) + sim_out = sim_high.to(torch.float32) + sim_high = sim_high.to(torch.float32) * scale + if alibi_bias is not None: + sim_high = sim_high + alibi_bias.to(torch.float32) + # softmax + p_high, lse = self.softmax_numpy(sim_high) + p = torch.from_numpy(p_high).to(mask_data_type) + p_high = torch.from_numpy(p_high) + + lse = torch.permute(torch.from_numpy(lse).to(mask_data_type), (1, 0, 2)) # (q_seqlen, head_num, 1) + 
+ # P * V + value = torch.permute(value, (1, 0, 2)) + out = self.group_mm_torch(query.shape[0], key.shape[0], p, value, 0) + out_high = self.group_mm_torch(query.shape[0], key.shape[0], p_high, value, 0) + out = torch.permute(out, (1, 0, 2)) + out_high = torch.permute(out_high, (1, 0, 2)) + sim_out = torch.permute(sim_out, (1, 0, 2)) + return out, out_high, sim_out, lse + + def ref_single_query_cached_kv_attention(self, + sim, + output, + true_out, + lse, # (num_tokens, num_heads, 1) + query, + key_cache, # (num_blocks, block_size, num_heads, head_size) + value_cache, # (num_blocks, block_size, num_heads, head_size) + block_tables, + context_lens, + mask, + mask_dim=4, + mask_data_type=torch.bfloat16 + ) -> None: + mask_index_coff = 1 + if self.compressHead: + query = query.view(self.num_tokens * self.kv_heads, self.num_heads // self.kv_heads, self.head_size_qk) + output = output.view(self.num_tokens * self.kv_heads, self.num_heads // self.kv_heads, self.head_size_vo) + true_out = true_out.view(self.num_tokens * self.kv_heads, self.num_heads // self.kv_heads, + self.head_size_vo) + if mask_dim == 4: + mask_shape = mask.shape + mask = mask.view(mask_shape[0] * self.kv_heads, self.num_heads // self.kv_heads, 1, + self.max_context_len) + else: + mask_index_coff = self.kv_heads + num_heads = query.shape[1] + kv_heads = value_cache.shape[2] + head_size_qk = key_cache.shape[3] + head_size_vo = value_cache.shape[3] + block_size = value_cache.shape[1] + + num_input_tokens = query.shape[0] + index = 0 + for i in range(len(context_lens)): + block_table = block_tables[i] + context_len = int(context_lens[i]) + if context_len == 0: + continue + + q = query[index].view(1, num_heads, head_size_qk) + keys = [] + values = [] + for j in range(context_len): + block_number = int(block_table[j // block_size]) + block_offset = j % block_size + + k = key_cache[block_number, block_offset, :, :] + k = k.reshape(kv_heads, head_size_qk) + keys.append(k) + + v = value_cache[block_number, 
block_offset, :, :] + v = v.reshape(kv_heads, head_size_vo) + values.append(v) + keys = torch.stack(keys, axis=0) + values = torch.stack(values, axis=0) + scale = np.float32(1.0 / (head_size_qk ** 0.5)) + if mask_dim == 4: + out, out_high, sim_out, _ = self.ref_masked_attention(q, keys, values, scale, + mask[i, :, :, :context_len], mask_data_type) + out = out.reshape(num_heads, head_size_vo) + elif mask_dim == 3: + out, out_high, sim_out, _ = self.ref_masked_attention(q, keys, values, scale, + mask[i // mask_index_coff, :, :context_len], + mask_data_type) + out = out.reshape(num_heads, head_size_vo) + else: + out, out_high, sim_out, lse_i = self.ref_masked_attention(q, keys, values, scale, mask, + mask_data_type) + out = out.reshape(num_heads, head_size_vo) + lse_i = lse_i.reshape(num_heads, 1) + lse[index] = lse_i.to(mask_data_type) + out_high = out_high.reshape(num_heads, head_size_vo) + sim_out = sim_out.reshape(1, num_heads * context_len) + output[index] = out.to(mask_data_type) + true_out[index] = out_high + sim[index] = sim_out + index = index + 1 + + def calc_data(self, num_tokens, num_heads, kv_heads, head_size_qk, head_size_vo, block_size, num_blocks, k_seqlen,\ + dtype, mask_dim = 0, mask_data_type = torch.bfloat16,\ + dynamic_batch = False, dynamic_seqlen = None, is_int8_flag = False, has_bias = False, + compressHead = False, is_kv_combined = True, is_nz_in = False): + self.num_heads = num_heads + self.kv_heads = kv_heads + self.num_tokens = num_tokens + self.compressHead = compressHead + self.head_size_qk = head_size_qk + self.head_size_vo = head_size_vo + + logging.debug( + f'input info: {num_tokens}, {num_heads}, {kv_heads}, {head_size_qk}, {head_size_vo}, {block_size}, {num_blocks}, {k_seqlen}, {dtype}') + + q_range = 5.0 + query = torch.from_numpy(np.random.uniform(-q_range, q_range, size=(num_tokens, num_heads, head_size_qk))).to(dtype) + # (num_blocks, block_size, num_heads, head_size) + kv_range = 5.0 + kv_type = dtype + if is_int8_flag: + 
kv_type = torch.int8 + if not compressHead: + key_cache = torch.from_numpy( + np.random.uniform(-kv_range, kv_range, size=(num_blocks, block_size, kv_heads, head_size_qk))).to( + kv_type) + # (num_blocks, block_size, num_heads, head_size) + if not is_kv_combined: + value_cache = torch.from_numpy( + np.random.uniform(-kv_range, kv_range, size=(num_blocks, block_size, kv_heads, head_size_vo))).to( + kv_type) + else: + value_cache = key_cache[:, :, :, :head_size_vo] + else: + key_cache = torch.from_numpy( + np.random.uniform(-kv_range, kv_range, size=(num_blocks * kv_heads, block_size, 1, head_size_qk))).to( + kv_type) + # (num_blocks, block_size, num_heads, head_size) + if not is_kv_combined: + value_cache = torch.from_numpy(np.random.uniform(-kv_range, kv_range, size=( + num_blocks * kv_heads, block_size, 1, head_size_vo))).to(kv_type) + else: + value_cache = key_cache[:, :, :, :head_size_vo] + self.data_type = dtype + + if dynamic_batch: + context_lens = dynamic_seqlen + else: + context_lens = [k_seqlen] * num_tokens + max_context_len = max(context_lens) + self.max_context_len = max_context_len + batch = len(context_lens) + + # alibi mask + if mask_dim == 4: + mask = np.zeros((batch, num_heads, 1, self.max_context_len), dtype=np.float32) + alibi_slopes = self.get_alibi_slopes(num_heads) + for i, context_len in enumerate(context_lens): + if context_len == 0: + continue + position_ids = np.arange(context_len).astype(np.int32) + alibi_bias = (position_ids - context_len + 1).astype(np.float32) + alibi_bias = alibi_slopes.reshape(-1, 1, 1) * alibi_bias.reshape(1, 1, -1) # (head_num, 1, context) + mask[i, :, :, :context_len] = alibi_bias + mask = torch.from_numpy(mask).to(mask_data_type) + # normal mask + elif mask_dim == 3: + mask = np.zeros((batch, 1, max_context_len), dtype=np.float16) + for i in range(batch): + mask[i, :, :i] = -10000 + mask = torch.from_numpy(mask).to(mask_data_type) + else: # no mask + mask = None + + if compressHead: + context_lens = [val for val 
in context_lens for _ in range(kv_heads)] + batch = len(context_lens) + max_num_blocks_per_seq = (max_context_len + block_size - 1) // block_size + block_tables = [] # (num_tokens, max_num_blocks_per_seq) + for i in range(batch): + block_table = [ + i * max_num_blocks_per_seq + _ for _ in range(max_num_blocks_per_seq) + ] + block_tables.append(block_table) + + self.is_int8_flag = is_int8_flag + if is_int8_flag: + de_scale1_fp32 = np.random.randint(-1, 2, size=(kv_heads * head_size)).astype(np.float32) + de_scale1_int64 = self.process_deq_scale(de_scale1_fp32) + + de_scale2_fp32 = np.random.randint(-1, 2, size=(kv_heads * head_size)).astype(np.float32) + de_scale2_int64 = self.process_deq_scale(de_scale2_fp32) + + offset1 = np.random.randint(-20, 20, size=(kv_heads * head_size)).astype(np.int32) + + offset2 = np.random.randint(-20, 20, size=(kv_heads * head_size)).astype(np.int32) + + self.de_scale1_int64 = torch.tensor(list(de_scale1_int64), dtype=torch.int64) + self.de_scale2_int64 = torch.tensor(list(de_scale2_int64), dtype=torch.int64) + self.de_scale1_fp32 = torch.from_numpy(de_scale1_fp32) + self.de_scale2_fp32 = torch.from_numpy(de_scale2_fp32) + self.offset1 = torch.from_numpy(offset1) + self.offset2 = torch.from_numpy(offset2) + self.has_bias = has_bias + + shape_out = (num_tokens, num_heads, head_size_vo) + ref_output = torch.zeros(shape_out, dtype=dtype) + true_out = torch.zeros(shape_out, dtype=torch.float32) + sim = torch.zeros((num_tokens, num_heads * k_seqlen), dtype=torch.float32) + lse = torch.zeros((num_tokens, num_heads, 1), dtype=dtype) + self.ref_single_query_cached_kv_attention( + sim, + ref_output, + true_out, + lse, + query, + key_cache, + value_cache, + block_tables, + context_lens, + mask, + mask_dim, + mask_data_type + ) + + self.q_split1, self.q_split2 = torch.split(query, [512, 64], dim=2) + self.key_cache_split1, self.key_cache_split2 = torch.split(key_cache, [512, 64], dim=3) + self.value_cache = value_cache + + if (is_nz_in): + 
key_cache_split1, key_cache_split2 = torch.split(key_cache, [512, 64], dim=3) + key_cache_split1 = key_cache_split1.reshape(num_blocks, block_size, -1) + key_cache_split2 = key_cache_split2.reshape(num_blocks, block_size, -1) + key_cache_split1_nz = self.convert_nd_to_nz(key_cache_split1) + key_cache_split2_nz = self.convert_nd_to_nz(key_cache_split2) + self.key_cache_split1 = key_cache_split1_nz.to(mask_data_type).reshape(num_blocks, -1, block_size, 16) + self.key_cache_split2 = key_cache_split2_nz.to(mask_data_type).reshape(num_blocks, -1, block_size, 16) + + self.block_tables = np.array(block_tables).astype(np.int32) + self.contex_lens = np.array(context_lens).astype(np.int32) + self.alib_mask = mask + self.golden_out = ref_output + self.true_out = true_out + self.lse = lse + + def golden_calc(self, in_tensors): + golden_out = torch.tensor(self.golden_out) + return [golden_out, self.lse] + + def golden_compare(self, out_tensors, golden_tensors): + go_double = compare_cv(self.true_out, golden_tensors[0].cpu(), out_tensors[0].cpu()) + result_old = self.compare_output_data(out_tensors[0].npu(), golden_tensors[0].npu(), [0.001, 0.001, 0.005, 0.005]) + lse_double = True + lse_old = True + if self.is_ring: + lse_double = compare_cv(golden_tensors[1].npu(), golden_tensors[1].npu(), out_tensors[1].npu()) + lse_old = self.compare_output_data(out_tensors[1].npu(), golden_tensors[1].npu(), [0.001, 0.001, 0.005, 0.005]) + return (result_old) and (lse_double or lse_old) + + def test_paged_mla_combine_cache_norm_128_nz(self): + if not operation_test.get_soc_version() == 'Ascend910B': + print("this testcase only supports Ascend910B") + return + num_tokens = 32 + num_heads = 32 + kv_heads = 1 + block_size = 128 + head_size_qk = 576 + head_size_vo = 512 + num_blocks = 64 + k_seqlen = 256 + tor = 1.0 / (head_size_qk ** 0.5) + mask_dim = 0 + dtype = torch.float16 + is_kv_combined = True + self.is_ring = 1 + is_nz_in = True + + self.calc_data(num_tokens, num_heads, kv_heads, 
head_size_qk, head_size_vo, block_size, num_blocks, k_seqlen, + dtype, mask_dim, dtype, + is_kv_combined=is_kv_combined, is_nz_in=is_nz_in) + + OP_NAME = "MLAOperation" + OP_PARAM = {"type": 0, "kvHead": kv_heads, "headSize": num_heads, "tor": tor, + "kvSeqLen": self.contex_lens.tolist(), "isRing": self.is_ring} + logging.debug(f"blcok_tables shape: {self.block_tables}") + logging.debug(f"contex_lens shape: {self.contex_lens}") + logging.debug(f"numTokens: {num_tokens}, numHeads: {num_heads}, kvHead: {kv_heads}" + f", blockSize: {block_size}, headSizeQK: {head_size_qk}, headSizeVO: {head_size_vo}, numBlocks: {num_blocks}") + logging.info(f"Q1 shape: {self.q_split1.shape}") + logging.info(f"Q2 shape: {self.q_split2.shape}") + logging.info(f"K1 shape: {self.key_cache_split1.shape}") + logging.info(f"K2 shape: {self.key_cache_split2.shape}") + + OP_NAME = "MultiLatentAttentionOperation" + PARAM = json.dumps({"headNum": num_heads, "qkScale":tor, "kvHeadNum":kv_heads, "maskType": 0, "cacheMode": 3, "calcType": 2}) + RUN_PARAM = json.dumps({"contextLens": self.contex_lens.tolist()}) + self.execute_with_param(OP_NAME, PARAM, RUN_PARAM, + [ + self.q_split1.npu(), + self.q_split2.npu(), + torch.tensor(self.key_cache_split1).npu(), + torch.tensor(self.key_cache_split2).npu(), + torch.tensor(self.block_tables).int().npu(), + torch.tensor(self.contex_lens).npu() + ]) + +if __name__ == '__main__': + unittest.main() -- Gitee From 60c06b4c9e3af50357453e4cb2423bad36f5c443 Mon Sep 17 00:00:00 2001 From: ivanshan_8170 Date: Wed, 24 Sep 2025 20:12:21 +0800 Subject: [PATCH 89/94] remove test case --- tests/apitest/opstest/csv/multi_latent_attention.csv | 2 -- 1 file changed, 2 deletions(-) diff --git a/tests/apitest/opstest/csv/multi_latent_attention.csv b/tests/apitest/opstest/csv/multi_latent_attention.csv index 09c3ff7f..f5a43959 100644 --- a/tests/apitest/opstest/csv/multi_latent_attention.csv +++ b/tests/apitest/opstest/csv/multi_latent_attention.csv @@ -1,14 +1,12 @@ CaseNum 
|CaseName |OpName |OpParam |InNum |InDType |InFormat |InShape |OutNum |OutDType |OutFormat |OutShape |DataGenType |DataGenRange |InTensorFile |OutTensorFile |TestType |TestLevel |FromModel |SocVersion |ExpectedError 1 |MultiLatentAttentionBadCaseHeadNum |MultiLatentAttentionOperation |{"maskType":1,"calcType":2,"cacheMode":1} | 6 |float16;float16;float16;float16;int32;int32 |nd;nd;nd;nd;nd;nd|32,32,512;32,32,64;64,128,1,512;64,128,1,64;32,2;32|2|float16;float16|nd;nd|32,32,512;32,32,1|random;random;random;random;random;random|-100,100;-100,100;-100,100;-100,100;-100,100;-100,100| | | | | |Ascend910B |C:ERROR_INVALID_PARAM 2 |MultiLatentAttentionBadCaseKvHeadNumNot1 |MultiLatentAttentionOperation |{"maskType":0,"calcType":2,"cacheMode":3,"headNum":8,"kvHeadNum":2} | 6 |float16;float16;float16;float16;int32;int32 |nd;nd;nd;nd;nd;nd|32,32,512;32,32,64;64,128,1,512;64,128,1,64;32,2;32|2 |float16;float16|nd;nd|32,32,512;32,32,1|random;random;random;random;random;random|-100,100;-100,100;-100,100;-100,100;-100,100;-100,100| | | | | |Ascend910B |C:ERROR_INVALID_PARAM -3 |MultiLatentAttentionBadCaseInt8NzCacheHeadNum128 |MultiLatentAttentionOperation |{"maskType":0,"calcType":2,"cacheMode":3,"headNum":128,"kvHeadNum":1} | 6 |float16;float16;float16;float16;int32;int32 |nd;nd;nd;nd;nd;nd|32,32,512;32,32,64;64,128,1,512;64,128,1,64;32,2;32|1 |float16|nd|32,32,512|random;random;random;random;random;random|-100,100;-100,100;-100,100;-100,100;-100,100;-100,100| | | | | |Ascend910B |C:ERROR_INVALID_PARAM 4 |MultiLatentAttentionErrorQkScale |MultiLatentAttentionOperation |{"maskType":0,"calcType":2,"cacheMode":1,"headNum":16,"kvHeadNum":1,"qkScale":100} | 6 |float16;float16;float16;float16;int32;int32 |nd;nd;nd;nd;nd;nd|32,32;32,32,64;64,128,1,512;64,128,1,64;32,2;32|2 |float16;float16|nd;nd|32,32,512;32,32,1|random;random;random;random;random;random|-100,100;-100,100;-100,100;-100,100;-100,100;-100,100| | | | | |Ascend910B |C:ERROR_INVALID_PARAM 5 
|MultiLatentAttentionErrorMaskType |MultiLatentAttentionOperation |{"maskType":3,"calcType":2,"cacheMode":1,"headNum":16,"kvHeadNum":1} | 6 |float16;float16;float16;float16;int32;int32 |nd;nd;nd;nd;nd;nd|32,32;32,32,64;64,128,1,512;64,128,1,64;32,2;32|2 |float16;float16|nd;nd|32,32,512;32,32,1|random;random;random;random;random;random|-100,100;-100,100;-100,100;-100,100;-100,100;-100,100| | | | | |Ascend910B |C:ERROR_INVALID_PARAM 6 |MultiLatentAttentionInvalidCalcType |MultiLatentAttentionOperation |{"maskType":0,"calcType":5,"cacheMode":3,"headNum":16,"kvHeadNum":1} | 6 |float16;float16;float16;float16;int32;int32 |nd;nd;nd;nd;nd;nd|32,32;32,32,64;64,128,1,512;64,128,1,64;32,2;32|2 |float16;float16|nd;nd|32,32,512;32,32,1|random;random;random;random;random;random|-100,100;-100,100;-100,100;-100,100;-100,100;-100,100| | | | | |Ascend910B |C:ERROR_INVALID_PARAM 7 |MultiLatentAttentionInvalidCacheType |MultiLatentAttentionOperation |{"maskType":0,"calcType":2,"cacheMode":4,"headNum":16,"kvHeadNum":1} | 6 |float16;float16;float16;float16;int32;int32 |nd;nd;nd;nd;nd;nd|32,32;32,32,64;64,128,1,512;64,128,1,64;32,2;32|2 |float16;float16|nd;nd|32,32,512;32,32,1|random;random;random;random;random;random|-100,100;-100,100;-100,100;-100,100;-100,100;-100,100| | | | | |Ascend910B |C:ERROR_INVALID_PARAM 8 |MultiLatentAttentionBadCaseNotSupportedKvCache |MultiLatentAttentionOperation |{"maskType":0,"calcType":0,"cacheMode":0, "headNum": 8, "kvHeadNum": 1} | 6 |float16;float16;float16;float16;int32;int32 |nd;nd;nd;nd;nd;nd|32,32,512;32,32,64;64,128,1,512;64,128,1,64;32,2;32|1 |float16|nd|32,32,512|random;random;random;random;random;random|-100,100;-100,100;-100,100;-100,100;-100,100;-100,100| | | | | |Ascend910B |C:ERROR_INVALID_PARAM 9 |MultiLatentAttentionNoError |MultiLatentAttentionOperation |{"maskType":1,"calcType":2,"cacheMode":1,"headNum":16,"kvHeadNum":1} | 6 |float16;float16;float16;float16;int32;int32 
|nd;nd;nd;nd;nd;nd|32,32;32,32,64;64,128,1,512;64,128,1,64;32,2;32|2 |float16;float16|nd;nd|32,32,512;32,32,1|random;random;random;random;random;random|-100,100;-100,100;-100,100;-100,100;-100,100;-100,100| | | | | |Ascend910B |C:ERROR_INVALID_PARAM -10 |MultiLatentAttentionNoError |MultiLatentAttentionOperation |{"maskType":0,"calcType":2,"cacheMode":3,"headNum":16,"kvHeadNum":1} | 6 |float16;float16;float16;float16;int32;int32 |nd;nd;nd;nd;nd;nd|32,32;32,32,64;64,128,1,512;64,128,1,64;32,2;32|2 |float16;float16|nd;nd|32,32,512;32,32,1|random;random;random;random;random;random|-100,100;-100,100;-100,100;-100,100;-100,100;-100,100| | | | | |Ascend910B |C:ERROR_INVALID_PARAM 11 |MultiLatentAttentionWrongDimNum |MultiLatentAttentionOperation |{"maskType":0,"calcType":2,"cacheMode":1,"headNum":32,"kvHeadNum":1} | 6 |float16;float16;float16;float16;int32;int32 |nd;nd;nd;nd;nd;nd|32,32,512;32,32,64;64,128,1,512;64,128,1;32,2;32|2 |float16;float16|nd;nd|32,32,512;32,32,1|random;random;random;random;random;random|-100,100;-100,100;-100,100;-100,100;-100,100;-100,100| | | | | |Ascend910B |I:ERROR_INVALID_TENSOR_DIM_NUM 12 |MultiLatentAttentionErrorBatchExceeded |MultiLatentAttentionOperation |{"maskType":0,"calcType":2,"cacheMode":1,"headNum":32,"kvHeadNum":1} | 6 |float16;float16;float16;float16;int32;int32 |nd;nd;nd;nd;nd;nd|32,32,512;32,32,64;64,128,1,512;64,128,1,64;8200,2;32|2 |float16;float16|nd;nd|32,32,512;32,32,1|random;random;random;random;random;random|-100,100;-100,100;-100,100;-100,100;-100,100;-100,100| | | | | |Ascend910B |I:ERROR_INVALID_TENSOR_DIM 13 |MultiLatentAttentionBatchNotSame |MultiLatentAttentionOperation |{"maskType":0,"calcType":2,"cacheMode":1,"headNum":32,"kvHeadNum":1} | 6 |float16;float16;float16;float16;int32;int32 |nd;nd;nd;nd;nd;nd|32,32,512;32,32,64;64,128,1,512;64,128,1,64;64,2;32|2 |float16;float16|nd;nd|32,32,512;32,32,1|random;random;random;random;random;random|-100,100;-100,100;-100,100;-100,100;-100,100;-100,100| | | | | 
|Ascend910B |I:ERROR_INVALID_TENSOR_DIM -- Gitee From 2dfb00bd446517b915cd3bd67f67614181636270 Mon Sep 17 00:00:00 2001 From: huangxiaolan Date: Thu, 25 Sep 2025 11:49:57 +0800 Subject: [PATCH 90/94] delete const --- src/kernels/tbe_adapter/platform/platform_ascendc.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/kernels/tbe_adapter/platform/platform_ascendc.cpp b/src/kernels/tbe_adapter/platform/platform_ascendc.cpp index 15c771eb..142befbe 100644 --- a/src/kernels/tbe_adapter/platform/platform_ascendc.cpp +++ b/src/kernels/tbe_adapter/platform/platform_ascendc.cpp @@ -41,7 +41,7 @@ const static std::map CONVERT_MAP = { {"Ascend910_93", SocVersion::ASCEND910B}, }; -static inline uint32_t GetCoreNumByType(const fe::PlatFormInfos &platformInfo, bool isAiv) +static inline uint32_t GetCoreNumByType(fe::PlatFormInfos &platformInfo, bool isAiv) { std::string key; std::string val; -- Gitee From d105acfca6601bc3e142a5753cddf76e4476a67a Mon Sep 17 00:00:00 2001 From: huangxiaolan Date: Thu, 25 Sep 2025 13:05:25 +0800 Subject: [PATCH 91/94] recover tbe_adapter --- src/kernels/tbe_adapter/platform/platform_ascendc.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/kernels/tbe_adapter/platform/platform_ascendc.cpp b/src/kernels/tbe_adapter/platform/platform_ascendc.cpp index 142befbe..e1d924f2 100644 --- a/src/kernels/tbe_adapter/platform/platform_ascendc.cpp +++ b/src/kernels/tbe_adapter/platform/platform_ascendc.cpp @@ -41,11 +41,11 @@ const static std::map CONVERT_MAP = { {"Ascend910_93", SocVersion::ASCEND910B}, }; -static inline uint32_t GetCoreNumByType(fe::PlatFormInfos &platformInfo, bool isAiv) +static inline uint32_t GetCoreNumByType(fe::PlatFormInfos *platformInfo, bool isAiv) { std::string key; std::string val; - bool ret = platformInfo.GetPlatformResWithLock(STR_SOC_INFO, STR_SPLIT_KEY, val); + bool ret = platformInfo->GetPlatformResWithLock(STR_SOC_INFO, STR_SPLIT_KEY, val); MKI_LOG_IF(!ret, ERROR) << "get 
platform failed, val is " << val; if (STR_SPLIT_VAL.compare(val) != 0) { @@ -55,7 +55,7 @@ static inline uint32_t GetCoreNumByType(fe::PlatFormInfos &platformInfo, bool is } else { key = STR_CORE_CNT_CUB; } - ret = platformInfo.GetPlatformResWithLock(STR_SOC_INFO, key, val); + ret = platformInfo->GetPlatformResWithLock(STR_SOC_INFO, key, val); MKI_LOG_IF(!ret, ERROR) << "get platform failed, key is " << key << ", val is" << val; return val.empty() ? 0 : static_cast(std::atoi(val.c_str())); } -- Gitee From 102822bbdecb71b2cd3d53f78b62e67b2661fddd Mon Sep 17 00:00:00 2001 From: huangxiaolan Date: Thu, 25 Sep 2025 14:27:27 +0800 Subject: [PATCH 92/94] fix func para --- src/kernels/tbe_adapter/platform/platform_ascendc.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/kernels/tbe_adapter/platform/platform_ascendc.cpp b/src/kernels/tbe_adapter/platform/platform_ascendc.cpp index e1d924f2..8b5fe1a1 100644 --- a/src/kernels/tbe_adapter/platform/platform_ascendc.cpp +++ b/src/kernels/tbe_adapter/platform/platform_ascendc.cpp @@ -41,11 +41,11 @@ const static std::map CONVERT_MAP = { {"Ascend910_93", SocVersion::ASCEND910B}, }; -static inline uint32_t GetCoreNumByType(fe::PlatFormInfos *platformInfo, bool isAiv) +static inline uint32_t GetCoreNumByType(const fe::PlatFormInfos &platformInfo, bool isAiv) { std::string key; std::string val; - bool ret = platformInfo->GetPlatformResWithLock(STR_SOC_INFO, STR_SPLIT_KEY, val); + bool ret = platformInfo.GetPlatformResWithLock(STR_SOC_INFO, STR_SPLIT_KEY, val); MKI_LOG_IF(!ret, ERROR) << "get platform failed, val is " << val; if (STR_SPLIT_VAL.compare(val) != 0) { @@ -55,7 +55,7 @@ static inline uint32_t GetCoreNumByType(fe::PlatFormInfos *platformInfo, bool is } else { key = STR_CORE_CNT_CUB; } - ret = platformInfo->GetPlatformResWithLock(STR_SOC_INFO, key, val); + ret = platformInfo.GetPlatformResWithLock(STR_SOC_INFO, key, val); MKI_LOG_IF(!ret, ERROR) << "get platform failed, key is " << key << 
", val is" << val; return val.empty() ? 0 : static_cast(std::atoi(val.c_str())); } @@ -73,12 +73,12 @@ uint32_t PlatformAscendC::GetCoreNumVector(void) const uint32_t PlatformAscendC::GetCoreNumAic(void) const { - return GetCoreNumByType(GetPlatFormInfo(), false); + return GetCoreNumByType(*GetPlatFormInfo(), false); } uint32_t PlatformAscendC::GetCoreNumAiv(void) const { - return GetCoreNumByType(GetPlatFormInfo(), true); + return GetCoreNumByType(*GetPlatFormInfo(), true); } uint32_t PlatformAscendC::GetCoreNum(void) const -- Gitee From b903bd0f764d10e8eac1c02c084c0291e0c08c58 Mon Sep 17 00:00:00 2001 From: huangxiaolan Date: Thu, 25 Sep 2025 16:20:04 +0800 Subject: [PATCH 93/94] fix func para --- src/kernels/tbe_adapter/platform/platform_ascendc.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/kernels/tbe_adapter/platform/platform_ascendc.cpp b/src/kernels/tbe_adapter/platform/platform_ascendc.cpp index 8b5fe1a1..9fced58f 100644 --- a/src/kernels/tbe_adapter/platform/platform_ascendc.cpp +++ b/src/kernels/tbe_adapter/platform/platform_ascendc.cpp @@ -41,7 +41,7 @@ const static std::map CONVERT_MAP = { {"Ascend910_93", SocVersion::ASCEND910B}, }; -static inline uint32_t GetCoreNumByType(const fe::PlatFormInfos &platformInfo, bool isAiv) +static inline uint32_t GetCoreNumByType(fe::PlatFormInfos &platformInfo, bool isAiv) { std::string key; std::string val; -- Gitee From 75a76e0286036384aa47f264a52201d03da0ff58 Mon Sep 17 00:00:00 2001 From: huangxiaolan Date: Thu, 25 Sep 2025 19:08:40 +0800 Subject: [PATCH 94/94] recover tbe_adapter --- src/kernels/tbe_adapter/platform/platform_ascendc.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/kernels/tbe_adapter/platform/platform_ascendc.cpp b/src/kernels/tbe_adapter/platform/platform_ascendc.cpp index 9fced58f..e1d924f2 100644 --- a/src/kernels/tbe_adapter/platform/platform_ascendc.cpp +++ b/src/kernels/tbe_adapter/platform/platform_ascendc.cpp @@ -41,11 +41,11 
@@ const static std::map CONVERT_MAP = { {"Ascend910_93", SocVersion::ASCEND910B}, }; -static inline uint32_t GetCoreNumByType(fe::PlatFormInfos &platformInfo, bool isAiv) +static inline uint32_t GetCoreNumByType(fe::PlatFormInfos *platformInfo, bool isAiv) { std::string key; std::string val; - bool ret = platformInfo.GetPlatformResWithLock(STR_SOC_INFO, STR_SPLIT_KEY, val); + bool ret = platformInfo->GetPlatformResWithLock(STR_SOC_INFO, STR_SPLIT_KEY, val); MKI_LOG_IF(!ret, ERROR) << "get platform failed, val is " << val; if (STR_SPLIT_VAL.compare(val) != 0) { @@ -55,7 +55,7 @@ static inline uint32_t GetCoreNumByType(fe::PlatFormInfos &platformInfo, bool is } else { key = STR_CORE_CNT_CUB; } - ret = platformInfo.GetPlatformResWithLock(STR_SOC_INFO, key, val); + ret = platformInfo->GetPlatformResWithLock(STR_SOC_INFO, key, val); MKI_LOG_IF(!ret, ERROR) << "get platform failed, key is " << key << ", val is" << val; return val.empty() ? 0 : static_cast(std::atoi(val.c_str())); } @@ -73,12 +73,12 @@ uint32_t PlatformAscendC::GetCoreNumVector(void) const uint32_t PlatformAscendC::GetCoreNumAic(void) const { - return GetCoreNumByType(*GetPlatFormInfo(), false); + return GetCoreNumByType(GetPlatFormInfo(), false); } uint32_t PlatformAscendC::GetCoreNumAiv(void) const { - return GetCoreNumByType(*GetPlatFormInfo(), true); + return GetCoreNumByType(GetPlatFormInfo(), true); } uint32_t PlatformAscendC::GetCoreNum(void) const -- Gitee