diff --git a/examples/matrix/matmul_ibshareB/CMakeLists.txt b/examples/matrix/matmul_ibshareB/CMakeLists.txt index ed860d62edbdc6d5e497032162f9d0f9cb215d45..ff71e203634c2f7af5dea949ece96613127eef15 100644 --- a/examples/matrix/matmul_ibshareB/CMakeLists.txt +++ b/examples/matrix/matmul_ibshareB/CMakeLists.txt @@ -9,7 +9,7 @@ cmake_minimum_required(VERSION 3.16) project(Ascend_c) -if(${RUN_MODE}) +if (${RUN_MODE}) set(RUN_MODE "npu" CACHE STRING "cpu/sim/npu") endif() if (${SOC_VERSION}) @@ -26,7 +26,7 @@ if(CMAKE_INSTALL_PREFIX STREQUAL /usr/local) endif() file(GLOB KERNEL_FILES - ${CMAKE_CURRENT_SOURCE_DIR}/op_kernel/matmul_ibshare_custom.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/op_kernel/matmul_ibshareb_custom_kernel.cpp ) if("${RUN_MODE}" STREQUAL "cpu") @@ -37,29 +37,30 @@ else() message("invalid RUN_MODE: ${RUN_MODE}") endif() -add_executable(ascendc_matmul_ibshare_bbit +add_executable(ascendc_matmul_ibshareb_bbit ${CMAKE_CURRENT_SOURCE_DIR}/main.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/op_host/matmul_ibshare_custom_tiling.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/op_host/matmul_ibshareb_custom_tiling.cpp ) -target_compile_options(ascendc_matmul_ibshare_bbit PRIVATE +target_compile_options(ascendc_matmul_ibshareb_bbit PRIVATE $:-g>> -O2 -std=c++17 -D_GLIBCXX_USE_CXX11_ABI=0 ) -target_compile_definitions(ascendc_matmul_ibshare_bbit PRIVATE +target_compile_definitions(ascendc_matmul_ibshareb_bbit PRIVATE + $<$:ENABLE_IBSHAREB> SOC_VERSION="${SOC_VERSION}" ) -target_include_directories(ascendc_matmul_ibshare_bbit PRIVATE +target_include_directories(ascendc_matmul_ibshareb_bbit PRIVATE ${CMAKE_CURRENT_SOURCE_DIR} $:${ASCEND_CANN_PACKAGE_PATH}/include>> $:${ASCEND_CANN_PACKAGE_PATH}/runtime/include>> ) -target_link_libraries(ascendc_matmul_ibshare_bbit PRIVATE +target_link_libraries(ascendc_matmul_ibshareb_bbit PRIVATE $,$>:host_intf_pub>> $:tikicpulib::${SOC_VERSION}>> $:ascendcl>> diff --git a/examples/matrix/matmul_ibshareB/README.md b/examples/matrix/matmul_ibshareB/README.md index 69e805a1ee95c8b0e59835e60576903270bce873..b259a14188d566cce81066f5f51474fafc499f1f 100644 --- a/examples/matrix/matmul_ibshareB/README.md +++ b/examples/matrix/matmul_ibshareB/README.md @@ -1,6 +1,6 @@ ## 概述 -本样例介绍了调用Matmul高阶API实现开启IBShare功能的单算子。IBShare的功能是复用L1 Buffer上相同的A矩阵或者B矩阵数据,减少数据搬运开销。本样例仅为B矩阵复用场景。 +本样例介绍了调用Matmul高阶API实现开启IBShare功能的单算子。IBShare的功能是复用L1 Buffer上相同的A矩阵或者B矩阵数据,减少数据搬运开销。本样例为仅B矩阵复用场景。 本样例以直调的方式调用算子核函数。 直调:核函数的基础调用方式,开发者完成算子核函数的开发和Tiling实现后,即可通过AscendCL运行时接口,完成算子的调用。包含CPU侧、NPU侧和仿真三种运行验证方法。 @@ -22,26 +22,26 @@ ## 算子描述 - 算子功能 - MatmulIbshareCustom算子通过调用Matmul高阶API,开启IBShare功能,对输入的A、B矩阵做矩阵乘和加bias偏置。 + MatmulIBShareBCustom算子通过调用Matmul高阶API,开启B矩阵的IBShare功能,对输入的A、B矩阵做矩阵乘和加bias偏置。 - 算子规格 - + - + - +
-| 算子类型(OpType) | MatmulIbshareCustom |
+| 算子类型(OpType) | MatmulIBShareBCustom |
 | 算子输入 | name | shape | data type | format | isTrans |
-| | a | - | float16 | ND | true |
+| | a | - | float16 | ND | false |
 | | b | - | float16 | ND | false |
 | | bias | - | float | ND | - |
 | 算子输出 | c | - | float | ND | - |
-| 核函数名 | matmul_ibshare_custom |
+| 核函数名 | matmul_ibshareb_custom |
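The MatmulType declarations quoted later in this README lost their angle-bracket template arguments in formatting. As a reading aid, here is a minimal sketch of the B-matrix IBShare setup this sample describes. The MatmulType parameter order (position, format, element type, isTrans, layout, ibShare) is an assumption based on the AscendC headers; CFG_IBSHARE_NORM is the config named in this README.

```
#include "lib/matmul_intf.h"

// Assumed parameter order: MatmulType<TPosition, CubeFormat, type, isTrans, LayoutMode, ibShare>.
using A_TYPE    = AscendC::MatmulType<AscendC::TPosition::GM, CubeFormat::ND, half>;
// Only B sets ibShare = true: the two AIVs paired with one AIC then reuse the
// same B tile resident in L1, which is the data movement IBShareB saves.
using B_TYPE    = AscendC::MatmulType<AscendC::TPosition::GM, CubeFormat::ND, half,
                                      false, LayoutMode::NONE, true>;
using C_TYPE    = AscendC::MatmulType<AscendC::TPosition::GM, CubeFormat::ND, float>;
using BIAS_TYPE = AscendC::MatmulType<AscendC::TPosition::GM, CubeFormat::ND, float>;

// CFG_IBSHARE_NORM is the default IBShare template this sample uses.
AscendC::Matmul<A_TYPE, B_TYPE, C_TYPE, BIAS_TYPE, CFG_IBSHARE_NORM> matmulObj;
```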
## 算子实现介绍 -本样例是仅B矩阵使能IBShare的场景,输入拆成40份。MIX场景下AIC:AIV=1:2,本样例开启20个AIC和40个AIV,计算过程中同一AIC对应的两个AIV在每次迭代时用到的B矩阵数据都一致。 +本样例是仅B矩阵使能IBShare的场景,计算过程中同一AIC对应的两个AIV在每次迭代时用到的B矩阵数据都一致。 - 约束条件: - A矩阵或B矩阵单独使能IBShare的场景,要求复用的矩阵必须在L1 Buffer上全载。 @@ -52,15 +52,13 @@ - C为目的操作数,存放矩阵乘结果的矩阵,形状为[M, N]。 - Bias为矩阵乘偏置,形状为[1, N]。对A*B结果矩阵的每一行都采用该bias进行偏置。 - 具体步骤: - - 创建Matmul对象,设置B矩阵使能IBSHARE。 + - 创建Matmul对象,设置B矩阵的IBShare参数为true,使用默认的IBShare模板CFG_IBSHARE_NORM创建Matmul对象。 ``` - // ibshareA设置为false,ibshareB设置为true - constexpr static MatmulConfig MM_CFG = GetCFG(); - AscendC::Matmul< - AscendC::MatmulType, - AscendC::MatmulType, - AscendC::MatmulType, - AscendC::MatmulType, MM_CFG> matmulObj; + using A_TYPE = AscendC::MatmulType; + using B_TYPE = AscendC::MatmulType; + using C_TYPE = AscendC::MatmulType; + using BIAS_TYPE = AscendC::MatmulType; + AscendC::Matmul matmulObj; ``` - 初始化操作。 - 设置左矩阵A、右矩阵B、Bias。 @@ -87,15 +85,16 @@ - 编译执行 ``` - bash run.sh -r [RUN_MODE] -v [SOC_VERSION] -p [IS_PERF] + bash run.sh -r [RUN_MODE] -v [SOC_VERSION] -p [IS_PERF] -e [ENABLE_FEATURE] ``` 其中脚本参数说明如下: - RUN_MODE :编译执行方式,可选择CPU调试,NPU仿真,NPU上板,对应参数分别为[cpu / sim / npu]。若需要详细了解NPU仿真相关内容,请参考[《算子开发工具msProf》](https://hiascend.com/document/redirect/CannCommunityToolMsProf)中的“工具使用”章节。 - SOC_VERSION :昇腾AI处理器型号,如果无法确定具体的[SOC_VERSION],则在安装昇腾AI处理器的服务器执行npu-smi info命令进行查询,在查询到的“Name”前增加Ascend信息,例如“Name”对应取值为xxxyy,实际配置的[SOC_VERSION]值为Ascendxxxyy。支持以下产品型号: - Atlas A2训练系列产品/Atlas 800I A2推理产品 - IS_PERF : 是否获取执行性能数据,当前只在NPU执行时生效,打开后性能数据显示在执行结果中,可选择关闭和开启该功能,对应参数分别为[0 / 1]。 + - ENABLE_FEATURE :是否使能B矩阵IBShare功能,可选择关闭和开启该功能,对应参数分别为[0 / 1],默认开启。 示例如下,Ascendxxxyy请替换为实际的AI处理器型号。 ``` - bash run.sh -r cpu -v Ascendxxxyy -p 0 + bash run.sh -r cpu -v Ascendxxxyy -p 0 -e 1 ``` \ No newline at end of file diff --git a/examples/matrix/matmul_ibshareB/cmake/cpu_lib.cmake b/examples/matrix/matmul_ibshareB/cmake/cpu_lib.cmake index 244469fb44724e478a110e491392a7b66e56e034..09fda3d387cef823bfe263e82bb19dd28919d99b 100644 --- a/examples/matrix/matmul_ibshareB/cmake/cpu_lib.cmake +++ b/examples/matrix/matmul_ibshareB/cmake/cpu_lib.cmake @@ -20,6 +20,10 @@ target_link_libraries(ascendc_kernels_${RUN_MODE} PRIVATE tikicpulib::${SOC_VERSION} ) +target_compile_definitions(ascendc_kernels_${RUN_MODE} PRIVATE + $<$:ENABLE_IBSHAREB> +) + target_compile_options(ascendc_kernels_${RUN_MODE} PRIVATE -g -O0 @@ -27,5 +31,5 @@ target_compile_options(ascendc_kernels_${RUN_MODE} PRIVATE ) install(TARGETS ascendc_kernels_${RUN_MODE} -DESTINATION ${CMAKE_INSTALL_LIBDIR} -) \ No newline at end of file + DESTINATION ${CMAKE_INSTALL_LIBDIR} +) diff --git a/examples/matrix/matmul_ibshareB/cmake/npu_lib.cmake b/examples/matrix/matmul_ibshareB/cmake/npu_lib.cmake index 927d2490b8abb30245a57454309a4ad94b00bed7..e76e37b9e88082e40cf12810d3698a6f332db620 100644 --- a/examples/matrix/matmul_ibshareB/cmake/npu_lib.cmake +++ b/examples/matrix/matmul_ibshareB/cmake/npu_lib.cmake @@ -21,7 +21,8 @@ ascendc_library(ascendc_kernels_${RUN_MODE} STATIC ) ascendc_compile_definitions(ascendc_kernels_${RUN_MODE} PRIVATE - -DASCENDC_DUMP - -DHAVE_WORKSPACE - -DHAVE_TILING - ) \ No newline at end of file + $<$:ENABLE_IBSHAREB> + -DASCENDC_DUMP + -DHAVE_WORKSPACE + -DHAVE_TILING +) diff --git a/examples/matrix/matmul_ibshareB/main.cpp b/examples/matrix/matmul_ibshareB/main.cpp index 4fb3325630e4dbb922cbef508e0a4496997c506c..347a824d27098e45538cbd7d0a4583494207136e 100644 --- a/examples/matrix/matmul_ibshareB/main.cpp +++ b/examples/matrix/matmul_ibshareB/main.cpp @@ -7,28 +7,29 @@ * INCLUDING BUT NOT LIMITED TO 
NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. * See LICENSE in the root of the software repository for the full text of the License. */ + #include #include "../../common/data_utils.h" #include "kernel_tiling/kernel_tiling.h" #include "tiling/platform/platform_ascendc.h" -#include "op_host/matmul_ibshare_custom_tiling.h" +#include "op_host/matmul_ibshareb_custom_tiling.h" + #ifndef ASCENDC_CPU_DEBUG #include "acl/acl.h" - -extern void matmul_ibshare_custom_do(uint32_t coreDim, void* stream, - uint8_t* a, uint8_t* b, uint8_t* c, - uint8_t* workspace, uint8_t* tiling); +extern void matmul_ibshareb_custom_do(uint32_t coreDim, void* stream, + uint8_t* a, uint8_t* b, uint8_t* bias, uint8_t* c, + uint8_t* workspace, uint8_t* tiling); #else #include "tikicpulib.h" -extern "C" void matmul_ibshare_custom(uint8_t* a, uint8_t* b, uint8_t* c, - uint8_t* workspace, uint8_t* tiling); +extern "C" void matmul_ibshareb_custom(uint8_t* a, uint8_t* b, uint8_t* bias, uint8_t* c, + uint8_t* workspace, uint8_t* tiling); #endif - namespace { -constexpr bool hasBias = false; -constexpr bool ibshareA = false; -constexpr bool ibshareB = true; +constexpr int32_t MIX_RATIO = 2; // AIC:AIV=1:2 +constexpr bool IS_BIAS = false; +constexpr bool IS_A_TRANS = false; +constexpr bool IS_B_TRANS = false; struct MatrixFileSize { @@ -50,46 +51,45 @@ static size_t GetSysWorkSpaceSize() #ifdef ASCENDC_CPU_DEBUG void TestMatmulCpu(int64_t m, int64_t n, int64_t k, const MatrixFileSize& matrixFileSize) { - size_t aFileSize = matrixFileSize.x1FileSize; // uint16_t represent half - size_t bFileSize = matrixFileSize.x2FileSize; // uint16_t represent half - size_t cFileSize = matrixFileSize.yFileSize; + size_t x1FileSize = matrixFileSize.x1FileSize; + size_t x2FileSize = matrixFileSize.x2FileSize; + size_t yFileSize = matrixFileSize.yFileSize; + size_t biasFileSize = matrixFileSize.biasFileSize; + // Query workspace size size_t workspaceSize = GetSysWorkSpaceSize(); + uint8_t* x1 = (uint8_t*)AscendC::GmAlloc(x1FileSize); + uint8_t* x2 = (uint8_t*)AscendC::GmAlloc(x2FileSize); + uint8_t* bias = nullptr; + uint8_t* y = (uint8_t*)AscendC::GmAlloc(yFileSize); + uint8_t* workspace = (uint8_t*)AscendC::GmAlloc(workspaceSize); + ReadFile("../input/x1_gm.bin", x1FileSize, x1, x1FileSize); + ReadFile("../input/x2_gm.bin", x2FileSize, x2, x2FileSize); + if (IS_BIAS) { + bias = (uint8_t*)AscendC::GmAlloc(biasFileSize); + ReadFile("../input/bias_gm.bin", biasFileSize, bias, biasFileSize); + } size_t tilingFileSize = sizeof(TCubeTiling); - uint32_t mixRatio = 2; // AIC:AIV = 1:2 + uint8_t* tiling = (uint8_t*)AscendC::GmAlloc(tilingFileSize); + auto ascendcPlatform = platform_ascendc::PlatformAscendCManager::GetInstance(); MatmulHost::MatmulCaseParams testCaseParams{ - static_cast(m), static_cast(n), static_cast(k), hasBias, ibshareA, ibshareB}; - - uint8_t* a = (uint8_t* )AscendC::GmAlloc(aFileSize); - uint8_t* b = (uint8_t* )AscendC::GmAlloc(bFileSize); - uint8_t* c = (uint8_t* )AscendC::GmAlloc(cFileSize); - uint8_t* workspace = (uint8_t* )AscendC::GmAlloc(workspaceSize); - uint8_t* tiling = (uint8_t* )AscendC::GmAlloc(tilingFileSize); - - ReadFile("../input/x1_gm.bin", aFileSize, a, aFileSize); - ReadFile("../input/x2_gm.bin", bFileSize, b, bFileSize); + static_cast(m), static_cast(n), static_cast(k), IS_BIAS, IS_A_TRANS, IS_B_TRANS}; + // Calculate Tiling const auto& tilingData = MatmulHost::GenerateTiling(testCaseParams); memcpy_s(tiling, tilingFileSize, &tilingData, tilingFileSize); - - uint32_t blockDim; - if 
(ibshareA && ibshareB) { - blockDim = tilingData.usedCoreNum; - } else { - blockDim = tilingData.usedCoreNum / mixRatio; + ICPU_RUN_KF(matmul_ibshareb_custom, tilingData.usedCoreNum / MIX_RATIO, x1, x2, bias, y, workspace, tiling); + WriteFile("../output/output.bin", y, yFileSize); + AscendC::GmFree((void*)x1); + AscendC::GmFree((void*)x2); + AscendC::GmFree((void*)y); + AscendC::GmFree((void*)workspace); + AscendC::GmFree((void*)tiling); + if (IS_BIAS) { + AscendC::GmFree((void*)bias); } - - ICPU_RUN_KF(matmul_ibshare_custom, blockDim, a, b, c, workspace, tiling); - - WriteFile("../output/output.bin", c, cFileSize); - - AscendC::GmFree((void *)a); - AscendC::GmFree((void *)b); - AscendC::GmFree((void *)c); - AscendC::GmFree((void *)workspace); - AscendC::GmFree((void *)tiling); } // NPU #else -void MatmulOp(uint8_t* a, uint8_t* b, uint8_t* c, int64_t m, int64_t n, int64_t k, +void MatmulOp(uint8_t* x1, uint8_t* x2, uint8_t* y, uint8_t* bias, int64_t m, int64_t n, int64_t k, void* stream = nullptr) { // Init args @@ -105,20 +105,11 @@ void MatmulOp(uint8_t* a, uint8_t* b, uint8_t* c, int64_t m, int64_t n, int64_t uint8_t* tilingHost = nullptr; uint8_t* tilingDevice = nullptr; size_t tilingFileSize = sizeof(TCubeTiling); - + auto ascendcPlatform = platform_ascendc::PlatformAscendCManager::GetInstance(); MatmulHost::MatmulCaseParams testCaseParams{ - static_cast(m), static_cast(n), static_cast(k), hasBias, ibshareA, ibshareB}; - + static_cast(m), static_cast(n), static_cast(k), IS_BIAS, IS_A_TRANS, IS_B_TRANS}; // Calculate Tiling - uint32_t mixRatio = 2; // AIC:AIV = 1:2 const auto tilingData = MatmulHost::GenerateTiling(testCaseParams); - uint32_t blockDim; - if (ibshareA && ibshareB) { - blockDim = tilingData.usedCoreNum; - } else { - blockDim = tilingData.usedCoreNum / mixRatio; - } - CHECK_ACL(aclrtMallocHost((void**)(&tilingHost), tilingFileSize)); CHECK_ACL(aclrtMalloc((void**)&tilingDevice, tilingFileSize, ACL_MEM_MALLOC_HUGE_FIRST)); @@ -127,7 +118,7 @@ void MatmulOp(uint8_t* a, uint8_t* b, uint8_t* c, int64_t m, int64_t n, int64_t CHECK_ACL(aclrtMemcpy(tilingDevice, tilingFileSize, tilingHost, tilingFileSize, ACL_MEMCPY_HOST_TO_DEVICE)); // Launch kernel - matmul_ibshare_custom_do(blockDim, stream, a, b, c, workspaceDevice, tilingDevice); + matmul_ibshareb_custom_do(tilingData.usedCoreNum / MIX_RATIO, stream, x1, x2, bias, y, workspaceDevice, tilingDevice); CHECK_ACL(aclrtFreeHost(tilingHost)); CHECK_ACL(aclrtFree(workspaceDevice)); CHECK_ACL(aclrtFree(tilingDevice)); @@ -151,60 +142,68 @@ void TestAclDeInit(aclrtContext& context, aclrtStream& stream, int64_t& deviceId void TestMatmul(int64_t m, int64_t n, int64_t k, const MatrixFileSize& matrixFileSize) { - size_t aFileSize = matrixFileSize.x1FileSize; // uint16_t represent half - size_t bFileSize = matrixFileSize.x2FileSize; // uint16_t represent half - size_t cFileSize = matrixFileSize.yFileSize; + size_t x1FileSize = matrixFileSize.x1FileSize; + size_t x2FileSize = matrixFileSize.x2FileSize; + size_t yFileSize = matrixFileSize.yFileSize; + size_t biasFileSize = matrixFileSize.biasFileSize; aclrtContext context; aclrtStream stream = nullptr; int64_t deviceId = 0; TestAclInit(context, stream, deviceId); - uint8_t* aHost; - uint8_t* aDevice; - CHECK_ACL(aclrtMallocHost((void **)(&aHost), aFileSize)); - CHECK_ACL(aclrtMalloc((void **)&aDevice, aFileSize, ACL_MEM_MALLOC_HUGE_FIRST)); - ReadFile("../input/x1_gm.bin", aFileSize, aHost, aFileSize); - CHECK_ACL(aclrtMemcpy(aDevice, aFileSize, aHost, aFileSize, - 
ACL_MEMCPY_HOST_TO_DEVICE)); - - uint8_t* bHost; - uint8_t* bDevice; - CHECK_ACL(aclrtMallocHost((void **)(&bHost), bFileSize)); - CHECK_ACL(aclrtMalloc((void **)&bDevice, bFileSize, ACL_MEM_MALLOC_HUGE_FIRST)); - ReadFile("../input/x2_gm.bin", bFileSize, bHost, bFileSize); - CHECK_ACL(aclrtMemcpy(bDevice, bFileSize, bHost, bFileSize, - ACL_MEMCPY_HOST_TO_DEVICE)); - - uint8_t* cHost; - uint8_t* cDevice; - CHECK_ACL(aclrtMallocHost((void **)(&cHost), cFileSize)); - CHECK_ACL(aclrtMalloc((void **)&cDevice, cFileSize, ACL_MEM_MALLOC_HUGE_FIRST)); - - MatmulOp(aDevice, bDevice, cDevice, m, n, k, stream); - CHECK_ACL(aclrtSynchronizeStream(stream)); + uint8_t* x1Host = nullptr; + uint8_t* x1Device = nullptr; + CHECK_ACL(aclrtMallocHost((void**)(&x1Host), x1FileSize)); + CHECK_ACL(aclrtMalloc((void**)&x1Device, x1FileSize, ACL_MEM_MALLOC_HUGE_FIRST)); + ReadFile("../input/x1_gm.bin", x1FileSize, x1Host, x1FileSize); + CHECK_ACL(aclrtMemcpy(x1Device, x1FileSize, x1Host, x1FileSize, ACL_MEMCPY_HOST_TO_DEVICE)); + + uint8_t* x2Host = nullptr; + uint8_t* x2Device = nullptr; + CHECK_ACL(aclrtMallocHost((void**)(&x2Host), x2FileSize)); + CHECK_ACL(aclrtMalloc((void**)&x2Device, x2FileSize, ACL_MEM_MALLOC_HUGE_FIRST)); + ReadFile("../input/x2_gm.bin", x2FileSize, x2Host, x2FileSize); + CHECK_ACL(aclrtMemcpy(x2Device, x2FileSize, x2Host, x2FileSize, ACL_MEMCPY_HOST_TO_DEVICE)); + + uint8_t* biasHost = nullptr; + uint8_t* biasDevice = nullptr; + if (IS_BIAS) { + CHECK_ACL(aclrtMallocHost((void**)(&biasHost), biasFileSize)); + CHECK_ACL(aclrtMalloc((void**)&biasDevice, biasFileSize, ACL_MEM_MALLOC_HUGE_FIRST)); + ReadFile("../input/bias_gm.bin", biasFileSize, biasHost, biasFileSize); + CHECK_ACL(aclrtMemcpy(biasDevice, biasFileSize, biasHost, biasFileSize, ACL_MEMCPY_HOST_TO_DEVICE)); + } + uint8_t* yHost = nullptr; + uint8_t* yDevice = nullptr; + CHECK_ACL(aclrtMallocHost((void**)(&yHost), yFileSize)); + CHECK_ACL(aclrtMalloc((void**)&yDevice, yFileSize, ACL_MEM_MALLOC_HUGE_FIRST)); - CHECK_ACL(aclrtMemcpy(cHost, cFileSize, cDevice, cFileSize, - ACL_MEMCPY_DEVICE_TO_HOST)); - WriteFile("../output/output.bin", cHost, cFileSize); + MatmulOp(x1Device, x2Device, yDevice, biasDevice, m, n, k, stream); + CHECK_ACL(aclrtSynchronizeStream(stream)); - CHECK_ACL(aclrtFree(aDevice)); - CHECK_ACL(aclrtFreeHost(aHost)); - CHECK_ACL(aclrtFree(bDevice)); - CHECK_ACL(aclrtFreeHost(bHost)); - CHECK_ACL(aclrtFree(cDevice)); - CHECK_ACL(aclrtFreeHost(cHost)); + CHECK_ACL(aclrtMemcpy(yHost, yFileSize, yDevice, yFileSize, ACL_MEMCPY_DEVICE_TO_HOST)); + WriteFile("../output/output.bin", yHost, yFileSize); + if (IS_BIAS) { + CHECK_ACL(aclrtFree(biasDevice)); + CHECK_ACL(aclrtFreeHost(biasHost)); + } + CHECK_ACL(aclrtFree(x1Device)); + CHECK_ACL(aclrtFreeHost(x1Host)); + CHECK_ACL(aclrtFree(x2Device)); + CHECK_ACL(aclrtFreeHost(x2Host)); + CHECK_ACL(aclrtFree(yDevice)); + CHECK_ACL(aclrtFreeHost(yHost)); TestAclDeInit(context, stream, deviceId); } #endif -} - +} // namespace MatmulHost -int32_t main(int32_t argc, const char *args[]) +int32_t main(int32_t argc, const char* args[]) { int64_t inputParams[3] = {1, 1, 1}; - for (int32_t i = 1; i < argc && i < 4; ++i) { // 4 + for (int32_t i = 1; i < argc && i < 4; ++i) { // 4 used for inputParams loop std::stringstream ss(args[i]); ss >> inputParams[i - 1]; } diff --git a/examples/matrix/matmul_ibshareB/op_host/matmul_ibshare_custom_tiling.cpp b/examples/matrix/matmul_ibshareB/op_host/matmul_ibshareb_custom_tiling.cpp similarity index 64% rename from 
examples/matrix/matmul_ibshareB/op_host/matmul_ibshare_custom_tiling.cpp rename to examples/matrix/matmul_ibshareB/op_host/matmul_ibshareb_custom_tiling.cpp index ed2f91403998a25d7418c0ca0af1d966e543c3fa..6305e044498c4ea802a4f4980f8ea1bd5d24b8f3 100644 --- a/examples/matrix/matmul_ibshareB/op_host/matmul_ibshare_custom_tiling.cpp +++ b/examples/matrix/matmul_ibshareB/op_host/matmul_ibshareb_custom_tiling.cpp @@ -8,15 +8,13 @@ * See LICENSE in the root of the software repository for the full text of the License. */ -#include -#include +#include "matmul_ibshareb_custom_tiling.h" #include -#include -#include "tiling/tiling_api.h" -#include "tiling/platform/platform_ascendc.h" -#include "matmul_ibshare_custom_tiling.h" -namespace MatmulHost{ +namespace MatmulHost { + +constexpr int32_t USED_CORE_NUM = 2; // AIC:AIV=1:2 + TCubeTiling GenerateTiling(const MatmulCaseParams& testCaseParams) { TCubeTiling tilingData; @@ -24,28 +22,25 @@ TCubeTiling GenerateTiling(const MatmulCaseParams& testCaseParams) if (ascendcPlatform == nullptr) { return tilingData; } - auto aivCoreNum = ascendcPlatform->GetCoreNum(); matmul_tiling::MultiCoreMatmulTiling cubeTiling(*ascendcPlatform); - - int32_t M = testCaseParams.m; - int32_t N = testCaseParams.n; - int32_t K = testCaseParams.k; + uint32_t M = testCaseParams.m; + uint32_t N = testCaseParams.n; + uint32_t K = testCaseParams.k; bool hasBias = testCaseParams.hasBias; - bool ibshareA = testCaseParams.ibshareA; - bool ibshareB = testCaseParams.ibshareB; + bool isAtrans = testCaseParams.isATrans; + bool isBtrans = testCaseParams.isBTrans; - constexpr int32_t MIX_RATIO = 2; // AIC:AIV = 1:2 - int blockDim = aivCoreNum; - if (ibshareA && ibshareB) { - blockDim /= MIX_RATIO; - } - cubeTiling.SetDim(blockDim); - cubeTiling.SetAType(matmul_tiling::TPosition::GM, matmul_tiling::CubeFormat::ND, matmul_tiling::DataType::DT_FLOAT16); - cubeTiling.SetBType(matmul_tiling::TPosition::GM, matmul_tiling::CubeFormat::ND, matmul_tiling::DataType::DT_FLOAT16); - cubeTiling.SetCType(matmul_tiling::TPosition::GM, matmul_tiling::CubeFormat::ND, matmul_tiling::DataType::DT_FLOAT); - cubeTiling.SetBiasType(matmul_tiling::TPosition::GM, matmul_tiling::CubeFormat::ND, matmul_tiling::DataType::DT_FLOAT); - cubeTiling.SetShape(M, N, K); + cubeTiling.SetDim(USED_CORE_NUM); + cubeTiling.SetAType(matmul_tiling::TPosition::GM, matmul_tiling::CubeFormat::ND, + matmul_tiling::DataType::DT_FLOAT16, isAtrans); + cubeTiling.SetBType(matmul_tiling::TPosition::GM, matmul_tiling::CubeFormat::ND, + matmul_tiling::DataType::DT_FLOAT16, isBtrans); + cubeTiling.SetCType(matmul_tiling::TPosition::GM, matmul_tiling::CubeFormat::ND, + matmul_tiling::DataType::DT_FLOAT); + cubeTiling.SetBiasType(matmul_tiling::TPosition::GM, matmul_tiling::CubeFormat::ND, + matmul_tiling::DataType::DT_FLOAT); cubeTiling.SetOrgShape(M, N, K); + cubeTiling.SetShape(M, N, K); cubeTiling.EnableBias(hasBias); cubeTiling.SetBufferSpace(-1, -1, -1); if (cubeTiling.GetTiling(tilingData) == -1) { diff --git a/examples/matrix/matmul_ibshareB/op_host/matmul_ibshare_custom_tiling.h b/examples/matrix/matmul_ibshareB/op_host/matmul_ibshareb_custom_tiling.h similarity index 97% rename from examples/matrix/matmul_ibshareB/op_host/matmul_ibshare_custom_tiling.h rename to examples/matrix/matmul_ibshareB/op_host/matmul_ibshareb_custom_tiling.h index 6faa0b4857e5c2276ba45c7d3039b0998b989d6c..96aafe86458fdfd5eb39c5c2389060d9765d9823 100644 --- a/examples/matrix/matmul_ibshareB/op_host/matmul_ibshare_custom_tiling.h +++ 
b/examples/matrix/matmul_ibshareB/op_host/matmul_ibshareb_custom_tiling.h @@ -21,8 +21,8 @@ struct MatmulCaseParams int32_t n; int32_t k; bool hasBias; - bool ibshareA; - bool ibshareB; + bool isATrans; + bool isBTrans; }; /** diff --git a/examples/matrix/matmul_ibshareB/op_kernel/matmul_ibshare_custom.h b/examples/matrix/matmul_ibshareB/op_kernel/matmul_ibshare_custom.h deleted file mode 100644 index 87ba8c69d44de16d05556611f95f862a00d2f269..0000000000000000000000000000000000000000 --- a/examples/matrix/matmul_ibshareB/op_kernel/matmul_ibshare_custom.h +++ /dev/null @@ -1,119 +0,0 @@ -/* - * Copyright (c) 2025 Huawei Technologies Co., Ltd. - * This file is a part of the CANN Open Software. - * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). - * Please refer to the License for details. You may not use this file except in compliance with the License. - * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, - * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. - * See LICENSE in the root of the software repository for the full text of the License. - */ - -#ifndef EXAMPLES_MATRIX_MATMUL_IBSHAREB_OP_KERNEL_MATMUL_IBSHAREB_CUSTOM_TILING_H -#define EXAMPLES_MATRIX_MATMUL_IBSHAREB_OP_KERNEL_MATMUL_IBSHAREB_CUSTOM_TILING_H -#include "kernel_operator.h" -#include "lib/matmul_intf.h" - -namespace { -constexpr int32_t MIX_RATIO = 2; // AIC:AIV = 1:2 -} - -template -__aicore__ inline constexpr MatmulConfig GetCFG() -{ - if constexpr ((ibshareA && ibshareB) || (!ibshareA && !ibshareB)) { - return GetNormalConfig(); - } - return GetIBShareNormConfig(); -} - -template -class MatmulIbshareKernel { - private: - constexpr static MatmulConfig MM_CFG = GetCFG(); - public: - __aicore__ inline MatmulIbshareKernel(){}; - __aicore__ inline void Init(GM_ADDR a, GM_ADDR b, GM_ADDR bias, GM_ADDR c, GM_ADDR workspace, const TCubeTiling& tiling); - template - __aicore__ inline void Process(AscendC::TPipe* pipe); - AscendC::Matmul< - AscendC::MatmulType, - AscendC::MatmulType, - AscendC::MatmulType, - AscendC::MatmulType, MM_CFG> matmulObj; - - private: - __aicore__ inline void CalcOffset(int32_t blockIdx, const TCubeTiling& tiling, int32_t& offsetA, int32_t& offsetB, - int32_t& offsetC, int32_t& offsetBias); - - AscendC::GlobalTensor aGlobal; - AscendC::GlobalTensor bGlobal; - AscendC::GlobalTensor cGlobal; - AscendC::GlobalTensor biasGlobal; - TCubeTiling tiling; -}; - -template -__aicore__ inline void MatmulIbshareKernel::Init( - GM_ADDR a, GM_ADDR b, GM_ADDR bias, GM_ADDR c, GM_ADDR workspace, const TCubeTiling& tiling) -{ - this->tiling = tiling; - aGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ aType*>(a), tiling.M * tiling.Ka); - bGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ bType*>(b), tiling.Kb * tiling.N); - cGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ cType*>(c), tiling.M * tiling.N); - biasGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ biasType*>(bias), tiling.N); - - int32_t offsetA = 0; - int32_t offsetB = 0; - int32_t offsetC = 0; - int32_t offsetBias = 0; - CalcOffset(AscendC::GetBlockIdx(), tiling, offsetA, offsetB, offsetC, offsetBias); - aGlobal = aGlobal[offsetA]; - bGlobal = bGlobal[offsetB]; - cGlobal = cGlobal[offsetC]; - biasGlobal = biasGlobal[offsetBias]; - if(GetSysWorkSpacePtr() == nullptr){ - return; - } -} - -template -template -__aicore__ inline void MatmulIbshareKernel::Process(AscendC::TPipe* pipe) -{ - matmulObj.SetTensorA(aGlobal); - 
matmulObj.SetTensorB(bGlobal); - if constexpr (hasBias) { - matmulObj.SetBias(biasGlobal); - } - matmulObj.IterateAll(cGlobal); - matmulObj.End(); -} - -__aicore__ inline uint32_t Ceiling(uint32_t a, uint32_t b) -{ - return (a + b - 1) / b; -} - -template -__aicore__ inline void MatmulIbshareKernel::CalcOffset( - int32_t blockIdx, const TCubeTiling& tiling, int32_t& offsetA, int32_t& offsetB, int32_t& offsetC, int32_t& offsetBias) -{ - auto mSingleBlocks = Ceiling(tiling.M, tiling.singleCoreM); - auto mCoreIndx = blockIdx % mSingleBlocks; - auto nCoreIndx = blockIdx / mSingleBlocks; - - offsetA = mCoreIndx * tiling.Ka * tiling.singleCoreM; - offsetB = nCoreIndx * tiling.singleCoreN; - offsetC = mCoreIndx * tiling.N * tiling.singleCoreM + nCoreIndx * tiling.singleCoreN; - offsetBias = nCoreIndx * tiling.singleCoreN; - - // process with tail block - int32_t tailM = tiling.M - mCoreIndx * tiling.singleCoreM; - tailM = tailM < tiling.singleCoreM ? tailM : tiling.singleCoreM; - int32_t tailN = tiling.N - nCoreIndx * tiling.singleCoreN; - tailN = tailN < tiling.singleCoreN ? tailN : tiling.singleCoreN; - if (tailM < tiling.singleCoreM || tailN < tiling.singleCoreN) { - matmulObj.SetTail(tailM, tailN); - } -} -#endif // EXAMPLES_MATRIX_MATMUL_IBSHAREB_OP_KERNEL_MATMUL_IBSHAREB_CUSTOM_TILING_H \ No newline at end of file diff --git a/examples/matrix/matmul_ibshareB/op_kernel/matmul_ibshareb_custom_kernel.cpp b/examples/matrix/matmul_ibshareB/op_kernel/matmul_ibshareb_custom_kernel.cpp new file mode 100644 index 0000000000000000000000000000000000000000..35476a03aec0f9985e9c4b91e3508951bb8e370f --- /dev/null +++ b/examples/matrix/matmul_ibshareB/op_kernel/matmul_ibshareb_custom_kernel.cpp @@ -0,0 +1,131 @@ +/* + * Copyright (c) 2025 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ + +#include "kernel_operator.h" +#include "matmul_ibshareb_custom_kernel.h" + +namespace { +/** + * @brief Copy tiling data to TCubeTiling ptr from tiling gm addr. + * @param tiling: TCubeTiling ptr which needs to copy tiling data. + * @param tilingGM: Tiling gm addr. 
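+ * @note Copies sizeof(TCubeTiling) / sizeof(uint32_t) 32-bit words, so the
+ *       tiling struct size is assumed to be a multiple of 4 bytes.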
+ * @retval None + */ +__aicore__ inline void CopyTiling(TCubeTiling* tiling, GM_ADDR tilingGM) +{ + uint32_t* ptr = reinterpret_cast(tiling); + auto tiling32 = reinterpret_cast<__gm__ uint32_t*>(tilingGM); + + for (int i = 0; i < sizeof(TCubeTiling) / sizeof(uint32_t); i++, ptr++) { + *ptr = *(tiling32 + i); + } + return; +} +} + +namespace MatmulIBShareBCustom { +template +__aicore__ inline void MatmulKernel::Init( + GM_ADDR a, GM_ADDR b, GM_ADDR bias, GM_ADDR c, const TCubeTiling& tiling, bool isTransA, bool isTransB) +{ + this->tiling = tiling; + aGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ AType*>(a), tiling.M * tiling.Ka); + bGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ BType*>(b), tiling.Kb * tiling.N); + cGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ CType*>(c), tiling.M * tiling.N); + biasGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ BiasType*>(bias), tiling.N); + + int32_t offsetA = 0; + int32_t offsetB = 0; + int32_t offsetC = 0; + int32_t offsetBias = 0; + this->isTransA = isTransA; + this->isTransB = isTransB; + CalcOffset(AscendC::GetBlockIdx(), offsetA, offsetB, offsetC, offsetBias); + aGlobal = aGlobal[offsetA]; + bGlobal = bGlobal[offsetB]; + cGlobal = cGlobal[offsetC]; + biasGlobal = biasGlobal[offsetBias]; + if (GetSysWorkSpacePtr() == nullptr) { + return; + } +} + +template +__aicore__ inline void MatmulKernel::Process() +{ + if (AscendC::GetBlockIdx() >= tiling.usedCoreNum) { + return; + } + matmulObj.SetTensorA(aGlobal, isTransA); + matmulObj.SetTensorB(bGlobal, isTransB); + if (tiling.isBias) { + matmulObj.SetBias(biasGlobal); + } + matmulObj.IterateAll(cGlobal); + matmulObj.End(); +} + +template +__aicore__ inline void MatmulKernel::CalcOffset( + int32_t blockIdx, int32_t& offsetA, int32_t& offsetB, int32_t& offsetC, int32_t& offsetBias) +{ + const TCubeTiling& tiling = this->tiling; + auto mSingleBlocks = (tiling.M + tiling.singleCoreM - 1) / tiling.singleCoreM; // split M into mSingleBlocks cores + mCoreIndex = blockIdx % mSingleBlocks; + nCoreIndex = blockIdx / mSingleBlocks; + + if (isTransA) { + offsetA = mCoreIndex * tiling.singleCoreM; + } else { + offsetA = mCoreIndex * tiling.Ka * tiling.singleCoreM; + } + if (isTransB) { + offsetB = nCoreIndex * tiling.Kb * tiling.singleCoreN; + } else { + offsetB = nCoreIndex * tiling.singleCoreN; + } + offsetC = mCoreIndex * tiling.N * tiling.singleCoreM + nCoreIndex * tiling.singleCoreN; + offsetBias = nCoreIndex * tiling.singleCoreN; +} +} // namespace MatmulIBShareBCustom + +/** + * @brief matmul kernel function. + * @param a: A matrix gm addr. + * @param b: B matrix gm addr. + * @param bias: Bias matrix gm addr. + * @param c: C matrix gm addr. + * @param workspace: Temporary gm space addr required by matmul calc. + * @param tilingGm: Tiling data addr. 
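+ * @note The host launches this kernel with blockDim = usedCoreNum / MIX_RATIO
+ *       (AIC:AIV = 1:2 in MIX mode), i.e. blockDim counts AIC cores.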
+ * @retval None + */ +extern "C" __global__ __aicore__ void matmul_ibshareb_custom(GM_ADDR a, GM_ADDR b, + GM_ADDR bias, GM_ADDR c, GM_ADDR workspace, GM_ADDR tilingGm) +{ + // prepare tiling + TCubeTiling tiling; + CopyTiling(&tiling, tilingGm); + // define matmul kernel + MatmulIBShareBCustom::MatmulKernel matmulKernel; + AscendC::TPipe pipe; + REGIST_MATMUL_OBJ(&pipe, GetSysWorkSpacePtr(), matmulKernel.matmulObj, &tiling); + // init matmul kernel, isTransA=false, isTransB=false + matmulKernel.Init(a, b, bias, c, tiling, false, false); + // matmul kernel process + matmulKernel.Process(); +} + +#ifndef ASCENDC_CPU_DEBUG +void matmul_ibshareb_custom_do(uint32_t blockDim, void* stream, + GM_ADDR a, GM_ADDR b, GM_ADDR bias, GM_ADDR c, GM_ADDR workspace, GM_ADDR tilingGm) +{ + matmul_ibshareb_custom<<>>(a, b, bias, c, workspace, tilingGm); +} +#endif diff --git a/examples/matrix/matmul_ibshareB/op_kernel/matmul_ibshareb_custom_kernel.h b/examples/matrix/matmul_ibshareB/op_kernel/matmul_ibshareb_custom_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..8e52768d09e8f043d16f330a81a1beb0b27b66ce --- /dev/null +++ b/examples/matrix/matmul_ibshareB/op_kernel/matmul_ibshareb_custom_kernel.h @@ -0,0 +1,77 @@ +/* + * Copyright (c) 2025 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ + +#ifndef EXAMPLES_MATRIX_MATMUL_IBSHAREB_OP_KERNEL_MATMUL_IBSHAREB_CUSTOM_KERNEL_H +#define EXAMPLES_MATRIX_MATMUL_IBSHAREB_OP_KERNEL_MATMUL_IBSHAREB_CUSTOM_KERNEL_H +#include "kernel_operator.h" +#include "lib/matmul_intf.h" + +namespace MatmulIBShareBCustom { + +template +class MatmulKernel { +public: + __aicore__ inline MatmulKernel(){}; + /** + * @brief Initialization before process. + * @param a: A matrix gm addr. + * @param b: B matrix gm addr. + * @param bias: Bias matrix gm addr. + * @param c: C matrix gm addr. + * @param tiling: Matmul tiling struct. + * @param isTransA: Whether A matrix is transposed. + * @param isTransB: Whether B matrix is transposed. + * @retval None + */ + __aicore__ inline void Init(GM_ADDR a, GM_ADDR b, GM_ADDR bias, GM_ADDR c, const TCubeTiling& tiling, + bool isTransA, bool isTransB); + /** + * @brief Process matrix calculation. + * @retval None + */ + __aicore__ inline void Process(); + + using A_TYPE = AscendC::MatmulType; + using C_TYPE = AscendC::MatmulType; + using BIAS_TYPE = AscendC::MatmulType; +#ifdef ENABLE_IBSHAREB + using B_TYPE = AscendC::MatmulType; + AscendC::Matmul matmulObj; +#else + using B_TYPE = AscendC::MatmulType; + AscendC::Matmul matmulObj; +#endif + +private: + /** + * @brief Calculate the gm offset based on the blockIdx. + * @param blockIdx: Current Core blockidx. + * @param offsetA: Gm offset of A matrix. + * @param offsetB: Gm offset of B matrix. + * @param offsetC: Gm offset of C matrix. + * @param offsetBias: Gm offset of Bias matrix. 
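+     * @note Cores are mapped M-major: mCoreIndex = blockIdx % mSingleBlocks,
+     *       nCoreIndex = blockIdx / mSingleBlocks. offsetB depends only on
+     *       nCoreIndex, so the cores in one N column read the same B tile,
+     *       which is the reuse IBShareB exploits.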
+ * @retval None + */ + __aicore__ inline void CalcOffset( + int32_t blockIdx, int32_t& offsetA, int32_t& offsetB, int32_t& offsetC, int32_t& offsetBias); + + AscendC::GlobalTensor aGlobal; + AscendC::GlobalTensor bGlobal; + AscendC::GlobalTensor cGlobal; + AscendC::GlobalTensor biasGlobal; + TCubeTiling tiling; + int32_t mCoreIndex; + int32_t nCoreIndex; + bool isTransA{false}; + bool isTransB{false}; +}; +} // namespace MatmulIBShareBCustom + +#endif // EXAMPLES_MATRIX_MATMUL_IBSHAREB_OP_KERNEL_MATMUL_IBSHAREB_CUSTOM_KERNEL_H diff --git a/examples/matrix/matmul_ibshareB/run.sh b/examples/matrix/matmul_ibshareB/run.sh index 04595c3cc71122af13defd4d1c6744fafbf893bb..28d0e0ce4c3a939ea91f6b0f46a837cb5b0119ce 100644 --- a/examples/matrix/matmul_ibshareB/run.sh +++ b/examples/matrix/matmul_ibshareB/run.sh @@ -9,8 +9,9 @@ # ====================================================================================================================== export IS_PERF="0" +export ENABLE_FEATURE="1" -SHORT=r:,v:,p:, +SHORT=r:,v:,p:,e:, LONG=run-mode:,soc-version:,perf:, OPTS=$(getopt -a --options $SHORT --longoptions $LONG -- "$@") eval set -- "$OPTS" @@ -26,6 +27,9 @@ do (-p | --perf ) IS_PERF="$2" shift 2;; + (-e | --enable-feature ) + ENABLE_FEATURE="$2" + shift 2;; (--) shift; break;; @@ -51,6 +55,13 @@ if [ "${IS_PERF}" != "0" ] && [ "${IS_PERF}" != "1" ]; then exit 1 fi +if [ "${ENABLE_FEATURE}" != "0" ] && [ "${ENABLE_FEATURE}" != "1" ]; then + echo "[ERROR] Unsupported ENABLE_FEATURE: ${ENABLE_FEATURE}, which can only be 0 or 1." + exit 1 +fi + +echo "[INFO] ENABLE_FEATURE: ${ENABLE_FEATURE}." + # only npu mode support is_perf = 1 if [ "${IS_PERF}" == "1" ] && [ "${RUN_MODE}" != "npu" ]; then echo "[ERROR] Unsupported IS_PERF: ${IS_PERF} while RUN_MODE is ${RUN_MODE}." @@ -65,7 +76,7 @@ source $ASCEND_HOME_DIR/bin/setenv.bash export LD_LIBRARY_PATH=${ASCEND_HOME_DIR}/tools/simulator/${SOC_VERSION}/lib:$LD_LIBRARY_PATH set -euo pipefail -cmake -DRUN_MODE=${RUN_MODE} -DSOC_VERSION=${SOC_VERSION} -DASCEND_CANN_PACKAGE_PATH=${ASCEND_HOME_DIR} .. +cmake -DRUN_MODE=${RUN_MODE} -DSOC_VERSION=${SOC_VERSION} -DENABLE_FEATURE=${ENABLE_FEATURE} -DASCEND_CANN_PACKAGE_PATH=${ASCEND_HOME_DIR} .. make -j16 cd ../ @@ -78,7 +89,7 @@ mkdir output rm -rf bin/ mkdir -p bin cd bin -cp ../build/ascendc_matmul_ibshare_bbit ./ +cp ../build/ascendc_matmul_ibshareb_bbit ./ export TF_CPP_MIN_LOG_LEVEL=3 if [ "${RUN_MODE}" = "npu" ]; then diff --git a/examples/matrix/matmul_ibshareB/scripts/exec_test.py b/examples/matrix/matmul_ibshareB/scripts/exec_test.py index d3bd8f603ca1c47ead38d1fe1be1dbd55134bdaa..a78ed7d4c2d70d3222a25b3bb953e95e08f4897f 100644 --- a/examples/matrix/matmul_ibshareB/scripts/exec_test.py +++ b/examples/matrix/matmul_ibshareB/scripts/exec_test.py @@ -10,12 +10,11 @@ # See LICENSE in the root of the software repository for the full text of the License. 
# ====================================================================================================================== import os -import shlex -import subprocess import sys import csv import time import logging +import subprocess import numpy as np @@ -28,7 +27,8 @@ from common_scripts.exec_utils import get_file_work_dir, get_case_list, get_perf IS_BIAS = False IS_TRANS_A = False IS_TRANS_B = False -# support float16_float32(fp16 in fp32 out), int8_int32_ibshare(int8 in int32 out for ibshare matmul) +PERF_RUN_TIMES = 5 +# float16 in and float32 out DATA_TYPE_STR = "float16_float32" logging.basicConfig(level=logging.INFO) @@ -50,22 +50,30 @@ def process_case(file_work_dir, process_params): process_params.k, process_params.b, process_params.is_perf, process_params.run_mode logging.info("[INFO] start process case [%s]" % (case_name)) logging.info("[INFO] IS_PERF is set [%d]" % (is_perf)) - clear_file_cache(file_work_dir) - matmul_gen_data = MatmulGenData(m, n, k, b, IS_TRANS_A, IS_TRANS_B, IS_BIAS, DATA_TYPE_STR) - if is_perf: - matmul_gen_data.gen_fake_golden_data(file_work_dir) - else: - matmul_gen_data.gen_golden_data(file_work_dir) - params_str = f"{m} {n} {k} {b}" - cmd = get_process_case_cmd(kernel_name="ascendc_matmul_ibshare_bbit", params_str=params_str,\ - is_perf=is_perf, run_mode=run_mode) - subprocess.run(cmd) - if is_perf: - wrong_num = -1 - else: - logging.info("[INFO] compare data case[%s]" % (case_name)) - wrong_num = compare_data(file_work_dir, DATA_TYPE_STR) + loop_num = PERF_RUN_TIMES if is_perf else 1 + task_duration_avg = 0 + for i in range(loop_num): + clear_file_cache(file_work_dir) + matmul_gen_data = MatmulGenData(m, n, k, b, IS_TRANS_A, IS_TRANS_B, IS_BIAS, DATA_TYPE_STR) + if is_perf: + matmul_gen_data.gen_fake_golden_data(file_work_dir) + else: + matmul_gen_data.gen_golden_data(file_work_dir) + params_str = f"{m} {n} {k}" + cmd = get_process_case_cmd(kernel_name="ascendc_matmul_ibshareb_bbit", params_str=params_str,\ + is_perf=is_perf, run_mode=run_mode) + subprocess.run(cmd) + if is_perf: + wrong_num = -1 + else: + logging.info("[INFO] compare data case[%s]" % (case_name)) + wrong_num = compare_data(file_work_dir, DATA_TYPE_STR) + if is_perf: + cur_duration = float(get_perf_task_duration("./prof_out")) + logging.info("[INFO] case_name: [%s], loop_num: [%s], cur_duration: [%s]\n" % (case_name, i, cur_duration)) + task_duration_avg += cur_duration + res_data = [] res_data.append(case_name) res_data.append(wrong_num) @@ -77,8 +85,8 @@ def process_case(file_work_dir, process_params): else: res_data.append("Success") if is_perf: - task_duration = get_perf_task_duration("./prof_out") - res_data.append(task_duration) + task_duration_avg = task_duration_avg / loop_num + res_data.append(task_duration_avg) return res_data diff --git a/examples/matrix/matmul_ibshareB/testcase/case.csv b/examples/matrix/matmul_ibshareB/testcase/case.csv index 292334fab6a3b5d8ded92ca7c6b3a9a5042c7a92..496b90f501b3403b70a3f90156f2164206a1c9ef 100644 --- a/examples/matrix/matmul_ibshareB/testcase/case.csv +++ b/examples/matrix/matmul_ibshareB/testcase/case.csv @@ -1 +1 @@ -1, case001, 1280, 1024, 512 \ No newline at end of file +1, case001, 64, 256, 384 \ No newline at end of file diff --git a/examples/matrix/matmul_perf/CMakeLists.txt b/examples/matrix/matmul_perf/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..456cb32840e4caa64fe407a6bd14d8f87edb55b5 --- /dev/null +++ b/examples/matrix/matmul_perf/CMakeLists.txt @@ -0,0 +1,99 @@ +# Copyright (c) 2025 
Huawei Technologies Co., Ltd. +# This file is a part of the CANN Open Software. +# Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. +# ====================================================================================================================== + +cmake_minimum_required(VERSION 3.16) +project(Ascend_c) +if (${RUN_MODE}) + set(RUN_MODE "npu" CACHE STRING "cpu/sim/npu") +endif() +if (${SOC_VERSION}) + set(SOC_VERSION "Ascend910" CACHE STRING "system on chip type") +endif() + +set(ASCEND_CANN_PACKAGE_PATH "~/Ascend/ascend-toolkit/latest" CACHE STRING "ASCEND CANN package installation directory") +if(NOT CMAKE_BUILD_TYPE) + set(CMAKE_BUILD_TYPE "Debug" CACHE STRING "Build type Release/Debug (default Debug)" FORCE) +endif() + +if(CMAKE_INSTALL_PREFIX STREQUAL /usr/local) + set(CMAKE_INSTALL_PREFIX "${CMAKE_CURRENT_LIST_DIR}/out" CACHE STRING "path for install()" FORCE) +endif() + +file(GLOB KERNEL_FILES + ${CMAKE_CURRENT_SOURCE_DIR}/op_kernel/matmul_perf_custom_kernel.cpp +) + +if("${RUN_MODE}" STREQUAL "cpu") + include(cmake/cpu_lib.cmake) +elseif("${RUN_MODE}" STREQUAL "sim" OR "${RUN_MODE}" STREQUAL "npu") + include(cmake/npu_lib.cmake) +else() + message("invalid RUN_MODE: ${RUN_MODE}") +endif() + +add_executable(ascendc_matmul_perf_bbit + ${CMAKE_CURRENT_SOURCE_DIR}/main.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/op_host/matmul_perf_custom_tiling.cpp +) + +target_compile_options(ascendc_matmul_perf_bbit PRIVATE + $:-g>> + -O2 + -std=c++17 + -D_GLIBCXX_USE_CXX11_ABI=0 +) + +target_compile_definitions(ascendc_matmul_perf_bbit PRIVATE + SOC_VERSION="${SOC_VERSION}" +) + +message(STATUS "[INFO] PERF_MODE is ${PERF_MODE}") +if (PERF_MODE GREATER 0) + target_compile_definitions(ascendc_matmul_perf_bbit PRIVATE + -DENABLE_CUBE_ONLY + ) + message(STATUS "[INFO] Enable CUBE_ONLY") +endif() +if (PERF_MODE GREATER 1) + target_compile_definitions(ascendc_matmul_perf_bbit PRIVATE + -DENABLE_MDL + ) + message(STATUS "[INFO] Enable MDL") +endif() +if (PERF_MODE GREATER 2) + target_compile_definitions(ascendc_matmul_perf_bbit PRIVATE + -DENABLE_MDL_UNITFLAG + ) + message(STATUS "[INFO] Enable MDL UNITFLAG") +endif() +if (PERF_MODE GREATER 3) + target_compile_definitions(ascendc_matmul_perf_bbit PRIVATE + -DENABLE_CONSTANT + ) + message(STATUS "[INFO] Enable CONSTANT") +endif() + +target_include_directories(ascendc_matmul_perf_bbit PRIVATE + ${CMAKE_CURRENT_SOURCE_DIR} + $:${ASCEND_CANN_PACKAGE_PATH}/include>> + $:${ASCEND_CANN_PACKAGE_PATH}/runtime/include>> +) + +target_link_libraries(ascendc_matmul_perf_bbit PRIVATE + $,$>:host_intf_pub>> + $:tikicpulib::${SOC_VERSION}>> + $:ascendcl>> + $:c_sec>> + ascendc_kernels_${RUN_MODE} + tiling_api + register + platform + ascendalog + dl +) diff --git a/examples/matrix/matmul_perf/README.md b/examples/matrix/matmul_perf/README.md new file mode 100644 index 0000000000000000000000000000000000000000..c65b0e4ebf078acb005a6000169481eebd4bd2eb --- /dev/null +++ b/examples/matrix/matmul_perf/README.md @@ -0,0 +1,263 @@ + +## 概述 + +本样例介绍了调用Matmul API实现四种性能优化特性(纯Cube模式、MDL模板、UnitFlag、Tiling全量常量化)的单算子。 + +本样例以直调的方式调用算子核函数。 
+直调:核函数的基础调用方式,开发者完成算子核函数的开发和Tiling实现后,即可通过AscendCL运行时接口,完成算子的调用。包含CPU侧、NPU侧和仿真三种运行验证方法。 + +## 样例支持的产品型号为 +- Atlas A2训练系列产品/Atlas 800I A2推理产品 + +## 目录结构 +| 目录及文件 | 描述 | +|----------------------------------|----------------------| +| [cmake](cmake) | 编译工程文件 | +| [op_host](op_host) | 本样例的tiling代码实现 | +| [op_kernel](op_kernel) | 本样例的kernel代码实现 | +| [scripts](scripts) | 执行脚本文件 | +| [testcase](testcase) | 用例文件,配置用例的计算shape信息 | +| CMakeLists.txt | 编译工程文件 | +| main.cpp | 主函数,调用算子的应用程序,含CPU域及NPU域调用 | +| run.sh | 编译执行脚本 | + +## 算子描述 +- 算子功能 + - 纯Cube模式:在只有矩阵计算,没有矢量计算的场景下,通过使能纯Cube模式,可以优化Matmul计算中的消息通信性能开销,提升算子性能。 + - MDL模板:在MTE2循环搬运次数多的大Shape场景下,使能MDL模板后,可以实现MTE2从Global Memory一次性搬入多个基本块到A1/B1,提升带宽利用率,减少MTE2的搬运次数,提升算子性能。 + - UnitFlag:在算子的CUBE计算流水和FIXPIPE数据搬出流水串行且未被其他流水掩盖时,通过使能UnitFlag功能,可以实现CUBE计算流水和FIXPIPE数据搬出流水之间的流水并行,提升算子性能。 + - Tiling常量化:分为两种场景,1)全量常量化:在获取Matmul模板时,可以确定常量的singleCore Shape(singleCoreM/singleCoreN/singleCoreK)和常量的Base Shape(basicM/basicN/basicK,也称为baseM/baseN/baseK);2)部分常量化:在获取Matmul模板时,仅能确定常量的Base Shape(basicM/basicN/basicK)。 + 若Matmul初始化的Scalar计算较多,影响指令头开销,或Matmul迭代间的Scalar计算较多,阻塞MTE2流水。这两种场景下,在满足上述Tiling常量化条件时,可以使能Tiling常量化功能,减少Scalar计算开销,提升算子性能。 + + 以上四个特性的编码是相互独立的,本样例支持特性逐个叠加。具体叠加方式可参考下述“编译运行样例 > PERF_MODE参数说明”。 + +- 算子规格 + + + + + + + + + + + + +
+| 算子类型(OpType) | MatmulPerfCustom |
+| 算子输入 | name | shape | data type | format | isTrans |
+| | a | - | float16 | ND | false |
+| | b | - | float16 | ND | false |
+| | bias | - | float | ND | - |
+| 算子输出 | c | - | float | ND | - |
+| 核函数名 | matmul_perf_custom |
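The four optimizations are layered behind compile-time macros that the CMake files in this diff define cumulatively from PERF_MODE (ENABLE_CUBE_ONLY, ENABLE_MDL, ENABLE_MDL_UNITFLAG, ENABLE_CONSTANT). As a reading aid for the implementation section below, here is a minimal sketch of how a kernel can pick its MatmulConfig from those macros; the dispatch function is illustrative, not the sample's exact code, and it covers PERF_MODE 0 through 3. The Tiling-constant mode (ENABLE_CONSTANT) builds a MatmulApiStaticTiling instead, as shown later in this README.

```
#ifdef ENABLE_CUBE_ONLY
#define ASCEND_CUBE_ONLY     // must be defined before including matmul_intf.h
#endif
#include "lib/matmul_intf.h"

// Illustrative selection; the macro names come from cmake/cpu_lib.cmake and
// cmake/npu_lib.cmake, the config constants from this README.
__aicore__ inline constexpr MatmulConfig SelectPerfCfg()
{
#if defined(ENABLE_MDL_UNITFLAG)
    MatmulConfig cfg = CFG_MDL;
    cfg.enUnitFlag = true;   // PERF_MODE >= 3: MDL template plus UnitFlag
    return cfg;
#elif defined(ENABLE_MDL)
    return CFG_MDL;          // PERF_MODE >= 2: MDL template
#else
    return CFG_NORM;         // PERF_MODE 0 or 1: default NORM template
#endif
}
constexpr static MatmulConfig SELECTED_CFG = SelectPerfCfg();
```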
+ +## 算子实现介绍 +本样例的四个性能优化特性复用公共的算子Kernel代码和Host代码,通过编译宏的方式,隔离各个性能优化特性的差异化代码实现。 +- 算子Kernel实现 + - 计算逻辑:C = A * B + Bias。 + - A、B为源操作数,A为左矩阵,形状为[M, K];B为右矩阵,形状为[K, N]。 + - C为目的操作数,存放矩阵乘结果的矩阵,形状为[M, N]。 + - Bias为矩阵乘偏置,形状为[1, N]。对A*B结果矩阵的每一行都采用该Bias进行偏置。 + - 具体步骤: + - 创建Matmul对象。 + - 默认实现,使用默认的NORM模板CFG_NORM创建Matmul对象。 + ``` + #include "lib/matmul_intf.h" + + using A_TYPE = AscendC::MatmulType; + using B_TYPE = AscendC::MatmulType; + using C_TYPE = AscendC::MatmulType; + using BIAS_TYPE = AscendC::MatmulType; + + AscendC::Matmul matmulObj; + ``` + - 使能纯Cube模式实现,在定义Matmul对象的代码中,设置ASCEND_CUBE_ONLY宏,且必须在#include "lib/matmul_intf.h"之前设置。 + ``` + #define ASCNEND_CUBE_ONLY // 设置ASCEND_CUBE_ONLY宏 + #include "lib/matmul_intf.h" + + using A_TYPE = AscendC::MatmulType; + using B_TYPE = AscendC::MatmulType; + using C_TYPE = AscendC::MatmulType; + using BIAS_TYPE = AscendC::MatmulType; + + AscendC::Matmul matmulObj; + ``` + - 使能MDL模板实现,使用默认的MDL模板CFG_MDL创建Matmul对象。 + ``` + #define ASCNEND_CUBE_ONLY // 设置ASCEND_CUBE_ONLY宏 + #include "lib/matmul_intf.h" + + using A_TYPE = AscendC::MatmulType; + using B_TYPE = AscendC::MatmulType; + using C_TYPE = AscendC::MatmulType; + using BIAS_TYPE = AscendC::MatmulType; + + AscendC::Matmul matmulObj; // 使用CFG_MDL创建Matmul对象 + ``` + - 使能UnitFlag功能实现,自定义MatmulConfig参数,将其中的enUnitFlag参数设置为true,使能UnitFlag功能。 + ``` + #define ASCNEND_CUBE_ONLY // 设置ASCEND_CUBE_ONLY宏 + #include "lib/matmul_intf.h" + + __aicore__ inline constexpr MatmulConfig GetUnitFlagCfg() + { + auto mmCfg = CFG_MDL; + mmCfg.enUnitFlag = true; // 设置enUnitFlag参数为true + return mmCfg; + } + constexpr static MatmulConfig CFG_MDL_UNITFLAG = GetUnitFlagCfg(); + + using A_TYPE = AscendC::MatmulType; + using B_TYPE = AscendC::MatmulType; + using C_TYPE = AscendC::MatmulType; + using BIAS_TYPE = AscendC::MatmulType; + + AscendC::Matmul matmulObj; // 使用自定义的MatmulConfig参数CFG_MDL_UNITFLAG创建Matmul对象 + ``` + - 使能Tiling全量常量化功能实现,首先使用MAX_M、MAX_N、MAX_K、BASE_M、BASE_N、BASE_K的常数值设置MatmulShapeParams,调用GetMMConfig得到自定义的MatmulConfig模板参数mmCFG;再通过GetMatmulApiTiling接口将Tiling信息常量化,得到常量化模板参数CUSTOM_CFG_CONSTANT,基于CUSTOM_CFG_CONSTANT创建Matmul对象。 + ``` + #define ASCNEND_CUBE_ONLY // 设置ASCEND_CUBE_ONLY宏 + #include "lib/matmul_intf.h" + + constexpr int32_t MAX_M = 10000; // custom matmul kernel support max value of M Dim shape + constexpr int32_t MAX_N = 10000; // custom matmul kernel support max value of N Dim shape + constexpr int32_t MAX_K = 10000; // custom matmul kernel support max value of K Dim shape + constexpr int32_t BASE_M = 128; // BASE_M * BASE_K * sizeof(typeA) <=L0A size + constexpr int32_t BASE_N = 256; // BASE_N * BASE_K * sizeof(typeB) <=L0B size + constexpr int32_t BASE_K = 64; // BASE_M * BASE_N * sizeof(typeC) <=L0C size + constexpr MatmulShapeParams shapeParams = { MAX_M, + MAX_N, + MAX_K, + BASE_M, + BASE_N, + BASE_K }; + + constexpr int32_t TILING_DEPTH_PARAM = 8; + constexpr int32_t TILING_STEPK_PARAM = 4; + constexpr int32_t TILING_STEPMN_PARAM = 1; + + template + __aicore__ inline constexpr MatmulApiStaticTiling GetCustomConstantCFG() + { + MatmulConfig mmCFG = GetMMConfig(shapeParams); + // enable unitflag for performance comparison + mmCFG.enUnitFlag = true; + auto constantCFG = AscendC::GetMatmulApiTiling(mmCFG); + constantCFG.depthA1 = TILING_DEPTH_PARAM; + constantCFG.depthB1 = TILING_DEPTH_PARAM; + constantCFG.stepKa = TILING_STEPK_PARAM; + constantCFG.stepKb = TILING_STEPK_PARAM; + constantCFG.stepM = TILING_STEPMN_PARAM; + constantCFG.stepN = TILING_STEPMN_PARAM; + return constantCFG; + } + + using A_TYPE = 
AscendC::MatmulType; + using B_TYPE = AscendC::MatmulType; + using C_TYPE = AscendC::MatmulType; + using BIAS_TYPE = AscendC::MatmulType; + + constexpr static auto CUSTOM_CFG_CONSTANT = GetCustomConstantCFG(); + AscendC::Matmul matmulObj; // 使用自定义的MatmulConfig参数CUSTOM_CFG_CONSTANT创建Matmul对象 + ``` + - 初始化操作。 + - 默认实现 + ``` + REGIST_MATMUL_OBJ(&pipe, GetSysWorkSpacePtr(), matmulObj, &tiling); // 初始化matmul对象 + ``` + - 使能纯Cube模式实现,同默认实现。 + - 使能MDL模板实现,同默认实现。 + - 使能UnitFlag功能实现,同默认实现。 + - 使能Tiling全量常量化功能实现 + ``` + // 全量常量化场景,调用REGIST_MATMUL_OBJ接口初始化Matmul对象时,可以在入参传递Tiling参数的位置,使用空指针替代。 + REGIST_MATMUL_OBJ(&pipe, GetSysWorkSpacePtr(), matmulObj, (TCubeTiling*)nullptr); + + // 部分常量化场景,调用REGIST_MATMUL_OBJ接口初始化Matmul对象时,仍需要使用Tiling。 + REGIST_MATMUL_OBJ(&pipe, GetSysWorkSpacePtr(), matmulObj, &tiling); + ``` + - 设置左矩阵A、右矩阵B、Bias。 + - 默认实现 + ``` + mm.SetTensorA(aGlobal); // 设置左矩阵A + mm.SetTensorB(bGlobal); // 设置右矩阵B + mm.SetBias(biasGlobal); // 设置Bias + ``` + - 使能纯Cube模式实现,同默认实现。 + - 使能MDL模板实现,同默认实现。 + - 使能UnitFlag功能实现,同默认实现。 + - 使能Tiling全量常量化功能实现 + ``` + matmulObj.SetOrgShape(tiling.M, tiling.N, tiling.Ka); // 调用SetOrgShape设置Matmul计算原始Shape + matmulObj.SetTensorA(aGlobal); // 设置左矩阵A + matmulObj.SetTensorB(bGlobal); // 设置右矩阵B + matmulObj.SetBias(biasGlobal); // 设置Bias + ``` + - 完成矩阵乘操作。 + ``` + matmulObj.IterateAll(cGlobal); + ``` + - 结束矩阵乘操作。 + ``` + matmulObj.End(); + ``` + +- 算子Tiling实现 + - Ascend C提供一组Matmul Tiling API,方便用户获取Matmul kernel计算时所需的Tiling参数。只需要传入A/B/C矩阵等信息,调用API接口,即可获取到TCubeTiling结构体中的相关参数。 + - 获取Tiling参数的流程如下: + - 创建一个Tiling对象。 + ``` + auto ascendcPlatform = platform_ascendc::PlatformAscendCManager::GetInstance(); + matmul_tiling::MultiCoreMatmulTiling cubeTiling(*ascendcPlatform); + ``` + - 设置A、B、C、Bias的参数类型信息;M、N、Ka、Kb形状信息等。 + ``` + auto blockDim = ascendcPlatform->GetCoreNumAiv(); // 方式一:非纯Cube模式,SetDim设置为AIV的核数 + auto blockDim = ascendcPlatform->GetCoreNumAic(); // 方式二:纯Cube模式,SetDim设置为AIC的核数 + + cubeTiling.SetDim(blockDim); + cubeTiling.SetAType(matmul_tiling::TPosition::GM, matmul_tiling::CubeFormat::ND, + matmul_tiling::DataType::DT_FLOAT16, isAtrans); + cubeTiling.SetBType(matmul_tiling::TPosition::GM, matmul_tiling::CubeFormat::ND, + matmul_tiling::DataType::DT_FLOAT16, isBtrans); + cubeTiling.SetCType(matmul_tiling::TPosition::GM, matmul_tiling::CubeFormat::ND, + matmul_tiling::DataType::DT_FLOAT); + cubeTiling.SetBiasType(matmul_tiling::TPosition::GM, matmul_tiling::CubeFormat::ND, + matmul_tiling::DataType::DT_FLOAT); + cubeTiling.SetOrgShape(M, N, K); + cubeTiling.SetShape(-1, -1, K); + cubeTiling.EnableBias(isBias); + ``` + - 调用GetTiling接口,获取Tiling信息。 + ``` + TCubeTiling tilingData; + int64_t ret = tiling.GetTiling(tilingData); // if ret = -1, get tiling failed + ``` + +## 编译运行样例 + + - 配置环境变量 + + 这里的\$ASCEND_CANN_PACKAGE_PATH需要替换为CANN开发套件包安装后文件存储路径。例如:/usr/local/Ascend/ascend-toolkit/latest + ``` + export ASCEND_HOME_DIR=$ASCEND_CANN_PACKAGE_PATH + source $ASCEND_HOME_DIR/../set_env.sh + ``` + + - 编译执行 + + ``` + bash run.sh -r [RUN_MODE] -v [SOC_VERSION] -p [IS_PERF] -m [PERF_MODE] + ``` + 其中脚本参数说明如下: + - RUN_MODE :编译执行方式,可选择CPU调试,NPU仿真,NPU上板,对应参数分别为[cpu / sim / npu]。若需要详细了解NPU仿真相关内容,请参考[《算子开发工具msProf》](https://hiascend.com/document/redirect/CannCommunityToolMsProf)中的“工具使用”章节。 + - SOC_VERSION :昇腾AI处理器型号,如果无法确定具体的[SOC_VERSION],则在安装昇腾AI处理器的服务器执行npu-smi info命令进行查询,在查询到的“Name”前增加Ascend信息,例如“Name”对应取值为xxxyy,实际配置的[SOC_VERSION]值为Ascendxxxyy。支持以下产品型号: + - Atlas A2训练系列产品/Atlas 800I A2推理产品 + - IS_PERF :是否获取执行性能数据,当前只在NPU执行时生效,打开后性能数据显示在执行结果中,可选择关闭和开启该功能,对应参数分别为[0 / 1]。 + - 
PERF_MODE :配置性能优化的方式,支持的取值为0,1,2,3,4,各取值的含义如下。该参数的默认值为0。 + - PERF_MODE = 0:默认实现。使能MIX模式 + NORM模板(未使能UnitFlag功能和Tiling全量常量化功能); + - PERF_MODE = 1:使能纯Cube模式; + - PERF_MODE = 2:使能纯Cube模式 + MDL模板; + - PERF_MODE = 3:使能纯Cube模式 + MDL模板 + UnitFlag功能; + - PERF_MODE = 4:使能纯Cube模式 + MDL模板 + UnitFlag功能 + Tiling全量常量化功能; + + 示例如下,Ascendxxxyy请替换为实际的AI处理器型号。 + ``` + bash run.sh -r cpu -v Ascendxxxyy -p 0 -m 0 + ``` diff --git a/examples/matrix/matmul_perf/cmake/cpu_lib.cmake b/examples/matrix/matmul_perf/cmake/cpu_lib.cmake new file mode 100644 index 0000000000000000000000000000000000000000..ffb9281d435d151f73ce5619d9b179dca76bc76d --- /dev/null +++ b/examples/matrix/matmul_perf/cmake/cpu_lib.cmake @@ -0,0 +1,57 @@ +# Copyright (c) 2025 Huawei Technologies Co., Ltd. +# This file is a part of the CANN Open Software. +# Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. +# ====================================================================================================================== + +if(NOT DEFINED ENV{CMAKE_PREFIX_PATH}) + set(CMAKE_PREFIX_PATH ${ASCEND_CANN_PACKAGE_PATH}/tools/tikicpulib/lib/cmake) +endif() +find_package(tikicpulib REQUIRED) + +add_library(ascendc_kernels_${RUN_MODE} SHARED + ${KERNEL_FILES} +) + +target_link_libraries(ascendc_kernels_${RUN_MODE} PRIVATE + tikicpulib::${SOC_VERSION} +) + +target_compile_options(ascendc_kernels_${RUN_MODE} PRIVATE + -g + -O0 + -std=c++17 +) + +message(STATUS "[INFO] CPU Target PERF_MODE is ${PERF_MODE}") +if (PERF_MODE GREATER 0) + target_compile_definitions(ascendc_kernels_${RUN_MODE} PRIVATE + -DENABLE_CUBE_ONLY + ) + message(STATUS "[INFO] CPU Target Enable CUBE_ONLY") +endif() +if (PERF_MODE GREATER 1) + target_compile_definitions(ascendc_kernels_${RUN_MODE} PRIVATE + -DENABLE_MDL + ) + message(STATUS "[INFO] CPU Target Enable MDL") +endif() +if (PERF_MODE GREATER 2) + target_compile_definitions(ascendc_kernels_${RUN_MODE} PRIVATE + -DENABLE_MDL_UNITFLAG + ) + message(STATUS "[INFO] CPU Target Enable MDL UNITFLAG") +endif() +if (PERF_MODE GREATER 3) + target_compile_definitions(ascendc_kernels_${RUN_MODE} PRIVATE + -DENABLE_CONSTANT + ) + message(STATUS "[INFO] CPU Target Enable CONSTANT") +endif() + +install(TARGETS ascendc_kernels_${RUN_MODE} + DESTINATION ${CMAKE_INSTALL_LIBDIR} +) diff --git a/examples/matrix/matmul_perf/cmake/npu_lib.cmake b/examples/matrix/matmul_perf/cmake/npu_lib.cmake new file mode 100644 index 0000000000000000000000000000000000000000..4954182987f54c2e7aba33b683a1208d5ec1a9b1 --- /dev/null +++ b/examples/matrix/matmul_perf/cmake/npu_lib.cmake @@ -0,0 +1,53 @@ +# Copyright (c) 2025 Huawei Technologies Co., Ltd. +# This file is a part of the CANN Open Software. +# Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. 
+# See LICENSE in the root of the software repository for the full text of the License. +# ====================================================================================================================== + +if(EXISTS ${ASCEND_CANN_PACKAGE_PATH}/compiler/tikcpp/ascendc_kernel_cmake) + set(ASCENDC_CMAKE_DIR ${ASCEND_CANN_PACKAGE_PATH}/compiler/tikcpp/ascendc_kernel_cmake) +elseif(EXISTS ${ASCEND_CANN_PACKAGE_PATH}/tools/tikcpp/ascendc_kernel_cmake) + set(ASCENDC_CMAKE_DIR ${ASCEND_CANN_PACKAGE_PATH}/tools/tikcpp/ascendc_kernel_cmake) +else() + message(FATAL_ERROR "ascendc_kernel_cmake does not exist ,please check whether the cann package is installed") +endif() +include(${ASCENDC_CMAKE_DIR}/ascendc.cmake) + +ascendc_library(ascendc_kernels_${RUN_MODE} STATIC + ${KERNEL_FILES} +) + +ascendc_compile_definitions(ascendc_kernels_${RUN_MODE} PRIVATE + -DASCENDC_DUMP + -DHAVE_WORKSPACE + -DHAVE_TILING +) + +message(STATUS "[INFO] NPU/SIM Target PERF_MODE is ${PERF_MODE}") +if (PERF_MODE GREATER 0) + ascendc_compile_definitions(ascendc_kernels_${RUN_MODE} PRIVATE + -DENABLE_CUBE_ONLY + ) + message(STATUS "[INFO] NPU/SIM Target Enable CUBE_ONLY") +endif() +if (PERF_MODE GREATER 1) + ascendc_compile_definitions(ascendc_kernels_${RUN_MODE} PRIVATE + -DENABLE_MDL + ) + message(STATUS "[INFO] NPU/SIM Target Enable MDL") +endif() +if (PERF_MODE GREATER 2) + ascendc_compile_definitions(ascendc_kernels_${RUN_MODE} PRIVATE + -DENABLE_MDL_UNITFLAG + ) + message(STATUS "[INFO] NPU/SIM Target Enable MDL UNITFLAG") +endif() +if (PERF_MODE GREATER 3) + ascendc_compile_definitions(ascendc_kernels_${RUN_MODE} PRIVATE + -DENABLE_CONSTANT + ) + message(STATUS "[INFO] NPU/SIM Target Enable CONSTANT") +endif() diff --git a/examples/matrix/matmul_perf/main.cpp b/examples/matrix/matmul_perf/main.cpp new file mode 100644 index 0000000000000000000000000000000000000000..375c0ecaf4e2a97ac309a4029aa810e802e50f22 --- /dev/null +++ b/examples/matrix/matmul_perf/main.cpp @@ -0,0 +1,241 @@ +/** + * Copyright (c) 2025 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. 
+ */
+
+#include <iostream>
+#include <sstream>
+#include "../../common/data_utils.h"
+#include "kernel_tiling/kernel_tiling.h"
+#include "tiling/platform/platform_ascendc.h"
+#include "op_host/matmul_perf_custom_tiling.h"
+
+#ifndef ASCENDC_CPU_DEBUG
+#include "acl/acl.h"
+extern void matmul_perf_custom_do(uint32_t coreDim, void* stream,
+    uint8_t* a, uint8_t* b, uint8_t* bias, uint8_t* c,
+    uint8_t* workspace, uint8_t* tiling);
+#else
+#include "tikicpulib.h"
+extern "C" void matmul_perf_custom(uint8_t* a, uint8_t* b, uint8_t* bias, uint8_t* c,
+    uint8_t* workspace, uint8_t* tiling);
+#endif
+
+namespace {
+constexpr bool IS_BIAS = false;
+constexpr bool IS_A_TRANS = false;
+constexpr bool IS_B_TRANS = false;
+
+struct MatrixFileSize
+{
+    size_t x1FileSize;
+    size_t x2FileSize;
+    size_t yFileSize;
+    size_t biasFileSize;
+};
+}
+
+namespace MatmulHost {
+static size_t GetSysWorkSpaceSize()
+{
+    auto ascendcPlatform = platform_ascendc::PlatformAscendCManager::GetInstance();
+    return static_cast<size_t>(ascendcPlatform->GetLibApiWorkSpaceSize());
+}
+
+// CPU debug mode
+#ifdef ASCENDC_CPU_DEBUG
+void TestMatmulCpu(int64_t m, int64_t n, int64_t k, const MatrixFileSize& matrixFileSize)
+{
+    size_t x1FileSize = matrixFileSize.x1FileSize;
+    size_t x2FileSize = matrixFileSize.x2FileSize;
+    size_t yFileSize = matrixFileSize.yFileSize;
+    size_t biasFileSize = matrixFileSize.biasFileSize;
+    // Query workspace size
+    size_t workspaceSize = GetSysWorkSpaceSize();
+    uint8_t* x1 = (uint8_t*)AscendC::GmAlloc(x1FileSize);
+    uint8_t* x2 = (uint8_t*)AscendC::GmAlloc(x2FileSize);
+    uint8_t* bias = nullptr;
+    uint8_t* y = (uint8_t*)AscendC::GmAlloc(yFileSize);
+    uint8_t* workspace = (uint8_t*)AscendC::GmAlloc(workspaceSize);
+    ReadFile("../input/x1_gm.bin", x1FileSize, x1, x1FileSize);
+    ReadFile("../input/x2_gm.bin", x2FileSize, x2, x2FileSize);
+    if (IS_BIAS) {
+        bias = (uint8_t*)AscendC::GmAlloc(biasFileSize);
+        ReadFile("../input/bias_gm.bin", biasFileSize, bias, biasFileSize);
+    }
+    size_t tilingFileSize = sizeof(TCubeTiling);
+    uint8_t* tiling = (uint8_t*)AscendC::GmAlloc(tilingFileSize);
+    auto ascendcPlatform = platform_ascendc::PlatformAscendCManager::GetInstance();
+    auto aicCoreNum = ascendcPlatform->GetCoreNumAic();
+    auto aivCoreNum = ascendcPlatform->GetCoreNumAiv();
+#ifdef ENABLE_CUBE_ONLY
+    auto coreNum = aicCoreNum;
+#else
+    auto coreNum = aivCoreNum;
+#endif
+    MatmulHost::MatmulCaseParams testCaseParams{static_cast<int32_t>(coreNum),
+        static_cast<int32_t>(m), static_cast<int32_t>(n), static_cast<int32_t>(k), IS_BIAS, IS_A_TRANS, IS_B_TRANS};
+    std::cout << "[INFO] aicCoreNum = " << aicCoreNum << ", aivCoreNum = " << aivCoreNum << std::endl;
+    // Calculate Tiling
+    const auto& tilingData = MatmulHost::GenerateTiling(testCaseParams);
+    memcpy_s(tiling, tilingFileSize, &tilingData, tilingFileSize);
+    ICPU_RUN_KF(matmul_perf_custom, ascendcPlatform->GetCoreNumAic(), x1, x2, bias, y, workspace, tiling);
+    WriteFile("../output/output.bin", y, yFileSize);
+    AscendC::GmFree((void*)x1);
+    AscendC::GmFree((void*)x2);
+    AscendC::GmFree((void*)y);
+    AscendC::GmFree((void*)workspace);
+    AscendC::GmFree((void*)tiling);
+    if (IS_BIAS) {
+        AscendC::GmFree((void*)bias);
+    }
+}
+// NPU
+#else
+void MatmulOp(uint8_t* x1, uint8_t* x2, uint8_t* y, uint8_t* bias, int64_t m, int64_t n, int64_t k,
+    void* stream = nullptr)
+{
+    // Init args
+    uint8_t* workspaceDevice = nullptr;
+
+    // Query workspace size
+    size_t workspaceSize = GetSysWorkSpaceSize();
+
+    // Allocate workspace on device
+    CHECK_ACL(aclrtMalloc((void**)&workspaceDevice, workspaceSize, ACL_MEM_MALLOC_HUGE_FIRST));
+
+    // Initialize kernel with arguments and workspace pointer
+    uint8_t* tilingHost = nullptr;
+    uint8_t* tilingDevice = nullptr;
+    size_t tilingFileSize = sizeof(TCubeTiling);
+    auto ascendcPlatform = platform_ascendc::PlatformAscendCManager::GetInstance();
+    auto aicCoreNum = ascendcPlatform->GetCoreNumAic();
+    auto aivCoreNum = ascendcPlatform->GetCoreNumAiv();
+#ifdef ENABLE_CUBE_ONLY
+    auto coreNum = aicCoreNum;
+#else
+    auto coreNum = aivCoreNum;
+#endif
+    MatmulHost::MatmulCaseParams testCaseParams{static_cast<int32_t>(coreNum),
+        static_cast<int32_t>(m), static_cast<int32_t>(n), static_cast<int32_t>(k), IS_BIAS, IS_A_TRANS, IS_B_TRANS};
+    std::cout << "[INFO] aicCoreNum = " << aicCoreNum << ", aivCoreNum = " << aivCoreNum << std::endl;
+    // Calculate Tiling
+    const auto tilingData = MatmulHost::GenerateTiling(testCaseParams);
+    CHECK_ACL(aclrtMallocHost((void**)(&tilingHost), tilingFileSize));
+    CHECK_ACL(aclrtMalloc((void**)&tilingDevice, tilingFileSize,
+        ACL_MEM_MALLOC_HUGE_FIRST));
+    CHECK_ACL(aclrtMemcpy(tilingHost, tilingFileSize, &tilingData,
+        tilingFileSize, ACL_MEMCPY_HOST_TO_HOST));
+    CHECK_ACL(aclrtMemcpy(tilingDevice, tilingFileSize, tilingHost,
+        tilingFileSize, ACL_MEMCPY_HOST_TO_DEVICE));
+    // Launch kernel
+    matmul_perf_custom_do(ascendcPlatform->GetCoreNumAic(), stream, x1, x2, bias, y, workspaceDevice, tilingDevice);
+    CHECK_ACL(aclrtFreeHost(tilingHost));
+    CHECK_ACL(aclrtFree(workspaceDevice));
+    CHECK_ACL(aclrtFree(tilingDevice));
+}
+
+void TestAclInit(aclrtContext& context, aclrtStream& stream, int64_t& deviceId)
+{
+    CHECK_ACL(aclInit(nullptr));
+    CHECK_ACL(aclrtSetDevice(deviceId));
+    CHECK_ACL(aclrtCreateContext(&context, deviceId));
+    CHECK_ACL(aclrtCreateStream(&stream));
+}
+
+void TestAclDeInit(aclrtContext& context, aclrtStream& stream, int64_t& deviceId)
+{
+    CHECK_ACL(aclrtDestroyStream(stream));
+    CHECK_ACL(aclrtDestroyContext(context));
+    CHECK_ACL(aclrtResetDevice(deviceId));
+    CHECK_ACL(aclFinalize());
+}
+
+void TestMatmul(int64_t m, int64_t n, int64_t k, const MatrixFileSize& matrixFileSize)
+{
+    size_t x1FileSize = matrixFileSize.x1FileSize;
+    size_t x2FileSize = matrixFileSize.x2FileSize;
+    size_t yFileSize = matrixFileSize.yFileSize;
+    size_t biasFileSize = matrixFileSize.biasFileSize;
+
+    aclrtContext context;
+    aclrtStream stream = nullptr;
+    int64_t deviceId = 0;
+    TestAclInit(context, stream, deviceId);
+
+    uint8_t* x1Host = nullptr;
+    uint8_t* x1Device = nullptr;
+    CHECK_ACL(aclrtMallocHost((void**)(&x1Host), x1FileSize));
+    CHECK_ACL(aclrtMalloc((void**)&x1Device, x1FileSize, ACL_MEM_MALLOC_HUGE_FIRST));
+    ReadFile("../input/x1_gm.bin", x1FileSize, x1Host, x1FileSize);
+    CHECK_ACL(aclrtMemcpy(x1Device, x1FileSize, x1Host, x1FileSize, ACL_MEMCPY_HOST_TO_DEVICE));
+
+    uint8_t* x2Host = nullptr;
+    uint8_t* x2Device = nullptr;
+    CHECK_ACL(aclrtMallocHost((void**)(&x2Host), x2FileSize));
+    CHECK_ACL(aclrtMalloc((void**)&x2Device, x2FileSize, ACL_MEM_MALLOC_HUGE_FIRST));
+    ReadFile("../input/x2_gm.bin", x2FileSize, x2Host, x2FileSize);
+    CHECK_ACL(aclrtMemcpy(x2Device, x2FileSize, x2Host, x2FileSize, ACL_MEMCPY_HOST_TO_DEVICE));
+
+    uint8_t* biasHost = nullptr;
+    uint8_t* biasDevice = nullptr;
+    if (IS_BIAS) {
+        CHECK_ACL(aclrtMallocHost((void**)(&biasHost), biasFileSize));
+        CHECK_ACL(aclrtMalloc((void**)&biasDevice, biasFileSize, ACL_MEM_MALLOC_HUGE_FIRST));
+        ReadFile("../input/bias_gm.bin", biasFileSize, biasHost, biasFileSize);
+        CHECK_ACL(aclrtMemcpy(biasDevice, biasFileSize, biasHost, biasFileSize, ACL_MEMCPY_HOST_TO_DEVICE));
+    }
+    uint8_t* yHost = nullptr;
+    uint8_t* yDevice = nullptr;
+    CHECK_ACL(aclrtMallocHost((void**)(&yHost), yFileSize));
+    CHECK_ACL(aclrtMalloc((void**)&yDevice, yFileSize, ACL_MEM_MALLOC_HUGE_FIRST));
+
+    MatmulOp(x1Device, x2Device, yDevice, biasDevice, m, n, k, stream);
+    CHECK_ACL(aclrtSynchronizeStream(stream));
+
+    CHECK_ACL(aclrtMemcpy(yHost, yFileSize, yDevice, yFileSize, ACL_MEMCPY_DEVICE_TO_HOST));
+    WriteFile("../output/output.bin", yHost, yFileSize);
+
+    if (IS_BIAS) {
+        CHECK_ACL(aclrtFree(biasDevice));
+        CHECK_ACL(aclrtFreeHost(biasHost));
+    }
+    CHECK_ACL(aclrtFree(x1Device));
+    CHECK_ACL(aclrtFreeHost(x1Host));
+    CHECK_ACL(aclrtFree(x2Device));
+    CHECK_ACL(aclrtFreeHost(x2Host));
+    CHECK_ACL(aclrtFree(yDevice));
+    CHECK_ACL(aclrtFreeHost(yHost));
+    TestAclDeInit(context, stream, deviceId);
+}
+#endif
+} // namespace MatmulHost
+
+int32_t main(int32_t argc, const char* args[])
+{
+    int64_t inputParams[3] = {1, 1, 1};
+    for (int32_t i = 1; i < argc && i < 4; ++i) { // accept at most 3 positional args: M N K
+        std::stringstream ss(args[i]);
+        ss >> inputParams[i - 1];
+    }
+    auto ascendcPlatform = platform_ascendc::PlatformAscendCManager::GetInstance(SOC_VERSION);
+    int64_t M = inputParams[0];
+    int64_t N = inputParams[1];
+    int64_t K = inputParams[2];
+    MatrixFileSize matrixFileSize;
+    // uint16_t represents half
+    matrixFileSize.x1FileSize = static_cast<size_t>(M * K) * sizeof(uint16_t);
+    matrixFileSize.x2FileSize = static_cast<size_t>(K * N) * sizeof(uint16_t);
+    matrixFileSize.yFileSize = static_cast<size_t>(M * N) * sizeof(float);
+    matrixFileSize.biasFileSize = static_cast<size_t>(1 * N) * sizeof(float);
+#ifdef ASCENDC_CPU_DEBUG
+    MatmulHost::TestMatmulCpu(M, N, K, matrixFileSize);
+#else
+    MatmulHost::TestMatmul(M, N, K, matrixFileSize);
+#endif
+    return 0;
+}
diff --git a/examples/matrix/matmul_perf/op_host/matmul_perf_custom_tiling.cpp b/examples/matrix/matmul_perf/op_host/matmul_perf_custom_tiling.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..8625b8cefbd96d682fa33d74e7ef6bfe92244042
--- /dev/null
+++ b/examples/matrix/matmul_perf/op_host/matmul_perf_custom_tiling.cpp
@@ -0,0 +1,48 @@
+/*
+ * Copyright (c) Huawei Technologies Co., Ltd. 2025-2025. All rights reserved.
+ * This file is a part of the CANN Open Software.
+ * Licensed under CANN Open Software License Agreement Version 1.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ */
+
+#include "matmul_perf_custom_tiling.h"
+#include <iostream>
+
+namespace MatmulHost {
+
+TCubeTiling GenerateTiling(const MatmulCaseParams& testCaseParams)
+{
+    TCubeTiling tilingData;
+    auto ascendcPlatform = platform_ascendc::PlatformAscendCManager::GetInstance();
+    matmul_tiling::MultiCoreMatmulTiling cubeTiling(*ascendcPlatform);
+    uint32_t M = testCaseParams.m;
+    uint32_t N = testCaseParams.n;
+    uint32_t K = testCaseParams.k;
+    uint32_t blockDim = testCaseParams.usedCoreNum;
+    bool isBias = testCaseParams.isBias;
+    bool isAtrans = testCaseParams.isATrans;
+    bool isBtrans = testCaseParams.isBTrans;
+
+    cubeTiling.SetDim(blockDim);
+    cubeTiling.SetAType(matmul_tiling::TPosition::GM, matmul_tiling::CubeFormat::ND,
+        matmul_tiling::DataType::DT_FLOAT16, isAtrans);
+    cubeTiling.SetBType(matmul_tiling::TPosition::GM, matmul_tiling::CubeFormat::ND,
+        matmul_tiling::DataType::DT_FLOAT16, isBtrans);
+    cubeTiling.SetCType(matmul_tiling::TPosition::GM, matmul_tiling::CubeFormat::ND,
+        matmul_tiling::DataType::DT_FLOAT);
+    cubeTiling.SetBiasType(matmul_tiling::TPosition::GM, matmul_tiling::CubeFormat::ND,
+        matmul_tiling::DataType::DT_FLOAT);
+    cubeTiling.SetOrgShape(M, N, K);
+    cubeTiling.SetShape(-1, -1, K);
+    cubeTiling.EnableBias(isBias);
+    if (cubeTiling.GetTiling(tilingData) == -1) {
+        std::cout << "Generate tiling failed." << std::endl;
+        return {};
+    }
+    return tilingData;
+}
+
+} // namespace MatmulHost
diff --git a/examples/matrix/matmul_ibshareB/op_kernel/matmul_ibshare_custom.cpp b/examples/matrix/matmul_perf/op_host/matmul_perf_custom_tiling.h
similarity index 31%
rename from examples/matrix/matmul_ibshareB/op_kernel/matmul_ibshare_custom.cpp
rename to examples/matrix/matmul_perf/op_host/matmul_perf_custom_tiling.h
index ba4707d891c2949f007faba7b1b28ab58d770a8e..4c5e4fcadb4d59e616f49137718d43e3912c0322 100644
--- a/examples/matrix/matmul_ibshareB/op_kernel/matmul_ibshare_custom.cpp
+++ b/examples/matrix/matmul_perf/op_host/matmul_perf_custom_tiling.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2025 Huawei Technologies Co., Ltd.
+ * Copyright (c) Huawei Technologies Co., Ltd. 2025-2025. All rights reserved.
  * This file is a part of the CANN Open Software.
  * Licensed under CANN Open Software License Agreement Version 1.0 (the "License").
  * Please refer to the License for details. You may not use this file except in compliance with the License.
@@ -8,37 +8,30 @@
  * See LICENSE in the root of the software repository for the full text of the License.
*/ -#include "kernel_operator.h" -#include "lib/matmul_intf.h" -#include "matmul_ibshare_custom.h" +#ifndef EXAMPLES_MATRIX_MATMUL_PERF_OP_HOST_MATMUL_PERF_CUSTOM_TILING_H +#define EXAMPLES_MATRIX_MATMUL_PERF_OP_HOST_MATMUL_PERF_CUSTOM_TILING_H +#include "register/tilingdata_base.h" +#include "tiling/tiling_api.h" -__aicore__ inline void CopyTiling(TCubeTiling* tiling, GM_ADDR tilingGM) -{ - uint32_t* ptr = reinterpret_cast(tiling); - auto tiling32 = reinterpret_cast<__gm__ uint32_t*>(tilingGM); - - for (int i = 0; i < sizeof(TCubeTiling) / sizeof(uint32_t); i++, ptr++) { - *ptr = *(tiling32 + i); - } - return; -} +namespace MatmulHost { -extern "C" __global__ __aicore__ void matmul_ibshare_custom(GM_ADDR a, GM_ADDR b, GM_ADDR c, - GM_ADDR workspace, GM_ADDR tilingGm) +struct MatmulCaseParams { - TCubeTiling tiling; - CopyTiling(&tiling, tilingGm); - MatmulIbshareKernel matmulKernel; - AscendC::TPipe pipe; - REGIST_MATMUL_OBJ(&pipe, GetSysWorkSpacePtr(), matmulKernel.matmulObj, &tiling); - matmulKernel.Init(a, b, nullptr, c, workspace, tiling); - matmulKernel.Process(&pipe); -} + int32_t usedCoreNum; + int32_t m; + int32_t n; + int32_t k; + bool isBias; + bool isATrans; + bool isBTrans; +}; -#ifndef ASCENDC_CPU_DEBUG -void matmul_ibshare_custom_do(uint32_t blockDim, void* stream, GM_ADDR a, GM_ADDR b, - GM_ADDR c, GM_ADDR workspace, GM_ADDR tilingGm) -{ - matmul_ibshare_custom<<>>(a, b, c, workspace, tilingGm); -} -#endif \ No newline at end of file +/** + * @brief Generate matmul tiling. + * @param testCaseParams: Testcase parameters. + * @retval Generated Tiling data. + */ +TCubeTiling GenerateTiling(const MatmulCaseParams& testCaseParams); + +} // namespace MatmulHost +#endif // EXAMPLES_MATRIX_MATMUL_PERF_OP_HOST_MATMUL_PERF_CUSTOM_TILING_H diff --git a/examples/matrix/matmul_perf/op_kernel/matmul_perf_custom_kernel.cpp b/examples/matrix/matmul_perf/op_kernel/matmul_perf_custom_kernel.cpp new file mode 100644 index 0000000000000000000000000000000000000000..5218d59952f0b0cef490f29a997adacef4c54779 --- /dev/null +++ b/examples/matrix/matmul_perf/op_kernel/matmul_perf_custom_kernel.cpp @@ -0,0 +1,152 @@ +/* + * Copyright (c) Huawei Technologies Co., Ltd. 2025-2025. All rights reserved. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ + +#include "kernel_operator.h" +#include "matmul_perf_custom_kernel.h" + +namespace { +/** + * @brief Copy tiling data to TCubeTiling ptr from tiling gm addr. + * @param tiling: TCubeTiling ptr which needs to copy tiling data. + * @param tilingGM: Tiling gm addr. 
+ * @retval None
+ */
+__aicore__ inline void CopyTiling(TCubeTiling* tiling, GM_ADDR tilingGM)
+{
+    uint32_t* ptr = reinterpret_cast<uint32_t*>(tiling);
+    auto tiling32 = reinterpret_cast<__gm__ uint32_t*>(tilingGM);
+
+    for (int i = 0; i < sizeof(TCubeTiling) / sizeof(uint32_t); i++, ptr++) {
+        *ptr = *(tiling32 + i);
+    }
+    return;
+}
+}
+
+namespace MatmulPerfCustom {
+template <typename AType, typename BType, typename CType, typename BiasType>
+__aicore__ inline void MatmulKernel<AType, BType, CType, BiasType>::Init(
+    GM_ADDR a, GM_ADDR b, GM_ADDR bias, GM_ADDR c, const TCubeTiling& tiling, bool isTransA, bool isTransB)
+{
+    this->tiling = tiling;
+    if (AscendC::GetBlockIdx() >= tiling.usedCoreNum) {
+        return;
+    }
+    aGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ AType*>(a), tiling.M * tiling.Ka);
+    bGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ BType*>(b), tiling.Kb * tiling.N);
+    cGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ CType*>(c), tiling.M * tiling.N);
+    biasGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ BiasType*>(bias), tiling.N);
+
+    int32_t offsetA = 0;
+    int32_t offsetB = 0;
+    int32_t offsetC = 0;
+    int32_t offsetBias = 0;
+    this->isTransA = isTransA;
+    this->isTransB = isTransB;
+    CalcOffset(tiling, offsetA, offsetB, offsetC, offsetBias);
+    aGlobal = aGlobal[offsetA];
+    bGlobal = bGlobal[offsetB];
+    cGlobal = cGlobal[offsetC];
+    biasGlobal = biasGlobal[offsetBias];
+}
+
+template <typename AType, typename BType, typename CType, typename BiasType>
+__aicore__ inline void MatmulKernel<AType, BType, CType, BiasType>::Process(AscendC::TPipe* pipe)
+{
+    if (AscendC::GetBlockIdx() >= tiling.usedCoreNum) {
+        return;
+    }
+    // process with tail block
+    int tailM = tiling.M - mCoreIndex * tiling.singleCoreM;
+    tailM = tailM < tiling.singleCoreM ? (tailM > 0 ? tailM : tiling.singleCoreM) : tiling.singleCoreM;
+    int tailN = tiling.N - nCoreIndex * tiling.singleCoreN;
+    tailN = tailN < tiling.singleCoreN ? (tailN > 0 ? tailN : tiling.singleCoreN) : tiling.singleCoreN;
+#ifdef ENABLE_CONSTANT
+    matmulObj.SetOrgShape(tiling.M, tiling.N, tiling.Ka);
+#endif
+    matmulObj.SetSingleShape(tailM, tailN, tiling.Ka);
+    matmulObj.SetTensorA(aGlobal, isTransA);
+    matmulObj.SetTensorB(bGlobal, isTransB);
+    if (tiling.isBias) {
+        matmulObj.SetBias(biasGlobal);
+    }
+    matmulObj.IterateAll(cGlobal);
+    matmulObj.End();
+}
+
+template <typename AType, typename BType, typename CType, typename BiasType>
+__aicore__ inline void MatmulKernel<AType, BType, CType, BiasType>::CalcOffset(
+    const TCubeTiling& tiling, int32_t& offsetA, int32_t& offsetB, int32_t& offsetC, int32_t& offsetBias)
+{
+    int32_t blockIdx = AscendC::GetBlockIdx();
+    if (blockIdx >= tiling.usedCoreNum) {
+        return;
+    }
+    auto mSingleBlocks = (tiling.M + tiling.singleCoreM - 1) / tiling.singleCoreM; // split M into mSingleBlocks cores
+    mCoreIndex = blockIdx % mSingleBlocks;
+    nCoreIndex = blockIdx / mSingleBlocks;
+
+    if (isTransA) {
+        offsetA = mCoreIndex * tiling.singleCoreM;
+    } else {
+        offsetA = mCoreIndex * tiling.Ka * tiling.singleCoreM;
+    }
+    if (isTransB) {
+        offsetB = nCoreIndex * tiling.Kb * tiling.singleCoreN;
+    } else {
+        offsetB = nCoreIndex * tiling.singleCoreN;
+    }
+    offsetC = mCoreIndex * tiling.N * tiling.singleCoreM + nCoreIndex * tiling.singleCoreN;
+    offsetBias = nCoreIndex * tiling.singleCoreN;
+}
+} // namespace MatmulPerfCustom
+
+/**
+ * @brief matmul kernel function.
+ * @param a: A matrix gm addr.
+ * @param b: B matrix gm addr.
+ * @param bias: Bias matrix gm addr.
+ * @param c: C matrix gm addr.
+ * @param workspace: Temporary gm space addr required by matmul calc.
+ * @param tilingGm: Tiling data addr.
+ * @retval None
+ */
+extern "C" __global__ __aicore__ void matmul_perf_custom(GM_ADDR a, GM_ADDR b,
+    GM_ADDR bias, GM_ADDR c, GM_ADDR workspace, GM_ADDR tilingGm)
+{
+#ifdef ENABLE_CUBE_ONLY
+    if (g_coreType == AscendC::AIV) {
+        return;
+    }
+#endif
+    // prepare tiling
+    TCubeTiling tiling;
+    CopyTiling(&tiling, tilingGm);
+    // define matmul kernel
+    MatmulPerfCustom::MatmulKernel<half, half, float, float> matmulKernel;
+    AscendC::TPipe pipe;
+    // init matmul kernel, isTransA=false, isTransB=false
+    matmulKernel.Init(a, b, bias, c, tiling, false, false);
+#ifdef ENABLE_CONSTANT
+    REGIST_MATMUL_OBJ(&pipe, GetSysWorkSpacePtr(), matmulKernel.matmulObj, (TCubeTiling*)nullptr);
+#else
+    REGIST_MATMUL_OBJ(&pipe, GetSysWorkSpacePtr(), matmulKernel.matmulObj, &tiling);
+#endif
+    // matmul kernel process
+    matmulKernel.Process(&pipe);
+}
+
+#ifndef ASCENDC_CPU_DEBUG
+void matmul_perf_custom_do(uint32_t blockDim, void* stream,
+    GM_ADDR a, GM_ADDR b, GM_ADDR bias, GM_ADDR c, GM_ADDR workspace, GM_ADDR tilingGm)
+{
+    matmul_perf_custom<<<blockDim, nullptr, stream>>>(a, b, bias, c, workspace, tilingGm);
+}
+#endif
diff --git a/examples/matrix/matmul_perf/op_kernel/matmul_perf_custom_kernel.h b/examples/matrix/matmul_perf/op_kernel/matmul_perf_custom_kernel.h
new file mode 100644
index 0000000000000000000000000000000000000000..6441e48d2c5b57485011021ea2284f08d69ec519
--- /dev/null
+++ b/examples/matrix/matmul_perf/op_kernel/matmul_perf_custom_kernel.h
@@ -0,0 +1,137 @@
+/*
+ * Copyright (c) Huawei Technologies Co., Ltd. 2025-2025. All rights reserved.
+ * This file is a part of the CANN Open Software.
+ * Licensed under CANN Open Software License Agreement Version 1.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ */
+
+#ifndef EXAMPLES_MATRIX_MATMUL_PERF_OP_KERNEL_MATMUL_PERF_CUSTOM_KERNEL_H
+#define EXAMPLES_MATRIX_MATMUL_PERF_OP_KERNEL_MATMUL_PERF_CUSTOM_KERNEL_H
+#include "kernel_operator.h"
+#ifdef ENABLE_CUBE_ONLY
+#define ASCENDC_CUBE_ONLY
+#endif
+#include "lib/matmul_intf.h"
+
+namespace MatmulPerfCustom {
+
+__aicore__ inline constexpr MatmulConfig GetCustomNormCFG()
+{
+    auto mmCfg = CFG_NORM;
+    // disable unitflag for performance comparison
+    mmCfg.enUnitFlag = false;
+    return mmCfg;
+}
+
+__aicore__ inline constexpr MatmulConfig GetCustomMDLCFG()
+{
+    auto mmCfg = CFG_MDL;
+#ifdef ENABLE_MDL_UNITFLAG
+    mmCfg.enUnitFlag = true;
+#endif
+    return mmCfg;
+}
+
+constexpr int32_t MAX_M = 10000; // max M-dim shape supported by this custom matmul kernel
+constexpr int32_t MAX_N = 10000; // max N-dim shape supported by this custom matmul kernel
+constexpr int32_t MAX_K = 10000; // max K-dim shape supported by this custom matmul kernel
+constexpr int32_t BASE_M = 128;  // BASE_M * BASE_K * sizeof(typeA) <= L0A size
+constexpr int32_t BASE_N = 256;  // BASE_N * BASE_K * sizeof(typeB) <= L0B size
+constexpr int32_t BASE_K = 64;   // BASE_M * BASE_N * sizeof(typeC) <= L0C size
+constexpr MatmulShapeParams shapeParams = { MAX_M,
+                                            MAX_N,
+                                            MAX_K,
+                                            BASE_M,
+                                            BASE_N,
+                                            BASE_K };
+
+constexpr int32_t TILING_DEPTH_PARAM = 8;
+constexpr int32_t TILING_STEPK_PARAM = 4;
+constexpr int32_t TILING_STEPMN_PARAM = 1;
+
+template <class A_TYPE, class B_TYPE, class C_TYPE, class BIAS_TYPE>
+__aicore__ inline constexpr MatmulApiStaticTiling GetCustomConstantCFG()
+{
+    MatmulConfig mmCFG = GetMMConfig(shapeParams);
+    // enable unitflag for performance comparison
+    mmCFG.enUnitFlag = true;
+    auto constantCFG = AscendC::GetMatmulApiTiling<A_TYPE, B_TYPE, C_TYPE, BIAS_TYPE>(mmCFG);
+    constantCFG.depthA1 = TILING_DEPTH_PARAM;
+    constantCFG.depthB1 = TILING_DEPTH_PARAM;
+    constantCFG.stepKa = TILING_STEPK_PARAM;
+    constantCFG.stepKb = TILING_STEPK_PARAM;
+    constantCFG.stepM = TILING_STEPMN_PARAM;
+    constantCFG.stepN = TILING_STEPMN_PARAM;
+    return constantCFG;
+}
+
+template <typename AType, typename BType, typename CType, typename BiasType>
+class MatmulKernel {
+public:
+    __aicore__ inline MatmulKernel() {}
+    /**
+     * @brief Initialization before process.
+     * @param a: A matrix gm addr.
+     * @param b: B matrix gm addr.
+     * @param bias: Bias matrix gm addr.
+     * @param c: C matrix gm addr.
+     * @param tiling: Matmul tiling struct.
+     * @param isTransA: Whether A matrix is transposed.
+     * @param isTransB: Whether B matrix is transposed.
+     * @retval None
+     */
+    __aicore__ inline void Init(GM_ADDR a, GM_ADDR b, GM_ADDR bias, GM_ADDR c, const TCubeTiling& tiling,
+        bool isTransA, bool isTransB);
+    /**
+     * @brief Process matrix calculation.
+     * @param pipe: TPipe object.
+     * @retval None
+     */
+    __aicore__ inline void Process(AscendC::TPipe* pipe);
+
+    using A_TYPE = AscendC::MatmulType<AscendC::TPosition::GM, CubeFormat::ND, AType>;
+    using B_TYPE = AscendC::MatmulType<AscendC::TPosition::GM, CubeFormat::ND, BType>;
+    using C_TYPE = AscendC::MatmulType<AscendC::TPosition::GM, CubeFormat::ND, CType>;
+    using BIAS_TYPE = AscendC::MatmulType<AscendC::TPosition::GM, CubeFormat::ND, BiasType>;
+#ifdef ENABLE_CONSTANT
+    constexpr static auto CUSTOM_CFG_CONSTANT = GetCustomConstantCFG<A_TYPE, B_TYPE, C_TYPE, BIAS_TYPE>();
+    AscendC::Matmul<A_TYPE, B_TYPE, C_TYPE, BIAS_TYPE, CUSTOM_CFG_CONSTANT> matmulObj;
+#else
+#ifdef ENABLE_MDL
+    constexpr static MatmulConfig CUSTOM_CFG_MDL = GetCustomMDLCFG();
+    AscendC::Matmul<A_TYPE, B_TYPE, C_TYPE, BIAS_TYPE, CUSTOM_CFG_MDL> matmulObj;
+#else
+    constexpr static MatmulConfig CUSTOM_CFG_NORM = GetCustomNormCFG();
+    AscendC::Matmul<A_TYPE, B_TYPE, C_TYPE, BIAS_TYPE, CUSTOM_CFG_NORM> matmulObj;
+#endif
+#endif
+
+private:
+    /**
+     * @brief Calculate the gm offset based on the blockIdx.
+     * @param tiling: Matmul tiling struct.
+     * @param offsetA: Gm offset of A matrix.
+     * @param offsetB: Gm offset of B matrix.
+     * @param offsetC: Gm offset of C matrix.
+     * @param offsetBias: Gm offset of Bias matrix.
+     * @retval None
+     */
+    __aicore__ inline void CalcOffset(const TCubeTiling& tiling,
+        int32_t& offsetA, int32_t& offsetB, int32_t& offsetC, int32_t& offsetBias);
+
+    AscendC::GlobalTensor<AType> aGlobal;
+    AscendC::GlobalTensor<BType> bGlobal;
+    AscendC::GlobalTensor<CType> cGlobal;
+    AscendC::GlobalTensor<BiasType> biasGlobal;
+    TCubeTiling tiling;
+    int32_t mCoreIndex;
+    int32_t nCoreIndex;
+    bool isTransA{false};
+    bool isTransB{false};
+};
+} // namespace MatmulPerfCustom
+
+#endif // EXAMPLES_MATRIX_MATMUL_PERF_OP_KERNEL_MATMUL_PERF_CUSTOM_KERNEL_H
diff --git a/examples/matrix/matmul_perf/run.sh b/examples/matrix/matmul_perf/run.sh
new file mode 100644
index 0000000000000000000000000000000000000000..41e6d46ca10249986ed08e36e576276d884cb8de
--- /dev/null
+++ b/examples/matrix/matmul_perf/run.sh
@@ -0,0 +1,110 @@
+#!/bin/bash
+# Copyright (c) 2025 Huawei Technologies Co., Ltd.
+# This file is a part of the CANN Open Software.
+# Licensed under CANN Open Software License Agreement Version 1.0 (the "License").
+# Please refer to the License for details. You may not use this file except in compliance with the License.
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+# See LICENSE in the root of the software repository for the full text of the License.
+# ======================================================================================================================
+
+export IS_PERF="0"
+# 0: basic performance mode
+# 1: enable CUBE_ONLY based on mode 0
+# 2: enable MDL based on mode 1
+# 3: enable UnitFlag based on mode 2
+# 4: enable Constant Tiling based on mode 3
+export PERF_MODE="0"
+
+SHORT=r:,v:,p:,m:,
+LONG=run-mode:,soc-version:,perf:,perf-mode:,
+OPTS=$(getopt -a --options $SHORT --longoptions $LONG -- "$@")
+eval set -- "$OPTS"
+while :
+do
+    case "$1" in
+        (-r | --run-mode )
+            RUN_MODE="$2"
+            shift 2;;
+        (-v | --soc-version )
+            SOC_VERSION="$2"
+            shift 2;;
+        (-p | --perf )
+            IS_PERF="$2"
+            shift 2;;
+        (-m | --perf-mode )
+            PERF_MODE="$2"
+            shift 2;;
+        (--)
+            shift;
+            break;;
+        (*)
+            echo "[ERROR] Unexpected option: $1";
+            break;;
+    esac
+done
+
+# Check invalid input
+if [[ ! "${SOC_VERSION}" =~ ^Ascend910 ]]; then
+    echo "[ERROR] Unsupported SocVersion: ${SOC_VERSION}"
+    exit 1
+fi
+
+if [ "${RUN_MODE}" != "npu" ] && [ "${RUN_MODE}" != "sim" ] && [ "${RUN_MODE}" != "cpu" ]; then
+    echo "[ERROR] Unsupported RUN_MODE: ${RUN_MODE}, which can only be cpu/sim/npu."
+    exit 1
+fi
+
+if [ "${IS_PERF}" != "0" ] && [ "${IS_PERF}" != "1" ]; then
+    echo "[ERROR] Unsupported IS_PERF: ${IS_PERF}, which can only be 0 or 1."
+    exit 1
+fi
+
+# only npu mode supports is_perf = 1
+if [ "${IS_PERF}" == "1" ] && [ "${RUN_MODE}" != "npu" ]; then
+    echo "[ERROR] Unsupported IS_PERF: ${IS_PERF} while RUN_MODE is ${RUN_MODE}."
+    exit 1
+fi
+
+if ! [[ "${PERF_MODE}" =~ ^[0-4]$ ]]; then
+    echo "[ERROR] Unsupported PERF_MODE: ${PERF_MODE}, which can only be 0, 1, 2, 3, 4."
+    exit 1
+fi
+
+rm -rf build
+mkdir build
+cd build
+
+source $ASCEND_HOME_DIR/bin/setenv.bash
+export LD_LIBRARY_PATH=${ASCEND_HOME_DIR}/tools/simulator/${SOC_VERSION}/lib:$LD_LIBRARY_PATH
+set -euo pipefail
+
+cmake -DRUN_MODE=${RUN_MODE} -DSOC_VERSION=${SOC_VERSION} -DPERF_MODE=${PERF_MODE} -DASCEND_CANN_PACKAGE_PATH=${ASCEND_HOME_DIR} ..
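+# Note: the -DPERF_MODE value passed above cascades into compile definitions in
+# cmake/cpu_lib.cmake and cmake/npu_lib.cmake: a value greater than 0 adds
+# ENABLE_CUBE_ONLY, greater than 1 adds ENABLE_MDL, greater than 2 adds
+# ENABLE_MDL_UNITFLAG, and greater than 3 adds ENABLE_CONSTANT.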
+make -j16
+
+cd ../
+
+rm -rf input
+mkdir input
+rm -rf output
+mkdir output
+
+rm -rf bin/
+mkdir -p bin
+cd bin
+cp ../build/ascendc_matmul_perf_bbit ./
+
+export TF_CPP_MIN_LOG_LEVEL=3
+
+if [ "${RUN_MODE}" = "npu" ]; then
+    if [ "${IS_PERF}" = "1" ]; then
+        export ASCEND_TOOLKIT_HOME=${ASCEND_HOME_DIR}
+        python3 -u ../scripts/exec_test.py npu "perf"
+    else
+        python3 -u ../scripts/exec_test.py npu "normal"
+    fi
+elif [ "${RUN_MODE}" = "sim" ]; then
+    python3 -u ../scripts/exec_test.py sim "normal"
+elif [ "${RUN_MODE}" = "cpu" ]; then
+    python3 -u ../scripts/exec_test.py cpu "normal"
+fi
diff --git a/examples/matrix/matmul_perf/scripts/exec_test.py b/examples/matrix/matmul_perf/scripts/exec_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..2db2c9271a1f95f186ad20b17e94ea49fe092ad8
--- /dev/null
+++ b/examples/matrix/matmul_perf/scripts/exec_test.py
@@ -0,0 +1,130 @@
+#!/usr/bin/python3
+# coding=utf-8
+
+# Copyright (c) 2025 Huawei Technologies Co., Ltd.
+# This file is a part of the CANN Open Software.
+# Licensed under CANN Open Software License Agreement Version 1.0 (the "License").
+# Please refer to the License for details. You may not use this file except in compliance with the License.
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+# See LICENSE in the root of the software repository for the full text of the License.
+# ======================================================================================================================
+import os
+import sys
+import csv
+import time
+import logging
+import subprocess
+
+import numpy as np
+
+sys.path.append("../..")
+from common_scripts.gen_data import MatmulGenData
+from common_scripts.compare_data import compare_data
+from common_scripts.exec_utils import get_file_work_dir, get_case_list, get_perf_task_duration, clear_file_cache,\
+    get_process_case_cmd
+
+IS_BIAS = False
+IS_TRANS_A = False
+IS_TRANS_B = False
+PERF_RUN_TIMES = 5
+# float16 in, float32 out
+DATA_TYPE_STR = "float16_float32"
+
+logging.basicConfig(level=logging.INFO)
+
+
+class ProcessParams:
+    def __init__(self, case_name, m, n, k, b, is_perf, run_mode):
+        self.case_name = case_name
+        self.m = m
+        self.n = n
+        self.k = k
+        self.b = b
+        self.is_perf = is_perf
+        self.run_mode = run_mode
+
+
+def process_case(file_work_dir, process_params):
+    case_name, m, n, k, b, is_perf, run_mode = process_params.case_name, process_params.m, process_params.n,\
+        process_params.k, process_params.b, process_params.is_perf, process_params.run_mode
+    logging.info("[INFO] start process case [%s]" % (case_name))
+    logging.info("[INFO] IS_PERF is set [%d]" % (is_perf))
+
+    loop_num = PERF_RUN_TIMES if is_perf else 1
+    task_duration_avg = 0
+    for i in range(loop_num):
+        clear_file_cache(file_work_dir)
+        matmul_gen_data = MatmulGenData(m, n, k, b, IS_TRANS_A, IS_TRANS_B, IS_BIAS, DATA_TYPE_STR)
+        if is_perf:
+            matmul_gen_data.gen_fake_golden_data(file_work_dir)
+        else:
+            matmul_gen_data.gen_golden_data(file_work_dir)
+        params_str = f"{m} {n} {k}"
+        cmd = get_process_case_cmd(kernel_name="ascendc_matmul_perf_bbit", params_str=params_str,\
+            is_perf=is_perf, run_mode=run_mode)
+        subprocess.run(cmd)
+        if is_perf:
+            wrong_num = -1
+        else:
+            logging.info("[INFO] compare data case[%s]" % (case_name))
+            wrong_num = compare_data(file_work_dir, DATA_TYPE_STR)
+        if is_perf:
+            cur_duration = float(get_perf_task_duration("./prof_out"))
+            logging.info("[INFO] case_name: [%s], loop_num: [%s], cur_duration: [%s]\n" % (case_name, i, cur_duration))
+            task_duration_avg += cur_duration
+
+    res_data = []
+    res_data.append(case_name)
+    res_data.append(wrong_num)
+    res_data.append(b * m * n)
+    if wrong_num == -1:
+        res_data.append("None")
+    elif wrong_num / (b * m * n) > 0.001:
+        res_data.append("Fail")
+    else:
+        res_data.append("Success")
+    if is_perf:
+        task_duration_avg = task_duration_avg / loop_num
+    res_data.append(task_duration_avg)
+    return res_data
+
+
+def main():
+    args_len = len(sys.argv) - 1
+    if args_len != 2:
+        logging.error("exec_test input params error!")
+        return -1
+
+    file_work_dir = get_file_work_dir()
+    if not os.path.exists(file_work_dir):
+        logging.error("file path %s does not exist!" % (file_work_dir))
+        return -1
+
+    is_perf = False
+    if sys.argv[2] == "perf":
+        is_perf = True
+
+    case_list = get_case_list()
+    res_list = [["case_name", "wrong_num", "total_num", "result", "task_duration"]]
+    run_mode = sys.argv[1]
+    for is_process, case_name, m, n, k, b in case_list:
+        if is_process == 1:
+            process_params = ProcessParams(case_name, m, n, k, b, is_perf, run_mode)
+            res_data = process_case(file_work_dir, process_params)
+            res_list.append(res_data)
+
+    timestamp = time.time()
+    result_file_name = "result_" + str(timestamp) + ".csv"
+    with open(os.path.join(file_work_dir, "output", result_file_name), 'w', newline='', encoding='utf-8') as csvfile:
+        writer = csv.writer(csvfile)
+        writer.writerows(res_list)
+
+    logging.info("---------------RESULT---------------")
+    for res in res_list:
+        logging.info(res)
+    return 0
+
+
+if __name__ == "__main__":
+    main()
diff --git a/examples/matrix/matmul_perf/testcase/case.csv b/examples/matrix/matmul_perf/testcase/case.csv
new file mode 100644
index 0000000000000000000000000000000000000000..009720f51943f6c96c2a36919c350642fa758e4d
--- /dev/null
+++ b/examples/matrix/matmul_perf/testcase/case.csv
@@ -0,0 +1,2 @@
+1, case001, 128, 30720, 64
+1, case002, 128, 30720, 1024
\ No newline at end of file
diff --git a/examples/readme.md b/examples/readme.md
index cf59e3ac0de076ef9254156694843eff37baa64d..1701252cc245c2448fdde9b2044bfe1e0d74d099 100644
--- a/examples/readme.md
+++ b/examples/readme.md
@@ -36,7 +36,7 @@
     Computes the following formula on each row of the input tensors: zi = (xi - ∑(xi * yi)) * yi, where ∑ is a row-wise reduce sum.
-    matrix
+    matrix
     basic_block_matmul
     Implements Matmul matrix multiplication for shapes with no tail block and a fixed tiling base-block size; the formula is C = A * B + Bias.
@@ -58,11 +58,11 @@
     matmul_ibshareAB
-    Implements Matmul matrix multiplication where the A or B matrix shares the same GM address and the A and B matrices share the L1 Buffer; the formula is C = A * B + Bias.
+    Implements Matmul matrix multiplication that shares the A and B matrix data in the L1 Buffer when multiple AIVs use the same GM addresses for the A and B matrices; the formula is C = A * B + Bias.
     matmul_ibshareB
-    Implements Matmul matrix multiplication where the A or B matrix shares the same GM address and the B matrix shares the L1 Buffer; the formula is C = A * B + Bias.
+    Implements Matmul matrix multiplication that shares the B matrix data in the L1 Buffer when multiple AIVs use the same GM address for the B matrix; the formula is C = A * B + Bias.
     matmul_k_reorder_load
@@ -163,6 +163,10 @@
     matmul_int4
     Implements Matmul matrix multiplication with int4 input data types under the MDL template; the formula is C = A * B + Bias.
+
+    matmul_perf
+    A performance optimization sample: Matmul operators covering the Cube-only mode, MDL template, UnitFlag, and fully constant tiling features; the formula is C = A * B + Bias.
+
     normalization
     layernorm
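
For quick reference, here is a condensed, standalone sketch (not part of the diff above) of how the PERF_MODE compile definitions set by run.sh select the Matmul configuration in op_kernel/matmul_perf_custom_kernel.h. CFG_NORM, CFG_MDL and MatmulConfig are the Ascend C definitions provided by lib/matmul_intf.h; the SelectPerfModeCfg helper is a hypothetical wrapper introduced here only for illustration.
```cpp
// Sketch only: mirrors the #ifdef ladder in matmul_perf_custom_kernel.h.
// PERF_MODE >= 1 defines ENABLE_CUBE_ONLY (AIV cores exit early in the kernel),
// >= 2 defines ENABLE_MDL, >= 3 adds ENABLE_MDL_UNITFLAG, >= 4 adds ENABLE_CONSTANT.
__aicore__ inline constexpr MatmulConfig SelectPerfModeCfg()
{
#ifdef ENABLE_MDL
    auto cfg = CFG_MDL;        // modes 2-4: multi-data-load (MDL) template
#ifdef ENABLE_MDL_UNITFLAG
    cfg.enUnitFlag = true;     // modes 3-4: UnitFlag pipelining enabled
#endif
#else
    auto cfg = CFG_NORM;       // modes 0-1: NORM template
    cfg.enUnitFlag = false;    // keep the baseline free of UnitFlag
#endif
    return cfg;
}
```
Under ENABLE_CONSTANT the header goes one step further: GetCustomConstantCFG bakes this configuration into a MatmulApiStaticTiling, so most tiling fields become compile-time constants and the kernel registers the Matmul object with a null TCubeTiling pointer.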