From bb52ae1762fae16af0561bdcdb7fe74279e0df29 Mon Sep 17 00:00:00 2001 From: li-yuanjie-da Date: Mon, 25 Aug 2025 09:45:33 +0800 Subject: [PATCH] =?UTF-8?q?=E6=96=B0=E5=A2=9Ebmm=E7=89=B9=E6=80=A7?= =?UTF-8?q?=E6=A0=B7=E4=BE=8B=E4=BB=A5=E5=8F=8A=E6=80=A7=E8=83=BD=E6=A8=A1?= =?UTF-8?q?=E5=BC=8F=E9=80=82=E9=85=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../batch_matmul_bias_no_batch/CMakeLists.txt | 73 ++++++ .../batch_matmul_bias_no_batch/README.md | 104 ++++++++ .../cmake/cpu_lib.cmake | 31 +++ .../cmake/npu_lib.cmake | 27 +++ .../batch_matmul_bias_no_batch/main.cpp | 222 ++++++++++++++++++ ...tch_matmul_bias_no_batch_custom_tiling.cpp | 54 +++++ ...batch_matmul_bias_no_batch_custom_tiling.h | 38 +++ .../batch_matmul_bias_no_batch_custom.cpp | 87 +++++++ .../batch_matmul_bias_no_batch_custom_impl.h | 44 ++++ .../matrix/batch_matmul_bias_no_batch/run.sh | 96 ++++++++ .../scripts/exec_test.py | 118 ++++++++++ .../scripts/gen_batch_data.py | 93 ++++++++ .../testcase/case.csv | 1 + .../matrix/batch_matmul_tscm/CMakeLists.txt | 73 ++++++ examples/matrix/batch_matmul_tscm/README.md | 127 ++++++++++ .../batch_matmul_tscm/cmake/cpu_lib.cmake | 31 +++ .../batch_matmul_tscm/cmake/npu_lib.cmake | 27 +++ examples/matrix/batch_matmul_tscm/main.cpp | 222 ++++++++++++++++++ .../batch_matmul_tscm_custom_tiling.cpp | 54 +++++ .../op_host/batch_matmul_tscm_custom_tiling.h | 38 +++ .../op_kernel/batch_matmul_tscm_custom.cpp | 95 ++++++++ .../op_kernel/batch_matmul_tscm_custom_impl.h | 38 +++ examples/matrix/batch_matmul_tscm/run.sh | 96 ++++++++ .../batch_matmul_tscm/scripts/exec_test.py | 120 ++++++++++ .../scripts/gen_batch_data.py | 113 +++++++++ .../batch_matmul_tscm/testcase/case.csv | 1 + .../matrix/matmul_nbuffer33/CMakeLists.txt | 1 + examples/matrix/matmul_nbuffer33/README.md | 7 +- .../matmul_nbuffer33/cmake/cpu_lib.cmake | 4 + .../matmul_nbuffer33/cmake/npu_lib.cmake | 9 +- .../op_kernel/matmul_nbuffer33_custom_impl.h 
| 9 + examples/matrix/matmul_nbuffer33/run.sh | 13 +- examples/matrix/matmul_preload/CMakeLists.txt | 1 + examples/matrix/matmul_preload/README.md | 5 +- .../matrix/matmul_preload/cmake/cpu_lib.cmake | 1 + .../matrix/matmul_preload/cmake/npu_lib.cmake | 1 + .../op_kernel/matmul_preload_custom_impl.h | 4 + examples/matrix/matmul_preload/run.sh | 13 +- examples/matrix/matmul_splitk/CMakeLists.txt | 1 + examples/matrix/matmul_splitk/README.md | 7 +- .../matrix/matmul_splitk/cmake/cpu_lib.cmake | 4 + .../matrix/matmul_splitk/cmake/npu_lib.cmake | 9 +- .../op_host/matmul_splitk_custom_tiling.cpp | 2 + .../op_kernel/matmul_splitk_custom_impl.h | 4 + examples/matrix/matmul_splitk/run.sh | 13 +- .../matrix/matmul_splitk/testcase/case.csv | 3 +- examples/readme.md | 7 +- 47 files changed, 2115 insertions(+), 26 deletions(-) create mode 100644 examples/matrix/batch_matmul_bias_no_batch/CMakeLists.txt create mode 100644 examples/matrix/batch_matmul_bias_no_batch/README.md create mode 100644 examples/matrix/batch_matmul_bias_no_batch/cmake/cpu_lib.cmake create mode 100644 examples/matrix/batch_matmul_bias_no_batch/cmake/npu_lib.cmake create mode 100644 examples/matrix/batch_matmul_bias_no_batch/main.cpp create mode 100644 examples/matrix/batch_matmul_bias_no_batch/op_host/batch_matmul_bias_no_batch_custom_tiling.cpp create mode 100644 examples/matrix/batch_matmul_bias_no_batch/op_host/batch_matmul_bias_no_batch_custom_tiling.h create mode 100644 examples/matrix/batch_matmul_bias_no_batch/op_kernel/batch_matmul_bias_no_batch_custom.cpp create mode 100644 examples/matrix/batch_matmul_bias_no_batch/op_kernel/batch_matmul_bias_no_batch_custom_impl.h create mode 100644 examples/matrix/batch_matmul_bias_no_batch/run.sh create mode 100644 examples/matrix/batch_matmul_bias_no_batch/scripts/exec_test.py create mode 100644 examples/matrix/batch_matmul_bias_no_batch/scripts/gen_batch_data.py create mode 100644 examples/matrix/batch_matmul_bias_no_batch/testcase/case.csv create mode 
100644 examples/matrix/batch_matmul_tscm/CMakeLists.txt create mode 100644 examples/matrix/batch_matmul_tscm/README.md create mode 100644 examples/matrix/batch_matmul_tscm/cmake/cpu_lib.cmake create mode 100644 examples/matrix/batch_matmul_tscm/cmake/npu_lib.cmake create mode 100644 examples/matrix/batch_matmul_tscm/main.cpp create mode 100644 examples/matrix/batch_matmul_tscm/op_host/batch_matmul_tscm_custom_tiling.cpp create mode 100644 examples/matrix/batch_matmul_tscm/op_host/batch_matmul_tscm_custom_tiling.h create mode 100644 examples/matrix/batch_matmul_tscm/op_kernel/batch_matmul_tscm_custom.cpp create mode 100644 examples/matrix/batch_matmul_tscm/op_kernel/batch_matmul_tscm_custom_impl.h create mode 100644 examples/matrix/batch_matmul_tscm/run.sh create mode 100644 examples/matrix/batch_matmul_tscm/scripts/exec_test.py create mode 100644 examples/matrix/batch_matmul_tscm/scripts/gen_batch_data.py create mode 100644 examples/matrix/batch_matmul_tscm/testcase/case.csv diff --git a/examples/matrix/batch_matmul_bias_no_batch/CMakeLists.txt b/examples/matrix/batch_matmul_bias_no_batch/CMakeLists.txt new file mode 100644 index 00000000..bdb4aded --- /dev/null +++ b/examples/matrix/batch_matmul_bias_no_batch/CMakeLists.txt @@ -0,0 +1,73 @@ +# Copyright (c) 2025 Huawei Technologies Co., Ltd. +# This file is a part of the CANN Open Software. +# Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. 
+# ====================================================================================================================== + +cmake_minimum_required(VERSION 3.16) +project(Ascend_c) +if(${RUN_MODE}) + set(RUN_MODE "npu" CACHE STRING "cpu/sim/npu") +endif() +if (${SOC_VERSION}) + set(SOC_VERSION "Ascend910" CACHE STRING "system on chip type") +endif() + +set(ASCEND_CANN_PACKAGE_PATH "~/Ascend/ascend-toolkit/latest" CACHE STRING "ASCEND CANN package installation directory") +if(NOT CMAKE_BUILD_TYPE) + set(CMAKE_BUILD_TYPE "Debug" CACHE STRING "Build type Release/Debug (default Debug)" FORCE) +endif() + +if(CMAKE_INSTALL_PREFIX STREQUAL /usr/local) + set(CMAKE_INSTALL_PREFIX "${CMAKE_CURRENT_LIST_DIR}/out" CACHE STRING "path for install()" FORCE) +endif() + +file(GLOB KERNEL_FILES + ${CMAKE_CURRENT_SOURCE_DIR}/op_kernel/batch_matmul_bias_no_batch_custom.cpp +) + +if("${RUN_MODE}" STREQUAL "cpu") + include(cmake/cpu_lib.cmake) +elseif("${RUN_MODE}" STREQUAL "sim" OR "${RUN_MODE}" STREQUAL "npu") + include(cmake/npu_lib.cmake) +else() + message("invalid RUN_MODE: ${RUN_MODE}") +endif() + +add_executable(ascendc_batch_matmul_bias_no_batch_bbit + ${CMAKE_CURRENT_SOURCE_DIR}/main.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/op_host/batch_matmul_bias_no_batch_custom_tiling.cpp +) + +target_compile_options(ascendc_batch_matmul_bias_no_batch_bbit PRIVATE + $:-g>> + -O2 + -std=c++17 + -D_GLIBCXX_USE_CXX11_ABI=0 +) + +target_compile_definitions(ascendc_batch_matmul_bias_no_batch_bbit PRIVATE + SOC_VERSION="${SOC_VERSION}" +) + +target_include_directories(ascendc_batch_matmul_bias_no_batch_bbit PRIVATE + ${CMAKE_CURRENT_SOURCE_DIR} + $:${ASCEND_CANN_PACKAGE_PATH}/include>> + $:${ASCEND_CANN_PACKAGE_PATH}/runtime/include>> +) + +target_link_libraries(ascendc_batch_matmul_bias_no_batch_bbit PRIVATE + $,$>:host_intf_pub>> + $:tikicpulib::${SOC_VERSION}>> + $:ascendcl>> + $:c_sec>> + ascendc_kernels_${RUN_MODE} + tiling_api + register + platform + ascendalog + dl +) diff --git 
a/examples/matrix/batch_matmul_bias_no_batch/README.md b/examples/matrix/batch_matmul_bias_no_batch/README.md new file mode 100644 index 00000000..cb33b830 --- /dev/null +++ b/examples/matrix/batch_matmul_bias_no_batch/README.md @@ -0,0 +1,104 @@ + +## 概述 + +本样例介绍了调用Matmul高阶API实现BatchMatmul单算子,输入Bias矩阵不带Batch轴,即多Batch计算Matmul时,复用同一个Bias矩阵。 + +本样例以直调的方式调用算子核函数。 + +直调:核函数的基础调用方式,开发者完成算子核函数的开发和Tiling实现后,即可通过AscendCL运行时接口,完成算子的调用。包含CPU侧、NPU侧和仿真三种运行验证方法。 + +## 样例支持的产品型号为 +- Atlas A2训练系列产品/Atlas 800I A2推理产品 + +## 目录结构 +| 目录及文件 | 描述 | +|----------------------------------|----------------------| +| [cmake](cmake) | 编译工程文件 | +| [op_host](op_host) | 本样例的tiling代码实现 | +| [op_kernel](op_kernel) | 本样例的kernel代码实现 | +| [scripts](scripts) | 包含输入输出真值数据生成脚本文件和执行文件 | +| [testcase](testcase) | 用例文件,配置用例的计算shape信息 | +| CMakeLists.txt | 编译工程文件 | +| main.cpp | 主函数,调用算子的应用程序,含CPU域及NPU域调用 | +| run.sh | 编译执行脚本 | + +## 算子描述 +- 算子功能 + BatchMatmul单算子,批量处理Matmul计算,每次Matmul计算对输入的A B矩阵做矩阵乘和加bias偏置。该算子多Batch复用相同的Bias矩阵。 + +- 算子规格 + + + + + + + + + + + + +
算子类型(OpType)BatchMatmulCustom
算子输入nameshapedata typeformatisTranslayout
a-float16NDfalseNORMAL
b-float16NDtrueNORMAL
bias-floatND--
算子输出c-floatND-NORMAL
核函数名batch_matmul_bias_no_batch_custom
+ +## 算子实现介绍 +- 约束条件 + - 输入和输出的Layout类型都为NORMAL时,不支持BatchMode为SINGLE_LARGE_THAN_L1的场景。 + +- 算子Kernel实现 + - 一次完成BatchNum个Matmul矩阵乘法的运算。单次MatMul的计算公式为:C = A * B + Bias。 + - A、B为源操作数,A为左矩阵,形状为[M, K];B为右矩阵,形状为[K, N]。 + - C为目的操作数,存放矩阵乘结果的矩阵,形状为[M, N]。 + - Bias为矩阵乘偏置,形状为[1, N]。对A*B结果矩阵的每一行都采用该bias进行偏置。 + - 具体步骤: + - 创建Matmul对象。 + 创建Matmul对象时,自定义MatmulConfig参数,将其中的isBiasBatch参数设置为false,使能BatchMatmul的Bias复用功能,获得自定义的使用NORM模板的Matmul对象。 + ``` + constexpr MatmulConfigMode configMode = MatmulConfigMode::CONFIG_NORM; + constexpr MatmulBatchParams batchParams = { + false, BatchMode::BATCH_LESS_THAN_L1, false /* isBiasBatch */ + }; + constexpr MatmulConfig CFG_MM = GetMMConfig(batchParams); + AscendC::Matmul matmulObj; + ``` + - 初始化操作。 + - 设置左矩阵A 、右矩阵B、Bias。 + - 完成多batch矩阵乘操作。 + ``` + matmulObj.IterateBatch(cGlobal, batchA, batchB, false); + ``` + - 结束矩阵乘操作。 + +- 算子Tiling实现 + - Ascend C提供一组Matmul Tiling API,方便用户获取Matmul Kernel计算时所需的 Tiling参数。只需要传入A/B/C矩阵等信息,调用API接口,即可获取到TCubeTiling结构体中的相关参数。 + - 获取Tiling参数的流程如下: + - 创建一个Tiling对象。 + - 根据输入输出Layout设置单核计算的A、B、C、Bias的参数类型信息;M、N、Ka、Kb形状信息等。 + - 调用SetALayout、SetBLayout、SetCLayout、SetBatchNum设置A/B/C的Layout轴信息和最大BatchNum数。 + - 调用GetTiling接口,获取Tiling信息。 + +## 编译运行样例 + + - 配置环境变量 + + 这里的\$ASCEND_CANN_PACKAGE_PATH需要替换为CANN开发套件包安装后文件存储路径。例如:/usr/local/Ascend/ascend-toolkit/latest + ``` + export ASCEND_HOME_DIR=$ASCEND_CANN_PACKAGE_PATH + source $ASCEND_HOME_DIR/../set_env.sh + ``` + + - 编译执行 + + ``` + bash run.sh -r [RUN_MODE] -v [SOC_VERSION] -p [IS_PERF] + ``` + 其中脚本参数说明如下: + - RUN_MODE :编译执行方式,可选择CPU调试,NPU仿真,NPU上板,对应参数分别为[cpu / sim / npu]。若需要详细了解NPU仿真相关内容,请参考[《算子开发工具msProf》](https://hiascend.com/document/redirect/CannCommunityToolMsProf)中的“工具使用”章节。 + - SOC_VERSION :昇腾AI处理器型号,如果无法确定具体的[SOC_VERSION],则在安装昇腾AI处理器的服务器执行npu-smi info命令进行查询,在查询到的“Name”前增加Ascend信息,例如“Name”对应取值为xxxyy,实际配置的[SOC_VERSION]值为Ascendxxxyy。支持以下产品型号: + - Atlas A2训练系列产品/Atlas 800I A2推理产品 + - IS_PERF : 是否获取执行性能数据,当前只在NPU执行时生效,打开后性能数据显示在执行结果中,可选择关闭和开启该功能,对应参数分别为[0 / 1]。 + + 
示例如下,Ascendxxxyy请替换为实际的AI处理器型号。 + ``` + bash run.sh -r cpu -v Ascendxxxyy -p 0 + ``` \ No newline at end of file diff --git a/examples/matrix/batch_matmul_bias_no_batch/cmake/cpu_lib.cmake b/examples/matrix/batch_matmul_bias_no_batch/cmake/cpu_lib.cmake new file mode 100644 index 00000000..244469fb --- /dev/null +++ b/examples/matrix/batch_matmul_bias_no_batch/cmake/cpu_lib.cmake @@ -0,0 +1,31 @@ +# Copyright (c) 2025 Huawei Technologies Co., Ltd. +# This file is a part of the CANN Open Software. +# Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. +# ====================================================================================================================== + +if(NOT DEFINED ENV{CMAKE_PREFIX_PATH}) + set(CMAKE_PREFIX_PATH ${ASCEND_CANN_PACKAGE_PATH}/tools/tikicpulib/lib/cmake) +endif() +find_package(tikicpulib REQUIRED) + +add_library(ascendc_kernels_${RUN_MODE} SHARED + ${KERNEL_FILES} +) + +target_link_libraries(ascendc_kernels_${RUN_MODE} PRIVATE + tikicpulib::${SOC_VERSION} +) + +target_compile_options(ascendc_kernels_${RUN_MODE} PRIVATE + -g + -O0 + -std=c++17 +) + +install(TARGETS ascendc_kernels_${RUN_MODE} +DESTINATION ${CMAKE_INSTALL_LIBDIR} +) \ No newline at end of file diff --git a/examples/matrix/batch_matmul_bias_no_batch/cmake/npu_lib.cmake b/examples/matrix/batch_matmul_bias_no_batch/cmake/npu_lib.cmake new file mode 100644 index 00000000..7e1c58d6 --- /dev/null +++ b/examples/matrix/batch_matmul_bias_no_batch/cmake/npu_lib.cmake @@ -0,0 +1,27 @@ +# Copyright (c) 2025 Huawei Technologies Co., Ltd. 
+# This file is a part of the CANN Open Software. +# Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. +# ====================================================================================================================== + +if(EXISTS ${ASCEND_CANN_PACKAGE_PATH}/compiler/tikcpp/ascendc_kernel_cmake) + set(ASCENDC_CMAKE_DIR ${ASCEND_CANN_PACKAGE_PATH}/compiler/tikcpp/ascendc_kernel_cmake) +elseif(EXISTS ${ASCEND_CANN_PACKAGE_PATH}/tools/tikcpp/ascendc_kernel_cmake) + set(ASCENDC_CMAKE_DIR ${ASCEND_CANN_PACKAGE_PATH}/tools/tikcpp/ascendc_kernel_cmake) +else() + message(FATAL_ERROR "ascendc_kernel_cmake does not exist ,please check whether the cann package is installed") +endif() +include(${ASCENDC_CMAKE_DIR}/ascendc.cmake) + +ascendc_library(ascendc_kernels_${RUN_MODE} STATIC + ${KERNEL_FILES} +) + +ascendc_compile_definitions(ascendc_kernels_${RUN_MODE} PRIVATE + -DASCENDC_DUMP + -DHAVE_WORKSPACE + -DHAVE_TILING + ) diff --git a/examples/matrix/batch_matmul_bias_no_batch/main.cpp b/examples/matrix/batch_matmul_bias_no_batch/main.cpp new file mode 100644 index 00000000..cd35ab4a --- /dev/null +++ b/examples/matrix/batch_matmul_bias_no_batch/main.cpp @@ -0,0 +1,222 @@ +/** + * Copyright (c) 2025 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. 
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ + +#include +#include "../../common/data_utils.h" +#include "kernel_tiling/kernel_tiling.h" +#include "tiling/platform/platform_ascendc.h" +#include "op_host/batch_matmul_bias_no_batch_custom_tiling.h" +#ifndef ASCENDC_CPU_DEBUG +#include "acl/acl.h" +extern void batch_matmul_bias_no_batch_custom_do(uint32_t coreDim, void* stream, + uint8_t *a, uint8_t *b, uint8_t *bias, uint8_t *c, + uint8_t *workspace, uint8_t *tiling); +#else +#include "tikicpulib.h" +extern "C" void batch_matmul_bias_no_batch_custom(uint8_t *a, uint8_t *b, uint8_t *bias, uint8_t *c, + uint8_t *workspace, uint8_t *tiling); +#endif + +namespace { +constexpr bool IS_BIAS = true; +constexpr bool IS_A_TRANS = true; +constexpr bool IS_B_TRANS = false; + +struct MatrixFileSize +{ + size_t x1FileSize; + size_t x2FileSize; + size_t yFileSize; + size_t biasFileSize; +}; +} + +namespace MatmulHost { +static size_t GetSysWorkSpaceSize() +{ + auto ascendcPlatform = platform_ascendc::PlatformAscendCManager::GetInstance(); + if (ascendcPlatform == nullptr) { + return 0; + } + return static_cast(ascendcPlatform->GetLibApiWorkSpaceSize()); +} + +// CPU debug mode +#ifdef ASCENDC_CPU_DEBUG +void TestMatmulCpu(const MatmulHost::MatmulCaseParams &testCaseParams, const MatrixFileSize& matrixFileSize) +{ + size_t x1FileSize = matrixFileSize.x1FileSize; + size_t x2FileSize = matrixFileSize.x2FileSize; + size_t yFileSize = matrixFileSize.yFileSize; + size_t biasFileSize = matrixFileSize.biasFileSize; + // Query workspace size + size_t workspaceSize = GetSysWorkSpaceSize(); + uint8_t* x1 = (uint8_t*)AscendC::GmAlloc(x1FileSize); + uint8_t* x2 = (uint8_t*)AscendC::GmAlloc(x2FileSize); + uint8_t* bias; + uint8_t* y = 
(uint8_t*)AscendC::GmAlloc(yFileSize); + uint8_t* workspace = (uint8_t*)AscendC::GmAlloc(workspaceSize); + ReadFile("../input/x1_gm.bin", x1FileSize, x1, x1FileSize); + ReadFile("../input/x2_gm.bin", x2FileSize, x2, x2FileSize); + if (IS_BIAS) { + bias = (uint8_t *)AscendC::GmAlloc(biasFileSize); + ReadFile("../input/bias_gm.bin", biasFileSize, bias, biasFileSize); + } + size_t tilingFileSize = sizeof(TCubeTiling); + uint8_t* tiling = (uint8_t*)AscendC::GmAlloc(tilingFileSize); + // Calculate Tiling + const auto& tilingData = MatmulHost::GenerateTiling(testCaseParams); + memcpy_s(tiling, tilingFileSize, &tilingData, tilingFileSize); + ICPU_RUN_KF(batch_matmul_bias_no_batch_custom, tilingData.usedCoreNum, x1, x2,bias, y, workspace, tiling); + WriteFile("../output/output.bin", y, yFileSize); + AscendC::GmFree((void*)x1); + AscendC::GmFree((void*)x2); + if (IS_BIAS) { + AscendC::GmFree((void*)bias); + } + AscendC::GmFree((void*)y); + AscendC::GmFree((void*)workspace); + AscendC::GmFree((void*)tiling); +} +// NPU +#else +void MatmulOp(uint8_t* x1, uint8_t* x2, uint8_t* y, uint8_t* bias, MatmulHost::MatmulCaseParams testCaseParams, + void* stream = nullptr) +{ + // Init args + uint8_t *workspaceDevice; + + // Query workspace size + size_t workspaceSize = GetSysWorkSpaceSize(); + + // Allocate workspace on device + CHECK_ACL(aclrtMalloc((void **)&workspaceDevice, workspaceSize, ACL_MEM_MALLOC_HUGE_FIRST)); + + // Initialize kernel with arguments and workspace pointer + uint8_t* tilingHost; + uint8_t* tilingDevice; + size_t tilingFileSize = sizeof(TCubeTiling); + // Calculate Tiling + const auto tilingData = MatmulHost::GenerateTiling(testCaseParams); + CHECK_ACL(aclrtMallocHost((void **)(&tilingHost), tilingFileSize)); + CHECK_ACL(aclrtMalloc((void **)&tilingDevice, tilingFileSize, + ACL_MEM_MALLOC_HUGE_FIRST)); + CHECK_ACL(aclrtMemcpy(tilingHost, tilingFileSize, &tilingData, + tilingFileSize, ACL_MEMCPY_HOST_TO_HOST)); + CHECK_ACL(aclrtMemcpy(tilingDevice, 
tilingFileSize, tilingHost, + tilingFileSize, ACL_MEMCPY_HOST_TO_DEVICE)); + // Launch kernel + batch_matmul_bias_no_batch_custom_do(tilingData.usedCoreNum, stream, x1, x2, bias, y, workspaceDevice, tilingDevice); + CHECK_ACL(aclrtFreeHost(tilingHost)); + CHECK_ACL(aclrtFree(workspaceDevice)); + CHECK_ACL(aclrtFree(tilingDevice)); +} + +void TestAclInit(aclrtContext &context, aclrtStream &stream, int64_t &deviceId) +{ + CHECK_ACL(aclInit(nullptr)); + CHECK_ACL(aclrtSetDevice(deviceId)); + CHECK_ACL(aclrtCreateContext(&context, deviceId)); + CHECK_ACL(aclrtCreateStream(&stream)); +} + +void TestAclDeInit(aclrtContext &context, aclrtStream &stream, int64_t &deviceId) +{ + CHECK_ACL(aclrtDestroyStream(stream)); + CHECK_ACL(aclrtDestroyContext(context)); + CHECK_ACL(aclrtResetDevice(deviceId)); + CHECK_ACL(aclFinalize()); +} + +void TestMatmul(const MatmulHost::MatmulCaseParams &testCaseParams, const MatrixFileSize& matrixFileSize) +{ + size_t x1FileSize = matrixFileSize.x1FileSize; + size_t x2FileSize = matrixFileSize.x2FileSize; + size_t yFileSize = matrixFileSize.yFileSize; + size_t biasFileSize = matrixFileSize.biasFileSize; + + aclrtContext context; + aclrtStream stream = nullptr; + int64_t deviceId = 0; + TestAclInit(context, stream, deviceId); + + uint8_t *x1Host; + uint8_t *x1Device; + CHECK_ACL(aclrtMallocHost((void **)(&x1Host), x1FileSize)); + CHECK_ACL(aclrtMalloc((void **)&x1Device, x1FileSize, ACL_MEM_MALLOC_HUGE_FIRST)); + ReadFile("../input/x1_gm.bin", x1FileSize, x1Host, x1FileSize); + CHECK_ACL(aclrtMemcpy(x1Device, x1FileSize, x1Host, x1FileSize, ACL_MEMCPY_HOST_TO_DEVICE)); + + uint8_t *x2Host; + uint8_t *x2Device; + CHECK_ACL(aclrtMallocHost((void **)(&x2Host), x2FileSize)); + CHECK_ACL(aclrtMalloc((void **)&x2Device, x2FileSize, ACL_MEM_MALLOC_HUGE_FIRST)); + ReadFile("../input/x2_gm.bin", x2FileSize, x2Host, x2FileSize); + CHECK_ACL(aclrtMemcpy(x2Device, x2FileSize, x2Host, x2FileSize, ACL_MEMCPY_HOST_TO_DEVICE)); + + uint8_t *biasHost = nullptr; 
+ uint8_t *biasDevice = nullptr; + if (IS_BIAS) { + CHECK_ACL(aclrtMallocHost((void **)(&biasHost), biasFileSize)); + CHECK_ACL(aclrtMalloc((void **)&biasDevice, biasFileSize, ACL_MEM_MALLOC_HUGE_FIRST)); + ReadFile("../input/bias_gm.bin", biasFileSize, biasHost, biasFileSize); + CHECK_ACL(aclrtMemcpy(biasDevice, biasFileSize, biasHost, biasFileSize, ACL_MEMCPY_HOST_TO_DEVICE)); + } + uint8_t *yHost = nullptr; + uint8_t *yDevice = nullptr; + CHECK_ACL(aclrtMallocHost((void **)(&yHost), yFileSize)); + CHECK_ACL(aclrtMalloc((void **)&yDevice, yFileSize, ACL_MEM_MALLOC_HUGE_FIRST)); + + MatmulOp(x1Device, x2Device, yDevice, biasDevice, testCaseParams, stream); + CHECK_ACL(aclrtSynchronizeStream(stream)); + + CHECK_ACL(aclrtMemcpy(yHost, yFileSize, yDevice, yFileSize, ACL_MEMCPY_DEVICE_TO_HOST)); + WriteFile("../output/output.bin", yHost, yFileSize); + + if (IS_BIAS) { + CHECK_ACL(aclrtFree(biasDevice)); + CHECK_ACL(aclrtFreeHost(biasHost)); + } + CHECK_ACL(aclrtFree(x1Device)); + CHECK_ACL(aclrtFreeHost(x1Host)); + CHECK_ACL(aclrtFree(x2Device)); + CHECK_ACL(aclrtFreeHost(x2Host)); + CHECK_ACL(aclrtFree(yDevice)); + CHECK_ACL(aclrtFreeHost(yHost)); + TestAclDeInit(context, stream, deviceId); +} +#endif +} // namespace MatmulHost + +int32_t main(int32_t argc, const char *args[]) +{ + int32_t size = 4; + int64_t problem[size] = {1, 1, 1, 1}; + for (int32_t i = 1; i < argc && i < size + 1; ++i) { + std::stringstream ss(args[i]); + ss >> problem[i - 1]; + } + auto ascendcPlatform = platform_ascendc::PlatformAscendCManager::GetInstance(SOC_VERSION); + MatmulHost::MatmulCaseParams testCaseParams{static_cast(ascendcPlatform->GetCoreNumAic()), + static_cast(problem[0]), static_cast(problem[1]), static_cast(problem[2]), // M, N, K + IS_BIAS, IS_A_TRANS, IS_B_TRANS, + static_cast(problem[3])}; // BatchNum + MatrixFileSize matrixFileSize; + matrixFileSize.x1FileSize = static_cast(testCaseParams.m * testCaseParams.k * testCaseParams.batchNum) * sizeof(uint16_t); + 
matrixFileSize.x2FileSize = static_cast(testCaseParams.k * testCaseParams.n * testCaseParams.batchNum) * sizeof(uint16_t); + matrixFileSize.yFileSize = static_cast(testCaseParams.m * testCaseParams.n * testCaseParams.batchNum) * sizeof(float); + matrixFileSize.biasFileSize = static_cast(1 * testCaseParams.n) * sizeof(float); +#ifdef ASCENDC_CPU_DEBUG + MatmulHost::TestMatmulCpu(testCaseParams, matrixFileSize); +#else + MatmulHost::TestMatmul(testCaseParams, matrixFileSize); +#endif + return 0; +} diff --git a/examples/matrix/batch_matmul_bias_no_batch/op_host/batch_matmul_bias_no_batch_custom_tiling.cpp b/examples/matrix/batch_matmul_bias_no_batch/op_host/batch_matmul_bias_no_batch_custom_tiling.cpp new file mode 100644 index 00000000..bb80ce11 --- /dev/null +++ b/examples/matrix/batch_matmul_bias_no_batch/op_host/batch_matmul_bias_no_batch_custom_tiling.cpp @@ -0,0 +1,54 @@ +/* + * Copyright (c) 2025 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. 
+ */ + +#include "batch_matmul_bias_no_batch_custom_tiling.h" +#include + +namespace MatmulHost { + +TCubeTiling GenerateTiling(const MatmulCaseParams& testCaseParams) +{ + TCubeTiling tilingData; + auto ascendcPlatform = platform_ascendc::PlatformAscendCManager::GetInstance(); + matmul_tiling::MultiCoreMatmulTiling cubeTiling(*ascendcPlatform); + int32_t M = testCaseParams.m; + int32_t N = testCaseParams.n; + int32_t K = testCaseParams.k; + int32_t blockDim = 1; + bool isBias = testCaseParams.isBias; + bool isAtrans = testCaseParams.isATrans; + bool isBtrans = testCaseParams.isBTrans; + int32_t batchNum = testCaseParams.batchNum; + + cubeTiling.SetDim(blockDim); + cubeTiling.SetAType(matmul_tiling::TPosition::GM, matmul_tiling::CubeFormat::ND, + matmul_tiling::DataType::DT_FLOAT16, isAtrans); + cubeTiling.SetBType(matmul_tiling::TPosition::GM, matmul_tiling::CubeFormat::ND, + matmul_tiling::DataType::DT_FLOAT16, isBtrans); + cubeTiling.SetCType(matmul_tiling::TPosition::GM, matmul_tiling::CubeFormat::ND, + matmul_tiling::DataType::DT_FLOAT); + cubeTiling.SetBiasType(matmul_tiling::TPosition::GM, matmul_tiling::CubeFormat::ND, + matmul_tiling::DataType::DT_FLOAT); + cubeTiling.SetOrgShape(M, N, K); + cubeTiling.SetShape(M, N, K); + cubeTiling.EnableBias(isBias); + cubeTiling.SetBufferSpace(-1, -1, -1); + cubeTiling.SetALayout(batchNum, M, 1, 1, K); + cubeTiling.SetBLayout(batchNum, K, 1, 1, N); + cubeTiling.SetCLayout(batchNum, M, 1, 1, N); + cubeTiling.SetBatchNum(batchNum); + if (cubeTiling.GetTiling(tilingData) == -1) { + std::cout << "Generate tiling failed." 
<< std::endl; + return {}; + } + return tilingData; +} + +} // namespace MatmulHost diff --git a/examples/matrix/batch_matmul_bias_no_batch/op_host/batch_matmul_bias_no_batch_custom_tiling.h b/examples/matrix/batch_matmul_bias_no_batch/op_host/batch_matmul_bias_no_batch_custom_tiling.h new file mode 100644 index 00000000..2239eb39 --- /dev/null +++ b/examples/matrix/batch_matmul_bias_no_batch/op_host/batch_matmul_bias_no_batch_custom_tiling.h @@ -0,0 +1,38 @@ +/* + * Copyright (c) 2025 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ + +#ifndef EXAMPLES_MATRIX_BATCH_MATMUL_BIAS_NO_BATCH_OP_HOST_BATCH_MATMUL_BIAS_NO_BATCH_CUSTOM_TILING_H +#define EXAMPLES_MATRIX_BATCH_MATMUL_BIAS_NO_BATCH_OP_HOST_BATCH_MATMUL_BIAS_NO_BATCH_CUSTOM_TILING_H +#include "register/tilingdata_base.h" +#include "tiling/tiling_api.h" + +namespace MatmulHost { + +struct MatmulCaseParams +{ + int32_t usedCoreNum; + int32_t m; + int32_t n; + int32_t k; + bool isBias; + bool isATrans; + bool isBTrans; + int32_t batchNum; +}; + +/** + * @brief Generate matmul tiling. + * @param testCaseParams: Testcase parameters. + * @retval Generated Tiling data. 
+ */ +TCubeTiling GenerateTiling(const MatmulCaseParams& testCaseParams); + +} // namespace MatmulHost +#endif // EXAMPLES_MATRIX_BATCH_MATMUL_BIAS_NO_BATCH_OP_HOST_BATCH_MATMUL_BIAS_NO_BATCH_CUSTOM_TILING_H diff --git a/examples/matrix/batch_matmul_bias_no_batch/op_kernel/batch_matmul_bias_no_batch_custom.cpp b/examples/matrix/batch_matmul_bias_no_batch/op_kernel/batch_matmul_bias_no_batch_custom.cpp new file mode 100644 index 00000000..e0ae3e73 --- /dev/null +++ b/examples/matrix/batch_matmul_bias_no_batch/op_kernel/batch_matmul_bias_no_batch_custom.cpp @@ -0,0 +1,87 @@ +/* + * Copyright (c) 2025 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. 
+ */ + +#include "kernel_operator.h" +#include "lib/matmul_intf.h" +#include "batch_matmul_bias_no_batch_custom_impl.h" + +namespace BatchMatmulCustom { +__aicore__ inline void CopyTiling(TCubeTiling* tiling, GM_ADDR tilingGM) +{ + uint32_t* ptr = reinterpret_cast(tiling); + auto tiling32 = reinterpret_cast<__gm__ uint32_t*>(tilingGM); + + for (int i = 0; i < sizeof(TCubeTiling) / sizeof(uint32_t); i++, ptr++) { + *ptr = *(tiling32 + i); + } + return; +} + +template +__aicore__ inline void BatchMatmulKernel::Init(GM_ADDR a, GM_ADDR b, GM_ADDR bias, + GM_ADDR c, GM_ADDR workspace, const TCubeTiling& tiling) +{ + this->tiling = tiling; + int32_t sizeA = tiling.ALayoutInfoB * tiling.ALayoutInfoS * tiling.ALayoutInfoN * tiling.ALayoutInfoG * tiling.ALayoutInfoD; + int32_t sizeB = tiling.BLayoutInfoB * tiling.BLayoutInfoS * tiling.BLayoutInfoN * tiling.BLayoutInfoG * tiling.BLayoutInfoD; + int32_t sizeC = tiling.CLayoutInfoB * tiling.CLayoutInfoS1 * tiling.CLayoutInfoN * tiling.CLayoutInfoG * tiling.CLayoutInfoS2; + int32_t sizeBias = tiling.CLayoutInfoN * tiling.CLayoutInfoG * tiling.CLayoutInfoS2; + + aGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ aType*>(a), sizeA); + bGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ bType*>(b), sizeB); + cGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ cType*>(c), sizeC); + biasGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ biasType*>(bias), sizeBias); + + if (GetSysWorkSpacePtr() == nullptr) { + return; + } +} + +template +__aicore__ inline void BatchMatmulKernel::Process(AscendC::TPipe* pipe, int32_t batchA, int32_t batchB) +{ + matmulObj.SetTensorA(aGlobal, false); + matmulObj.SetTensorB(bGlobal, true); // B transpose + if (tiling.isBias) { + matmulObj.SetBias(biasGlobal); + } + + matmulObj.IterateBatch(cGlobal, batchA, batchB, false); +} +} // namespace BatchMatmulCustom + +extern "C" __global__ __aicore__ void batch_matmul_bias_no_batch_custom(GM_ADDR a, GM_ADDR b, GM_ADDR bias, GM_ADDR c, GM_ADDR workspace, + GM_ADDR tilingGm) 
+{ + // prepare tiling + TCubeTiling tiling; + BatchMatmulCustom::CopyTiling(&tiling, tilingGm); + // define matmul kernel + using A_TYPE = AscendC::MatmulType; + using B_TYPE = AscendC::MatmulType; + using C_TYPE = AscendC::MatmulType; + using BIAS_TYPE = AscendC::MatmulType; + BatchMatmulCustom::BatchMatmulKernel batchMatmulKernel; + AscendC::TPipe pipe; + REGIST_MATMUL_OBJ(&pipe, GetSysWorkSpacePtr(), batchMatmulKernel.matmulObj, &tiling); + // init matmul kernel + batchMatmulKernel.Init(a, b, bias, c, workspace, tiling); + // matmul kernel process + batchMatmulKernel.Process(&pipe, 3, 3); +} + +#ifndef ASCENDC_CPU_DEBUG +void batch_matmul_bias_no_batch_custom_do(uint32_t blockDim, void* stream, GM_ADDR a, GM_ADDR b, GM_ADDR bias, + GM_ADDR c, GM_ADDR workspace, GM_ADDR tilingGm) +{ + // invoke the kernel function through the <<<>>> symbol + batch_matmul_bias_no_batch_custom<<>>(a, b, bias, c, workspace, tilingGm); +} +#endif \ No newline at end of file diff --git a/examples/matrix/batch_matmul_bias_no_batch/op_kernel/batch_matmul_bias_no_batch_custom_impl.h b/examples/matrix/batch_matmul_bias_no_batch/op_kernel/batch_matmul_bias_no_batch_custom_impl.h new file mode 100644 index 00000000..70e90e27 --- /dev/null +++ b/examples/matrix/batch_matmul_bias_no_batch/op_kernel/batch_matmul_bias_no_batch_custom_impl.h @@ -0,0 +1,44 @@ +/* + * Copyright (c) 2025 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. 
+ */ + +#ifndef EXAMPLES_MATRIX_BATCH_MATMUL_BIAS_NO_BATCH_OP_KERNEL_BATCH_MATMUL_BIAS_NO_BATCH_CUSTOM_IMPL_H +#define EXAMPLES_MATRIX_BATCH_MATMUL_BIAS_NO_BATCH_OP_KERNEL_BATCH_MATMUL_BIAS_NO_BATCH_CUSTOM_IMPL_H +#include "kernel_operator.h" +#include "lib/matmul_intf.h" + +namespace BatchMatmulCustom { +constexpr MatmulConfigMode configMode = MatmulConfigMode::CONFIG_NORM; +constexpr MatmulBatchParams batchParams = { + false, BatchMode::BATCH_LESS_THAN_L1, false /* isBiasBatch */ +}; +constexpr MatmulConfig CFG_MM = GetMMConfig(batchParams); + +template +class BatchMatmulKernel { + public: + __aicore__ inline BatchMatmulKernel(){}; + __aicore__ inline void Init(GM_ADDR a, GM_ADDR b, GM_ADDR bias, GM_ADDR c, GM_ADDR workspace, const TCubeTiling& tiling); + __aicore__ inline void Process(AscendC::TPipe* pipe, int32_t batchA, int32_t batchB); + AscendC::Matmul matmulObj; + private: + __aicore__ inline void CalcOffset(int32_t blockIdx, const TCubeTiling& tiling, int32_t& offsetA, int32_t& offsetB, + int32_t& offsetC, int32_t& offsetBias); + using aType = typename A_TYPE::T; + using bType = typename B_TYPE::T; + using cType = typename C_TYPE::T; + using biasType = typename BIAS_TYPE::T; + AscendC::GlobalTensor aGlobal; + AscendC::GlobalTensor bGlobal; + AscendC::GlobalTensor cGlobal; + AscendC::GlobalTensor biasGlobal; + TCubeTiling tiling; +}; +} // namespace BatchMatmulCustom +#endif // EXAMPLES_MATRIX_BATCH_MATMUL_BIAS_NO_BATCH_OP_KERNEL_BATCH_MATMUL_BIAS_NO_BATCH_CUSTOM_IMPL_H \ No newline at end of file diff --git a/examples/matrix/batch_matmul_bias_no_batch/run.sh b/examples/matrix/batch_matmul_bias_no_batch/run.sh new file mode 100644 index 00000000..b23451b8 --- /dev/null +++ b/examples/matrix/batch_matmul_bias_no_batch/run.sh @@ -0,0 +1,96 @@ +#!/bin/bash +# Copyright (c) 2025 Huawei Technologies Co., Ltd. +# This file is a part of the CANN Open Software. +# Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). 
+# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. +# ====================================================================================================================== + +export IS_PERF="0" + +SHORT=r:,v:,p:, +LONG=run-mode:,soc-version:,perf:, +OPTS=$(getopt -a --options $SHORT --longoptions $LONG -- "$@") +eval set -- "$OPTS" +while : +do + case "$1" in + (-r | --run-mode ) + RUN_MODE="$2" + shift 2;; + (-v | --soc-version ) + SOC_VERSION="$2" + shift 2;; + (-p | --perf ) + IS_PERF="$2" + shift 2;; + (--) + shift; + break;; + (*) + echo "[ERROR] Unexpected option: $1"; + break;; + esac +done + +# Check invalid input +if [[ ! "${SOC_VERSION}" =~ ^Ascend910 ]]; then + echo "[ERROR] Unsupported SocVersion: ${SOC_VERSION}" + exit 1 +fi + +if [ "${RUN_MODE}" != "npu" ] && [ "${RUN_MODE}" != "sim" ] && [ "${RUN_MODE}" != "cpu" ]; then + echo "[ERROR] Unsupported RUN_MODE: ${RUN_MODE}, which can only be cpu/sim/npu." + exit 1 +fi + +if [ "${IS_PERF}" != "0" ] && [ "${IS_PERF}" != "1" ]; then + echo "[ERROR] Unsupported IS_PERF: ${IS_PERF}, which can only be 0 or 1." + exit 1 +fi + +# only npu mode support is_perf = 1 +if [ "${IS_PERF}" == "1" ] && [ "${RUN_MODE}" != "npu" ]; then + echo "[ERROR] Unsupported IS_PERF: ${IS_PERF} while RUN_MODE is ${RUN_MODE}." + exit 1 +fi + +rm -rf build +mkdir build +cd build + +source $ASCEND_HOME_DIR/bin/setenv.bash +export LD_LIBRARY_PATH=${ASCEND_HOME_DIR}/tools/simulator/${SOC_VERSION}/lib:$LD_LIBRARY_PATH +set -euo pipefail + +cmake -DRUN_MODE=${RUN_MODE} -DSOC_VERSION=${SOC_VERSION} -DASCEND_CANN_PACKAGE_PATH=${ASCEND_HOME_DIR} .. 
+make -j16 + +cd ../ + +rm -rf input +mkdir input +rm -rf output +mkdir output + +rm -rf bin/ +mkdir -p bin +cd bin +cp ../build/ascendc_batch_matmul_bias_no_batch_bbit ./ + +export TF_CPP_MIN_LOG_LEVEL=3 + +if [ "${RUN_MODE}" = "npu" ]; then + if [ "${IS_PERF}" = "1" ]; then + export ASCEND_TOOLKIT_HOME=${ASCEND_HOME_DIR} + python3 -u ../scripts/exec_test.py npu "perf" + else + python3 -u ../scripts/exec_test.py npu "normal" + fi +elif [ "${RUN_MODE}" = "sim" ]; then + python3 -u ../scripts/exec_test.py sim "normal" +elif [ "${RUN_MODE}" = "cpu" ]; then + python3 -u ../scripts/exec_test.py cpu "normal" +fi diff --git a/examples/matrix/batch_matmul_bias_no_batch/scripts/exec_test.py b/examples/matrix/batch_matmul_bias_no_batch/scripts/exec_test.py new file mode 100644 index 00000000..e5d99dd3 --- /dev/null +++ b/examples/matrix/batch_matmul_bias_no_batch/scripts/exec_test.py @@ -0,0 +1,118 @@ +#!/usr/bin/python3 +# coding=utf-8 + +# Copyright (c) 2025 Huawei Technologies Co., Ltd. +# This file is a part of the CANN Open Software. +# Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. 
+# ====================================================================================================================== +import os +import sys +import csv +import time +import logging +import subprocess + +import numpy as np + +sys.path.append("../..") +from gen_batch_data import MatmulGenData +from common_scripts.compare_data import compare_data +from common_scripts.exec_utils import get_file_work_dir, get_case_list, get_perf_task_duration, get_process_case_cmd + +IS_BIAS = True +IS_TRANS_A = False +IS_TRANS_B = True +DATA_TYPE_STR = "float16_float32" + +logging.basicConfig(level=logging.INFO) + + +class ProcessParams: + def __init__(self, case_name, m, n, k, b, is_perf, run_mode): + self.case_name = case_name + self.m = m + self.n = n + self.k = k + self.b = b + self.is_perf = is_perf + self.run_mode = run_mode + + +def process_case(file_work_dir, process_params): + case_name, m, n, k, b, is_perf, run_mode = process_params.case_name, process_params.m, process_params.n,\ + process_params.k, process_params.b, process_params.is_perf, process_params.run_mode + logging.info("[INFO] start process case [%s]" % (case_name)) + logging.info("[INFO] IS_PERF is set [%d]" % (is_perf)) + + matmul_gen_data = MatmulGenData(m, n, k, b, IS_TRANS_A, IS_TRANS_B, IS_BIAS, DATA_TYPE_STR) + if is_perf: + matmul_gen_data.gen_fake_golden_data(file_work_dir) + else: + matmul_gen_data.gen_golden_data(file_work_dir) + params_str = f"{m} {n} {k} {b}" + cmd = get_process_case_cmd(kernel_name="ascendc_batch_matmul_bias_no_batch_bbit", params_str=params_str,\ + is_perf=is_perf, run_mode=run_mode) + subprocess.run(cmd) + if is_perf: + wrong_num = -1 + else: + logging.info("[INFO] compare data case[%s]" % (case_name)) + wrong_num = compare_data(file_work_dir, DATA_TYPE_STR) + res_data = [] + res_data.append(case_name) + res_data.append(wrong_num) + res_data.append(m * n * b) + if wrong_num == -1: + res_data.append("None") + elif wrong_num / (m * n * b) > 0.001: + res_data.append("Fail") + else: 
+ res_data.append("Success") + if is_perf: + task_duration = get_perf_task_duration("./prof_out") + res_data.append(task_duration) + return res_data + + +def main(): + args_len = len(sys.argv) - 1 + if args_len != 2: + logging.info("[ERROR] exec_test input params error!") + return -1 + + file_work_dir = get_file_work_dir() + if not os.path.exists(file_work_dir): + logging.info("[ERROR] file path %s not exist!" % (file_work_dir)) + return -1 + + is_perf = False + if sys.argv[2] == "perf": + is_perf = True + + case_list = get_case_list() + res_list = [["case_name", "wrong_num", "total_num", "result", "task_duration"]] + run_mode = sys.argv[1] + for is_process, case_name, m, n, k, b in case_list: + if is_process == 1: + process_params = ProcessParams(case_name, m, n, k, b, is_perf, run_mode) + res_data = process_case(file_work_dir, process_params) + res_list.append(res_data) + + timestamp = time.time() + result_file_name = "result_" + str(timestamp) + ".csv" + with open(os.path.join(file_work_dir, "output", result_file_name), 'w', newline='', encoding='utf-8') as csvfile: + writer = csv.writer(csvfile) + writer.writerows(res_list) + + logging.info("---------------RESULT---------------") + for res in res_list: + logging.info(res) + return 0 + + +if __name__ == "__main__": + main() diff --git a/examples/matrix/batch_matmul_bias_no_batch/scripts/gen_batch_data.py b/examples/matrix/batch_matmul_bias_no_batch/scripts/gen_batch_data.py new file mode 100644 index 00000000..e4f25130 --- /dev/null +++ b/examples/matrix/batch_matmul_bias_no_batch/scripts/gen_batch_data.py @@ -0,0 +1,93 @@ +#!/usr/bin/python3 +# coding=utf-8 + +# Copyright (c) 2025 Huawei Technologies Co., Ltd. +# This file is a part of the CANN Open Software. +# Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. 
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. +# ====================================================================================================================== +import os +import sys +import logging + +import numpy as np +import tensorflow as tf + +IS_OUTPUT_TXT = False + + +class MatmulGenData: + def __init__(self, m, n, k, b, is_trans_a, is_trans_b, is_bias,\ + data_type_str): + self.m = m + self.n = n + self.k = k + self.b = b + self.is_trans_a = is_trans_a + self.is_trans_b = is_trans_b + self.is_bias = is_bias + self.data_type_str = data_type_str + + def gen_golden_data_fp16_fp32(self, work_dir): + src_type = np.float16 + dst_type = np.float32 + bias_gm = np.random.uniform(-1, 1, [1, self.n]).astype(dst_type) + + for idx in range(self.b): + x1_gm_left = np.random.uniform(-1, 1, [self.m, self.k]).astype(src_type) + x2_gm_right = np.random.uniform(-1, 1, [self.k, self.n]).astype(src_type) + golden_one = np.matmul(x1_gm_left.astype(dst_type), x2_gm_right.astype(dst_type)).astype(dst_type) + if self.is_bias: + golden_one = golden_one + bias_gm.astype(dst_type) + if self.is_trans_a: + x1_tmp = x1_gm_left.transpose() + else: + x1_tmp = x1_gm_left + if self.is_trans_b: + x2_tmp = x2_gm_right.transpose() + else: + x2_tmp = x2_gm_right + if idx == 0: + x1_gm = x1_tmp + x2_gm = x2_tmp + golden = golden_one + else: + x1_gm = np.vstack((x1_gm, x1_tmp)) + x2_gm = np.vstack((x2_gm, x2_tmp)) + golden = np.vstack((golden, golden_one)) + + x1_gm.tofile(work_dir + "/input/x1_gm.bin") + x2_gm.tofile(work_dir + "/input/x2_gm.bin") + if self.is_bias: + bias_gm.tofile(work_dir + "/input/bias_gm.bin") + golden.tofile(work_dir + "/output/golden.bin") + + return 0 + + def gen_golden_data(self, work_dir): + if 
self.data_type_str == "float16_float32": + self.gen_golden_data_fp16_fp32(work_dir) + else: + logging.info("[ERROR] can't support data type %s" % (self.data_type_str)) + return -1 + return 0 + + def gen_fake_golden_data(self, work_dir): + data_type_bytes_ab = 2 # float16 + data_type_bytes_c = 4 # float32 + + file_byte = self.b * self.m * self.k * data_type_bytes_ab + with open(work_dir + "/input/x1_gm.bin", 'wb') as file: + file.truncate(file_byte) + + file_byte = self.b * self.k * self.n * data_type_bytes_ab + with open(work_dir + "/input/x2_gm.bin", 'wb') as file: + file.truncate(file_byte) + + if self.is_bias: + file_byte = 1 * self.n * data_type_bytes_c + with open(work_dir + "/input/bias_gm.bin", 'wb') as file: + file.truncate(file_byte) \ No newline at end of file diff --git a/examples/matrix/batch_matmul_bias_no_batch/testcase/case.csv b/examples/matrix/batch_matmul_bias_no_batch/testcase/case.csv new file mode 100644 index 00000000..4f0f0196 --- /dev/null +++ b/examples/matrix/batch_matmul_bias_no_batch/testcase/case.csv @@ -0,0 +1 @@ +1, case001, 32, 256, 64, 3 \ No newline at end of file diff --git a/examples/matrix/batch_matmul_tscm/CMakeLists.txt b/examples/matrix/batch_matmul_tscm/CMakeLists.txt new file mode 100644 index 00000000..4efde358 --- /dev/null +++ b/examples/matrix/batch_matmul_tscm/CMakeLists.txt @@ -0,0 +1,73 @@ +# Copyright (c) 2025 Huawei Technologies Co., Ltd. +# This file is a part of the CANN Open Software. +# Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. 
+# ====================================================================================================================== + +cmake_minimum_required(VERSION 3.16) +project(Ascend_c) +if(${RUN_MODE}) + set(RUN_MODE "npu" CACHE STRING "cpu/sim/npu") +endif() +if (${SOC_VERSION}) + set(SOC_VERSION "Ascend910" CACHE STRING "system on chip type") +endif() + +set(ASCEND_CANN_PACKAGE_PATH "~/Ascend/ascend-toolkit/latest" CACHE STRING "ASCEND CANN package installation directory") +if(NOT CMAKE_BUILD_TYPE) + set(CMAKE_BUILD_TYPE "Debug" CACHE STRING "Build type Release/Debug (default Debug)" FORCE) +endif() + +if(CMAKE_INSTALL_PREFIX STREQUAL /usr/local) + set(CMAKE_INSTALL_PREFIX "${CMAKE_CURRENT_LIST_DIR}/out" CACHE STRING "path for install()" FORCE) +endif() + +file(GLOB KERNEL_FILES + ${CMAKE_CURRENT_SOURCE_DIR}/op_kernel/batch_matmul_tscm_custom.cpp +) + +if("${RUN_MODE}" STREQUAL "cpu") + include(cmake/cpu_lib.cmake) +elseif("${RUN_MODE}" STREQUAL "sim" OR "${RUN_MODE}" STREQUAL "npu") + include(cmake/npu_lib.cmake) +else() + message("invalid RUN_MODE: ${RUN_MODE}") +endif() + +add_executable(ascendc_batch_matmul_tscm_bbit + ${CMAKE_CURRENT_SOURCE_DIR}/main.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/op_host/batch_matmul_tscm_custom_tiling.cpp +) + +target_compile_options(ascendc_batch_matmul_tscm_bbit PRIVATE + $:-g>> + -O2 + -std=c++17 + -D_GLIBCXX_USE_CXX11_ABI=0 +) + +target_compile_definitions(ascendc_batch_matmul_tscm_bbit PRIVATE + SOC_VERSION="${SOC_VERSION}" +) + +target_include_directories(ascendc_batch_matmul_tscm_bbit PRIVATE + ${CMAKE_CURRENT_SOURCE_DIR} + $:${ASCEND_CANN_PACKAGE_PATH}/include>> + $:${ASCEND_CANN_PACKAGE_PATH}/runtime/include>> +) + +target_link_libraries(ascendc_batch_matmul_tscm_bbit PRIVATE + $,$>:host_intf_pub>> + $:tikicpulib::${SOC_VERSION}>> + $:ascendcl>> + $:c_sec>> + ascendc_kernels_${RUN_MODE} + tiling_api + register + platform + ascendalog + dl +) diff --git a/examples/matrix/batch_matmul_tscm/README.md 
b/examples/matrix/batch_matmul_tscm/README.md new file mode 100644 index 00000000..1cb6f68d --- /dev/null +++ b/examples/matrix/batch_matmul_tscm/README.md @@ -0,0 +1,127 @@ + +## 概述 + +本样例介绍了调用Matmul高阶API实现左矩阵为L1输入的BatchMatmul单算子,即算子中自定义TSCM输入。关于TSCM输入的具体内容请参考《[Ascend C算子开发](https://www.hiascend.com/document/redirect/CannCommunityOpdevAscendC)》中的“算子实现 > 矩阵编程(高阶API) > 特性场景 > TSCM输入场景”章节。 + +本样例以直调的方式调用算子核函数。 + +直调:核函数的基础调用方式,开发者完成算子核函数的开发和Tiling实现后,即可通过AscendCL运行时接口,完成算子的调用。包含CPU侧、NPU侧和仿真三种运行验证方法。 + +## 样例支持的产品型号为 +- Atlas A2训练系列产品/Atlas 800I A2推理产品 + +## 目录结构 +| 目录及文件 | 描述 | +|----------------------------------|----------------------| +| [cmake](cmake) | 编译工程文件 | +| [op_host](op_host) | 本样例的tiling代码实现 | +| [op_kernel](op_kernel) | 本样例的kernel代码实现 | +| [scripts](scripts) | 包含输入输出真值数据生成脚本文件和执行文件 | +| [testcase](testcase) | 用例文件,配置用例的计算shape信息 | +| CMakeLists.txt | 编译工程文件 | +| main.cpp | 主函数,调用算子的应用程序,含CPU域及NPU域调用 | +| run.sh | 编译执行脚本 | + +## 算子描述 +- 算子功能 + BatchMatmul单算子,批量处理Matmul计算,每次Matmul计算对输入的A、B矩阵做矩阵乘和加bias偏置。该算子中A矩阵使用TSCM输入。 + +- 算子规格 + + + + + + + + + + + + +
算子类型(OpType)BatchMatmulCustom
算子输入nameshapedata typeformatisTranslayout
a-float16NZtrueNORMAL
b-float16NDfalseNORMAL
bias-floatND--
算子输出c-floatND-NORMAL
核函数名batch_matmul_tscm_custom
+ +## 算子实现介绍 +- 约束条件 + - 在BatchMatmul的输入矩阵位置为L1时,输入输出的Layout只支持NORMAL。 + - TSCM输入的矩阵必须能在L1 Buffer上全载,且L1 Buffer上的数据应当为NZ格式。 + +- 算子Kernel实现 + - 一次完成BatchNum个Matmul矩阵乘法的运算。单次MatMul的计算公式为:C = A * B + Bias。 + - A、B为源操作数,A为左矩阵,形状为[M, K];B为右矩阵,形状为[K, N]。 + - C为目的操作数,存放矩阵乘结果的矩阵,形状为[M, N]。 + - Bias为矩阵乘偏置,形状为[1, N]。对A*B结果矩阵的每一行都采用该bias进行偏置。 + - 具体步骤: + - 创建Matmul对象。 + 创建Matmul对象时,将A矩阵MatmulType的Position设为TSCM,Format设为NZ。 + ``` + using A_TYPE = AscendC::MatmulType; + using B_TYPE = AscendC::MatmulType; + using C_TYPE = AscendC::MatmulType; + using BIAS_TYPE = AscendC::MatmulType; + ``` + - 初始化操作。 + - 自定义左矩阵A从GM到L1的搬运,设置左矩阵A、右矩阵B、Bias,其中左矩阵A为TSCM输入。 + ``` + AscendC::TSCM scm; + pipe->InitBuffer(scm, 1, tiling.M * tiling.Ka * sizeof(AType)); + auto scmTensor = scm.AllocTensor(); + DataCopy(scmTensor, aGlobal, tiling.M * tiling.Ka); + scm.EnQue(scmTensor); + AscendC::LocalTensor scmLocal = scm.DeQue(); + + matmulObj.SetTensorA(scmLocal); + matmulObj.SetTensorB(bGlobal); + if (tiling.isBias) { + matmulObj.SetBias(biasGlobal); + } + ``` + - 完成多batch矩阵乘操作。 + ``` + matmulObj.IterateBatch(cGlobal, batchA, batchB, false); + ``` + - 结束矩阵乘操作。 + +- 算子Tiling实现 + - Ascend C提供一组Matmul Tiling API,方便用户获取Matmul Kernel计算时所需的 Tiling参数。只需要传入A/B/C矩阵等信息,调用API接口,即可获取到TCubeTiling结构体中的相关参数。 + - 获取Tiling参数的流程如下: + - 创建一个Tiling对象。 + - 根据输入输出Layout设置单核计算的A、B、C、Bias的参数类型信息,将A矩阵MatmulType的Position设为TSCM,Format设为NZ;M、N、Ka、Kb形状信息等。 + ``` + cubeTiling.SetAType(matmul_tiling::TPosition::TSCM, matmul_tiling::CubeFormat::NZ, + matmul_tiling::DataType::DT_FLOAT16, isAtrans); + cubeTiling.SetBType(matmul_tiling::TPosition::GM, matmul_tiling::CubeFormat::ND, + matmul_tiling::DataType::DT_FLOAT16, isBtrans); + cubeTiling.SetCType(matmul_tiling::TPosition::GM, matmul_tiling::CubeFormat::ND, + matmul_tiling::DataType::DT_FLOAT); + cubeTiling.SetBiasType(matmul_tiling::TPosition::GM, matmul_tiling::CubeFormat::ND, + matmul_tiling::DataType::DT_FLOAT); + ``` + - 
调用SetALayout、SetBLayout、SetCLayout、SetBatchNum设置A/B/C的Layout轴信息和最大BatchNum数。 + - 调用GetTiling接口,获取Tiling信息。 + +## 编译运行样例 + + - 配置环境变量 + + 这里的\$ASCEND_CANN_PACKAGE_PATH需要替换为CANN开发套件包安装后文件存储路径。例如:/usr/local/Ascend/ascend-toolkit/latest + ``` + export ASCEND_HOME_DIR=$ASCEND_CANN_PACKAGE_PATH + source $ASCEND_HOME_DIR/../set_env.sh + ``` + + - 编译执行 + + ``` + bash run.sh -r [RUN_MODE] -v [SOC_VERSION] -p [IS_PERF] + ``` + 其中脚本参数说明如下: + - RUN_MODE :编译执行方式,可选择CPU调试,NPU仿真,NPU上板,对应参数分别为[cpu / sim / npu]。若需要详细了解NPU仿真相关内容,请参考[《算子开发工具msProf》](https://hiascend.com/document/redirect/CannCommunityToolMsProf)中的“工具使用”章节。 + - SOC_VERSION :昇腾AI处理器型号,如果无法确定具体的[SOC_VERSION],则在安装昇腾AI处理器的服务器执行npu-smi info命令进行查询,在查询到的“Name”前增加Ascend信息,例如“Name”对应取值为xxxyy,实际配置的[SOC_VERSION]值为Ascendxxxyy。支持以下产品型号: + - Atlas A2训练系列产品/Atlas 800I A2推理产品 + - IS_PERF : 是否获取执行性能数据,当前只在NPU执行时生效,打开后性能数据显示在执行结果中,可选择关闭和开启该功能,对应参数分别为[0 / 1]。 + + 示例如下,Ascendxxxyy请替换为实际的AI处理器型号。 + ``` + bash run.sh -r cpu -v Ascendxxxyy -p 0 + ``` \ No newline at end of file diff --git a/examples/matrix/batch_matmul_tscm/cmake/cpu_lib.cmake b/examples/matrix/batch_matmul_tscm/cmake/cpu_lib.cmake new file mode 100644 index 00000000..244469fb --- /dev/null +++ b/examples/matrix/batch_matmul_tscm/cmake/cpu_lib.cmake @@ -0,0 +1,31 @@ +# Copyright (c) 2025 Huawei Technologies Co., Ltd. +# This file is a part of the CANN Open Software. +# Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. 
+# ====================================================================================================================== + +if(NOT DEFINED ENV{CMAKE_PREFIX_PATH}) + set(CMAKE_PREFIX_PATH ${ASCEND_CANN_PACKAGE_PATH}/tools/tikicpulib/lib/cmake) +endif() +find_package(tikicpulib REQUIRED) + +add_library(ascendc_kernels_${RUN_MODE} SHARED + ${KERNEL_FILES} +) + +target_link_libraries(ascendc_kernels_${RUN_MODE} PRIVATE + tikicpulib::${SOC_VERSION} +) + +target_compile_options(ascendc_kernels_${RUN_MODE} PRIVATE + -g + -O0 + -std=c++17 +) + +install(TARGETS ascendc_kernels_${RUN_MODE} +DESTINATION ${CMAKE_INSTALL_LIBDIR} +) \ No newline at end of file diff --git a/examples/matrix/batch_matmul_tscm/cmake/npu_lib.cmake b/examples/matrix/batch_matmul_tscm/cmake/npu_lib.cmake new file mode 100644 index 00000000..7e1c58d6 --- /dev/null +++ b/examples/matrix/batch_matmul_tscm/cmake/npu_lib.cmake @@ -0,0 +1,27 @@ +# Copyright (c) 2025 Huawei Technologies Co., Ltd. +# This file is a part of the CANN Open Software. +# Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. 
+# ====================================================================================================================== + +if(EXISTS ${ASCEND_CANN_PACKAGE_PATH}/compiler/tikcpp/ascendc_kernel_cmake) + set(ASCENDC_CMAKE_DIR ${ASCEND_CANN_PACKAGE_PATH}/compiler/tikcpp/ascendc_kernel_cmake) +elseif(EXISTS ${ASCEND_CANN_PACKAGE_PATH}/tools/tikcpp/ascendc_kernel_cmake) + set(ASCENDC_CMAKE_DIR ${ASCEND_CANN_PACKAGE_PATH}/tools/tikcpp/ascendc_kernel_cmake) +else() + message(FATAL_ERROR "ascendc_kernel_cmake does not exist ,please check whether the cann package is installed") +endif() +include(${ASCENDC_CMAKE_DIR}/ascendc.cmake) + +ascendc_library(ascendc_kernels_${RUN_MODE} STATIC + ${KERNEL_FILES} +) + +ascendc_compile_definitions(ascendc_kernels_${RUN_MODE} PRIVATE + -DASCENDC_DUMP + -DHAVE_WORKSPACE + -DHAVE_TILING + ) diff --git a/examples/matrix/batch_matmul_tscm/main.cpp b/examples/matrix/batch_matmul_tscm/main.cpp new file mode 100644 index 00000000..1233f1d4 --- /dev/null +++ b/examples/matrix/batch_matmul_tscm/main.cpp @@ -0,0 +1,222 @@ +/** + * Copyright (c) 2025 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. 
+ */ + +#include +#include "../../common/data_utils.h" +#include "kernel_tiling/kernel_tiling.h" +#include "tiling/platform/platform_ascendc.h" +#include "op_host/batch_matmul_tscm_custom_tiling.h" +#ifndef ASCENDC_CPU_DEBUG +#include "acl/acl.h" +extern void batch_matmul_tscm_custom_do(uint32_t coreDim, void* stream, + uint8_t *a, uint8_t *b, uint8_t *bias, uint8_t *c, + uint8_t *workspace, uint8_t *tiling); +#else +#include "tikicpulib.h" +extern "C" void batch_matmul_tscm_custom(uint8_t *a, uint8_t *b, uint8_t *bias, uint8_t *c, + uint8_t *workspace, uint8_t *tiling); +#endif + +namespace { +constexpr bool IS_BIAS = false; +constexpr bool IS_A_TRANS = true; +constexpr bool IS_B_TRANS = false; + +struct MatrixFileSize +{ + size_t x1FileSize; + size_t x2FileSize; + size_t yFileSize; + size_t biasFileSize; +}; +} + +namespace MatmulHost { +static size_t GetSysWorkSpaceSize() +{ + auto ascendcPlatform = platform_ascendc::PlatformAscendCManager::GetInstance(); + if (ascendcPlatform == nullptr) { + return 0; + } + return static_cast(ascendcPlatform->GetLibApiWorkSpaceSize()); +} + +// CPU debug mode +#ifdef ASCENDC_CPU_DEBUG +void TestMatmulCpu(const MatmulHost::MatmulCaseParams &testCaseParams, const MatrixFileSize& matrixFileSize) +{ + size_t x1FileSize = matrixFileSize.x1FileSize; + size_t x2FileSize = matrixFileSize.x2FileSize; + size_t yFileSize = matrixFileSize.yFileSize; + size_t biasFileSize = matrixFileSize.biasFileSize; + // Query workspace size + size_t workspaceSize = GetSysWorkSpaceSize(); + uint8_t* x1 = (uint8_t*)AscendC::GmAlloc(x1FileSize); + uint8_t* x2 = (uint8_t*)AscendC::GmAlloc(x2FileSize); + uint8_t* bias; + uint8_t* y = (uint8_t*)AscendC::GmAlloc(yFileSize); + uint8_t* workspace = (uint8_t*)AscendC::GmAlloc(workspaceSize); + ReadFile("../input/x1_gm.bin", x1FileSize, x1, x1FileSize); + ReadFile("../input/x2_gm.bin", x2FileSize, x2, x2FileSize); + if (IS_BIAS) { + bias = (uint8_t *)AscendC::GmAlloc(biasFileSize); + 
ReadFile("../input/bias_gm.bin", biasFileSize, bias, biasFileSize); + } + size_t tilingFileSize = sizeof(TCubeTiling); + uint8_t* tiling = (uint8_t*)AscendC::GmAlloc(tilingFileSize); + // Calculate Tiling + const auto& tilingData = MatmulHost::GenerateTiling(testCaseParams); + memcpy_s(tiling, tilingFileSize, &tilingData, tilingFileSize); + ICPU_RUN_KF(batch_matmul_tscm_custom, tilingData.usedCoreNum, x1, x2,bias, y, workspace, tiling); + WriteFile("../output/output.bin", y, yFileSize); + AscendC::GmFree((void*)x1); + AscendC::GmFree((void*)x2); + if (IS_BIAS) { + AscendC::GmFree((void*)bias); + } + AscendC::GmFree((void*)y); + AscendC::GmFree((void*)workspace); + AscendC::GmFree((void*)tiling); +} +// NPU +#else +void MatmulOp(uint8_t* x1, uint8_t* x2, uint8_t* y, uint8_t* bias, MatmulHost::MatmulCaseParams testCaseParams, + void* stream = nullptr) +{ + // Init args + uint8_t *workspaceDevice; + + // Query workspace size + size_t workspaceSize = GetSysWorkSpaceSize(); + + // Allocate workspace on device + CHECK_ACL(aclrtMalloc((void **)&workspaceDevice, workspaceSize, ACL_MEM_MALLOC_HUGE_FIRST)); + + // Initialize kernel with arguments and workspace pointer + uint8_t* tilingHost; + uint8_t* tilingDevice; + size_t tilingFileSize = sizeof(TCubeTiling); + // Calculate Tiling + const auto tilingData = MatmulHost::GenerateTiling(testCaseParams); + CHECK_ACL(aclrtMallocHost((void **)(&tilingHost), tilingFileSize)); + CHECK_ACL(aclrtMalloc((void **)&tilingDevice, tilingFileSize, + ACL_MEM_MALLOC_HUGE_FIRST)); + CHECK_ACL(aclrtMemcpy(tilingHost, tilingFileSize, &tilingData, + tilingFileSize, ACL_MEMCPY_HOST_TO_HOST)); + CHECK_ACL(aclrtMemcpy(tilingDevice, tilingFileSize, tilingHost, + tilingFileSize, ACL_MEMCPY_HOST_TO_DEVICE)); + // Launch kernel + batch_matmul_tscm_custom_do(tilingData.usedCoreNum, stream, x1, x2, bias, y, workspaceDevice, tilingDevice); + CHECK_ACL(aclrtFreeHost(tilingHost)); + CHECK_ACL(aclrtFree(workspaceDevice)); + 
CHECK_ACL(aclrtFree(tilingDevice)); +} + +void TestAclInit(aclrtContext &context, aclrtStream &stream, int64_t &deviceId) +{ + CHECK_ACL(aclInit(nullptr)); + CHECK_ACL(aclrtSetDevice(deviceId)); + CHECK_ACL(aclrtCreateContext(&context, deviceId)); + CHECK_ACL(aclrtCreateStream(&stream)); +} + +void TestAclDeInit(aclrtContext &context, aclrtStream &stream, int64_t &deviceId) +{ + CHECK_ACL(aclrtDestroyStream(stream)); + CHECK_ACL(aclrtDestroyContext(context)); + CHECK_ACL(aclrtResetDevice(deviceId)); + CHECK_ACL(aclFinalize()); +} + +void TestMatmul(const MatmulHost::MatmulCaseParams &testCaseParams, const MatrixFileSize& matrixFileSize) +{ + size_t x1FileSize = matrixFileSize.x1FileSize; + size_t x2FileSize = matrixFileSize.x2FileSize; + size_t yFileSize = matrixFileSize.yFileSize; + size_t biasFileSize = matrixFileSize.biasFileSize; + + aclrtContext context; + aclrtStream stream = nullptr; + int64_t deviceId = 0; + TestAclInit(context, stream, deviceId); + + uint8_t *x1Host; + uint8_t *x1Device; + CHECK_ACL(aclrtMallocHost((void **)(&x1Host), x1FileSize)); + CHECK_ACL(aclrtMalloc((void **)&x1Device, x1FileSize, ACL_MEM_MALLOC_HUGE_FIRST)); + ReadFile("../input/x1_gm.bin", x1FileSize, x1Host, x1FileSize); + CHECK_ACL(aclrtMemcpy(x1Device, x1FileSize, x1Host, x1FileSize, ACL_MEMCPY_HOST_TO_DEVICE)); + + uint8_t *x2Host; + uint8_t *x2Device; + CHECK_ACL(aclrtMallocHost((void **)(&x2Host), x2FileSize)); + CHECK_ACL(aclrtMalloc((void **)&x2Device, x2FileSize, ACL_MEM_MALLOC_HUGE_FIRST)); + ReadFile("../input/x2_gm.bin", x2FileSize, x2Host, x2FileSize); + CHECK_ACL(aclrtMemcpy(x2Device, x2FileSize, x2Host, x2FileSize, ACL_MEMCPY_HOST_TO_DEVICE)); + + uint8_t *biasHost = nullptr; + uint8_t *biasDevice = nullptr; + if (IS_BIAS) { + CHECK_ACL(aclrtMallocHost((void **)(&biasHost), biasFileSize)); + CHECK_ACL(aclrtMalloc((void **)&biasDevice, biasFileSize, ACL_MEM_MALLOC_HUGE_FIRST)); + ReadFile("../input/bias_gm.bin", biasFileSize, biasHost, biasFileSize); + 
CHECK_ACL(aclrtMemcpy(biasDevice, biasFileSize, biasHost, biasFileSize, ACL_MEMCPY_HOST_TO_DEVICE)); + } + uint8_t *yHost = nullptr; + uint8_t *yDevice = nullptr; + CHECK_ACL(aclrtMallocHost((void **)(&yHost), yFileSize)); + CHECK_ACL(aclrtMalloc((void **)&yDevice, yFileSize, ACL_MEM_MALLOC_HUGE_FIRST)); + + MatmulOp(x1Device, x2Device, yDevice, biasDevice, testCaseParams, stream); + CHECK_ACL(aclrtSynchronizeStream(stream)); + + CHECK_ACL(aclrtMemcpy(yHost, yFileSize, yDevice, yFileSize, ACL_MEMCPY_DEVICE_TO_HOST)); + WriteFile("../output/output.bin", yHost, yFileSize); + + if (IS_BIAS) { + CHECK_ACL(aclrtFree(biasDevice)); + CHECK_ACL(aclrtFreeHost(biasHost)); + } + CHECK_ACL(aclrtFree(x1Device)); + CHECK_ACL(aclrtFreeHost(x1Host)); + CHECK_ACL(aclrtFree(x2Device)); + CHECK_ACL(aclrtFreeHost(x2Host)); + CHECK_ACL(aclrtFree(yDevice)); + CHECK_ACL(aclrtFreeHost(yHost)); + TestAclDeInit(context, stream, deviceId); +} +#endif +} // namespace MatmulHost + +int32_t main(int32_t argc, const char *args[]) +{ + int32_t size = 4; + int64_t problem[size] = {1, 1, 1, 1}; + for (int32_t i = 1; i < argc && i < size + 1; ++i) { + std::stringstream ss(args[i]); + ss >> problem[i - 1]; + } + auto ascendcPlatform = platform_ascendc::PlatformAscendCManager::GetInstance(SOC_VERSION); + MatmulHost::MatmulCaseParams testCaseParams{static_cast(ascendcPlatform->GetCoreNumAic()), + static_cast(problem[0]), static_cast(problem[1]), static_cast(problem[2]), // M, N, K + IS_BIAS, IS_A_TRANS, IS_B_TRANS, + static_cast(problem[3])}; // BatchNum + MatrixFileSize matrixFileSize; + matrixFileSize.x1FileSize = static_cast(testCaseParams.m * testCaseParams.k * testCaseParams.batchNum) * sizeof(uint16_t); + matrixFileSize.x2FileSize = static_cast(testCaseParams.k * testCaseParams.n * testCaseParams.batchNum) * sizeof(uint16_t); + matrixFileSize.yFileSize = static_cast(testCaseParams.m * testCaseParams.n * testCaseParams.batchNum) * sizeof(float); + matrixFileSize.biasFileSize = static_cast(1 * 
testCaseParams.n * testCaseParams.batchNum) * sizeof(float); +#ifdef ASCENDC_CPU_DEBUG + MatmulHost::TestMatmulCpu(testCaseParams, matrixFileSize); +#else + MatmulHost::TestMatmul(testCaseParams, matrixFileSize); +#endif + return 0; +} diff --git a/examples/matrix/batch_matmul_tscm/op_host/batch_matmul_tscm_custom_tiling.cpp b/examples/matrix/batch_matmul_tscm/op_host/batch_matmul_tscm_custom_tiling.cpp new file mode 100644 index 00000000..3b167ecd --- /dev/null +++ b/examples/matrix/batch_matmul_tscm/op_host/batch_matmul_tscm_custom_tiling.cpp @@ -0,0 +1,54 @@ +/* + * Copyright (c) 2025 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. 
+ */ + +#include "batch_matmul_tscm_custom_tiling.h" +#include <iostream> + +namespace MatmulHost { + +TCubeTiling GenerateTiling(const MatmulCaseParams& testCaseParams) +{ + TCubeTiling tilingData; + auto ascendcPlatform = platform_ascendc::PlatformAscendCManager::GetInstance(); + matmul_tiling::MultiCoreMatmulTiling cubeTiling(*ascendcPlatform); + int32_t M = testCaseParams.m; + int32_t N = testCaseParams.n; + int32_t K = testCaseParams.k; + int32_t blockDim = 1; + bool isBias = testCaseParams.isBias; + bool isAtrans = testCaseParams.isATrans; + bool isBtrans = testCaseParams.isBTrans; + int32_t batchNum = testCaseParams.batchNum; + + cubeTiling.SetDim(blockDim); + cubeTiling.SetAType(matmul_tiling::TPosition::TSCM, matmul_tiling::CubeFormat::NZ, + matmul_tiling::DataType::DT_FLOAT16, isAtrans); + cubeTiling.SetBType(matmul_tiling::TPosition::GM, matmul_tiling::CubeFormat::ND, + matmul_tiling::DataType::DT_FLOAT16, isBtrans); + cubeTiling.SetCType(matmul_tiling::TPosition::GM, matmul_tiling::CubeFormat::ND, + matmul_tiling::DataType::DT_FLOAT); + cubeTiling.SetBiasType(matmul_tiling::TPosition::GM, matmul_tiling::CubeFormat::ND, + matmul_tiling::DataType::DT_FLOAT); + cubeTiling.SetOrgShape(M, N, K); + cubeTiling.SetShape(M, N, K); + cubeTiling.EnableBias(isBias); + cubeTiling.SetBufferSpace(-1, -1, -1); + cubeTiling.SetALayout(batchNum, M, 1, 1, K); + cubeTiling.SetBLayout(batchNum, K, 1, 1, N); + cubeTiling.SetCLayout(batchNum, M, 1, 1, N); + cubeTiling.SetBatchNum(batchNum); + if (cubeTiling.GetTiling(tilingData) == -1) { + std::cout << "Generate tiling failed."
<< std::endl; + return {}; + } + return tilingData; +} + +} // namespace MatmulHost diff --git a/examples/matrix/batch_matmul_tscm/op_host/batch_matmul_tscm_custom_tiling.h b/examples/matrix/batch_matmul_tscm/op_host/batch_matmul_tscm_custom_tiling.h new file mode 100644 index 00000000..837eb2ae --- /dev/null +++ b/examples/matrix/batch_matmul_tscm/op_host/batch_matmul_tscm_custom_tiling.h @@ -0,0 +1,38 @@ +/* + * Copyright (c) 2025 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ + +#ifndef EXAMPLES_MATRIX_BATCH_MATMUL_TSCM_OP_HOST_BATCH_MATMUL_TSCM_CUSTOM_TILING_H +#define EXAMPLES_MATRIX_BATCH_MATMUL_TSCM_OP_HOST_BATCH_MATMUL_TSCM_CUSTOM_TILING_H +#include "register/tilingdata_base.h" +#include "tiling/tiling_api.h" + +namespace MatmulHost { + +struct MatmulCaseParams +{ + int32_t usedCoreNum; + int32_t m; + int32_t n; + int32_t k; + bool isBias; + bool isATrans; + bool isBTrans; + int32_t batchNum; +}; + +/** + * @brief Generate matmul tiling. + * @param testCaseParams: Testcase parameters. + * @retval Generated Tiling data. 
+ */ +TCubeTiling GenerateTiling(const MatmulCaseParams& testCaseParams); + +} // namespace MatmulHost +#endif // EXAMPLES_MATRIX_BATCH_MATMUL_TSCM_OP_HOST_BATCH_MATMUL_TSCM_CUSTOM_TILING_H diff --git a/examples/matrix/batch_matmul_tscm/op_kernel/batch_matmul_tscm_custom.cpp b/examples/matrix/batch_matmul_tscm/op_kernel/batch_matmul_tscm_custom.cpp new file mode 100644 index 00000000..da24952b --- /dev/null +++ b/examples/matrix/batch_matmul_tscm/op_kernel/batch_matmul_tscm_custom.cpp @@ -0,0 +1,95 @@ +/* + * Copyright (c) 2025 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. 
+ */ + +#include "kernel_operator.h" +#include "lib/matmul_intf.h" +#include "batch_matmul_tscm_custom_impl.h" + +namespace BatchMatmulCustom { +__aicore__ inline void CopyTiling(TCubeTiling* tiling, GM_ADDR tilingGM) +{ + uint32_t* ptr = reinterpret_cast(tiling); + auto tiling32 = reinterpret_cast<__gm__ uint32_t*>(tilingGM); + + for (int i = 0; i < sizeof(TCubeTiling) / sizeof(uint32_t); i++, ptr++) { + *ptr = *(tiling32 + i); + } + return; +} + +template +__aicore__ inline void BatchMatmulKernel::Init(GM_ADDR a, GM_ADDR b, GM_ADDR bias, + GM_ADDR c, GM_ADDR workspace, const TCubeTiling& tiling) +{ + this->tiling = tiling; + int32_t sizeA = tiling.ALayoutInfoB * tiling.ALayoutInfoS * tiling.ALayoutInfoN * tiling.ALayoutInfoG * tiling.ALayoutInfoD; + int32_t sizeB = tiling.BLayoutInfoB * tiling.BLayoutInfoS * tiling.BLayoutInfoN * tiling.BLayoutInfoG * tiling.BLayoutInfoD; + int32_t sizeC = tiling.CLayoutInfoB * tiling.CLayoutInfoS1 * tiling.CLayoutInfoN * tiling.CLayoutInfoG * tiling.CLayoutInfoS2; + int32_t sizeBias = tiling.CLayoutInfoB * tiling.CLayoutInfoN * tiling.CLayoutInfoG * tiling.CLayoutInfoS2; + + aGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ aType*>(a), sizeA); + bGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ bType*>(b), sizeB); + cGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ cType*>(c), sizeC); + biasGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ biasType*>(bias), sizeBias); + + if (GetSysWorkSpacePtr() == nullptr) { + return; + } +} + +template +__aicore__ inline void BatchMatmulKernel::Process(AscendC::TPipe* pipe, int32_t batchA, int32_t batchB) +{ + // Copy aMatrix from gm to tscm + AscendC::TSCM scm; + pipe->InitBuffer(scm, 1, batchA * tiling.M * tiling.Ka * sizeof(aType)); + auto scmTensor = scm.AllocTensor(); + DataCopy(scmTensor, aGlobal, batchA * tiling.M * tiling.Ka); + scm.EnQue(scmTensor); + AscendC::LocalTensor scmLocal = scm.DeQue(); + + matmulObj.SetTensorA(scmLocal, false); + matmulObj.SetTensorB(bGlobal, true); // B transpose + 
if (tiling.isBias) { + matmulObj.SetBias(biasGlobal); + } + + matmulObj.IterateBatch(cGlobal, batchA, batchB, false); +} +} // namespace BatchMatmulCustom + +extern "C" __global__ __aicore__ void batch_matmul_tscm_custom(GM_ADDR a, GM_ADDR b, GM_ADDR bias, GM_ADDR c, GM_ADDR workspace, + GM_ADDR tilingGm) +{ + // prepare tiling + TCubeTiling tiling; + BatchMatmulCustom::CopyTiling(&tiling, tilingGm); + // define matmul kernel + using A_TYPE = AscendC::MatmulType; + using B_TYPE = AscendC::MatmulType; + using C_TYPE = AscendC::MatmulType; + using BIAS_TYPE = AscendC::MatmulType; + BatchMatmulCustom::BatchMatmulKernel batchMatmulKernel; + AscendC::TPipe pipe; + REGIST_MATMUL_OBJ(&pipe, GetSysWorkSpacePtr(), batchMatmulKernel.matmulObj, &tiling); + // init matmul kernel + batchMatmulKernel.Init(a, b, bias, c, workspace, tiling); + // matmul kernel process + batchMatmulKernel.Process(&pipe, 3, 3); +} + +#ifndef ASCENDC_CPU_DEBUG +void batch_matmul_tscm_custom_do(uint32_t blockDim, void* stream, GM_ADDR a, GM_ADDR b, GM_ADDR bias, + GM_ADDR c, GM_ADDR workspace, GM_ADDR tilingGm) +{ + // invoke the kernel function through the <<<>>> symbol + batch_matmul_tscm_custom<<>>(a, b, bias, c, workspace, tilingGm); +} +#endif \ No newline at end of file diff --git a/examples/matrix/batch_matmul_tscm/op_kernel/batch_matmul_tscm_custom_impl.h b/examples/matrix/batch_matmul_tscm/op_kernel/batch_matmul_tscm_custom_impl.h new file mode 100644 index 00000000..afe5be54 --- /dev/null +++ b/examples/matrix/batch_matmul_tscm/op_kernel/batch_matmul_tscm_custom_impl.h @@ -0,0 +1,38 @@ +/* + * Copyright (c) 2025 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. 
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ + +#ifndef EXAMPLES_MATRIX_BATCH_MATMUL_TSCM_OP_KERNEL_BATCH_MATMUL_TSCM_CUSTOM_IMPL_H +#define EXAMPLES_MATRIX_BATCH_MATMUL_TSCM_OP_KERNEL_BATCH_MATMUL_TSCM_CUSTOM_IMPL_H +#include "kernel_operator.h" +#include "lib/matmul_intf.h" + +namespace BatchMatmulCustom { +template +class BatchMatmulKernel { + public: + __aicore__ inline BatchMatmulKernel(){}; + __aicore__ inline void Init(GM_ADDR a, GM_ADDR b, GM_ADDR bias, GM_ADDR c, GM_ADDR workspace, const TCubeTiling& tiling); + __aicore__ inline void Process(AscendC::TPipe* pipe, int32_t batchA, int32_t batchB); + AscendC::Matmul matmulObj; + private: + __aicore__ inline void CalcOffset(int32_t blockIdx, const TCubeTiling& tiling, int32_t& offsetA, int32_t& offsetB, + int32_t& offsetC, int32_t& offsetBias); + using aType = typename A_TYPE::T; + using bType = typename B_TYPE::T; + using cType = typename C_TYPE::T; + using biasType = typename BIAS_TYPE::T; + AscendC::GlobalTensor aGlobal; + AscendC::GlobalTensor bGlobal; + AscendC::GlobalTensor cGlobal; + AscendC::GlobalTensor biasGlobal; + TCubeTiling tiling; +}; +} // namespace BatchMatmulCustom +#endif // EXAMPLES_MATRIX_BATCH_MATMUL_TSCM_OP_KERNEL_BATCH_MATMUL_TSCM_CUSTOM_IMPL_H \ No newline at end of file diff --git a/examples/matrix/batch_matmul_tscm/run.sh b/examples/matrix/batch_matmul_tscm/run.sh new file mode 100644 index 00000000..21e667d5 --- /dev/null +++ b/examples/matrix/batch_matmul_tscm/run.sh @@ -0,0 +1,96 @@ +#!/bin/bash +# Copyright (c) 2025 Huawei Technologies Co., Ltd. +# This file is a part of the CANN Open Software. +# Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). +# Please refer to the License for details. 
You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. +# ====================================================================================================================== + +export IS_PERF="0" + +SHORT=r:,v:,p:, +LONG=run-mode:,soc-version:,perf:, +OPTS=$(getopt -a --options $SHORT --longoptions $LONG -- "$@") +eval set -- "$OPTS" +while : +do + case "$1" in + (-r | --run-mode ) + RUN_MODE="$2" + shift 2;; + (-v | --soc-version ) + SOC_VERSION="$2" + shift 2;; + (-p | --perf ) + IS_PERF="$2" + shift 2;; + (--) + shift; + break;; + (*) + echo "[ERROR] Unexpected option: $1"; + break;; + esac +done + +# Check invalid input +if [[ ! "${SOC_VERSION}" =~ ^Ascend910 ]]; then + echo "[ERROR] Unsupported SocVersion: ${SOC_VERSION}" + exit 1 +fi + +if [ "${RUN_MODE}" != "npu" ] && [ "${RUN_MODE}" != "sim" ] && [ "${RUN_MODE}" != "cpu" ]; then + echo "[ERROR] Unsupported RUN_MODE: ${RUN_MODE}, which can only be cpu/sim/npu." + exit 1 +fi + +if [ "${IS_PERF}" != "0" ] && [ "${IS_PERF}" != "1" ]; then + echo "[ERROR] Unsupported IS_PERF: ${IS_PERF}, which can only be 0 or 1." + exit 1 +fi + +# only npu mode support is_perf = 1 +if [ "${IS_PERF}" == "1" ] && [ "${RUN_MODE}" != "npu" ]; then + echo "[ERROR] Unsupported IS_PERF: ${IS_PERF} while RUN_MODE is ${RUN_MODE}." + exit 1 +fi + +rm -rf build +mkdir build +cd build + +source $ASCEND_HOME_DIR/bin/setenv.bash +export LD_LIBRARY_PATH=${ASCEND_HOME_DIR}/tools/simulator/${SOC_VERSION}/lib:$LD_LIBRARY_PATH +set -euo pipefail + +cmake -DRUN_MODE=${RUN_MODE} -DSOC_VERSION=${SOC_VERSION} -DASCEND_CANN_PACKAGE_PATH=${ASCEND_HOME_DIR} .. 
+make -j16 + +cd ../ + +rm -rf input +mkdir input +rm -rf output +mkdir output + +rm -rf bin/ +mkdir -p bin +cd bin +cp ../build/ascendc_batch_matmul_tscm_bbit ./ + +export TF_CPP_MIN_LOG_LEVEL=3 + +if [ "${RUN_MODE}" = "npu" ]; then + if [ "${IS_PERF}" = "1" ]; then + export ASCEND_TOOLKIT_HOME=${ASCEND_HOME_DIR} + python3 -u ../scripts/exec_test.py npu "perf" + else + python3 -u ../scripts/exec_test.py npu "normal" + fi +elif [ "${RUN_MODE}" = "sim" ]; then + python3 -u ../scripts/exec_test.py sim "normal" +elif [ "${RUN_MODE}" = "cpu" ]; then + python3 -u ../scripts/exec_test.py cpu "normal" +fi diff --git a/examples/matrix/batch_matmul_tscm/scripts/exec_test.py b/examples/matrix/batch_matmul_tscm/scripts/exec_test.py new file mode 100644 index 00000000..1e89845b --- /dev/null +++ b/examples/matrix/batch_matmul_tscm/scripts/exec_test.py @@ -0,0 +1,120 @@ +#!/usr/bin/python3 +# coding=utf-8 + +# Copyright (c) 2025 Huawei Technologies Co., Ltd. +# This file is a part of the CANN Open Software. +# Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. 
+# ====================================================================================================================== +import os +import sys +import csv +import time +import logging +import subprocess + +import numpy as np + +sys.path.append("../..") +from gen_batch_data import MatmulGenData +from common_scripts.compare_data import compare_data +from common_scripts.exec_utils import get_file_work_dir, get_case_list, get_perf_task_duration, get_process_case_cmd + +IS_BIAS = False +IS_TRANS_A = False +IS_TRANS_B = True +DATA_TYPE_STR = "float16_float32" +A_FORMAT = "NZ" +B_FORMAT = "ND" + +logging.basicConfig(level=logging.INFO) + + +class ProcessParams: + def __init__(self, case_name, m, n, k, b, is_perf, run_mode): + self.case_name = case_name + self.m = m + self.n = n + self.k = k + self.b = b + self.is_perf = is_perf + self.run_mode = run_mode + + +def process_case(file_work_dir, process_params): + case_name, m, n, k, b, is_perf, run_mode = process_params.case_name, process_params.m, process_params.n,\ + process_params.k, process_params.b, process_params.is_perf, process_params.run_mode + logging.info("[INFO] start process case [%s]" % (case_name)) + logging.info("[INFO] IS_PERF is set [%d]" % (is_perf)) + + matmul_gen_data = MatmulGenData(m, n, k, b, IS_TRANS_A, IS_TRANS_B, IS_BIAS, DATA_TYPE_STR) + if is_perf: + matmul_gen_data.gen_fake_golden_data(file_work_dir) + else: + matmul_gen_data.gen_golden_data(file_work_dir) + params_str = f"{m} {n} {k} {b}" + cmd = get_process_case_cmd(kernel_name="ascendc_batch_matmul_tscm_bbit", params_str=params_str,\ + is_perf=is_perf, run_mode=run_mode) + subprocess.run(cmd) + if is_perf: + wrong_num = -1 + else: + logging.info("[INFO] compare data case[%s]" % (case_name)) + wrong_num = compare_data(file_work_dir, DATA_TYPE_STR) + res_data = [] + res_data.append(case_name) + res_data.append(wrong_num) + res_data.append(m * n * b) + if wrong_num == -1: + res_data.append("None") + elif wrong_num / (m * n * b) > 0.001: + 
res_data.append("Fail") + else: + res_data.append("Success") + if is_perf: + task_duration = get_perf_task_duration("./prof_out") + res_data.append(task_duration) + return res_data + + +def main(): + args_len = len(sys.argv) - 1 + if args_len != 2: + logging.info("[ERROR] exec_test input params error!") + return -1 + + file_work_dir = get_file_work_dir() + if not os.path.exists(file_work_dir): + logging.info("[ERROR] file path %s not exist!" % (file_work_dir)) + return -1 + + is_perf = False + if sys.argv[2] == "perf": + is_perf = True + + case_list = get_case_list() + res_list = [["case_name", "wrong_num", "total_num", "result", "task_duration"]] + run_mode = sys.argv[1] + for is_process, case_name, m, n, k, b in case_list: + if is_process == 1: + process_params = ProcessParams(case_name, m, n, k, b, is_perf, run_mode) + res_data = process_case(file_work_dir, process_params) + res_list.append(res_data) + + timestamp = time.time() + result_file_name = "result_" + str(timestamp) + ".csv" + with open(os.path.join(file_work_dir, "output", result_file_name), 'w', newline='', encoding='utf-8') as csvfile: + writer = csv.writer(csvfile) + writer.writerows(res_list) + + logging.info("---------------RESULT---------------") + for res in res_list: + logging.info(res) + return 0 + + +if __name__ == "__main__": + main() diff --git a/examples/matrix/batch_matmul_tscm/scripts/gen_batch_data.py b/examples/matrix/batch_matmul_tscm/scripts/gen_batch_data.py new file mode 100644 index 00000000..ed8e61f0 --- /dev/null +++ b/examples/matrix/batch_matmul_tscm/scripts/gen_batch_data.py @@ -0,0 +1,113 @@ +#!/usr/bin/python3 +# coding=utf-8 + +# Copyright (c) 2025 Huawei Technologies Co., Ltd. +# This file is a part of the CANN Open Software. +# Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. 
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. +# ====================================================================================================================== +import os +import sys +import logging + +import numpy as np +import tensorflow as tf + +IS_OUTPUT_TXT = False + + +class MatmulGenData: + def __init__(self, m, n, k, b, is_trans_a, is_trans_b, is_bias,\ + data_type_str): + self.m = m + self.n = n + self.k = k + self.b = b + self.is_trans_a = is_trans_a + self.is_trans_b = is_trans_b + self.is_bias = is_bias + self.data_type_str = data_type_str + + @staticmethod + def gen_axes_for_transpose(offset, base): + return [x for x in range(offset)] + [x + offset for x in base] + + def nd_to_nz(self, data: np.ndarray): + ori_shape = data.shape + m_ori, n_ori = ori_shape[-2:] + batch_ori = ori_shape[:-2] + batch_num = len(batch_ori) + batch_padding = ((0, 0),) * batch_num + m0, n0 = 16, 16 + m1, n1 = (m_ori + m0 - 1) // m0, (n_ori + n0 - 1) // n0 + padding_m = m1 * m0 - m_ori + padding_n = n1 * n0 - n_ori + data = np.pad(data, (batch_padding + ((0, padding_m), (0, padding_n))), 'constant') + array_trans = self.gen_axes_for_transpose(len(data.shape) - 2, [2, 0, 1, 3]) + data = data.reshape(batch_ori + (m1, m0, n1, n0)).transpose(*array_trans) + return data + + def gen_golden_data_fp16_fp32(self, work_dir): + src_type = np.float16 + dst_type = np.float32 + bias_gm = np.random.uniform(-1, 1, [1, self.b * self.n]).astype(dst_type) + + for idx in range(self.b): + x1_gm_left = np.random.uniform(-1, 1, [self.m, self.k]).astype(src_type) + x2_gm_right = np.random.uniform(-1, 1, [self.k, self.n]).astype(src_type) + golden_one = np.matmul(x1_gm_left.astype(dst_type), x2_gm_right.astype(dst_type)).astype(dst_type) + if 
self.is_bias: + golden_one = golden_one + bias_gm[:, idx * self.n:(idx + 1) * self.n].astype(dst_type) + if self.is_trans_a: + x1_tmp = x1_gm_left.transpose() + else: + x1_tmp = x1_gm_left + x1_tmp = self.nd_to_nz(x1_tmp) + if self.is_trans_b: + x2_tmp = x2_gm_right.transpose() + else: + x2_tmp = x2_gm_right + if idx == 0: + x1_gm = x1_tmp + x2_gm = x2_tmp + golden = golden_one + else: + x1_gm = np.vstack((x1_gm, x1_tmp)) + x2_gm = np.vstack((x2_gm, x2_tmp)) + golden = np.vstack((golden, golden_one)) + + x1_gm.tofile(work_dir + "/input/x1_gm.bin") + x2_gm.tofile(work_dir + "/input/x2_gm.bin") + if self.is_bias: + bias_gm.tofile(work_dir + "/input/bias_gm.bin") + golden.tofile(work_dir + "/output/golden.bin") + + return 0 + + def gen_golden_data(self, work_dir): + if self.data_type_str == "float16_float32": + self.gen_golden_data_fp16_fp32(work_dir) + else: + logging.info("[ERROR] can't support data type %s" % (self.data_type_str)) + return -1 + return 0 + + def gen_fake_golden_data(self, work_dir): + data_type_bytes_ab = 2 # float16 + data_type_bytes_c = 4 # float32 + + file_byte = self.b * self.m * self.k * data_type_bytes_ab + with open(work_dir + "/input/x1_gm.bin", 'wb') as file: + file.truncate(file_byte) + + file_byte = self.b * self.k * self.n * data_type_bytes_ab + with open(work_dir + "/input/x2_gm.bin", 'wb') as file: + file.truncate(file_byte) + + if self.is_bias: + file_byte = self.b * 1 * self.n * data_type_bytes_c + with open(work_dir + "/input/bias_gm.bin", 'wb') as file: + file.truncate(file_byte) \ No newline at end of file diff --git a/examples/matrix/batch_matmul_tscm/testcase/case.csv b/examples/matrix/batch_matmul_tscm/testcase/case.csv new file mode 100644 index 00000000..4f0f0196 --- /dev/null +++ b/examples/matrix/batch_matmul_tscm/testcase/case.csv @@ -0,0 +1 @@ +1, case001, 32, 256, 64, 3 \ No newline at end of file diff --git a/examples/matrix/matmul_nbuffer33/CMakeLists.txt b/examples/matrix/matmul_nbuffer33/CMakeLists.txt index 7555bf6d..93483b92 100644 ---
a/examples/matrix/matmul_nbuffer33/CMakeLists.txt +++ b/examples/matrix/matmul_nbuffer33/CMakeLists.txt @@ -50,6 +50,7 @@ target_compile_options(ascendc_matmul_nbuffer33_bbit PRIVATE ) target_compile_definitions(ascendc_matmul_nbuffer33_bbit PRIVATE + $<$:ENABLE_NBUFFER33_FEATURE> SOC_VERSION="${SOC_VERSION}" ) diff --git a/examples/matrix/matmul_nbuffer33/README.md b/examples/matrix/matmul_nbuffer33/README.md index 4d60f7ba..9eb7f5f6 100644 --- a/examples/matrix/matmul_nbuffer33/README.md +++ b/examples/matrix/matmul_nbuffer33/README.md @@ -94,15 +94,16 @@ - 编译执行 ``` - bash run.sh -r [RUN_MODE] -v [SOC_VERSION] -p [IS_PERF] + bash run.sh -r [RUN_MODE] -v [SOC_VERSION] -p [IS_PERF] -e [ENABLE_FEATURE] ``` 其中脚本参数说明如下: - RUN_MODE :编译执行方式,可选择CPU调试,NPU仿真,NPU上板,对应参数分别为[cpu / sim / npu]。若需要详细了解NPU仿真相关内容,请参考[《算子开发工具msProf》](https://hiascend.com/document/redirect/CannCommunityToolMsProf)中的“工具使用”章节。 - SOC_VERSION :昇腾AI处理器型号,如果无法确定具体的[SOC_VERSION],则在安装昇腾AI处理器的服务器执行npu-smi info命令进行查询,在查询到的“Name”前增加Ascend信息,例如“Name”对应取值为xxxyy,实际配置的[SOC_VERSION]值为Ascendxxxyy。支持以下产品型号: - Atlas A2训练系列产品/Atlas 800I A2推理产品 - - IS_PERF : 是否获取执行性能数据,当前只在NPU执行时生效,打开后性能数据显示在执行结果中,可选择关闭和开启该功能,对应参数分别为[0 / 1]。 + - IS_PERF :是否获取执行性能数据,当前只在NPU执行时生效,打开后性能数据显示在执行结果中,可选择关闭和开启该功能,对应参数分别为[0 / 1]。 + - ENABLE_FEATURE :是否使能NBuffer33功能,可选择关闭和开启该功能,对应参数分别为[0 / 1],默认开启。 示例如下,Ascendxxxyy请替换为实际的AI处理器型号。 ``` - bash run.sh -r cpu -v Ascendxxxyy -p 0 + bash run.sh -r cpu -v Ascendxxxyy -p 0 -e 1 ``` \ No newline at end of file diff --git a/examples/matrix/matmul_nbuffer33/cmake/cpu_lib.cmake b/examples/matrix/matmul_nbuffer33/cmake/cpu_lib.cmake index 583dd703..56b4c9d9 100644 --- a/examples/matrix/matmul_nbuffer33/cmake/cpu_lib.cmake +++ b/examples/matrix/matmul_nbuffer33/cmake/cpu_lib.cmake @@ -20,6 +20,10 @@ target_link_libraries(ascendc_kernels_${RUN_MODE} PRIVATE tikicpulib::${SOC_VERSION} ) +target_compile_definitions(ascendc_kernels_${RUN_MODE} PRIVATE + $<$:ENABLE_NBUFFER33_FEATURE> +) + 
target_compile_options(ascendc_kernels_${RUN_MODE} PRIVATE -g -O2 diff --git a/examples/matrix/matmul_nbuffer33/cmake/npu_lib.cmake b/examples/matrix/matmul_nbuffer33/cmake/npu_lib.cmake index 927d2490..d7a7e292 100644 --- a/examples/matrix/matmul_nbuffer33/cmake/npu_lib.cmake +++ b/examples/matrix/matmul_nbuffer33/cmake/npu_lib.cmake @@ -21,7 +21,8 @@ ascendc_library(ascendc_kernels_${RUN_MODE} STATIC ) ascendc_compile_definitions(ascendc_kernels_${RUN_MODE} PRIVATE - -DASCENDC_DUMP - -DHAVE_WORKSPACE - -DHAVE_TILING - ) \ No newline at end of file + $<$:ENABLE_NBUFFER33_FEATURE> + -DASCENDC_DUMP + -DHAVE_WORKSPACE + -DHAVE_TILING +) \ No newline at end of file diff --git a/examples/matrix/matmul_nbuffer33/op_kernel/matmul_nbuffer33_custom_impl.h b/examples/matrix/matmul_nbuffer33/op_kernel/matmul_nbuffer33_custom_impl.h index dc4b4216..70d9e328 100644 --- a/examples/matrix/matmul_nbuffer33/op_kernel/matmul_nbuffer33_custom_impl.h +++ b/examples/matrix/matmul_nbuffer33/op_kernel/matmul_nbuffer33_custom_impl.h @@ -21,6 +21,7 @@ class MatmulNBuffer33Kernel { __aicore__ inline MatmulNBuffer33Kernel(){}; __aicore__ inline void Init(GM_ADDR a, GM_ADDR b, GM_ADDR bias, GM_ADDR c, GM_ADDR workspace, const TCubeTiling& tiling); __aicore__ inline void Process(AscendC::TPipe* pipe); +#ifdef ENABLE_NBUFFER33_FEATURE AscendC::MatmulImpl< AscendC::MatmulType, AscendC::MatmulType, @@ -28,6 +29,14 @@ class MatmulNBuffer33Kernel { AscendC::MatmulType, CFG_MDL, AscendC::MatmulCallBackFunc, AscendC::Impl::Detail::NBuffer33MatmulPolicy> matmulObj; +#else + AscendC::MatmulImpl< + AscendC::MatmulType, + AscendC::MatmulType, + AscendC::MatmulType, + AscendC::MatmulType, CFG_MDL, + AscendC::MatmulCallBackFunc> matmulObj; +#endif private: __aicore__ inline void CalcOffset(int32_t blockIdx, const TCubeTiling& tiling, int32_t& offsetA, int32_t& offsetB, diff --git a/examples/matrix/matmul_nbuffer33/run.sh b/examples/matrix/matmul_nbuffer33/run.sh index 5e271e9d..6443aff0 100644 --- 
a/examples/matrix/matmul_nbuffer33/run.sh +++ b/examples/matrix/matmul_nbuffer33/run.sh @@ -9,8 +9,9 @@ # ====================================================================================================================== export IS_PERF="0" +export ENABLE_FEATURE="1" -SHORT=r:,v:,p:, +SHORT=r:,v:,p:,e:, -LONG=run-mode:,soc-version:,perf:, +LONG=run-mode:,soc-version:,perf:,enable-feature:, OPTS=$(getopt -a --options $SHORT --longoptions $LONG -- "$@") eval set -- "$OPTS" @@ -26,6 +27,9 @@ do (-p | --perf ) IS_PERF="$2" shift 2;; + (-e | --enable-feature ) + ENABLE_FEATURE="$2" + shift 2;; (--) shift; break;; @@ -51,6 +55,11 @@ if [ "${IS_PERF}" != "0" ] && [ "${IS_PERF}" != "1" ]; then exit 1 fi +if [ "${ENABLE_FEATURE}" != "0" ] && [ "${ENABLE_FEATURE}" != "1" ]; then + echo "[ERROR] Unsupported ENABLE_FEATURE: ${ENABLE_FEATURE}, which can only be 0 or 1." + exit 1 +fi + # only npu mode support is_perf = 1 if [ "${IS_PERF}" == "1" ] && [ "${RUN_MODE}" != "npu" ]; then echo "[ERROR] Unsupported IS_PERF: ${IS_PERF} while RUN_MODE is ${RUN_MODE}." @@ -65,7 +74,7 @@ source $ASCEND_HOME_DIR/bin/setenv.bash export LD_LIBRARY_PATH=${ASCEND_HOME_DIR}/tools/simulator/${SOC_VERSION}/lib:$LD_LIBRARY_PATH set -euo pipefail -cmake -DRUN_MODE=${RUN_MODE} -DSOC_VERSION=${SOC_VERSION} -DASCEND_CANN_PACKAGE_PATH=${ASCEND_HOME_DIR} .. +cmake -DRUN_MODE=${RUN_MODE} -DSOC_VERSION=${SOC_VERSION} -DENABLE_FEATURE=${ENABLE_FEATURE} -DASCEND_CANN_PACKAGE_PATH=${ASCEND_HOME_DIR} ..
make -j16 cd ../ diff --git a/examples/matrix/matmul_preload/CMakeLists.txt b/examples/matrix/matmul_preload/CMakeLists.txt index 67d8676c..02b970b5 100644 --- a/examples/matrix/matmul_preload/CMakeLists.txt +++ b/examples/matrix/matmul_preload/CMakeLists.txt @@ -59,6 +59,7 @@ target_compile_definitions(ascendc_matmul_preload_bbit PRIVATE target_compile_definitions(ascendc_matmul_preload_bbit PRIVATE $<$>:CUSTOM_PRELOAD_N> + $<$:ENABLE_PRELOAD_FEATURE> PRELOAD_MODE="${PRELOAD_MODE}" ) diff --git a/examples/matrix/matmul_preload/README.md b/examples/matrix/matmul_preload/README.md index fec924ee..860bf237 100644 --- a/examples/matrix/matmul_preload/README.md +++ b/examples/matrix/matmul_preload/README.md @@ -86,7 +86,7 @@ matmul单算子,对输入的A、B矩阵做矩阵乘和加bias偏置。在MTE2 - 编译执行 ``` - bash run.sh -r [RUN_MODE] -v [SOC_VERSION] -m [MODE] -p [IS_PERF] + bash run.sh -r [RUN_MODE] -v [SOC_VERSION] -m [MODE] -p [IS_PERF] -e [ENABLE_FEATURE] ``` 其中脚本参数说明如下: - RUN_MODE :编译执行方式,可选择CPU调试,NPU仿真,NPU上板,对应参数分别为[cpu / sim / npu]。若需要详细了解NPU仿真相关内容,请参考[《算子开发工具msProf》](https://hiascend.com/document/redirect/CannCommunityToolMsProf)中的“工具使用”章节。 @@ -97,8 +97,9 @@ matmul单算子,对输入的A、B矩阵做矩阵乘和加bias偏置。在MTE2 - M - N - IS_PERF : 是否获取执行性能数据,当前只在NPU执行时生效,打开后性能数据显示在执行结果中,可选择关闭和开启该功能,对应参数分别为[0 / 1]。 + - ENABLE_FEATURE :是否使能Preload功能,可选择关闭和开启该功能,对应参数分别为[0 / 1],默认开启。 示例如下,Ascendxxxyy请替换为实际的AI处理器型号。 ``` - bash run.sh -r cpu -v Ascendxxxyy -m M -p 0 + bash run.sh -r cpu -v Ascendxxxyy -m M -p 0 -e 1 ``` diff --git a/examples/matrix/matmul_preload/cmake/cpu_lib.cmake b/examples/matrix/matmul_preload/cmake/cpu_lib.cmake index c4e83c0d..0d3a989a 100644 --- a/examples/matrix/matmul_preload/cmake/cpu_lib.cmake +++ b/examples/matrix/matmul_preload/cmake/cpu_lib.cmake @@ -22,6 +22,7 @@ target_link_libraries(ascendc_kernels_${RUN_MODE} PRIVATE target_compile_definitions(ascendc_kernels_${RUN_MODE} PRIVATE $<$>:CUSTOM_PRELOAD_N> + $<$:ENABLE_PRELOAD_FEATURE> ) target_compile_options(ascendc_kernels_${RUN_MODE} PRIVATE diff 
--git a/examples/matrix/matmul_preload/cmake/npu_lib.cmake b/examples/matrix/matmul_preload/cmake/npu_lib.cmake index 52db61e6..56cf9e25 100644 --- a/examples/matrix/matmul_preload/cmake/npu_lib.cmake +++ b/examples/matrix/matmul_preload/cmake/npu_lib.cmake @@ -22,6 +22,7 @@ ascendc_library(ascendc_kernels_${RUN_MODE} STATIC ascendc_compile_definitions(ascendc_kernels_${RUN_MODE} PRIVATE $<$>:CUSTOM_PRELOAD_N> + $<$:ENABLE_PRELOAD_FEATURE> -DASCENDC_DUMP -DHAVE_WORKSPACE -DHAVE_TILING diff --git a/examples/matrix/matmul_preload/op_kernel/matmul_preload_custom_impl.h b/examples/matrix/matmul_preload/op_kernel/matmul_preload_custom_impl.h index 03775c43..f6867fd5 100644 --- a/examples/matrix/matmul_preload/op_kernel/matmul_preload_custom_impl.h +++ b/examples/matrix/matmul_preload/op_kernel/matmul_preload_custom_impl.h @@ -22,7 +22,11 @@ class MatmulPreloadKernel { __aicore__ inline void Init(GM_ADDR a, GM_ADDR b, GM_ADDR bias, GM_ADDR c, GM_ADDR workspace, const TCubeTiling& tiling); __aicore__ inline void Process(AscendC::TPipe* pipe); +#ifdef ENABLE_PRELOAD_FEATURE static constexpr MatmulConfig MM_CFG = GetMDLConfig(false, false, preloadMode); // enable preload M/N +#else + static constexpr MatmulConfig MM_CFG = GetMDLConfig(); +#endif AscendC::Matmul, AscendC::MatmulType, AscendC::MatmulType, diff --git a/examples/matrix/matmul_preload/run.sh b/examples/matrix/matmul_preload/run.sh index f3cfe201..87d9d1a9 100644 --- a/examples/matrix/matmul_preload/run.sh +++ b/examples/matrix/matmul_preload/run.sh @@ -9,7 +9,8 @@ # ====================================================================================================================== export MODE="M" export IS_PERF="0" -SHORT=r:,v:,p:, +export ENABLE_FEATURE="1" +SHORT=r:,v:,p:,e:, -LONG=run-mode:,soc-version:,mode:,perf:, +LONG=run-mode:,soc-version:,mode:,perf:,enable-feature:, OPTS=$(getopt -a --options $SHORT --longoptions $LONG -- "$@") eval set -- "$OPTS" @@ -28,6 +29,9 @@ do (-p | --perf ) IS_PERF="$2" shift 2;; + (-e | --enable-feature ) + ENABLE_FEATURE="$2" +
shift 2;; (--) shift; break;; @@ -53,6 +57,11 @@ if [ "${PRELOAD_MODE}" != "M" ] && [ "${PRELOAD_MODE}" != "N" ]; then exit 1 fi +if [ "${ENABLE_FEATURE}" != "0" ] && [ "${ENABLE_FEATURE}" != "1" ]; then + echo "[ERROR] Unsupported ENABLE_FEATURE: ${ENABLE_FEATURE}, which can only be 0 or 1." + exit 1 +fi + if [ "${IS_PERF}" != "0" ] && [ "${IS_PERF}" != "1" ]; then echo "[ERROR] Unsupported IS_PERF: ${IS_PERF}, which can only be 0 or 1." exit 1 @@ -72,7 +81,7 @@ source $ASCEND_HOME_DIR/bin/setenv.bash export LD_LIBRARY_PATH=${ASCEND_HOME_DIR}/tools/simulator/${SOC_VERSION}/lib:$LD_LIBRARY_PATH set -euo pipefail -cmake -DRUN_MODE=${RUN_MODE} -DSOC_VERSION=${SOC_VERSION} -DPRELOAD_MODE=${PRELOAD_MODE} -DASCEND_CANN_PACKAGE_PATH=${ASCEND_HOME_DIR} .. +cmake -DRUN_MODE=${RUN_MODE} -DSOC_VERSION=${SOC_VERSION} -DPRELOAD_MODE=${PRELOAD_MODE} -DENABLE_FEATURE=${ENABLE_FEATURE} -DASCEND_CANN_PACKAGE_PATH=${ASCEND_HOME_DIR} .. make -j16 cd ../ diff --git a/examples/matrix/matmul_splitk/CMakeLists.txt b/examples/matrix/matmul_splitk/CMakeLists.txt index 91bab65c..591c2ee1 100644 --- a/examples/matrix/matmul_splitk/CMakeLists.txt +++ b/examples/matrix/matmul_splitk/CMakeLists.txt @@ -50,6 +50,7 @@ target_compile_options(ascendc_matmul_splitk_bbit PRIVATE ) target_compile_definitions(ascendc_matmul_splitk_bbit PRIVATE + $<$:ENABLE_SPLITK_FEATURE> SOC_VERSION="${SOC_VERSION}" ) diff --git a/examples/matrix/matmul_splitk/README.md b/examples/matrix/matmul_splitk/README.md index 43ee9419..448ea9bf 100644 --- a/examples/matrix/matmul_splitk/README.md +++ b/examples/matrix/matmul_splitk/README.md @@ -91,15 +91,16 @@ - 编译执行 ``` - bash run.sh -r [RUN_MODE] -v [SOC_VERSION] -p [IS_PERF] + bash run.sh -r [RUN_MODE] -v [SOC_VERSION] -p [IS_PERF] -e [ENABLE_FEATURE] ``` 其中脚本参数说明如下: - RUN_MODE :编译执行方式,可选择CPU调试,NPU仿真,NPU上板,对应参数分别为[cpu / sim / npu]。若需要详细了解NPU仿真相关内容,请参考[《算子开发工具msProf》](https://hiascend.com/document/redirect/CannCommunityToolMsProf)中的“工具使用”章节。 - SOC_VERSION 
:昇腾AI处理器型号,如果无法确定具体的[SOC_VERSION],则在安装昇腾AI处理器的服务器执行npu-smi info命令进行查询,在查询到的“Name”前增加Ascend信息,例如“Name”对应取值为xxxyy,实际配置的[SOC_VERSION]值为Ascendxxxyy。支持以下产品型号: - Atlas A2训练系列产品/Atlas 800I A2推理产品 - - IS_PERF : 是否获取执行性能数据,当前只在NPU执行时生效,打开后性能数据显示在执行结果中,可选择关闭和开启该功能,对应参数分别为[0 / 1]。 + - IS_PERF :是否获取执行性能数据,当前只在NPU执行时生效,打开后性能数据显示在执行结果中,可选择关闭和开启该功能,对应参数分别为[0 / 1]。 + - ENABLE_FEATURE :是否使能多核切K功能,可选择关闭和开启该功能,对应参数分别为[0 / 1],默认开启。 示例如下,Ascendxxxyy请替换为实际的AI处理器型号。 ``` - bash run.sh -r cpu -v Ascendxxxyy -p 0 + bash run.sh -r cpu -v Ascendxxxyy -p 0 -e 1 ``` \ No newline at end of file diff --git a/examples/matrix/matmul_splitk/cmake/cpu_lib.cmake b/examples/matrix/matmul_splitk/cmake/cpu_lib.cmake index 244469fb..8e55bd2f 100644 --- a/examples/matrix/matmul_splitk/cmake/cpu_lib.cmake +++ b/examples/matrix/matmul_splitk/cmake/cpu_lib.cmake @@ -20,6 +20,10 @@ target_link_libraries(ascendc_kernels_${RUN_MODE} PRIVATE tikicpulib::${SOC_VERSION} ) +target_compile_definitions(ascendc_kernels_${RUN_MODE} PRIVATE + $<$:ENABLE_SPLITK_FEATURE> +) + target_compile_options(ascendc_kernels_${RUN_MODE} PRIVATE -g -O0 diff --git a/examples/matrix/matmul_splitk/cmake/npu_lib.cmake b/examples/matrix/matmul_splitk/cmake/npu_lib.cmake index 927d2490..87621862 100644 --- a/examples/matrix/matmul_splitk/cmake/npu_lib.cmake +++ b/examples/matrix/matmul_splitk/cmake/npu_lib.cmake @@ -21,7 +21,8 @@ ascendc_library(ascendc_kernels_${RUN_MODE} STATIC ) ascendc_compile_definitions(ascendc_kernels_${RUN_MODE} PRIVATE - -DASCENDC_DUMP - -DHAVE_WORKSPACE - -DHAVE_TILING - ) \ No newline at end of file + $<$:ENABLE_SPLITK_FEATURE> + -DASCENDC_DUMP + -DHAVE_WORKSPACE + -DHAVE_TILING +) \ No newline at end of file diff --git a/examples/matrix/matmul_splitk/op_host/matmul_splitk_custom_tiling.cpp b/examples/matrix/matmul_splitk/op_host/matmul_splitk_custom_tiling.cpp index 9fb919dd..3c987063 100644 --- a/examples/matrix/matmul_splitk/op_host/matmul_splitk_custom_tiling.cpp +++ 
b/examples/matrix/matmul_splitk/op_host/matmul_splitk_custom_tiling.cpp @@ -39,8 +39,10 @@ TCubeTiling GenerateTiling(const MatmulCaseParams& testCaseParams) cubeTiling.SetShape(M, N, K); cubeTiling.EnableBias(isBias); cubeTiling.SetBufferSpace(-1, -1, -1); +#ifdef ENABLE_SPLITK_FEATURE // tiling enbale split K cubeTiling.EnableMultiCoreSplitK(true); +#endif if (cubeTiling.GetTiling(tilingData) == -1) { std::cout << "Generate tiling failed." << std::endl; return {}; diff --git a/examples/matrix/matmul_splitk/op_kernel/matmul_splitk_custom_impl.h b/examples/matrix/matmul_splitk/op_kernel/matmul_splitk_custom_impl.h index 26a36bb2..fe671fa9 100644 --- a/examples/matrix/matmul_splitk/op_kernel/matmul_splitk_custom_impl.h +++ b/examples/matrix/matmul_splitk/op_kernel/matmul_splitk_custom_impl.h @@ -77,7 +77,11 @@ __aicore__ inline void MatmulSplitkKernel::Proces if (tiling.isBias) { matmulObj.SetBias(biasGlobal); } +#ifdef ENABLE_SPLITK_FEATURE uint8_t enAtomic = 1; // set AtomicAdd +#else + uint8_t enAtomic = 0; +#endif matmulObj.IterateAll(cGlobal, enAtomic); matmulObj.End(); } diff --git a/examples/matrix/matmul_splitk/run.sh b/examples/matrix/matmul_splitk/run.sh index d4369bd0..72e4ffaa 100644 --- a/examples/matrix/matmul_splitk/run.sh +++ b/examples/matrix/matmul_splitk/run.sh @@ -9,8 +9,9 @@ # ====================================================================================================================== export IS_PERF="0" +export ENABLE_FEATURE="1" -SHORT=r:,v:,p:, +SHORT=r:,v:,p:,e:, LONG=run-mode:,soc-version:,perf:, OPTS=$(getopt -a --options $SHORT --longoptions $LONG -- "$@") eval set -- "$OPTS" @@ -26,6 +27,9 @@ do (-p | --perf ) IS_PERF="$2" shift 2;; + (-e | --enable-feature ) + ENABLE_FEATURE="$2" + shift 2;; (--) shift; break;; @@ -51,6 +55,11 @@ if [ "${IS_PERF}" != "0" ] && [ "${IS_PERF}" != "1" ]; then exit 1 fi +if [ "${ENABLE_FEATURE}" != "0" ] && [ "${ENABLE_FEATURE}" != "1" ]; then + echo "[ERROR] Unsupported ENABLE_FEATURE: 
${ENABLE_FEATURE}, which can only be 0 or 1." + exit 1 +fi + # only npu mode support is_perf = 1 if [ "${IS_PERF}" == "1" ] && [ "${RUN_MODE}" != "npu" ]; then echo "[ERROR] Unsupported IS_PERF: ${IS_PERF} while RUN_MODE is ${RUN_MODE}." @@ -65,7 +74,7 @@ source $ASCEND_HOME_DIR/bin/setenv.bash export LD_LIBRARY_PATH=${ASCEND_HOME_DIR}/tools/simulator/${SOC_VERSION}/lib:$LD_LIBRARY_PATH set -euo pipefail -cmake -DRUN_MODE=${RUN_MODE} -DSOC_VERSION=${SOC_VERSION} -DASCEND_CANN_PACKAGE_PATH=${ASCEND_HOME_DIR} .. +cmake -DRUN_MODE=${RUN_MODE} -DSOC_VERSION=${SOC_VERSION} -DENABLE_FEATURE=${ENABLE_FEATURE} -DASCEND_CANN_PACKAGE_PATH=${ASCEND_HOME_DIR} .. make -j16 cd ../ diff --git a/examples/matrix/matmul_splitk/testcase/case.csv b/examples/matrix/matmul_splitk/testcase/case.csv index 5b39cef8..0036707a 100644 --- a/examples/matrix/matmul_splitk/testcase/case.csv +++ b/examples/matrix/matmul_splitk/testcase/case.csv @@ -1,2 +1 @@ -1, case001, 16, 16, 1024 -0, case002, 256, 512, 1536 \ No newline at end of file +1, case001, 16, 16, 1024 \ No newline at end of file diff --git a/examples/readme.md b/examples/readme.md index 1701252c..e51ff5ea 100644 --- a/examples/readme.md +++ b/examples/readme.md @@ -36,7 +36,7 @@ 对输入tensor按行做如下公式的计算:zi = (xi - ∑(xi * yi)) * yi,其中∑为按行reduce求和。 - matrix + matrix basic_block_matmul 实现无尾块且tiling的base块大小固定的场景下的Matmul矩阵乘法,计算公式为:C = A * B + Bias。 @@ -167,6 +167,10 @@ matmul_perf 性能优化样例,包含纯Cube模式、MDL模板、UnitFlag、Tiling全量常量化四个特性的Matmul算子,计算公式为:C = A * B + Bias。 + + batch_matmul_tscm + 左矩阵A为TSCM输入的BatchMatmul计算,单次Matmul计算公式为:C = A * B + Bias。 + normalization layernorm @@ -258,4 +262,3 @@ - -- Gitee