From e08723e7081fbdff714118763ed12f9d5275ae89 Mon Sep 17 00:00:00 2001 From: kong0808 Date: Fri, 25 Jul 2025 15:36:30 +0800 Subject: [PATCH] add matmul_quant, matmul_channelsplit, matmul_unitflag examples --- examples/matrix/basic_block_matmul/README.md | 2 +- .../scripts/gen_data.py | 9 +- examples/matrix/batch_matmul/README.md | 2 +- .../scripts/gen_data.py | 6 +- examples/matrix/common_scripts/exec_utils.py | 12 + examples/matrix/common_scripts/gen_data.py | 28 +- examples/matrix/matmul/README.md | 2 +- examples/matrix/matmul_a2b2share/README.md | 4 +- examples/matrix/matmul_a2b2share/main.cpp | 3 +- .../matrix/matmul_channelsplit/CMakeLists.txt | 73 +++++ examples/matrix/matmul_channelsplit/README.md | 107 ++++++++ .../matmul_channelsplit/cmake/cpu_lib.cmake | 31 +++ .../matmul_channelsplit/cmake/npu_lib.cmake | 27 ++ examples/matrix/matmul_channelsplit/main.cpp | 229 ++++++++++++++++ .../matmul_channelsplit_custom_tiling.cpp | 50 ++++ .../matmul_channelsplit_custom_tiling.h | 37 +++ .../matmul_channelsplit_custom_kernel.cpp | 156 +++++++++++ .../matmul_channelsplit_custom_kernel.h | 78 ++++++ examples/matrix/matmul_channelsplit/run.sh | 96 +++++++ .../matmul_channelsplit/scripts/exec_test.py | 130 +++++++++ .../matmul_channelsplit/testcase/case.csv | 1 + examples/matrix/matmul_constant/README.md | 2 +- .../matmul_constant/scripts/exec_test.py | 19 +- examples/matrix/matmul_l0c_extend/README.md | 2 +- .../matmul_l0c_extend/scripts/exec_test.py | 19 +- examples/matrix/matmul_l0cache/README.md | 2 +- examples/matrix/matmul_l2cache/README.md | 4 +- examples/matrix/matmul_l2cache/main.cpp | 3 +- .../matrix/matmul_mixdualmaster/README.md | 2 +- examples/matrix/matmul_mixdualmaster/run.sh | 4 +- .../matmul_mixdualmaster/scripts/exec_test.py | 19 +- examples/matrix/matmul_mndb/README.md | 2 +- .../matrix/matmul_mndb/scripts/exec_test.py | 19 +- examples/matrix/matmul_nbuffer33/README.md | 10 +- examples/matrix/matmul_nbuffer33/main.cpp | 10 +- 
.../matmul_nbuffer33_custom_tiling.cpp | 14 +- .../op_host/matmul_nbuffer33_custom_tiling.h | 9 - examples/matrix/matmul_nbuffer33/run.sh | 2 +- .../matmul_nbuffer33/scripts/exec_test.py | 82 +----- .../matrix/matmul_nbuffer33/testcase/case.csv | 2 +- examples/matrix/matmul_nz/README.md | 2 +- .../matrix/matmul_nz/scripts/exec_test.py | 19 +- .../matrix/matmul_partial_output/README.md | 6 +- .../matmul_partial_output_custom_kernel.cpp | 1 + .../scripts/exec_test.py | 19 +- examples/matrix/matmul_preload/README.md | 2 +- examples/matrix/matmul_preload/main.cpp | 40 +-- .../op_host/matmul_preload_custom_tiling.cpp | 15 + .../op_host/matmul_preload_custom_tiling.h | 9 + .../matmul_preload/scripts/exec_test.py | 50 ++-- .../matrix/matmul_preload/testcase/case.csv | 4 +- examples/matrix/matmul_quant/CMakeLists.txt | 73 +++++ examples/matrix/matmul_quant/README.md | 120 ++++++++ .../matrix/matmul_quant/cmake/cpu_lib.cmake | 31 +++ .../matrix/matmul_quant/cmake/npu_lib.cmake | 27 ++ examples/matrix/matmul_quant/main.cpp | 258 ++++++++++++++++++ .../op_host/matmul_quant_custom_tiling.cpp | 57 ++++ .../op_host/matmul_quant_custom_tiling.h | 38 +++ .../op_kernel/matmul_quant_custom_kernel.cpp | 156 +++++++++++ .../op_kernel/matmul_quant_custom_kernel.h | 76 ++++++ examples/matrix/matmul_quant/run.sh | 96 +++++++ .../matrix/matmul_quant/scripts/exec_test.py | 124 +++++++++ .../matmul_quant/scripts/gen_quant_data.py | 97 +++++++ .../matrix/matmul_quant/testcase/case.csv | 4 + examples/matrix/matmul_sparse/README.md | 8 +- .../matrix/matmul_sparse/scripts/exec_test.py | 19 +- .../matmul_sparse/scripts/gen_sparse_data.py | 4 +- examples/matrix/matmul_splitk/README.md | 2 +- .../op_kernel/matmul_splitk_custom.cpp | 4 +- .../op_kernel/matmul_splitk_custom_impl.h | 8 +- .../matrix/matmul_splitk/scripts/exec_test.py | 19 +- .../matrix/matmul_splitk/testcase/case.csv | 3 +- examples/matrix/matmul_triangle/README.md | 4 +- examples/matrix/matmul_triangle/main.cpp | 3 +- 
examples/matrix/matmul_tscm/README.md | 6 +- .../matrix/matmul_tscm/scripts/exec_test.py | 19 +- examples/matrix/matmul_unaligned/README.md | 4 +- examples/matrix/matmul_unaligned/main.cpp | 3 +- .../matrix/matmul_unitflag/CMakeLists.txt | 74 +++++ examples/matrix/matmul_unitflag/README.md | 112 ++++++++ .../matmul_unitflag/cmake/cpu_lib.cmake | 35 +++ .../matmul_unitflag/cmake/npu_lib.cmake | 28 ++ examples/matrix/matmul_unitflag/main.cpp | 225 +++++++++++++++ .../op_host/matmul_unitflag_custom_tiling.cpp | 49 ++++ .../op_host/matmul_unitflag_custom_tiling.h | 37 +++ .../matmul_unitflag_custom_kernel.cpp | 146 ++++++++++ .../op_kernel/matmul_unitflag_custom_kernel.h | 85 ++++++ examples/matrix/matmul_unitflag/run.sh | 105 +++++++ .../matmul_unitflag/scripts/exec_test.py | 124 +++++++++ .../matrix/matmul_unitflag/testcase/case.csv | 1 + examples/readme.md | 43 +-- 91 files changed, 3504 insertions(+), 299 deletions(-) create mode 100644 examples/matrix/matmul_channelsplit/CMakeLists.txt create mode 100644 examples/matrix/matmul_channelsplit/README.md create mode 100644 examples/matrix/matmul_channelsplit/cmake/cpu_lib.cmake create mode 100644 examples/matrix/matmul_channelsplit/cmake/npu_lib.cmake create mode 100644 examples/matrix/matmul_channelsplit/main.cpp create mode 100644 examples/matrix/matmul_channelsplit/op_host/matmul_channelsplit_custom_tiling.cpp create mode 100644 examples/matrix/matmul_channelsplit/op_host/matmul_channelsplit_custom_tiling.h create mode 100644 examples/matrix/matmul_channelsplit/op_kernel/matmul_channelsplit_custom_kernel.cpp create mode 100644 examples/matrix/matmul_channelsplit/op_kernel/matmul_channelsplit_custom_kernel.h create mode 100644 examples/matrix/matmul_channelsplit/run.sh create mode 100644 examples/matrix/matmul_channelsplit/scripts/exec_test.py create mode 100644 examples/matrix/matmul_channelsplit/testcase/case.csv create mode 100644 examples/matrix/matmul_quant/CMakeLists.txt create mode 100644 
examples/matrix/matmul_quant/README.md create mode 100644 examples/matrix/matmul_quant/cmake/cpu_lib.cmake create mode 100644 examples/matrix/matmul_quant/cmake/npu_lib.cmake create mode 100644 examples/matrix/matmul_quant/main.cpp create mode 100644 examples/matrix/matmul_quant/op_host/matmul_quant_custom_tiling.cpp create mode 100644 examples/matrix/matmul_quant/op_host/matmul_quant_custom_tiling.h create mode 100644 examples/matrix/matmul_quant/op_kernel/matmul_quant_custom_kernel.cpp create mode 100644 examples/matrix/matmul_quant/op_kernel/matmul_quant_custom_kernel.h create mode 100644 examples/matrix/matmul_quant/run.sh create mode 100644 examples/matrix/matmul_quant/scripts/exec_test.py create mode 100644 examples/matrix/matmul_quant/scripts/gen_quant_data.py create mode 100644 examples/matrix/matmul_quant/testcase/case.csv create mode 100644 examples/matrix/matmul_unitflag/CMakeLists.txt create mode 100644 examples/matrix/matmul_unitflag/README.md create mode 100644 examples/matrix/matmul_unitflag/cmake/cpu_lib.cmake create mode 100644 examples/matrix/matmul_unitflag/cmake/npu_lib.cmake create mode 100644 examples/matrix/matmul_unitflag/main.cpp create mode 100644 examples/matrix/matmul_unitflag/op_host/matmul_unitflag_custom_tiling.cpp create mode 100644 examples/matrix/matmul_unitflag/op_host/matmul_unitflag_custom_tiling.h create mode 100644 examples/matrix/matmul_unitflag/op_kernel/matmul_unitflag_custom_kernel.cpp create mode 100644 examples/matrix/matmul_unitflag/op_kernel/matmul_unitflag_custom_kernel.h create mode 100644 examples/matrix/matmul_unitflag/run.sh create mode 100644 examples/matrix/matmul_unitflag/scripts/exec_test.py create mode 100644 examples/matrix/matmul_unitflag/testcase/case.csv diff --git a/examples/matrix/basic_block_matmul/README.md b/examples/matrix/basic_block_matmul/README.md index a6f39f4f..b661cb33 100644 --- a/examples/matrix/basic_block_matmul/README.md +++ b/examples/matrix/basic_block_matmul/README.md @@ -67,7 +67,7 
@@ Framework调用样例中实现的是固定shape为[M, N, K] = [4096, 5120, 4096] - 调用End接口,结束矩阵乘操作。 - tiling实现 - - Ascend C提供一组Matmul Tiling API,方便用户获取MatMul kernel计算时所需的Tiling参数。只需要传入A/B/C矩阵等信息,调用API接口,即可获取到TCubeTiling结构体中的相关参数。 + - Ascend C提供一组Matmul Tiling API,方便用户获取Matmul kernel计算时所需的Tiling参数。只需要传入A/B/C矩阵等信息,调用API接口,即可获取到TCubeTiling结构体中的相关参数。 - 获取Tiling参数的流程如下: - 创建一个Tiling对象。 - 设置A、B、C、Bias的参数类型信息;M、N、Ka、Kb形状信息等。 diff --git a/examples/matrix/basic_block_matmul/kernel_launch_method_by_direct/scripts/gen_data.py b/examples/matrix/basic_block_matmul/kernel_launch_method_by_direct/scripts/gen_data.py index 378bf267..fc65f297 100644 --- a/examples/matrix/basic_block_matmul/kernel_launch_method_by_direct/scripts/gen_data.py +++ b/examples/matrix/basic_block_matmul/kernel_launch_method_by_direct/scripts/gen_data.py @@ -10,8 +10,9 @@ # See LICENSE in the root of the software repository for the full text of the License. # ====================================================================================================================== -import numpy as np import os +import subprocess +import numpy as np def gen_golden_data(): @@ -26,8 +27,10 @@ def gen_golden_data(): x2_gm = np.random.randint(1, 10, [k, n]).astype(x2_gm_type) golden = np.matmul(x1_gm.astype(np.float32), x2_gm.astype(np.float32)).astype(np.float32) x1_gm = x1_gm.transpose() # A is transposed - os.system("mkdir -p input") - os.system("mkdir -p output") + if not os.path.exists("input"): + os.makedirs("input") + if not os.path.exists("output"): + os.makedirs("output") x1_gm.tofile("./input/x1_gm.bin") x2_gm.tofile("./input/x2_gm.bin") golden.tofile("./output/golden.bin") diff --git a/examples/matrix/batch_matmul/README.md b/examples/matrix/batch_matmul/README.md index 02dd08e1..c5f5afb1 100644 --- a/examples/matrix/batch_matmul/README.md +++ b/examples/matrix/batch_matmul/README.md @@ -64,7 +64,7 @@ Framework调用样例中实现的是固定shape为[M, N, K] = [192, 1536, 64], b - 结束矩阵乘操作。 - tiling实现 - - Ascend C提供一组Matmul Tiling API,方便用户获取MatMul 
kernel计算时所需的 Tiling参数。只需要传入A/B/C矩阵等信息,调用API接口,即可获取到TCubeTiling结构体中的相关参数。 + - Ascend C提供一组Matmul Tiling API,方便用户获取Matmul kernel计算时所需的 Tiling参数。只需要传入A/B/C矩阵等信息,调用API接口,即可获取到TCubeTiling结构体中的相关参数。 - 获取Tiling参数的流程如下: - 创建一个Tiling对象。 - 根据输入输出LayOut设置单核计算的A、B、C、Bias的参数类型信息;M、N、Ka、Kb形状信息等。 diff --git a/examples/matrix/batch_matmul/kernel_launch_method_by_direct/scripts/gen_data.py b/examples/matrix/batch_matmul/kernel_launch_method_by_direct/scripts/gen_data.py index e20796c2..2a5fbae6 100644 --- a/examples/matrix/batch_matmul/kernel_launch_method_by_direct/scripts/gen_data.py +++ b/examples/matrix/batch_matmul/kernel_launch_method_by_direct/scripts/gen_data.py @@ -50,8 +50,10 @@ def gen_golden_data(): b_broadcast = np.broadcast_to(b_t, b_broadcast_shape) golden = np.matmul(a_broadcast, b_broadcast).astype(np.float32) golden = np.transpose(golden, axes=(0, 3, 1, 2, 4)) - os.system("mkdir -p input") - os.system("mkdir -p output") + if not os.path.exists("input"): + os.makedirs("input") + if not os.path.exists("output"): + os.makedirs("output") x1_gm.tofile("./input/x1_gm.bin") x2_gm.tofile("./input/x2_gm.bin") golden.tofile("./output/golden.bin") diff --git a/examples/matrix/common_scripts/exec_utils.py b/examples/matrix/common_scripts/exec_utils.py index 2eaedf07..9eaf3a6d 100644 --- a/examples/matrix/common_scripts/exec_utils.py +++ b/examples/matrix/common_scripts/exec_utils.py @@ -13,6 +13,8 @@ import os import sys import csv import logging +import subprocess +import shlex logging.basicConfig(level=logging.INFO) @@ -89,3 +91,13 @@ def clear_file_cache(file_work_dir): os.system("rm -rf " + rm_files) rm_files = file_work_dir + "/bin/sim_out/*" os.system("rm -rf " + rm_files) + + +def get_process_case_cmd(kernel_name, params_str, is_perf, run_mode): + if is_perf: + cmd = f"msprof op --application=\"./{kernel_name} {params_str}\" --output=./prof_out" + elif run_mode == "sim": + cmd = f"msprof op simulator --application=\"./{kernel_name} {params_str}\" --output=./sim_out" + 
else: + cmd = f"./{kernel_name} {params_str}" + return shlex.split(cmd) \ No newline at end of file diff --git a/examples/matrix/common_scripts/gen_data.py b/examples/matrix/common_scripts/gen_data.py index 1c757a27..445c666a 100644 --- a/examples/matrix/common_scripts/gen_data.py +++ b/examples/matrix/common_scripts/gen_data.py @@ -69,12 +69,26 @@ class MatmulGenData: y_gm_fp32 = MatmulGenData.due_overflow(res_tf) return y_gm_fp32 - def gen_golden_data_fp16(self, work_dir, dst_type=np.float32): - src_type = np.float16 + + def gen_c_data_nz_format(self, y_gm_fp32, dst_type, c0size): + nz_fractal_m = 16 + nz_fractal_n = 16 if self.is_channel_split: + nz_fractal_n = 8 c0size = 8 - else: - c0size = 16 + align_m = int(int((self.m + nz_fractal_m - 1) / nz_fractal_m) * nz_fractal_m) + align_n = int(int((self.n + nz_fractal_n - 1) / nz_fractal_n) * nz_fractal_n) + y_gm_pad = np.zeros([align_m, align_n]) + y_gm_pad[0:self.m, 0:self.n] = y_gm_fp32 + y_gm = y_gm_pad.astype(dst_type) + y_shape = [self.b, align_m, align_n] + y_gm = MatmulGenData.nd_to_nz(y_gm, y_shape, dst_type, c0size) + return y_gm + + + def gen_golden_data_fp16(self, work_dir, dst_type=np.float32): + src_type = np.float16 + c0size = 16 if self.is_trans_a: x1_shape = [self.b, self.k, self.m] else: @@ -83,7 +97,6 @@ class MatmulGenData: x2_shape = [self.b, self.n, self.k] else: x2_shape = [self.b, self.k, self.n] - y_shape = [self.b, self.m, self.n] x1_gm = np.random.uniform(-1, 1, x1_shape).astype(src_type) x1_gm_fp32 = x1_gm.astype(np.float32) x2_gm = np.random.uniform(-1, 1, x2_shape).astype(src_type) @@ -96,14 +109,15 @@ class MatmulGenData: y_gm_fp32 = self.tf_matmul(x1_gm_fp32, x2_gm_fp32, bias_gm_fp32) else: y_gm_fp32 = self.tf_matmul(x1_gm_fp32, x2_gm_fp32) - y_gm = y_gm_fp32.astype(dst_type) if self.a_format == "NZ": x1_gm = MatmulGenData.nd_to_nz(x1_gm, x1_shape, src_type, c0size) if self.b_format == "NZ": x2_gm = MatmulGenData.nd_to_nz(x2_gm, x2_shape, src_type, c0size) if self.c_format == "NZ": - 
y_gm = MatmulGenData.nd_to_nz(y_gm, y_shape, dst_type, c0size) + y_gm = self.gen_c_data_nz_format(y_gm_fp32, dst_type, c0size) + else: + y_gm = y_gm_fp32.astype(dst_type) x1_gm.tofile(work_dir + "/input/x1_gm.bin") x2_gm.tofile(work_dir + "/input/x2_gm.bin") diff --git a/examples/matrix/matmul/README.md b/examples/matrix/matmul/README.md index 3b5af64a..ed090c23 100644 --- a/examples/matrix/matmul/README.md +++ b/examples/matrix/matmul/README.md @@ -65,7 +65,7 @@ matmul算子规格 - 结束矩阵乘操作。 - tiling实现 - - Ascend C提供一组Matmul Tiling API,方便用户获取MatMul kernel计算时所需的Tiling参数。只需要传入A/B/C矩阵等信息,调用API接口,即可获取到TCubeTiling结构体中的相关参数。 + - Ascend C提供一组Matmul Tiling API,方便用户获取Matmul kernel计算时所需的Tiling参数。只需要传入A/B/C矩阵等信息,调用API接口,即可获取到TCubeTiling结构体中的相关参数。 - 获取Tiling参数的流程如下: - 创建一个Tiling对象。 - 设置A、B、C、Bias的参数类型信息;M、N、Ka、Kb形状信息等。 diff --git a/examples/matrix/matmul_a2b2share/README.md b/examples/matrix/matmul_a2b2share/README.md index 34afa5b2..765ad29e 100644 --- a/examples/matrix/matmul_a2b2share/README.md +++ b/examples/matrix/matmul_a2b2share/README.md @@ -15,7 +15,7 @@ | [cmake](cmake) | 编译工程文件 | | [op_host](op_host) | 本样例tiling代码实现 | | [op_kernel](op_kernel) | 本样例kernel代码实现 | -| [scripts](scripts) | 执行文件 | +| [scripts](scripts) | 包含输入输出真值数据生成脚本文件和执行文件 | | [testcase](testcase) | 用例文件,配置用例的计算shape信息 | | [CMakeLists.txt](CMakeLists.txt) | 编译工程文件 | | [main.cpp](main.cpp) | 主函数,调用算子的应用程序,含CPU域及NPU域调用 | @@ -92,7 +92,7 @@ - 结束两次矩阵乘操作。 - tiling实现 - - Ascend C提供一组Matmul Tiling API,方便用户获取MatMul kernel计算时所需的Tiling参数。只需要传入A/B/C矩阵等信息,调用API接口,即可获取到TCubeTiling结构体中的相关参数。 + - Ascend C提供一组Matmul Tiling API,方便用户获取Matmul kernel计算时所需的Tiling参数。只需要传入A/B/C矩阵等信息,调用API接口,即可获取到TCubeTiling结构体中的相关参数。 - 获取Tiling参数的流程如下: - 创建一个Tiling对象。 - 设置A、B、C、Bias的参数类型信息;M、N、Ka、Kb形状信息等。 diff --git a/examples/matrix/matmul_a2b2share/main.cpp b/examples/matrix/matmul_a2b2share/main.cpp index 5e209873..1db885d5 100644 --- a/examples/matrix/matmul_a2b2share/main.cpp +++ b/examples/matrix/matmul_a2b2share/main.cpp @@ -224,7 +224,8 @@ 
int32_t main(int32_t argc, const char* args[]) ss >> problem[i - 1]; } - optiling::TestcaseParams caseParams = {problem[0], problem[1], problem[2], problem[3]}; + bool isBias = problem[3]; + optiling::TestcaseParams caseParams = {problem[0], problem[1], problem[2], isBias}; auto ascendcPlatform = platform_ascendc::PlatformAscendCManager::GetInstance(SOC_VERSION); #ifdef ASCENDC_CPU_DEBUG MatmulHost::TestMatmulCpu(caseParams); diff --git a/examples/matrix/matmul_channelsplit/CMakeLists.txt b/examples/matrix/matmul_channelsplit/CMakeLists.txt new file mode 100644 index 00000000..6f448fe5 --- /dev/null +++ b/examples/matrix/matmul_channelsplit/CMakeLists.txt @@ -0,0 +1,73 @@ +# Copyright (c) 2025 Huawei Technologies Co., Ltd. +# This file is a part of the CANN Open Software. +# Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. 
+# ====================================================================================================================== + +cmake_minimum_required(VERSION 3.16) +project(Ascend_c) +if (${RUN_MODE}) + set(RUN_MODE "npu" CACHE STRING "cpu/sim/npu") +endif() +if (${SOC_VERSION}) + set(SOC_VERSION "Ascend910" CACHE STRING "system on chip type") +endif() + +set(ASCEND_CANN_PACKAGE_PATH "~/Ascend/ascend-toolkit/latest" CACHE STRING "ASCEND CANN package installation directory") +if(NOT CMAKE_BUILD_TYPE) + set(CMAKE_BUILD_TYPE "Debug" CACHE STRING "Build type Release/Debug (default Debug)" FORCE) +endif() + +if(CMAKE_INSTALL_PREFIX STREQUAL /usr/local) + set(CMAKE_INSTALL_PREFIX "${CMAKE_CURRENT_LIST_DIR}/out" CACHE STRING "path for install()" FORCE) +endif() + +file(GLOB KERNEL_FILES + ${CMAKE_CURRENT_SOURCE_DIR}/op_kernel/matmul_channelsplit_custom_kernel.cpp +) + +if("${RUN_MODE}" STREQUAL "cpu") + include(cmake/cpu_lib.cmake) +elseif("${RUN_MODE}" STREQUAL "sim" OR "${RUN_MODE}" STREQUAL "npu") + include(cmake/npu_lib.cmake) +else() + message("invalid RUN_MODE: ${RUN_MODE}") +endif() + +add_executable(ascendc_matmul_channelsplit_bbit + ${CMAKE_CURRENT_SOURCE_DIR}/main.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/op_host/matmul_channelsplit_custom_tiling.cpp +) + +target_compile_options(ascendc_matmul_channelsplit_bbit PRIVATE + $:-g>> + -O2 + -std=c++17 + -D_GLIBCXX_USE_CXX11_ABI=0 +) + +target_compile_definitions(ascendc_matmul_channelsplit_bbit PRIVATE + SOC_VERSION="${SOC_VERSION}" +) + +target_include_directories(ascendc_matmul_channelsplit_bbit PRIVATE + ${CMAKE_CURRENT_SOURCE_DIR} + $:${ASCEND_CANN_PACKAGE_PATH}/include>> + $:${ASCEND_CANN_PACKAGE_PATH}/runtime/include>> +) + +target_link_libraries(ascendc_matmul_channelsplit_bbit PRIVATE + $,$>:host_intf_pub>> + $:tikicpulib::${SOC_VERSION}>> + $:ascendcl>> + $:c_sec>> + ascendc_kernels_${RUN_MODE} + tiling_api + register + platform + ascendalog + dl +) diff --git a/examples/matrix/matmul_channelsplit/README.md 
b/examples/matrix/matmul_channelsplit/README.md new file mode 100644 index 00000000..79369120 --- /dev/null +++ b/examples/matrix/matmul_channelsplit/README.md @@ -0,0 +1,107 @@ + +## 概述 + +本样例介绍了调用Matmul API实现矩阵乘输出Channel拆分功能的单算子。当Matmul计算结果C矩阵的格式为NZ时,C矩阵采用分形存储,若C矩阵数据类型为float,默认情况下分形大小为16 * 16。Channel拆分功能可以将该场景下C矩阵的每个16 * 16分形切分为16 * 8的分形,使C矩阵按照16 * 8的分形进行存储。关于ChannelSplit的具体内容请参考《[Ascend C算子开发](https://www.hiascend.com/document/redirect/CannCommunityOpdevAscendC)》中的“算子实现 > 矩阵编程(高阶API) > 特性场景 > 矩阵乘输出的Channel拆分”章节。 + +本样例以直调的方式调用算子核函数。 +直调:核函数的基础调用方式,开发者完成算子核函数的开发和Tiling实现后,即可通过AscendCL运行时接口,完成算子的调用。包含CPU侧、NPU侧和仿真三种运行验证方法。 + + +## 样例支持的产品型号为 +- Atlas A2训练系列产品/Atlas 800I A2推理产品 + + +## 目录结构 +| 目录及文件 | 描述 | +|----------------------------------|----------------------| +| [cmake](cmake) | 编译工程文件 | +| [op_host](op_host) | 本样例的tiling代码实现 | +| [op_kernel](op_kernel) | 本样例的kernel代码实现 | +| [scripts](scripts) | 执行脚本文件 | +| [testcase](testcase) | 用例文件,配置用例的计算shape信息 | +| CMakeLists.txt | 编译工程文件 | +| main.cpp | 主函数,调用算子的应用程序,含CPU域及NPU域调用 | +| run.sh | 编译执行脚本 | + + +## 算子描述 +- 算子功能 + MatmulChannelSplitCustom算子调用Matmul API计算时,通过配置MatmulConfig中的isEnableChannelSplit参数为true,使能矩阵乘输出的Channel拆分功能,对输入的A、B矩阵做矩阵乘和加bias偏置。 + +- 算子规格 + + + + + + + + + + + + +
算子类型(OpType)MatmulChannelSplitCustom
算子输入nameshapedata typeformatisTrans
a-float16NDfalse
b-float16NDfalse
bias-floatND-
算子输出c-floatNZ-
核函数名matmul_channelsplit_custom
+ + +## 算子实现介绍 +- 约束条件 + 开启ChannelSplit功能需满足: + - C矩阵的数据排布格式为CubeFormat::NZ。 + - C矩阵的数据类型为float。 + - C矩阵的内存逻辑位置为Gloabl Memory。 + + +- 算子Kernel实现 + - 计算逻辑:C = A * B + Bias。 + - A、B为源操作数,A为左矩阵,形状为[M, K];B为右矩阵,形状为[K, N]。 + - C为目的操作数,存放矩阵乘结果的矩阵,形状为[M, N]。 + - Bias为矩阵乘偏置,形状为[1, N]。对A*B结果矩阵的每一行都采用该Bias进行偏置。 + - 具体步骤: + - 创建Matmul对象。 + 创建Matmul对象时,自定义MatmulConfig参数,将其中的isEnableChannelSplit参数设置为true,使能矩阵乘输出的Channel拆分功能,获得自定义的使用Norm模板的Matmul对象。 + ``` + constexpr static MatmulConfigMode configMode = MatmulConfigMode::CONFIG_NORM; + constexpr static MatmulFuncParams funcParamsChannelSplit{ + false, false, false, false, 0, IterateOrder::ORDER_M, ScheduleType::INNER_PRODUCT, + true, false, false, false, true/*isEnableChannelSplit*/ + }; + constexpr static MatmulConfig CFG_NORM = GetMMConfig(funcParamsChannelSplit); + ``` + - 初始化操作。 + - 完成矩阵乘操作。 + - 结束矩阵乘操作。 + +- 算子Tiling实现 + - Ascend C提供一组Matmul Tiling API,方便用户获取Matmul kernel计算时所需的Tiling参数。 + - 只需要传入A/B/C矩阵等信息,调用API接口,即可获取到TCubeTiling结构体中的相关参数。 + - 获取Tiling参数的流程如下: + - 创建一个Tiling对象。 + - 设置A、B、C、Bias的参数类型信息;M、N、Ka、Kb形状信息等。 + - 调用GetTiling接口,获取Tiling信息。 + +## 编译运行样例 + + - 配置环境变量 + + 这里的\$ASCEND_CANN_PACKAGE_PATH需要替换为CANN开发套件包安装后文件存储路径。例如:/usr/local/Ascend/ascend-toolkit/latest + ``` + export ASCEND_HOME_DIR=$ASCEND_CANN_PACKAGE_PATH + source $ASCEND_HOME_DIR/../set_env.sh + ``` + + - 编译执行 + + ``` + bash run.sh -r [RUN_MODE] -v [SOC_VERSION] -p [IS_PERF] + ``` + 其中脚本参数说明如下: + - RUN_MODE :编译执行方式,可选择CPU调试,NPU仿真,NPU上板,对应参数分别为[cpu / sim / npu]。若需要详细了解NPU仿真相关内容,请参考[《算子开发工具msProf》](https://hiascend.com/document/redirect/CannCommunityToolMsProf)中的“工具使用”章节。 + - SOC_VERSION :昇腾AI处理器型号,如果无法确定具体的[SOC_VERSION],则在安装昇腾AI处理器的服务器执行npu-smi info命令进行查询,在查询到的“Name”前增加Ascend信息,例如“Name”对应取值为xxxyy,实际配置的[SOC_VERSION]值为Ascendxxxyy。支持以下产品型号: + - Atlas A2训练系列产品/Atlas 800I A2推理产品 + - IS_PERF : 是否获取执行性能数据,当前只在NPU执行时生效,打开后性能数据显示在执行结果中,可选择关闭和开启该功能,对应参数分别为[0 / 1]。 + + 示例如下,Ascendxxxyy请替换为实际的AI处理器型号。 + ``` + bash run.sh -r cpu -v Ascendxxxyy -p 0 + ``` \ No 
newline at end of file diff --git a/examples/matrix/matmul_channelsplit/cmake/cpu_lib.cmake b/examples/matrix/matmul_channelsplit/cmake/cpu_lib.cmake new file mode 100644 index 00000000..487a91e0 --- /dev/null +++ b/examples/matrix/matmul_channelsplit/cmake/cpu_lib.cmake @@ -0,0 +1,31 @@ +# Copyright (c) 2025 Huawei Technologies Co., Ltd. +# This file is a part of the CANN Open Software. +# Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. +# ====================================================================================================================== + +if(NOT DEFINED ENV{CMAKE_PREFIX_PATH}) + set(CMAKE_PREFIX_PATH ${ASCEND_CANN_PACKAGE_PATH}/tools/tikicpulib/lib/cmake) +endif() +find_package(tikicpulib REQUIRED) + +add_library(ascendc_kernels_${RUN_MODE} SHARED + ${KERNEL_FILES} +) + +target_link_libraries(ascendc_kernels_${RUN_MODE} PRIVATE + tikicpulib::${SOC_VERSION} +) + +target_compile_options(ascendc_kernels_${RUN_MODE} PRIVATE + -g + -O0 + -std=c++17 +) + +install(TARGETS ascendc_kernels_${RUN_MODE} + DESTINATION ${CMAKE_INSTALL_LIBDIR} +) \ No newline at end of file diff --git a/examples/matrix/matmul_channelsplit/cmake/npu_lib.cmake b/examples/matrix/matmul_channelsplit/cmake/npu_lib.cmake new file mode 100644 index 00000000..bc803099 --- /dev/null +++ b/examples/matrix/matmul_channelsplit/cmake/npu_lib.cmake @@ -0,0 +1,27 @@ +# Copyright (c) 2025 Huawei Technologies Co., Ltd. +# This file is a part of the CANN Open Software. +# Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). 
+# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. +# ====================================================================================================================== + +if(EXISTS ${ASCEND_CANN_PACKAGE_PATH}/compiler/tikcpp/ascendc_kernel_cmake) + set(ASCENDC_CMAKE_DIR ${ASCEND_CANN_PACKAGE_PATH}/compiler/tikcpp/ascendc_kernel_cmake) +elseif(EXISTS ${ASCEND_CANN_PACKAGE_PATH}/tools/tikcpp/ascendc_kernel_cmake) + set(ASCENDC_CMAKE_DIR ${ASCEND_CANN_PACKAGE_PATH}/tools/tikcpp/ascendc_kernel_cmake) +else() + message(FATAL_ERROR "ascendc_kernel_cmake does not exist ,please check whether the cann package is installed") +endif() +include(${ASCENDC_CMAKE_DIR}/ascendc.cmake) + +ascendc_library(ascendc_kernels_${RUN_MODE} STATIC + ${KERNEL_FILES} +) + +ascendc_compile_definitions(ascendc_kernels_${RUN_MODE} PRIVATE + -DASCENDC_DUMP + -DHAVE_WORKSPACE + -DHAVE_TILING +) \ No newline at end of file diff --git a/examples/matrix/matmul_channelsplit/main.cpp b/examples/matrix/matmul_channelsplit/main.cpp new file mode 100644 index 00000000..2a3ec442 --- /dev/null +++ b/examples/matrix/matmul_channelsplit/main.cpp @@ -0,0 +1,229 @@ +/** + * Copyright (c) 2025 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. 
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ + +#include +#include "../../common/data_utils.h" +#include "kernel_tiling/kernel_tiling.h" +#include "tiling/platform/platform_ascendc.h" +#include "op_host/matmul_channelsplit_custom_tiling.h" + +#ifndef ASCENDC_CPU_DEBUG +#include "acl/acl.h" +extern void matmul_channelsplit_custom_do(uint32_t coreDim, void* stream, + uint8_t* a, uint8_t* b, uint8_t* bias, uint8_t* c, + uint8_t* workspace, uint8_t* tiling); +#else +#include "tikicpulib.h" +extern "C" void matmul_channelsplit_custom(uint8_t* a, uint8_t* b, uint8_t* bias, uint8_t* c, + uint8_t* workspace, uint8_t* tiling); +#endif + +namespace { +constexpr bool IS_BIAS = true; +constexpr bool IS_A_TRANS = false; +constexpr bool IS_B_TRANS = false; +constexpr int32_t BLOCK_SIZE = 16; + +struct MatrixFileSize +{ + size_t x1FileSize; + size_t x2FileSize; + size_t yFileSize; + size_t biasFileSize; +}; +} + +namespace MatmulHost { +static size_t GetSysWorkSpaceSize() +{ + auto ascendcPlatform = platform_ascendc::PlatformAscendCManager::GetInstance(); + return static_cast(ascendcPlatform->GetLibApiWorkSpaceSize()); +} + +// CPU debug mode +#ifdef ASCENDC_CPU_DEBUG +void TestMatmulCpu(int64_t m, int64_t n, int64_t k, const MatrixFileSize& matrixFileSize) +{ + size_t x1FileSize = matrixFileSize.x1FileSize; + size_t x2FileSize = matrixFileSize.x2FileSize; + size_t yFileSize = matrixFileSize.yFileSize; + size_t biasFileSize = matrixFileSize.biasFileSize; + // Query workspace size + size_t workspaceSize = GetSysWorkSpaceSize(); + uint8_t* x1 = (uint8_t*)AscendC::GmAlloc(x1FileSize); + uint8_t* x2 = (uint8_t*)AscendC::GmAlloc(x2FileSize); + uint8_t* bias = nullptr; + uint8_t* y = (uint8_t*)AscendC::GmAlloc(yFileSize); + 
uint8_t* workspace = (uint8_t*)AscendC::GmAlloc(workspaceSize); + ReadFile("../input/x1_gm.bin", x1FileSize, x1, x1FileSize); + ReadFile("../input/x2_gm.bin", x2FileSize, x2, x2FileSize); + if (IS_BIAS) { + bias = (uint8_t*)AscendC::GmAlloc(biasFileSize); + ReadFile("../input/bias_gm.bin", biasFileSize, bias, biasFileSize); + } + size_t tilingFileSize = sizeof(TCubeTiling); + uint8_t* tiling = (uint8_t*)AscendC::GmAlloc(tilingFileSize); + auto ascendcPlatform = platform_ascendc::PlatformAscendCManager::GetInstance(); + MatmulHost::MatmulCaseParams testCaseParams{static_cast(ascendcPlatform->GetCoreNumAic()), + static_cast(m), static_cast(n), static_cast(k), IS_BIAS, IS_A_TRANS, IS_B_TRANS}; + // Calculate Tiling + const auto& tilingData = MatmulHost::GenerateTiling(testCaseParams); + memcpy_s(tiling, tilingFileSize, &tilingData, tilingFileSize); + ICPU_RUN_KF(matmul_channelsplit_custom, tilingData.usedCoreNum, x1, x2, bias, y, workspace, tiling); + WriteFile("../output/output.bin", y, yFileSize); + AscendC::GmFree((void*)x1); + AscendC::GmFree((void*)x2); + AscendC::GmFree((void*)y); + AscendC::GmFree((void*)workspace); + AscendC::GmFree((void*)tiling); + if (IS_BIAS) { + AscendC::GmFree((void*)bias); + } +} +// NPU +#else +void MatmulOp(uint8_t* x1, uint8_t* x2, uint8_t* y, uint8_t* bias, int64_t m, int64_t n, int64_t k, + void* stream = nullptr) +{ + // Init args + uint8_t* workspaceDevice = nullptr; + + // Query workspace size + size_t workspaceSize = GetSysWorkSpaceSize(); + + // Allocate workspace on device + CHECK_ACL(aclrtMalloc((void**)&workspaceDevice, workspaceSize, ACL_MEM_MALLOC_HUGE_FIRST)); + + // Initialize kernel with arguments and workspace pointer + uint8_t* tilingHost = nullptr; + uint8_t* tilingDevice = nullptr; + size_t tilingFileSize = sizeof(TCubeTiling); + auto ascendcPlatform = platform_ascendc::PlatformAscendCManager::GetInstance(); + MatmulHost::MatmulCaseParams testCaseParams{static_cast(ascendcPlatform->GetCoreNumAic()), + 
static_cast(m), static_cast(n), static_cast(k), IS_BIAS, IS_A_TRANS, IS_B_TRANS}; + // Calculate Tiling + const auto tilingData = MatmulHost::GenerateTiling(testCaseParams); + CHECK_ACL(aclrtMallocHost((void**)(&tilingHost), tilingFileSize)); + CHECK_ACL(aclrtMalloc((void**)&tilingDevice, tilingFileSize, + ACL_MEM_MALLOC_HUGE_FIRST)); + CHECK_ACL(aclrtMemcpy(tilingHost, tilingFileSize, &tilingData, + tilingFileSize, ACL_MEMCPY_HOST_TO_HOST)); + CHECK_ACL(aclrtMemcpy(tilingDevice, tilingFileSize, tilingHost, + tilingFileSize, ACL_MEMCPY_HOST_TO_DEVICE)); + // Launch kernel + matmul_channelsplit_custom_do(tilingData.usedCoreNum, stream, x1, x2, bias, y, workspaceDevice, tilingDevice); + CHECK_ACL(aclrtFreeHost(tilingHost)); + CHECK_ACL(aclrtFree(workspaceDevice)); + CHECK_ACL(aclrtFree(tilingDevice)); +} + +void TestAclInit(aclrtContext& context, aclrtStream& stream, int64_t& deviceId) +{ + CHECK_ACL(aclInit(nullptr)); + CHECK_ACL(aclrtSetDevice(deviceId)); + CHECK_ACL(aclrtCreateContext(&context, deviceId)); + CHECK_ACL(aclrtCreateStream(&stream)); +} + +void TestAclDeInit(aclrtContext& context, aclrtStream& stream, int64_t& deviceId) +{ + CHECK_ACL(aclrtDestroyStream(stream)); + CHECK_ACL(aclrtDestroyContext(context)); + CHECK_ACL(aclrtResetDevice(deviceId)); + CHECK_ACL(aclFinalize()); +} + +void TestMatmul(int64_t m, int64_t n, int64_t k, const MatrixFileSize& matrixFileSize) +{ + size_t x1FileSize = matrixFileSize.x1FileSize; + size_t x2FileSize = matrixFileSize.x2FileSize; + size_t yFileSize = matrixFileSize.yFileSize; + size_t biasFileSize = matrixFileSize.biasFileSize; + + aclrtContext context; + aclrtStream stream = nullptr; + int64_t deviceId = 0; + TestAclInit(context, stream, deviceId); + + uint8_t* x1Host = nullptr; + uint8_t* x1Device = nullptr; + CHECK_ACL(aclrtMallocHost((void**)(&x1Host), x1FileSize)); + CHECK_ACL(aclrtMalloc((void**)&x1Device, x1FileSize, ACL_MEM_MALLOC_HUGE_FIRST)); + ReadFile("../input/x1_gm.bin", x1FileSize, x1Host, x1FileSize); 
+ CHECK_ACL(aclrtMemcpy(x1Device, x1FileSize, x1Host, x1FileSize, ACL_MEMCPY_HOST_TO_DEVICE)); + + uint8_t* x2Host = nullptr; + uint8_t* x2Device = nullptr; + CHECK_ACL(aclrtMallocHost((void**)(&x2Host), x2FileSize)); + CHECK_ACL(aclrtMalloc((void**)&x2Device, x2FileSize, ACL_MEM_MALLOC_HUGE_FIRST)); + ReadFile("../input/x2_gm.bin", x2FileSize, x2Host, x2FileSize); + CHECK_ACL(aclrtMemcpy(x2Device, x2FileSize, x2Host, x2FileSize, ACL_MEMCPY_HOST_TO_DEVICE)); + + uint8_t* biasHost = nullptr; + uint8_t* biasDevice = nullptr; + if (IS_BIAS) { + CHECK_ACL(aclrtMallocHost((void**)(&biasHost), biasFileSize)); + CHECK_ACL(aclrtMalloc((void**)&biasDevice, biasFileSize, ACL_MEM_MALLOC_HUGE_FIRST)); + ReadFile("../input/bias_gm.bin", biasFileSize, biasHost, biasFileSize); + CHECK_ACL(aclrtMemcpy(biasDevice, biasFileSize, biasHost, biasFileSize, ACL_MEMCPY_HOST_TO_DEVICE)); + } + uint8_t* yHost = nullptr; + uint8_t* yDevice = nullptr; + CHECK_ACL(aclrtMallocHost((void**)(&yHost), yFileSize)); + CHECK_ACL(aclrtMalloc((void**)&yDevice, yFileSize, ACL_MEM_MALLOC_HUGE_FIRST)); + + MatmulOp(x1Device, x2Device, yDevice, biasDevice, m, n, k, stream); + CHECK_ACL(aclrtSynchronizeStream(stream)); + + CHECK_ACL(aclrtMemcpy(yHost, yFileSize, yDevice, yFileSize, ACL_MEMCPY_DEVICE_TO_HOST)); + WriteFile("../output/output.bin", yHost, yFileSize); + + if (IS_BIAS) { + CHECK_ACL(aclrtFree(biasDevice)); + CHECK_ACL(aclrtFreeHost(biasHost)); + } + CHECK_ACL(aclrtFree(x1Device)); + CHECK_ACL(aclrtFreeHost(x1Host)); + CHECK_ACL(aclrtFree(x2Device)); + CHECK_ACL(aclrtFreeHost(x2Host)); + CHECK_ACL(aclrtFree(yDevice)); + CHECK_ACL(aclrtFreeHost(yHost)); + TestAclDeInit(context, stream, deviceId); +} +#endif +} // namespace MatmulHost + +int32_t main(int32_t argc, const char* args[]) +{ + int64_t inputParams[3] = {1, 1, 1}; + for (int32_t i = 1; i < argc && i < 4; ++i) { // 4 used for inputParams loop + std::stringstream ss(args[i]); + ss >> inputParams[i - 1]; + } + auto ascendcPlatform = 
platform_ascendc::PlatformAscendCManager::GetInstance(SOC_VERSION); + int64_t M = inputParams[0]; + int64_t N = inputParams[1]; + int64_t K = inputParams[2]; + MatrixFileSize matrixFileSize; + // uint16_t represent half + matrixFileSize.x1FileSize = static_cast(M * K) * sizeof(uint16_t); + matrixFileSize.x2FileSize = static_cast(K * N) * sizeof(uint16_t); + // float NZ cMatrix output, 16*8 fractal + int32_t alignM = (static_cast(M) + BLOCK_SIZE - 1) / BLOCK_SIZE * BLOCK_SIZE; + int32_t alignN = (static_cast(N) + BLOCK_SIZE / 2 - 1) / (BLOCK_SIZE / 2) * (BLOCK_SIZE / 2); + matrixFileSize.yFileSize = static_cast(alignM * alignN) * sizeof(float); + matrixFileSize.biasFileSize = static_cast(1 * N) * sizeof(float); +#ifdef ASCENDC_CPU_DEBUG + MatmulHost::TestMatmulCpu(M, N, K, matrixFileSize); +#else + MatmulHost::TestMatmul(M, N, K, matrixFileSize); +#endif + return 0; +} diff --git a/examples/matrix/matmul_channelsplit/op_host/matmul_channelsplit_custom_tiling.cpp b/examples/matrix/matmul_channelsplit/op_host/matmul_channelsplit_custom_tiling.cpp new file mode 100644 index 00000000..cfa68371 --- /dev/null +++ b/examples/matrix/matmul_channelsplit/op_host/matmul_channelsplit_custom_tiling.cpp @@ -0,0 +1,50 @@ +/** + * Copyright (c) 2025 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. 
+ */ + +#include "matmul_channelsplit_custom_tiling.h" +#include + +namespace MatmulHost { + +TCubeTiling GenerateTiling(const MatmulCaseParams& testCaseParams) +{ + TCubeTiling tilingData; + auto ascendcPlatform = platform_ascendc::PlatformAscendCManager::GetInstance(); + matmul_tiling::MultiCoreMatmulTiling cubeTiling(*ascendcPlatform); + uint32_t M = testCaseParams.m; + uint32_t N = testCaseParams.n; + uint32_t K = testCaseParams.k; + uint32_t blockDim = testCaseParams.usedCoreNum; + bool isBias = testCaseParams.isBias; + bool isAtrans = testCaseParams.isATrans; + bool isBtrans = testCaseParams.isBTrans; + + cubeTiling.SetDim(blockDim); + cubeTiling.SetAType(matmul_tiling::TPosition::GM, matmul_tiling::CubeFormat::ND, + matmul_tiling::DataType::DT_FLOAT16, isAtrans); + cubeTiling.SetBType(matmul_tiling::TPosition::GM, matmul_tiling::CubeFormat::ND, + matmul_tiling::DataType::DT_FLOAT16, isBtrans); + // CTYPE is NZ format + cubeTiling.SetCType(matmul_tiling::TPosition::GM, matmul_tiling::CubeFormat::NZ, + matmul_tiling::DataType::DT_FLOAT); + cubeTiling.SetBiasType(matmul_tiling::TPosition::GM, matmul_tiling::CubeFormat::ND, + matmul_tiling::DataType::DT_FLOAT); + cubeTiling.SetOrgShape(M, N, K); + cubeTiling.SetShape(M, N, K); + cubeTiling.EnableBias(isBias); + cubeTiling.SetBufferSpace(-1, -1, -1); + if (cubeTiling.GetTiling(tilingData) == -1) { + std::cout << "Generate tiling failed." << std::endl; + return {}; + } + return tilingData; +} + +} // namespace MatmulHost diff --git a/examples/matrix/matmul_channelsplit/op_host/matmul_channelsplit_custom_tiling.h b/examples/matrix/matmul_channelsplit/op_host/matmul_channelsplit_custom_tiling.h new file mode 100644 index 00000000..38bf87ab --- /dev/null +++ b/examples/matrix/matmul_channelsplit/op_host/matmul_channelsplit_custom_tiling.h @@ -0,0 +1,37 @@ +/** + * Copyright (c) 2025 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. 
+ * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ + +#ifndef EXAMPLES_MATRIX_MATMUL_CHANNELSPLIT_OP_HOST_MATMUL_CHANNELSPLIT_CUSTOM_TILING_H +#define EXAMPLES_MATRIX_MATMUL_CHANNELSPLIT_OP_HOST_MATMUL_CHANNELSPLIT_CUSTOM_TILING_H +#include "register/tilingdata_base.h" +#include "tiling/tiling_api.h" + +namespace MatmulHost { + +struct MatmulCaseParams +{ + int32_t usedCoreNum; + int32_t m; + int32_t n; + int32_t k; + bool isBias; + bool isATrans; + bool isBTrans; +}; + +/** + * @brief Generate matmul tiling. + * @param testCaseParams: Testcase parameters. + * @retval Generated Tiling data. + */ +TCubeTiling GenerateTiling(const MatmulCaseParams& testCaseParams); + +} // namespace MatmulHost +#endif // EXAMPLES_MATRIX_MATMUL_CHANNELSPLIT_OP_HOST_MATMUL_CHANNELSPLIT_CUSTOM_TILING_H diff --git a/examples/matrix/matmul_channelsplit/op_kernel/matmul_channelsplit_custom_kernel.cpp b/examples/matrix/matmul_channelsplit/op_kernel/matmul_channelsplit_custom_kernel.cpp new file mode 100644 index 00000000..12a7866b --- /dev/null +++ b/examples/matrix/matmul_channelsplit/op_kernel/matmul_channelsplit_custom_kernel.cpp @@ -0,0 +1,156 @@ +/** + * Copyright (c) 2025 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. 
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ + +#include "kernel_operator.h" +#include "matmul_channelsplit_custom_kernel.h" + +namespace { +/** + * @brief Copy tiling data to TCubeTiling ptr from tiling gm addr. + * @param tiling: TCubeTiling ptr which needs to copy tiling data. + * @param tilingGM: Tiling gm addr. + * @retval None + */ +__aicore__ inline void CopyTiling(TCubeTiling* tiling, GM_ADDR tilingGM) +{ + uint32_t* ptr = reinterpret_cast(tiling); + auto tiling32 = reinterpret_cast<__gm__ uint32_t*>(tilingGM); + + for (int i = 0; i < sizeof(TCubeTiling) / sizeof(uint32_t); i++, ptr++) { + *ptr = *(tiling32 + i); + } + return; +} + +template +__aicore__ inline T Ceil(T num1, T num2) +{ + if (num2 == 0) { + return num1; + } + return (num1 + num2 - 1) / num2; +} + +constexpr int32_t BLOCK_SIZE = 16; +} + +namespace MatmulChannelSplitCustom { +template +__aicore__ inline void MatmulKernel::Init( + GM_ADDR a, GM_ADDR b, GM_ADDR bias, GM_ADDR c, const TCubeTiling& tiling, bool isTransA, bool isTransB) +{ + this->tiling = tiling; + aGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ AType*>(a), tiling.M * tiling.Ka); + bGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ BType*>(b), tiling.Kb * tiling.N); + // float NZ cMatrix output, 16*8 fractal + uint32_t alignM = Ceil(tiling.M, BLOCK_SIZE) * BLOCK_SIZE; + uint32_t alignN = Ceil(tiling.N, BLOCK_SIZE / 2) * (BLOCK_SIZE / 2); + cGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ CType*>(c), alignM * alignN); + biasGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ BiasType*>(bias), tiling.N); + + int32_t offsetA = 0; + int32_t offsetB = 0; + int32_t offsetC = 0; + int32_t offsetBias = 0; + this->isTransA = isTransA; + this->isTransB = isTransB; + 
CalcOffset(AscendC::GetBlockIdx(), offsetA, offsetB, offsetC, offsetBias); + aGlobal = aGlobal[offsetA]; + bGlobal = bGlobal[offsetB]; + cGlobal = cGlobal[offsetC]; + biasGlobal = biasGlobal[offsetBias]; + if (GetSysWorkSpacePtr() == nullptr) { + return; + } +} + +template +__aicore__ inline void MatmulKernel::Process() +{ + if (AscendC::GetBlockIdx() >= tiling.usedCoreNum) { + return; + } + + // process with tail block + int tailM = tiling.M - mCoreIndex * tiling.singleCoreM; + tailM = tailM < tiling.singleCoreM ? tailM : tiling.singleCoreM; + int tailN = tiling.N - nCoreIndex * tiling.singleCoreN; + tailN = tailN < tiling.singleCoreN ? tailN : tiling.singleCoreN; + if (tailM < tiling.singleCoreM || tailN < tiling.singleCoreN) { + matmulObj.SetTail(tailM, tailN); + } + + matmulObj.SetTensorA(aGlobal, isTransA); + matmulObj.SetTensorB(bGlobal, isTransB); + if (tiling.isBias) { + matmulObj.SetBias(biasGlobal); + } + matmulObj.IterateAll(cGlobal); + matmulObj.End(); +} + +template +__aicore__ inline void MatmulKernel::CalcOffset( + int32_t blockIdx, int32_t& offsetA, int32_t& offsetB, int32_t& offsetC, int32_t& offsetBias) +{ + const TCubeTiling& tiling = this->tiling; + auto mSingleBlocks = (tiling.M + tiling.singleCoreM - 1) / tiling.singleCoreM; // split M into mSingleBlocks cores + mCoreIndex = blockIdx % mSingleBlocks; + nCoreIndex = blockIdx / mSingleBlocks; + + if (isTransA) { + offsetA = mCoreIndex * tiling.singleCoreM; + } else { + offsetA = mCoreIndex * tiling.Ka * tiling.singleCoreM; + } + if (isTransB) { + offsetB = nCoreIndex * tiling.Kb * tiling.singleCoreN; + } else { + offsetB = nCoreIndex * tiling.singleCoreN; + } + // output C matrix is NZ format + offsetC = tiling.M * nCoreIndex * tiling.singleCoreN + mCoreIndex * tiling.singleCoreM * BLOCK_SIZE / 2; + offsetBias = nCoreIndex * tiling.singleCoreN; +} +} // namespace MatmulChannelSplitCustom + +/** + * @brief matmul kernel function. + * @param a: A matrix gm addr. + * @param b: B matrix gm addr. 
+ * @param bias: Bias matrix gm addr. + * @param c: C matrix gm addr. + * @param workspace: Temporary gm space addr required by matmul calc. + * @param tilingGm: Tiling data addr. + * @retval None + */ +extern "C" __global__ __aicore__ void matmul_channelsplit_custom(GM_ADDR a, GM_ADDR b, + GM_ADDR bias, GM_ADDR c, GM_ADDR workspace, GM_ADDR tilingGm) +{ + // prepare tiling + TCubeTiling tiling; + CopyTiling(&tiling, tilingGm); + // define matmul kernel + MatmulChannelSplitCustom::MatmulKernel matmulKernel; + AscendC::TPipe pipe; + REGIST_MATMUL_OBJ(&pipe, GetSysWorkSpacePtr(), matmulKernel.matmulObj, &tiling); + // init matmul kernel, isTransA=false, isTransB=false + matmulKernel.Init(a, b, bias, c, tiling, false, false); + // matmul kernel process + matmulKernel.Process(); +} + +#ifndef ASCENDC_CPU_DEBUG +void matmul_channelsplit_custom_do(uint32_t blockDim, void* stream, + GM_ADDR a, GM_ADDR b, GM_ADDR bias, GM_ADDR c, GM_ADDR workspace, GM_ADDR tilingGm) +{ + matmul_channelsplit_custom<<>>(a, b, bias, c, workspace, tilingGm); +} +#endif diff --git a/examples/matrix/matmul_channelsplit/op_kernel/matmul_channelsplit_custom_kernel.h b/examples/matrix/matmul_channelsplit/op_kernel/matmul_channelsplit_custom_kernel.h new file mode 100644 index 00000000..e88082a2 --- /dev/null +++ b/examples/matrix/matmul_channelsplit/op_kernel/matmul_channelsplit_custom_kernel.h @@ -0,0 +1,78 @@ +/** + * Copyright (c) 2025 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. 
+ * See LICENSE in the root of the software repository for the full text of the License. + */ + +#ifndef EXAMPLES_MATRIX_MATMUL_CHANNELSPLIT_OP_KERNEL_MATMUL_CHANNELSPLIT_CUSTOM_KERNEL_H +#define EXAMPLES_MATRIX_MATMUL_CHANNELSPLIT_OP_KERNEL_MATMUL_CHANNELSPLIT_CUSTOM_KERNEL_H +#include "kernel_operator.h" +#include "lib/matmul_intf.h" + +namespace MatmulChannelSplitCustom { +constexpr static MatmulConfigMode configMode = MatmulConfigMode::CONFIG_NORM; +// Set isEnableChannelSplit=true +constexpr static MatmulFuncParams funcParamsChannelSplit{ + false, false, false, false, 0, IterateOrder::ORDER_M, ScheduleType::INNER_PRODUCT, + true, false, false, false, true/*isEnableChannelSplit*/ +}; +constexpr static MatmulConfig CFG_NORM = GetMMConfig(funcParamsChannelSplit); +template +class MatmulKernel { +public: + __aicore__ inline MatmulKernel(){}; + /** + * @brief Initialization before process. + * @param a: A matrix gm addr. + * @param b: B matrix gm addr. + * @param bias: Bias matrix gm addr. + * @param c: C matrix gm addr. + * @param tiling: Matmul tiling struct. + * @param isTransA: Whether A matrix is transposed. + * @param isTransB: Whether B matrix is transposed. + * @retval None + */ + __aicore__ inline void Init(GM_ADDR a, GM_ADDR b, GM_ADDR bias, GM_ADDR c, const TCubeTiling& tiling, + bool isTransA, bool isTransB); + /** + * @brief Process matrix calculation. + * @retval None + */ + __aicore__ inline void Process(); + + using A_TYPE = AscendC::MatmulType; + using B_TYPE = AscendC::MatmulType; + using C_TYPE = AscendC::MatmulType; + using BIAS_TYPE = AscendC::MatmulType; + AscendC::Matmul matmulObj; + +private: + /** + * @brief Calculate the gm offset based on the blockIdx. + * @param blockIdx: Current Core blockidx. + * @param offsetA: Gm offset of A matrix. + * @param offsetB: Gm offset of B matrix. + * @param offsetC: Gm offset of C matrix. + * @param offsetBias: Gm offset of Bias matrix. 
+ * @retval None + */ + __aicore__ inline void CalcOffset( + int32_t blockIdx, int32_t& offsetA, int32_t& offsetB, int32_t& offsetC, int32_t& offsetBias); + + AscendC::GlobalTensor aGlobal; + AscendC::GlobalTensor bGlobal; + AscendC::GlobalTensor cGlobal; + AscendC::GlobalTensor biasGlobal; + TCubeTiling tiling; + int32_t mCoreIndex; + int32_t nCoreIndex; + bool isTransA{false}; + bool isTransB{false}; +}; +} // namespace MatmulChannelSplitCustom + +#endif // EXAMPLES_MATRIX_MATMUL_CHANNELSPLIT_OP_KERNEL_MATMUL_CHANNELSPLIT_CUSTOM_KERNEL_H diff --git a/examples/matrix/matmul_channelsplit/run.sh b/examples/matrix/matmul_channelsplit/run.sh new file mode 100644 index 00000000..1153cb01 --- /dev/null +++ b/examples/matrix/matmul_channelsplit/run.sh @@ -0,0 +1,96 @@ +#!/bin/bash +# Copyright (c) 2025 Huawei Technologies Co., Ltd. +# This file is a part of the CANN Open Software. +# Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. +# ====================================================================================================================== + +export IS_PERF="0" + +SHORT=r:,v:,p:, +LONG=run-mode:,soc-version:,perf:, +OPTS=$(getopt -a --options $SHORT --longoptions $LONG -- "$@") +eval set -- "$OPTS" +while : +do + case "$1" in + (-r | --run-mode ) + RUN_MODE="$2" + shift 2;; + (-v | --soc-version ) + SOC_VERSION="$2" + shift 2;; + (-p | --perf ) + IS_PERF="$2" + shift 2;; + (--) + shift; + break;; + (*) + echo "[ERROR] Unexpected option: $1"; + break;; + esac +done + +# Check invalid input +if [[ ! 
"${SOC_VERSION}" =~ ^Ascend910 ]]; then + echo "[ERROR] Unsupported SocVersion: ${SOC_VERSION}" + exit 1 +fi + +if [ "${RUN_MODE}" != "npu" ] && [ "${RUN_MODE}" != "sim" ] && [ "${RUN_MODE}" != "cpu" ]; then + echo "[ERROR] Unsupported RUN_MODE: ${RUN_MODE}, which can only be cpu/sim/npu." + exit 1 +fi + +if [ "${IS_PERF}" != "0" ] && [ "${IS_PERF}" != "1" ]; then + echo "[ERROR] Unsupported IS_PERF: ${IS_PERF}, which can only be 0 or 1." + exit 1 +fi + +# only npu mode support is_perf = 1 +if [ "${IS_PERF}" == "1" ] && [ "${RUN_MODE}" != "npu" ]; then + echo "[ERROR] Unsupported IS_PERF: ${IS_PERF} while RUN_MODE is ${RUN_MODE}." + exit 1 +fi + +rm -rf build +mkdir build +cd build + +source $ASCEND_HOME_DIR/bin/setenv.bash +export LD_LIBRARY_PATH=${ASCEND_HOME_DIR}/tools/simulator/${SOC_VERSION}/lib:$LD_LIBRARY_PATH +set -euo pipefail + +cmake -DRUN_MODE=${RUN_MODE} -DSOC_VERSION=${SOC_VERSION} -DASCEND_CANN_PACKAGE_PATH=${ASCEND_HOME_DIR} .. +make -j16 + +cd ../ + +rm -rf input +mkdir input +rm -rf output +mkdir output + +rm -rf bin/ +mkdir -p bin +cd bin +cp ../build/ascendc_matmul_channelsplit_bbit ./ + +export TF_CPP_MIN_LOG_LEVEL=3 + +if [ "${RUN_MODE}" = "npu" ]; then + if [ "${IS_PERF}" = "1" ]; then + export ASCEND_TOOLKIT_HOME=${ASCEND_HOME_DIR} + python3 -u ../scripts/exec_test.py npu "perf" + else + python3 -u ../scripts/exec_test.py npu "normal" + fi +elif [ "${RUN_MODE}" = "sim" ]; then + python3 -u ../scripts/exec_test.py sim "normal" +elif [ "${RUN_MODE}" = "cpu" ]; then + python3 -u ../scripts/exec_test.py cpu "normal" +fi diff --git a/examples/matrix/matmul_channelsplit/scripts/exec_test.py b/examples/matrix/matmul_channelsplit/scripts/exec_test.py new file mode 100644 index 00000000..1ab5d18d --- /dev/null +++ b/examples/matrix/matmul_channelsplit/scripts/exec_test.py @@ -0,0 +1,130 @@ +#!/usr/bin/python3 +# coding=utf-8 + +# Copyright (c) 2025 Huawei Technologies Co., Ltd. +# This file is a part of the CANN Open Software. 
+# Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. +# ====================================================================================================================== +import os +import sys +import csv +import time +import logging +import subprocess +import shlex + +import numpy as np + +sys.path.append("../..") +from common_scripts.gen_data import MatmulGenData +from common_scripts.compare_data import compare_data +from common_scripts.exec_utils import get_file_work_dir, get_case_list, get_perf_task_duration, clear_file_cache + +IS_BIAS = True +IS_TRANS_A = False +IS_TRANS_B = False +IS_CHANNEL_SPLIT = True +# float16 in float32 out +DATA_TYPE_STR = "float16_float32" +A_FORMAT = "ND" +B_FORMAT = "ND" +C_FORMAT = "NZ" + +logging.basicConfig(level=logging.INFO) + + +class ProcessParams: + def __init__(self, case_name, m, n, k, b, is_perf, run_mode): + self.case_name = case_name + self.m = m + self.n = n + self.k = k + self.b = b + self.is_perf = is_perf + self.run_mode = run_mode + + +def process_case(file_work_dir, process_params): + case_name, m, n, k, b, is_perf, run_mode = process_params.case_name, process_params.m, process_params.n,\ + process_params.k, process_params.b, process_params.is_perf, process_params.run_mode + logging.info("[INFO] start process case [%s]" % (case_name)) + logging.info("[INFO] IS_PERF is set [%d]" % (is_perf)) + clear_file_cache(file_work_dir) + + matmul_gen_data = MatmulGenData(m, n, k, b, IS_TRANS_A, IS_TRANS_B, IS_BIAS, DATA_TYPE_STR,\ + A_FORMAT, B_FORMAT, C_FORMAT, IS_CHANNEL_SPLIT) + 
params_str = f"{m} {n} {k}" + if is_perf: + matmul_gen_data.gen_fake_golden_data(file_work_dir) + cmd = f"msprof op --application=\"./ascendc_matmul_channelsplit_bbit {params_str}\" --output=./prof_out" + elif run_mode == "sim": + matmul_gen_data.gen_golden_data(file_work_dir) + cmd = f"msprof op simulator --application=\"./ascendc_matmul_channelsplit_bbit {params_str}\"\ + --output=./sim_out" + else: + matmul_gen_data.gen_golden_data(file_work_dir) + cmd = f"./ascendc_matmul_channelsplit_bbit {params_str}" + subprocess.run(shlex.split(cmd)) + if is_perf: + wrong_num = -1 + else: + logging.info("[INFO] compare data case[%s]" % (case_name)) + wrong_num = compare_data(file_work_dir, DATA_TYPE_STR) + res_data = [] + res_data.append(case_name) + res_data.append(wrong_num) + res_data.append(b * m * n) + if wrong_num == -1: + res_data.append("None") + elif wrong_num / (b * m * n) > 0.001: + res_data.append("Fail") + else: + res_data.append("Success") + if is_perf: + task_duration = get_perf_task_duration("./prof_out") + res_data.append(task_duration) + return res_data + + +def main(): + args_len = len(sys.argv) - 1 + if args_len != 2: + logging.info("[ERROR] exec_test input params error!") + return -1 + + file_work_dir = get_file_work_dir() + if not os.path.exists(file_work_dir): + logging.info("[ERROR] file path %s not exist!" 
% (file_work_dir)) + return -1 + + is_perf = False + if sys.argv[2] == "perf": + is_perf = True + + case_list = get_case_list() + res_list = [["case_name", "wrong_num", "total_num", "result", "task_duration"]] + run_mode = sys.argv[1] + for is_process, case_name, m, n, k, b in case_list: + if is_process == 1: + process_params = ProcessParams(case_name, m, n, k, b, is_perf, run_mode) + res_data = process_case(file_work_dir, process_params) + res_list.append(res_data) + + timestamp = time.time() + result_file_name = "result_" + str(timestamp) + ".csv" + with open(os.path.join(file_work_dir, "output", result_file_name), 'w', newline='', encoding='utf-8') as csvfile: + writer = csv.writer(csvfile) + writer.writerows(res_list) + + logging.info("---------------RESULT---------------") + for res in res_list: + logging.info(res) + return 0 + + +if __name__ == "__main__": + main() diff --git a/examples/matrix/matmul_channelsplit/testcase/case.csv b/examples/matrix/matmul_channelsplit/testcase/case.csv new file mode 100644 index 00000000..8fb2bb95 --- /dev/null +++ b/examples/matrix/matmul_channelsplit/testcase/case.csv @@ -0,0 +1 @@ +1, case001, 64, 7680, 64 \ No newline at end of file diff --git a/examples/matrix/matmul_constant/README.md b/examples/matrix/matmul_constant/README.md index 9ff8bc85..30106c66 100644 --- a/examples/matrix/matmul_constant/README.md +++ b/examples/matrix/matmul_constant/README.md @@ -119,4 +119,4 @@ MatmulConstantCustom单算子,对输入的A、B矩阵做矩阵乘和加bias偏 示例如下,Ascendxxxyy请替换为实际的AI处理器型号。 ``` bash run.sh -r cpu -v Ascendxxxyy -p 0 - ``` + ``` \ No newline at end of file diff --git a/examples/matrix/matmul_constant/scripts/exec_test.py b/examples/matrix/matmul_constant/scripts/exec_test.py index 28299603..aacdcd2d 100644 --- a/examples/matrix/matmul_constant/scripts/exec_test.py +++ b/examples/matrix/matmul_constant/scripts/exec_test.py @@ -14,13 +14,15 @@ import sys import csv import time import logging +import subprocess import numpy as np 
sys.path.append("../..") from common_scripts.gen_data import MatmulGenData from common_scripts.compare_data import compare_data -from common_scripts.exec_utils import get_file_work_dir, get_case_list, get_perf_task_duration, clear_file_cache +from common_scripts.exec_utils import get_file_work_dir, get_case_list, get_perf_task_duration, clear_file_cache,\ + get_process_case_cmd IS_BIAS = False IS_TRANS_A = False @@ -50,17 +52,14 @@ def process_case(file_work_dir, process_params): clear_file_cache(file_work_dir) matmul_gen_data = MatmulGenData(m, n, k, b, IS_TRANS_A, IS_TRANS_B, IS_BIAS, DATA_TYPE_STR) - if is_perf: # npu(is_perf = 1) + if is_perf: matmul_gen_data.gen_fake_golden_data(file_work_dir) - os.system("msprof op --application=\"./ascendc_matmul_bbit %s %s %s %s\" --output=\"./prof_out\"" % - (m, n, k, b)) - elif run_mode == "sim": # sim - matmul_gen_data.gen_golden_data(file_work_dir) - os.system("msprof op simulator --application=\"./ascendc_matmul_bbit %s %s %s %s\"\ - --output=\"./sim_out\"" % (m, n, k, b)) - else: # cpu or npu(is_perf = 0) + else: matmul_gen_data.gen_golden_data(file_work_dir) - os.system("./ascendc_matmul_bbit %s %s %s %s" % (m, n, k, b)) + params_str = f"{m} {n} {k} {b}" + cmd = get_process_case_cmd(kernel_name="ascendc_matmul_bbit", params_str=params_str,\ + is_perf=is_perf, run_mode=run_mode) + subprocess.run(cmd) if is_perf: wrong_num = -1 else: diff --git a/examples/matrix/matmul_l0c_extend/README.md b/examples/matrix/matmul_l0c_extend/README.md index e8aadba3..fd221900 100644 --- a/examples/matrix/matmul_l0c_extend/README.md +++ b/examples/matrix/matmul_l0c_extend/README.md @@ -85,7 +85,7 @@ - 结束矩阵乘操作。 - 算子Tiling实现 - - Ascend C提供一组Matmul Tiling API,方便用户获取MatMul kernel计算时所需的Tiling参数。只需要传入A/B/C矩阵等信息,调用API接口,即可获取到TCubeTiling结构体中的相关参数。 + - Ascend C提供一组Matmul Tiling API,方便用户获取Matmul kernel计算时所需的Tiling参数。只需要传入A/B/C矩阵等信息,调用API接口,即可获取到TCubeTiling结构体中的相关参数。 - 获取Tiling参数的流程如下: - 创建一个Tiling对象。 - 设置A、B、C、Bias的参数类型信息;M、N、Ka、Kb形状信息等。 diff 
--git a/examples/matrix/matmul_l0c_extend/scripts/exec_test.py b/examples/matrix/matmul_l0c_extend/scripts/exec_test.py index 8087a5e0..a684ea7f 100644 --- a/examples/matrix/matmul_l0c_extend/scripts/exec_test.py +++ b/examples/matrix/matmul_l0c_extend/scripts/exec_test.py @@ -14,13 +14,15 @@ import sys import csv import time import logging +import subprocess import numpy as np sys.path.append("../..") from common_scripts.gen_data import MatmulGenData from common_scripts.compare_data import compare_data -from common_scripts.exec_utils import get_file_work_dir, get_case_list, get_perf_task_duration, clear_file_cache +from common_scripts.exec_utils import get_file_work_dir, get_case_list, get_perf_task_duration, clear_file_cache,\ + get_process_case_cmd IS_BIAS = True IS_TRANS_A = True @@ -50,17 +52,14 @@ def process_case(file_work_dir, process_params): clear_file_cache(file_work_dir) matmul_gen_data = MatmulGenData(m, n, k, b, IS_TRANS_A, IS_TRANS_B, IS_BIAS, DATA_TYPE_STR) - if is_perf: # npu(is_perf = 1) + if is_perf: matmul_gen_data.gen_fake_golden_data(file_work_dir) - os.system("msprof op --application=\"./ascendc_matmul_l0c_extend_bbit %s %s %s %s\" --output=\"./prof_out\"" % - (m, n, k, b)) - elif run_mode == "sim": # sim - matmul_gen_data.gen_golden_data(file_work_dir) - os.system("msprof op simulator --application=\"./ascendc_matmul_l0c_extend_bbit %s %s %s %s\"\ - --output=\"./sim_out\"" % (m, n, k, b)) - else: # cpu or npu(is_perf = 0) + else: matmul_gen_data.gen_golden_data(file_work_dir) - os.system("./ascendc_matmul_l0c_extend_bbit %s %s %s %s" % (m, n, k, b)) + params_str = f"{m} {n} {k} {b}" + cmd = get_process_case_cmd(kernel_name="ascendc_matmul_l0c_extend_bbit", params_str=params_str,\ + is_perf=is_perf, run_mode=run_mode) + subprocess.run(cmd) if is_perf: wrong_num = -1 else: diff --git a/examples/matrix/matmul_l0cache/README.md b/examples/matrix/matmul_l0cache/README.md index aa224a3b..cd51ddae 100644 --- 
a/examples/matrix/matmul_l0cache/README.md +++ b/examples/matrix/matmul_l0cache/README.md @@ -85,7 +85,7 @@ L0缓存没有对外开关,由Matmul API内部根据用户配置的shape信息 ``` - 算子Tiling实现 - - Ascend C提供一组Matmul Tiling API,方便用户获取MatMul kernel计算时所需的Tiling参数。只需要传入A/B/C矩阵等信息,调用API接口,即可获取到TCubeTiling结构体中的相关参数。 + - Ascend C提供一组Matmul Tiling API,方便用户获取Matmul kernel计算时所需的Tiling参数。只需要传入A/B/C矩阵等信息,调用API接口,即可获取到TCubeTiling结构体中的相关参数。 - 获取Tiling参数的流程如下: - 创建一个Tiling对象。 - 设置A、B、C、Bias的参数类型信息;M、N、Ka、Kb形状信息等。 diff --git a/examples/matrix/matmul_l2cache/README.md b/examples/matrix/matmul_l2cache/README.md index 03d5b2ff..6ccb24ab 100644 --- a/examples/matrix/matmul_l2cache/README.md +++ b/examples/matrix/matmul_l2cache/README.md @@ -15,7 +15,7 @@ | [cmake](cmake) | 编译工程文件 | | [op_host](op_host) | 本样例tiling代码实现 | | [op_kernel](op_kernel) | 本样例kernel代码实现 | -| [scripts](scripts) | 执行文件 | +| [scripts](scripts) | 包含输入输出真值数据生成脚本文件和执行文件 | | [testcase](testcase) | 用例文件,配置用例的计算shape信息 | | [CMakeLists.txt](CMakeLists.txt) | 编译工程文件 | | [main.cpp](main.cpp) | 主函数,调用算子的应用程序,含CPU域及NPU域调用 | @@ -75,7 +75,7 @@ - 结束矩阵乘操作。 - Tiling实现 - - Ascend C提供一组Matmul Tiling API,方便用户获取MatMul kernel计算时所需的Tiling参数。只需要传入A/B/C矩阵等信息,调用API接口,即可获取到TCubeTiling结构体中的相关参数。 + - Ascend C提供一组Matmul Tiling API,方便用户获取Matmul kernel计算时所需的Tiling参数。只需要传入A/B/C矩阵等信息,调用API接口,即可获取到TCubeTiling结构体中的相关参数。 - 获取Tiling参数的流程如下: - 创建一个Tiling对象。 - 设置A、B、C、Bias的参数类型信息。根据数据的切分方式,设置数据切分后每次计算的Shape和数据切分前的完整OrgShape。 diff --git a/examples/matrix/matmul_l2cache/main.cpp b/examples/matrix/matmul_l2cache/main.cpp index 6e54a5e4..608da572 100644 --- a/examples/matrix/matmul_l2cache/main.cpp +++ b/examples/matrix/matmul_l2cache/main.cpp @@ -189,7 +189,8 @@ int32_t main(int32_t argc, const char* args[]) ss >> problem[i - 1]; } - optiling::TestcaseParams caseParams = {problem[0], problem[1], problem[2], problem[3], problem[4]}; + bool isBias = problem[3]; + optiling::TestcaseParams caseParams = {problem[0], problem[1], problem[2], isBias, problem[4]}; auto ascendcPlatform = 
platform_ascendc::PlatformAscendCManager::GetInstance(SOC_VERSION); #ifdef ASCENDC_CPU_DEBUG MatmulHost::TestMatmulCpu(caseParams); diff --git a/examples/matrix/matmul_mixdualmaster/README.md b/examples/matrix/matmul_mixdualmaster/README.md index 80c93599..4465a7c5 100644 --- a/examples/matrix/matmul_mixdualmaster/README.md +++ b/examples/matrix/matmul_mixdualmaster/README.md @@ -64,7 +64,7 @@ - 结束矩阵乘操作。 - Tiling实现 - - Ascend C提供一组Matmul Tiling API,方便用户获取MatMul kernel计算时所需的Tiling参数。只需要传入A/B/C矩阵等信息,调用API接口,即可获取到TCubeTiling结构体中的相关参数。 + - Ascend C提供一组Matmul Tiling API,方便用户获取Matmul kernel计算时所需的Tiling参数。只需要传入A/B/C矩阵等信息,调用API接口,即可获取到TCubeTiling结构体中的相关参数。 - 获取Tiling参数的流程如下: - 创建一个Tiling对象。 - 设置A、B、C、Bias的参数类型信息;M、N、Ka、Kb形状信息等。 diff --git a/examples/matrix/matmul_mixdualmaster/run.sh b/examples/matrix/matmul_mixdualmaster/run.sh index 3933ebf5..78e91376 100644 --- a/examples/matrix/matmul_mixdualmaster/run.sh +++ b/examples/matrix/matmul_mixdualmaster/run.sh @@ -61,8 +61,6 @@ rm -rf build mkdir build cd build -export TF_CPP_MIN_LOG_LEVEL=3 - source $ASCEND_HOME_DIR/bin/setenv.bash export LD_LIBRARY_PATH=${ASCEND_HOME_DIR}/tools/simulator/${SOC_VERSION}/lib:$LD_LIBRARY_PATH set -euo pipefail @@ -82,6 +80,8 @@ mkdir -p bin cd bin cp ../build/ascendc_matmul_mixdualmaster_bbit ./ +export TF_CPP_MIN_LOG_LEVEL=3 + if [ "${RUN_MODE}" = "npu" ]; then if [ "${IS_PERF}" = "1" ]; then export ASCEND_TOOLKIT_HOME=${ASCEND_HOME_DIR} diff --git a/examples/matrix/matmul_mixdualmaster/scripts/exec_test.py b/examples/matrix/matmul_mixdualmaster/scripts/exec_test.py index 532b9f3f..befb8094 100644 --- a/examples/matrix/matmul_mixdualmaster/scripts/exec_test.py +++ b/examples/matrix/matmul_mixdualmaster/scripts/exec_test.py @@ -14,13 +14,15 @@ import sys import csv import time import logging +import subprocess import numpy as np sys.path.append("../..") from common_scripts.gen_data import MatmulGenData from common_scripts.compare_data import compare_data -from common_scripts.exec_utils import 
get_file_work_dir, get_case_list, get_perf_task_duration, clear_file_cache +from common_scripts.exec_utils import get_file_work_dir, get_case_list, get_perf_task_duration, clear_file_cache,\ + get_process_case_cmd IS_BIAS = False IS_TRANS_A = False @@ -50,17 +52,14 @@ def process_case(file_work_dir, process_params): clear_file_cache(file_work_dir) matmul_gen_data = MatmulGenData(m, n, k, b, IS_TRANS_A, IS_TRANS_B, IS_BIAS, DATA_TYPE_STR) - if is_perf: # npu(is_perf = 1) + if is_perf: matmul_gen_data.gen_fake_golden_data(file_work_dir) - os.system("msprof op --application=\"./ascendc_matmul_mixdualmaster_bbit %s %s %s %s\" --output=\"./prof_out\"" % - (m, n, k, b)) - elif run_mode == "sim": # sim - matmul_gen_data.gen_golden_data(file_work_dir) - os.system("msprof op simulator --application=\"./ascendc_matmul_mixdualmaster_bbit %s %s %s %s\"\ - --output=\"./sim_out\"" % (m, n, k, b)) - else: # cpu or npu(is_perf = 0) + else: matmul_gen_data.gen_golden_data(file_work_dir) - os.system("./ascendc_matmul_mixdualmaster_bbit %s %s %s %s" % (m, n, k, b)) + params_str = f"{m} {n} {k} {b}" + cmd = get_process_case_cmd(kernel_name="ascendc_matmul_mixdualmaster_bbit", params_str=params_str,\ + is_perf=is_perf, run_mode=run_mode) + subprocess.run(cmd) if is_perf: wrong_num = -1 else: diff --git a/examples/matrix/matmul_mndb/README.md b/examples/matrix/matmul_mndb/README.md index 236a69b5..a52a0d3a 100644 --- a/examples/matrix/matmul_mndb/README.md +++ b/examples/matrix/matmul_mndb/README.md @@ -66,7 +66,7 @@ - 结束矩阵乘操作。 - 算子Tiling实现 - - Ascend C提供一组Matmul Tiling API,方便用户获取MatMul Kernel计算时所需的Tiling参数。只需要传入A/B/C矩阵等信息,调用API接口,即可获取到TCubeTiling结构体中的相关参数。 + - Ascend C提供一组Matmul Tiling API,方便用户获取Matmul Kernel计算时所需的Tiling参数。只需要传入A/B/C矩阵等信息,调用API接口,即可获取到TCubeTiling结构体中的相关参数。 - 获取Tiling参数的流程如下: - 创建一个Tiling对象。 - 设置A、B、C、Bias的参数类型信息;M、N、Ka、Kb形状信息等。 diff --git a/examples/matrix/matmul_mndb/scripts/exec_test.py b/examples/matrix/matmul_mndb/scripts/exec_test.py index 8429d6e1..cc700ad5 
100644 --- a/examples/matrix/matmul_mndb/scripts/exec_test.py +++ b/examples/matrix/matmul_mndb/scripts/exec_test.py @@ -14,13 +14,15 @@ import sys import csv import time import logging +import subprocess import numpy as np sys.path.append("../..") from common_scripts.gen_data import MatmulGenData from common_scripts.compare_data import compare_data -from common_scripts.exec_utils import get_file_work_dir, get_case_list, get_perf_task_duration, clear_file_cache +from common_scripts.exec_utils import get_file_work_dir, get_case_list, get_perf_task_duration, clear_file_cache,\ + get_process_case_cmd IS_BIAS = False IS_TRANS_A = False @@ -50,17 +52,14 @@ def process_case(file_work_dir, process_params): clear_file_cache(file_work_dir) matmul_gen_data = MatmulGenData(m, n, k, b, IS_TRANS_A, IS_TRANS_B, IS_BIAS, DATA_TYPE_STR) - if is_perf: # npu(is_perf = 1) + if is_perf: matmul_gen_data.gen_fake_golden_data(file_work_dir) - os.system("msprof op --application=\"./ascendc_matmul_mndb_bbit %s %s %s %s\" --output=\"./prof_out\"" % - (m, n, k, b)) - elif run_mode == "sim": # sim - matmul_gen_data.gen_golden_data(file_work_dir) - os.system("msprof op simulator --application=\"./ascendc_matmul_mndb_bbit %s %s %s %s\"\ - --output=\"./sim_out\"" % (m, n, k, b)) - else: # cpu or npu(is_perf = 0) + else: matmul_gen_data.gen_golden_data(file_work_dir) - os.system("./ascendc_matmul_mndb_bbit %s %s %s %s" % (m, n, k, b)) + params_str = f"{m} {n} {k} {b}" + cmd = get_process_case_cmd(kernel_name="ascendc_matmul_mndb_bbit", params_str=params_str,\ + is_perf=is_perf, run_mode=run_mode) + subprocess.run(cmd) if is_perf: wrong_num = -1 else: diff --git a/examples/matrix/matmul_nbuffer33/README.md b/examples/matrix/matmul_nbuffer33/README.md index 4b6ced60..4d60f7ba 100644 --- a/examples/matrix/matmul_nbuffer33/README.md +++ b/examples/matrix/matmul_nbuffer33/README.md @@ -69,10 +69,16 @@ - 结束矩阵乘操作。 - 算子Tiling实现 - - Ascend C提供一组Matmul Tiling API,方便用户获取MatMul 
kernel计算时所需的Tiling参数。只需要传入A/B/C矩阵等信息,调用API接口,即可获取到TCubeTiling结构体中的相关参数。 + - Ascend C提供一组Matmul Tiling API,方便用户获取Matmul kernel计算时所需的Tiling参数。只需要传入A/B/C矩阵等信息,调用API接口,即可获取到TCubeTiling结构体中的相关参数。 - 获取Tiling参数的流程如下: - 创建一个Tiling对象。 - - 设置A、B、C、Bias的参数类型信息;M、N、Ka、Kb形状信息等。 + - 设置A、B、C、Bias的参数类型信息;M、N、Ka、Kb形状信息等,开启NBuffer33模式。 + ``` + matmul_tiling::MatmulConfigParams matmulConfigParams(1, false, + matmul_tiling::ScheduleType::N_BUFFER_33, /* NBuffer33模式 */ + matmul_tiling::MatrixTraverse::NOSET, false); + cubeTiling.SetMatmulConfigParams(matmulConfigParams); + ``` - 调用GetTiling接口,获取Tiling信息。 ## 编译运行样例 diff --git a/examples/matrix/matmul_nbuffer33/main.cpp b/examples/matrix/matmul_nbuffer33/main.cpp index 21bd7b4f..6208faf0 100644 --- a/examples/matrix/matmul_nbuffer33/main.cpp +++ b/examples/matrix/matmul_nbuffer33/main.cpp @@ -190,8 +190,9 @@ void TestMatmul(const MatmulHost::MatmulCaseParams &testCaseParams) int32_t main(int32_t argc, const char *args[]) { - int64_t problem[12] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; - for (int32_t i = 1; i < argc && i < 13; ++i) { // 13 + int64_t problem[3] = {1, 1, 1}; + int32_t size = 4; + for (int32_t i = 1; i < argc && i < size; ++i) { std::stringstream ss(args[i]); ss >> problem[i - 1]; } @@ -199,10 +200,7 @@ int32_t main(int32_t argc, const char *args[]) auto ascendcPlatform = platform_ascendc::PlatformAscendCManager::GetInstance(socVersion); MatmulHost::MatmulCaseParams testCaseParams{static_cast(ascendcPlatform->GetCoreNumAic()), static_cast(problem[0]), static_cast(problem[1]), static_cast(problem[2]), - IS_BIAS, IS_A_TRANS, IS_B_TRANS, - static_cast(problem[3]), static_cast(problem[4]), static_cast(problem[5]), - static_cast(problem[6]), static_cast(problem[7]), static_cast(problem[8]), - static_cast(problem[9]), static_cast(problem[10]), static_cast(problem[11])}; + IS_BIAS, IS_A_TRANS, IS_B_TRANS}; #ifdef ASCENDC_CPU_DEBUG MatmulHost::TestMatmulCpu(testCaseParams); #else diff --git 
a/examples/matrix/matmul_nbuffer33/op_host/matmul_nbuffer33_custom_tiling.cpp b/examples/matrix/matmul_nbuffer33/op_host/matmul_nbuffer33_custom_tiling.cpp index c71416ea..4c3bbf51 100644 --- a/examples/matrix/matmul_nbuffer33/op_host/matmul_nbuffer33_custom_tiling.cpp +++ b/examples/matrix/matmul_nbuffer33/op_host/matmul_nbuffer33_custom_tiling.cpp @@ -21,7 +21,7 @@ TCubeTiling GenerateTiling(const MatmulCaseParams& testCaseParams) uint32_t M = testCaseParams.m; uint32_t N = testCaseParams.n; uint32_t K = testCaseParams.k; - uint32_t blockDim = 1; + uint32_t blockDim = testCaseParams.usedCoreNum; bool isBias = testCaseParams.isBias; bool isAtrans = testCaseParams.isATrans; bool isBtrans = testCaseParams.isBTrans; @@ -39,19 +39,13 @@ TCubeTiling GenerateTiling(const MatmulCaseParams& testCaseParams) cubeTiling.SetShape(M, N, K); cubeTiling.EnableBias(isBias); cubeTiling.SetBufferSpace(-1, -1, -1); + matmul_tiling::MatmulConfigParams matmulConfigParams(1, false, matmul_tiling::ScheduleType::N_BUFFER_33, + matmul_tiling::MatrixTraverse::NOSET, false); + cubeTiling.SetMatmulConfigParams(matmulConfigParams); if (cubeTiling.GetTiling(tilingData) == -1) { std::cout << "Generate tiling failed." 
<< std::endl; return {}; } - tilingData.baseM = testCaseParams.baseM; - tilingData.baseN = testCaseParams.baseN; - tilingData.baseK = testCaseParams.baseK; - tilingData.stepM = testCaseParams.stepM; - tilingData.stepN = testCaseParams.stepN; - tilingData.stepKa = testCaseParams.stepKa; - tilingData.stepKb = testCaseParams.stepKb; - tilingData.depthA1 = testCaseParams.depthA1; - tilingData.depthB1 = testCaseParams.depthB1; return tilingData; } diff --git a/examples/matrix/matmul_nbuffer33/op_host/matmul_nbuffer33_custom_tiling.h b/examples/matrix/matmul_nbuffer33/op_host/matmul_nbuffer33_custom_tiling.h index 2a2c1c7e..449fbf1e 100644 --- a/examples/matrix/matmul_nbuffer33/op_host/matmul_nbuffer33_custom_tiling.h +++ b/examples/matrix/matmul_nbuffer33/op_host/matmul_nbuffer33_custom_tiling.h @@ -24,15 +24,6 @@ struct MatmulCaseParams int32_t isBias; bool isATrans; bool isBTrans; - int32_t stepM; - int32_t stepN; - int32_t stepKa; - int32_t stepKb; - int32_t depthA1; - int32_t depthB1; - int32_t baseM; - int32_t baseN; - int32_t baseK; }; /** diff --git a/examples/matrix/matmul_nbuffer33/run.sh b/examples/matrix/matmul_nbuffer33/run.sh index f80b3277..5e271e9d 100644 --- a/examples/matrix/matmul_nbuffer33/run.sh +++ b/examples/matrix/matmul_nbuffer33/run.sh @@ -80,7 +80,7 @@ mkdir -p bin cd bin cp ../build/ascendc_matmul_nbuffer33_bbit ./ -export TF_CPP_MIN_LOG_LEVEL=3s +export TF_CPP_MIN_LOG_LEVEL=3 if [ "${RUN_MODE}" = "npu" ]; then if [ "${IS_PERF}" = "1" ]; then diff --git a/examples/matrix/matmul_nbuffer33/scripts/exec_test.py b/examples/matrix/matmul_nbuffer33/scripts/exec_test.py index dfba0c12..ce5cc63d 100644 --- a/examples/matrix/matmul_nbuffer33/scripts/exec_test.py +++ b/examples/matrix/matmul_nbuffer33/scripts/exec_test.py @@ -14,13 +14,15 @@ import sys import csv import time import logging +import subprocess import numpy as np sys.path.append("../..") from common_scripts.gen_data import MatmulGenData from common_scripts.compare_data import compare_data 
-from common_scripts.exec_utils import get_file_work_dir, get_case_list, get_perf_task_duration, clear_file_cache +from common_scripts.exec_utils import get_file_work_dir, get_case_list, get_perf_task_duration, clear_file_cache,\ + get_process_case_cmd IS_BIAS = False IS_TRANS_A = False @@ -31,8 +33,7 @@ logging.basicConfig(level=logging.INFO) class ProcessParams: - def __init__(self, case_name, m, n, k, b, stepm, stepn, stepka, stepkb, deptha1, depthb1, basem, basen, basek,\ - is_perf, run_mode): + def __init__(self, case_name, m, n, k, b, is_perf, run_mode): self.case_name = case_name self.m = m self.n = n @@ -40,42 +41,24 @@ class ProcessParams: self.b = b self.is_perf = is_perf self.run_mode = run_mode - self.stepm = stepm - self.stepn = stepn - self.stepka = stepka - self.stepkb = stepkb - self.deptha1 = deptha1 - self.depthb1 = depthb1 - self.basem = basem - self.basen = basen - self.basek = basek def process_case(file_work_dir, process_params): case_name, m, n, k, b, is_perf, run_mode = process_params.case_name, process_params.m, process_params.n,\ process_params.k, process_params.b, process_params.is_perf, process_params.run_mode - stepm, stepn, stepka, stepkb, deptha1, depthb1, basem, basen, basek = process_params.stepm, process_params.stepn,\ - process_params.stepka, process_params.stepkb, process_params.deptha1, process_params.depthb1,\ - process_params.basem, process_params.basen, process_params.basek logging.info("[INFO] start process case [%s]" % (case_name)) logging.info("[INFO] IS_PERF is set [%d]" % (is_perf)) clear_file_cache(file_work_dir) matmul_gen_data = MatmulGenData(m, n, k, b, IS_TRANS_A, IS_TRANS_B, IS_BIAS, DATA_TYPE_STR) - if is_perf: # npu(is_perf = 1) + if is_perf: matmul_gen_data.gen_fake_golden_data(file_work_dir) - os.system("msprof op --application=\"./ascendc_matmul_nbuffer33_bbit %s %s %s %s %s %s %s %s %s %s %s %s %s\"\ - --output=\"./prof_out\"" % (m, n, k, stepm, stepn, stepka, stepkb, deptha1, depthb1, basem, basen, basek,\ - 
b)) - elif run_mode == "sim": # sim - matmul_gen_data.gen_golden_data(file_work_dir) - os.system("msprof op simulator --application=\"./ascendc_matmul_nbuffer33_bbit %s %s %s %s %s %s %s %s %s %s %s\ - %s %s\" --output=\"./sim_out\"" % (m, n, k, stepm, stepn, stepka, stepkb, deptha1, depthb1, basem, basen,\ - basek, b)) - else: # cpu or npu(is_perf = 0) + else: matmul_gen_data.gen_golden_data(file_work_dir) - os.system("./ascendc_matmul_nbuffer33_bbit %s %s %s %s %s %s %s %s %s %s %s %s %s" % (m, n, k,\ - stepm, stepn, stepka, stepkb, deptha1, depthb1, basem, basen, basek, b)) + params_str = f"{m} {n} {k} {b}" + cmd = get_process_case_cmd(kernel_name="ascendc_matmul_nbuffer33_bbit", params_str=params_str,\ + is_perf=is_perf, run_mode=run_mode) + subprocess.run(cmd) if is_perf: wrong_num = -1 else: @@ -97,45 +80,6 @@ def process_case(file_work_dir, process_params): return res_data -def get_case_list(): - current_path = os.getcwd() - case_dir = os.path.join(os.path.dirname(current_path), "testcase") - if not os.path.exists(case_dir): - logging.info("[ERROR] file path %s not exist!" 
% (case_dir)) - return None - - case_list = [] - for file_name in os.listdir(case_dir): - if not file_name.endswith(".csv"): - continue - - abs_file_name = os.path.join(case_dir, file_name) - with open(abs_file_name, mode='r', encoding='utf-8') as file: - csv_reader = csv.reader(file) - for row in csv_reader: - item_list = [] - item_list.append(int(row[0].lstrip("\ufeff"))) - item_list.append(row[1]) - item_list.append(int(row[2])) - item_list.append(int(row[3])) - item_list.append(int(row[4])) - item_list.append(int(row[5])) - item_list.append(int(row[6])) - item_list.append(int(row[7])) - item_list.append(int(row[8])) - item_list.append(int(row[9])) - item_list.append(int(row[10])) - item_list.append(int(row[11])) - item_list.append(int(row[12])) - item_list.append(int(row[13])) - if len(row) > 14: - item_list.append(int(row[14])) - else: - item_list.append(1) - case_list.append(item_list) - return case_list - - def main(): args_len = len(sys.argv) - 1 if args_len != 2: @@ -154,11 +98,9 @@ def main(): case_list = get_case_list() res_list = [["case_name", "wrong_num", "total_num", "result", "task_duration"]] run_mode = sys.argv[1] - for is_process, case_name, m, n, k, stepm, stepn, stepka, stepkb, deptha1, depthb1,\ - basem, basen, basek, b in case_list: + for is_process, case_name, m, n, k, b in case_list: if is_process == 1: - process_params = ProcessParams(case_name, m, n, k, b, stepm, stepn, stepka, stepkb,\ - deptha1, depthb1, basem, basen, basek, is_perf, run_mode) + process_params = ProcessParams(case_name, m, n, k, b, is_perf, run_mode) res_data = process_case(file_work_dir, process_params) res_list.append(res_data) diff --git a/examples/matrix/matmul_nbuffer33/testcase/case.csv b/examples/matrix/matmul_nbuffer33/testcase/case.csv index 0bc22a85..0a942cd1 100644 --- a/examples/matrix/matmul_nbuffer33/testcase/case.csv +++ b/examples/matrix/matmul_nbuffer33/testcase/case.csv @@ -1 +1 @@ -1, case001, 384, 1024, 384, 3, 1, 3, 3, 9, 6, 128, 128, 128 \ No 
newline at end of file +1, case001, 256, 512, 192 \ No newline at end of file diff --git a/examples/matrix/matmul_nz/README.md b/examples/matrix/matmul_nz/README.md index a17c4ef6..2ee8e62e 100644 --- a/examples/matrix/matmul_nz/README.md +++ b/examples/matrix/matmul_nz/README.md @@ -92,7 +92,7 @@ MatmulNzCustom算子首先使用DataCopyPad接口将非对齐的矩阵数据搬 ``` - 算子Tiling实现 - - Ascend C提供一组Matmul Tiling API,方便用户获取MatMul kernel计算时所需的Tiling参数。只需要传入A/B/C矩阵等信息,调用API接口,即可获取到TCubeTiling结构体中的相关参数。 + - Ascend C提供一组Matmul Tiling API,方便用户获取Matmul kernel计算时所需的Tiling参数。只需要传入A/B/C矩阵等信息,调用API接口,即可获取到TCubeTiling结构体中的相关参数。 - 获取Tiling参数的流程如下: - 创建一个Tiling对象。 - 设置A、B、C、Bias的参数类型信息;M、N、Ka、Kb形状信息等。 diff --git a/examples/matrix/matmul_nz/scripts/exec_test.py b/examples/matrix/matmul_nz/scripts/exec_test.py index 3065da6e..b10ebda5 100644 --- a/examples/matrix/matmul_nz/scripts/exec_test.py +++ b/examples/matrix/matmul_nz/scripts/exec_test.py @@ -14,13 +14,15 @@ import sys import csv import time import logging +import subprocess import numpy as np sys.path.append("../..") from common_scripts.gen_data import MatmulGenData from common_scripts.compare_data import compare_data -from common_scripts.exec_utils import get_file_work_dir, get_case_list, get_perf_task_duration, clear_file_cache +from common_scripts.exec_utils import get_file_work_dir, get_case_list, get_perf_task_duration, clear_file_cache,\ + get_process_case_cmd IS_BIAS = False IS_TRANS_A = False @@ -52,17 +54,14 @@ def process_case(file_work_dir, process_params): clear_file_cache(file_work_dir) matmul_gen_data = MatmulGenData(m, n, k, b, IS_TRANS_A, IS_TRANS_B, IS_BIAS, DATA_TYPE_STR, A_FORMAT, B_FORMAT) - if is_perf: # npu(is_perf = 1) + if is_perf: matmul_gen_data.gen_fake_golden_data(file_work_dir) - os.system("msprof op --application=\"./ascendc_matmul_bbit %s %s %s %s\" --output=\"./prof_out\"" % - (m, n, k, b)) - elif run_mode == "sim": # sim - matmul_gen_data.gen_golden_data(file_work_dir) - os.system("msprof op simulator 
--application=\"./ascendc_matmul_bbit %s %s %s %s\"\ - --output=\"./sim_out\"" % (m, n, k, b)) - else: # cpu or npu(is_perf = 0) + else: matmul_gen_data.gen_golden_data(file_work_dir) - os.system("./ascendc_matmul_bbit %s %s %s %s" % (m, n, k, b)) + params_str = f"{m} {n} {k} {b}" + cmd = get_process_case_cmd(kernel_name="ascendc_matmul_bbit", params_str=params_str,\ + is_perf=is_perf, run_mode=run_mode) + subprocess.run(cmd) if is_perf: wrong_num = -1 else: diff --git a/examples/matrix/matmul_partial_output/README.md b/examples/matrix/matmul_partial_output/README.md index 38009236..94254930 100644 --- a/examples/matrix/matmul_partial_output/README.md +++ b/examples/matrix/matmul_partial_output/README.md @@ -1,7 +1,7 @@ ## 概述 -本样例介绍了调用Matmul高阶API实现开启Partial Output功能的单算子。Partial Output功能的应用场景为矩阵乘结果不需要累加,只需要输出baseM\*baseK和baseK\*baseN的计算结果baseM\*baseN。 +本样例介绍了调用Matmul高阶API实现开启Partial Output功能的单算子。Partial Output功能的应用场景为矩阵乘结果不需要累加,只需要输出baseM\*baseK和baseK\*baseN的计算结果baseM\*baseN。关于Partial Output功能的具体内容请参考《[Ascend C算子开发](https://www.hiascend.com/document/redirect/CannCommunityOpdevAscendC)》中的“算子实现 > 矩阵编程(高阶API) > 特性场景 > 单次矩阵乘局部输出”章节。 本样例以直调的方式调用算子核函数。 直调:核函数的基础调用方式,开发者完成算子核函数的开发和Tiling实现后,即可通过AscendCL运行时接口,完成算子的调用。包含CPU侧、NPU侧和仿真三种运行验证方法。 @@ -73,7 +73,7 @@ - 结束矩阵乘操作。 - 算子Tiling实现 - - Ascend C提供一组Matmul Tiling API,方便用户获取MatMul kernel计算时所需的Tiling参数。只需要传入A/B/C矩阵等信息,调用API接口,即可获取到TCubeTiling结构体中的相关参数。 + - Ascend C提供一组Matmul Tiling API,方便用户获取Matmul kernel计算时所需的Tiling参数。只需要传入A/B/C矩阵等信息,调用API接口,即可获取到TCubeTiling结构体中的相关参数。 - 获取Tiling参数的流程如下: - 创建一个Tiling对象。 - 设置A、B、C、Bias的参数类型信息;M、N、Ka、Kb形状信息等。 @@ -103,4 +103,4 @@ 示例如下,Ascendxxxyy请替换为实际的AI处理器型号。 ``` bash run.sh -r cpu -v Ascendxxxyy -p 0 - ``` \ No newline at end of file + ``` diff --git a/examples/matrix/matmul_partial_output/op_kernel/matmul_partial_output_custom_kernel.cpp b/examples/matrix/matmul_partial_output/op_kernel/matmul_partial_output_custom_kernel.cpp index 7c48a8c9..89128d14 100644 --- 
a/examples/matrix/matmul_partial_output/op_kernel/matmul_partial_output_custom_kernel.cpp +++ b/examples/matrix/matmul_partial_output/op_kernel/matmul_partial_output_custom_kernel.cpp @@ -103,6 +103,7 @@ __aicore__ inline void MatmulKernel::Process(Asce Clear(bufferC); } bufferC = bufferC + bufferTmp; + AscendC::PipeBarrier(); outputOffset += tiling.baseM * tiling.baseN; } DataCopy(cGlobal[i * tiling.baseM * tiling.baseN], bufferC, tiling.baseM * tiling.baseN); diff --git a/examples/matrix/matmul_partial_output/scripts/exec_test.py b/examples/matrix/matmul_partial_output/scripts/exec_test.py index 367acae2..98c4338d 100644 --- a/examples/matrix/matmul_partial_output/scripts/exec_test.py +++ b/examples/matrix/matmul_partial_output/scripts/exec_test.py @@ -14,13 +14,15 @@ import sys import csv import time import logging +import subprocess import numpy as np sys.path.append("../..") from common_scripts.gen_data import MatmulGenData from common_scripts.compare_data import compare_data -from common_scripts.exec_utils import get_file_work_dir, get_case_list, get_perf_task_duration, clear_file_cache +from common_scripts.exec_utils import get_file_work_dir, get_case_list, get_perf_task_duration, clear_file_cache,\ + get_process_case_cmd IS_BIAS = False IS_TRANS_A = False @@ -49,17 +51,14 @@ def process_case(file_work_dir, process_params): clear_file_cache(file_work_dir) matmul_gen_data = MatmulGenData(m, n, k, b, IS_TRANS_A, IS_TRANS_B, IS_BIAS, DATA_TYPE_STR) - if is_perf: # npu(is_perf = 1) + if is_perf: matmul_gen_data.gen_fake_golden_data(file_work_dir) - os.system("msprof op --application=\"./ascendc_matmul_partial_output_bbit %s %s %s %s\"\ - --output=\"./prof_out\"" % (m, n, k, b)) - elif run_mode == "sim": # sim - matmul_gen_data.gen_golden_data(file_work_dir) - os.system("msprof op simulator --application=\"./ascendc_matmul_partial_output_bbit %s %s %s %s\"\ - --output=\"./sim_out\"" % (m, n, k, b)) - else: # cpu or npu(is_perf = 0) + else: 
matmul_gen_data.gen_golden_data(file_work_dir) - os.system("./ascendc_matmul_partial_output_bbit %s %s %s %s" % (m, n, k, b)) + params_str = f"{m} {n} {k} {b}" + cmd = get_process_case_cmd(kernel_name="ascendc_matmul_partial_output_bbit", params_str=params_str,\ + is_perf=is_perf, run_mode=run_mode) + subprocess.run(cmd) if is_perf: wrong_num = -1 else: diff --git a/examples/matrix/matmul_preload/README.md b/examples/matrix/matmul_preload/README.md index c85a78a1..fec924ee 100644 --- a/examples/matrix/matmul_preload/README.md +++ b/examples/matrix/matmul_preload/README.md @@ -67,7 +67,7 @@ matmul单算子,对输入的A、B矩阵做矩阵乘和加bias偏置。在MTE2 - 结束矩阵乘操作。 - tiling实现 - - Ascend C提供一组Matmul Tiling API,方便用户获取MatMul kernel计算时所需的Tiling参数。只需要传入A/B/C矩阵等信息,调用API接口,即可获取到TCubeTiling结构体中的相关参数。 + - Ascend C提供一组Matmul Tiling API,方便用户获取Matmul kernel计算时所需的Tiling参数。只需要传入A/B/C矩阵等信息,调用API接口,即可获取到TCubeTiling结构体中的相关参数。 - 获取Tiling参数的流程如下: - 创建一个Tiling对象。 - 设置A、B、C、Bias的参数类型信息;M、N、Ka、Kb形状信息等。 diff --git a/examples/matrix/matmul_preload/main.cpp b/examples/matrix/matmul_preload/main.cpp index 9eeee4ff..3b4a0c4d 100644 --- a/examples/matrix/matmul_preload/main.cpp +++ b/examples/matrix/matmul_preload/main.cpp @@ -40,8 +40,11 @@ static size_t GetSysWorkSpaceSize() // CPU debug mode #ifdef ASCENDC_CPU_DEBUG -void TestMatmulCpu(int64_t m, int64_t n, int64_t k) +void TestMatmulCpu(const MatmulHost::MatmulCaseParams &testCaseParams) { + int64_t m = testCaseParams.m; + int64_t n = testCaseParams.n; + int64_t k = testCaseParams.k; size_t x1FileSize = m * k * sizeof(uint16_t); // uint16_t represent half size_t x2FileSize = k * n * sizeof(uint16_t); // uint16_t represent half size_t yFileSize = m * n * sizeof(float); @@ -62,9 +65,6 @@ void TestMatmulCpu(int64_t m, int64_t n, int64_t k) } size_t tilingFileSize = sizeof(TCubeTiling); uint8_t* tiling = (uint8_t*)AscendC::GmAlloc(tilingFileSize); - auto ascendcPlatform = platform_ascendc::PlatformAscendCManager::GetInstance(); - MatmulHost::MatmulCaseParams 
testCaseParams{static_cast(ascendcPlatform->GetCoreNumAic()), - static_cast(m), static_cast(n), static_cast(k), IS_BIAS, IS_A_TRANS, IS_B_TRANS}; // Calculate Tiling const auto& tilingData = MatmulHost::GenerateTiling(testCaseParams); memcpy_s(tiling, tilingFileSize, &tilingData, tilingFileSize); @@ -81,7 +81,7 @@ void TestMatmulCpu(int64_t m, int64_t n, int64_t k) } // NPU #else -void MatmulOp(uint8_t* x1, uint8_t* x2, uint8_t* y, uint8_t* bias, int64_t m, int64_t n, int64_t k, +void MatmulOp(uint8_t* x1, uint8_t* x2, uint8_t* y, uint8_t* bias, MatmulHost::MatmulCaseParams testCaseParams, void* stream = nullptr) { // Init args @@ -97,9 +97,6 @@ void MatmulOp(uint8_t* x1, uint8_t* x2, uint8_t* y, uint8_t* bias, int64_t m, in uint8_t* tilingHost; uint8_t* tilingDevice; size_t tilingFileSize = sizeof(TCubeTiling); - auto ascendcPlatform = platform_ascendc::PlatformAscendCManager::GetInstance(); - MatmulHost::MatmulCaseParams testCaseParams{static_cast(ascendcPlatform->GetCoreNumAic()), - static_cast(m), static_cast(n), static_cast(k), IS_BIAS, IS_A_TRANS, IS_B_TRANS}; // Calculate Tiling const auto tilingData = MatmulHost::GenerateTiling(testCaseParams); CHECK_ACL(aclrtMallocHost((void **)(&tilingHost), tilingFileSize)); @@ -129,12 +126,12 @@ void TestAclDeInit(aclrtContext& context, aclrtStream& stream, int64_t& deviceId CHECK_ACL(aclFinalize()); } -void TestMatmul(int64_t m, int64_t n, int64_t k) +void TestMatmul(const MatmulHost::MatmulCaseParams &testCaseParams) { - size_t x1FileSize = static_cast(m * k) * sizeof(uint16_t); // uint16_t represent half - size_t x2FileSize = static_cast(k * n) * sizeof(uint16_t); // uint16_t represent half - size_t yFileSize = static_cast(m * n) * sizeof(float); - size_t biasFileSize = static_cast(1 * n) * sizeof(float); + size_t x1FileSize = static_cast(testCaseParams.m * testCaseParams.k) * sizeof(uint16_t); // uint16_t represent half + size_t x2FileSize = static_cast(testCaseParams.k * testCaseParams.n) * sizeof(uint16_t); // 
uint16_t represent half + size_t yFileSize = static_cast(testCaseParams.m * testCaseParams.n) * sizeof(float); + size_t biasFileSize = static_cast(1 * testCaseParams.n) * sizeof(float); aclrtContext context; aclrtStream stream = nullptr; @@ -168,7 +165,7 @@ void TestMatmul(int64_t m, int64_t n, int64_t k) CHECK_ACL(aclrtMallocHost((void **)(&yHost), yFileSize)); CHECK_ACL(aclrtMalloc((void **)&yDevice, yFileSize, ACL_MEM_MALLOC_HUGE_FIRST)); - MatmulOp(x1Device, x2Device, yDevice, biasDevice, m, n, k, stream); + MatmulOp(x1Device, x2Device, yDevice, biasDevice, testCaseParams, stream); CHECK_ACL(aclrtSynchronizeStream(stream)); CHECK_ACL(aclrtMemcpy(yHost, yFileSize, yDevice, yFileSize, ACL_MEMCPY_DEVICE_TO_HOST)); @@ -191,16 +188,23 @@ void TestMatmul(int64_t m, int64_t n, int64_t k) int32_t main(int32_t argc, const char* args[]) { - int64_t problem[3] = {1, 1, 1}; - for (int32_t i = 1; i < argc && i < 4; ++i) { // 4 + int64_t problem[12] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; + int32_t size = 13; + for (int32_t i = 1; i < argc && i < size; ++i) { std::stringstream ss(args[i]); ss >> problem[i - 1]; } auto ascendcPlatform = platform_ascendc::PlatformAscendCManager::GetInstance(SOC_VERSION); + MatmulHost::MatmulCaseParams testCaseParams{static_cast(ascendcPlatform->GetCoreNumAic()), + static_cast(problem[0]), static_cast(problem[1]), static_cast(problem[2]), + IS_BIAS, IS_A_TRANS, IS_B_TRANS, + static_cast(problem[3]), static_cast(problem[4]), static_cast(problem[5]), + static_cast(problem[6]), static_cast(problem[7]), static_cast(problem[8]), + static_cast(problem[9]), static_cast(problem[10]), static_cast(problem[11])}; #ifdef ASCENDC_CPU_DEBUG - MatmulHost::TestMatmulCpu(problem[0], problem[1], problem[2]); // 2 means problem shape k + MatmulHost::TestMatmulCpu(testCaseParams); // 2 means problem shape k #else - MatmulHost::TestMatmul(problem[0], problem[1], problem[2]); // 2 means problem shape k + MatmulHost::TestMatmul(testCaseParams); // 2 means problem 
shape k #endif return 0; } \ No newline at end of file diff --git a/examples/matrix/matmul_preload/op_host/matmul_preload_custom_tiling.cpp b/examples/matrix/matmul_preload/op_host/matmul_preload_custom_tiling.cpp index ffe6bb30..f10b125e 100644 --- a/examples/matrix/matmul_preload/op_host/matmul_preload_custom_tiling.cpp +++ b/examples/matrix/matmul_preload/op_host/matmul_preload_custom_tiling.cpp @@ -39,10 +39,25 @@ TCubeTiling GenerateTiling(const MatmulCaseParams& testCaseParams) tiling.SetShape(M, N, K); tiling.EnableBias(isBias); tiling.SetBufferSpace(-1, -1, -1); +#if defined(CUSTOM_PRELOAD_N) + tiling.SetTraverse(matmul_tiling::MatrixTraverse::FIRSTN); +#else + tiling.SetTraverse(matmul_tiling::MatrixTraverse::FIRSTM); +#endif + if (tiling.GetTiling(tilingData) == -1) { std::cout << "Generate tiling failed." << std::endl; return {}; } + tilingData.baseM = testCaseParams.baseM; + tilingData.baseN = testCaseParams.baseN; + tilingData.baseK = testCaseParams.baseK; + tilingData.stepM = testCaseParams.stepM; + tilingData.stepN = testCaseParams.stepN; + tilingData.stepKa = testCaseParams.stepKa; + tilingData.stepKb = testCaseParams.stepKb; + tilingData.depthA1 = testCaseParams.depthA1; + tilingData.depthB1 = testCaseParams.depthB1; return tilingData; } diff --git a/examples/matrix/matmul_preload/op_host/matmul_preload_custom_tiling.h b/examples/matrix/matmul_preload/op_host/matmul_preload_custom_tiling.h index bde9c15b..eed22057 100644 --- a/examples/matrix/matmul_preload/op_host/matmul_preload_custom_tiling.h +++ b/examples/matrix/matmul_preload/op_host/matmul_preload_custom_tiling.h @@ -24,6 +24,15 @@ struct MatmulCaseParams bool isBias; bool isATrans; bool isBTrans; + int32_t stepM; + int32_t stepN; + int32_t stepKa; + int32_t stepKb; + int32_t depthA1; + int32_t depthB1; + int32_t baseM; + int32_t baseN; + int32_t baseK; }; /** diff --git a/examples/matrix/matmul_preload/scripts/exec_test.py b/examples/matrix/matmul_preload/scripts/exec_test.py index 
6f38ff9f..2ae3674c 100644 --- a/examples/matrix/matmul_preload/scripts/exec_test.py +++ b/examples/matrix/matmul_preload/scripts/exec_test.py @@ -14,13 +14,14 @@ import sys import csv import time import logging +import subprocess import numpy as np sys.path.append("../..") from common_scripts.gen_data import MatmulGenData from common_scripts.compare_data import compare_data -from common_scripts.exec_utils import get_file_work_dir, get_perf_task_duration, clear_file_cache +from common_scripts.exec_utils import get_file_work_dir, get_perf_task_duration, clear_file_cache, get_process_case_cmd IS_BIAS = False IS_TRANS_A = False @@ -31,7 +32,8 @@ logging.basicConfig(level=logging.INFO) class ProcessParams: - def __init__(self, case_name, m, n, k, b, is_perf, run_mode): + def __init__(self, case_name, m, n, k, b, stepm, stepn, stepka, stepkb, deptha1, depthb1, basem, basen, basek,\ + is_perf, run_mode): self.case_name = case_name self.m = m self.n = n @@ -39,27 +41,36 @@ class ProcessParams: self.b = b self.is_perf = is_perf self.run_mode = run_mode + self.stepm = stepm + self.stepn = stepn + self.stepka = stepka + self.stepkb = stepkb + self.deptha1 = deptha1 + self.depthb1 = depthb1 + self.basem = basem + self.basen = basen + self.basek = basek def process_case(file_work_dir, process_params): case_name, m, n, k, b, is_perf, run_mode = process_params.case_name, process_params.m, process_params.n,\ process_params.k, process_params.b, process_params.is_perf, process_params.run_mode + stepm, stepn, stepka, stepkb, deptha1, depthb1, basem, basen, basek = process_params.stepm, process_params.stepn,\ + process_params.stepka, process_params.stepkb, process_params.deptha1, process_params.depthb1,\ + process_params.basem, process_params.basen, process_params.basek logging.info("[INFO] start process case [%s]" % (case_name)) logging.info("[INFO] IS_PERF is set [%d]" % (is_perf)) clear_file_cache(file_work_dir) matmul_gen_data = MatmulGenData(m, n, k, b, IS_TRANS_A, IS_TRANS_B, 
IS_BIAS, DATA_TYPE_STR) - if is_perf: # npu(is_perf = 1) + if is_perf: matmul_gen_data.gen_fake_golden_data(file_work_dir) - os.system("msprof op --application=\"./ascendc_matmul_preload_bbit %s %s %s %s\"\ - --output=\"./prof_out\"" % (m, n, k, b)) - elif run_mode == "sim": # sim(is_perf = 0) - matmul_gen_data.gen_golden_data(file_work_dir) - os.system("msprof op simulator --application=\"./ascendc_matmul_preload_bbit %s %s %s %s\"\ - --output=\"./sim_out\"" % (m, n, k, b)) - else: # cpu or npu(is_perf = 0) + else: matmul_gen_data.gen_golden_data(file_work_dir) - os.system("./ascendc_matmul_preload_bbit %s %s %s %s" % (m, n, k, b)) + params_str = f"{m} {n} {k} {stepm} {stepn} {stepka} {stepkb} {deptha1} {depthb1} {basem} {basen} {basek} {b}" + cmd = get_process_case_cmd(kernel_name="ascendc_matmul_preload_bbit", params_str=params_str,\ + is_perf=is_perf, run_mode=run_mode) + subprocess.run(cmd) if is_perf: wrong_num = -1 else: @@ -100,12 +111,11 @@ def get_case_list(): item_list = [] item_list.append(int(row[0].lstrip("\ufeff"))) item_list.append(row[1]) - item_list.append(int(row[2])) - item_list.append(int(row[3])) - item_list.append(int(row[4])) - item_list.append(row[5].lstrip()) - if len(row) > 6: - item_list.append(int(row[6])) + for idx in range(2, 14): + item_list.append(int(row[idx])) + item_list.append(row[14].lstrip()) + if len(row) > 15: + item_list.append(int(row[15])) else: item_list.append(1) case_list.append(item_list) @@ -132,11 +142,13 @@ def main(): case_list = get_case_list() res_list = [["case_name", "wrong_num", "total_num", "result", "task_duration"]] run_mode = sys.argv[1] - for is_process, case_name, m, n, k, case_mode, b in case_list: + for is_process, case_name, m, n, k, stepm, stepn, stepka, stepkb, deptha1, depthb1,\ + basem, basen, basek, case_mode, b in case_list: is_run = is_process == 1 and\ ((preload_mode == "M" and case_mode == "M") or (preload_mode == "N" and case_mode == "N")) if is_run: - process_params = 
ProcessParams(case_name, m, n, k, b, is_perf, run_mode) + process_params = ProcessParams(case_name, m, n, k, b, stepm, stepn, stepka, stepkb,\ + deptha1, depthb1, basem, basen, basek, is_perf, run_mode) res_data = process_case(file_work_dir, process_params) res_list.append(res_data) diff --git a/examples/matrix/matmul_preload/testcase/case.csv b/examples/matrix/matmul_preload/testcase/case.csv index f4d8f554..218ae898 100644 --- a/examples/matrix/matmul_preload/testcase/case.csv +++ b/examples/matrix/matmul_preload/testcase/case.csv @@ -1,2 +1,2 @@ -1, case001, 12288, 256, 128, M -1, case002, 134, 32112, 128, N \ No newline at end of file +1, case001, 12288, 256, 128, 5, 1, 2, 2, 10, 2, 128, 256, 64, M +1, case002, 128, 24576, 512, 1, 1, 8, 8, 8, 16, 128, 128, 64, N \ No newline at end of file diff --git a/examples/matrix/matmul_quant/CMakeLists.txt b/examples/matrix/matmul_quant/CMakeLists.txt new file mode 100644 index 00000000..68890e5e --- /dev/null +++ b/examples/matrix/matmul_quant/CMakeLists.txt @@ -0,0 +1,73 @@ +# Copyright (c) 2025 Huawei Technologies Co., Ltd. +# This file is a part of the CANN Open Software. +# Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. 
+# ====================================================================================================================== + +cmake_minimum_required(VERSION 3.16) +project(Ascend_c) +if (${RUN_MODE}) + set(RUN_MODE "npu" CACHE STRING "cpu/sim/npu") +endif() +if (${SOC_VERSION}) + set(SOC_VERSION "Ascend910" CACHE STRING "system on chip type") +endif() + +set(ASCEND_CANN_PACKAGE_PATH "~/Ascend/ascend-toolkit/latest" CACHE STRING "ASCEND CANN package installation directory") +if(NOT CMAKE_BUILD_TYPE) + set(CMAKE_BUILD_TYPE "Debug" CACHE STRING "Build type Release/Debug (default Debug)" FORCE) +endif() + +if(CMAKE_INSTALL_PREFIX STREQUAL /usr/local) + set(CMAKE_INSTALL_PREFIX "${CMAKE_CURRENT_LIST_DIR}/out" CACHE STRING "path for install()" FORCE) +endif() + +file(GLOB KERNEL_FILES + ${CMAKE_CURRENT_SOURCE_DIR}/op_kernel/matmul_quant_custom_kernel.cpp +) + +if("${RUN_MODE}" STREQUAL "cpu") + include(cmake/cpu_lib.cmake) +elseif("${RUN_MODE}" STREQUAL "sim" OR "${RUN_MODE}" STREQUAL "npu") + include(cmake/npu_lib.cmake) +else() + message("invalid RUN_MODE: ${RUN_MODE}") +endif() + +add_executable(ascendc_matmul_quant_bbit + ${CMAKE_CURRENT_SOURCE_DIR}/main.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/op_host/matmul_quant_custom_tiling.cpp +) + +target_compile_options(ascendc_matmul_quant_bbit PRIVATE + $:-g>> + -O2 + -std=c++17 + -D_GLIBCXX_USE_CXX11_ABI=0 +) + +target_compile_definitions(ascendc_matmul_quant_bbit PRIVATE + SOC_VERSION="${SOC_VERSION}" +) + +target_include_directories(ascendc_matmul_quant_bbit PRIVATE + ${CMAKE_CURRENT_SOURCE_DIR} + $:${ASCEND_CANN_PACKAGE_PATH}/include>> + $:${ASCEND_CANN_PACKAGE_PATH}/runtime/include>> +) + +target_link_libraries(ascendc_matmul_quant_bbit PRIVATE + $,$>:host_intf_pub>> + $:tikicpulib::${SOC_VERSION}>> + $:ascendcl>> + $:c_sec>> + ascendc_kernels_${RUN_MODE} + tiling_api + register + platform + ascendalog + dl +) diff --git a/examples/matrix/matmul_quant/README.md b/examples/matrix/matmul_quant/README.md new file mode 100644 
index 00000000..7b6479ad --- /dev/null +++ b/examples/matrix/matmul_quant/README.md @@ -0,0 +1,120 @@ + +## 概述 + +本样例使用Matmul API实现了int8类型输入、half类型输出的Matmul反量化场景的算子,支持同一系数的反量化模式和向量的反量化模式。关于Matmul量化/反量化的具体内容请参考《[Ascend C算子开发](https://www.hiascend.com/document/redirect/CannCommunityOpdevAscendC)》中的“算子实现 > 矩阵编程(高阶API) > 特性场景 > 矩阵乘输出的量化/反量化”章节。 + +本样例以直调的方式调用算子核函数。 +直调:核函数的基础调用方式,开发者完成算子核函数的开发和Tiling实现后,即可通过AscendCL运行时接口,完成算子的调用。包含CPU侧、NPU侧和仿真三种运行验证方法。 + +## 样例支持的产品型号为 +- Atlas A2训练系列产品/Atlas 800I A2推理产品 + +## 目录结构 +| 目录及文件 | 描述 | +|----------------------------------|----------------------| +| [cmake](cmake) | 编译工程文件 | +| [op_host](op_host) | 本样例的tiling代码实现 | +| [op_kernel](op_kernel) | 本样例的kernel代码实现 | +| [scripts](scripts) | 包含输入输出真值数据生成脚本文件和执行文件 | +| [testcase](testcase) | 用例文件,配置用例的计算shape信息 | +| CMakeLists.txt | 编译工程文件 | +| main.cpp | 主函数,调用算子的应用程序,含CPU域及NPU域调用 | +| run.sh | 编译执行脚本 | + +## 算子描述 +- 算子功能 + MatmulQuantCustom单算子调用Matmul高阶API实现Matmul计算时int8_t类型输入,计算结果以half类型反量化输出,同时支持同一系数的反量化模式与向量的反量化模式。该场景下将C矩阵数据从CO1搬出到Global Memory时,会执行反量化操作,对输出矩阵的所有值采用同一系数或向量进行反量化。 + +- 算子规格 + + + + + + + + + + + + + +
算子类型(OpType)MatmulQuantCustom
算子输入nameshapedata typeformatisTrans
a-int8_tNDfalse
b-int8_tNDtrue
bias-int32_tND-
quantVector-uint64_t--
算子输出c-halfND-
核函数名matmul_quant_custom
+ +## 算子实现介绍 +- 算子Kernel实现 + - 计算逻辑:C = A * B + Bias。 + - A、B为源操作数,A为左矩阵,形状为[M, K];B为右矩阵,形状为[N, K]。 + - C为目的操作数,存放矩阵乘结果的矩阵,形状为[M, N]。 + - Bias为矩阵乘偏置,形状为[1, N]。对A*B结果矩阵的每一行都采用该Bias进行偏置。 + - 具体步骤: + - 创建Matmul对象。 + - Matmul初始化。 + 基于testcase目录下case.csv的配置,初始化Matmul的反量化模式。若测试用例配置反量化模式值为1,则算子的输入quantVector为空,初始化Matmul的反量化模式quantMode为同一系数的反量化模式。反之,若测试用例配置反量化模式值为2,则算子的输入quantVector非空,初始化Matmul的反量化模式quantMode为向量反量化模式。 + ``` + MatmulQuantCustom::MatmulKernel matmulKernel; + ... ... + if (quantVector) { + matmulKernel.Init(a, b, bias, quantVector, c, tiling, false, true, 2); // 向量反量化模式(PER_CHANNEL) + } else { + matmulKernel.Init(a, b, bias, quantVector, c, tiling, false, true, 1); // 同一系数的反量化模式(PER_TENSOR) + } + ``` + - 设置左矩阵A、右矩阵B、偏置矩阵Bias。 + - 设置反量化参数。 + 根据Matmul初始化的反量化模式quantMode,对应设置反量化参数。 + ``` + if (this->quantMode == 1) { + float tmp = 0.1f; + uint64_t s = static_cast(*reinterpret_cast(&tmp)); // 转换为uint64_t + matmulObj.SetQuantScalar(s); // 同一系数的反量化模式(PER_TENSOR),设置反量化的Scalar系数 + } else if (this->quantMode == 2) { + matmulObj.SetQuantVector(quantGlobal); // 向量反量化模式(PER_CHANNEL),设置反量化的系数向量 + } + ``` + - 完成矩阵乘操作。 + - 结束矩阵乘操作。 + +- 算子Tiling实现 + - Ascend C提供一组Matmul Tiling API,方便用户获取Matmul kernel计算时所需的Tiling参数。只需要传入A/B/C矩阵等信息,调用API接口,即可获取到TCubeTiling结构体中的相关参数。 + - 获取Tiling参数的流程如下: + - 创建一个Tiling对象。 + - 设置Matmul反量化模式。 + ``` + auto ascendcPlatform = platform_ascendc::PlatformAscendCManager::GetInstance(socVersion); + matmul_tiling::MultiCoreMatmulTiling cubeTiling(*ascendcPlatform); + + if (quantMode == 1) { + cubeTiling.SetDequantType(matmul_tiling::DequantType::SCALAR); // 设置同一系数的反量化模式(PER_TENSOR) + } else if (quantMode == 2) { + cubeTiling.SetDequantType(matmul_tiling::DequantType::TENSOR); // 设置向量反量化模式(PER_CHANNEL) + } + ``` + - 设置A、B、C、Bias的参数类型信息;M、N、Ka、Kb形状信息等。 + - 调用GetTiling接口,获取Tiling信息。 + +## 编译运行样例 + + - 配置环境变量 + + 这里的\$ASCEND_CANN_PACKAGE_PATH需要替换为CANN开发套件包安装后文件存储路径。例如:/usr/local/Ascend/ascend-toolkit/latest + ``` + export 
ASCEND_HOME_DIR=$ASCEND_CANN_PACKAGE_PATH + source $ASCEND_HOME_DIR/../set_env.sh + ``` + + - 编译执行 + + ``` + bash run.sh -r [RUN_MODE] -v [SOC_VERSION] -p [IS_PERF] + ``` + 其中脚本参数说明如下: + - RUN_MODE :编译执行方式,可选择CPU调试,NPU仿真,NPU上板,对应参数分别为[cpu / sim / npu]。若需要详细了解NPU仿真相关内容,请参考[《算子开发工具msProf》](https://hiascend.com/document/redirect/CannCommunityToolMsProf)中的“工具使用”章节。 + - SOC_VERSION :昇腾AI处理器型号,如果无法确定具体的[SOC_VERSION],则在安装昇腾AI处理器的服务器执行npu-smi info命令进行查询,在查询到的“Name”前增加Ascend信息,例如“Name”对应取值为xxxyy,实际配置的[SOC_VERSION]值为Ascendxxxyy。支持以下产品型号: + - Atlas A2训练系列产品/Atlas 800I A2推理产品 + - IS_PERF : 是否获取执行性能数据,当前只在NPU执行时生效,打开后性能数据显示在执行结果中,可选择关闭和开启该功能,对应参数分别为[0 / 1]。 + + 示例如下,Ascendxxxyy请替换为实际的AI处理器型号。 + ``` + bash run.sh -r cpu -v Ascendxxxyy -p 0 + ``` \ No newline at end of file diff --git a/examples/matrix/matmul_quant/cmake/cpu_lib.cmake b/examples/matrix/matmul_quant/cmake/cpu_lib.cmake new file mode 100644 index 00000000..487a91e0 --- /dev/null +++ b/examples/matrix/matmul_quant/cmake/cpu_lib.cmake @@ -0,0 +1,31 @@ +# Copyright (c) 2025 Huawei Technologies Co., Ltd. +# This file is a part of the CANN Open Software. +# Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. 
+# ====================================================================================================================== + +if(NOT DEFINED ENV{CMAKE_PREFIX_PATH}) + set(CMAKE_PREFIX_PATH ${ASCEND_CANN_PACKAGE_PATH}/tools/tikicpulib/lib/cmake) +endif() +find_package(tikicpulib REQUIRED) + +add_library(ascendc_kernels_${RUN_MODE} SHARED + ${KERNEL_FILES} +) + +target_link_libraries(ascendc_kernels_${RUN_MODE} PRIVATE + tikicpulib::${SOC_VERSION} +) + +target_compile_options(ascendc_kernels_${RUN_MODE} PRIVATE + -g + -O0 + -std=c++17 +) + +install(TARGETS ascendc_kernels_${RUN_MODE} + DESTINATION ${CMAKE_INSTALL_LIBDIR} +) \ No newline at end of file diff --git a/examples/matrix/matmul_quant/cmake/npu_lib.cmake b/examples/matrix/matmul_quant/cmake/npu_lib.cmake new file mode 100644 index 00000000..bc803099 --- /dev/null +++ b/examples/matrix/matmul_quant/cmake/npu_lib.cmake @@ -0,0 +1,27 @@ +# Copyright (c) 2025 Huawei Technologies Co., Ltd. +# This file is a part of the CANN Open Software. +# Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. 
+# ====================================================================================================================== + +if(EXISTS ${ASCEND_CANN_PACKAGE_PATH}/compiler/tikcpp/ascendc_kernel_cmake) + set(ASCENDC_CMAKE_DIR ${ASCEND_CANN_PACKAGE_PATH}/compiler/tikcpp/ascendc_kernel_cmake) +elseif(EXISTS ${ASCEND_CANN_PACKAGE_PATH}/tools/tikcpp/ascendc_kernel_cmake) + set(ASCENDC_CMAKE_DIR ${ASCEND_CANN_PACKAGE_PATH}/tools/tikcpp/ascendc_kernel_cmake) +else() + message(FATAL_ERROR "ascendc_kernel_cmake does not exist ,please check whether the cann package is installed") +endif() +include(${ASCENDC_CMAKE_DIR}/ascendc.cmake) + +ascendc_library(ascendc_kernels_${RUN_MODE} STATIC + ${KERNEL_FILES} +) + +ascendc_compile_definitions(ascendc_kernels_${RUN_MODE} PRIVATE + -DASCENDC_DUMP + -DHAVE_WORKSPACE + -DHAVE_TILING +) \ No newline at end of file diff --git a/examples/matrix/matmul_quant/main.cpp b/examples/matrix/matmul_quant/main.cpp new file mode 100644 index 00000000..07011c35 --- /dev/null +++ b/examples/matrix/matmul_quant/main.cpp @@ -0,0 +1,258 @@ +/** + * Copyright (c) 2025 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. 
+ */ + +#include +#include "../../common/data_utils.h" +#include "kernel_tiling/kernel_tiling.h" +#include "tiling/platform/platform_ascendc.h" +#include "op_host/matmul_quant_custom_tiling.h" + +#ifndef ASCENDC_CPU_DEBUG +#include "acl/acl.h" + +extern void matmul_quant_custom_do(uint32_t coreDim, void* stream, + uint8_t* a, uint8_t* b, uint8_t* bias, uint8_t* quantVector, uint8_t* c, + uint8_t* workspace, uint8_t* tiling); +#else +#include "tikicpulib.h" +extern "C" void matmul_quant_custom(uint8_t* a, uint8_t* b, uint8_t* bias, uint8_t* quantVector, uint8_t* c, + uint8_t* workspace, uint8_t* tiling); +#endif + +namespace { +constexpr bool IS_BIAS = true; +constexpr bool IS_A_TRANS = false; +constexpr bool IS_B_TRANS = true; +// 1: scalar quant mode, 2: vector quant mode +constexpr int64_t VECTOR_QUANT_MODE = 2; + +struct MatrixFileSize +{ + size_t x1FileSize; + size_t x2FileSize; + size_t yFileSize; + size_t biasFileSize; +}; +} + +namespace MatmulHost { +static size_t GetSysWorkSpaceSize() +{ + auto ascendcPlatform = platform_ascendc::PlatformAscendCManager::GetInstance(); + return static_cast(ascendcPlatform->GetLibApiWorkSpaceSize()); +} + +// CPU debug mode +#ifdef ASCENDC_CPU_DEBUG +void TestMatmulCpu(int64_t m, int64_t n, int64_t k, int64_t quantMode, const MatrixFileSize& matrixFileSize) +{ + size_t x1FileSize = matrixFileSize.x1FileSize; + size_t x2FileSize = matrixFileSize.x2FileSize; + size_t yFileSize = matrixFileSize.yFileSize; + size_t biasFileSize = matrixFileSize.biasFileSize; + // Query workspace size + size_t workspaceSize = GetSysWorkSpaceSize(); + uint8_t* x1 = (uint8_t*)AscendC::GmAlloc(x1FileSize); + uint8_t* x2 = (uint8_t*)AscendC::GmAlloc(x2FileSize); + uint8_t* bias = nullptr; + uint8_t* quantVector = nullptr; + uint8_t* y = (uint8_t*)AscendC::GmAlloc(yFileSize); + uint8_t* workspace = (uint8_t*)AscendC::GmAlloc(workspaceSize); + ReadFile("../input/x1_gm.bin", x1FileSize, x1, x1FileSize); + ReadFile("../input/x2_gm.bin", x2FileSize, x2, 
x2FileSize); + if (IS_BIAS) { + bias = (uint8_t*)AscendC::GmAlloc(biasFileSize); + ReadFile("../input/bias_gm.bin", biasFileSize, bias, biasFileSize); + } + if (quantMode == VECTOR_QUANT_MODE){ + size_t quantVectorSize = 1 * n * sizeof(uint64_t); + quantVector = (uint8_t*)AscendC::GmAlloc(quantVectorSize); + ReadFile("../input/quant_vector_gm.bin", quantVectorSize, quantVector, quantVectorSize); + } + + // Calculate Tiling + size_t tilingFileSize = sizeof(TCubeTiling); + uint8_t* tiling = (uint8_t*)AscendC::GmAlloc(tilingFileSize); + auto ascendcPlatform = platform_ascendc::PlatformAscendCManager::GetInstance(); + MatmulHost::MatmulCaseParams testCaseParams{static_cast(ascendcPlatform->GetCoreNumAic()), + static_cast(m), static_cast(n), static_cast(k), static_cast(quantMode), + IS_BIAS, IS_A_TRANS, IS_B_TRANS}; + const auto& tilingData = MatmulHost::GenerateTiling(testCaseParams); + memcpy_s(tiling, tilingFileSize, &tilingData, tilingFileSize); + + // Kernel Launch + ICPU_RUN_KF(matmul_quant_custom, tilingData.usedCoreNum, x1, x2, bias, quantVector, y, workspace, tiling); + WriteFile("../output/output.bin", y, yFileSize); + AscendC::GmFree((void*)x1); + AscendC::GmFree((void*)x2); + AscendC::GmFree((void*)y); + AscendC::GmFree((void*)workspace); + AscendC::GmFree((void*)tiling); + if (IS_BIAS) { + AscendC::GmFree((void*)bias); + } + if (quantVector) { + AscendC::GmFree((void*)quantVector); + } +} +// NPU +#else +void MatmulOp(uint8_t* x1, uint8_t* x2, uint8_t* y, uint8_t* bias, uint8_t* quantVector, + int64_t m, int64_t n, int64_t k, int64_t quantMode, void* stream = nullptr) +{ + // Init args + uint8_t* workspaceDevice = nullptr; + + // Query workspace size + size_t workspaceSize = GetSysWorkSpaceSize(); + + // Allocate workspace on device + CHECK_ACL(aclrtMalloc((void**)&workspaceDevice, workspaceSize, ACL_MEM_MALLOC_HUGE_FIRST)); + + // Initialize kernel with arguments and workspace pointer + uint8_t* tilingHost = nullptr; + uint8_t* tilingDevice = nullptr; + 
size_t tilingFileSize = sizeof(TCubeTiling); + auto ascendcPlatform = platform_ascendc::PlatformAscendCManager::GetInstance(); + MatmulHost::MatmulCaseParams testCaseParams{static_cast(ascendcPlatform->GetCoreNumAic()), + static_cast(m), static_cast(n), static_cast(k), static_cast(quantMode), + IS_BIAS, IS_A_TRANS, IS_B_TRANS}; + // Calculate Tiling + const auto tilingData = MatmulHost::GenerateTiling(testCaseParams); + CHECK_ACL(aclrtMallocHost((void**)(&tilingHost), tilingFileSize)); + CHECK_ACL(aclrtMalloc((void**)&tilingDevice, tilingFileSize, + ACL_MEM_MALLOC_HUGE_FIRST)); + CHECK_ACL(aclrtMemcpy(tilingHost, tilingFileSize, &tilingData, + tilingFileSize, ACL_MEMCPY_HOST_TO_HOST)); + CHECK_ACL(aclrtMemcpy(tilingDevice, tilingFileSize, tilingHost, + tilingFileSize, ACL_MEMCPY_HOST_TO_DEVICE)); + // Launch kernel + matmul_quant_custom_do(tilingData.usedCoreNum, stream, x1, x2, bias, quantVector, y, workspaceDevice, tilingDevice); + CHECK_ACL(aclrtFreeHost(tilingHost)); + CHECK_ACL(aclrtFree(workspaceDevice)); + CHECK_ACL(aclrtFree(tilingDevice)); +} + +void TestAclInit(aclrtContext& context, aclrtStream& stream, int64_t& deviceId) +{ + CHECK_ACL(aclInit(nullptr)); + CHECK_ACL(aclrtSetDevice(deviceId)); + CHECK_ACL(aclrtCreateContext(&context, deviceId)); + CHECK_ACL(aclrtCreateStream(&stream)); +} + +void TestAclDeInit(aclrtContext& context, aclrtStream& stream, int64_t& deviceId) +{ + CHECK_ACL(aclrtDestroyStream(stream)); + CHECK_ACL(aclrtDestroyContext(context)); + CHECK_ACL(aclrtResetDevice(deviceId)); + CHECK_ACL(aclFinalize()); +} + +void TestMatmul(int64_t m, int64_t n, int64_t k, int64_t quantMode, const MatrixFileSize& matrixFileSize) +{ + size_t x1FileSize = matrixFileSize.x1FileSize; + size_t x2FileSize = matrixFileSize.x2FileSize; + size_t yFileSize = matrixFileSize.yFileSize; + size_t biasFileSize = matrixFileSize.biasFileSize; + size_t quantVectorSize = static_cast(1 * n) * sizeof(uint64_t); + + aclrtContext context; + aclrtStream stream = nullptr; 
+ int64_t deviceId = 0; + TestAclInit(context, stream, deviceId); + + uint8_t* x1Host = nullptr; + uint8_t* x1Device = nullptr; + CHECK_ACL(aclrtMallocHost((void**)(&x1Host), x1FileSize)); + CHECK_ACL(aclrtMalloc((void**)&x1Device, x1FileSize, ACL_MEM_MALLOC_HUGE_FIRST)); + ReadFile("../input/x1_gm.bin", x1FileSize, x1Host, x1FileSize); + CHECK_ACL(aclrtMemcpy(x1Device, x1FileSize, x1Host, x1FileSize, ACL_MEMCPY_HOST_TO_DEVICE)); + + uint8_t* x2Host = nullptr; + uint8_t* x2Device = nullptr; + CHECK_ACL(aclrtMallocHost((void**)(&x2Host), x2FileSize)); + CHECK_ACL(aclrtMalloc((void**)&x2Device, x2FileSize, ACL_MEM_MALLOC_HUGE_FIRST)); + ReadFile("../input/x2_gm.bin", x2FileSize, x2Host, x2FileSize); + CHECK_ACL(aclrtMemcpy(x2Device, x2FileSize, x2Host, x2FileSize, ACL_MEMCPY_HOST_TO_DEVICE)); + + uint8_t* biasHost = nullptr; + uint8_t* biasDevice = nullptr; + if (IS_BIAS) { + CHECK_ACL(aclrtMallocHost((void**)(&biasHost), biasFileSize)); + CHECK_ACL(aclrtMalloc((void**)&biasDevice, biasFileSize, ACL_MEM_MALLOC_HUGE_FIRST)); + ReadFile("../input/bias_gm.bin", biasFileSize, biasHost, biasFileSize); + CHECK_ACL(aclrtMemcpy(biasDevice, biasFileSize, biasHost, biasFileSize, ACL_MEMCPY_HOST_TO_DEVICE)); + } + + uint8_t* quantVectorHost = nullptr; + uint8_t* quantVectorDevice = nullptr; + if (quantMode == VECTOR_QUANT_MODE) { + CHECK_ACL(aclrtMallocHost((void**)(&quantVectorHost), quantVectorSize)); + CHECK_ACL(aclrtMalloc((void**)&quantVectorDevice, quantVectorSize, ACL_MEM_MALLOC_HUGE_FIRST)); + ReadFile("../input/quant_vector_gm.bin", quantVectorSize, quantVectorHost, quantVectorSize); + CHECK_ACL(aclrtMemcpy(quantVectorDevice, quantVectorSize, quantVectorHost, quantVectorSize, ACL_MEMCPY_HOST_TO_DEVICE)); + } + + uint8_t* yHost = nullptr; + uint8_t* yDevice = nullptr; + CHECK_ACL(aclrtMallocHost((void**)(&yHost), yFileSize)); + CHECK_ACL(aclrtMalloc((void**)&yDevice, yFileSize, ACL_MEM_MALLOC_HUGE_FIRST)); + + MatmulOp(x1Device, x2Device, yDevice, biasDevice, 
quantVectorDevice, m, n, k, quantMode, stream); + CHECK_ACL(aclrtSynchronizeStream(stream)); + + CHECK_ACL(aclrtMemcpy(yHost, yFileSize, yDevice, yFileSize, ACL_MEMCPY_DEVICE_TO_HOST)); + WriteFile("../output/output.bin", yHost, yFileSize); + + if (IS_BIAS) { + CHECK_ACL(aclrtFree(biasDevice)); + CHECK_ACL(aclrtFreeHost(biasHost)); + } + if (quantMode == VECTOR_QUANT_MODE) { + CHECK_ACL(aclrtFree(quantVectorDevice)); + CHECK_ACL(aclrtFreeHost(quantVectorHost)); + } + CHECK_ACL(aclrtFree(x1Device)); + CHECK_ACL(aclrtFreeHost(x1Host)); + CHECK_ACL(aclrtFree(x2Device)); + CHECK_ACL(aclrtFreeHost(x2Host)); + CHECK_ACL(aclrtFree(yDevice)); + CHECK_ACL(aclrtFreeHost(yHost)); + TestAclDeInit(context, stream, deviceId); +} +#endif +} // namespace MatmulHost + +int32_t main(int32_t argc, const char* args[]) +{ + int64_t inputParams[argc] = {1}; // M, N, K, QuantMode + for (int32_t i = 1; i < argc; ++i) { + std::stringstream ss(args[i]); + ss >> inputParams[i - 1]; + } + auto ascendcPlatform = platform_ascendc::PlatformAscendCManager::GetInstance(SOC_VERSION); + int64_t M = inputParams[0]; + int64_t N = inputParams[1]; + int64_t K = inputParams[2]; + int64_t quantMode = inputParams[3]; + MatrixFileSize matrixFileSize; + // uint16_t represent half + matrixFileSize.x1FileSize = static_cast(M * K) * sizeof(int8_t); + matrixFileSize.x2FileSize = static_cast(K * N) * sizeof(int8_t); + matrixFileSize.yFileSize = static_cast(M * N) * sizeof(uint16_t); + matrixFileSize.biasFileSize = static_cast(1 * N) * sizeof(int32_t); +#ifdef ASCENDC_CPU_DEBUG + MatmulHost::TestMatmulCpu(M, N, K, quantMode, matrixFileSize); +#else + MatmulHost::TestMatmul(M, N, K, quantMode, matrixFileSize); +#endif + return 0; +} diff --git a/examples/matrix/matmul_quant/op_host/matmul_quant_custom_tiling.cpp b/examples/matrix/matmul_quant/op_host/matmul_quant_custom_tiling.cpp new file mode 100644 index 00000000..6e9294c5 --- /dev/null +++ b/examples/matrix/matmul_quant/op_host/matmul_quant_custom_tiling.cpp @@ 
-0,0 +1,57 @@ +/** + * Copyright (c) 2025 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ + +#include "matmul_quant_custom_tiling.h" +#include + +namespace MatmulHost { + +TCubeTiling GenerateTiling(const MatmulCaseParams& testCaseParams) +{ + TCubeTiling tilingData; + auto ascendcPlatform = platform_ascendc::PlatformAscendCManager::GetInstance(); + matmul_tiling::MultiCoreMatmulTiling cubeTiling(*ascendcPlatform); + uint32_t M = testCaseParams.m; + uint32_t N = testCaseParams.n; + uint32_t K = testCaseParams.k; + uint32_t blockDim = testCaseParams.usedCoreNum; + uint32_t quantMode = testCaseParams.quantMode; + bool isBias = testCaseParams.isBias; + bool isAtrans = testCaseParams.isATrans; + bool isBtrans = testCaseParams.isBTrans; + + cubeTiling.SetDim(blockDim); + cubeTiling.SetAType(matmul_tiling::TPosition::GM, matmul_tiling::CubeFormat::ND, + matmul_tiling::DataType::DT_INT8, isAtrans); + cubeTiling.SetBType(matmul_tiling::TPosition::GM, matmul_tiling::CubeFormat::ND, + matmul_tiling::DataType::DT_INT8, isBtrans); + cubeTiling.SetCType(matmul_tiling::TPosition::GM, matmul_tiling::CubeFormat::ND, + matmul_tiling::DataType::DT_FLOAT16); + cubeTiling.SetBiasType(matmul_tiling::TPosition::GM, matmul_tiling::CubeFormat::ND, + matmul_tiling::DataType::DT_INT32); + + if (quantMode == 1) { + cubeTiling.SetDequantType(matmul_tiling::DequantType::SCALAR); // set PRE_TENSOR quant mode + } else if (quantMode == 2) { + 
cubeTiling.SetDequantType(matmul_tiling::DequantType::TENSOR); // set PRE_CHANNEL quant mode + } + std::cout << "[INFO] Set Quant Mode: " << quantMode << std::endl; + + cubeTiling.SetOrgShape(M, N, K); + cubeTiling.SetShape(M, N, K); + cubeTiling.EnableBias(isBias); + if (cubeTiling.GetTiling(tilingData) == -1) { + std::cout << "Generate tiling failed." << std::endl; + return {}; + } + return tilingData; +} + +} // namespace MatmulHost diff --git a/examples/matrix/matmul_quant/op_host/matmul_quant_custom_tiling.h b/examples/matrix/matmul_quant/op_host/matmul_quant_custom_tiling.h new file mode 100644 index 00000000..a41b2584 --- /dev/null +++ b/examples/matrix/matmul_quant/op_host/matmul_quant_custom_tiling.h @@ -0,0 +1,38 @@ +/** + * Copyright (c) 2025 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ + +#ifndef EXAMPLES_MATRIX_MATMUL_QUANT_OP_HOST_MATMUL_QUANT_CUSTOM_TILING_H +#define EXAMPLES_MATRIX_MATMUL_QUANT_OP_HOST_MATMUL_QUANT_CUSTOM_TILING_H +#include "register/tilingdata_base.h" +#include "tiling/tiling_api.h" + +namespace MatmulHost { + +struct MatmulCaseParams +{ + int32_t usedCoreNum; + int32_t m; + int32_t n; + int32_t k; + int32_t quantMode; + bool isBias; + bool isATrans; + bool isBTrans; +}; + +/** + * @brief Generate matmul tiling. + * @param testCaseParams: Testcase parameters. + * @retval Generated Tiling data. 
+ */ +TCubeTiling GenerateTiling(const MatmulCaseParams& testCaseParams); + +} // namespace MatmulHost +#endif // EXAMPLES_MATRIX_MATMUL_QUANT_OP_HOST_MATMUL_QUANT_CUSTOM_TILING_H diff --git a/examples/matrix/matmul_quant/op_kernel/matmul_quant_custom_kernel.cpp b/examples/matrix/matmul_quant/op_kernel/matmul_quant_custom_kernel.cpp new file mode 100644 index 00000000..8f440983 --- /dev/null +++ b/examples/matrix/matmul_quant/op_kernel/matmul_quant_custom_kernel.cpp @@ -0,0 +1,156 @@ +/** + * Copyright (c) 2025 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ + +#include "kernel_operator.h" +#include "matmul_quant_custom_kernel.h" + +namespace { +/** + * @brief Copy tiling data to TCubeTiling ptr from tiling gm addr. + * @param tiling: TCubeTiling ptr which needs to copy tiling data. + * @param tilingGM: Tiling gm addr. 
+ * @retval None + */ +__aicore__ inline void CopyTiling(TCubeTiling* tiling, GM_ADDR tilingGM) +{ + uint32_t* ptr = reinterpret_cast(tiling); + auto tiling32 = reinterpret_cast<__gm__ uint32_t*>(tilingGM); + + for (int i = 0; i < sizeof(TCubeTiling) / sizeof(uint32_t); ++i, ++ptr) { + *ptr = *(tiling32 + i); + } + return; +} +} + +namespace MatmulQuantCustom { +template +__aicore__ inline void MatmulKernel::Init(GM_ADDR a, GM_ADDR b, GM_ADDR bias, + GM_ADDR quantVector, GM_ADDR c, const TCubeTiling& tiling, bool isTransA, bool isTransB, int64_t quantMode) +{ + this->tiling = tiling; + aGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ AType*>(a), tiling.M * tiling.Ka); + bGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ BType*>(b), tiling.Kb * tiling.N); + cGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ CType*>(c), tiling.M * tiling.N); + biasGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ BiasType*>(bias), tiling.N); + quantGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ uint64_t*>(quantVector), tiling.N); + + int32_t offsetA = 0; + int32_t offsetB = 0; + int32_t offsetC = 0; + int32_t offsetBias = 0; + this->isTransA = isTransA; + this->isTransB = isTransB; + this->quantMode = quantMode; + CalcOffset(AscendC::GetBlockIdx(), offsetA, offsetB, offsetC, offsetBias); + aGlobal = aGlobal[offsetA]; + bGlobal = bGlobal[offsetB]; + cGlobal = cGlobal[offsetC]; + biasGlobal = biasGlobal[offsetBias]; + quantGlobal = quantGlobal[offsetBias]; + + if (GetSysWorkSpacePtr() == nullptr) { + return; + } +} + +template +__aicore__ inline void MatmulKernel::Process() +{ + if (AscendC::GetBlockIdx() >= this->tiling.usedCoreNum) { + return; + } + + if (this->quantMode == 1) { + float tmp = 0.1f; // set quant scale value + uint64_t s = static_cast(*reinterpret_cast(&tmp)); //float transport to uint64_t type + matmulObj.SetQuantScalar(s); // set PRE_TENSOR quant mode + } else if(this->quantMode == 2){ + matmulObj.SetQuantVector(quantGlobal); // set PRE_CHANNEL quant mode + } + + // process 
with tail block + int tailN = tiling.N - nCoreIndex * tiling.singleCoreN; + tailN = tailN < tiling.singleCoreN ? tailN : tiling.singleCoreN; + int tailM = tiling.M - mCoreIndex * tiling.singleCoreM; + tailM = tailM < tiling.singleCoreM ? tailM : tiling.singleCoreM; + if (tailM < tiling.singleCoreM || tailN < tiling.singleCoreN) { + matmulObj.SetTail(tailM, tailN); + } + + matmulObj.SetTensorA(aGlobal, isTransA); + matmulObj.SetTensorB(bGlobal, isTransB); + if (tiling.isBias) { + matmulObj.SetBias(biasGlobal); + } + matmulObj.IterateAll(cGlobal); + matmulObj.End(); +} + +template +__aicore__ inline void MatmulKernel::CalcOffset( + int32_t blockIdx, int32_t& offsetA, int32_t& offsetB, int32_t& offsetC, int32_t& offsetBias) +{ + const TCubeTiling& tiling = this->tiling; + const auto mSingleBlocks = (tiling.M + tiling.singleCoreM - 1) / tiling.singleCoreM; // split M into mSingleBlocks cores + mCoreIndex = blockIdx % mSingleBlocks; + nCoreIndex = blockIdx / mSingleBlocks; + + offsetA = isTransA ? (mCoreIndex * tiling.singleCoreM) : + mCoreIndex * tiling.Ka * tiling.singleCoreM; + + offsetB = isTransB ? (nCoreIndex * tiling.Kb * tiling.singleCoreN) : + nCoreIndex * tiling.singleCoreN; + offsetC = mCoreIndex * tiling.N * tiling.singleCoreM + nCoreIndex * tiling.singleCoreN; + offsetBias = nCoreIndex * tiling.singleCoreN; +} +} // namespace MatmulQuantCustom + +/** + * @brief matmul kernel function. + * @param a: A matrix gm addr. + * @param b: B matrix gm addr. + * @param bias: Bias matrix gm addr. + * @param quantVector: Quant Vector gm addr. + * @param c: C matrix gm addr. + * @param workspace: Temporary gm space addr required by matmul calc. + * @param tilingGm: Tiling data addr. 
+ * @retval None + */ +extern "C" __global__ __aicore__ void matmul_quant_custom( + GM_ADDR a, GM_ADDR b, GM_ADDR bias, GM_ADDR quantVector, GM_ADDR c, GM_ADDR workspace, GM_ADDR tilingGm) +{ + if (g_coreType == AscendC::AIV) { + return; + } + // prepare tiling + TCubeTiling tiling; + CopyTiling(&tiling, tilingGm); + // define matmul kernel + MatmulQuantCustom::MatmulKernel matmulKernel; + AscendC::TPipe pipe; + REGIST_MATMUL_OBJ(&pipe, GetSysWorkSpacePtr(), matmulKernel.matmulObj, &tiling); + // init matmul kernel, isTransA=false, isTransB=true, quantMode = {1: Scaler, 2: Vector} + if (quantVector) { + matmulKernel.Init(a, b, bias, quantVector, c, tiling, false, true, 2); + } else { + matmulKernel.Init(a, b, bias, quantVector, c, tiling, false, true, 1); + } + // matmul kernel process + matmulKernel.Process(); +} + +#ifndef ASCENDC_CPU_DEBUG +void matmul_quant_custom_do(uint32_t blockDim, void* stream, + GM_ADDR a, GM_ADDR b, GM_ADDR bias, GM_ADDR quantVector, GM_ADDR c, GM_ADDR workspace, GM_ADDR tilingGm) +{ + matmul_quant_custom<<>>(a, b, bias, quantVector, c, workspace, tilingGm); +} +#endif diff --git a/examples/matrix/matmul_quant/op_kernel/matmul_quant_custom_kernel.h b/examples/matrix/matmul_quant/op_kernel/matmul_quant_custom_kernel.h new file mode 100644 index 00000000..f6e3be78 --- /dev/null +++ b/examples/matrix/matmul_quant/op_kernel/matmul_quant_custom_kernel.h @@ -0,0 +1,76 @@ +/** + * Copyright (c) 2025 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. 
+ * See LICENSE in the root of the software repository for the full text of the License. + */ + +#ifndef EXAMPLES_MATRIX_MATMUL_QUANT_OP_KERNEL_MATMUL_QUANT_CUSTOM_KERNEL_H +#define EXAMPLES_MATRIX_MATMUL_QUANT_OP_KERNEL_MATMUL_QUANT_CUSTOM_KERNEL_H +#include "kernel_operator.h" +// Cube Only +#define ASCENDC_CUBE_ONLY +#include "lib/matmul_intf.h" + +namespace MatmulQuantCustom { +template +class MatmulKernel { +public: + __aicore__ inline MatmulKernel(){}; + /** + * @brief Initialization before process. + * @param a: A matrix gm addr. + * @param b: B matrix gm addr. + * @param bias: Bias matrix gm addr. + * @param quantVector: Quant vector gm addr. + * @param c: C matrix gm addr. + * @param tiling: Matmul tiling struct. + * @param isTransA: Whether A matrix is transposed. + * @param isTransB: Whether B matrix is transposed. + * @retval None + */ + __aicore__ inline void Init(GM_ADDR a, GM_ADDR b, GM_ADDR bias, GM_ADDR quantVector, GM_ADDR c, + const TCubeTiling& tiling, bool isTransA, bool isTransB, int64_t quantMode); + /** + * @brief Process matrix calculation. + * @retval None + */ + __aicore__ inline void Process(); + + using A_TYPE = AscendC::MatmulType; + using B_TYPE = AscendC::MatmulType; + using C_TYPE = AscendC::MatmulType; + using BIAS_TYPE = AscendC::MatmulType; + AscendC::Matmul matmulObj; + +private: + /** + * @brief Calculate the gm offset based on the blockIdx. + * @param blockIdx: Current Core blockidx. + * @param offsetA: Gm offset of A matrix. + * @param offsetB: Gm offset of B matrix. + * @param offsetC: Gm offset of C matrix. + * @param offsetBias: Gm offset of Bias matrix. 
+ * @retval None + */ + __aicore__ inline void CalcOffset( + int32_t blockIdx, int32_t& offsetA, int32_t& offsetB, int32_t& offsetC, int32_t& offsetBias); + + AscendC::GlobalTensor aGlobal; + AscendC::GlobalTensor bGlobal; + AscendC::GlobalTensor cGlobal; + AscendC::GlobalTensor biasGlobal; + AscendC::GlobalTensor quantGlobal; + TCubeTiling tiling; + int32_t mCoreIndex; + int32_t nCoreIndex; + bool isTransA{false}; + bool isTransB{false}; + int64_t quantMode{1}; +}; +} // namespace MatmulQuantCustom + +#endif // EXAMPLES_MATRIX_MATMUL_QUANT_OP_KERNEL_MATMUL_QUANT_CUSTOM_KERNEL_H diff --git a/examples/matrix/matmul_quant/run.sh b/examples/matrix/matmul_quant/run.sh new file mode 100644 index 00000000..34ed330f --- /dev/null +++ b/examples/matrix/matmul_quant/run.sh @@ -0,0 +1,96 @@ +#!/bin/bash +# Copyright (c) 2025 Huawei Technologies Co., Ltd. +# This file is a part of the CANN Open Software. +# Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. +# ====================================================================================================================== + +export IS_PERF="0" + +SHORT=r:,v:,p:, +LONG=run-mode:,soc-version:,perf:, +OPTS=$(getopt -a --options $SHORT --longoptions $LONG -- "$@") +eval set -- "$OPTS" +while : +do + case "$1" in + (-r | --run-mode ) + RUN_MODE="$2" + shift 2;; + (-v | --soc-version ) + SOC_VERSION="$2" + shift 2;; + (-p | --perf ) + IS_PERF="$2" + shift 2;; + (--) + shift; + break;; + (*) + echo "[ERROR] Unexpected option: $1"; + break;; + esac +done + +# Check invalid input +if [[ ! 
"${SOC_VERSION}" =~ ^Ascend910 ]]; then + echo "[ERROR] Unsupported SocVersion: ${SOC_VERSION}" + exit 1 +fi + +if [ "${RUN_MODE}" != "npu" ] && [ "${RUN_MODE}" != "sim" ] && [ "${RUN_MODE}" != "cpu" ]; then + echo "[ERROR] Unsupported RUN_MODE: ${RUN_MODE}, which can only be cpu/sim/npu." + exit 1 +fi + +if [ "${IS_PERF}" != "0" ] && [ "${IS_PERF}" != "1" ]; then + echo "[ERROR] Unsupported IS_PERF: ${IS_PERF}, which can only be 0 or 1." + exit 1 +fi + +# only npu mode support is_perf = 1 +if [ "${IS_PERF}" == "1" ] && [ "${RUN_MODE}" != "npu" ]; then + echo "[ERROR] Unsupported IS_PERF: ${IS_PERF} while RUN_MODE is ${RUN_MODE}." + exit 1 +fi + +rm -rf build +mkdir build +cd build + +source $ASCEND_HOME_DIR/bin/setenv.bash +export LD_LIBRARY_PATH=${ASCEND_HOME_DIR}/tools/simulator/${SOC_VERSION}/lib:$LD_LIBRARY_PATH +set -euo pipefail + +cmake -DRUN_MODE=${RUN_MODE} -DSOC_VERSION=${SOC_VERSION} -DASCEND_CANN_PACKAGE_PATH=${ASCEND_HOME_DIR} .. +make -j16 + +cd ../ + +rm -rf input +mkdir input +rm -rf output +mkdir output + +rm -rf bin/ +mkdir -p bin +cd bin +cp ../build/ascendc_matmul_quant_bbit ./ + +export TF_CPP_MIN_LOG_LEVEL=3 + +if [ "${RUN_MODE}" = "npu" ]; then + if [ "${IS_PERF}" = "1" ]; then + export ASCEND_TOOLKIT_HOME=${ASCEND_HOME_DIR} + python3 -u ../scripts/exec_test.py npu "perf" + else + python3 -u ../scripts/exec_test.py npu "normal" + fi +elif [ "${RUN_MODE}" = "sim" ]; then + python3 -u ../scripts/exec_test.py sim "normal" +elif [ "${RUN_MODE}" = "cpu" ]; then + python3 -u ../scripts/exec_test.py cpu "normal" +fi diff --git a/examples/matrix/matmul_quant/scripts/exec_test.py b/examples/matrix/matmul_quant/scripts/exec_test.py new file mode 100644 index 00000000..db26fa8a --- /dev/null +++ b/examples/matrix/matmul_quant/scripts/exec_test.py @@ -0,0 +1,124 @@ +#!/usr/bin/python3 +# coding=utf-8 + +# Copyright (c) 2025 Huawei Technologies Co., Ltd. +# This file is a part of the CANN Open Software. 
+# Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. +# ====================================================================================================================== +import os +import sys +import csv +import time +import logging +import subprocess +import shlex + +import numpy as np + +sys.path.append("../..") +from gen_quant_data import QuantMatmulGenData +from common_scripts.compare_data import compare_data +from common_scripts.exec_utils import get_file_work_dir, get_case_list, clear_file_cache, get_perf_task_duration + +IS_BIAS = True +IS_TRANS_A = False +IS_TRANS_B = True +# int8_float16_dequant(int8 in float16 out for quant matmul) +DATA_TYPE_STR = "int8_float16_dequant" + +logging.basicConfig(level=logging.INFO) + + +class ProcessParams: + def __init__(self, case_name, m, n, k, quant_mode, is_perf, run_mode): + self.case_name = case_name + self.m = m + self.n = n + self.k = k + self.quant_mode = quant_mode + self.is_perf = is_perf + self.run_mode = run_mode + + +def process_case(file_work_dir, process_params): + case_name, m, n, k, quant_mode, is_perf, run_mode = process_params.case_name, process_params.m, process_params.n,\ + process_params.k, process_params.quant_mode, process_params.is_perf, process_params.run_mode + logging.info("[INFO] start process case [%s]" % (case_name)) + logging.info("[INFO] IS_PERF is set [%d]" % (is_perf)) + clear_file_cache(file_work_dir) + + matmul_gen_data = QuantMatmulGenData(m, n, k, quant_mode, IS_TRANS_A, IS_TRANS_B, IS_BIAS, DATA_TYPE_STR) + params_str = f"{m} {n} {k} 
{quant_mode}" + matmul_gen_data.gen_golden_data(file_work_dir) + if is_perf: + cmd = f"msprof op --application=\"./ascendc_matmul_quant_bbit {params_str}\" --output=./prof_out" + elif run_mode == "sim": + cmd = f"msprof op simulator --application=\"./ascendc_matmul_quant_bbit {params_str}\" --output=./sim_out" + else: + cmd = f"./ascendc_matmul_quant_bbit {params_str}" + subprocess.run(shlex.split(cmd)) + if is_perf: + wrong_num = -1 + else: + logging.info("[INFO] compare data case[%s]" % (case_name)) + wrong_num = compare_data(file_work_dir, DATA_TYPE_STR) + res_data = [] + res_data.append(case_name) + res_data.append(wrong_num) + res_data.append(m * n) + if wrong_num == -1: + res_data.append("None") + elif wrong_num / (m * n) > 0.001: + res_data.append("Fail") + else: + res_data.append("Success") + if is_perf: + task_duration = get_perf_task_duration("./prof_out") + res_data.append(task_duration) + return res_data + + +def main(): + args_len = len(sys.argv) - 1 + if args_len != 2: + logging.info("[ERROR] exec_test input params error!") + return -1 + + file_work_dir = get_file_work_dir() + if not os.path.exists(file_work_dir): + logging.info("[ERROR] file path %s not exist!" 
% (file_work_dir)) + return -1 + + is_perf = False + title_list = ["case_name", "wrong_num", "total_num", "result"] + if sys.argv[2] == "perf": + is_perf = True + title_list.append("task_duration") + + case_list = get_case_list() + res_list = [title_list] + run_mode = sys.argv[1] + for is_process, case_name, m, n, k, quant_mode in case_list: + if is_process == 1: + process_params = ProcessParams(case_name, m, n, k, quant_mode, is_perf, run_mode) + res_data = process_case(file_work_dir, process_params) + res_list.append(res_data) + + timestamp = time.time() + result_file_name = "result_" + str(timestamp) + ".csv" + with open(os.path.join(file_work_dir, "output", result_file_name), 'w', newline='', encoding='utf-8') as csvfile: + writer = csv.writer(csvfile) + writer.writerows(res_list) + + logging.info("---------------RESULT---------------") + for res in res_list: + logging.info(res) + return 0 + + +if __name__ == "__main__": + main() diff --git a/examples/matrix/matmul_quant/scripts/gen_quant_data.py b/examples/matrix/matmul_quant/scripts/gen_quant_data.py new file mode 100644 index 00000000..4e5e2db9 --- /dev/null +++ b/examples/matrix/matmul_quant/scripts/gen_quant_data.py @@ -0,0 +1,97 @@ +#!/usr/bin/python3 +# coding=utf-8 + +# Copyright (c) 2025 Huawei Technologies Co., Ltd. +# This file is a part of the CANN Open Software. +# Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. 
+# ====================================================================================================================== +import os +import sys +import logging + +import numpy as np + +IS_OUTPUT_TXT = False + + +class QuantMatmulGenData: + def __init__(self, m, n, k, quant_mode, is_trans_a, is_trans_b, is_bias, data_type_str): + self.m = m + self.n = n + self.k = k + self.quant_mode = quant_mode + self.is_bias = is_bias + self.is_trans_a = is_trans_a + self.is_trans_b = is_trans_b + self.data_type_str = data_type_str + + + def gen_golden_data_int8_float16_dequant(self, work_dir): + src_type = np.int8 + dst_type = np.float16 + data_type = np.int32 + quant_type = np.uint64 + temp_type = np.float32 + + # generate input x1, x2 + x1_gm = np.random.randint(-10, 10, [self.m, self.k]).astype(src_type) + x2_gm = np.random.randint(-10, 10, [self.k, self.n]).astype(src_type) + if self.is_bias: + bias_gm = np.random.randint(-2, 2, [1, self.n]).astype(data_type) + if self.quant_mode == 2: # 1: scalar quant mode, 2: vector quant mode + quant_vector = np.random.uniform(0.1, 2.0, [1, self.n]).astype(temp_type) + quant_vector_gm = np.frombuffer(quant_vector, data_type) + quant_vector_gm = quant_vector_gm.astype(quant_type) + + y_gm_int32 = np.matmul(x1_gm.astype(data_type), x2_gm.astype(data_type)) + if self.is_bias: + y_gm_int32 = y_gm_int32 + bias_gm + + golden = y_gm_int32.astype(dst_type) + if self.quant_mode == 1: + golden = golden * 0.1 + elif self.quant_mode == 2: + quant_vector = quant_vector.view("uint32") + for index, data in enumerate(quant_vector): + # 1 sign bit, 8 exponent bits and 10 mantissa bits + quant_vector[index] = np.bitwise_and(data, 0xFFFFE000) + quant_vector = quant_vector.view("float32") + for i in range(self.m): + golden[i, :] = golden[i, :] * quant_vector + else: + logging.info("[ERROR] can't support quant mode %s" % (self.quant_mode)) + + if self.is_trans_a: + x1_gm = x1_gm.transpose() + if self.is_trans_b: + x2_gm = x2_gm.transpose() + 
x1_gm.tofile(work_dir + "/input/x1_gm.bin") + x2_gm.tofile(work_dir + "/input/x2_gm.bin") + if self.is_bias: + bias_gm.tofile(work_dir + "/input/bias_gm.bin") + if self.quant_mode == 2: + quant_vector_gm.tofile(work_dir + "/input/quant_vector_gm.bin") + golden.tofile(work_dir + "/output/golden.bin") + + if IS_OUTPUT_TXT: + np.savetxt(work_dir + "/input/x1_gm.txt", x1_gm.flatten(), fmt='%d', newline='\n') + np.savetxt(work_dir + "/input/x2_gm.txt", x2_gm.flatten(), fmt='%d', newline='\n') + np.savetxt(work_dir + "/output/golden.txt", golden.flatten(), fmt='%f', newline='\n') + if self.is_bias: + np.savetxt(work_dir + "/input/bias_gm.txt", bias_gm.flatten(), fmt='%d', newline='\n') + if self.quant_mode == 2: + np.savetxt(work_dir + "/output/quant_vector_gm.txt", quant_vector_gm.flatten(), fmt='%d', newline='\n') + return 0 + + + def gen_golden_data(self, work_dir): + if self.data_type_str == "int8_float16_dequant": + self.gen_golden_data_int8_float16_dequant(work_dir) + else: + logging.info("[ERROR] can't support data type %s" % (self.data_type_str)) + return -1 + return 0 diff --git a/examples/matrix/matmul_quant/testcase/case.csv b/examples/matrix/matmul_quant/testcase/case.csv new file mode 100644 index 00000000..161b8812 --- /dev/null +++ b/examples/matrix/matmul_quant/testcase/case.csv @@ -0,0 +1,4 @@ +1, ScalarQuantCase001, 1024, 1024, 1024, 1 +1, ScalarQuantCase002, 32, 32, 64, 1 +1, VectorQuantCase001, 1024, 1024, 1024, 2 +1, VectorQuantCase002, 256, 512, 256, 2 \ No newline at end of file diff --git a/examples/matrix/matmul_sparse/README.md b/examples/matrix/matmul_sparse/README.md index 1124d0d8..d90f9175 100644 --- a/examples/matrix/matmul_sparse/README.md +++ b/examples/matrix/matmul_sparse/README.md @@ -1,7 +1,7 @@ ## 概述 -本样例使用Matmul API实现了Sparse Matmul场景的算子,即左矩阵A为稀疏矩阵,右矩阵B为4:2稠密化后的矩阵的Matmul计算。 +本样例使用Matmul API实现了Sparse Matmul场景的算子,即左矩阵A为稀疏矩阵,右矩阵B为4:2稠密化后的矩阵的Matmul计算。关于Sparse Matmul的具体内容请参考《[Ascend 
C算子开发](https://www.hiascend.com/document/redirect/CannCommunityOpdevAscendC)》中的“算子实现 > 矩阵编程(高阶API) > 特性场景 > 4:2稀疏矩阵乘”章节。 本样例以直调的方式调用算子核函数。 直调:核函数的基础调用方式,开发者完成算子核函数的开发和Tiling实现后,即可通过AscendCL运行时接口,完成算子的调用。包含CPU侧、NPU侧和仿真三种运行验证方法。 @@ -15,7 +15,7 @@ | [cmake](cmake) | 编译工程文件 | | [op_host](op_host) | 本样例的tiling代码实现 | | [op_kernel](op_kernel) | 本样例的kernel代码实现 | -| [scripts](scripts) | 执行文件 | +| [scripts](scripts) | 包含输入输出真值数据生成脚本文件和执行文件 | | [testcase](testcase) | 用例文件,配置用例的计算shape信息 | | CMakeLists.txt | 编译工程文件 | | main.cpp | 主函数,调用算子的应用程序,含CPU域及NPU域调用 | @@ -68,7 +68,7 @@ - 结束矩阵乘操作。 - 算子Tiling实现 - - Ascend C提供一组Matmul Tiling API,方便用户获取MatMul kernel计算时所需的Tiling参数。只需要传入A/B/C矩阵等信息,调用API接口,即可获取到TCubeTiling结构体中的相关参数。 + - Ascend C提供一组Matmul Tiling API,方便用户获取Matmul kernel计算时所需的Tiling参数。只需要传入A/B/C矩阵等信息,调用API接口,即可获取到TCubeTiling结构体中的相关参数。 - 获取Tiling参数的流程如下: - 创建一个Tiling对象。 - 使能Sparse Matmul稀疏矩阵计算场景。 @@ -104,4 +104,4 @@ 示例如下,Ascendxxxyy请替换为实际的AI处理器型号。 ``` bash run.sh -r cpu -v Ascendxxxyy -p 0 - ``` \ No newline at end of file + ``` diff --git a/examples/matrix/matmul_sparse/scripts/exec_test.py b/examples/matrix/matmul_sparse/scripts/exec_test.py index 1874639f..a44b20e2 100644 --- a/examples/matrix/matmul_sparse/scripts/exec_test.py +++ b/examples/matrix/matmul_sparse/scripts/exec_test.py @@ -14,13 +14,15 @@ import sys import csv import time import logging +import subprocess import numpy as np sys.path.append("../..") from gen_sparse_data import SparseMatmulGenData from common_scripts.compare_data import compare_data -from common_scripts.exec_utils import get_file_work_dir, get_case_list, get_perf_task_duration, clear_file_cache +from common_scripts.exec_utils import get_file_work_dir, get_case_list, get_perf_task_duration, clear_file_cache,\ + get_process_case_cmd IS_BIAS = False IS_TRANS_A = False @@ -50,17 +52,14 @@ def process_case(file_work_dir, process_params): clear_file_cache(file_work_dir) matmul_gen_data = SparseMatmulGenData(m, n, k, b, IS_TRANS_A, IS_TRANS_B, IS_BIAS, 
DATA_TYPE_STR) - if is_perf: # npu(is_perf = 1) + if is_perf: matmul_gen_data.gen_fake_golden_data(file_work_dir) - os.system("msprof op --application=\"./ascendc_matmul_sparse_bbit %s %s %s %s\" --output=\"./prof_out\"" % - (m, n, k, b)) - elif run_mode == "sim": # sim - matmul_gen_data.gen_golden_data(file_work_dir) - os.system("msprof op simulator --application=\"./ascendc_matmul_sparse_bbit %s %s %s %s\"\ - --output=\"./sim_out\"" % (m, n, k, b)) - else: # cpu or npu(is_perf = 0) + else: matmul_gen_data.gen_golden_data(file_work_dir) - os.system("./ascendc_matmul_sparse_bbit %s %s %s %s" % (m, n, k, b)) + params_str = f"{m} {n} {k} {b}" + cmd = get_process_case_cmd(kernel_name="ascendc_matmul_sparse_bbit", params_str=params_str,\ + is_perf=is_perf, run_mode=run_mode) + subprocess.run(cmd) if is_perf: wrong_num = -1 else: diff --git a/examples/matrix/matmul_sparse/scripts/gen_sparse_data.py b/examples/matrix/matmul_sparse/scripts/gen_sparse_data.py index d6082ad3..758d7a68 100644 --- a/examples/matrix/matmul_sparse/scripts/gen_sparse_data.py +++ b/examples/matrix/matmul_sparse/scripts/gen_sparse_data.py @@ -197,11 +197,11 @@ class SparseMatmulGenData: with open(work_dir + "/input/x1_gm.bin", 'wb') as file: file.truncate(file_byte) - file_byte = self.k / 2 * self.n * data_type_bytes_ab + file_byte = int(self.k / 2 * self.n * data_type_bytes_ab) with open(work_dir + "/input/x2_gm.bin", 'wb') as file: file.truncate(file_byte) - file_byte = self.k / 8 * self.n * data_type_bytes_ab + file_byte = int(self.k / 8 * self.n * data_type_bytes_ab) with open(work_dir + "/input/index_gm.bin", 'wb') as file: file.truncate(file_byte) diff --git a/examples/matrix/matmul_splitk/README.md b/examples/matrix/matmul_splitk/README.md index dcb7c851..43ee9419 100644 --- a/examples/matrix/matmul_splitk/README.md +++ b/examples/matrix/matmul_splitk/README.md @@ -69,7 +69,7 @@ - 结束矩阵乘操作。 - 算子Tiling实现 - - Ascend C提供一组Matmul Tiling API,方便用户获取MatMul 
kernel计算时所需的Tiling参数。只需要传入A/B/C矩阵等信息,并且使能多核切K,调用API接口,即可获取到TCubeTiling结构体中的相关参数。 + - Ascend C提供一组Matmul Tiling API,方便用户获取Matmul kernel计算时所需的Tiling参数。只需要传入A/B/C矩阵等信息,并且使能多核切K,调用API接口,即可获取到TCubeTiling结构体中的相关参数。 - 获取Tiling参数的流程如下: - 创建一个Tiling对象。 - 设置A、B、C、Bias的参数类型信息;M、N、Ka、Kb形状信息等,调用EnableMultiCoreSplitK使能多核切K。 diff --git a/examples/matrix/matmul_splitk/op_kernel/matmul_splitk_custom.cpp b/examples/matrix/matmul_splitk/op_kernel/matmul_splitk_custom.cpp index 629977b1..e07b783d 100644 --- a/examples/matrix/matmul_splitk/op_kernel/matmul_splitk_custom.cpp +++ b/examples/matrix/matmul_splitk/op_kernel/matmul_splitk_custom.cpp @@ -28,11 +28,13 @@ __aicore__ inline void CopyTiling(TCubeTiling* tiling, GM_ADDR tilingGM) extern "C" __global__ __aicore__ void matmul_splitk_custom(GM_ADDR a, GM_ADDR b, GM_ADDR bias, GM_ADDR c, GM_ADDR workspace, GM_ADDR tilingGm) { + if ASCEND_IS_AIV { + return; + } TCubeTiling tiling; MatmulSplitKCustom::CopyTiling(&tiling, tilingGm); MatmulSplitKCustom::MatmulSplitkKernel matmulKernel; AscendC::TPipe pipe; - REGIST_MATMUL_OBJ(&pipe, GetSysWorkSpacePtr(), matmulKernel.matmulObj, &tiling); matmulKernel.Init(a, b, bias, c, workspace, tiling); matmulKernel.Process(&pipe); } diff --git a/examples/matrix/matmul_splitk/op_kernel/matmul_splitk_custom_impl.h b/examples/matrix/matmul_splitk/op_kernel/matmul_splitk_custom_impl.h index 503eb3bf..26a36bb2 100644 --- a/examples/matrix/matmul_splitk/op_kernel/matmul_splitk_custom_impl.h +++ b/examples/matrix/matmul_splitk/op_kernel/matmul_splitk_custom_impl.h @@ -11,6 +11,7 @@ #ifndef EXAMPLES_MATRIX_MATMUL_SPLITK_OP_KERNEL_MATMUL_SPLITK_CUSTOM_IMPL_H #define EXAMPLES_MATRIX_MATMUL_SPLITK_OP_KERNEL_MATMUL_SPLITK_CUSTOM_IMPL_H #include "kernel_operator.h" +#define ASCENDC_CUBE_ONLY #include "lib/matmul_intf.h" namespace MatmulSplitKCustom { @@ -22,7 +23,7 @@ class MatmulSplitkKernel { __aicore__ inline MatmulSplitkKernel(){}; __aicore__ inline void Init(GM_ADDR a, GM_ADDR b, GM_ADDR bias, GM_ADDR c, 
GM_ADDR workspace, const TCubeTiling& tiling); __aicore__ inline void Process(AscendC::TPipe* pipe); - AscendC::Matmul< + AscendC::MatmulImpl< AscendC::MatmulType, AscendC::MatmulType, AscendC::MatmulType, @@ -69,9 +70,8 @@ __aicore__ inline void MatmulSplitkKernel::Init( template __aicore__ inline void MatmulSplitkKernel::Process(AscendC::TPipe* pipe) { - if (matmul::GetBlockIdx() >= tiling.usedCoreNum) { - return; - } + matmulObj.Init(&(this->tiling), pipe); + matmulObj.SetSubBlockIdx(0); matmulObj.SetTensorA(aGlobal); matmulObj.SetTensorB(bGlobal); if (tiling.isBias) { diff --git a/examples/matrix/matmul_splitk/scripts/exec_test.py b/examples/matrix/matmul_splitk/scripts/exec_test.py index f2057c01..2f2b876d 100644 --- a/examples/matrix/matmul_splitk/scripts/exec_test.py +++ b/examples/matrix/matmul_splitk/scripts/exec_test.py @@ -14,13 +14,15 @@ import sys import csv import time import logging +import subprocess import numpy as np sys.path.append("../..") from common_scripts.gen_data import MatmulGenData from common_scripts.compare_data import compare_data -from common_scripts.exec_utils import get_file_work_dir, get_case_list, get_perf_task_duration, clear_file_cache +from common_scripts.exec_utils import get_file_work_dir, get_case_list, get_perf_task_duration, clear_file_cache,\ + get_process_case_cmd IS_BIAS = False IS_TRANS_A = False @@ -49,17 +51,14 @@ def process_case(file_work_dir, process_params): clear_file_cache(file_work_dir) matmul_gen_data = MatmulGenData(m, n, k, b, IS_TRANS_A, IS_TRANS_B, IS_BIAS, DATA_TYPE_STR) - if is_perf: # npu(is_perf = 1) + if is_perf: matmul_gen_data.gen_fake_golden_data(file_work_dir) - os.system("msprof op --application=\"./ascendc_matmul_splitk_bbit %s %s %s %s\"\ - --output=\"./prof_out\"" % (m, n, k, b)) - elif run_mode == "sim": # sim(is_perf = 0) - matmul_gen_data.gen_golden_data(file_work_dir) - os.system("msprof op simulator --application=\"./ascendc_matmul_splitk_bbit %s %s %s %s\"\ - --output=\"./sim_out\"" % 
(m, n, k, b)) - else: # cpu or npu(is_perf = 0) + else: matmul_gen_data.gen_golden_data(file_work_dir) - os.system("./ascendc_matmul_splitk_bbit %s %s %s %s" % (m, n, k, b)) + params_str = f"{m} {n} {k} {b}" + cmd = get_process_case_cmd(kernel_name="ascendc_matmul_splitk_bbit", params_str=params_str,\ + is_perf=is_perf, run_mode=run_mode) + subprocess.run(cmd) if is_perf: wrong_num = -1 else: diff --git a/examples/matrix/matmul_splitk/testcase/case.csv b/examples/matrix/matmul_splitk/testcase/case.csv index 0036707a..5b39cef8 100644 --- a/examples/matrix/matmul_splitk/testcase/case.csv +++ b/examples/matrix/matmul_splitk/testcase/case.csv @@ -1 +1,2 @@ -1, case001, 16, 16, 1024 \ No newline at end of file +1, case001, 16, 16, 1024 +0, case002, 256, 512, 1536 \ No newline at end of file diff --git a/examples/matrix/matmul_triangle/README.md b/examples/matrix/matmul_triangle/README.md index 31ef4972..2da1418a 100644 --- a/examples/matrix/matmul_triangle/README.md +++ b/examples/matrix/matmul_triangle/README.md @@ -15,7 +15,7 @@ | [cmake](cmake) | 编译工程文件 | | [op_host](op_host) | 本样例tiling代码实现 | | [op_kernel](op_kernel) | 本样例kernel代码实现 | -| [scripts](scripts) | 执行文件 | +| [scripts](scripts) | 包含输入输出真值数据生成脚本文件和执行文件 | | [testcase](testcase) | 用例文件,配置用例的计算shape信息 | | [CMakeLists.txt](CMakeLists.txt) | 编译工程文件 | | [main.cpp](main.cpp) | 主函数,调用算子的应用程序,含CPU域及NPU域调用 | @@ -101,7 +101,7 @@ - 结束矩阵乘操作。 - Tiling实现 - - Ascend C提供一组Matmul Tiling API,方便用户获取MatMul kernel计算时所需的Tiling参数。只需要传入A/B/C矩阵等信息,调用API接口,即可获取到TCubeTiling结构体中的相关参数。 + - Ascend C提供一组Matmul Tiling API,方便用户获取Matmul kernel计算时所需的Tiling参数。只需要传入A/B/C矩阵等信息,调用API接口,即可获取到TCubeTiling结构体中的相关参数。 - 获取Tiling参数的流程如下: - 创建一个Tiling对象。 - 设置A、B、C、Bias的参数类型信息,以及SingleShape和baseM、baseN、baseK信息。 diff --git a/examples/matrix/matmul_triangle/main.cpp b/examples/matrix/matmul_triangle/main.cpp index b96f0726..346a9905 100644 --- a/examples/matrix/matmul_triangle/main.cpp +++ b/examples/matrix/matmul_triangle/main.cpp @@ -186,7 +186,8 @@ 
int32_t main(int32_t argc, const char* args[]) ss >> problem[i - 1]; } - optiling::TestcaseParams caseParams = {problem[0], problem[1], problem[2], problem[3], problem[4], problem[5], + bool isBias = problem[4]; + optiling::TestcaseParams caseParams = {problem[0], problem[1], problem[2], problem[3], isBias, problem[5], problem[6], problem[7], problem[8], problem[9], problem[10]}; auto ascendcPlatform = platform_ascendc::PlatformAscendCManager::GetInstance(SOC_VERSION); #ifdef ASCENDC_CPU_DEBUG diff --git a/examples/matrix/matmul_tscm/README.md b/examples/matrix/matmul_tscm/README.md index 454f5a9b..1fa3a717 100644 --- a/examples/matrix/matmul_tscm/README.md +++ b/examples/matrix/matmul_tscm/README.md @@ -1,7 +1,7 @@ ## 概述 -本样例介绍了Matmul API中用户自定义TSCM输入的使用方式。TSCM即Temp Swap Cache Memory,用于临时把数据交换到额外空间,需开发者自行管理以高效利用硬件资源。该场景由用户自行管理L1 Buffer,再将对应数据地址作为Matmul的输入。 +本样例介绍了Matmul API中用户自定义TSCM输入的使用方式。TSCM即Temp Swap Cache Memory,用于临时把数据交换到额外空间,需开发者自行管理以高效利用硬件资源。该场景由用户自行管理L1 Buffer,再将对应数据地址作为Matmul的输入。关于TSCM输入的具体内容请参考《[Ascend C算子开发](https://www.hiascend.com/document/redirect/CannCommunityOpdevAscendC)》中的“算子实现 > 矩阵编程(高阶API) > 特性场景 > TSCM输入的矩阵乘”章节。 本样例以直调的方式调用算子核函数。 直调:核函数的基础调用方式,开发者完成算子核函数的开发和Tiling实现后,即可通过AscendCL运行时接口,完成算子的调用。包含CPU侧、NPU侧和仿真三种运行验证方法。 @@ -71,7 +71,7 @@ - 结束矩阵乘操作。 - 算子Tiling实现 - - Ascend C提供一组Matmul Tiling API,方便用户获取MatMul kernel计算时所需的Tiling参数。只需要传入A/B/C矩阵等信息,调用API接口,即可获取到TCubeTiling结构体中的相关参数。 + - Ascend C提供一组Matmul Tiling API,方便用户获取Matmul kernel计算时所需的Tiling参数。只需要传入A/B/C矩阵等信息,调用API接口,即可获取到TCubeTiling结构体中的相关参数。 - 获取Tiling参数的流程如下: - 创建一个Tiling对象。 - 设置A、B、C、Bias的参数类型信息;M、N、Ka、Kb形状信息等。 @@ -101,4 +101,4 @@ 示例如下,Ascendxxxyy请替换为实际的AI处理器型号。 ``` bash run.sh -r cpu -v Ascendxxxyy -p 0 - ``` \ No newline at end of file + ``` diff --git a/examples/matrix/matmul_tscm/scripts/exec_test.py b/examples/matrix/matmul_tscm/scripts/exec_test.py index a7a5461c..de439b5f 100644 --- a/examples/matrix/matmul_tscm/scripts/exec_test.py +++ 
b/examples/matrix/matmul_tscm/scripts/exec_test.py @@ -14,13 +14,15 @@ import sys import csv import time import logging +import subprocess import numpy as np sys.path.append("../..") from common_scripts.gen_data import MatmulGenData from common_scripts.compare_data import compare_data -from common_scripts.exec_utils import get_file_work_dir, get_case_list, get_perf_task_duration, clear_file_cache +from common_scripts.exec_utils import get_file_work_dir, get_case_list, get_perf_task_duration, clear_file_cache,\ + get_process_case_cmd IS_BIAS = True IS_TRANS_A = False @@ -51,17 +53,14 @@ def process_case(file_work_dir, process_params): clear_file_cache(file_work_dir) matmul_gen_data = MatmulGenData(m, n, k, b, IS_TRANS_A, IS_TRANS_B, IS_BIAS, DATA_TYPE_STR, A_FORMAT, B_FORMAT) - if is_perf: # npu(is_perf = 1) + if is_perf: matmul_gen_data.gen_fake_golden_data(file_work_dir) - os.system("msprof op --application=\"./ascendc_matmul_tscm_bbit %s %s %s %s\" --output=\"./prof_out\"" % - (m, n, k, b)) - elif run_mode == "sim": # sim - matmul_gen_data.gen_golden_data(file_work_dir) - os.system("msprof op simulator --application=\"./ascendc_matmul_tscm_bbit %s %s %s %s\"\ - --output=\"./sim_out\"" % (m, n, k, b)) - else: # cpu or npu(is_perf = 0) + else: matmul_gen_data.gen_golden_data(file_work_dir) - os.system("./ascendc_matmul_tscm_bbit %s %s %s %s" % (m, n, k, b)) + params_str = f"{m} {n} {k} {b}" + cmd = get_process_case_cmd(kernel_name="ascendc_matmul_tscm_bbit", params_str=params_str,\ + is_perf=is_perf, run_mode=run_mode) + subprocess.run(cmd) if is_perf: wrong_num = -1 else: diff --git a/examples/matrix/matmul_unaligned/README.md b/examples/matrix/matmul_unaligned/README.md index 69dac450..78b02892 100644 --- a/examples/matrix/matmul_unaligned/README.md +++ b/examples/matrix/matmul_unaligned/README.md @@ -1,7 +1,7 @@ ## 概述 -本样例介绍了调用Matmul高阶API实现多核非对齐切分的单算子。多核非对齐的切分为M,N,K无法整除singleCoreM, singleCoreN, 
singleCoreK时,需要在不改变原有Tiling的情况下,在Kernel侧调用SetTail接口重新设置本次计算的singleCoreM/singleCoreN/singleCoreK。 +本样例介绍了调用Matmul高阶API实现多核非对齐切分的单算子。多核非对齐的切分为M,N,K无法整除singleCoreM, singleCoreN, singleCoreK时,需要在不改变原有Tiling的情况下,在Kernel侧调用SetTail接口重新设置本次计算的singleCoreM/singleCoreN/singleCoreK。关于多核非对齐切分的具体内容请参考《[Ascend C算子开发](https://hiascend.com/document/redirect/CannCommunityOpdevAscendC)》中的“算子实现 > 矩阵编程(高阶API) > 特性场景 > 多核非对齐切分”章节。 本样例以直调的方式调用算子核函数。 直调:核函数的基础调用方式,开发者完成算子核函数的开发和Tiling实现后,即可通过AscendCL运行时接口,完成算子的调用。包含CPU侧、NPU侧和仿真三种运行验证方式。 @@ -78,7 +78,7 @@ - 结束矩阵乘操作。 - Tiling实现 - - Ascend C提供一组Matmul Tiling API,方便用户获取MatMul kernel计算时所需的Tiling参数。只需要传入A/B/C矩阵等信息,调用API接口,即可获取到TCubeTiling结构体中的相关参数。 + - Ascend C提供一组Matmul Tiling API,方便用户获取Matmul kernel计算时所需的Tiling参数。只需要传入A/B/C矩阵等信息,调用API接口,即可获取到TCubeTiling结构体中的相关参数。 - 获取Tiling参数的流程如下: - 创建一个Tiling对象。 - 设置A、B、C、Bias的参数类型信息;M、N、Ka、Kb形状信息等。 diff --git a/examples/matrix/matmul_unaligned/main.cpp b/examples/matrix/matmul_unaligned/main.cpp index 9e7f1270..505344e1 100644 --- a/examples/matrix/matmul_unaligned/main.cpp +++ b/examples/matrix/matmul_unaligned/main.cpp @@ -184,7 +184,8 @@ int32_t main(int32_t argc, const char* args[]) ss >> problem[i - 1]; } - optiling::TestcaseParams caseParams = {problem[0], problem[1], problem[2], problem[3]}; + bool isBias = problem[3]; + optiling::TestcaseParams caseParams = {problem[0], problem[1], problem[2], isBias}; auto ascendcPlatform = platform_ascendc::PlatformAscendCManager::GetInstance(SOC_VERSION); #ifdef ASCENDC_CPU_DEBUG diff --git a/examples/matrix/matmul_unitflag/CMakeLists.txt b/examples/matrix/matmul_unitflag/CMakeLists.txt new file mode 100644 index 00000000..dc2dc3ff --- /dev/null +++ b/examples/matrix/matmul_unitflag/CMakeLists.txt @@ -0,0 +1,74 @@ +# Copyright (c) 2025 Huawei Technologies Co., Ltd. +# This file is a part of the CANN Open Software. +# Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). +# Please refer to the License for details. 
You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. +# ====================================================================================================================== + +cmake_minimum_required(VERSION 3.16) +project(Ascend_c) +if (${RUN_MODE}) + set(RUN_MODE "npu" CACHE STRING "cpu/sim/npu") +endif() +if (${SOC_VERSION}) + set(SOC_VERSION "Ascend910" CACHE STRING "system on chip type") +endif() + +set(ASCEND_CANN_PACKAGE_PATH "~/Ascend/ascend-toolkit/latest" CACHE STRING "ASCEND CANN package installation directory") +if(NOT CMAKE_BUILD_TYPE) + set(CMAKE_BUILD_TYPE "Debug" CACHE STRING "Build type Release/Debug (default Debug)" FORCE) +endif() + +if(CMAKE_INSTALL_PREFIX STREQUAL /usr/local) + set(CMAKE_INSTALL_PREFIX "${CMAKE_CURRENT_LIST_DIR}/out" CACHE STRING "path for install()" FORCE) +endif() + +file(GLOB KERNEL_FILES + ${CMAKE_CURRENT_SOURCE_DIR}/op_kernel/matmul_unitflag_custom_kernel.cpp +) + +if("${RUN_MODE}" STREQUAL "cpu") + include(cmake/cpu_lib.cmake) +elseif("${RUN_MODE}" STREQUAL "sim" OR "${RUN_MODE}" STREQUAL "npu") + include(cmake/npu_lib.cmake) +else() + message("invalid RUN_MODE: ${RUN_MODE}") +endif() + +add_executable(ascendc_matmul_unitflag_bbit + ${CMAKE_CURRENT_SOURCE_DIR}/main.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/op_host/matmul_unitflag_custom_tiling.cpp +) + +target_compile_options(ascendc_matmul_unitflag_bbit PRIVATE + $:-g>> + -O2 + -std=c++17 + -D_GLIBCXX_USE_CXX11_ABI=0 +) + +target_compile_definitions(ascendc_matmul_unitflag_bbit PRIVATE + $<$:ENABLE_UNITFLAG_FEATURE> + SOC_VERSION="${SOC_VERSION}" +) + +target_include_directories(ascendc_matmul_unitflag_bbit PRIVATE + ${CMAKE_CURRENT_SOURCE_DIR} + 
$:${ASCEND_CANN_PACKAGE_PATH}/include>> + $:${ASCEND_CANN_PACKAGE_PATH}/runtime/include>> +) + +target_link_libraries(ascendc_matmul_unitflag_bbit PRIVATE + $,$>:host_intf_pub>> + $:tikicpulib::${SOC_VERSION}>> + $:ascendcl>> + $:c_sec>> + ascendc_kernels_${RUN_MODE} + tiling_api + register + platform + ascendalog + dl +) diff --git a/examples/matrix/matmul_unitflag/README.md b/examples/matrix/matmul_unitflag/README.md new file mode 100644 index 00000000..d10ff5c3 --- /dev/null +++ b/examples/matrix/matmul_unitflag/README.md @@ -0,0 +1,112 @@ + +## 概述 + +本样例介绍了调用Matmul API实现MDL模板开启UnitFlag功能的单算子。使能UnitFlag功能,可以使算子中的CUBE计算流水与FIXPIPE数据搬出流水并行,提升算子性能。Norm模板、IBShare模板默认使能UnitFlag功能,MDL模板默认不使能UnitFlag功能。 + +本样例以直调的方式调用算子核函数。 +直调:核函数的基础调用方式,开发者完成算子核函数的开发和Tiling实现后,即可通过AscendCL运行时接口,完成算子的调用。包含CPU侧、NPU侧和仿真三种运行验证方法。 + +## 样例支持的产品型号为 +- Atlas A2训练系列产品/Atlas 800I A2推理产品 + +## 目录结构 +| 目录及文件 | 描述 | +|----------------------------------|----------------------| +| [cmake](cmake) | 编译工程文件 | +| [op_host](op_host) | 本样例的tiling代码实现 | +| [op_kernel](op_kernel) | 本样例的kernel代码实现 | +| [scripts](scripts) | 执行脚本文件 | +| [testcase](testcase) | 用例文件,配置用例的计算shape信息 | +| CMakeLists.txt | 编译工程文件 | +| main.cpp | 主函数,调用算子的应用程序,含CPU域及NPU域调用 | +| run.sh | 编译执行脚本 | + +## 算子描述 +- 算子功能 + MatmulUnitFlagCustom算子调用Matmul API计算时,通过配置MatmulConfig中的enUnitFlag参数为true,使能MDL模板开启UnitFlag功能,对输入的A、B矩阵做矩阵乘和加bias偏置。算子使能UnitFlag功能后,在Matmul API内部实现MMAD指令和FIXPIPE指令的细粒度同步,从而使计算流水与数据搬出流水并行,提升算子性能。 + +- 算子规格 + + + + + + + + + + + + +
算子类型(OpType)MatmulUnitFlagCustom
算子输入nameshapedata typeformatisTrans
a-float16NDfalse
b-float16NDfalse
bias-floatND-
算子输出c-floatND-
核函数名matmul_unitflag_custom
+ +## 算子实现介绍 +- 约束条件 + - UnitFlag功能只支持Norm、IBshare、MDL三个模板。 + - 使能UnitFlag功能时,不支持算子内同时存在L0C搬出到Global Memory和L1搬出到Global Memory的两种流水。 + - 使能UnitFlag功能时,若同时使能L0C累加功能,不支持多次Iterate计算,一次GetTensorC输出。 + +- 算子Kernel实现 + - 计算逻辑:C = A * B + Bias。 + - A、B为源操作数,A为左矩阵,形状为[M, K];B为右矩阵,形状为[K, N]。 + - C为目的操作数,存放矩阵乘结果的矩阵,形状为[M, N]。 + - Bias为矩阵乘偏置,形状为[1, N]。对A*B结果矩阵的每一行都采用该Bias进行偏置。 + - 具体步骤: + - 创建Matmul对象。 + 创建Matmul对象时,自定义MatmulConfig参数,将其中的enUnitFlag参数设置为true,使能UnitFlag功能,获得自定义的使用MDL模板的Matmul对象。 + ``` + __aicore__ inline constexpr MatmulConfig GetUnitFlagCfg() + { + auto mmCfg = CFG_MDL; + #ifdef ENABLE_UNITFLAG_FEATURE + // enable UnitFlag feature + mmCfg.enUnitFlag = true; + #endif + return mmCfg; + } + constexpr static MatmulConfig CFG_MDL_UNITFLAG = GetUnitFlagCfg(); + + using A_TYPE = AscendC::MatmulType; + using B_TYPE = AscendC::MatmulType; + using C_TYPE = AscendC::MatmulType; + using BIAS_TYPE = AscendC::MatmulType; + AscendC::Matmul matmulObj; + ``` + - 初始化操作。 + - 设置左矩阵A、右矩阵B。 + - 完成矩阵乘操作。 + - 结束矩阵乘操作。 + +- 算子Tiling实现 + - Ascend C提供一组Matmul Tiling API,方便用户获取Matmul kernel计算时所需的Tiling参数。只需要传入A/B/C矩阵等信息,调用API接口,即可获取到TCubeTiling结构体中的相关参数。 + - 获取Tiling参数的流程如下: + - 创建一个Tiling对象。 + - 设置A、B、C、Bias的参数类型信息;M、N、Ka、Kb形状信息等。 + - 调用GetTiling接口,获取Tiling信息。 + +## 编译运行样例 + + - 配置环境变量 + + 这里的\$ASCEND_CANN_PACKAGE_PATH需要替换为CANN开发套件包安装后文件存储路径。例如:/usr/local/Ascend/ascend-toolkit/latest + ``` + export ASCEND_HOME_DIR=$ASCEND_CANN_PACKAGE_PATH + source $ASCEND_HOME_DIR/../set_env.sh + ``` + + - 编译执行 + + ``` + bash run.sh -r [RUN_MODE] -v [SOC_VERSION] -p [IS_PERF] -e [ENABLE_FEATURE] + ``` + 其中脚本参数说明如下: + - RUN_MODE :编译执行方式,可选择CPU调试,NPU仿真,NPU上板,对应参数分别为[cpu / sim / npu]。若需要详细了解NPU仿真相关内容,请参考[《算子开发工具msProf》](https://hiascend.com/document/redirect/CannCommunityToolMsProf)中的“工具使用”章节。 + - SOC_VERSION :昇腾AI处理器型号,如果无法确定具体的[SOC_VERSION],则在安装昇腾AI处理器的服务器执行npu-smi info命令进行查询,在查询到的“Name”前增加Ascend信息,例如“Name”对应取值为xxxyy,实际配置的[SOC_VERSION]值为Ascendxxxyy。支持以下产品型号: + - Atlas A2训练系列产品/Atlas 800I A2推理产品 + - 
IS_PERF :是否获取执行性能数据,当前只在NPU执行时生效,打开后性能数据显示在执行结果中,可选择关闭和开启该功能,对应参数分别为[0 / 1]。 + - ENABLE_FEATURE :是否使能UnitFlag功能,可选择关闭和开启该功能,对应参数分别为[0 / 1],默认开启。 + + 示例如下,Ascendxxxyy请替换为实际的AI处理器型号。 + ``` + bash run.sh -r cpu -v Ascendxxxyy -p 0 -e 1 + ``` \ No newline at end of file diff --git a/examples/matrix/matmul_unitflag/cmake/cpu_lib.cmake b/examples/matrix/matmul_unitflag/cmake/cpu_lib.cmake new file mode 100644 index 00000000..1cc21b1d --- /dev/null +++ b/examples/matrix/matmul_unitflag/cmake/cpu_lib.cmake @@ -0,0 +1,35 @@ +# Copyright (c) 2025 Huawei Technologies Co., Ltd. +# This file is a part of the CANN Open Software. +# Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. 
+# ====================================================================================================================== + +if(NOT DEFINED ENV{CMAKE_PREFIX_PATH}) + set(CMAKE_PREFIX_PATH ${ASCEND_CANN_PACKAGE_PATH}/tools/tikicpulib/lib/cmake) +endif() +find_package(tikicpulib REQUIRED) + +add_library(ascendc_kernels_${RUN_MODE} SHARED + ${KERNEL_FILES} +) + +target_link_libraries(ascendc_kernels_${RUN_MODE} PRIVATE + tikicpulib::${SOC_VERSION} +) + +target_compile_definitions(ascendc_kernels_${RUN_MODE} PRIVATE + $<$:ENABLE_UNITFLAG_FEATURE> +) + +target_compile_options(ascendc_kernels_${RUN_MODE} PRIVATE + -g + -O0 + -std=c++17 +) + +install(TARGETS ascendc_kernels_${RUN_MODE} + DESTINATION ${CMAKE_INSTALL_LIBDIR} +) \ No newline at end of file diff --git a/examples/matrix/matmul_unitflag/cmake/npu_lib.cmake b/examples/matrix/matmul_unitflag/cmake/npu_lib.cmake new file mode 100644 index 00000000..b3b8f341 --- /dev/null +++ b/examples/matrix/matmul_unitflag/cmake/npu_lib.cmake @@ -0,0 +1,28 @@ +# Copyright (c) 2025 Huawei Technologies Co., Ltd. +# This file is a part of the CANN Open Software. +# Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. 
+# ====================================================================================================================== + +if(EXISTS ${ASCEND_CANN_PACKAGE_PATH}/compiler/tikcpp/ascendc_kernel_cmake) + set(ASCENDC_CMAKE_DIR ${ASCEND_CANN_PACKAGE_PATH}/compiler/tikcpp/ascendc_kernel_cmake) +elseif(EXISTS ${ASCEND_CANN_PACKAGE_PATH}/tools/tikcpp/ascendc_kernel_cmake) + set(ASCENDC_CMAKE_DIR ${ASCEND_CANN_PACKAGE_PATH}/tools/tikcpp/ascendc_kernel_cmake) +else() + message(FATAL_ERROR "ascendc_kernel_cmake does not exist ,please check whether the cann package is installed") +endif() +include(${ASCENDC_CMAKE_DIR}/ascendc.cmake) + +ascendc_library(ascendc_kernels_${RUN_MODE} STATIC + ${KERNEL_FILES} +) + +ascendc_compile_definitions(ascendc_kernels_${RUN_MODE} PRIVATE + $<$:ENABLE_UNITFLAG_FEATURE> + -DASCENDC_DUMP + -DHAVE_WORKSPACE + -DHAVE_TILING +) \ No newline at end of file diff --git a/examples/matrix/matmul_unitflag/main.cpp b/examples/matrix/matmul_unitflag/main.cpp new file mode 100644 index 00000000..2be142a0 --- /dev/null +++ b/examples/matrix/matmul_unitflag/main.cpp @@ -0,0 +1,225 @@ +/** + * Copyright (c) 2025 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. 
+ */ + +#include +#include "../../common/data_utils.h" +#include "kernel_tiling/kernel_tiling.h" +#include "tiling/platform/platform_ascendc.h" +#include "op_host/matmul_unitflag_custom_tiling.h" + +#ifndef ASCENDC_CPU_DEBUG +#include "acl/acl.h" +extern void matmul_unitflag_custom_do(uint32_t coreDim, void* stream, + uint8_t* a, uint8_t* b, uint8_t* bias, uint8_t* c, + uint8_t* workspace, uint8_t* tiling); +#else +#include "tikicpulib.h" +extern "C" void matmul_unitflag_custom(uint8_t* a, uint8_t* b, uint8_t* bias, uint8_t* c, + uint8_t* workspace, uint8_t* tiling); +#endif + +namespace { +constexpr bool IS_BIAS = true; +constexpr bool IS_A_TRANS = false; +constexpr bool IS_B_TRANS = false; + +struct MatrixFileSize +{ + size_t x1FileSize; + size_t x2FileSize; + size_t yFileSize; + size_t biasFileSize; +}; +} + +namespace MatmulHost { +static size_t GetSysWorkSpaceSize() +{ + auto ascendcPlatform = platform_ascendc::PlatformAscendCManager::GetInstance(); + return static_cast(ascendcPlatform->GetLibApiWorkSpaceSize()); +} + +// CPU debug mode +#ifdef ASCENDC_CPU_DEBUG +void TestMatmulCpu(int64_t m, int64_t n, int64_t k, const MatrixFileSize& matrixFileSize) +{ + size_t x1FileSize = matrixFileSize.x1FileSize; + size_t x2FileSize = matrixFileSize.x2FileSize; + size_t yFileSize = matrixFileSize.yFileSize; + size_t biasFileSize = matrixFileSize.biasFileSize; + // Query workspace size + size_t workspaceSize = GetSysWorkSpaceSize(); + uint8_t* x1 = (uint8_t*)AscendC::GmAlloc(x1FileSize); + uint8_t* x2 = (uint8_t*)AscendC::GmAlloc(x2FileSize); + uint8_t* bias = nullptr; + uint8_t* y = (uint8_t*)AscendC::GmAlloc(yFileSize); + uint8_t* workspace = (uint8_t*)AscendC::GmAlloc(workspaceSize); + ReadFile("../input/x1_gm.bin", x1FileSize, x1, x1FileSize); + ReadFile("../input/x2_gm.bin", x2FileSize, x2, x2FileSize); + if (IS_BIAS) { + bias = (uint8_t*)AscendC::GmAlloc(biasFileSize); + ReadFile("../input/bias_gm.bin", biasFileSize, bias, biasFileSize); + } + size_t tilingFileSize 
= sizeof(TCubeTiling); + uint8_t* tiling = (uint8_t*)AscendC::GmAlloc(tilingFileSize); + auto ascendcPlatform = platform_ascendc::PlatformAscendCManager::GetInstance(); + MatmulHost::MatmulCaseParams testCaseParams{static_cast(ascendcPlatform->GetCoreNumAic()), + static_cast(m), static_cast(n), static_cast(k), IS_BIAS, IS_A_TRANS, IS_B_TRANS}; + // Calculate Tiling + const auto& tilingData = MatmulHost::GenerateTiling(testCaseParams); + memcpy_s(tiling, tilingFileSize, &tilingData, tilingFileSize); + ICPU_RUN_KF(matmul_unitflag_custom, tilingData.usedCoreNum, x1, x2, bias, y, workspace, tiling); + WriteFile("../output/output.bin", y, yFileSize); + AscendC::GmFree((void*)x1); + AscendC::GmFree((void*)x2); + AscendC::GmFree((void*)y); + AscendC::GmFree((void*)workspace); + AscendC::GmFree((void*)tiling); + if (IS_BIAS) { + AscendC::GmFree((void*)bias); + } +} +// NPU +#else +void MatmulOp(uint8_t* x1, uint8_t* x2, uint8_t* y, uint8_t* bias, int64_t m, int64_t n, int64_t k, + void* stream = nullptr) +{ + // Init args + uint8_t* workspaceDevice = nullptr; + + // Query workspace size + size_t workspaceSize = GetSysWorkSpaceSize(); + + // Allocate workspace on device + CHECK_ACL(aclrtMalloc((void**)&workspaceDevice, workspaceSize, ACL_MEM_MALLOC_HUGE_FIRST)); + + // Initialize kernel with arguments and workspace pointer + uint8_t* tilingHost = nullptr; + uint8_t* tilingDevice = nullptr; + size_t tilingFileSize = sizeof(TCubeTiling); + auto ascendcPlatform = platform_ascendc::PlatformAscendCManager::GetInstance(); + MatmulHost::MatmulCaseParams testCaseParams{static_cast(ascendcPlatform->GetCoreNumAic()), + static_cast(m), static_cast(n), static_cast(k), IS_BIAS, IS_A_TRANS, IS_B_TRANS}; + // Calculate Tiling + const auto tilingData = MatmulHost::GenerateTiling(testCaseParams); + CHECK_ACL(aclrtMallocHost((void**)(&tilingHost), tilingFileSize)); + CHECK_ACL(aclrtMalloc((void**)&tilingDevice, tilingFileSize, + ACL_MEM_MALLOC_HUGE_FIRST)); + 
CHECK_ACL(aclrtMemcpy(tilingHost, tilingFileSize, &tilingData, + tilingFileSize, ACL_MEMCPY_HOST_TO_HOST)); + CHECK_ACL(aclrtMemcpy(tilingDevice, tilingFileSize, tilingHost, + tilingFileSize, ACL_MEMCPY_HOST_TO_DEVICE)); + // Launch kernel + matmul_unitflag_custom_do(tilingData.usedCoreNum, stream, x1, x2, bias, y, workspaceDevice, tilingDevice); + CHECK_ACL(aclrtFreeHost(tilingHost)); + CHECK_ACL(aclrtFree(workspaceDevice)); + CHECK_ACL(aclrtFree(tilingDevice)); +} + +void TestAclInit(aclrtContext& context, aclrtStream& stream, int64_t& deviceId) +{ + CHECK_ACL(aclInit(nullptr)); + CHECK_ACL(aclrtSetDevice(deviceId)); + CHECK_ACL(aclrtCreateContext(&context, deviceId)); + CHECK_ACL(aclrtCreateStream(&stream)); +} + +void TestAclDeInit(aclrtContext& context, aclrtStream& stream, int64_t& deviceId) +{ + CHECK_ACL(aclrtDestroyStream(stream)); + CHECK_ACL(aclrtDestroyContext(context)); + CHECK_ACL(aclrtResetDevice(deviceId)); + CHECK_ACL(aclFinalize()); +} + +void TestMatmul(int64_t m, int64_t n, int64_t k, const MatrixFileSize& matrixFileSize) +{ + size_t x1FileSize = matrixFileSize.x1FileSize; + size_t x2FileSize = matrixFileSize.x2FileSize; + size_t yFileSize = matrixFileSize.yFileSize; + size_t biasFileSize = matrixFileSize.biasFileSize; + + aclrtContext context; + aclrtStream stream = nullptr; + int64_t deviceId = 0; + TestAclInit(context, stream, deviceId); + + uint8_t* x1Host = nullptr; + uint8_t* x1Device = nullptr; + CHECK_ACL(aclrtMallocHost((void**)(&x1Host), x1FileSize)); + CHECK_ACL(aclrtMalloc((void**)&x1Device, x1FileSize, ACL_MEM_MALLOC_HUGE_FIRST)); + ReadFile("../input/x1_gm.bin", x1FileSize, x1Host, x1FileSize); + CHECK_ACL(aclrtMemcpy(x1Device, x1FileSize, x1Host, x1FileSize, ACL_MEMCPY_HOST_TO_DEVICE)); + + uint8_t* x2Host = nullptr; + uint8_t* x2Device = nullptr; + CHECK_ACL(aclrtMallocHost((void**)(&x2Host), x2FileSize)); + CHECK_ACL(aclrtMalloc((void**)&x2Device, x2FileSize, ACL_MEM_MALLOC_HUGE_FIRST)); + ReadFile("../input/x2_gm.bin", 
x2FileSize, x2Host, x2FileSize); + CHECK_ACL(aclrtMemcpy(x2Device, x2FileSize, x2Host, x2FileSize, ACL_MEMCPY_HOST_TO_DEVICE)); + + uint8_t* biasHost = nullptr; + uint8_t* biasDevice = nullptr; + if (IS_BIAS) { + CHECK_ACL(aclrtMallocHost((void**)(&biasHost), biasFileSize)); + CHECK_ACL(aclrtMalloc((void**)&biasDevice, biasFileSize, ACL_MEM_MALLOC_HUGE_FIRST)); + ReadFile("../input/bias_gm.bin", biasFileSize, biasHost, biasFileSize); + CHECK_ACL(aclrtMemcpy(biasDevice, biasFileSize, biasHost, biasFileSize, ACL_MEMCPY_HOST_TO_DEVICE)); + } + uint8_t* yHost = nullptr; + uint8_t* yDevice = nullptr; + CHECK_ACL(aclrtMallocHost((void**)(&yHost), yFileSize)); + CHECK_ACL(aclrtMalloc((void**)&yDevice, yFileSize, ACL_MEM_MALLOC_HUGE_FIRST)); + + MatmulOp(x1Device, x2Device, yDevice, biasDevice, m, n, k, stream); + CHECK_ACL(aclrtSynchronizeStream(stream)); + + CHECK_ACL(aclrtMemcpy(yHost, yFileSize, yDevice, yFileSize, ACL_MEMCPY_DEVICE_TO_HOST)); + WriteFile("../output/output.bin", yHost, yFileSize); + + if (IS_BIAS) { + CHECK_ACL(aclrtFree(biasDevice)); + CHECK_ACL(aclrtFreeHost(biasHost)); + } + CHECK_ACL(aclrtFree(x1Device)); + CHECK_ACL(aclrtFreeHost(x1Host)); + CHECK_ACL(aclrtFree(x2Device)); + CHECK_ACL(aclrtFreeHost(x2Host)); + CHECK_ACL(aclrtFree(yDevice)); + CHECK_ACL(aclrtFreeHost(yHost)); + TestAclDeInit(context, stream, deviceId); +} +#endif +} // namespace MatmulHost + +int32_t main(int32_t argc, const char* args[]) +{ + int64_t inputParams[3] = {1, 1, 1}; + for (int32_t i = 1; i < argc && i < 4; ++i) { // 4 used for inputParams loop + std::stringstream ss(args[i]); + ss >> inputParams[i - 1]; + } + auto ascendcPlatform = platform_ascendc::PlatformAscendCManager::GetInstance(SOC_VERSION); + int64_t M = inputParams[0]; + int64_t N = inputParams[1]; + int64_t K = inputParams[2]; + MatrixFileSize matrixFileSize; + // uint16_t represent half + matrixFileSize.x1FileSize = static_cast(M * K) * sizeof(uint16_t); + matrixFileSize.x2FileSize = static_cast(K * N) * 
sizeof(uint16_t); + matrixFileSize.yFileSize = static_cast(M * N) * sizeof(float); + matrixFileSize.biasFileSize = static_cast(1 * N) * sizeof(float); +#ifdef ASCENDC_CPU_DEBUG + MatmulHost::TestMatmulCpu(M, N, K, matrixFileSize); +#else + MatmulHost::TestMatmul(M, N, K, matrixFileSize); +#endif + return 0; +} diff --git a/examples/matrix/matmul_unitflag/op_host/matmul_unitflag_custom_tiling.cpp b/examples/matrix/matmul_unitflag/op_host/matmul_unitflag_custom_tiling.cpp new file mode 100644 index 00000000..d86f2e3f --- /dev/null +++ b/examples/matrix/matmul_unitflag/op_host/matmul_unitflag_custom_tiling.cpp @@ -0,0 +1,49 @@ +/** + * Copyright (c) 2025 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. 
+ */ + +#include "matmul_unitflag_custom_tiling.h" +#include + +namespace MatmulHost { + +TCubeTiling GenerateTiling(const MatmulCaseParams& testCaseParams) +{ + TCubeTiling tilingData; + auto ascendcPlatform = platform_ascendc::PlatformAscendCManager::GetInstance(); + matmul_tiling::MultiCoreMatmulTiling cubeTiling(*ascendcPlatform); + uint32_t M = testCaseParams.m; + uint32_t N = testCaseParams.n; + uint32_t K = testCaseParams.k; + uint32_t blockDim = testCaseParams.usedCoreNum; + bool isBias = testCaseParams.isBias; + bool isAtrans = testCaseParams.isATrans; + bool isBtrans = testCaseParams.isBTrans; + + cubeTiling.SetDim(blockDim); + cubeTiling.SetAType(matmul_tiling::TPosition::GM, matmul_tiling::CubeFormat::ND, + matmul_tiling::DataType::DT_FLOAT16, isAtrans); + cubeTiling.SetBType(matmul_tiling::TPosition::GM, matmul_tiling::CubeFormat::ND, + matmul_tiling::DataType::DT_FLOAT16, isBtrans); + cubeTiling.SetCType(matmul_tiling::TPosition::GM, matmul_tiling::CubeFormat::ND, + matmul_tiling::DataType::DT_FLOAT); + cubeTiling.SetBiasType(matmul_tiling::TPosition::GM, matmul_tiling::CubeFormat::ND, + matmul_tiling::DataType::DT_FLOAT); + cubeTiling.SetOrgShape(M, N, K); + cubeTiling.SetShape(M, N, K); + cubeTiling.EnableBias(isBias); + cubeTiling.SetBufferSpace(-1, -1, -1); + if (cubeTiling.GetTiling(tilingData) == -1) { + std::cout << "Generate tiling failed." << std::endl; + return {}; + } + return tilingData; +} + +} // namespace MatmulHost diff --git a/examples/matrix/matmul_unitflag/op_host/matmul_unitflag_custom_tiling.h b/examples/matrix/matmul_unitflag/op_host/matmul_unitflag_custom_tiling.h new file mode 100644 index 00000000..ce78fcd6 --- /dev/null +++ b/examples/matrix/matmul_unitflag/op_host/matmul_unitflag_custom_tiling.h @@ -0,0 +1,37 @@ +/** + * Copyright (c) 2025 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). 
+ * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ + +#ifndef EXAMPLES_MATRIX_MATMUL_UNITFLAG_OP_HOST_MATMUL_UNITFLAG_CUSTOM_TILING_H +#define EXAMPLES_MATRIX_MATMUL_UNITFLAG_OP_HOST_MATMUL_UNITFLAG_CUSTOM_TILING_H +#include "register/tilingdata_base.h" +#include "tiling/tiling_api.h" + +namespace MatmulHost { + +struct MatmulCaseParams +{ + int32_t usedCoreNum; + int32_t m; + int32_t n; + int32_t k; + bool isBias; + bool isATrans; + bool isBTrans; +}; + +/** + * @brief Generate matmul tiling. + * @param testCaseParams: Testcase parameters. + * @retval Generated Tiling data. + */ +TCubeTiling GenerateTiling(const MatmulCaseParams& testCaseParams); + +} // namespace MatmulHost +#endif // EXAMPLES_MATRIX_MATMUL_UNITFLAG_OP_HOST_MATMUL_UNITFLAG_CUSTOM_TILING_H diff --git a/examples/matrix/matmul_unitflag/op_kernel/matmul_unitflag_custom_kernel.cpp b/examples/matrix/matmul_unitflag/op_kernel/matmul_unitflag_custom_kernel.cpp new file mode 100644 index 00000000..a72175db --- /dev/null +++ b/examples/matrix/matmul_unitflag/op_kernel/matmul_unitflag_custom_kernel.cpp @@ -0,0 +1,146 @@ +/** + * Copyright (c) 2025 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. 
+ * See LICENSE in the root of the software repository for the full text of the License. + */ + +#include "kernel_operator.h" +#include "matmul_unitflag_custom_kernel.h" + +namespace { +/** + * @brief Copy tiling data to TCubeTiling ptr from tiling gm addr. + * @param tiling: TCubeTiling ptr which needs to copy tiling data. + * @param tilingGM: Tiling gm addr. + * @retval None + */ +__aicore__ inline void CopyTiling(TCubeTiling* tiling, GM_ADDR tilingGM) +{ + uint32_t* ptr = reinterpret_cast(tiling); + auto tiling32 = reinterpret_cast<__gm__ uint32_t*>(tilingGM); + + for (int i = 0; i < sizeof(TCubeTiling) / sizeof(uint32_t); i++, ptr++) { + *ptr = *(tiling32 + i); + } + return; +} +} + +namespace MatmulUnitFlagCustom { +template +__aicore__ inline void MatmulKernel::Init( + GM_ADDR a, GM_ADDR b, GM_ADDR bias, GM_ADDR c, const TCubeTiling& tiling, bool isTransA, bool isTransB) +{ + this->tiling = tiling; + aGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ AType*>(a), tiling.M * tiling.Ka); + bGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ BType*>(b), tiling.Kb * tiling.N); + cGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ CType*>(c), tiling.M * tiling.N); + biasGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ BiasType*>(bias), tiling.N); + + int32_t offsetA = 0; + int32_t offsetB = 0; + int32_t offsetC = 0; + int32_t offsetBias = 0; + this->isTransA = isTransA; + this->isTransB = isTransB; + CalcOffset(AscendC::GetBlockIdx(), offsetA, offsetB, offsetC, offsetBias); + aGlobal = aGlobal[offsetA]; + bGlobal = bGlobal[offsetB]; + cGlobal = cGlobal[offsetC]; + biasGlobal = biasGlobal[offsetBias]; + if (GetSysWorkSpacePtr() == nullptr) { + return; + } +} + +template +__aicore__ inline void MatmulKernel::Process() +{ + if (AscendC::GetBlockIdx() >= tiling.usedCoreNum) { + return; + } + + // process with tail block + int tailM = tiling.M - mCoreIndex * tiling.singleCoreM; + tailM = tailM < tiling.singleCoreM ? 
tailM : tiling.singleCoreM; + int tailN = tiling.N - nCoreIndex * tiling.singleCoreN; + tailN = tailN < tiling.singleCoreN ? tailN : tiling.singleCoreN; + if (tailM < tiling.singleCoreM || tailN < tiling.singleCoreN) { + matmulObj.SetTail(tailM, tailN); + } + + matmulObj.SetTensorA(aGlobal, isTransA); + matmulObj.SetTensorB(bGlobal, isTransB); + if (tiling.isBias) { + matmulObj.SetBias(biasGlobal); + } + matmulObj.IterateAll(cGlobal); + matmulObj.End(); +} + +template +__aicore__ inline void MatmulKernel::CalcOffset( + int32_t blockIdx, int32_t& offsetA, int32_t& offsetB, int32_t& offsetC, int32_t& offsetBias) +{ + const TCubeTiling& tiling = this->tiling; + auto mSingleBlocks = (tiling.M + tiling.singleCoreM - 1) / tiling.singleCoreM; // split M into mSingleBlocks cores + mCoreIndex = blockIdx % mSingleBlocks; + nCoreIndex = blockIdx / mSingleBlocks; + + if (isTransA) { + offsetA = mCoreIndex * tiling.singleCoreM; + } else { + offsetA = mCoreIndex * tiling.Ka * tiling.singleCoreM; + } + if (isTransB) { + offsetB = nCoreIndex * tiling.Kb * tiling.singleCoreN; + } else { + offsetB = nCoreIndex * tiling.singleCoreN; + } + offsetC = mCoreIndex * tiling.N * tiling.singleCoreM + nCoreIndex * tiling.singleCoreN; + offsetBias = nCoreIndex * tiling.singleCoreN; +} +} // namespace MatmulUnitFlagCustom + +/** + * @brief matmul kernel function. + * @param a: A matrix gm addr. + * @param b: B matrix gm addr. + * @param bias: Bias matrix gm addr. + * @param c: C matrix gm addr. + * @param workspace: Temporary gm space addr required by matmul calc. + * @param tilingGm: Tiling data addr. 
+ * @retval None + */ +extern "C" __global__ __aicore__ void matmul_unitflag_custom(GM_ADDR a, GM_ADDR b, + GM_ADDR bias, GM_ADDR c, GM_ADDR workspace, GM_ADDR tilingGm) +{ +#if defined(ASCENDC_CPU_DEBUG) + if (g_coreType == AscendC::AIV) { + return; + } +#endif + // prepare tiling + TCubeTiling tiling; + CopyTiling(&tiling, tilingGm); + // define matmul kernel + MatmulUnitFlagCustom::MatmulKernel matmulKernel; + AscendC::TPipe pipe; + REGIST_MATMUL_OBJ(&pipe, GetSysWorkSpacePtr(), matmulKernel.matmulObj, &tiling); + // init matmul kernel, isTransA=false, isTransB=false + matmulKernel.Init(a, b, bias, c, tiling, false, false); + // matmul kernel process + matmulKernel.Process(); +} + +#ifndef ASCENDC_CPU_DEBUG +void matmul_unitflag_custom_do(uint32_t blockDim, void* stream, + GM_ADDR a, GM_ADDR b, GM_ADDR bias, GM_ADDR c, GM_ADDR workspace, GM_ADDR tilingGm) +{ + matmul_unitflag_custom<<>>(a, b, bias, c, workspace, tilingGm); +} +#endif diff --git a/examples/matrix/matmul_unitflag/op_kernel/matmul_unitflag_custom_kernel.h b/examples/matrix/matmul_unitflag/op_kernel/matmul_unitflag_custom_kernel.h new file mode 100644 index 00000000..bc2a2810 --- /dev/null +++ b/examples/matrix/matmul_unitflag/op_kernel/matmul_unitflag_custom_kernel.h @@ -0,0 +1,85 @@ +/** + * Copyright (c) 2025 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. 
+ */ + +#ifndef EXAMPLES_MATRIX_MATMUL_UNITFLAG_OP_KERNEL_MATMUL_UNITFLAG_CUSTOM_KERNEL_H +#define EXAMPLES_MATRIX_MATMUL_UNITFLAG_OP_KERNEL_MATMUL_UNITFLAG_CUSTOM_KERNEL_H +#include "kernel_operator.h" +// Cube Only +#define ASCENDC_CUBE_ONLY +#include "lib/matmul_intf.h" + +namespace MatmulUnitFlagCustom { + +__aicore__ inline constexpr MatmulConfig GetUnitFlagCfg() +{ + auto mmCfg = CFG_MDL; +#ifdef ENABLE_UNITFLAG_FEATURE + // enable UnitFlag feature + mmCfg.enUnitFlag = true; +#endif + return mmCfg; +} +constexpr static MatmulConfig CFG_MDL_UNITFLAG = GetUnitFlagCfg(); + +template +class MatmulKernel { +public: + __aicore__ inline MatmulKernel(){}; + /** + * @brief Initialization before process. + * @param a: A matrix gm addr. + * @param b: B matrix gm addr. + * @param bias: Bias matrix gm addr. + * @param c: C matrix gm addr. + * @param tiling: Matmul tiling struct. + * @param isTransA: Whether A matrix is transposed. + * @param isTransB: Whether B matrix is transposed. + * @retval None + */ + __aicore__ inline void Init(GM_ADDR a, GM_ADDR b, GM_ADDR bias, GM_ADDR c, const TCubeTiling& tiling, + bool isTransA, bool isTransB); + /** + * @brief Process matrix calculation. + * @retval None + */ + __aicore__ inline void Process(); + + using A_TYPE = AscendC::MatmulType; + using B_TYPE = AscendC::MatmulType; + using C_TYPE = AscendC::MatmulType; + using BIAS_TYPE = AscendC::MatmulType; + AscendC::Matmul matmulObj; + +private: + /** + * @brief Calculate the gm offset based on the blockIdx. + * @param blockIdx: Current Core blockidx. + * @param offsetA: Gm offset of A matrix. + * @param offsetB: Gm offset of B matrix. + * @param offsetC: Gm offset of C matrix. + * @param offsetBias: Gm offset of Bias matrix. 
+ * @retval None + */ + __aicore__ inline void CalcOffset( + int32_t blockIdx, int32_t& offsetA, int32_t& offsetB, int32_t& offsetC, int32_t& offsetBias); + + AscendC::GlobalTensor aGlobal; + AscendC::GlobalTensor bGlobal; + AscendC::GlobalTensor cGlobal; + AscendC::GlobalTensor biasGlobal; + TCubeTiling tiling; + int32_t mCoreIndex; + int32_t nCoreIndex; + bool isTransA{false}; + bool isTransB{false}; +}; +} // namespace MatmulUnitFlagCustom + +#endif // EXAMPLES_MATRIX_MATMUL_UNITFLAG_OP_KERNEL_MATMUL_UNITFLAG_CUSTOM_KERNEL_H diff --git a/examples/matrix/matmul_unitflag/run.sh b/examples/matrix/matmul_unitflag/run.sh new file mode 100644 index 00000000..c53da86e --- /dev/null +++ b/examples/matrix/matmul_unitflag/run.sh @@ -0,0 +1,105 @@ +#!/bin/bash +# Copyright (c) 2025 Huawei Technologies Co., Ltd. +# This file is a part of the CANN Open Software. +# Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. 
+# ====================================================================================================================== + +export IS_PERF="0" +export ENABLE_FEATURE="1" + +SHORT=r:,v:,p:,e:, +LONG=run-mode:,soc-version:,perf:, +OPTS=$(getopt -a --options $SHORT --longoptions $LONG -- "$@") +eval set -- "$OPTS" +while : +do + case "$1" in + (-r | --run-mode ) + RUN_MODE="$2" + shift 2;; + (-v | --soc-version ) + SOC_VERSION="$2" + shift 2;; + (-p | --perf ) + IS_PERF="$2" + shift 2;; + (-e | --enable-feature ) + ENABLE_FEATURE="$2" + shift 2;; + (--) + shift; + break;; + (*) + echo "[ERROR] Unexpected option: $1"; + break;; + esac +done + +# Check invalid input +if [[ ! "${SOC_VERSION}" =~ ^Ascend910 ]]; then + echo "[ERROR] Unsupported SocVersion: ${SOC_VERSION}" + exit 1 +fi + +if [ "${RUN_MODE}" != "npu" ] && [ "${RUN_MODE}" != "sim" ] && [ "${RUN_MODE}" != "cpu" ]; then + echo "[ERROR] Unsupported RUN_MODE: ${RUN_MODE}, which can only be cpu/sim/npu." + exit 1 +fi + +if [ "${IS_PERF}" != "0" ] && [ "${IS_PERF}" != "1" ]; then + echo "[ERROR] Unsupported IS_PERF: ${IS_PERF}, which can only be 0 or 1." + exit 1 +fi + +if [ "${ENABLE_FEATURE}" != "0" ] && [ "${ENABLE_FEATURE}" != "1" ]; then + echo "[ERROR] Unsupported ENABLE_FEATURE: ${ENABLE_FEATURE}, which can only be 0 or 1." + exit 1 +fi + +# only npu mode support is_perf = 1 +if [ "${IS_PERF}" == "1" ] && [ "${RUN_MODE}" != "npu" ]; then + echo "[ERROR] Unsupported IS_PERF: ${IS_PERF} while RUN_MODE is ${RUN_MODE}." + exit 1 +fi + +rm -rf build +mkdir build +cd build + +source $ASCEND_HOME_DIR/bin/setenv.bash +export LD_LIBRARY_PATH=${ASCEND_HOME_DIR}/tools/simulator/${SOC_VERSION}/lib:$LD_LIBRARY_PATH +set -euo pipefail + +cmake -DRUN_MODE=${RUN_MODE} -DSOC_VERSION=${SOC_VERSION} -DENABLE_FEATURE=${ENABLE_FEATURE} -DASCEND_CANN_PACKAGE_PATH=${ASCEND_HOME_DIR} .. 
+make -j16 + +cd ../ + +rm -rf input +mkdir input +rm -rf output +mkdir output + +rm -rf bin/ +mkdir -p bin +cd bin +cp ../build/ascendc_matmul_unitflag_bbit ./ + +export TF_CPP_MIN_LOG_LEVEL=3 + +if [ "${RUN_MODE}" = "npu" ]; then + if [ "${IS_PERF}" = "1" ]; then + export ASCEND_TOOLKIT_HOME=${ASCEND_HOME_DIR} + python3 -u ../scripts/exec_test.py npu "perf" + else + python3 -u ../scripts/exec_test.py npu "normal" + fi +elif [ "${RUN_MODE}" = "sim" ]; then + python3 -u ../scripts/exec_test.py sim "normal" +elif [ "${RUN_MODE}" = "cpu" ]; then + python3 -u ../scripts/exec_test.py cpu "normal" +fi diff --git a/examples/matrix/matmul_unitflag/scripts/exec_test.py b/examples/matrix/matmul_unitflag/scripts/exec_test.py new file mode 100644 index 00000000..eb918ff7 --- /dev/null +++ b/examples/matrix/matmul_unitflag/scripts/exec_test.py @@ -0,0 +1,124 @@ +#!/usr/bin/python3 +# coding=utf-8 + +# Copyright (c) 2025 Huawei Technologies Co., Ltd. +# This file is a part of the CANN Open Software. +# Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. 
+# ====================================================================================================================== +import os +import sys +import csv +import time +import logging +import subprocess +import shlex + +import numpy as np + +sys.path.append("../..") +from common_scripts.gen_data import MatmulGenData +from common_scripts.compare_data import compare_data +from common_scripts.exec_utils import get_file_work_dir, get_case_list, clear_file_cache, get_perf_task_duration + +IS_BIAS = True +IS_TRANS_A = False +IS_TRANS_B = False +# float16 in float32 out +DATA_TYPE_STR = "float16_float32" + +logging.basicConfig(level=logging.INFO) + + +class ProcessParams: + def __init__(self, case_name, m, n, k, b, is_perf, run_mode): + self.case_name = case_name + self.m = m + self.n = n + self.k = k + self.b = b + self.is_perf = is_perf + self.run_mode = run_mode + + +def process_case(file_work_dir, process_params): + case_name, m, n, k, b, is_perf, run_mode = process_params.case_name, process_params.m, process_params.n,\ + process_params.k, process_params.b, process_params.is_perf, process_params.run_mode + logging.info("[INFO] start process case [%s]" % (case_name)) + logging.info("[INFO] IS_PERF is set [%d]" % (is_perf)) + clear_file_cache(file_work_dir) + + matmul_gen_data = MatmulGenData(m, n, k, b, IS_TRANS_A, IS_TRANS_B, IS_BIAS, DATA_TYPE_STR) + params_str = f"{m} {n} {k}" + if is_perf: + matmul_gen_data.gen_fake_golden_data(file_work_dir) + cmd = f"msprof op --application=\"./ascendc_matmul_unitflag_bbit {params_str}\" --output=./prof_out" + elif run_mode == "sim": + matmul_gen_data.gen_golden_data(file_work_dir) + cmd = f"msprof op simulator --application=\"./ascendc_matmul_unitflag_bbit {params_str}\" --output=./sim_out" + else: + matmul_gen_data.gen_golden_data(file_work_dir) + cmd = f"./ascendc_matmul_unitflag_bbit {params_str}" + subprocess.run(shlex.split(cmd)) + if is_perf: + wrong_num = -1 + else: + logging.info("[INFO] compare data case[%s]" % 
(case_name)) + wrong_num = compare_data(file_work_dir, DATA_TYPE_STR) + res_data = [] + res_data.append(case_name) + res_data.append(wrong_num) + res_data.append(b * m * n) + if wrong_num == -1: + res_data.append("None") + elif wrong_num / (b * m * n) > 0.001: + res_data.append("Fail") + else: + res_data.append("Success") + if is_perf: + task_duration = get_perf_task_duration("./prof_out") + res_data.append(task_duration) + return res_data + + +def main(): + args_len = len(sys.argv) - 1 + if args_len != 2: + logging.info("[ERROR] exec_test input params error!") + return -1 + + file_work_dir = get_file_work_dir() + if not os.path.exists(file_work_dir): + logging.info("[ERROR] file path %s not exist!" % (file_work_dir)) + return -1 + + is_perf = False + if sys.argv[2] == "perf": + is_perf = True + + case_list = get_case_list() + res_list = [["case_name", "wrong_num", "total_num", "result", "task_duration"]] + run_mode = sys.argv[1] + for is_process, case_name, m, n, k, b in case_list: + if is_process == 1: + process_params = ProcessParams(case_name, m, n, k, b, is_perf, run_mode) + res_data = process_case(file_work_dir, process_params) + res_list.append(res_data) + + timestamp = time.time() + result_file_name = "result_" + str(timestamp) + ".csv" + with open(os.path.join(file_work_dir, "output", result_file_name), 'w', newline='', encoding='utf-8') as csvfile: + writer = csv.writer(csvfile) + writer.writerows(res_list) + + logging.info("---------------RESULT---------------") + for res in res_list: + logging.info(res) + return 0 + + +if __name__ == "__main__": + main() diff --git a/examples/matrix/matmul_unitflag/testcase/case.csv b/examples/matrix/matmul_unitflag/testcase/case.csv new file mode 100644 index 00000000..e8f8491a --- /dev/null +++ b/examples/matrix/matmul_unitflag/testcase/case.csv @@ -0,0 +1 @@ +1, case001, 1024, 4096, 1024 \ No newline at end of file diff --git a/examples/readme.md b/examples/readme.md index 8a0e1fc8..e64a3b46 100644 --- 
a/examples/readme.md +++ b/examples/readme.md @@ -36,7 +36,7 @@ 对输入tensor按行做如下公式的计算:zi = (xi - ∑(xi * yi)) * yi,其中∑为按行reduce求和。 - matrix + matrix basic_block_matmul 实现无尾块且tiling的base块大小固定的场景下的Matmul矩阵乘法,计算公式为:C = A * B + Bias。 @@ -56,10 +56,6 @@ matmul_async_iterate_all 调用IterateAll实现异步场景下的Matmul矩阵乘法,计算公式为:C = A * B + Bias。 - - matmul_constant - 实现MDL模板下使能Tiling常量化的Matmul矩阵乘法,计算公式为:C = A * B + Bias。 - matmul_ibshareAB 实现A矩阵或B矩阵GM地址相同,A、B矩阵共享L1 Buffer场景下的Matmul矩阵乘法,计算公式为:C = A * B + Bias。 @@ -76,6 +72,10 @@ matmul_mndb 实现M或N轴方向流水并行场景下的Matmul矩阵乘法,计算公式为:C = A * B + Bias。 + + matmul_constant + 实现MDL模板下使能Tiling常量化的Matmul矩阵乘法,计算公式为:C = A * B + Bias。 + matmul_gemv 实现NORM模板下实现矩阵向量乘的Matmul矩阵乘法,计算公式为:C = A * B + Bias。 @@ -96,10 +96,13 @@ matmul_l0c_extend 基于自主管理CO1的Iterate接口实现Matmul矩阵乘法,计算公式为:C = A * B + Bias。 + + matmul_l2cache + 实现NORM模板下支持L2 Cache切分的Matmul矩阵乘法,计算公式为:C = A * B + Bias。 + matmul_mixdualmaster 实现NORM模板下使能双主模式的Matmul矩阵乘法,计算公式为:C = A * B + Bias。 - matmul_splitk 实现多核切K场景下的Matmul矩阵乘法,计算公式为:C = A * B + Bias。 @@ -113,12 +116,8 @@ 实现NORM模板下使能输出矩阵N方向对齐的矩阵乘法,计算公式为:C = A * B + Bias。 - matmul_l2cache - 实现NORM模板下支持L2 Cache切分的Matmul矩阵乘法,计算公式为:C = A * B + Bias。 - - - matmul_nz - 输入矩阵内轴非256B对齐场景下,在AIV核上使用DataCopyPad实现ND转换NZ格式的Matmul矩阵乘法,计算公式为:C = A * B + Bias。 + matmul_sparse + 实现MDL模板下稀疏左矩阵A与4:2稠密化后的右矩阵B的Matmul矩阵乘法,计算公式为:C = A * B + Bias。 matmul_l0cache @@ -128,10 +127,6 @@ matmul_tscm 基于TSCM输入实现Matmul矩阵乘法,计算公式为:C = A * B + Bias。 - - matmul_sparse - 实现MDL模板下稀疏左矩阵A与4:2稠密化后的右矩阵B的Matmul矩阵乘法,计算公式为:C = A * B + Bias。 - matmul_triangle 实现NORM模板下使能上下三角模板策略的Matmul矩阵乘法,计算公式为:C = A * B + Bias。 @@ -140,10 +135,26 @@ matmul_partial_output 实现MDL模板下使能Partial Output功能的Matmul矩阵乘法,计算公式为:C = A * B。 + + matmul_nz + 输入矩阵内轴非256B对齐场景下,在AIV核上使用DataCopyPad实现ND转换NZ格式的Matmul矩阵乘法,计算公式为:C = A * B + Bias。 + matmul_unaligned 实现NORM模板下多核非对齐切分的Matmul矩阵乘法,计算公式为:C = A * B + Bias。 + + matmul_unitflag + 实现MDL模板下使能UnitFlag功能的Matmul矩阵乘法,计算公式为:C = A * B + Bias。 + + + matmul_channelsplit + 
实现NORM模板下使能Channel拆分功能的Matmul矩阵乘法,计算公式为:C = A * B + Bias。 + + + matmul_quant + 实现MDL模板下反量化功能的Matmul矩阵乘法,计算公式为:C = A * B + Bias。 + normalization layernorm -- Gitee