From 0e7495c9f0cd78eaa6f9bea21b6f72e66df69143 Mon Sep 17 00:00:00 2001
From: kong0808 <kongqiongzhuo@huawei.com>
Date: Fri, 18 Jul 2025 15:13:24 +0800
Subject: [PATCH] add matmul_int4 example

---
 .../matrix/common_scripts/compare_data.py     |   2 +-
 examples/matrix/matmul_int4/CMakeLists.txt    |  73 ++++++
 examples/matrix/matmul_int4/README.md         | 114 +++++++++
 .../matrix/matmul_int4/cmake/cpu_lib.cmake    |  31 +++
 .../matrix/matmul_int4/cmake/npu_lib.cmake    |  27 +++
 examples/matrix/matmul_int4/main.cpp          | 225 ++++++++++++++++++
 .../op_host/matmul_int4_custom_tiling.cpp     |  49 ++++
 .../op_host/matmul_int4_custom_tiling.h       |  37 +++
 .../op_kernel/matmul_int4_custom_kernel.cpp   | 146 ++++++++++++
 .../op_kernel/matmul_int4_custom_kernel.h     |  74 ++++++
 examples/matrix/matmul_int4/run.sh            |  96 ++++++++
 .../matrix/matmul_int4/scripts/exec_test.py   | 121 ++++++++++
 .../matrix/matmul_int4/scripts/gen_data.py    | 100 ++++++++
 examples/matrix/matmul_int4/testcase/case.csv |   1 +
 examples/readme.md                            |  10 +-
 15 files changed, 1104 insertions(+), 2 deletions(-)
 create mode 100644 examples/matrix/matmul_int4/CMakeLists.txt
 create mode 100644 examples/matrix/matmul_int4/README.md
 create mode 100644 examples/matrix/matmul_int4/cmake/cpu_lib.cmake
 create mode 100644 examples/matrix/matmul_int4/cmake/npu_lib.cmake
 create mode 100644 examples/matrix/matmul_int4/main.cpp
 create mode 100644 examples/matrix/matmul_int4/op_host/matmul_int4_custom_tiling.cpp
 create mode 100644 examples/matrix/matmul_int4/op_host/matmul_int4_custom_tiling.h
 create mode 100644 examples/matrix/matmul_int4/op_kernel/matmul_int4_custom_kernel.cpp
 create mode 100644 examples/matrix/matmul_int4/op_kernel/matmul_int4_custom_kernel.h
 create mode 100644 examples/matrix/matmul_int4/run.sh
 create mode 100644 examples/matrix/matmul_int4/scripts/exec_test.py
 create mode 100644 examples/matrix/matmul_int4/scripts/gen_data.py
 create mode 100644 examples/matrix/matmul_int4/testcase/case.csv

diff --git a/examples/matrix/common_scripts/compare_data.py b/examples/matrix/common_scripts/compare_data.py
index 43c343c0..22ae5540 100644
--- a/examples/matrix/common_scripts/compare_data.py
+++ b/examples/matrix/common_scripts/compare_data.py
@@ -52,7 +52,7 @@ def compare_data(work_dir, data_type_str, golden_file="golden.bin", output_file=
     elif data_type_str == "float16_float16" or data_type_str == "int8_float16_dequant":
         golden_data = np.fromfile(golden_file_path, dtype="float16")
         output_data = np.fromfile(output_file_path, dtype="float16")
-    elif data_type_str == "int8_int32_sparse": # for sparse matmul
+    elif data_type_str == "int8_int32_sparse" or data_type_str == "int4_int32":
         golden_data = np.fromfile(golden_file_path, dtype="int32")
         output_data = np.fromfile(output_file_path, dtype="int32")
     else:
diff --git a/examples/matrix/matmul_int4/CMakeLists.txt b/examples/matrix/matmul_int4/CMakeLists.txt
new file mode 100644
index 00000000..55e324dc
--- /dev/null
+++ b/examples/matrix/matmul_int4/CMakeLists.txt
@@ -0,0 +1,73 @@
+# Copyright (c) 2025 Huawei Technologies Co., Ltd.
+# This file is a part of the CANN Open Software.
+# Licensed under CANN Open Software License Agreement Version 1.0 (the "License").
+# Please refer to the License for details. You may not use this file except in compliance with the License.
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+# See LICENSE in the root of the software repository for the full text of the License.
+# ======================================================================================================================
+
+cmake_minimum_required(VERSION 3.16)
+project(Ascend_c)
+if (${RUN_MODE})
+    set(RUN_MODE "npu" CACHE STRING "cpu/sim/npu")
+endif()
+if (${SOC_VERSION})
+    set(SOC_VERSION "Ascend910" CACHE STRING "system on chip type")
+endif()
+
+set(ASCEND_CANN_PACKAGE_PATH "~/Ascend/ascend-toolkit/latest" CACHE STRING "ASCEND CANN package installation directory")
+if(NOT CMAKE_BUILD_TYPE)
+    set(CMAKE_BUILD_TYPE "Debug"  CACHE STRING "Build type Release/Debug (default Debug)" FORCE)
+endif()
+
+if(CMAKE_INSTALL_PREFIX STREQUAL /usr/local)
+    set(CMAKE_INSTALL_PREFIX "${CMAKE_CURRENT_LIST_DIR}/out"  CACHE STRING "path for install()" FORCE)
+endif()
+
+file(GLOB KERNEL_FILES
+    ${CMAKE_CURRENT_SOURCE_DIR}/op_kernel/matmul_int4_custom_kernel.cpp
+)
+
+if("${RUN_MODE}" STREQUAL "cpu")
+    include(cmake/cpu_lib.cmake)
+elseif("${RUN_MODE}" STREQUAL "sim" OR "${RUN_MODE}" STREQUAL "npu")
+    include(cmake/npu_lib.cmake)
+else()
+    message("invalid RUN_MODE: ${RUN_MODE}")
+endif()
+
+add_executable(ascendc_matmul_int4_bbit
+    ${CMAKE_CURRENT_SOURCE_DIR}/main.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/op_host/matmul_int4_custom_tiling.cpp
+)
+
+target_compile_options(ascendc_matmul_int4_bbit PRIVATE
+    $<BUILD_INTERFACE:$<$<STREQUAL:${RUN_MODE},cpu>:-g>>
+    -O2
+    -std=c++17
+    -D_GLIBCXX_USE_CXX11_ABI=0
+)
+
+target_compile_definitions(ascendc_matmul_int4_bbit PRIVATE
+    SOC_VERSION="${SOC_VERSION}"
+)
+
+target_include_directories(ascendc_matmul_int4_bbit PRIVATE
+    ${CMAKE_CURRENT_SOURCE_DIR}
+    $<BUILD_INTERFACE:$<$<STREQUAL:${RUN_MODE},cpu>:${ASCEND_CANN_PACKAGE_PATH}/include>>
+    $<BUILD_INTERFACE:$<$<STREQUAL:${RUN_MODE},cpu>:${ASCEND_CANN_PACKAGE_PATH}/runtime/include>>
+)
+
+target_link_libraries(ascendc_matmul_int4_bbit PRIVATE
+    $<BUILD_INTERFACE:$<$<OR:$<STREQUAL:${RUN_MODE},npu>,$<STREQUAL:${RUN_MODE},sim>>:host_intf_pub>>
+    $<BUILD_INTERFACE:$<$<STREQUAL:${RUN_MODE},cpu>:tikicpulib::${SOC_VERSION}>>
+    $<BUILD_INTERFACE:$<$<STREQUAL:${RUN_MODE},cpu>:ascendcl>>
+    $<BUILD_INTERFACE:$<$<STREQUAL:${RUN_MODE},cpu>:c_sec>>
+    ascendc_kernels_${RUN_MODE}
+    tiling_api
+    register
+    platform
+    ascendalog
+    dl
+)
diff --git a/examples/matrix/matmul_int4/README.md b/examples/matrix/matmul_int4/README.md
new file mode 100644
index 00000000..59b94bb0
--- /dev/null
+++ b/examples/matrix/matmul_int4/README.md
@@ -0,0 +1,114 @@
+<!--声明：本文使用[Creative Commons License version 4.0](https://creativecommons.org/licenses/by/4.0/legalcode)许可协议，转载、引用或修改等操作请遵循此许可协议。-->
+## 概述
+
+本样例介绍了调用Matmul API实现int4数据类型输入，int32数据类型输出的单算子。
+
+本样例以直调的方式调用算子核函数。  
+直调：核函数的基础调用方式，开发者完成算子核函数的开发和Tiling实现后，即可通过AscendCL运行时接口，完成算子的调用。包含CPU侧、NPU侧和仿真三种运行验证方法。
+
+## 样例支持的产品型号为
+- Atlas A2训练系列产品/Atlas 800I A2推理产品
+
+## 目录结构
+| 目录及文件                            | 描述                   |
+|----------------------------------|----------------------|
+| [cmake](cmake)                   | 编译工程文件 |
+| [op_host](op_host)               | 本样例的tiling代码实现 |
+| [op_kernel](op_kernel)           | 本样例的kernel代码实现 |
+| [scripts](scripts)               | 包含输入输出真值数据生成脚本文件和执行文件 |
+| [testcase](testcase)             | 用例文件，配置用例的计算shape信息 |
+| CMakeLists.txt                   | 编译工程文件 |
+| main.cpp                         | 主函数，调用算子的应用程序，含CPU域及NPU域调用 |
+| run.sh                           | 编译执行脚本 |
+
+## 算子描述
+- 算子功能  
+  MatmulInt4Custom算子调用Matmul API计算时，通过Kernel侧设置左矩阵A、右矩阵B的数据类型参数为int4b_t，Host侧调用SetAType、SetBType接口设置左矩阵A、右矩阵B的数据类型为DT_INT4，实现int4数据类型输入的单算子。
+
+- 算子规格
+    <table>
+    <tr><td rowspan="1" align="center">算子类型(OpType)</td><td colspan="5" align="center">MatmulInt4Custom</td></tr>
+    </tr>
+    <tr><td rowspan="4" align="center">算子输入</td><td align="center">name</td><td align="center">shape</td><td align="center">data type</td><td align="center">format</td><td align="center">isTrans</td></tr>
+    <tr><td align="center">a</td><td align="center">-</td><td align="center">int4b_t</td><td align="center">ND</td><td align="center">false</td></tr>
+    <tr><td align="center">b</td><td align="center">-</td><td align="center">int4b_t</td><td align="center">ND</td><td align="center">false</td></tr>
+    <tr><td align="center">bias</td><td align="center">-</td><td align="center">int32_t</td><td align="center">ND</td><td align="center">-</td></tr>
+    </tr>
+    </tr>
+    <tr><td rowspan="1" align="center">算子输出</td><td align="center">c</td><td align="center">-</td><td align="center">int32_t</td><td align="center">ND</td><td align="center">-</td></tr>
+    </tr>
+    <tr><td rowspan="1" align="center">核函数名</td><td colspan="5" align="center">matmul_int4_custom</td></tr>
+    </table>
+
+## 算子实现介绍     
+- 约束条件
+  - 左矩阵A、右矩阵B的数据类型必须同时为int4，对应输出C矩阵的数据类型只支持int32、half。
+  - 左矩阵A为int4数据类型时，不支持转置。
+  - int4数据类型输入的Matmul对象仅支持Norm、MDL、IBShare模板。
+
+- 算子Kernel实现
+  - 计算逻辑：C = A * B + Bias。
+    - A、B为源操作数，A为左矩阵，形状为[M, K]；B为右矩阵，形状为[K, N]。
+    - C为目的操作数，存放矩阵乘结果的矩阵，形状为[M, N]。
+    - Bias为矩阵乘偏置，形状为[1, N]。对A*B结果矩阵的每一行都采用该Bias进行偏置。
+  - 具体步骤：
+    - 创建Matmul对象。
+      ```
+      using A_TYPE = AscendC::MatmulType<AscendC::TPosition::GM, CubeFormat::ND, AscendC::int4b_t>;
+      using B_TYPE = AscendC::MatmulType<AscendC::TPosition::GM, CubeFormat::ND, AscendC::int4b_t>;
+      using C_TYPE = AscendC::MatmulType<AscendC::TPosition::GM, CubeFormat::ND, int32_t>;
+      using BIAS_TYPE = AscendC::MatmulType<AscendC::TPosition::GM, CubeFormat::ND, int32_t>;
+      AscendC::Matmul<A_TYPE, B_TYPE, C_TYPE, BIAS_TYPE, CFG_MDL> matmulObj;
+      ```
+    - 初始化操作。
+    - 设置左矩阵A、右矩阵B。
+    - 完成矩阵乘操作。
+    - 结束矩阵乘操作。
+
+- 算子Tiling实现
+  - Ascend C提供一组Matmul Tiling API，方便用户获取Matmul kernel计算时所需的Tiling参数。只需要传入A/B/C矩阵等信息，调用API接口，即可获取到TCubeTiling结构体中的相关参数。
+  - 获取Tiling参数的流程如下：
+    - 创建一个Tiling对象。
+    - 设置A、B、C、Bias的参数类型信息；M、N、Ka、Kb形状信息等。
+      ```
+      TCubeTiling tilingData;
+      auto ascendcPlatform = platform_ascendc::PlatformAscendCManager::GetInstance();
+      matmul_tiling::MultiCoreMatmulTiling cubeTiling(*ascendcPlatform);
+      cubeTiling.SetDim(blockDim);
+      cubeTiling.SetAType(matmul_tiling::TPosition::GM, matmul_tiling::CubeFormat::ND,
+          matmul_tiling::DataType::DT_INT4, isAtrans);
+      cubeTiling.SetBType(matmul_tiling::TPosition::GM, matmul_tiling::CubeFormat::ND,
+          matmul_tiling::DataType::DT_INT4, isBtrans);
+      cubeTiling.SetCType(matmul_tiling::TPosition::GM, matmul_tiling::CubeFormat::ND,
+          matmul_tiling::DataType::DT_INT32);
+      cubeTiling.SetBiasType(matmul_tiling::TPosition::GM, matmul_tiling::CubeFormat::ND,
+          matmul_tiling::DataType::DT_INT32);
+      ...
+      ```
+    - 调用GetTiling接口，获取Tiling信息。
+
+## 编译运行样例
+
+  - 配置环境变量
+
+    这里的\$ASCEND_CANN_PACKAGE_PATH需要替换为CANN开发套件包安装后文件存储路径。例如：/usr/local/Ascend/ascend-toolkit/latest
+    ```
+    export ASCEND_HOME_DIR=$ASCEND_CANN_PACKAGE_PATH
+    source $ASCEND_HOME_DIR/../set_env.sh
+    ```
+
+  - 编译执行
+
+    ```
+    bash run.sh -r [RUN_MODE] -v [SOC_VERSION] -p [IS_PERF]
+    ```
+    其中脚本参数说明如下：
+    - RUN_MODE ：编译执行方式，可选择CPU调试，NPU仿真，NPU上板，对应参数分别为[cpu / sim / npu]。若需要详细了解NPU仿真相关内容，请参考[《算子开发工具msProf》](https://hiascend.com/document/redirect/CannCommunityToolMsProf)中的“工具使用”章节。
+    - SOC_VERSION ：昇腾AI处理器型号，如果无法确定具体的[SOC_VERSION]，则在安装昇腾AI处理器的服务器执行npu-smi info命令进行查询，在查询到的“Name”前增加Ascend信息，例如“Name”对应取值为xxxyy，实际配置的[SOC_VERSION]值为Ascendxxxyy。支持以下产品型号：
+      - Atlas A2训练系列产品/Atlas 800I A2推理产品
+    - IS_PERF ：是否获取执行性能数据，当前只在NPU执行时生效，打开后性能数据显示在执行结果中，可选择关闭和开启该功能，对应参数分别为[0 / 1]。
+
+    示例如下，Ascendxxxyy请替换为实际的AI处理器型号。
+    ```
+    bash run.sh -r cpu -v Ascendxxxyy -p 0
+    ```
\ No newline at end of file
diff --git a/examples/matrix/matmul_int4/cmake/cpu_lib.cmake b/examples/matrix/matmul_int4/cmake/cpu_lib.cmake
new file mode 100644
index 00000000..487a91e0
--- /dev/null
+++ b/examples/matrix/matmul_int4/cmake/cpu_lib.cmake
@@ -0,0 +1,31 @@
+# Copyright (c) 2025 Huawei Technologies Co., Ltd.
+# This file is a part of the CANN Open Software.
+# Licensed under CANN Open Software License Agreement Version 1.0 (the "License").
+# Please refer to the License for details. You may not use this file except in compliance with the License.
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+# See LICENSE in the root of the software repository for the full text of the License.
+# ======================================================================================================================
+
+if(NOT DEFINED ENV{CMAKE_PREFIX_PATH})
+    set(CMAKE_PREFIX_PATH ${ASCEND_CANN_PACKAGE_PATH}/tools/tikicpulib/lib/cmake)
+endif()
+find_package(tikicpulib REQUIRED)
+
+add_library(ascendc_kernels_${RUN_MODE} SHARED
+    ${KERNEL_FILES}
+)
+
+target_link_libraries(ascendc_kernels_${RUN_MODE} PRIVATE
+    tikicpulib::${SOC_VERSION}
+)
+
+target_compile_options(ascendc_kernels_${RUN_MODE} PRIVATE
+    -g
+    -O0
+    -std=c++17
+)
+
+install(TARGETS ascendc_kernels_${RUN_MODE}
+    DESTINATION ${CMAKE_INSTALL_LIBDIR}
+)
\ No newline at end of file
diff --git a/examples/matrix/matmul_int4/cmake/npu_lib.cmake b/examples/matrix/matmul_int4/cmake/npu_lib.cmake
new file mode 100644
index 00000000..bc803099
--- /dev/null
+++ b/examples/matrix/matmul_int4/cmake/npu_lib.cmake
@@ -0,0 +1,27 @@
+# Copyright (c) 2025 Huawei Technologies Co., Ltd.
+# This file is a part of the CANN Open Software.
+# Licensed under CANN Open Software License Agreement Version 1.0 (the "License").
+# Please refer to the License for details. You may not use this file except in compliance with the License.
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+# See LICENSE in the root of the software repository for the full text of the License.
+# ======================================================================================================================
+
+if(EXISTS ${ASCEND_CANN_PACKAGE_PATH}/compiler/tikcpp/ascendc_kernel_cmake)
+    set(ASCENDC_CMAKE_DIR ${ASCEND_CANN_PACKAGE_PATH}/compiler/tikcpp/ascendc_kernel_cmake)
+elseif(EXISTS ${ASCEND_CANN_PACKAGE_PATH}/tools/tikcpp/ascendc_kernel_cmake)
+    set(ASCENDC_CMAKE_DIR ${ASCEND_CANN_PACKAGE_PATH}/tools/tikcpp/ascendc_kernel_cmake)
+else()
+    message(FATAL_ERROR "ascendc_kernel_cmake does not exist ,please check whether the cann package is installed")
+endif()
+include(${ASCENDC_CMAKE_DIR}/ascendc.cmake)
+
+ascendc_library(ascendc_kernels_${RUN_MODE} STATIC
+    ${KERNEL_FILES}
+)
+
+ascendc_compile_definitions(ascendc_kernels_${RUN_MODE} PRIVATE
+    -DASCENDC_DUMP
+    -DHAVE_WORKSPACE
+    -DHAVE_TILING
+)
\ No newline at end of file
diff --git a/examples/matrix/matmul_int4/main.cpp b/examples/matrix/matmul_int4/main.cpp
new file mode 100644
index 00000000..0dcf1c75
--- /dev/null
+++ b/examples/matrix/matmul_int4/main.cpp
@@ -0,0 +1,225 @@
+/**
+ * Copyright (c) 2025 Huawei Technologies Co., Ltd.
+ * This file is a part of the CANN Open Software.
+ * Licensed under CANN Open Software License Agreement Version 1.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ */
+
+#include <sstream>
+#include "../../common/data_utils.h"
+#include "kernel_tiling/kernel_tiling.h"
+#include "tiling/platform/platform_ascendc.h"
+#include "op_host/matmul_int4_custom_tiling.h"
+
+#ifndef ASCENDC_CPU_DEBUG
+#include "acl/acl.h"
+extern void matmul_int4_custom_do(uint32_t coreDim, void* stream,
+    uint8_t* a, uint8_t* b, uint8_t* bias, uint8_t* c,
+    uint8_t* workspace, uint8_t* tiling);
+#else
+#include "tikicpulib.h"
+extern "C" void matmul_int4_custom(uint8_t* a, uint8_t* b, uint8_t* bias, uint8_t* c,
+    uint8_t* workspace, uint8_t* tiling);
+#endif
+
+namespace {
+constexpr bool IS_BIAS = true;
+constexpr bool IS_A_TRANS = false;
+constexpr bool IS_B_TRANS = false;
+
+struct MatrixFileSize
+{
+    size_t x1FileSize;
+    size_t x2FileSize;
+    size_t yFileSize;
+    size_t biasFileSize;
+};
+}
+
+namespace MatmulHost {
+static size_t GetSysWorkSpaceSize()
+{
+    auto ascendcPlatform = platform_ascendc::PlatformAscendCManager::GetInstance();
+    return static_cast<size_t>(ascendcPlatform->GetLibApiWorkSpaceSize());
+}
+
+// CPU debug mode
+#ifdef ASCENDC_CPU_DEBUG
+void TestMatmulCpu(int64_t m, int64_t n, int64_t k, const MatrixFileSize& matrixFileSize)
+{
+    size_t x1FileSize = matrixFileSize.x1FileSize;
+    size_t x2FileSize = matrixFileSize.x2FileSize;
+    size_t yFileSize = matrixFileSize.yFileSize;
+    size_t biasFileSize = matrixFileSize.biasFileSize;
+    // Query workspace size
+    size_t workspaceSize = GetSysWorkSpaceSize();
+    uint8_t* x1 = (uint8_t*)AscendC::GmAlloc(x1FileSize);
+    uint8_t* x2 = (uint8_t*)AscendC::GmAlloc(x2FileSize);
+    uint8_t* bias = nullptr;
+    uint8_t* y = (uint8_t*)AscendC::GmAlloc(yFileSize);
+    uint8_t* workspace = (uint8_t*)AscendC::GmAlloc(workspaceSize);
+    ReadFile("../input/x1_gm.bin", x1FileSize, x1, x1FileSize);
+    ReadFile("../input/x2_gm.bin", x2FileSize, x2, x2FileSize);
+    if (IS_BIAS) {
+        bias = (uint8_t*)AscendC::GmAlloc(biasFileSize);
+        ReadFile("../input/bias_gm.bin", biasFileSize, bias, biasFileSize);
+    }
+    size_t tilingFileSize = sizeof(TCubeTiling);
+    uint8_t* tiling = (uint8_t*)AscendC::GmAlloc(tilingFileSize);
+    auto ascendcPlatform = platform_ascendc::PlatformAscendCManager::GetInstance();
+    MatmulHost::MatmulCaseParams testCaseParams{static_cast<int32_t>(ascendcPlatform->GetCoreNumAic()),
+        static_cast<int32_t>(m), static_cast<int32_t>(n), static_cast<int32_t>(k), IS_BIAS, IS_A_TRANS, IS_B_TRANS};
+    // Calculate Tiling
+    const auto& tilingData = MatmulHost::GenerateTiling(testCaseParams);
+    memcpy_s(tiling, tilingFileSize, &tilingData, tilingFileSize);
+    ICPU_RUN_KF(matmul_int4_custom, tilingData.usedCoreNum, x1, x2, bias, y, workspace, tiling);
+    WriteFile("../output/output.bin", y, yFileSize);
+    AscendC::GmFree((void*)x1);
+    AscendC::GmFree((void*)x2);
+    AscendC::GmFree((void*)y);
+    AscendC::GmFree((void*)workspace);
+    AscendC::GmFree((void*)tiling);
+    if (IS_BIAS) {
+        AscendC::GmFree((void*)bias);
+    }
+}
+// NPU
+#else
+void MatmulOp(uint8_t* x1, uint8_t* x2, uint8_t* y, uint8_t* bias, int64_t m, int64_t n, int64_t k,
+    void* stream = nullptr)
+{
+    // Init args
+    uint8_t* workspaceDevice = nullptr;
+
+    // Query workspace size
+    size_t workspaceSize = GetSysWorkSpaceSize();
+
+    // Allocate workspace on device
+    CHECK_ACL(aclrtMalloc((void**)&workspaceDevice, workspaceSize, ACL_MEM_MALLOC_HUGE_FIRST));
+
+    // Initialize kernel with arguments and workspace pointer
+    uint8_t* tilingHost = nullptr;
+    uint8_t* tilingDevice = nullptr;
+    size_t tilingFileSize = sizeof(TCubeTiling);
+    auto ascendcPlatform = platform_ascendc::PlatformAscendCManager::GetInstance();
+    MatmulHost::MatmulCaseParams testCaseParams{static_cast<int32_t>(ascendcPlatform->GetCoreNumAic()),
+        static_cast<int32_t>(m), static_cast<int32_t>(n), static_cast<int32_t>(k), IS_BIAS, IS_A_TRANS, IS_B_TRANS};
+    // Calculate Tiling
+    const auto tilingData = MatmulHost::GenerateTiling(testCaseParams);
+    CHECK_ACL(aclrtMallocHost((void**)(&tilingHost), tilingFileSize));
+    CHECK_ACL(aclrtMalloc((void**)&tilingDevice, tilingFileSize,
+                          ACL_MEM_MALLOC_HUGE_FIRST));
+    CHECK_ACL(aclrtMemcpy(tilingHost, tilingFileSize, &tilingData,
+                          tilingFileSize, ACL_MEMCPY_HOST_TO_HOST));
+    CHECK_ACL(aclrtMemcpy(tilingDevice, tilingFileSize, tilingHost,
+                          tilingFileSize, ACL_MEMCPY_HOST_TO_DEVICE));
+    // Launch kernel
+    matmul_int4_custom_do(tilingData.usedCoreNum, stream, x1, x2, bias, y, workspaceDevice, tilingDevice);
+    CHECK_ACL(aclrtFreeHost(tilingHost));
+    CHECK_ACL(aclrtFree(workspaceDevice));
+    CHECK_ACL(aclrtFree(tilingDevice));
+}
+
+void TestAclInit(aclrtContext& context, aclrtStream& stream, int64_t& deviceId)
+{
+    CHECK_ACL(aclInit(nullptr));
+    CHECK_ACL(aclrtSetDevice(deviceId));
+    CHECK_ACL(aclrtCreateContext(&context, deviceId));
+    CHECK_ACL(aclrtCreateStream(&stream));
+}
+
+void TestAclDeInit(aclrtContext& context, aclrtStream& stream, int64_t& deviceId)
+{
+    CHECK_ACL(aclrtDestroyStream(stream));
+    CHECK_ACL(aclrtDestroyContext(context));
+    CHECK_ACL(aclrtResetDevice(deviceId));
+    CHECK_ACL(aclFinalize());
+}
+
+void TestMatmul(int64_t m, int64_t n, int64_t k, const MatrixFileSize& matrixFileSize)
+{
+    size_t x1FileSize = matrixFileSize.x1FileSize;
+    size_t x2FileSize = matrixFileSize.x2FileSize;
+    size_t yFileSize = matrixFileSize.yFileSize;
+    size_t biasFileSize = matrixFileSize.biasFileSize;
+
+    aclrtContext context;
+    aclrtStream stream = nullptr;
+    int64_t deviceId = 0;
+    TestAclInit(context, stream, deviceId);
+
+    uint8_t* x1Host = nullptr;
+    uint8_t* x1Device = nullptr;
+    CHECK_ACL(aclrtMallocHost((void**)(&x1Host), x1FileSize));
+    CHECK_ACL(aclrtMalloc((void**)&x1Device, x1FileSize, ACL_MEM_MALLOC_HUGE_FIRST));
+    ReadFile("../input/x1_gm.bin", x1FileSize, x1Host, x1FileSize);
+    CHECK_ACL(aclrtMemcpy(x1Device, x1FileSize, x1Host, x1FileSize, ACL_MEMCPY_HOST_TO_DEVICE));
+
+    uint8_t* x2Host = nullptr;
+    uint8_t* x2Device = nullptr;
+    CHECK_ACL(aclrtMallocHost((void**)(&x2Host), x2FileSize));
+    CHECK_ACL(aclrtMalloc((void**)&x2Device, x2FileSize, ACL_MEM_MALLOC_HUGE_FIRST));
+    ReadFile("../input/x2_gm.bin", x2FileSize, x2Host, x2FileSize);
+    CHECK_ACL(aclrtMemcpy(x2Device, x2FileSize, x2Host, x2FileSize, ACL_MEMCPY_HOST_TO_DEVICE));
+
+    uint8_t* biasHost = nullptr;
+    uint8_t* biasDevice = nullptr;
+    if (IS_BIAS) {
+        CHECK_ACL(aclrtMallocHost((void**)(&biasHost), biasFileSize));
+        CHECK_ACL(aclrtMalloc((void**)&biasDevice, biasFileSize, ACL_MEM_MALLOC_HUGE_FIRST));
+        ReadFile("../input/bias_gm.bin", biasFileSize, biasHost, biasFileSize);
+        CHECK_ACL(aclrtMemcpy(biasDevice, biasFileSize, biasHost, biasFileSize, ACL_MEMCPY_HOST_TO_DEVICE));
+    }
+    uint8_t* yHost = nullptr;
+    uint8_t* yDevice = nullptr;
+    CHECK_ACL(aclrtMallocHost((void**)(&yHost), yFileSize));
+    CHECK_ACL(aclrtMalloc((void**)&yDevice, yFileSize, ACL_MEM_MALLOC_HUGE_FIRST));
+
+    MatmulOp(x1Device, x2Device, yDevice, biasDevice, m, n, k, stream);
+    CHECK_ACL(aclrtSynchronizeStream(stream));
+
+    CHECK_ACL(aclrtMemcpy(yHost, yFileSize, yDevice, yFileSize, ACL_MEMCPY_DEVICE_TO_HOST));
+    WriteFile("../output/output.bin", yHost, yFileSize);
+
+    if (IS_BIAS) {
+        CHECK_ACL(aclrtFree(biasDevice));
+        CHECK_ACL(aclrtFreeHost(biasHost));
+    }
+    CHECK_ACL(aclrtFree(x1Device));
+    CHECK_ACL(aclrtFreeHost(x1Host));
+    CHECK_ACL(aclrtFree(x2Device));
+    CHECK_ACL(aclrtFreeHost(x2Host));
+    CHECK_ACL(aclrtFree(yDevice));
+    CHECK_ACL(aclrtFreeHost(yHost));
+    TestAclDeInit(context, stream, deviceId);
+}
+#endif
+} // namespace MatmulHost
+
+int32_t main(int32_t argc, const char* args[])
+{
+    int64_t inputParams[3] = {1, 1, 1};
+    for (int32_t i = 1; i < argc && i < 4; ++i) { // 4 used for inputParams loop
+        std::stringstream ss(args[i]);
+        ss >> inputParams[i - 1];
+    }
+    auto ascendcPlatform = platform_ascendc::PlatformAscendCManager::GetInstance(SOC_VERSION);
+    int64_t M = inputParams[0];
+    int64_t N = inputParams[1];
+    int64_t K = inputParams[2];
+    MatrixFileSize matrixFileSize;
+
+    matrixFileSize.x1FileSize = static_cast<size_t>(M * K) * sizeof(int8_t);
+    matrixFileSize.x2FileSize = static_cast<size_t>(K * N) * sizeof(int8_t);
+    matrixFileSize.yFileSize = static_cast<size_t>(M * N) * sizeof(int32_t);
+    matrixFileSize.biasFileSize = static_cast<size_t>(1 * N) * sizeof(int32_t);
+#ifdef ASCENDC_CPU_DEBUG
+    MatmulHost::TestMatmulCpu(M, N, K, matrixFileSize);
+#else
+    MatmulHost::TestMatmul(M, N, K, matrixFileSize);
+#endif
+    return 0;
+}
diff --git a/examples/matrix/matmul_int4/op_host/matmul_int4_custom_tiling.cpp b/examples/matrix/matmul_int4/op_host/matmul_int4_custom_tiling.cpp
new file mode 100644
index 00000000..af408988
--- /dev/null
+++ b/examples/matrix/matmul_int4/op_host/matmul_int4_custom_tiling.cpp
@@ -0,0 +1,49 @@
+/**
+ * Copyright (c) 2025 Huawei Technologies Co., Ltd.
+ * This file is a part of the CANN Open Software.
+ * Licensed under CANN Open Software License Agreement Version 1.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ */
+
+#include "matmul_int4_custom_tiling.h"
+#include <iostream>
+
+namespace MatmulHost {
+
+TCubeTiling GenerateTiling(const MatmulCaseParams& testCaseParams)
+{
+    TCubeTiling tilingData;
+    auto ascendcPlatform = platform_ascendc::PlatformAscendCManager::GetInstance();
+    matmul_tiling::MultiCoreMatmulTiling cubeTiling(*ascendcPlatform);
+    uint32_t M = testCaseParams.m;
+    uint32_t N = testCaseParams.n;
+    uint32_t K = testCaseParams.k;
+    uint32_t blockDim = testCaseParams.usedCoreNum;
+    bool isBias = testCaseParams.isBias;
+    bool isAtrans = testCaseParams.isATrans;
+    bool isBtrans = testCaseParams.isBTrans;
+
+    cubeTiling.SetDim(blockDim);
+    cubeTiling.SetAType(matmul_tiling::TPosition::GM, matmul_tiling::CubeFormat::ND,
+        matmul_tiling::DataType::DT_INT4, isAtrans);
+    cubeTiling.SetBType(matmul_tiling::TPosition::GM, matmul_tiling::CubeFormat::ND,
+        matmul_tiling::DataType::DT_INT4, isBtrans);
+    cubeTiling.SetCType(matmul_tiling::TPosition::GM, matmul_tiling::CubeFormat::ND,
+        matmul_tiling::DataType::DT_INT32);
+    cubeTiling.SetBiasType(matmul_tiling::TPosition::GM, matmul_tiling::CubeFormat::ND,
+        matmul_tiling::DataType::DT_INT32);
+    cubeTiling.SetOrgShape(M, N, K);
+    cubeTiling.SetShape(M, N, K);
+    cubeTiling.EnableBias(isBias);
+    cubeTiling.SetBufferSpace(-1, -1, -1);
+    if (cubeTiling.GetTiling(tilingData) == -1) {
+        std::cout << "Generate tiling failed." << std::endl;
+        return {};
+    }
+    return tilingData;
+}
+
+} // namespace MatmulHost
diff --git a/examples/matrix/matmul_int4/op_host/matmul_int4_custom_tiling.h b/examples/matrix/matmul_int4/op_host/matmul_int4_custom_tiling.h
new file mode 100644
index 00000000..e3a6a898
--- /dev/null
+++ b/examples/matrix/matmul_int4/op_host/matmul_int4_custom_tiling.h
@@ -0,0 +1,37 @@
+/**
+ * Copyright (c) 2025 Huawei Technologies Co., Ltd.
+ * This file is a part of the CANN Open Software.
+ * Licensed under CANN Open Software License Agreement Version 1.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ */
+
+#ifndef EXAMPLES_MATRIX_MATMUL_INT4_OP_HOST_MATMUL_INT4_CUSTOM_TILING_H
+#define EXAMPLES_MATRIX_MATMUL_INT4_OP_HOST_MATMUL_INT4_CUSTOM_TILING_H
+#include "register/tilingdata_base.h"
+#include "tiling/tiling_api.h"
+
+namespace MatmulHost {
+
+struct MatmulCaseParams
+{
+    int32_t usedCoreNum;
+    int32_t m;
+    int32_t n;
+    int32_t k;
+    bool isBias;
+    bool isATrans;
+    bool isBTrans;
+};
+
+/**
+  * @brief Generate matmul tiling.
+  * @param testCaseParams: Testcase parameters.
+  * @retval Generated Tiling data.
+  */
+TCubeTiling GenerateTiling(const MatmulCaseParams& testCaseParams);
+
+} // namespace MatmulHost
+#endif // EXAMPLES_MATRIX_MATMUL_INT4_OP_HOST_MATMUL_INT4_CUSTOM_TILING_H
diff --git a/examples/matrix/matmul_int4/op_kernel/matmul_int4_custom_kernel.cpp b/examples/matrix/matmul_int4/op_kernel/matmul_int4_custom_kernel.cpp
new file mode 100644
index 00000000..7d42b2a0
--- /dev/null
+++ b/examples/matrix/matmul_int4/op_kernel/matmul_int4_custom_kernel.cpp
@@ -0,0 +1,146 @@
+/**
+ * Copyright (c) 2025 Huawei Technologies Co., Ltd.
+ * This file is a part of the CANN Open Software.
+ * Licensed under CANN Open Software License Agreement Version 1.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ */
+
+#include "kernel_operator.h"
+#include "matmul_int4_custom_kernel.h"
+
+namespace {
+/**
+  * @brief  Copy tiling data to TCubeTiling ptr from tiling gm addr.
+  * @param  tiling: TCubeTiling ptr which needs to copy tiling data.
+  * @param  tilingGM: Tiling gm addr.
+  * @retval None
+  */
+__aicore__ inline void CopyTiling(TCubeTiling* tiling, GM_ADDR tilingGM)
+{
+    uint32_t* ptr = reinterpret_cast<uint32_t*>(tiling);
+    auto tiling32 = reinterpret_cast<__gm__ uint32_t*>(tilingGM);
+
+    for (int i = 0; i < sizeof(TCubeTiling) / sizeof(uint32_t); i++, ptr++) {
+      *ptr = *(tiling32 + i);
+    }
+    return;
+}
+}
+
+namespace MatmulInt4Custom {
+template <typename AType, typename BType, typename CType, typename BiasType>
+__aicore__ inline void MatmulKernel<AType, BType, CType, BiasType>::Init(
+    GM_ADDR a, GM_ADDR b, GM_ADDR bias, GM_ADDR c, const TCubeTiling& tiling, bool isTransA, bool isTransB)
+{
+    this->tiling = tiling;
+    aGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ AType*>(a), tiling.M * tiling.Ka);
+    bGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ BType*>(b), tiling.Kb * tiling.N);
+    cGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ CType*>(c), tiling.M * tiling.N);
+    biasGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ BiasType*>(bias), tiling.N);
+
+    int32_t offsetA = 0;
+    int32_t offsetB = 0;
+    int32_t offsetC = 0;
+    int32_t offsetBias = 0;
+    this->isTransA = isTransA;
+    this->isTransB = isTransB;
+    CalcOffset(AscendC::GetBlockIdx(), offsetA, offsetB, offsetC, offsetBias);
+    aGlobal = aGlobal[offsetA];
+    bGlobal = bGlobal[offsetB];
+    cGlobal = cGlobal[offsetC];
+    biasGlobal = biasGlobal[offsetBias];
+    if (GetSysWorkSpacePtr() == nullptr) {
+        return;
+    }
+}
+
+template <typename AType, typename BType, typename CType, typename BiasType>
+__aicore__ inline void MatmulKernel<AType, BType, CType, BiasType>::Process()
+{
+    if (AscendC::GetBlockIdx() >= tiling.usedCoreNum) {
+        return;
+    }
+
+    // process with tail block
+    int tailM = tiling.M - mCoreIndex * tiling.singleCoreM;
+    tailM = tailM < tiling.singleCoreM ? tailM : tiling.singleCoreM;
+    int tailN = tiling.N - nCoreIndex * tiling.singleCoreN;
+    tailN = tailN < tiling.singleCoreN ? tailN : tiling.singleCoreN;
+    if (tailM < tiling.singleCoreM || tailN < tiling.singleCoreN) {
+        matmulObj.SetTail(tailM, tailN);
+    }
+
+    matmulObj.SetTensorA(aGlobal, isTransA);
+    matmulObj.SetTensorB(bGlobal, isTransB);
+    if (tiling.isBias) {
+        matmulObj.SetBias(biasGlobal);
+    }
+    matmulObj.IterateAll(cGlobal);
+    matmulObj.End();
+}
+
+template <typename AType, typename BType, typename CType, typename BiasType>
+__aicore__ inline void MatmulKernel<AType, BType, CType, BiasType>::CalcOffset(
+    int32_t blockIdx, int32_t& offsetA, int32_t& offsetB, int32_t& offsetC, int32_t& offsetBias)
+{
+    const TCubeTiling& tiling = this->tiling;
+    auto mSingleBlocks = (tiling.M + tiling.singleCoreM - 1) / tiling.singleCoreM; // split M into mSingleBlocks cores
+    mCoreIndex = blockIdx % mSingleBlocks;
+    nCoreIndex = blockIdx / mSingleBlocks;
+
+    if (isTransA) {
+        offsetA = mCoreIndex * tiling.singleCoreM;
+    } else {
+        offsetA = mCoreIndex * tiling.Ka * tiling.singleCoreM;
+    }
+    if (isTransB) {
+        offsetB = nCoreIndex * tiling.Kb * tiling.singleCoreN;
+    } else {
+        offsetB = nCoreIndex * tiling.singleCoreN;
+    }
+    offsetC = mCoreIndex * tiling.N * tiling.singleCoreM + nCoreIndex * tiling.singleCoreN;
+    offsetBias = nCoreIndex * tiling.singleCoreN;
+}
+} // namespace MatmulInt4Custom
+
+/**
+  * @brief  matmul kernel function.
+  * @param  a: A matrix gm addr.
+  * @param  b: B matrix gm addr.
+  * @param  bias: Bias matrix gm addr.
+  * @param  c: C matrix gm addr.
+  * @param  workspace: Temporary gm space addr required by matmul calc.
+  * @param  tilingGm: Tiling data addr. 
+  * @retval None
+  */
+extern "C" __global__ __aicore__ void matmul_int4_custom(GM_ADDR a, GM_ADDR b,
+    GM_ADDR bias, GM_ADDR c, GM_ADDR workspace, GM_ADDR tilingGm)
+{
+#if defined(ASCENDC_CPU_DEBUG)
+    if (g_coreType == AscendC::AIV) {
+        return;
+    }
+#endif
+    // prepare tiling
+    TCubeTiling tiling;
+    CopyTiling(&tiling, tilingGm);
+    // define matmul kernel
+    MatmulInt4Custom::MatmulKernel<AscendC::int4b_t, AscendC::int4b_t, int32_t, int32_t> matmulKernel;
+    AscendC::TPipe pipe;
+    REGIST_MATMUL_OBJ(&pipe, GetSysWorkSpacePtr(), matmulKernel.matmulObj, &tiling);
+    // init matmul kernel, isTransA=false, isTransB=false
+    matmulKernel.Init(a, b, bias, c, tiling, false, false);
+    // matmul kernel process
+    matmulKernel.Process();
+}
+
+#ifndef ASCENDC_CPU_DEBUG
+void matmul_int4_custom_do(uint32_t blockDim, void* stream,
+    GM_ADDR a, GM_ADDR b, GM_ADDR bias, GM_ADDR c, GM_ADDR workspace, GM_ADDR tilingGm)
+{
+    matmul_int4_custom<<<blockDim, nullptr, stream>>>(a, b, bias, c, workspace, tilingGm);
+}
+#endif
diff --git a/examples/matrix/matmul_int4/op_kernel/matmul_int4_custom_kernel.h b/examples/matrix/matmul_int4/op_kernel/matmul_int4_custom_kernel.h
new file mode 100644
index 00000000..57321655
--- /dev/null
+++ b/examples/matrix/matmul_int4/op_kernel/matmul_int4_custom_kernel.h
@@ -0,0 +1,74 @@
+/**
+ * Copyright (c) 2025 Huawei Technologies Co., Ltd.
+ * This file is a part of the CANN Open Software.
+ * Licensed under CANN Open Software License Agreement Version 1.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ */
+
+#ifndef EXAMPLES_MATRIX_MATMUL_INT4_OP_KERNEL_MATMUL_INT4_CUSTOM_KERNEL_H
+#define EXAMPLES_MATRIX_MATMUL_INT4_OP_KERNEL_MATMUL_INT4_CUSTOM_KERNEL_H
+#include "kernel_operator.h"
+// Cube Only
+#define ASCENDC_CUBE_ONLY
+#include "lib/matmul_intf.h"
+
+namespace MatmulInt4Custom {
+
+template <typename AType, typename BType, typename CType, typename BiasType>
+class MatmulKernel {
+public:
+    __aicore__ inline MatmulKernel(){};
+    /**
+      * @brief  Initialization before process.
+      * @param  a: A matrix gm addr.
+      * @param  b: B matrix gm addr.
+      * @param  bias: Bias matrix gm addr.
+      * @param  c: C matrix gm addr.
+      * @param  tiling: Matmul tiling struct.
+      * @param  isTransA: Whether A matrix is transposed.
+      * @param  isTransB: Whether B matrix is transposed.
+      * @retval None
+      */
+    __aicore__ inline void Init(GM_ADDR a, GM_ADDR b, GM_ADDR bias, GM_ADDR c, const TCubeTiling& tiling,
+        bool isTransA, bool isTransB);
+    /**
+      * @brief  Process matrix calculation.
+      * @retval None
+      */
+    __aicore__ inline void Process();
+
+    using A_TYPE = AscendC::MatmulType<AscendC::TPosition::GM, CubeFormat::ND, AType>;
+    using B_TYPE = AscendC::MatmulType<AscendC::TPosition::GM, CubeFormat::ND, BType>;
+    using C_TYPE = AscendC::MatmulType<AscendC::TPosition::GM, CubeFormat::ND, CType>;
+    using BIAS_TYPE =  AscendC::MatmulType<AscendC::TPosition::GM, CubeFormat::ND, BiasType>;
+    AscendC::Matmul<A_TYPE, B_TYPE, C_TYPE, BIAS_TYPE, CFG_MDL> matmulObj;
+
+private:
+    /**
+      * @brief  Calculate the gm offset based on the blockIdx.
+      * @param  blockIdx: Current Core blockidx.
+      * @param  offsetA: Gm offset of A matrix.
+      * @param  offsetB: Gm offset of B matrix.
+      * @param  offsetC: Gm offset of C matrix.
+      * @param  offsetBias: Gm offset of Bias matrix.
+      * @retval None
+      */
+    __aicore__ inline void CalcOffset(
+        int32_t blockIdx, int32_t& offsetA, int32_t& offsetB, int32_t& offsetC, int32_t& offsetBias);
+
+    AscendC::GlobalTensor<AType> aGlobal;
+    AscendC::GlobalTensor<BType> bGlobal;
+    AscendC::GlobalTensor<CType> cGlobal;
+    AscendC::GlobalTensor<BiasType> biasGlobal;
+    TCubeTiling tiling;
+    int32_t mCoreIndex;
+    int32_t nCoreIndex;
+    bool isTransA{false};
+    bool isTransB{false};
+};
+} // namespace MatmulInt4Custom
+
+#endif // EXAMPLES_MATRIX_MATMUL_INT4_OP_KERNEL_MATMUL_INT4_CUSTOM_KERNEL_H
diff --git a/examples/matrix/matmul_int4/run.sh b/examples/matrix/matmul_int4/run.sh
new file mode 100644
index 00000000..00217c87
--- /dev/null
+++ b/examples/matrix/matmul_int4/run.sh
@@ -0,0 +1,96 @@
+#!/bin/bash
+# Copyright (c) 2025 Huawei Technologies Co., Ltd.
+# This file is a part of the CANN Open Software.
+# Licensed under CANN Open Software License Agreement Version 1.0 (the "License").
+# Please refer to the License for details. You may not use this file except in compliance with the License.
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+# See LICENSE in the root of the software repository for the full text of the License.
+# ======================================================================================================================
+
+export IS_PERF="0"
+
+SHORT=r:,v:,p:,
+LONG=run-mode:,soc-version:,perf:,
+OPTS=$(getopt -a --options $SHORT --longoptions $LONG -- "$@")
+eval set -- "$OPTS"
+while :
+do
+    case "$1" in
+        (-r | --run-mode )
+            RUN_MODE="$2"
+            shift 2;;
+        (-v | --soc-version )
+            SOC_VERSION="$2"
+            shift 2;;
+        (-p | --perf )
+            IS_PERF="$2"
+            shift 2;;
+        (--)
+            shift;
+            break;;
+        (*)
+            echo "[ERROR] Unexpected option: $1";
+            break;;
+    esac
+done
+
+# Check invalid input
+if [[ ! "${SOC_VERSION}" =~ ^Ascend910 ]]; then
+    echo "[ERROR] Unsupported SocVersion: ${SOC_VERSION}"
+    exit 1
+fi
+
+if [ "${RUN_MODE}" != "npu" ] && [ "${RUN_MODE}" != "sim" ] && [ "${RUN_MODE}" != "cpu" ]; then
+    echo "[ERROR] Unsupported RUN_MODE: ${RUN_MODE}, which can only be cpu/sim/npu."
+    exit 1
+fi
+
+if [ "${IS_PERF}" != "0" ] && [ "${IS_PERF}" != "1" ]; then
+    echo "[ERROR] Unsupported IS_PERF: ${IS_PERF}, which can only be 0 or 1."
+    exit 1
+fi
+
+# only npu mode support is_perf = 1
+if [ "${IS_PERF}" == "1" ] && [ "${RUN_MODE}" != "npu" ]; then
+    echo "[ERROR] Unsupported IS_PERF: ${IS_PERF} while RUN_MODE is ${RUN_MODE}."
+    exit 1
+fi
+
+rm -rf build
+mkdir build
+cd build
+
+source $ASCEND_HOME_DIR/bin/setenv.bash
+export LD_LIBRARY_PATH=${ASCEND_HOME_DIR}/tools/simulator/${SOC_VERSION}/lib:$LD_LIBRARY_PATH
+set -euo pipefail
+
+cmake  -DRUN_MODE=${RUN_MODE} -DSOC_VERSION=${SOC_VERSION} -DASCEND_CANN_PACKAGE_PATH=${ASCEND_HOME_DIR} ..
+make -j16
+
+cd ../
+
+rm -rf input
+mkdir input
+rm -rf output
+mkdir output
+
+rm -rf bin/
+mkdir -p bin
+cd bin
+cp ../build/ascendc_matmul_int4_bbit ./
+
+export TF_CPP_MIN_LOG_LEVEL=3
+
+if [ "${RUN_MODE}" = "npu" ]; then
+    if [ "${IS_PERF}" = "1" ]; then
+        export ASCEND_TOOLKIT_HOME=${ASCEND_HOME_DIR}
+        python3 -u ../scripts/exec_test.py npu "perf"
+    else
+        python3 -u ../scripts/exec_test.py npu "normal"
+    fi
+elif [ "${RUN_MODE}" = "sim" ]; then
+    python3 -u ../scripts/exec_test.py sim "normal"
+elif [ "${RUN_MODE}" = "cpu" ]; then
+    python3 -u ../scripts/exec_test.py cpu "normal"
+fi
diff --git a/examples/matrix/matmul_int4/scripts/exec_test.py b/examples/matrix/matmul_int4/scripts/exec_test.py
new file mode 100644
index 00000000..7ef2d2d5
--- /dev/null
+++ b/examples/matrix/matmul_int4/scripts/exec_test.py
@@ -0,0 +1,121 @@
+#!/usr/bin/python3
+# coding=utf-8
+
+# Copyright (c) 2025 Huawei Technologies Co., Ltd.
+# This file is a part of the CANN Open Software.
+# Licensed under CANN Open Software License Agreement Version 1.0 (the "License").
+# Please refer to the License for details. You may not use this file except in compliance with the License.
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+# See LICENSE in the root of the software repository for the full text of the License.
+# ======================================================================================================================
+import os
+import sys
+import csv
+import time
+import logging
+import subprocess
+
+import numpy as np
+
+sys.path.append("../..")
+from gen_data import MatmulGenData
+from common_scripts.compare_data import compare_data
+from common_scripts.exec_utils import get_file_work_dir, get_case_list, clear_file_cache, get_perf_task_duration,\
+    get_process_case_cmd
+
+IS_BIAS = True
+IS_TRANS_A = False
+IS_TRANS_B = False
+# int4 in and int32 out
+DATA_TYPE_STR = "int4_int32"
+
+logging.basicConfig(level=logging.INFO)
+
+
+class ProcessParams:
+    def __init__(self, case_name, m, n, k, b, is_perf, run_mode):
+        self.case_name = case_name
+        self.m = m
+        self.n = n
+        self.k = k
+        self.b = b
+        self.is_perf = is_perf
+        self.run_mode = run_mode
+
+
+def process_case(file_work_dir, process_params):
+    case_name, m, n, k, b, is_perf, run_mode = process_params.case_name, process_params.m, process_params.n,\
+        process_params.k, process_params.b, process_params.is_perf, process_params.run_mode
+    logging.info("[INFO] start process case [%s]" % (case_name))
+    logging.info("[INFO] IS_PERF is set [%d]" % (is_perf))
+    clear_file_cache(file_work_dir)
+
+    matmul_gen_data = MatmulGenData(m, n, k, b, IS_TRANS_A, IS_TRANS_B, IS_BIAS, DATA_TYPE_STR)
+    if is_perf:
+        matmul_gen_data.gen_fake_golden_data(file_work_dir)
+    else:
+        matmul_gen_data.gen_golden_data(file_work_dir)
+    params_str = f"{m} {n} {k}"
+    cmd = get_process_case_cmd(kernel_name="ascendc_matmul_int4_bbit", params_str=params_str,\
+        is_perf=is_perf, run_mode=run_mode)
+    subprocess.run(cmd)
+    if is_perf:
+        wrong_num = -1
+    else:
+        logging.info("[INFO] compare data case[%s]" % (case_name))
+        wrong_num = compare_data(file_work_dir, DATA_TYPE_STR)
+    res_data = []
+    res_data.append(case_name)
+    res_data.append(wrong_num)
+    res_data.append(b * m * n)
+    if wrong_num == -1:
+        res_data.append("None")
+    elif wrong_num / (b * m * n) > 0.001:
+        res_data.append("Fail")
+    else:
+        res_data.append("Success")
+    if is_perf:
+        task_duration = get_perf_task_duration("./prof_out")
+        res_data.append(task_duration)
+    return res_data
+
+
+def main():
+    args_len = len(sys.argv) - 1
+    if args_len != 2:
+        logging.info("[ERROR] exec_test input params error!")
+        return -1
+
+    file_work_dir = get_file_work_dir()
+    if not os.path.exists(file_work_dir):
+        logging.info("[ERROR] file path %s not exist!" % (file_work_dir))
+        return -1
+
+    is_perf = False
+    if sys.argv[2] == "perf":
+        is_perf = True
+
+    case_list = get_case_list()
+    res_list = [["case_name", "wrong_num", "total_num", "result", "task_duration"]]
+    run_mode = sys.argv[1]
+    for is_process, case_name, m, n, k, b in case_list:
+        if is_process == 1:
+            process_params = ProcessParams(case_name, m, n, k, b, is_perf, run_mode)
+            res_data = process_case(file_work_dir, process_params)
+            res_list.append(res_data)
+
+    timestamp = time.time()
+    result_file_name = "result_" + str(timestamp) + ".csv"
+    with open(os.path.join(file_work_dir, "output", result_file_name), 'w', newline='', encoding='utf-8') as csvfile:
+        writer = csv.writer(csvfile)
+        writer.writerows(res_list)
+
+    logging.info("---------------RESULT---------------")
+    for res in res_list:
+        logging.info(res)
+    return 0
+
+
+if __name__ == "__main__":
+    main()
diff --git a/examples/matrix/matmul_int4/scripts/gen_data.py b/examples/matrix/matmul_int4/scripts/gen_data.py
new file mode 100644
index 00000000..81a2687e
--- /dev/null
+++ b/examples/matrix/matmul_int4/scripts/gen_data.py
@@ -0,0 +1,100 @@
+#!/usr/bin/python3
+# coding=utf-8
+
+# Copyright (c) 2025 Huawei Technologies Co., Ltd.
+# This file is a part of the CANN Open Software.
+# Licensed under CANN Open Software License Agreement Version 1.0 (the "License").
+# Please refer to the License for details. You may not use this file except in compliance with the License.
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+# See LICENSE in the root of the software repository for the full text of the License.
+# ======================================================================================================================
+import os
+import sys
+import logging
+
+import numpy as np
+
+IS_OUTPUT_TXT = False
+
+
+class MatmulGenData:
+    def __init__(self, m, n, k, b, is_trans_a, is_trans_b, is_bias, data_type_str):
+        self.m = m
+        self.n = n
+        self.k = k
+        self.b = b
+        self.is_trans_a = is_trans_a
+        self.is_trans_b = is_trans_b
+        self.is_bias = is_bias
+        self.data_type_str = data_type_str
+
+
+    def gen_golden_data_int4_int32(self, work_dir, dst_type=np.int32):
+        src_type = np.int8 # two int4 element combine into one int8 element
+        x1_shape = [self.k, self.m] if self.is_trans_a else [self.m, self.k]
+        x2_shape = [self.n, self.k] if self.is_trans_b else [self.k, self.n]
+        # generate x1_gm
+        x1_gm = np.random.randint(-5, 5, x1_shape).astype(src_type)
+        m_size, k_size = x1_shape[0], x1_shape[1]
+        x1_gm_int4 = np.zeros(shape=[m_size, k_size // 2]).astype(src_type)
+        for i in range(m_size):
+            for j in range(k_size):
+                if j % 2 == 0:
+                    x1_gm_int4[i][j // 2] = (x1_gm[i][j + 1] << 4) + (x1_gm[i][j] & 0x0f)
+        # generate x2_gm
+        x2_gm = np.random.randint(-5, 5, x2_shape).astype(src_type)
+        k_size, n_size = x2_shape[0], x2_shape[1]
+        x2_gm_int4 = np.zeros(shape=[k_size, n_size // 2]).astype(src_type)
+        for i in range(k_size):
+            for j in range(n_size):
+                if j % 2 == 0:
+                    x2_gm_int4[i][j // 2] = (x2_gm[i][j + 1] << 4) + (x2_gm[i][j] & 0x0f)
+
+        if self.is_bias:
+            bias_gm = np.random.randint(-5, 5, [1, self.n]).astype(dst_type)
+            y_gm = (np.matmul(x1_gm.astype(dst_type), x2_gm.astype(dst_type))\
+                + bias_gm.astype(dst_type)).astype(dst_type)
+        else:
+            y_gm = np.matmul(x1_gm.astype(dst_type), x2_gm.astype(dst_type)).astype(dst_type)
+
+        x1_gm_int4.tofile(work_dir + "/input/x1_gm.bin")
+        x2_gm_int4.tofile(work_dir + "/input/x2_gm.bin")
+        y_gm.tofile(work_dir + "/output/golden.bin")
+        if self.is_bias:
+            bias_gm.tofile(work_dir + "/input/bias_gm.bin")
+
+        if IS_OUTPUT_TXT:
+            np.savetxt(work_dir + "/input/x1_gm.txt", x1_gm_int4.flatten(), fmt='%f', newline='\n')
+            np.savetxt(work_dir + "/input/x2_gm.txt", x2_gm_int4.flatten(), fmt='%f', newline='\n')
+            np.savetxt(work_dir + "/output/golden.txt", y_gm.flatten(), fmt='%f', newline='\n')
+            if self.is_bias:
+                np.savetxt(work_dir + "/input/bias_gm.txt", bias_gm.flatten(), fmt='%f', newline='\n')
+        return 0
+
+
+    def gen_golden_data(self, work_dir):
+        if self.data_type_str == "int4_int32":
+            self.gen_golden_data_int4_int32(work_dir)
+        else:
+            logging.info("[ERROR] can't support data type %s" % (self.data_type_str))
+            return -1
+        return 0
+
+
+    def gen_fake_golden_data(self, work_dir):
+        data_type_bytes_ab = 1 / 2  # int4
+        data_type_bytes_c = 4       # int32
+
+        file_byte = int(self.m * self.k * data_type_bytes_ab)
+        with open(work_dir + "/input/x1_gm.bin", 'wb') as file:
+            file.truncate(file_byte)
+
+        file_byte = int(self.k * self.n * data_type_bytes_ab)
+        with open(work_dir + "/input/x2_gm.bin", 'wb') as file:
+            file.truncate(file_byte)
+
+        if self.is_bias:
+            file_byte = 1 * self.n * data_type_bytes_c
+            with open(work_dir + "/input/bias_gm.bin", 'wb') as file:
+                file.truncate(file_byte)
diff --git a/examples/matrix/matmul_int4/testcase/case.csv b/examples/matrix/matmul_int4/testcase/case.csv
new file mode 100644
index 00000000..48b45b75
--- /dev/null
+++ b/examples/matrix/matmul_int4/testcase/case.csv
@@ -0,0 +1 @@
+1, case001, 256, 7680, 128
\ No newline at end of file
diff --git a/examples/readme.md b/examples/readme.md
index e64a3b46..cf59e3ac 100644
--- a/examples/readme.md
+++ b/examples/readme.md
@@ -36,7 +36,7 @@
         <td> 对输入tensor按行做如下公式的计算：zi = (xi - ∑(xi * yi)) * yi，其中∑为按行reduce求和。</td>
     </tr>
     <tr>
-        <th rowspan="30"><a href="./matrix"> matrix</th>
+        <th rowspan="32"><a href="./matrix"> matrix</th>
         <td><a href="./matrix/basic_block_matmul"> basic_block_matmul</td>
         <td> 实现无尾块且tiling的base块大小固定的场景下的Matmul矩阵乘法，计算公式为：C = A * B + Bias。 </td>
     </tr>
@@ -155,6 +155,14 @@
         <td><a href="./matrix/matmul_quant"> matmul_quant</td>
         <td> 实现MDL模板下反量化功能的Matmul矩阵乘法，计算公式为：C = A * B + Bias。 </td>
     </tr>
+    <tr>
+        <td><a href="./matrix/batch_matmul_bias_no_batch"> batch_matmul_bias_no_batch</td>
+        <td> 一次完成BatchNum个Matmul矩阵乘法，并复用同一个Bias矩阵，单次Matmul计算公式为：C = A * B + Bias。 </td>
+    </tr>
+    <tr>
+        <td><a href="./matrix/matmul_int4"> matmul_int4</td>
+        <td> 实现MDL模板下int4数据类型输入的Matmul矩阵乘法，计算公式为：C = A * B + Bias。 </td>
+    </tr>
     <tr>
         <th rowspan="6"><a href="./normalization"> normalization</th>
         <td><a href="./normalization/layernorm"> layernorm</td>
-- 
Gitee