diff --git a/operator/ascendc/0_introduction/22_mmad_profile_kernellaunch/MmadInvocation/CMakeLists.txt b/operator/ascendc/0_introduction/22_mmad_profile_kernellaunch/MmadInvocation/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..02cb22479252ae44d611635e3a603eae1aea316f --- /dev/null +++ b/operator/ascendc/0_introduction/22_mmad_profile_kernellaunch/MmadInvocation/CMakeLists.txt @@ -0,0 +1,57 @@ +cmake_minimum_required(VERSION 3.16) +project(Ascend_c) + +set(RUN_MODE "npu" CACHE STRING "cpu/sim/npu") +set(SOC_VERSION "Ascend310P3" CACHE STRING "system on chip type") +set(ASCEND_CANN_PACKAGE_PATH "/usr/local/Ascend/ascend-toolkit/latest" + CACHE STRING "ASCEND CANN package installation directory" +) +if(NOT CMAKE_BUILD_TYPE) + set(CMAKE_BUILD_TYPE "Debug" CACHE STRING "Build type Release/Debug (default Debug)" FORCE) +endif() +if(CMAKE_INSTALL_PREFIX STREQUAL /usr/local) + set(CMAKE_INSTALL_PREFIX "${CMAKE_CURRENT_LIST_DIR}/out" CACHE STRING "path for install()" FORCE) +endif() + +# ${KERNEL_FILES} are used to compile library, push files written by ascendc in ${KERNEL_FILES}. 
+# ref to cmake/npu.cmake ascendc_library, cmake/cpu.cmake add_library +file(GLOB KERNEL_FILES ${CMAKE_CURRENT_SOURCE_DIR}/mmad_custom.cpp) +set(CUSTOM_ASCEND310P_LIST "Ascend310P1" "Ascend310P3") + +if("${RUN_MODE}" STREQUAL "cpu") + include(cmake/cpu_lib.cmake) +elseif("${RUN_MODE}" STREQUAL "sim" OR "${RUN_MODE}" STREQUAL "npu") + include(cmake/npu_lib.cmake) +else() + message("invalid RUN_MODE: ${RUN_MODE}") +endif() + +add_executable(ascendc_kernels_bbit + ${CMAKE_CURRENT_SOURCE_DIR}/main.cpp +) + +target_compile_options(ascendc_kernels_bbit PRIVATE + $:-g>> + -O2 -std=c++17 -D_GLIBCXX_USE_CXX11_ABI=0 -Wall -Werror +) + +target_compile_definitions(ascendc_kernels_bbit PRIVATE + $<$>:CUSTOM_ASCEND310P> +) + +target_link_libraries(ascendc_kernels_bbit PRIVATE + $,$>:host_intf_pub>> + $:ascendcl>> + ascendc_kernels_${RUN_MODE} + tiling_api + register + platform + ascendalog + dl +) + +install(TARGETS ascendc_kernels_bbit + LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} + ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} + RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} +) diff --git a/operator/ascendc/0_introduction/22_mmad_profile_kernellaunch/MmadInvocation/README.md b/operator/ascendc/0_introduction/22_mmad_profile_kernellaunch/MmadInvocation/README.md new file mode 100644 index 0000000000000000000000000000000000000000..84477c6ef2141b8c0fe13562e42a91bdf540a8dc --- /dev/null +++ b/operator/ascendc/0_introduction/22_mmad_profile_kernellaunch/MmadInvocation/README.md @@ -0,0 +1,97 @@ +## 目录结构介绍 +``` +├── MmadInvocation +│ ├── cmake // 编译工程文件 +│ ├── scripts +│ │ ├── gen_data.py // 输入数据和真值数据生成脚本文件 +│ │ └── verify_result.py // 验证输出数据和真值数据是否一致的验证脚本 +│ ├── CMakeLists.txt // 编译工程文件 +│ ├── data_utils.h // 数据读入写出函数 +│ ├── main.cpp // 主函数,调用算子的应用程序,含CPU域及NPU域调用 +│ ├── mmad_custom_cube_only.h // Atlas A2训练系列产品kernel实现 +│ ├── mmad_custom.h // Atlas推理系列产品kernel实现 +│ ├── mmad_custom.cpp // 算子kernel实现 +│ └── run.sh // 编译运行算子的脚本 +``` + +## 算子规格描述 +在核函数直调样例中,算子实现支持的shape为:M = 32, N = 32, K = 
32。

+<table>
+<tr><th align="center">算子输入</th><th align="center">name</th><th align="center">shape</th><th align="center">data type</th><th align="center">format</th></tr>
+<tr><td align="center"></td><td align="center">a</td><td align="center">M * K</td><td align="center">float16</td><td align="center">ND</td></tr>
+<tr><td align="center"></td><td align="center">b</td><td align="center">K * N</td><td align="center">float16</td><td align="center">ND</td></tr>
+<tr><td align="center">算子输出</td><td align="center">c</td><td align="center">M * N</td><td align="center">float</td><td align="center">ND</td></tr>
+<tr><td align="center">核函数名</td><td align="center" colspan="4">mmad_custom</td></tr>
+</table>
+ +## 代码实现介绍 +本样例中实现的是[m, n, k]固定为[32, 32, 32]的Matmul算子,并使用Ascend C基础Api实现。 +- kernel实现 + Matmul算子的数学表达式为: + $$ + C = A * B + $$ + 其中A的形状为[32, 32], B的形状为[32, 32], C的形状为[32, 32]。具体请参考[mmad_custom.cpp](./mmad_custom.cpp)。 + + **注:当使用硬件分离架构的产品如Atlas A2训练系列产品/Atlas 800I A2推理产品时,由于样例使用的基础API均为Cube核指令,本样例设置了Cube Only模式,只调用Cube核完成计算,代码如下: + ```c++ + KERNEL_TASK_TYPE_DEFAULT(KERNEL_TYPE_AIC_ONLY); + ``` + +- 调用实现 + 1. CPU侧运行验证主要通过ICPU_RUN_KF CPU调测宏等CPU调测库提供的接口来完成; + + **注:当使用硬件分离架构的产品如Atlas A2训练系列产品/Atlas 800I A2推理产品时,Kernel中设置的Cube Only在NPU侧运行时可自动识别并只运行Cube核,若在CPU侧运行,需要额外设置KernelMode,只模拟Cube核实现,代码如下: + ```c++ + AscendC::SetKernelMode(KernelMode::AIC_MODE); + ``` + 2. NPU侧运行验证主要通过使用ACLRT_LAUNCH_KERNEL内核调用宏来完成。 + + 应用程序通过ASCENDC_CPU_DEBUG宏区分代码逻辑运行于CPU侧还是NPU侧。 + +## 运行样例算子 + - 打开样例目录 + 以命令行方式下载样例代码,master分支为例。 + ```bash + cd ${git_clone_path}/samples/operator/ascendc\0_introduction\20_mmad_kernellaunch/MmadInvocationNeo + ``` + - 配置环境变量 + + 请根据当前环境上CANN开发套件包的[安装方式](https://hiascend.com/document/redirect/CannCommunityInstSoftware),选择对应配置环境变量的命令。 + - 默认路径,root用户安装CANN软件包 + ```bash + export ASCEND_INSTALL_PATH=/usr/local/Ascend/ascend-toolkit/latest + ``` + - 默认路径,非root用户安装CANN软件包 + ```bash + export ASCEND_INSTALL_PATH=$HOME/Ascend/ascend-toolkit/latest + ``` + - 指定路径install_path,安装CANN软件包 + ```bash + export ASCEND_INSTALL_PATH=${install_path}/ascend-toolkit/latest + ``` + + - 样例执行 + + ```bash + bash run.sh -r [RUN_MODE] -v [SOC_VERSION] + ``` + - RUN_MODE:编译方式,可选择CPU调试,NPU仿真,NPU上板。支持参数为[cpu / sim / npu]。 + - SOC_VERSION:昇腾AI处理器型号,如果无法确定具体的[SOC_VERSION],则在安装昇腾AI处理器的服务器执行npu-smi info命令进行查询,在查询到的“Name”前增加Ascend信息,例如"Name"对应取值为xxxyy,实际配置的[SOC_VERSION]值为Ascendxxxyy。支持以下产品型号: + - Atlas 推理系列产品AI Core + - Atlas A2训练系列产品/Atlas 800I A2推理产品 + + 示例如下,Ascendxxxyy请替换为实际的AI处理器型号。 + + ```bash + bash run.sh -r cpu -v Ascendxxxyy + ``` + + +## 更新说明 +| 时间 | 更新事项 | +| ---------- | ------------ | +| 2024/11/20 | 更新本readme | diff --git 
a/operator/ascendc/0_introduction/22_mmad_profile_kernellaunch/MmadInvocation/cmake/cpu_lib.cmake b/operator/ascendc/0_introduction/22_mmad_profile_kernellaunch/MmadInvocation/cmake/cpu_lib.cmake new file mode 100644 index 0000000000000000000000000000000000000000..acb98ec9029d1b9f0adeaa6bba6263bc797b2ac9 --- /dev/null +++ b/operator/ascendc/0_introduction/22_mmad_profile_kernellaunch/MmadInvocation/cmake/cpu_lib.cmake @@ -0,0 +1,12 @@ +if(NOT DEFINED ENV{CMAKE_PREFIX_PATH}) + set(CMAKE_PREFIX_PATH ${ASCEND_CANN_PACKAGE_PATH}/tools/tikicpulib/lib/cmake) +endif() +find_package(tikicpulib REQUIRED) + +add_library(ascendc_kernels_${RUN_MODE} SHARED ${KERNEL_FILES}) +target_link_libraries(ascendc_kernels_${RUN_MODE} PUBLIC tikicpulib::${SOC_VERSION}) +target_compile_definitions(ascendc_kernels_${RUN_MODE} PRIVATE + $<$>:CUSTOM_ASCEND310P> +) +target_compile_options(ascendc_kernels_${RUN_MODE} PRIVATE -g -O0 -std=c++17) +install(TARGETS ascendc_kernels_${RUN_MODE} DESTINATION ${CMAKE_INSTALL_LIBDIR}) diff --git a/operator/ascendc/0_introduction/22_mmad_profile_kernellaunch/MmadInvocation/cmake/npu_lib.cmake b/operator/ascendc/0_introduction/22_mmad_profile_kernellaunch/MmadInvocation/cmake/npu_lib.cmake new file mode 100644 index 0000000000000000000000000000000000000000..3b8e2c506642513fcf1f8777a287ca0d19085b50 --- /dev/null +++ b/operator/ascendc/0_introduction/22_mmad_profile_kernellaunch/MmadInvocation/cmake/npu_lib.cmake @@ -0,0 +1,15 @@ +if(EXISTS ${ASCEND_CANN_PACKAGE_PATH}/compiler/tikcpp/ascendc_kernel_cmake) + set(ASCENDC_CMAKE_DIR ${ASCEND_CANN_PACKAGE_PATH}/compiler/tikcpp/ascendc_kernel_cmake) +elseif(EXISTS ${ASCEND_CANN_PACKAGE_PATH}/tools/tikcpp/ascendc_kernel_cmake) + set(ASCENDC_CMAKE_DIR ${ASCEND_CANN_PACKAGE_PATH}/tools/tikcpp/ascendc_kernel_cmake) +else() + message(FATAL_ERROR "ascendc_kernel_cmake does not exist ,please check whether the cann package is installed") +endif() +include(${ASCENDC_CMAKE_DIR}/ascendc.cmake) + +# ascendc_library use to add 
kernel file to generate ascendc library +ascendc_library(ascendc_kernels_${RUN_MODE} SHARED ${KERNEL_FILES}) + +ascendc_compile_definitions(ascendc_kernels_${RUN_MODE} PRIVATE + $<$>:CUSTOM_ASCEND310P> +) diff --git a/operator/ascendc/0_introduction/22_mmad_profile_kernellaunch/MmadInvocation/data_utils.h b/operator/ascendc/0_introduction/22_mmad_profile_kernellaunch/MmadInvocation/data_utils.h new file mode 100644 index 0000000000000000000000000000000000000000..b71d2aaf3083206f1fab03ffe42f41834078761c --- /dev/null +++ b/operator/ascendc/0_introduction/22_mmad_profile_kernellaunch/MmadInvocation/data_utils.h @@ -0,0 +1,203 @@ +/** + * @file data_utils.h + * + * Copyright (C) 2023-2024. Huawei Technologies Co., Ltd. All rights reserved. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + */ +#ifndef DATA_UTILS_H +#define DATA_UTILS_H +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include "acl/acl.h" + +typedef enum { + DT_UNDEFINED = -1, + FLOAT = 0, + HALF = 1, + INT8_T = 2, + INT32_T = 3, + UINT8_T = 4, + INT16_T = 6, + UINT16_T = 7, + UINT32_T = 8, + INT64_T = 9, + UINT64_T = 10, + DOUBLE = 11, + BOOL = 12, + STRING = 13, + COMPLEX64 = 16, + COMPLEX128 = 17, + BF16 = 27 +} printDataType; + +#define INFO_LOG(fmt, args...) fprintf(stdout, "[INFO] " fmt "\n", ##args) +#define WARN_LOG(fmt, args...) fprintf(stdout, "[WARN] " fmt "\n", ##args) +#define ERROR_LOG(fmt, args...) 
fprintf(stdout, "[ERROR] " fmt "\n", ##args) +#define CHECK_ACL(x) \ + do { \ + aclError __ret = x; \ + if (__ret != ACL_ERROR_NONE) { \ + std::cerr << __FILE__ << ":" << __LINE__ << " aclError:" << __ret << std::endl; \ + } \ + } while (0); + +/** + * @brief Read data from file + * @param [in] filePath: file path + * @param [out] fileSize: file size + * @return read result + */ +bool ReadFile(const std::string &filePath, size_t &fileSize, void *buffer, size_t bufferSize) +{ + struct stat sBuf; + int fileStatus = stat(filePath.data(), &sBuf); + if (fileStatus == -1) { + ERROR_LOG("failed to get file"); + return false; + } + if (S_ISREG(sBuf.st_mode) == 0) { + ERROR_LOG("%s is not a file, please enter a file", filePath.c_str()); + return false; + } + + std::ifstream file; + file.open(filePath, std::ios::binary); + if (!file.is_open()) { + ERROR_LOG("Open file failed. path = %s", filePath.c_str()); + return false; + } + + std::filebuf *buf = file.rdbuf(); + size_t size = buf->pubseekoff(0, std::ios::end, std::ios::in); + if (size == 0) { + ERROR_LOG("file size is 0"); + file.close(); + return false; + } + if (size > bufferSize) { + ERROR_LOG("file size is larger than buffer size"); + file.close(); + return false; + } + buf->pubseekpos(0, std::ios::in); + buf->sgetn(static_cast(buffer), size); + fileSize = size; + file.close(); + return true; +} + +/** + * @brief Write data to file + * @param [in] filePath: file path + * @param [in] buffer: data to write to file + * @param [in] size: size to write + * @return write result + */ +bool WriteFile(const std::string &filePath, const void *buffer, size_t size) +{ + if (buffer == nullptr) { + ERROR_LOG("Write file failed. buffer is nullptr"); + return false; + } + + int fd = open(filePath.c_str(), O_RDWR | O_CREAT | O_TRUNC, S_IRUSR | S_IWRITE); + if (fd < 0) { + ERROR_LOG("Open file failed. 
path = %s", filePath.c_str()); + return false; + } + + size_t writeSize = write(fd, buffer, size); + (void)close(fd); + if (writeSize != size) { + ERROR_LOG("Write file Failed."); + return false; + } + + return true; +} + +template void DoPrintData(const T *data, size_t count, size_t elementsPerRow) +{ + assert(elementsPerRow != 0); + for (size_t i = 0; i < count; ++i) { + std::cout << std::setw(10) << data[i]; + if (i % elementsPerRow == elementsPerRow - 1) { + std::cout << std::endl; + } + } +} + +void DoPrintHalfData(const aclFloat16 *data, size_t count, size_t elementsPerRow) +{ + assert(elementsPerRow != 0); + for (size_t i = 0; i < count; ++i) { + std::cout << std::setw(10) << std::setprecision(6) << aclFloat16ToFloat(data[i]); + if (i % elementsPerRow == elementsPerRow - 1) { + std::cout << std::endl; + } + } +} + +void PrintData(const void *data, size_t count, printDataType dataType, size_t elementsPerRow = 16) +{ + if (data == nullptr) { + ERROR_LOG("Print data failed. data is nullptr"); + return; + } + + switch (dataType) { + case BOOL: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case INT8_T: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case UINT8_T: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case INT16_T: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case UINT16_T: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case INT32_T: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case UINT32_T: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case INT64_T: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case UINT64_T: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case HALF: + DoPrintHalfData(reinterpret_cast(data), count, elementsPerRow); + break; + case FLOAT: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + 
break; + case DOUBLE: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + default: + ERROR_LOG("Unsupported type: %d", dataType); + } + std::cout << std::endl; +} +#endif // DATA_UTILS_H diff --git a/operator/ascendc/0_introduction/22_mmad_profile_kernellaunch/MmadInvocation/main.cpp b/operator/ascendc/0_introduction/22_mmad_profile_kernellaunch/MmadInvocation/main.cpp new file mode 100644 index 0000000000000000000000000000000000000000..ad10e44f5d0384891ce71197a5c7ed48050ed120 --- /dev/null +++ b/operator/ascendc/0_introduction/22_mmad_profile_kernellaunch/MmadInvocation/main.cpp @@ -0,0 +1,89 @@ +/** + * @file main.cpp + * + * Copyright (C) 2023-2024. Huawei Technologies Co., Ltd. All rights reserved. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + */ +#include "data_utils.h" +#ifndef ASCENDC_CPU_DEBUG +#include "acl/acl.h" +#include "aclrtlaunch_mmad_custom.h" +#else +#include "tikicpulib.h" +extern "C" void mmad_custom(uint8_t *a, uint8_t *b, uint8_t *c); +#endif + +int32_t main(int32_t argc, char *argv[]) +{ + uint32_t M = 128; + uint32_t N = 128; + uint32_t K = 128; + size_t aFileSize = M * K * sizeof(int16_t); // uint16_t represent half + size_t bFileSize = K * N * sizeof(int16_t); // uint16_t represent half + size_t cFileSize = M * N * sizeof(float); + uint32_t blockDim = 1; + +#ifdef ASCENDC_CPU_DEBUG + AscendC::SetKernelMode(KernelMode::AIC_MODE); + uint8_t *a = (uint8_t *)AscendC::GmAlloc(aFileSize); + uint8_t *b = (uint8_t *)AscendC::GmAlloc(bFileSize); + uint8_t *c = (uint8_t *)AscendC::GmAlloc(cFileSize); + + ReadFile("./input/x1_gm.bin", aFileSize, a, aFileSize); + ReadFile("./input/x2_gm.bin", bFileSize, b, bFileSize); + + ICPU_RUN_KF(mmad_custom, blockDim, a, b, c); + + WriteFile("./output/output.bin", c, cFileSize); + + AscendC::GmFree((void *)a); + AscendC::GmFree((void *)b); + 
AscendC::GmFree((void *)c); +#else + CHECK_ACL(aclInit(nullptr)); + int32_t deviceId = 0; + CHECK_ACL(aclrtSetDevice(deviceId)); + aclrtStream stream = nullptr; + CHECK_ACL(aclrtCreateStream(&stream)); + + uint8_t *aHost; + uint8_t *aDevice; + CHECK_ACL(aclrtMallocHost((void **)(&aHost), aFileSize)); + CHECK_ACL(aclrtMalloc((void **)&aDevice, aFileSize, ACL_MEM_MALLOC_HUGE_FIRST)); + ReadFile("./input/x1_gm.bin", aFileSize, aHost, aFileSize); + CHECK_ACL(aclrtMemcpy(aDevice, aFileSize, aHost, aFileSize, ACL_MEMCPY_HOST_TO_DEVICE)); + + uint8_t *bHost; + uint8_t *bDevice; + CHECK_ACL(aclrtMallocHost((void **)(&bHost), bFileSize)); + CHECK_ACL(aclrtMalloc((void **)&bDevice, bFileSize, ACL_MEM_MALLOC_HUGE_FIRST)); + ReadFile("./input/x2_gm.bin", bFileSize, bHost, bFileSize); + CHECK_ACL(aclrtMemcpy(bDevice, bFileSize, bHost, bFileSize, ACL_MEMCPY_HOST_TO_DEVICE)); + + uint8_t *cHost; + uint8_t *cDevice; + CHECK_ACL(aclrtMallocHost((void **)(&cHost), cFileSize)); + CHECK_ACL(aclrtMalloc((void **)&cDevice, cFileSize, ACL_MEM_MALLOC_HUGE_FIRST)); + + ACLRT_LAUNCH_KERNEL(mmad_custom)(blockDim, stream, aDevice, bDevice, cDevice); + CHECK_ACL(aclrtSynchronizeStream(stream)); + + CHECK_ACL(aclrtMemcpy(cHost, cFileSize, cDevice, cFileSize, ACL_MEMCPY_DEVICE_TO_HOST)); + WriteFile("./output/output.bin", cHost, cFileSize); + + CHECK_ACL(aclrtFree(aDevice)); + CHECK_ACL(aclrtFreeHost(aHost)); + CHECK_ACL(aclrtFree(bDevice)); + CHECK_ACL(aclrtFreeHost(bHost)); + CHECK_ACL(aclrtFree(cDevice)); + CHECK_ACL(aclrtFreeHost(cHost)); + + CHECK_ACL(aclrtDestroyStream(stream)); + CHECK_ACL(aclrtResetDevice(deviceId)); + CHECK_ACL(aclFinalize()); +#endif + return 0; +} diff --git a/operator/ascendc/0_introduction/22_mmad_profile_kernellaunch/MmadInvocation/mmad_custom.cpp b/operator/ascendc/0_introduction/22_mmad_profile_kernellaunch/MmadInvocation/mmad_custom.cpp new file mode 100644 index 0000000000000000000000000000000000000000..8eb7eb91d51eb69b2f225a1f870ac23685224b26 --- /dev/null +++ 
b/operator/ascendc/0_introduction/22_mmad_profile_kernellaunch/MmadInvocation/mmad_custom.cpp @@ -0,0 +1,17 @@ +/** + * @file mmad_custom.cpp + * + * Copyright (C) 2023-2024. Huawei Technologies Co., Ltd. All rights reserved. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + */ +#include "mmad_custom.h" + +extern "C" __global__ __aicore__ void mmad_custom(GM_ADDR a, GM_ADDR b, GM_ADDR c) +{ + KernelMmad op; + op.Init(a, b, c); + op.Process(); +} diff --git a/operator/ascendc/0_introduction/22_mmad_profile_kernellaunch/MmadInvocation/mmad_custom.h b/operator/ascendc/0_introduction/22_mmad_profile_kernellaunch/MmadInvocation/mmad_custom.h new file mode 100644 index 0000000000000000000000000000000000000000..bce32f8042d11536fd07c7e9d249e21f4d66e63f --- /dev/null +++ b/operator/ascendc/0_introduction/22_mmad_profile_kernellaunch/MmadInvocation/mmad_custom.h @@ -0,0 +1,194 @@ +/* + * Copyright (c) Huawei Technologies Co., Ltd. 2022-2023. All rights reserved. + * + * This sample is a very basic sample that implements Mmad on Ascend plaform. 
+ */ +#include "kernel_operator.h" +class KernelMmad { +public: + __aicore__ inline KernelMmad() + { + aSize = m * k; + bSize = k * n; + cSize = m * n; + mBlocks = m / 16; + nBlocks = n / 16; + kBlocks = k / 16; + } + __aicore__ inline void Init(GM_ADDR a, GM_ADDR b, GM_ADDR c) + { + xGm.SetGlobalBuffer((__gm__ half*)a); + yGm.SetGlobalBuffer((__gm__ half*)b); + zGm.SetGlobalBuffer((__gm__ float*)c); + pipe.InitBuffer(inQueueA1, 1, aSize * sizeof(half)); + pipe.InitBuffer(inQueueA2, 1, aSize * sizeof(half)); + pipe.InitBuffer(inQueueB1, 1, bSize * sizeof(half)); + pipe.InitBuffer(inQueueB2, 1, bSize * sizeof(half)); + pipe.InitBuffer(outQueueC, 1, cSize * sizeof(float)); + } + __aicore__ inline void Process() + { + CopyIn(); + SplitA(); + SplitB(); + Compute(); + Copyout(); + } + +private: + __aicore__ inline void CopyIn() + { + AscendC::LocalTensor a1Local = inQueueA1.AllocTensor(); + AscendC::LocalTensor b1Local = inQueueB1.AllocTensor(); + + AscendC::Nd2NzParams dataCopyA1Params; + dataCopyA1Params.ndNum = aSize/16/co; + dataCopyA1Params.nValue = m; + dataCopyA1Params.dValue = k; + dataCopyA1Params.srcNdMatrixStride = 0; + dataCopyA1Params.srcDValue = k; + dataCopyA1Params.dstNzC0Stride = m; + dataCopyA1Params.dstNzNStride = 1; + dataCopyA1Params.dstNzMatrixStride = 0; + + AscendC::PipeBarrier(); + systemCycleBefore = AscendC::GetSystemCycle(); + AscendC::DataCopy(a1Local, xGm, dataCopyA1Params); // nd -> nz + AscendC::PipeBarrier(); + systemCycleAfter = AscendC::GetSystemCycle(); + AscendC::printf("mte2 A systemCycle is %d\n", systemCycleAfter - systemCycleBefore); + AscendC::PipeBarrier(); + + AscendC::Nd2NzParams dataCopyB1Params; + dataCopyB1Params.ndNum = bSize/16/co; + dataCopyB1Params.nValue = k; + dataCopyB1Params.dValue = n; + dataCopyB1Params.srcNdMatrixStride = 0; + dataCopyB1Params.srcDValue = n; + dataCopyB1Params.dstNzC0Stride = k; + dataCopyB1Params.dstNzNStride = 1; + dataCopyB1Params.dstNzMatrixStride = 0; + + AscendC::PipeBarrier(); + 
systemCycleBefore = AscendC::GetSystemCycle(); + AscendC::DataCopy(b1Local, yGm, dataCopyB1Params); // nd -> nz + AscendC::PipeBarrier(); + systemCycleAfter = AscendC::GetSystemCycle(); + AscendC::printf("mte2 B systemCycle is %d\n", systemCycleAfter - systemCycleBefore); + AscendC::PipeBarrier(); + + inQueueA1.EnQue(a1Local); + inQueueB1.EnQue(b1Local); + } + __aicore__ inline void SplitA() + { + AscendC::LocalTensor a1Local = inQueueA1.DeQue(); + AscendC::LocalTensor a2Local = inQueueA2.AllocTensor(); + AscendC::LoadData2dParams loadL0AParams; + loadL0AParams.repeatTimes = m / 16; + loadL0AParams.srcStride = 1; + loadL0AParams.dstGap = k / co - 1; + loadL0AParams.ifTranspose = false; + + AscendC::PipeBarrier(); + systemCycleBefore = AscendC::GetSystemCycle(); + for (int i = 0; i < k/co; i++) { + AscendC::LoadData(a2Local[i * 16 * co], a1Local[i * m *co], loadL0AParams); + } + AscendC::PipeBarrier(); + systemCycleAfter = AscendC::GetSystemCycle(); + AscendC::printf("mte1 A systemCycle is %d\n", systemCycleAfter - systemCycleBefore); + AscendC::PipeBarrier(); + inQueueA2.EnQue(a2Local); + inQueueA1.FreeTensor(a1Local); + } + __aicore__ inline void SplitB() + { + AscendC::LocalTensor b1Local = inQueueB1.DeQue(); + AscendC::LocalTensor b2Local = inQueueB2.AllocTensor(); + + AscendC::LoadData2dTransposeParams loadDataParams; + uint16_t n_block = 16; + loadDataParams.startIndex = 0; + loadDataParams.repeatTimes = k / 16; + loadDataParams.srcStride = 1; + loadDataParams.dstGap = n / n_block - 1; + loadDataParams.dstFracGap = 0; + + AscendC::PipeBarrier(); + systemCycleBefore = AscendC::GetSystemCycle(); + for (int i = 0; i < n/n_block; ++i) { + AscendC::LoadDataWithTranspose(b2Local[i*16*n_block], b1Local[i * k * n_block], loadDataParams); + } + AscendC::PipeBarrier(); + systemCycleAfter = AscendC::GetSystemCycle(); + AscendC::printf("mte1 B systemCycle is %d\n", systemCycleAfter - systemCycleBefore); + AscendC::PipeBarrier(); + inQueueB2.EnQue(b2Local); + 
inQueueB1.FreeTensor(b1Local); + } + __aicore__ inline void Compute() { + AscendC::LocalTensor a2Local = inQueueA2.DeQue(); + AscendC::LocalTensor b2Local = inQueueB2.DeQue(); + AscendC::LocalTensor cLocal = outQueueC.AllocTensor(); + AscendC::MmadParams mmadParams; + mmadParams.m = m; + mmadParams.n = n; + mmadParams.k = k; + + AscendC::PipeBarrier(); + systemCycleBefore = AscendC::GetSystemCycle(); + for (int loop = 0; loop < 100; loop++) { + AscendC::Mmad(cLocal, a2Local, b2Local, mmadParams); + } + AscendC::PipeBarrier(); + systemCycleAfter = AscendC::GetSystemCycle(); + AscendC::printf("mmad *100 systemCycle is %d\n", systemCycleAfter - systemCycleBefore); + outQueueC.EnQue(cLocal); + inQueueA2.FreeTensor(a2Local); + inQueueB2.FreeTensor(b2Local); + } + __aicore__ inline void Copyout() + { + AscendC::LocalTensor cLocal = outQueueC.DeQue(); + AscendC::FixpipeParams fixpipeParams; + + fixpipeParams.cburstNum = n; + fixpipeParams.burstLen = m * 16 * sizeof(float) / 32; + fixpipeParams.srcStride = 0; + fixpipeParams.dstStride = n; + + fixpipeParams.nz2ndParams.nz2ndEn = true; + fixpipeParams.nz2ndParams.ndNum = cSize / 16 * sizeof(float) / 32; + fixpipeParams.nz2ndParams.srcNdStride = 0; + fixpipeParams.nz2ndParams.dstNdStride = 0; + fixpipeParams.nz2ndParams.originalNSize = n; + + AscendC::PipeBarrier(); + systemCycleBefore = AscendC::GetSystemCycle(); + AscendC::Fixpipe(zGm, cLocal, fixpipeParams); + AscendC::PipeBarrier(); + systemCycleAfter = AscendC::GetSystemCycle(); + AscendC::printf("fixpipe systemCycle is %d\n", systemCycleAfter - systemCycleBefore); + outQueueC.FreeTensor(cLocal); + } + +private: + AscendC::TPipe pipe; + AscendC::TQue inQueueA1; + AscendC::TQue inQueueA2; + AscendC::TQue inQueueB1; + AscendC::TQue inQueueB2; + AscendC::TQue outQueueC; + AscendC::GlobalTensor xGm; + AscendC::GlobalTensor yGm; + AscendC::GlobalTensor zGm; + uint16_t m = 128; + uint16_t n = 128; + uint16_t k = 128; + uint16_t co = 16; + uint32_t aSize, bSize, cSize; + 
uint32_t mBlocks, nBlocks, kBlocks; + int64_t systemCycleBefore; + int64_t systemCycleAfter; +}; diff --git a/operator/ascendc/0_introduction/22_mmad_profile_kernellaunch/MmadInvocation/run.sh b/operator/ascendc/0_introduction/22_mmad_profile_kernellaunch/MmadInvocation/run.sh new file mode 100644 index 0000000000000000000000000000000000000000..0c9c7f40bde7b2a0f93a1fd309f1bc20fc553682 --- /dev/null +++ b/operator/ascendc/0_introduction/22_mmad_profile_kernellaunch/MmadInvocation/run.sh @@ -0,0 +1,121 @@ +#!/bin/bash +CURRENT_DIR=$( + cd $(dirname ${BASH_SOURCE:-$0}) + pwd +) + +BUILD_TYPE="Debug" +INSTALL_PREFIX="${CURRENT_DIR}/out" + +SHORT=r:,v:,i:,b:,p:, +LONG=run-mode:,soc-version:,install-path:,build-type:,install-prefix:, +OPTS=$(getopt -a --options $SHORT --longoptions $LONG -- "$@") +eval set -- "$OPTS" + +while :; do + case "$1" in + -r | --run-mode) + RUN_MODE="$2" + shift 2 + ;; + -v | --soc-version) + SOC_VERSION="$2" + shift 2 + ;; + -i | --install-path) + ASCEND_INSTALL_PATH="$2" + shift 2 + ;; + -b | --build-type) + BUILD_TYPE="$2" + shift 2 + ;; + -p | --install-prefix) + INSTALL_PREFIX="$2" + shift 2 + ;; + --) + shift + break + ;; + *) + echo "[ERROR] Unexpected option: $1" + break + ;; + esac +done + +RUN_MODE_LIST="cpu sim npu" +if [[ " $RUN_MODE_LIST " != *" $RUN_MODE "* ]]; then + echo "ERROR: RUN_MODE error, This sample only support specify cpu, sim or npu!" 
+ exit -1 +fi + +VERSION_LIST="Ascend310P1 Ascend310P3 Ascend910B1 Ascend910B2 Ascend910B3 Ascend910B4" +if [[ " $VERSION_LIST " != *" $SOC_VERSION "* ]]; then + echo "ERROR: SOC_VERSION should be in [$VERSION_LIST]" + exit -1 +fi + +if [ -n "$ASCEND_INSTALL_PATH" ]; then + _ASCEND_INSTALL_PATH=$ASCEND_INSTALL_PATH +elif [ -n "$ASCEND_HOME_PATH" ]; then + _ASCEND_INSTALL_PATH=$ASCEND_HOME_PATH +else + if [ -d "$HOME/Ascend/ascend-toolkit/latest" ]; then + _ASCEND_INSTALL_PATH=$HOME/Ascend/ascend-toolkit/latest + else + _ASCEND_INSTALL_PATH=/usr/local/Ascend/ascend-toolkit/latest + fi +fi + +export ASCEND_TOOLKIT_HOME=${_ASCEND_INSTALL_PATH} +export ASCEND_HOME_PATH=${_ASCEND_INSTALL_PATH} +source ${_ASCEND_INSTALL_PATH}/bin/setenv.bash +if [ "${RUN_MODE}" = "sim" ]; then + # in case of running op in simulator, use stub .so instead + export LD_LIBRARY_PATH=${_ASCEND_INSTALL_PATH}/tools/simulator/${SOC_VERSION}/lib:$LD_LIBRARY_PATH + if [ ! $CAMODEL_LOG_PATH ]; then + export CAMODEL_LOG_PATH=$(pwd)/sim_log + fi + if [ -d "$CAMODEL_LOG_PATH" ]; then + rm -rf $CAMODEL_LOG_PATH + fi + mkdir -p $CAMODEL_LOG_PATH +elif [ "${RUN_MODE}" = "cpu" ]; then + export LD_LIBRARY_PATH=${_ASCEND_INSTALL_PATH}/tools/tikicpulib/lib:${_ASCEND_INSTALL_PATH}/tools/tikicpulib/lib/${SOC_VERSION}:${_ASCEND_INSTALL_PATH}/tools/simulator/${SOC_VERSION}/lib:$LD_LIBRARY_PATH +fi + +set -e +rm -rf build out +mkdir -p build +cmake -B build \ + -DRUN_MODE=${RUN_MODE} \ + -DSOC_VERSION=${SOC_VERSION} \ + -DCMAKE_BUILD_TYPE=${BUILD_TYPE} \ + -DCMAKE_INSTALL_PREFIX=${INSTALL_PREFIX} \ + -DASCEND_CANN_PACKAGE_PATH=${_ASCEND_INSTALL_PATH} +cmake --build build -j +cmake --install build + +rm -f ascendc_kernels_bbit +cp ./out/bin/ascendc_kernels_bbit ./ +rm -rf input output +mkdir -p input output +python3 scripts/gen_data.py +( + export LD_LIBRARY_PATH=$(pwd)/out/lib:$(pwd)/out/lib64:${_ASCEND_INSTALL_PATH}/lib64:$LD_LIBRARY_PATH + if [[ "$RUN_WITH_TOOLCHAIN" -eq 1 ]]; then + if [ "${RUN_MODE}" = "npu" 
]; then + msprof op --application=./ascendc_kernels_bbit + elif [ "${RUN_MODE}" = "sim" ]; then + msprof op simulator --application=./ascendc_kernels_bbit + elif [ "${RUN_MODE}" = "cpu" ]; then + ./ascendc_kernels_bbit + fi + else + ./ascendc_kernels_bbit + fi +) +md5sum output/*.bin +python3 scripts/verify_result.py output/output.bin output/golden.bin diff --git a/operator/ascendc/0_introduction/22_mmad_profile_kernellaunch/MmadInvocation/scripts/gen_data.py b/operator/ascendc/0_introduction/22_mmad_profile_kernellaunch/MmadInvocation/scripts/gen_data.py new file mode 100644 index 0000000000000000000000000000000000000000..191bcc1d9c6383fe4425a52e3a1691a7cbc69849 --- /dev/null +++ b/operator/ascendc/0_introduction/22_mmad_profile_kernellaunch/MmadInvocation/scripts/gen_data.py @@ -0,0 +1,31 @@ +#!/usr/bin/python3 +# coding=utf-8 +# +# Copyright (C) 2023-2024. Huawei Technologies Co., Ltd. All rights reserved. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
+# =============================================================================== + +import numpy as np +import os + + +def gen_golden_data(): + M = 128 + N = 128 + K = 128 + + x1_gm = np.random.randint(1, 10, [M, K]).astype(np.float16) + x2_gm = np.random.randint(1, 10, [K, N]).astype(np.float16) + golden = (np.matmul(x1_gm.astype(np.float32), x2_gm.astype(np.float32))).astype(np.float32) + os.system("mkdir -p input") + os.system("mkdir -p output") + x1_gm.tofile("./input/x1_gm.bin") + x2_gm.tofile("./input/x2_gm.bin") + golden.tofile("./output/golden.bin") + + +if __name__ == "__main__": + gen_golden_data() diff --git a/operator/ascendc/0_introduction/22_mmad_profile_kernellaunch/MmadInvocation/scripts/verify_result.py b/operator/ascendc/0_introduction/22_mmad_profile_kernellaunch/MmadInvocation/scripts/verify_result.py new file mode 100644 index 0000000000000000000000000000000000000000..a325cfcc61f46f7de04ef6c979f852adcf213ef3 --- /dev/null +++ b/operator/ascendc/0_introduction/22_mmad_profile_kernellaunch/MmadInvocation/scripts/verify_result.py @@ -0,0 +1,53 @@ +#!/usr/bin/python3 +# coding=utf-8 +# +# Copyright (C) 2023-2024. Huawei Technologies Co., Ltd. All rights reserved. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
+# =============================================================================== + +import sys +import numpy as np + +# for float32 +relative_tol = 1e-6 +absolute_tol = 1e-9 +error_tol = 1e-4 + + +def verify_result(output, golden): + output = np.fromfile(output, dtype=np.float32).reshape(-1) + golden = np.fromfile(golden, dtype=np.float32).reshape(-1) + different_element_results = np.isclose(output, + golden, + rtol=relative_tol, + atol=absolute_tol, + equal_nan=True) + different_element_indexes = np.where(different_element_results == False)[0] + for index in range(len(different_element_indexes)): + real_index = different_element_indexes[index] + golden_data = golden[real_index] + output_data = output[real_index] + print( + "data index: %06d, expected: %-.9f, actual: %-.9f, rdiff: %-.6f" % + (real_index, golden_data, output_data, + abs(output_data - golden_data) / golden_data)) + if index == 100: + break + error_ratio = float(different_element_indexes.size) / golden.size + print("error ratio: %.4f, tolrence: %.4f" % (error_ratio, error_tol)) + return error_ratio <= error_tol + + +if __name__ == '__main__': + try: + res = verify_result(sys.argv[1], sys.argv[2]) + if not res: + raise ValueError("[ERROR] result error") + else: + print("test pass") + except Exception as e: + print(e) + sys.exit(1) diff --git a/operator/ascendc/0_introduction/22_mmad_profile_kernellaunch/README.md b/operator/ascendc/0_introduction/22_mmad_profile_kernellaunch/README.md new file mode 100644 index 0000000000000000000000000000000000000000..5a0e0840551f9258035a95fd00eb4ebd3bdd69c4 --- /dev/null +++ b/operator/ascendc/0_introduction/22_mmad_profile_kernellaunch/README.md @@ -0,0 +1,45 @@ +## 概述 +本样例介绍基于基础API的matmul算子实现及核函数直调方法。 + +## 目录结构介绍 +``` +└── 20_mmad_kernellaunch // 使用核函数直调的方式调用Matmul自定义算子。 + └── MmadInvocation // Kernel Launch方式调用核函数样例。 + └── MmadBiasInvocation // Kernel Launch方式调用核函数样例,新增bias输入。 +``` + +## 算子描述 +算子使用基础API包括DataCopy、LoadData、Mmad等,实现Matmul矩阵乘功能。 + +Matmul的计算公式为: + 
+``` +C = A * B + Bias +``` + +- A、B为源操作数,A为左矩阵,形状为\[M, K];B为右矩阵,形状为\[K, N]。 +- C为目的操作数,存放矩阵乘结果的矩阵,形状为\[M, N]。 +- Bias为矩阵乘偏置,形状为\[N]。对A*B结果矩阵的每一行都采用该Bias进行偏置。 + +## 支持的产品型号 +本样例支持如下产品型号: +- Atlas 推理系列产品 +- Atlas A2训练系列产品/Atlas 800I A2推理产品 + +## 编译运行样例算子 + +### 1. 获取源码包 +编译运行此样例前,请参考[准备:获取样例代码](../README.md#codeready)获取源码包。 + +### 2. 编译运行样例工程 +- [MmadBiasInvocation样例运行](./MmadBiasInvocation/README.md) +- [MmadInvocation样例运行](./MmadInvocation/README.md) + +## 更新说明 +| 时间 | 更新事项 | +| ---------- | ------------------------ | +| 2024/11/20 | 新增readme | + +## 已知issue + + 暂无