diff --git a/operator/AddCustomSample/KernelLaunch/AddKernelInvocationAcl/CMakeLists.txt b/operator/AddCustomSample/KernelLaunch/AddKernelInvocationAcl/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..3da49ce40ed150fa6aa4456d69246e5cbec7139d --- /dev/null +++ b/operator/AddCustomSample/KernelLaunch/AddKernelInvocationAcl/CMakeLists.txt @@ -0,0 +1,44 @@ +cmake_minimum_required(VERSION 3.16) +project(Ascend_c) + +set(RUN_MODE "npu" CACHE STRING "cpu/sim/npu") +set(SOC_VERSION "Ascend310P3" CACHE STRING "system on chip type") +set(ASCEND_CANN_PACKAGE_PATH "/usr/local/Ascend/ascend-toolkit/latest" + CACHE STRING "ASCEND CANN package installation directory" +) +if(NOT CMAKE_BUILD_TYPE) + set(CMAKE_BUILD_TYPE "Debug" CACHE STRING "Build type Release/Debug (default Debug)" FORCE) +endif() +if(CMAKE_INSTALL_PREFIX STREQUAL /usr/local) + set(CMAKE_INSTALL_PREFIX "${CMAKE_CURRENT_LIST_DIR}/out" CACHE STRING "path for install()" FORCE) +endif() + +file(GLOB KERNEL_FILES ${CMAKE_CURRENT_SOURCE_DIR}/add_custom.cpp) + +if("${RUN_MODE}" STREQUAL "cpu") + include(cmake/cpu_lib.cmake) +elseif("${RUN_MODE}" STREQUAL "sim" OR "${RUN_MODE}" STREQUAL "npu") + include(cmake/npu_lib.cmake) +else() + message("invalid RUN_MODE: ${RUN_MODE}") +endif() +add_executable(ascendc_kernels_bbit ${CMAKE_CURRENT_SOURCE_DIR}/main.cpp) + +target_compile_options(ascendc_kernels_bbit PRIVATE + $:-g>> + -O2 -std=c++17 -D_GLIBCXX_USE_CXX11_ABI=0 -Wall -Werror +) +set(RUN_FATBIN "ON" CACHE STRING "ON/OFF") +target_link_libraries(ascendc_kernels_bbit PRIVATE + $,$>:host_intf_pub>> + $:tikicpulib::${SOC_VERSION}>> + $,$>:ascendcl>> + $:c_sec>> + $<$:ascendc_kernels> +) + +install(TARGETS ascendc_kernels_bbit + LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} + ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} + RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} +) diff --git a/operator/AddCustomSample/KernelLaunch/AddKernelInvocationAcl/README.md b/operator/AddCustomSample/KernelLaunch/AddKernelInvocationAcl/README.md new file mode 100644 index 0000000000000000000000000000000000000000..e33acad20ea27048cfaf8b72877d5f3a3af5d731 --- /dev/null +++ b/operator/AddCustomSample/KernelLaunch/AddKernelInvocationAcl/README.md @@ -0,0 +1,71 @@ +## 目录结构介绍 +``` +├── AddKernelInvocationAcl +│ ├── cmake // 编译工程文件 +│ ├── scripts +│ │ ├── gen_data.py // 输入数据和真值数据生成脚本 +│ │ └── verify_result.py // 验证输出数据和真值数据是否一致的验证脚本 +│ ├── add_custom.cpp // 算子kernel实现 +│ ├── CMakeLists.txt // 编译工程文件 +│ ├── data_utils.h // 数据读入写出函数 +│ ├── main.cpp // 主函数,调用算子的应用程序,含CPU域及NPU域调用 +│ └── run.sh // 编译运行算子的脚本 +``` +## 代码实现介绍 +本调用样例中实现的是固定shape为8*2048的Add算子。 +- kernel实现 + Add算子的数学表达式为: + ``` + z = x + y + ``` + 计算逻辑是:Ascend C提供的矢量计算接口的操作元素都为LocalTensor,输入数据需要先搬运进片上存储,然后使用计算接口完成两个输入参数相加,得到最终结果,再搬出到外部存储上。 + + Add算子的实现流程分为3个基本任务:CopyIn,Compute,CopyOut。CopyIn任务负责将Global Memory上的输入Tensor xGm和yGm搬运到Local Memory,分别存储在xLocal、yLocal,Compute任务负责对xLocal、yLocal执行加法操作,计算结果存储在zLocal中,CopyOut任务负责将输出数据从zLocal搬运至Global Memory上的输出Tensor zGm中。具体请参考[add_custom.cpp](./add_custom.cpp)。 + +- 调用实现 + 1. CPU侧运行验证主要通过ICPU_RUN_KF CPU调测宏等CPU调测库提供的接口来完成; + 2. 
NPU侧运行验证主要通过使用aclrtLaunchKernel函数调用来完成。 + + 应用程序通过ASCENDC_CPU_DEBUG 宏区分代码逻辑运行于CPU侧还是NPU侧。 + +## 运行样例算子 + - 打开样例目录 + 以命令行方式下载样例代码,master分支为例。 + ```bash + cd ${git_clone_path}/samples/operator/AddCustomSample/KernelLaunch/AddKernelInvocationAcl + ``` + - 配置环境变量 + + 请根据当前环境上CANN开发套件包的[安装方式](https://hiascend.com/document/redirect/CannCommunityInstSoftware),选择对应配置环境变量的命令。 + - 默认路径,root用户安装CANN软件包 + ```bash + export ASCEND_INSTALL_PATH=/usr/local/Ascend/ascend-toolkit/latest + ``` + - 默认路径,非root用户安装CANN软件包 + ```bash + export ASCEND_INSTALL_PATH=$HOME/Ascend/ascend-toolkit/latest + ``` + - 指定路径install_path,安装CANN软件包 + ```bash + export ASCEND_INSTALL_PATH=${install_path}/ascend-toolkit/latest + ``` + + - 样例执行 + + ```bash + bash run.sh -r [RUN_MODE] -v [SOC_VERSION] + ``` + - RUN_MODE:编译方式,可选择CPU调试,NPU上板。支持参数为[cpu / npu] + - SOC_VERSION:昇腾AI处理器型号,如果无法确定具体的[SOC_VERSION],则在安装昇腾AI处理器的服务器执行npu-smi info命令进行查询,在查询到的“Name”前增加Ascend信息,例如“Name”对应取值为xxxyy,实际配置的[SOC_VERSION]值为Ascendxxxyy。支持以下参数取值(xxx请替换为具体取值): + - Atlas 推理系列产品(Ascend 310P处理器)参数值:Ascend310P1、Ascend310P3 + - Atlas 训练系列产品参数值:AscendxxxA、AscendxxxB + - Atlas A2训练系列产品参数值:AscendxxxB1、AscendxxxB2、AscendxxxB3、AscendxxxB4 + + 示例如下。 + ```bash + bash run.sh -r cpu -v Ascend310P1 + ``` +## 更新说明 +| 时间 | 更新事项 | +| ---------- | ------------ | +| 2024/08/31 | 新增本readme | \ No newline at end of file diff --git a/operator/AddCustomSample/KernelLaunch/AddKernelInvocationAcl/add_custom.cpp b/operator/AddCustomSample/KernelLaunch/AddKernelInvocationAcl/add_custom.cpp new file mode 100644 index 0000000000000000000000000000000000000000..96b37a7c3df7a7a9ba36781db9395f3b2f07f4bb --- /dev/null +++ b/operator/AddCustomSample/KernelLaunch/AddKernelInvocationAcl/add_custom.cpp @@ -0,0 +1,82 @@ +/** + * @file add_custom.cpp + * + * Copyright (C) 2024. Huawei Technologies Co., Ltd. All rights reserved. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
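+ *
+ * Note: this kernel adds two half-precision tensors of fixed shape 8 * 2048.
+ * Each of the 8 cores works on its own BLOCK_LENGTH slice of global memory and
+ * processes it in TILE_NUM * BUFFER_NUM iterations (double buffering), with each
+ * tile going through the CopyIn -> Compute -> CopyOut stages via TQue queues.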
+ */ +#include "kernel_operator.h" + +constexpr int32_t TOTAL_LENGTH = 8 * 2048; // total length of data +constexpr int32_t USE_CORE_NUM = 8; // num of core used +constexpr int32_t BLOCK_LENGTH = TOTAL_LENGTH / USE_CORE_NUM; // length computed of each core +constexpr int32_t TILE_NUM = 8; // split data into 8 tiles for each core +constexpr int32_t BUFFER_NUM = 2; // tensor num for each queue +constexpr int32_t TILE_LENGTH = BLOCK_LENGTH / TILE_NUM / BUFFER_NUM; // separate to 2 parts, due to double buffer + +class KernelAdd { +public: + __aicore__ inline KernelAdd() {} + __aicore__ inline void Init(GM_ADDR x, GM_ADDR y, GM_ADDR z) + { + xGm.SetGlobalBuffer((__gm__ half *)x + BLOCK_LENGTH * AscendC::GetBlockIdx(), BLOCK_LENGTH); + yGm.SetGlobalBuffer((__gm__ half *)y + BLOCK_LENGTH * AscendC::GetBlockIdx(), BLOCK_LENGTH); + zGm.SetGlobalBuffer((__gm__ half *)z + BLOCK_LENGTH * AscendC::GetBlockIdx(), BLOCK_LENGTH); + pipe.InitBuffer(inQueueX, BUFFER_NUM, TILE_LENGTH * sizeof(half)); + pipe.InitBuffer(inQueueY, BUFFER_NUM, TILE_LENGTH * sizeof(half)); + pipe.InitBuffer(outQueueZ, BUFFER_NUM, TILE_LENGTH * sizeof(half)); + } + __aicore__ inline void Process() + { + int32_t loopCount = TILE_NUM * BUFFER_NUM; + for (int32_t i = 0; i < loopCount; i++) { + CopyIn(i); + Compute(i); + CopyOut(i); + } + } + +private: + __aicore__ inline void CopyIn(int32_t progress) + { + AscendC::LocalTensor xLocal = inQueueX.AllocTensor(); + AscendC::LocalTensor yLocal = inQueueY.AllocTensor(); + AscendC::DataCopy(xLocal, xGm[progress * TILE_LENGTH], TILE_LENGTH); + AscendC::DataCopy(yLocal, yGm[progress * TILE_LENGTH], TILE_LENGTH); + inQueueX.EnQue(xLocal); + inQueueY.EnQue(yLocal); + } + __aicore__ inline void Compute(int32_t progress) + { + AscendC::LocalTensor xLocal = inQueueX.DeQue(); + AscendC::LocalTensor yLocal = inQueueY.DeQue(); + AscendC::LocalTensor zLocal = outQueueZ.AllocTensor(); + AscendC::Add(zLocal, xLocal, yLocal, TILE_LENGTH); + outQueueZ.EnQue(zLocal); + inQueueX.FreeTensor(xLocal); + inQueueY.FreeTensor(yLocal); + } + __aicore__ inline void CopyOut(int32_t progress) + { + AscendC::LocalTensor zLocal = outQueueZ.DeQue(); + AscendC::DataCopy(zGm[progress * TILE_LENGTH], zLocal, TILE_LENGTH); + outQueueZ.FreeTensor(zLocal); + } + +private: + AscendC::TPipe pipe; + AscendC::TQue inQueueX, inQueueY; + AscendC::TQue outQueueZ; + AscendC::GlobalTensor xGm; + AscendC::GlobalTensor yGm; + AscendC::GlobalTensor zGm; +}; + +extern "C" __global__ __aicore__ void add_custom(GM_ADDR x, GM_ADDR y, GM_ADDR z) +{ + KernelAdd op; + op.Init(x, y, z); + op.Process(); +} diff --git a/operator/AddCustomSample/KernelLaunch/AddKernelInvocationAcl/cmake/cpu_lib.cmake b/operator/AddCustomSample/KernelLaunch/AddKernelInvocationAcl/cmake/cpu_lib.cmake new file mode 100644 index 0000000000000000000000000000000000000000..751a119411509a4eeec79b76a875776206daeaf6 --- /dev/null +++ b/operator/AddCustomSample/KernelLaunch/AddKernelInvocationAcl/cmake/cpu_lib.cmake @@ -0,0 +1,9 @@ +if(NOT DEFINED ENV{CMAKE_PREFIX_PATH}) + set(CMAKE_PREFIX_PATH ${ASCEND_CANN_PACKAGE_PATH}/tools/tikicpulib/lib/cmake) +endif() +find_package(tikicpulib REQUIRED) + +add_library(ascendc_kernels SHARED ${KERNEL_FILES}) +target_link_libraries(ascendc_kernels PUBLIC tikicpulib::${SOC_VERSION}) +target_compile_options(ascendc_kernels PRIVATE -g -O0 -std=c++17) +install(TARGETS ascendc_kernels DESTINATION ${CMAKE_INSTALL_LIBDIR}) diff --git a/operator/AddCustomSample/KernelLaunch/AddKernelInvocationAcl/cmake/npu_lib.cmake 
b/operator/AddCustomSample/KernelLaunch/AddKernelInvocationAcl/cmake/npu_lib.cmake new file mode 100644 index 0000000000000000000000000000000000000000..d862f006417dcb8cf30cb8c33f293c2f869ff6e1 --- /dev/null +++ b/operator/AddCustomSample/KernelLaunch/AddKernelInvocationAcl/cmake/npu_lib.cmake @@ -0,0 +1,10 @@ +if(EXISTS ${ASCEND_CANN_PACKAGE_PATH}/compiler/tikcpp/ascendc_kernel_cmake) + set(ASCENDC_CMAKE_DIR ${ASCEND_CANN_PACKAGE_PATH}/compiler/tikcpp/ascendc_kernel_cmake) +elseif(EXISTS ${ASCEND_CANN_PACKAGE_PATH}/tools/tikcpp/ascendc_kernel_cmake) + set(ASCENDC_CMAKE_DIR ${ASCEND_CANN_PACKAGE_PATH}/tools/tikcpp/ascendc_kernel_cmake) +else() + message(FATAL_ERROR "ascendc_kernel_cmake does not exist ,please check whether the cann package is installed") +endif() +include(${ASCENDC_CMAKE_DIR}/ascendc.cmake) + +ascendc_fatbin_library(ascendc_kernels ${KERNEL_FILES}) diff --git a/operator/AddCustomSample/KernelLaunch/AddKernelInvocationAcl/data_utils.h b/operator/AddCustomSample/KernelLaunch/AddKernelInvocationAcl/data_utils.h new file mode 100644 index 0000000000000000000000000000000000000000..1d43459ef7c5858cf97a9ba43769e50b8dd28853 --- /dev/null +++ b/operator/AddCustomSample/KernelLaunch/AddKernelInvocationAcl/data_utils.h @@ -0,0 +1,240 @@ +/** + * @file data_utils.h + * + * Copyright (C) 2023-2024. Huawei Technologies Co., Ltd. All rights reserved. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + */ +#ifndef DATA_UTILS_H +#define DATA_UTILS_H +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "acl/acl.h" + +typedef enum { + DT_UNDEFINED = -1, + FLOAT = 0, + HALF = 1, + INT8_T = 2, + INT32_T = 3, + UINT8_T = 4, + INT16_T = 6, + UINT16_T = 7, + UINT32_T = 8, + INT64_T = 9, + UINT64_T = 10, + DOUBLE = 11, + BOOL = 12, + STRING = 13, + COMPLEX64 = 16, + COMPLEX128 = 17, + BF16 = 27 +} printDataType; + +#define INFO_LOG(fmt, args...) fprintf(stdout, "[INFO] " fmt "\n", ##args) +#define WARN_LOG(fmt, args...) fprintf(stdout, "[WARN] " fmt "\n", ##args) +#define ERROR_LOG(fmt, args...) fprintf(stdout, "[ERROR] " fmt "\n", ##args) +#define CHECK_ACL(x) \ + do { \ + aclError __ret = x; \ + if (__ret != ACL_ERROR_NONE) { \ + std::cerr << __FILE__ << ":" << __LINE__ << " aclError:" << __ret << std::endl; \ + } \ + } while (0); + +/** + * @brief Read data from file + * @param [in] filePath: file path + * @param [out] fileSize: file size + * @return read result + */ +bool ReadFile(const std::string &filePath, size_t &fileSize, void *buffer, size_t bufferSize) +{ + struct stat sBuf; + int fileStatus = stat(filePath.data(), &sBuf); + if (fileStatus == -1) { + ERROR_LOG("failed to get file"); + return false; + } + if (S_ISREG(sBuf.st_mode) == 0) { + ERROR_LOG("%s is not a file, please enter a file", filePath.c_str()); + return false; + } + + std::ifstream file; + file.open(filePath, std::ios::binary); + if (!file.is_open()) { + ERROR_LOG("Open file failed. 
path = %s", filePath.c_str()); + return false; + } + + std::filebuf *buf = file.rdbuf(); + size_t size = buf->pubseekoff(0, std::ios::end, std::ios::in); + if (size == 0) { + ERROR_LOG("file size is 0"); + file.close(); + return false; + } + if (size > bufferSize) { + ERROR_LOG("file size is larger than buffer size"); + file.close(); + return false; + } + buf->pubseekpos(0, std::ios::in); + buf->sgetn(static_cast(buffer), size); + fileSize = size; + file.close(); + return true; +} + +/** + * @brief Write data to file + * @param [in] filePath: file path + * @param [in] buffer: data to write to file + * @param [in] size: size to write + * @return write result + */ +bool WriteFile(const std::string &filePath, const void *buffer, size_t size) +{ + if (buffer == nullptr) { + ERROR_LOG("Write file failed. buffer is nullptr"); + return false; + } + + int fd = open(filePath.c_str(), O_RDWR | O_CREAT | O_TRUNC, S_IRUSR | S_IWRITE); + if (fd < 0) { + ERROR_LOG("Open file failed. path = %s", filePath.c_str()); + return false; + } + + size_t writeSize = write(fd, buffer, size); + (void)close(fd); + if (writeSize != size) { + ERROR_LOG("Write file Failed."); + return false; + } + + return true; +} + +/** + * @brief Reads a binary file into memory. + * + * This function opens a binary file, reads its contents into a dynamically allocated memory buffer, + * and returns a pointer to the buffer and the size of the file through output parameters. + * + * @param filePath The path to the binary file to be read. + * @param outBuffer A reference to a unique pointer that will hold the file data. + * @param outSize A reference to a size_t that will hold the size of the file. + * @return true if the file was read successfully, false otherwise. + */ +bool ReadBinaryFile(const char *filePath, std::unique_ptr &outBuffer, size_t &outSize) +{ + FILE *file = fopen(filePath, "rb"); + if (!file) { + ERROR_LOG("Error opening file: %s\n", strerror(errno)); + return false; + } + + fseek(file, 0, SEEK_END); + outSize = ftell(file); + rewind(file); + + outBuffer.reset(new char[outSize]); + if (fread(outBuffer.get(), 1, outSize, file) != outSize) { + ERROR_LOG("Error reading file.\n"); + fclose(file); + return false; + } + + fclose(file); + return true; +} + +template void DoPrintData(const T *data, size_t count, size_t elementsPerRow) +{ + assert(elementsPerRow != 0); + for (size_t i = 0; i < count; ++i) { + std::cout << std::setw(10) << data[i]; + if (i % elementsPerRow == elementsPerRow - 1) { + std::cout << std::endl; + } + } +} + +void DoPrintHalfData(const aclFloat16 *data, size_t count, size_t elementsPerRow) +{ + assert(elementsPerRow != 0); + for (size_t i = 0; i < count; ++i) { + std::cout << std::setw(10) << std::setprecision(6) << aclFloat16ToFloat(data[i]); + if (i % elementsPerRow == elementsPerRow - 1) { + std::cout << std::endl; + } + } +} + +void PrintData(const void *data, size_t count, printDataType dataType, size_t elementsPerRow = 16) +{ + if (data == nullptr) { + ERROR_LOG("Print data failed. 
data is nullptr"); + return; + } + + switch (dataType) { + case BOOL: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case INT8_T: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case UINT8_T: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case INT16_T: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case UINT16_T: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case INT32_T: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case UINT32_T: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case INT64_T: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case UINT64_T: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case HALF: + DoPrintHalfData(reinterpret_cast(data), count, elementsPerRow); + break; + case FLOAT: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case DOUBLE: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + default: + ERROR_LOG("Unsupported type: %d", dataType); + } + std::cout << std::endl; +} +#endif // DATA_UTILS_H diff --git a/operator/AddCustomSample/KernelLaunch/AddKernelInvocationAcl/main.cpp b/operator/AddCustomSample/KernelLaunch/AddKernelInvocationAcl/main.cpp new file mode 100644 index 0000000000000000000000000000000000000000..c868a66229af8e9fd18623709542d1925af4122c --- /dev/null +++ b/operator/AddCustomSample/KernelLaunch/AddKernelInvocationAcl/main.cpp @@ -0,0 +1,115 @@ +/** + * @file main.cpp + * + * Copyright (C) 2024. Huawei Technologies Co., Ltd. All rights reserved. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
+ */ +#include "data_utils.h" +#ifndef ASCENDC_CPU_DEBUG +#include "acl/acl.h" +#else +#include "tikicpulib.h" +extern "C" __global__ __aicore__ void add_custom(GM_ADDR x, GM_ADDR y, GM_ADDR z); +#endif + +int32_t main(int32_t argc, char *argv[]) +{ + uint32_t blockDim = 8; + size_t inputByteSize = 8 * 2048 * sizeof(uint16_t); + size_t outputByteSize = 8 * 2048 * sizeof(uint16_t); + +#ifdef ASCENDC_CPU_DEBUG + void *x = AscendC::GmAlloc(inputByteSize); + void *y = AscendC::GmAlloc(inputByteSize); + void *z = AscendC::GmAlloc(outputByteSize); + + ReadFile("./input/input_x.bin", inputByteSize, x, inputByteSize); + ReadFile("./input/input_y.bin", inputByteSize, y, inputByteSize); + + AscendC::SetKernelMode(KernelMode::AIV_MODE); + ICPU_RUN_KF(add_custom, blockDim, x, y, z); // use this macro for cpu debug + + WriteFile("./output/output_z.bin", z, outputByteSize); + + AscendC::GmFree(x); + AscendC::GmFree(y); + AscendC::GmFree(z); +#else + CHECK_ACL(aclInit(nullptr)); + int32_t deviceId = 0; + CHECK_ACL(aclrtSetDevice(deviceId)); + aclrtStream stream = nullptr; + CHECK_ACL(aclrtCreateStream(&stream)); + + void *xHost, *yHost, *zHost; + void *xDevice, *yDevice, *zDevice; + + CHECK_ACL(aclrtMallocHost((void **)(&xHost), inputByteSize)); + CHECK_ACL(aclrtMallocHost((void **)(&yHost), inputByteSize)); + CHECK_ACL(aclrtMallocHost((void **)(&zHost), outputByteSize)); + CHECK_ACL(aclrtMalloc((void **)&xDevice, inputByteSize, ACL_MEM_MALLOC_HUGE_FIRST)); + CHECK_ACL(aclrtMalloc((void **)&yDevice, inputByteSize, ACL_MEM_MALLOC_HUGE_FIRST)); + CHECK_ACL(aclrtMalloc((void **)&zDevice, outputByteSize, ACL_MEM_MALLOC_HUGE_FIRST)); + + ReadFile("./input/input_x.bin", inputByteSize, xHost, inputByteSize); + ReadFile("./input/input_y.bin", inputByteSize, yHost, inputByteSize); + + CHECK_ACL(aclrtMemcpy(xDevice, inputByteSize, xHost, inputByteSize, ACL_MEMCPY_HOST_TO_DEVICE)); + CHECK_ACL(aclrtMemcpy(yDevice, inputByteSize, yHost, inputByteSize, ACL_MEMCPY_HOST_TO_DEVICE)); + + struct { + uintptr_t xDeviceAddr; + uintptr_t yDeviceAddr; + uintptr_t zDeviceAddr; + } args; + auto *argsDeviceAddrInHost = &args; + CHECK_ACL(aclrtMallocHost((void **)&argsDeviceAddrInHost, sizeof(args))); + argsDeviceAddrInHost->xDeviceAddr = (uintptr_t)xDevice; + argsDeviceAddrInHost->yDeviceAddr = (uintptr_t)yDevice; + argsDeviceAddrInHost->zDeviceAddr = (uintptr_t)zDevice; + + void *argsDeviceAddrInDevice = nullptr; + CHECK_ACL(aclrtMalloc((void **)&argsDeviceAddrInDevice, sizeof(args), ACL_MEM_MALLOC_HUGE_FIRST)); + CHECK_ACL(aclrtMemcpy(argsDeviceAddrInDevice, sizeof(args), argsDeviceAddrInHost, sizeof(args), ACL_MEMCPY_HOST_TO_DEVICE)); + CHECK_ACL(aclrtFreeHost(argsDeviceAddrInHost)); + + const char *filePath = "./out/fatbin/ascendc_kernels/ascendc_kernels.o"; + std::unique_ptr fileMemory; + size_t fileSize = 0; + ReadBinaryFile(filePath, fileMemory, fileSize); + + aclrtBinary binary = aclrtCreateBinary(fileMemory.get(), fileSize); + if (binary == nullptr) { + printf("aclrtCreateBinary failed"); + return -1; + } + aclrtBinHandle binHandle = nullptr; + CHECK_ACL(aclrtBinaryLoad(binary, &binHandle)); + CHECK_ACL(aclrtDestroyBinary(binary)); + + aclrtFuncHandle funAdd = nullptr; + CHECK_ACL(aclrtBinaryGetFunction(binHandle, "add_custom", &funAdd)); + + CHECK_ACL(aclrtLaunchKernel(funAdd, blockDim, argsDeviceAddrInDevice, sizeof(args), stream)); + CHECK_ACL(aclrtSynchronizeStream(stream)); + + CHECK_ACL(aclrtMemcpy(zHost, outputByteSize, zDevice, outputByteSize, ACL_MEMCPY_DEVICE_TO_HOST)); + WriteFile("./output/output_z.bin", 
zHost, outputByteSize); + + CHECK_ACL(aclrtFree(argsDeviceAddrInDevice)); + CHECK_ACL(aclrtFree(xDevice)); + CHECK_ACL(aclrtFree(yDevice)); + CHECK_ACL(aclrtFree(zDevice)); + CHECK_ACL(aclrtFreeHost(xHost)); + CHECK_ACL(aclrtFreeHost(yHost)); + CHECK_ACL(aclrtFreeHost(zHost)); + + CHECK_ACL(aclrtDestroyStream(stream)); + CHECK_ACL(aclrtResetDevice(deviceId)); + CHECK_ACL(aclFinalize()); +#endif + return 0; +} diff --git a/operator/AddCustomSample/KernelLaunch/AddKernelInvocationAcl/run.sh b/operator/AddCustomSample/KernelLaunch/AddKernelInvocationAcl/run.sh new file mode 100644 index 0000000000000000000000000000000000000000..51d92599e647cb585fb18ea862a2acd1921f33dc --- /dev/null +++ b/operator/AddCustomSample/KernelLaunch/AddKernelInvocationAcl/run.sh @@ -0,0 +1,126 @@ +#!/bin/bash +CURRENT_DIR=$( + cd $(dirname ${BASH_SOURCE:-$0}) + pwd +) +cd $CURRENT_DIR + +BUILD_TYPE="Debug" +INSTALL_PREFIX="${CURRENT_DIR}/out" + +SHORT=r:,v:,i:,b:,p:, +LONG=run-mode:,soc-version:,install-path:,build-type:,install-prefix:, +OPTS=$(getopt -a --options $SHORT --longoptions $LONG -- "$@") +eval set -- "$OPTS" + +while :; do + case "$1" in + -r | --run-mode) + RUN_MODE="$2" + shift 2 + ;; + -v | --soc-version) + SOC_VERSION="$2" + shift 2 + ;; + -i | --install-path) + ASCEND_INSTALL_PATH="$2" + shift 2 + ;; + -b | --build-type) + BUILD_TYPE="$2" + shift 2 + ;; + -p | --install-prefix) + INSTALL_PREFIX="$2" + shift 2 + ;; + --) + shift + break + ;; + *) + echo "[ERROR] Unexpected option: $1" + break + ;; + esac +done + +RUN_MODE_LIST="cpu npu" +if [[ " $RUN_MODE_LIST " != *" $RUN_MODE "* ]]; then + echo "ERROR: RUN_MODE error, This sample only support specify cpu or npu!" + exit -1 +fi + +VERSION_LIST="Ascend910A Ascend910B Ascend310B1 Ascend310B2 Ascend310B3 Ascend310B4 Ascend310P1 Ascend310P3 Ascend910B1 Ascend910B2 Ascend910B3 Ascend910B4" +if [[ " $VERSION_LIST " != *" $SOC_VERSION "* ]]; then + echo "ERROR: SOC_VERSION should be in [$VERSION_LIST]" + exit -1 +fi + +if [ -n "$ASCEND_INSTALL_PATH" ]; then + _ASCEND_INSTALL_PATH=$ASCEND_INSTALL_PATH +elif [ -n "$ASCEND_HOME_PATH" ]; then + _ASCEND_INSTALL_PATH=$ASCEND_HOME_PATH +else + if [ -d "$HOME/Ascend/ascend-toolkit/latest" ]; then + _ASCEND_INSTALL_PATH=$HOME/Ascend/ascend-toolkit/latest + else + _ASCEND_INSTALL_PATH=/usr/local/Ascend/ascend-toolkit/latest + fi +fi + +export ASCEND_TOOLKIT_HOME=${_ASCEND_INSTALL_PATH} +export ASCEND_HOME_PATH=${_ASCEND_INSTALL_PATH} +if [ "${RUN_MODE}" = "npu" ]; then + source ${_ASCEND_INSTALL_PATH}/bin/setenv.bash +elif [ "${RUN_MODE}" = "sim" ]; then + # in case of running op in simulator, use stub .so instead + export LD_LIBRARY_PATH=${_ASCEND_INSTALL_PATH}/runtime/lib64/stub:$LD_LIBRARY_PATH + source ${_ASCEND_INSTALL_PATH}/bin/setenv.bash + export LD_LIBRARY_PATH=${_ASCEND_INSTALL_PATH}/tools/simulator/${SOC_VERSION}/lib:$LD_LIBRARY_PATH + if [ ! 
$CAMODEL_LOG_PATH ]; then + export CAMODEL_LOG_PATH=$(pwd)/sim_log + fi + if [ -d "$CAMODEL_LOG_PATH" ]; then + rm -rf $CAMODEL_LOG_PATH + fi + mkdir -p $CAMODEL_LOG_PATH +elif [ "${RUN_MODE}" = "cpu" ]; then + source ${_ASCEND_INSTALL_PATH}/bin/setenv.bash + export LD_LIBRARY_PATH=${_ASCEND_INSTALL_PATH}/tools/tikicpulib/lib:${_ASCEND_INSTALL_PATH}/tools/tikicpulib/lib/${SOC_VERSION}:${_ASCEND_INSTALL_PATH}/tools/simulator/${SOC_VERSION}/lib:$LD_LIBRARY_PATH +fi + +set -e +rm -rf build out +mkdir -p build +cmake -B build \ + -DRUN_MODE=${RUN_MODE} \ + -DSOC_VERSION=${SOC_VERSION} \ + -DCMAKE_BUILD_TYPE=${BUILD_TYPE} \ + -DCMAKE_INSTALL_PREFIX=${INSTALL_PREFIX} \ + -DASCEND_CANN_PACKAGE_PATH=${_ASCEND_INSTALL_PATH} +cmake --build build -j +cmake --install build + +rm -f ascendc_kernels_bbit +cp ./out/bin/ascendc_kernels_bbit ./ +rm -rf input output +mkdir -p input output +python3 scripts/gen_data.py +( + export LD_LIBRARY_PATH=$(pwd)/out/lib:$(pwd)/out/lib64:${_ASCEND_INSTALL_PATH}/lib64:$LD_LIBRARY_PATH + if [[ "$RUN_WITH_TOOLCHAIN" -eq 1 ]]; then + if [ "${RUN_MODE}" = "npu" ]; then + msprof op --application=./ascendc_kernels_bbit + elif [ "${RUN_MODE}" = "sim" ]; then + msprof op simulator --application=./ascendc_kernels_bbit + elif [ "${RUN_MODE}" = "cpu" ]; then + ./ascendc_kernels_bbit + fi + else + ./ascendc_kernels_bbit + fi +) +md5sum output/*.bin +python3 scripts/verify_result.py output/output_z.bin output/golden.bin diff --git a/operator/AddCustomSample/KernelLaunch/AddKernelInvocationAcl/scripts/gen_data.py b/operator/AddCustomSample/KernelLaunch/AddKernelInvocationAcl/scripts/gen_data.py new file mode 100644 index 0000000000000000000000000000000000000000..ea8ce828aea146c9ab462290be403c4cfd483b75 --- /dev/null +++ b/operator/AddCustomSample/KernelLaunch/AddKernelInvocationAcl/scripts/gen_data.py @@ -0,0 +1,25 @@ +#!/usr/bin/python3 +# coding=utf-8 +# +# Copyright (C) 2023-2024. Huawei Technologies Co., Ltd. All rights reserved. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. +# =============================================================================== + +import numpy as np + + +def gen_golden_data_simple(): + input_x = np.random.uniform(1, 100, [8, 2048]).astype(np.float16) + input_y = np.random.uniform(1, 100, [8, 2048]).astype(np.float16) + golden = (input_x + input_y).astype(np.float16) + + input_x.tofile("./input/input_x.bin") + input_y.tofile("./input/input_y.bin") + golden.tofile("./output/golden.bin") + + +if __name__ == "__main__": + gen_golden_data_simple() diff --git a/operator/AddCustomSample/KernelLaunch/AddKernelInvocationAcl/scripts/verify_result.py b/operator/AddCustomSample/KernelLaunch/AddKernelInvocationAcl/scripts/verify_result.py new file mode 100644 index 0000000000000000000000000000000000000000..1a21d809ab206a65bc952ca4cb06c345edcd3e7a --- /dev/null +++ b/operator/AddCustomSample/KernelLaunch/AddKernelInvocationAcl/scripts/verify_result.py @@ -0,0 +1,53 @@ +#!/usr/bin/python3 +# coding=utf-8 +# +# Copyright (C) 2023-2024. Huawei Technologies Co., Ltd. All rights reserved. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
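+#
+# Note: this script compares the kernel output against the golden data with
+# np.isclose (rtol=1e-3, atol=1e-5, suited to float16), prints mismatching
+# elements (stopping after roughly the first 100), and reports a pass when the
+# mismatch ratio does not exceed error_tol (1e-3).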
+# =============================================================================== + +import sys +import numpy as np + +# for float16 +relative_tol = 1e-3 +absolute_tol = 1e-5 +error_tol = 1e-3 + + +def verify_result(output, golden): + output = np.fromfile(output, dtype=np.float16).reshape(-1) + golden = np.fromfile(golden, dtype=np.float16).reshape(-1) + different_element_results = np.isclose(output, + golden, + rtol=relative_tol, + atol=absolute_tol, + equal_nan=True) + different_element_indexes = np.where(different_element_results == False)[0] + for index in range(len(different_element_indexes)): + real_index = different_element_indexes[index] + golden_data = golden[real_index] + output_data = output[real_index] + print( + "data index: %06d, expected: %-.9f, actual: %-.9f, rdiff: %-.6f" % + (real_index, golden_data, output_data, + abs(output_data - golden_data) / golden_data)) + if index == 100: + break + error_ratio = float(different_element_indexes.size) / golden.size + print("error ratio: %.4f, tolrence: %.4f" % (error_ratio, error_tol)) + return error_ratio <= error_tol + + +if __name__ == '__main__': + try: + res = verify_result(sys.argv[1], sys.argv[2]) + if not res: + raise ValueError("[ERROR] result error") + else: + print("test pass") + except Exception as e: + print(e) + sys.exit(1) diff --git a/operator/AddCustomSample/KernelLaunch/AddKernelInvocationNeo/README.md b/operator/AddCustomSample/KernelLaunch/AddKernelInvocationNeo/README.md index 4ff4d4f36d389a54cb7990520fad0d551dbd9719..f921e7673f6fb13973ed19f056ef9ac1aecbe9e8 100644 --- a/operator/AddCustomSample/KernelLaunch/AddKernelInvocationNeo/README.md +++ b/operator/AddCustomSample/KernelLaunch/AddKernelInvocationNeo/README.md @@ -60,7 +60,7 @@ ```bash bash run.sh -r [RUN_MODE] -v [SOC_VERSION] ``` - - RUN_MODE:编译方式,可选择CPU调试,NPU仿真,NPU上板。支持参数为[cpu /sim / npu] + - RUN_MODE:编译方式,可选择CPU调试,NPU仿真,NPU上板。支持参数为[cpu / sim / npu] - SOC_VERSION:昇腾AI处理器型号,如果无法确定具体的[SOC_VERSION],则在安装昇腾AI处理器的服务器执行npu-smi info命令进行查询,在查询到的“Name”前增加Ascend信息,例如“Name”对应取值为xxxyy,实际配置的[SOC_VERSION]值为Ascendxxxyy。支持以下参数取值(xxx请替换为具体取值): - Atlas 推理系列产品(Ascend 310P处理器)参数值:Ascend310P1、Ascend310P3 - Atlas 训练系列产品参数值:AscendxxxA、AscendxxxB diff --git a/operator/MatmulCustomSample/KernelLaunch/MatmulInvocationAcl/CMakeLists.txt b/operator/MatmulCustomSample/KernelLaunch/MatmulInvocationAcl/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..ea95344bcad546c4b40cac4ebee6c0a322644ff1 --- /dev/null +++ b/operator/MatmulCustomSample/KernelLaunch/MatmulInvocationAcl/CMakeLists.txt @@ -0,0 +1,67 @@ +cmake_minimum_required(VERSION 3.16) +project(Ascend_c) + +set(RUN_MODE "npu" CACHE STRING "cpu/sim/npu") +set(SOC_VERSION "Ascend310P3" CACHE STRING "system on chip type") +set(ASCEND_CANN_PACKAGE_PATH "/usr/local/Ascend/ascend-toolkit/latest" + CACHE STRING "ASCEND CANN package installation directory" +) +if(NOT CMAKE_BUILD_TYPE) + set(CMAKE_BUILD_TYPE "Debug" CACHE STRING "Build type Release/Debug (default Debug)" FORCE) +endif() +if(CMAKE_INSTALL_PREFIX STREQUAL /usr/local) + set(CMAKE_INSTALL_PREFIX "${CMAKE_CURRENT_LIST_DIR}/out" CACHE STRING "path for install()" FORCE) +endif() +file(GLOB KERNEL_FILES ${CMAKE_CURRENT_SOURCE_DIR}/matmul_custom.cpp) +set(CUSTOM_ASCEND310P_LIST "Ascend310P1" "Ascend310P3") + +if("${RUN_MODE}" STREQUAL "cpu") + include(cmake/cpu_lib.cmake) +elseif("${RUN_MODE}" STREQUAL "sim" OR "${RUN_MODE}" STREQUAL "npu") + include(cmake/npu_lib.cmake) +else() + message("invalid RUN_MODE: ${RUN_MODE}") +endif() + 
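+
+# The host executable is built from main.cpp plus the host-side tiling generator
+# (matmul_custom_tiling.cpp); the kernel itself is compiled separately into the
+# ascendc_kernels target by the cpu_lib/npu_lib cmake files included above.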
+add_executable(ascendc_kernels_bbit + ${CMAKE_CURRENT_SOURCE_DIR}/main.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/matmul_custom_tiling.cpp +) + +target_compile_options(ascendc_kernels_bbit PRIVATE + $:-g>> + -O2 -std=c++17 -D_GLIBCXX_USE_CXX11_ABI=0 -Wall -Werror +) + +target_compile_definitions(ascendc_kernels_bbit PRIVATE + $<$>:CUSTOM_ASCEND310P> + SOC_VERSION="${SOC_VERSION}" +) + +target_include_directories(ascendc_kernels_bbit PRIVATE + ${ASCEND_CANN_PACKAGE_PATH}/include/ + ${ASCEND_CANN_PACKAGE_PATH}/include/ascendc/ + ${ASCEND_CANN_PACKAGE_PATH}/include/ascendc/highlevel_api +) + +target_link_directories(ascendc_kernels_bbit PRIVATE + ${ASCEND_CANN_PACKAGE_PATH}/lib64 +) + +target_link_libraries(ascendc_kernels_bbit PRIVATE + ascendcl + $:ascendc_kernels>> + runtime + tiling_api + register + platform + ascendalog + c_sec + dl +) + +install(TARGETS ascendc_kernels_bbit + LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} + ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} + RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} +) diff --git a/operator/MatmulCustomSample/KernelLaunch/MatmulInvocationAcl/README.md b/operator/MatmulCustomSample/KernelLaunch/MatmulInvocationAcl/README.md new file mode 100644 index 0000000000000000000000000000000000000000..d5c4ebc101226f1925e3ab9de8d9caf79b8a1a5e --- /dev/null +++ b/operator/MatmulCustomSample/KernelLaunch/MatmulInvocationAcl/README.md @@ -0,0 +1,72 @@ +## 目录结构介绍 +``` +├── MatmulInvocationNeo +│ ├── cmake // 编译工程文件 +│ ├── scripts +│ │ ├── gen_data.py // 输入数据和真值数据生成脚本文件 +│ │ └── verify_result.py // 验证输出数据和真值数据是否一致的验证脚本 +│ ├── CMakeLists.txt // 编译工程文件 +│ ├── data_utils.h // 数据读入写出函数 +│ ├── main.cpp // 主函数,调用算子的应用程序,含CPU域及NPU域调用 +│ ├── matmul_custom.cpp // 算子kernel实现 +│ ├── matmul_custom_tiling.cpp // 算子tiling实现 +│ └── run.sh // 编译运行算子的脚本 +``` +## 代码实现介绍 +本调用样例中实现的是[M, K, N]固定为[512, 512, 1024]的Matmul算子。 +- kernel实现 + Matmul算子的数学表达式为: + $$ + C = A * B + $$ + 其中A的形状为[512, 512], B的形状为[512, 1024], C的形状为[512, 1024]。具体请参考[matmul_custom.cpp](./matmul_custom.cpp)。 + +- 调用实现 + 1. CPU侧运行验证主要通过ICPU_RUN_KF CPU调测宏等CPU调测库提供的接口来完成; + 2. 
NPU侧运行验证主要通过使用ACLRT_LAUNCH_KERNEL内核调用宏来完成。 + + 应用程序通过ASCENDC_CPU_DEBUG 宏区分代码逻辑运行于CPU侧还是NPU侧。 + +## 运行样例算子 + - 打开样例目录 + 以命令行方式下载样例代码,master分支为例。 + ```bash + cd ${git_clone_path}/samples/operator/MatmulCustomSample/KernelLaunch/MatmulInvocationNeo + ``` + - 配置环境变量 + + 请根据当前环境上CANN开发套件包的[安装方式](https://hiascend.com/document/redirect/CannCommunityInstSoftware),选择对应配置环境变量的命令。 + - 默认路径,root用户安装CANN软件包 + ```bash + export ASCEND_INSTALL_PATH=/usr/local/Ascend/ascend-toolkit/latest + ``` + - 默认路径,非root用户安装CANN软件包 + ```bash + export ASCEND_INSTALL_PATH=$HOME/Ascend/ascend-toolkit/latest + ``` + - 指定路径install_path,安装CANN软件包 + ```bash + export ASCEND_INSTALL_PATH=${install_path}/ascend-toolkit/latest + ``` + + - 样例执行 + + ```bash + bash run.sh -r [RUN_MODE] -v [SOC_VERSION] + ``` + - RUN_MODE:编译方式,可选择CPU调试,NPU仿真,NPU上板。支持参数为[cpu / sim / npu],默认值为npu。 + - SOC_VERSION:昇腾AI处理器型号,如果无法确定具体的[SOC_VERSION],则在安装昇腾AI处理器的服务器执行npu-smi info命令进行查询,在查询到的“Name”前增加Ascend信息,例如"Name"对应取值为xxxyy,实际配置的[SOC_VERSION]值为Ascendxxxyy。支持以下参数取值(xxx请替换为具体取值): + - Atlas 推理系列产品(Ascend 310P处理器)参数值:Ascend310P1、Ascend310P3 + - Atlas A2训练系列产品/Atlas 800I A2推理产品参数值:AscendxxxB1、AscendxxxB2、AscendxxxB3、AscendxxxB4 + + 示例如下。 + + ```bash + bash run.sh -r cpu -v Ascend310P1 + ``` + +## 更新说明 +| 时间 | 更新事项 | +| ---------- | ------------ | +| 2023/05/21 | 更新本readme | +| 2024/07/31 | 修改本readme中的形状说明错误 | \ No newline at end of file diff --git a/operator/MatmulCustomSample/KernelLaunch/MatmulInvocationAcl/cmake/cpu_lib.cmake b/operator/MatmulCustomSample/KernelLaunch/MatmulInvocationAcl/cmake/cpu_lib.cmake new file mode 100644 index 0000000000000000000000000000000000000000..76c8adb205173638149cdaaf68bc2d11ea9c6b21 --- /dev/null +++ b/operator/MatmulCustomSample/KernelLaunch/MatmulInvocationAcl/cmake/cpu_lib.cmake @@ -0,0 +1,12 @@ +if(NOT DEFINED ENV{CMAKE_PREFIX_PATH}) + set(CMAKE_PREFIX_PATH ${ASCEND_CANN_PACKAGE_PATH}/tools/tikicpulib/lib/cmake) +endif() +find_package(tikicpulib REQUIRED) + +add_library(ascendc_kernels SHARED ${KERNEL_FILES}) +target_link_libraries(ascendc_kernels PUBLIC tikicpulib::${SOC_VERSION}) +target_compile_definitions(ascendc_kernels PRIVATE + $<$>:CUSTOM_ASCEND310P> +) +target_compile_options(ascendc_kernels PRIVATE -g -O0 -std=c++17) +install(TARGETS ascendc_kernels DESTINATION ${CMAKE_INSTALL_LIBDIR}) diff --git a/operator/MatmulCustomSample/KernelLaunch/MatmulInvocationAcl/cmake/npu_lib.cmake b/operator/MatmulCustomSample/KernelLaunch/MatmulInvocationAcl/cmake/npu_lib.cmake new file mode 100644 index 0000000000000000000000000000000000000000..d4daabc5803c97726343560fe940cfd694642b46 --- /dev/null +++ b/operator/MatmulCustomSample/KernelLaunch/MatmulInvocationAcl/cmake/npu_lib.cmake @@ -0,0 +1,16 @@ +if(EXISTS ${ASCEND_CANN_PACKAGE_PATH}/compiler/tikcpp/ascendc_kernel_cmake) + set(ASCENDC_CMAKE_DIR ${ASCEND_CANN_PACKAGE_PATH}/compiler/tikcpp/ascendc_kernel_cmake) +elseif(EXISTS ${ASCEND_CANN_PACKAGE_PATH}/tools/tikcpp/ascendc_kernel_cmake) + set(ASCENDC_CMAKE_DIR ${ASCEND_CANN_PACKAGE_PATH}/tools/tikcpp/ascendc_kernel_cmake) +else() + message(FATAL_ERROR "ascendc_kernel_cmake does not exist ,please check whether the cann package is installed") +endif() +include(${ASCENDC_CMAKE_DIR}/ascendc.cmake) + +ascendc_fatbin_library(ascendc_kernels ${KERNEL_FILES}) + +ascendc_compile_definitions(ascendc_kernels PRIVATE + $<$>:CUSTOM_ASCEND310P> + -DHAVE_WORKSPACE + -DHAVE_TILING +) diff --git a/operator/MatmulCustomSample/KernelLaunch/MatmulInvocationAcl/data_utils.h 
b/operator/MatmulCustomSample/KernelLaunch/MatmulInvocationAcl/data_utils.h new file mode 100644 index 0000000000000000000000000000000000000000..1d43459ef7c5858cf97a9ba43769e50b8dd28853 --- /dev/null +++ b/operator/MatmulCustomSample/KernelLaunch/MatmulInvocationAcl/data_utils.h @@ -0,0 +1,240 @@ +/** + * @file data_utils.h + * + * Copyright (C) 2023-2024. Huawei Technologies Co., Ltd. All rights reserved. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + */ +#ifndef DATA_UTILS_H +#define DATA_UTILS_H +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "acl/acl.h" + +typedef enum { + DT_UNDEFINED = -1, + FLOAT = 0, + HALF = 1, + INT8_T = 2, + INT32_T = 3, + UINT8_T = 4, + INT16_T = 6, + UINT16_T = 7, + UINT32_T = 8, + INT64_T = 9, + UINT64_T = 10, + DOUBLE = 11, + BOOL = 12, + STRING = 13, + COMPLEX64 = 16, + COMPLEX128 = 17, + BF16 = 27 +} printDataType; + +#define INFO_LOG(fmt, args...) fprintf(stdout, "[INFO] " fmt "\n", ##args) +#define WARN_LOG(fmt, args...) fprintf(stdout, "[WARN] " fmt "\n", ##args) +#define ERROR_LOG(fmt, args...) fprintf(stdout, "[ERROR] " fmt "\n", ##args) +#define CHECK_ACL(x) \ + do { \ + aclError __ret = x; \ + if (__ret != ACL_ERROR_NONE) { \ + std::cerr << __FILE__ << ":" << __LINE__ << " aclError:" << __ret << std::endl; \ + } \ + } while (0); + +/** + * @brief Read data from file + * @param [in] filePath: file path + * @param [out] fileSize: file size + * @return read result + */ +bool ReadFile(const std::string &filePath, size_t &fileSize, void *buffer, size_t bufferSize) +{ + struct stat sBuf; + int fileStatus = stat(filePath.data(), &sBuf); + if (fileStatus == -1) { + ERROR_LOG("failed to get file"); + return false; + } + if (S_ISREG(sBuf.st_mode) == 0) { + ERROR_LOG("%s is not a file, please enter a file", filePath.c_str()); + return false; + } + + std::ifstream file; + file.open(filePath, std::ios::binary); + if (!file.is_open()) { + ERROR_LOG("Open file failed. path = %s", filePath.c_str()); + return false; + } + + std::filebuf *buf = file.rdbuf(); + size_t size = buf->pubseekoff(0, std::ios::end, std::ios::in); + if (size == 0) { + ERROR_LOG("file size is 0"); + file.close(); + return false; + } + if (size > bufferSize) { + ERROR_LOG("file size is larger than buffer size"); + file.close(); + return false; + } + buf->pubseekpos(0, std::ios::in); + buf->sgetn(static_cast(buffer), size); + fileSize = size; + file.close(); + return true; +} + +/** + * @brief Write data to file + * @param [in] filePath: file path + * @param [in] buffer: data to write to file + * @param [in] size: size to write + * @return write result + */ +bool WriteFile(const std::string &filePath, const void *buffer, size_t size) +{ + if (buffer == nullptr) { + ERROR_LOG("Write file failed. buffer is nullptr"); + return false; + } + + int fd = open(filePath.c_str(), O_RDWR | O_CREAT | O_TRUNC, S_IRUSR | S_IWRITE); + if (fd < 0) { + ERROR_LOG("Open file failed. path = %s", filePath.c_str()); + return false; + } + + size_t writeSize = write(fd, buffer, size); + (void)close(fd); + if (writeSize != size) { + ERROR_LOG("Write file Failed."); + return false; + } + + return true; +} + +/** + * @brief Reads a binary file into memory. 
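+ * (In this sample it is used by main.cpp to load the compiled kernel fatbin
+ * before calling aclrtCreateBinary.)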
+ * + * This function opens a binary file, reads its contents into a dynamically allocated memory buffer, + * and returns a pointer to the buffer and the size of the file through output parameters. + * + * @param filePath The path to the binary file to be read. + * @param outBuffer A reference to a unique pointer that will hold the file data. + * @param outSize A reference to a size_t that will hold the size of the file. + * @return true if the file was read successfully, false otherwise. + */ +bool ReadBinaryFile(const char *filePath, std::unique_ptr &outBuffer, size_t &outSize) +{ + FILE *file = fopen(filePath, "rb"); + if (!file) { + ERROR_LOG("Error opening file: %s\n", strerror(errno)); + return false; + } + + fseek(file, 0, SEEK_END); + outSize = ftell(file); + rewind(file); + + outBuffer.reset(new char[outSize]); + if (fread(outBuffer.get(), 1, outSize, file) != outSize) { + ERROR_LOG("Error reading file.\n"); + fclose(file); + return false; + } + + fclose(file); + return true; +} + +template void DoPrintData(const T *data, size_t count, size_t elementsPerRow) +{ + assert(elementsPerRow != 0); + for (size_t i = 0; i < count; ++i) { + std::cout << std::setw(10) << data[i]; + if (i % elementsPerRow == elementsPerRow - 1) { + std::cout << std::endl; + } + } +} + +void DoPrintHalfData(const aclFloat16 *data, size_t count, size_t elementsPerRow) +{ + assert(elementsPerRow != 0); + for (size_t i = 0; i < count; ++i) { + std::cout << std::setw(10) << std::setprecision(6) << aclFloat16ToFloat(data[i]); + if (i % elementsPerRow == elementsPerRow - 1) { + std::cout << std::endl; + } + } +} + +void PrintData(const void *data, size_t count, printDataType dataType, size_t elementsPerRow = 16) +{ + if (data == nullptr) { + ERROR_LOG("Print data failed. data is nullptr"); + return; + } + + switch (dataType) { + case BOOL: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case INT8_T: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case UINT8_T: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case INT16_T: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case UINT16_T: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case INT32_T: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case UINT32_T: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case INT64_T: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case UINT64_T: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case HALF: + DoPrintHalfData(reinterpret_cast(data), count, elementsPerRow); + break; + case FLOAT: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case DOUBLE: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + default: + ERROR_LOG("Unsupported type: %d", dataType); + } + std::cout << std::endl; +} +#endif // DATA_UTILS_H diff --git a/operator/MatmulCustomSample/KernelLaunch/MatmulInvocationAcl/main.cpp b/operator/MatmulCustomSample/KernelLaunch/MatmulInvocationAcl/main.cpp new file mode 100644 index 0000000000000000000000000000000000000000..66866447636e1061dd419c1b743bbe3dce15fd38 --- /dev/null +++ b/operator/MatmulCustomSample/KernelLaunch/MatmulInvocationAcl/main.cpp @@ -0,0 +1,137 @@ +/** + * @file main.cpp + * + * Copyright (C) 2023-2024. Huawei Technologies Co., Ltd. All rights reserved. 
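+ *
+ * Note: this host program generates the TCubeTiling on the host side
+ * (GenerateTiling, see matmul_custom_tiling.cpp), copies it to device memory,
+ * queries the FFTS base address with rtGetC2cCtrlAddr, packs the kernel
+ * arguments (ffts address, A/B/C, workspace, tiling) into a device-side buffer,
+ * and launches "matmul_custom" from the loaded fatbin via aclrtLaunchKernel.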
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + */ +#include "data_utils.h" +#include "kernel_tiling/kernel_tiling.h" +#include "tiling/platform/platform_ascendc.h" +#include "acl/acl.h" + +typedef int32_t rtError_t; +extern uint8_t *GenerateTiling(const char *socVersion); +extern "C" rtError_t rtGetC2cCtrlAddr(uint64_t *addr, uint32_t *len); // from libruntime.so + +int32_t main(int32_t argc, char *argv[]) +{ + const char *socVersion = SOC_VERSION; + auto ascendcPlatform = platform_ascendc::PlatformAscendCManager::GetInstance(socVersion); + size_t aFileSize = 512 * 512 * sizeof(uint16_t); // uint16_t represent half + size_t bFileSize = 512 * 1024 * sizeof(uint16_t); // uint16_t represent half + size_t cFileSize = 512 * 1024 * sizeof(float); + size_t tilingFileSize = sizeof(TCubeTiling); + size_t userWorkspaceSize = 0; + size_t systemWorkspaceSize = static_cast(ascendcPlatform->GetLibApiWorkSpaceSize()); + size_t workspaceSize = userWorkspaceSize + systemWorkspaceSize; +#ifdef CUSTOM_ASCEND310P + uint32_t blockDim = 2; +#else + uint32_t blockDim = 1; +#endif + + CHECK_ACL(aclInit(nullptr)); + int32_t deviceId = 0; + CHECK_ACL(aclrtSetDevice(deviceId)); + aclrtStream stream = nullptr; + CHECK_ACL(aclrtCreateStream(&stream)); + + uint8_t *aHost; + uint8_t *aDevice; + CHECK_ACL(aclrtMallocHost((void **)(&aHost), aFileSize)); + CHECK_ACL(aclrtMalloc((void **)&aDevice, aFileSize, ACL_MEM_MALLOC_HUGE_FIRST)); + ReadFile("./input/x1_gm.bin", aFileSize, aHost, aFileSize); + CHECK_ACL(aclrtMemcpy(aDevice, aFileSize, aHost, aFileSize, ACL_MEMCPY_HOST_TO_DEVICE)); + + uint8_t *bHost; + uint8_t *bDevice; + CHECK_ACL(aclrtMallocHost((void **)(&bHost), bFileSize)); + CHECK_ACL(aclrtMalloc((void **)&bDevice, bFileSize, ACL_MEM_MALLOC_HUGE_FIRST)); + ReadFile("./input/x2_gm.bin", bFileSize, bHost, bFileSize); + CHECK_ACL(aclrtMemcpy(bDevice, bFileSize, bHost, bFileSize, ACL_MEMCPY_HOST_TO_DEVICE)); + + uint8_t *workspaceDevice; + CHECK_ACL(aclrtMalloc((void **)&workspaceDevice, workspaceSize, ACL_MEM_MALLOC_HUGE_FIRST)); + + uint8_t *tilingHost; + uint8_t *tilingDevice; + CHECK_ACL(aclrtMallocHost((void **)(&tilingHost), tilingFileSize)); + CHECK_ACL(aclrtMalloc((void **)&tilingDevice, tilingFileSize, ACL_MEM_MALLOC_HUGE_FIRST)); + CHECK_ACL( + aclrtMemcpy(tilingHost, tilingFileSize, GenerateTiling(socVersion), tilingFileSize, ACL_MEMCPY_HOST_TO_HOST)); + CHECK_ACL(aclrtMemcpy(tilingDevice, tilingFileSize, tilingHost, tilingFileSize, ACL_MEMCPY_HOST_TO_DEVICE)); + + uint8_t *cHost; + uint8_t *cDevice; + CHECK_ACL(aclrtMallocHost((void **)(&cHost), cFileSize)); + CHECK_ACL(aclrtMalloc((void **)&cDevice, cFileSize, ACL_MEM_MALLOC_HUGE_FIRST)); + + uint64_t fftsAddr; + uint32_t fftsAddrLen; + CHECK_ACL(rtGetC2cCtrlAddr(&fftsAddr, &fftsAddrLen)); + + struct { + uintptr_t fftsAddr; + uintptr_t aDevice; + uintptr_t bDevice; + uintptr_t cDevice; + uintptr_t workspaceDevice; + uintptr_t tilingDevice; + } args; + + auto *argsDeviceAddrInHost = &args; + CHECK_ACL(aclrtMallocHost((void **)&argsDeviceAddrInHost, sizeof(args))); + argsDeviceAddrInHost->fftsAddr = (uintptr_t)fftsAddr; + argsDeviceAddrInHost->aDevice = (uintptr_t)aDevice; + argsDeviceAddrInHost->bDevice = (uintptr_t)bDevice; + argsDeviceAddrInHost->cDevice = (uintptr_t)cDevice; + argsDeviceAddrInHost->workspaceDevice = (uintptr_t)workspaceDevice; + argsDeviceAddrInHost->tilingDevice = 
(uintptr_t)tilingDevice; + + void *argsDeviceAddrInDevice = nullptr; + CHECK_ACL(aclrtMalloc((void **)&argsDeviceAddrInDevice, sizeof(args), ACL_MEM_MALLOC_HUGE_FIRST)); + CHECK_ACL(aclrtMemcpy(argsDeviceAddrInDevice, sizeof(args), argsDeviceAddrInHost, sizeof(args), ACL_MEMCPY_HOST_TO_DEVICE)); + CHECK_ACL(aclrtFreeHost(argsDeviceAddrInHost)); + + const char *filePath = "./out/fatbin/ascendc_kernels/ascendc_kernels.o"; + std::unique_ptr fileMemory; + size_t fileSize = 0; + ReadBinaryFile(filePath, fileMemory, fileSize); + + aclrtBinary binary = aclrtCreateBinary(fileMemory.get(), fileSize); + if (binary == nullptr) { + printf("aclrtCreateBinary failed"); + return -1; + } + aclrtBinHandle binHandle = nullptr; + CHECK_ACL(aclrtBinaryLoad(binary, &binHandle)); + CHECK_ACL(aclrtDestroyBinary(binary)); + + aclrtFuncHandle funAdd = nullptr; + CHECK_ACL(aclrtBinaryGetFunction(binHandle, "matmul_custom", &funAdd)); + + CHECK_ACL(aclrtLaunchKernel(funAdd, blockDim, argsDeviceAddrInDevice, sizeof(args), stream)); + CHECK_ACL(aclrtSynchronizeStream(stream)); + + + CHECK_ACL(aclrtMemcpy(cHost, cFileSize, cDevice, cFileSize, ACL_MEMCPY_DEVICE_TO_HOST)); + WriteFile("./output/output.bin", cHost, cFileSize); + + CHECK_ACL(aclrtFree(aDevice)); + CHECK_ACL(aclrtFreeHost(aHost)); + CHECK_ACL(aclrtFree(bDevice)); + CHECK_ACL(aclrtFreeHost(bHost)); + CHECK_ACL(aclrtFree(workspaceDevice)); + CHECK_ACL(aclrtFree(tilingDevice)); + CHECK_ACL(aclrtFreeHost(tilingHost)); + CHECK_ACL(aclrtFree(cDevice)); + CHECK_ACL(aclrtFreeHost(cHost)); + + CHECK_ACL(aclrtDestroyStream(stream)); + CHECK_ACL(aclrtResetDevice(deviceId)); + CHECK_ACL(aclFinalize()); + return 0; +} \ No newline at end of file diff --git a/operator/MatmulCustomSample/KernelLaunch/MatmulInvocationAcl/matmul_custom.cpp b/operator/MatmulCustomSample/KernelLaunch/MatmulInvocationAcl/matmul_custom.cpp new file mode 100644 index 0000000000000000000000000000000000000000..d6c1e3aed1ab99bc98cb33ee08a1a05e5e82f009 --- /dev/null +++ b/operator/MatmulCustomSample/KernelLaunch/MatmulInvocationAcl/matmul_custom.cpp @@ -0,0 +1,103 @@ +/** + * @file matmul_custom.cpp + * + * Copyright (C) 2023-2024. Huawei Technologies Co., Ltd. All rights reserved. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
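+ *
+ * Note: the kernel copies the TCubeTiling structure from global memory,
+ * derives each core's A/B/C offsets and tail sizes in CalcGMOffset, and then
+ * drives the high-level Matmul API (REGIST_MATMUL_OBJ, SetTensorA/SetTensorB,
+ * SetTail, IterateAll) to compute C = A * B with half inputs and float output.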
+ */ +#include "kernel_operator.h" +#include "lib/matmul_intf.h" + +using namespace matmul; + +__aicore__ inline uint32_t Ceiling(uint32_t a, uint32_t b) +{ + return (a + b - 1) / b; +} + +__aicore__ inline void CopyTiling(TCubeTiling *tiling, GM_ADDR tilingGM) +{ + uint32_t *ptr = reinterpret_cast(tiling); + auto tiling32 = reinterpret_cast<__gm__ uint32_t *>(tilingGM); + + for (uint32_t i = 0; i < sizeof(TCubeTiling) / sizeof(uint32_t); i++, ptr++) { + *ptr = *(tiling32 + i); + } + return; +} + +__aicore__ inline void CalcGMOffset(int blockIdx, const TCubeTiling &tiling, int &offsetA, int &offsetB, int &offsetC, + int &tailM, int &tailN, bool isTransA, bool isTransB) +{ + uint32_t mSingleBlocks = Ceiling(tiling.M, tiling.singleCoreM); + uint32_t mCoreIndx = blockIdx % mSingleBlocks; + uint32_t nCoreIndx = blockIdx / mSingleBlocks; + + offsetA = mCoreIndx * tiling.Ka * tiling.singleCoreM; + if (isTransA) { + offsetA = mCoreIndx * tiling.singleCoreM; + } + offsetB = nCoreIndx * tiling.singleCoreN; + if (isTransB) { + offsetB = nCoreIndx * tiling.Kb * tiling.singleCoreN; + } + offsetC = mCoreIndx * tiling.N * tiling.singleCoreM + nCoreIndx * tiling.singleCoreN; + + tailM = tiling.M - mCoreIndx * tiling.singleCoreM; + tailM = tailM < tiling.singleCoreM ? tailM : tiling.singleCoreM; + + tailN = tiling.N - nCoreIndx * tiling.singleCoreN; + tailN = tailN < tiling.singleCoreN ? tailN : tiling.singleCoreN; +} + +extern "C" __global__ __aicore__ void matmul_custom(GM_ADDR a, GM_ADDR b, GM_ADDR c, GM_ADDR workspace, + GM_ADDR tilingGm) +{ + using A_T = half; + using B_T = half; + using C_T = float; + + AscendC::TPipe pipe; + TCubeTiling tiling; + CopyTiling(&tiling, tilingGm); + + AscendC::GlobalTensor aGlobal; + AscendC::GlobalTensor bGlobal; + AscendC::GlobalTensor cGlobal; + aGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ A_T *>(a), tiling.M * tiling.Ka); + bGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ B_T *>(b), tiling.Ka * tiling.N); + cGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ C_T *>(c), tiling.M * tiling.N); + + int offsetA = 0; + int offsetB = 0; + int offsetC = 0; + bool isTransA = false; + bool isTransB = false; + + int tailM = 0; + int tailN = 0; + CalcGMOffset(GetBlockIdx(), tiling, offsetA, offsetB, offsetC, tailM, tailN, isTransA, isTransB); + + auto gmA = aGlobal[offsetA]; + auto gmB = bGlobal[offsetB]; + auto gmC = cGlobal[offsetC]; + + Matmul, + MatmulType, + MatmulType> mm; + REGIST_MATMUL_OBJ(&pipe, GetSysWorkSpacePtr(), mm, &tiling); +#ifdef CUSTOM_ASCEND310P + AscendC::TBuf<> tmpMMFormatUb; + AscendC::LocalTensor mmFormatUb; + pipe.InitBuffer(tmpMMFormatUb, AscendC::TOTAL_VEC_LOCAL_SIZE); + mmFormatUb = tmpMMFormatUb.Get(AscendC::TOTAL_VEC_LOCAL_SIZE); + mm.SetLocalWorkspace(mmFormatUb); +#endif + mm.SetTensorA(gmA, isTransA); + mm.SetTensorB(gmB, isTransB); + mm.SetTail(tailM, tailN); + mm.IterateAll(gmC); + mm.End(); +} diff --git a/operator/MatmulCustomSample/KernelLaunch/MatmulInvocationAcl/matmul_custom_tiling.cpp b/operator/MatmulCustomSample/KernelLaunch/MatmulInvocationAcl/matmul_custom_tiling.cpp new file mode 100644 index 0000000000000000000000000000000000000000..ce527d41f186aa72b3b4cc0a7cac9f8f5325d334 --- /dev/null +++ b/operator/MatmulCustomSample/KernelLaunch/MatmulInvocationAcl/matmul_custom_tiling.cpp @@ -0,0 +1,75 @@ +/** + * @file matmul_custom_tiling.cpp + * + * Copyright (C) 2023-2024. Huawei Technologies Co., Ltd. All rights reserved. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + */ +#include +#include +#include +#include +#include + +#include "tiling/tiling_api.h" +#include "tiling/platform/platform_ascendc.h" +using namespace matmul_tiling; +using namespace std; + +uint8_t *GetTilingBuf(optiling::TCubeTiling *tilingData) +{ + uint32_t tilingSize = tilingData->GetDataSize(); + uint8_t *buf = (uint8_t *)malloc(tilingSize); + tilingData->SaveToBuffer(buf, tilingSize); + return buf; +} + +uint8_t *GenerateTiling(const char *socVersion) +{ + int M = 512; + int N = 1024; + int K = 512; + + TPosition leftPosition = TPosition::GM; + CubeFormat leftFormat = CubeFormat::ND; + DataType leftDtype = DataType::DT_FLOAT16; + bool isTransA = false; + + TPosition rightPosition = TPosition::GM; + CubeFormat rightFormat = CubeFormat::ND; + DataType rightDtype = DataType::DT_FLOAT16; + bool isTransB = false; + + TPosition resultPosition = TPosition::GM; + CubeFormat resultFormat = CubeFormat::ND; + DataType resultDtype = DataType::DT_FLOAT; + + bool isBias = false; + + int usedCoreNum = 2; + int32_t baseM = 128; + int32_t baseN = 256; + + optiling::TCubeTiling tilingData; + auto ascendcPlatform = platform_ascendc::PlatformAscendCManager::GetInstance(socVersion); + MultiCoreMatmulTiling tilingApi(*ascendcPlatform); + + tilingApi.SetDim(usedCoreNum); + tilingApi.SetAType(leftPosition, leftFormat, leftDtype, isTransA); + tilingApi.SetBType(rightPosition, rightFormat, rightDtype, isTransB); + tilingApi.SetCType(resultPosition, resultFormat, resultDtype); + + tilingApi.SetOrgShape(M, N, K); + tilingApi.SetShape(M, N, K); + tilingApi.SetFixSplit(baseM, baseN, -1); + tilingApi.SetBias(isBias); + tilingApi.SetBufferSpace(-1, -1, -1); + + int64_t res = tilingApi.GetTiling(tilingData); + if (res == -1) { + std::cout << "gen tiling failed" << std::endl; + } + return GetTilingBuf(&tilingData); +} diff --git a/operator/MatmulCustomSample/KernelLaunch/MatmulInvocationAcl/run.sh b/operator/MatmulCustomSample/KernelLaunch/MatmulInvocationAcl/run.sh new file mode 100644 index 0000000000000000000000000000000000000000..af6e9101619a2c5a3d6cc154a7ebe8d18fe78018 --- /dev/null +++ b/operator/MatmulCustomSample/KernelLaunch/MatmulInvocationAcl/run.sh @@ -0,0 +1,126 @@ +#!/bin/bash +CURRENT_DIR=$( + cd $(dirname ${BASH_SOURCE:-$0}) + pwd +) +cd $CURRENT_DIR + +BUILD_TYPE="Debug" +INSTALL_PREFIX="${CURRENT_DIR}/out" + +SHORT=r:,v:,i:,b:,p:, +LONG=run-mode:,soc-version:,install-path:,build-type:,install-prefix:, +OPTS=$(getopt -a --options $SHORT --longoptions $LONG -- "$@") +eval set -- "$OPTS" + +while :; do + case "$1" in + -r | --run-mode) + RUN_MODE="$2" + shift 2 + ;; + -v | --soc-version) + SOC_VERSION="$2" + shift 2 + ;; + -i | --install-path) + ASCEND_INSTALL_PATH="$2" + shift 2 + ;; + -b | --build-type) + BUILD_TYPE="$2" + shift 2 + ;; + -p | --install-prefix) + INSTALL_PREFIX="$2" + shift 2 + ;; + --) + shift + break + ;; + *) + echo "[ERROR] Unexpected option: $1" + break + ;; + esac +done + +RUN_MODE_LIST="cpu sim npu" +if [[ " $RUN_MODE_LIST " != *" $RUN_MODE "* ]]; then + echo "ERROR: RUN_MODE error, This sample only support specify cpu, sim or npu!" 
+ exit -1 +fi + +VERSION_LIST="Ascend310P1 Ascend310P3 Ascend910B1 Ascend910B2 Ascend910B3 Ascend910B4" +if [[ " $VERSION_LIST " != *" $SOC_VERSION "* ]]; then + echo "ERROR: SOC_VERSION should be in [$VERSION_LIST]" + exit -1 +fi + +if [ -n "$ASCEND_INSTALL_PATH" ]; then + _ASCEND_INSTALL_PATH=$ASCEND_INSTALL_PATH +elif [ -n "$ASCEND_HOME_PATH" ]; then + _ASCEND_INSTALL_PATH=$ASCEND_HOME_PATH +else + if [ -d "$HOME/Ascend/ascend-toolkit/latest" ]; then + _ASCEND_INSTALL_PATH=$HOME/Ascend/ascend-toolkit/latest + else + _ASCEND_INSTALL_PATH=/usr/local/Ascend/ascend-toolkit/latest + fi +fi + +export ASCEND_TOOLKIT_HOME=${_ASCEND_INSTALL_PATH} +export ASCEND_HOME_PATH=${_ASCEND_INSTALL_PATH} +if [ "${RUN_MODE}" = "npu" ]; then + source ${_ASCEND_INSTALL_PATH}/bin/setenv.bash +elif [ "${RUN_MODE}" = "sim" ]; then + # in case of running op in simulator, use stub .so instead + export LD_LIBRARY_PATH=${_ASCEND_INSTALL_PATH}/runtime/lib64/stub:$LD_LIBRARY_PATH + source ${_ASCEND_INSTALL_PATH}/bin/setenv.bash + export LD_LIBRARY_PATH=${_ASCEND_INSTALL_PATH}/tools/simulator/${SOC_VERSION}/lib:$LD_LIBRARY_PATH + if [ ! $CAMODEL_LOG_PATH ]; then + export CAMODEL_LOG_PATH=$(pwd)/sim_log + fi + if [ -d "$CAMODEL_LOG_PATH" ]; then + rm -rf $CAMODEL_LOG_PATH + fi + mkdir -p $CAMODEL_LOG_PATH +elif [ "${RUN_MODE}" = "cpu" ]; then + source ${_ASCEND_INSTALL_PATH}/bin/setenv.bash + export LD_LIBRARY_PATH=${_ASCEND_INSTALL_PATH}/tools/tikicpulib/lib:${_ASCEND_INSTALL_PATH}/tools/tikicpulib/lib/${SOC_VERSION}:${_ASCEND_INSTALL_PATH}/tools/simulator/${SOC_VERSION}/lib:$LD_LIBRARY_PATH +fi + +set -e +rm -rf build out +mkdir -p build +cmake -B build \ + -DRUN_MODE=${RUN_MODE} \ + -DSOC_VERSION=${SOC_VERSION} \ + -DCMAKE_BUILD_TYPE=${BUILD_TYPE} \ + -DCMAKE_INSTALL_PREFIX=${INSTALL_PREFIX} \ + -DASCEND_CANN_PACKAGE_PATH=${_ASCEND_INSTALL_PATH} +cmake --build build -j +cmake --install build + +rm -f ascendc_kernels_bbit +cp ./out/bin/ascendc_kernels_bbit ./ +rm -rf input output +mkdir -p input output +python3 scripts/gen_data.py +( + export LD_LIBRARY_PATH=$(pwd)/out/lib:$(pwd)/out/lib64:${_ASCEND_INSTALL_PATH}/lib64:$LD_LIBRARY_PATH + if [[ "$RUN_WITH_TOOLCHAIN" -eq 1 ]]; then + if [ "${RUN_MODE}" = "npu" ]; then + msprof op --application=./ascendc_kernels_bbit + elif [ "${RUN_MODE}" = "sim" ]; then + msprof op simulator --application=./ascendc_kernels_bbit + elif [ "${RUN_MODE}" = "cpu" ]; then + ./ascendc_kernels_bbit + fi + else + ./ascendc_kernels_bbit + fi +) +md5sum output/*.bin +python3 scripts/verify_result.py output/output.bin output/golden.bin diff --git a/operator/MatmulCustomSample/KernelLaunch/MatmulInvocationAcl/scripts/gen_data.py b/operator/MatmulCustomSample/KernelLaunch/MatmulInvocationAcl/scripts/gen_data.py new file mode 100644 index 0000000000000000000000000000000000000000..1ac70da4697b0bf82158d512d05f22751f103f8c --- /dev/null +++ b/operator/MatmulCustomSample/KernelLaunch/MatmulInvocationAcl/scripts/gen_data.py @@ -0,0 +1,31 @@ +#!/usr/bin/python3 +# coding=utf-8 +# +# Copyright (C) 2023-2024. Huawei Technologies Co., Ltd. All rights reserved. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
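+#
+# Note: this script creates random integer-valued fp16 inputs x1_gm [512, 512]
+# and x2_gm [512, 1024], computes the golden result as an fp32 matmul, and
+# writes the .bin files under ./input and ./output for run.sh to consume.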
+# =============================================================================== + +import numpy as np +import os + + +def gen_golden_data(): + M = 512 + N = 1024 + K = 512 + + x1_gm = np.random.randint(1, 10, [M, K]).astype(np.float16) + x2_gm = np.random.randint(1, 10, [K, N]).astype(np.float16) + golden = np.matmul(x1_gm.astype(np.float32), x2_gm.astype(np.float32)).astype(np.float32) + os.system("mkdir -p input") + os.system("mkdir -p output") + x1_gm.tofile("./input/x1_gm.bin") + x2_gm.tofile("./input/x2_gm.bin") + golden.tofile("./output/golden.bin") + + +if __name__ == "__main__": + gen_golden_data() diff --git a/operator/MatmulCustomSample/KernelLaunch/MatmulInvocationAcl/scripts/verify_result.py b/operator/MatmulCustomSample/KernelLaunch/MatmulInvocationAcl/scripts/verify_result.py new file mode 100644 index 0000000000000000000000000000000000000000..a325cfcc61f46f7de04ef6c979f852adcf213ef3 --- /dev/null +++ b/operator/MatmulCustomSample/KernelLaunch/MatmulInvocationAcl/scripts/verify_result.py @@ -0,0 +1,53 @@ +#!/usr/bin/python3 +# coding=utf-8 +# +# Copyright (C) 2023-2024. Huawei Technologies Co., Ltd. All rights reserved. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. +# =============================================================================== + +import sys +import numpy as np + +# for float32 +relative_tol = 1e-6 +absolute_tol = 1e-9 +error_tol = 1e-4 + + +def verify_result(output, golden): + output = np.fromfile(output, dtype=np.float32).reshape(-1) + golden = np.fromfile(golden, dtype=np.float32).reshape(-1) + different_element_results = np.isclose(output, + golden, + rtol=relative_tol, + atol=absolute_tol, + equal_nan=True) + different_element_indexes = np.where(different_element_results == False)[0] + for index in range(len(different_element_indexes)): + real_index = different_element_indexes[index] + golden_data = golden[real_index] + output_data = output[real_index] + print( + "data index: %06d, expected: %-.9f, actual: %-.9f, rdiff: %-.6f" % + (real_index, golden_data, output_data, + abs(output_data - golden_data) / golden_data)) + if index == 100: + break + error_ratio = float(different_element_indexes.size) / golden.size + print("error ratio: %.4f, tolrence: %.4f" % (error_ratio, error_tol)) + return error_ratio <= error_tol + + +if __name__ == '__main__': + try: + res = verify_result(sys.argv[1], sys.argv[2]) + if not res: + raise ValueError("[ERROR] result error") + else: + print("test pass") + except Exception as e: + print(e) + sys.exit(1) diff --git a/operator/MatmulLeakyReluCustomSample/KernelLaunch/MatmulLeakyReluInvocation/CMakeLists.txt b/operator/MatmulLeakyReluCustomSample/KernelLaunch/MatmulLeakyReluInvocation/CMakeLists.txt index f2915dca344aa27c11796a4dfe35cd9458bce21d..3ec5e567492bb261db76bc44c3ddc183f3f91c5b 100644 --- a/operator/MatmulLeakyReluCustomSample/KernelLaunch/MatmulLeakyReluInvocation/CMakeLists.txt +++ b/operator/MatmulLeakyReluCustomSample/KernelLaunch/MatmulLeakyReluInvocation/CMakeLists.txt @@ -57,4 +57,4 @@ install(TARGETS ascendc_kernels_bbit LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} -) \ No newline at end of file +) diff --git a/operator/MatmulLeakyReluCustomSample/KernelLaunch/MatmulLeakyReluInvocationAcl/CMakeLists.txt 
b/operator/MatmulLeakyReluCustomSample/KernelLaunch/MatmulLeakyReluInvocationAcl/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..d4434230ab4769365f596833664419b45012061b --- /dev/null +++ b/operator/MatmulLeakyReluCustomSample/KernelLaunch/MatmulLeakyReluInvocationAcl/CMakeLists.txt @@ -0,0 +1,67 @@ +cmake_minimum_required(VERSION 3.16) +project(Ascend_c) + +set(RUN_MODE "npu" CACHE STRING "cpu/sim/npu") +set(SOC_VERSION "Ascend310P3" CACHE STRING "system on chip type") +set(ASCEND_CANN_PACKAGE_PATH "/usr/local/Ascend/ascend-toolkit/latest" + CACHE STRING "ASCEND CANN package installation directory" +) +if(NOT CMAKE_BUILD_TYPE) + set(CMAKE_BUILD_TYPE "Debug" CACHE STRING "Build type Release/Debug (default Debug)" FORCE) +endif() +if(CMAKE_INSTALL_PREFIX STREQUAL /usr/local) + set(CMAKE_INSTALL_PREFIX "${CMAKE_CURRENT_LIST_DIR}/out" CACHE STRING "path for install()" FORCE) +endif() +file(GLOB KERNEL_FILES ${CMAKE_CURRENT_SOURCE_DIR}/matmul_leakyrelu_custom.cpp) +set(CUSTOM_ASCEND310P_LIST "Ascend310P1" "Ascend310P3") + +if("${RUN_MODE}" STREQUAL "cpu") + include(cmake/cpu_lib.cmake) +elseif("${RUN_MODE}" STREQUAL "sim" OR "${RUN_MODE}" STREQUAL "npu") + include(cmake/npu_lib.cmake) +else() + message("invalid RUN_MODE: ${RUN_MODE}") +endif() + +add_executable(ascendc_kernels_bbit + ${CMAKE_CURRENT_SOURCE_DIR}/main.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/matmul_leakyrelu_custom_tiling.cpp +) + +target_compile_options(ascendc_kernels_bbit PRIVATE + $:-g>> + -O2 -std=c++17 -D_GLIBCXX_USE_CXX11_ABI=0 -Wall -Werror +) + +target_compile_definitions(ascendc_kernels_bbit PRIVATE + $<$>:CUSTOM_ASCEND310P> + SOC_VERSION="${SOC_VERSION}" +) + +target_include_directories(ascendc_kernels_bbit PRIVATE + ${ASCEND_CANN_PACKAGE_PATH}/include/ + ${ASCEND_CANN_PACKAGE_PATH}/include/ascendc/ + ${ASCEND_CANN_PACKAGE_PATH}/include/ascendc/highlevel_api +) + +target_link_directories(ascendc_kernels_bbit PRIVATE + ${ASCEND_CANN_PACKAGE_PATH}/lib64 +) + +target_link_libraries(ascendc_kernels_bbit PRIVATE + ascendcl + $:ascendc_kernels>> + runtime + tiling_api + register + platform + ascendalog + c_sec + dl +) + +install(TARGETS ascendc_kernels_bbit + LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} + ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} + RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} +) diff --git a/operator/MatmulLeakyReluCustomSample/KernelLaunch/MatmulLeakyReluInvocationAcl/README.md b/operator/MatmulLeakyReluCustomSample/KernelLaunch/MatmulLeakyReluInvocationAcl/README.md new file mode 100644 index 0000000000000000000000000000000000000000..7e9b17abcda6e59a27fbb4faf4b005b17b225fbe --- /dev/null +++ b/operator/MatmulLeakyReluCustomSample/KernelLaunch/MatmulLeakyReluInvocationAcl/README.md @@ -0,0 +1,73 @@ +## 目录结构介绍 +``` +├── MatmulLeakyReluInvocation +│ ├── cmake // 编译工程文件 +│ ├── scripts +│ │ ├── verify_result.py // 真值对比文件 +│ │ └── gen_data.py // 输入数据和真值数据生成脚本文件 +│ ├── CMakeLists.txt // 编译工程文件 +│ ├── data_utils.h // 数据读入写出函数 +│ ├── main.cpp // 主函数,调用算子的应用程序,含CPU域及NPU域调用 +│ ├── matmul_leakyrelu_custom_tiling.cpp // 算子tiling实现 +│ ├── matmul_leakyrelu_custom.cpp // 算子kernel实现 +│ └── run.sh // 编译运行算子的脚本 +``` +## 代码实现介绍 +本调用样例中实现的是[m, n, k]固定为[1024, 640, 256]的MatmulLeakyRelu算子。 +- kernel实现 + MatmulLeakyRelu算子的数学表达式为: + ``` + C = A * B + Bias + C = C > 0 ? C : C * 0.001 + ``` + 其中A的形状为[1024, 256],B的形状为[256, 640],C的形状为[1024, 640],Bias的形状为[640]。具体请参考[matmul_leakyrelu_custom.cpp](./matmul_leakyrelu_custom.cpp)。 + +- 调用实现 + 1. CPU侧运行验证主要通过ICPU_RUN_KF CPU调测宏等CPU调测库提供的接口来完成; + 2. 
NPU侧运行验证主要通过使用ACLRT_LAUNCH_KERNEL内核调用宏来完成。 + + 应用程序通过ASCENDC_CPU_DEBUG 宏区分代码逻辑运行于CPU侧还是NPU侧。 + +## 运行样例算子 + - 打开样例目录 + 以命令行方式下载样例代码,master分支为例。 + ```bash + cd ${git_clone_path}/samples/operator/MatmulLeakyReluCustomSample/KernelLaunch/MatmulLeakyReluInvocation + ``` + - 配置环境变量 + + 请根据当前环境上CANN开发套件包的[安装方式](https://hiascend.com/document/redirect/CannCommunityInstSoftware),选择对应配置环境变量的命令。 + - 默认路径,root用户安装CANN软件包 + ```bash + export ASCEND_INSTALL_PATH=/usr/local/Ascend/ascend-toolkit/latest + ``` + - 默认路径,非root用户安装CANN软件包 + ```bash + export ASCEND_INSTALL_PATH=$HOME/Ascend/ascend-toolkit/latest + ``` + - 指定路径install_path,安装CANN软件包 + ```bash + export ASCEND_INSTALL_PATH=${install_path}/ascend-toolkit/latest + ``` + + - 样例执行 + + ```bash + bash run.sh -r [RUN_MODE] -v [SOC_VERSION] + ``` + - RUN_MODE:编译方式,可选择CPU调试,NPU仿真,NPU上板。支持参数为[cpu / sim / npu],默认值为npu。 + - SOC_VERSION:昇腾AI处理器型号,如果无法确定具体的[SOC_VERSION],则在安装昇腾AI处理器的服务器执行npu-smi info命令进行查询,在查询到的“Name”前增加Ascend信息,例如“Name”对应取值为xxxyy,实际配置的[SOC_VERSION]值为Ascendxxxyy。支持以下参数取值(xxx请替换为具体取值): + - Atlas 推理系列产品(Ascend 310P处理器)参数值:Ascend310P1、Ascend310P3 + - Atlas A2训练系列产品/Atlas 800I A2推理产品参数值:AscendxxxB1、AscendxxxB2、AscendxxxB3、AscendxxxB4 + + 示例如下。 + ```bash + bash run.sh -r cpu -v Ascend310P1 + ``` + +## 更新说明 +| 时间 | 更新事项 | 注意事项 | +| ---------- | ------------ | ------------------------------------------------ | +| 2023/05/21 | 更新本readme | | +| 2023/05/25 | 取消TCubeTiling大小硬编码 | 需要基于社区CANN包8.0.RC2.alpha002及之后版本运行 | +| 2023/06/11 | 取消workspace大小硬编码 | | \ No newline at end of file diff --git a/operator/MatmulLeakyReluCustomSample/KernelLaunch/MatmulLeakyReluInvocationAcl/cmake/cpu_lib.cmake b/operator/MatmulLeakyReluCustomSample/KernelLaunch/MatmulLeakyReluInvocationAcl/cmake/cpu_lib.cmake new file mode 100644 index 0000000000000000000000000000000000000000..76c8adb205173638149cdaaf68bc2d11ea9c6b21 --- /dev/null +++ b/operator/MatmulLeakyReluCustomSample/KernelLaunch/MatmulLeakyReluInvocationAcl/cmake/cpu_lib.cmake @@ -0,0 +1,12 @@ +if(NOT DEFINED ENV{CMAKE_PREFIX_PATH}) + set(CMAKE_PREFIX_PATH ${ASCEND_CANN_PACKAGE_PATH}/tools/tikicpulib/lib/cmake) +endif() +find_package(tikicpulib REQUIRED) + +add_library(ascendc_kernels SHARED ${KERNEL_FILES}) +target_link_libraries(ascendc_kernels PUBLIC tikicpulib::${SOC_VERSION}) +target_compile_definitions(ascendc_kernels PRIVATE + $<$>:CUSTOM_ASCEND310P> +) +target_compile_options(ascendc_kernels PRIVATE -g -O0 -std=c++17) +install(TARGETS ascendc_kernels DESTINATION ${CMAKE_INSTALL_LIBDIR}) diff --git a/operator/MatmulLeakyReluCustomSample/KernelLaunch/MatmulLeakyReluInvocationAcl/cmake/npu_lib.cmake b/operator/MatmulLeakyReluCustomSample/KernelLaunch/MatmulLeakyReluInvocationAcl/cmake/npu_lib.cmake new file mode 100644 index 0000000000000000000000000000000000000000..d4daabc5803c97726343560fe940cfd694642b46 --- /dev/null +++ b/operator/MatmulLeakyReluCustomSample/KernelLaunch/MatmulLeakyReluInvocationAcl/cmake/npu_lib.cmake @@ -0,0 +1,16 @@ +if(EXISTS ${ASCEND_CANN_PACKAGE_PATH}/compiler/tikcpp/ascendc_kernel_cmake) + set(ASCENDC_CMAKE_DIR ${ASCEND_CANN_PACKAGE_PATH}/compiler/tikcpp/ascendc_kernel_cmake) +elseif(EXISTS ${ASCEND_CANN_PACKAGE_PATH}/tools/tikcpp/ascendc_kernel_cmake) + set(ASCENDC_CMAKE_DIR ${ASCEND_CANN_PACKAGE_PATH}/tools/tikcpp/ascendc_kernel_cmake) +else() + message(FATAL_ERROR "ascendc_kernel_cmake does not exist ,please check whether the cann package is installed") +endif() +include(${ASCENDC_CMAKE_DIR}/ascendc.cmake) + +ascendc_fatbin_library(ascendc_kernels ${KERNEL_FILES}) + 
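The README formula above (C = A * B + Bias, then the 0.001 leaky slope) maps directly onto a NumPy reference; scripts/gen_data.py later performs the same computation to produce the golden data. A minimal sketch with the stated shapes A[1024,256], B[256,640], Bias[640]:

```python
# Minimal NumPy reference for C = LeakyRelu(A @ B + bias, alpha=0.001).
import numpy as np

a = np.random.randint(1, 10, [1024, 256]).astype(np.float16)
b = np.random.randint(1, 10, [256, 640]).astype(np.float16)
bias = np.random.randint(1, 10, [640]).astype(np.float32)

c = np.matmul(a.astype(np.float32), b.astype(np.float32)) + bias
c = np.where(c >= 0, c, c * 0.001)   # leaky slope from the README formula
print(c.shape, c.dtype)              # (1024, 640) float32
```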
+ascendc_compile_definitions(ascendc_kernels PRIVATE + $<$>:CUSTOM_ASCEND310P> + -DHAVE_WORKSPACE + -DHAVE_TILING +) diff --git a/operator/MatmulLeakyReluCustomSample/KernelLaunch/MatmulLeakyReluInvocationAcl/data_utils.h b/operator/MatmulLeakyReluCustomSample/KernelLaunch/MatmulLeakyReluInvocationAcl/data_utils.h new file mode 100644 index 0000000000000000000000000000000000000000..1d43459ef7c5858cf97a9ba43769e50b8dd28853 --- /dev/null +++ b/operator/MatmulLeakyReluCustomSample/KernelLaunch/MatmulLeakyReluInvocationAcl/data_utils.h @@ -0,0 +1,240 @@ +/** + * @file data_utils.h + * + * Copyright (C) 2023-2024. Huawei Technologies Co., Ltd. All rights reserved. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + */ +#ifndef DATA_UTILS_H +#define DATA_UTILS_H +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "acl/acl.h" + +typedef enum { + DT_UNDEFINED = -1, + FLOAT = 0, + HALF = 1, + INT8_T = 2, + INT32_T = 3, + UINT8_T = 4, + INT16_T = 6, + UINT16_T = 7, + UINT32_T = 8, + INT64_T = 9, + UINT64_T = 10, + DOUBLE = 11, + BOOL = 12, + STRING = 13, + COMPLEX64 = 16, + COMPLEX128 = 17, + BF16 = 27 +} printDataType; + +#define INFO_LOG(fmt, args...) fprintf(stdout, "[INFO] " fmt "\n", ##args) +#define WARN_LOG(fmt, args...) fprintf(stdout, "[WARN] " fmt "\n", ##args) +#define ERROR_LOG(fmt, args...) fprintf(stdout, "[ERROR] " fmt "\n", ##args) +#define CHECK_ACL(x) \ + do { \ + aclError __ret = x; \ + if (__ret != ACL_ERROR_NONE) { \ + std::cerr << __FILE__ << ":" << __LINE__ << " aclError:" << __ret << std::endl; \ + } \ + } while (0); + +/** + * @brief Read data from file + * @param [in] filePath: file path + * @param [out] fileSize: file size + * @return read result + */ +bool ReadFile(const std::string &filePath, size_t &fileSize, void *buffer, size_t bufferSize) +{ + struct stat sBuf; + int fileStatus = stat(filePath.data(), &sBuf); + if (fileStatus == -1) { + ERROR_LOG("failed to get file"); + return false; + } + if (S_ISREG(sBuf.st_mode) == 0) { + ERROR_LOG("%s is not a file, please enter a file", filePath.c_str()); + return false; + } + + std::ifstream file; + file.open(filePath, std::ios::binary); + if (!file.is_open()) { + ERROR_LOG("Open file failed. path = %s", filePath.c_str()); + return false; + } + + std::filebuf *buf = file.rdbuf(); + size_t size = buf->pubseekoff(0, std::ios::end, std::ios::in); + if (size == 0) { + ERROR_LOG("file size is 0"); + file.close(); + return false; + } + if (size > bufferSize) { + ERROR_LOG("file size is larger than buffer size"); + file.close(); + return false; + } + buf->pubseekpos(0, std::ios::in); + buf->sgetn(static_cast(buffer), size); + fileSize = size; + file.close(); + return true; +} + +/** + * @brief Write data to file + * @param [in] filePath: file path + * @param [in] buffer: data to write to file + * @param [in] size: size to write + * @return write result + */ +bool WriteFile(const std::string &filePath, const void *buffer, size_t size) +{ + if (buffer == nullptr) { + ERROR_LOG("Write file failed. buffer is nullptr"); + return false; + } + + int fd = open(filePath.c_str(), O_RDWR | O_CREAT | O_TRUNC, S_IRUSR | S_IWRITE); + if (fd < 0) { + ERROR_LOG("Open file failed. 
path = %s", filePath.c_str()); + return false; + } + + size_t writeSize = write(fd, buffer, size); + (void)close(fd); + if (writeSize != size) { + ERROR_LOG("Write file Failed."); + return false; + } + + return true; +} + +/** + * @brief Reads a binary file into memory. + * + * This function opens a binary file, reads its contents into a dynamically allocated memory buffer, + * and returns a pointer to the buffer and the size of the file through output parameters. + * + * @param filePath The path to the binary file to be read. + * @param outBuffer A reference to a unique pointer that will hold the file data. + * @param outSize A reference to a size_t that will hold the size of the file. + * @return true if the file was read successfully, false otherwise. + */ +bool ReadBinaryFile(const char *filePath, std::unique_ptr &outBuffer, size_t &outSize) +{ + FILE *file = fopen(filePath, "rb"); + if (!file) { + ERROR_LOG("Error opening file: %s\n", strerror(errno)); + return false; + } + + fseek(file, 0, SEEK_END); + outSize = ftell(file); + rewind(file); + + outBuffer.reset(new char[outSize]); + if (fread(outBuffer.get(), 1, outSize, file) != outSize) { + ERROR_LOG("Error reading file.\n"); + fclose(file); + return false; + } + + fclose(file); + return true; +} + +template void DoPrintData(const T *data, size_t count, size_t elementsPerRow) +{ + assert(elementsPerRow != 0); + for (size_t i = 0; i < count; ++i) { + std::cout << std::setw(10) << data[i]; + if (i % elementsPerRow == elementsPerRow - 1) { + std::cout << std::endl; + } + } +} + +void DoPrintHalfData(const aclFloat16 *data, size_t count, size_t elementsPerRow) +{ + assert(elementsPerRow != 0); + for (size_t i = 0; i < count; ++i) { + std::cout << std::setw(10) << std::setprecision(6) << aclFloat16ToFloat(data[i]); + if (i % elementsPerRow == elementsPerRow - 1) { + std::cout << std::endl; + } + } +} + +void PrintData(const void *data, size_t count, printDataType dataType, size_t elementsPerRow = 16) +{ + if (data == nullptr) { + ERROR_LOG("Print data failed. 
data is nullptr"); + return; + } + + switch (dataType) { + case BOOL: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case INT8_T: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case UINT8_T: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case INT16_T: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case UINT16_T: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case INT32_T: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case UINT32_T: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case INT64_T: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case UINT64_T: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case HALF: + DoPrintHalfData(reinterpret_cast(data), count, elementsPerRow); + break; + case FLOAT: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case DOUBLE: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + default: + ERROR_LOG("Unsupported type: %d", dataType); + } + std::cout << std::endl; +} +#endif // DATA_UTILS_H diff --git a/operator/MatmulLeakyReluCustomSample/KernelLaunch/MatmulLeakyReluInvocationAcl/main.cpp b/operator/MatmulLeakyReluCustomSample/KernelLaunch/MatmulLeakyReluInvocationAcl/main.cpp new file mode 100644 index 0000000000000000000000000000000000000000..dfcedf19f44f2cd3b34716cd5fbaa5f6e36dae37 --- /dev/null +++ b/operator/MatmulLeakyReluCustomSample/KernelLaunch/MatmulLeakyReluInvocationAcl/main.cpp @@ -0,0 +1,147 @@ +/** + * @file main.cpp + * + * Copyright (C) 2023-2024. Huawei Technologies Co., Ltd. All rights reserved. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
+ */ +#include "data_utils.h" +#include "kernel_tiling/kernel_tiling.h" +#include "tiling/platform/platform_ascendc.h" +#include "acl/acl.h" + +typedef int32_t rtError_t; +extern uint8_t *GenerateTiling(const char *socVersion); +extern "C" rtError_t rtGetC2cCtrlAddr(uint64_t *addr, uint32_t *len); // from libruntime.so + +int32_t main(int32_t argc, char *argv[]) +{ + const char *socVersion = SOC_VERSION; + auto ascendcPlatform = platform_ascendc::PlatformAscendCManager::GetInstance(socVersion); + size_t aFileSize = 262144 * sizeof(int16_t); + size_t bFileSize = 163840 * sizeof(int16_t); + size_t cFileSize = 655360 * sizeof(float); + size_t biasFileSize = 640 * sizeof(float); + size_t tilingFileSize = sizeof(TCubeTiling); + size_t userWorkspaceSize = 0; + size_t systemWorkspaceSize = static_cast(ascendcPlatform->GetLibApiWorkSpaceSize()); + size_t workspaceSize = userWorkspaceSize + systemWorkspaceSize; +#ifdef CUSTOM_ASCEND310P + uint32_t blockDim = 2; +#else + uint32_t blockDim = 1; +#endif + + CHECK_ACL(aclInit(nullptr)); + int32_t deviceId = 0; + CHECK_ACL(aclrtSetDevice(deviceId)); + aclrtStream stream = nullptr; + CHECK_ACL(aclrtCreateStream(&stream)); + + uint8_t *inputAHost; + uint8_t *inputADevice; + CHECK_ACL(aclrtMallocHost((void **)(&inputAHost), aFileSize)); + CHECK_ACL(aclrtMalloc((void **)&inputADevice, aFileSize, ACL_MEM_MALLOC_HUGE_FIRST)); + ReadFile("./input/x1_gm.bin", aFileSize, inputAHost, aFileSize); + CHECK_ACL(aclrtMemcpy(inputADevice, aFileSize, inputAHost, aFileSize, ACL_MEMCPY_HOST_TO_DEVICE)); + + uint8_t *inputBHost; + uint8_t *inputBDevice; + CHECK_ACL(aclrtMallocHost((void **)(&inputBHost), bFileSize)); + CHECK_ACL(aclrtMalloc((void **)&inputBDevice, bFileSize, ACL_MEM_MALLOC_HUGE_FIRST)); + ReadFile("./input/x2_gm.bin", bFileSize, inputBHost, bFileSize); + CHECK_ACL(aclrtMemcpy(inputBDevice, bFileSize, inputBHost, bFileSize, ACL_MEMCPY_HOST_TO_DEVICE)); + + uint8_t *outputCHost; + uint8_t *outputCDevice; + CHECK_ACL(aclrtMallocHost((void **)(&outputCHost), cFileSize)); + CHECK_ACL(aclrtMalloc((void **)&outputCDevice, cFileSize, ACL_MEM_MALLOC_HUGE_FIRST)); + + uint8_t *inputBiasHost; + uint8_t *inputBiasDevice; + CHECK_ACL(aclrtMallocHost((void **)(&inputBiasHost), biasFileSize)); + CHECK_ACL(aclrtMalloc((void **)&inputBiasDevice, biasFileSize, ACL_MEM_MALLOC_HUGE_FIRST)); + ReadFile("./input/bias.bin", biasFileSize, inputBiasHost, biasFileSize); + CHECK_ACL(aclrtMemcpy(inputBiasDevice, biasFileSize, inputBiasHost, biasFileSize, ACL_MEMCPY_HOST_TO_DEVICE)); + + uint8_t *tilingHost; + uint8_t *tilingDevice; + CHECK_ACL(aclrtMallocHost((void **)(&tilingHost), tilingFileSize)); + CHECK_ACL(aclrtMalloc((void **)&tilingDevice, tilingFileSize, ACL_MEM_MALLOC_HUGE_FIRST)); + CHECK_ACL( + aclrtMemcpy(tilingHost, tilingFileSize, GenerateTiling(socVersion), tilingFileSize, ACL_MEMCPY_HOST_TO_HOST)); + CHECK_ACL(aclrtMemcpy(tilingDevice, tilingFileSize, tilingHost, tilingFileSize, ACL_MEMCPY_HOST_TO_DEVICE)); + + uint8_t *workspaceDevice; + CHECK_ACL(aclrtMalloc((void **)&workspaceDevice, workspaceSize, ACL_MEM_MALLOC_HUGE_FIRST)); + + uint64_t fftsAddr; + uint32_t fftsAddrLen; + CHECK_ACL(rtGetC2cCtrlAddr(&fftsAddr, &fftsAddrLen)); + + struct { + uintptr_t fftsAddr; + uintptr_t inputADevice; + uintptr_t inputBDevice; + uintptr_t inputBiasDevice; + uintptr_t outputCDevice; + uintptr_t workspaceDevice; + uintptr_t tilingDevice; + } args; + + auto *argsDeviceAddrInHost = &args; + CHECK_ACL(aclrtMallocHost((void **)&argsDeviceAddrInHost, sizeof(args))); + 
argsDeviceAddrInHost->fftsAddr = (uintptr_t)fftsAddr; + argsDeviceAddrInHost->inputADevice = (uintptr_t)inputADevice; + argsDeviceAddrInHost->inputBDevice = (uintptr_t)inputBDevice; + argsDeviceAddrInHost->inputBiasDevice = (uintptr_t)inputBiasDevice; + argsDeviceAddrInHost->outputCDevice = (uintptr_t)outputCDevice; + argsDeviceAddrInHost->workspaceDevice = (uintptr_t)workspaceDevice; + argsDeviceAddrInHost->tilingDevice = (uintptr_t)tilingDevice; + + void *argsDeviceAddrInDevice = nullptr; + CHECK_ACL(aclrtMalloc((void **)&argsDeviceAddrInDevice, sizeof(args), ACL_MEM_MALLOC_HUGE_FIRST)); + CHECK_ACL(aclrtMemcpy(argsDeviceAddrInDevice, sizeof(args), argsDeviceAddrInHost, sizeof(args), ACL_MEMCPY_HOST_TO_DEVICE)); + CHECK_ACL(aclrtFreeHost(argsDeviceAddrInHost)); + + const char *filePath = "./out/fatbin/ascendc_kernels/ascendc_kernels.o"; + std::unique_ptr fileMemory; + size_t fileSize = 0; + ReadBinaryFile(filePath, fileMemory, fileSize); + + aclrtBinary binary = aclrtCreateBinary(fileMemory.get(), fileSize); + if (binary == nullptr) { + printf("aclrtCreateBinary failed"); + return -1; + } + aclrtBinHandle binHandle = nullptr; + CHECK_ACL(aclrtBinaryLoad(binary, &binHandle)); + CHECK_ACL(aclrtDestroyBinary(binary)); + + aclrtFuncHandle funAdd = nullptr; + CHECK_ACL(aclrtBinaryGetFunction(binHandle, "matmul_leakyrelu_custom", &funAdd)); + + CHECK_ACL(aclrtLaunchKernel(funAdd, blockDim, argsDeviceAddrInDevice, sizeof(args), stream)); + CHECK_ACL(aclrtSynchronizeStream(stream)); + + CHECK_ACL(aclrtFree(inputADevice)); + CHECK_ACL(aclrtFreeHost(inputAHost)); + CHECK_ACL(aclrtFree(inputBDevice)); + CHECK_ACL(aclrtFreeHost(inputBHost)); + CHECK_ACL(aclrtMemcpy(outputCHost, cFileSize, outputCDevice, cFileSize, ACL_MEMCPY_DEVICE_TO_HOST)); + WriteFile("./output/output.bin", outputCHost, cFileSize); + CHECK_ACL(aclrtFree(outputCDevice)); + CHECK_ACL(aclrtFreeHost(outputCHost)); + CHECK_ACL(aclrtFree(inputBiasDevice)); + CHECK_ACL(aclrtFreeHost(inputBiasHost)); + CHECK_ACL(aclrtFree(tilingDevice)); + CHECK_ACL(aclrtFreeHost(tilingHost)); + CHECK_ACL(aclrtFree(workspaceDevice)); + + CHECK_ACL(aclrtDestroyStream(stream)); + CHECK_ACL(aclrtResetDevice(deviceId)); + CHECK_ACL(aclFinalize()); + return 0; +} \ No newline at end of file diff --git a/operator/MatmulLeakyReluCustomSample/KernelLaunch/MatmulLeakyReluInvocationAcl/matmul_leakyrelu_custom.cpp b/operator/MatmulLeakyReluCustomSample/KernelLaunch/MatmulLeakyReluInvocationAcl/matmul_leakyrelu_custom.cpp new file mode 100644 index 0000000000000000000000000000000000000000..ac50143b5cd7faf027a7f469084c7cad3303639e --- /dev/null +++ b/operator/MatmulLeakyReluCustomSample/KernelLaunch/MatmulLeakyReluInvocationAcl/matmul_leakyrelu_custom.cpp @@ -0,0 +1,155 @@ +/** + * @file matmul_leakyrelu_custom.cpp + * + * Copyright (C) 2023-2024. Huawei Technologies Co., Ltd. All rights reserved. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
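main.cpp passes the launch arguments to aclrtLaunchKernel as one contiguous block of pointer-sized fields: the FFTS control address first, then the six device pointers in the same order as the kernel's GM_ADDR parameters (a, b, bias, c, workspace, tiling). On a 64-bit host the block is just eight-byte slots back to back; the sketch below only illustrates that layout with placeholder addresses and is not an ACL API.

```python
# Illustrative layout of the launch-argument block built in main.cpp:
# 7 pointer-sized fields (64-bit assumed), fftsAddr followed by the kernel's
# GM_ADDR parameters in declaration order. Addresses are hypothetical.
import struct

ffts_addr = 0x1000
dev_ptrs = [0x2000, 0x3000, 0x4000, 0x5000, 0x6000, 0x7000]  # a, b, bias, c, workspace, tiling
args_blob = struct.pack("<7Q", ffts_addr, *dev_ptrs)
assert len(args_blob) == 7 * 8   # matches sizeof(args) with 64-bit pointers
```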
+ */ +#include "kernel_operator.h" +#include "lib/matmul_intf.h" + +using namespace matmul; + +__aicore__ inline uint32_t Ceiling(uint32_t a, uint32_t b) +{ + return (a + b - 1) / b; +} + +__aicore__ inline void CopyTiling(TCubeTiling *tiling, GM_ADDR tilingGM) +{ + uint32_t *ptr = reinterpret_cast(tiling); + auto tiling32 = reinterpret_cast<__gm__ uint32_t *>(tilingGM); + + for (uint32_t i = 0; i < sizeof(TCubeTiling) / sizeof(uint32_t); i++, ptr++) { + *ptr = *(tiling32 + i); + } + return; +} + +template class MatmulLeakyKernel { +public: + __aicore__ inline MatmulLeakyKernel(){}; + __aicore__ inline void Init(GM_ADDR a, GM_ADDR b, GM_ADDR bias, GM_ADDR c, GM_ADDR workspace, + const TCubeTiling &tiling, AscendC::TPipe *pipe); + __aicore__ inline void Process(AscendC::TPipe *pipe); + + __aicore__ inline void MatmulCompute(); + __aicore__ inline void LeakyReluCompute(); + __aicore__ inline void CopyOut(uint32_t count); + __aicore__ inline void CalcOffset(int32_t blockIdx, const TCubeTiling &tiling, int32_t &offsetA, int32_t &offsetB, + int32_t &offsetC, int32_t &offsetBias); + + Matmul, MatmulType, + MatmulType, MatmulType> + matmulObj; + + AscendC::GlobalTensor aGlobal; + AscendC::GlobalTensor bGlobal; + AscendC::GlobalTensor cGlobal; + AscendC::GlobalTensor biasGlobal; + AscendC::LocalTensor reluOutLocal; + TCubeTiling tiling; + AscendC::TQue reluOutQueue_; +}; + +template +__aicore__ inline void MatmulLeakyKernel::Init(GM_ADDR a, GM_ADDR b, GM_ADDR bias, + GM_ADDR c, GM_ADDR workspace, + const TCubeTiling &tiling, AscendC::TPipe *pipe) +{ + this->tiling = tiling; + aGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ aType *>(a), tiling.M * tiling.Ka); + bGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ bType *>(b), tiling.Kb * tiling.N); + cGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ cType *>(c), tiling.M * tiling.N); + biasGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ biasType *>(bias), tiling.N); + + int32_t offsetA, offsetB, offsetC, offsetBias; + CalcOffset(AscendC::GetBlockIdx(), tiling, offsetA, offsetB, offsetC, offsetBias); + aGlobal = aGlobal[offsetA]; + bGlobal = bGlobal[offsetB]; + cGlobal = cGlobal[offsetC]; + biasGlobal = biasGlobal[offsetBias]; + pipe->InitBuffer(reluOutQueue_, 1, tiling.baseM * tiling.baseN * sizeof(cType)); +} + +template +__aicore__ inline void MatmulLeakyKernel::Process(AscendC::TPipe *pipe) +{ + uint32_t computeRound = 0; + +#ifdef CUSTOM_ASCEND310P + AscendC::TBuf<> tmpMMFormatUb; + AscendC::LocalTensor mmformatUb; + pipe->InitBuffer(tmpMMFormatUb, tiling.baseM * tiling.baseN * sizeof(cType)); + mmformatUb = tmpMMFormatUb.Get(tiling.baseM * tiling.baseN * sizeof(cType)); + matmulObj.SetLocalWorkspace(mmformatUb); +#endif + matmulObj.SetTensorA(aGlobal); + matmulObj.SetTensorB(bGlobal); + matmulObj.SetBias(biasGlobal); + while (matmulObj.template Iterate()) { + MatmulCompute(); + LeakyReluCompute(); + CopyOut(computeRound); + computeRound++; + } + matmulObj.End(); +} + +template +__aicore__ inline void MatmulLeakyKernel::MatmulCompute() +{ + reluOutLocal = reluOutQueue_.AllocTensor(); + matmulObj.template GetTensorC(reluOutLocal, false, true); +} + +template +__aicore__ inline void MatmulLeakyKernel::LeakyReluCompute() +{ + LeakyRelu(reluOutLocal, reluOutLocal, (cType)0.001, tiling.baseM * tiling.baseN); + reluOutQueue_.EnQue(reluOutLocal); +} + +template +__aicore__ inline void MatmulLeakyKernel::CopyOut(uint32_t count) +{ + reluOutQueue_.DeQue(); + const uint32_t roundM = tiling.singleCoreM / tiling.baseM; + const uint32_t roundN = tiling.singleCoreN 
/ tiling.baseN; + uint32_t startOffset = (count % roundM * tiling.baseM * tiling.N + count / roundM * tiling.baseN); + AscendC::DataCopyParams copyParam = {(uint16_t)tiling.baseM, (uint16_t)(tiling.baseN * sizeof(cType) / AscendC::DEFAULT_C0_SIZE), 0, + (uint16_t)((tiling.N - tiling.baseN) * sizeof(cType) / AscendC::DEFAULT_C0_SIZE)}; + DataCopy(cGlobal[startOffset], reluOutLocal, copyParam); + reluOutQueue_.FreeTensor(reluOutLocal); +} + +template +__aicore__ inline void +MatmulLeakyKernel::CalcOffset(int32_t blockIdx, const TCubeTiling &tiling, + int32_t &offsetA, int32_t &offsetB, int32_t &offsetC, + int32_t &offsetBias) +{ + auto mSingleBlocks = Ceiling(tiling.M, tiling.singleCoreM); + auto mCoreIndx = blockIdx % mSingleBlocks; + auto nCoreIndx = blockIdx / mSingleBlocks; + + offsetA = mCoreIndx * tiling.Ka * tiling.singleCoreM; + offsetB = nCoreIndx * tiling.singleCoreN; + offsetC = mCoreIndx * tiling.N * tiling.singleCoreM + nCoreIndx * tiling.singleCoreN; + offsetBias = nCoreIndx * tiling.singleCoreN; +} + +extern "C" __global__ __aicore__ void matmul_leakyrelu_custom(GM_ADDR a, GM_ADDR b, GM_ADDR bias, GM_ADDR c, + GM_ADDR workspace, GM_ADDR tilingGm) +{ + AscendC::TPipe pipe; + TCubeTiling tiling; + CopyTiling(&tiling, tilingGm); + + MatmulLeakyKernel matmulLeakyKernel; + matmulLeakyKernel.Init(a, b, bias, c, workspace, tiling, &pipe); + REGIST_MATMUL_OBJ(&pipe, GetSysWorkSpacePtr(), matmulLeakyKernel.matmulObj, &matmulLeakyKernel.tiling); + matmulLeakyKernel.Process(&pipe); +} \ No newline at end of file diff --git a/operator/MatmulLeakyReluCustomSample/KernelLaunch/MatmulLeakyReluInvocationAcl/matmul_leakyrelu_custom_tiling.cpp b/operator/MatmulLeakyReluCustomSample/KernelLaunch/MatmulLeakyReluInvocationAcl/matmul_leakyrelu_custom_tiling.cpp new file mode 100644 index 0000000000000000000000000000000000000000..287777e53ba89c06286c79522d3fd484fb6bb291 --- /dev/null +++ b/operator/MatmulLeakyReluCustomSample/KernelLaunch/MatmulLeakyReluInvocationAcl/matmul_leakyrelu_custom_tiling.cpp @@ -0,0 +1,82 @@ +/** + * @file matmul_leakyrelu_custom_tiling.cpp + * + * Copyright (C) 2023-2024. Huawei Technologies Co., Ltd. All rights reserved. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
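CalcOffset above derives each core's starting element in A, B, Bias and C purely from its block index and the singleCore shape. The same arithmetic in plain Python, with example tiling values (singleCoreM/singleCoreN here are illustrative, not necessarily what MultiCoreMatmulTiling chooses), makes the row/column split easier to see:

```python
# Same arithmetic as MatmulLeakyKernel::CalcOffset, with example tiling values.
def ceiling(a, b):
    return (a + b - 1) // b

def calc_offset(block_idx, M, N, Ka, single_core_m, single_core_n):
    m_single_blocks = ceiling(M, single_core_m)
    m_core = block_idx % m_single_blocks
    n_core = block_idx // m_single_blocks
    offset_a = m_core * Ka * single_core_m
    offset_b = n_core * single_core_n
    offset_c = m_core * N * single_core_m + n_core * single_core_n
    offset_bias = n_core * single_core_n
    return offset_a, offset_b, offset_c, offset_bias

for idx in range(2):  # blockDim is 2 when CUSTOM_ASCEND310P is defined in this sample
    print(idx, calc_offset(idx, 1024, 640, 256, 512, 640))
```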
+ */ +#include +#include +#include +#include +#include + +#include "tiling/tiling_api.h" +#include "tiling/platform/platform_ascendc.h" +using namespace matmul_tiling; +using namespace std; + +uint8_t *GetTilingBuf(optiling::TCubeTiling *tilingData) +{ + uint32_t tilingSize = tilingData->GetDataSize(); + uint8_t *buf = (uint8_t *)malloc(tilingSize); + tilingData->SaveToBuffer(buf, tilingSize); + return buf; +} + +uint8_t *GenerateTiling(const char *socVersion) +{ + int M = 1024; + int N = 640; + int K = 256; + + TPosition leftPosition = TPosition::GM; + CubeFormat leftFormat = CubeFormat::ND; + DataType leftDtype = DataType::DT_FLOAT16; + bool isTransA = false; + + TPosition rightPosition = TPosition::GM; + CubeFormat rightFormat = CubeFormat::ND; + DataType rightDtype = DataType::DT_FLOAT16; + bool isTransB = false; + + TPosition resultPosition = TPosition::GM; + CubeFormat resultFormat = CubeFormat::ND; + DataType resultDtype = DataType::DT_FLOAT; + + TPosition biasPosition = TPosition::GM; + CubeFormat biasFormat = CubeFormat::ND; + DataType biasDtype = DataType::DT_FLOAT; + bool isBias = true; + + int usedCoreNum = 2; + int baseM = 256; + int baseN = 128; + + optiling::TCubeTiling tilingData; + auto ascendcPlatform = platform_ascendc::PlatformAscendCManager::GetInstance(socVersion); + MultiCoreMatmulTiling tilingApi(*ascendcPlatform); + + tilingApi.SetDim(usedCoreNum); + tilingApi.SetAType(leftPosition, leftFormat, leftDtype, isTransA); + tilingApi.SetBType(rightPosition, rightFormat, rightDtype, isTransB); + tilingApi.SetCType(resultPosition, resultFormat, resultDtype); + tilingApi.SetBiasType(biasPosition, biasFormat, biasDtype); + + tilingApi.SetOrgShape(M, N, K); + tilingApi.SetShape(M, N, K); + tilingApi.SetBias(isBias); + tilingApi.SetTraverse(MatrixTraverse::FIRSTM); + tilingApi.SetFixSplit(baseM, baseN, -1); + tilingApi.SetBufferSpace(-1, -1, -1); + + int64_t res = tilingApi.GetTiling(tilingData); + tilingData.set_stepM(1); + tilingData.set_stepN(1); + if (res == -1) { + std::cout << "gen tiling failed" << std::endl; + } + return GetTilingBuf(&tilingData); +} diff --git a/operator/MatmulLeakyReluCustomSample/KernelLaunch/MatmulLeakyReluInvocationAcl/run.sh b/operator/MatmulLeakyReluCustomSample/KernelLaunch/MatmulLeakyReluInvocationAcl/run.sh new file mode 100644 index 0000000000000000000000000000000000000000..af6e9101619a2c5a3d6cc154a7ebe8d18fe78018 --- /dev/null +++ b/operator/MatmulLeakyReluCustomSample/KernelLaunch/MatmulLeakyReluInvocationAcl/run.sh @@ -0,0 +1,126 @@ +#!/bin/bash +CURRENT_DIR=$( + cd $(dirname ${BASH_SOURCE:-$0}) + pwd +) +cd $CURRENT_DIR + +BUILD_TYPE="Debug" +INSTALL_PREFIX="${CURRENT_DIR}/out" + +SHORT=r:,v:,i:,b:,p:, +LONG=run-mode:,soc-version:,install-path:,build-type:,install-prefix:, +OPTS=$(getopt -a --options $SHORT --longoptions $LONG -- "$@") +eval set -- "$OPTS" + +while :; do + case "$1" in + -r | --run-mode) + RUN_MODE="$2" + shift 2 + ;; + -v | --soc-version) + SOC_VERSION="$2" + shift 2 + ;; + -i | --install-path) + ASCEND_INSTALL_PATH="$2" + shift 2 + ;; + -b | --build-type) + BUILD_TYPE="$2" + shift 2 + ;; + -p | --install-prefix) + INSTALL_PREFIX="$2" + shift 2 + ;; + --) + shift + break + ;; + *) + echo "[ERROR] Unexpected option: $1" + break + ;; + esac +done + +RUN_MODE_LIST="cpu sim npu" +if [[ " $RUN_MODE_LIST " != *" $RUN_MODE "* ]]; then + echo "ERROR: RUN_MODE error, This sample only support specify cpu, sim or npu!" 
+ exit -1 +fi + +VERSION_LIST="Ascend310P1 Ascend310P3 Ascend910B1 Ascend910B2 Ascend910B3 Ascend910B4" +if [[ " $VERSION_LIST " != *" $SOC_VERSION "* ]]; then + echo "ERROR: SOC_VERSION should be in [$VERSION_LIST]" + exit -1 +fi + +if [ -n "$ASCEND_INSTALL_PATH" ]; then + _ASCEND_INSTALL_PATH=$ASCEND_INSTALL_PATH +elif [ -n "$ASCEND_HOME_PATH" ]; then + _ASCEND_INSTALL_PATH=$ASCEND_HOME_PATH +else + if [ -d "$HOME/Ascend/ascend-toolkit/latest" ]; then + _ASCEND_INSTALL_PATH=$HOME/Ascend/ascend-toolkit/latest + else + _ASCEND_INSTALL_PATH=/usr/local/Ascend/ascend-toolkit/latest + fi +fi + +export ASCEND_TOOLKIT_HOME=${_ASCEND_INSTALL_PATH} +export ASCEND_HOME_PATH=${_ASCEND_INSTALL_PATH} +if [ "${RUN_MODE}" = "npu" ]; then + source ${_ASCEND_INSTALL_PATH}/bin/setenv.bash +elif [ "${RUN_MODE}" = "sim" ]; then + # in case of running op in simulator, use stub .so instead + export LD_LIBRARY_PATH=${_ASCEND_INSTALL_PATH}/runtime/lib64/stub:$LD_LIBRARY_PATH + source ${_ASCEND_INSTALL_PATH}/bin/setenv.bash + export LD_LIBRARY_PATH=${_ASCEND_INSTALL_PATH}/tools/simulator/${SOC_VERSION}/lib:$LD_LIBRARY_PATH + if [ ! $CAMODEL_LOG_PATH ]; then + export CAMODEL_LOG_PATH=$(pwd)/sim_log + fi + if [ -d "$CAMODEL_LOG_PATH" ]; then + rm -rf $CAMODEL_LOG_PATH + fi + mkdir -p $CAMODEL_LOG_PATH +elif [ "${RUN_MODE}" = "cpu" ]; then + source ${_ASCEND_INSTALL_PATH}/bin/setenv.bash + export LD_LIBRARY_PATH=${_ASCEND_INSTALL_PATH}/tools/tikicpulib/lib:${_ASCEND_INSTALL_PATH}/tools/tikicpulib/lib/${SOC_VERSION}:${_ASCEND_INSTALL_PATH}/tools/simulator/${SOC_VERSION}/lib:$LD_LIBRARY_PATH +fi + +set -e +rm -rf build out +mkdir -p build +cmake -B build \ + -DRUN_MODE=${RUN_MODE} \ + -DSOC_VERSION=${SOC_VERSION} \ + -DCMAKE_BUILD_TYPE=${BUILD_TYPE} \ + -DCMAKE_INSTALL_PREFIX=${INSTALL_PREFIX} \ + -DASCEND_CANN_PACKAGE_PATH=${_ASCEND_INSTALL_PATH} +cmake --build build -j +cmake --install build + +rm -f ascendc_kernels_bbit +cp ./out/bin/ascendc_kernels_bbit ./ +rm -rf input output +mkdir -p input output +python3 scripts/gen_data.py +( + export LD_LIBRARY_PATH=$(pwd)/out/lib:$(pwd)/out/lib64:${_ASCEND_INSTALL_PATH}/lib64:$LD_LIBRARY_PATH + if [[ "$RUN_WITH_TOOLCHAIN" -eq 1 ]]; then + if [ "${RUN_MODE}" = "npu" ]; then + msprof op --application=./ascendc_kernels_bbit + elif [ "${RUN_MODE}" = "sim" ]; then + msprof op simulator --application=./ascendc_kernels_bbit + elif [ "${RUN_MODE}" = "cpu" ]; then + ./ascendc_kernels_bbit + fi + else + ./ascendc_kernels_bbit + fi +) +md5sum output/*.bin +python3 scripts/verify_result.py output/output.bin output/golden.bin diff --git a/operator/MatmulLeakyReluCustomSample/KernelLaunch/MatmulLeakyReluInvocationAcl/scripts/gen_data.py b/operator/MatmulLeakyReluCustomSample/KernelLaunch/MatmulLeakyReluInvocationAcl/scripts/gen_data.py new file mode 100644 index 0000000000000000000000000000000000000000..e03d4359cad05680cc261d8e638867803eacdd71 --- /dev/null +++ b/operator/MatmulLeakyReluCustomSample/KernelLaunch/MatmulLeakyReluInvocationAcl/scripts/gen_data.py @@ -0,0 +1,35 @@ +#!/usr/bin/python3 +# coding=utf-8 +# +# Copyright (C) 2023-2024. Huawei Technologies Co., Ltd. All rights reserved. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
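gen_data.py below fills A and B with small integers (1..9) stored as float16 and computes the golden in float32. One likely reason this pattern is robust: every intermediate value stays an exact integer well below 2^24, so the float32 golden carries no rounding ambiguity and the device result can be compared tightly. A quick bound check under that assumption:

```python
# Rough bound on the largest value the golden matmul-plus-bias can reach with
# the generator below (integer inputs in [1, 9], K = 256, bias in [1, 9]).
K = 256
max_product_sum = K * 9 * 9          # 20736
max_with_bias = max_product_sum + 9  # 20745
assert max_with_bias < 2**24         # exactly representable as a float32 integer
print("max golden value:", max_with_bias)
```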
+# =============================================================================== + +import numpy as np +import os + + +def gen_golden_data(): + M = 1024 + N = 640 + K = 256 + + input_a = np.random.randint(1, 10, [M, K]).astype(np.float16) + input_b = np.random.randint(1, 10, [K, N]).astype(np.float16) + input_bias = np.random.randint(1, 10, [N]).astype(np.float32) + alpha = 0.001 + golden = (np.matmul(input_a.astype(np.float32), input_b.astype(np.float32)) + input_bias).astype(np.float32) + golden = np.where(golden >= 0, golden, golden * alpha) + os.system("mkdir -p input") + os.system("mkdir -p output") + input_a.tofile("./input/x1_gm.bin") + input_b.tofile("./input/x2_gm.bin") + input_bias.tofile("./input/bias.bin") + golden.tofile("./output/golden.bin") + + +if __name__ == "__main__": + gen_golden_data() diff --git a/operator/MatmulLeakyReluCustomSample/KernelLaunch/MatmulLeakyReluInvocationAcl/scripts/verify_result.py b/operator/MatmulLeakyReluCustomSample/KernelLaunch/MatmulLeakyReluInvocationAcl/scripts/verify_result.py new file mode 100644 index 0000000000000000000000000000000000000000..a325cfcc61f46f7de04ef6c979f852adcf213ef3 --- /dev/null +++ b/operator/MatmulLeakyReluCustomSample/KernelLaunch/MatmulLeakyReluInvocationAcl/scripts/verify_result.py @@ -0,0 +1,53 @@ +#!/usr/bin/python3 +# coding=utf-8 +# +# Copyright (C) 2023-2024. Huawei Technologies Co., Ltd. All rights reserved. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. +# =============================================================================== + +import sys +import numpy as np + +# for float32 +relative_tol = 1e-6 +absolute_tol = 1e-9 +error_tol = 1e-4 + + +def verify_result(output, golden): + output = np.fromfile(output, dtype=np.float32).reshape(-1) + golden = np.fromfile(golden, dtype=np.float32).reshape(-1) + different_element_results = np.isclose(output, + golden, + rtol=relative_tol, + atol=absolute_tol, + equal_nan=True) + different_element_indexes = np.where(different_element_results == False)[0] + for index in range(len(different_element_indexes)): + real_index = different_element_indexes[index] + golden_data = golden[real_index] + output_data = output[real_index] + print( + "data index: %06d, expected: %-.9f, actual: %-.9f, rdiff: %-.6f" % + (real_index, golden_data, output_data, + abs(output_data - golden_data) / golden_data)) + if index == 100: + break + error_ratio = float(different_element_indexes.size) / golden.size + print("error ratio: %.4f, tolrence: %.4f" % (error_ratio, error_tol)) + return error_ratio <= error_tol + + +if __name__ == '__main__': + try: + res = verify_result(sys.argv[1], sys.argv[2]) + if not res: + raise ValueError("[ERROR] result error") + else: + print("test pass") + except Exception as e: + print(e) + sys.exit(1)
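run.sh invokes the script above as `python3 scripts/verify_result.py output/output.bin output/golden.bin`. Note that the pass criterion is not element-exact: elements are compared with np.isclose (rtol 1e-6, atol 1e-9), and the run passes as long as at most an error_tol fraction (1e-4) of elements misses that tolerance. A minimal standalone illustration of the same rule:

```python
# Same pass/fail rule as verify_result.py, applied to synthetic data.
import numpy as np

golden = np.linspace(1.0, 100.0, 10000, dtype=np.float32)
output = golden.copy()
output[0] += 1.0                     # one deliberately wrong element

mismatch = ~np.isclose(output, golden, rtol=1e-6, atol=1e-9, equal_nan=True)
error_ratio = mismatch.sum() / golden.size
print("error ratio:", error_ratio, "pass:", error_ratio <= 1e-4)  # 1/10000 == 1e-4 -> pass
```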