diff --git a/operator/AddCustomSample/KernelLaunch/AddKernelInvocationAcl/CMakeLists.txt b/operator/AddCustomSample/KernelLaunch/AddKernelInvocationAcl/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..3da49ce40ed150fa6aa4456d69246e5cbec7139d --- /dev/null +++ b/operator/AddCustomSample/KernelLaunch/AddKernelInvocationAcl/CMakeLists.txt @@ -0,0 +1,44 @@ +cmake_minimum_required(VERSION 3.16) +project(Ascend_c) + +set(RUN_MODE "npu" CACHE STRING "cpu/sim/npu") +set(SOC_VERSION "Ascend310P3" CACHE STRING "system on chip type") +set(ASCEND_CANN_PACKAGE_PATH "/usr/local/Ascend/ascend-toolkit/latest" + CACHE STRING "ASCEND CANN package installation directory" +) +if(NOT CMAKE_BUILD_TYPE) + set(CMAKE_BUILD_TYPE "Debug" CACHE STRING "Build type Release/Debug (default Debug)" FORCE) +endif() +if(CMAKE_INSTALL_PREFIX STREQUAL /usr/local) + set(CMAKE_INSTALL_PREFIX "${CMAKE_CURRENT_LIST_DIR}/out" CACHE STRING "path for install()" FORCE) +endif() + +file(GLOB KERNEL_FILES ${CMAKE_CURRENT_SOURCE_DIR}/add_custom.cpp) + +if("${RUN_MODE}" STREQUAL "cpu") + include(cmake/cpu_lib.cmake) +elseif("${RUN_MODE}" STREQUAL "sim" OR "${RUN_MODE}" STREQUAL "npu") + include(cmake/npu_lib.cmake) +else() + message("invalid RUN_MODE: ${RUN_MODE}") +endif() +add_executable(ascendc_kernels_bbit ${CMAKE_CURRENT_SOURCE_DIR}/main.cpp) + +target_compile_options(ascendc_kernels_bbit PRIVATE + $:-g>> + -O2 -std=c++17 -D_GLIBCXX_USE_CXX11_ABI=0 -Wall -Werror +) +set(RUN_FATBIN "ON" CACHE STRING "ON/OFF") +target_link_libraries(ascendc_kernels_bbit PRIVATE + $,$>:host_intf_pub>> + $:tikicpulib::${SOC_VERSION}>> + $,$>:ascendcl>> + $:c_sec>> + $<$:ascendc_kernels> +) + +install(TARGETS ascendc_kernels_bbit + LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} + ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} + RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} +) diff --git a/operator/AddCustomSample/KernelLaunch/AddKernelInvocationAcl/README.md b/operator/AddCustomSample/KernelLaunch/AddKernelInvocationAcl/README.md new file mode 100644 index 0000000000000000000000000000000000000000..e33acad20ea27048cfaf8b72877d5f3a3af5d731 --- /dev/null +++ b/operator/AddCustomSample/KernelLaunch/AddKernelInvocationAcl/README.md @@ -0,0 +1,71 @@ +## 目录结构介绍 +``` +├── AddKernelInvocationAcl +│ ├── cmake // 编译工程文件 +│ ├── scripts +│ │ ├── gen_data.py // 输入数据和真值数据生成脚本 +│ │ └── verify_result.py // 验证输出数据和真值数据是否一致的验证脚本 +│ ├── add_custom.cpp // 算子kernel实现 +│ ├── CMakeLists.txt // 编译工程文件 +│ ├── data_utils.h // 数据读入写出函数 +│ ├── main.cpp // 主函数,调用算子的应用程序,含CPU域及NPU域调用 +│ └── run.sh // 编译运行算子的脚本 +``` +## 代码实现介绍 +本调用样例中实现的是固定shape为8*2048的Add算子。 +- kernel实现 + Add算子的数学表达式为: + ``` + z = x + y + ``` + 计算逻辑是:Ascend C提供的矢量计算接口的操作元素都为LocalTensor,输入数据需要先搬运进片上存储,然后使用计算接口完成两个输入参数相加,得到最终结果,再搬出到外部存储上。 + + Add算子的实现流程分为3个基本任务:CopyIn,Compute,CopyOut。CopyIn任务负责将Global Memory上的输入Tensor xGm和yGm搬运到Local Memory,分别存储在xLocal、yLocal,Compute任务负责对xLocal、yLocal执行加法操作,计算结果存储在zLocal中,CopyOut任务负责将输出数据从zLocal搬运至Global Memory上的输出Tensor zGm中。具体请参考[add_custom.cpp](./add_custom.cpp)。 + +- 调用实现 + 1. CPU侧运行验证主要通过ICPU_RUN_KF CPU调测宏等CPU调测库提供的接口来完成; + 2. 
NPU侧运行验证主要通过使用aclrtLaunchKernel函数调用来完成。 + + 应用程序通过ASCENDC_CPU_DEBUG 宏区分代码逻辑运行于CPU侧还是NPU侧。 + +## 运行样例算子 + - 打开样例目录 + 以命令行方式下载样例代码,master分支为例。 + ```bash + cd ${git_clone_path}/samples/operator/AddCustomSample/KernelLaunch/AddKernelInvocationAcl + ``` + - 配置环境变量 + + 请根据当前环境上CANN开发套件包的[安装方式](https://hiascend.com/document/redirect/CannCommunityInstSoftware),选择对应配置环境变量的命令。 + - 默认路径,root用户安装CANN软件包 + ```bash + export ASCEND_INSTALL_PATH=/usr/local/Ascend/ascend-toolkit/latest + ``` + - 默认路径,非root用户安装CANN软件包 + ```bash + export ASCEND_INSTALL_PATH=$HOME/Ascend/ascend-toolkit/latest + ``` + - 指定路径install_path,安装CANN软件包 + ```bash + export ASCEND_INSTALL_PATH=${install_path}/ascend-toolkit/latest + ``` + + - 样例执行 + + ```bash + bash run.sh -r [RUN_MODE] -v [SOC_VERSION] + ``` + - RUN_MODE:编译方式,可选择CPU调试,NPU上板。支持参数为[cpu / npu] + - SOC_VERSION:昇腾AI处理器型号,如果无法确定具体的[SOC_VERSION],则在安装昇腾AI处理器的服务器执行npu-smi info命令进行查询,在查询到的“Name”前增加Ascend信息,例如“Name”对应取值为xxxyy,实际配置的[SOC_VERSION]值为Ascendxxxyy。支持以下参数取值(xxx请替换为具体取值): + - Atlas 推理系列产品(Ascend 310P处理器)参数值:Ascend310P1、Ascend310P3 + - Atlas 训练系列产品参数值:AscendxxxA、AscendxxxB + - Atlas A2训练系列产品参数值:AscendxxxB1、AscendxxxB2、AscendxxxB3、AscendxxxB4 + + 示例如下。 + ```bash + bash run.sh -r cpu -v Ascend310P1 + ``` +## 更新说明 +| 时间 | 更新事项 | +| ---------- | ------------ | +| 2024/08/31 | 新增本readme | \ No newline at end of file diff --git a/operator/AddCustomSample/KernelLaunch/AddKernelInvocationAcl/add_custom.cpp b/operator/AddCustomSample/KernelLaunch/AddKernelInvocationAcl/add_custom.cpp new file mode 100644 index 0000000000000000000000000000000000000000..96b37a7c3df7a7a9ba36781db9395f3b2f07f4bb --- /dev/null +++ b/operator/AddCustomSample/KernelLaunch/AddKernelInvocationAcl/add_custom.cpp @@ -0,0 +1,82 @@ +/** + * @file add_custom.cpp + * + * Copyright (C) 2024. Huawei Technologies Co., Ltd. All rights reserved. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
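+ *
+ * Note: this kernel adds two half-precision tensors of fixed shape 8 * 2048.
+ * Each of the 8 cores works on its own BLOCK_LENGTH slice of global memory and
+ * processes it in TILE_NUM * BUFFER_NUM iterations (double buffering), with each
+ * tile going through the CopyIn -> Compute -> CopyOut stages via TQue queues.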
+ */ +#include "kernel_operator.h" + +constexpr int32_t TOTAL_LENGTH = 8 * 2048; // total length of data +constexpr int32_t USE_CORE_NUM = 8; // num of core used +constexpr int32_t BLOCK_LENGTH = TOTAL_LENGTH / USE_CORE_NUM; // length computed of each core +constexpr int32_t TILE_NUM = 8; // split data into 8 tiles for each core +constexpr int32_t BUFFER_NUM = 2; // tensor num for each queue +constexpr int32_t TILE_LENGTH = BLOCK_LENGTH / TILE_NUM / BUFFER_NUM; // separate to 2 parts, due to double buffer + +class KernelAdd { +public: + __aicore__ inline KernelAdd() {} + __aicore__ inline void Init(GM_ADDR x, GM_ADDR y, GM_ADDR z) + { + xGm.SetGlobalBuffer((__gm__ half *)x + BLOCK_LENGTH * AscendC::GetBlockIdx(), BLOCK_LENGTH); + yGm.SetGlobalBuffer((__gm__ half *)y + BLOCK_LENGTH * AscendC::GetBlockIdx(), BLOCK_LENGTH); + zGm.SetGlobalBuffer((__gm__ half *)z + BLOCK_LENGTH * AscendC::GetBlockIdx(), BLOCK_LENGTH); + pipe.InitBuffer(inQueueX, BUFFER_NUM, TILE_LENGTH * sizeof(half)); + pipe.InitBuffer(inQueueY, BUFFER_NUM, TILE_LENGTH * sizeof(half)); + pipe.InitBuffer(outQueueZ, BUFFER_NUM, TILE_LENGTH * sizeof(half)); + } + __aicore__ inline void Process() + { + int32_t loopCount = TILE_NUM * BUFFER_NUM; + for (int32_t i = 0; i < loopCount; i++) { + CopyIn(i); + Compute(i); + CopyOut(i); + } + } + +private: + __aicore__ inline void CopyIn(int32_t progress) + { + AscendC::LocalTensor xLocal = inQueueX.AllocTensor(); + AscendC::LocalTensor yLocal = inQueueY.AllocTensor(); + AscendC::DataCopy(xLocal, xGm[progress * TILE_LENGTH], TILE_LENGTH); + AscendC::DataCopy(yLocal, yGm[progress * TILE_LENGTH], TILE_LENGTH); + inQueueX.EnQue(xLocal); + inQueueY.EnQue(yLocal); + } + __aicore__ inline void Compute(int32_t progress) + { + AscendC::LocalTensor xLocal = inQueueX.DeQue(); + AscendC::LocalTensor yLocal = inQueueY.DeQue(); + AscendC::LocalTensor zLocal = outQueueZ.AllocTensor(); + AscendC::Add(zLocal, xLocal, yLocal, TILE_LENGTH); + outQueueZ.EnQue(zLocal); + inQueueX.FreeTensor(xLocal); + inQueueY.FreeTensor(yLocal); + } + __aicore__ inline void CopyOut(int32_t progress) + { + AscendC::LocalTensor zLocal = outQueueZ.DeQue(); + AscendC::DataCopy(zGm[progress * TILE_LENGTH], zLocal, TILE_LENGTH); + outQueueZ.FreeTensor(zLocal); + } + +private: + AscendC::TPipe pipe; + AscendC::TQue inQueueX, inQueueY; + AscendC::TQue outQueueZ; + AscendC::GlobalTensor xGm; + AscendC::GlobalTensor yGm; + AscendC::GlobalTensor zGm; +}; + +extern "C" __global__ __aicore__ void add_custom(GM_ADDR x, GM_ADDR y, GM_ADDR z) +{ + KernelAdd op; + op.Init(x, y, z); + op.Process(); +} diff --git a/operator/AddCustomSample/KernelLaunch/AddKernelInvocationAcl/cmake/cpu_lib.cmake b/operator/AddCustomSample/KernelLaunch/AddKernelInvocationAcl/cmake/cpu_lib.cmake new file mode 100644 index 0000000000000000000000000000000000000000..751a119411509a4eeec79b76a875776206daeaf6 --- /dev/null +++ b/operator/AddCustomSample/KernelLaunch/AddKernelInvocationAcl/cmake/cpu_lib.cmake @@ -0,0 +1,9 @@ +if(NOT DEFINED ENV{CMAKE_PREFIX_PATH}) + set(CMAKE_PREFIX_PATH ${ASCEND_CANN_PACKAGE_PATH}/tools/tikicpulib/lib/cmake) +endif() +find_package(tikicpulib REQUIRED) + +add_library(ascendc_kernels SHARED ${KERNEL_FILES}) +target_link_libraries(ascendc_kernels PUBLIC tikicpulib::${SOC_VERSION}) +target_compile_options(ascendc_kernels PRIVATE -g -O0 -std=c++17) +install(TARGETS ascendc_kernels DESTINATION ${CMAKE_INSTALL_LIBDIR}) diff --git a/operator/AddCustomSample/KernelLaunch/AddKernelInvocationAcl/cmake/npu_lib.cmake 
b/operator/AddCustomSample/KernelLaunch/AddKernelInvocationAcl/cmake/npu_lib.cmake new file mode 100644 index 0000000000000000000000000000000000000000..d862f006417dcb8cf30cb8c33f293c2f869ff6e1 --- /dev/null +++ b/operator/AddCustomSample/KernelLaunch/AddKernelInvocationAcl/cmake/npu_lib.cmake @@ -0,0 +1,10 @@ +if(EXISTS ${ASCEND_CANN_PACKAGE_PATH}/compiler/tikcpp/ascendc_kernel_cmake) + set(ASCENDC_CMAKE_DIR ${ASCEND_CANN_PACKAGE_PATH}/compiler/tikcpp/ascendc_kernel_cmake) +elseif(EXISTS ${ASCEND_CANN_PACKAGE_PATH}/tools/tikcpp/ascendc_kernel_cmake) + set(ASCENDC_CMAKE_DIR ${ASCEND_CANN_PACKAGE_PATH}/tools/tikcpp/ascendc_kernel_cmake) +else() + message(FATAL_ERROR "ascendc_kernel_cmake does not exist ,please check whether the cann package is installed") +endif() +include(${ASCENDC_CMAKE_DIR}/ascendc.cmake) + +ascendc_fatbin_library(ascendc_kernels ${KERNEL_FILES}) diff --git a/operator/AddCustomSample/KernelLaunch/AddKernelInvocationAcl/data_utils.h b/operator/AddCustomSample/KernelLaunch/AddKernelInvocationAcl/data_utils.h new file mode 100644 index 0000000000000000000000000000000000000000..1d43459ef7c5858cf97a9ba43769e50b8dd28853 --- /dev/null +++ b/operator/AddCustomSample/KernelLaunch/AddKernelInvocationAcl/data_utils.h @@ -0,0 +1,240 @@ +/** + * @file data_utils.h + * + * Copyright (C) 2023-2024. Huawei Technologies Co., Ltd. All rights reserved. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + */ +#ifndef DATA_UTILS_H +#define DATA_UTILS_H +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "acl/acl.h" + +typedef enum { + DT_UNDEFINED = -1, + FLOAT = 0, + HALF = 1, + INT8_T = 2, + INT32_T = 3, + UINT8_T = 4, + INT16_T = 6, + UINT16_T = 7, + UINT32_T = 8, + INT64_T = 9, + UINT64_T = 10, + DOUBLE = 11, + BOOL = 12, + STRING = 13, + COMPLEX64 = 16, + COMPLEX128 = 17, + BF16 = 27 +} printDataType; + +#define INFO_LOG(fmt, args...) fprintf(stdout, "[INFO] " fmt "\n", ##args) +#define WARN_LOG(fmt, args...) fprintf(stdout, "[WARN] " fmt "\n", ##args) +#define ERROR_LOG(fmt, args...) fprintf(stdout, "[ERROR] " fmt "\n", ##args) +#define CHECK_ACL(x) \ + do { \ + aclError __ret = x; \ + if (__ret != ACL_ERROR_NONE) { \ + std::cerr << __FILE__ << ":" << __LINE__ << " aclError:" << __ret << std::endl; \ + } \ + } while (0); + +/** + * @brief Read data from file + * @param [in] filePath: file path + * @param [out] fileSize: file size + * @return read result + */ +bool ReadFile(const std::string &filePath, size_t &fileSize, void *buffer, size_t bufferSize) +{ + struct stat sBuf; + int fileStatus = stat(filePath.data(), &sBuf); + if (fileStatus == -1) { + ERROR_LOG("failed to get file"); + return false; + } + if (S_ISREG(sBuf.st_mode) == 0) { + ERROR_LOG("%s is not a file, please enter a file", filePath.c_str()); + return false; + } + + std::ifstream file; + file.open(filePath, std::ios::binary); + if (!file.is_open()) { + ERROR_LOG("Open file failed. 
path = %s", filePath.c_str()); + return false; + } + + std::filebuf *buf = file.rdbuf(); + size_t size = buf->pubseekoff(0, std::ios::end, std::ios::in); + if (size == 0) { + ERROR_LOG("file size is 0"); + file.close(); + return false; + } + if (size > bufferSize) { + ERROR_LOG("file size is larger than buffer size"); + file.close(); + return false; + } + buf->pubseekpos(0, std::ios::in); + buf->sgetn(static_cast(buffer), size); + fileSize = size; + file.close(); + return true; +} + +/** + * @brief Write data to file + * @param [in] filePath: file path + * @param [in] buffer: data to write to file + * @param [in] size: size to write + * @return write result + */ +bool WriteFile(const std::string &filePath, const void *buffer, size_t size) +{ + if (buffer == nullptr) { + ERROR_LOG("Write file failed. buffer is nullptr"); + return false; + } + + int fd = open(filePath.c_str(), O_RDWR | O_CREAT | O_TRUNC, S_IRUSR | S_IWRITE); + if (fd < 0) { + ERROR_LOG("Open file failed. path = %s", filePath.c_str()); + return false; + } + + size_t writeSize = write(fd, buffer, size); + (void)close(fd); + if (writeSize != size) { + ERROR_LOG("Write file Failed."); + return false; + } + + return true; +} + +/** + * @brief Reads a binary file into memory. + * + * This function opens a binary file, reads its contents into a dynamically allocated memory buffer, + * and returns a pointer to the buffer and the size of the file through output parameters. + * + * @param filePath The path to the binary file to be read. + * @param outBuffer A reference to a unique pointer that will hold the file data. + * @param outSize A reference to a size_t that will hold the size of the file. + * @return true if the file was read successfully, false otherwise. + */ +bool ReadBinaryFile(const char *filePath, std::unique_ptr &outBuffer, size_t &outSize) +{ + FILE *file = fopen(filePath, "rb"); + if (!file) { + ERROR_LOG("Error opening file: %s\n", strerror(errno)); + return false; + } + + fseek(file, 0, SEEK_END); + outSize = ftell(file); + rewind(file); + + outBuffer.reset(new char[outSize]); + if (fread(outBuffer.get(), 1, outSize, file) != outSize) { + ERROR_LOG("Error reading file.\n"); + fclose(file); + return false; + } + + fclose(file); + return true; +} + +template void DoPrintData(const T *data, size_t count, size_t elementsPerRow) +{ + assert(elementsPerRow != 0); + for (size_t i = 0; i < count; ++i) { + std::cout << std::setw(10) << data[i]; + if (i % elementsPerRow == elementsPerRow - 1) { + std::cout << std::endl; + } + } +} + +void DoPrintHalfData(const aclFloat16 *data, size_t count, size_t elementsPerRow) +{ + assert(elementsPerRow != 0); + for (size_t i = 0; i < count; ++i) { + std::cout << std::setw(10) << std::setprecision(6) << aclFloat16ToFloat(data[i]); + if (i % elementsPerRow == elementsPerRow - 1) { + std::cout << std::endl; + } + } +} + +void PrintData(const void *data, size_t count, printDataType dataType, size_t elementsPerRow = 16) +{ + if (data == nullptr) { + ERROR_LOG("Print data failed. 
data is nullptr"); + return; + } + + switch (dataType) { + case BOOL: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case INT8_T: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case UINT8_T: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case INT16_T: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case UINT16_T: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case INT32_T: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case UINT32_T: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case INT64_T: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case UINT64_T: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case HALF: + DoPrintHalfData(reinterpret_cast(data), count, elementsPerRow); + break; + case FLOAT: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case DOUBLE: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + default: + ERROR_LOG("Unsupported type: %d", dataType); + } + std::cout << std::endl; +} +#endif // DATA_UTILS_H diff --git a/operator/AddCustomSample/KernelLaunch/AddKernelInvocationAcl/main.cpp b/operator/AddCustomSample/KernelLaunch/AddKernelInvocationAcl/main.cpp new file mode 100644 index 0000000000000000000000000000000000000000..c868a66229af8e9fd18623709542d1925af4122c --- /dev/null +++ b/operator/AddCustomSample/KernelLaunch/AddKernelInvocationAcl/main.cpp @@ -0,0 +1,115 @@ +/** + * @file main.cpp + * + * Copyright (C) 2024. Huawei Technologies Co., Ltd. All rights reserved. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
+ */ +#include "data_utils.h" +#ifndef ASCENDC_CPU_DEBUG +#include "acl/acl.h" +#else +#include "tikicpulib.h" +extern "C" __global__ __aicore__ void add_custom(GM_ADDR x, GM_ADDR y, GM_ADDR z); +#endif + +int32_t main(int32_t argc, char *argv[]) +{ + uint32_t blockDim = 8; + size_t inputByteSize = 8 * 2048 * sizeof(uint16_t); + size_t outputByteSize = 8 * 2048 * sizeof(uint16_t); + +#ifdef ASCENDC_CPU_DEBUG + void *x = AscendC::GmAlloc(inputByteSize); + void *y = AscendC::GmAlloc(inputByteSize); + void *z = AscendC::GmAlloc(outputByteSize); + + ReadFile("./input/input_x.bin", inputByteSize, x, inputByteSize); + ReadFile("./input/input_y.bin", inputByteSize, y, inputByteSize); + + AscendC::SetKernelMode(KernelMode::AIV_MODE); + ICPU_RUN_KF(add_custom, blockDim, x, y, z); // use this macro for cpu debug + + WriteFile("./output/output_z.bin", z, outputByteSize); + + AscendC::GmFree(x); + AscendC::GmFree(y); + AscendC::GmFree(z); +#else + CHECK_ACL(aclInit(nullptr)); + int32_t deviceId = 0; + CHECK_ACL(aclrtSetDevice(deviceId)); + aclrtStream stream = nullptr; + CHECK_ACL(aclrtCreateStream(&stream)); + + void *xHost, *yHost, *zHost; + void *xDevice, *yDevice, *zDevice; + + CHECK_ACL(aclrtMallocHost((void **)(&xHost), inputByteSize)); + CHECK_ACL(aclrtMallocHost((void **)(&yHost), inputByteSize)); + CHECK_ACL(aclrtMallocHost((void **)(&zHost), outputByteSize)); + CHECK_ACL(aclrtMalloc((void **)&xDevice, inputByteSize, ACL_MEM_MALLOC_HUGE_FIRST)); + CHECK_ACL(aclrtMalloc((void **)&yDevice, inputByteSize, ACL_MEM_MALLOC_HUGE_FIRST)); + CHECK_ACL(aclrtMalloc((void **)&zDevice, outputByteSize, ACL_MEM_MALLOC_HUGE_FIRST)); + + ReadFile("./input/input_x.bin", inputByteSize, xHost, inputByteSize); + ReadFile("./input/input_y.bin", inputByteSize, yHost, inputByteSize); + + CHECK_ACL(aclrtMemcpy(xDevice, inputByteSize, xHost, inputByteSize, ACL_MEMCPY_HOST_TO_DEVICE)); + CHECK_ACL(aclrtMemcpy(yDevice, inputByteSize, yHost, inputByteSize, ACL_MEMCPY_HOST_TO_DEVICE)); + + struct { + uintptr_t xDeviceAddr; + uintptr_t yDeviceAddr; + uintptr_t zDeviceAddr; + } args; + auto *argsDeviceAddrInHost = &args; + CHECK_ACL(aclrtMallocHost((void **)&argsDeviceAddrInHost, sizeof(args))); + argsDeviceAddrInHost->xDeviceAddr = (uintptr_t)xDevice; + argsDeviceAddrInHost->yDeviceAddr = (uintptr_t)yDevice; + argsDeviceAddrInHost->zDeviceAddr = (uintptr_t)zDevice; + + void *argsDeviceAddrInDevice = nullptr; + CHECK_ACL(aclrtMalloc((void **)&argsDeviceAddrInDevice, sizeof(args), ACL_MEM_MALLOC_HUGE_FIRST)); + CHECK_ACL(aclrtMemcpy(argsDeviceAddrInDevice, sizeof(args), argsDeviceAddrInHost, sizeof(args), ACL_MEMCPY_HOST_TO_DEVICE)); + CHECK_ACL(aclrtFreeHost(argsDeviceAddrInHost)); + + const char *filePath = "./out/fatbin/ascendc_kernels/ascendc_kernels.o"; + std::unique_ptr fileMemory; + size_t fileSize = 0; + ReadBinaryFile(filePath, fileMemory, fileSize); + + aclrtBinary binary = aclrtCreateBinary(fileMemory.get(), fileSize); + if (binary == nullptr) { + printf("aclrtCreateBinary failed"); + return -1; + } + aclrtBinHandle binHandle = nullptr; + CHECK_ACL(aclrtBinaryLoad(binary, &binHandle)); + CHECK_ACL(aclrtDestroyBinary(binary)); + + aclrtFuncHandle funAdd = nullptr; + CHECK_ACL(aclrtBinaryGetFunction(binHandle, "add_custom", &funAdd)); + + CHECK_ACL(aclrtLaunchKernel(funAdd, blockDim, argsDeviceAddrInDevice, sizeof(args), stream)); + CHECK_ACL(aclrtSynchronizeStream(stream)); + + CHECK_ACL(aclrtMemcpy(zHost, outputByteSize, zDevice, outputByteSize, ACL_MEMCPY_DEVICE_TO_HOST)); + WriteFile("./output/output_z.bin", 
zHost, outputByteSize); + + CHECK_ACL(aclrtFree(argsDeviceAddrInDevice)); + CHECK_ACL(aclrtFree(xDevice)); + CHECK_ACL(aclrtFree(yDevice)); + CHECK_ACL(aclrtFree(zDevice)); + CHECK_ACL(aclrtFreeHost(xHost)); + CHECK_ACL(aclrtFreeHost(yHost)); + CHECK_ACL(aclrtFreeHost(zHost)); + + CHECK_ACL(aclrtDestroyStream(stream)); + CHECK_ACL(aclrtResetDevice(deviceId)); + CHECK_ACL(aclFinalize()); +#endif + return 0; +} diff --git a/operator/AddCustomSample/KernelLaunch/AddKernelInvocationAcl/run.sh b/operator/AddCustomSample/KernelLaunch/AddKernelInvocationAcl/run.sh new file mode 100644 index 0000000000000000000000000000000000000000..51d92599e647cb585fb18ea862a2acd1921f33dc --- /dev/null +++ b/operator/AddCustomSample/KernelLaunch/AddKernelInvocationAcl/run.sh @@ -0,0 +1,126 @@ +#!/bin/bash +CURRENT_DIR=$( + cd $(dirname ${BASH_SOURCE:-$0}) + pwd +) +cd $CURRENT_DIR + +BUILD_TYPE="Debug" +INSTALL_PREFIX="${CURRENT_DIR}/out" + +SHORT=r:,v:,i:,b:,p:, +LONG=run-mode:,soc-version:,install-path:,build-type:,install-prefix:, +OPTS=$(getopt -a --options $SHORT --longoptions $LONG -- "$@") +eval set -- "$OPTS" + +while :; do + case "$1" in + -r | --run-mode) + RUN_MODE="$2" + shift 2 + ;; + -v | --soc-version) + SOC_VERSION="$2" + shift 2 + ;; + -i | --install-path) + ASCEND_INSTALL_PATH="$2" + shift 2 + ;; + -b | --build-type) + BUILD_TYPE="$2" + shift 2 + ;; + -p | --install-prefix) + INSTALL_PREFIX="$2" + shift 2 + ;; + --) + shift + break + ;; + *) + echo "[ERROR] Unexpected option: $1" + break + ;; + esac +done + +RUN_MODE_LIST="cpu npu" +if [[ " $RUN_MODE_LIST " != *" $RUN_MODE "* ]]; then + echo "ERROR: RUN_MODE error, This sample only support specify cpu or npu!" + exit -1 +fi + +VERSION_LIST="Ascend910A Ascend910B Ascend310B1 Ascend310B2 Ascend310B3 Ascend310B4 Ascend310P1 Ascend310P3 Ascend910B1 Ascend910B2 Ascend910B3 Ascend910B4" +if [[ " $VERSION_LIST " != *" $SOC_VERSION "* ]]; then + echo "ERROR: SOC_VERSION should be in [$VERSION_LIST]" + exit -1 +fi + +if [ -n "$ASCEND_INSTALL_PATH" ]; then + _ASCEND_INSTALL_PATH=$ASCEND_INSTALL_PATH +elif [ -n "$ASCEND_HOME_PATH" ]; then + _ASCEND_INSTALL_PATH=$ASCEND_HOME_PATH +else + if [ -d "$HOME/Ascend/ascend-toolkit/latest" ]; then + _ASCEND_INSTALL_PATH=$HOME/Ascend/ascend-toolkit/latest + else + _ASCEND_INSTALL_PATH=/usr/local/Ascend/ascend-toolkit/latest + fi +fi + +export ASCEND_TOOLKIT_HOME=${_ASCEND_INSTALL_PATH} +export ASCEND_HOME_PATH=${_ASCEND_INSTALL_PATH} +if [ "${RUN_MODE}" = "npu" ]; then + source ${_ASCEND_INSTALL_PATH}/bin/setenv.bash +elif [ "${RUN_MODE}" = "sim" ]; then + # in case of running op in simulator, use stub .so instead + export LD_LIBRARY_PATH=${_ASCEND_INSTALL_PATH}/runtime/lib64/stub:$LD_LIBRARY_PATH + source ${_ASCEND_INSTALL_PATH}/bin/setenv.bash + export LD_LIBRARY_PATH=${_ASCEND_INSTALL_PATH}/tools/simulator/${SOC_VERSION}/lib:$LD_LIBRARY_PATH + if [ ! 
$CAMODEL_LOG_PATH ]; then + export CAMODEL_LOG_PATH=$(pwd)/sim_log + fi + if [ -d "$CAMODEL_LOG_PATH" ]; then + rm -rf $CAMODEL_LOG_PATH + fi + mkdir -p $CAMODEL_LOG_PATH +elif [ "${RUN_MODE}" = "cpu" ]; then + source ${_ASCEND_INSTALL_PATH}/bin/setenv.bash + export LD_LIBRARY_PATH=${_ASCEND_INSTALL_PATH}/tools/tikicpulib/lib:${_ASCEND_INSTALL_PATH}/tools/tikicpulib/lib/${SOC_VERSION}:${_ASCEND_INSTALL_PATH}/tools/simulator/${SOC_VERSION}/lib:$LD_LIBRARY_PATH +fi + +set -e +rm -rf build out +mkdir -p build +cmake -B build \ + -DRUN_MODE=${RUN_MODE} \ + -DSOC_VERSION=${SOC_VERSION} \ + -DCMAKE_BUILD_TYPE=${BUILD_TYPE} \ + -DCMAKE_INSTALL_PREFIX=${INSTALL_PREFIX} \ + -DASCEND_CANN_PACKAGE_PATH=${_ASCEND_INSTALL_PATH} +cmake --build build -j +cmake --install build + +rm -f ascendc_kernels_bbit +cp ./out/bin/ascendc_kernels_bbit ./ +rm -rf input output +mkdir -p input output +python3 scripts/gen_data.py +( + export LD_LIBRARY_PATH=$(pwd)/out/lib:$(pwd)/out/lib64:${_ASCEND_INSTALL_PATH}/lib64:$LD_LIBRARY_PATH + if [[ "$RUN_WITH_TOOLCHAIN" -eq 1 ]]; then + if [ "${RUN_MODE}" = "npu" ]; then + msprof op --application=./ascendc_kernels_bbit + elif [ "${RUN_MODE}" = "sim" ]; then + msprof op simulator --application=./ascendc_kernels_bbit + elif [ "${RUN_MODE}" = "cpu" ]; then + ./ascendc_kernels_bbit + fi + else + ./ascendc_kernels_bbit + fi +) +md5sum output/*.bin +python3 scripts/verify_result.py output/output_z.bin output/golden.bin diff --git a/operator/AddCustomSample/KernelLaunch/AddKernelInvocationAcl/scripts/gen_data.py b/operator/AddCustomSample/KernelLaunch/AddKernelInvocationAcl/scripts/gen_data.py new file mode 100644 index 0000000000000000000000000000000000000000..ea8ce828aea146c9ab462290be403c4cfd483b75 --- /dev/null +++ b/operator/AddCustomSample/KernelLaunch/AddKernelInvocationAcl/scripts/gen_data.py @@ -0,0 +1,25 @@ +#!/usr/bin/python3 +# coding=utf-8 +# +# Copyright (C) 2023-2024. Huawei Technologies Co., Ltd. All rights reserved. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. +# =============================================================================== + +import numpy as np + + +def gen_golden_data_simple(): + input_x = np.random.uniform(1, 100, [8, 2048]).astype(np.float16) + input_y = np.random.uniform(1, 100, [8, 2048]).astype(np.float16) + golden = (input_x + input_y).astype(np.float16) + + input_x.tofile("./input/input_x.bin") + input_y.tofile("./input/input_y.bin") + golden.tofile("./output/golden.bin") + + +if __name__ == "__main__": + gen_golden_data_simple() diff --git a/operator/AddCustomSample/KernelLaunch/AddKernelInvocationAcl/scripts/verify_result.py b/operator/AddCustomSample/KernelLaunch/AddKernelInvocationAcl/scripts/verify_result.py new file mode 100644 index 0000000000000000000000000000000000000000..1a21d809ab206a65bc952ca4cb06c345edcd3e7a --- /dev/null +++ b/operator/AddCustomSample/KernelLaunch/AddKernelInvocationAcl/scripts/verify_result.py @@ -0,0 +1,53 @@ +#!/usr/bin/python3 +# coding=utf-8 +# +# Copyright (C) 2023-2024. Huawei Technologies Co., Ltd. All rights reserved. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
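+#
+# Note: this script compares the kernel output against the golden data with
+# np.isclose (rtol=1e-3, atol=1e-5, suited to float16), prints mismatching
+# elements (stopping after roughly the first 100), and reports a pass when the
+# mismatch ratio does not exceed error_tol (1e-3).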
+# =============================================================================== + +import sys +import numpy as np + +# for float16 +relative_tol = 1e-3 +absolute_tol = 1e-5 +error_tol = 1e-3 + + +def verify_result(output, golden): + output = np.fromfile(output, dtype=np.float16).reshape(-1) + golden = np.fromfile(golden, dtype=np.float16).reshape(-1) + different_element_results = np.isclose(output, + golden, + rtol=relative_tol, + atol=absolute_tol, + equal_nan=True) + different_element_indexes = np.where(different_element_results == False)[0] + for index in range(len(different_element_indexes)): + real_index = different_element_indexes[index] + golden_data = golden[real_index] + output_data = output[real_index] + print( + "data index: %06d, expected: %-.9f, actual: %-.9f, rdiff: %-.6f" % + (real_index, golden_data, output_data, + abs(output_data - golden_data) / golden_data)) + if index == 100: + break + error_ratio = float(different_element_indexes.size) / golden.size + print("error ratio: %.4f, tolrence: %.4f" % (error_ratio, error_tol)) + return error_ratio <= error_tol + + +if __name__ == '__main__': + try: + res = verify_result(sys.argv[1], sys.argv[2]) + if not res: + raise ValueError("[ERROR] result error") + else: + print("test pass") + except Exception as e: + print(e) + sys.exit(1) diff --git a/operator/AddCustomSample/KernelLaunch/AddKernelInvocationNeo/README.md b/operator/AddCustomSample/KernelLaunch/AddKernelInvocationNeo/README.md index 4ff4d4f36d389a54cb7990520fad0d551dbd9719..f921e7673f6fb13973ed19f056ef9ac1aecbe9e8 100644 --- a/operator/AddCustomSample/KernelLaunch/AddKernelInvocationNeo/README.md +++ b/operator/AddCustomSample/KernelLaunch/AddKernelInvocationNeo/README.md @@ -60,7 +60,7 @@ ```bash bash run.sh -r [RUN_MODE] -v [SOC_VERSION] ``` - - RUN_MODE:编译方式,可选择CPU调试,NPU仿真,NPU上板。支持参数为[cpu /sim / npu] + - RUN_MODE:编译方式,可选择CPU调试,NPU仿真,NPU上板。支持参数为[cpu / sim / npu] - SOC_VERSION:昇腾AI处理器型号,如果无法确定具体的[SOC_VERSION],则在安装昇腾AI处理器的服务器执行npu-smi info命令进行查询,在查询到的“Name”前增加Ascend信息,例如“Name”对应取值为xxxyy,实际配置的[SOC_VERSION]值为Ascendxxxyy。支持以下参数取值(xxx请替换为具体取值): - Atlas 推理系列产品(Ascend 310P处理器)参数值:Ascend310P1、Ascend310P3 - Atlas 训练系列产品参数值:AscendxxxA、AscendxxxB diff --git a/operator/MatmulCustomSample/KernelLaunch/MatmulInvocationAcl/CMakeLists.txt b/operator/MatmulCustomSample/KernelLaunch/MatmulInvocationAcl/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..ea95344bcad546c4b40cac4ebee6c0a322644ff1 --- /dev/null +++ b/operator/MatmulCustomSample/KernelLaunch/MatmulInvocationAcl/CMakeLists.txt @@ -0,0 +1,67 @@ +cmake_minimum_required(VERSION 3.16) +project(Ascend_c) + +set(RUN_MODE "npu" CACHE STRING "cpu/sim/npu") +set(SOC_VERSION "Ascend310P3" CACHE STRING "system on chip type") +set(ASCEND_CANN_PACKAGE_PATH "/usr/local/Ascend/ascend-toolkit/latest" + CACHE STRING "ASCEND CANN package installation directory" +) +if(NOT CMAKE_BUILD_TYPE) + set(CMAKE_BUILD_TYPE "Debug" CACHE STRING "Build type Release/Debug (default Debug)" FORCE) +endif() +if(CMAKE_INSTALL_PREFIX STREQUAL /usr/local) + set(CMAKE_INSTALL_PREFIX "${CMAKE_CURRENT_LIST_DIR}/out" CACHE STRING "path for install()" FORCE) +endif() +file(GLOB KERNEL_FILES ${CMAKE_CURRENT_SOURCE_DIR}/matmul_custom.cpp) +set(CUSTOM_ASCEND310P_LIST "Ascend310P1" "Ascend310P3") + +if("${RUN_MODE}" STREQUAL "cpu") + include(cmake/cpu_lib.cmake) +elseif("${RUN_MODE}" STREQUAL "sim" OR "${RUN_MODE}" STREQUAL "npu") + include(cmake/npu_lib.cmake) +else() + message("invalid RUN_MODE: ${RUN_MODE}") +endif() + 
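+
+# The host executable is built from main.cpp plus the host-side tiling generator
+# (matmul_custom_tiling.cpp); the kernel itself is compiled separately into the
+# ascendc_kernels target by the cpu_lib/npu_lib cmake files included above.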
+add_executable(ascendc_kernels_bbit + ${CMAKE_CURRENT_SOURCE_DIR}/main.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/matmul_custom_tiling.cpp +) + +target_compile_options(ascendc_kernels_bbit PRIVATE + $:-g>> + -O2 -std=c++17 -D_GLIBCXX_USE_CXX11_ABI=0 -Wall -Werror +) + +target_compile_definitions(ascendc_kernels_bbit PRIVATE + $<$>:CUSTOM_ASCEND310P> + SOC_VERSION="${SOC_VERSION}" +) + +target_include_directories(ascendc_kernels_bbit PRIVATE + ${ASCEND_CANN_PACKAGE_PATH}/include/ + ${ASCEND_CANN_PACKAGE_PATH}/include/ascendc/ + ${ASCEND_CANN_PACKAGE_PATH}/include/ascendc/highlevel_api +) + +target_link_directories(ascendc_kernels_bbit PRIVATE + ${ASCEND_CANN_PACKAGE_PATH}/lib64 +) + +target_link_libraries(ascendc_kernels_bbit PRIVATE + ascendcl + $:ascendc_kernels>> + runtime + tiling_api + register + platform + ascendalog + c_sec + dl +) + +install(TARGETS ascendc_kernels_bbit + LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} + ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} + RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} +) diff --git a/operator/MatmulCustomSample/KernelLaunch/MatmulInvocationAcl/README.md b/operator/MatmulCustomSample/KernelLaunch/MatmulInvocationAcl/README.md new file mode 100644 index 0000000000000000000000000000000000000000..d5c4ebc101226f1925e3ab9de8d9caf79b8a1a5e --- /dev/null +++ b/operator/MatmulCustomSample/KernelLaunch/MatmulInvocationAcl/README.md @@ -0,0 +1,72 @@ +## 目录结构介绍 +``` +├── MatmulInvocationNeo +│ ├── cmake // 编译工程文件 +│ ├── scripts +│ │ ├── gen_data.py // 输入数据和真值数据生成脚本文件 +│ │ └── verify_result.py // 验证输出数据和真值数据是否一致的验证脚本 +│ ├── CMakeLists.txt // 编译工程文件 +│ ├── data_utils.h // 数据读入写出函数 +│ ├── main.cpp // 主函数,调用算子的应用程序,含CPU域及NPU域调用 +│ ├── matmul_custom.cpp // 算子kernel实现 +│ ├── matmul_custom_tiling.cpp // 算子tiling实现 +│ └── run.sh // 编译运行算子的脚本 +``` +## 代码实现介绍 +本调用样例中实现的是[M, K, N]固定为[512, 512, 1024]的Matmul算子。 +- kernel实现 + Matmul算子的数学表达式为: + $$ + C = A * B + $$ + 其中A的形状为[512, 512], B的形状为[512, 1024], C的形状为[512, 1024]。具体请参考[matmul_custom.cpp](./matmul_custom.cpp)。 + +- 调用实现 + 1. CPU侧运行验证主要通过ICPU_RUN_KF CPU调测宏等CPU调测库提供的接口来完成; + 2. 
NPU侧运行验证主要通过使用ACLRT_LAUNCH_KERNEL内核调用宏来完成。 + + 应用程序通过ASCENDC_CPU_DEBUG 宏区分代码逻辑运行于CPU侧还是NPU侧。 + +## 运行样例算子 + - 打开样例目录 + 以命令行方式下载样例代码,master分支为例。 + ```bash + cd ${git_clone_path}/samples/operator/MatmulCustomSample/KernelLaunch/MatmulInvocationNeo + ``` + - 配置环境变量 + + 请根据当前环境上CANN开发套件包的[安装方式](https://hiascend.com/document/redirect/CannCommunityInstSoftware),选择对应配置环境变量的命令。 + - 默认路径,root用户安装CANN软件包 + ```bash + export ASCEND_INSTALL_PATH=/usr/local/Ascend/ascend-toolkit/latest + ``` + - 默认路径,非root用户安装CANN软件包 + ```bash + export ASCEND_INSTALL_PATH=$HOME/Ascend/ascend-toolkit/latest + ``` + - 指定路径install_path,安装CANN软件包 + ```bash + export ASCEND_INSTALL_PATH=${install_path}/ascend-toolkit/latest + ``` + + - 样例执行 + + ```bash + bash run.sh -r [RUN_MODE] -v [SOC_VERSION] + ``` + - RUN_MODE:编译方式,可选择CPU调试,NPU仿真,NPU上板。支持参数为[cpu / sim / npu],默认值为npu。 + - SOC_VERSION:昇腾AI处理器型号,如果无法确定具体的[SOC_VERSION],则在安装昇腾AI处理器的服务器执行npu-smi info命令进行查询,在查询到的“Name”前增加Ascend信息,例如"Name"对应取值为xxxyy,实际配置的[SOC_VERSION]值为Ascendxxxyy。支持以下参数取值(xxx请替换为具体取值): + - Atlas 推理系列产品(Ascend 310P处理器)参数值:Ascend310P1、Ascend310P3 + - Atlas A2训练系列产品/Atlas 800I A2推理产品参数值:AscendxxxB1、AscendxxxB2、AscendxxxB3、AscendxxxB4 + + 示例如下。 + + ```bash + bash run.sh -r cpu -v Ascend310P1 + ``` + +## 更新说明 +| 时间 | 更新事项 | +| ---------- | ------------ | +| 2023/05/21 | 更新本readme | +| 2024/07/31 | 修改本readme中的形状说明错误 | \ No newline at end of file diff --git a/operator/MatmulCustomSample/KernelLaunch/MatmulInvocationAcl/cmake/cpu_lib.cmake b/operator/MatmulCustomSample/KernelLaunch/MatmulInvocationAcl/cmake/cpu_lib.cmake new file mode 100644 index 0000000000000000000000000000000000000000..76c8adb205173638149cdaaf68bc2d11ea9c6b21 --- /dev/null +++ b/operator/MatmulCustomSample/KernelLaunch/MatmulInvocationAcl/cmake/cpu_lib.cmake @@ -0,0 +1,12 @@ +if(NOT DEFINED ENV{CMAKE_PREFIX_PATH}) + set(CMAKE_PREFIX_PATH ${ASCEND_CANN_PACKAGE_PATH}/tools/tikicpulib/lib/cmake) +endif() +find_package(tikicpulib REQUIRED) + +add_library(ascendc_kernels SHARED ${KERNEL_FILES}) +target_link_libraries(ascendc_kernels PUBLIC tikicpulib::${SOC_VERSION}) +target_compile_definitions(ascendc_kernels PRIVATE + $<$>:CUSTOM_ASCEND310P> +) +target_compile_options(ascendc_kernels PRIVATE -g -O0 -std=c++17) +install(TARGETS ascendc_kernels DESTINATION ${CMAKE_INSTALL_LIBDIR}) diff --git a/operator/MatmulCustomSample/KernelLaunch/MatmulInvocationAcl/cmake/npu_lib.cmake b/operator/MatmulCustomSample/KernelLaunch/MatmulInvocationAcl/cmake/npu_lib.cmake new file mode 100644 index 0000000000000000000000000000000000000000..d4daabc5803c97726343560fe940cfd694642b46 --- /dev/null +++ b/operator/MatmulCustomSample/KernelLaunch/MatmulInvocationAcl/cmake/npu_lib.cmake @@ -0,0 +1,16 @@ +if(EXISTS ${ASCEND_CANN_PACKAGE_PATH}/compiler/tikcpp/ascendc_kernel_cmake) + set(ASCENDC_CMAKE_DIR ${ASCEND_CANN_PACKAGE_PATH}/compiler/tikcpp/ascendc_kernel_cmake) +elseif(EXISTS ${ASCEND_CANN_PACKAGE_PATH}/tools/tikcpp/ascendc_kernel_cmake) + set(ASCENDC_CMAKE_DIR ${ASCEND_CANN_PACKAGE_PATH}/tools/tikcpp/ascendc_kernel_cmake) +else() + message(FATAL_ERROR "ascendc_kernel_cmake does not exist ,please check whether the cann package is installed") +endif() +include(${ASCENDC_CMAKE_DIR}/ascendc.cmake) + +ascendc_fatbin_library(ascendc_kernels ${KERNEL_FILES}) + +ascendc_compile_definitions(ascendc_kernels PRIVATE + $<$>:CUSTOM_ASCEND310P> + -DHAVE_WORKSPACE + -DHAVE_TILING +) diff --git a/operator/MatmulCustomSample/KernelLaunch/MatmulInvocationAcl/data_utils.h 
b/operator/MatmulCustomSample/KernelLaunch/MatmulInvocationAcl/data_utils.h new file mode 100644 index 0000000000000000000000000000000000000000..1d43459ef7c5858cf97a9ba43769e50b8dd28853 --- /dev/null +++ b/operator/MatmulCustomSample/KernelLaunch/MatmulInvocationAcl/data_utils.h @@ -0,0 +1,240 @@ +/** + * @file data_utils.h + * + * Copyright (C) 2023-2024. Huawei Technologies Co., Ltd. All rights reserved. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + */ +#ifndef DATA_UTILS_H +#define DATA_UTILS_H +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "acl/acl.h" + +typedef enum { + DT_UNDEFINED = -1, + FLOAT = 0, + HALF = 1, + INT8_T = 2, + INT32_T = 3, + UINT8_T = 4, + INT16_T = 6, + UINT16_T = 7, + UINT32_T = 8, + INT64_T = 9, + UINT64_T = 10, + DOUBLE = 11, + BOOL = 12, + STRING = 13, + COMPLEX64 = 16, + COMPLEX128 = 17, + BF16 = 27 +} printDataType; + +#define INFO_LOG(fmt, args...) fprintf(stdout, "[INFO] " fmt "\n", ##args) +#define WARN_LOG(fmt, args...) fprintf(stdout, "[WARN] " fmt "\n", ##args) +#define ERROR_LOG(fmt, args...) fprintf(stdout, "[ERROR] " fmt "\n", ##args) +#define CHECK_ACL(x) \ + do { \ + aclError __ret = x; \ + if (__ret != ACL_ERROR_NONE) { \ + std::cerr << __FILE__ << ":" << __LINE__ << " aclError:" << __ret << std::endl; \ + } \ + } while (0); + +/** + * @brief Read data from file + * @param [in] filePath: file path + * @param [out] fileSize: file size + * @return read result + */ +bool ReadFile(const std::string &filePath, size_t &fileSize, void *buffer, size_t bufferSize) +{ + struct stat sBuf; + int fileStatus = stat(filePath.data(), &sBuf); + if (fileStatus == -1) { + ERROR_LOG("failed to get file"); + return false; + } + if (S_ISREG(sBuf.st_mode) == 0) { + ERROR_LOG("%s is not a file, please enter a file", filePath.c_str()); + return false; + } + + std::ifstream file; + file.open(filePath, std::ios::binary); + if (!file.is_open()) { + ERROR_LOG("Open file failed. path = %s", filePath.c_str()); + return false; + } + + std::filebuf *buf = file.rdbuf(); + size_t size = buf->pubseekoff(0, std::ios::end, std::ios::in); + if (size == 0) { + ERROR_LOG("file size is 0"); + file.close(); + return false; + } + if (size > bufferSize) { + ERROR_LOG("file size is larger than buffer size"); + file.close(); + return false; + } + buf->pubseekpos(0, std::ios::in); + buf->sgetn(static_cast(buffer), size); + fileSize = size; + file.close(); + return true; +} + +/** + * @brief Write data to file + * @param [in] filePath: file path + * @param [in] buffer: data to write to file + * @param [in] size: size to write + * @return write result + */ +bool WriteFile(const std::string &filePath, const void *buffer, size_t size) +{ + if (buffer == nullptr) { + ERROR_LOG("Write file failed. buffer is nullptr"); + return false; + } + + int fd = open(filePath.c_str(), O_RDWR | O_CREAT | O_TRUNC, S_IRUSR | S_IWRITE); + if (fd < 0) { + ERROR_LOG("Open file failed. path = %s", filePath.c_str()); + return false; + } + + size_t writeSize = write(fd, buffer, size); + (void)close(fd); + if (writeSize != size) { + ERROR_LOG("Write file Failed."); + return false; + } + + return true; +} + +/** + * @brief Reads a binary file into memory. 
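+ * (In this sample it is used by main.cpp to load the compiled kernel fatbin
+ * before calling aclrtCreateBinary.)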
+ * + * This function opens a binary file, reads its contents into a dynamically allocated memory buffer, + * and returns a pointer to the buffer and the size of the file through output parameters. + * + * @param filePath The path to the binary file to be read. + * @param outBuffer A reference to a unique pointer that will hold the file data. + * @param outSize A reference to a size_t that will hold the size of the file. + * @return true if the file was read successfully, false otherwise. + */ +bool ReadBinaryFile(const char *filePath, std::unique_ptr &outBuffer, size_t &outSize) +{ + FILE *file = fopen(filePath, "rb"); + if (!file) { + ERROR_LOG("Error opening file: %s\n", strerror(errno)); + return false; + } + + fseek(file, 0, SEEK_END); + outSize = ftell(file); + rewind(file); + + outBuffer.reset(new char[outSize]); + if (fread(outBuffer.get(), 1, outSize, file) != outSize) { + ERROR_LOG("Error reading file.\n"); + fclose(file); + return false; + } + + fclose(file); + return true; +} + +template void DoPrintData(const T *data, size_t count, size_t elementsPerRow) +{ + assert(elementsPerRow != 0); + for (size_t i = 0; i < count; ++i) { + std::cout << std::setw(10) << data[i]; + if (i % elementsPerRow == elementsPerRow - 1) { + std::cout << std::endl; + } + } +} + +void DoPrintHalfData(const aclFloat16 *data, size_t count, size_t elementsPerRow) +{ + assert(elementsPerRow != 0); + for (size_t i = 0; i < count; ++i) { + std::cout << std::setw(10) << std::setprecision(6) << aclFloat16ToFloat(data[i]); + if (i % elementsPerRow == elementsPerRow - 1) { + std::cout << std::endl; + } + } +} + +void PrintData(const void *data, size_t count, printDataType dataType, size_t elementsPerRow = 16) +{ + if (data == nullptr) { + ERROR_LOG("Print data failed. data is nullptr"); + return; + } + + switch (dataType) { + case BOOL: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case INT8_T: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case UINT8_T: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case INT16_T: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case UINT16_T: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case INT32_T: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case UINT32_T: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case INT64_T: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case UINT64_T: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case HALF: + DoPrintHalfData(reinterpret_cast(data), count, elementsPerRow); + break; + case FLOAT: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case DOUBLE: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + default: + ERROR_LOG("Unsupported type: %d", dataType); + } + std::cout << std::endl; +} +#endif // DATA_UTILS_H diff --git a/operator/MatmulCustomSample/KernelLaunch/MatmulInvocationAcl/main.cpp b/operator/MatmulCustomSample/KernelLaunch/MatmulInvocationAcl/main.cpp new file mode 100644 index 0000000000000000000000000000000000000000..66866447636e1061dd419c1b743bbe3dce15fd38 --- /dev/null +++ b/operator/MatmulCustomSample/KernelLaunch/MatmulInvocationAcl/main.cpp @@ -0,0 +1,137 @@ +/** + * @file main.cpp + * + * Copyright (C) 2023-2024. Huawei Technologies Co., Ltd. All rights reserved. 
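+ *
+ * Note: this host program generates the TCubeTiling on the host side
+ * (GenerateTiling, see matmul_custom_tiling.cpp), copies it to device memory,
+ * queries the FFTS base address with rtGetC2cCtrlAddr, packs the kernel
+ * arguments (ffts address, A/B/C, workspace, tiling) into a device-side buffer,
+ * and launches "matmul_custom" from the loaded fatbin via aclrtLaunchKernel.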
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + */ +#include "data_utils.h" +#include "kernel_tiling/kernel_tiling.h" +#include "tiling/platform/platform_ascendc.h" +#include "acl/acl.h" + +typedef int32_t rtError_t; +extern uint8_t *GenerateTiling(const char *socVersion); +extern "C" rtError_t rtGetC2cCtrlAddr(uint64_t *addr, uint32_t *len); // from libruntime.so + +int32_t main(int32_t argc, char *argv[]) +{ + const char *socVersion = SOC_VERSION; + auto ascendcPlatform = platform_ascendc::PlatformAscendCManager::GetInstance(socVersion); + size_t aFileSize = 512 * 512 * sizeof(uint16_t); // uint16_t represent half + size_t bFileSize = 512 * 1024 * sizeof(uint16_t); // uint16_t represent half + size_t cFileSize = 512 * 1024 * sizeof(float); + size_t tilingFileSize = sizeof(TCubeTiling); + size_t userWorkspaceSize = 0; + size_t systemWorkspaceSize = static_cast(ascendcPlatform->GetLibApiWorkSpaceSize()); + size_t workspaceSize = userWorkspaceSize + systemWorkspaceSize; +#ifdef CUSTOM_ASCEND310P + uint32_t blockDim = 2; +#else + uint32_t blockDim = 1; +#endif + + CHECK_ACL(aclInit(nullptr)); + int32_t deviceId = 0; + CHECK_ACL(aclrtSetDevice(deviceId)); + aclrtStream stream = nullptr; + CHECK_ACL(aclrtCreateStream(&stream)); + + uint8_t *aHost; + uint8_t *aDevice; + CHECK_ACL(aclrtMallocHost((void **)(&aHost), aFileSize)); + CHECK_ACL(aclrtMalloc((void **)&aDevice, aFileSize, ACL_MEM_MALLOC_HUGE_FIRST)); + ReadFile("./input/x1_gm.bin", aFileSize, aHost, aFileSize); + CHECK_ACL(aclrtMemcpy(aDevice, aFileSize, aHost, aFileSize, ACL_MEMCPY_HOST_TO_DEVICE)); + + uint8_t *bHost; + uint8_t *bDevice; + CHECK_ACL(aclrtMallocHost((void **)(&bHost), bFileSize)); + CHECK_ACL(aclrtMalloc((void **)&bDevice, bFileSize, ACL_MEM_MALLOC_HUGE_FIRST)); + ReadFile("./input/x2_gm.bin", bFileSize, bHost, bFileSize); + CHECK_ACL(aclrtMemcpy(bDevice, bFileSize, bHost, bFileSize, ACL_MEMCPY_HOST_TO_DEVICE)); + + uint8_t *workspaceDevice; + CHECK_ACL(aclrtMalloc((void **)&workspaceDevice, workspaceSize, ACL_MEM_MALLOC_HUGE_FIRST)); + + uint8_t *tilingHost; + uint8_t *tilingDevice; + CHECK_ACL(aclrtMallocHost((void **)(&tilingHost), tilingFileSize)); + CHECK_ACL(aclrtMalloc((void **)&tilingDevice, tilingFileSize, ACL_MEM_MALLOC_HUGE_FIRST)); + CHECK_ACL( + aclrtMemcpy(tilingHost, tilingFileSize, GenerateTiling(socVersion), tilingFileSize, ACL_MEMCPY_HOST_TO_HOST)); + CHECK_ACL(aclrtMemcpy(tilingDevice, tilingFileSize, tilingHost, tilingFileSize, ACL_MEMCPY_HOST_TO_DEVICE)); + + uint8_t *cHost; + uint8_t *cDevice; + CHECK_ACL(aclrtMallocHost((void **)(&cHost), cFileSize)); + CHECK_ACL(aclrtMalloc((void **)&cDevice, cFileSize, ACL_MEM_MALLOC_HUGE_FIRST)); + + uint64_t fftsAddr; + uint32_t fftsAddrLen; + CHECK_ACL(rtGetC2cCtrlAddr(&fftsAddr, &fftsAddrLen)); + + struct { + uintptr_t fftsAddr; + uintptr_t aDevice; + uintptr_t bDevice; + uintptr_t cDevice; + uintptr_t workspaceDevice; + uintptr_t tilingDevice; + } args; + + auto *argsDeviceAddrInHost = &args; + CHECK_ACL(aclrtMallocHost((void **)&argsDeviceAddrInHost, sizeof(args))); + argsDeviceAddrInHost->fftsAddr = (uintptr_t)fftsAddr; + argsDeviceAddrInHost->aDevice = (uintptr_t)aDevice; + argsDeviceAddrInHost->bDevice = (uintptr_t)bDevice; + argsDeviceAddrInHost->cDevice = (uintptr_t)cDevice; + argsDeviceAddrInHost->workspaceDevice = (uintptr_t)workspaceDevice; + argsDeviceAddrInHost->tilingDevice = 
(uintptr_t)tilingDevice; + + void *argsDeviceAddrInDevice = nullptr; + CHECK_ACL(aclrtMalloc((void **)&argsDeviceAddrInDevice, sizeof(args), ACL_MEM_MALLOC_HUGE_FIRST)); + CHECK_ACL(aclrtMemcpy(argsDeviceAddrInDevice, sizeof(args), argsDeviceAddrInHost, sizeof(args), ACL_MEMCPY_HOST_TO_DEVICE)); + CHECK_ACL(aclrtFreeHost(argsDeviceAddrInHost)); + + const char *filePath = "./out/fatbin/ascendc_kernels/ascendc_kernels.o"; + std::unique_ptr fileMemory; + size_t fileSize = 0; + ReadBinaryFile(filePath, fileMemory, fileSize); + + aclrtBinary binary = aclrtCreateBinary(fileMemory.get(), fileSize); + if (binary == nullptr) { + printf("aclrtCreateBinary failed"); + return -1; + } + aclrtBinHandle binHandle = nullptr; + CHECK_ACL(aclrtBinaryLoad(binary, &binHandle)); + CHECK_ACL(aclrtDestroyBinary(binary)); + + aclrtFuncHandle funAdd = nullptr; + CHECK_ACL(aclrtBinaryGetFunction(binHandle, "matmul_custom", &funAdd)); + + CHECK_ACL(aclrtLaunchKernel(funAdd, blockDim, argsDeviceAddrInDevice, sizeof(args), stream)); + CHECK_ACL(aclrtSynchronizeStream(stream)); + + + CHECK_ACL(aclrtMemcpy(cHost, cFileSize, cDevice, cFileSize, ACL_MEMCPY_DEVICE_TO_HOST)); + WriteFile("./output/output.bin", cHost, cFileSize); + + CHECK_ACL(aclrtFree(aDevice)); + CHECK_ACL(aclrtFreeHost(aHost)); + CHECK_ACL(aclrtFree(bDevice)); + CHECK_ACL(aclrtFreeHost(bHost)); + CHECK_ACL(aclrtFree(workspaceDevice)); + CHECK_ACL(aclrtFree(tilingDevice)); + CHECK_ACL(aclrtFreeHost(tilingHost)); + CHECK_ACL(aclrtFree(cDevice)); + CHECK_ACL(aclrtFreeHost(cHost)); + + CHECK_ACL(aclrtDestroyStream(stream)); + CHECK_ACL(aclrtResetDevice(deviceId)); + CHECK_ACL(aclFinalize()); + return 0; +} \ No newline at end of file diff --git a/operator/MatmulCustomSample/KernelLaunch/MatmulInvocationAcl/matmul_custom.cpp b/operator/MatmulCustomSample/KernelLaunch/MatmulInvocationAcl/matmul_custom.cpp new file mode 100644 index 0000000000000000000000000000000000000000..d6c1e3aed1ab99bc98cb33ee08a1a05e5e82f009 --- /dev/null +++ b/operator/MatmulCustomSample/KernelLaunch/MatmulInvocationAcl/matmul_custom.cpp @@ -0,0 +1,103 @@ +/** + * @file matmul_custom.cpp + * + * Copyright (C) 2023-2024. Huawei Technologies Co., Ltd. All rights reserved. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
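+ *
+ * Note: the kernel copies the TCubeTiling structure from global memory,
+ * derives each core's A/B/C offsets and tail sizes in CalcGMOffset, and then
+ * drives the high-level Matmul API (REGIST_MATMUL_OBJ, SetTensorA/SetTensorB,
+ * SetTail, IterateAll) to compute C = A * B with half inputs and float output.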
+ */ +#include "kernel_operator.h" +#include "lib/matmul_intf.h" + +using namespace matmul; + +__aicore__ inline uint32_t Ceiling(uint32_t a, uint32_t b) +{ + return (a + b - 1) / b; +} + +__aicore__ inline void CopyTiling(TCubeTiling *tiling, GM_ADDR tilingGM) +{ + uint32_t *ptr = reinterpret_cast(tiling); + auto tiling32 = reinterpret_cast<__gm__ uint32_t *>(tilingGM); + + for (uint32_t i = 0; i < sizeof(TCubeTiling) / sizeof(uint32_t); i++, ptr++) { + *ptr = *(tiling32 + i); + } + return; +} + +__aicore__ inline void CalcGMOffset(int blockIdx, const TCubeTiling &tiling, int &offsetA, int &offsetB, int &offsetC, + int &tailM, int &tailN, bool isTransA, bool isTransB) +{ + uint32_t mSingleBlocks = Ceiling(tiling.M, tiling.singleCoreM); + uint32_t mCoreIndx = blockIdx % mSingleBlocks; + uint32_t nCoreIndx = blockIdx / mSingleBlocks; + + offsetA = mCoreIndx * tiling.Ka * tiling.singleCoreM; + if (isTransA) { + offsetA = mCoreIndx * tiling.singleCoreM; + } + offsetB = nCoreIndx * tiling.singleCoreN; + if (isTransB) { + offsetB = nCoreIndx * tiling.Kb * tiling.singleCoreN; + } + offsetC = mCoreIndx * tiling.N * tiling.singleCoreM + nCoreIndx * tiling.singleCoreN; + + tailM = tiling.M - mCoreIndx * tiling.singleCoreM; + tailM = tailM < tiling.singleCoreM ? tailM : tiling.singleCoreM; + + tailN = tiling.N - nCoreIndx * tiling.singleCoreN; + tailN = tailN < tiling.singleCoreN ? tailN : tiling.singleCoreN; +} + +extern "C" __global__ __aicore__ void matmul_custom(GM_ADDR a, GM_ADDR b, GM_ADDR c, GM_ADDR workspace, + GM_ADDR tilingGm) +{ + using A_T = half; + using B_T = half; + using C_T = float; + + AscendC::TPipe pipe; + TCubeTiling tiling; + CopyTiling(&tiling, tilingGm); + + AscendC::GlobalTensor aGlobal; + AscendC::GlobalTensor bGlobal; + AscendC::GlobalTensor cGlobal; + aGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ A_T *>(a), tiling.M * tiling.Ka); + bGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ B_T *>(b), tiling.Ka * tiling.N); + cGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ C_T *>(c), tiling.M * tiling.N); + + int offsetA = 0; + int offsetB = 0; + int offsetC = 0; + bool isTransA = false; + bool isTransB = false; + + int tailM = 0; + int tailN = 0; + CalcGMOffset(GetBlockIdx(), tiling, offsetA, offsetB, offsetC, tailM, tailN, isTransA, isTransB); + + auto gmA = aGlobal[offsetA]; + auto gmB = bGlobal[offsetB]; + auto gmC = cGlobal[offsetC]; + + Matmul, + MatmulType, + MatmulType> mm; + REGIST_MATMUL_OBJ(&pipe, GetSysWorkSpacePtr(), mm, &tiling); +#ifdef CUSTOM_ASCEND310P + AscendC::TBuf<> tmpMMFormatUb; + AscendC::LocalTensor mmFormatUb; + pipe.InitBuffer(tmpMMFormatUb, AscendC::TOTAL_VEC_LOCAL_SIZE); + mmFormatUb = tmpMMFormatUb.Get(AscendC::TOTAL_VEC_LOCAL_SIZE); + mm.SetLocalWorkspace(mmFormatUb); +#endif + mm.SetTensorA(gmA, isTransA); + mm.SetTensorB(gmB, isTransB); + mm.SetTail(tailM, tailN); + mm.IterateAll(gmC); + mm.End(); +} diff --git a/operator/MatmulCustomSample/KernelLaunch/MatmulInvocationAcl/matmul_custom_tiling.cpp b/operator/MatmulCustomSample/KernelLaunch/MatmulInvocationAcl/matmul_custom_tiling.cpp new file mode 100644 index 0000000000000000000000000000000000000000..ce527d41f186aa72b3b4cc0a7cac9f8f5325d334 --- /dev/null +++ b/operator/MatmulCustomSample/KernelLaunch/MatmulInvocationAcl/matmul_custom_tiling.cpp @@ -0,0 +1,75 @@ +/** + * @file matmul_custom_tiling.cpp + * + * Copyright (C) 2023-2024. Huawei Technologies Co., Ltd. All rights reserved. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + */ +#include +#include +#include +#include +#include + +#include "tiling/tiling_api.h" +#include "tiling/platform/platform_ascendc.h" +using namespace matmul_tiling; +using namespace std; + +uint8_t *GetTilingBuf(optiling::TCubeTiling *tilingData) +{ + uint32_t tilingSize = tilingData->GetDataSize(); + uint8_t *buf = (uint8_t *)malloc(tilingSize); + tilingData->SaveToBuffer(buf, tilingSize); + return buf; +} + +uint8_t *GenerateTiling(const char *socVersion) +{ + int M = 512; + int N = 1024; + int K = 512; + + TPosition leftPosition = TPosition::GM; + CubeFormat leftFormat = CubeFormat::ND; + DataType leftDtype = DataType::DT_FLOAT16; + bool isTransA = false; + + TPosition rightPosition = TPosition::GM; + CubeFormat rightFormat = CubeFormat::ND; + DataType rightDtype = DataType::DT_FLOAT16; + bool isTransB = false; + + TPosition resultPosition = TPosition::GM; + CubeFormat resultFormat = CubeFormat::ND; + DataType resultDtype = DataType::DT_FLOAT; + + bool isBias = false; + + int usedCoreNum = 2; + int32_t baseM = 128; + int32_t baseN = 256; + + optiling::TCubeTiling tilingData; + auto ascendcPlatform = platform_ascendc::PlatformAscendCManager::GetInstance(socVersion); + MultiCoreMatmulTiling tilingApi(*ascendcPlatform); + + tilingApi.SetDim(usedCoreNum); + tilingApi.SetAType(leftPosition, leftFormat, leftDtype, isTransA); + tilingApi.SetBType(rightPosition, rightFormat, rightDtype, isTransB); + tilingApi.SetCType(resultPosition, resultFormat, resultDtype); + + tilingApi.SetOrgShape(M, N, K); + tilingApi.SetShape(M, N, K); + tilingApi.SetFixSplit(baseM, baseN, -1); + tilingApi.SetBias(isBias); + tilingApi.SetBufferSpace(-1, -1, -1); + + int64_t res = tilingApi.GetTiling(tilingData); + if (res == -1) { + std::cout << "gen tiling failed" << std::endl; + } + return GetTilingBuf(&tilingData); +} diff --git a/operator/MatmulCustomSample/KernelLaunch/MatmulInvocationAcl/run.sh b/operator/MatmulCustomSample/KernelLaunch/MatmulInvocationAcl/run.sh new file mode 100644 index 0000000000000000000000000000000000000000..af6e9101619a2c5a3d6cc154a7ebe8d18fe78018 --- /dev/null +++ b/operator/MatmulCustomSample/KernelLaunch/MatmulInvocationAcl/run.sh @@ -0,0 +1,126 @@ +#!/bin/bash +CURRENT_DIR=$( + cd $(dirname ${BASH_SOURCE:-$0}) + pwd +) +cd $CURRENT_DIR + +BUILD_TYPE="Debug" +INSTALL_PREFIX="${CURRENT_DIR}/out" + +SHORT=r:,v:,i:,b:,p:, +LONG=run-mode:,soc-version:,install-path:,build-type:,install-prefix:, +OPTS=$(getopt -a --options $SHORT --longoptions $LONG -- "$@") +eval set -- "$OPTS" + +while :; do + case "$1" in + -r | --run-mode) + RUN_MODE="$2" + shift 2 + ;; + -v | --soc-version) + SOC_VERSION="$2" + shift 2 + ;; + -i | --install-path) + ASCEND_INSTALL_PATH="$2" + shift 2 + ;; + -b | --build-type) + BUILD_TYPE="$2" + shift 2 + ;; + -p | --install-prefix) + INSTALL_PREFIX="$2" + shift 2 + ;; + --) + shift + break + ;; + *) + echo "[ERROR] Unexpected option: $1" + break + ;; + esac +done + +RUN_MODE_LIST="cpu sim npu" +if [[ " $RUN_MODE_LIST " != *" $RUN_MODE "* ]]; then + echo "ERROR: RUN_MODE error, This sample only support specify cpu, sim or npu!" 
+ exit -1 +fi + +VERSION_LIST="Ascend310P1 Ascend310P3 Ascend910B1 Ascend910B2 Ascend910B3 Ascend910B4" +if [[ " $VERSION_LIST " != *" $SOC_VERSION "* ]]; then + echo "ERROR: SOC_VERSION should be in [$VERSION_LIST]" + exit -1 +fi + +if [ -n "$ASCEND_INSTALL_PATH" ]; then + _ASCEND_INSTALL_PATH=$ASCEND_INSTALL_PATH +elif [ -n "$ASCEND_HOME_PATH" ]; then + _ASCEND_INSTALL_PATH=$ASCEND_HOME_PATH +else + if [ -d "$HOME/Ascend/ascend-toolkit/latest" ]; then + _ASCEND_INSTALL_PATH=$HOME/Ascend/ascend-toolkit/latest + else + _ASCEND_INSTALL_PATH=/usr/local/Ascend/ascend-toolkit/latest + fi +fi + +export ASCEND_TOOLKIT_HOME=${_ASCEND_INSTALL_PATH} +export ASCEND_HOME_PATH=${_ASCEND_INSTALL_PATH} +if [ "${RUN_MODE}" = "npu" ]; then + source ${_ASCEND_INSTALL_PATH}/bin/setenv.bash +elif [ "${RUN_MODE}" = "sim" ]; then + # in case of running op in simulator, use stub .so instead + export LD_LIBRARY_PATH=${_ASCEND_INSTALL_PATH}/runtime/lib64/stub:$LD_LIBRARY_PATH + source ${_ASCEND_INSTALL_PATH}/bin/setenv.bash + export LD_LIBRARY_PATH=${_ASCEND_INSTALL_PATH}/tools/simulator/${SOC_VERSION}/lib:$LD_LIBRARY_PATH + if [ ! $CAMODEL_LOG_PATH ]; then + export CAMODEL_LOG_PATH=$(pwd)/sim_log + fi + if [ -d "$CAMODEL_LOG_PATH" ]; then + rm -rf $CAMODEL_LOG_PATH + fi + mkdir -p $CAMODEL_LOG_PATH +elif [ "${RUN_MODE}" = "cpu" ]; then + source ${_ASCEND_INSTALL_PATH}/bin/setenv.bash + export LD_LIBRARY_PATH=${_ASCEND_INSTALL_PATH}/tools/tikicpulib/lib:${_ASCEND_INSTALL_PATH}/tools/tikicpulib/lib/${SOC_VERSION}:${_ASCEND_INSTALL_PATH}/tools/simulator/${SOC_VERSION}/lib:$LD_LIBRARY_PATH +fi + +set -e +rm -rf build out +mkdir -p build +cmake -B build \ + -DRUN_MODE=${RUN_MODE} \ + -DSOC_VERSION=${SOC_VERSION} \ + -DCMAKE_BUILD_TYPE=${BUILD_TYPE} \ + -DCMAKE_INSTALL_PREFIX=${INSTALL_PREFIX} \ + -DASCEND_CANN_PACKAGE_PATH=${_ASCEND_INSTALL_PATH} +cmake --build build -j +cmake --install build + +rm -f ascendc_kernels_bbit +cp ./out/bin/ascendc_kernels_bbit ./ +rm -rf input output +mkdir -p input output +python3 scripts/gen_data.py +( + export LD_LIBRARY_PATH=$(pwd)/out/lib:$(pwd)/out/lib64:${_ASCEND_INSTALL_PATH}/lib64:$LD_LIBRARY_PATH + if [[ "$RUN_WITH_TOOLCHAIN" -eq 1 ]]; then + if [ "${RUN_MODE}" = "npu" ]; then + msprof op --application=./ascendc_kernels_bbit + elif [ "${RUN_MODE}" = "sim" ]; then + msprof op simulator --application=./ascendc_kernels_bbit + elif [ "${RUN_MODE}" = "cpu" ]; then + ./ascendc_kernels_bbit + fi + else + ./ascendc_kernels_bbit + fi +) +md5sum output/*.bin +python3 scripts/verify_result.py output/output.bin output/golden.bin diff --git a/operator/MatmulCustomSample/KernelLaunch/MatmulInvocationAcl/scripts/gen_data.py b/operator/MatmulCustomSample/KernelLaunch/MatmulInvocationAcl/scripts/gen_data.py new file mode 100644 index 0000000000000000000000000000000000000000..1ac70da4697b0bf82158d512d05f22751f103f8c --- /dev/null +++ b/operator/MatmulCustomSample/KernelLaunch/MatmulInvocationAcl/scripts/gen_data.py @@ -0,0 +1,31 @@ +#!/usr/bin/python3 +# coding=utf-8 +# +# Copyright (C) 2023-2024. Huawei Technologies Co., Ltd. All rights reserved. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
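+#
+# Note: this script creates random integer-valued fp16 inputs x1_gm [512, 512]
+# and x2_gm [512, 1024], computes the golden result as an fp32 matmul, and
+# writes the .bin files under ./input and ./output for run.sh to consume.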
+# =============================================================================== + +import numpy as np +import os + + +def gen_golden_data(): + M = 512 + N = 1024 + K = 512 + + x1_gm = np.random.randint(1, 10, [M, K]).astype(np.float16) + x2_gm = np.random.randint(1, 10, [K, N]).astype(np.float16) + golden = np.matmul(x1_gm.astype(np.float32), x2_gm.astype(np.float32)).astype(np.float32) + os.system("mkdir -p input") + os.system("mkdir -p output") + x1_gm.tofile("./input/x1_gm.bin") + x2_gm.tofile("./input/x2_gm.bin") + golden.tofile("./output/golden.bin") + + +if __name__ == "__main__": + gen_golden_data() diff --git a/operator/MatmulCustomSample/KernelLaunch/MatmulInvocationAcl/scripts/verify_result.py b/operator/MatmulCustomSample/KernelLaunch/MatmulInvocationAcl/scripts/verify_result.py new file mode 100644 index 0000000000000000000000000000000000000000..a325cfcc61f46f7de04ef6c979f852adcf213ef3 --- /dev/null +++ b/operator/MatmulCustomSample/KernelLaunch/MatmulInvocationAcl/scripts/verify_result.py @@ -0,0 +1,53 @@ +#!/usr/bin/python3 +# coding=utf-8 +# +# Copyright (C) 2023-2024. Huawei Technologies Co., Ltd. All rights reserved. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. +# =============================================================================== + +import sys +import numpy as np + +# for float32 +relative_tol = 1e-6 +absolute_tol = 1e-9 +error_tol = 1e-4 + + +def verify_result(output, golden): + output = np.fromfile(output, dtype=np.float32).reshape(-1) + golden = np.fromfile(golden, dtype=np.float32).reshape(-1) + different_element_results = np.isclose(output, + golden, + rtol=relative_tol, + atol=absolute_tol, + equal_nan=True) + different_element_indexes = np.where(different_element_results == False)[0] + for index in range(len(different_element_indexes)): + real_index = different_element_indexes[index] + golden_data = golden[real_index] + output_data = output[real_index] + print( + "data index: %06d, expected: %-.9f, actual: %-.9f, rdiff: %-.6f" % + (real_index, golden_data, output_data, + abs(output_data - golden_data) / golden_data)) + if index == 100: + break + error_ratio = float(different_element_indexes.size) / golden.size + print("error ratio: %.4f, tolrence: %.4f" % (error_ratio, error_tol)) + return error_ratio <= error_tol + + +if __name__ == '__main__': + try: + res = verify_result(sys.argv[1], sys.argv[2]) + if not res: + raise ValueError("[ERROR] result error") + else: + print("test pass") + except Exception as e: + print(e) + sys.exit(1) diff --git a/operator/MatmulLeakyReluCustomSample/KernelLaunch/MatmulLeakyReluInvocation/CMakeLists.txt b/operator/MatmulLeakyReluCustomSample/KernelLaunch/MatmulLeakyReluInvocation/CMakeLists.txt index f2915dca344aa27c11796a4dfe35cd9458bce21d..3ec5e567492bb261db76bc44c3ddc183f3f91c5b 100644 --- a/operator/MatmulLeakyReluCustomSample/KernelLaunch/MatmulLeakyReluInvocation/CMakeLists.txt +++ b/operator/MatmulLeakyReluCustomSample/KernelLaunch/MatmulLeakyReluInvocation/CMakeLists.txt @@ -57,4 +57,4 @@ install(TARGETS ascendc_kernels_bbit LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} -) \ No newline at end of file +) diff --git a/operator/MatmulLeakyReluCustomSample/KernelLaunch/MatmulLeakyReluInvocationAcl/CMakeLists.txt 
b/operator/MatmulLeakyReluCustomSample/KernelLaunch/MatmulLeakyReluInvocationAcl/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..d4434230ab4769365f596833664419b45012061b --- /dev/null +++ b/operator/MatmulLeakyReluCustomSample/KernelLaunch/MatmulLeakyReluInvocationAcl/CMakeLists.txt @@ -0,0 +1,67 @@ +cmake_minimum_required(VERSION 3.16) +project(Ascend_c) + +set(RUN_MODE "npu" CACHE STRING "cpu/sim/npu") +set(SOC_VERSION "Ascend310P3" CACHE STRING "system on chip type") +set(ASCEND_CANN_PACKAGE_PATH "/usr/local/Ascend/ascend-toolkit/latest" + CACHE STRING "ASCEND CANN package installation directory" +) +if(NOT CMAKE_BUILD_TYPE) + set(CMAKE_BUILD_TYPE "Debug" CACHE STRING "Build type Release/Debug (default Debug)" FORCE) +endif() +if(CMAKE_INSTALL_PREFIX STREQUAL /usr/local) + set(CMAKE_INSTALL_PREFIX "${CMAKE_CURRENT_LIST_DIR}/out" CACHE STRING "path for install()" FORCE) +endif() +file(GLOB KERNEL_FILES ${CMAKE_CURRENT_SOURCE_DIR}/matmul_leakyrelu_custom.cpp) +set(CUSTOM_ASCEND310P_LIST "Ascend310P1" "Ascend310P3") + +if("${RUN_MODE}" STREQUAL "cpu") + include(cmake/cpu_lib.cmake) +elseif("${RUN_MODE}" STREQUAL "sim" OR "${RUN_MODE}" STREQUAL "npu") + include(cmake/npu_lib.cmake) +else() + message("invalid RUN_MODE: ${RUN_MODE}") +endif() + +add_executable(ascendc_kernels_bbit + ${CMAKE_CURRENT_SOURCE_DIR}/main.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/matmul_leakyrelu_custom_tiling.cpp +) + +target_compile_options(ascendc_kernels_bbit PRIVATE + $:-g>> + -O2 -std=c++17 -D_GLIBCXX_USE_CXX11_ABI=0 -Wall -Werror +) + +target_compile_definitions(ascendc_kernels_bbit PRIVATE + $<$>:CUSTOM_ASCEND310P> + SOC_VERSION="${SOC_VERSION}" +) + +target_include_directories(ascendc_kernels_bbit PRIVATE + ${ASCEND_CANN_PACKAGE_PATH}/include/ + ${ASCEND_CANN_PACKAGE_PATH}/include/ascendc/ + ${ASCEND_CANN_PACKAGE_PATH}/include/ascendc/highlevel_api +) + +target_link_directories(ascendc_kernels_bbit PRIVATE + ${ASCEND_CANN_PACKAGE_PATH}/lib64 +) + +target_link_libraries(ascendc_kernels_bbit PRIVATE + ascendcl + $:ascendc_kernels>> + runtime + tiling_api + register + platform + ascendalog + c_sec + dl +) + +install(TARGETS ascendc_kernels_bbit + LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} + ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} + RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} +) diff --git a/operator/MatmulLeakyReluCustomSample/KernelLaunch/MatmulLeakyReluInvocationAcl/README.md b/operator/MatmulLeakyReluCustomSample/KernelLaunch/MatmulLeakyReluInvocationAcl/README.md new file mode 100644 index 0000000000000000000000000000000000000000..7e9b17abcda6e59a27fbb4faf4b005b17b225fbe --- /dev/null +++ b/operator/MatmulLeakyReluCustomSample/KernelLaunch/MatmulLeakyReluInvocationAcl/README.md @@ -0,0 +1,73 @@ +## 目录结构介绍 +``` +├── MatmulLeakyReluInvocation +│ ├── cmake // 编译工程文件 +│ ├── scripts +│ │ ├── verify_result.py // 真值对比文件 +│ │ └── gen_data.py // 输入数据和真值数据生成脚本文件 +│ ├── CMakeLists.txt // 编译工程文件 +│ ├── data_utils.h // 数据读入写出函数 +│ ├── main.cpp // 主函数,调用算子的应用程序,含CPU域及NPU域调用 +│ ├── matmul_leakyrelu_custom_tiling.cpp // 算子tiling实现 +│ ├── matmul_leakyrelu_custom.cpp // 算子kernel实现 +│ └── run.sh // 编译运行算子的脚本 +``` +## 代码实现介绍 +本调用样例中实现的是[m, n, k]固定为[1024, 640, 256]的MatmulLeakyRelu算子。 +- kernel实现 + MatmulLeakyRelu算子的数学表达式为: + ``` + C = A * B + Bias + C = C > 0 ? C : C * 0.001 + ``` + 其中A的形状为[1024, 256],B的形状为[256, 640],C的形状为[1024, 640],Bias的形状为[640]。具体请参考[matmul_leakyrelu_custom.cpp](./matmul_leakyrelu_custom.cpp)。 + +- 调用实现 + 1. CPU侧运行验证主要通过ICPU_RUN_KF CPU调测宏等CPU调测库提供的接口来完成; + 2. 
NPU侧运行验证主要通过使用ACLRT_LAUNCH_KERNEL内核调用宏来完成。 + + 应用程序通过ASCENDC_CPU_DEBUG 宏区分代码逻辑运行于CPU侧还是NPU侧。 + +## 运行样例算子 + - 打开样例目录 + 以命令行方式下载样例代码,master分支为例。 + ```bash + cd ${git_clone_path}/samples/operator/MatmulLeakyReluCustomSample/KernelLaunch/MatmulLeakyReluInvocation + ``` + - 配置环境变量 + + 请根据当前环境上CANN开发套件包的[安装方式](https://hiascend.com/document/redirect/CannCommunityInstSoftware),选择对应配置环境变量的命令。 + - 默认路径,root用户安装CANN软件包 + ```bash + export ASCEND_INSTALL_PATH=/usr/local/Ascend/ascend-toolkit/latest + ``` + - 默认路径,非root用户安装CANN软件包 + ```bash + export ASCEND_INSTALL_PATH=$HOME/Ascend/ascend-toolkit/latest + ``` + - 指定路径install_path,安装CANN软件包 + ```bash + export ASCEND_INSTALL_PATH=${install_path}/ascend-toolkit/latest + ``` + + - 样例执行 + + ```bash + bash run.sh -r [RUN_MODE] -v [SOC_VERSION] + ``` + - RUN_MODE:编译方式,可选择CPU调试,NPU仿真,NPU上板。支持参数为[cpu / sim / npu],默认值为npu。 + - SOC_VERSION:昇腾AI处理器型号,如果无法确定具体的[SOC_VERSION],则在安装昇腾AI处理器的服务器执行npu-smi info命令进行查询,在查询到的“Name”前增加Ascend信息,例如“Name”对应取值为xxxyy,实际配置的[SOC_VERSION]值为Ascendxxxyy。支持以下参数取值(xxx请替换为具体取值): + - Atlas 推理系列产品(Ascend 310P处理器)参数值:Ascend310P1、Ascend310P3 + - Atlas A2训练系列产品/Atlas 800I A2推理产品参数值:AscendxxxB1、AscendxxxB2、AscendxxxB3、AscendxxxB4 + + 示例如下。 + ```bash + bash run.sh -r cpu -v Ascend310P1 + ``` + +## 更新说明 +| 时间 | 更新事项 | 注意事项 | +| ---------- | ------------ | ------------------------------------------------ | +| 2023/05/21 | 更新本readme | | +| 2023/05/25 | 取消TCubeTiling大小硬编码 | 需要基于社区CANN包8.0.RC2.alpha002及之后版本运行 | +| 2023/06/11 | 取消workspace大小硬编码 | | \ No newline at end of file diff --git a/operator/MatmulLeakyReluCustomSample/KernelLaunch/MatmulLeakyReluInvocationAcl/cmake/cpu_lib.cmake b/operator/MatmulLeakyReluCustomSample/KernelLaunch/MatmulLeakyReluInvocationAcl/cmake/cpu_lib.cmake new file mode 100644 index 0000000000000000000000000000000000000000..76c8adb205173638149cdaaf68bc2d11ea9c6b21 --- /dev/null +++ b/operator/MatmulLeakyReluCustomSample/KernelLaunch/MatmulLeakyReluInvocationAcl/cmake/cpu_lib.cmake @@ -0,0 +1,12 @@ +if(NOT DEFINED ENV{CMAKE_PREFIX_PATH}) + set(CMAKE_PREFIX_PATH ${ASCEND_CANN_PACKAGE_PATH}/tools/tikicpulib/lib/cmake) +endif() +find_package(tikicpulib REQUIRED) + +add_library(ascendc_kernels SHARED ${KERNEL_FILES}) +target_link_libraries(ascendc_kernels PUBLIC tikicpulib::${SOC_VERSION}) +target_compile_definitions(ascendc_kernels PRIVATE + $<$>:CUSTOM_ASCEND310P> +) +target_compile_options(ascendc_kernels PRIVATE -g -O0 -std=c++17) +install(TARGETS ascendc_kernels DESTINATION ${CMAKE_INSTALL_LIBDIR}) diff --git a/operator/MatmulLeakyReluCustomSample/KernelLaunch/MatmulLeakyReluInvocationAcl/cmake/npu_lib.cmake b/operator/MatmulLeakyReluCustomSample/KernelLaunch/MatmulLeakyReluInvocationAcl/cmake/npu_lib.cmake new file mode 100644 index 0000000000000000000000000000000000000000..d4daabc5803c97726343560fe940cfd694642b46 --- /dev/null +++ b/operator/MatmulLeakyReluCustomSample/KernelLaunch/MatmulLeakyReluInvocationAcl/cmake/npu_lib.cmake @@ -0,0 +1,16 @@ +if(EXISTS ${ASCEND_CANN_PACKAGE_PATH}/compiler/tikcpp/ascendc_kernel_cmake) + set(ASCENDC_CMAKE_DIR ${ASCEND_CANN_PACKAGE_PATH}/compiler/tikcpp/ascendc_kernel_cmake) +elseif(EXISTS ${ASCEND_CANN_PACKAGE_PATH}/tools/tikcpp/ascendc_kernel_cmake) + set(ASCENDC_CMAKE_DIR ${ASCEND_CANN_PACKAGE_PATH}/tools/tikcpp/ascendc_kernel_cmake) +else() + message(FATAL_ERROR "ascendc_kernel_cmake does not exist ,please check whether the cann package is installed") +endif() +include(${ASCENDC_CMAKE_DIR}/ascendc.cmake) + +ascendc_fatbin_library(ascendc_kernels ${KERNEL_FILES}) + 
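The README formula above (C = A * B + Bias, then the 0.001 leaky slope) maps directly onto a NumPy reference; scripts/gen_data.py later performs the same computation to produce the golden data. A minimal sketch with the stated shapes A[1024,256], B[256,640], Bias[640]:

```python
# Minimal NumPy reference for C = LeakyRelu(A @ B + bias, alpha=0.001).
import numpy as np

a = np.random.randint(1, 10, [1024, 256]).astype(np.float16)
b = np.random.randint(1, 10, [256, 640]).astype(np.float16)
bias = np.random.randint(1, 10, [640]).astype(np.float32)

c = np.matmul(a.astype(np.float32), b.astype(np.float32)) + bias
c = np.where(c >= 0, c, c * 0.001)   # leaky slope from the README formula
print(c.shape, c.dtype)              # (1024, 640) float32
```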
+ascendc_compile_definitions(ascendc_kernels PRIVATE + $<$>:CUSTOM_ASCEND310P> + -DHAVE_WORKSPACE + -DHAVE_TILING +) diff --git a/operator/MatmulLeakyReluCustomSample/KernelLaunch/MatmulLeakyReluInvocationAcl/data_utils.h b/operator/MatmulLeakyReluCustomSample/KernelLaunch/MatmulLeakyReluInvocationAcl/data_utils.h new file mode 100644 index 0000000000000000000000000000000000000000..1d43459ef7c5858cf97a9ba43769e50b8dd28853 --- /dev/null +++ b/operator/MatmulLeakyReluCustomSample/KernelLaunch/MatmulLeakyReluInvocationAcl/data_utils.h @@ -0,0 +1,240 @@ +/** + * @file data_utils.h + * + * Copyright (C) 2023-2024. Huawei Technologies Co., Ltd. All rights reserved. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + */ +#ifndef DATA_UTILS_H +#define DATA_UTILS_H +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "acl/acl.h" + +typedef enum { + DT_UNDEFINED = -1, + FLOAT = 0, + HALF = 1, + INT8_T = 2, + INT32_T = 3, + UINT8_T = 4, + INT16_T = 6, + UINT16_T = 7, + UINT32_T = 8, + INT64_T = 9, + UINT64_T = 10, + DOUBLE = 11, + BOOL = 12, + STRING = 13, + COMPLEX64 = 16, + COMPLEX128 = 17, + BF16 = 27 +} printDataType; + +#define INFO_LOG(fmt, args...) fprintf(stdout, "[INFO] " fmt "\n", ##args) +#define WARN_LOG(fmt, args...) fprintf(stdout, "[WARN] " fmt "\n", ##args) +#define ERROR_LOG(fmt, args...) fprintf(stdout, "[ERROR] " fmt "\n", ##args) +#define CHECK_ACL(x) \ + do { \ + aclError __ret = x; \ + if (__ret != ACL_ERROR_NONE) { \ + std::cerr << __FILE__ << ":" << __LINE__ << " aclError:" << __ret << std::endl; \ + } \ + } while (0); + +/** + * @brief Read data from file + * @param [in] filePath: file path + * @param [out] fileSize: file size + * @return read result + */ +bool ReadFile(const std::string &filePath, size_t &fileSize, void *buffer, size_t bufferSize) +{ + struct stat sBuf; + int fileStatus = stat(filePath.data(), &sBuf); + if (fileStatus == -1) { + ERROR_LOG("failed to get file"); + return false; + } + if (S_ISREG(sBuf.st_mode) == 0) { + ERROR_LOG("%s is not a file, please enter a file", filePath.c_str()); + return false; + } + + std::ifstream file; + file.open(filePath, std::ios::binary); + if (!file.is_open()) { + ERROR_LOG("Open file failed. path = %s", filePath.c_str()); + return false; + } + + std::filebuf *buf = file.rdbuf(); + size_t size = buf->pubseekoff(0, std::ios::end, std::ios::in); + if (size == 0) { + ERROR_LOG("file size is 0"); + file.close(); + return false; + } + if (size > bufferSize) { + ERROR_LOG("file size is larger than buffer size"); + file.close(); + return false; + } + buf->pubseekpos(0, std::ios::in); + buf->sgetn(static_cast(buffer), size); + fileSize = size; + file.close(); + return true; +} + +/** + * @brief Write data to file + * @param [in] filePath: file path + * @param [in] buffer: data to write to file + * @param [in] size: size to write + * @return write result + */ +bool WriteFile(const std::string &filePath, const void *buffer, size_t size) +{ + if (buffer == nullptr) { + ERROR_LOG("Write file failed. buffer is nullptr"); + return false; + } + + int fd = open(filePath.c_str(), O_RDWR | O_CREAT | O_TRUNC, S_IRUSR | S_IWRITE); + if (fd < 0) { + ERROR_LOG("Open file failed. 
path = %s", filePath.c_str()); + return false; + } + + size_t writeSize = write(fd, buffer, size); + (void)close(fd); + if (writeSize != size) { + ERROR_LOG("Write file Failed."); + return false; + } + + return true; +} + +/** + * @brief Reads a binary file into memory. + * + * This function opens a binary file, reads its contents into a dynamically allocated memory buffer, + * and returns a pointer to the buffer and the size of the file through output parameters. + * + * @param filePath The path to the binary file to be read. + * @param outBuffer A reference to a unique pointer that will hold the file data. + * @param outSize A reference to a size_t that will hold the size of the file. + * @return true if the file was read successfully, false otherwise. + */ +bool ReadBinaryFile(const char *filePath, std::unique_ptr &outBuffer, size_t &outSize) +{ + FILE *file = fopen(filePath, "rb"); + if (!file) { + ERROR_LOG("Error opening file: %s\n", strerror(errno)); + return false; + } + + fseek(file, 0, SEEK_END); + outSize = ftell(file); + rewind(file); + + outBuffer.reset(new char[outSize]); + if (fread(outBuffer.get(), 1, outSize, file) != outSize) { + ERROR_LOG("Error reading file.\n"); + fclose(file); + return false; + } + + fclose(file); + return true; +} + +template void DoPrintData(const T *data, size_t count, size_t elementsPerRow) +{ + assert(elementsPerRow != 0); + for (size_t i = 0; i < count; ++i) { + std::cout << std::setw(10) << data[i]; + if (i % elementsPerRow == elementsPerRow - 1) { + std::cout << std::endl; + } + } +} + +void DoPrintHalfData(const aclFloat16 *data, size_t count, size_t elementsPerRow) +{ + assert(elementsPerRow != 0); + for (size_t i = 0; i < count; ++i) { + std::cout << std::setw(10) << std::setprecision(6) << aclFloat16ToFloat(data[i]); + if (i % elementsPerRow == elementsPerRow - 1) { + std::cout << std::endl; + } + } +} + +void PrintData(const void *data, size_t count, printDataType dataType, size_t elementsPerRow = 16) +{ + if (data == nullptr) { + ERROR_LOG("Print data failed. 
data is nullptr"); + return; + } + + switch (dataType) { + case BOOL: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case INT8_T: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case UINT8_T: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case INT16_T: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case UINT16_T: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case INT32_T: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case UINT32_T: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case INT64_T: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case UINT64_T: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case HALF: + DoPrintHalfData(reinterpret_cast(data), count, elementsPerRow); + break; + case FLOAT: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case DOUBLE: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + default: + ERROR_LOG("Unsupported type: %d", dataType); + } + std::cout << std::endl; +} +#endif // DATA_UTILS_H diff --git a/operator/MatmulLeakyReluCustomSample/KernelLaunch/MatmulLeakyReluInvocationAcl/main.cpp b/operator/MatmulLeakyReluCustomSample/KernelLaunch/MatmulLeakyReluInvocationAcl/main.cpp new file mode 100644 index 0000000000000000000000000000000000000000..dfcedf19f44f2cd3b34716cd5fbaa5f6e36dae37 --- /dev/null +++ b/operator/MatmulLeakyReluCustomSample/KernelLaunch/MatmulLeakyReluInvocationAcl/main.cpp @@ -0,0 +1,147 @@ +/** + * @file main.cpp + * + * Copyright (C) 2023-2024. Huawei Technologies Co., Ltd. All rights reserved. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
+ */ +#include "data_utils.h" +#include "kernel_tiling/kernel_tiling.h" +#include "tiling/platform/platform_ascendc.h" +#include "acl/acl.h" + +typedef int32_t rtError_t; +extern uint8_t *GenerateTiling(const char *socVersion); +extern "C" rtError_t rtGetC2cCtrlAddr(uint64_t *addr, uint32_t *len); // from libruntime.so + +int32_t main(int32_t argc, char *argv[]) +{ + const char *socVersion = SOC_VERSION; + auto ascendcPlatform = platform_ascendc::PlatformAscendCManager::GetInstance(socVersion); + size_t aFileSize = 262144 * sizeof(int16_t); + size_t bFileSize = 163840 * sizeof(int16_t); + size_t cFileSize = 655360 * sizeof(float); + size_t biasFileSize = 640 * sizeof(float); + size_t tilingFileSize = sizeof(TCubeTiling); + size_t userWorkspaceSize = 0; + size_t systemWorkspaceSize = static_cast(ascendcPlatform->GetLibApiWorkSpaceSize()); + size_t workspaceSize = userWorkspaceSize + systemWorkspaceSize; +#ifdef CUSTOM_ASCEND310P + uint32_t blockDim = 2; +#else + uint32_t blockDim = 1; +#endif + + CHECK_ACL(aclInit(nullptr)); + int32_t deviceId = 0; + CHECK_ACL(aclrtSetDevice(deviceId)); + aclrtStream stream = nullptr; + CHECK_ACL(aclrtCreateStream(&stream)); + + uint8_t *inputAHost; + uint8_t *inputADevice; + CHECK_ACL(aclrtMallocHost((void **)(&inputAHost), aFileSize)); + CHECK_ACL(aclrtMalloc((void **)&inputADevice, aFileSize, ACL_MEM_MALLOC_HUGE_FIRST)); + ReadFile("./input/x1_gm.bin", aFileSize, inputAHost, aFileSize); + CHECK_ACL(aclrtMemcpy(inputADevice, aFileSize, inputAHost, aFileSize, ACL_MEMCPY_HOST_TO_DEVICE)); + + uint8_t *inputBHost; + uint8_t *inputBDevice; + CHECK_ACL(aclrtMallocHost((void **)(&inputBHost), bFileSize)); + CHECK_ACL(aclrtMalloc((void **)&inputBDevice, bFileSize, ACL_MEM_MALLOC_HUGE_FIRST)); + ReadFile("./input/x2_gm.bin", bFileSize, inputBHost, bFileSize); + CHECK_ACL(aclrtMemcpy(inputBDevice, bFileSize, inputBHost, bFileSize, ACL_MEMCPY_HOST_TO_DEVICE)); + + uint8_t *outputCHost; + uint8_t *outputCDevice; + CHECK_ACL(aclrtMallocHost((void **)(&outputCHost), cFileSize)); + CHECK_ACL(aclrtMalloc((void **)&outputCDevice, cFileSize, ACL_MEM_MALLOC_HUGE_FIRST)); + + uint8_t *inputBiasHost; + uint8_t *inputBiasDevice; + CHECK_ACL(aclrtMallocHost((void **)(&inputBiasHost), biasFileSize)); + CHECK_ACL(aclrtMalloc((void **)&inputBiasDevice, biasFileSize, ACL_MEM_MALLOC_HUGE_FIRST)); + ReadFile("./input/bias.bin", biasFileSize, inputBiasHost, biasFileSize); + CHECK_ACL(aclrtMemcpy(inputBiasDevice, biasFileSize, inputBiasHost, biasFileSize, ACL_MEMCPY_HOST_TO_DEVICE)); + + uint8_t *tilingHost; + uint8_t *tilingDevice; + CHECK_ACL(aclrtMallocHost((void **)(&tilingHost), tilingFileSize)); + CHECK_ACL(aclrtMalloc((void **)&tilingDevice, tilingFileSize, ACL_MEM_MALLOC_HUGE_FIRST)); + CHECK_ACL( + aclrtMemcpy(tilingHost, tilingFileSize, GenerateTiling(socVersion), tilingFileSize, ACL_MEMCPY_HOST_TO_HOST)); + CHECK_ACL(aclrtMemcpy(tilingDevice, tilingFileSize, tilingHost, tilingFileSize, ACL_MEMCPY_HOST_TO_DEVICE)); + + uint8_t *workspaceDevice; + CHECK_ACL(aclrtMalloc((void **)&workspaceDevice, workspaceSize, ACL_MEM_MALLOC_HUGE_FIRST)); + + uint64_t fftsAddr; + uint32_t fftsAddrLen; + CHECK_ACL(rtGetC2cCtrlAddr(&fftsAddr, &fftsAddrLen)); + + struct { + uintptr_t fftsAddr; + uintptr_t inputADevice; + uintptr_t inputBDevice; + uintptr_t inputBiasDevice; + uintptr_t outputCDevice; + uintptr_t workspaceDevice; + uintptr_t tilingDevice; + } args; + + auto *argsDeviceAddrInHost = &args; + CHECK_ACL(aclrtMallocHost((void **)&argsDeviceAddrInHost, sizeof(args))); + 
argsDeviceAddrInHost->fftsAddr = (uintptr_t)fftsAddr; + argsDeviceAddrInHost->inputADevice = (uintptr_t)inputADevice; + argsDeviceAddrInHost->inputBDevice = (uintptr_t)inputBDevice; + argsDeviceAddrInHost->inputBiasDevice = (uintptr_t)inputBiasDevice; + argsDeviceAddrInHost->outputCDevice = (uintptr_t)outputCDevice; + argsDeviceAddrInHost->workspaceDevice = (uintptr_t)workspaceDevice; + argsDeviceAddrInHost->tilingDevice = (uintptr_t)tilingDevice; + + void *argsDeviceAddrInDevice = nullptr; + CHECK_ACL(aclrtMalloc((void **)&argsDeviceAddrInDevice, sizeof(args), ACL_MEM_MALLOC_HUGE_FIRST)); + CHECK_ACL(aclrtMemcpy(argsDeviceAddrInDevice, sizeof(args), argsDeviceAddrInHost, sizeof(args), ACL_MEMCPY_HOST_TO_DEVICE)); + CHECK_ACL(aclrtFreeHost(argsDeviceAddrInHost)); + + const char *filePath = "./out/fatbin/ascendc_kernels/ascendc_kernels.o"; + std::unique_ptr fileMemory; + size_t fileSize = 0; + ReadBinaryFile(filePath, fileMemory, fileSize); + + aclrtBinary binary = aclrtCreateBinary(fileMemory.get(), fileSize); + if (binary == nullptr) { + printf("aclrtCreateBinary failed"); + return -1; + } + aclrtBinHandle binHandle = nullptr; + CHECK_ACL(aclrtBinaryLoad(binary, &binHandle)); + CHECK_ACL(aclrtDestroyBinary(binary)); + + aclrtFuncHandle funAdd = nullptr; + CHECK_ACL(aclrtBinaryGetFunction(binHandle, "matmul_leakyrelu_custom", &funAdd)); + + CHECK_ACL(aclrtLaunchKernel(funAdd, blockDim, argsDeviceAddrInDevice, sizeof(args), stream)); + CHECK_ACL(aclrtSynchronizeStream(stream)); + + CHECK_ACL(aclrtFree(inputADevice)); + CHECK_ACL(aclrtFreeHost(inputAHost)); + CHECK_ACL(aclrtFree(inputBDevice)); + CHECK_ACL(aclrtFreeHost(inputBHost)); + CHECK_ACL(aclrtMemcpy(outputCHost, cFileSize, outputCDevice, cFileSize, ACL_MEMCPY_DEVICE_TO_HOST)); + WriteFile("./output/output.bin", outputCHost, cFileSize); + CHECK_ACL(aclrtFree(outputCDevice)); + CHECK_ACL(aclrtFreeHost(outputCHost)); + CHECK_ACL(aclrtFree(inputBiasDevice)); + CHECK_ACL(aclrtFreeHost(inputBiasHost)); + CHECK_ACL(aclrtFree(tilingDevice)); + CHECK_ACL(aclrtFreeHost(tilingHost)); + CHECK_ACL(aclrtFree(workspaceDevice)); + + CHECK_ACL(aclrtDestroyStream(stream)); + CHECK_ACL(aclrtResetDevice(deviceId)); + CHECK_ACL(aclFinalize()); + return 0; +} \ No newline at end of file diff --git a/operator/MatmulLeakyReluCustomSample/KernelLaunch/MatmulLeakyReluInvocationAcl/matmul_leakyrelu_custom.cpp b/operator/MatmulLeakyReluCustomSample/KernelLaunch/MatmulLeakyReluInvocationAcl/matmul_leakyrelu_custom.cpp new file mode 100644 index 0000000000000000000000000000000000000000..ac50143b5cd7faf027a7f469084c7cad3303639e --- /dev/null +++ b/operator/MatmulLeakyReluCustomSample/KernelLaunch/MatmulLeakyReluInvocationAcl/matmul_leakyrelu_custom.cpp @@ -0,0 +1,155 @@ +/** + * @file matmul_leakyrelu_custom.cpp + * + * Copyright (C) 2023-2024. Huawei Technologies Co., Ltd. All rights reserved. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
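main.cpp passes the launch arguments to aclrtLaunchKernel as one contiguous block of pointer-sized fields: the FFTS control address first, then the six device pointers in the same order as the kernel's GM_ADDR parameters (a, b, bias, c, workspace, tiling). On a 64-bit host the block is just eight-byte slots back to back; the sketch below only illustrates that layout with placeholder addresses and is not an ACL API.

```python
# Illustrative layout of the launch-argument block built in main.cpp:
# 7 pointer-sized fields (64-bit assumed), fftsAddr followed by the kernel's
# GM_ADDR parameters in declaration order. Addresses are hypothetical.
import struct

ffts_addr = 0x1000
dev_ptrs = [0x2000, 0x3000, 0x4000, 0x5000, 0x6000, 0x7000]  # a, b, bias, c, workspace, tiling
args_blob = struct.pack("<7Q", ffts_addr, *dev_ptrs)
assert len(args_blob) == 7 * 8   # matches sizeof(args) with 64-bit pointers
```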
+ */ +#include "kernel_operator.h" +#include "lib/matmul_intf.h" + +using namespace matmul; + +__aicore__ inline uint32_t Ceiling(uint32_t a, uint32_t b) +{ + return (a + b - 1) / b; +} + +__aicore__ inline void CopyTiling(TCubeTiling *tiling, GM_ADDR tilingGM) +{ + uint32_t *ptr = reinterpret_cast(tiling); + auto tiling32 = reinterpret_cast<__gm__ uint32_t *>(tilingGM); + + for (uint32_t i = 0; i < sizeof(TCubeTiling) / sizeof(uint32_t); i++, ptr++) { + *ptr = *(tiling32 + i); + } + return; +} + +template class MatmulLeakyKernel { +public: + __aicore__ inline MatmulLeakyKernel(){}; + __aicore__ inline void Init(GM_ADDR a, GM_ADDR b, GM_ADDR bias, GM_ADDR c, GM_ADDR workspace, + const TCubeTiling &tiling, AscendC::TPipe *pipe); + __aicore__ inline void Process(AscendC::TPipe *pipe); + + __aicore__ inline void MatmulCompute(); + __aicore__ inline void LeakyReluCompute(); + __aicore__ inline void CopyOut(uint32_t count); + __aicore__ inline void CalcOffset(int32_t blockIdx, const TCubeTiling &tiling, int32_t &offsetA, int32_t &offsetB, + int32_t &offsetC, int32_t &offsetBias); + + Matmul, MatmulType, + MatmulType, MatmulType> + matmulObj; + + AscendC::GlobalTensor aGlobal; + AscendC::GlobalTensor bGlobal; + AscendC::GlobalTensor cGlobal; + AscendC::GlobalTensor biasGlobal; + AscendC::LocalTensor reluOutLocal; + TCubeTiling tiling; + AscendC::TQue reluOutQueue_; +}; + +template +__aicore__ inline void MatmulLeakyKernel::Init(GM_ADDR a, GM_ADDR b, GM_ADDR bias, + GM_ADDR c, GM_ADDR workspace, + const TCubeTiling &tiling, AscendC::TPipe *pipe) +{ + this->tiling = tiling; + aGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ aType *>(a), tiling.M * tiling.Ka); + bGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ bType *>(b), tiling.Kb * tiling.N); + cGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ cType *>(c), tiling.M * tiling.N); + biasGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ biasType *>(bias), tiling.N); + + int32_t offsetA, offsetB, offsetC, offsetBias; + CalcOffset(AscendC::GetBlockIdx(), tiling, offsetA, offsetB, offsetC, offsetBias); + aGlobal = aGlobal[offsetA]; + bGlobal = bGlobal[offsetB]; + cGlobal = cGlobal[offsetC]; + biasGlobal = biasGlobal[offsetBias]; + pipe->InitBuffer(reluOutQueue_, 1, tiling.baseM * tiling.baseN * sizeof(cType)); +} + +template +__aicore__ inline void MatmulLeakyKernel::Process(AscendC::TPipe *pipe) +{ + uint32_t computeRound = 0; + +#ifdef CUSTOM_ASCEND310P + AscendC::TBuf<> tmpMMFormatUb; + AscendC::LocalTensor mmformatUb; + pipe->InitBuffer(tmpMMFormatUb, tiling.baseM * tiling.baseN * sizeof(cType)); + mmformatUb = tmpMMFormatUb.Get(tiling.baseM * tiling.baseN * sizeof(cType)); + matmulObj.SetLocalWorkspace(mmformatUb); +#endif + matmulObj.SetTensorA(aGlobal); + matmulObj.SetTensorB(bGlobal); + matmulObj.SetBias(biasGlobal); + while (matmulObj.template Iterate()) { + MatmulCompute(); + LeakyReluCompute(); + CopyOut(computeRound); + computeRound++; + } + matmulObj.End(); +} + +template +__aicore__ inline void MatmulLeakyKernel::MatmulCompute() +{ + reluOutLocal = reluOutQueue_.AllocTensor(); + matmulObj.template GetTensorC(reluOutLocal, false, true); +} + +template +__aicore__ inline void MatmulLeakyKernel::LeakyReluCompute() +{ + LeakyRelu(reluOutLocal, reluOutLocal, (cType)0.001, tiling.baseM * tiling.baseN); + reluOutQueue_.EnQue(reluOutLocal); +} + +template +__aicore__ inline void MatmulLeakyKernel::CopyOut(uint32_t count) +{ + reluOutQueue_.DeQue(); + const uint32_t roundM = tiling.singleCoreM / tiling.baseM; + const uint32_t roundN = tiling.singleCoreN 
/ tiling.baseN; + uint32_t startOffset = (count % roundM * tiling.baseM * tiling.N + count / roundM * tiling.baseN); + AscendC::DataCopyParams copyParam = {(uint16_t)tiling.baseM, (uint16_t)(tiling.baseN * sizeof(cType) / AscendC::DEFAULT_C0_SIZE), 0, + (uint16_t)((tiling.N - tiling.baseN) * sizeof(cType) / AscendC::DEFAULT_C0_SIZE)}; + DataCopy(cGlobal[startOffset], reluOutLocal, copyParam); + reluOutQueue_.FreeTensor(reluOutLocal); +} + +template +__aicore__ inline void +MatmulLeakyKernel::CalcOffset(int32_t blockIdx, const TCubeTiling &tiling, + int32_t &offsetA, int32_t &offsetB, int32_t &offsetC, + int32_t &offsetBias) +{ + auto mSingleBlocks = Ceiling(tiling.M, tiling.singleCoreM); + auto mCoreIndx = blockIdx % mSingleBlocks; + auto nCoreIndx = blockIdx / mSingleBlocks; + + offsetA = mCoreIndx * tiling.Ka * tiling.singleCoreM; + offsetB = nCoreIndx * tiling.singleCoreN; + offsetC = mCoreIndx * tiling.N * tiling.singleCoreM + nCoreIndx * tiling.singleCoreN; + offsetBias = nCoreIndx * tiling.singleCoreN; +} + +extern "C" __global__ __aicore__ void matmul_leakyrelu_custom(GM_ADDR a, GM_ADDR b, GM_ADDR bias, GM_ADDR c, + GM_ADDR workspace, GM_ADDR tilingGm) +{ + AscendC::TPipe pipe; + TCubeTiling tiling; + CopyTiling(&tiling, tilingGm); + + MatmulLeakyKernel matmulLeakyKernel; + matmulLeakyKernel.Init(a, b, bias, c, workspace, tiling, &pipe); + REGIST_MATMUL_OBJ(&pipe, GetSysWorkSpacePtr(), matmulLeakyKernel.matmulObj, &matmulLeakyKernel.tiling); + matmulLeakyKernel.Process(&pipe); +} \ No newline at end of file diff --git a/operator/MatmulLeakyReluCustomSample/KernelLaunch/MatmulLeakyReluInvocationAcl/matmul_leakyrelu_custom_tiling.cpp b/operator/MatmulLeakyReluCustomSample/KernelLaunch/MatmulLeakyReluInvocationAcl/matmul_leakyrelu_custom_tiling.cpp new file mode 100644 index 0000000000000000000000000000000000000000..287777e53ba89c06286c79522d3fd484fb6bb291 --- /dev/null +++ b/operator/MatmulLeakyReluCustomSample/KernelLaunch/MatmulLeakyReluInvocationAcl/matmul_leakyrelu_custom_tiling.cpp @@ -0,0 +1,82 @@ +/** + * @file matmul_leakyrelu_custom_tiling.cpp + * + * Copyright (C) 2023-2024. Huawei Technologies Co., Ltd. All rights reserved. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
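CalcOffset above derives each core's starting element in A, B, Bias and C purely from its block index and the singleCore shape. The same arithmetic in plain Python, with example tiling values (singleCoreM/singleCoreN here are illustrative, not necessarily what MultiCoreMatmulTiling chooses), makes the row/column split easier to see:

```python
# Same arithmetic as MatmulLeakyKernel::CalcOffset, with example tiling values.
def ceiling(a, b):
    return (a + b - 1) // b

def calc_offset(block_idx, M, N, Ka, single_core_m, single_core_n):
    m_single_blocks = ceiling(M, single_core_m)
    m_core = block_idx % m_single_blocks
    n_core = block_idx // m_single_blocks
    offset_a = m_core * Ka * single_core_m
    offset_b = n_core * single_core_n
    offset_c = m_core * N * single_core_m + n_core * single_core_n
    offset_bias = n_core * single_core_n
    return offset_a, offset_b, offset_c, offset_bias

for idx in range(2):  # blockDim is 2 when CUSTOM_ASCEND310P is defined in this sample
    print(idx, calc_offset(idx, 1024, 640, 256, 512, 640))
```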
+ */ +#include +#include +#include +#include +#include + +#include "tiling/tiling_api.h" +#include "tiling/platform/platform_ascendc.h" +using namespace matmul_tiling; +using namespace std; + +uint8_t *GetTilingBuf(optiling::TCubeTiling *tilingData) +{ + uint32_t tilingSize = tilingData->GetDataSize(); + uint8_t *buf = (uint8_t *)malloc(tilingSize); + tilingData->SaveToBuffer(buf, tilingSize); + return buf; +} + +uint8_t *GenerateTiling(const char *socVersion) +{ + int M = 1024; + int N = 640; + int K = 256; + + TPosition leftPosition = TPosition::GM; + CubeFormat leftFormat = CubeFormat::ND; + DataType leftDtype = DataType::DT_FLOAT16; + bool isTransA = false; + + TPosition rightPosition = TPosition::GM; + CubeFormat rightFormat = CubeFormat::ND; + DataType rightDtype = DataType::DT_FLOAT16; + bool isTransB = false; + + TPosition resultPosition = TPosition::GM; + CubeFormat resultFormat = CubeFormat::ND; + DataType resultDtype = DataType::DT_FLOAT; + + TPosition biasPosition = TPosition::GM; + CubeFormat biasFormat = CubeFormat::ND; + DataType biasDtype = DataType::DT_FLOAT; + bool isBias = true; + + int usedCoreNum = 2; + int baseM = 256; + int baseN = 128; + + optiling::TCubeTiling tilingData; + auto ascendcPlatform = platform_ascendc::PlatformAscendCManager::GetInstance(socVersion); + MultiCoreMatmulTiling tilingApi(*ascendcPlatform); + + tilingApi.SetDim(usedCoreNum); + tilingApi.SetAType(leftPosition, leftFormat, leftDtype, isTransA); + tilingApi.SetBType(rightPosition, rightFormat, rightDtype, isTransB); + tilingApi.SetCType(resultPosition, resultFormat, resultDtype); + tilingApi.SetBiasType(biasPosition, biasFormat, biasDtype); + + tilingApi.SetOrgShape(M, N, K); + tilingApi.SetShape(M, N, K); + tilingApi.SetBias(isBias); + tilingApi.SetTraverse(MatrixTraverse::FIRSTM); + tilingApi.SetFixSplit(baseM, baseN, -1); + tilingApi.SetBufferSpace(-1, -1, -1); + + int64_t res = tilingApi.GetTiling(tilingData); + tilingData.set_stepM(1); + tilingData.set_stepN(1); + if (res == -1) { + std::cout << "gen tiling failed" << std::endl; + } + return GetTilingBuf(&tilingData); +} diff --git a/operator/MatmulLeakyReluCustomSample/KernelLaunch/MatmulLeakyReluInvocationAcl/run.sh b/operator/MatmulLeakyReluCustomSample/KernelLaunch/MatmulLeakyReluInvocationAcl/run.sh new file mode 100644 index 0000000000000000000000000000000000000000..af6e9101619a2c5a3d6cc154a7ebe8d18fe78018 --- /dev/null +++ b/operator/MatmulLeakyReluCustomSample/KernelLaunch/MatmulLeakyReluInvocationAcl/run.sh @@ -0,0 +1,126 @@ +#!/bin/bash +CURRENT_DIR=$( + cd $(dirname ${BASH_SOURCE:-$0}) + pwd +) +cd $CURRENT_DIR + +BUILD_TYPE="Debug" +INSTALL_PREFIX="${CURRENT_DIR}/out" + +SHORT=r:,v:,i:,b:,p:, +LONG=run-mode:,soc-version:,install-path:,build-type:,install-prefix:, +OPTS=$(getopt -a --options $SHORT --longoptions $LONG -- "$@") +eval set -- "$OPTS" + +while :; do + case "$1" in + -r | --run-mode) + RUN_MODE="$2" + shift 2 + ;; + -v | --soc-version) + SOC_VERSION="$2" + shift 2 + ;; + -i | --install-path) + ASCEND_INSTALL_PATH="$2" + shift 2 + ;; + -b | --build-type) + BUILD_TYPE="$2" + shift 2 + ;; + -p | --install-prefix) + INSTALL_PREFIX="$2" + shift 2 + ;; + --) + shift + break + ;; + *) + echo "[ERROR] Unexpected option: $1" + break + ;; + esac +done + +RUN_MODE_LIST="cpu sim npu" +if [[ " $RUN_MODE_LIST " != *" $RUN_MODE "* ]]; then + echo "ERROR: RUN_MODE error, This sample only support specify cpu, sim or npu!" 
+ exit -1 +fi + +VERSION_LIST="Ascend310P1 Ascend310P3 Ascend910B1 Ascend910B2 Ascend910B3 Ascend910B4" +if [[ " $VERSION_LIST " != *" $SOC_VERSION "* ]]; then + echo "ERROR: SOC_VERSION should be in [$VERSION_LIST]" + exit -1 +fi + +if [ -n "$ASCEND_INSTALL_PATH" ]; then + _ASCEND_INSTALL_PATH=$ASCEND_INSTALL_PATH +elif [ -n "$ASCEND_HOME_PATH" ]; then + _ASCEND_INSTALL_PATH=$ASCEND_HOME_PATH +else + if [ -d "$HOME/Ascend/ascend-toolkit/latest" ]; then + _ASCEND_INSTALL_PATH=$HOME/Ascend/ascend-toolkit/latest + else + _ASCEND_INSTALL_PATH=/usr/local/Ascend/ascend-toolkit/latest + fi +fi + +export ASCEND_TOOLKIT_HOME=${_ASCEND_INSTALL_PATH} +export ASCEND_HOME_PATH=${_ASCEND_INSTALL_PATH} +if [ "${RUN_MODE}" = "npu" ]; then + source ${_ASCEND_INSTALL_PATH}/bin/setenv.bash +elif [ "${RUN_MODE}" = "sim" ]; then + # in case of running op in simulator, use stub .so instead + export LD_LIBRARY_PATH=${_ASCEND_INSTALL_PATH}/runtime/lib64/stub:$LD_LIBRARY_PATH + source ${_ASCEND_INSTALL_PATH}/bin/setenv.bash + export LD_LIBRARY_PATH=${_ASCEND_INSTALL_PATH}/tools/simulator/${SOC_VERSION}/lib:$LD_LIBRARY_PATH + if [ ! $CAMODEL_LOG_PATH ]; then + export CAMODEL_LOG_PATH=$(pwd)/sim_log + fi + if [ -d "$CAMODEL_LOG_PATH" ]; then + rm -rf $CAMODEL_LOG_PATH + fi + mkdir -p $CAMODEL_LOG_PATH +elif [ "${RUN_MODE}" = "cpu" ]; then + source ${_ASCEND_INSTALL_PATH}/bin/setenv.bash + export LD_LIBRARY_PATH=${_ASCEND_INSTALL_PATH}/tools/tikicpulib/lib:${_ASCEND_INSTALL_PATH}/tools/tikicpulib/lib/${SOC_VERSION}:${_ASCEND_INSTALL_PATH}/tools/simulator/${SOC_VERSION}/lib:$LD_LIBRARY_PATH +fi + +set -e +rm -rf build out +mkdir -p build +cmake -B build \ + -DRUN_MODE=${RUN_MODE} \ + -DSOC_VERSION=${SOC_VERSION} \ + -DCMAKE_BUILD_TYPE=${BUILD_TYPE} \ + -DCMAKE_INSTALL_PREFIX=${INSTALL_PREFIX} \ + -DASCEND_CANN_PACKAGE_PATH=${_ASCEND_INSTALL_PATH} +cmake --build build -j +cmake --install build + +rm -f ascendc_kernels_bbit +cp ./out/bin/ascendc_kernels_bbit ./ +rm -rf input output +mkdir -p input output +python3 scripts/gen_data.py +( + export LD_LIBRARY_PATH=$(pwd)/out/lib:$(pwd)/out/lib64:${_ASCEND_INSTALL_PATH}/lib64:$LD_LIBRARY_PATH + if [[ "$RUN_WITH_TOOLCHAIN" -eq 1 ]]; then + if [ "${RUN_MODE}" = "npu" ]; then + msprof op --application=./ascendc_kernels_bbit + elif [ "${RUN_MODE}" = "sim" ]; then + msprof op simulator --application=./ascendc_kernels_bbit + elif [ "${RUN_MODE}" = "cpu" ]; then + ./ascendc_kernels_bbit + fi + else + ./ascendc_kernels_bbit + fi +) +md5sum output/*.bin +python3 scripts/verify_result.py output/output.bin output/golden.bin diff --git a/operator/MatmulLeakyReluCustomSample/KernelLaunch/MatmulLeakyReluInvocationAcl/scripts/gen_data.py b/operator/MatmulLeakyReluCustomSample/KernelLaunch/MatmulLeakyReluInvocationAcl/scripts/gen_data.py new file mode 100644 index 0000000000000000000000000000000000000000..e03d4359cad05680cc261d8e638867803eacdd71 --- /dev/null +++ b/operator/MatmulLeakyReluCustomSample/KernelLaunch/MatmulLeakyReluInvocationAcl/scripts/gen_data.py @@ -0,0 +1,35 @@ +#!/usr/bin/python3 +# coding=utf-8 +# +# Copyright (C) 2023-2024. Huawei Technologies Co., Ltd. All rights reserved. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
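gen_data.py below fills A and B with small integers (1..9) stored as float16 and computes the golden in float32. One likely reason this pattern is robust: every intermediate value stays an exact integer well below 2^24, so the float32 golden carries no rounding ambiguity and the device result can be compared tightly. A quick bound check under that assumption:

```python
# Rough bound on the largest value the golden matmul-plus-bias can reach with
# the generator below (integer inputs in [1, 9], K = 256, bias in [1, 9]).
K = 256
max_product_sum = K * 9 * 9          # 20736
max_with_bias = max_product_sum + 9  # 20745
assert max_with_bias < 2**24         # exactly representable as a float32 integer
print("max golden value:", max_with_bias)
```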
+# =============================================================================== + +import numpy as np +import os + + +def gen_golden_data(): + M = 1024 + N = 640 + K = 256 + + input_a = np.random.randint(1, 10, [M, K]).astype(np.float16) + input_b = np.random.randint(1, 10, [K, N]).astype(np.float16) + input_bias = np.random.randint(1, 10, [N]).astype(np.float32) + alpha = 0.001 + golden = (np.matmul(input_a.astype(np.float32), input_b.astype(np.float32)) + input_bias).astype(np.float32) + golden = np.where(golden >= 0, golden, golden * alpha) + os.system("mkdir -p input") + os.system("mkdir -p output") + input_a.tofile("./input/x1_gm.bin") + input_b.tofile("./input/x2_gm.bin") + input_bias.tofile("./input/bias.bin") + golden.tofile("./output/golden.bin") + + +if __name__ == "__main__": + gen_golden_data() diff --git a/operator/MatmulLeakyReluCustomSample/KernelLaunch/MatmulLeakyReluInvocationAcl/scripts/verify_result.py b/operator/MatmulLeakyReluCustomSample/KernelLaunch/MatmulLeakyReluInvocationAcl/scripts/verify_result.py new file mode 100644 index 0000000000000000000000000000000000000000..a325cfcc61f46f7de04ef6c979f852adcf213ef3 --- /dev/null +++ b/operator/MatmulLeakyReluCustomSample/KernelLaunch/MatmulLeakyReluInvocationAcl/scripts/verify_result.py @@ -0,0 +1,53 @@ +#!/usr/bin/python3 +# coding=utf-8 +# +# Copyright (C) 2023-2024. Huawei Technologies Co., Ltd. All rights reserved. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. +# =============================================================================== + +import sys +import numpy as np + +# for float32 +relative_tol = 1e-6 +absolute_tol = 1e-9 +error_tol = 1e-4 + + +def verify_result(output, golden): + output = np.fromfile(output, dtype=np.float32).reshape(-1) + golden = np.fromfile(golden, dtype=np.float32).reshape(-1) + different_element_results = np.isclose(output, + golden, + rtol=relative_tol, + atol=absolute_tol, + equal_nan=True) + different_element_indexes = np.where(different_element_results == False)[0] + for index in range(len(different_element_indexes)): + real_index = different_element_indexes[index] + golden_data = golden[real_index] + output_data = output[real_index] + print( + "data index: %06d, expected: %-.9f, actual: %-.9f, rdiff: %-.6f" % + (real_index, golden_data, output_data, + abs(output_data - golden_data) / golden_data)) + if index == 100: + break + error_ratio = float(different_element_indexes.size) / golden.size + print("error ratio: %.4f, tolrence: %.4f" % (error_ratio, error_tol)) + return error_ratio <= error_tol + + +if __name__ == '__main__': + try: + res = verify_result(sys.argv[1], sys.argv[2]) + if not res: + raise ValueError("[ERROR] result error") + else: + print("test pass") + except Exception as e: + print(e) + sys.exit(1)
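run.sh invokes the script above as `python3 scripts/verify_result.py output/output.bin output/golden.bin`. Note that the pass criterion is not element-exact: elements are compared with np.isclose (rtol 1e-6, atol 1e-9), and the run passes as long as at most an error_tol fraction (1e-4) of elements misses that tolerance. A minimal standalone illustration of the same rule:

```python
# Same pass/fail rule as verify_result.py, applied to synthetic data.
import numpy as np

golden = np.linspace(1.0, 100.0, 10000, dtype=np.float32)
output = golden.copy()
output[0] += 1.0                     # one deliberately wrong element

mismatch = ~np.isclose(output, golden, rtol=1e-6, atol=1e-9, equal_nan=True)
error_ratio = mismatch.sum() / golden.size
print("error ratio:", error_ratio, "pass:", error_ratio <= 1e-4)  # 1/10000 == 1e-4 -> pass
```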