diff --git a/operator/ascendc/2_features/2_tbufpool/CMakeLists.txt b/operator/ascendc/2_features/2_tbufpool/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..cba0e5e4111871e9a7bff672503cafbba3ed67ae --- /dev/null +++ b/operator/ascendc/2_features/2_tbufpool/CMakeLists.txt @@ -0,0 +1,76 @@ +# Copyright (c) 2025 Huawei Technologies Co., Ltd. +# ====================================================================================================================== + +cmake_minimum_required(VERSION 3.16) +project(Ascend_c) +if(${RUN_MODE}) + set(RUN_MODE "npu" CACHE STRING "cpu/sim/npu") +endif() +if (${SOC_VERSION}) + set(SOC_VERSION "Ascend910" CACHE STRING "system on chip type") +endif() + +set(ASCEND_CANN_PACKAGE_PATH "~/Ascend/ascend-toolkit/latest" CACHE STRING "ASCEND CANN package installation directory") + +if(NOT CMAKE_BUILD_TYPE) + set(CMAKE_BUILD_TYPE "Debug" CACHE STRING "Build type Release/Debug (default Debug)" FORCE) +endif() + +if(CMAKE_INSTALL_PREFIX STREQUAL /usr/local) + set(CMAKE_INSTALL_PREFIX "${CMAKE_CURRENT_LIST_DIR}/out" CACHE STRING "path for install()" FORCE) +endif() + +file(GLOB KERNEL_FILES + ${CMAKE_CURRENT_SOURCE_DIR}/op_kernel/tbufpool_custom.cpp +) +set(CUSTOM_ASCEND310P_LIST "Ascend310P1" "Ascend310P3") + +if("${RUN_MODE}" STREQUAL "cpu") + include(cmake/cpu_lib.cmake) +elseif("${RUN_MODE}" STREQUAL "sim" OR "${RUN_MODE}" STREQUAL "npu") + include(cmake/npu_lib.cmake) +else() + message("invalid RUN_MODE: ${RUN_MODE}") +endif() + +add_executable(tbufpool_direct_kernel_op + ${CMAKE_CURRENT_SOURCE_DIR}/main.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/op_host/tbufpool_custom_tiling.cpp +) + +target_compile_options(tbufpool_direct_kernel_op PRIVATE + $<BUILD_INTERFACE:$<$<STREQUAL:${RUN_MODE},cpu>:-g>> + -O2 + -std=c++17 + -D_GLIBCXX_USE_CXX11_ABI=0 +) + +target_compile_definitions(tbufpool_direct_kernel_op PRIVATE + $<$<BOOL:$<IN_LIST:${SOC_VERSION},${CUSTOM_ASCEND310P_LIST}>>:CUSTOM_ASCEND310P> +) + +target_include_directories(tbufpool_direct_kernel_op PRIVATE + ${CMAKE_CURRENT_SOURCE_DIR} + 
$<BUILD_INTERFACE:$<$<STREQUAL:${RUN_MODE},cpu>:${ASCEND_CANN_PACKAGE_PATH}/include>> + $<BUILD_INTERFACE:$<$<STREQUAL:${RUN_MODE},npu>:${ASCEND_CANN_PACKAGE_PATH}/runtime/include>> +) + +target_link_libraries(tbufpool_direct_kernel_op PRIVATE + $<BUILD_INTERFACE:$<$<OR:$<STREQUAL:${RUN_MODE},npu>,$<STREQUAL:${RUN_MODE},sim>>:host_intf_pub>> + $<BUILD_INTERFACE:$<$<STREQUAL:${RUN_MODE},cpu>:tikicpulib::${SOC_VERSION}>> + $<BUILD_INTERFACE:$<$<STREQUAL:${RUN_MODE},npu>:ascendcl>> + $<BUILD_INTERFACE:$<$<STREQUAL:${RUN_MODE},npu>:c_sec>> + ascendc_kernels_${RUN_MODE} + tiling_api + register + platform + ascendalog + dl + graph_base +) + +install(TARGETS tbufpool_direct_kernel_op + LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} + ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} + RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} +) \ No newline at end of file diff --git a/operator/ascendc/2_features/2_tbufpool/README.md b/operator/ascendc/2_features/2_tbufpool/README.md index 5af80e6c5fee5de557b56d928f5e64a5143844a0..c7e17b6f8c7b8c20c8b799f0df7e8dde88607511 100644 --- a/operator/ascendc/2_features/2_tbufpool/README.md +++ b/operator/ascendc/2_features/2_tbufpool/README.md @@ -1 +1,82 @@ -tbufpool(待补充) \ No newline at end of file + +## 目录结构介绍 +``` +├── 22_tbufpool_kernellaunch +│ ├── cmake // 编译工程文件 +│ ├── op_host // 本样例tiling代码实现 +│ ├── op_kernel //本样例kernel侧代码实现 +│ ├── scripts +│ │ ├── gen_data.py // 输入数据和真值数据生成脚本 +│ ├── CMakeLists.txt // 编译工程文件 +│ ├── data_utils.h // 数据读入写出函数 +│ ├── main.cpp // 主函数,调用算子的应用程序,含CPU域及NPU域调用 +│ └── run.sh // 编译运行算子的脚本 +``` +## 代码实现介绍 +数据量较大且内存有限时,无法一次完成所有数据搬运,需要拆分成多个阶段计算,每次计算使用其中的一部分数据,可以通过TBufPool资源池进行内存地址复用。本例中,通过调用InitBufPool基础API对Add算子, Sub算子和Mul算子实现过程进行内存管理。从Tpipe划分出资源池tbufPool0,tbufPool0为src0Gm分配空间后,继续分配了资源池tbufPool1,指定tbufPool1与tbufPool2复用并分别运用于第一、二轮计算,此时tbufPool1及tbufPool2共享起始地址及长度。 + +- kernel实现 + Add算子的数学表达式为: + ``` + z = x + y + ``` + Sub算子的数学表达式为: + ``` + z = x - y + ``` + 计算逻辑是:Ascend C提供的矢量计算接口的操作元素都为LocalTensor,首先启用tbufpool1,将部分输入数据src0Gm,部分输入数据src1Gm搬运进片上储存,调用计算接口完成相加计算,搬出到外部存储上。之后切换到tbufpool2进行相减、相乘计算。完成后切换回tbufpool1完成剩余数据相加计算,得到最终相加结果,再切换到tbufpool2完成剩余数据相减计算,得到最终结果,再搬出到外部存储上。 + + Add算子的实现流程分为6个基本任务:CopyIn,Compute,CopyOut,CopyIn1,Compute1,CopyOut1。 + - CopyIn任务负责将Global Memory上的部分输入Tensor src0Gm和src1Gm搬运到Local 
Memory,分别存储在src0Local、src1Local; + - Compute任务负责对src0Local、src1Local执行加法操作,计算结果存储在dstLocal中; + - CopyOut任务负责将输出数据从dstLocal搬运至Global Memory上的输出Tensor dstGm0中。 + - CopyIn1任务负责将Global Memory上的剩余输入Tensor src0Gm和src1Gm搬运到Local Memory,分别存储在src0Local、src1Local; + - Compute1任务负责对src0Local、src1Local执行减法、乘法操作,计算结果存储在dstLocal中; + - CopyOut1任务负责将输出数据从dstLocal搬运至Global Memory上的输出Tensor dstGm1中。 + +- 调用实现 + 1. CPU侧运行验证主要通过ICPU_RUN_KF CPU调测宏等CPU调测库提供的接口来完成; + 2. NPU侧运行验证主要通过使用ACLRT_LAUNCH_KERNEL内核调用宏来完成。 + + 应用程序通过ASCENDC_CPU_DEBUG 宏区分代码逻辑运行于CPU侧还是NPU侧。 + +## 运行样例算子 + - 打开样例目录 + 以命令行方式下载样例代码,master分支为例。 + ```bash + cd ${git_clone_path}/samples/operator/ascendc/0_introduction/22_tbufpool_kernellaunch + ``` + - 配置环境变量 + + 请根据当前环境上CANN开发套件包的[安装方式](https://hiascend.com/document/redirect/CannCommunityInstSoftware),选择对应配置环境变量的命令。 + - 默认路径,root用户安装CANN软件包 + ```bash + export ASCEND_INSTALL_PATH=/usr/local/Ascend/ascend-toolkit/latest + ``` + - 默认路径,非root用户安装CANN软件包 + ```bash + export ASCEND_INSTALL_PATH=$HOME/Ascend/ascend-toolkit/latest + ``` + - 指定路径install_path,安装CANN软件包 + ```bash + export ASCEND_INSTALL_PATH=${install_path}/ascend-toolkit/latest + ``` + + 配置仿真模式日志文件目录,默认为sim_log。 + ```bash + export CAMODEL_LOG_PATH=./sim_log + ``` + + - 样例执行 + + ```bash + bash run.sh -r [RUN_MODE] -v [SOC_VERSION] + ``` + - RUN_MODE:编译方式,可选择CPU调试,NPU仿真,NPU上板。支持参数为[cpu / sim / npu]。 + - SOC_VERSION:昇腾AI处理器型号,如果无法确定具体的[SOC_VERSION],则在安装昇腾AI处理器的服务器执行npu-smi info命令进行查询,在查询到的“Name”前增加Ascend信息,例如“Name”对应取值为xxxyy,实际配置的[SOC_VERSION]值为Ascendxxxyy。支持以下产品型号: + - Atlas A2训练系列产品/Atlas 800I A2推理产品 + + 示例如下,Ascendxxxyy请替换为实际的AI处理器型号。 + ```bash + bash run.sh -r cpu -v Ascendxxxyy + ``` \ No newline at end of file diff --git a/operator/ascendc/2_features/2_tbufpool/cmake/cpu_lib.cmake b/operator/ascendc/2_features/2_tbufpool/cmake/cpu_lib.cmake new file mode 100644 index 0000000000000000000000000000000000000000..693f15ac115d655aacd3218bc5b14060c0a3de2f --- /dev/null +++ 
b/operator/ascendc/2_features/2_tbufpool/cmake/cpu_lib.cmake @@ -0,0 +1,26 @@ +if(NOT DEFINED ENV{CMAKE_PREFIX_PATH}) + set(CMAKE_PREFIX_PATH ${ASCEND_CANN_PACKAGE_PATH}/tools/tikicpulib/lib/cmake) +endif() +find_package(tikicpulib REQUIRED) + +add_library(ascendc_kernels_${RUN_MODE} SHARED + ${KERNEL_FILES} +) + +target_link_libraries(ascendc_kernels_${RUN_MODE} PRIVATE + tikicpulib::${SOC_VERSION} +) + +target_compile_definitions(ascendc_kernels_${RUN_MODE} PRIVATE + $<$<BOOL:$<IN_LIST:${SOC_VERSION},${CUSTOM_ASCEND310P_LIST}>>:CUSTOM_ASCEND310P> +) + +target_compile_options(ascendc_kernels_${RUN_MODE} PRIVATE + -g + -O0 + -std=c++17 +) + +install(TARGETS ascendc_kernels_${RUN_MODE} +DESTINATION ${CMAKE_INSTALL_LIBDIR} +) \ No newline at end of file diff --git a/operator/ascendc/2_features/2_tbufpool/cmake/npu_lib.cmake b/operator/ascendc/2_features/2_tbufpool/cmake/npu_lib.cmake new file mode 100644 index 0000000000000000000000000000000000000000..8ad136f38b80bea109925ab797fbde0871874964 --- /dev/null +++ b/operator/ascendc/2_features/2_tbufpool/cmake/npu_lib.cmake @@ -0,0 +1,12 @@ +if(EXISTS ${ASCEND_CANN_PACKAGE_PATH}/compiler/tikcpp/ascendc_kernel_cmake) + set(ASCENDC_CMAKE_DIR ${ASCEND_CANN_PACKAGE_PATH}/compiler/tikcpp/ascendc_kernel_cmake) +elseif(EXISTS ${ASCEND_CANN_PACKAGE_PATH}/tools/tikcpp/ascendc_kernel_cmake) + set(ASCENDC_CMAKE_DIR ${ASCEND_CANN_PACKAGE_PATH}/tools/tikcpp/ascendc_kernel_cmake) +else() + message(FATAL_ERROR "ascendc_kernel_cmake does not exist ,please check whether the cann package is installed") +endif() +include(${ASCENDC_CMAKE_DIR}/ascendc.cmake) + +ascendc_library(ascendc_kernels_${RUN_MODE} STATIC + ${KERNEL_FILES} +) \ No newline at end of file diff --git a/operator/ascendc/2_features/2_tbufpool/data_utils.h b/operator/ascendc/2_features/2_tbufpool/data_utils.h new file mode 100644 index 0000000000000000000000000000000000000000..7980ae3412c48bc1b20225639a9084cda68a525b --- /dev/null +++ b/operator/ascendc/2_features/2_tbufpool/data_utils.h @@ -0,0 +1,211 @@ +/** + * Copyright (c) 
2025 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ +#ifndef DATA_UTILS_H +#define DATA_UTILS_H +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifndef ASCENDC_CPU_DEBUG +#include "acl/acl.h" +#endif + +typedef enum { + DT_UNDEFINED = -1, + FLOAT = 0, + HALF = 1, + INT8_T = 2, + INT32_T = 3, + UINT8_T = 4, + INT16_T = 6, + UINT16_T = 7, + UINT32_T = 8, + INT64_T = 9, + UINT64_T = 10, + DOUBLE = 11, + BOOL = 12, + STRING = 13, + COMPLEX64 = 16, + COMPLEX128 = 17, + BF16 = 27 +} printDataType; + +#define INFO_LOG(fmt, args...) fprintf(stdout, "[INFO] " fmt "\n", ##args) +#define WARN_LOG(fmt, args...) fprintf(stdout, "[WARN] " fmt "\n", ##args) +#define ERROR_LOG(fmt, args...) 
fprintf(stdout, "[ERROR] " fmt "\n", ##args) + +#ifndef ASCENDC_CPU_DEBUG +#define CHECK_ACL(x) \ + do { \ + aclError __ret = x; \ + if (__ret != ACL_ERROR_NONE) { \ + std::cerr << __FILE__ << ":" << __LINE__ << " aclError:" << __ret << std::endl; \ + } \ + } while (0); +#endif + +/** +* @brief Read data from file +* @param [in] filePath: file path +* @param [out] fileSize: file size +* @return read result +*/ +bool ReadFile(const std::string &filePath, size_t &fileSize, void *buffer, size_t bufferSize) +{ + struct stat sBuf; + int fileStatus = stat(filePath.data(), &sBuf); + if (fileStatus == -1) { + ERROR_LOG("failed to get file"); + return false; + } + if (S_ISREG(sBuf.st_mode) == 0) { + ERROR_LOG("%s is not a file, please enter a file", filePath.c_str()); + return false; + } + + std::ifstream file; + file.open(filePath, std::ios::binary); + if (!file.is_open()) { + ERROR_LOG("Open file failed. path = %s", filePath.c_str()); + return false; + } + + std::filebuf *buf = file.rdbuf(); + size_t size = buf->pubseekoff(0, std::ios::end, std::ios::in); + if (size == 0) { + ERROR_LOG("file size is 0"); + file.close(); + return false; + } + if (size > bufferSize) { + ERROR_LOG("file size is larger than buffer size"); + file.close(); + return false; + } + buf->pubseekpos(0, std::ios::in); + buf->sgetn(static_cast(buffer), size); + fileSize = size; + file.close(); + return true; +} + +/** +* @brief Write data to file +* @param [in] filePath: file path +* @param [in] buffer: data to write to file +* @param [in] size: size to write +* @return write result +*/ +bool WriteFile(const std::string &filePath, const void *buffer, size_t size) +{ + if (buffer == nullptr) { + ERROR_LOG("Write file failed. buffer is nullptr"); + return false; + } + + int fd = open(filePath.c_str(), O_RDWR | O_CREAT | O_TRUNC, S_IRUSR | S_IWRITE); + if (fd < 0) { + ERROR_LOG("Open file failed. 
path = %s", filePath.c_str()); + return false; + } + + auto writeSize = write(fd, buffer, size); + (void) close(fd); + if (writeSize != size) { + ERROR_LOG("Write file Failed."); + return false; + } + + return true; +} + +template +void DoPrintData(const T *data, size_t count, size_t elementsPerRow) +{ + assert(elementsPerRow != 0); + for (size_t i = 0; i < count; ++i) { + std::cout << std::setw(10) << data[i]; + if (i % elementsPerRow == elementsPerRow - 1) { + std::cout << std::endl; + } + } +} + +#ifndef ASCENDC_CPU_DEBUG +void DoPrintHalfData(const aclFloat16 *data, size_t count, size_t elementsPerRow) +{ + assert(elementsPerRow != 0); + for (size_t i = 0; i < count; ++i) { + std::cout << std::setw(10) << std::setprecision(6) << aclFloat16ToFloat(data[i]); + if (i % elementsPerRow == elementsPerRow - 1) { + std::cout << std::endl; + } + } +} +#endif + +void PrintData(const void *data, size_t count, printDataType dataType, size_t elementsPerRow=16) +{ + if (data == nullptr) { + ERROR_LOG("Print data failed. 
data is nullptr"); + return; + } + + switch (dataType) { + case BOOL: + DoPrintData(reinterpret_cast<const bool *>(data), count, elementsPerRow); + break; + case INT8_T: + DoPrintData(reinterpret_cast<const int8_t *>(data), count, elementsPerRow); + break; + case UINT8_T: + DoPrintData(reinterpret_cast<const uint8_t *>(data), count, elementsPerRow); + break; + case INT16_T: + DoPrintData(reinterpret_cast<const int16_t *>(data), count, elementsPerRow); + break; + case UINT16_T: + DoPrintData(reinterpret_cast<const uint16_t *>(data), count, elementsPerRow); + break; + case INT32_T: + DoPrintData(reinterpret_cast<const int32_t *>(data), count, elementsPerRow); + break; + case UINT32_T: + DoPrintData(reinterpret_cast<const uint32_t *>(data), count, elementsPerRow); + break; + case INT64_T: + DoPrintData(reinterpret_cast<const int64_t *>(data), count, elementsPerRow); + break; + case UINT64_T: + DoPrintData(reinterpret_cast<const uint64_t *>(data), count, elementsPerRow); + break; +#ifndef ASCENDC_CPU_DEBUG + case HALF: + DoPrintHalfData(reinterpret_cast<const aclFloat16 *>(data), count, elementsPerRow); + break; +#endif + case FLOAT: + DoPrintData(reinterpret_cast<const float *>(data), count, elementsPerRow); + break; + case DOUBLE: + DoPrintData(reinterpret_cast<const double *>(data), count, elementsPerRow); + break; + default: + ERROR_LOG("Unsupported type: %d", dataType); + } + std::cout << std::endl; +} +#endif // EXAMPLES_COMMON_DATA_UTILS_H diff --git a/operator/ascendc/2_features/2_tbufpool/main.cpp b/operator/ascendc/2_features/2_tbufpool/main.cpp new file mode 100644 index 0000000000000000000000000000000000000000..a506e7c8df844f46685b9beb0f44a80f1ba8d743 --- /dev/null +++ b/operator/ascendc/2_features/2_tbufpool/main.cpp @@ -0,0 +1,231 @@ +/* + * Copyright (c) 2025 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. 
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ + +#include "data_utils.h" +#include "./op_host/tbufpool_custom_tiling.h" +#ifndef ASCENDC_CPU_DEBUG +#include "acl/acl.h" +#include "aclrtlaunch_tbufpool_custom.h" +#include "tiling/platform/platform_ascendc.h" +#else +#include "tikicpulib.h" +extern "C" __global__ __aicore__ void tbufpool_custom(GM_ADDR x, GM_ADDR y, GM_ADDR zAdd, GM_ADDR zSubMul, TbufPoolTilingData tiling); +#endif + +namespace { +constexpr uint32_t USED_CORE_NUM = 1; +constexpr uint32_t TOTAL_LENGTH = 2048; +constexpr uint32_t DST_LENGTH = 1024; +constexpr uint32_t TILING_SIZE = 1; +} + +extern void GenerateTilingData(const uint32_t totalLength, uint8_t *tilingBuf); + +static bool CompareResultAdd(const void *outputData, int64_t outSize) { + void *goldenData; +#ifdef ASCENDC_CPU_DEBUG + goldenData = (uint8_t *)AscendC::GmAlloc(outSize); +#else + CHECK_ACL(aclrtMallocHost((void **)(&goldenData), outSize)); +#endif + size_t goldenSize = outSize; + bool ret = ReadFile("../output/golden_add.bin", goldenSize, goldenData, goldenSize); + if (ret) { + printf("ReadFile golden_add.bin success!\n"); + } else { + printf("test failed!\n"); + return false; + } + constexpr float EPS = 1e-4; + int64_t wrongNum = 0; + + for (int i = 0; i < outSize / sizeof(float); i++) { + float a = (reinterpret_cast(outputData))[i]; + float b = (reinterpret_cast(goldenData))[i]; + float ae = std::abs(a - b); + float re = ae / abs(b); + if (ae > EPS && re > EPS) { + printf("CompareResultAdd failed output is %lf, golden is %lf\n", a, b); + wrongNum++; + } + } +#ifdef ASCENDC_CPU_DEBUG + AscendC::GmFree((void *)goldenData); +#else + CHECK_ACL(aclrtFreeHost(goldenData)); +#endif + if (wrongNum != 0) { + return false; + } else { 
+ printf("CompareResultAdd golden_add.bin success!\n"); + return true; + } +} + +static bool CompareResultSubMul(const void *outputData, int64_t outSize) { + void *goldenData; +#ifdef ASCENDC_CPU_DEBUG + goldenData = (uint8_t *)AscendC::GmAlloc(outSize); +#else + CHECK_ACL(aclrtMallocHost((void **)(&goldenData), outSize)); +#endif + size_t goldenSize = outSize; + bool ret = ReadFile("../output/golden_sub_mul.bin", goldenSize, goldenData, goldenSize); + if (ret) { + printf("ReadFile golden_sub_mul.bin success!\n"); + } else { + printf("test failed!\n"); + return false; + } + constexpr float EPS = 1e-4; + int64_t wrongNum = 0; + + for (int i = 0; i < outSize / sizeof(float); i++) { + float a = (reinterpret_cast(outputData))[i]; + float b = (reinterpret_cast(goldenData))[i]; + float ae = std::abs(a - b); + float re = ae / abs(b); + if (ae > EPS && re > EPS) { + printf("CompareResultSubMul failed output is %lf, golden is %lf\n", a, b); + wrongNum++; + } + } +#ifdef ASCENDC_CPU_DEBUG + AscendC::GmFree((void *)goldenData); +#else + CHECK_ACL(aclrtFreeHost(goldenData)); +#endif + if (wrongNum != 0) { + return false; + } else { + printf("CompareResultSubMul golden_sub_mul.bin success!\n"); + return true; + } +} + +int32_t main(int32_t argc, char *argv[]) { + size_t tilingSize = TILING_SIZE * sizeof(uint32_t); + size_t inputSize = TOTAL_LENGTH * sizeof(float); + size_t outputSizeAdd = inputSize; + size_t outputSizeSubMul = DST_LENGTH * sizeof(float); + + +#ifdef ASCENDC_CPU_DEBUG + uint8_t *x = (uint8_t *)AscendC::GmAlloc(inputSize); + uint8_t *y = (uint8_t *)AscendC::GmAlloc(inputSize); + uint8_t *zAdd = (uint8_t *)AscendC::GmAlloc(outputSizeAdd); + uint8_t *zSubMul = (uint8_t *)AscendC::GmAlloc(outputSizeSubMul); + uint8_t *tiling = (uint8_t *)AscendC::GmAlloc(tilingSize); + + ReadFile("../input/input_x.bin", inputSize, x, inputSize); + ReadFile("../input/input_y.bin", inputSize, y, inputSize); + + GenerateTilingData(TOTAL_LENGTH, tiling); + + 
AscendC::SetKernelMode(KernelMode::AIV_MODE); // run in aiv mode + + ICPU_RUN_KF(tbufpool_custom, USED_CORE_NUM, x, y, zAdd, zSubMul, *reinterpret_cast(tiling)); // use this macro for cpu debug + + WriteFile("../output/output_add.bin", zAdd, outputSizeAdd); + WriteFile("../output/output_sub_mul.bin", zSubMul, outputSizeSubMul); + + + bool goldenResultAdd = true; + goldenResultAdd = CompareResultAdd(zAdd, outputSizeAdd); + bool goldenResultSubMul = true; + goldenResultSubMul = CompareResultSubMul(zSubMul, outputSizeSubMul); + + AscendC::GmFree((void *)x); + AscendC::GmFree((void *)y); + AscendC::GmFree((void *)zAdd); + AscendC::GmFree((void *)zSubMul); + AscendC::GmFree((void *)tiling); +#else + CHECK_ACL(aclInit(nullptr)); + int32_t deviceId = 0; + CHECK_ACL(aclrtSetDevice(deviceId)); + aclrtStream stream = nullptr; + CHECK_ACL(aclrtCreateStream(&stream)); + + uint8_t *xHost; + uint8_t *yHost; + uint8_t *zHostAdd; + uint8_t *zHostSubMul; + uint8_t *tiling; + uint8_t *xDevice; + uint8_t *yDevice; + uint8_t *zDeviceAdd; + uint8_t *zDeviceSubMul; + + CHECK_ACL(aclrtMallocHost((void **)(&xHost), inputSize)); + CHECK_ACL(aclrtMallocHost((void **)(&yHost), inputSize)); + CHECK_ACL(aclrtMallocHost((void **)(&zHostAdd), outputSizeAdd)); + CHECK_ACL(aclrtMallocHost((void **)(&zHostSubMul), outputSizeSubMul)); + CHECK_ACL(aclrtMallocHost((void **)(&tiling), tilingSize)); + + CHECK_ACL(aclrtMalloc((void **)&xDevice, inputSize, ACL_MEM_MALLOC_HUGE_FIRST)); + CHECK_ACL(aclrtMalloc((void **)&yDevice, inputSize, ACL_MEM_MALLOC_HUGE_FIRST)); + CHECK_ACL(aclrtMalloc((void **)&zDeviceAdd, outputSizeAdd, ACL_MEM_MALLOC_HUGE_FIRST)); + CHECK_ACL(aclrtMalloc((void **)&zDeviceSubMul, outputSizeSubMul, ACL_MEM_MALLOC_HUGE_FIRST)); + + ReadFile("../input/input_x.bin", inputSize, xHost, inputSize); + ReadFile("../input/input_y.bin", inputSize, yHost, inputSize); + + GenerateTilingData(TOTAL_LENGTH, tiling); + + // Copy host memory to device memory + CHECK_ACL(aclrtMemcpy(xDevice, 
inputSize, xHost, inputSize, ACL_MEMCPY_HOST_TO_DEVICE)); + CHECK_ACL(aclrtMemcpy(yDevice, inputSize, yHost, inputSize, ACL_MEMCPY_HOST_TO_DEVICE)); + + // Execute the kernel + ACLRT_LAUNCH_KERNEL(tbufpool_custom) + (USED_CORE_NUM, stream, xDevice, yDevice, zDeviceAdd, zDeviceSubMul, reinterpret_cast(tiling)); + + // Wait for the stop event to complete + CHECK_ACL(aclrtSynchronizeStream(stream)); + + // Copy result to host memory and write to output file + CHECK_ACL(aclrtMemcpy(zHostAdd, outputSizeAdd, zDeviceAdd, outputSizeAdd, ACL_MEMCPY_DEVICE_TO_HOST)); + WriteFile("../output/output_add.bin", zHostAdd, outputSizeAdd); + CHECK_ACL(aclrtMemcpy(zHostSubMul, outputSizeSubMul, zDeviceSubMul, outputSizeSubMul, ACL_MEMCPY_DEVICE_TO_HOST)); + WriteFile("../output/output_sub_mul.bin", zHostSubMul, outputSizeSubMul); + + // Compare the result with the golden result + bool goldenResultAdd = true; + goldenResultAdd = CompareResultAdd(zHostAdd, outputSizeAdd); + bool goldenResultSubMul = true; + goldenResultSubMul = CompareResultSubMul(zHostSubMul, outputSizeSubMul); + + // Clean up memory + CHECK_ACL(aclrtFree(xDevice)); + CHECK_ACL(aclrtFree(yDevice)); + CHECK_ACL(aclrtFree(zDeviceAdd)); + CHECK_ACL(aclrtFree(zDeviceSubMul)); + + CHECK_ACL(aclrtFreeHost(xHost)); + CHECK_ACL(aclrtFreeHost(yHost)); + CHECK_ACL(aclrtFreeHost(zHostAdd)); + CHECK_ACL(aclrtFreeHost(zHostSubMul)); + + CHECK_ACL(aclrtFreeHost(tiling)); + + CHECK_ACL(aclrtDestroyStream(stream)); + CHECK_ACL(aclrtResetDevice(deviceId)); + CHECK_ACL(aclFinalize()); +#endif + + if (goldenResultAdd && goldenResultSubMul) { + printf("test pass!\n"); + } else { + printf("test failed!\n"); + } + return 0; +} + \ No newline at end of file diff --git a/operator/ascendc/2_features/2_tbufpool/op_host/tbufpool_custom_tiling.cpp b/operator/ascendc/2_features/2_tbufpool/op_host/tbufpool_custom_tiling.cpp new file mode 100644 index 0000000000000000000000000000000000000000..0bc2f1c1da006abde8a531f7de93d30f8f95b182 --- /dev/null 
+++ b/operator/ascendc/2_features/2_tbufpool/op_host/tbufpool_custom_tiling.cpp @@ -0,0 +1,19 @@ +/** + * @file tbufpool_custom_tiling.cpp + * + * Copyright (C) 2024-2025. Huawei Technologies Co., Ltd. All rights reserved. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + */ + +#include "tiling/tiling_api.h" +#include "tbufpool_custom_tiling.h" + + +void GenerateTilingData(uint32_t totalLength, uint8_t* tilingBuf) +{ + TbufPoolTilingData *tiling = reinterpret_cast(tilingBuf); + tiling->totalLength = totalLength; +} \ No newline at end of file diff --git a/operator/ascendc/2_features/2_tbufpool/op_host/tbufpool_custom_tiling.h b/operator/ascendc/2_features/2_tbufpool/op_host/tbufpool_custom_tiling.h new file mode 100644 index 0000000000000000000000000000000000000000..63c60d78caa39d8714ef3a77fd08c6e3e61d8b69 --- /dev/null +++ b/operator/ascendc/2_features/2_tbufpool/op_host/tbufpool_custom_tiling.h @@ -0,0 +1,18 @@ +/** + * @file tbufpool_custom_tiling.h + * + * Copyright (C) 2024-2025. Huawei Technologies Co., Ltd. All rights reserved. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
+ */ + +#ifndef EXAMPLES_ACTIVATION_TBUFPOOL_CUSTOM_TILING_H +#define EXAMPLES_ACTIVATION_TBUFPOOL_CUSTOM_TILING_H +#include + +struct TbufPoolTilingData { + uint32_t totalLength; +}; +#endif diff --git a/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.cpp b/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.cpp new file mode 100644 index 0000000000000000000000000000000000000000..cf9437871c0f52fb56d0ce64a9989dd62aced16b --- /dev/null +++ b/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.cpp @@ -0,0 +1,21 @@ +/** + * @file tbufpool_custom.cpp + * + * Copyright (C) 2024-2025. Huawei Technologies Co., Ltd. All rights reserved. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + */ + +#include "./tbufpool_custom.h" +#include "kernel_operator.h" + +extern "C" __global__ __aicore__ void tbufpool_custom(GM_ADDR src0Gm, GM_ADDR src1Gm, GM_ADDR dstGm0, GM_ADDR dstGm1, TbufPoolTilingData tiling) +{ + AscendC::TPipe pipe; + MyCustomKernel::TbufPoolImpl op; + op.Init(src0Gm, src1Gm, dstGm0, dstGm1, tiling, &pipe); + KERNEL_TASK_TYPE_DEFAULT(KERNEL_TYPE_AIV_ONLY); + op.Process(); +} \ No newline at end of file diff --git a/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h b/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h new file mode 100644 index 0000000000000000000000000000000000000000..7a24a2f1100b2aad52000b7978a7ca807b2524a1 --- /dev/null +++ b/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h @@ -0,0 +1,157 @@ +/** + * @file tbufpool_custom.h + * + * Copyright (C) 2024-2025. Huawei Technologies Co., Ltd. All rights reserved. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
+ */ + +#ifndef EXAMPLES_ACTIVATION_INITBUFPOOL_CUSTOM_H +#define EXAMPLES_ACTIVATION_INITBUFPOOL_CUSTOM_H +#include "../op_host/tbufpool_custom_tiling.h" +#include "kernel_operator.h" + + +namespace MyCustomKernel { +constexpr int32_t BUFFER_NUM = 1; +constexpr int32_t BUFFER_NUM_T1 = 1; +constexpr int32_t BUFFER_NUM_T2 = 1; +constexpr int32_t BUFFER_LENGTH = 4096*sizeof(float); +constexpr int32_t BUFF_POOL_LENGTH = 2048*sizeof(float); +constexpr int32_t INIT_TENSOR_LENGTH = 1024*sizeof(float); +constexpr int32_t INIT_TENSOR_LENGTH_HALF = 512*sizeof(float); +constexpr int32_t SPLIT_NUM = 2; +constexpr int32_t TOTL_NUM = 2048; +constexpr int32_t COMPUTE_LENGTH = 1024; +constexpr int32_t COMPUTE_LENGTH_HALF = 512; + +class TbufPoolImpl { + public: + __aicore__ inline TbufPoolImpl() {} + __aicore__ inline void Init(__gm__ uint8_t* src0Gm, __gm__ uint8_t* src1Gm, __gm__ uint8_t* dstGm0, + __gm__ uint8_t* dstGm1, TbufPoolTilingData tiling, AscendC::TPipe* pipeIn) + { + pipe = pipeIn; + totalLength = tiling.totalLength; + src0Global.SetGlobalBuffer((__gm__ float*)src0Gm);// input 0 + src1Global.SetGlobalBuffer((__gm__ float*)src1Gm);// input 1 + dstGlobal0.SetGlobalBuffer((__gm__ float*)dstGm0);// output 0 + dstGlobal1.SetGlobalBuffer((__gm__ float*)dstGm1);// output 1 + + pipe->InitBufPool(tbufPool0, BUFFER_LENGTH); + tbufPool0.InitBuffer(srcQue0, BUFFER_NUM, BUFF_POOL_LENGTH); // Total src0 + tbufPool0.InitBufPool(tbufPool1, BUFF_POOL_LENGTH); + tbufPool0.InitBufPool(tbufPool2, BUFF_POOL_LENGTH, tbufPool1); + tbufPool1.InitBuffer(srcQue1, BUFFER_NUM_T1, INIT_TENSOR_LENGTH); + tbufPool1.InitBuffer(dstQue1, BUFFER_NUM_T1, INIT_TENSOR_LENGTH); + tbufPool2.InitBuffer(srcQue2, BUFFER_NUM_T2, INIT_TENSOR_LENGTH_HALF); + tbufPool2.InitBuffer(dstQue2, BUFFER_NUM_T2, INIT_TENSOR_LENGTH_HALF); + tbufPool2.InitBuffer(srcQue3, BUFFER_NUM_T2, INIT_TENSOR_LENGTH_HALF); + tbufPool2.InitBuffer(tmp, INIT_TENSOR_LENGTH_HALF); + } + __aicore__ inline void Process() + { + for (int32_t i 
= 0; i < SPLIT_NUM; i++) + { + //stage 1 + CopyIn(i); + Compute(i); + CopyOut(i); + tbufPool1.Reset(); + //stage 2 + CopyIn1(i); + Compute1(i); + CopyOut1(i); + tbufPool2.Reset(); + } + tbufPool0.Reset(); + } + private: + __aicore__ inline void CopyIn(int32_t progress) + { + AscendC::LocalTensor src0Local = srcQue0.AllocTensor(); + AscendC::LocalTensor src1Local = srcQue1.AllocTensor(); + AscendC::DataCopy(src0Local, src0Global[progress * COMPUTE_LENGTH], COMPUTE_LENGTH); + AscendC::DataCopy(src1Local, src1Global[progress * COMPUTE_LENGTH], COMPUTE_LENGTH); + srcQue0.EnQue(src0Local); + srcQue1.EnQue(src1Local); + AscendC::PipeBarrier(); + } + __aicore__ inline void Compute(int32_t progress) + { + AscendC::LocalTensor src0Local = srcQue0.DeQue(); + AscendC::LocalTensor src1Local = srcQue1.DeQue(); + AscendC::LocalTensor dstLocal = dstQue1.AllocTensor(); + AscendC::Add(dstLocal, src0Local, src1Local, COMPUTE_LENGTH); + AscendC::PipeBarrier(); + dstQue1.EnQue(dstLocal); + srcQue0.FreeTensor(src0Local); + srcQue1.FreeTensor(src1Local); + } + __aicore__ inline void CopyOut(int32_t progress) + { + AscendC::LocalTensor dstLocal = dstQue1.DeQue(); + AscendC::DataCopy(dstGlobal0[progress * COMPUTE_LENGTH], dstLocal, COMPUTE_LENGTH); + dstQue1.FreeTensor(dstLocal); + } + __aicore__ inline void CopyIn1(int32_t progress) + { + AscendC::LocalTensor src0Local = srcQue0.AllocTensor(); + AscendC::LocalTensor src1Local = srcQue2.AllocTensor(); + AscendC::LocalTensor src2Local = srcQue3.AllocTensor(); + AscendC::DataCopy(src0Local, src0Global[progress * COMPUTE_LENGTH_HALF], COMPUTE_LENGTH_HALF); + AscendC::DataCopy(src1Local, src1Global[progress * COMPUTE_LENGTH_HALF], COMPUTE_LENGTH_HALF); + AscendC::DataCopy(src2Local, src1Global[progress * COMPUTE_LENGTH_HALF], COMPUTE_LENGTH_HALF); + srcQue0.EnQue(src0Local); + srcQue2.EnQue(src1Local); + srcQue3.EnQue(src2Local); + } + __aicore__ inline void Compute1(int32_t progress) + { + AscendC::PipeBarrier(); + AscendC::LocalTensor 
src0Local = srcQue0.DeQue(); + AscendC::LocalTensor src1Local = srcQue2.DeQue(); + AscendC::LocalTensor src2Local = srcQue3.DeQue(); + AscendC::LocalTensor dstLocal = dstQue2.AllocTensor(); + AscendC::LocalTensor tmpTensor = tmp.Get(); + AscendC::Sub(tmpTensor, src0Local, src1Local, COMPUTE_LENGTH_HALF); + AscendC::PipeBarrier(); + AscendC::Mul(dstLocal, tmpTensor, src2Local, COMPUTE_LENGTH_HALF); + dstQue2.EnQue(dstLocal); + srcQue0.FreeTensor(src0Local); + srcQue2.FreeTensor(src1Local); + srcQue3.FreeTensor(src2Local); + } + __aicore__ inline void CopyOut1(int32_t progress) + { + AscendC::LocalTensor dstLocal = dstQue2.DeQue(); + AscendC::DataCopy(dstGlobal1[progress * COMPUTE_LENGTH_HALF], dstLocal, COMPUTE_LENGTH_HALF); + AscendC::PipeBarrier(); + dstQue2.FreeTensor(dstLocal); + } + + private: + AscendC::TPipe* pipe; + AscendC::TBufPool tbufPool0; + AscendC::TBufPool tbufPool1; + AscendC::TBufPool tbufPool2; + AscendC::TQue srcQue0; + AscendC::TQue srcQue1; + AscendC::TQue srcQue2; + AscendC::TQue srcQue3; + AscendC::TQue dstQue; + AscendC::TQue dstQue1; + AscendC::TQue dstQue2; + AscendC::GlobalTensor src0Global; + AscendC::GlobalTensor src1Global; + AscendC::GlobalTensor dstGlobal0; + AscendC::GlobalTensor dstGlobal1; + AscendC::TBuf tmp; + uint32_t totalLength = 0; + }; +}// namespace MyCustomKernel + +#endif + \ No newline at end of file diff --git a/operator/ascendc/2_features/2_tbufpool/run.sh b/operator/ascendc/2_features/2_tbufpool/run.sh new file mode 100644 index 0000000000000000000000000000000000000000..1fe551f4052b58deb6218aea198081cfe60ea123 --- /dev/null +++ b/operator/ascendc/2_features/2_tbufpool/run.sh @@ -0,0 +1,58 @@ +#!/bin/bash + +# Copyright (c) 2025 Huawei Technologies Co., Ltd. +# This file is a part of the CANN Open Software. +# Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. 
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. +# ====================================================================================================================== + +SHORT=r:,v:, +LONG=run-mode:,soc-version:, +OPTS=$(getopt -a --options $SHORT --longoptions $LONG -- "$@") +eval set -- "$OPTS" +while : +do + case "$1" in + (-r | --run-mode ) + RUN_MODE="$2" + shift 2;; + (-v | --soc-version ) + SOC_VERSION="$2" + shift 2;; + (--) + shift; + break;; + (*) + echo "[ERROR] Unexpected option: $1"; + break;; + esac +done + +rm -rf build +mkdir build +cd build + +# in case of running op in simulator, use stub so instead +if [ "${RUN_MODE}" = "sim" ]; then + export LD_LIBRARY_PATH=$(echo $LD_LIBRARY_PATH | sed 's/\/.*\/runtime\/lib64://g') + export LD_LIBRARY_PATH=$ASCEND_HOME_DIR/runtime/lib64/stub:$LD_LIBRARY_PATH +fi + +source $ASCEND_HOME_DIR/bin/setenv.bash +export LD_LIBRARY_PATH=${ASCEND_HOME_DIR}/tools/simulator/${SOC_VERSION}/lib:$LD_LIBRARY_PATH + +cmake -DRUN_MODE=${RUN_MODE} -DSOC_VERSION=${SOC_VERSION} -DASCEND_CANN_PACKAGE_PATH=${ASCEND_HOME_DIR} .. 
+make -j16 + +if [ "${RUN_MODE}" = "npu" ]; then + ./tbufpool_direct_kernel_op +elif [ "${RUN_MODE}" = "sim" ]; then + export ASCEND_TOOLKIT_HOME=${ASCEND_HOME_DIR} + export ASCEND_HOME_PATH=${ASCEND_HOME_DIR} + msprof op simulator --application=./tbufpool_direct_kernel_op +elif [ "${RUN_MODE}" = "cpu" ]; then + ./tbufpool_direct_kernel_op +fi \ No newline at end of file diff --git a/operator/ascendc/2_features/2_tbufpool/scripts/gen_data.py b/operator/ascendc/2_features/2_tbufpool/scripts/gen_data.py new file mode 100644 index 0000000000000000000000000000000000000000..18a8f89cd7de9aef0da9b7e3db4b91a023bcd030 --- /dev/null +++ b/operator/ascendc/2_features/2_tbufpool/scripts/gen_data.py @@ -0,0 +1,37 @@ +#!/usr/bin/python3 +# coding=utf-8 + +# Copyright (c) 2025 Huawei Technologies Co., Ltd. +# This file is a part of the CANN Open Software. +# Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. 
| [2_tbufpool](./2_tbufpool) | 基于Ascend C的自定义Vector算子及kernel直调样例,通过TBufPool实现Add、Sub、Mul算子计算过程中的内存复用,提高计算效率。|Atlas A2训练系列产品/Atlas 800I A2推理产品|
C的自定义Cube算子及FrameworkLaunch调用样例,通过使用全量常量化的MatmulApiStaticTiling模板参数,替代非常量的TCubeTiling参数,以减少Scalar计算开销,实现算子性能提升|Atlas A2训练系列产品/Atlas 800I A2推理产品|