From 4d7bbba25ca00b77ed0e1df631038d3d99cc5eaf Mon Sep 17 00:00:00 2001 From: alpaca12345uuu Date: Tue, 3 Jun 2025 22:05:06 +0800 Subject: [PATCH 01/94] add tbufpool sample --- .../2_features/2_tbufpool/CMakeLists.txt | 76 +++++++ .../ascendc/2_features/2_tbufpool/README.md | 79 ++++++- .../2_features/2_tbufpool/cmake/cpu_lib.cmake | 26 +++ .../2_features/2_tbufpool/cmake/npu_lib.cmake | 12 + .../2_features/2_tbufpool/data_utils.h | 211 ++++++++++++++++++ .../ascendc/2_features/2_tbufpool/main.cpp | 169 ++++++++++++++ .../op_host/tbufpool_custom_tiling.cpp | 19 ++ .../op_host/tbufpool_custom_tiling.h | 18 ++ .../2_tbufpool/op_kernel/tbufpool_custom.cpp | 20 ++ .../2_tbufpool/op_kernel/tbufpool_custom.h | 93 ++++++++ operator/ascendc/2_features/2_tbufpool/run.sh | 58 +++++ .../2_features/2_tbufpool/scripts/gen_data.py | 31 +++ operator/ascendc/2_features/README.md | 1 + 13 files changed, 812 insertions(+), 1 deletion(-) create mode 100644 operator/ascendc/2_features/2_tbufpool/CMakeLists.txt create mode 100644 operator/ascendc/2_features/2_tbufpool/cmake/cpu_lib.cmake create mode 100644 operator/ascendc/2_features/2_tbufpool/cmake/npu_lib.cmake create mode 100644 operator/ascendc/2_features/2_tbufpool/data_utils.h create mode 100644 operator/ascendc/2_features/2_tbufpool/main.cpp create mode 100644 operator/ascendc/2_features/2_tbufpool/op_host/tbufpool_custom_tiling.cpp create mode 100644 operator/ascendc/2_features/2_tbufpool/op_host/tbufpool_custom_tiling.h create mode 100644 operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.cpp create mode 100644 operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h create mode 100644 operator/ascendc/2_features/2_tbufpool/run.sh create mode 100644 operator/ascendc/2_features/2_tbufpool/scripts/gen_data.py diff --git a/operator/ascendc/2_features/2_tbufpool/CMakeLists.txt b/operator/ascendc/2_features/2_tbufpool/CMakeLists.txt new file mode 100644 index 000000000..cba0e5e41 --- /dev/null +++ 
b/operator/ascendc/2_features/2_tbufpool/CMakeLists.txt @@ -0,0 +1,76 @@ +# Copyright (c) 2025 Huawei Technologies Co., Ltd. +# ====================================================================================================================== + +cmake_minimum_required(VERSION 3.16) +project(Ascend_c) +if(${RUN_MODE}) + set(RUN_MODE "npu" CACHE STRING "cpu/sim/npu") +endif() +if (${SOC_VERSION}) + set(SOC_VERSION "Ascend910" CACHE STRING "system on chip type") +endif() + +set(ASCEND_CANN_PACKAGE_PATH "~/Ascend/ascend-toolkit/latest" CACHE STRING "ASCEND CANN package installation directory") + +if(NOT CMAKE_BUILD_TYPE) + set(CMAKE_BUILD_TYPE "Debug" CACHE STRING "Build type Release/Debug (default Debug)" FORCE) +endif() + +if(CMAKE_INSTALL_PREFIX STREQUAL /usr/local) + set(CMAKE_INSTALL_PREFIX "${CMAKE_CURRENT_LIST_DIR}/out" CACHE STRING "path for install()" FORCE) +endif() + +file(GLOB KERNEL_FILES + ${CMAKE_CURRENT_SOURCE_DIR}/op_kernel/tbufpool_custom.cpp +) +set(CUSTOM_ASCEND310P_LIST "Ascend310P1" "Ascend310P3") + +if("${RUN_MODE}" STREQUAL "cpu") + include(cmake/cpu_lib.cmake) +elseif("${RUN_MODE}" STREQUAL "sim" OR "${RUN_MODE}" STREQUAL "npu") + include(cmake/npu_lib.cmake) +else() + message("invalid RUN_MODE: ${RUN_MODE}") +endif() + +add_executable(tbufpool_direct_kernel_op + ${CMAKE_CURRENT_SOURCE_DIR}/main.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/op_host/tbufpool_custom_tiling.cpp +) + +target_compile_options(tbufpool_direct_kernel_op PRIVATE + $:-g>> + -O2 + -std=c++17 + -D_GLIBCXX_USE_CXX11_ABI=0 +) + +target_compile_definitions(tbufpool_direct_kernel_op PRIVATE + $<$>:CUSTOM_ASCEND310P> +) + +target_include_directories(tbufpool_direct_kernel_op PRIVATE + ${CMAKE_CURRENT_SOURCE_DIR} + $:${ASCEND_CANN_PACKAGE_PATH}/include>> + $:${ASCEND_CANN_PACKAGE_PATH}/runtime/include>> +) + +target_link_libraries(tbufpool_direct_kernel_op PRIVATE + $,$>:host_intf_pub>> + $:tikicpulib::${SOC_VERSION}>> + $:ascendcl>> + $:c_sec>> + ascendc_kernels_${RUN_MODE} + 
tiling_api + register + platform + ascendalog + dl + graph_base +) + +install(TARGETS tbufpool_direct_kernel_op + LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} + ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} + RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} +) \ No newline at end of file diff --git a/operator/ascendc/2_features/2_tbufpool/README.md b/operator/ascendc/2_features/2_tbufpool/README.md index 5af80e6c5..25b8f4ab5 100644 --- a/operator/ascendc/2_features/2_tbufpool/README.md +++ b/operator/ascendc/2_features/2_tbufpool/README.md @@ -1 +1,78 @@ -tbufpool(待补充) \ No newline at end of file + +## 目录结构介绍 +``` +├── 2_tbufpool +│ ├── cmake // 编译工程文件 +│ ├── op_host // 本样例tiling代码实现 +│ ├── op_kernel // 本样例kernel侧代码实现 +│ ├── scripts +│ │ ├── gen_data.py // 输入数据和真值数据生成脚本 +│ ├── CMakeLists.txt // 编译工程文件 +│ ├── data_utils.h // 数据读入写出函数 +│ ├── main.cpp // 主函数,调用算子的应用程序,含CPU域及NPU域调用 +│ └── run.sh // 编译运行算子的脚本 +``` +## 代码实现介绍 +数据量较大且内存有限时,无法一次完成所有数据搬运,需要拆分成多个阶段计算,每次计算使用其中的一部分数据,可以通过TBufPool资源池进行内存地址复用。本例中,通过调用InitBufPool基础API对Add算子实现过程进行内存管理。从Tpipe划分出资源池tbufPool0,tbufPool0为src0Gm分配空间后,继续分配了资源池tbufPool1,指定tbufPool1与tbufPool2复用并分别运用于第一、二轮计算,此时tbufPool1及tbufPool2共享起始地址及长度。 + +- kernel实现 + Add算子的数学表达式为: + ``` + z = x + y + ``` + 计算逻辑是:Ascend C提供的矢量计算接口的操作元素都为LocalTensor,首先将部分输入数据src0Gm,部分输入数据src1Gm搬运进片上储存,调用计算接口完成相加计算,搬出到外部存储上。之后再将剩余输入数据搬运进片上储存,调用计算接口完成相加计算,得到最终结果,再搬出到外部存储上。 + + Add算子的实现流程分为6个基本任务:CopyIn,Compute,CopyOut,CopyIn1,Compute1,CopyOut1。 + - CopyIn任务负责将Global Memory上的部分输入Tensor src0Gm和src1Gm搬运到Local Memory,分别存储在src0Local、src1Local; + - Compute任务负责对src0Local、src1Local执行加法操作,计算结果存储在dstLocal中; + - CopyOut任务负责将输出数据从dstLocal搬运至Global Memory上的输出Tensor dstGm中。 + - CopyIn1任务负责将Global Memory上的剩余输入Tensor src0Gm和src1Gm搬运到Local Memory,分别存储在src0Local、src1Local; + - Compute1任务负责对src0Local、src1Local执行加法操作,计算结果存储在dstLocal中; + - CopyOut1任务负责将输出数据从dstLocal搬运至Global Memory上的输出Tensor dstGm中。 + +- 调用实现 + 1. CPU侧运行验证主要通过ICPU_RUN_KF CPU调测宏等CPU调测库提供的接口来完成; + 2. 
NPU侧运行验证主要通过使用ACLRT_LAUNCH_KERNEL内核调用宏来完成。 + + 应用程序通过ASCENDC_CPU_DEBUG 宏区分代码逻辑运行于CPU侧还是NPU侧。 + +## 运行样例算子 + - 打开样例目录 + 以命令行方式下载样例代码,master分支为例。 + ```bash + cd ${git_clone_path}/samples/operator/ascendc/2_features/2_tbufpool + ``` + - 配置环境变量 + + 请根据当前环境上CANN开发套件包的[安装方式](https://hiascend.com/document/redirect/CannCommunityInstSoftware),选择对应配置环境变量的命令。 + - 默认路径,root用户安装CANN软件包 + ```bash + export ASCEND_INSTALL_PATH=/usr/local/Ascend/ascend-toolkit/latest + ``` + - 默认路径,非root用户安装CANN软件包 + ```bash + export ASCEND_INSTALL_PATH=$HOME/Ascend/ascend-toolkit/latest + ``` + - 指定路径install_path,安装CANN软件包 + ```bash + export ASCEND_INSTALL_PATH=${install_path}/ascend-toolkit/latest + ``` + + 配置仿真模式日志文件目录,默认为sim_log。 + ```bash + export CAMODEL_LOG_PATH=./sim_log + ``` + + - 样例执行 + + ```bash + bash run.sh -r [RUN_MODE] -v [SOC_VERSION] + ``` + - RUN_MODE:编译方式,可选择CPU调试,NPU仿真,NPU上板。支持参数为[cpu / sim / npu]。 + - SOC_VERSION:昇腾AI处理器型号,如果无法确定具体的[SOC_VERSION],则在安装昇腾AI处理器的服务器执行npu-smi info命令进行查询,在查询到的“Name”前增加Ascend信息,例如“Name”对应取值为xxxyy,实际配置的[SOC_VERSION]值为Ascendxxxyy。支持以下产品型号: + - Atlas A2训练系列产品/Atlas 800I A2推理产品 + + 示例如下,Ascendxxxyy请替换为实际的AI处理器型号。 + ```bash + bash run.sh -r cpu -v Ascendxxxyy + ``` \ No newline at end of file diff --git a/operator/ascendc/2_features/2_tbufpool/cmake/cpu_lib.cmake b/operator/ascendc/2_features/2_tbufpool/cmake/cpu_lib.cmake new file mode 100644 index 000000000..693f15ac1 --- /dev/null +++ b/operator/ascendc/2_features/2_tbufpool/cmake/cpu_lib.cmake @@ -0,0 +1,26 @@ +if(NOT DEFINED ENV{CMAKE_PREFIX_PATH}) + set(CMAKE_PREFIX_PATH ${ASCEND_CANN_PACKAGE_PATH}/tools/tikicpulib/lib/cmake) +endif() +find_package(tikicpulib REQUIRED) + +add_library(ascendc_kernels_${RUN_MODE} SHARED + ${KERNEL_FILES} +) + +target_link_libraries(ascendc_kernels_${RUN_MODE} PRIVATE + tikicpulib::${SOC_VERSION} +) + +target_compile_definitions(ascendc_kernels_${RUN_MODE} PRIVATE + $<$>:CUSTOM_ASCEND310P> +) + +target_compile_options(ascendc_kernels_${RUN_MODE} 
PRIVATE + -g + -O0 + -std=c++17 +) + +install(TARGETS ascendc_kernels_${RUN_MODE} +DESTINATION ${CMAKE_INSTALL_LIBDIR} +) \ No newline at end of file diff --git a/operator/ascendc/2_features/2_tbufpool/cmake/npu_lib.cmake b/operator/ascendc/2_features/2_tbufpool/cmake/npu_lib.cmake new file mode 100644 index 000000000..8ad136f38 --- /dev/null +++ b/operator/ascendc/2_features/2_tbufpool/cmake/npu_lib.cmake @@ -0,0 +1,12 @@ +if(EXISTS ${ASCEND_CANN_PACKAGE_PATH}/compiler/tikcpp/ascendc_kernel_cmake) + set(ASCENDC_CMAKE_DIR ${ASCEND_CANN_PACKAGE_PATH}/compiler/tikcpp/ascendc_kernel_cmake) +elseif(EXISTS ${ASCEND_CANN_PACKAGE_PATH}/tools/tikcpp/ascendc_kernel_cmake) + set(ASCENDC_CMAKE_DIR ${ASCEND_CANN_PACKAGE_PATH}/tools/tikcpp/ascendc_kernel_cmake) +else() + message(FATAL_ERROR "ascendc_kernel_cmake does not exist ,please check whether the cann package is installed") +endif() +include(${ASCENDC_CMAKE_DIR}/ascendc.cmake) + +ascendc_library(ascendc_kernels_${RUN_MODE} STATIC + ${KERNEL_FILES} +) \ No newline at end of file diff --git a/operator/ascendc/2_features/2_tbufpool/data_utils.h b/operator/ascendc/2_features/2_tbufpool/data_utils.h new file mode 100644 index 000000000..7980ae341 --- /dev/null +++ b/operator/ascendc/2_features/2_tbufpool/data_utils.h @@ -0,0 +1,211 @@ +/** + * Copyright (c) 2025 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. 
+ */ +#ifndef DATA_UTILS_H +#define DATA_UTILS_H +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifndef ASCENDC_CPU_DEBUG +#include "acl/acl.h" +#endif + +typedef enum { + DT_UNDEFINED = -1, + FLOAT = 0, + HALF = 1, + INT8_T = 2, + INT32_T = 3, + UINT8_T = 4, + INT16_T = 6, + UINT16_T = 7, + UINT32_T = 8, + INT64_T = 9, + UINT64_T = 10, + DOUBLE = 11, + BOOL = 12, + STRING = 13, + COMPLEX64 = 16, + COMPLEX128 = 17, + BF16 = 27 +} printDataType; + +#define INFO_LOG(fmt, args...) fprintf(stdout, "[INFO] " fmt "\n", ##args) +#define WARN_LOG(fmt, args...) fprintf(stdout, "[WARN] " fmt "\n", ##args) +#define ERROR_LOG(fmt, args...) fprintf(stdout, "[ERROR] " fmt "\n", ##args) + +#ifndef ASCENDC_CPU_DEBUG +#define CHECK_ACL(x) \ + do { \ + aclError __ret = x; \ + if (__ret != ACL_ERROR_NONE) { \ + std::cerr << __FILE__ << ":" << __LINE__ << " aclError:" << __ret << std::endl; \ + } \ + } while (0); +#endif + +/** +* @brief Read data from file +* @param [in] filePath: file path +* @param [out] fileSize: file size +* @return read result +*/ +bool ReadFile(const std::string &filePath, size_t &fileSize, void *buffer, size_t bufferSize) +{ + struct stat sBuf; + int fileStatus = stat(filePath.data(), &sBuf); + if (fileStatus == -1) { + ERROR_LOG("failed to get file"); + return false; + } + if (S_ISREG(sBuf.st_mode) == 0) { + ERROR_LOG("%s is not a file, please enter a file", filePath.c_str()); + return false; + } + + std::ifstream file; + file.open(filePath, std::ios::binary); + if (!file.is_open()) { + ERROR_LOG("Open file failed. 
path = %s", filePath.c_str()); + return false; + } + + std::filebuf *buf = file.rdbuf(); + size_t size = buf->pubseekoff(0, std::ios::end, std::ios::in); + if (size == 0) { + ERROR_LOG("file size is 0"); + file.close(); + return false; + } + if (size > bufferSize) { + ERROR_LOG("file size is larger than buffer size"); + file.close(); + return false; + } + buf->pubseekpos(0, std::ios::in); + buf->sgetn(static_cast(buffer), size); + fileSize = size; + file.close(); + return true; +} + +/** +* @brief Write data to file +* @param [in] filePath: file path +* @param [in] buffer: data to write to file +* @param [in] size: size to write +* @return write result +*/ +bool WriteFile(const std::string &filePath, const void *buffer, size_t size) +{ + if (buffer == nullptr) { + ERROR_LOG("Write file failed. buffer is nullptr"); + return false; + } + + int fd = open(filePath.c_str(), O_RDWR | O_CREAT | O_TRUNC, S_IRUSR | S_IWRITE); + if (fd < 0) { + ERROR_LOG("Open file failed. path = %s", filePath.c_str()); + return false; + } + + auto writeSize = write(fd, buffer, size); + (void) close(fd); + if (writeSize != size) { + ERROR_LOG("Write file Failed."); + return false; + } + + return true; +} + +template +void DoPrintData(const T *data, size_t count, size_t elementsPerRow) +{ + assert(elementsPerRow != 0); + for (size_t i = 0; i < count; ++i) { + std::cout << std::setw(10) << data[i]; + if (i % elementsPerRow == elementsPerRow - 1) { + std::cout << std::endl; + } + } +} + +#ifndef ASCENDC_CPU_DEBUG +void DoPrintHalfData(const aclFloat16 *data, size_t count, size_t elementsPerRow) +{ + assert(elementsPerRow != 0); + for (size_t i = 0; i < count; ++i) { + std::cout << std::setw(10) << std::setprecision(6) << aclFloat16ToFloat(data[i]); + if (i % elementsPerRow == elementsPerRow - 1) { + std::cout << std::endl; + } + } +} +#endif + +void PrintData(const void *data, size_t count, printDataType dataType, size_t elementsPerRow=16) +{ + if (data == nullptr) { + ERROR_LOG("Print data 
failed. data is nullptr"); + return; + } + + switch (dataType) { + case BOOL: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case INT8_T: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case UINT8_T: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case INT16_T: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case UINT16_T: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case INT32_T: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case UINT32_T: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case INT64_T: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case UINT64_T: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; +#ifndef ASCENDC_CPU_DEBUG + case HALF: + DoPrintHalfData(reinterpret_cast(data), count, elementsPerRow); + break; +#endif + case FLOAT: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case DOUBLE: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + default: + ERROR_LOG("Unsupported type: %d", dataType); + } + std::cout << std::endl; +} +#endif // EXAMPLES_COMMON_DATA_UTILS_H diff --git a/operator/ascendc/2_features/2_tbufpool/main.cpp b/operator/ascendc/2_features/2_tbufpool/main.cpp new file mode 100644 index 000000000..855481786 --- /dev/null +++ b/operator/ascendc/2_features/2_tbufpool/main.cpp @@ -0,0 +1,169 @@ +/* + * Copyright (c) 2025 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. 
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ + +#include "data_utils.h" +#include "./op_host/tbufpool_custom_tiling.h" +#ifndef ASCENDC_CPU_DEBUG +#include "acl/acl.h" +#include "aclrtlaunch_tbufpool_custom.h" +#include "tiling/platform/platform_ascendc.h" +#else +#include "tikicpulib.h" +extern "C" __global__ __aicore__ void tbufpool_custom(GM_ADDR x, GM_ADDR y, GM_ADDR z, TbufPoolTilingData tiling); +#endif + +namespace { +constexpr uint32_t USED_CORE_NUM = 1; +constexpr uint32_t TOTAL_LENGTH = 4096; +constexpr uint32_t TILING_SIZE = 1; +} + +extern void GenerateTilingData(const uint32_t totalLength, uint8_t *tilingBuf); + +static bool CompareResult(const void *outputData, int64_t outSize) { + void *goldenData; +#ifdef ASCENDC_CPU_DEBUG + goldenData = (uint8_t *)AscendC::GmAlloc(outSize); +#else + CHECK_ACL(aclrtMallocHost((void **)(&goldenData), outSize)); +#endif + size_t goldenSize = outSize; + bool ret = ReadFile("../output/output.bin", goldenSize, goldenData, goldenSize); + if (ret) { + printf("ReadFile golden.bin success!\n"); + } else { + printf("test failed!\n"); + return false; + } + constexpr float EPS = 1e-4; + int64_t wrongNum = 0; + + for (int i = 0; i < outSize / sizeof(float); i++) { + float a = (reinterpret_cast(outputData))[i]; + float b = (reinterpret_cast(goldenData))[i]; + float ae = std::abs(a - b); + float re = ae / abs(b); + if (ae > EPS && re > EPS) { + wrongNum++; + } + } +#ifdef ASCENDC_CPU_DEBUG + AscendC::GmFree((void *)goldenData); +#else + CHECK_ACL(aclrtFreeHost(goldenData)); +#endif + if (wrongNum != 0) { + return false; + } else { + printf("CompareResult golden.bin success!\n"); + return true; + } +} + +int32_t main(int32_t argc, char *argv[]) { + size_t tilingSize = 
TILING_SIZE * sizeof(uint32_t); + size_t inputSize = TOTAL_LENGTH * sizeof(float); + size_t outputSize = inputSize; + +#ifdef ASCENDC_CPU_DEBUG + uint8_t *x = (uint8_t *)AscendC::GmAlloc(inputSize); + uint8_t *y = (uint8_t *)AscendC::GmAlloc(inputSize); + uint8_t *z = (uint8_t *)AscendC::GmAlloc(outputSize); + uint8_t *tiling = (uint8_t *)AscendC::GmAlloc(tilingSize); + + ReadFile("../input/input_x.bin", inputSize, x, inputSize); + ReadFile("../input/input_y.bin", inputSize, y, inputSize); + + GenerateTilingData(TOTAL_LENGTH, tiling); + + AscendC::SetKernelMode(KernelMode::AIV_MODE); // run in aiv mode + + ICPU_RUN_KF(tbufpool_custom, USED_CORE_NUM, x, y, z, + *reinterpret_cast(tiling)); // use this macro for cpu debug + + WriteFile("../output/output.bin", z, outputSize); + + bool goldenResult = true; + goldenResult = CompareResult(z, outputSize); + + AscendC::GmFree((void *)x); + AscendC::GmFree((void *)y); + AscendC::GmFree((void *)z); + AscendC::GmFree((void *)tiling); +#else + CHECK_ACL(aclInit(nullptr)); + int32_t deviceId = 0; + CHECK_ACL(aclrtSetDevice(deviceId)); + aclrtStream stream = nullptr; + CHECK_ACL(aclrtCreateStream(&stream)); + + uint8_t *xHost; + uint8_t *yHost; + uint8_t *zHost; + uint8_t *tiling; + uint8_t *xDevice; + uint8_t *yDevice; + uint8_t *zDevice; + + CHECK_ACL(aclrtMallocHost((void **)(&xHost), inputSize)); + CHECK_ACL(aclrtMallocHost((void **)(&yHost), inputSize)); + CHECK_ACL(aclrtMallocHost((void **)(&zHost), outputSize)); + CHECK_ACL(aclrtMallocHost((void **)(&tiling), tilingSize)); + + CHECK_ACL(aclrtMalloc((void **)&xDevice, inputSize, ACL_MEM_MALLOC_HUGE_FIRST)); + CHECK_ACL(aclrtMalloc((void **)&yDevice, inputSize, ACL_MEM_MALLOC_HUGE_FIRST)); + CHECK_ACL(aclrtMalloc((void **)&zDevice, outputSize, ACL_MEM_MALLOC_HUGE_FIRST)); + + ReadFile("../input/input_x.bin", inputSize, xHost, inputSize); + ReadFile("../input/input_y.bin", inputSize, yHost, inputSize); + + GenerateTilingData(TOTAL_LENGTH, tiling); + + // Copy host memory to 
device memory + CHECK_ACL(aclrtMemcpy(xDevice, inputSize, xHost, inputSize, ACL_MEMCPY_HOST_TO_DEVICE)); + CHECK_ACL(aclrtMemcpy(yDevice, inputSize, yHost, inputSize, ACL_MEMCPY_HOST_TO_DEVICE)); + + // Execute the kernel + ACLRT_LAUNCH_KERNEL(tbufpool_custom) + (USED_CORE_NUM, stream, xDevice, yDevice, zDevice, reinterpret_cast(tiling)); + + // Wait for the stop event to complete + CHECK_ACL(aclrtSynchronizeStream(stream)); + + // Copy result to host memory and write to output file + CHECK_ACL(aclrtMemcpy(zHost, outputSize, zDevice, outputSize, ACL_MEMCPY_DEVICE_TO_HOST)); + WriteFile("../output/output.bin", zHost, outputSize); + + // Compare the result with the golden result + bool goldenResult = true; + goldenResult = CompareResult(zHost, outputSize); + + // Clean up memory + CHECK_ACL(aclrtFree(xDevice)); + CHECK_ACL(aclrtFree(yDevice)); + CHECK_ACL(aclrtFree(zDevice)); + CHECK_ACL(aclrtFreeHost(xHost)); + CHECK_ACL(aclrtFreeHost(yHost)); + CHECK_ACL(aclrtFreeHost(zHost)); + CHECK_ACL(aclrtFreeHost(tiling)); + + CHECK_ACL(aclrtDestroyStream(stream)); + CHECK_ACL(aclrtResetDevice(deviceId)); + CHECK_ACL(aclFinalize()); +#endif + + if (goldenResult) { + printf("test pass!\n"); + } else { + printf("test failed!\n"); + } + return 0; +} + \ No newline at end of file diff --git a/operator/ascendc/2_features/2_tbufpool/op_host/tbufpool_custom_tiling.cpp b/operator/ascendc/2_features/2_tbufpool/op_host/tbufpool_custom_tiling.cpp new file mode 100644 index 000000000..0bc2f1c1d --- /dev/null +++ b/operator/ascendc/2_features/2_tbufpool/op_host/tbufpool_custom_tiling.cpp @@ -0,0 +1,19 @@ +/** + * @file tbufpool_custom_tiling.cpp + * + * Copyright (C) 2024-2025. Huawei Technologies Co., Ltd. All rights reserved. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
+ */ + +#include "tiling/tiling_api.h" +#include "tbufpool_custom_tiling.h" + + +void GenerateTilingData(uint32_t totalLength, uint8_t* tilingBuf) +{ + TbufPoolTilingData *tiling = reinterpret_cast(tilingBuf); + tiling->totalLength = totalLength; +} \ No newline at end of file diff --git a/operator/ascendc/2_features/2_tbufpool/op_host/tbufpool_custom_tiling.h b/operator/ascendc/2_features/2_tbufpool/op_host/tbufpool_custom_tiling.h new file mode 100644 index 000000000..63c60d78c --- /dev/null +++ b/operator/ascendc/2_features/2_tbufpool/op_host/tbufpool_custom_tiling.h @@ -0,0 +1,18 @@ +/** + * @file tbufpool_custom_tiling.h + * + * Copyright (C) 2024-2025. Huawei Technologies Co., Ltd. All rights reserved. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + */ + +#ifndef EXAMPLES_ACTIVATION_TBUFPOOL_CUSTOM_TILING_H +#define EXAMPLES_ACTIVATION_TBUFPOOL_CUSTOM_TILING_H +#include + +struct TbufPoolTilingData { + uint32_t totalLength; +}; +#endif diff --git a/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.cpp b/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.cpp new file mode 100644 index 000000000..d17a4d185 --- /dev/null +++ b/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.cpp @@ -0,0 +1,20 @@ +/** + * @file tbufpool_custom.cpp + * + * Copyright (C) 2024-2025. Huawei Technologies Co., Ltd. All rights reserved. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
+ */ + +#include "./tbufpool_custom.h" +#include "kernel_operator.h" + +extern "C" __global__ __aicore__ void tbufpool_custom(GM_ADDR src0Gm, GM_ADDR src1Gm, GM_ADDR dstGm, TbufPoolTilingData tiling) +{ + AscendC::TPipe pipe; + MyCustomKernel::TbufPoolImpl op; + op.Init(src0Gm, src1Gm, dstGm, tiling, &pipe); + op.Process(); +} \ No newline at end of file diff --git a/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h b/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h new file mode 100644 index 000000000..53db0c0a5 --- /dev/null +++ b/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h @@ -0,0 +1,93 @@ +/** + * @file tbufpool_custom.h + * + * Copyright (C) 2024-2025. Huawei Technologies Co., Ltd. All rights reserved. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + */ + +#ifndef EXAMPLES_ACTIVATION_INITBUFPOOL_CUSTOM_H +#define EXAMPLES_ACTIVATION_INITBUFPOOL_CUSTOM_H +#include "../op_host/tbufpool_custom_tiling.h" +#include "kernel_operator.h" + + +namespace MyCustomKernel { +constexpr int32_t BUFFER_NUM = 1; +constexpr int32_t BUFFER_LENGTH = 8192; +constexpr int32_t BUFF_POOL_LENGTH = 4096; +constexpr int32_t INIT_TENSOR_LENGTH = 512; +constexpr int32_t TENSOR_LENGTH = 128; +constexpr int32_t SPLIT_NUM = 2; +class TbufPoolImpl { + public: + __aicore__ inline TbufPoolImpl() {} + __aicore__ inline void Init(__gm__ uint8_t* src0Gm, __gm__ uint8_t* src1Gm, __gm__ uint8_t* dstGm, TbufPoolTilingData tiling, AscendC::TPipe* pipeIn) + { + pipe = pipeIn; + totalLength = tiling.totalLength; + src0Global.SetGlobalBuffer((__gm__ float*)src0Gm); + src1Global.SetGlobalBuffer((__gm__ float*)src1Gm); + dstGlobal.SetGlobalBuffer((__gm__ float*)dstGm); + pipe->InitBufPool(tbufPool0, BUFFER_LENGTH); + tbufPool0.InitBuffer(srcQue0, BUFFER_NUM, BUFF_POOL_LENGTH); // Total src0 + 
tbufPool0.InitBufPool(tbufPool1, BUFF_POOL_LENGTH); + tbufPool1.InitBuffer(srcQue1, BUFFER_NUM, INIT_TENSOR_LENGTH); + tbufPool1.InitBuffer(dstQue0, BUFFER_NUM, INIT_TENSOR_LENGTH); + } + __aicore__ inline void Process() + { + for(int i = 0; i < SPLIT_NUM; i++) + { + CopyIn(i); + Compute(); + CopyOut(); + tbufPool1.Reset(); + } + tbufPool0.Reset(); + } + private: + __aicore__ inline void CopyIn(int32_t progress) + { + AscendC::LocalTensor src0Local = srcQue0.AllocTensor(); + AscendC::LocalTensor src1Local = srcQue1.AllocTensor(); + AscendC::DataCopy(src0Local, src0Global[progress*TENSOR_LENGTH], TENSOR_LENGTH); + AscendC::DataCopy(src1Local, src1Global[progress*TENSOR_LENGTH], TENSOR_LENGTH); + srcQue0.EnQue(src0Local); + srcQue1.EnQue(src1Local); + } + __aicore__ inline void Compute() + { + AscendC::LocalTensor src0Local = srcQue0.DeQue(); + AscendC::LocalTensor src1Local = srcQue1.DeQue(); + AscendC::LocalTensor dstLocal = dstQue0.AllocTensor(); + AscendC::Add(dstLocal, src0Local, src1Local, TENSOR_LENGTH); + + dstQue0.EnQue(dstLocal); + srcQue0.FreeTensor(src0Local); + srcQue1.FreeTensor(src1Local); + } + __aicore__ inline void CopyOut() + { + AscendC::LocalTensor dstLocal = dstQue0.DeQue(); + AscendC::DataCopy(dstGlobal, dstLocal, TENSOR_LENGTH); + dstQue0.FreeTensor(dstLocal); + } + private: + AscendC::TPipe* pipe; + AscendC::TBufPool tbufPool0; + AscendC::TBufPool tbufPool1; + AscendC::TQue srcQue0; + AscendC::TQue srcQue1; + AscendC::TQue dstQue0; + AscendC::GlobalTensor src0Global; + AscendC::GlobalTensor src1Global; + AscendC::GlobalTensor dstGlobal; + uint32_t totalLength = 0; + }; +}// namespace MyCustomKernel + +#endif + \ No newline at end of file diff --git a/operator/ascendc/2_features/2_tbufpool/run.sh b/operator/ascendc/2_features/2_tbufpool/run.sh new file mode 100644 index 000000000..1fe551f40 --- /dev/null +++ b/operator/ascendc/2_features/2_tbufpool/run.sh @@ -0,0 +1,58 @@ +#!/bin/bash + +# Copyright (c) 2025 Huawei Technologies Co., Ltd. 
+# This file is a part of the CANN Open Software. +# Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. +# ====================================================================================================================== + +SHORT=r:,v:, +LONG=run-mode:,soc-version:, +OPTS=$(getopt -a --options $SHORT --longoptions $LONG -- "$@") +eval set -- "$OPTS" +while : +do + case "$1" in + (-r | --run-mode ) + RUN_MODE="$2" + shift 2;; + (-v | --soc-version ) + SOC_VERSION="$2" + shift 2;; + (--) + shift; + break;; + (*) + echo "[ERROR] Unexpected option: $1"; + break;; + esac +done + +rm -rf build +mkdir build +cd build + +# in case of running op in simulator, use stub so instead +if [ "${RUN_MODE}" = "sim" ]; then + export LD_LIBRARY_PATH=$(echo $LD_LIBRARY_PATH | sed 's/\/.*\/runtime\/lib64://g') + export LD_LIBRARY_PATH=$ASCEND_HOME_DIR/runtime/lib64/stub:$LD_LIBRARY_PATH +fi + +source $ASCEND_HOME_DIR/bin/setenv.bash +export LD_LIBRARY_PATH=${ASCEND_HOME_DIR}/tools/simulator/${SOC_VERSION}/lib:$LD_LIBRARY_PATH + +cmake -DRUN_MODE=${RUN_MODE} -DSOC_VERSION=${SOC_VERSION} -DASCEND_CANN_PACKAGE_PATH=${ASCEND_HOME_DIR} .. 
+make -j16 + +if [ "${RUN_MODE}" = "npu" ]; then + ./tbufpool_direct_kernel_op +elif [ "${RUN_MODE}" = "sim" ]; then + export ASCEND_TOOLKIT_HOME=${ASCEND_HOME_DIR} + export ASCEND_HOME_PATH=${ASCEND_HOME_DIR} + msprof op simulator --application=./tbufpool_direct_kernel_op +elif [ "${RUN_MODE}" = "cpu" ]; then + ./tbufpool_direct_kernel_op +fi \ No newline at end of file diff --git a/operator/ascendc/2_features/2_tbufpool/scripts/gen_data.py b/operator/ascendc/2_features/2_tbufpool/scripts/gen_data.py new file mode 100644 index 000000000..3b0aa2937 --- /dev/null +++ b/operator/ascendc/2_features/2_tbufpool/scripts/gen_data.py @@ -0,0 +1,31 @@ +#!/usr/bin/python3 +# coding=utf-8 + +# Copyright (c) 2025 Huawei Technologies Co., Ltd. +# This file is a part of the CANN Open Software. +# Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. 
+# ====================================================================================================================== +import os +import numpy as np + +def gen_golden_data_simple(): + dtype = np.float32 + + input_shape = [8, 256] + # generate value between [-65504, 65504] + input_x = np.random.uniform(-65504, np.nextafter(65504, np.inf), input_shape).astype(dtype) + input_y = np.random.uniform(-65504, np.nextafter(65504, np.inf), input_shape).astype(dtype) + golden = input_x + input_y + + os.system("mkdir -p ./input") + input_x.tofile("./input/input_x.bin") + input_y.tofile("./input/input_y.bin") + os.system("mkdir -p ./output") + golden.tofile("./output/golden.bin") + +if __name__ == "__main__": + gen_golden_data_simple() \ No newline at end of file diff --git a/operator/ascendc/2_features/README.md b/operator/ascendc/2_features/README.md index 8c843758b..6aa3f8655 100644 --- a/operator/ascendc/2_features/README.md +++ b/operator/ascendc/2_features/README.md @@ -15,6 +15,7 @@ Ascend C相关特性的样例。特性样例逐步补充中。 当前本目录包含的所有样例如下。 | 目录名称 | 功能描述 | 运行环境 | | ------------------------------------------------------------ | ---------------------------------------------------- | -- | +| [2_tbufpool](./2_tbufpool) | 基于Ascend C的自定义Vector算子及kernel直调样例,通过TbufPool实现Add算子计算过程中的内存复用,提高计算效率。|Atlas A2训练系列产品/Atlas 800I A2推理产品| | [12_cube_group](./12_cube_group) | 基于Ascend C的自定义算子及FrameworkLaunch调用样例,通过软同步控制AIC和AIV之间进行通讯,实现AI Core计算资源分组。|Atlas A2训练系列产品/Atlas 800I A2推理产品| | [13_matmul_api_ibshare](./13_matmul_api_ibshare) | 基于Ascend C的自定义Cube算子及Kernellaunch调用样例,通过A矩阵与B矩阵使能IBSHARE,实现算子性能提升|Atlas A2训练系列产品/Atlas 800I A2推理产品| | [14_matmul_api_constant](./14_matmul_api_constant) | 基于Ascend C的自定义Cube算子及FrameworkLaunch调用样例,通过使用全量常量化的MatmulApiStaticTiling模板参数,替代非常量的TCubeTiling参数,以减少Scalar计算开销,实现算子性能提升|Atlas A2训练系列产品/Atlas 800I A2推理产品| -- Gitee From 1d0a8fe65c96d1f4355e7da78c10c872195f3d57 Mon Sep 17 00:00:00 2001 From: alpaca12345uuu Date: Sat, 7 Jun 2025 22:21:04 +0800 Subject: [PATCH 02/94] 
=?UTF-8?q?=E4=BB=A3=E7=A0=81=E4=BF=AE=E6=94=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- operator/ascendc/2_features/2_tbufpool/main.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/operator/ascendc/2_features/2_tbufpool/main.cpp b/operator/ascendc/2_features/2_tbufpool/main.cpp index 855481786..279baefa1 100644 --- a/operator/ascendc/2_features/2_tbufpool/main.cpp +++ b/operator/ascendc/2_features/2_tbufpool/main.cpp @@ -35,7 +35,7 @@ static bool CompareResult(const void *outputData, int64_t outSize) { CHECK_ACL(aclrtMallocHost((void **)(&goldenData), outSize)); #endif size_t goldenSize = outSize; - bool ret = ReadFile("../output/output.bin", goldenSize, goldenData, goldenSize); + bool ret = ReadFile("../output/golden.bin", goldenSize, goldenData, goldenSize); if (ret) { printf("ReadFile golden.bin success!\n"); } else { -- Gitee From cd15efc77b96401f2aa6171ee4ed184fa8c9a968 Mon Sep 17 00:00:00 2001 From: alpaca12345uuu Date: Sat, 7 Jun 2025 22:30:07 +0800 Subject: [PATCH 03/94] =?UTF-8?q?=E4=BB=A3=E7=A0=81=E4=BF=AE=E6=94=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- operator/ascendc/2_features/2_tbufpool/main.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/operator/ascendc/2_features/2_tbufpool/main.cpp b/operator/ascendc/2_features/2_tbufpool/main.cpp index 279baefa1..565a21df4 100644 --- a/operator/ascendc/2_features/2_tbufpool/main.cpp +++ b/operator/ascendc/2_features/2_tbufpool/main.cpp @@ -51,6 +51,7 @@ static bool CompareResult(const void *outputData, int64_t outSize) { float ae = std::abs(a - b); float re = ae / abs(b); if (ae > EPS && re > EPS) { + printf("CompareResult failed output is %lf, golden is %lf\n", a, b); wrongNum++; } } -- Gitee From 7934cc48cd22a5396b5cdaed202cda8675a11f3c Mon Sep 17 00:00:00 2001 From: alpaca12345uuu Date: Sat, 7 Jun 2025 22:38:44 +0800 Subject: [PATCH 04/94] 
=?UTF-8?q?=E4=BB=A3=E7=A0=81=E4=BF=AE=E6=94=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../2_tbufpool/op_kernel/tbufpool_custom.h | 62 ++++++++++++++++--- 1 file changed, 53 insertions(+), 9 deletions(-) diff --git a/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h b/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h index 53db0c0a5..83978b960 100644 --- a/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h +++ b/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h @@ -39,13 +39,26 @@ class TbufPoolImpl { } __aicore__ inline void Process() { - for(int i = 0; i < SPLIT_NUM; i++) - { - CopyIn(i); - Compute(); - CopyOut(); - tbufPool1.Reset(); - } + // for(int i = 0; i < SPLIT_NUM; i++) + // { + // CopyIn(i); + // Compute(); + // CopyOut(); + // tbufPool1.Reset(); + // } + // tbufPool0.Reset(); + tbufPool1.InitBuffer(srcQue1, 1, 32768); + tbufPool1.InitBuffer(dstQue0, 1, 32768); + CopyIn(); + Compute(); + CopyOut(); + tbufPool1.Reset(); + tbufPool2.InitBuffer(srcQue2, 1, 32768); + tbufPool2.InitBuffer(dstQue1, 1, 32768); + CopyIn1(); + Compute1(); + CopyOut1(); + tbufPool2.Reset(); tbufPool0.Reset(); } private: @@ -53,8 +66,8 @@ class TbufPoolImpl { { AscendC::LocalTensor src0Local = srcQue0.AllocTensor(); AscendC::LocalTensor src1Local = srcQue1.AllocTensor(); - AscendC::DataCopy(src0Local, src0Global[progress*TENSOR_LENGTH], TENSOR_LENGTH); - AscendC::DataCopy(src1Local, src1Global[progress*TENSOR_LENGTH], TENSOR_LENGTH); + AscendC::DataCopy(src0Local, src0Global, TENSOR_LENGTH); + AscendC::DataCopy(src1Local, src1Global, TENSOR_LENGTH); srcQue0.EnQue(src0Local); srcQue1.EnQue(src1Local); } @@ -75,6 +88,37 @@ class TbufPoolImpl { AscendC::DataCopy(dstGlobal, dstLocal, TENSOR_LENGTH); dstQue0.FreeTensor(dstLocal); } + __aicore__ inline void CopyIn1() + { + AscendC::LocalTensor src0Local = srcQue0.AllocTensor(); + AscendC::LocalTensor src1Local = 
srcQue2.AllocTensor(); + AscendC::DataCopy(src0Local, src0Global[TENSOR_LENGTH], TENSOR_LENGTH); + AscendC::DataCopy(src1Local, src1Global[TENSOR_LENGTH], TENSOR_LENGTH); + srcQue0.EnQue(src0Local); + srcQue2.EnQue(src1Local); + } + __aicore__ inline void Compute1() + { + AscendC::LocalTensor src0Local = srcQue0.DeQue(); + AscendC::LocalTensor src1Local = srcQue2.DeQue(); + AscendC::LocalTensor dstLocal = dstQue1.AllocTensor(); + AscendC::Add(dstLocal, src0Local, src1Local, 16384); + dstQue1.EnQue(dstLocal); + srcQue0.FreeTensor(src0Local); + srcQue2.FreeTensor(src1Local); + } + __aicore__ inline void CopyOut1() + { + AscendC::LocalTensor dstLocal = dstQue1.DeQue(); + AscendC::DataCopy(dstGlobal[16384], dstLocal, 16384); + dstQue1.FreeTensor(dstLocal); + } + private: + AscendC::TPipe pipe; + AscendC::TBufPool tbufPool0, tbufPool1, tbufPool2; + AscendC::TQue srcQue0, srcQue1, srcQue2; + AscendC::TQue dstQue0, dstQue1; + AscendC::GlobalTensor src0Global, src1Global, dstGlobal; private: AscendC::TPipe* pipe; AscendC::TBufPool tbufPool0; -- Gitee From 36296b8eec523cdac1d62953cdba24de85b6e221 Mon Sep 17 00:00:00 2001 From: alpaca12345uuu Date: Sat, 7 Jun 2025 22:42:35 +0800 Subject: [PATCH 05/94] =?UTF-8?q?=E4=BB=A3=E7=A0=81=E4=BF=AE=E6=94=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../2_tbufpool/op_kernel/tbufpool_custom.h | 47 ++++++++++--------- 1 file changed, 25 insertions(+), 22 deletions(-) diff --git a/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h b/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h index 83978b960..bdc5a7827 100644 --- a/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h +++ b/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h @@ -34,8 +34,9 @@ class TbufPoolImpl { pipe->InitBufPool(tbufPool0, BUFFER_LENGTH); tbufPool0.InitBuffer(srcQue0, BUFFER_NUM, BUFF_POOL_LENGTH); // Total src0 tbufPool0.InitBufPool(tbufPool1, 
BUFF_POOL_LENGTH); - tbufPool1.InitBuffer(srcQue1, BUFFER_NUM, INIT_TENSOR_LENGTH); - tbufPool1.InitBuffer(dstQue0, BUFFER_NUM, INIT_TENSOR_LENGTH); + tbufPool0.InitBufPool(tbufPool2, BUFF_POOL_LENGTH, tbufPool1); + // tbufPool1.InitBuffer(srcQue1, BUFFER_NUM, INIT_TENSOR_LENGTH); + // tbufPool1.InitBuffer(dstQue0, BUFFER_NUM, INIT_TENSOR_LENGTH); } __aicore__ inline void Process() { @@ -62,7 +63,7 @@ class TbufPoolImpl { tbufPool0.Reset(); } private: - __aicore__ inline void CopyIn(int32_t progress) + __aicore__ inline void CopyIn() { AscendC::LocalTensor src0Local = srcQue0.AllocTensor(); AscendC::LocalTensor src1Local = srcQue1.AllocTensor(); @@ -90,8 +91,8 @@ class TbufPoolImpl { } __aicore__ inline void CopyIn1() { - AscendC::LocalTensor src0Local = srcQue0.AllocTensor(); - AscendC::LocalTensor src1Local = srcQue2.AllocTensor(); + AscendC::LocalTensor src0Local = srcQue0.AllocTensor(); + AscendC::LocalTensor src1Local = srcQue2.AllocTensor(); AscendC::DataCopy(src0Local, src0Global[TENSOR_LENGTH], TENSOR_LENGTH); AscendC::DataCopy(src1Local, src1Global[TENSOR_LENGTH], TENSOR_LENGTH); srcQue0.EnQue(src0Local); @@ -99,37 +100,39 @@ class TbufPoolImpl { } __aicore__ inline void Compute1() { - AscendC::LocalTensor src0Local = srcQue0.DeQue(); - AscendC::LocalTensor src1Local = srcQue2.DeQue(); - AscendC::LocalTensor dstLocal = dstQue1.AllocTensor(); + AscendC::LocalTensor src0Local = srcQue0.DeQue(); + AscendC::LocalTensor src1Local = srcQue2.DeQue(); + AscendC::LocalTensor dstLocal = dstQue1.AllocTensor(); AscendC::Add(dstLocal, src0Local, src1Local, 16384); - dstQue1.EnQue(dstLocal); + dstQue1.EnQue(dstLocal); srcQue0.FreeTensor(src0Local); srcQue2.FreeTensor(src1Local); } __aicore__ inline void CopyOut1() { - AscendC::LocalTensor dstLocal = dstQue1.DeQue(); + AscendC::LocalTensor dstLocal = dstQue1.DeQue(); AscendC::DataCopy(dstGlobal[16384], dstLocal, 16384); dstQue1.FreeTensor(dstLocal); } private: - AscendC::TPipe pipe; + AscendC::TPipe* pipe; 
AscendC::TBufPool tbufPool0, tbufPool1, tbufPool2; AscendC::TQue srcQue0, srcQue1, srcQue2; AscendC::TQue dstQue0, dstQue1; - AscendC::GlobalTensor src0Global, src1Global, dstGlobal; - private: - AscendC::TPipe* pipe; - AscendC::TBufPool tbufPool0; - AscendC::TBufPool tbufPool1; - AscendC::TQue srcQue0; - AscendC::TQue srcQue1; - AscendC::TQue dstQue0; - AscendC::GlobalTensor src0Global; - AscendC::GlobalTensor src1Global; - AscendC::GlobalTensor dstGlobal; + AscendC::GlobalTensor src0Global, src1Global, dstGlobal; uint32_t totalLength = 0; + + // private: + // AscendC::TPipe* pipe; + // AscendC::TBufPool tbufPool0; + // AscendC::TBufPool tbufPool1; + // AscendC::TQue srcQue0; + // AscendC::TQue srcQue1; + // AscendC::TQue dstQue0; + // AscendC::GlobalTensor src0Global; + // AscendC::GlobalTensor src1Global; + // AscendC::GlobalTensor dstGlobal; + // uint32_t totalLength = 0; }; }// namespace MyCustomKernel -- Gitee From eb8bd353aa5a601e47a1920185ed6e21ce2f82c6 Mon Sep 17 00:00:00 2001 From: alpaca12345uuu Date: Sat, 7 Jun 2025 22:53:58 +0800 Subject: [PATCH 06/94] =?UTF-8?q?=E4=BB=A3=E7=A0=81=E4=BF=AE=E6=94=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h b/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h index bdc5a7827..857af6a78 100644 --- a/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h +++ b/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h @@ -103,7 +103,7 @@ class TbufPoolImpl { AscendC::LocalTensor src0Local = srcQue0.DeQue(); AscendC::LocalTensor src1Local = srcQue2.DeQue(); AscendC::LocalTensor dstLocal = dstQue1.AllocTensor(); - AscendC::Add(dstLocal, src0Local, src1Local, 16384); + AscendC::Add(dstLocal, src0Local, src1Local, TENSOR_LENGTH); 
dstQue1.EnQue(dstLocal); srcQue0.FreeTensor(src0Local); srcQue2.FreeTensor(src1Local); @@ -111,7 +111,7 @@ class TbufPoolImpl { __aicore__ inline void CopyOut1() { AscendC::LocalTensor dstLocal = dstQue1.DeQue(); - AscendC::DataCopy(dstGlobal[16384], dstLocal, 16384); + AscendC::DataCopy(dstGlobal[TENSOR_LENGTH], dstLocal, TENSOR_LENGTH); dstQue1.FreeTensor(dstLocal); } private: -- Gitee From 21902516017c2b2c4a9d68f5ef050c0c8aec0fb6 Mon Sep 17 00:00:00 2001 From: alpaca12345uuu Date: Sat, 7 Jun 2025 22:55:39 +0800 Subject: [PATCH 07/94] =?UTF-8?q?=E4=BB=A3=E7=A0=81=E4=BF=AE=E6=94=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h | 1 + 1 file changed, 1 insertion(+) diff --git a/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h b/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h index 857af6a78..6849b7a1a 100644 --- a/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h +++ b/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h @@ -112,6 +112,7 @@ class TbufPoolImpl { { AscendC::LocalTensor dstLocal = dstQue1.DeQue(); AscendC::DataCopy(dstGlobal[TENSOR_LENGTH], dstLocal, TENSOR_LENGTH); + AscendC::DumpTensor(dstGlobal, 1, 10); dstQue1.FreeTensor(dstLocal); } private: -- Gitee From bdb71072a1400784f42e2c590d01407bc7263332 Mon Sep 17 00:00:00 2001 From: alpaca12345uuu Date: Sat, 7 Jun 2025 22:58:46 +0800 Subject: [PATCH 08/94] =?UTF-8?q?=E4=BB=A3=E7=A0=81=E4=BF=AE=E6=94=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- operator/ascendc/2_features/2_tbufpool/main.cpp | 2 +- .../ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/operator/ascendc/2_features/2_tbufpool/main.cpp b/operator/ascendc/2_features/2_tbufpool/main.cpp index 565a21df4..f6c0a9a02 100644 --- 
a/operator/ascendc/2_features/2_tbufpool/main.cpp +++ b/operator/ascendc/2_features/2_tbufpool/main.cpp @@ -51,7 +51,7 @@ static bool CompareResult(const void *outputData, int64_t outSize) { float ae = std::abs(a - b); float re = ae / abs(b); if (ae > EPS && re > EPS) { - printf("CompareResult failed output is %lf, golden is %lf\n", a, b); + //printf("CompareResult failed output is %lf, golden is %lf\n", a, b); wrongNum++; } } diff --git a/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h b/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h index 6849b7a1a..a87758898 100644 --- a/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h +++ b/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h @@ -87,6 +87,7 @@ class TbufPoolImpl { { AscendC::LocalTensor dstLocal = dstQue0.DeQue(); AscendC::DataCopy(dstGlobal, dstLocal, TENSOR_LENGTH); + AscendC::DumpTensor(dstGlobal, 2, 10); dstQue0.FreeTensor(dstLocal); } __aicore__ inline void CopyIn1() -- Gitee From dedf5edc83295cfc20ee6730348bf4ca5b76caca Mon Sep 17 00:00:00 2001 From: alpaca12345uuu Date: Sat, 7 Jun 2025 23:02:06 +0800 Subject: [PATCH 09/94] =?UTF-8?q?=E4=BB=A3=E7=A0=81=E4=BF=AE=E6=94=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h b/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h index a87758898..5f195436d 100644 --- a/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h +++ b/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h @@ -105,6 +105,7 @@ class TbufPoolImpl { AscendC::LocalTensor src1Local = srcQue2.DeQue(); AscendC::LocalTensor dstLocal = dstQue1.AllocTensor(); AscendC::Add(dstLocal, src0Local, src1Local, TENSOR_LENGTH); + AscendC::DumpTensor(dstLobal, 
2, 10); dstQue1.EnQue(dstLocal); srcQue0.FreeTensor(src0Local); srcQue2.FreeTensor(src1Local); @@ -113,7 +114,7 @@ class TbufPoolImpl { { AscendC::LocalTensor dstLocal = dstQue1.DeQue(); AscendC::DataCopy(dstGlobal[TENSOR_LENGTH], dstLocal, TENSOR_LENGTH); - AscendC::DumpTensor(dstGlobal, 1, 10); + AscendC::DumpTensor(dstGlobal, 3, 10); dstQue1.FreeTensor(dstLocal); } private: -- Gitee From cd46e4f1335327749216327a96cae7ea45d884a5 Mon Sep 17 00:00:00 2001 From: alpaca12345uuu Date: Sat, 7 Jun 2025 23:02:32 +0800 Subject: [PATCH 10/94] =?UTF-8?q?=E4=BB=A3=E7=A0=81=E4=BF=AE=E6=94=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h b/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h index 5f195436d..4a0f2b648 100644 --- a/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h +++ b/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h @@ -105,7 +105,7 @@ class TbufPoolImpl { AscendC::LocalTensor src1Local = srcQue2.DeQue(); AscendC::LocalTensor dstLocal = dstQue1.AllocTensor(); AscendC::Add(dstLocal, src0Local, src1Local, TENSOR_LENGTH); - AscendC::DumpTensor(dstLobal, 2, 10); + AscendC::DumpTensor(dstLocal, 2, 10); dstQue1.EnQue(dstLocal); srcQue0.FreeTensor(src0Local); srcQue2.FreeTensor(src1Local); -- Gitee From f28ab6b6eb0959832d7a2f290f98dd9a747b2459 Mon Sep 17 00:00:00 2001 From: alpaca12345uuu Date: Sat, 7 Jun 2025 23:05:51 +0800 Subject: [PATCH 11/94] =?UTF-8?q?=E4=BB=A3=E7=A0=81=E4=BF=AE=E6=94=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../2_features/2_tbufpool/op_kernel/tbufpool_custom.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git 
a/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h b/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h index 4a0f2b648..5573506a4 100644 --- a/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h +++ b/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h @@ -48,14 +48,14 @@ class TbufPoolImpl { // tbufPool1.Reset(); // } // tbufPool0.Reset(); - tbufPool1.InitBuffer(srcQue1, 1, 32768); - tbufPool1.InitBuffer(dstQue0, 1, 32768); + tbufPool1.InitBuffer(srcQue1, BUFFER_NUM, INIT_TENSOR_LENGTH); + tbufPool1.InitBuffer(dstQue0, BUFFER_NUM, INIT_TENSOR_LENGTH); CopyIn(); Compute(); CopyOut(); tbufPool1.Reset(); - tbufPool2.InitBuffer(srcQue2, 1, 32768); - tbufPool2.InitBuffer(dstQue1, 1, 32768); + tbufPool2.InitBuffer(srcQue2, BUFFER_NUM, INIT_TENSOR_LENGTH); + tbufPool2.InitBuffer(dstQue1, BUFFER_NUM, INIT_TENSOR_LENGTH); CopyIn1(); Compute1(); CopyOut1(); -- Gitee From 499484f174f4675c0e82516534d091fc18db3756 Mon Sep 17 00:00:00 2001 From: alpaca12345uuu Date: Sat, 7 Jun 2025 23:06:54 +0800 Subject: [PATCH 12/94] =?UTF-8?q?=E4=BB=A3=E7=A0=81=E4=BF=AE=E6=94=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h b/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h index 5573506a4..ec02be647 100644 --- a/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h +++ b/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h @@ -78,7 +78,7 @@ class TbufPoolImpl { AscendC::LocalTensor src1Local = srcQue1.DeQue(); AscendC::LocalTensor dstLocal = dstQue0.AllocTensor(); AscendC::Add(dstLocal, src0Local, src1Local, TENSOR_LENGTH); - + AscendC::DumpTensor(dstLocal, 2, 10); dstQue0.EnQue(dstLocal); srcQue0.FreeTensor(src0Local); 
srcQue1.FreeTensor(src1Local); -- Gitee From 95b8d80b3256fda9820ef1e648f77aa94a8221ff Mon Sep 17 00:00:00 2001 From: alpaca12345uuu Date: Sat, 7 Jun 2025 23:17:33 +0800 Subject: [PATCH 13/94] =?UTF-8?q?=E4=BB=A3=E7=A0=81=E4=BF=AE=E6=94=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h b/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h index ec02be647..3171a1e21 100644 --- a/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h +++ b/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h @@ -69,6 +69,9 @@ class TbufPoolImpl { AscendC::LocalTensor src1Local = srcQue1.AllocTensor(); AscendC::DataCopy(src0Local, src0Global, TENSOR_LENGTH); AscendC::DataCopy(src1Local, src1Global, TENSOR_LENGTH); + AscendC::DumpTensor(src0Local, 2, 10); + AscendC::DumpTensor(src1Local, 2, 10); + srcQue0.EnQue(src0Local); srcQue1.EnQue(src1Local); } -- Gitee From b33a3dcd50265ae9b83b5e8437e723d2cd570b49 Mon Sep 17 00:00:00 2001 From: alpaca12345uuu Date: Sat, 7 Jun 2025 23:41:54 +0800 Subject: [PATCH 14/94] =?UTF-8?q?=E4=BB=A3=E7=A0=81=E4=BF=AE=E6=94=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h | 1 + 1 file changed, 1 insertion(+) diff --git a/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h b/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h index 3171a1e21..b0e6ab341 100644 --- a/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h +++ b/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h @@ -70,6 +70,7 @@ class TbufPoolImpl { AscendC::DataCopy(src0Local, src0Global, TENSOR_LENGTH); AscendC::DataCopy(src1Local, src1Global, TENSOR_LENGTH); 
AscendC::DumpTensor(src0Local, 2, 10); + printf("abcdefghigklmnop"); AscendC::DumpTensor(src1Local, 2, 10); srcQue0.EnQue(src0Local); -- Gitee From c5d5fafa32aab1cf318b61ef489aec05d74d5d18 Mon Sep 17 00:00:00 2001 From: alpaca12345uuu Date: Sat, 7 Jun 2025 23:42:36 +0800 Subject: [PATCH 15/94] =?UTF-8?q?=E4=BB=A3=E7=A0=81=E4=BF=AE=E6=94=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h b/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h index b0e6ab341..923abd94a 100644 --- a/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h +++ b/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h @@ -70,7 +70,7 @@ class TbufPoolImpl { AscendC::DataCopy(src0Local, src0Global, TENSOR_LENGTH); AscendC::DataCopy(src1Local, src1Global, TENSOR_LENGTH); AscendC::DumpTensor(src0Local, 2, 10); - printf("abcdefghigklmnop"); + AscendC::printf("abcdefghigklmnop"); AscendC::DumpTensor(src1Local, 2, 10); srcQue0.EnQue(src0Local); -- Gitee From 4fb717007155621216326394fa50a31743597a86 Mon Sep 17 00:00:00 2001 From: alpaca12345uuu Date: Sat, 7 Jun 2025 23:47:44 +0800 Subject: [PATCH 16/94] =?UTF-8?q?=E4=BB=A3=E7=A0=81=E4=BF=AE=E6=94=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h b/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h index 923abd94a..8444850a8 100644 --- a/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h +++ b/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h @@ -68,9 +68,11 @@ class TbufPoolImpl { 
AscendC::LocalTensor src0Local = srcQue0.AllocTensor(); AscendC::LocalTensor src1Local = srcQue1.AllocTensor(); AscendC::DataCopy(src0Local, src0Global, TENSOR_LENGTH); + AscendC::printf("abcdefghigklmnop"); + AscendC::DataCopy(src1Local, src1Global, TENSOR_LENGTH); AscendC::DumpTensor(src0Local, 2, 10); - AscendC::printf("abcdefghigklmnop"); + //AscendC::printf("abcdefghigklmnop"); AscendC::DumpTensor(src1Local, 2, 10); srcQue0.EnQue(src0Local); -- Gitee From b610328aaf546a2a831287d179b64e0dbe3f932a Mon Sep 17 00:00:00 2001 From: alpaca12345uuu Date: Sat, 7 Jun 2025 23:49:28 +0800 Subject: [PATCH 17/94] =?UTF-8?q?=E4=BB=A3=E7=A0=81=E4=BF=AE=E6=94=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h b/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h index 8444850a8..e286bf030 100644 --- a/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h +++ b/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h @@ -67,6 +67,8 @@ class TbufPoolImpl { { AscendC::LocalTensor src0Local = srcQue0.AllocTensor(); AscendC::LocalTensor src1Local = srcQue1.AllocTensor(); + AscendC::DumpTensor(src0Global, 2, 10); + AscendC::DataCopy(src0Local, src0Global, TENSOR_LENGTH); AscendC::printf("abcdefghigklmnop"); -- Gitee From b81dc974b1dc9a2b86e77152031a7ce95abd9ebc Mon Sep 17 00:00:00 2001 From: alpaca12345uuu Date: Sat, 7 Jun 2025 23:50:28 +0800 Subject: [PATCH 18/94] =?UTF-8?q?=E4=BB=A3=E7=A0=81=E4=BF=AE=E6=94=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h 
b/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h index e286bf030..7693d431c 100644 --- a/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h +++ b/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h @@ -48,6 +48,8 @@ class TbufPoolImpl { // tbufPool1.Reset(); // } // tbufPool0.Reset(); + AscendC::DumpTensor(src0Global, 2, 10); + tbufPool1.InitBuffer(srcQue1, BUFFER_NUM, INIT_TENSOR_LENGTH); tbufPool1.InitBuffer(dstQue0, BUFFER_NUM, INIT_TENSOR_LENGTH); CopyIn(); -- Gitee From 25f12d0da7f45817d7acd3f8de11450a679693de Mon Sep 17 00:00:00 2001 From: alpaca12345uuu Date: Sun, 8 Jun 2025 00:07:27 +0800 Subject: [PATCH 19/94] =?UTF-8?q?=E4=BB=A3=E7=A0=81=E4=BF=AE=E6=94=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../2_features/2_tbufpool/op_kernel/tbufpool_custom.h | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h b/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h index 7693d431c..ba89eff73 100644 --- a/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h +++ b/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h @@ -31,6 +31,8 @@ class TbufPoolImpl { src0Global.SetGlobalBuffer((__gm__ float*)src0Gm); src1Global.SetGlobalBuffer((__gm__ float*)src1Gm); dstGlobal.SetGlobalBuffer((__gm__ float*)dstGm); + AscendC::DumpTensor(src0Global, 2, 10); + pipe->InitBufPool(tbufPool0, BUFFER_LENGTH); tbufPool0.InitBuffer(srcQue0, BUFFER_NUM, BUFF_POOL_LENGTH); // Total src0 tbufPool0.InitBufPool(tbufPool1, BUFF_POOL_LENGTH); @@ -40,14 +42,6 @@ class TbufPoolImpl { } __aicore__ inline void Process() { - // for(int i = 0; i < SPLIT_NUM; i++) - // { - // CopyIn(i); - // Compute(); - // CopyOut(); - // tbufPool1.Reset(); - // } - // tbufPool0.Reset(); AscendC::DumpTensor(src0Global, 2, 10); tbufPool1.InitBuffer(srcQue1, BUFFER_NUM, 
INIT_TENSOR_LENGTH); -- Gitee From 9ec39f8c65165e1d38bf23076ba44e723807bbc8 Mon Sep 17 00:00:00 2001 From: alpaca12345uuu Date: Sun, 8 Jun 2025 00:08:04 +0800 Subject: [PATCH 20/94] =?UTF-8?q?=E4=BB=A3=E7=A0=81=E4=BF=AE=E6=94=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h b/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h index ba89eff73..5f46cb933 100644 --- a/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h +++ b/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h @@ -31,7 +31,7 @@ class TbufPoolImpl { src0Global.SetGlobalBuffer((__gm__ float*)src0Gm); src1Global.SetGlobalBuffer((__gm__ float*)src1Gm); dstGlobal.SetGlobalBuffer((__gm__ float*)dstGm); - AscendC::DumpTensor(src0Global, 2, 10); + AscendC::DumpTensor(src0Global, 10, 10); pipe->InitBufPool(tbufPool0, BUFFER_LENGTH); tbufPool0.InitBuffer(srcQue0, BUFFER_NUM, BUFF_POOL_LENGTH); // Total src0 -- Gitee From 832cd7d6bbdfe7c7dee6120aba26475ff3cc7594 Mon Sep 17 00:00:00 2001 From: alpaca12345uuu Date: Sun, 8 Jun 2025 00:30:16 +0800 Subject: [PATCH 21/94] =?UTF-8?q?=E4=BB=A3=E7=A0=81=E4=BF=AE=E6=94=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../2_tbufpool/op_kernel/tbufpool_custom.h | 25 ++++++++++--------- 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h b/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h index 5f46cb933..b1710e4e3 100644 --- a/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h +++ b/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h @@ -16,11 +16,12 @@ namespace MyCustomKernel { constexpr int32_t BUFFER_NUM = 1; 
-constexpr int32_t BUFFER_LENGTH = 8192; -constexpr int32_t BUFF_POOL_LENGTH = 4096; -constexpr int32_t INIT_TENSOR_LENGTH = 512; -constexpr int32_t TENSOR_LENGTH = 128; +constexpr int32_t BUFFER_LENGTH = 4096*sizeof(float); +constexpr int32_t BUFF_POOL_LENGTH = 2048*sizeof(float); +constexpr int32_t INIT_TENSOR_LENGTH = 1024*sizeof(float); +constexpr int32_t TENSOR_LENGTH = 512*sizeof(float); constexpr int32_t SPLIT_NUM = 2; +constexpr int32_t TOTL_NUM = 2048 class TbufPoolImpl { public: __aicore__ inline TbufPoolImpl() {} @@ -65,10 +66,10 @@ class TbufPoolImpl { AscendC::LocalTensor src1Local = srcQue1.AllocTensor(); AscendC::DumpTensor(src0Global, 2, 10); - AscendC::DataCopy(src0Local, src0Global, TENSOR_LENGTH); + AscendC::DataCopy(src0Local, src0Global, INIT_TENSOR_LENGTH); AscendC::printf("abcdefghigklmnop"); - AscendC::DataCopy(src1Local, src1Global, TENSOR_LENGTH); + AscendC::DataCopy(src1Local, src1Global, INIT_TENSOR_LENGTH); AscendC::DumpTensor(src0Local, 2, 10); //AscendC::printf("abcdefghigklmnop"); AscendC::DumpTensor(src1Local, 2, 10); @@ -81,7 +82,7 @@ class TbufPoolImpl { AscendC::LocalTensor src0Local = srcQue0.DeQue(); AscendC::LocalTensor src1Local = srcQue1.DeQue(); AscendC::LocalTensor dstLocal = dstQue0.AllocTensor(); - AscendC::Add(dstLocal, src0Local, src1Local, TENSOR_LENGTH); + AscendC::Add(dstLocal, src0Local, src1Local, INIT_TENSOR_LENGTH); AscendC::DumpTensor(dstLocal, 2, 10); dstQue0.EnQue(dstLocal); srcQue0.FreeTensor(src0Local); @@ -90,7 +91,7 @@ class TbufPoolImpl { __aicore__ inline void CopyOut() { AscendC::LocalTensor dstLocal = dstQue0.DeQue(); - AscendC::DataCopy(dstGlobal, dstLocal, TENSOR_LENGTH); + AscendC::DataCopy(dstGlobal, dstLocal, INIT_TENSOR_LENGTH); AscendC::DumpTensor(dstGlobal, 2, 10); dstQue0.FreeTensor(dstLocal); } @@ -98,8 +99,8 @@ class TbufPoolImpl { { AscendC::LocalTensor src0Local = srcQue0.AllocTensor(); AscendC::LocalTensor src1Local = srcQue2.AllocTensor(); - AscendC::DataCopy(src0Local, 
src0Global[TENSOR_LENGTH], TENSOR_LENGTH); - AscendC::DataCopy(src1Local, src1Global[TENSOR_LENGTH], TENSOR_LENGTH); + AscendC::DataCopy(src0Local, src0Global[1024], INIT_TENSOR_LENGTH); + AscendC::DataCopy(src1Local, src1Global[1024], INIT_TENSOR_LENGTH); srcQue0.EnQue(src0Local); srcQue2.EnQue(src1Local); } @@ -108,7 +109,7 @@ class TbufPoolImpl { AscendC::LocalTensor src0Local = srcQue0.DeQue(); AscendC::LocalTensor src1Local = srcQue2.DeQue(); AscendC::LocalTensor dstLocal = dstQue1.AllocTensor(); - AscendC::Add(dstLocal, src0Local, src1Local, TENSOR_LENGTH); + AscendC::Add(dstLocal, src0Local, src1Local, INIT_TENSOR_LENGTH); AscendC::DumpTensor(dstLocal, 2, 10); dstQue1.EnQue(dstLocal); srcQue0.FreeTensor(src0Local); @@ -117,7 +118,7 @@ class TbufPoolImpl { __aicore__ inline void CopyOut1() { AscendC::LocalTensor dstLocal = dstQue1.DeQue(); - AscendC::DataCopy(dstGlobal[TENSOR_LENGTH], dstLocal, TENSOR_LENGTH); + AscendC::DataCopy(dstGlobal[1024], dstLocal, INIT_TENSOR_LENGTH); AscendC::DumpTensor(dstGlobal, 3, 10); dstQue1.FreeTensor(dstLocal); } -- Gitee From e29065e3df0fecd1216c8b6c2a712a39db2c6b71 Mon Sep 17 00:00:00 2001 From: alpaca12345uuu Date: Sun, 8 Jun 2025 00:30:43 +0800 Subject: [PATCH 22/94] =?UTF-8?q?=E4=BB=A3=E7=A0=81=E4=BF=AE=E6=94=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h b/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h index b1710e4e3..4f477fec2 100644 --- a/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h +++ b/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h @@ -21,7 +21,7 @@ constexpr int32_t BUFF_POOL_LENGTH = 2048*sizeof(float); constexpr int32_t INIT_TENSOR_LENGTH = 1024*sizeof(float); constexpr int32_t TENSOR_LENGTH = 
512*sizeof(float); constexpr int32_t SPLIT_NUM = 2; -constexpr int32_t TOTL_NUM = 2048 +constexpr int32_t TOTL_NUM = 2048; class TbufPoolImpl { public: __aicore__ inline TbufPoolImpl() {} -- Gitee From 44b4956a1791d69509345990b3a22e979fd8ea35 Mon Sep 17 00:00:00 2001 From: alpaca12345uuu Date: Sun, 8 Jun 2025 00:41:07 +0800 Subject: [PATCH 23/94] =?UTF-8?q?=E4=BB=A3=E7=A0=81=E4=BF=AE=E6=94=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- operator/ascendc/2_features/2_tbufpool/main.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/operator/ascendc/2_features/2_tbufpool/main.cpp b/operator/ascendc/2_features/2_tbufpool/main.cpp index f6c0a9a02..565a21df4 100644 --- a/operator/ascendc/2_features/2_tbufpool/main.cpp +++ b/operator/ascendc/2_features/2_tbufpool/main.cpp @@ -51,7 +51,7 @@ static bool CompareResult(const void *outputData, int64_t outSize) { float ae = std::abs(a - b); float re = ae / abs(b); if (ae > EPS && re > EPS) { - //printf("CompareResult failed output is %lf, golden is %lf\n", a, b); + printf("CompareResult failed output is %lf, golden is %lf\n", a, b); wrongNum++; } } -- Gitee From 87f5812c0c20924e61e7a4963075bbcb48ad3ae9 Mon Sep 17 00:00:00 2001 From: alpaca12345uuu Date: Tue, 3 Jun 2025 22:05:06 +0800 Subject: [PATCH 24/94] add tbufpool sample --- .../2_features/2_tbufpool/CMakeLists.txt | 76 +++++++ .../ascendc/2_features/2_tbufpool/README.md | 79 ++++++- .../2_features/2_tbufpool/cmake/cpu_lib.cmake | 26 +++ .../2_features/2_tbufpool/cmake/npu_lib.cmake | 12 + .../2_features/2_tbufpool/data_utils.h | 211 ++++++++++++++++++ .../ascendc/2_features/2_tbufpool/main.cpp | 170 ++++++++++++++ .../op_host/tbufpool_custom_tiling.cpp | 19 ++ .../op_host/tbufpool_custom_tiling.h | 18 ++ .../2_tbufpool/op_kernel/tbufpool_custom.cpp | 20 ++ .../2_tbufpool/op_kernel/tbufpool_custom.h | 148 ++++++++++++ operator/ascendc/2_features/2_tbufpool/run.sh | 58 +++++ 
.../2_features/2_tbufpool/scripts/gen_data.py | 31 +++ operator/ascendc/2_features/README.md | 1 + 13 files changed, 868 insertions(+), 1 deletion(-) create mode 100644 operator/ascendc/2_features/2_tbufpool/CMakeLists.txt create mode 100644 operator/ascendc/2_features/2_tbufpool/cmake/cpu_lib.cmake create mode 100644 operator/ascendc/2_features/2_tbufpool/cmake/npu_lib.cmake create mode 100644 operator/ascendc/2_features/2_tbufpool/data_utils.h create mode 100644 operator/ascendc/2_features/2_tbufpool/main.cpp create mode 100644 operator/ascendc/2_features/2_tbufpool/op_host/tbufpool_custom_tiling.cpp create mode 100644 operator/ascendc/2_features/2_tbufpool/op_host/tbufpool_custom_tiling.h create mode 100644 operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.cpp create mode 100644 operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h create mode 100644 operator/ascendc/2_features/2_tbufpool/run.sh create mode 100644 operator/ascendc/2_features/2_tbufpool/scripts/gen_data.py diff --git a/operator/ascendc/2_features/2_tbufpool/CMakeLists.txt b/operator/ascendc/2_features/2_tbufpool/CMakeLists.txt new file mode 100644 index 000000000..cba0e5e41 --- /dev/null +++ b/operator/ascendc/2_features/2_tbufpool/CMakeLists.txt @@ -0,0 +1,76 @@ +# Copyright (c) 2025 Huawei Technologies Co., Ltd. 
+# ====================================================================================================================== + +cmake_minimum_required(VERSION 3.16) +project(Ascend_c) +if(${RUN_MODE}) + set(RUN_MODE "npu" CACHE STRING "cpu/sim/npu") +endif() +if (${SOC_VERSION}) + set(SOC_VERSION "Ascend910" CACHE STRING "system on chip type") +endif() + +set(ASCEND_CANN_PACKAGE_PATH "~/Ascend/ascend-toolkit/latest" CACHE STRING "ASCEND CANN package installation directory") + +if(NOT CMAKE_BUILD_TYPE) + set(CMAKE_BUILD_TYPE "Debug" CACHE STRING "Build type Release/Debug (default Debug)" FORCE) +endif() + +if(CMAKE_INSTALL_PREFIX STREQUAL /usr/local) + set(CMAKE_INSTALL_PREFIX "${CMAKE_CURRENT_LIST_DIR}/out" CACHE STRING "path for install()" FORCE) +endif() + +file(GLOB KERNEL_FILES + ${CMAKE_CURRENT_SOURCE_DIR}/op_kernel/tbufpool_custom.cpp +) +set(CUSTOM_ASCEND310P_LIST "Ascend310P1" "Ascend310P3") + +if("${RUN_MODE}" STREQUAL "cpu") + include(cmake/cpu_lib.cmake) +elseif("${RUN_MODE}" STREQUAL "sim" OR "${RUN_MODE}" STREQUAL "npu") + include(cmake/npu_lib.cmake) +else() + message("invalid RUN_MODE: ${RUN_MODE}") +endif() + +add_executable(tbufpool_direct_kernel_op + ${CMAKE_CURRENT_SOURCE_DIR}/main.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/op_host/tbufpool_custom_tiling.cpp +) + +target_compile_options(tbufpool_direct_kernel_op PRIVATE + $:-g>> + -O2 + -std=c++17 + -D_GLIBCXX_USE_CXX11_ABI=0 +) + +target_compile_definitions(tbufpool_direct_kernel_op PRIVATE + $<$>:CUSTOM_ASCEND310P> +) + +target_include_directories(tbufpool_direct_kernel_op PRIVATE + ${CMAKE_CURRENT_SOURCE_DIR} + $:${ASCEND_CANN_PACKAGE_PATH}/include>> + $:${ASCEND_CANN_PACKAGE_PATH}/runtime/include>> +) + +target_link_libraries(tbufpool_direct_kernel_op PRIVATE + $,$>:host_intf_pub>> + $:tikicpulib::${SOC_VERSION}>> + $:ascendcl>> + $:c_sec>> + ascendc_kernels_${RUN_MODE} + tiling_api + register + platform + ascendalog + dl + graph_base +) + +install(TARGETS tbufpool_direct_kernel_op + LIBRARY DESTINATION 
${CMAKE_INSTALL_LIBDIR} + ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} + RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} +) \ No newline at end of file diff --git a/operator/ascendc/2_features/2_tbufpool/README.md b/operator/ascendc/2_features/2_tbufpool/README.md index 5af80e6c5..25b8f4ab5 100644 --- a/operator/ascendc/2_features/2_tbufpool/README.md +++ b/operator/ascendc/2_features/2_tbufpool/README.md @@ -1 +1,78 @@ -tbufpool(待补充) \ No newline at end of file + +## 目录结构介绍 +``` +├── 22_tbufpool_kernellaunch +│ ├── cmake // 编译工程文件 +│ ├── op_host // 本样例tiling代码实现 +│ ├── op_kernel //本样例kernel侧代码实现 +│ ├── scripts +│ │ ├── gen_data.py // 输入数据和真值数据生成脚本 +│ ├── CMakeLists.txt // 编译工程文件 +│ ├── data_utils.h // 数据读入写出函数 +│ ├── main.cpp // 主函数,调用算子的应用程序,含CPU域及NPU域调用 +│ └── run.sh // 编译运行算子的脚本 +``` +## 代码实现介绍 +数据量较大且内存有限时,无法一次完成所有数据搬运,需要拆分成多个阶段计算,每次计算使用其中的一部分数据,可以通过TBufPool资源池进行内存地址复用。本例中,通过调用InitBufPool基础API对Add算子实现过程进行内存管理。从Tpipe划分出资源池tbufPool0,tbufPool0为src0Gm分配空间后,继续分配了资源池tbufPool1,指定tbufPool1与tbufPool2复用并分别运用于第一、二轮计算,此时tbufPool1及tbufPool2共享起始地址及长度。 + +- kernel实现 + Add算子的数学表达式为: + ``` + z = x + y + ``` + 计算逻辑是:Ascend C提供的矢量计算接口的操作元素都为LocalTensor,首先将部分输入数据src0Gm,部分输入数据src1Gm搬运进片上储存,调用计算接口完成相加计算,搬出到外部存储上。之后再将剩余输入数据搬运进片上储存,调用计算接口完成相加计算,得到最终结果,再搬出到外部存储上。 + + Add算子的实现流程分为6个基本任务:CopyIn,Compute,CopyOut,CopyIn1,Compute1,CopyOut1。 + - CopyIn任务负责将Global Memory上的部分输入Tensor src0Gm和src1Gm搬运到Local Memory,分别存储在srcoLocal、src1Local; + - Compute任务负责对src0Local、src1Local执行加法操作,计算结果存储在dstLocal中; + - CopyOut任务负责将输出数据从dstLocal搬运至Global Memory上的输出Tensor dstGm中。 + - CopyIn1任务负责将Global Memory上的剩余输入Tensor src0Gm和src1Gm搬运到Local Memory,分别存储在src0Local、src1Local; + - Compute1任务负责对src0Local、src1Local执行加法操作,计算结果存储在dstLocal中; + - CopyOut1任务负责将输出数据从dstLocal搬运至Global Memory上的输出Tensor dstGm中。 + +- 调用实现 + 1. CPU侧运行验证主要通过ICPU_RUN_KF CPU调测宏等CPU调测库提供的接口来完成; + 2. 
NPU侧运行验证主要通过使用ACLRT_LAUNCH_KERNEL内核调用宏来完成。 + + 应用程序通过ASCENDC_CPU_DEBUG 宏区分代码逻辑运行于CPU侧还是NPU侧。 + +## 运行样例算子 + - 打开样例目录 + 以命令行方式下载样例代码,master分支为例。 + ```bash + cd ${git_clone_path}/samples/operator/ascendc/0_introduction/22_tbufpool_kernellaunch + ``` + - 配置环境变量 + + 请根据当前环境上CANN开发套件包的[安装方式](https://hiascend.com/document/redirect/CannCommunityInstSoftware),选择对应配置环境变量的命令。 + - 默认路径,root用户安装CANN软件包 + ```bash + export ASCEND_INSTALL_PATH=/usr/local/Ascend/ascend-toolkit/latest + ``` + - 默认路径,非root用户安装CANN软件包 + ```bash + export ASCEND_INSTALL_PATH=$HOME/Ascend/ascend-toolkit/latest + ``` + - 指定路径install_path,安装CANN软件包 + ```bash + export ASCEND_INSTALL_PATH=${install_path}/ascend-toolkit/latest + ``` + + 配置仿真模式日志文件目录,默认为sim_log。 + ```bash + export CAMODEL_LOG_PATH=./sim_log + ``` + + - 样例执行 + + ```bash + bash run.sh -r [RUN_MODE] -v [SOC_VERSION] + ``` + - RUN_MODE:编译方式,可选择CPU调试,NPU仿真,NPU上板。支持参数为[cpu / sim / npu]。 + - SOC_VERSION:昇腾AI处理器型号,如果无法确定具体的[SOC_VERSION],则在安装昇腾AI处理器的服务器执行npu-smi info命令进行查询,在查询到的“Name”前增加Ascend信息,例如“Name”对应取值为xxxyy,实际配置的[SOC_VERSION]值为Ascendxxxyy。支持以下产品型号: + - Atlas A2训练系列产品/Atlas 800I A2推理产品 + + 示例如下,Ascendxxxyy请替换为实际的AI处理器型号。 + ```bash + bash run.sh -r cpu -v Ascendxxxyy + ``` \ No newline at end of file diff --git a/operator/ascendc/2_features/2_tbufpool/cmake/cpu_lib.cmake b/operator/ascendc/2_features/2_tbufpool/cmake/cpu_lib.cmake new file mode 100644 index 000000000..693f15ac1 --- /dev/null +++ b/operator/ascendc/2_features/2_tbufpool/cmake/cpu_lib.cmake @@ -0,0 +1,26 @@ +if(NOT DEFINED ENV{CMAKE_PREFIX_PATH}) + set(CMAKE_PREFIX_PATH ${ASCEND_CANN_PACKAGE_PATH}/tools/tikicpulib/lib/cmake) +endif() +find_package(tikicpulib REQUIRED) + +add_library(ascendc_kernels_${RUN_MODE} SHARED + ${KERNEL_FILES} +) + +target_link_libraries(ascendc_kernels_${RUN_MODE} PRIVATE + tikicpulib::${SOC_VERSION} +) + +target_compile_definitions(ascendc_kernels_${RUN_MODE} PRIVATE + $<$>:CUSTOM_ASCEND310P> +) + +target_compile_options(ascendc_kernels_${RUN_MODE} 
PRIVATE + -g + -O0 + -std=c++17 +) + +install(TARGETS ascendc_kernels_${RUN_MODE} +DESTINATION ${CMAKE_INSTALL_LIBDIR} +) \ No newline at end of file diff --git a/operator/ascendc/2_features/2_tbufpool/cmake/npu_lib.cmake b/operator/ascendc/2_features/2_tbufpool/cmake/npu_lib.cmake new file mode 100644 index 000000000..8ad136f38 --- /dev/null +++ b/operator/ascendc/2_features/2_tbufpool/cmake/npu_lib.cmake @@ -0,0 +1,12 @@ +if(EXISTS ${ASCEND_CANN_PACKAGE_PATH}/compiler/tikcpp/ascendc_kernel_cmake) + set(ASCENDC_CMAKE_DIR ${ASCEND_CANN_PACKAGE_PATH}/compiler/tikcpp/ascendc_kernel_cmake) +elseif(EXISTS ${ASCEND_CANN_PACKAGE_PATH}/tools/tikcpp/ascendc_kernel_cmake) + set(ASCENDC_CMAKE_DIR ${ASCEND_CANN_PACKAGE_PATH}/tools/tikcpp/ascendc_kernel_cmake) +else() + message(FATAL_ERROR "ascendc_kernel_cmake does not exist ,please check whether the cann package is installed") +endif() +include(${ASCENDC_CMAKE_DIR}/ascendc.cmake) + +ascendc_library(ascendc_kernels_${RUN_MODE} STATIC + ${KERNEL_FILES} +) \ No newline at end of file diff --git a/operator/ascendc/2_features/2_tbufpool/data_utils.h b/operator/ascendc/2_features/2_tbufpool/data_utils.h new file mode 100644 index 000000000..7980ae341 --- /dev/null +++ b/operator/ascendc/2_features/2_tbufpool/data_utils.h @@ -0,0 +1,211 @@ +/** + * Copyright (c) 2025 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. 
+ */ +#ifndef DATA_UTILS_H +#define DATA_UTILS_H +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifndef ASCENDC_CPU_DEBUG +#include "acl/acl.h" +#endif + +typedef enum { + DT_UNDEFINED = -1, + FLOAT = 0, + HALF = 1, + INT8_T = 2, + INT32_T = 3, + UINT8_T = 4, + INT16_T = 6, + UINT16_T = 7, + UINT32_T = 8, + INT64_T = 9, + UINT64_T = 10, + DOUBLE = 11, + BOOL = 12, + STRING = 13, + COMPLEX64 = 16, + COMPLEX128 = 17, + BF16 = 27 +} printDataType; + +#define INFO_LOG(fmt, args...) fprintf(stdout, "[INFO] " fmt "\n", ##args) +#define WARN_LOG(fmt, args...) fprintf(stdout, "[WARN] " fmt "\n", ##args) +#define ERROR_LOG(fmt, args...) fprintf(stdout, "[ERROR] " fmt "\n", ##args) + +#ifndef ASCENDC_CPU_DEBUG +#define CHECK_ACL(x) \ + do { \ + aclError __ret = x; \ + if (__ret != ACL_ERROR_NONE) { \ + std::cerr << __FILE__ << ":" << __LINE__ << " aclError:" << __ret << std::endl; \ + } \ + } while (0); +#endif + +/** +* @brief Read data from file +* @param [in] filePath: file path +* @param [out] fileSize: file size +* @return read result +*/ +bool ReadFile(const std::string &filePath, size_t &fileSize, void *buffer, size_t bufferSize) +{ + struct stat sBuf; + int fileStatus = stat(filePath.data(), &sBuf); + if (fileStatus == -1) { + ERROR_LOG("failed to get file"); + return false; + } + if (S_ISREG(sBuf.st_mode) == 0) { + ERROR_LOG("%s is not a file, please enter a file", filePath.c_str()); + return false; + } + + std::ifstream file; + file.open(filePath, std::ios::binary); + if (!file.is_open()) { + ERROR_LOG("Open file failed. 
path = %s", filePath.c_str()); + return false; + } + + std::filebuf *buf = file.rdbuf(); + size_t size = buf->pubseekoff(0, std::ios::end, std::ios::in); + if (size == 0) { + ERROR_LOG("file size is 0"); + file.close(); + return false; + } + if (size > bufferSize) { + ERROR_LOG("file size is larger than buffer size"); + file.close(); + return false; + } + buf->pubseekpos(0, std::ios::in); + buf->sgetn(static_cast(buffer), size); + fileSize = size; + file.close(); + return true; +} + +/** +* @brief Write data to file +* @param [in] filePath: file path +* @param [in] buffer: data to write to file +* @param [in] size: size to write +* @return write result +*/ +bool WriteFile(const std::string &filePath, const void *buffer, size_t size) +{ + if (buffer == nullptr) { + ERROR_LOG("Write file failed. buffer is nullptr"); + return false; + } + + int fd = open(filePath.c_str(), O_RDWR | O_CREAT | O_TRUNC, S_IRUSR | S_IWRITE); + if (fd < 0) { + ERROR_LOG("Open file failed. path = %s", filePath.c_str()); + return false; + } + + auto writeSize = write(fd, buffer, size); + (void) close(fd); + if (writeSize != size) { + ERROR_LOG("Write file Failed."); + return false; + } + + return true; +} + +template +void DoPrintData(const T *data, size_t count, size_t elementsPerRow) +{ + assert(elementsPerRow != 0); + for (size_t i = 0; i < count; ++i) { + std::cout << std::setw(10) << data[i]; + if (i % elementsPerRow == elementsPerRow - 1) { + std::cout << std::endl; + } + } +} + +#ifndef ASCENDC_CPU_DEBUG +void DoPrintHalfData(const aclFloat16 *data, size_t count, size_t elementsPerRow) +{ + assert(elementsPerRow != 0); + for (size_t i = 0; i < count; ++i) { + std::cout << std::setw(10) << std::setprecision(6) << aclFloat16ToFloat(data[i]); + if (i % elementsPerRow == elementsPerRow - 1) { + std::cout << std::endl; + } + } +} +#endif + +void PrintData(const void *data, size_t count, printDataType dataType, size_t elementsPerRow=16) +{ + if (data == nullptr) { + ERROR_LOG("Print data 
failed. data is nullptr"); + return; + } + + switch (dataType) { + case BOOL: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case INT8_T: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case UINT8_T: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case INT16_T: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case UINT16_T: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case INT32_T: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case UINT32_T: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case INT64_T: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case UINT64_T: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; +#ifndef ASCENDC_CPU_DEBUG + case HALF: + DoPrintHalfData(reinterpret_cast(data), count, elementsPerRow); + break; +#endif + case FLOAT: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case DOUBLE: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + default: + ERROR_LOG("Unsupported type: %d", dataType); + } + std::cout << std::endl; +} +#endif // EXAMPLES_COMMON_DATA_UTILS_H diff --git a/operator/ascendc/2_features/2_tbufpool/main.cpp b/operator/ascendc/2_features/2_tbufpool/main.cpp new file mode 100644 index 000000000..565a21df4 --- /dev/null +++ b/operator/ascendc/2_features/2_tbufpool/main.cpp @@ -0,0 +1,170 @@ +/* + * Copyright (c) 2025 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. 
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ + +#include "data_utils.h" +#include "./op_host/tbufpool_custom_tiling.h" +#ifndef ASCENDC_CPU_DEBUG +#include "acl/acl.h" +#include "aclrtlaunch_tbufpool_custom.h" +#include "tiling/platform/platform_ascendc.h" +#else +#include "tikicpulib.h" +extern "C" __global__ __aicore__ void tbufpool_custom(GM_ADDR x, GM_ADDR y, GM_ADDR z, TbufPoolTilingData tiling); +#endif + +namespace { +constexpr uint32_t USED_CORE_NUM = 1; +constexpr uint32_t TOTAL_LENGTH = 4096; +constexpr uint32_t TILING_SIZE = 1; +} + +extern void GenerateTilingData(const uint32_t totalLength, uint8_t *tilingBuf); + +static bool CompareResult(const void *outputData, int64_t outSize) { + void *goldenData; +#ifdef ASCENDC_CPU_DEBUG + goldenData = (uint8_t *)AscendC::GmAlloc(outSize); +#else + CHECK_ACL(aclrtMallocHost((void **)(&goldenData), outSize)); +#endif + size_t goldenSize = outSize; + bool ret = ReadFile("../output/golden.bin", goldenSize, goldenData, goldenSize); + if (ret) { + printf("ReadFile golden.bin success!\n"); + } else { + printf("test failed!\n"); + return false; + } + constexpr float EPS = 1e-4; + int64_t wrongNum = 0; + + for (int i = 0; i < outSize / sizeof(float); i++) { + float a = (reinterpret_cast(outputData))[i]; + float b = (reinterpret_cast(goldenData))[i]; + float ae = std::abs(a - b); + float re = ae / abs(b); + if (ae > EPS && re > EPS) { + printf("CompareResult failed output is %lf, golden is %lf\n", a, b); + wrongNum++; + } + } +#ifdef ASCENDC_CPU_DEBUG + AscendC::GmFree((void *)goldenData); +#else + CHECK_ACL(aclrtFreeHost(goldenData)); +#endif + if (wrongNum != 0) { + return false; + } else { + printf("CompareResult golden.bin success!\n"); + return true; + } +} + 
+int32_t main(int32_t argc, char *argv[]) { + size_t tilingSize = TILING_SIZE * sizeof(uint32_t); + size_t inputSize = TOTAL_LENGTH * sizeof(float); + size_t outputSize = inputSize; + +#ifdef ASCENDC_CPU_DEBUG + uint8_t *x = (uint8_t *)AscendC::GmAlloc(inputSize); + uint8_t *y = (uint8_t *)AscendC::GmAlloc(inputSize); + uint8_t *z = (uint8_t *)AscendC::GmAlloc(outputSize); + uint8_t *tiling = (uint8_t *)AscendC::GmAlloc(tilingSize); + + ReadFile("../input/input_x.bin", inputSize, x, inputSize); + ReadFile("../input/input_y.bin", inputSize, y, inputSize); + + GenerateTilingData(TOTAL_LENGTH, tiling); + + AscendC::SetKernelMode(KernelMode::AIV_MODE); // run in aiv mode + + ICPU_RUN_KF(tbufpool_custom, USED_CORE_NUM, x, y, z, + *reinterpret_cast(tiling)); // use this macro for cpu debug + + WriteFile("../output/output.bin", z, outputSize); + + bool goldenResult = true; + goldenResult = CompareResult(z, outputSize); + + AscendC::GmFree((void *)x); + AscendC::GmFree((void *)y); + AscendC::GmFree((void *)z); + AscendC::GmFree((void *)tiling); +#else + CHECK_ACL(aclInit(nullptr)); + int32_t deviceId = 0; + CHECK_ACL(aclrtSetDevice(deviceId)); + aclrtStream stream = nullptr; + CHECK_ACL(aclrtCreateStream(&stream)); + + uint8_t *xHost; + uint8_t *yHost; + uint8_t *zHost; + uint8_t *tiling; + uint8_t *xDevice; + uint8_t *yDevice; + uint8_t *zDevice; + + CHECK_ACL(aclrtMallocHost((void **)(&xHost), inputSize)); + CHECK_ACL(aclrtMallocHost((void **)(&yHost), inputSize)); + CHECK_ACL(aclrtMallocHost((void **)(&zHost), outputSize)); + CHECK_ACL(aclrtMallocHost((void **)(&tiling), tilingSize)); + + CHECK_ACL(aclrtMalloc((void **)&xDevice, inputSize, ACL_MEM_MALLOC_HUGE_FIRST)); + CHECK_ACL(aclrtMalloc((void **)&yDevice, inputSize, ACL_MEM_MALLOC_HUGE_FIRST)); + CHECK_ACL(aclrtMalloc((void **)&zDevice, outputSize, ACL_MEM_MALLOC_HUGE_FIRST)); + + ReadFile("../input/input_x.bin", inputSize, xHost, inputSize); + ReadFile("../input/input_y.bin", inputSize, yHost, inputSize); + + 
GenerateTilingData(TOTAL_LENGTH, tiling); + + // Copy host memory to device memory + CHECK_ACL(aclrtMemcpy(xDevice, inputSize, xHost, inputSize, ACL_MEMCPY_HOST_TO_DEVICE)); + CHECK_ACL(aclrtMemcpy(yDevice, inputSize, yHost, inputSize, ACL_MEMCPY_HOST_TO_DEVICE)); + + // Execute the kernel + ACLRT_LAUNCH_KERNEL(tbufpool_custom) + (USED_CORE_NUM, stream, xDevice, yDevice, zDevice, reinterpret_cast(tiling)); + + // Wait for the stop event to complete + CHECK_ACL(aclrtSynchronizeStream(stream)); + + // Copy result to host memory and write to output file + CHECK_ACL(aclrtMemcpy(zHost, outputSize, zDevice, outputSize, ACL_MEMCPY_DEVICE_TO_HOST)); + WriteFile("../output/output.bin", zHost, outputSize); + + // Compare the result with the golden result + bool goldenResult = true; + goldenResult = CompareResult(zHost, outputSize); + + // Clean up memory + CHECK_ACL(aclrtFree(xDevice)); + CHECK_ACL(aclrtFree(yDevice)); + CHECK_ACL(aclrtFree(zDevice)); + CHECK_ACL(aclrtFreeHost(xHost)); + CHECK_ACL(aclrtFreeHost(yHost)); + CHECK_ACL(aclrtFreeHost(zHost)); + CHECK_ACL(aclrtFreeHost(tiling)); + + CHECK_ACL(aclrtDestroyStream(stream)); + CHECK_ACL(aclrtResetDevice(deviceId)); + CHECK_ACL(aclFinalize()); +#endif + + if (goldenResult) { + printf("test pass!\n"); + } else { + printf("test failed!\n"); + } + return 0; +} + \ No newline at end of file diff --git a/operator/ascendc/2_features/2_tbufpool/op_host/tbufpool_custom_tiling.cpp b/operator/ascendc/2_features/2_tbufpool/op_host/tbufpool_custom_tiling.cpp new file mode 100644 index 000000000..0bc2f1c1d --- /dev/null +++ b/operator/ascendc/2_features/2_tbufpool/op_host/tbufpool_custom_tiling.cpp @@ -0,0 +1,19 @@ +/** + * @file tbufpool_custom_tiling.cpp + * + * Copyright (C) 2024-2025. Huawei Technologies Co., Ltd. All rights reserved. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
+ */ + +#include "tiling/tiling_api.h" +#include "tbufpool_custom_tiling.h" + + +void GenerateTilingData(uint32_t totalLength, uint8_t* tilingBuf) +{ + TbufPoolTilingData *tiling = reinterpret_cast(tilingBuf); + tiling->totalLength = totalLength; +} \ No newline at end of file diff --git a/operator/ascendc/2_features/2_tbufpool/op_host/tbufpool_custom_tiling.h b/operator/ascendc/2_features/2_tbufpool/op_host/tbufpool_custom_tiling.h new file mode 100644 index 000000000..63c60d78c --- /dev/null +++ b/operator/ascendc/2_features/2_tbufpool/op_host/tbufpool_custom_tiling.h @@ -0,0 +1,18 @@ +/** + * @file tbufpool_custom_tiling.h + * + * Copyright (C) 2024-2025. Huawei Technologies Co., Ltd. All rights reserved. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + */ + +#ifndef EXAMPLES_ACTIVATION_TBUFPOOL_CUSTOM_TILING_H +#define EXAMPLES_ACTIVATION_TBUFPOOL_CUSTOM_TILING_H +#include + +struct TbufPoolTilingData { + uint32_t totalLength; +}; +#endif diff --git a/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.cpp b/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.cpp new file mode 100644 index 000000000..d17a4d185 --- /dev/null +++ b/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.cpp @@ -0,0 +1,20 @@ +/** + * @file tbufpool_custom.cpp + * + * Copyright (C) 2024-2025. Huawei Technologies Co., Ltd. All rights reserved. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
+ */ + +#include "./tbufpool_custom.h" +#include "kernel_operator.h" + +extern "C" __global__ __aicore__ void tbufpool_custom(GM_ADDR src0Gm, GM_ADDR src1Gm, GM_ADDR dstGm, TbufPoolTilingData tiling) +{ + AscendC::TPipe pipe; + MyCustomKernel::TbufPoolImpl op; + op.Init(src0Gm, src1Gm, dstGm, tiling, &pipe); + op.Process(); +} \ No newline at end of file diff --git a/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h b/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h new file mode 100644 index 000000000..4f477fec2 --- /dev/null +++ b/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h @@ -0,0 +1,148 @@ +/** + * @file tbufpool_custom.h + * + * Copyright (C) 2024-2025. Huawei Technologies Co., Ltd. All rights reserved. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + */ + +#ifndef EXAMPLES_ACTIVATION_INITBUFPOOL_CUSTOM_H +#define EXAMPLES_ACTIVATION_INITBUFPOOL_CUSTOM_H +#include "../op_host/tbufpool_custom_tiling.h" +#include "kernel_operator.h" + + +namespace MyCustomKernel { +constexpr int32_t BUFFER_NUM = 1; +constexpr int32_t BUFFER_LENGTH = 4096*sizeof(float); +constexpr int32_t BUFF_POOL_LENGTH = 2048*sizeof(float); +constexpr int32_t INIT_TENSOR_LENGTH = 1024*sizeof(float); +constexpr int32_t TENSOR_LENGTH = 512*sizeof(float); +constexpr int32_t SPLIT_NUM = 2; +constexpr int32_t TOTL_NUM = 2048; +class TbufPoolImpl { + public: + __aicore__ inline TbufPoolImpl() {} + __aicore__ inline void Init(__gm__ uint8_t* src0Gm, __gm__ uint8_t* src1Gm, __gm__ uint8_t* dstGm, TbufPoolTilingData tiling, AscendC::TPipe* pipeIn) + { + pipe = pipeIn; + totalLength = tiling.totalLength; + src0Global.SetGlobalBuffer((__gm__ float*)src0Gm); + src1Global.SetGlobalBuffer((__gm__ float*)src1Gm); + dstGlobal.SetGlobalBuffer((__gm__ float*)dstGm); + AscendC::DumpTensor(src0Global, 10, 10); 
+ + pipe->InitBufPool(tbufPool0, BUFFER_LENGTH); + tbufPool0.InitBuffer(srcQue0, BUFFER_NUM, BUFF_POOL_LENGTH); // Total src0 + tbufPool0.InitBufPool(tbufPool1, BUFF_POOL_LENGTH); + tbufPool0.InitBufPool(tbufPool2, BUFF_POOL_LENGTH, tbufPool1); + // tbufPool1.InitBuffer(srcQue1, BUFFER_NUM, INIT_TENSOR_LENGTH); + // tbufPool1.InitBuffer(dstQue0, BUFFER_NUM, INIT_TENSOR_LENGTH); + } + __aicore__ inline void Process() + { + AscendC::DumpTensor(src0Global, 2, 10); + + tbufPool1.InitBuffer(srcQue1, BUFFER_NUM, INIT_TENSOR_LENGTH); + tbufPool1.InitBuffer(dstQue0, BUFFER_NUM, INIT_TENSOR_LENGTH); + CopyIn(); + Compute(); + CopyOut(); + tbufPool1.Reset(); + tbufPool2.InitBuffer(srcQue2, BUFFER_NUM, INIT_TENSOR_LENGTH); + tbufPool2.InitBuffer(dstQue1, BUFFER_NUM, INIT_TENSOR_LENGTH); + CopyIn1(); + Compute1(); + CopyOut1(); + tbufPool2.Reset(); + tbufPool0.Reset(); + } + private: + __aicore__ inline void CopyIn() + { + AscendC::LocalTensor src0Local = srcQue0.AllocTensor(); + AscendC::LocalTensor src1Local = srcQue1.AllocTensor(); + AscendC::DumpTensor(src0Global, 2, 10); + + AscendC::DataCopy(src0Local, src0Global, INIT_TENSOR_LENGTH); + AscendC::printf("abcdefghigklmnop"); + + AscendC::DataCopy(src1Local, src1Global, INIT_TENSOR_LENGTH); + AscendC::DumpTensor(src0Local, 2, 10); + //AscendC::printf("abcdefghigklmnop"); + AscendC::DumpTensor(src1Local, 2, 10); + + srcQue0.EnQue(src0Local); + srcQue1.EnQue(src1Local); + } + __aicore__ inline void Compute() + { + AscendC::LocalTensor src0Local = srcQue0.DeQue(); + AscendC::LocalTensor src1Local = srcQue1.DeQue(); + AscendC::LocalTensor dstLocal = dstQue0.AllocTensor(); + AscendC::Add(dstLocal, src0Local, src1Local, INIT_TENSOR_LENGTH); + AscendC::DumpTensor(dstLocal, 2, 10); + dstQue0.EnQue(dstLocal); + srcQue0.FreeTensor(src0Local); + srcQue1.FreeTensor(src1Local); + } + __aicore__ inline void CopyOut() + { + AscendC::LocalTensor dstLocal = dstQue0.DeQue(); + AscendC::DataCopy(dstGlobal, dstLocal, INIT_TENSOR_LENGTH); + 
AscendC::DumpTensor(dstGlobal, 2, 10); + dstQue0.FreeTensor(dstLocal); + } + __aicore__ inline void CopyIn1() + { + AscendC::LocalTensor src0Local = srcQue0.AllocTensor(); + AscendC::LocalTensor src1Local = srcQue2.AllocTensor(); + AscendC::DataCopy(src0Local, src0Global[1024], INIT_TENSOR_LENGTH); + AscendC::DataCopy(src1Local, src1Global[1024], INIT_TENSOR_LENGTH); + srcQue0.EnQue(src0Local); + srcQue2.EnQue(src1Local); + } + __aicore__ inline void Compute1() + { + AscendC::LocalTensor src0Local = srcQue0.DeQue(); + AscendC::LocalTensor src1Local = srcQue2.DeQue(); + AscendC::LocalTensor dstLocal = dstQue1.AllocTensor(); + AscendC::Add(dstLocal, src0Local, src1Local, INIT_TENSOR_LENGTH); + AscendC::DumpTensor(dstLocal, 2, 10); + dstQue1.EnQue(dstLocal); + srcQue0.FreeTensor(src0Local); + srcQue2.FreeTensor(src1Local); + } + __aicore__ inline void CopyOut1() + { + AscendC::LocalTensor dstLocal = dstQue1.DeQue(); + AscendC::DataCopy(dstGlobal[1024], dstLocal, INIT_TENSOR_LENGTH); + AscendC::DumpTensor(dstGlobal, 3, 10); + dstQue1.FreeTensor(dstLocal); + } + private: + AscendC::TPipe* pipe; + AscendC::TBufPool tbufPool0, tbufPool1, tbufPool2; + AscendC::TQue srcQue0, srcQue1, srcQue2; + AscendC::TQue dstQue0, dstQue1; + AscendC::GlobalTensor src0Global, src1Global, dstGlobal; + uint32_t totalLength = 0; + + // private: + // AscendC::TPipe* pipe; + // AscendC::TBufPool tbufPool0; + // AscendC::TBufPool tbufPool1; + // AscendC::TQue srcQue0; + // AscendC::TQue srcQue1; + // AscendC::TQue dstQue0; + // AscendC::GlobalTensor src0Global; + // AscendC::GlobalTensor src1Global; + // AscendC::GlobalTensor dstGlobal; + // uint32_t totalLength = 0; + }; +}// namespace MyCustomKernel + +#endif + \ No newline at end of file diff --git a/operator/ascendc/2_features/2_tbufpool/run.sh b/operator/ascendc/2_features/2_tbufpool/run.sh new file mode 100644 index 000000000..1fe551f40 --- /dev/null +++ b/operator/ascendc/2_features/2_tbufpool/run.sh @@ -0,0 +1,58 @@ +#!/bin/bash + +# 
Copyright (c) 2025 Huawei Technologies Co., Ltd. +# This file is a part of the CANN Open Software. +# Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. +# ====================================================================================================================== + +SHORT=r:,v:, +LONG=run-mode:,soc-version:, +OPTS=$(getopt -a --options $SHORT --longoptions $LONG -- "$@") +eval set -- "$OPTS" +while : +do + case "$1" in + (-r | --run-mode ) + RUN_MODE="$2" + shift 2;; + (-v | --soc-version ) + SOC_VERSION="$2" + shift 2;; + (--) + shift; + break;; + (*) + echo "[ERROR] Unexpected option: $1"; + break;; + esac +done + +rm -rf build +mkdir build +cd build + +# in case of running op in simulator, use stub so instead +if [ "${RUN_MODE}" = "sim" ]; then + export LD_LIBRARY_PATH=$(echo $LD_LIBRARY_PATH | sed 's/\/.*\/runtime\/lib64://g') + export LD_LIBRARY_PATH=$ASCEND_HOME_DIR/runtime/lib64/stub:$LD_LIBRARY_PATH +fi + +source $ASCEND_HOME_DIR/bin/setenv.bash +export LD_LIBRARY_PATH=${ASCEND_HOME_DIR}/tools/simulator/${SOC_VERSION}/lib:$LD_LIBRARY_PATH + +cmake -DRUN_MODE=${RUN_MODE} -DSOC_VERSION=${SOC_VERSION} -DASCEND_CANN_PACKAGE_PATH=${ASCEND_HOME_DIR} .. 
+make -j16 + +if [ "${RUN_MODE}" = "npu" ]; then + ./tbufpool_direct_kernel_op +elif [ "${RUN_MODE}" = "sim" ]; then + export ASCEND_TOOLKIT_HOME=${ASCEND_HOME_DIR} + export ASCEND_HOME_PATH=${ASCEND_HOME_DIR} + msprof op simulator --application=./tbufpool_direct_kernel_op +elif [ "${RUN_MODE}" = "cpu" ]; then + ./tbufpool_direct_kernel_op +fi \ No newline at end of file diff --git a/operator/ascendc/2_features/2_tbufpool/scripts/gen_data.py b/operator/ascendc/2_features/2_tbufpool/scripts/gen_data.py new file mode 100644 index 000000000..3b0aa2937 --- /dev/null +++ b/operator/ascendc/2_features/2_tbufpool/scripts/gen_data.py @@ -0,0 +1,31 @@ +#!/usr/bin/python3 +# coding=utf-8 + +# Copyright (c) 2025 Huawei Technologies Co., Ltd. +# This file is a part of the CANN Open Software. +# Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. 
+# ====================================================================================================================== +import os +import numpy as np + +def gen_golden_data_simple(): + dtype = np.float32 + + input_shape = [8, 256] + # generate value between [-65504, 65504] + input_x = np.random.uniform(-65504, np.nextafter(65504, np.inf), input_shape).astype(dtype) + input_y = np.random.uniform(-65504, np.nextafter(65504, np.inf), input_shape).astype(dtype) + golden = input_x + input_y + + os.system("mkdir -p ./input") + input_x.tofile("./input/input_x.bin") + input_y.tofile("./input/input_y.bin") + os.system("mkdir -p ./output") + golden.tofile("./output/golden.bin") + +if __name__ == "__main__": + gen_golden_data_simple() \ No newline at end of file diff --git a/operator/ascendc/2_features/README.md b/operator/ascendc/2_features/README.md index 8c843758b..6aa3f8655 100644 --- a/operator/ascendc/2_features/README.md +++ b/operator/ascendc/2_features/README.md @@ -15,6 +15,7 @@ Ascend C相关特性的样例。特性样例逐步补充中。 当前本目录包含的所有样例如下。 | 目录名称 | 功能描述 | 运行环境 | | ------------------------------------------------------------ | ---------------------------------------------------- | -- | +| [2_tbufpool](./2_tbufpool) | 基于Ascend C的自定义Vector算子及kernel直调样例,通过TbufPool实现Add算子计算过程中的内存复用,提高计算效率。|Atlas A2训练系列产品/Atlas 800I A2推理产品| | [12_cube_group](./12_cube_group) | 基于Ascend C的自定义算子及FrameworkLaunch调用样例,通过软同步控制AIC和AIV之间进行通讯,实现AI Core计算资源分组。|Atlas A2训练系列产品/Atlas 800I A2推理产品| | [13_matmul_api_ibshare](./13_matmul_api_ibshare) | 基于Ascend C的自定义Cube算子及Kernellaunch调用样例,通过A矩阵与B矩阵使能IBSHARE,实现算子性能提升|Atlas A2训练系列产品/Atlas 800I A2推理产品| | [14_matmul_api_constant](./14_matmul_api_constant) | 基于Ascend C的自定义Cube算子及FrameworkLaunch调用样例,通过使用全量常量化的MatmulApiStaticTiling模板参数,替代非常量的TCubeTiling参数,以减少Scalar计算开销,实现算子性能提升|Atlas A2训练系列产品/Atlas 800I A2推理产品| -- Gitee From 63f36b4de3b74dea6be79529247dbe316e7f5def Mon Sep 17 00:00:00 2001 From: alpaca12345uuu Date: Sun, 8 Jun 2025 13:45:26 +0800 Subject: [PATCH 25/94] 
=?UTF-8?q?=E4=BB=A3=E7=A0=81=E4=BF=AE=E6=94=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- operator/ascendc/2_features/2_tbufpool/scripts/gen_data.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/operator/ascendc/2_features/2_tbufpool/scripts/gen_data.py b/operator/ascendc/2_features/2_tbufpool/scripts/gen_data.py index 3b0aa2937..222e2a61a 100644 --- a/operator/ascendc/2_features/2_tbufpool/scripts/gen_data.py +++ b/operator/ascendc/2_features/2_tbufpool/scripts/gen_data.py @@ -16,7 +16,7 @@ def gen_golden_data_simple(): dtype = np.float32 input_shape = [8, 256] - # generate value between [-65504, 65504] + # generate value between [-65504, 65504]. input_x = np.random.uniform(-65504, np.nextafter(65504, np.inf), input_shape).astype(dtype) input_y = np.random.uniform(-65504, np.nextafter(65504, np.inf), input_shape).astype(dtype) golden = input_x + input_y -- Gitee From bd667edb462528b4c3f42828ea9d09fab9b8027d Mon Sep 17 00:00:00 2001 From: alpaca12345uuu Date: Sun, 8 Jun 2025 14:10:38 +0800 Subject: [PATCH 26/94] 1 --- .../2_tbufpool/op_kernel/tbufpool_custom.h | 43 ++++++++++--------- 1 file changed, 23 insertions(+), 20 deletions(-) diff --git a/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h b/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h index 4f477fec2..f370e2fc1 100644 --- a/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h +++ b/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h @@ -32,32 +32,35 @@ class TbufPoolImpl { src0Global.SetGlobalBuffer((__gm__ float*)src0Gm); src1Global.SetGlobalBuffer((__gm__ float*)src1Gm); dstGlobal.SetGlobalBuffer((__gm__ float*)dstGm); - AscendC::DumpTensor(src0Global, 10, 10); + //AscendC::DumpTensor(src0Global, 10, 10); - pipe->InitBufPool(tbufPool0, BUFFER_LENGTH); - tbufPool0.InitBuffer(srcQue0, BUFFER_NUM, BUFF_POOL_LENGTH); // Total src0 - tbufPool0.InitBufPool(tbufPool1, 
BUFF_POOL_LENGTH); - tbufPool0.InitBufPool(tbufPool2, BUFF_POOL_LENGTH, tbufPool1); + //pipe->InitBufPool(tbufPool0, BUFFER_LENGTH); + pipe->InitBuffer(srcQue0, BUFFER_NUM, BUFF_POOL_LENGTH); + pipe->InitBuffer(srcQue1, BUFFER_NUM, BUFF_POOL_LENGTH); + pipe->InitBuffer(dstQue0, BUFFER_NUM, BUFF_POOL_LENGTH); + // tbufPool0.InitBuffer(srcQue0, BUFFER_NUM, BUFF_POOL_LENGTH); // Total src0 + // tbufPool0.InitBufPool(tbufPool1, BUFF_POOL_LENGTH); + // tbufPool0.InitBufPool(tbufPool2, BUFF_POOL_LENGTH, tbufPool1); // tbufPool1.InitBuffer(srcQue1, BUFFER_NUM, INIT_TENSOR_LENGTH); // tbufPool1.InitBuffer(dstQue0, BUFFER_NUM, INIT_TENSOR_LENGTH); } __aicore__ inline void Process() { - AscendC::DumpTensor(src0Global, 2, 10); + // AscendC::DumpTensor(src0Global, 2, 10); - tbufPool1.InitBuffer(srcQue1, BUFFER_NUM, INIT_TENSOR_LENGTH); - tbufPool1.InitBuffer(dstQue0, BUFFER_NUM, INIT_TENSOR_LENGTH); + // tbufPool1.InitBuffer(srcQue1, BUFFER_NUM, INIT_TENSOR_LENGTH); + // tbufPool1.InitBuffer(dstQue0, BUFFER_NUM, INIT_TENSOR_LENGTH); CopyIn(); Compute(); CopyOut(); - tbufPool1.Reset(); - tbufPool2.InitBuffer(srcQue2, BUFFER_NUM, INIT_TENSOR_LENGTH); - tbufPool2.InitBuffer(dstQue1, BUFFER_NUM, INIT_TENSOR_LENGTH); - CopyIn1(); - Compute1(); - CopyOut1(); - tbufPool2.Reset(); - tbufPool0.Reset(); + // tbufPool1.Reset(); + // tbufPool2.InitBuffer(srcQue2, BUFFER_NUM, INIT_TENSOR_LENGTH); + // tbufPool2.InitBuffer(dstQue1, BUFFER_NUM, INIT_TENSOR_LENGTH); + // CopyIn1(); + // Compute1(); + // CopyOut1(); + // tbufPool2.Reset(); + // tbufPool0.Reset(); } private: __aicore__ inline void CopyIn() @@ -66,10 +69,10 @@ class TbufPoolImpl { AscendC::LocalTensor src1Local = srcQue1.AllocTensor(); AscendC::DumpTensor(src0Global, 2, 10); - AscendC::DataCopy(src0Local, src0Global, INIT_TENSOR_LENGTH); - AscendC::printf("abcdefghigklmnop"); + AscendC::DataCopy(src0Local, src0Global, 2048); + //AscendC::printf("abcdefghigklmnop"); - AscendC::DataCopy(src1Local, src1Global, INIT_TENSOR_LENGTH); 
+ AscendC::DataCopy(src1Local, src1Global, 2048); AscendC::DumpTensor(src0Local, 2, 10); //AscendC::printf("abcdefghigklmnop"); AscendC::DumpTensor(src1Local, 2, 10); @@ -82,7 +85,7 @@ class TbufPoolImpl { AscendC::LocalTensor src0Local = srcQue0.DeQue(); AscendC::LocalTensor src1Local = srcQue1.DeQue(); AscendC::LocalTensor dstLocal = dstQue0.AllocTensor(); - AscendC::Add(dstLocal, src0Local, src1Local, INIT_TENSOR_LENGTH); + AscendC::Add(dstLocal, src0Local, src1Local, 2048); AscendC::DumpTensor(dstLocal, 2, 10); dstQue0.EnQue(dstLocal); srcQue0.FreeTensor(src0Local); -- Gitee From fa7a7b6261c97f5e3315c716dea4f5f7a8a70788 Mon Sep 17 00:00:00 2001 From: alpaca12345uuu Date: Sun, 8 Jun 2025 14:15:44 +0800 Subject: [PATCH 27/94] 1 --- .../2_tbufpool/op_kernel/tbufpool_custom.h | 50 +++++++++---------- 1 file changed, 25 insertions(+), 25 deletions(-) diff --git a/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h b/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h index f370e2fc1..e175d236e 100644 --- a/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h +++ b/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h @@ -34,13 +34,13 @@ class TbufPoolImpl { dstGlobal.SetGlobalBuffer((__gm__ float*)dstGm); //AscendC::DumpTensor(src0Global, 10, 10); - //pipe->InitBufPool(tbufPool0, BUFFER_LENGTH); - pipe->InitBuffer(srcQue0, BUFFER_NUM, BUFF_POOL_LENGTH); - pipe->InitBuffer(srcQue1, BUFFER_NUM, BUFF_POOL_LENGTH); - pipe->InitBuffer(dstQue0, BUFFER_NUM, BUFF_POOL_LENGTH); - // tbufPool0.InitBuffer(srcQue0, BUFFER_NUM, BUFF_POOL_LENGTH); // Total src0 - // tbufPool0.InitBufPool(tbufPool1, BUFF_POOL_LENGTH); - // tbufPool0.InitBufPool(tbufPool2, BUFF_POOL_LENGTH, tbufPool1); + pipe->InitBufPool(tbufPool0, BUFFER_LENGTH); + // pipe->InitBuffer(srcQue0, BUFFER_NUM, BUFF_POOL_LENGTH); + // pipe->InitBuffer(srcQue1, BUFFER_NUM, BUFF_POOL_LENGTH); + // pipe->InitBuffer(dstQue0, BUFFER_NUM, BUFF_POOL_LENGTH); + 
tbufPool0.InitBuffer(srcQue0, BUFFER_NUM, BUFF_POOL_LENGTH); // Total src0 + tbufPool0.InitBufPool(tbufPool1, BUFF_POOL_LENGTH); + tbufPool0.InitBufPool(tbufPool2, BUFF_POOL_LENGTH, tbufPool1); // tbufPool1.InitBuffer(srcQue1, BUFFER_NUM, INIT_TENSOR_LENGTH); // tbufPool1.InitBuffer(dstQue0, BUFFER_NUM, INIT_TENSOR_LENGTH); } @@ -48,19 +48,19 @@ class TbufPoolImpl { { // AscendC::DumpTensor(src0Global, 2, 10); - // tbufPool1.InitBuffer(srcQue1, BUFFER_NUM, INIT_TENSOR_LENGTH); - // tbufPool1.InitBuffer(dstQue0, BUFFER_NUM, INIT_TENSOR_LENGTH); + tbufPool1.InitBuffer(srcQue1, BUFFER_NUM, INIT_TENSOR_LENGTH); + tbufPool1.InitBuffer(dstQue0, BUFFER_NUM, INIT_TENSOR_LENGTH); CopyIn(); Compute(); CopyOut(); - // tbufPool1.Reset(); - // tbufPool2.InitBuffer(srcQue2, BUFFER_NUM, INIT_TENSOR_LENGTH); - // tbufPool2.InitBuffer(dstQue1, BUFFER_NUM, INIT_TENSOR_LENGTH); - // CopyIn1(); - // Compute1(); - // CopyOut1(); - // tbufPool2.Reset(); - // tbufPool0.Reset(); + tbufPool1.Reset(); + tbufPool2.InitBuffer(srcQue2, BUFFER_NUM, INIT_TENSOR_LENGTH); + tbufPool2.InitBuffer(dstQue1, BUFFER_NUM, INIT_TENSOR_LENGTH); + CopyIn1(); + Compute1(); + CopyOut1(); + tbufPool2.Reset(); + tbufPool0.Reset(); } private: __aicore__ inline void CopyIn() @@ -69,10 +69,10 @@ class TbufPoolImpl { AscendC::LocalTensor src1Local = srcQue1.AllocTensor(); AscendC::DumpTensor(src0Global, 2, 10); - AscendC::DataCopy(src0Local, src0Global, 2048); + AscendC::DataCopy(src0Local, src0Global, 1024); //AscendC::printf("abcdefghigklmnop"); - AscendC::DataCopy(src1Local, src1Global, 2048); + AscendC::DataCopy(src1Local, src1Global, 1024); AscendC::DumpTensor(src0Local, 2, 10); //AscendC::printf("abcdefghigklmnop"); AscendC::DumpTensor(src1Local, 2, 10); @@ -85,7 +85,7 @@ class TbufPoolImpl { AscendC::LocalTensor src0Local = srcQue0.DeQue(); AscendC::LocalTensor src1Local = srcQue1.DeQue(); AscendC::LocalTensor dstLocal = dstQue0.AllocTensor(); - AscendC::Add(dstLocal, src0Local, src1Local, 2048); + 
AscendC::Add(dstLocal, src0Local, src1Local, 1024); AscendC::DumpTensor(dstLocal, 2, 10); dstQue0.EnQue(dstLocal); srcQue0.FreeTensor(src0Local); @@ -94,7 +94,7 @@ class TbufPoolImpl { __aicore__ inline void CopyOut() { AscendC::LocalTensor dstLocal = dstQue0.DeQue(); - AscendC::DataCopy(dstGlobal, dstLocal, INIT_TENSOR_LENGTH); + AscendC::DataCopy(dstGlobal, dstLocal, 1024); AscendC::DumpTensor(dstGlobal, 2, 10); dstQue0.FreeTensor(dstLocal); } @@ -102,8 +102,8 @@ class TbufPoolImpl { { AscendC::LocalTensor src0Local = srcQue0.AllocTensor(); AscendC::LocalTensor src1Local = srcQue2.AllocTensor(); - AscendC::DataCopy(src0Local, src0Global[1024], INIT_TENSOR_LENGTH); - AscendC::DataCopy(src1Local, src1Global[1024], INIT_TENSOR_LENGTH); + AscendC::DataCopy(src0Local, src0Global[1024], 1024); + AscendC::DataCopy(src1Local, src1Global[1024], 1024); srcQue0.EnQue(src0Local); srcQue2.EnQue(src1Local); } @@ -112,7 +112,7 @@ class TbufPoolImpl { AscendC::LocalTensor src0Local = srcQue0.DeQue(); AscendC::LocalTensor src1Local = srcQue2.DeQue(); AscendC::LocalTensor dstLocal = dstQue1.AllocTensor(); - AscendC::Add(dstLocal, src0Local, src1Local, INIT_TENSOR_LENGTH); + AscendC::Add(dstLocal, src0Local, src1Local, 1024); AscendC::DumpTensor(dstLocal, 2, 10); dstQue1.EnQue(dstLocal); srcQue0.FreeTensor(src0Local); @@ -121,7 +121,7 @@ class TbufPoolImpl { __aicore__ inline void CopyOut1() { AscendC::LocalTensor dstLocal = dstQue1.DeQue(); - AscendC::DataCopy(dstGlobal[1024], dstLocal, INIT_TENSOR_LENGTH); + AscendC::DataCopy(dstGlobal[1024], dstLocal, 1024); AscendC::DumpTensor(dstGlobal, 3, 10); dstQue1.FreeTensor(dstLocal); } -- Gitee From 5691d288b2bcdf808a6325a51dcca41817528d17 Mon Sep 17 00:00:00 2001 From: alpaca12345uuu Date: Sun, 8 Jun 2025 14:30:44 +0800 Subject: [PATCH 28/94] 1 --- .../2_tbufpool/op_kernel/tbufpool_custom.h | 74 ++++++++----------- 1 file changed, 30 insertions(+), 44 deletions(-) diff --git 
a/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h b/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h index e175d236e..af5c4c84c 100644 --- a/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h +++ b/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h @@ -32,30 +32,25 @@ class TbufPoolImpl { src0Global.SetGlobalBuffer((__gm__ float*)src0Gm); src1Global.SetGlobalBuffer((__gm__ float*)src1Gm); dstGlobal.SetGlobalBuffer((__gm__ float*)dstGm); - //AscendC::DumpTensor(src0Global, 10, 10); pipe->InitBufPool(tbufPool0, BUFFER_LENGTH); - // pipe->InitBuffer(srcQue0, BUFFER_NUM, BUFF_POOL_LENGTH); - // pipe->InitBuffer(srcQue1, BUFFER_NUM, BUFF_POOL_LENGTH); - // pipe->InitBuffer(dstQue0, BUFFER_NUM, BUFF_POOL_LENGTH); tbufPool0.InitBuffer(srcQue0, BUFFER_NUM, BUFF_POOL_LENGTH); // Total src0 tbufPool0.InitBufPool(tbufPool1, BUFF_POOL_LENGTH); tbufPool0.InitBufPool(tbufPool2, BUFF_POOL_LENGTH, tbufPool1); - // tbufPool1.InitBuffer(srcQue1, BUFFER_NUM, INIT_TENSOR_LENGTH); - // tbufPool1.InitBuffer(dstQue0, BUFFER_NUM, INIT_TENSOR_LENGTH); - } - __aicore__ inline void Process() - { - // AscendC::DumpTensor(src0Global, 2, 10); tbufPool1.InitBuffer(srcQue1, BUFFER_NUM, INIT_TENSOR_LENGTH); tbufPool1.InitBuffer(dstQue0, BUFFER_NUM, INIT_TENSOR_LENGTH); + } + __aicore__ inline void Process() + { + // tbufPool1.InitBuffer(srcQue1, BUFFER_NUM, INIT_TENSOR_LENGTH); + // tbufPool1.InitBuffer(dstQue0, BUFFER_NUM, INIT_TENSOR_LENGTH); CopyIn(); Compute(); CopyOut(); tbufPool1.Reset(); - tbufPool2.InitBuffer(srcQue2, BUFFER_NUM, INIT_TENSOR_LENGTH); - tbufPool2.InitBuffer(dstQue1, BUFFER_NUM, INIT_TENSOR_LENGTH); + // tbufPool2.InitBuffer(srcQue2, BUFFER_NUM, INIT_TENSOR_LENGTH); + // tbufPool2.InitBuffer(dstQue1, BUFFER_NUM, INIT_TENSOR_LENGTH); CopyIn1(); Compute1(); CopyOut1(); @@ -67,16 +62,8 @@ class TbufPoolImpl { { AscendC::LocalTensor src0Local = srcQue0.AllocTensor(); AscendC::LocalTensor src1Local = 
srcQue1.AllocTensor(); - AscendC::DumpTensor(src0Global, 2, 10); - AscendC::DataCopy(src0Local, src0Global, 1024); - //AscendC::printf("abcdefghigklmnop"); - AscendC::DataCopy(src1Local, src1Global, 1024); - AscendC::DumpTensor(src0Local, 2, 10); - //AscendC::printf("abcdefghigklmnop"); - AscendC::DumpTensor(src1Local, 2, 10); - srcQue0.EnQue(src0Local); srcQue1.EnQue(src1Local); } @@ -86,7 +73,6 @@ class TbufPoolImpl { AscendC::LocalTensor src1Local = srcQue1.DeQue(); AscendC::LocalTensor dstLocal = dstQue0.AllocTensor(); AscendC::Add(dstLocal, src0Local, src1Local, 1024); - AscendC::DumpTensor(dstLocal, 2, 10); dstQue0.EnQue(dstLocal); srcQue0.FreeTensor(src0Local); srcQue1.FreeTensor(src1Local); @@ -95,13 +81,12 @@ class TbufPoolImpl { { AscendC::LocalTensor dstLocal = dstQue0.DeQue(); AscendC::DataCopy(dstGlobal, dstLocal, 1024); - AscendC::DumpTensor(dstGlobal, 2, 10); dstQue0.FreeTensor(dstLocal); } __aicore__ inline void CopyIn1() { AscendC::LocalTensor src0Local = srcQue0.AllocTensor(); - AscendC::LocalTensor src1Local = srcQue2.AllocTensor(); + AscendC::LocalTensor src1Local = srcQue1.AllocTensor(); AscendC::DataCopy(src0Local, src0Global[1024], 1024); AscendC::DataCopy(src1Local, src1Global[1024], 1024); srcQue0.EnQue(src0Local); @@ -110,40 +95,41 @@ class TbufPoolImpl { __aicore__ inline void Compute1() { AscendC::LocalTensor src0Local = srcQue0.DeQue(); - AscendC::LocalTensor src1Local = srcQue2.DeQue(); - AscendC::LocalTensor dstLocal = dstQue1.AllocTensor(); + AscendC::LocalTensor src1Local = srcQue1.DeQue(); + AscendC::LocalTensor dstLocal = dstQue0.AllocTensor(); AscendC::Add(dstLocal, src0Local, src1Local, 1024); - AscendC::DumpTensor(dstLocal, 2, 10); dstQue1.EnQue(dstLocal); srcQue0.FreeTensor(src0Local); srcQue2.FreeTensor(src1Local); } __aicore__ inline void CopyOut1() { - AscendC::LocalTensor dstLocal = dstQue1.DeQue(); + AscendC::LocalTensor dstLocal = dstQue0.DeQue(); AscendC::DataCopy(dstGlobal[1024], dstLocal, 1024); - 
AscendC::DumpTensor(dstGlobal, 3, 10); dstQue1.FreeTensor(dstLocal); } - private: - AscendC::TPipe* pipe; - AscendC::TBufPool tbufPool0, tbufPool1, tbufPool2; - AscendC::TQue srcQue0, srcQue1, srcQue2; - AscendC::TQue dstQue0, dstQue1; - AscendC::GlobalTensor src0Global, src1Global, dstGlobal; - uint32_t totalLength = 0; - // private: // AscendC::TPipe* pipe; - // AscendC::TBufPool tbufPool0; - // AscendC::TBufPool tbufPool1; - // AscendC::TQue srcQue0; - // AscendC::TQue srcQue1; - // AscendC::TQue dstQue0; - // AscendC::GlobalTensor src0Global; - // AscendC::GlobalTensor src1Global; - // AscendC::GlobalTensor dstGlobal; + // AscendC::TBufPool tbufPool0, tbufPool1, tbufPool2; + // AscendC::TQue srcQue0, srcQue1, srcQue2; + // AscendC::TQue dstQue0, dstQue1; + // AscendC::GlobalTensor src0Global, src1Global, dstGlobal; // uint32_t totalLength = 0; + + private: + AscendC::TPipe* pipe; + AscendC::TBufPool tbufPool0; + AscendC::TBufPool tbufPool1; + AscendC::TBufPool tbufPool2; + AscendC::TQue srcQue0; + AscendC::TQue srcQue1; + AscendC::TQue srcQue2; + AscendC::TQue dstQue0; + AscendC::TQue dstQue1; + AscendC::GlobalTensor src0Global; + AscendC::GlobalTensor src1Global; + AscendC::GlobalTensor dstGlobal; + uint32_t totalLength = 0; }; }// namespace MyCustomKernel -- Gitee From f6eb9268c4da3a899e24d18df5754a3fecc57d8c Mon Sep 17 00:00:00 2001 From: alpaca12345uuu Date: Sun, 8 Jun 2025 15:28:53 +0800 Subject: [PATCH 29/94] 1 --- .../ascendc/2_features/2_tbufpool/main.cpp | 54 +++++++---- .../2_tbufpool/op_kernel/tbufpool_custom.cpp | 4 +- .../2_tbufpool/op_kernel/tbufpool_custom.h | 97 +++++++++++-------- .../2_features/2_tbufpool/scripts/gen_data.py | 6 +- 4 files changed, 97 insertions(+), 64 deletions(-) diff --git a/operator/ascendc/2_features/2_tbufpool/main.cpp b/operator/ascendc/2_features/2_tbufpool/main.cpp index 565a21df4..e31ab475b 100644 --- a/operator/ascendc/2_features/2_tbufpool/main.cpp +++ b/operator/ascendc/2_features/2_tbufpool/main.cpp @@ -76,7 
+76,8 @@ int32_t main(int32_t argc, char *argv[]) { #ifdef ASCENDC_CPU_DEBUG uint8_t *x = (uint8_t *)AscendC::GmAlloc(inputSize); uint8_t *y = (uint8_t *)AscendC::GmAlloc(inputSize); - uint8_t *z = (uint8_t *)AscendC::GmAlloc(outputSize); + uint8_t *zAdd = (uint8_t *)AscendC::GmAlloc(outputSize); + uint8_t *zSub = (uint8_t *)AscendC::GmAlloc(outputSize); uint8_t *tiling = (uint8_t *)AscendC::GmAlloc(tilingSize); ReadFile("../input/input_x.bin", inputSize, x, inputSize); @@ -86,17 +87,22 @@ int32_t main(int32_t argc, char *argv[]) { AscendC::SetKernelMode(KernelMode::AIV_MODE); // run in aiv mode - ICPU_RUN_KF(tbufpool_custom, USED_CORE_NUM, x, y, z, + ICPU_RUN_KF(tbufpool_custom, USED_CORE_NUM, x, y, zAdd, zSub, *reinterpret_cast(tiling)); // use this macro for cpu debug - WriteFile("../output/output.bin", z, outputSize); + WriteFile("../output/output_add.bin", zAdd, outputSize); + WriteFile("../output/output_sub.bin", zSub, outputSize); - bool goldenResult = true; - goldenResult = CompareResult(z, outputSize); + + bool goldenResultAdd = true; + goldenResultAdd = CompareResult(zAdd, outputSize); + bool goldenResult_sub = true; + goldenResultAdd = CompareResult(zSub, outputSize); AscendC::GmFree((void *)x); AscendC::GmFree((void *)y); - AscendC::GmFree((void *)z); + AscendC::GmFree((void *)zAdd); + AscendC::GmFree((void *)zSub); AscendC::GmFree((void *)tiling); #else CHECK_ACL(aclInit(nullptr)); @@ -107,20 +113,24 @@ int32_t main(int32_t argc, char *argv[]) { uint8_t *xHost; uint8_t *yHost; - uint8_t *zHost; + uint8_t *zHostAdd; + uint8_t *zHostSub; uint8_t *tiling; uint8_t *xDevice; uint8_t *yDevice; - uint8_t *zDevice; + uint8_t *zDeviceAdd; + uint8_t *zDeviceSub; CHECK_ACL(aclrtMallocHost((void **)(&xHost), inputSize)); CHECK_ACL(aclrtMallocHost((void **)(&yHost), inputSize)); - CHECK_ACL(aclrtMallocHost((void **)(&zHost), outputSize)); + CHECK_ACL(aclrtMallocHost((void **)(&zHostAdd), outputSize)); + CHECK_ACL(aclrtMallocHost((void **)(&zHostSub), outputSize)); 
CHECK_ACL(aclrtMallocHost((void **)(&tiling), tilingSize)); CHECK_ACL(aclrtMalloc((void **)&xDevice, inputSize, ACL_MEM_MALLOC_HUGE_FIRST)); CHECK_ACL(aclrtMalloc((void **)&yDevice, inputSize, ACL_MEM_MALLOC_HUGE_FIRST)); - CHECK_ACL(aclrtMalloc((void **)&zDevice, outputSize, ACL_MEM_MALLOC_HUGE_FIRST)); + CHECK_ACL(aclrtMalloc((void **)&zDeviceAdd, outputSize, ACL_MEM_MALLOC_HUGE_FIRST)); + CHECK_ACL(aclrtMalloc((void **)&zDeviceSub, outputSize, ACL_MEM_MALLOC_HUGE_FIRST)); ReadFile("../input/input_x.bin", inputSize, xHost, inputSize); ReadFile("../input/input_y.bin", inputSize, yHost, inputSize); @@ -133,26 +143,34 @@ int32_t main(int32_t argc, char *argv[]) { // Execute the kernel ACLRT_LAUNCH_KERNEL(tbufpool_custom) - (USED_CORE_NUM, stream, xDevice, yDevice, zDevice, reinterpret_cast(tiling)); + (USED_CORE_NUM, stream, xDevice, yDevice, zDeviceAdd, zDeviceSub, reinterpret_cast(tiling)); // Wait for the stop event to complete CHECK_ACL(aclrtSynchronizeStream(stream)); // Copy result to host memory and write to output file - CHECK_ACL(aclrtMemcpy(zHost, outputSize, zDevice, outputSize, ACL_MEMCPY_DEVICE_TO_HOST)); - WriteFile("../output/output.bin", zHost, outputSize); + CHECK_ACL(aclrtMemcpy(zHostAdd, outputSize, zDeviceAdd, outputSize, ACL_MEMCPY_DEVICE_TO_HOST)); + WriteFile("../output/output_add.bin", zHostAdd, outputSize); + CHECK_ACL(aclrtMemcpy(zHostSub, outputSize, zDeviceSub, outputSize, ACL_MEMCPY_DEVICE_TO_HOST)); + WriteFile("../output/output_sub.bin", zHostSub, outputSize); // Compare the result with the golden result - bool goldenResult = true; - goldenResult = CompareResult(zHost, outputSize); + bool goldenResultAdd = true; + goldenResultAdd = CompareResult(zHostAdd, outputSize); + bool goldenResultSub = true; + goldenResultSub = CompareResult(zHostSub, outputSize); // Clean up memory CHECK_ACL(aclrtFree(xDevice)); CHECK_ACL(aclrtFree(yDevice)); - CHECK_ACL(aclrtFree(zDevice)); + CHECK_ACL(aclrtFree(zDeviceAdd)); + 
CHECK_ACL(aclrtFree(zDeviceSub)); + CHECK_ACL(aclrtFreeHost(xHost)); CHECK_ACL(aclrtFreeHost(yHost)); - CHECK_ACL(aclrtFreeHost(zHost)); + CHECK_ACL(aclrtFreeHost(zHostAdd)); + CHECK_ACL(aclrtFreeHost(zHostSub)); + CHECK_ACL(aclrtFreeHost(tiling)); CHECK_ACL(aclrtDestroyStream(stream)); @@ -160,7 +178,7 @@ int32_t main(int32_t argc, char *argv[]) { CHECK_ACL(aclFinalize()); #endif - if (goldenResult) { + if (goldenResultAdd && goldenResultSub) { printf("test pass!\n"); } else { printf("test failed!\n"); diff --git a/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.cpp b/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.cpp index d17a4d185..1cdc13c77 100644 --- a/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.cpp +++ b/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.cpp @@ -11,10 +11,10 @@ #include "./tbufpool_custom.h" #include "kernel_operator.h" -extern "C" __global__ __aicore__ void tbufpool_custom(GM_ADDR src0Gm, GM_ADDR src1Gm, GM_ADDR dstGm, TbufPoolTilingData tiling) +extern "C" __global__ __aicore__ void tbufpool_custom(GM_ADDR src0Gm, GM_ADDR src1Gm, GM_ADDR dstGm0, GM_ADDR dstGm1, TbufPoolTilingData tiling) { AscendC::TPipe pipe; MyCustomKernel::TbufPoolImpl op; - op.Init(src0Gm, src1Gm, dstGm, tiling, &pipe); + op.Init(src0Gm, src1Gm, dstGm0, dstGm1, tiling, &pipe); op.Process(); } \ No newline at end of file diff --git a/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h b/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h index af5c4c84c..fc941069f 100644 --- a/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h +++ b/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h @@ -25,13 +25,15 @@ constexpr int32_t TOTL_NUM = 2048; class TbufPoolImpl { public: __aicore__ inline TbufPoolImpl() {} - __aicore__ inline void Init(__gm__ uint8_t* src0Gm, __gm__ uint8_t* src1Gm, __gm__ uint8_t* dstGm, TbufPoolTilingData tiling, 
AscendC::TPipe* pipeIn) + __aicore__ inline void Init(__gm__ uint8_t* src0Gm, __gm__ uint8_t* src1Gm, __gm__ uint8_t* dstGm0, + __gm__ uint8_t* dstGm1, TbufPoolTilingData tiling, AscendC::TPipe* pipeIn) { pipe = pipeIn; totalLength = tiling.totalLength; - src0Global.SetGlobalBuffer((__gm__ float*)src0Gm); - src1Global.SetGlobalBuffer((__gm__ float*)src1Gm); - dstGlobal.SetGlobalBuffer((__gm__ float*)dstGm); + src0Global.SetGlobalBuffer((__gm__ float*)src0Gm);// input 0 + src1Global.SetGlobalBuffer((__gm__ float*)src1Gm);// input 1 + dstGlobal0.SetGlobalBuffer((__gm__ float*)dstGm0);// output 0 + dstGlobal1.SetGlobalBuffer((__gm__ float*)dstGm1);// output 1 pipe->InitBufPool(tbufPool0, BUFFER_LENGTH); tbufPool0.InitBuffer(srcQue0, BUFFER_NUM, BUFF_POOL_LENGTH); // Total src0 @@ -39,73 +41,82 @@ class TbufPoolImpl { tbufPool0.InitBufPool(tbufPool2, BUFF_POOL_LENGTH, tbufPool1); tbufPool1.InitBuffer(srcQue1, BUFFER_NUM, INIT_TENSOR_LENGTH); - tbufPool1.InitBuffer(dstQue0, BUFFER_NUM, INIT_TENSOR_LENGTH); + tbufPool1.InitBuffer(dstQue, BUFFER_NUM, INIT_TENSOR_LENGTH); + //tbufPool1.InitBuffer(dstQue1, BUFFER_NUM, INIT_TENSOR_LENGTH); } __aicore__ inline void Process() { - // tbufPool1.InitBuffer(srcQue1, BUFFER_NUM, INIT_TENSOR_LENGTH); - // tbufPool1.InitBuffer(dstQue0, BUFFER_NUM, INIT_TENSOR_LENGTH); - CopyIn(); - Compute(); - CopyOut(); - tbufPool1.Reset(); - // tbufPool2.InitBuffer(srcQue2, BUFFER_NUM, INIT_TENSOR_LENGTH); - // tbufPool2.InitBuffer(dstQue1, BUFFER_NUM, INIT_TENSOR_LENGTH); - CopyIn1(); - Compute1(); - CopyOut1(); - tbufPool2.Reset(); + // CopyIn(); + // Compute(); + // CopyOut(); + // tbufPool1.Reset(); + // CopyIn1(); + // Compute1(); + // CopyOut1(); + // tbufPool2.Reset(); + //tbufPool0.Reset(); + for (int32_t i = 0; i < SPLIT_NUM; i++) + { + CopyIn(i); + Compute(i); + CopyOut(i); + tbufPool1.Reset(i); + CopyIn1(i); + Compute1(i); + CopyOut1(i); + tbufPool2.Reset(i); + } tbufPool0.Reset(); } private: - __aicore__ inline void CopyIn() + 
__aicore__ inline void CopyIn(int32_t progress) { AscendC::LocalTensor src0Local = srcQue0.AllocTensor(); AscendC::LocalTensor src1Local = srcQue1.AllocTensor(); - AscendC::DataCopy(src0Local, src0Global, 1024); - AscendC::DataCopy(src1Local, src1Global, 1024); + AscendC::DataCopy(src0Local, src0Global[progress*1024], 1024); + AscendC::DataCopy(src1Local, src1Global[progress*1024], 1024); srcQue0.EnQue(src0Local); srcQue1.EnQue(src1Local); } - __aicore__ inline void Compute() + __aicore__ inline void Compute(int32_t progress) { AscendC::LocalTensor src0Local = srcQue0.DeQue(); AscendC::LocalTensor src1Local = srcQue1.DeQue(); - AscendC::LocalTensor dstLocal = dstQue0.AllocTensor(); + AscendC::LocalTensor dstLocal = dstQue.AllocTensor(); AscendC::Add(dstLocal, src0Local, src1Local, 1024); - dstQue0.EnQue(dstLocal); + dstQue.EnQue(dstLocal); srcQue0.FreeTensor(src0Local); srcQue1.FreeTensor(src1Local); } - __aicore__ inline void CopyOut() + __aicore__ inline void CopyOut(int32_t progress) { - AscendC::LocalTensor dstLocal = dstQue0.DeQue(); - AscendC::DataCopy(dstGlobal, dstLocal, 1024); - dstQue0.FreeTensor(dstLocal); + AscendC::LocalTensor dstLocal = dstQue.DeQue(); + AscendC::DataCopy(dstGlobal[progress*1024], dstLocal, 1024); + dstQue.FreeTensor(dstLocal); } - __aicore__ inline void CopyIn1() + __aicore__ inline void CopyIn1(int32_t progress) { AscendC::LocalTensor src0Local = srcQue0.AllocTensor(); AscendC::LocalTensor src1Local = srcQue1.AllocTensor(); - AscendC::DataCopy(src0Local, src0Global[1024], 1024); - AscendC::DataCopy(src1Local, src1Global[1024], 1024); + AscendC::DataCopy(src0Local, src0Global[progress*1024], 1024); + AscendC::DataCopy(src1Local, src1Global[progress*1024], 1024); srcQue0.EnQue(src0Local); - srcQue2.EnQue(src1Local); + srcQue1.EnQue(src1Local); } - __aicore__ inline void Compute1() + __aicore__ inline void Compute1(int32_t progress) { AscendC::LocalTensor src0Local = srcQue0.DeQue(); AscendC::LocalTensor src1Local = srcQue1.DeQue(); - 
AscendC::LocalTensor dstLocal = dstQue0.AllocTensor(); - AscendC::Add(dstLocal, src0Local, src1Local, 1024); - dstQue1.EnQue(dstLocal); + AscendC::LocalTensor dstLocal = dstQue.AllocTensor(); + AscendC::Sub(dstLocal, src0Local, src1Local, 1024); + dstQue.EnQue(dstLocal); srcQue0.FreeTensor(src0Local); - srcQue2.FreeTensor(src1Local); + srcQue1.FreeTensor(src1Local); } - __aicore__ inline void CopyOut1() + __aicore__ inline void CopyOut1(int32_t progress) { - AscendC::LocalTensor dstLocal = dstQue0.DeQue(); - AscendC::DataCopy(dstGlobal[1024], dstLocal, 1024); + AscendC::LocalTensor dstLocal = dstQue.DeQue(); + AscendC::DataCopy(dstGlobal1[progress*1024], dstLocal, 1024); dstQue1.FreeTensor(dstLocal); } // private: @@ -124,11 +135,13 @@ class TbufPoolImpl { AscendC::TQue srcQue0; AscendC::TQue srcQue1; AscendC::TQue srcQue2; - AscendC::TQue dstQue0; - AscendC::TQue dstQue1; + AscendC::TQue dstQue; + //AscendC::TQue dstQue1; AscendC::GlobalTensor src0Global; AscendC::GlobalTensor src1Global; - AscendC::GlobalTensor dstGlobal; + AscendC::GlobalTensor dstGlobal0; + AscendC::GlobalTensor dstGlobal1; + uint32_t totalLength = 0; }; }// namespace MyCustomKernel diff --git a/operator/ascendc/2_features/2_tbufpool/scripts/gen_data.py b/operator/ascendc/2_features/2_tbufpool/scripts/gen_data.py index 3b0aa2937..1db568ae2 100644 --- a/operator/ascendc/2_features/2_tbufpool/scripts/gen_data.py +++ b/operator/ascendc/2_features/2_tbufpool/scripts/gen_data.py @@ -19,13 +19,15 @@ def gen_golden_data_simple(): # generate value between [-65504, 65504] input_x = np.random.uniform(-65504, np.nextafter(65504, np.inf), input_shape).astype(dtype) input_y = np.random.uniform(-65504, np.nextafter(65504, np.inf), input_shape).astype(dtype) - golden = input_x + input_y + golden_add = input_x + input_y + golden_sub = input_x - input_y os.system("mkdir -p ./input") input_x.tofile("./input/input_x.bin") input_y.tofile("./input/input_y.bin") os.system("mkdir -p ./output") - 
golden.tofile("./output/golden.bin") + golden_add.tofile("./output/golden_add.bin") + golden_sub.tofile("./output/golden_sub.bin") if __name__ == "__main__": gen_golden_data_simple() \ No newline at end of file -- Gitee From 29333e53c40e0f21b13e3aefe599ef80b27a74b9 Mon Sep 17 00:00:00 2001 From: alpaca12345uuu Date: Sun, 8 Jun 2025 15:34:01 +0800 Subject: [PATCH 30/94] 1 --- .../2_features/2_tbufpool/op_kernel/tbufpool_custom.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h b/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h index fc941069f..c7e2cfe1e 100644 --- a/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h +++ b/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h @@ -60,11 +60,11 @@ class TbufPoolImpl { CopyIn(i); Compute(i); CopyOut(i); - tbufPool1.Reset(i); + tbufPool1.Reset(); CopyIn1(i); Compute1(i); CopyOut1(i); - tbufPool2.Reset(i); + tbufPool2.Reset(); } tbufPool0.Reset(); } @@ -117,7 +117,7 @@ class TbufPoolImpl { { AscendC::LocalTensor dstLocal = dstQue.DeQue(); AscendC::DataCopy(dstGlobal1[progress*1024], dstLocal, 1024); - dstQue1.FreeTensor(dstLocal); + dstQue.FreeTensor(dstLocal); } // private: // AscendC::TPipe* pipe; -- Gitee From 9c71b095c26f0055ce8c7a0f634de0017b91949d Mon Sep 17 00:00:00 2001 From: alpaca12345uuu Date: Sun, 8 Jun 2025 15:35:20 +0800 Subject: [PATCH 31/94] 1 --- .../ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h b/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h index c7e2cfe1e..aac2fd979 100644 --- a/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h +++ b/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h @@ -91,7 +91,7 @@ class TbufPoolImpl { __aicore__ inline void CopyOut(int32_t progress) { 
AscendC::LocalTensor dstLocal = dstQue.DeQue(); - AscendC::DataCopy(dstGlobal[progress*1024], dstLocal, 1024); + AscendC::DataCopy(dstGlobal0[progress*1024], dstLocal, 1024); dstQue.FreeTensor(dstLocal); } __aicore__ inline void CopyIn1(int32_t progress) -- Gitee From ad4fac93eaa2a0f3d646eda2bfe11601e8cd0917 Mon Sep 17 00:00:00 2001 From: alpaca12345uuu Date: Sun, 8 Jun 2025 15:39:32 +0800 Subject: [PATCH 32/94] 1 --- .../ascendc/2_features/2_tbufpool/main.cpp | 59 ++++++++++++++++--- 1 file changed, 50 insertions(+), 9 deletions(-) diff --git a/operator/ascendc/2_features/2_tbufpool/main.cpp b/operator/ascendc/2_features/2_tbufpool/main.cpp index e31ab475b..ef20304f3 100644 --- a/operator/ascendc/2_features/2_tbufpool/main.cpp +++ b/operator/ascendc/2_features/2_tbufpool/main.cpp @@ -27,7 +27,7 @@ constexpr uint32_t TILING_SIZE = 1; extern void GenerateTilingData(const uint32_t totalLength, uint8_t *tilingBuf); -static bool CompareResult(const void *outputData, int64_t outSize) { +static bool CompareResultAdd(const void *outputData, int64_t outSize) { void *goldenData; #ifdef ASCENDC_CPU_DEBUG goldenData = (uint8_t *)AscendC::GmAlloc(outSize); @@ -35,9 +35,9 @@ static bool CompareResult(const void *outputData, int64_t outSize) { CHECK_ACL(aclrtMallocHost((void **)(&goldenData), outSize)); #endif size_t goldenSize = outSize; - bool ret = ReadFile("../output/golden.bin", goldenSize, goldenData, goldenSize); + bool ret = ReadFile("../output/golden_add.bin", goldenSize, goldenData, goldenSize); if (ret) { - printf("ReadFile golden.bin success!\n"); + printf("ReadFile golden_add.bin success!\n"); } else { printf("test failed!\n"); return false; @@ -51,7 +51,7 @@ static bool CompareResult(const void *outputData, int64_t outSize) { float ae = std::abs(a - b); float re = ae / abs(b); if (ae > EPS && re > EPS) { - printf("CompareResult failed output is %lf, golden is %lf\n", a, b); + printf("CompareResultAdd failed output is %lf, golden is %lf\n", a, b); wrongNum++; } } @@ 
-63,7 +63,48 @@ static bool CompareResult(const void *outputData, int64_t outSize) { if (wrongNum != 0) { return false; } else { - printf("CompareResult golden.bin success!\n"); + printf("CompareResultAdd golden_add.bin success!\n"); + return true; + } +} + +static bool CompareResultSub(const void *outputData, int64_t outSize) { + void *goldenData; +#ifdef ASCENDC_CPU_DEBUG + goldenData = (uint8_t *)AscendC::GmAlloc(outSize); +#else + CHECK_ACL(aclrtMallocHost((void **)(&goldenData), outSize)); +#endif + size_t goldenSize = outSize; + bool ret = ReadFile("../output/golden_sub.bin", goldenSize, goldenData, goldenSize); + if (ret) { + printf("ReadFile golden_sub.bin success!\n"); + } else { + printf("test failed!\n"); + return false; + } + constexpr float EPS = 1e-4; + int64_t wrongNum = 0; + + for (int i = 0; i < outSize / sizeof(float); i++) { + float a = (reinterpret_cast(outputData))[i]; + float b = (reinterpret_cast(goldenData))[i]; + float ae = std::abs(a - b); + float re = ae / abs(b); + if (ae > EPS && re > EPS) { + printf("CompareResultSub failed output is %lf, golden is %lf\n", a, b); + wrongNum++; + } + } +#ifdef ASCENDC_CPU_DEBUG + AscendC::GmFree((void *)goldenData); +#else + CHECK_ACL(aclrtFreeHost(goldenData)); +#endif + if (wrongNum != 0) { + return false; + } else { + printf("CompareResultSub golden_sub.bin success!\n"); return true; } } @@ -95,9 +136,9 @@ int32_t main(int32_t argc, char *argv[]) { bool goldenResultAdd = true; - goldenResultAdd = CompareResult(zAdd, outputSize); + goldenResultAdd = CompareResultAdd(zAdd, outputSize); bool goldenResult_sub = true; - goldenResultAdd = CompareResult(zSub, outputSize); + goldenResultAdd = CompareResultSub(zSub, outputSize); AscendC::GmFree((void *)x); AscendC::GmFree((void *)y); @@ -156,9 +197,9 @@ int32_t main(int32_t argc, char *argv[]) { // Compare the result with the golden result bool goldenResultAdd = true; - goldenResultAdd = CompareResult(zHostAdd, outputSize); + goldenResultAdd = 
CompareResultAdd(zHostAdd, outputSize); bool goldenResultSub = true; - goldenResultSub = CompareResult(zHostSub, outputSize); + goldenResultSub = CompareResultSub(zHostSub, outputSize); // Clean up memory CHECK_ACL(aclrtFree(xDevice)); -- Gitee From ae698d2b6c4a9184bea5012fd25470f1616a63a4 Mon Sep 17 00:00:00 2001 From: alpaca12345uuu Date: Sun, 8 Jun 2025 15:41:51 +0800 Subject: [PATCH 33/94] 1 --- operator/ascendc/2_features/2_tbufpool/main.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/operator/ascendc/2_features/2_tbufpool/main.cpp b/operator/ascendc/2_features/2_tbufpool/main.cpp index ef20304f3..475cb16c5 100644 --- a/operator/ascendc/2_features/2_tbufpool/main.cpp +++ b/operator/ascendc/2_features/2_tbufpool/main.cpp @@ -137,8 +137,8 @@ int32_t main(int32_t argc, char *argv[]) { bool goldenResultAdd = true; goldenResultAdd = CompareResultAdd(zAdd, outputSize); - bool goldenResult_sub = true; - goldenResultAdd = CompareResultSub(zSub, outputSize); + bool goldenResultSub = true; + goldenResultSub = CompareResultSub(zSub, outputSize); AscendC::GmFree((void *)x); AscendC::GmFree((void *)y); -- Gitee From 851cf34608ceb1a438b967637ddbc0f1cec4ebca Mon Sep 17 00:00:00 2001 From: alpaca12345uuu Date: Sun, 8 Jun 2025 15:45:12 +0800 Subject: [PATCH 34/94] 1 --- operator/ascendc/2_features/2_tbufpool/main.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/operator/ascendc/2_features/2_tbufpool/main.cpp b/operator/ascendc/2_features/2_tbufpool/main.cpp index 475cb16c5..a9e1a2f3b 100644 --- a/operator/ascendc/2_features/2_tbufpool/main.cpp +++ b/operator/ascendc/2_features/2_tbufpool/main.cpp @@ -128,7 +128,7 @@ int32_t main(int32_t argc, char *argv[]) { AscendC::SetKernelMode(KernelMode::AIV_MODE); // run in aiv mode - ICPU_RUN_KF(tbufpool_custom, USED_CORE_NUM, x, y, zAdd, zSub, + ICPU_RUN_KF(tbufpool_custom, x, y, zAdd, zSub, *reinterpret_cast(tiling)); // use this macro for cpu debug WriteFile("../output/output_add.bin", 
zAdd, outputSize); -- Gitee From 49080d4a61fdb4bcff890e07c418363e89f7da24 Mon Sep 17 00:00:00 2001 From: alpaca12345uuu Date: Sun, 8 Jun 2025 15:46:42 +0800 Subject: [PATCH 35/94] 1 --- operator/ascendc/2_features/2_tbufpool/main.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/operator/ascendc/2_features/2_tbufpool/main.cpp b/operator/ascendc/2_features/2_tbufpool/main.cpp index a9e1a2f3b..475cb16c5 100644 --- a/operator/ascendc/2_features/2_tbufpool/main.cpp +++ b/operator/ascendc/2_features/2_tbufpool/main.cpp @@ -128,7 +128,7 @@ int32_t main(int32_t argc, char *argv[]) { AscendC::SetKernelMode(KernelMode::AIV_MODE); // run in aiv mode - ICPU_RUN_KF(tbufpool_custom, x, y, zAdd, zSub, + ICPU_RUN_KF(tbufpool_custom, USED_CORE_NUM, x, y, zAdd, zSub, *reinterpret_cast(tiling)); // use this macro for cpu debug WriteFile("../output/output_add.bin", zAdd, outputSize); -- Gitee From fb1758cf475e75cf2e2f5c4e25ae41b89f2ac129 Mon Sep 17 00:00:00 2001 From: alpaca12345uuu Date: Sun, 8 Jun 2025 15:47:45 +0800 Subject: [PATCH 36/94] 1 --- operator/ascendc/2_features/2_tbufpool/main.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/operator/ascendc/2_features/2_tbufpool/main.cpp b/operator/ascendc/2_features/2_tbufpool/main.cpp index 475cb16c5..eff35e086 100644 --- a/operator/ascendc/2_features/2_tbufpool/main.cpp +++ b/operator/ascendc/2_features/2_tbufpool/main.cpp @@ -128,8 +128,7 @@ int32_t main(int32_t argc, char *argv[]) { AscendC::SetKernelMode(KernelMode::AIV_MODE); // run in aiv mode - ICPU_RUN_KF(tbufpool_custom, USED_CORE_NUM, x, y, zAdd, zSub, - *reinterpret_cast(tiling)); // use this macro for cpu debug + ICPU_RUN_KF(tbufpool_custom, USED_CORE_NUM, x, y, zAdd, zSub, *reinterpret_cast(tiling)); // use this macro for cpu debug WriteFile("../output/output_add.bin", zAdd, outputSize); WriteFile("../output/output_sub.bin", zSub, outputSize); -- Gitee From c0a3544dbb9e3ec7cbe2dd0810bbc37426198f11 Mon Sep 17 00:00:00 2001 From: 
alpaca12345uuu Date: Sun, 8 Jun 2025 15:55:10 +0800 Subject: [PATCH 37/94] 1 --- operator/ascendc/2_features/2_tbufpool/main.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/operator/ascendc/2_features/2_tbufpool/main.cpp b/operator/ascendc/2_features/2_tbufpool/main.cpp index eff35e086..888a97314 100644 --- a/operator/ascendc/2_features/2_tbufpool/main.cpp +++ b/operator/ascendc/2_features/2_tbufpool/main.cpp @@ -16,7 +16,7 @@ #include "tiling/platform/platform_ascendc.h" #else #include "tikicpulib.h" -extern "C" __global__ __aicore__ void tbufpool_custom(GM_ADDR x, GM_ADDR y, GM_ADDR z, TbufPoolTilingData tiling); +extern "C" __global__ __aicore__ void tbufpool_custom(GM_ADDR x, GM_ADDR y, GM_ADDR zAdd, GM_ADDR zSub, TbufPoolTilingData tiling); #endif namespace { -- Gitee From 7c2070799af2bcb27b87b75440662603ed685506 Mon Sep 17 00:00:00 2001 From: alpaca12345uuu Date: Sun, 8 Jun 2025 15:59:46 +0800 Subject: [PATCH 38/94] 1 --- .../2_tbufpool/op_kernel/tbufpool_custom.h | 38 ++++++------------- 1 file changed, 11 insertions(+), 27 deletions(-) diff --git a/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h b/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h index aac2fd979..e96dca3f7 100644 --- a/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h +++ b/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h @@ -19,9 +19,9 @@ constexpr int32_t BUFFER_NUM = 1; constexpr int32_t BUFFER_LENGTH = 4096*sizeof(float); constexpr int32_t BUFF_POOL_LENGTH = 2048*sizeof(float); constexpr int32_t INIT_TENSOR_LENGTH = 1024*sizeof(float); -constexpr int32_t TENSOR_LENGTH = 512*sizeof(float); constexpr int32_t SPLIT_NUM = 2; constexpr int32_t TOTL_NUM = 2048; +constexpr int32_t COMPUTE_LENGTH = 1024 class TbufPoolImpl { public: __aicore__ inline TbufPoolImpl() {} @@ -39,28 +39,19 @@ class TbufPoolImpl { tbufPool0.InitBuffer(srcQue0, BUFFER_NUM, BUFF_POOL_LENGTH); // Total src0 
tbufPool0.InitBufPool(tbufPool1, BUFF_POOL_LENGTH); tbufPool0.InitBufPool(tbufPool2, BUFF_POOL_LENGTH, tbufPool1); - tbufPool1.InitBuffer(srcQue1, BUFFER_NUM, INIT_TENSOR_LENGTH); tbufPool1.InitBuffer(dstQue, BUFFER_NUM, INIT_TENSOR_LENGTH); - //tbufPool1.InitBuffer(dstQue1, BUFFER_NUM, INIT_TENSOR_LENGTH); } __aicore__ inline void Process() { - // CopyIn(); - // Compute(); - // CopyOut(); - // tbufPool1.Reset(); - // CopyIn1(); - // Compute1(); - // CopyOut1(); - // tbufPool2.Reset(); - //tbufPool0.Reset(); for (int32_t i = 0; i < SPLIT_NUM; i++) { + //stage 1 CopyIn(i); Compute(i); CopyOut(i); tbufPool1.Reset(); + //stage 2 CopyIn1(i); Compute1(i); CopyOut1(i); @@ -73,8 +64,8 @@ class TbufPoolImpl { { AscendC::LocalTensor src0Local = srcQue0.AllocTensor(); AscendC::LocalTensor src1Local = srcQue1.AllocTensor(); - AscendC::DataCopy(src0Local, src0Global[progress*1024], 1024); - AscendC::DataCopy(src1Local, src1Global[progress*1024], 1024); + AscendC::DataCopy(src0Local, src0Global[progress*COMPUTE_LENGTH], COMPUTE_LENGTH); + AscendC::DataCopy(src1Local, src1Global[progress*COMPUTE_LENGTH], COMPUTE_LENGTH); srcQue0.EnQue(src0Local); srcQue1.EnQue(src1Local); } @@ -83,7 +74,7 @@ class TbufPoolImpl { AscendC::LocalTensor src0Local = srcQue0.DeQue(); AscendC::LocalTensor src1Local = srcQue1.DeQue(); AscendC::LocalTensor dstLocal = dstQue.AllocTensor(); - AscendC::Add(dstLocal, src0Local, src1Local, 1024); + AscendC::Add(dstLocal, src0Local, src1Local, COMPUTE_LENGTH); dstQue.EnQue(dstLocal); srcQue0.FreeTensor(src0Local); srcQue1.FreeTensor(src1Local); @@ -91,15 +82,15 @@ class TbufPoolImpl { __aicore__ inline void CopyOut(int32_t progress) { AscendC::LocalTensor dstLocal = dstQue.DeQue(); - AscendC::DataCopy(dstGlobal0[progress*1024], dstLocal, 1024); + AscendC::DataCopy(dstGlobal0[progress*COMPUTE_LENGTH], dstLocal, COMPUTE_LENGTH); dstQue.FreeTensor(dstLocal); } __aicore__ inline void CopyIn1(int32_t progress) { AscendC::LocalTensor src0Local = 
srcQue0.AllocTensor(); AscendC::LocalTensor src1Local = srcQue1.AllocTensor(); - AscendC::DataCopy(src0Local, src0Global[progress*1024], 1024); - AscendC::DataCopy(src1Local, src1Global[progress*1024], 1024); + AscendC::DataCopy(src0Local, src0Global[progress*COMPUTE_LENGTH], COMPUTE_LENGTH); + AscendC::DataCopy(src1Local, src1Global[progress*COMPUTE_LENGTH], COMPUTE_LENGTH); srcQue0.EnQue(src0Local); srcQue1.EnQue(src1Local); } @@ -108,7 +99,7 @@ class TbufPoolImpl { AscendC::LocalTensor src0Local = srcQue0.DeQue(); AscendC::LocalTensor src1Local = srcQue1.DeQue(); AscendC::LocalTensor dstLocal = dstQue.AllocTensor(); - AscendC::Sub(dstLocal, src0Local, src1Local, 1024); + AscendC::Sub(dstLocal, src0Local, src1Local, COMPUTE_LENGTH); dstQue.EnQue(dstLocal); srcQue0.FreeTensor(src0Local); srcQue1.FreeTensor(src1Local); @@ -116,16 +107,9 @@ class TbufPoolImpl { __aicore__ inline void CopyOut1(int32_t progress) { AscendC::LocalTensor dstLocal = dstQue.DeQue(); - AscendC::DataCopy(dstGlobal1[progress*1024], dstLocal, 1024); + AscendC::DataCopy(dstGlobal1[progress*COMPUTE_LENGTH], dstLocal, COMPUTE_LENGTH); dstQue.FreeTensor(dstLocal); } - // private: - // AscendC::TPipe* pipe; - // AscendC::TBufPool tbufPool0, tbufPool1, tbufPool2; - // AscendC::TQue srcQue0, srcQue1, srcQue2; - // AscendC::TQue dstQue0, dstQue1; - // AscendC::GlobalTensor src0Global, src1Global, dstGlobal; - // uint32_t totalLength = 0; private: AscendC::TPipe* pipe; -- Gitee From 598a05f7cf07fa350c82f22e672d9e402e94eb42 Mon Sep 17 00:00:00 2001 From: alpaca12345uuu Date: Sun, 8 Jun 2025 16:00:39 +0800 Subject: [PATCH 39/94] 1 --- .../ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h b/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h index e96dca3f7..41f5b0fc6 100644 --- a/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h +++ 
b/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h @@ -21,7 +21,8 @@ constexpr int32_t BUFF_POOL_LENGTH = 2048*sizeof(float); constexpr int32_t INIT_TENSOR_LENGTH = 1024*sizeof(float); constexpr int32_t SPLIT_NUM = 2; constexpr int32_t TOTL_NUM = 2048; -constexpr int32_t COMPUTE_LENGTH = 1024 +constexpr int32_t COMPUTE_LENGTH = 1024; + class TbufPoolImpl { public: __aicore__ inline TbufPoolImpl() {} -- Gitee From c7722c17346b8a57197abfde8d5e5ea063c8a0de Mon Sep 17 00:00:00 2001 From: alpaca12345uuu Date: Sun, 8 Jun 2025 16:07:08 +0800 Subject: [PATCH 40/94] 1 --- operator/ascendc/2_features/2_tbufpool/README.md | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/operator/ascendc/2_features/2_tbufpool/README.md b/operator/ascendc/2_features/2_tbufpool/README.md index 25b8f4ab5..efb0295c9 100644 --- a/operator/ascendc/2_features/2_tbufpool/README.md +++ b/operator/ascendc/2_features/2_tbufpool/README.md @@ -13,22 +13,26 @@ │ └── run.sh // 编译运行算子的脚本 ``` ## 代码实现介绍 -数据量较大且内存有限时,无法一次完成所有数据搬运,需要拆分成多个阶段计算,每次计算使用其中的一部分数据,可以通过TBufPool资源池进行内存地址复用。本例中,通过调用InitBufPool基础API对Add算子实现过程进行内存管理。从Tpipe划分出资源池tbufPool0,tbufPool0为src0Gm分配空间后,继续分配了资源池tbufPool1,指定tbufPool1与tbufPool2复用并分别运用于第一、二轮计算,此时tbufPool1及tbufPool2共享起始地址及长度。 +数据量较大且内存有限时,无法一次完成所有数据搬运,需要拆分成多个阶段计算,每次计算使用其中的一部分数据,可以通过TBufPool资源池进行内存地址复用。本例中,通过调用InitBufPool基础API对Add算子和Sub算子实现过程进行内存管理。从Tpipe划分出资源池tbufPool0,tbufPool0为src0Gm分配空间后,继续分配了资源池tbufPool1,指定tbufPool1与tbufPool2复用并分别运用于第一、二轮计算,此时tbufPool1及tbufPool2共享起始地址及长度。 - kernel实现 Add算子的数学表达式为: ``` z = x + y ``` - 计算逻辑是:Ascend C提供的矢量计算接口的操作元素都为LocalTensor,首先将部分输入数据src0Gm,部分输入数据src1Gm搬运进片上储存,调用计算接口完成相加计算,搬出到外部存储上。之后再将剩余输入数据搬运进片上储存,调用计算接口完成相加计算,得到最终结果,再搬出到外部存储上。 + Sub算子的数学表达式为: + ``` + z = x - y + ``` + 计算逻辑是:Ascend C提供的矢量计算接口的操作元素都为LocalTensor,首先启用tbufool1,将部分输入数据src0Gm,部分输入数据src1Gm搬运进片上储存,调用计算接口完成相加计算,搬出到外部存储上。之后切换到tbufpool2进行相减计算。完成后切换回tbufpool1完成剩余数据相加计算,得到最终相加结果,再切换到tbufpool2完成剩余数据相减计算,得到最终结果,再搬出到外部存储上。 
Add算子的实现流程分为6个基本任务:CopyIn,Compute,CopyOut,CopyIn1,Compute1,CopyOut1。 - - CopyIn任务负责将Global Memory上的部分输入Tensor src0Gm和src1Gm搬运到Local Memory,分别存储在srcoLocal、src1Local; + - CopyIn任务负责将Global Memory上的部分输入Tensor src0Gm和src1Gm搬运到Local Memory,分别存储在src0Local、src1Local; - Compute任务负责对src0Local、src1Local执行加法操作,计算结果存储在dstLocal中; - - CopyOut任务负责将输出数据从dstLocal搬运至Global Memory上的输出Tensor dstGm中。 + - CopyOut任务负责将输出数据从dstLocal搬运至Global Memory上的输出Tensor dstGm0中。 - CopyIn1任务负责将Global Memory上的剩余输入Tensor src0Gm和src1Gm搬运到Local Memory,分别存储在src0Local、src1Local; - - Compute1任务负责对src0Local、src1Local执行加法操作,计算结果存储在dstLocal中; - - CopyOut1任务负责将输出数据从dstLocal搬运至Global Memory上的输出Tensor dstGm中。 + - Compute1任务负责对src0Local、src1Local执行减法操作,计算结果存储在dstLocal中; + - CopyOut1任务负责将输出数据从dstLocal搬运至Global Memory上的输出Tensor dstGm1中。 - 调用实现 1. CPU侧运行验证主要通过ICPU_RUN_KF CPU调测宏等CPU调测库提供的接口来完成; -- Gitee From 12ba69ab794543029dacc32a6d262d8b1a1b2280 Mon Sep 17 00:00:00 2001 From: alpaca12345uuu Date: Sun, 8 Jun 2025 16:15:13 +0800 Subject: [PATCH 41/94] 1 --- .../ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h b/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h index 41f5b0fc6..b6fe373d0 100644 --- a/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h +++ b/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h @@ -120,8 +120,7 @@ class TbufPoolImpl { AscendC::TQue srcQue0; AscendC::TQue srcQue1; AscendC::TQue srcQue2; - AscendC::TQue dstQue; - //AscendC::TQue dstQue1; + AscendC::TQue dstQue; AscendC::GlobalTensor src0Global; AscendC::GlobalTensor src1Global; AscendC::GlobalTensor dstGlobal0; -- Gitee From eaae604b023eca5a1eefe37326498d9c3cf195ee Mon Sep 17 00:00:00 2001 From: alpaca12345uuu Date: Mon, 9 Jun 2025 11:41:16 +0800 Subject: [PATCH 42/94] =?UTF-8?q?=E4=BB=A3=E7=A0=81=E4=BF=AE=E6=94=B9?= MIME-Version: 1.0 Content-Type: 
text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../2_tbufpool/op_kernel/tbufpool_custom.h | 30 +++++++++++-------- operator/ascendc/2_features/README.md | 2 +- 2 files changed, 18 insertions(+), 14 deletions(-) diff --git a/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h b/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h index b6fe373d0..f6a9d0717 100644 --- a/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h +++ b/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h @@ -41,7 +41,9 @@ class TbufPoolImpl { tbufPool0.InitBufPool(tbufPool1, BUFF_POOL_LENGTH); tbufPool0.InitBufPool(tbufPool2, BUFF_POOL_LENGTH, tbufPool1); tbufPool1.InitBuffer(srcQue1, BUFFER_NUM, INIT_TENSOR_LENGTH); - tbufPool1.InitBuffer(dstQue, BUFFER_NUM, INIT_TENSOR_LENGTH); + tbufPool1.InitBuffer(dstQue1, BUFFER_NUM, INIT_TENSOR_LENGTH); + tbufPool2.InitBuffer(srcQue2, BUFFER_NUM, INIT_TENSOR_LENGTH); + tbufPool2.InitBuffer(dstQue2, BUFFER_NUM, INIT_TENSOR_LENGTH); } __aicore__ inline void Process() { @@ -65,8 +67,8 @@ class TbufPoolImpl { { AscendC::LocalTensor src0Local = srcQue0.AllocTensor(); AscendC::LocalTensor src1Local = srcQue1.AllocTensor(); - AscendC::DataCopy(src0Local, src0Global[progress*COMPUTE_LENGTH], COMPUTE_LENGTH); - AscendC::DataCopy(src1Local, src1Global[progress*COMPUTE_LENGTH], COMPUTE_LENGTH); + AscendC::DataCopy(src0Local, src0Global[progress * COMPUTE_LENGTH], COMPUTE_LENGTH); + AscendC::DataCopy(src1Local, src1Global[progress * COMPUTE_LENGTH], COMPUTE_LENGTH); srcQue0.EnQue(src0Local); srcQue1.EnQue(src1Local); } @@ -74,7 +76,7 @@ class TbufPoolImpl { { AscendC::LocalTensor src0Local = srcQue0.DeQue(); AscendC::LocalTensor src1Local = srcQue1.DeQue(); - AscendC::LocalTensor dstLocal = dstQue.AllocTensor(); + AscendC::LocalTensor dstLocal = dstQue1.AllocTensor(); AscendC::Add(dstLocal, src0Local, src1Local, COMPUTE_LENGTH); dstQue.EnQue(dstLocal); srcQue0.FreeTensor(src0Local); @@ -82,24 
+84,24 @@ class TbufPoolImpl { } __aicore__ inline void CopyOut(int32_t progress) { - AscendC::LocalTensor dstLocal = dstQue.DeQue(); - AscendC::DataCopy(dstGlobal0[progress*COMPUTE_LENGTH], dstLocal, COMPUTE_LENGTH); + AscendC::LocalTensor dstLocal = dstQue1.DeQue(); + AscendC::DataCopy(dstGlobal0[progress * COMPUTE_LENGTH], dstLocal, COMPUTE_LENGTH); dstQue.FreeTensor(dstLocal); } __aicore__ inline void CopyIn1(int32_t progress) { AscendC::LocalTensor src0Local = srcQue0.AllocTensor(); - AscendC::LocalTensor src1Local = srcQue1.AllocTensor(); - AscendC::DataCopy(src0Local, src0Global[progress*COMPUTE_LENGTH], COMPUTE_LENGTH); - AscendC::DataCopy(src1Local, src1Global[progress*COMPUTE_LENGTH], COMPUTE_LENGTH); + AscendC::LocalTensor src1Local = srcQue2.AllocTensor(); + AscendC::DataCopy(src0Local, src0Global[progress * COMPUTE_LENGTH], COMPUTE_LENGTH); + AscendC::DataCopy(src1Local, src1Global[progress * COMPUTE_LENGTH], COMPUTE_LENGTH); srcQue0.EnQue(src0Local); srcQue1.EnQue(src1Local); } __aicore__ inline void Compute1(int32_t progress) { AscendC::LocalTensor src0Local = srcQue0.DeQue(); - AscendC::LocalTensor src1Local = srcQue1.DeQue(); - AscendC::LocalTensor dstLocal = dstQue.AllocTensor(); + AscendC::LocalTensor src1Local = srcQue2.DeQue(); + AscendC::LocalTensor dstLocal = dstQue2.AllocTensor(); AscendC::Sub(dstLocal, src0Local, src1Local, COMPUTE_LENGTH); dstQue.EnQue(dstLocal); srcQue0.FreeTensor(src0Local); @@ -107,7 +109,7 @@ class TbufPoolImpl { } __aicore__ inline void CopyOut1(int32_t progress) { - AscendC::LocalTensor dstLocal = dstQue.DeQue(); + AscendC::LocalTensor dstLocal = dstQue2.DeQue(); AscendC::DataCopy(dstGlobal1[progress*COMPUTE_LENGTH], dstLocal, COMPUTE_LENGTH); dstQue.FreeTensor(dstLocal); } @@ -120,7 +122,9 @@ class TbufPoolImpl { AscendC::TQue srcQue0; AscendC::TQue srcQue1; AscendC::TQue srcQue2; - AscendC::TQue dstQue; + AscendC::TQue dstQue; + AscendC::TQue dstQue1; + AscendC::TQue dstQue2; AscendC::GlobalTensor src0Global; 
AscendC::GlobalTensor src1Global; AscendC::GlobalTensor dstGlobal0; diff --git a/operator/ascendc/2_features/README.md b/operator/ascendc/2_features/README.md index 6aa3f8655..40c48fba9 100644 --- a/operator/ascendc/2_features/README.md +++ b/operator/ascendc/2_features/README.md @@ -15,7 +15,7 @@ Ascend C相关特性的样例。特性样例逐步补充中。 当前本目录包含的所有样例如下。 | 目录名称 | 功能描述 | 运行环境 | | ------------------------------------------------------------ | ---------------------------------------------------- | -- | -| [2_tbufpool](./2_tbufpool) | 基于Ascend C的自定义Vector算子及kernel直调样例,通过TbufPool实现Add算子计算过程中的内存复用,提高计算效率。|Atlas A2训练系列产品/Atlas 800I A2推理产品| +| [2_tbufpool](./2_tbufpool) | 基于Ascend C的自定义Vector算子及kernel直调样例,通过TBufPool实现Add算子计算过程中的内存复用,提高计算效率。|Atlas A2训练系列产品/Atlas 800I A2推理产品| | [12_cube_group](./12_cube_group) | 基于Ascend C的自定义算子及FrameworkLaunch调用样例,通过软同步控制AIC和AIV之间进行通讯,实现AI Core计算资源分组。|Atlas A2训练系列产品/Atlas 800I A2推理产品| | [13_matmul_api_ibshare](./13_matmul_api_ibshare) | 基于Ascend C的自定义Cube算子及Kernellaunch调用样例,通过A矩阵与B矩阵使能IBSHARE,实现算子性能提升|Atlas A2训练系列产品/Atlas 800I A2推理产品| | [14_matmul_api_constant](./14_matmul_api_constant) | 基于Ascend C的自定义Cube算子及FrameworkLaunch调用样例,通过使用全量常量化的MatmulApiStaticTiling模板参数,替代非常量的TCubeTiling参数,以减少Scalar计算开销,实现算子性能提升|Atlas A2训练系列产品/Atlas 800I A2推理产品| -- Gitee From 7500d7be2be90f5943013638d633b41bd5290174 Mon Sep 17 00:00:00 2001 From: alpaca12345uuu Date: Mon, 9 Jun 2025 14:30:37 +0800 Subject: [PATCH 43/94] =?UTF-8?q?=E4=BB=A3=E7=A0=81=E4=BF=AE=E6=94=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../2_features/2_tbufpool/op_kernel/tbufpool_custom.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h b/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h index f6a9d0717..07781299a 100644 --- a/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h +++ 
b/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h @@ -78,7 +78,7 @@ class TbufPoolImpl { AscendC::LocalTensor src1Local = srcQue1.DeQue(); AscendC::LocalTensor dstLocal = dstQue1.AllocTensor(); AscendC::Add(dstLocal, src0Local, src1Local, COMPUTE_LENGTH); - dstQue.EnQue(dstLocal); + dstQue1.EnQue(dstLocal); srcQue0.FreeTensor(src0Local); srcQue1.FreeTensor(src1Local); } @@ -86,7 +86,7 @@ class TbufPoolImpl { { AscendC::LocalTensor dstLocal = dstQue1.DeQue(); AscendC::DataCopy(dstGlobal0[progress * COMPUTE_LENGTH], dstLocal, COMPUTE_LENGTH); - dstQue.FreeTensor(dstLocal); + dstQue1.FreeTensor(dstLocal); } __aicore__ inline void CopyIn1(int32_t progress) { @@ -103,7 +103,7 @@ class TbufPoolImpl { AscendC::LocalTensor src1Local = srcQue2.DeQue(); AscendC::LocalTensor dstLocal = dstQue2.AllocTensor(); AscendC::Sub(dstLocal, src0Local, src1Local, COMPUTE_LENGTH); - dstQue.EnQue(dstLocal); + dstQue2.EnQue(dstLocal); srcQue0.FreeTensor(src0Local); srcQue1.FreeTensor(src1Local); } @@ -111,7 +111,7 @@ class TbufPoolImpl { { AscendC::LocalTensor dstLocal = dstQue2.DeQue(); AscendC::DataCopy(dstGlobal1[progress*COMPUTE_LENGTH], dstLocal, COMPUTE_LENGTH); - dstQue.FreeTensor(dstLocal); + dstQue2.FreeTensor(dstLocal); } private: -- Gitee From 3e62cb8a0300605e85b1a6bde48893ecf5700d86 Mon Sep 17 00:00:00 2001 From: alpaca12345uuu Date: Mon, 9 Jun 2025 14:41:22 +0800 Subject: [PATCH 44/94] =?UTF-8?q?=E4=BB=A3=E7=A0=81=E4=BF=AE=E6=94=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../2_features/2_tbufpool/op_kernel/tbufpool_custom.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h b/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h index 07781299a..45ae37586 100644 --- a/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h +++ 
b/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h @@ -42,8 +42,8 @@ class TbufPoolImpl { tbufPool0.InitBufPool(tbufPool2, BUFF_POOL_LENGTH, tbufPool1); tbufPool1.InitBuffer(srcQue1, BUFFER_NUM, INIT_TENSOR_LENGTH); tbufPool1.InitBuffer(dstQue1, BUFFER_NUM, INIT_TENSOR_LENGTH); - tbufPool2.InitBuffer(srcQue2, BUFFER_NUM, INIT_TENSOR_LENGTH); - tbufPool2.InitBuffer(dstQue2, BUFFER_NUM, INIT_TENSOR_LENGTH); + // tbufPool2.InitBuffer(srcQue2, BUFFER_NUM, INIT_TENSOR_LENGTH); + // tbufPool2.InitBuffer(dstQue2, BUFFER_NUM, INIT_TENSOR_LENGTH); } __aicore__ inline void Process() { @@ -55,6 +55,8 @@ class TbufPoolImpl { CopyOut(i); tbufPool1.Reset(); //stage 2 + tbufPool2.InitBuffer(srcQue2, BUFFER_NUM, INIT_TENSOR_LENGTH); + tbufPool2.InitBuffer(dstQue2, BUFFER_NUM, INIT_TENSOR_LENGTH); CopyIn1(i); Compute1(i); CopyOut1(i); -- Gitee From a7e81d4fca39cd771578fe59ce441375efa73bbd Mon Sep 17 00:00:00 2001 From: alpaca12345uuu Date: Mon, 9 Jun 2025 14:47:16 +0800 Subject: [PATCH 45/94] =?UTF-8?q?=E4=BB=A3=E7=A0=81=E4=BF=AE=E6=94=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h b/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h index 45ae37586..81ebc31c8 100644 --- a/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h +++ b/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h @@ -50,6 +50,8 @@ class TbufPoolImpl { for (int32_t i = 0; i < SPLIT_NUM; i++) { //stage 1 + tbufPool1.InitBuffer(srcQue1, BUFFER_NUM, INIT_TENSOR_LENGTH); + tbufPool1.InitBuffer(dstQue1, BUFFER_NUM, INIT_TENSOR_LENGTH); CopyIn(i); Compute(i); CopyOut(i); @@ -112,7 +114,7 @@ class TbufPoolImpl { __aicore__ inline void CopyOut1(int32_t progress) { AscendC::LocalTensor dstLocal = dstQue2.DeQue(); - 
AscendC::DataCopy(dstGlobal1[progress*COMPUTE_LENGTH], dstLocal, COMPUTE_LENGTH); + AscendC::DataCopy(dstGlobal1[progress * COMPUTE_LENGTH], dstLocal, COMPUTE_LENGTH); dstQue2.FreeTensor(dstLocal); } -- Gitee From 4464b5626b340aa06564152e8bf16d7e5fe5b78c Mon Sep 17 00:00:00 2001 From: alpaca12345uuu Date: Mon, 9 Jun 2025 14:49:21 +0800 Subject: [PATCH 46/94] =?UTF-8?q?=E4=BB=A3=E7=A0=81=E4=BF=AE=E6=94=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../2_features/2_tbufpool/op_kernel/tbufpool_custom.h | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h b/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h index 81ebc31c8..55451fdb5 100644 --- a/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h +++ b/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h @@ -42,23 +42,19 @@ class TbufPoolImpl { tbufPool0.InitBufPool(tbufPool2, BUFF_POOL_LENGTH, tbufPool1); tbufPool1.InitBuffer(srcQue1, BUFFER_NUM, INIT_TENSOR_LENGTH); tbufPool1.InitBuffer(dstQue1, BUFFER_NUM, INIT_TENSOR_LENGTH); - // tbufPool2.InitBuffer(srcQue2, BUFFER_NUM, INIT_TENSOR_LENGTH); - // tbufPool2.InitBuffer(dstQue2, BUFFER_NUM, INIT_TENSOR_LENGTH); + tbufPool2.InitBuffer(srcQue2, BUFFER_NUM, INIT_TENSOR_LENGTH); + tbufPool2.InitBuffer(dstQue2, BUFFER_NUM, INIT_TENSOR_LENGTH); } __aicore__ inline void Process() { for (int32_t i = 0; i < SPLIT_NUM; i++) { //stage 1 - tbufPool1.InitBuffer(srcQue1, BUFFER_NUM, INIT_TENSOR_LENGTH); - tbufPool1.InitBuffer(dstQue1, BUFFER_NUM, INIT_TENSOR_LENGTH); CopyIn(i); Compute(i); CopyOut(i); tbufPool1.Reset(); //stage 2 - tbufPool2.InitBuffer(srcQue2, BUFFER_NUM, INIT_TENSOR_LENGTH); - tbufPool2.InitBuffer(dstQue2, BUFFER_NUM, INIT_TENSOR_LENGTH); CopyIn1(i); Compute1(i); CopyOut1(i); -- Gitee From d68476da308bfde4ffd99538054f25137ae7798a Mon Sep 17 00:00:00 2001 From: alpaca12345uuu 
Date: Mon, 9 Jun 2025 15:21:47 +0800 Subject: [PATCH 47/94] =?UTF-8?q?=E4=BB=A3=E7=A0=81=E4=BF=AE=E6=94=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h b/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h index 55451fdb5..ca7c25899 100644 --- a/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h +++ b/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h @@ -95,7 +95,7 @@ class TbufPoolImpl { AscendC::DataCopy(src0Local, src0Global[progress * COMPUTE_LENGTH], COMPUTE_LENGTH); AscendC::DataCopy(src1Local, src1Global[progress * COMPUTE_LENGTH], COMPUTE_LENGTH); srcQue0.EnQue(src0Local); - srcQue1.EnQue(src1Local); + srcQue2.EnQue(src1Local); } __aicore__ inline void Compute1(int32_t progress) { @@ -105,7 +105,7 @@ class TbufPoolImpl { AscendC::Sub(dstLocal, src0Local, src1Local, COMPUTE_LENGTH); dstQue2.EnQue(dstLocal); srcQue0.FreeTensor(src0Local); - srcQue1.FreeTensor(src1Local); + srcQue2.FreeTensor(src1Local); } __aicore__ inline void CopyOut1(int32_t progress) { -- Gitee From 0a772da64425bef708e3dba94281cc6656b445e8 Mon Sep 17 00:00:00 2001 From: alpaca12345uuu Date: Mon, 9 Jun 2025 16:57:59 +0800 Subject: [PATCH 48/94] =?UTF-8?q?=E4=BB=A3=E7=A0=81=E4=BF=AE=E6=94=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../ascendc/2_features/2_tbufpool/main.cpp | 38 +++++++++--------- .../2_tbufpool/op_kernel/tbufpool_custom.h | 39 +++++++++++++++---- .../2_features/2_tbufpool/scripts/gen_data.py | 9 ++++- 3 files changed, 57 insertions(+), 29 deletions(-) diff --git a/operator/ascendc/2_features/2_tbufpool/main.cpp b/operator/ascendc/2_features/2_tbufpool/main.cpp index 888a97314..468a4a8b6 100644 --- 
a/operator/ascendc/2_features/2_tbufpool/main.cpp +++ b/operator/ascendc/2_features/2_tbufpool/main.cpp @@ -16,7 +16,7 @@ #include "tiling/platform/platform_ascendc.h" #else #include "tikicpulib.h" -extern "C" __global__ __aicore__ void tbufpool_custom(GM_ADDR x, GM_ADDR y, GM_ADDR zAdd, GM_ADDR zSub, TbufPoolTilingData tiling); +extern "C" __global__ __aicore__ void tbufpool_custom(GM_ADDR x, GM_ADDR y, GM_ADDR zAdd, GM_ADDR zSubMul, TbufPoolTilingData tiling); #endif namespace { @@ -76,9 +76,9 @@ static bool CompareResultSub(const void *outputData, int64_t outSize) { CHECK_ACL(aclrtMallocHost((void **)(&goldenData), outSize)); #endif size_t goldenSize = outSize; - bool ret = ReadFile("../output/golden_sub.bin", goldenSize, goldenData, goldenSize); + bool ret = ReadFile("../output/golden_sub_mul.bin", goldenSize, goldenData, goldenSize); if (ret) { - printf("ReadFile golden_sub.bin success!\n"); + printf("ReadFile golden_sub_mul.bin success!\n"); } else { printf("test failed!\n"); return false; @@ -104,7 +104,7 @@ static bool CompareResultSub(const void *outputData, int64_t outSize) { if (wrongNum != 0) { return false; } else { - printf("CompareResultSub golden_sub.bin success!\n"); + printf("CompareResultSub golden_sub_mul.bin success!\n"); return true; } } @@ -118,7 +118,7 @@ int32_t main(int32_t argc, char *argv[]) { uint8_t *x = (uint8_t *)AscendC::GmAlloc(inputSize); uint8_t *y = (uint8_t *)AscendC::GmAlloc(inputSize); uint8_t *zAdd = (uint8_t *)AscendC::GmAlloc(outputSize); - uint8_t *zSub = (uint8_t *)AscendC::GmAlloc(outputSize); + uint8_t *zSubMul = (uint8_t *)AscendC::GmAlloc(outputSize); uint8_t *tiling = (uint8_t *)AscendC::GmAlloc(tilingSize); ReadFile("../input/input_x.bin", inputSize, x, inputSize); @@ -128,21 +128,21 @@ int32_t main(int32_t argc, char *argv[]) { AscendC::SetKernelMode(KernelMode::AIV_MODE); // run in aiv mode - ICPU_RUN_KF(tbufpool_custom, USED_CORE_NUM, x, y, zAdd, zSub, *reinterpret_cast(tiling)); // use this macro for cpu debug 
+ ICPU_RUN_KF(tbufpool_custom, USED_CORE_NUM, x, y, zAdd, zSubMul, *reinterpret_cast(tiling)); // use this macro for cpu debug WriteFile("../output/output_add.bin", zAdd, outputSize); - WriteFile("../output/output_sub.bin", zSub, outputSize); + WriteFile("../output/output_sub_mul.bin", zSubMul, outputSize); bool goldenResultAdd = true; goldenResultAdd = CompareResultAdd(zAdd, outputSize); bool goldenResultSub = true; - goldenResultSub = CompareResultSub(zSub, outputSize); + goldenResultSub = CompareResultSub(zSubMul, outputSize); AscendC::GmFree((void *)x); AscendC::GmFree((void *)y); AscendC::GmFree((void *)zAdd); - AscendC::GmFree((void *)zSub); + AscendC::GmFree((void *)zSubMul); AscendC::GmFree((void *)tiling); #else CHECK_ACL(aclInit(nullptr)); @@ -154,23 +154,23 @@ int32_t main(int32_t argc, char *argv[]) { uint8_t *xHost; uint8_t *yHost; uint8_t *zHostAdd; - uint8_t *zHostSub; + uint8_t *zHostSubMul; uint8_t *tiling; uint8_t *xDevice; uint8_t *yDevice; uint8_t *zDeviceAdd; - uint8_t *zDeviceSub; + uint8_t *zDeviceSubMul; CHECK_ACL(aclrtMallocHost((void **)(&xHost), inputSize)); CHECK_ACL(aclrtMallocHost((void **)(&yHost), inputSize)); CHECK_ACL(aclrtMallocHost((void **)(&zHostAdd), outputSize)); - CHECK_ACL(aclrtMallocHost((void **)(&zHostSub), outputSize)); + CHECK_ACL(aclrtMallocHost((void **)(&zHostSubMul), outputSize)); CHECK_ACL(aclrtMallocHost((void **)(&tiling), tilingSize)); CHECK_ACL(aclrtMalloc((void **)&xDevice, inputSize, ACL_MEM_MALLOC_HUGE_FIRST)); CHECK_ACL(aclrtMalloc((void **)&yDevice, inputSize, ACL_MEM_MALLOC_HUGE_FIRST)); CHECK_ACL(aclrtMalloc((void **)&zDeviceAdd, outputSize, ACL_MEM_MALLOC_HUGE_FIRST)); - CHECK_ACL(aclrtMalloc((void **)&zDeviceSub, outputSize, ACL_MEM_MALLOC_HUGE_FIRST)); + CHECK_ACL(aclrtMalloc((void **)&zDeviceSubMul, outputSize, ACL_MEM_MALLOC_HUGE_FIRST)); ReadFile("../input/input_x.bin", inputSize, xHost, inputSize); ReadFile("../input/input_y.bin", inputSize, yHost, inputSize); @@ -183,7 +183,7 @@ int32_t 
main(int32_t argc, char *argv[]) { // Execute the kernel ACLRT_LAUNCH_KERNEL(tbufpool_custom) - (USED_CORE_NUM, stream, xDevice, yDevice, zDeviceAdd, zDeviceSub, reinterpret_cast(tiling)); + (USED_CORE_NUM, stream, xDevice, yDevice, zDeviceAdd, zDeviceSubMul, reinterpret_cast(tiling)); // Wait for the stop event to complete CHECK_ACL(aclrtSynchronizeStream(stream)); @@ -191,25 +191,25 @@ int32_t main(int32_t argc, char *argv[]) { // Copy result to host memory and write to output file CHECK_ACL(aclrtMemcpy(zHostAdd, outputSize, zDeviceAdd, outputSize, ACL_MEMCPY_DEVICE_TO_HOST)); WriteFile("../output/output_add.bin", zHostAdd, outputSize); - CHECK_ACL(aclrtMemcpy(zHostSub, outputSize, zDeviceSub, outputSize, ACL_MEMCPY_DEVICE_TO_HOST)); - WriteFile("../output/output_sub.bin", zHostSub, outputSize); + CHECK_ACL(aclrtMemcpy(zHostSubMul, outputSize, zDeviceSubMul, outputSize, ACL_MEMCPY_DEVICE_TO_HOST)); + WriteFile("../output/output_sub_mul.bin", zHostSubMul, outputSize); // Compare the result with the golden result bool goldenResultAdd = true; goldenResultAdd = CompareResultAdd(zHostAdd, outputSize); bool goldenResultSub = true; - goldenResultSub = CompareResultSub(zHostSub, outputSize); + goldenResultSub = CompareResultSub(zHostSubMul, outputSize); // Clean up memory CHECK_ACL(aclrtFree(xDevice)); CHECK_ACL(aclrtFree(yDevice)); CHECK_ACL(aclrtFree(zDeviceAdd)); - CHECK_ACL(aclrtFree(zDeviceSub)); + CHECK_ACL(aclrtFree(zDeviceSubMul)); CHECK_ACL(aclrtFreeHost(xHost)); CHECK_ACL(aclrtFreeHost(yHost)); CHECK_ACL(aclrtFreeHost(zHostAdd)); - CHECK_ACL(aclrtFreeHost(zHostSub)); + CHECK_ACL(aclrtFreeHost(zHostSubMul)); CHECK_ACL(aclrtFreeHost(tiling)); diff --git a/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h b/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h index ca7c25899..ccd7706f1 100644 --- a/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h +++ 
b/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h @@ -16,12 +16,16 @@ namespace MyCustomKernel { constexpr int32_t BUFFER_NUM = 1; +constexpr int32_t BUFFER_NUM_T1 = 1; +constexpr int32_t BUFFER_NUM_T2 = 1; constexpr int32_t BUFFER_LENGTH = 4096*sizeof(float); constexpr int32_t BUFF_POOL_LENGTH = 2048*sizeof(float); constexpr int32_t INIT_TENSOR_LENGTH = 1024*sizeof(float); +constexpr int32_t INIT_TENSOR_LENGTH_HALF = 512*sizeof(float); constexpr int32_t SPLIT_NUM = 2; constexpr int32_t TOTL_NUM = 2048; constexpr int32_t COMPUTE_LENGTH = 1024; +constexpr int32_t COMPUTE_LENGTH_HALF = 512; class TbufPoolImpl { public: @@ -40,10 +44,13 @@ class TbufPoolImpl { tbufPool0.InitBuffer(srcQue0, BUFFER_NUM, BUFF_POOL_LENGTH); // Total src0 tbufPool0.InitBufPool(tbufPool1, BUFF_POOL_LENGTH); tbufPool0.InitBufPool(tbufPool2, BUFF_POOL_LENGTH, tbufPool1); - tbufPool1.InitBuffer(srcQue1, BUFFER_NUM, INIT_TENSOR_LENGTH); - tbufPool1.InitBuffer(dstQue1, BUFFER_NUM, INIT_TENSOR_LENGTH); - tbufPool2.InitBuffer(srcQue2, BUFFER_NUM, INIT_TENSOR_LENGTH); - tbufPool2.InitBuffer(dstQue2, BUFFER_NUM, INIT_TENSOR_LENGTH); + tbufPool1.InitBuffer(srcQue1, BUFFER_NUM_T1, INIT_TENSOR_LENGTH); + tbufPool1.InitBuffer(dstQue1, BUFFER_NUM_T1, INIT_TENSOR_LENGTH); + tbufPool2.InitBuffer(srcQue2, BUFFER_NUM_T2, INIT_TENSOR_LENGTH_HALF); + tbufPool2.InitBuffer(dstQue2, BUFFER_NUM_T2, INIT_TENSOR_LENGTH_HALF); + tbufPool2.InitBuffer(srcQue3, BUFFER_NUM_T2, INIT_TENSOR_LENGTH_HALF); + tbufPool2.InitBuffer(tmp, INIT_TENSOR_LENGTH_HALF); + //tbufPool2.InitBuffer(dstQue2, BUFFER_NUM_T2, INIT_TENSOR_LENGTH); } __aicore__ inline void Process() { @@ -92,25 +99,35 @@ class TbufPoolImpl { { AscendC::LocalTensor src0Local = srcQue0.AllocTensor(); AscendC::LocalTensor src1Local = srcQue2.AllocTensor(); - AscendC::DataCopy(src0Local, src0Global[progress * COMPUTE_LENGTH], COMPUTE_LENGTH); - AscendC::DataCopy(src1Local, src1Global[progress * COMPUTE_LENGTH], COMPUTE_LENGTH); + 
AscendC::LocalTensor src2Local = srcQue3.AllocTensor(); + + AscendC::DataCopy(src0Local, src0Global[progress * COMPUTE_LENGTH_HALF], COMPUTE_LENGTH_HALF); + AscendC::DataCopy(src1Local, src1Global[progress * COMPUTE_LENGTH_HALF], COMPUTE_LENGTH_HALF); + AscendC::DataCopy(src2Local, src1Global[progress * COMPUTE_LENGTH_HALF], COMPUTE_LENGTH_HALF); + srcQue0.EnQue(src0Local); srcQue2.EnQue(src1Local); + srcQue3.EnQue(src2Local); + } __aicore__ inline void Compute1(int32_t progress) { AscendC::LocalTensor src0Local = srcQue0.DeQue(); AscendC::LocalTensor src1Local = srcQue2.DeQue(); + AscendC::LocalTensor src2Local = srcQue3.DeQue(); + AscendC::LocalTensor dstLocal = dstQue2.AllocTensor(); - AscendC::Sub(dstLocal, src0Local, src1Local, COMPUTE_LENGTH); + AscendC::Sub(tmp, src0Local, src1Local, COMPUTE_LENGTH_HALF); + AscendC::Mul(dstLocal, tmp, src2Local, COMPUTE_LENGTH_HALF) dstQue2.EnQue(dstLocal); srcQue0.FreeTensor(src0Local); srcQue2.FreeTensor(src1Local); + srcQue2.FreeTensor(src2Local); } __aicore__ inline void CopyOut1(int32_t progress) { AscendC::LocalTensor dstLocal = dstQue2.DeQue(); - AscendC::DataCopy(dstGlobal1[progress * COMPUTE_LENGTH], dstLocal, COMPUTE_LENGTH); + AscendC::DataCopy(dstGlobal1[progress * COMPUTE_LENGTH_HALF], dstLocal, COMPUTE_LENGTH_HALF); dstQue2.FreeTensor(dstLocal); } @@ -122,14 +139,20 @@ class TbufPoolImpl { AscendC::TQue srcQue0; AscendC::TQue srcQue1; AscendC::TQue srcQue2; + AscendC::TQue srcQue3; + AscendC::TQue dstQue; AscendC::TQue dstQue1; AscendC::TQue dstQue2; + //AscendC::TQue dstQue3; + AscendC::GlobalTensor src0Global; AscendC::GlobalTensor src1Global; AscendC::GlobalTensor dstGlobal0; AscendC::GlobalTensor dstGlobal1; + AscendC::TBuf tmp; + uint32_t totalLength = 0; }; }// namespace MyCustomKernel diff --git a/operator/ascendc/2_features/2_tbufpool/scripts/gen_data.py b/operator/ascendc/2_features/2_tbufpool/scripts/gen_data.py index 1db568ae2..8b74e820a 100644 --- 
a/operator/ascendc/2_features/2_tbufpool/scripts/gen_data.py +++ b/operator/ascendc/2_features/2_tbufpool/scripts/gen_data.py @@ -19,15 +19,20 @@ def gen_golden_data_simple(): # generate value between [-65504, 65504] input_x = np.random.uniform(-65504, np.nextafter(65504, np.inf), input_shape).astype(dtype) input_y = np.random.uniform(-65504, np.nextafter(65504, np.inf), input_shape).astype(dtype) + + half_size = input_shape[0] // 2 + input_x_half = input_x[:half_size, :] + input_y_half = input_y[:half_size, :] golden_add = input_x + input_y - golden_sub = input_x - input_y + golden_sub = input_x_half - input_y_half + golden_sub_mul = np.multiply(golden_sub, input_y_half) os.system("mkdir -p ./input") input_x.tofile("./input/input_x.bin") input_y.tofile("./input/input_y.bin") os.system("mkdir -p ./output") golden_add.tofile("./output/golden_add.bin") - golden_sub.tofile("./output/golden_sub.bin") + golden_sub_mul.tofile("./output/golden_sub_mul.bin") if __name__ == "__main__": gen_golden_data_simple() \ No newline at end of file -- Gitee From 27c17b2e6cc78f86df4c2ffc63c69d5dad573ed9 Mon Sep 17 00:00:00 2001 From: alpaca12345uuu Date: Mon, 9 Jun 2025 17:13:42 +0800 Subject: [PATCH 49/94] =?UTF-8?q?=E4=BB=A3=E7=A0=81=E4=BF=AE=E6=94=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h b/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h index ccd7706f1..a992dd0c7 100644 --- a/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h +++ b/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h @@ -118,7 +118,7 @@ class TbufPoolImpl { AscendC::LocalTensor dstLocal = dstQue2.AllocTensor(); AscendC::Sub(tmp, src0Local, src1Local, COMPUTE_LENGTH_HALF); - AscendC::Mul(dstLocal, tmp, src2Local, 
COMPUTE_LENGTH_HALF) + AscendC::Mul(dstLocal, tmp, src2Local, COMPUTE_LENGTH_HALF); dstQue2.EnQue(dstLocal); srcQue0.FreeTensor(src0Local); srcQue2.FreeTensor(src1Local); -- Gitee From e21755f56c45cbf8e338fc2cf90a0954149ba9e9 Mon Sep 17 00:00:00 2001 From: alpaca12345uuu Date: Mon, 9 Jun 2025 17:19:29 +0800 Subject: [PATCH 50/94] =?UTF-8?q?=E4=BB=A3=E7=A0=81=E4=BF=AE=E6=94=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h | 1 + 1 file changed, 1 insertion(+) diff --git a/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h b/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h index a992dd0c7..d2eac202b 100644 --- a/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h +++ b/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h @@ -84,6 +84,7 @@ class TbufPoolImpl { AscendC::LocalTensor src0Local = srcQue0.DeQue(); AscendC::LocalTensor src1Local = srcQue1.DeQue(); AscendC::LocalTensor dstLocal = dstQue1.AllocTensor(); + AscendC::LocalTensor tmpTensor = tmp.Get(); AscendC::Add(dstLocal, src0Local, src1Local, COMPUTE_LENGTH); dstQue1.EnQue(dstLocal); srcQue0.FreeTensor(src0Local); -- Gitee From 984edaadc08bf42d94e6ddd87f45cf77d51bb3f9 Mon Sep 17 00:00:00 2001 From: alpaca12345uuu Date: Mon, 9 Jun 2025 17:20:43 +0800 Subject: [PATCH 51/94] =?UTF-8?q?=E4=BB=A3=E7=A0=81=E4=BF=AE=E6=94=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../2_features/2_tbufpool/op_kernel/tbufpool_custom.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h b/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h index d2eac202b..48c2c6e09 100644 --- a/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h +++ 
b/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h @@ -84,7 +84,6 @@ class TbufPoolImpl { AscendC::LocalTensor src0Local = srcQue0.DeQue(); AscendC::LocalTensor src1Local = srcQue1.DeQue(); AscendC::LocalTensor dstLocal = dstQue1.AllocTensor(); - AscendC::LocalTensor tmpTensor = tmp.Get(); AscendC::Add(dstLocal, src0Local, src1Local, COMPUTE_LENGTH); dstQue1.EnQue(dstLocal); srcQue0.FreeTensor(src0Local); @@ -116,10 +115,11 @@ class TbufPoolImpl { AscendC::LocalTensor src0Local = srcQue0.DeQue(); AscendC::LocalTensor src1Local = srcQue2.DeQue(); AscendC::LocalTensor src2Local = srcQue3.DeQue(); - AscendC::LocalTensor dstLocal = dstQue2.AllocTensor(); - AscendC::Sub(tmp, src0Local, src1Local, COMPUTE_LENGTH_HALF); - AscendC::Mul(dstLocal, tmp, src2Local, COMPUTE_LENGTH_HALF); + AscendC::LocalTensor tmpTensor = tmp.Get(); + + AscendC::Sub(tmpTensor, src0Local, src1Local, COMPUTE_LENGTH_HALF); + AscendC::Mul(dstLocal, tmpTensor, src2Local, COMPUTE_LENGTH_HALF); dstQue2.EnQue(dstLocal); srcQue0.FreeTensor(src0Local); srcQue2.FreeTensor(src1Local); -- Gitee From e5e24f1dd8876b3db2deb43a6f5a1c34ab677718 Mon Sep 17 00:00:00 2001 From: alpaca12345uuu Date: Mon, 9 Jun 2025 17:22:03 +0800 Subject: [PATCH 52/94] =?UTF-8?q?=E4=BB=A3=E7=A0=81=E4=BF=AE=E6=94=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h b/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h index 48c2c6e09..40d61e13d 100644 --- a/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h +++ b/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h @@ -123,7 +123,7 @@ class TbufPoolImpl { dstQue2.EnQue(dstLocal); srcQue0.FreeTensor(src0Local); srcQue2.FreeTensor(src1Local); - srcQue2.FreeTensor(src2Local); + 
srcQue3.FreeTensor(src2Local); } __aicore__ inline void CopyOut1(int32_t progress) { -- Gitee From 0533554483f57570bbfc3b5a422aa32d7ac43d9a Mon Sep 17 00:00:00 2001 From: alpaca12345uuu Date: Mon, 9 Jun 2025 17:28:09 +0800 Subject: [PATCH 53/94] =?UTF-8?q?=E4=BB=A3=E7=A0=81=E4=BF=AE=E6=94=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- operator/ascendc/2_features/2_tbufpool/main.cpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/operator/ascendc/2_features/2_tbufpool/main.cpp b/operator/ascendc/2_features/2_tbufpool/main.cpp index 468a4a8b6..6af98dd18 100644 --- a/operator/ascendc/2_features/2_tbufpool/main.cpp +++ b/operator/ascendc/2_features/2_tbufpool/main.cpp @@ -68,7 +68,7 @@ static bool CompareResultAdd(const void *outputData, int64_t outSize) { } } -static bool CompareResultSub(const void *outputData, int64_t outSize) { +static bool CompareResultSubMul(const void *outputData, int64_t outSize) { void *goldenData; #ifdef ASCENDC_CPU_DEBUG goldenData = (uint8_t *)AscendC::GmAlloc(outSize); @@ -136,8 +136,8 @@ int32_t main(int32_t argc, char *argv[]) { bool goldenResultAdd = true; goldenResultAdd = CompareResultAdd(zAdd, outputSize); - bool goldenResultSub = true; - goldenResultSub = CompareResultSub(zSubMul, outputSize); + bool goldenResultSubMul = true; + goldenResultSubMul = CompareResultSubMul(zSubMul, outputSize); AscendC::GmFree((void *)x); AscendC::GmFree((void *)y); @@ -197,8 +197,8 @@ int32_t main(int32_t argc, char *argv[]) { // Compare the result with the golden result bool goldenResultAdd = true; goldenResultAdd = CompareResultAdd(zHostAdd, outputSize); - bool goldenResultSub = true; - goldenResultSub = CompareResultSub(zHostSubMul, outputSize); + bool goldenResultSubMul = true; + goldenResultSubMul = CompareResultSubMul(zHostSubMul, outputSize); // Clean up memory CHECK_ACL(aclrtFree(xDevice)); @@ -218,7 +218,7 @@ int32_t main(int32_t argc, char *argv[]) { 
CHECK_ACL(aclFinalize()); #endif - if (goldenResultAdd && goldenResultSub) { + if (goldenResultAdd && goldenResultSubMul) { printf("test pass!\n"); } else { printf("test failed!\n"); -- Gitee From a89b7394fbce5024b0ad523cd9efe073729b8b28 Mon Sep 17 00:00:00 2001 From: alpaca12345uuu Date: Mon, 9 Jun 2025 18:19:35 +0800 Subject: [PATCH 54/94] =?UTF-8?q?=E4=BB=A3=E7=A0=81=E4=BF=AE=E6=94=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h | 1 + 1 file changed, 1 insertion(+) diff --git a/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h b/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h index 40d61e13d..472de68d2 100644 --- a/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h +++ b/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h @@ -76,6 +76,7 @@ class TbufPoolImpl { AscendC::LocalTensor src1Local = srcQue1.AllocTensor(); AscendC::DataCopy(src0Local, src0Global[progress * COMPUTE_LENGTH], COMPUTE_LENGTH); AscendC::DataCopy(src1Local, src1Global[progress * COMPUTE_LENGTH], COMPUTE_LENGTH); + AscendC::PipeBarrier(); srcQue0.EnQue(src0Local); srcQue1.EnQue(src1Local); } -- Gitee From d9fdc942af6ecb40797d8076dc1ba7e8236f0f1a Mon Sep 17 00:00:00 2001 From: alpaca12345uuu Date: Mon, 9 Jun 2025 18:21:45 +0800 Subject: [PATCH 55/94] =?UTF-8?q?=E4=BB=A3=E7=A0=81=E4=BF=AE=E6=94=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h b/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h index 472de68d2..b0a515bb2 100644 --- a/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h +++ 
b/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h @@ -76,7 +76,7 @@ class TbufPoolImpl { AscendC::LocalTensor src1Local = srcQue1.AllocTensor(); AscendC::DataCopy(src0Local, src0Global[progress * COMPUTE_LENGTH], COMPUTE_LENGTH); AscendC::DataCopy(src1Local, src1Global[progress * COMPUTE_LENGTH], COMPUTE_LENGTH); - AscendC::PipeBarrier(); + AscendC::PipeBarrier(); srcQue0.EnQue(src0Local); srcQue1.EnQue(src1Local); } -- Gitee From 9117dd248f6f12d5c464f420c5ab7b147d529d0a Mon Sep 17 00:00:00 2001 From: alpaca12345uuu Date: Mon, 9 Jun 2025 18:23:01 +0800 Subject: [PATCH 56/94] =?UTF-8?q?=E4=BB=A3=E7=A0=81=E4=BF=AE=E6=94=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h b/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h index b0a515bb2..580892ca4 100644 --- a/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h +++ b/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h @@ -76,7 +76,7 @@ class TbufPoolImpl { AscendC::LocalTensor src1Local = srcQue1.AllocTensor(); AscendC::DataCopy(src0Local, src0Global[progress * COMPUTE_LENGTH], COMPUTE_LENGTH); AscendC::DataCopy(src1Local, src1Global[progress * COMPUTE_LENGTH], COMPUTE_LENGTH); - AscendC::PipeBarrier(); + AscendC::PipeBarrier(); srcQue0.EnQue(src0Local); srcQue1.EnQue(src1Local); } -- Gitee From 73b405e05e839e7f59e4be73b11509d7badc6371 Mon Sep 17 00:00:00 2001 From: alpaca12345uuu Date: Mon, 9 Jun 2025 18:25:59 +0800 Subject: [PATCH 57/94] =?UTF-8?q?=E4=BB=A3=E7=A0=81=E4=BF=AE=E6=94=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git 
a/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h b/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h index 580892ca4..8b4969524 100644 --- a/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h +++ b/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h @@ -76,7 +76,7 @@ class TbufPoolImpl { AscendC::LocalTensor src1Local = srcQue1.AllocTensor(); AscendC::DataCopy(src0Local, src0Global[progress * COMPUTE_LENGTH], COMPUTE_LENGTH); AscendC::DataCopy(src1Local, src1Global[progress * COMPUTE_LENGTH], COMPUTE_LENGTH); - AscendC::PipeBarrier(); + AscendC::PipeBarrier(); srcQue0.EnQue(src0Local); srcQue1.EnQue(src1Local); } @@ -101,6 +101,7 @@ class TbufPoolImpl { AscendC::LocalTensor src0Local = srcQue0.AllocTensor(); AscendC::LocalTensor src1Local = srcQue2.AllocTensor(); AscendC::LocalTensor src2Local = srcQue3.AllocTensor(); + AscendC::PipeBarrier(); AscendC::DataCopy(src0Local, src0Global[progress * COMPUTE_LENGTH_HALF], COMPUTE_LENGTH_HALF); AscendC::DataCopy(src1Local, src1Global[progress * COMPUTE_LENGTH_HALF], COMPUTE_LENGTH_HALF); -- Gitee From 69ca7368a622e175a2cec7204e701379c2a290c9 Mon Sep 17 00:00:00 2001 From: alpaca12345uuu Date: Mon, 9 Jun 2025 18:27:08 +0800 Subject: [PATCH 58/94] =?UTF-8?q?=E4=BB=A3=E7=A0=81=E4=BF=AE=E6=94=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h b/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h index 8b4969524..9d98acb2c 100644 --- a/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h +++ b/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h @@ -61,11 +61,15 @@ class TbufPoolImpl { Compute(i); CopyOut(i); tbufPool1.Reset(); + AscendC::PipeBarrier(); + //stage 2 CopyIn1(i); Compute1(i); 
CopyOut1(i); tbufPool2.Reset(); + AscendC::PipeBarrier(); + } tbufPool0.Reset(); } -- Gitee From 12fa85a2edb36f22ad88c94cccca1be533a8fd8e Mon Sep 17 00:00:00 2001 From: alpaca12345uuu Date: Mon, 9 Jun 2025 18:29:53 +0800 Subject: [PATCH 59/94] =?UTF-8?q?=E4=BB=A3=E7=A0=81=E4=BF=AE=E6=94=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../2_tbufpool/op_kernel/tbufpool_custom.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h b/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h index 9d98acb2c..19f165528 100644 --- a/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h +++ b/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h @@ -57,16 +57,28 @@ class TbufPoolImpl { for (int32_t i = 0; i < SPLIT_NUM; i++) { //stage 1 + AscendC::PipeBarrier(); + CopyIn(i); + AscendC::PipeBarrier(); + Compute(i); + AscendC::PipeBarrier(); + CopyOut(i); tbufPool1.Reset(); AscendC::PipeBarrier(); //stage 2 CopyIn1(i); + AscendC::PipeBarrier(); + Compute1(i); + AscendC::PipeBarrier(); + CopyOut1(i); + AscendC::PipeBarrier(); + tbufPool2.Reset(); AscendC::PipeBarrier(); -- Gitee From 293d7ace7720f59e017c8170f2f59a9e6c4382b5 Mon Sep 17 00:00:00 2001 From: alpaca12345uuu Date: Mon, 9 Jun 2025 18:34:31 +0800 Subject: [PATCH 60/94] =?UTF-8?q?=E4=BB=A3=E7=A0=81=E4=BF=AE=E6=94=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../2_features/2_tbufpool/op_kernel/tbufpool_custom.h | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h b/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h index 19f165528..abe106f85 100644 --- a/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h +++ b/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h @@ -120,8 +120,13 @@ 
class TbufPoolImpl { AscendC::PipeBarrier(); AscendC::DataCopy(src0Local, src0Global[progress * COMPUTE_LENGTH_HALF], COMPUTE_LENGTH_HALF); + AscendC::PipeBarrier(); + AscendC::DataCopy(src1Local, src1Global[progress * COMPUTE_LENGTH_HALF], COMPUTE_LENGTH_HALF); + AscendC::PipeBarrier(); + AscendC::DataCopy(src2Local, src1Global[progress * COMPUTE_LENGTH_HALF], COMPUTE_LENGTH_HALF); + AscendC::PipeBarrier(); srcQue0.EnQue(src0Local); srcQue2.EnQue(src1Local); @@ -137,7 +142,11 @@ class TbufPoolImpl { AscendC::LocalTensor tmpTensor = tmp.Get(); AscendC::Sub(tmpTensor, src0Local, src1Local, COMPUTE_LENGTH_HALF); + AscendC::PipeBarrier(); + AscendC::Mul(dstLocal, tmpTensor, src2Local, COMPUTE_LENGTH_HALF); + AscendC::PipeBarrier(); + dstQue2.EnQue(dstLocal); srcQue0.FreeTensor(src0Local); srcQue2.FreeTensor(src1Local); @@ -147,6 +156,8 @@ class TbufPoolImpl { { AscendC::LocalTensor dstLocal = dstQue2.DeQue(); AscendC::DataCopy(dstGlobal1[progress * COMPUTE_LENGTH_HALF], dstLocal, COMPUTE_LENGTH_HALF); + AscendC::PipeBarrier(); + dstQue2.FreeTensor(dstLocal); } -- Gitee From 1d7107dd9e5c744f006234808acc574f8e3912e8 Mon Sep 17 00:00:00 2001 From: alpaca12345uuu Date: Mon, 9 Jun 2025 18:42:37 +0800 Subject: [PATCH 61/94] =?UTF-8?q?=E4=BB=A3=E7=A0=81=E4=BF=AE=E6=94=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../ascendc/2_features/2_tbufpool/main.cpp | 4 +-- .../2_tbufpool/op_kernel/tbufpool_custom.h | 32 +++++++++---------- 2 files changed, 18 insertions(+), 18 deletions(-) diff --git a/operator/ascendc/2_features/2_tbufpool/main.cpp b/operator/ascendc/2_features/2_tbufpool/main.cpp index 6af98dd18..018df5ee8 100644 --- a/operator/ascendc/2_features/2_tbufpool/main.cpp +++ b/operator/ascendc/2_features/2_tbufpool/main.cpp @@ -92,7 +92,7 @@ static bool CompareResultSubMul(const void *outputData, int64_t outSize) { float ae = std::abs(a - b); float re = ae / abs(b); if (ae > EPS && re > EPS) { - 
printf("CompareResultSub failed output is %lf, golden is %lf\n", a, b); + printf("CompareResultSubMul failed output is %lf, golden is %lf\n", a, b); wrongNum++; } } @@ -104,7 +104,7 @@ static bool CompareResultSubMul(const void *outputData, int64_t outSize) { if (wrongNum != 0) { return false; } else { - printf("CompareResultSub golden_sub_mul.bin success!\n"); + printf("CompareResultSubMul golden_sub_mul.bin success!\n"); return true; } } diff --git a/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h b/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h index abe106f85..c590256e2 100644 --- a/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h +++ b/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h @@ -57,30 +57,30 @@ class TbufPoolImpl { for (int32_t i = 0; i < SPLIT_NUM; i++) { //stage 1 - AscendC::PipeBarrier(); + //AscendC::PipeBarrier(); CopyIn(i); - AscendC::PipeBarrier(); + //AscendC::PipeBarrier(); Compute(i); - AscendC::PipeBarrier(); + //AscendC::PipeBarrier(); CopyOut(i); tbufPool1.Reset(); - AscendC::PipeBarrier(); + //AscendC::PipeBarrier(); //stage 2 CopyIn1(i); - AscendC::PipeBarrier(); + //AscendC::PipeBarrier(); Compute1(i); - AscendC::PipeBarrier(); + //AscendC::PipeBarrier(); CopyOut1(i); - AscendC::PipeBarrier(); + //AscendC::PipeBarrier(); tbufPool2.Reset(); - AscendC::PipeBarrier(); + //AscendC::PipeBarrier(); } tbufPool0.Reset(); @@ -92,7 +92,7 @@ class TbufPoolImpl { AscendC::LocalTensor src1Local = srcQue1.AllocTensor(); AscendC::DataCopy(src0Local, src0Global[progress * COMPUTE_LENGTH], COMPUTE_LENGTH); AscendC::DataCopy(src1Local, src1Global[progress * COMPUTE_LENGTH], COMPUTE_LENGTH); - AscendC::PipeBarrier(); + //AscendC::PipeBarrier(); srcQue0.EnQue(src0Local); srcQue1.EnQue(src1Local); } @@ -117,16 +117,16 @@ class TbufPoolImpl { AscendC::LocalTensor src0Local = srcQue0.AllocTensor(); AscendC::LocalTensor src1Local = srcQue2.AllocTensor(); AscendC::LocalTensor src2Local 
= srcQue3.AllocTensor(); - AscendC::PipeBarrier(); + //AscendC::PipeBarrier(); AscendC::DataCopy(src0Local, src0Global[progress * COMPUTE_LENGTH_HALF], COMPUTE_LENGTH_HALF); - AscendC::PipeBarrier(); + //AscendC::PipeBarrier(); AscendC::DataCopy(src1Local, src1Global[progress * COMPUTE_LENGTH_HALF], COMPUTE_LENGTH_HALF); - AscendC::PipeBarrier(); + //AscendC::PipeBarrier(); AscendC::DataCopy(src2Local, src1Global[progress * COMPUTE_LENGTH_HALF], COMPUTE_LENGTH_HALF); - AscendC::PipeBarrier(); + //AscendC::PipeBarrier(); srcQue0.EnQue(src0Local); srcQue2.EnQue(src1Local); @@ -142,10 +142,10 @@ class TbufPoolImpl { AscendC::LocalTensor tmpTensor = tmp.Get(); AscendC::Sub(tmpTensor, src0Local, src1Local, COMPUTE_LENGTH_HALF); - AscendC::PipeBarrier(); + //AscendC::PipeBarrier(); AscendC::Mul(dstLocal, tmpTensor, src2Local, COMPUTE_LENGTH_HALF); - AscendC::PipeBarrier(); + //AscendC::PipeBarrier(); dstQue2.EnQue(dstLocal); srcQue0.FreeTensor(src0Local); @@ -156,7 +156,7 @@ class TbufPoolImpl { { AscendC::LocalTensor dstLocal = dstQue2.DeQue(); AscendC::DataCopy(dstGlobal1[progress * COMPUTE_LENGTH_HALF], dstLocal, COMPUTE_LENGTH_HALF); - AscendC::PipeBarrier(); + //AscendC::PipeBarrier(); dstQue2.FreeTensor(dstLocal); } -- Gitee From 5e960db1d09c706984c376111601cad8e15e24fa Mon Sep 17 00:00:00 2001 From: alpaca12345uuu Date: Mon, 9 Jun 2025 18:47:55 +0800 Subject: [PATCH 62/94] =?UTF-8?q?=E4=BB=A3=E7=A0=81=E4=BF=AE=E6=94=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h | 1 + 1 file changed, 1 insertion(+) diff --git a/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h b/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h index c590256e2..8f799bbe5 100644 --- a/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h +++ b/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h @@ -95,6 +95,7 @@ class 
TbufPoolImpl { //AscendC::PipeBarrier(); srcQue0.EnQue(src0Local); srcQue1.EnQue(src1Local); + AscendC::PipeBarrier(); } __aicore__ inline void Compute(int32_t progress) { -- Gitee From bdf69ad8ba4eb24dead5ba0cf86861f0e68987a9 Mon Sep 17 00:00:00 2001 From: alpaca12345uuu Date: Mon, 9 Jun 2025 18:55:31 +0800 Subject: [PATCH 63/94] =?UTF-8?q?=E4=BB=A3=E7=A0=81=E4=BF=AE=E6=94=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h b/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h index 8f799bbe5..467ee67ee 100644 --- a/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h +++ b/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h @@ -119,6 +119,9 @@ class TbufPoolImpl { AscendC::LocalTensor src1Local = srcQue2.AllocTensor(); AscendC::LocalTensor src2Local = srcQue3.AllocTensor(); //AscendC::PipeBarrier(); + AscendC::DumpTensor(src0Local, 1, 10); + AscendC::DumpTensor(src1Local, 2, 10); + AscendC::DumpTensor(src2Local, 3, 10); AscendC::DataCopy(src0Local, src0Global[progress * COMPUTE_LENGTH_HALF], COMPUTE_LENGTH_HALF); //AscendC::PipeBarrier(); -- Gitee From 9a87c36b2dcdedbca06ca660e654ea25c9a2e3fb Mon Sep 17 00:00:00 2001 From: alpaca12345uuu Date: Mon, 9 Jun 2025 18:59:36 +0800 Subject: [PATCH 64/94] =?UTF-8?q?=E4=BB=A3=E7=A0=81=E4=BF=AE=E6=94=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../2_features/2_tbufpool/op_kernel/tbufpool_custom.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h b/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h index 467ee67ee..1b981bbe8 100644 --- 
a/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h +++ b/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h @@ -119,9 +119,9 @@ class TbufPoolImpl { AscendC::LocalTensor src1Local = srcQue2.AllocTensor(); AscendC::LocalTensor src2Local = srcQue3.AllocTensor(); //AscendC::PipeBarrier(); - AscendC::DumpTensor(src0Local, 1, 10); - AscendC::DumpTensor(src1Local, 2, 10); - AscendC::DumpTensor(src2Local, 3, 10); + AscendC::DumpTensor(src0Local, 1, 16); + AscendC::DumpTensor(src1Local, 2, 16); + AscendC::DumpTensor(src2Local, 3, 16); AscendC::DataCopy(src0Local, src0Global[progress * COMPUTE_LENGTH_HALF], COMPUTE_LENGTH_HALF); //AscendC::PipeBarrier(); -- Gitee From 85beb29f09bb31990c90b4f0f00603cb0bb4c86b Mon Sep 17 00:00:00 2001 From: alpaca12345uuu Date: Mon, 9 Jun 2025 19:12:59 +0800 Subject: [PATCH 65/94] =?UTF-8?q?=E4=BB=A3=E7=A0=81=E4=BF=AE=E6=94=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- operator/ascendc/2_features/2_tbufpool/main.cpp | 2 +- .../2_features/2_tbufpool/op_kernel/tbufpool_custom.h | 9 ++++++--- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/operator/ascendc/2_features/2_tbufpool/main.cpp b/operator/ascendc/2_features/2_tbufpool/main.cpp index 018df5ee8..b1b73eccc 100644 --- a/operator/ascendc/2_features/2_tbufpool/main.cpp +++ b/operator/ascendc/2_features/2_tbufpool/main.cpp @@ -92,7 +92,7 @@ static bool CompareResultSubMul(const void *outputData, int64_t outSize) { float ae = std::abs(a - b); float re = ae / abs(b); if (ae > EPS && re > EPS) { - printf("CompareResultSubMul failed output is %lf, golden is %lf\n", a, b); + //printf("CompareResultSubMul failed output is %lf, golden is %lf\n", a, b); wrongNum++; } } diff --git a/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h b/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h index 1b981bbe8..9dcef5139 100644 --- 
a/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h +++ b/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h @@ -119,9 +119,9 @@ class TbufPoolImpl { AscendC::LocalTensor src1Local = srcQue2.AllocTensor(); AscendC::LocalTensor src2Local = srcQue3.AllocTensor(); //AscendC::PipeBarrier(); - AscendC::DumpTensor(src0Local, 1, 16); - AscendC::DumpTensor(src1Local, 2, 16); - AscendC::DumpTensor(src2Local, 3, 16); + // AscendC::DumpTensor(src0Local, 1, 16); + // AscendC::DumpTensor(src1Local, 2, 16); + // AscendC::DumpTensor(src2Local, 3, 16); AscendC::DataCopy(src0Local, src0Global[progress * COMPUTE_LENGTH_HALF], COMPUTE_LENGTH_HALF); //AscendC::PipeBarrier(); @@ -131,6 +131,9 @@ class TbufPoolImpl { AscendC::DataCopy(src2Local, src1Global[progress * COMPUTE_LENGTH_HALF], COMPUTE_LENGTH_HALF); //AscendC::PipeBarrier(); + AscendC::DumpTensor(src0Local, 1, 16); + AscendC::DumpTensor(src1Local, 2, 16); + AscendC::DumpTensor(src2Local, 3, 16); srcQue0.EnQue(src0Local); srcQue2.EnQue(src1Local); -- Gitee From 8149c469a64053d61a2e31b350b65a20a325b8d4 Mon Sep 17 00:00:00 2001 From: alpaca12345uuu Date: Mon, 9 Jun 2025 19:14:09 +0800 Subject: [PATCH 66/94] =?UTF-8?q?=E4=BB=A3=E7=A0=81=E4=BF=AE=E6=94=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../2_features/2_tbufpool/op_kernel/tbufpool_custom.h | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h b/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h index 9dcef5139..3c3077922 100644 --- a/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h +++ b/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h @@ -131,9 +131,9 @@ class TbufPoolImpl { AscendC::DataCopy(src2Local, src1Global[progress * COMPUTE_LENGTH_HALF], COMPUTE_LENGTH_HALF); //AscendC::PipeBarrier(); - AscendC::DumpTensor(src0Local, 1, 16); - 
AscendC::DumpTensor(src1Local, 2, 16); - AscendC::DumpTensor(src2Local, 3, 16); + // AscendC::DumpTensor(src0Local, 1, 16); + // AscendC::DumpTensor(src1Local, 2, 16); + // AscendC::DumpTensor(src2Local, 3, 16); srcQue0.EnQue(src0Local); srcQue2.EnQue(src1Local); @@ -147,6 +147,9 @@ class TbufPoolImpl { AscendC::LocalTensor src2Local = srcQue3.DeQue(); AscendC::LocalTensor dstLocal = dstQue2.AllocTensor(); AscendC::LocalTensor tmpTensor = tmp.Get(); + AscendC::DumpTensor(src0Local, 1, 16); + AscendC::DumpTensor(src1Local, 2, 16); + AscendC::DumpTensor(src2Local, 3, 16); AscendC::Sub(tmpTensor, src0Local, src1Local, COMPUTE_LENGTH_HALF); //AscendC::PipeBarrier(); -- Gitee From 3f23f3f0dc390d35a98a5dcb842d97c32fdae3e0 Mon Sep 17 00:00:00 2001 From: alpaca12345uuu Date: Mon, 9 Jun 2025 19:19:09 +0800 Subject: [PATCH 67/94] =?UTF-8?q?=E4=BB=A3=E7=A0=81=E4=BF=AE=E6=94=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../2_features/2_tbufpool/op_kernel/tbufpool_custom.h | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h b/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h index 3c3077922..8f55dff8d 100644 --- a/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h +++ b/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h @@ -147,16 +147,17 @@ class TbufPoolImpl { AscendC::LocalTensor src2Local = srcQue3.DeQue(); AscendC::LocalTensor dstLocal = dstQue2.AllocTensor(); AscendC::LocalTensor tmpTensor = tmp.Get(); - AscendC::DumpTensor(src0Local, 1, 16); - AscendC::DumpTensor(src1Local, 2, 16); - AscendC::DumpTensor(src2Local, 3, 16); + // AscendC::DumpTensor(src0Local, 1, 16); + // AscendC::DumpTensor(src1Local, 2, 16); + // AscendC::DumpTensor(src2Local, 3, 16); AscendC::Sub(tmpTensor, src0Local, src1Local, COMPUTE_LENGTH_HALF); //AscendC::PipeBarrier(); + AscendC::DumpTensor(tmpTensor, 1, 
16); AscendC::Mul(dstLocal, tmpTensor, src2Local, COMPUTE_LENGTH_HALF); //AscendC::PipeBarrier(); - + AscendC::DumpTensor(dstLocal, 2, 16); dstQue2.EnQue(dstLocal); srcQue0.FreeTensor(src0Local); srcQue2.FreeTensor(src1Local); -- Gitee From d0d5665b43bb892df0cc1f74fa515f9ae41fd9f3 Mon Sep 17 00:00:00 2001 From: alpaca12345uuu Date: Mon, 9 Jun 2025 19:22:30 +0800 Subject: [PATCH 68/94] =?UTF-8?q?=E4=BB=A3=E7=A0=81=E4=BF=AE=E6=94=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- operator/ascendc/2_features/2_tbufpool/scripts/gen_data.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/operator/ascendc/2_features/2_tbufpool/scripts/gen_data.py b/operator/ascendc/2_features/2_tbufpool/scripts/gen_data.py index 8b74e820a..263d35844 100644 --- a/operator/ascendc/2_features/2_tbufpool/scripts/gen_data.py +++ b/operator/ascendc/2_features/2_tbufpool/scripts/gen_data.py @@ -17,8 +17,8 @@ def gen_golden_data_simple(): input_shape = [8, 256] # generate value between [-65504, 65504] - input_x = np.random.uniform(-65504, np.nextafter(65504, np.inf), input_shape).astype(dtype) - input_y = np.random.uniform(-65504, np.nextafter(65504, np.inf), input_shape).astype(dtype) + input_x = np.random.randint(0, np.nextafter(1000, np.inf), input_shape).astype(dtype) + input_y = np.random.randint(0, np.nextafter(1000, np.inf), input_shape).astype(dtype) half_size = input_shape[0] // 2 input_x_half = input_x[:half_size, :] -- Gitee From 745d3f2f3da5d691d8efbfccb86fcb8c1a7ae463 Mon Sep 17 00:00:00 2001 From: alpaca12345uuu Date: Mon, 9 Jun 2025 19:24:04 +0800 Subject: [PATCH 69/94] =?UTF-8?q?=E4=BB=A3=E7=A0=81=E4=BF=AE=E6=94=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- operator/ascendc/2_features/2_tbufpool/main.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/operator/ascendc/2_features/2_tbufpool/main.cpp 
b/operator/ascendc/2_features/2_tbufpool/main.cpp index b1b73eccc..018df5ee8 100644 --- a/operator/ascendc/2_features/2_tbufpool/main.cpp +++ b/operator/ascendc/2_features/2_tbufpool/main.cpp @@ -92,7 +92,7 @@ static bool CompareResultSubMul(const void *outputData, int64_t outSize) { float ae = std::abs(a - b); float re = ae / abs(b); if (ae > EPS && re > EPS) { - //printf("CompareResultSubMul failed output is %lf, golden is %lf\n", a, b); + printf("CompareResultSubMul failed output is %lf, golden is %lf\n", a, b); wrongNum++; } } -- Gitee From e1680201131e620884ed58fb902387e97b56f737 Mon Sep 17 00:00:00 2001 From: alpaca12345uuu Date: Mon, 9 Jun 2025 19:26:15 +0800 Subject: [PATCH 70/94] =?UTF-8?q?=E4=BB=A3=E7=A0=81=E4=BF=AE=E6=94=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h b/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h index 8f55dff8d..473326360 100644 --- a/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h +++ b/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h @@ -142,6 +142,7 @@ class TbufPoolImpl { } __aicore__ inline void Compute1(int32_t progress) { + AscendC::PipeBarrier(); AscendC::LocalTensor src0Local = srcQue0.DeQue(); AscendC::LocalTensor src1Local = srcQue2.DeQue(); AscendC::LocalTensor src2Local = srcQue3.DeQue(); @@ -166,6 +167,7 @@ class TbufPoolImpl { __aicore__ inline void CopyOut1(int32_t progress) { AscendC::LocalTensor dstLocal = dstQue2.DeQue(); + AscendC::DumpTensor(dstLocal, 2, 16); AscendC::DataCopy(dstGlobal1[progress * COMPUTE_LENGTH_HALF], dstLocal, COMPUTE_LENGTH_HALF); //AscendC::PipeBarrier(); -- Gitee From 5748acffa1ec9194f9bfd32a63be54cf4d05ac50 Mon Sep 17 00:00:00 2001 From: alpaca12345uuu Date: Mon, 9 Jun 2025 19:28:08 +0800 Subject: [PATCH 
71/94] =?UTF-8?q?=E4=BB=A3=E7=A0=81=E4=BF=AE=E6=94=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h b/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h index 473326360..37e111881 100644 --- a/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h +++ b/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h @@ -167,10 +167,10 @@ class TbufPoolImpl { __aicore__ inline void CopyOut1(int32_t progress) { AscendC::LocalTensor dstLocal = dstQue2.DeQue(); - AscendC::DumpTensor(dstLocal, 2, 16); + AscendC::DumpTensor(dstLocal, 5, 16); AscendC::DataCopy(dstGlobal1[progress * COMPUTE_LENGTH_HALF], dstLocal, COMPUTE_LENGTH_HALF); //AscendC::PipeBarrier(); - + AscendC::DumpTensor(dstGlobal1, 2, 16); dstQue2.FreeTensor(dstLocal); } -- Gitee From ae6331947315e5fb1264602d2f13306f53013584 Mon Sep 17 00:00:00 2001 From: alpaca12345uuu Date: Mon, 9 Jun 2025 19:34:02 +0800 Subject: [PATCH 72/94] =?UTF-8?q?=E4=BB=A3=E7=A0=81=E4=BF=AE=E6=94=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../2_features/2_tbufpool/op_kernel/tbufpool_custom.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h b/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h index 37e111881..96ce98122 100644 --- a/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h +++ b/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h @@ -154,11 +154,11 @@ class TbufPoolImpl { AscendC::Sub(tmpTensor, src0Local, src1Local, COMPUTE_LENGTH_HALF); //AscendC::PipeBarrier(); - AscendC::DumpTensor(tmpTensor, 1, 16); + //AscendC::DumpTensor(tmpTensor, 1, 16); 
AscendC::Mul(dstLocal, tmpTensor, src2Local, COMPUTE_LENGTH_HALF); //AscendC::PipeBarrier(); - AscendC::DumpTensor(dstLocal, 2, 16); + //AscendC::DumpTensor(dstLocal, 2, 16); dstQue2.EnQue(dstLocal); srcQue0.FreeTensor(src0Local); srcQue2.FreeTensor(src1Local); @@ -167,10 +167,10 @@ class TbufPoolImpl { __aicore__ inline void CopyOut1(int32_t progress) { AscendC::LocalTensor dstLocal = dstQue2.DeQue(); - AscendC::DumpTensor(dstLocal, 5, 16); + //AscendC::DumpTensor(dstLocal, 5, 16); AscendC::DataCopy(dstGlobal1[progress * COMPUTE_LENGTH_HALF], dstLocal, COMPUTE_LENGTH_HALF); //AscendC::PipeBarrier(); - AscendC::DumpTensor(dstGlobal1, 2, 16); + AscendC::DumpTensor(dstGlobal1, 1, 16); dstQue2.FreeTensor(dstLocal); } -- Gitee From f3716734808f819faea42d659b556ecff93724f7 Mon Sep 17 00:00:00 2001 From: alpaca12345uuu Date: Mon, 9 Jun 2025 19:45:26 +0800 Subject: [PATCH 73/94] =?UTF-8?q?=E4=BB=A3=E7=A0=81=E4=BF=AE=E6=94=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- operator/ascendc/2_features/2_tbufpool/scripts/gen_data.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/operator/ascendc/2_features/2_tbufpool/scripts/gen_data.py b/operator/ascendc/2_features/2_tbufpool/scripts/gen_data.py index 263d35844..bc5ee471b 100644 --- a/operator/ascendc/2_features/2_tbufpool/scripts/gen_data.py +++ b/operator/ascendc/2_features/2_tbufpool/scripts/gen_data.py @@ -16,10 +16,12 @@ def gen_golden_data_simple(): dtype = np.float32 input_shape = [8, 256] - # generate value between [-65504, 65504] input_x = np.random.randint(0, np.nextafter(1000, np.inf), input_shape).astype(dtype) input_y = np.random.randint(0, np.nextafter(1000, np.inf), input_shape).astype(dtype) + input_x = np.arrange(2048).astye(dtype) + input_y = np.arrange(2048).astye(dtype) + half_size = input_shape[0] // 2 input_x_half = input_x[:half_size, :] input_y_half = input_y[:half_size, :] -- Gitee From 85e473d7e09ebb14e7c664efe70ecf54c04c59e6 Mon 
Sep 17 00:00:00 2001 From: alpaca12345uuu Date: Mon, 9 Jun 2025 19:48:59 +0800 Subject: [PATCH 74/94] =?UTF-8?q?=E4=BB=A3=E7=A0=81=E4=BF=AE=E6=94=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- operator/ascendc/2_features/2_tbufpool/scripts/gen_data.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/operator/ascendc/2_features/2_tbufpool/scripts/gen_data.py b/operator/ascendc/2_features/2_tbufpool/scripts/gen_data.py index bc5ee471b..b7dbade94 100644 --- a/operator/ascendc/2_features/2_tbufpool/scripts/gen_data.py +++ b/operator/ascendc/2_features/2_tbufpool/scripts/gen_data.py @@ -19,8 +19,10 @@ def gen_golden_data_simple(): input_x = np.random.randint(0, np.nextafter(1000, np.inf), input_shape).astype(dtype) input_y = np.random.randint(0, np.nextafter(1000, np.inf), input_shape).astype(dtype) - input_x = np.arrange(2048).astye(dtype) - input_y = np.arrange(2048).astye(dtype) + input_x = np.arange(2048).astye(dtype) + input_x = input_x.reshape(input_shape) + input_y = np.arange(2048).astye(dtype) + input_y = input_y.reshape(input_shape) half_size = input_shape[0] // 2 input_x_half = input_x[:half_size, :] -- Gitee From 610e61059c827ea40b0f077d3ae81adb800276b0 Mon Sep 17 00:00:00 2001 From: alpaca12345uuu Date: Mon, 9 Jun 2025 19:49:33 +0800 Subject: [PATCH 75/94] =?UTF-8?q?=E4=BB=A3=E7=A0=81=E4=BF=AE=E6=94=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- operator/ascendc/2_features/2_tbufpool/scripts/gen_data.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/operator/ascendc/2_features/2_tbufpool/scripts/gen_data.py b/operator/ascendc/2_features/2_tbufpool/scripts/gen_data.py index b7dbade94..ad5a0ec11 100644 --- a/operator/ascendc/2_features/2_tbufpool/scripts/gen_data.py +++ b/operator/ascendc/2_features/2_tbufpool/scripts/gen_data.py @@ -19,9 +19,9 @@ def gen_golden_data_simple(): input_x = np.random.randint(0, 
np.nextafter(1000, np.inf), input_shape).astype(dtype) input_y = np.random.randint(0, np.nextafter(1000, np.inf), input_shape).astype(dtype) - input_x = np.arange(2048).astye(dtype) + input_x = np.arange(2048).astype(dtype) input_x = input_x.reshape(input_shape) - input_y = np.arange(2048).astye(dtype) + input_y = np.arange(2048).astype(dtype) input_y = input_y.reshape(input_shape) half_size = input_shape[0] // 2 -- Gitee From 9f69f5f8d7924a1479b3930612f6db4ea3caa330 Mon Sep 17 00:00:00 2001 From: alpaca12345uuu Date: Mon, 9 Jun 2025 19:51:31 +0800 Subject: [PATCH 76/94] =?UTF-8?q?=E4=BB=A3=E7=A0=81=E4=BF=AE=E6=94=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../ascendc/2_features/2_tbufpool/scripts/gen_data.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/operator/ascendc/2_features/2_tbufpool/scripts/gen_data.py b/operator/ascendc/2_features/2_tbufpool/scripts/gen_data.py index ad5a0ec11..82fbc4a93 100644 --- a/operator/ascendc/2_features/2_tbufpool/scripts/gen_data.py +++ b/operator/ascendc/2_features/2_tbufpool/scripts/gen_data.py @@ -19,10 +19,10 @@ def gen_golden_data_simple(): input_x = np.random.randint(0, np.nextafter(1000, np.inf), input_shape).astype(dtype) input_y = np.random.randint(0, np.nextafter(1000, np.inf), input_shape).astype(dtype) - input_x = np.arange(2048).astype(dtype) - input_x = input_x.reshape(input_shape) - input_y = np.arange(2048).astype(dtype) - input_y = input_y.reshape(input_shape) + # input_x = np.arange(2048).astype(dtype) + # input_x = input_x.reshape(input_shape) + # input_y = np.arange(2048).astype(dtype) + # input_y = input_y.reshape(input_shape) half_size = input_shape[0] // 2 input_x_half = input_x[:half_size, :] -- Gitee From ee3162cf9a78b3ebfb33054f9ec76e4a5df96d55 Mon Sep 17 00:00:00 2001 From: alpaca12345uuu Date: Mon, 9 Jun 2025 19:59:03 +0800 Subject: [PATCH 77/94] =?UTF-8?q?=E4=BB=A3=E7=A0=81=E4=BF=AE=E6=94=B9?= MIME-Version: 1.0 
Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../2_features/2_tbufpool/scripts/gen_data.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/operator/ascendc/2_features/2_tbufpool/scripts/gen_data.py b/operator/ascendc/2_features/2_tbufpool/scripts/gen_data.py index 82fbc4a93..596fc5da0 100644 --- a/operator/ascendc/2_features/2_tbufpool/scripts/gen_data.py +++ b/operator/ascendc/2_features/2_tbufpool/scripts/gen_data.py @@ -16,13 +16,13 @@ def gen_golden_data_simple(): dtype = np.float32 input_shape = [8, 256] - input_x = np.random.randint(0, np.nextafter(1000, np.inf), input_shape).astype(dtype) - input_y = np.random.randint(0, np.nextafter(1000, np.inf), input_shape).astype(dtype) + # input_x = np.random.randint(0, np.nextafter(1000, np.inf), input_shape).astype(dtype) + # input_y = np.random.randint(0, np.nextafter(1000, np.inf), input_shape).astype(dtype) - # input_x = np.arange(2048).astype(dtype) - # input_x = input_x.reshape(input_shape) - # input_y = np.arange(2048).astype(dtype) - # input_y = input_y.reshape(input_shape) + input_x = np.arange(2048).astype(dtype) + input_x = input_x.reshape(input_shape) + input_y = np.arange(2048).astype(dtype) + input_y = input_y.reshape(input_shape) half_size = input_shape[0] // 2 input_x_half = input_x[:half_size, :] -- Gitee From 6ffad032ca14f192c251a28447258cdaa9936666 Mon Sep 17 00:00:00 2001 From: alpaca12345uuu Date: Mon, 9 Jun 2025 20:09:14 +0800 Subject: [PATCH 78/94] =?UTF-8?q?=E4=BB=A3=E7=A0=81=E4=BF=AE=E6=94=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../2_features/2_tbufpool/scripts/gen_data.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/operator/ascendc/2_features/2_tbufpool/scripts/gen_data.py b/operator/ascendc/2_features/2_tbufpool/scripts/gen_data.py index 596fc5da0..82fbc4a93 100644 --- a/operator/ascendc/2_features/2_tbufpool/scripts/gen_data.py +++ 
b/operator/ascendc/2_features/2_tbufpool/scripts/gen_data.py @@ -16,13 +16,13 @@ def gen_golden_data_simple(): dtype = np.float32 input_shape = [8, 256] - # input_x = np.random.randint(0, np.nextafter(1000, np.inf), input_shape).astype(dtype) - # input_y = np.random.randint(0, np.nextafter(1000, np.inf), input_shape).astype(dtype) + input_x = np.random.randint(0, np.nextafter(1000, np.inf), input_shape).astype(dtype) + input_y = np.random.randint(0, np.nextafter(1000, np.inf), input_shape).astype(dtype) - input_x = np.arange(2048).astype(dtype) - input_x = input_x.reshape(input_shape) - input_y = np.arange(2048).astype(dtype) - input_y = input_y.reshape(input_shape) + # input_x = np.arange(2048).astype(dtype) + # input_x = input_x.reshape(input_shape) + # input_y = np.arange(2048).astype(dtype) + # input_y = input_y.reshape(input_shape) half_size = input_shape[0] // 2 input_x_half = input_x[:half_size, :] -- Gitee From 545d7a37d185c7dd8a2869ee374c8ac0f049edfa Mon Sep 17 00:00:00 2001 From: alpaca12345uuu Date: Mon, 9 Jun 2025 20:29:18 +0800 Subject: [PATCH 79/94] =?UTF-8?q?=E4=BB=A3=E7=A0=81=E4=BF=AE=E6=94=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h | 1 + 1 file changed, 1 insertion(+) diff --git a/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h b/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h index 96ce98122..1f30425ab 100644 --- a/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h +++ b/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h @@ -171,6 +171,7 @@ class TbufPoolImpl { AscendC::DataCopy(dstGlobal1[progress * COMPUTE_LENGTH_HALF], dstLocal, COMPUTE_LENGTH_HALF); //AscendC::PipeBarrier(); AscendC::DumpTensor(dstGlobal1, 1, 16); + AscendC::PipeBarrier(); dstQue2.FreeTensor(dstLocal); } -- Gitee From f10cea7c872cdda726ea3e6dd842e0618cb6f751 Mon Sep 17 00:00:00 2001 From: 
alpaca12345uuu Date: Mon, 9 Jun 2025 20:33:03 +0800 Subject: [PATCH 80/94] =?UTF-8?q?=E4=BB=A3=E7=A0=81=E4=BF=AE=E6=94=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h b/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h index 1f30425ab..346244b49 100644 --- a/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h +++ b/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h @@ -153,7 +153,7 @@ class TbufPoolImpl { // AscendC::DumpTensor(src2Local, 3, 16); AscendC::Sub(tmpTensor, src0Local, src1Local, COMPUTE_LENGTH_HALF); - //AscendC::PipeBarrier(); + AscendC::PipeBarrier(); //AscendC::DumpTensor(tmpTensor, 1, 16); AscendC::Mul(dstLocal, tmpTensor, src2Local, COMPUTE_LENGTH_HALF); -- Gitee From 053f3c4fcd6cba458444cc87c4323a357c522ba6 Mon Sep 17 00:00:00 2001 From: alpaca12345uuu Date: Mon, 9 Jun 2025 20:40:56 +0800 Subject: [PATCH 81/94] =?UTF-8?q?=E4=BB=A3=E7=A0=81=E4=BF=AE=E6=94=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.cpp b/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.cpp index 1cdc13c77..cf9437871 100644 --- a/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.cpp +++ b/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.cpp @@ -16,5 +16,6 @@ extern "C" __global__ __aicore__ void tbufpool_custom(GM_ADDR src0Gm, GM_ADDR sr AscendC::TPipe pipe; MyCustomKernel::TbufPoolImpl op; op.Init(src0Gm, src1Gm, dstGm0, dstGm1, tiling, &pipe); + KERNEL_TASK_TYPE_DEFAULT(KERNEL_TYPE_AIV_ONLY); op.Process(); } \ No 
newline at end of file -- Gitee From 25dda2ef2b133a9afcdc73533776a2c00e1a3bf3 Mon Sep 17 00:00:00 2001 From: alpaca12345uuu Date: Mon, 9 Jun 2025 20:58:02 +0800 Subject: [PATCH 82/94] =?UTF-8?q?=E4=BB=A3=E7=A0=81=E4=BF=AE=E6=94=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- operator/ascendc/2_features/2_tbufpool/main.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/operator/ascendc/2_features/2_tbufpool/main.cpp b/operator/ascendc/2_features/2_tbufpool/main.cpp index 018df5ee8..9fbe53583 100644 --- a/operator/ascendc/2_features/2_tbufpool/main.cpp +++ b/operator/ascendc/2_features/2_tbufpool/main.cpp @@ -21,7 +21,7 @@ extern "C" __global__ __aicore__ void tbufpool_custom(GM_ADDR x, GM_ADDR y, GM_A namespace { constexpr uint32_t USED_CORE_NUM = 1; -constexpr uint32_t TOTAL_LENGTH = 4096; +constexpr uint32_t TOTAL_LENGTH = 2048; constexpr uint32_t TILING_SIZE = 1; } -- Gitee From 15055251da84c99e4e040bc42461eef60cb1927e Mon Sep 17 00:00:00 2001 From: alpaca12345uuu Date: Mon, 9 Jun 2025 21:02:28 +0800 Subject: [PATCH 83/94] =?UTF-8?q?=E4=BB=A3=E7=A0=81=E4=BF=AE=E6=94=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- operator/ascendc/2_features/2_tbufpool/main.cpp | 2 +- .../ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/operator/ascendc/2_features/2_tbufpool/main.cpp b/operator/ascendc/2_features/2_tbufpool/main.cpp index 9fbe53583..018df5ee8 100644 --- a/operator/ascendc/2_features/2_tbufpool/main.cpp +++ b/operator/ascendc/2_features/2_tbufpool/main.cpp @@ -21,7 +21,7 @@ extern "C" __global__ __aicore__ void tbufpool_custom(GM_ADDR x, GM_ADDR y, GM_A namespace { constexpr uint32_t USED_CORE_NUM = 1; -constexpr uint32_t TOTAL_LENGTH = 2048; +constexpr uint32_t TOTAL_LENGTH = 4096; constexpr uint32_t TILING_SIZE = 1; } diff --git 
a/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h b/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h index 346244b49..e4d4943a9 100644 --- a/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h +++ b/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h @@ -170,7 +170,7 @@ class TbufPoolImpl { //AscendC::DumpTensor(dstLocal, 5, 16); AscendC::DataCopy(dstGlobal1[progress * COMPUTE_LENGTH_HALF], dstLocal, COMPUTE_LENGTH_HALF); //AscendC::PipeBarrier(); - AscendC::DumpTensor(dstGlobal1, 1, 16); + //AscendC::DumpTensor(dstGlobal1, 1, 16); AscendC::PipeBarrier(); dstQue2.FreeTensor(dstLocal); } -- Gitee From 9e33b15582a75e2257159beaf03fefe9eaeb8829 Mon Sep 17 00:00:00 2001 From: alpaca12345uuu Date: Mon, 9 Jun 2025 21:06:24 +0800 Subject: [PATCH 84/94] =?UTF-8?q?=E4=BB=A3=E7=A0=81=E4=BF=AE=E6=94=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.cpp b/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.cpp index cf9437871..1b2730039 100644 --- a/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.cpp +++ b/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.cpp @@ -16,6 +16,6 @@ extern "C" __global__ __aicore__ void tbufpool_custom(GM_ADDR src0Gm, GM_ADDR sr AscendC::TPipe pipe; MyCustomKernel::TbufPoolImpl op; op.Init(src0Gm, src1Gm, dstGm0, dstGm1, tiling, &pipe); - KERNEL_TASK_TYPE_DEFAULT(KERNEL_TYPE_AIV_ONLY); + //KERNEL_TASK_TYPE_DEFAULT(KERNEL_TYPE_AIV_ONLY); op.Process(); } \ No newline at end of file -- Gitee From 1eb8c9cf89af5ee6af1dea22231a3183e93d2e0b Mon Sep 17 00:00:00 2001 From: alpaca12345uuu Date: Mon, 9 Jun 2025 21:07:27 +0800 Subject: [PATCH 85/94] =?UTF-8?q?=E4=BB=A3=E7=A0=81=E4=BF=AE=E6=94=B9?= MIME-Version: 
1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- operator/ascendc/2_features/2_tbufpool/main.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/operator/ascendc/2_features/2_tbufpool/main.cpp b/operator/ascendc/2_features/2_tbufpool/main.cpp index 018df5ee8..9fbe53583 100644 --- a/operator/ascendc/2_features/2_tbufpool/main.cpp +++ b/operator/ascendc/2_features/2_tbufpool/main.cpp @@ -21,7 +21,7 @@ extern "C" __global__ __aicore__ void tbufpool_custom(GM_ADDR x, GM_ADDR y, GM_A namespace { constexpr uint32_t USED_CORE_NUM = 1; -constexpr uint32_t TOTAL_LENGTH = 4096; +constexpr uint32_t TOTAL_LENGTH = 2048; constexpr uint32_t TILING_SIZE = 1; } -- Gitee From 1f3b0bf0a6078ab1b8952355f78defb95fc9d87d Mon Sep 17 00:00:00 2001 From: alpaca12345uuu Date: Mon, 9 Jun 2025 21:09:30 +0800 Subject: [PATCH 86/94] =?UTF-8?q?=E4=BB=A3=E7=A0=81=E4=BF=AE=E6=94=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.cpp b/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.cpp index 1b2730039..cf9437871 100644 --- a/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.cpp +++ b/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.cpp @@ -16,6 +16,6 @@ extern "C" __global__ __aicore__ void tbufpool_custom(GM_ADDR src0Gm, GM_ADDR sr AscendC::TPipe pipe; MyCustomKernel::TbufPoolImpl op; op.Init(src0Gm, src1Gm, dstGm0, dstGm1, tiling, &pipe); - //KERNEL_TASK_TYPE_DEFAULT(KERNEL_TYPE_AIV_ONLY); + KERNEL_TASK_TYPE_DEFAULT(KERNEL_TYPE_AIV_ONLY); op.Process(); } \ No newline at end of file -- Gitee From 80139b7692d4201f35871020aded8dd59583e3d6 Mon Sep 17 00:00:00 2001 From: alpaca12345uuu Date: Mon, 9 Jun 2025 21:12:34 +0800 Subject: [PATCH 87/94] 
=?UTF-8?q?=E4=BB=A3=E7=A0=81=E4=BF=AE=E6=94=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- operator/ascendc/2_features/2_tbufpool/main.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/operator/ascendc/2_features/2_tbufpool/main.cpp b/operator/ascendc/2_features/2_tbufpool/main.cpp index 9fbe53583..54e595dcf 100644 --- a/operator/ascendc/2_features/2_tbufpool/main.cpp +++ b/operator/ascendc/2_features/2_tbufpool/main.cpp @@ -22,6 +22,7 @@ extern "C" __global__ __aicore__ void tbufpool_custom(GM_ADDR x, GM_ADDR y, GM_A namespace { constexpr uint32_t USED_CORE_NUM = 1; constexpr uint32_t TOTAL_LENGTH = 2048; +constexpr uint32_t DST_LENGTH = 1024; constexpr uint32_t TILING_SIZE = 1; } @@ -112,7 +113,7 @@ static bool CompareResultSubMul(const void *outputData, int64_t outSize) { int32_t main(int32_t argc, char *argv[]) { size_t tilingSize = TILING_SIZE * sizeof(uint32_t); size_t inputSize = TOTAL_LENGTH * sizeof(float); - size_t outputSize = inputSize; + size_t outputSize = DST_LENGTH; #ifdef ASCENDC_CPU_DEBUG uint8_t *x = (uint8_t *)AscendC::GmAlloc(inputSize); -- Gitee From 8e65720f24295fdd5a605d9c42461ea0142adc83 Mon Sep 17 00:00:00 2001 From: alpaca12345uuu Date: Mon, 9 Jun 2025 21:16:27 +0800 Subject: [PATCH 88/94] =?UTF-8?q?=E4=BB=A3=E7=A0=81=E4=BF=AE=E6=94=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../ascendc/2_features/2_tbufpool/main.cpp | 32 ++++++++++--------- 1 file changed, 17 insertions(+), 15 deletions(-) diff --git a/operator/ascendc/2_features/2_tbufpool/main.cpp b/operator/ascendc/2_features/2_tbufpool/main.cpp index 54e595dcf..f11b21f35 100644 --- a/operator/ascendc/2_features/2_tbufpool/main.cpp +++ b/operator/ascendc/2_features/2_tbufpool/main.cpp @@ -113,13 +113,15 @@ static bool CompareResultSubMul(const void *outputData, int64_t outSize) { int32_t main(int32_t argc, char *argv[]) { size_t tilingSize = TILING_SIZE * 
sizeof(uint32_t); size_t inputSize = TOTAL_LENGTH * sizeof(float); - size_t outputSize = DST_LENGTH; + size_t outputSizeAdd = inputSize; + size_t outputSizeSubMul = DST_LENGTH * sizeof(float); + #ifdef ASCENDC_CPU_DEBUG uint8_t *x = (uint8_t *)AscendC::GmAlloc(inputSize); uint8_t *y = (uint8_t *)AscendC::GmAlloc(inputSize); - uint8_t *zAdd = (uint8_t *)AscendC::GmAlloc(outputSize); - uint8_t *zSubMul = (uint8_t *)AscendC::GmAlloc(outputSize); + uint8_t *zAdd = (uint8_t *)AscendC::GmAlloc(outputSizeAdd); + uint8_t *zSubMul = (uint8_t *)AscendC::GmAlloc(outputSizeSubMul); uint8_t *tiling = (uint8_t *)AscendC::GmAlloc(tilingSize); ReadFile("../input/input_x.bin", inputSize, x, inputSize); @@ -131,14 +133,14 @@ int32_t main(int32_t argc, char *argv[]) { ICPU_RUN_KF(tbufpool_custom, USED_CORE_NUM, x, y, zAdd, zSubMul, *reinterpret_cast(tiling)); // use this macro for cpu debug - WriteFile("../output/output_add.bin", zAdd, outputSize); - WriteFile("../output/output_sub_mul.bin", zSubMul, outputSize); + WriteFile("../output/output_add.bin", zAdd, outputSizeAdd); + WriteFile("../output/output_sub_mul.bin", zSubMul, outputSizeSubMul); bool goldenResultAdd = true; - goldenResultAdd = CompareResultAdd(zAdd, outputSize); + goldenResultAdd = CompareResultAdd(zAdd, outputSizeAdd); bool goldenResultSubMul = true; - goldenResultSubMul = CompareResultSubMul(zSubMul, outputSize); + goldenResultSubMul = CompareResultSubMul(zSubMul, outputSizeSubMul); AscendC::GmFree((void *)x); AscendC::GmFree((void *)y); @@ -164,14 +166,14 @@ int32_t main(int32_t argc, char *argv[]) { CHECK_ACL(aclrtMallocHost((void **)(&xHost), inputSize)); CHECK_ACL(aclrtMallocHost((void **)(&yHost), inputSize)); - CHECK_ACL(aclrtMallocHost((void **)(&zHostAdd), outputSize)); - CHECK_ACL(aclrtMallocHost((void **)(&zHostSubMul), outputSize)); + CHECK_ACL(aclrtMallocHost((void **)(&zHostAdd), outputSizeAdd)); + CHECK_ACL(aclrtMallocHost((void **)(&zHostSubMul), outputSizeSubMul)); CHECK_ACL(aclrtMallocHost((void 
**)(&tiling), tilingSize)); CHECK_ACL(aclrtMalloc((void **)&xDevice, inputSize, ACL_MEM_MALLOC_HUGE_FIRST)); CHECK_ACL(aclrtMalloc((void **)&yDevice, inputSize, ACL_MEM_MALLOC_HUGE_FIRST)); - CHECK_ACL(aclrtMalloc((void **)&zDeviceAdd, outputSize, ACL_MEM_MALLOC_HUGE_FIRST)); - CHECK_ACL(aclrtMalloc((void **)&zDeviceSubMul, outputSize, ACL_MEM_MALLOC_HUGE_FIRST)); + CHECK_ACL(aclrtMalloc((void **)&zDeviceAdd, outputSizeAdd, ACL_MEM_MALLOC_HUGE_FIRST)); + CHECK_ACL(aclrtMalloc((void **)&zDeviceSubMul, outputSizeSubMul, ACL_MEM_MALLOC_HUGE_FIRST)); ReadFile("../input/input_x.bin", inputSize, xHost, inputSize); ReadFile("../input/input_y.bin", inputSize, yHost, inputSize); @@ -190,16 +192,16 @@ int32_t main(int32_t argc, char *argv[]) { CHECK_ACL(aclrtSynchronizeStream(stream)); // Copy result to host memory and write to output file - CHECK_ACL(aclrtMemcpy(zHostAdd, outputSize, zDeviceAdd, outputSize, ACL_MEMCPY_DEVICE_TO_HOST)); + CHECK_ACL(aclrtMemcpy(zHostAdd, outputSizeAdd, zDeviceAdd, outputSize, ACL_MEMCPY_DEVICE_TO_HOST)); WriteFile("../output/output_add.bin", zHostAdd, outputSize); - CHECK_ACL(aclrtMemcpy(zHostSubMul, outputSize, zDeviceSubMul, outputSize, ACL_MEMCPY_DEVICE_TO_HOST)); + CHECK_ACL(aclrtMemcpy(zHostSubMul, outputSizeSubMul, zDeviceSubMul, outputSize, ACL_MEMCPY_DEVICE_TO_HOST)); WriteFile("../output/output_sub_mul.bin", zHostSubMul, outputSize); // Compare the result with the golden result bool goldenResultAdd = true; - goldenResultAdd = CompareResultAdd(zHostAdd, outputSize); + goldenResultAdd = CompareResultAdd(zHostAdd, outputSizeAdd); bool goldenResultSubMul = true; - goldenResultSubMul = CompareResultSubMul(zHostSubMul, outputSize); + goldenResultSubMul = CompareResultSubMul(zHostSubMul, outputSizeSubMul); // Clean up memory CHECK_ACL(aclrtFree(xDevice)); -- Gitee From 6ea1687f4a7ed576c507a7aa84f9fd0aa0eb363f Mon Sep 17 00:00:00 2001 From: alpaca12345uuu Date: Mon, 9 Jun 2025 21:17:56 +0800 Subject: [PATCH 89/94] 
=?UTF-8?q?=E4=BB=A3=E7=A0=81=E4=BF=AE=E6=94=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- operator/ascendc/2_features/2_tbufpool/main.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/operator/ascendc/2_features/2_tbufpool/main.cpp b/operator/ascendc/2_features/2_tbufpool/main.cpp index f11b21f35..a506e7c8d 100644 --- a/operator/ascendc/2_features/2_tbufpool/main.cpp +++ b/operator/ascendc/2_features/2_tbufpool/main.cpp @@ -192,10 +192,10 @@ int32_t main(int32_t argc, char *argv[]) { CHECK_ACL(aclrtSynchronizeStream(stream)); // Copy result to host memory and write to output file - CHECK_ACL(aclrtMemcpy(zHostAdd, outputSizeAdd, zDeviceAdd, outputSize, ACL_MEMCPY_DEVICE_TO_HOST)); - WriteFile("../output/output_add.bin", zHostAdd, outputSize); - CHECK_ACL(aclrtMemcpy(zHostSubMul, outputSizeSubMul, zDeviceSubMul, outputSize, ACL_MEMCPY_DEVICE_TO_HOST)); - WriteFile("../output/output_sub_mul.bin", zHostSubMul, outputSize); + CHECK_ACL(aclrtMemcpy(zHostAdd, outputSizeAdd, zDeviceAdd, outputSizeAdd, ACL_MEMCPY_DEVICE_TO_HOST)); + WriteFile("../output/output_add.bin", zHostAdd, outputSizeAdd); + CHECK_ACL(aclrtMemcpy(zHostSubMul, outputSizeSubMul, zDeviceSubMul, outputSizeSubMul, ACL_MEMCPY_DEVICE_TO_HOST)); + WriteFile("../output/output_sub_mul.bin", zHostSubMul, outputSizeSubMul); // Compare the result with the golden result bool goldenResultAdd = true; -- Gitee From dce4700cda6175c18b4ea49f5de537c81f0efa3d Mon Sep 17 00:00:00 2001 From: alpaca12345uuu Date: Mon, 9 Jun 2025 21:21:30 +0800 Subject: [PATCH 90/94] =?UTF-8?q?=E4=BB=A3=E7=A0=81=E4=BF=AE=E6=94=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../2_tbufpool/op_kernel/tbufpool_custom.h | 51 +------------------ 1 file changed, 1 insertion(+), 50 deletions(-) diff --git a/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h 
b/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h index e4d4943a9..ef0a0e1ed 100644 --- a/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h +++ b/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h @@ -50,38 +50,21 @@ class TbufPoolImpl { tbufPool2.InitBuffer(dstQue2, BUFFER_NUM_T2, INIT_TENSOR_LENGTH_HALF); tbufPool2.InitBuffer(srcQue3, BUFFER_NUM_T2, INIT_TENSOR_LENGTH_HALF); tbufPool2.InitBuffer(tmp, INIT_TENSOR_LENGTH_HALF); - //tbufPool2.InitBuffer(dstQue2, BUFFER_NUM_T2, INIT_TENSOR_LENGTH); } __aicore__ inline void Process() { for (int32_t i = 0; i < SPLIT_NUM; i++) { //stage 1 - //AscendC::PipeBarrier(); - CopyIn(i); - //AscendC::PipeBarrier(); - Compute(i); - //AscendC::PipeBarrier(); - CopyOut(i); tbufPool1.Reset(); - //AscendC::PipeBarrier(); - //stage 2 CopyIn1(i); - //AscendC::PipeBarrier(); - Compute1(i); - //AscendC::PipeBarrier(); - CopyOut1(i); - //AscendC::PipeBarrier(); - tbufPool2.Reset(); - //AscendC::PipeBarrier(); - } tbufPool0.Reset(); } @@ -92,10 +75,9 @@ class TbufPoolImpl { AscendC::LocalTensor src1Local = srcQue1.AllocTensor(); AscendC::DataCopy(src0Local, src0Global[progress * COMPUTE_LENGTH], COMPUTE_LENGTH); AscendC::DataCopy(src1Local, src1Global[progress * COMPUTE_LENGTH], COMPUTE_LENGTH); - //AscendC::PipeBarrier(); srcQue0.EnQue(src0Local); srcQue1.EnQue(src1Local); - AscendC::PipeBarrier(); + //AscendC::PipeBarrier(); } __aicore__ inline void Compute(int32_t progress) { @@ -118,27 +100,12 @@ class TbufPoolImpl { AscendC::LocalTensor src0Local = srcQue0.AllocTensor(); AscendC::LocalTensor src1Local = srcQue2.AllocTensor(); AscendC::LocalTensor src2Local = srcQue3.AllocTensor(); - //AscendC::PipeBarrier(); - // AscendC::DumpTensor(src0Local, 1, 16); - // AscendC::DumpTensor(src1Local, 2, 16); - // AscendC::DumpTensor(src2Local, 3, 16); - AscendC::DataCopy(src0Local, src0Global[progress * COMPUTE_LENGTH_HALF], COMPUTE_LENGTH_HALF); - //AscendC::PipeBarrier(); - 
AscendC::DataCopy(src1Local, src1Global[progress * COMPUTE_LENGTH_HALF], COMPUTE_LENGTH_HALF); - //AscendC::PipeBarrier(); - AscendC::DataCopy(src2Local, src1Global[progress * COMPUTE_LENGTH_HALF], COMPUTE_LENGTH_HALF); - //AscendC::PipeBarrier(); - // AscendC::DumpTensor(src0Local, 1, 16); - // AscendC::DumpTensor(src1Local, 2, 16); - // AscendC::DumpTensor(src2Local, 3, 16); - srcQue0.EnQue(src0Local); srcQue2.EnQue(src1Local); srcQue3.EnQue(src2Local); - } __aicore__ inline void Compute1(int32_t progress) { @@ -148,17 +115,9 @@ class TbufPoolImpl { AscendC::LocalTensor src2Local = srcQue3.DeQue(); AscendC::LocalTensor dstLocal = dstQue2.AllocTensor(); AscendC::LocalTensor tmpTensor = tmp.Get(); - // AscendC::DumpTensor(src0Local, 1, 16); - // AscendC::DumpTensor(src1Local, 2, 16); - // AscendC::DumpTensor(src2Local, 3, 16); - AscendC::Sub(tmpTensor, src0Local, src1Local, COMPUTE_LENGTH_HALF); AscendC::PipeBarrier(); - //AscendC::DumpTensor(tmpTensor, 1, 16); - AscendC::Mul(dstLocal, tmpTensor, src2Local, COMPUTE_LENGTH_HALF); - //AscendC::PipeBarrier(); - //AscendC::DumpTensor(dstLocal, 2, 16); dstQue2.EnQue(dstLocal); srcQue0.FreeTensor(src0Local); srcQue2.FreeTensor(src1Local); @@ -167,10 +126,7 @@ class TbufPoolImpl { __aicore__ inline void CopyOut1(int32_t progress) { AscendC::LocalTensor dstLocal = dstQue2.DeQue(); - //AscendC::DumpTensor(dstLocal, 5, 16); AscendC::DataCopy(dstGlobal1[progress * COMPUTE_LENGTH_HALF], dstLocal, COMPUTE_LENGTH_HALF); - //AscendC::PipeBarrier(); - //AscendC::DumpTensor(dstGlobal1, 1, 16); AscendC::PipeBarrier(); dstQue2.FreeTensor(dstLocal); } @@ -184,19 +140,14 @@ class TbufPoolImpl { AscendC::TQue srcQue1; AscendC::TQue srcQue2; AscendC::TQue srcQue3; - AscendC::TQue dstQue; AscendC::TQue dstQue1; AscendC::TQue dstQue2; - //AscendC::TQue dstQue3; - AscendC::GlobalTensor src0Global; AscendC::GlobalTensor src1Global; AscendC::GlobalTensor dstGlobal0; AscendC::GlobalTensor dstGlobal1; - AscendC::TBuf tmp; - uint32_t totalLength 
= 0; }; }// namespace MyCustomKernel -- Gitee From 8bc791f17a856884edd99d832a5a2e19c278df14 Mon Sep 17 00:00:00 2001 From: alpaca12345uuu Date: Mon, 9 Jun 2025 21:23:00 +0800 Subject: [PATCH 91/94] =?UTF-8?q?=E4=BB=A3=E7=A0=81=E4=BF=AE=E6=94=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h b/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h index ef0a0e1ed..9a89844c7 100644 --- a/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h +++ b/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h @@ -77,7 +77,7 @@ class TbufPoolImpl { AscendC::DataCopy(src1Local, src1Global[progress * COMPUTE_LENGTH], COMPUTE_LENGTH); srcQue0.EnQue(src0Local); srcQue1.EnQue(src1Local); - //AscendC::PipeBarrier(); + AscendC::PipeBarrier(); } __aicore__ inline void Compute(int32_t progress) { -- Gitee From 3c0d7b810f4e84e00a9b336b852b955306c20b2d Mon Sep 17 00:00:00 2001 From: alpaca12345uuu Date: Mon, 9 Jun 2025 21:24:19 +0800 Subject: [PATCH 92/94] =?UTF-8?q?=E4=BB=A3=E7=A0=81=E4=BF=AE=E6=94=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h | 1 + 1 file changed, 1 insertion(+) diff --git a/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h b/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h index 9a89844c7..7a24a2f11 100644 --- a/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h +++ b/operator/ascendc/2_features/2_tbufpool/op_kernel/tbufpool_custom.h @@ -85,6 +85,7 @@ class TbufPoolImpl { AscendC::LocalTensor src1Local = srcQue1.DeQue(); AscendC::LocalTensor dstLocal = dstQue1.AllocTensor(); AscendC::Add(dstLocal, src0Local, src1Local, 
COMPUTE_LENGTH); + AscendC::PipeBarrier(); dstQue1.EnQue(dstLocal); srcQue0.FreeTensor(src0Local); srcQue1.FreeTensor(src1Local); -- Gitee From 678e58015720a3d0c115f91eaa4a29c72bc43cfa Mon Sep 17 00:00:00 2001 From: alpaca12345uuu Date: Mon, 9 Jun 2025 21:33:30 +0800 Subject: [PATCH 93/94] =?UTF-8?q?=E4=BB=A3=E7=A0=81=E4=BF=AE=E6=94=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- operator/ascendc/2_features/2_tbufpool/scripts/gen_data.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/operator/ascendc/2_features/2_tbufpool/scripts/gen_data.py b/operator/ascendc/2_features/2_tbufpool/scripts/gen_data.py index 82fbc4a93..18a8f89cd 100644 --- a/operator/ascendc/2_features/2_tbufpool/scripts/gen_data.py +++ b/operator/ascendc/2_features/2_tbufpool/scripts/gen_data.py @@ -19,11 +19,6 @@ def gen_golden_data_simple(): input_x = np.random.randint(0, np.nextafter(1000, np.inf), input_shape).astype(dtype) input_y = np.random.randint(0, np.nextafter(1000, np.inf), input_shape).astype(dtype) - # input_x = np.arange(2048).astype(dtype) - # input_x = input_x.reshape(input_shape) - # input_y = np.arange(2048).astype(dtype) - # input_y = input_y.reshape(input_shape) - half_size = input_shape[0] // 2 input_x_half = input_x[:half_size, :] input_y_half = input_y[:half_size, :] -- Gitee From cfcc92c6bf3ea4c095ef84d01d6419098f90abe5 Mon Sep 17 00:00:00 2001 From: alpaca12345uuu Date: Mon, 9 Jun 2025 21:49:26 +0800 Subject: [PATCH 94/94] =?UTF-8?q?=E4=BB=A3=E7=A0=81=E4=BF=AE=E6=94=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- operator/ascendc/2_features/2_tbufpool/README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/operator/ascendc/2_features/2_tbufpool/README.md b/operator/ascendc/2_features/2_tbufpool/README.md index efb0295c9..c7e17b6f8 100644 --- a/operator/ascendc/2_features/2_tbufpool/README.md +++ 
b/operator/ascendc/2_features/2_tbufpool/README.md @@ -13,7 +13,7 @@ │ └── run.sh // 编译运行算子的脚本 ``` ## 代码实现介绍 -数据量较大且内存有限时,无法一次完成所有数据搬运,需要拆分成多个阶段计算,每次计算使用其中的一部分数据,可以通过TBufPool资源池进行内存地址复用。本例中,通过调用InitBufPool基础API对Add算子和Sub算子实现过程进行内存管理。从Tpipe划分出资源池tbufPool0,tbufPool0为src0Gm分配空间后,继续分配了资源池tbufPool1,指定tbufPool1与tbufPool2复用并分别运用于第一、二轮计算,此时tbufPool1及tbufPool2共享起始地址及长度。 +数据量较大且内存有限时,无法一次完成所有数据搬运,需要拆分成多个阶段计算,每次计算使用其中的一部分数据,可以通过TBufPool资源池进行内存地址复用。本例中,通过调用InitBufPool基础API对Add算子、Sub算子和Mul算子实现过程进行内存管理。从Tpipe划分出资源池tbufPool0,tbufPool0为src0Gm分配空间后,继续分配了资源池tbufPool1,指定tbufPool1与tbufPool2复用并分别运用于第一、二轮计算,此时tbufPool1及tbufPool2共享起始地址及长度。 - kernel实现 Add算子的数学表达式为: @@ -24,14 +24,14 @@ ``` z = x - y ``` - 计算逻辑是:Ascend C提供的矢量计算接口的操作元素都为LocalTensor,首先启用tbufool1,将部分输入数据src0Gm,部分输入数据src1Gm搬运进片上储存,调用计算接口完成相加计算,搬出到外部存储上。之后切换到tbufpool2进行相减计算。完成后切换回tbufpool1完成剩余数据相加计算,得到最终相加结果,再切换到tbufpool2完成剩余数据相减计算,得到最终结果,再搬出到外部存储上。 + 计算逻辑是:Ascend C提供的矢量计算接口的操作元素都为LocalTensor,首先启用tbufpool1,将部分输入数据src0Gm,部分输入数据src1Gm搬运进片上储存,调用计算接口完成相加计算,搬出到外部存储上。之后切换到tbufpool2进行相减、相乘计算。完成后切换回tbufpool1完成剩余数据相加计算,得到最终相加结果,再切换到tbufpool2完成剩余数据相减、相乘计算,得到最终结果,再搬出到外部存储上。 Add算子的实现流程分为6个基本任务:CopyIn,Compute,CopyOut,CopyIn1,Compute1,CopyOut1。 - CopyIn任务负责将Global Memory上的部分输入Tensor src0Gm和src1Gm搬运到Local Memory,分别存储在src0Local、src1Local; - Compute任务负责对src0Local、src1Local执行加法操作,计算结果存储在dstLocal中; - CopyOut任务负责将输出数据从dstLocal搬运至Global Memory上的输出Tensor dstGm0中。 - CopyIn1任务负责将Global Memory上的剩余输入Tensor src0Gm和src1Gm搬运到Local Memory,分别存储在src0Local、src1Local; - - Compute1任务负责对src0Local、src1Local执行减法操作,计算结果存储在dstLocal中; + - Compute1任务负责对src0Local、src1Local执行减法、乘法操作,计算结果存储在dstLocal中; - CopyOut1任务负责将输出数据从dstLocal搬运至Global Memory上的输出Tensor dstGm1中。 - 调用实现 -- Gitee