From 7b654a44e8dd56d2ce5b3e408a23a685d6e21cd6 Mon Sep 17 00:00:00 2001 From: zhanghao0689 Date: Wed, 2 Jul 2025 17:24:37 +0800 Subject: [PATCH 01/17] add gm conflict case --- .../KernelLaunch/CMakeLists.txt | 47 ++++ .../KernelLaunch/README.md | 88 ++++++++ .../KernelLaunch/adds_custom_v1.cpp | 98 +++++++++ .../KernelLaunch/adds_custom_v2.cpp | 104 +++++++++ .../KernelLaunch/cmake/cpu_lib.cmake | 9 + .../KernelLaunch/cmake/npu_lib.cmake | 11 + .../KernelLaunch/data_utils.h | 203 ++++++++++++++++++ .../KernelLaunch/main.cpp | 128 +++++++++++ .../KernelLaunch/run.sh | 113 ++++++++++ .../KernelLaunch/scripts/gen_data.py | 23 ++ .../KernelLaunch/scripts/verify_result.py | 53 +++++ 11 files changed, 877 insertions(+) create mode 100644 operator/ascendc/4_best_practices/15_mata_address_conflict/KernelLaunch/CMakeLists.txt create mode 100644 operator/ascendc/4_best_practices/15_mata_address_conflict/KernelLaunch/README.md create mode 100644 operator/ascendc/4_best_practices/15_mata_address_conflict/KernelLaunch/adds_custom_v1.cpp create mode 100644 operator/ascendc/4_best_practices/15_mata_address_conflict/KernelLaunch/adds_custom_v2.cpp create mode 100644 operator/ascendc/4_best_practices/15_mata_address_conflict/KernelLaunch/cmake/cpu_lib.cmake create mode 100644 operator/ascendc/4_best_practices/15_mata_address_conflict/KernelLaunch/cmake/npu_lib.cmake create mode 100644 operator/ascendc/4_best_practices/15_mata_address_conflict/KernelLaunch/data_utils.h create mode 100644 operator/ascendc/4_best_practices/15_mata_address_conflict/KernelLaunch/main.cpp create mode 100755 operator/ascendc/4_best_practices/15_mata_address_conflict/KernelLaunch/run.sh create mode 100644 operator/ascendc/4_best_practices/15_mata_address_conflict/KernelLaunch/scripts/gen_data.py create mode 100644 operator/ascendc/4_best_practices/15_mata_address_conflict/KernelLaunch/scripts/verify_result.py diff --git a/operator/ascendc/4_best_practices/15_mata_address_conflict/KernelLaunch/CMakeLists.txt 
b/operator/ascendc/4_best_practices/15_mata_address_conflict/KernelLaunch/CMakeLists.txt new file mode 100644 index 000000000..258b2e3b7 --- /dev/null +++ b/operator/ascendc/4_best_practices/15_mata_address_conflict/KernelLaunch/CMakeLists.txt @@ -0,0 +1,47 @@ +cmake_minimum_required(VERSION 3.16) +project(Ascend_c) + +set(RUN_MODE "npu" CACHE STRING "cpu/sim/npu") +set(SOC_VERSION "Ascend310P3" CACHE STRING "system on chip type") +set(ASCEND_CANN_PACKAGE_PATH "/usr/local/Ascend/ascend-toolkit/latest" + CACHE STRING "ASCEND CANN package installation directory" +) +if(NOT CMAKE_BUILD_TYPE) + set(CMAKE_BUILD_TYPE "Debug" CACHE STRING "Build type Release/Debug (default Debug)" FORCE) +endif() +if(CMAKE_INSTALL_PREFIX STREQUAL /usr/local) + set(CMAKE_INSTALL_PREFIX "${CMAKE_CURRENT_LIST_DIR}/out" CACHE STRING "path for install()" FORCE) +endif() + +# ${KERNEL_FILES} are used to compile library, push files written by ascendc in ${KERNEL_FILES}. +# ref to cmake/npu.cmake ascendc_library, cmake/cpu.cmake add_library +file(GLOB KERNEL_FILES + ${CMAKE_CURRENT_SOURCE_DIR}/adds_custom_v1.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/adds_custom_v2.cpp +) + +if("${RUN_MODE}" STREQUAL "cpu") + include(cmake/cpu_lib.cmake) +elseif("${RUN_MODE}" STREQUAL "sim" OR "${RUN_MODE}" STREQUAL "npu") + include(cmake/npu_lib.cmake) +else() + message("invalid RUN_MODE: ${RUN_MODE}") +endif() +add_executable(ascendc_kernels_bbit ${CMAKE_CURRENT_SOURCE_DIR}/main.cpp) + +target_compile_options(ascendc_kernels_bbit PRIVATE + $:-g>> + -O2 -std=c++17 -D_GLIBCXX_USE_CXX11_ABI=0 -Wall -Werror +) + +target_link_libraries(ascendc_kernels_bbit PRIVATE + $,$>:host_intf_pub>> + $:ascendcl>> + ascendc_kernels_${RUN_MODE} +) + +install(TARGETS ascendc_kernels_bbit + LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} + ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} + RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} +) diff --git a/operator/ascendc/4_best_practices/15_mata_address_conflict/KernelLaunch/README.md 
b/operator/ascendc/4_best_practices/15_mata_address_conflict/KernelLaunch/README.md new file mode 100644 index 000000000..f72b521cd --- /dev/null +++ b/operator/ascendc/4_best_practices/15_mata_address_conflict/KernelLaunch/README.md @@ -0,0 +1,88 @@ +## 目录结构介绍 + +``` +├── KernelLaunch +│ ├── cmake // 编译工程文件 +│ ├── scripts +│ │ ├── gen_data.py // 输入数据和真值数据生成脚本 +│ │ └── verify_result.py // 验证输出数据和真值数据是否一致的验证脚本 +│ ├── add_custom_v1.cpp // 算子kernel实现1:未优化前实现 +│ ├── add_custom_v2.cpp // 算子kernel实现2:优化地址分配,消除Bank冲突后的实现 +│ ├── CMakeLists.txt // 编译工程文件 +│ ├── data_utils.h // 数据读入写出函数 +│ ├── main.cpp // 主函数,调用算子的应用程序,含CPU域及NPU域调用 +│ └── run.sh // 编译运行算子的脚本 +``` + +## 代码实现介绍 + +本样例中实现的是固定shape为1*4096的Add算子。 + +- kernel实现 + + Add算子的数学表达式为: + + ``` + z = x + y + ``` + + 计算逻辑是:Ascend C提供的矢量计算接口的操作元素都为LocalTensor,输入数据需要先搬运进片上存储,然后使用计算接口完成两个输入参数相加,得到最终结果,再搬出到外部存储上。 + + Add算子的实现流程分为3个基本任务:CopyIn,Compute,CopyOut。CopyIn任务负责将Global Memory上的输入Tensor xGm和yGm搬运到Local Memory,分别存储在xLocal、yLocal,Compute任务负责对xLocal、yLocal执行加法操作,计算结果存储在zLocal中,CopyOut任务负责将输出数据从zLocal搬运至Global Memory上的输出Tensor zGm中。 + + 实现1:请参考[add_custom_v1.cpp](./add_custom_v1.cpp),xLocal地址为0,yLocal地址为0x4000,zLocal地址为0x8000。xLocal与yLocal存在读读冲突,xLocal与zLocal存在读写冲突。 + + 实现2:请参考[add_custom_v2.cpp](./add_custom_v2.cpp),为了避免Bank冲突,通过配置InitBuffer时的bufferSize来调整Tensor地址,xLocal地址为0,yLocal地址为0x4100,zLocal地址为0x10000。 +- 调用实现 + + 1. CPU侧运行验证主要通过ICPU_RUN_KF CPU调测宏等CPU调测库提供的接口来完成; + 2. 
NPU侧运行验证主要通过使用ACLRT_LAUNCH_KERNEL内核调用宏来完成。 + + 应用程序通过ASCENDC_CPU_DEBUG 宏区分代码逻辑运行于CPU侧还是NPU侧。 + +## 运行样例算子 + +- 打开样例目录 + 以命令行方式下载样例代码,master分支为例。 + + ```bash + cd ${git_clone_path}/samples/operator/ascendc/4_best_practices/4_bank_conflict/KernelLaunch + ``` +- 配置环境变量 + + 请根据当前环境上CANN开发套件包的[安装方式](https://hiascend.com/document/redirect/CannCommunityInstSoftware),选择对应配置环境变量的命令。 + + - 默认路径,root用户安装CANN软件包 + ```bash + export ASCEND_INSTALL_PATH=/usr/local/Ascend/ascend-toolkit/latest + ``` + - 默认路径,非root用户安装CANN软件包 + ```bash + export ASCEND_INSTALL_PATH=$HOME/Ascend/ascend-toolkit/latest + ``` + - 指定路径install_path,安装CANN软件包 + ```bash + export ASCEND_INSTALL_PATH=${install_path}/ascend-toolkit/latest + ``` +- 样例执行 + + ```bash + bash run.sh -r [RUN_MODE] -v [SOC_VERSION] + ``` + + - RUN_MODE:编译方式,可选择CPU调试,NPU仿真,NPU上板。支持参数为[cpu /sim / npu] + - SOC_VERSION:昇腾AI处理器型号,如果无法确定具体的[SOC_VERSION],则在安装昇腾AI处理器的服务器执行npu-smi info命令进行查询,在查询到的“Name”前增加Ascend信息,例如“Name”对应取值为xxxyy,实际配置的[SOC_VERSION]值为Ascendxxxyy。支持以下产品型号: + - Atlas A2训练系列产品/Atlas 800I A2推理产品 + + 示例如下,Ascendxxxyy请替换为实际的AI处理器型号。 + + ```bash + bash run.sh -r cpu -v Ascendxxxyy + ``` + +## 更新说明 + + +| 时间 | 更新事项 | +| ---------- | ------------ | +| 2025/07/01 | 新增本readme | diff --git a/operator/ascendc/4_best_practices/15_mata_address_conflict/KernelLaunch/adds_custom_v1.cpp b/operator/ascendc/4_best_practices/15_mata_address_conflict/KernelLaunch/adds_custom_v1.cpp new file mode 100644 index 000000000..5c06ce096 --- /dev/null +++ b/operator/ascendc/4_best_practices/15_mata_address_conflict/KernelLaunch/adds_custom_v1.cpp @@ -0,0 +1,98 @@ +/** + * @file add_custom_v1.cpp + * + * Copyright (C) 2025. Huawei Technologies Co., Ltd. All rights reserved. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
+ */ +#include "kernel_operator.h" + +using AscendC::TPosition; +namespace { +constexpr int32_t M = 8192; +constexpr int32_t N = 128; +constexpr int32_t TILE_M = 512; +constexpr int32_t TILE_N = 8; +constexpr int32_t USED_CORE_NUM = N / TILE_N; +constexpr int32_t LOOP_ONE_CORE = M / TILE_M; +constexpr int32_t BUFFER_NUM = 2; +constexpr int32_t BLOCK_SIZE = 32; +} // namespace + +class KernelAdd { +public: + __aicore__ inline KernelAdd() {} + __aicore__ inline void Init(GM_ADDR x, GM_ADDR z) + { + // the gm address conflict happens when L2 cache miss + // so we disable the L2 cache mode to show the + xGm.SetGlobalBuffer((__gm__ float *)x + AscendC::GetBlockIdx() * TILE_N); + zGm.SetGlobalBuffer((__gm__ float *)z + AscendC::GetBlockIdx() * TILE_N); + pipe.InitBuffer(inQueueX, BUFFER_NUM, TILE_M * TILE_N * sizeof(float)); + pipe.InitBuffer(outQueueZ, BUFFER_NUM, TILE_M * TILE_N * sizeof(float)); + } + __aicore__ inline void Process() + { + for (int32_t i = 0; i < LOOP_ONE_CORE; i++) { + CopyIn(i); + Compute(); + CopyOut(i); + } + } + +private: + __aicore__ inline void CopyIn(int32_t progress) + { + AscendC::LocalTensor xLocal = inQueueX.AllocTensor(); + AscendC::DataCopyParams params; + params.blockCount = TILE_M; + params.blockLen = TILE_N * sizeof(float) / BLOCK_SIZE; + params.srcStride = (N - TILE_N) * sizeof(float) / BLOCK_SIZE; + params.dstStride = 0; + AscendC::DataCopy(xLocal, xGm[progress * TILE_M * N], params); + inQueueX.EnQue(xLocal); + } + __aicore__ inline void Compute() + { + AscendC::LocalTensor xLocal = inQueueX.DeQue(); + AscendC::LocalTensor zLocal = outQueueZ.AllocTensor(); + constexpr float scale = 2.0; + AscendC::Adds(zLocal, xLocal, scale, TILE_M * TILE_N); + outQueueZ.EnQue(zLocal); + inQueueX.FreeTensor(xLocal); + } + __aicore__ inline void CopyOut(int32_t progress) + { + AscendC::LocalTensor zLocal = outQueueZ.DeQue(); + AscendC::DataCopyParams params; + params.blockCount = TILE_M; + params.blockLen = TILE_N * sizeof(float) / BLOCK_SIZE; + 
params.srcStride = 0; + params.dstStride = (N - TILE_N) * sizeof(float) / BLOCK_SIZE; + AscendC::DataCopy(zGm[progress * TILE_M * N], zLocal, params); + outQueueZ.FreeTensor(zLocal); + } + +private: + AscendC::TPipe pipe; + AscendC::TQue inQueueX; + AscendC::TQue outQueueZ; + AscendC::GlobalTensor xGm; + AscendC::GlobalTensor zGm; +}; + +extern "C" __global__ __aicore__ void adds_custom_v1(GM_ADDR x, GM_ADDR z) +{ + KernelAdd op; + op.Init(x, z); + op.Process(); +} + +#ifndef ASCENDC_CPU_DEBUG +void adds_custom_do_v1(uint32_t blockDim, void *stream, uint8_t *x, uint8_t *z) +{ + adds_custom_v1<<>>(x, z); +} +#endif diff --git a/operator/ascendc/4_best_practices/15_mata_address_conflict/KernelLaunch/adds_custom_v2.cpp b/operator/ascendc/4_best_practices/15_mata_address_conflict/KernelLaunch/adds_custom_v2.cpp new file mode 100644 index 000000000..d1d114bb9 --- /dev/null +++ b/operator/ascendc/4_best_practices/15_mata_address_conflict/KernelLaunch/adds_custom_v2.cpp @@ -0,0 +1,104 @@ +/** + * @file adds_custom_v2.cpp + * + * Copyright (C) 2025. Huawei Technologies Co., Ltd. All rights reserved. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
+ */ +#include "kernel_operator.h" + +using AscendC::TPosition; +namespace { +constexpr int32_t M = 8192; +constexpr int32_t N = 128; +constexpr int32_t TILE_M = 512; +constexpr int32_t TILE_N = 8; +constexpr int32_t USED_CORE_NUM = N / TILE_N; +constexpr int32_t LOOP_ONE_CORE = M / TILE_M; +constexpr int32_t BUFFER_NUM = 2; +constexpr int32_t BLOCK_SIZE = 32; +} // namespace + +class KernelAdd { +public: + __aicore__ inline KernelAdd() {} + __aicore__ inline void Init(GM_ADDR x, GM_ADDR z) + { + // the gm address conflict happens when L2 cache miss + // so we disable the L2 cache mode to show the + xGm.SetGlobalBuffer((__gm__ float *)x + AscendC::GetBlockIdx() * TILE_N); + zGm.SetGlobalBuffer((__gm__ float *)z + AscendC::GetBlockIdx() * TILE_N); + pipe.InitBuffer(inQueueX, BUFFER_NUM, TILE_M * TILE_N * sizeof(float)); + pipe.InitBuffer(outQueueZ, BUFFER_NUM, TILE_M * TILE_N * sizeof(float)); + } + __aicore__ inline void Process() + { + for (int32_t i = 0; i < LOOP_ONE_CORE; i++) { + // to avoid the gm address conflict: + // the loop order core0 : 0, 1, 2, 3, ..., 13, 14, 15 + // the loop order core1 : 1, 2, 3, 4, ..., 14, 15, 0 + // ... 
+ // the loop order core15 : 15, 0, 1, 2, ..., 12, 13, 14 + int32_t newProgress = (i + AscendC::GetBlockIdx()) % LOOP_ONE_CORE; + CopyIn(newProgress); + Compute(); + CopyOut(newProgress); + } + } + +private: + __aicore__ inline void CopyIn(int32_t progress) + { + AscendC::LocalTensor xLocal = inQueueX.AllocTensor(); + AscendC::DataCopyParams params; + params.blockCount = TILE_M; + params.blockLen = TILE_N * sizeof(float) / BLOCK_SIZE; + params.srcStride = (N - TILE_N) * sizeof(float) / BLOCK_SIZE; + params.dstStride = 0; + AscendC::DataCopy(xLocal, xGm[progress * TILE_M * N], params); + inQueueX.EnQue(xLocal); + } + __aicore__ inline void Compute() + { + AscendC::LocalTensor xLocal = inQueueX.DeQue(); + AscendC::LocalTensor zLocal = outQueueZ.AllocTensor(); + constexpr float scale = 2.0; + AscendC::Adds(zLocal, xLocal, scale, TILE_M * TILE_N); + outQueueZ.EnQue(zLocal); + inQueueX.FreeTensor(xLocal); + } + __aicore__ inline void CopyOut(int32_t progress) + { + AscendC::LocalTensor zLocal = outQueueZ.DeQue(); + AscendC::DataCopyParams params; + params.blockCount = TILE_M; + params.blockLen = TILE_N * sizeof(float) / BLOCK_SIZE; + params.srcStride = 0; + params.dstStride = (N - TILE_N) * sizeof(float) / BLOCK_SIZE; + AscendC::DataCopy(zGm[progress * TILE_M * N], zLocal, params); + outQueueZ.FreeTensor(zLocal); + } + +private: + AscendC::TPipe pipe; + AscendC::TQue inQueueX; + AscendC::TQue outQueueZ; + AscendC::GlobalTensor xGm; + AscendC::GlobalTensor zGm; +}; + +extern "C" __global__ __aicore__ void adds_custom_v2(GM_ADDR x, GM_ADDR z) +{ + KernelAdd op; + op.Init(x, z); + op.Process(); +} + +#ifndef ASCENDC_CPU_DEBUG +void adds_custom_do_v2(uint32_t blockDim, void *stream, uint8_t *x, uint8_t *z) +{ + adds_custom_v2<<>>(x, z); +} +#endif diff --git a/operator/ascendc/4_best_practices/15_mata_address_conflict/KernelLaunch/cmake/cpu_lib.cmake b/operator/ascendc/4_best_practices/15_mata_address_conflict/KernelLaunch/cmake/cpu_lib.cmake new file mode 100644 index 
000000000..5362c8b5a --- /dev/null +++ b/operator/ascendc/4_best_practices/15_mata_address_conflict/KernelLaunch/cmake/cpu_lib.cmake @@ -0,0 +1,9 @@ +if(NOT DEFINED ENV{CMAKE_PREFIX_PATH}) + set(CMAKE_PREFIX_PATH ${ASCEND_CANN_PACKAGE_PATH}/tools/tikicpulib/lib/cmake) +endif() +find_package(tikicpulib REQUIRED) + +add_library(ascendc_kernels_${RUN_MODE} SHARED ${KERNEL_FILES}) +target_link_libraries(ascendc_kernels_${RUN_MODE} PUBLIC tikicpulib::${SOC_VERSION}) +target_compile_options(ascendc_kernels_${RUN_MODE} PRIVATE -g -O0 -std=c++17) +install(TARGETS ascendc_kernels_${RUN_MODE} DESTINATION ${CMAKE_INSTALL_LIBDIR}) diff --git a/operator/ascendc/4_best_practices/15_mata_address_conflict/KernelLaunch/cmake/npu_lib.cmake b/operator/ascendc/4_best_practices/15_mata_address_conflict/KernelLaunch/cmake/npu_lib.cmake new file mode 100644 index 000000000..f92b095d1 --- /dev/null +++ b/operator/ascendc/4_best_practices/15_mata_address_conflict/KernelLaunch/cmake/npu_lib.cmake @@ -0,0 +1,11 @@ +if(EXISTS ${ASCEND_CANN_PACKAGE_PATH}/compiler/tikcpp/ascendc_kernel_cmake) + set(ASCENDC_CMAKE_DIR ${ASCEND_CANN_PACKAGE_PATH}/compiler/tikcpp/ascendc_kernel_cmake) +elseif(EXISTS ${ASCEND_CANN_PACKAGE_PATH}/tools/tikcpp/ascendc_kernel_cmake) + set(ASCENDC_CMAKE_DIR ${ASCEND_CANN_PACKAGE_PATH}/tools/tikcpp/ascendc_kernel_cmake) +else() + message(FATAL_ERROR "ascendc_kernel_cmake does not exist ,please check whether the cann package is installed") +endif() +include(${ASCENDC_CMAKE_DIR}/ascendc.cmake) + +# ascendc_library use to add kernel file to generate ascendc library +ascendc_library(ascendc_kernels_${RUN_MODE} SHARED ${KERNEL_FILES}) diff --git a/operator/ascendc/4_best_practices/15_mata_address_conflict/KernelLaunch/data_utils.h b/operator/ascendc/4_best_practices/15_mata_address_conflict/KernelLaunch/data_utils.h new file mode 100644 index 000000000..09d906371 --- /dev/null +++ b/operator/ascendc/4_best_practices/15_mata_address_conflict/KernelLaunch/data_utils.h @@ -0,0 
+1,203 @@ +/** + * @file data_utils.h + * + * Copyright (C) 2024. Huawei Technologies Co., Ltd. All rights reserved. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + */ +#ifndef DATA_UTILS_H +#define DATA_UTILS_H +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include "acl/acl.h" + +typedef enum { + DT_UNDEFINED = -1, + FLOAT = 0, + HALF = 1, + INT8_T = 2, + INT32_T = 3, + UINT8_T = 4, + INT16_T = 6, + UINT16_T = 7, + UINT32_T = 8, + INT64_T = 9, + UINT64_T = 10, + DOUBLE = 11, + BOOL = 12, + STRING = 13, + COMPLEX64 = 16, + COMPLEX128 = 17, + BF16 = 27 +} printDataType; + +#define INFO_LOG(fmt, args...) fprintf(stdout, "[INFO] " fmt "\n", ##args) +#define WARN_LOG(fmt, args...) fprintf(stdout, "[WARN] " fmt "\n", ##args) +#define ERROR_LOG(fmt, args...) fprintf(stdout, "[ERROR] " fmt "\n", ##args) +#define CHECK_ACL(x) \ + do { \ + aclError __ret = x; \ + if (__ret != ACL_ERROR_NONE) { \ + std::cerr << __FILE__ << ":" << __LINE__ << " aclError:" << __ret << std::endl; \ + } \ + } while (0); + +/** + * @brief Read data from file + * @param [in] filePath: file path + * @param [out] fileSize: file size + * @return read result + */ +bool ReadFile(const std::string &filePath, size_t &fileSize, void *buffer, size_t bufferSize) +{ + struct stat sBuf; + int fileStatus = stat(filePath.data(), &sBuf); + if (fileStatus == -1) { + ERROR_LOG("failed to get file"); + return false; + } + if (S_ISREG(sBuf.st_mode) == 0) { + ERROR_LOG("%s is not a file, please enter a file", filePath.c_str()); + return false; + } + + std::ifstream file; + file.open(filePath, std::ios::binary); + if (!file.is_open()) { + ERROR_LOG("Open file failed. 
path = %s", filePath.c_str()); + return false; + } + + std::filebuf *buf = file.rdbuf(); + size_t size = buf->pubseekoff(0, std::ios::end, std::ios::in); + if (size == 0) { + ERROR_LOG("file size is 0"); + file.close(); + return false; + } + if (size > bufferSize) { + ERROR_LOG("file size is larger than buffer size"); + file.close(); + return false; + } + buf->pubseekpos(0, std::ios::in); + buf->sgetn(static_cast(buffer), size); + fileSize = size; + file.close(); + return true; +} + +/** + * @brief Write data to file + * @param [in] filePath: file path + * @param [in] buffer: data to write to file + * @param [in] size: size to write + * @return write result + */ +bool WriteFile(const std::string &filePath, const void *buffer, size_t size) +{ + if (buffer == nullptr) { + ERROR_LOG("Write file failed. buffer is nullptr"); + return false; + } + + int fd = open(filePath.c_str(), O_RDWR | O_CREAT | O_TRUNC, S_IRUSR | S_IWRITE); + if (fd < 0) { + ERROR_LOG("Open file failed. path = %s", filePath.c_str()); + return false; + } + + size_t writeSize = write(fd, buffer, size); + (void)close(fd); + if (writeSize != size) { + ERROR_LOG("Write file Failed."); + return false; + } + + return true; +} + +template void DoPrintData(const T *data, size_t count, size_t elementsPerRow) +{ + assert(elementsPerRow != 0); + for (size_t i = 0; i < count; ++i) { + std::cout << std::setw(10) << data[i]; + if (i % elementsPerRow == elementsPerRow - 1) { + std::cout << std::endl; + } + } +} + +void DoPrintHalfData(const aclFloat16 *data, size_t count, size_t elementsPerRow) +{ + assert(elementsPerRow != 0); + for (size_t i = 0; i < count; ++i) { + std::cout << std::setw(10) << std::setprecision(6) << aclFloat16ToFloat(data[i]); + if (i % elementsPerRow == elementsPerRow - 1) { + std::cout << std::endl; + } + } +} + +void PrintData(const void *data, size_t count, printDataType dataType, size_t elementsPerRow = 16) +{ + if (data == nullptr) { + ERROR_LOG("Print data failed. 
data is nullptr"); + return; + } + + switch (dataType) { + case BOOL: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case INT8_T: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case UINT8_T: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case INT16_T: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case UINT16_T: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case INT32_T: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case UINT32_T: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case INT64_T: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case UINT64_T: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case HALF: + DoPrintHalfData(reinterpret_cast(data), count, elementsPerRow); + break; + case FLOAT: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case DOUBLE: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + default: + ERROR_LOG("Unsupported type: %d", dataType); + } + std::cout << std::endl; +} +#endif // DATA_UTILS_H diff --git a/operator/ascendc/4_best_practices/15_mata_address_conflict/KernelLaunch/main.cpp b/operator/ascendc/4_best_practices/15_mata_address_conflict/KernelLaunch/main.cpp new file mode 100644 index 000000000..14523e2ca --- /dev/null +++ b/operator/ascendc/4_best_practices/15_mata_address_conflict/KernelLaunch/main.cpp @@ -0,0 +1,128 @@ +/** + * @file main.cpp + * + * Copyright (C) 2025. Huawei Technologies Co., Ltd. All rights reserved. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
+ */ +#include "data_utils.h" +#ifndef ASCENDC_CPU_DEBUG +#include "acl/acl.h" +extern void adds_custom_do_v1(uint32_t blockDim, void *stream, uint8_t *x, uint8_t *z); +extern void adds_custom_do_v2(uint32_t blockDim, void *stream, uint8_t *x, uint8_t *z); +using KernelEntry = void(*)(uint32_t, void *, uint8_t *, uint8_t *); +#else +#include "tikicpulib.h" +extern "C" __global__ __aicore__ void adds_custom_v1(GM_ADDR x, GM_ADDR z); +extern "C" __global__ __aicore__ void adds_custom_v2(GM_ADDR x, GM_ADDR z); +using KernelEntry = void (*)(GM_ADDR, GM_ADDR); + +#endif + +struct ArgInfo { + std::string fileName; + size_t length; +}; + +#ifndef ASCENDC_CPU_DEBUG + +void KernelCall(KernelEntry kernelEntry, uint32_t blockDim, void *stream, std::vector &inputsInfo, + std::vector &outputsInfo) +{ + std::vector inputHost(inputsInfo.size()); + std::vector inputDevice(inputsInfo.size()); + std::vector outputHost(outputsInfo.size()); + std::vector outputDevice(outputsInfo.size()); + + for (uint32_t i = 0; i < inputsInfo.size(); i++) { + CHECK_ACL(aclrtMallocHost((void **)(&inputHost[i]), inputsInfo[i].length)); + CHECK_ACL(aclrtMalloc((void **)(&inputDevice[i]), inputsInfo[i].length, ACL_MEM_MALLOC_HUGE_FIRST)); + ReadFile(inputsInfo[i].fileName, inputsInfo[i].length, inputHost[i], inputsInfo[i].length); + CHECK_ACL(aclrtMemcpy(inputDevice[i], inputsInfo[i].length, inputHost[i], inputsInfo[i].length, + ACL_MEMCPY_HOST_TO_DEVICE)); + } + + for (uint32_t i = 0; i < outputsInfo.size(); i++) { + CHECK_ACL(aclrtMallocHost((void **)(&outputHost[i]), outputsInfo[i].length)); + CHECK_ACL(aclrtMalloc((void **)(&outputDevice[i]), outputsInfo[i].length, ACL_MEM_MALLOC_HUGE_FIRST)); + } + + kernelEntry(blockDim, stream, inputDevice[0], outputDevice[0]); + CHECK_ACL(aclrtSynchronizeStream(stream)); + for (uint32_t i = 0; i < outputsInfo.size(); i++) { + CHECK_ACL(aclrtMemcpy(outputHost[i], outputsInfo[i].length, outputDevice[i], outputsInfo[i].length, + ACL_MEMCPY_DEVICE_TO_HOST)); + 
WriteFile(outputsInfo[i].fileName, outputHost[i], outputsInfo[i].length); + CHECK_ACL(aclrtFree(outputDevice[i])); + CHECK_ACL(aclrtFreeHost(outputHost[i])); + } + + for (uint32_t i = 0; i < inputsInfo.size(); i++) { + CHECK_ACL(aclrtFree(inputDevice[i])); + CHECK_ACL(aclrtFreeHost(inputHost[i])); + } +} + +#else + +#define KernelCall(kernelEntry, blockDim, inputsInfo, outputsInfo) \ + { \ + std::vector input(inputsInfo.size()); \ + std::vector output(outputsInfo.size()); \ + \ + for (uint32_t i = 0; i < inputsInfo.size(); i++) { \ + input[i] = (uint8_t *)AscendC::GmAlloc(inputsInfo[i].length); \ + ReadFile(inputsInfo[i].fileName, inputsInfo[i].length, input[i], inputsInfo[i].length); \ + } \ + \ + for (uint32_t i = 0; i < outputsInfo.size(); i++) { \ + output[i] = (uint8_t *)AscendC::GmAlloc(outputsInfo[i].length); \ + } \ + \ + AscendC::SetKernelMode(KernelMode::AIV_MODE); \ + ICPU_RUN_KF(kernelEntry, blockDim, input[0], output[0]); \ + for (uint32_t i = 0; i < inputsInfo.size(); i++) { \ + AscendC::GmFree((void *)input[i]); \ + } \ + \ + for (uint32_t i = 0; i < outputsInfo.size(); i++) { \ + WriteFile(outputsInfo[i].fileName, output[i], outputsInfo[i].length); \ + AscendC::GmFree((void *)output[i]); \ + } \ + } + +#endif + +int32_t main(int32_t argc, char *argv[]) +{ + uint32_t blockDim = 16; + uint32_t M = 8192; + uint32_t N = 128; + size_t inputByteSize = M * N * sizeof(float); + size_t outputByteSize = M * N * sizeof(float); + + std::vector inputsInfo = {{"./input/input_x.bin", inputByteSize}}; + std::vector outputsV1Info = {{"./output/output_z_v1.bin", outputByteSize}}; + std::vector outputsV2Info = {{"./output/output_z_v2.bin", outputByteSize}}; + +#ifndef ASCENDC_CPU_DEBUG + CHECK_ACL(aclInit(nullptr)); + int32_t deviceId = 0; + CHECK_ACL(aclrtSetDevice(deviceId)); + aclrtStream stream = nullptr; + CHECK_ACL(aclrtCreateStream(&stream)); + + KernelCall(adds_custom_do_v1, blockDim, stream, inputsInfo, outputsV1Info); + KernelCall(adds_custom_do_v2, 
blockDim, stream, inputsInfo, outputsV2Info); + + CHECK_ACL(aclrtDestroyStream(stream)); + CHECK_ACL(aclrtResetDevice(deviceId)); + CHECK_ACL(aclFinalize()); +#else + KernelCall(adds_custom_v1, blockDim, inputsInfo, outputsV1Info); + KernelCall(adds_custom_v2, blockDim, inputsInfo, outputsV2Info); +#endif + return 0; +} diff --git a/operator/ascendc/4_best_practices/15_mata_address_conflict/KernelLaunch/run.sh b/operator/ascendc/4_best_practices/15_mata_address_conflict/KernelLaunch/run.sh new file mode 100755 index 000000000..0c5aef144 --- /dev/null +++ b/operator/ascendc/4_best_practices/15_mata_address_conflict/KernelLaunch/run.sh @@ -0,0 +1,113 @@ +#!/bin/bash +CURRENT_DIR=$( + cd $(dirname ${BASH_SOURCE:-$0}) + pwd +) + +BUILD_TYPE="Debug" +INSTALL_PREFIX="${CURRENT_DIR}/out" + +SHORT=r:,v:,i:,b:,p:, +LONG=run-mode:,soc-version:,install-path:,build-type:,install-prefix:, +OPTS=$(getopt -a --options $SHORT --longoptions $LONG -- "$@") +eval set -- "$OPTS" +SOC_VERSION="Ascend310P3" + +while :; do + case "$1" in + -r | --run-mode) + RUN_MODE="$2" + shift 2 + ;; + -v | --soc-version) + SOC_VERSION="$2" + shift 2 + ;; + -i | --install-path) + ASCEND_INSTALL_PATH="$2" + shift 2 + ;; + -b | --build-type) + BUILD_TYPE="$2" + shift 2 + ;; + -p | --install-prefix) + INSTALL_PREFIX="$2" + shift 2 + ;; + --) + shift + break + ;; + *) + echo "[ERROR] Unexpected option: $1" + break + ;; + esac +done + +RUN_MODE_LIST="cpu sim npu" +if [[ " $RUN_MODE_LIST " != *" $RUN_MODE "* ]]; then + echo "ERROR: RUN_MODE error, This sample only support specify cpu, sim or npu!" 
+ exit -1 +fi + +VERSION_LIST="Ascend310P1 Ascend310P3 Ascend910B1 Ascend910B2 Ascend910B3 Ascend910B4" +if [[ " $VERSION_LIST " != *" $SOC_VERSION "* ]]; then + echo "ERROR: SOC_VERSION should be in [$VERSION_LIST]" + exit -1 +fi + +if [ -n "$ASCEND_INSTALL_PATH" ]; then + _ASCEND_INSTALL_PATH=$ASCEND_INSTALL_PATH +elif [ -n "$ASCEND_HOME_PATH" ]; then + _ASCEND_INSTALL_PATH=$ASCEND_HOME_PATH +else + if [ -d "$HOME/Ascend/ascend-toolkit/latest" ]; then + _ASCEND_INSTALL_PATH=$HOME/Ascend/ascend-toolkit/latest + else + _ASCEND_INSTALL_PATH=/usr/local/Ascend/ascend-toolkit/latest + fi +fi + +export ASCEND_TOOLKIT_HOME=${_ASCEND_INSTALL_PATH} +export ASCEND_HOME_PATH=${_ASCEND_INSTALL_PATH} +echo "Current compile soc version is ${SOC_VERSION}" +source ${_ASCEND_INSTALL_PATH}/bin/setenv.bash +if [ "${RUN_MODE}" = "sim" ]; then + # in case of running op in simulator, use stub .so instead + export LD_LIBRARY_PATH=${_ASCEND_INSTALL_PATH}/tools/simulator/${SOC_VERSION}/lib:$LD_LIBRARY_PATH +elif [ "${RUN_MODE}" = "cpu" ]; then + export LD_LIBRARY_PATH=${_ASCEND_INSTALL_PATH}/tools/tikicpulib/lib:${_ASCEND_INSTALL_PATH}/tools/tikicpulib/lib/${SOC_VERSION}:${_ASCEND_INSTALL_PATH}/tools/simulator/${SOC_VERSION}/lib:$LD_LIBRARY_PATH +fi + +set -e +rm -rf build out +mkdir -p build +cmake -B build \ + -DRUN_MODE=${RUN_MODE} \ + -DSOC_VERSION=${SOC_VERSION} \ + -DCMAKE_BUILD_TYPE=${BUILD_TYPE} \ + -DCMAKE_INSTALL_PREFIX=${INSTALL_PREFIX} \ + -DASCEND_CANN_PACKAGE_PATH=${_ASCEND_INSTALL_PATH} +cmake --build build -j +cmake --install build + +rm -f ascendc_kernels_bbit +cp ./out/bin/ascendc_kernels_bbit ./ +rm -rf input output +mkdir -p input output +python3 scripts/gen_data.py +( + export LD_LIBRARY_PATH=$(pwd)/out/lib:$(pwd)/out/lib64:${_ASCEND_INSTALL_PATH}/lib64:$LD_LIBRARY_PATH + if [ "${RUN_MODE}" = "npu" ]; then + msprof op --launch-count=2 --output=./prof ./ascendc_kernels_bbit + elif [ "${RUN_MODE}" = "sim" ]; then + msprof op simulator --launch-count=2 --output=./prof 
./ascendc_kernels_bbit + elif [ "${RUN_MODE}" = "cpu" ]; then + ./ascendc_kernels_bbit + fi +) +md5sum output/*.bin +python3 scripts/verify_result.py output/output_z_v1.bin output/golden.bin +python3 scripts/verify_result.py output/output_z_v2.bin output/golden.bin diff --git a/operator/ascendc/4_best_practices/15_mata_address_conflict/KernelLaunch/scripts/gen_data.py b/operator/ascendc/4_best_practices/15_mata_address_conflict/KernelLaunch/scripts/gen_data.py new file mode 100644 index 000000000..3a5a2a1f4 --- /dev/null +++ b/operator/ascendc/4_best_practices/15_mata_address_conflict/KernelLaunch/scripts/gen_data.py @@ -0,0 +1,23 @@ +#!/usr/bin/python3 +# coding=utf-8 +# +# Copyright (C) 2023-2024. Huawei Technologies Co., Ltd. All rights reserved. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. +# =============================================================================== + +import numpy as np + + +def gen_golden_data_simple(): + input_x = np.random.uniform(1, 100, [8192, 128]).astype(np.float32) + golden = (input_x + 2.0).astype(np.float32) + + input_x.tofile("./input/input_x.bin") + golden.tofile("./output/golden.bin") + + +if __name__ == "__main__": + gen_golden_data_simple() diff --git a/operator/ascendc/4_best_practices/15_mata_address_conflict/KernelLaunch/scripts/verify_result.py b/operator/ascendc/4_best_practices/15_mata_address_conflict/KernelLaunch/scripts/verify_result.py new file mode 100644 index 000000000..6a38a3b2b --- /dev/null +++ b/operator/ascendc/4_best_practices/15_mata_address_conflict/KernelLaunch/scripts/verify_result.py @@ -0,0 +1,53 @@ +#!/usr/bin/python3 +# coding=utf-8 +# +# Copyright (C) 2023-2024. Huawei Technologies Co., Ltd. All rights reserved. 
+# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. +# =============================================================================== + +import sys +import numpy as np + +# for float32 +relative_tol = 1e-4 +absolute_tol = 1e-5 +error_tol = 1e-4 + + +def verify_result(output, golden): + output = np.fromfile(output, dtype=np.float32).reshape(-1) + golden = np.fromfile(golden, dtype=np.float32).reshape(-1) + different_element_results = np.isclose(output, + golden, + rtol=relative_tol, + atol=absolute_tol, + equal_nan=True) + different_element_indexes = np.where(different_element_results == False)[0] + for index in range(len(different_element_indexes)): + real_index = different_element_indexes[index] + golden_data = golden[real_index] + output_data = output[real_index] + print( + "data index: %06d, expected: %-.9f, actual: %-.9f, rdiff: %-.6f" % + (real_index, golden_data, output_data, + abs(output_data - golden_data) / golden_data)) + if index == 100: + break + error_ratio = float(different_element_indexes.size) / golden.size + print("error ratio: %.4f, tolerance: %.4f" % (error_ratio, error_tol)) + return error_ratio <= error_tol + + +if __name__ == '__main__': + try: + res = verify_result(sys.argv[1], sys.argv[2]) + if not res: + raise ValueError("[ERROR] result error") + else: + print("test pass") + except Exception as e: + print(e) + sys.exit(1) -- Gitee From 7702f46578764ba9a8f6aeb08d0077680fd470ec Mon Sep 17 00:00:00 2001 From: zhanghao0689 Date: Wed, 2 Jul 2025 20:24:00 +0800 Subject: [PATCH 02/17] add aclnn launch --- .../AclNNInvocation/README.md | 55 +++ .../AclNNInvocation/inc/common.h | 45 ++ .../AclNNInvocation/inc/op_runner.h | 181 +++++++ .../AclNNInvocation/inc/operator_desc.h | 57 +++ .../AclNNInvocation/input/.keep | 0 .../AclNNInvocation/run.sh | 75 +++ .../AclNNInvocation/scripts/acl.json | 1 + 
.../AclNNInvocation/scripts/gen_data.py | 25 + .../AclNNInvocation/scripts/verify_result.py | 53 ++ .../AclNNInvocation/src/CMakeLists.txt | 65 +++ .../AclNNInvocation/src/common.cpp | 80 +++ .../AclNNInvocation/src/main.cpp | 162 +++++++ .../AclNNInvocation/src/op_runner.cpp | 457 ++++++++++++++++++ .../AclNNInvocation/src/operator_desc.cpp | 51 ++ .../15_mata_address_conflict/AddsCustom.json | 30 ++ .../AddsCustom/op_host/adds_custom.cpp | 65 +++ .../AddsCustom/op_host/adds_custom_tiling.h | 22 + .../AddsCustom/op_kernel/adds_custom.cpp | 20 + .../15_mata_address_conflict/README.md | 153 +++++- .../15_mata_address_conflict/install.sh | 55 +++ 20 files changed, 1651 insertions(+), 1 deletion(-) create mode 100644 operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/README.md create mode 100644 operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/inc/common.h create mode 100644 operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/inc/op_runner.h create mode 100644 operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/inc/operator_desc.h create mode 100644 operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/input/.keep create mode 100755 operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/run.sh create mode 100644 operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/scripts/acl.json create mode 100644 operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/scripts/gen_data.py create mode 100644 operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/scripts/verify_result.py create mode 100644 operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/src/CMakeLists.txt create mode 100644 operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/src/common.cpp create mode 100644 
operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/src/main.cpp create mode 100644 operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/src/op_runner.cpp create mode 100644 operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/src/operator_desc.cpp create mode 100644 operator/ascendc/4_best_practices/15_mata_address_conflict/AddsCustom.json create mode 100644 operator/ascendc/4_best_practices/15_mata_address_conflict/AddsCustom/op_host/adds_custom.cpp create mode 100644 operator/ascendc/4_best_practices/15_mata_address_conflict/AddsCustom/op_host/adds_custom_tiling.h create mode 100644 operator/ascendc/4_best_practices/15_mata_address_conflict/AddsCustom/op_kernel/adds_custom.cpp create mode 100755 operator/ascendc/4_best_practices/15_mata_address_conflict/install.sh diff --git a/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/README.md b/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/README.md new file mode 100644 index 000000000..533675076 --- /dev/null +++ b/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/README.md @@ -0,0 +1,55 @@ +## 目录结构介绍 +``` +├── AclNNInvocation //通过aclnn调用的方式调用AddCustom算子 +│ ├── inc // 头文件目录 +│ │ ├── common.h // 声明公共方法类,用于读取二进制文件 +│ │ ├── op_runner.h // 算子描述声明文件,包含算子输入/输出,算子类型以及输入描述与输出描述 +│ │ └── operator_desc.h // 算子运行相关信息声明文件,包含算子输入/输出个数,输入/输出大小等 +│ ├── input // 存放脚本生成的输入数据目录 +│ ├── output // 存放算子运行输出数据和真值数据的目录 +│ ├── scripts +│ │ ├── acl.json // acl配置文件 +│ │ ├── gen_data.py // 输入数据和真值数据生成脚本 +│ │ └── verify_result.py // 真值对比文件 +│ ├── src +│ │ ├── CMakeLists.txt // 编译规则文件 +│ │ ├── common.cpp // 公共函数,读取二进制文件函数的实现文件 +│ │ ├── main.cpp // 单算子调用应用的入口 +│ │ ├── op_runner.cpp // 单算子调用主体流程实现文件 +│ │ └── operator_desc.cpp // 构造算子的输入与输出描述 +│ └── run.sh // 执行命令脚本 +``` +## 代码实现介绍 +完成自定义算子的开发部署后,可以通过单算子调用的方式来验证单算子的功能。src/main.cpp代码为单算子API执行方式。单算子API执行是基于C语言的API执行算子,无需提供单算子描述文件进行离线模型的转换,直接调用单算子API接口。 + 
+自定义算子编译部署后,会自动生成单算子API,可以直接在应用程序中调用。算子API的形式一般定义为“两段式接口”,形如: + ```cpp + // 获取算子使用的workspace空间大小 + aclnnStatus aclnnAddCustomGetWorkspaceSize(const aclTensor *x, const aclTensor *y, const aclTensor *out, uint64_t *workspaceSize, aclOpExecutor **executor); + // 执行算子 + aclnnStatus aclnnAddCustom(void *workspace, int64_t workspaceSize, aclOpExecutor *executor, aclrtStream stream); + ``` +其中aclnnAddCustomGetWorkspaceSize为第一段接口,主要用于计算本次API调用计算过程中需要多少的workspace内存。获取到本次API计算需要的workspace大小之后,开发者可以按照workspaceSize大小申请Device侧内存,然后调用第二段接口aclnnAddCustom执行计算。具体参考[单算子API调用](https://hiascend.com/document/redirect/CannCommunityAscendCInVorkSingleOp)章节。 + +## 运行样例算子 +### 1. 编译算子工程 +运行此样例前,请参考[编译算子工程](../README.md#operatorcompile)完成前期准备。 +### 2. aclnn调用样例运行 + + - 进入到样例目录 + 以命令行方式下载样例代码,master分支为例。 + ```bash + cd ${git_clone_path}/samples/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation + ``` + - 样例执行 + + 样例执行过程中会自动生成测试数据,然后编译与运行aclnn样例,最后检验运行结果。具体过程可参见run.sh脚本。 + + ```bash + bash run.sh + ``` +## 更新说明 +| 时间 | 更新事项 | +| ---------- | ------------ | +| 2024/05/22 | 新增本readme | +| 2024/11/11 | 样例目录调整 | \ No newline at end of file diff --git a/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/inc/common.h b/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/inc/common.h new file mode 100644 index 000000000..11bb4aeca --- /dev/null +++ b/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/inc/common.h @@ -0,0 +1,45 @@ +/** + * @file common.h + * + * Copyright (C) 2023-2024. Huawei Technologies Co., Ltd. All rights reserved. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ */ +#ifndef COMMON_H +#define COMMON_H + +#include +#include +#include +#include +#include + +#include "acl/acl.h" + +#define SUCCESS 0 +#define FAILED 1 + +#define INFO_LOG(fmt, args...) fprintf(stdout, "[INFO] " fmt "\n", ##args) +#define WARN_LOG(fmt, args...) fprintf(stdout, "[WARN] " fmt "\n", ##args) +#define ERROR_LOG(fmt, args...) fprintf(stderr, "[ERROR] " fmt "\n", ##args) + +/** + * @brief Read data from file + * @param [in] filePath: file path + * @param [out] fileSize: file size + * @return read result + */ +bool ReadFile(const std::string &filePath, size_t fileSize, void *buffer, size_t bufferSize); + +/** + * @brief Write data to file + * @param [in] filePath: file path + * @param [in] buffer: data to write to file + * @param [in] size: size to write + * @return write result + */ +bool WriteFile(const std::string &filePath, const void *buffer, size_t size); + +#endif // COMMON_H diff --git a/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/inc/op_runner.h b/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/inc/op_runner.h new file mode 100644 index 000000000..f1b3a6706 --- /dev/null +++ b/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/inc/op_runner.h @@ -0,0 +1,181 @@ +/** + * @file op_runner.h + * + * Copyright (C) 2023-2024. Huawei Technologies Co., Ltd. All rights reserved. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
+ */ +#ifndef OP_RUNNER_H +#define OP_RUNNER_H + +#include "acl/acl.h" +#include "aclnn/acl_meta.h" +#include "common.h" +#include "operator_desc.h" + +/** + * Op Runner + */ +class OpRunner { +public: + /** + * @brief Constructor + * @param [in] opDesc: op description + */ + explicit OpRunner(OperatorDesc *opDesc); + + /** + * @brief Destructor + */ + virtual ~OpRunner(); + + /** + * @brief Init op runner + */ + bool Init(); + + /** + * @brief Get number of inputs + * @return number of inputs + */ + const size_t NumInputs(); + + /** + * @brief Get number of outputs + * @return number of outputs + */ + const size_t NumOutputs(); + + /** + * @brief Get input size by index + * @param [in] index: input index + * @return size of the input + */ + const size_t GetInputSize(size_t index) const; + const size_t GetInputNumDims(size_t index) const; + aclDataType GetInputDataType(size_t index) const; + aclFormat GetInputFormat(size_t index) const; + + /** + * @brief Get output size by index + * @param [in] index: output index + * @return size of the output + */ + size_t GetOutputSize(size_t index) const; + const size_t GetOutputNumDims(size_t index) const; + aclDataType GetOutputDataType(size_t index) const; + aclFormat GetOutputFormat(size_t index) const; + + /** + * @brief Get input element count by index + * @param i[in] ndex: input index + * @return element count of the input + */ + size_t GetInputElementCount(size_t index) const; + + /** + * @brief Get output element count by index + * @param [in] index: output index + * @return element count of the output + */ + size_t GetOutputElementCount(size_t index) const; + + /** + * @brief Get input shape by index + * @param [in] index: input index + * @return shape of the output + */ + std::vector GetInputShape(size_t index) const; + + /** + * @brief Get output shape by index + * @param [in] index: output index + * @return shape of the output + */ + std::vector GetOutputShape(size_t index) const; + + /** + * @brief Get input 
buffer(host memory) by index + * @tparam T: data type + * @param [in] index: input index + * @return host address of the input + */ + template T *GetInputBuffer(size_t index) + { + if (index >= numInputs_) { + ERROR_LOG("index out of range. index = %zu, numInputs = %zu", index, numInputs_); + return nullptr; + } + return reinterpret_cast(hostInputs_[index]); + } + + /** + * @brief Get output buffer(host memory) by index + * @tparam T: data type + * @param [in] index: output index + * @return host address of the output + */ + template const T *GetOutputBuffer(size_t index) + { + if (index >= numOutputs_) { + ERROR_LOG("index out of range. index = %zu, numOutputs = %zu", index, numOutputs_); + return nullptr; + } + + return reinterpret_cast(hostOutputs_[index]); + } + + /** + * @brief Print readable input by index + * @param [in] index: input index + * @param [in] elementsPerRow: number of elements per row + */ + void PrintInput(size_t index, size_t elementsPerRow = 16); + + /** + * @brief Print readable output by index + * @param [in] index: output index + * @param [in] elementsPerRow: number of elements per row + */ + void PrintOutput(size_t index, size_t elementsPerRow = 16); + + /** + * @brief Compile static op + * @return compile result + */ + bool CompileStaticOp(); + + /** + * @brief Compile dynamic op + * @return compile result + */ + bool CompileDynamicOp(); + + /** + * @brief Run op + * @return run result + */ + bool RunOp(); + +private: + size_t numInputs_; + size_t numOutputs_; + void *workspace_; + + std::vector inputBuffers_; + std::vector outputBuffers_; + + std::vector devInputs_; + std::vector devOutputs_; + + std::vector hostInputs_; + std::vector hostOutputs_; + + std::vector inputTensor_; + std::vector outputTensor_; + OperatorDesc *opDesc_; +}; + +#endif // OP_RUNNER_H diff --git a/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/inc/operator_desc.h 
b/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/inc/operator_desc.h new file mode 100644 index 000000000..6d8ee0905 --- /dev/null +++ b/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/inc/operator_desc.h @@ -0,0 +1,57 @@ +/** + * @file operator_desc.h + * + * Copyright (C) 2023-2024. Huawei Technologies Co., Ltd. All rights reserved. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + */ +#ifndef OPERATOR_DESC_H +#define OPERATOR_DESC_H + +#include +#include + +#include "acl/acl.h" + +/** + * Op description + */ +struct OperatorDesc { + /** + * Constructor + */ + explicit OperatorDesc(); + + /** + * Destructor + */ + virtual ~OperatorDesc(); + + /** + * Add an input tensor description + * @param [in] dataType: data type + * @param [in] numDims: number of dims + * @param [in] dims: dims + * @param [in] format: format + * @return OperatorDesc + */ + OperatorDesc &AddInputTensorDesc(aclDataType dataType, int numDims, const int64_t *dims, aclFormat format); + + /** + * Add an output tensor description + * @param [in] dataType: data type + * @param [in] numDims: number of dims + * @param [in] dims: dims + * @param [in] format: format + * @return OperatorDesc + */ + OperatorDesc &AddOutputTensorDesc(aclDataType dataType, int numDims, const int64_t *dims, aclFormat format); + + std::string opType; + std::vector inputDesc; + std::vector outputDesc; +}; + +#endif // OPERATOR_DESC_H diff --git a/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/input/.keep b/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/input/.keep new file mode 100644 index 000000000..e69de29bb diff --git a/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/run.sh 
b/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/run.sh new file mode 100755 index 000000000..a652bf478 --- /dev/null +++ b/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/run.sh @@ -0,0 +1,75 @@ +#!/bin/bash +CURRENT_DIR=$( + cd $(dirname ${BASH_SOURCE:-$0}) + pwd +) + +if [ -n "$ASCEND_INSTALL_PATH" ]; then + _ASCEND_INSTALL_PATH=$ASCEND_INSTALL_PATH +elif [ -n "$ASCEND_HOME_PATH" ]; then + _ASCEND_INSTALL_PATH=$ASCEND_HOME_PATH +else + if [ -d "$HOME/Ascend/ascend-toolkit/latest" ]; then + _ASCEND_INSTALL_PATH=$HOME/Ascend/ascend-toolkit/latest + else + _ASCEND_INSTALL_PATH=/usr/local/Ascend/ascend-toolkit/latest + fi +fi +source $_ASCEND_INSTALL_PATH/bin/setenv.bash +export DDK_PATH=$_ASCEND_INSTALL_PATH +export NPU_HOST_LIB=$_ASCEND_INSTALL_PATH/$(arch)-$(uname -s | tr '[:upper:]' '[:lower:]')/devlib + +function main { + # 1. 清除遗留生成文件和日志文件 + rm -rf $HOME/ascend/log/* + rm ./input/*.bin + rm ./output/*.bin + + # 2. 生成输入数据和真值数据 + cd $CURRENT_DIR + python3 scripts/gen_data.py + if [ $? -ne 0 ]; then + echo "ERROR: generate input data failed!" + return 1 + fi + echo "INFO: generate input data success!" + + # 3. 编译可执行文件 + cd $CURRENT_DIR + rm -rf build + mkdir -p build + cd build + cmake ../src -DCMAKE_SKIP_RPATH=TRUE + if [ $? -ne 0 ]; then + echo "ERROR: cmake failed!" + return 1 + fi + echo "INFO: cmake success!" + make + if [ $? -ne 0 ]; then + echo "ERROR: make failed!" + return 1 + fi + echo "INFO: make success!" + + # 4. 运行可执行文件 + export LD_LIBRARY_PATH=$_ASCEND_INSTALL_PATH/opp/vendors/customize/op_api/lib:$LD_LIBRARY_PATH + cd $CURRENT_DIR/output + echo "INFO: execute op!" + ./execute_add_op + if [ $? -ne 0 ]; then + echo "ERROR: acl executable run failed! please check your project!" + return 1 + fi + echo "INFO: acl executable run success!" + + # 5. 精度比对 + cd $CURRENT_DIR + python3 scripts/verify_result.py output/output_z.bin output/golden.bin + if [ $? 
-ne 0 ]; then + echo "ERROR: verify result failed!" + return 1 + fi +} + +main diff --git a/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/scripts/acl.json b/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/scripts/acl.json new file mode 100644 index 000000000..9e26dfeeb --- /dev/null +++ b/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/scripts/acl.json @@ -0,0 +1 @@ +{} \ No newline at end of file diff --git a/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/scripts/gen_data.py b/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/scripts/gen_data.py new file mode 100644 index 000000000..ea8ce828a --- /dev/null +++ b/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/scripts/gen_data.py @@ -0,0 +1,25 @@ +#!/usr/bin/python3 +# coding=utf-8 +# +# Copyright (C) 2023-2024. Huawei Technologies Co., Ltd. All rights reserved. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
+# =============================================================================== + +import numpy as np + + +def gen_golden_data_simple(): + input_x = np.random.uniform(1, 100, [8, 2048]).astype(np.float16) + input_y = np.random.uniform(1, 100, [8, 2048]).astype(np.float16) + golden = (input_x + input_y).astype(np.float16) + + input_x.tofile("./input/input_x.bin") + input_y.tofile("./input/input_y.bin") + golden.tofile("./output/golden.bin") + + +if __name__ == "__main__": + gen_golden_data_simple() diff --git a/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/scripts/verify_result.py b/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/scripts/verify_result.py new file mode 100644 index 000000000..2dd46f803 --- /dev/null +++ b/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/scripts/verify_result.py @@ -0,0 +1,53 @@ +#!/usr/bin/python3 +# coding=utf-8 +# +# Copyright (C) 2023-2024. Huawei Technologies Co., Ltd. All rights reserved. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
+# =============================================================================== + +import sys +import numpy as np + +# for float16 +relative_tol = 1e-3 +absolute_tol = 1e-5 +error_tol = 1e-3 + + +def verify_result(output, golden): + output = np.fromfile(output, dtype=np.float16).reshape(-1) + golden = np.fromfile(golden, dtype=np.float16).reshape(-1) + different_element_results = np.isclose(output, + golden, + rtol=relative_tol, + atol=absolute_tol, + equal_nan=True) + different_element_indexes = np.where(different_element_results == False)[0] + for index in range(len(different_element_indexes)): + real_index = different_element_indexes[index] + golden_data = golden[real_index] + output_data = output[real_index] + print( + "data index: %06d, expected: %-.9f, actual: %-.9f, rdiff: %-.6f" % + (real_index, golden_data, output_data, + abs(output_data - golden_data) / golden_data)) + if index == 100: + break + error_ratio = float(different_element_indexes.size) / golden.size + print("error ratio: %.4f, tolerance: %.4f" % (error_ratio, error_tol)) + return error_ratio <= error_tol + + +if __name__ == '__main__': + try: + res = verify_result(sys.argv[1], sys.argv[2]) + if not res: + raise ValueError("[ERROR] result error") + else: + print("test pass") + except Exception as e: + print(e) + sys.exit(1) diff --git a/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/src/CMakeLists.txt b/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/src/CMakeLists.txt new file mode 100644 index 000000000..8e9e45375 --- /dev/null +++ b/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/src/CMakeLists.txt @@ -0,0 +1,65 @@ +# Copyright (c) Huawei Technologies Co., Ltd. 2020. All rights reserved. 
+ +# CMake lowest version requirement +cmake_minimum_required(VERSION 3.5.1) + +# project information +project(acl_execute_add) + +# Compile options +add_compile_options(-std=c++11) + +set(CMAKE_RUNTIME_OUTPUT_DIRECTORY "../output") +set(CMAKE_LIBRARY_OUTPUT_DIRECTORY "../output") + +set(INC_PATH $ENV{DDK_PATH}) + +if (NOT DEFINED ENV{DDK_PATH}) + set(INC_PATH "/usr/local/Ascend/ascend-toolkit/latest") + message(STATUS "set default INC_PATH: ${INC_PATH}") +else () + message(STATUS "env INC_PATH: ${INC_PATH}") +endif() + +set(CUST_PKG_PATH "${INC_PATH}/opp/vendors/customize/op_api") + +set(LIB_PATH $ENV{NPU_HOST_LIB}) + +# Dynamic libraries in the stub directory can only be used for compilation +if (NOT DEFINED ENV{NPU_HOST_LIB}) + string(TOLOWER "${CMAKE_SYSTEM_NAME}" SYSTEM_NAME_LOWER) + set(LIB_PATH "/usr/local/Ascend/ascend-toolkit/latest/${CMAKE_SYSTEM_PROCESSOR}-${SYSTEM_NAME_LOWER}/devlib") + message(STATUS "set default LIB_PATH: ${LIB_PATH}") +else () + message(STATUS "env LIB_PATH: ${LIB_PATH}") +endif() + +# Header path +include_directories( + ../inc + ${INC_PATH}/include + ${CUST_PKG_PATH}/include +) + +# add host lib path +link_directories( + ${LIB_PATH} + ${CUST_PKG_PATH}/lib +) + +add_executable(execute_add_op + operator_desc.cpp + op_runner.cpp + main.cpp + common.cpp +) + +target_link_libraries(execute_add_op + ascendcl + cust_opapi + acl_op_compiler + nnopbase + stdc++ +) + +install(TARGETS execute_add_op DESTINATION ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}) diff --git a/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/src/common.cpp b/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/src/common.cpp new file mode 100644 index 000000000..992759c95 --- /dev/null +++ b/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/src/common.cpp @@ -0,0 +1,80 @@ +/** + * @file common.cpp + * + * Copyright (C) 2023-2024. Huawei Technologies Co., Ltd. All rights reserved. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + */ +#include "common.h" + +#include +#include +#include + +#include + +extern bool g_isDevice; + +bool ReadFile(const std::string &filePath, size_t fileSize, void *buffer, size_t bufferSize) +{ + struct stat sBuf; + int fileStatus = stat(filePath.data(), &sBuf); + if (fileStatus == -1) { + ERROR_LOG("failed to get file %s", filePath.c_str()); + return false; + } + if (S_ISREG(sBuf.st_mode) == 0) { + ERROR_LOG("%s is not a file, please enter a file", filePath.c_str()); + return false; + } + + std::ifstream file; + file.open(filePath, std::ios::binary); + if (!file.is_open()) { + ERROR_LOG("Open file failed. path = %s", filePath.c_str()); + return false; + } + + std::filebuf *buf = file.rdbuf(); + size_t size = buf->pubseekoff(0, std::ios::end, std::ios::in); + if (size == 0) { + ERROR_LOG("file size is 0"); + file.close(); + return false; + } + if (size > bufferSize) { + ERROR_LOG("file size is larger than buffer size"); + file.close(); + return false; + } + buf->pubseekpos(0, std::ios::in); + buf->sgetn(static_cast(buffer), size); + fileSize = size; + file.close(); + return true; +} + +bool WriteFile(const std::string &filePath, const void *buffer, size_t size) +{ + if (buffer == nullptr) { + ERROR_LOG("Write file failed. buffer is nullptr"); + return false; + } + + int fd = open(filePath.c_str(), O_RDWR | O_CREAT | O_TRUNC, S_IRUSR | S_IWRITE); + if (fd < 0) { + ERROR_LOG("Open file failed. 
path = %s", filePath.c_str()); + return false; + } + + size_t writeSize = write(fd, buffer, size); + (void)close(fd); + if (writeSize != size) { + ERROR_LOG("Write file Failed."); + return false; + } + + return true; +} diff --git a/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/src/main.cpp b/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/src/main.cpp new file mode 100644 index 000000000..7f8e38a37 --- /dev/null +++ b/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/src/main.cpp @@ -0,0 +1,162 @@ +/** + * @file main.cpp + * + * Copyright (C) 2023-2024. Huawei Technologies Co., Ltd. All rights reserved. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + */ +#include +#include +#include + +#include +#include + +#include "acl/acl.h" +#include "common.h" +#include "op_runner.h" + +bool g_isDevice = false; +int deviceId = 0; + +OperatorDesc CreateOpDesc() +{ + // define operator + std::vector shape{8, 2048}; + aclDataType dataType = ACL_FLOAT16; + aclFormat format = ACL_FORMAT_ND; + OperatorDesc opDesc; + opDesc.AddInputTensorDesc(dataType, shape.size(), shape.data(), format); + opDesc.AddInputTensorDesc(dataType, shape.size(), shape.data(), format); + opDesc.AddOutputTensorDesc(dataType, shape.size(), shape.data(), format); + return opDesc; +} + +bool SetInputData(OpRunner &runner) +{ + size_t fileSize = 0; + ReadFile("../input/input_x.bin", fileSize, runner.GetInputBuffer(0), runner.GetInputSize(0)); + ReadFile("../input/input_y.bin", fileSize, runner.GetInputBuffer(1), runner.GetInputSize(1)); + INFO_LOG("Set input success"); + return true; +} + +bool ProcessOutputData(OpRunner &runner) +{ + WriteFile("../output/output_z.bin", runner.GetOutputBuffer(0), runner.GetOutputSize(0)); + INFO_LOG("Write output success"); + return true; +} + +void 
DestroyResource() +{ + bool flag = false; + if (aclrtResetDevice(deviceId) != ACL_SUCCESS) { + ERROR_LOG("Reset device %d failed", deviceId); + flag = true; + } + INFO_LOG("Reset Device success"); + if (aclFinalize() != ACL_SUCCESS) { + ERROR_LOG("Finalize acl failed"); + flag = true; + } + if (flag) { + ERROR_LOG("Destroy resource failed"); + } else { + INFO_LOG("Destroy resource success"); + } +} + +bool InitResource() +{ + std::string output = "../output"; + if (access(output.c_str(), 0) == -1) { + int ret = mkdir(output.c_str(), 0700); + if (ret == 0) { + INFO_LOG("Make output directory successfully"); + } else { + ERROR_LOG("Make output directory fail"); + return false; + } + } + + // acl.json is dump or profiling config file + if (aclInit("../scripts/acl.json") != ACL_SUCCESS) { + ERROR_LOG("acl init failed"); + return false; + } + + if (aclrtSetDevice(deviceId) != ACL_SUCCESS) { + ERROR_LOG("Set device failed. deviceId is %d", deviceId); + (void)aclFinalize(); + return false; + } + INFO_LOG("Set device[%d] success", deviceId); + + // runMode is ACL_HOST which represents app is running in host + // runMode is ACL_DEVICE which represents app is running in device + aclrtRunMode runMode; + if (aclrtGetRunMode(&runMode) != ACL_SUCCESS) { + ERROR_LOG("Get run mode failed"); + DestroyResource(); + return false; + } + g_isDevice = (runMode == ACL_DEVICE); + INFO_LOG("Get RunMode[%d] success", runMode); + + return true; +} + +bool RunOp() +{ + // create op desc + OperatorDesc opDesc = CreateOpDesc(); + + // create Runner + OpRunner opRunner(&opDesc); + if (!opRunner.Init()) { + ERROR_LOG("Init OpRunner failed"); + return false; + } + + // Load inputs + if (!SetInputData(opRunner)) { + ERROR_LOG("Set input data failed"); + return false; + } + + // Run op + if (!opRunner.RunOp()) { + ERROR_LOG("Run op failed"); + return false; + } + + // process output data + if (!ProcessOutputData(opRunner)) { + ERROR_LOG("Process output data failed"); + return false; + } + + 
INFO_LOG("Run op success"); + return true; +} + +int main(int argc, char **argv) +{ + if (!InitResource()) { + ERROR_LOG("Init resource failed"); + return FAILED; + } + INFO_LOG("Init resource success"); + + if (!RunOp()) { + DestroyResource(); + return FAILED; + } + + DestroyResource(); + + return SUCCESS; +} diff --git a/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/src/op_runner.cpp b/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/src/op_runner.cpp new file mode 100644 index 000000000..45aecd300 --- /dev/null +++ b/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/src/op_runner.cpp @@ -0,0 +1,457 @@ +/** + * @file op_runner.cpp + * + * Copyright (C) 2023-2024. Huawei Technologies Co., Ltd. All rights reserved. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + */ +#include "op_runner.h" + +#include +#include + +#include "acl/acl_op_compiler.h" +#include "aclnn_add_custom.h" +#include "common.h" + +using namespace std; + +extern bool g_isDevice; + +OpRunner::OpRunner(OperatorDesc *opDesc) : opDesc_(opDesc) +{ + numInputs_ = opDesc->inputDesc.size(); + numOutputs_ = opDesc->outputDesc.size(); + workspace_ = nullptr; +} + +OpRunner::~OpRunner() +{ + if (workspace_ != nullptr) { + (void)aclrtFree(workspace_); + } + for (size_t i = 0; i < numInputs_; ++i) { + (void)aclDestroyTensor(inputTensor_[i]); + (void)aclDestroyDataBuffer(inputBuffers_[i]); + (void)aclrtFree(devInputs_[i]); + if (g_isDevice) { + (void)aclrtFree(hostInputs_[i]); + } else { + (void)aclrtFreeHost(hostInputs_[i]); + } + } + + for (size_t i = 0; i < numOutputs_; ++i) { + (void)aclDestroyTensor(outputTensor_[i]); + (void)aclDestroyDataBuffer(outputBuffers_[i]); + (void)aclrtFree(devOutputs_[i]); + if (g_isDevice) { + (void)aclrtFree(hostOutputs_[i]); + } else { + 
(void)aclrtFreeHost(hostOutputs_[i]); + } + } +} + +bool OpRunner::Init() +{ + for (size_t i = 0; i < numInputs_; ++i) { + auto size = GetInputSize(i); + void *devMem = nullptr; + if (aclrtMalloc(&devMem, size, ACL_MEM_MALLOC_HUGE_FIRST) != ACL_SUCCESS) { + ERROR_LOG("Malloc device memory for input[%zu] failed", i); + return false; + } + devInputs_.emplace_back(devMem); + inputBuffers_.emplace_back(aclCreateDataBuffer(devMem, size)); + + void *hostInput = nullptr; + if (g_isDevice) { + if (aclrtMalloc(&hostInput, size, ACL_MEM_MALLOC_HUGE_FIRST) != ACL_SUCCESS) { + ERROR_LOG("Malloc device memory for input[%zu] failed", i); + return false; + } + } else { + if (aclrtMallocHost(&hostInput, size) != ACL_SUCCESS) { + ERROR_LOG("Malloc device memory for input[%zu] failed", i); + return false; + } + } + if (hostInput == nullptr) { + ERROR_LOG("Malloc memory for input[%zu] failed", i); + return false; + } + hostInputs_.emplace_back(hostInput); + + aclTensor *inputTensor = + aclCreateTensor(GetInputShape(i).data(), GetInputNumDims(i), GetInputDataType(i), nullptr, 0, + GetInputFormat(i), GetInputShape(i).data(), GetInputNumDims(i), devInputs_[i]); + if (inputTensor == nullptr) { + ERROR_LOG("Create Tensor for input[%zu] failed", i); + return false; + } + inputTensor_.emplace_back(inputTensor); + } + + for (size_t i = 0; i < numOutputs_; ++i) { + auto size = GetOutputSize(i); + void *devMem = nullptr; + if (aclrtMalloc(&devMem, size, ACL_MEM_MALLOC_HUGE_FIRST) != ACL_SUCCESS) { + ERROR_LOG("Malloc device memory for output[%zu] failed", i); + return false; + } + devOutputs_.emplace_back(devMem); + outputBuffers_.emplace_back(aclCreateDataBuffer(devMem, size)); + + void *hostOutput = nullptr; + if (g_isDevice) { + if (aclrtMalloc(&hostOutput, size, ACL_MEM_MALLOC_HUGE_FIRST) != ACL_SUCCESS) { + ERROR_LOG("Malloc device memory for output[%zu] failed", i); + return false; + } + } else { + if (aclrtMallocHost(&hostOutput, size) != ACL_SUCCESS) { + ERROR_LOG("Malloc device memory 
for output[%zu] failed", i); + return false; + } + } + if (hostOutput == nullptr) { + ERROR_LOG("Malloc host memory for output[%zu] failed", i); + return false; + } + hostOutputs_.emplace_back(hostOutput); + + aclTensor *outputTensor = + aclCreateTensor(GetOutputShape(i).data(), GetOutputNumDims(i), GetOutputDataType(i), nullptr, 0, + GetOutputFormat(i), GetOutputShape(i).data(), GetOutputNumDims(i), devOutputs_[i]); + if (outputTensor == nullptr) { + ERROR_LOG("Create Tensor for output[%zu] failed", i); + return false; + } + outputTensor_.emplace_back(outputTensor); + } + + return true; +} + +const size_t OpRunner::NumInputs() +{ + return numInputs_; +} + +const size_t OpRunner::NumOutputs() +{ + return numOutputs_; +} + +const size_t OpRunner::GetInputSize(size_t index) const +{ + if (index >= numInputs_) { + ERROR_LOG("index out of range. index = %zu, numInputs = %zu", index, numInputs_); + return 0; + } + + return aclGetTensorDescSize(opDesc_->inputDesc[index]); +} + +const size_t OpRunner::GetInputNumDims(size_t index) const +{ + if (index >= numInputs_) { + ERROR_LOG("index out of range. index = %zu, numInputs = %zu", index, numInputs_); + return 0; + } + + return aclGetTensorDescNumDims(opDesc_->inputDesc[index]); +} + +aclDataType OpRunner::GetInputDataType(size_t index) const +{ + if (index >= numInputs_) { + ERROR_LOG("index out of range. index = %zu, numInputs = %zu", index, numInputs_); + return ACL_DT_UNDEFINED; + } + + return aclGetTensorDescType(opDesc_->inputDesc[index]); +} + +aclFormat OpRunner::GetInputFormat(size_t index) const +{ + if (index >= numInputs_) { + ERROR_LOG("index out of range. index = %zu, numInputs = %zu", index, numInputs_); + return ACL_FORMAT_UNDEFINED; + } + + return aclGetTensorDescFormat(opDesc_->inputDesc[index]); +} + +std::vector OpRunner::GetInputShape(size_t index) const +{ + std::vector ret; + if (index >= numInputs_) { + ERROR_LOG("index out of range. 
index = %zu, numInputs = %zu", index, numInputs_); + return ret; + } + + auto desc = opDesc_->inputDesc[index]; + for (size_t i = 0; i < aclGetTensorDescNumDims(desc); ++i) { + int64_t dimSize; + if (aclGetTensorDescDimV2(desc, i, &dimSize) != ACL_SUCCESS) { + ERROR_LOG("get dims from tensor desc failed. dims index = %zu", i); + ret.clear(); + return ret; + } + ret.emplace_back(dimSize); + } + + return ret; +} + +size_t OpRunner::GetOutputSize(size_t index) const +{ + if (index >= numOutputs_) { + ERROR_LOG("index out of range. index = %zu, numOutputs = %zu", index, numOutputs_); + return 0; + } + + return aclGetTensorDescSize(opDesc_->outputDesc[index]); +} + +const size_t OpRunner::GetOutputNumDims(size_t index) const +{ + if (index >= numOutputs_) { + ERROR_LOG("index out of range. index = %zu, numOutputs = %zu", index, numOutputs_); + return 0; + } + + return aclGetTensorDescNumDims(opDesc_->outputDesc[index]); +} + +aclDataType OpRunner::GetOutputDataType(size_t index) const +{ + if (index >= numOutputs_) { + ERROR_LOG("index out of range. index = %zu, numOutputs = %zu", index, numOutputs_); + return ACL_DT_UNDEFINED; + } + + return aclGetTensorDescType(opDesc_->outputDesc[index]); +} + +aclFormat OpRunner::GetOutputFormat(size_t index) const +{ + if (index >= numOutputs_) { + ERROR_LOG("index out of range. index = %zu, numOutputs = %zu", index, numOutputs_); + return ACL_FORMAT_UNDEFINED; + } + + return aclGetTensorDescFormat(opDesc_->outputDesc[index]); +} + +std::vector OpRunner::GetOutputShape(size_t index) const +{ + std::vector ret; + if (index >= numOutputs_) { + ERROR_LOG("index out of range. index = %zu, numOutputs = %zu", index, numOutputs_); + return ret; + } + + auto desc = opDesc_->outputDesc[index]; + for (size_t i = 0; i < aclGetTensorDescNumDims(desc); ++i) { + int64_t dimSize; + if (aclGetTensorDescDimV2(desc, i, &dimSize) != ACL_SUCCESS) { + ERROR_LOG("get dims from tensor desc failed. 
dims index = %zu", i); + ret.clear(); + return ret; + } + ret.emplace_back(dimSize); + } + return ret; +} + +size_t OpRunner::GetInputElementCount(size_t index) const +{ + if (index >= opDesc_->inputDesc.size()) { + ERROR_LOG("index out of range. index = %zu, numInputs = %zu", index, numInputs_); + return 0; + } + + return aclGetTensorDescElementCount(opDesc_->inputDesc[index]); +} + +size_t OpRunner::GetOutputElementCount(size_t index) const +{ + if (index >= opDesc_->outputDesc.size()) { + ERROR_LOG("index out of range. index = %zu, numOutputs = %zu", index, numOutputs_); + return 0; + } + + return aclGetTensorDescElementCount(opDesc_->outputDesc[index]); +} + +bool OpRunner::RunOp() +{ + for (size_t i = 0; i < numInputs_; ++i) { + auto size = GetInputSize(i); + aclrtMemcpyKind kind = ACL_MEMCPY_HOST_TO_DEVICE; + if (g_isDevice) { + kind = ACL_MEMCPY_DEVICE_TO_DEVICE; + } + if (aclrtMemcpy(devInputs_[i], size, hostInputs_[i], size, kind) != ACL_SUCCESS) { + ERROR_LOG("Copy input[%zu] failed", i); + return false; + } + INFO_LOG("Copy input[%zu] success", i); + } + + aclrtStream stream = nullptr; + if (aclrtCreateStream(&stream) != ACL_SUCCESS) { + ERROR_LOG("Create stream failed"); + return false; + } + INFO_LOG("Create stream success"); + + size_t workspaceSize = 0; + aclOpExecutor *handle = nullptr; + auto ret = + aclnnAddCustomGetWorkspaceSize(inputTensor_[0], inputTensor_[1], outputTensor_[0], &workspaceSize, &handle); + if (ret != ACL_SUCCESS) { + (void)aclrtDestroyStream(stream); + ERROR_LOG("Get Operator Workspace failed. 
error code is %d", static_cast(ret)); + return false; + } + INFO_LOG("Execute aclnnAddCustomGetWorkspaceSize success, workspace size %lu", workspaceSize); + + if (workspaceSize != 0) { + if (aclrtMalloc(&workspace_, workspaceSize, ACL_MEM_MALLOC_HUGE_FIRST) != ACL_SUCCESS) { + ERROR_LOG("Malloc device memory failed"); + } + } + + ret = aclnnAddCustom(workspace_, workspaceSize, handle, stream); + if (ret != ACL_SUCCESS) { + (void)aclrtDestroyStream(stream); + ERROR_LOG("Execute Operator failed. error code is %d", static_cast(ret)); + return false; + } + INFO_LOG("Execute aclnnAddCustom success"); + + // The unit of 5000 is ms. + ret = aclrtSynchronizeStreamWithTimeout(stream, 5000); + if (ret != SUCCESS) { + ERROR_LOG("Synchronize stream failed. error code is %d", static_cast(ret)); + (void)aclrtDestroyStream(stream); + return false; + } + INFO_LOG("Synchronize stream success"); + + for (size_t i = 0; i < numOutputs_; ++i) { + auto size = GetOutputSize(i); + aclrtMemcpyKind kind = ACL_MEMCPY_DEVICE_TO_HOST; + if (g_isDevice) { + kind = ACL_MEMCPY_DEVICE_TO_DEVICE; + } + if (aclrtMemcpy(hostOutputs_[i], size, devOutputs_[i], size, kind) != ACL_SUCCESS) { + INFO_LOG("Copy output[%zu] success", i); + (void)aclrtDestroyStream(stream); + return false; + } + INFO_LOG("Copy output[%zu] success", i); + } + + (void)aclrtDestroyStream(stream); + return true; +} + +template void DoPrintData(const T *data, size_t count, size_t elementsPerRow) +{ + assert(elementsPerRow != 0); + for (size_t i = 0; i < count; ++i) { + std::cout << std::setw(10) << data[i]; + if (i % elementsPerRow == elementsPerRow - 1) { + std::cout << std::endl; + } + } +} + +void DoPrintFp16Data(const aclFloat16 *data, size_t count, size_t elementsPerRow) +{ + assert(elementsPerRow != 0); + for (size_t i = 0; i < count; ++i) { + std::cout << std::setw(10) << std::setprecision(4) << aclFloat16ToFloat(data[i]); + if (i % elementsPerRow == elementsPerRow - 1) { + std::cout << std::endl; + } + } +} + +void 
PrintData(const void *data, size_t count, aclDataType dataType, size_t elementsPerRow) +{ + if (data == nullptr) { + ERROR_LOG("Print data failed. data is nullptr"); + return; + } + + switch (dataType) { + case ACL_BOOL: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case ACL_INT8: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case ACL_UINT8: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case ACL_INT16: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case ACL_UINT16: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case ACL_INT32: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case ACL_UINT32: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case ACL_INT64: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case ACL_UINT64: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case ACL_FLOAT16: + DoPrintFp16Data(reinterpret_cast(data), count, elementsPerRow); + break; + case ACL_FLOAT: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case ACL_DOUBLE: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + default: + ERROR_LOG("Unsupported type: %d", dataType); + } +} + +void OpRunner::PrintInput(size_t index, size_t numElementsPerRow) +{ + if (index >= numInputs_) { + ERROR_LOG("index out of range. index = %zu, numOutputs = %zu", index, numInputs_); + return; + } + + auto desc = opDesc_->inputDesc[index]; + PrintData(hostInputs_[index], GetInputElementCount(index), aclGetTensorDescType(desc), numElementsPerRow); +} + +void OpRunner::PrintOutput(size_t index, size_t numElementsPerRow) +{ + if (index >= numOutputs_) { + ERROR_LOG("index out of range. 
index = %zu, numOutputs = %zu", index, numOutputs_); + return; + } + + auto desc = opDesc_->outputDesc[index]; + PrintData(hostOutputs_[index], GetOutputElementCount(index), aclGetTensorDescType(desc), numElementsPerRow); +} diff --git a/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/src/operator_desc.cpp b/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/src/operator_desc.cpp new file mode 100644 index 000000000..da04cf6c9 --- /dev/null +++ b/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/src/operator_desc.cpp @@ -0,0 +1,51 @@ +/** + * @file operator_desc.cpp + * + * Copyright (C) 2023-2024. Huawei Technologies Co., Ltd. All rights reserved. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + */ +#include "operator_desc.h" + +#include "common.h" + +using namespace std; + +OperatorDesc::OperatorDesc() {} + +OperatorDesc::~OperatorDesc() +{ + for (auto *desc : inputDesc) { + aclDestroyTensorDesc(desc); + } + + for (auto *desc : outputDesc) { + aclDestroyTensorDesc(desc); + } +} + +OperatorDesc &OperatorDesc::AddInputTensorDesc(aclDataType dataType, int numDims, const int64_t *dims, aclFormat format) +{ + aclTensorDesc *desc = aclCreateTensorDesc(dataType, numDims, dims, format); + if (desc == nullptr) { + ERROR_LOG("create tensor failed"); + return *this; + } + inputDesc.emplace_back(desc); + return *this; +} + +OperatorDesc &OperatorDesc::AddOutputTensorDesc(aclDataType dataType, int numDims, const int64_t *dims, + aclFormat format) +{ + aclTensorDesc *desc = aclCreateTensorDesc(dataType, numDims, dims, format); + if (desc == nullptr) { + ERROR_LOG("create tensor failed"); + return *this; + } + + outputDesc.emplace_back(desc); + return *this; +} diff --git a/operator/ascendc/4_best_practices/15_mata_address_conflict/AddsCustom.json 
b/operator/ascendc/4_best_practices/15_mata_address_conflict/AddsCustom.json new file mode 100644 index 000000000..23c1201a3 --- /dev/null +++ b/operator/ascendc/4_best_practices/15_mata_address_conflict/AddsCustom.json @@ -0,0 +1,30 @@ +[ + { + "op": "AddCustom", + "language": "cpp", + "input_desc": [ + { + "name": "x", + "param_type": "required", + "format": [ + "ND" + ], + "type": [ + "float" + ] + } + ], + "output_desc": [ + { + "name": "z", + "param_type": "required", + "format": [ + "ND" + ], + "type": [ + "float" + ] + } + ] + } +] \ No newline at end of file diff --git a/operator/ascendc/4_best_practices/15_mata_address_conflict/AddsCustom/op_host/adds_custom.cpp b/operator/ascendc/4_best_practices/15_mata_address_conflict/AddsCustom/op_host/adds_custom.cpp new file mode 100644 index 000000000..ca7b58aa7 --- /dev/null +++ b/operator/ascendc/4_best_practices/15_mata_address_conflict/AddsCustom/op_host/adds_custom.cpp @@ -0,0 +1,65 @@ +/** + * @file adds_custom.cpp + * + * Copyright (C) 2023-2024. Huawei Technologies Co., Ltd. All rights reserved. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
+ */ +#include "adds_custom_tiling.h" +#include "register/op_def_registry.h" + +namespace optiling { +const uint32_t BLOCK_DIM = 8; +const uint32_t TILE_NUM = 8; +static ge::graphStatus TilingFunc(gert::TilingContext *context) +{ + TilingData tiling; + uint32_t totalLength = context->GetInputShape(0)->GetOriginShape().GetShapeSize(); + context->SetBlockDim(BLOCK_DIM); + // tiling.set_totalLength(totalLength); + // tiling.set_tileNum(TILE_NUM); + // tiling.SaveToBuffer(context->GetRawTilingData()->GetData(), context->GetRawTilingData()->GetCapacity()); + // context->GetRawTilingData()->SetDataSize(tiling.GetDataSize()); + // size_t *currentWorkspace = context->GetWorkspaceSizes(1); + // currentWorkspace[0] = 0; + auto test = context->GetAttrs(); + printf("xxxxxx attr num is\n"); + return ge::GRAPH_SUCCESS; +} +} // namespace optiling + +namespace ge { +static graphStatus InferShape(gert::InferShapeContext *context) +{ + // const gert::Shape *x1_shape = context->GetInputShape(0); + // gert::Shape *y_shape = context->GetOutputShape(0); + // *y_shape = *x1_shape; + return GRAPH_SUCCESS; +} + +static graphStatus InferDataType(gert::InferDataTypeContext *context) +{ + // const auto inputDataType = context->GetInputDataType(0); + // context->SetOutputDataType(0, inputDataType); + return ge::GRAPH_SUCCESS; +} +} // namespace ge + +namespace ops { +class AddsCustom : public OpDef { +public: + explicit AddsCustom(const char *name) : OpDef(name) + { + this->Input("x").ParamType(REQUIRED).DataType({ge::DT_FLOAT}).Format({ge::FORMAT_ND}); + this->Output("z").ParamType(REQUIRED).DataType({ge::DT_FLOAT}).Format({ge::FORMAT_ND}); + + this->SetInferShape(ge::InferShape).SetInferDataType(ge::InferDataType); + this->AICore().SetTiling(optiling::TilingFunc).AddConfig("ascend910b"); + + this->Attr("tiling_key").Int(1); + } +}; +OP_ADD(AddsCustom); +} // namespace ops diff --git a/operator/ascendc/4_best_practices/15_mata_address_conflict/AddsCustom/op_host/adds_custom_tiling.h 
b/operator/ascendc/4_best_practices/15_mata_address_conflict/AddsCustom/op_host/adds_custom_tiling.h new file mode 100644 index 000000000..21ec82c16 --- /dev/null +++ b/operator/ascendc/4_best_practices/15_mata_address_conflict/AddsCustom/op_host/adds_custom_tiling.h @@ -0,0 +1,22 @@ +/** + * @file adds_custom_tiling.h + * + * Copyright (C) 2025-2025. Huawei Technologies Co., Ltd. All rights reserved. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + */ +#ifndef ADDS_CUSTOM_TILING_H +#define ADDS_CUSTOM_TILING_H +#include "register/tilingdata_base.h" + +namespace optiling { +BEGIN_TILING_DATA_DEF(TilingData) +TILING_DATA_FIELD_DEF(uint32_t, totalLength); +TILING_DATA_FIELD_DEF(uint32_t, tileNum); +END_TILING_DATA_DEF; + +REGISTER_TILING_DATA_CLASS(AddsCustom, TilingData) +} // namespace optiling +#endif // ADDS_CUSTOM_TILING_H diff --git a/operator/ascendc/4_best_practices/15_mata_address_conflict/AddsCustom/op_kernel/adds_custom.cpp b/operator/ascendc/4_best_practices/15_mata_address_conflict/AddsCustom/op_kernel/adds_custom.cpp new file mode 100644 index 000000000..1dd26e59a --- /dev/null +++ b/operator/ascendc/4_best_practices/15_mata_address_conflict/AddsCustom/op_kernel/adds_custom.cpp @@ -0,0 +1,20 @@ +/** + * @file add_custom.cpp + * + * Copyright (C) 2022-2024. Huawei Technologies Co., Ltd. All rights reserved. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
+ */ +#include "kernel_operator.h" + + +extern "C" __global__ __aicore__ void adds_custom(GM_ADDR x, GM_ADDR z, GM_ADDR workspace, GM_ADDR tiling) +{ + if (TILING_KEY_IS(1UL)) { + AscendC::printf("1\n"); + } else if (TILING_KEY_IS(2UL)) { + AscendC::printf("2\n"); + } +} diff --git a/operator/ascendc/4_best_practices/15_mata_address_conflict/README.md b/operator/ascendc/4_best_practices/15_mata_address_conflict/README.md index 34c96391e..de724d24f 100644 --- a/operator/ascendc/4_best_practices/15_mata_address_conflict/README.md +++ b/operator/ascendc/4_best_practices/15_mata_address_conflict/README.md @@ -1 +1,152 @@ -MATA地址冲突(待补充) \ No newline at end of file +## 概述 +本样例基于AddCustom算子工程,介绍了单算子工程、单算子调用、第三方框架调用。 + +## 目录结构介绍 +``` +├── 1_add_frameworklaunch // 使用框架调用的方式调用Add算子 +│ ├── AclNNInvocation // 通过aclnn调用的方式调用AddCustom算子 +│ ├── AclNNInvocationNaive // 通过aclnn调用的方式调用AddCustom算子, 简化了编译脚本 +│ ├── AclOfflineModel // 通过aclopExecuteV2调用的方式调用AddCustom算子 +│ ├── AclOnlineModel // 通过aclopCompile调用的方式调用AddCustom算子 +│ ├── AddCustom // AddCustom算子工程 +│ ├── PytorchInvocation // 通过pytorch调用的方式调用AddCustom算子 +│ ├── TensorflowInvocation // 通过tensorflow调用的方式调用AddCustom算子 +│ ├── CppExtensionInvocation // 通过CppExtension调用的方式调用AddCustom算子 +│ ├── AddCustom.json // AddCustom算子的原型定义json文件 +│ └── install.sh // 脚本,调用msOpGen生成自定义算子工程,并编译 +``` + +## 算子描述 +Add算子实现了两个数据相加,返回相加结果的功能。对应的数学表达式为: +``` +z = x + y +``` +## 算子规格描述 + + + + + + + + + + + +
算子类型(OpType)Add
算子输入nameshapedata typeformat
x8 * 2048floatND
y8 * 2048floatND
算子输出z8 * 2048floatND
核函数名add_custom
+ +## 支持的产品型号 +本样例支持如下产品型号: +- Atlas 训练系列产品 +- Atlas 推理系列产品AI Core +- Atlas A2训练系列产品/Atlas 800I A2推理产品 +- Atlas 200/500 A2推理产品 + +## 算子工程介绍 +其中,算子工程目录AddCustom包含算子的实现文件,如下所示: +``` +├── AddCustom // AddCustom自定义算子工程 +│ ├── framework // 算子插件实现文件目录 +│ ├── op_host // host侧实现文件 +│ └── op_kernel // kernel侧实现文件 +``` +CANN软件包中提供了工程创建工具msOpGen,AddCustom算子工程可通过AddCustom.json自动创建,自定义算子工程具体请参考[Ascend C算子开发](https://hiascend.com/document/redirect/CannCommunityOpdevAscendC)>工程化算子开发>创建算子工程 章节。 + +创建完自定义算子工程后,开发者重点需要完成算子host和kernel文件的功能开发。为简化样例运行流程,本样例已在AddCustom目录中准备好了必要的算子实现,install.sh脚本会创建一个CustomOp目录,并将算子实现文件复制到对应目录下,再编译算子。 + +备注:CustomOp目录为生成目录,每次执行install.sh脚本都会删除该目录并重新生成,切勿在该目录下编码算子,会存在丢失风险。 + +## 编译运行样例算子 +针对自定义算子工程,编译运行包含如下步骤: +- 调用msOpGen工具生成自定义算子工程; +- 完成算子host和kernel实现; +- 编译自定义算子工程生成自定义算子包; +- 安装自定义算子包到自定义算子库中; +- 调用执行自定义算子; + +详细操作如下所示。 +### 1. 获取源码包 +编译运行此样例前,请参考[准备:获取样例代码](../README.md#codeready)获取源码包。 + +### 2. 生成自定义算子工程,复制host和kernel实现并编译算子 + - 切换到msOpGen脚本install.sh所在目录 + ```bash + # 若开发者以git命令行方式clone了master分支代码,并切换目录 + cd ${git_clone_path}/samples/operator/ascendc/0_introduction/1_add_frameworklaunch + ``` + + - 调用脚本,生成自定义算子工程,复制host和kernel实现并编译算子 + - 方式一:配置环境变量运行脚本 + 请根据当前环境上CANN开发套件包的[安装方式](https://hiascend.com/document/redirect/CannCommunityInstSoftware),选择对应配置环境变量命令。 + - 默认路径,root用户安装CANN软件包 + ```bash + export ASCEND_INSTALL_PATH=/usr/local/Ascend/ascend-toolkit/latest + ``` + - 默认路径,非root用户安装CANN软件包 + ```bash + export ASCEND_INSTALL_PATH=$HOME/Ascend/ascend-toolkit/latest + ``` + - 指定路径install_path,安装CANN软件包 + ```bash + export ASCEND_INSTALL_PATH=${install_path}/ascend-toolkit/latest + ``` + 运行install.sh脚本 + ```bash + bash install.sh -v [SOC_VERSION] + ``` + - 方式二:指定命令行安装路径来运行脚本 + ```bash + bash install.sh -v [SOC_VERSION] -i [ASCEND_INSTALL_PATH] + ``` + 参数说明: + - SOC_VERSION:昇腾AI处理器型号,如果无法确定具体的[SOC_VERSION],则在安装昇腾AI处理器的服务器执行npu-smi info命令进行查询,在查询到的“Name”前增加Ascend信息,例如“Name”对应取值为xxxyy,实际配置的[SOC_VERSION]值为Ascendxxxyy。支持以下产品型号: + - Atlas 训练系列产品 + - Atlas 
推理系列产品AI Core + - Atlas A2训练系列产品/Atlas 800I A2推理产品 + - Atlas 200/500 A2推理产品 + - ASCEND_INSTALL_PATH:CANN软件包安装路径 + + 脚本运行成功后,会在当前目录下创建CustomOp目录,编译完成后,会在CustomOp/build_out中,生成自定义算子安装包custom_opp_\_\.run,例如“custom_opp_ubuntu_x86_64.run”。 + + 备注:如果要使用dump调试功能,需要移除op_host内的Atlas 训练系列产品、Atlas 200/500 A2 推理产品的配置项。 + +### 3. 部署自定义算子包 +- 部署自定义算子包前,请确保存在自定义算子包默认部署路径环境变量ASCEND_OPP_PATH + ```bash + echo $ASCEND_OPP_PATH + # 输出示例 /usr/local/Ascend/ascend-toolkit/latest/opp + + # 若没有,则需导出CANN环境变量 + source [ASCEND_INSTALL_PATH]/bin/setenv.bash + # 例如 source /usr/local/Ascend/ascend-toolkit/latest/bin/setenv.bash + ``` + 参数说明: + - ASCEND_INSTALL_PATH:CANN软件包安装路径,一般和上一步中指定的路径保持一致 + +- 在自定义算子安装包所在路径下,执行如下命令安装自定义算子包 + ```bash + cd CustomOp/build_out + ./custom_opp__.run + ``` + 命令执行成功后,自定义算子包中的相关文件将部署至opp算子库环境变量ASCEND_OPP_PATH指向的的vendors/customize目录中。 + +### 4. 调用执行算子工程 +- [aclnn调用AddCustom算子工程](./AclNNInvocation/README.md) +- [aclnn调用AddCustom算子工程(代码简化)](./AclNNInvocationNaive/README.md) +- [aclopExecuteV2模型调用AddCustom算子工程](./AclOfflineModel/README.md) +- [aclopCompile模型调用AddCustom算子工程](./AclOnlineModel/README.md) +- [cpp-extension模型调用AddCustom算子工程](./CppExtensions/README.md) +- [pytorch调用AddCustom算子工程](./PytorchInvocation/README.md) +- [tensorflow调用AddCustom算子工程](./TensorflowInvocation) + +## 更新说明 +| 时间 | 更新事项 | +| ---------- | ---------------------------- | +| 2023/10/24 | 新增TensorflowInvocation样例 | +| 2023/10/18 | 新增AclNNInvocation样例 | +| 2024/01/11 | 更改pytorch适配方式 | +| 2024/01/23 | 新增AclNNInvocationNaive样例 | +| 2024/05/22 | 修改readme结构 | +| 2024/11/11 | 样例目录调整 | +| 2024/11/18 | 算子工程改写为由msOpGen生成 | +| 2025/01/17 | 新增CppExtensionInvocation样例 | diff --git a/operator/ascendc/4_best_practices/15_mata_address_conflict/install.sh b/operator/ascendc/4_best_practices/15_mata_address_conflict/install.sh new file mode 100755 index 000000000..7dfec79a2 --- /dev/null +++ b/operator/ascendc/4_best_practices/15_mata_address_conflict/install.sh @@ -0,0 +1,55 @@ +#!/bin/bash +SHORT=v:,i:, 
+LONG=soc-version:,install-path:, +OPTS=$(getopt -a --options $SHORT --longoptions $LONG -- "$@") +eval set -- "$OPTS" + +while :; do + case "$1" in + -v | --soc-version) + SOC_VERSION="$2" + shift 2 + ;; + -i | --install-path) + ASCEND_INSTALL_PATH="$2" + shift 2 + ;; + --) + shift + break + ;; + *) + echo "[ERROR] Unexpected option: $1" + break + ;; + esac +done + +VERSION_LIST="Ascend910A Ascend910B Ascend310B1 Ascend310B2 Ascend310B3 Ascend310B4 Ascend310P1 Ascend310P3 Ascend910B1 Ascend910B2 Ascend910B3 Ascend910B4" +if [[ " $VERSION_LIST " != *" $SOC_VERSION "* ]]; then + echo "ERROR: SOC_VERSION should be in [$VERSION_LIST]" + exit -1 +fi + +if [ -n "$ASCEND_INSTALL_PATH" ]; then + _ASCEND_INSTALL_PATH=$ASCEND_INSTALL_PATH +elif [ -n "$ASCEND_HOME_PATH" ]; then + _ASCEND_INSTALL_PATH=$ASCEND_HOME_PATH +else + if [ -d "$HOME/Ascend/ascend-toolkit/latest" ]; then + _ASCEND_INSTALL_PATH=$HOME/Ascend/ascend-toolkit/latest + else + _ASCEND_INSTALL_PATH=/usr/local/Ascend/ascend-toolkit/latest + fi +fi +source $_ASCEND_INSTALL_PATH/bin/setenv.bash +export ASCEND_HOME_PATH=$_ASCEND_INSTALL_PATH + +OP_NAME=AddsCustom +rm -rf CustomOp +# Generate the op framework +msopgen gen -i $OP_NAME.json -c ai_core-${SOC_VERSION} -lan cpp -out CustomOp +# Copy op implementation files to CustomOp +cp -rf $OP_NAME/* CustomOp +# Build CustomOp project +(cd CustomOp && bash build.sh) \ No newline at end of file -- Gitee From c864d82ecffc9d869f8c55406b820f0276b99424 Mon Sep 17 00:00:00 2001 From: zhanghao0689 Date: Thu, 3 Jul 2025 09:24:20 +0800 Subject: [PATCH 03/17] edit aclnn launch --- .../AclNNInvocation/inc/op_runner.h | 2 +- .../AclNNInvocation/run.sh | 3 +- .../AclNNInvocation/scripts/gen_data.py | 6 +- .../AclNNInvocation/scripts/verify_result.py | 10 +-- .../AclNNInvocation/src/main.cpp | 20 +++-- .../AclNNInvocation/src/op_runner.cpp | 12 +-- .../15_mata_address_conflict/AddsCustom.json | 2 +- .../AddsCustom/op_host/adds_custom.cpp | 19 +---- 
.../AddsCustom/op_kernel/adds_custom.cpp | 13 ++- .../AddsCustom/op_kernel/adds_custom_v1.h | 77 +++++++++++++++++ .../AddsCustom/op_kernel/adds_custom_v2.h | 83 +++++++++++++++++++ .../AddsCustom/op_kernel/common_info.h | 8 ++ 12 files changed, 211 insertions(+), 44 deletions(-) create mode 100644 operator/ascendc/4_best_practices/15_mata_address_conflict/AddsCustom/op_kernel/adds_custom_v1.h create mode 100644 operator/ascendc/4_best_practices/15_mata_address_conflict/AddsCustom/op_kernel/adds_custom_v2.h create mode 100644 operator/ascendc/4_best_practices/15_mata_address_conflict/AddsCustom/op_kernel/common_info.h diff --git a/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/inc/op_runner.h b/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/inc/op_runner.h index f1b3a6706..c7c07a851 100644 --- a/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/inc/op_runner.h +++ b/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/inc/op_runner.h @@ -157,7 +157,7 @@ public: * @brief Run op * @return run result */ - bool RunOp(); + bool RunOp(int64_t tilingKey); private: size_t numInputs_; diff --git a/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/run.sh b/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/run.sh index a652bf478..d339f9de6 100755 --- a/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/run.sh +++ b/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/run.sh @@ -54,9 +54,10 @@ function main { # 4. 运行可执行文件 export LD_LIBRARY_PATH=$_ASCEND_INSTALL_PATH/opp/vendors/customize/op_api/lib:$LD_LIBRARY_PATH + export ASCEND_TOOLKIT_HOME=${_ASCEND_INSTALL_PATH} cd $CURRENT_DIR/output echo "INFO: execute op!" - ./execute_add_op + msprof op --launch-count=2 --output=./prof ./execute_add_op if [ $? -ne 0 ]; then echo "ERROR: acl executable run failed! please check your project!" 
return 1 diff --git a/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/scripts/gen_data.py b/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/scripts/gen_data.py index ea8ce828a..3a5a2a1f4 100644 --- a/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/scripts/gen_data.py +++ b/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/scripts/gen_data.py @@ -12,12 +12,10 @@ import numpy as np def gen_golden_data_simple(): - input_x = np.random.uniform(1, 100, [8, 2048]).astype(np.float16) - input_y = np.random.uniform(1, 100, [8, 2048]).astype(np.float16) - golden = (input_x + input_y).astype(np.float16) + input_x = np.random.uniform(1, 100, [8192, 128]).astype(np.float32) + golden = (input_x + 2.0).astype(np.float32) input_x.tofile("./input/input_x.bin") - input_y.tofile("./input/input_y.bin") golden.tofile("./output/golden.bin") diff --git a/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/scripts/verify_result.py b/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/scripts/verify_result.py index 2dd46f803..6a38a3b2b 100644 --- a/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/scripts/verify_result.py +++ b/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/scripts/verify_result.py @@ -11,15 +11,15 @@ import sys import numpy as np -# for float16 -relative_tol = 1e-3 +# for float32 +relative_tol = 1e-4 absolute_tol = 1e-5 -error_tol = 1e-3 +error_tol = 1e-4 def verify_result(output, golden): - output = np.fromfile(output, dtype=np.float16).reshape(-1) - golden = np.fromfile(golden, dtype=np.float16).reshape(-1) + output = np.fromfile(output, dtype=np.float32).reshape(-1) + golden = np.fromfile(golden, dtype=np.float32).reshape(-1) different_element_results = np.isclose(output, golden, rtol=relative_tol, diff --git 
a/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/src/main.cpp b/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/src/main.cpp index 7f8e38a37..5377cd3eb 100644 --- a/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/src/main.cpp +++ b/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/src/main.cpp @@ -24,12 +24,11 @@ int deviceId = 0; OperatorDesc CreateOpDesc() { // define operator - std::vector shape{8, 2048}; - aclDataType dataType = ACL_FLOAT16; + std::vector shape{8192, 128}; + aclDataType dataType = ACL_FLOAT; aclFormat format = ACL_FORMAT_ND; OperatorDesc opDesc; opDesc.AddInputTensorDesc(dataType, shape.size(), shape.data(), format); - opDesc.AddInputTensorDesc(dataType, shape.size(), shape.data(), format); opDesc.AddOutputTensorDesc(dataType, shape.size(), shape.data(), format); return opDesc; } @@ -38,7 +37,6 @@ bool SetInputData(OpRunner &runner) { size_t fileSize = 0; ReadFile("../input/input_x.bin", fileSize, runner.GetInputBuffer(0), runner.GetInputSize(0)); - ReadFile("../input/input_y.bin", fileSize, runner.GetInputBuffer(1), runner.GetInputSize(1)); INFO_LOG("Set input success"); return true; } @@ -109,7 +107,7 @@ bool InitResource() return true; } -bool RunOp() +bool RunOp(int64_t tilingKey) { // create op desc OperatorDesc opDesc = CreateOpDesc(); @@ -128,7 +126,7 @@ bool RunOp() } // Run op - if (!opRunner.RunOp()) { + if (!opRunner.RunOp(tilingKey)) { ERROR_LOG("Run op failed"); return false; } @@ -151,12 +149,18 @@ int main(int argc, char **argv) } INFO_LOG("Init resource success"); - if (!RunOp()) { + int64_t tilingKey = 1; + if (!RunOp(tilingKey)) { DestroyResource(); return FAILED; } - DestroyResource(); + tilingKey = 2; + if (!RunOp(tilingKey)) { + DestroyResource(); + return FAILED; + } + DestroyResource(); return SUCCESS; } diff --git a/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/src/op_runner.cpp 
b/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/src/op_runner.cpp index 45aecd300..5c944b131 100644 --- a/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/src/op_runner.cpp +++ b/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/src/op_runner.cpp @@ -13,7 +13,7 @@ #include #include "acl/acl_op_compiler.h" -#include "aclnn_add_custom.h" +#include "aclnn_adds_custom.h" #include "common.h" using namespace std; @@ -289,7 +289,7 @@ size_t OpRunner::GetOutputElementCount(size_t index) const return aclGetTensorDescElementCount(opDesc_->outputDesc[index]); } -bool OpRunner::RunOp() +bool OpRunner::RunOp(int64_t tilingKey) { for (size_t i = 0; i < numInputs_; ++i) { auto size = GetInputSize(i); @@ -314,13 +314,13 @@ bool OpRunner::RunOp() size_t workspaceSize = 0; aclOpExecutor *handle = nullptr; auto ret = - aclnnAddCustomGetWorkspaceSize(inputTensor_[0], inputTensor_[1], outputTensor_[0], &workspaceSize, &handle); + aclnnAddsCustomGetWorkspaceSize(inputTensor_[0], tilingKey, outputTensor_[0], &workspaceSize, &handle); if (ret != ACL_SUCCESS) { (void)aclrtDestroyStream(stream); ERROR_LOG("Get Operator Workspace failed. error code is %d", static_cast(ret)); return false; } - INFO_LOG("Execute aclnnAddCustomGetWorkspaceSize success, workspace size %lu", workspaceSize); + INFO_LOG("Execute aclnnAddsCustomGetWorkspaceSize success, workspace size %lu", workspaceSize); if (workspaceSize != 0) { if (aclrtMalloc(&workspace_, workspaceSize, ACL_MEM_MALLOC_HUGE_FIRST) != ACL_SUCCESS) { @@ -328,13 +328,13 @@ bool OpRunner::RunOp() } } - ret = aclnnAddCustom(workspace_, workspaceSize, handle, stream); + ret = aclnnAddsCustom(workspace_, workspaceSize, handle, stream); if (ret != ACL_SUCCESS) { (void)aclrtDestroyStream(stream); ERROR_LOG("Execute Operator failed. 
error code is %d", static_cast(ret)); return false; } - INFO_LOG("Execute aclnnAddCustom success"); + INFO_LOG("Execute aclnnAddsCustom success"); // The unit of 5000 is ms. ret = aclrtSynchronizeStreamWithTimeout(stream, 5000); diff --git a/operator/ascendc/4_best_practices/15_mata_address_conflict/AddsCustom.json b/operator/ascendc/4_best_practices/15_mata_address_conflict/AddsCustom.json index 23c1201a3..8ad2a831a 100644 --- a/operator/ascendc/4_best_practices/15_mata_address_conflict/AddsCustom.json +++ b/operator/ascendc/4_best_practices/15_mata_address_conflict/AddsCustom.json @@ -1,6 +1,6 @@ [ { - "op": "AddCustom", + "op": "AddsCustom", "language": "cpp", "input_desc": [ { diff --git a/operator/ascendc/4_best_practices/15_mata_address_conflict/AddsCustom/op_host/adds_custom.cpp b/operator/ascendc/4_best_practices/15_mata_address_conflict/AddsCustom/op_host/adds_custom.cpp index ca7b58aa7..7494fc8ad 100644 --- a/operator/ascendc/4_best_practices/15_mata_address_conflict/AddsCustom/op_host/adds_custom.cpp +++ b/operator/ascendc/4_best_practices/15_mata_address_conflict/AddsCustom/op_host/adds_custom.cpp @@ -11,21 +11,15 @@ #include "register/op_def_registry.h" namespace optiling { -const uint32_t BLOCK_DIM = 8; -const uint32_t TILE_NUM = 8; +const uint32_t BLOCK_DIM = 16; static ge::graphStatus TilingFunc(gert::TilingContext *context) { TilingData tiling; uint32_t totalLength = context->GetInputShape(0)->GetOriginShape().GetShapeSize(); context->SetBlockDim(BLOCK_DIM); - // tiling.set_totalLength(totalLength); - // tiling.set_tileNum(TILE_NUM); - // tiling.SaveToBuffer(context->GetRawTilingData()->GetData(), context->GetRawTilingData()->GetCapacity()); - // context->GetRawTilingData()->SetDataSize(tiling.GetDataSize()); - // size_t *currentWorkspace = context->GetWorkspaceSizes(1); - // currentWorkspace[0] = 0; - auto test = context->GetAttrs(); - printf("xxxxxx attr num is\n"); + auto attrs = context->GetAttrs(); + const int64_t* tilingKey = 
attrs->GetInt(0); + context->SetTilingKey(*tilingKey); return ge::GRAPH_SUCCESS; } } // namespace optiling @@ -33,16 +27,11 @@ static ge::graphStatus TilingFunc(gert::TilingContext *context) namespace ge { static graphStatus InferShape(gert::InferShapeContext *context) { - // const gert::Shape *x1_shape = context->GetInputShape(0); - // gert::Shape *y_shape = context->GetOutputShape(0); - // *y_shape = *x1_shape; return GRAPH_SUCCESS; } static graphStatus InferDataType(gert::InferDataTypeContext *context) { - // const auto inputDataType = context->GetInputDataType(0); - // context->SetOutputDataType(0, inputDataType); return ge::GRAPH_SUCCESS; } } // namespace ge diff --git a/operator/ascendc/4_best_practices/15_mata_address_conflict/AddsCustom/op_kernel/adds_custom.cpp b/operator/ascendc/4_best_practices/15_mata_address_conflict/AddsCustom/op_kernel/adds_custom.cpp index 1dd26e59a..3fee53218 100644 --- a/operator/ascendc/4_best_practices/15_mata_address_conflict/AddsCustom/op_kernel/adds_custom.cpp +++ b/operator/ascendc/4_best_practices/15_mata_address_conflict/AddsCustom/op_kernel/adds_custom.cpp @@ -8,13 +8,20 @@ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
*/ #include "kernel_operator.h" - +#include "common_info.h" +#include "adds_custom_v1.h" +#include "adds_custom_v2.h" extern "C" __global__ __aicore__ void adds_custom(GM_ADDR x, GM_ADDR z, GM_ADDR workspace, GM_ADDR tiling) { + KERNEL_TASK_TYPE_DEFAULT(KERNEL_TYPE_MIX_AIV_1_0); if (TILING_KEY_IS(1UL)) { - AscendC::printf("1\n"); + KernelAddsV1 op; + op.Init(x, z); + op.Process(); } else if (TILING_KEY_IS(2UL)) { - AscendC::printf("2\n"); + KernelAddsV2 op; + op.Init(x, z); + op.Process(); } } diff --git a/operator/ascendc/4_best_practices/15_mata_address_conflict/AddsCustom/op_kernel/adds_custom_v1.h b/operator/ascendc/4_best_practices/15_mata_address_conflict/AddsCustom/op_kernel/adds_custom_v1.h new file mode 100644 index 000000000..a06826e24 --- /dev/null +++ b/operator/ascendc/4_best_practices/15_mata_address_conflict/AddsCustom/op_kernel/adds_custom_v1.h @@ -0,0 +1,77 @@ +/** + * @file add_custom_v1.cpp + * + * Copyright (C) 2025. Huawei Technologies Co., Ltd. All rights reserved. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
+ */ +#include "kernel_operator.h" + +using AscendC::TPosition; +class KernelAddsV1 { +public: + __aicore__ inline KernelAddsV1() {} + __aicore__ inline void Init(GM_ADDR x, GM_ADDR z) + { + // the gm address conflict happens when L2 cache miss + // so we disable the L2 cache mode to show the + xGm.SetGlobalBuffer((__gm__ float *)x + AscendC::GetBlockIdx() * TILE_N); + zGm.SetGlobalBuffer((__gm__ float *)z + AscendC::GetBlockIdx() * TILE_N); + xGm.SetL2CacheHint(AscendC::CacheMode::CACHE_MODE_DISABLE); + zGm.SetL2CacheHint(AscendC::CacheMode::CACHE_MODE_DISABLE); + pipe.InitBuffer(inQueueX, BUFFER_NUM, TILE_M * TILE_N * sizeof(float)); + pipe.InitBuffer(outQueueZ, BUFFER_NUM, TILE_M * TILE_N * sizeof(float)); + } + __aicore__ inline void Process() + { + for (int32_t i = 0; i < LOOP_ONE_CORE; i++) { + AscendC::SyncAll(); + CopyIn(i); + Compute(); + AscendC::SyncAll(); + CopyOut(i); + } + } + +private: + __aicore__ inline void CopyIn(int32_t progress) + { + AscendC::LocalTensor xLocal = inQueueX.AllocTensor(); + AscendC::DataCopyParams params; + params.blockCount = TILE_M; + params.blockLen = TILE_N * sizeof(float) / BLOCK_SIZE; + params.srcStride = (N - TILE_N) * sizeof(float) / BLOCK_SIZE; + params.dstStride = 0; + AscendC::DataCopy(xLocal, xGm[progress * TILE_M * N], params); + inQueueX.EnQue(xLocal); + } + __aicore__ inline void Compute() + { + AscendC::LocalTensor xLocal = inQueueX.DeQue(); + AscendC::LocalTensor zLocal = outQueueZ.AllocTensor(); + constexpr float scale = 2.0; + AscendC::Adds(zLocal, xLocal, scale, TILE_M * TILE_N); + outQueueZ.EnQue(zLocal); + inQueueX.FreeTensor(xLocal); + } + __aicore__ inline void CopyOut(int32_t progress) + { + AscendC::LocalTensor zLocal = outQueueZ.DeQue(); + AscendC::DataCopyParams params; + params.blockCount = TILE_M; + params.blockLen = TILE_N * sizeof(float) / BLOCK_SIZE; + params.srcStride = 0; + params.dstStride = (N - TILE_N) * sizeof(float) / BLOCK_SIZE; + AscendC::DataCopy(zGm[progress * TILE_M * N], zLocal, 
params); + outQueueZ.FreeTensor(zLocal); + } + +private: + AscendC::TPipe pipe; + AscendC::TQue inQueueX; + AscendC::TQue outQueueZ; + AscendC::GlobalTensor xGm; + AscendC::GlobalTensor zGm; +}; diff --git a/operator/ascendc/4_best_practices/15_mata_address_conflict/AddsCustom/op_kernel/adds_custom_v2.h b/operator/ascendc/4_best_practices/15_mata_address_conflict/AddsCustom/op_kernel/adds_custom_v2.h new file mode 100644 index 000000000..2dabb2c4f --- /dev/null +++ b/operator/ascendc/4_best_practices/15_mata_address_conflict/AddsCustom/op_kernel/adds_custom_v2.h @@ -0,0 +1,83 @@ +/** + * @file adds_custom_v2.cpp + * + * Copyright (C) 2025. Huawei Technologies Co., Ltd. All rights reserved. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + */ +#include "kernel_operator.h" + +using AscendC::TPosition; +class KernelAddsV2 { +public: + __aicore__ inline KernelAddsV2() {} + __aicore__ inline void Init(GM_ADDR x, GM_ADDR z) + { + // the gm address conflict happens when L2 cache miss + // so we disable the L2 cache mode to show the + xGm.SetGlobalBuffer((__gm__ float *)x + AscendC::GetBlockIdx() * TILE_N); + zGm.SetGlobalBuffer((__gm__ float *)z + AscendC::GetBlockIdx() * TILE_N); + xGm.SetL2CacheHint(AscendC::CacheMode::CACHE_MODE_DISABLE); + zGm.SetL2CacheHint(AscendC::CacheMode::CACHE_MODE_DISABLE); + pipe.InitBuffer(inQueueX, BUFFER_NUM, TILE_M * TILE_N * sizeof(float)); + pipe.InitBuffer(outQueueZ, BUFFER_NUM, TILE_M * TILE_N * sizeof(float)); + } + __aicore__ inline void Process() + { + for (int32_t i = 0; i < LOOP_ONE_CORE; i++) { + // to avoid the gm address conflict: + // the loop order core0 : 0, 1, 2, 3, ..., 13, 14, 15 + // the loop order core1 : 1, 2, 3, 4, ..., 14, 15, 0 + // ... 
+ // the loop order core15 : 15, 0, 1, 2, ..., 12, 13, 14 + int32_t newProgress = (i + AscendC::GetBlockIdx()) % LOOP_ONE_CORE; + AscendC::SyncAll(); + CopyIn(newProgress); + Compute(); + AscendC::SyncAll(); + CopyOut(newProgress); + } + } + +private: + __aicore__ inline void CopyIn(int32_t progress) + { + AscendC::LocalTensor xLocal = inQueueX.AllocTensor(); + AscendC::DataCopyParams params; + params.blockCount = TILE_M; + params.blockLen = TILE_N * sizeof(float) / BLOCK_SIZE; + params.srcStride = (N - TILE_N) * sizeof(float) / BLOCK_SIZE; + params.dstStride = 0; + AscendC::DataCopy(xLocal, xGm[progress * TILE_M * N], params); + inQueueX.EnQue(xLocal); + } + __aicore__ inline void Compute() + { + AscendC::LocalTensor xLocal = inQueueX.DeQue(); + AscendC::LocalTensor zLocal = outQueueZ.AllocTensor(); + constexpr float scale = 2.0; + AscendC::Adds(zLocal, xLocal, scale, TILE_M * TILE_N); + outQueueZ.EnQue(zLocal); + inQueueX.FreeTensor(xLocal); + } + __aicore__ inline void CopyOut(int32_t progress) + { + AscendC::LocalTensor zLocal = outQueueZ.DeQue(); + AscendC::DataCopyParams params; + params.blockCount = TILE_M; + params.blockLen = TILE_N * sizeof(float) / BLOCK_SIZE; + params.srcStride = 0; + params.dstStride = (N - TILE_N) * sizeof(float) / BLOCK_SIZE; + AscendC::DataCopy(zGm[progress * TILE_M * N], zLocal, params); + outQueueZ.FreeTensor(zLocal); + } + +private: + AscendC::TPipe pipe; + AscendC::TQue inQueueX; + AscendC::TQue outQueueZ; + AscendC::GlobalTensor xGm; + AscendC::GlobalTensor zGm; +}; diff --git a/operator/ascendc/4_best_practices/15_mata_address_conflict/AddsCustom/op_kernel/common_info.h b/operator/ascendc/4_best_practices/15_mata_address_conflict/AddsCustom/op_kernel/common_info.h new file mode 100644 index 000000000..f0a4da5fd --- /dev/null +++ b/operator/ascendc/4_best_practices/15_mata_address_conflict/AddsCustom/op_kernel/common_info.h @@ -0,0 +1,8 @@ +static constexpr int32_t M = 8192; +static constexpr int32_t N = 128; +static constexpr 
int32_t TILE_M = 512; +static constexpr int32_t TILE_N = 8; +static constexpr int32_t USED_CORE_NUM = N / TILE_N; +static constexpr int32_t LOOP_ONE_CORE = M / TILE_M; +static constexpr int32_t BUFFER_NUM = 2; +static constexpr int32_t BLOCK_SIZE = 32; \ No newline at end of file -- Gitee From 56f76afcd837a15f7ab2b0013feb4562134fed90 Mon Sep 17 00:00:00 2001 From: zhanghao0689 Date: Thu, 3 Jul 2025 11:04:43 +0800 Subject: [PATCH 04/17] remove kernel launch --- .../AclNNInvocation/README.md | 34 ++- .../AclNNInvocation/inc/op_runner.h | 9 +- .../AclNNInvocation/run.sh | 7 +- .../AclNNInvocation/src/main.cpp | 24 +-- .../AclNNInvocation/src/op_runner.cpp | 10 +- .../AddsCustom/op_host/adds_custom.cpp | 3 +- .../AddsCustom/op_host/adds_custom_tiling.h | 22 -- .../KernelLaunch/CMakeLists.txt | 47 ---- .../KernelLaunch/README.md | 88 -------- .../KernelLaunch/adds_custom_v1.cpp | 98 --------- .../KernelLaunch/adds_custom_v2.cpp | 104 --------- .../KernelLaunch/cmake/cpu_lib.cmake | 9 - .../KernelLaunch/cmake/npu_lib.cmake | 11 - .../KernelLaunch/data_utils.h | 203 ------------------ .../KernelLaunch/main.cpp | 128 ----------- .../KernelLaunch/run.sh | 113 ---------- .../KernelLaunch/scripts/gen_data.py | 23 -- .../KernelLaunch/scripts/verify_result.py | 53 ----- .../15_mata_address_conflict/README.md | 195 +++++++++-------- 19 files changed, 154 insertions(+), 1027 deletions(-) delete mode 100644 operator/ascendc/4_best_practices/15_mata_address_conflict/AddsCustom/op_host/adds_custom_tiling.h delete mode 100644 operator/ascendc/4_best_practices/15_mata_address_conflict/KernelLaunch/CMakeLists.txt delete mode 100644 operator/ascendc/4_best_practices/15_mata_address_conflict/KernelLaunch/README.md delete mode 100644 operator/ascendc/4_best_practices/15_mata_address_conflict/KernelLaunch/adds_custom_v1.cpp delete mode 100644 operator/ascendc/4_best_practices/15_mata_address_conflict/KernelLaunch/adds_custom_v2.cpp delete mode 100644 
operator/ascendc/4_best_practices/15_mata_address_conflict/KernelLaunch/cmake/cpu_lib.cmake delete mode 100644 operator/ascendc/4_best_practices/15_mata_address_conflict/KernelLaunch/cmake/npu_lib.cmake delete mode 100644 operator/ascendc/4_best_practices/15_mata_address_conflict/KernelLaunch/data_utils.h delete mode 100644 operator/ascendc/4_best_practices/15_mata_address_conflict/KernelLaunch/main.cpp delete mode 100755 operator/ascendc/4_best_practices/15_mata_address_conflict/KernelLaunch/run.sh delete mode 100644 operator/ascendc/4_best_practices/15_mata_address_conflict/KernelLaunch/scripts/gen_data.py delete mode 100644 operator/ascendc/4_best_practices/15_mata_address_conflict/KernelLaunch/scripts/verify_result.py diff --git a/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/README.md b/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/README.md index 533675076..ea3502c15 100644 --- a/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/README.md +++ b/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/README.md @@ -1,16 +1,15 @@ ## 目录结构介绍 ``` -├── AclNNInvocation //通过aclnn调用的方式调用AddCustom算子 +├── AclNNInvocation //通过aclnn调用的方式调用AddsCustom算子 │ ├── inc // 头文件目录 │ │ ├── common.h // 声明公共方法类,用于读取二进制文件 │ │ ├── op_runner.h // 算子描述声明文件,包含算子输入/输出,算子类型以及输入描述与输出描述 │ │ └── operator_desc.h // 算子运行相关信息声明文件,包含算子输入/输出个数,输入/输出大小等 │ ├── input // 存放脚本生成的输入数据目录 -│ ├── output // 存放算子运行输出数据和真值数据的目录 │ ├── scripts │ │ ├── acl.json // acl配置文件 │ │ ├── gen_data.py // 输入数据和真值数据生成脚本 -│ │ └── verify_result.py // 真值对比文件 +│ │ └── verify_result.py // 精度校验脚本 │ ├── src │ │ ├── CMakeLists.txt // 编译规则文件 │ │ ├── common.cpp // 公共函数,读取二进制文件函数的实现文件 @@ -25,21 +24,37 @@ 自定义算子编译部署后,会自动生成单算子API,可以直接在应用程序中调用。算子API的形式一般定义为“两段式接口”,形如: ```cpp // 获取算子使用的workspace空间大小 - aclnnStatus aclnnAddCustomGetWorkspaceSize(const aclTensor *x, const aclTensor *y, const aclTensor *out, uint64_t *workspaceSize, aclOpExecutor 
**executor); + aclnnStatus aclnnAddsCustomGetWorkspaceSize( + const aclTensor *x1, + const aclTensor *x2, + const aclTensor *biasOptional, + char *group, + char *reduceOp + bool isTransAOptional, + bool isTransBOptional, + int64_t commTurnOptional, + const aclTensor *yOut, + uint64_t *workspaceSize, + aclOpExecutor **executor); // 执行算子 - aclnnStatus aclnnAddCustom(void *workspace, int64_t workspaceSize, aclOpExecutor *executor, aclrtStream stream); + aclnnStatus aclnnMatmulAllReduceCustom( + void *workspace, + uint64_t workspaceSize, + aclOpExecutor *executor, + const aclrtStream stream); ``` -其中aclnnAddCustomGetWorkspaceSize为第一段接口,主要用于计算本次API调用计算过程中需要多少的workspace内存。获取到本次API计算需要的workspace大小之后,开发者可以按照workspaceSize大小申请Device侧内存,然后调用第二段接口aclnnAddCustom执行计算。具体参考[单算子API调用](https://hiascend.com/document/redirect/CannCommunityAscendCInVorkSingleOp)章节。 +其中aclnnMatmulAllReduceCustomGetWorkspaceSize为第一段接口,主要用于计算本次API调用计算过程中需要多少的workspace内存。获取到本次API计算需要的workspace大小之后,按照workspaceSize大小申请Device侧内存,然后调用第二段接口aclnnMatmulAllReduceCustom执行计算。具体参考[单算子API调用](https://hiascend.com/document/redirect/CannCommunityAscendCInVorkSingleOp)章节。 ## 运行样例算子 ### 1. 编译算子工程 运行此样例前,请参考[编译算子工程](../README.md#operatorcompile)完成前期准备。 ### 2. 
aclnn调用样例运行 - - 进入到样例目录 + - 进入到样例目录 + 以命令行方式下载样例代码,master分支为例。 ```bash - cd ${git_clone_path}/samples/operator/ascendc/0_introduction/1_add_frameworklaunch/AclNNInvocation + cd ${git_clone_path}/samples/operator/ascendc/4_best_practices/23_matmul_all_reduce_custom/AclNNInvocation ``` - 样例执行 @@ -51,5 +66,4 @@ ## 更新说明 | 时间 | 更新事项 | | ---------- | ------------ | -| 2024/05/22 | 新增本readme | -| 2024/11/11 | 样例目录调整 | \ No newline at end of file +| 2024/06/07 | 新增本readme | \ No newline at end of file diff --git a/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/inc/op_runner.h b/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/inc/op_runner.h index c7c07a851..53629c4a8 100644 --- a/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/inc/op_runner.h +++ b/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/inc/op_runner.h @@ -157,12 +157,19 @@ public: * @brief Run op * @return run result */ - bool RunOp(int64_t tilingKey); + bool RunOp(int64_t caseId); + + /** + * @brief Get case index + * @return case index by user input + */ + bool CompileDynamicOp(); private: size_t numInputs_; size_t numOutputs_; void *workspace_; + int64_t caseId_; std::vector inputBuffers_; std::vector outputBuffers_; diff --git a/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/run.sh b/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/run.sh index d339f9de6..55120f18d 100755 --- a/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/run.sh +++ b/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/run.sh @@ -22,8 +22,8 @@ export NPU_HOST_LIB=$_ASCEND_INSTALL_PATH/$(arch)-$(uname -s | tr '[:upper:]' '[ function main { # 1. 清除遗留生成文件和日志文件 rm -rf $HOME/ascend/log/* - rm ./input/*.bin - rm ./output/*.bin + rm -f ./input/*.bin + rm -f ./output && mkdir -p ./output # 2. 
生成输入数据和真值数据 cd $CURRENT_DIR @@ -66,7 +66,8 @@ function main { # 5. 精度比对 cd $CURRENT_DIR - python3 scripts/verify_result.py output/output_z.bin output/golden.bin + python3 scripts/verify_result.py output/output_z_1.bin output/golden.bin + python3 scripts/verify_result.py output/output_z_2.bin output/golden.bin if [ $? -ne 0 ]; then echo "ERROR: verify result failed!" return 1 diff --git a/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/src/main.cpp b/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/src/main.cpp index 5377cd3eb..1f075370d 100644 --- a/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/src/main.cpp +++ b/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/src/main.cpp @@ -43,7 +43,8 @@ bool SetInputData(OpRunner &runner) bool ProcessOutputData(OpRunner &runner) { - WriteFile("../output/output_z.bin", runner.GetOutputBuffer(0), runner.GetOutputSize(0)); + int64_t caseId = runner.GetCaseId(); + WriteFile("../output/output_z_" + std::to_string(caseId) + ".bin", runner.GetOutputBuffer(0), runner.GetOutputSize(0)); INFO_LOG("Write output success"); return true; } @@ -70,15 +71,6 @@ void DestroyResource() bool InitResource() { std::string output = "../output"; - if (access(output.c_str(), 0) == -1) { - int ret = mkdir(output.c_str(), 0700); - if (ret == 0) { - INFO_LOG("Make output directory successfully"); - } else { - ERROR_LOG("Make output directory fail"); - return false; - } - } // acl.json is dump or profiling config file if (aclInit("../scripts/acl.json") != ACL_SUCCESS) { @@ -107,7 +99,7 @@ bool InitResource() return true; } -bool RunOp(int64_t tilingKey) +bool RunOp(int64_t caseId) { // create op desc OperatorDesc opDesc = CreateOpDesc(); @@ -126,7 +118,7 @@ bool RunOp(int64_t tilingKey) } // Run op - if (!opRunner.RunOp(tilingKey)) { + if (!opRunner.RunOp(caseId)) { ERROR_LOG("Run op failed"); return false; } @@ -149,14 +141,14 @@ int main(int argc, 
char **argv) } INFO_LOG("Init resource success"); - int64_t tilingKey = 1; - if (!RunOp(tilingKey)) { + int64_t caseId = 1; + if (!RunOp(caseId)) { DestroyResource(); return FAILED; } - tilingKey = 2; - if (!RunOp(tilingKey)) { + caseId = 2; + if (!RunOp(caseId)) { DestroyResource(); return FAILED; } diff --git a/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/src/op_runner.cpp b/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/src/op_runner.cpp index 5c944b131..e1d70fdb8 100644 --- a/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/src/op_runner.cpp +++ b/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/src/op_runner.cpp @@ -289,8 +289,9 @@ size_t OpRunner::GetOutputElementCount(size_t index) const return aclGetTensorDescElementCount(opDesc_->outputDesc[index]); } -bool OpRunner::RunOp(int64_t tilingKey) +bool OpRunner::RunOp(int64_t caseId) { + caseId_ = caseId; for (size_t i = 0; i < numInputs_; ++i) { auto size = GetInputSize(i); aclrtMemcpyKind kind = ACL_MEMCPY_HOST_TO_DEVICE; @@ -314,7 +315,7 @@ bool OpRunner::RunOp(int64_t tilingKey) size_t workspaceSize = 0; aclOpExecutor *handle = nullptr; auto ret = - aclnnAddsCustomGetWorkspaceSize(inputTensor_[0], tilingKey, outputTensor_[0], &workspaceSize, &handle); + aclnnAddsCustomGetWorkspaceSize(inputTensor_[0], caseId, outputTensor_[0], &workspaceSize, &handle); if (ret != ACL_SUCCESS) { (void)aclrtDestroyStream(stream); ERROR_LOG("Get Operator Workspace failed. 
error code is %d", static_cast(ret)); @@ -363,6 +364,11 @@ bool OpRunner::RunOp(int64_t tilingKey) return true; } +int64_t OpRunner::GetCaseId() +{ + return caseId_; +} + template void DoPrintData(const T *data, size_t count, size_t elementsPerRow) { assert(elementsPerRow != 0); diff --git a/operator/ascendc/4_best_practices/15_mata_address_conflict/AddsCustom/op_host/adds_custom.cpp b/operator/ascendc/4_best_practices/15_mata_address_conflict/AddsCustom/op_host/adds_custom.cpp index 7494fc8ad..d4d09cb9a 100644 --- a/operator/ascendc/4_best_practices/15_mata_address_conflict/AddsCustom/op_host/adds_custom.cpp +++ b/operator/ascendc/4_best_practices/15_mata_address_conflict/AddsCustom/op_host/adds_custom.cpp @@ -7,7 +7,6 @@ * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. */ -#include "adds_custom_tiling.h" #include "register/op_def_registry.h" namespace optiling { @@ -47,7 +46,7 @@ public: this->SetInferShape(ge::InferShape).SetInferDataType(ge::InferDataType); this->AICore().SetTiling(optiling::TilingFunc).AddConfig("ascend910b"); - this->Attr("tiling_key").Int(1); + this->Attr("case_id").Int(1); } }; OP_ADD(AddsCustom); diff --git a/operator/ascendc/4_best_practices/15_mata_address_conflict/AddsCustom/op_host/adds_custom_tiling.h b/operator/ascendc/4_best_practices/15_mata_address_conflict/AddsCustom/op_host/adds_custom_tiling.h deleted file mode 100644 index 21ec82c16..000000000 --- a/operator/ascendc/4_best_practices/15_mata_address_conflict/AddsCustom/op_host/adds_custom_tiling.h +++ /dev/null @@ -1,22 +0,0 @@ -/** - * @file adds_custom_tiling.h - * - * Copyright (C) 2025-2025. Huawei Technologies Co., Ltd. All rights reserved. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
- */ -#ifndef ADDS_CUSTOM_TILING_H -#define ADDS_CUSTOM_TILING_H -#include "register/tilingdata_base.h" - -namespace optiling { -BEGIN_TILING_DATA_DEF(TilingData) -TILING_DATA_FIELD_DEF(uint32_t, totalLength); -TILING_DATA_FIELD_DEF(uint32_t, tileNum); -END_TILING_DATA_DEF; - -REGISTER_TILING_DATA_CLASS(AddsCustom, TilingData) -} // namespace optiling -#endif // ADDS_CUSTOM_TILING_H diff --git a/operator/ascendc/4_best_practices/15_mata_address_conflict/KernelLaunch/CMakeLists.txt b/operator/ascendc/4_best_practices/15_mata_address_conflict/KernelLaunch/CMakeLists.txt deleted file mode 100644 index 258b2e3b7..000000000 --- a/operator/ascendc/4_best_practices/15_mata_address_conflict/KernelLaunch/CMakeLists.txt +++ /dev/null @@ -1,47 +0,0 @@ -cmake_minimum_required(VERSION 3.16) -project(Ascend_c) - -set(RUN_MODE "npu" CACHE STRING "cpu/sim/npu") -set(SOC_VERSION "Ascend310P3" CACHE STRING "system on chip type") -set(ASCEND_CANN_PACKAGE_PATH "/usr/local/Ascend/ascend-toolkit/latest" - CACHE STRING "ASCEND CANN package installation directory" -) -if(NOT CMAKE_BUILD_TYPE) - set(CMAKE_BUILD_TYPE "Debug" CACHE STRING "Build type Release/Debug (default Debug)" FORCE) -endif() -if(CMAKE_INSTALL_PREFIX STREQUAL /usr/local) - set(CMAKE_INSTALL_PREFIX "${CMAKE_CURRENT_LIST_DIR}/out" CACHE STRING "path for install()" FORCE) -endif() - -# ${KERNEL_FILES} are used to compile library, push files written by ascendc in ${KERNEL_FILES}. 
-# ref to cmake/npu.cmake ascendc_library, cmake/cpu.cmake add_library -file(GLOB KERNEL_FILES - ${CMAKE_CURRENT_SOURCE_DIR}/adds_custom_v1.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/adds_custom_v2.cpp -) - -if("${RUN_MODE}" STREQUAL "cpu") - include(cmake/cpu_lib.cmake) -elseif("${RUN_MODE}" STREQUAL "sim" OR "${RUN_MODE}" STREQUAL "npu") - include(cmake/npu_lib.cmake) -else() - message("invalid RUN_MODE: ${RUN_MODE}") -endif() -add_executable(ascendc_kernels_bbit ${CMAKE_CURRENT_SOURCE_DIR}/main.cpp) - -target_compile_options(ascendc_kernels_bbit PRIVATE - $:-g>> - -O2 -std=c++17 -D_GLIBCXX_USE_CXX11_ABI=0 -Wall -Werror -) - -target_link_libraries(ascendc_kernels_bbit PRIVATE - $,$>:host_intf_pub>> - $:ascendcl>> - ascendc_kernels_${RUN_MODE} -) - -install(TARGETS ascendc_kernels_bbit - LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} - ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} - RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} -) diff --git a/operator/ascendc/4_best_practices/15_mata_address_conflict/KernelLaunch/README.md b/operator/ascendc/4_best_practices/15_mata_address_conflict/KernelLaunch/README.md deleted file mode 100644 index f72b521cd..000000000 --- a/operator/ascendc/4_best_practices/15_mata_address_conflict/KernelLaunch/README.md +++ /dev/null @@ -1,88 +0,0 @@ -## 目录结构介绍 - -``` -├── KernelLaunch -│ ├── cmake // 编译工程文件 -│ ├── scripts -│ │ ├── gen_data.py // 输入数据和真值数据生成脚本 -│ │ └── verify_result.py // 验证输出数据和真值数据是否一致的验证脚本 -│ ├── add_custom_v1.cpp // 算子kernel实现1:未优化前实现 -│ ├── add_custom_v2.cpp // 算子kernel实现2:优化地址分配,消除Bank冲突后的实现 -│ ├── CMakeLists.txt // 编译工程文件 -│ ├── data_utils.h // 数据读入写出函数 -│ ├── main.cpp // 主函数,调用算子的应用程序,含CPU域及NPU域调用 -│ └── run.sh // 编译运行算子的脚本 -``` - -## 代码实现介绍 - -本样例中实现的是固定shape为1*4096的Add算子。 - -- kernel实现 - - Add算子的数学表达式为: - - ``` - z = x + y - ``` - - 计算逻辑是:Ascend C提供的矢量计算接口的操作元素都为LocalTensor,输入数据需要先搬运进片上存储,然后使用计算接口完成两个输入参数相加,得到最终结果,再搬出到外部存储上。 - - Add算子的实现流程分为3个基本任务:CopyIn,Compute,CopyOut。CopyIn任务负责将Global Memory上的输入Tensor xGm和yGm搬运到Local 
Memory,分别存储在xLocal、yLocal,Compute任务负责对xLocal、yLocal执行加法操作,计算结果存储在zLocal中,CopyOut任务负责将输出数据从zLocal搬运至Global Memory上的输出Tensor zGm中。 - - 实现1:请参考[add_custom_v1.cpp](./add_custom_v1.cpp),xLocal地址为0,yLocal地址为0x4000,zLocal地址为0x8000。xLocal与yLocal存在读读冲突,xLocal与zLocal存在读写冲突。 - - 实现2:请参考[add_custom_v2.cpp](./add_custom_v2.cpp),为了避免Bank冲突,通过配置InitBuffer时的bufferSize来调整Tensor地址,xLocal地址为0,yLocal地址为0x4100,zLocal地址为0x10000。 -- 调用实现 - - 1. CPU侧运行验证主要通过ICPU_RUN_KF CPU调测宏等CPU调测库提供的接口来完成; - 2. NPU侧运行验证主要通过使用ACLRT_LAUNCH_KERNEL内核调用宏来完成。 - - 应用程序通过ASCENDC_CPU_DEBUG 宏区分代码逻辑运行于CPU侧还是NPU侧。 - -## 运行样例算子 - -- 打开样例目录 - 以命令行方式下载样例代码,master分支为例。 - - ```bash - cd ${git_clone_path}/samples/operator/ascendc/4_best_practices/4_bank_conflict/KernelLaunch - ``` -- 配置环境变量 - - 请根据当前环境上CANN开发套件包的[安装方式](https://hiascend.com/document/redirect/CannCommunityInstSoftware),选择对应配置环境变量的命令。 - - - 默认路径,root用户安装CANN软件包 - ```bash - export ASCEND_INSTALL_PATH=/usr/local/Ascend/ascend-toolkit/latest - ``` - - 默认路径,非root用户安装CANN软件包 - ```bash - export ASCEND_INSTALL_PATH=$HOME/Ascend/ascend-toolkit/latest - ``` - - 指定路径install_path,安装CANN软件包 - ```bash - export ASCEND_INSTALL_PATH=${install_path}/ascend-toolkit/latest - ``` -- 样例执行 - - ```bash - bash run.sh -r [RUN_MODE] -v [SOC_VERSION] - ``` - - - RUN_MODE:编译方式,可选择CPU调试,NPU仿真,NPU上板。支持参数为[cpu /sim / npu] - - SOC_VERSION:昇腾AI处理器型号,如果无法确定具体的[SOC_VERSION],则在安装昇腾AI处理器的服务器执行npu-smi info命令进行查询,在查询到的“Name”前增加Ascend信息,例如“Name”对应取值为xxxyy,实际配置的[SOC_VERSION]值为Ascendxxxyy。支持以下产品型号: - - Atlas A2训练系列产品/Atlas 800I A2推理产品 - - 示例如下,Ascendxxxyy请替换为实际的AI处理器型号。 - - ```bash - bash run.sh -r cpu -v Ascendxxxyy - ``` - -## 更新说明 - - -| 时间 | 更新事项 | -| ---------- | ------------ | -| 2025/07/01 | 新增本readme | diff --git a/operator/ascendc/4_best_practices/15_mata_address_conflict/KernelLaunch/adds_custom_v1.cpp b/operator/ascendc/4_best_practices/15_mata_address_conflict/KernelLaunch/adds_custom_v1.cpp deleted file mode 100644 index 5c06ce096..000000000 --- 
a/operator/ascendc/4_best_practices/15_mata_address_conflict/KernelLaunch/adds_custom_v1.cpp +++ /dev/null @@ -1,98 +0,0 @@ -/** - * @file add_custom_v1.cpp - * - * Copyright (C) 2025. Huawei Technologies Co., Ltd. All rights reserved. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. - */ -#include "kernel_operator.h" - -using AscendC::TPosition; -namespace { -constexpr int32_t M = 8192; -constexpr int32_t N = 128; -constexpr int32_t TILE_M = 512; -constexpr int32_t TILE_N = 8; -constexpr int32_t USED_CORE_NUM = N / TILE_N; -constexpr int32_t LOOP_ONE_CORE = M / TILE_M; -constexpr int32_t BUFFER_NUM = 2; -constexpr int32_t BLOCK_SIZE = 32; -} // namespace - -class KernelAdd { -public: - __aicore__ inline KernelAdd() {} - __aicore__ inline void Init(GM_ADDR x, GM_ADDR z) - { - // the gm address conflict happens when L2 cache miss - // so we disable the L2 cache mode to show the - xGm.SetGlobalBuffer((__gm__ float *)x + AscendC::GetBlockIdx() * TILE_N); - zGm.SetGlobalBuffer((__gm__ float *)z + AscendC::GetBlockIdx() * TILE_N); - pipe.InitBuffer(inQueueX, BUFFER_NUM, TILE_M * TILE_N * sizeof(float)); - pipe.InitBuffer(outQueueZ, BUFFER_NUM, TILE_M * TILE_N * sizeof(float)); - } - __aicore__ inline void Process() - { - for (int32_t i = 0; i < LOOP_ONE_CORE; i++) { - CopyIn(i); - Compute(); - CopyOut(i); - } - } - -private: - __aicore__ inline void CopyIn(int32_t progress) - { - AscendC::LocalTensor xLocal = inQueueX.AllocTensor(); - AscendC::DataCopyParams params; - params.blockCount = TILE_M; - params.blockLen = TILE_N * sizeof(float) / BLOCK_SIZE; - params.srcStride = (N - TILE_N) * sizeof(float) / BLOCK_SIZE; - params.dstStride = 0; - AscendC::DataCopy(xLocal, xGm[progress * TILE_M * N], params); - inQueueX.EnQue(xLocal); - } - __aicore__ inline void Compute() - { - AscendC::LocalTensor xLocal = inQueueX.DeQue(); - 
AscendC::LocalTensor zLocal = outQueueZ.AllocTensor(); - constexpr float scale = 2.0; - AscendC::Adds(zLocal, xLocal, scale, TILE_M * TILE_N); - outQueueZ.EnQue(zLocal); - inQueueX.FreeTensor(xLocal); - } - __aicore__ inline void CopyOut(int32_t progress) - { - AscendC::LocalTensor zLocal = outQueueZ.DeQue(); - AscendC::DataCopyParams params; - params.blockCount = TILE_M; - params.blockLen = TILE_N * sizeof(float) / BLOCK_SIZE; - params.srcStride = 0; - params.dstStride = (N - TILE_N) * sizeof(float) / BLOCK_SIZE; - AscendC::DataCopy(zGm[progress * TILE_M * N], zLocal, params); - outQueueZ.FreeTensor(zLocal); - } - -private: - AscendC::TPipe pipe; - AscendC::TQue inQueueX; - AscendC::TQue outQueueZ; - AscendC::GlobalTensor xGm; - AscendC::GlobalTensor zGm; -}; - -extern "C" __global__ __aicore__ void adds_custom_v1(GM_ADDR x, GM_ADDR z) -{ - KernelAdd op; - op.Init(x, z); - op.Process(); -} - -#ifndef ASCENDC_CPU_DEBUG -void adds_custom_do_v1(uint32_t blockDim, void *stream, uint8_t *x, uint8_t *z) -{ - adds_custom_v1<<>>(x, z); -} -#endif diff --git a/operator/ascendc/4_best_practices/15_mata_address_conflict/KernelLaunch/adds_custom_v2.cpp b/operator/ascendc/4_best_practices/15_mata_address_conflict/KernelLaunch/adds_custom_v2.cpp deleted file mode 100644 index d1d114bb9..000000000 --- a/operator/ascendc/4_best_practices/15_mata_address_conflict/KernelLaunch/adds_custom_v2.cpp +++ /dev/null @@ -1,104 +0,0 @@ -/** - * @file adds_custom_v2.cpp - * - * Copyright (C) 2025. Huawei Technologies Co., Ltd. All rights reserved. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
- */ -#include "kernel_operator.h" - -using AscendC::TPosition; -namespace { -constexpr int32_t M = 8192; -constexpr int32_t N = 128; -constexpr int32_t TILE_M = 512; -constexpr int32_t TILE_N = 8; -constexpr int32_t USED_CORE_NUM = N / TILE_N; -constexpr int32_t LOOP_ONE_CORE = M / TILE_M; -constexpr int32_t BUFFER_NUM = 2; -constexpr int32_t BLOCK_SIZE = 32; -} // namespace - -class KernelAdd { -public: - __aicore__ inline KernelAdd() {} - __aicore__ inline void Init(GM_ADDR x, GM_ADDR z) - { - // the gm address conflict happens when L2 cache miss - // so we disable the L2 cache mode to show the - xGm.SetGlobalBuffer((__gm__ float *)x + AscendC::GetBlockIdx() * TILE_N); - zGm.SetGlobalBuffer((__gm__ float *)z + AscendC::GetBlockIdx() * TILE_N); - pipe.InitBuffer(inQueueX, BUFFER_NUM, TILE_M * TILE_N * sizeof(float)); - pipe.InitBuffer(outQueueZ, BUFFER_NUM, TILE_M * TILE_N * sizeof(float)); - } - __aicore__ inline void Process() - { - for (int32_t i = 0; i < LOOP_ONE_CORE; i++) { - // to avoid the gm address conflict: - // the loop order core0 : 0, 1, 2, 3, ..., 13, 14, 15 - // the loop order core1 : 1, 2, 3, 4, ..., 14, 15, 0 - // ... 
- // the loop order core15 : 15, 0, 1, 2, ..., 12, 13, 14 - int32_t newProgress = (i + AscendC::GetBlockIdx()) % LOOP_ONE_CORE; - CopyIn(newProgress); - Compute(); - CopyOut(newProgress); - } - } - -private: - __aicore__ inline void CopyIn(int32_t progress) - { - AscendC::LocalTensor xLocal = inQueueX.AllocTensor(); - AscendC::DataCopyParams params; - params.blockCount = TILE_M; - params.blockLen = TILE_N * sizeof(float) / BLOCK_SIZE; - params.srcStride = (N - TILE_N) * sizeof(float) / BLOCK_SIZE; - params.dstStride = 0; - AscendC::DataCopy(xLocal, xGm[progress * TILE_M * N], params); - inQueueX.EnQue(xLocal); - } - __aicore__ inline void Compute() - { - AscendC::LocalTensor xLocal = inQueueX.DeQue(); - AscendC::LocalTensor zLocal = outQueueZ.AllocTensor(); - constexpr float scale = 2.0; - AscendC::Adds(zLocal, xLocal, scale, TILE_M * TILE_N); - outQueueZ.EnQue(zLocal); - inQueueX.FreeTensor(xLocal); - } - __aicore__ inline void CopyOut(int32_t progress) - { - AscendC::LocalTensor zLocal = outQueueZ.DeQue(); - AscendC::DataCopyParams params; - params.blockCount = TILE_M; - params.blockLen = TILE_N * sizeof(float) / BLOCK_SIZE; - params.srcStride = 0; - params.dstStride = (N - TILE_N) * sizeof(float) / BLOCK_SIZE; - AscendC::DataCopy(zGm[progress * TILE_M * N], zLocal, params); - outQueueZ.FreeTensor(zLocal); - } - -private: - AscendC::TPipe pipe; - AscendC::TQue inQueueX; - AscendC::TQue outQueueZ; - AscendC::GlobalTensor xGm; - AscendC::GlobalTensor zGm; -}; - -extern "C" __global__ __aicore__ void adds_custom_v2(GM_ADDR x, GM_ADDR z) -{ - KernelAdd op; - op.Init(x, z); - op.Process(); -} - -#ifndef ASCENDC_CPU_DEBUG -void adds_custom_do_v2(uint32_t blockDim, void *stream, uint8_t *x, uint8_t *z) -{ - adds_custom_v2<<>>(x, z); -} -#endif diff --git a/operator/ascendc/4_best_practices/15_mata_address_conflict/KernelLaunch/cmake/cpu_lib.cmake b/operator/ascendc/4_best_practices/15_mata_address_conflict/KernelLaunch/cmake/cpu_lib.cmake deleted file mode 100644 index 
5362c8b5a..000000000 --- a/operator/ascendc/4_best_practices/15_mata_address_conflict/KernelLaunch/cmake/cpu_lib.cmake +++ /dev/null @@ -1,9 +0,0 @@ -if(NOT DEFINED ENV{CMAKE_PREFIX_PATH}) - set(CMAKE_PREFIX_PATH ${ASCEND_CANN_PACKAGE_PATH}/tools/tikicpulib/lib/cmake) -endif() -find_package(tikicpulib REQUIRED) - -add_library(ascendc_kernels_${RUN_MODE} SHARED ${KERNEL_FILES}) -target_link_libraries(ascendc_kernels_${RUN_MODE} PUBLIC tikicpulib::${SOC_VERSION}) -target_compile_options(ascendc_kernels_${RUN_MODE} PRIVATE -g -O0 -std=c++17) -install(TARGETS ascendc_kernels_${RUN_MODE} DESTINATION ${CMAKE_INSTALL_LIBDIR}) diff --git a/operator/ascendc/4_best_practices/15_mata_address_conflict/KernelLaunch/cmake/npu_lib.cmake b/operator/ascendc/4_best_practices/15_mata_address_conflict/KernelLaunch/cmake/npu_lib.cmake deleted file mode 100644 index f92b095d1..000000000 --- a/operator/ascendc/4_best_practices/15_mata_address_conflict/KernelLaunch/cmake/npu_lib.cmake +++ /dev/null @@ -1,11 +0,0 @@ -if(EXISTS ${ASCEND_CANN_PACKAGE_PATH}/compiler/tikcpp/ascendc_kernel_cmake) - set(ASCENDC_CMAKE_DIR ${ASCEND_CANN_PACKAGE_PATH}/compiler/tikcpp/ascendc_kernel_cmake) -elseif(EXISTS ${ASCEND_CANN_PACKAGE_PATH}/tools/tikcpp/ascendc_kernel_cmake) - set(ASCENDC_CMAKE_DIR ${ASCEND_CANN_PACKAGE_PATH}/tools/tikcpp/ascendc_kernel_cmake) -else() - message(FATAL_ERROR "ascendc_kernel_cmake does not exist ,please check whether the cann package is installed") -endif() -include(${ASCENDC_CMAKE_DIR}/ascendc.cmake) - -# ascendc_library use to add kernel file to generate ascendc library -ascendc_library(ascendc_kernels_${RUN_MODE} SHARED ${KERNEL_FILES}) diff --git a/operator/ascendc/4_best_practices/15_mata_address_conflict/KernelLaunch/data_utils.h b/operator/ascendc/4_best_practices/15_mata_address_conflict/KernelLaunch/data_utils.h deleted file mode 100644 index 09d906371..000000000 --- a/operator/ascendc/4_best_practices/15_mata_address_conflict/KernelLaunch/data_utils.h +++ /dev/null @@ 
-1,203 +0,0 @@ -/** - * @file data_utils.h - * - * Copyright (C) 2024. Huawei Technologies Co., Ltd. All rights reserved. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. - */ -#ifndef DATA_UTILS_H -#define DATA_UTILS_H -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include - -#include "acl/acl.h" - -typedef enum { - DT_UNDEFINED = -1, - FLOAT = 0, - HALF = 1, - INT8_T = 2, - INT32_T = 3, - UINT8_T = 4, - INT16_T = 6, - UINT16_T = 7, - UINT32_T = 8, - INT64_T = 9, - UINT64_T = 10, - DOUBLE = 11, - BOOL = 12, - STRING = 13, - COMPLEX64 = 16, - COMPLEX128 = 17, - BF16 = 27 -} printDataType; - -#define INFO_LOG(fmt, args...) fprintf(stdout, "[INFO] " fmt "\n", ##args) -#define WARN_LOG(fmt, args...) fprintf(stdout, "[WARN] " fmt "\n", ##args) -#define ERROR_LOG(fmt, args...) fprintf(stdout, "[ERROR] " fmt "\n", ##args) -#define CHECK_ACL(x) \ - do { \ - aclError __ret = x; \ - if (__ret != ACL_ERROR_NONE) { \ - std::cerr << __FILE__ << ":" << __LINE__ << " aclError:" << __ret << std::endl; \ - } \ - } while (0); - -/** - * @brief Read data from file - * @param [in] filePath: file path - * @param [out] fileSize: file size - * @return read result - */ -bool ReadFile(const std::string &filePath, size_t &fileSize, void *buffer, size_t bufferSize) -{ - struct stat sBuf; - int fileStatus = stat(filePath.data(), &sBuf); - if (fileStatus == -1) { - ERROR_LOG("failed to get file"); - return false; - } - if (S_ISREG(sBuf.st_mode) == 0) { - ERROR_LOG("%s is not a file, please enter a file", filePath.c_str()); - return false; - } - - std::ifstream file; - file.open(filePath, std::ios::binary); - if (!file.is_open()) { - ERROR_LOG("Open file failed. 
path = %s", filePath.c_str()); - return false; - } - - std::filebuf *buf = file.rdbuf(); - size_t size = buf->pubseekoff(0, std::ios::end, std::ios::in); - if (size == 0) { - ERROR_LOG("file size is 0"); - file.close(); - return false; - } - if (size > bufferSize) { - ERROR_LOG("file size is larger than buffer size"); - file.close(); - return false; - } - buf->pubseekpos(0, std::ios::in); - buf->sgetn(static_cast(buffer), size); - fileSize = size; - file.close(); - return true; -} - -/** - * @brief Write data to file - * @param [in] filePath: file path - * @param [in] buffer: data to write to file - * @param [in] size: size to write - * @return write result - */ -bool WriteFile(const std::string &filePath, const void *buffer, size_t size) -{ - if (buffer == nullptr) { - ERROR_LOG("Write file failed. buffer is nullptr"); - return false; - } - - int fd = open(filePath.c_str(), O_RDWR | O_CREAT | O_TRUNC, S_IRUSR | S_IWRITE); - if (fd < 0) { - ERROR_LOG("Open file failed. path = %s", filePath.c_str()); - return false; - } - - size_t writeSize = write(fd, buffer, size); - (void)close(fd); - if (writeSize != size) { - ERROR_LOG("Write file Failed."); - return false; - } - - return true; -} - -template void DoPrintData(const T *data, size_t count, size_t elementsPerRow) -{ - assert(elementsPerRow != 0); - for (size_t i = 0; i < count; ++i) { - std::cout << std::setw(10) << data[i]; - if (i % elementsPerRow == elementsPerRow - 1) { - std::cout << std::endl; - } - } -} - -void DoPrintHalfData(const aclFloat16 *data, size_t count, size_t elementsPerRow) -{ - assert(elementsPerRow != 0); - for (size_t i = 0; i < count; ++i) { - std::cout << std::setw(10) << std::setprecision(6) << aclFloat16ToFloat(data[i]); - if (i % elementsPerRow == elementsPerRow - 1) { - std::cout << std::endl; - } - } -} - -void PrintData(const void *data, size_t count, printDataType dataType, size_t elementsPerRow = 16) -{ - if (data == nullptr) { - ERROR_LOG("Print data failed. 
data is nullptr"); - return; - } - - switch (dataType) { - case BOOL: - DoPrintData(reinterpret_cast(data), count, elementsPerRow); - break; - case INT8_T: - DoPrintData(reinterpret_cast(data), count, elementsPerRow); - break; - case UINT8_T: - DoPrintData(reinterpret_cast(data), count, elementsPerRow); - break; - case INT16_T: - DoPrintData(reinterpret_cast(data), count, elementsPerRow); - break; - case UINT16_T: - DoPrintData(reinterpret_cast(data), count, elementsPerRow); - break; - case INT32_T: - DoPrintData(reinterpret_cast(data), count, elementsPerRow); - break; - case UINT32_T: - DoPrintData(reinterpret_cast(data), count, elementsPerRow); - break; - case INT64_T: - DoPrintData(reinterpret_cast(data), count, elementsPerRow); - break; - case UINT64_T: - DoPrintData(reinterpret_cast(data), count, elementsPerRow); - break; - case HALF: - DoPrintHalfData(reinterpret_cast(data), count, elementsPerRow); - break; - case FLOAT: - DoPrintData(reinterpret_cast(data), count, elementsPerRow); - break; - case DOUBLE: - DoPrintData(reinterpret_cast(data), count, elementsPerRow); - break; - default: - ERROR_LOG("Unsupported type: %d", dataType); - } - std::cout << std::endl; -} -#endif // DATA_UTILS_H diff --git a/operator/ascendc/4_best_practices/15_mata_address_conflict/KernelLaunch/main.cpp b/operator/ascendc/4_best_practices/15_mata_address_conflict/KernelLaunch/main.cpp deleted file mode 100644 index 14523e2ca..000000000 --- a/operator/ascendc/4_best_practices/15_mata_address_conflict/KernelLaunch/main.cpp +++ /dev/null @@ -1,128 +0,0 @@ -/** - * @file main.cpp - * - * Copyright (C) 2025. Huawei Technologies Co., Ltd. All rights reserved. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
- */ -#include "data_utils.h" -#ifndef ASCENDC_CPU_DEBUG -#include "acl/acl.h" -extern void adds_custom_do_v1(uint32_t blockDim, void *stream, uint8_t *x, uint8_t *z); -extern void adds_custom_do_v2(uint32_t blockDim, void *stream, uint8_t *x, uint8_t *z); -using KernelEntry = void(*)(uint32_t, void *, uint8_t *, uint8_t *); -#else -#include "tikicpulib.h" -extern "C" __global__ __aicore__ void adds_custom_v1(GM_ADDR x, GM_ADDR z); -extern "C" __global__ __aicore__ void adds_custom_v2(GM_ADDR x, GM_ADDR z); -using KernelEntry = void (*)(GM_ADDR, GM_ADDR); - -#endif - -struct ArgInfo { - std::string fileName; - size_t length; -}; - -#ifndef ASCENDC_CPU_DEBUG - -void KernelCall(KernelEntry kernelEntry, uint32_t blockDim, void *stream, std::vector &inputsInfo, - std::vector &outputsInfo) -{ - std::vector inputHost(inputsInfo.size()); - std::vector inputDevice(inputsInfo.size()); - std::vector outputHost(outputsInfo.size()); - std::vector outputDevice(outputsInfo.size()); - - for (uint32_t i = 0; i < inputsInfo.size(); i++) { - CHECK_ACL(aclrtMallocHost((void **)(&inputHost[i]), inputsInfo[i].length)); - CHECK_ACL(aclrtMalloc((void **)(&inputDevice[i]), inputsInfo[i].length, ACL_MEM_MALLOC_HUGE_FIRST)); - ReadFile(inputsInfo[i].fileName, inputsInfo[i].length, inputHost[i], inputsInfo[i].length); - CHECK_ACL(aclrtMemcpy(inputDevice[i], inputsInfo[i].length, inputHost[i], inputsInfo[i].length, - ACL_MEMCPY_HOST_TO_DEVICE)); - } - - for (uint32_t i = 0; i < outputsInfo.size(); i++) { - CHECK_ACL(aclrtMallocHost((void **)(&outputHost[i]), outputsInfo[i].length)); - CHECK_ACL(aclrtMalloc((void **)(&outputDevice[i]), outputsInfo[i].length, ACL_MEM_MALLOC_HUGE_FIRST)); - } - - kernelEntry(blockDim, stream, inputDevice[0], outputDevice[0]); - CHECK_ACL(aclrtSynchronizeStream(stream)); - for (uint32_t i = 0; i < outputsInfo.size(); i++) { - CHECK_ACL(aclrtMemcpy(outputHost[i], outputsInfo[i].length, outputDevice[i], outputsInfo[i].length, - ACL_MEMCPY_DEVICE_TO_HOST)); - 
WriteFile(outputsInfo[i].fileName, outputHost[i], outputsInfo[i].length); - CHECK_ACL(aclrtFree(outputDevice[i])); - CHECK_ACL(aclrtFreeHost(outputHost[i])); - } - - for (uint32_t i = 0; i < inputsInfo.size(); i++) { - CHECK_ACL(aclrtFree(inputDevice[i])); - CHECK_ACL(aclrtFreeHost(inputHost[i])); - } -} - -#else - -#define KernelCall(kernelEntry, blockDim, inputsInfo, outputsInfo) \ - { \ - std::vector input(inputsInfo.size()); \ - std::vector output(outputsInfo.size()); \ - \ - for (uint32_t i = 0; i < inputsInfo.size(); i++) { \ - input[i] = (uint8_t *)AscendC::GmAlloc(inputsInfo[i].length); \ - ReadFile(inputsInfo[i].fileName, inputsInfo[i].length, input[i], inputsInfo[i].length); \ - } \ - \ - for (uint32_t i = 0; i < outputsInfo.size(); i++) { \ - output[i] = (uint8_t *)AscendC::GmAlloc(outputsInfo[i].length); \ - } \ - \ - AscendC::SetKernelMode(KernelMode::AIV_MODE); \ - ICPU_RUN_KF(kernelEntry, blockDim, input[0], output[0]); \ - for (uint32_t i = 0; i < inputsInfo.size(); i++) { \ - AscendC::GmFree((void *)input[i]); \ - } \ - \ - for (uint32_t i = 0; i < outputsInfo.size(); i++) { \ - WriteFile(outputsInfo[i].fileName, output[i], outputsInfo[i].length); \ - AscendC::GmFree((void *)output[i]); \ - } \ - } - -#endif - -int32_t main(int32_t argc, char *argv[]) -{ - uint32_t blockDim = 16; - uint32_t M = 8192; - uint32_t N = 128; - size_t inputByteSize = M * N * sizeof(float); - size_t outputByteSize = M * N * sizeof(float); - - std::vector inputsInfo = {{"./input/input_x.bin", inputByteSize}}; - std::vector outputsV1Info = {{"./output/output_z_v1.bin", outputByteSize}}; - std::vector outputsV2Info = {{"./output/output_z_v2.bin", outputByteSize}}; - -#ifndef ASCENDC_CPU_DEBUG - CHECK_ACL(aclInit(nullptr)); - int32_t deviceId = 0; - CHECK_ACL(aclrtSetDevice(deviceId)); - aclrtStream stream = nullptr; - CHECK_ACL(aclrtCreateStream(&stream)); - - KernelCall(adds_custom_do_v1, blockDim, stream, inputsInfo, outputsV1Info); - KernelCall(adds_custom_do_v2, 
blockDim, stream, inputsInfo, outputsV2Info); - - CHECK_ACL(aclrtDestroyStream(stream)); - CHECK_ACL(aclrtResetDevice(deviceId)); - CHECK_ACL(aclFinalize()); -#else - KernelCall(adds_custom_v1, blockDim, inputsInfo, outputsV1Info); - KernelCall(adds_custom_v2, blockDim, inputsInfo, outputsV2Info); -#endif - return 0; -} diff --git a/operator/ascendc/4_best_practices/15_mata_address_conflict/KernelLaunch/run.sh b/operator/ascendc/4_best_practices/15_mata_address_conflict/KernelLaunch/run.sh deleted file mode 100755 index 0c5aef144..000000000 --- a/operator/ascendc/4_best_practices/15_mata_address_conflict/KernelLaunch/run.sh +++ /dev/null @@ -1,113 +0,0 @@ -#!/bin/bash -CURRENT_DIR=$( - cd $(dirname ${BASH_SOURCE:-$0}) - pwd -) - -BUILD_TYPE="Debug" -INSTALL_PREFIX="${CURRENT_DIR}/out" - -SHORT=r:,v:,i:,b:,p:, -LONG=run-mode:,soc-version:,install-path:,build-type:,install-prefix:, -OPTS=$(getopt -a --options $SHORT --longoptions $LONG -- "$@") -eval set -- "$OPTS" -SOC_VERSION="Ascend310P3" - -while :; do - case "$1" in - -r | --run-mode) - RUN_MODE="$2" - shift 2 - ;; - -v | --soc-version) - SOC_VERSION="$2" - shift 2 - ;; - -i | --install-path) - ASCEND_INSTALL_PATH="$2" - shift 2 - ;; - -b | --build-type) - BUILD_TYPE="$2" - shift 2 - ;; - -p | --install-prefix) - INSTALL_PREFIX="$2" - shift 2 - ;; - --) - shift - break - ;; - *) - echo "[ERROR] Unexpected option: $1" - break - ;; - esac -done - -RUN_MODE_LIST="cpu sim npu" -if [[ " $RUN_MODE_LIST " != *" $RUN_MODE "* ]]; then - echo "ERROR: RUN_MODE error, This sample only support specify cpu, sim or npu!" 
- exit -1 -fi - -VERSION_LIST="Ascend310P1 Ascend310P3 Ascend910B1 Ascend910B2 Ascend910B3 Ascend910B4" -if [[ " $VERSION_LIST " != *" $SOC_VERSION "* ]]; then - echo "ERROR: SOC_VERSION should be in [$VERSION_LIST]" - exit -1 -fi - -if [ -n "$ASCEND_INSTALL_PATH" ]; then - _ASCEND_INSTALL_PATH=$ASCEND_INSTALL_PATH -elif [ -n "$ASCEND_HOME_PATH" ]; then - _ASCEND_INSTALL_PATH=$ASCEND_HOME_PATH -else - if [ -d "$HOME/Ascend/ascend-toolkit/latest" ]; then - _ASCEND_INSTALL_PATH=$HOME/Ascend/ascend-toolkit/latest - else - _ASCEND_INSTALL_PATH=/usr/local/Ascend/ascend-toolkit/latest - fi -fi - -export ASCEND_TOOLKIT_HOME=${_ASCEND_INSTALL_PATH} -export ASCEND_HOME_PATH=${_ASCEND_INSTALL_PATH} -echo "Current compile soc version is ${SOC_VERSION}" -source ${_ASCEND_INSTALL_PATH}/bin/setenv.bash -if [ "${RUN_MODE}" = "sim" ]; then - # in case of running op in simulator, use stub .so instead - export LD_LIBRARY_PATH=${_ASCEND_INSTALL_PATH}/tools/simulator/${SOC_VERSION}/lib:$LD_LIBRARY_PATH -elif [ "${RUN_MODE}" = "cpu" ]; then - export LD_LIBRARY_PATH=${_ASCEND_INSTALL_PATH}/tools/tikicpulib/lib:${_ASCEND_INSTALL_PATH}/tools/tikicpulib/lib/${SOC_VERSION}:${_ASCEND_INSTALL_PATH}/tools/simulator/${SOC_VERSION}/lib:$LD_LIBRARY_PATH -fi - -set -e -rm -rf build out -mkdir -p build -cmake -B build \ - -DRUN_MODE=${RUN_MODE} \ - -DSOC_VERSION=${SOC_VERSION} \ - -DCMAKE_BUILD_TYPE=${BUILD_TYPE} \ - -DCMAKE_INSTALL_PREFIX=${INSTALL_PREFIX} \ - -DASCEND_CANN_PACKAGE_PATH=${_ASCEND_INSTALL_PATH} -cmake --build build -j -cmake --install build - -rm -f ascendc_kernels_bbit -cp ./out/bin/ascendc_kernels_bbit ./ -rm -rf input output -mkdir -p input output -python3 scripts/gen_data.py -( - export LD_LIBRARY_PATH=$(pwd)/out/lib:$(pwd)/out/lib64:${_ASCEND_INSTALL_PATH}/lib64:$LD_LIBRARY_PATH - if [ "${RUN_MODE}" = "npu" ]; then - msprof op --launch-count=2 --output=./prof ./ascendc_kernels_bbit - elif [ "${RUN_MODE}" = "sim" ]; then - msprof op simulator --launch-count=2 --output=./prof 
./ascendc_kernels_bbit - elif [ "${RUN_MODE}" = "cpu" ]; then - ./ascendc_kernels_bbit - fi -) -md5sum output/*.bin -python3 scripts/verify_result.py output/output_z_v1.bin output/golden.bin -python3 scripts/verify_result.py output/output_z_v2.bin output/golden.bin diff --git a/operator/ascendc/4_best_practices/15_mata_address_conflict/KernelLaunch/scripts/gen_data.py b/operator/ascendc/4_best_practices/15_mata_address_conflict/KernelLaunch/scripts/gen_data.py deleted file mode 100644 index 3a5a2a1f4..000000000 --- a/operator/ascendc/4_best_practices/15_mata_address_conflict/KernelLaunch/scripts/gen_data.py +++ /dev/null @@ -1,23 +0,0 @@ -#!/usr/bin/python3 -# coding=utf-8 -# -# Copyright (C) 2023-2024. Huawei Technologies Co., Ltd. All rights reserved. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. -# =============================================================================== - -import numpy as np - - -def gen_golden_data_simple(): - input_x = np.random.uniform(1, 100, [8192, 128]).astype(np.float32) - golden = (input_x + 2.0).astype(np.float32) - - input_x.tofile("./input/input_x.bin") - golden.tofile("./output/golden.bin") - - -if __name__ == "__main__": - gen_golden_data_simple() diff --git a/operator/ascendc/4_best_practices/15_mata_address_conflict/KernelLaunch/scripts/verify_result.py b/operator/ascendc/4_best_practices/15_mata_address_conflict/KernelLaunch/scripts/verify_result.py deleted file mode 100644 index 6a38a3b2b..000000000 --- a/operator/ascendc/4_best_practices/15_mata_address_conflict/KernelLaunch/scripts/verify_result.py +++ /dev/null @@ -1,53 +0,0 @@ -#!/usr/bin/python3 -# coding=utf-8 -# -# Copyright (C) 2023-2024. Huawei Technologies Co., Ltd. All rights reserved. 
-# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. -# =============================================================================== - -import sys -import numpy as np - -# for float32 -relative_tol = 1e-4 -absolute_tol = 1e-5 -error_tol = 1e-4 - - -def verify_result(output, golden): - output = np.fromfile(output, dtype=np.float32).reshape(-1) - golden = np.fromfile(golden, dtype=np.float32).reshape(-1) - different_element_results = np.isclose(output, - golden, - rtol=relative_tol, - atol=absolute_tol, - equal_nan=True) - different_element_indexes = np.where(different_element_results == False)[0] - for index in range(len(different_element_indexes)): - real_index = different_element_indexes[index] - golden_data = golden[real_index] - output_data = output[real_index] - print( - "data index: %06d, expected: %-.9f, actual: %-.9f, rdiff: %-.6f" % - (real_index, golden_data, output_data, - abs(output_data - golden_data) / golden_data)) - if index == 100: - break - error_ratio = float(different_element_indexes.size) / golden.size - print("error ratio: %.4f, tolerance: %.4f" % (error_ratio, error_tol)) - return error_ratio <= error_tol - - -if __name__ == '__main__': - try: - res = verify_result(sys.argv[1], sys.argv[2]) - if not res: - raise ValueError("[ERROR] result error") - else: - print("test pass") - except Exception as e: - print(e) - sys.exit(1) diff --git a/operator/ascendc/4_best_practices/15_mata_address_conflict/README.md b/operator/ascendc/4_best_practices/15_mata_address_conflict/README.md index de724d24f..dbbd80d3c 100644 --- a/operator/ascendc/4_best_practices/15_mata_address_conflict/README.md +++ b/operator/ascendc/4_best_practices/15_mata_address_conflict/README.md @@ -1,63 +1,67 @@ ## 概述 -本样例基于AddCustom算子工程,介绍了单算子工程、单算子调用、第三方框架调用。 + +本样例基于AddsCustom算子工程,介绍了单算子工程、单算子调用、第三方框架调用。 ## 目录结构介绍 + ``` -├── 
1_add_frameworklaunch // 使用框架调用的方式调用Add算子 -│ ├── AclNNInvocation // 通过aclnn调用的方式调用AddCustom算子 -│ ├── AclNNInvocationNaive // 通过aclnn调用的方式调用AddCustom算子, 简化了编译脚本 -│ ├── AclOfflineModel // 通过aclopExecuteV2调用的方式调用AddCustom算子 -│ ├── AclOnlineModel // 通过aclopCompile调用的方式调用AddCustom算子 -│ ├── AddCustom // AddCustom算子工程 -│ ├── PytorchInvocation // 通过pytorch调用的方式调用AddCustom算子 -│ ├── TensorflowInvocation // 通过tensorflow调用的方式调用AddCustom算子 -│ ├── CppExtensionInvocation // 通过CppExtension调用的方式调用AddCustom算子 -│ ├── AddCustom.json // AddCustom算子的原型定义json文件 +├── 15_mata_address_conflict // 同地址冲突样例工程目录 +│ ├── AclNNInvocation // 通过aclnn调用的方式调用AddsCustom算子 +│ ├── AddsCustom // AddsCustom算子工程 +│ ├── AddsCustom.json // AddsCustom算子的原型定义json文件 │ └── install.sh // 脚本,调用msOpGen生成自定义算子工程,并编译 ``` ## 算子描述 -Add算子实现了两个数据相加,返回相加结果的功能。对应的数学表达式为: + +Adds算子实现了一个Tensor与标量(值为2.0)相加,返回相加结果的功能。对应的数学表达式为: + ``` -z = x + y +z = x + 2.0 ``` + +本样例主要介绍数据搬运中的同地址冲突对搬运效率的影响,同地址冲突指的是多核同时访问(读/写)同一条cacheline,且数据不在cache中时,出于数据一致性的要求,芯片会对多核的访问请求进行串行处理,导致搬运效率降低。 + ## 算子规格描述 + - + - - - + + - + - +
算子类型(OpType)Add
算子类型(OpType)Adds
算子输入nameshapedata typeformat
x8 * 2048floatND
y8 * 2048floatND
算子输入nameshapedata typeformat
x8192 * 128floatND
算子输出z8 * 2048floatND
算子输出z8192 * 128floatND
核函数名add_custom
核函数名adds_custom
## 支持的产品型号 + 本样例支持如下产品型号: -- Atlas 训练系列产品 -- Atlas 推理系列产品AI Core + - Atlas A2训练系列产品/Atlas 800I A2推理产品 -- Atlas 200/500 A2推理产品 ## 算子工程介绍 -其中,算子工程目录AddCustom包含算子的实现文件,如下所示: + +其中,算子工程目录AddsCustom包含算子的实现文件,如下所示: + ``` -├── AddCustom // AddCustom自定义算子工程 -│ ├── framework // 算子插件实现文件目录 -│ ├── op_host // host侧实现文件 -│ └── op_kernel // kernel侧实现文件 +├── AddsCustom // AddsCustom自定义算子工程 +│ ├── op_host // host侧实现文件 +│ └── op_kernel // kernel侧实现文件 ``` -CANN软件包中提供了工程创建工具msOpGen,AddCustom算子工程可通过AddCustom.json自动创建,自定义算子工程具体请参考[Ascend C算子开发](https://hiascend.com/document/redirect/CannCommunityOpdevAscendC)>工程化算子开发>创建算子工程 章节。 -创建完自定义算子工程后,开发者重点需要完成算子host和kernel文件的功能开发。为简化样例运行流程,本样例已在AddCustom目录中准备好了必要的算子实现,install.sh脚本会创建一个CustomOp目录,并将算子实现文件复制到对应目录下,再编译算子。 +CANN软件包中提供了工程创建工具msOpGen,AddsCustom算子工程可通过AddsCustom.json自动创建,自定义算子工程具体请参考[Ascend C算子开发](https://hiascend.com/document/redirect/CannCommunityOpdevAscendC)>工程化算子开发>创建算子工程 章节。 + +创建完自定义算子工程后,开发者重点需要完成算子host和kernel文件的功能开发。为简化样例运行流程,本样例已在AddsCustom目录中准备好了必要的算子实现,install.sh脚本会创建一个CustomOp目录,并将算子实现文件复制到对应目录下,再编译算子。 备注:CustomOp目录为生成目录,每次执行install.sh脚本都会删除该目录并重新生成,切勿在该目录下编码算子,会存在丢失风险。 ## 编译运行样例算子 + 针对自定义算子工程,编译运行包含如下步骤: + - 调用msOpGen工具生成自定义算子工程; - 完成算子host和kernel实现; - 编译自定义算子工程生成自定义算子包; @@ -65,88 +69,91 @@ CANN软件包中提供了工程创建工具msOpGen,AddCustom算子工程可通 - 调用执行自定义算子; 详细操作如下所示。 + ### 1. 获取源码包 + 编译运行此样例前,请参考[准备:获取样例代码](../README.md#codeready)获取源码包。 ### 2. 
生成自定义算子工程,复制host和kernel实现并编译算子 - - 切换到msOpGen脚本install.sh所在目录 - ```bash - # 若开发者以git命令行方式clone了master分支代码,并切换目录 - cd ${git_clone_path}/samples/operator/ascendc/0_introduction/1_add_frameworklaunch - ``` - - 调用脚本,生成自定义算子工程,复制host和kernel实现并编译算子 - - 方式一:配置环境变量运行脚本 - 请根据当前环境上CANN开发套件包的[安装方式](https://hiascend.com/document/redirect/CannCommunityInstSoftware),选择对应配置环境变量命令。 - - 默认路径,root用户安装CANN软件包 - ```bash - export ASCEND_INSTALL_PATH=/usr/local/Ascend/ascend-toolkit/latest - ``` - - 默认路径,非root用户安装CANN软件包 - ```bash - export ASCEND_INSTALL_PATH=$HOME/Ascend/ascend-toolkit/latest - ``` - - 指定路径install_path,安装CANN软件包 - ```bash - export ASCEND_INSTALL_PATH=${install_path}/ascend-toolkit/latest - ``` - 运行install.sh脚本 - ```bash - bash install.sh -v [SOC_VERSION] - ``` - - 方式二:指定命令行安装路径来运行脚本 +- 切换到msOpGen脚本install.sh所在目录 + + ```bash + # 若开发者以git命令行方式clone了master分支代码,并切换目录 + cd ${git_clone_path}/samples/operator/ascendc/0_introduction/1_add_frameworklaunch + ``` +- 调用脚本,生成自定义算子工程,复制host和kernel实现并编译算子 + + - 方式一:配置环境变量运行脚本 + 请根据当前环境上CANN开发套件包的[安装方式](https://hiascend.com/document/redirect/CannCommunityInstSoftware),选择对应配置环境变量命令。 + - 默认路径,root用户安装CANN软件包 + ```bash - bash install.sh -v [SOC_VERSION] -i [ASCEND_INSTALL_PATH] + export ASCEND_INSTALL_PATH=/usr/local/Ascend/ascend-toolkit/latest ``` - 参数说明: - - SOC_VERSION:昇腾AI处理器型号,如果无法确定具体的[SOC_VERSION],则在安装昇腾AI处理器的服务器执行npu-smi info命令进行查询,在查询到的“Name”前增加Ascend信息,例如“Name”对应取值为xxxyy,实际配置的[SOC_VERSION]值为Ascendxxxyy。支持以下产品型号: - - Atlas 训练系列产品 - - Atlas 推理系列产品AI Core - - Atlas A2训练系列产品/Atlas 800I A2推理产品 - - Atlas 200/500 A2推理产品 - - ASCEND_INSTALL_PATH:CANN软件包安装路径 + - 默认路径,非root用户安装CANN软件包 - 脚本运行成功后,会在当前目录下创建CustomOp目录,编译完成后,会在CustomOp/build_out中,生成自定义算子安装包custom_opp_\_\.run,例如“custom_opp_ubuntu_x86_64.run”。 + ```bash + export ASCEND_INSTALL_PATH=$HOME/Ascend/ascend-toolkit/latest + ``` + - 指定路径install_path,安装CANN软件包 - 备注:如果要使用dump调试功能,需要移除op_host内的Atlas 训练系列产品、Atlas 200/500 A2 推理产品的配置项。 + ```bash + export 
ASCEND_INSTALL_PATH=${install_path}/ascend-toolkit/latest + ``` + + 运行install.sh脚本 + + ```bash + bash install.sh -v [SOC_VERSION] + ``` + - 方式二:指定命令行安装路径来运行脚本 + ```bash + bash install.sh -v [SOC_VERSION] -i [ASCEND_INSTALL_PATH] + ``` + + 参数说明: + + - SOC_VERSION:昇腾AI处理器型号,如果无法确定具体的[SOC_VERSION],则在安装昇腾AI处理器的服务器执行npu-smi info命令进行查询,在查询到的“Name”前增加Ascend信息,例如“Name”对应取值为xxxyy,实际配置的[SOC_VERSION]值为Ascendxxxyy。支持以下产品型号: + - Atlas A2训练系列产品/Atlas 800I A2推理产品 + - ASCEND_INSTALL_PATH:CANN软件包安装路径 + + 脚本运行成功后,会在当前目录下创建CustomOp目录,编译完成后,会在CustomOp/build_out中,生成自定义算子安装包custom_opp_\_\.run,例如“custom_opp_ubuntu_x86_64.run”。 + + 备注:如果要使用dump调试功能,需要移除op_host内的Atlas 训练系列产品、Atlas 200/500 A2 推理产品的配置项。 ### 3. 部署自定义算子包 + - 部署自定义算子包前,请确保存在自定义算子包默认部署路径环境变量ASCEND_OPP_PATH - ```bash - echo $ASCEND_OPP_PATH - # 输出示例 /usr/local/Ascend/ascend-toolkit/latest/opp - # 若没有,则需导出CANN环境变量 - source [ASCEND_INSTALL_PATH]/bin/setenv.bash - # 例如 source /usr/local/Ascend/ascend-toolkit/latest/bin/setenv.bash - ``` - 参数说明: - - ASCEND_INSTALL_PATH:CANN软件包安装路径,一般和上一步中指定的路径保持一致 + ```bash + echo $ASCEND_OPP_PATH + # 输出示例 /usr/local/Ascend/ascend-toolkit/latest/opp + + # 若没有,则需导出CANN环境变量 + source [ASCEND_INSTALL_PATH]/bin/setenv.bash + # 例如 source /usr/local/Ascend/ascend-toolkit/latest/bin/setenv.bash + ``` + 参数说明: + + - ASCEND_INSTALL_PATH:CANN软件包安装路径,一般和上一步中指定的路径保持一致 - 在自定义算子安装包所在路径下,执行如下命令安装自定义算子包 - ```bash - cd CustomOp/build_out - ./custom_opp__.run - ``` + + ```bash + cd CustomOp/build_out + ./custom_opp__.run + ``` + 命令执行成功后,自定义算子包中的相关文件将部署至opp算子库环境变量ASCEND_OPP_PATH指向的的vendors/customize目录中。 ### 4. 
调用执行算子工程 -- [aclnn调用AddCustom算子工程](./AclNNInvocation/README.md) -- [aclnn调用AddCustom算子工程(代码简化)](./AclNNInvocationNaive/README.md) -- [aclopExecuteV2模型调用AddCustom算子工程](./AclOfflineModel/README.md) -- [aclopCompile模型调用AddCustom算子工程](./AclOnlineModel/README.md) -- [cpp-extension模型调用AddCustom算子工程](./CppExtensions/README.md) -- [pytorch调用AddCustom算子工程](./PytorchInvocation/README.md) -- [tensorflow调用AddCustom算子工程](./TensorflowInvocation) + +- [aclnn调用AddsCustom算子工程](./AclNNInvocation/README.md) ## 更新说明 -| 时间 | 更新事项 | -| ---------- | ---------------------------- | -| 2023/10/24 | 新增TensorflowInvocation样例 | -| 2023/10/18 | 新增AclNNInvocation样例 | -| 2024/01/11 | 更改pytorch适配方式 | -| 2024/01/23 | 新增AclNNInvocationNaive样例 | -| 2024/05/22 | 修改readme结构 | -| 2024/11/11 | 样例目录调整 | -| 2024/11/18 | 算子工程改写为由msOpGen生成 | -| 2025/01/17 | 新增CppExtensionInvocation样例 | + + +| 时间 | 更新事项 | +| ---------- | -------- | +| 2025/07/03 | 新增样例 | -- Gitee From 1b97f5e80f6345110cf18f7b15f3ec9a1601408c Mon Sep 17 00:00:00 2001 From: zhanghao0689 Date: Thu, 3 Jul 2025 11:50:42 +0800 Subject: [PATCH 05/17] fix code and edit readme --- .../AclNNInvocation/README.md | 38 ++++++++----------- .../AclNNInvocation/inc/op_runner.h | 2 +- .../AddsCustom/op_host/adds_custom.cpp | 2 - .../15_mata_address_conflict/install.sh | 9 +++-- 4 files changed, 22 insertions(+), 29 deletions(-) diff --git a/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/README.md b/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/README.md index ea3502c15..32b014bdf 100644 --- a/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/README.md +++ b/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/README.md @@ -23,27 +23,21 @@ 自定义算子编译部署后,会自动生成单算子API,可以直接在应用程序中调用。算子API的形式一般定义为“两段式接口”,形如: ```cpp - // 获取算子使用的workspace空间大小 - aclnnStatus aclnnAddsCustomGetWorkspaceSize( - const aclTensor *x1, - const aclTensor *x2, - const aclTensor *biasOptional, - 
char *group, - char *reduceOp - bool isTransAOptional, - bool isTransBOptional, - int64_t commTurnOptional, - const aclTensor *yOut, - uint64_t *workspaceSize, - aclOpExecutor **executor); - // 执行算子 - aclnnStatus aclnnMatmulAllReduceCustom( - void *workspace, - uint64_t workspaceSize, - aclOpExecutor *executor, - const aclrtStream stream); + // 获取算子使用的workspace空间大小 + aclnnStatus aclnnAddsCustomGetWorkspaceSize( + const aclTensor *x, + int64_t caseId, + const aclTensor *out, + uint64_t *workspaceSize, + aclOpExecutor **executor); + // 执行算子 + aclnnStatus aclnnAddsCustom( + void *workspace, + uint64_t workspaceSize, + aclOpExecutor *executor, + aclrtStream stream); ``` -其中aclnnMatmulAllReduceCustomGetWorkspaceSize为第一段接口,主要用于计算本次API调用计算过程中需要多少的workspace内存。获取到本次API计算需要的workspace大小之后,按照workspaceSize大小申请Device侧内存,然后调用第二段接口aclnnMatmulAllReduceCustom执行计算。具体参考[单算子API调用](https://hiascend.com/document/redirect/CannCommunityAscendCInVorkSingleOp)章节。 +其中aclnnAddsCustomGetWorkspaceSize为第一段接口,主要用于计算本次API调用计算过程中需要多少的workspace内存。获取到本次API计算需要的workspace大小之后,按照workspaceSize大小申请Device侧内存,然后调用第二段接口aclnnAddsCustom执行计算。具体参考[单算子API调用](https://hiascend.com/document/redirect/CannCommunityAscendCInVorkSingleOp)章节。 ## 运行样例算子 ### 1. 
编译算子工程 @@ -54,7 +48,7 @@ 以命令行方式下载样例代码,master分支为例。 ```bash - cd ${git_clone_path}/samples/operator/ascendc/4_best_practices/23_matmul_all_reduce_custom/AclNNInvocation + cd ${git_clone_path}/samples/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation ``` - 样例执行 @@ -66,4 +60,4 @@ ## 更新说明 | 时间 | 更新事项 | | ---------- | ------------ | -| 2024/06/07 | 新增本readme | \ No newline at end of file +| 2025/07/03 | 新增本readme | \ No newline at end of file diff --git a/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/inc/op_runner.h b/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/inc/op_runner.h index 53629c4a8..31ac74ba2 100644 --- a/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/inc/op_runner.h +++ b/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/inc/op_runner.h @@ -163,7 +163,7 @@ public: * @brief Get case index * @return case index by user input */ - bool CompileDynamicOp(); + bool GetCaseId(); private: size_t numInputs_; diff --git a/operator/ascendc/4_best_practices/15_mata_address_conflict/AddsCustom/op_host/adds_custom.cpp b/operator/ascendc/4_best_practices/15_mata_address_conflict/AddsCustom/op_host/adds_custom.cpp index d4d09cb9a..404986b27 100644 --- a/operator/ascendc/4_best_practices/15_mata_address_conflict/AddsCustom/op_host/adds_custom.cpp +++ b/operator/ascendc/4_best_practices/15_mata_address_conflict/AddsCustom/op_host/adds_custom.cpp @@ -13,8 +13,6 @@ namespace optiling { const uint32_t BLOCK_DIM = 16; static ge::graphStatus TilingFunc(gert::TilingContext *context) { - TilingData tiling; - uint32_t totalLength = context->GetInputShape(0)->GetOriginShape().GetShapeSize(); context->SetBlockDim(BLOCK_DIM); auto attrs = context->GetAttrs(); const int64_t* tilingKey = attrs->GetInt(0); diff --git a/operator/ascendc/4_best_practices/15_mata_address_conflict/install.sh 
b/operator/ascendc/4_best_practices/15_mata_address_conflict/install.sh index 7dfec79a2..83e680d3d 100755 --- a/operator/ascendc/4_best_practices/15_mata_address_conflict/install.sh +++ b/operator/ascendc/4_best_practices/15_mata_address_conflict/install.sh @@ -25,7 +25,7 @@ while :; do esac done -VERSION_LIST="Ascend910A Ascend910B Ascend310B1 Ascend310B2 Ascend310B3 Ascend310B4 Ascend310P1 Ascend310P3 Ascend910B1 Ascend910B2 Ascend910B3 Ascend910B4" +VERSION_LIST="Ascend910B1 Ascend910B2 Ascend910B3 Ascend910B4" if [[ " $VERSION_LIST " != *" $SOC_VERSION "* ]]; then echo "ERROR: SOC_VERSION should be in [$VERSION_LIST]" exit -1 @@ -46,10 +46,11 @@ source $_ASCEND_INSTALL_PATH/bin/setenv.bash export ASCEND_HOME_PATH=$_ASCEND_INSTALL_PATH OP_NAME=AddsCustom -rm -rf CustomOp # Generate the op framework -msopgen gen -i $OP_NAME.json -c ai_core-${SOC_VERSION} -lan cpp -out CustomOp +rm -rf CustomOp && msopgen gen -i $OP_NAME.json -c ai_core-${SOC_VERSION} -lan cpp -out CustomOp # Copy op implementation files to CustomOp -cp -rf $OP_NAME/* CustomOp +rm -rf CustomOp/op_host/*.cpp && cp -rf $OP_NAME/op_host CustomOp/ +rm -rf CustomOp/op_kernel/*.h && cp -rf $OP_NAME/op_kernel CustomOp/ + # Build CustomOp project (cd CustomOp && bash build.sh) \ No newline at end of file -- Gitee From ee7ef61da6c88c874800a5796756d8b41e1f2afc Mon Sep 17 00:00:00 2001 From: zhanghao0689 Date: Thu, 3 Jul 2025 14:41:22 +0800 Subject: [PATCH 06/17] fix some typo --- .../AclNNInvocation/inc/common.h | 2 +- .../AclNNInvocation/inc/op_runner.h | 2 +- .../AclNNInvocation/inc/operator_desc.h | 2 +- .../AclNNInvocation/run.sh | 2 +- .../AclNNInvocation/scripts/gen_data.py | 2 +- .../AclNNInvocation/scripts/verify_result.py | 2 +- .../AclNNInvocation/src/CMakeLists.txt | 10 +++++----- .../AclNNInvocation/src/common.cpp | 2 +- .../AclNNInvocation/src/main.cpp | 4 +--- .../AclNNInvocation/src/op_runner.cpp | 2 +- .../AclNNInvocation/src/operator_desc.cpp | 2 +- .../AddsCustom/op_host/adds_custom.cpp | 
2 +- .../AddsCustom/op_kernel/adds_custom.cpp | 4 ++-- .../AddsCustom/op_kernel/adds_custom_v1.h | 8 +++++--- .../AddsCustom/op_kernel/adds_custom_v2.h | 16 +++++++++------- .../15_mata_address_conflict/README.md | 2 +- .../15_mata_address_conflict/install.sh | 6 ++++-- operator/ascendc/4_best_practices/README.md | 1 + 18 files changed, 38 insertions(+), 33 deletions(-) diff --git a/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/inc/common.h b/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/inc/common.h index 11bb4aeca..fadb5c808 100644 --- a/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/inc/common.h +++ b/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/inc/common.h @@ -1,7 +1,7 @@ /** * @file common.h * - * Copyright (C) 2023-2024. Huawei Technologies Co., Ltd. All rights reserved. + * Copyright (C) 2025. Huawei Technologies Co., Ltd. All rights reserved. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of diff --git a/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/inc/op_runner.h b/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/inc/op_runner.h index 31ac74ba2..538667d4f 100644 --- a/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/inc/op_runner.h +++ b/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/inc/op_runner.h @@ -1,7 +1,7 @@ /** * @file op_runner.h * - * Copyright (C) 2023-2024. Huawei Technologies Co., Ltd. All rights reserved. + * Copyright (C) 2025. Huawei Technologies Co., Ltd. All rights reserved. 
* * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of diff --git a/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/inc/operator_desc.h b/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/inc/operator_desc.h index 6d8ee0905..cf02d7cec 100644 --- a/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/inc/operator_desc.h +++ b/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/inc/operator_desc.h @@ -1,7 +1,7 @@ /** * @file operator_desc.h * - * Copyright (C) 2023-2024. Huawei Technologies Co., Ltd. All rights reserved. + * Copyright (C) 2025. Huawei Technologies Co., Ltd. All rights reserved. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of diff --git a/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/run.sh b/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/run.sh index 55120f18d..c59fe1426 100755 --- a/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/run.sh +++ b/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/run.sh @@ -57,7 +57,7 @@ function main { export ASCEND_TOOLKIT_HOME=${_ASCEND_INSTALL_PATH} cd $CURRENT_DIR/output echo "INFO: execute op!" - msprof op --launch-count=2 --output=./prof ./execute_add_op + msprof op --launch-count=2 --output=./prof ./execute_adds_op if [ $? -ne 0 ]; then echo "ERROR: acl executable run failed! please check your project!" 
return 1 diff --git a/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/scripts/gen_data.py b/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/scripts/gen_data.py index 3a5a2a1f4..9c4ecbe6e 100644 --- a/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/scripts/gen_data.py +++ b/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/scripts/gen_data.py @@ -1,7 +1,7 @@ #!/usr/bin/python3 # coding=utf-8 # -# Copyright (C) 2023-2024. Huawei Technologies Co., Ltd. All rights reserved. +# Copyright (C) 2025. Huawei Technologies Co., Ltd. All rights reserved. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of diff --git a/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/scripts/verify_result.py b/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/scripts/verify_result.py index 6a38a3b2b..a5019f30f 100644 --- a/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/scripts/verify_result.py +++ b/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/scripts/verify_result.py @@ -1,7 +1,7 @@ #!/usr/bin/python3 # coding=utf-8 # -# Copyright (C) 2023-2024. Huawei Technologies Co., Ltd. All rights reserved. +# Copyright (C) 2025. Huawei Technologies Co., Ltd. All rights reserved. 
# # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of diff --git a/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/src/CMakeLists.txt b/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/src/CMakeLists.txt index 8e9e45375..8d0ae1bd3 100644 --- a/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/src/CMakeLists.txt +++ b/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/src/CMakeLists.txt @@ -1,10 +1,10 @@ -# Copyright (c) Huawei Technologies Co., Ltd. 2020. All rights reserved. +# Copyright (c) Huawei Technologies Co., Ltd. 2025. All rights reserved. # CMake lowest version requirement cmake_minimum_required(VERSION 3.5.1) # project information -project(acl_execute_add) +project(acl_execute_adds) # Compile options add_compile_options(-std=c++11) @@ -47,14 +47,14 @@ link_directories( ${CUST_PKG_PATH}/lib ) -add_executable(execute_add_op +add_executable(execute_adds_op operator_desc.cpp op_runner.cpp main.cpp common.cpp ) -target_link_libraries(execute_add_op +target_link_libraries(execute_adds_op ascendcl cust_opapi acl_op_compiler @@ -62,4 +62,4 @@ target_link_libraries(execute_add_op stdc++ ) -install(TARGETS execute_add_op DESTINATION ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}) +install(TARGETS execute_adds_op DESTINATION ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}) diff --git a/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/src/common.cpp b/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/src/common.cpp index 992759c95..d58716122 100644 --- a/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/src/common.cpp +++ b/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/src/common.cpp @@ -1,7 +1,7 @@ /** * @file common.cpp * - * Copyright (C) 2023-2024. Huawei Technologies Co., Ltd. All rights reserved. 
+ * Copyright (C) 2025. Huawei Technologies Co., Ltd. All rights reserved. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of diff --git a/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/src/main.cpp b/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/src/main.cpp index 1f075370d..0df470472 100644 --- a/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/src/main.cpp +++ b/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/src/main.cpp @@ -1,7 +1,7 @@ /** * @file main.cpp * - * Copyright (C) 2023-2024. Huawei Technologies Co., Ltd. All rights reserved. + * Copyright (C) 2025. Huawei Technologies Co., Ltd. All rights reserved. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of @@ -10,10 +10,8 @@ #include #include #include - #include #include - #include "acl/acl.h" #include "common.h" #include "op_runner.h" diff --git a/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/src/op_runner.cpp b/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/src/op_runner.cpp index e1d70fdb8..fcdbe67eb 100644 --- a/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/src/op_runner.cpp +++ b/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/src/op_runner.cpp @@ -1,7 +1,7 @@ /** * @file op_runner.cpp * - * Copyright (C) 2023-2024. Huawei Technologies Co., Ltd. All rights reserved. + * Copyright (C) 2025. Huawei Technologies Co., Ltd. All rights reserved. 
* * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of diff --git a/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/src/operator_desc.cpp b/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/src/operator_desc.cpp index da04cf6c9..90e0ac343 100644 --- a/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/src/operator_desc.cpp +++ b/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/src/operator_desc.cpp @@ -1,7 +1,7 @@ /** * @file operator_desc.cpp * - * Copyright (C) 2023-2024. Huawei Technologies Co., Ltd. All rights reserved. + * Copyright (C) 2025. Huawei Technologies Co., Ltd. All rights reserved. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of diff --git a/operator/ascendc/4_best_practices/15_mata_address_conflict/AddsCustom/op_host/adds_custom.cpp b/operator/ascendc/4_best_practices/15_mata_address_conflict/AddsCustom/op_host/adds_custom.cpp index 404986b27..cad1dd22a 100644 --- a/operator/ascendc/4_best_practices/15_mata_address_conflict/AddsCustom/op_host/adds_custom.cpp +++ b/operator/ascendc/4_best_practices/15_mata_address_conflict/AddsCustom/op_host/adds_custom.cpp @@ -1,7 +1,7 @@ /** * @file adds_custom.cpp * - * Copyright (C) 2023-2024. Huawei Technologies Co., Ltd. All rights reserved. + * Copyright (C) 2025. Huawei Technologies Co., Ltd. All rights reserved. 
* * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of diff --git a/operator/ascendc/4_best_practices/15_mata_address_conflict/AddsCustom/op_kernel/adds_custom.cpp b/operator/ascendc/4_best_practices/15_mata_address_conflict/AddsCustom/op_kernel/adds_custom.cpp index 3fee53218..d7d39a321 100644 --- a/operator/ascendc/4_best_practices/15_mata_address_conflict/AddsCustom/op_kernel/adds_custom.cpp +++ b/operator/ascendc/4_best_practices/15_mata_address_conflict/AddsCustom/op_kernel/adds_custom.cpp @@ -1,7 +1,7 @@ /** - * @file add_custom.cpp + * @file adds_custom.cpp * - * Copyright (C) 2022-2024. Huawei Technologies Co., Ltd. All rights reserved. + * Copyright (C) 2025. Huawei Technologies Co., Ltd. All rights reserved. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of diff --git a/operator/ascendc/4_best_practices/15_mata_address_conflict/AddsCustom/op_kernel/adds_custom_v1.h b/operator/ascendc/4_best_practices/15_mata_address_conflict/AddsCustom/op_kernel/adds_custom_v1.h index a06826e24..ee0feb048 100644 --- a/operator/ascendc/4_best_practices/15_mata_address_conflict/AddsCustom/op_kernel/adds_custom_v1.h +++ b/operator/ascendc/4_best_practices/15_mata_address_conflict/AddsCustom/op_kernel/adds_custom_v1.h @@ -1,5 +1,5 @@ /** - * @file add_custom_v1.cpp + * @file add_custom_v1.h * * Copyright (C) 2025. Huawei Technologies Co., Ltd. All rights reserved. 
* @@ -15,10 +15,10 @@ public: __aicore__ inline KernelAddsV1() {} __aicore__ inline void Init(GM_ADDR x, GM_ADDR z) { - // the gm address conflict happens when L2 cache miss - // so we disable the L2 cache mode to show the xGm.SetGlobalBuffer((__gm__ float *)x + AscendC::GetBlockIdx() * TILE_N); zGm.SetGlobalBuffer((__gm__ float *)z + AscendC::GetBlockIdx() * TILE_N); + // the gm address conflict happens when L2 cache miss and multi cores visit the same addr range(512Bytes) + // so we disable the L2 cache mode to highlight the influence of the gm address conflict xGm.SetL2CacheHint(AscendC::CacheMode::CACHE_MODE_DISABLE); zGm.SetL2CacheHint(AscendC::CacheMode::CACHE_MODE_DISABLE); pipe.InitBuffer(inQueueX, BUFFER_NUM, TILE_M * TILE_N * sizeof(float)); @@ -27,6 +27,8 @@ public: __aicore__ inline void Process() { for (int32_t i = 0; i < LOOP_ONE_CORE; i++) { + // the following two SyncAll in this case are unnecessary actually, + // we just used them to highlight the influence of gm address conflict in each loop AscendC::SyncAll(); CopyIn(i); Compute(); diff --git a/operator/ascendc/4_best_practices/15_mata_address_conflict/AddsCustom/op_kernel/adds_custom_v2.h b/operator/ascendc/4_best_practices/15_mata_address_conflict/AddsCustom/op_kernel/adds_custom_v2.h index 2dabb2c4f..91089d5f8 100644 --- a/operator/ascendc/4_best_practices/15_mata_address_conflict/AddsCustom/op_kernel/adds_custom_v2.h +++ b/operator/ascendc/4_best_practices/15_mata_address_conflict/AddsCustom/op_kernel/adds_custom_v2.h @@ -1,5 +1,5 @@ /** - * @file adds_custom_v2.cpp + * @file adds_custom_v2.h * * Copyright (C) 2025. Huawei Technologies Co., Ltd. All rights reserved. 
* @@ -15,10 +15,10 @@ public: __aicore__ inline KernelAddsV2() {} __aicore__ inline void Init(GM_ADDR x, GM_ADDR z) { - // the gm address conflict happens when L2 cache miss - // so we disable the L2 cache mode to show the xGm.SetGlobalBuffer((__gm__ float *)x + AscendC::GetBlockIdx() * TILE_N); zGm.SetGlobalBuffer((__gm__ float *)z + AscendC::GetBlockIdx() * TILE_N); + // the gm address conflict happens when L2 cache miss and multi cores visit the same addr range(512Bytes) + // so we disable the L2 cache mode to highlight the influence of the gm address conflict xGm.SetL2CacheHint(AscendC::CacheMode::CACHE_MODE_DISABLE); zGm.SetL2CacheHint(AscendC::CacheMode::CACHE_MODE_DISABLE); pipe.InitBuffer(inQueueX, BUFFER_NUM, TILE_M * TILE_N * sizeof(float)); @@ -27,12 +27,14 @@ public: __aicore__ inline void Process() { for (int32_t i = 0; i < LOOP_ONE_CORE; i++) { - // to avoid the gm address conflict: - // the loop order core0 : 0, 1, 2, 3, ..., 13, 14, 15 - // the loop order core1 : 1, 2, 3, 4, ..., 14, 15, 0 + // adjust the loop order to avoid the gm address conflict: + // the loop order of core0 : 0, 1, 2, 3, ..., 13, 14, 15 + // the loop order of core1 : 1, 2, 3, 4, ..., 14, 15, 0 // ... 
- // the loop order core15 : 15, 0, 1, 2, ..., 12, 13, 14 + // the loop order of core15 : 15, 0, 1, 2, ..., 12, 13, 14 int32_t newProgress = (i + AscendC::GetBlockIdx()) % LOOP_ONE_CORE; + // the following two SyncAll in this case are unnecessary actually, + // we just used them to highlight the influence of gm address conflict in each loop AscendC::SyncAll(); CopyIn(newProgress); Compute(); diff --git a/operator/ascendc/4_best_practices/15_mata_address_conflict/README.md b/operator/ascendc/4_best_practices/15_mata_address_conflict/README.md index dbbd80d3c..d58e5cc5b 100644 --- a/operator/ascendc/4_best_practices/15_mata_address_conflict/README.md +++ b/operator/ascendc/4_best_practices/15_mata_address_conflict/README.md @@ -14,7 +14,7 @@ ## 算子描述 -Adds算子实现了一个Tensor与标量(值为2.0)相加,返回相加结果的功能。对应的数学表达式为: +Adds算子实现了一个Tensor与标量值2.0相加,返回相加结果的功能。对应的数学表达式为: ``` z = x + 2.0 diff --git a/operator/ascendc/4_best_practices/15_mata_address_conflict/install.sh b/operator/ascendc/4_best_practices/15_mata_address_conflict/install.sh index 83e680d3d..24a0c35a2 100755 --- a/operator/ascendc/4_best_practices/15_mata_address_conflict/install.sh +++ b/operator/ascendc/4_best_practices/15_mata_address_conflict/install.sh @@ -49,8 +49,10 @@ OP_NAME=AddsCustom # Generate the op framework rm -rf CustomOp && msopgen gen -i $OP_NAME.json -c ai_core-${SOC_VERSION} -lan cpp -out CustomOp # Copy op implementation files to CustomOp -rm -rf CustomOp/op_host/*.cpp && cp -rf $OP_NAME/op_host CustomOp/ -rm -rf CustomOp/op_kernel/*.h && cp -rf $OP_NAME/op_kernel CustomOp/ +rm -rf CustomOp/op_host/*.cpp +rm -rf CustomOp/op_kernel/*.h && rm -rf CustomOp/op_kernel/*.cpp +cp -rf $OP_NAME/op_kernel CustomOp/ +cp -rf $OP_NAME/op_host CustomOp/ # Build CustomOp project (cd CustomOp && bash build.sh) \ No newline at end of file diff --git a/operator/ascendc/4_best_practices/README.md b/operator/ascendc/4_best_practices/README.md index f5379bbbf..c40fe61a7 100644 --- a/operator/ascendc/4_best_practices/README.md 
+++ b/operator/ascendc/4_best_practices/README.md @@ -8,6 +8,7 @@ | ------------------------------- | ------------------------------------------ | ------------------------------------------ | | [4_bank_conflict](./4_bank_conflict) | 基于Ascend C的bank冲突性能优化样例 | Atlas A2训练系列产品/Atlas 800I A2推理产品 | | [6_group_matmul](./6_group_matmul) | 基于Ascend C的group matmul算子性能优化样例 | Atlas A2训练系列产品/Atlas 800I A2推理产品 | +| [15_mata_address_conflict](./15_mata_address_conflict) | 基于Ascend C的同地址冲突性能优化样例 | Atlas A2训练系列产品/Atlas 800I A2推理产品 | | [21_all_gather_matmul_custom](./21_all_gather_matmul_custom) | 基于Ascend C的AllGatherMatmul算子性能调优样例 | Atlas A2训练系列产品 | | [22_matmul_reduce_scatter_custom](./22_matmul_reduce_scatter_custom) | 基于Ascend C的MatmulReduceScatter算子性能调优样例 | Atlas A2训练系列产品 | | [23_matmul_all_reduce_custom](./23_matmul_all_reduce_custom) | 基于Ascend C的MatmulAllReduce算子性能调优样例 | Atlas A2训练系列产品/Atlas 800I A2推理产品 | -- Gitee From 762a72924c43522474ea46da089d09f041acf13b Mon Sep 17 00:00:00 2001 From: zhanghao0689 Date: Fri, 4 Jul 2025 14:25:40 +0800 Subject: [PATCH 07/17] change to new tiling --- .../AddsCustom/op_host/adds_custom.cpp | 35 +++++++++++++++-- .../AddsCustom/op_kernel/adds_custom.cpp | 8 ++-- .../AddsCustom/op_kernel/adds_custom_tiling.h | 22 +++++++++++ .../AddsCustom/op_kernel/adds_custom_v1.h | 36 ++++++++++-------- .../AddsCustom/op_kernel/adds_custom_v2.h | 38 +++++++++++-------- .../AddsCustom/op_kernel/common_info.h | 8 ---- 6 files changed, 101 insertions(+), 46 deletions(-) create mode 100644 operator/ascendc/4_best_practices/15_mata_address_conflict/AddsCustom/op_kernel/adds_custom_tiling.h delete mode 100644 operator/ascendc/4_best_practices/15_mata_address_conflict/AddsCustom/op_kernel/common_info.h diff --git a/operator/ascendc/4_best_practices/15_mata_address_conflict/AddsCustom/op_host/adds_custom.cpp b/operator/ascendc/4_best_practices/15_mata_address_conflict/AddsCustom/op_host/adds_custom.cpp index cad1dd22a..4c001c945 100644 --- 
a/operator/ascendc/4_best_practices/15_mata_address_conflict/AddsCustom/op_host/adds_custom.cpp +++ b/operator/ascendc/4_best_practices/15_mata_address_conflict/AddsCustom/op_host/adds_custom.cpp @@ -7,16 +7,43 @@ * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. */ +#include "../op_kernel/adds_custom_tiling.h" #include "register/op_def_registry.h" namespace optiling { -const uint32_t BLOCK_DIM = 16; static ge::graphStatus TilingFunc(gert::TilingContext *context) { - context->SetBlockDim(BLOCK_DIM); + constexpr uint32_t BLOCK_DIM = 16; + constexpr int32_t M = 8192; + constexpr int32_t N = 128; + + // set tiling_key auto attrs = context->GetAttrs(); - const int64_t* tilingKey = attrs->GetInt(0); - context->SetTilingKey(*tilingKey); + const int64_t* caseId = attrs->GetInt(0); + int32_t tileM = 0; + int32_t tileN = 0; + int32_t loopOneCore = 0; + context->SetBlockDim(BLOCK_DIM); + context->SetTilingKey(*caseId); + if ((*caseId == 1) || (*caseId == 2)) { + tileM = 512; + tileN = 8; + loopOneCore = M / tileM; + } else if (*caseId == 3) { + // + } + + AddsCustomTilingData *tiling = context->GetTilingData(); + tiling->m = M; + tiling->n = N; + tiling->tileM = tileM; + tiling->tileN = tileN; + tiling->loopOneCore = loopOneCore; + + // set workspace size + size_t *currentWorkspace = context->GetWorkspaceSizes(1); + currentWorkspace[0] = 0; + return ge::GRAPH_SUCCESS; } } // namespace optiling diff --git a/operator/ascendc/4_best_practices/15_mata_address_conflict/AddsCustom/op_kernel/adds_custom.cpp b/operator/ascendc/4_best_practices/15_mata_address_conflict/AddsCustom/op_kernel/adds_custom.cpp index d7d39a321..e4724322e 100644 --- a/operator/ascendc/4_best_practices/15_mata_address_conflict/AddsCustom/op_kernel/adds_custom.cpp +++ b/operator/ascendc/4_best_practices/15_mata_address_conflict/AddsCustom/op_kernel/adds_custom.cpp @@ -8,20 +8,22 @@ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
*/ #include "kernel_operator.h" -#include "common_info.h" #include "adds_custom_v1.h" #include "adds_custom_v2.h" extern "C" __global__ __aicore__ void adds_custom(GM_ADDR x, GM_ADDR z, GM_ADDR workspace, GM_ADDR tiling) { + + REGISTER_TILING_DEFAULT(AddsCustomTilingData); + GET_TILING_DATA(tilingData, tiling); KERNEL_TASK_TYPE_DEFAULT(KERNEL_TYPE_MIX_AIV_1_0); if (TILING_KEY_IS(1UL)) { KernelAddsV1 op; - op.Init(x, z); + op.Init(x, z, &tilingData); op.Process(); } else if (TILING_KEY_IS(2UL)) { KernelAddsV2 op; - op.Init(x, z); + op.Init(x, z, &tilingData); op.Process(); } } diff --git a/operator/ascendc/4_best_practices/15_mata_address_conflict/AddsCustom/op_kernel/adds_custom_tiling.h b/operator/ascendc/4_best_practices/15_mata_address_conflict/AddsCustom/op_kernel/adds_custom_tiling.h new file mode 100644 index 000000000..ba8557905 --- /dev/null +++ b/operator/ascendc/4_best_practices/15_mata_address_conflict/AddsCustom/op_kernel/adds_custom_tiling.h @@ -0,0 +1,22 @@ +/** + * @file adds_custom_tiling.h + * + * Copyright (C) 2025. Huawei Technologies Co., Ltd. All rights reserved. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
+ */ +#ifndef ADDS_CUSTOM_TILING_H +#define ADDS_CUSTOM_TILING_H +#include + +class AddsCustomTilingData { +public: + uint32_t m; + uint32_t n; + uint32_t tileM; + uint32_t tileN; + uint32_t loopOneCore; +}; +#endif // ADDS_CUSTOM_TILING_H diff --git a/operator/ascendc/4_best_practices/15_mata_address_conflict/AddsCustom/op_kernel/adds_custom_v1.h b/operator/ascendc/4_best_practices/15_mata_address_conflict/AddsCustom/op_kernel/adds_custom_v1.h index ee0feb048..c4211bec9 100644 --- a/operator/ascendc/4_best_practices/15_mata_address_conflict/AddsCustom/op_kernel/adds_custom_v1.h +++ b/operator/ascendc/4_best_practices/15_mata_address_conflict/AddsCustom/op_kernel/adds_custom_v1.h @@ -8,25 +8,27 @@ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. */ #include "kernel_operator.h" +#include "adds_custom_tiling.h" using AscendC::TPosition; class KernelAddsV1 { public: __aicore__ inline KernelAddsV1() {} - __aicore__ inline void Init(GM_ADDR x, GM_ADDR z) +__aicore__ inline void Init(GM_ADDR x, GM_ADDR z, AddsCustomTilingData* tilingPtr) { - xGm.SetGlobalBuffer((__gm__ float *)x + AscendC::GetBlockIdx() * TILE_N); - zGm.SetGlobalBuffer((__gm__ float *)z + AscendC::GetBlockIdx() * TILE_N); + tiling = tilingPtr; + xGm.SetGlobalBuffer((__gm__ float *)x + AscendC::GetBlockIdx() * tiling->tileN); + zGm.SetGlobalBuffer((__gm__ float *)z + AscendC::GetBlockIdx() * tiling->tileN); // the gm address conflict happens when L2 cache miss and multi cores visit the same addr range(512Bytes) // so we disable the L2 cache mode to highlight the influence of the gm address conflict xGm.SetL2CacheHint(AscendC::CacheMode::CACHE_MODE_DISABLE); zGm.SetL2CacheHint(AscendC::CacheMode::CACHE_MODE_DISABLE); - pipe.InitBuffer(inQueueX, BUFFER_NUM, TILE_M * TILE_N * sizeof(float)); - pipe.InitBuffer(outQueueZ, BUFFER_NUM, TILE_M * TILE_N * sizeof(float)); + pipe.InitBuffer(inQueueX, BUFFER_NUM, tiling->tileM * tiling->tileN * sizeof(float)); + pipe.InitBuffer(outQueueZ, BUFFER_NUM, 
tiling->tileM * tiling->tileN * sizeof(float)); } __aicore__ inline void Process() { - for (int32_t i = 0; i < LOOP_ONE_CORE; i++) { + for (int32_t i = 0; i < tiling->loopOneCore; i++) { // the following two SyncAll in this case are unnecessary actually, // we just used them to highlight the influence of gm address conflict in each loop AscendC::SyncAll(); @@ -42,11 +44,11 @@ private: { AscendC::LocalTensor xLocal = inQueueX.AllocTensor(); AscendC::DataCopyParams params; - params.blockCount = TILE_M; - params.blockLen = TILE_N * sizeof(float) / BLOCK_SIZE; - params.srcStride = (N - TILE_N) * sizeof(float) / BLOCK_SIZE; + params.blockCount = tiling->tileM; + params.blockLen = tiling->tileN * sizeof(float) / BLOCK_SIZE; + params.srcStride = (tiling->n - tiling->tileN) * sizeof(float) / BLOCK_SIZE; params.dstStride = 0; - AscendC::DataCopy(xLocal, xGm[progress * TILE_M * N], params); + AscendC::DataCopy(xLocal, xGm[progress * tiling->tileM * tiling->n], params); inQueueX.EnQue(xLocal); } __aicore__ inline void Compute() @@ -54,7 +56,7 @@ private: AscendC::LocalTensor xLocal = inQueueX.DeQue(); AscendC::LocalTensor zLocal = outQueueZ.AllocTensor(); constexpr float scale = 2.0; - AscendC::Adds(zLocal, xLocal, scale, TILE_M * TILE_N); + AscendC::Adds(zLocal, xLocal, scale, tiling->tileM * tiling->tileN); outQueueZ.EnQue(zLocal); inQueueX.FreeTensor(xLocal); } @@ -62,18 +64,22 @@ private: { AscendC::LocalTensor zLocal = outQueueZ.DeQue(); AscendC::DataCopyParams params; - params.blockCount = TILE_M; - params.blockLen = TILE_N * sizeof(float) / BLOCK_SIZE; + params.blockCount = tiling->tileM; + params.blockLen = tiling->tileN * sizeof(float) / BLOCK_SIZE; params.srcStride = 0; - params.dstStride = (N - TILE_N) * sizeof(float) / BLOCK_SIZE; - AscendC::DataCopy(zGm[progress * TILE_M * N], zLocal, params); + params.dstStride = (tiling->n - tiling->tileN) * sizeof(float) / BLOCK_SIZE; + AscendC::DataCopy(zGm[progress * tiling->tileM * tiling->n], zLocal, params); 
outQueueZ.FreeTensor(zLocal); } private: + static constexpr int32_t BUFFER_NUM = 2; + static constexpr int32_t BLOCK_SIZE = 32; + AscendC::TPipe pipe; AscendC::TQue inQueueX; AscendC::TQue outQueueZ; AscendC::GlobalTensor xGm; AscendC::GlobalTensor zGm; + AddsCustomTilingData* tiling; }; diff --git a/operator/ascendc/4_best_practices/15_mata_address_conflict/AddsCustom/op_kernel/adds_custom_v2.h b/operator/ascendc/4_best_practices/15_mata_address_conflict/AddsCustom/op_kernel/adds_custom_v2.h index 91089d5f8..e89ef87ea 100644 --- a/operator/ascendc/4_best_practices/15_mata_address_conflict/AddsCustom/op_kernel/adds_custom_v2.h +++ b/operator/ascendc/4_best_practices/15_mata_address_conflict/AddsCustom/op_kernel/adds_custom_v2.h @@ -8,31 +8,33 @@ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. */ #include "kernel_operator.h" +#include "adds_custom_tiling.h" using AscendC::TPosition; class KernelAddsV2 { public: __aicore__ inline KernelAddsV2() {} - __aicore__ inline void Init(GM_ADDR x, GM_ADDR z) + __aicore__ inline void Init(GM_ADDR x, GM_ADDR z, AddsCustomTilingData* tilingPtr) { - xGm.SetGlobalBuffer((__gm__ float *)x + AscendC::GetBlockIdx() * TILE_N); - zGm.SetGlobalBuffer((__gm__ float *)z + AscendC::GetBlockIdx() * TILE_N); + tiling = tilingPtr; + xGm.SetGlobalBuffer((__gm__ float *)x + AscendC::GetBlockIdx() * tiling->tileN); + zGm.SetGlobalBuffer((__gm__ float *)z + AscendC::GetBlockIdx() * tiling->tileN); // the gm address conflict happens when L2 cache miss and multi cores visit the same addr range(512Bytes) // so we disable the L2 cache mode to highlight the influence of the gm address conflict xGm.SetL2CacheHint(AscendC::CacheMode::CACHE_MODE_DISABLE); zGm.SetL2CacheHint(AscendC::CacheMode::CACHE_MODE_DISABLE); - pipe.InitBuffer(inQueueX, BUFFER_NUM, TILE_M * TILE_N * sizeof(float)); - pipe.InitBuffer(outQueueZ, BUFFER_NUM, TILE_M * TILE_N * sizeof(float)); + pipe.InitBuffer(inQueueX, BUFFER_NUM, tiling->tileM * tiling->tileN * sizeof(float)); + 
pipe.InitBuffer(outQueueZ, BUFFER_NUM, tiling->tileM * tiling->tileN * sizeof(float)); } __aicore__ inline void Process() { - for (int32_t i = 0; i < LOOP_ONE_CORE; i++) { + for (int32_t i = 0; i < tiling->loopOneCore; i++) { // adjust the loop order to avoid the gm address conflict: // the loop order of core0 : 0, 1, 2, 3, ..., 13, 14, 15 // the loop order of core1 : 1, 2, 3, 4, ..., 14, 15, 0 // ... // the loop order of core15 : 15, 0, 1, 2, ..., 12, 13, 14 - int32_t newProgress = (i + AscendC::GetBlockIdx()) % LOOP_ONE_CORE; + int32_t newProgress = (i + AscendC::GetBlockIdx()) % tiling->loopOneCore; // the following two SyncAll in this case are unnecessary actually, // we just used them to highlight the influence of gm address conflict in each loop AscendC::SyncAll(); @@ -48,11 +50,11 @@ private: { AscendC::LocalTensor xLocal = inQueueX.AllocTensor(); AscendC::DataCopyParams params; - params.blockCount = TILE_M; - params.blockLen = TILE_N * sizeof(float) / BLOCK_SIZE; - params.srcStride = (N - TILE_N) * sizeof(float) / BLOCK_SIZE; + params.blockCount = tiling->tileM; + params.blockLen = tiling->tileN * sizeof(float) / BLOCK_SIZE; + params.srcStride = (tiling->n - tiling->tileN) * sizeof(float) / BLOCK_SIZE; params.dstStride = 0; - AscendC::DataCopy(xLocal, xGm[progress * TILE_M * N], params); + AscendC::DataCopy(xLocal, xGm[progress * tiling->tileM * tiling->n], params); inQueueX.EnQue(xLocal); } __aicore__ inline void Compute() @@ -60,7 +62,7 @@ private: AscendC::LocalTensor xLocal = inQueueX.DeQue(); AscendC::LocalTensor zLocal = outQueueZ.AllocTensor(); constexpr float scale = 2.0; - AscendC::Adds(zLocal, xLocal, scale, TILE_M * TILE_N); + AscendC::Adds(zLocal, xLocal, scale, tiling->tileM * tiling->tileN); outQueueZ.EnQue(zLocal); inQueueX.FreeTensor(xLocal); } @@ -68,18 +70,22 @@ private: { AscendC::LocalTensor zLocal = outQueueZ.DeQue(); AscendC::DataCopyParams params; - params.blockCount = TILE_M; - params.blockLen = TILE_N * sizeof(float) / BLOCK_SIZE; + 
params.blockCount = tiling->tileM; + params.blockLen = tiling->tileN * sizeof(float) / BLOCK_SIZE; params.srcStride = 0; - params.dstStride = (N - TILE_N) * sizeof(float) / BLOCK_SIZE; - AscendC::DataCopy(zGm[progress * TILE_M * N], zLocal, params); + params.dstStride = (tiling->n - tiling->tileN) * sizeof(float) / BLOCK_SIZE; + AscendC::DataCopy(zGm[progress * tiling->tileM * tiling->n], zLocal, params); outQueueZ.FreeTensor(zLocal); } private: + static constexpr int32_t BUFFER_NUM = 2; + static constexpr int32_t BLOCK_SIZE = 32; + AscendC::TPipe pipe; AscendC::TQue inQueueX; AscendC::TQue outQueueZ; AscendC::GlobalTensor xGm; AscendC::GlobalTensor zGm; + AddsCustomTilingData* tiling; }; diff --git a/operator/ascendc/4_best_practices/15_mata_address_conflict/AddsCustom/op_kernel/common_info.h b/operator/ascendc/4_best_practices/15_mata_address_conflict/AddsCustom/op_kernel/common_info.h deleted file mode 100644 index f0a4da5fd..000000000 --- a/operator/ascendc/4_best_practices/15_mata_address_conflict/AddsCustom/op_kernel/common_info.h +++ /dev/null @@ -1,8 +0,0 @@ -static constexpr int32_t M = 8192; -static constexpr int32_t N = 128; -static constexpr int32_t TILE_M = 512; -static constexpr int32_t TILE_N = 8; -static constexpr int32_t USED_CORE_NUM = N / TILE_N; -static constexpr int32_t LOOP_ONE_CORE = M / TILE_M; -static constexpr int32_t BUFFER_NUM = 2; -static constexpr int32_t BLOCK_SIZE = 32; \ No newline at end of file -- Gitee From cdcbe7d20c2fa51006a9e2d0391901b467bb4246 Mon Sep 17 00:00:00 2001 From: zhanghao0689 Date: Fri, 4 Jul 2025 15:30:34 +0800 Subject: [PATCH 08/17] fix bugs --- .../AclNNInvocation/inc/op_runner.h | 2 +- .../AclNNInvocation/run.sh | 3 +- .../AclNNInvocation/src/main.cpp | 6 ++ .../AddsCustom/op_kernel/adds_csutom_v3.h | 85 +++++++++++++++++++ .../AddsCustom/op_kernel/adds_custom.cpp | 5 ++ .../AddsCustom/op_kernel/adds_custom_v1.h | 4 +- 6 files changed, 101 insertions(+), 4 deletions(-) create mode 100644 
operator/ascendc/4_best_practices/15_mata_address_conflict/AddsCustom/op_kernel/adds_csutom_v3.h diff --git a/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/inc/op_runner.h b/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/inc/op_runner.h index 538667d4f..7b98d5730 100644 --- a/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/inc/op_runner.h +++ b/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/inc/op_runner.h @@ -163,7 +163,7 @@ public: * @brief Get case index * @return case index by user input */ - bool GetCaseId(); + int64_t GetCaseId(); private: size_t numInputs_; diff --git a/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/run.sh b/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/run.sh index c59fe1426..827ea801f 100755 --- a/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/run.sh +++ b/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/run.sh @@ -57,7 +57,7 @@ function main { export ASCEND_TOOLKIT_HOME=${_ASCEND_INSTALL_PATH} cd $CURRENT_DIR/output echo "INFO: execute op!" - msprof op --launch-count=2 --output=./prof ./execute_adds_op + msprof op --launch-count=3 --output=./prof ./execute_adds_op if [ $? -ne 0 ]; then echo "ERROR: acl executable run failed! please check your project!" return 1 @@ -68,6 +68,7 @@ function main { cd $CURRENT_DIR python3 scripts/verify_result.py output/output_z_1.bin output/golden.bin python3 scripts/verify_result.py output/output_z_2.bin output/golden.bin + python3 scripts/verify_result.py output/output_z_3.bin output/golden.bin if [ $? -ne 0 ]; then echo "ERROR: verify result failed!" 
return 1 diff --git a/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/src/main.cpp b/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/src/main.cpp index 0df470472..8e9b87f87 100644 --- a/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/src/main.cpp +++ b/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/src/main.cpp @@ -151,6 +151,12 @@ int main(int argc, char **argv) return FAILED; } + caseId = 3; + if (!RunOp(caseId)) { + DestroyResource(); + return FAILED; + } + DestroyResource(); return SUCCESS; } diff --git a/operator/ascendc/4_best_practices/15_mata_address_conflict/AddsCustom/op_kernel/adds_csutom_v3.h b/operator/ascendc/4_best_practices/15_mata_address_conflict/AddsCustom/op_kernel/adds_csutom_v3.h new file mode 100644 index 000000000..54e0d2384 --- /dev/null +++ b/operator/ascendc/4_best_practices/15_mata_address_conflict/AddsCustom/op_kernel/adds_csutom_v3.h @@ -0,0 +1,85 @@ +/** + * @file add_custom_v3.h + * + * Copyright (C) 2025. Huawei Technologies Co., Ltd. All rights reserved. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
+ */ +#include "kernel_operator.h" +#include "adds_custom_tiling.h" + +using AscendC::TPosition; +class KernelAddsV3 { +public: + __aicore__ inline KernelAddsV3() {} +__aicore__ inline void Init(GM_ADDR x, GM_ADDR z, AddsCustomTilingData* tilingPtr) + { + tiling = tilingPtr; + xGm.SetGlobalBuffer((__gm__ float *)x + AscendC::GetBlockIdx() * tiling->tileM * tiling->n); + zGm.SetGlobalBuffer((__gm__ float *)z + AscendC::GetBlockIdx() * tiling->tileM * tiling->n); + // the gm address conflict happens when multi cores visit the same addr range(512Bytes) + // we disable the L2 cache mode to highlight the influence of the gm address conflict + xGm.SetL2CacheHint(AscendC::CacheMode::CACHE_MODE_DISABLE); + zGm.SetL2CacheHint(AscendC::CacheMode::CACHE_MODE_DISABLE); + pipe.InitBuffer(inQueueX, BUFFER_NUM, tiling->tileM * tiling->tileN * sizeof(float)); + pipe.InitBuffer(outQueueZ, BUFFER_NUM, tiling->tileM * tiling->tileN * sizeof(float)); + } + __aicore__ inline void Process() + { + for (int32_t i = 0; i < tiling->loopOneCore; i++) { + // the following two SyncAll in this case are unnecessary actually, + // we just used them to highlight the influence of gm address conflict in each loop + AscendC::SyncAll(); + CopyIn(i); + Compute(); + AscendC::SyncAll(); + CopyOut(i); + } + } + +private: + __aicore__ inline void CopyIn(int32_t progress) + { + AscendC::LocalTensor xLocal = inQueueX.AllocTensor(); + AscendC::DataCopyParams params; + params.blockCount = tiling->tileM; + params.blockLen = tiling->tileN * sizeof(float) / BLOCK_SIZE; + params.srcStride = (tiling->n - tiling->tileN) * sizeof(float) / BLOCK_SIZE; + params.dstStride = 0; + AscendC::DataCopy(xLocal, xGm[progress * tiling->tileN], params); + inQueueX.EnQue(xLocal); + } + __aicore__ inline void Compute() + { + AscendC::LocalTensor xLocal = inQueueX.DeQue(); + AscendC::LocalTensor zLocal = outQueueZ.AllocTensor(); + constexpr float scale = 2.0; + AscendC::Adds(zLocal, xLocal, scale, tiling->tileM * tiling->tileN); + 
outQueueZ.EnQue(zLocal); + inQueueX.FreeTensor(xLocal); + } + __aicore__ inline void CopyOut(int32_t progress) + { + AscendC::LocalTensor zLocal = outQueueZ.DeQue(); + AscendC::DataCopyParams params; + params.blockCount = tiling->tileM; + params.blockLen = tiling->tileN * sizeof(float) / BLOCK_SIZE; + params.srcStride = 0; + params.dstStride = (tiling->n - tiling->tileN) * sizeof(float) / BLOCK_SIZE; + AscendC::DataCopy(zGm[progress * tiling->tileN], zLocal, params); + outQueueZ.FreeTensor(zLocal); + } + +private: + static constexpr int32_t BUFFER_NUM = 2; + static constexpr int32_t BLOCK_SIZE = 32; + + AscendC::TPipe pipe; + AscendC::TQue inQueueX; + AscendC::TQue outQueueZ; + AscendC::GlobalTensor xGm; + AscendC::GlobalTensor zGm; + AddsCustomTilingData* tiling; +}; diff --git a/operator/ascendc/4_best_practices/15_mata_address_conflict/AddsCustom/op_kernel/adds_custom.cpp b/operator/ascendc/4_best_practices/15_mata_address_conflict/AddsCustom/op_kernel/adds_custom.cpp index e4724322e..5b9e73970 100644 --- a/operator/ascendc/4_best_practices/15_mata_address_conflict/AddsCustom/op_kernel/adds_custom.cpp +++ b/operator/ascendc/4_best_practices/15_mata_address_conflict/AddsCustom/op_kernel/adds_custom.cpp @@ -10,6 +10,7 @@ #include "kernel_operator.h" #include "adds_custom_v1.h" #include "adds_custom_v2.h" +#include "adds_custom_v3.h" extern "C" __global__ __aicore__ void adds_custom(GM_ADDR x, GM_ADDR z, GM_ADDR workspace, GM_ADDR tiling) { @@ -25,5 +26,9 @@ extern "C" __global__ __aicore__ void adds_custom(GM_ADDR x, GM_ADDR z, GM_ADDR KernelAddsV2 op; op.Init(x, z, &tilingData); op.Process(); + } else if (TILING_KEY_IS(3UL)) { + KernelAddsV3 op; + op.Init(x, z, &tilingData); + op.Process(); } } diff --git a/operator/ascendc/4_best_practices/15_mata_address_conflict/AddsCustom/op_kernel/adds_custom_v1.h b/operator/ascendc/4_best_practices/15_mata_address_conflict/AddsCustom/op_kernel/adds_custom_v1.h index c4211bec9..bc773d241 100644 --- 
a/operator/ascendc/4_best_practices/15_mata_address_conflict/AddsCustom/op_kernel/adds_custom_v1.h +++ b/operator/ascendc/4_best_practices/15_mata_address_conflict/AddsCustom/op_kernel/adds_custom_v1.h @@ -19,8 +19,8 @@ __aicore__ inline void Init(GM_ADDR x, GM_ADDR z, AddsCustomTilingData* tilingPt tiling = tilingPtr; xGm.SetGlobalBuffer((__gm__ float *)x + AscendC::GetBlockIdx() * tiling->tileN); zGm.SetGlobalBuffer((__gm__ float *)z + AscendC::GetBlockIdx() * tiling->tileN); - // the gm address conflict happens when L2 cache miss and multi cores visit the same addr range(512Bytes) - // so we disable the L2 cache mode to highlight the influence of the gm address conflict + // the gm address conflict happens when multi cores visit the same addr range(512Bytes) + // we disable the L2 cache mode to highlight the influence of the gm address conflict xGm.SetL2CacheHint(AscendC::CacheMode::CACHE_MODE_DISABLE); zGm.SetL2CacheHint(AscendC::CacheMode::CACHE_MODE_DISABLE); pipe.InitBuffer(inQueueX, BUFFER_NUM, tiling->tileM * tiling->tileN * sizeof(float)); -- Gitee From c6157f8cc71c0b76976949182291ea2f7d982ff1 Mon Sep 17 00:00:00 2001 From: zhanghao0689 Date: Fri, 4 Jul 2025 15:42:22 +0800 Subject: [PATCH 09/17] fix errors --- .../AddsCustom/op_kernel/adds_custom.cpp | 1 - .../AddsCustom/op_kernel/adds_custom_v2.h | 4 ++-- .../op_kernel/{adds_csutom_v3.h => adds_custom_v3.h} | 1 + 3 files changed, 3 insertions(+), 3 deletions(-) rename operator/ascendc/4_best_practices/15_mata_address_conflict/AddsCustom/op_kernel/{adds_csutom_v3.h => adds_custom_v3.h} (98%) diff --git a/operator/ascendc/4_best_practices/15_mata_address_conflict/AddsCustom/op_kernel/adds_custom.cpp b/operator/ascendc/4_best_practices/15_mata_address_conflict/AddsCustom/op_kernel/adds_custom.cpp index 5b9e73970..8d0ad4cd9 100644 --- a/operator/ascendc/4_best_practices/15_mata_address_conflict/AddsCustom/op_kernel/adds_custom.cpp +++ 
b/operator/ascendc/4_best_practices/15_mata_address_conflict/AddsCustom/op_kernel/adds_custom.cpp @@ -14,7 +14,6 @@ extern "C" __global__ __aicore__ void adds_custom(GM_ADDR x, GM_ADDR z, GM_ADDR workspace, GM_ADDR tiling) { - REGISTER_TILING_DEFAULT(AddsCustomTilingData); GET_TILING_DATA(tilingData, tiling); KERNEL_TASK_TYPE_DEFAULT(KERNEL_TYPE_MIX_AIV_1_0); diff --git a/operator/ascendc/4_best_practices/15_mata_address_conflict/AddsCustom/op_kernel/adds_custom_v2.h b/operator/ascendc/4_best_practices/15_mata_address_conflict/AddsCustom/op_kernel/adds_custom_v2.h index e89ef87ea..c09c16a64 100644 --- a/operator/ascendc/4_best_practices/15_mata_address_conflict/AddsCustom/op_kernel/adds_custom_v2.h +++ b/operator/ascendc/4_best_practices/15_mata_address_conflict/AddsCustom/op_kernel/adds_custom_v2.h @@ -19,8 +19,8 @@ public: tiling = tilingPtr; xGm.SetGlobalBuffer((__gm__ float *)x + AscendC::GetBlockIdx() * tiling->tileN); zGm.SetGlobalBuffer((__gm__ float *)z + AscendC::GetBlockIdx() * tiling->tileN); - // the gm address conflict happens when L2 cache miss and multi cores visit the same addr range(512Bytes) - // so we disable the L2 cache mode to highlight the influence of the gm address conflict + // the gm address conflict happens when multi cores visit the same addr range(512Bytes) + // we disable the L2 cache mode to highlight the influence of the gm address conflict xGm.SetL2CacheHint(AscendC::CacheMode::CACHE_MODE_DISABLE); zGm.SetL2CacheHint(AscendC::CacheMode::CACHE_MODE_DISABLE); pipe.InitBuffer(inQueueX, BUFFER_NUM, tiling->tileM * tiling->tileN * sizeof(float)); diff --git a/operator/ascendc/4_best_practices/15_mata_address_conflict/AddsCustom/op_kernel/adds_csutom_v3.h b/operator/ascendc/4_best_practices/15_mata_address_conflict/AddsCustom/op_kernel/adds_custom_v3.h similarity index 98% rename from operator/ascendc/4_best_practices/15_mata_address_conflict/AddsCustom/op_kernel/adds_csutom_v3.h rename to 
operator/ascendc/4_best_practices/15_mata_address_conflict/AddsCustom/op_kernel/adds_custom_v3.h index 54e0d2384..650ae9a60 100644 --- a/operator/ascendc/4_best_practices/15_mata_address_conflict/AddsCustom/op_kernel/adds_csutom_v3.h +++ b/operator/ascendc/4_best_practices/15_mata_address_conflict/AddsCustom/op_kernel/adds_custom_v3.h @@ -17,6 +17,7 @@ public: __aicore__ inline void Init(GM_ADDR x, GM_ADDR z, AddsCustomTilingData* tilingPtr) { tiling = tilingPtr; + // change the tile method from column split to row split xGm.SetGlobalBuffer((__gm__ float *)x + AscendC::GetBlockIdx() * tiling->tileM * tiling->n); zGm.SetGlobalBuffer((__gm__ float *)z + AscendC::GetBlockIdx() * tiling->tileM * tiling->n); // the gm address conflict happens when multi cores visit the same addr range(512Bytes) -- Gitee From 6ee5135309aeb0f3283ebc474f5a06c70d1dc70c Mon Sep 17 00:00:00 2001 From: zhanghao0689 Date: Fri, 4 Jul 2025 15:51:59 +0800 Subject: [PATCH 10/17] fix errors --- .../AddsCustom/op_host/adds_custom.cpp | 25 +++++++------------ 1 file changed, 9 insertions(+), 16 deletions(-) diff --git a/operator/ascendc/4_best_practices/15_mata_address_conflict/AddsCustom/op_host/adds_custom.cpp b/operator/ascendc/4_best_practices/15_mata_address_conflict/AddsCustom/op_host/adds_custom.cpp index 4c001c945..cc8156060 100644 --- a/operator/ascendc/4_best_practices/15_mata_address_conflict/AddsCustom/op_host/adds_custom.cpp +++ b/operator/ascendc/4_best_practices/15_mata_address_conflict/AddsCustom/op_host/adds_custom.cpp @@ -14,31 +14,24 @@ namespace optiling { static ge::graphStatus TilingFunc(gert::TilingContext *context) { constexpr uint32_t BLOCK_DIM = 16; - constexpr int32_t M = 8192; - constexpr int32_t N = 128; + context->SetBlockDim(BLOCK_DIM); // set tiling_key auto attrs = context->GetAttrs(); const int64_t* caseId = attrs->GetInt(0); - int32_t tileM = 0; - int32_t tileN = 0; - int32_t loopOneCore = 0; - context->SetBlockDim(BLOCK_DIM); context->SetTilingKey(*caseId); - if 
((*caseId == 1) || (*caseId == 2)) { - tileM = 512; - tileN = 8; - loopOneCore = M / tileM; - } else if (*caseId == 3) { - // - } AddsCustomTilingData *tiling = context->GetTilingData(); + constexpr int32_t M = 8192; + constexpr int32_t N = 128; + constexpr int32_t TILE_M = 512; + constexpr int32_t TILE_N = 8; + constexpr int32_t LOOP_ONE_CORE = M / TILE_M; tiling->m = M; tiling->n = N; - tiling->tileM = tileM; - tiling->tileN = tileM; - tiling->loopOneCore = loopOneCore; + tiling->tileM = TILE_M; + tiling->tileN = TILE_N; + tiling->loopOneCore = LOOP_ONE_CORE; // set workspace size size_t *currentWorkspace = context->GetWorkspaceSizes(1); -- Gitee From cdf2e16199528e3afc1685fdabf93b891b8780f2 Mon Sep 17 00:00:00 2001 From: zhanghao0689 Date: Fri, 4 Jul 2025 16:06:27 +0800 Subject: [PATCH 11/17] fix typos --- .../AclNNInvocation/src/main.cpp | 3 ++- .../AclNNInvocation/src/op_runner.cpp | 3 +-- .../AddsCustom/op_host/adds_custom.cpp | 4 ++-- .../AddsCustom/op_kernel/adds_custom_tiling.h | 10 +++++----- .../AddsCustom/op_kernel/adds_custom_v1.h | 6 +++--- .../AddsCustom/op_kernel/adds_custom_v2.h | 4 ++-- .../AddsCustom/op_kernel/adds_custom_v3.h | 6 +++--- 7 files changed, 18 insertions(+), 18 deletions(-) diff --git a/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/src/main.cpp b/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/src/main.cpp index 8e9b87f87..b70950642 100644 --- a/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/src/main.cpp +++ b/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/src/main.cpp @@ -42,7 +42,8 @@ bool SetInputData(OpRunner &runner) bool ProcessOutputData(OpRunner &runner) { int64_t caseId = runner.GetCaseId(); - WriteFile("../output/output_z_" + std::to_string(caseId) + ".bin", runner.GetOutputBuffer(0), runner.GetOutputSize(0)); + WriteFile("../output/output_z_" + std::to_string(caseId) + ".bin", runner.GetOutputBuffer(0), + 
runner.GetOutputSize(0)); INFO_LOG("Write output success"); return true; } diff --git a/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/src/op_runner.cpp b/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/src/op_runner.cpp index fcdbe67eb..d7bde46d6 100644 --- a/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/src/op_runner.cpp +++ b/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/src/op_runner.cpp @@ -314,8 +314,7 @@ bool OpRunner::RunOp(int64_t caseId) size_t workspaceSize = 0; aclOpExecutor *handle = nullptr; - auto ret = - aclnnAddsCustomGetWorkspaceSize(inputTensor_[0], caseId, outputTensor_[0], &workspaceSize, &handle); + auto ret = aclnnAddsCustomGetWorkspaceSize(inputTensor_[0], caseId, outputTensor_[0], &workspaceSize, &handle); if (ret != ACL_SUCCESS) { (void)aclrtDestroyStream(stream); ERROR_LOG("Get Operator Workspace failed. error code is %d", static_cast(ret)); diff --git a/operator/ascendc/4_best_practices/15_mata_address_conflict/AddsCustom/op_host/adds_custom.cpp b/operator/ascendc/4_best_practices/15_mata_address_conflict/AddsCustom/op_host/adds_custom.cpp index cc8156060..1a35a0d10 100644 --- a/operator/ascendc/4_best_practices/15_mata_address_conflict/AddsCustom/op_host/adds_custom.cpp +++ b/operator/ascendc/4_best_practices/15_mata_address_conflict/AddsCustom/op_host/adds_custom.cpp @@ -18,7 +18,7 @@ static ge::graphStatus TilingFunc(gert::TilingContext *context) // set tiling_key auto attrs = context->GetAttrs(); - const int64_t* caseId = attrs->GetInt(0); + const int64_t *caseId = attrs->GetInt(0); context->SetTilingKey(*caseId); AddsCustomTilingData *tiling = context->GetTilingData(); @@ -32,7 +32,7 @@ static ge::graphStatus TilingFunc(gert::TilingContext *context) tiling->tileM = TILE_M; tiling->tileN = TILE_N; tiling->loopOneCore = LOOP_ONE_CORE; - + // set workspace size size_t *currentWorkspace = context->GetWorkspaceSizes(1); 
currentWorkspace[0] = 0; diff --git a/operator/ascendc/4_best_practices/15_mata_address_conflict/AddsCustom/op_kernel/adds_custom_tiling.h b/operator/ascendc/4_best_practices/15_mata_address_conflict/AddsCustom/op_kernel/adds_custom_tiling.h index ba8557905..8730ae528 100644 --- a/operator/ascendc/4_best_practices/15_mata_address_conflict/AddsCustom/op_kernel/adds_custom_tiling.h +++ b/operator/ascendc/4_best_practices/15_mata_address_conflict/AddsCustom/op_kernel/adds_custom_tiling.h @@ -13,10 +13,10 @@ class AddsCustomTilingData { public: - uint32_t m; - uint32_t n; - uint32_t tileM; - uint32_t tileN; - uint32_t loopOneCore; + uint32_t m; + uint32_t n; + uint32_t tileM; + uint32_t tileN; + uint32_t loopOneCore; }; #endif // ADDS_CUSTOM_TILING_H diff --git a/operator/ascendc/4_best_practices/15_mata_address_conflict/AddsCustom/op_kernel/adds_custom_v1.h b/operator/ascendc/4_best_practices/15_mata_address_conflict/AddsCustom/op_kernel/adds_custom_v1.h index bc773d241..81ba262f8 100644 --- a/operator/ascendc/4_best_practices/15_mata_address_conflict/AddsCustom/op_kernel/adds_custom_v1.h +++ b/operator/ascendc/4_best_practices/15_mata_address_conflict/AddsCustom/op_kernel/adds_custom_v1.h @@ -1,5 +1,5 @@ /** - * @file add_custom_v1.h + * @file adds_custom_v1.h * * Copyright (C) 2025. Huawei Technologies Co., Ltd. All rights reserved. 
* @@ -14,7 +14,7 @@ using AscendC::TPosition; class KernelAddsV1 { public: __aicore__ inline KernelAddsV1() {} -__aicore__ inline void Init(GM_ADDR x, GM_ADDR z, AddsCustomTilingData* tilingPtr) + __aicore__ inline void Init(GM_ADDR x, GM_ADDR z, AddsCustomTilingData *tilingPtr) { tiling = tilingPtr; xGm.SetGlobalBuffer((__gm__ float *)x + AscendC::GetBlockIdx() * tiling->tileN); @@ -81,5 +81,5 @@ private: AscendC::TQue outQueueZ; AscendC::GlobalTensor xGm; AscendC::GlobalTensor zGm; - AddsCustomTilingData* tiling; + AddsCustomTilingData *tiling; }; diff --git a/operator/ascendc/4_best_practices/15_mata_address_conflict/AddsCustom/op_kernel/adds_custom_v2.h b/operator/ascendc/4_best_practices/15_mata_address_conflict/AddsCustom/op_kernel/adds_custom_v2.h index c09c16a64..ad95b7153 100644 --- a/operator/ascendc/4_best_practices/15_mata_address_conflict/AddsCustom/op_kernel/adds_custom_v2.h +++ b/operator/ascendc/4_best_practices/15_mata_address_conflict/AddsCustom/op_kernel/adds_custom_v2.h @@ -14,7 +14,7 @@ using AscendC::TPosition; class KernelAddsV2 { public: __aicore__ inline KernelAddsV2() {} - __aicore__ inline void Init(GM_ADDR x, GM_ADDR z, AddsCustomTilingData* tilingPtr) + __aicore__ inline void Init(GM_ADDR x, GM_ADDR z, AddsCustomTilingData *tilingPtr) { tiling = tilingPtr; xGm.SetGlobalBuffer((__gm__ float *)x + AscendC::GetBlockIdx() * tiling->tileN); @@ -87,5 +87,5 @@ private: AscendC::TQue outQueueZ; AscendC::GlobalTensor xGm; AscendC::GlobalTensor zGm; - AddsCustomTilingData* tiling; + AddsCustomTilingData *tiling; }; diff --git a/operator/ascendc/4_best_practices/15_mata_address_conflict/AddsCustom/op_kernel/adds_custom_v3.h b/operator/ascendc/4_best_practices/15_mata_address_conflict/AddsCustom/op_kernel/adds_custom_v3.h index 650ae9a60..a169ddbe1 100644 --- a/operator/ascendc/4_best_practices/15_mata_address_conflict/AddsCustom/op_kernel/adds_custom_v3.h +++ 
b/operator/ascendc/4_best_practices/15_mata_address_conflict/AddsCustom/op_kernel/adds_custom_v3.h @@ -1,5 +1,5 @@ /** - * @file add_custom_v3.h + * @file adds_custom_v3.h * * Copyright (C) 2025. Huawei Technologies Co., Ltd. All rights reserved. * @@ -14,7 +14,7 @@ using AscendC::TPosition; class KernelAddsV3 { public: __aicore__ inline KernelAddsV3() {} -__aicore__ inline void Init(GM_ADDR x, GM_ADDR z, AddsCustomTilingData* tilingPtr) + __aicore__ inline void Init(GM_ADDR x, GM_ADDR z, AddsCustomTilingData *tilingPtr) { tiling = tilingPtr; // change the tile method from column split to row split @@ -82,5 +82,5 @@ private: AscendC::TQue outQueueZ; AscendC::GlobalTensor xGm; AscendC::GlobalTensor zGm; - AddsCustomTilingData* tiling; + AddsCustomTilingData *tiling; }; -- Gitee From d809b0a2c009c5f883b6e47f254e0246f48f98e2 Mon Sep 17 00:00:00 2001 From: zhanghao0689 Date: Fri, 4 Jul 2025 16:40:55 +0800 Subject: [PATCH 12/17] fix readme --- .../AclNNInvocation/README.md | 66 +++++++++++-------- .../AddsCustom/op_host/adds_custom.cpp | 15 ----- .../15_mata_address_conflict/README.md | 8 ++- 3 files changed, 44 insertions(+), 45 deletions(-) diff --git a/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/README.md b/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/README.md index 32b014bdf..5bb4d0d86 100644 --- a/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/README.md +++ b/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/README.md @@ -1,6 +1,7 @@ ## 目录结构介绍 + ``` -├── AclNNInvocation //通过aclnn调用的方式调用AddsCustom算子 +├── AclNNInvocation //通过单算子API调用的方式调用AddsCustom算子 │ ├── inc // 头文件目录 │ │ ├── common.h // 声明公共方法类,用于读取二进制文件 │ │ ├── op_runner.h // 算子描述声明文件,包含算子输入/输出,算子类型以及输入描述与输出描述 @@ -18,46 +19,57 @@ │ │ └── operator_desc.cpp // 构造算子的输入与输出描述 │ └── run.sh // 执行命令脚本 ``` + ## 代码实现介绍 + 
完成自定义算子的开发部署后,可以通过单算子调用的方式来验证单算子的功能。src/main.cpp代码为单算子API执行方式。单算子API执行是基于C语言的API执行算子,无需提供单算子描述文件进行离线模型的转换,直接调用单算子API接口。 自定义算子编译部署后,会自动生成单算子API,可以直接在应用程序中调用。算子API的形式一般定义为“两段式接口”,形如: - ```cpp - // 获取算子使用的workspace空间大小 - aclnnStatus aclnnAddsCustomGetWorkspaceSize( - const aclTensor *x, - int64_t caseId, - const aclTensor *out, - uint64_t *workspaceSize, - aclOpExecutor **executor); - // 执行算子 - aclnnStatus aclnnAddsCustom( - void *workspace, - uint64_t workspaceSize, - aclOpExecutor *executor, - aclrtStream stream); - ``` + +```cpp + // 获取算子使用的workspace空间大小 + aclnnStatus aclnnAddsCustomGetWorkspaceSize( + const aclTensor *x, + int64_t caseId, + const aclTensor *out, + uint64_t *workspaceSize, + aclOpExecutor **executor); + // 执行算子 + aclnnStatus aclnnAddsCustom( + void *workspace, + uint64_t workspaceSize, + aclOpExecutor *executor, + aclrtStream stream); +``` + 其中aclnnAddsCustomGetWorkspaceSize为第一段接口,主要用于计算本次API调用计算过程中需要多少的workspace内存。获取到本次API计算需要的workspace大小之后,按照workspaceSize大小申请Device侧内存,然后调用第二段接口aclnnAddsCustom执行计算。具体参考[单算子API调用](https://hiascend.com/document/redirect/CannCommunityAscendCInVorkSingleOp)章节。 ## 运行样例算子 + ### 1. 编译算子工程 + 运行此样例前,请参考[编译算子工程](../README.md#operatorcompile)完成前期准备。 + ### 2. 
aclnn调用样例运行 - - 进入到样例目录 +- 进入到样例目录 + + 以命令行方式下载样例代码,master分支为例。 - 以命令行方式下载样例代码,master分支为例。 - ```bash - cd ${git_clone_path}/samples/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation - ``` - - 样例执行 + ```bash + cd ${git_clone_path}/samples/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation + ``` +- 样例执行 - 样例执行过程中会自动生成测试数据,然后编译与运行aclnn样例,最后检验运行结果。具体过程可参见run.sh脚本。 + 样例执行过程中会自动生成测试数据,然后编译与运行aclnn样例,最后检验运行结果。具体过程可参见run.sh脚本。 + + ```bash + bash run.sh + ``` - ```bash - bash run.sh - ``` ## 更新说明 + + | 时间 | 更新事项 | | ---------- | ------------ | -| 2025/07/03 | 新增本readme | \ No newline at end of file +| 2025/07/03 | 新增本readme | diff --git a/operator/ascendc/4_best_practices/15_mata_address_conflict/AddsCustom/op_host/adds_custom.cpp b/operator/ascendc/4_best_practices/15_mata_address_conflict/AddsCustom/op_host/adds_custom.cpp index 1a35a0d10..12a574623 100644 --- a/operator/ascendc/4_best_practices/15_mata_address_conflict/AddsCustom/op_host/adds_custom.cpp +++ b/operator/ascendc/4_best_practices/15_mata_address_conflict/AddsCustom/op_host/adds_custom.cpp @@ -41,18 +41,6 @@ static ge::graphStatus TilingFunc(gert::TilingContext *context) } } // namespace optiling -namespace ge { -static graphStatus InferShape(gert::InferShapeContext *context) -{ - return GRAPH_SUCCESS; -} - -static graphStatus InferDataType(gert::InferDataTypeContext *context) -{ - return ge::GRAPH_SUCCESS; -} -} // namespace ge - namespace ops { class AddsCustom : public OpDef { public: @@ -60,10 +48,7 @@ public: { this->Input("x").ParamType(REQUIRED).DataType({ge::DT_FLOAT}).Format({ge::FORMAT_ND}); this->Output("z").ParamType(REQUIRED).DataType({ge::DT_FLOAT}).Format({ge::FORMAT_ND}); - - this->SetInferShape(ge::InferShape).SetInferDataType(ge::InferDataType); this->AICore().SetTiling(optiling::TilingFunc).AddConfig("ascend910b"); - this->Attr("case_id").Int(1); } }; diff --git a/operator/ascendc/4_best_practices/15_mata_address_conflict/README.md 
b/operator/ascendc/4_best_practices/15_mata_address_conflict/README.md index d58e5cc5b..4d15e3908 100644 --- a/operator/ascendc/4_best_practices/15_mata_address_conflict/README.md +++ b/operator/ascendc/4_best_practices/15_mata_address_conflict/README.md @@ -1,12 +1,12 @@ ## 概述 -本样例基于AddsCustom算子工程,介绍了单算子工程、单算子调用、第三方框架调用。 +本样例基于AddsCustom算子工程,介绍了同地址冲突的影响以及两种解决方法。 ## 目录结构介绍 ``` ├── 15_mata_address_conflict // 同地址冲突样例工程目录 -│ ├── AclNNInvocation // 通过aclnn调用的方式调用AddsCustom算子 +│ ├── AclNNInvocation // 通过单算子API调用的方式调用AddsCustom算子 │ ├── AddsCustom // AddsCustom算子工程 │ ├── AddsCustom.json // AddsCustom算子的原型定义json文件 │ └── install.sh // 脚本,调用msOpGen生成自定义算子工程,并编译 @@ -20,7 +20,9 @@ Adds算子实现了一个Tensor与标量值2.0相加,返回相加结果的功 z = x + 2.0 ``` -本样例主要介绍数据搬运中的同地址冲突对搬运效率的影响,同地址冲突指的是多核同时访问(读/写)同一条cacheline,且数据不在cache中时,出于数据一致性的要求,芯片会对多核的访问请求进行串行处理,导致搬运效率降低。 +本样例主要介绍数据搬运中的同地址冲突对搬运效率的影响,在global memory的数据访问中,数据访问请求(读/写)在芯片内部会按照512 Bytes对齐进行地址转换,同一时刻如果多核的数据访问请求在转换后落在连续的512 Bytes范围内,出于数据一致性的要求,芯片会对落入同一个512Bytes范围内的请求进行串行处理,导致搬运效率降低,即发生了同地址访问现象。 + +当前算子执行机制保证用户kernel入参(包括workspace/tiling)的地址是512 Bytes对齐的,因此用户只需要根据地址的偏移量即可判断两个地址是否会落入连续的512 Bytes范围内。 ## 算子规格描述 -- Gitee From 8b195e7a1ce024a7cada4360724be56c5efba4a6 Mon Sep 17 00:00:00 2001 From: zhanghao0689 Date: Fri, 4 Jul 2025 17:38:50 +0800 Subject: [PATCH 13/17] fix typo --- .../15_mata_address_conflict/AclNNInvocation/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/README.md b/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/README.md index 5bb4d0d86..7d66fa5fc 100644 --- a/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/README.md +++ b/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/README.md @@ -50,7 +50,7 @@ 运行此样例前,请参考[编译算子工程](../README.md#operatorcompile)完成前期准备。 -### 2. aclnn调用样例运行 +### 2. 
单算子API调用样例运行 - 进入到样例目录 -- Gitee From 5ec2c36af76a3933855558c391f7b90977ee7aa3 Mon Sep 17 00:00:00 2001 From: zhanghao0689 Date: Fri, 4 Jul 2025 17:42:47 +0800 Subject: [PATCH 14/17] add json attr --- .../15_mata_address_conflict/AddsCustom.json | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/operator/ascendc/4_best_practices/15_mata_address_conflict/AddsCustom.json b/operator/ascendc/4_best_practices/15_mata_address_conflict/AddsCustom.json index 8ad2a831a..a54432512 100644 --- a/operator/ascendc/4_best_practices/15_mata_address_conflict/AddsCustom.json +++ b/operator/ascendc/4_best_practices/15_mata_address_conflict/AddsCustom.json @@ -25,6 +25,13 @@ "float" ] } + ], + "attr": [ + { + "name": "case_id", + "type": "int", + "value": 1 + } ] } ] \ No newline at end of file -- Gitee From 3e1d9d182a8947ce18f862df8fe774e9ef85d053 Mon Sep 17 00:00:00 2001 From: zhanghao0689 Date: Fri, 4 Jul 2025 17:52:12 +0800 Subject: [PATCH 15/17] fix aclnn to new name --- .../ascendc/4_best_practices/15_mata_address_conflict/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/operator/ascendc/4_best_practices/15_mata_address_conflict/README.md b/operator/ascendc/4_best_practices/15_mata_address_conflict/README.md index 4d15e3908..9d55642fa 100644 --- a/operator/ascendc/4_best_practices/15_mata_address_conflict/README.md +++ b/operator/ascendc/4_best_practices/15_mata_address_conflict/README.md @@ -151,7 +151,7 @@ CANN软件包中提供了工程创建工具msOpGen,AddsCustom算子工程可 ### 4. 
调用执行算子工程 -- [aclnn调用AddsCustom算子工程](./AclNNInvocation/README.md) +- [单算子API调用AddsCustom算子工程](./AclNNInvocation/README.md) ## 更新说明 -- Gitee From 3cbb5fecab9fc9ff5f8561164f36be1a4d22bf2b Mon Sep 17 00:00:00 2001 From: zhanghao0689 Date: Fri, 4 Jul 2025 17:53:34 +0800 Subject: [PATCH 16/17] fix aclnn --- .../15_mata_address_conflict/AclNNInvocation/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/README.md b/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/README.md index 7d66fa5fc..5c1ffb4d2 100644 --- a/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/README.md +++ b/operator/ascendc/4_best_practices/15_mata_address_conflict/AclNNInvocation/README.md @@ -61,7 +61,7 @@ ``` - 样例执行 - 样例执行过程中会自动生成测试数据,然后编译与运行aclnn样例,最后检验运行结果。具体过程可参见run.sh脚本。 + 样例执行过程中会自动生成测试数据,然后编译与运行单算子API调用样例,最后检验运行结果。具体过程可参见run.sh脚本。 ```bash bash run.sh -- Gitee From e920fb6c5ee57114755891f8eeee17db3f0c58ca Mon Sep 17 00:00:00 2001 From: zhanghao0689 Date: Fri, 4 Jul 2025 18:11:43 +0800 Subject: [PATCH 17/17] fix dir path --- .../ascendc/4_best_practices/15_mata_address_conflict/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/operator/ascendc/4_best_practices/15_mata_address_conflict/README.md b/operator/ascendc/4_best_practices/15_mata_address_conflict/README.md index 9d55642fa..03fdd24fc 100644 --- a/operator/ascendc/4_best_practices/15_mata_address_conflict/README.md +++ b/operator/ascendc/4_best_practices/15_mata_address_conflict/README.md @@ -82,7 +82,7 @@ CANN软件包中提供了工程创建工具msOpGen,AddsCustom算子工程可 ```bash # 若开发者以git命令行方式clone了master分支代码,并切换目录 - cd ${git_clone_path}/samples/operator/ascendc/0_introduction/1_add_frameworklaunch + cd ${git_clone_path}/samples/operator/ascendc/4_best_practices/15_mata_address_conflict ``` - 调用脚本,生成自定义算子工程,复制host和kernel实现并编译算子 -- Gitee