diff --git a/operator/ascendc/0_introduction/22_mmad_profile_kernellaunch/MmadInvocation/CMakeLists.txt b/operator/ascendc/0_introduction/22_mmad_profile_kernellaunch/MmadInvocation/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..02cb22479252ae44d611635e3a603eae1aea316f --- /dev/null +++ b/operator/ascendc/0_introduction/22_mmad_profile_kernellaunch/MmadInvocation/CMakeLists.txt @@ -0,0 +1,57 @@ +cmake_minimum_required(VERSION 3.16) +project(Ascend_c) + +set(RUN_MODE "npu" CACHE STRING "cpu/sim/npu") +set(SOC_VERSION "Ascend310P3" CACHE STRING "system on chip type") +set(ASCEND_CANN_PACKAGE_PATH "/usr/local/Ascend/ascend-toolkit/latest" + CACHE STRING "ASCEND CANN package installation directory" +) +if(NOT CMAKE_BUILD_TYPE) + set(CMAKE_BUILD_TYPE "Debug" CACHE STRING "Build type Release/Debug (default Debug)" FORCE) +endif() +if(CMAKE_INSTALL_PREFIX STREQUAL /usr/local) + set(CMAKE_INSTALL_PREFIX "${CMAKE_CURRENT_LIST_DIR}/out" CACHE STRING "path for install()" FORCE) +endif() + +# ${KERNEL_FILES} are used to compile library, push files written by ascendc in ${KERNEL_FILES}. 
+# ref to cmake/npu.cmake ascendc_library, cmake/cpu.cmake add_library +file(GLOB KERNEL_FILES ${CMAKE_CURRENT_SOURCE_DIR}/mmad_custom.cpp) +set(CUSTOM_ASCEND310P_LIST "Ascend310P1" "Ascend310P3") + +if("${RUN_MODE}" STREQUAL "cpu") + include(cmake/cpu_lib.cmake) +elseif("${RUN_MODE}" STREQUAL "sim" OR "${RUN_MODE}" STREQUAL "npu") + include(cmake/npu_lib.cmake) +else() + message("invalid RUN_MODE: ${RUN_MODE}") +endif() + +add_executable(ascendc_kernels_bbit + ${CMAKE_CURRENT_SOURCE_DIR}/main.cpp +) + +target_compile_options(ascendc_kernels_bbit PRIVATE + $:-g>> + -O2 -std=c++17 -D_GLIBCXX_USE_CXX11_ABI=0 -Wall -Werror +) + +target_compile_definitions(ascendc_kernels_bbit PRIVATE + $<$>:CUSTOM_ASCEND310P> +) + +target_link_libraries(ascendc_kernels_bbit PRIVATE + $,$>:host_intf_pub>> + $:ascendcl>> + ascendc_kernels_${RUN_MODE} + tiling_api + register + platform + ascendalog + dl +) + +install(TARGETS ascendc_kernels_bbit + LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} + ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} + RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} +) diff --git a/operator/ascendc/0_introduction/22_mmad_profile_kernellaunch/MmadInvocation/README.md b/operator/ascendc/0_introduction/22_mmad_profile_kernellaunch/MmadInvocation/README.md new file mode 100644 index 0000000000000000000000000000000000000000..84477c6ef2141b8c0fe13562e42a91bdf540a8dc --- /dev/null +++ b/operator/ascendc/0_introduction/22_mmad_profile_kernellaunch/MmadInvocation/README.md @@ -0,0 +1,97 @@ +## 目录结构介绍 +``` +├── MmadInvocation +│ ├── cmake // 编译工程文件 +│ ├── scripts +│ │ ├── gen_data.py // 输入数据和真值数据生成脚本文件 +│ │ └── verify_result.py // 验证输出数据和真值数据是否一致的验证脚本 +│ ├── CMakeLists.txt // 编译工程文件 +│ ├── data_utils.h // 数据读入写出函数 +│ ├── main.cpp // 主函数,调用算子的应用程序,含CPU域及NPU域调用 +│ ├── mmad_custom_cube_only.h // Atlas A2训练系列产品kernel实现 +│ ├── mmad_custom.h // Atlas推理系列产品kernel实现 +│ ├── mmad_custom.cpp // 算子kernel实现 +│ └── run.sh // 编译运行算子的脚本 +``` + +## 算子规格描述 +在核函数直调样例中,算子实现支持的shape为:M = 32, N = 32, K = 
32。

+<table>
+<tr><th align="center">算子输入</th><th align="center">name</th><th align="center">shape</th><th align="center">data type</th><th align="center">format</th></tr>
+<tr><td align="center"></td><td align="center">a</td><td align="center">M * K</td><td align="center">float16</td><td align="center">ND</td></tr>
+<tr><td align="center"></td><td align="center">b</td><td align="center">K * N</td><td align="center">float16</td><td align="center">ND</td></tr>
+<tr><td align="center">算子输出</td><td align="center">c</td><td align="center">M * N</td><td align="center">float</td><td align="center">ND</td></tr>
+<tr><td align="center">核函数名</td><td align="center" colspan="4">mmad_custom</td></tr>
+</table>
+ +## 代码实现介绍 +本样例中实现的是[m, n, k]固定为[32, 32, 32]的Matmul算子,并使用Ascend C基础Api实现。 +- kernel实现 + Matmul算子的数学表达式为: + $$ + C = A * B + $$ + 其中A的形状为[32, 32], B的形状为[32, 32], C的形状为[32, 32]。具体请参考[mmad_custom.cpp](./mmad_custom.cpp)。 + + **注:当使用硬件分离架构的产品如Atlas A2训练系列产品/Atlas 800I A2推理产品时,由于样例使用的基础API均为Cube核指令,本样例设置了Cube Only模式,只调用Cube核完成计算,代码如下: + ```c++ + KERNEL_TASK_TYPE_DEFAULT(KERNEL_TYPE_AIC_ONLY); + ``` + +- 调用实现 + 1. CPU侧运行验证主要通过ICPU_RUN_KF CPU调测宏等CPU调测库提供的接口来完成; + + **注:当使用硬件分离架构的产品如Atlas A2训练系列产品/Atlas 800I A2推理产品时,Kernel中设置的Cube Only在NPU侧运行时可自动识别并只运行Cube核,若在CPU侧运行,需要额外设置KernelMode,只模拟Cube核实现,代码如下: + ```c++ + AscendC::SetKernelMode(KernelMode::AIC_MODE); + ``` + 2. NPU侧运行验证主要通过使用ACLRT_LAUNCH_KERNEL内核调用宏来完成。 + + 应用程序通过ASCENDC_CPU_DEBUG宏区分代码逻辑运行于CPU侧还是NPU侧。 + +## 运行样例算子 + - 打开样例目录 + 以命令行方式下载样例代码,master分支为例。 + ```bash + cd ${git_clone_path}/samples/operator/ascendc\0_introduction\20_mmad_kernellaunch/MmadInvocationNeo + ``` + - 配置环境变量 + + 请根据当前环境上CANN开发套件包的[安装方式](https://hiascend.com/document/redirect/CannCommunityInstSoftware),选择对应配置环境变量的命令。 + - 默认路径,root用户安装CANN软件包 + ```bash + export ASCEND_INSTALL_PATH=/usr/local/Ascend/ascend-toolkit/latest + ``` + - 默认路径,非root用户安装CANN软件包 + ```bash + export ASCEND_INSTALL_PATH=$HOME/Ascend/ascend-toolkit/latest + ``` + - 指定路径install_path,安装CANN软件包 + ```bash + export ASCEND_INSTALL_PATH=${install_path}/ascend-toolkit/latest + ``` + + - 样例执行 + + ```bash + bash run.sh -r [RUN_MODE] -v [SOC_VERSION] + ``` + - RUN_MODE:编译方式,可选择CPU调试,NPU仿真,NPU上板。支持参数为[cpu / sim / npu]。 + - SOC_VERSION:昇腾AI处理器型号,如果无法确定具体的[SOC_VERSION],则在安装昇腾AI处理器的服务器执行npu-smi info命令进行查询,在查询到的“Name”前增加Ascend信息,例如"Name"对应取值为xxxyy,实际配置的[SOC_VERSION]值为Ascendxxxyy。支持以下产品型号: + - Atlas 推理系列产品AI Core + - Atlas A2训练系列产品/Atlas 800I A2推理产品 + + 示例如下,Ascendxxxyy请替换为实际的AI处理器型号。 + + ```bash + bash run.sh -r cpu -v Ascendxxxyy + ``` + + +## 更新说明 +| 时间 | 更新事项 | +| ---------- | ------------ | +| 2024/11/20 | 更新本readme | diff --git 
a/operator/ascendc/0_introduction/22_mmad_profile_kernellaunch/MmadInvocation/cmake/cpu_lib.cmake b/operator/ascendc/0_introduction/22_mmad_profile_kernellaunch/MmadInvocation/cmake/cpu_lib.cmake new file mode 100644 index 0000000000000000000000000000000000000000..acb98ec9029d1b9f0adeaa6bba6263bc797b2ac9 --- /dev/null +++ b/operator/ascendc/0_introduction/22_mmad_profile_kernellaunch/MmadInvocation/cmake/cpu_lib.cmake @@ -0,0 +1,12 @@ +if(NOT DEFINED ENV{CMAKE_PREFIX_PATH}) + set(CMAKE_PREFIX_PATH ${ASCEND_CANN_PACKAGE_PATH}/tools/tikicpulib/lib/cmake) +endif() +find_package(tikicpulib REQUIRED) + +add_library(ascendc_kernels_${RUN_MODE} SHARED ${KERNEL_FILES}) +target_link_libraries(ascendc_kernels_${RUN_MODE} PUBLIC tikicpulib::${SOC_VERSION}) +target_compile_definitions(ascendc_kernels_${RUN_MODE} PRIVATE + $<$>:CUSTOM_ASCEND310P> +) +target_compile_options(ascendc_kernels_${RUN_MODE} PRIVATE -g -O0 -std=c++17) +install(TARGETS ascendc_kernels_${RUN_MODE} DESTINATION ${CMAKE_INSTALL_LIBDIR}) diff --git a/operator/ascendc/0_introduction/22_mmad_profile_kernellaunch/MmadInvocation/cmake/npu_lib.cmake b/operator/ascendc/0_introduction/22_mmad_profile_kernellaunch/MmadInvocation/cmake/npu_lib.cmake new file mode 100644 index 0000000000000000000000000000000000000000..3b8e2c506642513fcf1f8777a287ca0d19085b50 --- /dev/null +++ b/operator/ascendc/0_introduction/22_mmad_profile_kernellaunch/MmadInvocation/cmake/npu_lib.cmake @@ -0,0 +1,15 @@ +if(EXISTS ${ASCEND_CANN_PACKAGE_PATH}/compiler/tikcpp/ascendc_kernel_cmake) + set(ASCENDC_CMAKE_DIR ${ASCEND_CANN_PACKAGE_PATH}/compiler/tikcpp/ascendc_kernel_cmake) +elseif(EXISTS ${ASCEND_CANN_PACKAGE_PATH}/tools/tikcpp/ascendc_kernel_cmake) + set(ASCENDC_CMAKE_DIR ${ASCEND_CANN_PACKAGE_PATH}/tools/tikcpp/ascendc_kernel_cmake) +else() + message(FATAL_ERROR "ascendc_kernel_cmake does not exist ,please check whether the cann package is installed") +endif() +include(${ASCENDC_CMAKE_DIR}/ascendc.cmake) + +# ascendc_library use to add 
kernel file to generate ascendc library +ascendc_library(ascendc_kernels_${RUN_MODE} SHARED ${KERNEL_FILES}) + +ascendc_compile_definitions(ascendc_kernels_${RUN_MODE} PRIVATE + $<$>:CUSTOM_ASCEND310P> +) diff --git a/operator/ascendc/0_introduction/22_mmad_profile_kernellaunch/MmadInvocation/data_utils.h b/operator/ascendc/0_introduction/22_mmad_profile_kernellaunch/MmadInvocation/data_utils.h new file mode 100644 index 0000000000000000000000000000000000000000..b71d2aaf3083206f1fab03ffe42f41834078761c --- /dev/null +++ b/operator/ascendc/0_introduction/22_mmad_profile_kernellaunch/MmadInvocation/data_utils.h @@ -0,0 +1,203 @@ +/** + * @file data_utils.h + * + * Copyright (C) 2023-2024. Huawei Technologies Co., Ltd. All rights reserved. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + */ +#ifndef DATA_UTILS_H +#define DATA_UTILS_H +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include "acl/acl.h" + +typedef enum { + DT_UNDEFINED = -1, + FLOAT = 0, + HALF = 1, + INT8_T = 2, + INT32_T = 3, + UINT8_T = 4, + INT16_T = 6, + UINT16_T = 7, + UINT32_T = 8, + INT64_T = 9, + UINT64_T = 10, + DOUBLE = 11, + BOOL = 12, + STRING = 13, + COMPLEX64 = 16, + COMPLEX128 = 17, + BF16 = 27 +} printDataType; + +#define INFO_LOG(fmt, args...) fprintf(stdout, "[INFO] " fmt "\n", ##args) +#define WARN_LOG(fmt, args...) fprintf(stdout, "[WARN] " fmt "\n", ##args) +#define ERROR_LOG(fmt, args...) 
fprintf(stdout, "[ERROR] " fmt "\n", ##args) +#define CHECK_ACL(x) \ + do { \ + aclError __ret = x; \ + if (__ret != ACL_ERROR_NONE) { \ + std::cerr << __FILE__ << ":" << __LINE__ << " aclError:" << __ret << std::endl; \ + } \ + } while (0); + +/** + * @brief Read data from file + * @param [in] filePath: file path + * @param [out] fileSize: file size + * @return read result + */ +bool ReadFile(const std::string &filePath, size_t &fileSize, void *buffer, size_t bufferSize) +{ + struct stat sBuf; + int fileStatus = stat(filePath.data(), &sBuf); + if (fileStatus == -1) { + ERROR_LOG("failed to get file"); + return false; + } + if (S_ISREG(sBuf.st_mode) == 0) { + ERROR_LOG("%s is not a file, please enter a file", filePath.c_str()); + return false; + } + + std::ifstream file; + file.open(filePath, std::ios::binary); + if (!file.is_open()) { + ERROR_LOG("Open file failed. path = %s", filePath.c_str()); + return false; + } + + std::filebuf *buf = file.rdbuf(); + size_t size = buf->pubseekoff(0, std::ios::end, std::ios::in); + if (size == 0) { + ERROR_LOG("file size is 0"); + file.close(); + return false; + } + if (size > bufferSize) { + ERROR_LOG("file size is larger than buffer size"); + file.close(); + return false; + } + buf->pubseekpos(0, std::ios::in); + buf->sgetn(static_cast(buffer), size); + fileSize = size; + file.close(); + return true; +} + +/** + * @brief Write data to file + * @param [in] filePath: file path + * @param [in] buffer: data to write to file + * @param [in] size: size to write + * @return write result + */ +bool WriteFile(const std::string &filePath, const void *buffer, size_t size) +{ + if (buffer == nullptr) { + ERROR_LOG("Write file failed. buffer is nullptr"); + return false; + } + + int fd = open(filePath.c_str(), O_RDWR | O_CREAT | O_TRUNC, S_IRUSR | S_IWRITE); + if (fd < 0) { + ERROR_LOG("Open file failed. 
path = %s", filePath.c_str()); + return false; + } + + size_t writeSize = write(fd, buffer, size); + (void)close(fd); + if (writeSize != size) { + ERROR_LOG("Write file Failed."); + return false; + } + + return true; +} + +template void DoPrintData(const T *data, size_t count, size_t elementsPerRow) +{ + assert(elementsPerRow != 0); + for (size_t i = 0; i < count; ++i) { + std::cout << std::setw(10) << data[i]; + if (i % elementsPerRow == elementsPerRow - 1) { + std::cout << std::endl; + } + } +} + +void DoPrintHalfData(const aclFloat16 *data, size_t count, size_t elementsPerRow) +{ + assert(elementsPerRow != 0); + for (size_t i = 0; i < count; ++i) { + std::cout << std::setw(10) << std::setprecision(6) << aclFloat16ToFloat(data[i]); + if (i % elementsPerRow == elementsPerRow - 1) { + std::cout << std::endl; + } + } +} + +void PrintData(const void *data, size_t count, printDataType dataType, size_t elementsPerRow = 16) +{ + if (data == nullptr) { + ERROR_LOG("Print data failed. data is nullptr"); + return; + } + + switch (dataType) { + case BOOL: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case INT8_T: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case UINT8_T: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case INT16_T: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case UINT16_T: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case INT32_T: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case UINT32_T: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case INT64_T: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case UINT64_T: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case HALF: + DoPrintHalfData(reinterpret_cast(data), count, elementsPerRow); + break; + case FLOAT: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + 
break; + case DOUBLE: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + default: + ERROR_LOG("Unsupported type: %d", dataType); + } + std::cout << std::endl; +} +#endif // DATA_UTILS_H diff --git a/operator/ascendc/0_introduction/22_mmad_profile_kernellaunch/MmadInvocation/main.cpp b/operator/ascendc/0_introduction/22_mmad_profile_kernellaunch/MmadInvocation/main.cpp new file mode 100644 index 0000000000000000000000000000000000000000..ad10e44f5d0384891ce71197a5c7ed48050ed120 --- /dev/null +++ b/operator/ascendc/0_introduction/22_mmad_profile_kernellaunch/MmadInvocation/main.cpp @@ -0,0 +1,89 @@ +/** + * @file main.cpp + * + * Copyright (C) 2023-2024. Huawei Technologies Co., Ltd. All rights reserved. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + */ +#include "data_utils.h" +#ifndef ASCENDC_CPU_DEBUG +#include "acl/acl.h" +#include "aclrtlaunch_mmad_custom.h" +#else +#include "tikicpulib.h" +extern "C" void mmad_custom(uint8_t *a, uint8_t *b, uint8_t *c); +#endif + +int32_t main(int32_t argc, char *argv[]) +{ + uint32_t M = 128; + uint32_t N = 128; + uint32_t K = 128; + size_t aFileSize = M * K * sizeof(int16_t); // uint16_t represent half + size_t bFileSize = K * N * sizeof(int16_t); // uint16_t represent half + size_t cFileSize = M * N * sizeof(float); + uint32_t blockDim = 1; + +#ifdef ASCENDC_CPU_DEBUG + AscendC::SetKernelMode(KernelMode::AIC_MODE); + uint8_t *a = (uint8_t *)AscendC::GmAlloc(aFileSize); + uint8_t *b = (uint8_t *)AscendC::GmAlloc(bFileSize); + uint8_t *c = (uint8_t *)AscendC::GmAlloc(cFileSize); + + ReadFile("./input/x1_gm.bin", aFileSize, a, aFileSize); + ReadFile("./input/x2_gm.bin", bFileSize, b, bFileSize); + + ICPU_RUN_KF(mmad_custom, blockDim, a, b, c); + + WriteFile("./output/output.bin", c, cFileSize); + + AscendC::GmFree((void *)a); + AscendC::GmFree((void *)b); + 
AscendC::GmFree((void *)c); +#else + CHECK_ACL(aclInit(nullptr)); + int32_t deviceId = 0; + CHECK_ACL(aclrtSetDevice(deviceId)); + aclrtStream stream = nullptr; + CHECK_ACL(aclrtCreateStream(&stream)); + + uint8_t *aHost; + uint8_t *aDevice; + CHECK_ACL(aclrtMallocHost((void **)(&aHost), aFileSize)); + CHECK_ACL(aclrtMalloc((void **)&aDevice, aFileSize, ACL_MEM_MALLOC_HUGE_FIRST)); + ReadFile("./input/x1_gm.bin", aFileSize, aHost, aFileSize); + CHECK_ACL(aclrtMemcpy(aDevice, aFileSize, aHost, aFileSize, ACL_MEMCPY_HOST_TO_DEVICE)); + + uint8_t *bHost; + uint8_t *bDevice; + CHECK_ACL(aclrtMallocHost((void **)(&bHost), bFileSize)); + CHECK_ACL(aclrtMalloc((void **)&bDevice, bFileSize, ACL_MEM_MALLOC_HUGE_FIRST)); + ReadFile("./input/x2_gm.bin", bFileSize, bHost, bFileSize); + CHECK_ACL(aclrtMemcpy(bDevice, bFileSize, bHost, bFileSize, ACL_MEMCPY_HOST_TO_DEVICE)); + + uint8_t *cHost; + uint8_t *cDevice; + CHECK_ACL(aclrtMallocHost((void **)(&cHost), cFileSize)); + CHECK_ACL(aclrtMalloc((void **)&cDevice, cFileSize, ACL_MEM_MALLOC_HUGE_FIRST)); + + ACLRT_LAUNCH_KERNEL(mmad_custom)(blockDim, stream, aDevice, bDevice, cDevice); + CHECK_ACL(aclrtSynchronizeStream(stream)); + + CHECK_ACL(aclrtMemcpy(cHost, cFileSize, cDevice, cFileSize, ACL_MEMCPY_DEVICE_TO_HOST)); + WriteFile("./output/output.bin", cHost, cFileSize); + + CHECK_ACL(aclrtFree(aDevice)); + CHECK_ACL(aclrtFreeHost(aHost)); + CHECK_ACL(aclrtFree(bDevice)); + CHECK_ACL(aclrtFreeHost(bHost)); + CHECK_ACL(aclrtFree(cDevice)); + CHECK_ACL(aclrtFreeHost(cHost)); + + CHECK_ACL(aclrtDestroyStream(stream)); + CHECK_ACL(aclrtResetDevice(deviceId)); + CHECK_ACL(aclFinalize()); +#endif + return 0; +} diff --git a/operator/ascendc/0_introduction/22_mmad_profile_kernellaunch/MmadInvocation/mmad_custom.cpp b/operator/ascendc/0_introduction/22_mmad_profile_kernellaunch/MmadInvocation/mmad_custom.cpp new file mode 100644 index 0000000000000000000000000000000000000000..8eb7eb91d51eb69b2f225a1f870ac23685224b26 --- /dev/null +++ 
b/operator/ascendc/0_introduction/22_mmad_profile_kernellaunch/MmadInvocation/mmad_custom.cpp @@ -0,0 +1,17 @@ +/** + * @file mmad_custom.cpp + * + * Copyright (C) 2023-2024. Huawei Technologies Co., Ltd. All rights reserved. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + */ +#include "mmad_custom.h" + +extern "C" __global__ __aicore__ void mmad_custom(GM_ADDR a, GM_ADDR b, GM_ADDR c) +{ + KernelMmad op; + op.Init(a, b, c); + op.Process(); +} diff --git a/operator/ascendc/0_introduction/22_mmad_profile_kernellaunch/MmadInvocation/mmad_custom.h b/operator/ascendc/0_introduction/22_mmad_profile_kernellaunch/MmadInvocation/mmad_custom.h new file mode 100644 index 0000000000000000000000000000000000000000..bce32f8042d11536fd07c7e9d249e21f4d66e63f --- /dev/null +++ b/operator/ascendc/0_introduction/22_mmad_profile_kernellaunch/MmadInvocation/mmad_custom.h @@ -0,0 +1,194 @@ +/* + * Copyright (c) Huawei Technologies Co., Ltd. 2022-2023. All rights reserved. + * + * This sample is a very basic sample that implements Mmad on Ascend plaform. 
+ */ +#include "kernel_operator.h" +class KernelMmad { +public: + __aicore__ inline KernelMmad() + { + aSize = m * k; + bSize = k * n; + cSize = m * n; + mBlocks = m / 16; + nBlocks = n / 16; + kBlocks = k / 16; + } + __aicore__ inline void Init(GM_ADDR a, GM_ADDR b, GM_ADDR c) + { + xGm.SetGlobalBuffer((__gm__ half*)a); + yGm.SetGlobalBuffer((__gm__ half*)b); + zGm.SetGlobalBuffer((__gm__ float*)c); + pipe.InitBuffer(inQueueA1, 1, aSize * sizeof(half)); + pipe.InitBuffer(inQueueA2, 1, aSize * sizeof(half)); + pipe.InitBuffer(inQueueB1, 1, bSize * sizeof(half)); + pipe.InitBuffer(inQueueB2, 1, bSize * sizeof(half)); + pipe.InitBuffer(outQueueC, 1, cSize * sizeof(float)); + } + __aicore__ inline void Process() + { + CopyIn(); + SplitA(); + SplitB(); + Compute(); + Copyout(); + } + +private: + __aicore__ inline void CopyIn() + { + AscendC::LocalTensor a1Local = inQueueA1.AllocTensor(); + AscendC::LocalTensor b1Local = inQueueB1.AllocTensor(); + + AscendC::Nd2NzParams dataCopyA1Params; + dataCopyA1Params.ndNum = aSize/16/co; + dataCopyA1Params.nValue = m; + dataCopyA1Params.dValue = k; + dataCopyA1Params.srcNdMatrixStride = 0; + dataCopyA1Params.srcDValue = k; + dataCopyA1Params.dstNzC0Stride = m; + dataCopyA1Params.dstNzNStride = 1; + dataCopyA1Params.dstNzMatrixStride = 0; + + AscendC::PipeBarrier(); + systemCycleBefore = AscendC::GetSystemCycle(); + AscendC::DataCopy(a1Local, xGm, dataCopyA1Params); // nd -> nz + AscendC::PipeBarrier(); + systemCycleAfter = AscendC::GetSystemCycle(); + AscendC::printf("mte2 A systemCycle is %d\n", systemCycleAfter - systemCycleBefore); + AscendC::PipeBarrier(); + + AscendC::Nd2NzParams dataCopyB1Params; + dataCopyB1Params.ndNum = bSize/16/co; + dataCopyB1Params.nValue = k; + dataCopyB1Params.dValue = n; + dataCopyB1Params.srcNdMatrixStride = 0; + dataCopyB1Params.srcDValue = n; + dataCopyB1Params.dstNzC0Stride = k; + dataCopyB1Params.dstNzNStride = 1; + dataCopyB1Params.dstNzMatrixStride = 0; + + AscendC::PipeBarrier(); + 
systemCycleBefore = AscendC::GetSystemCycle(); + AscendC::DataCopy(b1Local, yGm, dataCopyB1Params); // nd -> nz + AscendC::PipeBarrier(); + systemCycleAfter = AscendC::GetSystemCycle(); + AscendC::printf("mte2 B systemCycle is %d\n", systemCycleAfter - systemCycleBefore); + AscendC::PipeBarrier(); + + inQueueA1.EnQue(a1Local); + inQueueB1.EnQue(b1Local); + } + __aicore__ inline void SplitA() + { + AscendC::LocalTensor a1Local = inQueueA1.DeQue(); + AscendC::LocalTensor a2Local = inQueueA2.AllocTensor(); + AscendC::LoadData2dParams loadL0AParams; + loadL0AParams.repeatTimes = m / 16; + loadL0AParams.srcStride = 1; + loadL0AParams.dstGap = k / co - 1; + loadL0AParams.ifTranspose = false; + + AscendC::PipeBarrier(); + systemCycleBefore = AscendC::GetSystemCycle(); + for (int i = 0; i < k/co; i++) { + AscendC::LoadData(a2Local[i * 16 * co], a1Local[i * m *co], loadL0AParams); + } + AscendC::PipeBarrier(); + systemCycleAfter = AscendC::GetSystemCycle(); + AscendC::printf("mte1 A systemCycle is %d\n", systemCycleAfter - systemCycleBefore); + AscendC::PipeBarrier(); + inQueueA2.EnQue(a2Local); + inQueueA1.FreeTensor(a1Local); + } + __aicore__ inline void SplitB() + { + AscendC::LocalTensor b1Local = inQueueB1.DeQue(); + AscendC::LocalTensor b2Local = inQueueB2.AllocTensor(); + + AscendC::LoadData2dTransposeParams loadDataParams; + uint16_t n_block = 16; + loadDataParams.startIndex = 0; + loadDataParams.repeatTimes = k / 16; + loadDataParams.srcStride = 1; + loadDataParams.dstGap = n / n_block - 1; + loadDataParams.dstFracGap = 0; + + AscendC::PipeBarrier(); + systemCycleBefore = AscendC::GetSystemCycle(); + for (int i = 0; i < n/n_block; ++i) { + AscendC::LoadDataWithTranspose(b2Local[i*16*n_block], b1Local[i * k * n_block], loadDataParams); + } + AscendC::PipeBarrier(); + systemCycleAfter = AscendC::GetSystemCycle(); + AscendC::printf("mte1 B systemCycle is %d\n", systemCycleAfter - systemCycleBefore); + AscendC::PipeBarrier(); + inQueueB2.EnQue(b2Local); + 
inQueueB1.FreeTensor(b1Local); + } + __aicore__ inline void Compute() { + AscendC::LocalTensor a2Local = inQueueA2.DeQue(); + AscendC::LocalTensor b2Local = inQueueB2.DeQue(); + AscendC::LocalTensor cLocal = outQueueC.AllocTensor(); + AscendC::MmadParams mmadParams; + mmadParams.m = m; + mmadParams.n = n; + mmadParams.k = k; + + AscendC::PipeBarrier(); + systemCycleBefore = AscendC::GetSystemCycle(); + for (int loop = 0; loop < 100; loop++) { + AscendC::Mmad(cLocal, a2Local, b2Local, mmadParams); + } + AscendC::PipeBarrier(); + systemCycleAfter = AscendC::GetSystemCycle(); + AscendC::printf("mmad *100 systemCycle is %d\n", systemCycleAfter - systemCycleBefore); + outQueueC.EnQue(cLocal); + inQueueA2.FreeTensor(a2Local); + inQueueB2.FreeTensor(b2Local); + } + __aicore__ inline void Copyout() + { + AscendC::LocalTensor cLocal = outQueueC.DeQue(); + AscendC::FixpipeParams fixpipeParams; + + fixpipeParams.cburstNum = n; + fixpipeParams.burstLen = m * 16 * sizeof(float) / 32; + fixpipeParams.srcStride = 0; + fixpipeParams.dstStride = n; + + fixpipeParams.nz2ndParams.nz2ndEn = true; + fixpipeParams.nz2ndParams.ndNum = cSize / 16 * sizeof(float) / 32; + fixpipeParams.nz2ndParams.srcNdStride = 0; + fixpipeParams.nz2ndParams.dstNdStride = 0; + fixpipeParams.nz2ndParams.originalNSize = n; + + AscendC::PipeBarrier(); + systemCycleBefore = AscendC::GetSystemCycle(); + AscendC::Fixpipe(zGm, cLocal, fixpipeParams); + AscendC::PipeBarrier(); + systemCycleAfter = AscendC::GetSystemCycle(); + AscendC::printf("fixpipe systemCycle is %d\n", systemCycleAfter - systemCycleBefore); + outQueueC.FreeTensor(cLocal); + } + +private: + AscendC::TPipe pipe; + AscendC::TQue inQueueA1; + AscendC::TQue inQueueA2; + AscendC::TQue inQueueB1; + AscendC::TQue inQueueB2; + AscendC::TQue outQueueC; + AscendC::GlobalTensor xGm; + AscendC::GlobalTensor yGm; + AscendC::GlobalTensor zGm; + uint16_t m = 128; + uint16_t n = 128; + uint16_t k = 128; + uint16_t co = 16; + uint32_t aSize, bSize, cSize; + 
uint32_t mBlocks, nBlocks, kBlocks; + int64_t systemCycleBefore; + int64_t systemCycleAfter; +}; diff --git a/operator/ascendc/0_introduction/22_mmad_profile_kernellaunch/MmadInvocation/run.sh b/operator/ascendc/0_introduction/22_mmad_profile_kernellaunch/MmadInvocation/run.sh new file mode 100644 index 0000000000000000000000000000000000000000..0c9c7f40bde7b2a0f93a1fd309f1bc20fc553682 --- /dev/null +++ b/operator/ascendc/0_introduction/22_mmad_profile_kernellaunch/MmadInvocation/run.sh @@ -0,0 +1,121 @@ +#!/bin/bash +CURRENT_DIR=$( + cd $(dirname ${BASH_SOURCE:-$0}) + pwd +) + +BUILD_TYPE="Debug" +INSTALL_PREFIX="${CURRENT_DIR}/out" + +SHORT=r:,v:,i:,b:,p:, +LONG=run-mode:,soc-version:,install-path:,build-type:,install-prefix:, +OPTS=$(getopt -a --options $SHORT --longoptions $LONG -- "$@") +eval set -- "$OPTS" + +while :; do + case "$1" in + -r | --run-mode) + RUN_MODE="$2" + shift 2 + ;; + -v | --soc-version) + SOC_VERSION="$2" + shift 2 + ;; + -i | --install-path) + ASCEND_INSTALL_PATH="$2" + shift 2 + ;; + -b | --build-type) + BUILD_TYPE="$2" + shift 2 + ;; + -p | --install-prefix) + INSTALL_PREFIX="$2" + shift 2 + ;; + --) + shift + break + ;; + *) + echo "[ERROR] Unexpected option: $1" + break + ;; + esac +done + +RUN_MODE_LIST="cpu sim npu" +if [[ " $RUN_MODE_LIST " != *" $RUN_MODE "* ]]; then + echo "ERROR: RUN_MODE error, This sample only support specify cpu, sim or npu!" 
+ exit -1 +fi + +VERSION_LIST="Ascend310P1 Ascend310P3 Ascend910B1 Ascend910B2 Ascend910B3 Ascend910B4" +if [[ " $VERSION_LIST " != *" $SOC_VERSION "* ]]; then + echo "ERROR: SOC_VERSION should be in [$VERSION_LIST]" + exit -1 +fi + +if [ -n "$ASCEND_INSTALL_PATH" ]; then + _ASCEND_INSTALL_PATH=$ASCEND_INSTALL_PATH +elif [ -n "$ASCEND_HOME_PATH" ]; then + _ASCEND_INSTALL_PATH=$ASCEND_HOME_PATH +else + if [ -d "$HOME/Ascend/ascend-toolkit/latest" ]; then + _ASCEND_INSTALL_PATH=$HOME/Ascend/ascend-toolkit/latest + else + _ASCEND_INSTALL_PATH=/usr/local/Ascend/ascend-toolkit/latest + fi +fi + +export ASCEND_TOOLKIT_HOME=${_ASCEND_INSTALL_PATH} +export ASCEND_HOME_PATH=${_ASCEND_INSTALL_PATH} +source ${_ASCEND_INSTALL_PATH}/bin/setenv.bash +if [ "${RUN_MODE}" = "sim" ]; then + # in case of running op in simulator, use stub .so instead + export LD_LIBRARY_PATH=${_ASCEND_INSTALL_PATH}/tools/simulator/${SOC_VERSION}/lib:$LD_LIBRARY_PATH + if [ ! $CAMODEL_LOG_PATH ]; then + export CAMODEL_LOG_PATH=$(pwd)/sim_log + fi + if [ -d "$CAMODEL_LOG_PATH" ]; then + rm -rf $CAMODEL_LOG_PATH + fi + mkdir -p $CAMODEL_LOG_PATH +elif [ "${RUN_MODE}" = "cpu" ]; then + export LD_LIBRARY_PATH=${_ASCEND_INSTALL_PATH}/tools/tikicpulib/lib:${_ASCEND_INSTALL_PATH}/tools/tikicpulib/lib/${SOC_VERSION}:${_ASCEND_INSTALL_PATH}/tools/simulator/${SOC_VERSION}/lib:$LD_LIBRARY_PATH +fi + +set -e +rm -rf build out +mkdir -p build +cmake -B build \ + -DRUN_MODE=${RUN_MODE} \ + -DSOC_VERSION=${SOC_VERSION} \ + -DCMAKE_BUILD_TYPE=${BUILD_TYPE} \ + -DCMAKE_INSTALL_PREFIX=${INSTALL_PREFIX} \ + -DASCEND_CANN_PACKAGE_PATH=${_ASCEND_INSTALL_PATH} +cmake --build build -j +cmake --install build + +rm -f ascendc_kernels_bbit +cp ./out/bin/ascendc_kernels_bbit ./ +rm -rf input output +mkdir -p input output +python3 scripts/gen_data.py +( + export LD_LIBRARY_PATH=$(pwd)/out/lib:$(pwd)/out/lib64:${_ASCEND_INSTALL_PATH}/lib64:$LD_LIBRARY_PATH + if [[ "$RUN_WITH_TOOLCHAIN" -eq 1 ]]; then + if [ "${RUN_MODE}" = "npu" 
]; then + msprof op --application=./ascendc_kernels_bbit + elif [ "${RUN_MODE}" = "sim" ]; then + msprof op simulator --application=./ascendc_kernels_bbit + elif [ "${RUN_MODE}" = "cpu" ]; then + ./ascendc_kernels_bbit + fi + else + ./ascendc_kernels_bbit + fi +) +md5sum output/*.bin +python3 scripts/verify_result.py output/output.bin output/golden.bin diff --git a/operator/ascendc/0_introduction/22_mmad_profile_kernellaunch/MmadInvocation/scripts/gen_data.py b/operator/ascendc/0_introduction/22_mmad_profile_kernellaunch/MmadInvocation/scripts/gen_data.py new file mode 100644 index 0000000000000000000000000000000000000000..191bcc1d9c6383fe4425a52e3a1691a7cbc69849 --- /dev/null +++ b/operator/ascendc/0_introduction/22_mmad_profile_kernellaunch/MmadInvocation/scripts/gen_data.py @@ -0,0 +1,31 @@ +#!/usr/bin/python3 +# coding=utf-8 +# +# Copyright (C) 2023-2024. Huawei Technologies Co., Ltd. All rights reserved. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
+# =============================================================================== + +import numpy as np +import os + + +def gen_golden_data(): + M = 128 + N = 128 + K = 128 + + x1_gm = np.random.randint(1, 10, [M, K]).astype(np.float16) + x2_gm = np.random.randint(1, 10, [K, N]).astype(np.float16) + golden = (np.matmul(x1_gm.astype(np.float32), x2_gm.astype(np.float32))).astype(np.float32) + os.system("mkdir -p input") + os.system("mkdir -p output") + x1_gm.tofile("./input/x1_gm.bin") + x2_gm.tofile("./input/x2_gm.bin") + golden.tofile("./output/golden.bin") + + +if __name__ == "__main__": + gen_golden_data() diff --git a/operator/ascendc/0_introduction/22_mmad_profile_kernellaunch/MmadInvocation/scripts/verify_result.py b/operator/ascendc/0_introduction/22_mmad_profile_kernellaunch/MmadInvocation/scripts/verify_result.py new file mode 100644 index 0000000000000000000000000000000000000000..a325cfcc61f46f7de04ef6c979f852adcf213ef3 --- /dev/null +++ b/operator/ascendc/0_introduction/22_mmad_profile_kernellaunch/MmadInvocation/scripts/verify_result.py @@ -0,0 +1,53 @@ +#!/usr/bin/python3 +# coding=utf-8 +# +# Copyright (C) 2023-2024. Huawei Technologies Co., Ltd. All rights reserved. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
+# =============================================================================== + +import sys +import numpy as np + +# for float32 +relative_tol = 1e-6 +absolute_tol = 1e-9 +error_tol = 1e-4 + + +def verify_result(output, golden): + output = np.fromfile(output, dtype=np.float32).reshape(-1) + golden = np.fromfile(golden, dtype=np.float32).reshape(-1) + different_element_results = np.isclose(output, + golden, + rtol=relative_tol, + atol=absolute_tol, + equal_nan=True) + different_element_indexes = np.where(different_element_results == False)[0] + for index in range(len(different_element_indexes)): + real_index = different_element_indexes[index] + golden_data = golden[real_index] + output_data = output[real_index] + print( + "data index: %06d, expected: %-.9f, actual: %-.9f, rdiff: %-.6f" % + (real_index, golden_data, output_data, + abs(output_data - golden_data) / golden_data)) + if index == 100: + break + error_ratio = float(different_element_indexes.size) / golden.size + print("error ratio: %.4f, tolrence: %.4f" % (error_ratio, error_tol)) + return error_ratio <= error_tol + + +if __name__ == '__main__': + try: + res = verify_result(sys.argv[1], sys.argv[2]) + if not res: + raise ValueError("[ERROR] result error") + else: + print("test pass") + except Exception as e: + print(e) + sys.exit(1) diff --git a/operator/ascendc/0_introduction/22_mmad_profile_kernellaunch/README.md b/operator/ascendc/0_introduction/22_mmad_profile_kernellaunch/README.md new file mode 100644 index 0000000000000000000000000000000000000000..5a0e0840551f9258035a95fd00eb4ebd3bdd69c4 --- /dev/null +++ b/operator/ascendc/0_introduction/22_mmad_profile_kernellaunch/README.md @@ -0,0 +1,45 @@ +## 概述 +本样例介绍基于基础API的matmul算子实现及核函数直调方法。 + +## 目录结构介绍 +``` +└── 20_mmad_kernellaunch // 使用核函数直调的方式调用Matmul自定义算子。 + └── MmadInvocation // Kernel Launch方式调用核函数样例。 + └── MmadBiasInvocation // Kernel Launch方式调用核函数样例,新增bias输入。 +``` + +## 算子描述 +算子使用基础API包括DataCopy、LoadData、Mmad等,实现Matmul矩阵乘功能。 + +Matmul的计算公式为: + 
+``` +C = A * B + Bias +``` + +- A、B为源操作数,A为左矩阵,形状为\[M, K];B为右矩阵,形状为\[K, N]。 +- C为目的操作数,存放矩阵乘结果的矩阵,形状为\[M, N]。 +- Bias为矩阵乘偏置,形状为\[N]。对A*B结果矩阵的每一行都采用该Bias进行偏置。 + +## 支持的产品型号 +本样例支持如下产品型号: +- Atlas 推理系列产品 +- Atlas A2训练系列产品/Atlas 800I A2推理产品 + +## 编译运行样例算子 + +### 1. 获取源码包 +编译运行此样例前,请参考[准备:获取样例代码](../README.md#codeready)获取源码包。 + +### 2. 编译运行样例工程 +- [MmadBiasInvocation样例运行](./MmadBiasInvocation/README.md) +- [MmadInvocation样例运行](./MmadInvocation/README.md) + +## 更新说明 +| 时间 | 更新事项 | +| ---------- | ------------------------ | +| 2024/11/20 | 新增readme | + +## 已知issue + + 暂无