From bb52ae1762fae16af0561bdcdb7fe74279e0df29 Mon Sep 17 00:00:00 2001 From: li-yuanjie-da Date: Mon, 25 Aug 2025 09:45:33 +0800 Subject: [PATCH] =?UTF-8?q?=E6=96=B0=E5=A2=9Ebmm=E7=89=B9=E6=80=A7?= =?UTF-8?q?=E6=A0=B7=E4=BE=8B=E4=BB=A5=E5=8F=8A=E6=80=A7=E8=83=BD=E6=A8=A1?= =?UTF-8?q?=E5=BC=8F=E9=80=82=E9=85=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../batch_matmul_bias_no_batch/CMakeLists.txt | 73 ++++++ .../batch_matmul_bias_no_batch/README.md | 104 ++++++++ .../cmake/cpu_lib.cmake | 31 +++ .../cmake/npu_lib.cmake | 27 +++ .../batch_matmul_bias_no_batch/main.cpp | 222 ++++++++++++++++++ ...tch_matmul_bias_no_batch_custom_tiling.cpp | 54 +++++ ...batch_matmul_bias_no_batch_custom_tiling.h | 38 +++ .../batch_matmul_bias_no_batch_custom.cpp | 87 +++++++ .../batch_matmul_bias_no_batch_custom_impl.h | 44 ++++ .../matrix/batch_matmul_bias_no_batch/run.sh | 96 ++++++++ .../scripts/exec_test.py | 118 ++++++++++ .../scripts/gen_batch_data.py | 93 ++++++++ .../testcase/case.csv | 1 + .../matrix/batch_matmul_tscm/CMakeLists.txt | 73 ++++++ examples/matrix/batch_matmul_tscm/README.md | 127 ++++++++++ .../batch_matmul_tscm/cmake/cpu_lib.cmake | 31 +++ .../batch_matmul_tscm/cmake/npu_lib.cmake | 27 +++ examples/matrix/batch_matmul_tscm/main.cpp | 222 ++++++++++++++++++ .../batch_matmul_tscm_custom_tiling.cpp | 54 +++++ .../op_host/batch_matmul_tscm_custom_tiling.h | 38 +++ .../op_kernel/batch_matmul_tscm_custom.cpp | 95 ++++++++ .../op_kernel/batch_matmul_tscm_custom_impl.h | 38 +++ examples/matrix/batch_matmul_tscm/run.sh | 96 ++++++++ .../batch_matmul_tscm/scripts/exec_test.py | 120 ++++++++++ .../scripts/gen_batch_data.py | 113 +++++++++ .../batch_matmul_tscm/testcase/case.csv | 1 + .../matrix/matmul_nbuffer33/CMakeLists.txt | 1 + examples/matrix/matmul_nbuffer33/README.md | 7 +- .../matmul_nbuffer33/cmake/cpu_lib.cmake | 4 + .../matmul_nbuffer33/cmake/npu_lib.cmake | 9 +- .../op_kernel/matmul_nbuffer33_custom_impl.h 
| 9 + examples/matrix/matmul_nbuffer33/run.sh | 13 +- examples/matrix/matmul_preload/CMakeLists.txt | 1 + examples/matrix/matmul_preload/README.md | 5 +- .../matrix/matmul_preload/cmake/cpu_lib.cmake | 1 + .../matrix/matmul_preload/cmake/npu_lib.cmake | 1 + .../op_kernel/matmul_preload_custom_impl.h | 4 + examples/matrix/matmul_preload/run.sh | 13 +- examples/matrix/matmul_splitk/CMakeLists.txt | 1 + examples/matrix/matmul_splitk/README.md | 7 +- .../matrix/matmul_splitk/cmake/cpu_lib.cmake | 4 + .../matrix/matmul_splitk/cmake/npu_lib.cmake | 9 +- .../op_host/matmul_splitk_custom_tiling.cpp | 2 + .../op_kernel/matmul_splitk_custom_impl.h | 4 + examples/matrix/matmul_splitk/run.sh | 13 +- .../matrix/matmul_splitk/testcase/case.csv | 3 +- examples/readme.md | 7 +- 47 files changed, 2115 insertions(+), 26 deletions(-) create mode 100644 examples/matrix/batch_matmul_bias_no_batch/CMakeLists.txt create mode 100644 examples/matrix/batch_matmul_bias_no_batch/README.md create mode 100644 examples/matrix/batch_matmul_bias_no_batch/cmake/cpu_lib.cmake create mode 100644 examples/matrix/batch_matmul_bias_no_batch/cmake/npu_lib.cmake create mode 100644 examples/matrix/batch_matmul_bias_no_batch/main.cpp create mode 100644 examples/matrix/batch_matmul_bias_no_batch/op_host/batch_matmul_bias_no_batch_custom_tiling.cpp create mode 100644 examples/matrix/batch_matmul_bias_no_batch/op_host/batch_matmul_bias_no_batch_custom_tiling.h create mode 100644 examples/matrix/batch_matmul_bias_no_batch/op_kernel/batch_matmul_bias_no_batch_custom.cpp create mode 100644 examples/matrix/batch_matmul_bias_no_batch/op_kernel/batch_matmul_bias_no_batch_custom_impl.h create mode 100644 examples/matrix/batch_matmul_bias_no_batch/run.sh create mode 100644 examples/matrix/batch_matmul_bias_no_batch/scripts/exec_test.py create mode 100644 examples/matrix/batch_matmul_bias_no_batch/scripts/gen_batch_data.py create mode 100644 examples/matrix/batch_matmul_bias_no_batch/testcase/case.csv create mode 
100644 examples/matrix/batch_matmul_tscm/CMakeLists.txt create mode 100644 examples/matrix/batch_matmul_tscm/README.md create mode 100644 examples/matrix/batch_matmul_tscm/cmake/cpu_lib.cmake create mode 100644 examples/matrix/batch_matmul_tscm/cmake/npu_lib.cmake create mode 100644 examples/matrix/batch_matmul_tscm/main.cpp create mode 100644 examples/matrix/batch_matmul_tscm/op_host/batch_matmul_tscm_custom_tiling.cpp create mode 100644 examples/matrix/batch_matmul_tscm/op_host/batch_matmul_tscm_custom_tiling.h create mode 100644 examples/matrix/batch_matmul_tscm/op_kernel/batch_matmul_tscm_custom.cpp create mode 100644 examples/matrix/batch_matmul_tscm/op_kernel/batch_matmul_tscm_custom_impl.h create mode 100644 examples/matrix/batch_matmul_tscm/run.sh create mode 100644 examples/matrix/batch_matmul_tscm/scripts/exec_test.py create mode 100644 examples/matrix/batch_matmul_tscm/scripts/gen_batch_data.py create mode 100644 examples/matrix/batch_matmul_tscm/testcase/case.csv diff --git a/examples/matrix/batch_matmul_bias_no_batch/CMakeLists.txt b/examples/matrix/batch_matmul_bias_no_batch/CMakeLists.txt new file mode 100644 index 00000000..bdb4aded --- /dev/null +++ b/examples/matrix/batch_matmul_bias_no_batch/CMakeLists.txt @@ -0,0 +1,73 @@ +# Copyright (c) 2025 Huawei Technologies Co., Ltd. +# This file is a part of the CANN Open Software. +# Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. 
+# ====================================================================================================================== + +cmake_minimum_required(VERSION 3.16) +project(Ascend_c) +if(${RUN_MODE}) + set(RUN_MODE "npu" CACHE STRING "cpu/sim/npu") +endif() +if (${SOC_VERSION}) + set(SOC_VERSION "Ascend910" CACHE STRING "system on chip type") +endif() + +set(ASCEND_CANN_PACKAGE_PATH "~/Ascend/ascend-toolkit/latest" CACHE STRING "ASCEND CANN package installation directory") +if(NOT CMAKE_BUILD_TYPE) + set(CMAKE_BUILD_TYPE "Debug" CACHE STRING "Build type Release/Debug (default Debug)" FORCE) +endif() + +if(CMAKE_INSTALL_PREFIX STREQUAL /usr/local) + set(CMAKE_INSTALL_PREFIX "${CMAKE_CURRENT_LIST_DIR}/out" CACHE STRING "path for install()" FORCE) +endif() + +file(GLOB KERNEL_FILES + ${CMAKE_CURRENT_SOURCE_DIR}/op_kernel/batch_matmul_bias_no_batch_custom.cpp +) + +if("${RUN_MODE}" STREQUAL "cpu") + include(cmake/cpu_lib.cmake) +elseif("${RUN_MODE}" STREQUAL "sim" OR "${RUN_MODE}" STREQUAL "npu") + include(cmake/npu_lib.cmake) +else() + message("invalid RUN_MODE: ${RUN_MODE}") +endif() + +add_executable(ascendc_batch_matmul_bias_no_batch_bbit + ${CMAKE_CURRENT_SOURCE_DIR}/main.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/op_host/batch_matmul_bias_no_batch_custom_tiling.cpp +) + +target_compile_options(ascendc_batch_matmul_bias_no_batch_bbit PRIVATE + $:-g>> + -O2 + -std=c++17 + -D_GLIBCXX_USE_CXX11_ABI=0 +) + +target_compile_definitions(ascendc_batch_matmul_bias_no_batch_bbit PRIVATE + SOC_VERSION="${SOC_VERSION}" +) + +target_include_directories(ascendc_batch_matmul_bias_no_batch_bbit PRIVATE + ${CMAKE_CURRENT_SOURCE_DIR} + $:${ASCEND_CANN_PACKAGE_PATH}/include>> + $:${ASCEND_CANN_PACKAGE_PATH}/runtime/include>> +) + +target_link_libraries(ascendc_batch_matmul_bias_no_batch_bbit PRIVATE + $,$>:host_intf_pub>> + $:tikicpulib::${SOC_VERSION}>> + $:ascendcl>> + $:c_sec>> + ascendc_kernels_${RUN_MODE} + tiling_api + register + platform + ascendalog + dl +) diff --git 
a/examples/matrix/batch_matmul_bias_no_batch/README.md b/examples/matrix/batch_matmul_bias_no_batch/README.md new file mode 100644 index 00000000..cb33b830 --- /dev/null +++ b/examples/matrix/batch_matmul_bias_no_batch/README.md @@ -0,0 +1,104 @@ + +## 概述 + +本样例介绍了调用Matmul高阶API实现BatchMatmul单算子,输入Bias矩阵不带Batch轴,即多Batch计算Matmul时,复用同一个Bias矩阵。 + +本样例以直调的方式调用算子核函数。 + +直调:核函数的基础调用方式,开发者完成算子核函数的开发和Tiling实现后,即可通过AscendCL运行时接口,完成算子的调用。包含CPU侧、NPU侧和仿真三种运行验证方法。 + +## 样例支持的产品型号为 +- Atlas A2训练系列产品/Atlas 800I A2推理产品 + +## 目录结构 +| 目录及文件 | 描述 | +|----------------------------------|----------------------| +| [cmake](cmake) | 编译工程文件 | +| [op_host](op_host) | 本样例的tiling代码实现 | +| [op_kernel](op_kernel) | 本样例的kernel代码实现 | +| [scripts](scripts) | 包含输入输出真值数据生成脚本文件和执行文件 | +| [testcase](testcase) | 用例文件,配置用例的计算shape信息 | +| CMakeLists.txt | 编译工程文件 | +| main.cpp | 主函数,调用算子的应用程序,含CPU域及NPU域调用 | +| run.sh | 编译执行脚本 | + +## 算子描述 +- 算子功能 + BatchMatmul单算子,批量处理Matmul计算,每次Matmul计算对输入的A B矩阵做矩阵乘和加bias偏置。该算子多Batch复用相同的Bias矩阵。 + +- 算子规格 + + + + + + + + + + + + +
算子类型(OpType)BatchMatmulCustom
算子输入nameshapedata typeformatisTranslayout
a-float16NDfalseNORMAL
b-float16NDtrueNORMAL
bias-floatND--
算子输出c-floatND-NORMAL
核函数名batch_matmul_bias_no_batch_custom
+ +## 算子实现介绍 +- 约束条件 + - 输入和输出的Layout类型都为NORMAL时,不支持BatchMode为SINGLE_LARGE_THAN_L1的场景。 + +- 算子Kernel实现 + - 一次完成BatchNum个Matmul矩阵乘法的运算。单次MatMul的计算公式为:C = A * B + Bias。 + - A、B为源操作数,A为左矩阵,形状为[M, K];B为右矩阵,形状为[K, N]。 + - C为目的操作数,存放矩阵乘结果的矩阵,形状为[M, N]。 + - Bias为矩阵乘偏置,形状为[1, N]。对A*B结果矩阵的每一行都采用该bias进行偏置。 + - 具体步骤: + - 创建Matmul对象。 + 创建Matmul对象时,自定义MatmulConfig参数,将其中的isBiasBatch参数设置为false,使能BatchMatmul的Bias复用功能,获得自定义的使用NORM模板的Matmul对象。 + ``` + constexpr MatmulConfigMode configMode = MatmulConfigMode::CONFIG_NORM; + constexpr MatmulBatchParams batchParams = { + false, BatchMode::BATCH_LESS_THAN_L1, false /* isBiasBatch */ + }; + constexpr MatmulConfig CFG_MM = GetMMConfig(batchParams); + AscendC::Matmul matmulObj; + ``` + - 初始化操作。 + - 设置左矩阵A 、右矩阵B、Bias。 + - 完成多batch矩阵乘操作。 + ``` + matmulObj.IterateBatch(cGlobal, batchA, batchB, false); + ``` + - 结束矩阵乘操作。 + +- 算子Tiling实现 + - Ascend C提供一组Matmul Tiling API,方便用户获取Matmul Kernel计算时所需的 Tiling参数。只需要传入A/B/C矩阵等信息,调用API接口,即可获取到TCubeTiling结构体中的相关参数。 + - 获取Tiling参数的流程如下: + - 创建一个Tiling对象。 + - 根据输入输出Layout设置单核计算的A、B、C、Bias的参数类型信息;M、N、Ka、Kb形状信息等。 + - 调用SetALayout、SetBLayout、SetCLayout、SetBatchNum设置A/B/C的Layout轴信息和最大BatchNum数。 + - 调用GetTiling接口,获取Tiling信息。 + +## 编译运行样例 + + - 配置环境变量 + + 这里的\$ASCEND_CANN_PACKAGE_PATH需要替换为CANN开发套件包安装后文件存储路径。例如:/usr/local/Ascend/ascend-toolkit/latest + ``` + export ASCEND_HOME_DIR=$ASCEND_CANN_PACKAGE_PATH + source $ASCEND_HOME_DIR/../set_env.sh + ``` + + - 编译执行 + + ``` + bash run.sh -r [RUN_MODE] -v [SOC_VERSION] -p [IS_PERF] + ``` + 其中脚本参数说明如下: + - RUN_MODE :编译执行方式,可选择CPU调试,NPU仿真,NPU上板,对应参数分别为[cpu / sim / npu]。若需要详细了解NPU仿真相关内容,请参考[《算子开发工具msProf》](https://hiascend.com/document/redirect/CannCommunityToolMsProf)中的“工具使用”章节。 + - SOC_VERSION :昇腾AI处理器型号,如果无法确定具体的[SOC_VERSION],则在安装昇腾AI处理器的服务器执行npu-smi info命令进行查询,在查询到的“Name”前增加Ascend信息,例如“Name”对应取值为xxxyy,实际配置的[SOC_VERSION]值为Ascendxxxyy。支持以下产品型号: + - Atlas A2训练系列产品/Atlas 800I A2推理产品 + - IS_PERF : 是否获取执行性能数据,当前只在NPU执行时生效,打开后性能数据显示在执行结果中,可选择关闭和开启该功能,对应参数分别为[0 / 1]。 + + 
示例如下,Ascendxxxyy请替换为实际的AI处理器型号。 + ``` + bash run.sh -r cpu -v Ascendxxxyy -p 0 + ``` \ No newline at end of file diff --git a/examples/matrix/batch_matmul_bias_no_batch/cmake/cpu_lib.cmake b/examples/matrix/batch_matmul_bias_no_batch/cmake/cpu_lib.cmake new file mode 100644 index 00000000..244469fb --- /dev/null +++ b/examples/matrix/batch_matmul_bias_no_batch/cmake/cpu_lib.cmake @@ -0,0 +1,31 @@ +# Copyright (c) 2025 Huawei Technologies Co., Ltd. +# This file is a part of the CANN Open Software. +# Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. +# ====================================================================================================================== + +if(NOT DEFINED ENV{CMAKE_PREFIX_PATH}) + set(CMAKE_PREFIX_PATH ${ASCEND_CANN_PACKAGE_PATH}/tools/tikicpulib/lib/cmake) +endif() +find_package(tikicpulib REQUIRED) + +add_library(ascendc_kernels_${RUN_MODE} SHARED + ${KERNEL_FILES} +) + +target_link_libraries(ascendc_kernels_${RUN_MODE} PRIVATE + tikicpulib::${SOC_VERSION} +) + +target_compile_options(ascendc_kernels_${RUN_MODE} PRIVATE + -g + -O0 + -std=c++17 +) + +install(TARGETS ascendc_kernels_${RUN_MODE} +DESTINATION ${CMAKE_INSTALL_LIBDIR} +) \ No newline at end of file diff --git a/examples/matrix/batch_matmul_bias_no_batch/cmake/npu_lib.cmake b/examples/matrix/batch_matmul_bias_no_batch/cmake/npu_lib.cmake new file mode 100644 index 00000000..7e1c58d6 --- /dev/null +++ b/examples/matrix/batch_matmul_bias_no_batch/cmake/npu_lib.cmake @@ -0,0 +1,27 @@ +# Copyright (c) 2025 Huawei Technologies Co., Ltd. 
+# This file is a part of the CANN Open Software. +# Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. +# ====================================================================================================================== + +if(EXISTS ${ASCEND_CANN_PACKAGE_PATH}/compiler/tikcpp/ascendc_kernel_cmake) + set(ASCENDC_CMAKE_DIR ${ASCEND_CANN_PACKAGE_PATH}/compiler/tikcpp/ascendc_kernel_cmake) +elseif(EXISTS ${ASCEND_CANN_PACKAGE_PATH}/tools/tikcpp/ascendc_kernel_cmake) + set(ASCENDC_CMAKE_DIR ${ASCEND_CANN_PACKAGE_PATH}/tools/tikcpp/ascendc_kernel_cmake) +else() + message(FATAL_ERROR "ascendc_kernel_cmake does not exist ,please check whether the cann package is installed") +endif() +include(${ASCENDC_CMAKE_DIR}/ascendc.cmake) + +ascendc_library(ascendc_kernels_${RUN_MODE} STATIC + ${KERNEL_FILES} +) + +ascendc_compile_definitions(ascendc_kernels_${RUN_MODE} PRIVATE + -DASCENDC_DUMP + -DHAVE_WORKSPACE + -DHAVE_TILING + ) diff --git a/examples/matrix/batch_matmul_bias_no_batch/main.cpp b/examples/matrix/batch_matmul_bias_no_batch/main.cpp new file mode 100644 index 00000000..cd35ab4a --- /dev/null +++ b/examples/matrix/batch_matmul_bias_no_batch/main.cpp @@ -0,0 +1,222 @@ +/** + * Copyright (c) 2025 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. 
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ + +#include +#include "../../common/data_utils.h" +#include "kernel_tiling/kernel_tiling.h" +#include "tiling/platform/platform_ascendc.h" +#include "op_host/batch_matmul_bias_no_batch_custom_tiling.h" +#ifndef ASCENDC_CPU_DEBUG +#include "acl/acl.h" +extern void batch_matmul_bias_no_batch_custom_do(uint32_t coreDim, void* stream, + uint8_t *a, uint8_t *b, uint8_t *bias, uint8_t *c, + uint8_t *workspace, uint8_t *tiling); +#else +#include "tikicpulib.h" +extern "C" void batch_matmul_bias_no_batch_custom(uint8_t *a, uint8_t *b, uint8_t *bias, uint8_t *c, + uint8_t *workspace, uint8_t *tiling); +#endif + +namespace { +constexpr bool IS_BIAS = true; +constexpr bool IS_A_TRANS = true; +constexpr bool IS_B_TRANS = false; + +struct MatrixFileSize +{ + size_t x1FileSize; + size_t x2FileSize; + size_t yFileSize; + size_t biasFileSize; +}; +} + +namespace MatmulHost { +static size_t GetSysWorkSpaceSize() +{ + auto ascendcPlatform = platform_ascendc::PlatformAscendCManager::GetInstance(); + if (ascendcPlatform == nullptr) { + return 0; + } + return static_cast(ascendcPlatform->GetLibApiWorkSpaceSize()); +} + +// CPU debug mode +#ifdef ASCENDC_CPU_DEBUG +void TestMatmulCpu(const MatmulHost::MatmulCaseParams &testCaseParams, const MatrixFileSize& matrixFileSize) +{ + size_t x1FileSize = matrixFileSize.x1FileSize; + size_t x2FileSize = matrixFileSize.x2FileSize; + size_t yFileSize = matrixFileSize.yFileSize; + size_t biasFileSize = matrixFileSize.biasFileSize; + // Query workspace size + size_t workspaceSize = GetSysWorkSpaceSize(); + uint8_t* x1 = (uint8_t*)AscendC::GmAlloc(x1FileSize); + uint8_t* x2 = (uint8_t*)AscendC::GmAlloc(x2FileSize); + uint8_t* bias; + uint8_t* y = 
(uint8_t*)AscendC::GmAlloc(yFileSize); + uint8_t* workspace = (uint8_t*)AscendC::GmAlloc(workspaceSize); + ReadFile("../input/x1_gm.bin", x1FileSize, x1, x1FileSize); + ReadFile("../input/x2_gm.bin", x2FileSize, x2, x2FileSize); + if (IS_BIAS) { + bias = (uint8_t *)AscendC::GmAlloc(biasFileSize); + ReadFile("../input/bias_gm.bin", biasFileSize, bias, biasFileSize); + } + size_t tilingFileSize = sizeof(TCubeTiling); + uint8_t* tiling = (uint8_t*)AscendC::GmAlloc(tilingFileSize); + // Calculate Tiling + const auto& tilingData = MatmulHost::GenerateTiling(testCaseParams); + memcpy_s(tiling, tilingFileSize, &tilingData, tilingFileSize); + ICPU_RUN_KF(batch_matmul_bias_no_batch_custom, tilingData.usedCoreNum, x1, x2,bias, y, workspace, tiling); + WriteFile("../output/output.bin", y, yFileSize); + AscendC::GmFree((void*)x1); + AscendC::GmFree((void*)x2); + if (IS_BIAS) { + AscendC::GmFree((void*)bias); + } + AscendC::GmFree((void*)y); + AscendC::GmFree((void*)workspace); + AscendC::GmFree((void*)tiling); +} +// NPU +#else +void MatmulOp(uint8_t* x1, uint8_t* x2, uint8_t* y, uint8_t* bias, MatmulHost::MatmulCaseParams testCaseParams, + void* stream = nullptr) +{ + // Init args + uint8_t *workspaceDevice; + + // Query workspace size + size_t workspaceSize = GetSysWorkSpaceSize(); + + // Allocate workspace on device + CHECK_ACL(aclrtMalloc((void **)&workspaceDevice, workspaceSize, ACL_MEM_MALLOC_HUGE_FIRST)); + + // Initialize kernel with arguments and workspace pointer + uint8_t* tilingHost; + uint8_t* tilingDevice; + size_t tilingFileSize = sizeof(TCubeTiling); + // Calculate Tiling + const auto tilingData = MatmulHost::GenerateTiling(testCaseParams); + CHECK_ACL(aclrtMallocHost((void **)(&tilingHost), tilingFileSize)); + CHECK_ACL(aclrtMalloc((void **)&tilingDevice, tilingFileSize, + ACL_MEM_MALLOC_HUGE_FIRST)); + CHECK_ACL(aclrtMemcpy(tilingHost, tilingFileSize, &tilingData, + tilingFileSize, ACL_MEMCPY_HOST_TO_HOST)); + CHECK_ACL(aclrtMemcpy(tilingDevice, 
tilingFileSize, tilingHost, + tilingFileSize, ACL_MEMCPY_HOST_TO_DEVICE)); + // Launch kernel + batch_matmul_bias_no_batch_custom_do(tilingData.usedCoreNum, stream, x1, x2, bias, y, workspaceDevice, tilingDevice); + CHECK_ACL(aclrtFreeHost(tilingHost)); + CHECK_ACL(aclrtFree(workspaceDevice)); + CHECK_ACL(aclrtFree(tilingDevice)); +} + +void TestAclInit(aclrtContext &context, aclrtStream &stream, int64_t &deviceId) +{ + CHECK_ACL(aclInit(nullptr)); + CHECK_ACL(aclrtSetDevice(deviceId)); + CHECK_ACL(aclrtCreateContext(&context, deviceId)); + CHECK_ACL(aclrtCreateStream(&stream)); +} + +void TestAclDeInit(aclrtContext &context, aclrtStream &stream, int64_t &deviceId) +{ + CHECK_ACL(aclrtDestroyStream(stream)); + CHECK_ACL(aclrtDestroyContext(context)); + CHECK_ACL(aclrtResetDevice(deviceId)); + CHECK_ACL(aclFinalize()); +} + +void TestMatmul(const MatmulHost::MatmulCaseParams &testCaseParams, const MatrixFileSize& matrixFileSize) +{ + size_t x1FileSize = matrixFileSize.x1FileSize; + size_t x2FileSize = matrixFileSize.x2FileSize; + size_t yFileSize = matrixFileSize.yFileSize; + size_t biasFileSize = matrixFileSize.biasFileSize; + + aclrtContext context; + aclrtStream stream = nullptr; + int64_t deviceId = 0; + TestAclInit(context, stream, deviceId); + + uint8_t *x1Host; + uint8_t *x1Device; + CHECK_ACL(aclrtMallocHost((void **)(&x1Host), x1FileSize)); + CHECK_ACL(aclrtMalloc((void **)&x1Device, x1FileSize, ACL_MEM_MALLOC_HUGE_FIRST)); + ReadFile("../input/x1_gm.bin", x1FileSize, x1Host, x1FileSize); + CHECK_ACL(aclrtMemcpy(x1Device, x1FileSize, x1Host, x1FileSize, ACL_MEMCPY_HOST_TO_DEVICE)); + + uint8_t *x2Host; + uint8_t *x2Device; + CHECK_ACL(aclrtMallocHost((void **)(&x2Host), x2FileSize)); + CHECK_ACL(aclrtMalloc((void **)&x2Device, x2FileSize, ACL_MEM_MALLOC_HUGE_FIRST)); + ReadFile("../input/x2_gm.bin", x2FileSize, x2Host, x2FileSize); + CHECK_ACL(aclrtMemcpy(x2Device, x2FileSize, x2Host, x2FileSize, ACL_MEMCPY_HOST_TO_DEVICE)); + + uint8_t *biasHost = nullptr; 
+ uint8_t *biasDevice = nullptr; + if (IS_BIAS) { + CHECK_ACL(aclrtMallocHost((void **)(&biasHost), biasFileSize)); + CHECK_ACL(aclrtMalloc((void **)&biasDevice, biasFileSize, ACL_MEM_MALLOC_HUGE_FIRST)); + ReadFile("../input/bias_gm.bin", biasFileSize, biasHost, biasFileSize); + CHECK_ACL(aclrtMemcpy(biasDevice, biasFileSize, biasHost, biasFileSize, ACL_MEMCPY_HOST_TO_DEVICE)); + } + uint8_t *yHost = nullptr; + uint8_t *yDevice = nullptr; + CHECK_ACL(aclrtMallocHost((void **)(&yHost), yFileSize)); + CHECK_ACL(aclrtMalloc((void **)&yDevice, yFileSize, ACL_MEM_MALLOC_HUGE_FIRST)); + + MatmulOp(x1Device, x2Device, yDevice, biasDevice, testCaseParams, stream); + CHECK_ACL(aclrtSynchronizeStream(stream)); + + CHECK_ACL(aclrtMemcpy(yHost, yFileSize, yDevice, yFileSize, ACL_MEMCPY_DEVICE_TO_HOST)); + WriteFile("../output/output.bin", yHost, yFileSize); + + if (IS_BIAS) { + CHECK_ACL(aclrtFree(biasDevice)); + CHECK_ACL(aclrtFreeHost(biasHost)); + } + CHECK_ACL(aclrtFree(x1Device)); + CHECK_ACL(aclrtFreeHost(x1Host)); + CHECK_ACL(aclrtFree(x2Device)); + CHECK_ACL(aclrtFreeHost(x2Host)); + CHECK_ACL(aclrtFree(yDevice)); + CHECK_ACL(aclrtFreeHost(yHost)); + TestAclDeInit(context, stream, deviceId); +} +#endif +} // namespace MatmulHost + +int32_t main(int32_t argc, const char *args[]) +{ + int32_t size = 4; + int64_t problem[size] = {1, 1, 1, 1}; + for (int32_t i = 1; i < argc && i < size + 1; ++i) { + std::stringstream ss(args[i]); + ss >> problem[i - 1]; + } + auto ascendcPlatform = platform_ascendc::PlatformAscendCManager::GetInstance(SOC_VERSION); + MatmulHost::MatmulCaseParams testCaseParams{static_cast(ascendcPlatform->GetCoreNumAic()), + static_cast(problem[0]), static_cast(problem[1]), static_cast(problem[2]), // M, N, K + IS_BIAS, IS_A_TRANS, IS_B_TRANS, + static_cast(problem[3])}; // BatchNum + MatrixFileSize matrixFileSize; + matrixFileSize.x1FileSize = static_cast(testCaseParams.m * testCaseParams.k * testCaseParams.batchNum) * sizeof(uint16_t); + 
matrixFileSize.x2FileSize = static_cast(testCaseParams.k * testCaseParams.n * testCaseParams.batchNum) * sizeof(uint16_t); + matrixFileSize.yFileSize = static_cast(testCaseParams.m * testCaseParams.n * testCaseParams.batchNum) * sizeof(float); + matrixFileSize.biasFileSize = static_cast(1 * testCaseParams.n) * sizeof(float); +#ifdef ASCENDC_CPU_DEBUG + MatmulHost::TestMatmulCpu(testCaseParams, matrixFileSize); +#else + MatmulHost::TestMatmul(testCaseParams, matrixFileSize); +#endif + return 0; +} diff --git a/examples/matrix/batch_matmul_bias_no_batch/op_host/batch_matmul_bias_no_batch_custom_tiling.cpp b/examples/matrix/batch_matmul_bias_no_batch/op_host/batch_matmul_bias_no_batch_custom_tiling.cpp new file mode 100644 index 00000000..bb80ce11 --- /dev/null +++ b/examples/matrix/batch_matmul_bias_no_batch/op_host/batch_matmul_bias_no_batch_custom_tiling.cpp @@ -0,0 +1,54 @@ +/* + * Copyright (c) 2025 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. 
+ */ + +#include "batch_matmul_bias_no_batch_custom_tiling.h" +#include + +namespace MatmulHost { + +TCubeTiling GenerateTiling(const MatmulCaseParams& testCaseParams) +{ + TCubeTiling tilingData; + auto ascendcPlatform = platform_ascendc::PlatformAscendCManager::GetInstance(); + matmul_tiling::MultiCoreMatmulTiling cubeTiling(*ascendcPlatform); + int32_t M = testCaseParams.m; + int32_t N = testCaseParams.n; + int32_t K = testCaseParams.k; + int32_t blockDim = 1; + bool isBias = testCaseParams.isBias; + bool isAtrans = testCaseParams.isATrans; + bool isBtrans = testCaseParams.isBTrans; + int32_t batchNum = testCaseParams.batchNum; + + cubeTiling.SetDim(blockDim); + cubeTiling.SetAType(matmul_tiling::TPosition::GM, matmul_tiling::CubeFormat::ND, + matmul_tiling::DataType::DT_FLOAT16, isAtrans); + cubeTiling.SetBType(matmul_tiling::TPosition::GM, matmul_tiling::CubeFormat::ND, + matmul_tiling::DataType::DT_FLOAT16, isBtrans); + cubeTiling.SetCType(matmul_tiling::TPosition::GM, matmul_tiling::CubeFormat::ND, + matmul_tiling::DataType::DT_FLOAT); + cubeTiling.SetBiasType(matmul_tiling::TPosition::GM, matmul_tiling::CubeFormat::ND, + matmul_tiling::DataType::DT_FLOAT); + cubeTiling.SetOrgShape(M, N, K); + cubeTiling.SetShape(M, N, K); + cubeTiling.EnableBias(isBias); + cubeTiling.SetBufferSpace(-1, -1, -1); + cubeTiling.SetALayout(batchNum, M, 1, 1, K); + cubeTiling.SetBLayout(batchNum, K, 1, 1, N); + cubeTiling.SetCLayout(batchNum, M, 1, 1, N); + cubeTiling.SetBatchNum(batchNum); + if (cubeTiling.GetTiling(tilingData) == -1) { + std::cout << "Generate tiling failed." 
<< std::endl; + return {}; + } + return tilingData; +} + +} // namespace MatmulHost diff --git a/examples/matrix/batch_matmul_bias_no_batch/op_host/batch_matmul_bias_no_batch_custom_tiling.h b/examples/matrix/batch_matmul_bias_no_batch/op_host/batch_matmul_bias_no_batch_custom_tiling.h new file mode 100644 index 00000000..2239eb39 --- /dev/null +++ b/examples/matrix/batch_matmul_bias_no_batch/op_host/batch_matmul_bias_no_batch_custom_tiling.h @@ -0,0 +1,38 @@ +/* + * Copyright (c) 2025 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ + +#ifndef EXAMPLES_MATRIX_BATCH_MATMUL_BIAS_NO_BATCH_OP_HOST_BATCH_MATMUL_BIAS_NO_BATCH_CUSTOM_TILING_H +#define EXAMPLES_MATRIX_BATCH_MATMUL_BIAS_NO_BATCH_OP_HOST_BATCH_MATMUL_BIAS_NO_BATCH_CUSTOM_TILING_H +#include "register/tilingdata_base.h" +#include "tiling/tiling_api.h" + +namespace MatmulHost { + +struct MatmulCaseParams +{ + int32_t usedCoreNum; + int32_t m; + int32_t n; + int32_t k; + bool isBias; + bool isATrans; + bool isBTrans; + int32_t batchNum; +}; + +/** + * @brief Generate matmul tiling. + * @param testCaseParams: Testcase parameters. + * @retval Generated Tiling data. 
+ */ +TCubeTiling GenerateTiling(const MatmulCaseParams& testCaseParams); + +} // namespace MatmulHost +#endif // EXAMPLES_MATRIX_BATCH_MATMUL_BIAS_NO_BATCH_OP_HOST_BATCH_MATMUL_BIAS_NO_BATCH_CUSTOM_TILING_H diff --git a/examples/matrix/batch_matmul_bias_no_batch/op_kernel/batch_matmul_bias_no_batch_custom.cpp b/examples/matrix/batch_matmul_bias_no_batch/op_kernel/batch_matmul_bias_no_batch_custom.cpp new file mode 100644 index 00000000..e0ae3e73 --- /dev/null +++ b/examples/matrix/batch_matmul_bias_no_batch/op_kernel/batch_matmul_bias_no_batch_custom.cpp @@ -0,0 +1,87 @@ +/* + * Copyright (c) 2025 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. 
+ */ + +#include "kernel_operator.h" +#include "lib/matmul_intf.h" +#include "batch_matmul_bias_no_batch_custom_impl.h" + +namespace BatchMatmulCustom { +__aicore__ inline void CopyTiling(TCubeTiling* tiling, GM_ADDR tilingGM) +{ + uint32_t* ptr = reinterpret_cast(tiling); + auto tiling32 = reinterpret_cast<__gm__ uint32_t*>(tilingGM); + + for (int i = 0; i < sizeof(TCubeTiling) / sizeof(uint32_t); i++, ptr++) { + *ptr = *(tiling32 + i); + } + return; +} + +template +__aicore__ inline void BatchMatmulKernel::Init(GM_ADDR a, GM_ADDR b, GM_ADDR bias, + GM_ADDR c, GM_ADDR workspace, const TCubeTiling& tiling) +{ + this->tiling = tiling; + int32_t sizeA = tiling.ALayoutInfoB * tiling.ALayoutInfoS * tiling.ALayoutInfoN * tiling.ALayoutInfoG * tiling.ALayoutInfoD; + int32_t sizeB = tiling.BLayoutInfoB * tiling.BLayoutInfoS * tiling.BLayoutInfoN * tiling.BLayoutInfoG * tiling.BLayoutInfoD; + int32_t sizeC = tiling.CLayoutInfoB * tiling.CLayoutInfoS1 * tiling.CLayoutInfoN * tiling.CLayoutInfoG * tiling.CLayoutInfoS2; + int32_t sizeBias = tiling.CLayoutInfoN * tiling.CLayoutInfoG * tiling.CLayoutInfoS2; + + aGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ aType*>(a), sizeA); + bGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ bType*>(b), sizeB); + cGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ cType*>(c), sizeC); + biasGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ biasType*>(bias), sizeBias); + + if (GetSysWorkSpacePtr() == nullptr) { + return; + } +} + +template +__aicore__ inline void BatchMatmulKernel::Process(AscendC::TPipe* pipe, int32_t batchA, int32_t batchB) +{ + matmulObj.SetTensorA(aGlobal, false); + matmulObj.SetTensorB(bGlobal, true); // B transpose + if (tiling.isBias) { + matmulObj.SetBias(biasGlobal); + } + + matmulObj.IterateBatch(cGlobal, batchA, batchB, false); +} +} // namespace BatchMatmulCustom + +extern "C" __global__ __aicore__ void batch_matmul_bias_no_batch_custom(GM_ADDR a, GM_ADDR b, GM_ADDR bias, GM_ADDR c, GM_ADDR workspace, + GM_ADDR tilingGm) 
+{ + // prepare tiling + TCubeTiling tiling; + BatchMatmulCustom::CopyTiling(&tiling, tilingGm); + // define matmul kernel + using A_TYPE = AscendC::MatmulType; + using B_TYPE = AscendC::MatmulType; + using C_TYPE = AscendC::MatmulType; + using BIAS_TYPE = AscendC::MatmulType; + BatchMatmulCustom::BatchMatmulKernel batchMatmulKernel; + AscendC::TPipe pipe; + REGIST_MATMUL_OBJ(&pipe, GetSysWorkSpacePtr(), batchMatmulKernel.matmulObj, &tiling); + // init matmul kernel + batchMatmulKernel.Init(a, b, bias, c, workspace, tiling); + // matmul kernel process + batchMatmulKernel.Process(&pipe, 3, 3); +} + +#ifndef ASCENDC_CPU_DEBUG +void batch_matmul_bias_no_batch_custom_do(uint32_t blockDim, void* stream, GM_ADDR a, GM_ADDR b, GM_ADDR bias, + GM_ADDR c, GM_ADDR workspace, GM_ADDR tilingGm) +{ + // invoke the kernel function through the <<<>>> symbol + batch_matmul_bias_no_batch_custom<<>>(a, b, bias, c, workspace, tilingGm); +} +#endif \ No newline at end of file diff --git a/examples/matrix/batch_matmul_bias_no_batch/op_kernel/batch_matmul_bias_no_batch_custom_impl.h b/examples/matrix/batch_matmul_bias_no_batch/op_kernel/batch_matmul_bias_no_batch_custom_impl.h new file mode 100644 index 00000000..70e90e27 --- /dev/null +++ b/examples/matrix/batch_matmul_bias_no_batch/op_kernel/batch_matmul_bias_no_batch_custom_impl.h @@ -0,0 +1,44 @@ +/* + * Copyright (c) 2025 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. 
+ */ + +#ifndef EXAMPLES_MATRIX_BATCH_MATMUL_BIAS_NO_BATCH_OP_KERNEL_BATCH_MATMUL_BIAS_NO_BATCH_CUSTOM_IMPL_H +#define EXAMPLES_MATRIX_BATCH_MATMUL_BIAS_NO_BATCH_OP_KERNEL_BATCH_MATMUL_BIAS_NO_BATCH_CUSTOM_IMPL_H +#include "kernel_operator.h" +#include "lib/matmul_intf.h" + +namespace BatchMatmulCustom { +constexpr MatmulConfigMode configMode = MatmulConfigMode::CONFIG_NORM; +constexpr MatmulBatchParams batchParams = { + false, BatchMode::BATCH_LESS_THAN_L1, false /* isBiasBatch */ +}; +constexpr MatmulConfig CFG_MM = GetMMConfig(batchParams); + +template +class BatchMatmulKernel { + public: + __aicore__ inline BatchMatmulKernel(){}; + __aicore__ inline void Init(GM_ADDR a, GM_ADDR b, GM_ADDR bias, GM_ADDR c, GM_ADDR workspace, const TCubeTiling& tiling); + __aicore__ inline void Process(AscendC::TPipe* pipe, int32_t batchA, int32_t batchB); + AscendC::Matmul matmulObj; + private: + __aicore__ inline void CalcOffset(int32_t blockIdx, const TCubeTiling& tiling, int32_t& offsetA, int32_t& offsetB, + int32_t& offsetC, int32_t& offsetBias); + using aType = typename A_TYPE::T; + using bType = typename B_TYPE::T; + using cType = typename C_TYPE::T; + using biasType = typename BIAS_TYPE::T; + AscendC::GlobalTensor aGlobal; + AscendC::GlobalTensor bGlobal; + AscendC::GlobalTensor cGlobal; + AscendC::GlobalTensor biasGlobal; + TCubeTiling tiling; +}; +} // namespace BatchMatmulCustom +#endif // EXAMPLES_MATRIX_BATCH_MATMUL_BIAS_NO_BATCH_OP_KERNEL_BATCH_MATMUL_BIAS_NO_BATCH_CUSTOM_IMPL_H \ No newline at end of file diff --git a/examples/matrix/batch_matmul_bias_no_batch/run.sh b/examples/matrix/batch_matmul_bias_no_batch/run.sh new file mode 100644 index 00000000..b23451b8 --- /dev/null +++ b/examples/matrix/batch_matmul_bias_no_batch/run.sh @@ -0,0 +1,96 @@ +#!/bin/bash +# Copyright (c) 2025 Huawei Technologies Co., Ltd. +# This file is a part of the CANN Open Software. +# Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). 
+# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. +# ====================================================================================================================== + +export IS_PERF="0" + +SHORT=r:,v:,p:, +LONG=run-mode:,soc-version:,perf:, +OPTS=$(getopt -a --options $SHORT --longoptions $LONG -- "$@") +eval set -- "$OPTS" +while : +do + case "$1" in + (-r | --run-mode ) + RUN_MODE="$2" + shift 2;; + (-v | --soc-version ) + SOC_VERSION="$2" + shift 2;; + (-p | --perf ) + IS_PERF="$2" + shift 2;; + (--) + shift; + break;; + (*) + echo "[ERROR] Unexpected option: $1"; + break;; + esac +done + +# Check invalid input +if [[ ! "${SOC_VERSION}" =~ ^Ascend910 ]]; then + echo "[ERROR] Unsupported SocVersion: ${SOC_VERSION}" + exit 1 +fi + +if [ "${RUN_MODE}" != "npu" ] && [ "${RUN_MODE}" != "sim" ] && [ "${RUN_MODE}" != "cpu" ]; then + echo "[ERROR] Unsupported RUN_MODE: ${RUN_MODE}, which can only be cpu/sim/npu." + exit 1 +fi + +if [ "${IS_PERF}" != "0" ] && [ "${IS_PERF}" != "1" ]; then + echo "[ERROR] Unsupported IS_PERF: ${IS_PERF}, which can only be 0 or 1." + exit 1 +fi + +# only npu mode support is_perf = 1 +if [ "${IS_PERF}" == "1" ] && [ "${RUN_MODE}" != "npu" ]; then + echo "[ERROR] Unsupported IS_PERF: ${IS_PERF} while RUN_MODE is ${RUN_MODE}." + exit 1 +fi + +rm -rf build +mkdir build +cd build + +source $ASCEND_HOME_DIR/bin/setenv.bash +export LD_LIBRARY_PATH=${ASCEND_HOME_DIR}/tools/simulator/${SOC_VERSION}/lib:$LD_LIBRARY_PATH +set -euo pipefail + +cmake -DRUN_MODE=${RUN_MODE} -DSOC_VERSION=${SOC_VERSION} -DASCEND_CANN_PACKAGE_PATH=${ASCEND_HOME_DIR} .. 
+make -j16 + +cd ../ + +rm -rf input +mkdir input +rm -rf output +mkdir output + +rm -rf bin/ +mkdir -p bin +cd bin +cp ../build/ascendc_batch_matmul_bias_no_batch_bbit ./ + +export TF_CPP_MIN_LOG_LEVEL=3 + +if [ "${RUN_MODE}" = "npu" ]; then + if [ "${IS_PERF}" = "1" ]; then + export ASCEND_TOOLKIT_HOME=${ASCEND_HOME_DIR} + python3 -u ../scripts/exec_test.py npu "perf" + else + python3 -u ../scripts/exec_test.py npu "normal" + fi +elif [ "${RUN_MODE}" = "sim" ]; then + python3 -u ../scripts/exec_test.py sim "normal" +elif [ "${RUN_MODE}" = "cpu" ]; then + python3 -u ../scripts/exec_test.py cpu "normal" +fi diff --git a/examples/matrix/batch_matmul_bias_no_batch/scripts/exec_test.py b/examples/matrix/batch_matmul_bias_no_batch/scripts/exec_test.py new file mode 100644 index 00000000..e5d99dd3 --- /dev/null +++ b/examples/matrix/batch_matmul_bias_no_batch/scripts/exec_test.py @@ -0,0 +1,118 @@ +#!/usr/bin/python3 +# coding=utf-8 + +# Copyright (c) 2025 Huawei Technologies Co., Ltd. +# This file is a part of the CANN Open Software. +# Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. 
+# ====================================================================================================================== +import os +import sys +import csv +import time +import logging +import subprocess + +import numpy as np + +sys.path.append("../..") +from gen_batch_data import MatmulGenData +from common_scripts.compare_data import compare_data +from common_scripts.exec_utils import get_file_work_dir, get_case_list, get_perf_task_duration, get_process_case_cmd + +IS_BIAS = True +IS_TRANS_A = False +IS_TRANS_B = True +DATA_TYPE_STR = "float16_float32" + +logging.basicConfig(level=logging.INFO) + + +class ProcessParams: + def __init__(self, case_name, m, n, k, b, is_perf, run_mode): + self.case_name = case_name + self.m = m + self.n = n + self.k = k + self.b = b + self.is_perf = is_perf + self.run_mode = run_mode + + +def process_case(file_work_dir, process_params): + case_name, m, n, k, b, is_perf, run_mode = process_params.case_name, process_params.m, process_params.n,\ + process_params.k, process_params.b, process_params.is_perf, process_params.run_mode + logging.info("[INFO] start process case [%s]" % (case_name)) + logging.info("[INFO] IS_PERF is set [%d]" % (is_perf)) + + matmul_gen_data = MatmulGenData(m, n, k, b, IS_TRANS_A, IS_TRANS_B, IS_BIAS, DATA_TYPE_STR) + if is_perf: + matmul_gen_data.gen_fake_golden_data(file_work_dir) + else: + matmul_gen_data.gen_golden_data(file_work_dir) + params_str = f"{m} {n} {k} {b}" + cmd = get_process_case_cmd(kernel_name="ascendc_batch_matmul_bias_no_batch_bbit", params_str=params_str,\ + is_perf=is_perf, run_mode=run_mode) + subprocess.run(cmd) + if is_perf: + wrong_num = -1 + else: + logging.info("[INFO] compare data case[%s]" % (case_name)) + wrong_num = compare_data(file_work_dir, DATA_TYPE_STR) + res_data = [] + res_data.append(case_name) + res_data.append(wrong_num) + res_data.append(m * n * b) + if wrong_num == -1: + res_data.append("None") + elif wrong_num / (m * n * b) > 0.001: + res_data.append("Fail") + else: 
+ res_data.append("Success") + if is_perf: + task_duration = get_perf_task_duration("./prof_out") + res_data.append(task_duration) + return res_data + + +def main(): + args_len = len(sys.argv) - 1 + if args_len != 2: + logging.info("[ERROR] exec_test input params error!") + return -1 + + file_work_dir = get_file_work_dir() + if not os.path.exists(file_work_dir): + logging.info("[ERROR] file path %s not exist!" % (file_work_dir)) + return -1 + + is_perf = False + if sys.argv[2] == "perf": + is_perf = True + + case_list = get_case_list() + res_list = [["case_name", "wrong_num", "total_num", "result", "task_duration"]] + run_mode = sys.argv[1] + for is_process, case_name, m, n, k, b in case_list: + if is_process == 1: + process_params = ProcessParams(case_name, m, n, k, b, is_perf, run_mode) + res_data = process_case(file_work_dir, process_params) + res_list.append(res_data) + + timestamp = time.time() + result_file_name = "result_" + str(timestamp) + ".csv" + with open(os.path.join(file_work_dir, "output", result_file_name), 'w', newline='', encoding='utf-8') as csvfile: + writer = csv.writer(csvfile) + writer.writerows(res_list) + + logging.info("---------------RESULT---------------") + for res in res_list: + logging.info(res) + return 0 + + +if __name__ == "__main__": + main() diff --git a/examples/matrix/batch_matmul_bias_no_batch/scripts/gen_batch_data.py b/examples/matrix/batch_matmul_bias_no_batch/scripts/gen_batch_data.py new file mode 100644 index 00000000..e4f25130 --- /dev/null +++ b/examples/matrix/batch_matmul_bias_no_batch/scripts/gen_batch_data.py @@ -0,0 +1,93 @@ +#!/usr/bin/python3 +# coding=utf-8 + +# Copyright (c) 2025 Huawei Technologies Co., Ltd. +# This file is a part of the CANN Open Software. +# Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. 
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. +# ====================================================================================================================== +import os +import sys +import logging + +import numpy as np +import tensorflow as tf + +IS_OUTPUT_TXT = False + + +class MatmulGenData: + def __init__(self, m, n, k, b, is_trans_a, is_trans_b, is_bias,\ + data_type_str): + self.m = m + self.n = n + self.k = k + self.b = b + self.is_trans_a = is_trans_a + self.is_trans_b = is_trans_b + self.is_bias = is_bias + self.data_type_str = data_type_str + + def gen_golden_data_fp16_fp32(self, work_dir): + src_type = np.float16 + dst_type = np.float32 + bias_gm = np.random.uniform(-1, 1, [1, self.n]).astype(dst_type) + + for idx in range(self.b): + x1_gm_left = np.random.uniform(-1, 1, [self.m, self.k]).astype(src_type) + x2_gm_right = np.random.uniform(-1, 1, [self.k, self.n]).astype(src_type) + golden_one = np.matmul(x1_gm_left.astype(dst_type), x2_gm_right.astype(dst_type)).astype(dst_type) + if self.is_bias: + golden_one = golden_one + bias_gm.astype(dst_type) + if self.is_trans_a: + x1_tmp = x1_gm_left.transpose() + else: + x1_tmp = x1_gm_left + if self.is_trans_b: + x2_tmp = x2_gm_right.transpose() + else: + x2_tmp = x2_gm_right + if idx == 0: + x1_gm = x1_tmp + x2_gm = x2_tmp + golden = golden_one + else: + x1_gm = np.vstack((x1_gm, x1_tmp)) + x2_gm = np.vstack((x2_gm, x2_tmp)) + golden = np.vstack((golden, golden_one)) + + x1_gm.tofile(work_dir + "/input/x1_gm.bin") + x2_gm.tofile(work_dir + "/input/x2_gm.bin") + if self.is_bias: + bias_gm.tofile(work_dir + "/input/bias_gm.bin") + golden.tofile(work_dir + "/output/golden.bin") + + return 0 + + def gen_golden_data(self, work_dir): + if 
self.data_type_str == "float16_float32": + self.gen_golden_data_fp16_fp32(work_dir) + else: + logging.info("[ERROR] can't support data type %s" % (self.data_type_str)) + return -1 + return 0 + + def gen_fake_golden_data(self, work_dir): + data_type_bytes_ab = 2 # float16 + data_type_bytes_c = 4 # float32 + + file_byte = self.b * self.m * self.k * data_type_bytes_ab + with open(work_dir + "/input/x1_gm.bin", 'wb') as file: + file.truncate(file_byte) + + file_byte = self.b * self.k * self.n * data_type_bytes_ab + with open(work_dir + "/input/x2_gm.bin", 'wb') as file: + file.truncate(file_byte) + + if self.is_bias: + file_byte = 1 * self.n * data_type_bytes_c + with open(work_dir + "/input/bias_gm.bin", 'wb') as file: + file.truncate(file_byte) \ No newline at end of file diff --git a/examples/matrix/batch_matmul_bias_no_batch/testcase/case.csv b/examples/matrix/batch_matmul_bias_no_batch/testcase/case.csv new file mode 100644 index 00000000..4f0f0196 --- /dev/null +++ b/examples/matrix/batch_matmul_bias_no_batch/testcase/case.csv @@ -0,0 +1 @@ +1, case001, 32, 256, 64, 3 \ No newline at end of file diff --git a/examples/matrix/batch_matmul_tscm/CMakeLists.txt b/examples/matrix/batch_matmul_tscm/CMakeLists.txt new file mode 100644 index 00000000..4efde358 --- /dev/null +++ b/examples/matrix/batch_matmul_tscm/CMakeLists.txt @@ -0,0 +1,73 @@ +# Copyright (c) 2025 Huawei Technologies Co., Ltd. +# This file is a part of the CANN Open Software. +# Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. 
+# ====================================================================================================================== + +cmake_minimum_required(VERSION 3.16) +project(Ascend_c) +if(${RUN_MODE}) + set(RUN_MODE "npu" CACHE STRING "cpu/sim/npu") +endif() +if (${SOC_VERSION}) + set(SOC_VERSION "Ascend910" CACHE STRING "system on chip type") +endif() + +set(ASCEND_CANN_PACKAGE_PATH "~/Ascend/ascend-toolkit/latest" CACHE STRING "ASCEND CANN package installation directory") +if(NOT CMAKE_BUILD_TYPE) + set(CMAKE_BUILD_TYPE "Debug" CACHE STRING "Build type Release/Debug (default Debug)" FORCE) +endif() + +if(CMAKE_INSTALL_PREFIX STREQUAL /usr/local) + set(CMAKE_INSTALL_PREFIX "${CMAKE_CURRENT_LIST_DIR}/out" CACHE STRING "path for install()" FORCE) +endif() + +file(GLOB KERNEL_FILES + ${CMAKE_CURRENT_SOURCE_DIR}/op_kernel/batch_matmul_tscm_custom.cpp +) + +if("${RUN_MODE}" STREQUAL "cpu") + include(cmake/cpu_lib.cmake) +elseif("${RUN_MODE}" STREQUAL "sim" OR "${RUN_MODE}" STREQUAL "npu") + include(cmake/npu_lib.cmake) +else() + message("invalid RUN_MODE: ${RUN_MODE}") +endif() + +add_executable(ascendc_batch_matmul_tscm_bbit + ${CMAKE_CURRENT_SOURCE_DIR}/main.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/op_host/batch_matmul_tscm_custom_tiling.cpp +) + +target_compile_options(ascendc_batch_matmul_tscm_bbit PRIVATE + $:-g>> + -O2 + -std=c++17 + -D_GLIBCXX_USE_CXX11_ABI=0 +) + +target_compile_definitions(ascendc_batch_matmul_tscm_bbit PRIVATE + SOC_VERSION="${SOC_VERSION}" +) + +target_include_directories(ascendc_batch_matmul_tscm_bbit PRIVATE + ${CMAKE_CURRENT_SOURCE_DIR} + $:${ASCEND_CANN_PACKAGE_PATH}/include>> + $:${ASCEND_CANN_PACKAGE_PATH}/runtime/include>> +) + +target_link_libraries(ascendc_batch_matmul_tscm_bbit PRIVATE + $,$>:host_intf_pub>> + $:tikicpulib::${SOC_VERSION}>> + $:ascendcl>> + $:c_sec>> + ascendc_kernels_${RUN_MODE} + tiling_api + register + platform + ascendalog + dl +) diff --git a/examples/matrix/batch_matmul_tscm/README.md 
b/examples/matrix/batch_matmul_tscm/README.md new file mode 100644 index 00000000..1cb6f68d --- /dev/null +++ b/examples/matrix/batch_matmul_tscm/README.md @@ -0,0 +1,127 @@ + +## 概述 + +本样例介绍了调用Matmul高阶API实现左矩阵为L1输入的BatchMatmul单算子,即算子中自定义TSCM输入。关于TSCM输入的具体内容请参考《[Ascend C算子开发](https://www.hiascend.com/document/redirect/CannCommunityOpdevAscendC)》中的“算子实现 > 矩阵编程(高阶API) > 特性场景 > TSCM输入场景”章节。 + +本样例以直调的方式调用算子核函数。 + +直调:核函数的基础调用方式,开发者完成算子核函数的开发和Tiling实现后,即可通过AscendCL运行时接口,完成算子的调用。包含CPU侧、NPU侧和仿真三种运行验证方法。 + +## 样例支持的产品型号为 +- Atlas A2训练系列产品/Atlas 800I A2推理产品 + +## 目录结构 +| 目录及文件 | 描述 | +|----------------------------------|----------------------| +| [cmake](cmake) | 编译工程文件 | +| [op_host](op_host) | 本样例的tiling代码实现 | +| [op_kernel](op_kernel) | 本样例的kernel代码实现 | +| [scripts](scripts) | 包含输入输出真值数据生成脚本文件和执行文件 | +| [testcase](testcase) | 用例文件,配置用例的计算shape信息 | +| CMakeLists.txt | 编译工程文件 | +| main.cpp | 主函数,调用算子的应用程序,含CPU域及NPU域调用 | +| run.sh | 编译执行脚本 | + +## 算子描述 +- 算子功能 + BatchMatmul单算子,批量处理Matmul计算,每次Matmul计算对输入的A、B矩阵做矩阵乘和加bias偏置。该算子中A矩阵使用TSCM输入。 + +- 算子规格 + + + + + + + + + + + + +
算子类型(OpType)BatchMatmulCustom
算子输入nameshapedata typeformatisTranslayout
a-float16NZtrueNORMAL
b-float16NDfalseNORMAL
bias-floatND--
算子输出c-floatND-NORMAL
核函数名batch_matmul_tscm_custom
+ +## 算子实现介绍 +- 约束条件 + - 在BatchMatmul的输入矩阵位置为L1时,输入输出的Layout只支持NORMAL。 + - TSCM输入的矩阵必须能在L1 Buffer上全载,且L1 Buffer上的数据应当为NZ格式。 + +- 算子Kernel实现 + - 一次完成BatchNum个Matmul矩阵乘法的运算。单次MatMul的计算公式为:C = A * B + Bias。 + - A、B为源操作数,A为左矩阵,形状为[M, K];B为右矩阵,形状为[K, N]。 + - C为目的操作数,存放矩阵乘结果的矩阵,形状为[M, N]。 + - Bias为矩阵乘偏置,形状为[1, N]。对A*B结果矩阵的每一行都采用该bias进行偏置。 + - 具体步骤: + - 创建Matmul对象。 + 创建Matmul对象时,将A矩阵MatmulType的Position设为TSCM,Format设为NZ。 + ``` + using A_TYPE = AscendC::MatmulType; + using B_TYPE = AscendC::MatmulType; + using C_TYPE = AscendC::MatmulType; + using BIAS_TYPE = AscendC::MatmulType; + ``` + - 初始化操作。 + - 自定义左矩阵A从GM到L1的搬运,设置左矩阵A、右矩阵B、Bias,其中左矩阵A为TSCM输入。 + ``` + AscendC::TSCM scm; + pipe->InitBuffer(scm, 1, tiling.M * tiling.Ka * sizeof(AType)); + auto scmTensor = scm.AllocTensor(); + DataCopy(scmTensor, aGlobal, tiling.M * tiling.Ka); + scm.EnQue(scmTensor); + AscendC::LocalTensor scmLocal = scm.DeQue(); + + matmulObj.SetTensorA(scmLocal); + matmulObj.SetTensorB(bGlobal); + if (tiling.isBias) { + matmulObj.SetBias(biasGlobal); + } + ``` + - 完成多batch矩阵乘操作。 + ``` + matmulObj.IterateBatch(cGlobal, batchA, batchB, false); + ``` + - 结束矩阵乘操作。 + +- 算子Tiling实现 + - Ascend C提供一组Matmul Tiling API,方便用户获取Matmul Kernel计算时所需的 Tiling参数。只需要传入A/B/C矩阵等信息,调用API接口,即可获取到TCubeTiling结构体中的相关参数。 + - 获取Tiling参数的流程如下: + - 创建一个Tiling对象。 + - 根据输入输出Layout设置单核计算的A、B、C、Bias的参数类型信息,将A矩阵MatmulType的Position设为TSCM,Format设为NZ;M、N、Ka、Kb形状信息等。 + ``` + cubeTiling.SetAType(matmul_tiling::TPosition::TSCM, matmul_tiling::CubeFormat::NZ, + matmul_tiling::DataType::DT_FLOAT16, isAtrans); + cubeTiling.SetBType(matmul_tiling::TPosition::GM, matmul_tiling::CubeFormat::ND, + matmul_tiling::DataType::DT_FLOAT16, isBtrans); + cubeTiling.SetCType(matmul_tiling::TPosition::GM, matmul_tiling::CubeFormat::ND, + matmul_tiling::DataType::DT_FLOAT); + cubeTiling.SetBiasType(matmul_tiling::TPosition::GM, matmul_tiling::CubeFormat::ND, + matmul_tiling::DataType::DT_FLOAT); + ``` + - 
调用SetALayout、SetBLayout、SetCLayout、SetBatchNum设置A/B/C的Layout轴信息和最大BatchNum数。 + - 调用GetTiling接口,获取Tiling信息。 + +## 编译运行样例 + + - 配置环境变量 + + 这里的\$ASCEND_CANN_PACKAGE_PATH需要替换为CANN开发套件包安装后文件存储路径。例如:/usr/local/Ascend/ascend-toolkit/latest + ``` + export ASCEND_HOME_DIR=$ASCEND_CANN_PACKAGE_PATH + source $ASCEND_HOME_DIR/../set_env.sh + ``` + + - 编译执行 + + ``` + bash run.sh -r [RUN_MODE] -v [SOC_VERSION] -p [IS_PERF] + ``` + 其中脚本参数说明如下: + - RUN_MODE :编译执行方式,可选择CPU调试,NPU仿真,NPU上板,对应参数分别为[cpu / sim / npu]。若需要详细了解NPU仿真相关内容,请参考[《算子开发工具msProf》](https://hiascend.com/document/redirect/CannCommunityToolMsProf)中的“工具使用”章节。 + - SOC_VERSION :昇腾AI处理器型号,如果无法确定具体的[SOC_VERSION],则在安装昇腾AI处理器的服务器执行npu-smi info命令进行查询,在查询到的“Name”前增加Ascend信息,例如“Name”对应取值为xxxyy,实际配置的[SOC_VERSION]值为Ascendxxxyy。支持以下产品型号: + - Atlas A2训练系列产品/Atlas 800I A2推理产品 + - IS_PERF : 是否获取执行性能数据,当前只在NPU执行时生效,打开后性能数据显示在执行结果中,可选择关闭和开启该功能,对应参数分别为[0 / 1]。 + + 示例如下,Ascendxxxyy请替换为实际的AI处理器型号。 + ``` + bash run.sh -r cpu -v Ascendxxxyy -p 0 + ``` \ No newline at end of file diff --git a/examples/matrix/batch_matmul_tscm/cmake/cpu_lib.cmake b/examples/matrix/batch_matmul_tscm/cmake/cpu_lib.cmake new file mode 100644 index 00000000..244469fb --- /dev/null +++ b/examples/matrix/batch_matmul_tscm/cmake/cpu_lib.cmake @@ -0,0 +1,31 @@ +# Copyright (c) 2025 Huawei Technologies Co., Ltd. +# This file is a part of the CANN Open Software. +# Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. 
+# ====================================================================================================================== + +if(NOT DEFINED ENV{CMAKE_PREFIX_PATH}) + set(CMAKE_PREFIX_PATH ${ASCEND_CANN_PACKAGE_PATH}/tools/tikicpulib/lib/cmake) +endif() +find_package(tikicpulib REQUIRED) + +add_library(ascendc_kernels_${RUN_MODE} SHARED + ${KERNEL_FILES} +) + +target_link_libraries(ascendc_kernels_${RUN_MODE} PRIVATE + tikicpulib::${SOC_VERSION} +) + +target_compile_options(ascendc_kernels_${RUN_MODE} PRIVATE + -g + -O0 + -std=c++17 +) + +install(TARGETS ascendc_kernels_${RUN_MODE} +DESTINATION ${CMAKE_INSTALL_LIBDIR} +) \ No newline at end of file diff --git a/examples/matrix/batch_matmul_tscm/cmake/npu_lib.cmake b/examples/matrix/batch_matmul_tscm/cmake/npu_lib.cmake new file mode 100644 index 00000000..7e1c58d6 --- /dev/null +++ b/examples/matrix/batch_matmul_tscm/cmake/npu_lib.cmake @@ -0,0 +1,27 @@ +# Copyright (c) 2025 Huawei Technologies Co., Ltd. +# This file is a part of the CANN Open Software. +# Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. 
+# ====================================================================================================================== + +if(EXISTS ${ASCEND_CANN_PACKAGE_PATH}/compiler/tikcpp/ascendc_kernel_cmake) + set(ASCENDC_CMAKE_DIR ${ASCEND_CANN_PACKAGE_PATH}/compiler/tikcpp/ascendc_kernel_cmake) +elseif(EXISTS ${ASCEND_CANN_PACKAGE_PATH}/tools/tikcpp/ascendc_kernel_cmake) + set(ASCENDC_CMAKE_DIR ${ASCEND_CANN_PACKAGE_PATH}/tools/tikcpp/ascendc_kernel_cmake) +else() + message(FATAL_ERROR "ascendc_kernel_cmake does not exist ,please check whether the cann package is installed") +endif() +include(${ASCENDC_CMAKE_DIR}/ascendc.cmake) + +ascendc_library(ascendc_kernels_${RUN_MODE} STATIC + ${KERNEL_FILES} +) + +ascendc_compile_definitions(ascendc_kernels_${RUN_MODE} PRIVATE + -DASCENDC_DUMP + -DHAVE_WORKSPACE + -DHAVE_TILING + ) diff --git a/examples/matrix/batch_matmul_tscm/main.cpp b/examples/matrix/batch_matmul_tscm/main.cpp new file mode 100644 index 00000000..1233f1d4 --- /dev/null +++ b/examples/matrix/batch_matmul_tscm/main.cpp @@ -0,0 +1,222 @@ +/** + * Copyright (c) 2025 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. 
+ */ + +#include +#include "../../common/data_utils.h" +#include "kernel_tiling/kernel_tiling.h" +#include "tiling/platform/platform_ascendc.h" +#include "op_host/batch_matmul_tscm_custom_tiling.h" +#ifndef ASCENDC_CPU_DEBUG +#include "acl/acl.h" +extern void batch_matmul_tscm_custom_do(uint32_t coreDim, void* stream, + uint8_t *a, uint8_t *b, uint8_t *bias, uint8_t *c, + uint8_t *workspace, uint8_t *tiling); +#else +#include "tikicpulib.h" +extern "C" void batch_matmul_tscm_custom(uint8_t *a, uint8_t *b, uint8_t *bias, uint8_t *c, + uint8_t *workspace, uint8_t *tiling); +#endif + +namespace { +constexpr bool IS_BIAS = false; +constexpr bool IS_A_TRANS = true; +constexpr bool IS_B_TRANS = false; + +struct MatrixFileSize +{ + size_t x1FileSize; + size_t x2FileSize; + size_t yFileSize; + size_t biasFileSize; +}; +} + +namespace MatmulHost { +static size_t GetSysWorkSpaceSize() +{ + auto ascendcPlatform = platform_ascendc::PlatformAscendCManager::GetInstance(); + if (ascendcPlatform == nullptr) { + return 0; + } + return static_cast(ascendcPlatform->GetLibApiWorkSpaceSize()); +} + +// CPU debug mode +#ifdef ASCENDC_CPU_DEBUG +void TestMatmulCpu(const MatmulHost::MatmulCaseParams &testCaseParams, const MatrixFileSize& matrixFileSize) +{ + size_t x1FileSize = matrixFileSize.x1FileSize; + size_t x2FileSize = matrixFileSize.x2FileSize; + size_t yFileSize = matrixFileSize.yFileSize; + size_t biasFileSize = matrixFileSize.biasFileSize; + // Query workspace size + size_t workspaceSize = GetSysWorkSpaceSize(); + uint8_t* x1 = (uint8_t*)AscendC::GmAlloc(x1FileSize); + uint8_t* x2 = (uint8_t*)AscendC::GmAlloc(x2FileSize); + uint8_t* bias; + uint8_t* y = (uint8_t*)AscendC::GmAlloc(yFileSize); + uint8_t* workspace = (uint8_t*)AscendC::GmAlloc(workspaceSize); + ReadFile("../input/x1_gm.bin", x1FileSize, x1, x1FileSize); + ReadFile("../input/x2_gm.bin", x2FileSize, x2, x2FileSize); + if (IS_BIAS) { + bias = (uint8_t *)AscendC::GmAlloc(biasFileSize); + 
ReadFile("../input/bias_gm.bin", biasFileSize, bias, biasFileSize); + } + size_t tilingFileSize = sizeof(TCubeTiling); + uint8_t* tiling = (uint8_t*)AscendC::GmAlloc(tilingFileSize); + // Calculate Tiling + const auto& tilingData = MatmulHost::GenerateTiling(testCaseParams); + memcpy_s(tiling, tilingFileSize, &tilingData, tilingFileSize); + ICPU_RUN_KF(batch_matmul_tscm_custom, tilingData.usedCoreNum, x1, x2,bias, y, workspace, tiling); + WriteFile("../output/output.bin", y, yFileSize); + AscendC::GmFree((void*)x1); + AscendC::GmFree((void*)x2); + if (IS_BIAS) { + AscendC::GmFree((void*)bias); + } + AscendC::GmFree((void*)y); + AscendC::GmFree((void*)workspace); + AscendC::GmFree((void*)tiling); +} +// NPU +#else +void MatmulOp(uint8_t* x1, uint8_t* x2, uint8_t* y, uint8_t* bias, MatmulHost::MatmulCaseParams testCaseParams, + void* stream = nullptr) +{ + // Init args + uint8_t *workspaceDevice; + + // Query workspace size + size_t workspaceSize = GetSysWorkSpaceSize(); + + // Allocate workspace on device + CHECK_ACL(aclrtMalloc((void **)&workspaceDevice, workspaceSize, ACL_MEM_MALLOC_HUGE_FIRST)); + + // Initialize kernel with arguments and workspace pointer + uint8_t* tilingHost; + uint8_t* tilingDevice; + size_t tilingFileSize = sizeof(TCubeTiling); + // Calculate Tiling + const auto tilingData = MatmulHost::GenerateTiling(testCaseParams); + CHECK_ACL(aclrtMallocHost((void **)(&tilingHost), tilingFileSize)); + CHECK_ACL(aclrtMalloc((void **)&tilingDevice, tilingFileSize, + ACL_MEM_MALLOC_HUGE_FIRST)); + CHECK_ACL(aclrtMemcpy(tilingHost, tilingFileSize, &tilingData, + tilingFileSize, ACL_MEMCPY_HOST_TO_HOST)); + CHECK_ACL(aclrtMemcpy(tilingDevice, tilingFileSize, tilingHost, + tilingFileSize, ACL_MEMCPY_HOST_TO_DEVICE)); + // Launch kernel + batch_matmul_tscm_custom_do(tilingData.usedCoreNum, stream, x1, x2, bias, y, workspaceDevice, tilingDevice); + CHECK_ACL(aclrtFreeHost(tilingHost)); + CHECK_ACL(aclrtFree(workspaceDevice)); + 
CHECK_ACL(aclrtFree(tilingDevice)); +} + +void TestAclInit(aclrtContext &context, aclrtStream &stream, int64_t &deviceId) +{ + CHECK_ACL(aclInit(nullptr)); + CHECK_ACL(aclrtSetDevice(deviceId)); + CHECK_ACL(aclrtCreateContext(&context, deviceId)); + CHECK_ACL(aclrtCreateStream(&stream)); +} + +void TestAclDeInit(aclrtContext &context, aclrtStream &stream, int64_t &deviceId) +{ + CHECK_ACL(aclrtDestroyStream(stream)); + CHECK_ACL(aclrtDestroyContext(context)); + CHECK_ACL(aclrtResetDevice(deviceId)); + CHECK_ACL(aclFinalize()); +} + +void TestMatmul(const MatmulHost::MatmulCaseParams &testCaseParams, const MatrixFileSize& matrixFileSize) +{ + size_t x1FileSize = matrixFileSize.x1FileSize; + size_t x2FileSize = matrixFileSize.x2FileSize; + size_t yFileSize = matrixFileSize.yFileSize; + size_t biasFileSize = matrixFileSize.biasFileSize; + + aclrtContext context; + aclrtStream stream = nullptr; + int64_t deviceId = 0; + TestAclInit(context, stream, deviceId); + + uint8_t *x1Host; + uint8_t *x1Device; + CHECK_ACL(aclrtMallocHost((void **)(&x1Host), x1FileSize)); + CHECK_ACL(aclrtMalloc((void **)&x1Device, x1FileSize, ACL_MEM_MALLOC_HUGE_FIRST)); + ReadFile("../input/x1_gm.bin", x1FileSize, x1Host, x1FileSize); + CHECK_ACL(aclrtMemcpy(x1Device, x1FileSize, x1Host, x1FileSize, ACL_MEMCPY_HOST_TO_DEVICE)); + + uint8_t *x2Host; + uint8_t *x2Device; + CHECK_ACL(aclrtMallocHost((void **)(&x2Host), x2FileSize)); + CHECK_ACL(aclrtMalloc((void **)&x2Device, x2FileSize, ACL_MEM_MALLOC_HUGE_FIRST)); + ReadFile("../input/x2_gm.bin", x2FileSize, x2Host, x2FileSize); + CHECK_ACL(aclrtMemcpy(x2Device, x2FileSize, x2Host, x2FileSize, ACL_MEMCPY_HOST_TO_DEVICE)); + + uint8_t *biasHost = nullptr; + uint8_t *biasDevice = nullptr; + if (IS_BIAS) { + CHECK_ACL(aclrtMallocHost((void **)(&biasHost), biasFileSize)); + CHECK_ACL(aclrtMalloc((void **)&biasDevice, biasFileSize, ACL_MEM_MALLOC_HUGE_FIRST)); + ReadFile("../input/bias_gm.bin", biasFileSize, biasHost, biasFileSize); + 
CHECK_ACL(aclrtMemcpy(biasDevice, biasFileSize, biasHost, biasFileSize, ACL_MEMCPY_HOST_TO_DEVICE)); + } + uint8_t *yHost = nullptr; + uint8_t *yDevice = nullptr; + CHECK_ACL(aclrtMallocHost((void **)(&yHost), yFileSize)); + CHECK_ACL(aclrtMalloc((void **)&yDevice, yFileSize, ACL_MEM_MALLOC_HUGE_FIRST)); + + MatmulOp(x1Device, x2Device, yDevice, biasDevice, testCaseParams, stream); + CHECK_ACL(aclrtSynchronizeStream(stream)); + + CHECK_ACL(aclrtMemcpy(yHost, yFileSize, yDevice, yFileSize, ACL_MEMCPY_DEVICE_TO_HOST)); + WriteFile("../output/output.bin", yHost, yFileSize); + + if (IS_BIAS) { + CHECK_ACL(aclrtFree(biasDevice)); + CHECK_ACL(aclrtFreeHost(biasHost)); + } + CHECK_ACL(aclrtFree(x1Device)); + CHECK_ACL(aclrtFreeHost(x1Host)); + CHECK_ACL(aclrtFree(x2Device)); + CHECK_ACL(aclrtFreeHost(x2Host)); + CHECK_ACL(aclrtFree(yDevice)); + CHECK_ACL(aclrtFreeHost(yHost)); + TestAclDeInit(context, stream, deviceId); +} +#endif +} // namespace MatmulHost + +int32_t main(int32_t argc, const char *args[]) +{ + int32_t size = 4; + int64_t problem[size] = {1, 1, 1, 1}; + for (int32_t i = 1; i < argc && i < size + 1; ++i) { + std::stringstream ss(args[i]); + ss >> problem[i - 1]; + } + auto ascendcPlatform = platform_ascendc::PlatformAscendCManager::GetInstance(SOC_VERSION); + MatmulHost::MatmulCaseParams testCaseParams{static_cast(ascendcPlatform->GetCoreNumAic()), + static_cast(problem[0]), static_cast(problem[1]), static_cast(problem[2]), // M, N, K + IS_BIAS, IS_A_TRANS, IS_B_TRANS, + static_cast(problem[3])}; // BatchNum + MatrixFileSize matrixFileSize; + matrixFileSize.x1FileSize = static_cast(testCaseParams.m * testCaseParams.k * testCaseParams.batchNum) * sizeof(uint16_t); + matrixFileSize.x2FileSize = static_cast(testCaseParams.k * testCaseParams.n * testCaseParams.batchNum) * sizeof(uint16_t); + matrixFileSize.yFileSize = static_cast(testCaseParams.m * testCaseParams.n * testCaseParams.batchNum) * sizeof(float); + matrixFileSize.biasFileSize = static_cast(1 * 
testCaseParams.n * testCaseParams.batchNum) * sizeof(float); +#ifdef ASCENDC_CPU_DEBUG + MatmulHost::TestMatmulCpu(testCaseParams, matrixFileSize); +#else + MatmulHost::TestMatmul(testCaseParams, matrixFileSize); +#endif + return 0; +} diff --git a/examples/matrix/batch_matmul_tscm/op_host/batch_matmul_tscm_custom_tiling.cpp b/examples/matrix/batch_matmul_tscm/op_host/batch_matmul_tscm_custom_tiling.cpp new file mode 100644 index 00000000..3b167ecd --- /dev/null +++ b/examples/matrix/batch_matmul_tscm/op_host/batch_matmul_tscm_custom_tiling.cpp @@ -0,0 +1,54 @@ +/* + * Copyright (c) 2025 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. 
+ */ + +#include "batch_matmul_tscm_custom_tiling.h" +#include <iostream> + +namespace MatmulHost { + +TCubeTiling GenerateTiling(const MatmulCaseParams& testCaseParams) +{ + TCubeTiling tilingData; + auto ascendcPlatform = platform_ascendc::PlatformAscendCManager::GetInstance(); + matmul_tiling::MultiCoreMatmulTiling cubeTiling(*ascendcPlatform); + int32_t M = testCaseParams.m; + int32_t N = testCaseParams.n; + int32_t K = testCaseParams.k; + int32_t blockDim = 1; + bool isBias = testCaseParams.isBias; + bool isAtrans = testCaseParams.isATrans; + bool isBtrans = testCaseParams.isBTrans; + int32_t batchNum = testCaseParams.batchNum; + + cubeTiling.SetDim(blockDim); + cubeTiling.SetAType(matmul_tiling::TPosition::TSCM, matmul_tiling::CubeFormat::NZ, + matmul_tiling::DataType::DT_FLOAT16, isAtrans); + cubeTiling.SetBType(matmul_tiling::TPosition::GM, matmul_tiling::CubeFormat::ND, + matmul_tiling::DataType::DT_FLOAT16, isBtrans); + cubeTiling.SetCType(matmul_tiling::TPosition::GM, matmul_tiling::CubeFormat::ND, + matmul_tiling::DataType::DT_FLOAT); + cubeTiling.SetBiasType(matmul_tiling::TPosition::GM, matmul_tiling::CubeFormat::ND, + matmul_tiling::DataType::DT_FLOAT); + cubeTiling.SetOrgShape(M, N, K); + cubeTiling.SetShape(M, N, K); + cubeTiling.EnableBias(isBias); + cubeTiling.SetBufferSpace(-1, -1, -1); + cubeTiling.SetALayout(batchNum, M, 1, 1, K); + cubeTiling.SetBLayout(batchNum, K, 1, 1, N); + cubeTiling.SetCLayout(batchNum, M, 1, 1, N); + cubeTiling.SetBatchNum(batchNum); + if (cubeTiling.GetTiling(tilingData) == -1) { + std::cout << "Generate tiling failed."
<< std::endl; + return {}; + } + return tilingData; +} + +} // namespace MatmulHost diff --git a/examples/matrix/batch_matmul_tscm/op_host/batch_matmul_tscm_custom_tiling.h b/examples/matrix/batch_matmul_tscm/op_host/batch_matmul_tscm_custom_tiling.h new file mode 100644 index 00000000..837eb2ae --- /dev/null +++ b/examples/matrix/batch_matmul_tscm/op_host/batch_matmul_tscm_custom_tiling.h @@ -0,0 +1,38 @@ +/* + * Copyright (c) 2025 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ + +#ifndef EXAMPLES_MATRIX_BATCH_MATMUL_TSCM_OP_HOST_BATCH_MATMUL_TSCM_CUSTOM_TILING_H +#define EXAMPLES_MATRIX_BATCH_MATMUL_TSCM_OP_HOST_BATCH_MATMUL_TSCM_CUSTOM_TILING_H +#include "register/tilingdata_base.h" +#include "tiling/tiling_api.h" + +namespace MatmulHost { + +struct MatmulCaseParams +{ + int32_t usedCoreNum; + int32_t m; + int32_t n; + int32_t k; + bool isBias; + bool isATrans; + bool isBTrans; + int32_t batchNum; +}; + +/** + * @brief Generate matmul tiling. + * @param testCaseParams: Testcase parameters. + * @retval Generated Tiling data. 
+ */ +TCubeTiling GenerateTiling(const MatmulCaseParams& testCaseParams); + +} // namespace MatmulHost +#endif // EXAMPLES_MATRIX_BATCH_MATMUL_TSCM_OP_HOST_BATCH_MATMUL_TSCM_CUSTOM_TILING_H diff --git a/examples/matrix/batch_matmul_tscm/op_kernel/batch_matmul_tscm_custom.cpp b/examples/matrix/batch_matmul_tscm/op_kernel/batch_matmul_tscm_custom.cpp new file mode 100644 index 00000000..da24952b --- /dev/null +++ b/examples/matrix/batch_matmul_tscm/op_kernel/batch_matmul_tscm_custom.cpp @@ -0,0 +1,95 @@ +/* + * Copyright (c) 2025 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. 
+ */ + +#include "kernel_operator.h" +#include "lib/matmul_intf.h" +#include "batch_matmul_tscm_custom_impl.h" + +namespace BatchMatmulCustom { +__aicore__ inline void CopyTiling(TCubeTiling* tiling, GM_ADDR tilingGM) +{ + uint32_t* ptr = reinterpret_cast(tiling); + auto tiling32 = reinterpret_cast<__gm__ uint32_t*>(tilingGM); + + for (int i = 0; i < sizeof(TCubeTiling) / sizeof(uint32_t); i++, ptr++) { + *ptr = *(tiling32 + i); + } + return; +} + +template +__aicore__ inline void BatchMatmulKernel::Init(GM_ADDR a, GM_ADDR b, GM_ADDR bias, + GM_ADDR c, GM_ADDR workspace, const TCubeTiling& tiling) +{ + this->tiling = tiling; + int32_t sizeA = tiling.ALayoutInfoB * tiling.ALayoutInfoS * tiling.ALayoutInfoN * tiling.ALayoutInfoG * tiling.ALayoutInfoD; + int32_t sizeB = tiling.BLayoutInfoB * tiling.BLayoutInfoS * tiling.BLayoutInfoN * tiling.BLayoutInfoG * tiling.BLayoutInfoD; + int32_t sizeC = tiling.CLayoutInfoB * tiling.CLayoutInfoS1 * tiling.CLayoutInfoN * tiling.CLayoutInfoG * tiling.CLayoutInfoS2; + int32_t sizeBias = tiling.CLayoutInfoB * tiling.CLayoutInfoN * tiling.CLayoutInfoG * tiling.CLayoutInfoS2; + + aGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ aType*>(a), sizeA); + bGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ bType*>(b), sizeB); + cGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ cType*>(c), sizeC); + biasGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ biasType*>(bias), sizeBias); + + if (GetSysWorkSpacePtr() == nullptr) { + return; + } +} + +template +__aicore__ inline void BatchMatmulKernel::Process(AscendC::TPipe* pipe, int32_t batchA, int32_t batchB) +{ + // Copy aMatrix from gm to tscm + AscendC::TSCM scm; + pipe->InitBuffer(scm, 1, batchA * tiling.M * tiling.Ka * sizeof(aType)); + auto scmTensor = scm.AllocTensor(); + DataCopy(scmTensor, aGlobal, batchA * tiling.M * tiling.Ka); + scm.EnQue(scmTensor); + AscendC::LocalTensor scmLocal = scm.DeQue(); + + matmulObj.SetTensorA(scmLocal, false); + matmulObj.SetTensorB(bGlobal, true); // B transpose + 
if (tiling.isBias) { + matmulObj.SetBias(biasGlobal); + } + + matmulObj.IterateBatch(cGlobal, batchA, batchB, false); +} +} // namespace BatchMatmulCustom + +extern "C" __global__ __aicore__ void batch_matmul_tscm_custom(GM_ADDR a, GM_ADDR b, GM_ADDR bias, GM_ADDR c, GM_ADDR workspace, + GM_ADDR tilingGm) +{ + // prepare tiling + TCubeTiling tiling; + BatchMatmulCustom::CopyTiling(&tiling, tilingGm); + // define matmul kernel + using A_TYPE = AscendC::MatmulType; + using B_TYPE = AscendC::MatmulType; + using C_TYPE = AscendC::MatmulType; + using BIAS_TYPE = AscendC::MatmulType; + BatchMatmulCustom::BatchMatmulKernel batchMatmulKernel; + AscendC::TPipe pipe; + REGIST_MATMUL_OBJ(&pipe, GetSysWorkSpacePtr(), batchMatmulKernel.matmulObj, &tiling); + // init matmul kernel + batchMatmulKernel.Init(a, b, bias, c, workspace, tiling); + // matmul kernel process + batchMatmulKernel.Process(&pipe, 3, 3); +} + +#ifndef ASCENDC_CPU_DEBUG +void batch_matmul_tscm_custom_do(uint32_t blockDim, void* stream, GM_ADDR a, GM_ADDR b, GM_ADDR bias, + GM_ADDR c, GM_ADDR workspace, GM_ADDR tilingGm) +{ + // invoke the kernel function through the <<<>>> symbol + batch_matmul_tscm_custom<<>>(a, b, bias, c, workspace, tilingGm); +} +#endif \ No newline at end of file diff --git a/examples/matrix/batch_matmul_tscm/op_kernel/batch_matmul_tscm_custom_impl.h b/examples/matrix/batch_matmul_tscm/op_kernel/batch_matmul_tscm_custom_impl.h new file mode 100644 index 00000000..afe5be54 --- /dev/null +++ b/examples/matrix/batch_matmul_tscm/op_kernel/batch_matmul_tscm_custom_impl.h @@ -0,0 +1,38 @@ +/* + * Copyright (c) 2025 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. 
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ + +#ifndef EXAMPLES_MATRIX_BATCH_MATMUL_TSCM_OP_KERNEL_BATCH_MATMUL_TSCM_CUSTOM_IMPL_H +#define EXAMPLES_MATRIX_BATCH_MATMUL_TSCM_OP_KERNEL_BATCH_MATMUL_TSCM_CUSTOM_IMPL_H +#include "kernel_operator.h" +#include "lib/matmul_intf.h" + +namespace BatchMatmulCustom { +template +class BatchMatmulKernel { + public: + __aicore__ inline BatchMatmulKernel(){}; + __aicore__ inline void Init(GM_ADDR a, GM_ADDR b, GM_ADDR bias, GM_ADDR c, GM_ADDR workspace, const TCubeTiling& tiling); + __aicore__ inline void Process(AscendC::TPipe* pipe, int32_t batchA, int32_t batchB); + AscendC::Matmul matmulObj; + private: + __aicore__ inline void CalcOffset(int32_t blockIdx, const TCubeTiling& tiling, int32_t& offsetA, int32_t& offsetB, + int32_t& offsetC, int32_t& offsetBias); + using aType = typename A_TYPE::T; + using bType = typename B_TYPE::T; + using cType = typename C_TYPE::T; + using biasType = typename BIAS_TYPE::T; + AscendC::GlobalTensor aGlobal; + AscendC::GlobalTensor bGlobal; + AscendC::GlobalTensor cGlobal; + AscendC::GlobalTensor biasGlobal; + TCubeTiling tiling; +}; +} // namespace BatchMatmulCustom +#endif // EXAMPLES_MATRIX_BATCH_MATMUL_TSCM_OP_KERNEL_BATCH_MATMUL_TSCM_CUSTOM_IMPL_H \ No newline at end of file diff --git a/examples/matrix/batch_matmul_tscm/run.sh b/examples/matrix/batch_matmul_tscm/run.sh new file mode 100644 index 00000000..21e667d5 --- /dev/null +++ b/examples/matrix/batch_matmul_tscm/run.sh @@ -0,0 +1,96 @@ +#!/bin/bash +# Copyright (c) 2025 Huawei Technologies Co., Ltd. +# This file is a part of the CANN Open Software. +# Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). +# Please refer to the License for details. 
You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. +# ====================================================================================================================== + +export IS_PERF="0" + +SHORT=r:,v:,p:, +LONG=run-mode:,soc-version:,perf:, +OPTS=$(getopt -a --options $SHORT --longoptions $LONG -- "$@") +eval set -- "$OPTS" +while : +do + case "$1" in + (-r | --run-mode ) + RUN_MODE="$2" + shift 2;; + (-v | --soc-version ) + SOC_VERSION="$2" + shift 2;; + (-p | --perf ) + IS_PERF="$2" + shift 2;; + (--) + shift; + break;; + (*) + echo "[ERROR] Unexpected option: $1"; + break;; + esac +done + +# Check invalid input +if [[ ! "${SOC_VERSION}" =~ ^Ascend910 ]]; then + echo "[ERROR] Unsupported SocVersion: ${SOC_VERSION}" + exit 1 +fi + +if [ "${RUN_MODE}" != "npu" ] && [ "${RUN_MODE}" != "sim" ] && [ "${RUN_MODE}" != "cpu" ]; then + echo "[ERROR] Unsupported RUN_MODE: ${RUN_MODE}, which can only be cpu/sim/npu." + exit 1 +fi + +if [ "${IS_PERF}" != "0" ] && [ "${IS_PERF}" != "1" ]; then + echo "[ERROR] Unsupported IS_PERF: ${IS_PERF}, which can only be 0 or 1." + exit 1 +fi + +# only npu mode support is_perf = 1 +if [ "${IS_PERF}" == "1" ] && [ "${RUN_MODE}" != "npu" ]; then + echo "[ERROR] Unsupported IS_PERF: ${IS_PERF} while RUN_MODE is ${RUN_MODE}." + exit 1 +fi + +rm -rf build +mkdir build +cd build + +source $ASCEND_HOME_DIR/bin/setenv.bash +export LD_LIBRARY_PATH=${ASCEND_HOME_DIR}/tools/simulator/${SOC_VERSION}/lib:$LD_LIBRARY_PATH +set -euo pipefail + +cmake -DRUN_MODE=${RUN_MODE} -DSOC_VERSION=${SOC_VERSION} -DASCEND_CANN_PACKAGE_PATH=${ASCEND_HOME_DIR} .. 
+make -j16 + +cd ../ + +rm -rf input +mkdir input +rm -rf output +mkdir output + +rm -rf bin/ +mkdir -p bin +cd bin +cp ../build/ascendc_batch_matmul_tscm_bbit ./ + +export TF_CPP_MIN_LOG_LEVEL=3 + +if [ "${RUN_MODE}" = "npu" ]; then + if [ "${IS_PERF}" = "1" ]; then + export ASCEND_TOOLKIT_HOME=${ASCEND_HOME_DIR} + python3 -u ../scripts/exec_test.py npu "perf" + else + python3 -u ../scripts/exec_test.py npu "normal" + fi +elif [ "${RUN_MODE}" = "sim" ]; then + python3 -u ../scripts/exec_test.py sim "normal" +elif [ "${RUN_MODE}" = "cpu" ]; then + python3 -u ../scripts/exec_test.py cpu "normal" +fi diff --git a/examples/matrix/batch_matmul_tscm/scripts/exec_test.py b/examples/matrix/batch_matmul_tscm/scripts/exec_test.py new file mode 100644 index 00000000..1e89845b --- /dev/null +++ b/examples/matrix/batch_matmul_tscm/scripts/exec_test.py @@ -0,0 +1,120 @@ +#!/usr/bin/python3 +# coding=utf-8 + +# Copyright (c) 2025 Huawei Technologies Co., Ltd. +# This file is a part of the CANN Open Software. +# Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. 
+# ====================================================================================================================== +import os +import sys +import csv +import time +import logging +import subprocess + +import numpy as np + +sys.path.append("../..") +from gen_batch_data import MatmulGenData +from common_scripts.compare_data import compare_data +from common_scripts.exec_utils import get_file_work_dir, get_case_list, get_perf_task_duration, get_process_case_cmd + +IS_BIAS = False +IS_TRANS_A = False +IS_TRANS_B = True +DATA_TYPE_STR = "float16_float32" +A_FORMAT = "NZ" +B_FORMAT = "ND" + +logging.basicConfig(level=logging.INFO) + + +class ProcessParams: + def __init__(self, case_name, m, n, k, b, is_perf, run_mode): + self.case_name = case_name + self.m = m + self.n = n + self.k = k + self.b = b + self.is_perf = is_perf + self.run_mode = run_mode + + +def process_case(file_work_dir, process_params): + case_name, m, n, k, b, is_perf, run_mode = process_params.case_name, process_params.m, process_params.n,\ + process_params.k, process_params.b, process_params.is_perf, process_params.run_mode + logging.info("[INFO] start process case [%s]" % (case_name)) + logging.info("[INFO] IS_PERF is set [%d]" % (is_perf)) + + matmul_gen_data = MatmulGenData(m, n, k, b, IS_TRANS_A, IS_TRANS_B, IS_BIAS, DATA_TYPE_STR) + if is_perf: + matmul_gen_data.gen_fake_golden_data(file_work_dir) + else: + matmul_gen_data.gen_golden_data(file_work_dir) + params_str = f"{m} {n} {k} {b}" + cmd = get_process_case_cmd(kernel_name="ascendc_batch_matmul_tscm_bbit", params_str=params_str,\ + is_perf=is_perf, run_mode=run_mode) + subprocess.run(cmd) + if is_perf: + wrong_num = -1 + else: + logging.info("[INFO] compare data case[%s]" % (case_name)) + wrong_num = compare_data(file_work_dir, DATA_TYPE_STR) + res_data = [] + res_data.append(case_name) + res_data.append(wrong_num) + res_data.append(m * n * b) + if wrong_num == -1: + res_data.append("None") + elif wrong_num / (m * n * b) > 0.001: + 
res_data.append("Fail") + else: + res_data.append("Success") + if is_perf: + task_duration = get_perf_task_duration("./prof_out") + res_data.append(task_duration) + return res_data + + +def main(): + args_len = len(sys.argv) - 1 + if args_len != 2: + logging.info("[ERROR] exec_test input params error!") + return -1 + + file_work_dir = get_file_work_dir() + if not os.path.exists(file_work_dir): + logging.info("[ERROR] file path %s not exist!" % (file_work_dir)) + return -1 + + is_perf = False + if sys.argv[2] == "perf": + is_perf = True + + case_list = get_case_list() + res_list = [["case_name", "wrong_num", "total_num", "result", "task_duration"]] + run_mode = sys.argv[1] + for is_process, case_name, m, n, k, b in case_list: + if is_process == 1: + process_params = ProcessParams(case_name, m, n, k, b, is_perf, run_mode) + res_data = process_case(file_work_dir, process_params) + res_list.append(res_data) + + timestamp = time.time() + result_file_name = "result_" + str(timestamp) + ".csv" + with open(os.path.join(file_work_dir, "output", result_file_name), 'w', newline='', encoding='utf-8') as csvfile: + writer = csv.writer(csvfile) + writer.writerows(res_list) + + logging.info("---------------RESULT---------------") + for res in res_list: + logging.info(res) + return 0 + + +if __name__ == "__main__": + main() diff --git a/examples/matrix/batch_matmul_tscm/scripts/gen_batch_data.py b/examples/matrix/batch_matmul_tscm/scripts/gen_batch_data.py new file mode 100644 index 00000000..ed8e61f0 --- /dev/null +++ b/examples/matrix/batch_matmul_tscm/scripts/gen_batch_data.py @@ -0,0 +1,113 @@ +#!/usr/bin/python3 +# coding=utf-8 + +# Copyright (c) 2025 Huawei Technologies Co., Ltd. +# This file is a part of the CANN Open Software. +# Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. 
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. +# ====================================================================================================================== +import os +import sys +import logging + +import numpy as np +import tensorflow as tf + +IS_OUTPUT_TXT = False + + +class MatmulGenData: + def __init__(self, m, n, k, b, is_trans_a, is_trans_b, is_bias,\ + data_type_str): + self.m = m + self.n = n + self.k = k + self.b = b + self.is_trans_a = is_trans_a + self.is_trans_b = is_trans_b + self.is_bias = is_bias + self.data_type_str = data_type_str + + @staticmethod + def gen_axes_for_transpose(offset, base): + return [x for x in range(offset)] + [x + offset for x in base] + + def nd_to_nz(self, data: np.ndarray): + ori_shape = data.shape + m_ori, n_ori = ori_shape[-2:] + batch_ori = ori_shape[:-2] + batch_num = len(batch_ori) + batch_padding = ((0, 0),) * batch_num + m0, n0 = 16, 16 + m1, n1 = (m_ori + m0 - 1) // m0, (n_ori + n0 - 1) // n0 + padding_m = m1 * m0 - m_ori + padding_n = n1 * n0 - n_ori + data = np.pad(data, (batch_padding + ((0, padding_m), (0, padding_n))), 'constant') + array_trans = self.gen_axes_for_transpose(len(data.shape) - 2, [2, 0, 1, 3]) + data = data.reshape(batch_ori + (m1, m0, n1, n0)).transpose(*array_trans) + return data + + def gen_golden_data_fp16_fp32(self, work_dir): + src_type = np.float16 + dst_type = np.float32 + bias_gm = np.random.uniform(-1, 1, [1, self.b * self.n]).astype(dst_type) + + for idx in range(self.b): + x1_gm_left = np.random.uniform(-1, 1, [self.m, self.k]).astype(src_type) + x2_gm_right = np.random.uniform(-1, 1, [self.k, self.n]).astype(src_type) + golden_one = np.matmul(x1_gm_left.astype(dst_type), x2_gm_right.astype(dst_type)).astype(dst_type) + if 
self.is_bias: + golden_one = golden_one + bias_gm[:, idx * self.n:(idx + 1) * self.n].astype(dst_type) + if self.is_trans_a: + x1_tmp = x1_gm_left.transpose() + else: + x1_tmp = x1_gm_left + x1_tmp = self.nd_to_nz(x1_tmp) + if self.is_trans_b: + x2_tmp = x2_gm_right.transpose() + else: + x2_tmp = x2_gm_right + if idx == 0: + x1_gm = x1_tmp + x2_gm = x2_tmp + golden = golden_one + else: + x1_gm = np.vstack((x1_gm, x1_tmp)) + x2_gm = np.vstack((x2_gm, x2_tmp)) + golden = np.vstack((golden, golden_one)) + + x1_gm.tofile(work_dir + "/input/x1_gm.bin") + x2_gm.tofile(work_dir + "/input/x2_gm.bin") + if self.is_bias: + bias_gm.tofile(work_dir + "/input/bias_gm.bin") + golden.tofile(work_dir + "/output/golden.bin") + + return 0 + + def gen_golden_data(self, work_dir): + if self.data_type_str == "float16_float32": + self.gen_golden_data_fp16_fp32(work_dir) + else: + logging.info("[ERROR] can't support data type %s" % (self.data_type_str)) + return -1 + return 0 + + def gen_fake_golden_data(self, work_dir): + data_type_bytes_ab = 2 # float16 + data_type_bytes_c = 4 # float32 + + file_byte = self.b * self.m * self.k * data_type_bytes_ab + with open(work_dir + "/input/x1_gm.bin", 'wb') as file: + file.truncate(file_byte) + + file_byte = self.b * self.k * self.n * data_type_bytes_ab + with open(work_dir + "/input/x2_gm.bin", 'wb') as file: + file.truncate(file_byte) + + if self.is_bias: + file_byte = self.b * 1 * self.n * data_type_bytes_c + with open(work_dir + "/input/bias_gm.bin", 'wb') as file: + file.truncate(file_byte) \ No newline at end of file diff --git a/examples/matrix/batch_matmul_tscm/testcase/case.csv b/examples/matrix/batch_matmul_tscm/testcase/case.csv new file mode 100644 index 00000000..4f0f0196 --- /dev/null +++ b/examples/matrix/batch_matmul_tscm/testcase/case.csv @@ -0,0 +1 @@ +1, case001, 32, 256, 64, 3 \ No newline at end of file diff --git a/examples/matrix/matmul_nbuffer33/CMakeLists.txt b/examples/matrix/matmul_nbuffer33/CMakeLists.txt index 7555bf6d..93483b92 100644 ---
a/examples/matrix/matmul_nbuffer33/CMakeLists.txt +++ b/examples/matrix/matmul_nbuffer33/CMakeLists.txt @@ -50,6 +50,7 @@ target_compile_options(ascendc_matmul_nbuffer33_bbit PRIVATE ) target_compile_definitions(ascendc_matmul_nbuffer33_bbit PRIVATE + $<$:ENABLE_NBUFFER33_FEATURE> SOC_VERSION="${SOC_VERSION}" ) diff --git a/examples/matrix/matmul_nbuffer33/README.md b/examples/matrix/matmul_nbuffer33/README.md index 4d60f7ba..9eb7f5f6 100644 --- a/examples/matrix/matmul_nbuffer33/README.md +++ b/examples/matrix/matmul_nbuffer33/README.md @@ -94,15 +94,16 @@ - 编译执行 ``` - bash run.sh -r [RUN_MODE] -v [SOC_VERSION] -p [IS_PERF] + bash run.sh -r [RUN_MODE] -v [SOC_VERSION] -p [IS_PERF] -e [ENABLE_FEATURE] ``` 其中脚本参数说明如下: - RUN_MODE :编译执行方式,可选择CPU调试,NPU仿真,NPU上板,对应参数分别为[cpu / sim / npu]。若需要详细了解NPU仿真相关内容,请参考[《算子开发工具msProf》](https://hiascend.com/document/redirect/CannCommunityToolMsProf)中的“工具使用”章节。 - SOC_VERSION :昇腾AI处理器型号,如果无法确定具体的[SOC_VERSION],则在安装昇腾AI处理器的服务器执行npu-smi info命令进行查询,在查询到的“Name”前增加Ascend信息,例如“Name”对应取值为xxxyy,实际配置的[SOC_VERSION]值为Ascendxxxyy。支持以下产品型号: - Atlas A2训练系列产品/Atlas 800I A2推理产品 - - IS_PERF : 是否获取执行性能数据,当前只在NPU执行时生效,打开后性能数据显示在执行结果中,可选择关闭和开启该功能,对应参数分别为[0 / 1]。 + - IS_PERF :是否获取执行性能数据,当前只在NPU执行时生效,打开后性能数据显示在执行结果中,可选择关闭和开启该功能,对应参数分别为[0 / 1]。 + - ENABLE_FEATURE :是否使能NBuffer33功能,可选择关闭和开启该功能,对应参数分别为[0 / 1],默认开启。 示例如下,Ascendxxxyy请替换为实际的AI处理器型号。 ``` - bash run.sh -r cpu -v Ascendxxxyy -p 0 + bash run.sh -r cpu -v Ascendxxxyy -p 0 -e 1 ``` \ No newline at end of file diff --git a/examples/matrix/matmul_nbuffer33/cmake/cpu_lib.cmake b/examples/matrix/matmul_nbuffer33/cmake/cpu_lib.cmake index 583dd703..56b4c9d9 100644 --- a/examples/matrix/matmul_nbuffer33/cmake/cpu_lib.cmake +++ b/examples/matrix/matmul_nbuffer33/cmake/cpu_lib.cmake @@ -20,6 +20,10 @@ target_link_libraries(ascendc_kernels_${RUN_MODE} PRIVATE tikicpulib::${SOC_VERSION} ) +target_compile_definitions(ascendc_kernels_${RUN_MODE} PRIVATE + $<$:ENABLE_NBUFFER33_FEATURE> +) + 
target_compile_options(ascendc_kernels_${RUN_MODE} PRIVATE -g -O2 diff --git a/examples/matrix/matmul_nbuffer33/cmake/npu_lib.cmake b/examples/matrix/matmul_nbuffer33/cmake/npu_lib.cmake index 927d2490..d7a7e292 100644 --- a/examples/matrix/matmul_nbuffer33/cmake/npu_lib.cmake +++ b/examples/matrix/matmul_nbuffer33/cmake/npu_lib.cmake @@ -21,7 +21,8 @@ ascendc_library(ascendc_kernels_${RUN_MODE} STATIC ) ascendc_compile_definitions(ascendc_kernels_${RUN_MODE} PRIVATE - -DASCENDC_DUMP - -DHAVE_WORKSPACE - -DHAVE_TILING - ) \ No newline at end of file + $<$:ENABLE_NBUFFER33_FEATURE> + -DASCENDC_DUMP + -DHAVE_WORKSPACE + -DHAVE_TILING +) \ No newline at end of file diff --git a/examples/matrix/matmul_nbuffer33/op_kernel/matmul_nbuffer33_custom_impl.h b/examples/matrix/matmul_nbuffer33/op_kernel/matmul_nbuffer33_custom_impl.h index dc4b4216..70d9e328 100644 --- a/examples/matrix/matmul_nbuffer33/op_kernel/matmul_nbuffer33_custom_impl.h +++ b/examples/matrix/matmul_nbuffer33/op_kernel/matmul_nbuffer33_custom_impl.h @@ -21,6 +21,7 @@ class MatmulNBuffer33Kernel { __aicore__ inline MatmulNBuffer33Kernel(){}; __aicore__ inline void Init(GM_ADDR a, GM_ADDR b, GM_ADDR bias, GM_ADDR c, GM_ADDR workspace, const TCubeTiling& tiling); __aicore__ inline void Process(AscendC::TPipe* pipe); +#ifdef ENABLE_NBUFFER33_FEATURE AscendC::MatmulImpl< AscendC::MatmulType, AscendC::MatmulType, @@ -28,6 +29,14 @@ class MatmulNBuffer33Kernel { AscendC::MatmulType, CFG_MDL, AscendC::MatmulCallBackFunc, AscendC::Impl::Detail::NBuffer33MatmulPolicy> matmulObj; +#else + AscendC::MatmulImpl< + AscendC::MatmulType, + AscendC::MatmulType, + AscendC::MatmulType, + AscendC::MatmulType, CFG_MDL, + AscendC::MatmulCallBackFunc> matmulObj; +#endif private: __aicore__ inline void CalcOffset(int32_t blockIdx, const TCubeTiling& tiling, int32_t& offsetA, int32_t& offsetB, diff --git a/examples/matrix/matmul_nbuffer33/run.sh b/examples/matrix/matmul_nbuffer33/run.sh index 5e271e9d..6443aff0 100644 --- 
a/examples/matrix/matmul_nbuffer33/run.sh +++ b/examples/matrix/matmul_nbuffer33/run.sh @@ -9,8 +9,9 @@ # ====================================================================================================================== export IS_PERF="0" +export ENABLE_FEATURE="1" -SHORT=r:,v:,p:, +SHORT=r:,v:,p:,e:, -LONG=run-mode:,soc-version:,perf:, +LONG=run-mode:,soc-version:,perf:,enable-feature:, OPTS=$(getopt -a --options $SHORT --longoptions $LONG -- "$@") eval set -- "$OPTS" @@ -26,6 +27,9 @@ do (-p | --perf ) IS_PERF="$2" shift 2;; + (-e | --enable-feature ) + ENABLE_FEATURE="$2" + shift 2;; (--) shift; break;; @@ -51,6 +55,11 @@ if [ "${IS_PERF}" != "0" ] && [ "${IS_PERF}" != "1" ]; then exit 1 fi +if [ "${ENABLE_FEATURE}" != "0" ] && [ "${ENABLE_FEATURE}" != "1" ]; then + echo "[ERROR] Unsupported ENABLE_FEATURE: ${ENABLE_FEATURE}, which can only be 0 or 1." + exit 1 +fi + # only npu mode support is_perf = 1 if [ "${IS_PERF}" == "1" ] && [ "${RUN_MODE}" != "npu" ]; then echo "[ERROR] Unsupported IS_PERF: ${IS_PERF} while RUN_MODE is ${RUN_MODE}." @@ -65,7 +74,7 @@ source $ASCEND_HOME_DIR/bin/setenv.bash export LD_LIBRARY_PATH=${ASCEND_HOME_DIR}/tools/simulator/${SOC_VERSION}/lib:$LD_LIBRARY_PATH set -euo pipefail -cmake -DRUN_MODE=${RUN_MODE} -DSOC_VERSION=${SOC_VERSION} -DASCEND_CANN_PACKAGE_PATH=${ASCEND_HOME_DIR} .. +cmake -DRUN_MODE=${RUN_MODE} -DSOC_VERSION=${SOC_VERSION} -DENABLE_FEATURE=${ENABLE_FEATURE} -DASCEND_CANN_PACKAGE_PATH=${ASCEND_HOME_DIR} ..
make -j16 cd ../ diff --git a/examples/matrix/matmul_preload/CMakeLists.txt b/examples/matrix/matmul_preload/CMakeLists.txt index 67d8676c..02b970b5 100644 --- a/examples/matrix/matmul_preload/CMakeLists.txt +++ b/examples/matrix/matmul_preload/CMakeLists.txt @@ -59,6 +59,7 @@ target_compile_definitions(ascendc_matmul_preload_bbit PRIVATE target_compile_definitions(ascendc_matmul_preload_bbit PRIVATE $<$>:CUSTOM_PRELOAD_N> + $<$:ENABLE_PRELOAD_FEATURE> PRELOAD_MODE="${PRELOAD_MODE}" ) diff --git a/examples/matrix/matmul_preload/README.md b/examples/matrix/matmul_preload/README.md index fec924ee..860bf237 100644 --- a/examples/matrix/matmul_preload/README.md +++ b/examples/matrix/matmul_preload/README.md @@ -86,7 +86,7 @@ matmul单算子,对输入的A、B矩阵做矩阵乘和加bias偏置。在MTE2 - 编译执行 ``` - bash run.sh -r [RUN_MODE] -v [SOC_VERSION] -m [MODE] -p [IS_PERF] + bash run.sh -r [RUN_MODE] -v [SOC_VERSION] -m [MODE] -p [IS_PERF] -e [ENABLE_FEATURE] ``` 其中脚本参数说明如下: - RUN_MODE :编译执行方式,可选择CPU调试,NPU仿真,NPU上板,对应参数分别为[cpu / sim / npu]。若需要详细了解NPU仿真相关内容,请参考[《算子开发工具msProf》](https://hiascend.com/document/redirect/CannCommunityToolMsProf)中的“工具使用”章节。 @@ -97,8 +97,9 @@ matmul单算子,对输入的A、B矩阵做矩阵乘和加bias偏置。在MTE2 - M - N - IS_PERF : 是否获取执行性能数据,当前只在NPU执行时生效,打开后性能数据显示在执行结果中,可选择关闭和开启该功能,对应参数分别为[0 / 1]。 + - ENABLE_FEATURE :是否使能Preload功能,可选择关闭和开启该功能,对应参数分别为[0 / 1],默认开启。 示例如下,Ascendxxxyy请替换为实际的AI处理器型号。 ``` - bash run.sh -r cpu -v Ascendxxxyy -m M -p 0 + bash run.sh -r cpu -v Ascendxxxyy -m M -p 0 -e 1 ``` diff --git a/examples/matrix/matmul_preload/cmake/cpu_lib.cmake b/examples/matrix/matmul_preload/cmake/cpu_lib.cmake index c4e83c0d..0d3a989a 100644 --- a/examples/matrix/matmul_preload/cmake/cpu_lib.cmake +++ b/examples/matrix/matmul_preload/cmake/cpu_lib.cmake @@ -22,6 +22,7 @@ target_link_libraries(ascendc_kernels_${RUN_MODE} PRIVATE target_compile_definitions(ascendc_kernels_${RUN_MODE} PRIVATE $<$>:CUSTOM_PRELOAD_N> + $<$:ENABLE_PRELOAD_FEATURE> ) target_compile_options(ascendc_kernels_${RUN_MODE} PRIVATE diff 
--git a/examples/matrix/matmul_preload/cmake/npu_lib.cmake b/examples/matrix/matmul_preload/cmake/npu_lib.cmake index 52db61e6..56cf9e25 100644 --- a/examples/matrix/matmul_preload/cmake/npu_lib.cmake +++ b/examples/matrix/matmul_preload/cmake/npu_lib.cmake @@ -22,6 +22,7 @@ ascendc_library(ascendc_kernels_${RUN_MODE} STATIC ascendc_compile_definitions(ascendc_kernels_${RUN_MODE} PRIVATE $<$>:CUSTOM_PRELOAD_N> + $<$:ENABLE_PRELOAD_FEATURE> -DASCENDC_DUMP -DHAVE_WORKSPACE -DHAVE_TILING diff --git a/examples/matrix/matmul_preload/op_kernel/matmul_preload_custom_impl.h b/examples/matrix/matmul_preload/op_kernel/matmul_preload_custom_impl.h index 03775c43..f6867fd5 100644 --- a/examples/matrix/matmul_preload/op_kernel/matmul_preload_custom_impl.h +++ b/examples/matrix/matmul_preload/op_kernel/matmul_preload_custom_impl.h @@ -22,7 +22,11 @@ class MatmulPreloadKernel { __aicore__ inline void Init(GM_ADDR a, GM_ADDR b, GM_ADDR bias, GM_ADDR c, GM_ADDR workspace, const TCubeTiling& tiling); __aicore__ inline void Process(AscendC::TPipe* pipe); +#ifdef ENABLE_PRELOAD_FEATURE static constexpr MatmulConfig MM_CFG = GetMDLConfig(false, false, preloadMode); // enable preload M/N +#else + static constexpr MatmulConfig MM_CFG = GetMDLConfig(); +#endif AscendC::Matmul, AscendC::MatmulType, AscendC::MatmulType, diff --git a/examples/matrix/matmul_preload/run.sh b/examples/matrix/matmul_preload/run.sh index f3cfe201..87d9d1a9 100644 --- a/examples/matrix/matmul_preload/run.sh +++ b/examples/matrix/matmul_preload/run.sh @@ -9,7 +9,8 @@ # ====================================================================================================================== export MODE="M" export IS_PERF="0" -SHORT=r:,v:,p:, +export ENABLE_FEATURE="1" +SHORT=r:,v:,p:,e:, -LONG=run-mode:,soc-version:,mode:,perf:, +LONG=run-mode:,soc-version:,mode:,perf:,enable-feature:, OPTS=$(getopt -a --options $SHORT --longoptions $LONG -- "$@") eval set -- "$OPTS" @@ -28,6 +29,9 @@ do (-p | --perf ) IS_PERF="$2" shift 2;; + (-e | --enable-feature ) + ENABLE_FEATURE="$2" +
shift 2;; (--) shift; break;; @@ -53,6 +57,11 @@ if [ "${PRELOAD_MODE}" != "M" ] && [ "${PRELOAD_MODE}" != "N" ]; then exit 1 fi +if [ "${ENABLE_FEATURE}" != "0" ] && [ "${ENABLE_FEATURE}" != "1" ]; then + echo "[ERROR] Unsupported ENABLE_FEATURE: ${ENABLE_FEATURE}, which can only be 0 or 1." + exit 1 +fi + if [ "${IS_PERF}" != "0" ] && [ "${IS_PERF}" != "1" ]; then echo "[ERROR] Unsupported IS_PERF: ${IS_PERF}, which can only be 0 or 1." exit 1 @@ -72,7 +81,7 @@ source $ASCEND_HOME_DIR/bin/setenv.bash export LD_LIBRARY_PATH=${ASCEND_HOME_DIR}/tools/simulator/${SOC_VERSION}/lib:$LD_LIBRARY_PATH set -euo pipefail -cmake -DRUN_MODE=${RUN_MODE} -DSOC_VERSION=${SOC_VERSION} -DPRELOAD_MODE=${PRELOAD_MODE} -DASCEND_CANN_PACKAGE_PATH=${ASCEND_HOME_DIR} .. +cmake -DRUN_MODE=${RUN_MODE} -DSOC_VERSION=${SOC_VERSION} -DPRELOAD_MODE=${PRELOAD_MODE} -DENABLE_FEATURE=${ENABLE_FEATURE} -DASCEND_CANN_PACKAGE_PATH=${ASCEND_HOME_DIR} .. make -j16 cd ../ diff --git a/examples/matrix/matmul_splitk/CMakeLists.txt b/examples/matrix/matmul_splitk/CMakeLists.txt index 91bab65c..591c2ee1 100644 --- a/examples/matrix/matmul_splitk/CMakeLists.txt +++ b/examples/matrix/matmul_splitk/CMakeLists.txt @@ -50,6 +50,7 @@ target_compile_options(ascendc_matmul_splitk_bbit PRIVATE ) target_compile_definitions(ascendc_matmul_splitk_bbit PRIVATE + $<$:ENABLE_SPLITK_FEATURE> SOC_VERSION="${SOC_VERSION}" ) diff --git a/examples/matrix/matmul_splitk/README.md b/examples/matrix/matmul_splitk/README.md index 43ee9419..448ea9bf 100644 --- a/examples/matrix/matmul_splitk/README.md +++ b/examples/matrix/matmul_splitk/README.md @@ -91,15 +91,16 @@ - 编译执行 ``` - bash run.sh -r [RUN_MODE] -v [SOC_VERSION] -p [IS_PERF] + bash run.sh -r [RUN_MODE] -v [SOC_VERSION] -p [IS_PERF] -e [ENABLE_FEATURE] ``` 其中脚本参数说明如下: - RUN_MODE :编译执行方式,可选择CPU调试,NPU仿真,NPU上板,对应参数分别为[cpu / sim / npu]。若需要详细了解NPU仿真相关内容,请参考[《算子开发工具msProf》](https://hiascend.com/document/redirect/CannCommunityToolMsProf)中的“工具使用”章节。 - SOC_VERSION 
:昇腾AI处理器型号,如果无法确定具体的[SOC_VERSION],则在安装昇腾AI处理器的服务器执行npu-smi info命令进行查询,在查询到的“Name”前增加Ascend信息,例如“Name”对应取值为xxxyy,实际配置的[SOC_VERSION]值为Ascendxxxyy。支持以下产品型号: - Atlas A2训练系列产品/Atlas 800I A2推理产品 - - IS_PERF : 是否获取执行性能数据,当前只在NPU执行时生效,打开后性能数据显示在执行结果中,可选择关闭和开启该功能,对应参数分别为[0 / 1]。 + - IS_PERF :是否获取执行性能数据,当前只在NPU执行时生效,打开后性能数据显示在执行结果中,可选择关闭和开启该功能,对应参数分别为[0 / 1]。 + - ENABLE_FEATURE :是否使能多核切K功能,可选择关闭和开启该功能,对应参数分别为[0 / 1],默认开启。 示例如下,Ascendxxxyy请替换为实际的AI处理器型号。 ``` - bash run.sh -r cpu -v Ascendxxxyy -p 0 + bash run.sh -r cpu -v Ascendxxxyy -p 0 -e 1 ``` \ No newline at end of file diff --git a/examples/matrix/matmul_splitk/cmake/cpu_lib.cmake b/examples/matrix/matmul_splitk/cmake/cpu_lib.cmake index 244469fb..8e55bd2f 100644 --- a/examples/matrix/matmul_splitk/cmake/cpu_lib.cmake +++ b/examples/matrix/matmul_splitk/cmake/cpu_lib.cmake @@ -20,6 +20,10 @@ target_link_libraries(ascendc_kernels_${RUN_MODE} PRIVATE tikicpulib::${SOC_VERSION} ) +target_compile_definitions(ascendc_kernels_${RUN_MODE} PRIVATE + $<$:ENABLE_SPLITK_FEATURE> +) + target_compile_options(ascendc_kernels_${RUN_MODE} PRIVATE -g -O0 diff --git a/examples/matrix/matmul_splitk/cmake/npu_lib.cmake b/examples/matrix/matmul_splitk/cmake/npu_lib.cmake index 927d2490..87621862 100644 --- a/examples/matrix/matmul_splitk/cmake/npu_lib.cmake +++ b/examples/matrix/matmul_splitk/cmake/npu_lib.cmake @@ -21,7 +21,8 @@ ascendc_library(ascendc_kernels_${RUN_MODE} STATIC ) ascendc_compile_definitions(ascendc_kernels_${RUN_MODE} PRIVATE - -DASCENDC_DUMP - -DHAVE_WORKSPACE - -DHAVE_TILING - ) \ No newline at end of file + $<$:ENABLE_SPLITK_FEATURE> + -DASCENDC_DUMP + -DHAVE_WORKSPACE + -DHAVE_TILING +) \ No newline at end of file diff --git a/examples/matrix/matmul_splitk/op_host/matmul_splitk_custom_tiling.cpp b/examples/matrix/matmul_splitk/op_host/matmul_splitk_custom_tiling.cpp index 9fb919dd..3c987063 100644 --- a/examples/matrix/matmul_splitk/op_host/matmul_splitk_custom_tiling.cpp +++ 
b/examples/matrix/matmul_splitk/op_host/matmul_splitk_custom_tiling.cpp @@ -39,8 +39,10 @@ TCubeTiling GenerateTiling(const MatmulCaseParams& testCaseParams) cubeTiling.SetShape(M, N, K); cubeTiling.EnableBias(isBias); cubeTiling.SetBufferSpace(-1, -1, -1); +#ifdef ENABLE_SPLITK_FEATURE // tiling enbale split K cubeTiling.EnableMultiCoreSplitK(true); +#endif if (cubeTiling.GetTiling(tilingData) == -1) { std::cout << "Generate tiling failed." << std::endl; return {}; diff --git a/examples/matrix/matmul_splitk/op_kernel/matmul_splitk_custom_impl.h b/examples/matrix/matmul_splitk/op_kernel/matmul_splitk_custom_impl.h index 26a36bb2..fe671fa9 100644 --- a/examples/matrix/matmul_splitk/op_kernel/matmul_splitk_custom_impl.h +++ b/examples/matrix/matmul_splitk/op_kernel/matmul_splitk_custom_impl.h @@ -77,7 +77,11 @@ __aicore__ inline void MatmulSplitkKernel::Proces if (tiling.isBias) { matmulObj.SetBias(biasGlobal); } +#ifdef ENABLE_SPLITK_FEATURE uint8_t enAtomic = 1; // set AtomicAdd +#else + uint8_t enAtomic = 0; +#endif matmulObj.IterateAll(cGlobal, enAtomic); matmulObj.End(); } diff --git a/examples/matrix/matmul_splitk/run.sh b/examples/matrix/matmul_splitk/run.sh index d4369bd0..72e4ffaa 100644 --- a/examples/matrix/matmul_splitk/run.sh +++ b/examples/matrix/matmul_splitk/run.sh @@ -9,8 +9,9 @@ # ====================================================================================================================== export IS_PERF="0" +export ENABLE_FEATURE="1" -SHORT=r:,v:,p:, +SHORT=r:,v:,p:,e:, LONG=run-mode:,soc-version:,perf:, OPTS=$(getopt -a --options $SHORT --longoptions $LONG -- "$@") eval set -- "$OPTS" @@ -26,6 +27,9 @@ do (-p | --perf ) IS_PERF="$2" shift 2;; + (-e | --enable-feature ) + ENABLE_FEATURE="$2" + shift 2;; (--) shift; break;; @@ -51,6 +55,11 @@ if [ "${IS_PERF}" != "0" ] && [ "${IS_PERF}" != "1" ]; then exit 1 fi +if [ "${ENABLE_FEATURE}" != "0" ] && [ "${ENABLE_FEATURE}" != "1" ]; then + echo "[ERROR] Unsupported ENABLE_FEATURE: 
${ENABLE_FEATURE}, which can only be 0 or 1." + exit 1 +fi + # only npu mode support is_perf = 1 if [ "${IS_PERF}" == "1" ] && [ "${RUN_MODE}" != "npu" ]; then echo "[ERROR] Unsupported IS_PERF: ${IS_PERF} while RUN_MODE is ${RUN_MODE}." @@ -65,7 +74,7 @@ source $ASCEND_HOME_DIR/bin/setenv.bash export LD_LIBRARY_PATH=${ASCEND_HOME_DIR}/tools/simulator/${SOC_VERSION}/lib:$LD_LIBRARY_PATH set -euo pipefail -cmake -DRUN_MODE=${RUN_MODE} -DSOC_VERSION=${SOC_VERSION} -DASCEND_CANN_PACKAGE_PATH=${ASCEND_HOME_DIR} .. +cmake -DRUN_MODE=${RUN_MODE} -DSOC_VERSION=${SOC_VERSION} -DENABLE_FEATURE=${ENABLE_FEATURE} -DASCEND_CANN_PACKAGE_PATH=${ASCEND_HOME_DIR} .. make -j16 cd ../ diff --git a/examples/matrix/matmul_splitk/testcase/case.csv b/examples/matrix/matmul_splitk/testcase/case.csv index 5b39cef8..0036707a 100644 --- a/examples/matrix/matmul_splitk/testcase/case.csv +++ b/examples/matrix/matmul_splitk/testcase/case.csv @@ -1,2 +1 @@ -1, case001, 16, 16, 1024 -0, case002, 256, 512, 1536 \ No newline at end of file +1, case001, 16, 16, 1024 \ No newline at end of file diff --git a/examples/readme.md b/examples/readme.md index 1701252c..e51ff5ea 100644 --- a/examples/readme.md +++ b/examples/readme.md @@ -36,7 +36,7 @@ 对输入tensor按行做如下公式的计算:zi = (xi - ∑(xi * yi)) * yi,其中∑为按行reduce求和。 - matrix + matrix basic_block_matmul 实现无尾块且tiling的base块大小固定的场景下的Matmul矩阵乘法,计算公式为:C = A * B + Bias。 @@ -167,6 +167,10 @@ matmul_perf 性能优化样例,包含纯Cube模式、MDL模板、UnitFlag、Tiling全量常量化四个特性的Matmul算子,计算公式为:C = A * B + Bias。 + + batch_matmul_tscm + 左矩阵A为TSCM输入的BatchMatmul计算,单次Matmul计算公式为:C = A * B + Bias。 + normalization layernorm @@ -258,4 +262,3 @@ - -- Gitee