diff --git a/examples/matrix/matmul_ibshareB/CMakeLists.txt b/examples/matrix/matmul_ibshareB/CMakeLists.txt index ed860d62edbdc6d5e497032162f9d0f9cb215d45..ff71e203634c2f7af5dea949ece96613127eef15 100644 --- a/examples/matrix/matmul_ibshareB/CMakeLists.txt +++ b/examples/matrix/matmul_ibshareB/CMakeLists.txt @@ -9,7 +9,7 @@ cmake_minimum_required(VERSION 3.16) project(Ascend_c) -if(${RUN_MODE}) +if (${RUN_MODE}) set(RUN_MODE "npu" CACHE STRING "cpu/sim/npu") endif() if (${SOC_VERSION}) @@ -26,7 +26,7 @@ if(CMAKE_INSTALL_PREFIX STREQUAL /usr/local) endif() file(GLOB KERNEL_FILES - ${CMAKE_CURRENT_SOURCE_DIR}/op_kernel/matmul_ibshare_custom.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/op_kernel/matmul_ibshareb_custom_kernel.cpp ) if("${RUN_MODE}" STREQUAL "cpu") @@ -37,29 +37,30 @@ else() message("invalid RUN_MODE: ${RUN_MODE}") endif() -add_executable(ascendc_matmul_ibshare_bbit +add_executable(ascendc_matmul_ibshareb_bbit ${CMAKE_CURRENT_SOURCE_DIR}/main.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/op_host/matmul_ibshare_custom_tiling.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/op_host/matmul_ibshareb_custom_tiling.cpp ) -target_compile_options(ascendc_matmul_ibshare_bbit PRIVATE +target_compile_options(ascendc_matmul_ibshareb_bbit PRIVATE $:-g>> -O2 -std=c++17 -D_GLIBCXX_USE_CXX11_ABI=0 ) -target_compile_definitions(ascendc_matmul_ibshare_bbit PRIVATE +target_compile_definitions(ascendc_matmul_ibshareb_bbit PRIVATE + $<$:ENABLE_IBSHAREB> SOC_VERSION="${SOC_VERSION}" ) -target_include_directories(ascendc_matmul_ibshare_bbit PRIVATE +target_include_directories(ascendc_matmul_ibshareb_bbit PRIVATE ${CMAKE_CURRENT_SOURCE_DIR} $:${ASCEND_CANN_PACKAGE_PATH}/include>> $:${ASCEND_CANN_PACKAGE_PATH}/runtime/include>> ) -target_link_libraries(ascendc_matmul_ibshare_bbit PRIVATE +target_link_libraries(ascendc_matmul_ibshareb_bbit PRIVATE $,$>:host_intf_pub>> $:tikicpulib::${SOC_VERSION}>> $:ascendcl>> diff --git a/examples/matrix/matmul_ibshareB/README.md b/examples/matrix/matmul_ibshareB/README.md index 69e805a1ee95c8b0e59835e60576903270bce873..b259a14188d566cce81066f5f51474fafc499f1f 100644 --- a/examples/matrix/matmul_ibshareB/README.md +++ b/examples/matrix/matmul_ibshareB/README.md @@ -1,6 +1,6 @@ ## 概述 -本样例介绍了调用Matmul高阶API实现开启IBShare功能的单算子。IBShare的功能是复用L1 Buffer上相同的A矩阵或者B矩阵数据,减少数据搬运开销。本样例仅为B矩阵复用场景。 +本样例介绍了调用Matmul高阶API实现开启IBShare功能的单算子。IBShare的功能是复用L1 Buffer上相同的A矩阵或者B矩阵数据,减少数据搬运开销。本样例为仅B矩阵复用场景。 本样例以直调的方式调用算子核函数。 直调:核函数的基础调用方式,开发者完成算子核函数的开发和Tiling实现后,即可通过AscendCL运行时接口,完成算子的调用。包含CPU侧、NPU侧和仿真三种运行验证方法。 @@ -22,26 +22,26 @@ ## 算子描述 - 算子功能 - MatmulIbshareCustom算子通过调用Matmul高阶API,开启IBShare功能,对输入的A、B矩阵做矩阵乘和加bias偏置。 + MatmulIBShareBCustom算子通过调用Matmul高阶API,开启B矩阵的IBShare功能,对输入的A、B矩阵做矩阵乘和加bias偏置。 - 算子规格 - + - + - +
-| 算子类型(OpType) | MatmulIbshareCustom |
+| 算子类型(OpType) | MatmulIBShareBCustom |
 | 算子输入 | name | shape | data type | format | isTrans |
-| | a | - | float16 | ND | true |
+| | a | - | float16 | ND | false |
 | | b | - | float16 | ND | false |
 | | bias | - | float | ND | - |
 | 算子输出 | c | - | float | ND | - |
-| 核函数名 | matmul_ibshare_custom |
+| 核函数名 | matmul_ibshareb_custom |
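The MatmulType declarations quoted later in this README lost their angle-bracket template arguments in formatting. As a reading aid, here is a minimal sketch of the B-matrix IBShare setup this sample describes. The MatmulType parameter order (position, format, element type, isTrans, layout, ibShare) is an assumption based on the AscendC headers; CFG_IBSHARE_NORM is the config named in this README.

```
#include "lib/matmul_intf.h"

// Assumed parameter order: MatmulType<TPosition, CubeFormat, type, isTrans, LayoutMode, ibShare>.
using A_TYPE    = AscendC::MatmulType<AscendC::TPosition::GM, CubeFormat::ND, half>;
// Only B sets ibShare = true: the two AIVs paired with one AIC then reuse the
// same B tile resident in L1, which is the data movement IBShareB saves.
using B_TYPE    = AscendC::MatmulType<AscendC::TPosition::GM, CubeFormat::ND, half,
                                      false, LayoutMode::NONE, true>;
using C_TYPE    = AscendC::MatmulType<AscendC::TPosition::GM, CubeFormat::ND, float>;
using BIAS_TYPE = AscendC::MatmulType<AscendC::TPosition::GM, CubeFormat::ND, float>;

// CFG_IBSHARE_NORM is the default IBShare template this sample uses.
AscendC::Matmul<A_TYPE, B_TYPE, C_TYPE, BIAS_TYPE, CFG_IBSHARE_NORM> matmulObj;
```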
## 算子实现介绍 -本样例是仅B矩阵使能IBShare的场景,输入拆成40份。MIX场景下AIC:AIV=1:2,本样例开启20个AIC和40个AIV,计算过程中同一AIC对应的两个AIV在每次迭代时用到的B矩阵数据都一致。 +本样例是仅B矩阵使能IBShare的场景,计算过程中同一AIC对应的两个AIV在每次迭代时用到的B矩阵数据都一致。 - 约束条件: - A矩阵或B矩阵单独使能IBShare的场景,要求复用的矩阵必须在L1 Buffer上全载。 @@ -52,15 +52,13 @@ - C为目的操作数,存放矩阵乘结果的矩阵,形状为[M, N]。 - Bias为矩阵乘偏置,形状为[1, N]。对A*B结果矩阵的每一行都采用该bias进行偏置。 - 具体步骤: - - 创建Matmul对象,设置B矩阵使能IBSHARE。 + - 创建Matmul对象,设置B矩阵的IBShare参数为true,使用默认的IBShare模板CFG_IBSHARE_NORM创建Matmul对象。 ``` - // ibshareA设置为false,ibshareB设置为true - constexpr static MatmulConfig MM_CFG = GetCFG(); - AscendC::Matmul< - AscendC::MatmulType, - AscendC::MatmulType, - AscendC::MatmulType, - AscendC::MatmulType, MM_CFG> matmulObj; + using A_TYPE = AscendC::MatmulType; + using B_TYPE = AscendC::MatmulType; + using C_TYPE = AscendC::MatmulType; + using BIAS_TYPE = AscendC::MatmulType; + AscendC::Matmul matmulObj; ``` - 初始化操作。 - 设置左矩阵A、右矩阵B、Bias。 @@ -87,15 +85,16 @@ - 编译执行 ``` - bash run.sh -r [RUN_MODE] -v [SOC_VERSION] -p [IS_PERF] + bash run.sh -r [RUN_MODE] -v [SOC_VERSION] -p [IS_PERF] -e [ENABLE_FEATURE] ``` 其中脚本参数说明如下: - RUN_MODE :编译执行方式,可选择CPU调试,NPU仿真,NPU上板,对应参数分别为[cpu / sim / npu]。若需要详细了解NPU仿真相关内容,请参考[《算子开发工具msProf》](https://hiascend.com/document/redirect/CannCommunityToolMsProf)中的“工具使用”章节。 - SOC_VERSION :昇腾AI处理器型号,如果无法确定具体的[SOC_VERSION],则在安装昇腾AI处理器的服务器执行npu-smi info命令进行查询,在查询到的“Name”前增加Ascend信息,例如“Name”对应取值为xxxyy,实际配置的[SOC_VERSION]值为Ascendxxxyy。支持以下产品型号: - Atlas A2训练系列产品/Atlas 800I A2推理产品 - IS_PERF : 是否获取执行性能数据,当前只在NPU执行时生效,打开后性能数据显示在执行结果中,可选择关闭和开启该功能,对应参数分别为[0 / 1]。 + - ENABLE_FEATURE :是否使能B矩阵IBShare功能,可选择关闭和开启该功能,对应参数分别为[0 / 1],默认开启。 示例如下,Ascendxxxyy请替换为实际的AI处理器型号。 ``` - bash run.sh -r cpu -v Ascendxxxyy -p 0 + bash run.sh -r cpu -v Ascendxxxyy -p 0 -e 1 ``` \ No newline at end of file diff --git a/examples/matrix/matmul_ibshareB/cmake/cpu_lib.cmake b/examples/matrix/matmul_ibshareB/cmake/cpu_lib.cmake index 244469fb44724e478a110e491392a7b66e56e034..09fda3d387cef823bfe263e82bb19dd28919d99b 100644 --- a/examples/matrix/matmul_ibshareB/cmake/cpu_lib.cmake +++ b/examples/matrix/matmul_ibshareB/cmake/cpu_lib.cmake @@ -20,6 +20,10 @@ target_link_libraries(ascendc_kernels_${RUN_MODE} PRIVATE tikicpulib::${SOC_VERSION} ) +target_compile_definitions(ascendc_kernels_${RUN_MODE} PRIVATE + $<$:ENABLE_IBSHAREB> +) + target_compile_options(ascendc_kernels_${RUN_MODE} PRIVATE -g -O0 @@ -27,5 +31,5 @@ target_compile_options(ascendc_kernels_${RUN_MODE} PRIVATE ) install(TARGETS ascendc_kernels_${RUN_MODE} -DESTINATION ${CMAKE_INSTALL_LIBDIR} -) \ No newline at end of file + DESTINATION ${CMAKE_INSTALL_LIBDIR} +) diff --git a/examples/matrix/matmul_ibshareB/cmake/npu_lib.cmake b/examples/matrix/matmul_ibshareB/cmake/npu_lib.cmake index 927d2490b8abb30245a57454309a4ad94b00bed7..e76e37b9e88082e40cf12810d3698a6f332db620 100644 --- a/examples/matrix/matmul_ibshareB/cmake/npu_lib.cmake +++ b/examples/matrix/matmul_ibshareB/cmake/npu_lib.cmake @@ -21,7 +21,8 @@ ascendc_library(ascendc_kernels_${RUN_MODE} STATIC ) ascendc_compile_definitions(ascendc_kernels_${RUN_MODE} PRIVATE - -DASCENDC_DUMP - -DHAVE_WORKSPACE - -DHAVE_TILING - ) \ No newline at end of file + $<$:ENABLE_IBSHAREB> + -DASCENDC_DUMP + -DHAVE_WORKSPACE + -DHAVE_TILING +) diff --git a/examples/matrix/matmul_ibshareB/main.cpp b/examples/matrix/matmul_ibshareB/main.cpp index 4fb3325630e4dbb922cbef508e0a4496997c506c..347a824d27098e45538cbd7d0a4583494207136e 100644 --- a/examples/matrix/matmul_ibshareB/main.cpp +++ b/examples/matrix/matmul_ibshareB/main.cpp @@ -7,28 +7,29 @@ * INCLUDING BUT NOT LIMITED TO 
NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. * See LICENSE in the root of the software repository for the full text of the License. */ + #include #include "../../common/data_utils.h" #include "kernel_tiling/kernel_tiling.h" #include "tiling/platform/platform_ascendc.h" -#include "op_host/matmul_ibshare_custom_tiling.h" +#include "op_host/matmul_ibshareb_custom_tiling.h" + #ifndef ASCENDC_CPU_DEBUG #include "acl/acl.h" - -extern void matmul_ibshare_custom_do(uint32_t coreDim, void* stream, - uint8_t* a, uint8_t* b, uint8_t* c, - uint8_t* workspace, uint8_t* tiling); +extern void matmul_ibshareb_custom_do(uint32_t coreDim, void* stream, + uint8_t* a, uint8_t* b, uint8_t* bias, uint8_t* c, + uint8_t* workspace, uint8_t* tiling); #else #include "tikicpulib.h" -extern "C" void matmul_ibshare_custom(uint8_t* a, uint8_t* b, uint8_t* c, - uint8_t* workspace, uint8_t* tiling); +extern "C" void matmul_ibshareb_custom(uint8_t* a, uint8_t* b, uint8_t* bias, uint8_t* c, + uint8_t* workspace, uint8_t* tiling); #endif - namespace { -constexpr bool hasBias = false; -constexpr bool ibshareA = false; -constexpr bool ibshareB = true; +constexpr int32_t MIX_RATIO = 2; // AIC:AIV=1:2 +constexpr bool IS_BIAS = false; +constexpr bool IS_A_TRANS = false; +constexpr bool IS_B_TRANS = false; struct MatrixFileSize { @@ -50,46 +51,45 @@ static size_t GetSysWorkSpaceSize() #ifdef ASCENDC_CPU_DEBUG void TestMatmulCpu(int64_t m, int64_t n, int64_t k, const MatrixFileSize& matrixFileSize) { - size_t aFileSize = matrixFileSize.x1FileSize; // uint16_t represent half - size_t bFileSize = matrixFileSize.x2FileSize; // uint16_t represent half - size_t cFileSize = matrixFileSize.yFileSize; + size_t x1FileSize = matrixFileSize.x1FileSize; + size_t x2FileSize = matrixFileSize.x2FileSize; + size_t yFileSize = matrixFileSize.yFileSize; + size_t biasFileSize = matrixFileSize.biasFileSize; + // Query workspace size size_t workspaceSize = GetSysWorkSpaceSize(); + uint8_t* x1 = (uint8_t*)AscendC::GmAlloc(x1FileSize); + uint8_t* x2 = (uint8_t*)AscendC::GmAlloc(x2FileSize); + uint8_t* bias = nullptr; + uint8_t* y = (uint8_t*)AscendC::GmAlloc(yFileSize); + uint8_t* workspace = (uint8_t*)AscendC::GmAlloc(workspaceSize); + ReadFile("../input/x1_gm.bin", x1FileSize, x1, x1FileSize); + ReadFile("../input/x2_gm.bin", x2FileSize, x2, x2FileSize); + if (IS_BIAS) { + bias = (uint8_t*)AscendC::GmAlloc(biasFileSize); + ReadFile("../input/bias_gm.bin", biasFileSize, bias, biasFileSize); + } size_t tilingFileSize = sizeof(TCubeTiling); - uint32_t mixRatio = 2; // AIC:AIV = 1:2 + uint8_t* tiling = (uint8_t*)AscendC::GmAlloc(tilingFileSize); + auto ascendcPlatform = platform_ascendc::PlatformAscendCManager::GetInstance(); MatmulHost::MatmulCaseParams testCaseParams{ - static_cast(m), static_cast(n), static_cast(k), hasBias, ibshareA, ibshareB}; - - uint8_t* a = (uint8_t* )AscendC::GmAlloc(aFileSize); - uint8_t* b = (uint8_t* )AscendC::GmAlloc(bFileSize); - uint8_t* c = (uint8_t* )AscendC::GmAlloc(cFileSize); - uint8_t* workspace = (uint8_t* )AscendC::GmAlloc(workspaceSize); - uint8_t* tiling = (uint8_t* )AscendC::GmAlloc(tilingFileSize); - - ReadFile("../input/x1_gm.bin", aFileSize, a, aFileSize); - ReadFile("../input/x2_gm.bin", bFileSize, b, bFileSize); + static_cast(m), static_cast(n), static_cast(k), IS_BIAS, IS_A_TRANS, IS_B_TRANS}; + // Calculate Tiling const auto& tilingData = MatmulHost::GenerateTiling(testCaseParams); memcpy_s(tiling, tilingFileSize, &tilingData, tilingFileSize); - - uint32_t blockDim; - if 
(ibshareA && ibshareB) { - blockDim = tilingData.usedCoreNum; - } else { - blockDim = tilingData.usedCoreNum / mixRatio; + ICPU_RUN_KF(matmul_ibshareb_custom, tilingData.usedCoreNum / MIX_RATIO, x1, x2, bias, y, workspace, tiling); + WriteFile("../output/output.bin", y, yFileSize); + AscendC::GmFree((void*)x1); + AscendC::GmFree((void*)x2); + AscendC::GmFree((void*)y); + AscendC::GmFree((void*)workspace); + AscendC::GmFree((void*)tiling); + if (IS_BIAS) { + AscendC::GmFree((void*)bias); } - - ICPU_RUN_KF(matmul_ibshare_custom, blockDim, a, b, c, workspace, tiling); - - WriteFile("../output/output.bin", c, cFileSize); - - AscendC::GmFree((void *)a); - AscendC::GmFree((void *)b); - AscendC::GmFree((void *)c); - AscendC::GmFree((void *)workspace); - AscendC::GmFree((void *)tiling); } // NPU #else -void MatmulOp(uint8_t* a, uint8_t* b, uint8_t* c, int64_t m, int64_t n, int64_t k, +void MatmulOp(uint8_t* x1, uint8_t* x2, uint8_t* y, uint8_t* bias, int64_t m, int64_t n, int64_t k, void* stream = nullptr) { // Init args @@ -105,20 +105,11 @@ void MatmulOp(uint8_t* a, uint8_t* b, uint8_t* c, int64_t m, int64_t n, int64_t uint8_t* tilingHost = nullptr; uint8_t* tilingDevice = nullptr; size_t tilingFileSize = sizeof(TCubeTiling); - + auto ascendcPlatform = platform_ascendc::PlatformAscendCManager::GetInstance(); MatmulHost::MatmulCaseParams testCaseParams{ - static_cast(m), static_cast(n), static_cast(k), hasBias, ibshareA, ibshareB}; - + static_cast(m), static_cast(n), static_cast(k), IS_BIAS, IS_A_TRANS, IS_B_TRANS}; // Calculate Tiling - uint32_t mixRatio = 2; // AIC:AIV = 1:2 const auto tilingData = MatmulHost::GenerateTiling(testCaseParams); - uint32_t blockDim; - if (ibshareA && ibshareB) { - blockDim = tilingData.usedCoreNum; - } else { - blockDim = tilingData.usedCoreNum / mixRatio; - } - CHECK_ACL(aclrtMallocHost((void**)(&tilingHost), tilingFileSize)); CHECK_ACL(aclrtMalloc((void**)&tilingDevice, tilingFileSize, ACL_MEM_MALLOC_HUGE_FIRST)); @@ -127,7 +118,7 @@ void MatmulOp(uint8_t* a, uint8_t* b, uint8_t* c, int64_t m, int64_t n, int64_t CHECK_ACL(aclrtMemcpy(tilingDevice, tilingFileSize, tilingHost, tilingFileSize, ACL_MEMCPY_HOST_TO_DEVICE)); // Launch kernel - matmul_ibshare_custom_do(blockDim, stream, a, b, c, workspaceDevice, tilingDevice); + matmul_ibshareb_custom_do(tilingData.usedCoreNum / MIX_RATIO, stream, x1, x2, bias, y, workspaceDevice, tilingDevice); CHECK_ACL(aclrtFreeHost(tilingHost)); CHECK_ACL(aclrtFree(workspaceDevice)); CHECK_ACL(aclrtFree(tilingDevice)); @@ -151,60 +142,68 @@ void TestAclDeInit(aclrtContext& context, aclrtStream& stream, int64_t& deviceId void TestMatmul(int64_t m, int64_t n, int64_t k, const MatrixFileSize& matrixFileSize) { - size_t aFileSize = matrixFileSize.x1FileSize; // uint16_t represent half - size_t bFileSize = matrixFileSize.x2FileSize; // uint16_t represent half - size_t cFileSize = matrixFileSize.yFileSize; + size_t x1FileSize = matrixFileSize.x1FileSize; + size_t x2FileSize = matrixFileSize.x2FileSize; + size_t yFileSize = matrixFileSize.yFileSize; + size_t biasFileSize = matrixFileSize.biasFileSize; aclrtContext context; aclrtStream stream = nullptr; int64_t deviceId = 0; TestAclInit(context, stream, deviceId); - uint8_t* aHost; - uint8_t* aDevice; - CHECK_ACL(aclrtMallocHost((void **)(&aHost), aFileSize)); - CHECK_ACL(aclrtMalloc((void **)&aDevice, aFileSize, ACL_MEM_MALLOC_HUGE_FIRST)); - ReadFile("../input/x1_gm.bin", aFileSize, aHost, aFileSize); - CHECK_ACL(aclrtMemcpy(aDevice, aFileSize, aHost, aFileSize, - 
ACL_MEMCPY_HOST_TO_DEVICE)); - - uint8_t* bHost; - uint8_t* bDevice; - CHECK_ACL(aclrtMallocHost((void **)(&bHost), bFileSize)); - CHECK_ACL(aclrtMalloc((void **)&bDevice, bFileSize, ACL_MEM_MALLOC_HUGE_FIRST)); - ReadFile("../input/x2_gm.bin", bFileSize, bHost, bFileSize); - CHECK_ACL(aclrtMemcpy(bDevice, bFileSize, bHost, bFileSize, - ACL_MEMCPY_HOST_TO_DEVICE)); - - uint8_t* cHost; - uint8_t* cDevice; - CHECK_ACL(aclrtMallocHost((void **)(&cHost), cFileSize)); - CHECK_ACL(aclrtMalloc((void **)&cDevice, cFileSize, ACL_MEM_MALLOC_HUGE_FIRST)); - - MatmulOp(aDevice, bDevice, cDevice, m, n, k, stream); - CHECK_ACL(aclrtSynchronizeStream(stream)); + uint8_t* x1Host = nullptr; + uint8_t* x1Device = nullptr; + CHECK_ACL(aclrtMallocHost((void**)(&x1Host), x1FileSize)); + CHECK_ACL(aclrtMalloc((void**)&x1Device, x1FileSize, ACL_MEM_MALLOC_HUGE_FIRST)); + ReadFile("../input/x1_gm.bin", x1FileSize, x1Host, x1FileSize); + CHECK_ACL(aclrtMemcpy(x1Device, x1FileSize, x1Host, x1FileSize, ACL_MEMCPY_HOST_TO_DEVICE)); + + uint8_t* x2Host = nullptr; + uint8_t* x2Device = nullptr; + CHECK_ACL(aclrtMallocHost((void**)(&x2Host), x2FileSize)); + CHECK_ACL(aclrtMalloc((void**)&x2Device, x2FileSize, ACL_MEM_MALLOC_HUGE_FIRST)); + ReadFile("../input/x2_gm.bin", x2FileSize, x2Host, x2FileSize); + CHECK_ACL(aclrtMemcpy(x2Device, x2FileSize, x2Host, x2FileSize, ACL_MEMCPY_HOST_TO_DEVICE)); + + uint8_t* biasHost = nullptr; + uint8_t* biasDevice = nullptr; + if (IS_BIAS) { + CHECK_ACL(aclrtMallocHost((void**)(&biasHost), biasFileSize)); + CHECK_ACL(aclrtMalloc((void**)&biasDevice, biasFileSize, ACL_MEM_MALLOC_HUGE_FIRST)); + ReadFile("../input/bias_gm.bin", biasFileSize, biasHost, biasFileSize); + CHECK_ACL(aclrtMemcpy(biasDevice, biasFileSize, biasHost, biasFileSize, ACL_MEMCPY_HOST_TO_DEVICE)); + } + uint8_t* yHost = nullptr; + uint8_t* yDevice = nullptr; + CHECK_ACL(aclrtMallocHost((void**)(&yHost), yFileSize)); + CHECK_ACL(aclrtMalloc((void**)&yDevice, yFileSize, ACL_MEM_MALLOC_HUGE_FIRST)); - CHECK_ACL(aclrtMemcpy(cHost, cFileSize, cDevice, cFileSize, - ACL_MEMCPY_DEVICE_TO_HOST)); - WriteFile("../output/output.bin", cHost, cFileSize); + MatmulOp(x1Device, x2Device, yDevice, biasDevice, m, n, k, stream); + CHECK_ACL(aclrtSynchronizeStream(stream)); - CHECK_ACL(aclrtFree(aDevice)); - CHECK_ACL(aclrtFreeHost(aHost)); - CHECK_ACL(aclrtFree(bDevice)); - CHECK_ACL(aclrtFreeHost(bHost)); - CHECK_ACL(aclrtFree(cDevice)); - CHECK_ACL(aclrtFreeHost(cHost)); + CHECK_ACL(aclrtMemcpy(yHost, yFileSize, yDevice, yFileSize, ACL_MEMCPY_DEVICE_TO_HOST)); + WriteFile("../output/output.bin", yHost, yFileSize); + if (IS_BIAS) { + CHECK_ACL(aclrtFree(biasDevice)); + CHECK_ACL(aclrtFreeHost(biasHost)); + } + CHECK_ACL(aclrtFree(x1Device)); + CHECK_ACL(aclrtFreeHost(x1Host)); + CHECK_ACL(aclrtFree(x2Device)); + CHECK_ACL(aclrtFreeHost(x2Host)); + CHECK_ACL(aclrtFree(yDevice)); + CHECK_ACL(aclrtFreeHost(yHost)); TestAclDeInit(context, stream, deviceId); } #endif -} - +} // namespace MatmulHost -int32_t main(int32_t argc, const char *args[]) +int32_t main(int32_t argc, const char* args[]) { int64_t inputParams[3] = {1, 1, 1}; - for (int32_t i = 1; i < argc && i < 4; ++i) { // 4 + for (int32_t i = 1; i < argc && i < 4; ++i) { // 4 used for inputParams loop std::stringstream ss(args[i]); ss >> inputParams[i - 1]; } diff --git a/examples/matrix/matmul_ibshareB/op_host/matmul_ibshare_custom_tiling.cpp b/examples/matrix/matmul_ibshareB/op_host/matmul_ibshareb_custom_tiling.cpp similarity index 64% rename from 
examples/matrix/matmul_ibshareB/op_host/matmul_ibshare_custom_tiling.cpp rename to examples/matrix/matmul_ibshareB/op_host/matmul_ibshareb_custom_tiling.cpp index ed2f91403998a25d7418c0ca0af1d966e543c3fa..6305e044498c4ea802a4f4980f8ea1bd5d24b8f3 100644 --- a/examples/matrix/matmul_ibshareB/op_host/matmul_ibshare_custom_tiling.cpp +++ b/examples/matrix/matmul_ibshareB/op_host/matmul_ibshareb_custom_tiling.cpp @@ -8,15 +8,13 @@ * See LICENSE in the root of the software repository for the full text of the License. */ -#include -#include +#include "matmul_ibshareb_custom_tiling.h" #include -#include -#include "tiling/tiling_api.h" -#include "tiling/platform/platform_ascendc.h" -#include "matmul_ibshare_custom_tiling.h" -namespace MatmulHost{ +namespace MatmulHost { + +constexpr int32_t USED_CORE_NUM = 2; // AIC:AIV=1:2 + TCubeTiling GenerateTiling(const MatmulCaseParams& testCaseParams) { TCubeTiling tilingData; @@ -24,28 +22,25 @@ TCubeTiling GenerateTiling(const MatmulCaseParams& testCaseParams) if (ascendcPlatform == nullptr) { return tilingData; } - auto aivCoreNum = ascendcPlatform->GetCoreNum(); matmul_tiling::MultiCoreMatmulTiling cubeTiling(*ascendcPlatform); - - int32_t M = testCaseParams.m; - int32_t N = testCaseParams.n; - int32_t K = testCaseParams.k; + uint32_t M = testCaseParams.m; + uint32_t N = testCaseParams.n; + uint32_t K = testCaseParams.k; bool hasBias = testCaseParams.hasBias; - bool ibshareA = testCaseParams.ibshareA; - bool ibshareB = testCaseParams.ibshareB; + bool isAtrans = testCaseParams.isATrans; + bool isBtrans = testCaseParams.isBTrans; - constexpr int32_t MIX_RATIO = 2; // AIC:AIV = 1:2 - int blockDim = aivCoreNum; - if (ibshareA && ibshareB) { - blockDim /= MIX_RATIO; - } - cubeTiling.SetDim(blockDim); - cubeTiling.SetAType(matmul_tiling::TPosition::GM, matmul_tiling::CubeFormat::ND, matmul_tiling::DataType::DT_FLOAT16); - cubeTiling.SetBType(matmul_tiling::TPosition::GM, matmul_tiling::CubeFormat::ND, matmul_tiling::DataType::DT_FLOAT16); - cubeTiling.SetCType(matmul_tiling::TPosition::GM, matmul_tiling::CubeFormat::ND, matmul_tiling::DataType::DT_FLOAT); - cubeTiling.SetBiasType(matmul_tiling::TPosition::GM, matmul_tiling::CubeFormat::ND, matmul_tiling::DataType::DT_FLOAT); - cubeTiling.SetShape(M, N, K); + cubeTiling.SetDim(USED_CORE_NUM); + cubeTiling.SetAType(matmul_tiling::TPosition::GM, matmul_tiling::CubeFormat::ND, + matmul_tiling::DataType::DT_FLOAT16, isAtrans); + cubeTiling.SetBType(matmul_tiling::TPosition::GM, matmul_tiling::CubeFormat::ND, + matmul_tiling::DataType::DT_FLOAT16, isBtrans); + cubeTiling.SetCType(matmul_tiling::TPosition::GM, matmul_tiling::CubeFormat::ND, + matmul_tiling::DataType::DT_FLOAT); + cubeTiling.SetBiasType(matmul_tiling::TPosition::GM, matmul_tiling::CubeFormat::ND, + matmul_tiling::DataType::DT_FLOAT); cubeTiling.SetOrgShape(M, N, K); + cubeTiling.SetShape(M, N, K); cubeTiling.EnableBias(hasBias); cubeTiling.SetBufferSpace(-1, -1, -1); if (cubeTiling.GetTiling(tilingData) == -1) { diff --git a/examples/matrix/matmul_ibshareB/op_host/matmul_ibshare_custom_tiling.h b/examples/matrix/matmul_ibshareB/op_host/matmul_ibshareb_custom_tiling.h similarity index 97% rename from examples/matrix/matmul_ibshareB/op_host/matmul_ibshare_custom_tiling.h rename to examples/matrix/matmul_ibshareB/op_host/matmul_ibshareb_custom_tiling.h index 6faa0b4857e5c2276ba45c7d3039b0998b989d6c..96aafe86458fdfd5eb39c5c2389060d9765d9823 100644 --- a/examples/matrix/matmul_ibshareB/op_host/matmul_ibshare_custom_tiling.h +++ 
b/examples/matrix/matmul_ibshareB/op_host/matmul_ibshareb_custom_tiling.h @@ -21,8 +21,8 @@ struct MatmulCaseParams int32_t n; int32_t k; bool hasBias; - bool ibshareA; - bool ibshareB; + bool isATrans; + bool isBTrans; }; /** diff --git a/examples/matrix/matmul_ibshareB/op_kernel/matmul_ibshare_custom.h b/examples/matrix/matmul_ibshareB/op_kernel/matmul_ibshare_custom.h deleted file mode 100644 index 87ba8c69d44de16d05556611f95f862a00d2f269..0000000000000000000000000000000000000000 --- a/examples/matrix/matmul_ibshareB/op_kernel/matmul_ibshare_custom.h +++ /dev/null @@ -1,119 +0,0 @@ -/* - * Copyright (c) 2025 Huawei Technologies Co., Ltd. - * This file is a part of the CANN Open Software. - * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). - * Please refer to the License for details. You may not use this file except in compliance with the License. - * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, - * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. - * See LICENSE in the root of the software repository for the full text of the License. - */ - -#ifndef EXAMPLES_MATRIX_MATMUL_IBSHAREB_OP_KERNEL_MATMUL_IBSHAREB_CUSTOM_TILING_H -#define EXAMPLES_MATRIX_MATMUL_IBSHAREB_OP_KERNEL_MATMUL_IBSHAREB_CUSTOM_TILING_H -#include "kernel_operator.h" -#include "lib/matmul_intf.h" - -namespace { -constexpr int32_t MIX_RATIO = 2; // AIC:AIV = 1:2 -} - -template -__aicore__ inline constexpr MatmulConfig GetCFG() -{ - if constexpr ((ibshareA && ibshareB) || (!ibshareA && !ibshareB)) { - return GetNormalConfig(); - } - return GetIBShareNormConfig(); -} - -template -class MatmulIbshareKernel { - private: - constexpr static MatmulConfig MM_CFG = GetCFG(); - public: - __aicore__ inline MatmulIbshareKernel(){}; - __aicore__ inline void Init(GM_ADDR a, GM_ADDR b, GM_ADDR bias, GM_ADDR c, GM_ADDR workspace, const TCubeTiling& tiling); - template - __aicore__ inline void Process(AscendC::TPipe* pipe); - AscendC::Matmul< - AscendC::MatmulType, - AscendC::MatmulType, - AscendC::MatmulType, - AscendC::MatmulType, MM_CFG> matmulObj; - - private: - __aicore__ inline void CalcOffset(int32_t blockIdx, const TCubeTiling& tiling, int32_t& offsetA, int32_t& offsetB, - int32_t& offsetC, int32_t& offsetBias); - - AscendC::GlobalTensor aGlobal; - AscendC::GlobalTensor bGlobal; - AscendC::GlobalTensor cGlobal; - AscendC::GlobalTensor biasGlobal; - TCubeTiling tiling; -}; - -template -__aicore__ inline void MatmulIbshareKernel::Init( - GM_ADDR a, GM_ADDR b, GM_ADDR bias, GM_ADDR c, GM_ADDR workspace, const TCubeTiling& tiling) -{ - this->tiling = tiling; - aGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ aType*>(a), tiling.M * tiling.Ka); - bGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ bType*>(b), tiling.Kb * tiling.N); - cGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ cType*>(c), tiling.M * tiling.N); - biasGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ biasType*>(bias), tiling.N); - - int32_t offsetA = 0; - int32_t offsetB = 0; - int32_t offsetC = 0; - int32_t offsetBias = 0; - CalcOffset(AscendC::GetBlockIdx(), tiling, offsetA, offsetB, offsetC, offsetBias); - aGlobal = aGlobal[offsetA]; - bGlobal = bGlobal[offsetB]; - cGlobal = cGlobal[offsetC]; - biasGlobal = biasGlobal[offsetBias]; - if(GetSysWorkSpacePtr() == nullptr){ - return; - } -} - -template -template -__aicore__ inline void MatmulIbshareKernel::Process(AscendC::TPipe* pipe) -{ - matmulObj.SetTensorA(aGlobal); - 
matmulObj.SetTensorB(bGlobal); - if constexpr (hasBias) { - matmulObj.SetBias(biasGlobal); - } - matmulObj.IterateAll(cGlobal); - matmulObj.End(); -} - -__aicore__ inline uint32_t Ceiling(uint32_t a, uint32_t b) -{ - return (a + b - 1) / b; -} - -template -__aicore__ inline void MatmulIbshareKernel::CalcOffset( - int32_t blockIdx, const TCubeTiling& tiling, int32_t& offsetA, int32_t& offsetB, int32_t& offsetC, int32_t& offsetBias) -{ - auto mSingleBlocks = Ceiling(tiling.M, tiling.singleCoreM); - auto mCoreIndx = blockIdx % mSingleBlocks; - auto nCoreIndx = blockIdx / mSingleBlocks; - - offsetA = mCoreIndx * tiling.Ka * tiling.singleCoreM; - offsetB = nCoreIndx * tiling.singleCoreN; - offsetC = mCoreIndx * tiling.N * tiling.singleCoreM + nCoreIndx * tiling.singleCoreN; - offsetBias = nCoreIndx * tiling.singleCoreN; - - // process with tail block - int32_t tailM = tiling.M - mCoreIndx * tiling.singleCoreM; - tailM = tailM < tiling.singleCoreM ? tailM : tiling.singleCoreM; - int32_t tailN = tiling.N - nCoreIndx * tiling.singleCoreN; - tailN = tailN < tiling.singleCoreN ? tailN : tiling.singleCoreN; - if (tailM < tiling.singleCoreM || tailN < tiling.singleCoreN) { - matmulObj.SetTail(tailM, tailN); - } -} -#endif // EXAMPLES_MATRIX_MATMUL_IBSHAREB_OP_KERNEL_MATMUL_IBSHAREB_CUSTOM_TILING_H \ No newline at end of file diff --git a/examples/matrix/matmul_ibshareB/op_kernel/matmul_ibshareb_custom_kernel.cpp b/examples/matrix/matmul_ibshareB/op_kernel/matmul_ibshareb_custom_kernel.cpp new file mode 100644 index 0000000000000000000000000000000000000000..35476a03aec0f9985e9c4b91e3508951bb8e370f --- /dev/null +++ b/examples/matrix/matmul_ibshareB/op_kernel/matmul_ibshareb_custom_kernel.cpp @@ -0,0 +1,131 @@ +/* + * Copyright (c) 2025 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ + +#include "kernel_operator.h" +#include "matmul_ibshareb_custom_kernel.h" + +namespace { +/** + * @brief Copy tiling data to TCubeTiling ptr from tiling gm addr. + * @param tiling: TCubeTiling ptr which needs to copy tiling data. + * @param tilingGM: Tiling gm addr. 
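+ * @note Copies sizeof(TCubeTiling) / sizeof(uint32_t) 32-bit words, so the
+ *       tiling struct size is assumed to be a multiple of 4 bytes.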
+ * @retval None + */ +__aicore__ inline void CopyTiling(TCubeTiling* tiling, GM_ADDR tilingGM) +{ + uint32_t* ptr = reinterpret_cast(tiling); + auto tiling32 = reinterpret_cast<__gm__ uint32_t*>(tilingGM); + + for (int i = 0; i < sizeof(TCubeTiling) / sizeof(uint32_t); i++, ptr++) { + *ptr = *(tiling32 + i); + } + return; +} +} + +namespace MatmulIBShareBCustom { +template +__aicore__ inline void MatmulKernel::Init( + GM_ADDR a, GM_ADDR b, GM_ADDR bias, GM_ADDR c, const TCubeTiling& tiling, bool isTransA, bool isTransB) +{ + this->tiling = tiling; + aGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ AType*>(a), tiling.M * tiling.Ka); + bGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ BType*>(b), tiling.Kb * tiling.N); + cGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ CType*>(c), tiling.M * tiling.N); + biasGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ BiasType*>(bias), tiling.N); + + int32_t offsetA = 0; + int32_t offsetB = 0; + int32_t offsetC = 0; + int32_t offsetBias = 0; + this->isTransA = isTransA; + this->isTransB = isTransB; + CalcOffset(AscendC::GetBlockIdx(), offsetA, offsetB, offsetC, offsetBias); + aGlobal = aGlobal[offsetA]; + bGlobal = bGlobal[offsetB]; + cGlobal = cGlobal[offsetC]; + biasGlobal = biasGlobal[offsetBias]; + if (GetSysWorkSpacePtr() == nullptr) { + return; + } +} + +template +__aicore__ inline void MatmulKernel::Process() +{ + if (AscendC::GetBlockIdx() >= tiling.usedCoreNum) { + return; + } + matmulObj.SetTensorA(aGlobal, isTransA); + matmulObj.SetTensorB(bGlobal, isTransB); + if (tiling.isBias) { + matmulObj.SetBias(biasGlobal); + } + matmulObj.IterateAll(cGlobal); + matmulObj.End(); +} + +template +__aicore__ inline void MatmulKernel::CalcOffset( + int32_t blockIdx, int32_t& offsetA, int32_t& offsetB, int32_t& offsetC, int32_t& offsetBias) +{ + const TCubeTiling& tiling = this->tiling; + auto mSingleBlocks = (tiling.M + tiling.singleCoreM - 1) / tiling.singleCoreM; // split M into mSingleBlocks cores + mCoreIndex = blockIdx % mSingleBlocks; + nCoreIndex = blockIdx / mSingleBlocks; + + if (isTransA) { + offsetA = mCoreIndex * tiling.singleCoreM; + } else { + offsetA = mCoreIndex * tiling.Ka * tiling.singleCoreM; + } + if (isTransB) { + offsetB = nCoreIndex * tiling.Kb * tiling.singleCoreN; + } else { + offsetB = nCoreIndex * tiling.singleCoreN; + } + offsetC = mCoreIndex * tiling.N * tiling.singleCoreM + nCoreIndex * tiling.singleCoreN; + offsetBias = nCoreIndex * tiling.singleCoreN; +} +} // namespace MatmulIBShareBCustom + +/** + * @brief matmul kernel function. + * @param a: A matrix gm addr. + * @param b: B matrix gm addr. + * @param bias: Bias matrix gm addr. + * @param c: C matrix gm addr. + * @param workspace: Temporary gm space addr required by matmul calc. + * @param tilingGm: Tiling data addr. 
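+ * @note The host launches this kernel with blockDim = usedCoreNum / MIX_RATIO
+ *       (AIC:AIV = 1:2 in MIX mode), i.e. blockDim counts AIC cores.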
+ * @retval None + */ +extern "C" __global__ __aicore__ void matmul_ibshareb_custom(GM_ADDR a, GM_ADDR b, + GM_ADDR bias, GM_ADDR c, GM_ADDR workspace, GM_ADDR tilingGm) +{ + // prepare tiling + TCubeTiling tiling; + CopyTiling(&tiling, tilingGm); + // define matmul kernel + MatmulIBShareBCustom::MatmulKernel matmulKernel; + AscendC::TPipe pipe; + REGIST_MATMUL_OBJ(&pipe, GetSysWorkSpacePtr(), matmulKernel.matmulObj, &tiling); + // init matmul kernel, isTransA=false, isTransB=false + matmulKernel.Init(a, b, bias, c, tiling, false, false); + // matmul kernel process + matmulKernel.Process(); +} + +#ifndef ASCENDC_CPU_DEBUG +void matmul_ibshareb_custom_do(uint32_t blockDim, void* stream, + GM_ADDR a, GM_ADDR b, GM_ADDR bias, GM_ADDR c, GM_ADDR workspace, GM_ADDR tilingGm) +{ + matmul_ibshareb_custom<<>>(a, b, bias, c, workspace, tilingGm); +} +#endif diff --git a/examples/matrix/matmul_ibshareB/op_kernel/matmul_ibshareb_custom_kernel.h b/examples/matrix/matmul_ibshareB/op_kernel/matmul_ibshareb_custom_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..8e52768d09e8f043d16f330a81a1beb0b27b66ce --- /dev/null +++ b/examples/matrix/matmul_ibshareB/op_kernel/matmul_ibshareb_custom_kernel.h @@ -0,0 +1,77 @@ +/* + * Copyright (c) 2025 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ + +#ifndef EXAMPLES_MATRIX_MATMUL_IBSHAREB_OP_KERNEL_MATMUL_IBSHAREB_CUSTOM_KERNEL_H +#define EXAMPLES_MATRIX_MATMUL_IBSHAREB_OP_KERNEL_MATMUL_IBSHAREB_CUSTOM_KERNEL_H +#include "kernel_operator.h" +#include "lib/matmul_intf.h" + +namespace MatmulIBShareBCustom { + +template +class MatmulKernel { +public: + __aicore__ inline MatmulKernel(){}; + /** + * @brief Initialization before process. + * @param a: A matrix gm addr. + * @param b: B matrix gm addr. + * @param bias: Bias matrix gm addr. + * @param c: C matrix gm addr. + * @param tiling: Matmul tiling struct. + * @param isTransA: Whether A matrix is transposed. + * @param isTransB: Whether B matrix is transposed. + * @retval None + */ + __aicore__ inline void Init(GM_ADDR a, GM_ADDR b, GM_ADDR bias, GM_ADDR c, const TCubeTiling& tiling, + bool isTransA, bool isTransB); + /** + * @brief Process matrix calculation. + * @retval None + */ + __aicore__ inline void Process(); + + using A_TYPE = AscendC::MatmulType; + using C_TYPE = AscendC::MatmulType; + using BIAS_TYPE = AscendC::MatmulType; +#ifdef ENABLE_IBSHAREB + using B_TYPE = AscendC::MatmulType; + AscendC::Matmul matmulObj; +#else + using B_TYPE = AscendC::MatmulType; + AscendC::Matmul matmulObj; +#endif + +private: + /** + * @brief Calculate the gm offset based on the blockIdx. + * @param blockIdx: Current Core blockidx. + * @param offsetA: Gm offset of A matrix. + * @param offsetB: Gm offset of B matrix. + * @param offsetC: Gm offset of C matrix. + * @param offsetBias: Gm offset of Bias matrix. 
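+     * @note Cores are mapped M-major: mCoreIndex = blockIdx % mSingleBlocks,
+     *       nCoreIndex = blockIdx / mSingleBlocks. offsetB depends only on
+     *       nCoreIndex, so the cores in one N column read the same B tile,
+     *       which is the reuse IBShareB exploits.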
+ * @retval None + */ + __aicore__ inline void CalcOffset( + int32_t blockIdx, int32_t& offsetA, int32_t& offsetB, int32_t& offsetC, int32_t& offsetBias); + + AscendC::GlobalTensor aGlobal; + AscendC::GlobalTensor bGlobal; + AscendC::GlobalTensor cGlobal; + AscendC::GlobalTensor biasGlobal; + TCubeTiling tiling; + int32_t mCoreIndex; + int32_t nCoreIndex; + bool isTransA{false}; + bool isTransB{false}; +}; +} // namespace MatmulIBShareBCustom + +#endif // EXAMPLES_MATRIX_MATMUL_IBSHAREB_OP_KERNEL_MATMUL_IBSHAREB_CUSTOM_KERNEL_H diff --git a/examples/matrix/matmul_ibshareB/run.sh b/examples/matrix/matmul_ibshareB/run.sh index 04595c3cc71122af13defd4d1c6744fafbf893bb..28d0e0ce4c3a939ea91f6b0f46a837cb5b0119ce 100644 --- a/examples/matrix/matmul_ibshareB/run.sh +++ b/examples/matrix/matmul_ibshareB/run.sh @@ -9,8 +9,9 @@ # ====================================================================================================================== export IS_PERF="0" +export ENABLE_FEATURE="1" -SHORT=r:,v:,p:, +SHORT=r:,v:,p:,e:, LONG=run-mode:,soc-version:,perf:, OPTS=$(getopt -a --options $SHORT --longoptions $LONG -- "$@") eval set -- "$OPTS" @@ -26,6 +27,9 @@ do (-p | --perf ) IS_PERF="$2" shift 2;; + (-e | --enable-feature ) + ENABLE_FEATURE="$2" + shift 2;; (--) shift; break;; @@ -51,6 +55,13 @@ if [ "${IS_PERF}" != "0" ] && [ "${IS_PERF}" != "1" ]; then exit 1 fi +if [ "${ENABLE_FEATURE}" != "0" ] && [ "${ENABLE_FEATURE}" != "1" ]; then + echo "[ERROR] Unsupported ENABLE_FEATURE: ${ENABLE_FEATURE}, which can only be 0 or 1." + exit 1 +fi + +echo "[INFO] ENABLE_FEATURE: ${ENABLE_FEATURE}." + # only npu mode support is_perf = 1 if [ "${IS_PERF}" == "1" ] && [ "${RUN_MODE}" != "npu" ]; then echo "[ERROR] Unsupported IS_PERF: ${IS_PERF} while RUN_MODE is ${RUN_MODE}." @@ -65,7 +76,7 @@ source $ASCEND_HOME_DIR/bin/setenv.bash export LD_LIBRARY_PATH=${ASCEND_HOME_DIR}/tools/simulator/${SOC_VERSION}/lib:$LD_LIBRARY_PATH set -euo pipefail -cmake -DRUN_MODE=${RUN_MODE} -DSOC_VERSION=${SOC_VERSION} -DASCEND_CANN_PACKAGE_PATH=${ASCEND_HOME_DIR} .. +cmake -DRUN_MODE=${RUN_MODE} -DSOC_VERSION=${SOC_VERSION} -DENABLE_FEATURE=${ENABLE_FEATURE} -DASCEND_CANN_PACKAGE_PATH=${ASCEND_HOME_DIR} .. make -j16 cd ../ @@ -78,7 +89,7 @@ mkdir output rm -rf bin/ mkdir -p bin cd bin -cp ../build/ascendc_matmul_ibshare_bbit ./ +cp ../build/ascendc_matmul_ibshareb_bbit ./ export TF_CPP_MIN_LOG_LEVEL=3 if [ "${RUN_MODE}" = "npu" ]; then diff --git a/examples/matrix/matmul_ibshareB/scripts/exec_test.py b/examples/matrix/matmul_ibshareB/scripts/exec_test.py index d3bd8f603ca1c47ead38d1fe1be1dbd55134bdaa..a78ed7d4c2d70d3222a25b3bb953e95e08f4897f 100644 --- a/examples/matrix/matmul_ibshareB/scripts/exec_test.py +++ b/examples/matrix/matmul_ibshareB/scripts/exec_test.py @@ -10,12 +10,11 @@ # See LICENSE in the root of the software repository for the full text of the License. 
# ====================================================================================================================== import os -import shlex -import subprocess import sys import csv import time import logging +import subprocess import numpy as np @@ -28,7 +27,8 @@ from common_scripts.exec_utils import get_file_work_dir, get_case_list, get_perf IS_BIAS = False IS_TRANS_A = False IS_TRANS_B = False -# support float16_float32(fp16 in fp32 out), int8_int32_ibshare(int8 in int32 out for ibshare matmul) +PERF_RUN_TIMES = 5 +# float16 in and float32 out DATA_TYPE_STR = "float16_float32" logging.basicConfig(level=logging.INFO) @@ -50,22 +50,30 @@ def process_case(file_work_dir, process_params): process_params.k, process_params.b, process_params.is_perf, process_params.run_mode logging.info("[INFO] start process case [%s]" % (case_name)) logging.info("[INFO] IS_PERF is set [%d]" % (is_perf)) - clear_file_cache(file_work_dir) - matmul_gen_data = MatmulGenData(m, n, k, b, IS_TRANS_A, IS_TRANS_B, IS_BIAS, DATA_TYPE_STR) - if is_perf: - matmul_gen_data.gen_fake_golden_data(file_work_dir) - else: - matmul_gen_data.gen_golden_data(file_work_dir) - params_str = f"{m} {n} {k} {b}" - cmd = get_process_case_cmd(kernel_name="ascendc_matmul_ibshare_bbit", params_str=params_str,\ - is_perf=is_perf, run_mode=run_mode) - subprocess.run(cmd) - if is_perf: - wrong_num = -1 - else: - logging.info("[INFO] compare data case[%s]" % (case_name)) - wrong_num = compare_data(file_work_dir, DATA_TYPE_STR) + loop_num = PERF_RUN_TIMES if is_perf else 1 + task_duration_avg = 0 + for i in range(loop_num): + clear_file_cache(file_work_dir) + matmul_gen_data = MatmulGenData(m, n, k, b, IS_TRANS_A, IS_TRANS_B, IS_BIAS, DATA_TYPE_STR) + if is_perf: + matmul_gen_data.gen_fake_golden_data(file_work_dir) + else: + matmul_gen_data.gen_golden_data(file_work_dir) + params_str = f"{m} {n} {k}" + cmd = get_process_case_cmd(kernel_name="ascendc_matmul_ibshareb_bbit", params_str=params_str,\ + is_perf=is_perf, run_mode=run_mode) + subprocess.run(cmd) + if is_perf: + wrong_num = -1 + else: + logging.info("[INFO] compare data case[%s]" % (case_name)) + wrong_num = compare_data(file_work_dir, DATA_TYPE_STR) + if is_perf: + cur_duration = float(get_perf_task_duration("./prof_out")) + logging.info("[INFO] case_name: [%s], loop_num: [%s], cur_duration: [%s]\n" % (case_name, i, cur_duration)) + task_duration_avg += cur_duration + res_data = [] res_data.append(case_name) res_data.append(wrong_num) @@ -77,8 +85,8 @@ def process_case(file_work_dir, process_params): else: res_data.append("Success") if is_perf: - task_duration = get_perf_task_duration("./prof_out") - res_data.append(task_duration) + task_duration_avg = task_duration_avg / loop_num + res_data.append(task_duration_avg) return res_data diff --git a/examples/matrix/matmul_ibshareB/testcase/case.csv b/examples/matrix/matmul_ibshareB/testcase/case.csv index 292334fab6a3b5d8ded92ca7c6b3a9a5042c7a92..496b90f501b3403b70a3f90156f2164206a1c9ef 100644 --- a/examples/matrix/matmul_ibshareB/testcase/case.csv +++ b/examples/matrix/matmul_ibshareB/testcase/case.csv @@ -1 +1 @@ -1, case001, 1280, 1024, 512 \ No newline at end of file +1, case001, 64, 256, 384 \ No newline at end of file diff --git a/examples/matrix/matmul_perf/CMakeLists.txt b/examples/matrix/matmul_perf/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..456cb32840e4caa64fe407a6bd14d8f87edb55b5 --- /dev/null +++ b/examples/matrix/matmul_perf/CMakeLists.txt @@ -0,0 +1,99 @@ +# Copyright (c) 2025 
Huawei Technologies Co., Ltd. +# This file is a part of the CANN Open Software. +# Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. +# ====================================================================================================================== + +cmake_minimum_required(VERSION 3.16) +project(Ascend_c) +if (${RUN_MODE}) + set(RUN_MODE "npu" CACHE STRING "cpu/sim/npu") +endif() +if (${SOC_VERSION}) + set(SOC_VERSION "Ascend910" CACHE STRING "system on chip type") +endif() + +set(ASCEND_CANN_PACKAGE_PATH "~/Ascend/ascend-toolkit/latest" CACHE STRING "ASCEND CANN package installation directory") +if(NOT CMAKE_BUILD_TYPE) + set(CMAKE_BUILD_TYPE "Debug" CACHE STRING "Build type Release/Debug (default Debug)" FORCE) +endif() + +if(CMAKE_INSTALL_PREFIX STREQUAL /usr/local) + set(CMAKE_INSTALL_PREFIX "${CMAKE_CURRENT_LIST_DIR}/out" CACHE STRING "path for install()" FORCE) +endif() + +file(GLOB KERNEL_FILES + ${CMAKE_CURRENT_SOURCE_DIR}/op_kernel/matmul_perf_custom_kernel.cpp +) + +if("${RUN_MODE}" STREQUAL "cpu") + include(cmake/cpu_lib.cmake) +elseif("${RUN_MODE}" STREQUAL "sim" OR "${RUN_MODE}" STREQUAL "npu") + include(cmake/npu_lib.cmake) +else() + message("invalid RUN_MODE: ${RUN_MODE}") +endif() + +add_executable(ascendc_matmul_perf_bbit + ${CMAKE_CURRENT_SOURCE_DIR}/main.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/op_host/matmul_perf_custom_tiling.cpp +) + +target_compile_options(ascendc_matmul_perf_bbit PRIVATE + $:-g>> + -O2 + -std=c++17 + -D_GLIBCXX_USE_CXX11_ABI=0 +) + +target_compile_definitions(ascendc_matmul_perf_bbit PRIVATE + SOC_VERSION="${SOC_VERSION}" +) + +message(STATUS "[INFO] PERF_MODE is ${PERF_MODE}") +if (PERF_MODE GREATER 0) + target_compile_definitions(ascendc_matmul_perf_bbit PRIVATE + -DENABLE_CUBE_ONLY + ) + message(STATUS "[INFO] Enable CUBE_ONLY") +endif() +if (PERF_MODE GREATER 1) + target_compile_definitions(ascendc_matmul_perf_bbit PRIVATE + -DENABLE_MDL + ) + message(STATUS "[INFO] Enable MDL") +endif() +if (PERF_MODE GREATER 2) + target_compile_definitions(ascendc_matmul_perf_bbit PRIVATE + -DENABLE_MDL_UNITFLAG + ) + message(STATUS "[INFO] Enable MDL UNITFLAG") +endif() +if (PERF_MODE GREATER 3) + target_compile_definitions(ascendc_matmul_perf_bbit PRIVATE + -DENABLE_CONSTANT + ) + message(STATUS "[INFO] Enable CONSTANT") +endif() + +target_include_directories(ascendc_matmul_perf_bbit PRIVATE + ${CMAKE_CURRENT_SOURCE_DIR} + $:${ASCEND_CANN_PACKAGE_PATH}/include>> + $:${ASCEND_CANN_PACKAGE_PATH}/runtime/include>> +) + +target_link_libraries(ascendc_matmul_perf_bbit PRIVATE + $,$>:host_intf_pub>> + $:tikicpulib::${SOC_VERSION}>> + $:ascendcl>> + $:c_sec>> + ascendc_kernels_${RUN_MODE} + tiling_api + register + platform + ascendalog + dl +) diff --git a/examples/matrix/matmul_perf/README.md b/examples/matrix/matmul_perf/README.md new file mode 100644 index 0000000000000000000000000000000000000000..c65b0e4ebf078acb005a6000169481eebd4bd2eb --- /dev/null +++ b/examples/matrix/matmul_perf/README.md @@ -0,0 +1,263 @@ + +## 概述 + +本样例介绍了调用Matmul API实现四种性能优化特性(纯Cube模式、MDL模板、UnitFlag、Tiling全量常量化)的单算子。 + +本样例以直调的方式调用算子核函数。 
+直调:核函数的基础调用方式,开发者完成算子核函数的开发和Tiling实现后,即可通过AscendCL运行时接口,完成算子的调用。包含CPU侧、NPU侧和仿真三种运行验证方法。 + +## 样例支持的产品型号为 +- Atlas A2训练系列产品/Atlas 800I A2推理产品 + +## 目录结构 +| 目录及文件 | 描述 | +|----------------------------------|----------------------| +| [cmake](cmake) | 编译工程文件 | +| [op_host](op_host) | 本样例的tiling代码实现 | +| [op_kernel](op_kernel) | 本样例的kernel代码实现 | +| [scripts](scripts) | 执行脚本文件 | +| [testcase](testcase) | 用例文件,配置用例的计算shape信息 | +| CMakeLists.txt | 编译工程文件 | +| main.cpp | 主函数,调用算子的应用程序,含CPU域及NPU域调用 | +| run.sh | 编译执行脚本 | + +## 算子描述 +- 算子功能 + - 纯Cube模式:在只有矩阵计算,没有矢量计算的场景下,通过使能纯Cube模式,可以优化Matmul计算中的消息通信性能开销,提升算子性能。 + - MDL模板:在MTE2循环搬运次数多的大Shape场景下,使能MDL模板后,可以实现MTE2从Global Memory一次性搬入多个基本块到A1/B1,提升带宽利用率,减少MTE2的搬运次数,提升算子性能。 + - UnitFlag:在算子的CUBE计算流水和FIXPIPE数据搬出流水串行且未被其他流水掩盖时,通过使能UnitFlag功能,可以实现CUBE计算流水和FIXPIPE数据搬出流水之间的流水并行,提升算子性能。 + - Tiling常量化:分为两种场景,1)全量常量化:在获取Matmul模板时,可以确定常量的singleCore Shape(singleCoreM/singleCoreN/singleCoreK)和常量的Base Shape(basicM/basicN/basicK,也称为baseM/baseN/baseK);2)部分常量化:在获取Matmul模板时,仅能确定常量的Base Shape(basicM/basicN/basicK)。 + 若Matmul初始化的Scalar计算较多,影响指令头开销,或Matmul迭代间的Scalar计算较多,阻塞MTE2流水。这两种场景下,在满足上述Tiling常量化条件时,可以使能Tiling常量化功能,减少Scalar计算开销,提升算子性能。 + + 以上四个特性的编码是相互独立的,本样例支持特性逐个叠加。具体叠加方式可参考下述“编译运行样例 > PERF_MODE参数说明”。 + +- 算子规格 + + + + + + + + + + + + +
+| 算子类型(OpType) | MatmulPerfCustom |
+| 算子输入 | name | shape | data type | format | isTrans |
+| | a | - | float16 | ND | false |
+| | b | - | float16 | ND | false |
+| | bias | - | float | ND | - |
+| 算子输出 | c | - | float | ND | - |
+| 核函数名 | matmul_perf_custom |
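The four optimizations are layered behind compile-time macros that the CMake files in this diff define cumulatively from PERF_MODE (ENABLE_CUBE_ONLY, ENABLE_MDL, ENABLE_MDL_UNITFLAG, ENABLE_CONSTANT). As a reading aid for the implementation section below, here is a minimal sketch of how a kernel can pick its MatmulConfig from those macros; the dispatch function is illustrative, not the sample's exact code, and it covers PERF_MODE 0 through 3. The Tiling-constant mode (ENABLE_CONSTANT) builds a MatmulApiStaticTiling instead, as shown later in this README.

```
#ifdef ENABLE_CUBE_ONLY
#define ASCEND_CUBE_ONLY     // must be defined before including matmul_intf.h
#endif
#include "lib/matmul_intf.h"

// Illustrative selection; the macro names come from cmake/cpu_lib.cmake and
// cmake/npu_lib.cmake, the config constants from this README.
__aicore__ inline constexpr MatmulConfig SelectPerfCfg()
{
#if defined(ENABLE_MDL_UNITFLAG)
    MatmulConfig cfg = CFG_MDL;
    cfg.enUnitFlag = true;   // PERF_MODE >= 3: MDL template plus UnitFlag
    return cfg;
#elif defined(ENABLE_MDL)
    return CFG_MDL;          // PERF_MODE >= 2: MDL template
#else
    return CFG_NORM;         // PERF_MODE 0 or 1: default NORM template
#endif
}
constexpr static MatmulConfig SELECTED_CFG = SelectPerfCfg();
```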
+ +## 算子实现介绍 +本样例的四个性能优化特性复用公共的算子Kernel代码和Host代码,通过编译宏的方式,隔离各个性能优化特性的差异化代码实现。 +- 算子Kernel实现 + - 计算逻辑:C = A * B + Bias。 + - A、B为源操作数,A为左矩阵,形状为[M, K];B为右矩阵,形状为[K, N]。 + - C为目的操作数,存放矩阵乘结果的矩阵,形状为[M, N]。 + - Bias为矩阵乘偏置,形状为[1, N]。对A*B结果矩阵的每一行都采用该Bias进行偏置。 + - 具体步骤: + - 创建Matmul对象。 + - 默认实现,使用默认的NORM模板CFG_NORM创建Matmul对象。 + ``` + #include "lib/matmul_intf.h" + + using A_TYPE = AscendC::MatmulType; + using B_TYPE = AscendC::MatmulType; + using C_TYPE = AscendC::MatmulType; + using BIAS_TYPE = AscendC::MatmulType; + + AscendC::Matmul matmulObj; + ``` + - 使能纯Cube模式实现,在定义Matmul对象的代码中,设置ASCEND_CUBE_ONLY宏,且必须在#include "lib/matmul_intf.h"之前设置。 + ``` + #define ASCNEND_CUBE_ONLY // 设置ASCEND_CUBE_ONLY宏 + #include "lib/matmul_intf.h" + + using A_TYPE = AscendC::MatmulType; + using B_TYPE = AscendC::MatmulType; + using C_TYPE = AscendC::MatmulType; + using BIAS_TYPE = AscendC::MatmulType; + + AscendC::Matmul matmulObj; + ``` + - 使能MDL模板实现,使用默认的MDL模板CFG_MDL创建Matmul对象。 + ``` + #define ASCNEND_CUBE_ONLY // 设置ASCEND_CUBE_ONLY宏 + #include "lib/matmul_intf.h" + + using A_TYPE = AscendC::MatmulType; + using B_TYPE = AscendC::MatmulType; + using C_TYPE = AscendC::MatmulType; + using BIAS_TYPE = AscendC::MatmulType; + + AscendC::Matmul matmulObj; // 使用CFG_MDL创建Matmul对象 + ``` + - 使能UnitFlag功能实现,自定义MatmulConfig参数,将其中的enUnitFlag参数设置为true,使能UnitFlag功能。 + ``` + #define ASCNEND_CUBE_ONLY // 设置ASCEND_CUBE_ONLY宏 + #include "lib/matmul_intf.h" + + __aicore__ inline constexpr MatmulConfig GetUnitFlagCfg() + { + auto mmCfg = CFG_MDL; + mmCfg.enUnitFlag = true; // 设置enUnitFlag参数为true + return mmCfg; + } + constexpr static MatmulConfig CFG_MDL_UNITFLAG = GetUnitFlagCfg(); + + using A_TYPE = AscendC::MatmulType; + using B_TYPE = AscendC::MatmulType; + using C_TYPE = AscendC::MatmulType; + using BIAS_TYPE = AscendC::MatmulType; + + AscendC::Matmul matmulObj; // 使用自定义的MatmulConfig参数CFG_MDL_UNITFLAG创建Matmul对象 + ``` + - 使能Tiling全量常量化功能实现,首先使用MAX_M、MAX_N、MAX_K、BASE_M、BASE_N、BASE_K的常数值设置MatmulShapeParams,调用GetMMConfig得到自定义的MatmulConfig模板参数mmCFG;再通过GetMatmulApiTiling接口将Tiling信息常量化,得到常量化模板参数CUSTOM_CFG_CONSTANT,基于CUSTOM_CFG_CONSTANT创建Matmul对象。 + ``` + #define ASCNEND_CUBE_ONLY // 设置ASCEND_CUBE_ONLY宏 + #include "lib/matmul_intf.h" + + constexpr int32_t MAX_M = 10000; // custom matmul kernel support max value of M Dim shape + constexpr int32_t MAX_N = 10000; // custom matmul kernel support max value of N Dim shape + constexpr int32_t MAX_K = 10000; // custom matmul kernel support max value of K Dim shape + constexpr int32_t BASE_M = 128; // BASE_M * BASE_K * sizeof(typeA) <=L0A size + constexpr int32_t BASE_N = 256; // BASE_N * BASE_K * sizeof(typeB) <=L0B size + constexpr int32_t BASE_K = 64; // BASE_M * BASE_N * sizeof(typeC) <=L0C size + constexpr MatmulShapeParams shapeParams = { MAX_M, + MAX_N, + MAX_K, + BASE_M, + BASE_N, + BASE_K }; + + constexpr int32_t TILING_DEPTH_PARAM = 8; + constexpr int32_t TILING_STEPK_PARAM = 4; + constexpr int32_t TILING_STEPMN_PARAM = 1; + + template + __aicore__ inline constexpr MatmulApiStaticTiling GetCustomConstantCFG() + { + MatmulConfig mmCFG = GetMMConfig(shapeParams); + // enable unitflag for performance comparison + mmCFG.enUnitFlag = true; + auto constantCFG = AscendC::GetMatmulApiTiling(mmCFG); + constantCFG.depthA1 = TILING_DEPTH_PARAM; + constantCFG.depthB1 = TILING_DEPTH_PARAM; + constantCFG.stepKa = TILING_STEPK_PARAM; + constantCFG.stepKb = TILING_STEPK_PARAM; + constantCFG.stepM = TILING_STEPMN_PARAM; + constantCFG.stepN = TILING_STEPMN_PARAM; + return constantCFG; + } + + using A_TYPE = 
AscendC::MatmulType; + using B_TYPE = AscendC::MatmulType; + using C_TYPE = AscendC::MatmulType; + using BIAS_TYPE = AscendC::MatmulType; + + constexpr static auto CUSTOM_CFG_CONSTANT = GetCustomConstantCFG(); + AscendC::Matmul matmulObj; // 使用自定义的MatmulConfig参数CUSTOM_CFG_CONSTANT创建Matmul对象 + ``` + - 初始化操作。 + - 默认实现 + ``` + REGIST_MATMUL_OBJ(&pipe, GetSysWorkSpacePtr(), matmulObj, &tiling); // 初始化matmul对象 + ``` + - 使能纯Cube模式实现,同默认实现。 + - 使能MDL模板实现,同默认实现。 + - 使能UnitFlag功能实现,同默认实现。 + - 使能Tiling全量常量化功能实现 + ``` + // 全量常量化场景,调用REGIST_MATMUL_OBJ接口初始化Matmul对象时,可以在入参传递Tiling参数的位置,使用空指针替代。 + REGIST_MATMUL_OBJ(&pipe, GetSysWorkSpacePtr(), matmulObj, (TCubeTiling*)nullptr); + + // 部分常量化场景,调用REGIST_MATMUL_OBJ接口初始化Matmul对象时,仍需要使用Tiling。 + REGIST_MATMUL_OBJ(&pipe, GetSysWorkSpacePtr(), matmulObj, &tiling); + ``` + - 设置左矩阵A、右矩阵B、Bias。 + - 默认实现 + ``` + mm.SetTensorA(aGlobal); // 设置左矩阵A + mm.SetTensorB(bGlobal); // 设置右矩阵B + mm.SetBias(biasGlobal); // 设置Bias + ``` + - 使能纯Cube模式实现,同默认实现。 + - 使能MDL模板实现,同默认实现。 + - 使能UnitFlag功能实现,同默认实现。 + - 使能Tiling全量常量化功能实现 + ``` + matmulObj.SetOrgShape(tiling.M, tiling.N, tiling.Ka); // 调用SetOrgShape设置Matmul计算原始Shape + matmulObj.SetTensorA(aGlobal); // 设置左矩阵A + matmulObj.SetTensorB(bGlobal); // 设置右矩阵B + matmulObj.SetBias(biasGlobal); // 设置Bias + ``` + - 完成矩阵乘操作。 + ``` + matmulObj.IterateAll(cGlobal); + ``` + - 结束矩阵乘操作。 + ``` + matmulObj.End(); + ``` + +- 算子Tiling实现 + - Ascend C提供一组Matmul Tiling API,方便用户获取Matmul kernel计算时所需的Tiling参数。只需要传入A/B/C矩阵等信息,调用API接口,即可获取到TCubeTiling结构体中的相关参数。 + - 获取Tiling参数的流程如下: + - 创建一个Tiling对象。 + ``` + auto ascendcPlatform = platform_ascendc::PlatformAscendCManager::GetInstance(); + matmul_tiling::MultiCoreMatmulTiling cubeTiling(*ascendcPlatform); + ``` + - 设置A、B、C、Bias的参数类型信息;M、N、Ka、Kb形状信息等。 + ``` + auto blockDim = ascendcPlatform->GetCoreNumAiv(); // 方式一:非纯Cube模式,SetDim设置为AIV的核数 + auto blockDim = ascendcPlatform->GetCoreNumAic(); // 方式二:纯Cube模式,SetDim设置为AIC的核数 + + cubeTiling.SetDim(blockDim); + cubeTiling.SetAType(matmul_tiling::TPosition::GM, matmul_tiling::CubeFormat::ND, + matmul_tiling::DataType::DT_FLOAT16, isAtrans); + cubeTiling.SetBType(matmul_tiling::TPosition::GM, matmul_tiling::CubeFormat::ND, + matmul_tiling::DataType::DT_FLOAT16, isBtrans); + cubeTiling.SetCType(matmul_tiling::TPosition::GM, matmul_tiling::CubeFormat::ND, + matmul_tiling::DataType::DT_FLOAT); + cubeTiling.SetBiasType(matmul_tiling::TPosition::GM, matmul_tiling::CubeFormat::ND, + matmul_tiling::DataType::DT_FLOAT); + cubeTiling.SetOrgShape(M, N, K); + cubeTiling.SetShape(-1, -1, K); + cubeTiling.EnableBias(isBias); + ``` + - 调用GetTiling接口,获取Tiling信息。 + ``` + TCubeTiling tilingData; + int64_t ret = tiling.GetTiling(tilingData); // if ret = -1, get tiling failed + ``` + +## 编译运行样例 + + - 配置环境变量 + + 这里的\$ASCEND_CANN_PACKAGE_PATH需要替换为CANN开发套件包安装后文件存储路径。例如:/usr/local/Ascend/ascend-toolkit/latest + ``` + export ASCEND_HOME_DIR=$ASCEND_CANN_PACKAGE_PATH + source $ASCEND_HOME_DIR/../set_env.sh + ``` + + - 编译执行 + + ``` + bash run.sh -r [RUN_MODE] -v [SOC_VERSION] -p [IS_PERF] -m [PERF_MODE] + ``` + 其中脚本参数说明如下: + - RUN_MODE :编译执行方式,可选择CPU调试,NPU仿真,NPU上板,对应参数分别为[cpu / sim / npu]。若需要详细了解NPU仿真相关内容,请参考[《算子开发工具msProf》](https://hiascend.com/document/redirect/CannCommunityToolMsProf)中的“工具使用”章节。 + - SOC_VERSION :昇腾AI处理器型号,如果无法确定具体的[SOC_VERSION],则在安装昇腾AI处理器的服务器执行npu-smi info命令进行查询,在查询到的“Name”前增加Ascend信息,例如“Name”对应取值为xxxyy,实际配置的[SOC_VERSION]值为Ascendxxxyy。支持以下产品型号: + - Atlas A2训练系列产品/Atlas 800I A2推理产品 + - IS_PERF :是否获取执行性能数据,当前只在NPU执行时生效,打开后性能数据显示在执行结果中,可选择关闭和开启该功能,对应参数分别为[0 / 1]。 + - 
PERF_MODE :配置性能优化的方式,支持的取值为0,1,2,3,4,各取值的含义如下。该参数的默认值为0。 + - PERF_MODE = 0:默认实现。使能MIX模式 + NORM模板(未使能UnitFlag功能和Tiling全量常量化功能); + - PERF_MODE = 1:使能纯Cube模式; + - PERF_MODE = 2:使能纯Cube模式 + MDL模板; + - PERF_MODE = 3:使能纯Cube模式 + MDL模板 + UnitFlag功能; + - PERF_MODE = 4:使能纯Cube模式 + MDL模板 + UnitFlag功能 + Tiling全量常量化功能; + + 示例如下,Ascendxxxyy请替换为实际的AI处理器型号。 + ``` + bash run.sh -r cpu -v Ascendxxxyy -p 0 -m 0 + ``` diff --git a/examples/matrix/matmul_perf/cmake/cpu_lib.cmake b/examples/matrix/matmul_perf/cmake/cpu_lib.cmake new file mode 100644 index 0000000000000000000000000000000000000000..ffb9281d435d151f73ce5619d9b179dca76bc76d --- /dev/null +++ b/examples/matrix/matmul_perf/cmake/cpu_lib.cmake @@ -0,0 +1,57 @@ +# Copyright (c) 2025 Huawei Technologies Co., Ltd. +# This file is a part of the CANN Open Software. +# Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. +# ====================================================================================================================== + +if(NOT DEFINED ENV{CMAKE_PREFIX_PATH}) + set(CMAKE_PREFIX_PATH ${ASCEND_CANN_PACKAGE_PATH}/tools/tikicpulib/lib/cmake) +endif() +find_package(tikicpulib REQUIRED) + +add_library(ascendc_kernels_${RUN_MODE} SHARED + ${KERNEL_FILES} +) + +target_link_libraries(ascendc_kernels_${RUN_MODE} PRIVATE + tikicpulib::${SOC_VERSION} +) + +target_compile_options(ascendc_kernels_${RUN_MODE} PRIVATE + -g + -O0 + -std=c++17 +) + +message(STATUS "[INFO] CPU Target PERF_MODE is ${PERF_MODE}") +if (PERF_MODE GREATER 0) + target_compile_definitions(ascendc_kernels_${RUN_MODE} PRIVATE + -DENABLE_CUBE_ONLY + ) + message(STATUS "[INFO] CPU Target Enable CUBE_ONLY") +endif() +if (PERF_MODE GREATER 1) + target_compile_definitions(ascendc_kernels_${RUN_MODE} PRIVATE + -DENABLE_MDL + ) + message(STATUS "[INFO] CPU Target Enable MDL") +endif() +if (PERF_MODE GREATER 2) + target_compile_definitions(ascendc_kernels_${RUN_MODE} PRIVATE + -DENABLE_MDL_UNITFLAG + ) + message(STATUS "[INFO] CPU Target Enable MDL UNITFLAG") +endif() +if (PERF_MODE GREATER 3) + target_compile_definitions(ascendc_kernels_${RUN_MODE} PRIVATE + -DENABLE_CONSTANT + ) + message(STATUS "[INFO] CPU Target Enable CONSTANT") +endif() + +install(TARGETS ascendc_kernels_${RUN_MODE} + DESTINATION ${CMAKE_INSTALL_LIBDIR} +) diff --git a/examples/matrix/matmul_perf/cmake/npu_lib.cmake b/examples/matrix/matmul_perf/cmake/npu_lib.cmake new file mode 100644 index 0000000000000000000000000000000000000000..4954182987f54c2e7aba33b683a1208d5ec1a9b1 --- /dev/null +++ b/examples/matrix/matmul_perf/cmake/npu_lib.cmake @@ -0,0 +1,53 @@ +# Copyright (c) 2025 Huawei Technologies Co., Ltd. +# This file is a part of the CANN Open Software. +# Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. 
+# See LICENSE in the root of the software repository for the full text of the License. +# ====================================================================================================================== + +if(EXISTS ${ASCEND_CANN_PACKAGE_PATH}/compiler/tikcpp/ascendc_kernel_cmake) + set(ASCENDC_CMAKE_DIR ${ASCEND_CANN_PACKAGE_PATH}/compiler/tikcpp/ascendc_kernel_cmake) +elseif(EXISTS ${ASCEND_CANN_PACKAGE_PATH}/tools/tikcpp/ascendc_kernel_cmake) + set(ASCENDC_CMAKE_DIR ${ASCEND_CANN_PACKAGE_PATH}/tools/tikcpp/ascendc_kernel_cmake) +else() + message(FATAL_ERROR "ascendc_kernel_cmake does not exist ,please check whether the cann package is installed") +endif() +include(${ASCENDC_CMAKE_DIR}/ascendc.cmake) + +ascendc_library(ascendc_kernels_${RUN_MODE} STATIC + ${KERNEL_FILES} +) + +ascendc_compile_definitions(ascendc_kernels_${RUN_MODE} PRIVATE + -DASCENDC_DUMP + -DHAVE_WORKSPACE + -DHAVE_TILING +) + +message(STATUS "[INFO] NPU/SIM Target PERF_MODE is ${PERF_MODE}") +if (PERF_MODE GREATER 0) + ascendc_compile_definitions(ascendc_kernels_${RUN_MODE} PRIVATE + -DENABLE_CUBE_ONLY + ) + message(STATUS "[INFO] NPU/SIM Target Enable CUBE_ONLY") +endif() +if (PERF_MODE GREATER 1) + ascendc_compile_definitions(ascendc_kernels_${RUN_MODE} PRIVATE + -DENABLE_MDL + ) + message(STATUS "[INFO] NPU/SIM Target Enable MDL") +endif() +if (PERF_MODE GREATER 2) + ascendc_compile_definitions(ascendc_kernels_${RUN_MODE} PRIVATE + -DENABLE_MDL_UNITFLAG + ) + message(STATUS "[INFO] NPU/SIM Target Enable MDL UNITFLAG") +endif() +if (PERF_MODE GREATER 3) + ascendc_compile_definitions(ascendc_kernels_${RUN_MODE} PRIVATE + -DENABLE_CONSTANT + ) + message(STATUS "[INFO] NPU/SIM Target Enable CONSTANT") +endif() diff --git a/examples/matrix/matmul_perf/main.cpp b/examples/matrix/matmul_perf/main.cpp new file mode 100644 index 0000000000000000000000000000000000000000..375c0ecaf4e2a97ac309a4029aa810e802e50f22 --- /dev/null +++ b/examples/matrix/matmul_perf/main.cpp @@ -0,0 +1,241 @@ +/** + * Copyright (c) 2025 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. 
+ */
+
+#include <iostream>
+#include <sstream>
+#include "../../common/data_utils.h"
+#include "kernel_tiling/kernel_tiling.h"
+#include "tiling/platform/platform_ascendc.h"
+#include "op_host/matmul_perf_custom_tiling.h"
+
+#ifndef ASCENDC_CPU_DEBUG
+#include "acl/acl.h"
+extern void matmul_perf_custom_do(uint32_t coreDim, void* stream,
+    uint8_t* a, uint8_t* b, uint8_t* bias, uint8_t* c,
+    uint8_t* workspace, uint8_t* tiling);
+#else
+#include "tikicpulib.h"
+extern "C" void matmul_perf_custom(uint8_t* a, uint8_t* b, uint8_t* bias, uint8_t* c,
+    uint8_t* workspace, uint8_t* tiling);
+#endif
+
+namespace {
+constexpr bool IS_BIAS = false;
+constexpr bool IS_A_TRANS = false;
+constexpr bool IS_B_TRANS = false;
+
+struct MatrixFileSize
+{
+    size_t x1FileSize;
+    size_t x2FileSize;
+    size_t yFileSize;
+    size_t biasFileSize;
+};
+}
+
+namespace MatmulHost {
+static size_t GetSysWorkSpaceSize()
+{
+    auto ascendcPlatform = platform_ascendc::PlatformAscendCManager::GetInstance();
+    return static_cast<size_t>(ascendcPlatform->GetLibApiWorkSpaceSize());
+}
+
+// CPU debug mode
+#ifdef ASCENDC_CPU_DEBUG
+void TestMatmulCpu(int64_t m, int64_t n, int64_t k, const MatrixFileSize& matrixFileSize)
+{
+    size_t x1FileSize = matrixFileSize.x1FileSize;
+    size_t x2FileSize = matrixFileSize.x2FileSize;
+    size_t yFileSize = matrixFileSize.yFileSize;
+    size_t biasFileSize = matrixFileSize.biasFileSize;
+    // Query workspace size
+    size_t workspaceSize = GetSysWorkSpaceSize();
+    uint8_t* x1 = (uint8_t*)AscendC::GmAlloc(x1FileSize);
+    uint8_t* x2 = (uint8_t*)AscendC::GmAlloc(x2FileSize);
+    uint8_t* bias = nullptr;
+    uint8_t* y = (uint8_t*)AscendC::GmAlloc(yFileSize);
+    uint8_t* workspace = (uint8_t*)AscendC::GmAlloc(workspaceSize);
+    ReadFile("../input/x1_gm.bin", x1FileSize, x1, x1FileSize);
+    ReadFile("../input/x2_gm.bin", x2FileSize, x2, x2FileSize);
+    if (IS_BIAS) {
+        bias = (uint8_t*)AscendC::GmAlloc(biasFileSize);
+        ReadFile("../input/bias_gm.bin", biasFileSize, bias, biasFileSize);
+    }
+    size_t tilingFileSize = sizeof(TCubeTiling);
+    uint8_t* tiling = (uint8_t*)AscendC::GmAlloc(tilingFileSize);
+    auto ascendcPlatform = platform_ascendc::PlatformAscendCManager::GetInstance();
+    auto aicCoreNum = ascendcPlatform->GetCoreNumAic();
+    auto aivCoreNum = ascendcPlatform->GetCoreNumAiv();
+#ifdef ENABLE_CUBE_ONLY
+    auto coreNum = aicCoreNum;
+#else
+    auto coreNum = aivCoreNum;
+#endif
+    MatmulHost::MatmulCaseParams testCaseParams{static_cast<int32_t>(coreNum),
+        static_cast<int32_t>(m), static_cast<int32_t>(n), static_cast<int32_t>(k), IS_BIAS, IS_A_TRANS, IS_B_TRANS};
+    std::cout << "[INFO] aicCoreNum = " << aicCoreNum << ", aivCoreNum = " << aivCoreNum << std::endl;
+    // Calculate Tiling
+    const auto& tilingData = MatmulHost::GenerateTiling(testCaseParams);
+    memcpy_s(tiling, tilingFileSize, &tilingData, tilingFileSize);
+    ICPU_RUN_KF(matmul_perf_custom, ascendcPlatform->GetCoreNumAic(), x1, x2, bias, y, workspace, tiling);
+    WriteFile("../output/output.bin", y, yFileSize);
+    AscendC::GmFree((void*)x1);
+    AscendC::GmFree((void*)x2);
+    AscendC::GmFree((void*)y);
+    AscendC::GmFree((void*)workspace);
+    AscendC::GmFree((void*)tiling);
+    if (IS_BIAS) {
+        AscendC::GmFree((void*)bias);
+    }
+}
+// NPU
+#else
+void MatmulOp(uint8_t* x1, uint8_t* x2, uint8_t* y, uint8_t* bias, int64_t m, int64_t n, int64_t k,
+    void* stream = nullptr)
+{
+    // Init args
+    uint8_t* workspaceDevice = nullptr;
+
+    // Query workspace size
+    size_t workspaceSize = GetSysWorkSpaceSize();
+
+    // Allocate workspace on device
+    CHECK_ACL(aclrtMalloc((void**)&workspaceDevice, workspaceSize, ACL_MEM_MALLOC_HUGE_FIRST));
+
+    // Initialize kernel with arguments and workspace pointer
+    uint8_t* tilingHost = nullptr;
+    uint8_t* tilingDevice = nullptr;
+    size_t tilingFileSize = sizeof(TCubeTiling);
+    auto ascendcPlatform = platform_ascendc::PlatformAscendCManager::GetInstance();
+    auto aicCoreNum = ascendcPlatform->GetCoreNumAic();
+    auto aivCoreNum = ascendcPlatform->GetCoreNumAiv();
+#ifdef ENABLE_CUBE_ONLY
+    auto coreNum = aicCoreNum;
+#else
+    auto coreNum = aivCoreNum;
+#endif
+    MatmulHost::MatmulCaseParams testCaseParams{static_cast<int32_t>(coreNum),
+        static_cast<int32_t>(m), static_cast<int32_t>(n), static_cast<int32_t>(k), IS_BIAS, IS_A_TRANS, IS_B_TRANS};
+    std::cout << "[INFO] aicCoreNum = " << aicCoreNum << ", aivCoreNum = " << aivCoreNum << std::endl;
+    // Calculate Tiling
+    const auto tilingData = MatmulHost::GenerateTiling(testCaseParams);
+    CHECK_ACL(aclrtMallocHost((void**)(&tilingHost), tilingFileSize));
+    CHECK_ACL(aclrtMalloc((void**)&tilingDevice, tilingFileSize,
+        ACL_MEM_MALLOC_HUGE_FIRST));
+    CHECK_ACL(aclrtMemcpy(tilingHost, tilingFileSize, &tilingData,
+        tilingFileSize, ACL_MEMCPY_HOST_TO_HOST));
+    CHECK_ACL(aclrtMemcpy(tilingDevice, tilingFileSize, tilingHost,
+        tilingFileSize, ACL_MEMCPY_HOST_TO_DEVICE));
+    // Launch kernel
+    matmul_perf_custom_do(ascendcPlatform->GetCoreNumAic(), stream, x1, x2, bias, y, workspaceDevice, tilingDevice);
+    CHECK_ACL(aclrtFreeHost(tilingHost));
+    CHECK_ACL(aclrtFree(workspaceDevice));
+    CHECK_ACL(aclrtFree(tilingDevice));
+}
+
+void TestAclInit(aclrtContext& context, aclrtStream& stream, int64_t& deviceId)
+{
+    CHECK_ACL(aclInit(nullptr));
+    CHECK_ACL(aclrtSetDevice(deviceId));
+    CHECK_ACL(aclrtCreateContext(&context, deviceId));
+    CHECK_ACL(aclrtCreateStream(&stream));
+}
+
+void TestAclDeInit(aclrtContext& context, aclrtStream& stream, int64_t& deviceId)
+{
+    CHECK_ACL(aclrtDestroyStream(stream));
+    CHECK_ACL(aclrtDestroyContext(context));
+    CHECK_ACL(aclrtResetDevice(deviceId));
+    CHECK_ACL(aclFinalize());
+}
+
+void TestMatmul(int64_t m, int64_t n, int64_t k, const MatrixFileSize& matrixFileSize)
+{
+    size_t x1FileSize = matrixFileSize.x1FileSize;
+    size_t x2FileSize = matrixFileSize.x2FileSize;
+    size_t yFileSize = matrixFileSize.yFileSize;
+    size_t biasFileSize = matrixFileSize.biasFileSize;
+
+    aclrtContext context;
+    aclrtStream stream = nullptr;
+    int64_t deviceId = 0;
+    TestAclInit(context, stream, deviceId);
+
+    uint8_t* x1Host = nullptr;
+    uint8_t* x1Device = nullptr;
+    CHECK_ACL(aclrtMallocHost((void**)(&x1Host), x1FileSize));
+    CHECK_ACL(aclrtMalloc((void**)&x1Device, x1FileSize, ACL_MEM_MALLOC_HUGE_FIRST));
+    ReadFile("../input/x1_gm.bin", x1FileSize, x1Host, x1FileSize);
+    CHECK_ACL(aclrtMemcpy(x1Device, x1FileSize, x1Host, x1FileSize, ACL_MEMCPY_HOST_TO_DEVICE));
+
+    uint8_t* x2Host = nullptr;
+    uint8_t* x2Device = nullptr;
+    CHECK_ACL(aclrtMallocHost((void**)(&x2Host), x2FileSize));
+    CHECK_ACL(aclrtMalloc((void**)&x2Device, x2FileSize, ACL_MEM_MALLOC_HUGE_FIRST));
+    ReadFile("../input/x2_gm.bin", x2FileSize, x2Host, x2FileSize);
+    CHECK_ACL(aclrtMemcpy(x2Device, x2FileSize, x2Host, x2FileSize, ACL_MEMCPY_HOST_TO_DEVICE));
+
+    uint8_t* biasHost = nullptr;
+    uint8_t* biasDevice = nullptr;
+    if (IS_BIAS) {
+        CHECK_ACL(aclrtMallocHost((void**)(&biasHost), biasFileSize));
+        CHECK_ACL(aclrtMalloc((void**)&biasDevice, biasFileSize, ACL_MEM_MALLOC_HUGE_FIRST));
+        ReadFile("../input/bias_gm.bin", biasFileSize, biasHost, biasFileSize);
+        CHECK_ACL(aclrtMemcpy(biasDevice, biasFileSize, biasHost, biasFileSize, ACL_MEMCPY_HOST_TO_DEVICE));
+    }
+    uint8_t* yHost = nullptr;
+    uint8_t* yDevice = nullptr;
+    CHECK_ACL(aclrtMallocHost((void**)(&yHost), yFileSize));
+    CHECK_ACL(aclrtMalloc((void**)&yDevice, yFileSize, ACL_MEM_MALLOC_HUGE_FIRST));
+
+    MatmulOp(x1Device, x2Device, yDevice, biasDevice, m, n, k, stream);
+    CHECK_ACL(aclrtSynchronizeStream(stream));
+
+    CHECK_ACL(aclrtMemcpy(yHost, yFileSize, yDevice, yFileSize, ACL_MEMCPY_DEVICE_TO_HOST));
+    WriteFile("../output/output.bin", yHost, yFileSize);
+
+    if (IS_BIAS) {
+        CHECK_ACL(aclrtFree(biasDevice));
+        CHECK_ACL(aclrtFreeHost(biasHost));
+    }
+    CHECK_ACL(aclrtFree(x1Device));
+    CHECK_ACL(aclrtFreeHost(x1Host));
+    CHECK_ACL(aclrtFree(x2Device));
+    CHECK_ACL(aclrtFreeHost(x2Host));
+    CHECK_ACL(aclrtFree(yDevice));
+    CHECK_ACL(aclrtFreeHost(yHost));
+    TestAclDeInit(context, stream, deviceId);
+}
+#endif
+} // namespace MatmulHost
+
+int32_t main(int32_t argc, const char* args[])
+{
+    int64_t inputParams[3] = {1, 1, 1};
+    for (int32_t i = 1; i < argc && i < 4; ++i) { // accept at most 3 positional args: M N K
+        std::stringstream ss(args[i]);
+        ss >> inputParams[i - 1];
+    }
+    auto ascendcPlatform = platform_ascendc::PlatformAscendCManager::GetInstance(SOC_VERSION);
+    int64_t M = inputParams[0];
+    int64_t N = inputParams[1];
+    int64_t K = inputParams[2];
+    MatrixFileSize matrixFileSize;
+    // uint16_t represents half
+    matrixFileSize.x1FileSize = static_cast<size_t>(M * K) * sizeof(uint16_t);
+    matrixFileSize.x2FileSize = static_cast<size_t>(K * N) * sizeof(uint16_t);
+    matrixFileSize.yFileSize = static_cast<size_t>(M * N) * sizeof(float);
+    matrixFileSize.biasFileSize = static_cast<size_t>(1 * N) * sizeof(float);
+#ifdef ASCENDC_CPU_DEBUG
+    MatmulHost::TestMatmulCpu(M, N, K, matrixFileSize);
+#else
+    MatmulHost::TestMatmul(M, N, K, matrixFileSize);
+#endif
+    return 0;
+}
diff --git a/examples/matrix/matmul_perf/op_host/matmul_perf_custom_tiling.cpp b/examples/matrix/matmul_perf/op_host/matmul_perf_custom_tiling.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..8625b8cefbd96d682fa33d74e7ef6bfe92244042
--- /dev/null
+++ b/examples/matrix/matmul_perf/op_host/matmul_perf_custom_tiling.cpp
@@ -0,0 +1,48 @@
+/*
+ * Copyright (c) Huawei Technologies Co., Ltd. 2025-2025. All rights reserved.
+ * This file is a part of the CANN Open Software.
+ * Licensed under CANN Open Software License Agreement Version 1.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ */
+
+#include "matmul_perf_custom_tiling.h"
+#include <iostream>
+
+namespace MatmulHost {
+
+TCubeTiling GenerateTiling(const MatmulCaseParams& testCaseParams)
+{
+    TCubeTiling tilingData;
+    auto ascendcPlatform = platform_ascendc::PlatformAscendCManager::GetInstance();
+    matmul_tiling::MultiCoreMatmulTiling cubeTiling(*ascendcPlatform);
+    uint32_t M = testCaseParams.m;
+    uint32_t N = testCaseParams.n;
+    uint32_t K = testCaseParams.k;
+    uint32_t blockDim = testCaseParams.usedCoreNum;
+    bool isBias = testCaseParams.isBias;
+    bool isAtrans = testCaseParams.isATrans;
+    bool isBtrans = testCaseParams.isBTrans;
+
+    cubeTiling.SetDim(blockDim);
+    cubeTiling.SetAType(matmul_tiling::TPosition::GM, matmul_tiling::CubeFormat::ND,
+        matmul_tiling::DataType::DT_FLOAT16, isAtrans);
+    cubeTiling.SetBType(matmul_tiling::TPosition::GM, matmul_tiling::CubeFormat::ND,
+        matmul_tiling::DataType::DT_FLOAT16, isBtrans);
+    cubeTiling.SetCType(matmul_tiling::TPosition::GM, matmul_tiling::CubeFormat::ND,
+        matmul_tiling::DataType::DT_FLOAT);
+    cubeTiling.SetBiasType(matmul_tiling::TPosition::GM, matmul_tiling::CubeFormat::ND,
+        matmul_tiling::DataType::DT_FLOAT);
+    cubeTiling.SetOrgShape(M, N, K);
+    cubeTiling.SetShape(-1, -1, K);
+    cubeTiling.EnableBias(isBias);
+    if (cubeTiling.GetTiling(tilingData) == -1) {
+        std::cout << "Generate tiling failed." << std::endl;
+        return {};
+    }
+    return tilingData;
+}
+
+} // namespace MatmulHost
diff --git a/examples/matrix/matmul_ibshareB/op_kernel/matmul_ibshare_custom.cpp b/examples/matrix/matmul_perf/op_host/matmul_perf_custom_tiling.h
similarity index 31%
rename from examples/matrix/matmul_ibshareB/op_kernel/matmul_ibshare_custom.cpp
rename to examples/matrix/matmul_perf/op_host/matmul_perf_custom_tiling.h
index ba4707d891c2949f007faba7b1b28ab58d770a8e..4c5e4fcadb4d59e616f49137718d43e3912c0322 100644
--- a/examples/matrix/matmul_ibshareB/op_kernel/matmul_ibshare_custom.cpp
+++ b/examples/matrix/matmul_perf/op_host/matmul_perf_custom_tiling.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2025 Huawei Technologies Co., Ltd.
+ * Copyright (c) Huawei Technologies Co., Ltd. 2025-2025. All rights reserved.
  * This file is a part of the CANN Open Software.
  * Licensed under CANN Open Software License Agreement Version 1.0 (the "License").
  * Please refer to the License for details. You may not use this file except in compliance with the License.
@@ -8,37 +8,30 @@
  * See LICENSE in the root of the software repository for the full text of the License.
*/ -#include "kernel_operator.h" -#include "lib/matmul_intf.h" -#include "matmul_ibshare_custom.h" +#ifndef EXAMPLES_MATRIX_MATMUL_PERF_OP_HOST_MATMUL_PERF_CUSTOM_TILING_H +#define EXAMPLES_MATRIX_MATMUL_PERF_OP_HOST_MATMUL_PERF_CUSTOM_TILING_H +#include "register/tilingdata_base.h" +#include "tiling/tiling_api.h" -__aicore__ inline void CopyTiling(TCubeTiling* tiling, GM_ADDR tilingGM) -{ - uint32_t* ptr = reinterpret_cast(tiling); - auto tiling32 = reinterpret_cast<__gm__ uint32_t*>(tilingGM); - - for (int i = 0; i < sizeof(TCubeTiling) / sizeof(uint32_t); i++, ptr++) { - *ptr = *(tiling32 + i); - } - return; -} +namespace MatmulHost { -extern "C" __global__ __aicore__ void matmul_ibshare_custom(GM_ADDR a, GM_ADDR b, GM_ADDR c, - GM_ADDR workspace, GM_ADDR tilingGm) +struct MatmulCaseParams { - TCubeTiling tiling; - CopyTiling(&tiling, tilingGm); - MatmulIbshareKernel matmulKernel; - AscendC::TPipe pipe; - REGIST_MATMUL_OBJ(&pipe, GetSysWorkSpacePtr(), matmulKernel.matmulObj, &tiling); - matmulKernel.Init(a, b, nullptr, c, workspace, tiling); - matmulKernel.Process(&pipe); -} + int32_t usedCoreNum; + int32_t m; + int32_t n; + int32_t k; + bool isBias; + bool isATrans; + bool isBTrans; +}; -#ifndef ASCENDC_CPU_DEBUG -void matmul_ibshare_custom_do(uint32_t blockDim, void* stream, GM_ADDR a, GM_ADDR b, - GM_ADDR c, GM_ADDR workspace, GM_ADDR tilingGm) -{ - matmul_ibshare_custom<<>>(a, b, c, workspace, tilingGm); -} -#endif \ No newline at end of file +/** + * @brief Generate matmul tiling. + * @param testCaseParams: Testcase parameters. + * @retval Generated Tiling data. + */ +TCubeTiling GenerateTiling(const MatmulCaseParams& testCaseParams); + +} // namespace MatmulHost +#endif // EXAMPLES_MATRIX_MATMUL_PERF_OP_HOST_MATMUL_PERF_CUSTOM_TILING_H diff --git a/examples/matrix/matmul_perf/op_kernel/matmul_perf_custom_kernel.cpp b/examples/matrix/matmul_perf/op_kernel/matmul_perf_custom_kernel.cpp new file mode 100644 index 0000000000000000000000000000000000000000..5218d59952f0b0cef490f29a997adacef4c54779 --- /dev/null +++ b/examples/matrix/matmul_perf/op_kernel/matmul_perf_custom_kernel.cpp @@ -0,0 +1,152 @@ +/* + * Copyright (c) Huawei Technologies Co., Ltd. 2025-2025. All rights reserved. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ + +#include "kernel_operator.h" +#include "matmul_perf_custom_kernel.h" + +namespace { +/** + * @brief Copy tiling data to TCubeTiling ptr from tiling gm addr. + * @param tiling: TCubeTiling ptr which needs to copy tiling data. + * @param tilingGM: Tiling gm addr. 
+ * @retval None
+ */
+__aicore__ inline void CopyTiling(TCubeTiling* tiling, GM_ADDR tilingGM)
+{
+    uint32_t* ptr = reinterpret_cast<uint32_t*>(tiling);
+    auto tiling32 = reinterpret_cast<__gm__ uint32_t*>(tilingGM);
+
+    for (int i = 0; i < sizeof(TCubeTiling) / sizeof(uint32_t); i++, ptr++) {
+        *ptr = *(tiling32 + i);
+    }
+    return;
+}
+}
+
+namespace MatmulPerfCustom {
+template <typename AType, typename BType, typename CType, typename BiasType>
+__aicore__ inline void MatmulKernel<AType, BType, CType, BiasType>::Init(
+    GM_ADDR a, GM_ADDR b, GM_ADDR bias, GM_ADDR c, const TCubeTiling& tiling, bool isTransA, bool isTransB)
+{
+    this->tiling = tiling;
+    if (AscendC::GetBlockIdx() >= tiling.usedCoreNum) {
+        return;
+    }
+    aGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ AType*>(a), tiling.M * tiling.Ka);
+    bGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ BType*>(b), tiling.Kb * tiling.N);
+    cGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ CType*>(c), tiling.M * tiling.N);
+    biasGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ BiasType*>(bias), tiling.N);
+
+    int32_t offsetA = 0;
+    int32_t offsetB = 0;
+    int32_t offsetC = 0;
+    int32_t offsetBias = 0;
+    this->isTransA = isTransA;
+    this->isTransB = isTransB;
+    CalcOffset(tiling, offsetA, offsetB, offsetC, offsetBias);
+    aGlobal = aGlobal[offsetA];
+    bGlobal = bGlobal[offsetB];
+    cGlobal = cGlobal[offsetC];
+    biasGlobal = biasGlobal[offsetBias];
+}
+
+template <typename AType, typename BType, typename CType, typename BiasType>
+__aicore__ inline void MatmulKernel<AType, BType, CType, BiasType>::Process(AscendC::TPipe* pipe)
+{
+    if (AscendC::GetBlockIdx() >= tiling.usedCoreNum) {
+        return;
+    }
+    // process with tail block
+    int tailM = tiling.M - mCoreIndex * tiling.singleCoreM;
+    tailM = tailM < tiling.singleCoreM ? (tailM > 0 ? tailM : tiling.singleCoreM) : tiling.singleCoreM;
+    int tailN = tiling.N - nCoreIndex * tiling.singleCoreN;
+    tailN = tailN < tiling.singleCoreN ? (tailN > 0 ? tailN : tiling.singleCoreN) : tiling.singleCoreN;
+#ifdef ENABLE_CONSTANT
+    matmulObj.SetOrgShape(tiling.M, tiling.N, tiling.Ka);
+#endif
+    matmulObj.SetSingleShape(tailM, tailN, tiling.Ka);
+    matmulObj.SetTensorA(aGlobal, isTransA);
+    matmulObj.SetTensorB(bGlobal, isTransB);
+    if (tiling.isBias) {
+        matmulObj.SetBias(biasGlobal);
+    }
+    matmulObj.IterateAll(cGlobal);
+    matmulObj.End();
+}
+
+template <typename AType, typename BType, typename CType, typename BiasType>
+__aicore__ inline void MatmulKernel<AType, BType, CType, BiasType>::CalcOffset(
+    const TCubeTiling& tiling, int32_t& offsetA, int32_t& offsetB, int32_t& offsetC, int32_t& offsetBias)
+{
+    int32_t blockIdx = AscendC::GetBlockIdx();
+    if (blockIdx >= tiling.usedCoreNum) {
+        return;
+    }
+    auto mSingleBlocks = (tiling.M + tiling.singleCoreM - 1) / tiling.singleCoreM; // split M into mSingleBlocks cores
+    mCoreIndex = blockIdx % mSingleBlocks;
+    nCoreIndex = blockIdx / mSingleBlocks;
+
+    if (isTransA) {
+        offsetA = mCoreIndex * tiling.singleCoreM;
+    } else {
+        offsetA = mCoreIndex * tiling.Ka * tiling.singleCoreM;
+    }
+    if (isTransB) {
+        offsetB = nCoreIndex * tiling.Kb * tiling.singleCoreN;
+    } else {
+        offsetB = nCoreIndex * tiling.singleCoreN;
+    }
+    offsetC = mCoreIndex * tiling.N * tiling.singleCoreM + nCoreIndex * tiling.singleCoreN;
+    offsetBias = nCoreIndex * tiling.singleCoreN;
+}
+} // namespace MatmulPerfCustom
+
+/**
+ * @brief matmul kernel function.
+ * @param a: A matrix gm addr.
+ * @param b: B matrix gm addr.
+ * @param bias: Bias matrix gm addr.
+ * @param c: C matrix gm addr.
+ * @param workspace: Temporary gm space addr required by matmul calc.
+ * @param tilingGm: Tiling data addr.
+ * @retval None
+ */
+extern "C" __global__ __aicore__ void matmul_perf_custom(GM_ADDR a, GM_ADDR b,
+    GM_ADDR bias, GM_ADDR c, GM_ADDR workspace, GM_ADDR tilingGm)
+{
+#ifdef ENABLE_CUBE_ONLY
+    if (g_coreType == AscendC::AIV) {
+        return;
+    }
+#endif
+    // prepare tiling
+    TCubeTiling tiling;
+    CopyTiling(&tiling, tilingGm);
+    // define matmul kernel
+    MatmulPerfCustom::MatmulKernel<half, half, float, float> matmulKernel;
+    AscendC::TPipe pipe;
+    // init matmul kernel, isTransA=false, isTransB=false
+    matmulKernel.Init(a, b, bias, c, tiling, false, false);
+#ifdef ENABLE_CONSTANT
+    REGIST_MATMUL_OBJ(&pipe, GetSysWorkSpacePtr(), matmulKernel.matmulObj, (TCubeTiling*)nullptr);
+#else
+    REGIST_MATMUL_OBJ(&pipe, GetSysWorkSpacePtr(), matmulKernel.matmulObj, &tiling);
+#endif
+    // matmul kernel process
+    matmulKernel.Process(&pipe);
+}
+
+#ifndef ASCENDC_CPU_DEBUG
+void matmul_perf_custom_do(uint32_t blockDim, void* stream,
+    GM_ADDR a, GM_ADDR b, GM_ADDR bias, GM_ADDR c, GM_ADDR workspace, GM_ADDR tilingGm)
+{
+    matmul_perf_custom<<<blockDim, nullptr, stream>>>(a, b, bias, c, workspace, tilingGm);
+}
+#endif
diff --git a/examples/matrix/matmul_perf/op_kernel/matmul_perf_custom_kernel.h b/examples/matrix/matmul_perf/op_kernel/matmul_perf_custom_kernel.h
new file mode 100644
index 0000000000000000000000000000000000000000..6441e48d2c5b57485011021ea2284f08d69ec519
--- /dev/null
+++ b/examples/matrix/matmul_perf/op_kernel/matmul_perf_custom_kernel.h
@@ -0,0 +1,137 @@
+/*
+ * Copyright (c) Huawei Technologies Co., Ltd. 2025-2025. All rights reserved.
+ * This file is a part of the CANN Open Software.
+ * Licensed under CANN Open Software License Agreement Version 1.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ */
+
+#ifndef EXAMPLES_MATRIX_MATMUL_PERF_OP_KERNEL_MATMUL_PERF_CUSTOM_KERNEL_H
+#define EXAMPLES_MATRIX_MATMUL_PERF_OP_KERNEL_MATMUL_PERF_CUSTOM_KERNEL_H
+#include "kernel_operator.h"
+#ifdef ENABLE_CUBE_ONLY
+#define ASCENDC_CUBE_ONLY
+#endif
+#include "lib/matmul_intf.h"
+
+namespace MatmulPerfCustom {
+
+__aicore__ inline constexpr MatmulConfig GetCustomNormCFG()
+{
+    auto mmCfg = CFG_NORM;
+    // disable unitflag for performance comparison
+    mmCfg.enUnitFlag = false;
+    return mmCfg;
+}
+
+__aicore__ inline constexpr MatmulConfig GetCustomMDLCFG()
+{
+    auto mmCfg = CFG_MDL;
+#ifdef ENABLE_MDL_UNITFLAG
+    mmCfg.enUnitFlag = true;
+#endif
+    return mmCfg;
+}
+
+constexpr int32_t MAX_M = 10000; // max M-dim shape supported by this custom matmul kernel
+constexpr int32_t MAX_N = 10000; // max N-dim shape supported by this custom matmul kernel
+constexpr int32_t MAX_K = 10000; // max K-dim shape supported by this custom matmul kernel
+constexpr int32_t BASE_M = 128;  // BASE_M * BASE_K * sizeof(typeA) <= L0A size
+constexpr int32_t BASE_N = 256;  // BASE_N * BASE_K * sizeof(typeB) <= L0B size
+constexpr int32_t BASE_K = 64;   // BASE_M * BASE_N * sizeof(typeC) <= L0C size
+constexpr MatmulShapeParams shapeParams = { MAX_M,
+                                            MAX_N,
+                                            MAX_K,
+                                            BASE_M,
+                                            BASE_N,
+                                            BASE_K };
+
+constexpr int32_t TILING_DEPTH_PARAM = 8;
+constexpr int32_t TILING_STEPK_PARAM = 4;
+constexpr int32_t TILING_STEPMN_PARAM = 1;
+
+template <class A_TYPE, class B_TYPE, class C_TYPE, class BIAS_TYPE>
+__aicore__ inline constexpr MatmulApiStaticTiling GetCustomConstantCFG()
+{
+    MatmulConfig mmCFG = GetMMConfig(shapeParams);
+    // enable unitflag for performance comparison
+    mmCFG.enUnitFlag = true;
+    auto constantCFG = AscendC::GetMatmulApiTiling<A_TYPE, B_TYPE, C_TYPE, BIAS_TYPE>(mmCFG);
+    constantCFG.depthA1 = TILING_DEPTH_PARAM;
+    constantCFG.depthB1 = TILING_DEPTH_PARAM;
+    constantCFG.stepKa = TILING_STEPK_PARAM;
+    constantCFG.stepKb = TILING_STEPK_PARAM;
+    constantCFG.stepM = TILING_STEPMN_PARAM;
+    constantCFG.stepN = TILING_STEPMN_PARAM;
+    return constantCFG;
+}
+
+template <typename AType, typename BType, typename CType, typename BiasType>
+class MatmulKernel {
+public:
+    __aicore__ inline MatmulKernel() {}
+    /**
+     * @brief Initialization before process.
+     * @param a: A matrix gm addr.
+     * @param b: B matrix gm addr.
+     * @param bias: Bias matrix gm addr.
+     * @param c: C matrix gm addr.
+     * @param tiling: Matmul tiling struct.
+     * @param isTransA: Whether A matrix is transposed.
+     * @param isTransB: Whether B matrix is transposed.
+     * @retval None
+     */
+    __aicore__ inline void Init(GM_ADDR a, GM_ADDR b, GM_ADDR bias, GM_ADDR c, const TCubeTiling& tiling,
+        bool isTransA, bool isTransB);
+    /**
+     * @brief Process matrix calculation.
+     * @param pipe: TPipe object.
+     * @retval None
+     */
+    __aicore__ inline void Process(AscendC::TPipe* pipe);
+
+    using A_TYPE = AscendC::MatmulType<AscendC::TPosition::GM, CubeFormat::ND, AType>;
+    using B_TYPE = AscendC::MatmulType<AscendC::TPosition::GM, CubeFormat::ND, BType>;
+    using C_TYPE = AscendC::MatmulType<AscendC::TPosition::GM, CubeFormat::ND, CType>;
+    using BIAS_TYPE = AscendC::MatmulType<AscendC::TPosition::GM, CubeFormat::ND, BiasType>;
+#ifdef ENABLE_CONSTANT
+    constexpr static auto CUSTOM_CFG_CONSTANT = GetCustomConstantCFG<A_TYPE, B_TYPE, C_TYPE, BIAS_TYPE>();
+    AscendC::Matmul<A_TYPE, B_TYPE, C_TYPE, BIAS_TYPE, CUSTOM_CFG_CONSTANT> matmulObj;
+#else
+#ifdef ENABLE_MDL
+    constexpr static MatmulConfig CUSTOM_CFG_MDL = GetCustomMDLCFG();
+    AscendC::Matmul<A_TYPE, B_TYPE, C_TYPE, BIAS_TYPE, CUSTOM_CFG_MDL> matmulObj;
+#else
+    constexpr static MatmulConfig CUSTOM_CFG_NORM = GetCustomNormCFG();
+    AscendC::Matmul<A_TYPE, B_TYPE, C_TYPE, BIAS_TYPE, CUSTOM_CFG_NORM> matmulObj;
+#endif
+#endif
+
+private:
+    /**
+     * @brief Calculate the gm offset based on the blockIdx.
+     * @param tiling: Matmul tiling struct.
+     * @param offsetA: Gm offset of A matrix.
+     * @param offsetB: Gm offset of B matrix.
+     * @param offsetC: Gm offset of C matrix.
+     * @param offsetBias: Gm offset of Bias matrix.
+     * @retval None
+     */
+    __aicore__ inline void CalcOffset(const TCubeTiling& tiling,
+        int32_t& offsetA, int32_t& offsetB, int32_t& offsetC, int32_t& offsetBias);
+
+    AscendC::GlobalTensor<AType> aGlobal;
+    AscendC::GlobalTensor<BType> bGlobal;
+    AscendC::GlobalTensor<CType> cGlobal;
+    AscendC::GlobalTensor<BiasType> biasGlobal;
+    TCubeTiling tiling;
+    int32_t mCoreIndex;
+    int32_t nCoreIndex;
+    bool isTransA{false};
+    bool isTransB{false};
+};
+} // namespace MatmulPerfCustom
+
+#endif // EXAMPLES_MATRIX_MATMUL_PERF_OP_KERNEL_MATMUL_PERF_CUSTOM_KERNEL_H
diff --git a/examples/matrix/matmul_perf/run.sh b/examples/matrix/matmul_perf/run.sh
new file mode 100644
index 0000000000000000000000000000000000000000..41e6d46ca10249986ed08e36e576276d884cb8de
--- /dev/null
+++ b/examples/matrix/matmul_perf/run.sh
@@ -0,0 +1,110 @@
+#!/bin/bash
+# Copyright (c) 2025 Huawei Technologies Co., Ltd.
+# This file is a part of the CANN Open Software.
+# Licensed under CANN Open Software License Agreement Version 1.0 (the "License").
+# Please refer to the License for details. You may not use this file except in compliance with the License.
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+# See LICENSE in the root of the software repository for the full text of the License.
+# ======================================================================================================================
+
+export IS_PERF="0"
+# 0: basic performance mode
+# 1: enable CUBE_ONLY based on mode 0
+# 2: enable MDL based on mode 1
+# 3: enable UnitFlag based on mode 2
+# 4: enable Constant Tiling based on mode 3
+export PERF_MODE="0"
+
+SHORT=r:,v:,p:,m:,
+LONG=run-mode:,soc-version:,perf:,perf-mode:,
+OPTS=$(getopt -a --options $SHORT --longoptions $LONG -- "$@")
+eval set -- "$OPTS"
+while :
+do
+    case "$1" in
+        (-r | --run-mode )
+            RUN_MODE="$2"
+            shift 2;;
+        (-v | --soc-version )
+            SOC_VERSION="$2"
+            shift 2;;
+        (-p | --perf )
+            IS_PERF="$2"
+            shift 2;;
+        (-m | --perf-mode )
+            PERF_MODE="$2"
+            shift 2;;
+        (--)
+            shift;
+            break;;
+        (*)
+            echo "[ERROR] Unexpected option: $1";
+            break;;
+    esac
+done
+
+# Check invalid input
+if [[ ! "${SOC_VERSION}" =~ ^Ascend910 ]]; then
+    echo "[ERROR] Unsupported SocVersion: ${SOC_VERSION}"
+    exit 1
+fi
+
+if [ "${RUN_MODE}" != "npu" ] && [ "${RUN_MODE}" != "sim" ] && [ "${RUN_MODE}" != "cpu" ]; then
+    echo "[ERROR] Unsupported RUN_MODE: ${RUN_MODE}, which can only be cpu/sim/npu."
+    exit 1
+fi
+
+if [ "${IS_PERF}" != "0" ] && [ "${IS_PERF}" != "1" ]; then
+    echo "[ERROR] Unsupported IS_PERF: ${IS_PERF}, which can only be 0 or 1."
+    exit 1
+fi
+
+# only npu mode supports is_perf = 1
+if [ "${IS_PERF}" == "1" ] && [ "${RUN_MODE}" != "npu" ]; then
+    echo "[ERROR] Unsupported IS_PERF: ${IS_PERF} while RUN_MODE is ${RUN_MODE}."
+    exit 1
+fi
+
+if ! [[ "${PERF_MODE}" =~ ^[0-4]$ ]]; then
+    echo "[ERROR] Unsupported PERF_MODE: ${PERF_MODE}, which can only be 0, 1, 2, 3, 4."
+    exit 1
+fi
+
+rm -rf build
+mkdir build
+cd build
+
+source $ASCEND_HOME_DIR/bin/setenv.bash
+export LD_LIBRARY_PATH=${ASCEND_HOME_DIR}/tools/simulator/${SOC_VERSION}/lib:$LD_LIBRARY_PATH
+set -euo pipefail
+
+cmake -DRUN_MODE=${RUN_MODE} -DSOC_VERSION=${SOC_VERSION} -DPERF_MODE=${PERF_MODE} -DASCEND_CANN_PACKAGE_PATH=${ASCEND_HOME_DIR} ..
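+# Note: the -DPERF_MODE value passed above cascades into compile definitions in
+# cmake/cpu_lib.cmake and cmake/npu_lib.cmake: a value greater than 0 adds
+# ENABLE_CUBE_ONLY, greater than 1 adds ENABLE_MDL, greater than 2 adds
+# ENABLE_MDL_UNITFLAG, and greater than 3 adds ENABLE_CONSTANT.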
+make -j16
+
+cd ../
+
+rm -rf input
+mkdir input
+rm -rf output
+mkdir output
+
+rm -rf bin/
+mkdir -p bin
+cd bin
+cp ../build/ascendc_matmul_perf_bbit ./
+
+export TF_CPP_MIN_LOG_LEVEL=3
+
+if [ "${RUN_MODE}" = "npu" ]; then
+    if [ "${IS_PERF}" = "1" ]; then
+        export ASCEND_TOOLKIT_HOME=${ASCEND_HOME_DIR}
+        python3 -u ../scripts/exec_test.py npu "perf"
+    else
+        python3 -u ../scripts/exec_test.py npu "normal"
+    fi
+elif [ "${RUN_MODE}" = "sim" ]; then
+    python3 -u ../scripts/exec_test.py sim "normal"
+elif [ "${RUN_MODE}" = "cpu" ]; then
+    python3 -u ../scripts/exec_test.py cpu "normal"
+fi
diff --git a/examples/matrix/matmul_perf/scripts/exec_test.py b/examples/matrix/matmul_perf/scripts/exec_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..2db2c9271a1f95f186ad20b17e94ea49fe092ad8
--- /dev/null
+++ b/examples/matrix/matmul_perf/scripts/exec_test.py
@@ -0,0 +1,130 @@
+#!/usr/bin/python3
+# coding=utf-8
+
+# Copyright (c) 2025 Huawei Technologies Co., Ltd.
+# This file is a part of the CANN Open Software.
+# Licensed under CANN Open Software License Agreement Version 1.0 (the "License").
+# Please refer to the License for details. You may not use this file except in compliance with the License.
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+# See LICENSE in the root of the software repository for the full text of the License.
+# ======================================================================================================================
+import os
+import sys
+import csv
+import time
+import logging
+import subprocess
+
+import numpy as np
+
+sys.path.append("../..")
+from common_scripts.gen_data import MatmulGenData
+from common_scripts.compare_data import compare_data
+from common_scripts.exec_utils import get_file_work_dir, get_case_list, get_perf_task_duration, clear_file_cache,\
+    get_process_case_cmd
+
+IS_BIAS = False
+IS_TRANS_A = False
+IS_TRANS_B = False
+PERF_RUN_TIMES = 5
+# float16 in, float32 out
+DATA_TYPE_STR = "float16_float32"
+
+logging.basicConfig(level=logging.INFO)
+
+
+class ProcessParams:
+    def __init__(self, case_name, m, n, k, b, is_perf, run_mode):
+        self.case_name = case_name
+        self.m = m
+        self.n = n
+        self.k = k
+        self.b = b
+        self.is_perf = is_perf
+        self.run_mode = run_mode
+
+
+def process_case(file_work_dir, process_params):
+    case_name, m, n, k, b, is_perf, run_mode = process_params.case_name, process_params.m, process_params.n,\
+        process_params.k, process_params.b, process_params.is_perf, process_params.run_mode
+    logging.info("[INFO] start process case [%s]" % (case_name))
+    logging.info("[INFO] IS_PERF is set [%d]" % (is_perf))
+
+    loop_num = PERF_RUN_TIMES if is_perf else 1
+    task_duration_avg = 0
+    for i in range(loop_num):
+        clear_file_cache(file_work_dir)
+        matmul_gen_data = MatmulGenData(m, n, k, b, IS_TRANS_A, IS_TRANS_B, IS_BIAS, DATA_TYPE_STR)
+        if is_perf:
+            matmul_gen_data.gen_fake_golden_data(file_work_dir)
+        else:
+            matmul_gen_data.gen_golden_data(file_work_dir)
+        params_str = f"{m} {n} {k}"
+        cmd = get_process_case_cmd(kernel_name="ascendc_matmul_perf_bbit", params_str=params_str,\
+            is_perf=is_perf, run_mode=run_mode)
+        subprocess.run(cmd)
+        if is_perf:
+            wrong_num = -1
+        else:
+            logging.info("[INFO] compare data case[%s]" % (case_name))
+            wrong_num = compare_data(file_work_dir, DATA_TYPE_STR)
+        if is_perf:
+            cur_duration = float(get_perf_task_duration("./prof_out"))
+            logging.info("[INFO] case_name: [%s], loop_num: [%s], cur_duration: [%s]\n" % (case_name, i, cur_duration))
+            task_duration_avg += cur_duration
+
+    res_data = []
+    res_data.append(case_name)
+    res_data.append(wrong_num)
+    res_data.append(b * m * n)
+    if wrong_num == -1:
+        res_data.append("None")
+    elif wrong_num / (b * m * n) > 0.001:
+        res_data.append("Fail")
+    else:
+        res_data.append("Success")
+    if is_perf:
+        task_duration_avg = task_duration_avg / loop_num
+    res_data.append(task_duration_avg)
+    return res_data
+
+
+def main():
+    args_len = len(sys.argv) - 1
+    if args_len != 2:
+        logging.error("exec_test input params error!")
+        return -1
+
+    file_work_dir = get_file_work_dir()
+    if not os.path.exists(file_work_dir):
+        logging.error("file path %s does not exist!" % (file_work_dir))
+        return -1
+
+    is_perf = False
+    if sys.argv[2] == "perf":
+        is_perf = True
+
+    case_list = get_case_list()
+    res_list = [["case_name", "wrong_num", "total_num", "result", "task_duration"]]
+    run_mode = sys.argv[1]
+    for is_process, case_name, m, n, k, b in case_list:
+        if is_process == 1:
+            process_params = ProcessParams(case_name, m, n, k, b, is_perf, run_mode)
+            res_data = process_case(file_work_dir, process_params)
+            res_list.append(res_data)
+
+    timestamp = time.time()
+    result_file_name = "result_" + str(timestamp) + ".csv"
+    with open(os.path.join(file_work_dir, "output", result_file_name), 'w', newline='', encoding='utf-8') as csvfile:
+        writer = csv.writer(csvfile)
+        writer.writerows(res_list)
+
+    logging.info("---------------RESULT---------------")
+    for res in res_list:
+        logging.info(res)
+    return 0
+
+
+if __name__ == "__main__":
+    main()
diff --git a/examples/matrix/matmul_perf/testcase/case.csv b/examples/matrix/matmul_perf/testcase/case.csv
new file mode 100644
index 0000000000000000000000000000000000000000..009720f51943f6c96c2a36919c350642fa758e4d
--- /dev/null
+++ b/examples/matrix/matmul_perf/testcase/case.csv
@@ -0,0 +1,2 @@
+1, case001, 128, 30720, 64
+1, case002, 128, 30720, 1024
\ No newline at end of file
diff --git a/examples/readme.md b/examples/readme.md
index cf59e3ac0de076ef9254156694843eff37baa64d..1701252cc245c2448fdde9b2044bfe1e0d74d099 100644
--- a/examples/readme.md
+++ b/examples/readme.md
@@ -36,7 +36,7 @@
     Computes the following formula on each row of the input tensors: zi = (xi - ∑(xi * yi)) * yi, where ∑ is a row-wise reduce sum.
-    matrix
+    matrix
     basic_block_matmul
     Implements Matmul matrix multiplication for shapes with no tail block and a fixed tiling base-block size; the formula is C = A * B + Bias.
@@ -58,11 +58,11 @@
     matmul_ibshareAB
-    Implements Matmul matrix multiplication where the A or B matrix shares the same GM address and the A and B matrices share the L1 Buffer; the formula is C = A * B + Bias.
+    Implements Matmul matrix multiplication that shares the A and B matrix data in the L1 Buffer when multiple AIVs use the same GM addresses for the A and B matrices; the formula is C = A * B + Bias.
     matmul_ibshareB
-    Implements Matmul matrix multiplication where the A or B matrix shares the same GM address and the B matrix shares the L1 Buffer; the formula is C = A * B + Bias.
+    Implements Matmul matrix multiplication that shares the B matrix data in the L1 Buffer when multiple AIVs use the same GM address for the B matrix; the formula is C = A * B + Bias.
     matmul_k_reorder_load
@@ -163,6 +163,10 @@
     matmul_int4
     Implements Matmul matrix multiplication with int4 input data types under the MDL template; the formula is C = A * B + Bias.
+
+    matmul_perf
+    A performance optimization sample: Matmul operators covering the Cube-only mode, MDL template, UnitFlag, and fully constant tiling features; the formula is C = A * B + Bias.
+
     normalization
     layernorm
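
For quick reference, here is a condensed, standalone sketch (not part of the diff above) of how the PERF_MODE compile definitions set by run.sh select the Matmul configuration in op_kernel/matmul_perf_custom_kernel.h. CFG_NORM, CFG_MDL and MatmulConfig are the Ascend C definitions provided by lib/matmul_intf.h; the SelectPerfModeCfg helper is a hypothetical wrapper introduced here only for illustration.
```cpp
// Sketch only: mirrors the #ifdef ladder in matmul_perf_custom_kernel.h.
// PERF_MODE >= 1 defines ENABLE_CUBE_ONLY (AIV cores exit early in the kernel),
// >= 2 defines ENABLE_MDL, >= 3 adds ENABLE_MDL_UNITFLAG, >= 4 adds ENABLE_CONSTANT.
__aicore__ inline constexpr MatmulConfig SelectPerfModeCfg()
{
#ifdef ENABLE_MDL
    auto cfg = CFG_MDL;        // modes 2-4: multi-data-load (MDL) template
#ifdef ENABLE_MDL_UNITFLAG
    cfg.enUnitFlag = true;     // modes 3-4: UnitFlag pipelining enabled
#endif
#else
    auto cfg = CFG_NORM;       // modes 0-1: NORM template
    cfg.enUnitFlag = false;    // keep the baseline free of UnitFlag
#endif
    return cfg;
}
```
Under ENABLE_CONSTANT the header goes one step further: GetCustomConstantCFG bakes this configuration into a MatmulApiStaticTiling, so most tiling fields become compile-time constants and the kernel registers the Matmul object with a null TCubeTiling pointer.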