From fb49b59f0ae1a26dd7a6000d2ed82fc25b8b20ca Mon Sep 17 00:00:00 2001 From: yxf-dev-ops Date: Wed, 19 Feb 2025 17:46:41 +0800 Subject: [PATCH 1/2] fix matrix example review --- .../host_tiling/basic_block_matmul_custom_tiling.h | 5 +++-- .../basic_block_matmul_custom.cpp | 5 +++++ .../op_kernel/basic_block_matmul_custom.cpp | 4 ++++ .../kernel_launch_method_by_direct/batch_matmul_custom.cpp | 5 +++++ .../op_host/batch_matmul_custom.cpp | 1 + .../op_kernel/batch_matmul_custom.cpp | 4 ++++ .../testcases/npu/batch_matmul_custom_main.cpp | 6 ++++-- .../matmul/kernel_launch_method_by_direct/matmul_custom.cpp | 4 ++++ .../op_kernel/matmul_custom.cpp | 4 ++++ version.info | 2 +- 10 files changed, 35 insertions(+), 5 deletions(-) diff --git a/examples/matrix/basic_block_matmul/host_tiling/basic_block_matmul_custom_tiling.h b/examples/matrix/basic_block_matmul/host_tiling/basic_block_matmul_custom_tiling.h index 8c0ae687..b6e49cab 100644 --- a/examples/matrix/basic_block_matmul/host_tiling/basic_block_matmul_custom_tiling.h +++ b/examples/matrix/basic_block_matmul/host_tiling/basic_block_matmul_custom_tiling.h @@ -21,8 +21,9 @@ END_TILING_DATA_DEF; REGISTER_TILING_DATA_CLASS(BasicBlockMatmulCustom, MatmulCustomTilingData) } -bool ComputeTiling(optiling::TCubeTiling& tiling, matmul_tiling::MultiCoreMatmulTiling* cubeTiling, int32_t M, int32_t N, int32_t K, - int32_t baseM, int32_t baseN, int32_t baseK, int32_t blockDim, bool isBias) +bool ComputeTiling(optiling::TCubeTiling& tiling, matmul_tiling::MultiCoreMatmulTiling* cubeTiling, + int32_t M, int32_t N, int32_t K,int32_t baseM, int32_t baseN, int32_t baseK, + int32_t blockDim, bool isBias) { cubeTiling->SetDim(blockDim); cubeTiling->SetAType(matmul_tiling::TPosition::GM, matmul_tiling::CubeFormat::ND, matmul_tiling::DataType::DT_FLOAT16, true); // A is transposed diff --git a/examples/matrix/basic_block_matmul/kernel_launch_method_by_direct/basic_block_matmul_custom.cpp b/examples/matrix/basic_block_matmul/kernel_launch_method_by_direct/basic_block_matmul_custom.cpp index b8f720c8..c92303a1 100644 --- a/examples/matrix/basic_block_matmul/kernel_launch_method_by_direct/basic_block_matmul_custom.cpp +++ b/examples/matrix/basic_block_matmul/kernel_launch_method_by_direct/basic_block_matmul_custom.cpp @@ -25,12 +25,16 @@ __aicore__ inline void CopyTiling(TCubeTiling* tiling, GM_ADDR tilingGM) extern "C" __global__ __aicore__ void basic_block_matmul_custom(GM_ADDR a, GM_ADDR b, GM_ADDR c, GM_ADDR workspace, GM_ADDR tilingGm) { + // prepare tiling TCubeTiling tiling; CopyTiling(&tiling, tilingGm); + // define matmul kernel BasicBlockMatmulKernel matmulKernel; AscendC::TPipe pipe; REGIST_MATMUL_OBJ(&pipe, GetSysWorkSpacePtr(), matmulKernel.matmulObj, &tiling); + // init matmul kernel matmulKernel.Init(a, b, nullptr, c, workspace, tiling); + // matmul kernel process matmulKernel.Process(&pipe); } @@ -38,6 +42,7 @@ extern "C" __global__ __aicore__ void basic_block_matmul_custom(GM_ADDR a, GM_AD void basic_block_matmul_custom_do(uint32_t blockDim, void* stream, GM_ADDR a, GM_ADDR b, GM_ADDR c, GM_ADDR workspace, GM_ADDR tilingGm) { + // invoke the kernel function through the <<<>>> symbol basic_block_matmul_custom<<>>(a, b, c, workspace, tilingGm); } #endif diff --git a/examples/matrix/basic_block_matmul/kernel_launch_method_by_framework/op_kernel/basic_block_matmul_custom.cpp b/examples/matrix/basic_block_matmul/kernel_launch_method_by_framework/op_kernel/basic_block_matmul_custom.cpp index f3511c5a..f12f3861 100644 --- a/examples/matrix/basic_block_matmul/kernel_launch_method_by_framework/op_kernel/basic_block_matmul_custom.cpp +++ b/examples/matrix/basic_block_matmul/kernel_launch_method_by_framework/op_kernel/basic_block_matmul_custom.cpp @@ -13,10 +13,14 @@ #include "../../../../../../kernel_impl/basic_block_matmul_custom_impl.h" extern "C" __global__ __aicore__ void basic_block_matmul_custom(GM_ADDR a, GM_ADDR b, GM_ADDR bias, GM_ADDR c, GM_ADDR workspace, GM_ADDR tiling) { + // prepare tiling GET_TILING_DATA(tilingData, tiling); + // define matmul kernel BasicBlockMatmulKernel basicBlockMatmulKernel; AscendC::TPipe pipe; REGIST_MATMUL_OBJ(&pipe, GetSysWorkSpacePtr(), basicBlockMatmulKernel.matmulObj, &tilingData.cubeTilingData); + // init matmul kernel basicBlockMatmulKernel.Init(a, b, bias, c, workspace, tilingData.cubeTilingData); + // matmul kernel process basicBlockMatmulKernel.Process(&pipe); } diff --git a/examples/matrix/batch_matmul/kernel_launch_method_by_direct/batch_matmul_custom.cpp b/examples/matrix/batch_matmul/kernel_launch_method_by_direct/batch_matmul_custom.cpp index 94206b3e..beab10f7 100644 --- a/examples/matrix/batch_matmul/kernel_launch_method_by_direct/batch_matmul_custom.cpp +++ b/examples/matrix/batch_matmul/kernel_launch_method_by_direct/batch_matmul_custom.cpp @@ -29,8 +29,10 @@ __aicore__ inline void CopyTiling(TCubeTiling* tiling, GM_ADDR tilingGM) extern "C" __global__ __aicore__ void batch_matmul_custom(GM_ADDR a, GM_ADDR b, GM_ADDR c, GM_ADDR workspace, GM_ADDR tilingGm) { + // prepare tiling TCubeTiling tiling; CopyTiling(&tiling, tilingGm); + // define matmul kernel typedef matmul::MatmulType A_TYPE; typedef matmul::MatmulType B_TYPE; typedef matmul::MatmulType C_TYPE; @@ -42,7 +44,9 @@ extern "C" __global__ __aicore__ void batch_matmul_custom(GM_ADDR a, GM_ADDR b, tiling.shareL0CSize = FULL_L0C_SIZE; // full L0C tiling.shareUbSize = 0; // no UB REGIST_MATMUL_OBJ(&pipe, GetSysWorkSpacePtr(), batchMatmulKernel.matmulObj, &tiling); + // init matmul kernel batchMatmulKernel.Init(a, b, nullptr, c, workspace, tiling); + // matmul kernel process batchMatmulKernel.Process(&pipe, 3, 3); } @@ -50,6 +54,7 @@ extern "C" __global__ __aicore__ void batch_matmul_custom(GM_ADDR a, GM_ADDR b, void batch_matmul_custom_do(uint32_t blockDim, void* stream, GM_ADDR a, GM_ADDR b, GM_ADDR c, GM_ADDR workspace, GM_ADDR tilingGm) { + // invoke the kernel function through the <<<>>> symbol batch_matmul_custom<<>>(a, b, c, workspace, tilingGm); } #endif \ No newline at end of file diff --git a/examples/matrix/batch_matmul/kernel_launch_method_by_framework/op_host/batch_matmul_custom.cpp b/examples/matrix/batch_matmul/kernel_launch_method_by_framework/op_host/batch_matmul_custom.cpp index f3600392..d493387c 100644 --- a/examples/matrix/batch_matmul/kernel_launch_method_by_framework/op_host/batch_matmul_custom.cpp +++ b/examples/matrix/batch_matmul/kernel_launch_method_by_framework/op_host/batch_matmul_custom.cpp @@ -23,6 +23,7 @@ static ge::graphStatus TilingFunc(gert::TilingContext* context) if (!res) { return ge::GRAPH_FAILED; } + // this example is a single-core example, therefore set the number of cores to 1 context->SetBlockDim(1); tiling.SaveToBuffer(context->GetRawTilingData()->GetData(), context->GetRawTilingData()->GetCapacity()); context->GetRawTilingData()->SetDataSize(tiling.GetDataSize()); diff --git a/examples/matrix/batch_matmul/kernel_launch_method_by_framework/op_kernel/batch_matmul_custom.cpp b/examples/matrix/batch_matmul/kernel_launch_method_by_framework/op_kernel/batch_matmul_custom.cpp index c55caba1..cf5f8376 100644 --- a/examples/matrix/batch_matmul/kernel_launch_method_by_framework/op_kernel/batch_matmul_custom.cpp +++ b/examples/matrix/batch_matmul/kernel_launch_method_by_framework/op_kernel/batch_matmul_custom.cpp @@ -16,7 +16,9 @@ constexpr int32_t FULL_L1_SIZE = 512 * 1024; constexpr int32_t FULL_L0C_SIZE = 128 * 1024; extern "C" __global__ __aicore__ void batch_matmul_custom(GM_ADDR a, GM_ADDR b, GM_ADDR bias, GM_ADDR c, GM_ADDR workspace, GM_ADDR tiling) { + // prepare tiling GET_TILING_DATA(tilingData, tiling); + // define matmul kernel typedef matmul::MatmulType A_TYPE; typedef matmul::MatmulType B_TYPE; typedef matmul::MatmulType C_TYPE; @@ -29,6 +31,8 @@ extern "C" __global__ __aicore__ void batch_matmul_custom(GM_ADDR a, GM_ADDR b, tilingData.cubeTilingData.shareUbSize = 0; // no UB REGIST_MATMUL_OBJ(&pipe, GetSysWorkSpacePtr(), batchMatmulKernel.matmulObj, &tilingData.cubeTilingData); int batchNum = tilingData.cubeTilingData.BatchNum; + // init matmul kernel batchMatmulKernel.Init(a, b, bias, c, workspace, tilingData.cubeTilingData); + // matmul kernel process batchMatmulKernel.Process(&pipe, batchNum, batchNum); } \ No newline at end of file diff --git a/examples/matrix/batch_matmul/kernel_launch_method_by_framework/testcases/npu/batch_matmul_custom_main.cpp b/examples/matrix/batch_matmul/kernel_launch_method_by_framework/testcases/npu/batch_matmul_custom_main.cpp index 9afc07f7..2f7b21f9 100644 --- a/examples/matrix/batch_matmul/kernel_launch_method_by_framework/testcases/npu/batch_matmul_custom_main.cpp +++ b/examples/matrix/batch_matmul/kernel_launch_method_by_framework/testcases/npu/batch_matmul_custom_main.cpp @@ -122,6 +122,8 @@ int32_t main(void) int32_t tensorCount = sizeof(tensorDesc) / sizeof(struct tensorInfo); aclTensor *tensors[tensorCount]; void *devMem[tensorCount]; + constexpr int32_t aMatrixIndex = 0; + constexpr int32_t bMatrixIndex = 1; constexpr int32_t biasIndex = 2; constexpr int32_t maxTimeout = 5000; for (auto i = 0; i < tensorCount; i++) { @@ -135,7 +137,7 @@ int32_t main(void) } CHECK_ACL(aclrtMalloc(&data, size, ACL_MEM_MALLOC_HUGE_FIRST)); // read input - if (i == 0) { + if (i == aMatrixIndex) { size_t inputSize = size; void *dataHost; CHECK_ACL(aclrtMallocHost((void**)(&dataHost), inputSize)); @@ -143,7 +145,7 @@ int32_t main(void) CHECK_ACL(aclrtMemcpy(data, size, dataHost, size, ACL_MEMCPY_HOST_TO_DEVICE)); CHECK_ACL(aclrtFreeHost(dataHost)); } - if (i == 1) { + if (i == bMatrixIndex) { size_t inputSize = size; void *dataHost; CHECK_ACL(aclrtMallocHost((void**)(&dataHost), inputSize)); diff --git a/examples/matrix/matmul/kernel_launch_method_by_direct/matmul_custom.cpp b/examples/matrix/matmul/kernel_launch_method_by_direct/matmul_custom.cpp index 3c4040b0..bfe73348 100644 --- a/examples/matrix/matmul/kernel_launch_method_by_direct/matmul_custom.cpp +++ b/examples/matrix/matmul/kernel_launch_method_by_direct/matmul_custom.cpp @@ -25,12 +25,16 @@ __aicore__ inline void CopyTiling(TCubeTiling* tiling, GM_ADDR tilingGM) extern "C" __global__ __aicore__ void matmul_custom(GM_ADDR a, GM_ADDR b, GM_ADDR c, GM_ADDR workspace, GM_ADDR tilingGm) { + // prepare tiling TCubeTiling tiling; CopyTiling(&tiling, tilingGm); + // define matmul kernel MatmulKernel matmulKernel; AscendC::TPipe pipe; REGIST_MATMUL_OBJ(&pipe, GetSysWorkSpacePtr(), matmulKernel.matmulObj, &tiling); + // init matmul kernel matmulKernel.Init(a, b, nullptr, c, workspace, tiling); + // matmul kernel process #ifdef CUSTOM_ASCEND310P matmulKernel.Process(&pipe); #else diff --git a/examples/matrix/matmul/kernel_launch_method_by_framework/op_kernel/matmul_custom.cpp b/examples/matrix/matmul/kernel_launch_method_by_framework/op_kernel/matmul_custom.cpp index c2970e97..c40494b7 100644 --- a/examples/matrix/matmul/kernel_launch_method_by_framework/op_kernel/matmul_custom.cpp +++ b/examples/matrix/matmul/kernel_launch_method_by_framework/op_kernel/matmul_custom.cpp @@ -13,11 +13,15 @@ #include "../../../../../../kernel_impl/matmul_custom_impl.h" extern "C" __global__ __aicore__ void matmul_custom(GM_ADDR a, GM_ADDR b, GM_ADDR bias, GM_ADDR c, GM_ADDR workspace, GM_ADDR tiling) { + // prepare tiling GET_TILING_DATA(tilingData, tiling); + // define matmul kernel MatmulKernel matmulKernel; AscendC::TPipe pipe; REGIST_MATMUL_OBJ(&pipe, GetSysWorkSpacePtr(), matmulKernel.matmulObj,&tilingData.cubeTilingData); + // init matmul kernel matmulKernel.Init(a, b, bias, c, workspace, tilingData.cubeTilingData); + // matmul kernel process if (TILING_KEY_IS(1)) { matmulKernel.Process(&pipe); } else if (TILING_KEY_IS(2)) { diff --git a/version.info b/version.info index 93af97dd..e2b1cd51 100644 --- a/version.info +++ b/version.info @@ -1 +1 @@ -Version=7.6.T11.0 +Version=7.7.T3.0.B012 -- Gitee From 0ffe8fe410894ae17c2ce0d12ddbe3c957a773b7 Mon Sep 17 00:00:00 2001 From: yxf-dev-ops Date: Mon, 10 Feb 2025 03:06:14 +0000 Subject: [PATCH 2/2] !1113 fix example review Merge pull request !1113 from yxf-dev-ops/cr-dev --- examples/matrix/basic_block_matmul/README.md | 10 +++++----- .../basic_block_matmul_custom.cpp | 4 ++-- .../kernel_launch_method_by_direct/main.cpp | 12 ++++++++++++ .../kernel_impl/batch_matmul_custom_impl.h | 2 +- .../testcases/npu/batch_matmul_custom_main.cpp | 1 - 5 files changed, 20 insertions(+), 9 deletions(-) diff --git a/examples/matrix/basic_block_matmul/README.md b/examples/matrix/basic_block_matmul/README.md index c3f41694..60de7b56 100644 --- a/examples/matrix/basic_block_matmul/README.md +++ b/examples/matrix/basic_block_matmul/README.md @@ -1,15 +1,15 @@ ## 概述 -本样例介绍了调用Matmul高阶API实现basicBlockMatmul单算子(使能BasicBlock模板),按照不同的算子调用方式分别给出了对应的端到端实现。 +本样例介绍了如何使用Matmul高阶API中basicBlock模版实现矩阵乘算子,按照不同的算子调用方式分别给出了对应的端到端实现。 -> BasicBlock模板的特点:适用于无尾块的特定场景,可以固定base块大小(baseM/baseN/baseK),减少矩阵搬运和计算过程中的一些计算开销。 +> BasicBlock模板的特点:适用于无尾块的特定场景(矩阵的shape可以被base块整除),可以固定base块大小(baseM/baseN/baseK),减少矩阵搬运和计算过程中的Scalar开销。 -- 直调:使用核函数直调basicBlockMatmul自定义算子。 +- 直调:使用核函数直调basicBlock Matmul实现矩阵运算。 核函数的基础调用方式,开发者完成算子核函数的开发和Tiling实现后,即可通过AscendCL运行时接口,完成算子的调用。 -- 框架调用:使用框架调用basicBlockMatmul自定义算子。 +- 框架调用:使用框架调用basicBlock Matmul实现矩阵运算。 按照工程创建->算子实现->编译部署>算子调用的流程完成算子开发。整个过程都依赖于算子工程:基于工程代码框架完成算子核函数的开发和Tiling实现,通过工程编译脚本完成算子的编译部署,继而实现单算子调用或第三方框架中的算子调用。 @@ -18,7 +18,7 @@ | 调用方式 | 目录 | 描述 | | --------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ---------------------------------------------------------- | | 直调 | [kernel_launch_method_by_direct](./kernel_launch_method_by_direct) | host侧的核函数调用程序,包含CPU侧和NPU侧两种运行验证方法。 | -| 框架调用 | [kernel_launch_method_by_framework](./kernel_launch_method_by_framework) | 通过aclnn调用的方式调用basicBlockMatmul算子。 | +| 框架调用 | [kernel_launch_method_by_framework](./kernel_launch_method_by_framework) | 通过aclnn调用的方式调用basicBlock Matmul程序,包含CPU侧和NPU侧两种运行验证方法。| ## 样例支持的产品型号为: - Atlas A2训练系列产品/Atlas 800I A2推理产品 diff --git a/examples/matrix/basic_block_matmul/kernel_launch_method_by_direct/basic_block_matmul_custom.cpp b/examples/matrix/basic_block_matmul/kernel_launch_method_by_direct/basic_block_matmul_custom.cpp index c92303a1..e31a88e1 100644 --- a/examples/matrix/basic_block_matmul/kernel_launch_method_by_direct/basic_block_matmul_custom.cpp +++ b/examples/matrix/basic_block_matmul/kernel_launch_method_by_direct/basic_block_matmul_custom.cpp @@ -15,10 +15,10 @@ __aicore__ inline void CopyTiling(TCubeTiling* tiling, GM_ADDR tilingGM) { uint32_t* ptr = reinterpret_cast(tiling); - auto tiling32 = reinterpret_cast<__gm__ uint32_t*>(tilingGM); + auto tilingGmAddr = reinterpret_cast<__gm__ uint32_t*>(tilingGM); for (int i = 0; i < sizeof(TCubeTiling) / sizeof(uint32_t); i++, ptr++) { - *ptr = *(tiling32 + i); + *ptr = *(tilingGmAddr + i); } return; } diff --git a/examples/matrix/basic_block_matmul/kernel_launch_method_by_direct/main.cpp b/examples/matrix/basic_block_matmul/kernel_launch_method_by_direct/main.cpp index 9493ff1d..da4b4cb9 100644 --- a/examples/matrix/basic_block_matmul/kernel_launch_method_by_direct/main.cpp +++ b/examples/matrix/basic_block_matmul/kernel_launch_method_by_direct/main.cpp @@ -164,6 +164,18 @@ int32_t main(int32_t argc, char *argv[]) CHECK_ACL(aclrtFree(cDevice)); CHECK_ACL(aclrtFreeHost(cHost)); + CHECK_ACL(aclrtFree(aDevice)); + CHECK_ACL(aclrtFreeHost(aHost)); + + CHECK_ACL(aclrtFree(bDevice)); + CHECK_ACL(aclrtFreeHost(bHost)); + + CHECK_ACL(aclrtFree(tilingDevice)); + CHECK_ACL(aclrtFreeHost(tilingHost)); + + CHECK_ACL(aclrtFree(workspaceDevice)); + CHECK_ACL(aclrtFreeHost(workspaceHost)); + CHECK_ACL(aclrtDestroyStream(stream)); CHECK_ACL(aclrtDestroyContext(context)); CHECK_ACL(aclrtResetDevice(deviceId)); diff --git a/examples/matrix/batch_matmul/kernel_impl/batch_matmul_custom_impl.h b/examples/matrix/batch_matmul/kernel_impl/batch_matmul_custom_impl.h index 065f151c..fb62f470 100644 --- a/examples/matrix/batch_matmul/kernel_impl/batch_matmul_custom_impl.h +++ b/examples/matrix/batch_matmul/kernel_impl/batch_matmul_custom_impl.h @@ -96,7 +96,7 @@ __aicore__ inline void BatchMatmulKernel::Pro batchOffsetC = idxC * tiling.CLayoutInfoS2 * tiling.CLayoutInfoS1; } matmulObj.IterateBatch(cGlobal[batchOffsetC], batchA, batchB, false); - pipe_barrier(PIPE_ALL); + AscendC::PipeBarrier(); } } diff --git a/examples/matrix/batch_matmul/kernel_launch_method_by_framework/testcases/npu/batch_matmul_custom_main.cpp b/examples/matrix/batch_matmul/kernel_launch_method_by_framework/testcases/npu/batch_matmul_custom_main.cpp index 2f7b21f9..8a9ef27b 100644 --- a/examples/matrix/batch_matmul/kernel_launch_method_by_framework/testcases/npu/batch_matmul_custom_main.cpp +++ b/examples/matrix/batch_matmul/kernel_launch_method_by_framework/testcases/npu/batch_matmul_custom_main.cpp @@ -125,7 +125,6 @@ int32_t main(void) constexpr int32_t aMatrixIndex = 0; constexpr int32_t bMatrixIndex = 1; constexpr int32_t biasIndex = 2; - constexpr int32_t maxTimeout = 5000; for (auto i = 0; i < tensorCount; i++) { void *data; struct tensorInfo *info = &(tensorDesc[i]); -- Gitee