diff --git a/examples/matrix/basic_block_matmul/README.md b/examples/matrix/basic_block_matmul/README.md index c3f41694018da196d82e416acca060a3d0d2dc07..60de7b568dc32f82d797c38c8552046439464c1d 100644 --- a/examples/matrix/basic_block_matmul/README.md +++ b/examples/matrix/basic_block_matmul/README.md @@ -1,15 +1,15 @@ ## 概述 -本样例介绍了调用Matmul高阶API实现basicBlockMatmul单算子(使能BasicBlock模板),按照不同的算子调用方式分别给出了对应的端到端实现。 +本样例介绍了如何使用Matmul高阶API中basicBlock模版实现矩阵乘算子,按照不同的算子调用方式分别给出了对应的端到端实现。 -> BasicBlock模板的特点:适用于无尾块的特定场景,可以固定base块大小(baseM/baseN/baseK),减少矩阵搬运和计算过程中的一些计算开销。 +> BasicBlock模板的特点:适用于无尾块的特定场景(矩阵的shape可以被base块整除),可以固定base块大小(baseM/baseN/baseK),减少矩阵搬运和计算过程中的Scalar开销。 -- 直调:使用核函数直调basicBlockMatmul自定义算子。 +- 直调:使用核函数直调basicBlock Matmul实现矩阵运算。 核函数的基础调用方式,开发者完成算子核函数的开发和Tiling实现后,即可通过AscendCL运行时接口,完成算子的调用。 -- 框架调用:使用框架调用basicBlockMatmul自定义算子。 +- 框架调用:使用框架调用basicBlock Matmul实现矩阵运算。 按照工程创建->算子实现->编译部署>算子调用的流程完成算子开发。整个过程都依赖于算子工程:基于工程代码框架完成算子核函数的开发和Tiling实现,通过工程编译脚本完成算子的编译部署,继而实现单算子调用或第三方框架中的算子调用。 @@ -18,7 +18,7 @@ | 调用方式 | 目录 | 描述 | | --------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ---------------------------------------------------------- | | 直调 | [kernel_launch_method_by_direct](./kernel_launch_method_by_direct) | host侧的核函数调用程序,包含CPU侧和NPU侧两种运行验证方法。 | -| 框架调用 | [kernel_launch_method_by_framework](./kernel_launch_method_by_framework) | 通过aclnn调用的方式调用basicBlockMatmul算子。 | +| 框架调用 | [kernel_launch_method_by_framework](./kernel_launch_method_by_framework) | 通过aclnn调用的方式调用basicBlock Matmul程序,包含CPU侧和NPU侧两种运行验证方法。| ## 样例支持的产品型号为: - Atlas A2训练系列产品/Atlas 800I A2推理产品 diff --git a/examples/matrix/basic_block_matmul/host_tiling/basic_block_matmul_custom_tiling.h b/examples/matrix/basic_block_matmul/host_tiling/basic_block_matmul_custom_tiling.h index 8c0ae687f5ed2222028c46db406a553fadcdc6e0..b6e49cabe9c332f086808497e4d7d3329da780e3 100644 --- a/examples/matrix/basic_block_matmul/host_tiling/basic_block_matmul_custom_tiling.h +++ b/examples/matrix/basic_block_matmul/host_tiling/basic_block_matmul_custom_tiling.h @@ -21,8 +21,9 @@ END_TILING_DATA_DEF; REGISTER_TILING_DATA_CLASS(BasicBlockMatmulCustom, MatmulCustomTilingData) } -bool ComputeTiling(optiling::TCubeTiling& tiling, matmul_tiling::MultiCoreMatmulTiling* cubeTiling, int32_t M, int32_t N, int32_t K, - int32_t baseM, int32_t baseN, int32_t baseK, int32_t blockDim, bool isBias) +bool ComputeTiling(optiling::TCubeTiling& tiling, matmul_tiling::MultiCoreMatmulTiling* cubeTiling, + int32_t M, int32_t N, int32_t K,int32_t baseM, int32_t baseN, int32_t baseK, + int32_t blockDim, bool isBias) { cubeTiling->SetDim(blockDim); cubeTiling->SetAType(matmul_tiling::TPosition::GM, matmul_tiling::CubeFormat::ND, matmul_tiling::DataType::DT_FLOAT16, true); // A is transposed diff --git a/examples/matrix/basic_block_matmul/kernel_launch_method_by_direct/basic_block_matmul_custom.cpp b/examples/matrix/basic_block_matmul/kernel_launch_method_by_direct/basic_block_matmul_custom.cpp index b8f720c89f1f85f487bfba0d1d32457060d38706..e31a88e14f9df5a844498cf2df469ac80ccb199a 100644 --- a/examples/matrix/basic_block_matmul/kernel_launch_method_by_direct/basic_block_matmul_custom.cpp +++ b/examples/matrix/basic_block_matmul/kernel_launch_method_by_direct/basic_block_matmul_custom.cpp @@ -15,22 +15,26 @@ __aicore__ inline void CopyTiling(TCubeTiling* tiling, GM_ADDR tilingGM) { uint32_t* ptr = reinterpret_cast(tiling); - auto tiling32 = reinterpret_cast<__gm__ uint32_t*>(tilingGM); + auto tilingGmAddr = reinterpret_cast<__gm__ uint32_t*>(tilingGM); for (int i = 0; i < sizeof(TCubeTiling) / sizeof(uint32_t); i++, ptr++) { - *ptr = *(tiling32 + i); + *ptr = *(tilingGmAddr + i); } return; } extern "C" __global__ __aicore__ void basic_block_matmul_custom(GM_ADDR a, GM_ADDR b, GM_ADDR c, GM_ADDR workspace, GM_ADDR tilingGm) { + // prepare tiling TCubeTiling tiling; CopyTiling(&tiling, tilingGm); + // define matmul kernel BasicBlockMatmulKernel matmulKernel; AscendC::TPipe pipe; REGIST_MATMUL_OBJ(&pipe, GetSysWorkSpacePtr(), matmulKernel.matmulObj, &tiling); + // init matmul kernel matmulKernel.Init(a, b, nullptr, c, workspace, tiling); + // matmul kernel process matmulKernel.Process(&pipe); } @@ -38,6 +42,7 @@ extern "C" __global__ __aicore__ void basic_block_matmul_custom(GM_ADDR a, GM_AD void basic_block_matmul_custom_do(uint32_t blockDim, void* stream, GM_ADDR a, GM_ADDR b, GM_ADDR c, GM_ADDR workspace, GM_ADDR tilingGm) { + // invoke the kernel function through the <<<>>> symbol basic_block_matmul_custom<<>>(a, b, c, workspace, tilingGm); } #endif diff --git a/examples/matrix/basic_block_matmul/kernel_launch_method_by_direct/main.cpp b/examples/matrix/basic_block_matmul/kernel_launch_method_by_direct/main.cpp index 9493ff1d4414d819eefc65a9b033f308683b6abb..da4b4cb94c80c23fd977b4f27d34c68886e3bf1c 100644 --- a/examples/matrix/basic_block_matmul/kernel_launch_method_by_direct/main.cpp +++ b/examples/matrix/basic_block_matmul/kernel_launch_method_by_direct/main.cpp @@ -164,6 +164,18 @@ int32_t main(int32_t argc, char *argv[]) CHECK_ACL(aclrtFree(cDevice)); CHECK_ACL(aclrtFreeHost(cHost)); + CHECK_ACL(aclrtFree(aDevice)); + CHECK_ACL(aclrtFreeHost(aHost)); + + CHECK_ACL(aclrtFree(bDevice)); + CHECK_ACL(aclrtFreeHost(bHost)); + + CHECK_ACL(aclrtFree(tilingDevice)); + CHECK_ACL(aclrtFreeHost(tilingHost)); + + CHECK_ACL(aclrtFree(workspaceDevice)); + CHECK_ACL(aclrtFreeHost(workspaceHost)); + CHECK_ACL(aclrtDestroyStream(stream)); CHECK_ACL(aclrtDestroyContext(context)); CHECK_ACL(aclrtResetDevice(deviceId)); diff --git a/examples/matrix/basic_block_matmul/kernel_launch_method_by_framework/op_kernel/basic_block_matmul_custom.cpp b/examples/matrix/basic_block_matmul/kernel_launch_method_by_framework/op_kernel/basic_block_matmul_custom.cpp index f3511c5a44e122406e25288173b540f24ce13ebe..f12f386110451a6c06172356e568202c272f6e51 100644 --- a/examples/matrix/basic_block_matmul/kernel_launch_method_by_framework/op_kernel/basic_block_matmul_custom.cpp +++ b/examples/matrix/basic_block_matmul/kernel_launch_method_by_framework/op_kernel/basic_block_matmul_custom.cpp @@ -13,10 +13,14 @@ #include "../../../../../../kernel_impl/basic_block_matmul_custom_impl.h" extern "C" __global__ __aicore__ void basic_block_matmul_custom(GM_ADDR a, GM_ADDR b, GM_ADDR bias, GM_ADDR c, GM_ADDR workspace, GM_ADDR tiling) { + // prepare tiling GET_TILING_DATA(tilingData, tiling); + // define matmul kernel BasicBlockMatmulKernel basicBlockMatmulKernel; AscendC::TPipe pipe; REGIST_MATMUL_OBJ(&pipe, GetSysWorkSpacePtr(), basicBlockMatmulKernel.matmulObj, &tilingData.cubeTilingData); + // init matmul kernel basicBlockMatmulKernel.Init(a, b, bias, c, workspace, tilingData.cubeTilingData); + // matmul kernel process basicBlockMatmulKernel.Process(&pipe); } diff --git a/examples/matrix/batch_matmul/kernel_impl/batch_matmul_custom_impl.h b/examples/matrix/batch_matmul/kernel_impl/batch_matmul_custom_impl.h index 065f151cbf3218348ab4d7dcc40ec59f5a3aada0..fb62f4708bd5feed2c412df502a4d712d990e28b 100644 --- a/examples/matrix/batch_matmul/kernel_impl/batch_matmul_custom_impl.h +++ b/examples/matrix/batch_matmul/kernel_impl/batch_matmul_custom_impl.h @@ -96,7 +96,7 @@ __aicore__ inline void BatchMatmulKernel::Pro batchOffsetC = idxC * tiling.CLayoutInfoS2 * tiling.CLayoutInfoS1; } matmulObj.IterateBatch(cGlobal[batchOffsetC], batchA, batchB, false); - pipe_barrier(PIPE_ALL); + AscendC::PipeBarrier(); } } diff --git a/examples/matrix/batch_matmul/kernel_launch_method_by_direct/batch_matmul_custom.cpp b/examples/matrix/batch_matmul/kernel_launch_method_by_direct/batch_matmul_custom.cpp index 94206b3efc982492ca4661f50a8f7e299de3197e..beab10f73a15ec646929de23c97b2a95282efe53 100644 --- a/examples/matrix/batch_matmul/kernel_launch_method_by_direct/batch_matmul_custom.cpp +++ b/examples/matrix/batch_matmul/kernel_launch_method_by_direct/batch_matmul_custom.cpp @@ -29,8 +29,10 @@ __aicore__ inline void CopyTiling(TCubeTiling* tiling, GM_ADDR tilingGM) extern "C" __global__ __aicore__ void batch_matmul_custom(GM_ADDR a, GM_ADDR b, GM_ADDR c, GM_ADDR workspace, GM_ADDR tilingGm) { + // prepare tiling TCubeTiling tiling; CopyTiling(&tiling, tilingGm); + // define matmul kernel typedef matmul::MatmulType A_TYPE; typedef matmul::MatmulType B_TYPE; typedef matmul::MatmulType C_TYPE; @@ -42,7 +44,9 @@ extern "C" __global__ __aicore__ void batch_matmul_custom(GM_ADDR a, GM_ADDR b, tiling.shareL0CSize = FULL_L0C_SIZE; // full L0C tiling.shareUbSize = 0; // no UB REGIST_MATMUL_OBJ(&pipe, GetSysWorkSpacePtr(), batchMatmulKernel.matmulObj, &tiling); + // init matmul kernel batchMatmulKernel.Init(a, b, nullptr, c, workspace, tiling); + // matmul kernel process batchMatmulKernel.Process(&pipe, 3, 3); } @@ -50,6 +54,7 @@ extern "C" __global__ __aicore__ void batch_matmul_custom(GM_ADDR a, GM_ADDR b, void batch_matmul_custom_do(uint32_t blockDim, void* stream, GM_ADDR a, GM_ADDR b, GM_ADDR c, GM_ADDR workspace, GM_ADDR tilingGm) { + // invoke the kernel function through the <<<>>> symbol batch_matmul_custom<<>>(a, b, c, workspace, tilingGm); } #endif \ No newline at end of file diff --git a/examples/matrix/batch_matmul/kernel_launch_method_by_framework/op_host/batch_matmul_custom.cpp b/examples/matrix/batch_matmul/kernel_launch_method_by_framework/op_host/batch_matmul_custom.cpp index f36003929bfd4e941d367047af267675004d0c8b..d493387c2d9d903eb5460eb8cbd126cc15fd8823 100644 --- a/examples/matrix/batch_matmul/kernel_launch_method_by_framework/op_host/batch_matmul_custom.cpp +++ b/examples/matrix/batch_matmul/kernel_launch_method_by_framework/op_host/batch_matmul_custom.cpp @@ -23,6 +23,7 @@ static ge::graphStatus TilingFunc(gert::TilingContext* context) if (!res) { return ge::GRAPH_FAILED; } + // this example is a single-core example, therefore set the number of cores to 1 context->SetBlockDim(1); tiling.SaveToBuffer(context->GetRawTilingData()->GetData(), context->GetRawTilingData()->GetCapacity()); context->GetRawTilingData()->SetDataSize(tiling.GetDataSize()); diff --git a/examples/matrix/batch_matmul/kernel_launch_method_by_framework/op_kernel/batch_matmul_custom.cpp b/examples/matrix/batch_matmul/kernel_launch_method_by_framework/op_kernel/batch_matmul_custom.cpp index c55caba16e0cc356523fc44b7dc8c61a0f27c201..cf5f83767897bd60ad201a3e744bac65f1a4d2f8 100644 --- a/examples/matrix/batch_matmul/kernel_launch_method_by_framework/op_kernel/batch_matmul_custom.cpp +++ b/examples/matrix/batch_matmul/kernel_launch_method_by_framework/op_kernel/batch_matmul_custom.cpp @@ -16,7 +16,9 @@ constexpr int32_t FULL_L1_SIZE = 512 * 1024; constexpr int32_t FULL_L0C_SIZE = 128 * 1024; extern "C" __global__ __aicore__ void batch_matmul_custom(GM_ADDR a, GM_ADDR b, GM_ADDR bias, GM_ADDR c, GM_ADDR workspace, GM_ADDR tiling) { + // prepare tiling GET_TILING_DATA(tilingData, tiling); + // define matmul kernel typedef matmul::MatmulType A_TYPE; typedef matmul::MatmulType B_TYPE; typedef matmul::MatmulType C_TYPE; @@ -29,6 +31,8 @@ extern "C" __global__ __aicore__ void batch_matmul_custom(GM_ADDR a, GM_ADDR b, tilingData.cubeTilingData.shareUbSize = 0; // no UB REGIST_MATMUL_OBJ(&pipe, GetSysWorkSpacePtr(), batchMatmulKernel.matmulObj, &tilingData.cubeTilingData); int batchNum = tilingData.cubeTilingData.BatchNum; + // init matmul kernel batchMatmulKernel.Init(a, b, bias, c, workspace, tilingData.cubeTilingData); + // matmul kernel process batchMatmulKernel.Process(&pipe, batchNum, batchNum); } \ No newline at end of file diff --git a/examples/matrix/batch_matmul/kernel_launch_method_by_framework/testcases/npu/batch_matmul_custom_main.cpp b/examples/matrix/batch_matmul/kernel_launch_method_by_framework/testcases/npu/batch_matmul_custom_main.cpp index 9afc07f7fec0a4ffcf3eff1282cc44f9745b5a58..8a9ef27b7ad459cd91eecf1aa2de7b23dd589e06 100644 --- a/examples/matrix/batch_matmul/kernel_launch_method_by_framework/testcases/npu/batch_matmul_custom_main.cpp +++ b/examples/matrix/batch_matmul/kernel_launch_method_by_framework/testcases/npu/batch_matmul_custom_main.cpp @@ -122,8 +122,9 @@ int32_t main(void) int32_t tensorCount = sizeof(tensorDesc) / sizeof(struct tensorInfo); aclTensor *tensors[tensorCount]; void *devMem[tensorCount]; + constexpr int32_t aMatrixIndex = 0; + constexpr int32_t bMatrixIndex = 1; constexpr int32_t biasIndex = 2; - constexpr int32_t maxTimeout = 5000; for (auto i = 0; i < tensorCount; i++) { void *data; struct tensorInfo *info = &(tensorDesc[i]); @@ -135,7 +136,7 @@ int32_t main(void) } CHECK_ACL(aclrtMalloc(&data, size, ACL_MEM_MALLOC_HUGE_FIRST)); // read input - if (i == 0) { + if (i == aMatrixIndex) { size_t inputSize = size; void *dataHost; CHECK_ACL(aclrtMallocHost((void**)(&dataHost), inputSize)); @@ -143,7 +144,7 @@ int32_t main(void) CHECK_ACL(aclrtMemcpy(data, size, dataHost, size, ACL_MEMCPY_HOST_TO_DEVICE)); CHECK_ACL(aclrtFreeHost(dataHost)); } - if (i == 1) { + if (i == bMatrixIndex) { size_t inputSize = size; void *dataHost; CHECK_ACL(aclrtMallocHost((void**)(&dataHost), inputSize)); diff --git a/examples/matrix/matmul/kernel_launch_method_by_direct/matmul_custom.cpp b/examples/matrix/matmul/kernel_launch_method_by_direct/matmul_custom.cpp index 3c4040b092ab47b25dee1baff4263036f691f816..bfe733488795e13624c4f4024271537884499092 100644 --- a/examples/matrix/matmul/kernel_launch_method_by_direct/matmul_custom.cpp +++ b/examples/matrix/matmul/kernel_launch_method_by_direct/matmul_custom.cpp @@ -25,12 +25,16 @@ __aicore__ inline void CopyTiling(TCubeTiling* tiling, GM_ADDR tilingGM) extern "C" __global__ __aicore__ void matmul_custom(GM_ADDR a, GM_ADDR b, GM_ADDR c, GM_ADDR workspace, GM_ADDR tilingGm) { + // prepare tiling TCubeTiling tiling; CopyTiling(&tiling, tilingGm); + // define matmul kernel MatmulKernel matmulKernel; AscendC::TPipe pipe; REGIST_MATMUL_OBJ(&pipe, GetSysWorkSpacePtr(), matmulKernel.matmulObj, &tiling); + // init matmul kernel matmulKernel.Init(a, b, nullptr, c, workspace, tiling); + // matmul kernel process #ifdef CUSTOM_ASCEND310P matmulKernel.Process(&pipe); #else diff --git a/examples/matrix/matmul/kernel_launch_method_by_framework/op_kernel/matmul_custom.cpp b/examples/matrix/matmul/kernel_launch_method_by_framework/op_kernel/matmul_custom.cpp index c2970e971d699582b96554a6ff806093ea18267b..c40494b79590615439b32fe017ed28145f15d1bb 100644 --- a/examples/matrix/matmul/kernel_launch_method_by_framework/op_kernel/matmul_custom.cpp +++ b/examples/matrix/matmul/kernel_launch_method_by_framework/op_kernel/matmul_custom.cpp @@ -13,11 +13,15 @@ #include "../../../../../../kernel_impl/matmul_custom_impl.h" extern "C" __global__ __aicore__ void matmul_custom(GM_ADDR a, GM_ADDR b, GM_ADDR bias, GM_ADDR c, GM_ADDR workspace, GM_ADDR tiling) { + // prepare tiling GET_TILING_DATA(tilingData, tiling); + // define matmul kernel MatmulKernel matmulKernel; AscendC::TPipe pipe; REGIST_MATMUL_OBJ(&pipe, GetSysWorkSpacePtr(), matmulKernel.matmulObj,&tilingData.cubeTilingData); + // init matmul kernel matmulKernel.Init(a, b, bias, c, workspace, tilingData.cubeTilingData); + // matmul kernel process if (TILING_KEY_IS(1)) { matmulKernel.Process(&pipe); } else if (TILING_KEY_IS(2)) { diff --git a/version.info b/version.info index 93af97dd49163a922380228da6b60b11cb9e73d5..e2b1cd51e96ca802bbb88eab09443a40199c2c23 100644 --- a/version.info +++ b/version.info @@ -1 +1 @@ -Version=7.6.T11.0 +Version=7.7.T3.0.B012