diff --git a/examples/matrix/basic_block_matmul/README.md b/examples/matrix/basic_block_matmul/README.md
index c3f41694018da196d82e416acca060a3d0d2dc07..60de7b568dc32f82d797c38c8552046439464c1d 100644
--- a/examples/matrix/basic_block_matmul/README.md
+++ b/examples/matrix/basic_block_matmul/README.md
@@ -1,15 +1,15 @@
## 概述
-本样例介绍了调用Matmul高阶API实现basicBlockMatmul单算子(使能BasicBlock模板),按照不同的算子调用方式分别给出了对应的端到端实现。
+本样例介绍了如何使用Matmul高阶API中basicBlock模版实现矩阵乘算子,按照不同的算子调用方式分别给出了对应的端到端实现。
-> BasicBlock模板的特点:适用于无尾块的特定场景,可以固定base块大小(baseM/baseN/baseK),减少矩阵搬运和计算过程中的一些计算开销。
+> BasicBlock模板的特点:适用于无尾块的特定场景(矩阵的shape可以被base块整除),可以固定base块大小(baseM/baseN/baseK),减少矩阵搬运和计算过程中的Scalar开销。
-- 直调:使用核函数直调basicBlockMatmul自定义算子。
+- 直调:使用核函数直调basicBlock Matmul实现矩阵运算。
核函数的基础调用方式,开发者完成算子核函数的开发和Tiling实现后,即可通过AscendCL运行时接口,完成算子的调用。
-- 框架调用:使用框架调用basicBlockMatmul自定义算子。
+- 框架调用:使用框架调用basicBlock Matmul实现矩阵运算。
按照工程创建->算子实现->编译部署>算子调用的流程完成算子开发。整个过程都依赖于算子工程:基于工程代码框架完成算子核函数的开发和Tiling实现,通过工程编译脚本完成算子的编译部署,继而实现单算子调用或第三方框架中的算子调用。
@@ -18,7 +18,7 @@
| 调用方式 | 目录 | 描述 |
| --------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ---------------------------------------------------------- |
| 直调 | [kernel_launch_method_by_direct](./kernel_launch_method_by_direct) | host侧的核函数调用程序,包含CPU侧和NPU侧两种运行验证方法。 |
-| 框架调用 | [kernel_launch_method_by_framework](./kernel_launch_method_by_framework) | 通过aclnn调用的方式调用basicBlockMatmul算子。 |
+| 框架调用 | [kernel_launch_method_by_framework](./kernel_launch_method_by_framework) | 通过aclnn调用的方式调用basicBlock Matmul程序,包含CPU侧和NPU侧两种运行验证方法。|
## 样例支持的产品型号为:
- Atlas A2训练系列产品/Atlas 800I A2推理产品
diff --git a/examples/matrix/basic_block_matmul/host_tiling/basic_block_matmul_custom_tiling.h b/examples/matrix/basic_block_matmul/host_tiling/basic_block_matmul_custom_tiling.h
index 8c0ae687f5ed2222028c46db406a553fadcdc6e0..b6e49cabe9c332f086808497e4d7d3329da780e3 100644
--- a/examples/matrix/basic_block_matmul/host_tiling/basic_block_matmul_custom_tiling.h
+++ b/examples/matrix/basic_block_matmul/host_tiling/basic_block_matmul_custom_tiling.h
@@ -21,8 +21,9 @@ END_TILING_DATA_DEF;
REGISTER_TILING_DATA_CLASS(BasicBlockMatmulCustom, MatmulCustomTilingData)
}
-bool ComputeTiling(optiling::TCubeTiling& tiling, matmul_tiling::MultiCoreMatmulTiling* cubeTiling, int32_t M, int32_t N, int32_t K,
- int32_t baseM, int32_t baseN, int32_t baseK, int32_t blockDim, bool isBias)
+bool ComputeTiling(optiling::TCubeTiling& tiling, matmul_tiling::MultiCoreMatmulTiling* cubeTiling,
int32_t M, int32_t N, int32_t K, int32_t baseM, int32_t baseN, int32_t baseK,
+ int32_t blockDim, bool isBias)
{
cubeTiling->SetDim(blockDim);
cubeTiling->SetAType(matmul_tiling::TPosition::GM, matmul_tiling::CubeFormat::ND, matmul_tiling::DataType::DT_FLOAT16, true); // A is transposed
diff --git a/examples/matrix/basic_block_matmul/kernel_launch_method_by_direct/basic_block_matmul_custom.cpp b/examples/matrix/basic_block_matmul/kernel_launch_method_by_direct/basic_block_matmul_custom.cpp
index b8f720c89f1f85f487bfba0d1d32457060d38706..e31a88e14f9df5a844498cf2df469ac80ccb199a 100644
--- a/examples/matrix/basic_block_matmul/kernel_launch_method_by_direct/basic_block_matmul_custom.cpp
+++ b/examples/matrix/basic_block_matmul/kernel_launch_method_by_direct/basic_block_matmul_custom.cpp
@@ -15,22 +15,26 @@
__aicore__ inline void CopyTiling(TCubeTiling* tiling, GM_ADDR tilingGM)
{
uint32_t* ptr = reinterpret_cast<uint32_t*>(tiling);
- auto tiling32 = reinterpret_cast<__gm__ uint32_t*>(tilingGM);
+ auto tilingGmAddr = reinterpret_cast<__gm__ uint32_t*>(tilingGM);
for (int i = 0; i < sizeof(TCubeTiling) / sizeof(uint32_t); i++, ptr++) {
- *ptr = *(tiling32 + i);
+ *ptr = *(tilingGmAddr + i);
}
return;
}
extern "C" __global__ __aicore__ void basic_block_matmul_custom(GM_ADDR a, GM_ADDR b, GM_ADDR c, GM_ADDR workspace, GM_ADDR tilingGm)
{
+ // prepare tiling
TCubeTiling tiling;
CopyTiling(&tiling, tilingGm);
+ // define matmul kernel
BasicBlockMatmulKernel matmulKernel;
AscendC::TPipe pipe;
REGIST_MATMUL_OBJ(&pipe, GetSysWorkSpacePtr(), matmulKernel.matmulObj, &tiling);
+ // init matmul kernel
matmulKernel.Init(a, b, nullptr, c, workspace, tiling);
+ // matmul kernel process
matmulKernel.Process(&pipe);
}
@@ -38,6 +42,7 @@ extern "C" __global__ __aicore__ void basic_block_matmul_custom(GM_ADDR a, GM_AD
void basic_block_matmul_custom_do(uint32_t blockDim, void* stream, GM_ADDR a, GM_ADDR b,
GM_ADDR c, GM_ADDR workspace, GM_ADDR tilingGm)
{
+ // invoke the kernel function through the <<<>>> symbol
basic_block_matmul_custom<<<blockDim, nullptr, stream>>>(a, b, c, workspace, tilingGm);
}
#endif
diff --git a/examples/matrix/basic_block_matmul/kernel_launch_method_by_direct/main.cpp b/examples/matrix/basic_block_matmul/kernel_launch_method_by_direct/main.cpp
index 9493ff1d4414d819eefc65a9b033f308683b6abb..da4b4cb94c80c23fd977b4f27d34c68886e3bf1c 100644
--- a/examples/matrix/basic_block_matmul/kernel_launch_method_by_direct/main.cpp
+++ b/examples/matrix/basic_block_matmul/kernel_launch_method_by_direct/main.cpp
@@ -164,6 +164,18 @@ int32_t main(int32_t argc, char *argv[])
CHECK_ACL(aclrtFree(cDevice));
CHECK_ACL(aclrtFreeHost(cHost));
+ CHECK_ACL(aclrtFree(aDevice));
+ CHECK_ACL(aclrtFreeHost(aHost));
+
+ CHECK_ACL(aclrtFree(bDevice));
+ CHECK_ACL(aclrtFreeHost(bHost));
+
+ CHECK_ACL(aclrtFree(tilingDevice));
+ CHECK_ACL(aclrtFreeHost(tilingHost));
+
+ CHECK_ACL(aclrtFree(workspaceDevice));
+ CHECK_ACL(aclrtFreeHost(workspaceHost));
+
CHECK_ACL(aclrtDestroyStream(stream));
CHECK_ACL(aclrtDestroyContext(context));
CHECK_ACL(aclrtResetDevice(deviceId));
diff --git a/examples/matrix/basic_block_matmul/kernel_launch_method_by_framework/op_kernel/basic_block_matmul_custom.cpp b/examples/matrix/basic_block_matmul/kernel_launch_method_by_framework/op_kernel/basic_block_matmul_custom.cpp
index f3511c5a44e122406e25288173b540f24ce13ebe..f12f386110451a6c06172356e568202c272f6e51 100644
--- a/examples/matrix/basic_block_matmul/kernel_launch_method_by_framework/op_kernel/basic_block_matmul_custom.cpp
+++ b/examples/matrix/basic_block_matmul/kernel_launch_method_by_framework/op_kernel/basic_block_matmul_custom.cpp
@@ -13,10 +13,14 @@
#include "../../../../../../kernel_impl/basic_block_matmul_custom_impl.h"
extern "C" __global__ __aicore__ void basic_block_matmul_custom(GM_ADDR a, GM_ADDR b, GM_ADDR bias, GM_ADDR c, GM_ADDR workspace, GM_ADDR tiling) {
+ // prepare tiling
GET_TILING_DATA(tilingData, tiling);
+ // define matmul kernel
BasicBlockMatmulKernel basicBlockMatmulKernel;
AscendC::TPipe pipe;
REGIST_MATMUL_OBJ(&pipe, GetSysWorkSpacePtr(), basicBlockMatmulKernel.matmulObj, &tilingData.cubeTilingData);
+ // init matmul kernel
basicBlockMatmulKernel.Init(a, b, bias, c, workspace, tilingData.cubeTilingData);
+ // matmul kernel process
basicBlockMatmulKernel.Process(&pipe);
}
diff --git a/examples/matrix/batch_matmul/kernel_impl/batch_matmul_custom_impl.h b/examples/matrix/batch_matmul/kernel_impl/batch_matmul_custom_impl.h
index 065f151cbf3218348ab4d7dcc40ec59f5a3aada0..fb62f4708bd5feed2c412df502a4d712d990e28b 100644
--- a/examples/matrix/batch_matmul/kernel_impl/batch_matmul_custom_impl.h
+++ b/examples/matrix/batch_matmul/kernel_impl/batch_matmul_custom_impl.h
@@ -96,7 +96,7 @@ __aicore__ inline void BatchMatmulKernel::Pro
batchOffsetC = idxC * tiling.CLayoutInfoS2 * tiling.CLayoutInfoS1;
}
matmulObj.IterateBatch(cGlobal[batchOffsetC], batchA, batchB, false);
- pipe_barrier(PIPE_ALL);
+ AscendC::PipeBarrier<PIPE_ALL>();
}
}
diff --git a/examples/matrix/batch_matmul/kernel_launch_method_by_direct/batch_matmul_custom.cpp b/examples/matrix/batch_matmul/kernel_launch_method_by_direct/batch_matmul_custom.cpp
index 94206b3efc982492ca4661f50a8f7e299de3197e..beab10f73a15ec646929de23c97b2a95282efe53 100644
--- a/examples/matrix/batch_matmul/kernel_launch_method_by_direct/batch_matmul_custom.cpp
+++ b/examples/matrix/batch_matmul/kernel_launch_method_by_direct/batch_matmul_custom.cpp
@@ -29,8 +29,10 @@ __aicore__ inline void CopyTiling(TCubeTiling* tiling, GM_ADDR tilingGM)
extern "C" __global__ __aicore__ void batch_matmul_custom(GM_ADDR a, GM_ADDR b, GM_ADDR c, GM_ADDR workspace,
GM_ADDR tilingGm)
{
+ // prepare tiling
TCubeTiling tiling;
CopyTiling(&tiling, tilingGm);
+ // define matmul kernel
typedef matmul::MatmulType A_TYPE;
typedef matmul::MatmulType B_TYPE;
typedef matmul::MatmulType C_TYPE;
@@ -42,7 +44,9 @@ extern "C" __global__ __aicore__ void batch_matmul_custom(GM_ADDR a, GM_ADDR b,
tiling.shareL0CSize = FULL_L0C_SIZE; // full L0C
tiling.shareUbSize = 0; // no UB
REGIST_MATMUL_OBJ(&pipe, GetSysWorkSpacePtr(), batchMatmulKernel.matmulObj, &tiling);
+ // init matmul kernel
batchMatmulKernel.Init(a, b, nullptr, c, workspace, tiling);
+ // matmul kernel process
batchMatmulKernel.Process(&pipe, 3, 3);
}
@@ -50,6 +54,7 @@ extern "C" __global__ __aicore__ void batch_matmul_custom(GM_ADDR a, GM_ADDR b,
void batch_matmul_custom_do(uint32_t blockDim, void* stream, GM_ADDR a, GM_ADDR b,
GM_ADDR c, GM_ADDR workspace, GM_ADDR tilingGm)
{
+ // invoke the kernel function through the <<<>>> symbol
batch_matmul_custom<<<blockDim, nullptr, stream>>>(a, b, c, workspace, tilingGm);
}
#endif
\ No newline at end of file
diff --git a/examples/matrix/batch_matmul/kernel_launch_method_by_framework/op_host/batch_matmul_custom.cpp b/examples/matrix/batch_matmul/kernel_launch_method_by_framework/op_host/batch_matmul_custom.cpp
index f36003929bfd4e941d367047af267675004d0c8b..d493387c2d9d903eb5460eb8cbd126cc15fd8823 100644
--- a/examples/matrix/batch_matmul/kernel_launch_method_by_framework/op_host/batch_matmul_custom.cpp
+++ b/examples/matrix/batch_matmul/kernel_launch_method_by_framework/op_host/batch_matmul_custom.cpp
@@ -23,6 +23,7 @@ static ge::graphStatus TilingFunc(gert::TilingContext* context)
if (!res) {
return ge::GRAPH_FAILED;
}
+ // this example is a single-core example, therefore set the number of cores to 1
context->SetBlockDim(1);
tiling.SaveToBuffer(context->GetRawTilingData()->GetData(), context->GetRawTilingData()->GetCapacity());
context->GetRawTilingData()->SetDataSize(tiling.GetDataSize());
diff --git a/examples/matrix/batch_matmul/kernel_launch_method_by_framework/op_kernel/batch_matmul_custom.cpp b/examples/matrix/batch_matmul/kernel_launch_method_by_framework/op_kernel/batch_matmul_custom.cpp
index c55caba16e0cc356523fc44b7dc8c61a0f27c201..cf5f83767897bd60ad201a3e744bac65f1a4d2f8 100644
--- a/examples/matrix/batch_matmul/kernel_launch_method_by_framework/op_kernel/batch_matmul_custom.cpp
+++ b/examples/matrix/batch_matmul/kernel_launch_method_by_framework/op_kernel/batch_matmul_custom.cpp
@@ -16,7 +16,9 @@ constexpr int32_t FULL_L1_SIZE = 512 * 1024;
constexpr int32_t FULL_L0C_SIZE = 128 * 1024;
extern "C" __global__ __aicore__ void batch_matmul_custom(GM_ADDR a, GM_ADDR b, GM_ADDR bias, GM_ADDR c, GM_ADDR workspace, GM_ADDR tiling) {
+ // prepare tiling
GET_TILING_DATA(tilingData, tiling);
+ // define matmul kernel
typedef matmul::MatmulType A_TYPE;
typedef matmul::MatmulType B_TYPE;
typedef matmul::MatmulType C_TYPE;
@@ -29,6 +31,8 @@ extern "C" __global__ __aicore__ void batch_matmul_custom(GM_ADDR a, GM_ADDR b,
tilingData.cubeTilingData.shareUbSize = 0; // no UB
REGIST_MATMUL_OBJ(&pipe, GetSysWorkSpacePtr(), batchMatmulKernel.matmulObj, &tilingData.cubeTilingData);
int batchNum = tilingData.cubeTilingData.BatchNum;
+ // init matmul kernel
batchMatmulKernel.Init(a, b, bias, c, workspace, tilingData.cubeTilingData);
+ // matmul kernel process
batchMatmulKernel.Process(&pipe, batchNum, batchNum);
}
\ No newline at end of file
diff --git a/examples/matrix/batch_matmul/kernel_launch_method_by_framework/testcases/npu/batch_matmul_custom_main.cpp b/examples/matrix/batch_matmul/kernel_launch_method_by_framework/testcases/npu/batch_matmul_custom_main.cpp
index 9afc07f7fec0a4ffcf3eff1282cc44f9745b5a58..8a9ef27b7ad459cd91eecf1aa2de7b23dd589e06 100644
--- a/examples/matrix/batch_matmul/kernel_launch_method_by_framework/testcases/npu/batch_matmul_custom_main.cpp
+++ b/examples/matrix/batch_matmul/kernel_launch_method_by_framework/testcases/npu/batch_matmul_custom_main.cpp
@@ -122,8 +122,9 @@ int32_t main(void)
int32_t tensorCount = sizeof(tensorDesc) / sizeof(struct tensorInfo);
aclTensor *tensors[tensorCount];
void *devMem[tensorCount];
+ constexpr int32_t aMatrixIndex = 0;
+ constexpr int32_t bMatrixIndex = 1;
constexpr int32_t biasIndex = 2;
- constexpr int32_t maxTimeout = 5000;
for (auto i = 0; i < tensorCount; i++) {
void *data;
struct tensorInfo *info = &(tensorDesc[i]);
@@ -135,7 +136,7 @@ int32_t main(void)
}
CHECK_ACL(aclrtMalloc(&data, size, ACL_MEM_MALLOC_HUGE_FIRST));
// read input
- if (i == 0) {
+ if (i == aMatrixIndex) {
size_t inputSize = size;
void *dataHost;
CHECK_ACL(aclrtMallocHost((void**)(&dataHost), inputSize));
@@ -143,7 +144,7 @@ int32_t main(void)
CHECK_ACL(aclrtMemcpy(data, size, dataHost, size, ACL_MEMCPY_HOST_TO_DEVICE));
CHECK_ACL(aclrtFreeHost(dataHost));
}
- if (i == 1) {
+ if (i == bMatrixIndex) {
size_t inputSize = size;
void *dataHost;
CHECK_ACL(aclrtMallocHost((void**)(&dataHost), inputSize));
diff --git a/examples/matrix/matmul/kernel_launch_method_by_direct/matmul_custom.cpp b/examples/matrix/matmul/kernel_launch_method_by_direct/matmul_custom.cpp
index 3c4040b092ab47b25dee1baff4263036f691f816..bfe733488795e13624c4f4024271537884499092 100644
--- a/examples/matrix/matmul/kernel_launch_method_by_direct/matmul_custom.cpp
+++ b/examples/matrix/matmul/kernel_launch_method_by_direct/matmul_custom.cpp
@@ -25,12 +25,16 @@ __aicore__ inline void CopyTiling(TCubeTiling* tiling, GM_ADDR tilingGM)
extern "C" __global__ __aicore__ void matmul_custom(GM_ADDR a, GM_ADDR b, GM_ADDR c, GM_ADDR workspace, GM_ADDR tilingGm)
{
+ // prepare tiling
TCubeTiling tiling;
CopyTiling(&tiling, tilingGm);
+ // define matmul kernel
MatmulKernel matmulKernel;
AscendC::TPipe pipe;
REGIST_MATMUL_OBJ(&pipe, GetSysWorkSpacePtr(), matmulKernel.matmulObj, &tiling);
+ // init matmul kernel
matmulKernel.Init(a, b, nullptr, c, workspace, tiling);
+ // matmul kernel process
#ifdef CUSTOM_ASCEND310P
matmulKernel.Process(&pipe);
#else
diff --git a/examples/matrix/matmul/kernel_launch_method_by_framework/op_kernel/matmul_custom.cpp b/examples/matrix/matmul/kernel_launch_method_by_framework/op_kernel/matmul_custom.cpp
index c2970e971d699582b96554a6ff806093ea18267b..c40494b79590615439b32fe017ed28145f15d1bb 100644
--- a/examples/matrix/matmul/kernel_launch_method_by_framework/op_kernel/matmul_custom.cpp
+++ b/examples/matrix/matmul/kernel_launch_method_by_framework/op_kernel/matmul_custom.cpp
@@ -13,11 +13,15 @@
#include "../../../../../../kernel_impl/matmul_custom_impl.h"
extern "C" __global__ __aicore__ void matmul_custom(GM_ADDR a, GM_ADDR b, GM_ADDR bias, GM_ADDR c, GM_ADDR workspace, GM_ADDR tiling) {
+ // prepare tiling
GET_TILING_DATA(tilingData, tiling);
+ // define matmul kernel
MatmulKernel matmulKernel;
AscendC::TPipe pipe;
REGIST_MATMUL_OBJ(&pipe, GetSysWorkSpacePtr(), matmulKernel.matmulObj,&tilingData.cubeTilingData);
+ // init matmul kernel
matmulKernel.Init(a, b, bias, c, workspace, tilingData.cubeTilingData);
+ // matmul kernel process
if (TILING_KEY_IS(1)) {
matmulKernel.Process(&pipe);
} else if (TILING_KEY_IS(2)) {
diff --git a/version.info b/version.info
index 93af97dd49163a922380228da6b60b11cb9e73d5..e2b1cd51e96ca802bbb88eab09443a40199c2c23 100644
--- a/version.info
+++ b/version.info
@@ -1 +1 @@
-Version=7.6.T11.0
+Version=7.7.T3.0.B012