diff --git a/examples/matrix/basic_block_matmul/README.md b/examples/matrix/basic_block_matmul/README.md
index c3f41694018da196d82e416acca060a3d0d2dc07..60de7b568dc32f82d797c38c8552046439464c1d 100644
--- a/examples/matrix/basic_block_matmul/README.md
+++ b/examples/matrix/basic_block_matmul/README.md
@@ -1,15 +1,15 @@
 <!--声明：本文使用[Creative Commons License version 4.0](https://creativecommons.org/licenses/by/4.0/legalcode)许可协议，转载、引用或修改等操作请遵循此许可协议。-->
 ## 概述
 
-本样例介绍了调用Matmul高阶API实现basicBlockMatmul单算子（使能BasicBlock模板），按照不同的算子调用方式分别给出了对应的端到端实现。  
+本样例介绍了如何使用Matmul高阶API中basicBlock模版实现矩阵乘算子，按照不同的算子调用方式分别给出了对应的端到端实现。  
 
-> BasicBlock模板的特点：适用于无尾块的特定场景，可以固定base块大小（baseM/baseN/baseK），减少矩阵搬运和计算过程中的一些计算开销。
+> BasicBlock模板的特点：适用于无尾块的特定场景(矩阵的shape可以被base块整除)，可以固定base块大小（baseM/baseN/baseK），减少矩阵搬运和计算过程中的Scalar开销。
 
-- 直调：使用核函数直调basicBlockMatmul自定义算子。
+- 直调：使用核函数直调basicBlock Matmul实现矩阵运算。
 
   核函数的基础调用方式，开发者完成算子核函数的开发和Tiling实现后，即可通过AscendCL运行时接口，完成算子的调用。
 
-- 框架调用：使用框架调用basicBlockMatmul自定义算子。
+- 框架调用：使用框架调用basicBlock Matmul实现矩阵运算。
 
   按照工程创建->算子实现->编译部署>算子调用的流程完成算子开发。整个过程都依赖于算子工程：基于工程代码框架完成算子核函数的开发和Tiling实现，通过工程编译脚本完成算子的编译部署，继而实现单算子调用或第三方框架中的算子调用。
 
@@ -18,7 +18,7 @@
 | 调用方式  | 目录                                                                                                                                                                                                                                                                                                  | <strong>描述</strong>                                      |
 | --------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ---------------------------------------------------------- |
 | 直调    | [kernel_launch_method_by_direct](./kernel_launch_method_by_direct)       | host侧的核函数调用程序，包含CPU侧和NPU侧两种运行验证方法。 |
-| 框架调用 | [kernel_launch_method_by_framework](./kernel_launch_method_by_framework) | 通过aclnn调用的方式调用basicBlockMatmul算子。                        |
+| 框架调用 | [kernel_launch_method_by_framework](./kernel_launch_method_by_framework) | 通过aclnn调用的方式调用basicBlock Matmul程序，包含CPU侧和NPU侧两种运行验证方法。|
 
 ## 样例支持的产品型号为：
 - Atlas A2训练系列产品/Atlas 800I A2推理产品
diff --git a/examples/matrix/basic_block_matmul/host_tiling/basic_block_matmul_custom_tiling.h b/examples/matrix/basic_block_matmul/host_tiling/basic_block_matmul_custom_tiling.h
index 8c0ae687f5ed2222028c46db406a553fadcdc6e0..b6e49cabe9c332f086808497e4d7d3329da780e3 100644
--- a/examples/matrix/basic_block_matmul/host_tiling/basic_block_matmul_custom_tiling.h
+++ b/examples/matrix/basic_block_matmul/host_tiling/basic_block_matmul_custom_tiling.h
@@ -21,8 +21,9 @@ END_TILING_DATA_DEF;
 REGISTER_TILING_DATA_CLASS(BasicBlockMatmulCustom, MatmulCustomTilingData)
 }
 
-bool ComputeTiling(optiling::TCubeTiling& tiling, matmul_tiling::MultiCoreMatmulTiling* cubeTiling, int32_t M, int32_t N, int32_t K,
-                   int32_t baseM, int32_t baseN, int32_t baseK, int32_t blockDim, bool isBias)
+bool ComputeTiling(optiling::TCubeTiling& tiling, matmul_tiling::MultiCoreMatmulTiling* cubeTiling,
+                   int32_t M, int32_t N, int32_t K,int32_t baseM, int32_t baseN, int32_t baseK,
+                   int32_t blockDim, bool isBias)
 {
     cubeTiling->SetDim(blockDim);
     cubeTiling->SetAType(matmul_tiling::TPosition::GM, matmul_tiling::CubeFormat::ND, matmul_tiling::DataType::DT_FLOAT16, true); // A is transposed
diff --git a/examples/matrix/basic_block_matmul/kernel_launch_method_by_direct/basic_block_matmul_custom.cpp b/examples/matrix/basic_block_matmul/kernel_launch_method_by_direct/basic_block_matmul_custom.cpp
index b8f720c89f1f85f487bfba0d1d32457060d38706..e31a88e14f9df5a844498cf2df469ac80ccb199a 100644
--- a/examples/matrix/basic_block_matmul/kernel_launch_method_by_direct/basic_block_matmul_custom.cpp
+++ b/examples/matrix/basic_block_matmul/kernel_launch_method_by_direct/basic_block_matmul_custom.cpp
@@ -15,22 +15,26 @@
 __aicore__ inline void CopyTiling(TCubeTiling* tiling, GM_ADDR tilingGM)
 {
     uint32_t* ptr = reinterpret_cast<uint32_t*>(tiling);
-    auto tiling32 = reinterpret_cast<__gm__ uint32_t*>(tilingGM);
+    auto tilingGmAddr = reinterpret_cast<__gm__ uint32_t*>(tilingGM);
 
     for (int i = 0; i < sizeof(TCubeTiling) / sizeof(uint32_t); i++, ptr++) {
-      *ptr = *(tiling32 + i);
+      *ptr = *(tilingGmAddr + i);
     }
     return;
 }
 
 extern "C" __global__ __aicore__ void basic_block_matmul_custom(GM_ADDR a, GM_ADDR b, GM_ADDR c, GM_ADDR workspace, GM_ADDR tilingGm)
 {
+    // prepare tiling
     TCubeTiling tiling;
     CopyTiling(&tiling, tilingGm);
+    // define matmul kernel
     BasicBlockMatmulKernel<half, half, float, float> matmulKernel;
     AscendC::TPipe pipe;
     REGIST_MATMUL_OBJ(&pipe, GetSysWorkSpacePtr(), matmulKernel.matmulObj, &tiling);
+    // init matmul kernel
     matmulKernel.Init(a, b, nullptr, c, workspace, tiling);
+    // matmul kernel process
     matmulKernel.Process<false>(&pipe);
 }
 
@@ -38,6 +42,7 @@ extern "C" __global__ __aicore__ void basic_block_matmul_custom(GM_ADDR a, GM_AD
 void basic_block_matmul_custom_do(uint32_t blockDim, void* stream, GM_ADDR a, GM_ADDR b,
                     GM_ADDR c, GM_ADDR workspace, GM_ADDR tilingGm)
 {
+    // invoke the kernel function through the <<<>>> symbol
     basic_block_matmul_custom<<<blockDim, nullptr, stream>>>(a, b, c, workspace, tilingGm);
 }
 #endif
diff --git a/examples/matrix/basic_block_matmul/kernel_launch_method_by_direct/main.cpp b/examples/matrix/basic_block_matmul/kernel_launch_method_by_direct/main.cpp
index 9493ff1d4414d819eefc65a9b033f308683b6abb..da4b4cb94c80c23fd977b4f27d34c68886e3bf1c 100644
--- a/examples/matrix/basic_block_matmul/kernel_launch_method_by_direct/main.cpp
+++ b/examples/matrix/basic_block_matmul/kernel_launch_method_by_direct/main.cpp
@@ -164,6 +164,18 @@ int32_t main(int32_t argc, char *argv[])
     CHECK_ACL(aclrtFree(cDevice));
     CHECK_ACL(aclrtFreeHost(cHost));
 
+    CHECK_ACL(aclrtFree(aDevice));
+    CHECK_ACL(aclrtFreeHost(aHost));
+
+    CHECK_ACL(aclrtFree(bDevice));
+    CHECK_ACL(aclrtFreeHost(bHost));
+
+    CHECK_ACL(aclrtFree(tilingDevice));
+    CHECK_ACL(aclrtFreeHost(tilingHost));
+
+    CHECK_ACL(aclrtFree(workspaceDevice));
+    CHECK_ACL(aclrtFreeHost(workspaceHost));
+
     CHECK_ACL(aclrtDestroyStream(stream));
     CHECK_ACL(aclrtDestroyContext(context));
     CHECK_ACL(aclrtResetDevice(deviceId));
diff --git a/examples/matrix/basic_block_matmul/kernel_launch_method_by_framework/op_kernel/basic_block_matmul_custom.cpp b/examples/matrix/basic_block_matmul/kernel_launch_method_by_framework/op_kernel/basic_block_matmul_custom.cpp
index f3511c5a44e122406e25288173b540f24ce13ebe..f12f386110451a6c06172356e568202c272f6e51 100644
--- a/examples/matrix/basic_block_matmul/kernel_launch_method_by_framework/op_kernel/basic_block_matmul_custom.cpp
+++ b/examples/matrix/basic_block_matmul/kernel_launch_method_by_framework/op_kernel/basic_block_matmul_custom.cpp
@@ -13,10 +13,14 @@
 #include "../../../../../../kernel_impl/basic_block_matmul_custom_impl.h"
 
 extern "C" __global__ __aicore__ void basic_block_matmul_custom(GM_ADDR a, GM_ADDR b, GM_ADDR bias, GM_ADDR c, GM_ADDR workspace, GM_ADDR tiling) {
+    // prepare tiling
     GET_TILING_DATA(tilingData, tiling);
+    // define matmul kernel
     BasicBlockMatmulKernel<half, half, float, float> basicBlockMatmulKernel;
     AscendC::TPipe pipe;
     REGIST_MATMUL_OBJ(&pipe, GetSysWorkSpacePtr(), basicBlockMatmulKernel.matmulObj, &tilingData.cubeTilingData);
+    // init matmul kernel
     basicBlockMatmulKernel.Init(a, b, bias, c, workspace, tilingData.cubeTilingData);
+    // matmul kernel process
     basicBlockMatmulKernel.Process<true>(&pipe);
 }
diff --git a/examples/matrix/batch_matmul/kernel_impl/batch_matmul_custom_impl.h b/examples/matrix/batch_matmul/kernel_impl/batch_matmul_custom_impl.h
index 065f151cbf3218348ab4d7dcc40ec59f5a3aada0..fb62f4708bd5feed2c412df502a4d712d990e28b 100644
--- a/examples/matrix/batch_matmul/kernel_impl/batch_matmul_custom_impl.h
+++ b/examples/matrix/batch_matmul/kernel_impl/batch_matmul_custom_impl.h
@@ -96,7 +96,7 @@ __aicore__ inline void BatchMatmulKernel<A_TYPE, B_TYPE, C_TYPE, BIAS_TYPE>::Pro
             batchOffsetC = idxC * tiling.CLayoutInfoS2 * tiling.CLayoutInfoS1;
         }
         matmulObj.IterateBatch(cGlobal[batchOffsetC], batchA, batchB, false);
-        pipe_barrier(PIPE_ALL);
+        AscendC::PipeBarrier<PIPE_ALL>();
     }
 }
 
diff --git a/examples/matrix/batch_matmul/kernel_launch_method_by_direct/batch_matmul_custom.cpp b/examples/matrix/batch_matmul/kernel_launch_method_by_direct/batch_matmul_custom.cpp
index 94206b3efc982492ca4661f50a8f7e299de3197e..beab10f73a15ec646929de23c97b2a95282efe53 100644
--- a/examples/matrix/batch_matmul/kernel_launch_method_by_direct/batch_matmul_custom.cpp
+++ b/examples/matrix/batch_matmul/kernel_launch_method_by_direct/batch_matmul_custom.cpp
@@ -29,8 +29,10 @@ __aicore__ inline void CopyTiling(TCubeTiling* tiling, GM_ADDR tilingGM)
 extern "C" __global__ __aicore__ void batch_matmul_custom(GM_ADDR a, GM_ADDR b, GM_ADDR c, GM_ADDR workspace,
                                                         GM_ADDR tilingGm)
 {
+    // prepare tiling
     TCubeTiling tiling;
     CopyTiling(&tiling, tilingGm);
+    // define matmul kernel
     typedef matmul::MatmulType<AscendC::TPosition::GM, CubeFormat::ND, half, false, LayoutMode::BSNGD> A_TYPE;
     typedef matmul::MatmulType<AscendC::TPosition::GM, CubeFormat::ND, half, true, LayoutMode::BSNGD> B_TYPE;
     typedef matmul::MatmulType<AscendC::TPosition::GM, CubeFormat::ND, float, false, LayoutMode::BSNGD> C_TYPE;
@@ -42,7 +44,9 @@ extern "C" __global__ __aicore__ void batch_matmul_custom(GM_ADDR a, GM_ADDR b,
     tiling.shareL0CSize = FULL_L0C_SIZE; // full L0C
     tiling.shareUbSize = 0;           // no UB
     REGIST_MATMUL_OBJ(&pipe, GetSysWorkSpacePtr(), batchMatmulKernel.matmulObj, &tiling);
+    // init matmul kernel
     batchMatmulKernel.Init(a, b, nullptr, c, workspace, tiling);
+    // matmul kernel process
     batchMatmulKernel.Process<false>(&pipe, 3, 3);
 }
 
@@ -50,6 +54,7 @@ extern "C" __global__ __aicore__ void batch_matmul_custom(GM_ADDR a, GM_ADDR b,
 void batch_matmul_custom_do(uint32_t blockDim, void* stream, GM_ADDR a, GM_ADDR b,
                          GM_ADDR c, GM_ADDR workspace, GM_ADDR tilingGm)
 {
+    // invoke the kernel function through the <<<>>> symbol
     batch_matmul_custom<<<blockDim, nullptr, stream>>>(a, b, c, workspace, tilingGm);
 }
 #endif
\ No newline at end of file
diff --git a/examples/matrix/batch_matmul/kernel_launch_method_by_framework/op_host/batch_matmul_custom.cpp b/examples/matrix/batch_matmul/kernel_launch_method_by_framework/op_host/batch_matmul_custom.cpp
index f36003929bfd4e941d367047af267675004d0c8b..d493387c2d9d903eb5460eb8cbd126cc15fd8823 100644
--- a/examples/matrix/batch_matmul/kernel_launch_method_by_framework/op_host/batch_matmul_custom.cpp
+++ b/examples/matrix/batch_matmul/kernel_launch_method_by_framework/op_host/batch_matmul_custom.cpp
@@ -23,6 +23,7 @@ static ge::graphStatus TilingFunc(gert::TilingContext* context)
     if (!res) {
         return ge::GRAPH_FAILED;
     }
+    // this example is a single-core example, therefore set the number of cores to 1
     context->SetBlockDim(1);
     tiling.SaveToBuffer(context->GetRawTilingData()->GetData(), context->GetRawTilingData()->GetCapacity());
     context->GetRawTilingData()->SetDataSize(tiling.GetDataSize());
diff --git a/examples/matrix/batch_matmul/kernel_launch_method_by_framework/op_kernel/batch_matmul_custom.cpp b/examples/matrix/batch_matmul/kernel_launch_method_by_framework/op_kernel/batch_matmul_custom.cpp
index c55caba16e0cc356523fc44b7dc8c61a0f27c201..cf5f83767897bd60ad201a3e744bac65f1a4d2f8 100644
--- a/examples/matrix/batch_matmul/kernel_launch_method_by_framework/op_kernel/batch_matmul_custom.cpp
+++ b/examples/matrix/batch_matmul/kernel_launch_method_by_framework/op_kernel/batch_matmul_custom.cpp
@@ -16,7 +16,9 @@ constexpr int32_t FULL_L1_SIZE = 512 * 1024;
 constexpr int32_t FULL_L0C_SIZE = 128 * 1024;
 
 extern "C" __global__ __aicore__ void batch_matmul_custom(GM_ADDR a, GM_ADDR b, GM_ADDR bias, GM_ADDR c, GM_ADDR workspace, GM_ADDR tiling) {
+    // prepare tiling
     GET_TILING_DATA(tilingData, tiling);
+    // define matmul kernel
     typedef matmul::MatmulType<AscendC::TPosition::GM, CubeFormat::ND, half, false, LayoutMode::BSNGD> A_TYPE;
     typedef matmul::MatmulType<AscendC::TPosition::GM, CubeFormat::ND, half, true, LayoutMode::BSNGD> B_TYPE;
     typedef matmul::MatmulType<AscendC::TPosition::GM, CubeFormat::ND, float, false, LayoutMode::BSNGD> C_TYPE;
@@ -29,6 +31,8 @@ extern "C" __global__ __aicore__ void batch_matmul_custom(GM_ADDR a, GM_ADDR b,
     tilingData.cubeTilingData.shareUbSize = 0;           // no UB
     REGIST_MATMUL_OBJ(&pipe, GetSysWorkSpacePtr(), batchMatmulKernel.matmulObj, &tilingData.cubeTilingData);
     int batchNum = tilingData.cubeTilingData.BatchNum;
+    // init matmul kernel
     batchMatmulKernel.Init(a, b, bias, c, workspace, tilingData.cubeTilingData);
+    // matmul kernel process
     batchMatmulKernel.Process<true>(&pipe, batchNum, batchNum);
 }
\ No newline at end of file
diff --git a/examples/matrix/batch_matmul/kernel_launch_method_by_framework/testcases/npu/batch_matmul_custom_main.cpp b/examples/matrix/batch_matmul/kernel_launch_method_by_framework/testcases/npu/batch_matmul_custom_main.cpp
index 9afc07f7fec0a4ffcf3eff1282cc44f9745b5a58..8a9ef27b7ad459cd91eecf1aa2de7b23dd589e06 100644
--- a/examples/matrix/batch_matmul/kernel_launch_method_by_framework/testcases/npu/batch_matmul_custom_main.cpp
+++ b/examples/matrix/batch_matmul/kernel_launch_method_by_framework/testcases/npu/batch_matmul_custom_main.cpp
@@ -122,8 +122,9 @@ int32_t main(void)
     int32_t tensorCount = sizeof(tensorDesc) / sizeof(struct tensorInfo);
     aclTensor *tensors[tensorCount];
     void *devMem[tensorCount];
+    constexpr int32_t aMatrixIndex = 0;
+    constexpr int32_t bMatrixIndex = 1;
     constexpr int32_t biasIndex = 2;
-    constexpr int32_t maxTimeout = 5000;
     for (auto i = 0; i < tensorCount; i++) {
         void *data;
         struct tensorInfo *info = &(tensorDesc[i]);
@@ -135,7 +136,7 @@ int32_t main(void)
         }
         CHECK_ACL(aclrtMalloc(&data, size, ACL_MEM_MALLOC_HUGE_FIRST));
         // read input
-        if (i == 0) {
+        if (i == aMatrixIndex) {
             size_t inputSize = size;
             void *dataHost;
             CHECK_ACL(aclrtMallocHost((void**)(&dataHost), inputSize));
@@ -143,7 +144,7 @@ int32_t main(void)
             CHECK_ACL(aclrtMemcpy(data, size, dataHost, size, ACL_MEMCPY_HOST_TO_DEVICE));
             CHECK_ACL(aclrtFreeHost(dataHost));
         }
-        if (i == 1) {
+        if (i == bMatrixIndex) {
             size_t inputSize = size;
             void *dataHost;
             CHECK_ACL(aclrtMallocHost((void**)(&dataHost), inputSize));
diff --git a/examples/matrix/matmul/kernel_launch_method_by_direct/matmul_custom.cpp b/examples/matrix/matmul/kernel_launch_method_by_direct/matmul_custom.cpp
index 3c4040b092ab47b25dee1baff4263036f691f816..bfe733488795e13624c4f4024271537884499092 100644
--- a/examples/matrix/matmul/kernel_launch_method_by_direct/matmul_custom.cpp
+++ b/examples/matrix/matmul/kernel_launch_method_by_direct/matmul_custom.cpp
@@ -25,12 +25,16 @@ __aicore__ inline void CopyTiling(TCubeTiling* tiling, GM_ADDR tilingGM)
 
 extern "C" __global__ __aicore__ void matmul_custom(GM_ADDR a, GM_ADDR b, GM_ADDR c, GM_ADDR workspace, GM_ADDR tilingGm)
 {
+    // prepare tiling
     TCubeTiling tiling;
     CopyTiling(&tiling, tilingGm);
+    // define matmul kernel
     MatmulKernel<half, half, float, float> matmulKernel;
     AscendC::TPipe pipe;
     REGIST_MATMUL_OBJ(&pipe, GetSysWorkSpacePtr(), matmulKernel.matmulObj, &tiling);
+    // init matmul kernel
     matmulKernel.Init(a, b, nullptr, c, workspace, tiling);
+    // matmul kernel process
 #ifdef CUSTOM_ASCEND310P
     matmulKernel.Process<true, false>(&pipe);
 #else
diff --git a/examples/matrix/matmul/kernel_launch_method_by_framework/op_kernel/matmul_custom.cpp b/examples/matrix/matmul/kernel_launch_method_by_framework/op_kernel/matmul_custom.cpp
index c2970e971d699582b96554a6ff806093ea18267b..c40494b79590615439b32fe017ed28145f15d1bb 100644
--- a/examples/matrix/matmul/kernel_launch_method_by_framework/op_kernel/matmul_custom.cpp
+++ b/examples/matrix/matmul/kernel_launch_method_by_framework/op_kernel/matmul_custom.cpp
@@ -13,11 +13,15 @@
 #include "../../../../../../kernel_impl/matmul_custom_impl.h"
 
 extern "C" __global__ __aicore__ void matmul_custom(GM_ADDR a, GM_ADDR b, GM_ADDR bias, GM_ADDR c, GM_ADDR workspace, GM_ADDR tiling) {
+    // prepare tiling
     GET_TILING_DATA(tilingData, tiling);
+    // define matmul kernel
     MatmulKernel<half, half, float, float> matmulKernel;
     AscendC::TPipe pipe;
     REGIST_MATMUL_OBJ(&pipe, GetSysWorkSpacePtr(), matmulKernel.matmulObj,&tilingData.cubeTilingData);
+    // init matmul kernel
     matmulKernel.Init(a, b, bias, c, workspace, tilingData.cubeTilingData);
+    // matmul kernel process
     if (TILING_KEY_IS(1)) {
         matmulKernel.Process<false, true>(&pipe);
     } else if (TILING_KEY_IS(2)) {
diff --git a/version.info b/version.info
index 93af97dd49163a922380228da6b60b11cb9e73d5..e2b1cd51e96ca802bbb88eab09443a40199c2c23 100644
--- a/version.info
+++ b/version.info
@@ -1 +1 @@
-Version=7.6.T11.0
+Version=7.7.T3.0.B012