From 622e33008fdb85921aa99f97e36628988694a04f Mon Sep 17 00:00:00 2001 From: Weidong Guo Date: Tue, 13 Aug 2024 06:25:48 +0000 Subject: [PATCH] =?UTF-8?q?!330=20=E7=A7=BB=E9=99=A4=E4=BA=86examples?= =?UTF-8?q?=E4=B8=AD=E7=9A=84using=20namespace=20AscendC=20Merge=20pull=20?= =?UTF-8?q?request=20!330=20from=20Weidong=20Guo/master?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../kernel_impl/simplesoftmax_kernel.h | 58 +++++++-------- .../simplesoftmax_custom.cpp | 11 ++- .../op_kernel/simplesoftmax_custom.cpp | 7 +- .../softmax/kernel_impl/softmax_kernel.h | 58 +++++++-------- .../softmax_custom.cpp | 11 ++- .../op_kernel/softmax_custom.cpp | 7 +- .../kernel_impl/softmaxflash_kernel.h | 73 ++++++++++--------- .../softmaxflash_custom.cpp | 11 ++- .../op_kernel/softmaxflash_custom.cpp | 7 +- .../kernel_impl/softmaxgrad_kernel.h | 52 ++++++------- .../softmaxgrad_custom.cpp | 9 +-- .../op_kernel/softmaxgrad_custom.cpp | 7 +- .../kernel_impl/softmaxgradfront_kernel.h | 54 +++++++------- .../softmaxgradfront_custom.cpp | 11 ++- .../op_kernel/softmaxgradfront_custom.cpp | 7 +- .../kernel_impl/batch_matmul_custom_impl.h | 15 ++-- .../batch_matmul_custom.cpp | 3 +- .../op_host/batch_matmul_custom.cpp | 2 +- .../op_kernel/batch_matmul_custom.cpp | 3 +- .../matmul/kernel_impl/matmul_custom_impl.h | 26 ++++--- .../matmul_custom.cpp | 3 +- .../op_host/matmul_custom.cpp | 2 +- .../op_kernel/matmul_custom.cpp | 3 +- .../layernorm/kernel_impl/layernorm_custom.h | 68 ++++++++--------- .../layernorm_custom.cpp | 9 +-- .../op_kernel/layernorm_custom.cpp | 5 +- .../broadcast/kernel_impl/broadcast_custom.h | 27 ++++--- .../broadcast_custom.cpp | 1 - .../op_kernel/broadcast_custom.cpp | 1 - 29 files changed, 268 insertions(+), 283 deletions(-) diff --git a/examples/activation/simplesoftmax/kernel_impl/simplesoftmax_kernel.h b/examples/activation/simplesoftmax/kernel_impl/simplesoftmax_kernel.h index 14c7387a..d8079d95 100644 --- 
a/examples/activation/simplesoftmax/kernel_impl/simplesoftmax_kernel.h +++ b/examples/activation/simplesoftmax/kernel_impl/simplesoftmax_kernel.h @@ -12,7 +12,7 @@ #define EXAMPLES_ACTIVATION_SIMPLESOFTMAX_KERNEL_H #include "kernel_operator.h" -namespace AscendC { +namespace MyCustomKernel { constexpr int32_t BUFFER_NUM = 2; constexpr uint32_t FLOAT_NUM_OF_SINGEL_BLOCK = 8; constexpr uint32_t BASIC_BLOCK_ROW_FACTOR = 8; @@ -55,10 +55,10 @@ public: } __aicore__ inline void Init(GM_ADDR x, GM_ADDR max, GM_ADDR sum, GM_ADDR z, const VecTiling& tiling) { - ASSERT(GetBlockNum() != 0 && "block dim can not be zero!"); + ASSERT(AscendC::GetBlockNum() != 0 && "block dim can not be zero!"); InitTiling(tiling); - if (GetBlockIdx() == this->usedBlockDim) { // tail core + if (AscendC::GetBlockIdx() == this->usedBlockDim) { // tail core this->singleLoopCoreRowNum = this->tailCoreSingleLoopCoreRowNum; this->singleCoreLoopCount = this->tailCoreSingleCoreLoopCount; this->leftRow = this->tailCoreSingleCoreLoopTail; @@ -67,8 +67,8 @@ public: this->blockLength = this->coreRowNum * this->columnLength; this->msLength = this->coreRowNum * FLOAT_NUM_OF_SINGEL_BLOCK; // max sum length per block process - uint32_t offset1 = this->blockLength * GetBlockIdx(); - uint32_t offset2 = this->msLength * GetBlockIdx(); + uint32_t offset1 = this->blockLength * AscendC::GetBlockIdx(); + uint32_t offset2 = this->msLength * AscendC::GetBlockIdx(); xGm.SetGlobalBuffer((__gm__ float*)x + offset1, this->blockLength); zGm.SetGlobalBuffer((__gm__ float*)z + offset1, this->blockLength); @@ -88,7 +88,7 @@ public: __aicore__ inline void Process() { - if (GetBlockIdx() > usedBlockDim) { + if (AscendC::GetBlockIdx() > usedBlockDim) { return; } @@ -108,12 +108,12 @@ public: private: __aicore__ inline void CopyIn(int32_t progress, uint32_t rowNum) { - LocalTensor xLocal = queueX.AllocTensor(); - LocalTensor maxLocal = queueMax.AllocTensor(); - LocalTensor sumLocal = queueSum.AllocTensor(); - DataCopy(xLocal, 
xGm[progress * this->tileLength], rowNum * this->columnLength); - DataCopy(maxLocal, maxGm[progress * this->msTileLength], rowNum * FLOAT_NUM_OF_SINGEL_BLOCK); - DataCopy(sumLocal, sumGm[progress * this->msTileLength], rowNum * FLOAT_NUM_OF_SINGEL_BLOCK); + AscendC::LocalTensor xLocal = queueX.AllocTensor(); + AscendC::LocalTensor maxLocal = queueMax.AllocTensor(); + AscendC::LocalTensor sumLocal = queueSum.AllocTensor(); + AscendC::DataCopy(xLocal, xGm[progress * this->tileLength], rowNum * this->columnLength); + AscendC::DataCopy(maxLocal, maxGm[progress * this->msTileLength], rowNum * FLOAT_NUM_OF_SINGEL_BLOCK); + AscendC::DataCopy(sumLocal, sumGm[progress * this->msTileLength], rowNum * FLOAT_NUM_OF_SINGEL_BLOCK); queueX.EnQue(xLocal); queueMax.EnQue(maxLocal); queueSum.EnQue(sumLocal); @@ -121,18 +121,18 @@ private: __aicore__ inline void Compute(int32_t progressm, uint32_t rowNum) { - LocalTensor xLocal = queueX.DeQue(); - LocalTensor maxLocal = queueMax.DeQue(); - LocalTensor sumLocal = queueSum.DeQue(); - LocalTensor tmpBuffer = sharedTmpBuffer.Get(); + AscendC::LocalTensor xLocal = queueX.DeQue(); + AscendC::LocalTensor maxLocal = queueMax.DeQue(); + AscendC::LocalTensor sumLocal = queueSum.DeQue(); + AscendC::LocalTensor tmpBuffer = sharedTmpBuffer.Get(); - SoftMaxShapeInfo srcShape = { rowNum, this->columnLength, rowNum, this->columnLength }; + AscendC::SoftMaxShapeInfo srcShape = { rowNum, this->columnLength, rowNum, this->columnLength }; if (rowNum % BASIC_BLOCK_ROW_FACTOR == 0 && this->columnLength % BASIC_BLOCK_COLUMN_FACTOR == 0 && this->columnLength < BASIC_BLOCK_MAX_COLUMN_LENGTH) { - SimpleSoftMax(xLocal, sumLocal, maxLocal, xLocal, tmpBuffer, softmaxTiling, srcShape); + AscendC::SimpleSoftMax(xLocal, sumLocal, maxLocal, xLocal, tmpBuffer, softmaxTiling, srcShape); } else { - SimpleSoftMax(xLocal, sumLocal, maxLocal, xLocal, tmpBuffer, softmaxTiling, srcShape); + AscendC::SimpleSoftMax(xLocal, sumLocal, maxLocal, xLocal, tmpBuffer, softmaxTiling, 
srcShape); } queueX.EnQue(xLocal); queueMax.FreeTensor(maxLocal); @@ -141,20 +141,20 @@ private: __aicore__ inline void CopyOut(int32_t progress, uint32_t rowNum) { - LocalTensor zLocal = queueX.DeQue(); - DataCopy(zGm[progress * this->tileLength], zLocal, rowNum * this->columnLength); + AscendC::LocalTensor zLocal = queueX.DeQue(); + AscendC::DataCopy(zGm[progress * this->tileLength], zLocal, rowNum * this->columnLength); queueX.FreeTensor(zLocal); } private: - TPipe pipe; - TBuf sharedTmpBuffer; - TQue queueX; - TQue queueMax, queueSum; - GlobalTensor xGm; - GlobalTensor maxGm; - GlobalTensor sumGm; - GlobalTensor zGm; + AscendC::TPipe pipe; + AscendC::TBuf sharedTmpBuffer; + AscendC::TQue queueX; + AscendC::TQue queueMax, queueSum; + AscendC::GlobalTensor xGm; + AscendC::GlobalTensor maxGm; + AscendC::GlobalTensor sumGm; + AscendC::GlobalTensor zGm; uint32_t blockLength = 0; uint32_t usedBlockDim = 0; diff --git a/examples/activation/simplesoftmax/kernel_launch_method_by_direct/simplesoftmax_custom.cpp b/examples/activation/simplesoftmax/kernel_launch_method_by_direct/simplesoftmax_custom.cpp index 1eb36654..636a18bf 100644 --- a/examples/activation/simplesoftmax/kernel_launch_method_by_direct/simplesoftmax_custom.cpp +++ b/examples/activation/simplesoftmax/kernel_launch_method_by_direct/simplesoftmax_custom.cpp @@ -9,14 +9,13 @@ */ #include "../kernel_impl/simplesoftmax_kernel.h" -using namespace AscendC; -__aicore__ inline void CopyTiling(VecTiling* tiling, GM_ADDR tilingGM) +__aicore__ inline void CopyTiling(MyCustomKernel::VecTiling* tiling, GM_ADDR tilingGM) { uint32_t* ptr = reinterpret_cast(tiling); auto tiling32 = reinterpret_cast<__gm__ uint32_t*>(tilingGM); - for (int i = 0; i < sizeof(VecTiling) / sizeof(uint32_t); i++, ptr++) { + for (int i = 0; i < sizeof(MyCustomKernel::VecTiling) / sizeof(uint32_t); i++, ptr++) { *ptr = *(tiling32 + i); } return; @@ -28,10 +27,10 @@ extern "C" __global__ __aicore__ void simplesoftmax_custom(GM_ADDR x, GM_ADDR ma 
if ASCEND_IS_AIC { return; } - VecTiling tilingData; + MyCustomKernel::VecTiling tilingData; CopyTiling(&tilingData, tiling); - KernelSoftmax op; - op.Init(x, max, sum, z, tilingData); + MyCustomKernel::KernelSoftmax op; + op.Init(x, max, sum, z, tilingData); op.Process(); } diff --git a/examples/activation/simplesoftmax/kernel_launch_method_by_framework/op_kernel/simplesoftmax_custom.cpp b/examples/activation/simplesoftmax/kernel_launch_method_by_framework/op_kernel/simplesoftmax_custom.cpp index 1bb113e3..91b8de26 100644 --- a/examples/activation/simplesoftmax/kernel_launch_method_by_framework/op_kernel/simplesoftmax_custom.cpp +++ b/examples/activation/simplesoftmax/kernel_launch_method_by_framework/op_kernel/simplesoftmax_custom.cpp @@ -8,7 +8,6 @@ * See LICENSE in the root of the software repository for the full text of the License. */ #include "../../../../../../kernel_impl/simplesoftmax_kernel.h" -using namespace AscendC; extern "C" __global__ __aicore__ void simplesoftmax_custom(GM_ADDR x, GM_ADDR max, GM_ADDR sum, GM_ADDR z, GM_ADDR workspace, GM_ADDR tiling) @@ -17,9 +16,9 @@ extern "C" __global__ __aicore__ void simplesoftmax_custom(GM_ADDR x, GM_ADDR ma return; } GET_TILING_DATA(tilingData, tiling); - VecTiling vecTiling = *reinterpret_cast(&tilingData); - KernelSoftmax op; - op.Init(x, max, sum, z, vecTiling); + MyCustomKernel::VecTiling vecTiling = *reinterpret_cast(&tilingData); + MyCustomKernel::KernelSoftmax op; + op.Init(x, max, sum, z, vecTiling); if (TILING_KEY_IS(1)) { op.Process(); } diff --git a/examples/activation/softmax/kernel_impl/softmax_kernel.h b/examples/activation/softmax/kernel_impl/softmax_kernel.h index 5e75aed3..bac55031 100644 --- a/examples/activation/softmax/kernel_impl/softmax_kernel.h +++ b/examples/activation/softmax/kernel_impl/softmax_kernel.h @@ -12,7 +12,7 @@ #define EXAMPLES_ACTIVATION_SOFTMAX_KERNEL_H #include "kernel_operator.h" -namespace AscendC { +namespace MyCustomKernel { constexpr int32_t BUFFER_NUM
= 2; constexpr uint32_t FLOAT_NUM_OF_SINGEL_BLOCK = 8; constexpr uint32_t BASIC_BLOCK_ROW_FACTOR = 8; @@ -55,10 +55,10 @@ public: } __aicore__ inline void Init(GM_ADDR x, GM_ADDR max, GM_ADDR sum, GM_ADDR z, const VecTiling& tilingData) { - ASSERT(GetBlockNum() != 0 && "block dim can not be zero!"); + ASSERT(AscendC::GetBlockNum() != 0 && "block dim can not be zero!"); InitTiling(tilingData); - if (GetBlockIdx() == this->usedBlockDim) { // tail core + if (AscendC::GetBlockIdx() == this->usedBlockDim) { // tail core this->singleLoopCoreRowNum = this->tailCoreSingleLoopCoreRowNum; this->singleCoreLoopCount = this->tailCoreSingleCoreLoopCount; this->leftRow = this->tailCoreSingleCoreLoopTail; @@ -67,8 +67,8 @@ public: this->blockLength = this->coreRowNum * this->columnLength; this->msLength = this->coreRowNum * FLOAT_NUM_OF_SINGEL_BLOCK; // max sum length per block process - uint32_t offset1 = this->blockLength * GetBlockIdx(); - uint32_t offset2 = this->msLength * GetBlockIdx(); + uint32_t offset1 = this->blockLength * AscendC::GetBlockIdx(); + uint32_t offset2 = this->msLength * AscendC::GetBlockIdx(); xGm.SetGlobalBuffer((__gm__ float*)x + offset1, this->blockLength); zGm.SetGlobalBuffer((__gm__ float*)z + offset1, this->blockLength); @@ -89,7 +89,7 @@ public: __aicore__ inline void Process() { - if (GetBlockIdx() > this->usedBlockDim) { + if (AscendC::GetBlockIdx() > this->usedBlockDim) { return; } @@ -108,25 +108,25 @@ public: private: __aicore__ inline void CopyIn(int32_t progress, uint32_t rowNum) { - LocalTensor xLocal = queueX.AllocTensor(); - DataCopy(xLocal, xGm[progress * this->tileLength], rowNum * this->columnLength); + AscendC::LocalTensor xLocal = queueX.AllocTensor(); + AscendC::DataCopy(xLocal, xGm[progress * this->tileLength], rowNum * this->columnLength); queueX.EnQue(xLocal); } __aicore__ inline void Compute(int32_t progressm, uint32_t rowNum) { - LocalTensor xLocal = queueX.DeQue(); - LocalTensor maxLocal = queueMax.AllocTensor(); - LocalTensor 
sumLocal = queueSum.AllocTensor(); - LocalTensor tmpBuffer = sharedTmpBuffer.Get(); + AscendC::LocalTensor xLocal = queueX.DeQue(); + AscendC::LocalTensor maxLocal = queueMax.AllocTensor(); + AscendC::LocalTensor sumLocal = queueSum.AllocTensor(); + AscendC::LocalTensor tmpBuffer = sharedTmpBuffer.Get(); - SoftMaxShapeInfo srcShape = { rowNum, this->columnLength, rowNum, this->columnLength }; + AscendC::SoftMaxShapeInfo srcShape = { rowNum, this->columnLength, rowNum, this->columnLength }; if (rowNum % BASIC_BLOCK_ROW_FACTOR == 0 && this->columnLength % BASIC_BLOCK_COLUMN_FACTOR == 0 && this->columnLength < BASIC_BLOCK_MAX_COLUMN_LENGTH) { - SoftMax(xLocal, sumLocal, maxLocal, xLocal, tmpBuffer, softmaxTiling, srcShape); + AscendC::SoftMax(xLocal, sumLocal, maxLocal, xLocal, tmpBuffer, softmaxTiling, srcShape); } else { - SoftMax(xLocal, sumLocal, maxLocal, xLocal, tmpBuffer, softmaxTiling, srcShape); + AscendC::SoftMax(xLocal, sumLocal, maxLocal, xLocal, tmpBuffer, softmaxTiling, srcShape); } queueX.EnQue(xLocal); queueMax.EnQue(maxLocal); @@ -135,13 +135,13 @@ private: __aicore__ inline void CopyOut(int32_t progress, uint32_t rowNum) { - LocalTensor zLocal = queueX.DeQue(); - LocalTensor maxLocal = queueMax.DeQue(); - LocalTensor sumLocal = queueSum.DeQue(); + AscendC::LocalTensor zLocal = queueX.DeQue(); + AscendC::LocalTensor maxLocal = queueMax.DeQue(); + AscendC::LocalTensor sumLocal = queueSum.DeQue(); - DataCopy(zGm[progress * this->tileLength], zLocal, rowNum * this->columnLength); - DataCopy(maxGm[progress * this->msTileLength], maxLocal, rowNum * FLOAT_NUM_OF_SINGEL_BLOCK); - DataCopy(sumGm[progress * this->msTileLength], sumLocal, rowNum * FLOAT_NUM_OF_SINGEL_BLOCK); + AscendC::DataCopy(zGm[progress * this->tileLength], zLocal, rowNum * this->columnLength); + AscendC::DataCopy(maxGm[progress * this->msTileLength], maxLocal, rowNum * FLOAT_NUM_OF_SINGEL_BLOCK); + AscendC::DataCopy(sumGm[progress * this->msTileLength], sumLocal, rowNum * 
FLOAT_NUM_OF_SINGEL_BLOCK); queueX.FreeTensor(zLocal); queueMax.FreeTensor(maxLocal); @@ -149,14 +149,14 @@ private: } private: - TPipe pipe; - TBuf sharedTmpBuffer; - TQue queueX; - TQue queueMax, queueSum; - GlobalTensor xGm; - GlobalTensor maxGm; - GlobalTensor sumGm; - GlobalTensor zGm; + AscendC::TPipe pipe; + AscendC::TBuf sharedTmpBuffer; + AscendC::TQue queueX; + AscendC::TQue queueMax, queueSum; + AscendC::GlobalTensor xGm; + AscendC::GlobalTensor maxGm; + AscendC::GlobalTensor sumGm; + AscendC::GlobalTensor zGm; uint32_t blockLength = 0; uint32_t usedBlockDim = 0; diff --git a/examples/activation/softmax/kernel_launch_method_by_direct/softmax_custom.cpp b/examples/activation/softmax/kernel_launch_method_by_direct/softmax_custom.cpp index 51848ac2..bf6abed2 100644 --- a/examples/activation/softmax/kernel_launch_method_by_direct/softmax_custom.cpp +++ b/examples/activation/softmax/kernel_launch_method_by_direct/softmax_custom.cpp @@ -9,14 +9,13 @@ */ #include "../kernel_impl/softmax_kernel.h" -using namespace AscendC; -__aicore__ inline void CopyTiling(VecTiling* tiling, GM_ADDR tilingGM) +__aicore__ inline void CopyTiling(MyCustomKernel::VecTiling* tiling, GM_ADDR tilingGM) { uint32_t* ptr = reinterpret_cast(tiling); auto tiling32 = reinterpret_cast<__gm__ uint32_t*>(tilingGM); - for (int i = 0; i < sizeof(VecTiling) / sizeof(uint32_t); i++, ptr++) { + for (int i = 0; i < sizeof(MyCustomKernel::VecTiling) / sizeof(uint32_t); i++, ptr++) { *ptr = *(tiling32 + i); } return; @@ -28,10 +27,10 @@ extern "C" __global__ __aicore__ void softmax_custom(GM_ADDR x, GM_ADDR max, GM_ if ASCEND_IS_AIC { return; } - VecTiling tilingData; + MyCustomKernel::VecTiling tilingData; CopyTiling(&tilingData, tiling); - KernelSoftmax op; - op.Init(x, max, sum, z, tilingData); + MyCustomKernel::KernelSoftmax op; + op.Init(x, max, sum, z, tilingData); op.Process(); } diff --git a/examples/activation/softmax/kernel_launch_method_by_framework/op_kernel/softmax_custom.cpp
b/examples/activation/softmax/kernel_launch_method_by_framework/op_kernel/softmax_custom.cpp index e263b3d2..95854592 100644 --- a/examples/activation/softmax/kernel_launch_method_by_framework/op_kernel/softmax_custom.cpp +++ b/examples/activation/softmax/kernel_launch_method_by_framework/op_kernel/softmax_custom.cpp @@ -9,7 +9,6 @@ */ #include "../../../../../../kernel_impl/softmax_kernel.h" -using namespace AscendC; extern "C" __global__ __aicore__ void softmax_custom(GM_ADDR x, GM_ADDR max, GM_ADDR sum, GM_ADDR z, GM_ADDR workspace, GM_ADDR tiling) @@ -18,9 +17,9 @@ extern "C" __global__ __aicore__ void softmax_custom(GM_ADDR x, GM_ADDR max, GM_ return; } GET_TILING_DATA(tilingData, tiling); - VecTiling vecTiling = *reinterpret_cast(&tilingData); - KernelSoftmax op; - op.Init(x, max, sum, z, vecTiling); + MyCustomKernel::VecTiling vecTiling = *reinterpret_cast(&tilingData); + MyCustomKernel::KernelSoftmax op; + op.Init(x, max, sum, z, vecTiling); if (TILING_KEY_IS(1)) { op.Process(); } diff --git a/examples/activation/softmaxflash/kernel_impl/softmaxflash_kernel.h b/examples/activation/softmaxflash/kernel_impl/softmaxflash_kernel.h index 391e4fb3..0e4b3877 100644 --- a/examples/activation/softmaxflash/kernel_impl/softmaxflash_kernel.h +++ b/examples/activation/softmaxflash/kernel_impl/softmaxflash_kernel.h @@ -12,7 +12,7 @@ #define EXAMPLES_ACTIVATION_SOFTMAXFLASH_KERNEL_H #include "kernel_operator.h" -namespace AscendC { +namespace MyCustomKernel { constexpr int32_t BUFFER_NUM = 2; constexpr uint32_t FLOAT_NUM_OF_SINGEL_BLOCK = 8; constexpr uint32_t BASIC_BLOCK_ROW_FACTOR = 8; @@ -62,10 +62,10 @@ public: __aicore__ inline void Init(GM_ADDR x, GM_ADDR max, GM_ADDR sum, const VecTiling& tilingData) { - ASSERT(GetBlockNum() != 0 && "block dim can not be zero!"); + ASSERT(AscendC::GetBlockNum() != 0 && "block dim can not be zero!"); InitTiling(tilingData); - if (GetBlockIdx() == this->usedBlockDim) { // tail core + if (AscendC::GetBlockIdx() ==
this->usedBlockDim) { // tail core this->singleLoopCoreRowNum = this->tailCoreSingleLoopCoreRowNum; this->singleCoreLoopCount = this->tailCoreSingleCoreLoopCount; this->leftRow = this->tailCoreSingleCoreLoopTail; @@ -74,8 +74,8 @@ public: this->blockLength = this->coreRowNum * this->columnLength; this->msLength = this->coreRowNum * FLOAT_NUM_OF_SINGEL_BLOCK; // max sum length per block process - uint32_t offset1 = this->blockLength * GetBlockIdx(); - uint32_t offset2 = this->msLength * GetBlockIdx(); + uint32_t offset1 = this->blockLength * AscendC::GetBlockIdx(); + uint32_t offset2 = this->msLength * AscendC::GetBlockIdx(); xGm.SetGlobalBuffer((__gm__ float*)x + offset1, this->blockLength); maxGm.SetGlobalBuffer((__gm__ float*)max + offset2, this->msLength); @@ -97,7 +97,7 @@ public: __aicore__ inline void Process() { - if (GetBlockIdx() > this->usedBlockDim) { + if (AscendC::GetBlockIdx() > this->usedBlockDim) { return; } @@ -111,15 +111,15 @@ public: CopyIn(i, this->loopK, this->singleLoopCoreRowNum, this->tailK); Compute(i, this->loopK, this->singleLoopCoreRowNum, this->tailK); } - event_t eventIdVToMte3 = static_cast(GetTPipePtr()->FetchEventID(HardEvent::V_MTE3)); - SetFlag(eventIdVToMte3); - WaitFlag(eventIdVToMte3); + event_t eventIdVToMte3 = static_cast(GetTPipePtr()->FetchEventID(AscendC::HardEvent::V_MTE3)); + AscendC::SetFlag(eventIdVToMte3); + AscendC::WaitFlag(eventIdVToMte3); // copy max sum to gm CopyOut(i, this->msTileLength); } - event_t eventIdMte3ToV = static_cast(GetTPipePtr()->FetchEventID(HardEvent::MTE3_V)); - SetFlag(eventIdMte3ToV); - WaitFlag(eventIdMte3ToV); + event_t eventIdMte3ToV = static_cast(GetTPipePtr()->FetchEventID(AscendC::HardEvent::MTE3_V)); + AscendC::SetFlag(eventIdMte3ToV); + AscendC::WaitFlag(eventIdMte3ToV); if (this->leftRow > 0) { for (int32_t j = 0; j < this->loopK; j++) { // split K @@ -131,9 +131,9 @@ public: CopyIn(this->singleCoreLoopCount, this->loopK, this->leftRow, this->tailK); 
Compute(this->singleCoreLoopCount, this->loopK, this->leftRow, this->tailK); } - event_t eventIdVToMte3 = static_cast(GetTPipePtr()->FetchEventID(HardEvent::V_MTE3)); - SetFlag(eventIdVToMte3); - WaitFlag(eventIdVToMte3); + event_t eventIdVToMte3 = static_cast(GetTPipePtr()->FetchEventID(AscendC::HardEvent::V_MTE3)); + AscendC::SetFlag(eventIdVToMte3); + AscendC::WaitFlag(eventIdVToMte3); // copy max sum to gm uint32_t tailMsTileLength = this->leftRow * FLOAT_NUM_OF_SINGEL_BLOCK; CopyOut(this->singleCoreLoopCount, tailMsTileLength); @@ -143,10 +143,10 @@ public: private: __aicore__ inline void CopyIn(uint32_t rowIndex, uint32_t kIndex, uint32_t rowNum, uint32_t columnNum) { - LocalTensor xLocal = queueX.AllocTensor(); + AscendC::LocalTensor xLocal = queueX.AllocTensor(); uint32_t offset = this->singleLoopCoreRowNum * this->columnLength; for (uint32_t i = 0; i < rowNum; i++) { - DataCopy(xLocal[i * columnNum], xGm[rowIndex * offset + i * this->columnLength + kIndex * this->splitK], + AscendC::DataCopy(xLocal[i * columnNum], xGm[rowIndex * offset + i * this->columnLength + kIndex * this->splitK], columnNum); } queueX.EnQue(xLocal); @@ -154,28 +154,28 @@ private: __aicore__ inline void Compute(uint32_t rowIndex, uint32_t columnIndex, uint32_t rowNum, uint32_t columnNum) { - LocalTensor xLocal = queueX.DeQue(); - LocalTensor tmpBuffer = sharedTmpBuffer.Get(); + AscendC::LocalTensor xLocal = queueX.DeQue(); + AscendC::LocalTensor tmpBuffer = sharedTmpBuffer.Get(); - SoftMaxShapeInfo srcShape = { rowNum, columnNum, rowNum, columnNum }; + AscendC::SoftMaxShapeInfo srcShape = { rowNum, columnNum, rowNum, columnNum }; if (columnIndex == 0) { // isUpdate == false if (rowNum % BASIC_BLOCK_ROW_FACTOR == 0 && columnNum % BASIC_BLOCK_COLUMN_FACTOR == 0 && columnNum < BASIC_BLOCK_MAX_COLUMN_LENGTH) { - SoftmaxFlashV2(xLocal, sumLocal, maxLocal, xLocal, expmaxLocal, sumLocal, + AscendC::SoftmaxFlashV2(xLocal, sumLocal, maxLocal, xLocal, expmaxLocal, sumLocal, maxLocal, tmpBuffer, 
softmaxTiling, srcShape); } else { - SoftmaxFlashV2(xLocal, sumLocal, maxLocal, xLocal, expmaxLocal, sumLocal, + AscendC::SoftmaxFlashV2(xLocal, sumLocal, maxLocal, xLocal, expmaxLocal, sumLocal, maxLocal, tmpBuffer, softmaxTiling, srcShape); } } else { if (rowNum % BASIC_BLOCK_ROW_FACTOR == 0 && columnNum % BASIC_BLOCK_COLUMN_FACTOR == 0 && columnNum < BASIC_BLOCK_MAX_COLUMN_LENGTH) { - SoftmaxFlashV2(xLocal, sumLocal, maxLocal, xLocal, expmaxLocal, sumLocal, + AscendC::SoftmaxFlashV2(xLocal, sumLocal, maxLocal, xLocal, expmaxLocal, sumLocal, maxLocal, tmpBuffer, softmaxTiling, srcShape); } else { - SoftmaxFlashV2(xLocal, sumLocal, maxLocal, xLocal, expmaxLocal, sumLocal, + AscendC::SoftmaxFlashV2(xLocal, sumLocal, maxLocal, xLocal, expmaxLocal, sumLocal, maxLocal, tmpBuffer, softmaxTiling, srcShape); } } @@ -184,21 +184,21 @@ private: __aicore__ inline void CopyOut(uint32_t progress, uint32_t count) { - DataCopy(maxGm[progress * this->msTileLength], maxLocal, count); - DataCopy(sumGm[progress * this->msTileLength], sumLocal, count); + AscendC::DataCopy(maxGm[progress * this->msTileLength], maxLocal, count); + AscendC::DataCopy(sumGm[progress * this->msTileLength], sumLocal, count); } private: - TPipe pipe; - TBuf sharedTmpBuffer; - TQue queueX; - TBuf tbufMax, tbufSum, tbufExpmax; - GlobalTensor xGm; - GlobalTensor maxGm; - GlobalTensor sumGm; - LocalTensor maxLocal; - LocalTensor sumLocal; - LocalTensor expmaxLocal; + AscendC::TPipe pipe; + AscendC::TBuf sharedTmpBuffer; + AscendC::TQue queueX; + AscendC::TBuf tbufMax, tbufSum, tbufExpmax; + AscendC::GlobalTensor xGm; + AscendC::GlobalTensor maxGm; + AscendC::GlobalTensor sumGm; + AscendC::LocalTensor maxLocal; + AscendC::LocalTensor sumLocal; + AscendC::LocalTensor expmaxLocal; uint32_t blockLength = 0; uint32_t usedBlockDim = 0; diff --git a/examples/activation/softmaxflash/kernel_launch_method_by_direct/softmaxflash_custom.cpp
b/examples/activation/softmaxflash/kernel_launch_method_by_direct/softmaxflash_custom.cpp index cfcfa32d..e3ee128c 100644 --- a/examples/activation/softmaxflash/kernel_launch_method_by_direct/softmaxflash_custom.cpp +++ b/examples/activation/softmaxflash/kernel_launch_method_by_direct/softmaxflash_custom.cpp @@ -9,14 +9,13 @@ */ #include "../kernel_impl/softmaxflash_kernel.h" -using namespace AscendC; -__aicore__ inline void CopyTiling(VecTiling* tiling, GM_ADDR tilingGM) +__aicore__ inline void CopyTiling(MyCustomKernel::VecTiling* tiling, GM_ADDR tilingGM) { uint32_t* ptr = reinterpret_cast(tiling); auto tiling32 = reinterpret_cast<__gm__ uint32_t*>(tilingGM); - for (int i = 0; i < sizeof(VecTiling) / sizeof(uint32_t); i++, ptr++) { + for (int i = 0; i < sizeof(MyCustomKernel::VecTiling) / sizeof(uint32_t); i++, ptr++) { *ptr = *(tiling32 + i); } return; @@ -28,10 +27,10 @@ extern "C" __global__ __aicore__ void softmaxflash_custom(GM_ADDR x, GM_ADDR max if ASCEND_IS_AIC { return; } - VecTiling tilingData; + MyCustomKernel::VecTiling tilingData; CopyTiling(&tilingData, tiling); - KernelSoftmax op; - op.Init(x, max, sum, tilingData); + MyCustomKernel::KernelSoftmax op; + op.Init(x, max, sum, tilingData); op.Process(); } diff --git a/examples/activation/softmaxflash/kernel_launch_method_by_framework/op_kernel/softmaxflash_custom.cpp b/examples/activation/softmaxflash/kernel_launch_method_by_framework/op_kernel/softmaxflash_custom.cpp index c4d69de5..019c856a 100644 --- a/examples/activation/softmaxflash/kernel_launch_method_by_framework/op_kernel/softmaxflash_custom.cpp +++ b/examples/activation/softmaxflash/kernel_launch_method_by_framework/op_kernel/softmaxflash_custom.cpp @@ -9,7 +9,6 @@ */ #include "../../../../../../kernel_impl/softmaxflash_kernel.h" -using namespace AscendC; extern "C" __global__ __aicore__ void softmaxflash_custom(GM_ADDR x, GM_ADDR max, GM_ADDR sum, GM_ADDR workspace, GM_ADDR tiling) @@ -18,9 +17,9 @@ extern "C" __global__
__aicore__ void softmaxflash_custom(GM_ADDR x, GM_ADDR max return; } GET_TILING_DATA(tilingData, tiling); - VecTiling vecTiling = *reinterpret_cast(&tilingData); - KernelSoftmax op; - op.Init(x, max, sum, vecTiling); + MyCustomKernel::VecTiling vecTiling = *reinterpret_cast(&tilingData); + MyCustomKernel::KernelSoftmax op; + op.Init(x, max, sum, vecTiling); if (TILING_KEY_IS(1)) { op.Process(); } diff --git a/examples/activation/softmaxgrad/kernel_impl/softmaxgrad_kernel.h b/examples/activation/softmaxgrad/kernel_impl/softmaxgrad_kernel.h index 4f5c10a4..d8f1c25f 100644 --- a/examples/activation/softmaxgrad/kernel_impl/softmaxgrad_kernel.h +++ b/examples/activation/softmaxgrad/kernel_impl/softmaxgrad_kernel.h @@ -12,7 +12,7 @@ #define EXAMPLES_ACTIVATION_SOFTMAXGRAD_KERNEL_H #include "kernel_operator.h" -namespace AscendC { +namespace MyCustomKernel { constexpr int32_t BUFFER_NUM = 1; struct VecTiling { @@ -51,17 +51,17 @@ public: } __aicore__ inline void Init(GM_ADDR x, GM_ADDR y, GM_ADDR z, const VecTiling& tiling) { - ASSERT(GetBlockNum() != 0 && "block dim can not be zero!"); + ASSERT(AscendC::GetBlockNum() != 0 && "block dim can not be zero!"); InitTiling(tiling); - if (GetBlockIdx() == this->usedBlockDim) { // tail core + if (AscendC::GetBlockIdx() == this->usedBlockDim) { // tail core this->singleLoopCoreRowNum = this->tailCoreSingleLoopCoreRowNum; this->singleCoreLoopCount = this->tailCoreSingleCoreLoopCount; this->leftRow = this->tailCoreSingleCoreLoopTail; } this->blockLength = this->coreRowNum * this->columnLength; - uint32_t offset1 = this->blockLength * GetBlockIdx(); + uint32_t offset1 = this->blockLength * AscendC::GetBlockIdx(); xGm.SetGlobalBuffer((__gm__ float*)x + offset1, this->blockLength); yGm.SetGlobalBuffer((__gm__ float*)y + offset1, this->blockLength); @@ -76,7 +76,7 @@ public: __aicore__ inline void Process() { - if (GetBlockIdx() > usedBlockDim) { + if (AscendC::GetBlockIdx() > usedBlockDim) { return; } @@ -85,9 +85,9 @@
Compute(i, this->singleLoopCoreRowNum); CopyOut(i, this->singleLoopCoreRowNum); } - event_t eventIdMte3ToMte2 = static_cast(GetTPipePtr()->FetchEventID(HardEvent::MTE3_MTE2)); - SetFlag(eventIdMte3ToMte2); - WaitFlag(eventIdMte3ToMte2); + event_t eventIdMte3ToMte2 = static_cast(GetTPipePtr()->FetchEventID(AscendC::HardEvent::MTE3_MTE2)); + AscendC::SetFlag(eventIdMte3ToMte2); + AscendC::WaitFlag(eventIdMte3ToMte2); if (this->leftRow > 0) { CopyIn(this->singleCoreLoopCount, this->leftRow); Compute(this->singleCoreLoopCount, this->leftRow); @@ -98,22 +98,22 @@ public: private: __aicore__ inline void CopyIn(int32_t progress, uint32_t rowNum) { - LocalTensor xLocal = queueX.AllocTensor(); - LocalTensor yLocal = queueY.AllocTensor(); - DataCopy(xLocal, xGm[progress * this->tileLength], rowNum * this->columnLength); - DataCopy(yLocal, yGm[progress * this->tileLength], rowNum * this->columnLength); + AscendC::LocalTensor xLocal = queueX.AllocTensor(); + AscendC::LocalTensor yLocal = queueY.AllocTensor(); + AscendC::DataCopy(xLocal, xGm[progress * this->tileLength], rowNum * this->columnLength); + AscendC::DataCopy(yLocal, yGm[progress * this->tileLength], rowNum * this->columnLength); queueX.EnQue(xLocal); queueY.EnQue(yLocal); } __aicore__ inline void Compute(int32_t progressm, uint32_t rowNum) { - LocalTensor xLocal = queueX.DeQue(); - LocalTensor yLocal = queueY.DeQue(); - LocalTensor tmpBuffer = sharedTmpBuffer.Get(); + AscendC::LocalTensor xLocal = queueX.DeQue(); + AscendC::LocalTensor yLocal = queueY.DeQue(); + AscendC::LocalTensor tmpBuffer = sharedTmpBuffer.Get(); - SoftMaxShapeInfo srcShape = { rowNum, this->columnLength, rowNum, this->columnLength }; - SoftmaxGrad(yLocal, xLocal, yLocal, tmpBuffer, softmaxTiling, false, srcShape); + AscendC::SoftMaxShapeInfo srcShape = { rowNum, this->columnLength, rowNum, this->columnLength }; + AscendC::SoftmaxGrad(yLocal, xLocal, yLocal, tmpBuffer, softmaxTiling, false, srcShape); queueY.EnQue(yLocal); 
queueX.FreeTensor(xLocal); @@ -121,19 +121,19 @@ private: __aicore__ inline void CopyOut(int32_t progress, uint32_t rowNum) { - LocalTensor zLocal = queueY.DeQue(); - DataCopy(zGm[progress * this->tileLength], zLocal, rowNum * this->columnLength); + AscendC::LocalTensor zLocal = queueY.DeQue(); + AscendC::DataCopy(zGm[progress * this->tileLength], zLocal, rowNum * this->columnLength); queueY.FreeTensor(zLocal); } private: - TPipe pipe; - TBuf sharedTmpBuffer; - TQue queueX; - TQue queueY; - GlobalTensor xGm; - GlobalTensor yGm; - GlobalTensor zGm; + AscendC::TPipe pipe; + AscendC::TBuf sharedTmpBuffer; + AscendC::TQue queueX; + AscendC::TQue queueY; + AscendC::GlobalTensor xGm; + AscendC::GlobalTensor yGm; + AscendC::GlobalTensor zGm; uint32_t blockLength = 0; uint32_t usedBlockDim = 0; diff --git a/examples/activation/softmaxgrad/kernel_launch_method_by_direct/softmaxgrad_custom.cpp b/examples/activation/softmaxgrad/kernel_launch_method_by_direct/softmaxgrad_custom.cpp index 2ab23bb2..febf4b5e 100644 --- a/examples/activation/softmaxgrad/kernel_launch_method_by_direct/softmaxgrad_custom.cpp +++ b/examples/activation/softmaxgrad/kernel_launch_method_by_direct/softmaxgrad_custom.cpp @@ -9,14 +9,13 @@ */ #include "../kernel_impl/softmaxgrad_kernel.h" -using namespace AscendC; -__aicore__ inline void CopyTiling(VecTiling* tiling, GM_ADDR tilingGM) +__aicore__ inline void CopyTiling(MyCustomKernel::VecTiling* tiling, GM_ADDR tilingGM) { uint32_t* ptr = reinterpret_cast(tiling); auto tiling32 = reinterpret_cast<__gm__ uint32_t*>(tilingGM); - for (int i = 0; i < sizeof(VecTiling) / sizeof(uint32_t); i++, ptr++) { + for (int i = 0; i < sizeof(MyCustomKernel::VecTiling) / sizeof(uint32_t); i++, ptr++) { *ptr = *(tiling32 + i); } return; @@ -28,9 +27,9 @@ extern "C" __global__ __aicore__ void softmaxgrad_custom(GM_ADDR x, GM_ADDR y, G if ASCEND_IS_AIC { return; } - VecTiling tilingData; + MyCustomKernel::VecTiling tilingData; CopyTiling(&tilingData, tiling); - KernelSoftmax 
op; + MyCustomKernel::KernelSoftmax op; op.Init(x, y, z, tilingData); op.Process(); } diff --git a/examples/activation/softmaxgrad/kernel_launch_method_by_framework/op_kernel/softmaxgrad_custom.cpp b/examples/activation/softmaxgrad/kernel_launch_method_by_framework/op_kernel/softmaxgrad_custom.cpp index ef21e9b7..d466c81b 100644 --- a/examples/activation/softmaxgrad/kernel_launch_method_by_framework/op_kernel/softmaxgrad_custom.cpp +++ b/examples/activation/softmaxgrad/kernel_launch_method_by_framework/op_kernel/softmaxgrad_custom.cpp @@ -9,7 +9,6 @@ */ #include "../../../../../../kernel_impl/softmaxgrad_kernel.h" -using namespace AscendC; extern "C" __global__ __aicore__ void softmaxgrad_custom(GM_ADDR x, GM_ADDR y, GM_ADDR z, GM_ADDR workspace, GM_ADDR tiling) @@ -18,9 +17,9 @@ extern "C" __global__ __aicore__ void softmaxgrad_custom(GM_ADDR x, GM_ADDR y, G return; } GET_TILING_DATA(tilingData, tiling); - VecTiling vecTiling = *reinterpret_cast(&tilingData); - KernelSoftmax op; - op.Init(x, y, z, vecTiling); + MyCustomKernel::VecTiling vecTiling = *reinterpret_cast(&tilingData); + MyCustomKernel::KernelSoftmax op; + op.Init(x, y, z, vecTiling); if (TILING_KEY_IS(1)) { op.Process(); } diff --git a/examples/activation/softmaxgradfront/kernel_impl/softmaxgradfront_kernel.h b/examples/activation/softmaxgradfront/kernel_impl/softmaxgradfront_kernel.h index 94a0659a..8a3fb2f2 100644 --- a/examples/activation/softmaxgradfront/kernel_impl/softmaxgradfront_kernel.h +++ b/examples/activation/softmaxgradfront/kernel_impl/softmaxgradfront_kernel.h @@ -11,7 +11,7 @@ #ifndef EXAMPLES_ACTIVATION_SOFTMAXGRADFRONT_KERNEL_H #define EXAMPLES_ACTIVATION_SOFTMAXGRADFRONT_KERNEL_H #include "kernel_operator.h" -namespace AscendC { +namespace MyCustomKernel { constexpr int32_t BUFFER_NUM = 1; constexpr uint32_t FLOAT_NUM_OF_SINGEL_BLOCK = 8; constexpr uint32_t BASIC_BLOCK_ROW_FACTOR = 8; @@ -53,18 +53,18 @@ public: } __aicore__ inline void Init(GM_ADDR x, GM_ADDR y, GM_ADDR z,
const VecTiling& tiling) { - ASSERT(GetBlockNum() != 0 && "block dim can not be zero!"); + ASSERT(AscendC::GetBlockNum() != 0 && "block dim can not be zero!"); InitTiling(tiling); - if (GetBlockIdx() == this->usedBlockDim) { // tail core + if (AscendC::GetBlockIdx() == this->usedBlockDim) { // tail core this->singleLoopCoreRowNum = this->tailCoreSingleLoopCoreRowNum; this->singleCoreLoopCount = this->tailCoreSingleCoreLoopCount; this->leftRow = this->tailCoreSingleCoreLoopTail; } this->blockLength = this->coreRowNum * this->columnLength; - uint32_t offset1 = this->blockLength * GetBlockIdx(); - uint32_t offset2 = this->coreRowNum * FLOAT_NUM_OF_SINGEL_BLOCK * GetBlockIdx(); + uint32_t offset1 = this->blockLength * AscendC::GetBlockIdx(); + uint32_t offset2 = this->coreRowNum * FLOAT_NUM_OF_SINGEL_BLOCK * AscendC::GetBlockIdx(); xGm.SetGlobalBuffer((__gm__ float*)x + offset1, this->blockLength); yGm.SetGlobalBuffer((__gm__ float*)y + offset1, this->blockLength); @@ -80,7 +80,7 @@ public: __aicore__ inline void Process() { - if (GetBlockIdx() > usedBlockDim) { + if (AscendC::GetBlockIdx() > usedBlockDim) { return; } @@ -100,28 +100,28 @@ public: private: __aicore__ inline void CopyIn(int32_t progress, uint32_t rowNum) { - LocalTensor xLocal = queueX.AllocTensor(); - LocalTensor yLocal = queueY.AllocTensor(); - DataCopy(xLocal, xGm[progress * this->tileLength], rowNum * this->columnLength); - DataCopy(yLocal, yGm[progress * this->tileLength], rowNum * this->columnLength); + AscendC::LocalTensor xLocal = queueX.AllocTensor(); + AscendC::LocalTensor yLocal = queueY.AllocTensor(); + AscendC::DataCopy(xLocal, xGm[progress * this->tileLength], rowNum * this->columnLength); + AscendC::DataCopy(yLocal, yGm[progress * this->tileLength], rowNum * this->columnLength); queueX.EnQue(xLocal); queueY.EnQue(yLocal); } __aicore__ inline void Compute(int32_t progressm, uint32_t rowNum) { - LocalTensor xLocal = queueX.DeQue(); - LocalTensor yLocal = queueY.DeQue(); - LocalTensor zLocal 
= queueZ.AllocTensor(); - LocalTensor tmpBuffer = sharedTmpBuffer.Get(); + AscendC::LocalTensor xLocal = queueX.DeQue(); + AscendC::LocalTensor yLocal = queueY.DeQue(); + AscendC::LocalTensor zLocal = queueZ.AllocTensor(); + AscendC::LocalTensor tmpBuffer = sharedTmpBuffer.Get(); - SoftMaxShapeInfo srcShape = { rowNum, this->columnLength, rowNum, this->columnLength }; + AscendC::SoftMaxShapeInfo srcShape = { rowNum, this->columnLength, rowNum, this->columnLength }; if (this->singleLoopCoreRowNum % BASIC_BLOCK_ROW_FACTOR == 0 && this->columnLength % BASIC_BLOCK_COLUMN_FACTOR == 0 && this->columnLength < BASIC_BLOCK_MAX_COLUMN_LENGTH) { - SoftmaxGradFront(zLocal, xLocal, yLocal, tmpBuffer, softmaxTiling, srcShape); + AscendC::SoftmaxGradFront(zLocal, xLocal, yLocal, tmpBuffer, softmaxTiling, srcShape); } else { - SoftmaxGradFront(zLocal, xLocal, yLocal, tmpBuffer, softmaxTiling, srcShape); + AscendC::SoftmaxGradFront(zLocal, xLocal, yLocal, tmpBuffer, softmaxTiling, srcShape); } queueZ.EnQue(zLocal); @@ -131,21 +131,21 @@ private: __aicore__ inline void CopyOut(int32_t progress, uint32_t rowNum) { - LocalTensor zLocal = queueZ.DeQue(); - DataCopy(zGm[progress * this->singleLoopCoreRowNum * FLOAT_NUM_OF_SINGEL_BLOCK], zLocal, + AscendC::LocalTensor zLocal = queueZ.DeQue(); + AscendC::DataCopy(zGm[progress * this->singleLoopCoreRowNum * FLOAT_NUM_OF_SINGEL_BLOCK], zLocal, rowNum * FLOAT_NUM_OF_SINGEL_BLOCK); queueZ.FreeTensor(zLocal); } private: - TPipe pipe; - TBuf sharedTmpBuffer; - TQue queueX; - TQue queueY; - TQue queueZ; - GlobalTensor xGm; - GlobalTensor yGm; - GlobalTensor zGm; + AscendC::TPipe pipe; + AscendC::TBuf sharedTmpBuffer; + AscendC::TQue queueX; + AscendC::TQue queueY; + AscendC::TQue queueZ; + AscendC::GlobalTensor xGm; + AscendC::GlobalTensor yGm; + AscendC::GlobalTensor zGm; uint32_t blockLength = 0; uint32_t usedBlockDim = 0; diff --git a/examples/activation/softmaxgradfront/kernel_launch_method_by_direct/softmaxgradfront_custom.cpp 
b/examples/activation/softmaxgradfront/kernel_launch_method_by_direct/softmaxgradfront_custom.cpp index c4a19c76..18785725 100644 --- a/examples/activation/softmaxgradfront/kernel_launch_method_by_direct/softmaxgradfront_custom.cpp +++ b/examples/activation/softmaxgradfront/kernel_launch_method_by_direct/softmaxgradfront_custom.cpp @@ -9,14 +9,13 @@ */ #include "../kernel_impl/softmaxgradfront_kernel.h" -using namespace AscendC; -__aicore__ inline void CopyTiling(VecTiling* tiling, GM_ADDR tilingGM) +__aicore__ inline void CopyTiling(MyCustomKernel::VecTiling* tiling, GM_ADDR tilingGM) { uint32_t* ptr = reinterpret_cast(tiling); auto tiling32 = reinterpret_cast<__gm__ uint32_t*>(tilingGM); - for (int i = 0; i < sizeof(VecTiling) / sizeof(uint32_t); i++, ptr++) { + for (int i = 0; i < sizeof(MyCustomKernel::VecTiling) / sizeof(uint32_t); i++, ptr++) { *ptr = *(tiling32 + i); } return; @@ -28,10 +27,10 @@ extern "C" __global__ __aicore__ void softmaxgradfront_custom(GM_ADDR x, GM_ADDR if ASCEND_IS_AIC { return; } - VecTiling tilingData; + MyCustomKernel::VecTiling tilingData; CopyTiling(&tilingData, tiling); - KernelSoftmax op; - op.Init(x, y, z, tilingData); + MyCustomKernel::KernelSoftmax op; + op.Init(x, y, z, tilingData); op.Process(); } diff --git a/examples/activation/softmaxgradfront/kernel_launch_method_by_framework/op_kernel/softmaxgradfront_custom.cpp b/examples/activation/softmaxgradfront/kernel_launch_method_by_framework/op_kernel/softmaxgradfront_custom.cpp index ad346e4b..3557e4c2 100644 --- a/examples/activation/softmaxgradfront/kernel_launch_method_by_framework/op_kernel/softmaxgradfront_custom.cpp +++ b/examples/activation/softmaxgradfront/kernel_launch_method_by_framework/op_kernel/softmaxgradfront_custom.cpp @@ -8,7 +8,6 @@ * See LICENSE in the root of the software repository for the full text of the License.
*/ #include "../../../../../../kernel_impl/softmaxgradfront_kernel.h" -using namespace AscendC; extern "C" __global__ __aicore__ void softmaxgradfront_custom(GM_ADDR x, GM_ADDR y, GM_ADDR z, GM_ADDR workspace, GM_ADDR tiling) @@ -17,9 +16,9 @@ extern "C" __global__ __aicore__ void softmaxgradfront_custom(GM_ADDR x, GM_ADDR if ASCEND_IS_AIC { return; } GET_TILING_DATA(tilingData, tiling); - VecTiling vecTiling = *reinterpret_cast(&tilingData); - KernelSoftmax op; - op.Init(x, y, z, vecTiling); + MyCustomKernel::VecTiling vecTiling = *reinterpret_cast(&tilingData); + MyCustomKernel::KernelSoftmax op; + op.Init(x, y, z, vecTiling); if (TILING_KEY_IS(1)) { op.Process(); } diff --git a/examples/matrix/batch_matmul/kernel_impl/batch_matmul_custom_impl.h b/examples/matrix/batch_matmul/kernel_impl/batch_matmul_custom_impl.h index be0b4aaa..eab9dfd0 100644 --- a/examples/matrix/batch_matmul/kernel_impl/batch_matmul_custom_impl.h +++ b/examples/matrix/batch_matmul/kernel_impl/batch_matmul_custom_impl.h @@ -13,7 +13,6 @@ #include "kernel_operator.h" #include "lib/matmul_intf.h" -using namespace AscendC; using namespace matmul; constexpr int USED_CORE_NUM = 2; @@ -24,7 +23,7 @@ class BatchMatmulKernel { __aicore__ inline BatchMatmulKernel(){}; __aicore__ inline void Init(GM_ADDR a, GM_ADDR b, GM_ADDR bias, GM_ADDR c, GM_ADDR workspace, const TCubeTiling& tiling); template - __aicore__ inline void Process(TPipe* pipe, int32_t batchA, int32_t batchB); + __aicore__ inline void Process(AscendC::TPipe* pipe, int32_t batchA, int32_t batchB); Matmul matmulObj; private: __aicore__ inline void CalcOffset(int32_t blockIdx, const TCubeTiling& tiling, int32_t& offsetA, int32_t& offsetB,
AscendC::GlobalTensor cGlobal; + AscendC::GlobalTensor biasGlobal; TCubeTiling tiling; }; @@ -59,7 +58,7 @@ __aicore__ inline void BatchMatmulKernel::Ini int32_t offsetB = 0; int32_t offsetC = 0; int32_t offsetBias = 0; - CalcOffset(GetBlockIdx(), tiling, offsetA, offsetB, offsetC, offsetBias); + CalcOffset(AscendC::GetBlockIdx(), tiling, offsetA, offsetB, offsetC, offsetBias); aGlobal = aGlobal[offsetA]; bGlobal = bGlobal[offsetB]; cGlobal = cGlobal[offsetC]; @@ -71,7 +70,7 @@ __aicore__ inline void BatchMatmulKernel::Ini template template -__aicore__ inline void BatchMatmulKernel::Process(TPipe* pipe, int32_t batchA, int32_t batchB) +__aicore__ inline void BatchMatmulKernel::Process(AscendC::TPipe* pipe, int32_t batchA, int32_t batchB) { int batchC = batchA > batchB ? batchA : batchB; int gLay = tiling.ALayoutInfoG > tiling.BLayoutInfoG ? tiling.ALayoutInfoG : tiling.BLayoutInfoG; diff --git a/examples/matrix/batch_matmul/kernel_launch_method_by_direct/batch_matmul_custom.cpp b/examples/matrix/batch_matmul/kernel_launch_method_by_direct/batch_matmul_custom.cpp index d115a310..b1abfcd9 100644 --- a/examples/matrix/batch_matmul/kernel_launch_method_by_direct/batch_matmul_custom.cpp +++ b/examples/matrix/batch_matmul/kernel_launch_method_by_direct/batch_matmul_custom.cpp @@ -12,7 +12,6 @@ #include "lib/matmul_intf.h" #include "../kernel_impl/batch_matmul_custom_impl.h" -using namespace AscendC; using namespace matmul; constexpr int32_t FULL_L1_SIZE = 512 * 1024; @@ -39,7 +38,7 @@ extern "C" __global__ __aicore__ void batch_matmul_custom(GM_ADDR a, GM_ADDR b, typedef MatmulType C_TYPE; typedef MatmulType BIAS_TYPE; BatchMatmulKernel batchMatmulKernel; - TPipe pipe; + AscendC::TPipe pipe; tiling.shareMode = 0; // 0, share mode tiling.shareL1Size = FULL_L1_SIZE; // full L1 tiling.shareL0CSize = FULL_L0C_SIZE; // full L0C diff --git a/examples/matrix/batch_matmul/kernel_launch_method_by_framework/op_host/batch_matmul_custom.cpp 
b/examples/matrix/batch_matmul/kernel_launch_method_by_framework/op_host/batch_matmul_custom.cpp index b7ceb70d..ff053d11 100644 --- a/examples/matrix/batch_matmul/kernel_launch_method_by_framework/op_host/batch_matmul_custom.cpp +++ b/examples/matrix/batch_matmul/kernel_launch_method_by_framework/op_host/batch_matmul_custom.cpp @@ -28,7 +28,7 @@ static ge::graphStatus TilingFunc(gert::TilingContext* context) tiling.SaveToBuffer(context->GetRawTilingData()->GetData(), context->GetRawTilingData()->GetCapacity()); context->GetRawTilingData()->SetDataSize(tiling.GetDataSize()); size_t userWorkspaceSize = 0; - size_t systemWorkspaceSize = ascendcPlatform.GetLibApiWorkSpaceSize(); + size_t systemWorkspaceSize = static_cast(ascendcPlatform.GetLibApiWorkSpaceSize()); size_t *currentWorkspace = context->GetWorkspaceSizes(1); currentWorkspace[0] = userWorkspaceSize + systemWorkspaceSize; diff --git a/examples/matrix/batch_matmul/kernel_launch_method_by_framework/op_kernel/batch_matmul_custom.cpp b/examples/matrix/batch_matmul/kernel_launch_method_by_framework/op_kernel/batch_matmul_custom.cpp index ff9547f9..a653a0f5 100644 --- a/examples/matrix/batch_matmul/kernel_launch_method_by_framework/op_kernel/batch_matmul_custom.cpp +++ b/examples/matrix/batch_matmul/kernel_launch_method_by_framework/op_kernel/batch_matmul_custom.cpp @@ -12,7 +12,6 @@ #include "lib/matmul_intf.h" #include "../../../../../../kernel_impl/batch_matmul_custom_impl.h" -using namespace AscendC; using namespace matmul; constexpr int32_t FULL_L1_SIZE = 512 * 1024; @@ -25,7 +24,7 @@ extern "C" __global__ __aicore__ void batch_matmul_custom(GM_ADDR a, GM_ADDR b, typedef MatmulType C_TYPE; typedef MatmulType BIAS_TYPE; BatchMatmulKernel batchMatmulKernel; - TPipe pipe; + AscendC::TPipe pipe; tilingData.cubeTilingData.shareMode = 0; // 0, share mode tilingData.cubeTilingData.shareL1Size = FULL_L1_SIZE; // full L1 tilingData.cubeTilingData.shareL0CSize = FULL_L0C_SIZE; // full L0C diff --git 
a/examples/matrix/matmul/kernel_impl/matmul_custom_impl.h b/examples/matrix/matmul/kernel_impl/matmul_custom_impl.h index ef815c69..c7904015 100644 --- a/examples/matrix/matmul/kernel_impl/matmul_custom_impl.h +++ b/examples/matrix/matmul/kernel_impl/matmul_custom_impl.h @@ -13,7 +13,6 @@ #include "kernel_operator.h" #include "lib/matmul_intf.h" -using namespace AscendC; using namespace matmul; template @@ -22,7 +21,7 @@ class MatmulKernel { __aicore__ inline MatmulKernel(){}; __aicore__ inline void Init(GM_ADDR a, GM_ADDR b, GM_ADDR bias, GM_ADDR c, GM_ADDR workspace, const TCubeTiling& tiling); template - __aicore__ inline void Process(TPipe* pipe); + __aicore__ inline void Process(AscendC::TPipe* pipe); Matmul, MatmulType, MatmulType, @@ -32,10 +31,10 @@ class MatmulKernel { __aicore__ inline void CalcOffset(int32_t blockIdx, const TCubeTiling& tiling, int32_t& offsetA, int32_t& offsetB, int32_t& offsetC, int32_t& offsetBias, bool isAtrans, bool isBtrans); - GlobalTensor aGlobal; - GlobalTensor bGlobal; - GlobalTensor cGlobal; - GlobalTensor biasGlobal; + AscendC::GlobalTensor aGlobal; + AscendC::GlobalTensor bGlobal; + AscendC::GlobalTensor cGlobal; + AscendC::GlobalTensor biasGlobal; TCubeTiling tiling; }; @@ -55,7 +54,7 @@ __aicore__ inline void MatmulKernel::Init(GM_ADDR int32_t offsetBias = 0; bool isAtrans = true; bool isBtrans = false; - CalcOffset(GetBlockIdx(), tiling, offsetA, offsetB, offsetC, offsetBias, isAtrans, isBtrans); + CalcOffset(AscendC::GetBlockIdx(), tiling, offsetA, offsetB, offsetC, offsetBias, isAtrans, isBtrans); aGlobal = aGlobal[offsetA]; bGlobal = bGlobal[offsetB]; cGlobal = cGlobal[offsetC]; @@ -67,11 +66,11 @@ __aicore__ inline void MatmulKernel::Init(GM_ADDR template template -__aicore__ inline void MatmulKernel::Process(TPipe* pipe) +__aicore__ inline void MatmulKernel::Process(AscendC::TPipe* pipe) { if constexpr (setTmpSpace) { - TBuf<> tmpMMFormatUb; - LocalTensor mmformatUb; + AscendC::TBuf<> tmpMMFormatUb; + 
AscendC::LocalTensor mmformatUb; pipe->InitBuffer(tmpMMFormatUb, TOTAL_VEC_LOCAL_SIZE); mmformatUb = tmpMMFormatUb.Get(TOTAL_VEC_LOCAL_SIZE); matmulObj.SetLocalWorkspace(mmformatUb); @@ -86,11 +85,16 @@ __aicore__ inline void MatmulKernel::Process(TPip matmulObj.End(); } +__aicore__ inline uint32_t Ceiling(uint32_t a, uint32_t b) +{ + return (a + b - 1) / b; +} + template __aicore__ inline void MatmulKernel::CalcOffset(int32_t blockIdx, const TCubeTiling& tiling, int32_t& offsetA, int32_t& offsetB, int32_t& offsetC, int32_t& offsetBias, bool isAtrans, bool isBtrans) { - auto mSingleBlocks = Ceil(tiling.M, tiling.singleCoreM); + auto mSingleBlocks = Ceiling(tiling.M, tiling.singleCoreM); auto mCoreIndx = blockIdx % mSingleBlocks; auto nCoreIndx = blockIdx / mSingleBlocks; diff --git a/examples/matrix/matmul/kernel_launch_method_by_direct/matmul_custom.cpp b/examples/matrix/matmul/kernel_launch_method_by_direct/matmul_custom.cpp index be66d0e6..a274b83f 100644 --- a/examples/matrix/matmul/kernel_launch_method_by_direct/matmul_custom.cpp +++ b/examples/matrix/matmul/kernel_launch_method_by_direct/matmul_custom.cpp @@ -12,7 +12,6 @@ #include "lib/matmul_intf.h" #include "../kernel_impl/matmul_custom_impl.h" -using namespace AscendC; using namespace matmul; __aicore__ inline void CopyTiling(TCubeTiling* tiling, GM_ADDR tilingGM) @@ -31,7 +30,7 @@ extern "C" __global__ __aicore__ void matmul_custom(GM_ADDR a, GM_ADDR b, GM_ADD TCubeTiling tiling; CopyTiling(&tiling, tilingGm); MatmulKernel matmulKernel; - TPipe pipe; + AscendC::TPipe pipe; REGIST_MATMUL_OBJ(&pipe, GetSysWorkSpacePtr(), matmulKernel.matmulObj, &tiling); matmulKernel.Init(a, b, nullptr, c, workspace, tiling); #ifdef CUSTOM_ASCEND310P diff --git a/examples/matrix/matmul/kernel_launch_method_by_framework/op_host/matmul_custom.cpp b/examples/matrix/matmul/kernel_launch_method_by_framework/op_host/matmul_custom.cpp index adab49f0..95d12755 100644 --- 
a/examples/matrix/matmul/kernel_launch_method_by_framework/op_host/matmul_custom.cpp +++ b/examples/matrix/matmul/kernel_launch_method_by_framework/op_host/matmul_custom.cpp @@ -48,7 +48,7 @@ static ge::graphStatus TilingFunc(gert::TilingContext* context) tiling.SaveToBuffer(context->GetRawTilingData()->GetData(), context->GetRawTilingData()->GetCapacity()); context->GetRawTilingData()->SetDataSize(tiling.GetDataSize()); size_t userWorkspaceSize = 0; - size_t systemWorkspaceSize = ascendcPlatform.GetLibApiWorkSpaceSize(); + size_t systemWorkspaceSize = static_cast(ascendcPlatform.GetLibApiWorkSpaceSize()); size_t *currentWorkspace = context->GetWorkspaceSizes(1); currentWorkspace[0] = userWorkspaceSize + systemWorkspaceSize; diff --git a/examples/matrix/matmul/kernel_launch_method_by_framework/op_kernel/matmul_custom.cpp b/examples/matrix/matmul/kernel_launch_method_by_framework/op_kernel/matmul_custom.cpp index 09c40946..af60b3d4 100644 --- a/examples/matrix/matmul/kernel_launch_method_by_framework/op_kernel/matmul_custom.cpp +++ b/examples/matrix/matmul/kernel_launch_method_by_framework/op_kernel/matmul_custom.cpp @@ -12,13 +12,12 @@ #include "lib/matmul_intf.h" #include "../../../../../../kernel_impl/matmul_custom_impl.h" -using namespace AscendC; using namespace matmul; extern "C" __global__ __aicore__ void matmul_custom(GM_ADDR a, GM_ADDR b, GM_ADDR bias, GM_ADDR c, GM_ADDR workspace, GM_ADDR tiling) { GET_TILING_DATA(tilingData, tiling); MatmulKernel matmulKernel; - TPipe pipe; + AscendC::TPipe pipe; REGIST_MATMUL_OBJ(&pipe, GetSysWorkSpacePtr(), matmulKernel.matmulObj,&tilingData.cubeTilingData); matmulKernel.Init(a, b, bias, c, workspace, tilingData.cubeTilingData); if (TILING_KEY_IS(1)) { diff --git a/examples/normalization/layernorm/kernel_impl/layernorm_custom.h b/examples/normalization/layernorm/kernel_impl/layernorm_custom.h index c838f882..435a7abc 100644 --- a/examples/normalization/layernorm/kernel_impl/layernorm_custom.h +++ 
b/examples/normalization/layernorm/kernel_impl/layernorm_custom.h @@ -12,7 +12,7 @@ #define EXAMPLES_NORMALIZATION_LAYERNORM_CUSTOM_H #include "kernel_operator.h" -namespace AscendC { +namespace MyCustomKernel { struct VecTiling { LayerNormTiling layernormTilingData; float epsilon = 0; @@ -58,13 +58,13 @@ public: private: __aicore__ inline void CopyIn() { - LocalTensor inputXLocal = inQueueX.AllocTensor(); - LocalTensor gammaLocal = inQueueGamma.AllocTensor(); - LocalTensor betaLocal = inQueueBeta.AllocTensor(); + AscendC::LocalTensor inputXLocal = inQueueX.AllocTensor(); + AscendC::LocalTensor gammaLocal = inQueueGamma.AllocTensor(); + AscendC::LocalTensor betaLocal = inQueueBeta.AllocTensor(); - DataCopy(inputXLocal, inputXGlobal, bshLength); - DataCopy(gammaLocal, gammGlobal, hLength); - DataCopy(betaLocal, betaGlobal, hLength); + AscendC::DataCopy(inputXLocal, inputXGlobal, bshLength); + AscendC::DataCopy(gammaLocal, gammGlobal, hLength); + AscendC::DataCopy(betaLocal, betaGlobal, hLength); inQueueX.EnQue(inputXLocal); inQueueGamma.EnQue(gammaLocal); @@ -72,15 +72,15 @@ private: } __aicore__ inline void Compute() { - LocalTensor inputXLocal = inQueueX.DeQue(); - LocalTensor gammaLocal = inQueueGamma.DeQue(); - LocalTensor betaLocal = inQueueBeta.DeQue(); + AscendC::LocalTensor inputXLocal = inQueueX.DeQue(); + AscendC::LocalTensor gammaLocal = inQueueGamma.DeQue(); + AscendC::LocalTensor betaLocal = inQueueBeta.DeQue(); - LocalTensor outputLocal = outQueue.AllocTensor(); - LocalTensor meanLocal = outQueueMean.AllocTensor(); - LocalTensor varianceLocal = outQueueVariance.AllocTensor(); + AscendC::LocalTensor outputLocal = outQueue.AllocTensor(); + AscendC::LocalTensor meanLocal = outQueueMean.AllocTensor(); + AscendC::LocalTensor varianceLocal = outQueueVariance.AllocTensor(); - LayerNorm(outputLocal, meanLocal, varianceLocal, inputXLocal, gammaLocal, betaLocal, + AscendC::LayerNorm(outputLocal, meanLocal, varianceLocal, inputXLocal, gammaLocal, betaLocal, 
(float)epsilon, tiling_); outQueue.EnQue(outputLocal); @@ -93,13 +93,13 @@ private: } __aicore__ inline void CopyOut() { - LocalTensor outputLocal = outQueue.DeQue(); - LocalTensor meanLocal = outQueueMean.DeQue(); - LocalTensor varianceLocal = outQueueVariance.DeQue(); + AscendC::LocalTensor outputLocal = outQueue.DeQue(); + AscendC::LocalTensor meanLocal = outQueueMean.DeQue(); + AscendC::LocalTensor varianceLocal = outQueueVariance.DeQue(); - DataCopy(outputGlobal, outputLocal, bshLength); - DataCopy(outputMeanGlobal, meanLocal, bsLength); - DataCopy(outputVarianceGlobal, varianceLocal, bsLength); + AscendC::DataCopy(outputGlobal, outputLocal, bshLength); + AscendC::DataCopy(outputMeanGlobal, meanLocal, bsLength); + AscendC::DataCopy(outputVarianceGlobal, varianceLocal, bsLength); outQueue.FreeTensor(outputLocal); outQueueMean.FreeTensor(meanLocal); @@ -107,20 +107,20 @@ private: } private: - GlobalTensor inputXGlobal; - GlobalTensor gammGlobal; - GlobalTensor betaGlobal; - GlobalTensor outputGlobal; - GlobalTensor outputMeanGlobal; - GlobalTensor outputVarianceGlobal; - - TPipe pipe; - TQue inQueueX; - TQue inQueueGamma; - TQue inQueueBeta; - TQue outQueue; - TQue outQueueMean; - TQue outQueueVariance; + AscendC::GlobalTensor inputXGlobal; + AscendC::GlobalTensor gammGlobal; + AscendC::GlobalTensor betaGlobal; + AscendC::GlobalTensor outputGlobal; + AscendC::GlobalTensor outputMeanGlobal; + AscendC::GlobalTensor outputVarianceGlobal; + + AscendC::TPipe pipe; + AscendC::TQue inQueueX; + AscendC::TQue inQueueGamma; + AscendC::TQue inQueueBeta; + AscendC::TQue outQueue; + AscendC::TQue outQueueMean; + AscendC::TQue outQueueVariance; uint32_t bLength; uint32_t sLength; diff --git a/examples/normalization/layernorm/kernel_launch_method_by_direct/layernorm_custom.cpp b/examples/normalization/layernorm/kernel_launch_method_by_direct/layernorm_custom.cpp index b225a098..9358217e 100644 --- 
a/examples/normalization/layernorm/kernel_launch_method_by_direct/layernorm_custom.cpp +++ b/examples/normalization/layernorm/kernel_launch_method_by_direct/layernorm_custom.cpp @@ -9,14 +9,13 @@ */ #include "kernel_operator.h" #include "../kernel_impl/layernorm_custom.h" -using namespace AscendC; -__aicore__ inline void CopyTiling(VecTiling *tiling, GM_ADDR tilingGM) +__aicore__ inline void CopyTiling(MyCustomKernel::VecTiling *tiling, GM_ADDR tilingGM) { uint32_t *ptr = reinterpret_cast(tiling); auto tiling32 = reinterpret_cast<__gm__ uint32_t *>(tilingGM); - for (int i = 0; i < sizeof(VecTiling) / sizeof(uint32_t); i++, ptr++) { + for (int i = 0; i < sizeof(MyCustomKernel::VecTiling) / sizeof(uint32_t); i++, ptr++) { *ptr = *(tiling32 + i); } return; @@ -28,9 +27,9 @@ extern "C" __global__ __aicore__ void layernorm_custom(GM_ADDR inputXGm, GM_ADDR if ASCEND_IS_AIC { return; } - VecTiling tilingData; + MyCustomKernel::VecTiling tilingData; CopyTiling(&tilingData, tiling); - KernelLayernorm op; + MyCustomKernel::KernelLayernorm op; op.Init(inputXGm, gammaGm, betaGm, outputGm, outputMeanGm, outputVarianceGm, tilingData); op.Process(); } diff --git a/examples/normalization/layernorm/kernel_launch_method_by_framework/op_kernel/layernorm_custom.cpp b/examples/normalization/layernorm/kernel_launch_method_by_framework/op_kernel/layernorm_custom.cpp index 17db0fbd..2bcc8618 100644 --- a/examples/normalization/layernorm/kernel_launch_method_by_framework/op_kernel/layernorm_custom.cpp +++ b/examples/normalization/layernorm/kernel_launch_method_by_framework/op_kernel/layernorm_custom.cpp @@ -9,15 +9,14 @@ */ #include "../../../../../../kernel_impl/layernorm_custom.h" -using namespace AscendC; extern "C" __global__ __aicore__ void layernorm_custom(GM_ADDR inputXGm, GM_ADDR gammaGm, GM_ADDR betaGm, GM_ADDR outputGm, GM_ADDR outputMeanGm, GM_ADDR outputVarianceGm, GM_ADDR workspace, GM_ADDR tiling) { GET_TILING_DATA(tilingData, tiling); - VecTiling vecTiling = 
*reinterpret_cast(&tilingData); + MyCustomKernel::VecTiling vecTiling = *reinterpret_cast(&tilingData); if (TILING_KEY_IS(1)) { - KernelLayernorm op; + MyCustomKernel::KernelLayernorm op; op.Init(inputXGm, gammaGm, betaGm, outputGm, outputMeanGm, outputVarianceGm, vecTiling); op.Process(); } diff --git a/examples/pad/broadcast/kernel_impl/broadcast_custom.h b/examples/pad/broadcast/kernel_impl/broadcast_custom.h index d816db34..6df59bf8 100644 --- a/examples/pad/broadcast/kernel_impl/broadcast_custom.h +++ b/examples/pad/broadcast/kernel_impl/broadcast_custom.h @@ -13,7 +13,6 @@ #include "kernel_operator.h" constexpr int32_t BUFFER_NUM = 1; -using namespace AscendC; template class KernelBroadcastCustom { public: @@ -22,7 +21,7 @@ public: __aicore__ inline void Init(GM_ADDR x, GM_ADDR y, uint32_t srcLength, uint32_t dstLength, const uint32_t srcShape[dim], const uint32_t dstShape[dim]) { - AscendCUtils::SetOverflow(1); + AscendC::AscendCUtils::SetOverflow(1); xGm.SetGlobalBuffer((__gm__ T *)x, srcLength); yGm.SetGlobalBuffer((__gm__ T *)y, dstLength); @@ -44,32 +43,32 @@ public: private: __aicore__ inline void CopyIn() { - LocalTensor xLocal = inQueueX.AllocTensor(); - DataCopy(xLocal, xGm, srcLength_); + AscendC::LocalTensor xLocal = inQueueX.AllocTensor(); + AscendC::DataCopy(xLocal, xGm, srcLength_); inQueueX.EnQue(xLocal); } __aicore__ inline void Compute() { - LocalTensor xLocal = inQueueX.DeQue(); - LocalTensor yLocal = outQueueY.AllocTensor(); - BroadCast(yLocal, xLocal, dstShape_, srcShape_); + AscendC::LocalTensor xLocal = inQueueX.DeQue(); + AscendC::LocalTensor yLocal = outQueueY.AllocTensor(); + AscendC::BroadCast(yLocal, xLocal, dstShape_, srcShape_); outQueueY.EnQue(yLocal); inQueueX.FreeTensor(xLocal); } __aicore__ inline void CopyOut() { - LocalTensor yLocal = outQueueY.DeQue(); - DataCopy(yGm, yLocal, dstLength_); + AscendC::LocalTensor yLocal = outQueueY.DeQue(); + AscendC::DataCopy(yGm, yLocal, dstLength_); outQueueY.FreeTensor(yLocal); } private: 
- TPipe pipe; - TQue inQueueX; - TQue outQueueY; - GlobalTensor xGm; - GlobalTensor yGm; + AscendC::TPipe pipe; + AscendC::TQue inQueueX; + AscendC::TQue outQueueY; + AscendC::GlobalTensor xGm; + AscendC::GlobalTensor yGm; uint32_t srcLength_; uint32_t dstLength_; const uint32_t *srcShape_{nullptr}; diff --git a/examples/pad/broadcast/kernel_launch_method_by_direct/broadcast_custom.cpp b/examples/pad/broadcast/kernel_launch_method_by_direct/broadcast_custom.cpp index 28a17316..395ac034 100644 --- a/examples/pad/broadcast/kernel_launch_method_by_direct/broadcast_custom.cpp +++ b/examples/pad/broadcast/kernel_launch_method_by_direct/broadcast_custom.cpp @@ -8,7 +8,6 @@ * See LICENSE in the root of the software repository for the full text of the License. */ #include "../kernel_impl/broadcast_custom.h" -using namespace AscendC; struct BroadcastTilingData { uint32_t dim{0}; uint32_t axis{0}; diff --git a/examples/pad/broadcast/kernel_launch_method_by_framework/op_kernel/broadcast_custom.cpp b/examples/pad/broadcast/kernel_launch_method_by_framework/op_kernel/broadcast_custom.cpp index a8fd0464..230a5975 100644 --- a/examples/pad/broadcast/kernel_launch_method_by_framework/op_kernel/broadcast_custom.cpp +++ b/examples/pad/broadcast/kernel_launch_method_by_framework/op_kernel/broadcast_custom.cpp @@ -8,7 +8,6 @@ * See LICENSE in the root of the software repository for the full text of the License. */ #include "../../../../../../kernel_impl/broadcast_custom.h" -using namespace AscendC; extern "C" __global__ __aicore__ void broadcast_custom(GM_ADDR x, GM_ADDR y, GM_ADDR workspace, GM_ADDR tiling) { -- Gitee