diff --git a/examples/activation/simplesoftmax/kernel_impl/simplesoftmax_kernel.h b/examples/activation/simplesoftmax/kernel_impl/simplesoftmax_kernel.h index 14c7387a6f718400cd1d722c644eaa184b3c053a..d8079d95e6452fac46cf72c2263654e1cb5abea1 100644 --- a/examples/activation/simplesoftmax/kernel_impl/simplesoftmax_kernel.h +++ b/examples/activation/simplesoftmax/kernel_impl/simplesoftmax_kernel.h @@ -12,7 +12,7 @@ #define EXAMPLES_ACTIVATION_SIMPLESOFTMAX_KERNEL_H #include "kernel_operator.h" -namespace AscendC { +namespace MyCustomKernel { constexpr int32_t BUFFER_NUM = 2; constexpr uint32_t FLOAT_NUM_OF_SINGEL_BLOCK = 8; constexpr uint32_t BASIC_BLOCK_ROW_FACTOR = 8; @@ -55,10 +55,10 @@ public: } __aicore__ inline void Init(GM_ADDR x, GM_ADDR max, GM_ADDR sum, GM_ADDR z, const VecTiling& tiling) { - ASSERT(GetBlockNum() != 0 && "block dim can not be zero!"); + ASSERT(AscendC::GetBlockNum() != 0 && "block dim can not be zero!"); InitTiling(tiling); - if (GetBlockIdx() == this->usedBlockDim) { // tail core + if (AscendC::GetBlockIdx() == this->usedBlockDim) { // tail core this->singleLoopCoreRowNum = this->tailCoreSingleLoopCoreRowNum; this->singleCoreLoopCount = this->tailCoreSingleCoreLoopCount; this->leftRow = this->tailCoreSingleCoreLoopTail; @@ -67,8 +67,8 @@ public: this->blockLength = this->coreRowNum * this->columnLength; this->msLength = this->coreRowNum * FLOAT_NUM_OF_SINGEL_BLOCK; // max sum length per block process - uint32_t offset1 = this->blockLength * GetBlockIdx(); - uint32_t offset2 = this->msLength * GetBlockIdx(); + uint32_t offset1 = this->blockLength * AscendC::GetBlockIdx(); + uint32_t offset2 = this->msLength * AscendC::GetBlockIdx(); xGm.SetGlobalBuffer((__gm__ float*)x + offset1, this->blockLength); zGm.SetGlobalBuffer((__gm__ float*)z + offset1, this->blockLength); @@ -88,7 +88,7 @@ public: __aicore__ inline void Process() { - if (GetBlockIdx() > usedBlockDim) { + if (AscendC::GetBlockIdx() > usedBlockDim) { return; } @@ -108,12 +108,12 @@ 
public: private: __aicore__ inline void CopyIn(int32_t progress, uint32_t rowNum) { - LocalTensor xLocal = queueX.AllocTensor(); - LocalTensor maxLocal = queueMax.AllocTensor(); - LocalTensor sumLocal = queueSum.AllocTensor(); - DataCopy(xLocal, xGm[progress * this->tileLength], rowNum * this->columnLength); - DataCopy(maxLocal, maxGm[progress * this->msTileLength], rowNum * FLOAT_NUM_OF_SINGEL_BLOCK); - DataCopy(sumLocal, sumGm[progress * this->msTileLength], rowNum * FLOAT_NUM_OF_SINGEL_BLOCK); + AscendC::LocalTensor xLocal = queueX.AllocTensor(); + AscendC::LocalTensor maxLocal = queueMax.AllocTensor(); + AscendC::LocalTensor sumLocal = queueSum.AllocTensor(); + AscendC::DataCopy(xLocal, xGm[progress * this->tileLength], rowNum * this->columnLength); + AscendC::DataCopy(maxLocal, maxGm[progress * this->msTileLength], rowNum * FLOAT_NUM_OF_SINGEL_BLOCK); + AscendC::DataCopy(sumLocal, sumGm[progress * this->msTileLength], rowNum * FLOAT_NUM_OF_SINGEL_BLOCK); queueX.EnQue(xLocal); queueMax.EnQue(maxLocal); queueSum.EnQue(sumLocal); @@ -121,18 +121,18 @@ private: __aicore__ inline void Compute(int32_t progressm, uint32_t rowNum) { - LocalTensor xLocal = queueX.DeQue(); - LocalTensor maxLocal = queueMax.DeQue(); - LocalTensor sumLocal = queueSum.DeQue(); - LocalTensor tmpBuffer = sharedTmpBuffer.Get(); + AscendC::LocalTensor xLocal = queueX.DeQue(); + AscendC::LocalTensor maxLocal = queueMax.DeQue(); + AscendC::LocalTensor sumLocal = queueSum.DeQue(); + AscendC::LocalTensor tmpBuffer = sharedTmpBuffer.Get(); - SoftMaxShapeInfo srcShape = { rowNum, this->columnLength, rowNum, this->columnLength }; + AscendC::SoftMaxShapeInfo srcShape = { rowNum, this->columnLength, rowNum, this->columnLength }; if (rowNum % BASIC_BLOCK_ROW_FACTOR == 0 && this->columnLength % BASIC_BLOCK_COLUMN_FACTOR == 0 && this->columnLength < BASIC_BLOCK_MAX_COLUMN_LENGTH) { - SimpleSoftMax(xLocal, sumLocal, maxLocal, xLocal, tmpBuffer, softmaxTiling, srcShape); + AscendC::SimpleSoftMax(xLocal, 
sumLocal, maxLocal, xLocal, tmpBuffer, softmaxTiling, srcShape); } else { - SimpleSoftMax(xLocal, sumLocal, maxLocal, xLocal, tmpBuffer, softmaxTiling, srcShape); + AscendC::SimpleSoftMax(xLocal, sumLocal, maxLocal, xLocal, tmpBuffer, softmaxTiling, srcShape); } queueX.EnQue(xLocal); queueMax.FreeTensor(maxLocal); @@ -141,20 +141,20 @@ private: __aicore__ inline void CopyOut(int32_t progress, uint32_t rowNum) { - LocalTensor zLocal = queueX.DeQue(); - DataCopy(zGm[progress * this->tileLength], zLocal, rowNum * this->columnLength); + AscendC::LocalTensor zLocal = queueX.DeQue(); + AscendC::DataCopy(zGm[progress * this->tileLength], zLocal, rowNum * this->columnLength); queueX.FreeTensor(zLocal); } private: - TPipe pipe; - TBuf sharedTmpBuffer; - TQue queueX; - TQue queueMax, queueSum; - GlobalTensor xGm; - GlobalTensor maxGm; - GlobalTensor sumGm; - GlobalTensor zGm; + AscendC::TPipe pipe; + AscendC::TBuf sharedTmpBuffer; + AscendC::TQue queueX; + AscendC::TQue queueMax, queueSum; + AscendC::GlobalTensor xGm; + AscendC::GlobalTensor maxGm; + AscendC::GlobalTensor sumGm; + AscendC::GlobalTensor zGm; uint32_t blockLength = 0; uint32_t usedBlockDim = 0; diff --git a/examples/activation/simplesoftmax/kernel_launch_method_by_direct/simplesoftmax_custom.cpp b/examples/activation/simplesoftmax/kernel_launch_method_by_direct/simplesoftmax_custom.cpp index 1eb366540c69733d8f4540a0f896d218496ac04d..636a18bfe263c83e49cbf9241479c2f3d95afb46 100644 --- a/examples/activation/simplesoftmax/kernel_launch_method_by_direct/simplesoftmax_custom.cpp +++ b/examples/activation/simplesoftmax/kernel_launch_method_by_direct/simplesoftmax_custom.cpp @@ -9,14 +9,13 @@ */ #include "../kernel_impl/simplesoftmax_kernel.h" -using namespace AscendC; -__aicore__ inline void CopyTiling(VecTiling* tiling, GM_ADDR tilingGM) +__aicore__ inline void CopyTiling(MyCustomKernel::VecTiling* tiling, GM_ADDR tilingGM) { uint32_t* ptr = reinterpret_cast(tiling); auto tiling32 = reinterpret_cast<__gm__ 
uint32_t*>(tilingGM); - for (int i = 0; i < sizeof(VecTiling) / sizeof(uint32_t); i++, ptr++) { + for (int i = 0; i < sizeof(MyCustomKernel::VecTiling) / sizeof(uint32_t); i++, ptr++) { *ptr = *(tiling32 + i); } return; @@ -28,10 +27,10 @@ extern "C" __global__ __aicore__ void simplesoftmax_custom(GM_ADDR x, GM_ADDR ma if ASCEND_IS_AIC { return; } - VecTiling tilingData; + MyCustomKernel::VecTiling tilingData; CopyTiling(&tilingData, tiling); - KernelSoftmax op; - op.Init(x, max, sum, z, tilingData); + MyCustomKernel::KernelSoftmax op; + op.Init(x, max, sum, z, tilingData); op.Process(); } diff --git a/examples/activation/simplesoftmax/kernel_launch_method_by_framework/op_kernel/simplesoftmax_custom.cpp b/examples/activation/simplesoftmax/kernel_launch_method_by_framework/op_kernel/simplesoftmax_custom.cpp index 1bb113e3680e4bbef05f53cf2542a48dd747d8ff..91b8de26e8b183482e29f327b4ceaaef6766e68c 100644 --- a/examples/activation/simplesoftmax/kernel_launch_method_by_framework/op_kernel/simplesoftmax_custom.cpp +++ b/examples/activation/simplesoftmax/kernel_launch_method_by_framework/op_kernel/simplesoftmax_custom.cpp @@ -8,7 +8,6 @@ * See LICENSE in the root of the software repository for the full text of the License.
*/ #include "../../../../../../kernel_impl/simplesoftmax_kernel.h" -using namespace AscendC; extern "C" __global__ __aicore__ void simplesoftmax_custom(GM_ADDR x, GM_ADDR max, GM_ADDR sum, GM_ADDR z, GM_ADDR workspace, GM_ADDR tiling) @@ -17,9 +16,9 @@ extern "C" __global__ __aicore__ void simplesoftmax_custom(GM_ADDR x, GM_ADDR ma return; } GET_TILING_DATA(tilingData, tiling); - VecTiling vecTiling = *reinterpret_cast(&tilingData); - KernelSoftmax op; - op.Init(x, max, sum, z, vecTiling); + MyCustomKernel::VecTiling vecTiling = *reinterpret_cast(&tilingData); + MyCustomKernel::KernelSoftmax op; + op.Init(x, max, sum, z, vecTiling); if (TILING_KEY_IS(1)) { op.Process(); } diff --git a/examples/activation/softmax/kernel_impl/softmax_kernel.h b/examples/activation/softmax/kernel_impl/softmax_kernel.h index 5e75aed32a9dc48928d12d0b3e4b59ac580b6843..bac55031dc19773234d2ef84c4c036a6505e4d48 100644 --- a/examples/activation/softmax/kernel_impl/softmax_kernel.h +++ b/examples/activation/softmax/kernel_impl/softmax_kernel.h @@ -12,7 +12,7 @@ #define EXAMPLES_ACTIVATION_SOFTMAX_KERNEL_H #include "kernel_operator.h" -namespace AscendC { +namespace MyCustomKernel { constexpr int32_t BUFFER_NUM = 2; constexpr uint32_t FLOAT_NUM_OF_SINGEL_BLOCK = 8; constexpr uint32_t BASIC_BLOCK_ROW_FACTOR = 8; @@ -55,10 +55,10 @@ public: } __aicore__ inline void Init(GM_ADDR x, GM_ADDR max, GM_ADDR sum, GM_ADDR z, const VecTiling& tilingData) { - ASSERT(GetBlockNum() != 0 && "block dim can not be zero!"); + ASSERT(AscendC::GetBlockNum() != 0 && "block dim can not be zero!"); InitTiling(tilingData); - if (GetBlockIdx() == this->usedBlockDim) { // tail core + if (AscendC::GetBlockIdx() == this->usedBlockDim) { // tail core this->singleLoopCoreRowNum = this->tailCoreSingleLoopCoreRowNum; this->singleCoreLoopCount = this->tailCoreSingleCoreLoopCount; this->leftRow = this->tailCoreSingleCoreLoopTail; @@ -67,8 +67,8 @@ public: this->blockLength = this->coreRowNum * this->columnLength;
this->msLength = this->coreRowNum * FLOAT_NUM_OF_SINGEL_BLOCK; // max sum length per block process - uint32_t offset1 = this->blockLength * GetBlockIdx(); - uint32_t offset2 = this->msLength * GetBlockIdx(); + uint32_t offset1 = this->blockLength * AscendC::GetBlockIdx(); + uint32_t offset2 = this->msLength * AscendC::GetBlockIdx(); xGm.SetGlobalBuffer((__gm__ float*)x + offset1, this->blockLength); zGm.SetGlobalBuffer((__gm__ float*)z + offset1, this->blockLength); @@ -89,7 +89,7 @@ public: __aicore__ inline void Process() { - if (GetBlockIdx() > this->usedBlockDim) { + if (AscendC::GetBlockIdx() > this->usedBlockDim) { return; } @@ -108,25 +108,25 @@ public: private: __aicore__ inline void CopyIn(int32_t progress, uint32_t rowNum) { - LocalTensor xLocal = queueX.AllocTensor(); - DataCopy(xLocal, xGm[progress * this->tileLength], rowNum * this->columnLength); + AscendC::LocalTensor xLocal = queueX.AllocTensor(); + AscendC::DataCopy(xLocal, xGm[progress * this->tileLength], rowNum * this->columnLength); queueX.EnQue(xLocal); } __aicore__ inline void Compute(int32_t progressm, uint32_t rowNum) { - LocalTensor xLocal = queueX.DeQue(); - LocalTensor maxLocal = queueMax.AllocTensor(); - LocalTensor sumLocal = queueSum.AllocTensor(); - LocalTensor tmpBuffer = sharedTmpBuffer.Get(); + AscendC::LocalTensor xLocal = queueX.DeQue(); + AscendC::LocalTensor maxLocal = queueMax.AllocTensor(); + AscendC::LocalTensor sumLocal = queueSum.AllocTensor(); + AscendC::LocalTensor tmpBuffer = sharedTmpBuffer.Get(); - SoftMaxShapeInfo srcShape = { rowNum, this->columnLength, rowNum, this->columnLength }; + AscendC::SoftMaxShapeInfo srcShape = { rowNum, this->columnLength, rowNum, this->columnLength }; if (rowNum % BASIC_BLOCK_ROW_FACTOR == 0 && this->columnLength % BASIC_BLOCK_COLUMN_FACTOR == 0 && this->columnLength < BASIC_BLOCK_MAX_COLUMN_LENGTH) { - SoftMax(xLocal, sumLocal, maxLocal, xLocal, tmpBuffer, softmaxTiling, srcShape); + AscendC::SoftMax(xLocal, sumLocal, maxLocal, xLocal, 
tmpBuffer, softmaxTiling, srcShape); } else { - SoftMax(xLocal, sumLocal, maxLocal, xLocal, tmpBuffer, softmaxTiling, srcShape); + AscendC::SoftMax(xLocal, sumLocal, maxLocal, xLocal, tmpBuffer, softmaxTiling, srcShape); } queueX.EnQue(xLocal); queueMax.EnQue(maxLocal); @@ -135,13 +135,13 @@ private: __aicore__ inline void CopyOut(int32_t progress, uint32_t rowNum) { - LocalTensor zLocal = queueX.DeQue(); - LocalTensor maxLocal = queueMax.DeQue(); - LocalTensor sumLocal = queueSum.DeQue(); + AscendC::LocalTensor zLocal = queueX.DeQue(); + AscendC::LocalTensor maxLocal = queueMax.DeQue(); + AscendC::LocalTensor sumLocal = queueSum.DeQue(); - DataCopy(zGm[progress * this->tileLength], zLocal, rowNum * this->columnLength); - DataCopy(maxGm[progress * this->msTileLength], maxLocal, rowNum * FLOAT_NUM_OF_SINGEL_BLOCK); - DataCopy(sumGm[progress * this->msTileLength], sumLocal, rowNum * FLOAT_NUM_OF_SINGEL_BLOCK); + AscendC::DataCopy(zGm[progress * this->tileLength], zLocal, rowNum * this->columnLength); + AscendC::DataCopy(maxGm[progress * this->msTileLength], maxLocal, rowNum * FLOAT_NUM_OF_SINGEL_BLOCK); + AscendC::DataCopy(sumGm[progress * this->msTileLength], sumLocal, rowNum * FLOAT_NUM_OF_SINGEL_BLOCK); queueX.FreeTensor(zLocal); queueMax.FreeTensor(maxLocal); @@ -149,14 +149,14 @@ private: } private: - TPipe pipe; - TBuf sharedTmpBuffer; - TQue queueX; - TQue queueMax, queueSum; - GlobalTensor xGm; - GlobalTensor maxGm; - GlobalTensor sumGm; - GlobalTensor zGm; + AscendC::TPipe pipe; + AscendC::TBuf sharedTmpBuffer; + AscendC::TQue queueX; + AscendC::TQue queueMax, queueSum; + AscendC::GlobalTensor xGm; + AscendC::GlobalTensor maxGm; + AscendC::GlobalTensor sumGm; + AscendC::GlobalTensor zGm; uint32_t blockLength = 0; uint32_t usedBlockDim = 0; diff --git a/examples/activation/softmax/kernel_launch_method_by_direct/softmax_custom.cpp b/examples/activation/softmax/kernel_launch_method_by_direct/softmax_custom.cpp index 
51848ac2f7518eaee07cbdebf6494e7982b77777..bf6abed2ed49275dc6bd38ec11069dbab17815ea 100644 --- a/examples/activation/softmax/kernel_launch_method_by_direct/softmax_custom.cpp +++ b/examples/activation/softmax/kernel_launch_method_by_direct/softmax_custom.cpp @@ -9,14 +9,13 @@ */ #include "../kernel_impl/softmax_kernel.h" -using namespace AscendC; -__aicore__ inline void CopyTiling(VecTiling* tiling, GM_ADDR tilingGM) +__aicore__ inline void CopyTiling(MyCustomKernel::VecTiling* tiling, GM_ADDR tilingGM) { uint32_t* ptr = reinterpret_cast(tiling); auto tiling32 = reinterpret_cast<__gm__ uint32_t*>(tilingGM); - for (int i = 0; i < sizeof(VecTiling) / sizeof(uint32_t); i++, ptr++) { + for (int i = 0; i < sizeof(MyCustomKernel::VecTiling) / sizeof(uint32_t); i++, ptr++) { *ptr = *(tiling32 + i); } return; @@ -28,10 +27,10 @@ extern "C" __global__ __aicore__ void softmax_custom(GM_ADDR x, GM_ADDR max, GM_ if ASCEND_IS_AIC { return; } - VecTiling tilingData; + MyCustomKernel::VecTiling tilingData; CopyTiling(&tilingData, tiling); - KernelSoftmax op; - op.Init(x, max, sum, z, tilingData); + MyCustomKernel::KernelSoftmax op; + op.Init(x, max, sum, z, tilingData); op.Process(); } diff --git a/examples/activation/softmax/kernel_launch_method_by_framework/op_kernel/softmax_custom.cpp b/examples/activation/softmax/kernel_launch_method_by_framework/op_kernel/softmax_custom.cpp index e263b3d26659f06be12ad0c4175d5acf1e32a206..95854592b9251e264e9d75e5acaa5f091f2cbd0d 100644 --- a/examples/activation/softmax/kernel_launch_method_by_framework/op_kernel/softmax_custom.cpp +++ b/examples/activation/softmax/kernel_launch_method_by_framework/op_kernel/softmax_custom.cpp @@ -9,7 +9,6 @@ */ #include "../../../../../../kernel_impl/softmax_kernel.h" -using namespace AscendC; extern "C" __global__ __aicore__ void softmax_custom(GM_ADDR x, GM_ADDR max, GM_ADDR sum, GM_ADDR z, GM_ADDR workspace, GM_ADDR tiling) @@ -18,9 +17,9 @@ extern "C" __global__ __aicore__ void softmax_custom(GM_ADDR
x, GM_ADDR max, GM_ return; } GET_TILING_DATA(tilingData, tiling); - VecTiling vecTiling = *reinterpret_cast(&tilingData); - KernelSoftmax op; - op.Init(x, max, sum, z, vecTiling); + MyCustomKernel::VecTiling vecTiling = *reinterpret_cast(&tilingData); + MyCustomKernel::KernelSoftmax op; + op.Init(x, max, sum, z, vecTiling); if (TILING_KEY_IS(1)) { op.Process(); } diff --git a/examples/activation/softmaxflash/kernel_impl/softmaxflash_kernel.h b/examples/activation/softmaxflash/kernel_impl/softmaxflash_kernel.h index 391e4fb3b7fddbb544d9b82392d99b7197cddf60..0e4b3877d57b6f3db0170ddde446df86040654bb 100644 --- a/examples/activation/softmaxflash/kernel_impl/softmaxflash_kernel.h +++ b/examples/activation/softmaxflash/kernel_impl/softmaxflash_kernel.h @@ -12,7 +12,7 @@ #define EXAMPLES_ACTIVATION_SOFTMAXFLASH_KERNEL_H #include "kernel_operator.h" -namespace AscendC { +namespace MyCustomKernel { constexpr int32_t BUFFER_NUM = 2; constexpr uint32_t FLOAT_NUM_OF_SINGEL_BLOCK = 8; constexpr uint32_t BASIC_BLOCK_ROW_FACTOR = 8; @@ -62,10 +62,10 @@ public: __aicore__ inline void Init(GM_ADDR x, GM_ADDR max, GM_ADDR sum, const VecTiling& tilingData) { - ASSERT(GetBlockNum() != 0 && "block dim can not be zero!"); + ASSERT(AscendC::GetBlockNum() != 0 && "block dim can not be zero!"); InitTiling(tilingData); - if (GetBlockIdx() == this->usedBlockDim) { // tail core + if (AscendC::GetBlockIdx() == this->usedBlockDim) { // tail core this->singleLoopCoreRowNum = this->tailCoreSingleLoopCoreRowNum; this->singleCoreLoopCount = this->tailCoreSingleCoreLoopCount; this->leftRow = this->tailCoreSingleCoreLoopTail; @@ -74,8 +74,8 @@ public: this->blockLength = this->coreRowNum * this->columnLength; this->msLength = this->coreRowNum * FLOAT_NUM_OF_SINGEL_BLOCK; // max sum length per block process - uint32_t offset1 = this->blockLength * GetBlockIdx(); - uint32_t offset2 = this->msLength * GetBlockIdx(); + uint32_t offset1 = this->blockLength * AscendC::GetBlockIdx(); + uint32_t
offset2 = this->msLength * AscendC::GetBlockIdx(); xGm.SetGlobalBuffer((__gm__ float*)x + offset1, this->blockLength); maxGm.SetGlobalBuffer((__gm__ float*)max + offset2, this->msLength); @@ -97,7 +97,7 @@ public: __aicore__ inline void Process() { - if (GetBlockIdx() > this->usedBlockDim) { + if (AscendC::GetBlockIdx() > this->usedBlockDim) { return; } @@ -111,15 +111,15 @@ public: CopyIn(i, this->loopK, this->singleLoopCoreRowNum, this->tailK); Compute(i, this->loopK, this->singleLoopCoreRowNum, this->tailK); } - event_t eventIdVToMte3 = static_cast(GetTPipePtr()->FetchEventID(HardEvent::V_MTE3)); - SetFlag(eventIdVToMte3); - WaitFlag(eventIdVToMte3); + event_t eventIdVToMte3 = static_cast(GetTPipePtr()->FetchEventID(AscendC::HardEvent::V_MTE3)); + AscendC::SetFlag(eventIdVToMte3); + AscendC::WaitFlag(eventIdVToMte3); // copy max sum to gm CopyOut(i, this->msTileLength); } - event_t eventIdMte3ToV = static_cast(GetTPipePtr()->FetchEventID(HardEvent::MTE3_V)); - SetFlag(eventIdMte3ToV); - WaitFlag(eventIdMte3ToV); + event_t eventIdMte3ToV = static_cast(GetTPipePtr()->FetchEventID(AscendC::HardEvent::MTE3_V)); + AscendC::SetFlag(eventIdMte3ToV); + AscendC::WaitFlag(eventIdMte3ToV); if (this->leftRow > 0) { for (int32_t j = 0; j < this->loopK; j++) { // split K @@ -131,9 +131,9 @@ public: CopyIn(this->singleCoreLoopCount, this->loopK, this->leftRow, this->tailK); Compute(this->singleCoreLoopCount, this->loopK, this->leftRow, this->tailK); } - event_t eventIdVToMte3 = static_cast(GetTPipePtr()->FetchEventID(HardEvent::V_MTE3)); - SetFlag(eventIdVToMte3); - WaitFlag(eventIdVToMte3); + event_t eventIdVToMte3 = static_cast(GetTPipePtr()->FetchEventID(AscendC::HardEvent::V_MTE3)); + AscendC::SetFlag(eventIdVToMte3); + AscendC::WaitFlag(eventIdVToMte3); // copy max sum to gm uint32_t tailMsTileLength = this->leftRow * FLOAT_NUM_OF_SINGEL_BLOCK; CopyOut(this->singleCoreLoopCount, tailMsTileLength); @@ -143,10 +143,10 @@ public: private: __aicore__ inline void 
CopyIn(uint32_t rowIndex, uint32_t kIndex, uint32_t rowNum, uint32_t columnNum) { - LocalTensor xLocal = queueX.AllocTensor(); + AscendC::LocalTensor xLocal = queueX.AllocTensor(); uint32_t offset = this->singleLoopCoreRowNum * this->columnLength; for (uint32_t i = 0; i < rowNum; i++) { - DataCopy(xLocal[i * columnNum], xGm[rowIndex * offset + i * this->columnLength + kIndex * this->splitK], + AscendC::DataCopy(xLocal[i * columnNum], xGm[rowIndex * offset + i * this->columnLength + kIndex * this->splitK], columnNum); } queueX.EnQue(xLocal); @@ -154,28 +154,28 @@ private: __aicore__ inline void Compute(uint32_t rowIndex, uint32_t columnIndex, uint32_t rowNum, uint32_t columnNum) { - LocalTensor xLocal = queueX.DeQue(); - LocalTensor tmpBuffer = sharedTmpBuffer.Get(); + AscendC::LocalTensor xLocal = queueX.DeQue(); + AscendC::LocalTensor tmpBuffer = sharedTmpBuffer.Get(); - SoftMaxShapeInfo srcShape = { rowNum, columnNum, rowNum, columnNum }; + AscendC::SoftMaxShapeInfo srcShape = { rowNum, columnNum, rowNum, columnNum }; if (columnIndex == 0) { // isUpdate == false if (rowNum % BASIC_BLOCK_ROW_FACTOR == 0 && columnNum % BASIC_BLOCK_COLUMN_FACTOR == 0 && columnNum < BASIC_BLOCK_MAX_COLUMN_LENGTH) { - SoftmaxFlashV2(xLocal, sumLocal, maxLocal, xLocal, expmaxLocal, sumLocal, + AscendC::SoftmaxFlashV2(xLocal, sumLocal, maxLocal, xLocal, expmaxLocal, sumLocal, maxLocal, tmpBuffer, softmaxTiling, srcShape); } else { - SoftmaxFlashV2(xLocal, sumLocal, maxLocal, xLocal, expmaxLocal, sumLocal, + AscendC::SoftmaxFlashV2(xLocal, sumLocal, maxLocal, xLocal, expmaxLocal, sumLocal, maxLocal, tmpBuffer, softmaxTiling, srcShape); } } else { if (rowNum % BASIC_BLOCK_ROW_FACTOR == 0 && columnNum % BASIC_BLOCK_COLUMN_FACTOR == 0 && columnNum < BASIC_BLOCK_MAX_COLUMN_LENGTH) { - SoftmaxFlashV2(xLocal, sumLocal, maxLocal, xLocal, expmaxLocal, sumLocal, + AscendC::SoftmaxFlashV2(xLocal, sumLocal, maxLocal, xLocal, expmaxLocal, sumLocal, maxLocal, tmpBuffer, softmaxTiling, srcShape); } 
else { - SoftmaxFlashV2(xLocal, sumLocal, maxLocal, xLocal, expmaxLocal, sumLocal, + AscendC::SoftmaxFlashV2(xLocal, sumLocal, maxLocal, xLocal, expmaxLocal, sumLocal, maxLocal, tmpBuffer, softmaxTiling, srcShape); } } @@ -184,21 +184,21 @@ private: __aicore__ inline void CopyOut(uint32_t progress, uint32_t count) { - DataCopy(maxGm[progress * this->msTileLength], maxLocal, count); - DataCopy(sumGm[progress * this->msTileLength], sumLocal, count); + AscendC::DataCopy(maxGm[progress * this->msTileLength], maxLocal, count); + AscendC::DataCopy(sumGm[progress * this->msTileLength], sumLocal, count); } private: - TPipe pipe; - TBuf sharedTmpBuffer; - TQue queueX; - TBuf tbufMax, tbufSum, tbufExpmax; - GlobalTensor xGm; - GlobalTensor maxGm; - GlobalTensor sumGm; - LocalTensor maxLocal; - LocalTensor sumLocal; - LocalTensor expmaxLocal; + AscendC::TPipe pipe; + AscendC::TBuf sharedTmpBuffer; + AscendC::TQue queueX; + AscendC::TBuf tbufMax, tbufSum, tbufExpmax; + AscendC::GlobalTensor xGm; + AscendC::GlobalTensor maxGm; + AscendC::GlobalTensor sumGm; + AscendC::LocalTensor maxLocal; + AscendC::LocalTensor sumLocal; + AscendC::LocalTensor expmaxLocal; uint32_t blockLength = 0; uint32_t usedBlockDim = 0; diff --git a/examples/activation/softmaxflash/kernel_launch_method_by_direct/softmaxflash_custom.cpp b/examples/activation/softmaxflash/kernel_launch_method_by_direct/softmaxflash_custom.cpp index cfcfa32df4082a69947efef6c62ae88e3407d158..e3ee128c29c4a40497adf87056b7a878817be201 100644 --- a/examples/activation/softmaxflash/kernel_launch_method_by_direct/softmaxflash_custom.cpp +++ b/examples/activation/softmaxflash/kernel_launch_method_by_direct/softmaxflash_custom.cpp @@ -9,14 +9,13 @@ */ #include "../kernel_impl/softmaxflash_kernel.h" -using namespace AscendC; -__aicore__ inline void CopyTiling(VecTiling* tiling, GM_ADDR tilingGM) +__aicore__ inline void CopyTiling(MyCustomKernel::VecTiling* tiling, GM_ADDR tilingGM) { uint32_t* ptr =
reinterpret_cast(tiling); auto tiling32 = reinterpret_cast<__gm__ uint32_t*>(tilingGM); - for (int i = 0; i < sizeof(VecTiling) / sizeof(uint32_t); i++, ptr++) { + for (int i = 0; i < sizeof(MyCustomKernel::VecTiling) / sizeof(uint32_t); i++, ptr++) { *ptr = *(tiling32 + i); } return; @@ -28,10 +27,10 @@ extern "C" __global__ __aicore__ void softmaxflash_custom(GM_ADDR x, GM_ADDR max if ASCEND_IS_AIC { return; } - VecTiling tilingData; + MyCustomKernel::VecTiling tilingData; CopyTiling(&tilingData, tiling); - KernelSoftmax op; - op.Init(x, max, sum, tilingData); + MyCustomKernel::KernelSoftmax op; + op.Init(x, max, sum, tilingData); op.Process(); } diff --git a/examples/activation/softmaxflash/kernel_launch_method_by_framework/op_kernel/softmaxflash_custom.cpp b/examples/activation/softmaxflash/kernel_launch_method_by_framework/op_kernel/softmaxflash_custom.cpp index c4d69de5291c5b040cc7ef3886a9ce6bc0614c98..019c856a2dcb9dcc5f67a8b70fa185316bbf928e 100644 --- a/examples/activation/softmaxflash/kernel_launch_method_by_framework/op_kernel/softmaxflash_custom.cpp +++ b/examples/activation/softmaxflash/kernel_launch_method_by_framework/op_kernel/softmaxflash_custom.cpp @@ -9,7 +9,6 @@ */ #include "../../../../../../kernel_impl/softmaxflash_kernel.h" -using namespace AscendC; extern "C" __global__ __aicore__ void softmaxflash_custom(GM_ADDR x, GM_ADDR max, GM_ADDR sum, GM_ADDR workspace, GM_ADDR tiling) @@ -18,9 +17,9 @@ extern "C" __global__ __aicore__ void softmaxflash_custom(GM_ADDR x, GM_ADDR max return; } GET_TILING_DATA(tilingData, tiling); - VecTiling vecTiling = *reinterpret_cast(&tilingData); - KernelSoftmax op; - op.Init(x, max, sum, vecTiling); + MyCustomKernel::VecTiling vecTiling = *reinterpret_cast(&tilingData); + MyCustomKernel::KernelSoftmax op; + op.Init(x, max, sum, vecTiling); if (TILING_KEY_IS(1)) { op.Process(); } diff --git a/examples/activation/softmaxgrad/kernel_impl/softmaxgrad_kernel.h
b/examples/activation/softmaxgrad/kernel_impl/softmaxgrad_kernel.h index 4f5c10a4b92a8b6f38d980c69f8d6d6693958e7f..d8f1c25fe3e85c4665a620ea3ccb895d8257881f 100644 --- a/examples/activation/softmaxgrad/kernel_impl/softmaxgrad_kernel.h +++ b/examples/activation/softmaxgrad/kernel_impl/softmaxgrad_kernel.h @@ -12,7 +12,7 @@ #define EXAMPLES_ACTIVATION_SOFTMAXGRAD_KERNEL_H #include "kernel_operator.h" -namespace AscendC { +namespace MyCustomKernel { constexpr int32_t BUFFER_NUM = 1; struct VecTiling { @@ -51,17 +51,17 @@ public: } __aicore__ inline void Init(GM_ADDR x, GM_ADDR y, GM_ADDR z, const VecTiling& tiling) { - ASSERT(GetBlockNum() != 0 && "block dim can not be zero!"); + ASSERT(AscendC::GetBlockNum() != 0 && "block dim can not be zero!"); InitTiling(tiling); - if (GetBlockIdx() == this->usedBlockDim) { // tail core + if (AscendC::GetBlockIdx() == this->usedBlockDim) { // tail core this->singleLoopCoreRowNum = this->tailCoreSingleLoopCoreRowNum; this->singleCoreLoopCount = this->tailCoreSingleCoreLoopCount; this->leftRow = this->tailCoreSingleCoreLoopTail; } this->blockLength = this->coreRowNum * this->columnLength; - uint32_t offset1 = this->blockLength * GetBlockIdx(); + uint32_t offset1 = this->blockLength * AscendC::GetBlockIdx(); xGm.SetGlobalBuffer((__gm__ float*)x + offset1, this->blockLength); yGm.SetGlobalBuffer((__gm__ float*)y + offset1, this->blockLength); @@ -76,7 +76,7 @@ public: __aicore__ inline void Process() { - if (GetBlockIdx() > usedBlockDim) { + if (AscendC::GetBlockIdx() > usedBlockDim) { return; } @@ -85,9 +85,9 @@ public: Compute(i, this->singleLoopCoreRowNum); CopyOut(i, this->singleLoopCoreRowNum); } - event_t eventIdMte3ToMte2 = static_cast(GetTPipePtr()->FetchEventID(HardEvent::MTE3_MTE2)); - SetFlag(eventIdMte3ToMte2); - WaitFlag(eventIdMte3ToMte2); + event_t eventIdMte3ToMte2 = static_cast(GetTPipePtr()->FetchEventID(AscendC::HardEvent::MTE3_MTE2)); + AscendC::SetFlag(eventIdMte3ToMte2); + AscendC::WaitFlag(eventIdMte3ToMte2); if 
(this->leftRow > 0) { CopyIn(this->singleCoreLoopCount, this->leftRow); Compute(this->singleCoreLoopCount, this->leftRow); @@ -98,22 +98,22 @@ public: private: __aicore__ inline void CopyIn(int32_t progress, uint32_t rowNum) { - LocalTensor xLocal = queueX.AllocTensor(); - LocalTensor yLocal = queueY.AllocTensor(); - DataCopy(xLocal, xGm[progress * this->tileLength], rowNum * this->columnLength); - DataCopy(yLocal, yGm[progress * this->tileLength], rowNum * this->columnLength); + AscendC::LocalTensor xLocal = queueX.AllocTensor(); + AscendC::LocalTensor yLocal = queueY.AllocTensor(); + AscendC::DataCopy(xLocal, xGm[progress * this->tileLength], rowNum * this->columnLength); + AscendC::DataCopy(yLocal, yGm[progress * this->tileLength], rowNum * this->columnLength); queueX.EnQue(xLocal); queueY.EnQue(yLocal); } __aicore__ inline void Compute(int32_t progressm, uint32_t rowNum) { - LocalTensor xLocal = queueX.DeQue(); - LocalTensor yLocal = queueY.DeQue(); - LocalTensor tmpBuffer = sharedTmpBuffer.Get(); + AscendC::LocalTensor xLocal = queueX.DeQue(); + AscendC::LocalTensor yLocal = queueY.DeQue(); + AscendC::LocalTensor tmpBuffer = sharedTmpBuffer.Get(); - SoftMaxShapeInfo srcShape = { rowNum, this->columnLength, rowNum, this->columnLength }; - SoftmaxGrad(yLocal, xLocal, yLocal, tmpBuffer, softmaxTiling, false, srcShape); + AscendC::SoftMaxShapeInfo srcShape = { rowNum, this->columnLength, rowNum, this->columnLength }; + AscendC::SoftmaxGrad(yLocal, xLocal, yLocal, tmpBuffer, softmaxTiling, false, srcShape); queueY.EnQue(yLocal); queueX.FreeTensor(xLocal); @@ -121,19 +121,19 @@ private: __aicore__ inline void CopyOut(int32_t progress, uint32_t rowNum) { - LocalTensor zLocal = queueY.DeQue(); - DataCopy(zGm[progress * this->tileLength], zLocal, rowNum * this->columnLength); + AscendC::LocalTensor zLocal = queueY.DeQue(); + AscendC::DataCopy(zGm[progress * this->tileLength], zLocal, rowNum * this->columnLength); queueY.FreeTensor(zLocal); } private: - TPipe pipe; - 
TBuf sharedTmpBuffer; - TQue queueX; - TQue queueY; - GlobalTensor xGm; - GlobalTensor yGm; - GlobalTensor zGm; + AscendC::TPipe pipe; + AscendC::TBuf sharedTmpBuffer; + AscendC::TQue queueX; + AscendC::TQue queueY; + AscendC::GlobalTensor xGm; + AscendC::GlobalTensor yGm; + AscendC::GlobalTensor zGm; uint32_t blockLength = 0; uint32_t usedBlockDim = 0; diff --git a/examples/activation/softmaxgrad/kernel_launch_method_by_direct/softmaxgrad_custom.cpp b/examples/activation/softmaxgrad/kernel_launch_method_by_direct/softmaxgrad_custom.cpp index 2ab23bb21584e6629c9ab0c0b9c671ef57cc2b51..febf4b5e8f8641cdd42baf6bbf1f57bd8d7437ee 100644 --- a/examples/activation/softmaxgrad/kernel_launch_method_by_direct/softmaxgrad_custom.cpp +++ b/examples/activation/softmaxgrad/kernel_launch_method_by_direct/softmaxgrad_custom.cpp @@ -9,14 +9,13 @@ */ #include "../kernel_impl/softmaxgrad_kernel.h" -using namespace AscendC; -__aicore__ inline void CopyTiling(VecTiling* tiling, GM_ADDR tilingGM) +__aicore__ inline void CopyTiling(MyCustomKernel::VecTiling* tiling, GM_ADDR tilingGM) { uint32_t* ptr = reinterpret_cast(tiling); auto tiling32 = reinterpret_cast<__gm__ uint32_t*>(tilingGM); - for (int i = 0; i < sizeof(VecTiling) / sizeof(uint32_t); i++, ptr++) { + for (int i = 0; i < sizeof(MyCustomKernel::VecTiling) / sizeof(uint32_t); i++, ptr++) { *ptr = *(tiling32 + i); } return; @@ -28,9 +27,9 @@ extern "C" __global__ __aicore__ void softmaxgrad_custom(GM_ADDR x, GM_ADDR y, G if ASCEND_IS_AIC { return; } - VecTiling tilingData; + MyCustomKernel::VecTiling tilingData; CopyTiling(&tilingData, tiling); - KernelSoftmax op; + MyCustomKernel::KernelSoftmax op; op.Init(x, y, z, tilingData); op.Process(); } diff --git a/examples/activation/softmaxgrad/kernel_launch_method_by_framework/op_kernel/softmaxgrad_custom.cpp b/examples/activation/softmaxgrad/kernel_launch_method_by_framework/op_kernel/softmaxgrad_custom.cpp index 
ef21e9b7e523f682c9402f7087c6dae06fe17c95..d466c81b9d53753980db1ae918db873e1b028ffa 100644 --- a/examples/activation/softmaxgrad/kernel_launch_method_by_framework/op_kernel/softmaxgrad_custom.cpp +++ b/examples/activation/softmaxgrad/kernel_launch_method_by_framework/op_kernel/softmaxgrad_custom.cpp @@ -9,7 +9,6 @@ */ #include "../../../../../../kernel_impl/softmaxgrad_kernel.h" -using namespace AscendC; extern "C" __global__ __aicore__ void softmaxgrad_custom(GM_ADDR x, GM_ADDR y, GM_ADDR z, GM_ADDR workspace, GM_ADDR tiling) @@ -18,9 +17,9 @@ extern "C" __global__ __aicore__ void softmaxgrad_custom(GM_ADDR x, GM_ADDR y, G return; } GET_TILING_DATA(tilingData, tiling); - VecTiling vecTiling = *reinterpret_cast(&tilingData); - KernelSoftmax op; - op.Init(x, y, z, vecTiling); + MyCustomKernel::VecTiling vecTiling = *reinterpret_cast(&tilingData); + MyCustomKernel::KernelSoftmax op; + op.Init(x, y, z, vecTiling); if (TILING_KEY_IS(1)) { op.Process(); } diff --git a/examples/activation/softmaxgradfront/kernel_impl/softmaxgradfront_kernel.h b/examples/activation/softmaxgradfront/kernel_impl/softmaxgradfront_kernel.h index 94a0659a1d7f84a7debd90017ccb410d5d522ae2..8a3fb2f2b8bf8462e9f44d2557a861ad2231ae5d 100644 --- a/examples/activation/softmaxgradfront/kernel_impl/softmaxgradfront_kernel.h +++ b/examples/activation/softmaxgradfront/kernel_impl/softmaxgradfront_kernel.h @@ -11,7 +11,7 @@ #ifndef EXAMPLES_ACTIVATION_SOFTMAXGRADFRONT_KERNEL_H #define EXAMPLES_ACTIVATION_SOFTMAXGRADFRONT_KERNEL_H #include "kernel_operator.h" -namespace AscendC { +namespace MyCustomKernel { constexpr int32_t BUFFER_NUM = 1; constexpr uint32_t FLOAT_NUM_OF_SINGEL_BLOCK = 8; constexpr uint32_t BASIC_BLOCK_ROW_FACTOR = 8; @@ -53,18 +53,18 @@ public: } __aicore__ inline void Init(GM_ADDR x, GM_ADDR y, GM_ADDR z, const VecTiling& tiling) { - ASSERT(GetBlockNum() != 0 && "block dim can not be zero!"); + ASSERT(AscendC::GetBlockNum() != 0 && "block dim can not be zero!"); InitTiling(tiling);
- if (GetBlockIdx() == this->usedBlockDim) { // tail core + if (AscendC::GetBlockIdx() == this->usedBlockDim) { // tail core this->singleLoopCoreRowNum = this->tailCoreSingleLoopCoreRowNum; this->singleCoreLoopCount = this->tailCoreSingleCoreLoopCount; this->leftRow = this->tailCoreSingleCoreLoopTail; } this->blockLength = this->coreRowNum * this->columnLength; - uint32_t offset1 = this->blockLength * GetBlockIdx(); - uint32_t offset2 = this->coreRowNum * FLOAT_NUM_OF_SINGEL_BLOCK * GetBlockIdx(); + uint32_t offset1 = this->blockLength * AscendC::GetBlockIdx(); + uint32_t offset2 = this->coreRowNum * FLOAT_NUM_OF_SINGEL_BLOCK * AscendC::GetBlockIdx(); xGm.SetGlobalBuffer((__gm__ float*)x + offset1, this->blockLength); yGm.SetGlobalBuffer((__gm__ float*)y + offset1, this->blockLength); @@ -80,7 +80,7 @@ public: __aicore__ inline void Process() { - if (GetBlockIdx() > usedBlockDim) { + if (AscendC::GetBlockIdx() > usedBlockDim) { return; } @@ -100,28 +100,28 @@ public: private: __aicore__ inline void CopyIn(int32_t progress, uint32_t rowNum) { - LocalTensor xLocal = queueX.AllocTensor(); - LocalTensor yLocal = queueY.AllocTensor(); - DataCopy(xLocal, xGm[progress * this->tileLength], rowNum * this->columnLength); - DataCopy(yLocal, yGm[progress * this->tileLength], rowNum * this->columnLength); + AscendC::LocalTensor xLocal = queueX.AllocTensor(); + AscendC::LocalTensor yLocal = queueY.AllocTensor(); + AscendC::DataCopy(xLocal, xGm[progress * this->tileLength], rowNum * this->columnLength); + AscendC::DataCopy(yLocal, yGm[progress * this->tileLength], rowNum * this->columnLength); queueX.EnQue(xLocal); queueY.EnQue(yLocal); } __aicore__ inline void Compute(int32_t progressm, uint32_t rowNum) { - LocalTensor xLocal = queueX.DeQue(); - LocalTensor yLocal = queueY.DeQue(); - LocalTensor zLocal = queueZ.AllocTensor(); - LocalTensor tmpBuffer = sharedTmpBuffer.Get(); + AscendC::LocalTensor xLocal = queueX.DeQue(); + AscendC::LocalTensor yLocal = queueY.DeQue(); + 
AscendC::LocalTensor zLocal = queueZ.AllocTensor(); + AscendC::LocalTensor tmpBuffer = sharedTmpBuffer.Get(); - SoftMaxShapeInfo srcShape = { rowNum, this->columnLength, rowNum, this->columnLength }; + AscendC::SoftMaxShapeInfo srcShape = { rowNum, this->columnLength, rowNum, this->columnLength }; if (this->singleLoopCoreRowNum % BASIC_BLOCK_ROW_FACTOR == 0 && this->columnLength % BASIC_BLOCK_COLUMN_FACTOR == 0 && this->columnLength < BASIC_BLOCK_MAX_COLUMN_LENGTH) { - SoftmaxGradFront(zLocal, xLocal, yLocal, tmpBuffer, softmaxTiling, srcShape); + AscendC::SoftmaxGradFront(zLocal, xLocal, yLocal, tmpBuffer, softmaxTiling, srcShape); } else { - SoftmaxGradFront(zLocal, xLocal, yLocal, tmpBuffer, softmaxTiling, srcShape); + AscendC::SoftmaxGradFront(zLocal, xLocal, yLocal, tmpBuffer, softmaxTiling, srcShape); } queueZ.EnQue(zLocal); @@ -131,21 +131,21 @@ private: __aicore__ inline void CopyOut(int32_t progress, uint32_t rowNum) { - LocalTensor zLocal = queueZ.DeQue(); - DataCopy(zGm[progress * this->singleLoopCoreRowNum * FLOAT_NUM_OF_SINGEL_BLOCK], zLocal, + AscendC::LocalTensor zLocal = queueZ.DeQue(); + AscendC::DataCopy(zGm[progress * this->singleLoopCoreRowNum * FLOAT_NUM_OF_SINGEL_BLOCK], zLocal, rowNum * FLOAT_NUM_OF_SINGEL_BLOCK); queueZ.FreeTensor(zLocal); } private: - TPipe pipe; - TBuf sharedTmpBuffer; - TQue queueX; - TQue queueY; - TQue queueZ; - GlobalTensor xGm; - GlobalTensor yGm; - GlobalTensor zGm; + AscendC::TPipe pipe; + AscendC::TBuf sharedTmpBuffer; + AscendC::TQue queueX; + AscendC::TQue queueY; + AscendC::TQue queueZ; + AscendC::GlobalTensor xGm; + AscendC::GlobalTensor yGm; + AscendC::GlobalTensor zGm; uint32_t blockLength = 0; uint32_t usedBlockDim = 0; diff --git a/examples/activation/softmaxgradfront/kernel_launch_method_by_direct/softmaxgradfront_custom.cpp b/examples/activation/softmaxgradfront/kernel_launch_method_by_direct/softmaxgradfront_custom.cpp index 
c4a19c76e577a1e9fcb662539b73b9385e81af3e..187857257d98443087e868ff58afb9c614264152 100644 --- a/examples/activation/softmaxgradfront/kernel_launch_method_by_direct/softmaxgradfront_custom.cpp +++ b/examples/activation/softmaxgradfront/kernel_launch_method_by_direct/softmaxgradfront_custom.cpp @@ -9,14 +9,13 @@ */ #include "../kernel_impl/softmaxgradfront_kernel.h" -using namespace AscendC; -__aicore__ inline void CopyTiling(VecTiling* tiling, GM_ADDR tilingGM) +__aicore__ inline void CopyTiling(MyCustomKernel::VecTiling* tiling, GM_ADDR tilingGM) { uint32_t* ptr = reinterpret_cast(tiling); auto tiling32 = reinterpret_cast<__gm__ uint32_t*>(tilingGM); - for (int i = 0; i < sizeof(VecTiling) / sizeof(uint32_t); i++, ptr++) { + for (int i = 0; i < sizeof(MyCustomKernel::VecTiling) / sizeof(uint32_t); i++, ptr++) { *ptr = *(tiling32 + i); } return; @@ -28,10 +27,10 @@ extern "C" __global__ __aicore__ void softmaxgradfront_custom(GM_ADDR x, GM_ADDR if ASCEND_IS_AIC { return; } - VecTiling tilingData; + MyCustomKernel::VecTiling tilingData; CopyTiling(&tilingData, tiling); - KernelSoftmax op; - op.Init(x, y, z, tilingData); + MyCustomKernel::KernelSoftmax op; + op.Init(x, y, z, tilingData); op.Process(); } diff --git a/examples/activation/softmaxgradfront/kernel_launch_method_by_framework/op_kernel/softmaxgradfront_custom.cpp b/examples/activation/softmaxgradfront/kernel_launch_method_by_framework/op_kernel/softmaxgradfront_custom.cpp index ad346e4b664e284479a9dff52a2a2432c1f053ea..3557e4c25ab0e150feb2d2f6fe69abf86ca070cf 100644 --- a/examples/activation/softmaxgradfront/kernel_launch_method_by_framework/op_kernel/softmaxgradfront_custom.cpp +++ b/examples/activation/softmaxgradfront/kernel_launch_method_by_framework/op_kernel/softmaxgradfront_custom.cpp @@ -8,7 +8,6 @@ * See LICENSE in the root of the software repository for the full text of the License.
*/ #include "../../../../../../kernel_impl/softmaxgradfront_kernel.h" -using namespace AscendC; extern "C" __global__ __aicore__ void softmaxgradfront_custom(GM_ADDR x, GM_ADDR y, GM_ADDR z, GM_ADDR workspace, GM_ADDR tiling) @@ -17,9 +16,9 @@ extern "C" __global__ __aicore__ void softmaxgradfront_custom(GM_ADDR x, GM_ADDR return; } GET_TILING_DATA(tilingData, tiling); - VecTiling vecTiling = *reinterpret_cast(&tilingData); - KernelSoftmax op; - op.Init(x, y, z, vecTiling); + MyCustomKernel::VecTiling vecTiling = *reinterpret_cast(&tilingData); + MyCustomKernel::KernelSoftmax op; + op.Init(x, y, z, vecTiling); if (TILING_KEY_IS(1)) { op.Process(); } diff --git a/examples/matrix/batch_matmul/kernel_impl/batch_matmul_custom_impl.h b/examples/matrix/batch_matmul/kernel_impl/batch_matmul_custom_impl.h index be0b4aaa27888986b9037ff39171e845d78f58f1..eab9dfd011560299465d359992af2e27b49e39f4 100644 --- a/examples/matrix/batch_matmul/kernel_impl/batch_matmul_custom_impl.h +++ b/examples/matrix/batch_matmul/kernel_impl/batch_matmul_custom_impl.h @@ -13,7 +13,6 @@ #include "kernel_operator.h" #include "lib/matmul_intf.h" -using namespace AscendC; using namespace matmul; constexpr int USED_CORE_NUM = 2; @@ -24,7 +23,7 @@ class BatchMatmulKernel { __aicore__ inline BatchMatmulKernel(){}; __aicore__ inline void Init(GM_ADDR a, GM_ADDR b, GM_ADDR bias, GM_ADDR c, GM_ADDR workspace, const TCubeTiling& tiling); template - __aicore__ inline void Process(TPipe* pipe, int32_t batchA, int32_t batchB); + __aicore__ inline void Process(AscendC::TPipe* pipe, int32_t batchA, int32_t batchB); Matmul matmulObj; private: __aicore__ inline void CalcOffset(int32_t blockIdx, const TCubeTiling& tiling, int32_t& offsetA, int32_t& offsetB, @@ -33,10 +32,10 @@ class BatchMatmulKernel { using bType = typename B_TYPE::T; using cType = typename C_TYPE::T; using biasType = typename BIAS_TYPE::T; - GlobalTensor aGlobal; - GlobalTensor bGlobal; - GlobalTensor cGlobal; - GlobalTensor biasGlobal; +
AscendC::GlobalTensor aGlobal; + AscendC::GlobalTensor bGlobal; + AscendC::GlobalTensor cGlobal; + AscendC::GlobalTensor biasGlobal; TCubeTiling tiling; }; @@ -59,7 +58,7 @@ __aicore__ inline void BatchMatmulKernel::Ini int32_t offsetB = 0; int32_t offsetC = 0; int32_t offsetBias = 0; - CalcOffset(GetBlockIdx(), tiling, offsetA, offsetB, offsetC, offsetBias); + CalcOffset(AscendC::GetBlockIdx(), tiling, offsetA, offsetB, offsetC, offsetBias); aGlobal = aGlobal[offsetA]; bGlobal = bGlobal[offsetB]; cGlobal = cGlobal[offsetC]; @@ -71,7 +70,7 @@ __aicore__ inline void BatchMatmulKernel::Ini template template -__aicore__ inline void BatchMatmulKernel::Process(TPipe* pipe, int32_t batchA, int32_t batchB) +__aicore__ inline void BatchMatmulKernel::Process(AscendC::TPipe* pipe, int32_t batchA, int32_t batchB) { int batchC = batchA > batchB ? batchA : batchB; int gLay = tiling.ALayoutInfoG > tiling.BLayoutInfoG ? tiling.ALayoutInfoG : tiling.BLayoutInfoG; diff --git a/examples/matrix/batch_matmul/kernel_launch_method_by_direct/batch_matmul_custom.cpp b/examples/matrix/batch_matmul/kernel_launch_method_by_direct/batch_matmul_custom.cpp index d115a3102dabc720482bbb40dd7b993521c6bb05..b1abfcd9f25ea7af7fb3fc9c27add3b0d2dd7952 100644 --- a/examples/matrix/batch_matmul/kernel_launch_method_by_direct/batch_matmul_custom.cpp +++ b/examples/matrix/batch_matmul/kernel_launch_method_by_direct/batch_matmul_custom.cpp @@ -12,7 +12,6 @@ #include "lib/matmul_intf.h" #include "../kernel_impl/batch_matmul_custom_impl.h" -using namespace AscendC; using namespace matmul; constexpr int32_t FULL_L1_SIZE = 512 * 1024; @@ -39,7 +38,7 @@ extern "C" __global__ __aicore__ void batch_matmul_custom(GM_ADDR a, GM_ADDR b, typedef MatmulType C_TYPE; typedef MatmulType BIAS_TYPE; BatchMatmulKernel batchMatmulKernel; - TPipe pipe; + AscendC::TPipe pipe; tiling.shareMode = 0; // 0, share mode tiling.shareL1Size = FULL_L1_SIZE; // full L1 tiling.shareL0CSize = FULL_L0C_SIZE; // full L0C diff --git 
a/examples/matrix/batch_matmul/kernel_launch_method_by_framework/op_host/batch_matmul_custom.cpp b/examples/matrix/batch_matmul/kernel_launch_method_by_framework/op_host/batch_matmul_custom.cpp index b7ceb70d73ff3879524afce531d66fcdebcd4f53..ff053d118fa21741d401bbaab613f1c5516b4491 100644 --- a/examples/matrix/batch_matmul/kernel_launch_method_by_framework/op_host/batch_matmul_custom.cpp +++ b/examples/matrix/batch_matmul/kernel_launch_method_by_framework/op_host/batch_matmul_custom.cpp @@ -28,7 +28,7 @@ static ge::graphStatus TilingFunc(gert::TilingContext* context) tiling.SaveToBuffer(context->GetRawTilingData()->GetData(), context->GetRawTilingData()->GetCapacity()); context->GetRawTilingData()->SetDataSize(tiling.GetDataSize()); size_t userWorkspaceSize = 0; - size_t systemWorkspaceSize = ascendcPlatform.GetLibApiWorkSpaceSize(); + size_t systemWorkspaceSize = static_cast(ascendcPlatform.GetLibApiWorkSpaceSize()); size_t *currentWorkspace = context->GetWorkspaceSizes(1); currentWorkspace[0] = userWorkspaceSize + systemWorkspaceSize; diff --git a/examples/matrix/batch_matmul/kernel_launch_method_by_framework/op_kernel/batch_matmul_custom.cpp b/examples/matrix/batch_matmul/kernel_launch_method_by_framework/op_kernel/batch_matmul_custom.cpp index ff9547f98879fcf7ed144391e8ded631bef0c90d..a653a0f5b69e4b881cccd4f744a980576261c307 100644 --- a/examples/matrix/batch_matmul/kernel_launch_method_by_framework/op_kernel/batch_matmul_custom.cpp +++ b/examples/matrix/batch_matmul/kernel_launch_method_by_framework/op_kernel/batch_matmul_custom.cpp @@ -12,7 +12,6 @@ #include "lib/matmul_intf.h" #include "../../../../../../kernel_impl/batch_matmul_custom_impl.h" -using namespace AscendC; using namespace matmul; constexpr int32_t FULL_L1_SIZE = 512 * 1024; @@ -25,7 +24,7 @@ extern "C" __global__ __aicore__ void batch_matmul_custom(GM_ADDR a, GM_ADDR b, typedef MatmulType C_TYPE; typedef MatmulType BIAS_TYPE; BatchMatmulKernel batchMatmulKernel; - TPipe pipe; + AscendC::TPipe 
pipe; tilingData.cubeTilingData.shareMode = 0; // 0, share mode tilingData.cubeTilingData.shareL1Size = FULL_L1_SIZE; // full L1 tilingData.cubeTilingData.shareL0CSize = FULL_L0C_SIZE; // full L0C diff --git a/examples/matrix/matmul/kernel_impl/matmul_custom_impl.h b/examples/matrix/matmul/kernel_impl/matmul_custom_impl.h index ef815c6959f11c9e29dc58bb3dc19347eb33183c..c79040156cecdb1dcaea009782f9e0dad25c1a8b 100644 --- a/examples/matrix/matmul/kernel_impl/matmul_custom_impl.h +++ b/examples/matrix/matmul/kernel_impl/matmul_custom_impl.h @@ -13,7 +13,6 @@ #include "kernel_operator.h" #include "lib/matmul_intf.h" -using namespace AscendC; using namespace matmul; template @@ -22,7 +21,7 @@ class MatmulKernel { __aicore__ inline MatmulKernel(){}; __aicore__ inline void Init(GM_ADDR a, GM_ADDR b, GM_ADDR bias, GM_ADDR c, GM_ADDR workspace, const TCubeTiling& tiling); template - __aicore__ inline void Process(TPipe* pipe); + __aicore__ inline void Process(AscendC::TPipe* pipe); Matmul, MatmulType, MatmulType, @@ -32,10 +31,10 @@ class MatmulKernel { __aicore__ inline void CalcOffset(int32_t blockIdx, const TCubeTiling& tiling, int32_t& offsetA, int32_t& offsetB, int32_t& offsetC, int32_t& offsetBias, bool isAtrans, bool isBtrans); - GlobalTensor aGlobal; - GlobalTensor bGlobal; - GlobalTensor cGlobal; - GlobalTensor biasGlobal; + AscendC::GlobalTensor aGlobal; + AscendC::GlobalTensor bGlobal; + AscendC::GlobalTensor cGlobal; + AscendC::GlobalTensor biasGlobal; TCubeTiling tiling; }; @@ -55,7 +54,7 @@ __aicore__ inline void MatmulKernel::Init(GM_ADDR int32_t offsetBias = 0; bool isAtrans = true; bool isBtrans = false; - CalcOffset(GetBlockIdx(), tiling, offsetA, offsetB, offsetC, offsetBias, isAtrans, isBtrans); + CalcOffset(AscendC::GetBlockIdx(), tiling, offsetA, offsetB, offsetC, offsetBias, isAtrans, isBtrans); aGlobal = aGlobal[offsetA]; bGlobal = bGlobal[offsetB]; cGlobal = cGlobal[offsetC]; @@ -67,11 +66,11 @@ __aicore__ inline void MatmulKernel::Init(GM_ADDR 
template template -__aicore__ inline void MatmulKernel::Process(TPipe* pipe) +__aicore__ inline void MatmulKernel::Process(AscendC::TPipe* pipe) { if constexpr (setTmpSpace) { - TBuf<> tmpMMFormatUb; - LocalTensor mmformatUb; + AscendC::TBuf<> tmpMMFormatUb; + AscendC::LocalTensor mmformatUb; pipe->InitBuffer(tmpMMFormatUb, TOTAL_VEC_LOCAL_SIZE); mmformatUb = tmpMMFormatUb.Get(TOTAL_VEC_LOCAL_SIZE); matmulObj.SetLocalWorkspace(mmformatUb); @@ -86,11 +85,16 @@ __aicore__ inline void MatmulKernel::Process(TPip matmulObj.End(); } +__aicore__ inline uint32_t Ceiling(uint32_t a, uint32_t b) +{ + return (a + b - 1) / b; +} + template __aicore__ inline void MatmulKernel::CalcOffset(int32_t blockIdx, const TCubeTiling& tiling, int32_t& offsetA, int32_t& offsetB, int32_t& offsetC, int32_t& offsetBias, bool isAtrans, bool isBtrans) { - auto mSingleBlocks = Ceil(tiling.M, tiling.singleCoreM); + auto mSingleBlocks = Ceiling(tiling.M, tiling.singleCoreM); auto mCoreIndx = blockIdx % mSingleBlocks; auto nCoreIndx = blockIdx / mSingleBlocks; diff --git a/examples/matrix/matmul/kernel_launch_method_by_direct/matmul_custom.cpp b/examples/matrix/matmul/kernel_launch_method_by_direct/matmul_custom.cpp index be66d0e6b0d1713347e605321680578198accc7d..a274b83f0b1dec6b23707382c9a27a29f8b61207 100644 --- a/examples/matrix/matmul/kernel_launch_method_by_direct/matmul_custom.cpp +++ b/examples/matrix/matmul/kernel_launch_method_by_direct/matmul_custom.cpp @@ -12,7 +12,6 @@ #include "lib/matmul_intf.h" #include "../kernel_impl/matmul_custom_impl.h" -using namespace AscendC; using namespace matmul; __aicore__ inline void CopyTiling(TCubeTiling* tiling, GM_ADDR tilingGM) @@ -31,7 +30,7 @@ extern "C" __global__ __aicore__ void matmul_custom(GM_ADDR a, GM_ADDR b, GM_ADD TCubeTiling tiling; CopyTiling(&tiling, tilingGm); MatmulKernel matmulKernel; - TPipe pipe; + AscendC::TPipe pipe; REGIST_MATMUL_OBJ(&pipe, GetSysWorkSpacePtr(), matmulKernel.matmulObj, &tiling); matmulKernel.Init(a, b, nullptr, 
c, workspace, tiling); #ifdef CUSTOM_ASCEND310P diff --git a/examples/matrix/matmul/kernel_launch_method_by_framework/op_host/matmul_custom.cpp b/examples/matrix/matmul/kernel_launch_method_by_framework/op_host/matmul_custom.cpp index adab49f0ba145258e4b153a48ab9ed03a51329db..95d12755765457605eff03212ce9b4ac050e13bb 100644 --- a/examples/matrix/matmul/kernel_launch_method_by_framework/op_host/matmul_custom.cpp +++ b/examples/matrix/matmul/kernel_launch_method_by_framework/op_host/matmul_custom.cpp @@ -48,7 +48,7 @@ static ge::graphStatus TilingFunc(gert::TilingContext* context) tiling.SaveToBuffer(context->GetRawTilingData()->GetData(), context->GetRawTilingData()->GetCapacity()); context->GetRawTilingData()->SetDataSize(tiling.GetDataSize()); size_t userWorkspaceSize = 0; - size_t systemWorkspaceSize = ascendcPlatform.GetLibApiWorkSpaceSize(); + size_t systemWorkspaceSize = static_cast(ascendcPlatform.GetLibApiWorkSpaceSize()); size_t *currentWorkspace = context->GetWorkspaceSizes(1); currentWorkspace[0] = userWorkspaceSize + systemWorkspaceSize; diff --git a/examples/matrix/matmul/kernel_launch_method_by_framework/op_kernel/matmul_custom.cpp b/examples/matrix/matmul/kernel_launch_method_by_framework/op_kernel/matmul_custom.cpp index 09c40946bf186e89585f827e50dd7d5cc9f26656..af60b3d49c0b0d5d85f81757438834636294ef33 100644 --- a/examples/matrix/matmul/kernel_launch_method_by_framework/op_kernel/matmul_custom.cpp +++ b/examples/matrix/matmul/kernel_launch_method_by_framework/op_kernel/matmul_custom.cpp @@ -12,13 +12,12 @@ #include "lib/matmul_intf.h" #include "../../../../../../kernel_impl/matmul_custom_impl.h" -using namespace AscendC; using namespace matmul; extern "C" __global__ __aicore__ void matmul_custom(GM_ADDR a, GM_ADDR b, GM_ADDR bias, GM_ADDR c, GM_ADDR workspace, GM_ADDR tiling) { GET_TILING_DATA(tilingData, tiling); MatmulKernel matmulKernel; - TPipe pipe; + AscendC::TPipe pipe; REGIST_MATMUL_OBJ(&pipe, GetSysWorkSpacePtr(), 
matmulKernel.matmulObj,&tilingData.cubeTilingData); matmulKernel.Init(a, b, bias, c, workspace, tilingData.cubeTilingData); if (TILING_KEY_IS(1)) { diff --git a/examples/normalization/layernorm/kernel_impl/layernorm_custom.h b/examples/normalization/layernorm/kernel_impl/layernorm_custom.h index c838f882bd0ff96213fe48016056d51f6c333546..435a7abcb28cb16191bd07b1221de5cd77ab557b 100644 --- a/examples/normalization/layernorm/kernel_impl/layernorm_custom.h +++ b/examples/normalization/layernorm/kernel_impl/layernorm_custom.h @@ -12,7 +12,7 @@ #define EXAMPLES_NORMALIZATION_LAYERNORM_CUSTOM_H #include "kernel_operator.h" -namespace AscendC { +namespace MyCustomKernel { struct VecTiling { LayerNormTiling layernormTilingData; float epsilon = 0; @@ -58,13 +58,13 @@ public: private: __aicore__ inline void CopyIn() { - LocalTensor inputXLocal = inQueueX.AllocTensor(); - LocalTensor gammaLocal = inQueueGamma.AllocTensor(); - LocalTensor betaLocal = inQueueBeta.AllocTensor(); + AscendC::LocalTensor inputXLocal = inQueueX.AllocTensor(); + AscendC::LocalTensor gammaLocal = inQueueGamma.AllocTensor(); + AscendC::LocalTensor betaLocal = inQueueBeta.AllocTensor(); - DataCopy(inputXLocal, inputXGlobal, bshLength); - DataCopy(gammaLocal, gammGlobal, hLength); - DataCopy(betaLocal, betaGlobal, hLength); + AscendC::DataCopy(inputXLocal, inputXGlobal, bshLength); + AscendC::DataCopy(gammaLocal, gammGlobal, hLength); + AscendC::DataCopy(betaLocal, betaGlobal, hLength); inQueueX.EnQue(inputXLocal); inQueueGamma.EnQue(gammaLocal); @@ -72,15 +72,15 @@ private: } __aicore__ inline void Compute() { - LocalTensor inputXLocal = inQueueX.DeQue(); - LocalTensor gammaLocal = inQueueGamma.DeQue(); - LocalTensor betaLocal = inQueueBeta.DeQue(); + AscendC::LocalTensor inputXLocal = inQueueX.DeQue(); + AscendC::LocalTensor gammaLocal = inQueueGamma.DeQue(); + AscendC::LocalTensor betaLocal = inQueueBeta.DeQue(); - LocalTensor outputLocal = outQueue.AllocTensor(); - LocalTensor meanLocal = 
outQueueMean.AllocTensor(); - LocalTensor varianceLocal = outQueueVariance.AllocTensor(); + AscendC::LocalTensor outputLocal = outQueue.AllocTensor(); + AscendC::LocalTensor meanLocal = outQueueMean.AllocTensor(); + AscendC::LocalTensor varianceLocal = outQueueVariance.AllocTensor(); - LayerNorm(outputLocal, meanLocal, varianceLocal, inputXLocal, gammaLocal, betaLocal, + AscendC::LayerNorm(outputLocal, meanLocal, varianceLocal, inputXLocal, gammaLocal, betaLocal, (float)epsilon, tiling_); outQueue.EnQue(outputLocal); @@ -93,13 +93,13 @@ private: } __aicore__ inline void CopyOut() { - LocalTensor outputLocal = outQueue.DeQue(); - LocalTensor meanLocal = outQueueMean.DeQue(); - LocalTensor varianceLocal = outQueueVariance.DeQue(); + AscendC::LocalTensor outputLocal = outQueue.DeQue(); + AscendC::LocalTensor meanLocal = outQueueMean.DeQue(); + AscendC::LocalTensor varianceLocal = outQueueVariance.DeQue(); - DataCopy(outputGlobal, outputLocal, bshLength); - DataCopy(outputMeanGlobal, meanLocal, bsLength); - DataCopy(outputVarianceGlobal, varianceLocal, bsLength); + AscendC::DataCopy(outputGlobal, outputLocal, bshLength); + AscendC::DataCopy(outputMeanGlobal, meanLocal, bsLength); + AscendC::DataCopy(outputVarianceGlobal, varianceLocal, bsLength); outQueue.FreeTensor(outputLocal); outQueueMean.FreeTensor(meanLocal); @@ -107,20 +107,20 @@ private: } private: - GlobalTensor inputXGlobal; - GlobalTensor gammGlobal; - GlobalTensor betaGlobal; - GlobalTensor outputGlobal; - GlobalTensor outputMeanGlobal; - GlobalTensor outputVarianceGlobal; - - TPipe pipe; - TQue inQueueX; - TQue inQueueGamma; - TQue inQueueBeta; - TQue outQueue; - TQue outQueueMean; - TQue outQueueVariance; + AscendC::GlobalTensor inputXGlobal; + AscendC::GlobalTensor gammGlobal; + AscendC::GlobalTensor betaGlobal; + AscendC::GlobalTensor outputGlobal; + AscendC::GlobalTensor outputMeanGlobal; + AscendC::GlobalTensor outputVarianceGlobal; + + AscendC::TPipe pipe; + AscendC::TQue inQueueX; + AscendC::TQue 
inQueueGamma; + AscendC::TQue inQueueBeta; + AscendC::TQue outQueue; + AscendC::TQue outQueueMean; + AscendC::TQue outQueueVariance; uint32_t bLength; uint32_t sLength; diff --git a/examples/normalization/layernorm/kernel_launch_method_by_direct/layernorm_custom.cpp b/examples/normalization/layernorm/kernel_launch_method_by_direct/layernorm_custom.cpp index b225a098fd29e5733b593f34e007dc602364eb89..9358217ee14ba879894feab10aadc1678c6d086a 100644 --- a/examples/normalization/layernorm/kernel_launch_method_by_direct/layernorm_custom.cpp +++ b/examples/normalization/layernorm/kernel_launch_method_by_direct/layernorm_custom.cpp @@ -9,14 +9,13 @@ */ #include "kernel_operator.h" #include "../kernel_impl/layernorm_custom.h" -using namespace AscendC; -__aicore__ inline void CopyTiling(VecTiling *tiling, GM_ADDR tilingGM) +__aicore__ inline void CopyTiling(MyCustomKernel::VecTiling *tiling, GM_ADDR tilingGM) { uint32_t *ptr = reinterpret_cast(tiling); auto tiling32 = reinterpret_cast<__gm__ uint32_t *>(tilingGM); - for (int i = 0; i < sizeof(VecTiling) / sizeof(uint32_t); i++, ptr++) { + for (int i = 0; i < sizeof(MyCustomKernel::VecTiling) / sizeof(uint32_t); i++, ptr++) { *ptr = *(tiling32 + i); } return; @@ -28,9 +27,9 @@ extern "C" __global__ __aicore__ void layernorm_custom(GM_ADDR inputXGm, GM_ADDR if ASCEND_IS_AIC { return; } - VecTiling tilingData; + MyCustomKernel::VecTiling tilingData; CopyTiling(&tilingData, tiling); - KernelLayernorm op; + MyCustomKernel::KernelLayernorm op; op.Init(inputXGm, gammaGm, betaGm, outputGm, outputMeanGm, outputVarianceGm, tilingData); op.Process(); } diff --git a/examples/normalization/layernorm/kernel_launch_method_by_framework/op_kernel/layernorm_custom.cpp b/examples/normalization/layernorm/kernel_launch_method_by_framework/op_kernel/layernorm_custom.cpp index 17db0fbd8853495ff206f8c7127d5705e9dd2a87..2bcc86185f9e6aee430abc9fab0690ba5ed54d59 100644 --- 
a/examples/normalization/layernorm/kernel_launch_method_by_framework/op_kernel/layernorm_custom.cpp +++ b/examples/normalization/layernorm/kernel_launch_method_by_framework/op_kernel/layernorm_custom.cpp @@ -9,15 +9,14 @@ */ #include "../../../../../../kernel_impl/layernorm_custom.h" -using namespace AscendC; extern "C" __global__ __aicore__ void layernorm_custom(GM_ADDR inputXGm, GM_ADDR gammaGm, GM_ADDR betaGm, GM_ADDR outputGm, GM_ADDR outputMeanGm, GM_ADDR outputVarianceGm, GM_ADDR workspace, GM_ADDR tiling) { GET_TILING_DATA(tilingData, tiling); - VecTiling vecTiling = *reinterpret_cast(&tilingData); + MyCustomKernel::VecTiling vecTiling = *reinterpret_cast(&tilingData); if (TILING_KEY_IS(1)) { - KernelLayernorm op; + MyCustomKernel::KernelLayernorm op; op.Init(inputXGm, gammaGm, betaGm, outputGm, outputMeanGm, outputVarianceGm, vecTiling); op.Process(); } diff --git a/examples/pad/broadcast/kernel_impl/broadcast_custom.h b/examples/pad/broadcast/kernel_impl/broadcast_custom.h index d816db349f2414fe5563ac24549178e9712594c4..6df59bf8caa2d5ea9e845a5a7e1fc4ac42c48523 100644 --- a/examples/pad/broadcast/kernel_impl/broadcast_custom.h +++ b/examples/pad/broadcast/kernel_impl/broadcast_custom.h @@ -13,7 +13,6 @@ #include "kernel_operator.h" constexpr int32_t BUFFER_NUM = 1; -using namespace AscendC; template class KernelBroadcastCustom { public: @@ -22,7 +21,7 @@ public: __aicore__ inline void Init(GM_ADDR x, GM_ADDR y, uint32_t srcLength, uint32_t dstLength, const uint32_t srcShape[dim], const uint32_t dstShape[dim]) { - AscendCUtils::SetOverflow(1); + AscendC::AscendCUtils::SetOverflow(1); xGm.SetGlobalBuffer((__gm__ T *)x, srcLength); yGm.SetGlobalBuffer((__gm__ T *)y, dstLength); @@ -44,32 +43,32 @@ public: private: __aicore__ inline void CopyIn() { - LocalTensor xLocal = inQueueX.AllocTensor(); - DataCopy(xLocal, xGm, srcLength_); + AscendC::LocalTensor xLocal = inQueueX.AllocTensor(); + AscendC::DataCopy(xLocal, xGm, srcLength_); inQueueX.EnQue(xLocal); } 
__aicore__ inline void Compute() { - LocalTensor xLocal = inQueueX.DeQue(); - LocalTensor yLocal = outQueueY.AllocTensor(); - BroadCast(yLocal, xLocal, dstShape_, srcShape_); + AscendC::LocalTensor xLocal = inQueueX.DeQue(); + AscendC::LocalTensor yLocal = outQueueY.AllocTensor(); + AscendC::BroadCast(yLocal, xLocal, dstShape_, srcShape_); outQueueY.EnQue(yLocal); inQueueX.FreeTensor(xLocal); } __aicore__ inline void CopyOut() { - LocalTensor yLocal = outQueueY.DeQue(); - DataCopy(yGm, yLocal, dstLength_); + AscendC::LocalTensor yLocal = outQueueY.DeQue(); + AscendC::DataCopy(yGm, yLocal, dstLength_); outQueueY.FreeTensor(yLocal); } private: - TPipe pipe; - TQue inQueueX; - TQue outQueueY; - GlobalTensor xGm; - GlobalTensor yGm; + AscendC::TPipe pipe; + AscendC::TQue inQueueX; + AscendC::TQue outQueueY; + AscendC::GlobalTensor xGm; + AscendC::GlobalTensor yGm; uint32_t srcLength_; uint32_t dstLength_; const uint32_t *srcShape_{nullptr}; diff --git a/examples/pad/broadcast/kernel_launch_method_by_direct/broadcast_custom.cpp b/examples/pad/broadcast/kernel_launch_method_by_direct/broadcast_custom.cpp index 28a17316371aeff1da749b7c4dd0fba2cd78c2fa..395ac0349a140c5a2cee5bec5ef3cf23d45723fd 100644 --- a/examples/pad/broadcast/kernel_launch_method_by_direct/broadcast_custom.cpp +++ b/examples/pad/broadcast/kernel_launch_method_by_direct/broadcast_custom.cpp @@ -8,7 +8,6 @@ * See LICENSE in the root of the software repository for the full text of the License. 
*/ #include "../kernel_impl/broadcast_custom.h" -using namespace AscendC; struct BroadcastTilingData { uint32_t dim{0}; uint32_t axis{0}; diff --git a/examples/pad/broadcast/kernel_launch_method_by_framework/op_kernel/broadcast_custom.cpp b/examples/pad/broadcast/kernel_launch_method_by_framework/op_kernel/broadcast_custom.cpp index a8fd04643557b09f3948d026b810cd7125580f72..230a5975c6b557a8cfa7cae6768c3a1946bc33da 100644 --- a/examples/pad/broadcast/kernel_launch_method_by_framework/op_kernel/broadcast_custom.cpp +++ b/examples/pad/broadcast/kernel_launch_method_by_framework/op_kernel/broadcast_custom.cpp @@ -8,7 +8,6 @@ * See LICENSE in the root of the software repository for the full text of the License. */ #include "../../../../../../kernel_impl/broadcast_custom.h" -using namespace AscendC; extern "C" __global__ __aicore__ void broadcast_custom(GM_ADDR x, GM_ADDR y, GM_ADDR workspace, GM_ADDR tiling) {