From 622e33008fdb85921aa99f97e36628988694a04f Mon Sep 17 00:00:00 2001 From: Weidong Guo Date: Tue, 13 Aug 2024 06:25:48 +0000 Subject: [PATCH] =?UTF-8?q?!330=20=E7=A7=BB=E9=99=A4=E4=BA=86examples?= =?UTF-8?q?=E4=B8=AD=E7=9A=84using=20namespace=20AscendC=20Merge=20pull=20?= =?UTF-8?q?request=20!330=20from=20Weidong=20Guo/master?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../kernel_impl/simplesoftmax_kernel.h | 58 +++++++-------- .../simplesoftmax_custom.cpp | 11 ++- .../op_kernel/simplesoftmax_custom.cpp | 7 +- .../softmax/kernel_impl/softmax_kernel.h | 58 +++++++-------- .../softmax_custom.cpp | 11 ++- .../op_kernel/softmax_custom.cpp | 7 +- .../kernel_impl/softmaxflash_kernel.h | 73 ++++++++++--------- .../softmaxflash_custom.cpp | 11 ++- .../op_kernel/softmaxflash_custom.cpp | 7 +- .../kernel_impl/softmaxgrad_kernel.h | 52 ++++++------- .../softmaxgrad_custom.cpp | 9 +-- .../op_kernel/softmaxgrad_custom.cpp | 7 +- .../kernel_impl/softmaxgradfront_kernel.h | 54 +++++++------- .../softmaxgradfront_custom.cpp | 11 ++- .../op_kernel/softmaxgradfront_custom.cpp | 7 +- .../kernel_impl/batch_matmul_custom_impl.h | 15 ++-- .../batch_matmul_custom.cpp | 3 +- .../op_host/batch_matmul_custom.cpp | 2 +- .../op_kernel/batch_matmul_custom.cpp | 3 +- .../matmul/kernel_impl/matmul_custom_impl.h | 26 ++++--- .../matmul_custom.cpp | 3 +- .../op_host/matmul_custom.cpp | 2 +- .../op_kernel/matmul_custom.cpp | 3 +- .../layernorm/kernel_impl/layernorm_custom.h | 68 ++++++++--------- .../layernorm_custom.cpp | 9 +-- .../op_kernel/layernorm_custom.cpp | 5 +- .../broadcast/kernel_impl/broadcast_custom.h | 27 ++++--- .../broadcast_custom.cpp | 1 - .../op_kernel/broadcast_custom.cpp | 1 - 29 files changed, 268 insertions(+), 283 deletions(-) diff --git a/examples/activation/simplesoftmax/kernel_impl/simplesoftmax_kernel.h b/examples/activation/simplesoftmax/kernel_impl/simplesoftmax_kernel.h index 14c7387a..d8079d95 100644 --- 
a/examples/activation/simplesoftmax/kernel_impl/simplesoftmax_kernel.h +++ b/examples/activation/simplesoftmax/kernel_impl/simplesoftmax_kernel.h @@ -12,7 +12,7 @@ #define EXAMPLES_ACTIVATION_SIMPLESOFTMAX_KERNEL_H #include "kernel_operator.h" -namespace AscendC { +namespace MyCustomKernel { constexpr int32_t BUFFER_NUM = 2; constexpr uint32_t FLOAT_NUM_OF_SINGEL_BLOCK = 8; constexpr uint32_t BASIC_BLOCK_ROW_FACTOR = 8; @@ -55,10 +55,10 @@ public: } __aicore__ inline void Init(GM_ADDR x, GM_ADDR max, GM_ADDR sum, GM_ADDR z, const VecTiling& tiling) { - ASSERT(GetBlockNum() != 0 && "block dim can not be zero!"); + ASSERT(AscendC::GetBlockNum() != 0 && "block dim can not be zero!"); InitTiling(tiling); - if (GetBlockIdx() == this->usedBlockDim) { // tail core + if (AscendC::GetBlockIdx() == this->usedBlockDim) { // tail core this->singleLoopCoreRowNum = this->tailCoreSingleLoopCoreRowNum; this->singleCoreLoopCount = this->tailCoreSingleCoreLoopCount; this->leftRow = this->tailCoreSingleCoreLoopTail; @@ -67,8 +67,8 @@ public: this->blockLength = this->coreRowNum * this->columnLength; this->msLength = this->coreRowNum * FLOAT_NUM_OF_SINGEL_BLOCK; // max sum length per block process - uint32_t offset1 = this->blockLength * GetBlockIdx(); - uint32_t offset2 = this->msLength * GetBlockIdx(); + uint32_t offset1 = this->blockLength * AscendC::GetBlockIdx(); + uint32_t offset2 = this->msLength * AscendC::GetBlockIdx(); xGm.SetGlobalBuffer((__gm__ float*)x + offset1, this->blockLength); zGm.SetGlobalBuffer((__gm__ float*)z + offset1, this->blockLength); @@ -88,7 +88,7 @@ public: __aicore__ inline void Process() { - if (GetBlockIdx() > usedBlockDim) { + if (AscendC::GetBlockIdx() > usedBlockDim) { return; } @@ -108,12 +108,12 @@ public: private: __aicore__ inline void CopyIn(int32_t progress, uint32_t rowNum) { - LocalTensor xLocal = queueX.AllocTensor(); - LocalTensor maxLocal = queueMax.AllocTensor(); - LocalTensor sumLocal = queueSum.AllocTensor(); - DataCopy(xLocal, 
xGm[progress * this->tileLength], rowNum * this->columnLength); - DataCopy(maxLocal, maxGm[progress * this->msTileLength], rowNum * FLOAT_NUM_OF_SINGEL_BLOCK); - DataCopy(sumLocal, sumGm[progress * this->msTileLength], rowNum * FLOAT_NUM_OF_SINGEL_BLOCK); + AscendC::LocalTensor xLocal = queueX.AllocTensor(); + AscendC::LocalTensor maxLocal = queueMax.AllocTensor(); + AscendC::LocalTensor sumLocal = queueSum.AllocTensor(); + AscendC::DataCopy(xLocal, xGm[progress * this->tileLength], rowNum * this->columnLength); + AscendC::DataCopy(maxLocal, maxGm[progress * this->msTileLength], rowNum * FLOAT_NUM_OF_SINGEL_BLOCK); + AscendC::DataCopy(sumLocal, sumGm[progress * this->msTileLength], rowNum * FLOAT_NUM_OF_SINGEL_BLOCK); queueX.EnQue(xLocal); queueMax.EnQue(maxLocal); queueSum.EnQue(sumLocal); @@ -121,18 +121,18 @@ private: __aicore__ inline void Compute(int32_t progressm, uint32_t rowNum) { - LocalTensor xLocal = queueX.DeQue(); - LocalTensor maxLocal = queueMax.DeQue(); - LocalTensor sumLocal = queueSum.DeQue(); - LocalTensor tmpBuffer = sharedTmpBuffer.Get(); + AscendC::LocalTensor xLocal = queueX.DeQue(); + AscendC::LocalTensor maxLocal = queueMax.DeQue(); + AscendC::LocalTensor sumLocal = queueSum.DeQue(); + AscendC::LocalTensor tmpBuffer = sharedTmpBuffer.Get(); - SoftMaxShapeInfo srcShape = { rowNum, this->columnLength, rowNum, this->columnLength }; + AscendC::SoftMaxShapeInfo srcShape = { rowNum, this->columnLength, rowNum, this->columnLength }; if (rowNum % BASIC_BLOCK_ROW_FACTOR == 0 && this->columnLength % BASIC_BLOCK_COLUMN_FACTOR == 0 && this->columnLength < BASIC_BLOCK_MAX_COLUMN_LENGTH) { - SimpleSoftMax(xLocal, sumLocal, maxLocal, xLocal, tmpBuffer, softmaxTiling, srcShape); + AscendC::SimpleSoftMax(xLocal, sumLocal, maxLocal, xLocal, tmpBuffer, softmaxTiling, srcShape); } else { - SimpleSoftMax(xLocal, sumLocal, maxLocal, xLocal, tmpBuffer, softmaxTiling, srcShape); + AscendC::SimpleSoftMax(xLocal, sumLocal, maxLocal, xLocal, tmpBuffer, softmaxTiling, 
srcShape); } queueX.EnQue(xLocal); queueMax.FreeTensor(maxLocal); @@ -141,20 +141,20 @@ private: __aicore__ inline void CopyOut(int32_t progress, uint32_t rowNum) { - LocalTensor zLocal = queueX.DeQue(); - DataCopy(zGm[progress * this->tileLength], zLocal, rowNum * this->columnLength); + AscendC::LocalTensor zLocal = queueX.DeQue(); + AscendC::DataCopy(zGm[progress * this->tileLength], zLocal, rowNum * this->columnLength); queueX.FreeTensor(zLocal); } private: - TPipe pipe; - TBuf sharedTmpBuffer; - TQue queueX; - TQue queueMax, queueSum; - GlobalTensor xGm; - GlobalTensor maxGm; - GlobalTensor sumGm; - GlobalTensor zGm; + AscendC::TPipe pipe; + AscendC::TBuf sharedTmpBuffer; + AscendC::TQue queueX; + AscendC::TQue queueMax, queueSum; + AscendC::GlobalTensor xGm; + AscendC::GlobalTensor maxGm; + AscendC::GlobalTensor sumGm; + AscendC::GlobalTensor zGm; uint32_t blockLength = 0; uint32_t usedBlockDim = 0; diff --git a/examples/activation/simplesoftmax/kernel_launch_method_by_direct/simplesoftmax_custom.cpp b/examples/activation/simplesoftmax/kernel_launch_method_by_direct/simplesoftmax_custom.cpp index 1eb36654..636a18bf 100644 --- a/examples/activation/simplesoftmax/kernel_launch_method_by_direct/simplesoftmax_custom.cpp +++ b/examples/activation/simplesoftmax/kernel_launch_method_by_direct/simplesoftmax_custom.cpp @@ -9,14 +9,13 @@ */ #include "../kernel_impl/simplesoftmax_kernel.h" -using namespace AscendC; -__aicore__ inline void CopyTiling(VecTiling* tiling, GM_ADDR tilingGM) +__aicore__ inline void CopyTiling(MyCustomKernel::VecTiling* tiling, GM_ADDR tilingGM) { uint32_t* ptr = reinterpret_cast(tiling); auto tiling32 = reinterpret_cast<__gm__ uint32_t*>(tilingGM); - for (int i = 0; i < sizeof(VecTiling) / sizeof(uint32_t); i++, ptr++) { + for (int i = 0; i < sizeof(MyCustomKernel::VecTiling) / sizeof(uint32_t); i++, ptr++) { *ptr = *(tiling32 + i); } return; @@ -28,10 +27,10 @@ extern "C" __global__ __aicore__ void simplesoftmax_custom(GM_ADDR x, GM_ADDR ma 
if ASCEND_IS_AIC { return; } - VecTiling tilingData; + MyCustomKernel::VecTiling tilingData; CopyTiling(&tilingData, tiling); - KernelSoftmax op; - op.Init(x, max, sum, z, tilingData); + MyCustomKernel::KernelSoftmax op; + op.Init(x, max, sum, z, tilingData); op.Process(); } diff --git a/examples/activation/simplesoftmax/kernel_launch_method_by_framework/op_kernel/simplesoftmax_custom.cpp b/examples/activation/simplesoftmax/kernel_launch_method_by_framework/op_kernel/simplesoftmax_custom.cpp index 1bb113e3..91b8de26 100644 --- a/examples/activation/simplesoftmax/kernel_launch_method_by_framework/op_kernel/simplesoftmax_custom.cpp +++ b/examples/activation/simplesoftmax/kernel_launch_method_by_framework/op_kernel/simplesoftmax_custom.cpp @@ -8,7 +8,6 @@ * See LICENSE in the root of the software repository for the full text of the License. */ #include "../../../../../../kernel_impl/simplesoftmax_kernel.h" -using namespace AscendC; extern "C" __global__ __aicore__ void simplesoftmax_custom(GM_ADDR x, GM_ADDR max, GM_ADDR sum, GM_ADDR z, GM_ADDR workspace, GM_ADDR tiling) @@ -17,9 +16,9 @@ extern "C" __global__ __aicore__ void simplesoftmax_custom(GM_ADDR x, GM_ADDR ma return; } GET_TILING_DATA(tilingData, tiling); - VecTiling vecTiling = *reinterpret_cast(&tilingData); - KernelSoftmax op; - op.Init(x, max, sum, z, vecTiling); + MyCustomKernel::VecTiling vecTiling = *reinterpret_cast(&tilingData); + MyCustomKernel::KernelSoftmax op; + op.Init(x, max, sum, z, vecTiling); if (TILING_KEY_IS(1)) { op.Process(); } diff --git a/examples/activation/softmax/kernel_impl/softmax_kernel.h b/examples/activation/softmax/kernel_impl/softmax_kernel.h index 5e75aed3..bac55031 100644 --- a/examples/activation/softmax/kernel_impl/softmax_kernel.h +++ b/examples/activation/softmax/kernel_impl/softmax_kernel.h @@ -12,7 +12,7 @@ #define EXAMPLES_ACTIVATION_SOFTMAX_KERNEL_H #include "kernel_operator.h" -namespace AscendC { +namespace MyCustomKernel { constexpr int32_t BUFFER_NUM
= 2; constexpr uint32_t FLOAT_NUM_OF_SINGEL_BLOCK = 8; constexpr uint32_t BASIC_BLOCK_ROW_FACTOR = 8; @@ -55,10 +55,10 @@ public: } __aicore__ inline void Init(GM_ADDR x, GM_ADDR max, GM_ADDR sum, GM_ADDR z, const VecTiling& tilingData) { - ASSERT(GetBlockNum() != 0 && "block dim can not be zero!"); + ASSERT(AscendC::GetBlockNum() != 0 && "block dim can not be zero!"); InitTiling(tilingData); - if (GetBlockIdx() == this->usedBlockDim) { // tail core + if (AscendC::GetBlockIdx() == this->usedBlockDim) { // tail core this->singleLoopCoreRowNum = this->tailCoreSingleLoopCoreRowNum; this->singleCoreLoopCount = this->tailCoreSingleCoreLoopCount; this->leftRow = this->tailCoreSingleCoreLoopTail; @@ -67,8 +67,8 @@ public: this->blockLength = this->coreRowNum * this->columnLength; this->msLength = this->coreRowNum * FLOAT_NUM_OF_SINGEL_BLOCK; // max sum length per block process - uint32_t offset1 = this->blockLength * GetBlockIdx(); - uint32_t offset2 = this->msLength * GetBlockIdx(); + uint32_t offset1 = this->blockLength * AscendC::GetBlockIdx(); + uint32_t offset2 = this->msLength * AscendC::GetBlockIdx(); xGm.SetGlobalBuffer((__gm__ float*)x + offset1, this->blockLength); zGm.SetGlobalBuffer((__gm__ float*)z + offset1, this->blockLength); @@ -89,7 +89,7 @@ public: __aicore__ inline void Process() { - if (GetBlockIdx() > this->usedBlockDim) { + if (AscendC::GetBlockIdx() > this->usedBlockDim) { return; } @@ -108,25 +108,25 @@ public: private: __aicore__ inline void CopyIn(int32_t progress, uint32_t rowNum) { - LocalTensor xLocal = queueX.AllocTensor(); - DataCopy(xLocal, xGm[progress * this->tileLength], rowNum * this->columnLength); + AscendC::LocalTensor xLocal = queueX.AllocTensor(); + AscendC::DataCopy(xLocal, xGm[progress * this->tileLength], rowNum * this->columnLength); queueX.EnQue(xLocal); } __aicore__ inline void Compute(int32_t progressm, uint32_t rowNum) { - LocalTensor xLocal = queueX.DeQue(); - LocalTensor maxLocal = queueMax.AllocTensor(); - LocalTensor 
sumLocal = queueSum.AllocTensor(); - LocalTensor tmpBuffer = sharedTmpBuffer.Get(); + AscendC::LocalTensor xLocal = queueX.DeQue(); + AscendC::LocalTensor maxLocal = queueMax.AllocTensor(); + AscendC::LocalTensor sumLocal = queueSum.AllocTensor(); + AscendC::LocalTensor tmpBuffer = sharedTmpBuffer.Get(); - SoftMaxShapeInfo srcShape = { rowNum, this->columnLength, rowNum, this->columnLength }; + AscendC::SoftMaxShapeInfo srcShape = { rowNum, this->columnLength, rowNum, this->columnLength }; if (rowNum % BASIC_BLOCK_ROW_FACTOR == 0 && this->columnLength % BASIC_BLOCK_COLUMN_FACTOR == 0 && this->columnLength < BASIC_BLOCK_MAX_COLUMN_LENGTH) { - SoftMax(xLocal, sumLocal, maxLocal, xLocal, tmpBuffer, softmaxTiling, srcShape); + AscendC::SoftMax(xLocal, sumLocal, maxLocal, xLocal, tmpBuffer, softmaxTiling, srcShape); } else { - SoftMax(xLocal, sumLocal, maxLocal, xLocal, tmpBuffer, softmaxTiling, srcShape); + AscendC::SoftMax(xLocal, sumLocal, maxLocal, xLocal, tmpBuffer, softmaxTiling, srcShape); } queueX.EnQue(xLocal); queueMax.EnQue(maxLocal); @@ -135,13 +135,13 @@ private: __aicore__ inline void CopyOut(int32_t progress, uint32_t rowNum) { - LocalTensor zLocal = queueX.DeQue(); - LocalTensor maxLocal = queueMax.DeQue(); - LocalTensor sumLocal = queueSum.DeQue(); + AscendC::LocalTensor zLocal = queueX.DeQue(); + AscendC::LocalTensor maxLocal = queueMax.DeQue(); + AscendC::LocalTensor sumLocal = queueSum.DeQue(); - DataCopy(zGm[progress * this->tileLength], zLocal, rowNum * this->columnLength); - DataCopy(maxGm[progress * this->msTileLength], maxLocal, rowNum * FLOAT_NUM_OF_SINGEL_BLOCK); - DataCopy(sumGm[progress * this->msTileLength], sumLocal, rowNum * FLOAT_NUM_OF_SINGEL_BLOCK); + AscendC::DataCopy(zGm[progress * this->tileLength], zLocal, rowNum * this->columnLength); + AscendC::DataCopy(maxGm[progress * this->msTileLength], maxLocal, rowNum * FLOAT_NUM_OF_SINGEL_BLOCK); + AscendC::DataCopy(sumGm[progress * this->msTileLength], sumLocal, rowNum * 
FLOAT_NUM_OF_SINGEL_BLOCK); queueX.FreeTensor(zLocal); queueMax.FreeTensor(maxLocal); @@ -149,14 +149,14 @@ private: } private: - TPipe pipe; - TBuf sharedTmpBuffer; - TQue queueX; - TQue queueMax, queueSum; - GlobalTensor xGm; - GlobalTensor maxGm; - GlobalTensor sumGm; - GlobalTensor zGm; + AscendC::TPipe pipe; + AscendC::TBuf sharedTmpBuffer; + AscendC::TQue queueX; + AscendC::TQue queueMax, queueSum; + AscendC::GlobalTensor xGm; + AscendC::GlobalTensor maxGm; + AscendC::GlobalTensor sumGm; + AscendC::GlobalTensor zGm; uint32_t blockLength = 0; uint32_t usedBlockDim = 0; diff --git a/examples/activation/softmax/kernel_launch_method_by_direct/softmax_custom.cpp b/examples/activation/softmax/kernel_launch_method_by_direct/softmax_custom.cpp index 51848ac2..bf6abed2 100644 --- a/examples/activation/softmax/kernel_launch_method_by_direct/softmax_custom.cpp +++ b/examples/activation/softmax/kernel_launch_method_by_direct/softmax_custom.cpp @@ -9,14 +9,13 @@ */ #include "../kernel_impl/softmax_kernel.h" -using namespace AscendC; -__aicore__ inline void CopyTiling(VecTiling* tiling, GM_ADDR tilingGM) +__aicore__ inline void CopyTiling(MyCustomKernel::VecTiling* tiling, GM_ADDR tilingGM) { uint32_t* ptr = reinterpret_cast(tiling); auto tiling32 = reinterpret_cast<__gm__ uint32_t*>(tilingGM); - for (int i = 0; i < sizeof(VecTiling) / sizeof(uint32_t); i++, ptr++) { + for (int i = 0; i < sizeof(MyCustomKernel::VecTiling) / sizeof(uint32_t); i++, ptr++) { *ptr = *(tiling32 + i); } return; @@ -28,10 +27,10 @@ extern "C" __global__ __aicore__ void softmax_custom(GM_ADDR x, GM_ADDR max, GM_ if ASCEND_IS_AIC { return; } - VecTiling tilingData; + MyCustomKernel::VecTiling tilingData; CopyTiling(&tilingData, tiling); - KernelSoftmax op; - op.Init(x, max, sum, z, tilingData); + MyCustomKernel::KernelSoftmax op; + op.Init(x, max, sum, z, tilingData); op.Process(); } diff --git a/examples/activation/softmax/kernel_launch_method_by_framework/op_kernel/softmax_custom.cpp
b/examples/activation/softmax/kernel_launch_method_by_framework/op_kernel/softmax_custom.cpp index e263b3d2..95854592 100644 --- a/examples/activation/softmax/kernel_launch_method_by_framework/op_kernel/softmax_custom.cpp +++ b/examples/activation/softmax/kernel_launch_method_by_framework/op_kernel/softmax_custom.cpp @@ -9,7 +9,6 @@ */ #include "../../../../../../kernel_impl/softmax_kernel.h" -using namespace AscendC; extern "C" __global__ __aicore__ void softmax_custom(GM_ADDR x, GM_ADDR max, GM_ADDR sum, GM_ADDR z, GM_ADDR workspace, GM_ADDR tiling) @@ -18,9 +17,9 @@ extern "C" __global__ __aicore__ void softmax_custom(GM_ADDR x, GM_ADDR max, GM_ return; } GET_TILING_DATA(tilingData, tiling); - VecTiling vecTiling = *reinterpret_cast(&tilingData); - KernelSoftmax op; - op.Init(x, max, sum, z, vecTiling); + MyCustomKernel::VecTiling vecTiling = *reinterpret_cast(&tilingData); + MyCustomKernel::KernelSoftmax op; + op.Init(x, max, sum, z, vecTiling); if (TILING_KEY_IS(1)) { op.Process(); } diff --git a/examples/activation/softmaxflash/kernel_impl/softmaxflash_kernel.h b/examples/activation/softmaxflash/kernel_impl/softmaxflash_kernel.h index 391e4fb3..0e4b3877 100644 --- a/examples/activation/softmaxflash/kernel_impl/softmaxflash_kernel.h +++ b/examples/activation/softmaxflash/kernel_impl/softmaxflash_kernel.h @@ -12,7 +12,7 @@ #define EXAMPLES_ACTIVATION_SOFTMAXFLASH_KERNEL_H #include "kernel_operator.h" -namespace AscendC { +namespace MyCustomKernel { constexpr int32_t BUFFER_NUM = 2; constexpr uint32_t FLOAT_NUM_OF_SINGEL_BLOCK = 8; constexpr uint32_t BASIC_BLOCK_ROW_FACTOR = 8; @@ -62,10 +62,10 @@ public: __aicore__ inline void Init(GM_ADDR x, GM_ADDR max, GM_ADDR sum, const VecTiling& tilingData) { - ASSERT(GetBlockNum() != 0 && "block dim can not be zero!"); + ASSERT(AscendC::GetBlockNum() != 0 && "block dim can not be zero!"); InitTiling(tilingData); - if (GetBlockIdx() == this->usedBlockDim) { // tail core + if (AscendC::GetBlockIdx() ==
this->usedBlockDim) { // tail core this->singleLoopCoreRowNum = this->tailCoreSingleLoopCoreRowNum; this->singleCoreLoopCount = this->tailCoreSingleCoreLoopCount; this->leftRow = this->tailCoreSingleCoreLoopTail; @@ -74,8 +74,8 @@ public: this->blockLength = this->coreRowNum * this->columnLength; this->msLength = this->coreRowNum * FLOAT_NUM_OF_SINGEL_BLOCK; // max sum length per block process - uint32_t offset1 = this->blockLength * GetBlockIdx(); - uint32_t offset2 = this->msLength * GetBlockIdx(); + uint32_t offset1 = this->blockLength * AscendC::GetBlockIdx(); + uint32_t offset2 = this->msLength * AscendC::GetBlockIdx(); xGm.SetGlobalBuffer((__gm__ float*)x + offset1, this->blockLength); maxGm.SetGlobalBuffer((__gm__ float*)max + offset2, this->msLength); @@ -97,7 +97,7 @@ public: __aicore__ inline void Process() { - if (GetBlockIdx() > this->usedBlockDim) { + if (AscendC::GetBlockIdx() > this->usedBlockDim) { return; } @@ -111,15 +111,15 @@ public: CopyIn(i, this->loopK, this->singleLoopCoreRowNum, this->tailK); Compute(i, this->loopK, this->singleLoopCoreRowNum, this->tailK); } - event_t eventIdVToMte3 = static_cast(GetTPipePtr()->FetchEventID(HardEvent::V_MTE3)); - SetFlag(eventIdVToMte3); - WaitFlag(eventIdVToMte3); + event_t eventIdVToMte3 = static_cast(GetTPipePtr()->FetchEventID(AscendC::HardEvent::V_MTE3)); + AscendC::SetFlag(eventIdVToMte3); + AscendC::WaitFlag(eventIdVToMte3); // copy max sum to gm CopyOut(i, this->msTileLength); } - event_t eventIdMte3ToV = static_cast(GetTPipePtr()->FetchEventID(HardEvent::MTE3_V)); - SetFlag(eventIdMte3ToV); - WaitFlag(eventIdMte3ToV); + event_t eventIdMte3ToV = static_cast(GetTPipePtr()->FetchEventID(AscendC::HardEvent::MTE3_V)); + AscendC::SetFlag(eventIdMte3ToV); + AscendC::WaitFlag(eventIdMte3ToV); if (this->leftRow > 0) { for (int32_t j = 0; j < this->loopK; j++) { // split K @@ -131,9 +131,9 @@ public: CopyIn(this->singleCoreLoopCount, this->loopK, this->leftRow, this->tailK); 
Compute(this->singleCoreLoopCount, this->loopK, this->leftRow, this->tailK); } - event_t eventIdVToMte3 = static_cast(GetTPipePtr()->FetchEventID(HardEvent::V_MTE3)); - SetFlag(eventIdVToMte3); - WaitFlag(eventIdVToMte3); + event_t eventIdVToMte3 = static_cast(GetTPipePtr()->FetchEventID(AscendC::HardEvent::V_MTE3)); + AscendC::SetFlag(eventIdVToMte3); + AscendC::WaitFlag(eventIdVToMte3); // copy max sum to gm uint32_t tailMsTileLength = this->leftRow * FLOAT_NUM_OF_SINGEL_BLOCK; CopyOut(this->singleCoreLoopCount, tailMsTileLength); @@ -143,10 +143,10 @@ public: private: __aicore__ inline void CopyIn(uint32_t rowIndex, uint32_t kIndex, uint32_t rowNum, uint32_t columnNum) { - LocalTensor xLocal = queueX.AllocTensor(); + AscendC::LocalTensor xLocal = queueX.AllocTensor(); uint32_t offset = this->singleLoopCoreRowNum * this->columnLength; for (uint32_t i = 0; i < rowNum; i++) { - DataCopy(xLocal[i * columnNum], xGm[rowIndex * offset + i * this->columnLength + kIndex * this->splitK], + AscendC::DataCopy(xLocal[i * columnNum], xGm[rowIndex * offset + i * this->columnLength + kIndex * this->splitK], columnNum); } queueX.EnQue(xLocal); @@ -154,28 +154,28 @@ private: __aicore__ inline void Compute(uint32_t rowIndex, uint32_t columnIndex, uint32_t rowNum, uint32_t columnNum) { - LocalTensor xLocal = queueX.DeQue(); - LocalTensor tmpBuffer = sharedTmpBuffer.Get(); + AscendC::LocalTensor xLocal = queueX.DeQue(); + AscendC::LocalTensor tmpBuffer = sharedTmpBuffer.Get(); - SoftMaxShapeInfo srcShape = { rowNum, columnNum, rowNum, columnNum }; + AscendC::SoftMaxShapeInfo srcShape = { rowNum, columnNum, rowNum, columnNum }; if (columnIndex == 0) { // isUpdate == false if (rowNum % BASIC_BLOCK_ROW_FACTOR == 0 && columnNum % BASIC_BLOCK_COLUMN_FACTOR == 0 && columnNum < BASIC_BLOCK_MAX_COLUMN_LENGTH) { - SoftmaxFlashV2(xLocal, sumLocal, maxLocal, xLocal, expmaxLocal, sumLocal, + AscendC::SoftmaxFlashV2(xLocal, sumLocal, maxLocal, xLocal, expmaxLocal, sumLocal, maxLocal, tmpBuffer, 
softmaxTiling, srcShape); } else { - SoftmaxFlashV2(xLocal, sumLocal, maxLocal, xLocal, expmaxLocal, sumLocal, + AscendC::SoftmaxFlashV2(xLocal, sumLocal, maxLocal, xLocal, expmaxLocal, sumLocal, maxLocal, tmpBuffer, softmaxTiling, srcShape); } } else { if (rowNum % BASIC_BLOCK_ROW_FACTOR == 0 && columnNum % BASIC_BLOCK_COLUMN_FACTOR == 0 && columnNum < BASIC_BLOCK_MAX_COLUMN_LENGTH) { - SoftmaxFlashV2(xLocal, sumLocal, maxLocal, xLocal, expmaxLocal, sumLocal, + AscendC::SoftmaxFlashV2(xLocal, sumLocal, maxLocal, xLocal, expmaxLocal, sumLocal, maxLocal, tmpBuffer, softmaxTiling, srcShape); } else { - SoftmaxFlashV2(xLocal, sumLocal, maxLocal, xLocal, expmaxLocal, sumLocal, + AscendC::SoftmaxFlashV2(xLocal, sumLocal, maxLocal, xLocal, expmaxLocal, sumLocal, maxLocal, tmpBuffer, softmaxTiling, srcShape); } } @@ -184,21 +184,21 @@ private: __aicore__ inline void CopyOut(uint32_t progress, uint32_t count) { - DataCopy(maxGm[progress * this->msTileLength], maxLocal, count); - DataCopy(sumGm[progress * this->msTileLength], sumLocal, count); + AscendC::DataCopy(maxGm[progress * this->msTileLength], maxLocal, count); + AscendC::DataCopy(sumGm[progress * this->msTileLength], sumLocal, count); } private: - TPipe pipe; - TBuf sharedTmpBuffer; - TQue queueX; - TBuf tbufMax, tbufSum, tbufExpmax; - GlobalTensor xGm; - GlobalTensor maxGm; - GlobalTensor sumGm; - LocalTensor maxLocal; - LocalTensor sumLocal; - LocalTensor expmaxLocal; + AscendC::TPipe pipe; + AscendC::TBuf sharedTmpBuffer; + AscendC::TQue queueX; + AscendC::TBuf tbufMax, tbufSum, tbufExpmax; + AscendC::GlobalTensor xGm; + AscendC::GlobalTensor maxGm; + AscendC::GlobalTensor sumGm; + AscendC::LocalTensor maxLocal; + AscendC::LocalTensor sumLocal; + AscendC::LocalTensor expmaxLocal; uint32_t blockLength = 0; uint32_t usedBlockDim = 0; diff --git a/examples/activation/softmaxflash/kernel_launch_method_by_direct/softmaxflash_custom.cpp
b/examples/activation/softmaxflash/kernel_launch_method_by_direct/softmaxflash_custom.cpp index cfcfa32d..e3ee128c 100644 --- a/examples/activation/softmaxflash/kernel_launch_method_by_direct/softmaxflash_custom.cpp +++ b/examples/activation/softmaxflash/kernel_launch_method_by_direct/softmaxflash_custom.cpp @@ -9,14 +9,13 @@ */ #include "../kernel_impl/softmaxflash_kernel.h" -using namespace AscendC; -__aicore__ inline void CopyTiling(VecTiling* tiling, GM_ADDR tilingGM) +__aicore__ inline void CopyTiling(MyCustomKernel::VecTiling* tiling, GM_ADDR tilingGM) { uint32_t* ptr = reinterpret_cast(tiling); auto tiling32 = reinterpret_cast<__gm__ uint32_t*>(tilingGM); - for (int i = 0; i < sizeof(VecTiling) / sizeof(uint32_t); i++, ptr++) { + for (int i = 0; i < sizeof(MyCustomKernel::VecTiling) / sizeof(uint32_t); i++, ptr++) { *ptr = *(tiling32 + i); } return; @@ -28,10 +27,10 @@ extern "C" __global__ __aicore__ void softmaxflash_custom(GM_ADDR x, GM_ADDR max if ASCEND_IS_AIC { return; } - VecTiling tilingData; + MyCustomKernel::VecTiling tilingData; CopyTiling(&tilingData, tiling); - KernelSoftmax op; - op.Init(x, max, sum, tilingData); + MyCustomKernel::KernelSoftmax op; + op.Init(x, max, sum, tilingData); op.Process(); } diff --git a/examples/activation/softmaxflash/kernel_launch_method_by_framework/op_kernel/softmaxflash_custom.cpp b/examples/activation/softmaxflash/kernel_launch_method_by_framework/op_kernel/softmaxflash_custom.cpp index c4d69de5..019c856a 100644 --- a/examples/activation/softmaxflash/kernel_launch_method_by_framework/op_kernel/softmaxflash_custom.cpp +++ b/examples/activation/softmaxflash/kernel_launch_method_by_framework/op_kernel/softmaxflash_custom.cpp @@ -9,7 +9,6 @@ */ #include "../../../../../../kernel_impl/softmaxflash_kernel.h" -using namespace AscendC; extern "C" __global__ __aicore__ void softmaxflash_custom(GM_ADDR x, GM_ADDR max, GM_ADDR sum, GM_ADDR workspace, GM_ADDR tiling) @@ -18,9 +17,9 @@ extern "C" __global__
__aicore__ void softmaxflash_custom(GM_ADDR x, GM_ADDR max return; } GET_TILING_DATA(tilingData, tiling); - VecTiling vecTiling = *reinterpret_cast(&tilingData); - KernelSoftmax op; - op.Init(x, max, sum, vecTiling); + MyCustomKernel::VecTiling vecTiling = *reinterpret_cast(&tilingData); + MyCustomKernel::KernelSoftmax op; + op.Init(x, max, sum, vecTiling); if (TILING_KEY_IS(1)) { op.Process(); } diff --git a/examples/activation/softmaxgrad/kernel_impl/softmaxgrad_kernel.h b/examples/activation/softmaxgrad/kernel_impl/softmaxgrad_kernel.h index 4f5c10a4..d8f1c25f 100644 --- a/examples/activation/softmaxgrad/kernel_impl/softmaxgrad_kernel.h +++ b/examples/activation/softmaxgrad/kernel_impl/softmaxgrad_kernel.h @@ -12,7 +12,7 @@ #define EXAMPLES_ACTIVATION_SOFTMAXGRAD_KERNEL_H #include "kernel_operator.h" -namespace AscendC { +namespace MyCustomKernel { constexpr int32_t BUFFER_NUM = 1; struct VecTiling { @@ -51,17 +51,17 @@ public: } __aicore__ inline void Init(GM_ADDR x, GM_ADDR y, GM_ADDR z, const VecTiling& tiling) { - ASSERT(GetBlockNum() != 0 && "block dim can not be zero!"); + ASSERT(AscendC::GetBlockNum() != 0 && "block dim can not be zero!"); InitTiling(tiling); - if (GetBlockIdx() == this->usedBlockDim) { // tail core + if (AscendC::GetBlockIdx() == this->usedBlockDim) { // tail core this->singleLoopCoreRowNum = this->tailCoreSingleLoopCoreRowNum; this->singleCoreLoopCount = this->tailCoreSingleCoreLoopCount; this->leftRow = this->tailCoreSingleCoreLoopTail; } this->blockLength = this->coreRowNum * this->columnLength; - uint32_t offset1 = this->blockLength * GetBlockIdx(); + uint32_t offset1 = this->blockLength * AscendC::GetBlockIdx(); xGm.SetGlobalBuffer((__gm__ float*)x + offset1, this->blockLength); yGm.SetGlobalBuffer((__gm__ float*)y + offset1, this->blockLength); @@ -76,7 +76,7 @@ public: __aicore__ inline void Process() { - if (GetBlockIdx() > usedBlockDim) { + if (AscendC::GetBlockIdx() > usedBlockDim) { return; } @@ -85,9 +85,9 @@
Compute(i, this->singleLoopCoreRowNum); CopyOut(i, this->singleLoopCoreRowNum); } - event_t eventIdMte3ToMte2 = static_cast(GetTPipePtr()->FetchEventID(HardEvent::MTE3_MTE2)); - SetFlag(eventIdMte3ToMte2); - WaitFlag(eventIdMte3ToMte2); + event_t eventIdMte3ToMte2 = static_cast(GetTPipePtr()->FetchEventID(AscendC::HardEvent::MTE3_MTE2)); + AscendC::SetFlag(eventIdMte3ToMte2); + AscendC::WaitFlag(eventIdMte3ToMte2); if (this->leftRow > 0) { CopyIn(this->singleCoreLoopCount, this->leftRow); Compute(this->singleCoreLoopCount, this->leftRow); @@ -98,22 +98,22 @@ public: private: __aicore__ inline void CopyIn(int32_t progress, uint32_t rowNum) { - LocalTensor xLocal = queueX.AllocTensor(); - LocalTensor yLocal = queueY.AllocTensor(); - DataCopy(xLocal, xGm[progress * this->tileLength], rowNum * this->columnLength); - DataCopy(yLocal, yGm[progress * this->tileLength], rowNum * this->columnLength); + AscendC::LocalTensor xLocal = queueX.AllocTensor(); + AscendC::LocalTensor yLocal = queueY.AllocTensor(); + AscendC::DataCopy(xLocal, xGm[progress * this->tileLength], rowNum * this->columnLength); + AscendC::DataCopy(yLocal, yGm[progress * this->tileLength], rowNum * this->columnLength); queueX.EnQue(xLocal); queueY.EnQue(yLocal); } __aicore__ inline void Compute(int32_t progressm, uint32_t rowNum) { - LocalTensor xLocal = queueX.DeQue(); - LocalTensor yLocal = queueY.DeQue(); - LocalTensor tmpBuffer = sharedTmpBuffer.Get(); + AscendC::LocalTensor xLocal = queueX.DeQue(); + AscendC::LocalTensor yLocal = queueY.DeQue(); + AscendC::LocalTensor tmpBuffer = sharedTmpBuffer.Get(); - SoftMaxShapeInfo srcShape = { rowNum, this->columnLength, rowNum, this->columnLength }; - SoftmaxGrad(yLocal, xLocal, yLocal, tmpBuffer, softmaxTiling, false, srcShape); + AscendC::SoftMaxShapeInfo srcShape = { rowNum, this->columnLength, rowNum, this->columnLength }; + AscendC::SoftmaxGrad(yLocal, xLocal, yLocal, tmpBuffer, softmaxTiling, false, srcShape); queueY.EnQue(yLocal); 
queueX.FreeTensor(xLocal); @@ -121,19 +121,19 @@ private: __aicore__ inline void CopyOut(int32_t progress, uint32_t rowNum) { - LocalTensor zLocal = queueY.DeQue(); - DataCopy(zGm[progress * this->tileLength], zLocal, rowNum * this->columnLength); + AscendC::LocalTensor zLocal = queueY.DeQue(); + AscendC::DataCopy(zGm[progress * this->tileLength], zLocal, rowNum * this->columnLength); queueY.FreeTensor(zLocal); } private: - TPipe pipe; - TBuf sharedTmpBuffer; - TQue queueX; - TQue queueY; - GlobalTensor xGm; - GlobalTensor yGm; - GlobalTensor zGm; + AscendC::TPipe pipe; + AscendC::TBuf sharedTmpBuffer; + AscendC::TQue queueX; + AscendC::TQue queueY; + AscendC::GlobalTensor xGm; + AscendC::GlobalTensor yGm; + AscendC::GlobalTensor zGm; uint32_t blockLength = 0; uint32_t usedBlockDim = 0; diff --git a/examples/activation/softmaxgrad/kernel_launch_method_by_direct/softmaxgrad_custom.cpp b/examples/activation/softmaxgrad/kernel_launch_method_by_direct/softmaxgrad_custom.cpp index 2ab23bb2..febf4b5e 100644 --- a/examples/activation/softmaxgrad/kernel_launch_method_by_direct/softmaxgrad_custom.cpp +++ b/examples/activation/softmaxgrad/kernel_launch_method_by_direct/softmaxgrad_custom.cpp @@ -9,14 +9,13 @@ */ #include "../kernel_impl/softmaxgrad_kernel.h" -using namespace AscendC; -__aicore__ inline void CopyTiling(VecTiling* tiling, GM_ADDR tilingGM) +__aicore__ inline void CopyTiling(MyCustomKernel::VecTiling* tiling, GM_ADDR tilingGM) { uint32_t* ptr = reinterpret_cast(tiling); auto tiling32 = reinterpret_cast<__gm__ uint32_t*>(tilingGM); - for (int i = 0; i < sizeof(VecTiling) / sizeof(uint32_t); i++, ptr++) { + for (int i = 0; i < sizeof(MyCustomKernel::VecTiling) / sizeof(uint32_t); i++, ptr++) { *ptr = *(tiling32 + i); } return; @@ -28,9 +27,9 @@ extern "C" __global__ __aicore__ void softmaxgrad_custom(GM_ADDR x, GM_ADDR y, G if ASCEND_IS_AIC { return; } - VecTiling tilingData; + MyCustomKernel::VecTiling tilingData; CopyTiling(&tilingData, tiling); - KernelSoftmax 
op; + MyCustomKernel::KernelSoftmax op; op.Init(x, y, z, tilingData); op.Process(); } diff --git a/examples/activation/softmaxgrad/kernel_launch_method_by_framework/op_kernel/softmaxgrad_custom.cpp b/examples/activation/softmaxgrad/kernel_launch_method_by_framework/op_kernel/softmaxgrad_custom.cpp index ef21e9b7..d466c81b 100644 --- a/examples/activation/softmaxgrad/kernel_launch_method_by_framework/op_kernel/softmaxgrad_custom.cpp +++ b/examples/activation/softmaxgrad/kernel_launch_method_by_framework/op_kernel/softmaxgrad_custom.cpp @@ -9,7 +9,6 @@ */ #include "../../../../../../kernel_impl/softmaxgrad_kernel.h" -using namespace AscendC; extern "C" __global__ __aicore__ void softmaxgrad_custom(GM_ADDR x, GM_ADDR y, GM_ADDR z, GM_ADDR workspace, GM_ADDR tiling) @@ -18,9 +17,9 @@ extern "C" __global__ __aicore__ void softmaxgrad_custom(GM_ADDR x, GM_ADDR y, G return; } GET_TILING_DATA(tilingData, tiling); - VecTiling vecTiling = *reinterpret_cast(&tilingData); - KernelSoftmax op; - op.Init(x, y, z, vecTiling); + MyCustomKernel::VecTiling vecTiling = *reinterpret_cast(&tilingData); + MyCustomKernel::KernelSoftmax op; + op.Init(x, y, z, vecTiling); if (TILING_KEY_IS(1)) { op.Process(); } diff --git a/examples/activation/softmaxgradfront/kernel_impl/softmaxgradfront_kernel.h b/examples/activation/softmaxgradfront/kernel_impl/softmaxgradfront_kernel.h index 94a0659a..8a3fb2f2 100644 --- a/examples/activation/softmaxgradfront/kernel_impl/softmaxgradfront_kernel.h +++ b/examples/activation/softmaxgradfront/kernel_impl/softmaxgradfront_kernel.h @@ -11,7 +11,7 @@ #ifndef EXAMPLES_ACTIVATION_SOFTMAXGRADFRONT_KERNEL_H #define EXAMPLES_ACTIVATION_SOFTMAXGRADFRONT_KERNEL_H #include "kernel_operator.h" -namespace AscendC { +namespace MyCustomKernel { constexpr int32_t BUFFER_NUM = 1; constexpr uint32_t FLOAT_NUM_OF_SINGEL_BLOCK = 8; constexpr uint32_t BASIC_BLOCK_ROW_FACTOR = 8; @@ -53,18 +53,18 @@ public: } __aicore__ inline void Init(GM_ADDR x, GM_ADDR y, GM_ADDR z,
const VecTiling& tiling) { - ASSERT(GetBlockNum() != 0 && "block dim can not be zero!"); + ASSERT(AscendC::GetBlockNum() != 0 && "block dim can not be zero!"); InitTiling(tiling); - if (GetBlockIdx() == this->usedBlockDim) { // tail core + if (AscendC::GetBlockIdx() == this->usedBlockDim) { // tail core this->singleLoopCoreRowNum = this->tailCoreSingleLoopCoreRowNum; this->singleCoreLoopCount = this->tailCoreSingleCoreLoopCount; this->leftRow = this->tailCoreSingleCoreLoopTail; } this->blockLength = this->coreRowNum * this->columnLength; - uint32_t offset1 = this->blockLength * GetBlockIdx(); - uint32_t offset2 = this->coreRowNum * FLOAT_NUM_OF_SINGEL_BLOCK * GetBlockIdx(); + uint32_t offset1 = this->blockLength * AscendC::GetBlockIdx(); + uint32_t offset2 = this->coreRowNum * FLOAT_NUM_OF_SINGEL_BLOCK * AscendC::GetBlockIdx(); xGm.SetGlobalBuffer((__gm__ float*)x + offset1, this->blockLength); yGm.SetGlobalBuffer((__gm__ float*)y + offset1, this->blockLength); @@ -80,7 +80,7 @@ public: __aicore__ inline void Process() { - if (GetBlockIdx() > usedBlockDim) { + if (AscendC::GetBlockIdx() > usedBlockDim) { return; } @@ -100,28 +100,28 @@ public: private: __aicore__ inline void CopyIn(int32_t progress, uint32_t rowNum) { - LocalTensor xLocal = queueX.AllocTensor(); - LocalTensor yLocal = queueY.AllocTensor(); - DataCopy(xLocal, xGm[progress * this->tileLength], rowNum * this->columnLength); - DataCopy(yLocal, yGm[progress * this->tileLength], rowNum * this->columnLength); + AscendC::LocalTensor xLocal = queueX.AllocTensor(); + AscendC::LocalTensor yLocal = queueY.AllocTensor(); + AscendC::DataCopy(xLocal, xGm[progress * this->tileLength], rowNum * this->columnLength); + AscendC::DataCopy(yLocal, yGm[progress * this->tileLength], rowNum * this->columnLength); queueX.EnQue(xLocal); queueY.EnQue(yLocal); } __aicore__ inline void Compute(int32_t progressm, uint32_t rowNum) { - LocalTensor xLocal = queueX.DeQue(); - LocalTensor yLocal = queueY.DeQue(); - LocalTensor zLocal 
= queueZ.AllocTensor(); - LocalTensor tmpBuffer = sharedTmpBuffer.Get(); + AscendC::LocalTensor xLocal = queueX.DeQue(); + AscendC::LocalTensor yLocal = queueY.DeQue(); + AscendC::LocalTensor zLocal = queueZ.AllocTensor(); + AscendC::LocalTensor tmpBuffer = sharedTmpBuffer.Get(); - SoftMaxShapeInfo srcShape = { rowNum, this->columnLength, rowNum, this->columnLength }; + AscendC::SoftMaxShapeInfo srcShape = { rowNum, this->columnLength, rowNum, this->columnLength }; if (this->singleLoopCoreRowNum % BASIC_BLOCK_ROW_FACTOR == 0 && this->columnLength % BASIC_BLOCK_COLUMN_FACTOR == 0 && this->columnLength < BASIC_BLOCK_MAX_COLUMN_LENGTH) { - SoftmaxGradFront(zLocal, xLocal, yLocal, tmpBuffer, softmaxTiling, srcShape); + AscendC::SoftmaxGradFront(zLocal, xLocal, yLocal, tmpBuffer, softmaxTiling, srcShape); } else { - SoftmaxGradFront(zLocal, xLocal, yLocal, tmpBuffer, softmaxTiling, srcShape); + AscendC::SoftmaxGradFront(zLocal, xLocal, yLocal, tmpBuffer, softmaxTiling, srcShape); } queueZ.EnQue(zLocal); @@ -131,21 +131,21 @@ private: __aicore__ inline void CopyOut(int32_t progress, uint32_t rowNum) { - LocalTensor zLocal = queueZ.DeQue(); - DataCopy(zGm[progress * this->singleLoopCoreRowNum * FLOAT_NUM_OF_SINGEL_BLOCK], zLocal, + AscendC::LocalTensor zLocal = queueZ.DeQue(); + AscendC::DataCopy(zGm[progress * this->singleLoopCoreRowNum * FLOAT_NUM_OF_SINGEL_BLOCK], zLocal, rowNum * FLOAT_NUM_OF_SINGEL_BLOCK); queueZ.FreeTensor(zLocal); } private: - TPipe pipe; - TBuf sharedTmpBuffer; - TQue queueX; - TQue queueY; - TQue queueZ; - GlobalTensor xGm; - GlobalTensor yGm; - GlobalTensor zGm; + AscendC::TPipe pipe; + AscendC::TBuf sharedTmpBuffer; + AscendC::TQue queueX; + AscendC::TQue queueY; + AscendC::TQue queueZ; + AscendC::GlobalTensor xGm; + AscendC::GlobalTensor yGm; + AscendC::GlobalTensor zGm; uint32_t blockLength = 0; uint32_t usedBlockDim = 0; diff --git a/examples/activation/softmaxgradfront/kernel_launch_method_by_direct/softmaxgradfront_custom.cpp 
b/examples/activation/softmaxgradfront/kernel_launch_method_by_direct/softmaxgradfront_custom.cpp index c4a19c76..18785725 100644 --- a/examples/activation/softmaxgradfront/kernel_launch_method_by_direct/softmaxgradfront_custom.cpp +++ b/examples/activation/softmaxgradfront/kernel_launch_method_by_direct/softmaxgradfront_custom.cpp @@ -9,14 +9,13 @@ */ #include "../kernel_impl/softmaxgradfront_kernel.h" -using namespace AscendC; -__aicore__ inline void CopyTiling(VecTiling* tiling, GM_ADDR tilingGM) +__aicore__ inline void CopyTiling(MyCustomKernel::VecTiling* tiling, GM_ADDR tilingGM) { uint32_t* ptr = reinterpret_cast(tiling); auto tiling32 = reinterpret_cast<__gm__ uint32_t*>(tilingGM); - for (int i = 0; i < sizeof(VecTiling) / sizeof(uint32_t); i++, ptr++) { + for (int i = 0; i < sizeof(MyCustomKernel::VecTiling) / sizeof(uint32_t); i++, ptr++) { *ptr = *(tiling32 + i); } return; @@ -28,10 +27,10 @@ extern "C" __global__ __aicore__ void softmaxgradfront_custom(GM_ADDR x, GM_ADDR if ASCEND_IS_AIC { return; } - VecTiling tilingData; + MyCustomKernel::VecTiling tilingData; CopyTiling(&tilingData, tiling); - KernelSoftmax op; - op.Init(x, y, z, tilingData); + MyCustomKernel::KernelSoftmax op; + op.Init(x, y, z, tilingData); op.Process(); } diff --git a/examples/activation/softmaxgradfront/kernel_launch_method_by_framework/op_kernel/softmaxgradfront_custom.cpp b/examples/activation/softmaxgradfront/kernel_launch_method_by_framework/op_kernel/softmaxgradfront_custom.cpp index ad346e4b..3557e4c2 100644 --- a/examples/activation/softmaxgradfront/kernel_launch_method_by_framework/op_kernel/softmaxgradfront_custom.cpp +++ b/examples/activation/softmaxgradfront/kernel_launch_method_by_framework/op_kernel/softmaxgradfront_custom.cpp @@ -8,7 +8,6 @@ * See LICENSE in the root of the software repository for the full text of the License.
*/ #include "../../../../../../kernel_impl/softmaxgradfront_kernel.h" -using namespace AscendC; extern "C" __global__ __aicore__ void softmaxgradfront_custom(GM_ADDR x, GM_ADDR y, GM_ADDR z, GM_ADDR workspace, GM_ADDR tiling) @@ -17,9 +16,9 @@ extern "C" __global__ __aicore__ void softmaxgradfront_custom(GM_ADDR x, GM_ADDR if ASCEND_IS_AIC { return; } GET_TILING_DATA(tilingData, tiling); - VecTiling vecTiling = *reinterpret_cast(&tilingData); - KernelSoftmax op; - op.Init(x, y, z, vecTiling); + MyCustomKernel::VecTiling vecTiling = *reinterpret_cast(&tilingData); + MyCustomKernel::KernelSoftmax op; + op.Init(x, y, z, vecTiling); if (TILING_KEY_IS(1)) { op.Process(); } diff --git a/examples/matrix/batch_matmul/kernel_impl/batch_matmul_custom_impl.h b/examples/matrix/batch_matmul/kernel_impl/batch_matmul_custom_impl.h index be0b4aaa..eab9dfd0 100644 --- a/examples/matrix/batch_matmul/kernel_impl/batch_matmul_custom_impl.h +++ b/examples/matrix/batch_matmul/kernel_impl/batch_matmul_custom_impl.h @@ -13,7 +13,6 @@ #include "kernel_operator.h" #include "lib/matmul_intf.h" -using namespace AscendC; using namespace matmul; constexpr int USED_CORE_NUM = 2; @@ -24,7 +23,7 @@ class BatchMatmulKernel { __aicore__ inline BatchMatmulKernel(){}; __aicore__ inline void Init(GM_ADDR a, GM_ADDR b, GM_ADDR bias, GM_ADDR c, GM_ADDR workspace, const TCubeTiling& tiling); template - __aicore__ inline void Process(TPipe* pipe, int32_t batchA, int32_t batchB); + __aicore__ inline void Process(AscendC::TPipe* pipe, int32_t batchA, int32_t batchB); Matmul matmulObj; private: __aicore__ inline void CalcOffset(int32_t blockIdx, const TCubeTiling& tiling, int32_t& offsetA, int32_t& offsetB,
AscendC::GlobalTensor cGlobal; + AscendC::GlobalTensor biasGlobal; TCubeTiling tiling; }; @@ -59,7 +58,7 @@ __aicore__ inline void BatchMatmulKernel::Ini int32_t offsetB = 0; int32_t offsetC = 0; int32_t offsetBias = 0; - CalcOffset(GetBlockIdx(), tiling, offsetA, offsetB, offsetC, offsetBias); + CalcOffset(AscendC::GetBlockIdx(), tiling, offsetA, offsetB, offsetC, offsetBias); aGlobal = aGlobal[offsetA]; bGlobal = bGlobal[offsetB]; cGlobal = cGlobal[offsetC]; @@ -71,7 +70,7 @@ __aicore__ inline void BatchMatmulKernel::Ini template template -__aicore__ inline void BatchMatmulKernel::Process(TPipe* pipe, int32_t batchA, int32_t batchB) +__aicore__ inline void BatchMatmulKernel::Process(AscendC::TPipe* pipe, int32_t batchA, int32_t batchB) { int batchC = batchA > batchB ? batchA : batchB; int gLay = tiling.ALayoutInfoG > tiling.BLayoutInfoG ? tiling.ALayoutInfoG : tiling.BLayoutInfoG; diff --git a/examples/matrix/batch_matmul/kernel_launch_method_by_direct/batch_matmul_custom.cpp b/examples/matrix/batch_matmul/kernel_launch_method_by_direct/batch_matmul_custom.cpp index d115a310..b1abfcd9 100644 --- a/examples/matrix/batch_matmul/kernel_launch_method_by_direct/batch_matmul_custom.cpp +++ b/examples/matrix/batch_matmul/kernel_launch_method_by_direct/batch_matmul_custom.cpp @@ -12,7 +12,6 @@ #include "lib/matmul_intf.h" #include "../kernel_impl/batch_matmul_custom_impl.h" -using namespace AscendC; using namespace matmul; constexpr int32_t FULL_L1_SIZE = 512 * 1024; @@ -39,7 +38,7 @@ extern "C" __global__ __aicore__ void batch_matmul_custom(GM_ADDR a, GM_ADDR b, typedef MatmulType C_TYPE; typedef MatmulType BIAS_TYPE; BatchMatmulKernel batchMatmulKernel; - TPipe pipe; + AscendC::TPipe pipe; tiling.shareMode = 0; // 0, share mode tiling.shareL1Size = FULL_L1_SIZE; // full L1 tiling.shareL0CSize = FULL_L0C_SIZE; // full L0C diff --git a/examples/matrix/batch_matmul/kernel_launch_method_by_framework/op_host/batch_matmul_custom.cpp 
b/examples/matrix/batch_matmul/kernel_launch_method_by_framework/op_host/batch_matmul_custom.cpp index b7ceb70d..ff053d11 100644 --- a/examples/matrix/batch_matmul/kernel_launch_method_by_framework/op_host/batch_matmul_custom.cpp +++ b/examples/matrix/batch_matmul/kernel_launch_method_by_framework/op_host/batch_matmul_custom.cpp @@ -28,7 +28,7 @@ static ge::graphStatus TilingFunc(gert::TilingContext* context) tiling.SaveToBuffer(context->GetRawTilingData()->GetData(), context->GetRawTilingData()->GetCapacity()); context->GetRawTilingData()->SetDataSize(tiling.GetDataSize()); size_t userWorkspaceSize = 0; - size_t systemWorkspaceSize = ascendcPlatform.GetLibApiWorkSpaceSize(); + size_t systemWorkspaceSize = static_cast(ascendcPlatform.GetLibApiWorkSpaceSize()); size_t *currentWorkspace = context->GetWorkspaceSizes(1); currentWorkspace[0] = userWorkspaceSize + systemWorkspaceSize; diff --git a/examples/matrix/batch_matmul/kernel_launch_method_by_framework/op_kernel/batch_matmul_custom.cpp b/examples/matrix/batch_matmul/kernel_launch_method_by_framework/op_kernel/batch_matmul_custom.cpp index ff9547f9..a653a0f5 100644 --- a/examples/matrix/batch_matmul/kernel_launch_method_by_framework/op_kernel/batch_matmul_custom.cpp +++ b/examples/matrix/batch_matmul/kernel_launch_method_by_framework/op_kernel/batch_matmul_custom.cpp @@ -12,7 +12,6 @@ #include "lib/matmul_intf.h" #include "../../../../../../kernel_impl/batch_matmul_custom_impl.h" -using namespace AscendC; using namespace matmul; constexpr int32_t FULL_L1_SIZE = 512 * 1024; @@ -25,7 +24,7 @@ extern "C" __global__ __aicore__ void batch_matmul_custom(GM_ADDR a, GM_ADDR b, typedef MatmulType C_TYPE; typedef MatmulType BIAS_TYPE; BatchMatmulKernel batchMatmulKernel; - TPipe pipe; + AscendC::TPipe pipe; tilingData.cubeTilingData.shareMode = 0; // 0, share mode tilingData.cubeTilingData.shareL1Size = FULL_L1_SIZE; // full L1 tilingData.cubeTilingData.shareL0CSize = FULL_L0C_SIZE; // full L0C diff --git 
a/examples/matrix/matmul/kernel_impl/matmul_custom_impl.h b/examples/matrix/matmul/kernel_impl/matmul_custom_impl.h index ef815c69..c7904015 100644 --- a/examples/matrix/matmul/kernel_impl/matmul_custom_impl.h +++ b/examples/matrix/matmul/kernel_impl/matmul_custom_impl.h @@ -13,7 +13,6 @@ #include "kernel_operator.h" #include "lib/matmul_intf.h" -using namespace AscendC; using namespace matmul; template @@ -22,7 +21,7 @@ class MatmulKernel { __aicore__ inline MatmulKernel(){}; __aicore__ inline void Init(GM_ADDR a, GM_ADDR b, GM_ADDR bias, GM_ADDR c, GM_ADDR workspace, const TCubeTiling& tiling); template - __aicore__ inline void Process(TPipe* pipe); + __aicore__ inline void Process(AscendC::TPipe* pipe); Matmul, MatmulType, MatmulType, @@ -32,10 +31,10 @@ class MatmulKernel { __aicore__ inline void CalcOffset(int32_t blockIdx, const TCubeTiling& tiling, int32_t& offsetA, int32_t& offsetB, int32_t& offsetC, int32_t& offsetBias, bool isAtrans, bool isBtrans); - GlobalTensor aGlobal; - GlobalTensor bGlobal; - GlobalTensor cGlobal; - GlobalTensor biasGlobal; + AscendC::GlobalTensor aGlobal; + AscendC::GlobalTensor bGlobal; + AscendC::GlobalTensor cGlobal; + AscendC::GlobalTensor biasGlobal; TCubeTiling tiling; }; @@ -55,7 +54,7 @@ __aicore__ inline void MatmulKernel::Init(GM_ADDR int32_t offsetBias = 0; bool isAtrans = true; bool isBtrans = false; - CalcOffset(GetBlockIdx(), tiling, offsetA, offsetB, offsetC, offsetBias, isAtrans, isBtrans); + CalcOffset(AscendC::GetBlockIdx(), tiling, offsetA, offsetB, offsetC, offsetBias, isAtrans, isBtrans); aGlobal = aGlobal[offsetA]; bGlobal = bGlobal[offsetB]; cGlobal = cGlobal[offsetC]; @@ -67,11 +66,11 @@ __aicore__ inline void MatmulKernel::Init(GM_ADDR template template -__aicore__ inline void MatmulKernel::Process(TPipe* pipe) +__aicore__ inline void MatmulKernel::Process(AscendC::TPipe* pipe) { if constexpr (setTmpSpace) { - TBuf<> tmpMMFormatUb; - LocalTensor mmformatUb; + AscendC::TBuf<> tmpMMFormatUb; + 
AscendC::LocalTensor mmformatUb; pipe->InitBuffer(tmpMMFormatUb, TOTAL_VEC_LOCAL_SIZE); mmformatUb = tmpMMFormatUb.Get(TOTAL_VEC_LOCAL_SIZE); matmulObj.SetLocalWorkspace(mmformatUb); @@ -86,11 +85,16 @@ __aicore__ inline void MatmulKernel::Process(TPip matmulObj.End(); } +__aicore__ inline uint32_t Ceiling(uint32_t a, uint32_t b) +{ + return (a + b - 1) / b; +} + template __aicore__ inline void MatmulKernel::CalcOffset(int32_t blockIdx, const TCubeTiling& tiling, int32_t& offsetA, int32_t& offsetB, int32_t& offsetC, int32_t& offsetBias, bool isAtrans, bool isBtrans) { - auto mSingleBlocks = Ceil(tiling.M, tiling.singleCoreM); + auto mSingleBlocks = Ceiling(tiling.M, tiling.singleCoreM); auto mCoreIndx = blockIdx % mSingleBlocks; auto nCoreIndx = blockIdx / mSingleBlocks; diff --git a/examples/matrix/matmul/kernel_launch_method_by_direct/matmul_custom.cpp b/examples/matrix/matmul/kernel_launch_method_by_direct/matmul_custom.cpp index be66d0e6..a274b83f 100644 --- a/examples/matrix/matmul/kernel_launch_method_by_direct/matmul_custom.cpp +++ b/examples/matrix/matmul/kernel_launch_method_by_direct/matmul_custom.cpp @@ -12,7 +12,6 @@ #include "lib/matmul_intf.h" #include "../kernel_impl/matmul_custom_impl.h" -using namespace AscendC; using namespace matmul; __aicore__ inline void CopyTiling(TCubeTiling* tiling, GM_ADDR tilingGM) @@ -31,7 +30,7 @@ extern "C" __global__ __aicore__ void matmul_custom(GM_ADDR a, GM_ADDR b, GM_ADD TCubeTiling tiling; CopyTiling(&tiling, tilingGm); MatmulKernel matmulKernel; - TPipe pipe; + AscendC::TPipe pipe; REGIST_MATMUL_OBJ(&pipe, GetSysWorkSpacePtr(), matmulKernel.matmulObj, &tiling); matmulKernel.Init(a, b, nullptr, c, workspace, tiling); #ifdef CUSTOM_ASCEND310P diff --git a/examples/matrix/matmul/kernel_launch_method_by_framework/op_host/matmul_custom.cpp b/examples/matrix/matmul/kernel_launch_method_by_framework/op_host/matmul_custom.cpp index adab49f0..95d12755 100644 --- 
a/examples/matrix/matmul/kernel_launch_method_by_framework/op_host/matmul_custom.cpp +++ b/examples/matrix/matmul/kernel_launch_method_by_framework/op_host/matmul_custom.cpp @@ -48,7 +48,7 @@ static ge::graphStatus TilingFunc(gert::TilingContext* context) tiling.SaveToBuffer(context->GetRawTilingData()->GetData(), context->GetRawTilingData()->GetCapacity()); context->GetRawTilingData()->SetDataSize(tiling.GetDataSize()); size_t userWorkspaceSize = 0; - size_t systemWorkspaceSize = ascendcPlatform.GetLibApiWorkSpaceSize(); + size_t systemWorkspaceSize = static_cast(ascendcPlatform.GetLibApiWorkSpaceSize()); size_t *currentWorkspace = context->GetWorkspaceSizes(1); currentWorkspace[0] = userWorkspaceSize + systemWorkspaceSize; diff --git a/examples/matrix/matmul/kernel_launch_method_by_framework/op_kernel/matmul_custom.cpp b/examples/matrix/matmul/kernel_launch_method_by_framework/op_kernel/matmul_custom.cpp index 09c40946..af60b3d4 100644 --- a/examples/matrix/matmul/kernel_launch_method_by_framework/op_kernel/matmul_custom.cpp +++ b/examples/matrix/matmul/kernel_launch_method_by_framework/op_kernel/matmul_custom.cpp @@ -12,13 +12,12 @@ #include "lib/matmul_intf.h" #include "../../../../../../kernel_impl/matmul_custom_impl.h" -using namespace AscendC; using namespace matmul; extern "C" __global__ __aicore__ void matmul_custom(GM_ADDR a, GM_ADDR b, GM_ADDR bias, GM_ADDR c, GM_ADDR workspace, GM_ADDR tiling) { GET_TILING_DATA(tilingData, tiling); MatmulKernel matmulKernel; - TPipe pipe; + AscendC::TPipe pipe; REGIST_MATMUL_OBJ(&pipe, GetSysWorkSpacePtr(), matmulKernel.matmulObj,&tilingData.cubeTilingData); matmulKernel.Init(a, b, bias, c, workspace, tilingData.cubeTilingData); if (TILING_KEY_IS(1)) { diff --git a/examples/normalization/layernorm/kernel_impl/layernorm_custom.h b/examples/normalization/layernorm/kernel_impl/layernorm_custom.h index c838f882..435a7abc 100644 --- a/examples/normalization/layernorm/kernel_impl/layernorm_custom.h +++ 
b/examples/normalization/layernorm/kernel_impl/layernorm_custom.h @@ -12,7 +12,7 @@ #define EXAMPLES_NORMALIZATION_LAYERNORM_CUSTOM_H #include "kernel_operator.h" -namespace AscendC { +namespace MyCustomKernel { struct VecTiling { LayerNormTiling layernormTilingData; float epsilon = 0; @@ -58,13 +58,13 @@ public: private: __aicore__ inline void CopyIn() { - LocalTensor inputXLocal = inQueueX.AllocTensor(); - LocalTensor gammaLocal = inQueueGamma.AllocTensor(); - LocalTensor betaLocal = inQueueBeta.AllocTensor(); + AscendC::LocalTensor inputXLocal = inQueueX.AllocTensor(); + AscendC::LocalTensor gammaLocal = inQueueGamma.AllocTensor(); + AscendC::LocalTensor betaLocal = inQueueBeta.AllocTensor(); - DataCopy(inputXLocal, inputXGlobal, bshLength); - DataCopy(gammaLocal, gammGlobal, hLength); - DataCopy(betaLocal, betaGlobal, hLength); + AscendC::DataCopy(inputXLocal, inputXGlobal, bshLength); + AscendC::DataCopy(gammaLocal, gammGlobal, hLength); + AscendC::DataCopy(betaLocal, betaGlobal, hLength); inQueueX.EnQue(inputXLocal); inQueueGamma.EnQue(gammaLocal); @@ -72,15 +72,15 @@ private: } __aicore__ inline void Compute() { - LocalTensor inputXLocal = inQueueX.DeQue(); - LocalTensor gammaLocal = inQueueGamma.DeQue(); - LocalTensor betaLocal = inQueueBeta.DeQue(); + AscendC::LocalTensor inputXLocal = inQueueX.DeQue(); + AscendC::LocalTensor gammaLocal = inQueueGamma.DeQue(); + AscendC::LocalTensor betaLocal = inQueueBeta.DeQue(); - LocalTensor outputLocal = outQueue.AllocTensor(); - LocalTensor meanLocal = outQueueMean.AllocTensor(); - LocalTensor varianceLocal = outQueueVariance.AllocTensor(); + AscendC::LocalTensor outputLocal = outQueue.AllocTensor(); + AscendC::LocalTensor meanLocal = outQueueMean.AllocTensor(); + AscendC::LocalTensor varianceLocal = outQueueVariance.AllocTensor(); - LayerNorm(outputLocal, meanLocal, varianceLocal, inputXLocal, gammaLocal, betaLocal, + AscendC::LayerNorm(outputLocal, meanLocal, varianceLocal, inputXLocal, gammaLocal, betaLocal, 
(float)epsilon, tiling_); outQueue.EnQue(outputLocal); @@ -93,13 +93,13 @@ private: } __aicore__ inline void CopyOut() { - LocalTensor outputLocal = outQueue.DeQue(); - LocalTensor meanLocal = outQueueMean.DeQue(); - LocalTensor varianceLocal = outQueueVariance.DeQue(); + AscendC::LocalTensor outputLocal = outQueue.DeQue(); + AscendC::LocalTensor meanLocal = outQueueMean.DeQue(); + AscendC::LocalTensor varianceLocal = outQueueVariance.DeQue(); - DataCopy(outputGlobal, outputLocal, bshLength); - DataCopy(outputMeanGlobal, meanLocal, bsLength); - DataCopy(outputVarianceGlobal, varianceLocal, bsLength); + AscendC::DataCopy(outputGlobal, outputLocal, bshLength); + AscendC::DataCopy(outputMeanGlobal, meanLocal, bsLength); + AscendC::DataCopy(outputVarianceGlobal, varianceLocal, bsLength); outQueue.FreeTensor(outputLocal); outQueueMean.FreeTensor(meanLocal); @@ -107,20 +107,20 @@ private: } private: - GlobalTensor inputXGlobal; - GlobalTensor gammGlobal; - GlobalTensor betaGlobal; - GlobalTensor outputGlobal; - GlobalTensor outputMeanGlobal; - GlobalTensor outputVarianceGlobal; - - TPipe pipe; - TQue inQueueX; - TQue inQueueGamma; - TQue inQueueBeta; - TQue outQueue; - TQue outQueueMean; - TQue outQueueVariance; + AscendC::GlobalTensor inputXGlobal; + AscendC::GlobalTensor gammGlobal; + AscendC::GlobalTensor betaGlobal; + AscendC::GlobalTensor outputGlobal; + AscendC::GlobalTensor outputMeanGlobal; + AscendC::GlobalTensor outputVarianceGlobal; + + AscendC::TPipe pipe; + AscendC::TQue inQueueX; + AscendC::TQue inQueueGamma; + AscendC::TQue inQueueBeta; + AscendC::TQue outQueue; + AscendC::TQue outQueueMean; + AscendC::TQue outQueueVariance; uint32_t bLength; uint32_t sLength; diff --git a/examples/normalization/layernorm/kernel_launch_method_by_direct/layernorm_custom.cpp b/examples/normalization/layernorm/kernel_launch_method_by_direct/layernorm_custom.cpp index b225a098..9358217e 100644 --- 
a/examples/normalization/layernorm/kernel_launch_method_by_direct/layernorm_custom.cpp +++ b/examples/normalization/layernorm/kernel_launch_method_by_direct/layernorm_custom.cpp @@ -9,14 +9,13 @@ */ #include "kernel_operator.h" #include "../kernel_impl/layernorm_custom.h" -using namespace AscendC; -__aicore__ inline void CopyTiling(VecTiling *tiling, GM_ADDR tilingGM) +__aicore__ inline void CopyTiling(MyCustomKernel::VecTiling *tiling, GM_ADDR tilingGM) { uint32_t *ptr = reinterpret_cast(tiling); auto tiling32 = reinterpret_cast<__gm__ uint32_t *>(tilingGM); - for (int i = 0; i < sizeof(VecTiling) / sizeof(uint32_t); i++, ptr++) { + for (int i = 0; i < sizeof(MyCustomKernel::VecTiling) / sizeof(uint32_t); i++, ptr++) { *ptr = *(tiling32 + i); } return; @@ -28,9 +27,9 @@ extern "C" __global__ __aicore__ void layernorm_custom(GM_ADDR inputXGm, GM_ADDR if ASCEND_IS_AIC { return; } - VecTiling tilingData; + MyCustomKernel::VecTiling tilingData; CopyTiling(&tilingData, tiling); - KernelLayernorm op; + MyCustomKernel::KernelLayernorm op; op.Init(inputXGm, gammaGm, betaGm, outputGm, outputMeanGm, outputVarianceGm, tilingData); op.Process(); } diff --git a/examples/normalization/layernorm/kernel_launch_method_by_framework/op_kernel/layernorm_custom.cpp b/examples/normalization/layernorm/kernel_launch_method_by_framework/op_kernel/layernorm_custom.cpp index 17db0fbd..2bcc8618 100644 --- a/examples/normalization/layernorm/kernel_launch_method_by_framework/op_kernel/layernorm_custom.cpp +++ b/examples/normalization/layernorm/kernel_launch_method_by_framework/op_kernel/layernorm_custom.cpp @@ -9,15 +9,14 @@ */ #include "../../../../../../kernel_impl/layernorm_custom.h" -using namespace AscendC; extern "C" __global__ __aicore__ void layernorm_custom(GM_ADDR inputXGm, GM_ADDR gammaGm, GM_ADDR betaGm, GM_ADDR outputGm, GM_ADDR outputMeanGm, GM_ADDR outputVarianceGm, GM_ADDR workspace, GM_ADDR tiling) { GET_TILING_DATA(tilingData, tiling); - VecTiling vecTiling = 
*reinterpret_cast(&tilingData); + MyCustomKernel::VecTiling vecTiling = *reinterpret_cast(&tilingData); if (TILING_KEY_IS(1)) { - KernelLayernorm op; + MyCustomKernel::KernelLayernorm op; op.Init(inputXGm, gammaGm, betaGm, outputGm, outputMeanGm, outputVarianceGm, vecTiling); op.Process(); } diff --git a/examples/pad/broadcast/kernel_impl/broadcast_custom.h b/examples/pad/broadcast/kernel_impl/broadcast_custom.h index d816db34..6df59bf8 100644 --- a/examples/pad/broadcast/kernel_impl/broadcast_custom.h +++ b/examples/pad/broadcast/kernel_impl/broadcast_custom.h @@ -13,7 +13,6 @@ #include "kernel_operator.h" constexpr int32_t BUFFER_NUM = 1; -using namespace AscendC; template class KernelBroadcastCustom { public: @@ -22,7 +21,7 @@ public: __aicore__ inline void Init(GM_ADDR x, GM_ADDR y, uint32_t srcLength, uint32_t dstLength, const uint32_t srcShape[dim], const uint32_t dstShape[dim]) { - AscendCUtils::SetOverflow(1); + AscendC::AscendCUtils::SetOverflow(1); xGm.SetGlobalBuffer((__gm__ T *)x, srcLength); yGm.SetGlobalBuffer((__gm__ T *)y, dstLength); @@ -44,32 +43,32 @@ public: private: __aicore__ inline void CopyIn() { - LocalTensor xLocal = inQueueX.AllocTensor(); - DataCopy(xLocal, xGm, srcLength_); + AscendC::LocalTensor xLocal = inQueueX.AllocTensor(); + AscendC::DataCopy(xLocal, xGm, srcLength_); inQueueX.EnQue(xLocal); } __aicore__ inline void Compute() { - LocalTensor xLocal = inQueueX.DeQue(); - LocalTensor yLocal = outQueueY.AllocTensor(); - BroadCast(yLocal, xLocal, dstShape_, srcShape_); + AscendC::LocalTensor xLocal = inQueueX.DeQue(); + AscendC::LocalTensor yLocal = outQueueY.AllocTensor(); + AscendC::BroadCast(yLocal, xLocal, dstShape_, srcShape_); outQueueY.EnQue(yLocal); inQueueX.FreeTensor(xLocal); } __aicore__ inline void CopyOut() { - LocalTensor yLocal = outQueueY.DeQue(); - DataCopy(yGm, yLocal, dstLength_); + AscendC::LocalTensor yLocal = outQueueY.DeQue(); + AscendC::DataCopy(yGm, yLocal, dstLength_); outQueueY.FreeTensor(yLocal); } private: 
- TPipe pipe; - TQue inQueueX; - TQue outQueueY; - GlobalTensor xGm; - GlobalTensor yGm; + AscendC::TPipe pipe; + AscendC::TQue inQueueX; + AscendC::TQue outQueueY; + AscendC::GlobalTensor xGm; + AscendC::GlobalTensor yGm; uint32_t srcLength_; uint32_t dstLength_; const uint32_t *srcShape_{nullptr}; diff --git a/examples/pad/broadcast/kernel_launch_method_by_direct/broadcast_custom.cpp b/examples/pad/broadcast/kernel_launch_method_by_direct/broadcast_custom.cpp index 28a17316..395ac034 100644 --- a/examples/pad/broadcast/kernel_launch_method_by_direct/broadcast_custom.cpp +++ b/examples/pad/broadcast/kernel_launch_method_by_direct/broadcast_custom.cpp @@ -8,7 +8,6 @@ * See LICENSE in the root of the software repository for the full text of the License. */ #include "../kernel_impl/broadcast_custom.h" -using namespace AscendC; struct BroadcastTilingData { uint32_t dim{0}; uint32_t axis{0}; diff --git a/examples/pad/broadcast/kernel_launch_method_by_framework/op_kernel/broadcast_custom.cpp b/examples/pad/broadcast/kernel_launch_method_by_framework/op_kernel/broadcast_custom.cpp index a8fd0464..230a5975 100644 --- a/examples/pad/broadcast/kernel_launch_method_by_framework/op_kernel/broadcast_custom.cpp +++ b/examples/pad/broadcast/kernel_launch_method_by_framework/op_kernel/broadcast_custom.cpp @@ -8,7 +8,6 @@ * See LICENSE in the root of the software repository for the full text of the License. */ #include "../../../../../../kernel_impl/broadcast_custom.h" -using namespace AscendC; extern "C" __global__ __aicore__ void broadcast_custom(GM_ADDR x, GM_ADDR y, GM_ADDR workspace, GM_ADDR tiling) { -- Gitee