diff --git a/examples/activation/simplesoftmax/kernel_impl/simplesoftmax_kernel.h b/examples/activation/simplesoftmax/kernel_impl/simplesoftmax_kernel.h index 14c7387a6f718400cd1d722c644eaa184b3c053a..d8079d95e6452fac46cf72c2263654e1cb5abea1 100644 --- a/examples/activation/simplesoftmax/kernel_impl/simplesoftmax_kernel.h +++ b/examples/activation/simplesoftmax/kernel_impl/simplesoftmax_kernel.h @@ -12,7 +12,7 @@ #define EXAMPLES_ACTIVATION_SIMPLESOFTMAX_KERNEL_H #include "kernel_operator.h" -namespace AscendC { +namespace MyCustomKernel { constexpr int32_t BUFFER_NUM = 2; constexpr uint32_t FLOAT_NUM_OF_SINGEL_BLOCK = 8; constexpr uint32_t BASIC_BLOCK_ROW_FACTOR = 8; @@ -55,10 +55,10 @@ public: } __aicore__ inline void Init(GM_ADDR x, GM_ADDR max, GM_ADDR sum, GM_ADDR z, const VecTiling& tiling) { - ASSERT(GetBlockNum() != 0 && "block dim can not be zero!"); + ASSERT(AscendC::GetBlockNum() != 0 && "block dim can not be zero!"); InitTiling(tiling); - if (GetBlockIdx() == this->usedBlockDim) { // tail core + if (AscendC::GetBlockIdx() == this->usedBlockDim) { // tail core this->singleLoopCoreRowNum = this->tailCoreSingleLoopCoreRowNum; this->singleCoreLoopCount = this->tailCoreSingleCoreLoopCount; this->leftRow = this->tailCoreSingleCoreLoopTail; @@ -67,8 +67,8 @@ public: this->blockLength = this->coreRowNum * this->columnLength; this->msLength = this->coreRowNum * FLOAT_NUM_OF_SINGEL_BLOCK; // max sum length per block process - uint32_t offset1 = this->blockLength * GetBlockIdx(); - uint32_t offset2 = this->msLength * GetBlockIdx(); + uint32_t offset1 = this->blockLength * AscendC::GetBlockIdx(); + uint32_t offset2 = this->msLength * AscendC::GetBlockIdx(); xGm.SetGlobalBuffer((__gm__ float*)x + offset1, this->blockLength); zGm.SetGlobalBuffer((__gm__ float*)z + offset1, this->blockLength); @@ -88,7 +88,7 @@ public: __aicore__ inline void Process() { - if (GetBlockIdx() > usedBlockDim) { + if (AscendC::GetBlockIdx() > usedBlockDim) { return; } @@ -108,12 +108,12 @@ 
public: private: __aicore__ inline void CopyIn(int32_t progress, uint32_t rowNum) { - LocalTensor xLocal = queueX.AllocTensor(); - LocalTensor maxLocal = queueMax.AllocTensor(); - LocalTensor sumLocal = queueSum.AllocTensor(); - DataCopy(xLocal, xGm[progress * this->tileLength], rowNum * this->columnLength); - DataCopy(maxLocal, maxGm[progress * this->msTileLength], rowNum * FLOAT_NUM_OF_SINGEL_BLOCK); - DataCopy(sumLocal, sumGm[progress * this->msTileLength], rowNum * FLOAT_NUM_OF_SINGEL_BLOCK); + AscendC::LocalTensor xLocal = queueX.AllocTensor(); + AscendC::LocalTensor maxLocal = queueMax.AllocTensor(); + AscendC::LocalTensor sumLocal = queueSum.AllocTensor(); + AscendC::DataCopy(xLocal, xGm[progress * this->tileLength], rowNum * this->columnLength); + AscendC::DataCopy(maxLocal, maxGm[progress * this->msTileLength], rowNum * FLOAT_NUM_OF_SINGEL_BLOCK); + AscendC::DataCopy(sumLocal, sumGm[progress * this->msTileLength], rowNum * FLOAT_NUM_OF_SINGEL_BLOCK); queueX.EnQue(xLocal); queueMax.EnQue(maxLocal); queueSum.EnQue(sumLocal); @@ -121,18 +121,18 @@ private: __aicore__ inline void Compute(int32_t progressm, uint32_t rowNum) { - LocalTensor xLocal = queueX.DeQue(); - LocalTensor maxLocal = queueMax.DeQue(); - LocalTensor sumLocal = queueSum.DeQue(); - LocalTensor tmpBuffer = sharedTmpBuffer.Get(); + AscendC::LocalTensor xLocal = queueX.DeQue(); + AscendC::LocalTensor maxLocal = queueMax.DeQue(); + AscendC::LocalTensor sumLocal = queueSum.DeQue(); + AscendC::LocalTensor tmpBuffer = sharedTmpBuffer.Get(); - SoftMaxShapeInfo srcShape = { rowNum, this->columnLength, rowNum, this->columnLength }; + AscendC::SoftMaxShapeInfo srcShape = { rowNum, this->columnLength, rowNum, this->columnLength }; if (rowNum % BASIC_BLOCK_ROW_FACTOR == 0 && this->columnLength % BASIC_BLOCK_COLUMN_FACTOR == 0 && this->columnLength < BASIC_BLOCK_MAX_COLUMN_LENGTH) { - SimpleSoftMax(xLocal, sumLocal, maxLocal, xLocal, tmpBuffer, softmaxTiling, srcShape); + AscendC::SimpleSoftMax(xLocal, 
sumLocal, maxLocal, xLocal, tmpBuffer, softmaxTiling, srcShape); } else { - SimpleSoftMax(xLocal, sumLocal, maxLocal, xLocal, tmpBuffer, softmaxTiling, srcShape); + AscendC::SimpleSoftMax(xLocal, sumLocal, maxLocal, xLocal, tmpBuffer, softmaxTiling, srcShape); } queueX.EnQue(xLocal); queueMax.FreeTensor(maxLocal); @@ -141,20 +141,20 @@ private: __aicore__ inline void CopyOut(int32_t progress, uint32_t rowNum) { - LocalTensor zLocal = queueX.DeQue(); - DataCopy(zGm[progress * this->tileLength], zLocal, rowNum * this->columnLength); + AscendC::LocalTensor zLocal = queueX.DeQue(); + AscendC::DataCopy(zGm[progress * this->tileLength], zLocal, rowNum * this->columnLength); queueX.FreeTensor(zLocal); } private: - TPipe pipe; - TBuf sharedTmpBuffer; - TQue queueX; - TQue queueMax, queueSum; - GlobalTensor xGm; - GlobalTensor maxGm; - GlobalTensor sumGm; - GlobalTensor zGm; + AscendC::TPipe pipe; + AscendC::TBuf sharedTmpBuffer; + AscendC::TQue queueX; + AscendC::TQue queueMax, queueSum; + AscendC::GlobalTensor xGm; + AscendC::GlobalTensor maxGm; + AscendC::GlobalTensor sumGm; + AscendC::GlobalTensor zGm; uint32_t blockLength = 0; uint32_t usedBlockDim = 0; diff --git a/examples/activation/simplesoftmax/kernel_launch_method_by_direct/simplesoftmax_custom.cpp b/examples/activation/simplesoftmax/kernel_launch_method_by_direct/simplesoftmax_custom.cpp index 1eb366540c69733d8f4540a0f896d218496ac04d..636a18bfe263c83e49cbf9241479c2f3d95afb46 100644 --- a/examples/activation/simplesoftmax/kernel_launch_method_by_direct/simplesoftmax_custom.cpp +++ b/examples/activation/simplesoftmax/kernel_launch_method_by_direct/simplesoftmax_custom.cpp @@ -9,14 +9,13 @@ */ #include "../kernel_impl/simplesoftmax_kernel.h" -using namespace AscendC; -__aicore__ inline void CopyTiling(VecTiling* tiling, GM_ADDR tilingGM) +__aicore__ inline void CopyTiling(MyCustomKernel::VecTiling* tiling, GM_ADDR tilingGM) { uint32_t* ptr = reinterpret_cast(tiling); auto tiling32 = reinterpret_cast<__gm__ 
uint32_t*>(tilingGM); - for (int i = 0; i < sizeof(VecTiling) / sizeof(uint32_t); i++, ptr++) { + for (int i = 0; i < sizeof(MyCustomKernel::VecTiling) / sizeof(uint32_t); i++, ptr++) { *ptr = *(tiling32 + i); } return; @@ -28,10 +27,10 @@ extern "C" __global__ __aicore__ void simplesoftmax_custom(GM_ADDR x, GM_ADDR ma if ASCEND_IS_AIC { return; } - VecTiling tilingData; + MyCustomKernel::VecTiling tilingData; CopyTiling(&tilingData, tiling); - KernelSoftmax op; - op.Init(x, max, sum, z, tilingData); + MyCustomKernel::KernelSoftmax op; + op.Init(x, max, sum, z, tilingData); op.Process(); } diff --git a/examples/activation/simplesoftmax/kernel_launch_method_by_framework/op_kernel/simplesoftmax_custom.cpp b/examples/activation/simplesoftmax/kernel_launch_method_by_framework/op_kernel/simplesoftmax_custom.cpp index 1bb113e3680e4bbef05f53cf2542a48dd747d8ff..91b8de26e8b183482e29f327b4ceaaef6766e68c 100644 --- a/examples/activation/simplesoftmax/kernel_launch_method_by_framework/op_kernel/simplesoftmax_custom.cpp +++ b/examples/activation/simplesoftmax/kernel_launch_method_by_framework/op_kernel/simplesoftmax_custom.cpp @@ -8,7 +8,6 @@ * See LICENSE in the root of the software repository for the full text of the License.
*/ #include "../../../../../../kernel_impl/simplesoftmax_kernel.h" -using namespace AscendC; extern "C" __global__ __aicore__ void simplesoftmax_custom(GM_ADDR x, GM_ADDR max, GM_ADDR sum, GM_ADDR z, GM_ADDR workspace, GM_ADDR tiling) @@ -17,9 +16,9 @@ extern "C" __global__ __aicore__ void simplesoftmax_custom(GM_ADDR x, GM_ADDR ma return; } GET_TILING_DATA(tilingData, tiling); - VecTiling vecTiling = *reinterpret_cast(&tilingData); - KernelSoftmax op; - op.Init(x, max, sum, z, vecTiling); + MyCustomKernel::VecTiling vecTiling = *reinterpret_cast(&tilingData); + MyCustomKernel::KernelSoftmax op; + op.Init(x, max, sum, z, vecTiling); if (TILING_KEY_IS(1)) { op.Process(); } diff --git a/examples/activation/softmax/kernel_impl/softmax_kernel.h b/examples/activation/softmax/kernel_impl/softmax_kernel.h index 5e75aed32a9dc48928d12d0b3e4b59ac580b6843..bac55031dc19773234d2ef84c4c036a6505e4d48 100644 --- a/examples/activation/softmax/kernel_impl/softmax_kernel.h +++ b/examples/activation/softmax/kernel_impl/softmax_kernel.h @@ -12,7 +12,7 @@ #define EXAMPLES_ACTIVATION_SOFTMAX_KERNEL_H #include "kernel_operator.h" -namespace AscendC { +namespace MyCustomKernel { constexpr int32_t BUFFER_NUM = 2; constexpr uint32_t FLOAT_NUM_OF_SINGEL_BLOCK = 8; constexpr uint32_t BASIC_BLOCK_ROW_FACTOR = 8; @@ -55,10 +55,10 @@ public: } __aicore__ inline void Init(GM_ADDR x, GM_ADDR max, GM_ADDR sum, GM_ADDR z, const VecTiling& tilingData) { - ASSERT(GetBlockNum() != 0 && "block dim can not be zero!"); + ASSERT(AscendC::GetBlockNum() != 0 && "block dim can not be zero!"); InitTiling(tilingData); - if (GetBlockIdx() == this->usedBlockDim) { // tail core + if (AscendC::GetBlockIdx() == this->usedBlockDim) { // tail core this->singleLoopCoreRowNum = this->tailCoreSingleLoopCoreRowNum; this->singleCoreLoopCount = this->tailCoreSingleCoreLoopCount; this->leftRow = this->tailCoreSingleCoreLoopTail; @@ -67,8 +67,8 @@ public: this->blockLength = this->coreRowNum * this->columnLength;
this->msLength = this->coreRowNum * FLOAT_NUM_OF_SINGEL_BLOCK; // max sum length per block process - uint32_t offset1 = this->blockLength * GetBlockIdx(); - uint32_t offset2 = this->msLength * GetBlockIdx(); + uint32_t offset1 = this->blockLength * AscendC::GetBlockIdx(); + uint32_t offset2 = this->msLength * AscendC::GetBlockIdx(); xGm.SetGlobalBuffer((__gm__ float*)x + offset1, this->blockLength); zGm.SetGlobalBuffer((__gm__ float*)z + offset1, this->blockLength); @@ -89,7 +89,7 @@ public: __aicore__ inline void Process() { - if (GetBlockIdx() > this->usedBlockDim) { + if (AscendC::GetBlockIdx() > this->usedBlockDim) { return; } @@ -108,25 +108,25 @@ public: private: __aicore__ inline void CopyIn(int32_t progress, uint32_t rowNum) { - LocalTensor xLocal = queueX.AllocTensor(); - DataCopy(xLocal, xGm[progress * this->tileLength], rowNum * this->columnLength); + AscendC::LocalTensor xLocal = queueX.AllocTensor(); + AscendC::DataCopy(xLocal, xGm[progress * this->tileLength], rowNum * this->columnLength); queueX.EnQue(xLocal); } __aicore__ inline void Compute(int32_t progressm, uint32_t rowNum) { - LocalTensor xLocal = queueX.DeQue(); - LocalTensor maxLocal = queueMax.AllocTensor(); - LocalTensor sumLocal = queueSum.AllocTensor(); - LocalTensor tmpBuffer = sharedTmpBuffer.Get(); + AscendC::LocalTensor xLocal = queueX.DeQue(); + AscendC::LocalTensor maxLocal = queueMax.AllocTensor(); + AscendC::LocalTensor sumLocal = queueSum.AllocTensor(); + AscendC::LocalTensor tmpBuffer = sharedTmpBuffer.Get(); - SoftMaxShapeInfo srcShape = { rowNum, this->columnLength, rowNum, this->columnLength }; + AscendC::SoftMaxShapeInfo srcShape = { rowNum, this->columnLength, rowNum, this->columnLength }; if (rowNum % BASIC_BLOCK_ROW_FACTOR == 0 && this->columnLength % BASIC_BLOCK_COLUMN_FACTOR == 0 && this->columnLength < BASIC_BLOCK_MAX_COLUMN_LENGTH) { - SoftMax(xLocal, sumLocal, maxLocal, xLocal, tmpBuffer, softmaxTiling, srcShape); + AscendC::SoftMax(xLocal, sumLocal, maxLocal, xLocal, 
tmpBuffer, softmaxTiling, srcShape); } else { - SoftMax(xLocal, sumLocal, maxLocal, xLocal, tmpBuffer, softmaxTiling, srcShape); + AscendC::SoftMax(xLocal, sumLocal, maxLocal, xLocal, tmpBuffer, softmaxTiling, srcShape); } queueX.EnQue(xLocal); queueMax.EnQue(maxLocal); @@ -135,13 +135,13 @@ private: __aicore__ inline void CopyOut(int32_t progress, uint32_t rowNum) { - LocalTensor zLocal = queueX.DeQue(); - LocalTensor maxLocal = queueMax.DeQue(); - LocalTensor sumLocal = queueSum.DeQue(); + AscendC::LocalTensor zLocal = queueX.DeQue(); + AscendC::LocalTensor maxLocal = queueMax.DeQue(); + AscendC::LocalTensor sumLocal = queueSum.DeQue(); - DataCopy(zGm[progress * this->tileLength], zLocal, rowNum * this->columnLength); - DataCopy(maxGm[progress * this->msTileLength], maxLocal, rowNum * FLOAT_NUM_OF_SINGEL_BLOCK); - DataCopy(sumGm[progress * this->msTileLength], sumLocal, rowNum * FLOAT_NUM_OF_SINGEL_BLOCK); + AscendC::DataCopy(zGm[progress * this->tileLength], zLocal, rowNum * this->columnLength); + AscendC::DataCopy(maxGm[progress * this->msTileLength], maxLocal, rowNum * FLOAT_NUM_OF_SINGEL_BLOCK); + AscendC::DataCopy(sumGm[progress * this->msTileLength], sumLocal, rowNum * FLOAT_NUM_OF_SINGEL_BLOCK); queueX.FreeTensor(zLocal); queueMax.FreeTensor(maxLocal); @@ -149,14 +149,14 @@ private: } private: - TPipe pipe; - TBuf sharedTmpBuffer; - TQue queueX; - TQue queueMax, queueSum; - GlobalTensor xGm; - GlobalTensor maxGm; - GlobalTensor sumGm; - GlobalTensor zGm; + AscendC::TPipe pipe; + AscendC::TBuf sharedTmpBuffer; + AscendC::TQue queueX; + AscendC::TQue queueMax, queueSum; + AscendC::GlobalTensor xGm; + AscendC::GlobalTensor maxGm; + AscendC::GlobalTensor sumGm; + AscendC::GlobalTensor zGm; uint32_t blockLength = 0; uint32_t usedBlockDim = 0; diff --git a/examples/activation/softmax/kernel_launch_method_by_direct/softmax_custom.cpp b/examples/activation/softmax/kernel_launch_method_by_direct/softmax_custom.cpp index 
51848ac2f7518eaee07cbdebf6494e7982b77777..bf6abed2ed49275dc6bd38ec11069dbab17815ea 100644 --- a/examples/activation/softmax/kernel_launch_method_by_direct/softmax_custom.cpp +++ b/examples/activation/softmax/kernel_launch_method_by_direct/softmax_custom.cpp @@ -9,14 +9,13 @@ */ #include "../kernel_impl/softmax_kernel.h" -using namespace AscendC; -__aicore__ inline void CopyTiling(VecTiling* tiling, GM_ADDR tilingGM) +__aicore__ inline void CopyTiling(MyCustomKernel::VecTiling* tiling, GM_ADDR tilingGM) { uint32_t* ptr = reinterpret_cast(tiling); auto tiling32 = reinterpret_cast<__gm__ uint32_t*>(tilingGM); - for (int i = 0; i < sizeof(VecTiling) / sizeof(uint32_t); i++, ptr++) { + for (int i = 0; i < sizeof(MyCustomKernel::VecTiling) / sizeof(uint32_t); i++, ptr++) { *ptr = *(tiling32 + i); } return; @@ -28,10 +27,10 @@ extern "C" __global__ __aicore__ void softmax_custom(GM_ADDR x, GM_ADDR max, GM_ if ASCEND_IS_AIC { return; } - VecTiling tilingData; + MyCustomKernel::VecTiling tilingData; CopyTiling(&tilingData, tiling); - KernelSoftmax op; - op.Init(x, max, sum, z, tilingData); + MyCustomKernel::KernelSoftmax op; + op.Init(x, max, sum, z, tilingData); op.Process(); } diff --git a/examples/activation/softmax/kernel_launch_method_by_framework/op_kernel/softmax_custom.cpp b/examples/activation/softmax/kernel_launch_method_by_framework/op_kernel/softmax_custom.cpp index e263b3d26659f06be12ad0c4175d5acf1e32a206..95854592b9251e264e9d75e5acaa5f091f2cbd0d 100644 --- a/examples/activation/softmax/kernel_launch_method_by_framework/op_kernel/softmax_custom.cpp +++ b/examples/activation/softmax/kernel_launch_method_by_framework/op_kernel/softmax_custom.cpp @@ -9,7 +9,6 @@ */ #include "../../../../../../kernel_impl/softmax_kernel.h" -using namespace AscendC; extern "C" __global__ __aicore__ void softmax_custom(GM_ADDR x, GM_ADDR max, GM_ADDR sum, GM_ADDR z, GM_ADDR workspace, GM_ADDR tiling) @@ -18,9 +17,9 @@ extern "C" __global__ __aicore__ void softmax_custom(GM_ADDR
x, GM_ADDR max, GM_ return; } GET_TILING_DATA(tilingData, tiling); - VecTiling vecTiling = *reinterpret_cast(&tilingData); - KernelSoftmax op; - op.Init(x, max, sum, z, vecTiling); + MyCustomKernel::VecTiling vecTiling = *reinterpret_cast(&tilingData); + MyCustomKernel::KernelSoftmax op; + op.Init(x, max, sum, z, vecTiling); if (TILING_KEY_IS(1)) { op.Process(); } diff --git a/examples/activation/softmaxflash/kernel_impl/softmaxflash_kernel.h b/examples/activation/softmaxflash/kernel_impl/softmaxflash_kernel.h index 391e4fb3b7fddbb544d9b82392d99b7197cddf60..0e4b3877d57b6f3db0170ddde446df86040654bb 100644 --- a/examples/activation/softmaxflash/kernel_impl/softmaxflash_kernel.h +++ b/examples/activation/softmaxflash/kernel_impl/softmaxflash_kernel.h @@ -12,7 +12,7 @@ #define EXAMPLES_ACTIVATION_SOFTMAXFLASH_KERNEL_H #include "kernel_operator.h" -namespace AscendC { +namespace MyCustomKernel { constexpr int32_t BUFFER_NUM = 2; constexpr uint32_t FLOAT_NUM_OF_SINGEL_BLOCK = 8; constexpr uint32_t BASIC_BLOCK_ROW_FACTOR = 8; @@ -62,10 +62,10 @@ public: __aicore__ inline void Init(GM_ADDR x, GM_ADDR max, GM_ADDR sum, const VecTiling& tilingData) { - ASSERT(GetBlockNum() != 0 && "block dim can not be zero!"); + ASSERT(AscendC::GetBlockNum() != 0 && "block dim can not be zero!"); InitTiling(tilingData); - if (GetBlockIdx() == this->usedBlockDim) { // tail core + if (AscendC::GetBlockIdx() == this->usedBlockDim) { // tail core this->singleLoopCoreRowNum = this->tailCoreSingleLoopCoreRowNum; this->singleCoreLoopCount = this->tailCoreSingleCoreLoopCount; this->leftRow = this->tailCoreSingleCoreLoopTail; @@ -74,8 +74,8 @@ public: this->blockLength = this->coreRowNum * this->columnLength; this->msLength = this->coreRowNum * FLOAT_NUM_OF_SINGEL_BLOCK; // max sum length per block process - uint32_t offset1 = this->blockLength * GetBlockIdx(); - uint32_t offset2 = this->msLength * GetBlockIdx(); + uint32_t offset1 = this->blockLength * AscendC::GetBlockIdx(); + uint32_t
offset2 = this->msLength * AscendC::GetBlockIdx(); xGm.SetGlobalBuffer((__gm__ float*)x + offset1, this->blockLength); maxGm.SetGlobalBuffer((__gm__ float*)max + offset2, this->msLength); @@ -97,7 +97,7 @@ public: __aicore__ inline void Process() { - if (GetBlockIdx() > this->usedBlockDim) { + if (AscendC::GetBlockIdx() > this->usedBlockDim) { return; } @@ -111,15 +111,15 @@ public: CopyIn(i, this->loopK, this->singleLoopCoreRowNum, this->tailK); Compute(i, this->loopK, this->singleLoopCoreRowNum, this->tailK); } - event_t eventIdVToMte3 = static_cast(GetTPipePtr()->FetchEventID(HardEvent::V_MTE3)); - SetFlag(eventIdVToMte3); - WaitFlag(eventIdVToMte3); + event_t eventIdVToMte3 = static_cast(GetTPipePtr()->FetchEventID(AscendC::HardEvent::V_MTE3)); + AscendC::SetFlag(eventIdVToMte3); + AscendC::WaitFlag(eventIdVToMte3); // copy max sum to gm CopyOut(i, this->msTileLength); } - event_t eventIdMte3ToV = static_cast(GetTPipePtr()->FetchEventID(HardEvent::MTE3_V)); - SetFlag(eventIdMte3ToV); - WaitFlag(eventIdMte3ToV); + event_t eventIdMte3ToV = static_cast(GetTPipePtr()->FetchEventID(AscendC::HardEvent::MTE3_V)); + AscendC::SetFlag(eventIdMte3ToV); + AscendC::WaitFlag(eventIdMte3ToV); if (this->leftRow > 0) { for (int32_t j = 0; j < this->loopK; j++) { // split K @@ -131,9 +131,9 @@ public: CopyIn(this->singleCoreLoopCount, this->loopK, this->leftRow, this->tailK); Compute(this->singleCoreLoopCount, this->loopK, this->leftRow, this->tailK); } - event_t eventIdVToMte3 = static_cast(GetTPipePtr()->FetchEventID(HardEvent::V_MTE3)); - SetFlag(eventIdVToMte3); - WaitFlag(eventIdVToMte3); + event_t eventIdVToMte3 = static_cast(GetTPipePtr()->FetchEventID(AscendC::HardEvent::V_MTE3)); + AscendC::SetFlag(eventIdVToMte3); + AscendC::WaitFlag(eventIdVToMte3); // copy max sum to gm uint32_t tailMsTileLength = this->leftRow * FLOAT_NUM_OF_SINGEL_BLOCK; CopyOut(this->singleCoreLoopCount, tailMsTileLength); @@ -143,10 +143,10 @@ public: private: __aicore__ inline void 
CopyIn(uint32_t rowIndex, uint32_t kIndex, uint32_t rowNum, uint32_t columnNum) { - LocalTensor xLocal = queueX.AllocTensor(); + AscendC::LocalTensor xLocal = queueX.AllocTensor(); uint32_t offset = this->singleLoopCoreRowNum * this->columnLength; for (uint32_t i = 0; i < rowNum; i++) { - DataCopy(xLocal[i * columnNum], xGm[rowIndex * offset + i * this->columnLength + kIndex * this->splitK], + AscendC::DataCopy(xLocal[i * columnNum], xGm[rowIndex * offset + i * this->columnLength + kIndex * this->splitK], columnNum); } queueX.EnQue(xLocal); @@ -154,28 +154,28 @@ private: __aicore__ inline void Compute(uint32_t rowIndex, uint32_t columnIndex, uint32_t rowNum, uint32_t columnNum) { - LocalTensor xLocal = queueX.DeQue(); - LocalTensor tmpBuffer = sharedTmpBuffer.Get(); + AscendC::LocalTensor xLocal = queueX.DeQue(); + AscendC::LocalTensor tmpBuffer = sharedTmpBuffer.Get(); - SoftMaxShapeInfo srcShape = { rowNum, columnNum, rowNum, columnNum }; + AscendC::SoftMaxShapeInfo srcShape = { rowNum, columnNum, rowNum, columnNum }; if (columnIndex == 0) { // isUpdate == false if (rowNum % BASIC_BLOCK_ROW_FACTOR == 0 && columnNum % BASIC_BLOCK_COLUMN_FACTOR == 0 && columnNum < BASIC_BLOCK_MAX_COLUMN_LENGTH) { - SoftmaxFlashV2(xLocal, sumLocal, maxLocal, xLocal, expmaxLocal, sumLocal, + AscendC::SoftmaxFlashV2(xLocal, sumLocal, maxLocal, xLocal, expmaxLocal, sumLocal, maxLocal, tmpBuffer, softmaxTiling, srcShape); } else { - SoftmaxFlashV2(xLocal, sumLocal, maxLocal, xLocal, expmaxLocal, sumLocal, + AscendC::SoftmaxFlashV2(xLocal, sumLocal, maxLocal, xLocal, expmaxLocal, sumLocal, maxLocal, tmpBuffer, softmaxTiling, srcShape); } } else { if (rowNum % BASIC_BLOCK_ROW_FACTOR == 0 && columnNum % BASIC_BLOCK_COLUMN_FACTOR == 0 && columnNum < BASIC_BLOCK_MAX_COLUMN_LENGTH) { - SoftmaxFlashV2(xLocal, sumLocal, maxLocal, xLocal, expmaxLocal, sumLocal, + AscendC::SoftmaxFlashV2(xLocal, sumLocal, maxLocal, xLocal, expmaxLocal, sumLocal, maxLocal, tmpBuffer, softmaxTiling, srcShape); } 
else { - SoftmaxFlashV2(xLocal, sumLocal, maxLocal, xLocal, expmaxLocal, sumLocal, + AscendC::SoftmaxFlashV2(xLocal, sumLocal, maxLocal, xLocal, expmaxLocal, sumLocal, maxLocal, tmpBuffer, softmaxTiling, srcShape); } } @@ -184,21 +184,21 @@ private: __aicore__ inline void CopyOut(uint32_t progress, uint32_t count) { - DataCopy(maxGm[progress * this->msTileLength], maxLocal, count); - DataCopy(sumGm[progress * this->msTileLength], sumLocal, count); + AscendC::DataCopy(maxGm[progress * this->msTileLength], maxLocal, count); + AscendC::DataCopy(sumGm[progress * this->msTileLength], sumLocal, count); } private: - TPipe pipe; - TBuf sharedTmpBuffer; - TQue queueX; - TBuf tbufMax, tbufSum, tbufExpmax; - GlobalTensor xGm; - GlobalTensor maxGm; - GlobalTensor sumGm; - LocalTensor maxLocal; - LocalTensor sumLocal; - LocalTensor expmaxLocal; + AscendC::TPipe pipe; + AscendC::TBuf sharedTmpBuffer; + AscendC::TQue queueX; + AscendC::TBuf tbufMax, tbufSum, tbufExpmax; + AscendC::GlobalTensor xGm; + AscendC::GlobalTensor maxGm; + AscendC::GlobalTensor sumGm; + AscendC::LocalTensor maxLocal; + AscendC::LocalTensor sumLocal; + AscendC::LocalTensor expmaxLocal; uint32_t blockLength = 0; uint32_t usedBlockDim = 0; diff --git a/examples/activation/softmaxflash/kernel_launch_method_by_direct/softmaxflash_custom.cpp b/examples/activation/softmaxflash/kernel_launch_method_by_direct/softmaxflash_custom.cpp index cfcfa32df4082a69947efef6c62ae88e3407d158..e3ee128c29c4a40497adf87056b7a878817be201 100644 --- a/examples/activation/softmaxflash/kernel_launch_method_by_direct/softmaxflash_custom.cpp +++ b/examples/activation/softmaxflash/kernel_launch_method_by_direct/softmaxflash_custom.cpp @@ -9,14 +9,13 @@ */ #include "../kernel_impl/softmaxflash_kernel.h" -using namespace AscendC; -__aicore__ inline void CopyTiling(VecTiling* tiling, GM_ADDR tilingGM) +__aicore__ inline void CopyTiling(MyCustomKernel::VecTiling* tiling, GM_ADDR tilingGM) { uint32_t* ptr =
reinterpret_cast(tiling); auto tiling32 = reinterpret_cast<__gm__ uint32_t*>(tilingGM); - for (int i = 0; i < sizeof(VecTiling) / sizeof(uint32_t); i++, ptr++) { + for (int i = 0; i < sizeof(MyCustomKernel::VecTiling) / sizeof(uint32_t); i++, ptr++) { *ptr = *(tiling32 + i); } return; @@ -28,10 +27,10 @@ extern "C" __global__ __aicore__ void softmaxflash_custom(GM_ADDR x, GM_ADDR max if ASCEND_IS_AIC { return; } - VecTiling tilingData; + MyCustomKernel::VecTiling tilingData; CopyTiling(&tilingData, tiling); - KernelSoftmax op; - op.Init(x, max, sum, tilingData); + MyCustomKernel::KernelSoftmax op; + op.Init(x, max, sum, tilingData); op.Process(); } diff --git a/examples/activation/softmaxflash/kernel_launch_method_by_framework/op_kernel/softmaxflash_custom.cpp b/examples/activation/softmaxflash/kernel_launch_method_by_framework/op_kernel/softmaxflash_custom.cpp index c4d69de5291c5b040cc7ef3886a9ce6bc0614c98..019c856a2dcb9dcc5f67a8b70fa185316bbf928e 100644 --- a/examples/activation/softmaxflash/kernel_launch_method_by_framework/op_kernel/softmaxflash_custom.cpp +++ b/examples/activation/softmaxflash/kernel_launch_method_by_framework/op_kernel/softmaxflash_custom.cpp @@ -9,7 +9,6 @@ */ #include "../../../../../../kernel_impl/softmaxflash_kernel.h" -using namespace AscendC; extern "C" __global__ __aicore__ void softmaxflash_custom(GM_ADDR x, GM_ADDR max, GM_ADDR sum, GM_ADDR workspace, GM_ADDR tiling) @@ -18,9 +17,9 @@ extern "C" __global__ __aicore__ void softmaxflash_custom(GM_ADDR x, GM_ADDR max return; } GET_TILING_DATA(tilingData, tiling); - VecTiling vecTiling = *reinterpret_cast(&tilingData); - KernelSoftmax op; - op.Init(x, max, sum, vecTiling); + MyCustomKernel::VecTiling vecTiling = *reinterpret_cast(&tilingData); + MyCustomKernel::KernelSoftmax op; + op.Init(x, max, sum, vecTiling); if (TILING_KEY_IS(1)) { op.Process(); } diff --git a/examples/activation/softmaxgrad/kernel_impl/softmaxgrad_kernel.h
b/examples/activation/softmaxgrad/kernel_impl/softmaxgrad_kernel.h index 4f5c10a4b92a8b6f38d980c69f8d6d6693958e7f..d8f1c25fe3e85c4665a620ea3ccb895d8257881f 100644 --- a/examples/activation/softmaxgrad/kernel_impl/softmaxgrad_kernel.h +++ b/examples/activation/softmaxgrad/kernel_impl/softmaxgrad_kernel.h @@ -12,7 +12,7 @@ #define EXAMPLES_ACTIVATION_SOFTMAXGRAD_KERNEL_H #include "kernel_operator.h" -namespace AscendC { +namespace MyCustomKernel { constexpr int32_t BUFFER_NUM = 1; struct VecTiling { @@ -51,17 +51,17 @@ public: } __aicore__ inline void Init(GM_ADDR x, GM_ADDR y, GM_ADDR z, const VecTiling& tiling) { - ASSERT(GetBlockNum() != 0 && "block dim can not be zero!"); + ASSERT(AscendC::GetBlockNum() != 0 && "block dim can not be zero!"); InitTiling(tiling); - if (GetBlockIdx() == this->usedBlockDim) { // tail core + if (AscendC::GetBlockIdx() == this->usedBlockDim) { // tail core this->singleLoopCoreRowNum = this->tailCoreSingleLoopCoreRowNum; this->singleCoreLoopCount = this->tailCoreSingleCoreLoopCount; this->leftRow = this->tailCoreSingleCoreLoopTail; } this->blockLength = this->coreRowNum * this->columnLength; - uint32_t offset1 = this->blockLength * GetBlockIdx(); + uint32_t offset1 = this->blockLength * AscendC::GetBlockIdx(); xGm.SetGlobalBuffer((__gm__ float*)x + offset1, this->blockLength); yGm.SetGlobalBuffer((__gm__ float*)y + offset1, this->blockLength); @@ -76,7 +76,7 @@ public: __aicore__ inline void Process() { - if (GetBlockIdx() > usedBlockDim) { + if (AscendC::GetBlockIdx() > usedBlockDim) { return; } @@ -85,9 +85,9 @@ public: Compute(i, this->singleLoopCoreRowNum); CopyOut(i, this->singleLoopCoreRowNum); } - event_t eventIdMte3ToMte2 = static_cast(GetTPipePtr()->FetchEventID(HardEvent::MTE3_MTE2)); - SetFlag(eventIdMte3ToMte2); - WaitFlag(eventIdMte3ToMte2); + event_t eventIdMte3ToMte2 = static_cast(GetTPipePtr()->FetchEventID(AscendC::HardEvent::MTE3_MTE2)); + AscendC::SetFlag(eventIdMte3ToMte2); + AscendC::WaitFlag(eventIdMte3ToMte2); if 
(this->leftRow > 0) { CopyIn(this->singleCoreLoopCount, this->leftRow); Compute(this->singleCoreLoopCount, this->leftRow); @@ -98,22 +98,22 @@ public: private: __aicore__ inline void CopyIn(int32_t progress, uint32_t rowNum) { - LocalTensor xLocal = queueX.AllocTensor(); - LocalTensor yLocal = queueY.AllocTensor(); - DataCopy(xLocal, xGm[progress * this->tileLength], rowNum * this->columnLength); - DataCopy(yLocal, yGm[progress * this->tileLength], rowNum * this->columnLength); + AscendC::LocalTensor xLocal = queueX.AllocTensor(); + AscendC::LocalTensor yLocal = queueY.AllocTensor(); + AscendC::DataCopy(xLocal, xGm[progress * this->tileLength], rowNum * this->columnLength); + AscendC::DataCopy(yLocal, yGm[progress * this->tileLength], rowNum * this->columnLength); queueX.EnQue(xLocal); queueY.EnQue(yLocal); } __aicore__ inline void Compute(int32_t progressm, uint32_t rowNum) { - LocalTensor xLocal = queueX.DeQue(); - LocalTensor yLocal = queueY.DeQue(); - LocalTensor tmpBuffer = sharedTmpBuffer.Get(); + AscendC::LocalTensor xLocal = queueX.DeQue(); + AscendC::LocalTensor yLocal = queueY.DeQue(); + AscendC::LocalTensor tmpBuffer = sharedTmpBuffer.Get(); - SoftMaxShapeInfo srcShape = { rowNum, this->columnLength, rowNum, this->columnLength }; - SoftmaxGrad(yLocal, xLocal, yLocal, tmpBuffer, softmaxTiling, false, srcShape); + AscendC::SoftMaxShapeInfo srcShape = { rowNum, this->columnLength, rowNum, this->columnLength }; + AscendC::SoftmaxGrad(yLocal, xLocal, yLocal, tmpBuffer, softmaxTiling, false, srcShape); queueY.EnQue(yLocal); queueX.FreeTensor(xLocal); @@ -121,19 +121,19 @@ private: __aicore__ inline void CopyOut(int32_t progress, uint32_t rowNum) { - LocalTensor zLocal = queueY.DeQue(); - DataCopy(zGm[progress * this->tileLength], zLocal, rowNum * this->columnLength); + AscendC::LocalTensor zLocal = queueY.DeQue(); + AscendC::DataCopy(zGm[progress * this->tileLength], zLocal, rowNum * this->columnLength); queueY.FreeTensor(zLocal); } private: - TPipe pipe; - 
TBuf sharedTmpBuffer; - TQue queueX; - TQue queueY; - GlobalTensor xGm; - GlobalTensor yGm; - GlobalTensor zGm; + AscendC::TPipe pipe; + AscendC::TBuf sharedTmpBuffer; + AscendC::TQue queueX; + AscendC::TQue queueY; + AscendC::GlobalTensor xGm; + AscendC::GlobalTensor yGm; + AscendC::GlobalTensor zGm; uint32_t blockLength = 0; uint32_t usedBlockDim = 0; diff --git a/examples/activation/softmaxgrad/kernel_launch_method_by_direct/softmaxgrad_custom.cpp b/examples/activation/softmaxgrad/kernel_launch_method_by_direct/softmaxgrad_custom.cpp index 2ab23bb21584e6629c9ab0c0b9c671ef57cc2b51..febf4b5e8f8641cdd42baf6bbf1f57bd8d7437ee 100644 --- a/examples/activation/softmaxgrad/kernel_launch_method_by_direct/softmaxgrad_custom.cpp +++ b/examples/activation/softmaxgrad/kernel_launch_method_by_direct/softmaxgrad_custom.cpp @@ -9,14 +9,13 @@ */ #include "../kernel_impl/softmaxgrad_kernel.h" -using namespace AscendC; -__aicore__ inline void CopyTiling(VecTiling* tiling, GM_ADDR tilingGM) +__aicore__ inline void CopyTiling(MyCustomKernel::VecTiling* tiling, GM_ADDR tilingGM) { uint32_t* ptr = reinterpret_cast(tiling); auto tiling32 = reinterpret_cast<__gm__ uint32_t*>(tilingGM); - for (int i = 0; i < sizeof(VecTiling) / sizeof(uint32_t); i++, ptr++) { + for (int i = 0; i < sizeof(MyCustomKernel::VecTiling) / sizeof(uint32_t); i++, ptr++) { *ptr = *(tiling32 + i); } return; @@ -28,9 +27,9 @@ extern "C" __global__ __aicore__ void softmaxgrad_custom(GM_ADDR x, GM_ADDR y, G if ASCEND_IS_AIC { return; } - VecTiling tilingData; + MyCustomKernel::VecTiling tilingData; CopyTiling(&tilingData, tiling); - KernelSoftmax op; + MyCustomKernel::KernelSoftmax op; op.Init(x, y, z, tilingData); op.Process(); } diff --git a/examples/activation/softmaxgrad/kernel_launch_method_by_framework/op_kernel/softmaxgrad_custom.cpp b/examples/activation/softmaxgrad/kernel_launch_method_by_framework/op_kernel/softmaxgrad_custom.cpp index 
ef21e9b7e523f682c9402f7087c6dae06fe17c95..d466c81b9d53753980db1ae918db873e1b028ffa 100644 --- a/examples/activation/softmaxgrad/kernel_launch_method_by_framework/op_kernel/softmaxgrad_custom.cpp +++ b/examples/activation/softmaxgrad/kernel_launch_method_by_framework/op_kernel/softmaxgrad_custom.cpp @@ -9,7 +9,6 @@ */ #include "../../../../../../kernel_impl/softmaxgrad_kernel.h" -using namespace AscendC; extern "C" __global__ __aicore__ void softmaxgrad_custom(GM_ADDR x, GM_ADDR y, GM_ADDR z, GM_ADDR workspace, GM_ADDR tiling) @@ -18,9 +17,9 @@ extern "C" __global__ __aicore__ void softmaxgrad_custom(GM_ADDR x, GM_ADDR y, G return; } GET_TILING_DATA(tilingData, tiling); - VecTiling vecTiling = *reinterpret_cast(&tilingData); - KernelSoftmax op; - op.Init(x, y, z, vecTiling); + MyCustomKernel::VecTiling vecTiling = *reinterpret_cast(&tilingData); + MyCustomKernel::KernelSoftmax op; + op.Init(x, y, z, vecTiling); if (TILING_KEY_IS(1)) { op.Process(); } diff --git a/examples/activation/softmaxgradfront/kernel_impl/softmaxgradfront_kernel.h b/examples/activation/softmaxgradfront/kernel_impl/softmaxgradfront_kernel.h index 94a0659a1d7f84a7debd90017ccb410d5d522ae2..8a3fb2f2b8bf8462e9f44d2557a861ad2231ae5d 100644 --- a/examples/activation/softmaxgradfront/kernel_impl/softmaxgradfront_kernel.h +++ b/examples/activation/softmaxgradfront/kernel_impl/softmaxgradfront_kernel.h @@ -11,7 +11,7 @@ #ifndef EXAMPLES_ACTIVATION_SOFTMAXGRADFRONT_KERNEL_H #define EXAMPLES_ACTIVATION_SOFTMAXGRADFRONT_KERNEL_H #include "kernel_operator.h" -namespace AscendC { +namespace MyCustomKernel { constexpr int32_t BUFFER_NUM = 1; constexpr uint32_t FLOAT_NUM_OF_SINGEL_BLOCK = 8; constexpr uint32_t BASIC_BLOCK_ROW_FACTOR = 8; @@ -53,18 +53,18 @@ public: } __aicore__ inline void Init(GM_ADDR x, GM_ADDR y, GM_ADDR z, const VecTiling& tiling) { - ASSERT(GetBlockNum() != 0 && "block dim can not be zero!"); + ASSERT(AscendC::GetBlockNum() != 0 && "block dim can not be zero!"); InitTiling(tiling);
- if (GetBlockIdx() == this->usedBlockDim) { // tail core + if (AscendC::GetBlockIdx() == this->usedBlockDim) { // tail core this->singleLoopCoreRowNum = this->tailCoreSingleLoopCoreRowNum; this->singleCoreLoopCount = this->tailCoreSingleCoreLoopCount; this->leftRow = this->tailCoreSingleCoreLoopTail; } this->blockLength = this->coreRowNum * this->columnLength; - uint32_t offset1 = this->blockLength * GetBlockIdx(); - uint32_t offset2 = this->coreRowNum * FLOAT_NUM_OF_SINGEL_BLOCK * GetBlockIdx(); + uint32_t offset1 = this->blockLength * AscendC::GetBlockIdx(); + uint32_t offset2 = this->coreRowNum * FLOAT_NUM_OF_SINGEL_BLOCK * AscendC::GetBlockIdx(); xGm.SetGlobalBuffer((__gm__ float*)x + offset1, this->blockLength); yGm.SetGlobalBuffer((__gm__ float*)y + offset1, this->blockLength); @@ -80,7 +80,7 @@ public: __aicore__ inline void Process() { - if (GetBlockIdx() > usedBlockDim) { + if (AscendC::GetBlockIdx() > usedBlockDim) { return; } @@ -100,28 +100,28 @@ public: private: __aicore__ inline void CopyIn(int32_t progress, uint32_t rowNum) { - LocalTensor xLocal = queueX.AllocTensor(); - LocalTensor yLocal = queueY.AllocTensor(); - DataCopy(xLocal, xGm[progress * this->tileLength], rowNum * this->columnLength); - DataCopy(yLocal, yGm[progress * this->tileLength], rowNum * this->columnLength); + AscendC::LocalTensor xLocal = queueX.AllocTensor(); + AscendC::LocalTensor yLocal = queueY.AllocTensor(); + AscendC::DataCopy(xLocal, xGm[progress * this->tileLength], rowNum * this->columnLength); + AscendC::DataCopy(yLocal, yGm[progress * this->tileLength], rowNum * this->columnLength); queueX.EnQue(xLocal); queueY.EnQue(yLocal); } __aicore__ inline void Compute(int32_t progressm, uint32_t rowNum) { - LocalTensor xLocal = queueX.DeQue(); - LocalTensor yLocal = queueY.DeQue(); - LocalTensor zLocal = queueZ.AllocTensor(); - LocalTensor tmpBuffer = sharedTmpBuffer.Get(); + AscendC::LocalTensor xLocal = queueX.DeQue(); + AscendC::LocalTensor yLocal = queueY.DeQue(); + 
AscendC::LocalTensor zLocal = queueZ.AllocTensor(); + AscendC::LocalTensor tmpBuffer = sharedTmpBuffer.Get(); - SoftMaxShapeInfo srcShape = { rowNum, this->columnLength, rowNum, this->columnLength }; + AscendC::SoftMaxShapeInfo srcShape = { rowNum, this->columnLength, rowNum, this->columnLength }; if (this->singleLoopCoreRowNum % BASIC_BLOCK_ROW_FACTOR == 0 && this->columnLength % BASIC_BLOCK_COLUMN_FACTOR == 0 && this->columnLength < BASIC_BLOCK_MAX_COLUMN_LENGTH) { - SoftmaxGradFront(zLocal, xLocal, yLocal, tmpBuffer, softmaxTiling, srcShape); + AscendC::SoftmaxGradFront(zLocal, xLocal, yLocal, tmpBuffer, softmaxTiling, srcShape); } else { - SoftmaxGradFront(zLocal, xLocal, yLocal, tmpBuffer, softmaxTiling, srcShape); + AscendC::SoftmaxGradFront(zLocal, xLocal, yLocal, tmpBuffer, softmaxTiling, srcShape); } queueZ.EnQue(zLocal); @@ -131,21 +131,21 @@ private: __aicore__ inline void CopyOut(int32_t progress, uint32_t rowNum) { - LocalTensor zLocal = queueZ.DeQue(); - DataCopy(zGm[progress * this->singleLoopCoreRowNum * FLOAT_NUM_OF_SINGEL_BLOCK], zLocal, + AscendC::LocalTensor zLocal = queueZ.DeQue(); + AscendC::DataCopy(zGm[progress * this->singleLoopCoreRowNum * FLOAT_NUM_OF_SINGEL_BLOCK], zLocal, rowNum * FLOAT_NUM_OF_SINGEL_BLOCK); queueZ.FreeTensor(zLocal); } private: - TPipe pipe; - TBuf sharedTmpBuffer; - TQue queueX; - TQue queueY; - TQue queueZ; - GlobalTensor xGm; - GlobalTensor yGm; - GlobalTensor zGm; + AscendC::TPipe pipe; + AscendC::TBuf sharedTmpBuffer; + AscendC::TQue queueX; + AscendC::TQue queueY; + AscendC::TQue queueZ; + AscendC::GlobalTensor xGm; + AscendC::GlobalTensor yGm; + AscendC::GlobalTensor zGm; uint32_t blockLength = 0; uint32_t usedBlockDim = 0; diff --git a/examples/activation/softmaxgradfront/kernel_launch_method_by_direct/softmaxgradfront_custom.cpp b/examples/activation/softmaxgradfront/kernel_launch_method_by_direct/softmaxgradfront_custom.cpp index 
c4a19c76e577a1e9fcb662539b73b9385e81af3e..187857257d98443087e868ff58afb9c614264152 100644 --- a/examples/activation/softmaxgradfront/kernel_launch_method_by_direct/softmaxgradfront_custom.cpp +++ b/examples/activation/softmaxgradfront/kernel_launch_method_by_direct/softmaxgradfront_custom.cpp @@ -9,14 +9,13 @@ */ #include "../kernel_impl/softmaxgradfront_kernel.h" -using namespace AscendC; -__aicore__ inline void CopyTiling(VecTiling* tiling, GM_ADDR tilingGM) +__aicore__ inline void CopyTiling(MyCustomKernel::VecTiling* tiling, GM_ADDR tilingGM) { uint32_t* ptr = reinterpret_cast(tiling); auto tiling32 = reinterpret_cast<__gm__ uint32_t*>(tilingGM); - for (int i = 0; i < sizeof(VecTiling) / sizeof(uint32_t); i++, ptr++) { + for (int i = 0; i < sizeof(MyCustomKernel::VecTiling) / sizeof(uint32_t); i++, ptr++) { *ptr = *(tiling32 + i); } return; @@ -28,10 +27,10 @@ extern "C" __global__ __aicore__ void softmaxgradfront_custom(GM_ADDR x, GM_ADDR if ASCEND_IS_AIC { return; } - VecTiling tilingData; + MyCustomKernel::VecTiling tilingData; CopyTiling(&tilingData, tiling); - KernelSoftmax op; - op.Init(x, y, z, tilingData); + MyCustomKernel::KernelSoftmax op; + op.Init(x, y, z, tilingData); op.Process(); } diff --git a/examples/activation/softmaxgradfront/kernel_launch_method_by_framework/op_kernel/softmaxgradfront_custom.cpp b/examples/activation/softmaxgradfront/kernel_launch_method_by_framework/op_kernel/softmaxgradfront_custom.cpp index ad346e4b664e284479a9dff52a2a2432c1f053ea..3557e4c25ab0e150feb2d2f6fe69abf86ca070cf 100644 --- a/examples/activation/softmaxgradfront/kernel_launch_method_by_framework/op_kernel/softmaxgradfront_custom.cpp +++ b/examples/activation/softmaxgradfront/kernel_launch_method_by_framework/op_kernel/softmaxgradfront_custom.cpp @@ -8,7 +8,6 @@ * See LICENSE in the root of the software repository for the full text of the License.
*/ #include "../../../../../../kernel_impl/softmaxgradfront_kernel.h" -using namespace AscendC; extern "C" __global__ __aicore__ void softmaxgradfront_custom(GM_ADDR x, GM_ADDR y, GM_ADDR z, GM_ADDR workspace, GM_ADDR tiling) @@ -17,9 +16,9 @@ extern "C" __global__ __aicore__ void softmaxgradfront_custom(GM_ADDR x, GM_ADDR return; } GET_TILING_DATA(tilingData, tiling); - VecTiling vecTiling = *reinterpret_cast(&tilingData); - KernelSoftmax op; - op.Init(x, y, z, vecTiling); + MyCustomKernel::VecTiling vecTiling = *reinterpret_cast(&tilingData); + MyCustomKernel::KernelSoftmax op; + op.Init(x, y, z, vecTiling); if (TILING_KEY_IS(1)) { op.Process(); } diff --git a/examples/matrix/batch_matmul/kernel_impl/batch_matmul_custom_impl.h b/examples/matrix/batch_matmul/kernel_impl/batch_matmul_custom_impl.h index be0b4aaa27888986b9037ff39171e845d78f58f1..eab9dfd011560299465d359992af2e27b49e39f4 100644 --- a/examples/matrix/batch_matmul/kernel_impl/batch_matmul_custom_impl.h +++ b/examples/matrix/batch_matmul/kernel_impl/batch_matmul_custom_impl.h @@ -13,7 +13,6 @@ #include "kernel_operator.h" #include "lib/matmul_intf.h" -using namespace AscendC; using namespace matmul; constexpr int USED_CORE_NUM = 2; @@ -24,7 +23,7 @@ class BatchMatmulKernel { __aicore__ inline BatchMatmulKernel(){}; __aicore__ inline void Init(GM_ADDR a, GM_ADDR b, GM_ADDR bias, GM_ADDR c, GM_ADDR workspace, const TCubeTiling& tiling); template - __aicore__ inline void Process(TPipe* pipe, int32_t batchA, int32_t batchB); + __aicore__ inline void Process(AscendC::TPipe* pipe, int32_t batchA, int32_t batchB); Matmul matmulObj; private: __aicore__ inline void CalcOffset(int32_t blockIdx, const TCubeTiling& tiling, int32_t& offsetA, int32_t& offsetB, @@ -33,10 +32,10 @@ class BatchMatmulKernel { using bType = typename B_TYPE::T; using cType = typename C_TYPE::T; using biasType = typename BIAS_TYPE::T; - GlobalTensor aGlobal; - GlobalTensor bGlobal; - GlobalTensor cGlobal; - GlobalTensor biasGlobal; +
AscendC::GlobalTensor aGlobal; + AscendC::GlobalTensor bGlobal; + AscendC::GlobalTensor cGlobal; + AscendC::GlobalTensor biasGlobal; TCubeTiling tiling; }; @@ -59,7 +58,7 @@ __aicore__ inline void BatchMatmulKernel::Ini int32_t offsetB = 0; int32_t offsetC = 0; int32_t offsetBias = 0; - CalcOffset(GetBlockIdx(), tiling, offsetA, offsetB, offsetC, offsetBias); + CalcOffset(AscendC::GetBlockIdx(), tiling, offsetA, offsetB, offsetC, offsetBias); aGlobal = aGlobal[offsetA]; bGlobal = bGlobal[offsetB]; cGlobal = cGlobal[offsetC]; @@ -71,7 +70,7 @@ __aicore__ inline void BatchMatmulKernel::Ini template template -__aicore__ inline void BatchMatmulKernel::Process(TPipe* pipe, int32_t batchA, int32_t batchB) +__aicore__ inline void BatchMatmulKernel::Process(AscendC::TPipe* pipe, int32_t batchA, int32_t batchB) { int batchC = batchA > batchB ? batchA : batchB; int gLay = tiling.ALayoutInfoG > tiling.BLayoutInfoG ? tiling.ALayoutInfoG : tiling.BLayoutInfoG; diff --git a/examples/matrix/batch_matmul/kernel_launch_method_by_direct/batch_matmul_custom.cpp b/examples/matrix/batch_matmul/kernel_launch_method_by_direct/batch_matmul_custom.cpp index d115a3102dabc720482bbb40dd7b993521c6bb05..b1abfcd9f25ea7af7fb3fc9c27add3b0d2dd7952 100644 --- a/examples/matrix/batch_matmul/kernel_launch_method_by_direct/batch_matmul_custom.cpp +++ b/examples/matrix/batch_matmul/kernel_launch_method_by_direct/batch_matmul_custom.cpp @@ -12,7 +12,6 @@ #include "lib/matmul_intf.h" #include "../kernel_impl/batch_matmul_custom_impl.h" -using namespace AscendC; using namespace matmul; constexpr int32_t FULL_L1_SIZE = 512 * 1024; @@ -39,7 +38,7 @@ extern "C" __global__ __aicore__ void batch_matmul_custom(GM_ADDR a, GM_ADDR b, typedef MatmulType C_TYPE; typedef MatmulType BIAS_TYPE; BatchMatmulKernel batchMatmulKernel; - TPipe pipe; + AscendC::TPipe pipe; tiling.shareMode = 0; // 0, share mode tiling.shareL1Size = FULL_L1_SIZE; // full L1 tiling.shareL0CSize = FULL_L0C_SIZE; // full L0C diff --git 
a/examples/matrix/batch_matmul/kernel_launch_method_by_framework/op_host/batch_matmul_custom.cpp b/examples/matrix/batch_matmul/kernel_launch_method_by_framework/op_host/batch_matmul_custom.cpp index b7ceb70d73ff3879524afce531d66fcdebcd4f53..ff053d118fa21741d401bbaab613f1c5516b4491 100644 --- a/examples/matrix/batch_matmul/kernel_launch_method_by_framework/op_host/batch_matmul_custom.cpp +++ b/examples/matrix/batch_matmul/kernel_launch_method_by_framework/op_host/batch_matmul_custom.cpp @@ -28,7 +28,7 @@ static ge::graphStatus TilingFunc(gert::TilingContext* context) tiling.SaveToBuffer(context->GetRawTilingData()->GetData(), context->GetRawTilingData()->GetCapacity()); context->GetRawTilingData()->SetDataSize(tiling.GetDataSize()); size_t userWorkspaceSize = 0; - size_t systemWorkspaceSize = ascendcPlatform.GetLibApiWorkSpaceSize(); + size_t systemWorkspaceSize = static_cast(ascendcPlatform.GetLibApiWorkSpaceSize()); size_t *currentWorkspace = context->GetWorkspaceSizes(1); currentWorkspace[0] = userWorkspaceSize + systemWorkspaceSize; diff --git a/examples/matrix/batch_matmul/kernel_launch_method_by_framework/op_kernel/batch_matmul_custom.cpp b/examples/matrix/batch_matmul/kernel_launch_method_by_framework/op_kernel/batch_matmul_custom.cpp index ff9547f98879fcf7ed144391e8ded631bef0c90d..a653a0f5b69e4b881cccd4f744a980576261c307 100644 --- a/examples/matrix/batch_matmul/kernel_launch_method_by_framework/op_kernel/batch_matmul_custom.cpp +++ b/examples/matrix/batch_matmul/kernel_launch_method_by_framework/op_kernel/batch_matmul_custom.cpp @@ -12,7 +12,6 @@ #include "lib/matmul_intf.h" #include "../../../../../../kernel_impl/batch_matmul_custom_impl.h" -using namespace AscendC; using namespace matmul; constexpr int32_t FULL_L1_SIZE = 512 * 1024; @@ -25,7 +24,7 @@ extern "C" __global__ __aicore__ void batch_matmul_custom(GM_ADDR a, GM_ADDR b, typedef MatmulType C_TYPE; typedef MatmulType BIAS_TYPE; BatchMatmulKernel batchMatmulKernel; - TPipe pipe; + AscendC::TPipe 
pipe; tilingData.cubeTilingData.shareMode = 0; // 0, share mode tilingData.cubeTilingData.shareL1Size = FULL_L1_SIZE; // full L1 tilingData.cubeTilingData.shareL0CSize = FULL_L0C_SIZE; // full L0C diff --git a/examples/matrix/matmul/kernel_impl/matmul_custom_impl.h b/examples/matrix/matmul/kernel_impl/matmul_custom_impl.h index ef815c6959f11c9e29dc58bb3dc19347eb33183c..c79040156cecdb1dcaea009782f9e0dad25c1a8b 100644 --- a/examples/matrix/matmul/kernel_impl/matmul_custom_impl.h +++ b/examples/matrix/matmul/kernel_impl/matmul_custom_impl.h @@ -13,7 +13,6 @@ #include "kernel_operator.h" #include "lib/matmul_intf.h" -using namespace AscendC; using namespace matmul; template @@ -22,7 +21,7 @@ class MatmulKernel { __aicore__ inline MatmulKernel(){}; __aicore__ inline void Init(GM_ADDR a, GM_ADDR b, GM_ADDR bias, GM_ADDR c, GM_ADDR workspace, const TCubeTiling& tiling); template - __aicore__ inline void Process(TPipe* pipe); + __aicore__ inline void Process(AscendC::TPipe* pipe); Matmul, MatmulType, MatmulType, @@ -32,10 +31,10 @@ class MatmulKernel { __aicore__ inline void CalcOffset(int32_t blockIdx, const TCubeTiling& tiling, int32_t& offsetA, int32_t& offsetB, int32_t& offsetC, int32_t& offsetBias, bool isAtrans, bool isBtrans); - GlobalTensor aGlobal; - GlobalTensor bGlobal; - GlobalTensor cGlobal; - GlobalTensor biasGlobal; + AscendC::GlobalTensor aGlobal; + AscendC::GlobalTensor bGlobal; + AscendC::GlobalTensor cGlobal; + AscendC::GlobalTensor biasGlobal; TCubeTiling tiling; }; @@ -55,7 +54,7 @@ __aicore__ inline void MatmulKernel::Init(GM_ADDR int32_t offsetBias = 0; bool isAtrans = true; bool isBtrans = false; - CalcOffset(GetBlockIdx(), tiling, offsetA, offsetB, offsetC, offsetBias, isAtrans, isBtrans); + CalcOffset(AscendC::GetBlockIdx(), tiling, offsetA, offsetB, offsetC, offsetBias, isAtrans, isBtrans); aGlobal = aGlobal[offsetA]; bGlobal = bGlobal[offsetB]; cGlobal = cGlobal[offsetC]; @@ -67,11 +66,11 @@ __aicore__ inline void MatmulKernel::Init(GM_ADDR 
template template -__aicore__ inline void MatmulKernel::Process(TPipe* pipe) +__aicore__ inline void MatmulKernel::Process(AscendC::TPipe* pipe) { if constexpr (setTmpSpace) { - TBuf<> tmpMMFormatUb; - LocalTensor mmformatUb; + AscendC::TBuf<> tmpMMFormatUb; + AscendC::LocalTensor mmformatUb; pipe->InitBuffer(tmpMMFormatUb, TOTAL_VEC_LOCAL_SIZE); mmformatUb = tmpMMFormatUb.Get(TOTAL_VEC_LOCAL_SIZE); matmulObj.SetLocalWorkspace(mmformatUb); @@ -86,11 +85,16 @@ __aicore__ inline void MatmulKernel::Process(TPip matmulObj.End(); } +__aicore__ inline uint32_t Ceiling(uint32_t a, uint32_t b) +{ + return (a + b - 1) / b; +} + template __aicore__ inline void MatmulKernel::CalcOffset(int32_t blockIdx, const TCubeTiling& tiling, int32_t& offsetA, int32_t& offsetB, int32_t& offsetC, int32_t& offsetBias, bool isAtrans, bool isBtrans) { - auto mSingleBlocks = Ceil(tiling.M, tiling.singleCoreM); + auto mSingleBlocks = Ceiling(tiling.M, tiling.singleCoreM); auto mCoreIndx = blockIdx % mSingleBlocks; auto nCoreIndx = blockIdx / mSingleBlocks; diff --git a/examples/matrix/matmul/kernel_launch_method_by_direct/matmul_custom.cpp b/examples/matrix/matmul/kernel_launch_method_by_direct/matmul_custom.cpp index be66d0e6b0d1713347e605321680578198accc7d..a274b83f0b1dec6b23707382c9a27a29f8b61207 100644 --- a/examples/matrix/matmul/kernel_launch_method_by_direct/matmul_custom.cpp +++ b/examples/matrix/matmul/kernel_launch_method_by_direct/matmul_custom.cpp @@ -12,7 +12,6 @@ #include "lib/matmul_intf.h" #include "../kernel_impl/matmul_custom_impl.h" -using namespace AscendC; using namespace matmul; __aicore__ inline void CopyTiling(TCubeTiling* tiling, GM_ADDR tilingGM) @@ -31,7 +30,7 @@ extern "C" __global__ __aicore__ void matmul_custom(GM_ADDR a, GM_ADDR b, GM_ADD TCubeTiling tiling; CopyTiling(&tiling, tilingGm); MatmulKernel matmulKernel; - TPipe pipe; + AscendC::TPipe pipe; REGIST_MATMUL_OBJ(&pipe, GetSysWorkSpacePtr(), matmulKernel.matmulObj, &tiling); matmulKernel.Init(a, b, nullptr, 
c, workspace, tiling); #ifdef CUSTOM_ASCEND310P diff --git a/examples/matrix/matmul/kernel_launch_method_by_framework/op_host/matmul_custom.cpp b/examples/matrix/matmul/kernel_launch_method_by_framework/op_host/matmul_custom.cpp index adab49f0ba145258e4b153a48ab9ed03a51329db..95d12755765457605eff03212ce9b4ac050e13bb 100644 --- a/examples/matrix/matmul/kernel_launch_method_by_framework/op_host/matmul_custom.cpp +++ b/examples/matrix/matmul/kernel_launch_method_by_framework/op_host/matmul_custom.cpp @@ -48,7 +48,7 @@ static ge::graphStatus TilingFunc(gert::TilingContext* context) tiling.SaveToBuffer(context->GetRawTilingData()->GetData(), context->GetRawTilingData()->GetCapacity()); context->GetRawTilingData()->SetDataSize(tiling.GetDataSize()); size_t userWorkspaceSize = 0; - size_t systemWorkspaceSize = ascendcPlatform.GetLibApiWorkSpaceSize(); + size_t systemWorkspaceSize = static_cast(ascendcPlatform.GetLibApiWorkSpaceSize()); size_t *currentWorkspace = context->GetWorkspaceSizes(1); currentWorkspace[0] = userWorkspaceSize + systemWorkspaceSize; diff --git a/examples/matrix/matmul/kernel_launch_method_by_framework/op_kernel/matmul_custom.cpp b/examples/matrix/matmul/kernel_launch_method_by_framework/op_kernel/matmul_custom.cpp index 09c40946bf186e89585f827e50dd7d5cc9f26656..af60b3d49c0b0d5d85f81757438834636294ef33 100644 --- a/examples/matrix/matmul/kernel_launch_method_by_framework/op_kernel/matmul_custom.cpp +++ b/examples/matrix/matmul/kernel_launch_method_by_framework/op_kernel/matmul_custom.cpp @@ -12,13 +12,12 @@ #include "lib/matmul_intf.h" #include "../../../../../../kernel_impl/matmul_custom_impl.h" -using namespace AscendC; using namespace matmul; extern "C" __global__ __aicore__ void matmul_custom(GM_ADDR a, GM_ADDR b, GM_ADDR bias, GM_ADDR c, GM_ADDR workspace, GM_ADDR tiling) { GET_TILING_DATA(tilingData, tiling); MatmulKernel matmulKernel; - TPipe pipe; + AscendC::TPipe pipe; REGIST_MATMUL_OBJ(&pipe, GetSysWorkSpacePtr(), 
matmulKernel.matmulObj,&tilingData.cubeTilingData); matmulKernel.Init(a, b, bias, c, workspace, tilingData.cubeTilingData); if (TILING_KEY_IS(1)) { diff --git a/examples/normalization/layernorm/kernel_impl/layernorm_custom.h b/examples/normalization/layernorm/kernel_impl/layernorm_custom.h index c838f882bd0ff96213fe48016056d51f6c333546..435a7abcb28cb16191bd07b1221de5cd77ab557b 100644 --- a/examples/normalization/layernorm/kernel_impl/layernorm_custom.h +++ b/examples/normalization/layernorm/kernel_impl/layernorm_custom.h @@ -12,7 +12,7 @@ #define EXAMPLES_NORMALIZATION_LAYERNORM_CUSTOM_H #include "kernel_operator.h" -namespace AscendC { +namespace MyCustomKernel { struct VecTiling { LayerNormTiling layernormTilingData; float epsilon = 0; @@ -58,13 +58,13 @@ public: private: __aicore__ inline void CopyIn() { - LocalTensor inputXLocal = inQueueX.AllocTensor(); - LocalTensor gammaLocal = inQueueGamma.AllocTensor(); - LocalTensor betaLocal = inQueueBeta.AllocTensor(); + AscendC::LocalTensor inputXLocal = inQueueX.AllocTensor(); + AscendC::LocalTensor gammaLocal = inQueueGamma.AllocTensor(); + AscendC::LocalTensor betaLocal = inQueueBeta.AllocTensor(); - DataCopy(inputXLocal, inputXGlobal, bshLength); - DataCopy(gammaLocal, gammGlobal, hLength); - DataCopy(betaLocal, betaGlobal, hLength); + AscendC::DataCopy(inputXLocal, inputXGlobal, bshLength); + AscendC::DataCopy(gammaLocal, gammGlobal, hLength); + AscendC::DataCopy(betaLocal, betaGlobal, hLength); inQueueX.EnQue(inputXLocal); inQueueGamma.EnQue(gammaLocal); @@ -72,15 +72,15 @@ private: } __aicore__ inline void Compute() { - LocalTensor inputXLocal = inQueueX.DeQue(); - LocalTensor gammaLocal = inQueueGamma.DeQue(); - LocalTensor betaLocal = inQueueBeta.DeQue(); + AscendC::LocalTensor inputXLocal = inQueueX.DeQue(); + AscendC::LocalTensor gammaLocal = inQueueGamma.DeQue(); + AscendC::LocalTensor betaLocal = inQueueBeta.DeQue(); - LocalTensor outputLocal = outQueue.AllocTensor(); - LocalTensor meanLocal = 
outQueueMean.AllocTensor(); - LocalTensor varianceLocal = outQueueVariance.AllocTensor(); + AscendC::LocalTensor outputLocal = outQueue.AllocTensor(); + AscendC::LocalTensor meanLocal = outQueueMean.AllocTensor(); + AscendC::LocalTensor varianceLocal = outQueueVariance.AllocTensor(); - LayerNorm(outputLocal, meanLocal, varianceLocal, inputXLocal, gammaLocal, betaLocal, + AscendC::LayerNorm(outputLocal, meanLocal, varianceLocal, inputXLocal, gammaLocal, betaLocal, (float)epsilon, tiling_); outQueue.EnQue(outputLocal); @@ -93,13 +93,13 @@ private: } __aicore__ inline void CopyOut() { - LocalTensor outputLocal = outQueue.DeQue(); - LocalTensor meanLocal = outQueueMean.DeQue(); - LocalTensor varianceLocal = outQueueVariance.DeQue(); + AscendC::LocalTensor outputLocal = outQueue.DeQue(); + AscendC::LocalTensor meanLocal = outQueueMean.DeQue(); + AscendC::LocalTensor varianceLocal = outQueueVariance.DeQue(); - DataCopy(outputGlobal, outputLocal, bshLength); - DataCopy(outputMeanGlobal, meanLocal, bsLength); - DataCopy(outputVarianceGlobal, varianceLocal, bsLength); + AscendC::DataCopy(outputGlobal, outputLocal, bshLength); + AscendC::DataCopy(outputMeanGlobal, meanLocal, bsLength); + AscendC::DataCopy(outputVarianceGlobal, varianceLocal, bsLength); outQueue.FreeTensor(outputLocal); outQueueMean.FreeTensor(meanLocal); @@ -107,20 +107,20 @@ private: } private: - GlobalTensor inputXGlobal; - GlobalTensor gammGlobal; - GlobalTensor betaGlobal; - GlobalTensor outputGlobal; - GlobalTensor outputMeanGlobal; - GlobalTensor outputVarianceGlobal; - - TPipe pipe; - TQue inQueueX; - TQue inQueueGamma; - TQue inQueueBeta; - TQue outQueue; - TQue outQueueMean; - TQue outQueueVariance; + AscendC::GlobalTensor inputXGlobal; + AscendC::GlobalTensor gammGlobal; + AscendC::GlobalTensor betaGlobal; + AscendC::GlobalTensor outputGlobal; + AscendC::GlobalTensor outputMeanGlobal; + AscendC::GlobalTensor outputVarianceGlobal; + + AscendC::TPipe pipe; + AscendC::TQue inQueueX; + AscendC::TQue 
inQueueGamma; + AscendC::TQue inQueueBeta; + AscendC::TQue outQueue; + AscendC::TQue outQueueMean; + AscendC::TQue outQueueVariance; uint32_t bLength; uint32_t sLength; diff --git a/examples/normalization/layernorm/kernel_launch_method_by_direct/layernorm_custom.cpp b/examples/normalization/layernorm/kernel_launch_method_by_direct/layernorm_custom.cpp index b225a098fd29e5733b593f34e007dc602364eb89..9358217ee14ba879894feab10aadc1678c6d086a 100644 --- a/examples/normalization/layernorm/kernel_launch_method_by_direct/layernorm_custom.cpp +++ b/examples/normalization/layernorm/kernel_launch_method_by_direct/layernorm_custom.cpp @@ -9,14 +9,13 @@ */ #include "kernel_operator.h" #include "../kernel_impl/layernorm_custom.h" -using namespace AscendC; -__aicore__ inline void CopyTiling(VecTiling *tiling, GM_ADDR tilingGM) +__aicore__ inline void CopyTiling(MyCustomKernel::VecTiling *tiling, GM_ADDR tilingGM) { uint32_t *ptr = reinterpret_cast(tiling); auto tiling32 = reinterpret_cast<__gm__ uint32_t *>(tilingGM); - for (int i = 0; i < sizeof(VecTiling) / sizeof(uint32_t); i++, ptr++) { + for (int i = 0; i < sizeof(MyCustomKernel::VecTiling) / sizeof(uint32_t); i++, ptr++) { *ptr = *(tiling32 + i); } return; @@ -28,9 +27,9 @@ extern "C" __global__ __aicore__ void layernorm_custom(GM_ADDR inputXGm, GM_ADDR if ASCEND_IS_AIC { return; } - VecTiling tilingData; + MyCustomKernel::VecTiling tilingData; CopyTiling(&tilingData, tiling); - KernelLayernorm op; + MyCustomKernel::KernelLayernorm op; op.Init(inputXGm, gammaGm, betaGm, outputGm, outputMeanGm, outputVarianceGm, tilingData); op.Process(); } diff --git a/examples/normalization/layernorm/kernel_launch_method_by_framework/op_kernel/layernorm_custom.cpp b/examples/normalization/layernorm/kernel_launch_method_by_framework/op_kernel/layernorm_custom.cpp index 17db0fbd8853495ff206f8c7127d5705e9dd2a87..2bcc86185f9e6aee430abc9fab0690ba5ed54d59 100644 --- 
a/examples/normalization/layernorm/kernel_launch_method_by_framework/op_kernel/layernorm_custom.cpp +++ b/examples/normalization/layernorm/kernel_launch_method_by_framework/op_kernel/layernorm_custom.cpp @@ -9,15 +9,14 @@ */ #include "../../../../../../kernel_impl/layernorm_custom.h" -using namespace AscendC; extern "C" __global__ __aicore__ void layernorm_custom(GM_ADDR inputXGm, GM_ADDR gammaGm, GM_ADDR betaGm, GM_ADDR outputGm, GM_ADDR outputMeanGm, GM_ADDR outputVarianceGm, GM_ADDR workspace, GM_ADDR tiling) { GET_TILING_DATA(tilingData, tiling); - VecTiling vecTiling = *reinterpret_cast(&tilingData); + MyCustomKernel::VecTiling vecTiling = *reinterpret_cast(&tilingData); if (TILING_KEY_IS(1)) { - KernelLayernorm op; + MyCustomKernel::KernelLayernorm op; op.Init(inputXGm, gammaGm, betaGm, outputGm, outputMeanGm, outputVarianceGm, vecTiling); op.Process(); } diff --git a/examples/pad/broadcast/kernel_impl/broadcast_custom.h b/examples/pad/broadcast/kernel_impl/broadcast_custom.h index d816db349f2414fe5563ac24549178e9712594c4..6df59bf8caa2d5ea9e845a5a7e1fc4ac42c48523 100644 --- a/examples/pad/broadcast/kernel_impl/broadcast_custom.h +++ b/examples/pad/broadcast/kernel_impl/broadcast_custom.h @@ -13,7 +13,6 @@ #include "kernel_operator.h" constexpr int32_t BUFFER_NUM = 1; -using namespace AscendC; template class KernelBroadcastCustom { public: @@ -22,7 +21,7 @@ public: __aicore__ inline void Init(GM_ADDR x, GM_ADDR y, uint32_t srcLength, uint32_t dstLength, const uint32_t srcShape[dim], const uint32_t dstShape[dim]) { - AscendCUtils::SetOverflow(1); + AscendC::AscendCUtils::SetOverflow(1); xGm.SetGlobalBuffer((__gm__ T *)x, srcLength); yGm.SetGlobalBuffer((__gm__ T *)y, dstLength); @@ -44,32 +43,32 @@ public: private: __aicore__ inline void CopyIn() { - LocalTensor xLocal = inQueueX.AllocTensor(); - DataCopy(xLocal, xGm, srcLength_); + AscendC::LocalTensor xLocal = inQueueX.AllocTensor(); + AscendC::DataCopy(xLocal, xGm, srcLength_); inQueueX.EnQue(xLocal); } 
__aicore__ inline void Compute() { - LocalTensor xLocal = inQueueX.DeQue(); - LocalTensor yLocal = outQueueY.AllocTensor(); - BroadCast(yLocal, xLocal, dstShape_, srcShape_); + AscendC::LocalTensor xLocal = inQueueX.DeQue(); + AscendC::LocalTensor yLocal = outQueueY.AllocTensor(); + AscendC::BroadCast(yLocal, xLocal, dstShape_, srcShape_); outQueueY.EnQue(yLocal); inQueueX.FreeTensor(xLocal); } __aicore__ inline void CopyOut() { - LocalTensor yLocal = outQueueY.DeQue(); - DataCopy(yGm, yLocal, dstLength_); + AscendC::LocalTensor yLocal = outQueueY.DeQue(); + AscendC::DataCopy(yGm, yLocal, dstLength_); outQueueY.FreeTensor(yLocal); } private: - TPipe pipe; - TQue inQueueX; - TQue outQueueY; - GlobalTensor xGm; - GlobalTensor yGm; + AscendC::TPipe pipe; + AscendC::TQue inQueueX; + AscendC::TQue outQueueY; + AscendC::GlobalTensor xGm; + AscendC::GlobalTensor yGm; uint32_t srcLength_; uint32_t dstLength_; const uint32_t *srcShape_{nullptr}; diff --git a/examples/pad/broadcast/kernel_launch_method_by_direct/broadcast_custom.cpp b/examples/pad/broadcast/kernel_launch_method_by_direct/broadcast_custom.cpp index 28a17316371aeff1da749b7c4dd0fba2cd78c2fa..395ac0349a140c5a2cee5bec5ef3cf23d45723fd 100644 --- a/examples/pad/broadcast/kernel_launch_method_by_direct/broadcast_custom.cpp +++ b/examples/pad/broadcast/kernel_launch_method_by_direct/broadcast_custom.cpp @@ -8,7 +8,6 @@ * See LICENSE in the root of the software repository for the full text of the License. 
*/ #include "../kernel_impl/broadcast_custom.h" -using namespace AscendC; struct BroadcastTilingData { uint32_t dim{0}; uint32_t axis{0}; diff --git a/examples/pad/broadcast/kernel_launch_method_by_framework/op_kernel/broadcast_custom.cpp b/examples/pad/broadcast/kernel_launch_method_by_framework/op_kernel/broadcast_custom.cpp index a8fd04643557b09f3948d026b810cd7125580f72..230a5975c6b557a8cfa7cae6768c3a1946bc33da 100644 --- a/examples/pad/broadcast/kernel_launch_method_by_framework/op_kernel/broadcast_custom.cpp +++ b/examples/pad/broadcast/kernel_launch_method_by_framework/op_kernel/broadcast_custom.cpp @@ -8,7 +8,6 @@ * See LICENSE in the root of the software repository for the full text of the License. */ #include "../../../../../../kernel_impl/broadcast_custom.h" -using namespace AscendC; extern "C" __global__ __aicore__ void broadcast_custom(GM_ADDR x, GM_ADDR y, GM_ADDR workspace, GM_ADDR tiling) {