diff --git a/operator_contrib/UnalignAddCustomSample/FrameworkLaunch/AddCustom/op_host/add_custom.cpp b/operator_contrib/UnalignAddCustomSample/FrameworkLaunch/AddCustom/op_host/add_custom.cpp index b61a67f119503aa924b79f4d6df351bddbad69d5..7e0207dae757c89054d0d79e69dd6f61f6d8386f 100644 --- a/operator_contrib/UnalignAddCustomSample/FrameworkLaunch/AddCustom/op_host/add_custom.cpp +++ b/operator_contrib/UnalignAddCustomSample/FrameworkLaunch/AddCustom/op_host/add_custom.cpp @@ -13,7 +13,7 @@ static ge::graphStatus TilingFunc(gert::TilingContext* context) auto ascendcPlatform = platform_ascendc::PlatformAscendC(context->GetPlatformInfo()); ascendcPlatform.GetCoreMemSize(platform_ascendc::CoreMemType::UB, ubSize); auto coreNum = ascendcPlatform.GetCoreNum(); - + uint64_t choose=0; // Based on the input length and the number of inputs, the number of bytes of the input data type is obtained uint32_t inputNum = context->GetInputShape(0)->GetStorageShape().GetShapeSize(); uint32_t typeLength = 0; @@ -26,13 +26,16 @@ static ge::graphStatus TilingFunc(gert::TilingContext* context) // The number of 32B data blocks that can be used for each data. DOUBLE BUFFER is already counted here uint32_t tileBlockNum = (ubSize / BLOCK_SIZE / BUFFER_NUM) / ubDataNumber; uint32_t tileDataNum = (tileBlockNum * BLOCK_SIZE) / inputBytes; - + + // Input data for 32B alignment uint32_t inputLengthAlgin32 = (((inputLength + BLOCK_SIZE - 1) / BLOCK_SIZE) * BLOCK_SIZE); + // There is at least 32B of data on each core, satisfying several settings for several cores. The maximum number of audits is the actual number of audits coreNum = (coreNum < inputLengthAlgin32 / BLOCK_SIZE) ? coreNum : inputLengthAlgin32 / BLOCK_SIZE; coreNum = (coreNum >= 1) ? coreNum : 1; uint32_t everyCoreInputBlockNum = inputLengthAlgin32 / BLOCK_SIZE / coreNum; + uint32_t tailBlockNum = (inputLengthAlgin32 / BLOCK_SIZE) % coreNum; // Small chunks are calculated and sliced several times using the number of data on each core @@ -51,6 +54,29 @@ static ge::graphStatus TilingFunc(gert::TilingContext* context) uint32_t bigTailDataNum = bigCoreDataNum - tileDataNum * bigTileNum; bigTailDataNum = bigTailDataNum == 0 ? tileDataNum : bigTailDataNum; + + + + if(0!=tailBlockNum) + { + + context->SetTilingKey(1); + } + else{ + context->SetTilingKey(0); + } + + if(tileDataNum>=inputNum) + { + smallCoreDataNum=inputLengthAlgin32/inputBytes; + coreNum=1; + tileDataNum=smallCoreDataNum; + finalSmallTileNum=1; + smallTailDataNum=smallCoreDataNum; + + context->SetTilingKey(0); + } + std::cout<<"tileDataNum"<tileDataNum * sizeof(half)); pipe.InitBuffer(tmp2, this->tileDataNum * sizeof(half)); } - __aicore__ inline void Process() + __aicore__ inline void Init_tailBlockNum_0(GM_ADDR x, GM_ADDR y, GM_ADDR z, uint32_t smallCoreDataNum, + uint32_t bigCoreDataNum, uint32_t finalBigTileNum, + uint32_t finalSmallTileNum, uint32_t tileDataNum, + uint32_t smallTailDataNum, uint32_t bigTailDataNum, + uint32_t tailBlockNum) + { + ASSERT(AscendC::GetBlockNum() != 0 && "block dim can not be zero!"); + uint32_t coreNum = AscendC::GetBlockIdx(); + //AscendC::printf("Init_tailBlockNum_0\n"); + this->tileDataNum = tileDataNum; + this->coreDataNum = smallCoreDataNum; + this->tileNum = finalSmallTileNum; + this->tailDataNum = smallTailDataNum; + uint32_t globalBufferIndex = smallCoreDataNum * AscendC::GetBlockIdx(); + xGm.SetGlobalBuffer((__gm__ TYPE_X*)x + globalBufferIndex, this->coreDataNum); + yGm.SetGlobalBuffer((__gm__ TYPE_Y*)y + globalBufferIndex, this->coreDataNum); + zGm.SetGlobalBuffer((__gm__ TYPE_Z*)z + globalBufferIndex, this->coreDataNum); + pipe.InitBuffer(inQueueX, BUFFER_NUM, this->tileDataNum * sizeof(TYPE_X)); + pipe.InitBuffer(inQueueY, BUFFER_NUM, this->tileDataNum * sizeof(TYPE_Y)); + pipe.InitBuffer(outQueueZ, BUFFER_NUM, this->tileDataNum * sizeof(TYPE_Z)); + pipe.InitBuffer(tmp1, this->tileDataNum * sizeof(half)); + pipe.InitBuffer(tmp2, this->tileDataNum * sizeof(half)); + } + __aicore__ inline void Process_int8_t() + { + int32_t loopCount = this->tileNum; + this->processDataNum = this->tileDataNum; + for (int32_t i = 0; i < loopCount-1; i++) { + CopyIn(i); + Compute_int8_t(i); + CopyOut(i); + } + this->processDataNum = this->tailDataNum; + CopyIn(loopCount-1); + Compute_int8_t(loopCount-1); + CopyOut(loopCount-1); + + } + __aicore__ inline void Process_addSuport() { int32_t loopCount = this->tileNum; this->processDataNum = this->tileDataNum; - for (int32_t i = 0; i < loopCount; i++) { - if (i == this->tileNum - 1) { - this->processDataNum = this->tailDataNum; - } + for (int32_t i = 0; i < loopCount-1; i++) { CopyIn(i); - Compute(i); + Compute_addSuport(i); CopyOut(i); } + this->processDataNum = this->tailDataNum; + CopyIn(loopCount-1); + Compute_addSuport(loopCount-1); + CopyOut(loopCount-1); + } private: @@ -66,26 +106,32 @@ private: inQueueX.EnQue(xLocal); inQueueY.EnQue(yLocal); } - __aicore__ inline void Compute(int32_t progress) + __aicore__ inline void Compute_addSuport(int32_t progress) { AscendC::LocalTensor xLocal = inQueueX.DeQue(); AscendC::LocalTensor yLocal = inQueueY.DeQue(); AscendC::LocalTensor zLocal = outQueueZ.AllocTensor(); - if constexpr (std::is_same_v) { - auto p1 = tmp1.Get(); - auto p2 = tmp2.Get(); - AscendC::Cast(p1, xLocal, AscendC::RoundMode::CAST_NONE, this->processDataNum); - AscendC::Cast(p2, yLocal, AscendC::RoundMode::CAST_NONE, this->processDataNum); - AscendC::Add(p2, p1, p2, this->processDataNum); - AscendC::Cast(p1.ReinterpretCast(), p2, AscendC::RoundMode::CAST_RINT, this->processDataNum); - AscendC::ShiftLeft(p1.ReinterpretCast(), p1.ReinterpretCast(), int16_t(8), this->processDataNum); - AscendC::ShiftRight(p1.ReinterpretCast(), p1.ReinterpretCast(), int16_t(8), this->processDataNum); - AscendC::Cast(p2, p1.ReinterpretCast(), AscendC::RoundMode::CAST_NONE, this->processDataNum); - AscendC::Cast(zLocal, p2, AscendC::RoundMode::CAST_NONE, this->processDataNum); - } - else { - AscendC::Add(zLocal, xLocal, yLocal, this->processDataNum); - } + AscendC::Add(zLocal, xLocal, yLocal, this->processDataNum); + outQueueZ.EnQue(zLocal); + inQueueX.FreeTensor(xLocal); + inQueueY.FreeTensor(yLocal); + } + __aicore__ inline void Compute_int8_t(int32_t progress) + { + AscendC::LocalTensor xLocal = inQueueX.DeQue(); + AscendC::LocalTensor yLocal = inQueueY.DeQue(); + AscendC::LocalTensor zLocal = outQueueZ.AllocTensor(); + auto p1 = tmp1.Get(); + auto p2 = tmp2.Get(); + AscendC::Cast(p1 ,xLocal, AscendC::RoundMode::CAST_NONE, this->processDataNum); + AscendC::Cast(p2, yLocal, AscendC::RoundMode::CAST_NONE, this->processDataNum); + AscendC::Add(p2, p1, p2, this->processDataNum); + AscendC::Cast(p1.ReinterpretCast(), p2, AscendC::RoundMode::CAST_RINT, this->processDataNum); + AscendC::ShiftLeft(p1.ReinterpretCast(), p1.ReinterpretCast(), int16_t(8), this->processDataNum); + AscendC::ShiftRight(p1.ReinterpretCast(), p1.ReinterpretCast(), int16_t(8), this->processDataNum); + AscendC::Cast(p2, p1.ReinterpretCast(), AscendC::RoundMode::CAST_NONE, this->processDataNum); + AscendC::Cast(zLocal, p2, AscendC::RoundMode::CAST_NONE, this->processDataNum); + outQueueZ.EnQue(zLocal); inQueueX.FreeTensor(xLocal); inQueueY.FreeTensor(yLocal); @@ -101,7 +147,7 @@ private: AscendC::TPipe pipe; AscendC::TQue inQueueX, inQueueY; AscendC::TQue outQueueZ; - AscendC::TBuf tmp1, tmp2; + AscendC::TBuf tmp1 ,tmp2; AscendC::GlobalTensor xGm; AscendC::GlobalTensor yGm; AscendC::GlobalTensor zGm; @@ -116,12 +162,42 @@ extern "C" __global__ __aicore__ void add_custom(GM_ADDR x, GM_ADDR y, GM_ADDR z { GET_TILING_DATA(tiling_data, tiling); KernelAdd op; - op.Init(x, y, z, tiling_data.smallCoreDataNum, - tiling_data.bigCoreDataNum, tiling_data.finalBigTileNum, - tiling_data.finalSmallTileNum, tiling_data.tileDataNum, - tiling_data.smallTailDataNum, tiling_data.bigTailDataNum, - tiling_data.tailBlockNum); - op.Process(); + //AscendC::printf(" tiling_data.smallCoreDataNum is %d\n"); + if(TILING_KEY_IS(1)) + { + op.Init(x, y, z, tiling_data.smallCoreDataNum, + tiling_data.bigCoreDataNum, tiling_data.finalBigTileNum, + tiling_data.finalSmallTileNum, tiling_data.tileDataNum, + tiling_data.smallTailDataNum, tiling_data.bigTailDataNum, + tiling_data.tailBlockNum); + if constexpr (std::is_same_v) { + op.Process_int8_t(); + } + else + { + + op.Process_addSuport(); + } + + } + else if(TILING_KEY_IS(0)) + { + op.Init_tailBlockNum_0(x, y, z, tiling_data.smallCoreDataNum, + tiling_data.bigCoreDataNum, tiling_data.finalBigTileNum, + tiling_data.finalSmallTileNum, tiling_data.tileDataNum, + tiling_data.smallTailDataNum, tiling_data.bigTailDataNum, + tiling_data.tailBlockNum); + if constexpr (std::is_same_v) + { + op.Process_int8_t(); + } + else + { + + op.Process_addSuport(); + } + } + } #ifndef ASCENDC_CPU_DEBUG