diff --git a/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/README.md b/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/README.md index bda5a062601ebb31f1e05f760a139e4786f83927..7bf8b7bea5c498d5bde6e260d2b75b0ac4e0c3f8 100644 --- a/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/README.md +++ b/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/README.md @@ -53,41 +53,74 @@ z = x + y 算子类型(OpType)Add 算子输入nameshapedata typeformat -x-bfloat16_t/int8_t/float/half/int16_t/int32_tND -y-bfloat16_t/int8_t/float/half/int16_t/int32_tND +x(32, 1024) / (8, 1023) / (32, 1023) / (17, 1023) +bfloat16_t/int8_t/float/half/int16_t/int32_tND +y(32, 1024) / (8, 1023) / (32, 1023) / (17, 1023)bfloat16_t/int8_t/float/half/int16_t/int32_tND -算子输出z-bfloat16_t/int8_t/float/half/int16_t/int32_tND +算子输出z(32, 1024) / (8, 1023) / (32, 1023) / (17, 1023)bfloat16_t/int8_t/float/half/int16_t/int32_tND 核函数名add_custom + +该算子支持在不同输入数据长度下采用不同策略对数据进行核间切分以及tiling,shape对应场景如下: + + 1. 核间均分,单核计算量满足32B对齐: (32, 1024) + + 2. 核间均分,单核计算量不满足32B对齐: (8, 1023) + + 3. 核间不均分,单核计算量满足32B对齐: (32, 1023) + + 4. 核间不均分,单核计算量不满足32B对齐:(17, 1023) + + + - VectorAddMultiCoreWithTilingBroadcast - - + + - +
算子类型(OpType)Add
算子输入nameshapedata typeformat
x(m, n) / (1, n) / (m, 1)bfloat16_t/int8_t/float/half/int16_t/int32_tND
y(m, n) / (1, n) / (m, 1)bfloat16_t/int8_t/float/half/int16_t/int32_tND
x +axis = 0:(8, 1024) / (8, 1022) / (17, 1024) / (17, 1022) +axis = 1:(16, 1) / (16, 1) / (20, 1) / (20, 1) +bfloat16_t/int8_t/float/half/int16_t/int32_tND
y +axis = 0:(8, 1024) / (8, 1022) / (17, 1024) / (17, 1022) +axis = 1:(16, 256) / (16, 255) / (20, 256) / (20, 255)bfloat16_t/int8_t/float/half/int16_t/int32_tND
算子输出z(m * n))bfloat16_t/int8_t/float/half/int16_t/int32_tND
算子输出z +axis = 0:(8, 1024) / (8, 1022) / (17, 1024) / (17, 1022) +axis = 1:(16, 256) / (16, 255) / (20, 256) / (20, 255)bfloat16_t/int8_t/float/half/int16_t/int32_tND
核函数名add_custom
-该算子支持对任一输入的某个轴进行广播,对应关系如下 -- 针对axis = 0(第一个轴)进行广播 - - x(m, n), y(1, n) - - x(1, n), y(m, n) - 注意,该场景下m需满足32字节对齐。 +该算子支持对任一输入的某个轴进行广播,其中输入x,y的shape可以交换。表格中提到的shape对应不同的策略对数据进行核间切分以及tiling,对应关系如下: + + - 针对axis = 0(第一个轴)进行广播 + + 1. 核间均分,单核计算量对齐 x shape:(8, 1024), y shape:(1, 1024); + + 2. 核间均分,单核计算量非对齐 x shape:(8, 1022), y shape:(1, 1022); + + 3. 核间不均分,单核计算量对齐 x shape:(17, 1024), y shape:(1, 1024); + + 4. 核间不均分,单核计算量非对齐 x shape:(17, 1022), y shape:(1, 1022)。 + - 针对axis = 1(第二个轴)进行广播 - - x(m, n), y(m, 1) - - x(m, 1), y(m, n) + 1. 核间均分,单核计算量对齐 x shape:(16, 1), y shape:(16, 256); + + 2. 核间均分,单核计算量非对齐 x shape:(16, 1), y shape:(16, 255); + + 3. 核间不均分,单核计算量对齐 x shape:(20, 1), y shape:(20, 256); + + 4. 核间不均分,单核计算量非对齐 x shape:(20, 1), y shape:(20, 255)。 diff --git a/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddMultiCoreWithTiling/add_custom_tiling.cpp b/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddMultiCoreWithTiling/add_custom_tiling.cpp index cfb838a6736daf78a0695e4767e4de1ac79b2a13..ca3f2cee640a3d384cbe21a49cbc7cfe7b5606eb 100644 --- a/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddMultiCoreWithTiling/add_custom_tiling.cpp +++ b/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddMultiCoreWithTiling/add_custom_tiling.cpp @@ -15,9 +15,8 @@ constexpr uint32_t DATA_TYPE_SIZE[] = {2, 2, 4, 1, 2, 4}; constexpr uint32_t BLOCK_SIZE = 32; constexpr uint32_t BUFFER_NUM = 2; -constexpr uint32_t UB_BLOCK_NUM = 21; // UB最大可以使用的block数量 +constexpr uint32_t UB_BLOCK_NUM = 100; // UB最大可以使用的block数量 constexpr uint32_t MAX_AVAILABLE_UB_BLOCK_NUM = UB_BLOCK_NUM / BUFFER_NUM * BUFFER_NUM; -constexpr uint32_t BLOCK_DIM = 9; // tiling参数计算函数 void TilingParamsCalc(uint32_t length, uint32_t alignNum, @@ -43,7 +42,7 @@ void TilingParamsCalc(uint32_t length, uint32_t alignNum, } } -void GenerateTilingData(uint8_t* tilingBuf) +void GenerateTilingData(uint8_t* tilingBuf, uint32_t blockDim) { uint32_t totalLength; uint32_t dataTypeSize; @@ -55,7 +54,7 @@ void GenerateTilingData(uint8_t* tilingBuf) dataTypeSize = DATA_TYPE_SIZE[tiling->dataType]; uint32_t alignNum = BLOCK_SIZE / dataTypeSize; - assert(alignNum != 0U); + assert((alignNum != 0U) && (blockDim != 0U)); /** 计算使用的核数 **/ /* 如果传入数据的长度非32B对齐, 计算对齐后的长度*/ totalLengthAligned = (totalLength % alignNum == 0U)? @@ -63,11 +62,11 @@ void GenerateTilingData(uint8_t* tilingBuf) ((static_cast(totalLength) + alignNum - 1) / alignNum) * alignNum; /* 核间可均分场景 */ - if ((totalLengthAligned / alignNum) % BLOCK_DIM == 0U) { + if ((totalLengthAligned / alignNum) % blockDim == 0U) { uint32_t tileNum = 0; uint32_t tileLength = 0; uint32_t lastTileLength = 0; - blockLength = totalLengthAligned / BLOCK_DIM; + blockLength = totalLengthAligned / blockDim; TilingParamsCalc(blockLength, alignNum, tileNum, tileLength, lastTileLength); tiling->blockLength = blockLength; @@ -76,12 +75,12 @@ void GenerateTilingData(uint8_t* tilingBuf) tiling->lastTileLength = lastTileLength; tiling->isEvenCore = 1U; } else { // 核间不可均分 - uint32_t formerNum = (totalLengthAligned / alignNum) % BLOCK_DIM; - uint32_t tailNum = BLOCK_DIM - formerNum; + uint32_t formerNum = (totalLengthAligned / alignNum) % blockDim; + uint32_t tailNum = blockDim - formerNum; // 计算整块和尾块的数据量 uint32_t formerLength = - static_cast(((totalLengthAligned + BLOCK_DIM - 1) / BLOCK_DIM + alignNum - 1) / alignNum) * alignNum; - uint32_t tailLength = (totalLengthAligned / BLOCK_DIM / alignNum) * alignNum; + static_cast(((totalLengthAligned + blockDim - 1) / blockDim + alignNum - 1) / alignNum) * alignNum; + uint32_t tailLength = (totalLengthAligned / blockDim / alignNum) * alignNum; uint32_t formerTileNum; uint32_t formerTileLength; diff --git a/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddMultiCoreWithTiling/main.cpp b/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddMultiCoreWithTiling/main.cpp index 3e35408b79332cc38821eb83e66ee7a3e8dd41fe..acfeb24cfef7a08511424ca951f5b0bb595fc5ca 100644 --- a/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddMultiCoreWithTiling/main.cpp +++ b/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddMultiCoreWithTiling/main.cpp @@ -17,12 +17,13 @@ #include "tikicpulib.h" extern "C" __global__ __aicore__ void add_custom(GM_ADDR x, GM_ADDR y, GM_ADDR z, AddCustomTilingData tiling); #endif -extern void GenerateTilingData(uint8_t* tilingBuf); +extern void GenerateTilingData(uint8_t* tilingBuf, uint32_t blockDim); int32_t main(int32_t argc, char *argv[]) { - constexpr uint32_t BLOCK_DIM = 9; + constexpr uint32_t BLOCK_DIM = 8; + constexpr uint32_t DATA_BLOCK_BYTE = 32; constexpr uint32_t DATA_TYPE_SIZE[] = {2, 2, 4, 1, 2, 4}; uint8_t *tiling = nullptr; size_t tilingSize = 17 * sizeof(uint32_t); @@ -44,15 +45,15 @@ int32_t main(int32_t argc, char *argv[]) CHECK_ACL(aclrtMallocHost((void **)(&tiling), tilingSize)); ReadFile("./input/input_tiling.bin", tilingSize, tiling, tilingSize); #endif - GenerateTilingData(tiling); + GenerateTilingData(tiling, BLOCK_DIM); uint32_t dataTypeSize = DATA_TYPE_SIZE[reinterpret_cast(tiling)->dataType]; size_t inputByteSize = reinterpret_cast(tiling)->totalLength * dataTypeSize; size_t outputByteSize = reinterpret_cast(tiling)->totalLength * dataTypeSize; #ifdef ASCENDC_CPU_DEBUG - uint8_t *x = (uint8_t *)AscendC::GmAlloc(inputByteSize); - uint8_t *y = (uint8_t *)AscendC::GmAlloc(inputByteSize); - uint8_t *z = (uint8_t *)AscendC::GmAlloc(outputByteSize); + uint8_t *x = (uint8_t *)AscendC::GmAlloc((inputByteSize + DATA_BLOCK_BYTE - 1) / DATA_BLOCK_BYTE * DATA_BLOCK_BYTE); + uint8_t *y = (uint8_t *)AscendC::GmAlloc((inputByteSize + DATA_BLOCK_BYTE - 1) / DATA_BLOCK_BYTE * DATA_BLOCK_BYTE); + uint8_t *z = (uint8_t *)AscendC::GmAlloc((inputByteSize + DATA_BLOCK_BYTE - 1) / DATA_BLOCK_BYTE * DATA_BLOCK_BYTE); ReadFile("./input/input_x.bin", inputByteSize, x, inputByteSize); ReadFile("./input/input_y.bin", inputByteSize, y, inputByteSize); diff --git a/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddMultiCoreWithTiling/scripts/gen_data.py b/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddMultiCoreWithTiling/scripts/gen_data.py index 70f6bae3098905b8465f73c4a2e000e0a7a6d9a7..e12157d5ab8b67bb0bd51643e544e23d81d201e4 100644 --- a/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddMultiCoreWithTiling/scripts/gen_data.py +++ b/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddMultiCoreWithTiling/scripts/gen_data.py @@ -14,13 +14,24 @@ bfloat16 = tf.bfloat16.as_numpy_dtype dtype_emu = {bfloat16: 0, np.float16: 1, np.float32: 2, np.int8: 3, np.int16: 4, np.int32: 5} def gen_golden_data_simple(): - input_shape = [32, 6737] dtype = np.int8 + # dtype = bfloat16 + + ## 核间均分,单核计算量对齐: + # input_shape = [32, 1024] + + ## 核间均分,单核计算量非对齐: + # input_shape = [8, 1023] + + ## 核间不均分,单核计算量对齐: + # input_shape = [32, 1023] + + ## 核间不均分,单核计算量非对齐: + input_shape = [17, 1023] input_x = np.random.uniform(-50, 50, input_shape).astype(dtype) input_y = np.random.uniform(-50, 50, input_shape).astype(dtype) golden = (input_x + input_y).astype(dtype) - tiling = np.array([input_shape[0] * input_shape[1], dtype_emu[dtype]], dtype=np.uint32) tiling.tofile("./input/input_tiling.bin") diff --git a/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddMultiCoreWithTilingBroadcast/add_custom.cpp b/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddMultiCoreWithTilingBroadcast/add_custom.cpp index a33c6a9308a0e4d084bb58feccd4cca5383d0d32..029e3e098bd1ab976cfba02aa75f9b8d4dd18762 100644 --- a/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddMultiCoreWithTilingBroadcast/add_custom.cpp +++ b/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddMultiCoreWithTilingBroadcast/add_custom.cpp @@ -36,9 +36,11 @@ public: if (tiling.xLen > tiling.yLen) { longerInputPtr = x; shorterInputPtr = y; + this->shorterAxisLen = tiling.yLen; } else { longerInputPtr = y; shorterInputPtr = x; + this->shorterAxisLen = tiling.xLen; } this->coef = tiling.coef; if (tiling.isEvenCore) { @@ -47,7 +49,7 @@ public: this->lastTileLength = tiling.lastTileLength; xGm.SetGlobalBuffer((__gm__ bfloat16_t *)longerInputPtr + tiling.blockLength * AscendC::GetBlockIdx(), tiling.blockLength); - yGm.SetGlobalBuffer((__gm__ bfloat16_t *)shorterInputPtr, this->coef); + yGm.SetGlobalBuffer((__gm__ bfloat16_t *)shorterInputPtr, this->shorterAxisLen); zGm.SetGlobalBuffer((__gm__ bfloat16_t *)z + tiling.blockLength * AscendC::GetBlockIdx(), tiling.blockLength); } else { if (AscendC::GetBlockIdx() < tiling.formerNum) { @@ -56,7 +58,7 @@ public: this->lastTileLength = tiling.formerLastTileLength; xGm.SetGlobalBuffer((__gm__ bfloat16_t *)longerInputPtr + tiling.formerLength * AscendC::GetBlockIdx(), tiling.formerLength); - yGm.SetGlobalBuffer((__gm__ bfloat16_t *)shorterInputPtr, this->coef); + yGm.SetGlobalBuffer((__gm__ bfloat16_t *)shorterInputPtr, this->shorterAxisLen); zGm.SetGlobalBuffer((__gm__ bfloat16_t *)z + tiling.formerLength * AscendC::GetBlockIdx(), tiling.formerLength); } else { this->tileNum = tiling.tailTileNum; @@ -65,16 +67,15 @@ public: xGm.SetGlobalBuffer((__gm__ bfloat16_t *)longerInputPtr + tiling.formerLength * tiling.formerNum + tiling.tailLength * (AscendC::GetBlockIdx() - tiling.formerNum), tiling.tailLength); - yGm.SetGlobalBuffer((__gm__ bfloat16_t *)shorterInputPtr, this->coef); + yGm.SetGlobalBuffer((__gm__ bfloat16_t *)shorterInputPtr, this->shorterAxisLen); zGm.SetGlobalBuffer((__gm__ bfloat16_t *)z + tiling.formerLength * tiling.formerNum + tiling.tailLength * (AscendC::GetBlockIdx() - tiling.formerNum), tiling.tailLength); } } pipe.InitBuffer(inQueueX, BUFFER_NUM, this->tileLength * sizeof(bfloat16_t)); - pipe.InitBuffer(inQueueY, BUFFER_NUM, this->coef * sizeof(bfloat16_t)); + pipe.InitBuffer(inQueueY, BUFFER_NUM, this->tileLength * sizeof(bfloat16_t)); pipe.InitBuffer(outQueueZ, BUFFER_NUM, this->tileLength * sizeof(bfloat16_t)); - pipe.InitBuffer(tmpBuf2, this->tileLength * sizeof(bfloat16_t)); pipe.InitBuffer(tmpBuf0, this->tileLength * sizeof(float)); pipe.InitBuffer(tmpBuf1, this->tileLength * sizeof(float)); } @@ -94,17 +95,11 @@ private: AscendC::LocalTensor xLocal = inQueueX.AllocTensor(); AscendC::LocalTensor yLocal = inQueueY.AllocTensor(); - AscendC::DataCopyExtParams copyXParams = {1, (uint32_t)(this->tileLength * sizeof(bfloat16_t)), 0, 0, 0}; - AscendC::DataCopyExtParams copyYParams = {1, (uint32_t)(this->coef * sizeof(bfloat16_t)), 0, 0, 0}; + AscendC::DataCopyExtParams copyParams = {1, (uint32_t)(this->tileLength * sizeof(bfloat16_t)), 0, 0, 0}; AscendC::DataCopyPadExtParams padParams = {false, 0, 0, 0}; - if ((progress == (this->tileNum * BUFFER_NUM - 2)) || (progress == (this->tileNum * BUFFER_NUM - 1))) { - AscendC::DataCopyPad(xLocal, xGm[(progress - LAST_TWO_TILE) * this->tileLength + this->lastTileLength], - copyXParams, padParams); - } else { - AscendC::DataCopyPad(xLocal, xGm[progress * this->tileLength], copyXParams, padParams); - } - AscendC::DataCopyPad(yLocal, yGm, copyYParams, padParams); + AscendC::DataCopyPad(xLocal, xGm[progress * this->tileLength], copyParams, padParams); + AscendC::DataCopyPad(yLocal, yGm[(progress % BUFFER_NUM) * this->tileLength], copyParams, padParams); inQueueX.EnQue(xLocal); inQueueY.EnQue(yLocal); } @@ -114,16 +109,11 @@ private: AscendC::LocalTensor yLocal = inQueueY.DeQue(); AscendC::LocalTensor zLocal = outQueueZ.AllocTensor(); - AscendC::LocalTensor broadcastTmpTensor = tmpBuf2.Get(); - uint32_t dstShape[] = {this->tileLength / this->coef, this->coef}; - uint32_t srcShape[] = {1, this->coef}; - AscendC::Broadcast(broadcastTmpTensor, yLocal, dstShape, srcShape); - AscendC::LocalTensor tmpTensor0 = tmpBuf0.Get(); AscendC::LocalTensor tmpTensor1 = tmpBuf1.Get(); AscendC::Cast(tmpTensor0, xLocal, AscendC::RoundMode::CAST_NONE, this->tileLength); - AscendC::Cast(tmpTensor1, broadcastTmpTensor, AscendC::RoundMode::CAST_NONE, this->tileLength); + AscendC::Cast(tmpTensor1, yLocal, AscendC::RoundMode::CAST_NONE, this->tileLength); AscendC::Add(tmpTensor0, tmpTensor0, tmpTensor1, this->tileLength); AscendC::Cast(zLocal, tmpTensor0, AscendC::RoundMode::CAST_RINT, this->tileLength); @@ -137,11 +127,8 @@ private: { AscendC::LocalTensor zLocal = outQueueZ.DeQue(); AscendC::DataCopyExtParams copyParams = {1, (uint32_t)(this->tileLength * sizeof(bfloat16_t)), 0, 0, 0}; - if ((progress == (this->tileNum * BUFFER_NUM - 2)) || (progress == (this->tileNum * BUFFER_NUM - 1))) { - AscendC::DataCopyPad(zGm[(progress - LAST_TWO_TILE) * this->tileLength + this->lastTileLength], zLocal, copyParams); - } else { - AscendC::DataCopyPad(zGm[progress * this->tileLength], zLocal, copyParams); - } + + AscendC::DataCopyPad(zGm[progress * this->tileLength], zLocal, copyParams); outQueueZ.FreeTensor(zLocal); } @@ -153,7 +140,6 @@ private: AscendC::TBuf tmpBuf0; AscendC::TBuf tmpBuf1; - AscendC::TBuf tmpBuf2; AscendC::GlobalTensor xGm; AscendC::GlobalTensor yGm; @@ -163,6 +149,7 @@ private: uint32_t tileNum; uint32_t tileLength; uint32_t lastTileLength; + uint32_t shorterAxisLen; }; template <> class KernelAdd { @@ -175,9 +162,11 @@ public: if (tiling.xLen > tiling.yLen) { longerInputPtr = x; shorterInputPtr = y; + this->shorterAxisLen = tiling.yLen; } else { longerInputPtr = y; shorterInputPtr = x; + this->shorterAxisLen = tiling.xLen; } this->coef = tiling.coef; if (tiling.isEvenCore) { @@ -186,7 +175,7 @@ public: this->lastTileLength = tiling.lastTileLength; xGm.SetGlobalBuffer((__gm__ int8_t *)longerInputPtr + tiling.blockLength * AscendC::GetBlockIdx(), tiling.blockLength); - yGm.SetGlobalBuffer((__gm__ int8_t *)shorterInputPtr, this->coef); + yGm.SetGlobalBuffer((__gm__ int8_t *)shorterInputPtr, this->shorterAxisLen); zGm.SetGlobalBuffer((__gm__ int8_t *)z + tiling.blockLength * AscendC::GetBlockIdx(), tiling.blockLength); } else { if (AscendC::GetBlockIdx() < tiling.formerNum) { @@ -195,7 +184,7 @@ public: this->lastTileLength = tiling.formerLastTileLength; xGm.SetGlobalBuffer((__gm__ int8_t *)longerInputPtr + tiling.formerLength * AscendC::GetBlockIdx(), tiling.formerLength); - yGm.SetGlobalBuffer((__gm__ int8_t *)shorterInputPtr, this->coef); + yGm.SetGlobalBuffer((__gm__ int8_t *)shorterInputPtr, this->shorterAxisLen); zGm.SetGlobalBuffer((__gm__ int8_t *)z + tiling.formerLength * AscendC::GetBlockIdx(), tiling.formerLength); } else { this->tileNum = tiling.tailTileNum; @@ -204,17 +193,15 @@ public: xGm.SetGlobalBuffer((__gm__ int8_t *)longerInputPtr + tiling.formerLength * tiling.formerNum + tiling.tailLength * (AscendC::GetBlockIdx() - tiling.formerNum), tiling.tailLength); - yGm.SetGlobalBuffer((__gm__ int8_t *)shorterInputPtr, this->coef); + yGm.SetGlobalBuffer((__gm__ int8_t *)shorterInputPtr, this->shorterAxisLen); zGm.SetGlobalBuffer((__gm__ int8_t *)z + tiling.formerLength * tiling.formerNum + tiling.tailLength * (AscendC::GetBlockIdx() - tiling.formerNum), tiling.tailLength); } } pipe.InitBuffer(inQueueX, BUFFER_NUM, this->tileLength * sizeof(int8_t)); - pipe.InitBuffer(inQueueY, BUFFER_NUM, this->coef * sizeof(int8_t)); + pipe.InitBuffer(inQueueY, BUFFER_NUM, this->tileLength * sizeof(int8_t)); pipe.InitBuffer(outQueueZ, BUFFER_NUM, this->tileLength * sizeof(int8_t)); - pipe.InitBuffer(tmpBuf2, this->tileLength * sizeof(int8_t)); - pipe.InitBuffer(tmpBuf0, this->tileLength * sizeof(half)); pipe.InitBuffer(tmpBuf1, this->tileLength * sizeof(half)); } @@ -234,17 +221,11 @@ private: AscendC::LocalTensor xLocal = inQueueX.AllocTensor(); AscendC::LocalTensor yLocal = inQueueY.AllocTensor(); - AscendC::DataCopyExtParams copyXParams = {1, (uint32_t)(this->tileLength * sizeof(int8_t)), 0, 0, 0}; - AscendC::DataCopyExtParams copyYParams = {1, (uint32_t)(this->coef * sizeof(int8_t)), 0, 0, 0}; + AscendC::DataCopyExtParams copyParams = {1, (uint32_t)(this->tileLength * sizeof(int8_t)), 0, 0, 0}; AscendC::DataCopyPadExtParams padParams = {false, 0, 0, 0}; - if ((progress == (this->tileNum * BUFFER_NUM - 2)) || (progress == (this->tileNum * BUFFER_NUM - 1))) { - AscendC::DataCopyPad(xLocal, xGm[(progress - LAST_TWO_TILE) * this->tileLength + this->lastTileLength], - copyXParams, padParams); - } else { - AscendC::DataCopyPad(xLocal, xGm[progress * this->tileLength], copyXParams, padParams); - } - AscendC::DataCopyPad(yLocal, yGm, copyYParams, padParams); + AscendC::DataCopyPad(xLocal, xGm[progress * this->tileLength], copyParams, padParams); + AscendC::DataCopyPad(yLocal, yGm[(progress % BUFFER_NUM) * this->tileLength], copyParams, padParams); inQueueX.EnQue(xLocal); inQueueY.EnQue(yLocal); } @@ -254,19 +235,15 @@ private: AscendC::LocalTensor yLocal = inQueueY.DeQue(); AscendC::LocalTensor zLocal = outQueueZ.AllocTensor(); - AscendC::LocalTensor broadcastTmpTensor = tmpBuf2.Get(); - uint32_t dstShape[] = {this->tileLength / this->coef, this->coef}; - uint32_t srcShape[] = {1, this->coef}; - AscendC::Broadcast(broadcastTmpTensor, yLocal, dstShape, srcShape); - AscendC::LocalTensor tmpTensor0 = tmpBuf0.Get(); AscendC::LocalTensor tmpTensor1 = tmpBuf1.Get(); AscendC::Cast(tmpTensor0, xLocal, AscendC::RoundMode::CAST_NONE, this->tileLength); - AscendC::Cast(tmpTensor1, broadcastTmpTensor, AscendC::RoundMode::CAST_NONE, this->tileLength); + AscendC::Cast(tmpTensor1, yLocal, AscendC::RoundMode::CAST_NONE, this->tileLength); AscendC::Add(tmpTensor0, tmpTensor0, tmpTensor1, this->tileLength); - AscendC::Cast(zLocal, tmpTensor0, AscendC::RoundMode::CAST_NONE, this->tileLength); + AscendC::Cast(zLocal, tmpTensor0, AscendC::RoundMode::CAST_NONE, this->tileLength); + outQueueZ.EnQue(zLocal); inQueueX.FreeTensor(xLocal); @@ -276,11 +253,8 @@ private: { AscendC::LocalTensor zLocal = outQueueZ.DeQue(); AscendC::DataCopyExtParams copyParams = {1, (uint32_t)(this->tileLength * sizeof(int8_t)), 0, 0, 0}; - if ((progress == (this->tileNum * BUFFER_NUM - 2)) || (progress == (this->tileNum * BUFFER_NUM - 1))) { - AscendC::DataCopyPad(zGm[(progress - LAST_TWO_TILE) * this->tileLength + this->lastTileLength], zLocal, copyParams); - } else { - AscendC::DataCopyPad(zGm[progress * this->tileLength], zLocal, copyParams); - } + + AscendC::DataCopyPad(zGm[progress * this->tileLength], zLocal, copyParams); outQueueZ.FreeTensor(zLocal); } @@ -292,7 +266,6 @@ private: AscendC::TBuf tmpBuf0; AscendC::TBuf tmpBuf1; - AscendC::TBuf tmpBuf2; AscendC::GlobalTensor xGm; AscendC::GlobalTensor yGm; @@ -302,8 +275,9 @@ private: uint32_t tileNum; uint32_t tileLength; uint32_t lastTileLength; + uint32_t shorterAxisLen; }; - + template class KernelAdd { public: __aicore__ inline KernelAdd() {} @@ -314,9 +288,11 @@ public: if (tiling.xLen > tiling.yLen) { longerInputPtr = x; shorterInputPtr = y; + this->shorterAxisLen = tiling.yLen; } else { longerInputPtr = y; shorterInputPtr = x; + this->shorterAxisLen = tiling.xLen; } this->coef = tiling.coef; if (tiling.isEvenCore) { @@ -325,7 +301,7 @@ public: this->lastTileLength = tiling.lastTileLength; xGm.SetGlobalBuffer((__gm__ dataType *)longerInputPtr + tiling.blockLength * AscendC::GetBlockIdx(), tiling.blockLength); - yGm.SetGlobalBuffer((__gm__ dataType *)shorterInputPtr, this->coef); + yGm.SetGlobalBuffer((__gm__ dataType *)shorterInputPtr, this->shorterAxisLen); zGm.SetGlobalBuffer((__gm__ dataType *)z + tiling.blockLength * AscendC::GetBlockIdx(), tiling.blockLength); } else { if (AscendC::GetBlockIdx() < tiling.formerNum) { @@ -334,7 +310,7 @@ public: this->lastTileLength = tiling.formerLastTileLength; xGm.SetGlobalBuffer((__gm__ dataType *)longerInputPtr + tiling.formerLength * AscendC::GetBlockIdx(), tiling.formerLength); - yGm.SetGlobalBuffer((__gm__ dataType *)shorterInputPtr, this->coef); + yGm.SetGlobalBuffer((__gm__ dataType *)shorterInputPtr, this->shorterAxisLen); zGm.SetGlobalBuffer((__gm__ dataType *)z + tiling.formerLength * AscendC::GetBlockIdx(), tiling.formerLength); } else { this->tileNum = tiling.tailTileNum; @@ -343,16 +319,14 @@ public: xGm.SetGlobalBuffer((__gm__ dataType *)longerInputPtr + tiling.formerLength * tiling.formerNum + tiling.tailLength * (AscendC::GetBlockIdx() - tiling.formerNum), tiling.tailLength); - yGm.SetGlobalBuffer((__gm__ dataType *)shorterInputPtr, this->coef); + yGm.SetGlobalBuffer((__gm__ dataType *)shorterInputPtr, this->shorterAxisLen); zGm.SetGlobalBuffer((__gm__ dataType *)z + tiling.formerLength * tiling.formerNum + tiling.tailLength * (AscendC::GetBlockIdx() - tiling.formerNum), tiling.tailLength); } } pipe.InitBuffer(inQueueX, BUFFER_NUM, this->tileLength * sizeof(dataType)); - pipe.InitBuffer(inQueueY, BUFFER_NUM, this->coef * sizeof(dataType)); + pipe.InitBuffer(inQueueY, BUFFER_NUM, this->tileLength * sizeof(dataType)); pipe.InitBuffer(outQueueZ, BUFFER_NUM, this->tileLength * sizeof(dataType)); - - pipe.InitBuffer(tmpBuf2, this->tileLength * sizeof(dataType)); } __aicore__ inline void Process() { @@ -370,17 +344,11 @@ private: AscendC::LocalTensor xLocal = inQueueX.AllocTensor(); AscendC::LocalTensor yLocal = inQueueY.AllocTensor(); - AscendC::DataCopyExtParams copyXParams = {1, (uint32_t)(this->tileLength * sizeof(dataType)), 0, 0, 0}; - AscendC::DataCopyExtParams copyYParams = {1, (uint32_t)(this->coef * sizeof(dataType)), 0, 0, 0}; + AscendC::DataCopyExtParams copyParams = {1, (uint32_t)(this->tileLength * sizeof(dataType)), 0, 0, 0}; AscendC::DataCopyPadExtParams padParams = {false, 0, 0, 0}; - if ((progress == (this->tileNum * BUFFER_NUM - 2)) || (progress == (this->tileNum * BUFFER_NUM - 1))) { - AscendC::DataCopyPad(xLocal, xGm[(progress - LAST_TWO_TILE) * this->tileLength + this->lastTileLength], - copyXParams, padParams); - } else { - AscendC::DataCopyPad(xLocal, xGm[progress * this->tileLength], copyXParams, padParams); - } - AscendC::DataCopyPad(yLocal, yGm, copyYParams, padParams); + AscendC::DataCopyPad(xLocal, xGm[progress * this->tileLength], copyParams, padParams); + AscendC::DataCopyPad(yLocal, yGm[(progress % BUFFER_NUM) * this->tileLength], copyParams, padParams); inQueueX.EnQue(xLocal); inQueueY.EnQue(yLocal); } @@ -390,12 +358,7 @@ private: AscendC::LocalTensor yLocal = inQueueY.DeQue(); AscendC::LocalTensor zLocal = outQueueZ.AllocTensor(); - AscendC::LocalTensor broadcastTmpTensor = tmpBuf2.Get(); - uint32_t dstShape[] = {this->tileLength / this->coef, this->coef}; - uint32_t srcShape[] = {1, this->coef}; - AscendC::Broadcast(broadcastTmpTensor, yLocal, dstShape, srcShape); - - AscendC::Add(zLocal, xLocal, broadcastTmpTensor, this->tileLength); + AscendC::Add(zLocal, xLocal, yLocal, this->tileLength); outQueueZ.EnQue(zLocal); inQueueX.FreeTensor(xLocal); @@ -405,11 +368,8 @@ private: { AscendC::LocalTensor zLocal = outQueueZ.DeQue(); AscendC::DataCopyExtParams copyParams = {1, (uint32_t)(this->tileLength * sizeof(dataType)), 0, 0, 0}; - if ((progress == (this->tileNum * BUFFER_NUM - 2)) || (progress == (this->tileNum * BUFFER_NUM - 1))) { - AscendC::DataCopyPad(zGm[(progress - LAST_TWO_TILE) * this->tileLength + this->lastTileLength], zLocal, copyParams); - } else { - AscendC::DataCopyPad(zGm[progress * this->tileLength], zLocal, copyParams); - } + + AscendC::DataCopyPad(zGm[progress * this->tileLength], zLocal, copyParams); outQueueZ.FreeTensor(zLocal); } @@ -419,8 +379,6 @@ private: AscendC::TQue inQueueY; AscendC::TQue outQueueZ; - AscendC::TBuf tmpBuf2; - AscendC::GlobalTensor xGm; AscendC::GlobalTensor yGm; AscendC::GlobalTensor zGm; @@ -429,6 +387,7 @@ private: uint32_t tileNum; uint32_t tileLength; uint32_t lastTileLength; + uint32_t shorterAxisLen; }; // 针对axis = 1的场景 diff --git a/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddMultiCoreWithTilingBroadcast/add_custom_tiling.cpp b/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddMultiCoreWithTilingBroadcast/add_custom_tiling.cpp index 85e624b071ca20184412c6e7cbc4bf223e29e319..1690d529ad86df8dd2c638f9a4cd08512823dbb2 100644 --- a/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddMultiCoreWithTilingBroadcast/add_custom_tiling.cpp +++ b/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddMultiCoreWithTilingBroadcast/add_custom_tiling.cpp @@ -15,9 +15,8 @@ constexpr uint32_t DATA_TYPE_SIZE[] = {2, 2, 4, 1, 2, 4}; constexpr uint32_t BLOCK_SIZE = 32; constexpr uint32_t BUFFER_NUM = 2; -constexpr uint32_t UB_BLOCK_NUM = 50; // UB最大可以使用的block数量 +constexpr uint32_t UB_BLOCK_NUM = 100; // UB最大可以使用的block数量 constexpr uint32_t MAX_AVAILABLE_UB_BLOCK_NUM = UB_BLOCK_NUM / BUFFER_NUM * BUFFER_NUM; -constexpr uint32_t BLOCK_DIM = 8; void TilingParamsCalc(uint32_t length, uint32_t ubBlockNum, uint32_t& tileNum, uint32_t& tileLength, uint32_t& lastTileLength) { @@ -41,7 +40,7 @@ void TilingParamsCalc(uint32_t length, uint32_t ubBlockNum, } } -void GenerateTilingData(uint8_t* tilingBuf) +void GenerateTilingData(uint8_t* tilingBuf, uint32_t blockDim) { uint32_t xLen; uint32_t yLen; @@ -56,19 +55,26 @@ void GenerateTilingData(uint8_t* tilingBuf) totalLength = (xLen > yLen)? xLen : yLen; uint32_t alignNum = BLOCK_SIZE / dataTypeSize; + assert((alignNum != 0U) && (blockDim != 0U)); uint32_t shorterAxisLen = (xLen < yLen)? xLen : yLen; - uint32_t coef = totalLength / shorterAxisLen; + uint32_t alignCoef = (tiling->axis == 0U)? shorterAxisLen : totalLength / shorterAxisLen; + uint32_t divDimCoef = (tiling->axis == 0U)? totalLength / shorterAxisLen : shorterAxisLen; uint32_t ubBlockAligned = - (MAX_AVAILABLE_UB_BLOCK_NUM * alignNum / (coef * BUFFER_NUM) * (coef * BUFFER_NUM) == 0U)? - MAX_AVAILABLE_UB_BLOCK_NUM : MAX_AVAILABLE_UB_BLOCK_NUM * alignNum / (coef * BUFFER_NUM) * (coef * BUFFER_NUM); + (MAX_AVAILABLE_UB_BLOCK_NUM * alignNum / (alignCoef * BUFFER_NUM) * (alignCoef * BUFFER_NUM) == 0U)? + MAX_AVAILABLE_UB_BLOCK_NUM : MAX_AVAILABLE_UB_BLOCK_NUM * alignNum / (alignCoef * BUFFER_NUM) * (alignCoef * BUFFER_NUM); - if (shorterAxisLen % (BLOCK_DIM * BUFFER_NUM) == 0U) { - uint32_t blockLength = shorterAxisLen / BLOCK_DIM * coef; + if (divDimCoef % blockDim == 0U) { + uint32_t blockLength = divDimCoef / blockDim * alignCoef; uint32_t tileNum = 0; uint32_t tileLength = 0; uint32_t lastTileLength = 0; - - TilingParamsCalc(blockLength, ubBlockAligned, tileNum, tileLength, lastTileLength); + if (tiling->axis == 0U) { + tileNum = blockLength / shorterAxisLen; + tileLength = shorterAxisLen; + lastTileLength = tileLength; + } else { + TilingParamsCalc(blockLength, ubBlockAligned, tileNum, tileLength, lastTileLength); + } tiling->blockLength = blockLength; tiling->tileNum = tileNum; @@ -76,11 +82,11 @@ void GenerateTilingData(uint8_t* tilingBuf) tiling->lastTileLength = lastTileLength; tiling->isEvenCore = 1U; } else { - uint32_t formerNum = (shorterAxisLen / BUFFER_NUM) % BLOCK_DIM; - uint32_t tailNum = BLOCK_DIM - formerNum; + uint32_t formerNum = (divDimCoef) % blockDim; + uint32_t tailNum = blockDim - formerNum; - uint32_t formerLength = (((shorterAxisLen / BUFFER_NUM) / BLOCK_DIM) + 1) * BUFFER_NUM * coef; - uint32_t tailLength = ((shorterAxisLen / BUFFER_NUM) / BLOCK_DIM) * BUFFER_NUM * coef; + uint32_t formerLength = (divDimCoef / blockDim + 1) * alignCoef; + uint32_t tailLength = divDimCoef / blockDim * alignCoef; uint32_t formerTileNum; uint32_t formerTileLength; @@ -89,11 +95,20 @@ void GenerateTilingData(uint8_t* tilingBuf) uint32_t tailTileNum; uint32_t tailTileLength; uint32_t tailLastTileLength; + if (tiling->axis == 0) { + formerTileNum = formerLength / shorterAxisLen; + formerTileLength = shorterAxisLen; + formerLastTileLength = shorterAxisLen; - TilingParamsCalc(formerLength, ubBlockAligned, - formerTileNum, formerTileLength, formerLastTileLength); - TilingParamsCalc(tailLength, ubBlockAligned, - tailTileNum, tailTileLength, tailLastTileLength); + tailTileNum = tailLength / shorterAxisLen; + tailTileLength = shorterAxisLen; + tailLastTileLength = shorterAxisLen; + } else { + TilingParamsCalc(formerLength, ubBlockAligned, + formerTileNum, formerTileLength, formerLastTileLength); + TilingParamsCalc(tailLength, ubBlockAligned, + tailTileNum, tailTileLength, tailLastTileLength); + } tiling->formerNum = formerNum; tiling->formerLength = formerLength; diff --git a/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddMultiCoreWithTilingBroadcast/main.cpp b/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddMultiCoreWithTilingBroadcast/main.cpp index b5c077951a778a38029743958fa617406706c785..3b76bdf692877925c24b0c5c6a06d351bb58f0a4 100644 --- a/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddMultiCoreWithTilingBroadcast/main.cpp +++ b/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddMultiCoreWithTilingBroadcast/main.cpp @@ -17,7 +17,7 @@ #include "tikicpulib.h" extern "C" __global__ __aicore__ void add_custom(GM_ADDR x, GM_ADDR y, GM_ADDR z, AddCustomTilingData tiling); #endif -extern void GenerateTilingData(uint8_t* tilingBuf); +extern void GenerateTilingData(uint8_t* tilingBuf, uint32_t blockDim); int32_t main(int32_t argc, char *argv[]) @@ -43,7 +43,7 @@ int32_t main(int32_t argc, char *argv[]) CHECK_ACL(aclrtMallocHost((void **)(&tiling), tilingSize)); ReadFile("./input/input_tiling.bin", tilingSize, tiling, tilingSize); #endif - GenerateTilingData(tiling); + GenerateTilingData(tiling, BLOCK_DIM); uint32_t dataTypeSize = DATA_TYPE_SIZE[reinterpret_cast(tiling)->dataType]; uint32_t xLen = reinterpret_cast(tiling)->xLen; uint32_t yLen = reinterpret_cast(tiling)->yLen; diff --git a/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddMultiCoreWithTilingBroadcast/scripts/gen_data.py b/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddMultiCoreWithTilingBroadcast/scripts/gen_data.py index ce8a8f4ae6efd89f422f7de6b198243d6211c9cc..95cd2435ce5f7fa8fff7caf2c086e58ca7c1db24 100644 --- a/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddMultiCoreWithTilingBroadcast/scripts/gen_data.py +++ b/operator/ascendc/0_introduction/21_vectoradd_kernellaunch/VectorAddMultiCoreWithTilingBroadcast/scripts/gen_data.py @@ -14,13 +14,41 @@ bfloat16 = tf.bfloat16.as_numpy_dtype dtype_emu = {bfloat16: 0, np.float16: 1, np.float32: 2, np.int8: 3, np.int16: 4, np.int32: 5} def gen_golden_data_simple(): - input_shape_x = [32, 128] - input_shape_y = [1, 128] + # dtype = np.float32 + # dtype = bfloat16 + dtype = np.int8 - #input_shape_x = [14, 1] - #input_shape_y = [14, 280] + ## Broadcast场景 axis = 0时, 核间均分, 单核计算量对齐 + # input_shape_x = [8, 1024] + # input_shape_y = [1, 1024] - dtype = np.int8 + ## Broadcast场景 axis = 0时, 核间均分, 单核计算量非对齐 + # input_shape_x = [8, 1022] + # input_shape_y = [1, 1022] + + ## Broadcast场景 axis = 0时, 核间不均分, 单核计算量对齐 + input_shape_x = [17, 1024] + input_shape_y = [1, 1024] + + ## Broadcast场景 axis = 0时, 核间不均分, 单核计算量非对齐 + input_shape_x = [17, 1022] + input_shape_y = [1, 1022] + + ## Broadcast场景 axis = 1时, 核间均分, 单核计算量对齐 + # input_shape_x = [16, 1] + # input_shape_y = [16, 256] + + ## Broadcast场景 axis = 1时, 核间均分, 单核计算量非对齐 + # input_shape_x = [16, 1] + # input_shape_y = [16, 255] + + ## Broadcast场景 axis = 1时, 核间不均分, 单核计算量对齐 + # input_shape_x = [20, 1] + # input_shape_y = [20, 256] + + ## Broadcast场景 axis = 1时, 核间不均分, 单核计算量非对齐 + # input_shape_x = [20, 1] + # input_shape_y = [20, 255] input_x = np.random.uniform(-50, 50, input_shape_x).astype(dtype) input_y = np.random.uniform(-50, 50, input_shape_y).astype(dtype)