diff --git a/examples/activation/softmaxflash/kernel_impl/softmaxflash_kernel.h b/examples/activation/softmaxflash/kernel_impl/softmaxflash_kernel.h
index dccee537962e7fe81b635cd4677a3c9256a755d1..b1d75f4cc8341993d5519f82371ae5725c4c8257 100644
--- a/examples/activation/softmaxflash/kernel_impl/softmaxflash_kernel.h
+++ b/examples/activation/softmaxflash/kernel_impl/softmaxflash_kernel.h
@@ -13,7 +13,7 @@
 #include "kernel_operator.h"
 
 namespace MyCustomKernel {
-constexpr int32_t BUFFER_NUM = 2;
+constexpr int32_t BUFFER_NUM = 1;
 constexpr uint32_t FLOAT_NUM_OF_SINGEL_BLOCK = 8;
 constexpr uint32_t BASIC_BLOCK_ROW_FACTOR = 8;
 constexpr uint32_t BASIC_BLOCK_COLUMN_FACTOR = 64;
@@ -146,12 +146,18 @@ private:
         AscendC::LocalTensor<T> xLocal = queueX.AllocTensor<T>();
         uint32_t offset = this->singleLoopCoreRowNum * this->columnLength;
         for (uint32_t i = 0; i < rowNum; i++) {
-            AscendC::DataCopy(xLocal[i * columnNum], xGm[rowIndex * offset + i * this->columnLength + kIndex * this->splitK],
-                columnNum);
+            AscendC::DataCopy(xLocal[i * columnNum], xGm[rowIndex * offset + i * this->columnLength + kIndex * this->splitK], columnNum);
         }
         queueX.EnQue(xLocal);
     }
 
+    // template <typename T, bool isUpdate = false, bool isReuseSource = false, bool isBasicBlock = false,
+    //     bool isDataFormatNZ = false>
+    // __aicore__ inline void SoftmaxFlashV2(const LocalTensor<T>& dstTensor, const LocalTensor<T>& expSumTensor,
+    //     const LocalTensor<T>& maxTensor, const LocalTensor<T>& srcTensor, const LocalTensor<T>& expMaxTensor,
+    //     const LocalTensor<T>& inExpSumTensor, const LocalTensor<T>& inMaxTensor, const LocalTensor<uint8_t>& sharedTmpBuffer,
+    //     const SoftMaxTiling& tiling, const SoftMaxShapeInfo& softmaxShapeInfo = {})
+
     __aicore__ inline void Compute(uint32_t rowIndex, uint32_t columnIndex, uint32_t rowNum, uint32_t columnNum)
     {
         AscendC::LocalTensor<T> xLocal = queueX.DeQue<T>();
@@ -159,9 +165,7 @@
         AscendC::SoftMaxShapeInfo srcShape = { rowNum, columnNum, rowNum, columnNum };
         if (columnIndex == 0) { // isUpdate == false
-            if (rowNum % BASIC_BLOCK_ROW_FACTOR == 0 &&
-                columnNum % BASIC_BLOCK_COLUMN_FACTOR == 0 &&
-                columnNum < BASIC_BLOCK_MAX_COLUMN_LENGTH) {
+            if (rowNum % BASIC_BLOCK_ROW_FACTOR == 0 && columnNum % BASIC_BLOCK_COLUMN_FACTOR == 0 && columnNum < BASIC_BLOCK_MAX_COLUMN_LENGTH) {
                 AscendC::SoftmaxFlashV2<T, false, false, true>(xLocal, sumLocal, maxLocal, xLocal, expmaxLocal, sumLocal,
                     maxLocal, tmpBuffer, softmaxTiling, srcShape);
             } else {
@@ -169,9 +173,7 @@
                     maxLocal, tmpBuffer, softmaxTiling, srcShape);
             }
         } else {
-            if (rowNum % BASIC_BLOCK_ROW_FACTOR == 0 &&
-                columnNum % BASIC_BLOCK_COLUMN_FACTOR == 0 &&
-                columnNum < BASIC_BLOCK_MAX_COLUMN_LENGTH) {
+            if (rowNum % BASIC_BLOCK_ROW_FACTOR == 0 && columnNum % BASIC_BLOCK_COLUMN_FACTOR == 0 && columnNum < BASIC_BLOCK_MAX_COLUMN_LENGTH) {
                 AscendC::SoftmaxFlashV2<T, true, false, true>(xLocal, sumLocal, maxLocal, xLocal, expmaxLocal, sumLocal,
                     maxLocal, tmpBuffer, softmaxTiling, srcShape);
             } else {
diff --git a/examples/activation/softmaxflash/kernel_launch_method_by_direct/run.sh b/examples/activation/softmaxflash/kernel_launch_method_by_direct/run.sh
index 5eb6ca7c868f72268e163b3ea089b958a540a873..91e4387564d0c992d1e90a581296d83c023da501 100644
--- a/examples/activation/softmaxflash/kernel_launch_method_by_direct/run.sh
+++ b/examples/activation/softmaxflash/kernel_launch_method_by_direct/run.sh
@@ -7,7 +7,6 @@
 # INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
 # See LICENSE in the root of the software repository for the full text of the License.
 # ======================================================================================================================
-
 SHORT=r:,v:,
 LONG=run-mode:,soc-version:,
 OPTS=$(getopt -a --options $SHORT --longoptions $LONG -- "$@")
@@ -30,10 +29,6 @@ do
     esac
 done
 
-rm -rf build
-mkdir build
-cd build
-
 # in case of running op in simulator, use stub so instead
 if [ "${RUN_MODE}" = "sim" ]; then
     export LD_LIBRARY_PATH=$(echo $LD_LIBRARY_PATH | sed 's/\/.*\/runtime\/lib64://g')
diff --git a/impl/activation/softmax/softmax_flashv2_base_impl.h b/impl/activation/softmax/softmax_flashv2_base_impl.h
index 285d18d2e4bbd23ff2dd176df816f47c7a412fa7..c984a9c16e0523b0bbd3759872d0433a09929096 100644
--- a/impl/activation/softmax/softmax_flashv2_base_impl.h
+++ b/impl/activation/softmax/softmax_flashv2_base_impl.h
@@ -28,16 +28,23 @@ __aicore__ inline constexpr SoftMaxTiling SoftMaxFlashV2TilingFuncImpl(const uin
     const uint32_t dataTypeSize1, const uint32_t dataTypeSize2, const uint32_t localWorkSpaceSize,
     const bool isUpdate = false, const bool isBasicBlock = false, const bool isDataFormatNZ = false)
 {
+    // dataTypeSize1: byte size of the source data type, e.g. half = 2.
+    // dataTypeSize2: byte size of the maxTensor/sumTensor data type, e.g. half = 2.
+    // const uint16_t ONE_BLK_SIZE = 32;
+    // FLOAT_REPEAT_SIZE = 64
+    // SOFTMAX_COMPUTE_DIM = 2
+    // SOFTMAX_BASIC_TILE_NUM = 8
     SoftMaxTiling softmaxTiling;
     const uint32_t elementNumPerBlk = ONE_BLK_SIZE / dataTypeSize2;
     softmaxTiling.srcM = srcM;
     softmaxTiling.srcK = srcK;
-    softmaxTiling.srcSize = srcM * srcK;
+    softmaxTiling.srcSize = srcM * srcK; // total number of input elements
     softmaxTiling.outMaxM = srcM;
     softmaxTiling.outMaxK = elementNumPerBlk;
     softmaxTiling.outMaxSize = srcM * elementNumPerBlk;
 
+    // reduceM: how many rows of M can be processed in one pass
     if (isDataFormatNZ) {
         softmaxTiling.reduceM = localWorkSpaceSize / (SOFTMAX_SHAPE_NZ_BASIC_COUNT * HALF_FACTOR + srcK);
     } else {
@@ -51,23 +58,30 @@
         }
     }
 
+    // round reduceM down to a multiple of 8 (SOFTMAX_BASIC_TILE_NUM)
     if (softmaxTiling.reduceM < srcM && softmaxTiling.reduceM > SOFTMAX_BASIC_TILE_NUM) {
         softmaxTiling.reduceM = softmaxTiling.reduceM / SOFTMAX_BASIC_TILE_NUM * SOFTMAX_BASIC_TILE_NUM;
     }
     softmaxTiling.reduceM = softmaxTiling.reduceM < srcM ?
        softmaxTiling.reduceM : srcM;
-    softmaxTiling.reduceK = elementNumPerBlk;
-    softmaxTiling.reduceSize = softmaxTiling.reduceM * elementNumPerBlk;
+    softmaxTiling.reduceK = elementNumPerBlk; // K size after the reduce
+    softmaxTiling.reduceSize = softmaxTiling.reduceM * elementNumPerBlk; // data size after one reduce: (M, 8)
 
-    softmaxTiling.splitM = softmaxTiling.reduceM;
-    softmaxTiling.splitK = srcK;
-    softmaxTiling.splitSize = softmaxTiling.reduceM * srcK;
+    softmaxTiling.splitM = softmaxTiling.reduceM; // tile along M: process splitM rows per pass
+    softmaxTiling.splitK = srcK; // K is not tiled
+    softmaxTiling.splitSize = softmaxTiling.reduceM * srcK; // element count per pass
 
-    softmaxTiling.rangeM = srcM / softmaxTiling.reduceM;
-    softmaxTiling.tailM = srcM % softmaxTiling.reduceM;
+    softmaxTiling.rangeM = srcM / softmaxTiling.reduceM; // number of passes needed to cover all rows
+    softmaxTiling.tailM = srcM % softmaxTiling.reduceM; // number of tail rows
 
-    softmaxTiling.tailSplitSize = softmaxTiling.tailM * srcK;
+    softmaxTiling.tailSplitSize = softmaxTiling.tailM * srcK; // element count of the tail rows
     softmaxTiling.tailReduceSize = softmaxTiling.tailM * elementNumPerBlk;
+
+    // AscendC::PRINTF("softmaxTiling is srcM = %d, srcK = %d, srcSize = %d, outMaxM = %d, outMaxK = %d, outMaxSize = %d, splitM = %d, splitK = %d, splitSize = %d, reduceM = %d, reduceK = %d, reduceSize = %d, rangeM = %d, tailM = %d, tailSplitSize = %d, tailReduceSize = %d\n",
+    //     softmaxTiling.srcM, softmaxTiling.srcK, softmaxTiling.srcSize, softmaxTiling.outMaxM, softmaxTiling.outMaxK, softmaxTiling.outMaxSize,
+    //     softmaxTiling.splitM, softmaxTiling.splitK, softmaxTiling.splitSize, softmaxTiling.reduceM, softmaxTiling.reduceK,
+    //     softmaxTiling.reduceSize, softmaxTiling.rangeM, softmaxTiling.tailM, softmaxTiling.tailSplitSize, softmaxTiling.tailReduceSize);
+
     return softmaxTiling;
 }
diff --git a/impl/activation/softmax/v220/softmax_flashv2_impl.h b/impl/activation/softmax/v220/softmax_flashv2_impl.h
index 9f8e4383fd5f3a152b84f3946408170c69799f22..670e9287b392b7845b69b4f8ec34f5d30551ca84 100644
--- a/impl/activation/softmax/v220/softmax_flashv2_impl.h
+++ b/impl/activation/softmax/v220/softmax_flashv2_impl.h
@@ -704,61 +704,86 @@ __aicore__ inline void SoftmaxFlashV2BasicBlockImpl(const LocalTensor<T>& ds
     const LocalTensor<T>& expMaxTensor, const LocalTensor<T>& inExpSumTensor, const LocalTensor<T>& inMaxTensor,
     const LocalTensor<float>& workLocal, const SoftMaxTiling& tiling)
 {
-    const LocalTensor<float>& tmpBuffer1 = workLocal;
-    const LocalTensor<float>& tmpBuffer2 = workLocal[tiling.splitM * FLOAT_REPEAT_SIZE];
-    const LocalTensor<float>& tmpBuffer3 = workLocal[tiling.splitM * FLOAT_REPEAT_SIZE / B16_BYTE_SIZE];
+    // FLOAT_REPEAT_SIZE = 64
+    // const uint8_t B16_BYTE_SIZE = 2;
+    // FLOAT_NUM_PER_BLK = 8
+    const LocalTensor<float>& tmpBuffer1 = workLocal; // [reduceM, 64]
+    const LocalTensor<float>& tmpBuffer2 = workLocal[tiling.splitM * FLOAT_REPEAT_SIZE]; // [reduceM, 8]
+    const LocalTensor<float>& tmpBuffer3 = workLocal[tiling.splitM * FLOAT_REPEAT_SIZE / B16_BYTE_SIZE]; // [reduceM, 32]; temporarily reuses tmpBuffer1
     uint32_t offset1 = 0;
     uint32_t offset2 = 0;
     uint8_t repeatTimes = (uint8_t)(tiling.splitSize / FLOAT_REPEAT_SIZE);
     uint8_t offset = (uint8_t)(FLOAT_NUM_PER_BLK * (tiling.splitK / FLOAT_REPEAT_SIZE));
     const uint8_t splitCeilM = (uint8_t)(DivCeil(tiling.splitM, FLOAT_NUM_PER_BLK));
-    const uint8_t reduceCeilValue = (uint8_t)(DivCeil(tiling.reduceSize, FLOAT_REPEAT_SIZE));
+    // const uint8_t reduceCeilValue = (uint8_t)(DivCeil(tiling.reduceSize, FLOAT_REPEAT_SIZE));
+    const uint8_t reduceCeilValue = (uint8_t)(DivCeil(tiling.reduceM, FLOAT_REPEAT_SIZE));
     const uint32_t splitBlock = tiling.splitK /
        FLOAT_REPEAT_SIZE;
     BinaryRepeatParams binaryRepeatParams;
 
     for (uint32_t i = 0; i < tiling.rangeM; i++) {
-        offset2 = i * tiling.reduceSize;
+        // offset2 = i * tiling.reduceSize;
+        offset2 = i * tiling.reduceM;
         offset1 = i * tiling.splitSize;
         PipeBarrier<PIPE_V>();
         BasicBlockReduceMaxImpl(tmpBuffer2, src[offset1], tmpBuffer1, splitBlock, tiling.splitM, tiling.splitK);
         PipeBarrier<PIPE_V>();
-        Brcb(tmpBuffer1, tmpBuffer2, splitCeilM, { 1, DEFAULT_REPEAT_STRIDE });
-        PipeBarrier<PIPE_V>();
-        Copy<float, false>(tmpBuffer2, inMaxTensor[offset2], MASK_PLACEHOLDER, reduceCeilValue,
+        // Brcb(tmpBuffer1, tmpBuffer2, splitCeilM, { 1, DEFAULT_REPEAT_STRIDE });
+        // PipeBarrier<PIPE_V>();
+        // Copy<float, false>(tmpBuffer2, inMaxTensor[offset2], MASK_PLACEHOLDER, reduceCeilValue,
+        //     { 1, 1, DEFAULT_REPEAT_STRIDE, DEFAULT_REPEAT_STRIDE });
+        // PipeBarrier<PIPE_V>();
+
+        Copy<float, false>(tmpBuffer1, inMaxTensor[offset2], MASK_PLACEHOLDER, reduceCeilValue,
             { 1, 1, DEFAULT_REPEAT_STRIDE, DEFAULT_REPEAT_STRIDE });
+        PipeBarrier<PIPE_V>();
+        // Max<float, false>(maxTensor[offset2], tmpBuffer2, tmpBuffer1, MASK_PLACEHOLDER, reduceCeilValue,
+        //     binaryRepeatParams);
+        Max(maxTensor[offset2], tmpBuffer2, tmpBuffer1, tiling.reduceM);
         PipeBarrier<PIPE_V>();
-        Max<float, false>(maxTensor[offset2], tmpBuffer2, tmpBuffer1, MASK_PLACEHOLDER, reduceCeilValue,
-            binaryRepeatParams);
+
+        // expmax = exp(inmax - max)
+        Sub(tmpBuffer2, tmpBuffer1, maxTensor[offset2], tiling.reduceM);
         PipeBarrier<PIPE_V>();
+        Exp(expMaxTensor[offset2], tmpBuffer2, tiling.reduceM);
+        PipeBarrier<PIPE_V>();
+
+        Brcb(tmpBuffer2, maxTensor[offset2], splitCeilM, { 1, DEFAULT_REPEAT_STRIDE });
+        PipeBarrier<PIPE_V>();
+
         for (uint32_t j = 0; j < splitBlock; ++j) {
+            // Sub<float, false>(dst[offset1 + FLOAT_REPEAT_SIZE * j], src[offset1 + FLOAT_REPEAT_SIZE * j],
+            //     maxTensor[offset2], MASK_PLACEHOLDER, (uint8_t)(tiling.splitM), { 1, 1, 0, offset, offset, 1 });
             Sub<float, false>(dst[offset1 + FLOAT_REPEAT_SIZE * j], src[offset1 + FLOAT_REPEAT_SIZE * j],
-                maxTensor[offset2], MASK_PLACEHOLDER, (uint8_t)(tiling.splitM), { 1, 1, 0, offset, offset, 1 });
+                tmpBuffer2, MASK_PLACEHOLDER, (uint8_t)(tiling.splitM), { 1, 1, 0, offset, offset, 1 });
         }
 
         // expmax = exp(inmax - max)
-        Sub<float, false>(tmpBuffer2, tmpBuffer2, maxTensor[offset2], MASK_PLACEHOLDER, reduceCeilValue,
-            binaryRepeatParams);
-        PipeBarrier<PIPE_V>();
-        Exp<float, false>(expMaxTensor[offset2], tmpBuffer2, MASK_PLACEHOLDER, reduceCeilValue,
-            { 1, 1, DEFAULT_REPEAT_STRIDE, DEFAULT_REPEAT_STRIDE });
-        PipeBarrier<PIPE_V>();
+        // Sub<float, false>(tmpBuffer2, tmpBuffer2, maxTensor[offset2], MASK_PLACEHOLDER, reduceCeilValue,
+        //     binaryRepeatParams);
+        // PipeBarrier<PIPE_V>();
+        // Exp<float, false>(expMaxTensor[offset2], tmpBuffer2, MASK_PLACEHOLDER, reduceCeilValue,
+        //     { 1, 1, DEFAULT_REPEAT_STRIDE, DEFAULT_REPEAT_STRIDE });
+        // PipeBarrier<PIPE_V>();
 
         Exp<float, false>(dst[offset1], dst[offset1], MASK_PLACEHOLDER, repeatTimes,
             { 1, 1, DEFAULT_REPEAT_STRIDE, DEFAULT_REPEAT_STRIDE });
         PipeBarrier<PIPE_V>();
-        BasicBlockReduceSumImpl(tmpBuffer3, dst[offset1], tmpBuffer1, splitBlock, tiling.splitM, tiling.splitK);
-        PipeBarrier<PIPE_V>();
-        Brcb(tmpBuffer1, tmpBuffer3, splitCeilM, { 1, DEFAULT_REPEAT_STRIDE });
+        BasicBlockReduceSumImpl(tmpBuffer2, dst[offset1], tmpBuffer1, splitBlock, tiling.splitM, tiling.splitK);
         PipeBarrier<PIPE_V>();
+        // Brcb(tmpBuffer1, tmpBuffer3, splitCeilM, { 1, DEFAULT_REPEAT_STRIDE });
+        // PipeBarrier<PIPE_V>();
 
         // update sum = expmax * insum + sum
-        Mul<float, false>(inExpSumTensor[offset2], expMaxTensor[offset2], inExpSumTensor[offset2], MASK_PLACEHOLDER,
-            reduceCeilValue, binaryRepeatParams);
+        // Mul<float, false>(inExpSumTensor[offset2], expMaxTensor[offset2], inExpSumTensor[offset2], MASK_PLACEHOLDER,
+        //     reduceCeilValue, binaryRepeatParams);
+        Mul(inExpSumTensor[offset2],
            expMaxTensor[offset2], inExpSumTensor[offset2], tiling.reduceM);
+        PipeBarrier<PIPE_V>();
+        // Add<float, false>(expSumTensor[offset2], inExpSumTensor[offset2], tmpBuffer2, MASK_PLACEHOLDER, reduceCeilValue,
+        //     binaryRepeatParams);
+        Add(expSumTensor[offset2], inExpSumTensor[offset2], tmpBuffer2, tiling.reduceM);
         PipeBarrier<PIPE_V>();
-        Add<float, false>(expSumTensor[offset2], inExpSumTensor[offset2], tmpBuffer1, MASK_PLACEHOLDER, reduceCeilValue,
-            binaryRepeatParams);
     }
 }
@@ -982,39 +1007,50 @@ __aicore__ inline void SoftmaxFlashV2NoUpdateBasicBlock(const LocalTensor<T>
     const LocalTensor<T>& expSumTensor, const LocalTensor<T>& maxTensor, const LocalTensor<T>& src,
     const LocalTensor<float>& workLocal, const SoftMaxTiling& tiling)
 {
-    const LocalTensor<float>& tmpBuffer1 = workLocal;
-    const LocalTensor<float>& tmpBuffer2 = workLocal[tiling.splitM * FLOAT_REPEAT_SIZE];
+    // FLOAT_REPEAT_SIZE = 64
+    // FLOAT_NUM_PER_BLK = 8
+    // tiling.splitM: tile along M, process splitM rows per pass
+    const LocalTensor<float>& tmpBuffer1 = workLocal; // [tiling.splitM, 64]
+    const LocalTensor<float>& tmpBuffer2 = workLocal[tiling.splitM * FLOAT_REPEAT_SIZE]; // [tiling.splitM, 8]
     uint32_t offset1 = 0;
     uint32_t offset2 = 0;
-    uint8_t repeatTimes = (uint8_t)(tiling.splitSize / FLOAT_REPEAT_SIZE);
-    uint8_t offset = (uint8_t)(FLOAT_NUM_PER_BLK * (tiling.splitK / FLOAT_REPEAT_SIZE));
-    const uint8_t splitCeilM = (uint8_t)(DivCeil(tiling.splitM, FLOAT_NUM_PER_BLK));
-    const uint8_t reduceCeilValue = (uint8_t)(DivCeil(tiling.reduceSize, FLOAT_REPEAT_SIZE));
-    const uint32_t splitBlock = tiling.splitK / FLOAT_REPEAT_SIZE;
+    // tiling.splitSize: element count processed per loop pass
+    // uint8_t repeatTimes = (uint8_t)(tiling.splitSize / FLOAT_REPEAT_SIZE); // repeats needed per pass
+    // tiling.splitK = srcK
+    uint8_t offset = (uint8_t)(FLOAT_NUM_PER_BLK * (tiling.splitK / FLOAT_REPEAT_SIZE)); // after the reduce, each value is expanded by a factor of 8
+    const uint8_t splitCeilM = (uint8_t)(DivCeil(tiling.splitM, FLOAT_NUM_PER_BLK)); // repeat count for Brcb
+    // const uint8_t reduceCeilValue = (uint8_t)(DivCeil(tiling.reduceSize, FLOAT_REPEAT_SIZE));
+    const uint32_t splitBlock = tiling.splitK / FLOAT_REPEAT_SIZE; // number of 64-element blocks along K
     BinaryRepeatParams binaryRepeatParams;
 
-    for (uint32_t i = 0; i < tiling.rangeM; i++) {
-        offset2 = i * tiling.reduceSize;
-        offset1 = i * tiling.splitSize;
+    for (uint32_t i = 0; i < tiling.rangeM; i++) { // rangeM: number of passes needed
+        // offset2 = i * tiling.reduceSize; // data size after one reduce: (M, 8) -> (M, 1)
+        offset2 = i * tiling.reduceM; // data size after one reduce: (M, 8) -> (M, 1)
+        offset1 = i * tiling.splitSize; // tiling.splitSize: element count per pass
         PipeBarrier<PIPE_V>();
-        BasicBlockReduceMaxImpl(tmpBuffer2, src[offset1], tmpBuffer1, splitBlock, tiling.splitM, tiling.splitK);
+        // BasicBlockReduceMaxImpl(tmpBuffer2, src[offset1], tmpBuffer1, splitBlock, tiling.splitM, tiling.splitK);
+        BasicBlockReduceMaxImpl(maxTensor[offset2], src[offset1], tmpBuffer1, splitBlock, tiling.splitM, tiling.splitK);
         PipeBarrier<PIPE_V>();
-        Brcb(maxTensor[offset2], tmpBuffer2, splitCeilM, { 1, DEFAULT_REPEAT_STRIDE });
+        // Brcb(maxTensor[offset2], tmpBuffer2, splitCeilM, { 1, DEFAULT_REPEAT_STRIDE });
+        Brcb(tmpBuffer2, maxTensor[offset2], splitCeilM, { 1, DEFAULT_REPEAT_STRIDE });
         PipeBarrier<PIPE_V>();
 
         for (uint32_t j = 0; j < splitBlock; ++j) {
+            // Sub<float, false>(src[offset1 + FLOAT_REPEAT_SIZE * j], src[offset1 + FLOAT_REPEAT_SIZE * j],
+            //     maxTensor[offset2], MASK_PLACEHOLDER, (uint8_t)(tiling.splitM), { 1, 1, 0, offset, offset, 1 });
             Sub<float, false>(src[offset1 + FLOAT_REPEAT_SIZE * j], src[offset1 + FLOAT_REPEAT_SIZE * j],
-                maxTensor[offset2], MASK_PLACEHOLDER, (uint8_t)(tiling.splitM), { 1, 1, 0, offset, offset, 1 });
+                tmpBuffer2, MASK_PLACEHOLDER,
                    (uint8_t)(tiling.splitM), { 1, 1, 0, offset, offset, 1 });
         }
         PipeBarrier<PIPE_V>();
         Exp<float, false>(dst[offset1], src[offset1], MASK_PLACEHOLDER, (uint8_t)(tiling.splitSize / FLOAT_REPEAT_SIZE),
             { 1, 1, DEFAULT_REPEAT_STRIDE, DEFAULT_REPEAT_STRIDE });
         PipeBarrier<PIPE_V>();
-        BasicBlockReduceSumImpl(tmpBuffer2, dst[offset1], tmpBuffer1, splitBlock, tiling.splitM, tiling.splitK);
+        // BasicBlockReduceSumImpl(tmpBuffer2, dst[offset1], tmpBuffer1, splitBlock, tiling.splitM, tiling.splitK);
+        BasicBlockReduceSumImpl(expSumTensor[offset2], dst[offset1], tmpBuffer1, splitBlock, tiling.splitM, tiling.splitK);
         PipeBarrier<PIPE_V>();
-        Brcb(expSumTensor[offset2], tmpBuffer2, splitCeilM, { 1, DEFAULT_REPEAT_STRIDE });
+        // Brcb(expSumTensor[offset2], tmpBuffer2, splitCeilM, { 1, DEFAULT_REPEAT_STRIDE });
     }
 }
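---

For reference, the recurrence behind the Compute() branches in softmaxflash_kernel.h: the kernel walks each row's K dimension in chunks, calling SoftmaxFlashV2 with isUpdate = false for the first chunk (columnIndex == 0) and isUpdate = true for the rest. Writing $x_{t,j}$ for the elements of chunk $t$ of a row, $m_t$ for the running max, and $\ell_t$ for the running denominator, the update is the standard online-softmax recurrence:

$$m_t = \max\bigl(m_{t-1},\ \max_j x_{t,j}\bigr), \qquad e_t = \exp(m_{t-1} - m_t), \qquad \ell_t = e_t\,\ell_{t-1} + \sum_j \exp(x_{t,j} - m_t)$$

with $m_0 = -\infty$ and $\ell_0 = 0$. Here $e_t$ is what the implementation stores in expMaxTensor ("expmax = exp(inmax - max)"), and the last equation is the "update sum = expmax * insum + sum" step in the v220 hunk.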
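To make the tiling comments in SoftMaxFlashV2TilingFuncImpl concrete, here is a small host-side sketch that re-derives the main SoftMaxTiling fields for one shape. It is illustrative only: the shape (srcM = 64, srcK = 512), the float-typed max/sum tensors, and the workspace-derived starting value reduceM = 34 are assumptions, not values taken from this diff.

    #include <cstdint>
    #include <cstdio>

    int main()
    {
        // Constants mirrored from the comments in the hunk above.
        const uint32_t ONE_BLK_SIZE = 32;          // bytes per block
        const uint32_t SOFTMAX_BASIC_TILE_NUM = 8; // reduceM is rounded down to a multiple of this
        // Example assumptions (hypothetical): shape, dtype size, workspace-derived reduceM.
        const uint32_t srcM = 64, srcK = 512;
        const uint32_t dataTypeSize2 = 4;          // float maxTensor/sumTensor
        uint32_t reduceM = 34;                     // pretend the workspace fits 34 rows per pass

        const uint32_t elementNumPerBlk = ONE_BLK_SIZE / dataTypeSize2; // 8 floats per block
        if (reduceM < srcM && reduceM > SOFTMAX_BASIC_TILE_NUM) {
            reduceM = reduceM / SOFTMAX_BASIC_TILE_NUM * SOFTMAX_BASIC_TILE_NUM; // 34 -> 32
        }
        reduceM = reduceM < srcM ? reduceM : srcM;

        const uint32_t splitM = reduceM;           // rows per pass
        const uint32_t splitK = srcK;              // K is not tiled
        const uint32_t splitSize = reduceM * srcK; // elements per pass
        const uint32_t reduceSize = reduceM * elementNumPerBlk;
        const uint32_t rangeM = srcM / reduceM;    // number of full passes
        const uint32_t tailM = srcM % reduceM;     // leftover rows

        // Prints: reduceM=32 splitM=32 splitK=512 splitSize=16384 reduceSize=256 rangeM=2 tailM=0
        std::printf("reduceM=%u splitM=%u splitK=%u splitSize=%u reduceSize=%u rangeM=%u tailM=%u\n",
                    reduceM, splitM, splitK, splitSize, reduceSize, rangeM, tailM);
        return 0;
    }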
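And a scalar reference for one step of that recurrence, mirroring what the rewritten SoftmaxFlashV2BasicBlockImpl does per row (plain C++, not AscendC; FlashSoftmaxUpdateRow is a hypothetical name used for illustration):

    #include <algorithm>
    #include <cmath>
    #include <cstddef>

    // One flash-softmax update step for a single row: fold a new chunk of
    // scores into the running (max, sum) state and emit the correction
    // factor expMax = exp(oldMax - newMax) applied to previous partial results.
    void FlashSoftmaxUpdateRow(const float* chunk, std::size_t len,
                               float& runningMax, float& runningSum, float& expMax)
    {
        // Chunk-local max (BasicBlockReduceMaxImpl's job on-device).
        float chunkMax = chunk[0];
        for (std::size_t i = 1; i < len; ++i) {
            chunkMax = std::max(chunkMax, chunk[i]);
        }
        const float newMax = std::max(runningMax, chunkMax); // the Max(...) call

        // expmax = exp(inmax - max): the Sub/Exp pair in the hunk above.
        expMax = std::exp(runningMax - newMax);

        // Chunk-local sum of exp(x - newMax) (BasicBlockReduceSumImpl's job).
        float chunkSum = 0.0f;
        for (std::size_t i = 0; i < len; ++i) {
            chunkSum += std::exp(chunk[i] - newMax);
        }

        // update sum = expmax * insum + sum: the Mul/Add pair in the hunk above.
        runningSum = expMax * runningSum + chunkSum;
        runningMax = newMax;
    }

One layout detail worth noting: the rewrite keeps max and sum compactly, one float per row (hence offset2 = i * tiling.reduceM rather than i * tiling.reduceSize), and uses Brcb to re-broadcast the per-row max into 8-lane blocks only where the block-aligned Sub over the full row needs it.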