From c6f5de399f5f322ec9e7145fd33b5ebf18c5bd64 Mon Sep 17 00:00:00 2001 From: zhouqian0123 Date: Fri, 13 Sep 2024 15:00:43 +0800 Subject: [PATCH 1/2] softmax const compile --- impl/activation/softmax/softmax_base_impl.h | 36 ++-- impl/activation/softmax/softmax_common.h | 10 +- impl/activation/softmax/v200/softmax_impl.h | 9 +- impl/activation/softmax/v220/softmax_impl.h | 185 +++++++++++++----- impl/activation/softmax/v300/softmax_impl.h | 4 +- lib/activation/softmax.h | 37 ++-- .../softmax/test_operator_softmax_v220.cpp | 17 +- 7 files changed, 203 insertions(+), 95 deletions(-) diff --git a/impl/activation/softmax/softmax_base_impl.h b/impl/activation/softmax/softmax_base_impl.h index 8c44c306..649e0654 100644 --- a/impl/activation/softmax/softmax_base_impl.h +++ b/impl/activation/softmax/softmax_base_impl.h @@ -24,7 +24,8 @@ #endif namespace AscendC { -template +template __aicore__ inline void SoftMaxImpl(const LocalTensor& dst, const LocalTensor& src, const LocalTensor& workLocal, const SoftMaxTiling& tiling, const SoftMaxShapeInfo& softmaxShapeInfo) { @@ -45,28 +46,31 @@ __aicore__ inline void SoftMaxImpl(const LocalTensor& dst, const LocalTensor< SoftMaxTiling newTiling = tiling; SoftMaxTilingFunc(workLocal.GetSize(), { srcNDinfo.m, srcNDinfo.k, originalSrcShape.m, srcNDinfo.k }, newTiling, sizeof(T), sizeof(float), isBasicBlock); - SoftMaxNDImpl(dst, src, workLocal, originalSrcShape, newTiling); + SoftMaxNDImpl(dst, src, workLocal, originalSrcShape, newTiling); } else { - SoftMaxNDImpl(dst, src, workLocal, originalSrcShape, tiling); + SoftMaxNDImpl(dst, src, workLocal, originalSrcShape, tiling); } } -template +template __aicore__ inline void SoftMaxImpl(const LocalTensor& dst, const LocalTensor& src, const SoftMaxTiling& tiling, const SoftMaxShapeInfo& softmaxShapeInfo) { LocalTensor workLocal; PopStackBuffer(workLocal); - SoftMaxImpl(dst, src, workLocal, tiling, softmaxShapeInfo); + SoftMaxImpl(dst, src, workLocal, tiling, softmaxShapeInfo); } -template +template __aicore__ inline void SoftMaxImpl(const LocalTensor& dst, const LocalTensor& src, const LocalTensor& sharedTmpBuffer, const SoftMaxTiling& tiling, const SoftMaxShapeInfo& softmaxShapeInfo) { auto workLocal = sharedTmpBuffer.ReinterpretCast(); - SoftMaxImpl(dst, src, workLocal, tiling, softmaxShapeInfo); + SoftMaxImpl(dst, src, workLocal, tiling, softmaxShapeInfo); } -template +template __aicore__ inline void SoftMaxImpl(const LocalTensor& dst, const LocalTensor& sumTensor, const LocalTensor& maxTensor, const LocalTensor& src, const LocalTensor& workLocal, const SoftMaxTiling& tiling, const SoftMaxShapeInfo& softmaxShapeInfo) @@ -99,31 +103,35 @@ __aicore__ inline void SoftMaxImpl(const LocalTensor& dst, const LocalTensor SoftMaxTiling newTiling = tiling; SoftMaxTilingFunc(workLocal.GetSize(), { srcNDinfo.m, srcNDinfo.k, originalSrcShape.m, srcNDinfo.k }, newTiling, sizeof(T1), sizeof(T2), isBasicBlock); - SoftMaxNDImpl(dst, sumTensor, maxTensor, src, workLocal, originalSrcShape, newTiling); + SoftMaxNDImpl(dst, sumTensor, maxTensor, src, workLocal, originalSrcShape, + newTiling); } else { - SoftMaxNDImpl(dst, sumTensor, maxTensor, src, workLocal, originalSrcShape, tiling); + SoftMaxNDImpl(dst, sumTensor, maxTensor, src, workLocal, originalSrcShape, + tiling); } } } -template +template __aicore__ inline void SoftMaxImpl(const LocalTensor& dst, const LocalTensor& sumTensor, const LocalTensor& maxTensor, const LocalTensor& src, const SoftMaxTiling& tiling, const SoftMaxShapeInfo& softmaxShapeInfo) { LocalTensor workLocal;
PopStackBuffer(workLocal); - SoftMaxImpl(dst, sumTensor, maxTensor, src, workLocal, tiling, + SoftMaxImpl(dst, sumTensor, maxTensor, src, workLocal, tiling, softmaxShapeInfo); } -template +template __aicore__ inline void SoftMaxImpl(const LocalTensor& dst, const LocalTensor& sumTensor, const LocalTensor& maxTensor, const LocalTensor& src, const LocalTensor& sharedTmpBuffer, const SoftMaxTiling& tiling, const SoftMaxShapeInfo& softmaxShapeInfo) { auto workLocal = sharedTmpBuffer.ReinterpretCast(); - SoftMaxImpl(dst, sumTensor, maxTensor, src, workLocal, tiling, + SoftMaxImpl(dst, sumTensor, maxTensor, src, workLocal, tiling, softmaxShapeInfo); } diff --git a/impl/activation/softmax/softmax_common.h b/impl/activation/softmax/softmax_common.h index dd7ea1a7..08c4e540 100644 --- a/impl/activation/softmax/softmax_common.h +++ b/impl/activation/softmax/softmax_common.h @@ -65,11 +65,19 @@ struct SoftmaxConfig { { isCheckTiling = isCheckTilingIn; } + __aicore__ constexpr SoftmaxConfig(const bool isCheckTilingIn, const uint32_t oriSrcMIn, const uint32_t oriSrcKIn) + { + isCheckTiling = isCheckTilingIn; + oriSrcM = oriSrcMIn; + oriSrcK = oriSrcKIn; + } // to judge if match or not of input shape and tiling, if not match, api will recompute tiling, default to judge bool isCheckTiling = true; + uint32_t oriSrcM = 0; + uint32_t oriSrcK = 0; }; -constexpr SoftmaxConfig SOFTMAX_DEFAULT_CFG = { true }; +constexpr SoftmaxConfig SOFTMAX_DEFAULT_CFG = { true, 0, 0 }; __aicore__ inline LastAxisShapeND GetLastAxisShapeND(const ShapeInfo& shapeInfo) { diff --git a/impl/activation/softmax/v200/softmax_impl.h b/impl/activation/softmax/v200/softmax_impl.h index 2f76d21e..287dee4e 100644 --- a/impl/activation/softmax/v200/softmax_impl.h +++ b/impl/activation/softmax/v200/softmax_impl.h @@ -577,7 +577,8 @@ __aicore__ inline void SoftMaxGenericNDImpl(const LocalTensor& dst, const Cast(dst[offset1], tmpBuffer0, FLOAT2HALF_ROUND_MODE, splitSize); } -template +template __aicore__ inline void SoftMaxNDImpl(const LocalTensor& dst, const LocalTensor& sumTensor, const LocalTensor& maxTensor, const LocalTensor& src, const LocalTensor& workLocal, const LastAxisShapeND& originalSrcShape, const SoftMaxTiling& tiling) @@ -733,7 +734,8 @@ __aicore__ inline void SoftMaxGenericNDImpl(const LocalTensor& dst, const Cast(dst[offset1], tmpBuffer0, FLOAT2HALF_ROUND_MODE, splitSize); } -template +template __aicore__ inline void SoftMaxNDImpl(const LocalTensor& dst, const LocalTensor& sumTensor, const LocalTensor& maxTensor, const LocalTensor& src, const LocalTensor& workLocal, const LastAxisShapeND& originalSrcShape, const SoftMaxTiling& tiling) @@ -814,7 +816,8 @@ __aicore__ inline void SingleSoftMaxImpl(const LocalTensor& dst, const Lo TransDivToMulImpl(dst[offset], tmpBuffer0, tmpBuffer1, reduceParam.originalSrcM, tiling.srcK, tiling.reduceK); } -template +template __aicore__ inline void SoftMaxNDImpl(const LocalTensor& dst, const LocalTensor& src, const LocalTensor& workLocal, const LastAxisShapeND& originalSrcShape, const SoftMaxTiling& tiling) { diff --git a/impl/activation/softmax/v220/softmax_impl.h b/impl/activation/softmax/v220/softmax_impl.h index b3433528..ceff1541 100644 --- a/impl/activation/softmax/v220/softmax_impl.h +++ b/impl/activation/softmax/v220/softmax_impl.h @@ -556,39 +556,72 @@ __aicore__ inline void SoftMaxGenericNDImpl(const LocalTensor& dst, const Cast(dst[offset1], tmpBuffer0, FLOAT2HALF_ROUND_MODE, splitSize); } -template -__aicore__ inline void SoftMaxNDImpl(const LocalTensor& dst, const LocalTensor& 
sumTensor, - const LocalTensor& maxTensor, const LocalTensor& src, const LocalTensor& workLocal, - const LastAxisShapeND& originalSrcShape, const SoftMaxTiling& tiling) +template +__aicore__ inline void SoftMaxNDExtImpl(const LocalTensor& dst, const LocalTensor& sumTensor, + const LocalTensor& maxTensor, const LocalTensor& src, const LocalTensor& workLocal, + const LastAxisShapeND& originalSrcShape, const SoftMaxTiling& tiling, ReduceLastND& reduceParam) { - ReduceLastND reduceParam = { tiling.splitM, originalSrcShape.k, tiling.splitM, - tiling.splitK, tiling.reduceM, tiling.reduceK }; uint32_t offset1 = 0; uint32_t offset2 = 0; uint32_t splitSize = tiling.splitSize; uint32_t reduceSize = tiling.reduceSize; + for (uint32_t i = 0; i <= tiling.rangeM; i++) { + SoftMaxGenericNDImpl(dst, sumTensor, maxTensor, src, workLocal, tiling, offset1, offset2, splitSize, + reduceSize, reduceParam); + offset1 += tiling.splitSize; + offset2 += tiling.reduceSize; + if (i == (tiling.rangeM - 1)) { + if (tiling.tailM == 0) { + break; + } + offset2 = tiling.rangeM * tiling.reduceSize; + offset1 = tiling.rangeM * tiling.splitSize; + splitSize = tiling.tailSplitSize; + reduceSize = tiling.tailReduceSize; + reduceParam.originalSrcM = tiling.tailM; + reduceParam.srcM = tiling.tailM; + reduceParam.dstM = tiling.tailM; + PipeBarrier(); + } + } +} + +template +__aicore__ inline void SoftMaxNDImpl(const LocalTensor& dst, const LocalTensor& sumTensor, + const LocalTensor& maxTensor, const LocalTensor& src, const LocalTensor& workLocal, + const LastAxisShapeND& originalSrcShape, const SoftMaxTiling& tiling) +{ PipeBarrier(); - if constexpr (isBasicBlock) { - SoftMaxBasicBlock(dst, sumTensor, maxTensor, src, workLocal, tiling); + if constexpr (config.oriSrcM == 0 || config.oriSrcK == 0) { + if constexpr (isBasicBlock) { + SoftMaxBasicBlock(dst, sumTensor, maxTensor, src, workLocal, tiling); + } else { + ReduceLastND reduceParam = { tiling.splitM, originalSrcShape.k, tiling.splitM, + tiling.splitK, tiling.reduceM, tiling.reduceK }; + SoftMaxNDExtImpl(dst, sumTensor, maxTensor, src, workLocal, originalSrcShape, tiling, reduceParam); + } } else { - for (uint32_t i = 0; i <= tiling.rangeM; i++) { - SoftMaxGenericNDImpl(dst, sumTensor, maxTensor, src, workLocal, tiling, offset1, offset2, splitSize, - reduceSize, reduceParam); - offset1 += tiling.splitSize; - offset2 += tiling.reduceSize; - if (i == (tiling.rangeM - 1)) { - if (tiling.tailM == 0) { - break; - } - offset2 = tiling.rangeM * tiling.reduceSize; - offset1 = tiling.rangeM * tiling.splitSize; - splitSize = tiling.tailSplitSize; - reduceSize = tiling.tailReduceSize; - reduceParam.originalSrcM = tiling.tailM; - reduceParam.srcM = tiling.tailM; - reduceParam.dstM = tiling.tailM; - PipeBarrier(); + constexpr uint32_t basicBlockMaxK = 2048; + constexpr bool localIsBasicBlock = config.oriSrcK % FLOAT_REPEAT_SIZE == 0 && + config.oriSrcK < basicBlockMaxK && config.oriSrcM % FLOAT_NUM_PER_BLK == 0; + if constexpr (localIsBasicBlock) { + SoftMaxBasicBlock(dst, sumTensor, maxTensor, src, workLocal, tiling); + } else { + uint32_t splitK = 0; + ReduceLastND reduceParam; + if constexpr (config.oriSrcK % FLOAT_NUM_PER_BLK == 0) { + splitK = config.oriSrcK; + } else { + splitK = AlignUp(config.oriSrcK, FLOAT_NUM_PER_BLK); } + if constexpr (SupportType()) { + reduceParam = { tiling.splitM, config.oriSrcK, tiling.splitM, splitK, tiling.reduceM, + DEFAULT_REPEAT_STRIDE * HALF_FACTOR }; + } else if constexpr (SupportType()) { + reduceParam = { tiling.splitM, config.oriSrcK, 
tiling.splitM, splitK, tiling.reduceM, + DEFAULT_REPEAT_STRIDE }; + } + SoftMaxNDExtImpl(dst, sumTensor, maxTensor, src, workLocal, originalSrcShape, tiling, reduceParam); } } } @@ -694,39 +727,65 @@ __aicore__ inline void SoftMaxGenericNDImpl(const LocalTensor& dst, const Cast(dst[offset1], tmpBuffer0, FLOAT2HALF_ROUND_MODE, splitSize); } -template +__aicore__ inline void SoftMaxNDExtImpl(const LocalTensor& dst, const LocalTensor& sumTensor, + const LocalTensor& maxTensor, const LocalTensor& src, const LocalTensor& workLocal, + const LastAxisShapeND& originalSrcShape, const SoftMaxTiling& tiling, ReduceLastND& reduceParam) +{ + uint32_t offset1 = 0; + uint32_t offset2 = 0; + uint32_t splitSize = tiling.splitSize; + uint32_t reduceSize = tiling.reduceSize; + for (uint32_t i = 0; i <= tiling.rangeM; i++) { + SoftMaxGenericNDImpl(dst, sumTensor, maxTensor, src, workLocal, tiling, offset1, offset2, splitSize, + reduceParam); + offset1 += tiling.splitSize; + offset2 += tiling.reduceSize; + if (i == (tiling.rangeM - 1)) { + if (tiling.tailM == 0) { + break; + } + offset2 = tiling.rangeM * tiling.reduceSize; + offset1 = tiling.rangeM * tiling.splitSize; + splitSize = tiling.tailSplitSize; + reduceSize = tiling.tailReduceSize; + reduceParam.originalSrcM = tiling.tailM; + reduceParam.srcM = tiling.tailM; + reduceParam.dstM = tiling.tailM; + PipeBarrier(); + } + } +} + +template __aicore__ inline void SoftMaxNDImpl(const LocalTensor& dst, const LocalTensor& sumTensor, const LocalTensor& maxTensor, const LocalTensor& src, const LocalTensor& workLocal, const LastAxisShapeND& originalSrcShape, const SoftMaxTiling& tiling) { PipeBarrier(); - if constexpr (isBasicBlock) { - SoftMaxBasicBlock(dst, sumTensor, maxTensor, src, workLocal, tiling); + if constexpr (config.oriSrcM == 0 || config.oriSrcK == 0) { + if constexpr (isBasicBlock) { + SoftMaxBasicBlock(dst, sumTensor, maxTensor, src, workLocal, tiling); + } else { + ReduceLastND reduceParam = { tiling.splitM, originalSrcShape.k, tiling.splitM, + tiling.splitK, tiling.reduceM, tiling.reduceK }; + SoftMaxNDExtImpl(dst, sumTensor, maxTensor, src, workLocal, originalSrcShape, tiling, reduceParam); + } } else { - ReduceLastND reduceParam = { tiling.splitM, originalSrcShape.k, tiling.splitM, - tiling.splitK, tiling.reduceM, tiling.reduceK }; - uint32_t offset1 = 0; - uint32_t offset2 = 0; - uint32_t splitSize = tiling.splitSize; - uint32_t reduceSize = tiling.reduceSize; - for (uint32_t i = 0; i <= tiling.rangeM; i++) { - SoftMaxGenericNDImpl(dst, sumTensor, maxTensor, src, workLocal, tiling, offset1, offset2, splitSize, - reduceParam); - offset1 += tiling.splitSize; - offset2 += tiling.reduceSize; - if (i == (tiling.rangeM - 1)) { - if (tiling.tailM == 0) { - break; - } - offset2 = tiling.rangeM * tiling.reduceSize; - offset1 = tiling.rangeM * tiling.splitSize; - splitSize = tiling.tailSplitSize; - reduceSize = tiling.tailReduceSize; - reduceParam.originalSrcM = tiling.tailM; - reduceParam.srcM = tiling.tailM; - reduceParam.dstM = tiling.tailM; - PipeBarrier(); + constexpr uint32_t basicBlockMaxK = 2048; + constexpr bool localIsBasicBlock = config.oriSrcK % FLOAT_REPEAT_SIZE == 0 && + config.oriSrcK < basicBlockMaxK && config.oriSrcM % FLOAT_NUM_PER_BLK == 0; + if constexpr (localIsBasicBlock) { + SoftMaxBasicBlock(dst, sumTensor, maxTensor, src, workLocal, tiling); + } else { + uint32_t splitK = 0; + if constexpr (config.oriSrcK % FLOAT_NUM_PER_BLK == 0) { + splitK = config.oriSrcK; + } else { + splitK = AlignUp(config.oriSrcK, FLOAT_NUM_PER_BLK); } + 
ReduceLastND reduceParam = { tiling.splitM, config.oriSrcK, tiling.splitM, splitK, tiling.reduceM, + DEFAULT_REPEAT_STRIDE * HALF_FACTOR }; + SoftMaxNDExtImpl(dst, sumTensor, maxTensor, src, workLocal, originalSrcShape, tiling, reduceParam); } } } @@ -785,14 +844,32 @@ __aicore__ inline void SingleSoftMaxImpl(const LocalTensor& dst, const Lo } } -template +template __aicore__ inline void SoftMaxNDImpl(const LocalTensor& dst, const LocalTensor& src, const LocalTensor& workLocal, const LastAxisShapeND& originalSrcShape, const SoftMaxTiling& tiling) { uint32_t offset = 0; uint32_t splitSize = tiling.splitSize; - ReduceLastND reduceParam = { tiling.splitM, originalSrcShape.k, tiling.splitM, - tiling.splitK, tiling.reduceM, tiling.reduceK }; + ReduceLastND reduceParam; + if constexpr (config.oriSrcM == 0 || config.oriSrcK == 0) { + reduceParam = { tiling.splitM, originalSrcShape.k, tiling.splitM, tiling.splitK, tiling.reduceM, + tiling.reduceK }; + } else { + uint32_t splitK = 0; + if constexpr (config.oriSrcK % FLOAT_NUM_PER_BLK == 0) { + splitK = config.oriSrcK; + } else { + splitK = AlignUp(config.oriSrcK, FLOAT_NUM_PER_BLK); + } + if constexpr (SupportType()) { + reduceParam = { tiling.splitM, config.oriSrcK, tiling.splitM, splitK, tiling.reduceM, + DEFAULT_REPEAT_STRIDE * HALF_FACTOR }; + } else if constexpr (SupportType()) { + reduceParam = { tiling.splitM, config.oriSrcK, tiling.splitM, splitK, tiling.reduceM, + DEFAULT_REPEAT_STRIDE }; + } + } PipeBarrier(); for (uint32_t i = 0; i <= tiling.rangeM; i++) { SingleSoftMaxImpl(dst, src, workLocal, tiling, offset, splitSize, reduceParam); diff --git a/impl/activation/softmax/v300/softmax_impl.h b/impl/activation/softmax/v300/softmax_impl.h index ea7699a7..0d8b7f34 100644 --- a/impl/activation/softmax/v300/softmax_impl.h +++ b/impl/activation/softmax/v300/softmax_impl.h @@ -72,7 +72,7 @@ __aicore__ inline void SoftMaxGenericNDImpl(const LocalTensor& dst, const } } -template +template __aicore__ inline void SoftMaxNDImpl(const LocalTensor& dst, const LocalTensor& sumTensor, const LocalTensor& maxTensor, const LocalTensor& src, const LocalTensor& workLocal, const LastAxisShapeND& originalSrcShape, const SoftMaxTiling& tiling) @@ -103,7 +103,7 @@ __aicore__ inline void SoftMaxNDImpl(const LocalTensor& dst, const LocalTens } } } -template +template __aicore__ inline void SoftMaxNDImpl(const LocalTensor& dst, const LocalTensor& sumTensor, const LocalTensor& maxTensor, const LocalTensor& src, const LocalTensor& workLocal, const LastAxisShapeND& originalSrcShape, const SoftMaxTiling& tiling) diff --git a/lib/activation/softmax.h b/lib/activation/softmax.h index 827d115f..d33cf736 100644 --- a/lib/activation/softmax.h +++ b/lib/activation/softmax.h @@ -38,7 +38,8 @@ namespace AscendC { * improve performance , but it is a reserved param when isDataFormatNZ = true * \param [in] isDataFormatNZ: if the data format of input srcTensor is NZ */ -template +template __aicore__ inline void SoftMax(const LocalTensor& dstTensor, const LocalTensor& sumTensor, const LocalTensor& maxTensor, const LocalTensor& srcTensor, const SoftMaxTiling& tiling, const SoftMaxShapeInfo& softmaxShapeInfo = {}) @@ -47,7 +48,7 @@ __aicore__ inline void SoftMax(const LocalTensor& dstTensor, const LocalTenso return; } TRACE_START(TraceId::SoftMax); - SoftMaxImpl(dstTensor, sumTensor, maxTensor, srcTensor, tiling, + SoftMaxImpl(dstTensor, sumTensor, maxTensor, srcTensor, tiling, softmaxShapeInfo); TRACE_STOP(TraceId::SoftMax); } @@ -66,7 +67,8 @@ __aicore__ inline void SoftMax(const 
LocalTensor& dstTensor, const LocalTenso * improve performance, but it is a reserved param when isDataFormatNZ = true * \param [in] isDataFormatNZ: if the data format of input srcTensor is NZ */ -template +template __aicore__ inline void SoftMax(const LocalTensor& dstTensor, const LocalTensor& sumTensor, const LocalTensor& maxTensor, const LocalTensor& srcTensor, const SoftMaxTiling& tiling, const SoftMaxShapeInfo& softmaxShapeInfo = {}) @@ -75,7 +77,7 @@ __aicore__ inline void SoftMax(const LocalTensor& dstTensor, const LocalTe return; } TRACE_START(TraceId::SoftMax); - SoftMaxImpl(dstTensor, sumTensor, maxTensor, srcTensor, tiling, + SoftMaxImpl(dstTensor, sumTensor, maxTensor, srcTensor, tiling, softmaxShapeInfo); TRACE_STOP(TraceId::SoftMax); } @@ -91,7 +93,8 @@ __aicore__ inline void SoftMax(const LocalTensor& dstTensor, const LocalTe * \param [in] isReuseSource: reserved param * \param [in] isBasicBlock: reserved param */ -template +template __aicore__ inline void SoftMax(const LocalTensor& dstTensor, const LocalTensor& srcTensor, const SoftMaxTiling& tiling, const SoftMaxShapeInfo& softmaxShapeInfo = {}) { @@ -99,7 +102,7 @@ __aicore__ inline void SoftMax(const LocalTensor& dstTensor, const LocalTenso return; } TRACE_START(TraceId::SoftMax); - SoftMaxImpl(dstTensor, srcTensor, tiling, softmaxShapeInfo); + SoftMaxImpl(dstTensor, srcTensor, tiling, softmaxShapeInfo); TRACE_STOP(TraceId::SoftMax); } @@ -116,7 +119,8 @@ __aicore__ inline void SoftMax(const LocalTensor& dstTensor, const LocalTenso * \param [in] isReuseSource: reserved param * \param [in] isBasicBlock: reserved param */ -template +template __aicore__ inline void SoftMax(const LocalTensor& dstTensor, const LocalTensor& srcTensor, const LocalTensor& sharedTmpBuffer, const SoftMaxTiling& tiling, const SoftMaxShapeInfo& softmaxShapeInfo = {}) @@ -125,7 +129,8 @@ __aicore__ inline void SoftMax(const LocalTensor& dstTensor, const LocalTenso return; } TRACE_START(TraceId::SoftMax); - SoftMaxImpl(dstTensor, srcTensor, sharedTmpBuffer, tiling, softmaxShapeInfo); + SoftMaxImpl(dstTensor, srcTensor, sharedTmpBuffer, tiling, + softmaxShapeInfo); TRACE_STOP(TraceId::SoftMax); } @@ -146,7 +151,8 @@ __aicore__ inline void SoftMax(const LocalTensor& dstTensor, const LocalTenso * improve performance, but it is a reserved param when isDataFormatNZ = true * \param [in] isDataFormatNZ: if the data format of input srcTensor is NZ */ -template +template __aicore__ inline void SoftMax(const LocalTensor& dstTensor, const LocalTensor& sumTensor, const LocalTensor& maxTensor, const LocalTensor& srcTensor, const LocalTensor& sharedTmpBuffer, const SoftMaxTiling& tiling, const SoftMaxShapeInfo& softmaxShapeInfo = {}) @@ -155,8 +161,8 @@ __aicore__ inline void SoftMax(const LocalTensor& dstTensor, const LocalTenso return; } TRACE_START(TraceId::SoftMax); - SoftMaxImpl(dstTensor, sumTensor, maxTensor, srcTensor, sharedTmpBuffer, tiling, - softmaxShapeInfo); + SoftMaxImpl(dstTensor, sumTensor, maxTensor, srcTensor, sharedTmpBuffer, + tiling, softmaxShapeInfo); TRACE_STOP(TraceId::SoftMax); } @@ -176,7 +182,8 @@ __aicore__ inline void SoftMax(const LocalTensor& dstTensor, const LocalTenso * improve performance, but it is a reserved param when isDataFormatNZ = true * \param [in] isDataFormatNZ: if the data format of input srcTensor is NZ */ -template +template __aicore__ inline void SoftMax(const LocalTensor& dstTensor, const LocalTensor& sumTensor, const LocalTensor& maxTensor, const LocalTensor& srcTensor, const LocalTensor& sharedTmpBuffer, const 
SoftMaxTiling& tiling, @@ -186,8 +193,8 @@ __aicore__ inline void SoftMax(const LocalTensor& dstTensor, const LocalTe return; } TRACE_START(TraceId::SoftMax); - SoftMaxImpl(dstTensor, sumTensor, maxTensor, srcTensor, sharedTmpBuffer, - tiling, softmaxShapeInfo); + SoftMaxImpl(dstTensor, sumTensor, maxTensor, srcTensor, + sharedTmpBuffer, tiling, softmaxShapeInfo); TRACE_STOP(TraceId::SoftMax); } @@ -216,4 +223,4 @@ __aicore__ inline bool AdjustSoftMaxRes(const LocalTensor& softMaxRes, const #pragma end_pipe #endif -#endif // LIB_SOFTMAX_SOFTMAX_H +#endif // LIB_SOFTMAX_SOFTMAX_H \ No newline at end of file diff --git a/tests/activation/softmax/test_operator_softmax_v220.cpp b/tests/activation/softmax/test_operator_softmax_v220.cpp index 22743022..0891bd7c 100644 --- a/tests/activation/softmax/test_operator_softmax_v220.cpp +++ b/tests/activation/softmax/test_operator_softmax_v220.cpp @@ -18,9 +18,11 @@ using namespace std; using namespace AscendC; - +constexpr SoftmaxConfig config = { true, 16, 16 }; namespace AscendC { -template class KernelSoftmax { +template +class KernelSoftmax { public: __aicore__ inline KernelSoftmax() {} __aicore__ inline void Init(__gm__ uint8_t* src0Gm, __gm__ uint8_t* src1Gm, __gm__ uint8_t* dstGm, @@ -82,6 +84,7 @@ private: SoftMaxShapeInfo srcShape = { height, width, height, width }; SoftMaxTiling tiling; SoftMax(srcLocal1, insumLocal, inmaxLocal, srcLocal1, tiling, srcShape); + SoftMax(srcLocal1, insumLocal, inmaxLocal, srcLocal1, tiling, srcShape); SimpleSoftMax(dstLocal, insumLocal, inmaxLocal, srcLocal1, tiling, srcShape); SoftmaxFlash(dstLocal, insumLocal, inmaxLocal, srcLocal1, expMaxTensor, insumLocal, @@ -135,11 +138,12 @@ private: }; } // namespace AscendC -template +template __global__ __aicore__ void MainSoftmax(__gm__ uint8_t* dstGm, __gm__ uint8_t* src0Gm, __gm__ uint8_t* src1Gm, uint32_t height, uint32_t width) { - AscendC::KernelSoftmax op; + AscendC::KernelSoftmax op; op.Init(src0Gm, src1Gm, dstGm, height, width); op.Process(); } @@ -175,7 +179,8 @@ INSTANTIATE_TEST_CASE_P(TEST_OPEARATION_SOFTMAX, SoftMaxTestsuite, SoftMaxTestParams{ 2, 50, 288, MainSoftmax }, SoftMaxTestParams{ 4, 64, 128, MainSoftmax }, SoftMaxTestParams{ 4, 64, 128, MainSoftmax }, - SoftMaxTestParams{ 4, 50, 144, MainSoftmax }, + SoftMaxTestParams{ 4, 50, 144, MainSoftmax }, + SoftMaxTestParams{ 4, 50, 144, MainSoftmax }, SoftMaxTestParams{ 4, 16, 960, MainSoftmax }, SoftMaxTestParams{ 4, 100, 32, MainSoftmax }, SoftMaxTestParams{ 4, 50, 288, MainSoftmax }, @@ -190,4 +195,4 @@ TEST_P(SoftMaxTestsuite, SoftMaxOpTestCase) uint8_t src1Gm[param.height * param.width * param.typeSize]; uint8_t dstGm[param.height * param.width * param.typeSize]; param.cal_func(dstGm, src0Gm, src1Gm, param.height, param.width); -} +} \ No newline at end of file -- Gitee From 22928ee6a2941fcd5ff10dd5427bbffc5cec0c42 Mon Sep 17 00:00:00 2001 From: liyihan123 Date: Fri, 13 Sep 2024 15:00:43 +0800 Subject: [PATCH 2/2] softmax const compile --- impl/activation/softmax/softmax_base_impl.h | 36 ++-- impl/activation/softmax/softmax_common.h | 10 +- impl/activation/softmax/v200/softmax_impl.h | 9 +- impl/activation/softmax/v220/softmax_impl.h | 185 +++++++++++++----- impl/activation/softmax/v300/softmax_impl.h | 4 +- lib/activation/softmax.h | 37 ++-- .../softmax/test_operator_softmax_v220.cpp | 14 +- 7 files changed, 200 insertions(+), 95 deletions(-) diff --git a/impl/activation/softmax/softmax_base_impl.h b/impl/activation/softmax/softmax_base_impl.h index 8c44c306..649e0654 100644 ---
a/impl/activation/softmax/softmax_base_impl.h +++ b/impl/activation/softmax/softmax_base_impl.h @@ -24,7 +24,8 @@ #endif namespace AscendC { -template +template __aicore__ inline void SoftMaxImpl(const LocalTensor& dst, const LocalTensor& src, const LocalTensor& workLocal, const SoftMaxTiling& tiling, const SoftMaxShapeInfo& softmaxShapeInfo) { @@ -45,28 +46,31 @@ __aicore__ inline void SoftMaxImpl(const LocalTensor& dst, const LocalTensor< SoftMaxTiling newTiling = tiling; SoftMaxTilingFunc(workLocal.GetSize(), { srcNDinfo.m, srcNDinfo.k, originalSrcShape.m, srcNDinfo.k }, newTiling, sizeof(T), sizeof(float), isBasicBlock); - SoftMaxNDImpl(dst, src, workLocal, originalSrcShape, newTiling); + SoftMaxNDImpl(dst, src, workLocal, originalSrcShape, newTiling); } else { - SoftMaxNDImpl(dst, src, workLocal, originalSrcShape, tiling); + SoftMaxNDImpl(dst, src, workLocal, originalSrcShape, tiling); } } -template +template __aicore__ inline void SoftMaxImpl(const LocalTensor& dst, const LocalTensor& src, const SoftMaxTiling& tiling, const SoftMaxShapeInfo& softmaxShapeInfo) { LocalTensor workLocal; PopStackBuffer(workLocal); - SoftMaxImpl(dst, src, workLocal, tiling, softmaxShapeInfo); + SoftMaxImpl(dst, src, workLocal, tiling, softmaxShapeInfo); } -template +template __aicore__ inline void SoftMaxImpl(const LocalTensor& dst, const LocalTensor& src, const LocalTensor& sharedTmpBuffer, const SoftMaxTiling& tiling, const SoftMaxShapeInfo& softmaxShapeInfo) { auto workLocal = sharedTmpBuffer.ReinterpretCast(); - SoftMaxImpl(dst, src, workLocal, tiling, softmaxShapeInfo); + SoftMaxImpl(dst, src, workLocal, tiling, softmaxShapeInfo); } -template +template __aicore__ inline void SoftMaxImpl(const LocalTensor& dst, const LocalTensor& sumTensor, const LocalTensor& maxTensor, const LocalTensor& src, const LocalTensor& workLocal, const SoftMaxTiling& tiling, const SoftMaxShapeInfo& softmaxShapeInfo) @@ -99,31 +103,35 @@ __aicore__ inline void SoftMaxImpl(const LocalTensor& dst, const LocalTensor SoftMaxTiling newTiling = tiling; SoftMaxTilingFunc(workLocal.GetSize(), { srcNDinfo.m, srcNDinfo.k, originalSrcShape.m, srcNDinfo.k }, newTiling, sizeof(T1), sizeof(T2), isBasicBlock); - SoftMaxNDImpl(dst, sumTensor, maxTensor, src, workLocal, originalSrcShape, newTiling); + SoftMaxNDImpl(dst, sumTensor, maxTensor, src, workLocal, originalSrcShape, + newTiling); } else { - SoftMaxNDImpl(dst, sumTensor, maxTensor, src, workLocal, originalSrcShape, tiling); + SoftMaxNDImpl(dst, sumTensor, maxTensor, src, workLocal, originalSrcShape, + tiling); } } } -template +template __aicore__ inline void SoftMaxImpl(const LocalTensor& dst, const LocalTensor& sumTensor, const LocalTensor& maxTensor, const LocalTensor& src, const SoftMaxTiling& tiling, const SoftMaxShapeInfo& softmaxShapeInfo) { LocalTensor workLocal; PopStackBuffer(workLocal); - SoftMaxImpl(dst, sumTensor, maxTensor, src, workLocal, tiling, + SoftMaxImpl(dst, sumTensor, maxTensor, src, workLocal, tiling, softmaxShapeInfo); } -template +template __aicore__ inline void SoftMaxImpl(const LocalTensor& dst, const LocalTensor& sumTensor, const LocalTensor& maxTensor, const LocalTensor& src, const LocalTensor& sharedTmpBuffer, const SoftMaxTiling& tiling, const SoftMaxShapeInfo& softmaxShapeInfo) { auto workLocal = sharedTmpBuffer.ReinterpretCast(); - SoftMaxImpl(dst, sumTensor, maxTensor, src, workLocal, tiling, + SoftMaxImpl(dst, sumTensor, maxTensor, src, workLocal, tiling, softmaxShapeInfo); } diff --git a/impl/activation/softmax/softmax_common.h 
b/impl/activation/softmax/softmax_common.h index dd7ea1a7..08c4e540 100644 --- a/impl/activation/softmax/softmax_common.h +++ b/impl/activation/softmax/softmax_common.h @@ -65,11 +65,19 @@ struct SoftmaxConfig { { isCheckTiling = isCheckTilingIn; } + __aicore__ constexpr SoftmaxConfig(const bool isCheckTilingIn, const uint32_t oriSrcMIn, const uint32_t oriSrcKIn) + { + isCheckTiling = isCheckTilingIn; + oriSrcM = oriSrcMIn; + oriSrcK = oriSrcKIn; + } // to judge if match or not of input shape and tiling, if not match, api will recompute tiling, default to judge bool isCheckTiling = true; + uint32_t oriSrcM = 0; + uint32_t oriSrcK = 0; }; -constexpr SoftmaxConfig SOFTMAX_DEFAULT_CFG = { true }; +constexpr SoftmaxConfig SOFTMAX_DEFAULT_CFG = { true, 0, 0 }; __aicore__ inline LastAxisShapeND GetLastAxisShapeND(const ShapeInfo& shapeInfo) { diff --git a/impl/activation/softmax/v200/softmax_impl.h b/impl/activation/softmax/v200/softmax_impl.h index 2f76d21e..287dee4e 100644 --- a/impl/activation/softmax/v200/softmax_impl.h +++ b/impl/activation/softmax/v200/softmax_impl.h @@ -577,7 +577,8 @@ __aicore__ inline void SoftMaxGenericNDImpl(const LocalTensor& dst, const Cast(dst[offset1], tmpBuffer0, FLOAT2HALF_ROUND_MODE, splitSize); } -template +template __aicore__ inline void SoftMaxNDImpl(const LocalTensor& dst, const LocalTensor& sumTensor, const LocalTensor& maxTensor, const LocalTensor& src, const LocalTensor& workLocal, const LastAxisShapeND& originalSrcShape, const SoftMaxTiling& tiling) @@ -733,7 +734,8 @@ __aicore__ inline void SoftMaxGenericNDImpl(const LocalTensor& dst, const Cast(dst[offset1], tmpBuffer0, FLOAT2HALF_ROUND_MODE, splitSize); } -template +template __aicore__ inline void SoftMaxNDImpl(const LocalTensor& dst, const LocalTensor& sumTensor, const LocalTensor& maxTensor, const LocalTensor& src, const LocalTensor& workLocal, const LastAxisShapeND& originalSrcShape, const SoftMaxTiling& tiling) @@ -814,7 +816,8 @@ __aicore__ inline void SingleSoftMaxImpl(const LocalTensor& dst, const Lo TransDivToMulImpl(dst[offset], tmpBuffer0, tmpBuffer1, reduceParam.originalSrcM, tiling.srcK, tiling.reduceK); } -template +template __aicore__ inline void SoftMaxNDImpl(const LocalTensor& dst, const LocalTensor& src, const LocalTensor& workLocal, const LastAxisShapeND& originalSrcShape, const SoftMaxTiling& tiling) { diff --git a/impl/activation/softmax/v220/softmax_impl.h b/impl/activation/softmax/v220/softmax_impl.h index b3433528..ceff1541 100644 --- a/impl/activation/softmax/v220/softmax_impl.h +++ b/impl/activation/softmax/v220/softmax_impl.h @@ -556,39 +556,72 @@ __aicore__ inline void SoftMaxGenericNDImpl(const LocalTensor& dst, const Cast(dst[offset1], tmpBuffer0, FLOAT2HALF_ROUND_MODE, splitSize); } -template -__aicore__ inline void SoftMaxNDImpl(const LocalTensor& dst, const LocalTensor& sumTensor, - const LocalTensor& maxTensor, const LocalTensor& src, const LocalTensor& workLocal, - const LastAxisShapeND& originalSrcShape, const SoftMaxTiling& tiling) +template +__aicore__ inline void SoftMaxNDExtImpl(const LocalTensor& dst, const LocalTensor& sumTensor, + const LocalTensor& maxTensor, const LocalTensor& src, const LocalTensor& workLocal, + const LastAxisShapeND& originalSrcShape, const SoftMaxTiling& tiling, ReduceLastND& reduceParam) { - ReduceLastND reduceParam = { tiling.splitM, originalSrcShape.k, tiling.splitM, - tiling.splitK, tiling.reduceM, tiling.reduceK }; uint32_t offset1 = 0; uint32_t offset2 = 0; uint32_t splitSize = tiling.splitSize; uint32_t reduceSize = 
tiling.reduceSize; + for (uint32_t i = 0; i <= tiling.rangeM; i++) { + SoftMaxGenericNDImpl(dst, sumTensor, maxTensor, src, workLocal, tiling, offset1, offset2, splitSize, + reduceSize, reduceParam); + offset1 += tiling.splitSize; + offset2 += tiling.reduceSize; + if (i == (tiling.rangeM - 1)) { + if (tiling.tailM == 0) { + break; + } + offset2 = tiling.rangeM * tiling.reduceSize; + offset1 = tiling.rangeM * tiling.splitSize; + splitSize = tiling.tailSplitSize; + reduceSize = tiling.tailReduceSize; + reduceParam.originalSrcM = tiling.tailM; + reduceParam.srcM = tiling.tailM; + reduceParam.dstM = tiling.tailM; + PipeBarrier(); + } + } +} + +template +__aicore__ inline void SoftMaxNDImpl(const LocalTensor& dst, const LocalTensor& sumTensor, + const LocalTensor& maxTensor, const LocalTensor& src, const LocalTensor& workLocal, + const LastAxisShapeND& originalSrcShape, const SoftMaxTiling& tiling) +{ PipeBarrier(); - if constexpr (isBasicBlock) { - SoftMaxBasicBlock(dst, sumTensor, maxTensor, src, workLocal, tiling); + if constexpr (config.oriSrcM == 0 || config.oriSrcK == 0) { + if constexpr (isBasicBlock) { + SoftMaxBasicBlock(dst, sumTensor, maxTensor, src, workLocal, tiling); + } else { + ReduceLastND reduceParam = { tiling.splitM, originalSrcShape.k, tiling.splitM, + tiling.splitK, tiling.reduceM, tiling.reduceK }; + SoftMaxNDExtImpl(dst, sumTensor, maxTensor, src, workLocal, originalSrcShape, tiling, reduceParam); + } } else { - for (uint32_t i = 0; i <= tiling.rangeM; i++) { - SoftMaxGenericNDImpl(dst, sumTensor, maxTensor, src, workLocal, tiling, offset1, offset2, splitSize, - reduceSize, reduceParam); - offset1 += tiling.splitSize; - offset2 += tiling.reduceSize; - if (i == (tiling.rangeM - 1)) { - if (tiling.tailM == 0) { - break; - } - offset2 = tiling.rangeM * tiling.reduceSize; - offset1 = tiling.rangeM * tiling.splitSize; - splitSize = tiling.tailSplitSize; - reduceSize = tiling.tailReduceSize; - reduceParam.originalSrcM = tiling.tailM; - reduceParam.srcM = tiling.tailM; - reduceParam.dstM = tiling.tailM; - PipeBarrier(); + constexpr uint32_t basicBlockMaxK = 2048; + constexpr bool localIsBasicBlock = config.oriSrcK % FLOAT_REPEAT_SIZE == 0 && + config.oriSrcK < basicBlockMaxK && config.oriSrcM % FLOAT_NUM_PER_BLK == 0; + if constexpr (localIsBasicBlock) { + SoftMaxBasicBlock(dst, sumTensor, maxTensor, src, workLocal, tiling); + } else { + uint32_t splitK = 0; + ReduceLastND reduceParam; + if constexpr (config.oriSrcK % FLOAT_NUM_PER_BLK == 0) { + splitK = config.oriSrcK; + } else { + splitK = AlignUp(config.oriSrcK, FLOAT_NUM_PER_BLK); } + if constexpr (SupportType()) { + reduceParam = { tiling.splitM, config.oriSrcK, tiling.splitM, splitK, tiling.reduceM, + DEFAULT_REPEAT_STRIDE * HALF_FACTOR }; + } else if constexpr (SupportType()) { + reduceParam = { tiling.splitM, config.oriSrcK, tiling.splitM, splitK, tiling.reduceM, + DEFAULT_REPEAT_STRIDE }; + } + SoftMaxNDExtImpl(dst, sumTensor, maxTensor, src, workLocal, originalSrcShape, tiling, reduceParam); } } } @@ -694,39 +727,65 @@ __aicore__ inline void SoftMaxGenericNDImpl(const LocalTensor& dst, const Cast(dst[offset1], tmpBuffer0, FLOAT2HALF_ROUND_MODE, splitSize); } -template +__aicore__ inline void SoftMaxNDExtImpl(const LocalTensor& dst, const LocalTensor& sumTensor, + const LocalTensor& maxTensor, const LocalTensor& src, const LocalTensor& workLocal, + const LastAxisShapeND& originalSrcShape, const SoftMaxTiling& tiling, ReduceLastND& reduceParam) +{ + uint32_t offset1 = 0; + uint32_t offset2 = 0; + uint32_t splitSize = 
tiling.splitSize; + uint32_t reduceSize = tiling.reduceSize; + for (uint32_t i = 0; i <= tiling.rangeM; i++) { + SoftMaxGenericNDImpl(dst, sumTensor, maxTensor, src, workLocal, tiling, offset1, offset2, splitSize, + reduceParam); + offset1 += tiling.splitSize; + offset2 += tiling.reduceSize; + if (i == (tiling.rangeM - 1)) { + if (tiling.tailM == 0) { + break; + } + offset2 = tiling.rangeM * tiling.reduceSize; + offset1 = tiling.rangeM * tiling.splitSize; + splitSize = tiling.tailSplitSize; + reduceSize = tiling.tailReduceSize; + reduceParam.originalSrcM = tiling.tailM; + reduceParam.srcM = tiling.tailM; + reduceParam.dstM = tiling.tailM; + PipeBarrier(); + } + } +} + +template __aicore__ inline void SoftMaxNDImpl(const LocalTensor& dst, const LocalTensor& sumTensor, const LocalTensor& maxTensor, const LocalTensor& src, const LocalTensor& workLocal, const LastAxisShapeND& originalSrcShape, const SoftMaxTiling& tiling) { PipeBarrier(); - if constexpr (isBasicBlock) { - SoftMaxBasicBlock(dst, sumTensor, maxTensor, src, workLocal, tiling); + if constexpr (config.oriSrcM == 0 || config.oriSrcK == 0) { + if constexpr (isBasicBlock) { + SoftMaxBasicBlock(dst, sumTensor, maxTensor, src, workLocal, tiling); + } else { + ReduceLastND reduceParam = { tiling.splitM, originalSrcShape.k, tiling.splitM, + tiling.splitK, tiling.reduceM, tiling.reduceK }; + SoftMaxNDExtImpl(dst, sumTensor, maxTensor, src, workLocal, originalSrcShape, tiling, reduceParam); + } } else { - ReduceLastND reduceParam = { tiling.splitM, originalSrcShape.k, tiling.splitM, - tiling.splitK, tiling.reduceM, tiling.reduceK }; - uint32_t offset1 = 0; - uint32_t offset2 = 0; - uint32_t splitSize = tiling.splitSize; - uint32_t reduceSize = tiling.reduceSize; - for (uint32_t i = 0; i <= tiling.rangeM; i++) { - SoftMaxGenericNDImpl(dst, sumTensor, maxTensor, src, workLocal, tiling, offset1, offset2, splitSize, - reduceParam); - offset1 += tiling.splitSize; - offset2 += tiling.reduceSize; - if (i == (tiling.rangeM - 1)) { - if (tiling.tailM == 0) { - break; - } - offset2 = tiling.rangeM * tiling.reduceSize; - offset1 = tiling.rangeM * tiling.splitSize; - splitSize = tiling.tailSplitSize; - reduceSize = tiling.tailReduceSize; - reduceParam.originalSrcM = tiling.tailM; - reduceParam.srcM = tiling.tailM; - reduceParam.dstM = tiling.tailM; - PipeBarrier(); + constexpr uint32_t basicBlockMaxK = 2048; + constexpr bool localIsBasicBlock = config.oriSrcK % FLOAT_REPEAT_SIZE == 0 && + config.oriSrcK < basicBlockMaxK && config.oriSrcM % FLOAT_NUM_PER_BLK == 0; + if constexpr (localIsBasicBlock) { + SoftMaxBasicBlock(dst, sumTensor, maxTensor, src, workLocal, tiling); + } else { + uint32_t splitK = 0; + if constexpr (config.oriSrcK % FLOAT_NUM_PER_BLK == 0) { + splitK = config.oriSrcK; + } else { + splitK = AlignUp(config.oriSrcK, FLOAT_NUM_PER_BLK); } + ReduceLastND reduceParam = { tiling.splitM, config.oriSrcK, tiling.splitM, splitK, tiling.reduceM, + DEFAULT_REPEAT_STRIDE * HALF_FACTOR }; + SoftMaxNDExtImpl(dst, sumTensor, maxTensor, src, workLocal, originalSrcShape, tiling, reduceParam); } } } @@ -785,14 +844,32 @@ __aicore__ inline void SingleSoftMaxImpl(const LocalTensor& dst, const Lo } } -template +template __aicore__ inline void SoftMaxNDImpl(const LocalTensor& dst, const LocalTensor& src, const LocalTensor& workLocal, const LastAxisShapeND& originalSrcShape, const SoftMaxTiling& tiling) { uint32_t offset = 0; uint32_t splitSize = tiling.splitSize; - ReduceLastND reduceParam = { tiling.splitM, originalSrcShape.k, tiling.splitM, - 
tiling.splitK, tiling.reduceM, tiling.reduceK }; + ReduceLastND reduceParam; + if constexpr (config.oriSrcM == 0 || config.oriSrcK == 0) { + reduceParam = { tiling.splitM, originalSrcShape.k, tiling.splitM, tiling.splitK, tiling.reduceM, + tiling.reduceK }; + } else { + uint32_t splitK = 0; + if constexpr (config.oriSrcK % FLOAT_NUM_PER_BLK == 0) { + splitK = config.oriSrcK; + } else { + splitK = AlignUp(config.oriSrcK, FLOAT_NUM_PER_BLK); + } + if constexpr (SupportType()) { + reduceParam = { tiling.splitM, config.oriSrcK, tiling.splitM, splitK, tiling.reduceM, + DEFAULT_REPEAT_STRIDE * HALF_FACTOR }; + } else if constexpr (SupportType()) { + reduceParam = { tiling.splitM, config.oriSrcK, tiling.splitM, splitK, tiling.reduceM, + DEFAULT_REPEAT_STRIDE }; + } + } PipeBarrier(); for (uint32_t i = 0; i <= tiling.rangeM; i++) { SingleSoftMaxImpl(dst, src, workLocal, tiling, offset, splitSize, reduceParam); diff --git a/impl/activation/softmax/v300/softmax_impl.h b/impl/activation/softmax/v300/softmax_impl.h index ea7699a7..0d8b7f34 100644 --- a/impl/activation/softmax/v300/softmax_impl.h +++ b/impl/activation/softmax/v300/softmax_impl.h @@ -72,7 +72,7 @@ __aicore__ inline void SoftMaxGenericNDImpl(const LocalTensor& dst, const } } -template +template __aicore__ inline void SoftMaxNDImpl(const LocalTensor& dst, const LocalTensor& sumTensor, const LocalTensor& maxTensor, const LocalTensor& src, const LocalTensor& workLocal, const LastAxisShapeND& originalSrcShape, const SoftMaxTiling& tiling) @@ -103,7 +103,7 @@ __aicore__ inline void SoftMaxNDImpl(const LocalTensor& dst, const LocalTens } } } -template +template __aicore__ inline void SoftMaxNDImpl(const LocalTensor& dst, const LocalTensor& sumTensor, const LocalTensor& maxTensor, const LocalTensor& src, const LocalTensor& workLocal, const LastAxisShapeND& originalSrcShape, const SoftMaxTiling& tiling) diff --git a/lib/activation/softmax.h b/lib/activation/softmax.h index 827d115f..a0342dae 100644 --- a/lib/activation/softmax.h +++ b/lib/activation/softmax.h @@ -38,7 +38,8 @@ namespace AscendC { * improve performance , but it is a reserved param when isDataFormatNZ = true * \param [in] isDataFormatNZ: if the data format of input srcTensor is NZ */ -template +template __aicore__ inline void SoftMax(const LocalTensor& dstTensor, const LocalTensor& sumTensor, const LocalTensor& maxTensor, const LocalTensor& srcTensor, const SoftMaxTiling& tiling, const SoftMaxShapeInfo& softmaxShapeInfo = {}) @@ -47,7 +48,7 @@ __aicore__ inline void SoftMax(const LocalTensor& dstTensor, const LocalTenso return; } TRACE_START(TraceId::SoftMax); - SoftMaxImpl(dstTensor, sumTensor, maxTensor, srcTensor, tiling, + SoftMaxImpl(dstTensor, sumTensor, maxTensor, srcTensor, tiling, softmaxShapeInfo); TRACE_STOP(TraceId::SoftMax); } @@ -66,7 +67,8 @@ __aicore__ inline void SoftMax(const LocalTensor& dstTensor, const LocalTenso * improve performance, but it is a reserved param when isDataFormatNZ = true * \param [in] isDataFormatNZ: if the data format of input srcTensor is NZ */ -template +template __aicore__ inline void SoftMax(const LocalTensor& dstTensor, const LocalTensor& sumTensor, const LocalTensor& maxTensor, const LocalTensor& srcTensor, const SoftMaxTiling& tiling, const SoftMaxShapeInfo& softmaxShapeInfo = {}) @@ -75,7 +77,7 @@ __aicore__ inline void SoftMax(const LocalTensor& dstTensor, const LocalTe return; } TRACE_START(TraceId::SoftMax); - SoftMaxImpl(dstTensor, sumTensor, maxTensor, srcTensor, tiling, + SoftMaxImpl(dstTensor, sumTensor, maxTensor, srcTensor, 
tiling, softmaxShapeInfo); TRACE_STOP(TraceId::SoftMax); } @@ -91,7 +93,8 @@ __aicore__ inline void SoftMax(const LocalTensor& dstTensor, const LocalTe * \param [in] isReuseSource: reserved param * \param [in] isBasicBlock: reserved param */ -template +template __aicore__ inline void SoftMax(const LocalTensor& dstTensor, const LocalTensor& srcTensor, const SoftMaxTiling& tiling, const SoftMaxShapeInfo& softmaxShapeInfo = {}) { @@ -99,7 +102,7 @@ __aicore__ inline void SoftMax(const LocalTensor& dstTensor, const LocalTenso return; } TRACE_START(TraceId::SoftMax); - SoftMaxImpl(dstTensor, srcTensor, tiling, softmaxShapeInfo); + SoftMaxImpl(dstTensor, srcTensor, tiling, softmaxShapeInfo); TRACE_STOP(TraceId::SoftMax); } @@ -116,7 +119,8 @@ __aicore__ inline void SoftMax(const LocalTensor& dstTensor, const LocalTenso * \param [in] isReuseSource: reserved param * \param [in] isBasicBlock: reserved param */ -template +template __aicore__ inline void SoftMax(const LocalTensor& dstTensor, const LocalTensor& srcTensor, const LocalTensor& sharedTmpBuffer, const SoftMaxTiling& tiling, const SoftMaxShapeInfo& softmaxShapeInfo = {}) @@ -125,7 +129,8 @@ __aicore__ inline void SoftMax(const LocalTensor& dstTensor, const LocalTenso return; } TRACE_START(TraceId::SoftMax); - SoftMaxImpl(dstTensor, srcTensor, sharedTmpBuffer, tiling, softmaxShapeInfo); + SoftMaxImpl(dstTensor, srcTensor, sharedTmpBuffer, tiling, + softmaxShapeInfo); TRACE_STOP(TraceId::SoftMax); } @@ -146,7 +151,8 @@ __aicore__ inline void SoftMax(const LocalTensor& dstTensor, const LocalTenso * improve performance, but it is a reserved param when isDataFormatNZ = true * \param [in] isDataFormatNZ: if the data format of input srcTensor is NZ */ -template +template __aicore__ inline void SoftMax(const LocalTensor& dstTensor, const LocalTensor& sumTensor, const LocalTensor& maxTensor, const LocalTensor& srcTensor, const LocalTensor& sharedTmpBuffer, const SoftMaxTiling& tiling, const SoftMaxShapeInfo& softmaxShapeInfo = {}) @@ -155,8 +161,8 @@ __aicore__ inline void SoftMax(const LocalTensor& dstTensor, const LocalTenso return; } TRACE_START(TraceId::SoftMax); - SoftMaxImpl(dstTensor, sumTensor, maxTensor, srcTensor, sharedTmpBuffer, tiling, - softmaxShapeInfo); + SoftMaxImpl(dstTensor, sumTensor, maxTensor, srcTensor, sharedTmpBuffer, + tiling, softmaxShapeInfo); TRACE_STOP(TraceId::SoftMax); } @@ -176,7 +182,8 @@ __aicore__ inline void SoftMax(const LocalTensor& dstTensor, const LocalTenso * improve performance, but it is a reserved param when isDataFormatNZ = true * \param [in] isDataFormatNZ: if the data format of input srcTensor is NZ */ -template +template __aicore__ inline void SoftMax(const LocalTensor& dstTensor, const LocalTensor& sumTensor, const LocalTensor& maxTensor, const LocalTensor& srcTensor, const LocalTensor& sharedTmpBuffer, const SoftMaxTiling& tiling, @@ -186,8 +193,8 @@ __aicore__ inline void SoftMax(const LocalTensor& dstTensor, const LocalTe return; } TRACE_START(TraceId::SoftMax); - SoftMaxImpl(dstTensor, sumTensor, maxTensor, srcTensor, sharedTmpBuffer, - tiling, softmaxShapeInfo); + SoftMaxImpl(dstTensor, sumTensor, maxTensor, srcTensor, + sharedTmpBuffer, tiling, softmaxShapeInfo); TRACE_STOP(TraceId::SoftMax); } @@ -216,4 +223,4 @@ __aicore__ inline bool AdjustSoftMaxRes(const LocalTensor& softMaxRes, const #pragma end_pipe #endif -#endif // LIB_SOFTMAX_SOFTMAX_H +#endif // LIB_SOFTMAX_SOFTMAX_H \ No newline at end of file diff --git a/tests/activation/softmax/test_operator_softmax_v220.cpp 
b/tests/activation/softmax/test_operator_softmax_v220.cpp index 22743022..420513f3 100644 --- a/tests/activation/softmax/test_operator_softmax_v220.cpp +++ b/tests/activation/softmax/test_operator_softmax_v220.cpp @@ -18,9 +18,10 @@ using namespace std; using namespace AscendC; - namespace AscendC { -template class KernelSoftmax { +template +class KernelSoftmax { public: __aicore__ inline KernelSoftmax() {} __aicore__ inline void Init(__gm__ uint8_t* src0Gm, __gm__ uint8_t* src1Gm, __gm__ uint8_t* dstGm, @@ -135,11 +136,12 @@ private: }; } // namespace AscendC -template +template __global__ __aicore__ void MainSoftmax(__gm__ uint8_t* dstGm, __gm__ uint8_t* src0Gm, __gm__ uint8_t* src1Gm, uint32_t height, uint32_t width) { - AscendC::KernelSoftmax op; + AscendC::KernelSoftmax op; op.Init(src0Gm, src1Gm, dstGm, height, width); op.Process(); } @@ -175,7 +177,7 @@ INSTANTIATE_TEST_CASE_P(TEST_OPEARATION_SOFTMAX, SoftMaxTestsuite, SoftMaxTestParams{ 2, 50, 288, MainSoftmax }, SoftMaxTestParams{ 4, 64, 128, MainSoftmax }, SoftMaxTestParams{ 4, 64, 128, MainSoftmax }, - SoftMaxTestParams{ 4, 50, 144, MainSoftmax }, + SoftMaxTestParams{ 4, 50, 144, MainSoftmax }, SoftMaxTestParams{ 4, 16, 960, MainSoftmax }, SoftMaxTestParams{ 4, 100, 32, MainSoftmax }, SoftMaxTestParams{ 4, 50, 288, MainSoftmax }, @@ -190,4 +192,4 @@ TEST_P(SoftMaxTestsuite, SoftMaxOpTestCase) uint8_t src1Gm[param.height * param.width * param.typeSize]; uint8_t dstGm[param.height * param.width * param.typeSize]; param.cal_func(dstGm, src0Gm, src1Gm, param.height, param.width); -} +} \ No newline at end of file -- Gitee
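Usage sketch (illustrative addendum, not part of either patch): the snippet below shows how the compile-time shape path added by this series is driven, mirroring the constexpr config { true, 16, 16 } that the patched test declares. The template-argument order <T, isReuseSource, isBasicBlock, config> is inferred from the parameter documentation in lib/activation/softmax.h and should be read as an assumption; the names RunSoftmaxConstShape and kSoftmaxCfg are hypothetical.

#include "lib/activation/softmax.h"

using namespace AscendC;

// Bake the original source shape into the type. With nonzero oriSrcM/oriSrcK,
// the patched SoftMaxNDImpl selects the basic-block or generic reduce path via
// `if constexpr` instead of runtime checks (see the v220 changes above).
constexpr SoftmaxConfig kSoftmaxCfg = { /*isCheckTiling=*/true, /*oriSrcM=*/16, /*oriSrcK=*/16 };

template <typename T>
__aicore__ inline void RunSoftmaxConstShape(const LocalTensor<T>& dst,
    const LocalTensor<T>& sum, const LocalTensor<T>& max,
    const LocalTensor<T>& src, const SoftMaxTiling& tiling)
{
    // The runtime shape must agree with kSoftmaxCfg.oriSrcM/oriSrcK; the
    // compile-time path derives its reduce parameters from the config rather
    // than re-checking the tiling against the actual shape.
    SoftMaxShapeInfo shape = { 16, 16, 16, 16 };
    SoftMax<T, false, false, kSoftmaxCfg>(dst, sum, max, src, tiling, shape);
}

The default SOFTMAX_DEFAULT_CFG = { true, 0, 0 } keeps the old behavior: zero oriSrcM/oriSrcK means "shape unknown at compile time", which is exactly the config.oriSrcM == 0 || config.oriSrcK == 0 guard in the patched SoftMaxNDImpl.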