diff --git a/impl/activation/softmax/simple_softmax_base_impl.h b/impl/activation/softmax/simple_softmax_base_impl.h index 7de89fe2b5a4c83c465b8fb060178a8796258e9b..eaa5798631b02cb73dcd916950a9007335deef16 100644 --- a/impl/activation/softmax/simple_softmax_base_impl.h +++ b/impl/activation/softmax/simple_softmax_base_impl.h @@ -24,7 +24,8 @@ #endif namespace AscendC { -template +template __aicore__ inline void SimpleSoftMaxImpl(const LocalTensor& dst, const LocalTensor& inSumTensor, const LocalTensor& inMaxTensor, const LocalTensor& src, const LocalTensor& workLocal, const SoftMaxTiling& tiling, const SoftMaxShapeInfo& softmaxShapeInfo) @@ -55,32 +56,34 @@ __aicore__ inline void SimpleSoftMaxImpl(const LocalTensor& dst, const Local SoftMaxTiling newTiling = tiling; SoftMaxTilingFunc(workLocal.GetSize(), { srcNDinfo.m, srcNDinfo.k, originalSrcShape.m, srcNDinfo.k }, newTiling, sizeof(T1), sizeof(T2), isBasicBlock); - SimpleSoftMaxNDImpl(dst, inSumTensor, inMaxTensor, src, workLocal, newTiling); + SimpleSoftMaxNDImpl(dst, inSumTensor, inMaxTensor, src, workLocal, newTiling); } else { - SimpleSoftMaxNDImpl(dst, inSumTensor, inMaxTensor, src, workLocal, tiling); + SimpleSoftMaxNDImpl(dst, inSumTensor, inMaxTensor, src, workLocal, tiling); } } } -template +template __aicore__ inline void SimpleSoftMaxImpl(const LocalTensor& dst, const LocalTensor& inSumTensor, const LocalTensor& inMaxTensor, const LocalTensor& src, const SoftMaxTiling& tiling, const SoftMaxShapeInfo& softmaxShapeInfo) { LocalTensor workLocal; PopStackBuffer(workLocal); - SimpleSoftMaxImpl(dst, inSumTensor, inMaxTensor, src, workLocal, tiling, - softmaxShapeInfo); + SimpleSoftMaxImpl(dst, inSumTensor, inMaxTensor, src, workLocal, + tiling, softmaxShapeInfo); } -template +template __aicore__ inline void SimpleSoftMaxImpl(const LocalTensor& dst, const LocalTensor& inSumTensor, const LocalTensor& inMaxTensor, const LocalTensor& src, const LocalTensor& sharedTmpBuffer, const SoftMaxTiling& tiling, const SoftMaxShapeInfo& softmaxShapeInfo) { auto workLocal = sharedTmpBuffer.ReinterpretCast(); - SimpleSoftMaxImpl(dst, inSumTensor, inMaxTensor, src, workLocal, tiling, - softmaxShapeInfo); + SimpleSoftMaxImpl(dst, inSumTensor, inMaxTensor, src, workLocal, + tiling, softmaxShapeInfo); } } #endif // IMPL_ACTIVATION_SOFTMAX_SIMPLE_SOFTMAX_BASE_IMPL_H \ No newline at end of file diff --git a/impl/activation/softmax/v200/simple_softmax_impl.h b/impl/activation/softmax/v200/simple_softmax_impl.h index 85ddba56381c6993867706f2fba152c64b67b011..79dd99af7cf67d7a81158630c9df31c3ff2eab9f 100644 --- a/impl/activation/softmax/v200/simple_softmax_impl.h +++ b/impl/activation/softmax/v200/simple_softmax_impl.h @@ -285,6 +285,7 @@ __aicore__ inline void SimpleSoftMaxBasicBlock(const LocalTensor& dst, co #endif } +template __aicore__ inline void SimpleSoftMaxGenericNDImpl(const LocalTensor& dst, const LocalTensor& inSumTensor, const LocalTensor& inMaxTensor, const LocalTensor& src, const LocalTensor& workLocal, const SoftMaxTiling& tiling, const uint32_t offset1, const uint32_t offset2, const uint32_t curSplitM) @@ -293,37 +294,68 @@ __aicore__ inline void SimpleSoftMaxGenericNDImpl(const LocalTensor& dst, const LocalTensor& tmpBuffer2 = workLocal[tiling.splitSize]; const uint32_t splitSize = curSplitM * tiling.splitK; const uint32_t reduceSize = curSplitM * tiling.reduceK; + if constexpr (config.oriSrcM == 0 || config.oriSrcK == 0) { + Cast(tmpBuffer0, src[offset1], RoundMode::CAST_NONE, splitSize); + Cast(tmpBuffer2, inMaxTensor[offset2], RoundMode::CAST_NONE, reduceSize); + PipeBarrier(); - Cast(tmpBuffer0, src[offset1], RoundMode::CAST_NONE, splitSize); - Cast(tmpBuffer2, inMaxTensor[offset2], RoundMode::CAST_NONE, reduceSize); - PipeBarrier(); + GenericSubNDImpl(tmpBuffer0, tmpBuffer0, tmpBuffer2, curSplitM, tiling.srcK, tiling.reduceK); - GenericSubNDImpl(tmpBuffer0, tmpBuffer0, tmpBuffer2, curSplitM, tiling.srcK, tiling.reduceK); + PipeBarrier(); + Exp(tmpBuffer0, tmpBuffer0, splitSize); - PipeBarrier(); - Exp(tmpBuffer0, tmpBuffer0, splitSize); + Cast(tmpBuffer2, inSumTensor[offset2], RoundMode::CAST_NONE, reduceSize); + PipeBarrier(); + GenericDivNDImpl(tmpBuffer0, tmpBuffer0, tmpBuffer2, curSplitM, tiling.srcK, tiling.reduceK); - Cast(tmpBuffer2, inSumTensor[offset2], RoundMode::CAST_NONE, reduceSize); - PipeBarrier(); - GenericDivNDImpl(tmpBuffer0, tmpBuffer0, tmpBuffer2, curSplitM, tiling.srcK, tiling.reduceK); + PipeBarrier(); + Cast(dst[offset1], tmpBuffer0, FLOAT2HALF_ROUND_MODE, splitSize); + } else { + Cast(tmpBuffer0, src[offset1], RoundMode::CAST_NONE, splitSize); + Cast(tmpBuffer2, inMaxTensor[offset2], RoundMode::CAST_NONE, reduceSize); + PipeBarrier(); - PipeBarrier(); - Cast(dst[offset1], tmpBuffer0, FLOAT2HALF_ROUND_MODE, splitSize); + GenericSubNDImpl(tmpBuffer0, tmpBuffer0, tmpBuffer2, curSplitM, config.oriSrcK, + DEFAULT_REPEAT_STRIDE * HALF_FACTOR); + + PipeBarrier(); + Exp(tmpBuffer0, tmpBuffer0, splitSize); + + Cast(tmpBuffer2, inSumTensor[offset2], RoundMode::CAST_NONE, reduceSize); + PipeBarrier(); + GenericDivNDImpl(tmpBuffer0, tmpBuffer0, tmpBuffer2, curSplitM, config.oriSrcK, + DEFAULT_REPEAT_STRIDE * HALF_FACTOR); + + PipeBarrier(); + Cast(dst[offset1], tmpBuffer0, FLOAT2HALF_ROUND_MODE, splitSize); + } } + +template __aicore__ inline void SimpleSoftMaxGenericNDImpl(const LocalTensor& dst, const LocalTensor& inSumTensor, const LocalTensor& inMaxTensor, const LocalTensor& src, const LocalTensor& workLocal, const SoftMaxTiling& tiling, const uint32_t offset1, const uint32_t offset2, const uint32_t curSplitM) { const uint32_t splitSize = curSplitM * tiling.splitK; - GenericSubNDImpl(dst[offset1], src[offset1], inMaxTensor[offset2], curSplitM, tiling.srcK, tiling.reduceK); - PipeBarrier(); - Exp(dst[offset1], dst[offset1], splitSize); - PipeBarrier(); - GenericDivNDImpl(dst[offset1], dst[offset1], inSumTensor[offset2], curSplitM, tiling.srcK, tiling.reduceK); + if constexpr (config.oriSrcM == 0 || config.oriSrcK == 0) { + GenericSubNDImpl(dst[offset1], src[offset1], inMaxTensor[offset2], curSplitM, tiling.srcK, tiling.reduceK); + PipeBarrier(); + Exp(dst[offset1], dst[offset1], splitSize); + PipeBarrier(); + GenericDivNDImpl(dst[offset1], dst[offset1], inSumTensor[offset2], curSplitM, tiling.srcK, tiling.reduceK); + } else { + GenericSubNDImpl(dst[offset1], src[offset1], inMaxTensor[offset2], curSplitM, config.oriSrcK, + DEFAULT_REPEAT_STRIDE); + PipeBarrier(); + Exp(dst[offset1], dst[offset1], splitSize); + PipeBarrier(); + GenericDivNDImpl(dst[offset1], dst[offset1], inSumTensor[offset2], curSplitM, config.oriSrcK, + DEFAULT_REPEAT_STRIDE); + } } -template +template __aicore__ inline void SimpleSoftMaxNDImpl(const LocalTensor& dst, const LocalTensor& inSumTensor, const LocalTensor& inMaxTensor, const LocalTensor& src, const LocalTensor workLocal, const SoftMaxTiling& tiling) @@ -332,7 +364,8 @@ __aicore__ inline void SimpleSoftMaxNDImpl(const LocalTensor& dst, const Loca SimpleSoftMaxBasicBlock(dst, inSumTensor, inMaxTensor, src, workLocal, tiling); } else { if constexpr (sizeof(T) == sizeof(float)) { - SimpleSoftMaxGenericNDImpl(dst, inSumTensor, inMaxTensor, src, workLocal, tiling, 0, 0, tiling.srcM); + SimpleSoftMaxGenericNDImpl(dst, inSumTensor, inMaxTensor, src, workLocal, tiling, 0, 0, + tiling.srcM); } else { uint32_t offset1 = 0; uint32_t offset2 = 0; @@ -340,15 +373,15 @@ __aicore__ inline void SimpleSoftMaxNDImpl(const LocalTensor& dst, const Loca for (uint32_t i = 0; i < tiling.rangeM; i++) { offset1 = i * tiling.splitSize; offset2 = i * tiling.reduceSize; - SimpleSoftMaxGenericNDImpl(dst, inSumTensor, inMaxTensor, src, workLocal, tiling, offset1, offset2, - tiling.splitM); + SimpleSoftMaxGenericNDImpl(dst, inSumTensor, inMaxTensor, src, workLocal, tiling, offset1, + offset2, tiling.splitM); } PipeBarrier(); if (tiling.tailM != 0) { offset1 = tiling.rangeM * tiling.splitSize; offset2 = tiling.rangeM * tiling.reduceSize; - SimpleSoftMaxGenericNDImpl(dst, inSumTensor, inMaxTensor, src, workLocal, tiling, offset1, offset2, - tiling.tailM); + SimpleSoftMaxGenericNDImpl(dst, inSumTensor, inMaxTensor, src, workLocal, tiling, offset1, + offset2, tiling.tailM); } } } @@ -391,25 +424,29 @@ __aicore__ inline void SimpleSoftMaxBasicBlock(const LocalTensor& dst, con } } +template __aicore__ inline void SimpleSoftMaxGenericNDImpl(const LocalTensor& dst, const LocalTensor& inSumTensor, const LocalTensor& inMaxTensor, const LocalTensor& src, const LocalTensor& workLocal, const SoftMaxTiling& tiling, const uint32_t offset1, const uint32_t offset2, const uint32_t curSplitM) { const LocalTensor& tmpBuffer0 = workLocal; const uint32_t splitSize = curSplitM * tiling.splitK; - - Cast(tmpBuffer0, src[offset1], RoundMode::CAST_NONE, splitSize); - PipeBarrier(); - GenericSubNDImpl(tmpBuffer0, tmpBuffer0, inMaxTensor[offset2], curSplitM, tiling.srcK, tiling.reduceK); - PipeBarrier(); - Exp(tmpBuffer0, tmpBuffer0, tiling.splitSize); - PipeBarrier(); - GenericDivNDImpl(tmpBuffer0, tmpBuffer0, inSumTensor[offset2], curSplitM, tiling.srcK, tiling.reduceK); - PipeBarrier(); - Cast(dst[offset1], tmpBuffer0, FLOAT2HALF_ROUND_MODE, splitSize); + if constexpr (config.oriSrcM == 0 || config.oriSrcK == 0) { + Cast(tmpBuffer0, src[offset1], RoundMode::CAST_NONE, splitSize); + PipeBarrier(); + GenericSubNDImpl(tmpBuffer0, tmpBuffer0, inMaxTensor[offset2], curSplitM, config.oriSrcK, + DEFAULT_REPEAT_STRIDE); + PipeBarrier(); + Exp(tmpBuffer0, tmpBuffer0, tiling.splitSize); + PipeBarrier(); + GenericDivNDImpl(tmpBuffer0, tmpBuffer0, inSumTensor[offset2], curSplitM, config.oriSrcK, + DEFAULT_REPEAT_STRIDE); + PipeBarrier(); + Cast(dst[offset1], tmpBuffer0, FLOAT2HALF_ROUND_MODE, splitSize); + } } -template +template __aicore__ inline void SimpleSoftMaxNDImpl(const LocalTensor& dst, const LocalTensor& inSumTensor, const LocalTensor& inMaxTensor, const LocalTensor& src, const LocalTensor& workLocal, const SoftMaxTiling& tiling) @@ -423,15 +460,15 @@ __aicore__ inline void SimpleSoftMaxNDImpl(const LocalTensor& dst, const L for (uint32_t i = 0; i < tiling.rangeM; i++) { offset1 = i * tiling.splitSize; offset2 = i * tiling.reduceSize; - SimpleSoftMaxGenericNDImpl(dst, inSumTensor, inMaxTensor, src, workLocal, tiling, offset1, offset2, - tiling.splitM); + SimpleSoftMaxGenericNDImpl(dst, inSumTensor, inMaxTensor, src, workLocal, tiling, offset1, + offset2, tiling.splitM); } PipeBarrier(); if (tiling.tailM != 0) { offset1 = tiling.rangeM * tiling.splitSize; offset2 = tiling.rangeM * tiling.reduceSize; - SimpleSoftMaxGenericNDImpl(dst, inSumTensor, inMaxTensor, src, workLocal, tiling, offset1, offset2, - tiling.tailM); + SimpleSoftMaxGenericNDImpl(dst, inSumTensor, inMaxTensor, src, workLocal, tiling, offset1, + offset2, tiling.tailM); } } } diff --git a/impl/activation/softmax/v220/simple_softmax_impl.h b/impl/activation/softmax/v220/simple_softmax_impl.h index 5bbe7d10750fd23d67db417906868e6e4e133074..a01b0adcbcac68082fea73344f0e5499299ac38f 100644 --- a/impl/activation/softmax/v220/simple_softmax_impl.h +++ b/impl/activation/softmax/v220/simple_softmax_impl.h @@ -285,6 +285,7 @@ __aicore__ inline void SimpleSoftMaxBasicBlock(const LocalTensor& dst, co #endif } +template __aicore__ inline void SimpleSoftMaxGenericNDImpl(const LocalTensor& dst, const LocalTensor& inSumTensor, const LocalTensor& inMaxTensor, const LocalTensor& src, const LocalTensor& workLocal, const SoftMaxTiling& tiling, const uint32_t offset1, const uint32_t offset2, const uint32_t curSplitM) @@ -293,37 +294,68 @@ __aicore__ inline void SimpleSoftMaxGenericNDImpl(const LocalTensor& dst, const LocalTensor& tmpBuffer2 = workLocal[tiling.splitSize]; const uint32_t splitSize = curSplitM * tiling.splitK; const uint32_t reduceSize = curSplitM * tiling.reduceK; + if constexpr (config.oriSrcM == 0 || config.oriSrcK == 0) { + Cast(tmpBuffer0, src[offset1], RoundMode::CAST_NONE, splitSize); + Cast(tmpBuffer2, inMaxTensor[offset2], RoundMode::CAST_NONE, reduceSize); + PipeBarrier(); - Cast(tmpBuffer0, src[offset1], RoundMode::CAST_NONE, splitSize); - Cast(tmpBuffer2, inMaxTensor[offset2], RoundMode::CAST_NONE, reduceSize); - PipeBarrier(); + GenericSubNDImpl(tmpBuffer0, tmpBuffer0, tmpBuffer2, curSplitM, tiling.srcK, tiling.reduceK); - GenericSubNDImpl(tmpBuffer0, tmpBuffer0, tmpBuffer2, curSplitM, tiling.srcK, tiling.reduceK); + PipeBarrier(); + Exp(tmpBuffer0, tmpBuffer0, splitSize); - PipeBarrier(); - Exp(tmpBuffer0, tmpBuffer0, splitSize); + Cast(tmpBuffer2, inSumTensor[offset2], RoundMode::CAST_NONE, reduceSize); + PipeBarrier(); + GenericDivNDImpl(tmpBuffer0, tmpBuffer0, tmpBuffer2, curSplitM, tiling.srcK, tiling.reduceK); - Cast(tmpBuffer2, inSumTensor[offset2], RoundMode::CAST_NONE, reduceSize); - PipeBarrier(); - GenericDivNDImpl(tmpBuffer0, tmpBuffer0, tmpBuffer2, curSplitM, tiling.srcK, tiling.reduceK); + PipeBarrier(); + Cast(dst[offset1], tmpBuffer0, FLOAT2HALF_ROUND_MODE, splitSize); + } else { + Cast(tmpBuffer0, src[offset1], RoundMode::CAST_NONE, splitSize); + Cast(tmpBuffer2, inMaxTensor[offset2], RoundMode::CAST_NONE, reduceSize); + PipeBarrier(); - PipeBarrier(); - Cast(dst[offset1], tmpBuffer0, FLOAT2HALF_ROUND_MODE, splitSize); + GenericSubNDImpl(tmpBuffer0, tmpBuffer0, tmpBuffer2, curSplitM, config.oriSrcK, + DEFAULT_REPEAT_STRIDE * HALF_FACTOR); + + PipeBarrier(); + Exp(tmpBuffer0, tmpBuffer0, splitSize); + + Cast(tmpBuffer2, inSumTensor[offset2], RoundMode::CAST_NONE, reduceSize); + PipeBarrier(); + GenericDivNDImpl(tmpBuffer0, tmpBuffer0, tmpBuffer2, curSplitM, config.oriSrcK, + DEFAULT_REPEAT_STRIDE * HALF_FACTOR); + + PipeBarrier(); + Cast(dst[offset1], tmpBuffer0, FLOAT2HALF_ROUND_MODE, splitSize); + } } + +template __aicore__ inline void SimpleSoftMaxGenericNDImpl(const LocalTensor& dst, const LocalTensor& inSumTensor, const LocalTensor& inMaxTensor, const LocalTensor& src, const LocalTensor& workLocal, const SoftMaxTiling& tiling, const uint32_t offset1, const uint32_t offset2, const uint32_t curSplitM) { const uint32_t splitSize = curSplitM * tiling.splitK; - GenericSubNDImpl(dst[offset1], src[offset1], inMaxTensor[offset2], curSplitM, tiling.srcK, tiling.reduceK); - PipeBarrier(); - Exp(dst[offset1], dst[offset1], splitSize); - PipeBarrier(); - GenericDivNDImpl(dst[offset1], dst[offset1], inSumTensor[offset2], curSplitM, tiling.srcK, tiling.reduceK); + if constexpr (config.oriSrcM == 0 || config.oriSrcK == 0) { + GenericSubNDImpl(dst[offset1], src[offset1], inMaxTensor[offset2], curSplitM, tiling.srcK, tiling.reduceK); + PipeBarrier(); + Exp(dst[offset1], dst[offset1], splitSize); + PipeBarrier(); + GenericDivNDImpl(dst[offset1], dst[offset1], inSumTensor[offset2], curSplitM, tiling.srcK, tiling.reduceK); + } else { + GenericSubNDImpl(dst[offset1], src[offset1], inMaxTensor[offset2], curSplitM, config.oriSrcK, + DEFAULT_REPEAT_STRIDE); + PipeBarrier(); + Exp(dst[offset1], dst[offset1], splitSize); + PipeBarrier(); + GenericDivNDImpl(dst[offset1], dst[offset1], inSumTensor[offset2], curSplitM, config.oriSrcK, + DEFAULT_REPEAT_STRIDE); + } } -template +template __aicore__ inline void SimpleSoftMaxNDImpl(const LocalTensor& dst, const LocalTensor& inSumTensor, const LocalTensor& inMaxTensor, const LocalTensor& src, const LocalTensor workLocal, const SoftMaxTiling& tiling) @@ -332,7 +364,8 @@ __aicore__ inline void SimpleSoftMaxNDImpl(const LocalTensor& dst, const Loca SimpleSoftMaxBasicBlock(dst, inSumTensor, inMaxTensor, src, workLocal, tiling); } else { if constexpr (sizeof(T) == sizeof(float)) { - SimpleSoftMaxGenericNDImpl(dst, inSumTensor, inMaxTensor, src, workLocal, tiling, 0, 0, tiling.srcM); + SimpleSoftMaxGenericNDImpl(dst, inSumTensor, inMaxTensor, src, workLocal, tiling, 0, 0, + tiling.srcM); } else { uint32_t offset1 = 0; uint32_t offset2 = 0; @@ -340,15 +373,15 @@ __aicore__ inline void SimpleSoftMaxNDImpl(const LocalTensor& dst, const Loca for (uint32_t i = 0; i < tiling.rangeM; i++) { offset1 = i * tiling.splitSize; offset2 = i * tiling.reduceSize; - SimpleSoftMaxGenericNDImpl(dst, inSumTensor, inMaxTensor, src, workLocal, tiling, offset1, offset2, - tiling.splitM); + SimpleSoftMaxGenericNDImpl(dst, inSumTensor, inMaxTensor, src, workLocal, tiling, offset1, + offset2, tiling.splitM); } PipeBarrier(); if (tiling.tailM != 0) { offset1 = tiling.rangeM * tiling.splitSize; offset2 = tiling.rangeM * tiling.reduceSize; - SimpleSoftMaxGenericNDImpl(dst, inSumTensor, inMaxTensor, src, workLocal, tiling, offset1, offset2, - tiling.tailM); + SimpleSoftMaxGenericNDImpl(dst, inSumTensor, inMaxTensor, src, workLocal, tiling, offset1, + offset2, tiling.tailM); } } } @@ -391,6 +424,7 @@ __aicore__ inline void SimpleSoftMaxBasicBlock(const LocalTensor& dst, con } } +template __aicore__ inline void SimpleSoftMaxGenericNDImpl(const LocalTensor& dst, const LocalTensor& inSumTensor, const LocalTensor& inMaxTensor, const LocalTensor& src, const LocalTensor& workLocal, const SoftMaxTiling& tiling, const uint32_t offset1, const uint32_t offset2, const uint32_t curSplitM) @@ -398,18 +432,32 @@ __aicore__ inline void SimpleSoftMaxGenericNDImpl(const LocalTensor& dst, const LocalTensor& tmpBuffer0 = workLocal; const uint32_t splitSize = curSplitM * tiling.splitK; - Cast(tmpBuffer0, src[offset1], RoundMode::CAST_NONE, splitSize); - PipeBarrier(); - GenericSubNDImpl(tmpBuffer0, tmpBuffer0, inMaxTensor[offset2], curSplitM, tiling.srcK, tiling.reduceK); - PipeBarrier(); - Exp(tmpBuffer0, tmpBuffer0, tiling.splitSize); - PipeBarrier(); - GenericDivNDImpl(tmpBuffer0, tmpBuffer0, inSumTensor[offset2], curSplitM, tiling.srcK, tiling.reduceK); - PipeBarrier(); - Cast(dst[offset1], tmpBuffer0, FLOAT2HALF_ROUND_MODE, splitSize); + if constexpr (config.oriSrcM == 0 || config.oriSrcK == 0) { + Cast(tmpBuffer0, src[offset1], RoundMode::CAST_NONE, splitSize); + PipeBarrier(); + GenericSubNDImpl(tmpBuffer0, tmpBuffer0, inMaxTensor[offset2], curSplitM, tiling.srcK, tiling.reduceK); + PipeBarrier(); + Exp(tmpBuffer0, tmpBuffer0, tiling.splitSize); + PipeBarrier(); + GenericDivNDImpl(tmpBuffer0, tmpBuffer0, inSumTensor[offset2], curSplitM, tiling.srcK, tiling.reduceK); + PipeBarrier(); + Cast(dst[offset1], tmpBuffer0, FLOAT2HALF_ROUND_MODE, splitSize); + } else { + Cast(tmpBuffer0, src[offset1], RoundMode::CAST_NONE, splitSize); + PipeBarrier(); + GenericSubNDImpl(tmpBuffer0, tmpBuffer0, inMaxTensor[offset2], curSplitM, config.oriSrcK, + DEFAULT_REPEAT_STRIDE); + PipeBarrier(); + Exp(tmpBuffer0, tmpBuffer0, tiling.splitSize); + PipeBarrier(); + GenericDivNDImpl(tmpBuffer0, tmpBuffer0, inSumTensor[offset2], curSplitM, config.oriSrcK, + DEFAULT_REPEAT_STRIDE); + PipeBarrier(); + Cast(dst[offset1], tmpBuffer0, FLOAT2HALF_ROUND_MODE, splitSize); + } } -template +template __aicore__ inline void SimpleSoftMaxNDImpl(const LocalTensor& dst, const LocalTensor& inSumTensor, const LocalTensor& inMaxTensor, const LocalTensor& src, const LocalTensor& workLocal, const SoftMaxTiling& tiling) @@ -423,15 +471,15 @@ __aicore__ inline void SimpleSoftMaxNDImpl(const LocalTensor& dst, const L for (uint32_t i = 0; i < tiling.rangeM; i++) { offset1 = i * tiling.splitSize; offset2 = i * tiling.reduceSize; - SimpleSoftMaxGenericNDImpl(dst, inSumTensor, inMaxTensor, src, workLocal, tiling, offset1, offset2, - tiling.splitM); + SimpleSoftMaxGenericNDImpl(dst, inSumTensor, inMaxTensor, src, workLocal, tiling, offset1, + offset2, tiling.splitM); } PipeBarrier(); if (tiling.tailM != 0) { offset1 = tiling.rangeM * tiling.splitSize; offset2 = tiling.rangeM * tiling.reduceSize; - SimpleSoftMaxGenericNDImpl(dst, inSumTensor, inMaxTensor, src, workLocal, tiling, offset1, offset2, - tiling.tailM); + SimpleSoftMaxGenericNDImpl(dst, inSumTensor, inMaxTensor, src, workLocal, tiling, offset1, + offset2, tiling.tailM); } } } diff --git a/impl/activation/softmax/v300/simple_softmax_impl.h b/impl/activation/softmax/v300/simple_softmax_impl.h index 25596b18ae73555815623dd61619138d51961a79..e8a20d300ae781ae529d2a722454eefe282ea5f2 100644 --- a/impl/activation/softmax/v300/simple_softmax_impl.h +++ b/impl/activation/softmax/v300/simple_softmax_impl.h @@ -80,7 +80,7 @@ __aicore__ inline void SimpleSoftMaxGenericNDImpl(const LocalTensor& dst, DivNDImpl(dst[offset1], dst[offset1], inSumTensor[offset2], curSplitM, tiling.srcK, tiling.reduceK); } -template +template __aicore__ inline void SimpleSoftMaxNDImpl(const LocalTensor& dst, const LocalTensor& inSumTensor, const LocalTensor& inMaxTensor, const LocalTensor& src, const LocalTensor workLocal, const SoftMaxTiling& tiling) @@ -108,7 +108,7 @@ __aicore__ inline void SimpleSoftMaxNDImpl(const LocalTensor& dst, const Loca } } -template +template __aicore__ inline void SimpleSoftMaxNDImpl(const LocalTensor& dst, const LocalTensor& inSumTensor, const LocalTensor& inMaxTensor, const LocalTensor& src, const LocalTensor& workLocal, const SoftMaxTiling& tiling) diff --git a/lib/activation/simplesoftmax.h b/lib/activation/simplesoftmax.h index 75573666c1d8188d16865be32921ef5b6ac3b19d..986e6603c30d5acadd86474223374b634957ee1f 100644 --- a/lib/activation/simplesoftmax.h +++ b/lib/activation/simplesoftmax.h @@ -38,7 +38,8 @@ namespace AscendC { * improve performance, but it is a reserved param when isDataFormatNZ = true * \param [in] isDataFormatNZ: if the data format of input srcTensor is NZ */ -template +template __aicore__ inline void SimpleSoftMax(const LocalTensor& dstTensor, const LocalTensor& inSumTensor, const LocalTensor& inMaxTensor, const LocalTensor& srcTensor, const SoftMaxTiling& tiling, const SoftMaxShapeInfo& softmaxShapeInfo = {}) @@ -46,8 +47,8 @@ __aicore__ inline void SimpleSoftMax(const LocalTensor& dstTensor, const Loca if ASCEND_IS_AIC { return; } - SimpleSoftMaxImpl(dstTensor, inSumTensor, inMaxTensor, srcTensor, tiling, - softmaxShapeInfo); + SimpleSoftMaxImpl(dstTensor, inSumTensor, inMaxTensor, srcTensor, + tiling, softmaxShapeInfo); } /*! @@ -64,7 +65,8 @@ __aicore__ inline void SimpleSoftMax(const LocalTensor& dstTensor, const Loca * improve performance, but it is a reserved param when isDataFormatNZ = true * \param [in] isDataFormatNZ: if the data format of input srcTensor is NZ */ -template +template __aicore__ inline void SimpleSoftMax(const LocalTensor& dstTensor, const LocalTensor& inSumTensor, const LocalTensor& inMaxTensor, const LocalTensor& srcTensor, const SoftMaxTiling& tiling, const SoftMaxShapeInfo& softmaxShapeInfo = {}) @@ -72,8 +74,8 @@ __aicore__ inline void SimpleSoftMax(const LocalTensor& dstTensor, const L if ASCEND_IS_AIC { return; } - SimpleSoftMaxImpl(dstTensor, inSumTensor, inMaxTensor, srcTensor, tiling, - softmaxShapeInfo); + SimpleSoftMaxImpl(dstTensor, inSumTensor, inMaxTensor, srcTensor, + tiling, softmaxShapeInfo); } /*! @@ -93,7 +95,8 @@ __aicore__ inline void SimpleSoftMax(const LocalTensor& dstTensor, const L * improve performance, but it is a reserved param when isDataFormatNZ = true * \param [in] isDataFormatNZ: if the data format of input srcTensor is NZ */ -template +template __aicore__ inline void SimpleSoftMax(const LocalTensor& dstTensor, const LocalTensor& inSumTensor, const LocalTensor& inMaxTensor, const LocalTensor& srcTensor, const LocalTensor& sharedTmpBuffer, const SoftMaxTiling& tiling, const SoftMaxShapeInfo& softmaxShapeInfo = {}) @@ -101,7 +104,7 @@ __aicore__ inline void SimpleSoftMax(const LocalTensor& dstTensor, const Loca if ASCEND_IS_AIC { return; } - SimpleSoftMaxImpl(dstTensor, inSumTensor, inMaxTensor, srcTensor, + SimpleSoftMaxImpl(dstTensor, inSumTensor, inMaxTensor, srcTensor, sharedTmpBuffer, tiling, softmaxShapeInfo); } @@ -121,7 +124,8 @@ __aicore__ inline void SimpleSoftMax(const LocalTensor& dstTensor, const Loca * improve performance, but it is a reserved param when isDataFormatNZ = true * \param [in] isDataFormatNZ: if the data format of input srcTensor is NZ */ -template +template __aicore__ inline void SimpleSoftMax(const LocalTensor& dstTensor, const LocalTensor& inSumTensor, const LocalTensor& inMaxTensor, const LocalTensor& srcTensor, const LocalTensor& sharedTmpBuffer, const SoftMaxTiling& tiling, @@ -130,11 +134,11 @@ __aicore__ inline void SimpleSoftMax(const LocalTensor& dstTensor, const L if ASCEND_IS_AIC { return; } - SimpleSoftMaxImpl(dstTensor, inSumTensor, inMaxTensor, srcTensor, + SimpleSoftMaxImpl(dstTensor, inSumTensor, inMaxTensor, srcTensor, sharedTmpBuffer, tiling, softmaxShapeInfo); } } // namespace AscendC #pragma end_pipe #endif -#endif // LIB_SOFTMAX_SIMPLESOFTMAX_H +#endif // LIB_SOFTMAX_SIMPLESOFTMAX_H \ No newline at end of file diff --git a/tests/activation/softmax/test_operator_softmax_v220.cpp b/tests/activation/softmax/test_operator_softmax_v220.cpp index 371524f14ad81a3a7ba96fcc0bd9805f298b953e..ed56d987ffa9b25214b497b85d46b6f2aa7f9a49 100644 --- a/tests/activation/softmax/test_operator_softmax_v220.cpp +++ b/tests/activation/softmax/test_operator_softmax_v220.cpp @@ -85,6 +85,7 @@ private: SoftMaxTiling tiling; SoftMax(srcLocal1, insumLocal, inmaxLocal, srcLocal1, tiling, srcShape); SimpleSoftMax(dstLocal, insumLocal, inmaxLocal, srcLocal1, tiling, srcShape); + SimpleSoftMax(dstLocal, insumLocal, inmaxLocal, srcLocal1, tiling, srcShape); SoftmaxFlash(dstLocal, insumLocal, inmaxLocal, srcLocal1, expMaxTensor, insumLocal, inmaxLocal, tiling, false, srcShape);