From c6f5de399f5f322ec9e7145fd33b5ebf18c5bd64 Mon Sep 17 00:00:00 2001 From: zhouqian0123 Date: Fri, 13 Sep 2024 15:00:43 +0800 Subject: [PATCH 1/2] softmax const compile --- impl/activation/softmax/softmax_base_impl.h | 36 ++-- impl/activation/softmax/softmax_common.h | 10 +- impl/activation/softmax/v200/softmax_impl.h | 9 +- impl/activation/softmax/v220/softmax_impl.h | 185 +++++++++++++----- impl/activation/softmax/v300/softmax_impl.h | 4 +- lib/activation/softmax.h | 37 ++-- .../softmax/test_operator_softmax_v220.cpp | 17 +- 7 files changed, 203 insertions(+), 95 deletions(-) diff --git a/impl/activation/softmax/softmax_base_impl.h b/impl/activation/softmax/softmax_base_impl.h index 8c44c306..649e0654 100644 --- a/impl/activation/softmax/softmax_base_impl.h +++ b/impl/activation/softmax/softmax_base_impl.h @@ -24,7 +24,8 @@ #endif namespace AscendC { -template +template __aicore__ inline void SoftMaxImpl(const LocalTensor& dst, const LocalTensor& src, const LocalTensor& workLocal, const SoftMaxTiling& tiling, const SoftMaxShapeInfo& softmaxShapeInfo) { @@ -45,28 +46,31 @@ __aicore__ inline void SoftMaxImpl(const LocalTensor& dst, const LocalTensor< SoftMaxTiling newTiling = tiling; SoftMaxTilingFunc(workLocal.GetSize(), { srcNDinfo.m, srcNDinfo.k, originalSrcShape.m, srcNDinfo.k }, newTiling, sizeof(T), sizeof(float), isBasicBlock); - SoftMaxNDImpl(dst, src, workLocal, originalSrcShape, newTiling); + SoftMaxNDImpl(dst, src, workLocal, originalSrcShape, newTiling); } else { - SoftMaxNDImpl(dst, src, workLocal, originalSrcShape, tiling); + SoftMaxNDImpl(dst, src, workLocal, originalSrcShape, tiling); } } -template +template __aicore__ inline void SoftMaxImpl(const LocalTensor& dst, const LocalTensor& src, const SoftMaxTiling& tiling, const SoftMaxShapeInfo& softmaxShapeInfo) { LocalTensor workLocal; PopStackBuffer(workLocal); - SoftMaxImpl(dst, src, workLocal, tiling, softmaxShapeInfo); + SoftMaxImpl(dst, src, workLocal, tiling, softmaxShapeInfo); } -template +template __aicore__ inline void SoftMaxImpl(const LocalTensor& dst, const LocalTensor& src, const LocalTensor& sharedTmpBuffer, const SoftMaxTiling& tiling, const SoftMaxShapeInfo& softmaxShapeInfo) { auto workLocal = sharedTmpBuffer.ReinterpretCast(); - SoftMaxImpl(dst, src, workLocal, tiling, softmaxShapeInfo); + SoftMaxImpl(dst, src, workLocal, tiling, softmaxShapeInfo); } -template +template __aicore__ inline void SoftMaxImpl(const LocalTensor& dst, const LocalTensor& sumTensor, const LocalTensor& maxTensor, const LocalTensor& src, const LocalTensor& workLocal, const SoftMaxTiling& tiling, const SoftMaxShapeInfo& softmaxShapeInfo) @@ -99,31 +103,35 @@ __aicore__ inline void SoftMaxImpl(const LocalTensor& dst, const LocalTensor SoftMaxTiling newTiling = tiling; SoftMaxTilingFunc(workLocal.GetSize(), { srcNDinfo.m, srcNDinfo.k, originalSrcShape.m, srcNDinfo.k }, newTiling, sizeof(T1), sizeof(T2), isBasicBlock); - SoftMaxNDImpl(dst, sumTensor, maxTensor, src, workLocal, originalSrcShape, newTiling); + SoftMaxNDImpl(dst, sumTensor, maxTensor, src, workLocal, originalSrcShape, + newTiling); } else { - SoftMaxNDImpl(dst, sumTensor, maxTensor, src, workLocal, originalSrcShape, tiling); + SoftMaxNDImpl(dst, sumTensor, maxTensor, src, workLocal, originalSrcShape, + tiling); } } } -template +template __aicore__ inline void SoftMaxImpl(const LocalTensor& dst, const LocalTensor& sumTensor, const LocalTensor& maxTensor, const LocalTensor& src, const SoftMaxTiling& tiling, const SoftMaxShapeInfo& softmaxShapeInfo) { LocalTensor workLocal;
PopStackBuffer(workLocal); - SoftMaxImpl(dst, sumTensor, maxTensor, src, workLocal, tiling, + SoftMaxImpl(dst, sumTensor, maxTensor, src, workLocal, tiling, softmaxShapeInfo); } -template +template __aicore__ inline void SoftMaxImpl(const LocalTensor& dst, const LocalTensor& sumTensor, const LocalTensor& maxTensor, const LocalTensor& src, const LocalTensor& sharedTmpBuffer, const SoftMaxTiling& tiling, const SoftMaxShapeInfo& softmaxShapeInfo) { auto workLocal = sharedTmpBuffer.ReinterpretCast(); - SoftMaxImpl(dst, sumTensor, maxTensor, src, workLocal, tiling, + SoftMaxImpl(dst, sumTensor, maxTensor, src, workLocal, tiling, softmaxShapeInfo); } diff --git a/impl/activation/softmax/softmax_common.h b/impl/activation/softmax/softmax_common.h index dd7ea1a7..08c4e540 100644 --- a/impl/activation/softmax/softmax_common.h +++ b/impl/activation/softmax/softmax_common.h @@ -65,11 +65,19 @@ struct SoftmaxConfig { { isCheckTiling = isCheckTilingIn; } + __aicore__ constexpr SoftmaxConfig(const bool isCheckTilingIn, const uint32_t oriSrcMIn, const uint32_t oriSrcKIn) + { + isCheckTiling = isCheckTilingIn; + oriSrcM = oriSrcMIn; + oriSrcK = oriSrcKIn; + } // to judge if match or not of input shape and tiling, if not match, api will recompute tiling, default to judge bool isCheckTiling = true; + uint32_t oriSrcM = 0; + uint32_t oriSrcK = 0; }; -constexpr SoftmaxConfig SOFTMAX_DEFAULT_CFG = { true }; +constexpr SoftmaxConfig SOFTMAX_DEFAULT_CFG = { true, 0, 0 }; __aicore__ inline LastAxisShapeND GetLastAxisShapeND(const ShapeInfo& shapeInfo) { diff --git a/impl/activation/softmax/v200/softmax_impl.h b/impl/activation/softmax/v200/softmax_impl.h index 2f76d21e..287dee4e 100644 --- a/impl/activation/softmax/v200/softmax_impl.h +++ b/impl/activation/softmax/v200/softmax_impl.h @@ -577,7 +577,8 @@ __aicore__ inline void SoftMaxGenericNDImpl(const LocalTensor& dst, const Cast(dst[offset1], tmpBuffer0, FLOAT2HALF_ROUND_MODE, splitSize); } -template +template __aicore__ inline void SoftMaxNDImpl(const LocalTensor& dst, const LocalTensor& sumTensor, const LocalTensor& maxTensor, const LocalTensor& src, const LocalTensor& workLocal, const LastAxisShapeND& originalSrcShape, const SoftMaxTiling& tiling) @@ -733,7 +734,8 @@ __aicore__ inline void SoftMaxGenericNDImpl(const LocalTensor& dst, const Cast(dst[offset1], tmpBuffer0, FLOAT2HALF_ROUND_MODE, splitSize); } -template +template __aicore__ inline void SoftMaxNDImpl(const LocalTensor& dst, const LocalTensor& sumTensor, const LocalTensor& maxTensor, const LocalTensor& src, const LocalTensor& workLocal, const LastAxisShapeND& originalSrcShape, const SoftMaxTiling& tiling) @@ -814,7 +816,8 @@ __aicore__ inline void SingleSoftMaxImpl(const LocalTensor& dst, const Lo TransDivToMulImpl(dst[offset], tmpBuffer0, tmpBuffer1, reduceParam.originalSrcM, tiling.srcK, tiling.reduceK); } -template +template __aicore__ inline void SoftMaxNDImpl(const LocalTensor& dst, const LocalTensor& src, const LocalTensor& workLocal, const LastAxisShapeND& originalSrcShape, const SoftMaxTiling& tiling) { diff --git a/impl/activation/softmax/v220/softmax_impl.h b/impl/activation/softmax/v220/softmax_impl.h index b3433528..ceff1541 100644 --- a/impl/activation/softmax/v220/softmax_impl.h +++ b/impl/activation/softmax/v220/softmax_impl.h @@ -556,39 +556,72 @@ __aicore__ inline void SoftMaxGenericNDImpl(const LocalTensor& dst, const Cast(dst[offset1], tmpBuffer0, FLOAT2HALF_ROUND_MODE, splitSize); } -template -__aicore__ inline void SoftMaxNDImpl(const LocalTensor& dst, const LocalTensor& 
sumTensor, - const LocalTensor& maxTensor, const LocalTensor& src, const LocalTensor& workLocal, - const LastAxisShapeND& originalSrcShape, const SoftMaxTiling& tiling) +template +__aicore__ inline void SoftMaxNDExtImpl(const LocalTensor& dst, const LocalTensor& sumTensor, + const LocalTensor& maxTensor, const LocalTensor& src, const LocalTensor& workLocal, + const LastAxisShapeND& originalSrcShape, const SoftMaxTiling& tiling, ReduceLastND& reduceParam) { - ReduceLastND reduceParam = { tiling.splitM, originalSrcShape.k, tiling.splitM, - tiling.splitK, tiling.reduceM, tiling.reduceK }; uint32_t offset1 = 0; uint32_t offset2 = 0; uint32_t splitSize = tiling.splitSize; uint32_t reduceSize = tiling.reduceSize; + for (uint32_t i = 0; i <= tiling.rangeM; i++) { + SoftMaxGenericNDImpl(dst, sumTensor, maxTensor, src, workLocal, tiling, offset1, offset2, splitSize, + reduceSize, reduceParam); + offset1 += tiling.splitSize; + offset2 += tiling.reduceSize; + if (i == (tiling.rangeM - 1)) { + if (tiling.tailM == 0) { + break; + } + offset2 = tiling.rangeM * tiling.reduceSize; + offset1 = tiling.rangeM * tiling.splitSize; + splitSize = tiling.tailSplitSize; + reduceSize = tiling.tailReduceSize; + reduceParam.originalSrcM = tiling.tailM; + reduceParam.srcM = tiling.tailM; + reduceParam.dstM = tiling.tailM; + PipeBarrier(); + } + } +} + +template +__aicore__ inline void SoftMaxNDImpl(const LocalTensor& dst, const LocalTensor& sumTensor, + const LocalTensor& maxTensor, const LocalTensor& src, const LocalTensor& workLocal, + const LastAxisShapeND& originalSrcShape, const SoftMaxTiling& tiling) +{ PipeBarrier(); - if constexpr (isBasicBlock) { - SoftMaxBasicBlock(dst, sumTensor, maxTensor, src, workLocal, tiling); + if constexpr (config.oriSrcM == 0 || config.oriSrcK == 0) { + if constexpr (isBasicBlock) { + SoftMaxBasicBlock(dst, sumTensor, maxTensor, src, workLocal, tiling); + } else { + ReduceLastND reduceParam = { tiling.splitM, originalSrcShape.k, tiling.splitM, + tiling.splitK, tiling.reduceM, tiling.reduceK }; + SoftMaxNDExtImpl(dst, sumTensor, maxTensor, src, workLocal, originalSrcShape, tiling, reduceParam); + } } else { - for (uint32_t i = 0; i <= tiling.rangeM; i++) { - SoftMaxGenericNDImpl(dst, sumTensor, maxTensor, src, workLocal, tiling, offset1, offset2, splitSize, - reduceSize, reduceParam); - offset1 += tiling.splitSize; - offset2 += tiling.reduceSize; - if (i == (tiling.rangeM - 1)) { - if (tiling.tailM == 0) { - break; - } - offset2 = tiling.rangeM * tiling.reduceSize; - offset1 = tiling.rangeM * tiling.splitSize; - splitSize = tiling.tailSplitSize; - reduceSize = tiling.tailReduceSize; - reduceParam.originalSrcM = tiling.tailM; - reduceParam.srcM = tiling.tailM; - reduceParam.dstM = tiling.tailM; - PipeBarrier(); + constexpr uint32_t basicBlockMaxK = 2048; + constexpr bool localIsBasicBlock = config.oriSrcK % FLOAT_REPEAT_SIZE == 0 && + config.oriSrcK < basicBlockMaxK && config.oriSrcM % FLOAT_NUM_PER_BLK == 0; + if constexpr (localIsBasicBlock) { + SoftMaxBasicBlock(dst, sumTensor, maxTensor, src, workLocal, tiling); + } else { + uint32_t splitK = 0; + ReduceLastND reduceParam; + if constexpr (config.oriSrcK % FLOAT_NUM_PER_BLK == 0) { + splitK = config.oriSrcK; + } else { + splitK = AlignUp(config.oriSrcK, FLOAT_NUM_PER_BLK); } + if constexpr (SupportType()) { + reduceParam = { tiling.splitM, config.oriSrcK, tiling.splitM, splitK, tiling.reduceM, + DEFAULT_REPEAT_STRIDE * HALF_FACTOR }; + } else if constexpr (SupportType()) { + reduceParam = { tiling.splitM, config.oriSrcK, 
tiling.splitM, splitK, tiling.reduceM, + DEFAULT_REPEAT_STRIDE }; + } + SoftMaxNDExtImpl(dst, sumTensor, maxTensor, src, workLocal, originalSrcShape, tiling, reduceParam); } } } @@ -694,39 +727,65 @@ __aicore__ inline void SoftMaxGenericNDImpl(const LocalTensor& dst, const Cast(dst[offset1], tmpBuffer0, FLOAT2HALF_ROUND_MODE, splitSize); } -template +__aicore__ inline void SoftMaxNDExtImpl(const LocalTensor& dst, const LocalTensor& sumTensor, + const LocalTensor& maxTensor, const LocalTensor& src, const LocalTensor& workLocal, + const LastAxisShapeND& originalSrcShape, const SoftMaxTiling& tiling, ReduceLastND& reduceParam) +{ + uint32_t offset1 = 0; + uint32_t offset2 = 0; + uint32_t splitSize = tiling.splitSize; + uint32_t reduceSize = tiling.reduceSize; + for (uint32_t i = 0; i <= tiling.rangeM; i++) { + SoftMaxGenericNDImpl(dst, sumTensor, maxTensor, src, workLocal, tiling, offset1, offset2, splitSize, + reduceParam); + offset1 += tiling.splitSize; + offset2 += tiling.reduceSize; + if (i == (tiling.rangeM - 1)) { + if (tiling.tailM == 0) { + break; + } + offset2 = tiling.rangeM * tiling.reduceSize; + offset1 = tiling.rangeM * tiling.splitSize; + splitSize = tiling.tailSplitSize; + reduceSize = tiling.tailReduceSize; + reduceParam.originalSrcM = tiling.tailM; + reduceParam.srcM = tiling.tailM; + reduceParam.dstM = tiling.tailM; + PipeBarrier(); + } + } +} + +template __aicore__ inline void SoftMaxNDImpl(const LocalTensor& dst, const LocalTensor& sumTensor, const LocalTensor& maxTensor, const LocalTensor& src, const LocalTensor& workLocal, const LastAxisShapeND& originalSrcShape, const SoftMaxTiling& tiling) { PipeBarrier(); - if constexpr (isBasicBlock) { - SoftMaxBasicBlock(dst, sumTensor, maxTensor, src, workLocal, tiling); + if constexpr (config.oriSrcM == 0 || config.oriSrcK == 0) { + if constexpr (isBasicBlock) { + SoftMaxBasicBlock(dst, sumTensor, maxTensor, src, workLocal, tiling); + } else { + ReduceLastND reduceParam = { tiling.splitM, originalSrcShape.k, tiling.splitM, + tiling.splitK, tiling.reduceM, tiling.reduceK }; + SoftMaxNDExtImpl(dst, sumTensor, maxTensor, src, workLocal, originalSrcShape, tiling, reduceParam); + } } else { - ReduceLastND reduceParam = { tiling.splitM, originalSrcShape.k, tiling.splitM, - tiling.splitK, tiling.reduceM, tiling.reduceK }; - uint32_t offset1 = 0; - uint32_t offset2 = 0; - uint32_t splitSize = tiling.splitSize; - uint32_t reduceSize = tiling.reduceSize; - for (uint32_t i = 0; i <= tiling.rangeM; i++) { - SoftMaxGenericNDImpl(dst, sumTensor, maxTensor, src, workLocal, tiling, offset1, offset2, splitSize, - reduceParam); - offset1 += tiling.splitSize; - offset2 += tiling.reduceSize; - if (i == (tiling.rangeM - 1)) { - if (tiling.tailM == 0) { - break; - } - offset2 = tiling.rangeM * tiling.reduceSize; - offset1 = tiling.rangeM * tiling.splitSize; - splitSize = tiling.tailSplitSize; - reduceSize = tiling.tailReduceSize; - reduceParam.originalSrcM = tiling.tailM; - reduceParam.srcM = tiling.tailM; - reduceParam.dstM = tiling.tailM; - PipeBarrier(); + constexpr uint32_t basicBlockMaxK = 2048; + constexpr bool localIsBasicBlock = config.oriSrcK % FLOAT_REPEAT_SIZE == 0 && + config.oriSrcK < basicBlockMaxK && config.oriSrcM % FLOAT_NUM_PER_BLK == 0; + if constexpr (localIsBasicBlock) { + SoftMaxBasicBlock(dst, sumTensor, maxTensor, src, workLocal, tiling); + } else { + uint32_t splitK = 0; + if constexpr (config.oriSrcK % FLOAT_NUM_PER_BLK == 0) { + splitK = config.oriSrcK; + } else { + splitK = AlignUp(config.oriSrcK, FLOAT_NUM_PER_BLK); } + 
ReduceLastND reduceParam = { tiling.splitM, config.oriSrcK, tiling.splitM, splitK, tiling.reduceM, + DEFAULT_REPEAT_STRIDE * HALF_FACTOR }; + SoftMaxNDExtImpl(dst, sumTensor, maxTensor, src, workLocal, originalSrcShape, tiling, reduceParam); } } } @@ -785,14 +844,32 @@ __aicore__ inline void SingleSoftMaxImpl(const LocalTensor& dst, const Lo } } -template +template __aicore__ inline void SoftMaxNDImpl(const LocalTensor& dst, const LocalTensor& src, const LocalTensor& workLocal, const LastAxisShapeND& originalSrcShape, const SoftMaxTiling& tiling) { uint32_t offset = 0; uint32_t splitSize = tiling.splitSize; - ReduceLastND reduceParam = { tiling.splitM, originalSrcShape.k, tiling.splitM, - tiling.splitK, tiling.reduceM, tiling.reduceK }; + ReduceLastND reduceParam; + if constexpr (config.oriSrcM == 0 || config.oriSrcK == 0) { + reduceParam = { tiling.splitM, originalSrcShape.k, tiling.splitM, tiling.splitK, tiling.reduceM, + tiling.reduceK }; + } else { + uint32_t splitK = 0; + if constexpr (config.oriSrcK % FLOAT_NUM_PER_BLK == 0) { + splitK = config.oriSrcK; + } else { + splitK = AlignUp(config.oriSrcK, FLOAT_NUM_PER_BLK); + } + if constexpr (SupportType()) { + reduceParam = { tiling.splitM, config.oriSrcK, tiling.splitM, splitK, tiling.reduceM, + DEFAULT_REPEAT_STRIDE * HALF_FACTOR }; + } else if constexpr (SupportType()) { + reduceParam = { tiling.splitM, config.oriSrcK, tiling.splitM, splitK, tiling.reduceM, + DEFAULT_REPEAT_STRIDE }; + } + } PipeBarrier(); for (uint32_t i = 0; i <= tiling.rangeM; i++) { SingleSoftMaxImpl(dst, src, workLocal, tiling, offset, splitSize, reduceParam); diff --git a/impl/activation/softmax/v300/softmax_impl.h b/impl/activation/softmax/v300/softmax_impl.h index ea7699a7..0d8b7f34 100644 --- a/impl/activation/softmax/v300/softmax_impl.h +++ b/impl/activation/softmax/v300/softmax_impl.h @@ -72,7 +72,7 @@ __aicore__ inline void SoftMaxGenericNDImpl(const LocalTensor& dst, const } } -template +template __aicore__ inline void SoftMaxNDImpl(const LocalTensor& dst, const LocalTensor& sumTensor, const LocalTensor& maxTensor, const LocalTensor& src, const LocalTensor& workLocal, const LastAxisShapeND& originalSrcShape, const SoftMaxTiling& tiling) @@ -103,7 +103,7 @@ __aicore__ inline void SoftMaxNDImpl(const LocalTensor& dst, const LocalTens } } } -template +template __aicore__ inline void SoftMaxNDImpl(const LocalTensor& dst, const LocalTensor& sumTensor, const LocalTensor& maxTensor, const LocalTensor& src, const LocalTensor& workLocal, const LastAxisShapeND& originalSrcShape, const SoftMaxTiling& tiling) diff --git a/lib/activation/softmax.h b/lib/activation/softmax.h index 827d115f..d33cf736 100644 --- a/lib/activation/softmax.h +++ b/lib/activation/softmax.h @@ -38,7 +38,8 @@ namespace AscendC { * improve performance , but it is a reserved param when isDataFormatNZ = true * \param [in] isDataFormatNZ: if the data format of input srcTensor is NZ */ -template +template __aicore__ inline void SoftMax(const LocalTensor& dstTensor, const LocalTensor& sumTensor, const LocalTensor& maxTensor, const LocalTensor& srcTensor, const SoftMaxTiling& tiling, const SoftMaxShapeInfo& softmaxShapeInfo = {}) @@ -47,7 +48,7 @@ __aicore__ inline void SoftMax(const LocalTensor& dstTensor, const LocalTenso return; } TRACE_START(TraceId::SoftMax); - SoftMaxImpl(dstTensor, sumTensor, maxTensor, srcTensor, tiling, + SoftMaxImpl(dstTensor, sumTensor, maxTensor, srcTensor, tiling, softmaxShapeInfo); TRACE_STOP(TraceId::SoftMax); } @@ -66,7 +67,8 @@ __aicore__ inline void SoftMax(const 
LocalTensor& dstTensor, const LocalTenso * improve performance, but it is a reserved param when isDataFormatNZ = true * \param [in] isDataFormatNZ: if the data format of input srcTensor is NZ */ -template +template __aicore__ inline void SoftMax(const LocalTensor& dstTensor, const LocalTensor& sumTensor, const LocalTensor& maxTensor, const LocalTensor& srcTensor, const SoftMaxTiling& tiling, const SoftMaxShapeInfo& softmaxShapeInfo = {}) @@ -75,7 +77,7 @@ __aicore__ inline void SoftMax(const LocalTensor& dstTensor, const LocalTe return; } TRACE_START(TraceId::SoftMax); - SoftMaxImpl(dstTensor, sumTensor, maxTensor, srcTensor, tiling, + SoftMaxImpl(dstTensor, sumTensor, maxTensor, srcTensor, tiling, softmaxShapeInfo); TRACE_STOP(TraceId::SoftMax); } @@ -91,7 +93,8 @@ __aicore__ inline void SoftMax(const LocalTensor& dstTensor, const LocalTe * \param [in] isReuseSource: reserved param * \param [in] isBasicBlock: reserved param */ -template +template __aicore__ inline void SoftMax(const LocalTensor& dstTensor, const LocalTensor& srcTensor, const SoftMaxTiling& tiling, const SoftMaxShapeInfo& softmaxShapeInfo = {}) { @@ -99,7 +102,7 @@ __aicore__ inline void SoftMax(const LocalTensor& dstTensor, const LocalTenso return; } TRACE_START(TraceId::SoftMax); - SoftMaxImpl(dstTensor, srcTensor, tiling, softmaxShapeInfo); + SoftMaxImpl(dstTensor, srcTensor, tiling, softmaxShapeInfo); TRACE_STOP(TraceId::SoftMax); } @@ -116,7 +119,8 @@ __aicore__ inline void SoftMax(const LocalTensor& dstTensor, const LocalTenso * \param [in] isReuseSource: reserved param * \param [in] isBasicBlock: reserved param */ -template +template __aicore__ inline void SoftMax(const LocalTensor& dstTensor, const LocalTensor& srcTensor, const LocalTensor& sharedTmpBuffer, const SoftMaxTiling& tiling, const SoftMaxShapeInfo& softmaxShapeInfo = {}) @@ -125,7 +129,8 @@ __aicore__ inline void SoftMax(const LocalTensor& dstTensor, const LocalTenso return; } TRACE_START(TraceId::SoftMax); - SoftMaxImpl(dstTensor, srcTensor, sharedTmpBuffer, tiling, softmaxShapeInfo); + SoftMaxImpl(dstTensor, srcTensor, sharedTmpBuffer, tiling, + softmaxShapeInfo); TRACE_STOP(TraceId::SoftMax); } @@ -146,7 +151,8 @@ __aicore__ inline void SoftMax(const LocalTensor& dstTensor, const LocalTenso * improve performance, but it is a reserved param when isDataFormatNZ = true * \param [in] isDataFormatNZ: if the data format of input srcTensor is NZ */ -template +template __aicore__ inline void SoftMax(const LocalTensor& dstTensor, const LocalTensor& sumTensor, const LocalTensor& maxTensor, const LocalTensor& srcTensor, const LocalTensor& sharedTmpBuffer, const SoftMaxTiling& tiling, const SoftMaxShapeInfo& softmaxShapeInfo = {}) @@ -155,8 +161,8 @@ __aicore__ inline void SoftMax(const LocalTensor& dstTensor, const LocalTenso return; } TRACE_START(TraceId::SoftMax); - SoftMaxImpl(dstTensor, sumTensor, maxTensor, srcTensor, sharedTmpBuffer, tiling, - softmaxShapeInfo); + SoftMaxImpl(dstTensor, sumTensor, maxTensor, srcTensor, sharedTmpBuffer, + tiling, softmaxShapeInfo); TRACE_STOP(TraceId::SoftMax); } @@ -176,7 +182,8 @@ __aicore__ inline void SoftMax(const LocalTensor& dstTensor, const LocalTenso * improve performance, but it is a reserved param when isDataFormatNZ = true * \param [in] isDataFormatNZ: if the data format of input srcTensor is NZ */ -template +template __aicore__ inline void SoftMax(const LocalTensor& dstTensor, const LocalTensor& sumTensor, const LocalTensor& maxTensor, const LocalTensor& srcTensor, const LocalTensor& sharedTmpBuffer, const 
SoftMaxTiling& tiling, @@ -186,8 +193,8 @@ __aicore__ inline void SoftMax(const LocalTensor& dstTensor, const LocalTe return; } TRACE_START(TraceId::SoftMax); - SoftMaxImpl(dstTensor, sumTensor, maxTensor, srcTensor, sharedTmpBuffer, - tiling, softmaxShapeInfo); + SoftMaxImpl(dstTensor, sumTensor, maxTensor, srcTensor, + sharedTmpBuffer, tiling, softmaxShapeInfo); TRACE_STOP(TraceId::SoftMax); } @@ -216,4 +223,4 @@ __aicore__ inline bool AdjustSoftMaxRes(const LocalTensor& softMaxRes, const #pragma end_pipe #endif -#endif // LIB_SOFTMAX_SOFTMAX_H +#endif // LIB_SOFTMAX_SOFTMAX_H \ No newline at end of file diff --git a/tests/activation/softmax/test_operator_softmax_v220.cpp b/tests/activation/softmax/test_operator_softmax_v220.cpp index 22743022..0891bd7c 100644 --- a/tests/activation/softmax/test_operator_softmax_v220.cpp +++ b/tests/activation/softmax/test_operator_softmax_v220.cpp @@ -18,9 +18,11 @@ using namespace std; using namespace AscendC; - +constexpr SoftmaxConfig config = { true, 16, 16 }; namespace AscendC { -template class KernelSoftmax { +template +class KernelSoftmax { public: __aicore__ inline KernelSoftmax() {} __aicore__ inline void Init(__gm__ uint8_t* src0Gm, __gm__ uint8_t* src1Gm, __gm__ uint8_t* dstGm, @@ -82,6 +84,7 @@ private: SoftMaxShapeInfo srcShape = { height, width, height, width }; SoftMaxTiling tiling; SoftMax(srcLocal1, insumLocal, inmaxLocal, srcLocal1, tiling, srcShape); + SoftMax(srcLocal1, insumLocal, inmaxLocal, srcLocal1, tiling, srcShape); SimpleSoftMax(dstLocal, insumLocal, inmaxLocal, srcLocal1, tiling, srcShape); SoftmaxFlash(dstLocal, insumLocal, inmaxLocal, srcLocal1, expMaxTensor, insumLocal, @@ -135,11 +138,12 @@ private: }; } // namespace AscendC -template +template __global__ __aicore__ void MainSoftmax(__gm__ uint8_t* dstGm, __gm__ uint8_t* src0Gm, __gm__ uint8_t* src1Gm, uint32_t height, uint32_t width) { - AscendC::KernelSoftmax op; + AscendC::KernelSoftmax op; op.Init(src0Gm, src1Gm, dstGm, height, width); op.Process(); } @@ -175,7 +179,8 @@ INSTANTIATE_TEST_CASE_P(TEST_OPEARATION_SOFTMAX, SoftMaxTestsuite, SoftMaxTestParams{ 2, 50, 288, MainSoftmax }, SoftMaxTestParams{ 4, 64, 128, MainSoftmax }, SoftMaxTestParams{ 4, 64, 128, MainSoftmax }, - SoftMaxTestParams{ 4, 50, 144, MainSoftmax }, + SoftMaxTestParams{ 4, 50, 144, MainSoftmax }, + SoftMaxTestParams{ 4, 50, 144, MainSoftmax }, SoftMaxTestParams{ 4, 16, 960, MainSoftmax }, SoftMaxTestParams{ 4, 100, 32, MainSoftmax }, SoftMaxTestParams{ 4, 50, 288, MainSoftmax }, @@ -190,4 +195,4 @@ TEST_P(SoftMaxTestsuite, SoftMaxOpTestCase) uint8_t src1Gm[param.height * param.width * param.typeSize]; uint8_t dstGm[param.height * param.width * param.typeSize]; param.cal_func(dstGm, src0Gm, src1Gm, param.height, param.width); -} +} \ No newline at end of file -- Gitee From 22928ee6a2941fcd5ff10dd5427bbffc5cec0c42 Mon Sep 17 00:00:00 2001 From: liyihan123 Date: Fri, 13 Sep 2024 15:00:43 +0800 Subject: [PATCH 2/2] softmax const compile --- impl/activation/softmax/softmax_base_impl.h | 36 ++-- impl/activation/softmax/softmax_common.h | 10 +- impl/activation/softmax/v200/softmax_impl.h | 9 +- impl/activation/softmax/v220/softmax_impl.h | 185 +++++++++++++----- impl/activation/softmax/v300/softmax_impl.h | 4 +- lib/activation/softmax.h | 37 ++-- .../softmax/test_operator_softmax_v220.cpp | 14 +- 7 files changed, 200 insertions(+), 95 deletions(-) diff --git a/impl/activation/softmax/softmax_base_impl.h b/impl/activation/softmax/softmax_base_impl.h index 8c44c306..649e0654 100644 ---
a/impl/activation/softmax/softmax_base_impl.h +++ b/impl/activation/softmax/softmax_base_impl.h @@ -24,7 +24,8 @@ #endif namespace AscendC { -template +template __aicore__ inline void SoftMaxImpl(const LocalTensor& dst, const LocalTensor& src, const LocalTensor& workLocal, const SoftMaxTiling& tiling, const SoftMaxShapeInfo& softmaxShapeInfo) { @@ -45,28 +46,31 @@ __aicore__ inline void SoftMaxImpl(const LocalTensor& dst, const LocalTensor< SoftMaxTiling newTiling = tiling; SoftMaxTilingFunc(workLocal.GetSize(), { srcNDinfo.m, srcNDinfo.k, originalSrcShape.m, srcNDinfo.k }, newTiling, sizeof(T), sizeof(float), isBasicBlock); - SoftMaxNDImpl(dst, src, workLocal, originalSrcShape, newTiling); + SoftMaxNDImpl(dst, src, workLocal, originalSrcShape, newTiling); } else { - SoftMaxNDImpl(dst, src, workLocal, originalSrcShape, tiling); + SoftMaxNDImpl(dst, src, workLocal, originalSrcShape, tiling); } } -template +template __aicore__ inline void SoftMaxImpl(const LocalTensor& dst, const LocalTensor& src, const SoftMaxTiling& tiling, const SoftMaxShapeInfo& softmaxShapeInfo) { LocalTensor workLocal; PopStackBuffer(workLocal); - SoftMaxImpl(dst, src, workLocal, tiling, softmaxShapeInfo); + SoftMaxImpl(dst, src, workLocal, tiling, softmaxShapeInfo); } -template +template __aicore__ inline void SoftMaxImpl(const LocalTensor& dst, const LocalTensor& src, const LocalTensor& sharedTmpBuffer, const SoftMaxTiling& tiling, const SoftMaxShapeInfo& softmaxShapeInfo) { auto workLocal = sharedTmpBuffer.ReinterpretCast(); - SoftMaxImpl(dst, src, workLocal, tiling, softmaxShapeInfo); + SoftMaxImpl(dst, src, workLocal, tiling, softmaxShapeInfo); } -template +template __aicore__ inline void SoftMaxImpl(const LocalTensor& dst, const LocalTensor& sumTensor, const LocalTensor& maxTensor, const LocalTensor& src, const LocalTensor& workLocal, const SoftMaxTiling& tiling, const SoftMaxShapeInfo& softmaxShapeInfo) @@ -99,31 +103,35 @@ __aicore__ inline void SoftMaxImpl(const LocalTensor& dst, const LocalTensor SoftMaxTiling newTiling = tiling; SoftMaxTilingFunc(workLocal.GetSize(), { srcNDinfo.m, srcNDinfo.k, originalSrcShape.m, srcNDinfo.k }, newTiling, sizeof(T1), sizeof(T2), isBasicBlock); - SoftMaxNDImpl(dst, sumTensor, maxTensor, src, workLocal, originalSrcShape, newTiling); + SoftMaxNDImpl(dst, sumTensor, maxTensor, src, workLocal, originalSrcShape, + newTiling); } else { - SoftMaxNDImpl(dst, sumTensor, maxTensor, src, workLocal, originalSrcShape, tiling); + SoftMaxNDImpl(dst, sumTensor, maxTensor, src, workLocal, originalSrcShape, + tiling); } } } -template +template __aicore__ inline void SoftMaxImpl(const LocalTensor& dst, const LocalTensor& sumTensor, const LocalTensor& maxTensor, const LocalTensor& src, const SoftMaxTiling& tiling, const SoftMaxShapeInfo& softmaxShapeInfo) { LocalTensor workLocal; PopStackBuffer(workLocal); - SoftMaxImpl(dst, sumTensor, maxTensor, src, workLocal, tiling, + SoftMaxImpl(dst, sumTensor, maxTensor, src, workLocal, tiling, softmaxShapeInfo); } -template +template __aicore__ inline void SoftMaxImpl(const LocalTensor& dst, const LocalTensor& sumTensor, const LocalTensor& maxTensor, const LocalTensor& src, const LocalTensor& sharedTmpBuffer, const SoftMaxTiling& tiling, const SoftMaxShapeInfo& softmaxShapeInfo) { auto workLocal = sharedTmpBuffer.ReinterpretCast(); - SoftMaxImpl(dst, sumTensor, maxTensor, src, workLocal, tiling, + SoftMaxImpl(dst, sumTensor, maxTensor, src, workLocal, tiling, softmaxShapeInfo); } diff --git a/impl/activation/softmax/softmax_common.h 
b/impl/activation/softmax/softmax_common.h index dd7ea1a7..08c4e540 100644 --- a/impl/activation/softmax/softmax_common.h +++ b/impl/activation/softmax/softmax_common.h @@ -65,11 +65,19 @@ struct SoftmaxConfig { { isCheckTiling = isCheckTilingIn; } + __aicore__ constexpr SoftmaxConfig(const bool isCheckTilingIn, const uint32_t oriSrcMIn, const uint32_t oriSrcKIn) + { + isCheckTiling = isCheckTilingIn; + oriSrcM = oriSrcMIn; + oriSrcK = oriSrcKIn; + } // to judge if match or not of input shape and tiling, if not match, api will recompute tiling, default to judge bool isCheckTiling = true; + uint32_t oriSrcM = 0; + uint32_t oriSrcK = 0; }; -constexpr SoftmaxConfig SOFTMAX_DEFAULT_CFG = { true }; +constexpr SoftmaxConfig SOFTMAX_DEFAULT_CFG = { true, 0, 0 }; __aicore__ inline LastAxisShapeND GetLastAxisShapeND(const ShapeInfo& shapeInfo) { diff --git a/impl/activation/softmax/v200/softmax_impl.h b/impl/activation/softmax/v200/softmax_impl.h index 2f76d21e..287dee4e 100644 --- a/impl/activation/softmax/v200/softmax_impl.h +++ b/impl/activation/softmax/v200/softmax_impl.h @@ -577,7 +577,8 @@ __aicore__ inline void SoftMaxGenericNDImpl(const LocalTensor& dst, const Cast(dst[offset1], tmpBuffer0, FLOAT2HALF_ROUND_MODE, splitSize); } -template +template __aicore__ inline void SoftMaxNDImpl(const LocalTensor& dst, const LocalTensor& sumTensor, const LocalTensor& maxTensor, const LocalTensor& src, const LocalTensor& workLocal, const LastAxisShapeND& originalSrcShape, const SoftMaxTiling& tiling) @@ -733,7 +734,8 @@ __aicore__ inline void SoftMaxGenericNDImpl(const LocalTensor& dst, const Cast(dst[offset1], tmpBuffer0, FLOAT2HALF_ROUND_MODE, splitSize); } -template +template __aicore__ inline void SoftMaxNDImpl(const LocalTensor& dst, const LocalTensor& sumTensor, const LocalTensor& maxTensor, const LocalTensor& src, const LocalTensor& workLocal, const LastAxisShapeND& originalSrcShape, const SoftMaxTiling& tiling) @@ -814,7 +816,8 @@ __aicore__ inline void SingleSoftMaxImpl(const LocalTensor& dst, const Lo TransDivToMulImpl(dst[offset], tmpBuffer0, tmpBuffer1, reduceParam.originalSrcM, tiling.srcK, tiling.reduceK); } -template +template __aicore__ inline void SoftMaxNDImpl(const LocalTensor& dst, const LocalTensor& src, const LocalTensor& workLocal, const LastAxisShapeND& originalSrcShape, const SoftMaxTiling& tiling) { diff --git a/impl/activation/softmax/v220/softmax_impl.h b/impl/activation/softmax/v220/softmax_impl.h index b3433528..ceff1541 100644 --- a/impl/activation/softmax/v220/softmax_impl.h +++ b/impl/activation/softmax/v220/softmax_impl.h @@ -556,39 +556,72 @@ __aicore__ inline void SoftMaxGenericNDImpl(const LocalTensor& dst, const Cast(dst[offset1], tmpBuffer0, FLOAT2HALF_ROUND_MODE, splitSize); } -template -__aicore__ inline void SoftMaxNDImpl(const LocalTensor& dst, const LocalTensor& sumTensor, - const LocalTensor& maxTensor, const LocalTensor& src, const LocalTensor& workLocal, - const LastAxisShapeND& originalSrcShape, const SoftMaxTiling& tiling) +template +__aicore__ inline void SoftMaxNDExtImpl(const LocalTensor& dst, const LocalTensor& sumTensor, + const LocalTensor& maxTensor, const LocalTensor& src, const LocalTensor& workLocal, + const LastAxisShapeND& originalSrcShape, const SoftMaxTiling& tiling, ReduceLastND& reduceParam) { - ReduceLastND reduceParam = { tiling.splitM, originalSrcShape.k, tiling.splitM, - tiling.splitK, tiling.reduceM, tiling.reduceK }; uint32_t offset1 = 0; uint32_t offset2 = 0; uint32_t splitSize = tiling.splitSize; uint32_t reduceSize = 
tiling.reduceSize; + for (uint32_t i = 0; i <= tiling.rangeM; i++) { + SoftMaxGenericNDImpl(dst, sumTensor, maxTensor, src, workLocal, tiling, offset1, offset2, splitSize, + reduceSize, reduceParam); + offset1 += tiling.splitSize; + offset2 += tiling.reduceSize; + if (i == (tiling.rangeM - 1)) { + if (tiling.tailM == 0) { + break; + } + offset2 = tiling.rangeM * tiling.reduceSize; + offset1 = tiling.rangeM * tiling.splitSize; + splitSize = tiling.tailSplitSize; + reduceSize = tiling.tailReduceSize; + reduceParam.originalSrcM = tiling.tailM; + reduceParam.srcM = tiling.tailM; + reduceParam.dstM = tiling.tailM; + PipeBarrier(); + } + } +} + +template +__aicore__ inline void SoftMaxNDImpl(const LocalTensor& dst, const LocalTensor& sumTensor, + const LocalTensor& maxTensor, const LocalTensor& src, const LocalTensor& workLocal, + const LastAxisShapeND& originalSrcShape, const SoftMaxTiling& tiling) +{ PipeBarrier(); - if constexpr (isBasicBlock) { - SoftMaxBasicBlock(dst, sumTensor, maxTensor, src, workLocal, tiling); + if constexpr (config.oriSrcM == 0 || config.oriSrcK == 0) { + if constexpr (isBasicBlock) { + SoftMaxBasicBlock(dst, sumTensor, maxTensor, src, workLocal, tiling); + } else { + ReduceLastND reduceParam = { tiling.splitM, originalSrcShape.k, tiling.splitM, + tiling.splitK, tiling.reduceM, tiling.reduceK }; + SoftMaxNDExtImpl(dst, sumTensor, maxTensor, src, workLocal, originalSrcShape, tiling, reduceParam); + } } else { - for (uint32_t i = 0; i <= tiling.rangeM; i++) { - SoftMaxGenericNDImpl(dst, sumTensor, maxTensor, src, workLocal, tiling, offset1, offset2, splitSize, - reduceSize, reduceParam); - offset1 += tiling.splitSize; - offset2 += tiling.reduceSize; - if (i == (tiling.rangeM - 1)) { - if (tiling.tailM == 0) { - break; - } - offset2 = tiling.rangeM * tiling.reduceSize; - offset1 = tiling.rangeM * tiling.splitSize; - splitSize = tiling.tailSplitSize; - reduceSize = tiling.tailReduceSize; - reduceParam.originalSrcM = tiling.tailM; - reduceParam.srcM = tiling.tailM; - reduceParam.dstM = tiling.tailM; - PipeBarrier(); + constexpr uint32_t basicBlockMaxK = 2048; + constexpr bool localIsBasicBlock = config.oriSrcK % FLOAT_REPEAT_SIZE == 0 && + config.oriSrcK < basicBlockMaxK && config.oriSrcM % FLOAT_NUM_PER_BLK == 0; + if constexpr (localIsBasicBlock) { + SoftMaxBasicBlock(dst, sumTensor, maxTensor, src, workLocal, tiling); + } else { + uint32_t splitK = 0; + ReduceLastND reduceParam; + if constexpr (config.oriSrcK % FLOAT_NUM_PER_BLK == 0) { + splitK = config.oriSrcK; + } else { + splitK = AlignUp(config.oriSrcK, FLOAT_NUM_PER_BLK); } + if constexpr (SupportType()) { + reduceParam = { tiling.splitM, config.oriSrcK, tiling.splitM, splitK, tiling.reduceM, + DEFAULT_REPEAT_STRIDE * HALF_FACTOR }; + } else if constexpr (SupportType()) { + reduceParam = { tiling.splitM, config.oriSrcK, tiling.splitM, splitK, tiling.reduceM, + DEFAULT_REPEAT_STRIDE }; + } + SoftMaxNDExtImpl(dst, sumTensor, maxTensor, src, workLocal, originalSrcShape, tiling, reduceParam); } } } @@ -694,39 +727,65 @@ __aicore__ inline void SoftMaxGenericNDImpl(const LocalTensor& dst, const Cast(dst[offset1], tmpBuffer0, FLOAT2HALF_ROUND_MODE, splitSize); } -template +__aicore__ inline void SoftMaxNDExtImpl(const LocalTensor& dst, const LocalTensor& sumTensor, + const LocalTensor& maxTensor, const LocalTensor& src, const LocalTensor& workLocal, + const LastAxisShapeND& originalSrcShape, const SoftMaxTiling& tiling, ReduceLastND& reduceParam) +{ + uint32_t offset1 = 0; + uint32_t offset2 = 0; + uint32_t splitSize = 
tiling.splitSize; + uint32_t reduceSize = tiling.reduceSize; + for (uint32_t i = 0; i <= tiling.rangeM; i++) { + SoftMaxGenericNDImpl(dst, sumTensor, maxTensor, src, workLocal, tiling, offset1, offset2, splitSize, + reduceParam); + offset1 += tiling.splitSize; + offset2 += tiling.reduceSize; + if (i == (tiling.rangeM - 1)) { + if (tiling.tailM == 0) { + break; + } + offset2 = tiling.rangeM * tiling.reduceSize; + offset1 = tiling.rangeM * tiling.splitSize; + splitSize = tiling.tailSplitSize; + reduceSize = tiling.tailReduceSize; + reduceParam.originalSrcM = tiling.tailM; + reduceParam.srcM = tiling.tailM; + reduceParam.dstM = tiling.tailM; + PipeBarrier(); + } + } +} + +template __aicore__ inline void SoftMaxNDImpl(const LocalTensor& dst, const LocalTensor& sumTensor, const LocalTensor& maxTensor, const LocalTensor& src, const LocalTensor& workLocal, const LastAxisShapeND& originalSrcShape, const SoftMaxTiling& tiling) { PipeBarrier(); - if constexpr (isBasicBlock) { - SoftMaxBasicBlock(dst, sumTensor, maxTensor, src, workLocal, tiling); + if constexpr (config.oriSrcM == 0 || config.oriSrcK == 0) { + if constexpr (isBasicBlock) { + SoftMaxBasicBlock(dst, sumTensor, maxTensor, src, workLocal, tiling); + } else { + ReduceLastND reduceParam = { tiling.splitM, originalSrcShape.k, tiling.splitM, + tiling.splitK, tiling.reduceM, tiling.reduceK }; + SoftMaxNDExtImpl(dst, sumTensor, maxTensor, src, workLocal, originalSrcShape, tiling, reduceParam); + } } else { - ReduceLastND reduceParam = { tiling.splitM, originalSrcShape.k, tiling.splitM, - tiling.splitK, tiling.reduceM, tiling.reduceK }; - uint32_t offset1 = 0; - uint32_t offset2 = 0; - uint32_t splitSize = tiling.splitSize; - uint32_t reduceSize = tiling.reduceSize; - for (uint32_t i = 0; i <= tiling.rangeM; i++) { - SoftMaxGenericNDImpl(dst, sumTensor, maxTensor, src, workLocal, tiling, offset1, offset2, splitSize, - reduceParam); - offset1 += tiling.splitSize; - offset2 += tiling.reduceSize; - if (i == (tiling.rangeM - 1)) { - if (tiling.tailM == 0) { - break; - } - offset2 = tiling.rangeM * tiling.reduceSize; - offset1 = tiling.rangeM * tiling.splitSize; - splitSize = tiling.tailSplitSize; - reduceSize = tiling.tailReduceSize; - reduceParam.originalSrcM = tiling.tailM; - reduceParam.srcM = tiling.tailM; - reduceParam.dstM = tiling.tailM; - PipeBarrier(); + constexpr uint32_t basicBlockMaxK = 2048; + constexpr bool localIsBasicBlock = config.oriSrcK % FLOAT_REPEAT_SIZE == 0 && + config.oriSrcK < basicBlockMaxK && config.oriSrcM % FLOAT_NUM_PER_BLK == 0; + if constexpr (localIsBasicBlock) { + SoftMaxBasicBlock(dst, sumTensor, maxTensor, src, workLocal, tiling); + } else { + uint32_t splitK = 0; + if constexpr (config.oriSrcK % FLOAT_NUM_PER_BLK == 0) { + splitK = config.oriSrcK; + } else { + splitK = AlignUp(config.oriSrcK, FLOAT_NUM_PER_BLK); } + ReduceLastND reduceParam = { tiling.splitM, config.oriSrcK, tiling.splitM, splitK, tiling.reduceM, + DEFAULT_REPEAT_STRIDE * HALF_FACTOR }; + SoftMaxNDExtImpl(dst, sumTensor, maxTensor, src, workLocal, originalSrcShape, tiling, reduceParam); } } } @@ -785,14 +844,32 @@ __aicore__ inline void SingleSoftMaxImpl(const LocalTensor& dst, const Lo } } -template +template __aicore__ inline void SoftMaxNDImpl(const LocalTensor& dst, const LocalTensor& src, const LocalTensor& workLocal, const LastAxisShapeND& originalSrcShape, const SoftMaxTiling& tiling) { uint32_t offset = 0; uint32_t splitSize = tiling.splitSize; - ReduceLastND reduceParam = { tiling.splitM, originalSrcShape.k, tiling.splitM, - 
tiling.splitK, tiling.reduceM, tiling.reduceK }; + ReduceLastND reduceParam; + if constexpr (config.oriSrcM == 0 || config.oriSrcK == 0) { + reduceParam = { tiling.splitM, originalSrcShape.k, tiling.splitM, tiling.splitK, tiling.reduceM, + tiling.reduceK }; + } else { + uint32_t splitK = 0; + if constexpr (config.oriSrcK % FLOAT_NUM_PER_BLK == 0) { + splitK = config.oriSrcK; + } else { + splitK = AlignUp(config.oriSrcK, FLOAT_NUM_PER_BLK); + } + if constexpr (SupportType()) { + reduceParam = { tiling.splitM, config.oriSrcK, tiling.splitM, splitK, tiling.reduceM, + DEFAULT_REPEAT_STRIDE * HALF_FACTOR }; + } else if constexpr (SupportType()) { + reduceParam = { tiling.splitM, config.oriSrcK, tiling.splitM, splitK, tiling.reduceM, + DEFAULT_REPEAT_STRIDE }; + } + } PipeBarrier(); for (uint32_t i = 0; i <= tiling.rangeM; i++) { SingleSoftMaxImpl(dst, src, workLocal, tiling, offset, splitSize, reduceParam); diff --git a/impl/activation/softmax/v300/softmax_impl.h b/impl/activation/softmax/v300/softmax_impl.h index ea7699a7..0d8b7f34 100644 --- a/impl/activation/softmax/v300/softmax_impl.h +++ b/impl/activation/softmax/v300/softmax_impl.h @@ -72,7 +72,7 @@ __aicore__ inline void SoftMaxGenericNDImpl(const LocalTensor& dst, const } } -template +template __aicore__ inline void SoftMaxNDImpl(const LocalTensor& dst, const LocalTensor& sumTensor, const LocalTensor& maxTensor, const LocalTensor& src, const LocalTensor& workLocal, const LastAxisShapeND& originalSrcShape, const SoftMaxTiling& tiling) @@ -103,7 +103,7 @@ __aicore__ inline void SoftMaxNDImpl(const LocalTensor& dst, const LocalTens } } } -template +template __aicore__ inline void SoftMaxNDImpl(const LocalTensor& dst, const LocalTensor& sumTensor, const LocalTensor& maxTensor, const LocalTensor& src, const LocalTensor& workLocal, const LastAxisShapeND& originalSrcShape, const SoftMaxTiling& tiling) diff --git a/lib/activation/softmax.h b/lib/activation/softmax.h index 827d115f..a0342dae 100644 --- a/lib/activation/softmax.h +++ b/lib/activation/softmax.h @@ -38,7 +38,8 @@ namespace AscendC { * improve performance , but it is a reserved param when isDataFormatNZ = true * \param [in] isDataFormatNZ: if the data format of input srcTensor is NZ */ -template +template __aicore__ inline void SoftMax(const LocalTensor& dstTensor, const LocalTensor& sumTensor, const LocalTensor& maxTensor, const LocalTensor& srcTensor, const SoftMaxTiling& tiling, const SoftMaxShapeInfo& softmaxShapeInfo = {}) @@ -47,7 +48,7 @@ __aicore__ inline void SoftMax(const LocalTensor& dstTensor, const LocalTenso return; } TRACE_START(TraceId::SoftMax); - SoftMaxImpl(dstTensor, sumTensor, maxTensor, srcTensor, tiling, + SoftMaxImpl(dstTensor, sumTensor, maxTensor, srcTensor, tiling, softmaxShapeInfo); TRACE_STOP(TraceId::SoftMax); } @@ -66,7 +67,8 @@ __aicore__ inline void SoftMax(const LocalTensor& dstTensor, const LocalTenso * improve performance, but it is a reserved param when isDataFormatNZ = true * \param [in] isDataFormatNZ: if the data format of input srcTensor is NZ */ -template +template __aicore__ inline void SoftMax(const LocalTensor& dstTensor, const LocalTensor& sumTensor, const LocalTensor& maxTensor, const LocalTensor& srcTensor, const SoftMaxTiling& tiling, const SoftMaxShapeInfo& softmaxShapeInfo = {}) @@ -75,7 +77,7 @@ __aicore__ inline void SoftMax(const LocalTensor& dstTensor, const LocalTe return; } TRACE_START(TraceId::SoftMax); - SoftMaxImpl(dstTensor, sumTensor, maxTensor, srcTensor, tiling, + SoftMaxImpl(dstTensor, sumTensor, maxTensor, srcTensor, 
tiling, softmaxShapeInfo); TRACE_STOP(TraceId::SoftMax); } @@ -91,7 +93,8 @@ __aicore__ inline void SoftMax(const LocalTensor& dstTensor, const LocalTe * \param [in] isReuseSource: reserved param * \param [in] isBasicBlock: reserved param */ -template +template __aicore__ inline void SoftMax(const LocalTensor& dstTensor, const LocalTensor& srcTensor, const SoftMaxTiling& tiling, const SoftMaxShapeInfo& softmaxShapeInfo = {}) { @@ -99,7 +102,7 @@ __aicore__ inline void SoftMax(const LocalTensor& dstTensor, const LocalTenso return; } TRACE_START(TraceId::SoftMax); - SoftMaxImpl(dstTensor, srcTensor, tiling, softmaxShapeInfo); + SoftMaxImpl(dstTensor, srcTensor, tiling, softmaxShapeInfo); TRACE_STOP(TraceId::SoftMax); } @@ -116,7 +119,8 @@ __aicore__ inline void SoftMax(const LocalTensor& dstTensor, const LocalTenso * \param [in] isReuseSource: reserved param * \param [in] isBasicBlock: reserved param */ -template +template __aicore__ inline void SoftMax(const LocalTensor& dstTensor, const LocalTensor& srcTensor, const LocalTensor& sharedTmpBuffer, const SoftMaxTiling& tiling, const SoftMaxShapeInfo& softmaxShapeInfo = {}) @@ -125,7 +129,8 @@ __aicore__ inline void SoftMax(const LocalTensor& dstTensor, const LocalTenso return; } TRACE_START(TraceId::SoftMax); - SoftMaxImpl(dstTensor, srcTensor, sharedTmpBuffer, tiling, softmaxShapeInfo); + SoftMaxImpl(dstTensor, srcTensor, sharedTmpBuffer, tiling, + softmaxShapeInfo); TRACE_STOP(TraceId::SoftMax); } @@ -146,7 +151,8 @@ __aicore__ inline void SoftMax(const LocalTensor& dstTensor, const LocalTenso * improve performance, but it is a reserved param when isDataFormatNZ = true * \param [in] isDataFormatNZ: if the data format of input srcTensor is NZ */ -template +template __aicore__ inline void SoftMax(const LocalTensor& dstTensor, const LocalTensor& sumTensor, const LocalTensor& maxTensor, const LocalTensor& srcTensor, const LocalTensor& sharedTmpBuffer, const SoftMaxTiling& tiling, const SoftMaxShapeInfo& softmaxShapeInfo = {}) @@ -155,8 +161,8 @@ __aicore__ inline void SoftMax(const LocalTensor& dstTensor, const LocalTenso return; } TRACE_START(TraceId::SoftMax); - SoftMaxImpl(dstTensor, sumTensor, maxTensor, srcTensor, sharedTmpBuffer, tiling, - softmaxShapeInfo); + SoftMaxImpl(dstTensor, sumTensor, maxTensor, srcTensor, sharedTmpBuffer, + tiling, softmaxShapeInfo); TRACE_STOP(TraceId::SoftMax); } @@ -176,7 +182,8 @@ __aicore__ inline void SoftMax(const LocalTensor& dstTensor, const LocalTenso * improve performance, but it is a reserved param when isDataFormatNZ = true * \param [in] isDataFormatNZ: if the data format of input srcTensor is NZ */ -template +template __aicore__ inline void SoftMax(const LocalTensor& dstTensor, const LocalTensor& sumTensor, const LocalTensor& maxTensor, const LocalTensor& srcTensor, const LocalTensor& sharedTmpBuffer, const SoftMaxTiling& tiling, @@ -186,8 +193,8 @@ __aicore__ inline void SoftMax(const LocalTensor& dstTensor, const LocalTe return; } TRACE_START(TraceId::SoftMax); - SoftMaxImpl(dstTensor, sumTensor, maxTensor, srcTensor, sharedTmpBuffer, - tiling, softmaxShapeInfo); + SoftMaxImpl(dstTensor, sumTensor, maxTensor, srcTensor, + sharedTmpBuffer, tiling, softmaxShapeInfo); TRACE_STOP(TraceId::SoftMax); } @@ -216,4 +223,4 @@ __aicore__ inline bool AdjustSoftMaxRes(const LocalTensor& softMaxRes, const #pragma end_pipe #endif -#endif // LIB_SOFTMAX_SOFTMAX_H +#endif // LIB_SOFTMAX_SOFTMAX_H \ No newline at end of file diff --git a/tests/activation/softmax/test_operator_softmax_v220.cpp 
b/tests/activation/softmax/test_operator_softmax_v220.cpp index 22743022..420513f3 100644 --- a/tests/activation/softmax/test_operator_softmax_v220.cpp +++ b/tests/activation/softmax/test_operator_softmax_v220.cpp @@ -18,9 +18,10 @@ using namespace std; using namespace AscendC; - namespace AscendC { -template class KernelSoftmax { +template +class KernelSoftmax { public: __aicore__ inline KernelSoftmax() {} __aicore__ inline void Init(__gm__ uint8_t* src0Gm, __gm__ uint8_t* src1Gm, __gm__ uint8_t* dstGm, @@ -135,11 +136,12 @@ private: }; } // namespace AscendC -template +template __global__ __aicore__ void MainSoftmax(__gm__ uint8_t* dstGm, __gm__ uint8_t* src0Gm, __gm__ uint8_t* src1Gm, uint32_t height, uint32_t width) { - AscendC::KernelSoftmax op; + AscendC::KernelSoftmax op; op.Init(src0Gm, src1Gm, dstGm, height, width); op.Process(); } @@ -175,7 +177,7 @@ INSTANTIATE_TEST_CASE_P(TEST_OPEARATION_SOFTMAX, SoftMaxTestsuite, SoftMaxTestParams{ 2, 50, 288, MainSoftmax }, SoftMaxTestParams{ 4, 64, 128, MainSoftmax }, SoftMaxTestParams{ 4, 64, 128, MainSoftmax }, - SoftMaxTestParams{ 4, 50, 144, MainSoftmax }, + SoftMaxTestParams{ 4, 50, 144, MainSoftmax }, SoftMaxTestParams{ 4, 16, 960, MainSoftmax }, SoftMaxTestParams{ 4, 100, 32, MainSoftmax }, SoftMaxTestParams{ 4, 50, 288, MainSoftmax }, @@ -190,4 +192,4 @@ TEST_P(SoftMaxTestsuite, SoftMaxOpTestCase) uint8_t src1Gm[param.height * param.width * param.typeSize]; uint8_t dstGm[param.height * param.width * param.typeSize]; param.cal_func(dstGm, src0Gm, src1Gm, param.height, param.width); -} +} \ No newline at end of file -- Gitee
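Usage sketch (illustrative addendum, not part of either patch): the snippet below shows how the compile-time shape path added by this series is driven, mirroring the constexpr config { true, 16, 16 } that the patched test declares. The template-argument order <T, isReuseSource, isBasicBlock, config> is inferred from the parameter documentation in lib/activation/softmax.h and should be read as an assumption; the names RunSoftmaxConstShape and kSoftmaxCfg are hypothetical.

#include "lib/activation/softmax.h"

using namespace AscendC;

// Bake the original source shape into the type. With nonzero oriSrcM/oriSrcK,
// the patched SoftMaxNDImpl selects the basic-block or generic reduce path via
// `if constexpr` instead of runtime checks (see the v220 changes above).
constexpr SoftmaxConfig kSoftmaxCfg = { /*isCheckTiling=*/true, /*oriSrcM=*/16, /*oriSrcK=*/16 };

template <typename T>
__aicore__ inline void RunSoftmaxConstShape(const LocalTensor<T>& dst,
    const LocalTensor<T>& sum, const LocalTensor<T>& max,
    const LocalTensor<T>& src, const SoftMaxTiling& tiling)
{
    // The runtime shape must agree with kSoftmaxCfg.oriSrcM/oriSrcK; the
    // compile-time path derives its reduce parameters from the config rather
    // than re-checking the tiling against the actual shape.
    SoftMaxShapeInfo shape = { 16, 16, 16, 16 };
    SoftMax<T, false, false, kSoftmaxCfg>(dst, sum, max, src, tiling, shape);
}

The default SOFTMAX_DEFAULT_CFG = { true, 0, 0 } keeps the old behavior: zero oriSrcM/oriSrcK means "shape unknown at compile time", which is exactly the config.oriSrcM == 0 || config.oriSrcK == 0 guard in the patched SoftMaxNDImpl.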