diff --git a/impl/activation/softmax/simple_softmax_base_impl.h b/impl/activation/softmax/simple_softmax_base_impl.h
index 7de89fe2b5a4c83c465b8fb060178a8796258e9b..eaa5798631b02cb73dcd916950a9007335deef16 100644
--- a/impl/activation/softmax/simple_softmax_base_impl.h
+++ b/impl/activation/softmax/simple_softmax_base_impl.h
@@ -24,7 +24,8 @@
 #endif
 
 namespace AscendC {
-template <typename T1, typename T2, bool isBasicBlock = false, bool isDataFormatNZ = false>
+template <typename T1, typename T2, bool isBasicBlock = false, bool isDataFormatNZ = false,
+    const SoftmaxConfig& config = SOFTMAX_DEFAULT_CFG>
 __aicore__ inline void SimpleSoftMaxImpl(const LocalTensor<T1>& dst, const LocalTensor<T2>& inSumTensor,
     const LocalTensor<T2>& inMaxTensor, const LocalTensor<T1>& src, const LocalTensor<float>& workLocal,
     const SoftMaxTiling& tiling, const SoftMaxShapeInfo& softmaxShapeInfo)
@@ -55,32 +56,34 @@ __aicore__ inline void SimpleSoftMaxImpl(const LocalTensor<T1>& dst, const Local
             SoftMaxTiling newTiling = tiling;
             SoftMaxTilingFunc(workLocal.GetSize(), { srcNDinfo.m, srcNDinfo.k, originalSrcShape.m, srcNDinfo.k },
                 newTiling, sizeof(T1), sizeof(T2), isBasicBlock);
-            SimpleSoftMaxNDImpl<T1, isBasicBlock>(dst, inSumTensor, inMaxTensor, src, workLocal, newTiling);
+            SimpleSoftMaxNDImpl<T1, isBasicBlock, config>(dst, inSumTensor, inMaxTensor, src, workLocal, newTiling);
         } else {
-            SimpleSoftMaxNDImpl<T1, isBasicBlock>(dst, inSumTensor, inMaxTensor, src, workLocal, tiling);
+            SimpleSoftMaxNDImpl<T1, isBasicBlock, config>(dst, inSumTensor, inMaxTensor, src, workLocal, tiling);
         }
     }
 }
 
-template <typename T1, typename T2, bool isBasicBlock = false, bool isDataFormatNZ = false>
+template <typename T1, typename T2, bool isBasicBlock = false, bool isDataFormatNZ = false,
+    const SoftmaxConfig& config = SOFTMAX_DEFAULT_CFG> // compile-time config, forwarded unchanged to the workLocal overload
 __aicore__ inline void SimpleSoftMaxImpl(const LocalTensor<T1>& dst, const LocalTensor<T2>& inSumTensor,
     const LocalTensor<T2>& inMaxTensor, const LocalTensor<T1>& src, const SoftMaxTiling& tiling,
     const SoftMaxShapeInfo& softmaxShapeInfo)
 {
     LocalTensor<float> workLocal;
     PopStackBuffer<float, TPosition::LCM>(workLocal);
-    SimpleSoftMaxImpl<T1, T2, isBasicBlock, isDataFormatNZ>(dst, inSumTensor, inMaxTensor, src, workLocal, tiling,
-        softmaxShapeInfo);
+    SimpleSoftMaxImpl<T1, T2, isBasicBlock, isDataFormatNZ, config>(dst, inSumTensor, inMaxTensor, src, workLocal,
+        tiling, softmaxShapeInfo); // workLocal is the float buffer obtained from PopStackBuffer above
 }
 
-template <typename T1, typename T2, bool isBasicBlock = false, bool isDataFormatNZ = false>
+template <typename T1, typename T2, bool isBasicBlock = false, bool isDataFormatNZ = false,
+    const SoftmaxConfig& config = SOFTMAX_DEFAULT_CFG> // compile-time config, forwarded unchanged to the workLocal overload
 __aicore__ inline void SimpleSoftMaxImpl(const LocalTensor<T1>& dst, const LocalTensor<T2>& inSumTensor,
     const LocalTensor<T2>& inMaxTensor, const LocalTensor<T1>& src, const LocalTensor<uint8_t>& sharedTmpBuffer,
     const SoftMaxTiling& tiling, const SoftMaxShapeInfo& softmaxShapeInfo)
 {
     auto workLocal = sharedTmpBuffer.ReinterpretCast<float>();
-    SimpleSoftMaxImpl<T1, T2, isBasicBlock, isDataFormatNZ>(dst, inSumTensor, inMaxTensor, src, workLocal, tiling,
-        softmaxShapeInfo);
+    SimpleSoftMaxImpl<T1, T2, isBasicBlock, isDataFormatNZ, config>(dst, inSumTensor, inMaxTensor, src, workLocal,
+        tiling, softmaxShapeInfo); // workLocal is sharedTmpBuffer reinterpreted as float
 }
 }
 #endif // IMPL_ACTIVATION_SOFTMAX_SIMPLE_SOFTMAX_BASE_IMPL_H
\ No newline at end of file
diff --git a/impl/activation/softmax/v200/simple_softmax_impl.h b/impl/activation/softmax/v200/simple_softmax_impl.h
index 85ddba56381c6993867706f2fba152c64b67b011..79dd99af7cf67d7a81158630c9df31c3ff2eab9f 100644
--- a/impl/activation/softmax/v200/simple_softmax_impl.h
+++ b/impl/activation/softmax/v200/simple_softmax_impl.h
@@ -285,6 +285,7 @@ __aicore__ inline void SimpleSoftMaxBasicBlock(const LocalTensor<float>& dst, co
 #endif
 }
 
+template<const SoftmaxConfig& config = SOFTMAX_DEFAULT_CFG>
 __aicore__ inline void SimpleSoftMaxGenericNDImpl(const LocalTensor<half>& dst, const LocalTensor<half>& inSumTensor,
     const LocalTensor<half>& inMaxTensor, const LocalTensor<half>& src, const LocalTensor<float>& workLocal,
     const SoftMaxTiling& tiling, const uint32_t offset1, const uint32_t offset2, const uint32_t curSplitM)
@@ -293,37 +294,68 @@ __aicore__ inline void SimpleSoftMaxGenericNDImpl(const LocalTensor<half>& dst,
     const LocalTensor<float>& tmpBuffer2 = workLocal[tiling.splitSize];
     const uint32_t splitSize = curSplitM * tiling.splitK;
     const uint32_t reduceSize = curSplitM * tiling.reduceK;
+    if constexpr (config.oriSrcM == 0 || config.oriSrcK == 0) {
+        Cast(tmpBuffer0, src[offset1], RoundMode::CAST_NONE, splitSize);
+        Cast(tmpBuffer2, inMaxTensor[offset2], RoundMode::CAST_NONE, reduceSize);
+        PipeBarrier<PIPE_V>();
 
-    Cast(tmpBuffer0, src[offset1], RoundMode::CAST_NONE, splitSize);
-    Cast(tmpBuffer2, inMaxTensor[offset2], RoundMode::CAST_NONE, reduceSize);
-    PipeBarrier<PIPE_V>();
+        GenericSubNDImpl(tmpBuffer0, tmpBuffer0, tmpBuffer2, curSplitM, tiling.srcK, tiling.reduceK);
 
-    GenericSubNDImpl(tmpBuffer0, tmpBuffer0, tmpBuffer2, curSplitM, tiling.srcK, tiling.reduceK);
+        PipeBarrier<PIPE_V>();
+        Exp(tmpBuffer0, tmpBuffer0, splitSize);
 
-    PipeBarrier<PIPE_V>();
-    Exp(tmpBuffer0, tmpBuffer0, splitSize);
+        Cast(tmpBuffer2, inSumTensor[offset2], RoundMode::CAST_NONE, reduceSize);
+        PipeBarrier<PIPE_V>();
+        GenericDivNDImpl(tmpBuffer0, tmpBuffer0, tmpBuffer2, curSplitM, tiling.srcK, tiling.reduceK);
 
-    Cast(tmpBuffer2, inSumTensor[offset2], RoundMode::CAST_NONE, reduceSize);
-    PipeBarrier<PIPE_V>();
-    GenericDivNDImpl(tmpBuffer0, tmpBuffer0, tmpBuffer2, curSplitM, tiling.srcK, tiling.reduceK);
+        PipeBarrier<PIPE_V>();
+        Cast(dst[offset1], tmpBuffer0, FLOAT2HALF_ROUND_MODE, splitSize);
+    } else {
+        Cast(tmpBuffer0, src[offset1], RoundMode::CAST_NONE, splitSize);
+        Cast(tmpBuffer2, inMaxTensor[offset2], RoundMode::CAST_NONE, reduceSize);
+        PipeBarrier<PIPE_V>();
 
-    PipeBarrier<PIPE_V>();
-    Cast(dst[offset1], tmpBuffer0, FLOAT2HALF_ROUND_MODE, splitSize);
+        GenericSubNDImpl(tmpBuffer0, tmpBuffer0, tmpBuffer2, curSplitM, config.oriSrcK,
+            DEFAULT_REPEAT_STRIDE * HALF_FACTOR);
+
+        PipeBarrier<PIPE_V>();
+        Exp(tmpBuffer0, tmpBuffer0, splitSize);
+
+        Cast(tmpBuffer2, inSumTensor[offset2], RoundMode::CAST_NONE, reduceSize);
+        PipeBarrier<PIPE_V>();
+        GenericDivNDImpl(tmpBuffer0, tmpBuffer0, tmpBuffer2, curSplitM, config.oriSrcK,
+            DEFAULT_REPEAT_STRIDE * HALF_FACTOR);
+
+        PipeBarrier<PIPE_V>();
+        Cast(dst[offset1], tmpBuffer0, FLOAT2HALF_ROUND_MODE, splitSize);
+    }
 }
+
+template<const SoftmaxConfig& config = SOFTMAX_DEFAULT_CFG> // float overload: Sub(max) -> Exp -> Div(sum), results kept in dst
 __aicore__ inline void SimpleSoftMaxGenericNDImpl(const LocalTensor<float>& dst, const LocalTensor<float>& inSumTensor,
     const LocalTensor<float>& inMaxTensor, const LocalTensor<float>& src, const LocalTensor<float>& workLocal,
     const SoftMaxTiling& tiling, const uint32_t offset1, const uint32_t offset2, const uint32_t curSplitM)
 {
     const uint32_t splitSize = curSplitM * tiling.splitK;
 
-    GenericSubNDImpl(dst[offset1], src[offset1], inMaxTensor[offset2], curSplitM, tiling.srcK, tiling.reduceK);
-    PipeBarrier<PIPE_V>();
-    Exp(dst[offset1], dst[offset1], splitSize);
-    PipeBarrier<PIPE_V>();
-    GenericDivNDImpl(dst[offset1], dst[offset1], inSumTensor[offset2], curSplitM, tiling.srcK, tiling.reduceK);
+    if constexpr (config.oriSrcM == 0 || config.oriSrcK == 0) { // no compile-time shape: use tiling.srcK/reduceK
+        GenericSubNDImpl(dst[offset1], src[offset1], inMaxTensor[offset2], curSplitM, tiling.srcK, tiling.reduceK);
+        PipeBarrier<PIPE_V>();
+        Exp(dst[offset1], dst[offset1], splitSize);
+        PipeBarrier<PIPE_V>();
+        GenericDivNDImpl(dst[offset1], dst[offset1], inSumTensor[offset2], curSplitM, tiling.srcK, tiling.reduceK);
+    } else { // compile-time shape: config.oriSrcK replaces tiling.srcK, DEFAULT_REPEAT_STRIDE replaces tiling.reduceK
+        GenericSubNDImpl(dst[offset1], src[offset1], inMaxTensor[offset2], curSplitM, config.oriSrcK,
+            DEFAULT_REPEAT_STRIDE);
+        PipeBarrier<PIPE_V>();
+        Exp(dst[offset1], dst[offset1], splitSize);
+        PipeBarrier<PIPE_V>();
+        GenericDivNDImpl(dst[offset1], dst[offset1], inSumTensor[offset2], curSplitM, config.oriSrcK,
+            DEFAULT_REPEAT_STRIDE);
+    }
 }
 
-template <typename T, bool isBasicBlock = false>
+template <typename T, bool isBasicBlock = false, const SoftmaxConfig& config = SOFTMAX_DEFAULT_CFG>
 __aicore__ inline void SimpleSoftMaxNDImpl(const LocalTensor<T>& dst, const LocalTensor<T>& inSumTensor,
     const LocalTensor<T>& inMaxTensor, const LocalTensor<T>& src, const LocalTensor<float> workLocal,
     const SoftMaxTiling& tiling)
@@ -332,7 +364,8 @@ __aicore__ inline void SimpleSoftMaxNDImpl(const LocalTensor<T>& dst, const Loca
         SimpleSoftMaxBasicBlock(dst, inSumTensor, inMaxTensor, src, workLocal, tiling);
     } else {
         if constexpr (sizeof(T) == sizeof(float)) {
-            SimpleSoftMaxGenericNDImpl(dst, inSumTensor, inMaxTensor, src, workLocal, tiling, 0, 0, tiling.srcM);
+            SimpleSoftMaxGenericNDImpl<config>(dst, inSumTensor, inMaxTensor, src, workLocal, tiling, 0, 0,
+                tiling.srcM);
         } else {
             uint32_t offset1 = 0;
             uint32_t offset2 = 0;
@@ -340,15 +373,15 @@ __aicore__ inline void SimpleSoftMaxNDImpl(const LocalTensor<T>& dst, const Loca
             for (uint32_t i = 0; i < tiling.rangeM; i++) {
                 offset1 = i * tiling.splitSize;
                 offset2 = i * tiling.reduceSize;
-                SimpleSoftMaxGenericNDImpl(dst, inSumTensor, inMaxTensor, src, workLocal, tiling, offset1, offset2,
-                    tiling.splitM);
+                SimpleSoftMaxGenericNDImpl<config>(dst, inSumTensor, inMaxTensor, src, workLocal, tiling, offset1,
+                    offset2, tiling.splitM);
             }
             PipeBarrier<PIPE_V>();
             if (tiling.tailM != 0) {
                 offset1 = tiling.rangeM * tiling.splitSize;
                 offset2 = tiling.rangeM * tiling.reduceSize;
-                SimpleSoftMaxGenericNDImpl(dst, inSumTensor, inMaxTensor, src, workLocal, tiling, offset1, offset2,
-                    tiling.tailM);
+                SimpleSoftMaxGenericNDImpl<config>(dst, inSumTensor, inMaxTensor, src, workLocal, tiling, offset1,
+                    offset2, tiling.tailM);
             }
         }
     }
@@ -391,25 +424,29 @@ __aicore__ inline void SimpleSoftMaxBasicBlock(const LocalTensor<half>& dst, con
     }
 }
 
+template<const SoftmaxConfig& config = SOFTMAX_DEFAULT_CFG> // half src/dst, float max/sum: Cast -> Sub -> Exp -> Div -> Cast
 __aicore__ inline void SimpleSoftMaxGenericNDImpl(const LocalTensor<half>& dst, const LocalTensor<float>& inSumTensor,
     const LocalTensor<float>& inMaxTensor, const LocalTensor<half>& src, const LocalTensor<float>& workLocal,
     const SoftMaxTiling& tiling, const uint32_t offset1, const uint32_t offset2, const uint32_t curSplitM)
 {
     const LocalTensor<float>& tmpBuffer0 = workLocal;
     const uint32_t splitSize = curSplitM * tiling.splitK;
-
-    Cast(tmpBuffer0, src[offset1], RoundMode::CAST_NONE, splitSize);
-    PipeBarrier<PIPE_V>();
-    GenericSubNDImpl(tmpBuffer0, tmpBuffer0, inMaxTensor[offset2], curSplitM, tiling.srcK, tiling.reduceK);
-    PipeBarrier<PIPE_V>();
-    Exp(tmpBuffer0, tmpBuffer0, tiling.splitSize);
-    PipeBarrier<PIPE_V>();
-    GenericDivNDImpl(tmpBuffer0, tmpBuffer0, inSumTensor[offset2], curSplitM, tiling.srcK, tiling.reduceK);
-    PipeBarrier<PIPE_V>();
-    Cast(dst[offset1], tmpBuffer0, FLOAT2HALF_ROUND_MODE, splitSize);
+    // A zero oriSrcM/oriSrcK means no compile-time shape was given, so use the runtime tiling values.
+    constexpr bool useTilingShape = (config.oriSrcM == 0 || config.oriSrcK == 0);
+    const uint32_t srcK = useTilingShape ? tiling.srcK : config.oriSrcK;
+    const uint32_t reduceK = useTilingShape ? tiling.reduceK : DEFAULT_REPEAT_STRIDE; // same choice as the v220 twin
+    Cast(tmpBuffer0, src[offset1], RoundMode::CAST_NONE, splitSize);
+    PipeBarrier<PIPE_V>();
+    GenericSubNDImpl(tmpBuffer0, tmpBuffer0, inMaxTensor[offset2], curSplitM, srcK, reduceK);
+    PipeBarrier<PIPE_V>();
+    Exp(tmpBuffer0, tmpBuffer0, tiling.splitSize);
+    PipeBarrier<PIPE_V>();
+    GenericDivNDImpl(tmpBuffer0, tmpBuffer0, inSumTensor[offset2], curSplitM, srcK, reduceK);
+    PipeBarrier<PIPE_V>();
+    Cast(dst[offset1], tmpBuffer0, FLOAT2HALF_ROUND_MODE, splitSize);
 }
 
-template <typename T, bool isBasicBlock = false>
+template <typename T, bool isBasicBlock = false, const SoftmaxConfig& config = SOFTMAX_DEFAULT_CFG>
 __aicore__ inline void SimpleSoftMaxNDImpl(const LocalTensor<half>& dst, const LocalTensor<float>& inSumTensor,
     const LocalTensor<float>& inMaxTensor, const LocalTensor<half>& src, const LocalTensor<float>& workLocal,
     const SoftMaxTiling& tiling)
@@ -423,15 +460,15 @@ __aicore__ inline void SimpleSoftMaxNDImpl(const LocalTensor<half>& dst, const L
         for (uint32_t i = 0; i < tiling.rangeM; i++) {
             offset1 = i * tiling.splitSize;
             offset2 = i * tiling.reduceSize;
-            SimpleSoftMaxGenericNDImpl(dst, inSumTensor, inMaxTensor, src, workLocal, tiling, offset1, offset2,
-                tiling.splitM);
+            SimpleSoftMaxGenericNDImpl<config>(dst, inSumTensor, inMaxTensor, src, workLocal, tiling, offset1,
+                offset2, tiling.splitM);
         }
         PipeBarrier<PIPE_V>();
         if (tiling.tailM != 0) {
             offset1 = tiling.rangeM * tiling.splitSize;
             offset2 = tiling.rangeM * tiling.reduceSize;
-            SimpleSoftMaxGenericNDImpl(dst, inSumTensor, inMaxTensor, src, workLocal, tiling, offset1, offset2,
-                tiling.tailM);
+            SimpleSoftMaxGenericNDImpl<config>(dst, inSumTensor, inMaxTensor, src, workLocal, tiling, offset1,
+                offset2, tiling.tailM);
         }
     }
 }
diff --git a/impl/activation/softmax/v220/simple_softmax_impl.h b/impl/activation/softmax/v220/simple_softmax_impl.h
index 5bbe7d10750fd23d67db417906868e6e4e133074..a01b0adcbcac68082fea73344f0e5499299ac38f 100644
--- a/impl/activation/softmax/v220/simple_softmax_impl.h
+++ b/impl/activation/softmax/v220/simple_softmax_impl.h
@@ -285,6 +285,7 @@ __aicore__ inline void SimpleSoftMaxBasicBlock(const LocalTensor<float>& dst, co
 #endif
 }
 
+template<const SoftmaxConfig& config = SOFTMAX_DEFAULT_CFG>
 __aicore__ inline void SimpleSoftMaxGenericNDImpl(const LocalTensor<half>& dst, const LocalTensor<half>& inSumTensor,
     const LocalTensor<half>& inMaxTensor, const LocalTensor<half>& src, const LocalTensor<float>& workLocal,
     const SoftMaxTiling& tiling, const uint32_t offset1, const uint32_t offset2, const uint32_t curSplitM)
@@ -293,37 +294,68 @@ __aicore__ inline void SimpleSoftMaxGenericNDImpl(const LocalTensor<half>& dst,
     const LocalTensor<float>& tmpBuffer2 = workLocal[tiling.splitSize];
     const uint32_t splitSize = curSplitM * tiling.splitK;
     const uint32_t reduceSize = curSplitM * tiling.reduceK;
+    if constexpr (config.oriSrcM == 0 || config.oriSrcK == 0) {
+        Cast(tmpBuffer0, src[offset1], RoundMode::CAST_NONE, splitSize);
+        Cast(tmpBuffer2, inMaxTensor[offset2], RoundMode::CAST_NONE, reduceSize);
+        PipeBarrier<PIPE_V>();
 
-    Cast(tmpBuffer0, src[offset1], RoundMode::CAST_NONE, splitSize);
-    Cast(tmpBuffer2, inMaxTensor[offset2], RoundMode::CAST_NONE, reduceSize);
-    PipeBarrier<PIPE_V>();
+        GenericSubNDImpl(tmpBuffer0, tmpBuffer0, tmpBuffer2, curSplitM, tiling.srcK, tiling.reduceK);
 
-    GenericSubNDImpl(tmpBuffer0, tmpBuffer0, tmpBuffer2, curSplitM, tiling.srcK, tiling.reduceK);
+        PipeBarrier<PIPE_V>();
+        Exp(tmpBuffer0, tmpBuffer0, splitSize);
 
-    PipeBarrier<PIPE_V>();
-    Exp(tmpBuffer0, tmpBuffer0, splitSize);
+        Cast(tmpBuffer2, inSumTensor[offset2], RoundMode::CAST_NONE, reduceSize);
+        PipeBarrier<PIPE_V>();
+        GenericDivNDImpl(tmpBuffer0, tmpBuffer0, tmpBuffer2, curSplitM, tiling.srcK, tiling.reduceK);
 
-    Cast(tmpBuffer2, inSumTensor[offset2], RoundMode::CAST_NONE, reduceSize);
-    PipeBarrier<PIPE_V>();
-    GenericDivNDImpl(tmpBuffer0, tmpBuffer0, tmpBuffer2, curSplitM, tiling.srcK, tiling.reduceK);
+        PipeBarrier<PIPE_V>();
+        Cast(dst[offset1], tmpBuffer0, FLOAT2HALF_ROUND_MODE, splitSize);
+    } else {
+        Cast(tmpBuffer0, src[offset1], RoundMode::CAST_NONE, splitSize);
+        Cast(tmpBuffer2, inMaxTensor[offset2], RoundMode::CAST_NONE, reduceSize);
+        PipeBarrier<PIPE_V>();
 
-    PipeBarrier<PIPE_V>();
-    Cast(dst[offset1], tmpBuffer0, FLOAT2HALF_ROUND_MODE, splitSize);
+        GenericSubNDImpl(tmpBuffer0, tmpBuffer0, tmpBuffer2, curSplitM, config.oriSrcK,
+            DEFAULT_REPEAT_STRIDE * HALF_FACTOR);
+
+        PipeBarrier<PIPE_V>();
+        Exp(tmpBuffer0, tmpBuffer0, splitSize);
+
+        Cast(tmpBuffer2, inSumTensor[offset2], RoundMode::CAST_NONE, reduceSize);
+        PipeBarrier<PIPE_V>();
+        GenericDivNDImpl(tmpBuffer0, tmpBuffer0, tmpBuffer2, curSplitM, config.oriSrcK,
+            DEFAULT_REPEAT_STRIDE * HALF_FACTOR);
+
+        PipeBarrier<PIPE_V>();
+        Cast(dst[offset1], tmpBuffer0, FLOAT2HALF_ROUND_MODE, splitSize);  
+    }
 }
+
+template<const SoftmaxConfig& config = SOFTMAX_DEFAULT_CFG> // float overload: Sub(max) -> Exp -> Div(sum), results kept in dst
 __aicore__ inline void SimpleSoftMaxGenericNDImpl(const LocalTensor<float>& dst, const LocalTensor<float>& inSumTensor,
     const LocalTensor<float>& inMaxTensor, const LocalTensor<float>& src, const LocalTensor<float>& workLocal,
     const SoftMaxTiling& tiling, const uint32_t offset1, const uint32_t offset2, const uint32_t curSplitM)
 {
     const uint32_t splitSize = curSplitM * tiling.splitK;
 
-    GenericSubNDImpl(dst[offset1], src[offset1], inMaxTensor[offset2], curSplitM, tiling.srcK, tiling.reduceK);
-    PipeBarrier<PIPE_V>();
-    Exp(dst[offset1], dst[offset1], splitSize);
-    PipeBarrier<PIPE_V>();
-    GenericDivNDImpl(dst[offset1], dst[offset1], inSumTensor[offset2], curSplitM, tiling.srcK, tiling.reduceK);
+    if constexpr (config.oriSrcM == 0 || config.oriSrcK == 0) { // no compile-time shape: use tiling.srcK/reduceK
+        GenericSubNDImpl(dst[offset1], src[offset1], inMaxTensor[offset2], curSplitM, tiling.srcK, tiling.reduceK);
+        PipeBarrier<PIPE_V>();
+        Exp(dst[offset1], dst[offset1], splitSize);
+        PipeBarrier<PIPE_V>();
+        GenericDivNDImpl(dst[offset1], dst[offset1], inSumTensor[offset2], curSplitM, tiling.srcK, tiling.reduceK);
+    } else { // compile-time shape: config.oriSrcK replaces tiling.srcK, DEFAULT_REPEAT_STRIDE replaces tiling.reduceK
+        GenericSubNDImpl(dst[offset1], src[offset1], inMaxTensor[offset2], curSplitM, config.oriSrcK,
+            DEFAULT_REPEAT_STRIDE);
+        PipeBarrier<PIPE_V>();
+        Exp(dst[offset1], dst[offset1], splitSize);
+        PipeBarrier<PIPE_V>();
+        GenericDivNDImpl(dst[offset1], dst[offset1], inSumTensor[offset2], curSplitM, config.oriSrcK,
+            DEFAULT_REPEAT_STRIDE);
+    }
 }
 
-template <typename T, bool isBasicBlock = false>
+template <typename T, bool isBasicBlock = false, const SoftmaxConfig& config = SOFTMAX_DEFAULT_CFG>
 __aicore__ inline void SimpleSoftMaxNDImpl(const LocalTensor<T>& dst, const LocalTensor<T>& inSumTensor,
     const LocalTensor<T>& inMaxTensor, const LocalTensor<T>& src, const LocalTensor<float> workLocal,
     const SoftMaxTiling& tiling)
@@ -332,7 +364,8 @@ __aicore__ inline void SimpleSoftMaxNDImpl(const LocalTensor<T>& dst, const Loca
         SimpleSoftMaxBasicBlock(dst, inSumTensor, inMaxTensor, src, workLocal, tiling);
     } else {
         if constexpr (sizeof(T) == sizeof(float)) {
-            SimpleSoftMaxGenericNDImpl(dst, inSumTensor, inMaxTensor, src, workLocal, tiling, 0, 0, tiling.srcM);
+            SimpleSoftMaxGenericNDImpl<config>(dst, inSumTensor, inMaxTensor, src, workLocal, tiling, 0, 0,
+                tiling.srcM);
         } else {
             uint32_t offset1 = 0;
             uint32_t offset2 = 0;
@@ -340,15 +373,15 @@ __aicore__ inline void SimpleSoftMaxNDImpl(const LocalTensor<T>& dst, const Loca
             for (uint32_t i = 0; i < tiling.rangeM; i++) {
                 offset1 = i * tiling.splitSize;
                 offset2 = i * tiling.reduceSize;
-                SimpleSoftMaxGenericNDImpl(dst, inSumTensor, inMaxTensor, src, workLocal, tiling, offset1, offset2,
-                    tiling.splitM);
+                SimpleSoftMaxGenericNDImpl<config>(dst, inSumTensor, inMaxTensor, src, workLocal, tiling, offset1,
+                    offset2, tiling.splitM);
             }
             PipeBarrier<PIPE_V>();
             if (tiling.tailM != 0) {
                 offset1 = tiling.rangeM * tiling.splitSize;
                 offset2 = tiling.rangeM * tiling.reduceSize;
-                SimpleSoftMaxGenericNDImpl(dst, inSumTensor, inMaxTensor, src, workLocal, tiling, offset1, offset2,
-                    tiling.tailM);
+                SimpleSoftMaxGenericNDImpl<config>(dst, inSumTensor, inMaxTensor, src, workLocal, tiling, offset1,
+                    offset2, tiling.tailM);
             }
         }
     }
@@ -391,6 +424,7 @@ __aicore__ inline void SimpleSoftMaxBasicBlock(const LocalTensor<half>& dst, con
     }
 }
 
+template<const SoftmaxConfig& config = SOFTMAX_DEFAULT_CFG>
 __aicore__ inline void SimpleSoftMaxGenericNDImpl(const LocalTensor<half>& dst, const LocalTensor<float>& inSumTensor,
     const LocalTensor<float>& inMaxTensor, const LocalTensor<half>& src, const LocalTensor<float>& workLocal,
     const SoftMaxTiling& tiling, const uint32_t offset1, const uint32_t offset2, const uint32_t curSplitM)
@@ -398,18 +432,32 @@ __aicore__ inline void SimpleSoftMaxGenericNDImpl(const LocalTensor<half>& dst,
     const LocalTensor<float>& tmpBuffer0 = workLocal;
     const uint32_t splitSize = curSplitM * tiling.splitK;
 
-    Cast(tmpBuffer0, src[offset1], RoundMode::CAST_NONE, splitSize);
-    PipeBarrier<PIPE_V>();
-    GenericSubNDImpl(tmpBuffer0, tmpBuffer0, inMaxTensor[offset2], curSplitM, tiling.srcK, tiling.reduceK);
-    PipeBarrier<PIPE_V>();
-    Exp(tmpBuffer0, tmpBuffer0, tiling.splitSize);
-    PipeBarrier<PIPE_V>();
-    GenericDivNDImpl(tmpBuffer0, tmpBuffer0, inSumTensor[offset2], curSplitM, tiling.srcK, tiling.reduceK);
-    PipeBarrier<PIPE_V>();
-    Cast(dst[offset1], tmpBuffer0, FLOAT2HALF_ROUND_MODE, splitSize);
+    if constexpr (config.oriSrcM == 0 || config.oriSrcK == 0) {
+        Cast(tmpBuffer0, src[offset1], RoundMode::CAST_NONE, splitSize);
+        PipeBarrier<PIPE_V>();
+        GenericSubNDImpl(tmpBuffer0, tmpBuffer0, inMaxTensor[offset2], curSplitM, tiling.srcK, tiling.reduceK);
+        PipeBarrier<PIPE_V>();
+        Exp(tmpBuffer0, tmpBuffer0, tiling.splitSize);
+        PipeBarrier<PIPE_V>();
+        GenericDivNDImpl(tmpBuffer0, tmpBuffer0, inSumTensor[offset2], curSplitM, tiling.srcK, tiling.reduceK);
+        PipeBarrier<PIPE_V>();
+        Cast(dst[offset1], tmpBuffer0, FLOAT2HALF_ROUND_MODE, splitSize);
+    } else {
+        Cast(tmpBuffer0, src[offset1], RoundMode::CAST_NONE, splitSize);
+        PipeBarrier<PIPE_V>();
+        GenericSubNDImpl(tmpBuffer0, tmpBuffer0, inMaxTensor[offset2], curSplitM, config.oriSrcK,
+            DEFAULT_REPEAT_STRIDE);
+        PipeBarrier<PIPE_V>();
+        Exp(tmpBuffer0, tmpBuffer0, tiling.splitSize);
+        PipeBarrier<PIPE_V>();
+        GenericDivNDImpl(tmpBuffer0, tmpBuffer0, inSumTensor[offset2], curSplitM, config.oriSrcK,
+            DEFAULT_REPEAT_STRIDE);
+        PipeBarrier<PIPE_V>();
+        Cast(dst[offset1], tmpBuffer0, FLOAT2HALF_ROUND_MODE, splitSize);     
+    }
 }
 
-template <typename T, bool isBasicBlock = false>
+template <typename T, bool isBasicBlock = false, const SoftmaxConfig& config = SOFTMAX_DEFAULT_CFG>
 __aicore__ inline void SimpleSoftMaxNDImpl(const LocalTensor<half>& dst, const LocalTensor<float>& inSumTensor,
     const LocalTensor<float>& inMaxTensor, const LocalTensor<half>& src, const LocalTensor<float>& workLocal,
     const SoftMaxTiling& tiling)
@@ -423,15 +471,15 @@ __aicore__ inline void SimpleSoftMaxNDImpl(const LocalTensor<half>& dst, const L
         for (uint32_t i = 0; i < tiling.rangeM; i++) {
             offset1 = i * tiling.splitSize;
             offset2 = i * tiling.reduceSize;
-            SimpleSoftMaxGenericNDImpl(dst, inSumTensor, inMaxTensor, src, workLocal, tiling, offset1, offset2,
-                tiling.splitM);
+            SimpleSoftMaxGenericNDImpl<config>(dst, inSumTensor, inMaxTensor, src, workLocal, tiling, offset1,
+                offset2, tiling.splitM);
         }
         PipeBarrier<PIPE_V>();
         if (tiling.tailM != 0) {
             offset1 = tiling.rangeM * tiling.splitSize;
             offset2 = tiling.rangeM * tiling.reduceSize;
-            SimpleSoftMaxGenericNDImpl(dst, inSumTensor, inMaxTensor, src, workLocal, tiling, offset1, offset2,
-                tiling.tailM);
+            SimpleSoftMaxGenericNDImpl<config>(dst, inSumTensor, inMaxTensor, src, workLocal, tiling, offset1,
+                offset2, tiling.tailM);
         }
     }
 }
diff --git a/impl/activation/softmax/v300/simple_softmax_impl.h b/impl/activation/softmax/v300/simple_softmax_impl.h
index 25596b18ae73555815623dd61619138d51961a79..e8a20d300ae781ae529d2a722454eefe282ea5f2 100644
--- a/impl/activation/softmax/v300/simple_softmax_impl.h
+++ b/impl/activation/softmax/v300/simple_softmax_impl.h
@@ -80,7 +80,7 @@ __aicore__ inline void SimpleSoftMaxGenericNDImpl(const LocalTensor<float>& dst,
     DivNDImpl(dst[offset1], dst[offset1], inSumTensor[offset2], curSplitM, tiling.srcK, tiling.reduceK);
 }
 
-template <typename T, bool isBasicBlock = false>
+template <typename T, bool isBasicBlock = false, const SoftmaxConfig& config = SOFTMAX_DEFAULT_CFG>
 __aicore__ inline void SimpleSoftMaxNDImpl(const LocalTensor<T>& dst, const LocalTensor<T>& inSumTensor,
     const LocalTensor<T>& inMaxTensor, const LocalTensor<T>& src, const LocalTensor<float> workLocal,
     const SoftMaxTiling& tiling)
@@ -108,7 +108,7 @@ __aicore__ inline void SimpleSoftMaxNDImpl(const LocalTensor<T>& dst, const Loca
     }
 }
 
-template <typename T, bool isBasicBlock = false>
+template <typename T, bool isBasicBlock = false, const SoftmaxConfig& config = SOFTMAX_DEFAULT_CFG>
 __aicore__ inline void SimpleSoftMaxNDImpl(const LocalTensor<half>& dst, const LocalTensor<float>& inSumTensor,
     const LocalTensor<float>& inMaxTensor, const LocalTensor<half>& src, const LocalTensor<float>& workLocal,
     const SoftMaxTiling& tiling)
diff --git a/lib/activation/simplesoftmax.h b/lib/activation/simplesoftmax.h
index 75573666c1d8188d16865be32921ef5b6ac3b19d..986e6603c30d5acadd86474223374b634957ee1f 100644
--- a/lib/activation/simplesoftmax.h
+++ b/lib/activation/simplesoftmax.h
@@ -38,7 +38,8 @@ namespace AscendC {
  *                           improve performance, but it is a reserved param when isDataFormatNZ = true
  * \param [in] isDataFormatNZ: if the data format of input srcTensor is NZ
  */
-template <typename T, bool isReuseSource = false, bool isBasicBlock = false, bool isDataFormatNZ = false>
+template <typename T, bool isReuseSource = false, bool isBasicBlock = false, bool isDataFormatNZ = false,
+    const SoftmaxConfig& config = SOFTMAX_DEFAULT_CFG>
 __aicore__ inline void SimpleSoftMax(const LocalTensor<T>& dstTensor, const LocalTensor<T>& inSumTensor,
     const LocalTensor<T>& inMaxTensor, const LocalTensor<T>& srcTensor, const SoftMaxTiling& tiling,
     const SoftMaxShapeInfo& softmaxShapeInfo = {})
@@ -46,8 +47,8 @@ __aicore__ inline void SimpleSoftMax(const LocalTensor<T>& dstTensor, const Loca
     if ASCEND_IS_AIC {
         return;
     }
-    SimpleSoftMaxImpl<T, T, isBasicBlock, isDataFormatNZ>(dstTensor, inSumTensor, inMaxTensor, srcTensor, tiling,
-        softmaxShapeInfo);
+    SimpleSoftMaxImpl<T, T, isBasicBlock, isDataFormatNZ, config>(dstTensor, inSumTensor, inMaxTensor, srcTensor,
+        tiling, softmaxShapeInfo);
 }
 
 /*!
@@ -64,7 +65,8 @@ __aicore__ inline void SimpleSoftMax(const LocalTensor<T>& dstTensor, const Loca
  *                           improve performance, but it is a reserved param when isDataFormatNZ = true
  * \param [in] isDataFormatNZ: if the data format of input srcTensor is NZ
  */
-template <typename T, bool isReuseSource = false, bool isBasicBlock = false, bool isDataFormatNZ = false>
+template <typename T, bool isReuseSource = false, bool isBasicBlock = false, bool isDataFormatNZ = false,
+    const SoftmaxConfig& config = SOFTMAX_DEFAULT_CFG>
 __aicore__ inline void SimpleSoftMax(const LocalTensor<half>& dstTensor, const LocalTensor<float>& inSumTensor,
     const LocalTensor<float>& inMaxTensor, const LocalTensor<half>& srcTensor, const SoftMaxTiling& tiling,
     const SoftMaxShapeInfo& softmaxShapeInfo = {})
@@ -72,8 +74,8 @@ __aicore__ inline void SimpleSoftMax(const LocalTensor<half>& dstTensor, const L
     if ASCEND_IS_AIC {
         return;
     }
-    SimpleSoftMaxImpl<half, float, isBasicBlock, isDataFormatNZ>(dstTensor, inSumTensor, inMaxTensor, srcTensor, tiling,
-        softmaxShapeInfo);
+    SimpleSoftMaxImpl<half, float, isBasicBlock, isDataFormatNZ, config>(dstTensor, inSumTensor, inMaxTensor, srcTensor,
+        tiling, softmaxShapeInfo);
 }
 
 /*!
@@ -93,7 +95,8 @@ __aicore__ inline void SimpleSoftMax(const LocalTensor<half>& dstTensor, const L
  *                           improve performance, but it is a reserved param when isDataFormatNZ = true
  * \param [in] isDataFormatNZ: if the data format of input srcTensor is NZ
  */
-template <typename T, bool isReuseSource = false, bool isBasicBlock = false, bool isDataFormatNZ = false>
+template <typename T, bool isReuseSource = false, bool isBasicBlock = false, bool isDataFormatNZ = false,
+    const SoftmaxConfig& config = SOFTMAX_DEFAULT_CFG>
 __aicore__ inline void SimpleSoftMax(const LocalTensor<T>& dstTensor, const LocalTensor<T>& inSumTensor,
     const LocalTensor<T>& inMaxTensor, const LocalTensor<T>& srcTensor, const LocalTensor<uint8_t>& sharedTmpBuffer,
     const SoftMaxTiling& tiling, const SoftMaxShapeInfo& softmaxShapeInfo = {})
@@ -101,7 +104,7 @@ __aicore__ inline void SimpleSoftMax(const LocalTensor<T>& dstTensor, const Loca
     if ASCEND_IS_AIC {
         return;
     }
-    SimpleSoftMaxImpl<T, T, isBasicBlock, isDataFormatNZ>(dstTensor, inSumTensor, inMaxTensor, srcTensor,
+    SimpleSoftMaxImpl<T, T, isBasicBlock, isDataFormatNZ, config>(dstTensor, inSumTensor, inMaxTensor, srcTensor,
         sharedTmpBuffer, tiling, softmaxShapeInfo);
 }
 
@@ -121,7 +124,8 @@ __aicore__ inline void SimpleSoftMax(const LocalTensor<T>& dstTensor, const Loca
  *                           improve performance, but it is a reserved param when isDataFormatNZ = true
  * \param [in] isDataFormatNZ: if the data format of input srcTensor is NZ
  */
-template <typename T, bool isReuseSource = false, bool isBasicBlock = false, bool isDataFormatNZ = false>
+template <typename T, bool isReuseSource = false, bool isBasicBlock = false, bool isDataFormatNZ = false,
+    const SoftmaxConfig& config = SOFTMAX_DEFAULT_CFG>
 __aicore__ inline void SimpleSoftMax(const LocalTensor<half>& dstTensor, const LocalTensor<float>& inSumTensor,
     const LocalTensor<float>& inMaxTensor, const LocalTensor<half>& srcTensor,
     const LocalTensor<uint8_t>& sharedTmpBuffer, const SoftMaxTiling& tiling,
@@ -130,11 +134,11 @@ __aicore__ inline void SimpleSoftMax(const LocalTensor<half>& dstTensor, const L
     if ASCEND_IS_AIC {
         return;
     }
-    SimpleSoftMaxImpl<half, float, isBasicBlock, isDataFormatNZ>(dstTensor, inSumTensor, inMaxTensor, srcTensor,
+    SimpleSoftMaxImpl<half, float, isBasicBlock, isDataFormatNZ, config>(dstTensor, inSumTensor, inMaxTensor, srcTensor,
         sharedTmpBuffer, tiling, softmaxShapeInfo);
 }
 } // namespace AscendC
 
 #pragma end_pipe
 #endif
-#endif // LIB_SOFTMAX_SIMPLESOFTMAX_H
+#endif // LIB_SOFTMAX_SIMPLESOFTMAX_H
\ No newline at end of file
diff --git a/tests/activation/softmax/test_operator_softmax_v220.cpp b/tests/activation/softmax/test_operator_softmax_v220.cpp
index 371524f14ad81a3a7ba96fcc0bd9805f298b953e..ed56d987ffa9b25214b497b85d46b6f2aa7f9a49 100644
--- a/tests/activation/softmax/test_operator_softmax_v220.cpp
+++ b/tests/activation/softmax/test_operator_softmax_v220.cpp
@@ -85,6 +85,7 @@ private:
         SoftMaxTiling tiling;
         SoftMax<T1, false, isBasicBlock>(srcLocal1, insumLocal, inmaxLocal, srcLocal1, tiling, srcShape);
         SimpleSoftMax<T1, false>(dstLocal, insumLocal, inmaxLocal, srcLocal1, tiling, srcShape);
+        SimpleSoftMax<T1, false, false, false, config>(dstLocal, insumLocal, inmaxLocal, srcLocal1, tiling, srcShape);
 
         SoftmaxFlash<T1, false, isBasicBlock>(dstLocal, insumLocal, inmaxLocal, srcLocal1, expMaxTensor, insumLocal,
             inmaxLocal, tiling, false, srcShape);