diff --git a/operator/ascendc/0_introduction/14_reduce_frameworklaunch/AclNNInvocationNaive/main.cpp b/operator/ascendc/0_introduction/14_reduce_frameworklaunch/AclNNInvocationNaive/main.cpp
index 7ecffbc7e59ef3cb3a769f491238b4d6d1354bba..734d487985d1bb28360d29fe468652ad771c5842 100644
--- a/operator/ascendc/0_introduction/14_reduce_frameworklaunch/AclNNInvocationNaive/main.cpp
+++ b/operator/ascendc/0_introduction/14_reduce_frameworklaunch/AclNNInvocationNaive/main.cpp
@@ -95,7 +95,7 @@ void DestroyResources(std::vector<aclTensor *> tensors, std::vector<void *> deviceAdd
 int main(int argc, char **argv)
 {
     constexpr int64_t inputShape = 4096;
-    constexpr float resFloat = 4096.0;
+    float resFloat = 0;
     // 1. (Fixed code) Initialize device / stream, refer to the list of external interfaces of acl
     // Update deviceId to your own device id
     int32_t deviceId = 0;
@@ -110,21 +110,21 @@ int main(int argc, char **argv)
     void *outputZDeviceAddr = nullptr;
     aclTensor *inputX = nullptr;
     aclTensor *outputZ = nullptr;
-    std::vector<aclFloat16> inputXHostData(inputXShape[0]);
-    std::vector<aclFloat16> outputZHostData(outputZShape[0]);
+    std::vector<float> inputXHostData(inputXShape[0], 1.0);
+    std::vector<float> outputZHostData(outputZShape[0], 0);
+
     for (int i = 0; i < inputXShape[0]; ++i) {
-        inputXHostData[i] = aclFloatToFloat16(1.0);
-    }
-    for (int i = 0; i < outputZShape[0]; ++i) {
-        outputZHostData[i] = aclFloatToFloat16(resFloat);
+        inputXHostData[i] = 1.0;
+        resFloat += 1.0;
     }
+
     std::vector<aclTensor *> tensors = {inputX, outputZ};
     std::vector<void *> deviceAddrs = {inputXDeviceAddr, outputZDeviceAddr};
     // Create inputX aclTensor
-    ret = CreateAclTensor(inputXHostData, inputXShape, &inputXDeviceAddr, aclDataType::ACL_FLOAT16, &inputX);
+    ret = CreateAclTensor(inputXHostData, inputXShape, &inputXDeviceAddr, aclDataType::ACL_FLOAT, &inputX);
     CHECK_RET(ret == ACL_SUCCESS, DestroyResources(tensors, deviceAddrs, stream, deviceId); return FAILED);
     // Create outputZ aclTensor
-    ret = CreateAclTensor(outputZHostData, outputZShape, &outputZDeviceAddr, aclDataType::ACL_FLOAT16, &outputZ);
+    ret = CreateAclTensor(outputZHostData, outputZShape, &outputZDeviceAddr, aclDataType::ACL_FLOAT, &outputZ);
     CHECK_RET(ret == ACL_SUCCESS, DestroyResources(tensors, deviceAddrs, stream, deviceId); return FAILED);

     // 3. Call the API of the custom operator library
@@ -154,9 +154,9 @@ int main(int argc, char **argv)
     // 5. Get the output value, copy the result from device memory to host memory, need to modify according to the
     // interface of the API
     auto size = GetShapeSize(outputZShape);
-    std::vector<aclFloat16> resultData(size, 0);
+    std::vector<float> resultData(size, 0);
     ret = aclrtMemcpy(resultData.data(), resultData.size() * sizeof(resultData[0]), outputZDeviceAddr,
-                      size * sizeof(aclFloat16), ACL_MEMCPY_DEVICE_TO_HOST);
+                      size * sizeof(float), ACL_MEMCPY_DEVICE_TO_HOST);
     CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("copy result from device to host failed. ERROR: %d\n", ret);
               DestroyResources(tensors, deviceAddrs, stream, deviceId, workspaceAddr); return FAILED);
@@ -164,11 +164,11 @@ int main(int argc, char **argv)
     DestroyResources(tensors, deviceAddrs, stream, deviceId, workspaceAddr);

     // print the output result
-    std::vector<aclFloat16> goldenData(size, aclFloatToFloat16(resFloat));
+    std::vector<float> goldenData(size, resFloat);
     LOG_PRINT("result is:\n");
     for (int64_t i = 0; i < 10; i++) {
-        LOG_PRINT("%.1f ", aclFloat16ToFloat(resultData[i]));
+        LOG_PRINT("%.1f ", resultData[i]);
     }
     LOG_PRINT("\n");
     if (std::equal(resultData.begin(), resultData.begin() + 1, goldenData.begin())) {
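The verification above no longer hard-codes the golden value 4096.0; it accumulates resFloat alongside the input fill, so the check stays correct if inputShape changes. A standalone host-side sketch (illustrative only; inputShape mirrors the value above) showing that this float accumulation is exact at this size:

#include <cstdint>
#include <cstdio>
#include <vector>

// Builds the golden value the same way the updated main.cpp does.
// float represents integers exactly up to 2^24, so summing 4096 ones
// accumulates with no rounding error and the golden equals inputShape.
int main()
{
    constexpr int64_t inputShape = 4096;
    std::vector<float> inputXHostData(inputShape, 1.0f);
    float resFloat = 0;
    for (float v : inputXHostData) {
        resFloat += v;
    }
    std::printf("golden = %.1f (expect %.1f)\n", resFloat, static_cast<float>(inputShape));
    return 0;
}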
diff --git a/operator/ascendc/0_introduction/14_reduce_frameworklaunch/README.md b/operator/ascendc/0_introduction/14_reduce_frameworklaunch/README.md
index 6f9bc094b7389be7e8fdae7d73ecdb3a3a4bf810..04e13268dc68020ac8de0ff6965e5036e0fc6688 100644
--- a/operator/ascendc/0_introduction/14_reduce_frameworklaunch/README.md
+++ b/operator/ascendc/0_introduction/14_reduce_frameworklaunch/README.md
@@ -25,6 +25,10 @@ z = sum(x)
 3. For float inputs in (2KB,16KB] (half inputs in (4KB,32KB]): one WholeReduceSum accumulates more efficiently than two BlockReduceSum instructions, so two WholeReduceSum instructions (rather than two BlockReduceSum plus one WholeReduceSum) are used to obtain the sum over this buffer.

+4. For a float input of length 10000, handled by WholeReduceSumImpl: in Counter mode, the WholeReduceSum instruction is applied in a loop to each row of the two-dimensional data, producing the reduction result for every row.
+
+5. For a float input of length 20000, handled by BinaryReduceSumImpl: in Counter mode, the data is split in half and the two halves are summed with the Add instruction; this repeats until one final WholeReduceSum instruction yields the reduction result. Compared with the single-instruction WholeReduceSum approach, this performs better when the data volume is large and many loop iterations are needed.
+
 Note that the code uses Counter mode.

 ## Operator specification
@@ -134,3 +138,4 @@ The CANN package provides the project creation tool msOpGen; the ReduceCustom operator project can
 | ---------- | ---------------------------- |
 | 2024/09/14 | Added the ReduceCustom sample |
 | 2024/11/18 | Operator project rewritten to be generated by msOpGen |
+| 2025/07/07 | Added two reduction-operation samples |
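README item 5 is easier to follow with the arithmetic written out. The sketch below is a plain host-side C++ model of BinaryReduceSumImpl (illustrative names, not Ascend C): each pass rounds the split point up to an 8-float (32-byte) block, folds the tail onto the head element-wise, and a final pass sums the at most 64 remaining elements.

#include <cstdio>
#include <vector>

// Host model of the binary halving reduction. The invariant is that the
// sum of v[0..n) is unchanged by each pass, because the tail v[half..n)
// is added element-wise onto the head.
float BinaryReduceSumModel(std::vector<float> v)
{
    const unsigned kRepeatElems = 64; // one 256B vector repeat of floats
    unsigned n = static_cast<unsigned>(v.size());
    while (n > kRepeatElems) {
        unsigned half = (n + 15) / 16 * 8;        // split point, 8-float aligned
        for (unsigned j = 0; j + half < n; ++j) { // models the Add instruction
            v[j] += v[half + j];
        }
        n = half;
    }
    float sum = 0; // models the final WholeReduceSum
    for (unsigned j = 0; j < n; ++j) {
        sum += v[j];
    }
    return sum;
}

int main()
{
    std::vector<float> x(20000, 1.0f);
    std::printf("%.1f\n", BinaryReduceSumModel(x)); // prints 20000.0
    return 0;
}

For 20000 ones this prints 20000.0, which is exactly what the host test expects from the kernel path with tiling key 5.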
diff --git a/operator/ascendc/0_introduction/14_reduce_frameworklaunch/ReduceCustom/op_host/reduce_custom.cpp b/operator/ascendc/0_introduction/14_reduce_frameworklaunch/ReduceCustom/op_host/reduce_custom.cpp
index 5bec0d17ebc8b0205f58bd87ad28ae5adceec3e4..743fb162bfa10294b9f91a046158866c0a162fc7 100644
--- a/operator/ascendc/0_introduction/14_reduce_frameworklaunch/ReduceCustom/op_host/reduce_custom.cpp
+++ b/operator/ascendc/0_introduction/14_reduce_frameworklaunch/ReduceCustom/op_host/reduce_custom.cpp
@@ -9,21 +9,26 @@
  */
 #include "reduce_custom_tiling.h"
 #include "register/op_def_registry.h"
-#define REDUCE_TILING_0 1
-#define REDUCE_TILING_1 2
-#define REDUCE_TILING_2 3
 namespace optiling {
+constexpr uint32_t REDUCE_TILING_1 = 1;
+constexpr uint32_t REDUCE_TILING_2 = 2;
+constexpr uint32_t REDUCE_TILING_3 = 3;
+constexpr uint32_t REDUCE_TILING_4 = 4;
+constexpr uint32_t REDUCE_TILING_5 = 5;
+
 constexpr uint32_t BLOCK_DIM = 1;
 constexpr uint32_t ONE_REPEAT_LEN = 256;
 constexpr uint32_t ONE_BLOCK_LEN = 32;
 constexpr uint32_t OUT_SHAPE = 32;
-constexpr uint32_t HALF_THRESHOLD0 = ONE_REPEAT_LEN / sizeof(uint16_t);
-constexpr uint32_t FLOAT_THRESHOLD0 = ONE_REPEAT_LEN / sizeof(float);
-constexpr uint32_t HALF_THRESHOLD1 = ONE_REPEAT_LEN / sizeof(uint16_t) * ONE_BLOCK_LEN / sizeof(uint16_t);
-constexpr uint32_t FLOAT_THRESHOLD1 = ONE_REPEAT_LEN / sizeof(float) * ONE_BLOCK_LEN / sizeof(float);
-constexpr uint32_t HALF_THRESHOLD2 = ONE_REPEAT_LEN / sizeof(uint16_t) * ONE_REPEAT_LEN / sizeof(uint16_t);
-constexpr uint32_t FLOAT_THRESHOLD2 = ONE_REPEAT_LEN / sizeof(float) * ONE_REPEAT_LEN / sizeof(float);
+constexpr uint32_t HALF_THRESHOLD0 = ONE_REPEAT_LEN / sizeof(uint16_t);                                     // 128
+constexpr uint32_t FLOAT_THRESHOLD0 = ONE_REPEAT_LEN / sizeof(float);                                       // 64
+constexpr uint32_t HALF_THRESHOLD1 = ONE_REPEAT_LEN / sizeof(uint16_t) * ONE_BLOCK_LEN / sizeof(uint16_t);  // 2048
+constexpr uint32_t FLOAT_THRESHOLD1 = ONE_REPEAT_LEN / sizeof(float) * ONE_BLOCK_LEN / sizeof(float);       // 512
+constexpr uint32_t HALF_THRESHOLD2 = ONE_REPEAT_LEN / sizeof(uint16_t) * ONE_REPEAT_LEN / sizeof(uint16_t); // 16384
+constexpr uint32_t FLOAT_THRESHOLD2 = ONE_REPEAT_LEN / sizeof(float) * ONE_REPEAT_LEN / sizeof(float);      // 4096
+constexpr uint32_t WHOLEREDUCESUM_SINGLE_MODE = 10000;
+constexpr uint32_t BINARYREDUCESUM_SINGLE_MODE = 20000;

 static ge::graphStatus TilingFunc(gert::TilingContext *context)
 {
     TilingData tiling;
@@ -32,15 +37,19 @@ static ge::graphStatus TilingFunc(gert::TilingContext *context)
     // Only WholeReduceSum is used under 256B.
     if ((totalLength <= HALF_THRESHOLD0 && inputDtype == ge::DT_FLOAT16) ||
         (totalLength <= FLOAT_THRESHOLD0 && inputDtype == ge::DT_FLOAT)) {
-        context->SetTilingKey(REDUCE_TILING_0);
+        context->SetTilingKey(REDUCE_TILING_1);
     // One WholeReduceSum and one BlockReduceSum are used in (256B,2KB](for float input) and (256B,4KB](for half input).
     } else if ((totalLength <= HALF_THRESHOLD1 && inputDtype == ge::DT_FLOAT16) ||
                (totalLength <= FLOAT_THRESHOLD1 && inputDtype == ge::DT_FLOAT)) {
-        context->SetTilingKey(REDUCE_TILING_1);
+        context->SetTilingKey(REDUCE_TILING_2);
     // Two WholeReduceSum are used in (2KB,16KB](for float input) and (4KB,32KB](for half input).
     } else if ((totalLength <= HALF_THRESHOLD2 && inputDtype == ge::DT_FLOAT16) ||
                (totalLength <= FLOAT_THRESHOLD2 && inputDtype == ge::DT_FLOAT)) {
-        context->SetTilingKey(REDUCE_TILING_2);
+        context->SetTilingKey(REDUCE_TILING_3);
+    } else if (totalLength == WHOLEREDUCESUM_SINGLE_MODE) {
+        context->SetTilingKey(REDUCE_TILING_4);
+    } else if (totalLength == BINARYREDUCESUM_SINGLE_MODE) {
+        context->SetTilingKey(REDUCE_TILING_5);
     }
     context->SetBlockDim(BLOCK_DIM);
     tiling.set_totalLength(totalLength);
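The dispatch above is easiest to read as a table of boundaries. Below is a standalone sketch of the float-input branch (SelectFloatTilingKey is a hypothetical helper, not the gert API); it also makes explicit that a length matching no branch leaves the tiling key unset:

#include <cstdint>
#include <cstdio>

// Float-input tiling-key selection, thresholds in elements:
//   key 1: len <= 64    (FLOAT_THRESHOLD0, single WholeReduceSum)
//   key 2: len <= 512   (FLOAT_THRESHOLD1, BlockReduceSum + WholeReduceSum)
//   key 3: len <= 4096  (FLOAT_THRESHOLD2, two WholeReduceSum)
//   key 4: len == 10000 (WholeReduceSumImpl)
//   key 5: len == 20000 (BinaryReduceSumImpl)
uint32_t SelectFloatTilingKey(uint32_t totalLength)
{
    if (totalLength <= 64) {
        return 1;
    }
    if (totalLength <= 512) {
        return 2;
    }
    if (totalLength <= 4096) {
        return 3;
    }
    if (totalLength == 10000) {
        return 4;
    }
    if (totalLength == 20000) {
        return 5;
    }
    return 0; // no branch taken: TilingFunc would leave the key unset
}

int main()
{
    const uint32_t lens[] = {64, 65, 512, 4096, 10000, 20000, 30000};
    for (uint32_t len : lens) {
        std::printf("len=%u -> key %u\n", len, SelectFloatTilingKey(len));
    }
    return 0;
}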
diff --git a/operator/ascendc/0_introduction/14_reduce_frameworklaunch/ReduceCustom/op_kernel/reduce_custom.cpp b/operator/ascendc/0_introduction/14_reduce_frameworklaunch/ReduceCustom/op_kernel/reduce_custom.cpp
index c4ac235d371aa9fb0322e9ff02cdd9aeb7ab493f..d8d6313326cd2ca66145af27c18dd6ca0e51577c 100644
--- a/operator/ascendc/0_introduction/14_reduce_frameworklaunch/ReduceCustom/op_kernel/reduce_custom.cpp
+++ b/operator/ascendc/0_introduction/14_reduce_frameworklaunch/ReduceCustom/op_kernel/reduce_custom.cpp
@@ -8,15 +8,20 @@
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
  */
 #include "kernel_operator.h"
-#define REDUCE_TILING_0 1
-#define REDUCE_TILING_1 2
-#define REDUCE_TILING_2 3
+#define REDUCE_TILING_1 1
+#define REDUCE_TILING_2 2
+#define REDUCE_TILING_3 3
+#define REDUCE_TILING_4 4
+#define REDUCE_TILING_5 5
+template <typename DTYPE>
 class KernelReduce {
     static constexpr uint32_t DEFAULT_BLK_STRIDE = 1;
     static constexpr uint32_t DEFAULT_REP_STRIDE = 8;
     static constexpr uint32_t REP_LEN = 256;
     static constexpr uint32_t BLK_LEN = 32;
+    static constexpr uint32_t ONE_REPEAT_FLOAT_SIZE = REP_LEN / 4;
+    static constexpr uint32_t BINARY_BOUNDARY = DEFAULT_REP_STRIDE * 2;

 public:
     __aicore__ inline KernelReduce() {}
     __aicore__ inline void Init(GM_ADDR x, GM_ADDR z, uint32_t totalLength, uint32_t outLength)
@@ -24,105 +29,192 @@ public:
         this->totalLength = totalLength;
         this->outLength = outLength;
-        xGm.SetGlobalBuffer((__gm__ DTYPE_X *)x, totalLength);
-        zGm.SetGlobalBuffer((__gm__ DTYPE_Z *)z, outLength);
-        pipe.InitBuffer(inQueueX, 1, totalLength * sizeof(DTYPE_X));
-        pipe.InitBuffer(outQueueZ, 1, outLength * sizeof(DTYPE_Z));
+        xGm.SetGlobalBuffer((__gm__ DTYPE *)x, totalLength);
+        zGm.SetGlobalBuffer((__gm__ DTYPE *)z, outLength);
+        pipe.InitBuffer(inQueueX, 1, totalLength * sizeof(DTYPE));
+        pipe.InitBuffer(outQueueZ, 1, outLength * sizeof(DTYPE));
     }
-    __aicore__ inline void Process1()
+
+    template <uint32_t ComputeKey>
+    __aicore__ inline void Compute()
     {
-        CopyIn();
-        Compute1();
-        CopyOut();
+        if constexpr (ComputeKey == REDUCE_TILING_1) {
+            Compute1();
+        } else if constexpr (ComputeKey == REDUCE_TILING_2) {
+            Compute2();
+        } else if constexpr (ComputeKey == REDUCE_TILING_3) {
+            Compute3();
+        } else if constexpr (ComputeKey == REDUCE_TILING_4) {
+            Compute4();
+        } else if constexpr (ComputeKey == REDUCE_TILING_5) {
+            Compute5();
+        }
     }
-    __aicore__ inline void Process2()
-    {
-        CopyIn();
-        Compute2();
-        CopyOut();
-    }
-    __aicore__ inline void Process3()
+
+    template <uint32_t ComputeKey>
+    __aicore__ inline void Process()
     {
         CopyIn();
-        Compute3();
+        Compute<ComputeKey>();
         CopyOut();
     }

 private:
     __aicore__ inline void CopyIn()
     {
-        AscendC::LocalTensor<DTYPE_X> xLocal = inQueueX.AllocTensor<DTYPE_X>();
+        AscendC::LocalTensor<DTYPE> xLocal = inQueueX.AllocTensor<DTYPE>();
         AscendC::DataCopy(xLocal, xGm, totalLength);
         inQueueX.EnQue(xLocal);
     }
     // Only WholeReduceSum is used under 256B.
     __aicore__ inline void Compute1()
     {
-        AscendC::LocalTensor<DTYPE_X> xLocal = inQueueX.DeQue<DTYPE_X>();
-        AscendC::LocalTensor<DTYPE_Z> zLocal = outQueueZ.AllocTensor<DTYPE_Z>();
-        constexpr int64_t maskLen = REP_LEN / sizeof(DTYPE_X);
-        AscendC::WholeReduceSum<DTYPE_X>(zLocal, xLocal, maskLen, 1,
+        AscendC::LocalTensor<DTYPE> xLocal = inQueueX.DeQue<DTYPE>();
+        AscendC::LocalTensor<DTYPE> zLocal = outQueueZ.AllocTensor<DTYPE>();
+
+        constexpr int64_t maskLen = REP_LEN / sizeof(DTYPE);
+        AscendC::WholeReduceSum<DTYPE>(zLocal, xLocal, maskLen, 1,
                                        DEFAULT_BLK_STRIDE, DEFAULT_BLK_STRIDE, DEFAULT_REP_STRIDE);
-        outQueueZ.EnQue<DTYPE_Z>(zLocal);
+
+        outQueueZ.EnQue<DTYPE>(zLocal);
         inQueueX.FreeTensor(xLocal);
     }
     // One WholeReduceSum and one BlockReduceSum are used in (256B,2KB](for float input) and (256B,4KB](for half input).
     __aicore__ inline void Compute2()
     {
-        AscendC::LocalTensor<DTYPE_X> xLocal = inQueueX.DeQue<DTYPE_X>();
-        AscendC::LocalTensor<DTYPE_Z> zLocal = outQueueZ.AllocTensor<DTYPE_Z>();
-        pipe.InitBuffer(calcBuf, totalLength * sizeof(DTYPE_X));
-        AscendC::LocalTensor<DTYPE_X> tempTensor1 = calcBuf.Get<DTYPE_X>();
-        constexpr uint32_t c0Count = BLK_LEN / sizeof(DTYPE_X);
+        AscendC::LocalTensor<DTYPE> xLocal = inQueueX.DeQue<DTYPE>();
+        AscendC::LocalTensor<DTYPE> zLocal = outQueueZ.AllocTensor<DTYPE>();
+        pipe.InitBuffer(calcBuf, totalLength * sizeof(DTYPE));
+        AscendC::LocalTensor<DTYPE> tempTensor1 = calcBuf.Get<DTYPE>();
+        constexpr uint32_t c0Count = BLK_LEN / sizeof(DTYPE);
         const uint32_t blockNum0 = (totalLength + c0Count - 1) / c0Count;
+
         AscendC::SetMaskCount();
-        AscendC::SetVectorMask<DTYPE_X, AscendC::MaskMode::COUNTER>(0, totalLength);
-        AscendC::BlockReduceSum<DTYPE_X, false>(tempTensor1, xLocal, AscendC::MASK_PLACEHOLDER, 1,
+        AscendC::SetVectorMask<DTYPE, AscendC::MaskMode::COUNTER>(0, totalLength);
+        AscendC::BlockReduceSum<DTYPE, false>(tempTensor1, xLocal, 1, AscendC::MASK_PLACEHOLDER,
             DEFAULT_BLK_STRIDE, DEFAULT_BLK_STRIDE, DEFAULT_REP_STRIDE);
         AscendC::PipeBarrier<PIPE_V>();
-        AscendC::SetVectorMask<DTYPE_X, AscendC::MaskMode::COUNTER>(0, blockNum0);
-        AscendC::WholeReduceSum<DTYPE_X, false>(zLocal, tempTensor1, AscendC::MASK_PLACEHOLDER, 1,
+        AscendC::SetVectorMask<DTYPE, AscendC::MaskMode::COUNTER>(0, blockNum0);
+        AscendC::WholeReduceSum<DTYPE, false>(zLocal, tempTensor1, AscendC::MASK_PLACEHOLDER, 1,
             DEFAULT_BLK_STRIDE, DEFAULT_BLK_STRIDE, DEFAULT_REP_STRIDE);
         AscendC::PipeBarrier<PIPE_V>();
         AscendC::SetMaskNorm();
-        outQueueZ.EnQue<DTYPE_Z>(zLocal);
+
+        outQueueZ.EnQue<DTYPE>(zLocal);
         inQueueX.FreeTensor(xLocal);
     }
     // Two WholeReduceSum are used in (2KB,16KB](for float input) and (4KB,32KB](for half input).
     __aicore__ inline void Compute3()
     {
-        AscendC::LocalTensor<DTYPE_X> xLocal = inQueueX.DeQue<DTYPE_X>();
-        AscendC::LocalTensor<DTYPE_Z> zLocal = outQueueZ.AllocTensor<DTYPE_Z>();
-        pipe.InitBuffer(calcBuf, totalLength * sizeof(DTYPE_X));
-        AscendC::LocalTensor<DTYPE_X> tempTensor1 = calcBuf.Get<DTYPE_X>();
-        const uint32_t repeatNum = (totalLength * sizeof(DTYPE_X) + REP_LEN - 1) / REP_LEN;
+        AscendC::LocalTensor<DTYPE> xLocal = inQueueX.DeQue<DTYPE>();
+        AscendC::LocalTensor<DTYPE> zLocal = outQueueZ.AllocTensor<DTYPE>();
+        pipe.InitBuffer(calcBuf, totalLength * sizeof(DTYPE));
+        AscendC::LocalTensor<DTYPE> tempTensor1 = calcBuf.Get<DTYPE>();
+        const uint32_t repeatNum = (totalLength * sizeof(DTYPE) + REP_LEN - 1) / REP_LEN;
+
         AscendC::SetMaskCount();
-        AscendC::SetVectorMask<DTYPE_X, AscendC::MaskMode::COUNTER>(0, totalLength);
-        AscendC::WholeReduceSum<DTYPE_X, false>(tempTensor1, xLocal, AscendC::MASK_PLACEHOLDER, 1,
+        AscendC::SetVectorMask<DTYPE, AscendC::MaskMode::COUNTER>(0, totalLength);
+        AscendC::WholeReduceSum<DTYPE, false>(tempTensor1, xLocal, AscendC::MASK_PLACEHOLDER, 1,
             DEFAULT_BLK_STRIDE, DEFAULT_BLK_STRIDE, DEFAULT_REP_STRIDE);
         AscendC::PipeBarrier<PIPE_V>();
-        AscendC::SetVectorMask<DTYPE_X, AscendC::MaskMode::COUNTER>(0, repeatNum);
-        AscendC::WholeReduceSum<DTYPE_X, false>(zLocal, tempTensor1, AscendC::MASK_PLACEHOLDER, 1,
+        AscendC::SetVectorMask<DTYPE, AscendC::MaskMode::COUNTER>(0, repeatNum);
+        AscendC::WholeReduceSum<DTYPE, false>(zLocal, tempTensor1, AscendC::MASK_PLACEHOLDER, 1,
             DEFAULT_BLK_STRIDE, DEFAULT_BLK_STRIDE, DEFAULT_REP_STRIDE);
         AscendC::PipeBarrier<PIPE_V>();
         AscendC::SetMaskNorm();
-        outQueueZ.EnQue<DTYPE_Z>(zLocal);
+
+        outQueueZ.EnQue<DTYPE>(zLocal);
+        inQueueX.FreeTensor(xLocal);
+    }
+
+    // Looped WholeReduceSum path (tiling key 4, float length 10000); see README item 4.
+    __aicore__ inline void Compute4()
+    {
+        AscendC::LocalTensor<DTYPE> xLocal = inQueueX.DeQue<DTYPE>();
+        AscendC::LocalTensor<DTYPE> zLocal = outQueueZ.AllocTensor<DTYPE>();
+
+        int64_t start = AscendC::GetSystemCycle();
+        WholeReduceSumImpl(zLocal, xLocal, 1, totalLength);
+        int64_t runCycle = AscendC::GetSystemCycle() - start; // elapsed cycles, kept for profiling
+
+        outQueueZ.EnQue<DTYPE>(zLocal);
+        inQueueX.FreeTensor(xLocal);
+    }
+
+    // Binary halving reduction path (tiling key 5, float length 20000); see README item 5.
+    __aicore__ inline void Compute5()
+    {
+        AscendC::LocalTensor<DTYPE> xLocal = inQueueX.DeQue<DTYPE>();
+        AscendC::LocalTensor<DTYPE> zLocal = outQueueZ.AllocTensor<DTYPE>();
+
+        int64_t start = AscendC::GetSystemCycle();
+        BinaryReduceSumImpl(zLocal, xLocal, 1, totalLength);
+        int64_t runCycle = AscendC::GetSystemCycle() - start; // elapsed cycles, kept for profiling
+
+        outQueueZ.EnQue<DTYPE>(zLocal);
         inQueueX.FreeTensor(xLocal);
     }
+
     __aicore__ inline void CopyOut()
     {
-        AscendC::LocalTensor<DTYPE_Z> zLocal = outQueueZ.DeQue<DTYPE_Z>();
+        AscendC::LocalTensor<DTYPE> zLocal = outQueueZ.DeQue<DTYPE>();
         AscendC::DataCopy(zGm, zLocal, this->outLength);
         outQueueZ.FreeTensor(zLocal);
     }
+
+    __aicore__ inline void WholeReduceSumImpl(const AscendC::LocalTensor<DTYPE>& dst,
+                                              const AscendC::LocalTensor<DTYPE>& src,
+                                              const uint32_t bsLength, const uint32_t hLength)
+    {
+        AscendC::SetMaskCount();
+        for (uint32_t i = 0; i < bsLength; i++) {
+            uint32_t totalNum = hLength;
+            AscendC::LocalTensor<DTYPE> srcTmp = src[i * hLength];
+            AscendC::LocalTensor<DTYPE> dstTmp = dst[i * hLength];
+            while (totalNum > 1) {
+                AscendC::SetVectorMask<DTYPE, AscendC::MaskMode::COUNTER>(0, totalNum);
+                AscendC::WholeReduceSum<DTYPE, false>(dstTmp, srcTmp, AscendC::MASK_PLACEHOLDER, 1, DEFAULT_BLK_STRIDE,
+                                                      DEFAULT_BLK_STRIDE, DEFAULT_REP_STRIDE);
+                AscendC::PipeBarrier<PIPE_V>();
+                totalNum = AscendC::DivCeil(totalNum, ONE_REPEAT_FLOAT_SIZE);
+                srcTmp = dstTmp;
+            }
+        }
+        AscendC::ResetMask();
+        AscendC::SetMaskNorm();
+    }
+
+    __aicore__ inline void BinaryReduceSumImpl(const AscendC::LocalTensor<DTYPE>& dst,
+                                               const AscendC::LocalTensor<DTYPE>& src,
+                                               const uint32_t bsLength, const uint32_t hLength)
+    {
+        AscendC::BinaryRepeatParams binaryParams;
+        AscendC::UnaryRepeatParams unaryParams;
+        AscendC::SetMaskCount();
+        for (uint32_t i = 0; i < bsLength; i++) {
+            uint32_t totalNum = hLength;
+            AscendC::LocalTensor<DTYPE> srcTmp = src[i * hLength];
+            AscendC::LocalTensor<DTYPE> dstTmp = dst[i * hLength];
+            while (totalNum > ONE_REPEAT_FLOAT_SIZE) {
+                uint32_t halfNum = AscendC::DivCeil(totalNum, BINARY_BOUNDARY) * DEFAULT_REP_STRIDE;
+                AscendC::SetVectorMask<DTYPE, AscendC::MaskMode::COUNTER>(0, totalNum - halfNum);
+                AscendC::Add<DTYPE, false>(dstTmp, srcTmp, srcTmp[halfNum], AscendC::MASK_PLACEHOLDER, 1, binaryParams);
+                AscendC::PipeBarrier<PIPE_V>();
+                totalNum = halfNum;
+                srcTmp = dstTmp;
+            }
+            AscendC::SetVectorMask<DTYPE, AscendC::MaskMode::COUNTER>(0, totalNum);
+            AscendC::WholeReduceSum<DTYPE, false>(dstTmp, srcTmp, AscendC::MASK_PLACEHOLDER, 1, DEFAULT_BLK_STRIDE,
+                                                  DEFAULT_BLK_STRIDE, DEFAULT_REP_STRIDE);
+            AscendC::PipeBarrier<PIPE_V>();
+        }
+        AscendC::ResetMask();
+        AscendC::SetMaskNorm();
+    }
+
 private:
     AscendC::TPipe pipe;
     AscendC::TQue<AscendC::QuePosition::VECIN, 1> inQueueX;
     AscendC::TQue<AscendC::QuePosition::VECOUT, 1> outQueueZ;
     AscendC::TBuf<AscendC::TPosition::VECCALC> calcBuf;
-    AscendC::GlobalTensor<DTYPE_X> xGm;
-    AscendC::GlobalTensor<DTYPE_Z> zGm;
+    AscendC::GlobalTensor<DTYPE> xGm;
+    AscendC::GlobalTensor<DTYPE> zGm;
     uint32_t totalLength;
     uint32_t outLength;
 };
@@ -130,14 +222,18 @@ private:
 extern "C" __global__ __aicore__ void reduce_custom(GM_ADDR x, GM_ADDR z, GM_ADDR workspace, GM_ADDR tiling)
 {
     GET_TILING_DATA(tiling_data, tiling);
-    KernelReduce op;
+    KernelReduce<DTYPE_X> op;
     op.Init(x, z, tiling_data.totalLength, tiling_data.outLength);
-    if (TILING_KEY_IS(REDUCE_TILING_0)) {
-        op.Process1();
-    } else if (TILING_KEY_IS(REDUCE_TILING_1)) {
-        op.Process2();
+    if (TILING_KEY_IS(REDUCE_TILING_1)) {
+        op.Process<REDUCE_TILING_1>();
     } else if (TILING_KEY_IS(REDUCE_TILING_2)) {
-        op.Process3();
+        op.Process<REDUCE_TILING_2>();
+    } else if (TILING_KEY_IS(REDUCE_TILING_3)) {
+        op.Process<REDUCE_TILING_3>();
+    } else if (TILING_KEY_IS(REDUCE_TILING_4)) {
+        op.Process<REDUCE_TILING_4>();
+    } else if (TILING_KEY_IS(REDUCE_TILING_5)) {
+        op.Process<REDUCE_TILING_5>();
     }
 }
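One way to see why the patch claims BinaryReduceSumImpl scales better is to count the vector passes each path issues. A standalone sketch using the kernel's constants (64 floats per 256-byte repeat, 8-float block alignment); the Add passes are plain element-wise sums, while each WholeReduceSum pass is a full cross-lane reduction:

#include <cstdint>
#include <cstdio>

static uint32_t DivCeil(uint32_t a, uint32_t b)
{
    return (a + b - 1) / b;
}

int main()
{
    // WholeReduceSumImpl: each pass shrinks n by a factor of 64
    // (one result element per 256B repeat): 10000 -> 157 -> 3 -> 1.
    uint32_t n = 10000;
    uint32_t passes = 0;
    while (n > 1) {
        n = DivCeil(n, 64);
        ++passes;
    }
    std::printf("WholeReduceSumImpl(10000): %u WholeReduceSum passes\n", passes); // 3

    // BinaryReduceSumImpl: each pass roughly halves n (8-float aligned),
    // then a single WholeReduceSum finishes the last <= 64 elements.
    n = 20000;
    passes = 0;
    while (n > 64) {
        n = DivCeil(n, 16) * 8;
        ++passes;
    }
    std::printf("BinaryReduceSumImpl(20000): %u Add passes + 1 WholeReduceSum\n", passes); // 9
    return 0;
}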