diff --git a/operator/ascendc/0_introduction/14_reduce_frameworklaunch/AclNNInvocationNaive/main.cpp b/operator/ascendc/0_introduction/14_reduce_frameworklaunch/AclNNInvocationNaive/main.cpp
index 7ecffbc7e59ef3cb3a769f491238b4d6d1354bba..734d487985d1bb28360d29fe468652ad771c5842 100644
--- a/operator/ascendc/0_introduction/14_reduce_frameworklaunch/AclNNInvocationNaive/main.cpp
+++ b/operator/ascendc/0_introduction/14_reduce_frameworklaunch/AclNNInvocationNaive/main.cpp
@@ -95,7 +95,7 @@ void DestroyResources(std::vector<aclTensor *> tensors, std::vector<void *> deviceAdd
 int main(int argc, char **argv)
 {
     constexpr int64_t inputShape = 4096;
-    constexpr float resFloat = 4096.0;
+    float resFloat = 0;
     // 1. (Fixed code) Initialize device / stream, refer to the list of external interfaces of acl
     // Update deviceId to your own device id
     int32_t deviceId = 0;
@@ -110,21 +110,21 @@ int main(int argc, char **argv)
     void *outputZDeviceAddr = nullptr;
     aclTensor *inputX = nullptr;
     aclTensor *outputZ = nullptr;
-    std::vector<aclFloat16> inputXHostData(inputXShape[0]);
-    std::vector<aclFloat16> outputZHostData(outputZShape[0]);
+    std::vector<float> inputXHostData(inputXShape[0], 1.0);
+    std::vector<float> outputZHostData(outputZShape[0], 0);
+
     for (int i = 0; i < inputXShape[0]; ++i) {
-        inputXHostData[i] = aclFloatToFloat16(1.0);
-    }
-    for (int i = 0; i < outputZShape[0]; ++i) {
-        outputZHostData[i] = aclFloatToFloat16(resFloat);
+        inputXHostData[i] = 1.0;
+        resFloat += 1.0;
     }
+
     std::vector<aclTensor *> tensors = {inputX, outputZ};
     std::vector<void *> deviceAddrs = {inputXDeviceAddr, outputZDeviceAddr};
     // Create inputX aclTensor
-    ret = CreateAclTensor(inputXHostData, inputXShape, &inputXDeviceAddr, aclDataType::ACL_FLOAT16, &inputX);
+    ret = CreateAclTensor(inputXHostData, inputXShape, &inputXDeviceAddr, aclDataType::ACL_FLOAT, &inputX);
     CHECK_RET(ret == ACL_SUCCESS, DestroyResources(tensors, deviceAddrs, stream, deviceId); return FAILED);
     // Create outputZ aclTensor
-    ret = CreateAclTensor(outputZHostData, outputZShape, &outputZDeviceAddr, aclDataType::ACL_FLOAT16, &outputZ);
+    ret = CreateAclTensor(outputZHostData, outputZShape, &outputZDeviceAddr, aclDataType::ACL_FLOAT, &outputZ);
     CHECK_RET(ret == ACL_SUCCESS, DestroyResources(tensors, deviceAddrs, stream, deviceId); return FAILED);

     // 3. Call the API of the custom operator library
@@ -154,9 +154,9 @@ int main(int argc, char **argv)
     // 5. Get the output value, copy the result from device memory to host memory, need to modify according to the
     // interface of the API
     auto size = GetShapeSize(outputZShape);
-    std::vector<aclFloat16> resultData(size, 0);
+    std::vector<float> resultData(size, 0);
     ret = aclrtMemcpy(resultData.data(), resultData.size() * sizeof(resultData[0]), outputZDeviceAddr,
-                      size * sizeof(aclFloat16), ACL_MEMCPY_DEVICE_TO_HOST);
+                      size * sizeof(float), ACL_MEMCPY_DEVICE_TO_HOST);
     CHECK_RET(ret == ACL_SUCCESS, LOG_PRINT("copy result from device to host failed. ERROR: %d\n", ret);
               DestroyResources(tensors, deviceAddrs, stream, deviceId, workspaceAddr); return FAILED);
@@ -164,11 +164,11 @@ int main(int argc, char **argv)
     DestroyResources(tensors, deviceAddrs, stream, deviceId, workspaceAddr);

     // print the output result
-    std::vector<aclFloat16> goldenData(size, aclFloatToFloat16(resFloat));
+    std::vector<float> goldenData(size, resFloat);
     LOG_PRINT("result is:\n");
     for (int64_t i = 0; i < 10; i++) {
-        LOG_PRINT("%.1f ", aclFloat16ToFloat(resultData[i]));
+        LOG_PRINT("%.1f ", resultData[i]);
     }
     LOG_PRINT("\n");
     if (std::equal(resultData.begin(), resultData.begin() + 1, goldenData.begin())) {
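The verification above no longer hard-codes the golden value 4096.0; it accumulates resFloat alongside the input fill, so the check stays correct if inputShape changes. A standalone host-side sketch (illustrative only; inputShape mirrors the value above) showing that this float accumulation is exact at this size:

#include <cstdint>
#include <cstdio>
#include <vector>

// Builds the golden value the same way the updated main.cpp does.
// float represents integers exactly up to 2^24, so summing 4096 ones
// accumulates with no rounding error and the golden equals inputShape.
int main()
{
    constexpr int64_t inputShape = 4096;
    std::vector<float> inputXHostData(inputShape, 1.0f);
    float resFloat = 0;
    for (float v : inputXHostData) {
        resFloat += v;
    }
    std::printf("golden = %.1f (expect %.1f)\n", resFloat, static_cast<float>(inputShape));
    return 0;
}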
diff --git a/operator/ascendc/0_introduction/14_reduce_frameworklaunch/README.md b/operator/ascendc/0_introduction/14_reduce_frameworklaunch/README.md
index 6f9bc094b7389be7e8fdae7d73ecdb3a3a4bf810..04e13268dc68020ac8de0ff6965e5036e0fc6688 100644
--- a/operator/ascendc/0_introduction/14_reduce_frameworklaunch/README.md
+++ b/operator/ascendc/0_introduction/14_reduce_frameworklaunch/README.md
@@ -25,6 +25,10 @@ z = sum(x)
 3. For float inputs in (2KB,16KB] (half inputs in (4KB,32KB]): one WholeReduceSum accumulates more efficiently than two BlockReduceSum instructions, so two WholeReduceSum instructions (rather than two BlockReduceSum plus one WholeReduceSum) are used to obtain the sum over this buffer.

+4. For a float input of length 10000, handled by WholeReduceSumImpl: in Counter mode, the WholeReduceSum instruction is applied in a loop to each row of the two-dimensional data, producing the reduction result for every row.
+
+5. For a float input of length 20000, handled by BinaryReduceSumImpl: in Counter mode, the data is split in half and the two halves are summed with the Add instruction; this repeats until one final WholeReduceSum instruction yields the reduction result. Compared with the single-instruction WholeReduceSum approach, this performs better when the data volume is large and many loop iterations are needed.
+
 Note that the code uses Counter mode.

 ## Operator specification
@@ -134,3 +138,4 @@ The CANN package provides the project creation tool msOpGen; the ReduceCustom operator project can
 | ---------- | ---------------------------- |
 | 2024/09/14 | Added the ReduceCustom sample |
 | 2024/11/18 | Operator project rewritten to be generated by msOpGen |
+| 2025/07/07 | Added two reduction-operation samples |
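README item 5 is easier to follow with the arithmetic written out. The sketch below is a plain host-side C++ model of BinaryReduceSumImpl (illustrative names, not Ascend C): each pass rounds the split point up to an 8-float (32-byte) block, folds the tail onto the head element-wise, and a final pass sums the at most 64 remaining elements.

#include <cstdio>
#include <vector>

// Host model of the binary halving reduction. The invariant is that the
// sum of v[0..n) is unchanged by each pass, because the tail v[half..n)
// is added element-wise onto the head.
float BinaryReduceSumModel(std::vector<float> v)
{
    const unsigned kRepeatElems = 64; // one 256B vector repeat of floats
    unsigned n = static_cast<unsigned>(v.size());
    while (n > kRepeatElems) {
        unsigned half = (n + 15) / 16 * 8;        // split point, 8-float aligned
        for (unsigned j = 0; j + half < n; ++j) { // models the Add instruction
            v[j] += v[half + j];
        }
        n = half;
    }
    float sum = 0; // models the final WholeReduceSum
    for (unsigned j = 0; j < n; ++j) {
        sum += v[j];
    }
    return sum;
}

int main()
{
    std::vector<float> x(20000, 1.0f);
    std::printf("%.1f\n", BinaryReduceSumModel(x)); // prints 20000.0
    return 0;
}

For 20000 ones this prints 20000.0, which is exactly what the host test expects from the kernel path with tiling key 5.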
diff --git a/operator/ascendc/0_introduction/14_reduce_frameworklaunch/ReduceCustom/op_host/reduce_custom.cpp b/operator/ascendc/0_introduction/14_reduce_frameworklaunch/ReduceCustom/op_host/reduce_custom.cpp
index 5bec0d17ebc8b0205f58bd87ad28ae5adceec3e4..743fb162bfa10294b9f91a046158866c0a162fc7 100644
--- a/operator/ascendc/0_introduction/14_reduce_frameworklaunch/ReduceCustom/op_host/reduce_custom.cpp
+++ b/operator/ascendc/0_introduction/14_reduce_frameworklaunch/ReduceCustom/op_host/reduce_custom.cpp
@@ -9,21 +9,26 @@
  */
 #include "reduce_custom_tiling.h"
 #include "register/op_def_registry.h"
-#define REDUCE_TILING_0 1
-#define REDUCE_TILING_1 2
-#define REDUCE_TILING_2 3
 namespace optiling {
+constexpr uint32_t REDUCE_TILING_1 = 1;
+constexpr uint32_t REDUCE_TILING_2 = 2;
+constexpr uint32_t REDUCE_TILING_3 = 3;
+constexpr uint32_t REDUCE_TILING_4 = 4;
+constexpr uint32_t REDUCE_TILING_5 = 5;
+
 constexpr uint32_t BLOCK_DIM = 1;
 constexpr uint32_t ONE_REPEAT_LEN = 256;
 constexpr uint32_t ONE_BLOCK_LEN = 32;
 constexpr uint32_t OUT_SHAPE = 32;
-constexpr uint32_t HALF_THRESHOLD0 = ONE_REPEAT_LEN / sizeof(uint16_t);
-constexpr uint32_t FLOAT_THRESHOLD0 = ONE_REPEAT_LEN / sizeof(float);
-constexpr uint32_t HALF_THRESHOLD1 = ONE_REPEAT_LEN / sizeof(uint16_t) * ONE_BLOCK_LEN / sizeof(uint16_t);
-constexpr uint32_t FLOAT_THRESHOLD1 = ONE_REPEAT_LEN / sizeof(float) * ONE_BLOCK_LEN / sizeof(float);
-constexpr uint32_t HALF_THRESHOLD2 = ONE_REPEAT_LEN / sizeof(uint16_t) * ONE_REPEAT_LEN / sizeof(uint16_t);
-constexpr uint32_t FLOAT_THRESHOLD2 = ONE_REPEAT_LEN / sizeof(float) * ONE_REPEAT_LEN / sizeof(float);
+constexpr uint32_t HALF_THRESHOLD0 = ONE_REPEAT_LEN / sizeof(uint16_t);                                     // 128
+constexpr uint32_t FLOAT_THRESHOLD0 = ONE_REPEAT_LEN / sizeof(float);                                       // 64
+constexpr uint32_t HALF_THRESHOLD1 = ONE_REPEAT_LEN / sizeof(uint16_t) * ONE_BLOCK_LEN / sizeof(uint16_t);  // 2048
+constexpr uint32_t FLOAT_THRESHOLD1 = ONE_REPEAT_LEN / sizeof(float) * ONE_BLOCK_LEN / sizeof(float);       // 512
+constexpr uint32_t HALF_THRESHOLD2 = ONE_REPEAT_LEN / sizeof(uint16_t) * ONE_REPEAT_LEN / sizeof(uint16_t); // 16384
+constexpr uint32_t FLOAT_THRESHOLD2 = ONE_REPEAT_LEN / sizeof(float) * ONE_REPEAT_LEN / sizeof(float);      // 4096
+constexpr uint32_t WHOLEREDUCESUM_SINGLE_MODE = 10000;
+constexpr uint32_t BINARYREDUCESUM_SINGLE_MODE = 20000;

 static ge::graphStatus TilingFunc(gert::TilingContext *context)
 {
     TilingData tiling;
@@ -32,15 +37,19 @@ static ge::graphStatus TilingFunc(gert::TilingContext *context)
     // Only WholeReduceSum is used under 256B.
     if ((totalLength <= HALF_THRESHOLD0 && inputDtype == ge::DT_FLOAT16) ||
         (totalLength <= FLOAT_THRESHOLD0 && inputDtype == ge::DT_FLOAT)) {
-        context->SetTilingKey(REDUCE_TILING_0);
+        context->SetTilingKey(REDUCE_TILING_1);
     // One WholeReduceSum and one BlockReduceSum are used in (256B,2KB](for float input) and (256B,4KB](for half input).
     } else if ((totalLength <= HALF_THRESHOLD1 && inputDtype == ge::DT_FLOAT16) ||
                (totalLength <= FLOAT_THRESHOLD1 && inputDtype == ge::DT_FLOAT)) {
-        context->SetTilingKey(REDUCE_TILING_1);
+        context->SetTilingKey(REDUCE_TILING_2);
     // Two WholeReduceSum are used in (2KB,16KB](for float input) and (4KB,32KB](for half input).
     } else if ((totalLength <= HALF_THRESHOLD2 && inputDtype == ge::DT_FLOAT16) ||
                (totalLength <= FLOAT_THRESHOLD2 && inputDtype == ge::DT_FLOAT)) {
-        context->SetTilingKey(REDUCE_TILING_2);
+        context->SetTilingKey(REDUCE_TILING_3);
+    } else if (totalLength == WHOLEREDUCESUM_SINGLE_MODE) {
+        context->SetTilingKey(REDUCE_TILING_4);
+    } else if (totalLength == BINARYREDUCESUM_SINGLE_MODE) {
+        context->SetTilingKey(REDUCE_TILING_5);
     }
     context->SetBlockDim(BLOCK_DIM);
     tiling.set_totalLength(totalLength);
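The dispatch above is easiest to read as a table of boundaries. Below is a standalone sketch of the float-input branch (SelectFloatTilingKey is a hypothetical helper, not the gert API); it also makes explicit that a length matching no branch leaves the tiling key unset:

#include <cstdint>
#include <cstdio>

// Float-input tiling-key selection, thresholds in elements:
//   key 1: len <= 64    (FLOAT_THRESHOLD0, single WholeReduceSum)
//   key 2: len <= 512   (FLOAT_THRESHOLD1, BlockReduceSum + WholeReduceSum)
//   key 3: len <= 4096  (FLOAT_THRESHOLD2, two WholeReduceSum)
//   key 4: len == 10000 (WholeReduceSumImpl)
//   key 5: len == 20000 (BinaryReduceSumImpl)
uint32_t SelectFloatTilingKey(uint32_t totalLength)
{
    if (totalLength <= 64) {
        return 1;
    }
    if (totalLength <= 512) {
        return 2;
    }
    if (totalLength <= 4096) {
        return 3;
    }
    if (totalLength == 10000) {
        return 4;
    }
    if (totalLength == 20000) {
        return 5;
    }
    return 0; // no branch taken: TilingFunc would leave the key unset
}

int main()
{
    const uint32_t lens[] = {64, 65, 512, 4096, 10000, 20000, 30000};
    for (uint32_t len : lens) {
        std::printf("len=%u -> key %u\n", len, SelectFloatTilingKey(len));
    }
    return 0;
}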
diff --git a/operator/ascendc/0_introduction/14_reduce_frameworklaunch/ReduceCustom/op_kernel/reduce_custom.cpp b/operator/ascendc/0_introduction/14_reduce_frameworklaunch/ReduceCustom/op_kernel/reduce_custom.cpp
index c4ac235d371aa9fb0322e9ff02cdd9aeb7ab493f..d8d6313326cd2ca66145af27c18dd6ca0e51577c 100644
--- a/operator/ascendc/0_introduction/14_reduce_frameworklaunch/ReduceCustom/op_kernel/reduce_custom.cpp
+++ b/operator/ascendc/0_introduction/14_reduce_frameworklaunch/ReduceCustom/op_kernel/reduce_custom.cpp
@@ -8,15 +8,20 @@
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
  */
 #include "kernel_operator.h"
-#define REDUCE_TILING_0 1
-#define REDUCE_TILING_1 2
-#define REDUCE_TILING_2 3
+#define REDUCE_TILING_1 1
+#define REDUCE_TILING_2 2
+#define REDUCE_TILING_3 3
+#define REDUCE_TILING_4 4
+#define REDUCE_TILING_5 5
+template <typename DTYPE>
 class KernelReduce {
     static constexpr uint32_t DEFAULT_BLK_STRIDE = 1;
     static constexpr uint32_t DEFAULT_REP_STRIDE = 8;
     static constexpr uint32_t REP_LEN = 256;
     static constexpr uint32_t BLK_LEN = 32;
+    static constexpr uint32_t ONE_REPEAT_FLOAT_SIZE = REP_LEN / 4;
+    static constexpr uint32_t BINARY_BOUNDARY = DEFAULT_REP_STRIDE * 2;

 public:
     __aicore__ inline KernelReduce() {}
     __aicore__ inline void Init(GM_ADDR x, GM_ADDR z, uint32_t totalLength, uint32_t outLength)
@@ -24,105 +29,192 @@ public:
         this->totalLength = totalLength;
         this->outLength = outLength;
-        xGm.SetGlobalBuffer((__gm__ DTYPE_X *)x, totalLength);
-        zGm.SetGlobalBuffer((__gm__ DTYPE_Z *)z, outLength);
-        pipe.InitBuffer(inQueueX, 1, totalLength * sizeof(DTYPE_X));
-        pipe.InitBuffer(outQueueZ, 1, outLength * sizeof(DTYPE_Z));
+        xGm.SetGlobalBuffer((__gm__ DTYPE *)x, totalLength);
+        zGm.SetGlobalBuffer((__gm__ DTYPE *)z, outLength);
+        pipe.InitBuffer(inQueueX, 1, totalLength * sizeof(DTYPE));
+        pipe.InitBuffer(outQueueZ, 1, outLength * sizeof(DTYPE));
     }
-    __aicore__ inline void Process1()
+
+    template <uint32_t ComputeKey>
+    __aicore__ inline void Compute()
     {
-        CopyIn();
-        Compute1();
-        CopyOut();
+        if constexpr (ComputeKey == REDUCE_TILING_1) {
+            Compute1();
+        } else if constexpr (ComputeKey == REDUCE_TILING_2) {
+            Compute2();
+        } else if constexpr (ComputeKey == REDUCE_TILING_3) {
+            Compute3();
+        } else if constexpr (ComputeKey == REDUCE_TILING_4) {
+            Compute4();
+        } else if constexpr (ComputeKey == REDUCE_TILING_5) {
+            Compute5();
+        }
     }
-    __aicore__ inline void Process2()
-    {
-        CopyIn();
-        Compute2();
-        CopyOut();
-    }
-    __aicore__ inline void Process3()
+
+    template <uint32_t ComputeKey>
+    __aicore__ inline void Process()
     {
         CopyIn();
-        Compute3();
+        Compute<ComputeKey>();
         CopyOut();
     }

 private:
     __aicore__ inline void CopyIn()
     {
-        AscendC::LocalTensor<DTYPE_X> xLocal = inQueueX.AllocTensor<DTYPE_X>();
+        AscendC::LocalTensor<DTYPE> xLocal = inQueueX.AllocTensor<DTYPE>();
         AscendC::DataCopy(xLocal, xGm, totalLength);
         inQueueX.EnQue(xLocal);
     }
     // Only WholeReduceSum is used under 256B.
     __aicore__ inline void Compute1()
     {
-        AscendC::LocalTensor<DTYPE_X> xLocal = inQueueX.DeQue<DTYPE_X>();
-        AscendC::LocalTensor<DTYPE_Z> zLocal = outQueueZ.AllocTensor<DTYPE_Z>();
-        constexpr int64_t maskLen = REP_LEN / sizeof(DTYPE_X);
-        AscendC::WholeReduceSum<DTYPE_X>(zLocal, xLocal, maskLen, 1,
+        AscendC::LocalTensor<DTYPE> xLocal = inQueueX.DeQue<DTYPE>();
+        AscendC::LocalTensor<DTYPE> zLocal = outQueueZ.AllocTensor<DTYPE>();
+
+        constexpr int64_t maskLen = REP_LEN / sizeof(DTYPE);
+        AscendC::WholeReduceSum<DTYPE>(zLocal, xLocal, maskLen, 1,
                                        DEFAULT_BLK_STRIDE, DEFAULT_BLK_STRIDE, DEFAULT_REP_STRIDE);
-        outQueueZ.EnQue<DTYPE_Z>(zLocal);
+
+        outQueueZ.EnQue<DTYPE>(zLocal);
         inQueueX.FreeTensor(xLocal);
     }
     // One WholeReduceSum and one BlockReduceSum are used in (256B,2KB](for float input) and (256B,4KB](for half input).
     __aicore__ inline void Compute2()
     {
-        AscendC::LocalTensor<DTYPE_X> xLocal = inQueueX.DeQue<DTYPE_X>();
-        AscendC::LocalTensor<DTYPE_Z> zLocal = outQueueZ.AllocTensor<DTYPE_Z>();
-        pipe.InitBuffer(calcBuf, totalLength * sizeof(DTYPE_X));
-        AscendC::LocalTensor<DTYPE_X> tempTensor1 = calcBuf.Get<DTYPE_X>();
-        constexpr uint32_t c0Count = BLK_LEN / sizeof(DTYPE_X);
+        AscendC::LocalTensor<DTYPE> xLocal = inQueueX.DeQue<DTYPE>();
+        AscendC::LocalTensor<DTYPE> zLocal = outQueueZ.AllocTensor<DTYPE>();
+        pipe.InitBuffer(calcBuf, totalLength * sizeof(DTYPE));
+        AscendC::LocalTensor<DTYPE> tempTensor1 = calcBuf.Get<DTYPE>();
+        constexpr uint32_t c0Count = BLK_LEN / sizeof(DTYPE);
         const uint32_t blockNum0 = (totalLength + c0Count - 1) / c0Count;
+
         AscendC::SetMaskCount();
-        AscendC::SetVectorMask<DTYPE_X, AscendC::MaskMode::COUNTER>(0, totalLength);
-        AscendC::BlockReduceSum<DTYPE_X, false>(tempTensor1, xLocal, AscendC::MASK_PLACEHOLDER, 1,
+        AscendC::SetVectorMask<DTYPE, AscendC::MaskMode::COUNTER>(0, totalLength);
+        AscendC::BlockReduceSum<DTYPE, false>(tempTensor1, xLocal, 1, AscendC::MASK_PLACEHOLDER,
             DEFAULT_BLK_STRIDE, DEFAULT_BLK_STRIDE, DEFAULT_REP_STRIDE);
         AscendC::PipeBarrier<PIPE_V>();
-        AscendC::SetVectorMask<DTYPE_X, AscendC::MaskMode::COUNTER>(0, blockNum0);
-        AscendC::WholeReduceSum<DTYPE_X, false>(zLocal, tempTensor1, AscendC::MASK_PLACEHOLDER, 1,
+        AscendC::SetVectorMask<DTYPE, AscendC::MaskMode::COUNTER>(0, blockNum0);
+        AscendC::WholeReduceSum<DTYPE, false>(zLocal, tempTensor1, AscendC::MASK_PLACEHOLDER, 1,
             DEFAULT_BLK_STRIDE, DEFAULT_BLK_STRIDE, DEFAULT_REP_STRIDE);
         AscendC::PipeBarrier<PIPE_V>();
         AscendC::SetMaskNorm();
-        outQueueZ.EnQue<DTYPE_Z>(zLocal);
+
+        outQueueZ.EnQue<DTYPE>(zLocal);
         inQueueX.FreeTensor(xLocal);
     }
     // Two WholeReduceSum are used in (2KB,16KB](for float input) and (4KB,32KB](for half input).
     __aicore__ inline void Compute3()
     {
-        AscendC::LocalTensor<DTYPE_X> xLocal = inQueueX.DeQue<DTYPE_X>();
-        AscendC::LocalTensor<DTYPE_Z> zLocal = outQueueZ.AllocTensor<DTYPE_Z>();
-        pipe.InitBuffer(calcBuf, totalLength * sizeof(DTYPE_X));
-        AscendC::LocalTensor<DTYPE_X> tempTensor1 = calcBuf.Get<DTYPE_X>();
-        const uint32_t repeatNum = (totalLength * sizeof(DTYPE_X) + REP_LEN - 1) / REP_LEN;
+        AscendC::LocalTensor<DTYPE> xLocal = inQueueX.DeQue<DTYPE>();
+        AscendC::LocalTensor<DTYPE> zLocal = outQueueZ.AllocTensor<DTYPE>();
+        pipe.InitBuffer(calcBuf, totalLength * sizeof(DTYPE));
+        AscendC::LocalTensor<DTYPE> tempTensor1 = calcBuf.Get<DTYPE>();
+        const uint32_t repeatNum = (totalLength * sizeof(DTYPE) + REP_LEN - 1) / REP_LEN;
+
         AscendC::SetMaskCount();
-        AscendC::SetVectorMask<DTYPE_X, AscendC::MaskMode::COUNTER>(0, totalLength);
-        AscendC::WholeReduceSum<DTYPE_X, false>(tempTensor1, xLocal, AscendC::MASK_PLACEHOLDER, 1,
+        AscendC::SetVectorMask<DTYPE, AscendC::MaskMode::COUNTER>(0, totalLength);
+        AscendC::WholeReduceSum<DTYPE, false>(tempTensor1, xLocal, AscendC::MASK_PLACEHOLDER, 1,
             DEFAULT_BLK_STRIDE, DEFAULT_BLK_STRIDE, DEFAULT_REP_STRIDE);
         AscendC::PipeBarrier<PIPE_V>();
-        AscendC::SetVectorMask<DTYPE_X, AscendC::MaskMode::COUNTER>(0, repeatNum);
-        AscendC::WholeReduceSum<DTYPE_X, false>(zLocal, tempTensor1, AscendC::MASK_PLACEHOLDER, 1,
+        AscendC::SetVectorMask<DTYPE, AscendC::MaskMode::COUNTER>(0, repeatNum);
+        AscendC::WholeReduceSum<DTYPE, false>(zLocal, tempTensor1, AscendC::MASK_PLACEHOLDER, 1,
             DEFAULT_BLK_STRIDE, DEFAULT_BLK_STRIDE, DEFAULT_REP_STRIDE);
         AscendC::PipeBarrier<PIPE_V>();
         AscendC::SetMaskNorm();
-        outQueueZ.EnQue<DTYPE_Z>(zLocal);
+
+        outQueueZ.EnQue<DTYPE>(zLocal);
+        inQueueX.FreeTensor(xLocal);
+    }
+
+    // Looped WholeReduceSum path (tiling key 4, float length 10000); see README item 4.
+    __aicore__ inline void Compute4()
+    {
+        AscendC::LocalTensor<DTYPE> xLocal = inQueueX.DeQue<DTYPE>();
+        AscendC::LocalTensor<DTYPE> zLocal = outQueueZ.AllocTensor<DTYPE>();
+
+        int64_t start = AscendC::GetSystemCycle();
+        WholeReduceSumImpl(zLocal, xLocal, 1, totalLength);
+        int64_t runCycle = AscendC::GetSystemCycle() - start; // elapsed cycles, kept for profiling
+
+        outQueueZ.EnQue<DTYPE>(zLocal);
+        inQueueX.FreeTensor(xLocal);
+    }
+
+    // Binary halving reduction path (tiling key 5, float length 20000); see README item 5.
+    __aicore__ inline void Compute5()
+    {
+        AscendC::LocalTensor<DTYPE> xLocal = inQueueX.DeQue<DTYPE>();
+        AscendC::LocalTensor<DTYPE> zLocal = outQueueZ.AllocTensor<DTYPE>();
+
+        int64_t start = AscendC::GetSystemCycle();
+        BinaryReduceSumImpl(zLocal, xLocal, 1, totalLength);
+        int64_t runCycle = AscendC::GetSystemCycle() - start; // elapsed cycles, kept for profiling
+
+        outQueueZ.EnQue<DTYPE>(zLocal);
         inQueueX.FreeTensor(xLocal);
     }
+
     __aicore__ inline void CopyOut()
     {
-        AscendC::LocalTensor<DTYPE_Z> zLocal = outQueueZ.DeQue<DTYPE_Z>();
+        AscendC::LocalTensor<DTYPE> zLocal = outQueueZ.DeQue<DTYPE>();
         AscendC::DataCopy(zGm, zLocal, this->outLength);
         outQueueZ.FreeTensor(zLocal);
     }
+
+    __aicore__ inline void WholeReduceSumImpl(const AscendC::LocalTensor<DTYPE>& dst,
+                                              const AscendC::LocalTensor<DTYPE>& src,
+                                              const uint32_t bsLength, const uint32_t hLength)
+    {
+        AscendC::SetMaskCount();
+        for (uint32_t i = 0; i < bsLength; i++) {
+            uint32_t totalNum = hLength;
+            AscendC::LocalTensor<DTYPE> srcTmp = src[i * hLength];
+            AscendC::LocalTensor<DTYPE> dstTmp = dst[i * hLength];
+            while (totalNum > 1) {
+                AscendC::SetVectorMask<DTYPE, AscendC::MaskMode::COUNTER>(0, totalNum);
+                AscendC::WholeReduceSum<DTYPE, false>(dstTmp, srcTmp, AscendC::MASK_PLACEHOLDER, 1, DEFAULT_BLK_STRIDE,
+                                                      DEFAULT_BLK_STRIDE, DEFAULT_REP_STRIDE);
+                AscendC::PipeBarrier<PIPE_V>();
+                totalNum = AscendC::DivCeil(totalNum, ONE_REPEAT_FLOAT_SIZE);
+                srcTmp = dstTmp;
+            }
+        }
+        AscendC::ResetMask();
+        AscendC::SetMaskNorm();
+    }
+
+    __aicore__ inline void BinaryReduceSumImpl(const AscendC::LocalTensor<DTYPE>& dst,
+                                               const AscendC::LocalTensor<DTYPE>& src,
+                                               const uint32_t bsLength, const uint32_t hLength)
+    {
+        AscendC::BinaryRepeatParams binaryParams;
+        AscendC::UnaryRepeatParams unaryParams;
+        AscendC::SetMaskCount();
+        for (uint32_t i = 0; i < bsLength; i++) {
+            uint32_t totalNum = hLength;
+            AscendC::LocalTensor<DTYPE> srcTmp = src[i * hLength];
+            AscendC::LocalTensor<DTYPE> dstTmp = dst[i * hLength];
+            while (totalNum > ONE_REPEAT_FLOAT_SIZE) {
+                uint32_t halfNum = AscendC::DivCeil(totalNum, BINARY_BOUNDARY) * DEFAULT_REP_STRIDE;
+                AscendC::SetVectorMask<DTYPE, AscendC::MaskMode::COUNTER>(0, totalNum - halfNum);
+                AscendC::Add<DTYPE, false>(dstTmp, srcTmp, srcTmp[halfNum], AscendC::MASK_PLACEHOLDER, 1, binaryParams);
+                AscendC::PipeBarrier<PIPE_V>();
+                totalNum = halfNum;
+                srcTmp = dstTmp;
+            }
+            AscendC::SetVectorMask<DTYPE, AscendC::MaskMode::COUNTER>(0, totalNum);
+            AscendC::WholeReduceSum<DTYPE, false>(dstTmp, srcTmp, AscendC::MASK_PLACEHOLDER, 1, DEFAULT_BLK_STRIDE,
+                                                  DEFAULT_BLK_STRIDE, DEFAULT_REP_STRIDE);
+            AscendC::PipeBarrier<PIPE_V>();
+        }
+        AscendC::ResetMask();
+        AscendC::SetMaskNorm();
+    }
+
 private:
     AscendC::TPipe pipe;
     AscendC::TQue<AscendC::QuePosition::VECIN, 1> inQueueX;
     AscendC::TQue<AscendC::QuePosition::VECOUT, 1> outQueueZ;
     AscendC::TBuf<AscendC::TPosition::VECCALC> calcBuf;
-    AscendC::GlobalTensor<DTYPE_X> xGm;
-    AscendC::GlobalTensor<DTYPE_Z> zGm;
+    AscendC::GlobalTensor<DTYPE> xGm;
+    AscendC::GlobalTensor<DTYPE> zGm;
     uint32_t totalLength;
     uint32_t outLength;
 };
@@ -130,14 +222,18 @@ private:
 extern "C" __global__ __aicore__ void reduce_custom(GM_ADDR x, GM_ADDR z, GM_ADDR workspace, GM_ADDR tiling)
 {
     GET_TILING_DATA(tiling_data, tiling);
-    KernelReduce op;
+    KernelReduce<DTYPE_X> op;
     op.Init(x, z, tiling_data.totalLength, tiling_data.outLength);
-    if (TILING_KEY_IS(REDUCE_TILING_0)) {
-        op.Process1();
-    } else if (TILING_KEY_IS(REDUCE_TILING_1)) {
-        op.Process2();
+    if (TILING_KEY_IS(REDUCE_TILING_1)) {
+        op.Process<REDUCE_TILING_1>();
     } else if (TILING_KEY_IS(REDUCE_TILING_2)) {
-        op.Process3();
+        op.Process<REDUCE_TILING_2>();
+    } else if (TILING_KEY_IS(REDUCE_TILING_3)) {
+        op.Process<REDUCE_TILING_3>();
+    } else if (TILING_KEY_IS(REDUCE_TILING_4)) {
+        op.Process<REDUCE_TILING_4>();
+    } else if (TILING_KEY_IS(REDUCE_TILING_5)) {
+        op.Process<REDUCE_TILING_5>();
     }
 }
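One way to see why the patch claims BinaryReduceSumImpl scales better is to count the vector passes each path issues. A standalone sketch using the kernel's constants (64 floats per 256-byte repeat, 8-float block alignment); the Add passes are plain element-wise sums, while each WholeReduceSum pass is a full cross-lane reduction:

#include <cstdint>
#include <cstdio>

static uint32_t DivCeil(uint32_t a, uint32_t b)
{
    return (a + b - 1) / b;
}

int main()
{
    // WholeReduceSumImpl: each pass shrinks n by a factor of 64
    // (one result element per 256B repeat): 10000 -> 157 -> 3 -> 1.
    uint32_t n = 10000;
    uint32_t passes = 0;
    while (n > 1) {
        n = DivCeil(n, 64);
        ++passes;
    }
    std::printf("WholeReduceSumImpl(10000): %u WholeReduceSum passes\n", passes); // 3

    // BinaryReduceSumImpl: each pass roughly halves n (8-float aligned),
    // then a single WholeReduceSum finishes the last <= 64 elements.
    n = 20000;
    passes = 0;
    while (n > 64) {
        n = DivCeil(n, 16) * 8;
        ++passes;
    }
    std::printf("BinaryReduceSumImpl(20000): %u Add passes + 1 WholeReduceSum\n", passes); // 9
    return 0;
}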