diff --git a/atvc/examples/reduce_max/README.md b/atvc/examples/reduce_max/README.md new file mode 100644 index 0000000000000000000000000000000000000000..358fa460c8e2133169e7da139d0e6b0c45e67e77 --- /dev/null +++ b/atvc/examples/reduce_max/README.md @@ -0,0 +1,46 @@ + + +## 概述 + +本样例介绍了利用ATVC实现ReduceMax单算子并完成功能验证 + + +## 样例支持产品型号: +- Atlas A2训练系列产品 + +## 目录结构 + +| 文件名 | 描述 | +| ------------------------------------------------------------ | ------------------------------------------------------------ | +| [reduce_max.cpp](./reduce_max.cpp) | ReduceMax算子代码实现以及调用样例 | + +## 算子描述 + +ReduceMax是对输入tensor的指定轴进行规约累加的计算并输出结果的Reduce类算子。 + +ReduceMax算子规格: + + + + + + + + + + + + + +
+| 算子类型(OpType) | ReduceMax | | | |
+| ---------------- | ------------ | -------- | --------- | ------ |
+| 算子输入 | name | shape | data type | format |
+| | x | 8 * 1024 | float | ND |
+| 算子输出 | y | 1 * 1024 | float | ND |
+| 核函数名 | ReduceCustom | | | |
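+
+上表中的输入输出信息在代码中通过`ATVC::OpTraits`描述,核函数再把`ATVC::ReduceMaxCompute`作为计算模板传入`ReduceOpTemplate`完成调度。下面给出一个简化示意(模板参数与调用写法仅作说明,具体以[reduce_max.cpp](./reduce_max.cpp)和ATVC头文件为准):
+
+```cpp
+// ReduceMax算子的描述:一个float输入、一个float输出
+using ReduceOpTraits = ATVC::OpTraits<ATVC::OpInputs<float>, ATVC::OpOutputs<float>>;
+
+// 核函数:以ReduceMaxCompute为计算模板,Policy由Host侧Tiling API分派
+template <class Traits, const ATVC::ReducePolicy& Policy>
+__global__ __aicore__ void ReduceCustom(GM_ADDR x, GM_ADDR y, ATVC::ReduceParam reduceParam)
+{
+    auto op = ATVC::Kernel::ReduceOpTemplate<ATVC::ReduceMaxCompute<Traits>, Policy>();
+    op.Run(x, y, &reduceParam);
+}
+```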
+ +## 算子运行 +在ascendc-api-adv代码仓目录下执行: +```bash +$ cd ./atvc/examples +$ bash run_examples.sh reduce_max +... +Generate golden data successfully. +... +Accuracy verification passed. +``` \ No newline at end of file diff --git a/atvc/examples/reduce_max/reduce_max.cpp b/atvc/examples/reduce_max/reduce_max.cpp new file mode 100644 index 0000000000000000000000000000000000000000..a2176809706f94bb442b5e2632e0b2c38e8fcfa7 --- /dev/null +++ b/atvc/examples/reduce_max/reduce_max.cpp @@ -0,0 +1,178 @@ +/** + * Copyright (c) Huawei Technologies Co., Ltd. 2025. All rights reserved. + * + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ + +#include +#include +#include +#include +#include +#include +#include "acl/acl.h" +#include "reduce/reduce_host.h" +#include "reduce/reduce_device.h" +#include "example_common.h" + +namespace { +// ReduceSum算子的描述:一个输入,一个输出,类型均为float +using ReduceOpTraits = ATVC::OpTraits, ATVC::OpOutputs>; + +void CleanUp(uint8_t *&xDevice, uint8_t *&yDevice, uint8_t *&yHost) +{ + CHECK_ACL(aclrtFree(xDevice)); + CHECK_ACL(aclrtFree(yDevice)); + CHECK_ACL(aclrtFreeHost(yHost)); +} +} + +/* + * 该函数为ReduceCustom算子核函数入口 + * x Device上的gm地址,指向Add算子第一个输入 + * y Device上的gm地址,指向Add算子第一个输出 + * reduceParam 指向运行态ATVC::ReduceParam数据 +*/ +template +__global__ __aicore__ void ReduceCustom( + GM_ADDR x, + GM_ADDR y, + ATVC::ReduceParam reduceParam +) +{ + KERNEL_TASK_TYPE_DEFAULT(KERNEL_TYPE_MIX_AIV_1_0); // 使用了多核控制指令,设置算子执行时只启动Vector核 + // 将计算模板类模板定义作为模板参数传入,Policy由Host层的策略分派API给出 + auto op = ATVC::Kernel::ReduceOpTemplate, Policy>(); + op.Run(x, y, &reduceParam); +} + +namespace { +// 负责Reduce类算子的调度,选择对应的Policy最佳策略并执行Kernel函数 +template +void ReduceOpAdapter(uint8_t* x, uint8_t* y, ATVC::ReduceParam ¶m, ATVC::ReducePolicy &policy, aclrtStream& stream) +{ + // 申请临时空间workspace + uint8_t *workspaceDevice; + CHECK_ACL(aclrtMalloc((void **)&workspaceDevice, param.workspaceSize, ACL_MEM_MALLOC_HUGE_FIRST)); + param.workspaceAddr = reinterpret_cast(workspaceDevice); + // 将tiling api计算出的ReducePolicy转化为编译态参数并实例化相应的核函数 + if (policy == ATVC::REDUCE_POLICY0) { + ReduceCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY1) { + ReduceCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY2) { + ReduceCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY3) { + ReduceCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY4) { + ReduceCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY5) { + ReduceCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY6) { + ReduceCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY7) { + ReduceCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY8) { + ReduceCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY9) { + ReduceCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY10) { + ReduceCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY11) { + ReduceCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY12) { + 
ReduceCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY13) { + ReduceCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY14) { + ReduceCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY15) { + ReduceCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY16) { + ReduceCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY17) { + ReduceCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY18) { + ReduceCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY19) { + ReduceCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY20) { + ReduceCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY21) { + ReduceCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY22) { + ReduceCustom<<>>(x, y, param); + } else { + (void)printf("[ERROR]: Cannot find any matched policy.\n"); + } + // 流同步后释放申请的param内存 + CHECK_ACL(aclrtSynchronizeStream(stream)); + CHECK_ACL(aclrtFree(workspaceDevice)); +} +} + +int32_t main(int32_t argc, char* argv[]) +{ + if (!ATVC::Host::DebugCheck()) { + (void)printf("[ERROR]: Reduce OpTraits check failed.\n"); + return -1; + } + int32_t eleNum = 8 * 1024; + int32_t outEleNum = 1 * 1024; + size_t inputByteSize = static_cast(eleNum) * sizeof(float); + size_t outputByteSize = static_cast(outEleNum) * sizeof(float); + std::vector dim{0}; // 对第0轴执行reduce操作 + std::vector shape{8, 1024}; // 测试输入shape + std::vector inputX(eleNum, 1.0f); + std::vector golden(outEleNum, 1.0f); + (void)printf("Generate golden data successfully.\n"); + // 初始化Acl资源 + CHECK_ACL(aclInit(nullptr)); + aclrtContext context; + int32_t deviceId = 0; + CHECK_ACL(aclrtSetDevice(deviceId)); + CHECK_ACL(aclrtCreateContext(&context, deviceId)); + aclrtStream stream = nullptr; + CHECK_ACL(aclrtCreateStream(&stream)); + uint8_t *yHost; + uint8_t *xDevice; + uint8_t *yDevice; + + CHECK_ACL(aclrtMallocHost((void **)(&yHost), outputByteSize)); + CHECK_ACL(aclrtMalloc((void **)&xDevice, inputByteSize, ACL_MEM_MALLOC_HUGE_FIRST)); + CHECK_ACL(aclrtMalloc((void **)&yDevice, outputByteSize, ACL_MEM_MALLOC_HUGE_FIRST)); + + CHECK_ACL(aclrtMemcpy(xDevice, inputByteSize, inputX.data(), inputByteSize, ACL_MEMCPY_HOST_TO_DEVICE)); + + ATVC::ReduceParam param; // Reduce运行态参数,包含TilingData以及临时空间的相关信息 + ATVC::ReducePolicy policy = {-1, -1, -1}; // Reduce运行态参数,负责映射最适合的Reduce模板实现 + ATVC::Host::ReduceTilingHyperParam hyperParam; + hyperParam.maxInnerA = 256;// 设置maxInnerA为256 + // Host侧调用Tiling API完成相关运行态参数的运算 + if (!ATVC::Host::CalcReduceTiling(shape, dim, &policy, ¶m, hyperParam=hyperParam)) { + (void)printf("Reduce tiling error.\n"); + return -1; + }; + + // 调用Adapter调度接口,完成核函数的模板调用 + ReduceOpAdapter(xDevice, yDevice, param, policy, stream); + + CHECK_ACL(aclrtMemcpy(yHost, outputByteSize, yDevice, outputByteSize, ACL_MEMCPY_DEVICE_TO_HOST)); + std::vector outputY(reinterpret_cast(yHost), reinterpret_cast(yHost) + outEleNum); + + // 释放Acl资源 + CleanUp(xDevice, yDevice, yHost); + CleanACL(stream, context, deviceId); + + if (!VerifyResults(golden, outputY)) { + return -1; + } + + (void)printf("Accuracy verification passed.\n"); + return 0; +} diff --git a/atvc/examples/reduce_min/README.md b/atvc/examples/reduce_min/README.md new file mode 100644 index 0000000000000000000000000000000000000000..af85b2bc89edbb7ff9faa90e55bc8a6ce57cf91f --- /dev/null +++ b/atvc/examples/reduce_min/README.md @@ -0,0 +1,46 @@ + + +## 概述 + +本样例介绍了利用ATVC实现ReduceMin单算子并完成功能验证 + + +## 样例支持产品型号: +- Atlas A2训练系列产品 + +## 目录结构 + +| 文件名 
| 描述 |
+| ------------------------------------------------------------ | ------------------------------------------------------------ |
+| [reduce_min.cpp](./reduce_min.cpp) | ReduceMin算子代码实现以及调用样例 |
+
+## 算子描述
+
+ReduceMin是对输入tensor的指定轴进行规约求最小值的计算并输出结果的Reduce类算子。
+
+ReduceMin算子规格:
+
+| 算子类型(OpType) | ReduceMin | | | |
+| ---------------- | ------------ | -------- | --------- | ------ |
+| 算子输入 | name | shape | data type | format |
+| | x | 8 * 1024 | float | ND |
+| 算子输出 | y | 1 * 1024 | float | ND |
+| 核函数名 | ReduceCustom | | | |
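+
+与ReduceMax样例相同,Host侧先调用Tiling API计算运行态参数,再按返回的`ReducePolicy`选择编译期模板实例化核函数;ReduceMin只是把计算模板换成`ATVC::ReduceMinCompute`。下面是Host侧调用流程的简化示意(模板参数与变量声明从略,完整流程见[reduce_min.cpp](./reduce_min.cpp)):
+
+```cpp
+ATVC::ReduceParam param;                  // 运行态参数:TilingData及workspace信息
+ATVC::ReducePolicy policy = {-1, -1, -1}; // 由Tiling API选出的最优模板策略
+ATVC::Host::ReduceTilingHyperParam hyperParam;
+hyperParam.maxInnerA = 256;               // 可选的tiling超参数
+// shape = {8, 1024},dim = {0}:对第0轴做ReduceMin
+if (!ATVC::Host::CalcReduceTiling<ReduceOpTraits>(shape, dim, &policy, &param, hyperParam)) {
+    return -1;                            // tiling计算失败
+}
+// 按policy将运行态策略映射为编译期模板参数并下发核函数
+ReduceOpAdapter<ReduceOpTraits>(xDevice, yDevice, param, policy, stream);
+```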
+ +## 算子运行 +在ascendc-api-adv代码仓目录下执行: +```bash +$ cd ./atvc/examples +$ bash run_examples.sh reduce_min +... +Generate golden data successfully. +... +Accuracy verification passed. +``` \ No newline at end of file diff --git a/atvc/examples/reduce_min/reduce_min.cpp b/atvc/examples/reduce_min/reduce_min.cpp new file mode 100644 index 0000000000000000000000000000000000000000..7e2eab3b788c3f40585fbd4392b63a14e20fe92b --- /dev/null +++ b/atvc/examples/reduce_min/reduce_min.cpp @@ -0,0 +1,178 @@ +/** + * Copyright (c) Huawei Technologies Co., Ltd. 2025. All rights reserved. + * + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ + +#include +#include +#include +#include +#include +#include +#include "acl/acl.h" +#include "reduce/reduce_host.h" +#include "reduce/reduce_device.h" +#include "example_common.h" + +namespace { +// ReduceSum算子的描述:一个输入,一个输出,类型均为float +using ReduceOpTraits = ATVC::OpTraits, ATVC::OpOutputs>; + +void CleanUp(uint8_t *&xDevice, uint8_t *&yDevice, uint8_t *&yHost) +{ + CHECK_ACL(aclrtFree(xDevice)); + CHECK_ACL(aclrtFree(yDevice)); + CHECK_ACL(aclrtFreeHost(yHost)); +} +} + +/* + * 该函数为ReduceCustom算子核函数入口 + * x Device上的gm地址,指向Add算子第一个输入 + * y Device上的gm地址,指向Add算子第一个输出 + * reduceParam 指向运行态ATVC::ReduceParam数据 +*/ +template +__global__ __aicore__ void ReduceCustom( + GM_ADDR x, + GM_ADDR y, + ATVC::ReduceParam reduceParam +) +{ + KERNEL_TASK_TYPE_DEFAULT(KERNEL_TYPE_MIX_AIV_1_0); // 使用了多核控制指令,设置算子执行时只启动Vector核 + // 将计算模板类模板定义作为模板参数传入,Policy由Host层的策略分派API给出 + auto op = ATVC::Kernel::ReduceOpTemplate, Policy>(); + op.Run(x, y, &reduceParam); +} + +namespace { +// 负责Reduce类算子的调度,选择对应的Policy最佳策略并执行Kernel函数 +template +void ReduceOpAdapter(uint8_t* x, uint8_t* y, ATVC::ReduceParam ¶m, ATVC::ReducePolicy &policy, aclrtStream& stream) +{ + // 申请临时空间workspace + uint8_t *workspaceDevice; + CHECK_ACL(aclrtMalloc((void **)&workspaceDevice, param.workspaceSize, ACL_MEM_MALLOC_HUGE_FIRST)); + param.workspaceAddr = reinterpret_cast(workspaceDevice); + // 将tiling api计算出的ReducePolicy转化为编译态参数并实例化相应的核函数 + if (policy == ATVC::REDUCE_POLICY0) { + ReduceCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY1) { + ReduceCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY2) { + ReduceCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY3) { + ReduceCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY4) { + ReduceCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY5) { + ReduceCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY6) { + ReduceCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY7) { + ReduceCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY8) { + ReduceCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY9) { + ReduceCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY10) { + ReduceCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY11) { + ReduceCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY12) { + 
ReduceCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY13) { + ReduceCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY14) { + ReduceCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY15) { + ReduceCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY16) { + ReduceCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY17) { + ReduceCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY18) { + ReduceCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY19) { + ReduceCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY20) { + ReduceCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY21) { + ReduceCustom<<>>(x, y, param); + } else if (policy == ATVC::REDUCE_POLICY22) { + ReduceCustom<<>>(x, y, param); + } else { + (void)printf("[ERROR]: Cannot find any matched policy.\n"); + } + // 流同步后释放申请的param内存 + CHECK_ACL(aclrtSynchronizeStream(stream)); + CHECK_ACL(aclrtFree(workspaceDevice)); +} +} + +int32_t main(int32_t argc, char* argv[]) +{ + if (!ATVC::Host::DebugCheck()) { + (void)printf("[ERROR]: Reduce OpTraits check failed.\n"); + return -1; + } + int32_t eleNum = 8 * 1024; + int32_t outEleNum = 1 * 1024; + size_t inputByteSize = static_cast(eleNum) * sizeof(float); + size_t outputByteSize = static_cast(outEleNum) * sizeof(float); + std::vector dim{0}; // 对第0轴执行reduce操作 + std::vector shape{8, 1024}; // 测试输入shape + std::vector inputX(eleNum, 1.0f); + std::vector golden(outEleNum, 1.0f); + (void)printf("Generate golden data successfully.\n"); + // 初始化Acl资源 + CHECK_ACL(aclInit(nullptr)); + aclrtContext context; + int32_t deviceId = 0; + CHECK_ACL(aclrtSetDevice(deviceId)); + CHECK_ACL(aclrtCreateContext(&context, deviceId)); + aclrtStream stream = nullptr; + CHECK_ACL(aclrtCreateStream(&stream)); + uint8_t *yHost; + uint8_t *xDevice; + uint8_t *yDevice; + + CHECK_ACL(aclrtMallocHost((void **)(&yHost), outputByteSize)); + CHECK_ACL(aclrtMalloc((void **)&xDevice, inputByteSize, ACL_MEM_MALLOC_HUGE_FIRST)); + CHECK_ACL(aclrtMalloc((void **)&yDevice, outputByteSize, ACL_MEM_MALLOC_HUGE_FIRST)); + + CHECK_ACL(aclrtMemcpy(xDevice, inputByteSize, inputX.data(), inputByteSize, ACL_MEMCPY_HOST_TO_DEVICE)); + + ATVC::ReduceParam param; // Reduce运行态参数,包含TilingData以及临时空间的相关信息 + ATVC::ReducePolicy policy = {-1, -1, -1}; // Reduce运行态参数,负责映射最适合的Reduce模板实现 + ATVC::Host::ReduceTilingHyperParam hyperParam; + hyperParam.maxInnerA = 256;// 设置maxInnerA为256 + // Host侧调用Tiling API完成相关运行态参数的运算 + if (!ATVC::Host::CalcReduceTiling(shape, dim, &policy, ¶m, hyperParam=hyperParam)) { + (void)printf("Reduce tiling error.\n"); + return -1; + }; + + // 调用Adapter调度接口,完成核函数的模板调用 + ReduceOpAdapter(xDevice, yDevice, param, policy, stream); + + CHECK_ACL(aclrtMemcpy(yHost, outputByteSize, yDevice, outputByteSize, ACL_MEMCPY_DEVICE_TO_HOST)); + std::vector outputY(reinterpret_cast(yHost), reinterpret_cast(yHost) + outEleNum); + + // 释放Acl资源 + CleanUp(xDevice, yDevice, yHost); + CleanACL(stream, context, deviceId); + + if (!VerifyResults(golden, outputY)) { + return -1; + } + + (void)printf("Accuracy verification passed.\n"); + return 0; +} diff --git a/atvc/include/atvc.h b/atvc/include/atvc.h index 187482e260bac834e0b9eb593f7316740a7bbd90..f56046ea7155eb8fa3e2d7e57c75bc2a36594c60 100644 --- a/atvc/include/atvc.h +++ b/atvc/include/atvc.h @@ -29,6 +29,8 @@ #include "common/kernel_utils.h" #include "elewise/elewise_op_template.h" #include "reduce/reduce_sum.h" +#include "reduce/reduce_max.h" +#include 
"reduce/reduce_min.h" #include "reduce/reduce_op_template.h" #include "broadcast/broadcast_compute.h" #include "broadcast/broadcast_op_template.h" diff --git a/atvc/include/reduce/common/reduce_common.h b/atvc/include/reduce/common/reduce_common.h index 94c6e050abd68f597d4813ac279c7b040bd2f001..91a040a90b569bc74fcb55755934cc7aab629853 100644 --- a/atvc/include/reduce/common/reduce_common.h +++ b/atvc/include/reduce/common/reduce_common.h @@ -29,6 +29,18 @@ enum ShapeDim { DIM_BROADCAST // Broadcast axis }; +struct ReduceARParam { + uint32_t repStride = 0; + uint16_t dimA = 0; + uint16_t dimMax = 0; + uint16_t mainR = 0; + uint16_t tailR = 0; + uint64_t maskAddRNum = 0; + uint16_t loopRNum = 0; + uint16_t dtypeSize = 0; + uint16_t dimR = 0; +}; + namespace AR_PATTERN { static constexpr uint32_t A = 100; static constexpr uint32_t AR = 11; diff --git a/atvc/include/reduce/reduce_device.h b/atvc/include/reduce/reduce_device.h index a271e8f17d289e74509cbf03ef8348a443afcaff..c1611078ee1631d6a1d61255992926209bd8fc5b 100644 --- a/atvc/include/reduce/reduce_device.h +++ b/atvc/include/reduce/reduce_device.h @@ -25,6 +25,8 @@ #include "common/kernel_utils.h" #include "reduce/reduce_sum.h" +#include "reduce/reduce_max.h" +#include "reduce/reduce_min.h" #include "reduce/reduce_op_template.h" #endif // ATVC_REDUCE_DEVICE_H \ No newline at end of file diff --git a/atvc/include/reduce/reduce_max.h b/atvc/include/reduce/reduce_max.h new file mode 100644 index 0000000000000000000000000000000000000000..9aab3b9990fac99e14cfc79162e9d71a662f87f0 --- /dev/null +++ b/atvc/include/reduce/reduce_max.h @@ -0,0 +1,350 @@ +/** + * Copyright (c) Huawei Technologies Co., Ltd. 2025. All rights reserved. + * + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ + +#ifndef ATVC_REDUCE_MAX_H +#define ATVC_REDUCE_MAX_H + +#include "common/kernel_utils.h" +#include "reduce/common/patterns.h" +#include "reduce/utils/reduce_block_aux_util.h" +#include "reduce/common/reduce_common.h" + +namespace ATVC { +/*! + * ReduceMaxCompute This class provides the core arithmetic required to reduce + * tensors along either the inner-most (AR) or outer-most (RA) axis after + * the tensor has been copied to the Unified Buffer (UB). Data movement between + * Global Memory (GM) and UB is not handled here; it is the responsibility of + * the surrounding scheduling template. + */ +template +class ReduceMaxCompute { +public: + // Extract operator input description information from OpTraits + using inputDTypeList = typename OpTraits::In::types; + using DataType = typename ATVC::TypeListGet::Type; + using PrompteDtype = typename KernelUtils::GetPromoteType::T; + __aicore__ inline ReduceMaxCompute() {} + + /*! + * \brief Perform the actual reduction on a tile already resident in UB. + * \tparam needMask, true when UB alignment introduced invalid lanes. + * \tparam Pattern, one of ReducePattern::AR or ReducePattern::RA. + * \param[in] shape, {dimA, dimR} in elements; dimR may be padded. 
+ * \param[out] dst, destination tensor (length == dimA) + * \param[in] src, source tensor (length == dimA * dimR) + */ + template + __aicore__ inline void + Compute(KernelUtils::Shape<2> &shape, + const AscendC::LocalTensor &dst, + const AscendC::LocalTensor &src) + { + // AR scenario, hardware limitations, R-axis requires 32B alignment on UB, with 2 alignment methods available: + // 1. High performance alignment (with uncertain supplementary element values), subsequent cumulative + // calculations can only calculate the actual number of effective elements + // 2. Alignment with zero padding (padding value is determined by the GetAddingValue() interface + // implemented by the user) + if (std::is_same::value) { + if constexpr (needMask) { // 1. High performance alignment mode + // MainR (int64_t dimR, boolean isAR): The framework provides the calculation of the R-axis binary + // length (number of elements), where dimR is the original number of elements + int16_t mainR = KernelUtils::Reduce::MainR(shape.oriBurstLen, true); + ReduceAR(dst, src, shape.value[0], shape.value[1], mainR, shape.oriBurstLen); + } else { + // MainR: The framework provides the calculation of the R-axis binary length (number of elements), + // where dimR is the number of elements after completion + int16_t mainR = KernelUtils::Reduce::MainR(shape.value[1], true); + ReduceAR(dst, src, shape.value[0], shape.value[1], mainR, shape.value[1]); + } + } + if (std::is_same::value) { + int16_t mainR = KernelUtils::Reduce::MainR(shape.value[0], false); + ReduceRA(dst, src, shape.value[1], shape.value[0], mainR); + } + } + + /*! + * \brief RA-pattern reduction: reduce along the outer-most (slowest-varying) axis. + * \param[out] dst, output tensor (length == dimA) + * \param[in] src, input tensor (length == dimR * dimA), already resident in UB + * \param[in] dimA, length of the non-reduced axis (A) + * \param[in] dimR, length of the reduced axis (R) + * \param[in] mainR, largest power-of-two ≤ dimR (computed by the caller) + */ + __aicore__ inline void + ReduceRA(const AscendC::LocalTensor &dst, + const AscendC::LocalTensor &src, uint16_t dimA, + uint16_t dimR, uint16_t mainR) + { + uint32_t totalNum = dimR * dimA; + uint32_t mainNum = dimA * mainR; + constexpr uint32_t dtypeSize = sizeof(PrompteDtype); + uint32_t tailNum = totalNum - mainNum; + // MaskAddNum has a maximum value of 256 bytes and must be aligned with 32 bytes + constexpr uint32_t maskAddNum = UB_ALIGN_256 / dtypeSize / UB_ALIGN_32 * UB_ALIGN_32; + uint16_t repeatTimes = tailNum / maskAddNum; + uint16_t repeatNum = repeatTimes * maskAddNum; + uint16_t repTailNum = tailNum - repeatNum; + // Same data block step size between different iterations + uint32_t repStride = dtypeSize * maskAddNum / UB_ALIGN_32; + // dstBlkStride, src0BlkStride,src1BlkStride, dstRepStride, src0RepStride, src1RepStride + AscendC::BinaryRepeatParams repeatParams(1, 1, 1, repStride, repStride, repStride); + if (repeatTimes > 0) { + AscendC::Max(src, src[mainNum], src, maskAddNum, repeatTimes, repeatParams); + } + if (repTailNum > 0) { + // Same data block step size between different iterations + repStride = dtypeSize * repTailNum / UB_ALIGN_32; + repeatParams.dstRepStride = repStride; + repeatParams.src0RepStride = repStride; + repeatParams.src1RepStride = repStride; + AscendC::Max(src[repeatNum], src[repeatNum + mainNum], src[repeatNum], repTailNum, 1, repeatParams); + } + AscendC::PipeBarrier(); + uint16_t loopRNum = mainR; + while (loopRNum > 1) { + loopRNum = loopRNum >> 1; + mainNum = 
loopRNum * dimA; // The first half of LoopR's data volume + repeatTimes = mainNum / maskAddNum; + repeatNum = repeatTimes * maskAddNum; + repTailNum = mainNum - repeatNum; + if (repeatTimes > 0) { + // Same data block step size between different iterations + repStride = dtypeSize * maskAddNum / UB_ALIGN_32; + repeatParams.dstRepStride = repStride; + repeatParams.src0RepStride = repStride; + repeatParams.src1RepStride = repStride; + AscendC::Max(src, src[mainNum], src, maskAddNum, repeatTimes, repeatParams); + } + if (repTailNum > 0) { + // Same data block step size between different iterations + repStride = dtypeSize * repTailNum / UB_ALIGN_32; + repeatParams.dstRepStride = repStride; + repeatParams.src0RepStride = repStride; + repeatParams.src1RepStride = repStride; + AscendC::Max(src[repeatNum], src[repeatNum + mainNum], src[repeatNum], repTailNum, 1, repeatParams); + } + AscendC::PipeBarrier(); + } + AscendC::DataCopy(dst, src, dimA); + } + + /*! + * \brief AR-pattern reduction: reduce along the inner-most (fastest-varying) axis. + * \param[out] dstTensor, output tensor (length == dimA) + * \param[in] srcTensor, input tensor (length == dimR * dimA), already resident in UB + * \param[in] dimA, length of the non-reduced axis (A) + * \param[in] dimR, padded length of the reduced axis (R) + * \param[in] mainR, largest power-of-two ≤ original R length + * \param[in] oriBurstLen, original (un-padded) R length used to compute tail + */ + __aicore__ inline void + ReduceAR(const AscendC::LocalTensor &dstTensor, + const AscendC::LocalTensor &srcTensor, uint16_t dimA, + uint16_t dimR, uint16_t mainR, uint64_t oriBurstLen) + { + uint16_t tailR = oriBurstLen - mainR; + constexpr uint16_t dtypeSize = sizeof(PrompteDtype); + uint32_t repStride = dtypeSize * dimR / UB_ALIGN_32; + uint16_t dimMax = dimA * dimR; + constexpr uint64_t maskAddRNum = UB_ALIGN_256 / dtypeSize; + + ReduceARParam param{ + .repStride = repStride, + .dimA = dimA, + .dimMax = dimMax, + .mainR = mainR, + .tailR = tailR, + .maskAddRNum = maskAddRNum, + .dtypeSize = dtypeSize, + .dimR = dimR + }; + + if (mainR > 0 && tailR > 0) { + PerformInitialMax(srcTensor, param); + } + + param.loopRNum = mainR; + while (param.loopRNum > maskAddRNum) { + param.loopRNum = param.loopRNum / 2U; + PerformBinaryReduction(srcTensor, param); + } + if (param.loopRNum == 0) { // small shape, directly reduce + param.loopRNum = tailR; + } + PerformFinalReduction(dstTensor, srcTensor, param); + } + + /*! + * \brief Merge the calculation results of different data base blocks within a single UB + * \tparam Pattern Compile-time pattern tag that decides A vs. B orientation. + * \tparam V Shape descriptor (encodes dimA and dimB at runtime). + * \param[in] index, logical index identifying the data-base block. + * \param[in] shape, runtime tensor shape (dimA, dimB). + * \param[in] tempBuf, UB tensor serving as the reduction cache. + * \param[in] computeRes, UB tensor holding the newest partial result. + */ + template + __aicore__ inline void UpdateCache(int64_t index, V& shape, const AscendC::LocalTensor& tempBuf, + const AscendC::LocalTensor& computeRes) + { + int64_t cacheID = KernelUtils::Reduce::GetCacheID(index); + int64_t dimA = Pattern::TailA ? 
shape.value[1] : shape.value[0]; + int32_t element_one_repeat = Platform::GetVRegSize() / sizeof(PrompteDtype); + int64_t stride = OpsUtils::CeilDiv(dimA, static_cast(element_one_repeat)) * element_one_repeat; + uint16_t outerLoopTimes = OpsUtils::CeilDiv( + static_cast(dimA * sizeof(PrompteDtype)), static_cast(Platform::GetVRegSize())); + uint16_t innerLoopTimes = cacheID; + uint32_t outerLoopStride = element_one_repeat; + uint32_t innerLoopStride = stride; // The size of each idex block in cacahe and the size of the A-axis + AscendC::LocalTensor dstTensor = tempBuf; + AscendC::LocalTensor srcTensor = computeRes; + uint32_t cah = cacheID * stride; + + for (uint16_t i = 0; i < outerLoopTimes; ++i) { // OuterLoopTimes is the size of dimA + uint32_t srcIdx = i * outerLoopStride; + for (uint16_t j = 0; j < innerLoopTimes; ++j) { + AscendC::Max(srcTensor[srcIdx], srcTensor[srcIdx], + dstTensor[srcIdx + j * innerLoopStride], + outerLoopStride); + AscendC::PipeBarrier(); + } + DataCopy(dstTensor[cah + srcIdx], srcTensor[srcIdx], outerLoopStride); + } + } + + /*! + * \brief Binary reduction between two UB buffers. + * \ Used for inter-core result merging when workspace staging is required. + * \param[in] ubTensorLeft, left operand (in-place result). + * \param[in] ubTensorRight, right operand (read-only). + * \param[in] calCount, number of elements to reduce. + */ + __aicore__ inline void + ReduceBetweenUB(const AscendC::LocalTensor &ubTensorLeft, + const AscendC::LocalTensor &ubTensorRight, + const int32_t &calCount) + { + AscendC::Max(ubTensorRight, ubTensorRight, ubTensorLeft, calCount); + } + + /*! + * \brief Return the value used for padding when UB alignment is required. + * For MAX-reduction the neutral element is -∞ or 0. + * \tparam U, scalar type identical to DataType or PromoteDataType. + * \return The padding value (-∞ or 0). 
+ */ + template + __aicore__ inline U GetPaddingValue() + { + // Due to the fact that ReduceMax accumulates R-axis data, the values of the supplemented elements + // are set to -∞ or 0 to ensure that the accumulated result is not affected + if(AscendC::IsSameType::value){ + return INT32_MIN; + }else if(AscendC::IsSameType::value){ + return 0; + }else{ + return -1.0f / 0.0f; + } + } + +private: + __aicore__ inline void PerformInitialMax(const AscendC::LocalTensor &srcTensor, const ReduceARParam& param) + { + uint16_t addRTotalNum = param.tailR / param.maskAddRNum * param.maskAddRNum; + uint16_t addRTail = param.tailR - addRTotalNum; + // dstBlkStride, src0BlkStride,src1BlkStride, dstRepStride, src0RepStride, src1RepStride + AscendC::BinaryRepeatParams repeatParams(1, 1, 1, param.repStride, param.repStride, param.repStride); + + if (param.repStride > UB_ALIGN_255) { + for (uint16_t i = 0; i < param.dimMax; i += param.dimR) { + AscendC::Max(srcTensor[i], srcTensor[i], srcTensor[i + param.mainR], param.tailR); + } + } else { + for (uint16_t i = 0; i < addRTotalNum; i += param.maskAddRNum) { + AscendC::Max(srcTensor[i], srcTensor[i + param.mainR], srcTensor[i], param.maskAddRNum, param.dimA, repeatParams); + } + if (addRTail > 0) { + AscendC::Max(srcTensor[addRTotalNum], + srcTensor[addRTotalNum + param.mainR], + srcTensor[addRTotalNum], + addRTail, + param.dimA, + repeatParams); + } + } + AscendC::PipeBarrier(); + } + + __aicore__ inline void PerformBinaryReduction(const AscendC::LocalTensor &srcTensor, + const ReduceARParam& param) + { + if (param.repStride > UB_ALIGN_255) { + for (uint16_t i = 0; i < param.dimMax; i += param.loopRNum) { + AscendC::Max(srcTensor[i], srcTensor[i], srcTensor[i + param.loopRNum], param.loopRNum); + } + } else { + uint16_t addRTotalNum = param.loopRNum / param.maskAddRNum * param.maskAddRNum; + uint16_t addRTail = param.loopRNum - addRTotalNum; + // dstBlkStride, src0BlkStride,src1BlkStride, dstRepStride, src0RepStride, src1RepStride + AscendC::BinaryRepeatParams repeatParams(1, 1, 1, param.repStride, param.repStride, param.repStride); + for (uint16_t i = 0; i < addRTotalNum; i += param.maskAddRNum) { + AscendC::Max(srcTensor[i], srcTensor[i + param.loopRNum], srcTensor[i], param.maskAddRNum, param.dimA, repeatParams); + } + if (addRTail > 0) { + AscendC::Max(srcTensor[addRTotalNum], + srcTensor[addRTotalNum], + srcTensor[addRTotalNum + param.loopRNum], + addRTail, + param.dimA, + repeatParams); + } + } + AscendC::PipeBarrier(); + } + + __aicore__ inline void PerformFinalReduction(const AscendC::LocalTensor &dstTensor, + const AscendC::LocalTensor &srcTensor, const ReduceARParam& param) + { + if constexpr (AscendC::IsSameType::value || + AscendC::IsSameType::value) { + uint16_t reduceLoopTimes = UB_ALIGN_255 * param.dtypeSize / UB_ALIGN_32 * UB_ALIGN_32 / param.dtypeSize; + // WholeReduceMax repeat-time limit is 255; split dimA into chunks + for (uint16_t dimAIdx = 0; dimAIdx < param.dimA; dimAIdx += reduceLoopTimes) { + uint16_t curDimA = (dimAIdx + reduceLoopTimes < param.dimA) ? 
reduceLoopTimes : param.dimA - dimAIdx; + AscendC::WholeReduceMax( + dstTensor[dimAIdx], srcTensor[dimAIdx * param.dimR], param.loopRNum, curDimA, 1, 1, param.repStride, AscendC::ReduceOrder::ORDER_ONLY_VALUE); + } + AscendC::PipeBarrier(); + } else if constexpr (AscendC::IsSameType::value || + AscendC::IsSameType::value) { + // Cast to float for higher-precision accumulation + AscendC::LocalTensor interpreSrc = srcTensor.template ReinterpretCast(); + AscendC::LocalTensor interpreDst = dstTensor.template ReinterpretCast(); + AscendC::Cast(interpreSrc, srcTensor, AscendC::RoundMode::CAST_NONE, param.dimA * param.dimR); + AscendC::PipeBarrier(); + + uint16_t reduceLoopTimes = 255 * param.dtypeSize / UB_ALIGN_32 * UB_ALIGN_32 / param.dtypeSize; + // WholeReduceMax repeat-time limit is 255; split dimA into chunks + for (uint16_t dimAIdx = 0; dimAIdx < param.dimA; dimAIdx += reduceLoopTimes) { + uint16_t curDimA = (dimAIdx + reduceLoopTimes < param.dimA) ? reduceLoopTimes : param.dimA - dimAIdx; + AscendC::WholeReduceMax( + interpreDst[dimAIdx], interpreSrc[dimAIdx * param.dimR], param.loopRNum, curDimA, 1, 1, param.repStride, AscendC::ReduceOrder::ORDER_ONLY_VALUE); + } + AscendC::PipeBarrier(); + AscendC::Cast(dstTensor, interpreDst, AscendC::RoundMode::CAST_RINT, dstTensor.GetSize()); + } + } +}; +} // namespace ATVC + +#endif // ATVC_REDUCE_MAX_H diff --git a/atvc/include/reduce/reduce_min.h b/atvc/include/reduce/reduce_min.h new file mode 100644 index 0000000000000000000000000000000000000000..0ef25d16d8dc04aa1c0df39158ef8d861aff66ef --- /dev/null +++ b/atvc/include/reduce/reduce_min.h @@ -0,0 +1,350 @@ +/** + * Copyright (c) Huawei Technologies Co., Ltd. 2025. All rights reserved. + * + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ + +#ifndef ATVC_REDUCE_MIN_H +#define ATVC_REDUCE_MIN_H + +#include "common/kernel_utils.h" +#include "reduce/common/patterns.h" +#include "reduce/utils/reduce_block_aux_util.h" +#include "reduce/common/reduce_common.h" + +namespace ATVC { +/*! + * ReduceMinCompute This class provides the core arithmetic required to reduce + * tensors along either the inner-most (AR) or outer-most (RA) axis after + * the tensor has been copied to the Unified Buffer (UB). Data movement between + * Global Memory (GM) and UB is not handled here; it is the responsibility of + * the surrounding scheduling template. + */ +template +class ReduceMinCompute { +public: + // Extract operator input description information from OpTraits + using inputDTypeList = typename OpTraits::In::types; + using DataType = typename ATVC::TypeListGet::Type; + using PrompteDtype = typename KernelUtils::GetPromoteType::T; + __aicore__ inline ReduceMinCompute() {} + + /*! + * \brief Perform the actual reduction on a tile already resident in UB. + * \tparam needMask, true when UB alignment introduced invalid lanes. + * \tparam Pattern, one of ReducePattern::AR or ReducePattern::RA. + * \param[in] shape, {dimA, dimR} in elements; dimR may be padded. 
+ * \param[out] dst, destination tensor (length == dimA) + * \param[in] src, source tensor (length == dimA * dimR) + */ + template + __aicore__ inline void + Compute(KernelUtils::Shape<2> &shape, + const AscendC::LocalTensor &dst, + const AscendC::LocalTensor &src) + { + // AR scenario, hardware limitations, R-axis requires 32B alignment on UB, with 2 alignment methods available: + // 1. High performance alignment (with uncertain supplementary element values), subsequent cumulative + // calculations can only calculate the actual number of effective elements + // 2. Alignment with zero padding (padding value is determined by the GetAddingValue() interface + // implemented by the user) + if (std::is_same::value) { + if constexpr (needMask) { // 1. High performance alignment mode + // MainR (int64_t dimR, boolean isAR): The framework provides the calculation of the R-axis binary + // length (number of elements), where dimR is the original number of elements + int16_t mainR = KernelUtils::Reduce::MainR(shape.oriBurstLen, true); + ReduceAR(dst, src, shape.value[0], shape.value[1], mainR, shape.oriBurstLen); + } else { + // MainR: The framework provides the calculation of the R-axis binary length (number of elements), + // where dimR is the number of elements after completion + int16_t mainR = KernelUtils::Reduce::MainR(shape.value[1], true); + ReduceAR(dst, src, shape.value[0], shape.value[1], mainR, shape.value[1]); + } + } + if (std::is_same::value) { + int16_t mainR = KernelUtils::Reduce::MainR(shape.value[0], false); + ReduceRA(dst, src, shape.value[1], shape.value[0], mainR); + } + } + + /*! + * \brief RA-pattern reduction: reduce along the outer-most (slowest-varying) axis. + * \param[out] dst, output tensor (length == dimA) + * \param[in] src, input tensor (length == dimR * dimA), already resident in UB + * \param[in] dimA, length of the non-reduced axis (A) + * \param[in] dimR, length of the reduced axis (R) + * \param[in] mainR, largest power-of-two ≤ dimR (computed by the caller) + */ + __aicore__ inline void + ReduceRA(const AscendC::LocalTensor &dst, + const AscendC::LocalTensor &src, uint16_t dimA, + uint16_t dimR, uint16_t mainR) + { + uint32_t totalNum = dimR * dimA; + uint32_t mainNum = dimA * mainR; + constexpr uint32_t dtypeSize = sizeof(PrompteDtype); + uint32_t tailNum = totalNum - mainNum; + // MaskAddNum has a maximum value of 256 bytes and must be aligned with 32 bytes + constexpr uint32_t maskAddNum = UB_ALIGN_256 / dtypeSize / UB_ALIGN_32 * UB_ALIGN_32; + uint16_t repeatTimes = tailNum / maskAddNum; + uint16_t repeatNum = repeatTimes * maskAddNum; + uint16_t repTailNum = tailNum - repeatNum; + // Same data block step size between different iterations + uint32_t repStride = dtypeSize * maskAddNum / UB_ALIGN_32; + // dstBlkStride, src0BlkStride,src1BlkStride, dstRepStride, src0RepStride, src1RepStride + AscendC::BinaryRepeatParams repeatParams(1, 1, 1, repStride, repStride, repStride); + if (repeatTimes > 0) { + AscendC::Min(src, src[mainNum], src, maskAddNum, repeatTimes, repeatParams); + } + if (repTailNum > 0) { + // Same data block step size between different iterations + repStride = dtypeSize * repTailNum / UB_ALIGN_32; + repeatParams.dstRepStride = repStride; + repeatParams.src0RepStride = repStride; + repeatParams.src1RepStride = repStride; + AscendC::Min(src[repeatNum], src[repeatNum + mainNum], src[repeatNum], repTailNum, 1, repeatParams); + } + AscendC::PipeBarrier(); + uint16_t loopRNum = mainR; + while (loopRNum > 1) { + loopRNum = loopRNum >> 1; + mainNum = 
loopRNum * dimA; // The first half of LoopR's data volume + repeatTimes = mainNum / maskAddNum; + repeatNum = repeatTimes * maskAddNum; + repTailNum = mainNum - repeatNum; + if (repeatTimes > 0) { + // Same data block step size between different iterations + repStride = dtypeSize * maskAddNum / UB_ALIGN_32; + repeatParams.dstRepStride = repStride; + repeatParams.src0RepStride = repStride; + repeatParams.src1RepStride = repStride; + AscendC::Min(src, src[mainNum], src, maskAddNum, repeatTimes, repeatParams); + } + if (repTailNum > 0) { + // Same data block step size between different iterations + repStride = dtypeSize * repTailNum / UB_ALIGN_32; + repeatParams.dstRepStride = repStride; + repeatParams.src0RepStride = repStride; + repeatParams.src1RepStride = repStride; + AscendC::Min(src[repeatNum], src[repeatNum + mainNum], src[repeatNum], repTailNum, 1, repeatParams); + } + AscendC::PipeBarrier(); + } + AscendC::DataCopy(dst, src, dimA); + } + + /*! + * \brief AR-pattern reduction: reduce along the inner-most (fastest-varying) axis. + * \param[out] dstTensor, output tensor (length == dimA) + * \param[in] srcTensor, input tensor (length == dimR * dimA), already resident in UB + * \param[in] dimA, length of the non-reduced axis (A) + * \param[in] dimR, padded length of the reduced axis (R) + * \param[in] mainR, largest power-of-two ≤ original R length + * \param[in] oriBurstLen, original (un-padded) R length used to compute tail + */ + __aicore__ inline void + ReduceAR(const AscendC::LocalTensor &dstTensor, + const AscendC::LocalTensor &srcTensor, uint16_t dimA, + uint16_t dimR, uint16_t mainR, uint64_t oriBurstLen) + { + uint16_t tailR = oriBurstLen - mainR; + constexpr uint16_t dtypeSize = sizeof(PrompteDtype); + uint32_t repStride = dtypeSize * dimR / UB_ALIGN_32; + uint16_t dimMax = dimA * dimR; + constexpr uint64_t maskAddRNum = UB_ALIGN_256 / dtypeSize; + + ReduceARParam param{ + .repStride = repStride, + .dimA = dimA, + .dimMax = dimMax, + .mainR = mainR, + .tailR = tailR, + .maskAddRNum = maskAddRNum, + .dtypeSize = dtypeSize, + .dimR = dimR + }; + + if (mainR > 0 && tailR > 0) { + PerformInitialMin(srcTensor, param); + } + + param.loopRNum = mainR; + while (param.loopRNum > maskAddRNum) { + param.loopRNum = param.loopRNum / 2U; + PerformBinaryReduction(srcTensor, param); + } + if (param.loopRNum == 0) { // small shape, directly reduce + param.loopRNum = tailR; + } + PerformFinalReduction(dstTensor, srcTensor, param); + } + + /*! + * \brief Merge the calculation results of different data base blocks within a single UB + * \tparam Pattern Compile-time pattern tag that decides A vs. B orientation. + * \tparam V Shape descriptor (encodes dimA and dimB at runtime). + * \param[in] index, logical index identifying the data-base block. + * \param[in] shape, runtime tensor shape (dimA, dimB). + * \param[in] tempBuf, UB tensor serving as the reduction cache. + * \param[in] computeRes, UB tensor holding the newest partial result. + */ + template + __aicore__ inline void UpdateCache(int64_t index, V& shape, const AscendC::LocalTensor& tempBuf, + const AscendC::LocalTensor& computeRes) + { + int64_t cacheID = KernelUtils::Reduce::GetCacheID(index); + int64_t dimA = Pattern::TailA ? 
shape.value[1] : shape.value[0]; + int32_t element_one_repeat = Platform::GetVRegSize() / sizeof(PrompteDtype); + int64_t stride = OpsUtils::CeilDiv(dimA, static_cast(element_one_repeat)) * element_one_repeat; + uint16_t outerLoopTimes = OpsUtils::CeilDiv( + static_cast(dimA * sizeof(PrompteDtype)), static_cast(Platform::GetVRegSize())); + uint16_t innerLoopTimes = cacheID; + uint32_t outerLoopStride = element_one_repeat; + uint32_t innerLoopStride = stride; // The size of each idex block in cacahe and the size of the A-axis + AscendC::LocalTensor dstTensor = tempBuf; + AscendC::LocalTensor srcTensor = computeRes; + uint32_t cah = cacheID * stride; + + for (uint16_t i = 0; i < outerLoopTimes; ++i) { // OuterLoopTimes is the size of dimA + uint32_t srcIdx = i * outerLoopStride; + for (uint16_t j = 0; j < innerLoopTimes; ++j) { + AscendC::Min(srcTensor[srcIdx], srcTensor[srcIdx], + dstTensor[srcIdx + j * innerLoopStride], + outerLoopStride); + AscendC::PipeBarrier(); + } + DataCopy(dstTensor[cah + srcIdx], srcTensor[srcIdx], outerLoopStride); + } + } + + /*! + * \brief Binary reduction between two UB buffers. + * \ Used for inter-core result merging when workspace staging is required. + * \param[in] ubTensorLeft, left operand (in-place result). + * \param[in] ubTensorRight, right operand (read-only). + * \param[in] calCount, number of elements to reduce. + */ + __aicore__ inline void + ReduceBetweenUB(const AscendC::LocalTensor &ubTensorLeft, + const AscendC::LocalTensor &ubTensorRight, + const int32_t &calCount) + { + AscendC::Min(ubTensorRight, ubTensorRight, ubTensorLeft, calCount); + } + + /*! + * \brief Return the value used for padding when UB alignment is required. + * For MIN-reduction the neutral element is +∞ or 0. + * \tparam U, scalar type identical to DataType or PromoteDataType. + * \return The padding value (+∞ or 0). 
+ */ + template + __aicore__ inline U GetPaddingValue() + { + // Due to the fact that ReduceMin accumulates R-axis data, the values of the supplemented elements + // are set to +∞ or 0 to ensure that the accumulated result is not affected + if(AscendC::IsSameType::value){ + return INT32_MAX; + }else if(AscendC::IsSameType::value){ + return INT32_MAX - INT32_MIN; + }else{ + return 1.0f / 0.0f; + } + } + +private: + __aicore__ inline void PerformInitialMin(const AscendC::LocalTensor &srcTensor, const ReduceARParam& param) + { + uint16_t addRTotalNum = param.tailR / param.maskAddRNum * param.maskAddRNum; + uint16_t addRTail = param.tailR - addRTotalNum; + // dstBlkStride, src0BlkStride,src1BlkStride, dstRepStride, src0RepStride, src1RepStride + AscendC::BinaryRepeatParams repeatParams(1, 1, 1, param.repStride, param.repStride, param.repStride); + + if (param.repStride > UB_ALIGN_255) { + for (uint16_t i = 0; i < param.dimMax; i += param.dimR) { + AscendC::Min(srcTensor[i], srcTensor[i], srcTensor[i + param.mainR], param.tailR); + } + } else { + for (uint16_t i = 0; i < addRTotalNum; i += param.maskAddRNum) { + AscendC::Min(srcTensor[i], srcTensor[i + param.mainR], srcTensor[i], param.maskAddRNum, param.dimA, repeatParams); + } + if (addRTail > 0) { + AscendC::Min(srcTensor[addRTotalNum], + srcTensor[addRTotalNum + param.mainR], + srcTensor[addRTotalNum], + addRTail, + param.dimA, + repeatParams); + } + } + AscendC::PipeBarrier(); + } + + __aicore__ inline void PerformBinaryReduction(const AscendC::LocalTensor &srcTensor, + const ReduceARParam& param) + { + if (param.repStride > UB_ALIGN_255) { + for (uint16_t i = 0; i < param.dimMax; i += param.loopRNum) { + AscendC::Min(srcTensor[i], srcTensor[i], srcTensor[i + param.loopRNum], param.loopRNum); + } + } else { + uint16_t addRTotalNum = param.loopRNum / param.maskAddRNum * param.maskAddRNum; + uint16_t addRTail = param.loopRNum - addRTotalNum; + // dstBlkStride, src0BlkStride,src1BlkStride, dstRepStride, src0RepStride, src1RepStride + AscendC::BinaryRepeatParams repeatParams(1, 1, 1, param.repStride, param.repStride, param.repStride); + for (uint16_t i = 0; i < addRTotalNum; i += param.maskAddRNum) { + AscendC::Min(srcTensor[i], srcTensor[i + param.loopRNum], srcTensor[i], param.maskAddRNum, param.dimA, repeatParams); + } + if (addRTail > 0) { + AscendC::Min(srcTensor[addRTotalNum], + srcTensor[addRTotalNum], + srcTensor[addRTotalNum + param.loopRNum], + addRTail, + param.dimA, + repeatParams); + } + } + AscendC::PipeBarrier(); + } + + __aicore__ inline void PerformFinalReduction(const AscendC::LocalTensor &dstTensor, + const AscendC::LocalTensor &srcTensor, const ReduceARParam& param) + { + if constexpr (AscendC::IsSameType::value || + AscendC::IsSameType::value) { + uint16_t reduceLoopTimes = UB_ALIGN_255 * param.dtypeSize / UB_ALIGN_32 * UB_ALIGN_32 / param.dtypeSize; + // WholeReduceMin repeat-time limit is 255; split dimA into chunks + for (uint16_t dimAIdx = 0; dimAIdx < param.dimA; dimAIdx += reduceLoopTimes) { + uint16_t curDimA = (dimAIdx + reduceLoopTimes < param.dimA) ? 
reduceLoopTimes : param.dimA - dimAIdx; + AscendC::WholeReduceMin( + dstTensor[dimAIdx], srcTensor[dimAIdx * param.dimR], param.loopRNum, curDimA, 1, 1, param.repStride, AscendC::ReduceOrder::ORDER_ONLY_VALUE); + } + AscendC::PipeBarrier(); + } else if constexpr (AscendC::IsSameType::value || + AscendC::IsSameType::value) { + // Cast to float for higher-precision accumulation + AscendC::LocalTensor interpreSrc = srcTensor.template ReinterpretCast(); + AscendC::LocalTensor interpreDst = dstTensor.template ReinterpretCast(); + AscendC::Cast(interpreSrc, srcTensor, AscendC::RoundMode::CAST_NONE, param.dimA * param.dimR); + AscendC::PipeBarrier(); + + uint16_t reduceLoopTimes = 255 * param.dtypeSize / UB_ALIGN_32 * UB_ALIGN_32 / param.dtypeSize; + // WholeReduceMin repeat-time limit is 255; split dimA into chunks + for (uint16_t dimAIdx = 0; dimAIdx < param.dimA; dimAIdx += reduceLoopTimes) { + uint16_t curDimA = (dimAIdx + reduceLoopTimes < param.dimA) ? reduceLoopTimes : param.dimA - dimAIdx; + AscendC::WholeReduceMin( + interpreDst[dimAIdx], interpreSrc[dimAIdx * param.dimR], param.loopRNum, curDimA, 1, 1, param.repStride, AscendC::ReduceOrder::ORDER_ONLY_VALUE); + } + AscendC::PipeBarrier(); + AscendC::Cast(dstTensor, interpreDst, AscendC::RoundMode::CAST_RINT, dstTensor.GetSize()); + } + } +}; +} // namespace ATVC + +#endif // ATVC_REDUCE_MIN_H diff --git a/atvc/include/reduce/reduce_op_template.h b/atvc/include/reduce/reduce_op_template.h index 3eca4811e683a0ceeba45a44ec6eba6719a0875f..13b28f34224e9327236cf8f2f50a874c72b5cefa 100644 --- a/atvc/include/reduce/reduce_op_template.h +++ b/atvc/include/reduce/reduce_op_template.h @@ -118,7 +118,9 @@ public: template __aicore__ inline void AllocTensorAux(AscendC::LocalTensor& tensor) { - bufPool_.AllocTensor(tensor); + T DupValue = needDup ? compute_.template GetPaddingValue() : 0; + bufPool_.AllocTensor(tensor, DupValue); + // bufPool_.AllocTensor(tensor); } /*! diff --git a/atvc/include/reduce/reduce_sum.h b/atvc/include/reduce/reduce_sum.h index ac1ce5ea4a3f5d0565ac6af5ee7cf1bc3cb98c6e..581cccecc2017106362be8d642582dfdd5ed010a 100644 --- a/atvc/include/reduce/reduce_sum.h +++ b/atvc/include/reduce/reduce_sum.h @@ -15,20 +15,7 @@ #include "common/kernel_utils.h" #include "reduce/common/patterns.h" #include "reduce/utils/reduce_block_aux_util.h" - -namespace { -struct ReduceARParam { - uint32_t repStride = 0; - uint16_t dimA = 0; - uint16_t dimMax = 0; - uint16_t mainR = 0; - uint16_t tailR = 0; - uint64_t maskAddRNum = 0; - uint16_t loopRNum = 0; - uint16_t dtypeSize = 0; - uint16_t dimR = 0; -}; -} +#include "reduce/common/reduce_common.h" namespace ATVC { /*! diff --git a/atvc/include/reduce/utils/reduce_block_aux.h b/atvc/include/reduce/utils/reduce_block_aux.h index a68a169fa900a1aaf21d907b525ab95973d3d6a7..3f7639cba64ef66415a24e3baef762391ce02534 100644 --- a/atvc/include/reduce/utils/reduce_block_aux.h +++ b/atvc/include/reduce/utils/reduce_block_aux.h @@ -234,7 +234,7 @@ public: computeTensor = ubTensor; } else { // The index of AlloccomputeTensorAux does not require external perception - op_->ReduceOp::template AllocTensorAux(computeTensor); + op_->ReduceOp::template AllocTensorAux(computeTensor); CopyIn(view, shape, ubTensor); SetEvent(AscendC::HardEvent::MTE2_V); AscendC::Cast(computeTensor, ubTensor, AscendC::RoundMode::CAST_NONE, shape.value[0] * shape.value[1]); @@ -246,14 +246,21 @@ public: __aicore__ inline void LinearComputeR(int64_t& tmpBufOffest, V& shape, Args... 
args) { SliceView view; + bool needDup = false; for (int64_t i = 0; i < bisectionTail_; i++) { AscendC::LocalTensor tensorLeft; - op_->ReduceOp::template AllocTensorAux(tensorLeft); + op_->ReduceOp::template AllocTensorAux(tensorLeft); AscendC::LocalTensor computeLeft; PrePareReduce<(!InnerPattern::TailA), false>(i, view, shape, tensorLeft, computeLeft); AscendC::LocalTensor tensorRight; - op_->ReduceOp::template AllocTensorAux(tensorRight); + needDup = i == bisectionTail_ - 1; + if(needDup){ + op_->ReduceOp::template AllocTensorAux(tensorRight); + }else{ + op_->ReduceOp::template AllocTensorAux(tensorRight); + } + // op_->ReduceOp::template AllocTensorAux(tensorRight); AscendC::LocalTensor computeRight; PrePareReduce<(!InnerPattern::TailA), true>(i, view, shape, tensorRight, computeRight); ComputeMerge(shape, computeLeft, computeRight, args...); @@ -266,7 +273,13 @@ public: for (int64_t i = bisectionTail_; i < bisectionPos_; i++) { AscendC::LocalTensor tensor; - op_->ReduceOp::template AllocTensorAux(tensor); + needDup = i == bisectionPos_ -1; + if(needDup){ + op_->ReduceOp::template AllocTensorAux(tensor); + }else{ + op_->ReduceOp::template AllocTensorAux(tensor); + } + // op_->ReduceOp::template AllocTensorAux(tensor); AscendC::LocalTensor computeLeft; PrePareReduce<(!InnerPattern::TailA && Pattern::Dim > 2), false>(i, view, shape, tensor, computeLeft); Compute(shape, computeLeft, args...); diff --git a/atvc/include/reduce/utils/reduce_buf_pool.h b/atvc/include/reduce/utils/reduce_buf_pool.h index eeba4768b750f406791cbcbf8cefea11ca8aaa9c..96487c798df69697e5c4ca9f6e4bf2af3cee76b4 100644 --- a/atvc/include/reduce/utils/reduce_buf_pool.h +++ b/atvc/include/reduce/utils/reduce_buf_pool.h @@ -52,7 +52,10 @@ public: // Init buffer GetTPipePtr()->InitBuffer(qQue_, poolSize); AscendC::LocalTensor inputUb = qQue_.GetWithOffset(basicNum_ * inputNum, 0); - AscendC::Duplicate(inputUb, 0, basicNum_ * inputNum); + // AscendC::Duplicate(inputUb, 0, basicNum_ * inputNum); + for(int16_t i =0;i - __aicore__ inline const void AllocTensor(AscendC::LocalTensor& tensor) { + __aicore__ inline const void AllocTensor(AscendC::LocalTensor& tensor, T DupValue) { if constexpr (IsInput) { int32_t idx = GetInputTensorId(); tensor = qQue_.GetWithOffset(basicNum_, inputUnit_.offset + idx * basicNum_ * sizeof(T)); if constexpr (needDup) { - AscendC::Duplicate(tensor, 0, basicNum_); + AscendC::PipeBarrier(); + AscendC::Duplicate(tensor, DupValue, basicNum_); + } + if(usedTBuf_[idx] || needDup){ event_t allocEventId = static_cast(GetTPipePtr()->FetchEventID()); eventIdV2Mte2_[idx] = allocEventId; + needWaitFlag_[idx] = true; AscendC::SetFlag(allocEventId); } + usedTBuf_[idx] = true; } else { int32_t idx = GetComputeTensorId(); tensor = qQue_.GetWithOffset(basicNum_, computeUnit_.offset + idx * basicNum_ * sizeof(T)); @@ -101,7 +109,11 @@ public: uint64_t offset = (uint64_t)(tensor.GetPhyAddr()); if (offset - start < computeUnit_.offset) { int32_t idx = (offset - start) / sizeof(T) / basicNum_; - AscendC::WaitFlag(eventIdV2Mte2_[idx]); + // AscendC::WaitFlag(eventIdV2Mte2_[idx]); + if(needWaitFlag_[idx]){ + AscendC::WaitFlag(eventIdV2Mte2_[idx]); + needWaitFlag_[idx] = false; + } } } @@ -122,6 +134,8 @@ private: PoolManagerUnit inputUnit_; PoolManagerUnit computeUnit_; event_t eventIdV2Mte2_[MAX_INPUT_SIZE]; + bool needWaitFlag_[MAX_INPUT_SIZE]; + bool usedTBuf_[MAX_INPUT_SIZE]; AscendC::TBuf<> qQue_; int32_t basicNum_; }; // class ReduceBufPool