diff --git a/atvc/examples/reduce_max/README.md b/atvc/examples/reduce_max/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..358fa460c8e2133169e7da139d0e6b0c45e67e77
--- /dev/null
+++ b/atvc/examples/reduce_max/README.md
@@ -0,0 +1,46 @@
+
+
+## Overview
+
+This sample shows how to implement a ReduceMax operator with ATVC and verify its functionality.
+
+
+## Supported Products
+- Atlas A2 training series products
+
+## Directory Structure
+
+| Filename | Description |
+| -------- | ----------- |
+| [reduce_max.cpp](./reduce_max.cpp) | Implementation of the ReduceMax operator and a sample that invokes it |
+
+## Operator Description
+
+ReduceMax is a Reduce-class operator that computes the maximum of the input tensor along the specified axes and writes the result to the output (a reference sketch follows the specification table below).
+
+ReduceMax operator specification:
+
+| OpType | ReduceMax |
+| ------ | --------- |
+
+Operator input:
+
+| name | shape    | data type | format |
+| ---- | -------- | --------- | ------ |
+| x    | 8 * 1024 | float     | ND     |
+
+Operator output:
+
+| name | shape    | data type | format |
+| ---- | -------- | --------- | ------ |
+| y    | 1 * 1024 | float     | ND     |
+
+Kernel function name: ReduceCustom
+
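+A minimal host-side sketch of the reduction semantics used by this sample (reduce along axis 0 of a row-major 8 x 1024 float tensor); the helper name `ReduceMaxAxis0` is illustrative only and not part of the sample code:
+
+```cpp
+#include <algorithm>
+#include <vector>
+
+// y[j] = max(x[0][j], x[1][j], ..., x[7][j]) for a flattened 8 x 1024 input
+std::vector<float> ReduceMaxAxis0(const std::vector<float> &x)
+{
+    std::vector<float> y(1024);
+    for (int j = 0; j < 1024; ++j) {
+        float best = x[j];
+        for (int i = 1; i < 8; ++i) {
+            best = std::max(best, x[i * 1024 + j]);
+        }
+        y[j] = best;
+    }
+    return y;
+}
+```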
+
+## Running the Sample
+Run the following commands from the root of the ascendc-api-adv repository:
+```bash
+$ cd ./atvc/examples
+$ bash run_examples.sh reduce_max
+...
+Generate golden data successfully.
+...
+Accuracy verification passed.
+```
\ No newline at end of file
diff --git a/atvc/examples/reduce_max/reduce_max.cpp b/atvc/examples/reduce_max/reduce_max.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..a2176809706f94bb442b5e2632e0b2c38e8fcfa7
--- /dev/null
+++ b/atvc/examples/reduce_max/reduce_max.cpp
@@ -0,0 +1,178 @@
+/**
+ * Copyright (c) Huawei Technologies Co., Ltd. 2025. All rights reserved.
+ *
+ * This file is a part of the CANN Open Software.
+ * Licensed under CANN Open Software License Agreement Version 1.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ */
+
+#include <cstdint>
+#include <cstdio>
+#include <vector>
+#include "acl/acl.h"
+#include "reduce/reduce_host.h"
+#include "reduce/reduce_device.h"
+#include "example_common.h"
+
+namespace {
+// Description of the ReduceMax operator: one input and one output, both of type float
+using ReduceOpTraits = ATVC::OpTraits<ATVC::OpInputs<float>, ATVC::OpOutputs<float>>;
+
+void CleanUp(uint8_t *&xDevice, uint8_t *&yDevice, uint8_t *&yHost)
+{
+ CHECK_ACL(aclrtFree(xDevice));
+ CHECK_ACL(aclrtFree(yDevice));
+ CHECK_ACL(aclrtFreeHost(yHost));
+}
+}
+
+/*
+ * Kernel entry of the ReduceCustom operator
+ * x            GM address on the device, pointing to the first input of the ReduceMax operator
+ * y            GM address on the device, pointing to the first output of the ReduceMax operator
+ * reduceParam  runtime ATVC::ReduceParam data
+*/
+template <typename OpTraits, const ATVC::ReducePolicy &Policy>
+__global__ __aicore__ void ReduceCustom(
+ GM_ADDR x,
+ GM_ADDR y,
+ ATVC::ReduceParam reduceParam
+)
+{
+    KERNEL_TASK_TYPE_DEFAULT(KERNEL_TYPE_MIX_AIV_1_0); // Multi-core control instruction: only vector cores are launched when the operator runs
+    // The compute template class is passed in as a template parameter; Policy is supplied by the host-side policy dispatch API
+    auto op = ATVC::Kernel::ReduceOpTemplate<ATVC::ReduceMaxCompute<OpTraits>, Policy>();
+ op.Run(x, y, &reduceParam);
+}
+
+namespace {
+// Dispatches the Reduce-class operator: selects the best matching Policy and launches the corresponding kernel
+template <typename OpTraits>
+void ReduceOpAdapter(uint8_t* x, uint8_t* y, ATVC::ReduceParam &param, ATVC::ReducePolicy &policy, aclrtStream& stream)
+{
+    // Allocate the temporary workspace
+    uint8_t *workspaceDevice;
+    CHECK_ACL(aclrtMalloc((void **)&workspaceDevice, param.workspaceSize, ACL_MEM_MALLOC_HUGE_FIRST));
+    param.workspaceAddr = reinterpret_cast<GM_ADDR>(workspaceDevice);
+    // Map the ReducePolicy computed by the tiling API to a compile-time parameter and instantiate the matching kernel
+    // Kernel launch configuration is <<<block count, l2ctrl, stream>>>; taking the block count from
+    // param.tilingData.blockNum below is an assumption of this sketch.
+    if (policy == ATVC::REDUCE_POLICY0) {
+        ReduceCustom<OpTraits, ATVC::REDUCE_POLICY0><<<param.tilingData.blockNum, nullptr, stream>>>(x, y, param);
+    } else if (policy == ATVC::REDUCE_POLICY1) {
+        ReduceCustom<OpTraits, ATVC::REDUCE_POLICY1><<<param.tilingData.blockNum, nullptr, stream>>>(x, y, param);
+    } else if (policy == ATVC::REDUCE_POLICY2) {
+        ReduceCustom<OpTraits, ATVC::REDUCE_POLICY2><<<param.tilingData.blockNum, nullptr, stream>>>(x, y, param);
+    } else if (policy == ATVC::REDUCE_POLICY3) {
+        ReduceCustom<OpTraits, ATVC::REDUCE_POLICY3><<<param.tilingData.blockNum, nullptr, stream>>>(x, y, param);
+    } else if (policy == ATVC::REDUCE_POLICY4) {
+        ReduceCustom<OpTraits, ATVC::REDUCE_POLICY4><<<param.tilingData.blockNum, nullptr, stream>>>(x, y, param);
+    } else if (policy == ATVC::REDUCE_POLICY5) {
+        ReduceCustom<OpTraits, ATVC::REDUCE_POLICY5><<<param.tilingData.blockNum, nullptr, stream>>>(x, y, param);
+    } else if (policy == ATVC::REDUCE_POLICY6) {
+        ReduceCustom<OpTraits, ATVC::REDUCE_POLICY6><<<param.tilingData.blockNum, nullptr, stream>>>(x, y, param);
+    } else if (policy == ATVC::REDUCE_POLICY7) {
+        ReduceCustom<OpTraits, ATVC::REDUCE_POLICY7><<<param.tilingData.blockNum, nullptr, stream>>>(x, y, param);
+    } else if (policy == ATVC::REDUCE_POLICY8) {
+        ReduceCustom<OpTraits, ATVC::REDUCE_POLICY8><<<param.tilingData.blockNum, nullptr, stream>>>(x, y, param);
+    } else if (policy == ATVC::REDUCE_POLICY9) {
+        ReduceCustom<OpTraits, ATVC::REDUCE_POLICY9><<<param.tilingData.blockNum, nullptr, stream>>>(x, y, param);
+    } else if (policy == ATVC::REDUCE_POLICY10) {
+        ReduceCustom<OpTraits, ATVC::REDUCE_POLICY10><<<param.tilingData.blockNum, nullptr, stream>>>(x, y, param);
+    } else if (policy == ATVC::REDUCE_POLICY11) {
+        ReduceCustom<OpTraits, ATVC::REDUCE_POLICY11><<<param.tilingData.blockNum, nullptr, stream>>>(x, y, param);
+    } else if (policy == ATVC::REDUCE_POLICY12) {
+        ReduceCustom<OpTraits, ATVC::REDUCE_POLICY12><<<param.tilingData.blockNum, nullptr, stream>>>(x, y, param);
+    } else if (policy == ATVC::REDUCE_POLICY13) {
+        ReduceCustom<OpTraits, ATVC::REDUCE_POLICY13><<<param.tilingData.blockNum, nullptr, stream>>>(x, y, param);
+    } else if (policy == ATVC::REDUCE_POLICY14) {
+        ReduceCustom<OpTraits, ATVC::REDUCE_POLICY14><<<param.tilingData.blockNum, nullptr, stream>>>(x, y, param);
+    } else if (policy == ATVC::REDUCE_POLICY15) {
+        ReduceCustom<OpTraits, ATVC::REDUCE_POLICY15><<<param.tilingData.blockNum, nullptr, stream>>>(x, y, param);
+    } else if (policy == ATVC::REDUCE_POLICY16) {
+        ReduceCustom<OpTraits, ATVC::REDUCE_POLICY16><<<param.tilingData.blockNum, nullptr, stream>>>(x, y, param);
+    } else if (policy == ATVC::REDUCE_POLICY17) {
+        ReduceCustom<OpTraits, ATVC::REDUCE_POLICY17><<<param.tilingData.blockNum, nullptr, stream>>>(x, y, param);
+    } else if (policy == ATVC::REDUCE_POLICY18) {
+        ReduceCustom<OpTraits, ATVC::REDUCE_POLICY18><<<param.tilingData.blockNum, nullptr, stream>>>(x, y, param);
+    } else if (policy == ATVC::REDUCE_POLICY19) {
+        ReduceCustom<OpTraits, ATVC::REDUCE_POLICY19><<<param.tilingData.blockNum, nullptr, stream>>>(x, y, param);
+    } else if (policy == ATVC::REDUCE_POLICY20) {
+        ReduceCustom<OpTraits, ATVC::REDUCE_POLICY20><<<param.tilingData.blockNum, nullptr, stream>>>(x, y, param);
+    } else if (policy == ATVC::REDUCE_POLICY21) {
+        ReduceCustom<OpTraits, ATVC::REDUCE_POLICY21><<<param.tilingData.blockNum, nullptr, stream>>>(x, y, param);
+    } else if (policy == ATVC::REDUCE_POLICY22) {
+        ReduceCustom<OpTraits, ATVC::REDUCE_POLICY22><<<param.tilingData.blockNum, nullptr, stream>>>(x, y, param);
+    } else {
+        (void)printf("[ERROR]: Cannot find any matched policy.\n");
+    }
+    // Synchronize the stream, then free the allocated workspace memory
+ CHECK_ACL(aclrtSynchronizeStream(stream));
+ CHECK_ACL(aclrtFree(workspaceDevice));
+}
+}
+
+int32_t main(int32_t argc, char* argv[])
+{
+    if (!ATVC::Host::DebugCheck<ReduceOpTraits>()) {
+ (void)printf("[ERROR]: Reduce OpTraits check failed.\n");
+ return -1;
+ }
+    int32_t eleNum = 8 * 1024;
+    int32_t outEleNum = 1 * 1024;
+    size_t inputByteSize = static_cast<size_t>(eleNum) * sizeof(float);
+    size_t outputByteSize = static_cast<size_t>(outEleNum) * sizeof(float);
+    std::vector<int64_t> dim{0};         // reduce along axis 0
+    std::vector<int64_t> shape{8, 1024}; // test input shape
+    std::vector<float> inputX(eleNum, 1.0f);
+    std::vector<float> golden(outEleNum, 1.0f);
+    (void)printf("Generate golden data successfully.\n");
+    // Initialize ACL resources
+ CHECK_ACL(aclInit(nullptr));
+ aclrtContext context;
+ int32_t deviceId = 0;
+ CHECK_ACL(aclrtSetDevice(deviceId));
+ CHECK_ACL(aclrtCreateContext(&context, deviceId));
+ aclrtStream stream = nullptr;
+ CHECK_ACL(aclrtCreateStream(&stream));
+ uint8_t *yHost;
+ uint8_t *xDevice;
+ uint8_t *yDevice;
+
+ CHECK_ACL(aclrtMallocHost((void **)(&yHost), outputByteSize));
+ CHECK_ACL(aclrtMalloc((void **)&xDevice, inputByteSize, ACL_MEM_MALLOC_HUGE_FIRST));
+ CHECK_ACL(aclrtMalloc((void **)&yDevice, outputByteSize, ACL_MEM_MALLOC_HUGE_FIRST));
+
+ CHECK_ACL(aclrtMemcpy(xDevice, inputByteSize, inputX.data(), inputByteSize, ACL_MEMCPY_HOST_TO_DEVICE));
+
+    ATVC::ReduceParam param; // Runtime Reduce parameters, holding the tiling data and workspace-related information
+    ATVC::ReducePolicy policy = {-1, -1, -1}; // Runtime Reduce policy, mapping to the best-fitting Reduce template implementation
+    ATVC::Host::ReduceTilingHyperParam hyperParam;
+    hyperParam.maxInnerA = 256; // set maxInnerA to 256
+    // Call the host-side tiling API to compute the runtime parameters
+    if (!ATVC::Host::CalcReduceTiling<ReduceOpTraits>(shape, dim, &policy, &param, hyperParam)) {
+        (void)printf("Reduce tiling error.\n");
+        return -1;
+    }
+
+    // Call the adapter to dispatch the kernel through the matching template instantiation
+    ReduceOpAdapter<ReduceOpTraits>(xDevice, yDevice, param, policy, stream);
+
+ CHECK_ACL(aclrtMemcpy(yHost, outputByteSize, yDevice, outputByteSize, ACL_MEMCPY_DEVICE_TO_HOST));
+    std::vector<float> outputY(reinterpret_cast<float*>(yHost), reinterpret_cast<float*>(yHost) + outEleNum);
+
+    // Release ACL resources
+ CleanUp(xDevice, yDevice, yHost);
+ CleanACL(stream, context, deviceId);
+
+ if (!VerifyResults(golden, outputY)) {
+ return -1;
+ }
+
+ (void)printf("Accuracy verification passed.\n");
+ return 0;
+}
diff --git a/atvc/examples/reduce_min/README.md b/atvc/examples/reduce_min/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..af85b2bc89edbb7ff9faa90e55bc8a6ce57cf91f
--- /dev/null
+++ b/atvc/examples/reduce_min/README.md
@@ -0,0 +1,46 @@
+
+
+## Overview
+
+This sample shows how to implement a ReduceMin operator with ATVC and verify its functionality.
+
+
+## Supported Products
+- Atlas A2 training series products
+
+## Directory Structure
+
+| Filename | Description |
+| -------- | ----------- |
+| [reduce_min.cpp](./reduce_min.cpp) | Implementation of the ReduceMin operator and a sample that invokes it |
+
+## Operator Description
+
+ReduceMin is a Reduce-class operator that computes the minimum of the input tensor along the specified axes and writes the result to the output (a reference sketch follows the specification table below).
+
+ReduceMin operator specification:
+
+| OpType | ReduceMin |
+| ------ | --------- |
+
+Operator input:
+
+| name | shape    | data type | format |
+| ---- | -------- | --------- | ------ |
+| x    | 8 * 1024 | float     | ND     |
+
+Operator output:
+
+| name | shape    | data type | format |
+| ---- | -------- | --------- | ------ |
+| y    | 1 * 1024 | float     | ND     |
+
+Kernel function name: ReduceCustom
+
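+A minimal host-side sketch of the reduction semantics used by this sample (reduce along axis 0 of a row-major 8 x 1024 float tensor); the helper name `ReduceMinAxis0` is illustrative only and not part of the sample code:
+
+```cpp
+#include <algorithm>
+#include <vector>
+
+// y[j] = min(x[0][j], x[1][j], ..., x[7][j]) for a flattened 8 x 1024 input
+std::vector<float> ReduceMinAxis0(const std::vector<float> &x)
+{
+    std::vector<float> y(1024);
+    for (int j = 0; j < 1024; ++j) {
+        float best = x[j];
+        for (int i = 1; i < 8; ++i) {
+            best = std::min(best, x[i * 1024 + j]);
+        }
+        y[j] = best;
+    }
+    return y;
+}
+```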
+
+## Running the Sample
+Run the following commands from the root of the ascendc-api-adv repository:
+```bash
+$ cd ./atvc/examples
+$ bash run_examples.sh reduce_min
+...
+Generate golden data successfully.
+...
+Accuracy verification passed.
+```
\ No newline at end of file
diff --git a/atvc/examples/reduce_min/reduce_min.cpp b/atvc/examples/reduce_min/reduce_min.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..7e2eab3b788c3f40585fbd4392b63a14e20fe92b
--- /dev/null
+++ b/atvc/examples/reduce_min/reduce_min.cpp
@@ -0,0 +1,178 @@
+/**
+ * Copyright (c) Huawei Technologies Co., Ltd. 2025. All rights reserved.
+ *
+ * This file is a part of the CANN Open Software.
+ * Licensed under CANN Open Software License Agreement Version 1.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ */
+
+#include <cstdint>
+#include <cstdio>
+#include <vector>
+#include "acl/acl.h"
+#include "reduce/reduce_host.h"
+#include "reduce/reduce_device.h"
+#include "example_common.h"
+
+namespace {
+// Description of the ReduceMin operator: one input and one output, both of type float
+using ReduceOpTraits = ATVC::OpTraits<ATVC::OpInputs<float>, ATVC::OpOutputs<float>>;
+
+void CleanUp(uint8_t *&xDevice, uint8_t *&yDevice, uint8_t *&yHost)
+{
+ CHECK_ACL(aclrtFree(xDevice));
+ CHECK_ACL(aclrtFree(yDevice));
+ CHECK_ACL(aclrtFreeHost(yHost));
+}
+}
+
+/*
+ * Kernel entry of the ReduceCustom operator
+ * x            GM address on the device, pointing to the first input of the ReduceMin operator
+ * y            GM address on the device, pointing to the first output of the ReduceMin operator
+ * reduceParam  runtime ATVC::ReduceParam data
+*/
+template <typename OpTraits, const ATVC::ReducePolicy &Policy>
+__global__ __aicore__ void ReduceCustom(
+ GM_ADDR x,
+ GM_ADDR y,
+ ATVC::ReduceParam reduceParam
+)
+{
+    KERNEL_TASK_TYPE_DEFAULT(KERNEL_TYPE_MIX_AIV_1_0); // Multi-core control instruction: only vector cores are launched when the operator runs
+    // The compute template class is passed in as a template parameter; Policy is supplied by the host-side policy dispatch API
+    auto op = ATVC::Kernel::ReduceOpTemplate<ATVC::ReduceMinCompute<OpTraits>, Policy>();
+ op.Run(x, y, &reduceParam);
+}
+
+namespace {
+// Dispatches the Reduce-class operator: selects the best matching Policy and launches the corresponding kernel
+template <typename OpTraits>
+void ReduceOpAdapter(uint8_t* x, uint8_t* y, ATVC::ReduceParam &param, ATVC::ReducePolicy &policy, aclrtStream& stream)
+{
+    // Allocate the temporary workspace
+    uint8_t *workspaceDevice;
+    CHECK_ACL(aclrtMalloc((void **)&workspaceDevice, param.workspaceSize, ACL_MEM_MALLOC_HUGE_FIRST));
+    param.workspaceAddr = reinterpret_cast<GM_ADDR>(workspaceDevice);
+    // Map the ReducePolicy computed by the tiling API to a compile-time parameter and instantiate the matching kernel
+    // Kernel launch configuration is <<<block count, l2ctrl, stream>>>; taking the block count from
+    // param.tilingData.blockNum below is an assumption of this sketch.
+    if (policy == ATVC::REDUCE_POLICY0) {
+        ReduceCustom<OpTraits, ATVC::REDUCE_POLICY0><<<param.tilingData.blockNum, nullptr, stream>>>(x, y, param);
+    } else if (policy == ATVC::REDUCE_POLICY1) {
+        ReduceCustom<OpTraits, ATVC::REDUCE_POLICY1><<<param.tilingData.blockNum, nullptr, stream>>>(x, y, param);
+    } else if (policy == ATVC::REDUCE_POLICY2) {
+        ReduceCustom<OpTraits, ATVC::REDUCE_POLICY2><<<param.tilingData.blockNum, nullptr, stream>>>(x, y, param);
+    } else if (policy == ATVC::REDUCE_POLICY3) {
+        ReduceCustom<OpTraits, ATVC::REDUCE_POLICY3><<<param.tilingData.blockNum, nullptr, stream>>>(x, y, param);
+    } else if (policy == ATVC::REDUCE_POLICY4) {
+        ReduceCustom<OpTraits, ATVC::REDUCE_POLICY4><<<param.tilingData.blockNum, nullptr, stream>>>(x, y, param);
+    } else if (policy == ATVC::REDUCE_POLICY5) {
+        ReduceCustom<OpTraits, ATVC::REDUCE_POLICY5><<<param.tilingData.blockNum, nullptr, stream>>>(x, y, param);
+    } else if (policy == ATVC::REDUCE_POLICY6) {
+        ReduceCustom<OpTraits, ATVC::REDUCE_POLICY6><<<param.tilingData.blockNum, nullptr, stream>>>(x, y, param);
+    } else if (policy == ATVC::REDUCE_POLICY7) {
+        ReduceCustom<OpTraits, ATVC::REDUCE_POLICY7><<<param.tilingData.blockNum, nullptr, stream>>>(x, y, param);
+    } else if (policy == ATVC::REDUCE_POLICY8) {
+        ReduceCustom<OpTraits, ATVC::REDUCE_POLICY8><<<param.tilingData.blockNum, nullptr, stream>>>(x, y, param);
+    } else if (policy == ATVC::REDUCE_POLICY9) {
+        ReduceCustom<OpTraits, ATVC::REDUCE_POLICY9><<<param.tilingData.blockNum, nullptr, stream>>>(x, y, param);
+    } else if (policy == ATVC::REDUCE_POLICY10) {
+        ReduceCustom<OpTraits, ATVC::REDUCE_POLICY10><<<param.tilingData.blockNum, nullptr, stream>>>(x, y, param);
+    } else if (policy == ATVC::REDUCE_POLICY11) {
+        ReduceCustom<OpTraits, ATVC::REDUCE_POLICY11><<<param.tilingData.blockNum, nullptr, stream>>>(x, y, param);
+    } else if (policy == ATVC::REDUCE_POLICY12) {
+        ReduceCustom<OpTraits, ATVC::REDUCE_POLICY12><<<param.tilingData.blockNum, nullptr, stream>>>(x, y, param);
+    } else if (policy == ATVC::REDUCE_POLICY13) {
+        ReduceCustom<OpTraits, ATVC::REDUCE_POLICY13><<<param.tilingData.blockNum, nullptr, stream>>>(x, y, param);
+    } else if (policy == ATVC::REDUCE_POLICY14) {
+        ReduceCustom<OpTraits, ATVC::REDUCE_POLICY14><<<param.tilingData.blockNum, nullptr, stream>>>(x, y, param);
+    } else if (policy == ATVC::REDUCE_POLICY15) {
+        ReduceCustom<OpTraits, ATVC::REDUCE_POLICY15><<<param.tilingData.blockNum, nullptr, stream>>>(x, y, param);
+    } else if (policy == ATVC::REDUCE_POLICY16) {
+        ReduceCustom<OpTraits, ATVC::REDUCE_POLICY16><<<param.tilingData.blockNum, nullptr, stream>>>(x, y, param);
+    } else if (policy == ATVC::REDUCE_POLICY17) {
+        ReduceCustom<OpTraits, ATVC::REDUCE_POLICY17><<<param.tilingData.blockNum, nullptr, stream>>>(x, y, param);
+    } else if (policy == ATVC::REDUCE_POLICY18) {
+        ReduceCustom<OpTraits, ATVC::REDUCE_POLICY18><<<param.tilingData.blockNum, nullptr, stream>>>(x, y, param);
+    } else if (policy == ATVC::REDUCE_POLICY19) {
+        ReduceCustom<OpTraits, ATVC::REDUCE_POLICY19><<<param.tilingData.blockNum, nullptr, stream>>>(x, y, param);
+    } else if (policy == ATVC::REDUCE_POLICY20) {
+        ReduceCustom<OpTraits, ATVC::REDUCE_POLICY20><<<param.tilingData.blockNum, nullptr, stream>>>(x, y, param);
+    } else if (policy == ATVC::REDUCE_POLICY21) {
+        ReduceCustom<OpTraits, ATVC::REDUCE_POLICY21><<<param.tilingData.blockNum, nullptr, stream>>>(x, y, param);
+    } else if (policy == ATVC::REDUCE_POLICY22) {
+        ReduceCustom<OpTraits, ATVC::REDUCE_POLICY22><<<param.tilingData.blockNum, nullptr, stream>>>(x, y, param);
+    } else {
+        (void)printf("[ERROR]: Cannot find any matched policy.\n");
+    }
+    // Synchronize the stream, then free the allocated workspace memory
+ CHECK_ACL(aclrtSynchronizeStream(stream));
+ CHECK_ACL(aclrtFree(workspaceDevice));
+}
+}
+
+int32_t main(int32_t argc, char* argv[])
+{
+    if (!ATVC::Host::DebugCheck<ReduceOpTraits>()) {
+ (void)printf("[ERROR]: Reduce OpTraits check failed.\n");
+ return -1;
+ }
+    int32_t eleNum = 8 * 1024;
+    int32_t outEleNum = 1 * 1024;
+    size_t inputByteSize = static_cast<size_t>(eleNum) * sizeof(float);
+    size_t outputByteSize = static_cast<size_t>(outEleNum) * sizeof(float);
+    std::vector<int64_t> dim{0};         // reduce along axis 0
+    std::vector<int64_t> shape{8, 1024}; // test input shape
+    std::vector<float> inputX(eleNum, 1.0f);
+    std::vector<float> golden(outEleNum, 1.0f);
+    (void)printf("Generate golden data successfully.\n");
+    // Initialize ACL resources
+ CHECK_ACL(aclInit(nullptr));
+ aclrtContext context;
+ int32_t deviceId = 0;
+ CHECK_ACL(aclrtSetDevice(deviceId));
+ CHECK_ACL(aclrtCreateContext(&context, deviceId));
+ aclrtStream stream = nullptr;
+ CHECK_ACL(aclrtCreateStream(&stream));
+ uint8_t *yHost;
+ uint8_t *xDevice;
+ uint8_t *yDevice;
+
+ CHECK_ACL(aclrtMallocHost((void **)(&yHost), outputByteSize));
+ CHECK_ACL(aclrtMalloc((void **)&xDevice, inputByteSize, ACL_MEM_MALLOC_HUGE_FIRST));
+ CHECK_ACL(aclrtMalloc((void **)&yDevice, outputByteSize, ACL_MEM_MALLOC_HUGE_FIRST));
+
+ CHECK_ACL(aclrtMemcpy(xDevice, inputByteSize, inputX.data(), inputByteSize, ACL_MEMCPY_HOST_TO_DEVICE));
+
+    ATVC::ReduceParam param; // Runtime Reduce parameters, holding the tiling data and workspace-related information
+    ATVC::ReducePolicy policy = {-1, -1, -1}; // Runtime Reduce policy, mapping to the best-fitting Reduce template implementation
+    ATVC::Host::ReduceTilingHyperParam hyperParam;
+    hyperParam.maxInnerA = 256; // set maxInnerA to 256
+    // Call the host-side tiling API to compute the runtime parameters
+    if (!ATVC::Host::CalcReduceTiling<ReduceOpTraits>(shape, dim, &policy, &param, hyperParam)) {
+        (void)printf("Reduce tiling error.\n");
+        return -1;
+    }
+
+    // Call the adapter to dispatch the kernel through the matching template instantiation
+    ReduceOpAdapter<ReduceOpTraits>(xDevice, yDevice, param, policy, stream);
+
+ CHECK_ACL(aclrtMemcpy(yHost, outputByteSize, yDevice, outputByteSize, ACL_MEMCPY_DEVICE_TO_HOST));
+    std::vector<float> outputY(reinterpret_cast<float*>(yHost), reinterpret_cast<float*>(yHost) + outEleNum);
+
+    // Release ACL resources
+ CleanUp(xDevice, yDevice, yHost);
+ CleanACL(stream, context, deviceId);
+
+ if (!VerifyResults(golden, outputY)) {
+ return -1;
+ }
+
+ (void)printf("Accuracy verification passed.\n");
+ return 0;
+}
diff --git a/atvc/include/atvc.h b/atvc/include/atvc.h
index 187482e260bac834e0b9eb593f7316740a7bbd90..f56046ea7155eb8fa3e2d7e57c75bc2a36594c60 100644
--- a/atvc/include/atvc.h
+++ b/atvc/include/atvc.h
@@ -29,6 +29,8 @@
#include "common/kernel_utils.h"
#include "elewise/elewise_op_template.h"
#include "reduce/reduce_sum.h"
+#include "reduce/reduce_max.h"
+#include "reduce/reduce_min.h"
#include "reduce/reduce_op_template.h"
#include "broadcast/broadcast_compute.h"
#include "broadcast/broadcast_op_template.h"
diff --git a/atvc/include/reduce/common/reduce_common.h b/atvc/include/reduce/common/reduce_common.h
index 94c6e050abd68f597d4813ac279c7b040bd2f001..91a040a90b569bc74fcb55755934cc7aab629853 100644
--- a/atvc/include/reduce/common/reduce_common.h
+++ b/atvc/include/reduce/common/reduce_common.h
@@ -29,6 +29,18 @@ enum ShapeDim {
DIM_BROADCAST // Broadcast axis
};
+// Parameters shared by the AR-pattern reduction helpers (sizes are in elements unless noted otherwise)
+struct ReduceARParam {
+    uint32_t repStride = 0;   // stride between vector repeats, in 32-byte blocks
+    uint16_t dimA = 0;        // length of the non-reduced axis (A)
+    uint16_t dimMax = 0;      // total number of elements in the tile (dimA * dimR)
+    uint16_t mainR = 0;       // largest power-of-two portion of the R axis
+    uint16_t tailR = 0;       // remaining R elements beyond mainR
+    uint64_t maskAddRNum = 0; // elements processed per 256-byte vector repeat
+    uint16_t loopRNum = 0;    // current R length in the binary-halving loop
+    uint16_t dtypeSize = 0;   // size of one element in bytes
+    uint16_t dimR = 0;        // (padded) length of the reduced axis (R)
+};
+
namespace AR_PATTERN {
static constexpr uint32_t A = 100;
static constexpr uint32_t AR = 11;
diff --git a/atvc/include/reduce/reduce_device.h b/atvc/include/reduce/reduce_device.h
index a271e8f17d289e74509cbf03ef8348a443afcaff..c1611078ee1631d6a1d61255992926209bd8fc5b 100644
--- a/atvc/include/reduce/reduce_device.h
+++ b/atvc/include/reduce/reduce_device.h
@@ -25,6 +25,8 @@
#include "common/kernel_utils.h"
#include "reduce/reduce_sum.h"
+#include "reduce/reduce_max.h"
+#include "reduce/reduce_min.h"
#include "reduce/reduce_op_template.h"
#endif // ATVC_REDUCE_DEVICE_H
\ No newline at end of file
diff --git a/atvc/include/reduce/reduce_max.h b/atvc/include/reduce/reduce_max.h
new file mode 100644
index 0000000000000000000000000000000000000000..9aab3b9990fac99e14cfc79162e9d71a662f87f0
--- /dev/null
+++ b/atvc/include/reduce/reduce_max.h
@@ -0,0 +1,350 @@
+/**
+ * Copyright (c) Huawei Technologies Co., Ltd. 2025. All rights reserved.
+ *
+ * This file is a part of the CANN Open Software.
+ * Licensed under CANN Open Software License Agreement Version 1.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ */
+
+#ifndef ATVC_REDUCE_MAX_H
+#define ATVC_REDUCE_MAX_H
+
+#include "common/kernel_utils.h"
+#include "reduce/common/patterns.h"
+#include "reduce/utils/reduce_block_aux_util.h"
+#include "reduce/common/reduce_common.h"
+
+namespace ATVC {
+/*!
+ * ReduceMaxCompute This class provides the core arithmetic required to reduce
+ * tensors along either the inner-most (AR) or outer-most (RA) axis after
+ * the tensor has been copied to the Unified Buffer (UB). Data movement between
+ * Global Memory (GM) and UB is not handled here; it is the responsibility of
+ * the surrounding scheduling template.
+ */
+template <typename OpTraits>
+class ReduceMaxCompute {
+public:
+ // Extract operator input description information from OpTraits
+ using inputDTypeList = typename OpTraits::In::types;
+    using DataType = typename ATVC::TypeListGet<inputDTypeList, 0>::Type;
+    using PrompteDtype = typename KernelUtils::GetPromoteType<DataType>::T;
+ __aicore__ inline ReduceMaxCompute() {}
+
+ /*!
+ * \brief Perform the actual reduction on a tile already resident in UB.
+ * \tparam needMask, true when UB alignment introduced invalid lanes.
+ * \tparam Pattern, one of ReducePattern::AR or ReducePattern::RA.
+ * \param[in] shape, {dimA, dimR} in elements; dimR may be padded.
+ * \param[out] dst, destination tensor (length == dimA)
+ * \param[in] src, source tensor (length == dimA * dimR)
+ */
+    template <bool needMask, typename Pattern>
+ __aicore__ inline void
+ Compute(KernelUtils::Shape<2> &shape,
+ const AscendC::LocalTensor &dst,
+ const AscendC::LocalTensor &src)
+ {
+ // AR scenario, hardware limitations, R-axis requires 32B alignment on UB, with 2 alignment methods available:
+ // 1. High performance alignment (with uncertain supplementary element values), subsequent cumulative
+ // calculations can only calculate the actual number of effective elements
+        // 2. Alignment with zero padding (the padding value is determined by the GetPaddingValue() interface
+        //    implemented by the user)
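+        // e.g. a 100-element float row (400 bytes) is padded to 104 elements (416 bytes) to reach 32-byte alignment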
+        if (std::is_same<Pattern, ReducePattern::AR>::value) {
+ if constexpr (needMask) { // 1. High performance alignment mode
+ // MainR (int64_t dimR, boolean isAR): The framework provides the calculation of the R-axis binary
+ // length (number of elements), where dimR is the original number of elements
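+                // e.g. an original R length of 100 gives mainR = 64 (largest power of two) and a tail of 36 elements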
+ int16_t mainR = KernelUtils::Reduce::MainR(shape.oriBurstLen, true);
+ ReduceAR(dst, src, shape.value[0], shape.value[1], mainR, shape.oriBurstLen);
+ } else {
+ // MainR: The framework provides the calculation of the R-axis binary length (number of elements),
+ // where dimR is the number of elements after completion
+ int16_t mainR = KernelUtils::Reduce::MainR(shape.value[1], true);
+ ReduceAR(dst, src, shape.value[0], shape.value[1], mainR, shape.value[1]);
+ }
+ }
+        if (std::is_same<Pattern, ReducePattern::RA>::value) {
+ int16_t mainR = KernelUtils::Reduce::MainR(shape.value[0], false);
+ ReduceRA(dst, src, shape.value[1], shape.value[0], mainR);
+ }
+ }
+
+ /*!
+ * \brief RA-pattern reduction: reduce along the outer-most (slowest-varying) axis.
+ * \param[out] dst, output tensor (length == dimA)
+ * \param[in] src, input tensor (length == dimR * dimA), already resident in UB
+ * \param[in] dimA, length of the non-reduced axis (A)
+ * \param[in] dimR, length of the reduced axis (R)
+ * \param[in] mainR, largest power-of-two ≤ dimR (computed by the caller)
+ */
+ __aicore__ inline void
+ ReduceRA(const AscendC::LocalTensor &dst,
+ const AscendC::LocalTensor &src, uint16_t dimA,
+ uint16_t dimR, uint16_t mainR)
+ {
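+        // Overall flow (e.g. dimR = 7, mainR = 4): rows 4..6 are first folded into rows 0..2 with Max, then the
+        // remaining mainR rows are halved (4 -> 2 -> 1) until a single row of dimA maxima remains and is copied to dst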
+ uint32_t totalNum = dimR * dimA;
+ uint32_t mainNum = dimA * mainR;
+ constexpr uint32_t dtypeSize = sizeof(PrompteDtype);
+ uint32_t tailNum = totalNum - mainNum;
+ // MaskAddNum has a maximum value of 256 bytes and must be aligned with 32 bytes
+ constexpr uint32_t maskAddNum = UB_ALIGN_256 / dtypeSize / UB_ALIGN_32 * UB_ALIGN_32;
+ uint16_t repeatTimes = tailNum / maskAddNum;
+ uint16_t repeatNum = repeatTimes * maskAddNum;
+ uint16_t repTailNum = tailNum - repeatNum;
+ // Same data block step size between different iterations
+ uint32_t repStride = dtypeSize * maskAddNum / UB_ALIGN_32;
+ // dstBlkStride, src0BlkStride,src1BlkStride, dstRepStride, src0RepStride, src1RepStride
+ AscendC::BinaryRepeatParams repeatParams(1, 1, 1, repStride, repStride, repStride);
+ if (repeatTimes > 0) {
+ AscendC::Max(src, src[mainNum], src, maskAddNum, repeatTimes, repeatParams);
+ }
+ if (repTailNum > 0) {
+ // Same data block step size between different iterations
+ repStride = dtypeSize * repTailNum / UB_ALIGN_32;
+ repeatParams.dstRepStride = repStride;
+ repeatParams.src0RepStride = repStride;
+ repeatParams.src1RepStride = repStride;
+ AscendC::Max(src[repeatNum], src[repeatNum + mainNum], src[repeatNum], repTailNum, 1, repeatParams);
+ }
+ AscendC::PipeBarrier();
+ uint16_t loopRNum = mainR;
+ while (loopRNum > 1) {
+ loopRNum = loopRNum >> 1;
+ mainNum = loopRNum * dimA; // The first half of LoopR's data volume
+ repeatTimes = mainNum / maskAddNum;
+ repeatNum = repeatTimes * maskAddNum;
+ repTailNum = mainNum - repeatNum;
+ if (repeatTimes > 0) {
+ // Same data block step size between different iterations
+ repStride = dtypeSize * maskAddNum / UB_ALIGN_32;
+ repeatParams.dstRepStride = repStride;
+ repeatParams.src0RepStride = repStride;
+ repeatParams.src1RepStride = repStride;
+ AscendC::Max(src, src[mainNum], src, maskAddNum, repeatTimes, repeatParams);
+ }
+ if (repTailNum > 0) {
+ // Same data block step size between different iterations
+ repStride = dtypeSize * repTailNum / UB_ALIGN_32;
+ repeatParams.dstRepStride = repStride;
+ repeatParams.src0RepStride = repStride;
+ repeatParams.src1RepStride = repStride;
+ AscendC::Max(src[repeatNum], src[repeatNum + mainNum], src[repeatNum], repTailNum, 1, repeatParams);
+ }
+ AscendC::PipeBarrier();
+ }
+ AscendC::DataCopy(dst, src, dimA);
+ }
+
+ /*!
+ * \brief AR-pattern reduction: reduce along the inner-most (fastest-varying) axis.
+ * \param[out] dstTensor, output tensor (length == dimA)
+ * \param[in] srcTensor, input tensor (length == dimR * dimA), already resident in UB
+ * \param[in] dimA, length of the non-reduced axis (A)
+ * \param[in] dimR, padded length of the reduced axis (R)
+ * \param[in] mainR, largest power-of-two ≤ original R length
+ * \param[in] oriBurstLen, original (un-padded) R length used to compute tail
+ */
+ __aicore__ inline void
+ ReduceAR(const AscendC::LocalTensor &dstTensor,
+ const AscendC::LocalTensor &srcTensor, uint16_t dimA,
+ uint16_t dimR, uint16_t mainR, uint64_t oriBurstLen)
+ {
+ uint16_t tailR = oriBurstLen - mainR;
+ constexpr uint16_t dtypeSize = sizeof(PrompteDtype);
+ uint32_t repStride = dtypeSize * dimR / UB_ALIGN_32;
+ uint16_t dimMax = dimA * dimR;
+ constexpr uint64_t maskAddRNum = UB_ALIGN_256 / dtypeSize;
+
+ ReduceARParam param{
+ .repStride = repStride,
+ .dimA = dimA,
+ .dimMax = dimMax,
+ .mainR = mainR,
+ .tailR = tailR,
+ .maskAddRNum = maskAddRNum,
+ .dtypeSize = dtypeSize,
+ .dimR = dimR
+ };
+
+ if (mainR > 0 && tailR > 0) {
+ PerformInitialMax(srcTensor, param);
+ }
+
+ param.loopRNum = mainR;
+ while (param.loopRNum > maskAddRNum) {
+ param.loopRNum = param.loopRNum / 2U;
+ PerformBinaryReduction(srcTensor, param);
+ }
+ if (param.loopRNum == 0) { // small shape, directly reduce
+ param.loopRNum = tailR;
+ }
+ PerformFinalReduction(dstTensor, srcTensor, param);
+ }
+
+ /*!
+ * \brief Merge the calculation results of different data base blocks within a single UB
+ * \tparam Pattern Compile-time pattern tag that decides A vs. B orientation.
+ * \tparam V Shape descriptor (encodes dimA and dimB at runtime).
+ * \param[in] index, logical index identifying the data-base block.
+ * \param[in] shape, runtime tensor shape (dimA, dimB).
+ * \param[in] tempBuf, UB tensor serving as the reduction cache.
+ * \param[in] computeRes, UB tensor holding the newest partial result.
+ */
+ template
+ __aicore__ inline void UpdateCache(int64_t index, V& shape, const AscendC::LocalTensor& tempBuf,
+ const AscendC::LocalTensor& computeRes)
+ {
+ int64_t cacheID = KernelUtils::Reduce::GetCacheID(index);
+ int64_t dimA = Pattern::TailA ? shape.value[1] : shape.value[0];
+ int32_t element_one_repeat = Platform::GetVRegSize() / sizeof(PrompteDtype);
+ int64_t stride = OpsUtils::CeilDiv(dimA, static_cast(element_one_repeat)) * element_one_repeat;
+ uint16_t outerLoopTimes = OpsUtils::CeilDiv(
+ static_cast(dimA * sizeof(PrompteDtype)), static_cast(Platform::GetVRegSize()));
+ uint16_t innerLoopTimes = cacheID;
+ uint32_t outerLoopStride = element_one_repeat;
+        uint32_t innerLoopStride = stride; // The size of each index block in the cache, i.e. the size of the A axis
+ AscendC::LocalTensor dstTensor = tempBuf;
+ AscendC::LocalTensor srcTensor = computeRes;
+ uint32_t cah = cacheID * stride;
+
+ for (uint16_t i = 0; i < outerLoopTimes; ++i) { // OuterLoopTimes is the size of dimA
+ uint32_t srcIdx = i * outerLoopStride;
+ for (uint16_t j = 0; j < innerLoopTimes; ++j) {
+ AscendC::Max(srcTensor[srcIdx], srcTensor[srcIdx],
+ dstTensor[srcIdx + j * innerLoopStride],
+ outerLoopStride);
+ AscendC::PipeBarrier();
+ }
+ DataCopy(dstTensor[cah + srcIdx], srcTensor[srcIdx], outerLoopStride);
+ }
+ }
+
+ /*!
+ * \brief Binary reduction between two UB buffers.
+ * \ Used for inter-core result merging when workspace staging is required.
+ * \param[in] ubTensorLeft, left operand (in-place result).
+ * \param[in] ubTensorRight, right operand (read-only).
+ * \param[in] calCount, number of elements to reduce.
+ */
+ __aicore__ inline void
+ ReduceBetweenUB(const AscendC::LocalTensor &ubTensorLeft,
+ const AscendC::LocalTensor &ubTensorRight,
+ const int32_t &calCount)
+ {
+ AscendC::Max(ubTensorRight, ubTensorRight, ubTensorLeft, calCount);
+ }
+
+ /*!
+ * \brief Return the value used for padding when UB alignment is required.
+     * For MAX-reduction the neutral element is the smallest value of the type (e.g. -∞ for floating point).
+     * \tparam U, scalar type identical to DataType or PromoteDataType.
+     * \return The padding value (the identity element of max for type U).
+     */
+    template <typename U>
+    __aicore__ inline U GetPaddingValue()
+    {
+        // ReduceMax takes the maximum over the R-axis data, so the padded elements are set to the smallest
+        // value of the type to ensure that they do not affect the result
+        if (AscendC::IsSameType<U, int32_t>::value) {
+            return INT32_MIN;
+        } else if (AscendC::IsSameType::value) {
+            return 0;
+        } else {
+            return -1.0f / 0.0f;
+ }
+ }
+
+private:
+ __aicore__ inline void PerformInitialMax(const AscendC::LocalTensor &srcTensor, const ReduceARParam& param)
+ {
+ uint16_t addRTotalNum = param.tailR / param.maskAddRNum * param.maskAddRNum;
+ uint16_t addRTail = param.tailR - addRTotalNum;
+ // dstBlkStride, src0BlkStride,src1BlkStride, dstRepStride, src0RepStride, src1RepStride
+ AscendC::BinaryRepeatParams repeatParams(1, 1, 1, param.repStride, param.repStride, param.repStride);
+
+ if (param.repStride > UB_ALIGN_255) {
+ for (uint16_t i = 0; i < param.dimMax; i += param.dimR) {
+ AscendC::Max(srcTensor[i], srcTensor[i], srcTensor[i + param.mainR], param.tailR);
+ }
+ } else {
+ for (uint16_t i = 0; i < addRTotalNum; i += param.maskAddRNum) {
+ AscendC::Max(srcTensor[i], srcTensor[i + param.mainR], srcTensor[i], param.maskAddRNum, param.dimA, repeatParams);
+ }
+ if (addRTail > 0) {
+ AscendC::Max(srcTensor[addRTotalNum],
+ srcTensor[addRTotalNum + param.mainR],
+ srcTensor[addRTotalNum],
+ addRTail,
+ param.dimA,
+ repeatParams);
+ }
+ }
+ AscendC::PipeBarrier();
+ }
+
+ __aicore__ inline void PerformBinaryReduction(const AscendC::LocalTensor &srcTensor,
+ const ReduceARParam& param)
+ {
+ if (param.repStride > UB_ALIGN_255) {
+ for (uint16_t i = 0; i < param.dimMax; i += param.loopRNum) {
+ AscendC::Max(srcTensor[i], srcTensor[i], srcTensor[i + param.loopRNum], param.loopRNum);
+ }
+ } else {
+ uint16_t addRTotalNum = param.loopRNum / param.maskAddRNum * param.maskAddRNum;
+ uint16_t addRTail = param.loopRNum - addRTotalNum;
+ // dstBlkStride, src0BlkStride,src1BlkStride, dstRepStride, src0RepStride, src1RepStride
+ AscendC::BinaryRepeatParams repeatParams(1, 1, 1, param.repStride, param.repStride, param.repStride);
+ for (uint16_t i = 0; i < addRTotalNum; i += param.maskAddRNum) {
+ AscendC::Max(srcTensor[i], srcTensor[i + param.loopRNum], srcTensor[i], param.maskAddRNum, param.dimA, repeatParams);
+ }
+ if (addRTail > 0) {
+ AscendC::Max(srcTensor[addRTotalNum],
+ srcTensor[addRTotalNum],
+ srcTensor[addRTotalNum + param.loopRNum],
+ addRTail,
+ param.dimA,
+ repeatParams);
+ }
+ }
+ AscendC::PipeBarrier();
+ }
+
+ __aicore__ inline void PerformFinalReduction(const AscendC::LocalTensor &dstTensor,
+ const AscendC::LocalTensor &srcTensor, const ReduceARParam& param)
+ {
+ if constexpr (AscendC::IsSameType::value ||
+ AscendC::IsSameType::value) {
+ uint16_t reduceLoopTimes = UB_ALIGN_255 * param.dtypeSize / UB_ALIGN_32 * UB_ALIGN_32 / param.dtypeSize;
+ // WholeReduceMax repeat-time limit is 255; split dimA into chunks
+ for (uint16_t dimAIdx = 0; dimAIdx < param.dimA; dimAIdx += reduceLoopTimes) {
+ uint16_t curDimA = (dimAIdx + reduceLoopTimes < param.dimA) ? reduceLoopTimes : param.dimA - dimAIdx;
+ AscendC::WholeReduceMax(
+ dstTensor[dimAIdx], srcTensor[dimAIdx * param.dimR], param.loopRNum, curDimA, 1, 1, param.repStride, AscendC::ReduceOrder::ORDER_ONLY_VALUE);
+ }
+ AscendC::PipeBarrier();
+ } else if constexpr (AscendC::IsSameType::value ||
+ AscendC::IsSameType::value) {
+ // Cast to float for higher-precision accumulation
+ AscendC::LocalTensor interpreSrc = srcTensor.template ReinterpretCast();
+ AscendC::LocalTensor interpreDst = dstTensor.template ReinterpretCast();
+ AscendC::Cast(interpreSrc, srcTensor, AscendC::RoundMode::CAST_NONE, param.dimA * param.dimR);
+ AscendC::PipeBarrier();
+
+ uint16_t reduceLoopTimes = 255 * param.dtypeSize / UB_ALIGN_32 * UB_ALIGN_32 / param.dtypeSize;
+ // WholeReduceMax repeat-time limit is 255; split dimA into chunks
+ for (uint16_t dimAIdx = 0; dimAIdx < param.dimA; dimAIdx += reduceLoopTimes) {
+ uint16_t curDimA = (dimAIdx + reduceLoopTimes < param.dimA) ? reduceLoopTimes : param.dimA - dimAIdx;
+ AscendC::WholeReduceMax(
+ interpreDst[dimAIdx], interpreSrc[dimAIdx * param.dimR], param.loopRNum, curDimA, 1, 1, param.repStride, AscendC::ReduceOrder::ORDER_ONLY_VALUE);
+ }
+ AscendC::PipeBarrier();
+ AscendC::Cast(dstTensor, interpreDst, AscendC::RoundMode::CAST_RINT, dstTensor.GetSize());
+ }
+ }
+};
+} // namespace ATVC
+
+#endif // ATVC_REDUCE_MAX_H
diff --git a/atvc/include/reduce/reduce_min.h b/atvc/include/reduce/reduce_min.h
new file mode 100644
index 0000000000000000000000000000000000000000..0ef25d16d8dc04aa1c0df39158ef8d861aff66ef
--- /dev/null
+++ b/atvc/include/reduce/reduce_min.h
@@ -0,0 +1,350 @@
+/**
+ * Copyright (c) Huawei Technologies Co., Ltd. 2025. All rights reserved.
+ *
+ * This file is a part of the CANN Open Software.
+ * Licensed under CANN Open Software License Agreement Version 1.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ */
+
+#ifndef ATVC_REDUCE_MIN_H
+#define ATVC_REDUCE_MIN_H
+
+#include "common/kernel_utils.h"
+#include "reduce/common/patterns.h"
+#include "reduce/utils/reduce_block_aux_util.h"
+#include "reduce/common/reduce_common.h"
+
+namespace ATVC {
+/*!
+ * ReduceMinCompute This class provides the core arithmetic required to reduce
+ * tensors along either the inner-most (AR) or outer-most (RA) axis after
+ * the tensor has been copied to the Unified Buffer (UB). Data movement between
+ * Global Memory (GM) and UB is not handled here; it is the responsibility of
+ * the surrounding scheduling template.
+ */
+template <typename OpTraits>
+class ReduceMinCompute {
+public:
+ // Extract operator input description information from OpTraits
+ using inputDTypeList = typename OpTraits::In::types;
+    using DataType = typename ATVC::TypeListGet<inputDTypeList, 0>::Type;
+    using PrompteDtype = typename KernelUtils::GetPromoteType<DataType>::T;
+ __aicore__ inline ReduceMinCompute() {}
+
+ /*!
+ * \brief Perform the actual reduction on a tile already resident in UB.
+ * \tparam needMask, true when UB alignment introduced invalid lanes.
+ * \tparam Pattern, one of ReducePattern::AR or ReducePattern::RA.
+ * \param[in] shape, {dimA, dimR} in elements; dimR may be padded.
+ * \param[out] dst, destination tensor (length == dimA)
+ * \param[in] src, source tensor (length == dimA * dimR)
+ */
+    template <bool needMask, typename Pattern>
+ __aicore__ inline void
+ Compute(KernelUtils::Shape<2> &shape,
+ const AscendC::LocalTensor &dst,
+ const AscendC::LocalTensor &src)
+ {
+ // AR scenario, hardware limitations, R-axis requires 32B alignment on UB, with 2 alignment methods available:
+ // 1. High performance alignment (with uncertain supplementary element values), subsequent cumulative
+ // calculations can only calculate the actual number of effective elements
+        // 2. Alignment with zero padding (the padding value is determined by the GetPaddingValue() interface
+        //    implemented by the user)
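+        // e.g. a 100-element float row (400 bytes) is padded to 104 elements (416 bytes) to reach 32-byte alignment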
+        if (std::is_same<Pattern, ReducePattern::AR>::value) {
+ if constexpr (needMask) { // 1. High performance alignment mode
+ // MainR (int64_t dimR, boolean isAR): The framework provides the calculation of the R-axis binary
+ // length (number of elements), where dimR is the original number of elements
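+                // e.g. an original R length of 100 gives mainR = 64 (largest power of two) and a tail of 36 elements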
+ int16_t mainR = KernelUtils::Reduce::MainR(shape.oriBurstLen, true);
+ ReduceAR(dst, src, shape.value[0], shape.value[1], mainR, shape.oriBurstLen);
+ } else {
+ // MainR: The framework provides the calculation of the R-axis binary length (number of elements),
+ // where dimR is the number of elements after completion
+ int16_t mainR = KernelUtils::Reduce::MainR(shape.value[1], true);
+ ReduceAR(dst, src, shape.value[0], shape.value[1], mainR, shape.value[1]);
+ }
+ }
+        if (std::is_same<Pattern, ReducePattern::RA>::value) {
+ int16_t mainR = KernelUtils::Reduce::MainR(shape.value[0], false);
+ ReduceRA(dst, src, shape.value[1], shape.value[0], mainR);
+ }
+ }
+
+ /*!
+ * \brief RA-pattern reduction: reduce along the outer-most (slowest-varying) axis.
+ * \param[out] dst, output tensor (length == dimA)
+ * \param[in] src, input tensor (length == dimR * dimA), already resident in UB
+ * \param[in] dimA, length of the non-reduced axis (A)
+ * \param[in] dimR, length of the reduced axis (R)
+ * \param[in] mainR, largest power-of-two ≤ dimR (computed by the caller)
+ */
+ __aicore__ inline void
+ ReduceRA(const AscendC::LocalTensor &dst,
+ const AscendC::LocalTensor &src, uint16_t dimA,
+ uint16_t dimR, uint16_t mainR)
+ {
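+        // Overall flow (e.g. dimR = 7, mainR = 4): rows 4..6 are first folded into rows 0..2 with Min, then the
+        // remaining mainR rows are halved (4 -> 2 -> 1) until a single row of dimA minima remains and is copied to dst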
+ uint32_t totalNum = dimR * dimA;
+ uint32_t mainNum = dimA * mainR;
+ constexpr uint32_t dtypeSize = sizeof(PrompteDtype);
+ uint32_t tailNum = totalNum - mainNum;
+ // MaskAddNum has a maximum value of 256 bytes and must be aligned with 32 bytes
+ constexpr uint32_t maskAddNum = UB_ALIGN_256 / dtypeSize / UB_ALIGN_32 * UB_ALIGN_32;
+ uint16_t repeatTimes = tailNum / maskAddNum;
+ uint16_t repeatNum = repeatTimes * maskAddNum;
+ uint16_t repTailNum = tailNum - repeatNum;
+ // Same data block step size between different iterations
+ uint32_t repStride = dtypeSize * maskAddNum / UB_ALIGN_32;
+ // dstBlkStride, src0BlkStride,src1BlkStride, dstRepStride, src0RepStride, src1RepStride
+ AscendC::BinaryRepeatParams repeatParams(1, 1, 1, repStride, repStride, repStride);
+ if (repeatTimes > 0) {
+ AscendC::Min(src, src[mainNum], src, maskAddNum, repeatTimes, repeatParams);
+ }
+ if (repTailNum > 0) {
+ // Same data block step size between different iterations
+ repStride = dtypeSize * repTailNum / UB_ALIGN_32;
+ repeatParams.dstRepStride = repStride;
+ repeatParams.src0RepStride = repStride;
+ repeatParams.src1RepStride = repStride;
+ AscendC::Min(src[repeatNum], src[repeatNum + mainNum], src[repeatNum], repTailNum, 1, repeatParams);
+ }
+ AscendC::PipeBarrier();
+ uint16_t loopRNum = mainR;
+ while (loopRNum > 1) {
+ loopRNum = loopRNum >> 1;
+ mainNum = loopRNum * dimA; // The first half of LoopR's data volume
+ repeatTimes = mainNum / maskAddNum;
+ repeatNum = repeatTimes * maskAddNum;
+ repTailNum = mainNum - repeatNum;
+ if (repeatTimes > 0) {
+ // Same data block step size between different iterations
+ repStride = dtypeSize * maskAddNum / UB_ALIGN_32;
+ repeatParams.dstRepStride = repStride;
+ repeatParams.src0RepStride = repStride;
+ repeatParams.src1RepStride = repStride;
+ AscendC::Min(src, src[mainNum], src, maskAddNum, repeatTimes, repeatParams);
+ }
+ if (repTailNum > 0) {
+ // Same data block step size between different iterations
+ repStride = dtypeSize * repTailNum / UB_ALIGN_32;
+ repeatParams.dstRepStride = repStride;
+ repeatParams.src0RepStride = repStride;
+ repeatParams.src1RepStride = repStride;
+ AscendC::Min(src[repeatNum], src[repeatNum + mainNum], src[repeatNum], repTailNum, 1, repeatParams);
+ }
+ AscendC::PipeBarrier();
+ }
+ AscendC::DataCopy(dst, src, dimA);
+ }
+
+ /*!
+ * \brief AR-pattern reduction: reduce along the inner-most (fastest-varying) axis.
+ * \param[out] dstTensor, output tensor (length == dimA)
+ * \param[in] srcTensor, input tensor (length == dimR * dimA), already resident in UB
+ * \param[in] dimA, length of the non-reduced axis (A)
+ * \param[in] dimR, padded length of the reduced axis (R)
+ * \param[in] mainR, largest power-of-two ≤ original R length
+ * \param[in] oriBurstLen, original (un-padded) R length used to compute tail
+ */
+ __aicore__ inline void
+ ReduceAR(const AscendC::LocalTensor &dstTensor,
+ const AscendC::LocalTensor &srcTensor, uint16_t dimA,
+ uint16_t dimR, uint16_t mainR, uint64_t oriBurstLen)
+ {
+ uint16_t tailR = oriBurstLen - mainR;
+ constexpr uint16_t dtypeSize = sizeof(PrompteDtype);
+ uint32_t repStride = dtypeSize * dimR / UB_ALIGN_32;
+ uint16_t dimMax = dimA * dimR;
+ constexpr uint64_t maskAddRNum = UB_ALIGN_256 / dtypeSize;
+
+ ReduceARParam param{
+ .repStride = repStride,
+ .dimA = dimA,
+ .dimMax = dimMax,
+ .mainR = mainR,
+ .tailR = tailR,
+ .maskAddRNum = maskAddRNum,
+ .dtypeSize = dtypeSize,
+ .dimR = dimR
+ };
+
+ if (mainR > 0 && tailR > 0) {
+ PerformInitialMin(srcTensor, param);
+ }
+
+ param.loopRNum = mainR;
+ while (param.loopRNum > maskAddRNum) {
+ param.loopRNum = param.loopRNum / 2U;
+ PerformBinaryReduction(srcTensor, param);
+ }
+ if (param.loopRNum == 0) { // small shape, directly reduce
+ param.loopRNum = tailR;
+ }
+ PerformFinalReduction(dstTensor, srcTensor, param);
+ }
+
+ /*!
+ * \brief Merge the calculation results of different data base blocks within a single UB
+ * \tparam Pattern Compile-time pattern tag that decides A vs. B orientation.
+ * \tparam V Shape descriptor (encodes dimA and dimB at runtime).
+ * \param[in] index, logical index identifying the data-base block.
+ * \param[in] shape, runtime tensor shape (dimA, dimB).
+ * \param[in] tempBuf, UB tensor serving as the reduction cache.
+ * \param[in] computeRes, UB tensor holding the newest partial result.
+ */
+ template
+ __aicore__ inline void UpdateCache(int64_t index, V& shape, const AscendC::LocalTensor& tempBuf,
+ const AscendC::LocalTensor& computeRes)
+ {
+ int64_t cacheID = KernelUtils::Reduce::GetCacheID(index);
+ int64_t dimA = Pattern::TailA ? shape.value[1] : shape.value[0];
+ int32_t element_one_repeat = Platform::GetVRegSize() / sizeof(PrompteDtype);
+ int64_t stride = OpsUtils::CeilDiv(dimA, static_cast(element_one_repeat)) * element_one_repeat;
+ uint16_t outerLoopTimes = OpsUtils::CeilDiv(
+ static_cast(dimA * sizeof(PrompteDtype)), static_cast(Platform::GetVRegSize()));
+ uint16_t innerLoopTimes = cacheID;
+ uint32_t outerLoopStride = element_one_repeat;
+        uint32_t innerLoopStride = stride; // The size of each index block in the cache, i.e. the size of the A axis
+ AscendC::LocalTensor dstTensor = tempBuf;
+ AscendC::LocalTensor srcTensor = computeRes;
+ uint32_t cah = cacheID * stride;
+
+ for (uint16_t i = 0; i < outerLoopTimes; ++i) { // OuterLoopTimes is the size of dimA
+ uint32_t srcIdx = i * outerLoopStride;
+ for (uint16_t j = 0; j < innerLoopTimes; ++j) {
+ AscendC::Min(srcTensor[srcIdx], srcTensor[srcIdx],
+ dstTensor[srcIdx + j * innerLoopStride],
+ outerLoopStride);
+ AscendC::PipeBarrier();
+ }
+ DataCopy(dstTensor[cah + srcIdx], srcTensor[srcIdx], outerLoopStride);
+ }
+ }
+
+ /*!
+ * \brief Binary reduction between two UB buffers.
+ * \ Used for inter-core result merging when workspace staging is required.
+ * \param[in] ubTensorLeft, left operand (in-place result).
+ * \param[in] ubTensorRight, right operand (read-only).
+ * \param[in] calCount, number of elements to reduce.
+ */
+ __aicore__ inline void
+ ReduceBetweenUB(const AscendC::LocalTensor &ubTensorLeft,
+ const AscendC::LocalTensor &ubTensorRight,
+ const int32_t &calCount)
+ {
+ AscendC::Min(ubTensorRight, ubTensorRight, ubTensorLeft, calCount);
+ }
+
+ /*!
+ * \brief Return the value used for padding when UB alignment is required.
+     * For MIN-reduction the neutral element is the largest value of the type (e.g. +∞ for floating point).
+     * \tparam U, scalar type identical to DataType or PromoteDataType.
+     * \return The padding value (the identity element of min for type U).
+     */
+    template <typename U>
+    __aicore__ inline U GetPaddingValue()
+    {
+        // ReduceMin takes the minimum over the R-axis data, so the padded elements are set to the largest
+        // value of the type to ensure that they do not affect the result
+        if (AscendC::IsSameType<U, int32_t>::value) {
+            return INT32_MAX;
+        } else if (AscendC::IsSameType::value) {
+            return INT32_MAX - INT32_MIN;
+        } else {
+            return 1.0f / 0.0f;
+ }
+ }
+
+private:
+ __aicore__ inline void PerformInitialMin(const AscendC::LocalTensor &srcTensor, const ReduceARParam& param)
+ {
+ uint16_t addRTotalNum = param.tailR / param.maskAddRNum * param.maskAddRNum;
+ uint16_t addRTail = param.tailR - addRTotalNum;
+ // dstBlkStride, src0BlkStride,src1BlkStride, dstRepStride, src0RepStride, src1RepStride
+ AscendC::BinaryRepeatParams repeatParams(1, 1, 1, param.repStride, param.repStride, param.repStride);
+
+ if (param.repStride > UB_ALIGN_255) {
+ for (uint16_t i = 0; i < param.dimMax; i += param.dimR) {
+ AscendC::Min(srcTensor[i], srcTensor[i], srcTensor[i + param.mainR], param.tailR);
+ }
+ } else {
+ for (uint16_t i = 0; i < addRTotalNum; i += param.maskAddRNum) {
+ AscendC::Min(srcTensor[i], srcTensor[i + param.mainR], srcTensor[i], param.maskAddRNum, param.dimA, repeatParams);
+ }
+ if (addRTail > 0) {
+ AscendC::Min(srcTensor[addRTotalNum],
+ srcTensor[addRTotalNum + param.mainR],
+ srcTensor[addRTotalNum],
+ addRTail,
+ param.dimA,
+ repeatParams);
+ }
+ }
+ AscendC::PipeBarrier();
+ }
+
+ __aicore__ inline void PerformBinaryReduction(const AscendC::LocalTensor &srcTensor,
+ const ReduceARParam& param)
+ {
+ if (param.repStride > UB_ALIGN_255) {
+ for (uint16_t i = 0; i < param.dimMax; i += param.loopRNum) {
+ AscendC::Min(srcTensor[i], srcTensor[i], srcTensor[i + param.loopRNum], param.loopRNum);
+ }
+ } else {
+ uint16_t addRTotalNum = param.loopRNum / param.maskAddRNum * param.maskAddRNum;
+ uint16_t addRTail = param.loopRNum - addRTotalNum;
+ // dstBlkStride, src0BlkStride,src1BlkStride, dstRepStride, src0RepStride, src1RepStride
+ AscendC::BinaryRepeatParams repeatParams(1, 1, 1, param.repStride, param.repStride, param.repStride);
+ for (uint16_t i = 0; i < addRTotalNum; i += param.maskAddRNum) {
+ AscendC::Min(srcTensor[i], srcTensor[i + param.loopRNum], srcTensor[i], param.maskAddRNum, param.dimA, repeatParams);
+ }
+ if (addRTail > 0) {
+ AscendC::Min(srcTensor[addRTotalNum],
+ srcTensor[addRTotalNum],
+ srcTensor[addRTotalNum + param.loopRNum],
+ addRTail,
+ param.dimA,
+ repeatParams);
+ }
+ }
+ AscendC::PipeBarrier();
+ }
+
+ __aicore__ inline void PerformFinalReduction(const AscendC::LocalTensor &dstTensor,
+ const AscendC::LocalTensor &srcTensor, const ReduceARParam& param)
+ {
+ if constexpr (AscendC::IsSameType::value ||
+ AscendC::IsSameType::value) {
+ uint16_t reduceLoopTimes = UB_ALIGN_255 * param.dtypeSize / UB_ALIGN_32 * UB_ALIGN_32 / param.dtypeSize;
+ // WholeReduceMin repeat-time limit is 255; split dimA into chunks
+ for (uint16_t dimAIdx = 0; dimAIdx < param.dimA; dimAIdx += reduceLoopTimes) {
+ uint16_t curDimA = (dimAIdx + reduceLoopTimes < param.dimA) ? reduceLoopTimes : param.dimA - dimAIdx;
+ AscendC::WholeReduceMin(
+ dstTensor[dimAIdx], srcTensor[dimAIdx * param.dimR], param.loopRNum, curDimA, 1, 1, param.repStride, AscendC::ReduceOrder::ORDER_ONLY_VALUE);
+ }
+ AscendC::PipeBarrier();
+ } else if constexpr (AscendC::IsSameType::value ||
+ AscendC::IsSameType::value) {
+ // Cast to float for higher-precision accumulation
+ AscendC::LocalTensor interpreSrc = srcTensor.template ReinterpretCast();
+ AscendC::LocalTensor interpreDst = dstTensor.template ReinterpretCast();
+ AscendC::Cast(interpreSrc, srcTensor, AscendC::RoundMode::CAST_NONE, param.dimA * param.dimR);
+ AscendC::PipeBarrier();
+
+ uint16_t reduceLoopTimes = 255 * param.dtypeSize / UB_ALIGN_32 * UB_ALIGN_32 / param.dtypeSize;
+ // WholeReduceMin repeat-time limit is 255; split dimA into chunks
+ for (uint16_t dimAIdx = 0; dimAIdx < param.dimA; dimAIdx += reduceLoopTimes) {
+ uint16_t curDimA = (dimAIdx + reduceLoopTimes < param.dimA) ? reduceLoopTimes : param.dimA - dimAIdx;
+ AscendC::WholeReduceMin(
+ interpreDst[dimAIdx], interpreSrc[dimAIdx * param.dimR], param.loopRNum, curDimA, 1, 1, param.repStride, AscendC::ReduceOrder::ORDER_ONLY_VALUE);
+ }
+ AscendC::PipeBarrier();
+ AscendC::Cast(dstTensor, interpreDst, AscendC::RoundMode::CAST_RINT, dstTensor.GetSize());
+ }
+ }
+};
+} // namespace ATVC
+
+#endif // ATVC_REDUCE_MIN_H
diff --git a/atvc/include/reduce/reduce_op_template.h b/atvc/include/reduce/reduce_op_template.h
index 3eca4811e683a0ceeba45a44ec6eba6719a0875f..13b28f34224e9327236cf8f2f50a874c72b5cefa 100644
--- a/atvc/include/reduce/reduce_op_template.h
+++ b/atvc/include/reduce/reduce_op_template.h
@@ -118,7 +118,9 @@ public:
template
__aicore__ inline void AllocTensorAux(AscendC::LocalTensor& tensor)
{
- bufPool_.AllocTensor(tensor);
+        T dupValue = needDup ? compute_.template GetPaddingValue<T>() : 0;
+        bufPool_.AllocTensor(tensor, dupValue);
}
/*!
diff --git a/atvc/include/reduce/reduce_sum.h b/atvc/include/reduce/reduce_sum.h
index ac1ce5ea4a3f5d0565ac6af5ee7cf1bc3cb98c6e..581cccecc2017106362be8d642582dfdd5ed010a 100644
--- a/atvc/include/reduce/reduce_sum.h
+++ b/atvc/include/reduce/reduce_sum.h
@@ -15,20 +15,7 @@
#include "common/kernel_utils.h"
#include "reduce/common/patterns.h"
#include "reduce/utils/reduce_block_aux_util.h"
-
-namespace {
-struct ReduceARParam {
- uint32_t repStride = 0;
- uint16_t dimA = 0;
- uint16_t dimMax = 0;
- uint16_t mainR = 0;
- uint16_t tailR = 0;
- uint64_t maskAddRNum = 0;
- uint16_t loopRNum = 0;
- uint16_t dtypeSize = 0;
- uint16_t dimR = 0;
-};
-}
+#include "reduce/common/reduce_common.h"
namespace ATVC {
/*!
diff --git a/atvc/include/reduce/utils/reduce_block_aux.h b/atvc/include/reduce/utils/reduce_block_aux.h
index a68a169fa900a1aaf21d907b525ab95973d3d6a7..3f7639cba64ef66415a24e3baef762391ce02534 100644
--- a/atvc/include/reduce/utils/reduce_block_aux.h
+++ b/atvc/include/reduce/utils/reduce_block_aux.h
@@ -234,7 +234,7 @@ public:
computeTensor = ubTensor;
} else {
// The index of AlloccomputeTensorAux does not require external perception
- op_->ReduceOp::template AllocTensorAux(computeTensor);
+ op_->ReduceOp::template AllocTensorAux(computeTensor);
CopyIn(view, shape, ubTensor);
SetEvent(AscendC::HardEvent::MTE2_V);
AscendC::Cast(computeTensor, ubTensor, AscendC::RoundMode::CAST_NONE, shape.value[0] * shape.value[1]);
@@ -246,14 +246,21 @@ public:
__aicore__ inline void LinearComputeR(int64_t& tmpBufOffest, V& shape, Args... args)
{
SliceView view;
+ bool needDup = false;
for (int64_t i = 0; i < bisectionTail_; i++) {
AscendC::LocalTensor tensorLeft;
- op_->ReduceOp::template AllocTensorAux(tensorLeft);
+ op_->ReduceOp::template AllocTensorAux(tensorLeft);
AscendC::LocalTensor computeLeft;
PrePareReduce<(!InnerPattern::TailA), false>(i, view, shape, tensorLeft, computeLeft);
AscendC::LocalTensor tensorRight;
- op_->ReduceOp::template AllocTensorAux(tensorRight);
+            needDup = (i == bisectionTail_ - 1);
+            if (needDup) {
+                op_->ReduceOp::template AllocTensorAux(tensorRight);
+            } else {
+                op_->ReduceOp::template AllocTensorAux(tensorRight);
+            }
AscendC::LocalTensor computeRight;
PrePareReduce<(!InnerPattern::TailA), true>(i, view, shape, tensorRight, computeRight);
ComputeMerge(shape, computeLeft, computeRight, args...);
@@ -266,7 +273,13 @@ public:
for (int64_t i = bisectionTail_; i < bisectionPos_; i++) {
AscendC::LocalTensor tensor;
- op_->ReduceOp::template AllocTensorAux(tensor);
+            needDup = (i == bisectionPos_ - 1);
+            if (needDup) {
+ op_->ReduceOp::template AllocTensorAux